diff --git a/.cirrus.yml b/.cirrus.yml
new file mode 100644
index 000000000..9a898f421
--- /dev/null
+++ b/.cirrus.yml
@@ -0,0 +1,177 @@
+macos_instance:
+  image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
+
+#task:
+#  name: AppleM1/LLVM
+#  compile_script:
+#  - brew install llvm
+#  - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
+#  - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
+#  - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
+#  - make TARGET=VORTEX USE_OPENMP=1 CC=clang
+
+#task:
+#  name: AppleM1/LLVM/ILP64
+#  compile_script:
+#  - brew install llvm
+#  - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
+#  - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
+#  - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
+#  - make TARGET=VORTEX USE_OPENMP=1 CC=clang INTERFACE64=1
+
+#task:
+#  name: AppleM1/LLVM/CMAKE
+#  compile_script:
+#  - brew install llvm
+#  - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
+#  - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
+#  - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
+#  - mkdir build
+#  - cd build
+#  - cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON ..
+#  - make -j 4
+
+#task:
+#  name: AppleM1/GCC/MAKE/OPENMP
+#  compile_script:
+#  - brew install gcc@11
+#  - export PATH=/opt/homebrew/bin:$PATH
+#  - export LDFLAGS="-L/opt/homebrew/lib"
+#  - export CPPFLAGS="-I/opt/homebrew/include"
+#  - make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1
+  
+macos_instance:
+  image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
+task:
+  name: AppleM1/LLVM x86_64 xbuild
+  compile_script:  # Homebrew LLVM setup below is disabled; CC is set to the Xcode toolchain clang
+  # - brew install llvm
+  # - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
+  # - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
+  # - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
+  - export ARCHS="i386 x86_64"
+  - export ARCHS_STANDARD="i386 x86_64"
+  - export ARCHS_STANDARD_32_64_BIT="i386 x86_64"
+  - export ARCHS_STANDARD_64_BIT=x86_64
+  - export ARCHS_STANDARD_INCLUDING_64_BIT="i386 x86_64"
+  - export ARCHS_UNIVERSAL_IPHONE_OS="i386 x86_64"
+  - export VALID_ARCHS="i386 x86_64"
+  - xcrun --sdk macosx --show-sdk-path  # print the SDK path into the log
+  - xcodebuild -version  # print the Xcode version into the log
+  - export CC=/Applications/Xcode-14.0.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
+  - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-14.0.0.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX12.3.sdk -arch x86_64"
+  - make TARGET=CORE2 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
+  always:  # upload the generated config files whether or not the build succeeded
+    config_artifacts:
+      path: "*conf*"
+      type: text/plain
+#  lib_artifacts:
+#    path: "libopenblas*"
+#    type: application/octet-stream
+
+macos_instance:
+  image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
+task:
+  name: AppleM1/LLVM armv8-ios xbuild
+  compile_script:  # Homebrew LLVM setup below is disabled; CC is set to the Xcode toolchain clang
+  # - brew install llvm
+  # - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
+  # - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
+  # - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
+  - export CC=/Applications/Xcode-14.0.0.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
+  - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-14.0.0.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0"
+  - make TARGET=ARMV8 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 CROSS=1
+  always:  # upload the generated config files whether or not the build succeeded
+    config_artifacts:
+      path: "*conf*"
+      type: text/plain
+
+macos_instance:
+  image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
+task:
+  name: AppleM1/LLVM armv7-androidndk xbuild
+  compile_script:  # disabled setup steps are kept below as comments; CC is set to the NDK clang
+  # - brew install android-ndk
+  # - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
+  # - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
+  # - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
+  - find /System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/25b -name "armv7a-linux-androideabi*-ranlib"
+  # - export CC=/Applications/Xcode-13.4.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
+  # - export CFLAGS="-O2 -unwindlib=none -Wno-macro-redefined -isysroot /Applications/Xcode-13.4.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS16.0.sdk -arch arm64 -miphoneos-version-min=10.0"
+  - export CC=/System/Volumes/Data/opt/homebrew/Caskroom/android-ndk/25b/AndroidNDK8937393.app/Contents/NDK/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi23-clang
+  - make TARGET=ARMV7 ARM_SOFTFP_ABI=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 RANLIB="ls -l"
+  always:  # upload the generated config files whether or not the build succeeded
+    config_artifacts:
+      path: "*conf*"
+      type: text/plain
+
+task:
+  name: NeoverseN1
+  arm_container:
+    image: node:latest
+  compile_script:
+  - make
+
+task:  # top-level key: each Cirrus task must start at column 0 to be a separate task
+  name: NeoverseN1-ILP64
+  arm_container:
+    image: node:latest
+  compile_script:
+  - make INTERFACE64=1
+
+task:
+  name: NeoverseN1-OMP
+  arm_container:
+    image: node:latest
+    cpu: 8
+  compile_script:
+  - make USE_OPENMP=1
+
+FreeBSD_task:
+  name: FreeBSD-gcc12
+  freebsd_instance:
+    image_family: freebsd-13-2
+  install_script:
+  - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc
+  compile_script:
+  - ls -l /usr/local/lib
+  - gmake CC=gcc 
+
+
+FreeBSD_task:
+  name: freebsd-gcc12-ilp64
+  freebsd_instance:
+    image_family: freebsd-13-2
+  install_script:
+  - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc
+  compile_script:
+  - ls -l /usr/local/lib
+  - gmake CC=gcc INTERFACE64=1
+
+FreeBSD_task:
+  name: FreeBSD-clang-openmp
+  freebsd_instance:
+    image_family: freebsd-13-2
+  install_script:
+  - pkg update -f && pkg upgrade -y && pkg install -y gmake gcc 
+  - ln -s /usr/local/lib/gcc12/libgfortran.so.5.0.0 /usr/lib/libgfortran.so
+  compile_script:
+  - gmake CC=clang FC=gfortran USE_OPENMP=1 CPP_THREAD_SAFETY_TEST=1
+
+#task:
+#  name: Windows/LLVM16     --- too slow ---
+#  windows_container:
+#    image: cirrusci/windowsservercore:cmake-2021.12.07
+#  install_script: 
+#    - choco list --localonly
+#    - choco install -y llvm
+#    - # choco install -y cmake --installargs '"ADD_CMAKE_TO_PATH=System"'
+#    - choco install -y ninja
+#    - refreshenv
+#    - cd "c:/Program Files (x86)/Microsoft Visual Studio/2019/BuildTools/VC/Auxiliary/Build"
+#    - vcvarsall x64
+#    - cd "C:\Users\ContainerAdministrator\AppData\Local\Temp\cirrus-ci-build"
+#    - cmake -S . -B build -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release
+#    - cd build
+#    - cmake --build .
+#    - ctest
diff --git a/.cirun.yml b/.cirun.yml
new file mode 100644
index 000000000..bfc6494d0
--- /dev/null
+++ b/.cirun.yml
@@ -0,0 +1,16 @@
+# Self-Hosted Github Action Runners on AWS via Cirun.io
+# Reference: https://docs.cirun.io/reference/yaml
+runners:
+  - name: "aws-runner-graviton"
+    # Cloud Provider: AWS
+    cloud: "aws"
+    region: "us-east-1"
+    # AWS Graviton3 (arm64) compute-optimized instance
+    instance_type: "c7g.large"
+    # Ubuntu-22.04, ami image
+    machine_image: "ami-0a0c8eebcdd6dcbd0"
+    preemptible: false
+    # Add this label in the "runs-on" param in .github/workflows/<workflow-name>.yml
+    # So that this runner is created for running the workflow
+    labels:
+      - "cirun-aws-runner-graviton"
diff --git a/.github/workflows/apple_m.yml b/.github/workflows/apple_m.yml
new file mode 100644
index 000000000..e34eada86
--- /dev/null
+++ b/.github/workflows/apple_m.yml
@@ -0,0 +1,149 @@
+name: apple m
+
+on: [push, pull_request]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read # to fetch code (actions/checkout)
+
+jobs:
+  build:
+    if: "github.repository == 'OpenMathLib/OpenBLAS'"
+    runs-on: macos-14
+
+    strategy:
+      fail-fast: false
+      matrix:
+        build: [cmake, make]
+        fortran: [gfortran]
+        openmp: [0, 1]
+        ilp64: [0, 1]
+        
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+
+      - name: Print system information
+        run: |
+          if [ "$RUNNER_OS" == "Linux" ]; then
+            cat /proc/cpuinfo
+          elif [ "$RUNNER_OS" == "macOS" ]; then
+            sysctl -a | grep machdep.cpu
+          else
+            echo "::error::$RUNNER_OS not supported"
+            exit 1
+          fi
+
+      - name: Install Dependencies
+        run: |
+          if [ "$RUNNER_OS" == "Linux" ]; then
+            sudo apt-get install -y gfortran cmake ccache libtinfo5
+          elif [ "$RUNNER_OS" == "macOS" ]; then
+            # It looks like "gfortran" isn't working correctly unless "gcc" is re-installed.
+            brew reinstall gcc
+            brew install coreutils cmake ccache
+            brew install llvm
+          else
+            echo "::error::$RUNNER_OS not supported"
+            exit 1
+          fi
+
+      - name: Compilation cache
+        uses: actions/cache@v3
+        with:
+          path: ~/.ccache
+          # We include the commit sha in the cache key, as new cache entries are
+          # only created if there is no existing entry for the key yet.
+          # GNU make and cmake call the compilers differently. It looks like
+          # that causes the cache to mismatch. Keep the ccache for both build
+          # tools separate to avoid polluting each other.
+          key: ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }}
+          # Restore a matching ccache cache entry. Prefer same branch and same Fortran compiler.
+          restore-keys: |
+            ccache-${{ runner.os }}-${{ matrix.build }}-${{matrix.fortran }}-${{ github.ref }}
+            ccache-${{ runner.os }}-${{ matrix.build }}-${{matrix.fortran }}
+            ccache-${{ runner.os }}-${{ matrix.build }}
+
+      - name: Configure ccache
+        run: |
+          if [ "${{ matrix.build }}" = "make" ]; then
+            # Add ccache to path
+            if [ "$RUNNER_OS" = "Linux" ]; then
+              echo "/usr/lib/ccache" >> $GITHUB_PATH
+            elif [ "$RUNNER_OS" = "macOS" ]; then
+              echo "$(brew --prefix)/opt/ccache/libexec" >> $GITHUB_PATH
+              echo "/opt/homebrew/opt/llvm/bin" >>$GITHUB_PATH
+              echo "" >>$GITHUB_PATH
+            else
+              echo "::error::$RUNNER_OS not supported"
+              exit 1
+            fi
+          fi
+          # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota (5 GB).
+          test -d ~/.ccache || mkdir -p ~/.ccache
+          echo "max_size = 300M" > ~/.ccache/ccache.conf
+          echo "compression = true" >> ~/.ccache/ccache.conf
+          ccache -s
+
+      - name: Build OpenBLAS
+        run: |
+          export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
+          export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
+          export CC="/opt/homebrew/opt/llvm/bin/clang"
+          case "${{ matrix.build }}" in
+            "make")
+              make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=${{matrix.openmp}} INTERFACE64=${{matrix.ilp64}} FC="ccache ${{ matrix.fortran }}"
+              ;;
+            "cmake")
+              export LDFLAGS="$LDFLAGS -Wl,-ld_classic"
+              mkdir build && cd build
+              cmake -DDYNAMIC_ARCH=1 \
+                    -DUSE_OPENMP=${{matrix.openmp}} \
+                    -DINTERFACE64=${{matrix.ilp64}} \
+                    -DNOFORTRAN=0 \
+                    -DBUILD_WITHOUT_LAPACK=0 \
+                    -DCMAKE_VERBOSE_MAKEFILE=ON \
+                    -DCMAKE_BUILD_TYPE=Release \
+                    -DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \
+                    -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+                    -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
+                    ..
+              cmake --build .
+              ;;
+            *)
+              echo "::error::Configuration not supported"
+              exit 1
+              ;;
+          esac
+
+      - name: Show ccache status
+        continue-on-error: true
+        run: ccache -s
+
+      - name: Run tests
+        timeout-minutes: 60
+        run: |
+          case "${{ matrix.build }}" in
+            "make")
+              MAKE_FLAGS='DYNAMIC_ARCH=1 USE_OPENMP=0'
+              echo "::group::Tests in 'test' directory"
+              make -C test $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
+              echo "::endgroup::"
+              echo "::group::Tests in 'ctest' directory"
+              make -C ctest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
+              echo "::endgroup::"
+              echo "::group::Tests in 'utest' directory"
+              make -C utest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
+              echo "::endgroup::"
+              ;;
+            "cmake")
+              cd build && ctest
+              ;;
+            *)
+              echo "::error::Configuration not supported"
+              exit 1
+              ;;
+          esac
diff --git a/.github/workflows/arm64_graviton.yml b/.github/workflows/arm64_graviton.yml
new file mode 100644
index 000000000..6928312b5
--- /dev/null
+++ b/.github/workflows/arm64_graviton.yml
@@ -0,0 +1,139 @@
+name: arm64 graviton cirun
+
+on:
+  push:
+    branches:
+      - develop
+      - release-**
+  pull_request:
+    branches:
+      - develop
+      - release-**
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read # to fetch code (actions/checkout)
+
+jobs:
+  build:
+    if: "github.repository == 'OpenMathLib/OpenBLAS'"
+    runs-on: "cirun-aws-runner-graviton--${{ github.run_id }}"
+
+    strategy:
+      fail-fast: false
+      matrix:
+        fortran: [gfortran]
+        build: [cmake, make]
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+
+      - name: Print system information
+        run: |
+          if [ "$RUNNER_OS" == "Linux" ]; then
+            cat /proc/cpuinfo
+          else
+            echo "::error::$RUNNER_OS not supported"
+            exit 1
+          fi
+
+      - name: Install Dependencies  # Fortran compiler, CMake and ccache for the build steps below
+        run: |
+          if [ "$RUNNER_OS" == "Linux" ]; then
+            sudo apt-get update
+            sudo apt-get install -y gfortran cmake ccache libtinfo5
+          else
+            echo "::error::$RUNNER_OS not supported"
+            exit 1
+          fi
+
+      - name: Compilation cache
+        uses: actions/cache@v3
+        with:
+          path: ~/.ccache
+          # We include the commit sha in the cache key, as new cache entries are
+          # only created if there is no existing entry for the key yet.
+          # GNU make and cmake call the compilers differently. It looks like
+          # that causes the cache to mismatch. Keep the ccache for both build
+          # tools separate to avoid polluting each other.
+          key: ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }}
+          # Restore a matching ccache cache entry. Prefer same branch and same Fortran compiler.
+          restore-keys: |
+            ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}
+            ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}
+            ccache-${{ runner.os }}-${{ matrix.build }}
+
+      - name: Configure ccache
+        run: |
+          if [ "${{ matrix.build }}" = "make" ]; then
+            # Add ccache to path
+            if [ "$RUNNER_OS" = "Linux" ]; then
+              echo "/usr/lib/ccache" >> $GITHUB_PATH
+            else
+              echo "::error::$RUNNER_OS not supported"
+              exit 1
+            fi
+          fi
+          # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota (5 GB).
+          test -d ~/.ccache || mkdir -p ~/.ccache
+          echo "max_size = 300M" > ~/.ccache/ccache.conf
+          echo "compression = true" >> ~/.ccache/ccache.conf
+          ccache -s
+
+      - name: Build OpenBLAS
+        run: |
+          case "${{ matrix.build }}" in
+            "make")
+              make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 FC="ccache ${{ matrix.fortran }}"
+              ;;
+            "cmake")
+              mkdir build && cd build
+              cmake -DDYNAMIC_ARCH=1 \
+                    -DNOFORTRAN=0 \
+                    -DBUILD_WITHOUT_LAPACK=0 \
+                    -DCMAKE_VERBOSE_MAKEFILE=ON \
+                    -DCMAKE_BUILD_TYPE=Release \
+                    -DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \
+                    -DCMAKE_C_COMPILER_LAUNCHER=ccache \
+                    -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
+                    ..
+              cmake --build .
+              ;;
+            *)
+              echo "::error::Configuration not supported"
+              exit 1
+              ;;
+          esac
+
+      - name: Show ccache status
+        continue-on-error: true
+        run: ccache -s
+
+      - name: Run tests
+        timeout-minutes: 60
+        run: |
+          case "${{ matrix.build }}" in
+            "make")
+              MAKE_FLAGS='DYNAMIC_ARCH=1 USE_OPENMP=0'
+              echo "::group::Tests in 'test' directory"
+              make -C test $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
+              echo "::endgroup::"
+              echo "::group::Tests in 'ctest' directory"
+              make -C ctest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
+              echo "::endgroup::"
+              echo "::group::Tests in 'utest' directory"
+              make -C utest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
+              echo "::endgroup::"
+              ;;
+            "cmake")
+              cd build && ctest
+              ;;
+            *)
+              echo "::error::Configuration not supported"
+              exit 1
+              ;;
+          esac
diff --git a/.github/workflows/c910v.yml b/.github/workflows/c910v.yml
new file mode 100644
index 000000000..68ba2ddd7
--- /dev/null
+++ b/.github/workflows/c910v.yml
@@ -0,0 +1,126 @@
+name: c910v qemu test
+
+on: [push, pull_request]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read # to fetch code (actions/checkout)
+
+jobs:
+  TEST:
+    if: "github.repository == 'OpenMathLib/OpenBLAS'"
+    runs-on: ubuntu-latest
+    env:
+      xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1698113812618
+      toolchain_file_name: Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.8.0-20231018.tar.gz
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - target: RISCV64_GENERIC
+            triple: riscv64-linux-gnu
+            apt_triple: riscv64-linux-gnu
+            opts: NO_SHARED=1 TARGET=RISCV64_GENERIC
+          - target: C910V
+            triple: riscv64-unknown-linux-gnu
+            apt_triple: riscv64-linux-gnu
+            opts: NO_SHARED=1 TARGET=C910V
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+
+      - name: install build deps  # host build tools plus the riscv64 cross GCC/gfortran packages
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y autoconf automake autotools-dev ninja-build make ccache \
+          gcc-${{ matrix.apt_triple }} gfortran-${{ matrix.apt_triple }} libgomp1-riscv64-cross
+
+      - name: checkout qemu
+        uses: actions/checkout@v3
+        with:
+          repository: T-head-Semi/qemu
+          path: qemu
+          ref: 1e692ebb43d396c52352406323fc782c1ac99a42
+
+      - name: build qemu
+        run: |
+          # Force use c910v qemu-user
+          wget https://github.com/revyos/qemu/commit/5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch
+          cd qemu
+          patch -p1 < ../5164bca5a4bcde4534dc1a9aa3a7f619719874cf.patch
+          ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system
+          make -j$(nproc)
+          make install
+
+      - name: Compilation cache
+        uses: actions/cache@v3
+        with:
+          path: ~/.ccache
+          key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
+          restore-keys: |
+            ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
+            ccache-${{ runner.os }}-${{ matrix.target }}
+
+      - name: Configure ccache
+        run: |
+          test -d ~/.ccache || mkdir -p ~/.ccache
+          echo "max_size = 300M" > ~/.ccache/ccache.conf
+          echo "compression = true" >> ~/.ccache/ccache.conf
+          ccache -s
+
+      - name: build OpenBLAS
+        run: |
+            wget ${xuetie_toolchain}/${toolchain_file_name}
+            tar -xvf ${toolchain_file_name} -C /opt
+            export PATH="/opt/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.8.0/bin:$PATH"
+
+            make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc)
+
+      - name: test
+        run: |
+          export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH
+          qemu-riscv64 ./utest/openblas_utest
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat1
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat1
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat1
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xzcblat1
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat2 < ./ctest/sin2
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat2 < ./ctest/din2
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat2 < ./ctest/cin2
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xzcblat2 < ./ctest/zin2
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xscblat3 < ./ctest/sin3
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xdcblat3 < ./ctest/din3
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xccblat3 < ./ctest/cin3
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./ctest/xzcblat3 < ./ctest/zin3
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/sblat1
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/dblat1
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/cblat1
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/zblat1
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/sblat1
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/dblat1
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/cblat1
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/zblat1
+          rm -f ./test/?BLAT2.SUMM
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/sblat2 < ./test/sblat2.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/dblat2 < ./test/dblat2.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/cblat2 < ./test/cblat2.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/zblat2 < ./test/zblat2.dat
+          rm -f ./test/?BLAT2.SUMM
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/sblat2 < ./test/sblat2.dat
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/dblat2 < ./test/dblat2.dat
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/cblat2 < ./test/cblat2.dat
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/zblat2 < ./test/zblat2.dat
+          rm -f ./test/?BLAT3.SUMM
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/sblat3 < ./test/sblat3.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/dblat3 < ./test/dblat3.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/cblat3 < ./test/cblat3.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-riscv64 ./test/zblat3 < ./test/zblat3.dat
+          rm -f ./test/?BLAT3.SUMM
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/sblat3 < ./test/sblat3.dat
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/dblat3 < ./test/dblat3.dat
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/cblat3 < ./test/cblat3.dat
+          OPENBLAS_NUM_THREADS=2 qemu-riscv64 ./test/zblat3 < ./test/zblat3.dat
diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml
index ff40b354d..669aa8116 100644
--- a/.github/workflows/dynamic_arch.yml
+++ b/.github/workflows/dynamic_arch.yml
@@ -2,11 +2,16 @@ name: continuous build
 
 on: [push, pull_request]
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
 permissions:
   contents: read # to fetch code (actions/checkout)
 
 jobs:
   build:
+    if: "github.repository == 'OpenMathLib/OpenBLAS'"
     runs-on: ${{ matrix.os }}
 
     strategy:
@@ -37,6 +42,7 @@ jobs:
       - name: Install Dependencies
         run: |
           if [ "$RUNNER_OS" == "Linux" ]; then
+            sudo apt-get update
             sudo apt-get install -y gfortran cmake ccache libtinfo5
           elif [ "$RUNNER_OS" == "macOS" ]; then
             # It looks like "gfortran" isn't working correctly unless "gcc" is re-installed.
@@ -146,45 +152,59 @@ jobs:
 
 
   msys2:
+    if: "github.repository == 'OpenMathLib/OpenBLAS'"
     runs-on: windows-latest
 
     strategy:
       fail-fast: false
       matrix:
-        msystem: [MINGW64, MINGW32, CLANG64]
+        msystem: [UCRT64, MINGW32, CLANG64, CLANG32]
         idx: [int32, int64]
         build-type: [Release]
         include:
-          - msystem: MINGW64
+          - msystem: UCRT64
             idx: int32
-            target-prefix: mingw-w64-x86_64
-            fc-pkg: mingw-w64-x86_64-gcc-fortran
+            target-prefix: mingw-w64-ucrt-x86_64
+            fc-pkg: fc
           - msystem: MINGW32
             idx: int32
             target-prefix: mingw-w64-i686
-            fc-pkg: mingw-w64-i686-gcc-fortran
+            fc-pkg: fc
           - msystem: CLANG64
             idx: int32
             target-prefix: mingw-w64-clang-x86_64
+            fc-pkg: fc
+            # Compiling with Flang 16 seems to cause test errors on machines
+            # with AVX512 instructions. Revisit after MSYS2 distributes Flang 17.
+            no-avx512-flags: -DNO_AVX512=1
+          - msystem: CLANG32
+            idx: int32
+            target-prefix: mingw-w64-clang-i686
+            fc-pkg: cc
             c-lapack-flags: -DC_LAPACK=ON
-          - msystem: MINGW64
+          - msystem: UCRT64
             idx: int64
             idx64-flags: -DBINARY=64 -DINTERFACE64=1
-            target-prefix: mingw-w64-x86_64
-            fc-pkg: mingw-w64-x86_64-gcc-fortran
+            target-prefix: mingw-w64-ucrt-x86_64
+            fc-pkg: fc
           - msystem: CLANG64
             idx: int64
             idx64-flags: -DBINARY=64 -DINTERFACE64=1
             target-prefix: mingw-w64-clang-x86_64
-            c-lapack-flags: -DC_LAPACK=ON
-          - msystem: MINGW64
+            fc-pkg: fc
+            # Compiling with Flang 16 seems to cause test errors on machines
+            # with AVX512 instructions. Revisit after MSYS2 distributes Flang 17.
+            no-avx512-flags: -DNO_AVX512=1
+          - msystem: UCRT64
             idx: int32
-            target-prefix: mingw-w64-x86_64
-            fc-pkg: mingw-w64-x86_64-gcc-fortran
+            target-prefix: mingw-w64-ucrt-x86_64
+            fc-pkg: fc
             build-type: None
         exclude:
           - msystem: MINGW32
             idx: int64
+          - msystem: CLANG32
+            idx: int64
 
     defaults:
       run:
@@ -209,7 +229,7 @@ jobs:
           install: >-
             base-devel
             ${{ matrix.target-prefix }}-cc
-            ${{ matrix.fc-pkg }}
+            ${{ matrix.target-prefix }}-${{ matrix.fc-pkg }}
             ${{ matrix.target-prefix }}-cmake
             ${{ matrix.target-prefix }}-ninja
             ${{ matrix.target-prefix }}-ccache
@@ -217,14 +237,21 @@ jobs:
       - name: Checkout repository
         uses: actions/checkout@v3
 
-      - name: Compilation cache
-        uses: actions/cache@v3
-        with:
-          # It looks like this path needs to be hard-coded.
-          path: C:/msys64/home/runneradmin/.ccache
+      - name: Prepare ccache
+        # Get cache location of ccache
+        # Create key that is used in action/cache/restore and action/cache/save steps
+        id: ccache-prepare
+        run: |
+          echo "ccachedir=$(cygpath -m $(ccache -k cache_dir))" >> $GITHUB_OUTPUT
           # We include the commit sha in the cache key, as new cache entries are
           # only created if there is no existing entry for the key yet.
-          key: ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}-${{ github.sha }}
+          echo "key=ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}-${{ github.sha }}" >> $GITHUB_OUTPUT
+
+      - name: Restore ccache
+        uses: actions/cache/restore@v3
+        with:
+          path: ${{ steps.ccache-prepare.outputs.ccachedir }}
+          key: ${{ steps.ccache-prepare.outputs.key }}
           # Restore a matching ccache cache entry. Prefer same branch.
           restore-keys: |
             ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}
@@ -234,9 +261,10 @@ jobs:
         # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota.
         run: |
           which ccache
-          test -d ~/.ccache || mkdir -p ~/.ccache
-          echo "max_size = 250M" > ~/.ccache/ccache.conf
-          echo "compression = true" >> ~/.ccache/ccache.conf
+          test -d ${{ steps.ccache-prepare.outputs.ccachedir }} || mkdir -p ${{ steps.ccache-prepare.outputs.ccachedir }}
+          echo "max_size = 250M" > ${{ steps.ccache-prepare.outputs.ccachedir }}/ccache.conf
+          echo "compression = true" >> ${{ steps.ccache-prepare.outputs.ccachedir }}/ccache.conf
+          ccache -p
           ccache -s
           echo $HOME
           cygpath -w $HOME
@@ -253,6 +281,7 @@ jobs:
                 -DTARGET=CORE2 \
                 ${{ matrix.idx64-flags }} \
                 ${{ matrix.c-lapack-flags }} \
+                ${{ matrix.no-avx512-flags }} \
                 -DCMAKE_C_COMPILER_LAUNCHER=ccache \
                 -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
                 ..
@@ -264,12 +293,33 @@ jobs:
         continue-on-error: true
         run: ccache -s
 
+      - name: Save ccache
+        # Save the cache after we are done (successfully) building
+        uses: actions/cache/save@v3
+        with:
+          path: ${{ steps.ccache-prepare.outputs.ccachedir }}
+          key: ${{ steps.ccache-prepare.outputs.key }}
+
       - name: Run tests
+        id: run-ctest
         timeout-minutes: 60
         run: cd build && ctest
 
+      - name: Re-run tests
+        if: always() && (steps.run-ctest.outcome == 'failure')
+        timeout-minutes: 60
+        run: |
+          cd build
+          echo "::group::Re-run ctest"
+          ctest --rerun-failed --output-on-failure || true
+          echo "::endgroup::"
+          echo "::group::Log from these tests"
+          [ ! -f Testing/Temporary/LastTest.log ] || cat Testing/Temporary/LastTest.log
+          echo "::endgroup::"
+
 
   cross_build:
+    if: "github.repository == 'OpenMathLib/OpenBLAS'"
     runs-on: ubuntu-22.04
 
     strategy:
@@ -295,6 +345,7 @@ jobs:
 
       - name: Install Dependencies
         run: |
+          sudo apt-get update
           sudo apt-get install -y ccache gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-${{ matrix.target }}-cross
 
       - name: Compilation cache
diff --git a/.github/workflows/loongarch64.yml b/.github/workflows/loongarch64.yml
new file mode 100644
index 000000000..f1bf8064c
--- /dev/null
+++ b/.github/workflows/loongarch64.yml
@@ -0,0 +1,119 @@
+name: loongarch64 qemu test
+
+on: [push, pull_request]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  TEST:
+    if: "github.repository == 'OpenMathLib/OpenBLAS'"
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - target: LOONGSONGENERIC
+            triple:  loongarch64-unknown-linux-gnu
+            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSONGENERIC
+          - target: LOONGSON3R5
+            triple: loongarch64-unknown-linux-gnu
+            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON3R5
+          - target: LOONGSON2K1000
+            triple: loongarch64-unknown-linux-gnu
+            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000
+          - target: DYNAMIC_ARCH
+            triple: loongarch64-unknown-linux-gnu
+            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+
+      - name: Install APT deps  # every apt command is run non-interactively (-y)
+        run: |
+          sudo add-apt-repository -y ppa:savoury1/virtualisation
+          sudo apt-get update
+          sudo apt-get install -y autoconf automake autotools-dev ninja-build make ccache \
+          qemu-user-static
+
+      - name: Download and install loongarch64-toolchain
+        run: |
+          wget https://github.com/sunhaiyong1978/CLFS-for-LoongArch/releases/download/8.1/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz
+          #wget https://github.com/loongson/build-tools/releases/download/2023.08.08/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz
+          tar -xf CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz -C /opt
+
+      - name: Set env
+        run: |
+          echo "LD_LIBRARY_PATH=/opt/cross-tools/target/usr/lib64:/opt/cross-tools/loongarch64-unknown-linux-gnu/lib64:$LD_LIBRARY_PATH" >> $GITHUB_ENV
+          echo "PATH=$GITHUB_WORKSPACE:/opt/cross-tools/bin:$PATH" >> $GITHUB_ENV
+
+      - name: Compilation cache
+        uses: actions/cache@v3
+        with:
+          path: ~/.ccache
+          key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
+          restore-keys: |
+            ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
+            ccache-${{ runner.os }}-${{ matrix.target }}
+
+      - name: Configure ccache
+        run: |
+          test -d ~/.ccache || mkdir -p ~/.ccache
+          echo "max_size = 300M" > ~/.ccache/ccache.conf
+          echo "compression = true" >> ~/.ccache/ccache.conf
+          ccache -s
+
+      - name: Disable utest dsdot:dsdot_n_1
+        run: |
+          echo -n > utest/test_dsdot.c
+          echo "Due to the qemu versions 7.2 causing utest cases to fail,"
+          echo "the utest dsdot:dsdot_n_1 have been temporarily disabled."
+
+      - name: Build OpenBLAS
+        run: make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc)
+
+      - name: Test
+        run: |
+          qemu-loongarch64-static ./utest/openblas_utest
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat1
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat1
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat1
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat1
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat2 < ./ctest/sin2
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat2 < ./ctest/din2
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat2 < ./ctest/cin2
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat2 < ./ctest/zin2
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat3 < ./ctest/sin3
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat3 < ./ctest/din3
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat3 < ./ctest/cin3
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat3 < ./ctest/zin3
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat1
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat1
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat1
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat1
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat1
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat1
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat1
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat1
+          rm -f ./test/?BLAT2.SUMM
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat
+          rm -f ./test/?BLAT2.SUMM
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat
+          rm -f ./test/?BLAT3.SUMM
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat
+          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat
+          rm -f ./test/?BLAT3.SUMM
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat
+          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat
diff --git a/.github/workflows/mips64.yml b/.github/workflows/mips64.yml
index de7c0c0f3..4686ba713 100644
--- a/.github/workflows/mips64.yml
+++ b/.github/workflows/mips64.yml
@@ -2,11 +2,16 @@ name: mips64 qemu test
 
 on: [push, pull_request]
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
 permissions:
   contents: read # to fetch code (actions/checkout)
 
 jobs:
   TEST:
+    if: "github.repository == 'OpenMathLib/OpenBLAS'"
     runs-on: ubuntu-latest
     strategy:
       fail-fast: false
diff --git a/.github/workflows/nightly-Homebrew-build.yml b/.github/workflows/nightly-Homebrew-build.yml
index 96063565d..ca57fba70 100644
--- a/.github/workflows/nightly-Homebrew-build.yml
+++ b/.github/workflows/nightly-Homebrew-build.yml
@@ -18,11 +18,16 @@ on:
 
 name: Nightly-Homebrew-Build
 
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
 permissions:
   contents: read # to fetch code (actions/checkout)
 
 jobs:
   build-OpenBLAS-with-Homebrew:
+    if: "github.repository == 'OpenMathLib/OpenBLAS'"
     runs-on: macos-latest
     env:
       DEVELOPER_DIR: /Applications/Xcode_11.4.1.app/Contents/Developer
diff --git a/.github/workflows/riscv64_vector.yml b/.github/workflows/riscv64_vector.yml
new file mode 100644
index 000000000..dd6fe9ca8
--- /dev/null
+++ b/.github/workflows/riscv64_vector.yml
@@ -0,0 +1,253 @@
+name: riscv64 zvl256b qemu test
+
+on: [push, pull_request]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read # to fetch code (actions/checkout)
+
+jobs:
+  TEST:
+    if: "github.repository == 'OpenMathLib/OpenBLAS'"
+    runs-on: ubuntu-latest
+    env:
+      triple: riscv64-unknown-linux-gnu
+      riscv_gnu_toolchain: https://github.com/riscv-collab/riscv-gnu-toolchain
+      riscv_gnu_toolchain_version: 13.2.0
+      riscv_gnu_toolchain_nightly_download_path: /releases/download/2024.02.02/riscv64-glibc-ubuntu-22.04-llvm-nightly-2024.02.02-nightly.tar.gz
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - target: RISCV64_ZVL128B
+            opts: TARGET=RISCV64_ZVL128B BINARY=64 ARCH=riscv64
+            qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=128,elen=64
+          - target: RISCV64_ZVL256B
+            opts: TARGET=RISCV64_ZVL256B BINARY=64 ARCH=riscv64
+            qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=256,elen=64
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+
+      - name: install build deps  # apt packages plus the prebuilt toolchain archive unpacked into /opt
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y autoconf automake autotools-dev ninja-build make \
+          libgomp1-riscv64-cross ccache
+          wget ${riscv_gnu_toolchain}/${riscv_gnu_toolchain_nightly_download_path}
+          tar -xvf $(basename ${riscv_gnu_toolchain_nightly_download_path}) -C /opt
+
+      - name: Compilation cache
+        uses: actions/cache@v3
+        with:
+          path: ~/.ccache
+          key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
+          restore-keys: |
+            ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
+            ccache-${{ runner.os }}-${{ matrix.target }}
+
+      - name: Configure ccache
+        run: |
+          test -d ~/.ccache || mkdir -p ~/.ccache
+          echo "max_size = 300M" > ~/.ccache/ccache.conf
+          echo "compression = true" >> ~/.ccache/ccache.conf
+          ccache -s
+
+      - name: build OpenBLAS libs
+        run: |
+            export PATH="/opt/riscv/bin:$PATH"
+            make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \
+            CC='ccache clang --rtlib=compiler-rt -target ${triple} --sysroot /opt/riscv/sysroot --gcc-toolchain=/opt/riscv/lib/gcc/riscv64-unknown-linux-gnu/${riscv_gnu_toolchain_version}/' \
+            AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \
+            RANLIB='ccache ${triple}-ranlib' \
+            FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \
+            HOSTCC=gcc HOSTFC=gfortran -j$(nproc)
+
+      - name: build OpenBLAS tests
+        run: |
+            export PATH="/opt/riscv/bin:$PATH"
+            make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \
+            CC='${triple}-gcc' \
+            AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \
+            RANLIB='ccache ${triple}-ranlib' \
+            FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \
+            HOSTCC=gcc HOSTFC=gfortran -j$(nproc) tests
+
+      - name: build lapack-netlib tests
+        working-directory: ./lapack-netlib/TESTING
+        run: |
+            export PATH="/opt/riscv/bin:$PATH"
+            make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \
+            CC='${triple}-gcc' \
+            AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \
+            RANLIB='ccache ${triple}-ranlib' \
+            FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \
+            HOSTCC=gcc HOSTFC=gfortran -j$(nproc) \
+            LIN/xlintsts LIN/xlintstc LIN/xlintstd LIN/xlintstz LIN/xlintstrfs \
+            LIN/xlintstrfc LIN/xlintstrfd LIN/xlintstrfz LIN/xlintstds \
+            LIN/xlintstzc EIG/xeigtsts EIG/xeigtstc EIG/xeigtstd EIG/xeigtstz \
+
+      - name: OpenBLAS tests
+        shell: bash
+        run: |
+          export PATH="/opt/riscv/bin:$PATH"
+          export QEMU_CPU=${{ matrix.qemu_cpu }}
+          rm -rf ./test_out
+          mkdir -p ./test_out
+          run_test() { local DIR=$1; local CMD=$2; local DATA=$3; local OUTPUT="./test_out/$DIR.$CMD"  # usage: run_test DIR CMD [DATA]
+            echo "`pwd`/$DIR/$CMD $DIR/$DATA" >> $OUTPUT  # header line; tee -a below appends after it instead of truncating it
+            if [[ -z $DATA ]]; then qemu-riscv64 ./$DIR/$CMD |& tee -a $OUTPUT  # no DATA given: run without redirecting stdin
+            else qemu-riscv64 ./$DIR/$CMD < ./$DIR/$DATA |& tee -a $OUTPUT ; fi  # DATA given: feed ./$DIR/$DATA on stdin
+            RV=$? ; if [[ $RV != 0 ]]; then echo "*** FAIL: nonzero exit code $RV" >> $OUTPUT ; fi  # FAIL marker is grepped for after wait
+          }
+          run_test test cblat1 &
+          run_test test cblat2 cblat2.dat &
+          run_test test cblat3 cblat3.dat &
+          run_test test dblat1 &
+          run_test test dblat2 dblat2.dat &
+          run_test test dblat3 dblat3.dat &
+          run_test test sblat1 &
+          run_test test sblat2 sblat2.dat &
+          run_test test sblat3 sblat3.dat &
+          run_test test zblat1 &
+          run_test test zblat2 zblat2.dat &
+          run_test test zblat3 zblat3.dat &
+          run_test ctest xccblat1 &
+          run_test ctest xccblat2 cin2 &
+          run_test ctest xccblat3 cin3 &
+          run_test ctest xdcblat1 &
+          run_test ctest xdcblat2 din2 &
+          run_test ctest xdcblat3 din3 &
+          run_test ctest xscblat1 &
+          run_test ctest xscblat2 sin2 &
+          run_test ctest xscblat3 sin3 &
+          run_test ctest xzcblat1 &
+          run_test ctest xzcblat2 zin2 &
+          run_test ctest xzcblat3 zin3 &
+          wait
+          while IFS= read -r -d $'\0' LOG; do cat $LOG ; FAILURES=1 ; done < <(grep -lZ FAIL ./test_out/*)
+          if [[ ! -z $FAILURES ]]; then echo "==========" ; echo "== FAIL ==" ; echo "==========" ; echo ; exit 1 ; fi
+
+      - name: netlib tests
+        shell: bash
+        run: |
+          : # these take a very long time
+          echo "Skipping netlib tests in CI"
+          exit 0
+          : # comment out exit above to enable the tests
+          : # probably we want to identify a subset to run in CI
+          export PATH="/opt/riscv/bin:$PATH"
+          export QEMU_CPU=${{ matrix.qemu_cpu }}
+          rm -rf ./test_out
+          mkdir -p ./test_out
+          run_test() { local OUTPUT="./test_out/$1"; local DATA="./lapack-netlib/TESTING/$2"; local CMD="./lapack-netlib/TESTING/$3"  # usage: run_test OUT IN CMD TITLE
+            echo "$4" >> $OUTPUT  # header line 1: the title passed as 4th argument
+            echo "$CMD" >> $OUTPUT  # header line 2: the test binary being run
+            qemu-riscv64 $CMD < $DATA |& tee -a $OUTPUT  # -a appends, keeping the two header lines written above
+            RV=$? ; if [[ $RV != 0 ]]; then echo "*** FAIL: nonzero exit code $RV" >> $OUTPUT ; fi
+            if grep -q fail $OUTPUT ; then echo "*** FAIL: log contains 'fail'" >> $OUTPUT ; fi
+            if awk '/rror/ && !/passed/ && !/largest error/ {bad=1} END {exit !bad}' $OUTPUT ; then echo "*** FAIL: log contains 'error'" >> $OUTPUT ; fi  # flags "rror" lines unless they also contain "passed" or "largest error"
+          }
+          run_test stest.out stest.in LIN/xlintsts "Testing REAL LAPACK linear equation routines"  &
+          run_test ctest.out ctest.in LIN/xlintstc "Testing COMPLEX LAPACK linear equation routines"  &
+          run_test dtest.out dtest.in LIN/xlintstd "Testing DOUBLE PRECISION LAPACK linear equation routines"  &
+          run_test ztest.out ztest.in LIN/xlintstz "Testing COMPLEX16 LAPACK linear equation routines"  &
+          run_test dstest.out dstest.in LIN/xlintstds "Testing SINGLE-DOUBLE PRECISION LAPACK prototype linear equation routines"  &
+          run_test zctest.out zctest.in LIN/xlintstzc "Testing COMPLEX-COMPLEX16 LAPACK prototype linear equation routines"  &
+          run_test stest_rfp.out stest_rfp.in LIN/xlintstrfs "Testing REAL LAPACK RFP prototype linear equation routines"  &
+          run_test dtest_rfp.out dtest_rfp.in LIN/xlintstrfd "Testing DOUBLE PRECISION LAPACK RFP prototype linear equation routines"  &
+          run_test ctest_rfp.out ctest_rfp.in LIN/xlintstrfc "Testing COMPLEX LAPACK RFP prototype linear equation routines"  &
+          run_test ztest_rfp.out ztest_rfp.in LIN/xlintstrfz "Testing COMPLEX16 LAPACK RFP prototype linear equation routines"  &
+          run_test snep.out nep.in EIG/xeigtsts "NEP - Testing Nonsymmetric Eigenvalue Problem routines"  &
+          run_test ssep.out sep.in EIG/xeigtsts "SEP - Testing Symmetric Eigenvalue Problem routines"  &
+          run_test sse2.out se2.in EIG/xeigtsts "SEP - Testing Symmetric Eigenvalue Problem routines"  &
+          run_test ssvd.out svd.in EIG/xeigtsts "SVD - Testing Singular Value Decomposition routines"  &
+          run_test sec.out sec.in EIG/xeigtsts "SEC - Testing REAL Eigen Condition Routines"  &
+          run_test sed.out sed.in EIG/xeigtsts "SEV - Testing REAL Nonsymmetric Eigenvalue Driver"  &
+          run_test sgg.out sgg.in EIG/xeigtsts "SGG - Testing REAL Nonsymmetric Generalized Eigenvalue Problem routines"  &
+          run_test sgd.out sgd.in EIG/xeigtsts "SGD - Testing REAL Nonsymmetric Generalized Eigenvalue Problem driver routines"  &
+          run_test ssb.out ssb.in EIG/xeigtsts "SSB - Testing REAL Symmetric Eigenvalue Problem routines"  &
+          run_test ssg.out ssg.in EIG/xeigtsts "SSG - Testing REAL Symmetric Generalized Eigenvalue Problem routines"  &
+          run_test sbal.out sbal.in EIG/xeigtsts "SGEBAL - Testing the balancing of a REAL general matrix"  &
+          run_test sbak.out sbak.in EIG/xeigtsts "SGEBAK - Testing the back transformation of a REAL balanced matrix"  &
+          run_test sgbal.out sgbal.in EIG/xeigtsts "SGGBAL - Testing the balancing of a pair of REAL general matrices"  &
+          run_test sgbak.out sgbak.in EIG/xeigtsts "SGGBAK - Testing the back transformation of a pair of REAL balanced matrices"  &
+          run_test sbb.out sbb.in EIG/xeigtsts "SBB - Testing banded Singular Value Decomposition routines"  &
+          run_test sglm.out glm.in EIG/xeigtsts "GLM - Testing Generalized Linear Regression Model routines"  &
+          run_test sgqr.out gqr.in EIG/xeigtsts "GQR - Testing Generalized QR and RQ factorization routines"  &
+          run_test sgsv.out gsv.in EIG/xeigtsts "GSV - Testing Generalized Singular Value Decomposition routines"  &
+          run_test scsd.out csd.in EIG/xeigtsts "CSD - Testing CS Decomposition routines"  &
+          run_test slse.out lse.in EIG/xeigtsts "LSE - Testing Constrained Linear Least Squares routines"  &
+          run_test cnep.out nep.in EIG/xeigtstc "NEP - Testing Nonsymmetric Eigenvalue Problem routines"  &
+          run_test csep.out sep.in EIG/xeigtstc "SEP - Testing Symmetric Eigenvalue Problem routines"  &
+          run_test cse2.out se2.in EIG/xeigtstc "SEP - Testing Symmetric Eigenvalue Problem routines"  &
+          run_test csvd.out svd.in EIG/xeigtstc "SVD - Testing Singular Value Decomposition routines"  &
+          run_test cec.out cec.in EIG/xeigtstc "CEC - Testing COMPLEX Eigen Condition Routines"  &
+          run_test ced.out ced.in EIG/xeigtstc "CES - Testing COMPLEX Nonsymmetric Schur Form Driver"  &
+          run_test cgg.out cgg.in EIG/xeigtstc "CGG - Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem routines"  &
+          run_test cgd.out cgd.in EIG/xeigtstc "CGD - Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem driver routines"  &
+          run_test csb.out csb.in EIG/xeigtstc "CHB - Testing Hermitian Eigenvalue Problem routines"  &
+          run_test csg.out csg.in EIG/xeigtstc "CSG - Testing Symmetric Generalized Eigenvalue Problem routines"  &
+          run_test cbal.out cbal.in EIG/xeigtstc "CGEBAL - Testing the balancing of a COMPLEX general matrix"  &
+          run_test cbak.out cbak.in EIG/xeigtstc "CGEBAK - Testing the back transformation of a COMPLEX balanced matrix"  &
+          run_test cgbal.out cgbal.in EIG/xeigtstc "CGGBAL - Testing the balancing of a pair of COMPLEX general matrices"  &
+          run_test cgbak.out cgbak.in EIG/xeigtstc "CGGBAK - Testing the back transformation of a pair of COMPLEX balanced matrices"  &
+          run_test cbb.out cbb.in EIG/xeigtstc "CBB - Testing banded Singular Value Decomposition routines"  &
+          run_test cglm.out glm.in EIG/xeigtstc "GLM - Testing Generalized Linear Regression Model routines"  &
+          run_test cgqr.out gqr.in EIG/xeigtstc "GQR - Testing Generalized QR and RQ factorization routines"  &
+          run_test cgsv.out gsv.in EIG/xeigtstc "GSV - Testing Generalized Singular Value Decomposition routines"  &
+          run_test ccsd.out csd.in EIG/xeigtstc "CSD - Testing CS Decomposition routines"  &
+          run_test clse.out lse.in EIG/xeigtstc "LSE - Testing Constrained Linear Least Squares routines"  &
+          run_test dnep.out nep.in EIG/xeigtstd "NEP - Testing Nonsymmetric Eigenvalue Problem routines"  &
+          run_test dsep.out sep.in EIG/xeigtstd "SEP - Testing Symmetric Eigenvalue Problem routines"  &
+          run_test dse2.out se2.in EIG/xeigtstd "SEP - Testing Symmetric Eigenvalue Problem routines"  &
+          run_test dsvd.out svd.in EIG/xeigtstd "SVD - Testing Singular Value Decomposition routines"  &
+          run_test dec.out dec.in EIG/xeigtstd "DEC - Testing DOUBLE PRECISION Eigen Condition Routines"  &
+          run_test ded.out ded.in EIG/xeigtstd "DEV - Testing DOUBLE PRECISION Nonsymmetric Eigenvalue Driver"  &
+          run_test dgg.out dgg.in EIG/xeigtstd "DGG - Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem routines"  &
+          run_test dgd.out dgd.in EIG/xeigtstd "DGD - Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem driver routines"  &
+          run_test dsb.out dsb.in EIG/xeigtstd "DSB - Testing DOUBLE PRECISION Symmetric Eigenvalue Problem routines"  &
+          run_test dsg.out dsg.in EIG/xeigtstd "DSG - Testing DOUBLE PRECISION Symmetric Generalized Eigenvalue Problem routines"  &
+          run_test dbal.out dbal.in EIG/xeigtstd "DGEBAL - Testing the balancing of a DOUBLE PRECISION general matrix"  &
+          run_test dbak.out dbak.in EIG/xeigtstd "DGEBAK - Testing the back transformation of a DOUBLE PRECISION balanced matrix"  &
+          run_test dgbal.out dgbal.in EIG/xeigtstd "DGGBAL - Testing the balancing of a pair of DOUBLE PRECISION general matrices"  &
+          run_test dgbak.out dgbak.in EIG/xeigtstd "DGGBAK - Testing the back transformation of a pair of DOUBLE PRECISION balanced matrices"  &
+          run_test dbb.out dbb.in EIG/xeigtstd "DBB - Testing banded Singular Value Decomposition routines"  &
+          run_test dglm.out glm.in EIG/xeigtstd "GLM - Testing Generalized Linear Regression Model routines"  &
+          run_test dgqr.out gqr.in EIG/xeigtstd "GQR - Testing Generalized QR and RQ factorization routines"  &
+          run_test dgsv.out gsv.in EIG/xeigtstd "GSV - Testing Generalized Singular Value Decomposition routines"  &
+          run_test dcsd.out csd.in EIG/xeigtstd "CSD - Testing CS Decomposition routines"  &
+          run_test dlse.out lse.in EIG/xeigtstd "LSE - Testing Constrained Linear Least Squares routines"  &
+          run_test znep.out nep.in EIG/xeigtstz "NEP - Testing Nonsymmetric Eigenvalue Problem routines"  &
+          run_test zsep.out sep.in EIG/xeigtstz "SEP - Testing Symmetric Eigenvalue Problem routines"  &
+          run_test zse2.out se2.in EIG/xeigtstz "SEP - Testing Symmetric Eigenvalue Problem routines"  &
+          run_test zsvd.out svd.in EIG/xeigtstz "SVD - Testing Singular Value Decomposition routines"  &
+          run_test zec.out zec.in EIG/xeigtstz "ZEC - Testing COMPLEX16 Eigen Condition Routines"  &
+          run_test zed.out zed.in EIG/xeigtstz "ZES - Testing COMPLEX16 Nonsymmetric Schur Form Driver"  &
+          run_test zgg.out zgg.in EIG/xeigtstz "ZGG - Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem routines"  &
+          run_test zgd.out zgd.in EIG/xeigtstz "ZGD - Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem driver routines"  &
+          run_test zsb.out zsb.in EIG/xeigtstz "ZHB - Testing Hermitian Eigenvalue Problem routines"  &
+          run_test zsg.out zsg.in EIG/xeigtstz "ZSG - Testing Symmetric Generalized Eigenvalue Problem routines"  &
+          run_test zbal.out zbal.in EIG/xeigtstz "ZGEBAL - Testing the balancing of a COMPLEX16 general matrix"  &
+          run_test zbak.out zbak.in EIG/xeigtstz "ZGEBAK - Testing the back transformation of a COMPLEX16 balanced matrix"  &
+          run_test zgbal.out zgbal.in EIG/xeigtstz "ZGGBAL - Testing the balancing of a pair of COMPLEX general matrices"  &
+          run_test zgbak.out zgbak.in EIG/xeigtstz "ZGGBAK - Testing the back transformation of a pair of COMPLEX16 balanced matrices"  &
+          run_test zbb.out zbb.in EIG/xeigtstz "ZBB - Testing banded Singular Value Decomposition routines"  &
+          run_test zglm.out glm.in EIG/xeigtstz "GLM - Testing Generalized Linear Regression Model routines"  &
+          run_test zgqr.out gqr.in EIG/xeigtstz "GQR - Testing Generalized QR and RQ factorization routines"  &
+          run_test zgsv.out gsv.in EIG/xeigtstz "GSV - Testing Generalized Singular Value Decomposition routines"  &
+          run_test zcsd.out csd.in EIG/xeigtstz "CSD - Testing CS Decomposition routines"  &
+          run_test zlse.out lse.in EIG/xeigtstz "LSE - Testing Constrained Linear Least Squares routines"  &
+          wait
+          while IFS= read -r -d $'\0' LOG; do cat $LOG ; FAILURES=1 ;  done < <(grep -lZ FAIL ./test_out/*)
+          python ./lapack-netlib/lapack_testing.py -d ./test_out -e > netlib_summary
+          TOTALS="$(grep 'ALL PRECISIONS' netlib_summary)"
+          NUMERICAL_ERRORS=-1  # sentinel: non-zero (=> failure) unless overwritten by the values sourced below
+          OTHER_ERRORS=-1  # same sentinel for the second counter
+          . <(awk '/ALL PRECISIONS/{printf "NUMERICAL_ERRORS=%s\nOTHER_ERRORS=%s\n", $5, $7}' netlib_summary)  # source fields 5 and 7 of the ALL PRECISIONS line as shell assignments
+          if (( NUMERICAL_ERRORS != 0 )) || (( OTHER_ERRORS != 0 )) ; then cat netlib_summary ; FAILURES=1 ; fi
+          if [[ ! -z $FAILURES ]]; then echo "==========" ; echo "== FAIL ==" ; echo "==========" ; echo ; exit 1 ; fi
diff --git a/.gitignore b/.gitignore
index 0fe20ecaa..9fa6a2c99 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,6 +14,7 @@ lapack-3.4.2
 lapack-3.4.2.tgz
 lapack-netlib/make.inc
 lapack-netlib/lapacke/include/lapacke_mangling.h
+lapack-netlib/SRC/la_constants.mod
 lapack-netlib/TESTING/testing_results.txt
 lapack-netlib/INSTALL/test*
 lapack-netlib/TESTING/xeigtstc
@@ -46,6 +47,7 @@ config_last.h
 getarch
 getarch_2nd
 utest/openblas_utest
+utest/openblas_utest_ext
 ctest/xccblat1
 ctest/xccblat2
 ctest/xccblat3
@@ -71,6 +73,7 @@ test/SBLAT3.SUMM
 test/ZBLAT2.SUMM
 test/ZBLAT3.SUMM
 test/SHBLAT3.SUMM
+test/SBBLAT3.SUMM
 test/cblat1
 test/cblat2
 test/cblat3
@@ -81,6 +84,7 @@ test/sblat1
 test/sblat2
 test/sblat3
 test/test_shgemm
+test/test_sbgemm
 test/zblat1
 test/zblat2
 test/zblat3
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 502bf7a9d..9fbe878e6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,7 +8,7 @@ project(OpenBLAS C ASM)
 
 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 23)
+set(OpenBLAS_PATCH_VERSION 26.dev)
 
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
 
@@ -20,8 +20,12 @@ include(CMakePackageConfigHelpers)
 #######
 option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" OFF)
 
+option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, deprecated routines" ON)
+
 option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON)
 
+option(BUILD_BENCHMARKS "Build the collection of BLAS/LAPACK benchmarks" OFF)
+
 option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OFF)
 
 option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
@@ -38,6 +42,11 @@ option(USE_PERL "Use the older PERL scripts for build preparation instead of uni
 
 option(NO_WARMUP "Do not run a benchmark on each startup just to find the best location for the memory buffer" ON)
 
+option(FIXED_LIBNAME "Use a non-versioned name for the library and no symbolic linking to variant names" OFF)
+
+set(LIBNAMEPREFIX "" CACHE STRING  "Add a prefix to the openblas part of the library name" )
+set(LIBNAMESUFFIX "" CACHE STRING  "Add a suffix after the openblas part of the library name" )
+
 if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
   option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON)
 else()
@@ -94,7 +103,7 @@ message(WARNING "CMake support is experimental. It does not yet support all buil
 include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake")
 include("${PROJECT_SOURCE_DIR}/cmake/system.cmake")
 
-set(OpenBLAS_LIBNAME openblas${SUFFIX64_UNDERSCORE})
+set(OpenBLAS_LIBNAME ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE})
 
 set(BLASDIRS interface driver/level2 driver/level3 driver/others)
 
@@ -247,20 +256,21 @@ if (${CMAKE_SYSTEM_NAME} MATCHES "AIX|Android|Linux|FreeBSD|OpenBSD|NetBSD|Drago
   endif()
 endif()
 
-if (APPLE AND DYNAMIC_ARCH AND BUILD_SHARED_LIBS)
+# It seems that this hack is no longer required since macOS 11 Big Sur
+if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20)
   set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
   if (NOT NOFORTRAN)
   set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1)
   set (CMAKE_Fortran_CREATE_SHARED_LIBRARY
- "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' "
- "sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
+ "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' "
+ "sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
  "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '"
  "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'"
  "sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'")
   else ()
   set (CMAKE_C_CREATE_SHARED_LIBRARY
-   "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' "
-   "sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
+   "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' "
+   "sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
    "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'")
   endif ()
 endif()
@@ -309,29 +319,36 @@ endif()
 
 #if (MSVC OR NOT NOFORTRAN)
 if (NOT NO_CBLAS)
+  if (NOT ONLY_CBLAS)
   # Broken without fortran on unix
-  add_subdirectory(utest)
+    add_subdirectory(utest)
+endif()
 endif()
 
 if (NOT NOFORTRAN)
+ if (NOT ONLY_CBLAS)
   # Build test and ctest
   add_subdirectory(test)
-  if (BUILD_TESTING)
+ endif()
+ if (BUILD_TESTING AND NOT BUILD_WITHOUT_LAPACK)
     add_subdirectory(lapack-netlib/TESTING)
   endif()
 endif()
   if(NOT NO_CBLAS)
+   if (NOT ONLY_CBLAS)
     add_subdirectory(ctest)
+   endif()
   endif()
   if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV)
     add_subdirectory(cpp_thread_test)
   endif()
 
+if (NOT FIXED_LIBNAME)
 set_target_properties(${OpenBLAS_LIBS} PROPERTIES
   VERSION ${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}
   SOVERSION ${OpenBLAS_MAJOR_VERSION}
 )
-
+endif()
 if (BUILD_SHARED_LIBS AND BUILD_RELAPACK)
   if (NOT MSVC)
     target_link_libraries(${OpenBLAS_LIBNAME}_shared "-Wl,-allow-multiple-definition")
@@ -398,21 +415,106 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
     message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
   endif()
 
+  if (${BUILD_LAPACK_DEPRECATED})
+	  set (BLD 1)
+  else ()
+	  set (BLD 0)
+  endif()
+  if (${BUILD_BFLOAT16})
+	  set (BBF16 1)
+  else ()
+	  set (BBF16 0)
+  endif()
+  if (${BUILD_SINGLE})
+	  set (BS 1)
+  else ()
+	  set (BS 0)
+  endif()
+  if (${BUILD_DOUBLE})
+	  set (BD 1)
+  else ()
+	  set (BD 0)
+  endif()
+  if (${BUILD_COMPLEX})
+	  set (BC 1)
+  else ()
+	  set (BC 0)
+  endif()
+  if (${BUILD_COMPLEX16})
+	  set (BZ 1)
+  else ()
+	  set (BZ 0)
+  endif()
   if (NOT USE_PERL)
   add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
-    COMMAND  ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" "${BUILD_BFLOAT16}" "${BUILD_SINGLE}" "${BUILD_DOUBLE}" "${BUILD_COMPLEX}" "${BUILD_COMPLEX16}" > ${PROJECT_BINARY_DIR}/objcopy.def
+	  COMMAND  ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def
     COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def  ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
     COMMENT "renaming symbols"
     )
   else()
   add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
-    COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" "${BUILD_BFLOAT16}" "${BUILD_SINGLE}" "${BUILD_DOUBLE}" "${BUILD_COMPLEX}" "${BUILD_COMPLEX16}" > ${PROJECT_BINARY_DIR}/objcopy.def
+    COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol.pl "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BLD}" "${BBF16}" "${BS}" "${BD}" "${BC}" "${BZ}" > ${PROJECT_BINARY_DIR}/objcopy.def
     COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def  ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so
     COMMENT "renaming symbols"
     )
   endif()
 endif()
 
+if (BUILD_BENCHMARKS)
+	#find_package(OpenMP REQUIRED)
+  file(GLOB SOURCES "benchmark/*.c")
+  if (NOT USE_OPENMP)
+  file(GLOB REMFILE "benchmark/smallscaling.c")
+  list(REMOVE_ITEM SOURCES ${REMFILE})
+  endif()
+  if (BUILD_WITHOUT_LAPACK)
+  file(GLOB REMFILE "benchmark/cholesky.c")
+  list(REMOVE_ITEM SOURCES ${REMFILE})
+  file(GLOB REMFILE "benchmark/geev.c")
+  list(REMOVE_ITEM SOURCES ${REMFILE})
+  file(GLOB REMFILE "benchmark/gesv.c")
+  list(REMOVE_ITEM SOURCES ${REMFILE})
+  file(GLOB REMFILE "benchmark/getri.c")
+  list(REMOVE_ITEM SOURCES ${REMFILE})
+  file(GLOB REMFILE "benchmark/potrf.c")
+  list(REMOVE_ITEM SOURCES ${REMFILE})
+  file(GLOB REMFILE "benchmark/spmv.c")
+  list(REMOVE_ITEM SOURCES ${REMFILE})
+  file(GLOB REMFILE "benchmark/symv.c")
+  list(REMOVE_ITEM SOURCES ${REMFILE})
+  file(GLOB REMFILE "benchmark/linpack.c")
+  list(REMOVE_ITEM SOURCES ${REMFILE})
+  endif()
+  if (NOT USE_GEMM3M)
+  file(GLOB REMFILE "benchmark/gemm3m.c")
+  list(REMOVE_ITEM SOURCES ${REMFILE})
+  endif()
+  foreach(source ${SOURCES})
+    get_filename_component(name ${source} NAME_WE)
+    if ((NOT ${name} STREQUAL "zdot-intel") AND (NOT ${name} STREQUAL "cula_wrapper"))
+        set(defines DEFAULT COMPLEX DOUBLE "COMPLEX\;DOUBLE")
+      foreach(define ${defines})
+        set(target_name "benchmark_${name}")
+        if (NOT "${define}" STREQUAL "DEFAULT")
+          string(JOIN "_" define_str ${define})
+          set(target_name "${target_name}_${define_str}")
+        endif()
+        if ((NOT ${target_name} STREQUAL "benchmark_imax_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_imax_COMPLEX_DOUBLE") AND
+            (NOT ${target_name} STREQUAL "benchmark_imin_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_imin_COMPLEX_DOUBLE") AND
+            (NOT ${target_name} STREQUAL "benchmark_max_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_max_COMPLEX_DOUBLE") AND
+            (NOT ${target_name} STREQUAL "benchmark_min_COMPLEX") AND (NOT ${target_name} STREQUAL "benchmark_min_COMPLEX_DOUBLE"))
+          add_executable(${target_name} ${source})
+          target_include_directories(${target_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
+	    target_link_libraries(${target_name} ${OpenBLAS_LIBNAME} )
+	  #  target_link_libraries(${target_name} ${OpenBLAS_LIBNAME} OpenMP::OpenMP_C)
+          if (NOT "${define}" STREQUAL "DEFAULT")
+            target_compile_definitions(${target_name} PRIVATE ${define})
+          endif()
+        endif()
+      endforeach()
+    endif()
+  endforeach()
+endif()
 
 
 # Install project
@@ -503,7 +605,7 @@ if(NOT NO_LAPACKE)
   ADD_CUSTOM_TARGET(genlapacke
   COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h"
   )
-  install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64})
+  install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
 endif()
 
 # Install pkg-config files
@@ -511,9 +613,8 @@ configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/
 install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/)
 
 
-# GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share".
 set(PN OpenBLAS)
-set(CMAKECONFIG_INSTALL_DIR "share/cmake/${PN}${SUFFIX64}")
+set(CMAKECONFIG_INSTALL_DIR "${CMAKE_INSTALL_LIBDIR}/cmake/${PN}${SUFFIX64}")
 configure_package_config_file(cmake/${PN}Config.cmake.in
                               "${CMAKE_CURRENT_BINARY_DIR}/${PN}${SUFFIX64}Config.cmake"
                               INSTALL_DESTINATION ${CMAKECONFIG_INSTALL_DIR})
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index f5e9dda91..203320826 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -23,6 +23,9 @@
   * Optimization on AMD Piledriver
   * Optimization on Intel Haswell
 
+* Chris Sidebottom <chris.sidebottom@arm.com>
+  * Optimizations and other improvements targeting AArch64
+
 ## Previous Developers
 
 * Zaheer Chothia <zaheer.chothia@gmail.com>
@@ -212,4 +215,11 @@ In chronological order:
   * [2022-03] Support RISC-V Vector Intrinisc 1.0 version.
   
 * Pablo Romero <https://github.com/pablorcum>
-  * [2022-08] Fix building from sources for QNX
\ No newline at end of file
+  * [2022-08] Fix building from sources for QNX
+
+* Mark Seminatore <https://github.com/mseminatore>
+  * [2023-11-09] Improve Windows threading performance scaling
+  * [2024-02-09] Introduce MT_TRACE facility and improve code consistency
+
+* Dirreke <https://github.com/Dirreke>
+  * [2024-01-16] Add basic support for the CSKY architecture
diff --git a/Changelog.txt b/Changelog.txt
index aa445ae82..b6139d6b7 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,195 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.3.26
+ 2-Jan-2024
+
+general:
+- improved the version of openblas.pc that is created by the CMAKE build
+- fixed a CMAKE-specific build problem on older versions of MacOS
+- worked around linking problems on old versions of MacOS
+- corrected installation location of the lapacke_mangling header in CMAKE builds
+- added type declarations for complex variables to the MSVC-specific parts of the LAPACK header
+- significantly sped up ?GESV for small problem sizes by introducing a lower bound for multithreading
+- imported additions and corrections from the Reference-LAPACK project:
+  - added new LAPACK functions for truncated QR with pivoting (Reference-LAPACK PRs 891&941)
+  - handle miscalculation of minimum work array size in corner cases (Reference-LAPACK PR 942)
+  - fixed use of uninitialized variables in ?GEDMD and improved inline documentation (PR 959)
+  - fixed use of uninitialized variables (and consequential failures) in ?BBCSD (PR 967)
+  - added tests for the recently introduced Dynamic Mode Decomposition functions (PR 736)
+  - fixed several memory leaks in the LAPACK testsuite (PR 953)
+  - fixed counting of testsuite results by the Python script (PR 954)
+   
+x86-64:
+- fixed computation of CASUM on SkylakeX and newer targets in the special
+  case that AVX512 is not supported by the compiler or operating environment
+- fixed potential undefined behaviour in the CASUM/ZASUM kernels for AVX512 targets
+- worked around a problem in the pre-AVX kernels for GEMV
+- sped up the thread management code on MS Windows
+
+arm64:
+- fixed building of the LAPACK testsuite with Xcode 15 on Apple M1 and newer
+- sped up the thread management code on MS Windows
+- sped up SGEMM and DGEMM on Neoverse V1 and N1
+- sped up ?DOT on SVE-capable targets
+- reduced the number of targets in DYNAMIC_ARCH builds by eliminating functionally equivalent ones
+- included support for Apple M1 and newer targets in DYNAMIC_ARCH builds
+
+power:
+- improved the SGEMM kernel for POWER10
+- fixed compilation with (very) old versions of gcc
+- fixed detection of old 32bit PPC targets in CMAKE-based builds
+- added autodetection of the POWERPC 7400 subtype
+- fixed CMAKE-based compilation for PPCG4 and PPC970 targets
+
+loongarch64:
+- added and improved optimized kernels for almost all BLAS functions
+
+====================================================================
+Version 0.3.25
+ 12-Nov-2023
+
+general:
+- improved the error message shown on exceeding the maximum thread count
+- improved the code to add supplementary thread buffers in case of overflow
+- fixed a potential division by zero in ?ROTG
+- improved the ?MATCOPY functions to accept zero-sized rows or columns
+- corrected empty prototypes in function declarations
+- cleaned up unused declarations in the f2c-converted versions of the LAPACK sources
+- fixed compilation with the Cray CCE Compiler suite
+- improved link line rewriting to avoid mixed libgomp/libomp builds with clang&gfortran
+- worked around OPENMP builds with LLVM14's libomp hanging on FreeBSD
+- improved the Makefiles to require less option duplication on "make install"
+- imported the following changes from the upcoming release 3.12 of Reference-LAPACK
+  - deprecate utility functions ?GELQS and ?GEQRS (LAPACK PR 900)
+  - apply rounding up to workspace calculations done in floating point (LAPACK PR 904)
+  - avoid overflow in STGEX2/DTGEX2 (LAPACK PR 907)
+  - fix accumulation in ?LASSQ (LAPACK PR 909)
+  - fix handling of NaN values in ?GECON (LAPACK PR 926)
+  - avoid overflow in CBDSQR/ZBDSQR (LAPACK PR 927)
+  - fix poor vector orthogonalizations in ?ORBDB5/?UNBDB5 (LAPACK PR 928 & 930)
+
+x86-64:
+- fixed compile-time autodetection of AMD Ryzen3 and Ryzen4 cpus
+- fixed capability-based fallback selection for unknown cpus in DYNAMIC_ARCH
+- added AVX512 optimizations for ?ASUM on Sapphire Rapids and Cooper Lake
+
+ARM64:
+- fixed building on Apple with homebrew gcc
+- fixed building with XCODE 15
+- fixed building on A64FX and Cortex A710/X1/X2
+- increased the default buffer size for recent ARM server cpus 
+
+POWER:
+- fixed building with the IBM xlf 16.1.1 compiler
+- fixed building with IBM XL C
+- added support for DYNAMIC_ARCH builds with clang
+- fixed union declaration in the BFLOAT16 test case
+- enabled optimizations for the AIX assembler on POWER10
+
+LOONGARCH64:
+- added an optimized SGEMV kernel
+- added an optimized DTRSM kernel
+
+====================================================================
+Version 0.3.24
+ 03-Sep-2023
+
+general:
+   - declared the arguments of cblas_xerbla as const (in accordance with the reference implementation 
+     and others, the previous discrepancy appears to have dated back to GotoBLAS)
+   - fixed the implementation of ?GEMMT that was added in 0.3.23
+   - made cpu-specific SWITCH_RATIO parameters for GEMM available to DYNAMIC_ARCH builds
+   - fixed application of SYMBOLSUFFIX in CMAKE builds
+   - fixed missing SSYCONVF function in the shared library
+   - fixed parallel build logic used with gmake
+   - added support for compilation with LLVM17, in particular its new Fortran compiler
+   - added support for CMAKE builds using the NVIDIA HPC compiler
+   - fixed INTERFACE64 builds with CMAKE and the f95 Fortran compiler
+   - fixed cross-build detection and management in c_check
+   - disabled building of the tests with CMAKE when ONLY_CBLAS is defined
+   - fixed several issues with the handling of runtime limits on the number of OPENMP threads
+   - corrected the error code returned by SGEADD/DGEADD when LDA is too small
+   - corrected the error code returned by IMATCOPY when LDB is too small
+   - updated ?NRM2 to support negative increment values (as introduced in release 3.10 
+     of the reference BLAS)
+   - fixed OpenMP builds with CLANG for the case where libomp is not in a standard location
+   - fixed a potential overwrite of unrelated memory during thread initialisation on startup
+   - fixed a potential integer overflow in the multithreading threshold for ?SYMM/?SYRK
+   - fixed build of the LAPACKE interfaces for the LAPACK 3.11.0 ?TRSYL functions added in 0.3.22
+   - fixed installation of .cmake files in concurrent 32 and 64bit builds with CMAKE
+   - applied additions and corrections from the development branch of Reference-LAPACK:
+   - fixed actual arguments passed to a number of LAPACK functions (from Reference-LAPACK PR 885)
+   - fixed workspace query results in LAPACK ?SYTRF/?TRECV3 (from Reference-LAPACK PR 883)
+   - fixed derivation of the UPLO parameter in LAPACKE_?larfb (from Reference-LAPACK PR 878)
+   - fixed a crash in LAPACK ?GELSDD on NRHS=0 (from Reference-LAPACK PR 876)
+   - added new LAPACK utility functions CRSCL and ZRSCL (from Reference-LAPACK PR 839)
+	- corrected the order of eigenvalues for 2x2 matrices in ?STEMR (Reference-LAPACK PR 867)
+	- removed spurious reference to OpenMP variables outside OpenMP contexts (Reference-LAPACK PR 860)
+	- updated file comments on use of LAMBDA variable in LAPACK (Reference-LAPACK PR 852)
+	- fixed documentation of LAPACK SLASD0/DLASD0 (Reference-LAPACK PR 855)
+	- fixed confusing use of "minor" in LAPACK documentation (Reference-LAPACK PR 849)
+	- added new LAPACK functions ?GEDMD for dynamic mode decomposition (Reference-LAPACK PR 736)
+	- fixed potential stack overflows in the EIG part of the LAPACK testsuite (Reference-LAPACK PR 854)
+	- applied small improvements to the variants of Cholesky and QR functions (Reference-LAPACK PR 847)
+	- removed unused variables from LAPACK ?BDSQR (Reference-LAPACK PR 832)
+	- fixed a potential crash on allocation failure in LAPACKE SGEESX/DGEESX (Reference-LAPACK PR 836)
+	- added a quick return from SLARUV/DLARUV for N < 1 (Reference-LAPACK PR 837)
+	- updated function descriptions in LAPACK ?GEGS/?GEGV (Reference-LAPACK PR 831)
+	- improved algorithm description in ?GELSY (Reference-LAPACK PR 833)
+	- fixed scaling in LAPACK STGSNA/DTGSNA (Reference-LAPACK PR 830)
+	- fixed crash in LAPACKE_?geqrt with row-major data (Reference-LAPACK PR 768)
+	- added LAPACKE interfaces for C/ZUNHR_COL and S/DORHR_COL (Reference-LAPACK PR 827)
+	- added error exit tests for SYSV/SYTD2/GEHD2 to the testsuite (Reference-LAPACK PR 795)
+	- fixed typos in LAPACK source and comments (Reference-LAPACK PRs 809,811,812,814,820)
+	- adopted the refactored ?GEBAL implementation (Reference-LAPACK PR 808)
+
+x86_64:
+   - added cpu model autodetection for Intel Alder Lake N
+   - added activation of the AMX tile to the Sapphire Rapids SBGEMM kernel
+   - worked around miscompilations of GEMV/SYMV kernels by gcc's tree-vectorizer
+   - fixed compilation of Cooperlake and Sapphire Rapids kernels with CLANG
+   - fixed runtime detection of Cooperlake and Sapphire Rapids in DYNAMIC_ARCH
+   - fixed feature-based cputype fallback in DYNAMIC_ARCH
+   - added support for building the AVX512 kernels with the NVIDIA HPC compiler
+   - corrected ZAXPY result on old pre-AVX hardware for the INCX=0 case
+   - fixed a potential use of uninitialized variables in ZTRSM
+
+ARM64:
+   - added cpu model autodetection for Apple M2
+   - fixed wrong results of CGEMM/CTRMM/DNRM2 under OSX (use of reserved register)
+   - added support for building the SVE kernels with the NVIDIA HPC compiler
+   - added support for building the SVE kernels with the Apple Clang compiler
+   - fixed compiler option handling for building the SVE kernels with LLVM
+   - implemented SWITCH_RATIO parameter for improved GEMM performance on Neoverse
+   - activated SVE SGEMM and DGEMM kernels for Neoverse V1
+   - improved performance of the SVE CGEMM and ZGEMM kernels on Neoverse V1
+   - improved kernel selection for the ARMV8SVE target and added it to DYNAMIC_ARCH
+   - fixed runtime check for SVE availability in DYNAMIC_ARCH builds to take OS or
+     container restrictions into account
+   - fixed a potential use of uninitialized variables in ZTRSM
+   - fixed a potential misdetection of ARMV8 hardware as 32bit in CMAKE builds
+
+LOONGARCH64:
+   - added ABI detection
+   - added support for cpu affinity handling
+   - fixed compilation with early versions of the Loongson toolchain
+   - added an optimized SGEMM kernel for 3A5000
+   - added optimized DGEMV kernels for 3A5000
+   - improved the performance of the DGEMM kernel for 3A5000
+
+MIPS64:
+   - fixed miscompilation of TRMM kernels for the MIPS64_GENERIC target
+
+POWER:
+   - fixed compiler warnings in the POWER10 SBGEMM kernel
+
+RISCV:
+   - fixed application of the INTERFACE64 option when building with CMAKE
+   - fixed a potential misdetection of RISCV hardware as 32bit in CMAKE builds
+   - fixed IDAMAX and DOT kernels for C910V
+   - fixed corner cases in the ROT and SWAP kernels for C910V
+   - fixed compilation of the C910V target with recent vendor compilers
+
 ====================================================================
 Version 0.3.23
  01-Apr-2023
diff --git a/GotoBLAS_06WeirdPerformance.txt b/GotoBLAS_06WeirdPerformance.txt
index 05766e17b..0f7cec5c9 100644
--- a/GotoBLAS_06WeirdPerformance.txt
+++ b/GotoBLAS_06WeirdPerformance.txt
@@ -11,7 +11,7 @@
    operation is finished.
 
 
-2. Simlar problem may happen under virtual machine. If supervisor
+2. Similar problem may happen under virtual machine. If supervisor
    allocates different cores for each scheduling, BLAS performnace
    will be bad. This is because BLAS also utilizes all cache,
    unexpected re-schedule for different core may result of heavy
diff --git a/Jenkinsfile b/Jenkinsfile
index 2b61bed9f..baeeee59f 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -1,9 +1,14 @@
-node {
-        stage('Checkout') {
-            checkout
+pipeline {
+    agent { 
+        docker {
+            image 'osuosl/ubuntu-s390x'
         }
-
+    }
+    stages {
         stage('Build') {
-            sh("make")
+            steps {
+                sh 'make clean && make'
+            }
         }
+    }
 }
diff --git a/Jenkinsfile.pwr b/Jenkinsfile.pwr
new file mode 100644
index 000000000..96e18b8ad
--- /dev/null
+++ b/Jenkinsfile.pwr
@@ -0,0 +1,16 @@
+pipeline {
+    agent { 
+        docker {
+            image 'osuosl/ubuntu-ppc64le'
+        }
+    }
+    stages {
+        stage('Build') {
+            steps {
+                sh 'sudo apt update'
+                sh 'sudo apt install gfortran -y'
+                sh 'make clean && make'
+            }
+        }
+    }
+}
diff --git a/Makefile b/Makefile
index 144b3400d..c04074795 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,9 @@
 TOPDIR	= .
 include ./Makefile.system
+LNCMD = ln -fs
+ifeq ($(FIXED_LIBNAME), 1)
+LNCMD = true
+endif
 
 BLASDIRS = interface driver/level2 driver/level3 driver/others
 
@@ -35,14 +39,18 @@ export NO_LAPACK
 export C_LAPACK
 endif
 
+ifeq ($(F_COMPILER),CRAY)
+LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -Og -Os,$(LAPACK_FFLAGS))
+else
 LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS))
+endif
 
 SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test
 
 .PHONY : all libs netlib $(RELA) test ctest shared install
-.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test
+.NOTPARALLEL : shared
 
-all :: libs netlib $(RELA) tests shared
+all :: tests
 	@echo
 	@echo " OpenBLAS build complete. ($(LIB_COMPONENTS))"
 	@echo
@@ -130,17 +138,17 @@ shared : libs netlib $(RELA)
 ifneq ($(NO_SHARED), 1)
 ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly))
 	@$(MAKE) -C exports so
-	@ln -fs $(LIBSONAME) $(LIBPREFIX).so
-	@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
+	@$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so
+	@$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
 endif
 ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD))
 	@$(MAKE) -C exports so
-	@ln -fs $(LIBSONAME) $(LIBPREFIX).so
+	@$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so
 endif
 ifeq ($(OSNAME), Darwin)
 	@$(MAKE) -C exports dyn
-	@ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
-	@ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
+	@$(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).dylib
+	@$(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
 endif
 ifeq ($(OSNAME), WINNT)
 	@$(MAKE) -C exports dll
@@ -148,9 +156,12 @@ endif
 ifeq ($(OSNAME), CYGWIN_NT)
 	@$(MAKE) -C exports dll
 endif
+ifeq ($(OSNAME), AIX)
+	@$(MAKE) -C exports so
+endif
 endif
 
-tests : libs netlib $(RELA) shared
+tests : shared
 ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
 	touch $(LIBNAME)
 ifndef NO_FBLAS
@@ -206,16 +217,32 @@ ifeq ($(DYNAMIC_OLDER), 1)
 	@echo DYNAMIC_OLDER=1 >> Makefile.conf_last
 endif	
 endif
+	@echo TARGET=$(CORE) >> Makefile.conf_last
 ifdef USE_THREAD
 	@echo USE_THREAD=$(USE_THREAD) >>  Makefile.conf_last
 endif
-	@-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
+ifdef SMP
+ifdef NUM_THREADS
+	@echo NUM_THREADS=$(NUM_THREADS) >>  Makefile.conf_last
+else
+	@echo NUM_THREADS=$(NUM_CORES) >>  Makefile.conf_last
+endif
+endif
+ifeq ($(USE_OPENMP),1)
+	@echo USE_OPENMP=1 >>  Makefile.conf_last
+endif
+ifeq ($(INTERFACE64),1)
+	@echo INTERFACE64=1 >>  Makefile.conf_last
+endif
+	@echo THELIBNAME=$(LIBNAME) >>  Makefile.conf_last
+	@echo THELIBSONAME=$(LIBSONAME) >>  Makefile.conf_last
+	@-$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
 	@touch lib.grd
 
 prof : prof_blas prof_lapack
 
 prof_blas :
-	ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
+	$(LNCMD) $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
 	for d in $(SUBDIRS) ; \
 	do if test -d $$d; then \
 	  $(MAKE) -C $$d prof || exit 1 ; \
@@ -226,7 +253,7 @@ ifeq ($(DYNAMIC_ARCH), 1)
 endif
 
 blas :
-	ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
+	$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
 	for d in $(BLASDIRS) ; \
 	do if test -d $$d; then \
 	  $(MAKE) -C $$d libs || exit 1 ; \
@@ -234,7 +261,7 @@ blas :
 	done
 
 hpl :
-	ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
+	$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
 	for d in $(BLASDIRS) ../laswp exports ; \
 	do if test -d $$d; then \
 	  $(MAKE) -C $$d $(@F) || exit 1 ; \
@@ -248,7 +275,7 @@ ifeq ($(DYNAMIC_ARCH), 1)
 endif
 
 hpl_p :
-	ln -fs $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
+	$(LNCMD) $(LIBNAME_P) $(LIBPREFIX)_p.$(LIBSUFFIX)
 	for d in $(SUBDIRS) ../laswp exports ; \
 	do if test -d $$d; then \
 	  $(MAKE) -C $$d $(@F) || exit 1 ; \
@@ -373,14 +400,15 @@ ifneq ($(CROSS), 1)
 	(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING)
 endif
 
-lapack-runtest:
+lapack-runtest: lapack-test
 	( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \
         ./testsecond; ./testdsecnd; ./testieee; ./testversion )
-	(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
+	(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING )
 
 
 blas-test:
 	(cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && rm -f x* *.out)
+
 	$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing
 	(cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && cat *.out)
 
diff --git a/Makefile.L3 b/Makefile.L3
deleted file mode 100644
index 76586d826..000000000
--- a/Makefile.L3
+++ /dev/null
@@ -1,5089 +0,0 @@
-USE_GEMM3M = 0
-OS := $(shell uname)
-
-ifeq ($(ARCH), x86)
-USE_GEMM3M = 1
-endif
-
-ifeq ($(ARCH), x86_64)
-USE_GEMM3M = 1
-endif
-
-ifeq ($(ARCH), x86_64)
-USE_DIRECT_SGEMM = 1
-endif
-
-ifeq ($(ARCH), ia64)
-USE_GEMM3M = 1
-endif
-
-ifeq ($(ARCH), arm)
-USE_TRMM = 1
-endif
-
-ifeq ($(ARCH), arm64)
-USE_TRMM = 1
-endif
-
-ifeq ($(ARCH), riscv64)
-USE_TRMM = 1
-endif
-
-ifneq ($(DYNAMIC_ARCH), 1)
-ifeq ($(TARGET), GENERIC)
-USE_TRMM = 1
-endif
-endif
-
-ifeq ($(CORE), HASWELL)
-USE_TRMM = 1
-endif
-
-ifeq ($(CORE), SKYLAKEX)
-USE_TRMM = 1
-endif
-
-ifeq ($(CORE), COOPERLAKE)
-USE_TRMM = 1
-endif
-
-ifeq ($(CORE), SAPPHIRERAPIDS)
-USE_TRMM = 1
-endif
-
-ifeq ($(CORE), ZEN)
-USE_TRMM = 1
-endif
-
-ifeq ($(CORE), POWER8)
-ifeq ($(BINARY64),1)
-USE_TRMM = 1
-endif
-endif
-
-ifeq ($(CORE), POWER9)
-USE_TRMM = 1
-endif
-
-ifeq ($(CORE), POWER10)
-USE_TRMM = 1
-endif
-
-ifeq ($(ARCH), zarch)
-USE_TRMM = 1
-endif
-
-ifeq ($(CORE), Z14)
-USE_TRMM = 1
-endif
-
-ifdef USE_DIRECT_SGEMM
-ifndef SGEMMDIRECTKERNEL
-SGEMMDIRECTKERNEL = sgemm_direct_skylakex.c
-SGEMMDIRECTPERFORMANT = sgemm_direct_performant.c
-endif
-endif
-
-ifeq ($(BUILD_BFLOAT16), 1)
-ifndef SBGEMMKERNEL
-SBGEMM_BETA = ../generic/gemm_beta.c
-SBGEMMKERNEL    = ../generic/gemmkernel_2x2.c
-SBGEMMINCOPY    = ../generic/gemm_ncopy_2.c
-SBGEMMITCOPY    = ../generic/gemm_tcopy_2.c
-SBGEMMONCOPY    = ../generic/gemm_ncopy_2.c
-SBGEMMOTCOPY    = ../generic/gemm_tcopy_2.c
-SBGEMMINCOPYOBJ =  sbgemm_incopy$(TSUFFIX).$(SUFFIX)
-SBGEMMITCOPYOBJ =  sbgemm_itcopy$(TSUFFIX).$(SUFFIX)
-SBGEMMONCOPYOBJ =  sbgemm_oncopy$(TSUFFIX).$(SUFFIX)
-SBGEMMOTCOPYOBJ =  sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
-endif
-
-SBKERNELOBJS	+= \
-	sbgemm_kernel$(TSUFFIX).$(SUFFIX) \
-	$(SBGEMMINCOPYOBJ) $(SBGEMMITCOPYOBJ) \
-	$(SBGEMMONCOPYOBJ) $(SBGEMMOTCOPYOBJ)
-endif
-
-ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" ""
-SKERNELOBJS	+= \
-	sgemm_kernel$(TSUFFIX).$(SUFFIX) \
-	sgemm_beta$(TSUFFIX).$(SUFFIX) \
-	$(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \
-	$(SGEMMONCOPYOBJ) $(SGEMMOTCOPYOBJ)
-
-ifdef USE_DIRECT_SGEMM
-SKERNELOBJS += \
-	sgemm_direct$(TSUFFIX).$(SUFFIX) \
-	sgemm_direct_performant$(TSUFFIX).$(SUFFIX) 
-endif
-endif
-
-ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" ""
-DKERNELOBJS	+= \
-	dgemm_beta$(TSUFFIX).$(SUFFIX) \
-	dgemm_kernel$(TSUFFIX).$(SUFFIX) \
-	$(DGEMMINCOPYOBJ) $(DGEMMITCOPYOBJ) \
-	$(DGEMMONCOPYOBJ) $(DGEMMOTCOPYOBJ)
-endif
-
-QKERNELOBJS	+= \
-	qgemm_kernel$(TSUFFIX).$(SUFFIX) \
-	$(QGEMMINCOPYOBJ) $(QGEMMITCOPYOBJ) \
-	$(QGEMMONCOPYOBJ) $(QGEMMOTCOPYOBJ)
-
-ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" ""
-CKERNELOBJS	+= \
-	cgemm_kernel_n$(TSUFFIX).$(SUFFIX) cgemm_kernel_r$(TSUFFIX).$(SUFFIX) \
-	cgemm_kernel_l$(TSUFFIX).$(SUFFIX) cgemm_kernel_b$(TSUFFIX).$(SUFFIX) \
-	$(CGEMMINCOPYOBJ) $(CGEMMITCOPYOBJ) \
-	$(CGEMMONCOPYOBJ) $(CGEMMOTCOPYOBJ)
-endif
-
-ifeq ($(BUILD_COMPLEX16),1)
-ZKERNELOBJS	+= \
-	zgemm_kernel_n$(TSUFFIX).$(SUFFIX) zgemm_kernel_r$(TSUFFIX).$(SUFFIX) \
-	zgemm_kernel_l$(TSUFFIX).$(SUFFIX) zgemm_kernel_b$(TSUFFIX).$(SUFFIX) \
-	$(ZGEMMINCOPYOBJ) $(ZGEMMITCOPYOBJ) \
-	$(ZGEMMONCOPYOBJ) $(ZGEMMOTCOPYOBJ)
-endif
-
-XKERNELOBJS	+= \
-	xgemm_kernel_n$(TSUFFIX).$(SUFFIX) xgemm_kernel_r$(TSUFFIX).$(SUFFIX) \
-	xgemm_kernel_l$(TSUFFIX).$(SUFFIX) xgemm_kernel_b$(TSUFFIX).$(SUFFIX) \
-	$(XGEMMINCOPYOBJ) $(XGEMMITCOPYOBJ) \
-	$(XGEMMONCOPYOBJ) $(XGEMMOTCOPYOBJ)
-
-ifeq ($(BUILD_BFLOAT16),1)
-SBBLASOBJS      += $(SBKERNELOBJS)
-endif
-SBLASOBJS	+= $(SKERNELOBJS)
-DBLASOBJS	+= $(DKERNELOBJS)
-QBLASOBJS	+= $(QKERNELOBJS)
-CBLASOBJS	+= $(CKERNELOBJS)
-ZBLASOBJS	+= $(ZKERNELOBJS)
-XBLASOBJS	+= $(XKERNELOBJS)
-
-ifeq ($(BUILD_BFLOAT16),1)
-SBBLASOBJS += sbgemm_beta$(TSUFFIX).$(SUFFIX)
-endif
-
-ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" ""
-SBLASOBJS	+= \
-	sgemm_beta$(TSUFFIX).$(SUFFIX) \
-	strmm_kernel_LN$(TSUFFIX).$(SUFFIX) strmm_kernel_LT$(TSUFFIX).$(SUFFIX) \
-	strmm_kernel_RN$(TSUFFIX).$(SUFFIX) strmm_kernel_RT$(TSUFFIX).$(SUFFIX) \
-	strsm_kernel_LN$(TSUFFIX).$(SUFFIX) strsm_kernel_LT$(TSUFFIX).$(SUFFIX) \
-	strsm_kernel_RN$(TSUFFIX).$(SUFFIX) strsm_kernel_RT$(TSUFFIX).$(SUFFIX) 
-endif
-
-ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" ""
-DBLASOBJS	+= \
-	dgemm_beta$(TSUFFIX).$(SUFFIX) \
-	dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \
-	dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \
-	dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \
-	dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) dtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) 
-endif
-
-QBLASOBJS	+= \
-	qgemm_beta$(TSUFFIX).$(SUFFIX) \
-	qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \
-	qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \
-	qtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) qtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \
-	qtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) qtrsm_kernel_RT$(TSUFFIX).$(SUFFIX)
-
-ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" ""
-CBLASOBJS	+= \
-	ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \
-	ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) \
-	ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \
-	ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX)
-endif
-ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" ""
-CBLASOBJS	+= \
-	cgemm_beta$(TSUFFIX).$(SUFFIX) \
-	ctrsm_kernel_LN$(TSUFFIX).$(SUFFIX) ctrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \
-	ctrsm_kernel_LR$(TSUFFIX).$(SUFFIX) ctrsm_kernel_LC$(TSUFFIX).$(SUFFIX) \
-	ctrsm_kernel_RN$(TSUFFIX).$(SUFFIX) ctrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \
-	ctrsm_kernel_RR$(TSUFFIX).$(SUFFIX) ctrsm_kernel_RC$(TSUFFIX).$(SUFFIX) 
-endif
-
-ifeq ($(BUILD_COMPLEX16),1)
-ZBLASOBJS	+= \
-	zgemm_beta$(TSUFFIX).$(SUFFIX) \
-	ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \
-	ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) \
-	ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \
-	ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) \
-	ztrsm_kernel_LN$(TSUFFIX).$(SUFFIX) ztrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \
-	ztrsm_kernel_LR$(TSUFFIX).$(SUFFIX) ztrsm_kernel_LC$(TSUFFIX).$(SUFFIX) \
-	ztrsm_kernel_RN$(TSUFFIX).$(SUFFIX) ztrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \
-	ztrsm_kernel_RR$(TSUFFIX).$(SUFFIX) ztrsm_kernel_RC$(TSUFFIX).$(SUFFIX) 
-endif
-
-XBLASOBJS	+= \
-	xgemm_beta$(TSUFFIX).$(SUFFIX) \
-	xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) xtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \
-	xtrmm_kernel_LR$(TSUFFIX).$(SUFFIX) xtrmm_kernel_LC$(TSUFFIX).$(SUFFIX) \
-	xtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) xtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) \
-	xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) \
-	xtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) xtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \
-	xtrsm_kernel_LR$(TSUFFIX).$(SUFFIX) xtrsm_kernel_LC$(TSUFFIX).$(SUFFIX) \
-	xtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) xtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) \
-	xtrsm_kernel_RR$(TSUFFIX).$(SUFFIX) xtrsm_kernel_RC$(TSUFFIX).$(SUFFIX) 
-
-ifeq ($(USE_GEMM3M), 1)
-
-CBLASOBJS    +=  cgemm3m_kernel$(TSUFFIX).$(SUFFIX)
-ZBLASOBJS    +=  zgemm3m_kernel$(TSUFFIX).$(SUFFIX)
-XBLASOBJS    +=  xgemm3m_kernel$(TSUFFIX).$(SUFFIX)
-
-endif
-
-ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" ""
-SBLASOBJS += \
-	strmm_iunucopy$(TSUFFIX).$(SUFFIX) strmm_iunncopy$(TSUFFIX).$(SUFFIX) \
-	strmm_ilnucopy$(TSUFFIX).$(SUFFIX) strmm_ilnncopy$(TSUFFIX).$(SUFFIX) \
-	strmm_iutucopy$(TSUFFIX).$(SUFFIX) strmm_iutncopy$(TSUFFIX).$(SUFFIX) \
-	strmm_iltucopy$(TSUFFIX).$(SUFFIX) strmm_iltncopy$(TSUFFIX).$(SUFFIX) \
-	strmm_ounucopy$(TSUFFIX).$(SUFFIX) strmm_ounncopy$(TSUFFIX).$(SUFFIX) \
-	strmm_olnucopy$(TSUFFIX).$(SUFFIX) strmm_olnncopy$(TSUFFIX).$(SUFFIX) \
-	strmm_outucopy$(TSUFFIX).$(SUFFIX) strmm_outncopy$(TSUFFIX).$(SUFFIX) \
-	strmm_oltucopy$(TSUFFIX).$(SUFFIX) strmm_oltncopy$(TSUFFIX).$(SUFFIX) \
-	strsm_iunucopy$(TSUFFIX).$(SUFFIX) strsm_iunncopy$(TSUFFIX).$(SUFFIX) \
-	strsm_ilnucopy$(TSUFFIX).$(SUFFIX) strsm_ilnncopy$(TSUFFIX).$(SUFFIX) \
-	strsm_iutucopy$(TSUFFIX).$(SUFFIX) strsm_iutncopy$(TSUFFIX).$(SUFFIX) \
-	strsm_iltucopy$(TSUFFIX).$(SUFFIX) strsm_iltncopy$(TSUFFIX).$(SUFFIX) \
-	strsm_ounucopy$(TSUFFIX).$(SUFFIX) strsm_ounncopy$(TSUFFIX).$(SUFFIX) \
-	strsm_olnucopy$(TSUFFIX).$(SUFFIX) strsm_olnncopy$(TSUFFIX).$(SUFFIX) \
-	strsm_outucopy$(TSUFFIX).$(SUFFIX) strsm_outncopy$(TSUFFIX).$(SUFFIX) \
-	strsm_oltucopy$(TSUFFIX).$(SUFFIX) strsm_oltncopy$(TSUFFIX).$(SUFFIX) \
-	ssymm_iutcopy$(TSUFFIX).$(SUFFIX) ssymm_iltcopy$(TSUFFIX).$(SUFFIX) \
-	ssymm_outcopy$(TSUFFIX).$(SUFFIX) ssymm_oltcopy$(TSUFFIX).$(SUFFIX)
-endif
-
-ifeq ($(BUILD_DOUBLE),1)
-DBLASOBJS += \
-	dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \
-	dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \
-	dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) \
-	dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) \
-	dtrmm_ounucopy$(TSUFFIX).$(SUFFIX) dtrmm_ounncopy$(TSUFFIX).$(SUFFIX) \
-	dtrmm_olnucopy$(TSUFFIX).$(SUFFIX) dtrmm_olnncopy$(TSUFFIX).$(SUFFIX) \
-	dtrmm_outucopy$(TSUFFIX).$(SUFFIX) dtrmm_outncopy$(TSUFFIX).$(SUFFIX) \
-	dtrmm_oltucopy$(TSUFFIX).$(SUFFIX) dtrmm_oltncopy$(TSUFFIX).$(SUFFIX) \
-	dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) \
-	dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \
-	dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) \
-	dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) \
-	dtrsm_ounucopy$(TSUFFIX).$(SUFFIX) dtrsm_ounncopy$(TSUFFIX).$(SUFFIX) \
-	dtrsm_olnucopy$(TSUFFIX).$(SUFFIX) dtrsm_olnncopy$(TSUFFIX).$(SUFFIX) \
-	dtrsm_outucopy$(TSUFFIX).$(SUFFIX) dtrsm_outncopy$(TSUFFIX).$(SUFFIX) \
-	dtrsm_oltucopy$(TSUFFIX).$(SUFFIX) dtrsm_oltncopy$(TSUFFIX).$(SUFFIX) \
-	dsymm_iutcopy$(TSUFFIX).$(SUFFIX) dsymm_iltcopy$(TSUFFIX).$(SUFFIX) \
-	dsymm_outcopy$(TSUFFIX).$(SUFFIX) dsymm_oltcopy$(TSUFFIX).$(SUFFIX)
-endif
-
-QBLASOBJS += \
-	qtrmm_iunucopy$(TSUFFIX).$(SUFFIX) qtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \
-	qtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) qtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \
-	qtrmm_iutucopy$(TSUFFIX).$(SUFFIX) qtrmm_iutncopy$(TSUFFIX).$(SUFFIX) \
-	qtrmm_iltucopy$(TSUFFIX).$(SUFFIX) qtrmm_iltncopy$(TSUFFIX).$(SUFFIX) \
-	qtrmm_ounucopy$(TSUFFIX).$(SUFFIX) qtrmm_ounncopy$(TSUFFIX).$(SUFFIX) \
-	qtrmm_olnucopy$(TSUFFIX).$(SUFFIX) qtrmm_olnncopy$(TSUFFIX).$(SUFFIX) \
-	qtrmm_outucopy$(TSUFFIX).$(SUFFIX) qtrmm_outncopy$(TSUFFIX).$(SUFFIX) \
-	qtrmm_oltucopy$(TSUFFIX).$(SUFFIX) qtrmm_oltncopy$(TSUFFIX).$(SUFFIX) \
-	qtrsm_iunucopy$(TSUFFIX).$(SUFFIX) qtrsm_iunncopy$(TSUFFIX).$(SUFFIX) \
-	qtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) qtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \
-	qtrsm_iutucopy$(TSUFFIX).$(SUFFIX) qtrsm_iutncopy$(TSUFFIX).$(SUFFIX) \
-	qtrsm_iltucopy$(TSUFFIX).$(SUFFIX) qtrsm_iltncopy$(TSUFFIX).$(SUFFIX) \
-	qtrsm_ounucopy$(TSUFFIX).$(SUFFIX) qtrsm_ounncopy$(TSUFFIX).$(SUFFIX) \
-	qtrsm_olnucopy$(TSUFFIX).$(SUFFIX) qtrsm_olnncopy$(TSUFFIX).$(SUFFIX) \
-	qtrsm_outucopy$(TSUFFIX).$(SUFFIX) qtrsm_outncopy$(TSUFFIX).$(SUFFIX) \
-	qtrsm_oltucopy$(TSUFFIX).$(SUFFIX) qtrsm_oltncopy$(TSUFFIX).$(SUFFIX) \
-	qsymm_iutcopy$(TSUFFIX).$(SUFFIX) qsymm_iltcopy$(TSUFFIX).$(SUFFIX) \
-	qsymm_outcopy$(TSUFFIX).$(SUFFIX) qsymm_oltcopy$(TSUFFIX).$(SUFFIX) 
-
-ifeq ($(BUILD_COMPLEX),1)
-CBLASOBJS += \
-	ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) \
-	ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \
-	ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) \
-	ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) \
-	ctrmm_ounucopy$(TSUFFIX).$(SUFFIX) ctrmm_ounncopy$(TSUFFIX).$(SUFFIX) \
-	ctrmm_olnucopy$(TSUFFIX).$(SUFFIX) ctrmm_olnncopy$(TSUFFIX).$(SUFFIX) \
-	ctrmm_outucopy$(TSUFFIX).$(SUFFIX) ctrmm_outncopy$(TSUFFIX).$(SUFFIX) \
-	ctrmm_oltucopy$(TSUFFIX).$(SUFFIX) ctrmm_oltncopy$(TSUFFIX).$(SUFFIX) \
-	csymm_iutcopy$(TSUFFIX).$(SUFFIX) csymm_iltcopy$(TSUFFIX).$(SUFFIX) \
-	csymm_outcopy$(TSUFFIX).$(SUFFIX) csymm_oltcopy$(TSUFFIX).$(SUFFIX) \
-	chemm_iutcopy$(TSUFFIX).$(SUFFIX) chemm_iltcopy$(TSUFFIX).$(SUFFIX) \
-	chemm_outcopy$(TSUFFIX).$(SUFFIX) chemm_oltcopy$(TSUFFIX).$(SUFFIX)
-endif
-ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" ""
-CBLASOBJS += \
-	ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) \
-	ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \
-	ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) \
-	ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) \
-	ctrsm_ounucopy$(TSUFFIX).$(SUFFIX) ctrsm_ounncopy$(TSUFFIX).$(SUFFIX) \
-	ctrsm_olnucopy$(TSUFFIX).$(SUFFIX) ctrsm_olnncopy$(TSUFFIX).$(SUFFIX) \
-	ctrsm_outucopy$(TSUFFIX).$(SUFFIX) ctrsm_outncopy$(TSUFFIX).$(SUFFIX) \
-	ctrsm_oltucopy$(TSUFFIX).$(SUFFIX) ctrsm_oltncopy$(TSUFFIX).$(SUFFIX) 
-endif
-
-ifeq ($(BUILD_COMPLEX16),1)
-ZBLASOBJS += \
-	ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) \
-	ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \
-	ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) \
-	ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) \
-	ztrmm_ounucopy$(TSUFFIX).$(SUFFIX) ztrmm_ounncopy$(TSUFFIX).$(SUFFIX) \
-	ztrmm_olnucopy$(TSUFFIX).$(SUFFIX) ztrmm_olnncopy$(TSUFFIX).$(SUFFIX) \
-	ztrmm_outucopy$(TSUFFIX).$(SUFFIX) ztrmm_outncopy$(TSUFFIX).$(SUFFIX) \
-	ztrmm_oltucopy$(TSUFFIX).$(SUFFIX) ztrmm_oltncopy$(TSUFFIX).$(SUFFIX) \
-	ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) \
-	ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \
-	ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) \
-	ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) \
-	ztrsm_ounucopy$(TSUFFIX).$(SUFFIX) ztrsm_ounncopy$(TSUFFIX).$(SUFFIX) \
-	ztrsm_olnucopy$(TSUFFIX).$(SUFFIX) ztrsm_olnncopy$(TSUFFIX).$(SUFFIX) \
-	ztrsm_outucopy$(TSUFFIX).$(SUFFIX) ztrsm_outncopy$(TSUFFIX).$(SUFFIX) \
-	ztrsm_oltucopy$(TSUFFIX).$(SUFFIX) ztrsm_oltncopy$(TSUFFIX).$(SUFFIX) \
-	zsymm_iutcopy$(TSUFFIX).$(SUFFIX) zsymm_iltcopy$(TSUFFIX).$(SUFFIX) \
-	zsymm_outcopy$(TSUFFIX).$(SUFFIX) zsymm_oltcopy$(TSUFFIX).$(SUFFIX) \
-	zhemm_iutcopy$(TSUFFIX).$(SUFFIX) zhemm_iltcopy$(TSUFFIX).$(SUFFIX) \
-	zhemm_outcopy$(TSUFFIX).$(SUFFIX) zhemm_oltcopy$(TSUFFIX).$(SUFFIX)
-endif
-
-XBLASOBJS += \
-	xtrmm_iunucopy$(TSUFFIX).$(SUFFIX) xtrmm_iunncopy$(TSUFFIX).$(SUFFIX) \
-	xtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) xtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \
-	xtrmm_iutucopy$(TSUFFIX).$(SUFFIX) xtrmm_iutncopy$(TSUFFIX).$(SUFFIX) \
-	xtrmm_iltucopy$(TSUFFIX).$(SUFFIX) xtrmm_iltncopy$(TSUFFIX).$(SUFFIX) \
-	xtrmm_ounucopy$(TSUFFIX).$(SUFFIX) xtrmm_ounncopy$(TSUFFIX).$(SUFFIX) \
-	xtrmm_olnucopy$(TSUFFIX).$(SUFFIX) xtrmm_olnncopy$(TSUFFIX).$(SUFFIX) \
-	xtrmm_outucopy$(TSUFFIX).$(SUFFIX) xtrmm_outncopy$(TSUFFIX).$(SUFFIX) \
-	xtrmm_oltucopy$(TSUFFIX).$(SUFFIX) xtrmm_oltncopy$(TSUFFIX).$(SUFFIX) \
-	xtrsm_iunucopy$(TSUFFIX).$(SUFFIX) xtrsm_iunncopy$(TSUFFIX).$(SUFFIX) \
-	xtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) xtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) \
-	xtrsm_iutucopy$(TSUFFIX).$(SUFFIX) xtrsm_iutncopy$(TSUFFIX).$(SUFFIX) \
-	xtrsm_iltucopy$(TSUFFIX).$(SUFFIX) xtrsm_iltncopy$(TSUFFIX).$(SUFFIX) \
-	xtrsm_ounucopy$(TSUFFIX).$(SUFFIX) xtrsm_ounncopy$(TSUFFIX).$(SUFFIX) \
-	xtrsm_olnucopy$(TSUFFIX).$(SUFFIX) xtrsm_olnncopy$(TSUFFIX).$(SUFFIX) \
-	xtrsm_outucopy$(TSUFFIX).$(SUFFIX) xtrsm_outncopy$(TSUFFIX).$(SUFFIX) \
-	xtrsm_oltucopy$(TSUFFIX).$(SUFFIX) xtrsm_oltncopy$(TSUFFIX).$(SUFFIX) \
-	xsymm_iutcopy$(TSUFFIX).$(SUFFIX) xsymm_iltcopy$(TSUFFIX).$(SUFFIX) \
-	xsymm_outcopy$(TSUFFIX).$(SUFFIX) xsymm_oltcopy$(TSUFFIX).$(SUFFIX) \
-	xhemm_iutcopy$(TSUFFIX).$(SUFFIX) xhemm_iltcopy$(TSUFFIX).$(SUFFIX) \
-	xhemm_outcopy$(TSUFFIX).$(SUFFIX) xhemm_oltcopy$(TSUFFIX).$(SUFFIX)
-
-ifeq ($(USE_GEMM3M), 1)
-
-ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" ""
-CBLASOBJS += \
-	cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \
-	cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) \
-	cgemm3m_incopyi$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) \
-	cgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) cgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) \
-	cgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) cgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) \
-	cgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) cgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) \
-	csymm3m_iucopyb$(TSUFFIX).$(SUFFIX) csymm3m_oucopyb$(TSUFFIX).$(SUFFIX) \
-	csymm3m_iucopyr$(TSUFFIX).$(SUFFIX) csymm3m_oucopyr$(TSUFFIX).$(SUFFIX) \
-	csymm3m_iucopyi$(TSUFFIX).$(SUFFIX) csymm3m_oucopyi$(TSUFFIX).$(SUFFIX) \
-	csymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) csymm3m_olcopyb$(TSUFFIX).$(SUFFIX) \
-	csymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) csymm3m_olcopyr$(TSUFFIX).$(SUFFIX) \
-	csymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) csymm3m_olcopyi$(TSUFFIX).$(SUFFIX) \
-	chemm3m_iucopyb$(TSUFFIX).$(SUFFIX) chemm3m_oucopyb$(TSUFFIX).$(SUFFIX) \
-	chemm3m_iucopyr$(TSUFFIX).$(SUFFIX) chemm3m_oucopyr$(TSUFFIX).$(SUFFIX) \
-	chemm3m_iucopyi$(TSUFFIX).$(SUFFIX) chemm3m_oucopyi$(TSUFFIX).$(SUFFIX) \
-	chemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) chemm3m_olcopyb$(TSUFFIX).$(SUFFIX) \
-	chemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) chemm3m_olcopyr$(TSUFFIX).$(SUFFIX) \
-	chemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) chemm3m_olcopyi$(TSUFFIX).$(SUFFIX)
-endif
-
-ifeq ($(BUILD_COMPLEX16),1)
-ZBLASOBJS += \
-	zgemm3m_incopyb$(TSUFFIX).$(SUFFIX) zgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \
-	zgemm3m_incopyr$(TSUFFIX).$(SUFFIX) zgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) \
-	zgemm3m_incopyi$(TSUFFIX).$(SUFFIX) zgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) \
-	zgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) zgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) \
-	zgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) zgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) \
-	zgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) zgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) \
-	zsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) zsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) \
-	zsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) zsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) \
-	zsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) zsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) \
-	zsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) zsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) \
-	zsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) zsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) \
-	zsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) zsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) \
-	zhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) zhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) \
-	zhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) zhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) \
-	zhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) zhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) \
-	zhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) zhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) \
-	zhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) zhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) \
-	zhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) zhemm3m_olcopyi$(TSUFFIX).$(SUFFIX)
-endif
-
-XBLASOBJS += \
-	xgemm3m_incopyb$(TSUFFIX).$(SUFFIX) xgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \
-	xgemm3m_incopyr$(TSUFFIX).$(SUFFIX) xgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) \
-	xgemm3m_incopyi$(TSUFFIX).$(SUFFIX) xgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) \
-	xgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) xgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) \
-	xgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) xgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) \
-	xgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) xgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) \
-	xsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) xsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) \
-	xsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) xsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) \
-	xsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) xsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) \
-	xsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) xsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) \
-	xsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) xsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) \
-	xsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) xsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) \
-	xhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) xhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) \
-	xhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) xhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) \
-	xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) xhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) \
-	xhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) xhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) \
-	xhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) xhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) \
-	xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) xhemm3m_olcopyi$(TSUFFIX).$(SUFFIX)
-
-endif
-
-######  BLAS small matrix optimization #####
-ifeq ($(SMALL_MATRIX_OPT), 1)
-
-ifeq ($(BUILD_BFLOAT16),1)
-SBBLASOBJS += \
-	sbgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \
-	sbgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \
-	sbgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \
-	sbgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \
-	sbgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX)
-endif
-
-SBLASOBJS += \
-	sgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \
-	sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \
-	sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \
-	sgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \
-	sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX)
-
-DBLASOBJS += \
-	dgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \
-	dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \
-	dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \
-	dgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \
-	dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX)
-
-CBLASOBJS += \
-	cgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \
-	cgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \
-	cgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) \
-	cgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \
-	cgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) \
-	cgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) \
-	cgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) \
-	cgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) \
-	cgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) \
-	cgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \
-	cgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) \
-	cgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) \
-	cgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) \
-	cgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) \
-	cgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) \
-	cgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) \
-	cgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX)
-
-ZBLASOBJS += \
-	zgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \
-	zgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \
-	zgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) \
-	zgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \
-	zgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) \
-	zgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) \
-	zgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) \
-	zgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) \
-	zgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) \
-	zgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \
-	zgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) \
-	zgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) \
-	zgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) \
-	zgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) \
-	zgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) \
-	zgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) \
-	zgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX)
-
-endif
-
-######  BLAS extensions #####
-
-ifeq ($(BUILD_SINGLE),1)
-SBLASOBJS += \
-	somatcopy_k_cn$(TSUFFIX).$(SUFFIX) somatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
-	somatcopy_k_ct$(TSUFFIX).$(SUFFIX) somatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
-	simatcopy_k_cn$(TSUFFIX).$(SUFFIX) simatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
-	simatcopy_k_ct$(TSUFFIX).$(SUFFIX) simatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
-	sgeadd_k$(TSUFFIX).$(SUFFIX) 
-endif
-ifeq ($(BUILD_DOUBLE),1)
-DBLASOBJS += \
-	domatcopy_k_cn$(TSUFFIX).$(SUFFIX) domatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
-	domatcopy_k_ct$(TSUFFIX).$(SUFFIX) domatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
-	dimatcopy_k_cn$(TSUFFIX).$(SUFFIX) dimatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
-	dimatcopy_k_ct$(TSUFFIX).$(SUFFIX) dimatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
-	dgeadd_k$(TSUFFIX).$(SUFFIX) 
-endif
-
-ifeq ($(BUILD_COMPLEX),1)
-CBLASOBJS += \
-	comatcopy_k_cn$(TSUFFIX).$(SUFFIX) comatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
-	comatcopy_k_ct$(TSUFFIX).$(SUFFIX) comatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
-	comatcopy_k_cnc$(TSUFFIX).$(SUFFIX) comatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \
-	comatcopy_k_ctc$(TSUFFIX).$(SUFFIX) comatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \
-	cimatcopy_k_cn$(TSUFFIX).$(SUFFIX)  cimatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
-	cimatcopy_k_ct$(TSUFFIX).$(SUFFIX)  cimatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
-	cimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) cimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \
-	cimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) cimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \
-	cgeadd_k$(TSUFFIX).$(SUFFIX) 
-endif
-
-ifeq ($(BUILD_COMPLEX16),1)
-ZBLASOBJS += \
-	zomatcopy_k_cn$(TSUFFIX).$(SUFFIX) zomatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
-	zomatcopy_k_ct$(TSUFFIX).$(SUFFIX) zomatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
-	zomatcopy_k_cnc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \
-	zomatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \
-	zimatcopy_k_cn$(TSUFFIX).$(SUFFIX)  zimatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
-	zimatcopy_k_ct$(TSUFFIX).$(SUFFIX)  zimatcopy_k_rt$(TSUFFIX).$(SUFFIX) \
-	zimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \
-	zimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \
-	zgeadd_k$(TSUFFIX).$(SUFFIX) 
-endif
-
-ifeq ($(BUILD_BFLOAT16), 1)
-SBGEMMINCOPYOBJ_P = $(SBGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-SBGEMMITCOPYOBJ_P = $(SBGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-SBGEMMONCOPYOBJ_P = $(SBGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-SBGEMMOTCOPYOBJ_P = $(SBGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-endif
-
-SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-SGEMMITCOPYOBJ_P = $(SGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-SGEMMONCOPYOBJ_P = $(SGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-SGEMMOTCOPYOBJ_P = $(SGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-DGEMMINCOPYOBJ_P = $(DGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-DGEMMITCOPYOBJ_P = $(DGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-DGEMMONCOPYOBJ_P = $(DGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-DGEMMOTCOPYOBJ_P = $(DGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-QGEMMINCOPYOBJ_P = $(QGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-QGEMMITCOPYOBJ_P = $(QGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-QGEMMONCOPYOBJ_P = $(QGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-QGEMMOTCOPYOBJ_P = $(QGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-CGEMMINCOPYOBJ_P = $(CGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-CGEMMITCOPYOBJ_P = $(CGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-CGEMMONCOPYOBJ_P = $(CGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-CGEMMOTCOPYOBJ_P = $(CGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-ZGEMMINCOPYOBJ_P = $(ZGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-ZGEMMITCOPYOBJ_P = $(ZGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-ZGEMMONCOPYOBJ_P = $(ZGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-ZGEMMOTCOPYOBJ_P = $(ZGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-XGEMMINCOPYOBJ_P = $(XGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-XGEMMITCOPYOBJ_P = $(XGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-XGEMMONCOPYOBJ_P = $(XGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-XGEMMOTCOPYOBJ_P = $(XGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
-
-ifeq ($(BUILD_BFLOAT16),1)
-$(KDIR)sbgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_BETA)
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-endif
-
-$(KDIR)sgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_BETA)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)dgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_BETA)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)qgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMM_BETA)
-	$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)cgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_BETA)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX $< -o $@
-
-$(KDIR)zgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX $< -o $@
-
-$(KDIR)xgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM_BETA)
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@
-
-ifeq ($(ARCH), E2K)
-USE_TRMM = 1
-endif
-
-
-ifeq ($(BUILD_BFLOAT16), 1)
-
-$(KDIR)$(SBGEMMONCOPYOBJ) : $(KERNELDIR)/$(SBGEMMONCOPY)
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)$(SBGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SBGEMMOTCOPY)
-
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemmotcopy.s
-	m4 sbgemmotcopy.s > sbgemmotcopy_nomacros.s
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemmotcopy_nomacros.s -o $@
-	rm sbgemmotcopy.s sbgemmotcopy_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-endif
-
-ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N))
-
-$(KDIR)$(SBGEMMINCOPYOBJ) : $(KERNELDIR)/$(SBGEMMINCOPY)
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)$(SBGEMMITCOPYOBJ) : $(KERNELDIR)/$(SBGEMMITCOPY)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemmitcopy.s
-	m4 sbgemmitcopy.s > sbgemmitcopy_nomacros.s
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemmitcopy_nomacros.s -o $@
-	rm sbgemmitcopy.s sbgemmitcopy_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-endif
-
-endif
-endif
-
-$(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmotcopy.s
-	m4 sgemmotcopy.s > sgemmotcopy_nomacros.s
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@
-	rm sgemmotcopy.s sgemmotcopy_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-endif	
-
-
-ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
-
-$(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmitcopy.s
-	m4 sgemmitcopy.s > sgemmitcopy_nomacros.s
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@
-	rm sgemmitcopy.s sgemmitcopy_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ 
-endif
-	
-endif
-
-$(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_ncopy.s
-	m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@
-	rm dgemm_ncopy.s dgemm_ncopy_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-endif
-
-$(KDIR)$(DGEMMOTCOPYOBJ) : $(KERNELDIR)/$(DGEMMOTCOPY)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
-
-$(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_itcopy.s
-	m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@
-	rm dgemm_itcopy.s dgemm_itcopy_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-endif
-
-endif
-
-ifdef EXPRECISION
-
-$(KDIR)$(QGEMMONCOPYOBJ) : $(KERNELDIR)/$(QGEMMONCOPY)
-	$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)$(QGEMMOTCOPYOBJ) : $(KERNELDIR)/$(QGEMMOTCOPY)
-	$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-ifneq ($(QGEMM_UNROLL_M), $(QGEMM_UNROLL_N))
-
-$(KDIR)$(QGEMMINCOPYOBJ) : $(KERNELDIR)/$(QGEMMINCOPY)
-	$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)$(QGEMMITCOPYOBJ) : $(KERNELDIR)/$(QGEMMITCOPY)
-	$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-endif
-
-endif
-
-$(KDIR)$(CGEMMONCOPYOBJ) : $(KERNELDIR)/$(CGEMMONCOPY)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)$(CGEMMOTCOPYOBJ) : $(KERNELDIR)/$(CGEMMOTCOPY)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
-
-$(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -S $< -o - > cgemm_itcopy.s
-	m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@
-	rm cgemm_itcopy.s cgemm_itcopy_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-endif
-
-endif
-
-$(KDIR)$(ZGEMMONCOPYOBJ) : $(KERNELDIR)/$(ZGEMMONCOPY)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)$(ZGEMMOTCOPYOBJ) : $(KERNELDIR)/$(ZGEMMOTCOPY)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
-
-$(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > zgemm_itcopy.s
-	m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@
-	rm zgemm_itcopy.s zgemm_itcopy_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-endif
-
-endif
-
-ifdef EXPRECISION
-
-$(KDIR)$(XGEMMONCOPYOBJ) : $(KERNELDIR)/$(XGEMMONCOPY)
-	$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)$(XGEMMOTCOPYOBJ) : $(KERNELDIR)/$(XGEMMOTCOPY)
-	$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-ifneq ($(XGEMM_UNROLL_M), $(XGEMM_UNROLL_N))
-
-$(KDIR)$(XGEMMINCOPYOBJ) : $(KERNELDIR)/$(XGEMMINCOPY)
-	$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)$(XGEMMITCOPYOBJ) : $(KERNELDIR)/$(XGEMMITCOPY)
-	$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-endif
-
-endif
-
-$(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX  $< -o - > sgemm_kernel$(TSUFFIX).s
-	m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@
-	rm sgemm_kernel$(TSUFFIX).s sgemm_kernel$(TSUFFIX)_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-endif
-
-ifdef USE_DIRECT_SGEMM
-$(KDIR)sgemm_direct_performant$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTPERFORMANT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-$(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-endif
-
-ifeq ($(BUILD_BFLOAT16), 1)
-
-$(KDIR)sbgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEMMDEPEND)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX  $< -o - > sbgemm_kernel$(TSUFFIX).s
-	m4 sbgemm_kernel$(TSUFFIX).s > sbgemm_kernel$(TSUFFIX)_nomacros.s
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemm_kernel$(TSUFFIX)_nomacros.s -o $@
-	rm sbgemm_kernel$(TSUFFIX).s sbgemm_kernel$(TSUFFIX)_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-endif
-endif
-
-$(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_kernel$(TSUFFIX).s
-	m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@
-	rm dgemm_kernel$(TSUFFIX).s dgemm_kernel$(TSUFFIX)_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-endif
-
-$(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEPEND)
-	$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNN $< -o - > cgemm_kernel_n.s
-	m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@
-	rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@
-endif
-
-$(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCN $< -o - > cgemm_kernel_l.s
-	m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@
-	rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@
-endif
-
-$(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNC  $< -o - > cgemm_kernel_r.s
-	m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
-	rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@
-endif
-
-$(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCC $< -o - > cgemm_kernel_b.s
-	m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@
-	rm cgemm_kernel_b.s cgemm_kernel_b_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@
-endif
-
-$(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNN $< -o - > zgemm_kernel_n.s
-	m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@
-	rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s
-else ifeq ($(CORE),SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
-endif
-
-$(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCN $< -o - > zgemm_kernel_l.s
-	m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@
-	rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s
-else ifeq ($(CORE),SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@
-endif
-
-$(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNC $< -o - > zgemm_kernel_r.s
-	m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@
-	rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s
-else ifeq ($(CORE),SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@
-endif
-
-$(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCC $< -o - > zgemm_kernel_b.s
-	m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@
-	rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s
-else ifeq ($(CORE),SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@
-endif
-
-$(KDIR)xgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@
-
-$(KDIR)xgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $@
-
-$(KDIR)xgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNC $< -o $@
-
-$(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@
-
-
-ifdef USE_TRMM
-$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > strmmkernel_ln.s	
-	m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@
-	rm strmmkernel_ln.s strmmkernel_ln_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
-endif
-
-$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > strmmkernel_lt.s	
-	m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@
-	rm strmmkernel_lt.s strmmkernel_lt_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
-endif
-
-$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > strmmkernel_rn.s	
-	m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@
-	rm strmmkernel_rn.s strmmkernel_rn_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
-endif
-
-$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s	
-	m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
-	rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
-endif
-
-$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > dtrmm_kernel_ln.s
-	m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@
-	rm dtrmm_kernel_ln.s dtrmm_kernel_ln_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
-endif
-
-$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > dtrmm_kernel_lt.s
-	m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@
-	rm dtrmm_kernel_lt.s dtrmm_kernel_lt_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
-endif
-
-$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > dtrmm_kernel_rn.s
-	m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@
-	rm dtrmm_kernel_rn.s dtrmm_kernel_rn_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
-endif
-
-$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > dtrmm_kernel_rt.s
-	m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@
-	rm dtrmm_kernel_rt.s dtrmm_kernel_rt_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
-endif
-
-$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
-
-$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
-
-$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
-
-$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
-
-$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN  $< -o - > ctrmm_kernel_ln.s
-	m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@
-	rm ctrmm_kernel_ln.s ctrmm_kernel_ln_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
-endif
-
-$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_lt.s
-	m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@
-	rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
-endif
-
-$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lr.s
-	m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN  ctrmm_kernel_lr_nomacros.s -o $@
-	rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
-endif
-
-$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lc.s
-	m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@
-	rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
-endif
-
-$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rn.s
-	m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@
-	rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
-endif
-
-$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rt.s
-	m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@
-	rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
-endif
-
-$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_rr.s
-	m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@
-	rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
-endif
-
-$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_RC.s
-	m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@
-	rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
-endif
-
-$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_ln.s
-	m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@
-	rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s
-else ifeq ($(CORE), SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
-endif
-
-$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_lt.s
-	m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@
-	rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s
-else ifeq ($(CORE), SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
-endif
-
-$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lr.s
-	m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@
-	rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s
-else ifeq ($(CORE), SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
-endif
-
-$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lc.s
-	m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@
-	rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s 
-else ifeq ($(CORE), SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
-endif
-
-$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rn.s
-	m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@
-	rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s
-else ifeq ($(CORE), SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
-endif
-
-$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rt.s
-	m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@
-	rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s
-else ifeq ($(CORE), SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
-endif
-
-$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rr.s
-	m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@
-	rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s
-else ifeq ($(CORE), SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
-endif
-
-$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rc.s
-	m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@
-	rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s
-else ifeq ($(CORE), SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
-endif
-
-else
-$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
-
-$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
-
-$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
-
-$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s	
-	m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
-	rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
-endif
-
-$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
-
-$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
-
-$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
-
-$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
-
-$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
-
-$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
-
-$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
-
-$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
-
-$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
-
-$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
-
-$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
-
-$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
-
-$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
-ifeq ($(CORE),SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
-endif
-
-$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
-ifeq ($(CORE),SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
-endif
-$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
-ifeq ($(CORE),SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
-endif
-$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
-ifeq ($(CORE),SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
-endif
-$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
-ifeq ($(CORE),SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
-endif
-$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
-ifeq ($(CORE),SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
-endif
-$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
-ifeq ($(CORE),SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
-endif
-$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
-ifeq ($(CORE),SANDYBRIDGE)
-	$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
-endif
-endif
-
-
-
-
-$(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)xtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)xtrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
-
-$(KDIR)xtrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
-
-$(KDIR)xtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)xtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
-
-$(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
-
-$(KDIR)cgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM3MKERNEL)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@
-
-$(KDIR)zgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM3MKERNEL)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
-
-$(KDIR)xgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM3MKERNEL)
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@
-
-$(KDIR)strsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_LN) $(STRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -DUPPER -DLN -UCONJ $< -o $@
-
-$(KDIR)strsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_LT) $(STRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DLT -UCONJ $< -o $@
-
-$(KDIR)strsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RN) $(STRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -DUPPER -DRN -UCONJ $< -o $@
-
-$(KDIR)strsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RT) $(STRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@
-
-$(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@
-
-$(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o - > dtrsm_kernel_lt.s
-	m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@
-	rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s
-else
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@
-endif
-
-$(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@
-
-$(KDIR)dtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RT) $(DTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DRT -UCONJ $< -o $@
-
-$(KDIR)qtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_LN) $(QTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -DUPPER -DLN -UCONJ $< -o $@
-
-$(KDIR)qtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_LT) $(QTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -UUPPER -DLT -UCONJ $< -o $@
-
-$(KDIR)qtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_RN) $(QTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -DUPPER -DRN -UCONJ $< -o $@
-
-$(KDIR)qtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_RT) $(QTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -UUPPER -DRT -UCONJ $< -o $@
-
-$(KDIR)ctrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LN) $(CTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DLN -UCONJ $< -o $@
-
-$(KDIR)ctrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LT) $(CTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DLT -UCONJ $< -o $@
-
-$(KDIR)ctrsm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LN) $(CTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DLN -DCONJ $< -o $@
-
-$(KDIR)ctrsm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LT) $(CTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DLT -DCONJ $< -o $@
-
-$(KDIR)ctrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RN) $(CTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DRN -UCONJ $< -o $@
-
-$(KDIR)ctrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RT) $(CTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@
-
-$(KDIR)ctrsm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RN) $(CTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DRN -DCONJ $< -o $@
-
-$(KDIR)ctrsm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RT) $(CTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DRT -DCONJ $< -o $@
-
-$(KDIR)ztrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LN) $(ZTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@
-
-$(KDIR)ztrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LT) $(ZTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@
-
-$(KDIR)ztrsm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LN) $(ZTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DLN -DCONJ $< -o $@
-
-$(KDIR)ztrsm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LT) $(ZTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DLT -DCONJ $< -o $@
-
-$(KDIR)ztrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RN) $(ZTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@
-
-$(KDIR)ztrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RT) $(ZTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DRT -UCONJ $< -o $@
-
-$(KDIR)ztrsm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RN) $(ZTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DRN -DCONJ $< -o $@
-
-$(KDIR)ztrsm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RT) $(ZTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DRT -DCONJ $< -o $@
-
-$(KDIR)xtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LN) $(XTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DLN -UCONJ $< -o $@
-
-$(KDIR)xtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LT) $(XTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DLT -UCONJ $< -o $@
-
-$(KDIR)xtrsm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LN) $(XTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DLN -DCONJ $< -o $@
-
-$(KDIR)xtrsm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LT) $(XTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DLT -DCONJ $< -o $@
-
-$(KDIR)xtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RN) $(XTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DRN -UCONJ $< -o $@
-
-$(KDIR)xtrsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -UCONJ $< -o $@
-
-$(KDIR)xtrsm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RN) $(XTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DRN -DCONJ $< -o $@
-
-$(KDIR)xtrsm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -DCONJ $< -o $@
-
-
-ifdef STRMMUNCOPY_M
-$(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUNCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUNCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-else
-$(KDIR)strmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-ifdef STRMMLNCOPY_M
-$(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLNCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLNCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-else
-$(KDIR)strmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-endif
-
-ifdef STRMMUTCOPY_M
-$(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMUTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-else
-$(KDIR)strmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-ifdef STRMMLTCOPY_M
-$(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMLTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-else
-$(KDIR)strmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-endif
-
-$(KDIR)strmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)strmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)strmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)strmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-ifdef DTRMMUNCOPY_M
-$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUNCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUNCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-else
-$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-ifdef DTRMMLNCOPY_M
-$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLNCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-else
-$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-endif
-
-ifdef DTRMMUTCOPY_M
-$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMUTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-else
-$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-ifdef DTRMMLTCOPY_M
-$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMLTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-else
-$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-endif
-
-$(KDIR)dtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)dtrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)dtrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)dtrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)qtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)qtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)qtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)qtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)qtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)qtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)qtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)qtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)qtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)qtrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)qtrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)qtrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)qtrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)qtrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)qtrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)qtrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-ifdef CTRMMUNCOPY_M
-$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUNCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUNCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-else
-$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-ifdef CTRMMLNCOPY_M
-$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-else
-$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-ifdef CTRMMUTCOPY_M
-$(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-else
-$(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-ifdef CTRMMLTCOPY_M
-$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-else
-$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-$(KDIR)ctrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ctrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ctrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ctrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-ifdef ZTRMMUNCOPY_M
-$(KDIR)ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUNCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUNCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-else
-$(KDIR)ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-ifdef ZTRMMLNCOPY_M
-$(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLNCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLNCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-else
-$(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-endif
-
-ifdef ZTRMMUTCOPY_M
-$(KDIR)ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-else
-$(KDIR)ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-ifdef ZTRMMLTCOPY_M
-$(KDIR)ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-else
-$(KDIR)ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-endif
-
-$(KDIR)ztrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ztrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ztrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ztrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)xtrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)xtrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)xtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)xtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)xtrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)xtrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)xtrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)xtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)xtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)xtrmm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)xtrmm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)xtrmm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)xtrmm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)xtrmm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)xtrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)xtrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ssymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@
-
-$(KDIR)ssymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@
-
-ifdef SSYMMUCOPY_M
-$(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYMMUCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@
-else
-$(KDIR)ssymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@
-endif
-
-ifdef SSYMMLCOPY_M
-$(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SSYMMLCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@
-else
-$(KDIR)ssymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@
-endif
-
-$(KDIR)dsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@
-
-$(KDIR)dsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@
-
-ifdef DSYMMUCOPY_M
-$(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMUCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@
-else
-$(KDIR)dsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@
-endif
-
-ifdef DSYMMLCOPY_M
-$(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DSYMMLCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@
-else
-$(KDIR)dsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@
-endif
-
-$(KDIR)qsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@
-
-$(KDIR)qsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@
-
-$(KDIR)qsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@
-
-$(KDIR)qsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/symm_lcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@
-
-$(KDIR)csymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@
-
-$(KDIR)csymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@
-
-ifdef CSYMMUCOPY_M
-$(KDIR)csymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CSYMMUCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@
-else
-$(KDIR)csymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@
-endif
-
-ifdef CSYMMLCOPY_M
-$(KDIR)csymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CSYMMLCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@
-else
-$(KDIR)csymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@
-endif
-
-$(KDIR)zsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@
-
-$(KDIR)zsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@
-
-ifdef ZSYMMUCOPY_M
-$(KDIR)zsymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZSYMMUCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@
-else
-$(KDIR)zsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@
-endif
-
-ifdef ZSYMMLCOPY_M
-$(KDIR)zsymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZSYMMLCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@
-else
-$(KDIR)zsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@
-endif
-
-$(KDIR)xsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@
-
-$(KDIR)xsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@
-
-$(KDIR)xsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@
-
-$(KDIR)xsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@
-
-$(KDIR)chemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@
-
-$(KDIR)chemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@
-
-ifdef CHEMMUTCOPY_M
-$(KDIR)chemm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CHEMMUTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@
-else
-$(KDIR)chemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@
-endif
-
-ifdef CHEMMLTCOPY_M
-$(KDIR)chemm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CHEMMLTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@
-else
-$(KDIR)chemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@
-endif
-
-$(KDIR)zhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@
-
-$(KDIR)zhemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@
-
-ifdef ZHEMMUTCOPY_M
-$(KDIR)zhemm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZHEMMUTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@
-else
-$(KDIR)zhemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@
-endif
-
-ifdef ZHEMMLTCOPY_M
-$(KDIR)zhemm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZHEMMLTCOPY_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@
-else
-$(KDIR)zhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@
-endif
-
-$(KDIR)xhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@
-
-$(KDIR)xhemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@
-
-$(KDIR)xhemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@
-
-$(KDIR)xhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@
-
-$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
-
-$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
-
-$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
-
-$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
-
-$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
-
-$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
-
-$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-ifdef TRSMCOPYUN_M
-$(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-else
-$(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-ifdef TRSMCOPYLN_M
-$(KDIR)strsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-else
-$(KDIR)strsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-endif
-
-ifdef TRSMCOPYUT_M
-$(KDIR)strsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-else
-$(KDIR)strsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-ifdef TRSMCOPYLT_M
-$(KDIR)strsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-else
-$(KDIR)strsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-endif
-
-$(KDIR)strsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)strsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)strsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)strsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-ifdef TRSMCOPYUN_M
-$(KDIR)dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-else
-$(KDIR)dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-ifdef TRSMCOPYLN_M
-$(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-else
-$(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-endif
-
-ifdef TRSMCOPYUT_M
-$(KDIR)dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-else
-$(KDIR)dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-ifdef TRSMCOPYLT_M
-$(KDIR)dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-else
-$(KDIR)dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-endif
-
-$(KDIR)dtrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)dtrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)dtrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)dtrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)qtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)qtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)qtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)qtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)qtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)qtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)qtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)qtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)qtrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)qtrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)qtrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)qtrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)qtrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)qtrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)qtrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)qtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-ifdef ZTRSMCOPYUN_M
-$(KDIR)ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-else
-$(KDIR)ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-ifdef ZTRSMCOPYLN_M
-$(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-else
-$(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-endif
-
-ifdef ZTRSMCOPYUT_M
-$(KDIR)ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-else
-$(KDIR)ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-ifdef ZTRSMCOPYLT_M
-$(KDIR)ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-else
-$(KDIR)ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-endif
-
-$(KDIR)ctrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ctrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ctrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ctrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-ifdef ZTRSMCOPYUN_M
-$(KDIR)ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-else
-$(KDIR)ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-ifdef ZTRSMCOPYLN_M
-$(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-else
-$(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-endif
-
-ifdef ZTRSMCOPYUT_M
-$(KDIR)ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-else
-$(KDIR)ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-endif
-
-ifdef ZTRSMCOPYLT_M
-$(KDIR)ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M)
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-else
-$(KDIR)ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-endif
-
-$(KDIR)ztrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ztrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ztrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ztrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)xtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)xtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)xtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)xtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)xtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)xtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)xtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)xtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)xtrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)xtrsm_ounncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)xtrsm_olnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)xtrsm_olnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)xtrsm_outucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)xtrsm_outncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)xtrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)xtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-
-$(KDIR)sgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMM_BETA)
-	$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-ifeq ($(BUILD_BFLOAT16),1)
-$(KDIR)sbgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMM_BETA)
-	$(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-endif
-
-$(KDIR)dgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMM_BETA)
-	$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)qgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMM_BETA)
-	$(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)cgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMM_BETA)
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX $< -o $@
-
-$(KDIR)zgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA)
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX $< -o $@
-
-$(KDIR)xgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM_BETA)
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@
-
-
-ifeq ($(BUILD_BFLOAT16), 1)
-$(SBGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMONCOPY)
-	$(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-
-$(SBGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMOTCOPY)
-	$(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-
-ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N))
-$(SBGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMINCOPY)
-	$(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-
-$(SBGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMITCOPY)
-	$(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-
-endif
-endif
-
-$(SGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMONCOPY)
-	$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-$(SGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMOTCOPY)
-	$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
-
-$(SGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMINCOPY)
-	$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-$(SGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMITCOPY)
-	$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-endif
-
-$(DGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMONCOPY)
-	$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-$(DGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMOTCOPY)
-	$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
-
-$(DGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMINCOPY)
-	$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-$(DGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMITCOPY)
-	$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-endif
-
-ifdef EXPRECISION
-
-$(QGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(QGEMMONCOPY)
-	$(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-$(QGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(QGEMMOTCOPY)
-	$(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-ifneq ($(QGEMM_UNROLL_M), $(QGEMM_UNROLL_N))
-
-$(QGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(QGEMMINCOPY)
-	$(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-$(QGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(QGEMMITCOPY)
-	$(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-endif
-
-endif
-
-$(CGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(CGEMMONCOPY)
-	$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-$(CGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(CGEMMOTCOPY)
-	$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
-
-$(CGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(CGEMMINCOPY)
-	$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-$(CGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(CGEMMITCOPY)
-	$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-endif
-
-$(ZGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(ZGEMMONCOPY)
-	$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-$(ZGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(ZGEMMOTCOPY)
-	$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
-
-$(ZGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(ZGEMMINCOPY)
-	$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-$(ZGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(ZGEMMITCOPY)
-	$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-endif
-
-ifdef EXPRECISION
-
-$(XGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(XGEMMONCOPY)
-	$(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-$(XGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(XGEMMOTCOPY)
-	$(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-ifneq ($(XGEMM_UNROLL_M), $(XGEMM_UNROLL_N))
-
-$(XGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(XGEMMINCOPY)
-	$(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-$(XGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(XGEMMITCOPY)
-	$(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-endif
-
-endif
-
-
-ifeq ($(BUILD_BFLOAT16), 1)
-$(KDIR)sbgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEMMDEPEND)
-	$(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-endif
-
-$(KDIR)sgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND)
-	$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)dgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND)
-	$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)qgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEPEND)
-	$(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)cgemm_kernel_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@
-
-$(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@
-
-$(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
-ifeq ($(OS), AIX)
-	$(CC) $(PFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > cgemm_kernel_r.s
-	m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
-	rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s 
-else
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@
-endif
-
-$(KDIR)cgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@
-
-$(KDIR)zgemm_kernel_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
-
-$(KDIR)zgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@
-
-$(KDIR)zgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@
-
-$(KDIR)zgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@
-
-$(KDIR)xgemm_kernel_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@
-
-$(KDIR)xgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $@
-
-$(KDIR)xgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DNC $< -o $@
-
-$(KDIR)xgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@
-
-$(KDIR)strmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
-
-$(KDIR)strmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
-
-$(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
-
-$(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s	
-	m4 strmmkernel_rn.s > strmm_kernel_rt_nomacros.s
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
-	rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
-else
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
-endif
-
-$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
-
-$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
-
-$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
-
-$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
-
-$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
-
-$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
-
-$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
-
-$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
-
-$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
-
-$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
-
-$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
-
-$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
-
-$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
-
-$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
-
-$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
-
-$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
-
-$(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)xtrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)xtrmm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
-
-$(KDIR)xtrmm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
-
-$(KDIR)xtrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)xtrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
-
-$(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
-
-$(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
-
-$(KDIR)cgemm3m_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMM3MKERNEL)
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@
-
-$(KDIR)zgemm3m_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMM3MKERNEL)
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
-
-$(KDIR)xgemm3m_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM3MKERNEL)
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@
-
-$(KDIR)strsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_LN) $(STRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -DUPPER -DLN -UCONJ $< -o $@
-
-$(KDIR)strsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_LT) $(STRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DLT -UCONJ $< -o $@
-
-$(KDIR)strsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RN) $(STRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -DUPPER -DRN -UCONJ $< -o $@
-
-$(KDIR)strsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RT) $(STRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@
-
-$(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@
-
-$(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@
-
-$(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@
-
-$(KDIR)dtrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RT) $(DTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DRT -UCONJ $< -o $@
-
-$(KDIR)qtrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_LN) $(QTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -DUPPER -DLN -UCONJ $< -o $@
-
-$(KDIR)qtrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_LT) $(QTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -UUPPER -DLT -UCONJ $< -o $@
-
-$(KDIR)qtrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_RN) $(QTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -DUPPER -DRN -UCONJ $< -o $@
-
-$(KDIR)qtrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QTRSMKERNEL_RT) $(QTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -UCOMPLEX -DXDOUBLE -UUPPER -DRT -UCONJ $< -o $@
-
-$(KDIR)ctrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LN) $(CTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DLN -UCONJ $< -o $@
-
-$(KDIR)ctrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LT) $(CTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DLT -UCONJ $< -o $@
-
-$(KDIR)ctrsm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LN) $(CTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DLN -DCONJ $< -o $@
-
-$(KDIR)ctrsm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_LT) $(CTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DLT -DCONJ $< -o $@
-
-$(KDIR)ctrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RN) $(CTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DRN -UCONJ $< -o $@
-
-$(KDIR)ctrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RT) $(CTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@
-
-$(KDIR)ctrsm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RN) $(CTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -DUPPER -DRN -DCONJ $< -o $@
-
-$(KDIR)ctrsm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CTRSMKERNEL_RT) $(CTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -UDOUBLE -UUPPER -DRT -DCONJ $< -o $@
-
-$(KDIR)ztrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LN) $(ZTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@
-
-$(KDIR)ztrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LT) $(ZTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@
-
-$(KDIR)ztrsm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LN) $(ZTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DLN -DCONJ $< -o $@
-
-$(KDIR)ztrsm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_LT) $(ZTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DLT -DCONJ $< -o $@
-
-$(KDIR)ztrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RN) $(ZTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@
-
-$(KDIR)ztrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RT) $(ZTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DRT -UCONJ $< -o $@
-
-$(KDIR)ztrsm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RN) $(ZTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -DUPPER -DRN -DCONJ $< -o $@
-
-$(KDIR)ztrsm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZTRSMKERNEL_RT) $(ZTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DDOUBLE -UUPPER -DRT -DCONJ $< -o $@
-
-$(KDIR)xtrsm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LN) $(XTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DLN -UCONJ $< -o $@
-
-$(KDIR)xtrsm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LT) $(XTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DLT -UCONJ $< -o $@
-
-$(KDIR)xtrsm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LN) $(XTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DLN -DCONJ $< -o $@
-
-$(KDIR)xtrsm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_LT) $(XTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DLT -DCONJ $< -o $@
-
-$(KDIR)xtrsm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RN) $(XTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DRN -UCONJ $< -o $@
-
-$(KDIR)xtrsm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -UCONJ $< -o $@
-
-$(KDIR)xtrsm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RN) $(XTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -DUPPER -DRN -DCONJ $< -o $@
-
-$(KDIR)xtrsm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XTRSMKERNEL_RT) $(XTRSMDEPEND)
-	$(CC) -c $(PFLAGS) -DTRSMKERNEL -DCOMPLEX -DXDOUBLE -UUPPER -DRT -DCONJ $< -o $@
-
-
-$(KDIR)strmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)strmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)strmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)strmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)strmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)strmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)strmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)strmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)dtrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)dtrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)dtrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)dtrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)dtrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)dtrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)dtrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)dtrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)qtrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)qtrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)qtrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)qtrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)qtrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)qtrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)qtrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)qtrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)qtrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)qtrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_uncopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)qtrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)qtrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_lncopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)qtrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)qtrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_utcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)qtrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)qtrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ctrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ctrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ctrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ctrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ctrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ctrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ctrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ztrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ztrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ztrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ztrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ztrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ztrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ztrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)xtrmm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)xtrmm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)xtrmm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)xtrmm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)xtrmm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)xtrmm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)xtrmm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)xtrmm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)xtrmm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)xtrmm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_uncopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)xtrmm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)xtrmm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_lncopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)xtrmm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)xtrmm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_utcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)xtrmm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)xtrmm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrmm_ltcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ssymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@
-
-$(KDIR)ssymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@
-
-$(KDIR)ssymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@
-
-$(KDIR)ssymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@
-
-$(KDIR)dsymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@
-
-$(KDIR)dsymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@
-
-$(KDIR)dsymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@
-
-$(KDIR)dsymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@
-
-$(KDIR)qsymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER $< -o $@
-
-$(KDIR)qsymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER $< -o $@
-
-$(KDIR)qsymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_ucopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER $< -o $@
-
-$(KDIR)qsymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/symm_lcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER $< -o $@
-
-$(KDIR)csymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@
-
-$(KDIR)csymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@
-
-$(KDIR)csymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@
-
-$(KDIR)csymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@
-
-$(KDIR)zsymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@
-
-$(KDIR)zsymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@
-
-$(KDIR)zsymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@
-
-$(KDIR)zsymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@
-
-$(KDIR)xsymm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@
-
-$(KDIR)xsymm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@
-
-$(KDIR)xsymm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@
-
-$(KDIR)xsymm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zsymm_lcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@
-
-$(KDIR)chemm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@
-
-$(KDIR)chemm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@
-
-$(KDIR)chemm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@
-
-$(KDIR)chemm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@
-
-$(KDIR)zhemm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@
-
-$(KDIR)zhemm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@
-
-$(KDIR)zhemm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@
-
-$(KDIR)zhemm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@
-
-$(KDIR)xhemm_outcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@
-
-$(KDIR)xhemm_oltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@
-
-$(KDIR)xhemm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@
-
-$(KDIR)xhemm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@
-
-$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
-
-$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
-
-$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
-
-$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
-
-$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
-
-$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
-
-$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
-
-$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
-
-$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
-
-$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
-	$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
-
-$(KDIR)strsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)strsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)strsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)strsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)strsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)strsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)strsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)strsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)strsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)dtrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)dtrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)dtrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)dtrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)dtrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)dtrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)dtrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)dtrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)qtrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)qtrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)qtrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)qtrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)qtrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)qtrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)qtrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)qtrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)qtrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)qtrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)qtrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)qtrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_lncopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)qtrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)qtrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_utcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)qtrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)qtrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ctrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ctrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ctrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ctrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ctrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ctrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ctrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ctrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ztrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ztrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ztrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ztrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ztrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)ztrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)ztrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)ztrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)xtrsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)xtrsm_iunncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)xtrsm_ilnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)xtrsm_ilnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)xtrsm_iutucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)xtrsm_iutncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)xtrsm_iltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)xtrsm_iltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_M).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)xtrsm_ounucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)xtrsm_ounncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_uncopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)xtrsm_olnucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)xtrsm_olnncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_lncopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-$(KDIR)xtrsm_outucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@
-
-$(KDIR)xtrsm_outncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_utcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER -UUNIT $< -o $@
-
-$(KDIR)xtrsm_oltucopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -DUNIT $< -o $@
-
-$(KDIR)xtrsm_oltncopy$(TSUFFIX).$(PSUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_N).c
-	$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@
-
-
-#####  BLAS extensions ######
-
-ifndef DOMATCOPY_CN
-DOMATCOPY_CN = ../arm/omatcopy_cn.c
-endif
-
-$(KDIR)domatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DOMATCOPY_CN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@
-
-ifndef DOMATCOPY_RN
-DOMATCOPY_RN = ../arm/omatcopy_rn.c
-endif
-
-$(KDIR)domatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DOMATCOPY_RN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@
-
-ifndef DOMATCOPY_CT
-DOMATCOPY_CT = ../arm/omatcopy_ct.c
-endif
-
-$(KDIR)domatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DOMATCOPY_CT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@
-
-ifndef DOMATCOPY_RT
-DOMATCOPY_RT = ../arm/omatcopy_rt.c
-endif
-
-$(KDIR)domatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DOMATCOPY_RT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@
-
-ifndef DIMATCOPY_CN
-DIMATCOPY_CN = ../generic/imatcopy_cn.c
-endif
-
-$(KDIR)dimatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_CN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@
-
-ifndef DIMATCOPY_RN
-DIMATCOPY_RN = ../generic/imatcopy_rn.c
-endif
-
-$(KDIR)dimatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_RN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@
-
-ifndef DIMATCOPY_CT
-DIMATCOPY_CT = ../generic/imatcopy_ct.c
-endif
-
-$(KDIR)dimatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_CT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@
-
-ifndef DIMATCOPY_RT
-DIMATCOPY_RT = ../generic/imatcopy_rt.c
-endif
-
-$(KDIR)dimatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DIMATCOPY_RT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DROWM $< -o $@
-
-ifndef SOMATCOPY_CN
-SOMATCOPY_CN = ../arm/omatcopy_cn.c
-endif
-
-$(KDIR)somatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SOMATCOPY_CN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@
-
-ifndef SOMATCOPY_RN
-SOMATCOPY_RN = ../arm/omatcopy_rn.c
-endif
-
-$(KDIR)somatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SOMATCOPY_RN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@
-
-ifndef SOMATCOPY_CT
-SOMATCOPY_CT = ../arm/omatcopy_ct.c
-endif
-
-$(KDIR)somatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SOMATCOPY_CT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@
-
-ifndef SOMATCOPY_RT
-SOMATCOPY_RT = ../arm/omatcopy_rt.c
-endif
-
-$(KDIR)somatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SOMATCOPY_RT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@
-
-ifndef SIMATCOPY_CN
-SIMATCOPY_CN = ../generic/imatcopy_cn.c
-endif
-
-$(KDIR)simatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_CN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@
-
-ifndef SIMATCOPY_RN
-SIMATCOPY_RN = ../generic/imatcopy_rn.c
-endif
-
-$(KDIR)simatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_RN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@
-
-ifndef SIMATCOPY_CT
-SIMATCOPY_CT = ../generic/imatcopy_ct.c
-endif
-
-$(KDIR)simatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_CT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@
-
-ifndef SIMATCOPY_RT
-SIMATCOPY_RT = ../generic/imatcopy_rt.c
-endif
-
-$(KDIR)simatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SIMATCOPY_RT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DROWM $< -o $@
-
-
-ifndef COMATCOPY_CN
-COMATCOPY_CN = ../arm/zomatcopy_cn.c
-endif
-
-$(KDIR)comatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_CN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@
-
-ifndef COMATCOPY_RN
-COMATCOPY_RN = ../arm/zomatcopy_rn.c
-endif
-
-$(KDIR)comatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_RN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@
-
-ifndef COMATCOPY_CT
-COMATCOPY_CT = ../arm/zomatcopy_ct.c
-endif
-
-$(KDIR)comatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_CT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@
-
-ifndef COMATCOPY_RT
-COMATCOPY_RT = ../arm/zomatcopy_rt.c
-endif
-
-$(KDIR)comatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_RT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@
-
-ifndef COMATCOPY_CNC
-COMATCOPY_CNC = ../arm/zomatcopy_cnc.c
-endif
-
-$(KDIR)comatcopy_k_cnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_CNC)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@
-
-ifndef COMATCOPY_RNC
-COMATCOPY_RNC = ../arm/zomatcopy_rnc.c
-endif
-
-$(KDIR)comatcopy_k_rnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_RNC)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@
-
-ifndef COMATCOPY_CTC
-COMATCOPY_CTC = ../arm/zomatcopy_ctc.c
-endif
-
-$(KDIR)comatcopy_k_ctc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_CTC)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@
-
-ifndef COMATCOPY_RTC
-COMATCOPY_RTC = ../arm/zomatcopy_rtc.c
-endif
-
-$(KDIR)comatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(COMATCOPY_RTC)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@
-
-ifndef CIMATCOPY_CN
-CIMATCOPY_CN = ../generic/zimatcopy_cn.c
-endif
-
-$(KDIR)cimatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@
-
-ifndef CIMATCOPY_RN
-CIMATCOPY_RN = ../generic/zimatcopy_rn.c
-endif
-
-$(KDIR)cimatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@
-
-ifndef CIMATCOPY_CT
-CIMATCOPY_CT = ../generic/zimatcopy_ct.c
-endif
-
-$(KDIR)cimatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@
-
-ifndef CIMATCOPY_RT
-CIMATCOPY_RT = ../generic/zimatcopy_rt.c
-endif
-
-$(KDIR)cimatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@
-
-ifndef CIMATCOPY_CNC
-CIMATCOPY_CNC = ../generic/zimatcopy_cnc.c
-endif
-
-$(KDIR)cimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CNC)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@
-
-ifndef CIMATCOPY_RNC
-CIMATCOPY_RNC = ../generic/zimatcopy_rnc.c
-endif
-
-$(KDIR)cimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RNC)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@
-
-ifndef CIMATCOPY_CTC
-CIMATCOPY_CTC = ../generic/zimatcopy_ctc.c
-endif
-
-$(KDIR)cimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_CTC)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@
-
-ifndef CIMATCOPY_RTC
-CIMATCOPY_RTC = ../generic/zimatcopy_rtc.c
-endif
-
-$(KDIR)cimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CIMATCOPY_RTC)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@
-
-
-
-ifndef ZOMATCOPY_CN
-ZOMATCOPY_CN = ../arm/zomatcopy_cn.c
-endif
-
-$(KDIR)zomatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_CN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@
-
-ifndef ZOMATCOPY_RN
-ZOMATCOPY_RN = ../arm/zomatcopy_rn.c
-endif
-
-$(KDIR)zomatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@
-
-ifndef ZOMATCOPY_CT
-ZOMATCOPY_CT = ../arm/zomatcopy_ct.c
-endif
-
-$(KDIR)zomatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_CT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@
-
-ifndef ZOMATCOPY_RT
-ZOMATCOPY_RT = ../arm/zomatcopy_rt.c
-endif
-
-$(KDIR)zomatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@
-
-ifndef ZOMATCOPY_CNC
-ZOMATCOPY_CNC = ../arm/zomatcopy_cnc.c
-endif
-
-$(KDIR)zomatcopy_k_cnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_CNC)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@
-
-ifndef ZOMATCOPY_RNC
-ZOMATCOPY_RNC = ../arm/zomatcopy_rnc.c
-endif
-
-$(KDIR)zomatcopy_k_rnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RNC)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@
-
-ifndef ZOMATCOPY_CTC
-ZOMATCOPY_CTC = ../arm/zomatcopy_ctc.c
-endif
-
-$(KDIR)zomatcopy_k_ctc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_CTC)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@
-
-ifndef ZOMATCOPY_RTC
-ZOMATCOPY_RTC = ../arm/zomatcopy_rtc.c
-endif
-
-$(KDIR)zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RTC)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@
-
-ifndef ZIMATCOPY_CN
-ZIMATCOPY_CN = ../generic/zimatcopy_cn.c
-endif
-
-$(KDIR)zimatcopy_k_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@
-
-ifndef ZIMATCOPY_RN
-ZIMATCOPY_RN = ../generic/zimatcopy_rn.c
-endif
-
-$(KDIR)zimatcopy_k_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@
-
-ifndef ZIMATCOPY_CT
-ZIMATCOPY_CT = ../generic/zimatcopy_ct.c
-endif
-
-$(KDIR)zimatcopy_k_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -UCONJ $< -o $@
-
-ifndef ZIMATCOPY_RT
-ZIMATCOPY_RT = ../generic/zimatcopy_rt.c
-endif
-
-$(KDIR)zimatcopy_k_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -UCONJ $< -o $@
-
-ifndef ZIMATCOPY_CNC
-ZIMATCOPY_CNC = ../generic/zimatcopy_cnc.c
-endif
-
-$(KDIR)zimatcopy_k_cnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CNC)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@
-
-ifndef ZIMATCOPY_RNC
-ZIMATCOPY_RNC = ../generic/zimatcopy_rnc.c
-endif
-
-$(KDIR)zimatcopy_k_rnc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RNC)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@
-
-ifndef ZIMATCOPY_CTC
-ZIMATCOPY_CTC = ../generic/zimatcopy_ctc.c
-endif
-
-$(KDIR)zimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_CTC)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM -DCONJ $< -o $@
-
-ifndef ZIMATCOPY_RTC
-ZIMATCOPY_RTC = ../generic/zimatcopy_rtc.c
-endif
-
-$(KDIR)zimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZIMATCOPY_RTC)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@
-
-
-ifndef SGEADD_K
-SGEADD_K = ../generic/geadd.c
-endif
-
-$(KDIR)sgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEADD_K)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@
-
-ifndef DGEADD_K
-DGEADD_K = ../generic/geadd.c
-endif
-
-$(KDIR)dgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEADD_K)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@
-
-ifndef CGEADD_K
-CGEADD_K = ../generic/zgeadd.c
-endif
-
-$(KDIR)cgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEADD_K)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM $< -o $@
-
-ifndef ZGEADD_K
-ZGEADD_K = ../generic/zgeadd.c
-endif
-
-$(KDIR)zgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEADD_K)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM $< -o $@
-
-
-
-######  BLAS small matrix optimization #####
-
-ifndef DGEMM_SMALL_M_PERMIT
-DGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c
-endif
-
-ifndef DGEMM_SMALL_K_NN
-DGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c
-endif
-
-ifndef DGEMM_SMALL_K_NT
-DGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c
-endif
-
-ifndef DGEMM_SMALL_K_TN
-DGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c
-endif
-
-ifndef DGEMM_SMALL_K_TT
-DGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c
-endif
-
-$(KDIR)dgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_M_PERMIT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_NN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_NT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_TN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_TT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
-
-ifndef DGEMM_SMALL_K_B0_NN
-DGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c
-endif
-
-ifndef DGEMM_SMALL_K_B0_NT
-DGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c
-endif
-
-ifndef DGEMM_SMALL_K_B0_TN
-DGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c
-endif
-
-ifndef DGEMM_SMALL_K_B0_TT
-DGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c
-endif
-
-$(KDIR)dgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_NN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@
-
-$(KDIR)dgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_NT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@
-
-$(KDIR)dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@
-
-$(KDIR)dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -DB0 $< -o $@
-
-ifndef SGEMM_SMALL_M_PERMIT
-SGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c
-endif
-
-ifndef SGEMM_SMALL_K_NN
-SGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c
-endif
-
-ifndef SGEMM_SMALL_K_NT
-SGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c
-endif
-
-ifndef SGEMM_SMALL_K_TN
-SGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c
-endif
-
-ifndef SGEMM_SMALL_K_TT
-SGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c
-endif
-
-$(KDIR)sgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_M_PERMIT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_NN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_NT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_TN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_TT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
-ifndef SGEMM_SMALL_K_B0_NN
-SGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c
-endif
-
-ifndef SGEMM_SMALL_K_B0_NT
-SGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c
-endif
-
-ifndef SGEMM_SMALL_K_B0_TN
-SGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c
-endif
-
-ifndef SGEMM_SMALL_K_B0_TT
-SGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c
-endif
-
-$(KDIR)sgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_NN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@
-
-$(KDIR)sgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_NT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@
-
-$(KDIR)sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@
-
-$(KDIR)sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -DB0 $< -o $@
-
-
-ifeq ($(BUILD_BFLOAT16), 1)
-ifndef SBGEMM_SMALL_M_PERMIT
-SBGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c
-endif
-
-ifndef SBGEMM_SMALL_K_NN
-SBGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c
-endif
-
-ifndef SBGEMM_SMALL_K_NT
-SBGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c
-endif
-
-ifndef SBGEMM_SMALL_K_TN
-SBGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c
-endif
-
-ifndef SBGEMM_SMALL_K_TT
-SBGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c
-endif
-
-$(KDIR)sbgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_M_PERMIT)
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)sbgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_NN)
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)sbgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_NT)
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)sbgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_TN)
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-
-$(KDIR)sbgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_TT)
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-
-ifndef SBGEMM_SMALL_K_B0_NN
-SBGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c
-endif
-
-ifndef SBGEMM_SMALL_K_B0_NT
-SBGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c
-endif
-
-ifndef SBGEMM_SMALL_K_B0_TN
-SBGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c
-endif
-
-ifndef SBGEMM_SMALL_K_B0_TT
-SBGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c
-endif
-
-$(KDIR)sbgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_NN)
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@
-
-$(KDIR)sbgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_NT)
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@
-
-$(KDIR)sbgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_TN)
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@
-
-$(KDIR)sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMALL_K_B0_TT)
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@
-endif
-
-ifndef CGEMM_SMALL_M_PERMIT
-CGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c
-endif
-
-ifndef CGEMM_SMALL_K_NN
-CGEMM_SMALL_K_NN = ../generic/zgemm_small_matrix_kernel_nn.c
-endif
-
-ifndef CGEMM_SMALL_K_NT
-CGEMM_SMALL_K_NT = ../generic/zgemm_small_matrix_kernel_nt.c
-endif
-
-ifndef CGEMM_SMALL_K_TN
-CGEMM_SMALL_K_TN = ../generic/zgemm_small_matrix_kernel_tn.c
-endif
-
-ifndef CGEMM_SMALL_K_TT
-CGEMM_SMALL_K_TT = ../generic/zgemm_small_matrix_kernel_tt.c
-endif
-
-$(KDIR)cgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_M_PERMIT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX $< -o $@
-
-$(KDIR)cgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@
-	
-$(KDIR)cgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $@
-
-$(KDIR)cgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRN $< -o $@
-
-$(KDIR)cgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $@
-
-$(KDIR)cgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $@
-
-$(KDIR)cgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@
-
-$(KDIR)cgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $@
-
-$(KDIR)cgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $@
-
-$(KDIR)cgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $@
-
-$(KDIR)cgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTR $< -o $@
-
-$(KDIR)cgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@
-
-$(KDIR)cgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $@
-
-$(KDIR)cgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $@
-
-$(KDIR)cgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $@
-
-$(KDIR)cgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $@
-
-$(KDIR)cgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@
-
-ifndef CGEMM_SMALL_K_B0_NN
-CGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_nn.c
-endif
-
-ifndef CGEMM_SMALL_K_B0_NT
-CGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_nt.c
-endif
-
-ifndef CGEMM_SMALL_K_B0_TN
-CGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_tn.c
-endif
-
-ifndef CGEMM_SMALL_K_B0_TT
-CGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_tt.c
-endif
-
-$(KDIR)cgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN -DB0 $< -o $@
-	
-$(KDIR)cgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNR -DB0 $< -o $@
-
-$(KDIR)cgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRN -DB0 $< -o $@
-
-$(KDIR)cgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRR -DB0 $< -o $@
-
-$(KDIR)cgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNT -DB0 $< -o $@
-
-$(KDIR)cgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC -DB0 $< -o $@
-
-$(KDIR)cgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT -DB0 $< -o $@
-
-$(KDIR)cgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC=RC -DB0 $< -o $@
-
-$(KDIR)cgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN -DB0 $< -o $@
-
-$(KDIR)cgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTR -DB0 $< -o $@
-
-$(KDIR)cgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN -DB0 $< -o $@
-
-$(KDIR)cgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR=CR -DB0 $< -o $@
-
-$(KDIR)cgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT -DB0 $< -o $@
-
-$(KDIR)cgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTC -DB0 $< -o $@
-
-$(KDIR)cgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCT -DB0 $< -o $@
-
-$(KDIR)cgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC -DB0 $< -o $@
-
-ifndef ZGEMM_SMALL_M_PERMIT
-ZGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c
-endif
-
-ifndef ZGEMM_SMALL_K_NN
-ZGEMM_SMALL_K_NN = ../generic/zgemm_small_matrix_kernel_nn.c
-endif
-
-ifndef ZGEMM_SMALL_K_NT
-ZGEMM_SMALL_K_NT = ../generic/zgemm_small_matrix_kernel_nt.c
-endif
-
-ifndef ZGEMM_SMALL_K_TN
-ZGEMM_SMALL_K_TN = ../generic/zgemm_small_matrix_kernel_tn.c
-endif
-
-ifndef ZGEMM_SMALL_K_TT
-ZGEMM_SMALL_K_TT = ../generic/zgemm_small_matrix_kernel_tt.c
-endif
-
-$(KDIR)zgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_M_PERMIT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX $< -o $@
-
-
-$(KDIR)zgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
-	
-$(KDIR)zgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $@
-
-$(KDIR)zgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRN $< -o $@
-
-$(KDIR)zgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $@
-
-$(KDIR)zgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $@
-
-$(KDIR)zgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@
-
-$(KDIR)zgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $@
-
-$(KDIR)zgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $@
-
-$(KDIR)zgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $@
-
-$(KDIR)zgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $@
-
-$(KDIR)zgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@
-
-$(KDIR)zgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $@
-
-$(KDIR)zgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $@
-
-$(KDIR)zgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $@
-
-$(KDIR)zgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $@
-
-$(KDIR)zgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@
-
-ifndef ZGEMM_SMALL_K_B0_NN
-ZGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_nn.c
-endif
-
-ifndef ZGEMM_SMALL_K_B0_NT
-ZGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_nt.c
-endif
-
-ifndef ZGEMM_SMALL_K_B0_TN
-ZGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_tn.c
-endif
-
-ifndef ZGEMM_SMALL_K_B0_TT
-ZGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_tt.c
-endif
-
-$(KDIR)zgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN -DB0 $< -o $@
-	
-$(KDIR)zgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNR -DB0 $< -o $@
-
-$(KDIR)zgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRN -DB0 $< -o $@
-
-$(KDIR)zgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRR -DB0 $< -o $@
-
-$(KDIR)zgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNT -DB0 $< -o $@
-
-$(KDIR)zgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC -DB0 $< -o $@
-
-$(KDIR)zgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT -DB0 $< -o $@
-
-$(KDIR)zgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC=RC -DB0 $< -o $@
-
-$(KDIR)zgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN -DB0 $< -o $@
-
-$(KDIR)zgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTR -DB0 $< -o $@
-
-$(KDIR)zgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN -DB0 $< -o $@
-
-$(KDIR)zgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR=CR -DB0 $< -o $@
-
-$(KDIR)zgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT -DB0 $< -o $@
-
-$(KDIR)zgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTC -DB0 $< -o $@
-
-$(KDIR)zgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCT -DB0 $< -o $@
-
-$(KDIR)zgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC -DB0 $< -o $@
diff --git a/Makefile.arm64 b/Makefile.arm64
index 064e84cbb..ca053b03d 100644
--- a/Makefile.arm64
+++ b/Makefile.arm64
@@ -69,7 +69,7 @@ endif
 # in GCC>=9
 ifeq ($(CORE), NEOVERSEN1)
 ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
-ifeq ($(GCCVERSIONGTEQ9), 1)
+ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG)))
 CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
@@ -92,26 +92,37 @@ endif
 # in GCC>=10.4
 ifeq ($(CORE), NEOVERSEV1)
 ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
-ifeq ($(GCCVERSIONGTEQ10), 1)
-ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11)))
-CCOMMON_OPT += -march=armv8.4-a+sve -mtune=neoverse-v1
+ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
+ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG)))
+CCOMMON_OPT += -march=armv8.4-a+sve
+ifeq (1, $(ISCLANG))
+CCOMMON_OPT += -mtune=cortex-x1
+else
+CCOMMON_OPT += -mtune=neoverse-v1
+endif
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1
 endif
 else
-CCOMMON_OPT += -march=armv8.4-a+sve -mtune=native
+CCOMMON_OPT += -march=armv8.4-a+sve 
+ifneq ($(CROSS), 1)
+CCOMMON_OPT += -mtune=native
+endif
 ifneq ($(F_COMPILER), NAG)
-FCOMMON_OPT += -march=armv8.4-a -mtune=native
+FCOMMON_OPT += -march=armv8.4-a 
+ifneq ($(CROSS), 1)
+FCOMMON_OPT += -mtune=native
+endif
 endif
 endif
 else
-CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
+CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
 endif
 endif
 else
-CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
+CCOMMON_OPT += -march=armv8-a+sve -mtune=cortex-a72
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
 endif
@@ -122,30 +133,36 @@ endif
 # in GCC>=10.4
 ifeq ($(CORE), NEOVERSEN2)
 ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
-ifeq ($(GCCVERSIONGTEQ10), 1)
-ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11)))
+ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
+ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG)))
 ifneq ($(OSNAME), Darwin)
 CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
 else
-CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
+CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72
 endif
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
 endif
 else
-CCOMMON_OPT += -march=armv8.5-a+sve -mtune=native
+CCOMMON_OPT += -march=armv8.5-a+sve
+ifneq ($(CROSS), 1)
+CCOMMON_OPT += -mtune=native
+endif
 ifneq ($(F_COMPILER), NAG)
-FCOMMON_OPT += -march=armv8.5-a -mtune=native
+FCOMMON_OPT += -march=armv8.5-a 
+ifneq ($(CROSS), 1)
+FCOMMON_OPT += -mtune=native
+endif
 endif
 endif
 else
-CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
+CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
 endif
 endif
 else
-CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
+CCOMMON_OPT += -march=armv8-a+sve -mtune=cortex-a72
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
 endif
@@ -155,7 +172,7 @@ endif
 # Use a53 tunings because a55 is only available in GCC>=8.1
 ifeq ($(CORE), CORTEXA55)
 ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
-ifeq ($(GCCVERSIONGTEQ8), 1)
+ifeq (1, $(filter 1,$(GCCVERSIONGTEQ8) $(ISCLANG)))
 CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
@@ -196,8 +213,13 @@ endif
 endif
 
 ifeq ($(CORE), THUNDERX3T110)
-ifeq ($(GCCVERSIONGTEQ10), 1)
-CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
+ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG)))
+CCOMMON_OPT += -march=armv8.3-a 
+ifeq (0, $(ISCLANG))
+CCOMMON_OPT += -mtune=thunderx3t110
+else
+CCOMMON_OPT += -mtune=thunderx2t99
+endif
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
 endif
@@ -225,9 +247,12 @@ endif
 endif
 endif
 
-ifeq ($(GCCVERSIONGTEQ9), 1)
+ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG)))
 ifeq ($(CORE), EMAG8180)
-CCOMMON_OPT += -march=armv8-a -mtune=emag
+CCOMMON_OPT += -march=armv8-a
+ifeq  ($(ISCLANG), 0)
+CCOMMON_OPT += -mtune=emag
+endif
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8-a -mtune=emag
 endif
diff --git a/Makefile.csky b/Makefile.csky
new file mode 100644
index 000000000..36162af2a
--- /dev/null
+++ b/Makefile.csky
@@ -0,0 +1,4 @@
+ifeq ($(CORE), CK860FV)
+CCOMMON_OPT += -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float
+FCOMMON_OPT += -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float -static
+endif
diff --git a/Makefile.install b/Makefile.install
index 01899b970..b2bc5aa41 100644
--- a/Makefile.install
+++ b/Makefile.install
@@ -2,6 +2,18 @@ TOPDIR	= .
 export GOTOBLAS_MAKEFILE = 1
 -include $(TOPDIR)/Makefile.conf_last
 include ./Makefile.system
+LNCMD = ln -fs
+
+ifdef THELIBNAME
+LIBNAME=$(THELIBNAME)
+LIBSONAME=$(THELIBSONAME)
+endif
+ifeq ($(FIXED_LIBNAME), 1)
+LNCMD = true
+endif
+ifeq ($(INTERFACE64),1)
+USE_64BITINT=1
+endif
 
 PREFIX ?= /opt/OpenBLAS
 
@@ -91,7 +103,7 @@ ifneq ($(NO_STATIC),1)
 	@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
 	@install -m644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
 	@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
-	ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
+	$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
 endif
 #for install shared library
 ifneq ($(NO_SHARED),1)
@@ -99,21 +111,21 @@ ifneq ($(NO_SHARED),1)
 ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly))
 	@install -m755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
 	@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
-	ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
-	ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
+	$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so ; \
+	$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
 endif
 
 ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD))
 	@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
 	@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
-	ln -fs $(LIBSONAME) $(LIBPREFIX).so
+	$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so
 endif
 ifeq ($(OSNAME), Darwin)
 	@-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
 	@-install_name_tool -id "$(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).$(MAJOR_VERSION).dylib" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)"
 	@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
-	ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ; \
-	ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
+	$(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).dylib ; \
+	$(LNCMD) $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
 endif
 ifeq ($(OSNAME), WINNT)
 	@-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
@@ -141,15 +153,15 @@ ifneq ($(NO_STATIC),1)
 	@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
 	@installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
 	@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
-	ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
+	$(LNCMD) $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
 endif
 #for install shared library
 ifneq ($(NO_SHARED),1)
 	@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
 	@installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
 	@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
-	ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
-	ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
+	$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so ; \
+	$(LNCMD) $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
 endif
 
 endif
@@ -162,6 +174,8 @@ endif
 
 	@echo Generating $(LIBSONAMEBASE)$(SUFFIX64).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
 	@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(PKGFILE)"
+	@echo 'libprefix='$(LIBNAMEPREFIX) >> "$(PKGFILE)"
+	@echo 'libnamesuffix='$(LIBNAMESUFFIX) >> "$(PKGFILE)"
 	@echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)"
 	@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)"
 	@echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)"
@@ -178,7 +192,7 @@ endif
 ifneq ($(NO_SHARED),1)
 #ifeq logical or
 ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly))
-	@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
+	@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX)$(SYMBOLSUFFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
 endif
 ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT))
 	@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
diff --git a/Makefile.power b/Makefile.power
index 28a0bae08..3fa6d6faf 100644
--- a/Makefile.power
+++ b/Makefile.power
@@ -11,11 +11,23 @@ endif
 
 ifeq ($(CORE), POWER10)
 ifneq ($(C_COMPILER), PGI)
+ifeq ($(C_COMPILER), GCC)
+ifeq ($(GCCVERSIONGTEQ10), 1)
+CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
+else ifneq ($(GCCVERSIONGT4), 1)
+$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
+CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math
+else
+$(warning your compiler is too old to fully support POWER10, getting a newer version of gcc is recommended)
+CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math
+endif
+else
 CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
+endif
 ifeq ($(F_COMPILER), IBM)
-FCOMMON_OPT += -O2 -qrecur -qnosave
+FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr10 -qtune=pwr10 -qfloat=nomaf -qzerosize
 else
-FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10  -fno-fast-math
+FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math
 endif
 endif
 endif
@@ -38,19 +50,18 @@ CCOMMON_OPT += -fast -Mvect=simd -Mcache_align
 endif
 ifneq ($(F_COMPILER), PGI)
 ifeq ($(F_COMPILER), IBM)
-FCOMMON_OPT += -O2 -qrecur -qnosave
+FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr9 -qtune=pwr9 -qfloat=nomaf -qzerosize
 else
-FCOMMON_OPT += -O2 -frecursive -fno-fast-math
+FCOMMON_OPT += -O2 -frecursive -fno-fast-math -mcpu=power9 -mtune=power9
 endif
-ifeq ($(C_COMPILER), GCC)
+
+ifeq ($(F_COMPILER), GFORTRAN)
 ifneq ($(GCCVERSIONGT4), 1)
 $(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
 FCOMMON_OPT += -mcpu=power8 -mtune=power8 
 else
 FCOMMON_OPT += -mcpu=power9 -mtune=power9 
 endif
-else
-FCOMMON_OPT += -mcpu=power9 -mtune=power9
 endif
 else
 FCOMMON_OPT += -O2 -Mrecursive
@@ -66,12 +77,16 @@ endif
 ifneq ($(F_COMPILER), PGI)
 ifeq ($(OSNAME), AIX)
 ifeq ($(F_COMPILER), IBM)
-FCOMMON_OPT += -O2 -qrecur -qnosave
+FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr8 -qtune=pwr8 -qfloat=nomaf -qzerosize
 else
-FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8  -fno-fast-math 
+FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math
 endif
 else
-FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8  -fno-fast-math 
+ifeq ($(F_COMPILER), IBM)
+FCOMMON_OPT += -O2 -qrecur -qnosave -qarch=pwr8 -qtune=pwr8 -qfloat=nomaf -qzerosize
+else
+FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math
+endif
 endif
 else
 FCOMMON_OPT += -O2 -Mrecursive
@@ -84,13 +99,20 @@ CCOMMON_OPT += -DUSE_OPENMP -fopenmp
 else
 CCOMMON_OPT += -DUSE_OPENMP -mp
 endif
+ifeq ($(F_COMPILER), IBM)
+FCOMMON_OPT += -DUSE_OPENMP 
+else
 ifneq ($(F_COMPILER), PGI)
 FCOMMON_OPT += -DUSE_OPENMP -fopenmp
 else
 FCOMMON_OPT += -DUSE_OPENMP -mp
 endif
 endif
+endif
 
+ifeq ($(C_COMPILER), CLANG)
+CCOMMON_OPT += -fno-integrated-as
+endif
 # workaround for C->FORTRAN ABI violation in LAPACKE
 ifeq ($(F_COMPILER), GFORTRAN)
 FCOMMON_OPT += -fno-optimize-sibling-calls
@@ -125,8 +147,19 @@ endif
 ifdef BINARY64
 
 
+ifeq ($(C_COMPILER)$(F_COMPILER)$(OSNAME), GCCIBMAIX)
+$(error Using GCC and XLF on AIX is not a supported combination.)
+endif
+ifeq ($(C_COMPILER)$(F_COMPILER)$(OSNAME), CLANGGFORTRANAIX)
+$(error Using Clang and gFortran on AIX is not a supported combination.)
+endif
+
 ifeq ($(OSNAME), AIX)
+ifeq ($(C_COMPILER), GCC)
 CCOMMON_OPT	+= -mpowerpc64 -maix64
+else
+CCOMMON_OPT	+= -m64
+endif
 ifeq ($(COMPILER_F77), g77)
 FCOMMON_OPT	+= -mpowerpc64 -maix64
 endif
diff --git a/Makefile.prebuild b/Makefile.prebuild
index 0be4f1274..b7d695a75 100644
--- a/Makefile.prebuild
+++ b/Makefile.prebuild
@@ -55,6 +55,26 @@ ifeq ($(TARGET), C910V)
 TARGET_FLAGS = -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d
 endif
 
+ifeq ($(TARGET), CK860FV)
+TARGET_FLAGS = -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float
+endif
+
+ifeq ($(TARGET), x280)
+TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d
+endif
+
+ifeq ($(TARGET), RISCV64_ZVL256B)
+TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d
+endif
+
+ifeq ($(TARGET), RISCV64_ZVL128B)
+TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d
+endif
+
+ifeq ($(TARGET), RISCV64_GENERIC)
+TARGET_FLAGS = -march=rv64imafdc -mabi=lp64d
+endif
+
 all: getarch_2nd
 	./getarch_2nd  0 >> $(TARGET_MAKE)
 	./getarch_2nd  1 >> $(TARGET_CONF)
diff --git a/Makefile.riscv64 b/Makefile.riscv64
index ce91e03ec..113cc57c5 100644
--- a/Makefile.riscv64
+++ b/Makefile.riscv64
@@ -2,3 +2,19 @@ ifeq ($(CORE), C910V)
 CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920
 FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static
 endif
+ifeq ($(CORE), x280)
+CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math
+FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static
+endif
+ifeq ($(CORE), RISCV64_ZVL256B)
+CCOMMON_OPT += -march=rv64imafdcv_zvl256b -mabi=lp64d
+FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static
+endif
+ifeq ($(CORE), RISCV64_ZVL128B)
+CCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d 
+FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static
+endif
+ifeq ($(CORE), RISCV64_GENERIC)
+CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d
+FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static
+endif
diff --git a/Makefile.rule b/Makefile.rule
index ab46fd075..02ec739fd 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,12 @@
 #
 
 # This library's version
-VERSION = 0.3.23
+VERSION = 0.3.26.dev
+
+# If you set this prefix, the library name will be lib$(LIBNAMEPREFIX)openblas.a
+# and lib$(LIBNAMEPREFIX)openblas.so, with a matching soname in the shared library
+# 
+# LIBNAMEPREFIX = scipy
 
 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
diff --git a/Makefile.system b/Makefile.system
index 343b94bb3..aadf3459a 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -277,10 +277,6 @@ endif
 ifndef GOTOBLAS_MAKEFILE
 export GOTOBLAS_MAKEFILE = 1
 
-# Determine if the assembler is GNU Assembler
-HAVE_GAS := $(shell $(AS) -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null ; echo $$?)
-GETARCH_FLAGS += -DHAVE_GAS=$(HAVE_GAS)
-
 # Generating Makefile.conf and config.h
 DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) DYNAMIC_ARCH=$(DYNAMIC_ARCH) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all)
 
@@ -384,6 +380,11 @@ GCCMINORVERSIONGTEQ4 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d
 GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7)
 endif
 
+ifeq ($(C_COMPILER), CLANG)
+CLANGVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
+CLANGVERSIONGTEQ12 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 12)
+endif
+
 #
 #  OS dependent settings
 #
@@ -392,11 +393,22 @@ ifeq ($(OSNAME), Darwin)
 ifndef MACOSX_DEPLOYMENT_TARGET
 ifeq ($(ARCH), arm64)
 export MACOSX_DEPLOYMENT_TARGET=11.0
+ifeq ($(C_COMPILER), GCC)
+export NO_SVE = 1
+endif
 else
 export MACOSX_DEPLOYMENT_TARGET=10.8
 endif
 endif
 MD5SUM = md5 -r
+XCVER = $(shell pkgutil --pkg-info=com.apple.pkg.Xcode |awk '/version:/ {print $2}'|cut -d: -f2|cut -f1 -d.)
+ifeq (x$(XCVER)x,xx)
+XCVER = $(shell pkgutil --pkg-info=com.apple.pkg.CLTools_Executables |awk '/version:/ {print $2}'|cut -d: -f2|cut -f1 -d.)
+endif
+ifeq (x$(XCVER), x 15)
+CCOMMON_OPT += -Wl,-ld_classic
+FCOMMON_OPT += -Wl,-ld_classic
+endif
 endif
 
 ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD DragonFly))
@@ -597,6 +609,9 @@ endif
 
 ifeq ($(C_COMPILER), CLANG)
 CCOMMON_OPT    += -fopenmp
+ifeq ($(F_COMPILER), GFORTRAN)
+FEXTRALIB := $(subst -lgomp,-lomp,$(FEXTRALIB))
+endif
 endif
 
 ifeq ($(C_COMPILER), INTEL)
@@ -645,7 +660,7 @@ DYNAMIC_CORE += HASWELL ZEN
 endif
 ifneq ($(NO_AVX512), 1)
 ifneq ($(NO_AVX2), 1)
-DYNAMIC_CORE += SKYLAKEX COOPERLAKE
+DYNAMIC_CORE += SKYLAKEX COOPERLAKE SAPPHIRERAPIDS
 endif
 endif
 endif
@@ -662,15 +677,12 @@ ifeq ($(ARCH), arm64)
 DYNAMIC_CORE =  ARMV8
 DYNAMIC_CORE += CORTEXA53
 DYNAMIC_CORE += CORTEXA57
-DYNAMIC_CORE += CORTEXA72
-DYNAMIC_CORE += CORTEXA73
 DYNAMIC_CORE += NEOVERSEN1
 ifneq ($(NO_SVE), 1)
 DYNAMIC_CORE += NEOVERSEV1
 DYNAMIC_CORE += NEOVERSEN2
+DYNAMIC_CORE += ARMV8SVE
 endif
-DYNAMIC_CORE += CORTEXA55
-DYNAMIC_CORE += FALKOR
 DYNAMIC_CORE += THUNDERX
 DYNAMIC_CORE += THUNDERX2T99
 DYNAMIC_CORE += TSV110
@@ -744,7 +756,11 @@ DYNAMIC_CORE += POWER9
 else
 $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
 endif
+ifeq ($(OSNAME), AIX)
+LDVERSIONGTEQ35 := 1
+else
 LDVERSIONGTEQ35 := $(shell expr `$(CC) -Wl,--version 2> /dev/null | head -1 | cut -f2 -d "." | cut -f1 -d "-"` \>= 35)
+endif
 ifeq ($(GCCVERSIONGTEQ11)$(LDVERSIONGTEQ35), 11)
 DYNAMIC_CORE += POWER10
 CCOMMON_OPT += -DHAVE_P10_SUPPORT
@@ -857,6 +873,11 @@ endif
 endif
 endif
 
+ifeq ($(ARCH), csky)
+NO_BINARY_MODE	= 1
+BINARY_DEFINED	= 1
+endif
+
 #
 #  C Compiler dependent settings
 #
@@ -932,8 +953,12 @@ BINARY_DEFINED = 1
 endif
 
 ifeq ($(ARCH), loongarch64)
-CCOMMON_OPT += -march=loongarch64 -mabi=lp64
-FCOMMON_OPT += -march=loongarch64 -mabi=lp64
+LA64_ABI=$(shell $(CC) -mabi=lp64d -c $(TOPDIR)/cpuid_loongarch64.c -o /dev/null > /dev/null 2> /dev/null && echo lp64d)
+ifneq ($(LA64_ABI), lp64d)
+LA64_ABI=lp64
+endif
+CCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI)
+FCOMMON_OPT += -march=loongarch64 -mabi=$(LA64_ABI)
 endif
 
 endif
@@ -1082,8 +1107,9 @@ endif
 endif
 endif
 
-ifeq ($(F_COMPILER), GFORTRAN)
+ifeq ($(F_COMPILER), $(filter $(F_COMPILER),GFORTRAN FLANGNEW))
 CCOMMON_OPT += -DF_INTERFACE_GFORT
+ifeq ($(F_COMPILER), GFORTRAN)
 FCOMMON_OPT += -Wall
 # make single-threaded LAPACK calls thread-safe #1847
 FCOMMON_OPT += -frecursive
@@ -1097,6 +1123,7 @@ EXTRALIB += -lgfortran
 endif
 endif
 endif
+endif
 ifdef NO_BINARY_MODE
 ifeq ($(ARCH), $(filter $(ARCH),mips64))
 ifdef BINARY64
@@ -1152,6 +1179,10 @@ endif
 
 ifeq ($(F_COMPILER), IBM)
 CCOMMON_OPT += -DF_INTERFACE_IBM
+FEXTRALIB += -lxlf90
+ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC CLANG))
+FCOMMON_OPT += -qextname
+endif
 # FCOMMON_OPT	+= -qarch=440
 ifdef BINARY64
 FCOMMON_OPT += -q64
@@ -1348,6 +1379,8 @@ ifeq ($(F_COMPILER), SUN)
 FCOMMON_OPT  += -pic
 else ifeq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -PIC
+else ifeq ($(F_COMPILER), IBM)
+FCOMMON_OPT += -qpic=large
 else
 FCOMMON_OPT += -fPIC
 endif
@@ -1483,16 +1516,28 @@ ifndef LIBSONAMEBASE
 LIBSONAMEBASE = openblas
 endif
 
+ifndef LIBNAMEPREFIX
+LIBNAMEPREFIX =
+endif
+
+SYMPREFIX=$(SYMBOLPREFIX)
+ifeq ($(SYMBOLPREFIX),$(LIBNAMEPREFIX))
+SYMPREFIX=
+endif
+SYMSUFFIX=$(SYMBOLSUFFIX)
+ifeq ($(SYMBOLSUFFIX),$(LIBNAMESUFFIX))
+SYMSUFFIX=
+endif
 ifndef LIBNAMESUFFIX
-LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)
+LIBNAMEBASE = $(SYMPREFIX)$(LIBSONAMEBASE)$(SYMSUFFIX)
 else
-LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)_$(LIBNAMESUFFIX)
+LIBNAMEBASE = $(SYMPREFIX)$(LIBSONAMEBASE)$(SYMSUFFIX)$(LIBNAMESUFFIX)
 endif
 
 ifeq ($(OSNAME), CYGWIN_NT)
-LIBPREFIX = cyg$(LIBNAMEBASE)
+LIBPREFIX = cyg$(LIBNAMEPREFIX)$(LIBNAMEBASE)
 else
-LIBPREFIX = lib$(LIBNAMEBASE)
+LIBPREFIX = lib$(LIBNAMEPREFIX)$(LIBNAMEBASE)
 endif
 
 KERNELDIR	= $(TOPDIR)/kernel/$(ARCH)
@@ -1600,9 +1645,11 @@ override FPFLAGS    += $(FCOMMON_OPT) $(COMMON_PROF)
 
 ifeq ($(NEED_PIC), 1)
 ifeq (,$(findstring PIC,$(FFLAGS)))
+ifneq ($(F_COMPILER),IBM)
 override FFLAGS += -fPIC
 endif
 endif
+endif
 
 #For LAPACK Fortran codes.
 #Disable -fopenmp for LAPACK Fortran codes on Windows.
@@ -1616,11 +1663,11 @@ endif
 
 ifeq ($(F_COMPILER),NAG)
 LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
-FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
+override FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
 endif
 ifeq ($(F_COMPILER),CRAY)
 LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
-FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
+override FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
 endif
 
 LAPACK_CFLAGS = $(CFLAGS)
@@ -1669,14 +1716,14 @@ LIBNAME_P	= $(LIBPREFIX)p$(REVISION)_p.$(LIBSUFFIX)
 endif
 endif
 
+ifeq ($(FIXED_LIBNAME),1) # fixed names: no $(REVISION) component in the library file names
+	LIBNAME = lib$(LIBNAMEPREFIX)$(LIBSONAMEBASE)$(LIBNAMESUFFIX).$(LIBSUFFIX)
+	LIBNAME_P = lib$(LIBNAMEPREFIX)$(LIBSONAMEBASE)$(LIBNAMESUFFIX)_p.$(LIBSUFFIX)
+endif
 
 LIBDLLNAME   = $(LIBPREFIX).dll
 IMPLIBNAME   = lib$(LIBNAMEBASE).dll.a
-ifneq ($(OSNAME), AIX)
 LIBSONAME    = $(LIBNAME:.$(LIBSUFFIX)=.so)
-else
-LIBSONAME    = $(LIBNAME:.$(LIBSUFFIX)=.a)
-endif
 LIBDYNNAME   = $(LIBNAME:.$(LIBSUFFIX)=.dylib)
 LIBDEFNAME   = $(LIBNAME:.$(LIBSUFFIX)=.def)
 LIBEXPNAME   = $(LIBNAME:.$(LIBSUFFIX)=.exp)
@@ -1763,6 +1810,8 @@ export TARGET_CORE
 export NO_AVX512
 export NO_AVX2
 export BUILD_BFLOAT16
+export NO_LSX
+export NO_LASX
 
 export SBGEMM_UNROLL_M
 export SBGEMM_UNROLL_N
diff --git a/Makefile.x86_64 b/Makefile.x86_64
index 7ab331b1f..c0dbe84c8 100644
--- a/Makefile.x86_64
+++ b/Makefile.x86_64
@@ -75,18 +75,31 @@ endif
 ifeq ($(CORE), COOPERLAKE)
 ifndef NO_AVX512
 ifeq ($(C_COMPILER), GCC)
-# cooperlake support was added in 10.1
-ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11)
-CCOMMON_OPT += -march=cooperlake
-ifneq ($(F_COMPILER), NAG)
-FCOMMON_OPT += -march=cooperlake
-endif
-else  # gcc not support, fallback to avx512
-CCOMMON_OPT += -march=skylake-avx512
-ifneq ($(F_COMPILER), NAG)
-FCOMMON_OPT += -march=skylake-avx512
-endif
-endif
+ # cooperlake support was added in 10.1
+ ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11)
+  CCOMMON_OPT += -march=cooperlake
+  ifneq ($(F_COMPILER), NAG)
+   FCOMMON_OPT += -march=cooperlake
+  endif
+ else  # gcc not support, fallback to avx512
+  CCOMMON_OPT += -march=skylake-avx512
+  ifneq ($(F_COMPILER), NAG)
+   FCOMMON_OPT += -march=skylake-avx512
+  endif
+ endif
+else ifeq ($(C_COMPILER), CLANG)
+ # cooperlake support was added in clang 9
+ ifeq ($(CLANGVERSIONGTEQ9), 1)
+  CCOMMON_OPT += -march=cooperlake
+  ifneq ($(F_COMPILER), NAG)
+   FCOMMON_OPT += -march=cooperlake
+  endif
+ else  # not supported in clang, fallback to avx512
+  CCOMMON_OPT += -march=skylake-avx512
+  ifneq ($(F_COMPILER), NAG)
+   FCOMMON_OPT += -march=skylake-avx512
+  endif
+ endif
 endif
 ifeq ($(OSNAME), CYGWIN_NT)
 CCOMMON_OPT += -fno-asynchronous-unwind-tables
@@ -104,18 +117,31 @@ endif
 ifeq ($(CORE), SAPPHIRERAPIDS)
 ifndef NO_AVX512
 ifeq ($(C_COMPILER), GCC)
-# sapphire rapids support was added in 11
-ifeq ($(GCCVERSIONGTEQ11), 1)
-CCOMMON_OPT += -march=sapphirerapids
-ifneq ($(F_COMPILER), NAG)
-FCOMMON_OPT += -march=sapphirerapids
-endif
-else  # gcc not support, fallback to avx512
-CCOMMON_OPT += -march=skylake-avx512
-ifneq ($(F_COMPILER), NAG)
-FCOMMON_OPT += -march=skylake-avx512
-endif
-endif
+ # sapphire rapids support was added in 11
+ ifeq ($(GCCVERSIONGTEQ11), 1)
+  CCOMMON_OPT += -march=sapphirerapids
+  ifneq ($(F_COMPILER), NAG)
+   FCOMMON_OPT += -march=sapphirerapids
+  endif
+ else  # gcc not support, fallback to avx512
+  CCOMMON_OPT += -march=skylake-avx512
+  ifneq ($(F_COMPILER), NAG)
+   FCOMMON_OPT += -march=skylake-avx512
+  endif
+ endif
+else ifeq ($(C_COMPILER), CLANG)
+ # sapphire rapids support was added in clang 12
+ ifeq ($(CLANGVERSIONGTEQ12), 1)
+  CCOMMON_OPT += -march=sapphirerapids
+  ifneq ($(F_COMPILER), NAG)
+   FCOMMON_OPT += -march=sapphirerapids
+  endif
+ else  # not supported in clang, fallback to avx512
+  CCOMMON_OPT += -march=skylake-avx512
+  ifneq ($(F_COMPILER), NAG)
+   FCOMMON_OPT += -march=skylake-avx512
+  endif
+ endif
 endif
 ifeq ($(OSNAME), CYGWIN_NT)
 CCOMMON_OPT += -fno-asynchronous-unwind-tables
diff --git a/README.md b/README.md
index 6ce85e08e..43f390db0 100644
--- a/README.md
+++ b/README.md
@@ -6,11 +6,15 @@ Travis CI: [![Build Status](https://travis-ci.com/xianyi/OpenBLAS.svg?branch=dev
 
 AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
 
-Drone CI: [![Build Status](https://cloud.drone.io/api/badges/xianyi/OpenBLAS/status.svg?branch=develop)](https://cloud.drone.io/xianyi/OpenBLAS/)
+Cirrus CI: [![Build Status](https://api.cirrus-ci.com/github/xianyi/OpenBLAS.svg?branch=develop)](https://cirrus-ci.com/github/xianyi/OpenBLAS)
+<!-- Drone CI: [![Build Status](https://cloud.drone.io/api/badges/xianyi/OpenBLAS/status.svg?branch=develop)](https://cloud.drone.io/xianyi/OpenBLAS/)-->
+
 
 [![Build Status](https://dev.azure.com/xianyi/OpenBLAS/_apis/build/status/xianyi.OpenBLAS?branchName=develop)](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop)
 
+OSUOSL POWERCI [![Build Status](https://powerci.osuosl.org/buildStatus/icon?job=OpenBLAS_gh%2Fdevelop)](http://powerci.osuosl.org/job/OpenBLAS_gh/job/develop/)
 
+OSUOSL IBMZ-CI [![Build Status](http://ibmz-ci.osuosl.org/buildStatus/icon?job=OpenBLAS-Z%2Fdevelop)](http://ibmz-ci.osuosl.org/job/OpenBLAS-Z/job/develop/)
 ## Introduction
 
 OpenBLAS is an optimized BLAS (Basic Linear Algebra Subprograms) library based on GotoBLAS2 1.13 BSD version.
@@ -50,10 +54,15 @@ Building OpenBLAS requires the following to be installed:
 
 Simply invoking `make` (or `gmake` on BSD) will detect the CPU automatically.
 To set a specific target CPU, use `make TARGET=xxx`, e.g. `make TARGET=NEHALEM`.
-The full target list is in the file `TargetList.txt`. For building with `cmake`, the
-usual conventions apply, i.e. create a build directory either underneath the toplevel
-OpenBLAS source directory or separate from it, and invoke `cmake` there with the path
-to the source tree and any build options you plan to set.
+The full target list is in the file `TargetList.txt`; other build options are documented in `Makefile.rule` and
+can either be set there (typically by removing the comment character from the respective line), or used on the
+`make` command line. 
+Note that when you run `make install` after building, you need to repeat all command line options you provided to `make`
+in the build step, as some settings like the supported maximum number of threads are automatically derived from the
+build host by default, which might not be what you want.
+For building with `cmake`, the usual conventions apply, i.e. create a build directory either underneath the toplevel
+OpenBLAS source directory or separate from it, and invoke `cmake` there with the path to the source tree and any 
+build options you plan to set.
 
 ### Cross compile
 
@@ -113,7 +122,7 @@ Use `PREFIX=` when invoking `make`, for example
 ```sh
 make install PREFIX=your_installation_directory
 ```
-
+(along with all options you added on the `make` command line in the preceding build step)
 The default installation directory is `/opt/OpenBLAS`.
 
 ## Supported CPUs and Operating Systems
@@ -133,7 +142,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
 - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
 - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
 - **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
-- **AMD ZEN**: Uses Haswell codes with some optimizations.
+- **AMD ZEN**: Uses Haswell codes with some optimizations for Zen 2/3 (use SkylakeX for Zen4)
 
 #### MIPS32
 
@@ -165,13 +174,16 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
 - **TSV110**: Optimized some Level-3 helper functions
 - **EMAG 8180**: preliminary support based on A57
 - **Neoverse N1**: (AWS Graviton2) preliminary support
-- **Apple Vortex**: preliminary support based on ARMV8
+- **Neoverse V1**: (AWS Graviton3) optimized Level-3 BLAS
+- **Apple Vortex**: preliminary support based on ThunderX2/3
+- **A64FX**:  preliminary support, optimized Level-3 BLAS
+- **ARMV8SVE**: any ARMV8 cpu with SVE extensions 
 
 #### PPC/PPC64
 
 - **POWER8**: Optimized BLAS, only for PPC64LE (Little Endian), only with `USE_OPENMP=1`
 - **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only. 
-- **POWER10**:
+- **POWER10**: Optimized Level-3 BLAS including SBGEMM and some Level-1,2.
 
 #### IBM zEnterprise System
 
@@ -184,20 +196,37 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
   ```sh
   make HOSTCC=gcc TARGET=C910V CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran
   ```
-  (also known to work on C906)
+  (also known to work on C906 as long as you use only single-precision functions - its instruction set support appears to be incomplete in double precision)
+
+- **x280**: Level-3 BLAS and Level-1,2 are optimized by RISC-V Vector extension 1.0.
+  ```sh
+  make HOSTCC=gcc TARGET=x280 NUM_THREADS=8 CC=riscv64-unknown-linux-gnu-clang FC=riscv64-unknown-linux-gnu-gfortran
+  ```
+
+- **ZVL???B**: Level-3 BLAS and Level-1,2 including vectorised kernels targeting generic RISCV cores with vector support with registers of at least the corresponding width; ZVL128B and ZVL256B are available.
+e.g.:
+  ```sh
+make TARGET=RISCV64_ZVL256B CFLAGS="-DTARGET=RISCV64_ZVL256B" \
+    BINARY=64 ARCH=riscv64 CC='clang -target riscv64-unknown-linux-gnu' \
+    AR=riscv64-unknown-linux-gnu-ar AS=riscv64-unknown-linux-gnu-gcc \
+    LD=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran \
+    HOSTCC=gcc HOSTFC=gfortran -j
+  ```
 
 ### Support for multiple targets in a single library
 
 OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake.
 
-For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX. For cpu generations not included in this list, the corresponding older model is used. If you also specify `DYNAMIC_OLDER=1`, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option `DYNAMIC_LIST` that allows to specify an individual list of targets to include instead of the default.
+For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX, Cooper Lake, Sapphire Rapids. For cpu generations not included in this list, the corresponding older model is used. If you also specify `DYNAMIC_OLDER=1`, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option `DYNAMIC_LIST` that allows to specify an individual list of targets to include instead of the default.
 
 `DYNAMIC_ARCH` is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias,
 Core2, Penryn, Dunnington, Nehalem, Athlon, Opteron, Opteron_SSE3, Barcelona, Bobcat, Atom and Nano.
 
-On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus.
+On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus. If compiler support for SVE is available at build time, support for NeoverseN2, NeoverseV1 as well as generic ArmV8SVE targets is also enabled.
+
+For **POWER**, the list encompasses POWER6, POWER8 and POWER9. POWER10 is additionally available if a sufficiently recent compiler is used for the build.
 
-For **POWER**, the list encompasses POWER6, POWER8 and POWER9, on **ZARCH** it comprises Z13 and Z14.
+On **ZARCH**, it comprises Z13 and Z14 as well as generic zarch support.
 
 The `TARGET` option can be used in conjunction with `DYNAMIC_ARCH=1` to specify which cpu model should be assumed for all the
 common code in the library, usually you will want to set this to the oldest model you expect to encounter.
diff --git a/TargetList.txt b/TargetList.txt
index deef75819..115030c1b 100644
--- a/TargetList.txt
+++ b/TargetList.txt
@@ -118,8 +118,11 @@ Z13
 Z14
 
 10.RISC-V 64:
-RISCV64_GENERIC
+RISCV64_GENERIC (e.g. PolarFire Soc/SiFive U54)
+RISCV64_ZVL128B
 C910V
+x280
+RISCV64_ZVL256B
 
 11.LOONGARCH64:
 LOONGSONGENERIC
@@ -133,3 +136,7 @@ E2K
 EV4
 EV5
 EV6
+
+14.CSKY
+CSKY
+CK860FV
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 16b9da4f5..3ae8615a7 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -115,7 +115,7 @@ jobs:
       mkdir build 
       cd build
       call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
-      cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_TESTING=OFF -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON ..
+      cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER="flang -I C:\Miniconda\Library\include\flang" -DBUILD_TESTING=OFF -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON ..
       cmake --build . --config Release
       ctest
 
@@ -167,11 +167,10 @@ jobs:
      
 - job: OSX_OpenMP_Clang
   pool:
-     vmImage: 'macOS-11'
+     vmImage: 'macOS-latest'
   variables:
      LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
      LIBRARY_PATH: /usr/local/opt/llvm/lib
-     MACOSX_DEPLOYMENT_TARGET: 11.0
   steps:   
   - script: |
       brew update
@@ -180,7 +179,7 @@ jobs:
 
 - job: OSX_OpenMP_Clang_cmake
   pool:
-     vmImage: 'macOS-11'
+     vmImage: 'macOS-latest'
   variables:
      LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
      LIBRARY_PATH: /usr/local/opt/llvm/lib
@@ -210,7 +209,7 @@ jobs:
 
 - job: OSX_Ifort_Clang
   pool:
-     vmImage: 'macOS-11'
+     vmImage: 'macOS-latest'
   variables:
      LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
      MACOS_HPCKIT_URL: https://registrationcenter-download.intel.com/akdlm/irc_nas/17643/m_HPCKit_p_2021.2.0.2903_offline.dmg
@@ -271,14 +270,27 @@ jobs:
   - script: |
      make TARGET=ARMV7 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
 
+- job: OSX_xbuild_DYNAMIC_ARM64
+  pool:
+     vmImage: 'macOS-11'
+  variables:
+     CC: /Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
+     CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX11.3.sdk -arch arm64
+  steps:
+  - script: |
+     ls /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs
+     /Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -arch arm64 --print-supported-cpus
+     /Applications/Xcode_11.7.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang --version
+     make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
+
 - job: ALPINE_MUSL
   pool:
      vmImage: 'ubuntu-latest'
   steps:
   - script: |
-        wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.2/alpine-chroot-install \
-          && echo '60c7e0b5d82e21d1a549fc9a46ba3b36688c09dc  alpine-chroot-install' | sha1sum -c \
-          || exit 1
+        wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.14.0/alpine-chroot-install \
+        && echo 'ccbf65f85cdc351851f8ad025bb3e65bae4d5b06  alpine-chroot-install' | sha1sum -c \
+        || exit 1
         alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
         sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo'
         alpine make DYNAMIC_ARCH=1 BINARY=64
diff --git a/benchmark/Makefile b/benchmark/Makefile
index d9ddb9042..b7493950a 100644
--- a/benchmark/Makefile
+++ b/benchmark/Makefile
@@ -37,6 +37,12 @@ ESSL=/opt/ibm/lib
 #LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
 LIBESSL = -lesslsmp  $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
 
+# x280 temporary workaround for gfortran
+ifeq ($(TARGET), x280)
+CCOMMON_OPT:=$(filter-out -mllvm --riscv-v-vector-bits-min=512,$(CCOMMON_OPT))
+endif
+
+
 ifneq ($(NO_LAPACK), 1)
 GOTO_LAPACK_TARGETS=slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
 		    scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \
@@ -3436,4 +3442,4 @@ smallscaling: smallscaling.c ../$(LIBNAME)
 clean ::
 	@rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling
 
-include $(TOPDIR)/Makefile.tail
+include $(TOPDIR)/Makefile.tail
\ No newline at end of file
diff --git a/benchmark/spr.c b/benchmark/spr.c
old mode 100755
new mode 100644
diff --git a/benchmark/spr2.c b/benchmark/spr2.c
old mode 100755
new mode 100644
diff --git a/benchmark/syrk.c b/benchmark/syrk.c
index fa0f24666..e0ae58707 100644
--- a/benchmark/syrk.c
+++ b/benchmark/syrk.c
@@ -1,5 +1,5 @@
 /***************************************************************************
-Copyright (c) 2014, The OpenBLAS Project
+Copyright (c) 2014, 2023 The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
@@ -67,7 +67,7 @@ int main(int argc, char *argv[]){
   int step =   1;
   int loops =  1;
 
-  if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p;
+  if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p);
 
   double time1,timeg;
 
@@ -77,7 +77,7 @@ int main(int argc, char *argv[]){
   if (argc > 0) { to       = MAX(atol(*argv), from);	argc--; argv++;}
   if (argc > 0) { step     = atol(*argv);		argc--; argv++;}
 
-  fprintf(stderr, "From : %3d  To : %3d Step = %3d Uplo = %c Trans = %c\n", from, to, step,uplo,trans);
+  fprintf(stderr, "From : %3d  To : %3d Step = %3d Uplo = %c Trans = %c Loops = %d\n", from, to, step,uplo,trans,loops);
 
 
   if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
diff --git a/benchmark/trsv.c b/benchmark/trsv.c
index 66ac3a3c7..e17c57157 100644
--- a/benchmark/trsv.c
+++ b/benchmark/trsv.c
@@ -127,7 +127,7 @@ int main(int argc, char *argv[]){
       long long muls = n*(n+1)/2.0;
       long long adds = (n - 1.0)*n/2.0;
 
-      fprintf(stderr, "%10d   %10.2f MFlops %10.6f sec\n", n,(muls+adds) / timeg * 1.e-6, timeg);
+      fprintf(stderr, "%10d :   %10.2f MFlops %10.6f sec\n", n,(muls+adds) / timeg * 1.e-6, timeg);
       if(a != NULL){
         free(a);
       }
diff --git a/c_check b/c_check
index e8f90e18a..59ab9bb13 100755
--- a/c_check
+++ b/c_check
@@ -31,13 +31,17 @@ flags="$*"
 
 cross_suffix=""
 
-if [ "`dirname \"$compiler_name\"`" != '.' ]; then
-    cross_suffix="$cross_suffix`dirname \"$compiler_name\"`/"
+if [ "`dirname "$compiler_name"`" != '.' ]; then
+    cross_suffix="$cross_suffix`dirname "$compiler_name"`/"
 fi
 
-bn=`basename $compiler_name`
+cn=`echo $compiler_name | sed -e 's/ -.*//'`
+bn=`basename "$cn"`
+
 case "$bn" in
-    *-*) cross_suffix="$cross_suffix${bn%-*}-"
+    *-*) if [ "$bn" != '-' ]; then
+           cross_suffix="$cross_suffix${bn%-*}-"
+         fi
 esac
 
 compiler=""
@@ -87,16 +91,25 @@ case "$data" in
     *ARCH_ZARCH*) architecture=zarch ;;
     *ARCH_RISCV64*) architecture=riscv64 ;;
     *ARCH_LOONGARCH64*) architecture=loongarch64 ;;
+    *ARCH_CSKY*) architecture=csky ;;
 esac
 
 defined=0
 
 if [ "$os" = "AIX" ]; then
-    case "$BINARY" in
-        32) compiler_name="$compiler_name -maix32" ;;
-        64) compiler_name="$compiler_name -maix64" ;;
-    esac
-    defined=1
+    # AIX spells the word-size switch differently per compiler family:
+    # GCC takes -maix32/-maix64, any other compiler gets -m32/-m64.
+    # The case selector is "<compiler>:<BINARY>", so the GCC arms must
+    # stay ahead of the catch-all "*" arms. A BINARY value other than
+    # 32 or 64 leaves compiler_name untouched.
+    case "$compiler:$BINARY" in
+        GCC:32) compiler_name="$compiler_name -maix32" ;;
+        GCC:64) compiler_name="$compiler_name -maix64" ;;
+        *:32)   compiler_name="$compiler_name -m32" ;;
+        *:64)   compiler_name="$compiler_name -m64" ;;
+    esac
+    # Set on AIX regardless of compiler or BINARY value.
+    defined=1
 fi
 
 case "$architecture" in
@@ -164,7 +177,7 @@ fi
 
 no_msa=0
 if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then
-    tmpd="$(mktemp -d)"
+    tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
     tmpf="$tmpd/a.c"
     code='"addvi.b $w0, $w1, 1"'
     msa_flags='-mmsa -mfp64 -mload-store-pairs'
@@ -181,6 +194,35 @@ if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then
     rm -rf "$tmpd"
 fi
 
+# Probe whether the compiler can build LoongArch LSX and LASX instructions.
+# Each probe writes a one-line C file containing a single inline-asm
+# instruction and builds it; a failed build sets the matching no_* flag.
+no_lsx=0
+no_lasx=0
+if [ "$architecture" = "loongarch64" ]; then
+    # If plain `mktemp -d` fails, retry with an explicit template prefix,
+    # the same fallback the other probes in this script use.
+    tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
+
+    # LSX probe.
+    tmplsx="$tmpd/lsx.c"
+    codelsx='"vadd.b $vr0, $vr0, $vr0"'
+    lsx_flags='-march=loongarch64'
+    printf "int main(void){ __asm__ volatile(%s);}\n" "$codelsx" >> "$tmplsx"
+    args="$lsx_flags -o $tmplsx.o $tmplsx"
+    $compiler_name $flags $args >/dev/null 2>&1 || no_lsx=1
+
+    # LASX probe.
+    tmplasx="$tmpd/lasx.c"
+    codelasx='"xvadd.b $xr0, $xr0, $xr0"'
+    lasx_flags='-march=loongarch64'
+    printf "int main(void){ __asm__ volatile(%s);}\n" "$codelasx" >> "$tmplasx"
+    args="$lasx_flags -o $tmplasx.o $tmplasx"
+    $compiler_name $flags $args >/dev/null 2>&1 || no_lasx=1
+
+    rm -rf "$tmpd"
+fi
+
 case "$data" in
     *ARCH_X86_64*) architecture=x86_64 ;;
     *ARCH_X86*) architecture=x86 ;;
@@ -195,6 +237,7 @@ case "$data" in
     *ARCH_ARM*) architecture=arm ;;
     *ARCH_ZARCH*) architecture=zarch ;;
     *ARCH_LOONGARCH64*) architecture=loongarch64 ;;
+    *ARCH_CSKY*) architecture=csky ;;
 esac
 
 binformat='bin32'
@@ -203,8 +246,9 @@ case "$data" in
 esac
 
 no_avx512=0
+no_avx512bf=0
 if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then
-    tmpd=`mktemp -d`
+    tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
     tmpf="$tmpd/a.c"
     code='"vbroadcastss -4 * 4(%rsi), %zmm2"'
     printf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(%s); }\n" "$code" >> "$tmpf"
@@ -221,11 +265,30 @@ if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then
     }
 
     rm -rf "$tmpd"
+    # AVX512-BF16 probe; runs only while no_avx512 is still 0.
+    # NOTE(review): $code carries its own double quotes, so the generated
+    # main() contains a C string literal rather than an intrinsic call. As
+    # written the probe only shows that <immintrin.h> compiles under the
+    # cooperlake target flag - confirm whether that is the intent.
+    if [ "$no_avx512" -eq 0 ]; then
+        tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
+        tmpf="$tmpd/a.c"
+        code='"__m512 a= _mm512_dpbf16_ps(a, (__m512bh) _mm512_loadu_si512(%1]), (__m512bh) _mm512_loadu_si512(%2]));"'
+        printf "#include <immintrin.h>\n\nint main(void){ %s; }\n" "$code" >> "$tmpf"
+        case "$compiler" in
+            PGI) args=" -tp cooperlake -c -o $tmpf.o $tmpf" ;;
+            *)   args=" -march=cooperlake -c -o $tmpf.o $tmpf" ;;
+        esac
+        no_avx512bf=0
+        $compiler_name $flags $args >/dev/null 2>&1 || no_avx512bf=1
+
+        rm -rf "$tmpd"
+    fi
 fi
 
 no_rv64gv=0
 if [ "$architecture" = "riscv64" ]; then
-    tmpd=`mktemp -d`
+    tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
     tmpf="$tmpd/a.c"
     code='"vsetvli    zero, zero, e8, m1\n"'
     printf "int main(void){ __asm__ volatile(%s); }\n" "$code" >> "$tmpf"
@@ -241,13 +304,16 @@ fi
 
 no_sve=0
 if [ "$architecture" = "arm64" ]; then
-    tmpd=`mktemp -d`
+    tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
     tmpf="$tmpd/a.c"
     printf "#include <arm_sve.h>\n\n int main(void){}\n">> "$tmpf"
     args=" -march=armv8-a+sve -c -o $tmpf.o $tmpf"
     no_sve=0
     {
         $compiler_name $flags $args >/dev/null 2>&1
+    } || {
+        args=" -Msve_intrinsics -c -o $tmpf.o $tmpf"
+        $compiler_name $flags $args >/dev/null 2>&1
     } || {
         no_sve=1
     }
@@ -257,7 +323,7 @@ fi
 c11_atomics=0
 case "$data" in
     *HAVE_C11*)
-        tmpd=`mktemp -d`
+        tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
         tmpf="$tmpd/a.c"
         printf "#include <stdatomic.h>\nint main(void){}\n" >> "$tmpf"
         args=" -c -o $tmpf.o $tmpf"
@@ -365,6 +431,7 @@ done
  [ "$makefile" = "-" ] && {
     [ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
     [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
+    [ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n"
     [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
     [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
     exit 0
@@ -393,8 +460,11 @@ done
     [ "$no_sve" -eq 1 ] && printf "NO_SVE=1\n"
     [ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
     [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
+    [ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n"
     [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
     [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
+    [ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n"
+    [ "$no_lasx" -eq 1 ] && printf "NO_LASX=1\n"
 } >> "$makefile"
 
 os=`echo "$os" | tr '[[:lower:]]' '[[:upper:]]'/ `
@@ -410,6 +480,8 @@ compiler=`echo "$compiler" | tr '[[:lower:]]' '[[:upper:]]' `
     [ -n "$need_fu" ] && printf "#define FUNDERSCORE\t%s\n" "$need_fu"
     [ "$no_msa" -eq 1 ] && printf "#define NO_MSA\t1\n"
     [ "$c11_atomics" -eq 1 ] && printf "#define HAVE_C11\t1\n"
+    [ "$no_lsx" -eq 1 ] && printf "#define NO_LSX\t1\n"
+    [ "$no_lasx" -eq 1 ] && printf "#define NO_LASX\t1\n"
 } >> "$config"
 
 
diff --git a/c_check.pl b/c_check.pl
index 6ce28e11b..6b89f06eb 100644
--- a/c_check.pl
+++ b/c_check.pl
@@ -97,6 +97,7 @@ $architecture = arm64        if ($data =~ /ARCH_ARM64/);
 $architecture = zarch        if ($data =~ /ARCH_ZARCH/);
 $architecture = riscv64      if ($data =~ /ARCH_RISCV64/);
 $architecture = loongarch64  if ($data =~ /ARCH_LOONGARCH64/);
+$architecture = csky         if ($data =~ /ARCH_CSKY/);
 
 $defined = 0;
 
@@ -156,6 +157,11 @@ if ($architecture eq "loongarch64") {
     $binary = 64;
 }
 
+if ($architecture eq "csky") {
+    $defined = 1;
+    $binary = 32;
+}
+
 if ($compiler eq "PGI") {
     $compiler_name .= " -tp p7"    if ($binary eq "32");
     $compiler_name .= " -tp p7-64" if ($binary eq "64");
@@ -232,6 +238,45 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) {
     }
 }
 
+# Probe LSX and LASX support on LoongArch. Each probe writes a one-line C
+# program holding a single vector instruction to a temporary file and
+# builds it; a failed build sets the matching $no_* flag to 1.
+$no_lsx = 0;
+$no_lasx = 0;
+if ($architecture eq "loongarch64") {
+    # Loaded via string eval so that a missing module only produces the
+    # warning below and leaves both flags at 0.
+    eval "use File::Temp qw(tempfile)";
+    if ($@){
+        warn "could not load PERL module File::Temp, so could not check LSX and LASX capability";
+    } else {
+        # LSX probe.
+        $tmplsx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
+        $codelsx = '"vadd.b $vr0, $vr0, $vr0"';
+        $lsx_flags = "-march=loongarch64";
+        print $tmplsx "int main(void){ __asm__ volatile($codelsx); }\n";
+
+        $args = "$lsx_flags -o $tmplsx.o $tmplsx";
+        # The command is one string with redirections, so system() hands
+        # it to the shell; a non-zero status means the build failed.
+        my $probe_cmd = "$compiler_name $flags $args >/dev/null 2>/dev/null";
+        $no_lsx = (system($probe_cmd) != 0) ? 1 : 0;
+        # The .c file is created with UNLINK => 1; the output is removed here.
+        unlink("$tmplsx.o");
+
+        # LASX probe: same procedure with an xvadd.b instruction.
+        $tmplasx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
+        $codelasx = '"xvadd.b $xr0, $xr0, $xr0"';
+        $lasx_flags = "-march=loongarch64";
+        print $tmplasx "int main(void){ __asm__ volatile($codelasx); }\n";
+
+        $args = "$lasx_flags -o $tmplasx.o $tmplasx";
+        $probe_cmd = "$compiler_name $flags $args >/dev/null 2>/dev/null";
+        $no_lasx = (system($probe_cmd) != 0) ? 1 : 0;
+        unlink("$tmplasx.o");
+    }
+}
+
 $architecture = x86          if ($data =~ /ARCH_X86/);
 $architecture = x86_64       if ($data =~ /ARCH_X86_64/);
 $architecture = e2k          if ($data =~ /ARCH_E2K/);
@@ -245,6 +290,7 @@ $architecture = arm          if ($data =~ /ARCH_ARM/);
 $architecture = arm64        if ($data =~ /ARCH_ARM64/);
 $architecture = zarch        if ($data =~ /ARCH_ZARCH/);
 $architecture = loongarch64  if ($data =~ /ARCH_LOONGARCH64/);
+$architecture = csky         if ($data =~ /ARCH_CSKY/);
 
 $binformat    = bin32;
 $binformat    = bin64  if ($data =~ /BINARY_64/);
@@ -424,6 +470,8 @@ print MAKEFILE "NO_RV64GV=1\n" if $no_rv64gv eq 1;
 print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1;
 print MAKEFILE "NO_AVX2=1\n" if $no_avx2 eq 1;
 print MAKEFILE "OLDGCC=1\n" if $oldgcc eq 1;
+print MAKEFILE "NO_LSX=1\n" if $no_lsx eq 1;
+print MAKEFILE "NO_LASX=1\n" if $no_lasx eq 1;
 
 $os           =~ tr/[a-z]/[A-Z]/;
 $architecture =~ tr/[a-z]/[A-Z]/;
@@ -437,6 +485,8 @@ print CONFFILE "#define __64BIT__\t1\n"  if $binformat eq bin64;
 print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne "";
 print CONFFILE "#define HAVE_MSA\t1\n"  if $have_msa eq 1;
 print CONFFILE "#define HAVE_C11\t1\n" if $c11_atomics eq 1;
+print CONFFILE "#define NO_LSX\t1\n" if $no_lsx eq 1;
+print CONFFILE "#define NO_LASX\t1\n" if $no_lasx eq 1;
 
 
 if ($os eq "LINUX") {
diff --git a/cblas.h b/cblas.h
index c2bdd27fa..beaa32cc2 100644
--- a/cblas.h
+++ b/cblas.h
@@ -12,6 +12,7 @@ extern "C" {
 /*Set the number of threads on runtime.*/
 void openblas_set_num_threads(int num_threads);
 void goto_set_num_threads(int num_threads);
+int openblas_set_num_threads_local(int num_threads);
 
 /*Get the number of threads on runtime.*/
 int openblas_get_num_threads(void);
@@ -100,6 +101,16 @@ CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE
 CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
 
+float cblas_samax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx);
+double cblas_damax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
+float cblas_scamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx);
+double cblas_dzamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
+
+float cblas_samin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx);
+double cblas_damin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
+float cblas_scamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx);
+double cblas_dzamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
+
 CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx);
@@ -115,6 +126,9 @@ void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS
 void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
 void cblas_zaxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
 
+void cblas_caxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
+void cblas_zaxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
+
 void cblas_scopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
 void cblas_dcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
 void cblas_ccopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
@@ -289,6 +303,14 @@ void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLA
 void cblas_zgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
 		 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
 
+void cblas_sgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
+		 OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
+void cblas_dgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
+		 OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc);
+void cblas_cgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
+		 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
+void cblas_zgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
+		 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
 
 void cblas_ssymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
                  OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
@@ -350,7 +372,7 @@ void cblas_cher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBL
 void cblas_zher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
                   OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, void *C, OPENBLAS_CONST blasint ldc);
 
-void cblas_xerbla(blasint p, char *rout, char *form, ...);
+void cblas_xerbla(blasint p, OPENBLAS_CONST char *rout, OPENBLAS_CONST char *form, ...);
 
 /*** BLAS extensions ***/
 
diff --git a/cmake/arch.cmake b/cmake/arch.cmake
index 8521f3988..eb974456b 100644
--- a/cmake/arch.cmake
+++ b/cmake/arch.cmake
@@ -44,9 +44,9 @@ endif ()
 
 if (DYNAMIC_ARCH)
   if (ARM64)
-	  set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
+	  set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
     if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99)
-          set(DYNAMIC_CORE "${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2")
+          set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE)
     endif ()
     if (DYNAMIC_LIST)
 	  set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})
@@ -82,7 +82,7 @@ if (DYNAMIC_ARCH)
       set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN)
     endif ()
     if (NOT NO_AVX512)
-      set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX COOPERLAKE)
+      set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX COOPERLAKE SAPPHIRERAPIDS)
       string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
     endif ()
     if (DYNAMIC_LIST)
@@ -135,7 +135,7 @@ if (ARM64)
   set(BINARY_DEFINED 1)
 endif ()
 
-if (${ARCH} STREQUAL "riscv64")
+if (RISCV64)
   set(NO_BINARY_MODE 1)
   set(BINARY_DEFINED 1)
 endif ()
diff --git a/cmake/cc.cmake b/cmake/cc.cmake
index 83b8d15ab..2da941afb 100644
--- a/cmake/cc.cmake
+++ b/cmake/cc.cmake
@@ -36,9 +36,19 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS
 
     if (LOONGARCH64)
       if (BINARY64)
-        set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64")
+	CHECK_CXX_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI)
+        if(COMPILER_SUPPORT_LP64D_ABI)
+	  set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64d")
+	else()
+	  set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64")
+	endif ()
       else ()
-        set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32")
+	CHECK_CXX_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI)
+	if(COMPILER_SUPPORT_ILP32D_ABI)
+	  set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=ilp32d")
+	else()
+	  set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32")
+	endif ()
       endif ()
       set(BINARY_DEFINED 1)
     endif ()
@@ -65,6 +75,14 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
   endif ()
 endif ()
 
+if (${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC")
+  if (POWER)
+    set(CCOMMON_OPT "${CCOMMON_OPT} -tp pwr8")
+  else ()
+    set(CCOMMON_OPT "${CCOMMON_OPT} -tp px")
+  endif ()
+endif ()
+
 if (${CMAKE_C_COMPILER_ID} STREQUAL "PATHSCALE")
   if (BINARY64)
     set(CCOMMON_OPT "${CCOMMON_OPT} -m64")
@@ -172,22 +190,30 @@ endif ()
 
 if (${CORE} STREQUAL NEOVERSEN2)
   if (NOT DYNAMIC_ARCH)
-    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
-    if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
-      set (CCOMMON_OPT  "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
+    if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
+	set (CCOMMON_OPT  "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
     else ()
-      set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
-    endif()
+      execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
+      if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
+	set (CCOMMON_OPT  "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
+      else ()
+	set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
+      endif()
+    endif ()
   endif ()
 endif ()
 
 if (${CORE} STREQUAL NEOVERSEV1)
   if (NOT DYNAMIC_ARCH)
-    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
-    if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
-      set (CCOMMON_OPT  "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1")
+    if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
+	set (CCOMMON_OPT  "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1")
     else ()
-      set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
+      execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
+      if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
+        set (CCOMMON_OPT  "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1")
+      else ()
+        set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
+      endif()
     endif()
   endif ()
 endif ()
@@ -205,7 +231,11 @@ endif ()
 
 if (${CORE} STREQUAL ARMV8SVE)
   if (NOT DYNAMIC_ARCH)
-    set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
+    if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
+	set (CCOMMON_OPT  "${CCOMMON_OPT} -Msve_intrinsics -march=armv8-a+sve")
+    else ()
+      set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
+    endif ()
   endif ()
 endif ()
 
@@ -262,6 +292,27 @@ if (${CORE} STREQUAL POWER8)
   endif ()
 endif ()
 
+# With -mcpu=970 added it compiles, but library is broken, at least on macOS. If someone
+# tests on *BSD or Linux and adds this flag, please make sure it is not used for macOS case.
+if (${CORE} STREQUAL PPC970)
+  if (NOT DYNAMIC_ARCH)
+    set (CCOMMON_OPT  "${CCOMMON_OPT} -mtune=970 -maltivec -fno-fast-math")
+  endif ()
+  if (APPLE)
+    set (CCOMMON_OPT  "${CCOMMON_OPT} -force_cpusubtype_ALL")
+  endif ()
+endif ()
+
+# -mcpu=G4 seems to work fine, but perhaps avoid it for the sake of consistency?
+if (${CORE} STREQUAL PPCG4)
+  if (NOT DYNAMIC_ARCH)
+    set (CCOMMON_OPT  "${CCOMMON_OPT} -mtune=G4 -maltivec -fno-fast-math")
+  endif ()
+  if (APPLE)
+    set (CCOMMON_OPT  "${CCOMMON_OPT} -force_cpusubtype_ALL")
+  endif ()
+endif ()
+
 if (NOT DYNAMIC_ARCH)
 	if (HAVE_AVX2)
         set (CCOMMON_OPT  "${CCOMMON_OPT} -mavx2")
diff --git a/cmake/f_check.cmake b/cmake/f_check.cmake
index df3a4858d..4c4f5ac04 100644
--- a/cmake/f_check.cmake
+++ b/cmake/f_check.cmake
@@ -64,6 +64,7 @@ else ()
     "#define NEEDBUNDERSCORE 1\n")
 endif()
 
+if (CMAKE_Fortran_COMPILER)
 get_filename_component(F_COMPILER ${CMAKE_Fortran_COMPILER} NAME_WE)
 string(TOUPPER ${F_COMPILER} F_COMPILER)
-
+endif()
diff --git a/cmake/fc.cmake b/cmake/fc.cmake
index e615e148e..bc85a2921 100644
--- a/cmake/fc.cmake
+++ b/cmake/fc.cmake
@@ -3,11 +3,9 @@
 ## Description: Ported from portion of OpenBLAS/Makefile.system
 ##              Sets Fortran related variables.
 
-if (${F_COMPILER} STREQUAL "FLANG")
+if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
+  # This is for classic Flang. LLVM Flang is handled with gfortran below.
   set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
-  if (BINARY64 AND INTERFACE64)
-    set(FCOMMON_OPT "${FCOMMON_OPT} -i8")
-  endif ()
   if (USE_OPENMP)
     set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp")
   endif ()
@@ -38,31 +36,56 @@ if (${F_COMPILER} STREQUAL "G95")
   endif ()
 endif ()
 
-if (${F_COMPILER} STREQUAL "GFORTRAN")
+if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
   set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT")
-  # ensure reentrancy of lapack codes
-  set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive")
-  # work around ABI violation in passing string arguments from C
-  set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls")
-  #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
-  if (NOT NO_LAPACK)
-    set(EXTRALIB "${EXTRALIB} -lgfortran")
+  if (NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
+    # ensure reentrancy of lapack codes
+    set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive")
+    # work around ABI violation in passing string arguments from C
+    set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls")
+    if (NOT NO_LAPACK)
+      # Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
+      set(EXTRALIB "${EXTRALIB} -lgfortran")
+    endif ()
   endif ()
   if (NO_BINARY_MODE)
     if (MIPS64)
       if (BINARY64)
         set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=64")
+        if (INTERFACE64)
+          set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
+	endif ()
       else ()
         set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32")
       endif ()
     endif ()
     if (LOONGARCH64)
       if (BINARY64)
-        set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64")
+	CHECK_CXX_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI)
+        if(COMPILER_SUPPORT_LP64D_ABI)
+	  set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64d")
+	else()
+	  set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64")
+	endif ()
       else ()
-        set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32")
+	CHECK_CXX_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI)
+	if(COMPILER_SUPPORT_ILP32D_ABI)
+	  set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=ilp32d")
+	else()
+	  set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32")
+	endif ()
+      endif ()
+    endif ()
+    if (RISCV64)
+      if (BINARY64)
+        if (INTERFACE64)
+          set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
+        endif ()
       endif ()
     endif ()
+    if (ARM64 AND INTERFACE64)
+          set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
+    endif ()
   else ()
     if (BINARY64)
       set(FCOMMON_OPT "${FCOMMON_OPT} -m64")
@@ -121,7 +144,7 @@ if (${F_COMPILER} STREQUAL "IBM")
   endif ()
 endif ()
 
-if (${F_COMPILER} STREQUAL "PGI")
+if (${F_COMPILER} STREQUAL "PGI" OR ${F_COMPILER} STREQUAL "PGF95")
   set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_PGI")
   set(COMMON_PROF "${COMMON_PROF} -DPGICOMPILER")
   if (BINARY64)
diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake
index 45dda8686..003a8b3c1 100644
--- a/cmake/lapack.cmake
+++ b/cmake/lapack.cmake
@@ -52,7 +52,7 @@ set(SLASRC
    sgebrd.f sgecon.f sgeequ.f sgees.f  sgeesx.f sgeev.f  sgeevx.f
    sgehd2.f sgehrd.f sgelq2.f sgelqf.f
    sgels.f  sgelsd.f sgelss.f sgelsy.f sgeql2.f sgeqlf.f
-   sgeqp3.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f sgerq2.f sgerqf.f
+   sgeqp3.f sgeqp3rk.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f sgerq2.f sgerqf.f
    sgesc2.f sgesdd.f sgesvd.f sgesvdx.f sgesvx.f sgetc2.f
    sgetrf2.f sgetri.f
    sggbak.f sggbal.f
@@ -67,7 +67,7 @@ set(SLASRC
    slangb.f slange.f slangt.f slanhs.f slansb.f slansp.f
    slansy.f slantb.f slantp.f slantr.f slanv2.f
    slapll.f slapmt.f
-   slaqgb.f slaqge.f slaqp2.f slaqps.f slaqsb.f slaqsp.f slaqsy.f
+   slaqgb.f slaqge.f slaqp2.f slaqps.f slaqp2rk.f slaqp3rk.f slaqsb.f slaqsp.f slaqsy.f
    slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f
    slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f
    slarf.f  slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f
@@ -124,7 +124,7 @@ set(SLASRC
    ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f
    sgesvdq.f slaorhr_col_getrfnp.f
    slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f 
-   slatrs3.f strsyl3.f sgelst.f)
+   slatrs3.f strsyl3.f sgelst.f sgedmd.f90 sgedmdq.f90)
 
 set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f
    sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f
@@ -139,7 +139,7 @@ set(CLASRC
    cgbtf2.f cgbtrf.f cgbtrs.f cgebak.f cgebal.f cgebd2.f cgebrd.f
    cgecon.f cgeequ.f cgees.f  cgeesx.f cgeev.f  cgeevx.f
    cgehd2.f cgehrd.f cgelq2.f cgelqf.f
-   cgels.f  cgelsd.f cgelss.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f
+   cgels.f  cgelsd.f cgelss.f cgelsy.f cgeql2.f cgeqlf.f cgeqp3.f cgeqp3rk.f
    cgeqr2.f cgeqr2p.f cgeqrf.f cgeqrfp.f cgerfs.f cgerq2.f cgerqf.f
    cgesc2.f cgesdd.f cgesvd.f cgesvdx.f
    cgesvj.f cgejsv.f cgsvj0.f cgsvj1.f
@@ -173,7 +173,7 @@ set(CLASRC
    clanhb.f clanhe.f
    clanhp.f clanhs.f clanht.f clansb.f clansp.f clansy.f clantb.f
    clantp.f clantr.f clapll.f clapmt.f clarcm.f claqgb.f claqge.f
-   claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqsb.f
+   claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqp2rk.f claqp3rk.f claqsb.f
    claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f
    claqz0.f claqz1.f claqz2.f claqz3.f
    claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f
@@ -187,7 +187,7 @@ set(CLASRC
    cposv.f  cposvx.f cpotrf2.f cpotri.f cpstrf.f cpstf2.f
    cppcon.f cppequ.f cpprfs.f cppsv.f  cppsvx.f cpptrf.f cpptri.f cpptrs.f
    cptcon.f cpteqr.f cptrfs.f cptsv.f  cptsvx.f cpttrf.f cpttrs.f cptts2.f
-   crot.f   cspcon.f csprfs.f cspsv.f
+   crot.f crscl.f cspcon.f csprfs.f cspsv.f
    cspsvx.f csptrf.f csptri.f csptrs.f csrscl.f cstedc.f
    cstegr.f cstein.f csteqr.f csycon.f
    csyrfs.f csysv.f  csysvx.f csytf2.f csytrf.f csytri.f
@@ -223,7 +223,7 @@ set(CLASRC
    chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f
    cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f 
    cungtsqr.f cungtsqr_row.f cunhr_col.f 
-   clatrs3.f ctrsyl3.f cgelst.f)
+   clatrs3.f ctrsyl3.f cgelst.f cgedmd.f90 cgedmdq.f90)
 
 set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f
    cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f
@@ -243,7 +243,7 @@ set(DLASRC
    dgebrd.f dgecon.f dgeequ.f dgees.f  dgeesx.f dgeev.f  dgeevx.f
    dgehd2.f dgehrd.f dgelq2.f dgelqf.f
    dgels.f  dgelsd.f dgelss.f dgelsy.f dgeql2.f dgeqlf.f
-   dgeqp3.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f dgerq2.f dgerqf.f
+   dgeqp3.f dgeqp3rk.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f dgerq2.f dgerqf.f
    dgesc2.f dgesdd.f dgesvd.f dgesvdx.f dgesvx.f dgetc2.f
    dgetrf2.f dgetri.f
    dggbak.f dggbal.f
@@ -258,7 +258,7 @@ set(DLASRC
    dlangb.f dlange.f dlangt.f dlanhs.f dlansb.f dlansp.f
    dlansy.f dlantb.f dlantp.f dlantr.f dlanv2.f
    dlapll.f dlapmt.f
-   dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f
+   dlaqgb.f dlaqge.f dlaqp2.f dlaqp2rk.f dlaqp3rk.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f
    dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f
    dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f
    dlarf.f  dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f
@@ -316,7 +316,7 @@ set(DLASRC
    dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f
    dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f
    dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f 
-   dlatrs3.f dtrsyl3.f dgelst.f)
+   dlatrs3.f dtrsyl3.f dgelst.f dgedmd.f90 dgedmdq.f90)
 
 set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f
    dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f
@@ -331,7 +331,7 @@ set(ZLASRC
    zgbtf2.f zgbtrf.f zgbtrs.f zgebak.f zgebal.f zgebd2.f zgebrd.f
    zgecon.f zgeequ.f zgees.f  zgeesx.f zgeev.f  zgeevx.f
    zgehd2.f zgehrd.f zgelq2.f zgelqf.f
-   zgels.f  zgelsd.f zgelss.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f
+   zgels.f  zgelsd.f zgelss.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f zgeqp3rk.f
    zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f
    zgesc2.f zgesdd.f zgesvd.f zgesvdx.f zgesvx.f
    zgesvj.f zgejsv.f zgsvj0.f zgsvj1.f
@@ -367,7 +367,7 @@ set(ZLASRC
    zlanhe.f
    zlanhp.f zlanhs.f zlanht.f zlansb.f zlansp.f zlansy.f zlantb.f
    zlantp.f zlantr.f zlapll.f zlapmt.f zlaqgb.f zlaqge.f
-   zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqsb.f
+   zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqp2rk.f zlaqp3rk.f zlaqps.f zlaqsb.f
    zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f
    zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f
    zlarcm.f zlarf.f  zlarfb.f zlarfb_gett.f
@@ -381,7 +381,7 @@ set(ZLASRC
    zposv.f  zposvx.f zpotrf2.f zpotri.f zpotrs.f zpstrf.f zpstf2.f
    zppcon.f zppequ.f zpprfs.f zppsv.f  zppsvx.f zpptrf.f zpptri.f zpptrs.f
    zptcon.f zpteqr.f zptrfs.f zptsv.f  zptsvx.f zpttrf.f zpttrs.f zptts2.f
-   zrot.f   zspcon.f zsprfs.f zspsv.f
+   zrot.f zrscl.f zspcon.f zsprfs.f zspsv.f
    zspsvx.f zsptrf.f zsptri.f zsptrs.f zdrscl.f zstedc.f
    zstegr.f zstein.f zsteqr.f zsycon.f
    zsyrfs.f zsysv.f  zsysvx.f zsytf2.f zsytrf.f zsytri.f
@@ -419,7 +419,7 @@ set(ZLASRC
    zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f
    zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f
    zungtsqr.f zungtsqr_row.f zunhr_col.f
-   zlatrs3.f ztrsyl3.f zgelst.f)
+   zlatrs3.f ztrsyl3.f zgelst.f zgedmd.f90 zgedmdq.f90)
 
 set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f
    zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f
@@ -436,19 +436,25 @@ if(USE_XBLAS)
   set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC})
 endif()
 
+if(BUILD_LAPACK_DEPRECATED)
 list(APPEND SLASRC DEPRECATED/sgegs.f DEPRECATED/sgegv.f
+  DEPRECATED/sgelqs.f DEPRECATED/sgeqrs.f
   DEPRECATED/sgeqpf.f DEPRECATED/sgelsx.f DEPRECATED/sggsvd.f
   DEPRECATED/sggsvp.f DEPRECATED/slahrd.f DEPRECATED/slatzm.f DEPRECATED/stzrqf.f)
 list(APPEND DLASRC DEPRECATED/dgegs.f DEPRECATED/dgegv.f
+  DEPRECATED/dgelqs.f DEPRECATED/dgeqrs.f
   DEPRECATED/dgeqpf.f DEPRECATED/dgelsx.f DEPRECATED/dggsvd.f
   DEPRECATED/dggsvp.f DEPRECATED/dlahrd.f DEPRECATED/dlatzm.f DEPRECATED/dtzrqf.f)
 list(APPEND CLASRC DEPRECATED/cgegs.f DEPRECATED/cgegv.f
+  DEPRECATED/cgelqs.f DEPRECATED/cgeqrs.f
   DEPRECATED/cgeqpf.f DEPRECATED/cgelsx.f DEPRECATED/cggsvd.f
   DEPRECATED/cggsvp.f DEPRECATED/clahrd.f DEPRECATED/clatzm.f DEPRECATED/ctzrqf.f)
 list(APPEND ZLASRC DEPRECATED/zgegs.f DEPRECATED/zgegv.f
+  DEPRECATED/zgelqs.f DEPRECATED/zgeqrs.f
   DEPRECATED/zgeqpf.f DEPRECATED/zgelsx.f DEPRECATED/zggsvd.f
   DEPRECATED/zggsvp.f DEPRECATED/zlahrd.f DEPRECATED/zlatzm.f DEPRECATED/ztzrqf.f)
 message(STATUS "Building deprecated routines")
+endif()
 
 set(DSLASRC spotrs.f)
 
@@ -551,7 +557,7 @@ set(SLASRC
    sgebrd.c sgecon.c sgeequ.c sgees.c  sgeesx.c sgeev.c  sgeevx.c
    sgehd2.c sgehrd.c sgelq2.c sgelqf.c
    sgels.c  sgelsd.c sgelss.c sgelsy.c sgeql2.c sgeqlf.c
-   sgeqp3.c sgeqr2.c sgeqr2p.c sgeqrf.c sgeqrfp.c sgerfs.c sgerq2.c sgerqf.c
+   sgeqp3.c sgeqp3rk.c sgeqr2.c sgeqr2p.c sgeqrf.c sgeqrfp.c sgerfs.c sgerq2.c sgerqf.c
    sgesc2.c sgesdd.c sgesvd.c sgesvdx.c sgesvx.c sgetc2.c
    sgetrf2.c sgetri.c
    sggbak.c sggbal.c
@@ -565,7 +571,7 @@ set(SLASRC
    slangb.c slange.c slangt.c slanhs.c slansb.c slansp.c
    slansy.c slantb.c slantp.c slantr.c slanv2.c
    slapll.c slapmt.c
-   slaqgb.c slaqge.c slaqp2.c slaqps.c slaqsb.c slaqsp.c slaqsy.c
+   slaqgb.c slaqge.c slaqp2.c slaqp2rk.c slaqp3rk.c slaqps.c slaqsb.c slaqsp.c slaqsy.c
    slaqr0.c slaqr1.c slaqr2.c slaqr3.c slaqr4.c slaqr5.c
    slaqtr.c slar1v.c slar2v.c ilaslr.c ilaslc.c
    slarf.c  slarfb.c slarfb_gett.c slarfg.c slarfgp.c slarft.c slarfx.c slarfy.c slargv.c
@@ -622,7 +628,7 @@ set(SLASRC
    ssbev_2stage.c ssbevx_2stage.c ssbevd_2stage.c ssygv_2stage.c
    sgesvdq.c slaorhr_col_getrfnp.c
    slaorhr_col_getrfnp2.c sorgtsqr.c sorgtsqr_row.c sorhr_col.c 
-   slatrs3.c strsyl3.c sgelst.c)
+   slatrs3.c strsyl3.c sgelst.c sgedmd.c sgedmdq.c)
 
 set(SXLASRC sgesvxx.c sgerfsx.c sla_gerfsx_extended.c sla_geamv.c
    sla_gercond.c sla_gerpvgrw.c ssysvxx.c ssyrfsx.c
@@ -637,7 +643,7 @@ set(CLASRC
    cgbtf2.c cgbtrf.c cgbtrs.c cgebak.c cgebal.c cgebd2.c cgebrd.c
    cgecon.c cgeequ.c cgees.c  cgeesx.c cgeev.c  cgeevx.c
    cgehd2.c cgehrd.c cgelq2.c cgelqf.c
-   cgels.c  cgelsd.c cgelss.c cgelsy.c cgeql2.c cgeqlf.c cgeqp3.c
+   cgels.c  cgelsd.c cgelss.c cgelsy.c cgeql2.c cgeqlf.c cgeqp3.c cgeqp3rk.c
    cgeqr2.c cgeqr2p.c cgeqrf.c cgeqrfp.c cgerfs.c cgerq2.c cgerqf.c
    cgesc2.c cgesdd.c cgesvd.c cgesvdx.c
    cgesvj.c cgejsv.c cgsvj0.c cgsvj1.c
@@ -671,7 +677,7 @@ set(CLASRC
    clanhb.c clanhe.c
    clanhp.c clanhs.c clanht.c clansb.c clansp.c clansy.c clantb.c
    clantp.c clantr.c clapll.c clapmt.c clarcm.c claqgb.c claqge.c
-   claqhb.c claqhe.c claqhp.c claqp2.c claqps.c claqsb.c
+   claqhb.c claqhe.c claqhp.c claqp2.c claqp2rk.c claqp3rk.c claqps.c claqsb.c
    claqr0.c claqr1.c claqr2.c claqr3.c claqr4.c claqr5.c
    claqsp.c claqsy.c clar1v.c clar2v.c ilaclr.c ilaclc.c
    clarf.c  clarfb.c clarfb_gett.c clarfg.c clarfgp.c clarft.c
@@ -684,7 +690,7 @@ set(CLASRC
    cposv.c  cposvx.c cpotrf2.c cpotri.c cpstrf.c cpstf2.c
    cppcon.c cppequ.c cpprfs.c cppsv.c  cppsvx.c cpptrf.c cpptri.c cpptrs.c
    cptcon.c cpteqr.c cptrfs.c cptsv.c  cptsvx.c cpttrf.c cpttrs.c cptts2.c
-   crot.c   cspcon.c csprfs.c cspsv.c
+   crot.c crscl.c  cspcon.c csprfs.c cspsv.c
    cspsvx.c csptrf.c csptri.c csptrs.c csrscl.c cstedc.c
    cstegr.c cstein.c csteqr.c csycon.c
    csyrfs.c csysv.c  csysvx.c csytf2.c csytrf.c csytri.c
@@ -720,7 +726,7 @@ set(CLASRC
    chbev_2stage.c chbevx_2stage.c chbevd_2stage.c chegv_2stage.c
    cgesvdq.c claunhr_col_getrfnp.c claunhr_col_getrfnp2.c 
    cungtsqr.c cungtsqr_row.c cunhr_col.c 
-   clatrs3.c ctrsyl3.c cgelst.c)
+   clatrs3.c ctrsyl3.c cgelst.c cgedmd.c cgedmdq.c)
 
 set(CXLASRC cgesvxx.c cgerfsx.c cla_gerfsx_extended.c cla_geamv.c
    cla_gercond_c.c cla_gercond_x.c cla_gerpvgrw.c
@@ -740,7 +746,7 @@ set(DLASRC
    dgebrd.c dgecon.c dgeequ.c dgees.c  dgeesx.c dgeev.c  dgeevx.c
    dgehd2.c dgehrd.c dgelq2.c dgelqf.c
    dgels.c  dgelsd.c dgelss.c dgelsy.c dgeql2.c dgeqlf.c
-   dgeqp3.c dgeqr2.c dgeqr2p.c dgeqrf.c dgeqrfp.c dgerfs.c dgerq2.c dgerqf.c
+   dgeqp3.c dgeqp3rk.c dgeqr2.c dgeqr2p.c dgeqrf.c dgeqrfp.c dgerfs.c dgerq2.c dgerqf.c
    dgesc2.c dgesdd.c dgesvd.c dgesvdx.c dgesvx.c dgetc2.c
    dgetrf2.c dgetri.c
    dggbak.c dggbal.c
@@ -754,7 +760,7 @@ set(DLASRC
    dlangb.c dlange.c dlangt.c dlanhs.c dlansb.c dlansp.c
    dlansy.c dlantb.c dlantp.c dlantr.c dlanv2.c
    dlapll.c dlapmt.c
-   dlaqgb.c dlaqge.c dlaqp2.c dlaqps.c dlaqsb.c dlaqsp.c dlaqsy.c
+   dlaqgb.c dlaqge.c dlaqp2.c dlaqp2rk.c dlaqp3rk.c dlaqps.c dlaqsb.c dlaqsp.c dlaqsy.c
    dlaqr0.c dlaqr1.c dlaqr2.c dlaqr3.c dlaqr4.c dlaqr5.c
    dlaqtr.c dlar1v.c dlar2v.c iladlr.c iladlc.c
    dlarf.c  dlarfb.c dlarfb_gett.c dlarfg.c dlarfgp.c dlarft.c dlarfx.c dlarfy.c
@@ -812,7 +818,7 @@ set(DLASRC
    dsbev_2stage.c dsbevx_2stage.c dsbevd_2stage.c dsygv_2stage.c
    dcombssq.c dgesvdq.c dlaorhr_col_getrfnp.c
    dlaorhr_col_getrfnp2.c dorgtsqr.c dorgtsqr_row.c dorhr_col.c 
-   dlatrs3.c dtrsyl3.c dgelst.c)
+   dlatrs3.c dtrsyl3.c dgelst.c dgedmd.c dgedmdq.c)
 
 set(DXLASRC dgesvxx.c dgerfsx.c dla_gerfsx_extended.c dla_geamv.c
    dla_gercond.c dla_gerpvgrw.c dsysvxx.c dsyrfsx.c
@@ -827,7 +833,7 @@ set(ZLASRC
    zgbtf2.c zgbtrf.c zgbtrs.c zgebak.c zgebal.c zgebd2.c zgebrd.c
    zgecon.c zgeequ.c zgees.c  zgeesx.c zgeev.c  zgeevx.c
    zgehd2.c zgehrd.c zgelq2.c zgelqf.c
-   zgels.c  zgelsd.c zgelss.c zgelsy.c zgeql2.c zgeqlf.c zgeqp3.c
+   zgels.c  zgelsd.c zgelss.c zgelsy.c zgeql2.c zgeqlf.c zgeqp3.c zgeqp3rk.c
    zgeqr2.c zgeqr2p.c zgeqrf.c zgeqrfp.c zgerfs.c zgerq2.c zgerqf.c
    zgesc2.c zgesdd.c zgesvd.c zgesvdx.c zgesvx.c
    zgesvj.c zgejsv.c zgsvj0.c zgsvj1.c
@@ -862,7 +868,7 @@ set(ZLASRC
    zlanhe.c
    zlanhp.c zlanhs.c zlanht.c zlansb.c zlansp.c zlansy.c zlantb.c
    zlantp.c zlantr.c zlapll.c zlapmt.c zlaqgb.c zlaqge.c
-   zlaqhb.c zlaqhe.c zlaqhp.c zlaqp2.c zlaqps.c zlaqsb.c
+   zlaqhb.c zlaqhe.c zlaqhp.c zlaqp2.c zlaqp2rk.c zlaqp3rk.c zlaqps.c zlaqsb.c
    zlaqr0.c zlaqr1.c zlaqr2.c zlaqr3.c zlaqr4.c zlaqr5.c
    zlaqsp.c zlaqsy.c zlar1v.c zlar2v.c ilazlr.c ilazlc.c
    zlarcm.c zlarf.c  zlarfb.c zlarfb_gett.c
@@ -876,7 +882,7 @@ set(ZLASRC
    zposv.c  zposvx.c zpotrf2.c zpotri.c zpotrs.c zpstrf.c zpstf2.c
    zppcon.c zppequ.c zpprfs.c zppsv.c  zppsvx.c zpptrf.c zpptri.c zpptrs.c
    zptcon.c zpteqr.c zptrfs.c zptsv.c  zptsvx.c zpttrf.c zpttrs.c zptts2.c
-   zrot.c   zspcon.c zsprfs.c zspsv.c
+   zrot.c zrscl.c  zspcon.c zsprfs.c zspsv.c
    zspsvx.c zsptrf.c zsptri.c zsptrs.c zdrscl.c zstedc.c
    zstegr.c zstein.c zsteqr.c zsycon.c
    zsyrfs.c zsysv.c  zsysvx.c zsytf2.c zsytrf.c zsytri.c
@@ -913,7 +919,8 @@ set(ZLASRC
    zheevd_2stage.c zheev_2stage.c zheevx_2stage.c zheevr_2stage.c
    zhbev_2stage.c zhbevx_2stage.c zhbevd_2stage.c zhegv_2stage.c
    zgesvdq.c zlaunhr_col_getrfnp.c zlaunhr_col_getrfnp2.c
-   zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c zgelst.c)
+   zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c zgelst.c
+   zgedmd.c zgedmdq.c)
 
 set(ZXLASRC zgesvxx.c zgerfsx.c zla_gerfsx_extended.c zla_geamv.c
    zla_gercond_c.c zla_gercond_x.c zla_gerpvgrw.c zsysvxx.c zsyrfsx.c
@@ -930,19 +937,25 @@ if(USE_XBLAS)
   set(ALLXOBJ ${SXLASRC} ${DXLASRC} ${CXLASRC} ${ZXLASRC})
 endif()
 
+if(BUILD_LAPACK_DEPRECATED)
 list(APPEND SLASRC DEPRECATED/sgegs.c DEPRECATED/sgegv.c
+  DEPRECATED/sgelqs.c DEPRECATED/sgeqrs.c
   DEPRECATED/sgeqpf.c DEPRECATED/sgelsx.c DEPRECATED/sggsvd.c
   DEPRECATED/sggsvp.c DEPRECATED/slahrd.c DEPRECATED/slatzm.c DEPRECATED/stzrqf.c)
 list(APPEND DLASRC DEPRECATED/dgegs.c DEPRECATED/dgegv.c
+  DEPRECATED/dgelqs.c DEPRECATED/dgeqrs.c
   DEPRECATED/dgeqpf.c DEPRECATED/dgelsx.c DEPRECATED/dggsvd.c
   DEPRECATED/dggsvp.c DEPRECATED/dlahrd.c DEPRECATED/dlatzm.c DEPRECATED/dtzrqf.c)
 list(APPEND CLASRC DEPRECATED/cgegs.c DEPRECATED/cgegv.c
+  DEPRECATED/cgelqs.c DEPRECATED/cgeqrs.c
   DEPRECATED/cgeqpf.c DEPRECATED/cgelsx.c DEPRECATED/cggsvd.c
   DEPRECATED/cggsvp.c DEPRECATED/clahrd.c DEPRECATED/clatzm.c DEPRECATED/ctzrqf.c)
 list(APPEND ZLASRC DEPRECATED/zgegs.c DEPRECATED/zgegv.c
+  DEPRECATED/zgelqs.c DEPRECATED/zgeqrs.c
   DEPRECATED/zgeqpf.c DEPRECATED/zgelsx.c DEPRECATED/zggsvd.c
   DEPRECATED/zggsvp.c DEPRECATED/zlahrd.c DEPRECATED/zlatzm.c DEPRECATED/ztzrqf.c)
 message(STATUS "Building deprecated routines")
+endif()
 
 set(DSLASRC spotrs.c)
 
diff --git a/cmake/lapacke.cmake b/cmake/lapacke.cmake
index 3a9352197..f43bf10d0 100644
--- a/cmake/lapacke.cmake
+++ b/cmake/lapacke.cmake
@@ -70,8 +70,6 @@ set(CSRC
   lapacke_cgeqlf_work.c
   lapacke_cgeqp3.c
   lapacke_cgeqp3_work.c
-  lapacke_cgeqpf.c
-  lapacke_cgeqpf_work.c
   lapacke_cgeqr.c
   lapacke_cgeqr_work.c
   lapacke_cgeqr2.c
@@ -92,6 +90,10 @@ set(CSRC
   lapacke_cgerqf_work.c
   lapacke_cgesdd.c
   lapacke_cgesdd_work.c
+  lapacke_cgedmd.c
+  lapacke_cgedmd_work.c
+  lapacke_cgedmdq.c
+  lapacke_cgedmdq_work.c
   lapacke_cgesv.c
   lapacke_cgesv_work.c
   lapacke_cgesvd.c
@@ -144,12 +146,8 @@ set(CSRC
   lapacke_cggqrf_work.c
   lapacke_cggrqf.c
   lapacke_cggrqf_work.c
-  lapacke_cggsvd.c
-  lapacke_cggsvd_work.c
   lapacke_cggsvd3.c
   lapacke_cggsvd3_work.c
-  lapacke_cggsvp.c
-  lapacke_cggsvp_work.c
   lapacke_cggsvp3.c
   lapacke_cggsvp3_work.c
   lapacke_cgtcon.c
@@ -564,6 +562,8 @@ set(CSRC
   lapacke_ctrsna_work.c
   lapacke_ctrsyl.c
   lapacke_ctrsyl_work.c
+  lapacke_ctrsyl3.c
+  lapacke_ctrsyl3_work.c
   lapacke_ctrtri.c
   lapacke_ctrtri_work.c
   lapacke_ctrtrs.c
@@ -596,6 +596,8 @@ set(CSRC
   lapacke_cungtr_work.c
   lapacke_cungtsqr_row.c
   lapacke_cungtsqr_row_work.c
+  lapacke_cunhr_col.c
+  lapacke_cunhr_col_work.c
   lapacke_cunmbr.c
   lapacke_cunmbr_work.c
   lapacke_cunmhr.c
@@ -695,8 +697,6 @@ set(DSRC
   lapacke_dgeqlf_work.c
   lapacke_dgeqp3.c
   lapacke_dgeqp3_work.c
-  lapacke_dgeqpf.c
-  lapacke_dgeqpf_work.c
   lapacke_dgeqr.c
   lapacke_dgeqr_work.c
   lapacke_dgeqr2.c
@@ -717,6 +717,10 @@ set(DSRC
   lapacke_dgerqf_work.c
   lapacke_dgesdd.c
   lapacke_dgesdd_work.c
+  lapacke_dgedmd.c
+  lapacke_dgedmd_work.c
+  lapacke_dgedmdq.c
+  lapacke_dgedmdq_work.c
   lapacke_dgesv.c
   lapacke_dgesv_work.c
   lapacke_dgesvd.c
@@ -771,12 +775,8 @@ set(DSRC
   lapacke_dggqrf_work.c
   lapacke_dggrqf.c
   lapacke_dggrqf_work.c
-  lapacke_dggsvd.c
-  lapacke_dggsvd_work.c
   lapacke_dggsvd3.c
   lapacke_dggsvd3_work.c
-  lapacke_dggsvp.c
-  lapacke_dggsvp_work.c
   lapacke_dggsvp3.c
   lapacke_dggsvp3_work.c
   lapacke_dgtcon.c
@@ -874,6 +874,8 @@ set(DSRC
   lapacke_dorgtr_work.c
   lapacke_dorgtsqr_row.c
   lapacke_dorgtsqr_row_work.c
+  lapacke_dorhr_col.c
+  lapacke_dorhr_col_work.c
   lapacke_dormbr.c
   lapacke_dormbr_work.c
   lapacke_dormhr.c
@@ -1186,6 +1188,8 @@ set(DSRC
   lapacke_dtrsna_work.c
   lapacke_dtrsyl.c
   lapacke_dtrsyl_work.c
+  lapacke_dtrsyl3.c
+  lapacke_dtrsyl3_work.c
   lapacke_dtrtri.c
   lapacke_dtrtri_work.c
   lapacke_dtrtrs.c
@@ -1275,8 +1279,6 @@ set(SSRC
   lapacke_sgeqlf_work.c
   lapacke_sgeqp3.c
   lapacke_sgeqp3_work.c
-  lapacke_sgeqpf.c
-  lapacke_sgeqpf_work.c
   lapacke_sgeqr.c
   lapacke_sgeqr_work.c
   lapacke_sgeqr2.c
@@ -1297,6 +1299,10 @@ set(SSRC
   lapacke_sgerqf_work.c
   lapacke_sgesdd.c
   lapacke_sgesdd_work.c
+  lapacke_sgedmd.c
+  lapacke_sgedmd_work.c
+  lapacke_sgedmdq.c
+  lapacke_sgedmdq_work.c
   lapacke_sgesv.c
   lapacke_sgesv_work.c
   lapacke_sgesvd.c
@@ -1351,12 +1357,8 @@ set(SSRC
   lapacke_sggqrf_work.c
   lapacke_sggrqf.c
   lapacke_sggrqf_work.c
-  lapacke_sggsvd.c
-  lapacke_sggsvd_work.c
   lapacke_sggsvd3.c
   lapacke_sggsvd3_work.c
-  lapacke_sggsvp.c
-  lapacke_sggsvp_work.c
   lapacke_sggsvp3.c
   lapacke_sggsvp3_work.c
   lapacke_sgtcon.c
@@ -1453,6 +1455,8 @@ set(SSRC
   lapacke_sorgtr_work.c
   lapacke_sorgtsqr_row.c
   lapacke_sorgtsqr_row_work.c
+  lapacke_sorhr_col.c
+  lapacke_sorhr_col_work.c
   lapacke_sormbr.c
   lapacke_sormbr_work.c
   lapacke_sormhr.c
@@ -1762,6 +1766,8 @@ set(SSRC
   lapacke_strsna_work.c
   lapacke_strsyl.c
   lapacke_strsyl_work.c
+  lapacke_strsyl3.c
+  lapacke_strsyl3_work.c
   lapacke_strtri.c
   lapacke_strtri_work.c
   lapacke_strtrs.c
@@ -1849,8 +1855,6 @@ set(ZSRC
   lapacke_zgeqlf_work.c
   lapacke_zgeqp3.c
   lapacke_zgeqp3_work.c
-  lapacke_zgeqpf.c
-  lapacke_zgeqpf_work.c
   lapacke_zgeqr.c
   lapacke_zgeqr_work.c
   lapacke_zgeqr2.c
@@ -1871,6 +1875,10 @@ set(ZSRC
   lapacke_zgerqf_work.c
   lapacke_zgesdd.c
   lapacke_zgesdd_work.c
+  lapacke_zgedmd.c
+  lapacke_zgedmd_work.c
+  lapacke_zgedmdq.c
+  lapacke_zgedmdq_work.c
   lapacke_zgesv.c
   lapacke_zgesv_work.c
   lapacke_zgesvd.c
@@ -1925,12 +1933,8 @@ set(ZSRC
   lapacke_zggqrf_work.c
   lapacke_zggrqf.c
   lapacke_zggrqf_work.c
-  lapacke_zggsvd.c
-  lapacke_zggsvd_work.c
   lapacke_zggsvd3.c
   lapacke_zggsvd3_work.c
-  lapacke_zggsvp.c
-  lapacke_zggsvp_work.c
   lapacke_zggsvp3.c
   lapacke_zggsvp3_work.c
   lapacke_zgtcon.c
@@ -2343,6 +2347,8 @@ set(ZSRC
   lapacke_ztrsna_work.c
   lapacke_ztrsyl.c
   lapacke_ztrsyl_work.c
+  lapacke_ztrsyl3.c
+  lapacke_ztrsyl3_work.c
   lapacke_ztrtri.c
   lapacke_ztrtri_work.c
   lapacke_ztrtrs.c
@@ -2375,6 +2381,8 @@ set(ZSRC
   lapacke_zungtr_work.c
   lapacke_zungtsqr_row.c
   lapacke_zungtsqr_row_work.c
+  lapacke_zunhr_col.c
+  lapacke_zunhr_col_work.c
   lapacke_zunmbr.c
   lapacke_zunmbr_work.c
   lapacke_zunmhr.c
@@ -2401,6 +2409,12 @@ set(ZSRC
   lapacke_csyr_work.c
   lapacke_ilaver.c
 )
+if (BUILD_LAPACK_DEPRECATED)  # append the deprecated LAPACKE wrappers to the per-precision source lists
+  list(APPEND SSRC lapacke_sgeqpf.c lapacke_sgeqpf_work.c lapacke_sggsvd.c lapacke_sggsvd_work.c lapacke_sggsvp.c lapacke_sggsvp_work.c)
+  list(APPEND DSRC lapacke_dgeqpf.c lapacke_dgeqpf_work.c lapacke_dggsvd.c lapacke_dggsvd_work.c lapacke_dggsvp.c lapacke_dggsvp_work.c)
+  list(APPEND CSRC lapacke_cgeqpf.c lapacke_cgeqpf_work.c lapacke_cggsvd.c lapacke_cggsvd_work.c lapacke_cggsvp.c lapacke_cggsvp_work.c)
+  list(APPEND ZSRC lapacke_zgeqpf.c lapacke_zgeqpf_work.c lapacke_zggsvd.c lapacke_zggsvd_work.c lapacke_zggsvp.c lapacke_zggsvp_work.c)
+endif()
 
 set(SRCX
   lapacke_cgbrfsx.c       lapacke_cporfsx.c       lapacke_dgerfsx.c       lapacke_sgbrfsx.c       lapacke_ssyrfsx.c       lapacke_zherfsx.c
diff --git a/cmake/openblas.pc.in b/cmake/openblas.pc.in
index 7e120af86..9526d2df6 100644
--- a/cmake/openblas.pc.in
+++ b/cmake/openblas.pc.in
@@ -1,11 +1,13 @@
 libdir=@CMAKE_INSTALL_FULL_LIBDIR@
+libnameprefix=@LIBNAMEPREFIX@
+libnamesuffix=@LIBNAMESUFFIX@
 libsuffix=@SUFFIX64_UNDERSCORE@
 includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
 
 openblas_config=USE_64BITINT=@INTERFACE64@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@ 
 Name: OpenBLAS
 Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
-Version: @OPENBLAS_VERSION@
-URL: https://github.com/xianyi/OpenBLAS
-Libs: @OpenMP_C_FLAGS@ -L${libdir} -lopenblas${libsuffix} 
+Version: @OpenBLAS_VERSION@
+URL: https://github.com/OpenMathLib/OpenBLAS
+Libs: @OpenMP_C_FLAGS@ -L${libdir} -l${libnameprefix}openblas${libnamesuffix}${libsuffix} 
 Cflags: -I${includedir}
diff --git a/cmake/system.cmake b/cmake/system.cmake
index 631e7fe69..95f34e9f8 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -55,7 +55,7 @@ if (DEFINED TARGET)
 endif ()
 
 # On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
-if (X86_64 AND NOT ${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
+if (X86_64 AND NOT (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" OR ${CMAKE_C_COMPILER_ID} STREQUAL "NVHPC"))
   set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native")
 endif ()
 
@@ -280,7 +280,41 @@ if (DEFINED TARGET)
   if (${TARGET} STREQUAL POWER8)
     set (KERNEL_DEFINITIONS  "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math")
   endif()
+
+if (${TARGET} STREQUAL NEOVERSEV1)
+    if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
+	set (KERNEL_DEFINITIONS  "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1")
+    else ()
+    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
+    if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
+      set (KERNEL_DEFINITIONS  "${KERNEL_DEFINITIONS} -march=armv8.4-a+sve -mtune=neoverse-v1")
+    else ()
+	    message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${GCC_VERSION} does not support Neoverse V1.")
+    endif()
+    endif()
+  endif()
+  if (${TARGET} STREQUAL NEOVERSEN2)
+    if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
+	set (KERNEL_DEFINITIONS  "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
+    else ()
+    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
+    if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
+      set (KERNEL_DEFINITIONS  "${KERNEL_DEFINITIONS} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
+    else ()
+	    message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${GCC_VERSION} does not support Neoverse N2.")
+    endif()
+    endif()
+  endif()
+  if (${TARGET} STREQUAL ARMV8SVE)
+    if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
+      set (KERNEL_DEFINITIONS  "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve")
+    else ()
+      set (KERNEL_DEFINITIONS  "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve")
+    endif()
+  endif()
+
 endif()
+
 if (DEFINED BINARY)
   message(STATUS "Compiling a ${BINARY}-bit binary.")
 endif ()
@@ -467,10 +501,11 @@ set(CCOMMON_OPT "${CCOMMON_OPT} -DBLAS3_MEM_ALLOC_THRESHOLD=${BLAS3_MEM_ALLOC_TH
 endif()
 endif()
 endif()
+# Base library file name: "lib", then LIBNAMEPREFIX (expands to nothing when unset), then "openblas".
+set(LIBPREFIX "lib${LIBNAMEPREFIX}openblas")
+
 if (DEFINED LIBNAMESUFFIX)
-  set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}")
-else ()
-  set(LIBPREFIX "libopenblas")
+  set(LIBPREFIX "${LIBPREFIX}_${LIBNAMESUFFIX}")  # keep the lib...openblas base and add the suffix
 endif ()
 
 if (NOT DEFINED SYMBOLPREFIX)
@@ -645,6 +680,10 @@ else ()
   endif ()
 endif ()
 
+if (DEFINED FIXED_LIBNAME)  # file names built from LIBPREFIX and LIBSUFFIX only
+  set (LIBNAME "${LIBPREFIX}.${LIBSUFFIX}")
+  set (LIBNAME_P "${LIBPREFIX}_p.${LIBSUFFIX}")
+endif()
 
 set(LIBDLLNAME "${LIBPREFIX}.dll")
 set(LIBSONAME "${LIBNAME}.${LIBSUFFIX}.so")
diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake
index c59e85d54..e94497a04 100644
--- a/cmake/system_check.cmake
+++ b/cmake/system_check.cmake
@@ -38,13 +38,15 @@ if(CMAKE_CL_64 OR MINGW64)
   endif()
 elseif(MINGW OR (MSVC AND NOT CMAKE_CROSSCOMPILING))
   set(X86 1)
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*" OR (CMAKE_SYSTEM_NAME MATCHES "Darwin" AND CMAKE_OSX_ARCHITECTURES MATCHES "ppc.*"))
   set(POWER 1)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
   set(MIPS64 1)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*")
   set(LOONGARCH64 1)
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64.*")
+  set(RISCV64 1)
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*" OR (CMAKE_SYSTEM_NAME MATCHES "Darwin" AND CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*"))
   if (NOT BINARY)
     if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
       set(X86_64 1)
@@ -60,7 +62,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
   endif()
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
   set(X86 1)
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*)")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*|armv8.*)")
   if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
     set(ARM64 1)
   else()
@@ -107,7 +109,7 @@ else()
 endif ()
 
 if (NOT BINARY)
-  if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64)
+  if (X86_64 OR ARM64 OR MIPS64 OR LOONGARCH64 OR RISCV64 OR (POWER AND NOT (CMAKE_OSX_ARCHITECTURES STREQUAL "ppc")))
     set(BINARY 64)
   else ()
     set(BINARY 32)
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 56c1cb060..5e8ba866b 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -87,6 +87,15 @@ macro(ParseMakefileVars MAKEFILE_IN)
       #message(STATUS "skipping ${makefile_line}")
       continue ()
     endif ()
+
+    # A line with an empty right-hand side, e.g. "SBGEMM_SMALL_M_PERMIT =",
+    # unsets the named variable.
+    string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*$" line_match "${makefile_line}")
+    if (NOT "${line_match}" STREQUAL "")
+      set(var_name ${CMAKE_MATCH_1})
+      unset(${var_name})
+    endif()
+
     string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}")
     if (NOT "${line_match}" STREQUAL "")
       #message(STATUS "match on ${line_match}")
diff --git a/common.h b/common.h
index 4eeeb8d55..1f0b9e533 100644
--- a/common.h
+++ b/common.h
@@ -396,7 +396,7 @@ typedef int blasint;
 #endif
 
 /***
-To alloc job_t on heap or statck.
+To alloc job_t on heap or stack.
 please https://github.com/xianyi/OpenBLAS/issues/246
 ***/
 #if defined(OS_WINDOWS)
@@ -482,6 +482,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246
 #include "common_e2k.h"
 #endif
 
+#ifdef ARCH_CSKY
+#include "common_csky.h"
+#endif
+
 #ifndef ASSEMBLER
 #ifdef OS_WINDOWSSTORE
 typedef char env_var_t[MAX_PATH];
@@ -525,7 +529,7 @@ static inline unsigned long long rpcc(void){
 #endif // !RPCC_DEFINED
 
 #if !defined(BLAS_LOCK_DEFINED) && defined(__GNUC__)
-static void __inline blas_lock(volatile BLASULONG *address){
+static __inline void blas_lock(volatile BLASULONG *address){
 
   do {
     while (*address) {YIELDING;};
diff --git a/common_alpha.h b/common_alpha.h
index 021eb93ae..e5380454a 100644
--- a/common_alpha.h
+++ b/common_alpha.h
@@ -45,7 +45,7 @@
 #define WMB asm("wmb")
 #define RMB asm("mb")
 
-static void __inline blas_lock(unsigned long *address){
+static __inline void blas_lock(unsigned long *address){
 #ifndef __DECC
   unsigned long tmp1, tmp2;
   asm volatile(
diff --git a/common_arm.h b/common_arm.h
index 682315de5..a3db9953c 100644
--- a/common_arm.h
+++ b/common_arm.h
@@ -55,7 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if defined(ARMV6) || defined(ARMV7) || defined(ARMV8)
 
-static void __inline blas_lock(volatile BLASULONG *address){
+static __inline void blas_lock(volatile BLASULONG *address){
 
   int register ret;
 
diff --git a/common_arm64.h b/common_arm64.h
index 6a18a294c..6ae6a35a3 100644
--- a/common_arm64.h
+++ b/common_arm64.h
@@ -55,7 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifndef ASSEMBLER
 
 
-static void __inline blas_lock(volatile BLASULONG *address){
+static __inline void blas_lock(volatile BLASULONG *address){
 
   BLASULONG ret;
 
@@ -162,7 +162,11 @@ REALNAME:
 #define HUGE_PAGESIZE   ( 4 << 20)
 
 #ifndef BUFFERSIZE
+#if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE)
+#define BUFFER_SIZE     (32 << 22)
+#else
 #define BUFFER_SIZE     (32 << 20)
+#endif
 #else
 #define BUFFER_SIZE	(32 << BUFFERSIZE)
 #endif
diff --git a/common_csky.h b/common_csky.h
new file mode 100644
index 000000000..3095dc781
--- /dev/null
+++ b/common_csky.h
@@ -0,0 +1,56 @@
+/*****************************************************************************
+Copyright (c) 2011-2015, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of 
+      its contributors may be used to endorse or promote products 
+      derived from this software without specific prior written 
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+
+#ifndef COMMON_CSKY
+#define COMMON_CSKY
+
+#define MB  __sync_synchronize()
+#define WMB __sync_synchronize()
+#define RMB __sync_synchronize()
+
+#define INLINE inline
+
+#ifndef ASSEMBLER
+
+
+static inline int blas_quickdivide(blasint x, blasint y){
+  return x / y;
+}
+
+#endif
+
+
+
+#define BUFFER_SIZE     ( 32 << 20)
+#define SEEK_ADDRESS
+
+#endif
\ No newline at end of file
diff --git a/common_interface.h b/common_interface.h
index 318827920..5a2e1654c 100644
--- a/common_interface.h
+++ b/common_interface.h
@@ -498,6 +498,15 @@ void BLASFUNC(zgemm3m)(char *, char *, blasint *, blasint *, blasint *, double *
 void BLASFUNC(xgemm3m)(char *, char *, blasint *, blasint *, blasint *, xdouble *,
 	   xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *);
 
+void BLASFUNC(sgemmt)(char*, char *, char *, blasint *, blasint *, float *,
+	   float  *, blasint *, float  *, blasint *, float  *, float  *, blasint *);
+void BLASFUNC(dgemmt)(char*, char *, char *, blasint *, blasint *, double *,
+	   double *, blasint *, double *, blasint *, double *, double *, blasint *);
+void BLASFUNC(cgemmt)(char*, char *, char *, blasint *, blasint *, float *,
+	   float  *, blasint *, float  *, blasint *, float  *, float  *, blasint *);
+void BLASFUNC(zgemmt)(char*, char *, char *, blasint *, blasint *, double *,
+	   double *, blasint *, double *, blasint *, double *, double *, blasint *);
+
 int BLASFUNC(sge2mm)(char *, char *, char *, blasint *, blasint *,
 		     float *, float  *, blasint *, float  *, blasint *,
 		     float *, float  *, blasint *);
@@ -764,8 +773,8 @@ xdouble   BLASFUNC(qlamc3)(xdouble *, xdouble *);
 
 void    BLASFUNC(saxpby) (blasint *, float  *, float  *, blasint *, float *, float  *, blasint *);
 void    BLASFUNC(daxpby) (blasint *, double  *, double  *, blasint *, double *, double  *, blasint *);
-void    BLASFUNC(caxpby) (blasint *, float  *, float  *, blasint *, float *, float  *, blasint *);
-void    BLASFUNC(zaxpby) (blasint *, double  *, double  *, blasint *, double *, double  *, blasint *);
+void    BLASFUNC(caxpby) (blasint *, void  *, float  *, blasint *, void *, float  *, blasint *);
+void    BLASFUNC(zaxpby) (blasint *, void  *, double *, blasint *, void *, double  *, blasint *);
 
 void    BLASFUNC(somatcopy) (char *, char *, blasint *, blasint *, float  *, float  *, blasint *, float  *, blasint *);
 void    BLASFUNC(domatcopy) (char *, char *, blasint *, blasint *, double  *, double  *, blasint *, double  *, blasint *);
diff --git a/common_loongarch64.h b/common_loongarch64.h
index e15539b5f..b1426da79 100644
--- a/common_loongarch64.h
+++ b/common_loongarch64.h
@@ -83,6 +83,19 @@ static inline int blas_quickdivide(blasint x, blasint y){
   return x / y;
 }
 
+#ifndef NO_AFFINITY
+static inline int WhereAmI(void){
+  int ret = 0, counter = 0;
+  __asm__ volatile (
+    "rdtimel.w  %[counter],   %[id]"
+    : [id]"=r"(ret), [counter]"=r"(counter)
+    :
+    : "memory"
+  );
+  return ret;
+}
+#endif
+
 #ifdef DOUBLE
 #define GET_IMAGE(res)  __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res)  : : "memory")
 #else
@@ -106,12 +119,50 @@ static inline int blas_quickdivide(blasint x, blasint y){
 #define MOV     fmov.d
 #define CMOVT   fsel
 #define MTC     movgr2fr.d
+#define MTG     movfr2gr.d
 #define FABS    fabs.d
+#define FMIN    fmin.d
+#define FMINA   fmina.d
+#define FMAX    fmax.d
+#define FMAXA   fmaxa.d
 #define CMPEQ   fcmp.ceq.d
 #define CMPLE   fcmp.cle.d
 #define CMPLT   fcmp.clt.d
 #define NEG     fneg.d
+#define FFINT   ffint.d.l
+
+#define XVFSUB  xvfsub.d
+#define XVFADD  xvfadd.d
+#define XVFMUL  xvfmul.d
+#define XVFMADD xvfmadd.d
+#define XVFMIN  xvfmin.d
+#define XVFMINA xvfmina.d
+#define XVFMAX  xvfmax.d
+#define XVFMAXA xvfmaxa.d
+#define XVCMPEQ xvfcmp.ceq.d
+#define XVCMPLE xvfcmp.cle.d
+#define XVCMPLT xvfcmp.clt.d
+#define XVMUL   xvfmul.d
+#define XVMSUB  xvfmsub.d
+#define XVNMSUB xvfnmsub.d
+
+#define VFSUB  vfsub.d
+#define VFADD  vfadd.d
+#define VFMUL  vfmul.d
+#define VFMADD vfmadd.d
+#define VFMIN  vfmin.d
+#define VFMINA vfmina.d
+#define VFMAX  vfmax.d
+#define VFMAXA vfmaxa.d
+#define VCMPEQ vfcmp.ceq.d
+#define VCMPLE vfcmp.cle.d
+#define VCMPLT vfcmp.clt.d
+#define VMUL   vfmul.d
+#define VMSUB  vfmsub.d
+#define VNMSUB vfnmsub.d
+
 #else
+
 #define LD      fld.s
 #define ST      fst.s
 #define MADD    fmadd.s
@@ -124,11 +175,48 @@ static inline int blas_quickdivide(blasint x, blasint y){
 #define MOV     fmov.s
 #define CMOVT   fsel
 #define MTC     movgr2fr.w
+#define MTG     movfr2gr.s
 #define FABS    fabs.s
+#define FMIN    fmin.s
+#define FMINA   fmina.s
+#define FMAX    fmax.s
+#define FMAXA   fmaxa.s
 #define CMPEQ   fcmp.ceq.s
 #define CMPLE   fcmp.cle.s
 #define CMPLT   fcmp.clt.s
 #define NEG     fneg.s
+#define FFINT   ffint.s.l
+
+#define XVFSUB  xvfsub.s
+#define XVFADD  xvfadd.s
+#define XVFMUL  xvfmul.s
+#define XVFMADD xvfmadd.s
+#define XVFMIN  xvfmin.s
+#define XVFMINA xvfmina.s
+#define XVFMAX  xvfmax.s
+#define XVFMAXA xvfmaxa.s
+#define XVCMPEQ xvfcmp.ceq.s
+#define XVCMPLE xvfcmp.cle.s
+#define XVCMPLT xvfcmp.clt.s
+#define XVMUL   xvfmul.s
+#define XVMSUB  xvfmsub.s
+#define XVNMSUB xvfnmsub.s
+
+#define VFSUB  vfsub.s
+#define VFADD  vfadd.s
+#define VFMUL  vfmul.s
+#define VFMADD vfmadd.s
+#define VFMIN  vfmin.s
+#define VFMINA vfmina.s
+#define VFMAX  vfmax.s
+#define VFMAXA vfmaxa.s
+#define VCMPEQ vfcmp.ceq.s
+#define VCMPLE vfcmp.cle.s
+#define VCMPLT vfcmp.clt.s
+#define VMUL   vfmul.s
+#define VMSUB  vfmsub.s
+#define VNMSUB vfnmsub.s
+
 #endif /* defined(DOUBLE) */
 
 #if defined(__64BIT__) && defined(USE64BITINT)
diff --git a/common_param.h b/common_param.h
index 1854570f1..c082d248e 100644
--- a/common_param.h
+++ b/common_param.h
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project.                              */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -45,12 +46,14 @@
 
 typedef struct {
   int dtb_entries;
+  int switch_ratio;
   int offsetA, offsetB, align;
 
 #if BUILD_BFLOAT16 == 1
   int sbgemm_p, sbgemm_q, sbgemm_r;
   int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn;
   int sbgemm_align_k;
+  int need_amxtile_permission;  // 0 by default; 1 if the device supports AMX.
 
   void   (*sbstobf16_k) (BLASLONG, float    *, BLASLONG, bfloat16 *, BLASLONG);
   void   (*sbdtobf16_k) (BLASLONG, double   *, BLASLONG, bfloat16 *, BLASLONG);
diff --git a/common_power.h b/common_power.h
index a49197fd7..3fe776f23 100644
--- a/common_power.h
+++ b/common_power.h
@@ -91,7 +91,7 @@
 
 void *qalloc(int flags, size_t bytes);
 
-static void INLINE blas_lock(volatile unsigned long *address){
+static INLINE void blas_lock(volatile unsigned long *address){
 
   long int ret, val = 1;
 
diff --git a/common_riscv64.h b/common_riscv64.h
index 7ddbe80a4..eccfc644f 100644
--- a/common_riscv64.h
+++ b/common_riscv64.h
@@ -91,8 +91,26 @@ static inline int blas_quickdivide(blasint x, blasint y){
 #define BUFFER_SIZE     ( 32 << 20)
 #define SEEK_ADDRESS
 
-#if defined(C910V)
-#include <riscv_vector.h>
+#if defined(C910V) || defined(RISCV64_ZVL256B) || defined(RISCV64_ZVL128B) || defined(x280)
+# include <riscv_vector.h>
+#endif
+
+#if defined( __riscv_xtheadc ) && defined( __riscv_v ) && ( __riscv_v <= 7000 )
+// t-head toolchain uses obsolete rvv intrinsics, can't build for C910V without this
+#define RISCV_0p10_INTRINSICS
+#define RISCV_RVV(x) x
+#else
+#define RISCV_RVV(x) __riscv_ ## x
+#endif
+
+#if defined(C910V) || defined(RISCV64_ZVL256B)
+# if !defined(DOUBLE)
+#  define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f32m1_f32)(v)
+# else
+#  define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f64m1_f64)(v)
+# endif
+#else
+# define EXTRACT_FLOAT(v) (v[0])
 #endif
 
 #endif
diff --git a/common_sparc.h b/common_sparc.h
index 90a24ebf1..4b9e7840a 100644
--- a/common_sparc.h
+++ b/common_sparc.h
@@ -45,7 +45,7 @@
 
 #ifndef ASSEMBLER
 
-static void __inline blas_lock(volatile unsigned long *address){
+static __inline void blas_lock(volatile unsigned long *address){
 
   long int ret = 1;
 
diff --git a/common_thread.h b/common_thread.h
index 05e1d5489..d37fcb189 100644
--- a/common_thread.h
+++ b/common_thread.h
@@ -53,7 +53,6 @@ extern void goto_set_num_threads(int nthreads);
 /* Global Parameter */
 extern int blas_cpu_number;
 extern int blas_num_threads;
-extern int blas_num_threads_set;
 extern int blas_omp_linked;
 
 #define BLAS_LEGACY	0x8000U
@@ -112,8 +111,9 @@ typedef struct blas_queue {
   struct blas_queue *next;
 
 #if defined( __WIN32__) || defined(__CYGWIN32__) || defined(_WIN32) || defined(__CYGWIN__)
-  CRITICAL_SECTION lock;
-  HANDLE finish;
+  // CRITICAL_SECTION lock;
+  // HANDLE finish;
+  volatile int finished;
 #else
   pthread_mutex_t	 lock;
   pthread_cond_t	 finished;
@@ -136,27 +136,32 @@ typedef struct blas_queue {
 #ifdef SMP_SERVER
 
 extern int blas_server_avail;
+extern int blas_omp_number_max;
+extern int blas_omp_threads_local;
 
 static __inline int num_cpu_avail(int level) {
 
 #ifdef USE_OPENMP
 int openmp_nthreads;
-	if (blas_num_threads_set == 0)
 	openmp_nthreads=omp_get_max_threads();
-	else
-	openmp_nthreads=blas_cpu_number;
+	if (omp_in_parallel()) openmp_nthreads = blas_omp_threads_local;
 #endif
 
 #ifndef USE_OPENMP 
   if (blas_cpu_number == 1
-#endif
-#ifdef USE_OPENMP
-     if (openmp_nthreads == 1 || omp_in_parallel()
+#else
+     if (openmp_nthreads == 1 
 #endif
       ) return 1;        
 
 #ifdef USE_OPENMP
-  if (blas_cpu_number != openmp_nthreads) {
+     if (openmp_nthreads > blas_omp_number_max){
+#ifdef DEBUG
+     fprintf(stderr,"WARNING - more OpenMP threads requested (%d) than available (%d)\n",openmp_nthreads,blas_omp_number_max);
+#endif
+     openmp_nthreads = blas_omp_number_max;
+     }
+     if (blas_cpu_number != openmp_nthreads) {
 	  goto_set_num_threads(openmp_nthreads);
   }
 #endif
@@ -189,27 +194,27 @@ int exec_blas(BLASLONG num_cpu, blas_param_t *param, void *buffer);
 int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha,
 		       void *a, BLASLONG lda,
 		       void *b, BLASLONG ldb,
-		       void *c, BLASLONG ldc, int (*function)(), int threads);
+		       void *c, BLASLONG ldc, int (*function)(void), int threads);
 
-int gemm_thread_m (int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG);
+int gemm_thread_m (int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT *, FLOAT *, BLASLONG ), void *, void *, BLASLONG);
 
-int gemm_thread_n (int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG);
+int gemm_thread_n (int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT*, FLOAT*, BLASLONG), void *, void *, BLASLONG);
 
-int gemm_thread_mn(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG);
+int gemm_thread_mn(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT *, FLOAT *, BLASLONG), void *, void *, BLASLONG);
 
-int gemm_thread_variable(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG, BLASLONG);
+int gemm_thread_variable(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT *, FLOAT *, BLASLONG), void *, void *, BLASLONG, BLASLONG);
 
 int trsm_thread(int mode, BLASLONG m, BLASLONG n,
 		double alpha_r, double alpha_i,
 		void *a, BLASLONG lda,
-		void *c, BLASLONG ldc, int (*function)(), void *buffer);
+		void *c, BLASLONG ldc, int (*function)(void), void *buffer);
 
-int syrk_thread(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG);
+int syrk_thread(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*, FLOAT *, FLOAT *, BLASLONG), void*, void*, BLASLONG);
 
 int getrf_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k,
 		 void *offsetA, BLASLONG lda,
 		 void *offsetB, BLASLONG jb,
-		 void *ipiv, BLASLONG offset, int (*function)(), void *buffer);
+		 void *ipiv, BLASLONG offset, int (*function)(void), void *buffer);
 
 #endif  /* ENDIF ASSEMBLER */
 
diff --git a/common_x86.h b/common_x86.h
index bc77eca58..65fb9a460 100644
--- a/common_x86.h
+++ b/common_x86.h
@@ -54,7 +54,7 @@
 #define	__volatile__
 #endif
 
-static void __inline blas_lock(volatile BLASULONG *address){
+static __inline void blas_lock(volatile BLASULONG *address){
 
   int ret;
 
diff --git a/common_x86_64.h b/common_x86_64.h
index 729a055ce..dda168d6c 100644
--- a/common_x86_64.h
+++ b/common_x86_64.h
@@ -70,7 +70,7 @@
 #define RMB
 #endif
 
-static void __inline blas_lock(volatile BLASULONG *address){
+static __inline void blas_lock(volatile BLASULONG *address){
 
 	
 #ifndef C_MSVC
diff --git a/common_zarch.h b/common_zarch.h
index 442bae821..80609251b 100644
--- a/common_zarch.h
+++ b/common_zarch.h
@@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifndef ASSEMBLER
 
   /*
-static void __inline blas_lock(volatile BLASULONG *address){
+static __inline void blas_lock(volatile BLASULONG *address){
 
   BLASULONG ret;
 
diff --git a/cpuid_arm64.c b/cpuid_arm64.c
index 1080ea974..8c5d04c14 100644
--- a/cpuid_arm64.c
+++ b/cpuid_arm64.c
@@ -267,8 +267,10 @@ int detect(void)
 	}
 #else
 #ifdef __APPLE__
-	sysctlbyname("hw.cpufamily",&value,&length,NULL,0);
-	if (value ==131287967|| value == 458787763 ) return CPU_VORTEX;
+	sysctlbyname("hw.cpufamily",&value64,&length64,NULL,0);
+	if (value64 ==131287967|| value64 == 458787763 ) return CPU_VORTEX; //A12/M1
+	if (value64 == 3660830781) return CPU_VORTEX; //A15/M2
+	if (value64 == 2271604202) return CPU_VORTEX; //A16/M3
 #endif
 	return CPU_ARMV8;	
 #endif
diff --git a/cpuid_loongarch64.c b/cpuid_loongarch64.c
index ca07c7ffb..0ad32ae4e 100644
--- a/cpuid_loongarch64.c
+++ b/cpuid_loongarch64.c
@@ -32,6 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 **********************************************************************************/
 
 #include <stdint.h>
+#include <sys/auxv.h>
 
 /*  If LASX extension instructions supported,
  *  using core LOONGSON3R5
@@ -46,9 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CPU_LOONGSON3R5    1
 #define CPU_LOONGSON2K1000 2
 
-#define LOONGARCH_CFG2  0x02
-#define LOONGARCH_LASX  1<<7
-#define LOONGARCH_LSX   1<<6
+#define LA_HWCAP_LSX    (1U << 4)
+#define LA_HWCAP_LASX   (1U << 5)
 
 static char *cpuname[] = {
   "LOONGSONGENERIC",
@@ -64,17 +64,11 @@ static char *cpuname_lower[] = {
 
 int detect(void) {
 #ifdef __linux
-  uint32_t reg = 0;
+  int hwcap  = (int)getauxval(AT_HWCAP);
 
-  __asm__ volatile (
-    "cpucfg %0, %1 \n\t"
-    : "+&r"(reg)
-    : "r"(LOONGARCH_CFG2)
-  );
-
-  if (reg & LOONGARCH_LASX)
+  if (hwcap & LA_HWCAP_LASX)
     return CPU_LOONGSON3R5;
-  else if (reg & LOONGARCH_LSX)
+  else if (hwcap & LA_HWCAP_LSX)
     return CPU_LOONGSON2K1000;
   else
     return CPU_GENERIC;
@@ -100,7 +94,9 @@ void get_subdirname(void) {
 }
 
 void get_cpuconfig(void) {
+  uint32_t hwcaps = 0;
   int d = detect();
+
   switch (d) {
     case CPU_LOONGSON3R5:
       printf("#define LOONGSON3R5\n");
@@ -135,6 +131,10 @@ void get_cpuconfig(void) {
       printf("#define L2_ASSOCIATIVE 16\n");
     break;
   }
+
+  hwcaps = (uint32_t)getauxval( AT_HWCAP );
+  if (hwcaps & LA_HWCAP_LSX)      printf("#define HAVE_LSX\n");
+  if (hwcaps & LA_HWCAP_LASX)     printf("#define HAVE_LASX\n");
 }
 
 void get_libname(void){
diff --git a/cpuid_power.c b/cpuid_power.c
index 2526e8d0e..1ced8930a 100644
--- a/cpuid_power.c
+++ b/cpuid_power.c
@@ -160,6 +160,7 @@ int detect(void){
   infoCount = HOST_BASIC_INFO_COUNT;
   host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&hostInfo, &infoCount);
 
+  if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_7400) return CPUTYPE_PPCG4;
   if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_7450) return CPUTYPE_PPCG4;
   if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_970)  return CPUTYPE_PPC970;
 
diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c
index 894d2b873..ff7ba2aad 100644
--- a/cpuid_riscv64.c
+++ b/cpuid_riscv64.c
@@ -70,12 +70,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/
 
-#define CPU_GENERIC   0
-#define CPU_C910V     1
+#define CPU_GENERIC         0
+#define CPU_C910V           1
+#define CPU_x280            2
+#define CPU_RISCV64_ZVL256B 3
+#define CPU_RISCV64_ZVL128B 4
 
 static char *cpuname[] = {
   "RISCV64_GENERIC",
-  "C910V"
+  "C910V",
+  "x280",
+  "RISCV64_ZVL256B", /* no CPU_ prefix: matches cpuname_lower and the RISCV64_ZVL* macros */
+  "RISCV64_ZVL128B"
+};
+
+static char *cpuname_lower[] = {
+  "riscv64_generic",
+  "c910v",
+  "x280",
+  "riscv64_zvl256b",
+  "riscv64_zvl128b"
 };
 
 int detect(void){
@@ -86,23 +100,29 @@ int detect(void){
   char *pmodel = NULL, *pisa = NULL;
 
   infile = fopen("/proc/cpuinfo", "r");
+  if (!infile)
+    return CPU_GENERIC;
   while (fgets(buffer, sizeof(buffer), infile)){
     if(!strncmp(buffer, "model name", 10)){
       strcpy(model_buffer, buffer);
-      pmodel = strchr(isa_buffer, ':') + 1;
+      pmodel = strchr(model_buffer, ':');
+      if (pmodel)
+        pmodel++;
     }
 
     if(!strncmp(buffer, "isa", 3)){
       strcpy(isa_buffer, buffer);
-      pisa = strchr(isa_buffer, '4') + 1;
+      pisa = strchr(isa_buffer, '4');
+      if (pisa)
+        pisa++;
     }
   }
 
   fclose(infile);
 
-  if (!pmodel)
+  if (!pmodel || !pisa)
    return(CPU_GENERIC);
-   
+
   if (strstr(pmodel, check_c910_str) && strchr(pisa, 'v'))
     return CPU_C910V;
 
@@ -140,5 +160,5 @@ void get_cpuconfig(void){
 }
 
 void get_libname(void){
-  printf("riscv64\n");
+  printf("%s", cpuname_lower[detect()]);
 }
diff --git a/cpuid_x86.c b/cpuid_x86.c
index ad13a8c8c..6cf4d6503 100644
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@@ -194,7 +194,7 @@ static C_INLINE void xgetbv(int op, int * eax, int * edx){
 }
 #endif
 
-int support_avx(){
+int support_avx(void){
 #ifndef NO_AVX
   int eax, ebx, ecx, edx;
   int ret=0;
@@ -212,7 +212,7 @@ int support_avx(){
 #endif
 }
 
-int support_avx2(){
+int support_avx2(void){
 #ifndef NO_AVX2
   int eax, ebx, ecx=0, edx;
   int ret=0;
@@ -228,7 +228,7 @@ int support_avx2(){
 #endif
 }
 
-int support_avx512(){
+int support_avx512(void){
 #if !defined(NO_AVX) && !defined(NO_AVX512)
   int eax, ebx, ecx, edx;
   int ret=0;
@@ -250,7 +250,7 @@ int support_avx512(){
 #endif
 }
 
-int support_avx512_bf16(){
+int support_avx512_bf16(void){
 #if !defined(NO_AVX) && !defined(NO_AVX512)
   int eax, ebx, ecx, edx;
   int ret=0;
@@ -271,7 +271,7 @@ int support_avx512_bf16(){
 #define BIT_AMX_BF16	0x00400000
 #define BIT_AMX_ENBD	0x00060000
 
-int support_amx_bf16() {
+int support_amx_bf16(void) {
 #if !defined(NO_AVX) && !defined(NO_AVX512)
   int eax, ebx, ecx, edx;
   int ret=0;
@@ -1479,6 +1479,8 @@ int get_cpuname(void){
           else
 	    return CPUTYPE_NEHALEM;
 	case 15: // Sapphire Rapids
+	  if(support_amx_bf16())
+	    return CPUTYPE_SAPPHIRERAPIDS;
 	  if(support_avx512_bf16())
             return CPUTYPE_COOPERLAKE;	
           if(support_avx512())
@@ -1549,6 +1551,7 @@ int get_cpuname(void){
           case 7: // Raptor Lake
           case 10:
           case 15:
+	  case 14: // Alder Lake N
             if(support_avx2())
               return CPUTYPE_HASWELL;
 	    if(support_avx())
@@ -1657,7 +1660,13 @@ int get_cpuname(void){
 	  else
 	    return CPUTYPE_BARCELONA;
         }
-      case 10: // Zen3		      
+      case 10: // Zen3/4
+#ifndef NO_AVX512
+          if(support_avx512_bf16())
+            return CPUTYPE_COOPERLAKE;
+          if(support_avx512())
+            return CPUTYPE_SKYLAKEX;
+#endif
 	if(support_avx())
 #ifndef NO_AVX2
 	    return CPUTYPE_ZEN;
@@ -1845,7 +1854,8 @@ static char *cpuname[] = {
   "ZEN",
   "SKYLAKEX",
   "DHYANA",
-  "COOPERLAKE"
+  "COOPERLAKE",
+  "SAPPHIRERAPIDS",
 };
 
 static char *lowercpuname[] = {
@@ -1902,7 +1912,8 @@ static char *lowercpuname[] = {
   "zen",
   "skylakex",
   "dhyana",
-  "cooperlake"
+  "cooperlake",
+  "sapphirerapids",
 };
 
 static char *corename[] = {
@@ -1936,7 +1947,8 @@ static char *corename[] = {
   "ZEN",
   "SKYLAKEX",
   "DHYANA",
-  "COOPERLAKE"
+  "COOPERLAKE",
+  "SAPPHIRERAPIDS",
 };
 
 static char *corename_lower[] = {
@@ -1970,7 +1982,8 @@ static char *corename_lower[] = {
   "zen",
   "skylakex",
   "dhyana",
-  "cooperlake"
+  "cooperlake",
+  "sapphirerapids",
 };
 
 
@@ -2276,16 +2289,18 @@ int get_coretype(void){
             return CORE_NEHALEM;
 	}
         if (model == 15) { // Sapphire Rapids
+	  if(support_amx_bf16())
+	    return CORE_SAPPHIRERAPIDS;
 	  if(support_avx512_bf16())
-            return CPUTYPE_COOPERLAKE;	
+            return CORE_COOPERLAKE;	
           if(support_avx512())
-            return CPUTYPE_SKYLAKEX;
+            return CORE_SKYLAKEX;
           if(support_avx2())
-            return CPUTYPE_HASWELL;
+            return CORE_HASWELL;
           if(support_avx())
-	    return CPUTYPE_SANDYBRIDGE;
+	    return CORE_SANDYBRIDGE;
 	  else
-	  return CPUTYPE_NEHALEM;	
+	  return CORE_NEHALEM;
         }
       break;
 
@@ -2352,6 +2367,7 @@ int get_coretype(void){
 	  case 7: // Raptor Lake
           case 10:
           case 15:
+	  case 14: // Alder Lake N	  
 #ifndef NO_AVX2
 	  if(support_avx2())
             return CORE_HASWELL;
@@ -2428,6 +2444,12 @@ int get_coretype(void){
 	  // Ryzen 2
 	default:
 	  // Matisse,Renoir Ryzen2 models		
+#ifndef NO_AVX512
+          if(support_avx512_bf16())
+            return CORE_COOPERLAKE;
+          if(support_avx512())
+            return CORE_SKYLAKEX;
+#endif
 	  if(support_avx())
 #ifndef NO_AVX2
 	    return CORE_ZEN;
diff --git a/ctest.c b/ctest.c
index 2ccae8dcc..cbc15326f 100644
--- a/ctest.c
+++ b/ctest.c
@@ -173,6 +173,10 @@ HAVE_C11
 ARCH_E2K
 #endif
 
+#if defined(__csky__)
+ARCH_CSKY
+#endif
+
 #if defined(__EMSCRIPTEN__)
 ARCH_RISCV64
 OS_WINDOWS
diff --git a/ctest/CMakeLists.txt b/ctest/CMakeLists.txt
index 91338b73b..c56a78346 100644
--- a/ctest/CMakeLists.txt
+++ b/ctest/CMakeLists.txt
@@ -40,6 +40,10 @@ else()
     c_${float_char}blas1.c)
 endif()
   target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME})
+  if (USE_OPENMP AND ("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") AND ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang")) # quoted: an empty compiler ID must still parse
+        string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
+        target_link_libraries(x${float_char}cblat1 omp pthread)
+  endif()
   if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
       target_link_libraries(x${float_char}cblat1 m)
   endif()
@@ -65,6 +69,10 @@ else()
     constant.c)
 endif()
   target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME})
+  if (USE_OPENMP AND ("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") AND ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang")) # quoted: an empty compiler ID must still parse
+        string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
+        target_link_libraries(x${float_char}cblat2 omp pthread)
+  endif()
   if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
     target_link_libraries(x${float_char}cblat2 m)
   endif()
@@ -80,6 +88,17 @@ if (NOT NOFORTRAN)
     auxiliary.c
     c_xerbla.c
     constant.c)
+  if (USE_GEMM3M)
+  if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
+  add_executable(x${float_char}cblat3_3m
+    c_${float_char}blat3_3m.f
+    c_${float_char}blas3_3m.c
+    c_${float_char}3chke_3m.c
+    auxiliary.c
+    c_xerbla.c
+    constant.c)
+  endif()
+  endif()
 else()
   add_executable(x${float_char}cblat3
     c_${float_char}blat3c.c
@@ -88,12 +107,44 @@ else()
     auxiliary.c
     c_xerbla.c
     constant.c)
+  if (USE_GEMM3M)
+  if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
+  add_executable(x${float_char}cblat3_3m
+    c_${float_char}blat3c_3m.c
+    c_${float_char}blas3_3m.c
+    c_${float_char}3chke_3m.c
+    auxiliary.c
+    c_xerbla.c
+    constant.c)
+  endif()
+  endif()
 endif()
   target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME})
+  if (USE_OPENMP AND ("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") AND ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang")) # quoted: an empty compiler ID must still parse
+        string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
+        target_link_libraries(x${float_char}cblat3 omp pthread)
+  endif()
   if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
     target_link_libraries(x${float_char}cblat3 m)
   endif()
+  if (USE_GEMM3M)
+  if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
+  target_link_libraries(x${float_char}cblat3_3m ${OpenBLAS_LIBNAME})
+  if (USE_OPENMP AND ("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") AND ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang")) # quoted: an empty compiler ID must still parse
+        string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
+        target_link_libraries(x${float_char}cblat3_3m omp pthread) # the _3m executable, not plain cblat3
+  endif()
+  if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
+    target_link_libraries(x${float_char}cblat3_3m m)
+  endif()
+  endif()
+  endif()
   add_test(NAME "x${float_char}cblat3"
     COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3")
-
+    if (USE_GEMM3M)
+    if ((${float_char} STREQUAL "c") OR (${float_char} STREQUAL "z"))
+  add_test(NAME "x${float_char}cblat3_3m"
+    COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3_3m> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3_3m")
+    endif()
+  endif()
 endforeach()
diff --git a/ctest/Makefile b/ctest/Makefile
index 0fb2450d2..bbaf96f8e 100644
--- a/ctest/Makefile
+++ b/ctest/Makefile
@@ -5,6 +5,24 @@
 TOPDIR = ..
 include $(TOPDIR)/Makefile.system
 
+SUPPORT_GEMM3M = 0
+
+ifeq ($(ARCH), x86)
+SUPPORT_GEMM3M = 1
+endif
+
+ifeq ($(ARCH), x86_64)
+SUPPORT_GEMM3M = 1
+endif
+
+ifeq ($(ARCH), ia64)
+SUPPORT_GEMM3M = 1
+endif
+
+ifeq ($(ARCH), MIPS)
+SUPPORT_GEMM3M = 1
+endif
+
 override CFLAGS += -DADD$(BU) -DCBLAS
 ifeq ($(F_COMPILER),GFORTRAN)
 	override FFLAGS += -fno-tree-vectorize
@@ -144,9 +162,15 @@ all3targets += xdcblat3
 endif
 ifeq ($(BUILD_COMPLEX),1)
 all3targets += xccblat3
+ifeq ($(SUPPORT_GEMM3M),1)
+all3targets += xccblat3_3m
+endif
 endif
 ifeq ($(BUILD_COMPLEX16),1)
 all3targets += xzcblat3
+ifeq ($(SUPPORT_GEMM3M),1)
+all3targets += xzcblat3_3m
+endif
 endif
 
 all3: $(all3targets)
@@ -181,9 +205,9 @@ endif
 endif
 endif
 
-all3_3m: xzcblat3_3m xccblat3_3m
+ifeq ($(SUPPORT_GEMM3M),1)
 ifeq ($(USE_OPENMP), 1)
-ifeq ($(BUILD_SINGLE),1)
+ifeq ($(BUILD_COMPLEX),1)
 	OMP_NUM_THREADS=2 ./xccblat3_3m < cin3_3m
 endif
 ifeq ($(BUILD_COMPLEX16),1)
@@ -197,6 +221,7 @@ ifeq ($(BUILD_COMPLEX16),1)
 	OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m
 endif
 endif
+endif
 
 
 
@@ -208,12 +233,20 @@ FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
 ifeq ($(USE_OPENMP), 1)
 ifeq ($(F_COMPILER), GFORTRAN)
 ifeq ($(C_COMPILER), CLANG)
-CEXTRALIB = -lomp
+CEXTRALIB += -lomp
 endif
 endif
 ifeq ($(F_COMPILER), NAG)
 CEXTRALIB = -lgomp
 endif
+ifeq ($(F_COMPILER), IBM)
+ifeq ($(C_COMPILER), GCC)
+CEXTRALIB += -lgomp
+endif
+ifeq ($(C_COMPILER), CLANG)
+CEXTRALIB += -lomp
+endif
+endif
 endif
 
 ifeq ($(BUILD_SINGLE),1)
@@ -263,8 +296,10 @@ xccblat2: $(ctestl2o) c_cblat2.o $(TOPDIR)/$(LIBNAME)
 	$(FC) $(FLDFLAGS) -o xccblat2 c_cblat2.o $(ctestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
 xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME)
 	$(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
+ifeq ($(SUPPORT_GEMM3M),1)
 xccblat3_3m: $(ctestl3o_3m) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME)
 	$(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
+endif
 else
 xccblat1: $(ctestl1o) c_cblat1c.o $(TOPDIR)/$(LIBNAME)
 	$(CC) $(CFLAGS) -o xccblat1 c_cblat1c.o $(ctestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) 
@@ -272,6 +307,10 @@ xccblat2: $(ctestl2o) c_cblat2c.o $(TOPDIR)/$(LIBNAME)
 	$(CC) $(CFLAGS) -o xccblat2 c_cblat2c.o $(ctestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
 xccblat3: $(ctestl3o) c_cblat3c.o $(TOPDIR)/$(LIBNAME)
 	$(CC) $(CFLAGS) -o xccblat3 c_cblat3c.o $(ctestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
+ifeq ($(SUPPORT_GEMM3M),1)
+xccblat3_3m: $(ctestl3o_3m) c_cblat3c_3m.o $(TOPDIR)/$(LIBNAME)
+	$(CC) $(CFLAGS) -o xccblat3_3m c_cblat3c_3m.o $(ctestl3o_3m) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
+endif
 endif
 endif
 
@@ -285,8 +324,10 @@ xzcblat2: $(ztestl2o) c_zblat2.o $(TOPDIR)/$(LIBNAME)
 	$(FC) $(FLDFLAGS) -o xzcblat2 c_zblat2.o $(ztestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
 xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME)
 	$(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
+ifeq ($(SUPPORT_GEMM3M),1)
 xzcblat3_3m: $(ztestl3o_3m) c_zblat3_3m.o $(TOPDIR)/$(LIBNAME)
 	$(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB)
+endif
 else
 xzcblat1: $(ztestl1o) c_zblat1c.o $(TOPDIR)/$(LIBNAME)
 	$(CC) $(CFLAGS) -o xzcblat1 c_zblat1c.o $(ztestl1o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB)) 
@@ -294,6 +335,10 @@ xzcblat2: $(ztestl2o) c_zblat2c.o $(TOPDIR)/$(LIBNAME)
 	$(CC) $(CFLAGS) -o xzcblat2 c_zblat2c.o $(ztestl2o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
 xzcblat3: $(ztestl3o) c_zblat3c.o $(TOPDIR)/$(LIBNAME)
 	$(CC) $(CFLAGS) -o xzcblat3 c_zblat3c.o $(ztestl3o) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
+ifeq ($(SUPPORT_GEMM3M),1)
+xzcblat3_3m: $(ztestl3o_3m) c_zblat3c_3m.o $(TOPDIR)/$(LIBNAME)
+	$(CC) $(CFLAGS) -o xzcblat3_3m c_zblat3c_3m.o $(ztestl3o_3m) $(LIB) $(CEXTRALIB) $(filter-out -lgfortran,$(EXTRALIB))
+endif
 endif
 endif
 
diff --git a/ctest/c_cblat1.f b/ctest/c_cblat1.f
index 1a123d74d..cad7c7fa7 100644
--- a/ctest/c_cblat1.f
+++ b/ctest/c_cblat1.f
@@ -96,7 +96,7 @@
       INTEGER           ICAMAXTEST
       EXTERNAL          SCASUMTEST, SCNRM2TEST, ICAMAXTEST
 *     .. External Subroutines ..
-      EXTERNAL          CSCAL, CSSCALTEST, CTEST, ITEST1, STEST1
+      EXTERNAL          CSCALTEST, CSSCALTEST, CTEST, ITEST1, STEST1
 *     .. Intrinsic Functions ..
       INTRINSIC         MAX
 *     .. Common blocks ..
@@ -214,8 +214,8 @@
                CALL STEST1(SCASUMTEST(N,CX,INCX),STRUE4(NP1),
      +                     STRUE4(NP1),SFAC)
             ELSE IF (ICASE.EQ.8) THEN
-*              .. CSCAL ..
-               CALL CSCAL(N,CA,CX,INCX)
+*              .. CSCALTEST ..
+               CALL CSCALTEST(N,CA,CX,INCX)
                CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX),
      +                    SFAC)
             ELSE IF (ICASE.EQ.9) THEN
@@ -236,14 +236,14 @@
 *
       INCX = 1
       IF (ICASE.EQ.8) THEN
-*        CSCAL
+*        CSCALTEST
 *        Add a test for alpha equal to zero.
          CA = (0.0E0,0.0E0)
          DO 80 I = 1, 5
             MWPCT(I) = (0.0E0,0.0E0)
             MWPCS(I) = (1.0E0,1.0E0)
    80    CONTINUE
-         CALL CSCAL(5,CA,CX,INCX)
+         CALL CSCALTEST(5,CA,CX,INCX)
          CALL CTEST(5,CX,MWPCT,MWPCS,SFAC)
       ELSE IF (ICASE.EQ.9) THEN
 *        CSSCALTEST
diff --git a/ctest/c_cblat1c.c b/ctest/c_cblat1c.c
index 8c0dd140c..d9a539097 100644
--- a/ctest/c_cblat1c.c
+++ b/ctest/c_cblat1c.c
@@ -242,251 +242,6 @@ typedef struct Namelist Namelist;
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-#if 0
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-#endif
-#if 0
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-#endif
 
 /* Common Block Declarations */
 
@@ -503,16 +258,16 @@ static integer c__1 = 1;
 static integer c__5 = 5;
 static real c_b43 = (float)1.;
 
-/* Main program */ int main()
+/* Main program */ int main(void)
 {
     /* Initialized data */
 
     static real sfac = (float)9.765625e-4;
 
     /* Local variables */
-    extern /* Subroutine */ int check1_(), check2_();
+    extern /* Subroutine */ int check1_(real*), check2_(real*);
     static integer ic;
-    extern /* Subroutine */ int header_();
+    extern /* Subroutine */ int header_(void);
 
 /*     Test program for the COMPLEX    Level 1 CBLAS. */
 /*     Based upon the original CBLAS test routine together with: */
@@ -553,7 +308,7 @@ static real c_b43 = (float)1.;
 
 } /* MAIN__ */
 
-/* Subroutine */ int header_()
+/* Subroutine */ int header_(void)
 {
     /* Initialized data */
 
@@ -564,7 +319,7 @@ static real c_b43 = (float)1.;
     /* Format strings */
 
     /* Builtin functions */
-    integer s_wsfe(), do_fio(), e_wsfe();
+    integer s_wsfe(void), do_fio(void), e_wsfe(void); /* NOTE(review): libf2c defines s_wsfe(cilist*) and do_fio(ftnint*, char*, ftnlen); these (void) prototypes are only safe if the routines are never called from this file - confirm */
 
 /*     .. Parameters .. */
 /*     .. Scalars in Common .. */
@@ -577,8 +332,7 @@ static real c_b43 = (float)1.;
 
 } /* header_ */
 
-/* Subroutine */ int check1_(sfac)
-real *sfac;
+/* Subroutine */ int check1_(real* sfac)
 {
     /* Initialized data */
 
@@ -683,15 +437,16 @@ real *sfac;
 
     /* Local variables */
     static integer i__;
-    extern /* Subroutine */ int ctest_();
+    extern /* Subroutine */ int ctest_(integer*, complex*, complex*, complex*, real*);
     static complex mwpcs[5], mwpct[5];
-    extern /* Subroutine */ int itest1_(), stest1_();
+    extern /* Subroutine */ int itest1_(integer*, integer*), stest1_(real*,real*,real*,real*);
+    extern /* Subroutine */ int cscaltest_(integer*, complex*, complex*, integer*); /* (n, alpha, x, incx) as passed by the CSCAL cases in this routine */
     static complex cx[8];
-    extern real scnrm2test_();
+    extern real scnrm2test_(integer*, complex*, integer*);
     static integer np1;
-    extern integer icamaxtest_();
-    extern /* Subroutine */ int csscaltest_();
-    extern real scasumtest_();
+    extern integer icamaxtest_(integer*, complex*, integer*);
+    extern /* Subroutine */ int csscaltest_(integer*, real*, complex*, integer*);
+    extern real scasumtest_(integer*, complex*, integer*);
     static integer len;
 
 /*     .. Parameters .. */
@@ -727,7 +482,7 @@ real *sfac;
 		stest1_(&r__1, &strue4[np1 - 1], &strue4[np1 - 1], sfac);
 	    } else if (combla_1.icase == 8) {
 /*              .. CSCAL .. */
-		cscal_(&combla_1.n, &ca, cx, &combla_1.incx);
+		cscaltest_(&combla_1.n, &ca, cx, &combla_1.incx);
 		ctest_(&len, cx, &ctrue5[(np1 + combla_1.incx * 5 << 3) - 48],
 			 &ctrue5[(np1 + combla_1.incx * 5 << 3) - 48], sfac);
 	    } else if (combla_1.icase == 9) {
@@ -761,7 +516,7 @@ real *sfac;
 	    mwpcs[i__1].r = (float)1., mwpcs[i__1].i = (float)1.;
 /* L80: */
 	}
-	cscal_(&c__5, &ca, cx, &combla_1.incx);
+	cscaltest_(&c__5, &ca, cx, &combla_1.incx);
 	ctest_(&c__5, cx, mwpct, mwpcs, sfac);
     } else if (combla_1.icase == 9) {
 /*        CSSCALTEST */
@@ -808,8 +563,7 @@ real *sfac;
     return 0;
 } /* check1_ */
 
-/* Subroutine */ int check2_(sfac)
-real *sfac;
+/* Subroutine */ int check2_(real* sfac)
 {
     /* Initialized data */
 
@@ -981,10 +735,10 @@ real *sfac;
     static complex cdot[1];
     static integer lenx, leny, i__;
     static complex ctemp;
-    extern /* Subroutine */ int ctest_();
+    extern /* Subroutine */ int ctest_(integer*, complex*, complex*, complex*, real*);
     static integer ksize;
-    extern /* Subroutine */ int cdotctest_(), ccopytest_(), cdotutest_(), 
-	    cswaptest_(), caxpytest_();
+    extern /* Subroutine */ int cdotctest_(integer*, complex*, integer*, complex*, integer*, complex*), cdotutest_(integer*, complex*, integer*, complex*, integer*, complex*); /* both take the same six pointer arguments */
+    extern /* Subroutine */ int ccopytest_(integer*, complex*, integer*, complex*, integer*), cswaptest_(integer*, complex*, integer*, complex*, integer*), caxpytest_(integer*, complex*, complex*, integer*, complex*, integer*);
     static integer ki, kn;
     static complex cx[7], cy[7];
     static integer mx, my;
@@ -1067,9 +821,7 @@ real *sfac;
     return 0;
 } /* check2_ */
 
-/* Subroutine */ int stest_(len, scomp, strue, ssize, sfac)
-integer *len;
-real *scomp, *strue, *ssize, *sfac;
+/* Subroutine */ int stest_(integer* len, real* scomp, real* strue, real* ssize,real* sfac)
 {
     /* System generated locals */
     integer i__1;
@@ -1077,7 +829,7 @@ real *scomp, *strue, *ssize, *sfac;
 
     /* Local variables */
     static integer i__;
-    extern doublereal sdiff_();
+    extern doublereal sdiff_(real*, real*);
     static real sd;
 
 /*     ********************************* STEST ************************** */
@@ -1133,11 +885,10 @@ L40:
 
 } /* stest_ */
 
-/* Subroutine */ int stest1_(scomp1, strue1, ssize, sfac)
-real *scomp1, *strue1, *ssize, *sfac;
+/* Subroutine */ int stest1_(real* scomp1, real* strue1, real* ssize, real* sfac)
 {
     static real scomp[1], strue[1];
-    extern /* Subroutine */ int stest_();
+    extern /* Subroutine */ int stest_(integer*, real*, real*, real*, real*);
 
 /*     ************************* STEST1 ***************************** */
 
@@ -1164,8 +915,7 @@ real *scomp1, *strue1, *ssize, *sfac;
     return 0;
 } /* stest1_ */
 
-doublereal sdiff_(sa, sb)
-real *sa, *sb;
+doublereal sdiff_(real* sa, real* sb)
 {
     /* System generated locals */
     real ret_val;
@@ -1179,10 +929,7 @@ real *sa, *sb;
     return ret_val;
 } /* sdiff_ */
 
-/* Subroutine */ int ctest_(len, ccomp, ctrue, csize, sfac)
-integer *len;
-complex *ccomp, *ctrue, *csize;
-real *sfac;
+/* Subroutine */ int ctest_(integer* len, complex* ccomp, complex* ctrue, complex* csize, real* sfac)
 {
     /* System generated locals */
     integer i__1, i__2;
@@ -1193,7 +940,7 @@ real *sfac;
     /* Local variables */
     static integer i__;
     static real scomp[20], ssize[20], strue[20];
-    extern /* Subroutine */ int stest_();
+    extern /* Subroutine */ int stest_(integer*, real*,real*,real*,real*);
 
 /*     **************************** CTEST ***************************** */
 
@@ -1231,8 +978,7 @@ real *sfac;
     return 0;
 } /* ctest_ */
 
-/* Subroutine */ int itest1_(icomp, itrue)
-integer *icomp, *itrue;
+/* Subroutine */ int itest1_(integer* icomp, integer* itrue)
 {
     /* Local variables */
     static integer id;
diff --git a/ctest/c_cblat2c.c b/ctest/c_cblat2c.c
index b7616b11d..c0cf0b9e2 100644
--- a/ctest/c_cblat2c.c
+++ b/ctest/c_cblat2c.c
@@ -242,129 +242,6 @@ typedef struct Namelist Namelist;
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-#if 0
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
 
 
 /* Common Block Declarations */
@@ -396,7 +273,7 @@ static integer c_n1 = -1;
 static integer c__0 = 0;
 static logical c_false = FALSE_;
 
-/* Main program */ int main()
+/* Main program */ int main(void)
 {
     /* Initialized data */
 
@@ -414,17 +291,21 @@ static logical c_false = FALSE_;
     static logical same;
     static integer ninc, nbet, ntra;
     static logical rewi;
-    extern /* Subroutine */ int cchk1_(), cchk2_(), cchk3_(), cchk4_(), 
-	    cchk5_(), cchk6_();
+    extern /* Subroutine */ int cchk1_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, complex*, integer*, complex*, integer*, integer*, integer*, integer*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, real*, integer*, ftnlen);
+    extern /* Subroutine */ int cchk2_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, complex*, integer*, complex*, integer*, integer*, integer*, integer*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, real*, integer*, ftnlen);
+    extern /* Subroutine */ int cchk3_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, integer*, integer*, integer*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, real*, complex*, integer*, ftnlen);
+    extern /* Subroutine */ int cchk4_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, complex*, integer*, integer*, integer*, integer*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, real*, complex*, integer*, ftnlen);
+    extern /* Subroutine */ int cchk5_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, complex*, integer*, integer*, integer*, integer*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, real*, complex*, integer*, ftnlen);
+    extern /* Subroutine */ int cchk6_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, complex*, integer*, integer*, integer*, integer*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, complex*, real*, complex*, integer*, ftnlen);
     static complex a[4225]	/* was [65][65] */;
     static real g[65];
     static integer i__, j, n;
     static logical fatal;
     static complex x[65], y[65], z__[130];
-    extern doublereal sdiff_();
+    extern doublereal sdiff_(real*, real*);
     static logical trace;
     static integer nidim;
-    extern /* Subroutine */ int cmvch_();
+    extern /* Subroutine */ int cmvch_(char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, complex*, real*, complex*, real*, real*, logical*, integer*, logical*, ftnlen);
     static char snaps[32], trans[1];
     static integer isnum;
     static logical ltest[17];
@@ -438,11 +319,11 @@ static logical c_false = FALSE_;
     static char snamet[12];
     static real thresh;
     static logical rorder;
-    extern /* Subroutine */ int cc2chke_();
+    extern /* Subroutine */ void cc2chke_(char*, ftnlen);
     static integer layout;
     static logical ltestt, tsterr;
     static complex alf[7];
-    extern logical lce_();
+    extern logical lce_(complex*, complex*, integer*);
     static integer inc[7], nkb;
     static complex bet[7];
     static real eps, err;
@@ -983,22 +864,7 @@ L240:
 
 } /* MAIN__ */
 
-/* Subroutine */ int cchk1_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nkb, kb, nalf, alf, nbet, bet, ninc, inc, nmax, 
-	incmax, a, aa, as, x, xx, xs, y, yy, ys, yt, g, iorder, sname_len)
-char *sname;
-real *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nkb, *kb, *nalf;
-complex *alf;
-integer *nbet;
-complex *bet;
-integer *ninc, *inc, *nmax, *incmax;
-complex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt;
-real *g;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int cchk1_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* nalf, complex* alf, integer* nbet, complex* bet, integer* ninc, integer* inc, integer* nmax, integer* incmax, complex* a, complex* aa, complex* as, complex* x, complex* xx, complex* xs, complex* y, complex* yy, complex* ys, complex* yt, real* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -1015,10 +881,10 @@ ftnlen sname_len;
     static integer incx, incy;
     static logical full, tran, null;
     static integer i__, m, n;
-    extern /* Subroutine */ int cmake_();
+    extern /* Subroutine */ int cmake_(char*, char*, char*, integer*, integer*, complex*, integer*, complex*, integer*, integer*, integer*, logical*, complex*, ftnlen, ftnlen, ftnlen);
     static complex alpha;
     static logical isame[13];
-    extern /* Subroutine */ int cmvch_();
+    extern /* Subroutine */ int cmvch_(char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, complex*, real*, complex*, real*, real*, logical*, integer*, logical*, ftnlen);
     static integer nargs;
     static logical reset;
     static integer incxs, incys;
@@ -1026,14 +892,15 @@ ftnlen sname_len;
     static integer ia, ib, ic;
     static logical banded;
     static integer nc, nd, im, in, kl, ml, nk, nl, ku, ix, iy, ms, lx, ly, ns;
-    extern /* Subroutine */ int ccgbmv_(), ccgemv_();
-    extern logical lceres_();
+    extern /* Subroutine */ int ccgbmv_(integer*, char*, integer*, integer*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, ftnlen); /* NOTE(review): declared int while ccgemv_ on the next line is declared void - confirm against the wrapper's definition */
+    extern /* Subroutine */ void ccgemv_(integer*, char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, ftnlen);
+    extern logical lceres_(char*, char*, integer*, integer*, complex*, complex*, integer*, ftnlen, ftnlen);
     static char ctrans[14];
     static real errmax;
     static complex transl;
     static char transs[1];
     static integer laa, lda;
-    extern logical lce_();
+    extern logical lce_(complex*, complex*, integer*);
     static complex als, bls;
     static real err;
     static integer iku, kls, kus;
@@ -1448,22 +1315,7 @@ L140:
 
 } /* cchk1_ */
 
-/* Subroutine */ int cchk2_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nkb, kb, nalf, alf, nbet, bet, ninc, inc, nmax, 
-	incmax, a, aa, as, x, xx, xs, y, yy, ys, yt, g, iorder, sname_len)
-char *sname;
-real *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nkb, *kb, *nalf;
-complex *alf;
-integer *nbet;
-complex *bet;
-integer *ninc, *inc, *nmax, *incmax;
-complex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt;
-real *g;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int cchk2_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* nalf, complex* alf, integer* nbet, complex* bet, integer* ninc, integer* inc, integer* nmax, integer* incmax, complex* a, complex* aa, complex* as, complex* x, complex* xx, complex* xs, complex* y, complex* yy, complex* ys, complex* yt, real* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -1481,10 +1333,10 @@ ftnlen sname_len;
     static logical full, null;
     static char uplo[1];
     static integer i__, k, n;
-    extern /* Subroutine */ int cmake_();
+    extern /* Subroutine */ int cmake_(char*, char*, char*, integer*, integer*, complex*, integer*, complex*, integer*, integer*, integer*, logical*, complex*, ftnlen, ftnlen, ftnlen);
     static complex alpha;
     static logical isame[13];
-    extern /* Subroutine */ int cmvch_();
+    extern /* Subroutine */ int cmvch_(char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, complex*, real*, complex*, real*, real*, logical*, integer*, logical*, ftnlen);
     static integer nargs;
     static logical reset;
     static char cuplo[14];
@@ -1495,13 +1347,14 @@ ftnlen sname_len;
     static integer nc, ik, in;
     static logical packed;
     static integer nk, ks, ix, iy, ns, lx, ly;
-    extern /* Subroutine */ int cchbmv_(), cchemv_();
-    extern logical lceres_();
-    extern /* Subroutine */ int cchpmv_();
+    extern /* Subroutine */ void cchbmv_(integer*, char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, ftnlen);
+    extern /* Subroutine */ void cchemv_(integer*, char*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, ftnlen);
+    extern logical lceres_(char*, char*, integer*, integer*, complex*, complex*, integer*, ftnlen, ftnlen);
+    extern /* Subroutine */ void cchpmv_(integer*, char*, integer*, complex*, complex*, complex*, integer*, complex*, complex*, integer*, ftnlen);
     static real errmax;
     static complex transl;
     static integer laa, lda;
-    extern logical lce_();
+    extern logical lce_(complex*, complex*, integer*);
     static complex als, bls;
     static real err;
 
@@ -1906,19 +1759,7 @@ L130:
 
 } /* cchk2_ */
 
-/* Subroutine */ int cchk3_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nkb, kb, ninc, inc, nmax, incmax, a, aa, as, x, 
-	xx, xs, xt, g, z__, iorder, sname_len)
-char *sname;
-real *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nkb, *kb, *ninc, *inc, *nmax, *incmax;
-complex *a, *aa, *as, *x, *xx, *xs, *xt;
-real *g;
-complex *z__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int cchk3_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* ninc, integer* inc, integer* nmax, integer* incmax, complex* a, complex* aa, complex* as, complex* x, complex* xx, complex* xs, complex* xt, real* g, complex* z__, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -1937,10 +1778,10 @@ ftnlen sname_len;
     static logical full, null;
     static char uplo[1], cdiag[14];
     static integer i__, k, n;
-    extern /* Subroutine */ int cmake_();
+    extern /* Subroutine */ int cmake_(char*, char*, char*, integer*, integer*, complex*, integer*, complex*, integer*, integer*, integer*, logical*, complex*, ftnlen, ftnlen, ftnlen);
     static char diags[1];
     static logical isame[13];
-    extern /* Subroutine */ int cmvch_();
+    extern /* Subroutine */ int cmvch_(char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, complex*, real*, complex*, real*, real*, logical*, integer*, logical*, ftnlen);
     static integer nargs;
     static logical reset;
     static char cuplo[14];
@@ -1950,17 +1791,19 @@ ftnlen sname_len;
     static integer nc, ik, in;
     static logical packed;
     static integer nk, ks, ix, ns, lx;
-    extern logical lceres_();
-    extern /* Subroutine */ int cctbmv_(), cctbsv_();
+    extern logical lceres_(char*, char*, integer*, integer*, complex*, complex*, integer*, ftnlen, ftnlen);
+    extern /* Subroutine */ void cctbmv_(integer*, char*, char*, char*, integer*, integer*, complex*, integer*, complex*, integer*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ void cctbsv_(integer*, char*, char*, char*, integer*, integer*, complex*, integer*, complex*, integer*, ftnlen, ftnlen, ftnlen);
     static char ctrans[14];
-    extern /* Subroutine */ int cctpmv_();
+    extern /* Subroutine */ void cctpmv_(integer*, char*, char*, char*, integer*, complex*, complex*, integer*, ftnlen, ftnlen, ftnlen);
     static real errmax;
-    extern /* Subroutine */ int cctrmv_(), cctpsv_();
+    extern /* Subroutine */ void cctrmv_(integer*, char*, char*, char*, integer*, complex*, integer*, complex*, integer*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ void cctpsv_(integer*, char*, char*, char*, integer*, complex*, complex*, integer*, ftnlen, ftnlen, ftnlen);
     static complex transl;
-    extern /* Subroutine */ int cctrsv_();
+    extern /* Subroutine */ void cctrsv_(integer*, char*, char*, char*, integer*, complex*, integer*, complex*, integer*, ftnlen, ftnlen, ftnlen);
     static char transs[1];
     static integer laa, icd, lda;
-    extern logical lce_();
+    extern logical lce_(complex*, complex*, integer*);
     static integer ict, icu;
     static real err;
 
@@ -2418,21 +2261,7 @@ L130:
 
 } /* cchk3_ */
 
-/* Subroutine */ int cchk4_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, 
-	xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len)
-char *sname;
-real *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-complex *alf;
-integer *ninc, *inc, *nmax, *incmax;
-complex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt;
-real *g;
-complex *z__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int cchk4_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, complex* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, complex* a, complex* aa, complex* as, complex* x, complex* xx, complex* xs, complex* y, complex* yy, complex* ys, complex* yt, real* g, complex* z__, integer* iorder, ftnlen sname_len)
 {
     /* System generated locals */
     integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7;
@@ -2444,21 +2273,21 @@ ftnlen sname_len;
     static integer incx, incy;
     static logical null;
     static integer i__, j, m, n;
-    extern /* Subroutine */ int cmake_();
+    extern /* Subroutine */ int cmake_(char*, char*, char*, integer*, integer*, complex*, integer*, complex*, integer*, integer*, integer*, logical*, complex*, ftnlen, ftnlen, ftnlen);
     static complex alpha, w[1];
     static logical isame[13];
-    extern /* Subroutine */ int cmvch_();
+    extern /* Subroutine */ int cmvch_(char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, complex*, real*, complex*, real*, real*, logical*, integer*, logical*, ftnlen);
     static integer nargs;
     static logical reset;
     static integer incxs, incys, ia, nc, nd, im, in;
-    extern /* Subroutine */ int ccgerc_();
+    extern /* Subroutine */ void ccgerc_(integer*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, integer*);
     static integer ms, ix, iy, ns, lx, ly;
-    extern /* Subroutine */ int ccgeru_();
-    extern logical lceres_();
+    extern /* Subroutine */ void ccgeru_(integer*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, integer*);
+    extern logical lceres_(char*, char*, integer*, integer*, complex*, complex*, integer*, ftnlen, ftnlen);
     static real errmax;
     static complex transl;
     static integer laa, lda;
-    extern logical lce_();
+    extern logical lce_(complex*, complex*, integer*);
     static complex als;
     static real err;
 
@@ -2786,21 +2615,7 @@ L150:
 
 } /* cchk4_ */
 
-/* Subroutine */ int cchk5_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, 
-	xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len)
-char *sname;
-real *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-complex *alf;
-integer *ninc, *inc, *nmax, *incmax;
-complex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt;
-real *g;
-complex *z__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int cchk5_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, complex* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, complex* a, complex* aa, complex* as, complex* x, complex* xx, complex* xs, complex* y, complex* yy, complex* ys, complex* yt, real* g, complex* z__, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -2818,10 +2633,12 @@ ftnlen sname_len;
     static logical full, null;
     static char uplo[1];
     static integer i__, j, n;
-    extern /* Subroutine */ int cmake_(), ccher_();
+    extern /* Subroutine */ int cmake_(char*, char*, char*, integer*, integer*, complex*, integer*, complex*, integer*, integer*, integer*, logical*, complex*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ void ccher_(integer*, char*, integer*, real*, complex*, integer*, complex*, integer*, ftnlen);
     static complex alpha, w[1];
     static logical isame[13];
-    extern /* Subroutine */ int cchpr_(), cmvch_();
+    extern /* Subroutine */ void cchpr_(integer*, char*, integer*, real*, complex*, integer*, complex*, ftnlen);
+    extern /* Subroutine */ int cmvch_(char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, complex*, real*, complex*, real*, real*, logical*, integer*, logical*, ftnlen);
     static integer nargs;
     static logical reset;
     static char cuplo[14];
@@ -2832,11 +2649,11 @@ ftnlen sname_len;
     static logical packed;
     static integer ix, ns, lx;
     static real ralpha;
-    extern logical lceres_();
+    extern logical lceres_(char*, char*, integer*, integer*, complex*, complex*, integer*, ftnlen, ftnlen);
     static real errmax;
     static complex transl;
     static integer laa, lda;
-    extern logical lce_();
+    extern logical lce_(complex*, complex*, integer*);
     static real err;
 
 /*  Tests CHER and CHPR. */
@@ -3160,21 +2977,7 @@ L130:
 
 } /* cchk5_ */
 
-/* Subroutine */ int cchk6_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, 
-	xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len)
-char *sname;
-real *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-complex *alf;
-integer *ninc, *inc, *nmax, *incmax;
-complex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt;
-real *g;
-complex *z__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int cchk6_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, complex* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, complex* a, complex* aa, complex* as, complex* x, complex* xx, complex* xs, complex* y, complex* yy, complex* ys, complex* yt, real* g, complex* z__, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -3192,25 +2995,26 @@ ftnlen sname_len;
     static logical full, null;
     static char uplo[1];
     static integer i__, j, n;
-    extern /* Subroutine */ int cmake_();
+    extern /* Subroutine */ int cmake_(char*, char*, char*, integer*, integer*, complex*, integer*, complex*, integer*, integer*, integer*, logical*, complex*, ftnlen, ftnlen, ftnlen);
     static complex alpha, w[2];
     static logical isame[13];
-    extern /* Subroutine */ int cmvch_();
+    extern /* Subroutine */ int cmvch_(char*, integer*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, complex*, integer*, complex*, real*, complex*, real*, real*, logical*, integer*, logical*, ftnlen);
     static integer nargs;
     static logical reset;
     static char cuplo[14];
     static integer incxs, incys;
     static logical upper;
     static char uplos[1];
-    extern /* Subroutine */ int ccher2_(), cchpr2_();
+    extern /* Subroutine */ void ccher2_(integer*, char*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, integer*, ftnlen);
+    extern /* Subroutine */ void cchpr2_(integer*, char*, integer*, complex*, complex*, integer*, complex*, integer*, complex*, ftnlen);
     static integer ia, ja, ic, nc, jj, lj, in;
     static logical packed;
     static integer ix, iy, ns, lx, ly;
-    extern logical lceres_();
+    extern logical lceres_(char*, char*, integer*, integer*, complex*, complex*, integer*, ftnlen, ftnlen);
     static real errmax;
     static complex transl;
     static integer laa, lda;
-    extern logical lce_();
+    extern logical lce_(complex*, complex*, integer*);
     static complex als;
     static real err;
 
@@ -3597,24 +3401,7 @@ L170:
 
 } /* cchk6_ */
 
-/* Subroutine */ int cmvch_(trans, m, n, alpha, a, nmax, x, incx, beta, y, 
-	incy, yt, g, yy, eps, err, fatal, nout, mv, trans_len)
-char *trans;
-integer *m, *n;
-complex *alpha, *a;
-integer *nmax;
-complex *x;
-integer *incx;
-complex *beta, *y;
-integer *incy;
-complex *yt;
-real *g;
-complex *yy;
-real *eps, *err;
-logical *fatal;
-integer *nout;
-logical *mv;
-ftnlen trans_len;
+/* Subroutine */ int cmvch_(char* trans, integer* m, integer* n, complex* alpha, complex* a, integer* nmax, complex* x, integer* incx, complex* beta, complex* y, integer* incy, complex* yt, real* g, complex* yy, real* eps, real* err, logical* fatal, integer* nout, logical* mv, ftnlen trans_len)
 {
 
     /* System generated locals */
@@ -3812,9 +3599,7 @@ L80:
 
 } /* cmvch_ */
 
-logical lce_(ri, rj, lr)
-complex *ri, *rj;
-integer *lr;
+logical lce_(complex* ri, complex* rj, integer* lr)
 {
     /* System generated locals */
     integer i__1, i__2, i__3;
@@ -3861,13 +3646,7 @@ L30:
 
 } /* lce_ */
 
-logical lceres_(type__, uplo, m, n, aa, as, lda, type_len, uplo_len)
-char *type__, *uplo;
-integer *m, *n;
-complex *aa, *as;
-integer *lda;
-ftnlen type_len;
-ftnlen uplo_len;
+logical lceres_(char* type__, char* uplo, integer* m, integer* n, complex* aa, complex* as, integer* lda, ftnlen type_len, ftnlen uplo_len)
 {
     /* System generated locals */
     integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2, i__3, i__4;
@@ -3960,9 +3739,7 @@ L80:
 
 } /* lceres_ */
 
-/* Complex */ VOID cbeg_( ret_val, reset)
-complex * ret_val;
-logical *reset;
+/* Complex */ VOID cbeg_(complex* ret_val, logical* reset)
 {
     /* System generated locals */
     real r__1, r__2;
@@ -4023,8 +3800,7 @@ L10:
 
 } /* cbeg_ */
 
-doublereal sdiff_(x, y)
-real *x, *y;
+doublereal sdiff_(real* x, real* y)
 {
     /* System generated locals */
     real ret_val;
@@ -4044,19 +3820,7 @@ real *x, *y;
 
 } /* sdiff_ */
 
-/* Subroutine */ int cmake_(type__, uplo, diag, m, n, a, nmax, aa, lda, kl, 
-	ku, reset, transl, type_len, uplo_len, diag_len)
-char *type__, *uplo, *diag;
-integer *m, *n;
-complex *a;
-integer *nmax;
-complex *aa;
-integer *lda, *kl, *ku;
-logical *reset;
-complex *transl;
-ftnlen type_len;
-ftnlen uplo_len;
-ftnlen diag_len;
+/* Subroutine */ int cmake_(char* type__, char* uplo, char* diag, integer* m, integer* n, complex* a, integer* nmax, complex* aa, integer* lda, integer* kl, integer* ku, logical* reset, complex* transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len)
 {
     /* System generated locals */
     integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
@@ -4064,7 +3828,7 @@ ftnlen diag_len;
     complex q__1, q__2;
 
     /* Local variables */
-    extern /* Complex */ VOID cbeg_();
+    extern /* Complex */ VOID cbeg_(complex*, logical*);
     static integer ibeg, iend, ioff;
     static logical unit;
     static integer i__, j;
diff --git a/ctest/c_cblat3c.c b/ctest/c_cblat3c.c
index 1f4b967b0..5ad9b8bd8 100644
--- a/ctest/c_cblat3c.c
+++ b/ctest/c_cblat3c.c
@@ -242,130 +242,6 @@ typedef struct Namelist Namelist;
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-#if 0
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Common Block Declarations */
diff --git a/ctest/c_cblat3c_3m.c b/ctest/c_cblat3c_3m.c
new file mode 100644
index 000000000..b5d6bf9cb
--- /dev/null
+++ b/ctest/c_cblat3c_3m.c
@@ -0,0 +1,3942 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#include "common.h"
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER /* MSVC branch: helpers are written in terms of _Fcomplex/_Dcomplex */
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} /* by-value copy of *z built from its r and i fields */
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} /* double-precision counterpart of Cf */
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} /* reinterprets the struct pointer; no copy is made */
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} /* double-precision counterpart of _pCf */
+#else /* other compilers: helpers are written in terms of _Complex float/double */
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} /* by-value r + i*I built from the struct fields */
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} /* double-precision counterpart of Cf */
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} /* reinterprets the struct pointer; no copy is made */
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} /* double-precision counterpart of _pCf */
+#endif
+#define pCf(z) (*_pCf(z)) /* lvalue view of *z through _pCf; used as the assignment target by the c_* macros */
+#define pCd(z) (*_pCd(z)) /* lvalue view of *z through _pCd; used as the assignment target by the z_* macros */
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+
+
+/* Common Block Declarations */
+
+struct {
+    integer infot, noutc;
+    logical ok, lerr;
+} infoc_;
+
+#define infoc_1 infoc_
+
+struct {
+    char srnamt[12];
+} srnamc_;
+
+#define srnamc_1 srnamc_
+
+/* Table of constant values */
+
+static complex c_b1 = {0.f,0.f};
+static complex c_b2 = {1.f,0.f};
+static integer c__1 = 1;
+static integer c__65 = 65;
+static integer c__6 = 6;
+static real c_b91 = 1.f;
+static logical c_true = TRUE_;
+static integer c__0 = 0;
+static logical c_false = FALSE_;
+
+int /* Main program */ main(void)
+{
+    /* Initialized data */
+    /* 14 bytes per row so that "cblas_cgemm3m" (13 chars) keeps its NUL terminator. */
+    static char snames[9][14] = {"cblas_cgemm3m", "cblas_chemm ", "cblas_csymm ", 
+	    "cblas_ctrmm ", "cblas_ctrsm ", "cblas_cherk ", "cblas_csyrk ", 
+	    "cblas_cher2k", "cblas_csyr2k"};
+
+    /* System generated locals */
+    integer i__1, i__2, i__3, i__4, i__5;
+    real r__1;
+
+    /* Local variables */
+    integer nalf, idim[9];
+    logical same;
+    integer nbet, ntra;
+    logical rewi;
+    extern /* Subroutine */ int cchk1_(char *, real *, real *, integer *, 
+	    integer *, logical *, logical *, logical *, integer *, integer *, 
+	    integer *, complex *, integer *, complex *, integer *, complex *, 
+	    complex *, complex *, complex *, complex *, complex *, complex *, 
+	    complex *, complex *, complex *, real *, integer *), 
+	    cchk2_(char *, real *, real *, integer *, integer *, logical *, 
+	    logical *, logical *, integer *, integer *, integer *, complex *, 
+	    integer *, complex *, integer *, complex *, complex *, complex *, 
+	    complex *, complex *, complex *, complex *, complex *, complex *, 
+	    complex *, real *, integer *), cchk3_(char *, real *, 
+	    real *, integer *, integer *, logical *, logical *, logical *, 
+	    integer *, integer *, integer *, complex *, integer *, complex *, 
+	    complex *, complex *, complex *, complex *, complex *, complex *, 
+	    real *, complex *, integer *), cchk4_(char *, real *, 
+	    real *, integer *, integer *, logical *, logical *, logical *, 
+	    integer *, integer *, integer *, complex *, integer *, complex *, 
+	    integer *, complex *, complex *, complex *, complex *, complex *, 
+	    complex *, complex *, complex *, complex *, complex *, real *, 
+	    integer *), cchk5_(char *, real *, real *, integer *, 
+	    integer *, logical *, logical *, logical *, integer *, integer *, 
+	    integer *, complex *, integer *, complex *, integer *, complex *, 
+	    complex *, complex *, complex *, complex *, complex *, complex *, 
+	    complex *, complex *, real *, complex *, integer *);
+    complex c__[4225]	/* was [65][65] */;
+    real g[65];
+    integer i__, j, n;
+    logical fatal;
+    complex w[130];
+    extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, 
+	    integer *, complex *, complex *, integer *, complex *, integer *, 
+	    complex *, complex *, integer *, complex *, real *, complex *, 
+	    integer *, real *, real *, logical *, integer *, logical *);
+    extern real sdiff_(real *, real *);
+    logical trace;
+    integer nidim;
+    char snaps[32];
+    integer isnum;
+    logical ltest[9];
+    complex aa[4225], ab[8450]	/* was [65][130] */, bb[4225], cc[4225], as[
+	    4225], bs[4225], cs[4225], ct[65];
+    logical sfatal, corder;
+    char snamet[12], transa[1], transb[1];
+    real thresh;
+    logical rorder;
+    extern /* Subroutine */ int cc3chke_(char *);
+    integer layout;
+    logical ltestt, tsterr;
+    complex alf[7];
+    extern logical lce_(complex *, complex *, integer *);
+    complex bet[7];
+    real eps, err;
+    char tmpchar;
+
+/*  Test program for the COMPLEX Level 3 Blas (CGEMM3M variant of CBLAT3). */
+
+/*  The program must be driven by a short data file. The first 13 records */
+/*  of the file are read using list-directed input, the last 9 records */
+/*  are read using the format ( A12, L2 ). An annotated example of a data */
+/*  file can be obtained by deleting the first 3 characters from the */
+/*  following 22 lines: */
+/*  'CBLAT3.SNAP'     NAME OF SNAPSHOT OUTPUT FILE */
+/*  -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) */
+/*  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. */
+/*  F        LOGICAL FLAG, T TO STOP ON FAILURES. */
+/*  T        LOGICAL FLAG, T TO TEST ERROR EXITS. */
+/*  2        0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH */
+/*  16.0     THRESHOLD VALUE OF TEST RATIO */
+/*  6                 NUMBER OF VALUES OF N */
+/*  0 1 2 3 5 9       VALUES OF N */
+/*  3                 NUMBER OF VALUES OF ALPHA */
+/*  (0.0,0.0) (1.0,0.0) (0.7,-0.9)       VALUES OF ALPHA */
+/*  3                 NUMBER OF VALUES OF BETA */
+/*  (0.0,0.0) (1.0,0.0) (1.3,-1.1)       VALUES OF BETA */
+/*  cblas_cgemm3m T PUT F FOR NO TEST. SAME COLUMNS. */
+/*  cblas_chemm  T PUT F FOR NO TEST. SAME COLUMNS. */
+/*  cblas_csymm  T PUT F FOR NO TEST. SAME COLUMNS. */
+/*  cblas_ctrmm  T PUT F FOR NO TEST. SAME COLUMNS. */
+/*  cblas_ctrsm  T PUT F FOR NO TEST. SAME COLUMNS. */
+/*  cblas_cherk  T PUT F FOR NO TEST. SAME COLUMNS. */
+/*  cblas_csyrk  T PUT F FOR NO TEST. SAME COLUMNS. */
+/*  cblas_cher2k T PUT F FOR NO TEST. SAME COLUMNS. */
+/*  cblas_csyr2k T PUT F FOR NO TEST. SAME COLUMNS. */
+
+/*  See: */
+
+/*     Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. */
+/*     A Set of Level 3 Basic Linear Algebra Subprograms. */
+
+/*     Technical Memorandum No.88 (Revision 1), Mathematics and */
+/*     Computer Science Division, Argonne National Laboratory, 9700 */
+/*     South Cass Avenue, Argonne, Illinois 60439, US. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+
+    infoc_1.noutc = 6;
+
+/*     Read name and unit number for snapshot output file and open file. */
+    char line[80];
+    /* snaps holds 32 bytes, so the name conversion is capped at 31 characters. */
+    fgets(line,80,stdin);
+    sscanf(line,"'%31s'",snaps);
+    fgets(line,80,stdin);
+#ifdef USE64BITINT
+    sscanf(line,"%ld",&ntra);
+#else
+    sscanf(line,"%d",&ntra);
+#endif
+    trace = ntra >= 0;
+    if (trace) {
+/*	o__1.oerr = 0;
+	o__1.ounit = ntra;
+	o__1.ofnmlen = 32;
+	o__1.ofnm = snaps;
+	o__1.orl = 0;
+	o__1.osta = 0;
+	o__1.oacc = 0;
+	o__1.ofm = 0;
+	o__1.oblnk = 0;
+	f_open(&o__1);*/
+    }
+/*     Read the T/F flag that directs rewinding of the snapshot file. */
+   fgets(line,80,stdin);
+   tmpchar = 'F'; sscanf(line,"%c",&tmpchar);
+   rewi = (tmpchar=='T') && trace;
+/*     Read the flag that directs stopping on any failure. */
+   fgets(line,80,stdin);
+   sscanf(line,"%c",&tmpchar);
+   sfatal=FALSE_;
+   if (tmpchar=='T')sfatal=TRUE_;
+   fgets(line,80,stdin);
+   sscanf(line,"%c",&tmpchar);
+   tsterr=FALSE_;
+   if (tmpchar=='T')tsterr=TRUE_;
+   fgets(line,80,stdin);
+   sscanf(line,"%d",&layout);
+   fgets(line,80,stdin);
+   sscanf(line,"%f",&thresh);
+
+
+/*     Read and check the parameter values for the tests. */
+
+/*     Values of N */
+   fgets(line,80,stdin);
+#ifdef USE64BITINT
+   sscanf(line,"%ld",&nidim);
+#else
+   sscanf(line,"%d",&nidim);
+#endif
+
+    if (nidim < 1 || nidim > 9) {
+	fprintf(stderr,"NUMBER OF VALUES OF N IS LESS THAN 1 OR GREATER THAN 9\n");
+	goto L220;
+    }
+   fgets(line,80,stdin);
+#ifdef USE64BITINT
+   sscanf(line,"%ld %ld %ld %ld %ld %ld %ld %ld %ld",&idim[0],&idim[1],&idim[2],
+    &idim[3],&idim[4],&idim[5],&idim[6],&idim[7],&idim[8]);
+#else
+   sscanf(line,"%d %d %d %d %d %d %d %d %d",&idim[0],&idim[1],&idim[2],
+    &idim[3],&idim[4],&idim[5],&idim[6],&idim[7],&idim[8]);
+#endif
+    i__1 = nidim;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	if (idim[i__ - 1] < 0 || idim[i__ - 1] > 65) {
+	fprintf(stderr,"VALUE OF N IS LESS THAN 0 OR GREATER THAN 65\n");
+	    goto L220;
+	}
+/* L10: */
+    }
+/*     Values of ALPHA */
+   fgets(line,80,stdin);
+#ifdef USE64BITINT
+   sscanf(line,"%ld",&nalf);
+#else
+   sscanf(line,"%d",&nalf);
+#endif
+    if (nalf < 1 || nalf > 7) {
+	fprintf(stderr,"NUMBER OF VALUES OF ALPHA IS LESS THAN 1 OR GREATER THAN 7\n");
+	goto L220;
+    }
+   fgets(line,80,stdin);
+   sscanf(line,"(%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f)",&alf[0].r,&alf[0].i,&alf[1].r,&alf[1].i,&alf[2].r,&alf[2].i,&alf[3].r,&alf[3].i,
+   &alf[4].r,&alf[4].i,&alf[5].r,&alf[5].i,&alf[6].r,&alf[6].i);
+
+//    i__1 = nalf;
+//    for (i__ = 1; i__ <= i__1; ++i__) {
+//	do_lio(&c__6, &c__1, (char *)&alf[i__ - 1], (ftnlen)sizeof(complex));
+//    }
+/*     Values of BETA */
+   fgets(line,80,stdin);
+#ifdef USE64BITINT
+   sscanf(line,"%ld",&nbet);
+#else
+   sscanf(line,"%d",&nbet);
+#endif
+    if (nbet < 1 || nbet > 7) {
+	fprintf(stderr,"NUMBER OF VALUES OF BETA IS LESS THAN 1 OR GREATER THAN 7\n");
+	goto L220;
+    }
+   fgets(line,80,stdin);
+   sscanf(line,"(%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f) (%f,%f)",&bet[0].r,&bet[0].i,&bet[1].r,&bet[1].i,&bet[2].r,&bet[2].i,&bet[3].r,&bet[3].i,
+   &bet[4].r,&bet[4].i,&bet[5].r,&bet[5].i,&bet[6].r,&bet[6].i);
+
+
+/*     Report values of parameters. */
+
+    printf("TESTS OF THE COMPLEX    LEVEL 3 BLAS\nTHE FOLLOWING PARAMETER VALUES WILL BE USED:\n");
+    printf(" FOR N");
+    for (i__ =1; i__ <=nidim;++i__) printf(" %d",(int)idim[i__-1]); /* integer may be 64-bit; values were checked to be <= 65 */
+    printf("\n");
+    printf(" FOR ALPHA");
+    for (i__ =1; i__ <=nalf;++i__) printf(" (%f,%f)",alf[i__-1].r,alf[i__-1].i);
+    printf("\n");
+    printf(" FOR BETA");
+    for (i__ =1; i__ <=nbet;++i__) printf(" (%f,%f)",bet[i__-1].r,bet[i__-1].i);
+    printf("\n");
+
+    if (! tsterr) {
+      printf(" ERROR-EXITS WILL NOT BE TESTED\n"); 
+    }
+    printf("ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LESS THAN %f\n",thresh);
+    rorder = FALSE_;
+    corder = FALSE_;
+    if (layout == 2) {
+	rorder = TRUE_;
+	corder = TRUE_;
+        printf("COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED\n");
+    } else if (layout == 1) {
+	rorder = TRUE_;
+        printf("ROW-MAJOR DATA LAYOUT IS TESTED\n");
+    } else if (layout == 0) {
+	corder = TRUE_;
+        printf("COLUMN-MAJOR DATA LAYOUT IS TESTED\n");
+    }
+
+/*     Read names of subroutines and flags which indicate */
+/*     whether they are to be tested. */
+
+    for (i__ = 1; i__ <= 9; ++i__) {
+	ltest[i__ - 1] = FALSE_;
+/* L20: */
+    }
+L30:
+   if (! fgets(line,80,stdin)) {
+	goto L60;
+    }
+   i__1 = sscanf(line,"%12c %c",snamet,&tmpchar);
+   ltestt=FALSE_;
+   if (tmpchar=='T')ltestt=TRUE_;
+    if (i__1 < 2) {
+	goto L60;
+    }
+    for (i__ = 1; i__ <= 9; ++i__) {
+	if (s_cmp(snamet, snames[i__ - 1] , (ftnlen)12, (ftnlen)12) == 
+		0) {
+	    goto L50;
+	}
+/* L40: */
+    }
+    printf("SUBPROGRAM NAME %.12s NOT RECOGNIZED\n****** TESTS ABANDONED ******\n",snamet); /* snamet has no NUL */
+    exit(1);
+L50:
+    ltest[i__ - 1] = ltestt;
+    goto L30;
+
+L60:
+/*    cl__1.cerr = 0;
+    cl__1.cunit = 5;
+    cl__1.csta = 0;
+    f_clos(&cl__1);*/
+
+/*     Compute EPS (the machine precision). */
+
+    eps = 1.f;
+L70:
+    r__1 = eps + 1.f;
+    if (sdiff_(&r__1, &c_b91) == 0.f) {
+	goto L80;
+    }
+    eps *= .5f;
+    goto L70;
+L80:
+    eps += eps;
+    printf("RELATIVE MACHINE PRECISION IS TAKEN TO BE %9.1g\n",eps);
+
+/*     Check the reliability of CMMCH using exact data. */
+
+    n = 32;
+    i__1 = n;
+    for (j = 1; j <= i__1; ++j) {
+	i__2 = n;
+	for (i__ = 1; i__ <= i__2; ++i__) {
+	    i__3 = i__ + j * 65 - 66;
+/* Computing MAX */
+	    i__5 = i__ - j + 1;
+	    i__4 = f2cmax(i__5,0);
+	    ab[i__3].r = (real) i__4, ab[i__3].i = 0.f;
+/* L90: */
+	}
+	i__2 = j + 4224;
+	ab[i__2].r = (real) j, ab[i__2].i = 0.f;
+	i__2 = (j + 65) * 65 - 65;
+	ab[i__2].r = (real) j, ab[i__2].i = 0.f;
+	i__2 = j - 1;
+	c__[i__2].r = 0.f, c__[i__2].i = 0.f;
+/* L100: */
+    }
+    i__1 = n;
+    for (j = 1; j <= i__1; ++j) {
+	i__2 = j - 1;
+	i__3 = j * ((j + 1) * j) / 2 - (j + 1) * j * (j - 1) / 3;
+	cc[i__2].r = (real) i__3, cc[i__2].i = 0.f;
+/* L110: */
+    }
+/*     CC holds the exact result. On exit from CMMCH CT holds */
+/*     the result computed by CMMCH. */
+    *(unsigned char *)transa = 'N';
+    *(unsigned char *)transb = 'N';
+    cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], &
+	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, 
+	    &c__6, &c_true);
+    same = lce_(cc, ct, &n);
+    if (! same || err != 0.f) {
+      printf("ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n");
+      printf("CMMCH WAS CALLED WITH TRANSA = %c AND TRANSB = %c\n", transa[0],transb[0]);
+      printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err);
+      printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n");
+      printf("****** TESTS ABANDONED ******\n");
+      exit(1);
+    }
+    *(unsigned char *)transb = 'C';
+    cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], &
+	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, 
+	    &c__6, &c_true);
+    same = lce_(cc, ct, &n);
+    if (! same || err != 0.f) {
+      printf("ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n");
+      printf("CMMCH WAS CALLED WITH TRANSA = %c AND TRANSB = %c\n", transa[0],transb[0]);
+      printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err);
+      printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n");
+      printf("****** TESTS ABANDONED ******\n");
+      exit(1);
+    }
+    i__1 = n;
+    for (j = 1; j <= i__1; ++j) {
+	i__2 = j + 4224;
+	i__3 = n - j + 1;
+	ab[i__2].r = (real) i__3, ab[i__2].i = 0.f;
+	i__2 = (j + 65) * 65 - 65;
+	i__3 = n - j + 1;
+	ab[i__2].r = (real) i__3, ab[i__2].i = 0.f;
+/* L120: */
+    }
+    i__1 = n;
+    for (j = 1; j <= i__1; ++j) {
+	i__2 = n - j;
+	i__3 = j * ((j + 1) * j) / 2 - (j + 1) * j * (j - 1) / 3;
+	cc[i__2].r = (real) i__3, cc[i__2].i = 0.f;
+/* L130: */
+    }
+    *(unsigned char *)transa = 'C';
+    *(unsigned char *)transb = 'N';
+    cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], &
+	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, 
+	    &c__6, &c_true);
+    same = lce_(cc, ct, &n);
+    if (! same || err != 0.f) {
+      printf("ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n");
+      printf("CMMCH WAS CALLED WITH TRANSA = %c AND TRANSB = %c\n", transa[0],transb[0]);
+      printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err);
+      printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n");
+      printf("****** TESTS ABANDONED ******\n");
+      exit(1);
+    }
+    *(unsigned char *)transb = 'C';
+    cmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], &
+	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, 
+	    &c__6, &c_true);
+    same = lce_(cc, ct, &n);
+    if (! same || err != 0.f) {
+      printf("ERROR IN CMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n");
+      printf("CMMCH WAS CALLED WITH TRANSA = %c AND TRANSB = %c\n", transa[0],transb[0]);
+      printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err);
+      printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n");
+      printf("****** TESTS ABANDONED ******\n");
+      exit(1);
+    }
+
+/*     Test each subroutine in turn. */
+
+    for (isnum = 1; isnum <= 9; ++isnum) {
+	if (! ltest[isnum - 1]) {
+/*           Subprogram is not to be tested. */
+           printf("%12s WAS NOT TESTED\n",snames[isnum-1]);
+	} else {
+	    s_copy(srnamc_1.srnamt, snames[isnum - 1], (ftnlen)12, (
+		    ftnlen)12);
+/*           Test error exits. */
+	    if (tsterr) {
+		cc3chke_(snames[isnum - 1]);
+	    }
+/*           Test computations. */
+	    infoc_1.infot = 0;
+	    infoc_1.ok = TRUE_;
+	    fatal = FALSE_;
+	    switch (isnum) {
+		case 1:  goto L140;
+		case 2:  goto L150;
+		case 3:  goto L150;
+		case 4:  goto L160;
+		case 5:  goto L160;
+		case 6:  goto L170;
+		case 7:  goto L170;
+		case 8:  goto L180;
+		case 9:  goto L180;
+	    }
+/*           Test CGEMM3M, 01. */
+L140:
+	    if (corder) {
+		cchk1_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
+			 cc, cs, ct, g, &c__0);
+	    }
+	    if (rorder) {
+		cchk1_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
+			 cc, cs, ct, g, &c__1);
+	    }
+	    goto L190;
+/*           Test CHEMM, 02, CSYMM, 03. */
+L150:
+	    if (corder) {
+		cchk2_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
+			 cc, cs, ct, g, &c__0);
+	    }
+	    if (rorder) {
+		cchk2_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
+			 cc, cs, ct, g, &c__1);
+	    }
+	    goto L190;
+/*           Test CTRMM, 04, CTRSM, 05. */
+L160:
+	    if (corder) {
+		cchk3_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, &
+			c__0);
+	    }
+	    if (rorder) {
+		cchk3_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, &
+			c__1);
+	    }
+	    goto L190;
+/*           Test CHERK, 06, CSYRK, 07. */
+L170:
+	    if (corder) {
+		cchk4_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
+			 cc, cs, ct, g, &c__0);
+	    }
+	    if (rorder) {
+		cchk4_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
+			 cc, cs, ct, g, &c__1);
+	    }
+	    goto L190;
+/*           Test CHER2K, 08, CSYR2K, 09. */
+L180:
+	    if (corder) {
+		cchk5_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, 
+			ct, g, w, &c__0);
+	    }
+	    if (rorder) {
+		cchk5_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, 
+			ct, g, w, &c__1);
+	    }
+	    goto L190;
+
+L190:
+	    if (fatal && sfatal) {
+		goto L210;
+	    }
+	}
+/* L200: */
+    }
+    printf("\nEND OF TESTS\n");
+    goto L230;
+
+L210:
+    printf("\n****** FATAL ERROR - TESTS ABANDONED ******\n");
+    goto L230;
+
+L220:
+    printf("AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM\n");
+    printf("****** TESTS ABANDONED ******\n");
+L230:
+    if (trace) {
+/*	cl__1.cerr = 0;
+	cl__1.cunit = ntra;
+	cl__1.csta = 0;
+	f_clos(&cl__1);*/
+    }
+/*    cl__1.cerr = 0;
+    cl__1.cunit = 6;
+    cl__1.csta = 0;
+    f_clos(&cl__1);
+    s_stop("", (ftnlen)0);*/
+     exit(0);
+
+/*     End of CBLAT3. */
+
+    return 0;
+} /* MAIN__ */
+
+/* Subroutine */ int cchk1_(char *sname, real *eps, real *thresh, integer *
+	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, 
+	integer *nidim, integer *idim, integer *nalf, complex *alf, integer *
+	nbet, complex *bet, integer *nmax, complex *a, complex *aa, complex *
+	as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc, 
+	complex *cs, complex *ct, real *g, integer *iorder)
+{
+    /* Initialized data */
+
+    static char ich[3] = "NTC";
+
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
+	    i__3, i__4, i__5, i__6, i__7, i__8;
+
+    /* Local variables */
+    complex beta;
+    integer ldas, ldbs, ldcs;
+    logical same, null;
+    integer i__, k, m, n;
+    extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, 
+	    integer *, complex *, integer *, complex *, integer *, logical *, 
+	    complex *);
+    complex alpha;
+    extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, 
+	    integer *, complex *, complex *, integer *, complex *, integer *, 
+	    complex *, complex *, integer *, complex *, real *, complex *, 
+	    integer *, real *, real *, logical *, integer *, logical *);
+    logical isame[13], trana, tranb;
+    integer nargs;
+    logical reset;
+    extern /* Subroutine */ int cprcn1_(integer *, integer *, char *, integer 
+	    *, char *, char *, integer *, integer *, integer *, complex *, 
+	    integer *, integer *, complex *, integer *);
+    integer ia, ib, ma, mb, na, nb, nc, ik, im, in;
+    extern /* Subroutine */ int ccgemm3m_(integer *, char *, char *, integer *, 
+	    integer *, integer *, complex *, complex *, integer *, complex *, 
+	    integer *, complex *, complex *, integer *);
+    integer ks, ms, ns;
+    extern logical lceres_(char *, char *, integer *, integer *, complex *, 
+	    complex *, integer *);
+    char tranas[1], tranbs[1], transa[1], transb[1];
+    real errmax;
+    integer ica, icb, laa, lbb, lda, lcc, ldb, ldc;
+    extern logical lce_(complex *, complex *, integer *);
+    complex als, bls;
+    real err;
+
+/*  Tests CGEMM3M through the ccgemm3m_ wrapper: every m, n, k taken from idim, */
+/*  transa/transb from "NTC", alpha from alf and beta from bet is exercised. */
+/*  Auxiliary routine for test program for Level 3 Blas. */
+/*  On failure *fatal is set and the call is echoed via cprcn1_; returns 0. */
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+    /* Parameter adjustments (shift pointers so 1-based indexing works) */
+    --idim;
+    --alf;
+    --bet;
+    --g;
+    --ct;
+    --cs;
+    --cc;
+    c_dim1 = *nmax;
+    c_offset = 1 + c_dim1 * 1;
+    c__ -= c_offset;
+    --bs;
+    --bb;
+    b_dim1 = *nmax;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    --as;
+    --aa;
+    a_dim1 = *nmax;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+
+    /* Function Body */
+
+    nargs = 13;
+    nc = 0;
+    reset = TRUE_;
+    errmax = 0.f;
+
+    i__1 = *nidim;
+    for (im = 1; im <= i__1; ++im) {
+	m = idim[im];
+
+	i__2 = *nidim;
+	for (in = 1; in <= i__2; ++in) {
+	    n = idim[in];
+/*           Set LDC to 1 more than minimum value if room. */
+	    ldc = m;
+	    if (ldc < *nmax) {
+		++ldc;
+	    }
+/*           Skip tests if not enough room. */
+	    if (ldc > *nmax) {
+		goto L100;
+	    }
+	    lcc = ldc * n;
+	    null = n <= 0 || m <= 0;
+
+	    i__3 = *nidim;
+	    for (ik = 1; ik <= i__3; ++ik) {
+		k = idim[ik];
+
+		for (ica = 1; ica <= 3; ++ica) {
+		    *(unsigned char *)transa = *(unsigned char *)&ich[ica - 1]
+			    ;
+		    trana = *(unsigned char *)transa == 'T' || *(unsigned 
+			    char *)transa == 'C';
+
+		    if (trana) {
+			ma = k;
+			na = m;
+		    } else {
+			ma = m;
+			na = k;
+		    }
+/*                 Set LDA to 1 more than minimum value if room. */
+		    lda = ma;
+		    if (lda < *nmax) {
+			++lda;
+		    }
+/*                 Skip tests if not enough room. */
+		    if (lda > *nmax) {
+			goto L80;
+		    }
+		    laa = lda * na;
+
+/*                 Generate the matrix A. */
+
+		    cmake_("ge", " ", " ", &ma, &na, &a[a_offset], nmax, &aa[
+			    1], &lda, &reset, &c_b1);
+
+		    for (icb = 1; icb <= 3; ++icb) {
+			*(unsigned char *)transb = *(unsigned char *)&ich[icb 
+				- 1];
+			tranb = *(unsigned char *)transb == 'T' || *(unsigned 
+				char *)transb == 'C';
+
+			if (tranb) {
+			    mb = n;
+			    nb = k;
+			} else {
+			    mb = k;
+			    nb = n;
+			}
+/*                    Set LDB to 1 more than minimum value if room. */
+			ldb = mb;
+			if (ldb < *nmax) {
+			    ++ldb;
+			}
+/*                    Skip tests if not enough room. */
+			if (ldb > *nmax) {
+			    goto L70;
+			}
+			lbb = ldb * nb;
+
+/*                    Generate the matrix B. */
+
+			cmake_("ge", " ", " ", &mb, &nb, &b[b_offset], nmax, &
+				bb[1], &ldb, &reset, &c_b1);
+
+			i__4 = *nalf;
+			for (ia = 1; ia <= i__4; ++ia) {
+			    i__5 = ia;
+			    alpha.r = alf[i__5].r, alpha.i = alf[i__5].i;
+
+			    i__5 = *nbet;
+			    for (ib = 1; ib <= i__5; ++ib) {
+				i__6 = ib;
+				beta.r = bet[i__6].r, beta.i = bet[i__6].i;
+
+/*                          Generate the matrix C. */
+
+				cmake_("ge", " ", " ", &m, &n, &c__[c_offset],
+					 nmax, &cc[1], &ldc, &reset, &c_b1);
+
+				++nc;
+
+/*                          Save every datum before calling the */
+/*                          subroutine. */
+
+				*(unsigned char *)tranas = *(unsigned char *)
+					transa;
+				*(unsigned char *)tranbs = *(unsigned char *)
+					transb;
+				ms = m;
+				ns = n;
+				ks = k;
+				als.r = alpha.r, als.i = alpha.i;
+				i__6 = laa;
+				for (i__ = 1; i__ <= i__6; ++i__) {
+				    i__7 = i__;
+				    i__8 = i__;
+				    as[i__7].r = aa[i__8].r, as[i__7].i = aa[
+					    i__8].i;
+/* L10: */
+				}
+				ldas = lda;
+				i__6 = lbb;
+				for (i__ = 1; i__ <= i__6; ++i__) {
+				    i__7 = i__;
+				    i__8 = i__;
+				    bs[i__7].r = bb[i__8].r, bs[i__7].i = bb[
+					    i__8].i;
+/* L20: */
+				}
+				ldbs = ldb;
+				bls.r = beta.r, bls.i = beta.i;
+				i__6 = lcc;
+				for (i__ = 1; i__ <= i__6; ++i__) {
+				    i__7 = i__;
+				    i__8 = i__;
+				    cs[i__7].r = cc[i__8].r, cs[i__7].i = cc[
+					    i__8].i;
+/* L30: */
+				}
+				ldcs = ldc;
+
+/*                          Call the subroutine. */
+
+				if (*trace) {
+				    cprcn1_(ntra, &nc, sname, iorder, transa, 
+					    transb, &m, &n, &k, &alpha, &lda, 
+					    &ldb, &beta, &ldc);
+				}
+				if (*rewi) {
+/*				    al__1.aerr = 0;
+				    al__1.aunit = *ntra;
+				    f_rew(&al__1); */
+				}
+				ccgemm3m_(iorder, transa, transb, &m, &n, &k, &
+					alpha, &aa[1], &lda, &bb[1], &ldb, &
+					beta, &cc[1], &ldc);
+
+/*                          Check if error-exit was taken incorrectly. */
+
+				if (! infoc_1.ok) {
+//				    io___128.ciunit = *nout;
+//				    s_wsfe(&io___128);
+//				    e_wsfe();
+				    printf(" *** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n");
+				    *fatal = TRUE_;
+				    goto L120;
+				}
+
+/*                          See what data changed inside subroutines. */
+
+				isame[0] = *(unsigned char *)transa == *(
+					unsigned char *)tranas;
+				isame[1] = *(unsigned char *)transb == *(
+					unsigned char *)tranbs;
+				isame[2] = ms == m;
+				isame[3] = ns == n;
+				isame[4] = ks == k;
+				isame[5] = als.r == alpha.r && als.i == 
+					alpha.i;
+				isame[6] = lce_(&as[1], &aa[1], &laa);
+				isame[7] = ldas == lda;
+				isame[8] = lce_(&bs[1], &bb[1], &lbb);
+				isame[9] = ldbs == ldb;
+				isame[10] = bls.r == beta.r && bls.i == 
+					beta.i;
+				if (null) {
+				    isame[11] = lce_(&cs[1], &cc[1], &lcc);
+				} else {
+				    isame[11] = lceres_("ge", " ", &m, &n, &
+					    cs[1], &cc[1], &ldc);
+				}
+				isame[12] = ldcs == ldc;
+
+/*                          If data was incorrectly changed, report */
+/*                          and return. */
+
+				same = TRUE_;
+				i__6 = nargs;
+				for (i__ = 1; i__ <= i__6; ++i__) {
+				    same = same && isame[i__ - 1];
+				    if (! isame[i__ - 1]) {
+					printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__);
+				    }
+/* L40: */
+				}
+				if (! same) {
+				    *fatal = TRUE_;
+				    goto L120;
+				}
+
+				if (! null) {
+
+/*                             Check the result. */
+
+				    cmmch_(transa, transb, &m, &n, &k, &alpha,
+					     &a[a_offset], nmax, &b[b_offset],
+					     nmax, &beta, &c__[c_offset], 
+					    nmax, &ct[1], &g[1], &cc[1], &ldc,
+					     eps, &err, fatal, nout, &c_true);
+				    errmax = f2cmax(errmax,err);
+/*                             If got really bad answer, report and */
+/*                             return. */
+				    if (*fatal) {
+					goto L120;
+				    }
+				}
+
+/* L50: */
+			    }
+
+/* L60: */
+			}
+
+L70:
+			;
+		    }
+
+L80:
+		    ;
+		}
+
+/* L90: */
+	    }
+
+L100:
+	    ;
+	}
+
+/* L110: */
+    }
+
+/*     Report result: PASSED if errmax < *thresh, otherwise flagged SUSPECT. */
+
+    if (errmax < *thresh) {
+	if (*iorder == 0) {
+            printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+	}
+	if (*iorder == 1) {
+            printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+	}
+    } else {
+	if (*iorder == 0) {
+            printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+	    printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******\n",errmax);
+	}
+	if (*iorder == 1) {
+            printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+	    printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******\n",errmax);
+	}
+    }
+    goto L130;
+
+L120:
+    printf(" ******* %s FAILED ON CALL NUMBER:\n",sname);
+    cprcn1_(nout, &nc, sname, iorder, transa, transb, &m, &n, &k, &alpha, &
+	    lda, &ldb, &beta, &ldc);
+
+L130:
+    return 0;
+
+/* 9995 FORMAT( 1X, I6, ': ', A12,'(''', A1, ''',''', A1, ''',', */
+/*     $     3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, */
+/*     $     ',(', F4.1, ',', F4.1, '), C,', I3, ').' ) */
+
+/*     End of CCHK1. */
+
+} /* cchk1_ */
+
+
+/* Subroutine */ int cprcn1_(integer *nout, integer *nc, char *sname, integer 
+	*iorder, char *transa, char *transb, integer *m, integer *n, integer *
+	k, complex *alpha, integer *lda, integer *ldb, complex *beta, integer 
+	*ldc)
+{
+    /* Prints one call record (call number, routine, order, transposes, sizes, scalars) to stdout; nout is not used. */
+    char crc[14], cta[14], ctb[14];
+    /* Each name below is exactly 14 chars in a 14-byte buffer, so there is no room for a NUL: print with %.14s. */
+    if (*(unsigned char *)transa == 'N') {
+	s_copy(cta, "  CblasNoTrans", (ftnlen)14, (ftnlen)14);
+    } else if (*(unsigned char *)transa == 'T') {
+	s_copy(cta, "    CblasTrans", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(cta, "CblasConjTrans", (ftnlen)14, (ftnlen)14);
+    }
+    if (*(unsigned char *)transb == 'N') {
+	s_copy(ctb, "  CblasNoTrans", (ftnlen)14, (ftnlen)14);
+    } else if (*(unsigned char *)transb == 'T') {
+	s_copy(ctb, "    CblasTrans", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(ctb, "CblasConjTrans", (ftnlen)14, (ftnlen)14);
+    }
+    if (*iorder == 1) {
+	s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
+    }
+    printf("%6d: %s %.14s %.14s %.14s\n",*nc,sname,crc,cta,ctb);
+    printf("%d %d %d (%4.1f,%4.1f) , A, %d, B, %d, (%4.1f,%4.1f) , C, %d.\n",*m,*n,*k,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc);
+    return 0;
+} /* cprcn1_ */
+
+
+/* Subroutine */ int cchk2_(char *sname, real *eps, real *thresh, integer *
+	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, 
+	integer *nidim, integer *idim, integer *nalf, complex *alf, integer *
+	nbet, complex *bet, integer *nmax, complex *a, complex *aa, complex *
+	as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc, 
+	complex *cs, complex *ct, real *g, integer *iorder)
+{
+    /* Initialized data */
+
+    static char ichs[2] = "LR";
+    static char ichu[2] = "UL";
+
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
+	    i__3, i__4, i__5, i__6, i__7;
+
+    /* Local variables */
+    complex beta;
+    integer ldas, ldbs, ldcs;
+    logical same;
+    char side[1];
+    logical conj, left, null;
+    char uplo[1];
+    integer i__, m, n;
+    extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, 
+	    integer *, complex *, integer *, complex *, integer *, logical *, 
+	    complex *);
+    complex alpha;
+    extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, 
+	    integer *, complex *, complex *, integer *, complex *, integer *, 
+	    complex *, complex *, integer *, complex *, real *, complex *, 
+	    integer *, real *, real *, logical *, integer *, logical *);
+    logical isame[13];
+    char sides[1];
+    integer nargs;
+    logical reset;
+    char uplos[1];
+    extern /* Subroutine */ int cprcn2_(integer *, integer *, char *, integer 
+	    *, char *, char *, integer *, integer *, complex *, integer *, 
+	    integer *, complex *, integer *);
+    integer ia, ib, na, nc, im, in;
+    extern /* Subroutine */ int cchemm_(integer *, char *, char *, integer *, 
+	    integer *, complex *, complex *, integer *, complex *, integer *, 
+	    complex *, complex *, integer *);
+    integer ms, ns;
+    extern logical lceres_(char *, char *, integer *, integer *, complex *, 
+	    complex *, integer *);
+    extern /* Subroutine */ int ccsymm_(integer *, char *, char *, integer *, 
+	    integer *, complex *, complex *, integer *, complex *, integer *, 
+	    complex *, complex *, integer *);
+    real errmax;
+    integer laa, lbb, lda, lcc, ldb, ldc;
+    extern logical lce_(complex *, complex *, integer *);
+    integer ics;
+    complex als, bls;
+    integer icu;
+    real err;
+
+/*  Tests CHEMM and CSYMM through the cchemm_ / ccsymm_ wrappers; cchemm_ is */
+/*  used when the two characters at sname + 7 are "he" (see conj below). */
+/*  Auxiliary routine for test program for Level 3 Blas. */
+/*  On failure *fatal is set and the call is echoed via cprcn2_; returns 0. */
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+    /* Parameter adjustments (shift pointers so 1-based indexing works) */
+    --idim;
+    --alf;
+    --bet;
+    --g;
+    --ct;
+    --cs;
+    --cc;
+    c_dim1 = *nmax;
+    c_offset = 1 + c_dim1 * 1;
+    c__ -= c_offset;
+    --bs;
+    --bb;
+    b_dim1 = *nmax;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    --as;
+    --aa;
+    a_dim1 = *nmax;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+
+    /* Function Body */
+    conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0;
+
+    nargs = 12;
+    nc = 0;
+    reset = TRUE_;
+    errmax = 0.f;
+
+    i__1 = *nidim;
+    for (im = 1; im <= i__1; ++im) {
+	m = idim[im];
+
+	i__2 = *nidim;
+	for (in = 1; in <= i__2; ++in) {
+	    n = idim[in];
+/*           Set LDC to 1 more than minimum value if room. */
+	    ldc = m;
+	    if (ldc < *nmax) {
+		++ldc;
+	    }
+/*           Skip tests if not enough room. */
+	    if (ldc > *nmax) {
+		goto L90;
+	    }
+	    lcc = ldc * n;
+	    null = n <= 0 || m <= 0;
+/*           Set LDB to 1 more than minimum value if room. */
+	    ldb = m;
+	    if (ldb < *nmax) {
+		++ldb;
+	    }
+/*           Skip tests if not enough room. */
+	    if (ldb > *nmax) {
+		goto L90;
+	    }
+	    lbb = ldb * n;
+
+/*           Generate the matrix B. */
+
+	    cmake_("ge", " ", " ", &m, &n, &b[b_offset], nmax, &bb[1], &ldb, &
+		    reset, &c_b1);
+
+	    for (ics = 1; ics <= 2; ++ics) {
+		*(unsigned char *)side = *(unsigned char *)&ichs[ics - 1];
+		left = *(unsigned char *)side == 'L';
+
+		if (left) {
+		    na = m;
+		} else {
+		    na = n;
+		}
+/*              Set LDA to 1 more than minimum value if room. */
+		lda = na;
+		if (lda < *nmax) {
+		    ++lda;
+		}
+/*              Skip tests if not enough room. */
+		if (lda > *nmax) {
+		    goto L80;
+		}
+		laa = lda * na;
+
+		for (icu = 1; icu <= 2; ++icu) {
+		    *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1];
+
+/*                 Generate the hermitian or symmetric matrix A. */
+
+		    cmake_(sname + 7, uplo, " ", &na, &na, &a[a_offset], nmax,
+			     &aa[1], &lda, &reset, &c_b1);
+
+		    i__3 = *nalf;
+		    for (ia = 1; ia <= i__3; ++ia) {
+			i__4 = ia;
+			alpha.r = alf[i__4].r, alpha.i = alf[i__4].i;
+
+			i__4 = *nbet;
+			for (ib = 1; ib <= i__4; ++ib) {
+			    i__5 = ib;
+			    beta.r = bet[i__5].r, beta.i = bet[i__5].i;
+
+/*                       Generate the matrix C. */
+
+			    cmake_("ge", " ", " ", &m, &n, &c__[c_offset], 
+				    nmax, &cc[1], &ldc, &reset, &c_b1);
+
+			    ++nc;
+
+/*                       Save every datum before calling the */
+/*                       subroutine. */
+
+			    *(unsigned char *)sides = *(unsigned char *)side;
+			    *(unsigned char *)uplos = *(unsigned char *)uplo;
+			    ms = m;
+			    ns = n;
+			    als.r = alpha.r, als.i = alpha.i;
+			    i__5 = laa;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				i__6 = i__;
+				i__7 = i__;
+				as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7]
+					.i;
+/* L10: */
+			    }
+			    ldas = lda;
+			    i__5 = lbb;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				i__6 = i__;
+				i__7 = i__;
+				bs[i__6].r = bb[i__7].r, bs[i__6].i = bb[i__7]
+					.i;
+/* L20: */
+			    }
+			    ldbs = ldb;
+			    bls.r = beta.r, bls.i = beta.i;
+			    i__5 = lcc;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				i__6 = i__;
+				i__7 = i__;
+				cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7]
+					.i;
+/* L30: */
+			    }
+			    ldcs = ldc;
+
+/*                       Call the subroutine. */
+
+			    if (*trace) {
+				cprcn2_(ntra, &nc, sname, iorder, side, uplo, 
+					&m, &n, &alpha, &lda, &ldb, &beta, &
+					ldc)
+					;
+			    }
+			    if (*rewi) {
+/*				al__1.aerr = 0;
+				al__1.aunit = *ntra;
+				f_rew(&al__1);*/
+			    }
+			    if (conj) {
+				cchemm_(iorder, side, uplo, &m, &n, &alpha, &
+					aa[1], &lda, &bb[1], &ldb, &beta, &cc[
+					1], &ldc);
+			    } else {
+				ccsymm_(iorder, side, uplo, &m, &n, &alpha, &
+					aa[1], &lda, &bb[1], &ldb, &beta, &cc[
+					1], &ldc);
+			    }
+
+/*                       Check if error-exit was taken incorrectly. */
+
+			    if (! infoc_1.ok) {
+				printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n");
+				*fatal = TRUE_;
+				goto L110;
+			    }
+
+/*                       See what data changed inside subroutines. */
+
+			    isame[0] = *(unsigned char *)sides == *(unsigned 
+				    char *)side;
+			    isame[1] = *(unsigned char *)uplos == *(unsigned 
+				    char *)uplo;
+			    isame[2] = ms == m;
+			    isame[3] = ns == n;
+			    isame[4] = als.r == alpha.r && als.i == alpha.i;
+			    isame[5] = lce_(&as[1], &aa[1], &laa);
+			    isame[6] = ldas == lda;
+			    isame[7] = lce_(&bs[1], &bb[1], &lbb);
+			    isame[8] = ldbs == ldb;
+			    isame[9] = bls.r == beta.r && bls.i == beta.i;
+			    if (null) {
+				isame[10] = lce_(&cs[1], &cc[1], &lcc);
+			    } else {
+				isame[10] = lceres_("ge", " ", &m, &n, &cs[1],
+					 &cc[1], &ldc);
+			    }
+			    isame[11] = ldcs == ldc;
+
+/*                       If data was incorrectly changed, report and */
+/*                       return. */
+
+			    same = TRUE_;
+			    i__5 = nargs;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				same = same && isame[i__ - 1];
+				if (! isame[i__ - 1]) {
+				    printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__);
+				}
+/* L40: */
+			    }
+			    if (! same) {
+				*fatal = TRUE_;
+				goto L110;
+			    }
+
+			    if (! null) {
+
+/*                          Check the result. */
+
+				if (left) {
+				    cmmch_("N", "N", &m, &n, &m, &alpha, &a[
+					    a_offset], nmax, &b[b_offset], 
+					    nmax, &beta, &c__[c_offset], nmax,
+					     &ct[1], &g[1], &cc[1], &ldc, eps,
+					     &err, fatal, nout, &c_true);
+				} else {
+				    cmmch_("N", "N", &m, &n, &n, &alpha, &b[
+					    b_offset], nmax, &a[a_offset], 
+					    nmax, &beta, &c__[c_offset], nmax,
+					     &ct[1], &g[1], &cc[1], &ldc, eps,
+					     &err, fatal, nout, &c_true);
+				}
+				errmax = f2cmax(errmax,err);
+/*                          If got really bad answer, report and */
+/*                          return. */
+				if (*fatal) {
+				    goto L110;
+				}
+			    }
+
+/* L50: */
+			}
+
+/* L60: */
+		    }
+
+/* L70: */
+		}
+
+L80:
+		;
+	    }
+
+L90:
+	    ;
+	}
+
+/* L100: */
+    }
+
+/*     Report result: PASSED if errmax < *thresh, otherwise flagged SUSPECT. */
+
+    if (errmax < *thresh) {
+	if (*iorder == 0) {
+            printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+	}
+	if (*iorder == 1) {
+            printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+	}
+    } else {
+	if (*iorder == 0) {
+            printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+	    printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******\n",errmax);
+	}
+	if (*iorder == 1) {
+            printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+	    printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******\n",errmax);
+	}
+    }
+    goto L120;
+
+L110:
+    printf(" ******* %s FAILED ON CALL NUMBER:\n",sname);
+    cprcn2_(nout, &nc, sname, iorder, side, uplo, &m, &n, &alpha, &lda, &ldb, 
+	    &beta, &ldc);
+
+L120:
+    return 0;
+
+/* 9995 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */
+/*     $      '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, */
+/*     $      ',', F4.1, '), C,', I3, ')    .' ) */
+
+/*     End of CCHK2. */
+
+} /* cchk2_ */
+
+
+/* Subroutine */ int cprcn2_(integer *nout, integer *nc, char *sname, integer 
+	*iorder, char *side, char *uplo, integer *m, integer *n, complex *
+	alpha, integer *lda, integer *ldb, complex *beta, integer *ldc)
+{
+    /* Prints one call record (call number, routine, order, side, uplo, sizes, scalars) to stdout; nout is not used. */
+    char cs[14], cu[14], crc[14];
+    /* Each name below is exactly 14 chars in a 14-byte buffer, so there is no room for a NUL: print with %.14s. */
+    if (*(unsigned char *)side == 'L') {
+	s_copy(cs, "     CblasLeft", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(cs, "    CblasRight", (ftnlen)14, (ftnlen)14);
+    }
+    if (*(unsigned char *)uplo == 'U') {
+	s_copy(cu, "    CblasUpper", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(cu, "    CblasLower", (ftnlen)14, (ftnlen)14);
+    }
+    if (*iorder == 1) {
+	s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
+    }
+    printf("%6d: %s %.14s %.14s %.14s\n",*nc,sname,crc,cs,cu);
+    printf("%d %d (%4.1f,%4.1f) , A, %d, B, %d, (%4.1f,%4.1f) , C, %d.\n",*m,*n,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc);
+    return 0;
+} /* cprcn2_ */
+
+
+/* Subroutine */ int cchk3_(char *sname, real *eps, real *thresh, integer *
+	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, 
+	integer *nidim, integer *idim, integer *nalf, complex *alf, integer *
+	nmax, complex *a, complex *aa, complex *as, complex *b, complex *bb, 
+	complex *bs, complex *ct, real *g, complex *c__, integer *iorder)
+{
+    /* Initialized data */
+
+    static char ichu[2] = "UL";
+    static char icht[3] = "NTC";
+    static char ichd[2] = "UN";
+    static char ichs[2] = "LR";
+
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
+	    i__3, i__4, i__5, i__6, i__7;
+    complex q__1;
+
+    /* Local variables */
+    char diag[1];
+    integer ldas, ldbs;
+    logical same;
+    char side[1];
+    logical left, null;
+    char uplo[1];
+    integer i__, j, m, n;
+    extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, 
+	    integer *, complex *, integer *, complex *, integer *, logical *, 
+	    complex *);
+    complex alpha;
+    char diags[1];
+    extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, 
+	    integer *, complex *, complex *, integer *, complex *, integer *, 
+	    complex *, complex *, integer *, complex *, real *, complex *, 
+	    integer *, real *, real *, logical *, integer *, logical *);
+    logical isame[13];
+    char sides[1];
+    integer nargs;
+    logical reset;
+    char uplos[1];
+    extern /* Subroutine */ int cprcn3_(integer *, integer *, char *, integer 
+	    *, char *, char *, char *, char *, integer *, integer *, complex *
+	    , integer *, integer *);
+    integer ia, na, nc, im, in, ms, ns;
+    extern logical lceres_(char *, char *, integer *, integer *, complex *, 
+	    complex *, integer *);
+    extern /* Subroutine */ int cctrmm_(integer *, char *, char *, char *, 
+	    char *, integer *, integer *, complex *, complex *, integer *, 
+	    complex *, integer *);
+    char tranas[1], transa[1];
+    extern /* Subroutine */ int cctrsm_(integer *, char *, char *, char *, 
+	    char *, integer *, integer *, complex *, complex *, integer *, 
+	    complex *, integer *);
+    real errmax;
+    integer laa, icd, lbb, lda, ldb;
+    extern logical lce_(complex *, complex *, integer *);
+    integer ics;
+    complex als;
+    integer ict, icu;
+    real err;
+
+/*  Tests CTRMM and CTRSM through the cctrmm_ / cctrsm_ wrappers; the routine */
+/*  is chosen by whether the two characters at sname + 9 are "mm" or "sm". */
+/*  Auxiliary routine for test program for Level 3 Blas. */
+/*  On failure *fatal is set; the call is echoed via cprcn3_ only if *trace. */
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+    /* Parameter adjustments (shift pointers so 1-based indexing works) */
+    --idim;
+    --alf;
+    c_dim1 = *nmax;
+    c_offset = 1 + c_dim1 * 1;
+    c__ -= c_offset;
+    --g;
+    --ct;
+    --bs;
+    --bb;
+    b_dim1 = *nmax;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    --as;
+    --aa;
+    a_dim1 = *nmax;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+
+    /* Function Body */
+
+    nargs = 11;
+    nc = 0;
+    reset = TRUE_;
+    errmax = 0.f;
+/*     Set up zero matrix for CMMCH. */
+    i__1 = *nmax;
+    for (j = 1; j <= i__1; ++j) {
+	i__2 = *nmax;
+	for (i__ = 1; i__ <= i__2; ++i__) {
+	    i__3 = i__ + j * c_dim1;
+	    c__[i__3].r = 0.f, c__[i__3].i = 0.f;
+/* L10: */
+	}
+/* L20: */
+    }
+
+    i__1 = *nidim;
+    for (im = 1; im <= i__1; ++im) {
+	m = idim[im];
+
+	i__2 = *nidim;
+	for (in = 1; in <= i__2; ++in) {
+	    n = idim[in];
+/*           Set LDB to 1 more than minimum value if room. */
+	    ldb = m;
+	    if (ldb < *nmax) {
+		++ldb;
+	    }
+/*           Skip tests if not enough room. */
+	    if (ldb > *nmax) {
+		goto L130;
+	    }
+	    lbb = ldb * n;
+	    null = m <= 0 || n <= 0;
+
+	    for (ics = 1; ics <= 2; ++ics) {
+		*(unsigned char *)side = *(unsigned char *)&ichs[ics - 1];
+		left = *(unsigned char *)side == 'L';
+		if (left) {
+		    na = m;
+		} else {
+		    na = n;
+		}
+/*              Set LDA to 1 more than minimum value if room. */
+		lda = na;
+		if (lda < *nmax) {
+		    ++lda;
+		}
+/*              Skip tests if not enough room. */
+		if (lda > *nmax) {
+		    goto L130;
+		}
+		laa = lda * na;
+
+		for (icu = 1; icu <= 2; ++icu) {
+		    *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1];
+
+		    for (ict = 1; ict <= 3; ++ict) {
+			*(unsigned char *)transa = *(unsigned char *)&icht[
+				ict - 1];
+
+			for (icd = 1; icd <= 2; ++icd) {
+			    *(unsigned char *)diag = *(unsigned char *)&ichd[
+				    icd - 1];
+
+			    i__3 = *nalf;
+			    for (ia = 1; ia <= i__3; ++ia) {
+				i__4 = ia;
+				alpha.r = alf[i__4].r, alpha.i = alf[i__4].i;
+
+/*                          Generate the matrix A. */
+
+				cmake_("tr", uplo, diag, &na, &na, &a[
+					a_offset], nmax, &aa[1], &lda, &reset,
+					 &c_b1);
+
+/*                          Generate the matrix B. */
+
+				cmake_("ge", " ", " ", &m, &n, &b[b_offset], 
+					nmax, &bb[1], &ldb, &reset, &c_b1);
+
+				++nc;
+
+/*                          Save every datum before calling the */
+/*                          subroutine. */
+
+				*(unsigned char *)sides = *(unsigned char *)
+					side;
+				*(unsigned char *)uplos = *(unsigned char *)
+					uplo;
+				*(unsigned char *)tranas = *(unsigned char *)
+					transa;
+				*(unsigned char *)diags = *(unsigned char *)
+					diag;
+				ms = m;
+				ns = n;
+				als.r = alpha.r, als.i = alpha.i;
+				i__4 = laa;
+				for (i__ = 1; i__ <= i__4; ++i__) {
+				    i__5 = i__;
+				    i__6 = i__;
+				    as[i__5].r = aa[i__6].r, as[i__5].i = aa[
+					    i__6].i;
+/* L30: */
+				}
+				ldas = lda;
+				i__4 = lbb;
+				for (i__ = 1; i__ <= i__4; ++i__) {
+				    i__5 = i__;
+				    i__6 = i__;
+				    bs[i__5].r = bb[i__6].r, bs[i__5].i = bb[
+					    i__6].i;
+/* L40: */
+				}
+				ldbs = ldb;
+
+/*                          Call the subroutine. */
+
+				if (s_cmp(sname + 9, "mm", (ftnlen)2, (ftnlen)
+					2) == 0) {
+				    if (*trace) {
+					cprcn3_(ntra, &nc, sname, iorder, 
+						side, uplo, transa, diag, &m, 
+						&n, &alpha, &lda, &ldb/*, (
+						ftnlen)12, (ftnlen)1, (ftnlen)
+						1, (ftnlen)1, (ftnlen)1*/);
+				    }
+				    if (*rewi) {
+/*					al__1.aerr = 0;
+					al__1.aunit = *ntra;
+					f_rew(&al__1);*/
+				    }
+				    cctrmm_(iorder, side, uplo, transa, diag, 
+					    &m, &n, &alpha, &aa[1], &lda, &bb[
+					    1], &ldb);
+				} else if (s_cmp(sname + 9, "sm", (ftnlen)2, (
+					ftnlen)2) == 0) {
+				    if (*trace) {
+					cprcn3_(ntra, &nc, sname, iorder, 
+						side, uplo, transa, diag, &m, 
+						&n, &alpha, &lda, &ldb/*, (
+						ftnlen)12, (ftnlen)1, (ftnlen)
+						1, (ftnlen)1, (ftnlen)1*/);
+				    }
+				    if (*rewi) {
+/*					al__1.aerr = 0;
+					al__1.aunit = *ntra;
+					f_rew(&al__1);*/
+				    }
+				    cctrsm_(iorder, side, uplo, transa, diag, 
+					    &m, &n, &alpha, &aa[1], &lda, &bb[
+					    1], &ldb);
+				}
+
+/*                          Check if error-exit was taken incorrectly. */
+
+				if (! infoc_1.ok) {
+				    printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n");
+				    *fatal = TRUE_;
+				    goto L150;
+				}
+
+/*                          See what data changed inside subroutines. */
+
+				isame[0] = *(unsigned char *)sides == *(
+					unsigned char *)side;
+				isame[1] = *(unsigned char *)uplos == *(
+					unsigned char *)uplo;
+				isame[2] = *(unsigned char *)tranas == *(
+					unsigned char *)transa;
+				isame[3] = *(unsigned char *)diags == *(
+					unsigned char *)diag;
+				isame[4] = ms == m;
+				isame[5] = ns == n;
+				isame[6] = als.r == alpha.r && als.i == 
+					alpha.i;
+				isame[7] = lce_(&as[1], &aa[1], &laa);
+				isame[8] = ldas == lda;
+				if (null) {
+				    isame[9] = lce_(&bs[1], &bb[1], &lbb);
+				} else {
+				    isame[9] = lceres_("ge", " ", &m, &n, &bs[
+					    1], &bb[1], &ldb);
+				}
+				isame[10] = ldbs == ldb;
+
+/*                          If data was incorrectly changed, report and */
+/*                          return. */
+
+				same = TRUE_;
+				i__4 = nargs;
+				for (i__ = 1; i__ <= i__4; ++i__) {
+				    same = same && isame[i__ - 1];
+				    if (! isame[i__ - 1]) {
+					printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__);
+				    }
+/* L50: */
+				}
+				if (! same) {
+				    *fatal = TRUE_;
+				    goto L150;
+				}
+
+				if (! null) {
+				    if (s_cmp(sname + 9, "mm", (ftnlen)2, (
+					    ftnlen)2) == 0) {
+
+/*                                Check the result. */
+
+					if (left) {
+					    cmmch_(transa, "N", &m, &n, &m, &
+						    alpha, &a[a_offset], nmax,
+						     &b[b_offset], nmax, &
+						    c_b1, &c__[c_offset], 
+						    nmax, &ct[1], &g[1], &bb[
+						    1], &ldb, eps, &err, 
+						    fatal, nout, &c_true/*, (
+						    ftnlen)1, (ftnlen)1*/);
+					} else {
+					    cmmch_("N", transa, &m, &n, &n, &
+						    alpha, &b[b_offset], nmax,
+						     &a[a_offset], nmax, &
+						    c_b1, &c__[c_offset], 
+						    nmax, &ct[1], &g[1], &bb[
+						    1], &ldb, eps, &err, 
+						    fatal, nout, &c_true);
+					}
+				    } else if (s_cmp(sname + 9, "sm", (ftnlen)
+					    2, (ftnlen)2) == 0) {
+
+/*                                Compute approximation to original */
+/*                                matrix. */
+
+					i__4 = n;
+					for (j = 1; j <= i__4; ++j) {
+					    i__5 = m;
+					    for (i__ = 1; i__ <= i__5; ++i__) 
+						    {
+			  i__6 = i__ + j * c_dim1;
+			  i__7 = i__ + (j - 1) * ldb;
+			  c__[i__6].r = bb[i__7].r, c__[i__6].i = bb[i__7].i;
+			  i__6 = i__ + (j - 1) * ldb;
+			  i__7 = i__ + j * b_dim1;
+			  q__1.r = alpha.r * b[i__7].r - alpha.i * b[i__7].i, 
+				  q__1.i = alpha.r * b[i__7].i + alpha.i * b[
+				  i__7].r;
+			  bb[i__6].r = q__1.r, bb[i__6].i = q__1.i;
+/* L60: */
+					    }
+/* L70: */
+					}
+
+					if (left) {
+					    cmmch_(transa, "N", &m, &n, &m, &
+						    c_b2, &a[a_offset], nmax, 
+						    &c__[c_offset], nmax, &
+						    c_b1, &b[b_offset], nmax, 
+						    &ct[1], &g[1], &bb[1], &
+						    ldb, eps, &err, fatal, 
+						    nout, &c_false);
+					} else {
+					    cmmch_("N", transa, &m, &n, &n, &
+						    c_b2, &c__[c_offset], 
+						    nmax, &a[a_offset], nmax, 
+						    &c_b1, &b[b_offset], nmax,
+						     &ct[1], &g[1], &bb[1], &
+						    ldb, eps, &err, fatal, 
+						    nout, &c_false);
+					}
+				    }
+				    errmax = f2cmax(errmax,err);
+/*                             If got really bad answer, report and */
+/*                             return. */
+				    if (*fatal) {
+					goto L150;
+				    }
+				}
+
+/* L80: */
+			    }
+
+/* L90: */
+			}
+
+/* L100: */
+		    }
+
+/* L110: */
+		}
+
+/* L120: */
+	    }
+
+L130:
+	    ;
+	}
+
+/* L140: */
+    }
+
+/*     Report result: PASSED if errmax < *thresh, otherwise flagged SUSPECT. */
+
+    if (errmax < *thresh) {
+	if (*iorder == 0) {
+            printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+	}
+	if (*iorder == 1) {
+            printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+	}
+    } else {
+	if (*iorder == 0) {
+            printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+	    printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******\n",errmax);
+	}
+	if (*iorder == 1) {
+            printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+	    printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******\n",errmax);
+	}
+    }
+    goto L160;
+
+L150:
+    printf(" ******* %s FAILED ON CALL NUMBER:\n",sname);
+    if (*trace) {
+	cprcn3_(ntra, &nc, sname, iorder, side, uplo, transa, diag, &m, &n, &
+		alpha, &lda, &ldb);
+    }
+
+L160:
+    return 0;
+
+/* 9995 FORMAT(1X, I6, ': ', A12,'(', 4( '''', A1, ''',' ), 2( I3, ',' ), */
+/*     $     '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ')         ', */
+/*     $      '      .' ) */
+
+/*     End of CCHK3. */
+
+} /* cchk3_ */
+
+
+/* Subroutine */ int cprcn3_(integer *nout, integer *nc, char *sname, integer 
+	*iorder, char *side, char *uplo, char *transa, char *diag, integer *m,
+	 integer *n, complex *alpha, integer *lda, integer *ldb)
+{
+    /* Trace helper: prints call number *nc, sname, the CBLAS names chosen from iorder/side/uplo/transa/diag, then m, n, alpha, lda, ldb. Always returns 0; nout is not used (output goes to stdout via printf). */
+    char ca[14], cd[14], cs[14], cu[14], crc[14];
+    /* Every buffer is exactly 14 bytes and s_copy is asked to fill all 14, so there is no room for a NUL terminator: print them with a bounded "%.14s", never a bare "%s". */
+    if (*(unsigned char *)side == 'L') {
+	s_copy(cs, "     CblasLeft", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(cs, "    CblasRight", (ftnlen)14, (ftnlen)14);
+    }
+    if (*(unsigned char *)uplo == 'U') {
+	s_copy(cu, "    CblasUpper", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(cu, "    CblasLower", (ftnlen)14, (ftnlen)14);
+    }
+    if (*(unsigned char *)transa == 'N') {
+	s_copy(ca, "  CblasNoTrans", (ftnlen)14, (ftnlen)14);
+    } else if (*(unsigned char *)transa == 'T') {
+	s_copy(ca, "    CblasTrans", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14);
+    }
+    if (*(unsigned char *)diag == 'N') {
+	s_copy(cd, "  CblasNonUnit", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(cd, "     CblasUnit", (ftnlen)14, (ftnlen)14);
+    }
+    if (*iorder == 1) {
+	s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
+    }
+    printf("%6d: %s %.14s %.14s %.14s\n",*nc,sname,crc,cs,cu);
+    printf("         %.14s %.14s %d %d (%4.1f,%4.1f) A %d B %d\n",ca,cd,*m,*n,alpha->r,alpha->i,*lda,*ldb);
+    /* sname is still printed with "%s": assumes the caller passes a NUL-terminated name -- TODO confirm. */
+    return 0;
+} /* cprcn3_ */
+
+
+/* Subroutine */ int cchk4_(char *sname, real *eps, real *thresh, integer *
+	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, 
+	integer *nidim, integer *idim, integer *nalf, complex *alf, integer *
+	nbet, complex *bet, integer *nmax, complex *a, complex *aa, complex *
+	as, complex *b, complex *bb, complex *bs, complex *c__, complex *cc, 
+	complex *cs, complex *ct, real *g, integer *iorder)
+{
+    /* Initialized data */
+
+    static char icht[2] = "NC";
+    static char ichu[2] = "UL";
+
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
+	    i__3, i__4, i__5, i__6, i__7;
+    complex q__1;
+
+    /* Local variables */
+    complex beta;
+    integer ldas, ldcs;
+    logical same, conj;
+    complex bets;
+    real rals;
+    logical tran, null;
+    char uplo[1];
+    integer i__, j, k, n;
+    extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, 
+	    integer *, complex *, integer *, complex *, integer *, logical *, 
+	    complex *);
+    complex alpha;
+    extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, 
+	    integer *, complex *, complex *, integer *, complex *, integer *, 
+	    complex *, complex *, integer *, complex *, real *, complex *, 
+	    integer *, real *, real *, logical *, integer *, logical *);
+    real rbeta;
+    logical isame[13];
+    integer nargs;
+    real rbets;
+    logical reset;
+    char trans[1];
+    logical upper;
+    char uplos[1];
+    extern /* Subroutine */ int cprcn4_(integer *, integer *, char *, integer 
+	    *, char *, char *, integer *, integer *, complex *, integer *, 
+	    complex *, integer *), cprcn6_(integer *, 
+	    integer *, char *, integer *, char *, char *, integer *, integer *
+	    , real *, integer *, real *, integer *);
+    integer ia, ib, jc, ma, na, nc, ik, in, jj, lj, ks;
+    extern /* Subroutine */ int ccherk_(integer *, char *, char *, integer *, 
+	    integer *, real *, complex *, integer *, real *, complex *, 
+	    integer *);
+    integer ns;
+    real ralpha;
+    extern logical lceres_(char *, char *, integer *, integer *, complex *, 
+	    complex *, integer *);
+    real errmax;
+    extern /* Subroutine */ int ccsyrk_(integer *, char *, char *, integer *, 
+	    integer *, complex *, complex *, integer *, complex *, complex *, 
+	    integer *);
+    char transs[1], transt[1];
+    integer laa, lda, lcc, ldc;
+    extern logical lce_(complex *, complex *, integer *);
+    complex als;
+    integer ict, icu;
+    real err;
+
+/*  Tests CHERK and CSYRK. */
+/*  sname + 7 == "he" (conj) selects ccherk_ with real ralpha/rbeta; otherwise ccsyrk_ is called. */
+/*  Auxiliary routine for test program for Level 3 Blas. */
+/*  When a check fails *fatal is set (here or by cmmch_) and the offending call is printed; always returns 0. */
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+    /* Parameter adjustments */
+    --idim;
+    --alf;
+    --bet;
+    --g;
+    --ct;
+    --cs;
+    --cc;
+    c_dim1 = *nmax;
+    c_offset = 1 + c_dim1 * 1;
+    c__ -= c_offset;
+    --bs;
+    --bb;
+    b_dim1 = *nmax;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    --as;
+    --aa;
+    a_dim1 = *nmax;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+
+    /* Function Body */
+    conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0;
+/*     nargs is the number of isame[] entries (isame[0..9]) compared after each call. */
+    nargs = 10;
+    nc = 0;
+    reset = TRUE_;
+    errmax = 0.f;
+    rals = 1.f;
+    rbets = 1.f;
+
+    i__1 = *nidim;
+    for (in = 1; in <= i__1; ++in) {
+	n = idim[in];
+/*        Set LDC to 1 more than minimum value if room. */
+	ldc = n;
+	if (ldc < *nmax) {
+	    ++ldc;
+	}
+/*        Skip tests if not enough room. */
+	if (ldc > *nmax) {
+	    goto L100;
+	}
+	lcc = ldc * n;
+
+	i__2 = *nidim;
+	for (ik = 1; ik <= i__2; ++ik) {
+	    k = idim[ik];
+
+	    for (ict = 1; ict <= 2; ++ict) {
+		*(unsigned char *)trans = *(unsigned char *)&icht[ict - 1];
+		tran = *(unsigned char *)trans == 'C';
+		if (tran && ! conj) {
+		    *(unsigned char *)trans = 'T';
+		}
+		if (tran) {
+		    ma = k;
+		    na = n;
+		} else {
+		    ma = n;
+		    na = k;
+		}
+/*              Set LDA to 1 more than minimum value if room. */
+		lda = ma;
+		if (lda < *nmax) {
+		    ++lda;
+		}
+/*              Skip tests if not enough room. */
+		if (lda > *nmax) {
+		    goto L80;
+		}
+		laa = lda * na;
+
+/*              Generate the matrix A. */
+
+		cmake_("ge", " ", " ", &ma, &na, &a[a_offset], nmax, &aa[1], &
+			lda, &reset, &c_b1);
+
+		for (icu = 1; icu <= 2; ++icu) {
+		    *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1];
+		    upper = *(unsigned char *)uplo == 'U';
+
+		    i__3 = *nalf;
+		    for (ia = 1; ia <= i__3; ++ia) {
+			i__4 = ia;
+			alpha.r = alf[i__4].r, alpha.i = alf[i__4].i;
+			if (conj) {
+			    ralpha = alpha.r;
+			    q__1.r = ralpha, q__1.i = 0.f;
+			    alpha.r = q__1.r, alpha.i = q__1.i;
+			}
+
+			i__4 = *nbet;
+			for (ib = 1; ib <= i__4; ++ib) {
+			    i__5 = ib;
+			    beta.r = bet[i__5].r, beta.i = bet[i__5].i;
+			    if (conj) {
+				rbeta = beta.r;
+				q__1.r = rbeta, q__1.i = 0.f;
+				beta.r = q__1.r, beta.i = q__1.i;
+			    }
+			    null = n <= 0;
+			    if (conj) {
+				null = null || ((k <= 0 || ralpha == 0.f) && 
+					rbeta == 1.f);
+			    }
+
+/*                       Generate the matrix C. */
+
+			    cmake_(sname + 7, uplo, " ", &n, &n, &c__[
+				    c_offset], nmax, &cc[1], &ldc, &reset, &
+				    c_b1);
+
+			    ++nc;
+
+/*                       Save every datum before calling the subroutine. */
+
+			    *(unsigned char *)uplos = *(unsigned char *)uplo;
+			    *(unsigned char *)transs = *(unsigned char *)
+				    trans;
+			    ns = n;
+			    ks = k;
+			    if (conj) {
+				rals = ralpha;
+			    } else {
+				als.r = alpha.r, als.i = alpha.i;
+			    }
+			    i__5 = laa;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				i__6 = i__;
+				i__7 = i__;
+				as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7]
+					.i;
+/* L10: */
+			    }
+			    ldas = lda;
+			    if (conj) {
+				rbets = rbeta;
+			    } else {
+				bets.r = beta.r, bets.i = beta.i;
+			    }
+			    i__5 = lcc;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				i__6 = i__;
+				i__7 = i__;
+				cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7]
+					.i;
+/* L20: */
+			    }
+			    ldcs = ldc;
+
+/*                       Call the subroutine. */
+
+			    if (conj) {
+				if (*trace) {
+				    cprcn6_(ntra, &nc, sname, iorder, uplo, 
+					    trans, &n, &k, &ralpha, &lda, &
+					    rbeta, &ldc);
+				}
+				if (*rewi) {
+/*				    al__1.aerr = 0;
+				    al__1.aunit = *ntra;
+				    f_rew(&al__1);*/
+				}
+				ccherk_(iorder, uplo, trans, &n, &k, &ralpha, 
+					&aa[1], &lda, &rbeta, &cc[1], &ldc);
+			    } else {
+				if (*trace) {
+				    cprcn4_(ntra, &nc, sname, iorder, uplo, 
+					    trans, &n, &k, &alpha, &lda, &
+					    beta, &ldc);
+				}
+				if (*rewi) {
+/*				    al__1.aerr = 0;
+				    al__1.aunit = *ntra;
+				    f_rew(&al__1);*/
+				}
+				ccsyrk_(iorder, uplo, trans, &n, &k, &alpha, &
+					aa[1], &lda, &beta, &cc[1], &ldc);
+			    }
+
+/*                       Check if error-exit was taken incorrectly. */
+
+			    if (! infoc_1.ok) {
+				printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n");
+				*fatal = TRUE_;
+				goto L120;
+			    }
+
+/*                       See what data changed inside subroutines. */
+
+			    isame[0] = *(unsigned char *)uplos == *(unsigned 
+				    char *)uplo;
+			    isame[1] = *(unsigned char *)transs == *(unsigned 
+				    char *)trans;
+			    isame[2] = ns == n;
+			    isame[3] = ks == k;
+			    if (conj) {
+				isame[4] = rals == ralpha;
+			    } else {
+				isame[4] = als.r == alpha.r && als.i == 
+					alpha.i;
+			    }
+			    isame[5] = lce_(&as[1], &aa[1], &laa);
+			    isame[6] = ldas == lda;
+			    if (conj) {
+				isame[7] = rbets == rbeta;
+			    } else {
+				isame[7] = bets.r == beta.r && bets.i == 
+					beta.i;
+			    }
+			    if (null) {
+				isame[8] = lce_(&cs[1], &cc[1], &lcc);
+			    } else {
+				isame[8] = lceres_(sname + 7, uplo, &n, &n, &
+					cs[1], &cc[1], &ldc);
+			    }
+			    isame[9] = ldcs == ldc;
+
+/*                       If data was incorrectly changed, report and */
+/*                       return. */
+
+			    same = TRUE_;
+			    i__5 = nargs;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				same = same && isame[i__ - 1];
+				if (! isame[i__ - 1]) {
+				    printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__);
+				}
+/* L30: */
+			    }
+			    if (! same) {
+				*fatal = TRUE_;
+				goto L120;
+			    }
+
+			    if (! null) {
+
+/*                          Check the result column by column. */
+
+				if (conj) {
+				    *(unsigned char *)transt = 'C';
+				} else {
+				    *(unsigned char *)transt = 'T';
+				}
+				jc = 1;
+				i__5 = n;
+				for (j = 1; j <= i__5; ++j) {
+				    if (upper) {
+					jj = 1;
+					lj = j;
+				    } else {
+					jj = j;
+					lj = n - j + 1;
+				    }
+				    if (tran) {
+					cmmch_(transt, "N", &lj, &c__1, &k, &
+						alpha, &a[jj * a_dim1 + 1], 
+						nmax, &a[j * a_dim1 + 1], 
+						nmax, &beta, &c__[jj + j * 
+						c_dim1], nmax, &ct[1], &g[1], 
+						&cc[jc], &ldc, eps, &err, 
+						fatal, nout, &c_true);
+				    } else {
+					cmmch_("N", transt, &lj, &c__1, &k, &
+						alpha, &a[jj + a_dim1], nmax, 
+						&a[j + a_dim1], nmax, &beta, &
+						c__[jj + j * c_dim1], nmax, &
+						ct[1], &g[1], &cc[jc], &ldc, 
+						eps, &err, fatal, nout, &
+						c_true);
+				    }
+				    if (upper) {
+					jc += ldc;
+				    } else {
+					jc = jc + ldc + 1;
+				    }
+				    errmax = f2cmax(errmax,err);
+/*                             If got really bad answer, report and */
+/*                             return. */
+				    if (*fatal) {
+					goto L110;
+				    }
+/* L40: */
+				}
+			    }
+
+/* L50: */
+			}
+
+/* L60: */
+		    }
+
+/* L70: */
+		}
+
+L80:
+		;
+	    }
+
+/* L90: */
+	}
+
+L100:
+	;
+    }
+
+/*     Report result. */
+/*     Every report line ends in a real newline ("\n") so successive messages do not run together. */
+    if (errmax < *thresh) {
+	if (*iorder == 0) {
+            printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+	}
+	if (*iorder == 1) {
+            printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+	}
+    } else {
+	if (*iorder == 0) {
+            printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+	    printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******\n",errmax);
+	}
+	if (*iorder == 1) {
+            printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+	    printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******\n",errmax);
+	}
+    }
+    goto L130;
+
+L110:
+    if (n > 1) {
+        printf("      THESE ARE THE RESULTS FOR COLUMN %d:\n",j);
+    }
+
+L120:
+    printf(" ******* %s FAILED ON CALL NUMBER:\n",sname);
+    if (conj) {
+	cprcn6_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &ralpha, &lda, 
+		&rbeta, &ldc);
+    } else {
+	cprcn4_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, &
+		beta, &ldc);
+    }
+
+L130:
+    return 0;
+
+/* 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */
+/*     $     F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ')               ', */
+/*     $      '          .' ) */
+/* 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */
+/*     $      '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1, */
+/*     $      '), C,', I3, ')          .' ) */
+
+/*     End of CCHK4. */
+
+} /* cchk4_ */
+
+
+/* Subroutine */ int cprcn4_(integer *nout, integer *nc, char *sname, integer 
+	*iorder, char *uplo, char *transa, integer *n, integer *k, complex *
+	alpha, integer *lda, complex *beta, integer *ldc)
+{
+    /* Trace helper: prints call number *nc, sname, the CBLAS names chosen from iorder/uplo/transa, then n, k, complex alpha, lda, complex beta, ldc. Always returns 0; nout is not used (output goes to stdout via printf). */
+    char ca[14], cu[14], crc[14];
+    /* Each buffer is exactly 14 bytes and s_copy is asked to fill all 14, so there is no room for a NUL terminator: print with a bounded "%.14s". sname keeps "%s" (assumes a NUL-terminated name -- TODO confirm). */
+    if (*(unsigned char *)uplo == 'U') {
+	s_copy(cu, "    CblasUpper", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(cu, "    CblasLower", (ftnlen)14, (ftnlen)14);
+    }
+    if (*(unsigned char *)transa == 'N') {
+	s_copy(ca, "  CblasNoTrans", (ftnlen)14, (ftnlen)14);
+    } else if (*(unsigned char *)transa == 'T') {
+	s_copy(ca, "    CblasTrans", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14);
+    }
+    if (*iorder == 1) {
+	s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
+    }
+    printf("%6d: %s %.14s %.14s %.14s\n",*nc,sname,crc,cu,ca);
+    printf("(          %d %d (%4.1f,%4.1f) A %d (%4.1f,%4.1f) C %d\n",*n,*k,alpha->r,alpha->i,*lda,beta->r,beta->i,*ldc);
+    return 0;
+} /* cprcn4_ */
+
+
+
+/* Subroutine */ int cprcn6_(integer *nout, integer *nc, char *sname, integer 
+	*iorder, char *uplo, char *transa, integer *n, integer *k, real *
+	alpha, integer *lda, real *beta, integer *ldc)
+{
+    /* Trace helper (real alpha/beta variant): prints call number *nc, sname, the CBLAS names chosen from iorder/uplo/transa, then n, k, alpha, lda, beta, ldc. Always returns 0; nout is not used (output goes to stdout via printf). */
+    char ca[14], cu[14], crc[14];
+    /* Each buffer is exactly 14 bytes and s_copy is asked to fill all 14, so there is no room for a NUL terminator: print with a bounded "%.14s". sname keeps "%s" (assumes a NUL-terminated name -- TODO confirm). */
+    if (*(unsigned char *)uplo == 'U') {
+	s_copy(cu, "    CblasUpper", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(cu, "    CblasLower", (ftnlen)14, (ftnlen)14);
+    }
+    if (*(unsigned char *)transa == 'N') {
+	s_copy(ca, "  CblasNoTrans", (ftnlen)14, (ftnlen)14);
+    } else if (*(unsigned char *)transa == 'T') {
+	s_copy(ca, "    CblasTrans", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14);
+    }
+    if (*iorder == 1) {
+	s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
+    }
+    printf("%6d: %s %.14s %.14s %.14s\n",*nc,sname,crc,cu,ca);
+    printf("(          %d %d %4.1f A %d %4.1f C %d\n",*n,*k,*alpha,*lda,*beta,*ldc);
+    return 0;
+} /* cprcn6_ */
+
+
+/* Subroutine */ int cchk5_(char *sname, real *eps, real *thresh, integer *
+	nout, integer *ntra, logical *trace, logical *rewi, logical *fatal, 
+	integer *nidim, integer *idim, integer *nalf, complex *alf, integer *
+	nbet, complex *bet, integer *nmax, complex *ab, complex *aa, complex *
+	as, complex *bb, complex *bs, complex *c__, complex *cc, complex *cs, 
+	complex *ct, real *g, complex *w, integer *iorder)
+{
+    /* Initialized data */
+
+    static char icht[2] = "NC";
+    static char ichu[2] = "UL";
+
+
+    /* System generated locals */
+    integer c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8;
+    complex q__1, q__2;
+
+    /* Local variables */
+    integer jjab;
+    complex beta;
+    integer ldas, ldbs, ldcs;
+    logical same, conj;
+    complex bets;
+    logical tran, null;
+    char uplo[1];
+    integer i__, j, k, n;
+    extern /* Subroutine */ int cmake_(char *, char *, char *, integer *, 
+	    integer *, complex *, integer *, complex *, integer *, logical *, 
+	    complex *);
+    complex alpha;
+    extern /* Subroutine */ int cmmch_(char *, char *, integer *, integer *, 
+	    integer *, complex *, complex *, integer *, complex *, integer *, 
+	    complex *, complex *, integer *, complex *, real *, complex *, 
+	    integer *, real *, real *, logical *, integer *, logical *);
+    real rbeta;
+    logical isame[13];
+    integer nargs;
+    real rbets;
+    logical reset;
+    char trans[1];
+    logical upper;
+    char uplos[1];
+    extern /* Subroutine */ int cprcn5_(integer *, integer *, char *, integer 
+	    *, char *, char *, integer *, integer *, complex *, integer *, 
+	    integer *, complex *, integer *), cprcn7_(
+	    integer *, integer *, char *, integer *, char *, char *, integer *
+	    , integer *, complex *, integer *, integer *, real *, integer *);
+    integer ia, ib, jc, ma, na, nc, ik, in, jj, lj, ks, ns;
+    extern logical lceres_(char *, char *, integer *, integer *, complex *, 
+	    complex *, integer *);
+    real errmax;
+    char transs[1], transt[1];
+    extern /* Subroutine */ int ccher2k_(integer *, char *, char *, integer *,
+	     integer *, complex *, complex *, integer *, complex *, integer *,
+	     real *, complex *, integer *);
+    integer laa, lbb, lda, lcc, ldb, ldc;
+    extern logical lce_(complex *, complex *, integer *);
+    extern /* Subroutine */ int ccsyr2k_(integer *, char *, char *, integer *,
+	     integer *, complex *, complex *, integer *, complex *, integer *,
+	     complex *, complex *, integer *);
+    complex als;
+    integer ict, icu;
+    real err;
+
+/*  Tests CHER2K and CSYR2K. */
+/*  sname + 7 == "he" (conj) selects ccher2k_ with real rbeta; otherwise ccsyr2k_ with complex beta. */
+/*  Auxiliary routine for test program for Level 3 Blas. */
+/*  When a check fails *fatal is set (here or by cmmch_) and the offending call is printed; always returns 0. */
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+    /* Parameter adjustments */
+    --idim;
+    --alf;
+    --bet;
+    --w;
+    --g;
+    --ct;
+    --cs;
+    --cc;
+    c_dim1 = *nmax;
+    c_offset = 1 + c_dim1 * 1;
+    c__ -= c_offset;
+    --bs;
+    --bb;
+    --as;
+    --aa;
+    --ab;
+
+    /* Function Body */
+    conj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0;
+/*     nargs is the number of isame[] entries (isame[0..11]) compared after each call. */
+    nargs = 12;
+    nc = 0;
+    reset = TRUE_;
+    errmax = 0.f;
+
+    i__1 = *nidim;
+    for (in = 1; in <= i__1; ++in) {
+	n = idim[in];
+/*        Set LDC to 1 more than minimum value if room. */
+	ldc = n;
+	if (ldc < *nmax) {
+	    ++ldc;
+	}
+/*        Skip tests if not enough room. */
+	if (ldc > *nmax) {
+	    goto L130;
+	}
+	lcc = ldc * n;
+
+	i__2 = *nidim;
+	for (ik = 1; ik <= i__2; ++ik) {
+	    k = idim[ik];
+
+	    for (ict = 1; ict <= 2; ++ict) {
+		*(unsigned char *)trans = *(unsigned char *)&icht[ict - 1];
+		tran = *(unsigned char *)trans == 'C';
+		if (tran && ! conj) {
+		    *(unsigned char *)trans = 'T';
+		}
+		if (tran) {
+		    ma = k;
+		    na = n;
+		} else {
+		    ma = n;
+		    na = k;
+		}
+/*              Set LDA to 1 more than minimum value if room. */
+		lda = ma;
+		if (lda < *nmax) {
+		    ++lda;
+		}
+/*              Skip tests if not enough room. */
+		if (lda > *nmax) {
+		    goto L110;
+		}
+		laa = lda * na;
+
+/*              Generate the matrix A. */
+
+		if (tran) {
+		    i__3 = *nmax << 1;
+		    cmake_("ge", " ", " ", &ma, &na, &ab[1], &i__3, &aa[1], &
+			    lda, &reset, &c_b1);
+		} else {
+		    cmake_("ge", " ", " ", &ma, &na, &ab[1], nmax, &aa[1], &
+			    lda, &reset, &c_b1);
+		}
+
+/*              Generate the matrix B. */
+
+		ldb = lda;
+		lbb = laa;
+		if (tran) {
+		    i__3 = *nmax << 1;
+		    cmake_("ge", " ", " ", &ma, &na, &ab[k + 1], &i__3, &bb[1]
+			    , &ldb, &reset, &c_b1);
+		} else {
+		    cmake_("ge", " ", " ", &ma, &na, &ab[k * *nmax + 1], nmax,
+			     &bb[1], &ldb, &reset, &c_b1);
+		}
+
+		for (icu = 1; icu <= 2; ++icu) {
+		    *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1];
+		    upper = *(unsigned char *)uplo == 'U';
+
+		    i__3 = *nalf;
+		    for (ia = 1; ia <= i__3; ++ia) {
+			i__4 = ia;
+			alpha.r = alf[i__4].r, alpha.i = alf[i__4].i;
+
+			i__4 = *nbet;
+			for (ib = 1; ib <= i__4; ++ib) {
+			    i__5 = ib;
+			    beta.r = bet[i__5].r, beta.i = bet[i__5].i;
+			    if (conj) {
+				rbeta = beta.r;
+				q__1.r = rbeta, q__1.i = 0.f;
+				beta.r = q__1.r, beta.i = q__1.i;
+			    }
+			    null = n <= 0;
+			    if (conj) {
+				null = null || ((k <= 0 || (alpha.r == 0.f && 
+					alpha.i == 0.f)) && rbeta == 1.f);
+			    }
+
+/*                       Generate the matrix C. */
+
+			    cmake_(sname + 7, uplo, " ", &n, &n, &c__[
+				    c_offset], nmax, &cc[1], &ldc, &reset, &
+				    c_b1);
+
+			    ++nc;
+
+/*                       Save every datum before calling the subroutine. */
+
+			    *(unsigned char *)uplos = *(unsigned char *)uplo;
+			    *(unsigned char *)transs = *(unsigned char *)
+				    trans;
+			    ns = n;
+			    ks = k;
+			    als.r = alpha.r, als.i = alpha.i;
+			    i__5 = laa;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				i__6 = i__;
+				i__7 = i__;
+				as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7]
+					.i;
+/* L10: */
+			    }
+			    ldas = lda;
+			    i__5 = lbb;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				i__6 = i__;
+				i__7 = i__;
+				bs[i__6].r = bb[i__7].r, bs[i__6].i = bb[i__7]
+					.i;
+/* L20: */
+			    }
+			    ldbs = ldb;
+			    if (conj) {
+				rbets = rbeta;
+			    } else {
+				bets.r = beta.r, bets.i = beta.i;
+			    }
+			    i__5 = lcc;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				i__6 = i__;
+				i__7 = i__;
+				cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7]
+					.i;
+/* L30: */
+			    }
+			    ldcs = ldc;
+
+/*                       Call the subroutine. */
+
+			    if (conj) {
+				if (*trace) {
+				    cprcn7_(ntra, &nc, sname, iorder, uplo, 
+					    trans, &n, &k, &alpha, &lda, &ldb,
+					     &rbeta, &ldc);
+				}
+				if (*rewi) {
+/*				    al__1.aerr = 0;
+				    al__1.aunit = *ntra;
+				    f_rew(&al__1);*/
+				}
+				ccher2k_(iorder, uplo, trans, &n, &k, &alpha, 
+					&aa[1], &lda, &bb[1], &ldb, &rbeta, &
+					cc[1], &ldc);
+			    } else {
+				if (*trace) {
+				    cprcn5_(ntra, &nc, sname, iorder, uplo, 
+					    trans, &n, &k, &alpha, &lda, &ldb,
+					     &beta, &ldc);
+				}
+				if (*rewi) {
+/*				    al__1.aerr = 0;
+				    al__1.aunit = *ntra;
+				    f_rew(&al__1);*/
+				}
+				ccsyr2k_(iorder, uplo, trans, &n, &k, &alpha, 
+					&aa[1], &lda, &bb[1], &ldb, &beta, &
+					cc[1], &ldc);
+			    }
+
+/*                       Check if error-exit was taken incorrectly. */
+
+			    if (! infoc_1.ok) {
+				printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n");
+				*fatal = TRUE_;
+				goto L150;
+			    }
+
+/*                       See what data changed inside subroutines. */
+
+			    isame[0] = *(unsigned char *)uplos == *(unsigned 
+				    char *)uplo;
+			    isame[1] = *(unsigned char *)transs == *(unsigned 
+				    char *)trans;
+			    isame[2] = ns == n;
+			    isame[3] = ks == k;
+			    isame[4] = als.r == alpha.r && als.i == alpha.i;
+			    isame[5] = lce_(&as[1], &aa[1], &laa);
+			    isame[6] = ldas == lda;
+			    isame[7] = lce_(&bs[1], &bb[1], &lbb);
+			    isame[8] = ldbs == ldb;
+			    if (conj) {
+				isame[9] = rbets == rbeta;
+			    } else {
+				isame[9] = bets.r == beta.r && bets.i == 
+					beta.i;
+			    }
+			    if (null) {
+				isame[10] = lce_(&cs[1], &cc[1], &lcc);
+			    } else {
+				isame[10] = lceres_("he", uplo, &n, &n, &cs[1]
+					, &cc[1], &ldc);
+			    }
+			    isame[11] = ldcs == ldc;
+
+/*                       If data was incorrectly changed, report and */
+/*                       return. */
+
+			    same = TRUE_;
+			    i__5 = nargs;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				same = same && isame[i__ - 1];
+				if (! isame[i__ - 1]) {
+				    printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__);
+				}
+/* L40: */
+			    }
+			    if (! same) {
+				*fatal = TRUE_;
+				goto L150;
+			    }
+
+			    if (! null) {
+
+/*                          Check the result column by column. */
+
+				if (conj) {
+				    *(unsigned char *)transt = 'C';
+				} else {
+				    *(unsigned char *)transt = 'T';
+				}
+				jjab = 1;
+				jc = 1;
+				i__5 = n;
+				for (j = 1; j <= i__5; ++j) {
+				    if (upper) {
+					jj = 1;
+					lj = j;
+				    } else {
+					jj = j;
+					lj = n - j + 1;
+				    }
+				    if (tran) {
+					i__6 = k;
+					for (i__ = 1; i__ <= i__6; ++i__) {
+					    i__7 = i__;
+					    i__8 = ((j - 1) << 1) * *nmax + k + 
+						    i__;
+					    q__1.r = alpha.r * ab[i__8].r - 
+						    alpha.i * ab[i__8].i, 
+						    q__1.i = alpha.r * ab[
+						    i__8].i + alpha.i * ab[
+						    i__8].r;
+					    w[i__7].r = q__1.r, w[i__7].i = 
+						    q__1.i;
+					    if (conj) {
+			  i__7 = k + i__;
+			  r_cnjg(&q__2, &alpha);
+			  i__8 = ((j - 1) << 1) * *nmax + i__;
+			  q__1.r = q__2.r * ab[i__8].r - q__2.i * ab[i__8].i, 
+				  q__1.i = q__2.r * ab[i__8].i + q__2.i * ab[
+				  i__8].r;
+			  w[i__7].r = q__1.r, w[i__7].i = q__1.i;
+					    } else {
+			  i__7 = k + i__;
+			  i__8 = ((j - 1) << 1) * *nmax + i__;
+			  q__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8]
+				  .i, q__1.i = alpha.r * ab[i__8].i + alpha.i 
+				  * ab[i__8].r;
+			  w[i__7].r = q__1.r, w[i__7].i = q__1.i;
+					    }
+/* L50: */
+					}
+					i__6 = k << 1;
+					i__7 = *nmax << 1;
+					i__8 = *nmax << 1;
+					cmmch_(transt, "N", &lj, &c__1, &i__6,
+						 &c_b2, &ab[jjab], &i__7, &w[
+						1], &i__8, &beta, &c__[jj + j 
+						* c_dim1], nmax, &ct[1], &g[1]
+						, &cc[jc], &ldc, eps, &err, 
+						fatal, nout, &c_true);
+				    } else {
+					i__6 = k;
+					for (i__ = 1; i__ <= i__6; ++i__) {
+					    if (conj) {
+			  i__7 = i__;
+			  r_cnjg(&q__2, &ab[(k + i__ - 1) * *nmax + j]);
+			  q__1.r = alpha.r * q__2.r - alpha.i * q__2.i, 
+				  q__1.i = alpha.r * q__2.i + alpha.i * 
+				  q__2.r;
+			  w[i__7].r = q__1.r, w[i__7].i = q__1.i;
+			  i__7 = k + i__;
+			  i__8 = (i__ - 1) * *nmax + j;
+			  q__2.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8]
+				  .i, q__2.i = alpha.r * ab[i__8].i + alpha.i 
+				  * ab[i__8].r;
+			  r_cnjg(&q__1, &q__2);
+			  w[i__7].r = q__1.r, w[i__7].i = q__1.i;
+					    } else {
+			  i__7 = i__;
+			  i__8 = (k + i__ - 1) * *nmax + j;
+			  q__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8]
+				  .i, q__1.i = alpha.r * ab[i__8].i + alpha.i 
+				  * ab[i__8].r;
+			  w[i__7].r = q__1.r, w[i__7].i = q__1.i;
+			  i__7 = k + i__;
+			  i__8 = (i__ - 1) * *nmax + j;
+			  q__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8]
+				  .i, q__1.i = alpha.r * ab[i__8].i + alpha.i 
+				  * ab[i__8].r;
+			  w[i__7].r = q__1.r, w[i__7].i = q__1.i;
+					    }
+/* L60: */
+					}
+					i__6 = k << 1;
+					i__7 = *nmax << 1;
+					cmmch_("N", "N", &lj, &c__1, &i__6, &
+						c_b2, &ab[jj], nmax, &w[1], &
+						i__7, &beta, &c__[jj + j * 
+						c_dim1], nmax, &ct[1], &g[1], 
+						&cc[jc], &ldc, eps, &err, 
+						fatal, nout, &c_true);
+				    }
+				    if (upper) {
+					jc += ldc;
+				    } else {
+					jc = jc + ldc + 1;
+					if (tran) {
+					    jjab += *nmax << 1;
+					}
+				    }
+				    errmax = f2cmax(errmax,err);
+/*                             If got really bad answer, report and */
+/*                             return. */
+				    if (*fatal) {
+					goto L140;
+				    }
+/* L70: */
+				}
+			    }
+
+/* L80: */
+			}
+
+/* L90: */
+		    }
+
+/* L100: */
+		}
+
+L110:
+		;
+	    }
+
+/* L120: */
+	}
+
+L130:
+	;
+    }
+
+/*     Report result. */
+/*     Every report line ends in a real newline ("\n") so successive messages do not run together. */
+    if (errmax < *thresh) {
+	if (*iorder == 0) {
+            printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+	}
+	if (*iorder == 1) {
+            printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+	}
+    } else {
+	if (*iorder == 0) {
+            printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+	    printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******\n",errmax);
+	}
+	if (*iorder == 1) {
+            printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+	    printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******\n",errmax);
+	}
+    }
+    goto L160;
+
+L140:
+    if (n > 1) {
+        printf("      THESE ARE THE RESULTS FOR COLUMN %d:\n",j);
+    }
+
+L150:
+    printf(" ******* %s FAILED ON CALL NUMBER:\n",sname);
+    if (conj) {
+	cprcn7_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, &
+		ldb, &rbeta, &ldc);
+    } else {
+	cprcn5_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, &
+		ldb, &beta, &ldc);
+    }
+
+L160:
+    return 0;
+
+/* 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */
+/*     $      '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1, */
+/*     $      ', C,', I3, ')           .' ) */
+/* 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */
+/*     $      '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, */
+/*     $      ',', F4.1, '), C,', I3, ')    .' ) */
+
+/*     End of CCHK5. */
+
+} /* cchk5_ */
+
+
+/* Subroutine */ int cprcn5_(integer *nout, integer *nc, char *sname, integer 
+	*iorder, char *uplo, char *transa, integer *n, integer *k, complex *
+	alpha, integer *lda, integer *ldb, complex *beta, integer *ldc)
+{
+    /* Trace helper: prints call number *nc, sname, the CBLAS names chosen from iorder/uplo/transa, then n, k, complex alpha, lda, ldb, complex beta, ldc. Always returns 0; nout is not used (output goes to stdout via printf). */
+    /* Local variables */
+    char ca[14], cu[14], crc[14];
+    /* Each buffer is exactly 14 bytes and s_copy is asked to fill all 14, so there is no room for a NUL terminator: print with a bounded "%.14s". sname keeps "%s" (assumes a NUL-terminated name -- TODO confirm). */
+    if (*(unsigned char *)uplo == 'U') {
+	s_copy(cu, "    CblasUpper", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(cu, "    CblasLower", (ftnlen)14, (ftnlen)14);
+    }
+    if (*(unsigned char *)transa == 'N') {
+	s_copy(ca, "  CblasNoTrans", (ftnlen)14, (ftnlen)14);
+    } else if (*(unsigned char *)transa == 'T') {
+	s_copy(ca, "    CblasTrans", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14);
+    }
+    if (*iorder == 1) {
+	s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
+    }
+    printf("%6d: %s %.14s %.14s %.14s\n",*nc,sname,crc,cu,ca);
+    printf("%d %d (%4.1f,%4.1f) , A, %d, B, %d, (%4.1f,%4.1f) , C, %d.\n",*n,*k,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc);
+    return 0;
+} /* cprcn5_ */
+
+
+
+/* Subroutine */ int cprcn7_(integer *nout, integer *nc, char *sname, integer 
+	*iorder, char *uplo, char *transa, integer *n, integer *k, complex *
+	alpha, integer *lda, integer *ldb, real *beta, integer *ldc)
+{
+    /* Prints call number *nc, sname, the CBLAS order/uplo/trans names and the */
+    /* numeric arguments (real beta).  nout is unused; sname is assumed NUL-terminated (TODO confirm). */
+    char ca[14], cu[14], crc[14];
+    if (*(unsigned char *)uplo == 'U') {
+	s_copy(cu, "    CblasUpper", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(cu, "    CblasLower", (ftnlen)14, (ftnlen)14);
+    }
+    if (*(unsigned char *)transa == 'N') {
+	s_copy(ca, "  CblasNoTrans", (ftnlen)14, (ftnlen)14);
+    } else if (*(unsigned char *)transa == 'T') {
+	s_copy(ca, "    CblasTrans", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14);
+    }
+    if (*iorder == 1) {
+	s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
+    }
+    /* crc/cu/ca hold exactly 14 chars with no room for a NUL: bound each with %.14s. */
+    printf("%6d: %s %.14s %.14s %.14s\n",*nc,sname,crc,cu,ca);
+    printf("%d %d (%4.1f,%4.1f), A, %d, B, %d, %4.1f, C, %d.\n",*n,*k,alpha->r,alpha->i,*lda,*ldb,*beta,*ldc);
+    return 0;
+} /* cprcn7_ */
+
+
+/* Subroutine */ int cmake_(char *type__, char *uplo, char *diag, integer *m, 
+	integer *n, complex *a, integer *nmax, complex *aa, integer *lda, 
+	logical *reset, complex *transl)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+    real r__1;
+    complex q__1, q__2;
+
+    /* Local variables */
+    extern /* Complex */ VOID cbeg_(complex *, logical *);
+    integer ibeg, iend;
+    logical unit;
+    integer i__, j;
+    logical lower, upper;
+    integer jj;
+    logical gen, her, tri, sym;
+
+/*  Generates values for an M by N matrix A (leading dimension NMAX): each */
+/*  generated entry is the next CBEG value plus TRANSL. */
+/*  Stores the values in the array AA (leading dimension LDA) in the data */
+/*  structure required by the routine, with unwanted elements set to the */
+/*  rogue value (-1e10, 1e10). */
+/*  TYPE is 'ge', 'he', 'sy' or 'tr'.  UPLO is read only for 'he', 'sy' and */
+/*  'tr'; DIAG ('U' = unit diagonal) is read only for 'tr'.  Returns 0. */
+/*  For 'he' the imaginary part of each stored diagonal entry of AA is -1e10. */
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+    /* Parameter adjustments: shift a and aa so they are indexed from 1. */
+    a_dim1 = *nmax;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --aa;
+
+    /* Function Body */
+    gen = s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0;
+    her = s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) == 0;
+    sym = s_cmp(type__, "sy", (ftnlen)2, (ftnlen)2) == 0;
+    tri = s_cmp(type__, "tr", (ftnlen)2, (ftnlen)2) == 0;
+    upper = (her || sym || tri) && *(unsigned char *)uplo == 'U';
+    lower = (her || sym || tri) && *(unsigned char *)uplo == 'L';
+    unit = tri && *(unsigned char *)diag == 'U';
+
+/*     Generate data in array A.  For 'he', 'sy' and 'tr' only the UPLO */
+/*     triangle is generated; off-diagonal A(j,i) is derived from A(i,j). */
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+	i__2 = *m;
+	for (i__ = 1; i__ <= i__2; ++i__) {
+	    if (gen || (upper && i__ <= j) || (lower && i__ >= j)) {
+		i__3 = i__ + j * a_dim1;
+		cbeg_(&q__2, reset);
+		q__1.r = q__2.r + transl->r, q__1.i = q__2.i + transl->i;
+		a[i__3].r = q__1.r, a[i__3].i = q__1.i;
+		if (i__ != j) {
+/*                 Set some elements to zero: column N/2 when N > 3 */
+		    if (*n > 3 && j == *n / 2) {
+			i__3 = i__ + j * a_dim1;
+			a[i__3].r = 0.f, a[i__3].i = 0.f;
+		    }
+		    if (her) {
+			i__3 = j + i__ * a_dim1;
+			r_cnjg(&q__1, &a[i__ + j * a_dim1]);
+			a[i__3].r = q__1.r, a[i__3].i = q__1.i;
+		    } else if (sym) {
+			i__3 = j + i__ * a_dim1;
+			i__4 = i__ + j * a_dim1;
+			a[i__3].r = a[i__4].r, a[i__3].i = a[i__4].i;
+		    } else if (tri) {
+			i__3 = j + i__ * a_dim1;
+			a[i__3].r = 0.f, a[i__3].i = 0.f;
+		    }
+		}
+	    }
+/* L10: */
+	}
+	if (her) {
+	    i__2 = j + j * a_dim1;
+	    i__3 = j + j * a_dim1;
+	    r__1 = a[i__3].r;
+	    q__1.r = r__1, q__1.i = 0.f;
+	    a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+	}
+	if (tri) {
+	    i__2 = j + j * a_dim1;
+	    i__3 = j + j * a_dim1;
+	    q__1.r = a[i__3].r + 1.f, q__1.i = a[i__3].i + 0.f;
+	    a[i__2].r = q__1.r, a[i__2].i = q__1.i;
+	}
+	if (unit) {
+	    i__2 = j + j * a_dim1;
+	    a[i__2].r = 1.f, a[i__2].i = 0.f;
+	}
+/* L20: */
+    }
+/*     Store elements in array AA in data structure required by routine. */
+/*     Rows outside the stored part of a column get the rogue value; with a */
+/*     unit diagonal the diagonal itself is left out of the stored part. */
+    if (s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0) {
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = *m;
+	    for (i__ = 1; i__ <= i__2; ++i__) {
+		i__3 = i__ + (j - 1) * *lda;
+		i__4 = i__ + j * a_dim1;
+		aa[i__3].r = a[i__4].r, aa[i__3].i = a[i__4].i;
+/* L30: */
+	    }
+	    i__2 = *lda;
+	    for (i__ = *m + 1; i__ <= i__2; ++i__) {
+		i__3 = i__ + (j - 1) * *lda;
+		aa[i__3].r = -1e10f, aa[i__3].i = 1e10f;
+/* L40: */
+	    }
+/* L50: */
+	}
+    } else if (s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(type__,
+	     "sy", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(type__, "tr", (ftnlen)
+	    2, (ftnlen)2) == 0) {
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    if (upper) {
+		ibeg = 1;
+		if (unit) {
+		    iend = j - 1;
+		} else {
+		    iend = j;
+		}
+	    } else {
+		if (unit) {
+		    ibeg = j + 1;
+		} else {
+		    ibeg = j;
+		}
+		iend = *n;
+	    }
+	    i__2 = ibeg - 1;
+	    for (i__ = 1; i__ <= i__2; ++i__) {
+		i__3 = i__ + (j - 1) * *lda;
+		aa[i__3].r = -1e10f, aa[i__3].i = 1e10f;
+/* L60: */
+	    }
+	    i__2 = iend;
+	    for (i__ = ibeg; i__ <= i__2; ++i__) {
+		i__3 = i__ + (j - 1) * *lda;
+		i__4 = i__ + j * a_dim1;
+		aa[i__3].r = a[i__4].r, aa[i__3].i = a[i__4].i;
+/* L70: */
+	    }
+	    i__2 = *lda;
+	    for (i__ = iend + 1; i__ <= i__2; ++i__) {
+		i__3 = i__ + (j - 1) * *lda;
+		aa[i__3].r = -1e10f, aa[i__3].i = 1e10f;
+/* L80: */
+	    }
+	    if (her) {
+		jj = j + (j - 1) * *lda;
+		i__2 = jj;
+		i__3 = jj;
+		r__1 = aa[i__3].r;
+		q__1.r = r__1, q__1.i = -1e10f;
+		aa[i__2].r = q__1.r, aa[i__2].i = q__1.i;
+	    }
+/* L90: */
+	}
+    }
+    return 0;
+
+/*     End of CMAKE. */
+
+} /* cmake_ */
+
+/* Subroutine */ int cmmch_(char *transa, char *transb, integer *m, integer *
+	n, integer *kk, complex *alpha, complex *a, integer *lda, complex *b, 
+	integer *ldb, complex *beta, complex *c__, integer *ldc, complex *ct, 
+	real *g, complex *cc, integer *ldcc, real *eps, real *err, logical *
+	fatal, integer *nout, logical *mv)
+{
+
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, 
+	    cc_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7;
+    real r__1, r__2, r__3, r__4, r__5, r__6;
+    complex q__1, q__2, q__3, q__4;
+
+    /* Local variables */
+    real erri;
+    integer i__, j, k;
+    logical trana, tranb, ctrana, ctranb;
+/*  Checks the results of the computational tests. */
+/*  CC is compared, one column at a time, with alpha*op(A)*op(B) + beta*C. */
+/*  ERR returns the largest error ratio of the last column checked; FATAL is */
+/*  set to TRUE_ when ERR*sqrt(EPS) >= 1.  NOUT is not referenced.  Returns 0. */
+/*  Auxiliary routine for test program for Level 3 Blas. */
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+    /* Parameter adjustments: shift the array pointers so they are indexed from 1. */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    c_dim1 = *ldc;
+    c_offset = 1 + c_dim1 * 1;
+    c__ -= c_offset;
+    --ct;
+    --g;
+    cc_dim1 = *ldcc;
+    cc_offset = 1 + cc_dim1 * 1;
+    cc -= cc_offset;
+
+    /* Function Body */
+    trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == 
+	    'C';
+    tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == 
+	    'C';
+    ctrana = *(unsigned char *)transa == 'C';
+    ctranb = *(unsigned char *)transb == 'C';
+/*     NOTE(review): abs() is applied to real values below; presumably the f2c macro, not the integer abs() - confirm. */
+/*     Compute expected result, one column at a time, in CT using data */
+/*     in A, B and C. */
+/*     Compute gauges in G. */
+/*     G(i) = |alpha| * sum_k |a|*|b| + |beta| * |c(i,j)|, with |z| = |Re z| + |Im z|. */
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+
+	i__2 = *m;
+	for (i__ = 1; i__ <= i__2; ++i__) {
+	    i__3 = i__;
+	    ct[i__3].r = 0.f, ct[i__3].i = 0.f;
+	    g[i__] = 0.f;
+/* L10: */
+	}
+	if (! trana && ! tranb) {
+	    i__2 = *kk;
+	    for (k = 1; k <= i__2; ++k) {
+		i__3 = *m;
+		for (i__ = 1; i__ <= i__3; ++i__) {
+		    i__4 = i__;
+		    i__5 = i__;
+		    i__6 = i__ + k * a_dim1;
+		    i__7 = k + j * b_dim1;
+		    q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7].i, 
+			    q__2.i = a[i__6].r * b[i__7].i + a[i__6].i * b[
+			    i__7].r;
+		    q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + 
+			    q__2.i;
+		    ct[i__4].r = q__1.r, ct[i__4].i = q__1.i;
+		    i__4 = i__ + k * a_dim1;
+		    i__5 = k + j * b_dim1;
+		    g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = r_imag(
+			    &a[i__ + k * a_dim1]), abs(r__2))) * ((r__3 = b[
+			    i__5].r, abs(r__3)) + (r__4 = r_imag(&b[k + j * 
+			    b_dim1]), abs(r__4)));
+/* L20: */
+		}
+/* L30: */
+	    }
+	} else if (trana && ! tranb) {
+	    if (ctrana) {
+		i__2 = *kk;
+		for (k = 1; k <= i__2; ++k) {
+		    i__3 = *m;
+		    for (i__ = 1; i__ <= i__3; ++i__) {
+			i__4 = i__;
+			i__5 = i__;
+			r_cnjg(&q__3, &a[k + i__ * a_dim1]);
+			i__6 = k + j * b_dim1;
+			q__2.r = q__3.r * b[i__6].r - q__3.i * b[i__6].i, 
+				q__2.i = q__3.r * b[i__6].i + q__3.i * b[i__6]
+				.r;
+			q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + 
+				q__2.i;
+			ct[i__4].r = q__1.r, ct[i__4].i = q__1.i;
+			i__4 = k + i__ * a_dim1;
+			i__5 = k + j * b_dim1;
+			g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = 
+				r_imag(&a[k + i__ * a_dim1]), abs(r__2))) * ((
+				r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag(
+				&b[k + j * b_dim1]), abs(r__4)));
+/* L40: */
+		    }
+/* L50: */
+		}
+	    } else {
+		i__2 = *kk;
+		for (k = 1; k <= i__2; ++k) {
+		    i__3 = *m;
+		    for (i__ = 1; i__ <= i__3; ++i__) {
+			i__4 = i__;
+			i__5 = i__;
+			i__6 = k + i__ * a_dim1;
+			i__7 = k + j * b_dim1;
+			q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7]
+				.i, q__2.i = a[i__6].r * b[i__7].i + a[i__6]
+				.i * b[i__7].r;
+			q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + 
+				q__2.i;
+			ct[i__4].r = q__1.r, ct[i__4].i = q__1.i;
+			i__4 = k + i__ * a_dim1;
+			i__5 = k + j * b_dim1;
+			g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = 
+				r_imag(&a[k + i__ * a_dim1]), abs(r__2))) * ((
+				r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag(
+				&b[k + j * b_dim1]), abs(r__4)));
+/* L60: */
+		    }
+/* L70: */
+		}
+	    }
+	} else if (! trana && tranb) {
+	    if (ctranb) {
+		i__2 = *kk;
+		for (k = 1; k <= i__2; ++k) {
+		    i__3 = *m;
+		    for (i__ = 1; i__ <= i__3; ++i__) {
+			i__4 = i__;
+			i__5 = i__;
+			i__6 = i__ + k * a_dim1;
+			r_cnjg(&q__3, &b[j + k * b_dim1]);
+			q__2.r = a[i__6].r * q__3.r - a[i__6].i * q__3.i, 
+				q__2.i = a[i__6].r * q__3.i + a[i__6].i * 
+				q__3.r;
+			q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + 
+				q__2.i;
+			ct[i__4].r = q__1.r, ct[i__4].i = q__1.i;
+			i__4 = i__ + k * a_dim1;
+			i__5 = j + k * b_dim1;
+			g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = 
+				r_imag(&a[i__ + k * a_dim1]), abs(r__2))) * ((
+				r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag(
+				&b[j + k * b_dim1]), abs(r__4)));
+/* L80: */
+		    }
+/* L90: */
+		}
+	    } else {
+		i__2 = *kk;
+		for (k = 1; k <= i__2; ++k) {
+		    i__3 = *m;
+		    for (i__ = 1; i__ <= i__3; ++i__) {
+			i__4 = i__;
+			i__5 = i__;
+			i__6 = i__ + k * a_dim1;
+			i__7 = j + k * b_dim1;
+			q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7]
+				.i, q__2.i = a[i__6].r * b[i__7].i + a[i__6]
+				.i * b[i__7].r;
+			q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i + 
+				q__2.i;
+			ct[i__4].r = q__1.r, ct[i__4].i = q__1.i;
+			i__4 = i__ + k * a_dim1;
+			i__5 = j + k * b_dim1;
+			g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 = 
+				r_imag(&a[i__ + k * a_dim1]), abs(r__2))) * ((
+				r__3 = b[i__5].r, abs(r__3)) + (r__4 = r_imag(
+				&b[j + k * b_dim1]), abs(r__4)));
+/* L100: */
+		    }
+/* L110: */
+		}
+	    }
+	} else if (trana && tranb) {
+	    if (ctrana) {
+		if (ctranb) {
+		    i__2 = *kk;
+		    for (k = 1; k <= i__2; ++k) {
+			i__3 = *m;
+			for (i__ = 1; i__ <= i__3; ++i__) {
+			    i__4 = i__;
+			    i__5 = i__;
+			    r_cnjg(&q__3, &a[k + i__ * a_dim1]);
+			    r_cnjg(&q__4, &b[j + k * b_dim1]);
+			    q__2.r = q__3.r * q__4.r - q__3.i * q__4.i, 
+				    q__2.i = q__3.r * q__4.i + q__3.i * 
+				    q__4.r;
+			    q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i 
+				    + q__2.i;
+			    ct[i__4].r = q__1.r, ct[i__4].i = q__1.i;
+			    i__4 = k + i__ * a_dim1;
+			    i__5 = j + k * b_dim1;
+			    g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 =
+				     r_imag(&a[k + i__ * a_dim1]), abs(r__2)))
+				     * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 
+				    = r_imag(&b[j + k * b_dim1]), abs(r__4)));
+/* L120: */
+			}
+/* L130: */
+		    }
+		} else {
+		    i__2 = *kk;
+		    for (k = 1; k <= i__2; ++k) {
+			i__3 = *m;
+			for (i__ = 1; i__ <= i__3; ++i__) {
+			    i__4 = i__;
+			    i__5 = i__;
+			    r_cnjg(&q__3, &a[k + i__ * a_dim1]);
+			    i__6 = j + k * b_dim1;
+			    q__2.r = q__3.r * b[i__6].r - q__3.i * b[i__6].i, 
+				    q__2.i = q__3.r * b[i__6].i + q__3.i * b[
+				    i__6].r;
+			    q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i 
+				    + q__2.i;
+			    ct[i__4].r = q__1.r, ct[i__4].i = q__1.i;
+			    i__4 = k + i__ * a_dim1;
+			    i__5 = j + k * b_dim1;
+			    g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 =
+				     r_imag(&a[k + i__ * a_dim1]), abs(r__2)))
+				     * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 
+				    = r_imag(&b[j + k * b_dim1]), abs(r__4)));
+/* L140: */
+			}
+/* L150: */
+		    }
+		}
+	    } else {
+		if (ctranb) {
+		    i__2 = *kk;
+		    for (k = 1; k <= i__2; ++k) {
+			i__3 = *m;
+			for (i__ = 1; i__ <= i__3; ++i__) {
+			    i__4 = i__;
+			    i__5 = i__;
+			    i__6 = k + i__ * a_dim1;
+			    r_cnjg(&q__3, &b[j + k * b_dim1]);
+			    q__2.r = a[i__6].r * q__3.r - a[i__6].i * q__3.i, 
+				    q__2.i = a[i__6].r * q__3.i + a[i__6].i * 
+				    q__3.r;
+			    q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i 
+				    + q__2.i;
+			    ct[i__4].r = q__1.r, ct[i__4].i = q__1.i;
+			    i__4 = k + i__ * a_dim1;
+			    i__5 = j + k * b_dim1;
+			    g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 =
+				     r_imag(&a[k + i__ * a_dim1]), abs(r__2)))
+				     * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 
+				    = r_imag(&b[j + k * b_dim1]), abs(r__4)));
+/* L160: */
+			}
+/* L170: */
+		    }
+		} else {
+		    i__2 = *kk;
+		    for (k = 1; k <= i__2; ++k) {
+			i__3 = *m;
+			for (i__ = 1; i__ <= i__3; ++i__) {
+			    i__4 = i__;
+			    i__5 = i__;
+			    i__6 = k + i__ * a_dim1;
+			    i__7 = j + k * b_dim1;
+			    q__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[
+				    i__7].i, q__2.i = a[i__6].r * b[i__7].i + 
+				    a[i__6].i * b[i__7].r;
+			    q__1.r = ct[i__5].r + q__2.r, q__1.i = ct[i__5].i 
+				    + q__2.i;
+			    ct[i__4].r = q__1.r, ct[i__4].i = q__1.i;
+			    i__4 = k + i__ * a_dim1;
+			    i__5 = j + k * b_dim1;
+			    g[i__] += ((r__1 = a[i__4].r, abs(r__1)) + (r__2 =
+				     r_imag(&a[k + i__ * a_dim1]), abs(r__2)))
+				     * ((r__3 = b[i__5].r, abs(r__3)) + (r__4 
+				    = r_imag(&b[j + k * b_dim1]), abs(r__4)));
+/* L180: */
+			}
+/* L190: */
+		    }
+		}
+	    }
+	}
+	i__2 = *m;
+	for (i__ = 1; i__ <= i__2; ++i__) {
+	    i__3 = i__;
+	    i__4 = i__;
+	    q__2.r = alpha->r * ct[i__4].r - alpha->i * ct[i__4].i, q__2.i = 
+		    alpha->r * ct[i__4].i + alpha->i * ct[i__4].r;
+	    i__5 = i__ + j * c_dim1;
+	    q__3.r = beta->r * c__[i__5].r - beta->i * c__[i__5].i, q__3.i = 
+		    beta->r * c__[i__5].i + beta->i * c__[i__5].r;
+	    q__1.r = q__2.r + q__3.r, q__1.i = q__2.i + q__3.i;
+	    ct[i__3].r = q__1.r, ct[i__3].i = q__1.i;
+	    i__3 = i__ + j * c_dim1;
+	    g[i__] = ((r__1 = alpha->r, abs(r__1)) + (r__2 = r_imag(alpha), 
+		    abs(r__2))) * g[i__] + ((r__3 = beta->r, abs(r__3)) + (
+		    r__4 = r_imag(beta), abs(r__4))) * ((r__5 = c__[i__3].r, 
+		    abs(r__5)) + (r__6 = r_imag(&c__[i__ + j * c_dim1]), abs(
+		    r__6)));
+/* L200: */
+	}
+/*        Compute the error ratio for this result: the largest value of */
+/*        |CT(i) - CC(i,j)| / EPS, divided by G(i) when G(i) is nonzero. */
+/*        ERR restarts from zero for every column. */
+	*err = 0.f;
+	i__2 = *m;
+	for (i__ = 1; i__ <= i__2; ++i__) {
+	    i__3 = i__;
+	    i__4 = i__ + j * cc_dim1;
+	    q__2.r = ct[i__3].r - cc[i__4].r, q__2.i = ct[i__3].i - cc[i__4]
+		    .i;
+	    q__1.r = q__2.r, q__1.i = q__2.i;
+	    erri = ((r__1 = q__1.r, abs(r__1)) + (r__2 = r_imag(&q__1), abs(
+		    r__2))) / *eps;
+	    if (g[i__] != 0.f) {
+		erri /= g[i__];
+	    }
+	    *err = f2cmax(*err,erri);
+	    if (*err * sqrt(*eps) >= 1.f) {
+		goto L230;
+	    }
+/* L210: */
+	}
+
+/* L220: */
+    }
+
+/*     If the loop completes, all results are at least half accurate. */
+    goto L250;
+
+/*     Report fatal error.  With MV set each row prints CT then CC, */
+/*     otherwise CC then CT. */
+L230:
+    *fatal = TRUE_;
+    printf(" ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HALF ACCURATE *******\n");
+    printf("         EXPECTED RESULT                    COMPUTED RESULT\n");
+    i__1 = *m;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	if (*mv) {
+            printf("%7d (%15.6g,%15.6g) (%15.6g,%15.6g)\n",i__,ct[i__].r,ct[i__].i,cc[i__+j*cc_dim1].r,cc[i__+j*cc_dim1].i);
+	} else {
+            printf("%7d (%15.6g,%15.6g) (%15.6g,%15.6g)\n",i__,cc[i__+j*cc_dim1].r,cc[i__+j*cc_dim1].i,ct[i__].r,ct[i__].i);
+	}
+/* L240: */
+    }
+    if (*n > 1) {
+    	printf("      THESE ARE THE RESULTS FOR COLUMN %d\n",j);
+    }
+
+L250:
+    return 0;
+
+
+/*     End of CMMCH. */
+
+} /* cmmch_ */
+
+logical lce_(complex *ri, complex *rj, integer *lr)
+{
+    /* Local variables */
+    integer pos;
+    const integer count = *lr;
+
+
+/*  Tests if two arrays are identical. */
+
+/*  Compares the first LR elements of RI and RJ.  Returns FALSE_ at the */
+/*  first position whose real or imaginary parts differ, and TRUE_ when */
+/*  no such position exists (in particular when LR is not positive). */
+
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+    /* Function Body */
+
+    for (pos = 0; pos < count; ++pos) {
+	const complex *lhs = ri + pos;
+	const complex *rhs = rj + pos;
+
+/*        Exact comparison of both components. */
+
+	if (lhs->r != rhs->r) {
+	    return FALSE_;
+	}
+	if (lhs->i != rhs->i) {
+	    return FALSE_;
+	}
+    }
+
+/*     No differing element was found. */
+
+    return TRUE_;
+
+/*     End of LCE. */
+
+} /* lce_ */
+
+logical lceres_(char *type__, char *uplo, integer *m, integer *n, complex *aa,
+	 complex *as, integer *lda)
+{
+    /* Local variables */
+    integer row, col, first, last;
+    const integer ld = *lda;
+    const logical upper = *(unsigned char *)uplo == 'U';
+
+
+/*  Tests if selected elements in two arrays are equal. */
+
+/*  TYPE is 'ge' or 'he' or 'sy'. */
+
+/*  AA and AS are read as column-major arrays with leading dimension LDA. */
+/*  Returns FALSE_ at the first compared element whose real or imaginary */
+/*  part differs, TRUE_ otherwise (including any other TYPE, for which */
+/*  nothing is compared). */
+
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+    /* Function Body */
+    if (s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0) {
+
+/*        'ge': only rows M+1..LDA of columns 1..N are compared. */
+
+	for (col = 1; col <= *n; ++col) {
+	    const complex *lhs = aa + (col - 1) * ld;
+	    const complex *rhs = as + (col - 1) * ld;
+
+	    for (row = *m + 1; row <= ld; ++row) {
+		if (lhs[row - 1].r != rhs[row - 1].r) {
+		    return FALSE_;
+		}
+		if (lhs[row - 1].i != rhs[row - 1].i) {
+		    return FALSE_;
+		}
+	    }
+	}
+	return TRUE_;
+    }
+
+    if (s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) != 0 && s_cmp(type__,
+	     "sy", (ftnlen)2, (ftnlen)2) != 0) {
+
+/*        Neither 'ge', 'he' nor 'sy': nothing is compared. */
+
+	return TRUE_;
+    }
+
+/*     'he' or 'sy': rows first..last of each column are skipped, where */
+/*     first..last is 1..col for UPLO = 'U' and col..N otherwise. */
+
+    for (col = 1; col <= *n; ++col) {
+	const complex *lhs = aa + (col - 1) * ld;
+	const complex *rhs = as + (col - 1) * ld;
+
+	first = upper ? 1 : col;
+	last = upper ? col : *n;
+
+/*        Rows before the skipped range. */
+
+	for (row = 1; row < first; ++row) {
+	    if (lhs[row - 1].r != rhs[row - 1].r) {
+		return FALSE_;
+	    }
+	    if (lhs[row - 1].i != rhs[row - 1].i) {
+		return FALSE_;
+	    }
+	}
+
+/*        Rows after the skipped range, up to the leading dimension. */
+
+	for (row = last + 1; row <= ld; ++row) {
+	    if (lhs[row - 1].r != rhs[row - 1].r) {
+		return FALSE_;
+	    }
+	    if (lhs[row - 1].i != rhs[row - 1].i) {
+		return FALSE_;
+	    }
+	}
+    }
+    return TRUE_;
+
+/*     End of LCERES. */
+
+} /* lceres_ */
+
+/* Complex */ VOID cbeg_(complex * ret_val, logical *reset)
+{
+    /* Local variables */
+    real re, im;
+
+    /* Generator state, kept between calls. */
+    static integer seed_i, seed_j, calls, mult_i, mult_j;
+
+
+/*  Generates complex numbers as pairs of random numbers uniformly */
+/*  distributed between -0.5 and 0.5. */
+
+/*  The value is returned through RET_VAL.  A true RESET restarts the */
+/*  sequence and is cleared to FALSE_ before returning. */
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+    if (*reset) {
+/*        Initialize local variables. */
+	seed_i = 7;
+	seed_j = 7;
+	mult_i = 891;
+	mult_j = 457;
+	calls = 0;
+	*reset = FALSE_;
+    }
+
+/*     The sequence of values of either seed is bounded between 1 and 999. */
+/*     If the initial seed = 1,2,3,6,7 or 9, the period will be 50. */
+/*     If the initial seed = 4 or 8, the period will be 25. */
+/*     If the initial seed = 5, the period will be 10. */
+/*     CALLS is used to break up the period by skipping 1 value of the */
+/*     seeds in 6. */
+
+    ++calls;
+    for (;;) {
+	seed_i = seed_i * mult_i % 1000;
+	seed_j = seed_j * mult_j % 1000;
+	if (calls < 5) {
+	    break;
+	}
+	calls = 0;
+    }
+    re = (seed_i - 500) / 1001.f;
+    im = (seed_j - 500) / 1001.f;
+    ret_val->r = re;
+    ret_val->i = im;
+    return;
+
+/*     End of CBEG. */
+
+} /* cbeg_ */
+
+real sdiff_(real *x, real *y)
+{
+/*  Returns *x - *y. */
+
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+    /* The difference is stored in a real before it is returned. */
+    const real difference = *x - *y;
+
+    return difference;
+
+/*     End of SDIFF. */
+
+} /* sdiff_ */
+
+/* Main program alias */ /*int cblat3_ () { MAIN__ (); return 0; }*/
diff --git a/ctest/c_dblat1c.c b/ctest/c_dblat1c.c
index bf2f7a781..089dca4da 100644
--- a/ctest/c_dblat1c.c
+++ b/ctest/c_dblat1c.c
@@ -21,19 +21,6 @@ typedef float real;
 typedef double doublereal;
 typedef struct { real r, i; } complex;
 typedef struct { doublereal r, i; } doublecomplex;
-#ifdef _MSC_VER
-static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
-static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
-static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
-static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
-#else
-static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
-static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
-static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
-static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
-#endif
-#define pCf(z) (*_pCf(z))
-#define pCd(z) (*_pCd(z))
 typedef int logical;
 typedef short int shortlogical;
 typedef char logical1;
@@ -242,124 +229,6 @@ typedef struct Namelist Namelist;
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-#if 0
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-#endif
 
 /* Common Block Declarations */
 
@@ -375,16 +244,16 @@ struct {
 static integer c__1 = 1;
 static doublereal c_b34 = 1.;
 
-/* Main program */ int main()
+/* Main program */ int main(void)
 {
     /* Initialized data */
 
     static doublereal sfac = 9.765625e-4;
 
     /* Local variables */
-    extern /* Subroutine */ int check0_(), check1_(), check2_(), check3_();
+    extern /* Subroutine */ int check0_(doublereal*), check1_(doublereal*), check2_(doublereal*), check3_(doublereal*);
     static integer ic;
-    extern /* Subroutine */ int header_();
+    extern /* Subroutine */ int header_(void);
 
 /*     Test program for the DOUBLE PRECISION Level 1 CBLAS. */
 /*     Based upon the original CBLAS test routine together with: */
@@ -431,7 +300,7 @@ static doublereal c_b34 = 1.;
 
 } /* MAIN__ */
 
-/* Subroutine */ int header_()
+/* Subroutine */ int header_(void)
 {
     /* Initialized data */
 
@@ -450,8 +319,7 @@ static doublereal c_b34 = 1.;
 
 } /* header_ */
 
-/* Subroutine */ int check0_(sfac)
-doublereal *sfac;
+/* Subroutine */ int check0_(doublereal* sfac)
 {
     /* Initialized data */
 
@@ -464,7 +332,7 @@ doublereal *sfac;
 
     /* Local variables */
     static integer k;
-    extern /* Subroutine */ int drotgtest_(), stest1_();
+    extern /* Subroutine */ int drotgtest_(doublereal*,doublereal*,doublereal*,doublereal*), stest1_(doublereal*,doublereal*,doublereal*,doublereal*);
     static doublereal sa, sb, sc, ss;
 
 /*     .. Parameters .. */
@@ -509,8 +377,7 @@ L40:
     return 0;
 } /* check0_ */
 
-/* Subroutine */ int check1_(sfac)
-doublereal *sfac;
+/* Subroutine */ int check1_(doublereal* sfac)
 {
     /* Initialized data */
 
@@ -535,14 +402,14 @@ doublereal *sfac;
 
     /* Local variables */
     static integer i__;
-    extern doublereal dnrm2test_();
+    extern doublereal dnrm2test_(integer*, doublereal*, integer*);
     static doublereal stemp[1], strue[8];
-    extern /* Subroutine */ int stest_(), dscaltest_();
-    extern doublereal dasumtest_();
-    extern /* Subroutine */ int itest1_(), stest1_();
+    extern /* Subroutine */ int stest_(integer*,doublereal*,doublereal*,doublereal*,doublereal*), dscaltest_(integer*,doublereal*,doublereal*,integer*);
+    extern doublereal dasumtest_(integer*,doublereal*,integer*);
+    extern /* Subroutine */ int itest1_(integer*,integer*), stest1_(doublereal*,doublereal*,doublereal*,doublereal*);
     static doublereal sx[8];
     static integer np1;
-    extern integer idamaxtest_();
+    extern integer idamaxtest_(integer*,doublereal*,integer*);
     static integer len;
 
 /*     .. Parameters .. */
@@ -603,8 +470,7 @@ doublereal *sfac;
     return 0;
 } /* check1_ */
 
-/* Subroutine */ int check2_(sfac)
-doublereal *sfac;
+/* Subroutine */ int check2_(doublereal* sfac)
 {
     /* Initialized data */
 
@@ -649,10 +515,10 @@ doublereal *sfac;
 
     /* Local variables */
     static integer lenx, leny;
-    extern doublereal ddottest_();
+    extern doublereal ddottest_(integer*,doublereal*,integer*,doublereal*,integer*);
     static integer i__, j, ksize;
-    extern /* Subroutine */ int stest_(), dcopytest_(), dswaptest_(), 
-	    daxpytest_(), stest1_();
+    extern /* Subroutine */ int stest_(integer*,doublereal*,doublereal*,doublereal*,doublereal*), dcopytest_(integer*,doublereal*,integer*,doublereal*,integer*), dswaptest_(integer*,doublereal*,integer*,doublereal*,integer*), 
+	    daxpytest_(integer*,doublereal*,doublereal*,integer*,doublereal*,integer*), stest1_(doublereal*,doublereal*,doublereal*,doublereal*);
     static integer ki, kn, mx, my;
     static doublereal sx[7], sy[7], stx[7], sty[7];
 
@@ -733,8 +599,7 @@ doublereal *sfac;
     return 0;
 } /* check2_ */
 
-/* Subroutine */ int check3_(sfac)
-doublereal *sfac;
+/* Subroutine */ int check3_(doublereal* sfac)
 {
     /* Initialized data */
 
@@ -753,9 +618,9 @@ doublereal *sfac;
 	    ;
 
     /* Local variables */
-    extern /* Subroutine */ int drottest_();
+    extern /* Subroutine */ int drottest_(integer*,doublereal*,integer*,doublereal*,integer*,doublereal*,doublereal*);
     static integer i__, k, ksize;
-    extern /* Subroutine */int stest_(), drotmtest_();
+    extern /* Subroutine */int stest_(integer*,doublereal*,doublereal*,doublereal*,doublereal*), drotmtest_(integer*,doublereal*,integer*,doublereal*,integer*,doublereal*);
     static integer ki, kn;
     static doublereal dparam[5], sx[10], sy[10], stx[10], sty[10];
 
@@ -826,9 +691,7 @@ doublereal *sfac;
     return 0;
 } /* check3_ */
 
-/* Subroutine */ int stest_(len, scomp, strue, ssize, sfac)
-integer *len;
-doublereal *scomp, *strue, *ssize, *sfac;
+/* Subroutine */ int stest_(integer* len, doublereal* scomp, doublereal* strue, doublereal* ssize, doublereal* sfac)
 {
     /* System generated locals */
     integer i__1;
@@ -836,7 +699,7 @@ doublereal *scomp, *strue, *ssize, *sfac;
 
     /* Local variables */
     static integer i__;
-    extern doublereal sdiff_();
+    extern doublereal sdiff_(doublereal*,doublereal*);
     static doublereal sd;
 
 /*     ********************************* STEST ************************** */
@@ -892,11 +755,10 @@ L40:
 
 } /* stest_ */
 
-/* Subroutine */ int stest1_(scomp1, strue1, ssize, sfac)
-doublereal *scomp1, *strue1, *ssize, *sfac;
+/* Subroutine */ int stest1_(doublereal* scomp1, doublereal* strue1, doublereal* ssize, doublereal* sfac)
 {
     static doublereal scomp[1], strue[1];
-    extern /* Subroutine */ int stest_();
+    extern /* Subroutine */ int stest_(integer*, doublereal*, doublereal*, doublereal*, doublereal*);
 
 /*     ************************* STEST1 ***************************** */
 
@@ -923,8 +785,7 @@ doublereal *scomp1, *strue1, *ssize, *sfac;
     return 0;
 } /* stest1_ */
 
-doublereal sdiff_(sa, sb)
-doublereal *sa, *sb;
+doublereal sdiff_(doublereal* sa, doublereal* sb)
 {
     /* System generated locals */
     doublereal ret_val;
@@ -938,8 +799,7 @@ doublereal *sa, *sb;
     return ret_val;
 } /* sdiff_ */
 
-/* Subroutine */ int itest1_(icomp, itrue)
-integer *icomp, *itrue;
+/* Subroutine */ int itest1_(integer* icomp, integer* itrue)
 {
     /* Local variables */
     static integer id;
@@ -1188,4 +1048,4 @@ doublereal *dparam;
     return 0;
 } /* drotm_ */
 
-#endif
\ No newline at end of file
+#endif
diff --git a/ctest/c_dblat2c.c b/ctest/c_dblat2c.c
index f94dbc1fe..547aa808e 100644
--- a/ctest/c_dblat2c.c
+++ b/ctest/c_dblat2c.c
@@ -242,129 +242,6 @@ typedef struct Namelist Namelist;
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-#if 0
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
 
 
 /* Common Block Declarations */
@@ -395,7 +272,7 @@ static integer c_n1 = -1;
 static integer c__0 = 0;
 static logical c_false = FALSE_;
 
-/* Main program */ int main()
+/* Main program */ int main(void)
 {
     /* Initialized data */
 
@@ -413,17 +290,21 @@ static logical c_false = FALSE_;
     static logical same;
     static integer ninc, nbet, ntra;
     static logical rewi;
-    extern /* Subroutine */ int dchk1_(), dchk2_(), dchk3_(), dchk4_(), 
-	    dchk5_(), dchk6_();
+    extern /* Subroutine */ int dchk1_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, integer*, integer*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ int dchk2_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, integer*, integer*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ int dchk3_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, integer*, integer*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ int dchk4_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublereal*, integer*, integer*, integer*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ int dchk5_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublereal*, integer*, integer*, integer*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ int dchk6_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublereal*, integer*, integer*, integer*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen);
     static doublereal a[4225]	/* was [65][65] */, g[65];
     static integer i__, j;
-    extern doublereal ddiff_();
+    extern doublereal ddiff_(doublereal*, doublereal*);
     static integer n;
     static logical fatal;
     static doublereal x[65], y[65], z__[130];
     static logical trace;
     static integer nidim;
-    extern /* Subroutine */ int dmvch_();
+    extern /* Subroutine */ int dmvch_(char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen);
     static char snaps[32], trans[1];
     static integer isnum;
     static logical ltest[16];
@@ -437,11 +318,11 @@ static logical c_false = FALSE_;
     static char snamet[12];
     static doublereal thresh;
     static logical rorder;
-    extern /* Subroutine */ int cd2chke_();
+    extern /* Subroutine */ void cd2chke_(char*, ftnlen);
     static integer layout;
     static logical ltestt, tsterr;
     static doublereal alf[7];
-    extern logical lde_();
+    extern logical lde_(doublereal*, doublereal*, integer*);
     static integer inc[7], nkb;
     static doublereal bet[7],eps,err;
     char   tmpchar;
@@ -977,21 +858,7 @@ L240:
 
 } /* MAIN__ */
 
-/* Subroutine */ int dchk1_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nkb, kb, nalf, alf, nbet, bet, ninc, inc, nmax, 
-	incmax, a, aa, as, x, xx, xs, y, yy, ys, yt, g, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nkb, *kb, *nalf;
-doublereal *alf;
-integer *nbet;
-doublereal *bet;
-integer *ninc, *inc, *nmax, *incmax;
-doublereal *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt, *g;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int dchk1_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* nalf, doublereal* alf, integer* nbet, doublereal* bet, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* x, doublereal* xx, doublereal* xs, doublereal* y, doublereal* yy, doublereal* ys, doublereal* yt, doublereal* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -1007,10 +874,10 @@ ftnlen sname_len;
     static integer incx, incy;
     static logical full, tran, null;
     static integer i__, m, n;
-    extern /* Subroutine */ int dmake_();
+    extern /* Subroutine */ int dmake_(char* , char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, integer*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen);
     static doublereal alpha;
     static logical isame[13];
-    extern /* Subroutine */ int dmvch_();
+    extern /* Subroutine */ int dmvch_(char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen);
     static integer nargs;
     static logical reset;
     static integer incxs, incys;
@@ -1018,13 +885,14 @@ ftnlen sname_len;
     static integer ia, ib, ic;
     static logical banded;
     static integer nc, nd, im, in, kl, ml, nk, nl, ku, ix, iy, ms, lx, ly, ns;
-    extern /* Subroutine */ int cdgbmv_(), cdgemv_();
-    extern logical lderes_();
+    extern /* Subroutine */ void cdgbmv_(integer*, char*, integer*, integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ void cdgemv_(integer*, char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen);
+    extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen);
     static char ctrans[14];
     static doublereal errmax, transl;
     static char transs[1];
     static integer laa, lda;
-    extern logical lde_();
+    extern logical lde_(doublereal*, doublereal*, integer*);
     static doublereal als, bls, err;
     static integer iku, kls, kus;
 
@@ -1429,21 +1297,7 @@ L140:
 
 } /* dchk1_ */
 
-/* Subroutine */ int dchk2_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nkb, kb, nalf, alf, nbet, bet, ninc, inc, nmax, 
-	incmax, a, aa, as, x, xx, xs, y, yy, ys, yt, g, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nkb, *kb, *nalf;
-doublereal *alf;
-integer *nbet;
-doublereal *bet;
-integer *ninc, *inc, *nmax, *incmax;
-doublereal *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt, *g;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int dchk2_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* nalf, doublereal* alf, integer* nbet, doublereal* bet, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* x, doublereal* xx, doublereal* xs, doublereal* y, doublereal* yy, doublereal* ys, doublereal* yt, doublereal* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -1460,10 +1314,10 @@ ftnlen sname_len;
     static logical full, null;
     static char uplo[1];
     static integer i__, k, n;
-    extern /* Subroutine */ int dmake_();
+    extern /* Subroutine */ int dmake_(char* , char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, integer*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen);
     static doublereal alpha;
     static logical isame[13];
-    extern /* Subroutine */ int dmvch_();
+    extern /* Subroutine */ int dmvch_(char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen);
     static integer nargs;
     static logical reset;
     static char cuplo[14];
@@ -1474,12 +1328,13 @@ ftnlen sname_len;
     static integer nc, ik, in;
     static logical packed;
     static integer nk, ks, ix, iy, ns, lx, ly;
-    extern logical lderes_();
-    extern /* Subroutine */ int cdsbmv_(), cdspmv_();
+    extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen);
+    extern /* Subroutine */ void cdsbmv_(integer*, char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ void cdspmv_(integer*, char*, integer*, doublereal*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen);
     static doublereal errmax, transl;
-    extern /* Subroutine */ int cdsymv_();
+    extern /* Subroutine */ void cdsymv_(integer*, char*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen);
     static integer laa, lda;
-    extern logical lde_();
+    extern logical lde_(doublereal*, doublereal*, integer*);
     static doublereal als, bls, err;
 
 
@@ -1882,17 +1737,7 @@ L130:
 
 } /* dchk2_ */
 
-/* Subroutine */ int dchk3_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nkb, kb, ninc, inc, nmax, incmax, a, aa, as, x, 
-	xx, xs, xt, g, z__, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nkb, *kb, *ninc, *inc, *nmax, *incmax;
-doublereal *a, *aa, *as, *x, *xx, *xs, *xt, *g, *z__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int dchk3_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* x, doublereal* xx, doublereal* xs, doublereal* xt, doublereal* g, doublereal* z__, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -1911,10 +1756,10 @@ ftnlen sname_len;
     static logical full, null;
     static char uplo[1], cdiag[14];
     static integer i__, k, n;
-    extern /* Subroutine */ int dmake_();
+    extern /* Subroutine */ int dmake_(char* , char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, integer*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen);
     static char diags[1];
     static logical isame[13];
-    extern /* Subroutine */ int dmvch_();
+    extern /* Subroutine */ int dmvch_(char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen);
     static integer nargs;
     static logical reset;
     static char cuplo[14];
@@ -1924,16 +1769,19 @@ ftnlen sname_len;
     static integer nc, ik, in;
     static logical packed;
     static integer nk, ks, ix, ns, lx;
-    extern logical lderes_();
-    extern /* Subroutine */ int cdtbmv_(), cdtbsv_();
+    extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen);
+    extern /* Subroutine */ void cdtbmv_(integer*, char*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ void cdtbsv_(integer*, char*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen);
     static char ctrans[14];
     static doublereal errmax;
-    extern /* Subroutine */ int cdtpmv_(), cdtrmv_();
+    extern /* Subroutine */ void cdtpmv_(integer*, char*, char*, char*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ void cdtrmv_(integer*, char*, char*, char*, integer*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen);
     static doublereal transl;
-    extern /* Subroutine */ int cdtpsv_(), cdtrsv_();
+    extern /* Subroutine */ void cdtpsv_(integer*, char*, char*, char*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ void cdtrsv_(integer*, char*, char*, char*, integer*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen);
     static char transs[1];
     static integer laa, icd, lda;
-    extern logical lde_();
+    extern logical lde_(doublereal*, doublereal*, integer*);
     static integer ict, icu;
     static doublereal err;
 
@@ -2388,19 +2236,7 @@ L130:
 
 } /* dchk3_ */
 
-/* Subroutine */ int dchk4_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, 
-	xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-doublereal *alf;
-integer *ninc, *inc, *nmax, *incmax;
-doublereal *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt, *g, *z__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int dchk4_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* x, doublereal* xx, doublereal* xs, doublereal* y, doublereal* yy, doublereal* ys, doublereal* yt, doublereal* g, doublereal* z__, integer* iorder, ftnlen sname_len)
 {
     /* System generated locals */
     integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6;
@@ -2411,17 +2247,18 @@ ftnlen sname_len;
     static integer incx, incy;
     static logical null;
     static integer i__, j, m, n;
-    extern /* Subroutine */ int dmake_(), cdger_();
+    extern /* Subroutine */ void cdger_(integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, integer*);
+    extern /* Subroutine */ int dmake_(char* , char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, integer*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen);
     static doublereal alpha, w[1];
     static logical isame[13];
-    extern /* Subroutine */ int dmvch_();
+    extern /* Subroutine */ int dmvch_(char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen);
     static integer nargs;
     static logical reset;
     static integer incxs, incys, ia, nc, nd, im, in, ms, ix, iy, ns, lx, ly;
-    extern logical lderes_();
+    extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen);
     static doublereal errmax, transl;
     static integer laa, lda;
-    extern logical lde_();
+    extern logical lde_(doublereal*, doublereal*, integer*);
     static doublereal als, err;
 
 
@@ -2727,19 +2564,7 @@ L150:
 
 } /* dchk4_ */
 
-/* Subroutine */ int dchk5_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, 
-	xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-doublereal *alf;
-integer *ninc, *inc, *nmax, *incmax;
-doublereal *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt, *g, *z__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int dchk5_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* x, doublereal* xx, doublereal* xs, doublereal* y, doublereal* yy, doublereal* ys, doublereal* yt, doublereal* g, doublereal* z__, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -2757,25 +2582,25 @@ ftnlen sname_len;
     static logical full, null;
     static char uplo[1];
     static integer i__, j, n;
-    extern /* Subroutine */ int dmake_();
+    extern /* Subroutine */ int dmake_(char* , char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, integer*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen);
     static doublereal alpha, w[1];
     static logical isame[13];
-    extern /* Subroutine */ int dmvch_();
+    extern /* Subroutine */ int dmvch_(char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen);
     static integer nargs;
-    extern /* Subroutine */ int cdspr_();
+    extern /* Subroutine */ void cdspr_(integer*, char*, integer*, doublereal*, doublereal*, integer*, doublereal*, ftnlen);
     static logical reset;
     static char cuplo[14];
     static integer incxs;
-    extern /* Subroutine */ int cdsyr_();
+    extern /* Subroutine */ void cdsyr_(integer*, char*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, ftnlen);
     static logical upper;
     static char uplos[1];
     static integer ia, ja, ic, nc, jj, lj, in;
     static logical packed;
     static integer ix, ns, lx;
-    extern logical lderes_();
+    extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen);
     static doublereal errmax, transl;
     static integer laa, lda;
-    extern logical lde_();
+    extern logical lde_(doublereal*, doublereal*, integer*);
     static doublereal als, err;
 
 
@@ -3096,19 +2921,7 @@ L130:
 
 } /* dchk5_ */
 
-/* Subroutine */ int dchk6_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, 
-	xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-doublereal *alf;
-integer *ninc, *inc, *nmax, *incmax;
-doublereal *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt, *g, *z__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int dchk6_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* x, doublereal* xx, doublereal* xs, doublereal* y, doublereal* yy, doublereal* ys, doublereal* yt, doublereal* g, doublereal* z__, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -3125,24 +2938,25 @@ ftnlen sname_len;
     static logical full, null;
     static char uplo[1];
     static integer i__, j, n;
-    extern /* Subroutine */ int dmake_();
+    extern /* Subroutine */ int dmake_(char* , char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, integer*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen);
     static doublereal alpha, w[2];
     static logical isame[13];
-    extern /* Subroutine */ int dmvch_();
+    extern /* Subroutine */ int dmvch_(char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen);
     static integer nargs;
     static logical reset;
     static char cuplo[14];
     static integer incxs, incys;
     static logical upper;
     static char uplos[1];
-    extern /* Subroutine */ int cdspr2_(), cdsyr2_();
+    extern /* Subroutine */ void cdspr2_(integer*, char*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, ftnlen);
+    extern /* Subroutine */ void cdsyr2_(integer*, char*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, integer*, ftnlen);
     static integer ia, ja, ic, nc, jj, lj, in;
     static logical packed;
     static integer ix, iy, ns, lx, ly;
-    extern logical lderes_();
+    extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen);
     static doublereal errmax, transl;
     static integer laa, lda;
-    extern logical lde_();
+    extern logical lde_(doublereal*, doublereal*, integer*);
     static doublereal als, err;
 
 /*  Tests DSYR2 and DSPR2. */
@@ -3508,25 +3322,13 @@ L170:
 
 } /* dchk6_ */
 
-/* Subroutine */ int dmake_(type__, uplo, diag, m, n, a, nmax, aa, lda, kl, 
-	ku, reset, transl, type_len, uplo_len, diag_len)
-char *type__, *uplo, *diag;
-integer *m, *n;
-doublereal *a;
-integer *nmax;
-doublereal *aa;
-integer *lda, *kl, *ku;
-logical *reset;
-doublereal *transl;
-ftnlen type_len;
-ftnlen uplo_len;
-ftnlen diag_len;
+/* Subroutine */ int dmake_(char* type__, char* uplo, char* diag, integer* m, integer* n, doublereal* a, integer* nmax, doublereal* aa, integer* lda, integer* kl, integer* ku, logical* reset, doublereal* transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len)
 {
     /* System generated locals */
     integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
 
     /* Local variables */
-    extern doublereal dbeg_();
+    extern doublereal dbeg_(logical* );
     static integer ibeg, iend, ioff;
     static logical unit;
     static integer i__, j;
@@ -3752,28 +3554,14 @@ ftnlen diag_len;
 
 } /* dmake_ */
 
-/* Subroutine */ int dmvch_(trans, m, n, alpha, a, nmax, x, incx, beta, y, 
-	incy, yt, g, yy, eps, err, fatal, nout, mv, trans_len)
-char *trans;
-integer *m, *n;
-doublereal *alpha, *a;
-integer *nmax;
-doublereal *x;
-integer *incx;
-doublereal *beta, *y;
-integer *incy;
-doublereal *yt, *g, *yy, *eps, *err;
-logical *fatal;
-integer *nout;
-logical *mv;
-ftnlen trans_len;
+/* Subroutine */ int dmvch_(char* trans, integer* m, integer* n, doublereal* alpha, doublereal* a, integer* nmax, doublereal* x, integer* incx, doublereal* beta, doublereal* y, integer* incy, doublereal* yt, doublereal* g, doublereal* yy, doublereal* eps, doublereal* err, logical* fatal, integer* nout, logical* mv, ftnlen trans_len)
 {
     /* System generated locals */
     integer a_dim1, a_offset, i__1, i__2;
     doublereal d__1;
 
     /* Builtin functions */
-    double sqrt();
+    double sqrt(double);
 
     /* Local variables */
     static doublereal erri;
@@ -3902,9 +3690,7 @@ L70:
 
 } /* dmvch_ */
 
-logical lde_(ri, rj, lr)
-doublereal *ri, *rj;
-integer *lr;
+logical lde_(doublereal* ri, doublereal* rj, integer* lr)
 {
     /* System generated locals */
     integer i__1;
@@ -3949,13 +3735,7 @@ L30:
 
 } /* lde_ */
 
-logical lderes_(type__, uplo, m, n, aa, as, lda, type_len, uplo_len)
-char *type__, *uplo;
-integer *m, *n;
-doublereal *aa, *as;
-integer *lda;
-ftnlen type_len;
-ftnlen uplo_len;
+logical lderes_(char* type__, char* uplo, integer* m, integer* n, doublereal* aa, doublereal* as, integer* lda, ftnlen type_len, ftnlen uplo_len)
 {
     /* System generated locals */
     integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2;
@@ -4042,8 +3822,7 @@ L80:
 
 } /* lderes_ */
 
-doublereal dbeg_(reset)
-logical *reset;
+doublereal dbeg_(logical* reset)
 {
     /* System generated locals */
     doublereal ret_val;
@@ -4094,8 +3873,7 @@ L10:
 
 } /* dbeg_ */
 
-doublereal ddiff_(x, y)
-doublereal *x, *y;
+doublereal ddiff_(doublereal* x, doublereal* y)
 {
     /* System generated locals */
     doublereal ret_val;
diff --git a/ctest/c_dblat3c.c b/ctest/c_dblat3c.c
index 05d6b65b0..dc3d6f9e7 100644
--- a/ctest/c_dblat3c.c
+++ b/ctest/c_dblat3c.c
@@ -242,129 +242,6 @@ typedef struct Namelist Namelist;
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-#if 0
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
 
 
 /* Common Block Declarations */
@@ -393,7 +270,7 @@ static logical c_true = TRUE_;
 static integer c__0 = 0;
 static logical c_false = FALSE_;
 
-/* Main program  MAIN__() */ int main()
+/* Main program  MAIN__() */ int main(void)
 {
     /* Initialized data */
 
@@ -403,25 +280,24 @@ static logical c_false = FALSE_;
     integer i__1, i__2, i__3;
     doublereal d__1;
 
-    /* Builtin functions */
-    integer s_rsle(), do_lio(), e_rsle(), f_open(), s_wsfe(), do_fio(), 
-	    e_wsfe(), s_wsle(), e_wsle(), s_rsfe(), e_rsfe();
-    integer f_clos();
 
     /* Local variables */
     static integer nalf, idim[9];
     static logical same;
     static integer nbet, ntra;
     static logical rewi;
-    extern /* Subroutine */ int dchk1_(), dchk2_(), dchk3_(), dchk4_(), 
-	    dchk5_();
+    extern /* Subroutine */ int dchk1_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ int dchk2_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ int dchk3_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ int dchk4_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, doublereal*, integer*, ftnlen);
+/* Subroutine */ int dchk5_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* nbet, doublereal* bet, integer* nmax, doublereal* ab, doublereal* aa, doublereal* as, doublereal* bb, doublereal* bs, doublereal* c__, doublereal* cc, doublereal* cs, doublereal* ct, doublereal* g, doublereal* w, integer* iorder, ftnlen sname_len);
     static doublereal c__[4225]	/* was [65][65] */, g[65];
     static integer i__, j;
-    extern doublereal ddiff_();
+    extern doublereal ddiff_(doublereal*, doublereal*);
     static integer n;
     static logical fatal;
     static doublereal w[130];
-    extern /* Subroutine */ int dmmch_();
+    extern /* Subroutine */ int dmmch_(char*, char*, integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
     static logical trace;
     static integer nidim;
     static char snaps[32];
@@ -433,11 +309,11 @@ static logical c_false = FALSE_;
     static char snamet[12], transa[1], transb[1];
     static doublereal thresh;
     static logical rorder;
-    extern /* Subroutine */ int cd3chke_();
+    extern /* Subroutine */ void cd3chke_(char*, ftnlen);
     static integer layout;
     static logical ltestt, tsterr;
     static doublereal alf[7];
-    extern logical lde_();
+    extern logical lde_(doublereal*, doublereal*, integer*);
     static doublereal bet[7], eps, err;
     char tmpchar;
 
@@ -907,21 +783,7 @@ L230:
 
 } /* MAIN__ */
 
-/* Subroutine */ int dchk1_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, 
-	c__, cc, cs, ct, g, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-doublereal *alf;
-integer *nbet;
-doublereal *bet;
-integer *nmax;
-doublereal *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct, *g;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int dchk1_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* nbet, doublereal* bet, integer* nmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* b, doublereal* bb, doublereal* bs, doublereal* c__, doublereal* cc, doublereal* cs, doublereal* ct, doublereal* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -931,29 +793,27 @@ ftnlen sname_len;
     integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
 	    i__3, i__4, i__5, i__6;
 
-    /* Builtin functions */
-    integer f_rew(), s_wsfe(), e_wsfe(), do_fio();
 
     /* Local variables */
     static doublereal beta;
     static integer ldas, ldbs, ldcs;
     static logical same, null;
     static integer i__, k, m, n;
-    extern /* Subroutine */ int dmake_();
+    extern /* Subroutine */ int dmake_(char*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen);
     static doublereal alpha;
-    extern /* Subroutine */ int dmmch_();
+    extern /* Subroutine */ int dmmch_(char*, char*, integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
     static logical isame[13], trana, tranb;
     static integer nargs;
     static logical reset;
-    extern /* Subroutine */ void dprcn1_();
+    extern /* Subroutine */ void dprcn1_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, integer*, doublereal*, integer*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen);
     static integer ia, ib, ma, mb, na, nb, nc, ik, im, in;
-    extern /* Subroutine */ int cdgemm_();
+    extern /* Subroutine */ void cdgemm_(integer*, char*, char*, integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen);
     static integer ks, ms, ns;
-    extern logical lderes_();
+    extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen);
     static char tranas[1], tranbs[1], transa[1], transb[1];
     static doublereal errmax;
     static integer ica, icb, laa, lbb, lda, lcc, ldb, ldc;
-    extern logical lde_();
+    extern logical lde_(doublereal*, doublereal*, integer*);
     static doublereal als, bls, err;
 
 /*  Tests DGEMM. */
@@ -1283,23 +1143,8 @@ L130:
 
 } /* dchk1_ */
 
-/* Subroutine */ void dprcn1_(nout, nc, sname, iorder, transa, transb, m, n, k,
-	 alpha, lda, ldb, beta, ldc, sname_len, transa_len, transb_len)
-integer *nout, *nc;
-char *sname;
-integer *iorder;
-char *transa, *transb;
-integer *m, *n, *k;
-doublereal *alpha;
-integer *lda, *ldb;
-doublereal *beta;
-integer *ldc;
-ftnlen sname_len;
-ftnlen transa_len;
-ftnlen transb_len;
+/* Subroutine */ void dprcn1_(integer* nout, integer* nc, char* sname, integer* iorder, char* transa, char* transb, integer* m, integer* n, integer* k, doublereal* alpha, integer* lda, integer* ldb, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen transa_len, ftnlen transb_len)
 {
-    /* Builtin functions */
-    integer s_wsfe(), do_fio(), e_wsfe();
 
     /* Local variables */
     static char crc[14], cta[14], ctb[14];
@@ -1328,21 +1173,7 @@ ftnlen transb_len;
 } /* dprcn1_ */
 
 
-/* Subroutine */ int dchk2_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, 
-	c__, cc, cs, ct, g, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-doublereal *alf;
-integer *nbet;
-doublereal *bet;
-integer *nmax;
-doublereal *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct, *g;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int dchk2_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* nbet, doublereal* bet, integer* nmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* b, doublereal* bb, doublereal* bs, doublereal* c__, doublereal* cc, doublereal* cs, doublereal* ct, doublereal* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -1353,8 +1184,6 @@ ftnlen sname_len;
     integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
 	    i__3, i__4, i__5;
 
-    /* Builtin functions */
-    integer f_rew(), s_wsfe(), e_wsfe(), do_fio();
 
     /* Local variables */
     static doublereal beta;
@@ -1364,21 +1193,21 @@ ftnlen sname_len;
     static logical left, null;
     static char uplo[1];
     static integer i__, m, n;
-    extern /* Subroutine */ int dmake_();
+    extern /* Subroutine */ int dmake_(char*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen);
     static doublereal alpha;
-    extern /* Subroutine */ int dmmch_();
+    extern /* Subroutine */ int dmmch_(char*, char*, integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
     static logical isame[13];
     static char sides[1];
     static integer nargs;
     static logical reset;
     static char uplos[1];
-    extern /* Subroutine */ void dprcn2_();
+    extern /* Subroutine */ void dprcn2_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublereal*, integer*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen);
     static integer ia, ib, na, nc, im, in, ms, ns;
-    extern logical lderes_();
-    extern /* Subroutine */ int cdsymm_();
+    extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen);
+    extern /* Subroutine */ void cdsymm_(integer*, char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen);
     static doublereal errmax;
     static integer laa, lbb, lda, lcc, ldb, ldc;
-    extern logical lde_();
+    extern logical lde_(doublereal*, doublereal*, integer*);
     static integer ics;
     static doublereal als, bls;
     static integer icu;
@@ -1692,23 +1521,8 @@ L120:
 } /* dchk2_ */
 
 
-/* Subroutine */ void dprcn2_(nout, nc, sname, iorder, side, uplo, m, n, alpha,
-	 lda, ldb, beta, ldc, sname_len, side_len, uplo_len)
-integer *nout, *nc;
-char *sname;
-integer *iorder;
-char *side, *uplo;
-integer *m, *n;
-doublereal *alpha;
-integer *lda, *ldb;
-doublereal *beta;
-integer *ldc;
-ftnlen sname_len;
-ftnlen side_len;
-ftnlen uplo_len;
+/* Subroutine */ void dprcn2_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, integer* m, integer* n, doublereal* alpha, integer* lda, integer* ldb, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len)
 {
-    /* Builtin functions */
-    integer s_wsfe(), do_fio(), e_wsfe();
 
     /* Local variables */
     static char cs[14], cu[14], crc[14];
@@ -1733,19 +1547,7 @@ ftnlen uplo_len;
 } /* dprcn2_ */
 
 
-/* Subroutine */ int dchk3_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, nmax, a, aa, as, b, bb, bs, ct, g, c__,
-	 iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-doublereal *alf;
-integer *nmax;
-doublereal *a, *aa, *as, *b, *bb, *bs, *ct, *g, *c__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int dchk3_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* nmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* b, doublereal* bb, doublereal* bs, doublereal* ct, doublereal* g, doublereal* c__, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -1766,24 +1568,24 @@ ftnlen sname_len;
     static logical left, null;
     static char uplo[1];
     static integer i__, j, m, n;
-    extern /* Subroutine */ int dmake_();
+    extern /* Subroutine */ int dmake_(char*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen);
     static doublereal alpha;
     static char diags[1];
-    extern /* Subroutine */ int dmmch_();
+    extern /* Subroutine */ int dmmch_(char*, char*, integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
     static logical isame[13];
     static char sides[1];
     static integer nargs;
     static logical reset;
     static char uplos[1];
-    extern /* Subroutine */ void dprcn3_();
+    extern /* Subroutine */ void dprcn3_(integer*, integer*, char*, integer*, char*, char*, char*, char*, integer*, integer*, doublereal*, integer*, integer*, ftnlen, ftnlen, ftnlen, ftnlen, ftnlen);
     static integer ia, na, nc, im, in, ms, ns;
-    extern logical lderes_();
-    extern /* Subroutine */ int cdtrmm_();
+    extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen);
+    extern /* Subroutine */ void cdtrmm_(integer*, char*, char*, char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen, ftnlen);
     static char tranas[1], transa[1];
-    extern /* Subroutine */ int cdtrsm_();
+    extern /* Subroutine */ void cdtrsm_(integer*, char*, char*, char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen, ftnlen);
     static doublereal errmax;
     static integer laa, icd, lbb, lda, ldb;
-    extern logical lde_();
+    extern logical lde_(doublereal*, doublereal*, integer*);
     static integer ics;
     static doublereal als;
     static integer ict, icu;
@@ -2165,24 +1967,8 @@ L160:
 } /* dchk3_ */
 
 
-/* Subroutine */ void dprcn3_(nout, nc, sname, iorder, side, uplo, transa, 
-	diag, m, n, alpha, lda, ldb, sname_len, side_len, uplo_len, 
-	transa_len, diag_len)
-integer *nout, *nc;
-char *sname;
-integer *iorder;
-char *side, *uplo, *transa, *diag;
-integer *m, *n;
-doublereal *alpha;
-integer *lda, *ldb;
-ftnlen sname_len;
-ftnlen side_len;
-ftnlen uplo_len;
-ftnlen transa_len;
-ftnlen diag_len;
+/* Subroutine */ void dprcn3_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, char* transa, char* diag, integer* m, integer* n, doublereal* alpha, integer* lda, integer* ldb, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len, ftnlen transa_len, ftnlen diag_len)
 {
-    /* Builtin functions */
-    integer s_wsfe(), do_fio(), e_wsfe();
 
     /* Local variables */
     static char ca[14], cd[14], cs[14], cu[14], crc[14];
@@ -2219,21 +2005,7 @@ ftnlen diag_len;
 } /* dprcn3_ */
 
 
-/* Subroutine */ int dchk4_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, 
-	c__, cc, cs, ct, g, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-doublereal *alf;
-integer *nbet;
-doublereal *bet;
-integer *nmax;
-doublereal *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct, *g;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int dchk4_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* nbet, doublereal* bet, integer* nmax, doublereal* a, doublereal* aa, doublereal* as, doublereal* b, doublereal* bb, doublereal* bs, doublereal* c__, doublereal* cc, doublereal* cs, doublereal* ct, doublereal* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -2244,8 +2016,6 @@ ftnlen sname_len;
     integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
 	    i__3, i__4, i__5;
 
-    /* Builtin functions */
-    integer f_rew(), s_wsfe(), e_wsfe(), do_fio();
 
     /* Local variables */
     static doublereal beta;
@@ -2255,23 +2025,23 @@ ftnlen sname_len;
     static logical tran, null;
     static char uplo[1];
     static integer i__, j, k, n;
-    extern /* Subroutine */ int dmake_();
+    extern /* Subroutine */ int dmake_(char*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen);
     static doublereal alpha;
-    extern /* Subroutine */ int dmmch_();
+    extern /* Subroutine */ int dmmch_(char*, char*, integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
     static logical isame[13];
     static integer nargs;
     static logical reset;
     static char trans[1];
     static logical upper;
     static char uplos[1];
-    extern /* Subroutine */ void dprcn4_();
+    extern /* Subroutine */ void dprcn4_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen);
     static integer ia, ib, jc, ma, na, nc, ik, in, jj, lj, ks, ns;
-    extern logical lderes_();
+    extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen);
     static doublereal errmax;
-    extern /* Subroutine */ int cdsyrk_();
+    extern /* Subroutine */ void cdsyrk_(integer*, char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen);
     static char transs[1];
     static integer laa, lda, lcc, ldc;
-    extern logical lde_();
+    extern logical lde_(doublereal*, doublereal*, integer*);
     static doublereal als;
     static integer ict, icu;
     static doublereal err;
@@ -2586,23 +2356,8 @@ L130:
 } /* dchk4_ */
 
 
-/* Subroutine */ void dprcn4_(nout, nc, sname, iorder, uplo, transa, n, k, 
-	alpha, lda, beta, ldc, sname_len, uplo_len, transa_len)
-integer *nout, *nc;
-char *sname;
-integer *iorder;
-char *uplo, *transa;
-integer *n, *k;
-doublereal *alpha;
-integer *lda;
-doublereal *beta;
-integer *ldc;
-ftnlen sname_len;
-ftnlen uplo_len;
-ftnlen transa_len;
+/* Subroutine */ void dprcn4_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublereal* alpha, integer* lda, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len)
 {
-    /* Builtin functions */
-    integer s_wsfe(), do_fio(), e_wsfe();
 
     /* Local variables */
     static char ca[14], cu[14], crc[14];
@@ -2629,21 +2384,7 @@ ftnlen transa_len;
 } /* dprcn4_ */
 
 
-/* Subroutine */ int dchk5_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, nbet, bet, nmax, ab, aa, as, bb, bs, 
-	c__, cc, cs, ct, g, w, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-doublereal *alf;
-integer *nbet;
-doublereal *bet;
-integer *nmax;
-doublereal *ab, *aa, *as, *bb, *bs, *c__, *cc, *cs, *ct, *g, *w;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int dchk5_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublereal* alf, integer* nbet, doublereal* bet, integer* nmax, doublereal* ab, doublereal* aa, doublereal* as, doublereal* bb, doublereal* bs, doublereal* c__, doublereal* cc, doublereal* cs, doublereal* ct, doublereal* g, doublereal* w, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -2653,8 +2394,6 @@ ftnlen sname_len;
     /* System generated locals */
     integer c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8;
 
-    /* Builtin functions */
-    integer f_rew(), s_wsfe(), e_wsfe(), do_fio();
 
     /* Local variables */
     static integer jjab;
@@ -2665,23 +2404,23 @@ ftnlen sname_len;
     static logical tran, null;
     static char uplo[1];
     static integer i__, j, k, n;
-    extern /* Subroutine */ int dmake_();
+    extern /* Subroutine */ int dmake_(char*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, logical*, doublereal*, ftnlen, ftnlen, ftnlen);
     static doublereal alpha;
-    extern /* Subroutine */ int dmmch_();
+    extern /* Subroutine */ int dmmch_(char*, char*, integer*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, doublereal*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
     static logical isame[13];
     static integer nargs;
     static logical reset;
     static char trans[1];
     static logical upper;
     static char uplos[1];
-    extern /* Subroutine */ void dprcn5_();
+    extern /* Subroutine */ void dprcn5_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublereal*, integer*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen);
     static integer ia, ib, jc, ma, na, nc, ik, in, jj, lj, ks, ns;
-    extern logical lderes_();
+    extern logical lderes_(char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen);
     static doublereal errmax;
     static char transs[1];
     static integer laa, lbb, lda, lcc, ldb, ldc;
-    extern logical lde_();
-    extern /* Subroutine */ int cdsyr2k_();
+    extern logical lde_(doublereal*, doublereal*, integer*);
+    extern /* Subroutine */ void cdsyr2k_(integer*, char*, char*, integer*, integer*, doublereal*, doublereal*, integer*, doublereal*, integer*, doublereal*, doublereal*, integer*, ftnlen, ftnlen);
     static doublereal als;
     static integer ict, icu;
     static doublereal err;
@@ -3048,23 +2787,8 @@ L160:
 } /* dchk5_ */
 
 
-/* Subroutine */ void dprcn5_(nout, nc, sname, iorder, uplo, transa, n, k, 
-	alpha, lda, ldb, beta, ldc, sname_len, uplo_len, transa_len)
-integer *nout, *nc;
-char *sname;
-integer *iorder;
-char *uplo, *transa;
-integer *n, *k;
-doublereal *alpha;
-integer *lda, *ldb;
-doublereal *beta;
-integer *ldc;
-ftnlen sname_len;
-ftnlen uplo_len;
-ftnlen transa_len;
+/* Subroutine */ void dprcn5_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublereal* alpha, integer* lda, integer* ldb, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len)
 {
-    /* Builtin functions */
-    integer s_wsfe(), do_fio(), e_wsfe();
 
     /* Local variables */
     static char ca[14], cu[14], crc[14];
@@ -3091,25 +2815,13 @@ ftnlen transa_len;
 } /* dprcn5_ */
 
 
-/* Subroutine */ int dmake_(type__, uplo, diag, m, n, a, nmax, aa, lda, reset,
-	 transl, type_len, uplo_len, diag_len)
-char *type__, *uplo, *diag;
-integer *m, *n;
-doublereal *a;
-integer *nmax;
-doublereal *aa;
-integer *lda;
-logical *reset;
-doublereal *transl;
-ftnlen type_len;
-ftnlen uplo_len;
-ftnlen diag_len;
+/* Subroutine */ int dmake_(char* type__, char* uplo, char* diag, integer* m, integer* n, doublereal* a, integer* nmax, doublereal* aa, integer* lda, logical* reset, doublereal* transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len)
 {
     /* System generated locals */
     integer a_dim1, a_offset, i__1, i__2;
 
     /* Local variables */
-    extern doublereal dbeg_();
+    extern doublereal dbeg_(logical*);
     static integer ibeg, iend;
     static logical unit;
     static integer i__, j;
@@ -3241,25 +2953,7 @@ ftnlen diag_len;
 
 } /* dmake_ */
 
-/* Subroutine */ int dmmch_(transa, transb, m, n, kk, alpha, a, lda, b, ldb, 
-	beta, c__, ldc, ct, g, cc, ldcc, eps, err, fatal, nout, mv, 
-	transa_len, transb_len)
-char *transa, *transb;
-integer *m, *n, *kk;
-doublereal *alpha, *a;
-integer *lda;
-doublereal *b;
-integer *ldb;
-doublereal *beta, *c__;
-integer *ldc;
-doublereal *ct, *g, *cc;
-integer *ldcc;
-doublereal *eps, *err;
-logical *fatal;
-integer *nout;
-logical *mv;
-ftnlen transa_len;
-ftnlen transb_len;
+/* Subroutine */ int dmmch_(char* transa, char* transb, integer* m, integer* n, integer* kk, doublereal* alpha, doublereal* a, integer* lda, doublereal* b, integer* ldb, doublereal* beta, doublereal* c__, integer* ldc, doublereal* ct, doublereal* g, doublereal* cc, integer* ldcc, doublereal* eps, doublereal* err, logical* fatal, integer* nout, logical* mv, ftnlen transa_len, ftnlen transb_len)
 {
     /* System generated locals */
     integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, 
@@ -3267,8 +2961,7 @@ ftnlen transb_len;
     doublereal d__1, d__2;
 
     /* Builtin functions */
-    double sqrt();
-    integer s_wsfe(), e_wsfe(), do_fio();
+    double sqrt(double);
 
     /* Local variables */
     static doublereal erri;
@@ -3432,9 +3125,7 @@ L150:
 
 } /* dmmch_ */
 
-logical lde_(ri, rj, lr)
-doublereal *ri, *rj;
-integer *lr;
+logical lde_(doublereal* ri, doublereal* rj, integer* lr)
 {
     /* System generated locals */
     integer i__1;
@@ -3481,13 +3172,7 @@ L30:
 
 } /* lde_ */
 
-logical lderes_(type__, uplo, m, n, aa, as, lda, type_len, uplo_len)
-char *type__, *uplo;
-integer *m, *n;
-doublereal *aa, *as;
-integer *lda;
-ftnlen type_len;
-ftnlen uplo_len;
+logical lderes_(char* type__, char* uplo, integer* m, integer* n, doublereal* aa, doublereal* as, integer* lda, ftnlen type_len, ftnlen uplo_len)
 {
     /* System generated locals */
     integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2;
@@ -3576,8 +3261,7 @@ L80:
 
 } /* lderes_ */
 
-doublereal dbeg_(reset)
-logical *reset;
+doublereal dbeg_(logical* reset)
 {
     /* System generated locals */
     doublereal ret_val;
@@ -3629,8 +3313,7 @@ L10:
 
 } /* dbeg_ */
 
-doublereal ddiff_(x, y)
-doublereal *x, *y;
+doublereal ddiff_(doublereal* x, doublereal* y)
 {
     /* System generated locals */
     doublereal ret_val;
diff --git a/ctest/c_sblat1c.c b/ctest/c_sblat1c.c
index 57e4707a9..7a81e04c1 100644
--- a/ctest/c_sblat1c.c
+++ b/ctest/c_sblat1c.c
@@ -21,19 +21,6 @@ typedef float real;
 typedef double doublereal;
 typedef struct { real r, i; } complex;
 typedef struct { doublereal r, i; } doublecomplex;
-#ifdef _MSC_VER
-static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
-static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
-static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
-static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
-#else
-static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
-static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
-static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
-static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
-#endif
-#define pCf(z) (*_pCf(z))
-#define pCd(z) (*_pCd(z))
 typedef int logical;
 typedef short int shortlogical;
 typedef char logical1;
@@ -242,250 +229,6 @@ typedef struct Namelist Namelist;
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-#if 0
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-#endif
-#if 0
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-#endif
 
 
 /* Common Block Declarations */
@@ -502,16 +245,16 @@ struct {
 static integer c__1 = 1;
 static real c_b34 = (float)1.;
 
-/* Main program */ int main ()
+/* Main program */ int main (void)
 {
     /* Initialized data */
 
     static real sfac = (float)9.765625e-4;
 
     /* Local variables */
-    extern /* Subroutine */ int check0_(), check1_(), check2_(), check3_();
+    extern /* Subroutine */ int check0_(real*), check1_(real*), check2_(real*), check3_(real*);
     static integer ic;
-    extern /* Subroutine */ int header_();
+    extern /* Subroutine */ int header_(void);
 
 /*     Test program for the REAL             Level 1 CBLAS. */
 /*     Based upon the original CBLAS test routine together with: */
@@ -557,7 +300,7 @@ static real c_b34 = (float)1.;
     exit(0);
 } /* MAIN__ */
 
-/* Subroutine */ int header_()
+/* Subroutine */ int header_(void)
 {
     /* Initialized data */
 
@@ -580,8 +323,7 @@ static real c_b34 = (float)1.;
 
 } /* header_ */
 
-/* Subroutine */ int check0_(sfac)
-real *sfac;
+/* Subroutine */ int check0_(real *sfac)
 {
     /* Initialized data */
 
@@ -600,7 +342,7 @@ real *sfac;
 
     /* Local variables */
     static integer k;
-    extern /* Subroutine */ int srotgtest_(), stest1_();
+    extern /* Subroutine */ int srotgtest_(real*,real*,real*,real*), stest1_(real*,real*,real*,real*);
     static real sa, sb, sc, ss;
 
 /*     .. Parameters .. */
@@ -645,8 +387,7 @@ L40:
     return 0;
 } /* check0_ */
 
-/* Subroutine */ int check1_(sfac)
-real *sfac;
+/* Subroutine */ int check1_(real* sfac)
 {
     /* Initialized data */
 
@@ -692,14 +433,14 @@ real *sfac;
 
     /* Local variables */
     static integer i__;
-    extern real snrm2test_();
+    extern real snrm2test_(integer*,real*,integer*);
     static real stemp[1], strue[8];
-    extern /* Subroutine */ int stest_(), sscaltest_();
-    extern real sasumtest_();
-    extern /* Subroutine */ int itest1_(), stest1_();
+    extern /* Subroutine */ int stest_(integer*, real*,real*,real*,real*), sscaltest_(integer*,real*,real*,integer*);
+    extern real sasumtest_(integer*,real*,integer*);
+    extern /* Subroutine */ int itest1_(integer*,integer*), stest1_(real*,real*,real*,real*);
     static real sx[8];
     static integer np1;
-    extern integer isamaxtest_();
+    extern integer isamaxtest_(integer*,real*,integer*);
     static integer len;
 
 
@@ -761,8 +502,7 @@ real *sfac;
     return 0;
 } /* check1_ */
 
-/* Subroutine */ int check2_(sfac)
-real *sfac;
+/* Subroutine */ int check2_(real* sfac)
 {
     /* Initialized data */
 
@@ -850,12 +590,12 @@ real *sfac;
 
     /* Local variables */
     static integer lenx, leny;
-    extern real sdottest_();
+    extern real sdottest_(integer*,real*,integer*,real*,integer*);
     static integer i__, j, ksize;
-    extern /* Subroutine */ int stest_(), scopytest_(), sswaptest_(), 
-	    saxpytest_();
+    extern /* Subroutine */ int stest_(integer*,real*,real*,real*,real*), scopytest_(integer*,real*,integer*,real*,integer*), sswaptest_(integer*,real*,integer*,real*,integer*), 
+	    saxpytest_(integer*,real*,real*,integer*,real*,integer*);
     static integer ki;
-    extern /* Subroutine */ int stest1_();
+    extern /* Subroutine */ int stest1_(real*,real*,real*,real*);
     static integer kn, mx, my;
     static real sx[7], sy[7], stx[7], sty[7];
 
@@ -936,8 +676,7 @@ real *sfac;
     return 0;
 } /* check2_ */
 
-/* Subroutine */ int check3_(sfac)
-real *sfac;
+/* Subroutine */ int check3_(real* sfac)
 {
     /* Initialized data */
 
@@ -969,9 +708,9 @@ real *sfac;
 	    1.17 };
 
     /* Local variables */
-    extern /* Subroutine */ void srottest_();
+    extern /* Subroutine */ void srottest_(integer*,real*,integer*,real*,integer*,real*,real*);
     static integer i__, k, ksize;
-    extern /* Subroutine */ int stest_(), srotmtest_();
+    extern /* Subroutine */ int stest_(integer*,real*,real*,real*,real*), srotmtest_(integer*,real*,integer*,real*,integer*,real*);
     static integer ki, kn;
     static real sx[19], sy[19], sparam[5], stx[19], sty[19];
 
@@ -1042,16 +781,14 @@ real *sfac;
     return 0;
 } /* check3_ */
 
-/* Subroutine */ int stest_(len, scomp, strue, ssize, sfac)
-integer *len;
-real *scomp, *strue, *ssize, *sfac;
+/* Subroutine */ int stest_(integer* len, real* scomp, real* strue, real* ssize, real* sfac)
 {
     integer i__1;
     real r__1, r__2, r__3, r__4, r__5;
 
     /* Local variables */
     static integer i__;
-    extern doublereal sdiff_();
+    extern doublereal sdiff_(real*,real*);
     static real sd;
 
 /*     ********************************* STEST ************************** */
@@ -1107,11 +844,10 @@ L40:
 
 } /* stest_ */
 
-/* Subroutine */ int stest1_(scomp1, strue1, ssize, sfac)
-real *scomp1, *strue1, *ssize, *sfac;
+/* Subroutine */ int stest1_(real* scomp1, real* strue1, real* ssize, real* sfac)
 {
     static real scomp[1], strue[1];
-    extern /* Subroutine */ int stest_();
+    extern /* Subroutine */ int stest_(integer*,real*,real*,real*,real*);
 
 /*     ************************* STEST1 ***************************** */
 
@@ -1138,8 +874,7 @@ real *scomp1, *strue1, *ssize, *sfac;
     return 0;
 } /* stest1_ */
 
-doublereal sdiff_(sa, sb)
-real *sa, *sb;
+doublereal sdiff_(real* sa, real* sb)
 {
     /* System generated locals */
     real ret_val;
@@ -1153,8 +888,7 @@ real *sa, *sb;
     return ret_val;
 } /* sdiff_ */
 
-/* Subroutine */ int itest1_(icomp, itrue)
-integer *icomp, *itrue;
+/* Subroutine */ int itest1_(integer* icomp, integer* itrue)
 {
     /* Local variables */
     static integer id;
diff --git a/ctest/c_sblat2c.c b/ctest/c_sblat2c.c
index a646091f7..53295e0c7 100644
--- a/ctest/c_sblat2c.c
+++ b/ctest/c_sblat2c.c
@@ -242,255 +242,6 @@ typedef struct Namelist Namelist;
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-#if 0
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-#endif
-#if 0
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
 
 
 /* Common Block Declarations */
@@ -521,7 +272,7 @@ static integer c_n1 = -1;
 static integer c__0 = 0;
 static logical c_false = FALSE_;
 
-/* Main program */ int main()
+/* Main program */ int main(void)
 {
     /* Initialized data */
 
@@ -539,16 +290,20 @@ static logical c_false = FALSE_;
     static logical same;
     static integer ninc, nbet, ntra;
     static logical rewi;
-    extern /* Subroutine */ int schk1_(), schk2_(), schk3_(), schk4_(), 
-	    schk5_(), schk6_();
+extern /* Subroutine */ int schk1_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, real*, integer*, real*, integer*, integer*, integer*, integer*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, integer*, ftnlen);
+extern /* Subroutine */ int schk2_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, real*, integer*, real*, integer*, integer*, integer*, integer*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, integer*, ftnlen);
+extern /* Subroutine */ int schk3_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, integer*, integer*, integer*, real*, real*, real*, real*, real*, real*, real*, real*, real*, integer*, ftnlen);
+extern /* Subroutine */ int schk4_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, real*, integer*, integer*, integer*, integer*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, integer*, ftnlen);
+extern /* Subroutine */ int schk5_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, real*, integer*, integer*, integer*, integer*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, integer*, ftnlen);
+extern /* Subroutine */ int schk6_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, real* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, real* a, real* aa, real* as, real* x, real* xx, real* xs, real* y, real* yy, real* ys, real* yt, real* g, real* z__, integer* iorder, ftnlen sname_len);
     static real a[4225]	/* was [65][65] */, g[65];
     static integer i__, j, n;
     static logical fatal;
     static real x[65], y[65], z__[130];
-    extern doublereal sdiff_();
+    extern doublereal sdiff_(real*, real*);
     static logical trace;
     static integer nidim;
-    extern /* Subroutine */ int smvch_();
+    extern /* Subroutine */ int smvch_(char*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, real*, real*, logical*, integer*, logical*, ftnlen);
     static char snaps[32], trans[1];
     static integer isnum;
     static logical ltest[16];
@@ -564,12 +319,12 @@ static logical c_false = FALSE_;
     static logical rorder;
     static integer layout;
     static logical ltestt;
-    extern /* Subroutine */ int cs2chke_();
+    extern /* Subroutine */ int cs2chke_(char*, ftnlen);
     static logical tsterr;
     static real alf[7];
     static integer inc[7], nkb;
     static real bet[7];
-    extern logical lse_();
+    extern logical lse_(real*, real*, integer*);
     static real eps, err;
     char   tmpchar;
     
@@ -1098,21 +853,7 @@ L240:
 
 } /* MAIN__ */
 
-/* Subroutine */ int schk1_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nkb, kb, nalf, alf, nbet, bet, ninc, inc, nmax, 
-	incmax, a, aa, as, x, xx, xs, y, yy, ys, yt, g, iorder, sname_len)
-char *sname;
-real *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nkb, *kb, *nalf;
-real *alf;
-integer *nbet;
-real *bet;
-integer *ninc, *inc, *nmax, *incmax;
-real *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt, *g;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int schk1_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* nalf, real* alf, integer* nbet, real* bet, integer* ninc, integer* inc, integer* nmax, integer* incmax, real* a, real* aa, real* as, real* x, real* xx, real* xs, real* y, real* yy, real* ys, real* yt, real* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -1130,24 +871,25 @@ ftnlen sname_len;
     static integer i__, m, n;
     static real alpha;
     static logical isame[13];
-    extern /* Subroutine */ int smake_();
+    extern /* Subroutine */ int smake_(char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, integer*, integer*, logical*, real*, ftnlen, ftnlen, ftnlen);
     static integer nargs;
-    extern /* Subroutine */ int smvch_();
+    extern /* Subroutine */ int smvch_(char*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, real*, real*, logical*, integer*, logical*, ftnlen);
     static logical reset;
     static integer incxs, incys;
     static char trans[1];
     static integer ia, ib, ic;
     static logical banded;
     static integer nc, nd, im, in, kl, ml, nk, nl, ku, ix, iy, ms, lx, ly, ns;
-    extern /* Subroutine */ int csgbmv_(), csgemv_();
+    extern /* Subroutine */ void csgbmv_(integer*, char*, integer*, integer*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, ftnlen);
+    extern /* Subroutine */ void csgemv_(integer*, char*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, ftnlen);
     static char ctrans[14];
     static real errmax;
-    extern logical lseres_();
+    extern logical lseres_(char* type__, char* uplo, integer* m, integer* n, real* aa, real* as, integer* lda, ftnlen ltype_len, ftnlen uplo_len);
     static real transl;
     static char transs[1];
     static integer laa, lda;
     static real als, bls;
-    extern logical lse_();
+    extern logical lse_(real*, real*, integer*);
     static real err;
     static integer iku, kls, kus;
 
@@ -1552,21 +1294,7 @@ L140:
 
 } /* schk1_ */
 
-/* Subroutine */ int schk2_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nkb, kb, nalf, alf, nbet, bet, ninc, inc, nmax, 
-	incmax, a, aa, as, x, xx, xs, y, yy, ys, yt, g, iorder, sname_len)
-char *sname;
-real *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nkb, *kb, *nalf;
-real *alf;
-integer *nbet;
-real *bet;
-integer *ninc, *inc, *nmax, *incmax;
-real *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt, *g;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int schk2_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* nalf, real* alf, integer* nbet, real* bet, integer* ninc, integer* inc, integer* nmax, integer* incmax, real* a, real* aa, real* as, real* x, real* xx, real* xs, real* y, real* yy, real* ys, real* yt, real* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -1585,9 +1313,9 @@ ftnlen sname_len;
     static integer i__, k, n;
     static real alpha;
     static logical isame[13];
-    extern /* Subroutine */ int smake_();
+    extern /* Subroutine */ int smake_(char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, integer*, integer*, logical*, real*, ftnlen, ftnlen, ftnlen);
     static integer nargs;
-    extern /* Subroutine */ int smvch_();
+    extern /* Subroutine */ int smvch_(char*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, real*, real*, logical*, integer*, logical*, ftnlen);
     static logical reset;
     static char cuplo[14];
     static integer incxs, incys;
@@ -1598,13 +1326,14 @@ ftnlen sname_len;
     static logical packed;
     static integer nk, ks, ix, iy, ns, lx, ly;
     static real errmax;
-    extern logical lseres_();
-    extern /* Subroutine */ int cssbmv_();
+    extern logical lseres_(char* , char*, integer*, integer*, real*, real*, integer*, ftnlen, ftnlen);
+    extern /* Subroutine */ void cssbmv_(integer*, char*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, ftnlen);
     static real transl;
-    extern /* Subroutine */ int csspmv_(), cssymv_();
+    extern /* Subroutine */ void csspmv_(integer*, char*, integer*, real*, real*, real*, integer*, real*, real*, integer*, ftnlen);
+    extern /* Subroutine */ void cssymv_(integer*, char*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, ftnlen);
     static integer laa, lda;
     static real als, bls;
-    extern logical lse_();
+    extern logical lse_(real*, real*, integer*);
     static real err;
 
 /*  Tests SSYMV, SSBMV and SSPMV. */
@@ -2003,17 +1732,7 @@ L130:
 
 } /* schk2_ */
 
-/* Subroutine */ int schk3_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nkb, kb, ninc, inc, nmax, incmax, a, aa, as, x, 
-	xx, xs, xt, g, z__, iorder, sname_len)
-char *sname;
-real *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nkb, *kb, *ninc, *inc, *nmax, *incmax;
-real *a, *aa, *as, *x, *xx, *xs, *xt, *g, *z__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int schk3_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* ninc, integer* inc, integer* nmax, integer* incmax, real* a, real* aa, real* as, real* x, real* xx, real* xs, real* xt, real* g, real* z__, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -2034,9 +1753,9 @@ ftnlen sname_len;
     static integer i__, k, n;
     static char diags[1];
     static logical isame[13];
-    extern /* Subroutine */ int smake_();
+    extern /* Subroutine */ int smake_(char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, integer*, integer*, logical*, real*, ftnlen, ftnlen, ftnlen);
     static integer nargs;
-    extern /* Subroutine */ int smvch_();
+    extern /* Subroutine */ int smvch_(char*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, real*, real*, logical*, integer*, logical*, ftnlen);
     static logical reset;
     static char cuplo[14];
     static integer incxs;
@@ -2047,14 +1766,17 @@ ftnlen sname_len;
     static integer nk, ks, ix, ns, lx;
     static char ctrans[14];
     static real errmax;
-    extern logical lseres_();
-    extern /* Subroutine */ int cstbmv_();
+    extern logical lseres_(char*, char*, integer*, integer*, real*, real*, integer*, ftnlen, ftnlen);
+    extern /* Subroutine */ void cstbmv_(integer*, char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, ftnlen, ftnlen, ftnlen);
     static real transl;
-    extern /* Subroutine */ int cstbsv_();
+    extern /* Subroutine */ void cstbsv_(integer*, char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, ftnlen, ftnlen, ftnlen);
     static char transs[1];
-    extern /* Subroutine */ int cstpmv_(), cstrmv_(), cstpsv_(), cstrsv_();
+    extern /* Subroutine */ void cstpmv_(integer*, char*, char*, char*, integer*, real*, real*, integer*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ void cstrmv_(integer*, char*, char*, char*, integer*, real*, integer*, real*, integer*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ void cstpsv_(integer*, char*, char*, char*, integer*, real*, real*, integer*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ void cstrsv_(integer*, char*, char*, char*, integer*, real*, integer*, real*, integer*, ftnlen, ftnlen, ftnlen);
     static integer laa, icd, lda, ict, icu;
-    extern logical lse_();
+    extern logical lse_(real*, real*, integer*);
     static real err;
 
 /*  Tests STRMV, STBMV, STPMV, STRSV, STBSV and STPSV. */
@@ -2508,19 +2230,7 @@ L130:
 
 } /* schk3_ */
 
-/* Subroutine */ int schk4_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, 
-	xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len)
-char *sname;
-real *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-real *alf;
-integer *ninc, *inc, *nmax, *incmax;
-real *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt, *g, *z__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int schk4_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, real* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, real* a, real* aa, real* as, real* x, real* xx, real* xs, real* y, real* yy, real* ys, real* yt, real* g, real* z__, integer* iorder, ftnlen sname_len)
 {
     /* System generated locals */
     integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6;
@@ -2533,17 +2243,18 @@ ftnlen sname_len;
     static integer i__, j, m, n;
     static real alpha, w[1];
     static logical isame[13];
-    extern /* Subroutine */ int smake_(), csger_();
+/* Subroutine */ int smake_(char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, integer*, integer*, logical*, real*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ void csger_(integer*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, integer*);
     static integer nargs;
-    extern /* Subroutine */ int smvch_();
+    extern /* Subroutine */ int smvch_(char*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, real*, real*, logical*, integer*, logical*, ftnlen);
     static logical reset;
     static integer incxs, incys, ia, nc, nd, im, in, ms, ix, iy, ns, lx, ly;
     static real errmax;
-    extern logical lseres_();
+    extern logical lseres_(char* , char*, integer*, integer*, real*, real*, integer*, ftnlen, ftnlen);
     static real transl;
     static integer laa, lda;
     static real als;
-    extern logical lse_();
+    extern logical lse_(real*, real*, integer*);
     static real err;
 
 /*  Tests SGER. */
@@ -2848,19 +2559,7 @@ L150:
 
 } /* schk4_ */
 
-/* Subroutine */ int schk5_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, 
-	xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len)
-char *sname;
-real *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-real *alf;
-integer *ninc, *inc, *nmax, *incmax;
-real *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt, *g, *z__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int schk5_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, real* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, real* a, real* aa, real* as, real* x, real* xx, real* xs, real* y, real* yy, real* ys, real* yt, real* g, real* z__, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -2880,25 +2579,25 @@ ftnlen sname_len;
     static integer i__, j, n;
     static real alpha, w[1];
     static logical isame[13];
-    extern /* Subroutine */ int smake_();
+    extern /* Subroutine */ int smake_(char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, integer*, integer*, logical*, real*, ftnlen, ftnlen, ftnlen);
     static integer nargs;
-    extern /* Subroutine */ int smvch_();
+    extern /* Subroutine */ int smvch_(char*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, real*, real*, logical*, integer*, logical*, ftnlen);
     static logical reset;
     static char cuplo[14];
     static integer incxs;
-    extern /* Subroutine */ int csspr_();
+    extern /* Subroutine */ void csspr_(integer*, char*, integer*, real*, real*, integer*, real*, ftnlen);
     static logical upper;
     static char uplos[1];
-    extern /* Subroutine */ int cssyr_();
+    extern /* Subroutine */ void cssyr_(integer*, char*, integer*, real*, real*, integer*, real*, integer*, ftnlen);
     static integer ia, ja, ic, nc, jj, lj, in;
     static logical packed;
     static integer ix, ns, lx;
     static real errmax;
-    extern logical lseres_();
+    extern logical lseres_(char*, char*, integer*, integer*, real*, real*, integer*, ftnlen, ftnlen);
     static real transl;
     static integer laa, lda;
     static real als;
-    extern logical lse_();
+    extern logical lse_(real*, real*, integer*);
     static real err;
 
 /*  Tests SSYR and SSPR. */
@@ -3218,19 +2917,7 @@ L130:
 
 } /* schk5_ */
 
-/* Subroutine */ int schk6_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, 
-	xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len)
-char *sname;
-real *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-real *alf;
-integer *ninc, *inc, *nmax, *incmax;
-real *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt, *g, *z__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int schk6_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, real* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, real* a, real* aa, real* as, real* x, real* xx, real* xs, real* y, real* yy, real* ys, real* yt, real* g, real* z__, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -3249,26 +2936,26 @@ ftnlen sname_len;
     static integer i__, j, n;
     static real alpha, w[2];
     static logical isame[13];
-    extern /* Subroutine */ int smake_();
+    extern /* Subroutine */ int smake_(char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, integer*, integer*, logical*, real*, ftnlen, ftnlen, ftnlen);
     static integer nargs;
-    extern /* Subroutine */ int smvch_();
+    extern /* Subroutine */ int smvch_(char*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, real*, real*, logical*, integer*, logical*, ftnlen);
     static logical reset;
     static char cuplo[14];
     static integer incxs, incys;
     static logical upper;
     static char uplos[1];
     static integer ia, ja, ic;
-    extern /* Subroutine */ int csspr2_();
+    extern /* Subroutine */ void csspr2_(integer*, char*, integer*, real*, real*, integer*, real*, integer*, real*, ftnlen);
     static integer nc, jj, lj, in;
     static logical packed;
-    extern /* Subroutine */ int cssyr2_();
+    extern /* Subroutine */ void cssyr2_(integer*, char*, integer*, real*, real*, integer*, real*, integer*, real*, integer*, ftnlen);
     static integer ix, iy, ns, lx, ly;
     static real errmax;
-    extern logical lseres_();
+    extern logical lseres_(char* type__, char* uplo, integer* m, integer* n, real* aa, real* as, integer* lda, ftnlen ltype_len, ftnlen uplo_len);
     static real transl;
     static integer laa, lda;
     static real als;
-    extern logical lse_();
+    extern logical lse_(real*, real*, integer*);
     static real err;
 
 /*  Tests SSYR2 and SSPR2. */
@@ -3634,26 +3321,14 @@ L170:
 
 } /* schk6_ */
 
-/* Subroutine */ int smake_(type__, uplo, diag, m, n, a, nmax, aa, lda, kl, 
-	ku, reset, transl, type_len, uplo_len, diag_len)
-char *type__, *uplo, *diag;
-integer *m, *n;
-real *a;
-integer *nmax;
-real *aa;
-integer *lda, *kl, *ku;
-logical *reset;
-real *transl;
-ftnlen type_len;
-ftnlen uplo_len;
-ftnlen diag_len;
-{
+/* Subroutine */ int smake_(char* type__, char* uplo, char* diag, integer* m, integer* n, real* a, integer* nmax, real* aa, integer* lda, integer* kl, integer* ku, logical* reset, real* transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len)
+ {
     /* System generated locals */
     integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
 
     /* Local variables */
     static integer ibeg, iend;
-    extern doublereal sbeg_();
+    extern doublereal sbeg_(logical*);
     static integer ioff;
     static logical unit;
     static integer i__, j;
@@ -3879,28 +3554,14 @@ ftnlen diag_len;
 
 } /* smake_ */
 
-/* Subroutine */ int smvch_(trans, m, n, alpha, a, nmax, x, incx, beta, y, 
-	incy, yt, g, yy, eps, err, fatal, nout, mv, trans_len)
-char *trans;
-integer *m, *n;
-real *alpha, *a;
-integer *nmax;
-real *x;
-integer *incx;
-real *beta, *y;
-integer *incy;
-real *yt, *g, *yy, *eps, *err;
-logical *fatal;
-integer *nout;
-logical *mv;
-ftnlen trans_len;
+/* Subroutine */ int smvch_(char* trans, integer* m, integer* n, real* alpha, real* a, integer* nmax, real* x, integer* incx, real* beta, real* y, integer* incy, real* yt, real* g, real* yy, real* eps, real* err, logical* fatal, integer* nout, logical* mv, ftnlen trans_len)
 {
     /* System generated locals */
     integer a_dim1, a_offset, i__1, i__2;
     real r__1;
 
     /* Builtin functions */
-    double sqrt();
+    double sqrt(double);
 
     /* Local variables */
     static real erri;
@@ -4029,9 +3690,7 @@ L70:
 
 } /* smvch_ */
 
-logical lse_(ri, rj, lr)
-real *ri, *rj;
-integer *lr;
+logical lse_(real* ri, real* rj, integer* lr)
 {
     /* System generated locals */
     integer i__1;
@@ -4076,13 +3735,7 @@ L30:
 
 } /* lse_ */
 
-logical lseres_(type__, uplo, m, n, aa, as, lda, type_len, uplo_len)
-char *type__, *uplo;
-integer *m, *n;
-real *aa, *as;
-integer *lda;
-ftnlen type_len;
-ftnlen uplo_len;
+logical lseres_(char* type__, char* uplo, integer* m, integer* n, real* aa, real* as, integer* lda, ftnlen ltype_len, ftnlen uplo_len)
 {
     /* System generated locals */
     integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2;
@@ -4169,8 +3822,7 @@ L80:
 
 } /* lseres_ */
 
-doublereal sbeg_(reset)
-logical *reset;
+doublereal sbeg_(logical* reset)
 {
     /* System generated locals */
     real ret_val;
@@ -4221,8 +3873,7 @@ L10:
 
 } /* sbeg_ */
 
-doublereal sdiff_(x, y)
-real *x, *y;
+doublereal sdiff_(real* x, real* y)
 {
     /* System generated locals */
     real ret_val;
diff --git a/ctest/c_sblat3c.c b/ctest/c_sblat3c.c
index 73897a8a5..402c58c8b 100644
--- a/ctest/c_sblat3c.c
+++ b/ctest/c_sblat3c.c
@@ -242,129 +242,6 @@ typedef struct Namelist Namelist;
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-#if 0
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
 
 
 /* Common Block Declarations */
@@ -393,7 +270,7 @@ static logical c_true = TRUE_;
 static integer c__0 = 0;
 static logical c_false = FALSE_;
 
-/* Main program  MAIN__() */ int main()
+/* Main program  MAIN__() */ int main(void)
 {
     /* Initialized data */
 
@@ -402,26 +279,25 @@ static logical c_false = FALSE_;
     /* System generated locals */
     integer i__1, i__2, i__3;
     real r__1;
-    /* Builtin functions */
-    integer s_rsle(), do_lio(), e_rsle(), f_open(), s_wsfe(), do_fio(), 
-	    e_wsfe(), s_wsle(), e_wsle(), s_rsfe(), e_rsfe();
-    integer f_clos();
 
     /* Local variables */
     static integer nalf, idim[9];
     static logical same;
     static integer nbet, ntra;
     static logical rewi;
-    extern /* Subroutine */ int schk1_(), schk2_(), schk3_(), schk4_(), 
-	    schk5_();
+    extern /* Subroutine */ int schk1_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, real*, integer*, real*, integer*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, integer*, ftnlen);
+    extern /* Subroutine */ int schk2_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, real*, integer*, real*, integer*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, integer*, ftnlen);
+    extern /* Subroutine */ int schk3_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, real*, integer*, real*, real*, real*, real*, real*, real*, real*, real*, real*, integer*, ftnlen);
+    extern /* Subroutine */ int schk4_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, real*, integer*, real*, integer*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, integer*, ftnlen);
+    extern /* Subroutine */ int schk5_(char*, real*, real*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, real*, integer*, real*, integer*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, real*, integer*, ftnlen);
     static real c__[4225]	/* was [65][65] */, g[65];
     static integer i__, j, n;
     static logical fatal;
     static real w[130];
-    extern doublereal sdiff_();
+    extern doublereal sdiff_(real*, real*);
     static logical trace;
     static integer nidim;
-    extern /* Subroutine */ int smmch_();
+    extern /* Subroutine */ int smmch_(char*, char*, integer*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, integer*, real*, real*, logical*, integer*, logical*, ftnlen, ftnlen);
     static char snaps[32];
     static integer isnum;
     static logical ltest[6];
@@ -433,9 +309,9 @@ static logical c_false = FALSE_;
     static logical rorder;
     static integer layout;
     static logical ltestt, tsterr;
-    extern /* Subroutine */ int cs3chke_();
+    extern /* Subroutine */ void cs3chke_(char*, ftnlen);
     static real alf[7], bet[7];
-    extern logical lse_();
+    extern logical lse_(real*, real*, integer*);
     static real eps, err;
     char tmpchar;
 
@@ -899,21 +775,7 @@ L230:
 
 } /* MAIN__ */
 
-/* Subroutine */ int schk1_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, 
-	c__, cc, cs, ct, g, iorder, sname_len)
-char *sname;
-real *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-real *alf;
-integer *nbet;
-real *bet;
-integer *nmax;
-real *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct, *g;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int schk1_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, real* alf, integer* nbet, real* bet, integer* nmax, real* a, real* aa, real* as, real* b, real* bb, real* bs, real* c__, real* cc, real* cs, real* ct, real* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -923,8 +785,6 @@ ftnlen sname_len;
     integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
 	    i__3, i__4, i__5, i__6;
 
-    /* Builtin functions */
-    integer f_rew(), s_wsfe(), e_wsfe(), do_fio();
 
     /* Local variables */
     static real beta;
@@ -936,18 +796,17 @@ ftnlen sname_len;
     static logical trana, tranb;
     static integer nargs;
     static logical reset;
-    extern /* Subroutine */ void sprcn1_();
-    extern /* Subroutine */ int smake_();
-    extern /* Subroutine */ int smmch_();
+    extern /* Subroutine */ void sprcn1_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, integer*, real*, integer*, integer*, real*, integer*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ int smake_(char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, logical*, real*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ int smmch_(char*, char*, integer*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, integer*, real*, real*, logical*, integer*, logical*, ftnlen, ftnlen);
     static integer ia, ib, ma, mb, na, nb, nc, ik, im, in, ks, ms, ns;
-    extern /* Subroutine */ int csgemm_();
+    extern /* Subroutine */ void csgemm_(integer*, char*, char*, integer*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, ftnlen, ftnlen);
     static char tranas[1], tranbs[1], transa[1], transb[1];
     static real errmax;
-    extern logical lseres_();
-    extern logical lse_();
+    extern logical lseres_(char*, char*, integer*, integer*, real*, real*, integer*, ftnlen, ftnlen);
+    extern logical lse_(real*, real*, integer*);
     static integer ica, icb, laa, lbb, lda, lcc, ldb, ldc;
     static real als, bls;
-    extern logical lse_();
     static real err;
 
 /*  Tests SGEMM. */
@@ -1278,23 +1137,8 @@ L130:
 
 
 
-/* Subroutine */ void sprcn1_(nout, nc, sname, iorder, transa, transb, m, n, k,
-	 alpha, lda, ldb, beta, ldc, sname_len, transa_len, transb_len)
-integer *nout, *nc;
-char *sname;
-integer *iorder;
-char *transa, *transb;
-integer *m, *n, *k;
-real *alpha;
-integer *lda, *ldb;
-real *beta;
-integer *ldc;
-ftnlen sname_len;
-ftnlen transa_len;
-ftnlen transb_len;
+/* Subroutine */ void sprcn1_(integer* nout, integer* nc, char* sname, integer* iorder, char* transa, char* transb, integer* m, integer* n, integer* k, real* alpha, integer* lda, integer* ldb, real* beta, integer* ldc, ftnlen sname_len, ftnlen transa_len, ftnlen transb_len)
 {
-    /* Builtin functions */
-    integer s_wsfe(), do_fio(), e_wsfe();
 
     /* Local variables */
     static char crc[14], cta[14], ctb[14];
@@ -1324,21 +1168,7 @@ ftnlen transb_len;
 } /* sprcn1_ */
 
 
-/* Subroutine */ int schk2_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, 
-	c__, cc, cs, ct, g, iorder, sname_len)
-char *sname;
-real *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-real *alf;
-integer *nbet;
-real *bet;
-integer *nmax;
-real *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct, *g;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int schk2_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, real* alf, integer* nbet, real* bet, integer* nmax, real* a, real* aa, real* as, real* b, real* bb, real* bs, real* c__, real* cc, real* cs, real* ct, real* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -1349,8 +1179,6 @@ ftnlen sname_len;
     integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
 	    i__3, i__4, i__5;
 
-    /* Builtin functions */
-    integer f_rew(), s_wsfe(), e_wsfe(), do_fio();
 
     /* Local variables */
     static real beta;
@@ -1368,15 +1196,15 @@ ftnlen sname_len;
     static char uplos[1];
     static integer ia, ib, na, nc, im, in, ms, ns;
     static real errmax;
-    extern logical lseres_();
-    extern /* Subroutine */ int cssymm_();
-    extern void sprcn2_();
-    extern int smake_();
-    extern int smmch_();
+    extern logical lseres_(char*, char*, integer*, integer*, real*, real*, integer*, ftnlen, ftnlen);
+    extern /* Subroutine */ void cssymm_(integer*, char*, char*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, ftnlen, ftnlen);
+    extern void sprcn2_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, real*, integer*, integer*, real*, integer*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ int smake_(char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, logical*, real*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ int smmch_(char*, char*, integer*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, integer*, real*, real*, logical*, integer*, logical*, ftnlen, ftnlen);
     static integer laa, lbb, lda, lcc, ldb, ldc, ics;
     static real als, bls;
     static integer icu;
-    extern logical lse_();
+    extern logical lse_(real*, real*, integer*);
     static real err;
 
 /*  Tests SSYMM. */
@@ -1685,23 +1513,8 @@ L120:
 } /* schk2_ */
 
 
-/* Subroutine */ void sprcn2_(nout, nc, sname, iorder, side, uplo, m, n, alpha,
-	 lda, ldb, beta, ldc, sname_len, side_len, uplo_len)
-integer *nout, *nc;
-char *sname;
-integer *iorder;
-char *side, *uplo;
-integer *m, *n;
-real *alpha;
-integer *lda, *ldb;
-real *beta;
-integer *ldc;
-ftnlen sname_len;
-ftnlen side_len;
-ftnlen uplo_len;
+/* Subroutine */ void sprcn2_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, integer* m, integer* n, real* alpha, integer* lda, integer* ldb, real* beta, integer* ldc, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len)
 {
-    /* Builtin functions */
-    integer s_wsfe(), do_fio(), e_wsfe();
 
     /* Local variables */
     static char cs[14], cu[14], crc[14];
@@ -1726,19 +1539,7 @@ ftnlen uplo_len;
 } /* sprcn2_ */
 
 
-/* Subroutine */ int schk3_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, nmax, a, aa, as, b, bb, bs, ct, g, c__,
-	 iorder, sname_len)
-char *sname;
-real *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-real *alf;
-integer *nmax;
-real *a, *aa, *as, *b, *bb, *bs, *ct, *g, *c__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int schk3_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, real* alf, integer* nmax, real* a, real* aa, real* as, real* b, real* bb, real* bs, real* ct, real* g, real* c__, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -1751,8 +1552,6 @@ ftnlen sname_len;
     integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
 	    i__3, i__4, i__5;
 
-    /* Builtin functions */
-    integer f_rew(), s_wsfe(), e_wsfe(), do_fio();
 
     /* Local variables */
     static char diag[1];
@@ -1769,18 +1568,19 @@ ftnlen sname_len;
     static integer nargs;
     static logical reset;
     static char uplos[1];
-    extern /* Subroutine */ void sprcn3_();
+    extern /* Subroutine */ void sprcn3_(integer*, integer*, char*, integer*, char*, char*, char*, char*, integer*, integer*, real*, integer*, integer*, ftnlen , ftnlen, ftnlen, ftnlen, ftnlen);
     static integer ia, na, nc, im, in, ms, ns;
     static char tranas[1], transa[1];
     static real errmax;
-    extern int smake_();
-    extern int smmch_();
-    extern logical lseres_();
-    extern /* Subroutine */ int cstrmm_(), cstrsm_();
+    extern /* Subroutine */ int smake_(char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, logical*, real*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ int smmch_(char*, char*, integer*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, integer*, real*, real*, logical*, integer*, logical*, ftnlen, ftnlen);
+    extern logical lseres_(char*, char*, integer*, integer*, real*, real*, integer*, ftnlen, ftnlen);
+    extern /* Subroutine */ void cstrmm_(integer*, char*, char*, char*, char*, integer*, integer*, real*, real*, integer*, real*, integer*, ftnlen, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ void cstrsm_(integer*, char*, char*, char*, char*, integer*, integer*, real*, real*, integer*, real*, integer*, ftnlen, ftnlen, ftnlen, ftnlen);
     static integer laa, icd, lbb, lda, ldb, ics;
     static real als;
     static integer ict, icu;
-    extern logical lse_();
+    extern logical lse_(real*, real*, integer*);
     static real err;
 
 /*  Tests STRMM and STRSM. */
@@ -2155,24 +1955,8 @@ L160:
 } /* schk3_ */
 
 
-/* Subroutine */ void sprcn3_(nout, nc, sname, iorder, side, uplo, transa, 
-	diag, m, n, alpha, lda, ldb, sname_len, side_len, uplo_len, 
-	transa_len, diag_len)
-integer *nout, *nc;
-char *sname;
-integer *iorder;
-char *side, *uplo, *transa, *diag;
-integer *m, *n;
-real *alpha;
-integer *lda, *ldb;
-ftnlen sname_len;
-ftnlen side_len;
-ftnlen uplo_len;
-ftnlen transa_len;
-ftnlen diag_len;
+/* Subroutine */ void sprcn3_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, char* transa, char* diag, integer* m, integer* n, real* alpha, integer* lda, integer* ldb, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len, ftnlen transa_len, ftnlen diag_len)
 {
-    /* Builtin functions */
-    integer s_wsfe(), do_fio(), e_wsfe();
 
     /* Local variables */
     static char ca[14], cd[14], cs[14], cu[14], crc[14];
@@ -2210,21 +1994,7 @@ ftnlen diag_len;
 } /* sprcn3_ */
 
 
-/* Subroutine */ int schk4_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, 
-	c__, cc, cs, ct, g, iorder, sname_len)
-char *sname;
-real *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-real *alf;
-integer *nbet;
-real *bet;
-integer *nmax;
-real *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct, *g;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int schk4_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, real* alf, integer* nbet, real* bet, integer* nmax, real* a, real* aa, real* as, real* b, real* bb, real* bs, real* c__, real* cc, real* cs, real* ct, real* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -2235,8 +2005,6 @@ ftnlen sname_len;
     integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
 	    i__3, i__4, i__5;
 
-    /* Builtin functions */
-    integer f_rew(), s_wsfe(), e_wsfe(), do_fio();
 
     /* Local variables */
     static real beta;
@@ -2253,18 +2021,18 @@ ftnlen sname_len;
     static char trans[1];
     static logical upper;
     static char uplos[1];
-    extern /* Subroutine */ void sprcn4_();
-    extern /* Subroutine */ int smake_();
-    extern /* Subroutine */ int smmch_();
+    extern /* Subroutine */ void sprcn4_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ int smake_(char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, logical*, real*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ int smmch_(char*, char*, integer*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, integer*, real*, real*, logical*, integer*, logical*, ftnlen, ftnlen);
     static integer ia, ib, jc, ma, na, nc, ik, in, jj, lj, ks, ns;
     static real errmax;
-    extern logical lseres_();
+    extern logical lseres_(char*, char*, integer*, integer*, real*, real*, integer*, ftnlen, ftnlen);
     static char transs[1];
-    extern /* Subroutine */ int cssyrk_();
+    extern /* Subroutine */ void cssyrk_(integer*, char*, char*, integer*, integer*, real*, real*, integer*, real*, real*, integer*, ftnlen, ftnlen);
     static integer laa, lda, lcc, ldc;
     static real als;
     static integer ict, icu;
-    extern logical lse_();
+    extern logical lse_(real*, real*, integer*);
     static real err;
 
 /*  Tests SSYRK. */
@@ -2575,23 +2343,8 @@ L130:
 } /* schk4_ */
 
 
-/* Subroutine */ void sprcn4_(nout, nc, sname, iorder, uplo, transa, n, k, 
-	alpha, lda, beta, ldc, sname_len, uplo_len, transa_len)
-integer *nout, *nc;
-char *sname;
-integer *iorder;
-char *uplo, *transa;
-integer *n, *k;
-real *alpha;
-integer *lda;
-real *beta;
-integer *ldc;
-ftnlen sname_len;
-ftnlen uplo_len;
-ftnlen transa_len;
+/* Subroutine */ void sprcn4_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, real* alpha, integer* lda, real* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len)
 {
-    /* Builtin functions */
-    integer s_wsfe(), do_fio(), e_wsfe();
 
     /* Local variables */
     static char ca[14], cu[14], crc[14];
@@ -2619,21 +2372,7 @@ ftnlen transa_len;
 } /* sprcn4_ */
 
 
-/* Subroutine */ int schk5_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, nbet, bet, nmax, ab, aa, as, bb, bs, 
-	c__, cc, cs, ct, g, w, iorder, sname_len)
-char *sname;
-real *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-real *alf;
-integer *nbet;
-real *bet;
-integer *nmax;
-real *ab, *aa, *as, *bb, *bs, *c__, *cc, *cs, *ct, *g, *w;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int schk5_(char* sname, real* eps, real* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, real* alf, integer* nbet, real* bet, integer* nmax, real* ab, real* aa, real* as, real* bb, real* bs, real* c__, real* cc, real* cs, real* ct, real* g, real* w, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -2643,8 +2382,6 @@ ftnlen sname_len;
     /* System generated locals */
     integer c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8;
 
-    /* Builtin functions */
-    integer f_rew(), s_wsfe(), e_wsfe(), do_fio();
 
     /* Local variables */
     static integer jjab;
@@ -2663,18 +2400,18 @@ ftnlen sname_len;
     static logical upper;
     static char uplos[1];
     static integer ia, ib;
-    extern /* Subroutine */ void sprcn5_();
+    extern /* Subroutine */ void sprcn5_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, real*, integer*, integer*, real*, integer*, ftnlen, ftnlen, ftnlen);
     static integer jc, ma, na, nc, ik, in, jj, lj, ks, ns;
     static real errmax;
-    extern logical lseres_();
-    extern int smake_();
+    extern logical lseres_(char*, char*, integer*, integer*, real*, real*, integer*, ftnlen, ftnlen);
+    extern /* Subroutine */ int smake_(char*, char*, char*, integer*, integer*, real*, integer*, real*, integer*, logical*, real*, ftnlen, ftnlen, ftnlen);
     static char transs[1];
     static integer laa, lbb, lda, lcc, ldb, ldc;
     static real als;
     static integer ict, icu;
-    extern /* Subroutine */ int cssyr2k_();
-    extern logical lse_();
-    extern int smmch_();
+    extern /* Subroutine */ void cssyr2k_(integer*, char*, char*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, ftnlen, ftnlen);
+    extern logical lse_(real*, real*, integer*);
+    extern /* Subroutine */ int smmch_(char*, char*, integer*, integer*, integer*, real*, real*, integer*, real*, integer*, real*, real*, integer*, real*, real*, real*, integer*, real*, real*, logical*, integer*, logical*, ftnlen, ftnlen);
     static real err;
 
 /*  Tests SSYR2K. */
@@ -3037,23 +2774,8 @@ L160:
 } /* schk5_ */
 
 
-/* Subroutine */ void sprcn5_(nout, nc, sname, iorder, uplo, transa, n, k, 
-	alpha, lda, ldb, beta, ldc, sname_len, uplo_len, transa_len)
-integer *nout, *nc;
-char *sname;
-integer *iorder;
-char *uplo, *transa;
-integer *n, *k;
-real *alpha;
-integer *lda, *ldb;
-real *beta;
-integer *ldc;
-ftnlen sname_len;
-ftnlen uplo_len;
-ftnlen transa_len;
+/* Subroutine */ void sprcn5_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, real* alpha, integer* lda, integer* ldb, real* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len)
 {
-    /* Builtin functions */
-    integer s_wsfe(), do_fio(), e_wsfe();
 
     /* Local variables */
     static char ca[14], cu[14], crc[14];
@@ -3081,19 +2803,7 @@ ftnlen transa_len;
 } /* sprcn5_ */
 
 
-/* Subroutine */ int smake_(type__, uplo, diag, m, n, a, nmax, aa, lda, reset,
-	 transl, type_len, uplo_len, diag_len)
-char *type__, *uplo, *diag;
-integer *m, *n;
-real *a;
-integer *nmax;
-real *aa;
-integer *lda;
-logical *reset;
-real *transl;
-ftnlen type_len;
-ftnlen uplo_len;
-ftnlen diag_len;
+/* Subroutine */ int smake_(char* type__, char* uplo, char* diag, integer* m, integer* n, real* a, integer* nmax, real* aa, integer* lda, logical* reset, real* transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len)
 {
     /* System generated locals */
     integer a_dim1, a_offset, i__1, i__2;
@@ -3102,7 +2812,7 @@ ftnlen diag_len;
 
     /* Local variables */
     static integer ibeg, iend;
-    extern doublereal sbeg_();
+    extern doublereal sbeg_(logical*);
     static logical unit;
     static integer i__, j;
     static logical lower, upper, gen, tri, sym;
@@ -3233,25 +2943,7 @@ ftnlen diag_len;
 
 } /* smake_ */
 
-/* Subroutine */ int smmch_(transa, transb, m, n, kk, alpha, a, lda, b, ldb, 
-	beta, c__, ldc, ct, g, cc, ldcc, eps, err, fatal, nout, mv, 
-	transa_len, transb_len)
-char *transa, *transb;
-integer *m, *n, *kk;
-real *alpha, *a;
-integer *lda;
-real *b;
-integer *ldb;
-real *beta, *c__;
-integer *ldc;
-real *ct, *g, *cc;
-integer *ldcc;
-real *eps, *err;
-logical *fatal;
-integer *nout;
-logical *mv;
-ftnlen transa_len;
-ftnlen transb_len;
+/* Subroutine */ int smmch_(char* transa, char* transb, integer* m, integer* n, integer* kk, real* alpha, real* a, integer* lda, real* b, integer* ldb, real* beta, real* c__, integer* ldc, real* ct, real* g, real* cc, integer* ldcc, real* eps, real* err, logical* fatal, integer* nout, logical* mv, ftnlen transa_len, ftnlen transb_len)
 {
 
     /* System generated locals */
@@ -3260,8 +2952,7 @@ ftnlen transb_len;
     real r__1, r__2;
 
     /* Builtin functions */
-    double sqrt();
-    integer s_wsfe(), e_wsfe(), do_fio();
+    double sqrt(double);
 
     /* Local variables */
     static real erri;
@@ -3426,9 +3117,7 @@ L150:
 
 } /* smmch_ */
 
-logical lse_(ri, rj, lr)
-real *ri, *rj;
-integer *lr;
+logical lse_(real* ri, real* rj, integer* lr)
 {
     /* System generated locals */
     integer i__1;
@@ -3475,13 +3164,7 @@ L30:
 
 } /* lse_ */
 
-logical lseres_(type__, uplo, m, n, aa, as, lda, type_len, uplo_len)
-char *type__, *uplo;
-integer *m, *n;
-real *aa, *as;
-integer *lda;
-ftnlen type_len;
-ftnlen uplo_len;
+logical lseres_(char* type__, char* uplo, integer* m, integer* n, real* aa, real* as, integer* lda, ftnlen type_len, ftnlen uplo_len)
 {
     /* System generated locals */
     integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2;
@@ -3572,8 +3255,7 @@ L80:
 
 } /* lseres_ */
 
-doublereal sbeg_(reset)
-logical *reset;
+doublereal sbeg_(logical* reset)
 {
     /* System generated locals */
     real ret_val;
@@ -3625,8 +3307,7 @@ L10:
 
 } /* sbeg_ */
 
-doublereal sdiff_(x, y)
-real *x, *y;
+doublereal sdiff_(real* x, real* y)
 {
     /* System generated locals */
     real ret_val;
diff --git a/ctest/c_zblat1c.c b/ctest/c_zblat1c.c
index d5b080633..f7c0515fc 100644
--- a/ctest/c_zblat1c.c
+++ b/ctest/c_zblat1c.c
@@ -242,250 +242,6 @@ typedef struct Namelist Namelist;
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-#if 0
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-#endif
-#if 0
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-#endif
 
 /* Common Block Declarations */
 
@@ -502,16 +258,16 @@ static integer c__1 = 1;
 static integer c__5 = 5;
 static doublereal c_b43 = 1.;
 
-/* Main program */ int main()
+/* Main program */ int main(void)
 {
     /* Initialized data */
 
     static doublereal sfac = 9.765625e-4;
 
     /* Local variables */
-    extern /* Subroutine */ int check1_(), check2_();
+    extern /* Subroutine */ int check1_(doublereal*), check2_(doublereal*);
     static integer ic;
-    extern /* Subroutine */ int header_();
+    extern /* Subroutine */ int header_(void);
 
 /*     Test program for the COMPLEX*16 Level 1 CBLAS. */
 /*     Based upon the original CBLAS test routine together with: */
@@ -551,7 +307,7 @@ static doublereal c_b43 = 1.;
     exit(0);
 } /* MAIN__ */
 
-/* Subroutine */ int header_()
+/* Subroutine */ int header_(void)
 {
     /* Initialized data */
 
@@ -570,8 +326,7 @@ static doublereal c_b43 = 1.;
 
 } /* header_ */
 
-/* Subroutine */ int check1_(sfac)
-doublereal *sfac;
+/* Subroutine */ int check1_(doublereal* sfac)
 {
     /* Initialized data */
 
@@ -623,15 +378,15 @@ doublereal *sfac;
 
     /* Local variables */
     static integer i__;
-    extern /* Subroutine */ int ctest_();
+    extern /* Subroutine */ int ctest_(integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*);
     static doublecomplex mwpcs[5], mwpct[5];
-    extern /* Subroutine */ int zscaltest_(), itest1_(), stest1_();
+    extern /* Subroutine */ int zscaltest_(integer*, doublereal*, doublecomplex*, integer*), itest1_(integer*, integer*), stest1_(doublereal*, doublereal*, doublereal*, doublereal*);
     static doublecomplex cx[8];
-    extern doublereal dznrm2test_();
+    extern doublereal dznrm2test_(integer*, doublecomplex*, integer*);
     static integer np1;
-    extern /* Subroutine */ int zdscaltest_();
-    extern integer izamaxtest_();
-    extern doublereal dzasumtest_();
+    extern /* Subroutine */ int zdscaltest_(integer*, doublereal*, doublecomplex*, integer*);
+    extern integer izamaxtest_(integer*, doublecomplex*, integer*);
+    extern doublereal dzasumtest_(integer*, doublecomplex*, integer*);
     static integer len;
 
 /*     .. Parameters .. */
@@ -748,8 +503,7 @@ doublereal *sfac;
     return 0;
 } /* check1_ */
 
-/* Subroutine */ int check2_(sfac)
-doublereal *sfac;
+/* Subroutine */ int check2_(doublereal* sfac)
 {
     /* Initialized data */
 
@@ -834,14 +588,14 @@ doublereal *sfac;
     /* Local variables */
     static doublecomplex cdot[1];
     static integer lenx, leny, i__;
-    extern /* Subroutine */ int ctest_();
+    extern /* Subroutine */ int ctest_(integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*);
     static integer ksize;
     static doublecomplex ztemp;
-    extern /* Subroutine */ int zdotctest_(), zcopytest_();
+    extern /* Subroutine */ int zdotctest_(integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*), zcopytest_(integer*, doublecomplex*, integer*, doublecomplex*, integer*);
     static integer ki;
-    extern /* Subroutine */ int zdotutest_(), zswaptest_();
+    extern /* Subroutine */ int zdotutest_(integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*), zswaptest_(integer*, doublecomplex*, integer*, doublecomplex*, integer*);
     static integer kn;
-    extern /* Subroutine */ int zaxpytest_();
+    extern /* Subroutine */ int zaxpytest_(integer*, doublereal*, doublecomplex*, integer*, doublecomplex*, integer*);
     static doublecomplex cx[7], cy[7];
     static integer mx, my;
 
@@ -923,20 +677,18 @@ doublereal *sfac;
     return 0;
 } /* check2_ */
 
-/* Subroutine */ int stest_(len, scomp, strue, ssize, sfac)
-integer *len;
-doublereal *scomp, *strue, *ssize, *sfac;
+/* Subroutine */ int stest_(integer* len, doublereal* scomp, doublereal* strue, doublereal* ssize, doublereal* sfac)
 {
     /* System generated locals */
     integer i__1;
     doublereal d__1, d__2, d__3, d__4, d__5;
 
     /* Builtin functions */
-    integer s_wsfe(), e_wsfe(), do_fio();
+    integer s_wsfe(void), e_wsfe(void), do_fio(void);
 
     /* Local variables */
     static integer i__;
-    extern doublereal sdiff_();
+    extern doublereal sdiff_(doublereal*, doublereal*);
     static doublereal sd;
 
 /*     ********************************* STEST ************************** */
@@ -992,11 +744,10 @@ L40:
 
 } /* stest_ */
 
-/* Subroutine */ int stest1_(scomp1, strue1, ssize, sfac)
-doublereal *scomp1, *strue1, *ssize, *sfac;
+/* Subroutine */ int stest1_(doublereal* scomp1, doublereal* strue1, doublereal* ssize, doublereal* sfac)
 {
     static doublereal scomp[1], strue[1];
-    extern /* Subroutine */ int stest_();
+    extern /* Subroutine */ int stest_(integer*,doublereal*, doublereal*, doublereal*, doublereal*);
 
 /*     ************************* STEST1 ***************************** */
 
@@ -1023,8 +774,7 @@ doublereal *scomp1, *strue1, *ssize, *sfac;
     return 0;
 } /* stest1_ */
 
-doublereal sdiff_(sa, sb)
-doublereal *sa, *sb;
+doublereal sdiff_(doublereal* sa, doublereal* sb)
 {
     /* System generated locals */
     doublereal ret_val;
@@ -1038,10 +788,7 @@ doublereal *sa, *sb;
     return ret_val;
 } /* sdiff_ */
 
-/* Subroutine */ int ctest_(len, ccomp, ctrue, csize, sfac)
-integer *len;
-doublecomplex *ccomp, *ctrue, *csize;
-doublereal *sfac;
+/* Subroutine */ int ctest_(integer* len, doublecomplex* ccomp, doublecomplex* ctrue, doublecomplex* csize, doublereal* sfac)
 {
     /* System generated locals */
     integer i__1, i__2;
@@ -1049,7 +796,7 @@ doublereal *sfac;
     /* Local variables */
     static integer i__;
     static doublereal scomp[20], ssize[20], strue[20];
-    extern /* Subroutine */ int stest_();
+    extern /* Subroutine */ int stest_(integer*, doublereal*, doublereal*, doublereal*, doublereal*);
 
 /*     **************************** CTEST ***************************** */
 
@@ -1087,8 +834,7 @@ doublereal *sfac;
     return 0;
 } /* ctest_ */
 
-/* Subroutine */ int itest1_(icomp, itrue)
-integer *icomp, *itrue;
+/* Subroutine */ int itest1_(integer* icomp, integer* itrue)
 {
     static integer id;
 
diff --git a/ctest/c_zblat2c.c b/ctest/c_zblat2c.c
index 92f03a045..8abd58394 100644
--- a/ctest/c_zblat2c.c
+++ b/ctest/c_zblat2c.c
@@ -242,129 +242,6 @@ typedef struct Namelist Namelist;
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-#if 0
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
 
 
 /* Common Block Declarations */
@@ -396,7 +273,7 @@ static integer c_n1 = -1;
 static integer c__0 = 0;
 static logical c_false = FALSE_;
 
-/* Main program */ int main()
+/* Main program */ int main(void)
 {
     /* Initialized data */
 
@@ -414,19 +291,23 @@ static logical c_false = FALSE_;
     static logical same;
     static integer ninc, nbet, ntra;
     static logical rewi;
-    extern /* Subroutine */ int zchk1_(), zchk2_(), zchk3_(), zchk4_(), 
-	    zchk5_(), zchk6_();
+    extern /* Subroutine */ int zchk1_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ int zchk2_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ int zchk3_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, integer*, integer*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, doublecomplex*, integer*, ftnlen);
+    extern /* Subroutine */ int zchk4_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, doublecomplex*, integer*, ftnlen);
+    extern /* Subroutine */ int zchk5_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, doublecomplex*, integer*, ftnlen);
+    extern /* Subroutine */ int zchk6_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, doublecomplex*, integer*, ftnlen);
     static doublecomplex a[4225]	/* was [65][65] */;
     static doublereal g[65];
     static integer i__, j;
-    extern doublereal ddiff_();
+    extern doublereal ddiff_(doublereal*, doublereal*);
     static integer n;
     static logical fatal;
     static doublecomplex x[65], y[65], z__[130];
     static logical trace;
     static integer nidim;
     static char snaps[32], trans[1];
-    extern /* Subroutine */ int zmvch_();
+    extern /* Subroutine */ int zmvch_(char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen);
     static integer isnum;
     static logical ltest[17];
     static doublecomplex aa[4225];
@@ -441,12 +322,12 @@ static logical c_false = FALSE_;
     static logical rorder;
     static integer layout;
     static logical ltestt, tsterr;
-    extern /* Subroutine */ int cz2chke_();
+    extern /* Subroutine */ void cz2chke_(char*, ftnlen);
     static doublecomplex alf[7];
     static integer inc[7], nkb;
     static doublecomplex bet[7];
     static doublereal eps, err;
-    extern logical lze_();
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
     char tmpchar;
 
 /*  Test program for the DOUBLE PRECISION COMPLEX Level 2 Blas. */
@@ -984,22 +865,7 @@ L240:
 
 } /* MAIN__ */
 
-/* Subroutine */ int zchk1_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nkb, kb, nalf, alf, nbet, bet, ninc, inc, nmax, 
-	incmax, a, aa, as, x, xx, xs, y, yy, ys, yt, g, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nkb, *kb, *nalf;
-doublecomplex *alf;
-integer *nbet;
-doublecomplex *bet;
-integer *ninc, *inc, *nmax, *incmax;
-doublecomplex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt;
-doublereal *g;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int zchk1_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* x, doublecomplex* xx, doublecomplex* xs, doublecomplex* y, doublecomplex* yy, doublecomplex* ys, doublecomplex* yt, doublereal* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -1018,27 +884,27 @@ ftnlen sname_len;
     static integer i__, m, n;
     static doublecomplex alpha;
     static logical isame[13];
-    extern /* Subroutine */ int zmake_();
+    extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, integer*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen);
     static integer nargs;
     static logical reset;
     static integer incxs, incys;
     static char trans[1];
-    extern /* Subroutine */ int zmvch_();
+    extern /* Subroutine */ int zmvch_(char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen);
     static integer ia, ib, ic;
     static logical banded;
     static integer nc, nd, im, in, kl, ml, nk, nl, ku, ix, iy, ms, lx, ly, ns;
-    extern /* Subroutine */ int czgbmv_();
+    extern /* Subroutine */ void czgbmv_(integer*, char*, integer*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen);
     static char ctrans[14];
-    extern /* Subroutine */ int czgemv_();
+    extern /* Subroutine */ void czgemv_(integer*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen);
     static doublereal errmax;
     static doublecomplex transl;
-    extern logical lzeres_();
+    extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
     static char transs[1];
     static integer laa, lda;
     static doublecomplex als, bls;
     static doublereal err;
     static integer iku, kls;
-    extern logical lze_();
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
     static integer kus;
 
 
@@ -1451,22 +1317,7 @@ L140:
 
 } /* zchk1_ */
 
-/* Subroutine */ int zchk2_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nkb, kb, nalf, alf, nbet, bet, ninc, inc, nmax, 
-	incmax, a, aa, as, x, xx, xs, y, yy, ys, yt, g, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nkb, *kb, *nalf;
-doublecomplex *alf;
-integer *nbet;
-doublecomplex *bet;
-integer *ninc, *inc, *nmax, *incmax;
-doublecomplex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt;
-doublereal *g;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int zchk2_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* x, doublecomplex* xx, doublecomplex* xs, doublecomplex* y, doublecomplex* yy, doublecomplex* ys, doublecomplex* yt, doublereal* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -1486,27 +1337,28 @@ ftnlen sname_len;
     static integer i__, k, n;
     static doublecomplex alpha;
     static logical isame[13];
-    extern /* Subroutine */ int zmake_();
+    extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, integer*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen);
     static integer nargs;
     static logical reset;
     static char cuplo[14];
     static integer incxs, incys;
-    extern /* Subroutine */ int zmvch_();
+    extern /* Subroutine */ int zmvch_(char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen);
     static char uplos[1];
     static integer ia, ib, ic;
     static logical banded;
     static integer nc, ik, in;
     static logical packed;
     static integer nk, ks, ix, iy, ns, lx, ly;
-    extern /* Subroutine */ int czhbmv_(), czhemv_();
+    extern /* Subroutine */ void czhbmv_(integer*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen);
+    extern /* Subroutine */ void czhemv_(integer*, char*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen);
     static doublereal errmax;
     static doublecomplex transl;
-    extern logical lzeres_();
-    extern /* Subroutine */ int czhpmv_();
+    extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
+    extern /* Subroutine */ void czhpmv_(integer*, char*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen);
     static integer laa, lda;
     static doublecomplex als, bls;
     static doublereal err;
-    extern logical lze_();
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
 
 /*  Tests CHEMV, CHBMV and CHPMV. */
 
@@ -1909,19 +1761,7 @@ L130:
 
 } /* zchk2_ */
 
-/* Subroutine */ int zchk3_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nkb, kb, ninc, inc, nmax, incmax, a, aa, as, x, 
-	xx, xs, xt, g, z__, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nkb, *kb, *ninc, *inc, *nmax, *incmax;
-doublecomplex *a, *aa, *as, *x, *xx, *xs, *xt;
-doublereal *g;
-doublecomplex *z__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int zchk3_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nkb, integer* kb, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* x, doublecomplex* xx, doublecomplex* xs, doublecomplex* xt, doublereal* g, doublecomplex* z__, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -1942,13 +1782,13 @@ ftnlen sname_len;
     static integer i__, k, n;
     static char diags[1];
     static logical isame[13];
-    extern /* Subroutine */ int zmake_();
+    extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, integer*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen);
     static integer nargs;
     static logical reset;
     static char cuplo[14];
     static integer incxs;
     static char trans[1];
-    extern /* Subroutine */ int zmvch_();
+    extern /* Subroutine */ int zmvch_(char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen);
     static char uplos[1];
     static logical banded;
     static integer nc, ik, in;
@@ -1957,14 +1797,17 @@ ftnlen sname_len;
     static char ctrans[14];
     static doublereal errmax;
     static doublecomplex transl;
-    extern logical lzeres_();
-    extern /* Subroutine */ int cztbmv_();
+    extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
+    extern /* Subroutine */ void cztbmv_(integer*, char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen);
     static char transs[1];
-    extern /* Subroutine */ int cztbsv_(), cztpmv_(), cztrmv_(), cztpsv_(), 
-	    cztrsv_();
+    extern /* Subroutine */ void cztbsv_(integer*, char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ void cztpmv_(integer*, char*, char*, char*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ void cztpsv_(integer*, char*, char*, char*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ void cztrmv_(integer*, char*, char*, char*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ void cztrsv_(integer*, char*, char*, char*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen);
     static integer laa, icd, lda, ict, icu;
     static doublereal err;
-    extern logical lze_();
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
 
 
 
@@ -2422,21 +2265,7 @@ L130:
 
 } /* zchk3_ */
 
-/* Subroutine */ int zchk4_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, 
-	xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-doublecomplex *alf;
-integer *ninc, *inc, *nmax, *incmax;
-doublecomplex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt;
-doublereal *g;
-doublecomplex *z__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int zchk4_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* x, doublecomplex* xx, doublecomplex* xs, doublecomplex* y, doublecomplex* yy, doublecomplex* ys, doublecomplex* yt, doublereal* g, doublecomplex* z__, integer* iorder, ftnlen sname_len)
 {
     /* System generated locals */
     integer a_dim1, a_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7;
@@ -2450,21 +2279,21 @@ ftnlen sname_len;
     static integer i__, j, m, n;
     static doublecomplex alpha, w[1];
     static logical isame[13];
-    extern /* Subroutine */ int zmake_();
+    extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, integer*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen);
     static integer nargs;
     static logical reset;
     static integer incxs, incys;
-    extern /* Subroutine */ int zmvch_();
+    extern /* Subroutine */ int zmvch_(char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen);
     static integer ia, nc, nd, im, in, ms, ix, iy, ns, lx, ly;
-    extern /* Subroutine */ int czgerc_();
+    extern /* Subroutine */ void czgerc_(integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, integer*);
     static doublereal errmax;
-    extern /* Subroutine */ int czgeru_();
+    extern /* Subroutine */ void czgeru_(integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, integer*);
     static doublecomplex transl;
-    extern logical lzeres_();
+    extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
     static integer laa, lda;
     static doublecomplex als;
     static doublereal err;
-    extern logical lze_();
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
 
 
 
@@ -2793,21 +2622,7 @@ L150:
 
 } /* zchk4_ */
 
-/* Subroutine */ int zchk5_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, 
-	xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-doublecomplex *alf;
-integer *ninc, *inc, *nmax, *incmax;
-doublecomplex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt;
-doublereal *g;
-doublecomplex *z__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int zchk5_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* x, doublecomplex* xx, doublecomplex* xs, doublecomplex* y, doublecomplex* yy, doublecomplex* ys, doublecomplex* yt, doublereal* g, doublecomplex* z__, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -2827,13 +2642,14 @@ ftnlen sname_len;
     static integer i__, j, n;
     static doublecomplex alpha, w[1];
     static logical isame[13];
-    extern /* Subroutine */ int zmake_();
+    extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, integer*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen);
     static integer nargs;
-    extern /* Subroutine */ int czher_();
+    extern /* Subroutine */ void czher_(integer*, char*, integer*, doublereal*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen);
     static logical reset;
     static char cuplo[14];
     static integer incxs;
-    extern /* Subroutine */ int czhpr_(), zmvch_();
+    extern /* Subroutine */ void czhpr_(integer*, char*, integer*, doublereal*, doublecomplex*, integer*, doublecomplex*, ftnlen);
+    extern /* Subroutine */ int zmvch_(char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen);
     static logical upper;
     static char uplos[1];
     static integer ia, ja, ic, nc, jj, lj, in;
@@ -2841,10 +2657,10 @@ ftnlen sname_len;
     static integer ix, ns, lx;
     static doublereal ralpha, errmax;
     static doublecomplex transl;
-    extern logical lzeres_();
+    extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
     static integer laa, lda;
     static doublereal err;
-    extern logical lze_();
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
 
 /*  Tests ZHER and ZHPR. */
 
@@ -3167,21 +2983,7 @@ L130:
 
 } /* zchk5_ */
 
-/* Subroutine */ int zchk6_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, ninc, inc, nmax, incmax, a, aa, as, x, 
-	xx, xs, y, yy, ys, yt, g, z__, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-doublecomplex *alf;
-integer *ninc, *inc, *nmax, *incmax;
-doublecomplex *a, *aa, *as, *x, *xx, *xs, *y, *yy, *ys, *yt;
-doublereal *g;
-doublecomplex *z__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int zchk6_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* ninc, integer* inc, integer* nmax, integer* incmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* x, doublecomplex* xx, doublecomplex* xs, doublecomplex* y, doublecomplex* yy, doublecomplex* ys, doublecomplex* yt, doublereal* g, doublecomplex* z__, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -3201,25 +3003,26 @@ ftnlen sname_len;
     static integer i__, j, n;
     static doublecomplex alpha, w[2];
     static logical isame[13];
-    extern /* Subroutine */ int zmake_();
+    extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, integer*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen);
     static integer nargs;
     static logical reset;
     static char cuplo[14];
     static integer incxs, incys;
-    extern /* Subroutine */ int zmvch_();
+    extern /* Subroutine */ int zmvch_(char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen);
     static logical upper;
     static char uplos[1];
-    extern /* Subroutine */ int czher2_(), czhpr2_();
+    extern /* Subroutine */ void czher2_(integer*, char*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen);
+    extern /* Subroutine */ void czhpr2_(integer*, char*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, ftnlen);
     static integer ia, ja, ic, nc, jj, lj, in;
     static logical packed;
     static integer ix, iy, ns, lx, ly;
     static doublereal errmax;
     static doublecomplex transl;
-    extern logical lzeres_();
+    extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
     static integer laa, lda;
     static doublecomplex als;
     static doublereal err;
-    extern logical lze_();
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
 
 /*  Tests ZHER2 and ZHPR2. */
 
@@ -3604,24 +3407,7 @@ L170:
 
 } /* zchk6_ */
 
-/* Subroutine */ int zmvch_(trans, m, n, alpha, a, nmax, x, incx, beta, y, 
-	incy, yt, g, yy, eps, err, fatal, nout, mv, trans_len)
-char *trans;
-integer *m, *n;
-doublecomplex *alpha, *a;
-integer *nmax;
-doublecomplex *x;
-integer *incx;
-doublecomplex *beta, *y;
-integer *incy;
-doublecomplex *yt;
-doublereal *g;
-doublecomplex *yy;
-doublereal *eps, *err;
-logical *fatal;
-integer *nout;
-logical *mv;
-ftnlen trans_len;
+/* Subroutine */ int zmvch_(char* trans, integer* m, integer* n, doublecomplex* alpha, doublecomplex* a, integer* nmax, doublecomplex* x, integer* incx, doublecomplex* beta, doublecomplex* y, integer* incy, doublecomplex* yt, doublereal* g, doublecomplex* yy, doublereal* eps, doublereal* err, logical* fatal, integer* nout, logical* mv, ftnlen trans_len)
 {
 
     /* System generated locals */
@@ -3819,9 +3605,7 @@ L80:
 
 } /* zmvch_ */
 
-logical lze_(ri, rj, lr)
-doublecomplex *ri, *rj;
-integer *lr;
+logical lze_(doublecomplex* ri, doublecomplex* rj, integer* lr)
 {
     /* System generated locals */
     integer i__1, i__2, i__3;
@@ -3868,13 +3652,7 @@ L30:
 
 } /* lze_ */
 
-logical lzeres_(type__, uplo, m, n, aa, as, lda, type_len, uplo_len)
-char *type__, *uplo;
-integer *m, *n;
-doublecomplex *aa, *as;
-integer *lda;
-ftnlen type_len;
-ftnlen uplo_len;
+logical lzeres_(char* type__, char* uplo, integer* m, integer* n, doublecomplex* aa, doublecomplex* as, integer* lda, ftnlen type_len, ftnlen uplo_len)
 {
     /* System generated locals */
     integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2, i__3, i__4;
@@ -3967,9 +3745,7 @@ L80:
 
 } /* lzeres_ */
 
-/* Double Complex */ VOID zbeg_( ret_val, reset)
-doublecomplex * ret_val;
-logical *reset;
+/* Double Complex */ VOID zbeg_( doublecomplex* ret_val, logical* reset)
 {
     /* System generated locals */
     doublereal d__1, d__2;
@@ -4030,8 +3806,7 @@ L10:
 
 } /* zbeg_ */
 
-doublereal ddiff_(x, y)
-doublereal *x, *y;
+doublereal ddiff_(doublereal* x, doublereal* y)
 {
     /* System generated locals */
     doublereal ret_val;
@@ -4051,19 +3826,7 @@ doublereal *x, *y;
 
 } /* ddiff_ */
 
-/* Subroutine */ int zmake_(type__, uplo, diag, m, n, a, nmax, aa, lda, kl, 
-	ku, reset, transl, type_len, uplo_len, diag_len)
-char *type__, *uplo, *diag;
-integer *m, *n;
-doublecomplex *a;
-integer *nmax;
-doublecomplex *aa;
-integer *lda, *kl, *ku;
-logical *reset;
-doublecomplex *transl;
-ftnlen type_len;
-ftnlen uplo_len;
-ftnlen diag_len;
+/* Subroutine */ int zmake_(char* type__, char* uplo, char* diag, integer* m, integer* n, doublecomplex* a, integer* nmax, doublecomplex* aa, integer* lda, integer* kl, integer* ku, logical* reset, doublecomplex* transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len)
 {
     /* System generated locals */
     integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
@@ -4072,7 +3835,7 @@ ftnlen diag_len;
 
     /* Local variables */
     static integer ibeg, iend, ioff;
-    extern /* Double Complex */ VOID zbeg_();
+    extern /* Double Complex */ VOID zbeg_(doublecomplex*, logical*);
     static logical unit;
     static integer i__, j;
     static logical lower;
diff --git a/ctest/c_zblat3c.c b/ctest/c_zblat3c.c
index eca2c3ff6..6025c0052 100644
--- a/ctest/c_zblat3c.c
+++ b/ctest/c_zblat3c.c
@@ -22,14 +22,11 @@ typedef double doublereal;
 typedef struct { real r, i; } complex;
 typedef struct { doublereal r, i; } doublecomplex;
 #ifdef _MSC_VER
-static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
 static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
-static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
 static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
 #else
 static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
 static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
-static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
 static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
 #endif
 #define pCf(z) (*_pCf(z))
@@ -242,124 +239,7 @@ typedef struct Namelist Namelist;
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-#if 0
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-#endif
+
 
 /* Common Block Declarations */
 
@@ -388,7 +268,7 @@ static logical c_true = TRUE_;
 static integer c__0 = 0;
 static logical c_false = FALSE_;
 
-/* Main program  MAIN__() */ int main()
+/* Main program  MAIN__() */ int main(void)
 {
     /* Initialized data */
 
@@ -400,26 +280,29 @@ static logical c_false = FALSE_;
     doublereal d__1;
 
     /* Builtin functions */
-    integer s_rsle(), do_lio(), e_rsle(), f_open(), s_wsfe(), do_fio(), 
-	    e_wsfe(), s_wsle(), e_wsle(), s_rsfe(), e_rsfe();
+    integer s_rsle(void), do_lio(void), e_rsle(void), f_open(void), s_wsfe(void), do_fio(void), 
+	    e_wsfe(void), s_wsle(void), e_wsle(void), s_rsfe(void), e_rsfe(void);
 
     /* Local variables */
     static integer nalf, idim[9];
     static logical same;
     static integer nbet, ntra;
     static logical rewi;
-    extern /* Subroutine */ int zchk1_(), zchk2_(), zchk3_(), zchk4_(), 
-	    zchk5_();
+    extern /* Subroutine */ int zchk1_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ int zchk2_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ int zchk3_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, doublecomplex*, integer*, ftnlen);
+    extern /* Subroutine */ int zchk4_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ int zchk5_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, doublecomplex*, integer*, ftnlen);
     static doublecomplex c__[4225]	/* was [65][65] */;
     static doublereal g[65];
     static integer i__, j;
-    extern doublereal ddiff_();
+    extern doublereal ddiff_(doublereal*, doublereal*);
     static integer n;
     static logical fatal;
     static doublecomplex w[130];
     static logical trace;
     static integer nidim;
-    extern /* Subroutine */ int zmmch_();
+    extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
     static char snaps[32];
     static integer isnum;
     static logical ltest[9];
@@ -431,10 +314,10 @@ static logical c_false = FALSE_;
     static logical rorder;
     static integer layout;
     static logical ltestt, tsterr;
-    extern /* Subroutine */ int cz3chke_();
+    extern /* Subroutine */ int cz3chke_(char*, ftnlen);
     static doublecomplex alf[7], bet[7];
     static doublereal eps, err;
-    extern logical lze_();
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
     char tmpchar;
     
 /*  Test program for the COMPLEX*16          Level 3 Blas. */
@@ -924,22 +807,7 @@ L230:
 
 } /* MAIN__ */
 
-/* Subroutine */ int zchk1_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, 
-	c__, cc, cs, ct, g, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-doublecomplex *alf;
-integer *nbet;
-doublecomplex *bet;
-integer *nmax;
-doublecomplex *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct;
-doublereal *g;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int zchk1_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -956,21 +824,21 @@ ftnlen sname_len;
     static integer i__, k, m, n;
     static doublecomplex alpha;
     static logical isame[13], trana, tranb;
-    extern /* Subroutine */ int zmake_();
+    extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen);
     static integer nargs;
-    extern /* Subroutine */ int zmmch_();
+    extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
     static logical reset;
     static integer ia, ib;
-    extern /* Subroutine */ int zprcn1_();
+    extern /* Subroutine */ int zprcn1_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, integer*, doublecomplex*, integer*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen);
     static integer ma, mb, na, nb, nc, ik, im, in, ks, ms, ns;
-    extern /* Subroutine */ int czgemm_();
+    extern /* Subroutine */ void czgemm_(integer*, char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
     static char tranas[1], tranbs[1], transa[1], transb[1];
     static doublereal errmax;
-    extern logical lzeres_();
+    extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
     static integer ica, icb, laa, lbb, lda, lcc, ldb, ldc;
     static doublecomplex als, bls;
     static doublereal err;
-    extern logical lze_();
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
 
 /*  Tests ZGEMM. */
 
@@ -1313,20 +1181,7 @@ L130:
 } /* zchk1_ */
 
 
-/* Subroutine */ int zprcn1_(nout, nc, sname, iorder, transa, transb, m, n, k,
-	 alpha, lda, ldb, beta, ldc, sname_len, transa_len, transb_len)
-integer *nout, *nc;
-char *sname;
-integer *iorder;
-char *transa, *transb;
-integer *m, *n, *k;
-doublecomplex *alpha;
-integer *lda, *ldb;
-doublecomplex *beta;
-integer *ldc;
-ftnlen sname_len;
-ftnlen transa_len;
-ftnlen transb_len;
+/* Subroutine */ int zprcn1_(integer* nout, integer* nc, char* sname, integer* iorder, char* transa, char* transb, integer* m, integer* n, integer* k, doublecomplex* alpha, integer* lda, integer* ldb, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen transa_len, ftnlen transb_len)
 {
     /* Local variables */
     static char crc[14], cta[14], ctb[14];
@@ -1357,22 +1212,7 @@ return 0;
 } /* zprcn1_ */
 
 
-/* Subroutine */ int zchk2_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, 
-	c__, cc, cs, ct, g, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-doublecomplex *alf;
-integer *nbet;
-doublecomplex *bet;
-integer *nmax;
-doublecomplex *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct;
-doublereal *g;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int zchk2_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -1394,23 +1234,23 @@ ftnlen sname_len;
     static doublecomplex alpha;
     static logical isame[13];
     static char sides[1];
-    extern /* Subroutine */ int zmake_();
+    extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen);
     static integer nargs;
-    extern /* Subroutine */ int zmmch_();
+    extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
     static logical reset;
     static char uplos[1];
     static integer ia, ib;
-    extern /* Subroutine */ int zprcn2_();
+    extern /* Subroutine */ int zprcn2_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen);
     static integer na, nc, im, in, ms, ns;
-    extern /* Subroutine */ int czhemm_();
+    extern /* Subroutine */ void czhemm_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
     static doublereal errmax;
-    extern logical lzeres_();
-    extern /* Subroutine */ int czsymm_();
+    extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
+    extern /* Subroutine */ void czsymm_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
     static integer laa, lbb, lda, lcc, ldb, ldc, ics;
     static doublecomplex als, bls;
     static integer icu;
     static doublereal err;
-    extern logical lze_();
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
 
 /*  Tests ZHEMM and ZSYMM. */
 
@@ -1737,20 +1577,7 @@ L120:
 } /* zchk2_ */
 
 
-/* Subroutine */ int zprcn2_(nout, nc, sname, iorder, side, uplo, m, n, alpha,
-	 lda, ldb, beta, ldc, sname_len, side_len, uplo_len)
-integer *nout, *nc;
-char *sname;
-integer *iorder;
-char *side, *uplo;
-integer *m, *n;
-doublecomplex *alpha;
-integer *lda, *ldb;
-doublecomplex *beta;
-integer *ldc;
-ftnlen sname_len;
-ftnlen side_len;
-ftnlen uplo_len;
+/* Subroutine */ int zprcn2_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, integer* m, integer* n, doublecomplex* alpha, integer* lda, integer* ldb, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len)
 {
     /* Local variables */
     static char cs[14], cu[14], crc[14];
@@ -1777,21 +1604,7 @@ return 0;
 } /* zprcn2_ */
 
 
-/* Subroutine */ int zchk3_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, nmax, a, aa, as, b, bb, bs, ct, g, c__,
-	 iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-doublecomplex *alf;
-integer *nmax;
-doublecomplex *a, *aa, *as, *b, *bb, *bs, *ct;
-doublereal *g;
-doublecomplex *c__;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int zchk3_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* ct, doublereal* g, doublecomplex* c__, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -1817,23 +1630,24 @@ ftnlen sname_len;
     static char diags[1];
     static logical isame[13];
     static char sides[1];
-    extern /* Subroutine */ int zmake_();
+    extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen);
     static integer nargs;
-    extern /* Subroutine */ int zmmch_();
+    extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
     static logical reset;
     static char uplos[1];
     static integer ia, na;
-    extern /* Subroutine */ int zprcn3_();
+    extern /* Subroutine */ int zprcn3_(integer*, integer*, char*, integer*, char*, char*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, ftnlen, ftnlen, ftnlen, ftnlen, ftnlen);
     static integer nc, im, in, ms, ns;
     static char tranas[1], transa[1];
     static doublereal errmax;
-    extern logical lzeres_();
-    extern /* Subroutine */ int cztrmm_(), cztrsm_();
+    extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
+    extern /* Subroutine */ void cztrmm_(integer*, char*, char*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ void cztrsm_(integer*, char*, char*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen, ftnlen);
     static integer laa, icd, lbb, lda, ldb, ics;
     static doublecomplex als;
     static integer ict, icu;
     static doublereal err;
-    extern logical lze_();
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
 
 /*  Tests ZTRMM and ZTRSM. */
 
@@ -2227,21 +2041,7 @@ L160:
 } /* zchk3_ */
 
 
-/* Subroutine */ int zprcn3_(nout, nc, sname, iorder, side, uplo, transa, 
-	diag, m, n, alpha, lda, ldb, sname_len, side_len, uplo_len, 
-	transa_len, diag_len)
-integer *nout, *nc;
-char *sname;
-integer *iorder;
-char *side, *uplo, *transa, *diag;
-integer *m, *n;
-doublecomplex *alpha;
-integer *lda, *ldb;
-ftnlen sname_len;
-ftnlen side_len;
-ftnlen uplo_len;
-ftnlen transa_len;
-ftnlen diag_len;
+/* Subroutine */ int zprcn3_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, char* transa, char* diag, integer* m, integer* n, doublecomplex* alpha, integer* lda, integer* ldb, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len, ftnlen transa_len, ftnlen diag_len)
 {
 
     /* Local variables */
@@ -2281,22 +2081,7 @@ return 0;
 } /* zprcn3_ */
 
 
-/* Subroutine */ int zchk4_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, nbet, bet, nmax, a, aa, as, b, bb, bs, 
-	c__, cc, cs, ct, g, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-doublecomplex *alf;
-integer *nbet;
-doublecomplex *bet;
-integer *nmax;
-doublecomplex *a, *aa, *as, *b, *bb, *bs, *c__, *cc, *cs, *ct;
-doublereal *g;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int zchk4_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -2320,30 +2105,30 @@ ftnlen sname_len;
     static doublecomplex alpha;
     static doublereal rbeta;
     static logical isame[13];
-    extern /* Subroutine */ int zmake_();
+    extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen);
     static integer nargs;
-    extern /* Subroutine */ int zmmch_();
+    extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
     static doublereal rbets;
     static logical reset;
     static char trans[1];
     static logical upper;
     static char uplos[1];
     static integer ia, ib, jc, ma, na;
-    extern /* Subroutine */ int zprcn4_();
+    extern /* Subroutine */ int zprcn4_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen);
     static integer nc;
-    extern /* Subroutine */ int zprcn6_();
+    extern /* Subroutine */ int zprcn6_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen);
     static integer ik, in, jj, lj, ks, ns;
     static doublereal ralpha;
-    extern /* Subroutine */ int czherk_();
+    extern /* Subroutine */ int czherk_(integer*, char*, char*, integer*, integer*, doublereal*, doublecomplex*, integer*, doublereal*, doublecomplex*, integer*, ftnlen, ftnlen);
     static doublereal errmax;
-    extern logical lzeres_();
+    extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
     static char transs[1], transt[1];
-    extern /* Subroutine */ int czsyrk_();
+    extern /* Subroutine */ int czsyrk_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
     static integer laa, lda, lcc, ldc;
     static doublecomplex als;
     static integer ict, icu;
     static doublereal err;
-    extern logical lze_();
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
 
 /*  Tests ZHERK and ZSYRK. */
 
@@ -2732,20 +2517,7 @@ L130:
 } /* zchk4_ */
 
 
-/* Subroutine */ int zprcn4_(nout, nc, sname, iorder, uplo, transa, n, k, 
-	alpha, lda, beta, ldc, sname_len, uplo_len, transa_len)
-integer *nout, *nc;
-char *sname;
-integer *iorder;
-char *uplo, *transa;
-integer *n, *k;
-doublecomplex *alpha;
-integer *lda;
-doublecomplex *beta;
-integer *ldc;
-ftnlen sname_len;
-ftnlen uplo_len;
-ftnlen transa_len;
+/* Subroutine */ int zprcn4_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublecomplex* alpha, integer* lda, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len)
 {
     /* Local variables */
     static char ca[14], cu[14], crc[14];
@@ -2775,20 +2547,7 @@ return 0;
 
 
 
-/* Subroutine */ int zprcn6_(nout, nc, sname, iorder, uplo, transa, n, k, 
-	alpha, lda, beta, ldc, sname_len, uplo_len, transa_len)
-integer *nout, *nc;
-char *sname;
-integer *iorder;
-char *uplo, *transa;
-integer *n, *k;
-doublereal *alpha;
-integer *lda;
-doublereal *beta;
-integer *ldc;
-ftnlen sname_len;
-ftnlen uplo_len;
-ftnlen transa_len;
+/* Subroutine */ int zprcn6_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublereal* alpha, integer* lda, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len)
 {
 
     /* Local variables */
@@ -2818,23 +2577,7 @@ return 0;
 } /* zprcn6_ */
 
 
-/* Subroutine */ int zchk5_(sname, eps, thresh, nout, ntra, trace, rewi, 
-	fatal, nidim, idim, nalf, alf, nbet, bet, nmax, ab, aa, as, bb, bs, 
-	c__, cc, cs, ct, g, w, iorder, sname_len)
-char *sname;
-doublereal *eps, *thresh;
-integer *nout, *ntra;
-logical *trace, *rewi, *fatal;
-integer *nidim, *idim, *nalf;
-doublecomplex *alf;
-integer *nbet;
-doublecomplex *bet;
-integer *nmax;
-doublecomplex *ab, *aa, *as, *bb, *bs, *c__, *cc, *cs, *ct;
-doublereal *g;
-doublecomplex *w;
-integer *iorder;
-ftnlen sname_len;
+/* Subroutine */ int zchk5_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* ab, doublecomplex* aa, doublecomplex* as, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, doublecomplex* w, integer* iorder, ftnlen sname_len)
 {
     /* Initialized data */
 
@@ -2857,27 +2600,28 @@ ftnlen sname_len;
     static doublecomplex alpha;
     static doublereal rbeta;
     static logical isame[13];
-    extern /* Subroutine */ int zmake_();
+    extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen);
     static integer nargs;
-    extern /* Subroutine */ int zmmch_();
+    extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
     static doublereal rbets;
     static logical reset;
     static char trans[1];
     static logical upper;
     static char uplos[1];
     static integer ia, ib, jc, ma, na, nc;
-    extern /* Subroutine */ int zprcn5_(), zprcn7_();
+    extern /* Subroutine */ int zprcn5_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ int zprcn7_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen);
     static integer ik, in, jj, lj, ks, ns;
     static doublereal errmax;
-    extern logical lzeres_();
+    extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
     static char transs[1], transt[1];
-    extern /* Subroutine */ int czher2k_();
+    extern /* Subroutine */ int czher2k_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublereal*, doublecomplex*, integer*, ftnlen, ftnlen);
     static integer laa, lbb, lda, lcc, ldb, ldc;
     static doublecomplex als;
     static integer ict, icu;
-    extern /* Subroutine */ int czsyr2k_();
+    extern /* Subroutine */ int czsyr2k_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
     static doublereal err;
-    extern logical lze_();
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
 
 /*  Tests ZHER2K and ZSYR2K. */
 
@@ -3349,20 +3093,7 @@ L160:
 } /* zchk5_ */
 
 
-/* Subroutine */ int zprcn5_(nout, nc, sname, iorder, uplo, transa, n, k, 
-	alpha, lda, ldb, beta, ldc, sname_len, uplo_len, transa_len)
-integer *nout, *nc;
-char *sname;
-integer *iorder;
-char *uplo, *transa;
-integer *n, *k;
-doublecomplex *alpha;
-integer *lda, *ldb;
-doublecomplex *beta;
-integer *ldc;
-ftnlen sname_len;
-ftnlen uplo_len;
-ftnlen transa_len;
+/* Subroutine */ int zprcn5_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublecomplex* alpha, integer* lda, integer* ldb, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len)
 {
     /* Local variables */
     static char ca[14], cu[14], crc[14];
@@ -3392,20 +3123,7 @@ return 0;
 
 
 
-/* Subroutine */ int zprcn7_(nout, nc, sname, iorder, uplo, transa, n, k, 
-	alpha, lda, ldb, beta, ldc, sname_len, uplo_len, transa_len)
-integer *nout, *nc;
-char *sname;
-integer *iorder;
-char *uplo, *transa;
-integer *n, *k;
-doublecomplex *alpha;
-integer *lda, *ldb;
-doublereal *beta;
-integer *ldc;
-ftnlen sname_len;
-ftnlen uplo_len;
-ftnlen transa_len;
+/* Subroutine */ int zprcn7_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublecomplex* alpha, integer* lda, integer* ldb, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len)
 {
 
     /* Local variables */
@@ -3435,19 +3153,7 @@ return 0;
 } /* zprcn7_ */
 
 
-/* Subroutine */ int zmake_(type__, uplo, diag, m, n, a, nmax, aa, lda, reset,
-	 transl, type_len, uplo_len, diag_len)
-char *type__, *uplo, *diag;
-integer *m, *n;
-doublecomplex *a;
-integer *nmax;
-doublecomplex *aa;
-integer *lda;
-logical *reset;
-doublecomplex *transl;
-ftnlen type_len;
-ftnlen uplo_len;
-ftnlen diag_len;
+/* Subroutine */ int zmake_(char* type__, char* uplo, char* diag, integer* m, integer* n, doublecomplex* a, integer* nmax, doublecomplex* aa, integer* lda, logical* reset, doublecomplex* transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len)
 {
     /* System generated locals */
     integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
@@ -3456,7 +3162,7 @@ ftnlen diag_len;
 
     /* Local variables */
     static integer ibeg, iend;
-    extern /* Double Complex */ VOID zbeg_();
+    extern /* Double Complex */ VOID zbeg_(doublecomplex*, logical*);
     static logical unit;
     static integer i__, j;
     static logical lower, upper;
@@ -3629,27 +3335,7 @@ ftnlen diag_len;
 
 } /* zmake_ */
 
-/* Subroutine */ int zmmch_(transa, transb, m, n, kk, alpha, a, lda, b, ldb, 
-	beta, c__, ldc, ct, g, cc, ldcc, eps, err, fatal, nout, mv, 
-	transa_len, transb_len)
-char *transa, *transb;
-integer *m, *n, *kk;
-doublecomplex *alpha, *a;
-integer *lda;
-doublecomplex *b;
-integer *ldb;
-doublecomplex *beta, *c__;
-integer *ldc;
-doublecomplex *ct;
-doublereal *g;
-doublecomplex *cc;
-integer *ldcc;
-doublereal *eps, *err;
-logical *fatal;
-integer *nout;
-logical *mv;
-ftnlen transa_len;
-ftnlen transb_len;
+/* Subroutine */ int zmmch_(char* transa, char* transb, integer* m, integer* n, integer* kk, doublecomplex* alpha, doublecomplex* a, integer* lda, doublecomplex* b, integer* ldb, doublecomplex* beta, doublecomplex* c__, integer* ldc, doublecomplex* ct, doublereal* g, doublecomplex* cc, integer* ldcc, doublereal* eps, doublereal* err, logical* fatal, integer* nout, logical* mv, ftnlen transa_len, ftnlen transb_len)
 {
 
     /* System generated locals */
@@ -3658,7 +3344,7 @@ ftnlen transb_len;
     doublereal d__1, d__2, d__3, d__4, d__5, d__6;
     doublecomplex z__1, z__2, z__3, z__4;
 
-    double sqrt();
+    double sqrt(double);
     /* Local variables */
     static doublereal erri;
     static integer i__, j, k;
@@ -4031,9 +3717,7 @@ L250:
 
 } /* zmmch_ */
 
-logical lze_(ri, rj, lr)
-doublecomplex *ri, *rj;
-integer *lr;
+logical lze_(doublecomplex* ri, doublecomplex* rj, integer* lr)
 {
     /* System generated locals */
     integer i__1, i__2, i__3;
@@ -4082,13 +3766,7 @@ L30:
 
 } /* lze_ */
 
-logical lzeres_(type__, uplo, m, n, aa, as, lda, type_len, uplo_len)
-char *type__, *uplo;
-integer *m, *n;
-doublecomplex *aa, *as;
-integer *lda;
-ftnlen type_len;
-ftnlen uplo_len;
+logical lzeres_(char* type__, char* uplo, integer* m, integer* n, doublecomplex *aa, doublecomplex* as, integer* lda, ftnlen type_len, ftnlen uplo_len)
 {
     /* System generated locals */
     integer aa_dim1, aa_offset, as_dim1, as_offset, i__1, i__2, i__3, i__4;
@@ -4184,9 +3862,7 @@ L80:
 
 } /* lzeres_ */
 
-/* Double Complex */ VOID zbeg_( ret_val, reset)
-doublecomplex * ret_val;
-logical *reset;
+/* Double Complex */ VOID zbeg_(doublecomplex* ret_val, logical* reset)
 {
     /* System generated locals */
     doublereal d__1, d__2;
@@ -4249,8 +3925,7 @@ L10:
 
 } /* zbeg_ */
 
-doublereal ddiff_(x, y)
-doublereal *x, *y;
+doublereal ddiff_(doublereal* x, doublereal* y)
 {
     /* System generated locals */
     doublereal ret_val;
diff --git a/ctest/c_zblat3c_3m.c b/ctest/c_zblat3c_3m.c
new file mode 100644
index 000000000..0c76f11e7
--- /dev/null
+++ b/ctest/c_zblat3c_3m.c
@@ -0,0 +1,3958 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#include "common.h"
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+/* Bridges between the f2c {r,i} structs and the compiler's native complex types. */
+/* Cf/Cd build a native complex value from a struct; _pCf/_pCd reinterpret a  */
+/* struct pointer as a pointer to the native type so pCf/pCd can be assigned. */
+/* The _MSC_VER branch uses MSVC's _Fcomplex/_Dcomplex instead of _Complex.   */
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i};return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccosf(Cf(Z)); }	/* single-precision cosine, like the other c_* macros (cexpf, csinf, ...) */
+#ifdef _MSC_VER /* MSVC complex types are structs with no '/' operator: form the quotient from the {r,i} fields; operands are copied first so c may alias a or b */
+#define c_div(c, a, b) { real dv_ar=(a)->r, dv_ai=(a)->i, dv_br=(b)->r, dv_bi=(b)->i, dv_dn=dv_br*dv_br+dv_bi*dv_bi; (c)->r=(dv_ar*dv_br+dv_ai*dv_bi)/dv_dn; (c)->i=(dv_ai*dv_br-dv_ar*dv_bi)/dv_dn; }
+#define z_div(c, a, b) { doublereal dv_ar=(a)->r, dv_ai=(a)->i, dv_br=(b)->r, dv_bi=(b)->i, dv_dn=dv_br*dv_br+dv_bi*dv_bi; (c)->r=(dv_ar*dv_br+dv_ai*dv_bi)/dv_dn; (c)->i=(dv_ai*dv_br-dv_ar*dv_bi)/dv_dn; }
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),Cd(B));}	/* R = A**B; the exponent is converted with Cd like the base (assumes B is a doublecomplex*, as in libf2c's pow_zz) */
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) (ceil(w))	/* parenthesized (not braced) so the result can be used inside an expression */
+#define myhuge_(w) (HUGE_VAL)	/* the argument is ignored; always yields HUGE_VAL from <math.h> */
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+
+
+/* Common Block Declarations */
+
+struct {
+    integer infot, noutc;
+    logical ok, lerr;
+} infoc_;
+
+#define infoc_1 infoc_
+
+struct {
+    char srnamt[12];
+} srnamc_;
+
+#define srnamc_1 srnamc_
+
+/* Table of constant values */
+
+static doublecomplex c_b1 = {0.,0.};  /* complex zero; main passes it as beta to zmmch_ */
+static doublecomplex c_b2 = {1.,0.};  /* complex one; main passes it as alpha to zmmch_ */
+static integer c__1 = 1;  /* main passes it as the n argument of zmmch_ */
+static integer c__65 = 65;  /* leading dimension main hands to zmmch_ for ab, c__ and cc */
+static doublereal c_b92 = 1.;  /* reference value in main's eps loop: ddiff_(eps + 1, c_b92) */
+static integer c__6 = 6;  /* passed as nout to zmmch_; same value main stores in infoc_.noutc */
+static logical c_true = TRUE_;  /* passed as the mv flag to zmmch_ in main */
+static integer c__0 = 0;  /* NOTE(review): no use visible in this chunk - confirm where it is referenced */
+static logical c_false = FALSE_;  /* NOTE(review): no use visible in this chunk - confirm where it is referenced */
+
+/* Main program  MAIN__() */ int main(void)
+{
+    /* Initialized data */
+
+    static char snames[9][13] = { "cblas_zgemm3m ", "cblas_zhemm ", "cblas_zsymm ", "cblas_ztrmm ",
+     "cblas_ztrsm ", "cblas_zherk ", "cblas_zsyrk ", "cblas_zher2k", "cblas_zsyr2k"};
+
+    /* System generated locals */
+    integer i__1, i__2, i__3, i__4, i__5;
+    doublereal d__1;
+
+    /* Builtin functions */
+    integer s_rsle(void), do_lio(void), e_rsle(void), f_open(void), s_wsfe(void), do_fio(void), 
+	    e_wsfe(void), s_wsle(void), e_wsle(void), s_rsfe(void), e_rsfe(void);
+
+    /* Local variables */
+    static integer nalf, idim[9];
+    static logical same;
+    static integer nbet, ntra;
+    static logical rewi;
+    extern /* Subroutine */ int zchk1_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ int zchk2_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ int zchk3_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, doublecomplex*, integer*, ftnlen);
+    extern /* Subroutine */ int zchk4_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, integer*, ftnlen);
+    extern /* Subroutine */ int zchk5_(char*, doublereal*, doublereal*, integer*, integer*, logical*, logical*, logical*, integer*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublecomplex*, doublereal*, doublecomplex*, integer*, ftnlen);
+    static doublecomplex c__[4225]	/* was [65][65] */;
+    static doublereal g[65];
+    static integer i__, j;
+    extern doublereal ddiff_(doublereal*, doublereal*);
+    static integer n;
+    static logical fatal;
+    static doublecomplex w[130];
+    static logical trace;
+    static integer nidim;
+    extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
+    static char snaps[32];
+    static integer isnum;
+    static logical ltest[9];
+    static doublecomplex aa[4225], ab[8450]	/* was [65][130] */, bb[4225],
+	     cc[4225], as[4225], bs[4225], cs[4225], ct[65];
+    static logical sfatal, corder;
+    static char snamet[12], transa[1], transb[1];
+    static doublereal thresh;
+    static logical rorder;
+    static integer layout;
+    static logical ltestt, tsterr;
+    extern /* Subroutine */ int cz3chke_(char*, ftnlen);
+    static doublecomplex alf[7], bet[7];
+    static doublereal eps, err;
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
+    char tmpchar;
+    
+/*  Test program for the COMPLEX*16          Level 3 Blas. */
+
+/*  The program must be driven by a short data file. The first 13 records */
+/*  of the file are read using list-directed input, the last 9 records */
+/*  are read using the format ( A12,L2 ). An annotated example of a data */
+/*  file can be obtained by deleting the first 3 characters from the */
+/*  following 22 lines: */
+/*  'CBLAT3.SNAP'     NAME OF SNAPSHOT OUTPUT FILE */
+/*  -1                UNIT NUMBER OF SNAPSHOT FILE (NOT USED IF .LT. 0) */
+/*  F        LOGICAL FLAG, T TO REWIND SNAPSHOT FILE AFTER EACH RECORD. */
+/*  F        LOGICAL FLAG, T TO STOP ON FAILURES. */
+/*  T        LOGICAL FLAG, T TO TEST ERROR EXITS. */
+/*  2        0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH */
+/*  16.0     THRESHOLD VALUE OF TEST RATIO */
+/*  6                 NUMBER OF VALUES OF N */
+/*  0 1 2 3 5 9       VALUES OF N */
+/*  3                 NUMBER OF VALUES OF ALPHA */
+/*  (0.0,0.0) (1.0,0.0) (0.7,-0.9)       VALUES OF ALPHA */
+/*  3                 NUMBER OF VALUES OF BETA */
+/*  (0.0,0.0) (1.0,0.0) (1.3,-1.1)       VALUES OF BETA */
+/*  ZGEMM  T PUT F FOR NO TEST. SAME COLUMNS. */
+/*  ZHEMM  T PUT F FOR NO TEST. SAME COLUMNS. */
+/*  ZSYMM  T PUT F FOR NO TEST. SAME COLUMNS. */
+/*  ZTRMM  T PUT F FOR NO TEST. SAME COLUMNS. */
+/*  ZTRSM  T PUT F FOR NO TEST. SAME COLUMNS. */
+/*  ZHERK  T PUT F FOR NO TEST. SAME COLUMNS. */
+/*  ZSYRK  T PUT F FOR NO TEST. SAME COLUMNS. */
+/*  ZHER2K T PUT F FOR NO TEST. SAME COLUMNS. */
+/*  ZSYR2K T PUT F FOR NO TEST. SAME COLUMNS. */
+
+/*  See: */
+
+/*     Dongarra J. J., Du Croz J. J., Duff I. S. and Hammarling S. */
+/*     A Set of Level 3 Basic Linear Algebra Subprograms. */
+
+/*     Technical Memorandum No.88 (Revision 1), Mathematics and */
+/*     Computer Science Division, Argonne National Laboratory, 9700 */
+/*     South Cass Avenue, Argonne, Illinois 60439, US. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+/*     .. Parameters .. */
+/*     .. Local Scalars .. */
+/*     .. Local Arrays .. */
+/*     .. External Functions .. */
+/*     .. External Subroutines .. */
+/*     .. Intrinsic Functions .. */
+/*     .. Scalars in Common .. */
+/*     .. Common blocks .. */
+/*     .. Data statements .. */
+/*     .. Executable Statements .. */
+
+    infoc_1.noutc = 6;
+
+/*     Read name and unit number for snapshot output file and open file. */
+
+    char line[80];
+    
+    fgets(line,80,stdin);
+    sscanf(line,"'%s'",snaps);
+    fgets(line,80,stdin);
+#ifdef USE64BITINT
+    sscanf(line,"%ld",&ntra);
+#else
+    sscanf(line,"%d",&ntra);
+#endif
+    trace = ntra >= 0;
+    if (trace) {
+/*	o__1.oerr = 0;
+	o__1.ounit = ntra;
+	o__1.ofnmlen = 32;
+	o__1.ofnm = snaps;
+	o__1.orl = 0;
+	o__1.osta = "NEW";
+	o__1.oacc = 0;
+	o__1.ofm = 0;
+	o__1.oblnk = 0;
+	f_open(&o__1);*/
+    }
+/*     Read the flag that directs rewinding of the snapshot file. */
+   fgets(line,80,stdin);
+   sscanf(line,"%d",&rewi);
+   rewi = rewi && trace;
+/*     Read the flag that directs stopping on any failure. */
+   fgets(line,80,stdin);
+   sscanf(line,"%c",&tmpchar);
+   sfatal=FALSE_;
+   if (tmpchar=='T')sfatal=TRUE_;
+/*     Read the flag that indicates whether error exits are to be tested. */
+   fgets(line,80,stdin);
+   sscanf(line,"%c",&tmpchar);
+   tsterr=FALSE_;
+   if (tmpchar=='T')tsterr=TRUE_;
+/*     Read the flag that indicates whether row-major data layout to be tested. */
+   fgets(line,80,stdin);
+   sscanf(line,"%d",&layout);
+/*     Read the threshold value of the test ratio */
+   fgets(line,80,stdin);
+   sscanf(line,"%lf",&thresh);
+
+/*     Read and check the parameter values for the tests. */
+
+/*     Values of N */
+   fgets(line,80,stdin);
+#ifdef USE64BITINT
+   sscanf(line,"%d",&nidim);
+#else
+   sscanf(line,"%d",&nidim);
+#endif
+    if (nidim < 1 || nidim > 9) {
+        fprintf(stderr,"NUMBER OF VALUES OF N IS LESS THAN 1 OR GREATER THAN 9");
+        goto L220;
+    }
+   fgets(line,80,stdin);
+#ifdef USE64BITINT
+   sscanf(line,"%ld %ld %ld %ld %ld %ld %ld %ld %ld",&idim[0],&idim[1],&idim[2],
+    &idim[3],&idim[4],&idim[5],&idim[6],&idim[7],&idim[8]);
+#else
+   sscanf(line,"%d %d %d %d %d %d %d %d %d",&idim[0],&idim[1],&idim[2],
+    &idim[3],&idim[4],&idim[5],&idim[6],&idim[7],&idim[8]);
+#endif
+    i__1 = nidim;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+        if (idim[i__ - 1] < 0 || idim[i__ - 1] > 65) {
+        fprintf(stderr,"VALUE OF N IS LESS THAN 0 OR GREATER THAN 65\n");
+            goto L220;
+        }
+/* L10: */
+    }
+/*     Values of ALPHA */
+   fgets(line,80,stdin);
+#ifdef USE64BITINT
+   sscanf(line,"%ld",&nalf);
+#else
+   sscanf(line,"%d",&nalf);
+#endif
+    if (nalf < 1 || nalf > 7) {
+        fprintf(stderr,"VALUE OF ALPHA IS LESS THAN 0 OR GREATER THAN 7\n");
+        goto L220;
+    }
+   fgets(line,80,stdin);
+   sscanf(line,"(%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf)",&alf[0].r,&alf[0].i,&alf[1].r,&alf[1].i,&alf[2].r,&alf[2].i,&alf[3].r,&alf[3].i,
+   &alf[4].r,&alf[4].i,&alf[5].r,&alf[5].i,&alf[6].r,&alf[6].i);
+
+/*     Values of BETA */
+   fgets(line,80,stdin);
+#ifdef USE64BITINT
+   sscanf(line,"%ld",&nbet);
+#else
+   sscanf(line,"%d",&nbet);
+#endif
+    if (nalf < 1 || nbet > 7) {
+        fprintf(stderr,"VALUE OF BETA IS LESS THAN 0 OR GREATER THAN 7\n");
+        goto L220;
+    }
+   fgets(line,80,stdin);
+   sscanf(line,"(%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf) (%lf,%lf)",&bet[0].r,&bet[0].i,&bet[1].r,&bet[1].i,&bet[2].r,&bet[2].i,&bet[3].r,&bet[3].i,
+   &bet[4].r,&bet[4].i,&bet[5].r,&bet[5].i,&bet[6].r,&bet[6].i);
+
+/*     Report values of parameters. */
+
+    printf("TESTS OF THE DOUBLE PRECISION COMPLEX LEVEL 3 BLAS\nTHE FOLLOWING PARAMETER VALUES WILL BE USED:\n");
+    printf(" FOR N");
+    for (i__ =1; i__ <=nidim;++i__) printf(" %d",idim[i__-1]);
+    printf("\n");    
+    printf(" FOR ALPHA");
+    for (i__ =1; i__ <=nalf;++i__) printf(" (%lf,%lf)",alf[i__-1].r,alf[i__-1].i);
+    printf("\n");    
+    printf(" FOR BETA");
+    for (i__ =1; i__ <=nbet;++i__) printf(" (%lf,%lf)",bet[i__-1].r,bet[i__-1].i);
+    printf("\n");    
+
+    if (! tsterr) {
+      printf(" ERROR-EXITS WILL NOT BE TESTED\n"); 
+    }
+
+    printf("ROUTINES PASS COMPUTATIONAL TESTS IF TEST RATIO IS LESS THAN %lf\n",thresh);
+    rorder = FALSE_;
+    corder = FALSE_;
+    if (layout == 2) {
+	rorder = TRUE_;
+	corder = TRUE_;
+        printf("COLUMN-MAJOR AND ROW-MAJOR DATA LAYOUTS ARE TESTED\n");
+    } else if (layout == 1) {
+	rorder = TRUE_;
+        printf("ROW-MAJOR DATA LAYOUT IS TESTED\n");
+    } else if (layout == 0) {
+	corder = TRUE_;
+        printf("COLUMN-MAJOR DATA LAYOUT IS TESTED\n");
+    }
+
+/*     Read names of subroutines and flags which indicate */
+/*     whether they are to be tested. */
+
+    for (i__ = 1; i__ <= 9; ++i__) {
+	ltest[i__ - 1] = FALSE_;
+/* L20: */
+    }
+L30:
+   if (! fgets(line,80,stdin)) {
+        goto L60;
+    }
+   i__1 = sscanf(line,"%12c %c",snamet,&tmpchar);
+   ltestt=FALSE_;
+   if (tmpchar=='T')ltestt=TRUE_;
+    if (i__1 < 2) {
+        goto L60;
+    }
+    for (i__ = 1; i__ <= 9; ++i__) {
+        if (s_cmp(snamet, snames[i__ - 1] , (ftnlen)12, (ftnlen)12) == 
+                0) {
+            goto L50;
+        }
+/* L40: */
+    }
+    printf("SUBPROGRAM NAME %s NOT RECOGNIZED\n****** TESTS ABANDONED ******\n",snamet);
+    exit(1);
+L50:
+    ltest[i__ - 1] = ltestt;
+    goto L30;
+
+L60:
+/*    cl__1.cerr = 0;
+    cl__1.cunit = 5;
+    cl__1.csta = 0;
+    f_clos(&cl__1);*/
+
+/*     Compute EPS (the machine precision). */
+
+    eps = 1.;
+L70:
+    d__1 = eps + 1.;
+    if (ddiff_(&d__1, &c_b92) == 0.) {
+	goto L80;
+    }
+    eps *= .5;
+    goto L70;
+L80:
+    eps += eps;
+    printf("RELATIVE MACHINE PRECISION IS TAKEN TO BE %9.1g\n",eps);
+
+/*     Check the reliability of ZMMCH using exact data. */
+
+    n = 32;
+    i__1 = n;
+    for (j = 1; j <= i__1; ++j) {
+	i__2 = n;
+	for (i__ = 1; i__ <= i__2; ++i__) {
+	    i__3 = i__ + j * 65 - 66;
+/* Computing MAX */
+	    i__5 = i__ - j + 1;
+	    i__4 = f2cmax(i__5,0);
+	    ab[i__3].r = (doublereal) i__4, ab[i__3].i = 0.;
+/* L90: */
+	}
+	i__2 = j + 4224;
+	ab[i__2].r = (doublereal) j, ab[i__2].i = 0.;
+	i__2 = (j + 65) * 65 - 65;
+	ab[i__2].r = (doublereal) j, ab[i__2].i = 0.;
+	i__2 = j - 1;
+	c__[i__2].r = 0., c__[i__2].i = 0.;
+/* L100: */
+    }
+    i__1 = n;
+    for (j = 1; j <= i__1; ++j) {
+	i__2 = j - 1;
+	i__3 = j * ((j + 1) * j) / 2 - (j + 1) * j * (j - 1) / 3;
+	cc[i__2].r = (doublereal) i__3, cc[i__2].i = 0.;
+/* L110: */
+    }
+/*     CC holds the exact result. On exit from ZMMCH CT holds */
+/*     the result computed by ZMMCH. */
+    *(unsigned char *)transa = 'N';
+    *(unsigned char *)transb = 'N';
+    zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], &
+	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, 
+	    &c__6, &c_true, (ftnlen)1, (ftnlen)1);
+    same = lze_(cc, ct, &n);
+    if (! same || err != 0.) {
+      printf("ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n");
+      printf("ZMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb);
+      printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err);
+      printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n");
+      printf("****** TESTS ABANDONED ******\n");
+      exit(1);
+    }
+    *(unsigned char *)transb = 'C';
+    zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], &
+	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, 
+	    &c__6, &c_true, (ftnlen)1, (ftnlen)1);
+    same = lze_(cc, ct, &n);
+    if (! same || err != 0.) {
+      printf("ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n");
+      printf("ZMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb);
+      printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err);
+      printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n");
+      printf("****** TESTS ABANDONED ******\n");
+      exit(1);
+    }
+    i__1 = n;
+    for (j = 1; j <= i__1; ++j) {
+	i__2 = j + 4224;
+	i__3 = n - j + 1;
+	ab[i__2].r = (doublereal) i__3, ab[i__2].i = 0.;
+	i__2 = (j + 65) * 65 - 65;
+	i__3 = n - j + 1;
+	ab[i__2].r = (doublereal) i__3, ab[i__2].i = 0.;
+/* L120: */
+    }
+    i__1 = n;
+    for (j = 1; j <= i__1; ++j) {
+	i__2 = n - j;
+	i__3 = j * ((j + 1) * j) / 2 - (j + 1) * j * (j - 1) / 3;
+	cc[i__2].r = (doublereal) i__3, cc[i__2].i = 0.;
+/* L130: */
+    }
+    *(unsigned char *)transa = 'C';
+    *(unsigned char *)transb = 'N';
+    zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], &
+	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, 
+	    &c__6, &c_true, (ftnlen)1, (ftnlen)1);
+    same = lze_(cc, ct, &n);
+    if (! same || err != 0.) {
+      printf("ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n");
+      printf("ZMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb);
+      printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err);
+      printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n");
+      printf("****** TESTS ABANDONED ******\n");
+      exit(1);
+    }
+    *(unsigned char *)transb = 'C';
+    zmmch_(transa, transb, &n, &c__1, &n, &c_b2, ab, &c__65, &ab[4225], &
+	    c__65, &c_b1, c__, &c__65, ct, g, cc, &c__65, &eps, &err, &fatal, 
+	    &c__6, &c_true, (ftnlen)1, (ftnlen)1);
+    same = lze_(cc, ct, &n);
+    if (! same || err != 0.) {
+      printf("ERROR IN ZMMCH - IN-LINE DOT PRODUCTS ARE BEING EVALUATED WRONGLY\n");
+      printf("ZMMCH WAS CALLED WITH TRANSA = %s AND TRANSB = %s\n", transa,transb);
+      printf("AND RETURNED SAME = %c AND ERR = %12.3f.\n",(same==FALSE_? 'F':'T'),err);
+      printf("THIS MAY BE DUE TO FAULTS IN THE ARITHMETIC OR THE COMPILER.\n");
+      printf("****** TESTS ABANDONED ******\n");
+      exit(1);
+    }
+
+/*     Test each subroutine in turn. */
+
+    for (isnum = 1; isnum <= 9; ++isnum) {
+	if (! ltest[isnum - 1]) {
+/*           Subprogram is not to be tested. */
+           printf("%12s WAS NOT TESTED\n",snames[isnum-1]);
+	} else {
+	    s_copy(srnamc_1.srnamt, snames[isnum - 1], (ftnlen)12, (
+		    ftnlen)12);
+/*           Test error exits. */
+	    if (tsterr) {
+		cz3chke_(snames[isnum - 1], (ftnlen)12);
+	    }
+/*           Test computations. */
+	    infoc_1.infot = 0;
+	    infoc_1.ok = TRUE_;
+	    fatal = FALSE_;
+	    switch ((int)isnum) {
+		case 1:  goto L140;
+		case 2:  goto L150;
+		case 3:  goto L150;
+		case 4:  goto L160;
+		case 5:  goto L160;
+		case 6:  goto L170;
+		case 7:  goto L170;
+		case 8:  goto L180;
+		case 9:  goto L180;
+	    }
+/*           Test ZGEMM, 01. */
+L140:
+	    if (corder) {
+		zchk1_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
+			 cc, cs, ct, g, &c__0, (ftnlen)12);
+	    }
+	    if (rorder) {
+		zchk1_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
+			 cc, cs, ct, g, &c__1, (ftnlen)12);
+	    }
+	    goto L190;
+/*           Test ZHEMM, 02, ZSYMM, 03. */
+L150:
+	    if (corder) {
+		zchk2_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
+			 cc, cs, ct, g, &c__0, (ftnlen)12);
+	    }
+	    if (rorder) {
+		zchk2_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
+			 cc, cs, ct, g, &c__1, (ftnlen)12);
+	    }
+	    goto L190;
+/*           Test ZTRMM, 04, ZTRSM, 05. */
+L160:
+	    if (corder) {
+		zchk3_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, &
+			c__0, (ftnlen)12);
+	    }
+	    if (rorder) {
+		zchk3_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			c__65, ab, aa, as, &ab[4225], bb, bs, ct, g, c__, &
+			c__1, (ftnlen)12);
+	    }
+	    goto L190;
+/*           Test ZHERK, 06, ZSYRK, 07. */
+L170:
+	    if (corder) {
+		zchk4_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
+			 cc, cs, ct, g, &c__0, (ftnlen)12);
+	    }
+	    if (rorder) {
+		zchk4_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			nbet, bet, &c__65, ab, aa, as, &ab[4225], bb, bs, c__,
+			 cc, cs, ct, g, &c__1, (ftnlen)12);
+	    }
+	    goto L190;
+/*           Test ZHER2K, 08, ZSYR2K, 09. */
+L180:
+	    if (corder) {
+		zchk5_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, 
+			ct, g, w, &c__0, (ftnlen)12);
+	    }
+	    if (rorder) {
+		zchk5_(snames[isnum - 1], &eps, &thresh, &c__6, &ntra,
+			 &trace, &rewi, &fatal, &nidim, idim, &nalf, alf, &
+			nbet, bet, &c__65, ab, aa, as, bb, bs, c__, cc, cs, 
+			ct, g, w, &c__1, (ftnlen)12);
+	    }
+	    goto L190;
+
+L190:
+	    if (fatal && sfatal) {
+		goto L210;
+	    }
+	}
+/* L200: */
+    }
+    printf("\nEND OF TESTS\n");
+    goto L230;
+
+L210:
+    printf("\n****** FATAL ERROR - TESTS ABANDONED ******\n");
+    goto L230;
+
+L220:
+    printf("AMEND DATA FILE OR INCREASE ARRAY SIZES IN PROGRAM\n");
+    printf("****** TESTS ABANDONED ******\n");
+
+L230:
+    if (trace) {
+/*	cl__1.cerr = 0;
+	cl__1.cunit = ntra;
+	cl__1.csta = 0;
+	f_clos(&cl__1);*/
+    }
+/*    cl__1.cerr = 0;
+    cl__1.cunit = 6;
+    cl__1.csta = 0;
+    f_clos(&cl__1);*/
+    exit(0);
+
+/*     End of ZBLAT3. */
+
+} /* MAIN__ */
+
+/* Subroutine */ int zchk1_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, integer* iorder, ftnlen sname_len)
+{
+    /* Initialized data */
+
+    static char ich[3+1] = "NTC";
+
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
+	    i__3, i__4, i__5, i__6, i__7, i__8;
+
+    /* Local variables */
+    static doublecomplex beta;
+    static integer ldas, ldbs, ldcs;
+    static logical same, null;
+    static integer i__, k, m, n;
+    static doublecomplex alpha;
+    static logical isame[13], trana, tranb;
+    extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen);
+    static integer nargs;
+    extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
+    static logical reset;
+    static integer ia, ib;
+    extern /* Subroutine */ int zprcn1_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, integer*, doublecomplex*, integer*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen);
+    static integer ma, mb, na, nb, nc, ik, im, in, ks, ms, ns;
+    extern /* Subroutine */ void czgemm3m_(integer*, char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
+    static char tranas[1], tranbs[1], transa[1], transb[1];
+    static doublereal errmax;
+    extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
+    static integer ica, icb, laa, lbb, lda, lcc, ldb, ldc;
+    static doublecomplex als, bls;
+    static doublereal err;
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
+
+/*  Tests CZGEMM3M over every M, N, K, TRANSA, TRANSB, ALPHA, BETA combination. */
+
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+/*     Arguments (all passed by reference, Fortran style): */
+/*     SNAME  - routine name; used in the reports and in the ZPRCN1 call log. */
+/*     EPS, NOUT - forwarded to ZMMCH; THRESH - ERRMAX must stay below it to pass. */
+/*     NTRA, TRACE - when *TRACE is set, each call is logged through ZPRCN1(NTRA,..). */
+/*     REWI   - accepted for compatibility; the rewind it guarded is commented out. */
+/*     FATAL  - set to TRUE_ on failures detected here; also handed to ZMMCH. */
+/*     NIDIM, IDIM - the values tried for each of M, N and K. */
+/*     NALF, ALF, NBET, BET - the values tried for ALPHA and BETA. */
+/*     NMAX   - leading dimension of A, B and C; cases needing more are skipped. */
+/*     A..G   - work arrays supplied by the caller (AS, BS, CS hold saved copies). */
+/*     IORDER - 0 reports as column-major, 1 as row-major; passed to CZGEMM3M. */
+    /* Parameter adjustments */
+    --idim;
+    --alf;
+    --bet;
+    --g;
+    --ct;
+    --cs;
+    --cc;
+    c_dim1 = *nmax;
+    c_offset = 1 + c_dim1 * 1;
+    c__ -= c_offset;
+    --bs;
+    --bb;
+    b_dim1 = *nmax;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    --as;
+    --aa;
+    a_dim1 = *nmax;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+
+    /* Function Body */
+/*     .. Executable Statements .. */
+
+    nargs = 13;
+    nc = 0;
+    reset = TRUE_;
+    errmax = 0.;
+
+    i__1 = *nidim;
+    for (im = 1; im <= i__1; ++im) {
+	m = idim[im];
+
+	i__2 = *nidim;
+	for (in = 1; in <= i__2; ++in) {
+	    n = idim[in];
+/*           Set LDC to 1 more than minimum value if room. */
+	    ldc = m;
+	    if (ldc < *nmax) {
+		++ldc;
+	    }
+/*           Skip tests if not enough room. */
+	    if (ldc > *nmax) {
+		goto L100;
+	    }
+	    lcc = ldc * n;
+	    null = n <= 0 || m <= 0;
+
+	    i__3 = *nidim;
+	    for (ik = 1; ik <= i__3; ++ik) {
+		k = idim[ik];
+
+		for (ica = 1; ica <= 3; ++ica) {
+		    *(unsigned char *)transa = *(unsigned char *)&ich[ica - 1]
+			    ;
+		    trana = *(unsigned char *)transa == 'T' || *(unsigned 
+			    char *)transa == 'C';
+
+		    if (trana) {
+			ma = k;
+			na = m;
+		    } else {
+			ma = m;
+			na = k;
+		    }
+/*                 Set LDA to 1 more than minimum value if room. */
+		    lda = ma;
+		    if (lda < *nmax) {
+			++lda;
+		    }
+/*                 Skip tests if not enough room. */
+		    if (lda > *nmax) {
+			goto L80;
+		    }
+		    laa = lda * na;
+
+/*                 Generate the matrix A. */
+
+		    zmake_("ge", " ", " ", &ma, &na, &a[a_offset], nmax, &aa[
+			    1], &lda, &reset, &c_b1, (ftnlen)2, (ftnlen)1, (
+			    ftnlen)1);
+
+		    for (icb = 1; icb <= 3; ++icb) {
+			*(unsigned char *)transb = *(unsigned char *)&ich[icb 
+				- 1];
+			tranb = *(unsigned char *)transb == 'T' || *(unsigned 
+				char *)transb == 'C';
+
+			if (tranb) {
+			    mb = n;
+			    nb = k;
+			} else {
+			    mb = k;
+			    nb = n;
+			}
+/*                    Set LDB to 1 more than minimum value if room. */
+			ldb = mb;
+			if (ldb < *nmax) {
+			    ++ldb;
+			}
+/*                    Skip tests if not enough room. */
+			if (ldb > *nmax) {
+			    goto L70;
+			}
+			lbb = ldb * nb;
+
+/*                    Generate the matrix B. */
+
+			zmake_("ge", " ", " ", &mb, &nb, &b[b_offset], nmax, &
+				bb[1], &ldb, &reset, &c_b1, (ftnlen)2, (
+				ftnlen)1, (ftnlen)1);
+
+			i__4 = *nalf;
+			for (ia = 1; ia <= i__4; ++ia) {
+			    i__5 = ia;
+			    alpha.r = alf[i__5].r, alpha.i = alf[i__5].i;
+
+			    i__5 = *nbet;
+			    for (ib = 1; ib <= i__5; ++ib) {
+				i__6 = ib;
+				beta.r = bet[i__6].r, beta.i = bet[i__6].i;
+
+/*                          Generate the matrix C. */
+
+				zmake_("ge", " ", " ", &m, &n, &c__[c_offset],
+					 nmax, &cc[1], &ldc, &reset, &c_b1, (
+					ftnlen)2, (ftnlen)1, (ftnlen)1);
+
+				++nc;
+
+/*                          Save every datum before calling the */
+/*                          subroutine. */
+
+				*(unsigned char *)tranas = *(unsigned char *)
+					transa;
+				*(unsigned char *)tranbs = *(unsigned char *)
+					transb;
+				ms = m;
+				ns = n;
+				ks = k;
+				als.r = alpha.r, als.i = alpha.i;
+				i__6 = laa;
+				for (i__ = 1; i__ <= i__6; ++i__) {
+				    i__7 = i__;
+				    i__8 = i__;
+				    as[i__7].r = aa[i__8].r, as[i__7].i = aa[
+					    i__8].i;
+/* L10: */
+				}
+				ldas = lda;
+				i__6 = lbb;
+				for (i__ = 1; i__ <= i__6; ++i__) {
+				    i__7 = i__;
+				    i__8 = i__;
+				    bs[i__7].r = bb[i__8].r, bs[i__7].i = bb[
+					    i__8].i;
+/* L20: */
+				}
+				ldbs = ldb;
+				bls.r = beta.r, bls.i = beta.i;
+				i__6 = lcc;
+				for (i__ = 1; i__ <= i__6; ++i__) {
+				    i__7 = i__;
+				    i__8 = i__;
+				    cs[i__7].r = cc[i__8].r, cs[i__7].i = cc[
+					    i__8].i;
+/* L30: */
+				}
+				ldcs = ldc;
+
+/*                          Call the subroutine. */
+
+				if (*trace) {
+				    zprcn1_(ntra, &nc, sname, iorder, transa, 
+					    transb, &m, &n, &k, &alpha, &lda, 
+					    &ldb, &beta, &ldc, (ftnlen)12, (
+					    ftnlen)1, (ftnlen)1);
+				}
+				if (*rewi) {
+/*				    al__1.aerr = 0;
+				    al__1.aunit = *ntra;
+				    f_rew(&al__1);*/
+				}
+				czgemm3m_(iorder, transa, transb, &m, &n, &k, &
+					alpha, &aa[1], &lda, &bb[1], &ldb, &
+					beta, &cc[1], &ldc, (ftnlen)1, (
+					ftnlen)1);
+
+/*                          Check if error-exit was taken incorrectly. */
+
+				if (! infoc_1.ok) {
+                                    printf(" *** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n");
+				    *fatal = TRUE_;
+				    goto L120;
+				}
+
+/*                          See what data changed inside subroutines. */
+
+				isame[0] = *(unsigned char *)transa == *(
+					unsigned char *)tranas;
+				isame[1] = *(unsigned char *)transb == *(
+					unsigned char *)tranbs;
+				isame[2] = ms == m;
+				isame[3] = ns == n;
+				isame[4] = ks == k;
+				isame[5] = als.r == alpha.r && als.i == 
+					alpha.i;
+				isame[6] = lze_(&as[1], &aa[1], &laa);
+				isame[7] = ldas == lda;
+				isame[8] = lze_(&bs[1], &bb[1], &lbb);
+				isame[9] = ldbs == ldb;
+				isame[10] = bls.r == beta.r && bls.i == 
+					beta.i;
+				if (null) {
+				    isame[11] = lze_(&cs[1], &cc[1], &lcc);
+				} else {
+				    isame[11] = lzeres_("ge", " ", &m, &n, &
+					    cs[1], &cc[1], &ldc, (ftnlen)2, (
+					    ftnlen)1);
+				}
+				isame[12] = ldcs == ldc;
+
+/*                          If data was incorrectly changed, report */
+/*                          and return. */
+
+				same = TRUE_;
+				i__6 = nargs;
+				for (i__ = 1; i__ <= i__6; ++i__) {
+				    same = same && isame[i__ - 1];
+				    if (! isame[i__ - 1]) {
+	                                printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__);
+				    }
+/* L40: */
+				}
+				if (! same) {
+				    *fatal = TRUE_;
+				    goto L120;
+				}
+
+				if (! null) {
+
+/*                             Check the result. */
+
+				    zmmch_(transa, transb, &m, &n, &k, &alpha,
+					     &a[a_offset], nmax, &b[b_offset],
+					     nmax, &beta, &c__[c_offset], 
+					    nmax, &ct[1], &g[1], &cc[1], &ldc,
+					     eps, &err, fatal, nout, &c_true, 
+					    (ftnlen)1, (ftnlen)1);
+				    errmax = f2cmax(errmax,err);
+/*                             If got really bad answer, report and */
+/*                             return. */
+				    if (*fatal) {
+					goto L120;
+				    }
+				}
+
+/* L50: */
+			    }
+
+/* L60: */
+			}
+
+L70:
+			;
+		    }
+
+L80:
+		    ;
+		}
+
+/* L90: */
+	    }
+
+L100:
+	    ;
+	}
+
+/* L110: */
+    }
+
+/*     Report result; every message ends in a real newline ("\n"). */
+
+    if (errmax < *thresh) {
+	if (*iorder == 0) {
+            printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+	}
+	if (*iorder == 1) {
+            printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+	}
+    } else {
+	if (*iorder == 0) {
+            printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+            printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******\n",errmax);
+	}
+	if (*iorder == 1) {
+            printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+            printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******\n",errmax);
+	}
+    }
+    goto L130;
+
+L120:
+    printf(" ******* %s FAILED ON CALL NUMBER:\n",sname);
+    zprcn1_(nout, &nc, sname, iorder, transa, transb, &m, &n, &k, &alpha, &
+	    lda, &ldb, &beta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1);
+
+L130:
+    return 0;
+
+/* 9995 FORMAT( 1X, I6, ': ', A12,'(''', A1, ''',''', A1, ''',', */
+/*     $     3( I3, ',' ), '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, */
+/*     $     ',(', F4.1, ',', F4.1, '), C,', I3, ').' ) */
+
+/*     End of ZCHK1. */
+
+} /* zchk1_ */
+
+
+/* Subroutine */ int zprcn1_(integer* nout, integer* nc, char* sname, integer* iorder, char* transa, char* transb, integer* m, integer* n, integer* k, doublecomplex* alpha, integer* lda, integer* ldb, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen transa_len, ftnlen transb_len)
+{
+    /* Prints one ZCHK1 call record to stdout; NOUT and the *_len arguments are not used. Returns 0. */
+    static char crc[14], cta[14], ctb[14];
+    /* Each label is exactly 14 characters in a 14-byte buffer, so there is no NUL: print with "%.14s". */
+    if (*(unsigned char *)transa == 'N') {
+	s_copy(cta, "  CblasNoTrans", (ftnlen)14, (ftnlen)14);
+    } else if (*(unsigned char *)transa == 'T') {
+	s_copy(cta, "    CblasTrans", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(cta, "CblasConjTrans", (ftnlen)14, (ftnlen)14);
+    }
+    if (*(unsigned char *)transb == 'N') {
+	s_copy(ctb, "  CblasNoTrans", (ftnlen)14, (ftnlen)14);
+    } else if (*(unsigned char *)transb == 'T') {
+	s_copy(ctb, "    CblasTrans", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(ctb, "CblasConjTrans", (ftnlen)14, (ftnlen)14);
+    }
+    if (*iorder == 1) {
+	s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
+    }
+    printf("%6d: %s %.14s %.14s %.14s\n",*nc,sname,crc,cta,ctb);
+    printf("%d %d %d (%4.1lf,%4.1lf) , A, %d, B, %d, (%4.1lf,%4.1lf) , C, %d.\n",*m,*n,*k,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc);
+
+return 0;
+} /* zprcn1_ */
+
+
+/* Subroutine */ int zchk2_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, integer* iorder, ftnlen sname_len)
+{
+    /* Initialized data */
+
+    static char ichs[2+1] = "LR";
+    static char ichu[2+1] = "UL";
+
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
+	    i__3, i__4, i__5, i__6, i__7;
+
+    /* Local variables */
+    static doublecomplex beta;
+    static integer ldas, ldbs, ldcs;
+    static logical same;
+    static char side[1];
+    static logical isconj, left, null;
+    static char uplo[1];
+    static integer i__, m, n;
+    static doublecomplex alpha;
+    static logical isame[13];
+    static char sides[1];
+    extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen);
+    static integer nargs;
+    extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
+    static logical reset;
+    static char uplos[1];
+    static integer ia, ib;
+    extern /* Subroutine */ int zprcn2_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen);
+    static integer na, nc, im, in, ms, ns;
+    extern /* Subroutine */ void czhemm_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
+    static doublereal errmax;
+    extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
+    extern /* Subroutine */ void czsymm_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
+    static integer laa, lbb, lda, lcc, ldb, ldc, ics;
+    static doublecomplex als, bls;
+    static integer icu;
+    static doublereal err;
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
+
+/*  Tests CZHEMM and CZSYMM; CZHEMM is called when SNAME+7 starts with "he". */
+
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+/*     Arguments (all passed by reference, Fortran style): */
+/*     SNAME  - routine name; SNAME+7 is also the matrix type handed to ZMAKE for A. */
+/*     EPS, NOUT - forwarded to ZMMCH; THRESH - ERRMAX must stay below it to pass. */
+/*     NTRA, TRACE - when *TRACE is set, each call is logged through ZPRCN2(NTRA,..). */
+/*     REWI   - accepted for compatibility; the rewind it guarded is commented out. */
+/*     FATAL  - set to TRUE_ on failures detected here; also handed to ZMMCH. */
+/*     NIDIM, IDIM - the values tried for each of M and N. */
+/*     NALF, ALF, NBET, BET - the values tried for ALPHA and BETA. */
+/*     NMAX   - leading dimension of A, B and C; cases needing more are skipped. */
+/*     A..G   - work arrays supplied by the caller (AS, BS, CS hold saved copies). */
+/*     IORDER - 0 reports as column-major, 1 as row-major; passed to the routine. */
+    /* Parameter adjustments */
+    --idim;
+    --alf;
+    --bet;
+    --g;
+    --ct;
+    --cs;
+    --cc;
+    c_dim1 = *nmax;
+    c_offset = 1 + c_dim1 * 1;
+    c__ -= c_offset;
+    --bs;
+    --bb;
+    b_dim1 = *nmax;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    --as;
+    --aa;
+    a_dim1 = *nmax;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+
+    /* Function Body */
+/*     .. Executable Statements .. */
+    isconj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0;
+
+    nargs = 12;
+    nc = 0;
+    reset = TRUE_;
+    errmax = 0.;
+
+    i__1 = *nidim;
+    for (im = 1; im <= i__1; ++im) {
+	m = idim[im];
+
+	i__2 = *nidim;
+	for (in = 1; in <= i__2; ++in) {
+	    n = idim[in];
+/*           Set LDC to 1 more than minimum value if room. */
+	    ldc = m;
+	    if (ldc < *nmax) {
+		++ldc;
+	    }
+/*           Skip tests if not enough room. */
+	    if (ldc > *nmax) {
+		goto L90;
+	    }
+	    lcc = ldc * n;
+	    null = n <= 0 || m <= 0;
+/*           Set LDB to 1 more than minimum value if room. */
+	    ldb = m;
+	    if (ldb < *nmax) {
+		++ldb;
+	    }
+/*           Skip tests if not enough room. */
+	    if (ldb > *nmax) {
+		goto L90;
+	    }
+	    lbb = ldb * n;
+
+/*           Generate the matrix B. */
+
+	    zmake_("ge", " ", " ", &m, &n, &b[b_offset], nmax, &bb[1], &ldb, &
+		    reset, &c_b1, (ftnlen)2, (ftnlen)1, (ftnlen)1);
+
+	    for (ics = 1; ics <= 2; ++ics) {
+		*(unsigned char *)side = *(unsigned char *)&ichs[ics - 1];
+		left = *(unsigned char *)side == 'L';
+
+		if (left) {
+		    na = m;
+		} else {
+		    na = n;
+		}
+/*              Set LDA to 1 more than minimum value if room. */
+		lda = na;
+		if (lda < *nmax) {
+		    ++lda;
+		}
+/*              Skip tests if not enough room. */
+		if (lda > *nmax) {
+		    goto L80;
+		}
+		laa = lda * na;
+
+		for (icu = 1; icu <= 2; ++icu) {
+		    *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1];
+
+/*                 Generate the hermitian or symmetric matrix A. */
+
+		    zmake_(sname + 7, uplo, " ", &na, &na, &a[a_offset], nmax,
+			     &aa[1], &lda, &reset, &c_b1, (ftnlen)2, (ftnlen)
+			    1, (ftnlen)1);
+
+		    i__3 = *nalf;
+		    for (ia = 1; ia <= i__3; ++ia) {
+			i__4 = ia;
+			alpha.r = alf[i__4].r, alpha.i = alf[i__4].i;
+
+			i__4 = *nbet;
+			for (ib = 1; ib <= i__4; ++ib) {
+			    i__5 = ib;
+			    beta.r = bet[i__5].r, beta.i = bet[i__5].i;
+
+/*                       Generate the matrix C. */
+
+			    zmake_("ge", " ", " ", &m, &n, &c__[c_offset], 
+				    nmax, &cc[1], &ldc, &reset, &c_b1, (
+				    ftnlen)2, (ftnlen)1, (ftnlen)1);
+
+			    ++nc;
+
+/*                       Save every datum before calling the */
+/*                       subroutine. */
+
+			    *(unsigned char *)sides = *(unsigned char *)side;
+			    *(unsigned char *)uplos = *(unsigned char *)uplo;
+			    ms = m;
+			    ns = n;
+			    als.r = alpha.r, als.i = alpha.i;
+			    i__5 = laa;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				i__6 = i__;
+				i__7 = i__;
+				as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7]
+					.i;
+/* L10: */
+			    }
+			    ldas = lda;
+			    i__5 = lbb;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				i__6 = i__;
+				i__7 = i__;
+				bs[i__6].r = bb[i__7].r, bs[i__6].i = bb[i__7]
+					.i;
+/* L20: */
+			    }
+			    ldbs = ldb;
+			    bls.r = beta.r, bls.i = beta.i;
+			    i__5 = lcc;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				i__6 = i__;
+				i__7 = i__;
+				cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7]
+					.i;
+/* L30: */
+			    }
+			    ldcs = ldc;
+
+/*                       Call the subroutine. */
+
+			    if (*trace) {
+				zprcn2_(ntra, &nc, sname, iorder, side, uplo, 
+					&m, &n, &alpha, &lda, &ldb, &beta, &
+					ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1)
+					;
+			    }
+			    if (*rewi) {
+/*				al__1.aerr = 0;
+				al__1.aunit = *ntra;
+				f_rew(&al__1);*/
+			    }
+			    if (isconj) {
+				czhemm_(iorder, side, uplo, &m, &n, &alpha, &
+					aa[1], &lda, &bb[1], &ldb, &beta, &cc[
+					1], &ldc, (ftnlen)1, (ftnlen)1);
+			    } else {
+				czsymm_(iorder, side, uplo, &m, &n, &alpha, &
+					aa[1], &lda, &bb[1], &ldb, &beta, &cc[
+					1], &ldc, (ftnlen)1, (ftnlen)1);
+			    }
+
+/*                       Check if error-exit was taken incorrectly. */
+
+			    if (! infoc_1.ok) {
+			    	printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n");
+				*fatal = TRUE_;
+				goto L110;
+			    }
+
+/*                       See what data changed inside subroutines. */
+
+			    isame[0] = *(unsigned char *)sides == *(unsigned 
+				    char *)side;
+			    isame[1] = *(unsigned char *)uplos == *(unsigned 
+				    char *)uplo;
+			    isame[2] = ms == m;
+			    isame[3] = ns == n;
+			    isame[4] = als.r == alpha.r && als.i == alpha.i;
+			    isame[5] = lze_(&as[1], &aa[1], &laa);
+			    isame[6] = ldas == lda;
+			    isame[7] = lze_(&bs[1], &bb[1], &lbb);
+			    isame[8] = ldbs == ldb;
+			    isame[9] = bls.r == beta.r && bls.i == beta.i;
+			    if (null) {
+				isame[10] = lze_(&cs[1], &cc[1], &lcc);
+			    } else {
+				isame[10] = lzeres_("ge", " ", &m, &n, &cs[1],
+					 &cc[1], &ldc, (ftnlen)2, (ftnlen)1);
+			    }
+			    isame[11] = ldcs == ldc;
+
+/*                       If data was incorrectly changed, report and */
+/*                       return. */
+
+			    same = TRUE_;
+			    i__5 = nargs;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				same = same && isame[i__ - 1];
+				if (! isame[i__ - 1]) {
+                                printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__);
+				}
+/* L40: */
+			    }
+			    if (! same) {
+				*fatal = TRUE_;
+				goto L110;
+			    }
+
+			    if (! null) {
+
+/*                          Check the result. */
+
+				if (left) {
+				    zmmch_("N", "N", &m, &n, &m, &alpha, &a[
+					    a_offset], nmax, &b[b_offset], 
+					    nmax, &beta, &c__[c_offset], nmax,
+					     &ct[1], &g[1], &cc[1], &ldc, eps,
+					     &err, fatal, nout, &c_true, (
+					    ftnlen)1, (ftnlen)1);
+				} else {
+				    zmmch_("N", "N", &m, &n, &n, &alpha, &b[
+					    b_offset], nmax, &a[a_offset], 
+					    nmax, &beta, &c__[c_offset], nmax,
+					     &ct[1], &g[1], &cc[1], &ldc, eps,
+					     &err, fatal, nout, &c_true, (
+					    ftnlen)1, (ftnlen)1);
+				}
+				errmax = f2cmax(errmax,err);
+/*                          If got really bad answer, report and */
+/*                          return. */
+				if (*fatal) {
+				    goto L110;
+				}
+			    }
+
+/* L50: */
+			}
+
+/* L60: */
+		    }
+
+/* L70: */
+		}
+
+L80:
+		;
+	    }
+
+L90:
+	    ;
+	}
+
+/* L100: */
+    }
+
+/*     Report result; every message ends in a real newline ("\n"). */
+
+    if (errmax < *thresh) {
+	if (*iorder == 0) {
+            printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+	}
+	if (*iorder == 1) {
+            printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+	}
+    } else {
+	if (*iorder == 0) {
+            printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+            printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******\n",errmax);
+	}
+	if (*iorder == 1) {
+            printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+            printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******\n",errmax);
+	}
+    }
+    goto L120;
+
+L110:
+    printf(" ******* %s FAILED ON CALL NUMBER:\n",sname);
+    zprcn2_(nout, &nc, sname, iorder, side, uplo, &m, &n, &alpha, &lda, &ldb, 
+	    &beta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1);
+
+L120:
+    return 0;
+
+/* 9995 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */
+/*     $      '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, */
+/*     $      ',', F4.1, '), C,', I3, ')    .' ) */
+
+/*     End of ZCHK2. */
+
+} /* zchk2_ */
+
+
+/* Subroutine */ int zprcn2_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, integer* m, integer* n, doublecomplex* alpha, integer* lda, integer* ldb, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len)
+{
+/*  Prints a two-line description of one test call: the call number, the */
+/*  routine name and the CBLAS order/side/uplo names on the first line, */
+/*  and m, n, alpha, lda, ldb, beta and ldc on the second. */
+
+/*  nc      call number */
+/*  sname   routine name, printed with %s */
+/*  iorder  1 selects " CblasRowMajor", anything else " CblasColMajor" */
+/*  side    'L' selects "CblasLeft", anything else "CblasRight" */
+/*  uplo    'U' selects "CblasUpper", anything else "CblasLower" */
+/*  nout and the trailing ftnlen arguments are not used; output goes to */
+/*  stdout through printf.  Always returns 0. */
+
+    /* Local variables */
+    /* The labels are NUL-terminated string literals.  Copying exactly 14 */
+    /* characters into a char[14] left no room for a terminator, so "%s" */
+    /* could read past the end of the buffer. */
+    const char *cs, *cu, *crc;
+
+    if (*(unsigned char *)side == 'L') {
+	cs = "     CblasLeft";
+    } else {
+	cs = "    CblasRight";
+    }
+    if (*(unsigned char *)uplo == 'U') {
+	cu = "    CblasUpper";
+    } else {
+	cu = "    CblasLower";
+    }
+    if (*iorder == 1) {
+	crc = " CblasRowMajor";
+    } else {
+	crc = " CblasColMajor";
+    }
+    printf("%6d: %s %s %s %s\n",*nc,sname,crc,cs,cu);
+    printf("%d %d (%4.1lf,%4.1lf) , A, %d, B, %d, (%4.1lf,%4.1lf) , C, %d.\n",*m,*n,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc);
+
+return 0;
+} /* zprcn2_ */
+
+
+/* Subroutine */ int zchk3_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* ct, doublereal* g, doublecomplex* c__, integer* iorder, ftnlen sname_len)
+{
+    /* Initialized data */
+
+    static char ichu[2+1] = "UL";
+    static char icht[3+1] = "NTC";
+    static char ichd[2+1] = "UN";
+    static char ichs[2+1] = "LR";
+
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
+	    i__3, i__4, i__5, i__6, i__7;
+    doublecomplex z__1;
+
+    /* Local variables */
+    static char diag[1];
+    static integer ldas, ldbs;
+    static logical same;
+    static char side[1];
+    static logical left, null;
+    static char uplo[1];
+    static integer i__, j, m, n;
+    static doublecomplex alpha;
+    static char diags[1];
+    static logical isame[13];
+    static char sides[1];
+    extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen);
+    static integer nargs;
+    extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
+    static logical reset;
+    static char uplos[1];
+    static integer ia, na;
+    extern /* Subroutine */ int zprcn3_(integer*, integer*, char*, integer*, char*, char*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, ftnlen, ftnlen, ftnlen, ftnlen, ftnlen);
+    static integer nc, im, in, ms, ns;
+    static char tranas[1], transa[1];
+    static doublereal errmax;
+    extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
+    extern /* Subroutine */ void cztrmm_(integer*, char*, char*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ void cztrsm_(integer*, char*, char*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen, ftnlen);
+    static integer laa, icd, lbb, lda, ldb, ics;
+    static doublecomplex als;
+    static integer ict, icu;
+    static doublereal err;
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
+
+/*  Tests ZTRMM and ZTRSM. */
+
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  For every pair (m, n) taken from idim(1..nidim), every side in "LR", */
+/*  uplo in "UL", transa in "NTC", diag in "UN" and every alpha in */
+/*  alf(1..nalf), the routine generates A and B with zmake_, saves a copy */
+/*  of every input, calls cztrmm_ or cztrsm_ (chosen by comparing */
+/*  sname + 9 with "mm" / "sm"), checks through isame[] that no input */
+/*  argument was changed, and checks the result with zmmch_.  The largest */
+/*  err value seen is kept in errmax and compared with *thresh for the */
+/*  final summary, which is worded according to *iorder (0 column-major, */
+/*  1 row-major). */
+
+/*  If infoc_1.ok is false after the call, or an input argument was */
+/*  changed, *fatal is set here; if *fatal is set after the zmmch_ check */
+/*  the routine also stops.  In all three cases the failing call is */
+/*  reported (through zprcn3_ when *trace is set).  Always returns 0. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+/*     .. Parameters .. */
+/*     .. Scalar Arguments .. */
+/*     .. Array Arguments .. */
+/*     .. Local Scalars .. */
+/*     .. Local Arrays .. */
+/*     .. External Functions .. */
+/*     .. External Subroutines .. */
+/*     .. Intrinsic Functions .. */
+/*     .. Scalars in Common .. */
+/*     .. Common blocks .. */
+/*     .. Data statements .. */
+    /* Parameter adjustments */
+    /* Shift the array pointers so the Fortran-style 1-based indices used */
+    /* below address the first element of each array. */
+    --idim;
+    --alf;
+    c_dim1 = *nmax;
+    c_offset = 1 + c_dim1 * 1;
+    c__ -= c_offset;
+    --g;
+    --ct;
+    --bs;
+    --bb;
+    b_dim1 = *nmax;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    --as;
+    --aa;
+    a_dim1 = *nmax;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+
+    /* Function Body */
+/*     .. Executable Statements .. */
+
+    nargs = 11;
+    nc = 0;
+    reset = TRUE_;
+    errmax = 0.;
+/*     Set up zero matrix for ZMMCH. */
+    i__1 = *nmax;
+    for (j = 1; j <= i__1; ++j) {
+	i__2 = *nmax;
+	for (i__ = 1; i__ <= i__2; ++i__) {
+	    i__3 = i__ + j * c_dim1;
+	    c__[i__3].r = 0., c__[i__3].i = 0.;
+/* L10: */
+	}
+/* L20: */
+    }
+
+    i__1 = *nidim;
+    for (im = 1; im <= i__1; ++im) {
+	m = idim[im];
+
+	i__2 = *nidim;
+	for (in = 1; in <= i__2; ++in) {
+	    n = idim[in];
+/*           Set LDB to 1 more than minimum value if room. */
+	    ldb = m;
+	    if (ldb < *nmax) {
+		++ldb;
+	    }
+/*           Skip tests if not enough room. */
+	    if (ldb > *nmax) {
+		goto L130;
+	    }
+	    lbb = ldb * n;
+	    null = m <= 0 || n <= 0;
+
+	    for (ics = 1; ics <= 2; ++ics) {
+		*(unsigned char *)side = *(unsigned char *)&ichs[ics - 1];
+		left = *(unsigned char *)side == 'L';
+		if (left) {
+		    na = m;
+		} else {
+		    na = n;
+		}
+/*              Set LDA to 1 more than minimum value if room. */
+		lda = na;
+		if (lda < *nmax) {
+		    ++lda;
+		}
+/*              Skip tests if not enough room. */
+		if (lda > *nmax) {
+		    goto L130;
+		}
+		laa = lda * na;
+
+		for (icu = 1; icu <= 2; ++icu) {
+		    *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1];
+
+		    for (ict = 1; ict <= 3; ++ict) {
+			*(unsigned char *)transa = *(unsigned char *)&icht[
+				ict - 1];
+
+			for (icd = 1; icd <= 2; ++icd) {
+			    *(unsigned char *)diag = *(unsigned char *)&ichd[
+				    icd - 1];
+
+			    i__3 = *nalf;
+			    for (ia = 1; ia <= i__3; ++ia) {
+				i__4 = ia;
+				alpha.r = alf[i__4].r, alpha.i = alf[i__4].i;
+
+/*                          Generate the matrix A. */
+
+				zmake_("tr", uplo, diag, &na, &na, &a[
+					a_offset], nmax, &aa[1], &lda, &reset,
+					 &c_b1, (ftnlen)2, (ftnlen)1, (ftnlen)
+					1);
+
+/*                          Generate the matrix B. */
+
+				zmake_("ge", " ", " ", &m, &n, &b[b_offset], 
+					nmax, &bb[1], &ldb, &reset, &c_b1, (
+					ftnlen)2, (ftnlen)1, (ftnlen)1);
+
+				++nc;
+
+/*                          Save every datum before calling the */
+/*                          subroutine. */
+
+				*(unsigned char *)sides = *(unsigned char *)
+					side;
+				*(unsigned char *)uplos = *(unsigned char *)
+					uplo;
+				*(unsigned char *)tranas = *(unsigned char *)
+					transa;
+				*(unsigned char *)diags = *(unsigned char *)
+					diag;
+				ms = m;
+				ns = n;
+				als.r = alpha.r, als.i = alpha.i;
+				i__4 = laa;
+				for (i__ = 1; i__ <= i__4; ++i__) {
+				    i__5 = i__;
+				    i__6 = i__;
+				    as[i__5].r = aa[i__6].r, as[i__5].i = aa[
+					    i__6].i;
+/* L30: */
+				}
+				ldas = lda;
+				i__4 = lbb;
+				for (i__ = 1; i__ <= i__4; ++i__) {
+				    i__5 = i__;
+				    i__6 = i__;
+				    bs[i__5].r = bb[i__6].r, bs[i__5].i = bb[
+					    i__6].i;
+/* L40: */
+				}
+				ldbs = ldb;
+
+/*                          Call the subroutine. */
+
+				if (s_cmp(sname + 9, "mm", (ftnlen)2, (ftnlen)
+					2) == 0) {
+				    if (*trace) {
+					zprcn3_(ntra, &nc, sname, iorder, 
+						side, uplo, transa, diag, &m, 
+						&n, &alpha, &lda, &ldb, (
+						ftnlen)12, (ftnlen)1, (ftnlen)
+						1, (ftnlen)1, (ftnlen)1);
+				    }
+				    if (*rewi) {
+/*					al__1.aerr = 0;
+					al__1.aunit = *ntra;
+					f_rew(&al__1);*/
+				    }
+				    cztrmm_(iorder, side, uplo, transa, diag, 
+					    &m, &n, &alpha, &aa[1], &lda, &bb[
+					    1], &ldb, (ftnlen)1, (ftnlen)1, (
+					    ftnlen)1, (ftnlen)1);
+				} else if (s_cmp(sname + 9, "sm", (ftnlen)2, (
+					ftnlen)2) == 0) {
+				    if (*trace) {
+					zprcn3_(ntra, &nc, sname, iorder, 
+						side, uplo, transa, diag, &m, 
+						&n, &alpha, &lda, &ldb, (
+						ftnlen)12, (ftnlen)1, (ftnlen)
+						1, (ftnlen)1, (ftnlen)1);
+				    }
+				    if (*rewi) {
+/*					al__1.aerr = 0;
+					al__1.aunit = *ntra;
+					f_rew(&al__1);*/
+				    }
+				    cztrsm_(iorder, side, uplo, transa, diag, 
+					    &m, &n, &alpha, &aa[1], &lda, &bb[
+					    1], &ldb, (ftnlen)1, (ftnlen)1, (
+					    ftnlen)1, (ftnlen)1);
+				}
+
+/*                          Check if error-exit was taken incorrectly. */
+
+				if (! infoc_1.ok) {
+                                    printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n");
+				    *fatal = TRUE_;
+				    goto L150;
+				}
+
+/*                          See what data changed inside subroutines. */
+
+				isame[0] = *(unsigned char *)sides == *(
+					unsigned char *)side;
+				isame[1] = *(unsigned char *)uplos == *(
+					unsigned char *)uplo;
+				isame[2] = *(unsigned char *)tranas == *(
+					unsigned char *)transa;
+				isame[3] = *(unsigned char *)diags == *(
+					unsigned char *)diag;
+				isame[4] = ms == m;
+				isame[5] = ns == n;
+				isame[6] = als.r == alpha.r && als.i == 
+					alpha.i;
+				isame[7] = lze_(&as[1], &aa[1], &laa);
+				isame[8] = ldas == lda;
+				if (null) {
+				    isame[9] = lze_(&bs[1], &bb[1], &lbb);
+				} else {
+				    isame[9] = lzeres_("ge", " ", &m, &n, &bs[
+					    1], &bb[1], &ldb, (ftnlen)2, (
+					    ftnlen)1);
+				}
+				isame[10] = ldbs == ldb;
+
+/*                          If data was incorrectly changed, report and */
+/*                          return. */
+
+				same = TRUE_;
+				i__4 = nargs;
+				for (i__ = 1; i__ <= i__4; ++i__) {
+				    same = same && isame[i__ - 1];
+				    if (! isame[i__ - 1]) {
+                                        printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__);
+				    }
+/* L50: */
+				}
+				if (! same) {
+				    *fatal = TRUE_;
+				    goto L150;
+				}
+
+				if (! null) {
+				    if (s_cmp(sname + 9, "mm", (ftnlen)2, (
+					    ftnlen)2) == 0) {
+
+/*                                Check the result. */
+
+					if (left) {
+					    zmmch_(transa, "N", &m, &n, &m, &
+						    alpha, &a[a_offset], nmax,
+						     &b[b_offset], nmax, &
+						    c_b1, &c__[c_offset], 
+						    nmax, &ct[1], &g[1], &bb[
+						    1], &ldb, eps, &err, 
+						    fatal, nout, &c_true, (
+						    ftnlen)1, (ftnlen)1);
+					} else {
+					    zmmch_("N", transa, &m, &n, &n, &
+						    alpha, &b[b_offset], nmax,
+						     &a[a_offset], nmax, &
+						    c_b1, &c__[c_offset], 
+						    nmax, &ct[1], &g[1], &bb[
+						    1], &ldb, eps, &err, 
+						    fatal, nout, &c_true, (
+						    ftnlen)1, (ftnlen)1);
+					}
+				    } else if (s_cmp(sname + 9, "sm", (ftnlen)
+					    2, (ftnlen)2) == 0) {
+
+/*                                Compute approximation to original */
+/*                                matrix. */
+
+					i__4 = n;
+					for (j = 1; j <= i__4; ++j) {
+					    i__5 = m;
+					    for (i__ = 1; i__ <= i__5; ++i__) 
+						    {
+			  i__6 = i__ + j * c_dim1;
+			  i__7 = i__ + (j - 1) * ldb;
+			  c__[i__6].r = bb[i__7].r, c__[i__6].i = bb[i__7].i;
+			  i__6 = i__ + (j - 1) * ldb;
+			  i__7 = i__ + j * b_dim1;
+			  z__1.r = alpha.r * b[i__7].r - alpha.i * b[i__7].i, 
+				  z__1.i = alpha.r * b[i__7].i + alpha.i * b[
+				  i__7].r;
+			  bb[i__6].r = z__1.r, bb[i__6].i = z__1.i;
+/* L60: */
+					    }
+/* L70: */
+					}
+
+					if (left) {
+					    zmmch_(transa, "N", &m, &n, &m, &
+						    c_b2, &a[a_offset], nmax, 
+						    &c__[c_offset], nmax, &
+						    c_b1, &b[b_offset], nmax, 
+						    &ct[1], &g[1], &bb[1], &
+						    ldb, eps, &err, fatal, 
+						    nout, &c_false, (ftnlen)1,
+						     (ftnlen)1);
+					} else {
+					    zmmch_("N", transa, &m, &n, &n, &
+						    c_b2, &c__[c_offset], 
+						    nmax, &a[a_offset], nmax, 
+						    &c_b1, &b[b_offset], nmax,
+						     &ct[1], &g[1], &bb[1], &
+						    ldb, eps, &err, fatal, 
+						    nout, &c_false, (ftnlen)1,
+						     (ftnlen)1);
+					}
+				    }
+				    errmax = f2cmax(errmax,err);
+/*                             If got really bad answer, report and */
+/*                             return. */
+				    if (*fatal) {
+					goto L150;
+				    }
+				}
+
+/* L80: */
+			    }
+
+/* L90: */
+			}
+
+/* L100: */
+		    }
+
+/* L110: */
+		}
+
+/* L120: */
+	    }
+
+L130:
+	    ;
+	}
+
+/* L140: */
+    }
+
+/*     Report result. */
+/*     Every summary line ends with a real newline escape; the "suspect" */
+/*     lines previously ended in the two characters '/' 'n'. */
+
+    if (errmax < *thresh) {
+	if (*iorder == 0) {
+            printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+	}
+	if (*iorder == 1) {
+            printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+	}
+    } else {
+	if (*iorder == 0) {
+            printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+            printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******\n",errmax);
+	}
+	if (*iorder == 1) {
+            printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+            printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******\n",errmax);
+	}
+    }
+    goto L160;
+
+L150:
+    printf(" ******* %s FAILED ON CALL NUMBER:\n",sname);
+    if (*trace) {
+	zprcn3_(ntra, &nc, sname, iorder, side, uplo, transa, diag, &m, &n, &
+		alpha, &lda, &ldb, (ftnlen)12, (ftnlen)1, (ftnlen)1, (ftnlen)
+		1, (ftnlen)1);
+    }
+
+L160:
+    return 0;
+
+/* 9995 FORMAT(1X, I6, ': ', A12,'(', 4( '''', A1, ''',' ), 2( I3, ',' ), */
+/*     $     '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ')         ', */
+/*     $      '      .' ) */
+
+/*     End of ZCHK3. */
+
+} /* zchk3_ */
+
+
+/* Subroutine */ int zprcn3_(integer* nout, integer* nc, char* sname, integer* iorder, char* side, char* uplo, char* transa, char* diag, integer* m, integer* n, doublecomplex* alpha, integer* lda, integer* ldb, ftnlen sname_len, ftnlen side_len, ftnlen uplo_len, ftnlen transa_len, ftnlen diag_len)
+{
+/*  Prints a two-line description of one test call: the call number, the */
+/*  routine name and the CBLAS order/side/uplo names on the first line, */
+/*  and the transpose/diag names, m, n, alpha, lda and ldb on the second. */
+
+/*  nc      call number */
+/*  sname   routine name, printed with %s */
+/*  iorder  1 selects " CblasRowMajor", anything else " CblasColMajor" */
+/*  side    'L' selects "CblasLeft", anything else "CblasRight" */
+/*  uplo    'U' selects "CblasUpper", anything else "CblasLower" */
+/*  transa  'N' -> "CblasNoTrans", 'T' -> "CblasTrans", */
+/*          anything else -> "CblasConjTrans" */
+/*  diag    'N' selects "CblasNonUnit", anything else "CblasUnit" */
+/*  nout and the trailing ftnlen arguments are not used; output goes to */
+/*  stdout through printf.  Always returns 0. */
+
+    /* Local variables */
+    /* The labels are NUL-terminated string literals.  Copying exactly 14 */
+    /* characters into a char[14] left no room for a terminator, so "%s" */
+    /* could read past the end of the buffer. */
+    const char *ca, *cd, *cs, *cu, *crc;
+
+    if (*(unsigned char *)side == 'L') {
+	cs = "     CblasLeft";
+    } else {
+	cs = "    CblasRight";
+    }
+    if (*(unsigned char *)uplo == 'U') {
+	cu = "    CblasUpper";
+    } else {
+	cu = "    CblasLower";
+    }
+    if (*(unsigned char *)transa == 'N') {
+	ca = "  CblasNoTrans";
+    } else if (*(unsigned char *)transa == 'T') {
+	ca = "    CblasTrans";
+    } else {
+	ca = "CblasConjTrans";
+    }
+    if (*(unsigned char *)diag == 'N') {
+	cd = "  CblasNonUnit";
+    } else {
+	cd = "     CblasUnit";
+    }
+    if (*iorder == 1) {
+	crc = " CblasRowMajor";
+    } else {
+	crc = " CblasColMajor";
+    }
+    printf("%6d: %s %s %s %s\n",*nc,sname,crc,cs,cu);
+    printf("         %s %s %d %d (%4.1lf,%4.1lf) A %d B %d\n",ca,cd,*m,*n,alpha->r,alpha->i,*lda,*ldb);
+
+return 0;
+} /* zprcn3_ */
+
+
+/* Subroutine */ int zchk4_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* a, doublecomplex* aa, doublecomplex* as, doublecomplex* b, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, integer* iorder, ftnlen sname_len)
+{
+    /* Initialized data */
+
+    static char icht[2+1] = "NC";
+    static char ichu[2+1] = "UL";
+
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, 
+	    i__3, i__4, i__5, i__6, i__7;
+    doublecomplex z__1;
+
+    /* Local variables */
+    static doublecomplex beta;
+    static integer ldas, ldcs;
+    static logical same, isconj;
+    static doublecomplex bets;
+    static doublereal rals;
+    static logical tran, null;
+    static char uplo[1];
+    static integer i__, j, k, n;
+    static doublecomplex alpha;
+    static doublereal rbeta;
+    static logical isame[13];
+    extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen);
+    static integer nargs;
+    extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
+    static doublereal rbets;
+    static logical reset;
+    static char trans[1];
+    static logical upper;
+    static char uplos[1];
+    static integer ia, ib, jc, ma, na;
+    extern /* Subroutine */ int zprcn4_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen);
+    static integer nc;
+    extern /* Subroutine */ int zprcn6_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublereal*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen);
+    static integer ik, in, jj, lj, ks, ns;
+    static doublereal ralpha;
+    extern /* Subroutine */ int czherk_(integer*, char*, char*, integer*, integer*, doublereal*, doublecomplex*, integer*, doublereal*, doublecomplex*, integer*, ftnlen, ftnlen);
+    static doublereal errmax;
+    extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
+    static char transs[1], transt[1];
+    extern /* Subroutine */ int czsyrk_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
+    static integer laa, lda, lcc, ldc;
+    static doublecomplex als;
+    static integer ict, icu;
+    static doublereal err;
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
+
+/*  Tests ZHERK and ZSYRK. */
+
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  isconj is set when sname + 7 starts with "he"; czherk_ is then called */
+/*  with the real parts of alpha and beta (ralpha, rbeta), otherwise */
+/*  czsyrk_ is called with the complex alpha and beta and the 'C' */
+/*  transpose option is replaced by 'T'. */
+
+/*  For every pair (n, k) taken from idim(1..nidim), every trans option, */
+/*  uplo in "UL", alpha in alf(1..nalf) and beta in bet(1..nbet), the */
+/*  routine generates A and C with zmake_, saves a copy of every input, */
+/*  calls the routine under test, checks through isame[] that no input */
+/*  argument was changed, and checks the result column by column with */
+/*  zmmch_.  The largest err value seen is kept in errmax and compared */
+/*  with *thresh for the final summary, which is worded according to */
+/*  *iorder (0 column-major, 1 row-major). */
+
+/*  If infoc_1.ok is false after the call, or an input argument was */
+/*  changed, *fatal is set here; if *fatal is set after a zmmch_ check */
+/*  the routine also stops.  In all three cases the failing call is */
+/*  printed through zprcn6_ or zprcn4_.  Always returns 0. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+/*     .. Parameters .. */
+/*     .. Scalar Arguments .. */
+/*     .. Array Arguments .. */
+/*     .. Local Scalars .. */
+/*     .. Local Arrays .. */
+/*     .. External Functions .. */
+/*     .. External Subroutines .. */
+/*     .. Intrinsic Functions .. */
+/*     .. Scalars in Common .. */
+/*     .. Common blocks .. */
+/*     .. Data statements .. */
+    /* Parameter adjustments */
+    /* Shift the array pointers so the Fortran-style 1-based indices used */
+    /* below address the first element of each array. */
+    --idim;
+    --alf;
+    --bet;
+    --g;
+    --ct;
+    --cs;
+    --cc;
+    c_dim1 = *nmax;
+    c_offset = 1 + c_dim1 * 1;
+    c__ -= c_offset;
+    --bs;
+    --bb;
+    b_dim1 = *nmax;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    --as;
+    --aa;
+    a_dim1 = *nmax;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+
+    /* Function Body */
+/*     .. Executable Statements .. */
+    isconj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0;
+
+    nargs = 10;
+    nc = 0;
+    reset = TRUE_;
+    errmax = 0.;
+    rals = 1.;
+    rbets = 1.;
+
+    i__1 = *nidim;
+    for (in = 1; in <= i__1; ++in) {
+	n = idim[in];
+/*        Set LDC to 1 more than minimum value if room. */
+	ldc = n;
+	if (ldc < *nmax) {
+	    ++ldc;
+	}
+/*        Skip tests if not enough room. */
+	if (ldc > *nmax) {
+	    goto L100;
+	}
+	lcc = ldc * n;
+
+	i__2 = *nidim;
+	for (ik = 1; ik <= i__2; ++ik) {
+	    k = idim[ik];
+
+	    for (ict = 1; ict <= 2; ++ict) {
+		*(unsigned char *)trans = *(unsigned char *)&icht[ict - 1];
+		tran = *(unsigned char *)trans == 'C';
+		if (tran && ! isconj) {
+		    *(unsigned char *)trans = 'T';
+		}
+		if (tran) {
+		    ma = k;
+		    na = n;
+		} else {
+		    ma = n;
+		    na = k;
+		}
+/*              Set LDA to 1 more than minimum value if room. */
+		lda = ma;
+		if (lda < *nmax) {
+		    ++lda;
+		}
+/*              Skip tests if not enough room. */
+		if (lda > *nmax) {
+		    goto L80;
+		}
+		laa = lda * na;
+
+/*              Generate the matrix A. */
+
+		zmake_("ge", " ", " ", &ma, &na, &a[a_offset], nmax, &aa[1], &
+			lda, &reset, &c_b1, (ftnlen)2, (ftnlen)1, (ftnlen)1);
+
+		for (icu = 1; icu <= 2; ++icu) {
+		    *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1];
+		    upper = *(unsigned char *)uplo == 'U';
+
+		    i__3 = *nalf;
+		    for (ia = 1; ia <= i__3; ++ia) {
+			i__4 = ia;
+			alpha.r = alf[i__4].r, alpha.i = alf[i__4].i;
+			if (isconj) {
+			    ralpha = alpha.r;
+			    z__1.r = ralpha, z__1.i = 0.;
+			    alpha.r = z__1.r, alpha.i = z__1.i;
+			}
+
+			i__4 = *nbet;
+			for (ib = 1; ib <= i__4; ++ib) {
+			    i__5 = ib;
+			    beta.r = bet[i__5].r, beta.i = bet[i__5].i;
+			    if (isconj) {
+				rbeta = beta.r;
+				z__1.r = rbeta, z__1.i = 0.;
+				beta.r = z__1.r, beta.i = z__1.i;
+			    }
+			    null = n <= 0;
+			    if (isconj) {
+				null = null ||( (k <= 0 || ralpha == 0.) && 
+					rbeta == 1.);
+			    }
+
+/*                       Generate the matrix C. */
+
+			    zmake_(sname + 7, uplo, " ", &n, &n, &c__[
+				    c_offset], nmax, &cc[1], &ldc, &reset, &
+				    c_b1, (ftnlen)2, (ftnlen)1, (ftnlen)1);
+
+			    ++nc;
+
+/*                       Save every datum before calling the subroutine. */
+
+			    *(unsigned char *)uplos = *(unsigned char *)uplo;
+			    *(unsigned char *)transs = *(unsigned char *)
+				    trans;
+			    ns = n;
+			    ks = k;
+			    if (isconj) {
+				rals = ralpha;
+			    } else {
+				als.r = alpha.r, als.i = alpha.i;
+			    }
+			    i__5 = laa;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				i__6 = i__;
+				i__7 = i__;
+				as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7]
+					.i;
+/* L10: */
+			    }
+			    ldas = lda;
+			    if (isconj) {
+				rbets = rbeta;
+			    } else {
+				bets.r = beta.r, bets.i = beta.i;
+			    }
+			    i__5 = lcc;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				i__6 = i__;
+				i__7 = i__;
+				cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7]
+					.i;
+/* L20: */
+			    }
+			    ldcs = ldc;
+
+/*                       Call the subroutine. */
+
+			    if (isconj) {
+				if (*trace) {
+				    zprcn6_(ntra, &nc, sname, iorder, uplo, 
+					    trans, &n, &k, &ralpha, &lda, &
+					    rbeta, &ldc, (ftnlen)12, (ftnlen)
+					    1, (ftnlen)1);
+				}
+				if (*rewi) {
+/*				    al__1.aerr = 0;
+				    al__1.aunit = *ntra;
+				    f_rew(&al__1);*/
+				}
+				czherk_(iorder, uplo, trans, &n, &k, &ralpha, 
+					&aa[1], &lda, &rbeta, &cc[1], &ldc, (
+					ftnlen)1, (ftnlen)1);
+			    } else {
+				if (*trace) {
+				    zprcn4_(ntra, &nc, sname, iorder, uplo, 
+					    trans, &n, &k, &alpha, &lda, &
+					    beta, &ldc, (ftnlen)12, (ftnlen)1,
+					     (ftnlen)1);
+				}
+				if (*rewi) {
+/*				    al__1.aerr = 0;
+				    al__1.aunit = *ntra;
+				    f_rew(&al__1);*/
+				}
+				czsyrk_(iorder, uplo, trans, &n, &k, &alpha, &
+					aa[1], &lda, &beta, &cc[1], &ldc, (
+					ftnlen)1, (ftnlen)1);
+			    }
+
+/*                       Check if error-exit was taken incorrectly. */
+
+			    if (! infoc_1.ok) {
+                                printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n");
+				*fatal = TRUE_;
+				goto L120;
+			    }
+
+/*                       See what data changed inside subroutines. */
+
+			    isame[0] = *(unsigned char *)uplos == *(unsigned 
+				    char *)uplo;
+			    isame[1] = *(unsigned char *)transs == *(unsigned 
+				    char *)trans;
+			    isame[2] = ns == n;
+			    isame[3] = ks == k;
+			    if (isconj) {
+				isame[4] = rals == ralpha;
+			    } else {
+				isame[4] = als.r == alpha.r && als.i == 
+					alpha.i;
+			    }
+			    isame[5] = lze_(&as[1], &aa[1], &laa);
+			    isame[6] = ldas == lda;
+			    if (isconj) {
+				isame[7] = rbets == rbeta;
+			    } else {
+				isame[7] = bets.r == beta.r && bets.i == 
+					beta.i;
+			    }
+			    if (null) {
+				isame[8] = lze_(&cs[1], &cc[1], &lcc);
+			    } else {
+				isame[8] = lzeres_(sname + 7, uplo, &n, &n, &
+					cs[1], &cc[1], &ldc, (ftnlen)2, (
+					ftnlen)1);
+			    }
+			    isame[9] = ldcs == ldc;
+
+/*                       If data was incorrectly changed, report and */
+/*                       return. */
+
+			    same = TRUE_;
+			    i__5 = nargs;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				same = same && isame[i__ - 1];
+				if (! isame[i__ - 1]) {
+                                    printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__);
+				}
+/* L30: */
+			    }
+			    if (! same) {
+				*fatal = TRUE_;
+				goto L120;
+			    }
+
+			    if (! null) {
+
+/*                          Check the result column by column. */
+
+				if (isconj) {
+				    *(unsigned char *)transt = 'C';
+				} else {
+				    *(unsigned char *)transt = 'T';
+				}
+				jc = 1;
+				i__5 = n;
+				for (j = 1; j <= i__5; ++j) {
+				    if (upper) {
+					jj = 1;
+					lj = j;
+				    } else {
+					jj = j;
+					lj = n - j + 1;
+				    }
+				    if (tran) {
+					zmmch_(transt, "N", &lj, &c__1, &k, &
+						alpha, &a[jj * a_dim1 + 1], 
+						nmax, &a[j * a_dim1 + 1], 
+						nmax, &beta, &c__[jj + j * 
+						c_dim1], nmax, &ct[1], &g[1], 
+						&cc[jc], &ldc, eps, &err, 
+						fatal, nout, &c_true, (ftnlen)
+						1, (ftnlen)1);
+				    } else {
+					zmmch_("N", transt, &lj, &c__1, &k, &
+						alpha, &a[jj + a_dim1], nmax, 
+						&a[j + a_dim1], nmax, &beta, &
+						c__[jj + j * c_dim1], nmax, &
+						ct[1], &g[1], &cc[jc], &ldc, 
+						eps, &err, fatal, nout, &
+						c_true, (ftnlen)1, (ftnlen)1);
+				    }
+				    if (upper) {
+					jc += ldc;
+				    } else {
+					jc = jc + ldc + 1;
+				    }
+				    errmax = f2cmax(errmax,err);
+/*                             If got really bad answer, report and */
+/*                             return. */
+				    if (*fatal) {
+					goto L110;
+				    }
+/* L40: */
+				}
+			    }
+
+/* L50: */
+			}
+
+/* L60: */
+		    }
+
+/* L70: */
+		}
+
+L80:
+		;
+	    }
+
+/* L90: */
+	}
+
+L100:
+	;
+    }
+
+/*     Report result. */
+/*     Every summary line ends with a real newline escape; the "suspect" */
+/*     lines previously ended in the two characters '/' 'n'. */
+
+    if (errmax < *thresh) {
+	if (*iorder == 0) {
+            printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+	}
+	if (*iorder == 1) {
+            printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+	}
+    } else {
+	if (*iorder == 0) {
+            printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+            printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******\n",errmax);
+	}
+	if (*iorder == 1) {
+            printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+            printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******\n",errmax);
+	}
+    }
+    goto L130;
+
+L110:
+    if (n > 1) {
+        printf("      THESE ARE THE RESULTS FOR COLUMN %d:\n",j);
+    }
+
+L120:
+    printf(" ******* %s FAILED ON CALL NUMBER:\n",sname);
+    if (isconj) {
+	zprcn6_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &ralpha, &lda, 
+		&rbeta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1);
+    } else {
+	zprcn4_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, &
+		beta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1);
+    }
+
+L130:
+    return 0;
+
+/* 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */
+/*     $     F4.1, ', A,', I3, ',', F4.1, ', C,', I3, ')               ', */
+/*     $      '          .' ) */
+/* 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */
+/*     $      '(', F4.1, ',', F4.1, ') , A,', I3, ',(', F4.1, ',', F4.1, */
+/*     $      '), C,', I3, ')          .' ) */
+
+/*     End of ZCHK4. */
+
+} /* zchk4_ */
+
+
+/* Subroutine */ int zprcn4_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublecomplex* alpha, integer* lda, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len)
+{
+/*  Prints a two-line description of one test call with complex alpha */
+/*  and beta: the call number, the routine name and the CBLAS */
+/*  order/uplo/transpose names on the first line, and n, k, alpha, lda, */
+/*  beta and ldc on the second. */
+
+/*  nc      call number */
+/*  sname   routine name, printed with %s */
+/*  iorder  1 selects " CblasRowMajor", anything else " CblasColMajor" */
+/*  uplo    'U' selects "CblasUpper", anything else "CblasLower" */
+/*  transa  'N' -> "CblasNoTrans", 'T' -> "CblasTrans", */
+/*          anything else -> "CblasConjTrans" */
+/*  nout and the trailing ftnlen arguments are not used; output goes to */
+/*  stdout through printf.  Always returns 0. */
+
+    /* Local variables */
+    /* The labels are NUL-terminated string literals.  Copying exactly 14 */
+    /* characters into a char[14] left no room for a terminator, so "%s" */
+    /* could read past the end of the buffer. */
+    const char *ca, *cu, *crc;
+
+    if (*(unsigned char *)uplo == 'U') {
+	cu = "    CblasUpper";
+    } else {
+	cu = "    CblasLower";
+    }
+    if (*(unsigned char *)transa == 'N') {
+	ca = "  CblasNoTrans";
+    } else if (*(unsigned char *)transa == 'T') {
+	ca = "    CblasTrans";
+    } else {
+	ca = "CblasConjTrans";
+    }
+    if (*iorder == 1) {
+	crc = " CblasRowMajor";
+    } else {
+	crc = " CblasColMajor";
+    }
+    printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca);
+    printf("(          %d %d (%4.1lf,%4.1lf) A %d (%4.1lf,%4.1lf) C %d\n",*n,*k,alpha->r,alpha->i,*lda,beta->r,beta->i,*ldc);
+
+return 0;
+} /* zprcn4_ */
+
+
+
+/* Subroutine */ int zprcn6_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublereal* alpha, integer* lda, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len)
+{
+/*  Prints a two-line description of one test call with real alpha and */
+/*  beta: the call number, the routine name and the CBLAS */
+/*  order/uplo/transpose names on the first line, and n, k, alpha, lda, */
+/*  beta and ldc on the second. */
+
+/*  nc      call number */
+/*  sname   routine name, printed with %s */
+/*  iorder  1 selects " CblasRowMajor", anything else " CblasColMajor" */
+/*  uplo    'U' selects "CblasUpper", anything else "CblasLower" */
+/*  transa  'N' -> "CblasNoTrans", 'T' -> "CblasTrans", */
+/*          anything else -> "CblasConjTrans" */
+/*  nout and the trailing ftnlen arguments are not used; output goes to */
+/*  stdout through printf.  Always returns 0. */
+
+    /* Local variables */
+    /* The labels are NUL-terminated string literals.  Copying exactly 14 */
+    /* characters into a char[14] left no room for a terminator, so "%s" */
+    /* could read past the end of the buffer. */
+    const char *ca, *cu, *crc;
+
+    if (*(unsigned char *)uplo == 'U') {
+	cu = "    CblasUpper";
+    } else {
+	cu = "    CblasLower";
+    }
+    if (*(unsigned char *)transa == 'N') {
+	ca = "  CblasNoTrans";
+    } else if (*(unsigned char *)transa == 'T') {
+	ca = "    CblasTrans";
+    } else {
+	ca = "CblasConjTrans";
+    }
+    if (*iorder == 1) {
+	crc = " CblasRowMajor";
+    } else {
+	crc = " CblasColMajor";
+    }
+    printf("%6d: %s %s %s %s\n",*nc,sname,crc,cu,ca);
+    printf("(          %d %d %4.1lf A %d %4.1lf C %d\n",*n,*k,*alpha,*lda,*beta,*ldc);
+
+return 0;
+} /* zprcn6_ */
+
+
+/* Subroutine */ int zchk5_(char* sname, doublereal* eps, doublereal* thresh, integer* nout, integer* ntra, logical* trace, logical* rewi, logical* fatal, integer* nidim, integer* idim, integer* nalf, doublecomplex* alf, integer* nbet, doublecomplex* bet, integer* nmax, doublecomplex* ab, doublecomplex* aa, doublecomplex* as, doublecomplex* bb, doublecomplex* bs, doublecomplex* c__, doublecomplex* cc, doublecomplex* cs, doublecomplex* ct, doublereal* g, doublecomplex* w, integer* iorder, ftnlen sname_len)
+{
+    /* Initialized data */
+
+    static char icht[2+1] = "NC";
+    static char ichu[2+1] = "UL";
+
+    /* System generated locals */
+    integer c_dim1, c_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7, i__8;
+    doublecomplex z__1, z__2;
+
+    /* Local variables */
+    static integer jjab;
+    static doublecomplex beta;
+    static integer ldas, ldbs, ldcs;
+    static logical same, isconj;
+    static doublecomplex bets;
+    static logical tran, null;
+    static char uplo[1];
+    static integer i__, j, k, n;
+    static doublecomplex alpha;
+    static doublereal rbeta;
+    static logical isame[13];
+    extern /* Subroutine */ int zmake_(char*, char*, char*, integer*, integer*, doublecomplex*, integer*, doublecomplex*, integer*, logical*, doublecomplex*, ftnlen, ftnlen, ftnlen);
+    static integer nargs;
+    extern /* Subroutine */ int zmmch_(char*, char*, integer*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, doublereal*, doublecomplex*, integer*, doublereal*, doublereal*, logical*, integer*, logical*, ftnlen, ftnlen);
+    static doublereal rbets;
+    static logical reset;
+    static char trans[1];
+    static logical upper;
+    static char uplos[1];
+    static integer ia, ib, jc, ma, na, nc;
+    extern /* Subroutine */ int zprcn5_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, doublecomplex*, integer*, ftnlen, ftnlen, ftnlen);
+    extern /* Subroutine */ int zprcn7_(integer*, integer*, char*, integer*, char*, char*, integer*, integer*, doublecomplex*, integer*, integer*, doublereal*, integer*, ftnlen, ftnlen, ftnlen);
+    static integer ik, in, jj, lj, ks, ns;
+    static doublereal errmax;
+    extern logical lzeres_(char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
+    static char transs[1], transt[1];
+    extern /* Subroutine */ int czher2k_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublereal*, doublecomplex*, integer*, ftnlen, ftnlen);
+    static integer laa, lbb, lda, lcc, ldb, ldc;
+    static doublecomplex als;
+    static integer ict, icu;
+    extern /* Subroutine */ int czsyr2k_(integer*, char*, char*, integer*, integer*, doublecomplex*, doublecomplex*, integer*, doublecomplex*, integer*, doublecomplex*, doublecomplex*, integer*, ftnlen, ftnlen);
+    static doublereal err;
+    extern logical lze_(doublecomplex*, doublecomplex*, integer*);
+
+/*  Tests ZHER2K and ZSYR2K through the czher2k_ and czsyr2k_ entry points. */
+/*  The two characters at SNAME+7 select the path: he runs czher2k_, anything else czsyr2k_. */
+/*  Auxiliary routine for test program for Level 3 Blas. */
+/*  Always returns 0; failures are reported on stdout and through FATAL. */
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+/*  Arguments, as used by the code below: */
+/*     EPS, THRESH  - EPS is forwarded to zmmch_; the run passes when ERRMAX < THRESH. */
+/*     NOUT, NTRA   - forwarded to zmmch_ and to the zprcn5_ and zprcn7_ printers. */
+/*     TRACE, REWI  - TRACE prints every call; REWI is tested but its rewind code is commented out. */
+/*     FATAL        - set to TRUE_ here when a check fails; also handed to zmmch_. */
+/*     NIDIM, IDIM  - count of and values for the dimensions N and K. */
+/*     NALF, ALF and NBET, BET - count of and values for ALPHA and BETA. */
+/*     NMAX         - leading dimension of C__; AB is addressed with stride NMAX or 2*NMAX. */
+/*     AB, C__      - generated source data; AA, BB, CC are the arrays handed to the routine. */
+/*     AS, BS, CS   - copies of AA, BB, CC taken before the call, compared afterwards. */
+/*     CT, G, W     - work arrays used with zmmch_ for the column-by-column check. */
+/*     IORDER       - forwarded to the routine; 0 is reported as column-major, 1 as row-major. */
+    /* Parameter adjustments */
+    --idim;
+    --alf;
+    --bet;
+    --w;
+    --g;
+    --ct;
+    --cs;
+    --cc;
+    c_dim1 = *nmax;
+    c_offset = 1 + c_dim1 * 1;
+    c__ -= c_offset;
+    --bs;
+    --bb;
+    --as;
+    --aa;
+    --ab;
+
+    /* Function Body */
+/*     .. Executable Statements .. */
+    isconj = s_cmp(sname + 7, "he", (ftnlen)2, (ftnlen)2) == 0;
+
+    nargs = 12;
+    nc = 0;
+    reset = TRUE_;
+    errmax = 0.;
+
+    i__1 = *nidim;
+    for (in = 1; in <= i__1; ++in) {
+	n = idim[in];
+/*        Set LDC to 1 more than minimum value if room. */
+	ldc = n;
+	if (ldc < *nmax) {
+	    ++ldc;
+	}
+/*        Skip tests if not enough room. */
+	if (ldc > *nmax) {
+	    goto L130;
+	}
+	lcc = ldc * n;
+
+	i__2 = *nidim;
+	for (ik = 1; ik <= i__2; ++ik) {
+	    k = idim[ik];
+
+	    for (ict = 1; ict <= 2; ++ict) {
+		*(unsigned char *)trans = *(unsigned char *)&icht[ict - 1];
+		tran = *(unsigned char *)trans == 'C';
+		if (tran && ! isconj) {
+		    *(unsigned char *)trans = 'T';
+		}
+		if (tran) {
+		    ma = k;
+		    na = n;
+		} else {
+		    ma = n;
+		    na = k;
+		}
+/*              Set LDA to 1 more than minimum value if room. */
+		lda = ma;
+		if (lda < *nmax) {
+		    ++lda;
+		}
+/*              Skip tests if not enough room. */
+		if (lda > *nmax) {
+		    goto L110;
+		}
+		laa = lda * na;
+
+/*              Generate the matrix A. */
+
+		if (tran) {
+		    i__3 = *nmax << 1;
+		    zmake_("ge", " ", " ", &ma, &na, &ab[1], &i__3, &aa[1], &
+			    lda, &reset, &c_b1, (ftnlen)2, (ftnlen)1, (ftnlen)
+			    1);
+		} else {
+		    zmake_("ge", " ", " ", &ma, &na, &ab[1], nmax, &aa[1], &
+			    lda, &reset, &c_b1, (ftnlen)2, (ftnlen)1, (ftnlen)
+			    1);
+		}
+
+/*              Generate the matrix B. */
+
+		ldb = lda;
+		lbb = laa;
+		if (tran) {
+		    i__3 = *nmax << 1;
+		    zmake_("ge", " ", " ", &ma, &na, &ab[k + 1], &i__3, &bb[1]
+			    , &ldb, &reset, &c_b1, (ftnlen)2, (ftnlen)1, (
+			    ftnlen)1);
+		} else {
+		    zmake_("ge", " ", " ", &ma, &na, &ab[k * *nmax + 1], nmax,
+			     &bb[1], &ldb, &reset, &c_b1, (ftnlen)2, (ftnlen)
+			    1, (ftnlen)1);
+		}
+
+		for (icu = 1; icu <= 2; ++icu) {
+		    *(unsigned char *)uplo = *(unsigned char *)&ichu[icu - 1];
+		    upper = *(unsigned char *)uplo == 'U';
+
+		    i__3 = *nalf;
+		    for (ia = 1; ia <= i__3; ++ia) {
+			i__4 = ia;
+			alpha.r = alf[i__4].r, alpha.i = alf[i__4].i;
+
+			i__4 = *nbet;
+			for (ib = 1; ib <= i__4; ++ib) {
+			    i__5 = ib;
+			    beta.r = bet[i__5].r, beta.i = bet[i__5].i;
+			    if (isconj) {
+				rbeta = beta.r;
+				z__1.r = rbeta, z__1.i = 0.;
+				beta.r = z__1.r, beta.i = z__1.i;
+			    }
+			    null = n <= 0;
+			    if (isconj) {
+				null = null ||( (k <= 0 || (alpha.r == 0. && 
+					alpha.i == 0.)) && rbeta == 1.);
+			    }
+
+/*                       Generate the matrix C. */
+
+			    zmake_(sname + 7, uplo, " ", &n, &n, &c__[
+				    c_offset], nmax, &cc[1], &ldc, &reset, &
+				    c_b1, (ftnlen)2, (ftnlen)1, (ftnlen)1);
+
+			    ++nc;
+
+/*                       Save every datum before calling the subroutine. */
+
+			    *(unsigned char *)uplos = *(unsigned char *)uplo;
+			    *(unsigned char *)transs = *(unsigned char *)
+				    trans;
+			    ns = n;
+			    ks = k;
+			    als.r = alpha.r, als.i = alpha.i;
+			    i__5 = laa;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				i__6 = i__;
+				i__7 = i__;
+				as[i__6].r = aa[i__7].r, as[i__6].i = aa[i__7]
+					.i;
+/* L10: */
+			    }
+			    ldas = lda;
+			    i__5 = lbb;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				i__6 = i__;
+				i__7 = i__;
+				bs[i__6].r = bb[i__7].r, bs[i__6].i = bb[i__7]
+					.i;
+/* L20: */
+			    }
+			    ldbs = ldb;
+			    if (isconj) {
+				rbets = rbeta;
+			    } else {
+				bets.r = beta.r, bets.i = beta.i;
+			    }
+			    i__5 = lcc;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				i__6 = i__;
+				i__7 = i__;
+				cs[i__6].r = cc[i__7].r, cs[i__6].i = cc[i__7]
+					.i;
+/* L30: */
+			    }
+			    ldcs = ldc;
+
+/*                       Call the subroutine. */
+
+			    if (isconj) {
+				if (*trace) {
+				    zprcn7_(ntra, &nc, sname, iorder, uplo, 
+					    trans, &n, &k, &alpha, &lda, &ldb,
+					     &rbeta, &ldc, (ftnlen)12, (
+					    ftnlen)1, (ftnlen)1);
+				}
+				if (*rewi) {
+/*				    al__1.aerr = 0;
+				    al__1.aunit = *ntra;
+				    f_rew(&al__1);*/
+				}
+				czher2k_(iorder, uplo, trans, &n, &k, &alpha, 
+					&aa[1], &lda, &bb[1], &ldb, &rbeta, &
+					cc[1], &ldc, (ftnlen)1, (ftnlen)1);
+			    } else {
+				if (*trace) {
+				    zprcn5_(ntra, &nc, sname, iorder, uplo, 
+					    trans, &n, &k, &alpha, &lda, &ldb,
+					     &beta, &ldc, (ftnlen)12, (ftnlen)
+					    1, (ftnlen)1);
+				}
+				if (*rewi) {
+/*				    al__1.aerr = 0;
+				    al__1.aunit = *ntra;
+				    f_rew(&al__1);*/
+				}
+				czsyr2k_(iorder, uplo, trans, &n, &k, &alpha, 
+					&aa[1], &lda, &bb[1], &ldb, &beta, &
+					cc[1], &ldc, (ftnlen)1, (ftnlen)1);
+			    }
+
+/*                       Check if error-exit was taken incorrectly. */
+
+			    if (! infoc_1.ok) {
+                                printf("*** FATAL ERROR - ERROR-CALL MYEXIT TAKEN ON VALID CALL\n");
+				*fatal = TRUE_;
+				goto L150;
+			    }
+
+/*                       See what data changed inside subroutines. */
+
+			    isame[0] = *(unsigned char *)uplos == *(unsigned 
+				    char *)uplo;
+			    isame[1] = *(unsigned char *)transs == *(unsigned 
+				    char *)trans;
+			    isame[2] = ns == n;
+			    isame[3] = ks == k;
+			    isame[4] = als.r == alpha.r && als.i == alpha.i;
+			    isame[5] = lze_(&as[1], &aa[1], &laa);
+			    isame[6] = ldas == lda;
+			    isame[7] = lze_(&bs[1], &bb[1], &lbb);
+			    isame[8] = ldbs == ldb;
+			    if (isconj) {
+				isame[9] = rbets == rbeta;
+			    } else {
+				isame[9] = bets.r == beta.r && bets.i == 
+					beta.i;
+			    }
+			    if (null) {
+				isame[10] = lze_(&cs[1], &cc[1], &lcc);
+			    } else {
+				isame[10] = lzeres_("he", uplo, &n, &n, &cs[1]
+					, &cc[1], &ldc, (ftnlen)2, (ftnlen)1);
+			    }
+			    isame[11] = ldcs == ldc;
+
+/*                       If data was incorrectly changed, report and */
+/*                       return. */
+
+			    same = TRUE_;
+			    i__5 = nargs;
+			    for (i__ = 1; i__ <= i__5; ++i__) {
+				same = same && isame[i__ - 1];
+				if (! isame[i__ - 1]) {
+                                    printf(" ******* FATAL ERROR - PARAMETER NUMBER %d WAS CHANGED INCORRECTLY *******\n",i__);
+    				}
+/* L40: */
+			    }
+			    if (! same) {
+				*fatal = TRUE_;
+				goto L150;
+			    }
+
+			    if (! null) {
+
+/*                          Check the result column by column. */
+
+				if (isconj) {
+				    *(unsigned char *)transt = 'C';
+				} else {
+				    *(unsigned char *)transt = 'T';
+				}
+				jjab = 1;
+				jc = 1;
+				i__5 = n;
+				for (j = 1; j <= i__5; ++j) {
+				    if (upper) {
+					jj = 1;
+					lj = j;
+				    } else {
+					jj = j;
+					lj = n - j + 1;
+				    }
+				    if (tran) {
+					i__6 = k;
+					for (i__ = 1; i__ <= i__6; ++i__) {
+					    i__7 = i__;
+					    i__8 = ((j - 1) << 1) * *nmax + k + 
+						    i__;
+					    z__1.r = alpha.r * ab[i__8].r - 
+						    alpha.i * ab[i__8].i, 
+						    z__1.i = alpha.r * ab[
+						    i__8].i + alpha.i * ab[
+						    i__8].r;
+					    w[i__7].r = z__1.r, w[i__7].i = 
+						    z__1.i;
+					    if (isconj) {
+			  i__7 = k + i__;
+			  d_cnjg(&z__2, &alpha);
+			  i__8 = ((j - 1) << 1) * *nmax + i__;
+			  z__1.r = z__2.r * ab[i__8].r - z__2.i * ab[i__8].i, 
+				  z__1.i = z__2.r * ab[i__8].i + z__2.i * ab[
+				  i__8].r;
+			  w[i__7].r = z__1.r, w[i__7].i = z__1.i;
+					    } else {
+			  i__7 = k + i__;
+			  i__8 = ((j - 1) << 1) * *nmax + i__;
+			  z__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8]
+				  .i, z__1.i = alpha.r * ab[i__8].i + alpha.i 
+				  * ab[i__8].r;
+			  w[i__7].r = z__1.r, w[i__7].i = z__1.i;
+					    }
+/* L50: */
+					}
+					i__6 = k << 1;
+					i__7 = *nmax << 1;
+					i__8 = *nmax << 1;
+					zmmch_(transt, "N", &lj, &c__1, &i__6,
+						 &c_b2, &ab[jjab], &i__7, &w[
+						1], &i__8, &beta, &c__[jj + j 
+						* c_dim1], nmax, &ct[1], &g[1]
+						, &cc[jc], &ldc, eps, &err, 
+						fatal, nout, &c_true, (ftnlen)
+						1, (ftnlen)1);
+				    } else {
+					i__6 = k;
+					for (i__ = 1; i__ <= i__6; ++i__) {
+					    if (isconj) {
+			  i__7 = i__;
+			  d_cnjg(&z__2, &ab[(k + i__ - 1) * *nmax + j]);
+			  z__1.r = alpha.r * z__2.r - alpha.i * z__2.i, 
+				  z__1.i = alpha.r * z__2.i + alpha.i * 
+				  z__2.r;
+			  w[i__7].r = z__1.r, w[i__7].i = z__1.i;
+			  i__7 = k + i__;
+			  i__8 = (i__ - 1) * *nmax + j;
+			  z__2.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8]
+				  .i, z__2.i = alpha.r * ab[i__8].i + alpha.i 
+				  * ab[i__8].r;
+			  d_cnjg(&z__1, &z__2);
+			  w[i__7].r = z__1.r, w[i__7].i = z__1.i;
+					    } else {
+			  i__7 = i__;
+			  i__8 = (k + i__ - 1) * *nmax + j;
+			  z__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8]
+				  .i, z__1.i = alpha.r * ab[i__8].i + alpha.i 
+				  * ab[i__8].r;
+			  w[i__7].r = z__1.r, w[i__7].i = z__1.i;
+			  i__7 = k + i__;
+			  i__8 = (i__ - 1) * *nmax + j;
+			  z__1.r = alpha.r * ab[i__8].r - alpha.i * ab[i__8]
+				  .i, z__1.i = alpha.r * ab[i__8].i + alpha.i 
+				  * ab[i__8].r;
+			  w[i__7].r = z__1.r, w[i__7].i = z__1.i;
+					    }
+/* L60: */
+					}
+					i__6 = k << 1;
+					i__7 = *nmax << 1;
+					zmmch_("N", "N", &lj, &c__1, &i__6, &
+						c_b2, &ab[jj], nmax, &w[1], &
+						i__7, &beta, &c__[jj + j * 
+						c_dim1], nmax, &ct[1], &g[1], 
+						&cc[jc], &ldc, eps, &err, 
+						fatal, nout, &c_true, (ftnlen)
+						1, (ftnlen)1);
+				    }
+				    if (upper) {
+					jc += ldc;
+				    } else {
+					jc = jc + ldc + 1;
+					if (tran) {
+					    jjab += *nmax << 1;
+					}
+				    }
+				    errmax = f2cmax(errmax,err);
+/*                             If got really bad answer, report and */
+/*                             return. */
+				    if (*fatal) {
+					goto L140;
+				    }
+/* L70: */
+				}
+			    }
+
+/* L80: */
+			}
+
+/* L90: */
+		    }
+
+/* L100: */
+		}
+
+L110:
+		;
+	    }
+
+/* L120: */
+	}
+
+L130:
+	;
+    }
+
+/*     Report result. */
+
+    if (errmax < *thresh) {
+	if (*iorder == 0) {
+            printf("%s PASSED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+	}
+	if (*iorder == 1) {
+            printf("%s PASSED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+	}
+    } else {
+	if (*iorder == 0) {
+            printf("%s COMPLETED THE COLUMN-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+            printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******\n",errmax);
+	}
+	if (*iorder == 1) {
+            printf("%s COMPLETED THE ROW-MAJOR COMPUTATIONAL TESTS (%d CALLS)\n",sname,nc);
+            printf("***** BUT WITH MAXIMUM TEST RATIO %8.2f - SUSPECT *******\n",errmax);
+	}
+    }
+    goto L160;
+
+L140:
+    if (n > 1) {
+        printf("      THESE ARE THE RESULTS FOR COLUMN %d:\n",j);
+    }
+
+L150:
+    printf(" ******* %s FAILED ON CALL NUMBER:\n",sname);
+    if (isconj) {
+	zprcn7_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, &
+		ldb, &rbeta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1);
+    } else {
+	zprcn5_(nout, &nc, sname, iorder, uplo, trans, &n, &k, &alpha, &lda, &
+		ldb, &beta, &ldc, (ftnlen)12, (ftnlen)1, (ftnlen)1);
+    }
+
+L160:
+    return 0;
+
+/* 9994 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */
+/*     $      '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',', F4.1, */
+/*     $      ', C,', I3, ')           .' ) */
+/* 9993 FORMAT(1X, I6, ': ', A12,'(', 2( '''', A1, ''',' ), 2( I3, ',' ), */
+/*     $      '(', F4.1, ',', F4.1, '), A,', I3, ', B,', I3, ',(', F4.1, */
+/*     $      ',', F4.1, '), C,', I3, ')    .' ) */
+
+/*     End of ZCHK5. */
+
+} /* zchk5_ */
+
+
+/* Subroutine */ int zprcn5_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublecomplex* alpha, integer* lda, integer* ldb, doublecomplex* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len)
+{
+    /* Trace printer for a call with complex ALPHA and complex BETA: writes the call number, SNAME and the CBLAS names chosen from IORDER, UPLO and TRANSA, then N, K, ALPHA, LDA, LDB, BETA, LDC. NOUT and the *_len arguments are not read; always returns 0. */
+    static char ca[14], cu[14], crc[14];
+    /* Every s_copy below stores exactly 14 characters into a 14-byte buffer, so no buffer ends with a NUL. */
+    if (*(unsigned char *)uplo == 'U') {
+	s_copy(cu, "    CblasUpper", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(cu, "    CblasLower", (ftnlen)14, (ftnlen)14);
+    }
+    if (*(unsigned char *)transa == 'N') {
+	s_copy(ca, "  CblasNoTrans", (ftnlen)14, (ftnlen)14);
+    } else if (*(unsigned char *)transa == 'T') {
+	s_copy(ca, "    CblasTrans", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14);
+    }
+    if (*iorder == 1) {
+	s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
+    }
+    printf("%6d: %s %.14s %.14s %.14s\n",*nc,sname,crc,cu,ca);
+    printf("%d %d (%4.1lf,%4.1lf) , A, %d, B, %d, (%4.1lf,%4.1lf) , C, %d.\n",*n,*k,alpha->r,alpha->i,*lda,*ldb,beta->r,beta->i,*ldc);
+    /* The %.14s precision above stops printf at the end of each unterminated 14-byte buffer. */
+return 0;
+} /* zprcn5_ */
+
+
+
+/* Subroutine */ int zprcn7_(integer* nout, integer* nc, char* sname, integer* iorder, char* uplo, char* transa, integer* n, integer* k, doublecomplex* alpha, integer* lda, integer* ldb, doublereal* beta, integer* ldc, ftnlen sname_len, ftnlen uplo_len, ftnlen transa_len)
+{
+    /* Trace printer for a call with complex ALPHA and real BETA: writes the call number, SNAME and the CBLAS names chosen from IORDER, UPLO and TRANSA, then N, K, ALPHA, LDA, LDB, BETA, LDC. */
+    /* NOUT and the three *_len arguments are not read by this routine; output always goes to stdout. Always returns 0. */
+    static char ca[14], cu[14], crc[14];
+    /* Every s_copy below stores exactly 14 characters into a 14-byte buffer, so no buffer ends with a NUL. */
+    if (*(unsigned char *)uplo == 'U') {
+	s_copy(cu, "    CblasUpper", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(cu, "    CblasLower", (ftnlen)14, (ftnlen)14);
+    }
+    if (*(unsigned char *)transa == 'N') {
+	s_copy(ca, "  CblasNoTrans", (ftnlen)14, (ftnlen)14);
+    } else if (*(unsigned char *)transa == 'T') {
+	s_copy(ca, "    CblasTrans", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(ca, "CblasConjTrans", (ftnlen)14, (ftnlen)14);
+    }
+    if (*iorder == 1) {
+	s_copy(crc, " CblasRowMajor", (ftnlen)14, (ftnlen)14);
+    } else {
+	s_copy(crc, " CblasColMajor", (ftnlen)14, (ftnlen)14);
+    }
+    printf("%6d: %s %.14s %.14s %.14s\n",*nc,sname,crc,cu,ca);
+    printf("%d %d (%4.1lf,%4.1lf), A, %d, B, %d, %4.1lf, C, %d.\n",*n,*k,alpha->r,alpha->i,*lda,*ldb,*beta,*ldc);
+    /* The %.14s precision above stops printf at the end of each unterminated 14-byte buffer. */
+return 0;
+} /* zprcn7_ */
+
+
+/* Subroutine */ int zmake_(char* type__, char* uplo, char* diag, integer* m, integer* n, doublecomplex* a, integer* nmax, doublecomplex* aa, integer* lda, logical* reset, doublecomplex* transl, ftnlen type_len, ftnlen uplo_len, ftnlen diag_len)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2, i__3, i__4;
+    doublereal d__1;
+    doublecomplex z__1, z__2;
+
+    /* Local variables */
+    static integer ibeg, iend;
+    extern /* Double Complex */ VOID zbeg_(doublecomplex*, logical*);
+    static logical unit;
+    static integer i__, j;
+    static logical lower, upper;
+    static integer jj;
+    static logical gen, her, tri, sym;
+
+/*  ZMAKE: test-matrix generator. Always returns 0. */
+/*  Generates values for an M by N matrix A. */
+/*  Stores the values in the array AA in the data structure required */
+/*  by the routine, with unwanted elements set to rogue value. */
+/*  Stage 1 fills the NMAX-strided array A; stage 2 copies it into AA with stride LDA. */
+/*  TYPE is 'ge', 'he', 'sy' or 'tr'. */
+/*  UPLO (U or L) selects the stored triangle for he, sy and tr; DIAG equal to U with tr forces a unit diagonal. */
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+/*  Arguments, as used by the code below: */
+/*     M, N    - number of rows and columns generated. */
+/*     A, NMAX - full matrix, addressed 1-based as a[i + j*NMAX] after the pointer shift. */
+/*     AA, LDA - output storage, addressed 1-based as aa[i + (j-1)*LDA]. */
+/*     RESET   - forwarded unchanged to zbeg_, which supplies each generated value. */
+/*     TRANSL  - complex constant added to every value obtained from zbeg_. */
+/*     TYPE_LEN, UPLO_LEN, DIAG_LEN - string lengths; not read by this routine. */
+/*     The rogue value written to unwanted entries of AA is (-1e10, 1e10). */
+    /* Parameter adjustments */
+    a_dim1 = *nmax;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --aa;
+
+    /* Function Body */
+    gen = s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0;
+    her = s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) == 0;
+    sym = s_cmp(type__, "sy", (ftnlen)2, (ftnlen)2) == 0;
+    tri = s_cmp(type__, "tr", (ftnlen)2, (ftnlen)2) == 0;
+    upper = (her || sym || tri) && *(unsigned char *)uplo == 'U';
+    lower = (her || sym || tri) && *(unsigned char *)uplo == 'L';
+    unit = tri && *(unsigned char *)diag == 'U';
+
+/*     Generate data in array A. */
+
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+	i__2 = *m;
+	for (i__ = 1; i__ <= i__2; ++i__) {
+	    if (gen || (upper && i__ <= j) || (lower && i__ >= j)) { /* only the stored part draws a value from zbeg_ */
+		i__3 = i__ + j * a_dim1;
+		zbeg_(&z__2, reset);
+		z__1.r = z__2.r + transl->r, z__1.i = z__2.i + transl->i;
+		a[i__3].r = z__1.r, a[i__3].i = z__1.i;
+		if (i__ != j) {
+/*                 Set some elements to zero */
+		    if (*n > 3 && j == *n / 2) {
+			i__3 = i__ + j * a_dim1;
+			a[i__3].r = 0., a[i__3].i = 0.;
+		    }
+		    if (her) { /* mirror entry is the conjugate */
+			i__3 = j + i__ * a_dim1;
+			d_cnjg(&z__1, &a[i__ + j * a_dim1]);
+			a[i__3].r = z__1.r, a[i__3].i = z__1.i;
+		    } else if (sym) { /* mirror entry is a plain copy */
+			i__3 = j + i__ * a_dim1;
+			i__4 = i__ + j * a_dim1;
+			a[i__3].r = a[i__4].r, a[i__3].i = a[i__4].i;
+		    } else if (tri) { /* mirror entry is zero */
+			i__3 = j + i__ * a_dim1;
+			a[i__3].r = 0., a[i__3].i = 0.;
+		    }
+		}
+	    }
+/* L10: */
+	}
+	if (her) { /* keep only the real part on the diagonal */
+	    i__2 = j + j * a_dim1;
+	    i__3 = j + j * a_dim1;
+	    d__1 = a[i__3].r;
+	    z__1.r = d__1, z__1.i = 0.;
+	    a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+	}
+	if (tri) { /* add one to the diagonal entry */
+	    i__2 = j + j * a_dim1;
+	    i__3 = j + j * a_dim1;
+	    z__1.r = a[i__3].r + 1., z__1.i = a[i__3].i + 0.;
+	    a[i__2].r = z__1.r, a[i__2].i = z__1.i;
+	}
+	if (unit) { /* unit-triangular: diagonal becomes exactly one */
+	    i__2 = j + j * a_dim1;
+	    a[i__2].r = 1., a[i__2].i = 0.;
+	}
+/* L20: */
+    }
+
+/*     Store elements in array AS in data structure required by routine. */
+
+    if (s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0) {
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = *m;
+	    for (i__ = 1; i__ <= i__2; ++i__) {
+		i__3 = i__ + (j - 1) * *lda;
+		i__4 = i__ + j * a_dim1;
+		aa[i__3].r = a[i__4].r, aa[i__3].i = a[i__4].i;
+/* L30: */
+	    }
+	    i__2 = *lda;
+	    for (i__ = *m + 1; i__ <= i__2; ++i__) { /* rows M+1..LDA of each column are padding */
+		i__3 = i__ + (j - 1) * *lda;
+		aa[i__3].r = -1e10, aa[i__3].i = 1e10;
+/* L40: */
+	    }
+/* L50: */
+	}
+    } else if (s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(type__,
+	     "sy", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(type__, "tr", (ftnlen)
+	    2, (ftnlen)2) == 0) {
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    if (upper) { /* rows ibeg..iend of column j are copied; a unit diagonal is excluded */
+		ibeg = 1;
+		if (unit) {
+		    iend = j - 1;
+		} else {
+		    iend = j;
+		}
+	    } else {
+		if (unit) {
+		    ibeg = j + 1;
+		} else {
+		    ibeg = j;
+		}
+		iend = *n;
+	    }
+	    i__2 = ibeg - 1;
+	    for (i__ = 1; i__ <= i__2; ++i__) { /* rogue values above the copied range */
+		i__3 = i__ + (j - 1) * *lda;
+		aa[i__3].r = -1e10, aa[i__3].i = 1e10;
+/* L60: */
+	    }
+	    i__2 = iend;
+	    for (i__ = ibeg; i__ <= i__2; ++i__) {
+		i__3 = i__ + (j - 1) * *lda;
+		i__4 = i__ + j * a_dim1;
+		aa[i__3].r = a[i__4].r, aa[i__3].i = a[i__4].i;
+/* L70: */
+	    }
+	    i__2 = *lda;
+	    for (i__ = iend + 1; i__ <= i__2; ++i__) { /* rogue values below the copied range, up to LDA */
+		i__3 = i__ + (j - 1) * *lda;
+		aa[i__3].r = -1e10, aa[i__3].i = 1e10;
+/* L80: */
+	    }
+	    if (her) { /* the imaginary part of the stored diagonal gets a rogue value */
+		jj = j + (j - 1) * *lda;
+		i__2 = jj;
+		i__3 = jj;
+		d__1 = aa[i__3].r;
+		z__1.r = d__1, z__1.i = -1e10;
+		aa[i__2].r = z__1.r, aa[i__2].i = z__1.i;
+	    }
+/* L90: */
+	}
+    }
+    return 0;
+
+/*     End of ZMAKE. */
+
+} /* zmake_ */
+
+/* Subroutine */ int zmmch_(char* transa, char* transb, integer* m, integer* n, integer* kk, doublecomplex* alpha, doublecomplex* a, integer* lda, doublecomplex* b, integer* ldb, doublecomplex* beta, doublecomplex* c__, integer* ldc, doublecomplex* ct, doublereal* g, doublecomplex* cc, integer* ldcc, doublereal* eps, doublereal* err, logical* fatal, integer* nout, logical* mv, ftnlen transa_len, ftnlen transb_len)
+{
+
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, cc_dim1, 
+	    cc_offset, i__1, i__2, i__3, i__4, i__5, i__6, i__7;
+    doublereal d__1, d__2, d__3, d__4, d__5, d__6;
+    doublecomplex z__1, z__2, z__3, z__4;
+
+    double sqrt(double);
+    /* Local variables */
+    static doublereal erri;
+    static integer i__, j, k;
+    static logical trana, tranb, ctrana, ctranb;
+
+/*  Checks the results of the computational tests. */
+
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+/*     .. Parameters .. */
+/*     .. Scalar Arguments .. */
+/*     .. Array Arguments .. */
+/*     .. Local Scalars .. */
+/*     .. Intrinsic Functions .. */
+/*     .. Statement Functions .. */
+/*     .. Statement Function definitions .. */
+/*     .. Executable Statements .. */
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    c_dim1 = *ldc;
+    c_offset = 1 + c_dim1 * 1;
+    c__ -= c_offset;
+    --ct;
+    --g;
+    cc_dim1 = *ldcc;
+    cc_offset = 1 + cc_dim1 * 1;
+    cc -= cc_offset;
+
+    /* Function Body */
+    trana = *(unsigned char *)transa == 'T' || *(unsigned char *)transa == 
+	    'C';
+    tranb = *(unsigned char *)transb == 'T' || *(unsigned char *)transb == 
+	    'C';
+    ctrana = *(unsigned char *)transa == 'C';
+    ctranb = *(unsigned char *)transb == 'C';
+
+/*     Compute expected result, one column at a time, in CT using data */
+/*     in A, B and C. */
+/*     Compute gauges in G. */
+
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+
+	i__2 = *m;
+	for (i__ = 1; i__ <= i__2; ++i__) {
+	    i__3 = i__;
+	    ct[i__3].r = 0., ct[i__3].i = 0.;
+	    g[i__] = 0.;
+/* L10: */
+	}
+	if (! trana && ! tranb) {
+	    i__2 = *kk;
+	    for (k = 1; k <= i__2; ++k) {
+		i__3 = *m;
+		for (i__ = 1; i__ <= i__3; ++i__) {
+		    i__4 = i__;
+		    i__5 = i__;
+		    i__6 = i__ + k * a_dim1;
+		    i__7 = k + j * b_dim1;
+		    z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7].i, 
+			    z__2.i = a[i__6].r * b[i__7].i + a[i__6].i * b[
+			    i__7].r;
+		    z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + 
+			    z__2.i;
+		    ct[i__4].r = z__1.r, ct[i__4].i = z__1.i;
+		    i__4 = i__ + k * a_dim1;
+		    i__5 = k + j * b_dim1;
+		    g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = d_imag(
+			    &a[i__ + k * a_dim1]), abs(d__2))) * ((d__3 = b[
+			    i__5].r, abs(d__3)) + (d__4 = d_imag(&b[k + j * 
+			    b_dim1]), abs(d__4)));
+/* L20: */
+		}
+/* L30: */
+	    }
+	} else if (trana && ! tranb) {
+	    if (ctrana) {
+		i__2 = *kk;
+		for (k = 1; k <= i__2; ++k) {
+		    i__3 = *m;
+		    for (i__ = 1; i__ <= i__3; ++i__) {
+			i__4 = i__;
+			i__5 = i__;
+			d_cnjg(&z__3, &a[k + i__ * a_dim1]);
+			i__6 = k + j * b_dim1;
+			z__2.r = z__3.r * b[i__6].r - z__3.i * b[i__6].i, 
+				z__2.i = z__3.r * b[i__6].i + z__3.i * b[i__6]
+				.r;
+			z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + 
+				z__2.i;
+			ct[i__4].r = z__1.r, ct[i__4].i = z__1.i;
+			i__4 = k + i__ * a_dim1;
+			i__5 = k + j * b_dim1;
+			g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = 
+				d_imag(&a[k + i__ * a_dim1]), abs(d__2))) * ((
+				d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag(
+				&b[k + j * b_dim1]), abs(d__4)));
+/* L40: */
+		    }
+/* L50: */
+		}
+	    } else {
+		i__2 = *kk;
+		for (k = 1; k <= i__2; ++k) {
+		    i__3 = *m;
+		    for (i__ = 1; i__ <= i__3; ++i__) {
+			i__4 = i__;
+			i__5 = i__;
+			i__6 = k + i__ * a_dim1;
+			i__7 = k + j * b_dim1;
+			z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7]
+				.i, z__2.i = a[i__6].r * b[i__7].i + a[i__6]
+				.i * b[i__7].r;
+			z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + 
+				z__2.i;
+			ct[i__4].r = z__1.r, ct[i__4].i = z__1.i;
+			i__4 = k + i__ * a_dim1;
+			i__5 = k + j * b_dim1;
+			g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = 
+				d_imag(&a[k + i__ * a_dim1]), abs(d__2))) * ((
+				d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag(
+				&b[k + j * b_dim1]), abs(d__4)));
+/* L60: */
+		    }
+/* L70: */
+		}
+	    }
+	} else if (! trana && tranb) {
+	    if (ctranb) {
+		i__2 = *kk;
+		for (k = 1; k <= i__2; ++k) {
+		    i__3 = *m;
+		    for (i__ = 1; i__ <= i__3; ++i__) {
+			i__4 = i__;
+			i__5 = i__;
+			i__6 = i__ + k * a_dim1;
+			d_cnjg(&z__3, &b[j + k * b_dim1]);
+			z__2.r = a[i__6].r * z__3.r - a[i__6].i * z__3.i, 
+				z__2.i = a[i__6].r * z__3.i + a[i__6].i * 
+				z__3.r;
+			z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + 
+				z__2.i;
+			ct[i__4].r = z__1.r, ct[i__4].i = z__1.i;
+			i__4 = i__ + k * a_dim1;
+			i__5 = j + k * b_dim1;
+			g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = 
+				d_imag(&a[i__ + k * a_dim1]), abs(d__2))) * ((
+				d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag(
+				&b[j + k * b_dim1]), abs(d__4)));
+/* L80: */
+		    }
+/* L90: */
+		}
+	    } else {
+		i__2 = *kk;
+		for (k = 1; k <= i__2; ++k) {
+		    i__3 = *m;
+		    for (i__ = 1; i__ <= i__3; ++i__) {
+			i__4 = i__;
+			i__5 = i__;
+			i__6 = i__ + k * a_dim1;
+			i__7 = j + k * b_dim1;
+			z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[i__7]
+				.i, z__2.i = a[i__6].r * b[i__7].i + a[i__6]
+				.i * b[i__7].r;
+			z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i + 
+				z__2.i;
+			ct[i__4].r = z__1.r, ct[i__4].i = z__1.i;
+			i__4 = i__ + k * a_dim1;
+			i__5 = j + k * b_dim1;
+			g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 = 
+				d_imag(&a[i__ + k * a_dim1]), abs(d__2))) * ((
+				d__3 = b[i__5].r, abs(d__3)) + (d__4 = d_imag(
+				&b[j + k * b_dim1]), abs(d__4)));
+/* L100: */
+		    }
+/* L110: */
+		}
+	    }
+	} else if (trana && tranb) {
+	    if (ctrana) {
+		if (ctranb) {
+		    i__2 = *kk;
+		    for (k = 1; k <= i__2; ++k) {
+			i__3 = *m;
+			for (i__ = 1; i__ <= i__3; ++i__) {
+			    i__4 = i__;
+			    i__5 = i__;
+			    d_cnjg(&z__3, &a[k + i__ * a_dim1]);
+			    d_cnjg(&z__4, &b[j + k * b_dim1]);
+			    z__2.r = z__3.r * z__4.r - z__3.i * z__4.i, 
+				    z__2.i = z__3.r * z__4.i + z__3.i * 
+				    z__4.r;
+			    z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i 
+				    + z__2.i;
+			    ct[i__4].r = z__1.r, ct[i__4].i = z__1.i;
+			    i__4 = k + i__ * a_dim1;
+			    i__5 = j + k * b_dim1;
+			    g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 =
+				     d_imag(&a[k + i__ * a_dim1]), abs(d__2)))
+				     * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 
+				    = d_imag(&b[j + k * b_dim1]), abs(d__4)));
+/* L120: */
+			}
+/* L130: */
+		    }
+		} else {
+		    i__2 = *kk;
+		    for (k = 1; k <= i__2; ++k) {
+			i__3 = *m;
+			for (i__ = 1; i__ <= i__3; ++i__) {
+			    i__4 = i__;
+			    i__5 = i__;
+			    d_cnjg(&z__3, &a[k + i__ * a_dim1]);
+			    i__6 = j + k * b_dim1;
+			    z__2.r = z__3.r * b[i__6].r - z__3.i * b[i__6].i, 
+				    z__2.i = z__3.r * b[i__6].i + z__3.i * b[
+				    i__6].r;
+			    z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i 
+				    + z__2.i;
+			    ct[i__4].r = z__1.r, ct[i__4].i = z__1.i;
+			    i__4 = k + i__ * a_dim1;
+			    i__5 = j + k * b_dim1;
+			    g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 =
+				     d_imag(&a[k + i__ * a_dim1]), abs(d__2)))
+				     * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 
+				    = d_imag(&b[j + k * b_dim1]), abs(d__4)));
+/* L140: */
+			}
+/* L150: */
+		    }
+		}
+	    } else {
+		if (ctranb) {
+		    i__2 = *kk;
+		    for (k = 1; k <= i__2; ++k) {
+			i__3 = *m;
+			for (i__ = 1; i__ <= i__3; ++i__) {
+			    i__4 = i__;
+			    i__5 = i__;
+			    i__6 = k + i__ * a_dim1;
+			    d_cnjg(&z__3, &b[j + k * b_dim1]);
+			    z__2.r = a[i__6].r * z__3.r - a[i__6].i * z__3.i, 
+				    z__2.i = a[i__6].r * z__3.i + a[i__6].i * 
+				    z__3.r;
+			    z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i 
+				    + z__2.i;
+			    ct[i__4].r = z__1.r, ct[i__4].i = z__1.i;
+			    i__4 = k + i__ * a_dim1;
+			    i__5 = j + k * b_dim1;
+			    g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 =
+				     d_imag(&a[k + i__ * a_dim1]), abs(d__2)))
+				     * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 
+				    = d_imag(&b[j + k * b_dim1]), abs(d__4)));
+/* L160: */
+			}
+/* L170: */
+		    }
+		} else {
+		    i__2 = *kk;
+		    for (k = 1; k <= i__2; ++k) {
+			i__3 = *m;
+			for (i__ = 1; i__ <= i__3; ++i__) {
+			    i__4 = i__;
+			    i__5 = i__;
+			    i__6 = k + i__ * a_dim1;
+			    i__7 = j + k * b_dim1;
+			    z__2.r = a[i__6].r * b[i__7].r - a[i__6].i * b[
+				    i__7].i, z__2.i = a[i__6].r * b[i__7].i + 
+				    a[i__6].i * b[i__7].r;
+			    z__1.r = ct[i__5].r + z__2.r, z__1.i = ct[i__5].i 
+				    + z__2.i;
+			    ct[i__4].r = z__1.r, ct[i__4].i = z__1.i;
+			    i__4 = k + i__ * a_dim1;
+			    i__5 = j + k * b_dim1;
+			    g[i__] += ((d__1 = a[i__4].r, abs(d__1)) + (d__2 =
+				     d_imag(&a[k + i__ * a_dim1]), abs(d__2)))
+				     * ((d__3 = b[i__5].r, abs(d__3)) + (d__4 
+				    = d_imag(&b[j + k * b_dim1]), abs(d__4)));
+/* L180: */
+			}
+/* L190: */
+		    }
+		}
+	    }
+	}
+	i__2 = *m;
+	for (i__ = 1; i__ <= i__2; ++i__) {
+	    i__3 = i__;
+	    i__4 = i__;
+	    z__2.r = alpha->r * ct[i__4].r - alpha->i * ct[i__4].i, z__2.i = 
+		    alpha->r * ct[i__4].i + alpha->i * ct[i__4].r;
+	    i__5 = i__ + j * c_dim1;
+	    z__3.r = beta->r * c__[i__5].r - beta->i * c__[i__5].i, z__3.i = 
+		    beta->r * c__[i__5].i + beta->i * c__[i__5].r;
+	    z__1.r = z__2.r + z__3.r, z__1.i = z__2.i + z__3.i;
+	    ct[i__3].r = z__1.r, ct[i__3].i = z__1.i;
+	    i__3 = i__ + j * c_dim1;
+	    g[i__] = ((d__1 = alpha->r, abs(d__1)) + (d__2 = d_imag(alpha), 
+		    abs(d__2))) * g[i__] + ((d__3 = beta->r, abs(d__3)) + (
+		    d__4 = d_imag(beta), abs(d__4))) * ((d__5 = c__[i__3].r, 
+		    abs(d__5)) + (d__6 = d_imag(&c__[i__ + j * c_dim1]), abs(
+		    d__6)));
+/* L200: */
+	}
+
+/*        Compute the error ratio for this result. */
+
+	*err = 0.;
+	i__2 = *m;
+	for (i__ = 1; i__ <= i__2; ++i__) {
+	    i__3 = i__;
+	    i__4 = i__ + j * cc_dim1;
+	    z__2.r = ct[i__3].r - cc[i__4].r, z__2.i = ct[i__3].i - cc[i__4]
+		    .i;
+	    z__1.r = z__2.r, z__1.i = z__2.i;
+	    erri = ((d__1 = z__1.r, abs(d__1)) + (d__2 = d_imag(&z__1), abs(
+		    d__2))) / *eps;
+	    if (g[i__] != 0.) {
+		erri /= g[i__];
+	    }
+	    *err = f2cmax(*err,erri);
+	    if (*err * sqrt(*eps) >= 1.) {
+		goto L230;
+	    }
+/* L210: */
+	}
+
+/* L220: */
+    }
+
+/*     If the loop completes, all results are at least half accurate. */
+    goto L250;
+
+/*     Report fatal error. */
+
+L230:
+    *fatal = TRUE_;
+    printf(" ******* FATAL ERROR - COMPUTED RESULT IS LESS THAN HALF ACCURATE *******\n");
+    printf("         EXPECTED RESULT                    COMPUTED RESULT\n");
+    i__1 = *m;
+    for (i__ = 1; i__ <= i__1; ++i__) {
+	if (*mv) {
+            printf("%7d (%15.6g,%15.6g) (%15.6g,%15.6g)\n",i__,ct[i__].r,ct[i__].i,cc[i__+j*cc_dim1].r,cc[i__+j*cc_dim1].i);
+        } else {
+            printf("%7d (%15.6g,%15.6g) (%15.6g,%15.6g)\n",i__,cc[i__+j*cc_dim1].r,cc[i__+j*cc_dim1].i,ct[i__].r,ct[i__].i);
+	}
+/* L240: */
+    }
+    if (*n > 1) {
+        printf("      THESE ARE THE RESULTS FOR COLUMN %d\n",j);
+    }
+
+L250:
+    return 0;
+
+
+/*     End of ZMMCH. */
+
+} /* zmmch_ */
+
+logical lze_(doublecomplex* ri, doublecomplex* rj, integer* lr)
+{
+    /* Number of leading elements to compare, read once from *lr. */
+    integer count;
+
+    /* Scan position; kept static as in the original translation. */
+    static integer pos;
+
+/*  Tests if two arrays are identical. */
+
+/*  Arguments: ri and rj are the arrays to compare; *lr is the number of */
+/*  leading elements examined. */
+
+/*  Returns TRUE_ when the real and imaginary parts of ri and rj compare */
+/*  equal at each of the first *lr positions, and FALSE_ at the first */
+/*  position where either part differs. The comparison is exact (!=). */
+
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+/*     .. Scalar Arguments .. */
+/*     .. Array Arguments .. */
+/*     .. Local Scalars .. */
+/*     .. Executable Statements .. */
+
+    /* Shift both base pointers so the arrays are indexed from 1. */
+    --ri;
+    --rj;
+
+    count = *lr;
+    for (pos = 1; pos <= count; ++pos) {
+	/* Stop at the first element whose parts do not both match. */
+	if (ri[pos].r != rj[pos].r || ri[pos].i != rj[pos].i) {
+	    return FALSE_;
+	}
+    }
+
+    /* Every compared element matched (also reached when *lr < 1). */
+    return TRUE_;
+
+/*     End of LZE. */
+
+} /* lze_ */
+
+logical lzeres_(char* type__, char* uplo, integer* m, integer* n, doublecomplex *aa, doublecomplex* as, integer* lda, ftnlen type_len, ftnlen uplo_len)
+{
+    /* Leading dimension, column count, and flat 0-based element offset. */
+    integer ld, ncols, k;
+
+    /* Loop state; kept static as in the original translation. */
+    static integer ibeg, iend, i__, j;
+    static logical upper;
+
+
+/*  Tests if selected elements in two arrays are equal. */
+
+/*  TYPE is 'ge' or 'he' or 'sy'. */
+
+/*  Only elements outside the region selected by TYPE and UPLO are */
+/*  compared between aa and as: */
+/*    'ge'        rows m+1..lda of each of the n columns; */
+/*    'he', 'sy'  in column j, the rows before ibeg and the rows after */
+/*                iend up to lda, where ibeg..iend is 1..j when UPLO */
+/*                starts with 'U' and j..n otherwise. */
+/*  Returns FALSE_ at the first mismatch and TRUE_ otherwise; any other */
+/*  TYPE compares nothing and returns TRUE_. */
+
+/*  Arguments: type__ and uplo select the region as described above; */
+/*  *m, *n and *lda bound the row and column ranges as described above; */
+/*  aa and as are the two arrays being compared; type_len and uplo_len */
+/*  are not used. */
+
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+/*     .. Scalar Arguments .. */
+/*     .. Array Arguments .. */
+/*     .. Local Scalars .. */
+/*     .. Executable Statements .. */
+
+    /* Both arrays share the leading dimension *lda. Elements are */
+    /* addressed below as (row - 1) + (column - 1) * ld, which selects the */
+    /* same element as 1-based (row, column) indexing with that dimension. */
+    ld = *lda;
+    upper = *(unsigned char *)uplo == 'U';
+
+    if (s_cmp(type__, "ge", (ftnlen)2, (ftnlen)2) == 0) {
+	ncols = *n;
+	for (j = 1; j <= ncols; ++j) {
+	    /* Rows past the m-th one, up to the leading dimension. */
+	    for (i__ = *m + 1; i__ <= ld; ++i__) {
+		k = (i__ - 1) + (j - 1) * ld;
+		if (aa[k].r != as[k].r || aa[k].i != as[k].i) {
+		    return FALSE_;
+		}
+	    }
+	}
+    } else if (s_cmp(type__, "he", (ftnlen)2, (ftnlen)2) == 0 || s_cmp(type__,
+	     "sy", (ftnlen)2, (ftnlen)2) == 0) {
+	ncols = *n;
+	for (j = 1; j <= ncols; ++j) {
+	    /* Rows ibeg..iend of this column are skipped. */
+	    if (upper) {
+		ibeg = 1;
+		iend = j;
+	    } else {
+		ibeg = j;
+		iend = *n;
+	    }
+
+	    /* Rows before the skipped range. */
+	    for (i__ = 1; i__ < ibeg; ++i__) {
+		k = (i__ - 1) + (j - 1) * ld;
+		if (aa[k].r != as[k].r || aa[k].i != as[k].i) {
+		    return FALSE_;
+		}
+	    }
+
+	    /* Rows after the skipped range, up to the leading dimension. */
+	    for (i__ = iend + 1; i__ <= ld; ++i__) {
+		k = (i__ - 1) + (j - 1) * ld;
+		if (aa[k].r != as[k].r || aa[k].i != as[k].i) {
+		    return FALSE_;
+		}
+	    }
+	}
+    }
+
+    /* No compared element differed. */
+    return TRUE_;
+
+/*     End of LZERES. */
+
+} /* lzeres_ */
+
+/* Double Complex */ VOID zbeg_(doublecomplex* ret_val, logical* reset)
+{
+    /* Generator state; static so it persists from one call to the next. */
+    static integer re_state, im_state, skip_count, re_mult, im_mult;
+
+
+/*  Generates complex numbers as pairs of random numbers uniformly */
+/*  distributed between -0.5 and 0.5. */
+
+/*  The result is written to *ret_val. When *reset is true the generator */
+/*  state is reinitialized first and *reset is set to FALSE_. */
+
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+/*     .. Scalar Arguments .. */
+/*     .. Local Scalars .. */
+/*     .. Save statement .. */
+/*     .. Intrinsic Functions .. */
+/*     .. Executable Statements .. */
+    if (*reset) {
+/*        Initialize local variables. */
+	re_mult = 891;
+	im_mult = 457;
+	re_state = 7;
+	im_state = 7;
+	skip_count = 0;
+	*reset = FALSE_;
+    }
+
+/*     The sequence of values of either state is bounded between 1 and 999. */
+/*     If the initial state = 1,2,3,6,7 or 9, the period will be 50. */
+/*     If the initial state = 4 or 8, the period will be 25. */
+/*     If the initial state = 5, the period will be 10. */
+/*     skip_count is used to break up the period by skipping 1 value of */
+/*     each state in 6. */
+
+    ++skip_count;
+    for (;;) {
+	/* Advance both states: multiply, then keep the value modulo 1000. */
+	re_state = re_state * re_mult % 1000;
+	im_state = im_state * im_mult % 1000;
+	if (skip_count < 5) {
+	    break;
+	}
+	/* Counter reached 5: clear it and advance once more. */
+	skip_count = 0;
+    }
+
+    /* Map each state to (state - 500) / 1001. for the two parts. */
+    ret_val->r = (re_state - 500) / 1001.;
+    ret_val->i = (im_state - 500) / 1001.;
+    return ;
+
+/*     End of ZBEG. */
+
+} /* zbeg_ */
+
+doublereal ddiff_(doublereal* x, doublereal* y)
+{
+    doublereal delta;
+
+/*  Returns the difference *x - *y, stored in a local before returning. */
+
+/*  Auxiliary routine for test program for Level 3 Blas. */
+
+/*  -- Written on 8-February-1989. */
+/*     Jack Dongarra, Argonne National Laboratory. */
+/*     Iain Duff, AERE Harwell. */
+/*     Jeremy Du Croz, Numerical Algorithms Group Ltd. */
+/*     Sven Hammarling, Numerical Algorithms Group Ltd. */
+
+/*     .. Scalar Arguments .. */
+/*     .. Executable Statements .. */
+    delta = *x - *y;
+    return delta;
+
+/*     End of DDIFF. */
+
+} /* ddiff_ */
+
+/* Main program alias */ /*int zblat3_ () { MAIN__ (); }*/
diff --git a/docs/distributing.md b/docs/distributing.md
new file mode 100644
index 000000000..1e6372a28
--- /dev/null
+++ b/docs/distributing.md
@@ -0,0 +1,270 @@
+# Guidance for redistributing OpenBLAS
+
+*We note that this document contains recommendations only - packagers and other
+redistributors are in charge of how OpenBLAS is built and distributed in their
+systems, and may have good reasons to deviate from the guidance given on this
+page. These recommendations are aimed at general packaging systems, with a user
+base that typically is large, open source (or at least freely available), does
+not behave uniformly, and is not directly connected with the packager.*
+
+OpenBLAS has a large number of build-time options which can be used to change
+how it behaves at runtime, how artifacts or symbols are named, etc. Variation
+in build configuration can be necessary to achieve a given end goal within a
+distribution or as an end user. However, such variation can also make it more
+difficult to build on top of OpenBLAS and ship code or other packages in a way
+that works across many different distros. Here we provide guidance about the
+most important build options, what effects they may have when changed, and
+which ones to default to.
+
+The Make and CMake build systems provide equivalent options and yield more or
+less the same artifacts, but not exactly (the CMake builds are still
+experimental). You can choose either one and the options will function in the
+same way, however the CMake outputs may require some renaming. To review
+available build options, see `Makefile.rule` or `CMakeLists.txt` in the root of
+the repository.
+
+Build options typically fall into two categories: (a) options that affect the
+user interface, such as library and symbol names or APIs that are made
+available, and (b) options that affect performance and runtime behavior, such
+as threading behavior or CPU architecture-specific code paths. The user
+interface options are more important to keep aligned between distributions,
+while for the performance-related options there are typically more reasons to
+make choices that deviate from the defaults.
+
+Here are recommendations for user interface related packaging choices where it
+is not likely to be a good idea to deviate (typically these are the default
+settings):
+
+1. Include CBLAS. The CBLAS interface is widely used and it doesn't affect
+   binary size much, so don't turn it off.
+2. Include LAPACK and LAPACKE. The LAPACK interface is also widely used, and
+   while it does make up a significant part of the binary size of the installed
+   library, that does not outweigh the regression in usability when deviating
+   from the default here.[^1]
+3. Always distribute the pkg-config (`.pc`) and CMake (`.cmake`) dependency
+   detection files. These files are used by build systems when users want to
+   link against OpenBLAS, and there is no benefit of leaving them out.
+4. Provide the LP64 interface by default, and if in addition to that you choose
+   to provide an ILP64 interface build as well, use a symbol suffix to avoid
+   symbol name clashes (see the next section).
+
+[^1]: All major distributions do include LAPACK as of mid 2023 as far as we
+know. Older versions of Arch Linux did not, and that was known to cause
+problems.
+
+
+## ILP64 interface builds
+
+The LP64 (32-bit integer) interface is the default build, and has
+well-established C and Fortran APIs as determined by the reference (Netlib)
+BLAS and LAPACK libraries. The ILP64 (64-bit integer) interface however does
+not have a standard API: symbol names and shared/static library names can be
+produced in multiple ways, and this tends to make it difficult to use.
+As of today there is an agreed-upon way of choosing names for OpenBLAS between
+a number of key users/redistributors, which is the closest thing to a standard
+that there is now. However, there is an ongoing standardization effort in the
+reference BLAS and LAPACK libraries, which differs from the current OpenBLAS
+agreed-upon convention. In this section we'll aim to explain both.
+
+Those two methods are fairly similar, and have a key thing in common: *using a
+symbol suffix*. This is good practice; if you distribute an ILP64 build, it is
+recommended to have it use a symbol suffix containing `64` in the name.
+This avoids potential symbol clashes when different packages which depend on
+OpenBLAS load both an LP64 and an ILP64 library into memory at the same time.
+
+### The current OpenBLAS agreed-upon ILP64 convention
+
+This convention comprises the shared library name and the symbol suffix in the
+shared library. The symbol suffix to use is `64_`, implying that the library
+name will be `libopenblas64_.so` and the symbols in that library end in `64_`.
+The central issue where this was discussed is
+[openblas#646](https://github.com/xianyi/OpenBLAS/issues/646), and adopters
+include Fedora, Julia, NumPy and SciPy - SuiteSparse already used it as well.
+
+To build shared and static libraries with the currently recommended ILP64
+conventions with Make:
+```bash
+$ make INTERFACE64=1 SYMBOLSUFFIX=64_
+```
+
+This will produce libraries named `libopenblas64_.so|a`, a pkg-config file
+named `openblas64.pc`, and CMake and header files.
+
+Installing locally and inspecting the output will show a few more details:
+```bash
+$ make install PREFIX=$PWD/../openblas/make64 INTERFACE64=1 SYMBOLSUFFIX=64_
+$ tree .  # output slightly edited down
+.
+├── include
+│   ├── cblas.h
+│   ├── f77blas.h
+│   ├── lapacke_config.h
+│   ├── lapacke.h
+│   ├── lapacke_mangling.h
+│   ├── lapacke_utils.h
+│   ├── lapack.h
+│   └── openblas_config.h
+└── lib
+    ├── cmake
+    │   └── openblas
+    │       ├── OpenBLASConfig.cmake
+    │       └── OpenBLASConfigVersion.cmake
+    ├── libopenblas64_.a
+    ├── libopenblas64_.so
+    └── pkgconfig
+        └── openblas64.pc
+```
+
+The symbol names are a key point. These will equal the LP64 symbol names, then
+(for Fortran only) the compiler mangling, and then the `64_` symbol suffix.
+Hence to obtain the final symbol names, we need to take into account which
+Fortran compiler we are using. For the most common cases (e.g., gfortran, Intel
+Fortran, or Flang), that means appending a single underscore. In that case, the
+result is:
+
+| base API name | binary symbol name | call from Fortran code | call from C code      |
+|---------------|--------------------|------------------------|-----------------------|
+| `dgemm`       | `dgemm_64_`        | `dgemm_64(...)`        | `dgemm_64_(...)`      |
+| `cblas_dgemm` | `cblas_dgemm64_`   | n/a                    | `cblas_dgemm64_(...)` |
+
+It is quite useful to have these symbol names be as uniform as possible across
+different packaging systems.
+
+The equivalent build options with CMake are:
+```bash
+$ mkdir build && cd build
+$ cmake .. -DINTERFACE64=1 -DSYMBOLSUFFIX=64_ -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=ON
+$ cmake --build . -j
+```
+
+Note that the result is not 100% identical to the Make result. For example, the
+library name ends in `_64` rather than `64_` - it is recommended to rename them
+to match the Make library names (also update the `libsuffix` entry in
+`openblas64.pc` to match that rename).
+```bash
+$ cmake --install . --prefix $PWD/../../openblas/cmake64
+$ tree .
+.
+├── include
+│   └── openblas64
+│       ├── cblas.h
+│       ├── f77blas.h
+│       ├── lapacke_config.h
+│       ├── lapacke_example_aux.h
+│       ├── lapacke.h
+│       ├── lapacke_mangling.h
+│       ├── lapacke_utils.h
+│       ├── lapack.h
+│       ├── openblas64
+│       │   └── lapacke_mangling.h
+│       └── openblas_config.h
+└── lib
+    ├── cmake
+    │   └── OpenBLAS64
+    │       ├── OpenBLAS64Config.cmake
+    │       ├── OpenBLAS64ConfigVersion.cmake
+    │       ├── OpenBLAS64Targets.cmake
+    │       └── OpenBLAS64Targets-noconfig.cmake
+    ├── libopenblas_64.a
+    ├── libopenblas_64.so -> libopenblas_64.so.0
+    └── pkgconfig
+        └── openblas64.pc
+```
+
+
+### The upcoming standardized ILP64 convention
+
+While the `64_` convention above got some adoption, it's slightly hacky and is
+implemented through the use of `objcopy`. An effort is ongoing for a more
+broadly adopted convention in the reference BLAS and LAPACK libraries, using
+(a) the `_64` suffix, and (b) applying that suffix _before_ rather than after
+Fortran compiler mangling. The central issue for this is
+[lapack#666](https://github.com/Reference-LAPACK/lapack/issues/666).
+
+For the most common cases of compiler mangling (a single `_` appended), the end
+result will be:
+
+| base API name | binary symbol name | call from Fortran code | call from C code      |
+|---------------|--------------------|------------------------|-----------------------|
+| `dgemm`       | `dgemm_64_`        | `dgemm_64(...)`        | `dgemm_64_(...)`      |
+| `cblas_dgemm` | `cblas_dgemm_64`   | n/a                    | `cblas_dgemm_64(...)` |
+
+For other compiler mangling schemes, replace the trailing `_` by the scheme in use.
+
+The shared library name for this `_64` convention should be `libopenblas_64.so`.
+
+Note: it is not yet possible to produce an OpenBLAS build which employs this
+convention! Once reference BLAS and LAPACK with support for `_64` have been
+released, a future OpenBLAS release will support it. For now, please use the
+older `64_` scheme and avoid using the name `libopenblas_64.so`; it should be
+considered reserved for future use of the `_64` standard as prescribed by
+reference BLAS/LAPACK.
+
+
+## Performance and runtime behavior related build options
+
+For these options there are multiple reasonable or common choices.
+
+### Threading related options
+
+OpenBLAS can be built as a multi-threaded or single-threaded library, with the
+default being multi-threaded. It's expected that the default `libopenblas`
+library is multi-threaded; if you'd like to also distribute single-threaded
+builds, consider naming them `libopenblas_sequential`.
+
+OpenBLAS can be built with pthreads or OpenMP as the threading model, with the
+default being pthreads. Both options are commonly used, and the choice here
+should not influence the shared library name. The choice will be captured by
+the `.pc` file. For example:
+```bash
+$ pkg-config --libs openblas
+-fopenmp -lopenblas
+
+$ cat openblas.pc
+...
+openblas_config= ... USE_OPENMP=0 MAX_THREADS=24
+```
+
+The maximum number of threads users will be able to use is determined at build
+time by the `NUM_THREADS` build option. It defaults to 24, and there's a wide
+range of values that are reasonable to use (up to 256). 64 is a typical choice
+here; there is a memory footprint penalty that is linear in `NUM_THREADS`.
+Please see `Makefile.rule` for more details.
+
+### CPU architecture related options
+
+OpenBLAS contains a lot of CPU architecture-specific optimizations, hence when
+distributing to a user base with a variety of hardware, it is recommended to
+enable CPU architecture runtime detection. This will dynamically select
+optimized kernels for individual APIs. To do this, use the `DYNAMIC_ARCH=1`
+build option. This is usually done on all common CPU families, except when
+there are known issues.
+
+In case the CPU architecture is known (e.g. you're building binaries for macOS
+M1 users), it is possible to specify the target architecture directly with the
+`TARGET=` build option.
+
+`DYNAMIC_ARCH` and `TARGET` are covered in more detail in the main `README.md`
+in this repository.
+
+
+## Real-world examples
+
+OpenBLAS is likely to be distributed in one of these distribution models:
+
+1. As a standalone package, or multiple packages, in a packaging ecosystem like
+   a Linux distro, Homebrew, conda-forge or MSYS2.
+2. Vendored as part of a larger package, e.g. in Julia, NumPy, SciPy, or R.
+3. Locally, e.g. made available as a build on a single HPC cluster.
+
+The guidance on this page is most important for models (1) and (2). These links
+to build recipes for a representative selection of packaging systems may be
+helpful as a reference:
+
+- [Fedora](https://src.fedoraproject.org/rpms/openblas/blob/rawhide/f/openblas.spec)
+- [Debian](https://salsa.debian.org/science-team/openblas/-/blob/master/debian/rules)
+- [Homebrew](https://github.com/Homebrew/homebrew-core/blob/HEAD/Formula/openblas.rb)
+- [MSYS2](https://github.com/msys2/MINGW-packages/blob/master/mingw-w64-openblas/PKGBUILD)
+- [conda-forge](https://github.com/conda-forge/openblas-feedstock/blob/main/recipe/build.sh)
+- [NumPy/SciPy](https://github.com/MacPython/openblas-libs/blob/main/tools/build_openblas.sh)
+- [Nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/development/libraries/science/math/openblas/default.nix)
diff --git a/driver/level3/gemm_thread_m.c b/driver/level3/gemm_thread_m.c
index 8813e5529..353ae0be9 100644
--- a/driver/level3/gemm_thread_m.c
+++ b/driver/level3/gemm_thread_m.c
@@ -40,7 +40,7 @@
 #include <stdlib.h>
 #include "common.h"
 
-int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(), void *sa, void *sb, BLASLONG nthreads) {
+int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT *, FLOAT *, BLASLONG ), void *sa, void *sb, BLASLONG nthreads) {
 
   blas_queue_t queue[MAX_CPU_NUMBER];
   BLASLONG range[MAX_CPU_NUMBER + 1];
diff --git a/driver/level3/gemm_thread_mn.c b/driver/level3/gemm_thread_mn.c
index 6b52df884..4f370999a 100644
--- a/driver/level3/gemm_thread_mn.c
+++ b/driver/level3/gemm_thread_mn.c
@@ -60,7 +60,7 @@ static const int divide_rule[][2] =
    { 1, 61}, { 2, 31}, { 7,  9}, { 8,  8},
 };
 
-int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(), void *sa, void *sb, BLASLONG nthreads) {
+int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT *, FLOAT *, BLASLONG ), void *sa, void *sb, BLASLONG nthreads) {
 
   blas_queue_t queue[MAX_CPU_NUMBER];
 
diff --git a/driver/level3/gemm_thread_n.c b/driver/level3/gemm_thread_n.c
index 9668841bb..d583456bd 100644
--- a/driver/level3/gemm_thread_n.c
+++ b/driver/level3/gemm_thread_n.c
@@ -40,7 +40,7 @@
 #include <stdlib.h>
 #include "common.h"
 
-int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(), void *sa, void *sb, BLASLONG nthreads) {
+int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT *, FLOAT *, BLASLONG), void *sa, void *sb, BLASLONG nthreads) {
 
   blas_queue_t queue[MAX_CPU_NUMBER];
   BLASLONG range[MAX_CPU_NUMBER + 1];
diff --git a/driver/level3/gemm_thread_variable.c b/driver/level3/gemm_thread_variable.c
index 162a75f70..75e49cb1a 100644
--- a/driver/level3/gemm_thread_variable.c
+++ b/driver/level3/gemm_thread_variable.c
@@ -42,7 +42,7 @@
 
 int CNAME(int mode,
 	  blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
-	  int (*function)(), void *sa, void *sb, BLASLONG divM, BLASLONG divN) {
+	  int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT *, FLOAT *, BLASLONG ), void *sa, void *sb, BLASLONG divM, BLASLONG divN) {
 
   blas_queue_t queue[MAX_CPU_NUMBER];
 
diff --git a/driver/level3/level3_gemm3m_thread.c b/driver/level3/level3_gemm3m_thread.c
index 39824fc5a..26d07fa94 100644
--- a/driver/level3/level3_gemm3m_thread.c
+++ b/driver/level3/level3_gemm3m_thread.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project.                              */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -44,10 +45,6 @@
 #define DIVIDE_RATE 2
 #endif
 
-#ifndef SWITCH_RATIO
-#define SWITCH_RATIO 2
-#endif
-
 //The array of job_t may overflow the stack.
 //Instead, use malloc to alloc job_t.
 #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
@@ -1015,6 +1012,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
   BLASLONG divN, divT;
   int mode;
 
+#if defined(DYNAMIC_ARCH)
+  int switch_ratio = gotoblas->switch_ratio;
+#else
+  int switch_ratio = SWITCH_RATIO;
+#endif
+
   if (range_m) {
     BLASLONG m_from = *(((BLASLONG *)range_m) + 0);
     BLASLONG m_to   = *(((BLASLONG *)range_m) + 1);
@@ -1030,7 +1033,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
   }
 */
 
-  if ((args -> m < nthreads * SWITCH_RATIO) || (args -> n < nthreads * SWITCH_RATIO)) {
+  if ((args -> m < nthreads * switch_ratio) || (args -> n < nthreads * switch_ratio)) {
     GEMM3M_LOCAL(args, range_m, range_n, sa, sb, 0);
     return 0;
   }
@@ -1038,7 +1041,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
   divT = nthreads;
   divN = 1;
 
-  while ((GEMM3M_P * divT > m * SWITCH_RATIO) && (divT > 1)) {
+  while ((GEMM3M_P * divT > m * switch_ratio) && (divT > 1)) {
     do {
       divT --;
       divN = 1;
diff --git a/driver/level3/level3_syrk_threaded.c b/driver/level3/level3_syrk_threaded.c
index d7dcd68a3..b03577fb3 100644
--- a/driver/level3/level3_syrk_threaded.c
+++ b/driver/level3/level3_syrk_threaded.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project.                              */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -44,10 +45,6 @@
 #define DIVIDE_RATE 2
 #endif
 
-#ifndef SWITCH_RATIO
-#define SWITCH_RATIO 2
-#endif
-
 //The array of job_t may overflow the stack.
 //Instead, use malloc to alloc job_t.
 #if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
@@ -528,7 +525,13 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
   int  mode, mask;
   double dnum, di, dinum;
 
-  if ((nthreads  == 1) || (args -> n < nthreads * SWITCH_RATIO)) {
+#if defined(DYNAMIC_ARCH)
+  int switch_ratio = gotoblas->switch_ratio;
+#else
+  int switch_ratio = SWITCH_RATIO;
+#endif
+
+  if ((nthreads == 1) || (args->n < nthreads * switch_ratio)) {
     SYRK_LOCAL(args, range_m, range_n, sa, sb, 0);
     return 0;
   }
diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c
index ff32a74a9..c7ccf8426 100644
--- a/driver/level3/level3_thread.c
+++ b/driver/level3/level3_thread.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project.                              */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -44,10 +45,6 @@
 #define DIVIDE_RATE 2
 #endif
 
-#ifndef SWITCH_RATIO
-#define SWITCH_RATIO 2
-#endif
-
 #ifndef GEMM_PREFERED_SIZE
 #define GEMM_PREFERED_SIZE 1
 #endif
@@ -595,6 +592,11 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
   BLASLONG width, i, j, k, js;
   BLASLONG m, n, n_from, n_to;
   int mode;
+#if defined(DYNAMIC_ARCH)
+  int switch_ratio = gotoblas->switch_ratio;
+#else
+  int switch_ratio = SWITCH_RATIO;
+#endif
 
   /* Get execution mode */
 #ifndef COMPLEX
@@ -732,8 +734,8 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
     num_parts  = 0;
     while (n > 0){
       width = blas_quickdivide(n + nthreads - num_parts - 1, nthreads - num_parts);
-      if (width < SWITCH_RATIO) {
-        width = SWITCH_RATIO;
+      if (width < switch_ratio) {
+        width = switch_ratio;
       }
       width = round_up(n, width, GEMM_PREFERED_SIZE);
 
@@ -792,6 +794,11 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF
   BLASLONG m = args -> m;
   BLASLONG n = args -> n;
   BLASLONG nthreads_m, nthreads_n;
+#if defined(DYNAMIC_ARCH)
+  int switch_ratio = gotoblas->switch_ratio;
+#else
+  int switch_ratio = SWITCH_RATIO;
+#endif
 
   /* Get dimensions from index ranges if available */
   if (range_m) {
@@ -801,21 +808,21 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IF
     n = range_n[1] - range_n[0];
   }
 
-  /* Partitions in m should have at least SWITCH_RATIO rows */
-  if (m < 2 * SWITCH_RATIO) {
+  /* Partitions in m should have at least switch_ratio rows */
+  if (m < 2 * switch_ratio) {
     nthreads_m = 1;
   } else {
     nthreads_m = args -> nthreads;
-    while (m < nthreads_m * SWITCH_RATIO) {
+    while (m < nthreads_m * switch_ratio) {
       nthreads_m = nthreads_m / 2;
     }
   }
 
-  /* Partitions in n should have at most SWITCH_RATIO * nthreads_m columns */
-  if (n < SWITCH_RATIO * nthreads_m) {
+  /* Partitions in n should have at most switch_ratio * nthreads_m columns */
+  if (n < switch_ratio * nthreads_m) {
     nthreads_n = 1;
   } else {
-    nthreads_n = (n + SWITCH_RATIO * nthreads_m - 1) / (SWITCH_RATIO * nthreads_m);
+    nthreads_n = (n + switch_ratio * nthreads_m - 1) / (switch_ratio * nthreads_m);
     if (nthreads_m * nthreads_n > args -> nthreads) {
       nthreads_n = blas_quickdivide(args -> nthreads, nthreads_m);
     }
diff --git a/driver/level3/syrk_thread.c b/driver/level3/syrk_thread.c
index 12808afd5..a40122e38 100644
--- a/driver/level3/syrk_thread.c
+++ b/driver/level3/syrk_thread.c
@@ -41,7 +41,7 @@
 #include <math.h>
 #include "common.h"
 
-int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(), void *sa, void *sb, BLASLONG nthreads) {
+int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (*function)(blas_arg_t*, BLASLONG*, BLASLONG*, FLOAT *, FLOAT *, BLASLONG), void *sa, void *sb, BLASLONG nthreads) {
 
   blas_queue_t queue[MAX_CPU_NUMBER];
   BLASLONG range[MAX_CPU_NUMBER + 1];
diff --git a/driver/others/blas_l1_thread.c b/driver/others/blas_l1_thread.c
index 06039c952..01b254f5d 100644
--- a/driver/others/blas_l1_thread.c
+++ b/driver/others/blas_l1_thread.c
@@ -43,7 +43,7 @@
 int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha,
 		       void *a, BLASLONG lda,
 		       void *b, BLASLONG ldb,
-		       void *c, BLASLONG ldc, int (*function)(), int nthreads){
+		       void *c, BLASLONG ldc, int (*function)(void), int nthreads){
 
   blas_queue_t queue[MAX_CPU_NUMBER];
   blas_arg_t   args [MAX_CPU_NUMBER];
@@ -141,7 +141,7 @@ int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha
 int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha,
 		       void *a, BLASLONG lda,
 		       void *b, BLASLONG ldb,
-		       void *c, BLASLONG ldc, int (*function)(), int nthreads){
+		       void *c, BLASLONG ldc, int (*function)(void), int nthreads){
 
   blas_queue_t queue[MAX_CPU_NUMBER];
   blas_arg_t   args [MAX_CPU_NUMBER];
diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c
index 051513f27..2531c57e9 100644
--- a/driver/others/blas_server.c
+++ b/driver/others/blas_server.c
@@ -93,7 +93,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 #endif
 
-extern unsigned int openblas_thread_timeout();
+extern unsigned int openblas_thread_timeout(void);
 
 #ifdef SMP_SERVER
 
@@ -113,6 +113,8 @@ extern unsigned int openblas_thread_timeout();
 /* We need this global for checking if initialization is finished.  */
 int blas_server_avail   __attribute__((aligned(ATTRIBUTE_SIZE))) = 0;
 
+int blas_omp_threads_local = 1;
+
 /* Local Variables */
 #if   defined(USE_PTHREAD_LOCK)
 static pthread_mutex_t  server_lock    = PTHREAD_MUTEX_INITIALIZER;
@@ -973,7 +975,7 @@ void goto_set_num_threads(int num_threads) {
 
     increased_threads = 1;
 
-    for(i = blas_num_threads - 1; i < num_threads - 1; i++){
+    for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){
 
       atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)0);
       thread_status[i].status = THREAD_STATUS_WAKEUP;
diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c
index bcd9c29b5..23110f807 100644
--- a/driver/others/blas_server_omp.c
+++ b/driver/others/blas_server_omp.c
@@ -68,8 +68,10 @@
 #endif
 
 int blas_server_avail = 0;
+int blas_omp_number_max = 0;
+int blas_omp_threads_local = 1;
 
-extern int openblas_omp_adaptive_env();
+extern int openblas_omp_adaptive_env(void);
 
 static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER];
 #ifdef HAVE_C11
@@ -78,7 +80,7 @@ static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
 static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
 #endif
 
-static void adjust_thread_buffers() {
+static void adjust_thread_buffers(void) {
 
   int i=0, j=0;
 
@@ -100,8 +102,6 @@ static void adjust_thread_buffers() {
 
 void goto_set_num_threads(int num_threads) {
 
-  blas_num_threads_set = 1;
-  if (num_threads < 0) blas_num_threads_set = 0;
   if (num_threads < 1) num_threads = blas_num_threads;
 
   if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
@@ -126,6 +126,17 @@ void openblas_set_num_threads(int num_threads) {
 
 int blas_thread_init(void){
 
+#if defined(__FreeBSD__) && defined(__clang__)
+extern int openblas_omp_num_threads_env(void);
+
+   if(blas_omp_number_max <= 0)
+	   blas_omp_number_max= openblas_omp_num_threads_env();
+   if (blas_omp_number_max <= 0) 
+	   blas_omp_number_max=MAX_CPU_NUMBER;
+#else
+    blas_omp_number_max = omp_get_max_threads();
+#endif
+
   blas_get_cpu_number();
 
   adjust_thread_buffers();
diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c
index afa33cccc..2ad8b8c5f 100644
--- a/driver/others/blas_server_win32.c
+++ b/driver/others/blas_server_win32.c
@@ -48,34 +48,38 @@
 #endif
 #endif
 
+#ifdef SMP_DEBUG
+#   define MT_TRACE(...) fprintf(stderr, __VA_ARGS__)
+#else
+#   define MT_TRACE(...)
+#endif
+
 /* This is a thread implementation for Win32 lazy implementation */
 
 /* Thread server common information */
-typedef struct{
-  CRITICAL_SECTION lock;
-  HANDLE filled;
-  HANDLE killed;
-
-  blas_queue_t	*queue;    /* Parameter Pointer */
-  int		shutdown;  /* server shutdown flag */
 
-} blas_pool_t;
+static blas_queue_t *work_queue = NULL;
+static HANDLE kickoff_event = NULL;
+static CRITICAL_SECTION queue_lock;
 
 /* We need this global for checking if initialization is finished.   */
 int blas_server_avail = 0;
 
+int blas_omp_threads_local = 1;
+
 /* Local Variables */
 static BLASULONG server_lock       = 0;
 
-static blas_pool_t   pool;
 static HANDLE	    blas_threads   [MAX_CPU_NUMBER];
 static DWORD	    blas_threads_id[MAX_CPU_NUMBER];
+static volatile int thread_target;	// target num of live threads, volatile for cross-thread reads
 
+//
+// Legacy code path
+//
+static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb) {
 
-
-static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
-
-      if (!(mode & BLAS_COMPLEX)){
+      if (!(mode & BLAS_COMPLEX)) {
 #ifdef EXPRECISION
 	if ((mode & BLAS_PREC) == BLAS_XDOUBLE){
 	  /* REAL / Extended Double */
@@ -90,7 +94,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
 		args -> c, args -> ldc, sb);
 	} else
 #endif
-	  if ((mode & BLAS_PREC) == BLAS_DOUBLE){
+	  if ((mode & BLAS_PREC) == BLAS_DOUBLE) {
 	    /* REAL / Double */
 	    void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
 			  double *, BLASLONG, double *, BLASLONG,
@@ -101,7 +105,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
 		  args -> a, args -> lda,
 		  args -> b, args -> ldb,
 		  args -> c, args -> ldc, sb);
-	  } else if ((mode & BLAS_PREC) == BLAS_SINGLE){
+	  } else if ((mode & BLAS_PREC) == BLAS_SINGLE) {
 	    /* REAL / Single */
 	    void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
 			  float *, BLASLONG, float *, BLASLONG,
@@ -113,7 +117,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
 		  args -> b, args -> ldb,
 		  args -> c, args -> ldc, sb);
 #ifdef BUILD_BFLOAT16
-          } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){
+          } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16) {
             /* REAL / BFLOAT16 */
             void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16,
                           bfloat16 *, BLASLONG, bfloat16 *, BLASLONG,
@@ -124,7 +128,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
                   args -> a, args -> lda,
                   args -> b, args -> ldb,
                   args -> c, args -> ldc, sb);
-          } else if ((mode & BLAS_PREC) == BLAS_STOBF16){
+          } else if ((mode & BLAS_PREC) == BLAS_STOBF16) {
             /* REAL / BLAS_STOBF16 */
             void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
                           float *, BLASLONG, bfloat16 *, BLASLONG,
@@ -135,7 +139,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
                   args -> a, args -> lda,
                   args -> b, args -> ldb,
                   args -> c, args -> ldc, sb);
-          } else if ((mode & BLAS_PREC) == BLAS_DTOBF16){
+          } else if ((mode & BLAS_PREC) == BLAS_DTOBF16) {
             /* REAL / BLAS_DTOBF16 */
             void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
                           double *, BLASLONG, bfloat16 *, BLASLONG,
@@ -152,7 +156,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
 	  }
       } else {
 #ifdef EXPRECISION
-	if ((mode & BLAS_PREC) == BLAS_XDOUBLE){
+	if ((mode & BLAS_PREC) == BLAS_XDOUBLE) {
 	  /* COMPLEX / Extended Double */
 	  void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble,
 			xdouble *, BLASLONG, xdouble *, BLASLONG,
@@ -166,7 +170,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
 		args -> c, args -> ldc, sb);
 	} else
 #endif
-	  if ((mode & BLAS_PREC) == BLAS_DOUBLE){
+	  if ((mode & BLAS_PREC) == BLAS_DOUBLE) {
 	    /* COMPLEX / Double */
 	  void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double,
 			double *, BLASLONG, double *, BLASLONG,
@@ -196,88 +200,78 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
       }
 }
 
-/* This is a main routine of threads. Each thread waits until job is */
-/* queued.                                                           */
-
-static DWORD WINAPI blas_thread_server(void *arg){
+//
+// This is a main routine of threads. Each thread waits until job is queued.
+//
+static DWORD WINAPI blas_thread_server(void *arg) {
 
   /* Thread identifier */
-#ifdef SMP_DEBUG
   BLASLONG  cpu = (BLASLONG)arg;
-#endif
 
   void *buffer, *sa, *sb;
   blas_queue_t	*queue;
-  DWORD action;
-  HANDLE handles[] = {pool.filled, pool.killed};
 
   /* Each server needs each buffer */
   buffer   = blas_memory_alloc(2);
 
-#ifdef SMP_DEBUG
-  fprintf(STDERR, "Server[%2ld] Thread is started!\n", cpu);
-#endif
+  MT_TRACE("Server[%2ld] Thread is started!\n", cpu);
 
-  while (1){
+  while (1) {
 
     /* Waiting for Queue */
 
-#ifdef SMP_DEBUG
-    fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu);
-#endif
+    MT_TRACE("Server[%2ld] Waiting for Queue.\n", cpu);
 
-    do {
-      action = WaitForMultipleObjects(2, handles, FALSE, INFINITE);
-    } while ((action != WAIT_OBJECT_0) && (action != WAIT_OBJECT_0 + 1));
+    // event raised when work is added to the queue
+    WaitForSingleObject(kickoff_event, INFINITE);
 
-    if (action == WAIT_OBJECT_0 + 1) break;
+    if (cpu > thread_target - 2) {
+      //MT_TRACE("thread [%d] exiting.\n", cpu);
+      break;	// excess thread, so worker thread exits
+    }
 
-#ifdef SMP_DEBUG
-    fprintf(STDERR, "Server[%2ld] Got it.\n", cpu);
-#endif
+    MT_TRACE("Server[%2ld] Got it.\n", cpu);
 
-    EnterCriticalSection(&pool.lock);
+    EnterCriticalSection(&queue_lock);
 
-    queue = pool.queue;
-    if (queue) pool.queue = queue->next;
+    queue = work_queue;
+    if (queue)
+        work_queue = work_queue->next;
 
-    LeaveCriticalSection(&pool.lock);
+    LeaveCriticalSection(&queue_lock);
 
-    if (queue)  {
+    if (queue) {
       int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;
 
-      if (pool.queue) SetEvent(pool.filled);
-
       sa = queue -> sa;
       sb = queue -> sb;
 
-#ifdef CONSISTENT_FPCSR
-      __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
-      __asm__ __volatile__ ("fldcw %0"   : : "m" (queue -> x87_mode));
-#endif
+      #ifdef CONSISTENT_FPCSR
+        __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
+        __asm__ __volatile__ ("fldcw %0"   : : "m" (queue -> x87_mode));
+      #endif
 
-#ifdef SMP_DEBUG
-      fprintf(STDERR, "Server[%2ld] Started.  Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
+      MT_TRACE("Server[%2ld] Started.  Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
 	      cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k);
-#endif
 
       // fprintf(stderr, "queue start[%ld]!!!\n", cpu);
 
-#ifdef MONITOR
-      main_status[cpu] = MAIN_RUNNING1;
-#endif
+      #ifdef MONITOR
+        main_status[cpu] = MAIN_RUNNING1;
+      #endif
 
-      if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
+      if (sa == NULL) 
+        sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
 
       if (sb == NULL) {
-	if (!(queue -> mode & BLAS_COMPLEX)){
+        if (!(queue -> mode & BLAS_COMPLEX)) {
 #ifdef EXPRECISION
-	  if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){
+	  if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE) {
 	    sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * sizeof(xdouble)
 					+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
 	  } else
 #endif
-	    if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){
+	    if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE) {
 #ifdef BUILD_DOUBLE
 	      sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double)
 					  + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
@@ -311,70 +305,59 @@ static DWORD WINAPI blas_thread_server(void *arg){
             /* Other types in future */
 	    }
 	}
-	queue->sb=sb;
+      	queue->sb=sb;
       }
 
-#ifdef MONITOR
-      main_status[cpu] = MAIN_RUNNING2;
-#endif
+      #ifdef MONITOR
+        main_status[cpu] = MAIN_RUNNING2;
+      #endif
 
       if (!(queue -> mode & BLAS_LEGACY)) {
-
-	(routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position);
+      	(routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position);
       } else {
-	legacy_exec(routine, queue -> mode, queue -> args, sb);
+  	    legacy_exec(routine, queue -> mode, queue -> args, sb);
       }
-    }else{
-		continue; //if queue == NULL
-	}
-
-#ifdef SMP_DEBUG
-    fprintf(STDERR, "Server[%2ld] Finished!\n", cpu);
-#endif
-
-    EnterCriticalSection(&queue->lock);
-
-    queue -> status = BLAS_STATUS_FINISHED;
-
-    LeaveCriticalSection(&queue->lock);
+    } else {
+  		continue; //if queue == NULL
+	  }
 
-    SetEvent(queue->finish);
+    MT_TRACE("Server[%2ld] Finished!\n", cpu);
+	
+	  queue->finished = 1;
   }
 
   /* Shutdown procedure */
 
-#ifdef SMP_DEBUG
-  fprintf(STDERR, "Server[%2ld] Shutdown!\n",  cpu);
-#endif
+  MT_TRACE("Server[%2ld] Shutdown!\n",  cpu);
 
   blas_memory_free(buffer);
 
   return 0;
-  }
+}
 
-/* Initializing routine */
-int blas_thread_init(void){
+//
+// Initializing routine
+//
+int blas_thread_init(void) {
   BLASLONG i;
 
   if (blas_server_avail || (blas_cpu_number <= 1)) return 0;
 
   LOCK_COMMAND(&server_lock);
 
-#ifdef SMP_DEBUG
-  fprintf(STDERR, "Initializing Thread(Num. threads = %d)\n",
-	  blas_cpu_number);
-#endif
+  MT_TRACE("Initializing Thread(Num. threads = %d)\n", blas_cpu_number);
 
-  if (!blas_server_avail){
+  if (!blas_server_avail) {
+    // create the kickoff Event
+    kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);
 
-    InitializeCriticalSection(&pool.lock);
-    pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL);
-    pool.killed = CreateEvent(NULL, TRUE,  FALSE, NULL);
+    thread_target = blas_cpu_number;
 
-    pool.shutdown = 0;
-    pool.queue    = NULL;
+    InitializeCriticalSection(&queue_lock);
+
+    for(i = 0; i < blas_cpu_number - 1; i++) {
+	    //MT_TRACE("thread_init: creating thread [%d]\n", i);
 
-    for(i = 0; i < blas_cpu_number - 1; i++){
       blas_threads[i] = CreateThread(NULL, 0,
 				     blas_thread_server, (void *)i,
 				     0, &blas_threads_id[i]);
@@ -388,15 +371,12 @@ int blas_thread_init(void){
   return 0;
 }
 
-/*
-   User can call one of two routines.
-
-     exec_blas_async ... immediately returns after jobs are queued.
-
-     exec_blas       ... returns after jobs are finished.
-*/
-
-int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
+//
+//   User can call one of two routines.
+//     exec_blas_async ... immediately returns after jobs are queued.
+//     exec_blas       ... returns after jobs are finished.
+//
+int exec_blas_async(BLASLONG pos, blas_queue_t *queue) {
 
 #if defined(SMP_SERVER)
   // Handle lazy re-init of the thread-pool after a POSIX fork
@@ -409,8 +389,6 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
   current = queue;
 
   while (current) {
-    InitializeCriticalSection(&current -> lock);
-    current -> finish = CreateEvent(NULL, FALSE, FALSE, NULL);
     current -> position = pos;
 
 #ifdef CONSISTENT_FPCSR
@@ -418,56 +396,71 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
     __asm__ __volatile__ ("stmxcsr %0" : "=m" (current -> sse_mode));
 #endif
 
+  	current->finished = 0;
     current = current -> next;
     pos ++;
   }
 
-  EnterCriticalSection(&pool.lock);
+  EnterCriticalSection(&queue_lock);
 
-  if (pool.queue) {
-    current = pool.queue;
-    while (current -> next) current = current -> next;
-    current -> next = queue;
-  } else {
-    pool.queue = queue;
+  if (!work_queue)
+  {
+    work_queue = queue;
+  }
+  else
+  {
+    blas_queue_t *tail = work_queue;
+
+    // stop ON the last node (->next == NULL); walking past it loses the list
+    while (tail->next)
+        tail = tail->next;
+
+    // link the new batch behind the tail so the workers can reach it
+    tail->next = queue;
   }
 
-  LeaveCriticalSection(&pool.lock);
+  LeaveCriticalSection(&queue_lock);
 
-  SetEvent(pool.filled);
+  SetEvent(kickoff_event);
 
   return 0;
 }
 
-int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
+//
+// Join. Wait for all queued tasks to complete
+//
+int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue) {
 
-#ifdef SMP_DEBUG
-    fprintf(STDERR, "Synchronization Waiting.\n");
-#endif
+  MT_TRACE("Synchronization Waiting.\n");
 
-    while (num){
-#ifdef SMP_DEBUG
-    fprintf(STDERR, "Waiting Queue ..\n");
-#endif
+  while (num) {
+    MT_TRACE("Waiting Queue ..\n");
+
+    while (!queue->finished)
+      YIELDING;
 
-      WaitForSingleObject(queue->finish, INFINITE);
+    queue = queue->next;
+    num--;
+  }
 
-      CloseHandle(queue->finish);
-      DeleteCriticalSection(&queue -> lock);
+  MT_TRACE("Completely Done.\n\n");
 
-      queue = queue -> next;
-      num --;
-    }
+	// if work was added to the queue after this batch we can't sleep the worker threads
+	// by resetting the event
+	EnterCriticalSection(&queue_lock);
 
-#ifdef SMP_DEBUG
-    fprintf(STDERR, "Completely Done.\n\n");
-#endif
+	if (work_queue == NULL)
+		ResetEvent(kickoff_event);
 
-  return 0;
+	LeaveCriticalSection(&queue_lock);
+
+	return 0;
 }
 
-/* Execute Threads */
-int exec_blas(BLASLONG num, blas_queue_t *queue){
+//
+// Execute Threads
+//
+int exec_blas(BLASLONG num, blas_queue_t *queue) {
 
 #if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
   // Handle lazy re-init of the thread-pool after a POSIX fork
@@ -480,29 +473,33 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
 
   if ((num <= 0) || (queue == NULL)) return 0;
 
-  if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next);
+  if ((num > 1) && queue -> next) 
+    exec_blas_async(1, queue -> next);
 
   routine = queue -> routine;
 
   if (queue -> mode & BLAS_LEGACY) {
     legacy_exec(routine, queue -> mode, queue -> args, queue -> sb);
-  } else
+  } else {
     if (queue -> mode & BLAS_PTHREAD) {
       void (*pthreadcompat)(void *) = queue -> routine;
       (pthreadcompat)(queue -> args);
     } else
       (routine)(queue -> args, queue -> range_m, queue -> range_n,
-		queue -> sa, queue -> sb, 0);
+    		queue -> sa, queue -> sb, 0);
+  }
 
-  if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next);
+  if ((num > 1) && queue -> next) 
+    exec_blas_async_wait(num - 1, queue -> next);
 
   return 0;
 }
 
-/* Shutdown procedure, but user don't have to call this routine. The */
-/* kernel automatically kill threads.                                */
-
-int BLASFUNC(blas_thread_shutdown)(void){
+//
+// Shutdown procedure. Users do not have to call this routine; the
+// kernel automatically kills the threads.
+//
+int BLASFUNC(blas_thread_shutdown)(void) {
 
   int i;
 
@@ -510,11 +507,9 @@ int BLASFUNC(blas_thread_shutdown)(void){
 
   LOCK_COMMAND(&server_lock);
 
-  if (blas_server_avail){
-
-    SetEvent(pool.killed);
+  if (blas_server_avail) {
 
-    for(i = 0; i < blas_num_threads - 1; i++){
+    for (i = 0; i < blas_num_threads - 1; i++) {
       // Could also just use WaitForMultipleObjects
       DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50);
 
@@ -528,9 +523,6 @@ int BLASFUNC(blas_thread_shutdown)(void){
       CloseHandle(blas_threads[i]);
     }
 
-    CloseHandle(pool.filled);
-    CloseHandle(pool.killed);
-
     blas_server_avail = 0;
   }
 
@@ -539,6 +531,9 @@ int BLASFUNC(blas_thread_shutdown)(void){
   return 0;
 }
 
+//
+// Legacy function to set number of threads
+//
 void goto_set_num_threads(int num_threads)
 {
 	long i;
@@ -552,23 +547,48 @@ void goto_set_num_threads(int num_threads)
 
 	if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
 
+	if (blas_server_avail && num_threads < blas_num_threads) {
+		LOCK_COMMAND(&server_lock);
+
+		thread_target = num_threads;
+		
+		SetEvent(kickoff_event);
+
+		for (i = num_threads - 1; i < blas_num_threads - 1; i++) {
+			//MT_TRACE("set_num_threads: waiting on thread [%d] to quit.\n", i);
+
+			WaitForSingleObject(blas_threads[i], INFINITE);
+
+			//MT_TRACE("set_num_threads: thread [%d] has quit.\n", i);
+
+			CloseHandle(blas_threads[i]);
+		}
+
+		blas_num_threads = num_threads;
+		
+		ResetEvent(kickoff_event);
+
+		UNLOCK_COMMAND(&server_lock);
+	}
+
 	if (num_threads > blas_num_threads) {
 
 		LOCK_COMMAND(&server_lock);
 
-		//increased_threads = 1;
-	    if (!blas_server_avail){
+		thread_target = num_threads;
+
+		  //increased_threads = 1;
+	    if (!blas_server_avail) {
+			// create the kickoff Event
+			kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);
 
-			InitializeCriticalSection(&pool.lock);
-			pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL);
-			pool.killed = CreateEvent(NULL, TRUE,  FALSE, NULL);
+			InitializeCriticalSection(&queue_lock);
 
-			pool.shutdown = 0;
-			pool.queue    = NULL;
 			blas_server_avail = 1;
 		}
 
-		for(i = blas_num_threads - 1; i < num_threads - 1; i++){
+		for (i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++) {
+			//MT_TRACE("set_num_threads: creating thread [%d]\n", i);
 
 			blas_threads[i] = CreateThread(NULL, 0,
 				     blas_thread_server, (void *)i,
@@ -583,6 +603,9 @@ void goto_set_num_threads(int num_threads)
 	blas_cpu_number  = num_threads;
 }
 
+//
+// Openblas function to set thread count
+//
 void openblas_set_num_threads(int num)
 {
 	goto_set_num_threads(num);
diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c
index f61930983..e3f905265 100644
--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@@ -220,6 +220,19 @@ extern gotoblas_t gotoblas_COOPERLAKE;
 #else
 #define gotoblas_COOPERLAKE gotoblas_PRESCOTT
 #endif
+#ifdef DYN_SAPPHIRERAPIDS
+extern gotoblas_t gotoblas_SAPPHIRERAPIDS;
+#elif defined(DYN_SKYLAKEX)
+#define gotoblas_SAPPHIRERAPIDS gotoblas_SKYLAKEX
+#elif defined(DYN_HASWELL)
+#define gotoblas_SAPPHIRERAPIDS gotoblas_HASWELL
+#elif defined(DYN_SANDYBRIDGE)
+#define gotoblas_SAPPHIRERAPIDS gotoblas_SANDYBRIDGE
+#elif defined(DYN_NEHALEM)
+#define gotoblas_SAPPHIRERAPIDS gotoblas_NEHALEM
+#else
+#define gotoblas_SAPPHIRERAPIDS gotoblas_PRESCOTT
+#endif
 
 
 #else // not DYNAMIC_LIST
@@ -262,15 +275,18 @@ extern gotoblas_t  gotoblas_EXCAVATOR;
 #define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
 #define gotoblas_COOPERLAKE gotoblas_SANDYBRIDGE
 #define gotoblas_ZEN gotoblas_SANDYBRIDGE
+#define gotoblas_SAPPHIRERAPIDS gotoblas_SANDYBRIDGE
 #else
 extern gotoblas_t  gotoblas_HASWELL;
 extern gotoblas_t  gotoblas_ZEN;
 #ifndef NO_AVX512
 extern gotoblas_t  gotoblas_SKYLAKEX;
 extern gotoblas_t  gotoblas_COOPERLAKE;
+extern gotoblas_t  gotoblas_SAPPHIRERAPIDS;
 #else
 #define gotoblas_SKYLAKEX gotoblas_HASWELL
 #define gotoblas_COOPERLAKE gotoblas_HASWELL
+#define gotoblas_SAPPHIRERAPIDS gotoblas_HASWELL
 #endif
 #endif
 #else
@@ -279,6 +295,7 @@ extern gotoblas_t  gotoblas_COOPERLAKE;
 #define gotoblas_HASWELL gotoblas_NEHALEM
 #define gotoblas_SKYLAKEX gotoblas_NEHALEM
 #define gotoblas_COOPERLAKE gotoblas_NEHALEM
+#define gotoblas_SAPPHIRERAPIDS gotoblas_NEHALEM
 #define gotoblas_BULLDOZER gotoblas_BARCELONA
 #define gotoblas_PILEDRIVER gotoblas_BARCELONA
 #define gotoblas_STEAMROLLER gotoblas_BARCELONA
@@ -378,6 +395,31 @@ int support_avx512_bf16(){
 #endif
 }
 
+#define BIT_AMX_TILE	0x01000000
+#define BIT_AMX_BF16	0x00400000
+#define BIT_AMX_ENBD	0x00060000
+/* Returns 1 only if support_avx512() passes and both CPUID checks below succeed; otherwise 0 (always 0 with NO_AVX/NO_AVX512). */
+int support_amx_bf16() {
+#if !defined(NO_AVX) && !defined(NO_AVX512)
+  int eax, ebx, ecx, edx;
+  int ret=0;
+
+  if (!support_avx512())
+    return 0;
+  // CPUID leaf 7, subleaf 0: EDX bit 24 (BIT_AMX_TILE) and bit 22 (BIT_AMX_BF16) must both be set
+  cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
+  if ((edx & BIT_AMX_TILE) && (edx & BIT_AMX_BF16)) {
+    // CPUID leaf 0xD, subleaf 0: EAX bits 17 and 18 (BIT_AMX_ENBD) must BOTH be set -- presumably the AMX state-enable bits; confirm against the Intel SDM
+    cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
+    if ((eax & BIT_AMX_ENBD) == BIT_AMX_ENBD)
+      ret = 1;
+  }
+  return ret;
+#else
+  return 0;
+#endif
+}
+
 extern void openblas_warning(int verbose, const char * msg);
 #define FALLBACK_VERBOSE 1
 #define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"
@@ -689,6 +731,8 @@ static gotoblas_t *get_coretype(void){
 	  }
 	}
 	if (model == 15){          // Sapphire Rapids
+	   if(support_amx_bf16())
+	     return &gotoblas_SAPPHIRERAPIDS;
 	   if(support_avx512_bf16())
              return &gotoblas_COOPERLAKE;
           if (support_avx512()) 
@@ -762,7 +806,8 @@ static gotoblas_t *get_coretype(void){
         }      
 	return NULL;
       }
-      case 0xf:
+      break;
+    case 0xf:
       if (model <= 0x2) return &gotoblas_NORTHWOOD;
       return &gotoblas_PRESCOTT;
     }
@@ -941,7 +986,8 @@ static char *corename[] = {
     "Excavator",
     "Zen",
     "SkylakeX",
-    "Cooperlake"
+    "Cooperlake",
+    "SapphireRapids"
 };
 
 char *gotoblas_corename(void) {
@@ -1006,6 +1052,7 @@ char *gotoblas_corename(void) {
   if (gotoblas == &gotoblas_ZEN)          return corename[23];
   if (gotoblas == &gotoblas_SKYLAKEX)     return corename[24];
   if (gotoblas == &gotoblas_COOPERLAKE)   return corename[25];
+  if (gotoblas == &gotoblas_SAPPHIRERAPIDS) return corename[26];
   return corename[0];
 }
 
diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c
index 0f47b287c..6b21028d1 100644
--- a/driver/others/dynamic_arm64.c
+++ b/driver/others/dynamic_arm64.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023-2024 The OpenBLAS Project                          */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -109,6 +110,11 @@ extern gotoblas_t gotoblas_NEOVERSEN2;
 #else
 #define gotoblas_NEOVERSEN2 gotoblas_ARMV8
 #endif
+#ifdef DYN_ARMV8SVE
+extern gotoblas_t gotoblas_ARMV8SVE;
+#else
+#define gotoblas_ARMV8SVE gotoblas_ARMV8
+#endif
 #ifdef DYN_CORTEX_A55
 extern gotoblas_t  gotoblas_CORTEXA55;
 #else
@@ -116,10 +122,11 @@ extern gotoblas_t  gotoblas_CORTEXA55;
 #endif
 #else
 extern gotoblas_t  gotoblas_CORTEXA53;
+#define gotoblas_CORTEXA55 gotoblas_CORTEXA53
 extern gotoblas_t  gotoblas_CORTEXA57;
-extern gotoblas_t  gotoblas_CORTEXA72;
-extern gotoblas_t  gotoblas_CORTEXA73;
-extern gotoblas_t  gotoblas_FALKOR;
+#define gotoblas_CORTEXA72 gotoblas_CORTEXA57
+#define gotoblas_CORTEXA73 gotoblas_CORTEXA57
+#define gotoblas_FALKOR gotoblas_CORTEXA57
 extern gotoblas_t  gotoblas_THUNDERX;
 extern gotoblas_t  gotoblas_THUNDERX2T99;
 extern gotoblas_t  gotoblas_TSV110;
@@ -128,17 +135,21 @@ extern gotoblas_t  gotoblas_NEOVERSEN1;
 #ifndef NO_SVE
 extern gotoblas_t  gotoblas_NEOVERSEV1;
 extern gotoblas_t  gotoblas_NEOVERSEN2;
+extern gotoblas_t  gotoblas_ARMV8SVE;
 #else
 #define gotoblas_NEOVERSEV1 gotoblas_ARMV8
 #define gotoblas_NEOVERSEN2 gotoblas_ARMV8
+#define gotoblas_ARMV8SVE   gotoblas_ARMV8
 #endif
 extern gotoblas_t  gotoblas_THUNDERX3T110;
-extern gotoblas_t  gotoblas_CORTEXA55;
 #endif
+#define gotoblas_NEOVERSEV2 gotoblas_NEOVERSEV1
 
 extern void openblas_warning(int verbose, const char * msg);
+#define FALLBACK_VERBOSE 1
+#define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n"
 
-#define NUM_CORETYPES   13
+#define NUM_CORETYPES   17
 
 /*
  * In case asm/hwcap.h is outdated on the build system, make sure
@@ -147,6 +158,9 @@ extern void openblas_warning(int verbose, const char * msg);
 #ifndef HWCAP_CPUID
 #define HWCAP_CPUID (1 << 11)
 #endif
+#ifndef HWCAP_SVE
+#define HWCAP_SVE (1 << 22)
+#endif
 
 #define get_cpu_ftr(id, var) ({					\
 		__asm__ __volatile__ ("mrs %0, "#id : "=r" (var));		\
@@ -165,9 +179,11 @@ static char *corename[] = {
   "emag8180",
   "neoversen1",
   "neoversev1",
+  "neoversev2",
   "neoversen2",
   "thunderx3t110",
   "cortexa55",
+  "armv8sve",
   "unknown"
 };
 
@@ -184,9 +200,11 @@ char *gotoblas_corename(void) {
   if (gotoblas == &gotoblas_EMAG8180)     return corename[ 9];
   if (gotoblas == &gotoblas_NEOVERSEN1)   return corename[10];
   if (gotoblas == &gotoblas_NEOVERSEV1)   return corename[11];
-  if (gotoblas == &gotoblas_NEOVERSEN2)   return corename[12];
-  if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13];
-  if (gotoblas == &gotoblas_CORTEXA55)    return corename[14];
+  if (gotoblas == &gotoblas_NEOVERSEV2)   return corename[12];
+  if (gotoblas == &gotoblas_NEOVERSEN2)   return corename[13];
+  if (gotoblas == &gotoblas_THUNDERX3T110) return corename[14];
+  if (gotoblas == &gotoblas_CORTEXA55)    return corename[15];
+  if (gotoblas == &gotoblas_ARMV8SVE)     return corename[16];
   return corename[NUM_CORETYPES];
 }
 
@@ -218,9 +236,11 @@ static gotoblas_t *force_coretype(char *coretype) {
     case  9: return (&gotoblas_EMAG8180);
     case 10: return (&gotoblas_NEOVERSEN1);
     case 11: return (&gotoblas_NEOVERSEV1);
-    case 12: return (&gotoblas_NEOVERSEN2);
-    case 13: return (&gotoblas_THUNDERX3T110);
-    case 14: return (&gotoblas_CORTEXA55);
+    case 12: return (&gotoblas_NEOVERSEV2);
+    case 13: return (&gotoblas_NEOVERSEN2);
+    case 14: return (&gotoblas_THUNDERX3T110);
+    case 15: return (&gotoblas_CORTEXA55);
+    case 16: return (&gotoblas_ARMV8SVE);
   }
   snprintf(message, 128, "Core not found: %s\n", coretype);
   openblas_warning(1, message);
@@ -231,6 +251,10 @@ static gotoblas_t *get_coretype(void) {
   int implementer, variant, part, arch, revision, midr_el1;
   char coremsg[128];
 
+#if defined (OS_DARWIN)
+  return &gotoblas_NEOVERSEN1;
+#endif
+	
 #if (!defined OS_LINUX && !defined OS_ANDROID)
   return NULL;
 #else
@@ -281,9 +305,24 @@ static gotoblas_t *get_coretype(void) {
           return &gotoblas_NEOVERSEN1;
 #ifndef NO_SVE
         case 0xd49:
-          return &gotoblas_NEOVERSEN2;
+	  if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) {
+	    openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK);  
+	    return &gotoblas_NEOVERSEN1;
+	  } else
+            return &gotoblas_NEOVERSEN2;
 	case 0xd40:
-	  return &gotoblas_NEOVERSEV1;
+	  if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) {
+		  openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK);
+	    return &gotoblas_NEOVERSEN1;
+      	  }else
+	    return &gotoblas_NEOVERSEV1;
+  case 0xd4f:
+      if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) {
+        openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK);
+        return &gotoblas_NEOVERSEN1;
+      } else {
+	      return &gotoblas_NEOVERSEV2;
+      }
 #endif
 	case 0xd05: // Cortex A55
 	  return &gotoblas_CORTEXA55;
@@ -328,10 +367,19 @@ static gotoblas_t *get_coretype(void) {
           return &gotoblas_FALKOR;
       }
       break;
+    case 0x61: // Apple
+	return &gotoblas_NEOVERSEN1;
+      break;
     default:
       snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part);
       openblas_warning(1, coremsg);
   }
+#ifndef NO_SVE
+  if ((getauxval(AT_HWCAP) & HWCAP_SVE)) {
+    return &gotoblas_ARMV8SVE;
+  }
+#endif
+
   return NULL;
 #endif
 }
diff --git a/driver/others/dynamic_loongarch64.c b/driver/others/dynamic_loongarch64.c
index 52f8bcb2f..44de59669 100644
--- a/driver/others/dynamic_loongarch64.c
+++ b/driver/others/dynamic_loongarch64.c
@@ -25,6 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *******************************************************************************/
 
+#include <sys/auxv.h>
 #include "common.h"
 
 extern gotoblas_t  gotoblas_LOONGSON3R5;
@@ -74,21 +75,15 @@ static gotoblas_t *force_coretype(char *coretype) {
   return NULL;
 }
 
-#define LASX_MASK       1<<7
-#define LSX_MASK        1<<6
-#define LOONGARCH_CFG2  0x02
+#define LA_HWCAP_LSX    (1U << 4)  /* getauxval(AT_HWCAP) bit tested to select gotoblas_LOONGSON2K1000 */
+#define LA_HWCAP_LASX   (1U << 5)  /* getauxval(AT_HWCAP) bit tested to select gotoblas_LOONGSON3R5 */
 
 static gotoblas_t *get_coretype(void) {
-  int ret = 0;
-  __asm__ volatile (
-    "cpucfg %0, %1 \n\t"
-    : "+&r"(ret)
-    : "r"(LOONGARCH_CFG2)
-  );
-
-  if (ret & LASX_MASK)
+  int hwcap = (int)getauxval(AT_HWCAP);
+
+  if (hwcap & LA_HWCAP_LASX)
     return &gotoblas_LOONGSON3R5;
-  else if (ret & LSX_MASK)
+  else if (hwcap & LA_HWCAP_LSX)
     return &gotoblas_LOONGSON2K1000;
   else
     return &gotoblas_LOONGSONGENERIC;
diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c
index 2847ea9ae..4c1f4a26e 100644
--- a/driver/others/dynamic_power.c
+++ b/driver/others/dynamic_power.c
@@ -3,7 +3,7 @@
 
 extern gotoblas_t gotoblas_POWER6;
 extern gotoblas_t gotoblas_POWER8;
-#if (!defined __GNUC__) || ( __GNUC__ >= 6)
+#if ((!defined __GNUC__) || ( __GNUC__ >= 6)) || defined(__clang__)
 extern gotoblas_t gotoblas_POWER9;
 #endif
 #ifdef HAVE_P10_SUPPORT
@@ -20,14 +20,14 @@ static char *corename[] = {
 	"POWER10"
 };
 
-#define NUM_CORETYPES 4
+#define NUM_CORETYPES 5
 
 char *gotoblas_corename(void) {
 #ifndef C_PGI
 	if (gotoblas == &gotoblas_POWER6)	return corename[1];
 #endif
 	if (gotoblas == &gotoblas_POWER8)	return corename[2];
-#if (!defined __GNUC__) || ( __GNUC__ >= 6)
+#if ((!defined __GNUC__) || ( __GNUC__ >= 6)) || defined(__clang__)
 	if (gotoblas == &gotoblas_POWER9)	return corename[3];
 #endif
 #ifdef HAVE_P10_SUPPORT
@@ -36,14 +36,44 @@ char *gotoblas_corename(void) {
 	return corename[0];
 }
 
-#if defined(__clang__)
-static int __builtin_cpu_supports(char* arg) 
-{
-	return 0;
-}
+#define CPU_UNKNOWN  0
+#define CPU_POWER5   5
+#define CPU_POWER6   6
+#define CPU_POWER8   8
+#define CPU_POWER9   9
+#define CPU_POWER10 10
+
+#ifndef POWER_9
+#define POWER_9         0x20000         /* 9 class CPU */
+#endif
+#ifndef POWER_10
+#define POWER_10        0x40000         /* 10 class CPU */
 #endif
 
-#if defined(C_PGI) || defined(__clang__)
+#ifdef _AIX
+#include <sys/systemcfg.h>
+
+static int cpuid(void)  /* AIX: translate _system_configuration.implementation into a CPU_* code */
+{
+    int arch = _system_configuration.implementation;
+#ifdef POWER_6
+    if (arch == POWER_6) return CPU_POWER6;
+#endif
+#ifdef POWER_7
+    if (arch == POWER_7) return CPU_POWER6;  /* POWER7 is reported as POWER6 */
+#endif
+#ifdef POWER_8
+    if (arch == POWER_8) return CPU_POWER8;  /* plain if: every branch returns, so no else is needed and a missing POWER_x macro cannot leave a dangling else */
+#endif
+#ifdef POWER_9
+    if (arch == POWER_9) return CPU_POWER9;
+#endif
+#ifdef POWER_10
+    if (arch >= POWER_10) return CPU_POWER10;  /* POWER_10 or any larger implementation value */
+#endif
+    return CPU_UNKNOWN;  /* no compiled-in implementation value matched */
+}
+#elif defined(C_PGI) || defined(__clang__)
 /*
  * NV HPC compilers do not yet implement __builtin_cpu_is().
  * Fake a version here for use in the CPU detection code below.
@@ -53,21 +83,12 @@ static int __builtin_cpu_supports(char* arg)
  * what was requested.
  */
 
-#include <string.h>
-
 /*
  *  Define POWER processor version table.
  *
  *  NOTE NV HPC SDK compilers only support POWER8 and POWER9 at this time
  */
 
-#define CPU_UNKNOWN 0
-#define CPU_POWER5 5
-#define CPU_POWER6 6
-#define CPU_POWER8 8
-#define CPU_POWER9 9
-#define CPU_POWER10 10
-
 static  struct {
     uint32_t    pvr_mask;
     uint32_t    pvr_value;
@@ -160,7 +181,8 @@ static  struct {
     },
 };
 
-static int __builtin_cpu_is(const char *cpu) {
+static int cpuid(void)
+{
 	int i;
 	uint32_t pvr;
 	uint32_t cpu_type;
@@ -178,15 +200,54 @@ static int __builtin_cpu_is(const char *cpu) {
 		pvrPOWER[i].cpu_name, pvrPOWER[i].cpu_type);
 #endif
 	cpu_type = pvrPOWER[i].cpu_type;
+	return (int)(cpu_type);
+}
+#elif !defined(__BUILTIN_CPU_SUPPORTS__)
+static int cpuid(void)  /* fallback when neither the _AIX nor the C_PGI/__clang__ variant is compiled */
+{
+    return CPU_UNKNOWN;  /* no detection is attempted in this configuration */
+}
+#endif  /* _AIX */
+
+#ifndef __BUILTIN_CPU_SUPPORTS__
+#include <string.h>
+
+#ifndef __has_builtin
+#define __has_builtin(x)   0
+#endif
 
-	if (!strcmp(cpu, "power8"))
-		return cpu_type == CPU_POWER8;
-	if (!strcmp(cpu, "power9"))
-		return cpu_type == CPU_POWER9;
-	return 0;
+#if defined(_AIX) || !__has_builtin(__builtin_cpu_is)
+static int __builtin_cpu_is(const char *arg)  /* local stand-in: 1 if arg names the class reported by cpuid(), else 0 */
+{
+    static int ipinfo = -1;  /* cached cpuid() result; negative means not yet queried */
+    if (ipinfo < 0) {
+        ipinfo = cpuid();
+    }
+#ifdef HAVE_P10_SUPPORT
+    if (ipinfo == CPU_POWER10) {  /* "power10" is only recognised when built with HAVE_P10_SUPPORT */
+        if (!strcmp(arg, "power10")) return 1;
+    }
+#endif
+    if (ipinfo == CPU_POWER9) {
+        if (!strcmp(arg, "power9")) return 1;
+    } else if (ipinfo == CPU_POWER8) {
+        if (!strcmp(arg, "power8")) return 1;
+#ifndef C_PGI
+    } else if (ipinfo == CPU_POWER6) {  /* the "power6" match is compiled out for C_PGI builds */
+        if (!strcmp(arg, "power6")) return 1;
+#endif
+    }
+    return 0;  /* detected class does not match arg, or the class is not handled above */
+}
+#endif
 
-#endif  /* C_PGI */
+#if defined(_AIX) || !__has_builtin(__builtin_cpu_supports)
+static int __builtin_cpu_supports(const char *arg)  /* local stand-in used on _AIX or when the builtin is missing */
+{
+    return 0;  /* arg is ignored: every queried feature is reported as unsupported */
+}
+#endif
+#endif
 
 static gotoblas_t *get_coretype(void) {
 
@@ -196,19 +257,23 @@ static gotoblas_t *get_coretype(void) {
 #endif
 	if (__builtin_cpu_is("power8"))
 		return &gotoblas_POWER8;
-#if (!defined __GNUC__) || ( __GNUC__ >= 6)
+#if ((!defined __GNUC__) || ( __GNUC__ >= 6)) || defined(__clang__)
 	if (__builtin_cpu_is("power9"))
 		return &gotoblas_POWER9;
 #endif
 #ifdef HAVE_P10_SUPPORT
+#if defined(_AIX) || defined(__clang__)
+	if (__builtin_cpu_is("power10"))
+#else
 	if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma"))
+#endif
 		return &gotoblas_POWER10;
 #endif
 	/* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */
 #if (!defined __GNUC__) || ( __GNUC__ >= 11) || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2)
 	if (__builtin_cpu_is("power10"))
 		return &gotoblas_POWER9;
-#endif	
+#endif
 	return NULL;
 }
 
@@ -233,7 +298,7 @@ static gotoblas_t *force_coretype(char * coretype) {
 	case  1: return (&gotoblas_POWER6);
 #endif
 	case  2: return (&gotoblas_POWER8);
-#if (!defined __GNUC__) || ( __GNUC__ >= 6)
+#if ((!defined __GNUC__) || ( __GNUC__ >= 6)) || defined(__clang__)
 	case  3: return (&gotoblas_POWER9);
 #endif
 #ifdef HAVE_P10_SUPPORT
@@ -274,6 +339,9 @@ void gotoblas_dynamic_init(void) {
 	if (gotoblas && gotoblas -> init) {
 		strncpy(coren,gotoblas_corename(),20);
 		sprintf(coremsg, "Core: %s\n",coren);
+		if (getenv("GET_OPENBLAS_CORETYPE")) {
+			fprintf(stderr, "%s", coremsg);
+		}
 		openblas_warning(2, coremsg);
 		gotoblas -> init();
 	} else {
diff --git a/driver/others/dynamic_zarch.c b/driver/others/dynamic_zarch.c
index 5b45aae2f..dd26c8e80 100644
--- a/driver/others/dynamic_zarch.c
+++ b/driver/others/dynamic_zarch.c
@@ -13,7 +13,7 @@ extern gotoblas_t gotoblas_Z14;
 
 #define NUM_CORETYPES 4
 
-extern int openblas_verbose();
+extern int openblas_verbose(void);
 extern void openblas_warning(int verbose, const char* msg);
 
 char* gotoblas_corename(void) {
diff --git a/driver/others/memory.c b/driver/others/memory.c
index 4493b7d71..4ee8f9a2e 100644
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@@ -73,6 +73,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
+#define NEW_BUFFERS 512
 #ifndef likely
 #ifdef __GNUC__
 #define likely(x) __builtin_expect(!!(x), 1)
@@ -422,15 +423,13 @@ This value is equal or large than blas_cpu_number. This means some threads are s
 */
 int blas_num_threads = 0;
 
-int blas_num_threads_set = 0;
-
 int  goto_get_num_procs  (void) {
   return blas_cpu_number;
 }
 
-static void blas_memory_init();
+static void blas_memory_init(void);
 
-void openblas_fork_handler()
+void openblas_fork_handler(void)
 {
   // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is
   // built with "make USE_OPENMP=0".
@@ -447,9 +446,9 @@ void openblas_fork_handler()
 #endif
 }
 
-extern int openblas_num_threads_env();
-extern int openblas_goto_num_threads_env();
-extern int openblas_omp_num_threads_env();
+extern int openblas_num_threads_env(void);
+extern int openblas_goto_num_threads_env(void);
+extern int openblas_omp_num_threads_env(void);
 
 int blas_get_cpu_number(void){
 #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
@@ -593,7 +592,7 @@ static BLASULONG  key_lock = 0UL;
 #endif
 
 /* Returns a pointer to the start of the per-thread memory allocation data */
-static __inline struct alloc_t ** get_memory_table() {
+static __inline struct alloc_t ** get_memory_table(void) {
 #if defined(SMP)
 LOCK_COMMAND(&key_lock);
 lsk=local_storage_key;
@@ -1146,7 +1145,7 @@ static void blas_memory_cleanup(void* ptr){
   }
 }
 
-static void blas_memory_init(){
+static void blas_memory_init(void){
 #if defined(SMP)
 #  if defined(OS_WINDOWS)
   local_storage_key = TlsAlloc();
@@ -1503,7 +1502,7 @@ static void gotoblas_memory_init(void) {
 /* Initialization for all function; this function should be called before main */
 
 static int gotoblas_initialized = 0;
-extern void openblas_read_env();
+extern void openblas_read_env(void);
 
 void CONSTRUCTOR gotoblas_init(void) {
 
@@ -1996,13 +1995,11 @@ This value is equal or large than blas_cpu_number. This means some threads are s
 */
 int blas_num_threads = 0;
 
-int blas_num_threads_set = 0;
-
 int  goto_get_num_procs  (void) {
   return blas_cpu_number;
 }
 
-void openblas_fork_handler()
+void openblas_fork_handler(void)
 {
   // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is
   // built with "make USE_OPENMP=0".
@@ -2019,9 +2016,9 @@ void openblas_fork_handler()
 #endif
 }
 
-extern int openblas_num_threads_env();
-extern int openblas_goto_num_threads_env();
-extern int openblas_omp_num_threads_env();
+extern int openblas_num_threads_env(void);
+extern int openblas_goto_num_threads_env(void);
+extern int openblas_omp_num_threads_env(void);
 
 int blas_get_cpu_number(void){
 #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
@@ -2901,7 +2898,7 @@ void *blas_memory_alloc(int procpos){
 #endif
       position ++;
 
-    } while (position < 512+NUM_BUFFERS);
+    } while (position < NEW_BUFFERS + NUM_BUFFERS);
   }
 #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
   UNLOCK_COMMAND(&alloc_lock);
@@ -3015,10 +3012,13 @@ void *blas_memory_alloc(int procpos){
 #endif
  if (memory_overflowed) goto terminate;
   fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n");
+  fprintf(stderr,"To avoid this warning, please rebuild your copy of OpenBLAS with a larger NUM_THREADS setting\n");
+  fprintf(stderr,"or set the environment variable OPENBLAS_NUM_THREADS to %d or lower\n", MAX_CPU_NUMBER);
   memory_overflowed=1;
-  new_release_info = (struct release_t*) malloc(512*sizeof(struct release_t));
-  newmemory = (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct));
-  for (i = 0; i < 512; i++) {
+  MB;
+  new_release_info = (struct release_t*) malloc(NEW_BUFFERS * sizeof(struct release_t));
+  newmemory = (struct newmemstruct*) malloc(NEW_BUFFERS * sizeof(struct newmemstruct));
+  for (i = 0; i < NEW_BUFFERS; i++) {
   newmemory[i].addr   = (void *)0;
 #if defined(WHEREAMI) && !defined(USE_OPENMP)
   newmemory[i].pos    = -1;
@@ -3131,12 +3131,12 @@ void blas_memory_free(void *free_area){
   printf("  Position : %d\n", position);
 #endif
   if (unlikely(memory_overflowed && position >= NUM_BUFFERS)) {
-    while ((position < NUM_BUFFERS+512) && (newmemory[position-NUM_BUFFERS].addr != free_area))
+    while ((position < NUM_BUFFERS+NEW_BUFFERS) && (newmemory[position-NUM_BUFFERS].addr != free_area))
       position++;
   // arm: ensure all writes are finished before other thread takes this memory
   WMB;
-
-  newmemory[position].used = 0;
+if (position - NUM_BUFFERS >= NEW_BUFFERS) goto error;
+  newmemory[position-NUM_BUFFERS].used = 0;
 #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
   UNLOCK_COMMAND(&alloc_lock);
 #endif
@@ -3214,14 +3214,18 @@ void blas_shutdown(void){
 #endif
     memory[pos].lock   = 0;
   }
-  if (memory_overflowed)
-    for (pos = 0; pos < 512; pos ++){
+  if (memory_overflowed) {
+    for (pos = 0; pos < NEW_BUFFERS; pos ++){
       newmemory[pos].addr   = (void *)0;
       newmemory[pos].used   = 0;
 #if defined(WHEREAMI) && !defined(USE_OPENMP)
       newmemory[pos].pos    = -1;
 #endif
       newmemory[pos].lock   = 0;
+    }
+    free(newmemory);
+    newmemory = NULL;
+    memory_overflowed = 0;  
   }
 
   UNLOCK_COMMAND(&alloc_lock);
@@ -3339,7 +3343,7 @@ static void gotoblas_memory_init(void) {
 /* Initialization for all function; this function should be called before main */
 
 static int gotoblas_initialized = 0;
-extern void openblas_read_env();
+extern void openblas_read_env(void);
 
 void CONSTRUCTOR gotoblas_init(void) {
 
diff --git a/driver/others/memory_qalloc.c b/driver/others/memory_qalloc.c
index 0b38d1887..a2593e01f 100644
--- a/driver/others/memory_qalloc.c
+++ b/driver/others/memory_qalloc.c
@@ -283,13 +283,12 @@ The numbers of threads in the thread pool.
 This value is equal or large than blas_cpu_number. This means some threads are sleep.
 */
 int blas_num_threads = 0;
-int blas_num_threads_set = 0;
 
 int  goto_get_num_procs  (void) {
   return blas_cpu_number;
 }
 
-void openblas_fork_handler()
+void openblas_fork_handler(void)
 {
   // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is
   // built with "make USE_OPENMP=0".
@@ -306,9 +305,9 @@ void openblas_fork_handler()
 #endif
 }
 
-extern int openblas_num_threads_env();
-extern int openblas_goto_num_threads_env();
-extern int openblas_omp_num_threads_env();
+extern int openblas_num_threads_env(void);
+extern int openblas_goto_num_threads_env(void);
+extern int openblas_omp_num_threads_env(void);
 
 int blas_get_cpu_number(void){
 #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
diff --git a/driver/others/openblas_env.c b/driver/others/openblas_env.c
index 35b2270d4..c65f0f320 100644
--- a/driver/others/openblas_env.c
+++ b/driver/others/openblas_env.c
@@ -41,15 +41,15 @@ static int openblas_env_goto_num_threads=0;
 static int openblas_env_omp_num_threads=0;
 static int openblas_env_omp_adaptive=0;
 
-int openblas_verbose() { return openblas_env_verbose;}
-unsigned int openblas_thread_timeout() { return openblas_env_thread_timeout;}
-int openblas_block_factor() { return openblas_env_block_factor;}
-int openblas_num_threads_env() { return openblas_env_openblas_num_threads;}
-int openblas_goto_num_threads_env() { return openblas_env_goto_num_threads;}
-int openblas_omp_num_threads_env() { return openblas_env_omp_num_threads;}
-int openblas_omp_adaptive_env() { return openblas_env_omp_adaptive;}
-
-void openblas_read_env() {
+int openblas_verbose(void) { return openblas_env_verbose;}
+unsigned int openblas_thread_timeout(void) { return openblas_env_thread_timeout;}
+int openblas_block_factor(void) { return openblas_env_block_factor;}
+int openblas_num_threads_env(void) { return openblas_env_openblas_num_threads;}
+int openblas_goto_num_threads_env(void) { return openblas_env_goto_num_threads;}
+int openblas_omp_num_threads_env(void) { return openblas_env_omp_num_threads;}
+int openblas_omp_adaptive_env(void) { return openblas_env_omp_adaptive;}
+
+void openblas_read_env(void) {
   int ret=0;
   env_var_t p;
   if (readenv(p,"OPENBLAS_VERBOSE")) ret = atoi(p);
diff --git a/driver/others/openblas_error_handle.c b/driver/others/openblas_error_handle.c
index 9ac72c15d..aa0aa776a 100644
--- a/driver/others/openblas_error_handle.c
+++ b/driver/others/openblas_error_handle.c
@@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
-extern int openblas_verbose();
+extern int openblas_verbose(void);
 
 void openblas_warning(int verbose, const char * msg) {
   int current_verbose;
diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c
index 7a5cbeb62..867d0e361 100644
--- a/driver/others/openblas_get_config.c
+++ b/driver/others/openblas_get_config.c
@@ -69,13 +69,13 @@ static char* openblas_config_str=""
   ;
 
 #ifdef DYNAMIC_ARCH
-char *gotoblas_corename();
+char *gotoblas_corename(void);
 #endif
 
 static char tmp_config_str[256];
-int openblas_get_parallel();
+int openblas_get_parallel(void);
 
-char* CNAME() {
+char* CNAME(void) {
 char tmpstr[20];
   strcpy(tmp_config_str, openblas_config_str);
 #ifdef DYNAMIC_ARCH
@@ -90,7 +90,7 @@ char tmpstr[20];
 }
 
 
-char* openblas_get_corename() {
+char* openblas_get_corename(void) {
 #ifndef DYNAMIC_ARCH 
   return CHAR_CORENAME;
 #else
diff --git a/driver/others/openblas_get_parallel.c b/driver/others/openblas_get_parallel.c
index 5dfda6e59..becfa0a3a 100644
--- a/driver/others/openblas_get_parallel.c
+++ b/driver/others/openblas_get_parallel.c
@@ -42,17 +42,17 @@ static int parallel = 0;
 
 
 #ifdef NEEDBUNDERSCORE
-int CNAME() {
+int CNAME(void) {
   return parallel;
 }
 
-int NAME() {
+int NAME(void) {
   return parallel;
 }
 
 #else
 //The CNAME and NAME are the same.
-int NAME() {
+int NAME(void) {
   return parallel;
 }
 #endif
diff --git a/driver/others/openblas_set_num_threads.c b/driver/others/openblas_set_num_threads.c
index 0b57867b0..c938fde69 100644
--- a/driver/others/openblas_set_num_threads.c
+++ b/driver/others/openblas_set_num_threads.c
@@ -36,11 +36,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifdef SMP_SERVER
 
 extern  void openblas_set_num_threads(int num_threads) ;
+extern  int openblas_get_num_threads(void) ;
 
 void openblas_set_num_threads_(int* num_threads){
 	openblas_set_num_threads(*num_threads);
 }
 
+int openblas_set_num_threads_local(int num_threads){  /* apply num_threads and return the count that was in effect before */
+	int ret = openblas_get_num_threads();  /* read the previous setting before overwriting it */
+	openblas_set_num_threads(num_threads);
+	blas_omp_threads_local=num_threads;  /* also stored in blas_omp_threads_local (declared outside this hunk) */
+	return ret;
+}
+
+
 #else
 //Single thread
 
@@ -50,4 +59,8 @@ void openblas_set_num_threads(int num_threads) {
 void openblas_set_num_threads_(int* num_threads){
 
 }
+
+int openblas_set_num_threads_local(int num_threads){  /* build without SMP_SERVER: num_threads is ignored */
+	return 1;  /* always reports a single thread */
+}
 #endif
diff --git a/driver/others/parameter.c b/driver/others/parameter.c
index 0d5c6aec0..de6bf0de4 100644
--- a/driver/others/parameter.c
+++ b/driver/others/parameter.c
@@ -40,7 +40,7 @@
 #include <string.h>
 #include "common.h"
 
-extern int openblas_block_factor();
+extern int openblas_block_factor(void);
 int get_L2_size(void);
 
 #define DEFAULT_GEMM_P 128
diff --git a/exports/Makefile b/exports/Makefile
index d81735342..27a291f34 100644
--- a/exports/Makefile
+++ b/exports/Makefile
@@ -73,6 +73,10 @@ endif
 endif
 endif
 
+ifeq ($(F_COMPILER)$(OSNAME), IBMAIX)
+EXTRALIB += -lxlf90
+endif
+
 ifeq ($(C_COMPILER), PGI)
 EXTRALIB += -pgf90libs
 endif
@@ -132,8 +136,12 @@ libgoto_hpl.def : $(GENSYM)
 	./$(GENSYM) win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
 
 ifeq ($(OSNAME), Darwin)
+ifeq ($(FIXED_LIBNAME),1)
+INTERNALNAME = $(LIBPREFIX)$(LIBNAMESUFFIX).dylib
+else
 INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib
 endif
+endif
 
 ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX))
 $(LIBDYNNAME) : ../$(LIBNAME) osx.def
@@ -169,8 +177,12 @@ INTERNALNAME = $(LIBPREFIX).so
 FEXTRALIB += -lm
 EXTRALIB += -lm
 else
+ifeq ($(FIXED_LIBNAME),1)
+INTERNALNAME = $(LIBPREFIX)$(LIBNAMESUFFIX).so
+else
 INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION)
 endif
+endif
 
 ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX))
 ../$(LIBSONAME) : ../$(LIBNAME) linktest.c
@@ -248,6 +260,20 @@ endif
 
 ifeq ($(OSNAME), AIX)
 
+so : ../$(LIBSONAME) linktest.c
+	$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(EXTRALIB) && echo OK.
+	rm -f linktest
+
+../$(LIBSONAME) : aix.exp
+	$(CC) $(CFLAGS) $(LDFLAGS)  -shared -o ../$(LIBSONAME) \
+	-Wl,-bE:aix.exp -Wl,-bbigtoc ../$(LIBNAME) $(EXTRALIB)
+
+aix.exp :
+	/usr/bin/nm -X32_64 -PCpgl ../$(LIBNAME) | /usr/bin/awk '{ if ((($$ 2 == "T") \
+	|| ($$ 2 == "D") || ($$ 2 == "B") || ($$ 2 == "W") || ($$ 2 == "V") || ($$ 2 == "Z")) && (substr($$ 1,1,1) != ".")) \
+	{ if (($$ 2 == "W") || ($$ 2 == "V") || ($$ 2 == "Z")) { print $$ 1 " weak" } else { print $$ 1 } } }' | \
+	/usr/bin/sort -u > aix.exp
+
 ifeq ($(COMPILER_F77), xlf)
 
 goto32.$(SUFFIX) : ../$(LIBNAME) aix.def
@@ -272,23 +298,28 @@ static : ../$(LIBNAME)
 	rm -f goto.$(SUFFIX)
 
 osx.def : $(GENSYM) ../Makefile.system ../getarch.c
-	./$(GENSYM) osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS)  $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
+	./$(GENSYM) osx $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS)  $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
 
 aix.def : $(GENSYM) ../Makefile.system ../getarch.c
-	./$(GENSYM) aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS)  $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
+	./$(GENSYM) aix $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS)  $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
 
 objcopy.def : $(GENSYM) ../Makefile.system ../getarch.c
-	./$(GENSYM) objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS)  $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
+	./$(GENSYM) objcopy $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS)  $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
 
 objconv.def : $(GENSYM) ../Makefile.system ../getarch.c
-	./$(GENSYM) objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS)  $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
+	./$(GENSYM) objconv $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS)  $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > $(@F)
 
 test : linktest.c
 	$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK.
 	rm -f linktest
 
 linktest.c : $(GENSYM) ../Makefile.system ../getarch.c
-	./$(GENSYM) linktest  $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c
+	./$(GENSYM) linktest  $(ARCH) "$(BU)" $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c
+ifeq ($(F_COMPILER), IBM)  # IBM Fortran: strip lines mentioning second_/dsecnd_ from the generated link test
+	mv linktest.c linktest.c.FIRST
+	grep -Ev 'second_|dsecnd_' linktest.c.FIRST > linktest.c
+	rm linktest.c.FIRST
+endif
 
 clean ::
 	@rm -f *.def *.dylib __.SYMDEF* *.renamed
diff --git a/exports/gensymbol b/exports/gensymbol
index f05de626f..226035842 100755
--- a/exports/gensymbol
+++ b/exports/gensymbol
@@ -21,7 +21,7 @@ blasobjsc="
     chbmv chemm chemv cher2 cher2k cher cherk scabs1 scamax
     chpmv chpr2 chpr crotg cscal csrot csscal cswap scamin scasum scnrm2
     csymm csyr2k csyrk ctbmv ctbsv ctpmv ctpsv ctrmm ctrmv ctrsm
-    ctrsv icamax icamin cimatcopy comatcopy cgeadd scsum"
+    ctrsv icamax icamin cimatcopy comatcopy cgeadd scsum cgemmt"
 
 blasobjsd="
     damax damin dasum daxpy daxpby dcabs1 dcopy ddot dgbmv dgemm
@@ -29,7 +29,7 @@ blasobjsd="
     dscal dsdot dspmv dspr2 dimatcopy domatcopy
     dspr dswap dsymm dsymv dsyr2 dsyr2k dsyr dsyrk dtbmv dtbsv
     dtpmv dtpsv dtrmm dtrmv dtrsm dtrsv
-        idamax idamin idmax idmin dgeadd dsum"
+        idamax idamin idmax idmin dgeadd dsum dgemmt"
 
 blasobjss="
     isamax isamin ismax ismin
@@ -38,7 +38,7 @@ blasobjss="
     smax smin snrm2 simatcopy somatcopy
     srot srotg srotm srotmg ssbmv sscal sspmv sspr2 sspr sswap
     ssymm ssymv ssyr2 ssyr2k ssyr ssyrk stbmv stbsv stpmv stpsv
-    strmm strmv strsm strsv  sgeadd ssum"
+    strmm strmv strsm strsv  sgeadd ssum sgemmt"
 
 blasobjsz="
     izamax izamin
@@ -48,7 +48,7 @@ blasobjsz="
     zhpr zrotg zscal zswap zsymm zsyr2k zsyrk ztbmv
     ztbsv ztpmv ztpsv ztrmm ztrmv ztrsm ztrsv
     zomatcopy  zimatcopy dzamax dzamin dzasum dznrm2
-    zgeadd  dzsum"
+    zgeadd  dzsum zgemmt"
 
 blasobjs="lsame xerbla"
 bfblasobjs="sbgemm sbgemv sbdot sbstobf16 sbdtobf16 sbf16tos dbf16tod"
@@ -58,8 +58,9 @@ cblasobjsc="
     cblas_cher cblas_cherk  cblas_chpmv cblas_chpr2 cblas_chpr cblas_cscal cblas_caxpby
     cblas_csscal cblas_cswap cblas_csymm cblas_csyr2k cblas_csyrk cblas_ctbmv cblas_cgeadd
     cblas_ctbsv cblas_ctpmv cblas_ctpsv cblas_ctrmm cblas_ctrmv cblas_ctrsm cblas_ctrsv
-    cblas_scnrm2 cblas_scasum
+    cblas_scnrm2 cblas_scasum cblas_cgemmt
     cblas_icamax cblas_icamin cblas_icmin cblas_icmax cblas_scsum cblas_cimatcopy cblas_comatcopy
+    cblas_caxpyc cblas_crotg cblas_csrot cblas_scamax cblas_scamin
     "
 cblasobjsd="
     cblas_dasum cblas_daxpy cblas_dcopy cblas_ddot
@@ -67,8 +68,9 @@ cblasobjsd="
     cblas_drot cblas_drotg cblas_drotm cblas_drotmg cblas_dsbmv cblas_dscal cblas_dsdot
     cblas_dspmv cblas_dspr2 cblas_dspr cblas_dswap cblas_dsymm cblas_dsymv cblas_dsyr2
     cblas_dsyr2k cblas_dsyr cblas_dsyrk cblas_dtbmv cblas_dtbsv cblas_dtpmv cblas_dtpsv
-    cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd
+    cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd cblas_dgemmt
     cblas_idamax cblas_idamin cblas_idmin cblas_idmax cblas_dsum cblas_dimatcopy cblas_domatcopy
+    cblas_damax  cblas_damin
     "
 
 cblasobjss="
@@ -78,8 +80,9 @@ cblasobjss="
     cblas_srotm cblas_srotmg cblas_ssbmv cblas_sscal cblas_sspmv cblas_sspr2 cblas_sspr
     cblas_sswap cblas_ssymm cblas_ssymv cblas_ssyr2 cblas_ssyr2k cblas_ssyr cblas_ssyrk
     cblas_stbmv cblas_stbsv cblas_stpmv cblas_stpsv cblas_strmm cblas_strmv cblas_strsm
-    cblas_strsv cblas_sgeadd
+    cblas_strsv cblas_sgeadd cblas_sgemmt
     cblas_isamax cblas_isamin cblas_ismin cblas_ismax cblas_ssum cblas_simatcopy cblas_somatcopy
+    cblas_samax cblas_samin
     "
 
 cblasobjsz="
@@ -89,8 +92,9 @@ cblasobjsz="
     cblas_zhpr cblas_zscal cblas_zswap cblas_zsymm cblas_zsyr2k cblas_zsyrk
     cblas_ztbmv cblas_ztbsv cblas_ztpmv cblas_ztpsv cblas_ztrmm cblas_ztrmv cblas_ztrsm
     cblas_ztrsv cblas_cdotc_sub cblas_cdotu_sub cblas_zdotc_sub cblas_zdotu_sub
-    cblas_zaxpby cblas_zgeadd
+    cblas_zaxpby cblas_zgeadd cblas_zgemmt
     cblas_izamax cblas_izamin cblas_izmin cblas_izmax cblas_dzsum cblas_zimatcopy cblas_zomatcopy
+    cblas_zaxpyc cblas_zdrot cblas_zrotg cblas_dzamax cblas_dzamin
 "
 
 cblasobjs="cblas_xerbla"
@@ -716,6 +720,7 @@ lapackobjs2z="$lapackobjs2z
 # functions added for lapack-3.7.0
 lapackobjs2s="$lapackobjs2s
     slarfy
+    ssyconvf
     strevc3
     sgelqt
     sgelqt3
@@ -843,6 +848,70 @@ lapackobjs2z="$lapackobjs2z
     zungtsqr_row
     "
 
+#functions added for lapack-3.11
+lapackobjs2c="$lapackobjs2c
+    cgedmd
+    cgedmdq
+    "
+lapackobjs2d="$lapackobjs2d
+    dgedmd
+    dgedmdq
+    "
+lapackobjs2s="$lapackobjs2s
+    sgedmd
+    sgedmdq
+    "
+lapackobjs2z="$lapackobjs2z
+    zgedmd
+    zgedmdq
+    "
+
+#functions added post 3.11
+
+lapackobjs2c="$lapackobjs2c
+    claqp2rk
+    claqp3rk
+    ctrsyl3
+    "
+#    claqz0
+#    claqz1
+#    claqz2
+#    claqz3
+#    clatrs3
+
+lapackobjs2d="$lapackobjs2d
+    dgelqs
+    dgelst
+    dgeqp3rk
+    dgeqrs
+    dlaqp2rk
+    dlaqp3rk
+    dlarmm
+    dlatrs3
+    dtrsyl3
+    "
+#    dlaqz0
+#    dlaqz1
+#    dlaqz2
+#    dlaqz3
+#    dlaqz4
+
+lapackobjs2z="$lapackobjs2z
+    zgelqs
+    zgelst
+    zgeqp3rk
+    zgeqrs
+    zlaqp2rk
+    zlaqp3rk
+    zlatrs3
+    zrscl
+    ztrsyl3
+    "
+#    zlaqz0
+#    zlaqz1
+#    zlaqz2
+#    zlaqz3
+
 lapack_extendedprecision_objs="
     zposvxx clagge clatms chesvxx cposvxx cgesvxx ssyrfssx csyrfsx
     dlagsy dsysvxx sporfsx slatms zlatms zherfsx csysvxx
@@ -1012,6 +1081,10 @@ lapackeobjsc="
     LAPACKE_cgebrd_work
     LAPACKE_cgecon
     LAPACKE_cgecon_work
+    LAPACKE_cgedmd
+    LAPACKE_cgedmd_work
+    LAPACKE_cgedmdq
+    LAPACKE_cgedmdq_work
     LAPACKE_cgeequ
     LAPACKE_cgeequ_work
     LAPACKE_cgeequb
@@ -1600,6 +1673,14 @@ lapackeobjsc="
     LAPACKE_cgetsqrhrt_work
     LAPACKE_cungtsqr_row
     LAPACKE_cungtsqr_row_work
+    LAPACKE_clangb
+    LAPACKE_clangb_work
+    LAPACKE_ctrsyl3
+    LAPACKE_ctrsyl3_work
+    LAPACKE_ctz_nancheck
+    LAPACKE_ctz_trans
+    LAPACKE_cunhr_col
+    LAPACKE_cunhr_col_work
 "
 
 lapackeobjsd="
@@ -1671,6 +1752,10 @@ lapackeobjsd="
     LAPACKE_dgebrd_work
     LAPACKE_dgecon
     LAPACKE_dgecon_work
+    LAPACKE_dgedmd
+    LAPACKE_dgedmd_work
+    LAPACKE_dgedmdq
+    LAPACKE_dgedmdq_work
     LAPACKE_dgeequ
     LAPACKE_dgeequ_work
     LAPACKE_dgeequb
@@ -2213,6 +2298,14 @@ lapackeobjsd="
     LAPACKE_dgetsqrhrt_work
     LAPACKE_dorgtsqr_row
     LAPACKE_dorgtsqr_row_work
+    LAPACKE_dlangb
+    LAPACKE_dlangb_work
+    LAPACKE_dorhr_col
+    LAPACKE_dorhr_col_work
+    LAPACKE_dtrsyl3
+    LAPACKE_dtrsyl3_work
+    LAPACKE_dtz_nancheck
+    LAPACKE_dtz_trans
 "
 
 lapackeobjss="
@@ -2284,6 +2377,10 @@ lapackeobjss="
     LAPACKE_sgebrd_work
     LAPACKE_sgecon
     LAPACKE_sgecon_work
+    LAPACKE_sgedmd
+    LAPACKE_sgedmd_work
+    LAPACKE_sgedmdq
+    LAPACKE_sgedmdq_work
     LAPACKE_sgeequ
     LAPACKE_sgeequ_work
     LAPACKE_sgeequb
@@ -2818,6 +2915,14 @@ lapackeobjss="
     LAPACKE_sgetsqrhrt_work
     LAPACKE_sorgtsqr_row
     LAPACKE_sorgtsqr_row_work
+    LAPACKE_slangb
+    LAPACKE_slangb_work
+    LAPACKE_sorhr_col
+    LAPACKE_sorhr_col_work
+    LAPACKE_strsyl3
+    LAPACKE_strsyl3_work
+    LAPACKE_stz_nancheck
+    LAPACKE_stz_trans
 "
 
 lapackeobjsz="
@@ -2893,6 +2998,10 @@ lapackeobjsz="
     LAPACKE_zgebrd_work
     LAPACKE_zgecon
     LAPACKE_zgecon_work
+    LAPACKE_zgedmd
+    LAPACKE_zgedmd_work
+    LAPACKE_zgedmdq
+    LAPACKE_zgedmdq_work
     LAPACKE_zgeequ
     LAPACKE_zgeequ_work
     LAPACKE_zgeequb
@@ -3481,6 +3590,14 @@ lapackeobjsz="
     LAPACKE_zgetsqrhrt_work
     LAPACKE_zungtsqr_row
     LAPACKE_zungtsqr_row_work
+    LAPACKE_zlangb
+    LAPACKE_zlangb_work
+    LAPACKE_ztrsyl3
+    LAPACKE_ztrsyl3_work
+    LAPACKE_ztz_nancheck
+    LAPACKE_ztz_trans
+    LAPACKE_zunhr_col
+    LAPACKE_zunhr_col_work
 "
 ## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile`
 ## Not exported: requires LAPACKE_EXTENDED to be set and depends on the
@@ -3582,6 +3699,7 @@ lapack_embeded_underscore_objs_s="
     ssysv_aa_2stage ssytrf_aa_2stage
     ssytrs_aa_2stage
     slaorhr_col_getrfnp slaorhr_col_getrfnp2 sorhr_col
+    slarfb_gett
 "
 lapack_embeded_underscore_objs_c="
     chetf2_rook chetrf_rook chetri_rook
@@ -3607,6 +3725,7 @@ lapack_embeded_underscore_objs_c="
     csysv_aa_2stage csytrf_aa_2stage
     csytrs_aa_2stage
     claunhr_col_getrfnp claunhr_col_getrfnp2 cunhr_col
+    clarfb_gett
 "
 lapack_embeded_underscore_objs_d="
     dlasyf_rook
@@ -3624,6 +3743,7 @@ lapack_embeded_underscore_objs_d="
      dsysv_aa_2stage
     dsytrf_aa_2stage dsytrs_aa_2stage
     dlaorhr_col_getrfnp dlaorhr_col_getrfnp2 dorhr_col
+    dlarfb_gett
 "
 lapack_embeded_underscore_objs_z="
     zhetf2_rook zhetrf_rook zhetri_rook
@@ -3648,6 +3768,7 @@ lapack_embeded_underscore_objs_z="
     zhetrs_aa_2stage zsysv_aa_2stage
     zsytrf_aa_2stage zsytrs_aa_2stage
     zlaunhr_col_getrfnp zlaunhr_col_getrfnp2 zunhr_col
+    zlarfb_gett
 "
 
 dirname=`pwd -P`/../lapack-netlib
diff --git a/exports/gensymbol.pl b/exports/gensymbol.pl
index e38a3cc89..dd79e924d 100644
--- a/exports/gensymbol.pl
+++ b/exports/gensymbol.pl
@@ -21,7 +21,7 @@
     chbmv,chemm,chemv,cher2,cher2k,cher,cherk,scabs1,scamax,
     chpmv,chpr2,chpr,crotg,cscal,csrot,csscal,cswap,scamin,scasum,scnrm2,
     csymm,csyr2k,csyrk,ctbmv,ctbsv,ctpmv,ctpsv,ctrmm,ctrmv,ctrsm,
-    ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum);
+    ctrsv,icamax,icamin,cimatcopy,comatcopy,cgeadd,scsum,cgemmt);
     
 @blasobjsd = (
     damax,damin,dasum,daxpy,daxpby,dcabs1,dcopy,ddot,dgbmv,dgemm,
@@ -29,7 +29,7 @@
     dscal,dsdot,dspmv,dspr2,dimatcopy,domatcopy,
     dspr,dswap,dsymm,dsymv,dsyr2,dsyr2k,dsyr,dsyrk,dtbmv,dtbsv,
     dtpmv,dtpsv,dtrmm,dtrmv,dtrsm,dtrsv,
-        idamax,idamin,idmax,idmin,dgeadd,dsum);
+        idamax,idamin,idmax,idmin,dgeadd,dsum,dgemmt);
     
 @blasobjss = (
     isamax,isamin,ismax,ismin,
@@ -38,7 +38,7 @@
     smax,smin,snrm2,simatcopy,somatcopy,
     srot,srotg,srotm,srotmg,ssbmv,sscal,sspmv,sspr2,sspr,sswap,
     ssymm,ssymv,ssyr2,ssyr2k,ssyr,ssyrk,stbmv,stbsv,stpmv,stpsv,
-    strmm,strmv,strsm,strsv, sgeadd,ssum);
+    strmm,strmv,strsm,strsv, sgeadd,ssum,sgemmt);
      
 @blasobjsz = (
     izamax,izamin,,
@@ -48,7 +48,7 @@
     zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv,
     ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv,
     zomatcopy, zimatcopy,dzamax,dzamin,dzasum,dznrm2,
-    zgeadd, dzsum);
+    zgeadd, dzsum, zgemmt);
 
 @blasobjs = (lsame, xerbla);
 @bfblasobjs = (sbgemm, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod);
@@ -60,7 +60,7 @@
     cblas_ctbsv, cblas_ctpmv, cblas_ctpsv, cblas_ctrmm, cblas_ctrmv, cblas_ctrsm, cblas_ctrsv, 
     cblas_scnrm2, cblas_scasum,
     cblas_icamax, cblas_icamin, cblas_icmin, cblas_icmax, cblas_scsum,cblas_cimatcopy,cblas_comatcopy
-    );
+    cblas_cgemmt);
 @cblasobjsd = (
     cblas_dasum, cblas_daxpy, cblas_dcopy, cblas_ddot,
     cblas_dgbmv, cblas_dgemm, cblas_dgemv, cblas_dger, cblas_dnrm2,
@@ -69,7 +69,7 @@
     cblas_dsyr2k, cblas_dsyr, cblas_dsyrk, cblas_dtbmv, cblas_dtbsv, cblas_dtpmv, cblas_dtpsv,
     cblas_dtrmm, cblas_dtrmv, cblas_dtrsm, cblas_dtrsv, cblas_daxpby, cblas_dgeadd,
     cblas_idamax, cblas_idamin, cblas_idmin, cblas_idmax, cblas_dsum,cblas_dimatcopy,cblas_domatcopy
-    );
+    cblas_dgemmt);
     
 @cblasobjss = (
     cblas_sasum, cblas_saxpy, cblas_saxpby,
@@ -80,7 +80,7 @@
     cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm,
     cblas_strsv, cblas_sgeadd,
     cblas_isamax, cblas_isamin, cblas_ismin, cblas_ismax, cblas_ssum,cblas_simatcopy,cblas_somatcopy
-    );
+    cblas_sgemmt);
 @cblasobjsz = (
     cblas_dzasum, cblas_dznrm2, cblas_zaxpy, cblas_zcopy, cblas_zdotc, cblas_zdotu, cblas_zdscal,
     cblas_zgbmv, cblas_zgemm, cblas_zgemv, cblas_zgerc, cblas_zgeru, cblas_zhbmv, cblas_zhemm,
@@ -90,7 +90,7 @@
     cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub,
     cblas_zaxpby, cblas_zgeadd,
     cblas_izamax, cblas_izamin, cblas_izmin, cblas_izmax, cblas_dzsum,cblas_zimatcopy,cblas_zomatcopy
-);
+    cblas_zgemmt);
 
 @cblasobjs = (  cblas_xerbla );
 
diff --git a/f_check b/f_check
index d071e016e..81f598ffa 100755
--- a/f_check
+++ b/f_check
@@ -45,7 +45,7 @@ if [ -z "$compiler" ]; then
 	      pathf90 pathf95
 	      pgf95 pgf90 pgf77 pgfortran nvfortran
 	      flang egfortran
-              ifort nagfor ifx ftn crayftn"
+              ifort nagfor ifx ftn crayftn armflang"
 
     for list in $lists; do
         for p in $path; do
@@ -85,7 +85,11 @@ else
 	    *Hewlett*)
 		vendor=CRAY
 		openmp='-fopenmp'
-		;;		
+		;;
+   	    *Arm\ F90*)
+		vendor=FLANG
+		openmp='-fopenmp'
+		;;	
             *GNU*|*GCC*)
 
                 v="${data#*GCC: *\) }"
@@ -101,7 +105,14 @@ else
                         *flang*)
                             vendor=FLANG
                             openmp='-fopenmp'
-                            ;;
+			    # Read the major version from the "version X.Y.Z" banner (on stderr).
+			    data=`$compiler -v 2>&1 > /dev/null `
+			    v="${data#*version *}"
+			    major="${v%%.*}"
+			    # Anything non-numeric counts as 0 so the test below cannot error out.
+			    case "$major" in ''|*[!0-9]*) major=0 ;; esac
+			    if [ "$major" -ge 17 ]; then vendor=FLANGNEW; fi
+			    ;;
                         *ifort*|*ifx*)
                             vendor=INTEL
                             openmp='-fopenmp'
@@ -110,6 +121,9 @@ else
                             vendor=PGI
                             openmp='-mp'
                             ;;
+                        *xlf*)
+                            vendor=IBM
+                            ;;
                         *)
                             vendor=G77
                             openmp=''
@@ -148,6 +162,10 @@ else
             *'IBM XL'*)
                 vendor=IBM
                 openmp='-openmp'
+		case "$CC" in *gcc*)
+		bu=_
+		;;
+		esac
                 ;;
             *NAG*)
                 vendor=NAG
@@ -216,6 +234,10 @@ else
             *ppuf*|*xlf*)
                 vendor=IBM
                 openmp='-openmp'
+		case "$CC" in *gcc*)
+		bu=_
+		;;
+		esac
                 ;;
             *open64*)
                 vendor=OPEN64
@@ -355,13 +377,6 @@ if [ -n "$link" ]; then
     	    ;;
     	esac
 
-    	case "$flag" in *-lgomp*)
-    	    case "$CC" in *clang*)
-    	        flag="-lomp"
-    	        ;;
-    	    esac
-    	esac
-
         case "$flag" in -l*)
             case "$flag" in
                 *ibrary*|*gfortranbegin*|*flangmain*|*frtbegin*|*pathfstart*|\
diff --git a/getarch.c b/getarch.c
index 937a8db68..2b5459a5f 100644
--- a/getarch.c
+++ b/getarch.c
@@ -90,7 +90,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <sys/sysinfo.h>
 #include <unistd.h>
 #endif
-#if defined(AIX)
+#if defined(_AIX)
+#include <unistd.h>
+#include <sys/systemcfg.h>
 #include <sys/sysinfo.h>
 #endif
 
@@ -150,6 +152,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 /* #define FORCE_EV4		*/
 /* #define FORCE_EV5		*/
 /* #define FORCE_EV6		*/
+/* #define FORCE_CSKY		*/
+/* #define FORCE_CK860FV		*/
 /* #define FORCE_GENERIC	*/
 
 #ifdef FORCE_P2
@@ -1677,9 +1681,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define LIBNAME   "c910v"
 #define CORENAME  "C910V"
 #endif
+#endif
+#ifdef FORCE_x280 /* NOTE(review): L1 size was 64536, assumed typo for 65536 - confirm */
+#define FORCE
+#define ARCHITECTURE    "RISCV64"
+#define SUBARCHITECTURE "x280"
+#define SUBDIRNAME      "riscv64"
+#define ARCHCONFIG   "-Dx280 " \
+       "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
+       "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \
+       "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
+#define LIBNAME   "x280"
+#define CORENAME  "x280"
 #else
 #endif
 
+#ifdef FORCE_RISCV64_ZVL256B /* NOTE(review): L1 size was 64536, assumed typo for 65536 - confirm */
+#define FORCE
+#define ARCHITECTURE    "RISCV64"
+#define SUBARCHITECTURE "RISCV64_ZVL256B"
+#define SUBDIRNAME      "riscv64"
+#define ARCHCONFIG   "-DRISCV64_ZVL256B " \
+       "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
+       "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \
+       "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
+#define LIBNAME   "riscv64_zvl256b"
+#define CORENAME  "RISCV64_ZVL256B"
+#endif
+
+#ifdef FORCE_RISCV64_ZVL128B
+#define FORCE
+#define ARCHITECTURE    "RISCV64"
+#define SUBARCHITECTURE "RISCV64_ZVL128B"
+#define SUBDIRNAME      "riscv64"
+#define ARCHCONFIG "-DRISCV64_ZVL128B "                          \
+                   "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \
+                   "-DL2_SIZE=1048576 -DL2_LINESIZE=32 "         \
+                   "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
+#define LIBNAME  "riscv64_zvl128b"
+#define CORENAME "RISCV64_ZVL128B"
+#endif
 
 #if defined(FORCE_E2K) || defined(__e2k__)
 #define FORCE
@@ -1692,6 +1733,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CORENAME  "generic"
 #endif
 
+#ifdef FORCE_CSKY /* every ARCHCONFIG piece ends in a space: the literals are concatenated */
+#define FORCE
+#define ARCHITECTURE    "CSKY"
+#define SUBARCHITECTURE "CSKY"
+#define SUBDIRNAME      "csky"
+#define ARCHCONFIG   "-DCSKY " \
+       "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
+       "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \
+       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
+#define LIBNAME   "csky"
+#define CORENAME  "CSKY"
+#endif
+
+#ifdef FORCE_CK860FV
+#define FORCE
+#define ARCHITECTURE    "CSKY"
+#define SUBARCHITECTURE "CK860V"
+#define SUBDIRNAME      "csky"
+#define ARCHCONFIG   "-DCK860FV " \
+       "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
+       "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \
+       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
+#define LIBNAME   "ck860fv"
+#define CORENAME  "CK860FV"
+#endif
+
+
 #ifndef FORCE
 
 #ifdef USER_TARGET
@@ -1766,7 +1834,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define OPENBLAS_SUPPORTED
 #endif
 
-
 #ifndef OPENBLAS_SUPPORTED
 #error "This arch/CPU is not supported by OpenBLAS."
 #endif
@@ -1805,11 +1872,13 @@ static int get_num_cores(void) {
   
   return count;
 
-#elif defined(AIX)
+#elif defined(_AIX)
   //returns the number of processors which are currently online
   count = sysconf(_SC_NPROCESSORS_ONLN);
   if (count <= 0) count = 2;
-  
+
+  return count;
+
 #else
   return 2;
 #endif
@@ -1831,7 +1900,7 @@ int main(int argc, char *argv[]){
 #ifdef FORCE
     printf("CORE=%s\n", CORENAME);
 #else
-#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__)
+#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__) || defined(__csky__)
     printf("CORE=%s\n", get_corename());
 #endif
 #endif
@@ -1930,15 +1999,15 @@ printf("ELF_VERSION=2\n");
 
 #ifdef MAKE_NB_JOBS
   #if MAKE_NB_JOBS > 0
-    printf("MAKE += -j %d\n", MAKE_NB_JOBS);
+    printf("MAKEFLAGS += -j %d\n", MAKE_NB_JOBS);
   #else
     // Let make use parent -j argument or -j1 if there
     // is no make parent
   #endif
 #elif NO_PARALLEL_MAKE==1
-    printf("MAKE += -j 1\n");
+    printf("MAKEFLAGS += -j 1\n");
 #else
-    printf("MAKE += -j %d\n", get_num_cores());
+    printf("MAKEFLAGS += -j %d\n", get_num_cores());
 #endif
 
     break;
@@ -1979,7 +2048,7 @@ printf("ELF_VERSION=2\n");
 #ifdef FORCE
     printf("#define CHAR_CORENAME \"%s\"\n", CORENAME);
 #else
-#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv)
+#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__csky__)
     printf("#define CHAR_CORENAME \"%s\"\n", get_corename());
 #endif
 #endif
diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt
index 4e082928b..55374674a 100644
--- a/interface/CMakeLists.txt
+++ b/interface/CMakeLists.txt
@@ -119,6 +119,7 @@ endif ()
 if (BUILD_BFLOAT16)
 	GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16")
 	GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16")
+	GenerateNamedObjects("gemmt.c" "" "sbgemmt" ${CBLAS_FLAG} "" "" true "BFLOAT16")
 	GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16")
 	GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16")
 	GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16")
@@ -130,6 +131,8 @@ endif ()
 foreach (float_type ${FLOAT_TYPES})
 
   if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
+    # The Makefile builds ?axpyc from zaxpy.c with -DCONJ; pass the same define here.
+    GenerateNamedObjects("zaxpy.c" "CONJ" "axpyc" ${CBLAS_FLAG} "" "" false ${float_type})
     GenerateNamedObjects("zger.c" "" "geru" ${CBLAS_FLAG} "" "" false ${float_type})
     GenerateNamedObjects("zger.c" "CONJ" "gerc" ${CBLAS_FLAG} "" "" false ${float_type})
     GenerateNamedObjects("zdot.c" "CONJ" "dotc" ${CBLAS_FLAG} "" "" false ${float_type})
diff --git a/interface/Makefile b/interface/Makefile
index 2ac9663d6..048d679d6 100644
--- a/interface/Makefile
+++ b/interface/Makefile
@@ -270,7 +270,8 @@ CSBLAS1OBJS   = \
 	cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \
 	cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \
 	cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \
-	cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX)
+	cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) cblas_samax.$(SUFFIX) \
+	cblas_samin.$(SUFFIX)
 
 CSBLAS2OBJS   = \
 	cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \
@@ -295,7 +296,8 @@ CDBLAS1OBJS   = \
 	cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \
 	cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \
 	cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \
-	cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX)
+	cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) cblas_damax.$(SUFFIX) \
+	cblas_damin.$(SUFFIX)
 
 CDBLAS2OBJS   = \
 	cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \
@@ -315,7 +317,7 @@ CCBLAS1OBJS   = \
 	cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \
 	cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \
 	cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \
-	cblas_caxpby.$(SUFFIX) \
+	cblas_caxpby.$(SUFFIX) cblas_scamax.$(SUFFIX) cblas_caxpyc.$(SUFFIX) cblas_scamin.$(SUFFIX) \
 	cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX)
 
 CCBLAS2OBJS   = \
@@ -340,12 +342,12 @@ CXERBLAOBJ = \
 
 CZBLAS1OBJS   = \
 	cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX)  cblas_zaxpy.$(SUFFIX) \
-	cblas_zcopy.$(SUFFIX) \
+	cblas_zcopy.$(SUFFIX) cblas_dzamax.$(SUFFIX) cblas_dzamin.$(SUFFIX) \
 	cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \
 	cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \
 	cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \
 	cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \
-	cblas_zaxpby.$(SUFFIX) \
+	cblas_zaxpby.$(SUFFIX) cblas_zaxpyc.$(SUFFIX) \
 	cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX)
 
 
@@ -1301,7 +1303,7 @@ xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c
 ifeq ($(BUILD_BFLOAT16),1)
 sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h
 	$(CC) -c $(CFLAGS) $< -o $(@F)
-sbgemmt.$(SUFFIX) sbgemm.$(PSUFFIX) : gemmt.c ../param.h
+sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h
 	$(CC) -c $(CFLAGS) $< -o $(@F)
 endif
 
@@ -1533,6 +1535,30 @@ cblas_icmin.$(SUFFIX) cblas_icmin.$(PSUFFIX) : imax.c
 cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c
 	$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F)
 
+cblas_samax.$(SUFFIX) cblas_samax.$(PSUFFIX) : max.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
+
+cblas_damax.$(SUFFIX) cblas_damax.$(PSUFFIX) : max.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
+
+cblas_scamax.$(SUFFIX) cblas_scamax.$(PSUFFIX) : max.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
+
+cblas_dzamax.$(SUFFIX) cblas_dzamax.$(PSUFFIX) : max.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
+
+cblas_samin.$(SUFFIX) cblas_samin.$(PSUFFIX) : max.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
+
+cblas_damin.$(SUFFIX) cblas_damin.$(PSUFFIX) : max.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
+
+cblas_scamin.$(SUFFIX) cblas_scamin.$(PSUFFIX) : max.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
+
+cblas_dzamin.$(SUFFIX) cblas_dzamin.$(PSUFFIX) : max.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
+
 cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c
 	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
 
@@ -1627,6 +1653,15 @@ cblas_daxpy.$(SUFFIX) cblas_daxpy.$(PSUFFIX) : axpy.c
 cblas_caxpy.$(SUFFIX) cblas_caxpy.$(PSUFFIX) : zaxpy.c
 	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
 
+cblas_caxpyc.$(SUFFIX) cblas_caxpyc.$(PSUFFIX) : zaxpy.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F)
+
+cblas_zaxpyc.$(SUFFIX) cblas_zaxpyc.$(PSUFFIX) : zaxpy.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F)
+
+cblas_xaxpyc.$(SUFFIX) cblas_xaxpyc.$(PSUFFIX) : zaxpy.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F)
+
 cblas_zaxpy.$(SUFFIX) cblas_zaxpy.$(PSUFFIX) : zaxpy.c
 	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
 
@@ -1932,7 +1967,7 @@ cblas_sgemmt.$(SUFFIX) cblas_sgemmt.$(PSUFFIX) : gemmt.c ../param.h
 	$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
 
 ifeq ($(BUILD_BFLOAT16),1)
-cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : gemmt.c ../param.h
+cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h
 	$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
 endif
 
diff --git a/interface/geadd.c b/interface/geadd.c
index f0befa14a..a2e6d1081 100644
--- a/interface/geadd.c
+++ b/interface/geadd.c
@@ -68,7 +68,7 @@ void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
   info = 0;
 
 
-  if (lda < MAX(1, m))	info = 6;
+  if (lda < MAX(1, m))	info = 5;
   if (ldc < MAX(1, m))	info = 8;
 
   if (n < 0)		info = 2;
@@ -117,8 +117,8 @@ void CNAME(enum CBLAS_ORDER order,
 
     if (ldc < MAX(1, m))  info = 8;
     if (lda < MAX(1, m))  info = 5;
-    if (n < 0)		  info = 2;
-    if (m < 0)		  info = 1;
+    if (n < 0)		  info = 1;
+    if (m < 0)		  info = 2;
   }
 
   if (info >= 0) {
diff --git a/interface/gemm.c b/interface/gemm.c
index 71cc77a1b..4778b641b 100644
--- a/interface/gemm.c
+++ b/interface/gemm.c
@@ -154,6 +154,23 @@ static size_t zgemm_small_kernel_b0[] = {
 #endif
 #endif
 
+#if defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16)
+#define XFEATURE_XTILEDATA 18
+#define ARCH_REQ_XCOMP_PERM 0x1023
+static int openblas_amxtile_permission = 0;
+/* Ask the kernel (arch_prctl) for XTILEDATA permission; 0 on success, -1 after a stderr note. */
+static int init_amxtile_permission(void) {
+  long status = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA);
+  if (status != 0) {
+    fprintf(stderr, "XTILEDATA permission not granted in your device(Linux, "
+                    "Intel Sapphire Rapids), skip sbgemm calculation\n");
+    return -1;
+  }
+  openblas_amxtile_permission = 1; /* remembered so callers can skip the syscall next time */
+  return 0;
+}
+#endif
+
 #ifndef CBLAS
 
 void NAME(char *TRANSA, char *TRANSB,
@@ -455,6 +472,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
 
 #endif
 
+#if defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16)
+#if defined(DYNAMIC_ARCH)
+  if (gotoblas->need_amxtile_permission &&
+      openblas_amxtile_permission == 0 && init_amxtile_permission() == -1) {
+    return;
+  }
+#endif
+#if !defined(DYNAMIC_ARCH) && defined(SAPPHIRERAPIDS)
+  if (openblas_amxtile_permission == 0 && init_amxtile_permission() == -1) {
+    return;
+  }
+#endif
+#endif  // defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16)
+
   if ((args.m == 0) || (args.n == 0)) return;
 
 #if 0
diff --git a/interface/gemmt.c b/interface/gemmt.c
index 3eed1dfe4..cae00877e 100644
--- a/interface/gemmt.c
+++ b/interface/gemmt.c
@@ -35,29 +35,26 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include "common.h"
-#ifdef FUNCTION_PROFILE
-#include "functable.h"
-#endif
 
 #ifndef COMPLEX
 #define SMP_THRESHOLD_MIN 65536.0
 #ifdef XDOUBLE
-#define ERROR_NAME "QGEMT "
+#define ERROR_NAME "QGEMMT "
 #elif defined(DOUBLE)
-#define ERROR_NAME "DGEMT "
+#define ERROR_NAME "DGEMMT "
 #elif defined(BFLOAT16)
-#define ERROR_NAME "SBGEMT "
+#define ERROR_NAME "SBGEMMT "
 #else
-#define ERROR_NAME "SGEMT "
+#define ERROR_NAME "SGEMMT "
 #endif
 #else
 #define SMP_THRESHOLD_MIN 8192.0
 #ifdef XDOUBLE
-#define ERROR_NAME "XGEMT "
+#define ERROR_NAME "XGEMMT "
 #elif defined(DOUBLE)
-#define ERROR_NAME "ZGEMT "
+#define ERROR_NAME "ZGEMMT "
 #else
-#define ERROR_NAME "CGEMT "
+#define ERROR_NAME "CGEMMT "
 #endif
 #endif
 
@@ -68,18 +65,22 @@
 #ifndef CBLAS
 
 void NAME(char *UPLO, char *TRANSA, char *TRANSB,
-	  blasint * M, blasint * N, blasint * K,
+	  blasint * M, blasint * K,
 	  FLOAT * Alpha,
 	  IFLOAT * a, blasint * ldA,
 	  IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC)
 {
 
-	blasint m, n, k;
+	blasint m, k;
 	blasint lda, ldb, ldc;
 	int transa, transb, uplo;
 	blasint info;
 
 	char transA, transB, Uplo;
+	blasint nrowa, nrowb;
+#if defined(COMPLEX)
+	blasint ncolb;
+#endif
 	IFLOAT *buffer;
 	IFLOAT *aa, *bb;
 	FLOAT *cc;
@@ -92,7 +93,6 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
 	PRINT_DEBUG_NAME;
 
 	m = *M;
-	n = *N;
 	k = *K;
 
 #if defined(COMPLEX)
@@ -158,33 +158,48 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
 		uplo = 0;
 	if (Uplo == 'L')
 		uplo = 1;
+	
+	nrowa = m;
+	if (transa & 1) nrowa = k;
+	nrowb = k;
+#if defined(COMPLEX)
+	ncolb = m;
+#endif
+	if (transb & 1) {
+		nrowb = m;
+#if defined(COMPLEX)
+		ncolb = k;
+#endif
+	}
 
 	info = 0;
 
-	if (uplo < 0)
-		info = 14;
-	if (ldc < m)
+	if (ldc < MAX(1, m))
 		info = 13;
+	if (ldb < MAX(1, nrowb))
+		info = 10;
+	if (lda < MAX(1, nrowa))
+		info = 8;
 	if (k < 0)
 		info = 5;
-	if (n < 0)
-		info = 4;
 	if (m < 0)
-		info = 3;
+		info = 4;
 	if (transb < 0)
-		info = 2;
+		info = 3;
 	if (transa < 0)
+		info = 2;
+	if (uplo < 0)
 		info = 1;
 
-	if (info) {
+	if (info != 0) {
 		BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
 		return;
 	}
 #else
 
 void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
-	   enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M,
-	   blasint N, blasint k,
+	   enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint m,
+	   blasint k,
 #ifndef COMPLEX
 	   FLOAT alpha,
 	   IFLOAT * A, blasint LDA,
@@ -205,17 +220,23 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 
 	int transa, transb, uplo;
 	blasint info;
-	blasint m, n, lda, ldb;
+	blasint lda, ldb;
 	FLOAT *a, *b;
+#if defined(COMPLEX)
+	blasint nrowb, ncolb;
+#endif
 	XFLOAT *buffer;
 
 	PRINT_DEBUG_CNAME;
 
+	uplo = -1;
 	transa = -1;
 	transb = -1;
 	info = 0;
 
 	if (order == CblasColMajor) {
+		if (Uplo == CblasUpper) uplo = 0;
+		if (Uplo == CblasLower) uplo = 1;
 
 		if (TransA == CblasNoTrans)
 			transa = 0;
@@ -248,9 +269,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 			transb = 3;
 #endif
 
-		m = M;
-		n = N;
-
 		a = (void *)A;
 		b = (void *)B;
 		lda = LDA;
@@ -258,23 +276,42 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 
 		info = -1;
 
-		if (ldc < m)
+		blasint nrowa;
+#if !defined(COMPLEX)
+		blasint nrowb;
+#endif
+		nrowa = m;
+		if (transa & 1) nrowa = k;
+		nrowb = k;
+#if defined(COMPLEX)
+		ncolb = m;
+#endif
+		if (transb & 1) {
+			nrowb = m;
+#if defined(COMPLEX)
+			ncolb = k;
+#endif
+		}
+
+		if (ldc < MAX(1, m))
 			info = 13;
+		if (ldb < MAX(1, nrowb))
+			info = 10;
+		if (lda < MAX(1, nrowa))
+			info = 8;
 		if (k < 0)
 			info = 5;
-		if (n < 0)
-			info = 4;
 		if (m < 0)
-			info = 3;
+			info = 4;
 		if (transb < 0)
-			info = 2;
+			info = 3;
 		if (transa < 0)
+			info = 2;
+		if (uplo < 0)
 			info = 1;
 	}
 
 	if (order == CblasRowMajor) {
-		m = N;
-		n = M;
 
 		a = (void *)B;
 		b = (void *)A;
@@ -282,6 +319,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 		lda = LDB;
 		ldb = LDA;
 
+		if (Uplo == CblasUpper) uplo = 0;
+		if (Uplo == CblasLower) uplo = 1;
+
 		if (TransB == CblasNoTrans)
 			transa = 0;
 		if (TransB == CblasTrans)
@@ -315,29 +355,42 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 
 		info = -1;
 
-		if (ldc < m)
+		blasint ncola; 
+#if !defined(COMPLEX)
+		blasint ncolb;
+#endif
+		ncola = m;
+		if (transa & 1) ncola = k;
+		ncolb = k;
+#if defined(COMPLEX)
+		nrowb = m;
+#endif
+
+		if (transb & 1) {
+#if defined(COMPLEX)
+			nrowb = k;
+#endif
+			ncolb = m;
+		}
+
+		if (ldc < MAX(1,m))
 			info = 13;
+		if (ldb < MAX(1, ncolb))
+			info = 8;
+		if (lda < MAX(1, ncola))
+			info = 10;
 		if (k < 0)
 			info = 5;
-		if (n < 0)
-			info = 4;
 		if (m < 0)
-			info = 3;
+			info = 4;
 		if (transb < 0)
 			info = 2;
 		if (transa < 0)
+			info = 3;
+		if (uplo < 0)
 			info = 1;
-
 	}
 
-	uplo = -1;
-	if (Uplo == CblasUpper)
-		uplo = 0;
-	if (Uplo == CblasLower)
-		uplo = 1;
-	if (uplo < 0)
-		info = 14;
-
 	if (info >= 0) {
 		BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
 		return;
@@ -407,37 +460,48 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 
 #endif
 
-	if ((m == 0) || (n == 0))
+	if (m == 0)
 		return;
 
 	IDEBUG_START;
 
-	FUNCTION_PROFILE_START();
+#if defined(COMPLEX)
+	if (transb > 1){
+#ifndef CBLAS
+		IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb);
+#else
+		if (order == CblasColMajor)
+			IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb);
+		if (order == CblasRowMajor)
+			IMATCOPY_K_RNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb);
+#endif
+	}
+#endif
 
-	const blasint incb = (transb == 0) ? 1 : ldb;
+	const blasint incb = ((transb & 1) == 0) ? 1 : ldb;
 
 	if (uplo == 1) {
-		for (i = 0; i < n; i++) {
-			j = n - i;
+		for (i = 0; i < m; i++) {
+			j = m - i;
 
 			l = j;
 #if defined(COMPLEX)
 			aa = a + i * 2;
 			bb = b + i * ldb * 2;
-			if (transa) {
-				l = k;
+			if (transa & 1) {
 				aa = a + lda * i * 2;
-				bb = b + i * 2;
 			}
+			if (transb & 1)
+				bb = b + i * 2;
 			cc = c + i * 2 * ldc + i * 2;
 #else
 			aa = a + i;
 			bb = b + i * ldb;
-			if (transa) {
-				l = k;
+			if (transa & 1) {
 				aa = a + lda * i;
-				bb = b + i;
 			}
+			if (transb & 1)
+				bb = b + i;
 			cc = c + i * ldc + i;
 #endif
 
@@ -447,7 +511,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 				       NULL, 0);
 
 			if (alpha_r == ZERO && alpha_i == ZERO)
-				return;
+				continue;
 #else
 			if (beta != ONE)
 				SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
@@ -458,15 +522,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 
 			IDEBUG_START;
 
-			FUNCTION_PROFILE_START();
-
-			buffer_size = j + k + 128 / sizeof(FLOAT);
+			buffer_size = 2 * (j + k) + 128 / sizeof(FLOAT);
 #ifdef WINDOWS_ABI
 			buffer_size += 160 / sizeof(FLOAT);
 #endif
 			// for alignment
 			buffer_size = (buffer_size + 3) & ~3;
-			STACK_ALLOC(buffer_size, FLOAT, buffer);
+			STACK_ALLOC(buffer_size, IFLOAT, buffer);
 
 #ifdef SMP
 
@@ -479,20 +541,34 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 #endif
 
 #if defined(COMPLEX)
+				if (!(transa & 1))
 				(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
 						     aa, lda, bb, incb, cc, 1,
 						     buffer);
+				else
+				(gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i,
+						     aa, lda, bb, incb, cc, 1,
+						     buffer);
 #else
+				if (!(transa & 1))
 				(gemv[(int)transa]) (j, k, 0, alpha, aa, lda,
 						     bb, incb, cc, 1, buffer);
+				else
+				(gemv[(int)transa]) (k, j, 0, alpha, aa, lda,
+						     bb, incb, cc, 1, buffer);
 #endif
 #ifdef SMP
 			} else {
-
+				if (!(transa & 1))
 				(gemv_thread[(int)transa]) (j, k, alpha, aa,
 							    lda, bb, incb, cc,
 							    1, buffer,
 							    nthreads);
+				else
+				(gemv_thread[(int)transa]) (k, j, alpha, aa,
+							    lda, bb, incb, cc,
+							    1, buffer,
+							    nthreads);
 
 			}
 #endif
@@ -501,21 +577,19 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 		}
 	} else {
 
-		for (i = 0; i < n; i++) {
+		for (i = 0; i < m; i++) {
 			j = i + 1;
 
 			l = j;
 #if defined COMPLEX
 			bb = b + i * ldb * 2;
-			if (transa) {
-				l = k;
+			if (transb & 1) {
 				bb = b + i * 2;
 			}
 			cc = c + i * 2 * ldc;
 #else
 			bb = b + i * ldb;
-			if (transa) {
-				l = k;
+			if (transb & 1) {
 				bb = b + i;
 			}
 			cc = c + i * ldc;
@@ -527,7 +601,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 				       NULL, 0);
 
 			if (alpha_r == ZERO && alpha_i == ZERO)
-				return;
+				continue;
 #else
 			if (beta != ONE)
 				SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
@@ -537,15 +611,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 #endif
 			IDEBUG_START;
 
-			FUNCTION_PROFILE_START();
-
-			buffer_size = j + k + 128 / sizeof(FLOAT);
+			buffer_size = 2 * (j + k) + 128 / sizeof(FLOAT);
 #ifdef WINDOWS_ABI
 			buffer_size += 160 / sizeof(FLOAT);
 #endif
 			// for alignment
 			buffer_size = (buffer_size + 3) & ~3;
-			STACK_ALLOC(buffer_size, FLOAT, buffer);
+			STACK_ALLOC(buffer_size, IFLOAT, buffer);
 
 #ifdef SMP
 
@@ -558,30 +630,39 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 #endif
 
 #if defined(COMPLEX)
+				if (!(transa & 1))
 				(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
 						     a, lda, bb, incb, cc, 1,
 						     buffer);
+				else
+				(gemv[(int)transa]) (k, j, 0, alpha_r, alpha_i,
+						     a, lda, bb, incb, cc, 1,
+						     buffer);
 #else
+				if (!(transa & 1))
 				(gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb,
 						     incb, cc, 1, buffer);
+				else
+				(gemv[(int)transa]) (k, j, 0, alpha, a, lda, bb,
+						     incb, cc, 1, buffer);
 #endif
 
 #ifdef SMP
 			} else {
-
+				if (!(transa & 1))
 				(gemv_thread[(int)transa]) (j, k, alpha, a, lda,
 							    bb, incb, cc, 1,
 							    buffer, nthreads);
-
+				else
+				(gemv_thread[(int)transa]) (k, j, alpha, a, lda,
+							    bb, incb, cc, 1,
+							    buffer, nthreads);
 			}
 #endif
 
 			STACK_FREE(buffer);
 		}
 	}
-	FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE,
-			     args.m * args.k + args.k * args.n +
-			     args.m * args.n, 2 * args.m * args.n * args.k);
 
 	IDEBUG_END;
 
diff --git a/interface/gemv.c b/interface/gemv.c
index 1f0763579..2c121f130 100644
--- a/interface/gemv.c
+++ b/interface/gemv.c
@@ -226,7 +226,7 @@ void CNAME(enum CBLAS_ORDER order,
 
 #ifdef SMP
 
-  if ( 1L * m * n < 2304L * GEMM_MULTITHREAD_THRESHOLD )
+  if ( 1L * m * n < 115200L * GEMM_MULTITHREAD_THRESHOLD )
     nthreads = 1;
   else
     nthreads = num_cpu_avail(2);
diff --git a/interface/imatcopy.c b/interface/imatcopy.c
index 91975f7f4..69876e31e 100644
--- a/interface/imatcopy.c
+++ b/interface/imatcopy.c
@@ -100,37 +100,42 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
 
 	if ( order == BlasColMajor)
 	{
-        	if ( trans == BlasNoTrans  &&  *ldb < *rows ) info = 9;
-        	if ( trans == BlasTrans    &&  *ldb < *cols ) info = 9;
+        	if ( trans == BlasNoTrans  &&  *ldb < MAX(1,*rows) ) info = 8;
+        	if ( trans == BlasTrans    &&  *ldb < MAX(1,*cols) ) info = 8;
 	}
 	if ( order == BlasRowMajor)
 	{
-        	if ( trans == BlasNoTrans  &&  *ldb < *cols ) info = 9;
-        	if ( trans == BlasTrans    &&  *ldb < *rows ) info = 9;
+        	if ( trans == BlasNoTrans  &&  *ldb < MAX(1,*cols) ) info = 8;
+        	if ( trans == BlasTrans    &&  *ldb < MAX(1,*rows) ) info = 8;
 	}
 
-	if ( order == BlasColMajor &&  *lda < *rows ) info = 7;
-	if ( order == BlasRowMajor &&  *lda < *cols ) info = 7;
-	if ( *cols <= 0 ) info = 4;
-	if ( *rows <= 0 ) info = 3;
-	if ( trans < 0  ) info = 2;
-	if ( order < 0  ) info = 1;
+	if ( order == BlasColMajor &&  *lda < MAX(1,*rows) ) info = 7;
+	if ( order == BlasRowMajor &&  *lda < MAX(1,*cols) ) info = 7;
+	if ( *cols < 0 ) info = 4;
+	if ( *rows < 0 ) info = 3;
+	if ( trans < 0 ) info = 2;
+	if ( order < 0 ) info = 1;
 
 	if (info >= 0) {
     		BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
     		return;
   	}
+
+	if ((*rows == 0) || (*cols == 0)) return;
+
 #ifdef NEW_IMATCOPY
-    if ( *lda == *ldb && *rows == *cols) {
+    if ( *lda == *ldb ) {
         if ( order == BlasColMajor )
         {
             if ( trans == BlasNoTrans )
             {
                 IMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda );
+                return;
             }
-            else
+            else if ( *rows == *cols )
             {
                 IMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda );
+                return;
             }
         }
         else
@@ -138,26 +143,26 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
             if ( trans == BlasNoTrans )
             {
                 IMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda );
+                return;
             }
-            else
+            else if ( *rows == *cols )
             {
                 IMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda );
+                return;
             }
         }
-        return; 
     }
-
 #endif
 
-	if ( *lda >  *ldb )
-		msize = (size_t)(*lda) * (*ldb)  * sizeof(FLOAT);
-	else
-		msize = (size_t)(*ldb) * (*ldb)  * sizeof(FLOAT);
+	if ( *rows >  *cols )
+            msize = (size_t)(*rows) * (*ldb)  * sizeof(FLOAT);
+    else
+            msize = (size_t)(*cols) * (*ldb)  * sizeof(FLOAT);
 
 	b = malloc(msize);
 	if ( b == NULL )
 	{
-		printf("Memory alloc failed\n");
+		printf("Memory alloc failed in imatcopy\n");
 		exit(1);
 	}
 
@@ -165,26 +170,26 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
 	{
 		if ( trans == BlasNoTrans )
 		{
-			OMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda, b, *ldb );
-			OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0 , b, *ldb, a, *ldb );
+			OMATCOPY_K_CN(*rows, *cols, *alpha, a, *lda, b, *rows );
+			OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0 , b, *rows, a, *ldb );
 		}
 		else
 		{
-			OMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda, b, *ldb );
-			OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, b, *ldb, a, *ldb );
+			OMATCOPY_K_CT(*rows, *cols, *alpha, a, *lda, b, *cols );
+			OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, b, *cols, a, *ldb );
 		}
 	}
 	else
 	{
 		if ( trans == BlasNoTrans )
 		{
-			OMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda, b, *ldb );
-			OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, b, *ldb, a, *ldb );
+			OMATCOPY_K_RN(*rows, *cols, *alpha, a, *lda, b, *cols );
+			OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, b, *cols, a, *ldb );
 		}
 		else
 		{
-			OMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda, b, *ldb );
-			OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, b, *ldb, a, *ldb );
+			OMATCOPY_K_RT(*rows, *cols, *alpha, a, *lda, b, *rows );
+			OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, b, *rows, a, *ldb );
 		}
 	}
 
diff --git a/interface/lapack/gesv.c b/interface/lapack/gesv.c
index 175350329..546c2bed2 100644
--- a/interface/lapack/gesv.c
+++ b/interface/lapack/gesv.c
@@ -114,7 +114,14 @@ int NAME(blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv,
 
 #ifdef SMP
   args.common = NULL;
-  args.nthreads = num_cpu_avail(4);
+#ifndef DOUBLE
+  if (args.m*args.n < 40000)
+#else
+  if (args.m*args.n < 10000)
+#endif
+	args.nthreads=1;
+  else
+         args.nthreads = num_cpu_avail(4);
 
   if (args.nthreads == 1) {
 #endif
diff --git a/interface/lapack/laswp.c b/interface/lapack/laswp.c
index 0dde33ae3..6544dbc5b 100644
--- a/interface/lapack/laswp.c
+++ b/interface/lapack/laswp.c
@@ -97,7 +97,7 @@ int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint *
 
   blas_level1_thread(mode, n, k1, k2, dummyalpha,
 		     a, lda, NULL, 0, ipiv, incx,
-		     (int(*)())laswp[flag], nthreads);
+		     (int(*)(void))laswp[flag], nthreads);
   }
 #endif
 
diff --git a/interface/lapack/zlaswp.c b/interface/lapack/zlaswp.c
index b77a40985..7bb4a659e 100644
--- a/interface/lapack/zlaswp.c
+++ b/interface/lapack/zlaswp.c
@@ -96,7 +96,7 @@ int NAME(blasint *N, FLOAT *a, blasint *LDA, blasint *K1, blasint *K2, blasint *
   mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
 #endif
 
-  blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, (int(*)())laswp[flag], nthreads);
+  blas_level1_thread(mode, n, k1, k2, dummyalpha, a, lda, NULL, 0, ipiv, incx, (int(*)(void))laswp[flag], nthreads);
   }
 #endif
 
diff --git a/interface/max.c b/interface/max.c
index f05977448..7817601b9 100644
--- a/interface/max.c
+++ b/interface/max.c
@@ -46,6 +46,12 @@
 
 #ifdef USE_ABS
 
+#if defined(DOUBLE)
+#define ABS fabs
+#else
+#define ABS fabsf
+#endif
+
 #ifndef USE_MIN
 
 /* ABS & MAX */
@@ -92,6 +98,8 @@
 
 #else
 
+#define ABS
+
 #ifndef USE_MIN
 
 /* MAX */
@@ -130,6 +138,12 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
 
   if (n <= 0) return 0;
 
+#ifndef COMPLEX
+  if (incx == 0) return (ABS(*x));
+#else
+  if (incx == 0) return (ABS(*x) + ABS(*(x+1)));
+#endif
+
   IDEBUG_START;
 
   FUNCTION_PROFILE_START();
@@ -145,14 +159,25 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
 
 #else
 
+#ifdef COMPLEX
+FLOAT CNAME(blasint n, void *vx, blasint incx){
+  FLOAT *x = (FLOAT*) vx;
+#else
 FLOAT CNAME(blasint n, FLOAT *x, blasint incx){
-
+#endif
+  
   FLOAT ret;
 
   PRINT_DEBUG_CNAME;
 
   if (n <= 0) return 0;
 
+#ifndef COMPLEX
+  if (incx == 0) return (ABS(*x));
+#else
+  if (incx == 0) return (ABS(*x) + ABS(*(x+1)));
+#endif
+
   IDEBUG_START;
 
   FUNCTION_PROFILE_START();
diff --git a/interface/nrm2.c b/interface/nrm2.c
index dc8c08e9a..331ebc3d0 100644
--- a/interface/nrm2.c
+++ b/interface/nrm2.c
@@ -54,6 +54,21 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
 
   if (n <= 0) return 0.;
 
+#ifndef COMPLEX
+  if (n == 1)
+#ifdef DOUBLE
+    return fabs(x[0]);
+#else
+    return fabsf(x[0]);
+#endif
+#endif
+
+  if (incx < 0) 
+#ifdef COMPLEX    
+    x -= (n - 1) * incx * 2;
+#else
+    x -= (n - 1) * incx;
+#endif
   IDEBUG_START;
 
   FUNCTION_PROFILE_START();
@@ -82,6 +97,22 @@ FLOAT CNAME(blasint n, FLOAT *x, blasint incx){
 
   if (n <= 0) return 0.;
 
+  #ifndef COMPLEX
+  if (n == 1)
+#ifdef DOUBLE
+    return fabs(x[0]);
+#else
+    return fabsf(x[0]);
+#endif
+#endif
+
+  if (incx < 0) 
+#ifdef COMPLEX    
+    x -= (n - 1) * incx * 2;
+#else
+    x -= (n - 1) * incx;
+#endif
+  
   IDEBUG_START;
 
   FUNCTION_PROFILE_START();
diff --git a/interface/omatcopy.c b/interface/omatcopy.c
index 59650cfa0..c26446f5c 100644
--- a/interface/omatcopy.c
+++ b/interface/omatcopy.c
@@ -90,27 +90,29 @@ void CNAME(enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
 #endif
 	if ( order == BlasColMajor)
 	{
-        	if ( trans == BlasNoTrans  &&  *ldb < *rows ) info = 9;
-        	if ( trans == BlasTrans    &&  *ldb < *cols ) info = 9;
+        	if ( trans == BlasNoTrans  &&  *ldb < MAX(1,*rows) ) info = 9;
+        	if ( trans == BlasTrans    &&  *ldb < MAX(1,*cols) ) info = 9;
 	}
 	if ( order == BlasRowMajor)
 	{
-        	if ( trans == BlasNoTrans  &&  *ldb < *cols ) info = 9;
-        	if ( trans == BlasTrans    &&  *ldb < *rows ) info = 9;
+        	if ( trans == BlasNoTrans  &&  *ldb < MAX(1,*cols) ) info = 9;
+        	if ( trans == BlasTrans    &&  *ldb < MAX(1,*rows) ) info = 9;
 	}
 
-	if ( order == BlasColMajor &&  *lda < *rows ) info = 7;
-	if ( order == BlasRowMajor &&  *lda < *cols ) info = 7;
-	if ( *cols <= 0 ) info = 4;
-	if ( *rows <= 0 ) info = 3;
-	if ( trans < 0  ) info = 2;
-	if ( order < 0  ) info = 1;
+	if ( order == BlasColMajor &&  *lda < MAX(1,*rows) ) info = 7;
+	if ( order == BlasRowMajor &&  *lda < MAX(1,*cols) ) info = 7;
+	if ( *cols < 0 ) info = 4;
+	if ( *rows < 0 ) info = 3;
+	if ( trans < 0 ) info = 2;
+	if ( order < 0 ) info = 1;
 
 	if (info >= 0) {
     		BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
     		return;
   	}
 
+	if ((*rows == 0) || (*cols == 0)) return;
+
 	if ( order == BlasColMajor )
 	{
 		if ( trans == BlasNoTrans )
diff --git a/interface/rotg.c b/interface/rotg.c
index 69443a5a0..423ebda21 100644
--- a/interface/rotg.c
+++ b/interface/rotg.c
@@ -1,9 +1,11 @@
 #include <math.h>
+#include <float.h>
 #include "common.h"
 #ifdef FUNCTION_PROFILE
 #include "functable.h"
 #endif
 
+
 #ifndef CBLAS
 
 void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
@@ -14,17 +16,27 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
 
 #endif
 
+#ifdef DOUBLE
+  long double safmin = DBL_MIN;
+#else
+  long double safmin = FLT_MIN;
+#endif
+
 #if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) || defined(_M_IX86)
 
   long double da = *DA;
   long double db = *DB;
   long double c;
   long double s;
-  long double r, roe, z;
+  long double r, z;
+  long double sigma, dascal,dbscal;
 
   long double ada = fabsl(da);
   long double adb = fabsl(db);
-  long double scale = ada + adb;
+  long double maxab = MAX(ada,adb);
+  long double safmax;
+  long double scale;
+
 
 #ifndef CBLAS
   PRINT_DEBUG_NAME;
@@ -32,27 +44,30 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
   PRINT_DEBUG_CNAME;
 #endif
 
-  roe = db;
-  if (ada > adb) roe = da;
-
-  if (scale == ZERO) {
+  if (adb == ZERO) {
     *C = ONE;
     *S = ZERO;
-    *DA = ZERO;
     *DB = ZERO;
+  } else if (ada == ZERO) {
+    *C = ZERO;
+    *S = ONE;
+    *DA = *DB;
+    *DB = ONE;
   } else {
-    r = sqrt(da * da + db * db);
-    if (roe < 0) r = -r;
+  safmax = 1./safmin;
+  scale = MIN(MAX(safmin,maxab), safmax);
+    if (ada > adb)
+	sigma = copysign(1.,da);
+    else
+	sigma = copysign(1.,db);
+    dascal = da / scale;
+    dbscal = db / scale;
+    r = sigma * (scale * sqrt(dascal * dascal + dbscal * dbscal));
     c = da / r;
     s = db / r;
     z = ONE;
-    if (da != ZERO) {
-      if (ada > adb){
-	z = s;
-      } else {
-	z = ONE / c;
-      }
-    }
+    if (ada > adb) z = s;
+    if ((ada <= adb) && (c != ZERO)) z = ONE / c;
 
     *C = c;
     *S = s;
@@ -65,11 +80,22 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
   FLOAT db = *DB;
   FLOAT c  = *C;
   FLOAT s  = *S;
-  FLOAT r, roe, z;
+  FLOAT sigma;
+  FLOAT r, z;
 
   FLOAT ada = fabs(da);
   FLOAT adb = fabs(db);
-  FLOAT scale = ada + adb;
+  FLOAT maxab = MAX(ada,adb);
+  long double safmax ;
+  FLOAT scale ;
+
+  safmax = 1./safmin;
+  scale = MIN(MAX(safmin,maxab), safmax);
+
+  if (ada > adb)
+	sigma = copysign(1.,da);
+    else
+	sigma = copysign(1.,db);
 
 #ifndef CBLAS
   PRINT_DEBUG_NAME;
@@ -77,20 +103,21 @@ void CNAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
   PRINT_DEBUG_CNAME;
 #endif
 
-  roe = db;
-  if (ada > adb) roe = da;
 
-  if (scale == ZERO) {
+  if (adb == ZERO) {
     *C = ONE;
     *S = ZERO;
-    *DA = ZERO;
     *DB = ZERO;
+  } else if (ada == ZERO) {
+    *C = ZERO;
+    *S = ONE;
+    *DA = *DB;
+    *DB = ONE;
   } else {
     FLOAT aa = da / scale;
     FLOAT bb = db / scale;
 
-    r = scale * sqrt(aa * aa + bb * bb);
-    if (roe < 0) r = -r;
+    r = sigma * scale * sqrt(aa * aa + bb * bb);
     c = da / r;
     s = db / r;
     z = ONE;
diff --git a/interface/rotmg.c b/interface/rotmg.c
index 3a5ca8f95..b8f627221 100644
--- a/interface/rotmg.c
+++ b/interface/rotmg.c
@@ -96,12 +96,6 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
 	else
 	{
 		dp2 = *dd2 * dy1;
-		if(dp2 == ZERO)
-		{
-			dflag = -TWO;
-			dparam[0] = dflag;
-			return;
-		}
 		dp1 = *dd1 * *dx1;
 		dq2 =  dp2 * dy1;
 		dq1 =  dp1 * *dx1;
@@ -113,24 +107,10 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
 			dh12 =    dp2 /  dp1;
 
 			du   = ONE - dh12 * dh21;
-			if(du > ZERO)
-			{
-				dflag = ZERO;
-				*dd1  = *dd1 / du;
-				*dd2  = *dd2 / du;
-				*dx1  = *dx1 * du;
-			} else {
-				dflag = -ONE;
-
-				dh11  = ZERO;
-				dh12  = ZERO;
-				dh21  = ZERO;
-				dh22  = ZERO;
-
-				*dd1  = ZERO;
-				*dd2  = ZERO;
-				*dx1  = ZERO;
-			}
+			dflag = ZERO;
+			*dd1  = *dd1 / du;
+			*dd2  = *dd2 / du;
+			*dx1  = *dx1 * du;
 			
 		}
 		else
diff --git a/interface/sbgemmt.c b/interface/sbgemmt.c
new file mode 100644
index 000000000..759af4bfb
--- /dev/null
+++ b/interface/sbgemmt.c
@@ -0,0 +1,447 @@
+/*********************************************************************/
+/* Copyright 2024, The OpenBLAS Project.                             */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/*********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "common.h"
+
+#define SMP_THRESHOLD_MIN 65536.0
+#define ERROR_NAME "SBGEMMT "
+
+#ifndef GEMM_MULTITHREAD_THRESHOLD
+#define GEMM_MULTITHREAD_THRESHOLD 4
+#endif
+
+/*
+ * SBGEMMT driver: updates one triangle of the m x m matrix C, one column
+ * at a time, by handing each column to an SBGEMV kernel.
+ *
+ * uplo selects the triangle (0 = upper, 1 = lower).  For column i the lower
+ * case covers rows i..m-1 (m - i entries) and the upper case covers rows
+ * 0..i (i + 1 entries).  transa/transb are 0 for 'N'/'R' and 1 for 'T'/'C';
+ * conjugated variants are treated like their plain counterparts.
+ *
+ * A and B hold IFLOAT elements, C holds FLOAT elements.  In the CBLAS
+ * row-major case A/B, their leading dimensions and their transpose flags
+ * are swapped before the shared column loop runs.
+ * NOTE(review): uplo is not flipped together with that swap; verify that
+ * row-major callers get the triangle they asked for.
+ *
+ * Invalid arguments are reported through xerbla and the call returns
+ * without touching C.  The reported position uses the Fortran argument
+ * order: 1 uplo, 2 transa, 3 transb, 4 m, 5 k, 8 lda, 10 ldb, 13 ldc.
+ * An unrecognised CBLAS order is reported with position 0.
+ * m == 0 is a no-op.
+ *
+ * NOTE(review): the SBGEMV kernels are assumed to compute
+ * y := alpha * op(A) * x + beta * y; confirm against the kernel sources.
+ */
+#ifndef CBLAS
+
+void NAME(char *UPLO, char *TRANSA, char *TRANSB,
+	  blasint * M, blasint * K,
+	  FLOAT * Alpha,
+	  IFLOAT * a, blasint * ldA,
+	  IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC)
+{
+
+	blasint m, k;
+	blasint lda, ldb, ldc;
+	int transa, transb, uplo;
+	blasint info;
+
+	char transA, transB, Uplo;
+	blasint nrowa, nrowb;
+	IFLOAT *aa, *bb;
+	FLOAT *cc;
+	FLOAT alpha, beta;
+
+	PRINT_DEBUG_NAME;
+
+	m = *M;
+	k = *K;
+
+	alpha = *Alpha;
+	beta = *Beta;
+
+	lda = *ldA;
+	ldb = *ldB;
+	ldc = *ldC;
+
+	transA = *TRANSA;
+	transB = *TRANSB;
+	Uplo = *UPLO;
+	TOUPPER(transA);
+	TOUPPER(transB);
+	TOUPPER(Uplo);
+
+	transa = -1;
+	transb = -1;
+	uplo = -1;
+
+	if (transA == 'N')
+		transa = 0;
+	if (transA == 'T')
+		transa = 1;
+
+	if (transA == 'R')
+		transa = 0;
+	if (transA == 'C')
+		transa = 1;
+
+	if (transB == 'N')
+		transb = 0;
+	if (transB == 'T')
+		transb = 1;
+
+	if (transB == 'R')
+		transb = 0;
+	if (transB == 'C')
+		transb = 1;
+
+	if (Uplo == 'U')
+		uplo = 0;
+	if (Uplo == 'L')
+		uplo = 1;
+	nrowa = m;
+	if (transa & 1) nrowa = k;
+	nrowb = k;
+	if (transb & 1) nrowb = m;
+
+	info = 0;
+
+	if (ldc < MAX(1, m))
+		info = 13;
+	if (ldb < MAX(1, nrowb))
+		info = 10;
+	if (lda < MAX(1, nrowa))
+		info = 8;
+	if (k < 0)
+		info = 5;
+	if (m < 0)
+		info = 4;
+	if (transb < 0)
+		info = 3;
+	if (transa < 0)
+		info = 2;
+	if (uplo < 0)
+		info = 1;
+
+	if (info != 0) {
+		BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
+		return;
+	}
+#else
+
+void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
+	   enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint m,
+	   blasint k,
+	   FLOAT alpha,
+	   IFLOAT * A, blasint LDA,
+	   IFLOAT * B, blasint LDB, FLOAT beta, FLOAT * c, blasint ldc)
+{
+	IFLOAT *aa, *bb;
+	FLOAT *cc;
+
+	int transa, transb, uplo;
+	blasint info;
+	blasint lda, ldb;
+	IFLOAT *a, *b;
+
+	PRINT_DEBUG_CNAME;
+
+	uplo = -1;
+	transa = -1;
+	transb = -1;
+	info = 0;
+
+	if (order == CblasColMajor) {
+		if (Uplo == CblasUpper) uplo = 0;
+		if (Uplo == CblasLower) uplo = 1;
+
+		if (TransA == CblasNoTrans)
+			transa = 0;
+		if (TransA == CblasTrans)
+			transa = 1;
+
+		if (TransA == CblasConjNoTrans)
+			transa = 0;
+		if (TransA == CblasConjTrans)
+			transa = 1;
+
+		if (TransB == CblasNoTrans)
+			transb = 0;
+		if (TransB == CblasTrans)
+			transb = 1;
+
+		if (TransB == CblasConjNoTrans)
+			transb = 0;
+		if (TransB == CblasConjTrans)
+			transb = 1;
+
+		a = (void *)A;
+		b = (void *)B;
+		lda = LDA;
+		ldb = LDB;
+
+		info = -1;
+
+		blasint nrowa;
+		blasint nrowb;
+		nrowa = m;
+		if (transa & 1) nrowa = k;
+		nrowb = k;
+		if (transb & 1)  nrowb = m;
+
+		if (ldc < MAX(1, m))
+			info = 13;
+		if (ldb < MAX(1, nrowb))
+			info = 10;
+		if (lda < MAX(1, nrowa))
+			info = 8;
+		if (k < 0)
+			info = 5;
+		if (m < 0)
+			info = 4;
+		if (transb < 0)
+			info = 3;
+		if (transa < 0)
+			info = 2;
+		if (uplo < 0)
+			info = 1;
+	}
+
+	if (order == CblasRowMajor) {
+
+		a = (void *)B;
+		b = (void *)A;
+
+		lda = LDB;
+		ldb = LDA;
+
+		if (Uplo == CblasUpper) uplo = 0;
+		if (Uplo == CblasLower) uplo = 1;
+
+		if (TransB == CblasNoTrans)
+			transa = 0;
+		if (TransB == CblasTrans)
+			transa = 1;
+
+		if (TransB == CblasConjNoTrans)
+			transa = 0;
+		if (TransB == CblasConjTrans)
+			transa = 1;
+
+		if (TransA == CblasNoTrans)
+			transb = 0;
+		if (TransA == CblasTrans)
+			transb = 1;
+
+		if (TransA == CblasConjNoTrans)
+			transb = 0;
+		if (TransA == CblasConjTrans)
+			transb = 1;
+
+		info = -1;
+
+		blasint ncola;
+		blasint ncolb;
+
+		ncola = m;
+		if (transa & 1) ncola = k;
+		ncolb = k;
+
+		if (transb & 1) {
+			ncolb = m;
+		}
+
+		if (ldc < MAX(1,m))
+			info = 13;
+		if (ldb < MAX(1, ncolb))
+			info = 8;
+		if (lda < MAX(1, ncola))
+			info = 10;
+		if (k < 0)
+			info = 5;
+		if (m < 0)
+			info = 4;
+		if (transb < 0)
+			info = 2;
+		if (transa < 0)
+			info = 3;
+		if (uplo < 0)
+			info = 1;
+	}
+
+	if (info >= 0) {
+		BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
+		return;
+	}
+
+#endif
+	blasint i, j;
+
+#ifdef SMP
+	int nthreads;
+
+	/* Threaded SBGEMV drivers, indexed by transa (0 = N, 1 = T). */
+	static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT, IFLOAT *,
+				     BLASLONG, IFLOAT *, BLASLONG, FLOAT,
+				     FLOAT *, BLASLONG, int) = {
+		sbgemv_thread_n, sbgemv_thread_t,
+	};
+#endif
+	/* Single-threaded SBGEMV kernels, indexed the same way. */
+	int (*gemv[]) (BLASLONG, BLASLONG, FLOAT, IFLOAT *, BLASLONG,
+		       IFLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG) = {
+	SBGEMV_N, SBGEMV_T,};
+
+	if (m == 0)
+		return;
+
+	IDEBUG_START;
+
+	/* Stride between consecutive elements of one column of op(B). */
+	const blasint incb = ((transb & 1) == 0) ? 1 : ldb;
+
+	if (uplo == 1) {
+		/* Lower triangle: column i holds rows i..m-1. */
+		for (i = 0; i < m; i++) {
+			j = m - i;
+
+			/* First entry of row i of op(A) and column i of op(B). */
+			aa = a + i;
+			bb = b + i * ldb;
+			if (transa & 1) {
+				aa = a + lda * i;
+			}
+			if (transb & 1)
+				bb = b + i;
+			cc = c + i * ldc + i;
+
+			IDEBUG_START;
+
+			/*
+			 * The first two kernel arguments are (j, k) for a
+			 * non-transposed A and (k, j) for a transposed one.
+			 */
+#ifdef SMP
+
+			/* Small j * k products stay single-threaded; larger
+			 * ones use the threaded driver. */
+			if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD)
+				nthreads = 1;
+			else
+				nthreads = num_cpu_avail(2);
+
+			if (nthreads == 1) {
+#endif
+
+				if (!(transa & 1))
+				(gemv[(int)transa]) (j, k, alpha, aa, lda,
+						     bb, incb, beta, cc, 1);
+				else
+				(gemv[(int)transa]) (k, j, alpha, aa, lda,
+						     bb, incb, beta, cc, 1);
+
+#ifdef SMP
+			} else {
+				if (!(transa & 1))
+				(gemv_thread[(int)transa]) (j, k, alpha, aa,
+							    lda, bb, incb, beta, cc,
+							    1, nthreads);
+				else
+				(gemv_thread[(int)transa]) (k, j, alpha, aa,
+							    lda, bb, incb, beta, cc,
+							    1, nthreads);
+
+			}
+#endif
+		}
+	} else {
+		/* Upper triangle: column i holds rows 0..i. */
+		for (i = 0; i < m; i++) {
+			j = i + 1;
+
+			/* Column i of op(B); A is always passed from its start. */
+			bb = b + i * ldb;
+			if (transb & 1) {
+				bb = b + i;
+			}
+			cc = c + i * ldc;
+
+			IDEBUG_START;
+
+			/*
+			 * Same (j, k) / (k, j) argument convention as in the
+			 * lower-triangle loop.
+			 */
+#ifdef SMP
+
+			/* Small j * k products stay single-threaded; larger
+			 * ones use the threaded driver. */
+			if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD)
+				nthreads = 1;
+			else
+				nthreads = num_cpu_avail(2);
+
+			if (nthreads == 1) {
+#endif
+
+				if (!(transa & 1))
+				(gemv[(int)transa]) (j, k, alpha, a, lda, bb,
+						     incb, beta, cc, 1);
+				else
+				(gemv[(int)transa]) (k, j, alpha, a, lda, bb,
+						     incb, beta, cc, 1);
+
+#ifdef SMP
+			} else {
+				if (!(transa & 1))
+				(gemv_thread[(int)transa]) (j, k, alpha, a, lda,
+							    bb, incb, beta, cc, 1,
+							    nthreads);
+				else
+				(gemv_thread[(int)transa]) (k, j, alpha, a, lda,
+							    bb, incb, beta, cc, 1,
+							    nthreads);
+			}
+#endif
+		}
+	}
+
+	IDEBUG_END;
+
+	return;
+}
diff --git a/interface/symm.c b/interface/symm.c
index 3e65e69b1..3e6e0fd48 100644
--- a/interface/symm.c
+++ b/interface/symm.c
@@ -166,7 +166,7 @@ void NAME(char *SIDE, char *UPLO,
   int nodes;
 #endif
 # if defined(SMP)
-  int MN;
+  double MN;
 #endif
   blasint info;
   int side;
@@ -264,7 +264,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
   int nodes;
 #endif
 #if defined(SMP)
-  int MN;
+  double MN;
 #endif
 
   PRINT_DEBUG_CNAME;
diff --git a/interface/syrk.c b/interface/syrk.c
index 3b056aec8..69f2328a4 100644
--- a/interface/syrk.c
+++ b/interface/syrk.c
@@ -107,7 +107,7 @@ void NAME(char *UPLO, char *TRANS,
   FLOAT *sa, *sb;
 
 #ifdef SMP
-  int NNK;
+  double NNK;
 #ifdef USE_SIMPLE_THREADED_LEVEL3
 #ifndef COMPLEX
 #ifdef XDOUBLE
@@ -232,7 +232,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr
   FLOAT *sa, *sb;
 
 #ifdef SMP
-int NNK;
+double NNK;
 
 #ifdef USE_SIMPLE_THREADED_LEVEL3
 #ifndef COMPLEX
diff --git a/interface/zaxpby.c b/interface/zaxpby.c
index 3a4db7403..e5065270d 100644
--- a/interface/zaxpby.c
+++ b/interface/zaxpby.c
@@ -39,12 +39,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #ifndef CBLAS
 
-void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY)
+void NAME(blasint *N, void *VALPHA, FLOAT *x, blasint *INCX, void *VBETA, FLOAT *y, blasint *INCY)
 {
 
   blasint n    = *N;
   blasint incx = *INCX;
   blasint incy = *INCY;
+  FLOAT* ALPHA = (FLOAT*) VALPHA;
+  FLOAT* BETA = (FLOAT*) VBETA;
 
 #else
 
diff --git a/interface/zgeadd.c b/interface/zgeadd.c
index 7124cf230..de71f27b8 100644
--- a/interface/zgeadd.c
+++ b/interface/zgeadd.c
@@ -66,7 +66,7 @@ void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
   info = 0;
 
 
-  if (lda < MAX(1, m))	info = 6;
+  if (lda < MAX(1, m))	info = 5;
   if (ldc < MAX(1, m))	info = 8;
 
   if (n < 0)		info = 2;
@@ -115,8 +115,8 @@ void CNAME(enum CBLAS_ORDER order,
 
     if (ldc < MAX(1, m))  info = 8;
     if (lda < MAX(1, m))  info = 5;
-    if (n < 0)		  info = 2;
-    if (m < 0)		  info = 1;
+    if (n < 0)		  info = 1;
+    if (m < 0)		  info = 2;
   }
 
   if (info >= 0) {
diff --git a/interface/zimatcopy.c b/interface/zimatcopy.c
index ecda5ef4e..b879c9ac2 100644
--- a/interface/zimatcopy.c
+++ b/interface/zimatcopy.c
@@ -101,51 +101,59 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
 
 	if ( order == BlasColMajor)
 	{
-        	if ( trans == BlasNoTrans      &&  *ldb < *rows ) info = 9;
-        	if ( trans == BlasConj         &&  *ldb < *rows ) info = 9;
-        	if ( trans == BlasTrans        &&  *ldb < *cols ) info = 9;
-        	if ( trans == BlasTransConj    &&  *ldb < *cols ) info = 9;
+        	if ( trans == BlasNoTrans      &&  *ldb < MAX(1,*rows) ) info = 9;
+        	if ( trans == BlasConj         &&  *ldb < MAX(1,*rows) ) info = 9;
+        	if ( trans == BlasTrans        &&  *ldb < MAX(1,*cols) ) info = 9;
+        	if ( trans == BlasTransConj    &&  *ldb < MAX(1,*cols) ) info = 9;
 	}
 	if ( order == BlasRowMajor)
 	{
-        	if ( trans == BlasNoTrans    &&  *ldb < *cols ) info = 9;
-        	if ( trans == BlasConj       &&  *ldb < *cols ) info = 9;
-        	if ( trans == BlasTrans      &&  *ldb < *rows ) info = 9;
-        	if ( trans == BlasTransConj  &&  *ldb < *rows ) info = 9;
+        	if ( trans == BlasNoTrans    &&  *ldb < MAX(1,*cols) ) info = 9;
+        	if ( trans == BlasConj       &&  *ldb < MAX(1,*cols) ) info = 9;
+        	if ( trans == BlasTrans      &&  *ldb < MAX(1,*rows) ) info = 9;
+        	if ( trans == BlasTransConj  &&  *ldb < MAX(1,*rows) ) info = 9;
 	}
 
-	if ( order == BlasColMajor &&  *lda < *rows ) info = 7;
-	if ( order == BlasRowMajor &&  *lda < *cols ) info = 7;
-	if ( *cols <= 0 ) info = 4;
-	if ( *rows <= 0 ) info = 3;
-	if ( trans < 0  ) info = 2;
-	if ( order < 0  ) info = 1;
+	if ( order == BlasColMajor &&  *lda < MAX(1,*rows) ) info = 7;
+	if ( order == BlasRowMajor &&  *lda < MAX(1,*cols) ) info = 7;
+	if ( *cols < 0 ) info = 4;
+	if ( *rows < 0 ) info = 3;
+	if ( trans < 0 ) info = 2;
+	if ( order < 0 ) info = 1;
 
 	if (info >= 0) {
     		BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
     		return;
   	}
+
+	if ((*rows == 0) || (*cols == 0)) return;
+
 #ifdef NEW_IMATCOPY
-    if (*lda == *ldb && *cols == *rows) {
+    if (*lda == *ldb ) {
         if ( order == BlasColMajor )
         {
 
             if ( trans == BlasNoTrans )
             {
                 IMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda );
+                return;
             }
             if ( trans == BlasConj )
             {
                 IMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda );
+                return;
             }
-            if ( trans == BlasTrans )
+            if ( trans == BlasTrans && *rows == *cols )
             {
                 IMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda );
+                return;
             }
-            if ( trans == BlasTransConj )
+            if ( trans == BlasTransConj && *rows == *cols )
             {
                 IMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda );
+                return;
             }
+
         }
         else
         {
@@ -153,67 +161,62 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
             if ( trans == BlasNoTrans )
             {
                 IMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda );
+                return;
             }
             if ( trans == BlasConj )
             {
                 IMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda );
+                return;
             }
-            if ( trans == BlasTrans )
+            if ( trans == BlasTrans && *rows == *cols )
             {
                 IMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda );
+                return;
             }
-            if ( trans == BlasTransConj )
+            if ( trans == BlasTransConj && *rows == *cols )
             {
                 IMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda );
+                return;
             }
+
         }
-        return; 
     }
 #endif
 
-	if ( *lda >  *ldb )
-                msize = (size_t)(*lda) * (*ldb)  * sizeof(FLOAT) * 2;
+		if ( *rows >  *cols )
+                msize = (size_t)(*rows) * (*ldb)  * sizeof(FLOAT) * 2;
         else
-                msize = (size_t)(*ldb) * (*ldb)  * sizeof(FLOAT) * 2;
-
-        b = malloc(msize);
-        if ( b == NULL )
-        {
-                printf("Memory alloc failed in zimatcopy\n");
-                exit(1);
-        }
+                msize = (size_t)(*cols) * (*ldb)  * sizeof(FLOAT) * 2;
 
+	b = malloc(msize);
+	if ( b == NULL )
+	{
+		printf("Memory alloc failed in zimatcopy\n");
+		exit(1);
+	}
 
 	if ( order == BlasColMajor )
 	{
 
 		if ( trans == BlasNoTrans )
 		{
-	  		OMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
-	  		OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
-			free(b);
-			return;
+	  		OMATCOPY_K_CN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows );
+	  		OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb );
 		}
-		if ( trans == BlasConj )
+		else if ( trans == BlasConj )
 		{
-			OMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
-	  		OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
-			free(b);
-			return;
+			OMATCOPY_K_CNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows );
+	  		OMATCOPY_K_CN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb );
 		}
-		if ( trans == BlasTrans )
+		else if ( trans == BlasTrans )
 		{
-			OMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
-	  		OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
-			free(b);
-			return;
+			OMATCOPY_K_CT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols );
+	  		OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb );
 		}
-		if ( trans == BlasTransConj )
+		else if ( trans == BlasTransConj )
 		{
-			OMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
-	  		OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
-			free(b);
-			return;
+			OMATCOPY_K_CTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols );
+	  		OMATCOPY_K_CN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb );
 		}
 
 	}
@@ -222,34 +225,27 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
 
 		if ( trans == BlasNoTrans )
 		{
-			OMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
-	  		OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
-			free(b);
-			return;
+			OMATCOPY_K_RN(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols );
+	  		OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb );
 		}
-		if ( trans == BlasConj )
+		else if ( trans == BlasConj )
 		{
-			OMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
-	  		OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
-			free(b);
-			return;
+			OMATCOPY_K_RNC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *cols );
+	  		OMATCOPY_K_RN(*rows, *cols, (FLOAT) 1.0, (FLOAT) 0.0 , b, *cols, a, *ldb );
 		}
-		if ( trans == BlasTrans )
+		else if ( trans == BlasTrans )
 		{
-			OMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
-	  		OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
-			free(b);
-			return;
+			OMATCOPY_K_RT(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows );
+	  		OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb );
 		}
-		if ( trans == BlasTransConj )
+		else if ( trans == BlasTransConj )
 		{
-			OMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *ldb );
-	  		OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *ldb, a, *ldb );
-			free(b);
-			return;
+			OMATCOPY_K_RTC(*rows, *cols, alpha[0], alpha[1], a, *lda, b, *rows );
+	  		OMATCOPY_K_RN(*cols, *rows, (FLOAT) 1.0, (FLOAT) 0.0 , b, *rows, a, *ldb );
 		}
 
 	}
+
 	free(b);
 	return;
 
diff --git a/interface/zomatcopy.c b/interface/zomatcopy.c
index 7345633a2..7121711d8 100644
--- a/interface/zomatcopy.c
+++ b/interface/zomatcopy.c
@@ -92,31 +92,33 @@ void CNAME(enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
 #endif
 	if ( order == BlasColMajor)
 	{
-        	if ( trans == BlasNoTrans      &&  *ldb < *rows ) info = 9;
-        	if ( trans == BlasConj         &&  *ldb < *rows ) info = 9;
-        	if ( trans == BlasTrans        &&  *ldb < *cols ) info = 9;
-        	if ( trans == BlasTransConj    &&  *ldb < *cols ) info = 9;
+        	if ( trans == BlasNoTrans      &&  *ldb < MAX(1,*rows) ) info = 9;
+        	if ( trans == BlasConj         &&  *ldb < MAX(1,*rows) ) info = 9;
+        	if ( trans == BlasTrans        &&  *ldb < MAX(1,*cols) ) info = 9;
+        	if ( trans == BlasTransConj    &&  *ldb < MAX(1,*cols) ) info = 9;
 	}
 	if ( order == BlasRowMajor)
 	{
-        	if ( trans == BlasNoTrans    &&  *ldb < *cols ) info = 9;
-        	if ( trans == BlasConj       &&  *ldb < *cols ) info = 9;
-        	if ( trans == BlasTrans      &&  *ldb < *rows ) info = 9;
-        	if ( trans == BlasTransConj  &&  *ldb < *rows ) info = 9;
+        	if ( trans == BlasNoTrans    &&  *ldb < MAX(1,*cols) ) info = 9;
+        	if ( trans == BlasConj       &&  *ldb < MAX(1,*cols) ) info = 9;
+        	if ( trans == BlasTrans      &&  *ldb < MAX(1,*rows) ) info = 9;
+        	if ( trans == BlasTransConj  &&  *ldb < MAX(1,*rows) ) info = 9;
 	}
 
-	if ( order == BlasColMajor &&  *lda < *rows ) info = 7;
-	if ( order == BlasRowMajor &&  *lda < *cols ) info = 7;
-	if ( *cols <= 0 ) info = 4;
-	if ( *rows <= 0 ) info = 3;
-	if ( trans < 0  ) info = 2;
-	if ( order < 0  ) info = 1;
+	if ( order == BlasColMajor &&  *lda < MAX(1,*rows) ) info = 7;
+	if ( order == BlasRowMajor &&  *lda < MAX(1,*cols) ) info = 7;
+	if ( *cols < 0 ) info = 4;
+	if ( *rows < 0 ) info = 3;
+	if ( trans < 0 ) info = 2;
+	if ( order < 0 ) info = 1;
 
 	if (info >= 0) {
     		BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
     		return;
   	}
 
+	if ((*rows == 0) || (*cols == 0)) return; // arguments were valid (info check above passed) but one dimension is zero: return early
+
 	if ( order == BlasColMajor )
 	{
 
diff --git a/interface/zrotg.c b/interface/zrotg.c
index 123f4da85..8acc3c9b2 100644
--- a/interface/zrotg.c
+++ b/interface/zrotg.c
@@ -1,9 +1,11 @@
 #include <math.h>
+#include <float.h>
 #include "common.h"
 #ifdef FUNCTION_PROFILE
 #include "functable.h"
 #endif
 
+
 #ifndef CBLAS
 void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
 
@@ -14,53 +16,26 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) {
     FLOAT *S  = (FLOAT*) VS;
 #endif /* CBLAS */
 
-#if defined(__i386__) || defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) || defined(_M_IX86)
-
-  long double da_r = *(DA + 0);
-  long double da_i = *(DA + 1);
-  long double db_r = *(DB + 0);
-  long double db_i = *(DB + 1);
-  long double r;
-
-  long double ada = fabsl(da_r) + fabsl(da_i);
-
-  PRINT_DEBUG_NAME;
-
-  IDEBUG_START;
-
-  FUNCTION_PROFILE_START();
-
-  if (ada == ZERO) {
-    *C        = ZERO;
-    *(S  + 0) = ONE;
-    *(S  + 1) = ZERO;
-    *(DA + 0) = db_r;
-    *(DA + 1) = db_i;
-  } else {
-    long double alpha_r, alpha_i;
-
-    ada = sqrt(da_r * da_r + da_i * da_i);
-
-    r = sqrt(da_r * da_r + da_i * da_i + db_r * db_r + db_i * db_i);
+#ifdef DOUBLE
+  long double safmin = DBL_MIN;
+  long double rtmin = sqrt(DBL_MIN/DBL_EPSILON);
+#else
+  long double safmin = FLT_MIN;
+  long double rtmin = sqrt(FLT_MIN/FLT_EPSILON);
+#endif
 
-    alpha_r = da_r / ada;
-    alpha_i = da_i / ada;
 
-    *(C + 0)  = ada / r;
-    *(S + 0)  = (alpha_r * db_r + alpha_i *db_i) / r;
-    *(S + 1)  = (alpha_i * db_r - alpha_r *db_i) / r;
-    *(DA + 0) = alpha_r * r;
-    *(DA + 1) = alpha_i * r;
-  }
-#else
-  FLOAT da_r = *(DA + 0);
-  FLOAT da_i = *(DA + 1);
-  FLOAT db_r = *(DB + 0);
-  FLOAT db_i = *(DB + 1);
-  FLOAT r;
+  FLOAT da_r = *(DA+0);
+  FLOAT da_i = *(DA+1);
+  FLOAT db_r = *(DB+0);
+  FLOAT db_i = *(DB+1);
+  //long double r;
+  FLOAT S1[2];
+  FLOAT R[2];
+  long double d;
 
-  FLOAT ada = fabs(da_r) + fabs(da_i);
-  FLOAT adb;
+  FLOAT ada =  da_r * da_r + da_i * da_i; 
+  FLOAT adb =  db_r * db_r + db_i * db_i; 
 
   PRINT_DEBUG_NAME;
 
@@ -68,69 +43,140 @@ void CNAME(void *VDA, void *VDB, FLOAT *C, void *VS) {
 
   FUNCTION_PROFILE_START();
 
-  if (ada == ZERO) {
-    *C        = ZERO;
-    *(S  + 0) = ONE;
+  if (db_r == ZERO && db_i == ZERO) {
+    *C        = ONE;
+    *(S  + 0) = ZERO;
     *(S  + 1) = ZERO;
-    *(DA + 0) = db_r;
-    *(DA + 1) = db_i;
-  } else {
-    FLOAT scale;
-    FLOAT aa_r, aa_i, bb_r, bb_i;
-    FLOAT alpha_r, alpha_i;
-
-    aa_r = fabs(da_r);
-    aa_i = fabs(da_i);
-
-    if (aa_i > aa_r) {
-      aa_r = fabs(da_i);
-      aa_i = fabs(da_r);
-    }
-
-    if (aa_r == ZERO) {
-	ada = 0.;
-    } else {
-        scale = (aa_i / aa_r);
-        ada = aa_r * sqrt(ONE + scale * scale);
-    }
-
-    bb_r = fabs(db_r);
-    bb_i = fabs(db_i);
-
-    if (bb_i > bb_r) {
-      bb_r = fabs(bb_i);
-      bb_i = fabs(bb_r);
-    }
-
-    if (bb_r == ZERO) {
-	adb = 0.;
-    } else {
-    	scale = (bb_i / bb_r);
-    	adb = bb_r * sqrt(ONE + scale * scale);
-    }
-    scale = ada + adb;
-
-    aa_r    = da_r / scale;
-    aa_i    = da_i / scale;
-    bb_r    = db_r / scale;
-    bb_i    = db_i / scale;
-
-    r = scale * sqrt(aa_r * aa_r + aa_i * aa_i + bb_r * bb_r + bb_i * bb_i);
-
-    alpha_r = da_r / ada;
-    alpha_i = da_i / ada;
-
-    *(C + 0)  = ada / r;
-    *(S + 0)  = (alpha_r * db_r + alpha_i *db_i) / r;
-    *(S + 1)  = (alpha_i * db_r - alpha_r *db_i) / r;
-    *(DA + 0) = alpha_r * r;
-    *(DA + 1) = alpha_i * r;
+    return;
   }
-#endif
 
-  FUNCTION_PROFILE_END(4, 4, 4);
-
-  IDEBUG_END;
-
-  return;
+  long double safmax = 1./safmin; // reciprocal of safmin
+#if defined DOUBLE
+  long double rtmax = safmax /DBL_EPSILON; // NOTE(review): this initial value is overwritten in both branches below before rtmax is read
+#else
+  long double rtmax = safmax /FLT_EPSILON;
+#endif
+		*(S1 + 0) = *(DB + 0); // S1 = conj(DB): real part copied as is,
+		*(S1 + 1) = *(DB + 1) *-1; // imaginary part negated
+	if (da_r == ZERO && da_i == ZERO) {
+	    *C = ZERO;
+	    if (db_r == ZERO) {
+		    (*DA) = fabsl(db_i);
+		*S = *S1 /(*DA);
+		*(S+1) = *(S1+1) /(*DA);
+		return;
+	    } else if ( db_i == ZERO) {
+		    *DA = fabsl(db_r);
+		*S = *S1 /(*DA);
+		*(S+1) = *(S1+1) /(*DA);
+		return;
+	    } else {
+	        long double g1 = MAX( fabsl(db_r), fabsl(db_i));
+	        rtmax =sqrt(safmax/2.);
+	        if (g1 > rtmin && g1 < rtmax) { // unscaled
+		    d = sqrt(adb);
+		    *S = *S1 /d;
+		    *(S+1) = *(S1+1) /d;
+		    *DA = d ;
+		    *(DA+1) = ZERO;
+		    return;
+    	        } else { // scaled algorithm
+		  long double u = MIN ( safmax, MAX ( safmin, g1));
+		    FLOAT gs_r = db_r/u;
+		    FLOAT gs_i = db_i/u;
+		    d = sqrt ( gs_r*gs_r + gs_i*gs_i);
+		    *S = gs_r / d;
+		    *(S + 1) = (gs_i * -1) / d;
+		    *DA = d * u;
+		    *(DA+1) = ZERO;
+		    return;
+	        }
+	    }
+	} else {
+	       FLOAT f1 = MAX ( fabsl(da_r), fabsl(da_i));
+	       FLOAT g1 = MAX ( fabsl(db_r), fabsl(db_i));
+	       rtmax = sqrt(safmax / 4.);
+	       if ( f1 > rtmin && f1 < rtmax && g1 > rtmin && g1 < rtmax) { //unscaled
+		    long double h = ada + adb;
+	   	    double adahsq = sqrt(ada * h);
+		    if (ada >= h *safmin) {
+			*C = sqrt(ada/h);
+			*R = *DA / *C;
+			*(R+1) = *(DA+1) / *C;
+			rtmax *= 2.;
+			if ( ada > rtmin && h < rtmax) { // no risk of intermediate overflow
+				*S = *S1 * (*DA / adahsq) - *(S1+1)* (*(DA+1)/adahsq);
+				*(S+1) = *S1 * (*(DA+1) / adahsq) + *(S1+1) * (*DA/adahsq);
+			} else {
+				*S = *S1 * (*R/h) - *(S1+1) * (*(R+1)/h);
+				*(S+1) = *S1 * (*(R+1)/h) + *(S1+1) * (*(R)/h);
+			}
+	    	    } else {
+		        *C = ada / adahsq;
+		        if (*C >= safmin) {
+			    *R = *DA / *C;
+			    *(R+1) = *(DA+1) / *C;
+			} else {
+			    *R = *DA * (h / adahsq);
+			    *(R+1) = *(DA+1) * (h / adahsq);
+			}
+		        *S = *S1 * (*DA / adahsq) - *(S1+1) * (*(DA+1) / adahsq); // s = conj(g) * (f / d), complex product; DA still holds f here
+		    	*(S+1) = *S1 * (*(DA+1) / adahsq) + *(S1+1) * (*DA / adahsq); // imaginary part of the same product
+		    }
+		    *DA=*R;
+		    *(DA+1)=*(R+1);
+		    return;
+	        } else {	// scaled
+		   FLOAT fs_r, fs_i, gs_r, gs_i;
+		long double v,w,f2,g2,h;
+		long double u = MIN ( safmax, MAX ( safmin, MAX(f1,g1)));
+		    gs_r = db_r/u; // gs = g / u, scaled so its squared magnitude cannot overflow
+		    gs_i = db_i/u;
+		    g2 = gs_r*gs_r + gs_i*gs_i; // |gs|^2 (squared magnitude, no sqrt: it is used as such below)
+		    if (f1 /u < rtmin) { // f is tiny relative to g: scale f separately by v
+			v = MIN (safmax, MAX (safmin, f1));
+			w = v / u;
+			fs_r = *DA/ v;
+			fs_i = *(DA+1) / v;
+		        f2 = fs_r*fs_r + fs_i*fs_i; // |fs|^2
+		        h = f2 * w * w + g2; // |f|^2 + |g|^2 expressed in units of u^2
+		    } else { // use same scaling for both
+			w = 1.;
+			fs_r = *DA/ u;
+			fs_i = *(DA+1) / u;
+		        f2 = fs_r*fs_r + fs_i*fs_i; // |fs|^2
+			h = f2 + g2;
+		    }
+		    if ( f2 >= h * safmin) { // f2/h is safely representable
+			    *C = sqrt ( f2 / h );
+			    *DA = fs_r / *C; // r = fs / c (rescaled by u after this if/else)
+			    *(DA+1) = fs_i / *C;
+			    rtmax *= 2;
+			    if ( f2 > rtmin && h < rtmax) {
+				    *S = gs_r * (fs_r /sqrt(f2*h)) + gs_i * (fs_i / sqrt(f2*h)); // Re(conj(gs) * fs / sqrt(f2*h))
+				    *(S+1) = gs_r * (fs_i /sqrt(f2*h)) + gs_i * -1. * (fs_r / sqrt(f2*h)); // Im(conj(gs) * fs / sqrt(f2*h))
+			    } else {
+				    *S = gs_r * (*DA/h) + gs_i * (*(DA+1) / h); // Re(conj(gs) * r / h); DA already holds r
+				    *(S+1) = gs_r * (*(DA+1) /h) + gs_i * -1. * (*DA / h); // Im(conj(gs) * r / h)
+			    }
+		    } else { // intermediates might overflow
+			d = sqrt ( f2 * h);
+			*C = f2 /d;
+			if (*C >= safmin) {
+				*DA = fs_r / *C;
+				*(DA+1) = fs_i / *C;
+			} else {
+				*DA = fs_r * (h / d); // r = fs * (h / d)
+				*(DA+1) = fs_i * (h / d); // same factor for the imaginary part (was a division)
+			}
+			*S = gs_r * (fs_r /d) + gs_i * (fs_i / d); // Re(conj(gs) * fs / d)
+			*(S+1) = gs_r * (fs_i /d) + gs_i * -1. * (fs_r / d); // Im(conj(gs) * fs / d)
+	            }
+		    *C *= w;
+		    *DA *= u;
+		    *(DA+1) *= u;
+		    return;
+		}
+	}
 }
+	
diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt
index 60314eedb..74e6760c2 100644
--- a/kernel/CMakeLists.txt
+++ b/kernel/CMakeLists.txt
@@ -1349,6 +1349,9 @@ endif ()
     set_target_properties(kernel${TSUFFIX} PROPERTIES COMPILE_FLAGS "${KERNEL_DEFINITIONS}")
     get_target_property(KERNEL_INCLUDE_DIRECTORIES kernel${TSUFFIX} INCLUDE_DIRECTORIES)
     set_target_properties(kernel${TSUFFIX} PROPERTIES INCLUDE_DIRECTORIES "${KERNEL_INCLUDE_DIRECTORIES};${TARGET_CONF_DIR}")
+    if (USE_GEMM3M)
+      target_compile_definitions(kernel${TSUFFIX} PRIVATE USE_GEMM3M)
+    endif()
 endfunction ()
 
 
diff --git a/kernel/Makefile b/kernel/Makefile
index 977886044..3f9afd3fa 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -5,12 +5,6 @@ endif
 TOPDIR	= ..
 include $(TOPDIR)/Makefile.system
 
-ifeq ($(ARCH), power)
-ifeq ($(C_COMPILER), CLANG)
- override CFLAGS += -fno-integrated-as
-endif
-endif
-
 AVX2OPT = 
 ifeq ($(C_COMPILER), GCC)
 # AVX2 support was added in 4.7.0
@@ -33,7 +27,7 @@ endif
 ifdef TARGET_CORE
 ifeq ($(TARGET_CORE), SAPPHIRERAPIDS)
  override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
- ifeq ($(GCCVERSIONGTEQ10), 1) 
+ ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(CLANGVERSIONGTEQ12)))
   override CFLAGS += -march=sapphirerapids
  else 
   override CFLAGS += -march=skylake-avx512 -mavx512f
@@ -48,7 +42,7 @@ ifeq ($(TARGET_CORE), SAPPHIRERAPIDS)
  endif
 else ifeq ($(TARGET_CORE), COOPERLAKE)
  override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
- ifeq ($(GCCVERSIONGTEQ10), 1) 
+ ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(CLANGVERSIONGTEQ9)))
   override CFLAGS += -march=cooperlake
  else 
   override CFLAGS += -march=skylake-avx512 -mavx512f
@@ -77,6 +71,12 @@ else ifeq ($(TARGET_CORE), ZEN)
  override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT)
 else ifeq ($(TARGET_CORE), LOONGSON3R4)
  override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS)
+else ifneq ($(filter NEOVERSEN2 NEOVERSEV1, $(TARGET_CORE)),)
+ ifeq ($(C_COMPILER), PGI)
+  override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -Msve_intrinsics
+ else
+  override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
+ endif
 else
  override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
 endif
diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3
index bea6cb048..863f376e9 100644
--- a/kernel/Makefile.L3
+++ b/kernel/Makefile.L3
@@ -35,6 +35,12 @@ USE_TRMM = 1
 endif
 endif
 
+ifneq ($(DYNAMIC_ARCH), 1)
+ifeq ($(TARGET), MIPS64_GENERIC)
+USE_TRMM = 1
+endif
+endif
+
 ifeq ($(CORE), HASWELL)
 USE_TRMM = 1
 endif
@@ -55,6 +61,15 @@ ifeq ($(CORE), ZEN)
 USE_TRMM = 1
 endif
 
+ifeq ($(OS), AIX) # pick the m4 invocation used by the AIX assembly rules
+M4VERSION := $(shell m4 --version < /dev/null 2>&1 | grep GNU >/dev/null 2>&1 ; echo $$?)
+ifeq ($(M4VERSION), 0) # grep exit status 0: the version banner mentions GNU
+M4_AIX := m4 -l16384
+else
+M4_AIX := m4 -B16384
+endif
+$(info M4_AIX is [$(M4_AIX)])
+endif
 ifeq ($(CORE), POWER8)
 ifeq ($(BINARY64),1)
 USE_TRMM = 1
@@ -167,7 +182,7 @@ ifeq ($(BUILD_BFLOAT16),1)
 SBBLASOBJS += sbgemm_beta$(TSUFFIX).$(SUFFIX)
 endif
 
-ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" ""
+ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" ""
 SBLASOBJS	+= \
 	sgemm_beta$(TSUFFIX).$(SUFFIX) \
 	strmm_kernel_LN$(TSUFFIX).$(SUFFIX) strmm_kernel_LT$(TSUFFIX).$(SUFFIX) \
@@ -176,7 +191,7 @@ SBLASOBJS	+= \
 	strsm_kernel_RN$(TSUFFIX).$(SUFFIX) strsm_kernel_RT$(TSUFFIX).$(SUFFIX) 
 endif
 
-ifeq ($(BUILD_DOUBLE),1)
+ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" ""
 DBLASOBJS	+= \
 	dgemm_beta$(TSUFFIX).$(SUFFIX) \
 	dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \
@@ -192,7 +207,7 @@ QBLASOBJS	+= \
 	qtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) qtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) \
 	qtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) qtrsm_kernel_RT$(TSUFFIX).$(SUFFIX)
 
-ifeq ($(BUILD_COMPLEX),1)
+ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" ""
 CBLASOBJS	+= \
 	ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) \
 	ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) \
@@ -240,7 +255,7 @@ XBLASOBJS    +=  xgemm3m_kernel$(TSUFFIX).$(SUFFIX)
 
 endif
 
-ifeq ($(BUILD_SINGLE),1)
+ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" ""
 SBLASOBJS += \
 	strmm_iunucopy$(TSUFFIX).$(SUFFIX) strmm_iunncopy$(TSUFFIX).$(SUFFIX) \
 	strmm_ilnucopy$(TSUFFIX).$(SUFFIX) strmm_ilnncopy$(TSUFFIX).$(SUFFIX) \
@@ -249,10 +264,7 @@ SBLASOBJS += \
 	strmm_ounucopy$(TSUFFIX).$(SUFFIX) strmm_ounncopy$(TSUFFIX).$(SUFFIX) \
 	strmm_olnucopy$(TSUFFIX).$(SUFFIX) strmm_olnncopy$(TSUFFIX).$(SUFFIX) \
 	strmm_outucopy$(TSUFFIX).$(SUFFIX) strmm_outncopy$(TSUFFIX).$(SUFFIX) \
-	strmm_oltucopy$(TSUFFIX).$(SUFFIX) strmm_oltncopy$(TSUFFIX).$(SUFFIX) 
-endif
-ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" ""
-SBLASOBJS += \
+	strmm_oltucopy$(TSUFFIX).$(SUFFIX) strmm_oltncopy$(TSUFFIX).$(SUFFIX) \
 	strsm_iunucopy$(TSUFFIX).$(SUFFIX) strsm_iunncopy$(TSUFFIX).$(SUFFIX) \
 	strsm_ilnucopy$(TSUFFIX).$(SUFFIX) strsm_ilnncopy$(TSUFFIX).$(SUFFIX) \
 	strsm_iutucopy$(TSUFFIX).$(SUFFIX) strsm_iutncopy$(TSUFFIX).$(SUFFIX) \
@@ -260,10 +272,7 @@ SBLASOBJS += \
 	strsm_ounucopy$(TSUFFIX).$(SUFFIX) strsm_ounncopy$(TSUFFIX).$(SUFFIX) \
 	strsm_olnucopy$(TSUFFIX).$(SUFFIX) strsm_olnncopy$(TSUFFIX).$(SUFFIX) \
 	strsm_outucopy$(TSUFFIX).$(SUFFIX) strsm_outncopy$(TSUFFIX).$(SUFFIX) \
-	strsm_oltucopy$(TSUFFIX).$(SUFFIX) strsm_oltncopy$(TSUFFIX).$(SUFFIX) 
-endif
-ifeq ($(BUILD_SINGLE),1)
-SBLASOBJS += \
+	strsm_oltucopy$(TSUFFIX).$(SUFFIX) strsm_oltncopy$(TSUFFIX).$(SUFFIX) \
 	ssymm_iutcopy$(TSUFFIX).$(SUFFIX) ssymm_iltcopy$(TSUFFIX).$(SUFFIX) \
 	ssymm_outcopy$(TSUFFIX).$(SUFFIX) ssymm_oltcopy$(TSUFFIX).$(SUFFIX)
 endif
@@ -385,7 +394,7 @@ XBLASOBJS += \
 
 ifeq ($(USE_GEMM3M), 1)
 
-ifeq ($(BUILD_COMPLEX),1)
+ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" ""
 CBLASOBJS += \
 	cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) \
 	cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) \
@@ -628,15 +637,7 @@ $(KDIR)$(SBGEMMONCOPYOBJ) : $(KERNELDIR)/$(SBGEMMONCOPY)
 	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
 
 $(KDIR)$(SBGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SBGEMMOTCOPY)
-
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemmotcopy.s
-	m4 sbgemmotcopy.s > sbgemmotcopy_nomacros.s
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemmotcopy_nomacros.s -o $@
-	rm sbgemmotcopy.s sbgemmotcopy_nomacros.s
-else
 	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-endif
 
 ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N))
 
@@ -644,14 +645,7 @@ $(KDIR)$(SBGEMMINCOPYOBJ) : $(KERNELDIR)/$(SBGEMMINCOPY)
 	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
 
 $(KDIR)$(SBGEMMITCOPYOBJ) : $(KERNELDIR)/$(SBGEMMITCOPY)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemmitcopy.s
-	m4 sbgemmitcopy.s > sbgemmitcopy_nomacros.s
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemmitcopy_nomacros.s -o $@
-	rm sbgemmitcopy.s sbgemmitcopy_nomacros.s
-else
 	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
-endif
 
 endif
 endif
@@ -662,7 +656,7 @@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY)
 $(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmotcopy.s
-	m4 sgemmotcopy.s > sgemmotcopy_nomacros.s
+	$(M4_AIX) sgemmotcopy.s > sgemmotcopy_nomacros.s
 	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@
 	rm sgemmotcopy.s sgemmotcopy_nomacros.s
 else
@@ -678,7 +672,7 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY)
 $(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmitcopy.s
-	m4 sgemmitcopy.s > sgemmitcopy_nomacros.s
+	$(M4_AIX) sgemmitcopy.s > sgemmitcopy_nomacros.s
 	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@
 	rm sgemmitcopy.s sgemmitcopy_nomacros.s
 else
@@ -690,7 +684,7 @@ endif
 $(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_ncopy.s
-	m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s
+	$(M4_AIX) dgemm_ncopy.s > dgemm_ncopy_nomacros.s
 	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@
 	rm dgemm_ncopy.s dgemm_ncopy_nomacros.s
 else
@@ -708,7 +702,7 @@ $(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY)
 $(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_itcopy.s
-	m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s
+	$(M4_AIX) dgemm_itcopy.s > dgemm_itcopy_nomacros.s
 	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@
 	rm dgemm_itcopy.s dgemm_itcopy_nomacros.s
 else
@@ -751,7 +745,7 @@ $(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY)
 $(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -S $< -o - > cgemm_itcopy.s
-	m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s
+	$(M4_AIX) cgemm_itcopy.s > cgemm_itcopy_nomacros.s
 	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@
 	rm cgemm_itcopy.s cgemm_itcopy_nomacros.s
 else
@@ -774,7 +768,7 @@ $(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY)
 $(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > zgemm_itcopy.s
-	m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s
+	$(M4_AIX) zgemm_itcopy.s > zgemm_itcopy_nomacros.s
 	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@
 	rm zgemm_itcopy.s zgemm_itcopy_nomacros.s
 else
@@ -806,7 +800,7 @@ endif
 $(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX  $< -o - > sgemm_kernel$(TSUFFIX).s
-	m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s
+	$(M4_AIX) sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s
 	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@
 	rm sgemm_kernel$(TSUFFIX).s sgemm_kernel$(TSUFFIX)_nomacros.s
 else
@@ -823,20 +817,13 @@ endif
 ifeq ($(BUILD_BFLOAT16), 1)
 
 $(KDIR)sbgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEMMDEPEND)
-ifeq ($(OS), AIX)
-	$(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX  $< -o - > sbgemm_kernel$(TSUFFIX).s
-	m4 sbgemm_kernel$(TSUFFIX).s > sbgemm_kernel$(TSUFFIX)_nomacros.s
-	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemm_kernel$(TSUFFIX)_nomacros.s -o $@
-	rm sbgemm_kernel$(TSUFFIX).s sbgemm_kernel$(TSUFFIX)_nomacros.s
-else
 	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
 endif
-endif
 
 $(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_kernel$(TSUFFIX).s
-	m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s
+	$(M4_AIX) dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s
 	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@
 	rm dgemm_kernel$(TSUFFIX).s dgemm_kernel$(TSUFFIX)_nomacros.s
 else
@@ -849,7 +836,7 @@ $(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEP
 $(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNN $< -o - > cgemm_kernel_n.s
-	m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s
+	$(M4_AIX) cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s
 	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@
 	rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s
 else
@@ -859,7 +846,7 @@ endif
 $(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCN $< -o - > cgemm_kernel_l.s
-	m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s
+	$(M4_AIX) cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s
 	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@
 	rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s
 else
@@ -869,7 +856,7 @@ endif
 $(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNC  $< -o - > cgemm_kernel_r.s
-	m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
+	$(M4_AIX) cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
 	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
 	rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s
 else
@@ -879,7 +866,7 @@ endif
 $(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCC $< -o - > cgemm_kernel_b.s
-	m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s
+	$(M4_AIX) cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s
 	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@
 	rm cgemm_kernel_b.s cgemm_kernel_b_nomacros.s
 else
@@ -889,7 +876,7 @@ endif
 $(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNN $< -o - > zgemm_kernel_n.s
-	m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s
+	$(M4_AIX) zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s
 	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@
 	rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s
 else ifeq ($(CORE),SANDYBRIDGE)
@@ -901,7 +888,7 @@ endif
 $(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCN $< -o - > zgemm_kernel_l.s
-	m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s
+	$(M4_AIX) zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s
 	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@
 	rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s
 else ifeq ($(CORE),SANDYBRIDGE)
@@ -913,7 +900,7 @@ endif
 $(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNC $< -o - > zgemm_kernel_r.s
-	m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s
+	$(M4_AIX) zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s
 	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@
 	rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s
 else ifeq ($(CORE),SANDYBRIDGE)
@@ -925,7 +912,7 @@ endif
 $(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCC $< -o - > zgemm_kernel_b.s
-	m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s
+	$(M4_AIX) zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s
 	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@
 	rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s
 else ifeq ($(CORE),SANDYBRIDGE)
@@ -951,7 +938,7 @@ ifdef USE_TRMM
 $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > strmmkernel_ln.s	
-	m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s
+	$(M4_AIX) strmmkernel_ln.s > strmmkernel_ln_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@
 	rm strmmkernel_ln.s strmmkernel_ln_nomacros.s
 else
@@ -961,7 +948,7 @@ endif
 $(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > strmmkernel_lt.s	
-	m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s
+	$(M4_AIX) strmmkernel_lt.s > strmmkernel_lt_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@
 	rm strmmkernel_lt.s strmmkernel_lt_nomacros.s
 else
@@ -971,7 +958,7 @@ endif
 $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > strmmkernel_rn.s	
-	m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s
+	$(M4_AIX) strmmkernel_rn.s > strmmkernel_rn_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@
 	rm strmmkernel_rn.s strmmkernel_rn_nomacros.s
 else
@@ -981,7 +968,7 @@ endif
 $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s	
-	m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
+	$(M4_AIX) strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
 	rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
 else
@@ -991,7 +978,7 @@ endif
 $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > dtrmm_kernel_ln.s
-	m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s
+	$(M4_AIX) dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@
 	rm dtrmm_kernel_ln.s dtrmm_kernel_ln_nomacros.s
 else
@@ -1001,7 +988,7 @@ endif
 $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > dtrmm_kernel_lt.s
-	m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s
+	$(M4_AIX) dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@
 	rm dtrmm_kernel_lt.s dtrmm_kernel_lt_nomacros.s
 else
@@ -1011,7 +998,7 @@ endif
 $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > dtrmm_kernel_rn.s
-	m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s
+	$(M4_AIX) dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@
 	rm dtrmm_kernel_rn.s dtrmm_kernel_rn_nomacros.s
 else
@@ -1021,7 +1008,7 @@ endif
 $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > dtrmm_kernel_rt.s
-	m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s
+	$(M4_AIX) dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@
 	rm dtrmm_kernel_rt.s dtrmm_kernel_rt_nomacros.s
 else
@@ -1043,7 +1030,7 @@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
 $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN  $< -o - > ctrmm_kernel_ln.s
-	m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s
+	$(M4_AIX) ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@
 	rm ctrmm_kernel_ln.s ctrmm_kernel_ln_nomacros.s
 else
@@ -1053,7 +1040,7 @@ endif
 $(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_lt.s
-	m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s
+	$(M4_AIX) ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@
 	rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s
 else
@@ -1063,7 +1050,7 @@ endif
 $(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lr.s
-	m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s
+	$(M4_AIX) ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN  ctrmm_kernel_lr_nomacros.s -o $@
 	rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s
 else
@@ -1073,7 +1060,7 @@ endif
 $(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lc.s
-	m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s
+	$(M4_AIX) ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@
 	rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s
 else
@@ -1083,7 +1070,7 @@ endif
 $(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rn.s
-	m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s
+	$(M4_AIX) ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@
 	rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s
 else
@@ -1093,7 +1080,7 @@ endif
 $(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rt.s
-	m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s
+	$(M4_AIX) ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@
 	rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s
 else
@@ -1103,7 +1090,7 @@ endif
 $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_rr.s
-	m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s
+	$(M4_AIX) ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@
 	rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s
 else
@@ -1113,7 +1100,7 @@ endif
 $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_RC.s
-	m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s
+	$(M4_AIX) ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@
 	rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s
 else
@@ -1123,7 +1110,7 @@ endif
 $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_ln.s
-	m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s
+	$(M4_AIX) ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@
 	rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s
 else ifeq ($(CORE), SANDYBRIDGE)
@@ -1135,7 +1122,7 @@ endif
 $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_lt.s
-	m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s
+	$(M4_AIX) ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@
 	rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s
 else ifeq ($(CORE), SANDYBRIDGE)
@@ -1147,7 +1134,7 @@ endif
 $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lr.s
-	m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s
+	$(M4_AIX) ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@
 	rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s
 else ifeq ($(CORE), SANDYBRIDGE)
@@ -1159,7 +1146,7 @@ endif
 $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lc.s
-	m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s
+	$(M4_AIX) ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@
 	rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s 
 else ifeq ($(CORE), SANDYBRIDGE)
@@ -1171,7 +1158,7 @@ endif
 $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rn.s
-	m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s
+	$(M4_AIX) ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@
 	rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s
 else ifeq ($(CORE), SANDYBRIDGE)
@@ -1183,7 +1170,7 @@ endif
 $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rt.s
-	m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s
+	$(M4_AIX) ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@
 	rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s
 else ifeq ($(CORE), SANDYBRIDGE)
@@ -1195,7 +1182,7 @@ endif
 $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rr.s
-	m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s
+	$(M4_AIX) ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@
 	rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s
 else ifeq ($(CORE), SANDYBRIDGE)
@@ -1207,7 +1194,7 @@ endif
 $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rc.s
-	m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s
+	$(M4_AIX) ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@
 	rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s
 else ifeq ($(CORE), SANDYBRIDGE)
@@ -1229,7 +1216,7 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
 $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s	
-	m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
+	$(M4_AIX) strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
 	rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
 else
@@ -1389,7 +1376,7 @@ $(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DT
 $(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o - > dtrsm_kernel_lt.s
-	m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s
+	$(M4_AIX) dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s
 	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@
 	rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s
 else
@@ -2981,7 +2968,7 @@ $(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMM
 $(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
 ifeq ($(OS), AIX)
 	$(CC) $(PFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > cgemm_kernel_r.s
-	m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
+	$(M4_AIX) cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
 	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
 	rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s 
 else
@@ -3027,7 +3014,7 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
 $(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s	
-	m4 strmmkernel_rn.s > strmm_kernel_rt_nomacros.s
+	$(M4_AIX) strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
 	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
 	rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
 else
diff --git a/kernel/arm/nrm2.c b/kernel/arm/nrm2.c
index fcff09337..8cc189fe3 100644
--- a/kernel/arm/nrm2.c
+++ b/kernel/arm/nrm2.c
@@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	FLOAT absxi = 0.0;
 
 
-	if (n <= 0 || inc_x <= 0) return(0.0);
+	if (n <= 0 || inc_x == 0) return(0.0);
 	if ( n == 1 ) return( ABS(x[0]) );
 
 	n *= inc_x;
diff --git a/kernel/arm/znrm2.c b/kernel/arm/znrm2.c
index fc1c8b54a..28bb0eda5 100644
--- a/kernel/arm/znrm2.c
+++ b/kernel/arm/znrm2.c
@@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	BLASLONG inc_x2;
 	FLOAT temp;
 
-	if (n <= 0 || inc_x <= 0) return(0.0);
+	if (n <= 0 || inc_x == 0) return(0.0);
 
 	inc_x2 = 2 * inc_x;
 
diff --git a/kernel/arm/zscal.c b/kernel/arm/zscal.c
index 0521aaa0b..b2d537d04 100644
--- a/kernel/arm/zscal.c
+++ b/kernel/arm/zscal.c
@@ -60,6 +60,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
 			else
 			{
 				temp = - da_i * x[ip+1] ;
+				if (isnan(x[ip]) || isinf(x[ip])) temp = NAN;
 				x[ip+1] = da_i * x[ip]  ;
 			}
 		}
diff --git a/kernel/arm64/KERNEL b/kernel/arm64/KERNEL
index f936cdf47..7d7e648c4 100644
--- a/kernel/arm64/KERNEL
+++ b/kernel/arm64/KERNEL
@@ -1,3 +1,5 @@
+CSUMKERNEL=csum.S
+
 ifndef SNRM2KERNEL
 SNRM2KERNEL = ../arm/nrm2.c
 endif
diff --git a/kernel/arm64/KERNEL.A64FX b/kernel/arm64/KERNEL.A64FX
index bd25f7cd8..bc5999097 100644
--- a/kernel/arm64/KERNEL.A64FX
+++ b/kernel/arm64/KERNEL.A64FX
@@ -1,216 +1 @@
-SAMINKERNEL  = ../arm/amin.c
-DAMINKERNEL  = ../arm/amin.c
-CAMINKERNEL  = ../arm/zamin.c
-ZAMINKERNEL  = ../arm/zamin.c
-
-SMAXKERNEL   = ../arm/max.c
-DMAXKERNEL   = ../arm/max.c
-
-SMINKERNEL   = ../arm/min.c
-DMINKERNEL   = ../arm/min.c
-
-ISAMINKERNEL = ../arm/iamin.c
-IDAMINKERNEL = ../arm/iamin.c
-ICAMINKERNEL = ../arm/izamin.c
-IZAMINKERNEL = ../arm/izamin.c
-
-ISMAXKERNEL  = ../arm/imax.c
-IDMAXKERNEL  = ../arm/imax.c
-
-ISMINKERNEL  = ../arm/imin.c
-IDMINKERNEL  = ../arm/imin.c
-
-STRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-STRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-STRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-STRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-DTRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-DTRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-DTRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-DTRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-TRSMCOPYLN_M    = trsm_lncopy_sve.c
-TRSMCOPYLT_M    = trsm_ltcopy_sve.c
-TRSMCOPYUN_M    = trsm_uncopy_sve.c
-TRSMCOPYUT_M    = trsm_utcopy_sve.c
-
-CTRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-CTRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-CTRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-CTRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-ZTRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-ZTRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-ZTRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-ZTRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-ZTRSMCOPYLN_M    = ztrsm_lncopy_sve.c
-ZTRSMCOPYLT_M    = ztrsm_ltcopy_sve.c
-ZTRSMCOPYUN_M    = ztrsm_uncopy_sve.c
-ZTRSMCOPYUT_M    = ztrsm_utcopy_sve.c
-
-
-SAMAXKERNEL  = amax.S
-DAMAXKERNEL  = amax.S
-CAMAXKERNEL  = zamax.S
-ZAMAXKERNEL  = zamax.S
-
-SAXPYKERNEL  = axpy.S
-DAXPYKERNEL  = axpy.S
-CAXPYKERNEL  = zaxpy.S
-ZAXPYKERNEL  = zaxpy.S
-
-SROTKERNEL   = rot.S
-DROTKERNEL   = rot.S
-CROTKERNEL   = zrot.S
-ZROTKERNEL   = zrot.S
-
-SSCALKERNEL  = scal.S
-DSCALKERNEL  = scal.S
-CSCALKERNEL  = zscal.S
-ZSCALKERNEL  = zscal.S
-
-SGEMVNKERNEL = gemv_n.S
-DGEMVNKERNEL = gemv_n.S
-CGEMVNKERNEL = zgemv_n.S
-ZGEMVNKERNEL = zgemv_n.S
-
-SGEMVTKERNEL = gemv_t.S
-DGEMVTKERNEL = gemv_t.S
-CGEMVTKERNEL = zgemv_t.S
-ZGEMVTKERNEL = zgemv_t.S
-
-
-SASUMKERNEL    = asum.S
-DASUMKERNEL    = asum.S
-CASUMKERNEL    = casum.S
-ZASUMKERNEL    = zasum.S
-
-SCOPYKERNEL    = copy.S
-DCOPYKERNEL    = copy.S
-CCOPYKERNEL    = copy.S
-ZCOPYKERNEL    = copy.S
-
-SSWAPKERNEL    = swap.S
-DSWAPKERNEL    = swap.S
-CSWAPKERNEL    = swap.S
-ZSWAPKERNEL    = swap.S
-
-ISAMAXKERNEL   = iamax.S
-IDAMAXKERNEL   = iamax.S
-ICAMAXKERNEL   = izamax.S
-IZAMAXKERNEL   = izamax.S
-
-SNRM2KERNEL    = nrm2.S
-DNRM2KERNEL    = nrm2.S
-CNRM2KERNEL    = znrm2.S
-ZNRM2KERNEL    = znrm2.S
-
-DDOTKERNEL     = dot.S
-ifneq ($(C_COMPILER), PGI)
-SDOTKERNEL     = ../generic/dot.c
-else
-SDOTKERNEL = dot.S
-endif
-ifneq ($(C_COMPILER), PGI)
-CDOTKERNEL     = zdot.S
-ZDOTKERNEL     = zdot.S
-else
-CDOTKERNEL = ../arm/zdot.c
-ZDOTKERNEL = ../arm/zdot.c
-endif
-DSDOTKERNEL    = dot.S
-
-DGEMM_BETA     = dgemm_beta.S
-SGEMM_BETA     = sgemm_beta.S
-
-SGEMMKERNEL    =  sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
-STRMMKERNEL    =  strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
-
-SGEMMINCOPY    =  sgemm_ncopy_sve_v1.c
-SGEMMITCOPY    =  sgemm_tcopy_sve_v1.c
-SGEMMONCOPY    =  sgemm_ncopy_$(DGEMM_UNROLL_N).S
-SGEMMOTCOPY    =  sgemm_tcopy_$(DGEMM_UNROLL_N).S
-
-SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
-SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
-SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-STRMMUNCOPY_M  =  trmm_uncopy_sve_v1.c
-STRMMLNCOPY_M  =  trmm_lncopy_sve_v1.c
-STRMMUTCOPY_M  =  trmm_utcopy_sve_v1.c
-STRMMLTCOPY_M  =  trmm_ltcopy_sve_v1.c
-
-SSYMMUCOPY_M    =  symm_ucopy_sve.c
-SSYMMLCOPY_M    =  symm_lcopy_sve.c
-
-DGEMMKERNEL    =  dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
-DTRMMKERNEL    =  dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
-
-DGEMMINCOPY    =  dgemm_ncopy_sve_v1.c
-DGEMMITCOPY    =  dgemm_tcopy_sve_v1.c
-DGEMMONCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_N).S
-DGEMMOTCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_N).S
-
-DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX)
-DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX)
-DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
-DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-DTRMMUNCOPY_M  =  trmm_uncopy_sve_v1.c
-DTRMMLNCOPY_M  =  trmm_lncopy_sve_v1.c
-DTRMMUTCOPY_M  =  trmm_utcopy_sve_v1.c
-DTRMMLTCOPY_M  =  trmm_ltcopy_sve_v1.c
-
-DSYMMUCOPY_M    =  symm_ucopy_sve.c
-DSYMMLCOPY_M    =  symm_lcopy_sve.c
-
-CGEMMKERNEL    =  cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-CTRMMKERNEL    =  ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-
-CGEMMINCOPY    =  cgemm_ncopy_sve_v1.c
-CGEMMITCOPY    =  cgemm_tcopy_sve_v1.c
-CGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-CGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-
-CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX)
-CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX)
-CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
-CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-CTRMMUNCOPY_M  =  ztrmm_uncopy_sve_v1.c
-CTRMMLNCOPY_M  =  ztrmm_lncopy_sve_v1.c
-CTRMMUTCOPY_M  =  ztrmm_utcopy_sve_v1.c
-CTRMMLTCOPY_M  =  ztrmm_ltcopy_sve_v1.c
-
-CHEMMLTCOPY_M    =  zhemm_ltcopy_sve.c
-CHEMMUTCOPY_M    =  zhemm_utcopy_sve.c
-
-CSYMMUCOPY_M    =  zsymm_ucopy_sve.c
-CSYMMLCOPY_M    =  zsymm_lcopy_sve.c
-
-ZGEMMKERNEL    =  zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-ZTRMMKERNEL    =  ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-
-ZGEMMINCOPY    =  zgemm_ncopy_sve_v1.c
-ZGEMMITCOPY    =  zgemm_tcopy_sve_v1.c
-ZGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-
-ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX)
-ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX)
-ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
-ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-ZTRMMUNCOPY_M  =  ztrmm_uncopy_sve_v1.c
-ZTRMMLNCOPY_M  =  ztrmm_lncopy_sve_v1.c
-ZTRMMUTCOPY_M  =  ztrmm_utcopy_sve_v1.c
-ZTRMMLTCOPY_M  =  ztrmm_ltcopy_sve_v1.c
-
-ZHEMMLTCOPY_M    =  zhemm_ltcopy_sve.c
-ZHEMMUTCOPY_M    =  zhemm_utcopy_sve.c
-
-ZSYMMUCOPY_M    =  zsymm_ucopy_sve.c
-ZSYMMLCOPY_M    =  zsymm_lcopy_sve.c
+include $(KERNELDIR)/KERNEL.ARMV8SVE
diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE
index bd25f7cd8..eeb4844bf 100644
--- a/kernel/arm64/KERNEL.ARMV8SVE
+++ b/kernel/arm64/KERNEL.ARMV8SVE
@@ -1,3 +1,6 @@
+
+CSUMKERNEL    = csum_thunderx2t99.c
+ZSUMKERNEL    = zsum_thunderx2t99.c
 SAMINKERNEL  = ../arm/amin.c
 DAMINKERNEL  = ../arm/amin.c
 CAMINKERNEL  = ../arm/zamin.c
@@ -57,7 +60,7 @@ CAMAXKERNEL  = zamax.S
 ZAMAXKERNEL  = zamax.S
 
 SAXPYKERNEL  = axpy.S
-DAXPYKERNEL  = axpy.S
+DAXPYKERNEL  = daxpy_thunderx2t99.S
 CAXPYKERNEL  = zaxpy.S
 ZAXPYKERNEL  = zaxpy.S
 
@@ -81,45 +84,35 @@ DGEMVTKERNEL = gemv_t.S
 CGEMVTKERNEL = zgemv_t.S
 ZGEMVTKERNEL = zgemv_t.S
 
-
-SASUMKERNEL    = asum.S
-DASUMKERNEL    = asum.S
-CASUMKERNEL    = casum.S
-ZASUMKERNEL    = zasum.S
-
-SCOPYKERNEL    = copy.S
-DCOPYKERNEL    = copy.S
-CCOPYKERNEL    = copy.S
-ZCOPYKERNEL    = copy.S
-
-SSWAPKERNEL    = swap.S
-DSWAPKERNEL    = swap.S
-CSWAPKERNEL    = swap.S
-ZSWAPKERNEL    = swap.S
-
-ISAMAXKERNEL   = iamax.S
-IDAMAXKERNEL   = iamax.S
-ICAMAXKERNEL   = izamax.S
-IZAMAXKERNEL   = izamax.S
-
-SNRM2KERNEL    = nrm2.S
-DNRM2KERNEL    = nrm2.S
-CNRM2KERNEL    = znrm2.S
-ZNRM2KERNEL    = znrm2.S
-
-DDOTKERNEL     = dot.S
-ifneq ($(C_COMPILER), PGI)
-SDOTKERNEL     = ../generic/dot.c
-else
-SDOTKERNEL = dot.S
-endif
-ifneq ($(C_COMPILER), PGI)
-CDOTKERNEL     = zdot.S
-ZDOTKERNEL     = zdot.S
-else
-CDOTKERNEL = ../arm/zdot.c
-ZDOTKERNEL = ../arm/zdot.c
-endif
+SASUMKERNEL    = sasum_thunderx2t99.c
+DASUMKERNEL    = dasum_thunderx2t99.c
+CASUMKERNEL    = casum_thunderx2t99.c
+ZASUMKERNEL    = zasum_thunderx2t99.c
+
+SCOPYKERNEL    = copy_thunderx2t99.c
+DCOPYKERNEL    = copy_thunderx2t99.c
+CCOPYKERNEL    = copy_thunderx2t99.c
+ZCOPYKERNEL    = copy_thunderx2t99.c
+
+SSWAPKERNEL    = swap_thunderx2t99.S
+DSWAPKERNEL    = swap_thunderx2t99.S
+CSWAPKERNEL    = swap_thunderx2t99.S
+ZSWAPKERNEL    = swap_thunderx2t99.S
+
+ISAMAXKERNEL   = iamax_thunderx2t99.c
+IDAMAXKERNEL   = iamax_thunderx2t99.c
+ICAMAXKERNEL   = izamax_thunderx2t99.c
+IZAMAXKERNEL   = izamax_thunderx2t99.c
+
+SNRM2KERNEL    = scnrm2_thunderx2t99.c
+DNRM2KERNEL    = dznrm2_thunderx2t99.c
+CNRM2KERNEL    = scnrm2_thunderx2t99.c
+ZNRM2KERNEL    = dznrm2_thunderx2t99.c
+
+DDOTKERNEL     = dot.c
+SDOTKERNEL     = dot.c
+CDOTKERNEL     = zdot_thunderx2t99.c
+ZDOTKERNEL     = zdot_thunderx2t99.c
 DSDOTKERNEL    = dot.S
 
 DGEMM_BETA     = dgemm_beta.S
@@ -128,10 +121,10 @@ SGEMM_BETA     = sgemm_beta.S
 SGEMMKERNEL    =  sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
 STRMMKERNEL    =  strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
 
-SGEMMINCOPY    =  sgemm_ncopy_sve_v1.c
-SGEMMITCOPY    =  sgemm_tcopy_sve_v1.c
-SGEMMONCOPY    =  sgemm_ncopy_$(DGEMM_UNROLL_N).S
-SGEMMOTCOPY    =  sgemm_tcopy_$(DGEMM_UNROLL_N).S
+SGEMMINCOPY    =  gemm_ncopy_sve_v1x$(SGEMM_UNROLL_N).c
+SGEMMITCOPY    =  gemm_tcopy_sve_v1x$(SGEMM_UNROLL_N).c
+SGEMMONCOPY    =  sgemm_ncopy_$(SGEMM_UNROLL_N).S
+SGEMMOTCOPY    =  sgemm_tcopy_$(SGEMM_UNROLL_N).S
 
 SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
 SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
@@ -149,8 +142,8 @@ SSYMMLCOPY_M    =  symm_lcopy_sve.c
 DGEMMKERNEL    =  dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
 DTRMMKERNEL    =  dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
 
-DGEMMINCOPY    =  dgemm_ncopy_sve_v1.c
-DGEMMITCOPY    =  dgemm_tcopy_sve_v1.c
+DGEMMINCOPY    =  gemm_ncopy_sve_v1x$(DGEMM_UNROLL_N).c
+DGEMMITCOPY    =  gemm_tcopy_sve_v1x$(DGEMM_UNROLL_N).c
 DGEMMONCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_N).S
 DGEMMOTCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_N).S
 
@@ -170,8 +163,8 @@ DSYMMLCOPY_M    =  symm_lcopy_sve.c
 CGEMMKERNEL    =  cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
 CTRMMKERNEL    =  ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
 
-CGEMMINCOPY    =  cgemm_ncopy_sve_v1.c
-CGEMMITCOPY    =  cgemm_tcopy_sve_v1.c
+CGEMMINCOPY    =  gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
+CGEMMITCOPY    =  gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
 CGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
 CGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
 
@@ -194,8 +187,8 @@ CSYMMLCOPY_M    =  zsymm_lcopy_sve.c
 ZGEMMKERNEL    =  zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
 ZTRMMKERNEL    =  ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
 
-ZGEMMINCOPY    =  zgemm_ncopy_sve_v1.c
-ZGEMMITCOPY    =  zgemm_tcopy_sve_v1.c
+ZGEMMINCOPY    =  gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
+ZGEMMITCOPY    =  gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
 ZGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
 ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
 
diff --git a/kernel/arm64/KERNEL.CORTEXA510 b/kernel/arm64/KERNEL.CORTEXA510
index bd25f7cd8..bc5999097 100644
--- a/kernel/arm64/KERNEL.CORTEXA510
+++ b/kernel/arm64/KERNEL.CORTEXA510
@@ -1,216 +1 @@
-SAMINKERNEL  = ../arm/amin.c
-DAMINKERNEL  = ../arm/amin.c
-CAMINKERNEL  = ../arm/zamin.c
-ZAMINKERNEL  = ../arm/zamin.c
-
-SMAXKERNEL   = ../arm/max.c
-DMAXKERNEL   = ../arm/max.c
-
-SMINKERNEL   = ../arm/min.c
-DMINKERNEL   = ../arm/min.c
-
-ISAMINKERNEL = ../arm/iamin.c
-IDAMINKERNEL = ../arm/iamin.c
-ICAMINKERNEL = ../arm/izamin.c
-IZAMINKERNEL = ../arm/izamin.c
-
-ISMAXKERNEL  = ../arm/imax.c
-IDMAXKERNEL  = ../arm/imax.c
-
-ISMINKERNEL  = ../arm/imin.c
-IDMINKERNEL  = ../arm/imin.c
-
-STRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-STRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-STRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-STRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-DTRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-DTRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-DTRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-DTRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-TRSMCOPYLN_M    = trsm_lncopy_sve.c
-TRSMCOPYLT_M    = trsm_ltcopy_sve.c
-TRSMCOPYUN_M    = trsm_uncopy_sve.c
-TRSMCOPYUT_M    = trsm_utcopy_sve.c
-
-CTRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-CTRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-CTRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-CTRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-ZTRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-ZTRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-ZTRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-ZTRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-ZTRSMCOPYLN_M    = ztrsm_lncopy_sve.c
-ZTRSMCOPYLT_M    = ztrsm_ltcopy_sve.c
-ZTRSMCOPYUN_M    = ztrsm_uncopy_sve.c
-ZTRSMCOPYUT_M    = ztrsm_utcopy_sve.c
-
-
-SAMAXKERNEL  = amax.S
-DAMAXKERNEL  = amax.S
-CAMAXKERNEL  = zamax.S
-ZAMAXKERNEL  = zamax.S
-
-SAXPYKERNEL  = axpy.S
-DAXPYKERNEL  = axpy.S
-CAXPYKERNEL  = zaxpy.S
-ZAXPYKERNEL  = zaxpy.S
-
-SROTKERNEL   = rot.S
-DROTKERNEL   = rot.S
-CROTKERNEL   = zrot.S
-ZROTKERNEL   = zrot.S
-
-SSCALKERNEL  = scal.S
-DSCALKERNEL  = scal.S
-CSCALKERNEL  = zscal.S
-ZSCALKERNEL  = zscal.S
-
-SGEMVNKERNEL = gemv_n.S
-DGEMVNKERNEL = gemv_n.S
-CGEMVNKERNEL = zgemv_n.S
-ZGEMVNKERNEL = zgemv_n.S
-
-SGEMVTKERNEL = gemv_t.S
-DGEMVTKERNEL = gemv_t.S
-CGEMVTKERNEL = zgemv_t.S
-ZGEMVTKERNEL = zgemv_t.S
-
-
-SASUMKERNEL    = asum.S
-DASUMKERNEL    = asum.S
-CASUMKERNEL    = casum.S
-ZASUMKERNEL    = zasum.S
-
-SCOPYKERNEL    = copy.S
-DCOPYKERNEL    = copy.S
-CCOPYKERNEL    = copy.S
-ZCOPYKERNEL    = copy.S
-
-SSWAPKERNEL    = swap.S
-DSWAPKERNEL    = swap.S
-CSWAPKERNEL    = swap.S
-ZSWAPKERNEL    = swap.S
-
-ISAMAXKERNEL   = iamax.S
-IDAMAXKERNEL   = iamax.S
-ICAMAXKERNEL   = izamax.S
-IZAMAXKERNEL   = izamax.S
-
-SNRM2KERNEL    = nrm2.S
-DNRM2KERNEL    = nrm2.S
-CNRM2KERNEL    = znrm2.S
-ZNRM2KERNEL    = znrm2.S
-
-DDOTKERNEL     = dot.S
-ifneq ($(C_COMPILER), PGI)
-SDOTKERNEL     = ../generic/dot.c
-else
-SDOTKERNEL = dot.S
-endif
-ifneq ($(C_COMPILER), PGI)
-CDOTKERNEL     = zdot.S
-ZDOTKERNEL     = zdot.S
-else
-CDOTKERNEL = ../arm/zdot.c
-ZDOTKERNEL = ../arm/zdot.c
-endif
-DSDOTKERNEL    = dot.S
-
-DGEMM_BETA     = dgemm_beta.S
-SGEMM_BETA     = sgemm_beta.S
-
-SGEMMKERNEL    =  sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
-STRMMKERNEL    =  strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
-
-SGEMMINCOPY    =  sgemm_ncopy_sve_v1.c
-SGEMMITCOPY    =  sgemm_tcopy_sve_v1.c
-SGEMMONCOPY    =  sgemm_ncopy_$(DGEMM_UNROLL_N).S
-SGEMMOTCOPY    =  sgemm_tcopy_$(DGEMM_UNROLL_N).S
-
-SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
-SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
-SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-STRMMUNCOPY_M  =  trmm_uncopy_sve_v1.c
-STRMMLNCOPY_M  =  trmm_lncopy_sve_v1.c
-STRMMUTCOPY_M  =  trmm_utcopy_sve_v1.c
-STRMMLTCOPY_M  =  trmm_ltcopy_sve_v1.c
-
-SSYMMUCOPY_M    =  symm_ucopy_sve.c
-SSYMMLCOPY_M    =  symm_lcopy_sve.c
-
-DGEMMKERNEL    =  dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
-DTRMMKERNEL    =  dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
-
-DGEMMINCOPY    =  dgemm_ncopy_sve_v1.c
-DGEMMITCOPY    =  dgemm_tcopy_sve_v1.c
-DGEMMONCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_N).S
-DGEMMOTCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_N).S
-
-DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX)
-DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX)
-DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
-DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-DTRMMUNCOPY_M  =  trmm_uncopy_sve_v1.c
-DTRMMLNCOPY_M  =  trmm_lncopy_sve_v1.c
-DTRMMUTCOPY_M  =  trmm_utcopy_sve_v1.c
-DTRMMLTCOPY_M  =  trmm_ltcopy_sve_v1.c
-
-DSYMMUCOPY_M    =  symm_ucopy_sve.c
-DSYMMLCOPY_M    =  symm_lcopy_sve.c
-
-CGEMMKERNEL    =  cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-CTRMMKERNEL    =  ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-
-CGEMMINCOPY    =  cgemm_ncopy_sve_v1.c
-CGEMMITCOPY    =  cgemm_tcopy_sve_v1.c
-CGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-CGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-
-CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX)
-CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX)
-CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
-CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-CTRMMUNCOPY_M  =  ztrmm_uncopy_sve_v1.c
-CTRMMLNCOPY_M  =  ztrmm_lncopy_sve_v1.c
-CTRMMUTCOPY_M  =  ztrmm_utcopy_sve_v1.c
-CTRMMLTCOPY_M  =  ztrmm_ltcopy_sve_v1.c
-
-CHEMMLTCOPY_M    =  zhemm_ltcopy_sve.c
-CHEMMUTCOPY_M    =  zhemm_utcopy_sve.c
-
-CSYMMUCOPY_M    =  zsymm_ucopy_sve.c
-CSYMMLCOPY_M    =  zsymm_lcopy_sve.c
-
-ZGEMMKERNEL    =  zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-ZTRMMKERNEL    =  ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-
-ZGEMMINCOPY    =  zgemm_ncopy_sve_v1.c
-ZGEMMITCOPY    =  zgemm_tcopy_sve_v1.c
-ZGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-
-ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX)
-ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX)
-ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
-ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-ZTRMMUNCOPY_M  =  ztrmm_uncopy_sve_v1.c
-ZTRMMLNCOPY_M  =  ztrmm_lncopy_sve_v1.c
-ZTRMMUTCOPY_M  =  ztrmm_utcopy_sve_v1.c
-ZTRMMLTCOPY_M  =  ztrmm_ltcopy_sve_v1.c
-
-ZHEMMLTCOPY_M    =  zhemm_ltcopy_sve.c
-ZHEMMUTCOPY_M    =  zhemm_utcopy_sve.c
-
-ZSYMMUCOPY_M    =  zsymm_ucopy_sve.c
-ZSYMMLCOPY_M    =  zsymm_lcopy_sve.c
+include $(KERNELDIR)/KERNEL.ARMV8SVE
diff --git a/kernel/arm64/KERNEL.CORTEXA55 b/kernel/arm64/KERNEL.CORTEXA55
index e2e006770..574e98b8c 100644
--- a/kernel/arm64/KERNEL.CORTEXA55
+++ b/kernel/arm64/KERNEL.CORTEXA55
@@ -1,196 +1 @@
-SAMINKERNEL  = ../arm/amin.c
-DAMINKERNEL  = ../arm/amin.c
-CAMINKERNEL  = ../arm/zamin.c
-ZAMINKERNEL  = ../arm/zamin.c
-
-SMAXKERNEL   = ../arm/max.c
-DMAXKERNEL   = ../arm/max.c
-
-SMINKERNEL   = ../arm/min.c
-DMINKERNEL   = ../arm/min.c
-
-ISAMINKERNEL = ../arm/iamin.c
-IDAMINKERNEL = ../arm/iamin.c
-ICAMINKERNEL = ../arm/izamin.c
-IZAMINKERNEL = ../arm/izamin.c
-
-ISMAXKERNEL  = ../arm/imax.c
-IDMAXKERNEL  = ../arm/imax.c
-
-ISMINKERNEL  = ../arm/imin.c
-IDMINKERNEL  = ../arm/imin.c
-
-STRSMKERNEL_LN	=  ../generic/trsm_kernel_LN.c
-STRSMKERNEL_LT	=  ../generic/trsm_kernel_LT.c
-STRSMKERNEL_RN	=  ../generic/trsm_kernel_RN.c
-STRSMKERNEL_RT	=  ../generic/trsm_kernel_RT.c
-
-DTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
-DTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
-DTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
-DTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
-
-CTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
-CTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
-CTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
-CTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
-
-ZTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
-ZTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
-ZTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
-ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
-
-SAMAXKERNEL  = amax.S
-DAMAXKERNEL  = amax.S
-CAMAXKERNEL  = zamax.S
-ZAMAXKERNEL  = zamax.S
-
-SAXPYKERNEL  = axpy.S
-DAXPYKERNEL  = axpy.S
-CAXPYKERNEL  = zaxpy.S
-ZAXPYKERNEL  = zaxpy.S
-
-SROTKERNEL   = rot.S
-DROTKERNEL   = rot.S
-CROTKERNEL   = zrot.S
-ZROTKERNEL   = zrot.S
-
-SSCALKERNEL  = scal.S
-DSCALKERNEL  = scal.S
-CSCALKERNEL  = zscal.S
-ZSCALKERNEL  = zscal.S
-
-SGEMVNKERNEL = gemv_n.S
-DGEMVNKERNEL = gemv_n.S
-CGEMVNKERNEL = zgemv_n.S
-ZGEMVNKERNEL = zgemv_n.S
-
-SGEMVTKERNEL = gemv_t.S
-DGEMVTKERNEL = gemv_t.S
-CGEMVTKERNEL = zgemv_t.S
-ZGEMVTKERNEL = zgemv_t.S
-
-
-SASUMKERNEL    = asum.S
-DASUMKERNEL    = asum.S
-CASUMKERNEL    = casum.S
-ZASUMKERNEL    = zasum.S
-
-SCOPYKERNEL    = copy.S
-DCOPYKERNEL    = copy.S
-CCOPYKERNEL    = copy.S
-ZCOPYKERNEL    = copy.S
-
-SSWAPKERNEL    = swap.S
-DSWAPKERNEL    = swap.S
-CSWAPKERNEL    = swap.S
-ZSWAPKERNEL    = swap.S
-
-ISAMAXKERNEL   = iamax.S
-IDAMAXKERNEL   = iamax.S
-ICAMAXKERNEL   = izamax.S
-IZAMAXKERNEL   = izamax.S
-
-SNRM2KERNEL    = nrm2.S
-DNRM2KERNEL    = nrm2.S
-CNRM2KERNEL    = znrm2.S
-ZNRM2KERNEL    = znrm2.S
-
-ifneq ($(C_COMPILER), PGI)
-SDOTKERNEL   = ../generic/dot.c
-else
-SDOTKERNEL   = dot.S
-endif
-DDOTKERNEL   = dot.S
-ifneq ($(C_COMPILER), PGI)
-CDOTKERNEL   = zdot.S
-ZDOTKERNEL   = zdot.S
-else
-CDOTKERNEL = ../arm/zdot.c
-ZDOTKERNEL = ../arm/zdot.c
-endif
-DSDOTKERNEL  = dot.S
-
-DGEMM_BETA     = dgemm_beta.S
-SGEMM_BETA     = sgemm_beta.S
-
-ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8)
-SGEMMKERNEL    =  sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S
-STRMMKERNEL    =  strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S
-else
-SGEMMKERNEL    =  sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
-STRMMKERNEL    =  strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
-endif
-ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
-ifeq ($(SGEMM_UNROLL_M), 16)
-SGEMMITCOPY    =  sgemm_tcopy_$(SGEMM_UNROLL_M).S
-else
-SGEMMITCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
-endif
-ifeq ($(SGEMM_UNROLL_M), 4)
-SGEMMINCOPY    =  sgemm_ncopy_$(SGEMM_UNROLL_M).S
-else
-SGEMMINCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
-endif
-SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-
-SGEMMOTCOPY    =  sgemm_tcopy_$(SGEMM_UNROLL_N).S
-SGEMMONCOPY    =  sgemm_ncopy_$(SGEMM_UNROLL_N).S
-SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
-SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-DGEMMKERNEL    =  dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_cortexa53.c
-DTRMMKERNEL    =  dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
-
-ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
-
-ifeq ($(DGEMM_UNROLL_M), 8)
-DGEMMINCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_M).S
-DGEMMITCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_M).S
-else
-DGEMMINCOPY    =  ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
-DGEMMITCOPY    =  ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
-endif
-
-DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX)
-DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-
-ifeq ($(DGEMM_UNROLL_N), 4)
-DGEMMONCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_N).S
-DGEMMOTCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_N).S
-else
-DGEMMONCOPY    =  ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
-DGEMMOTCOPY    =  ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
-endif
-
-DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
-DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-CGEMMKERNEL    =  cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_cortexa53.c
-CTRMMKERNEL    =  ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
-ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
-CGEMMINCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
-CGEMMITCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
-CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX)
-CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-CGEMMONCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
-CGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
-CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
-CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-ZGEMMKERNEL    =  zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_cortexa53.c
-ZTRMMKERNEL    =  ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
-ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
-ZGEMMINCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
-ZGEMMITCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
-ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX)
-ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-ZGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
-ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
+include $(KERNELDIR)/KERNEL.CORTEXA53
diff --git a/kernel/arm64/KERNEL.CORTEXA710 b/kernel/arm64/KERNEL.CORTEXA710
index bd25f7cd8..bc5999097 100644
--- a/kernel/arm64/KERNEL.CORTEXA710
+++ b/kernel/arm64/KERNEL.CORTEXA710
@@ -1,216 +1 @@
-SAMINKERNEL  = ../arm/amin.c
-DAMINKERNEL  = ../arm/amin.c
-CAMINKERNEL  = ../arm/zamin.c
-ZAMINKERNEL  = ../arm/zamin.c
-
-SMAXKERNEL   = ../arm/max.c
-DMAXKERNEL   = ../arm/max.c
-
-SMINKERNEL   = ../arm/min.c
-DMINKERNEL   = ../arm/min.c
-
-ISAMINKERNEL = ../arm/iamin.c
-IDAMINKERNEL = ../arm/iamin.c
-ICAMINKERNEL = ../arm/izamin.c
-IZAMINKERNEL = ../arm/izamin.c
-
-ISMAXKERNEL  = ../arm/imax.c
-IDMAXKERNEL  = ../arm/imax.c
-
-ISMINKERNEL  = ../arm/imin.c
-IDMINKERNEL  = ../arm/imin.c
-
-STRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-STRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-STRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-STRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-DTRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-DTRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-DTRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-DTRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-TRSMCOPYLN_M    = trsm_lncopy_sve.c
-TRSMCOPYLT_M    = trsm_ltcopy_sve.c
-TRSMCOPYUN_M    = trsm_uncopy_sve.c
-TRSMCOPYUT_M    = trsm_utcopy_sve.c
-
-CTRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-CTRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-CTRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-CTRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-ZTRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-ZTRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-ZTRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-ZTRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-ZTRSMCOPYLN_M    = ztrsm_lncopy_sve.c
-ZTRSMCOPYLT_M    = ztrsm_ltcopy_sve.c
-ZTRSMCOPYUN_M    = ztrsm_uncopy_sve.c
-ZTRSMCOPYUT_M    = ztrsm_utcopy_sve.c
-
-
-SAMAXKERNEL  = amax.S
-DAMAXKERNEL  = amax.S
-CAMAXKERNEL  = zamax.S
-ZAMAXKERNEL  = zamax.S
-
-SAXPYKERNEL  = axpy.S
-DAXPYKERNEL  = axpy.S
-CAXPYKERNEL  = zaxpy.S
-ZAXPYKERNEL  = zaxpy.S
-
-SROTKERNEL   = rot.S
-DROTKERNEL   = rot.S
-CROTKERNEL   = zrot.S
-ZROTKERNEL   = zrot.S
-
-SSCALKERNEL  = scal.S
-DSCALKERNEL  = scal.S
-CSCALKERNEL  = zscal.S
-ZSCALKERNEL  = zscal.S
-
-SGEMVNKERNEL = gemv_n.S
-DGEMVNKERNEL = gemv_n.S
-CGEMVNKERNEL = zgemv_n.S
-ZGEMVNKERNEL = zgemv_n.S
-
-SGEMVTKERNEL = gemv_t.S
-DGEMVTKERNEL = gemv_t.S
-CGEMVTKERNEL = zgemv_t.S
-ZGEMVTKERNEL = zgemv_t.S
-
-
-SASUMKERNEL    = asum.S
-DASUMKERNEL    = asum.S
-CASUMKERNEL    = casum.S
-ZASUMKERNEL    = zasum.S
-
-SCOPYKERNEL    = copy.S
-DCOPYKERNEL    = copy.S
-CCOPYKERNEL    = copy.S
-ZCOPYKERNEL    = copy.S
-
-SSWAPKERNEL    = swap.S
-DSWAPKERNEL    = swap.S
-CSWAPKERNEL    = swap.S
-ZSWAPKERNEL    = swap.S
-
-ISAMAXKERNEL   = iamax.S
-IDAMAXKERNEL   = iamax.S
-ICAMAXKERNEL   = izamax.S
-IZAMAXKERNEL   = izamax.S
-
-SNRM2KERNEL    = nrm2.S
-DNRM2KERNEL    = nrm2.S
-CNRM2KERNEL    = znrm2.S
-ZNRM2KERNEL    = znrm2.S
-
-DDOTKERNEL     = dot.S
-ifneq ($(C_COMPILER), PGI)
-SDOTKERNEL     = ../generic/dot.c
-else
-SDOTKERNEL = dot.S
-endif
-ifneq ($(C_COMPILER), PGI)
-CDOTKERNEL     = zdot.S
-ZDOTKERNEL     = zdot.S
-else
-CDOTKERNEL = ../arm/zdot.c
-ZDOTKERNEL = ../arm/zdot.c
-endif
-DSDOTKERNEL    = dot.S
-
-DGEMM_BETA     = dgemm_beta.S
-SGEMM_BETA     = sgemm_beta.S
-
-SGEMMKERNEL    =  sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
-STRMMKERNEL    =  strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
-
-SGEMMINCOPY    =  sgemm_ncopy_sve_v1.c
-SGEMMITCOPY    =  sgemm_tcopy_sve_v1.c
-SGEMMONCOPY    =  sgemm_ncopy_$(DGEMM_UNROLL_N).S
-SGEMMOTCOPY    =  sgemm_tcopy_$(DGEMM_UNROLL_N).S
-
-SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
-SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
-SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-STRMMUNCOPY_M  =  trmm_uncopy_sve_v1.c
-STRMMLNCOPY_M  =  trmm_lncopy_sve_v1.c
-STRMMUTCOPY_M  =  trmm_utcopy_sve_v1.c
-STRMMLTCOPY_M  =  trmm_ltcopy_sve_v1.c
-
-SSYMMUCOPY_M    =  symm_ucopy_sve.c
-SSYMMLCOPY_M    =  symm_lcopy_sve.c
-
-DGEMMKERNEL    =  dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
-DTRMMKERNEL    =  dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
-
-DGEMMINCOPY    =  dgemm_ncopy_sve_v1.c
-DGEMMITCOPY    =  dgemm_tcopy_sve_v1.c
-DGEMMONCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_N).S
-DGEMMOTCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_N).S
-
-DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX)
-DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX)
-DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
-DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-DTRMMUNCOPY_M  =  trmm_uncopy_sve_v1.c
-DTRMMLNCOPY_M  =  trmm_lncopy_sve_v1.c
-DTRMMUTCOPY_M  =  trmm_utcopy_sve_v1.c
-DTRMMLTCOPY_M  =  trmm_ltcopy_sve_v1.c
-
-DSYMMUCOPY_M    =  symm_ucopy_sve.c
-DSYMMLCOPY_M    =  symm_lcopy_sve.c
-
-CGEMMKERNEL    =  cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-CTRMMKERNEL    =  ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-
-CGEMMINCOPY    =  cgemm_ncopy_sve_v1.c
-CGEMMITCOPY    =  cgemm_tcopy_sve_v1.c
-CGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-CGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-
-CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX)
-CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX)
-CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
-CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-CTRMMUNCOPY_M  =  ztrmm_uncopy_sve_v1.c
-CTRMMLNCOPY_M  =  ztrmm_lncopy_sve_v1.c
-CTRMMUTCOPY_M  =  ztrmm_utcopy_sve_v1.c
-CTRMMLTCOPY_M  =  ztrmm_ltcopy_sve_v1.c
-
-CHEMMLTCOPY_M    =  zhemm_ltcopy_sve.c
-CHEMMUTCOPY_M    =  zhemm_utcopy_sve.c
-
-CSYMMUCOPY_M    =  zsymm_ucopy_sve.c
-CSYMMLCOPY_M    =  zsymm_lcopy_sve.c
-
-ZGEMMKERNEL    =  zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-ZTRMMKERNEL    =  ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-
-ZGEMMINCOPY    =  zgemm_ncopy_sve_v1.c
-ZGEMMITCOPY    =  zgemm_tcopy_sve_v1.c
-ZGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-
-ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX)
-ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX)
-ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
-ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-ZTRMMUNCOPY_M  =  ztrmm_uncopy_sve_v1.c
-ZTRMMLNCOPY_M  =  ztrmm_lncopy_sve_v1.c
-ZTRMMUTCOPY_M  =  ztrmm_utcopy_sve_v1.c
-ZTRMMLTCOPY_M  =  ztrmm_ltcopy_sve_v1.c
-
-ZHEMMLTCOPY_M    =  zhemm_ltcopy_sve.c
-ZHEMMUTCOPY_M    =  zhemm_utcopy_sve.c
-
-ZSYMMUCOPY_M    =  zsymm_ucopy_sve.c
-ZSYMMLCOPY_M    =  zsymm_lcopy_sve.c
+include $(KERNELDIR)/KERNEL.ARMV8SVE
diff --git a/kernel/arm64/KERNEL.CORTEXX2 b/kernel/arm64/KERNEL.CORTEXX2
index bd25f7cd8..bc5999097 100644
--- a/kernel/arm64/KERNEL.CORTEXX2
+++ b/kernel/arm64/KERNEL.CORTEXX2
@@ -1,216 +1 @@
-SAMINKERNEL  = ../arm/amin.c
-DAMINKERNEL  = ../arm/amin.c
-CAMINKERNEL  = ../arm/zamin.c
-ZAMINKERNEL  = ../arm/zamin.c
-
-SMAXKERNEL   = ../arm/max.c
-DMAXKERNEL   = ../arm/max.c
-
-SMINKERNEL   = ../arm/min.c
-DMINKERNEL   = ../arm/min.c
-
-ISAMINKERNEL = ../arm/iamin.c
-IDAMINKERNEL = ../arm/iamin.c
-ICAMINKERNEL = ../arm/izamin.c
-IZAMINKERNEL = ../arm/izamin.c
-
-ISMAXKERNEL  = ../arm/imax.c
-IDMAXKERNEL  = ../arm/imax.c
-
-ISMINKERNEL  = ../arm/imin.c
-IDMINKERNEL  = ../arm/imin.c
-
-STRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-STRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-STRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-STRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-DTRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-DTRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-DTRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-DTRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-TRSMCOPYLN_M    = trsm_lncopy_sve.c
-TRSMCOPYLT_M    = trsm_ltcopy_sve.c
-TRSMCOPYUN_M    = trsm_uncopy_sve.c
-TRSMCOPYUT_M    = trsm_utcopy_sve.c
-
-CTRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-CTRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-CTRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-CTRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-ZTRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-ZTRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-ZTRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-ZTRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-ZTRSMCOPYLN_M    = ztrsm_lncopy_sve.c
-ZTRSMCOPYLT_M    = ztrsm_ltcopy_sve.c
-ZTRSMCOPYUN_M    = ztrsm_uncopy_sve.c
-ZTRSMCOPYUT_M    = ztrsm_utcopy_sve.c
-
-
-SAMAXKERNEL  = amax.S
-DAMAXKERNEL  = amax.S
-CAMAXKERNEL  = zamax.S
-ZAMAXKERNEL  = zamax.S
-
-SAXPYKERNEL  = axpy.S
-DAXPYKERNEL  = axpy.S
-CAXPYKERNEL  = zaxpy.S
-ZAXPYKERNEL  = zaxpy.S
-
-SROTKERNEL   = rot.S
-DROTKERNEL   = rot.S
-CROTKERNEL   = zrot.S
-ZROTKERNEL   = zrot.S
-
-SSCALKERNEL  = scal.S
-DSCALKERNEL  = scal.S
-CSCALKERNEL  = zscal.S
-ZSCALKERNEL  = zscal.S
-
-SGEMVNKERNEL = gemv_n.S
-DGEMVNKERNEL = gemv_n.S
-CGEMVNKERNEL = zgemv_n.S
-ZGEMVNKERNEL = zgemv_n.S
-
-SGEMVTKERNEL = gemv_t.S
-DGEMVTKERNEL = gemv_t.S
-CGEMVTKERNEL = zgemv_t.S
-ZGEMVTKERNEL = zgemv_t.S
-
-
-SASUMKERNEL    = asum.S
-DASUMKERNEL    = asum.S
-CASUMKERNEL    = casum.S
-ZASUMKERNEL    = zasum.S
-
-SCOPYKERNEL    = copy.S
-DCOPYKERNEL    = copy.S
-CCOPYKERNEL    = copy.S
-ZCOPYKERNEL    = copy.S
-
-SSWAPKERNEL    = swap.S
-DSWAPKERNEL    = swap.S
-CSWAPKERNEL    = swap.S
-ZSWAPKERNEL    = swap.S
-
-ISAMAXKERNEL   = iamax.S
-IDAMAXKERNEL   = iamax.S
-ICAMAXKERNEL   = izamax.S
-IZAMAXKERNEL   = izamax.S
-
-SNRM2KERNEL    = nrm2.S
-DNRM2KERNEL    = nrm2.S
-CNRM2KERNEL    = znrm2.S
-ZNRM2KERNEL    = znrm2.S
-
-DDOTKERNEL     = dot.S
-ifneq ($(C_COMPILER), PGI)
-SDOTKERNEL     = ../generic/dot.c
-else
-SDOTKERNEL = dot.S
-endif
-ifneq ($(C_COMPILER), PGI)
-CDOTKERNEL     = zdot.S
-ZDOTKERNEL     = zdot.S
-else
-CDOTKERNEL = ../arm/zdot.c
-ZDOTKERNEL = ../arm/zdot.c
-endif
-DSDOTKERNEL    = dot.S
-
-DGEMM_BETA     = dgemm_beta.S
-SGEMM_BETA     = sgemm_beta.S
-
-SGEMMKERNEL    =  sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
-STRMMKERNEL    =  strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
-
-SGEMMINCOPY    =  sgemm_ncopy_sve_v1.c
-SGEMMITCOPY    =  sgemm_tcopy_sve_v1.c
-SGEMMONCOPY    =  sgemm_ncopy_$(DGEMM_UNROLL_N).S
-SGEMMOTCOPY    =  sgemm_tcopy_$(DGEMM_UNROLL_N).S
-
-SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
-SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
-SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-STRMMUNCOPY_M  =  trmm_uncopy_sve_v1.c
-STRMMLNCOPY_M  =  trmm_lncopy_sve_v1.c
-STRMMUTCOPY_M  =  trmm_utcopy_sve_v1.c
-STRMMLTCOPY_M  =  trmm_ltcopy_sve_v1.c
-
-SSYMMUCOPY_M    =  symm_ucopy_sve.c
-SSYMMLCOPY_M    =  symm_lcopy_sve.c
-
-DGEMMKERNEL    =  dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
-DTRMMKERNEL    =  dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
-
-DGEMMINCOPY    =  dgemm_ncopy_sve_v1.c
-DGEMMITCOPY    =  dgemm_tcopy_sve_v1.c
-DGEMMONCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_N).S
-DGEMMOTCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_N).S
-
-DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX)
-DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX)
-DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
-DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-DTRMMUNCOPY_M  =  trmm_uncopy_sve_v1.c
-DTRMMLNCOPY_M  =  trmm_lncopy_sve_v1.c
-DTRMMUTCOPY_M  =  trmm_utcopy_sve_v1.c
-DTRMMLTCOPY_M  =  trmm_ltcopy_sve_v1.c
-
-DSYMMUCOPY_M    =  symm_ucopy_sve.c
-DSYMMLCOPY_M    =  symm_lcopy_sve.c
-
-CGEMMKERNEL    =  cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-CTRMMKERNEL    =  ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-
-CGEMMINCOPY    =  cgemm_ncopy_sve_v1.c
-CGEMMITCOPY    =  cgemm_tcopy_sve_v1.c
-CGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-CGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-
-CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX)
-CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX)
-CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
-CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-CTRMMUNCOPY_M  =  ztrmm_uncopy_sve_v1.c
-CTRMMLNCOPY_M  =  ztrmm_lncopy_sve_v1.c
-CTRMMUTCOPY_M  =  ztrmm_utcopy_sve_v1.c
-CTRMMLTCOPY_M  =  ztrmm_ltcopy_sve_v1.c
-
-CHEMMLTCOPY_M    =  zhemm_ltcopy_sve.c
-CHEMMUTCOPY_M    =  zhemm_utcopy_sve.c
-
-CSYMMUCOPY_M    =  zsymm_ucopy_sve.c
-CSYMMLCOPY_M    =  zsymm_lcopy_sve.c
-
-ZGEMMKERNEL    =  zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-ZTRMMKERNEL    =  ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-
-ZGEMMINCOPY    =  zgemm_ncopy_sve_v1.c
-ZGEMMITCOPY    =  zgemm_tcopy_sve_v1.c
-ZGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-
-ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX)
-ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX)
-ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
-ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-ZTRMMUNCOPY_M  =  ztrmm_uncopy_sve_v1.c
-ZTRMMLNCOPY_M  =  ztrmm_lncopy_sve_v1.c
-ZTRMMUTCOPY_M  =  ztrmm_utcopy_sve_v1.c
-ZTRMMLTCOPY_M  =  ztrmm_ltcopy_sve_v1.c
-
-ZHEMMLTCOPY_M    =  zhemm_ltcopy_sve.c
-ZHEMMUTCOPY_M    =  zhemm_utcopy_sve.c
-
-ZSYMMUCOPY_M    =  zsymm_ucopy_sve.c
-ZSYMMLCOPY_M    =  zsymm_lcopy_sve.c
+include $(KERNELDIR)/KERNEL.ARMV8SVE
diff --git a/kernel/arm64/KERNEL.NEOVERSEV1 b/kernel/arm64/KERNEL.NEOVERSEV1
index 9a5938459..bc5999097 100644
--- a/kernel/arm64/KERNEL.NEOVERSEV1
+++ b/kernel/arm64/KERNEL.NEOVERSEV1
@@ -1,189 +1 @@
-SAMINKERNEL  = ../arm/amin.c
-DAMINKERNEL  = ../arm/amin.c
-CAMINKERNEL  = ../arm/zamin.c
-ZAMINKERNEL  = ../arm/zamin.c
-
-SMAXKERNEL   = ../arm/max.c
-DMAXKERNEL   = ../arm/max.c
-
-SMINKERNEL   = ../arm/min.c
-DMINKERNEL   = ../arm/min.c
-
-ISAMINKERNEL = ../arm/iamin.c
-IDAMINKERNEL = ../arm/iamin.c
-ICAMINKERNEL = ../arm/izamin.c
-IZAMINKERNEL = ../arm/izamin.c
-
-ISMAXKERNEL  = ../arm/imax.c
-IDMAXKERNEL  = ../arm/imax.c
-
-ISMINKERNEL  = ../arm/imin.c
-IDMINKERNEL  = ../arm/imin.c
-
-STRSMKERNEL_LN	=  ../generic/trsm_kernel_LN.c
-STRSMKERNEL_LT	=  ../generic/trsm_kernel_LT.c
-STRSMKERNEL_RN	=  ../generic/trsm_kernel_RN.c
-STRSMKERNEL_RT	=  ../generic/trsm_kernel_RT.c
-
-DTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
-DTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
-DTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
-DTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
-
-CTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
-CTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
-CTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
-CTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
-
-ZTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
-ZTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
-ZTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
-ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
-
-SAMAXKERNEL  = amax.S
-DAMAXKERNEL  = amax.S
-CAMAXKERNEL  = zamax.S
-ZAMAXKERNEL  = zamax.S
-
-SAXPYKERNEL  = axpy.S
-DAXPYKERNEL  = daxpy_thunderx2t99.S
-CAXPYKERNEL  = zaxpy.S
-ZAXPYKERNEL  = zaxpy.S
-
-SROTKERNEL   = rot.S
-DROTKERNEL   = rot.S
-CROTKERNEL   = zrot.S
-ZROTKERNEL   = zrot.S
-
-SSCALKERNEL  = scal.S
-DSCALKERNEL  = scal.S
-CSCALKERNEL  = zscal.S
-ZSCALKERNEL  = zscal.S
-
-SGEMVNKERNEL = gemv_n.S
-DGEMVNKERNEL = gemv_n.S
-CGEMVNKERNEL = zgemv_n.S
-ZGEMVNKERNEL = zgemv_n.S
-
-SGEMVTKERNEL = gemv_t.S
-DGEMVTKERNEL = gemv_t.S
-CGEMVTKERNEL = zgemv_t.S
-ZGEMVTKERNEL = zgemv_t.S
-
-
-SASUMKERNEL    = sasum_thunderx2t99.c
-DASUMKERNEL    = dasum_thunderx2t99.c
-CASUMKERNEL    = casum_thunderx2t99.c
-ZASUMKERNEL    = zasum_thunderx2t99.c
-
-SCOPYKERNEL    = copy_thunderx2t99.c
-DCOPYKERNEL    = copy_thunderx2t99.c
-CCOPYKERNEL    = copy_thunderx2t99.c
-ZCOPYKERNEL    = copy_thunderx2t99.c
-
-SSWAPKERNEL    = swap_thunderx2t99.S
-DSWAPKERNEL    = swap_thunderx2t99.S
-CSWAPKERNEL    = swap_thunderx2t99.S
-ZSWAPKERNEL    = swap_thunderx2t99.S
-
-ISAMAXKERNEL   = iamax_thunderx2t99.c
-IDAMAXKERNEL   = iamax_thunderx2t99.c
-ICAMAXKERNEL   = izamax_thunderx2t99.c
-IZAMAXKERNEL   = izamax_thunderx2t99.c
-
-SNRM2KERNEL    = scnrm2_thunderx2t99.c
-DNRM2KERNEL    = dznrm2_thunderx2t99.c
-CNRM2KERNEL    = scnrm2_thunderx2t99.c
-ZNRM2KERNEL    = dznrm2_thunderx2t99.c
-
-DDOTKERNEL     = dot.c
-SDOTKERNEL     = dot.c
-CDOTKERNEL     = zdot_thunderx2t99.c
-ZDOTKERNEL     = zdot_thunderx2t99.c
-DSDOTKERNEL    = dot.S
-
-DGEMM_BETA     = dgemm_beta.S
-SGEMM_BETA     = sgemm_beta.S
-
-SGEMMKERNEL    =  sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
-STRMMKERNEL    =  strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
-ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
-ifeq ($(SGEMM_UNROLL_M), 16)
-SGEMMITCOPY    =  sgemm_tcopy_$(SGEMM_UNROLL_M).S
-else
-SGEMMITCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
-endif
-ifeq ($(SGEMM_UNROLL_M), 4)
-SGEMMINCOPY    =  sgemm_ncopy_$(SGEMM_UNROLL_M).S
-else
-SGEMMINCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
-endif
-SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-ifeq ($(SGEMM_UNROLL_N), 16)
-SGEMMOTCOPY    =  sgemm_tcopy_$(SGEMM_UNROLL_N).S
-else
-SGEMMOTCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
-endif
-ifeq ($(SGEMM_UNROLL_N), 4)
-SGEMMONCOPY    =  sgemm_ncopy_$(SGEMM_UNROLL_N).S
-else
-SGEMMONCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
-endif
-SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
-SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-DGEMMKERNEL    =  dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
-DTRMMKERNEL    =  dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
-
-ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
-
-ifeq ($(DGEMM_UNROLL_M), 8)
-DGEMMINCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_M).S
-DGEMMITCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_M).S
-else
-DGEMMINCOPY    =  ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
-DGEMMITCOPY    =  ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
-endif
-
-DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX)
-DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-
-ifeq ($(DGEMM_UNROLL_N), 4)
-DGEMMONCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_N).S
-DGEMMOTCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_N).S
-else
-DGEMMONCOPY    =  ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
-DGEMMOTCOPY    =  ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
-endif
-
-DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
-DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-CGEMMKERNEL    =  cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
-CTRMMKERNEL    =  ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
-ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
-CGEMMINCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
-CGEMMITCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
-CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX)
-CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-CGEMMONCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
-CGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
-CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
-CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-ZGEMMKERNEL    =  zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
-ZTRMMKERNEL    =  ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
-ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
-ZGEMMINCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
-ZGEMMITCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
-ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX)
-ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-ZGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
-ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
+include $(KERNELDIR)/KERNEL.ARMV8SVE
diff --git a/kernel/arm64/KERNEL.THUNDERX3T110 b/kernel/arm64/KERNEL.THUNDERX3T110
index 41cedc851..5d3bd69f7 100644
--- a/kernel/arm64/KERNEL.THUNDERX3T110
+++ b/kernel/arm64/KERNEL.THUNDERX3T110
@@ -1,184 +1 @@
-SAMINKERNEL  = ../arm/amin.c
-DAMINKERNEL  = ../arm/amin.c
-CAMINKERNEL  = ../arm/zamin.c
-ZAMINKERNEL  = ../arm/zamin.c
-
-SMAXKERNEL   = ../arm/max.c
-DMAXKERNEL   = ../arm/max.c
-
-SMINKERNEL   = ../arm/min.c
-DMINKERNEL   = ../arm/min.c
-
-ISAMINKERNEL = ../arm/iamin.c
-IDAMINKERNEL = ../arm/iamin.c
-ICAMINKERNEL = ../arm/izamin.c
-IZAMINKERNEL = ../arm/izamin.c
-
-ISMAXKERNEL  = ../arm/imax.c
-IDMAXKERNEL  = ../arm/imax.c
-
-ISMINKERNEL  = ../arm/imin.c
-IDMINKERNEL  = ../arm/imin.c
-
-STRSMKERNEL_LN	=  ../generic/trsm_kernel_LN.c
-STRSMKERNEL_LT	=  ../generic/trsm_kernel_LT.c
-STRSMKERNEL_RN	=  ../generic/trsm_kernel_RN.c
-STRSMKERNEL_RT	=  ../generic/trsm_kernel_RT.c
-
-DTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
-DTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
-DTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
-DTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
-
-CTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
-CTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
-CTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
-CTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
-
-ZTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
-ZTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
-ZTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
-ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
-
-SAMAXKERNEL  = amax.S
-DAMAXKERNEL  = amax.S
-CAMAXKERNEL  = zamax.S
-ZAMAXKERNEL  = zamax.S
-
-SAXPYKERNEL  = axpy.S
-DAXPYKERNEL  = daxpy_thunderx2t99.S
-CAXPYKERNEL  = zaxpy.S
-ZAXPYKERNEL  = zaxpy.S
-
-SROTKERNEL   = rot.S
-DROTKERNEL   = rot.S
-CROTKERNEL   = zrot.S
-ZROTKERNEL   = zrot.S
-
-SSCALKERNEL  = scal.S
-DSCALKERNEL  = scal.S
-CSCALKERNEL  = zscal.S
-ZSCALKERNEL  = zscal.S
-
-SGEMVNKERNEL = gemv_n.S
-DGEMVNKERNEL = gemv_n.S
-CGEMVNKERNEL = zgemv_n.S
-ZGEMVNKERNEL = zgemv_n.S
-
-SGEMVTKERNEL = gemv_t.S
-DGEMVTKERNEL = gemv_t.S
-CGEMVTKERNEL = zgemv_t.S
-ZGEMVTKERNEL = zgemv_t.S
-
-STRMMKERNEL    =  strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
-ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
-SGEMMINCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
-SGEMMITCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
-SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-SGEMMONCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
-SGEMMOTCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
-SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
-SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-DTRMMKERNEL    =  dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
-
-ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
-
-ifeq ($(DGEMM_UNROLL_M), 8)
-DGEMMINCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_M).S
-DGEMMITCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_M).S
-else
-DGEMMINCOPY    =  ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
-DGEMMITCOPY    =  ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
-endif
-
-DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX)
-DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-
-ifeq ($(DGEMM_UNROLL_N), 4)
-DGEMMONCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_N).S
-DGEMMOTCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_N).S
-else
-DGEMMONCOPY    =  ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
-DGEMMOTCOPY    =  ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
-endif
-
-DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
-DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-CTRMMKERNEL    =  ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
-ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
-CGEMMINCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
-CGEMMITCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
-CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX)
-CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-CGEMMONCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
-CGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
-CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
-CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-ZTRMMKERNEL    =  ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
-ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
-ZGEMMINCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
-ZGEMMITCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
-ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX)
-ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-ZGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
-ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-SASUMKERNEL    = sasum_thunderx2t99.c
-DASUMKERNEL    = dasum_thunderx2t99.c
-CASUMKERNEL    = casum_thunderx2t99.c
-ZASUMKERNEL    = zasum_thunderx2t99.c
-
-SCOPYKERNEL    = copy_thunderx2t99.c
-DCOPYKERNEL    = copy_thunderx2t99.c
-CCOPYKERNEL    = copy_thunderx2t99.c
-ZCOPYKERNEL    = copy_thunderx2t99.c
-
-SSWAPKERNEL    = swap_thunderx2t99.S
-DSWAPKERNEL    = swap_thunderx2t99.S
-CSWAPKERNEL    = swap_thunderx2t99.S
-ZSWAPKERNEL    = swap_thunderx2t99.S
-
-ISAMAXKERNEL   = iamax_thunderx2t99.c
-IDAMAXKERNEL   = iamax_thunderx2t99.c
-ICAMAXKERNEL   = izamax_thunderx2t99.c
-IZAMAXKERNEL   = izamax_thunderx2t99.c
-
-SNRM2KERNEL    = scnrm2_thunderx2t99.c
-CNRM2KERNEL    = scnrm2_thunderx2t99.c
-#DNRM2KERNEL    = dznrm2_thunderx2t99_fast.c
-#ZNRM2KERNEL    = dznrm2_thunderx2t99_fast.c
-DNRM2KERNEL    = dznrm2_thunderx2t99.c
-ZNRM2KERNEL    = dznrm2_thunderx2t99.c
-
-
-DDOTKERNEL     = dot.c
-SDOTKERNEL     = dot.c
-CDOTKERNEL     = zdot_thunderx2t99.c
-ZDOTKERNEL     = zdot_thunderx2t99.c
-DSDOTKERNEL    = dot.S
-
-ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4)
-DGEMMKERNEL    = dgemm_kernel_8x4_thunderx2t99.S
-endif
-
-ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4)
-SGEMMKERNEL    =  sgemm_kernel_16x4_thunderx2t99.S
-endif
-
-ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4)
-CGEMMKERNEL    =  cgemm_kernel_8x4_thunderx2t99.S
-endif
-
-ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4)
-ZGEMMKERNEL    =  zgemm_kernel_4x4_thunderx2t99.S
-endif
+include $(KERNELDIR)/KERNEL.THUNDERX2T99
diff --git a/kernel/arm64/cgemm_kernel_8x4.S b/kernel/arm64/cgemm_kernel_8x4.S
index 24e08a646..f100adc7a 100644
--- a/kernel/arm64/cgemm_kernel_8x4.S
+++ b/kernel/arm64/cgemm_kernel_8x4.S
@@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define pCRow3		x15
 #define pA		x16
 #define alphaR		w17
-#define alphaI		w18
+#define alphaI		w19
 
 #define alpha0_R	s10
 #define alphaV0_R	v10.s[0]
diff --git a/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S b/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S
index 29a68ff22..2c63925be 100644
--- a/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S
+++ b/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S
@@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define pCRow3		x15
 #define pA		x16
 #define alphaR		w17
-#define alphaI		w18
+#define alphaI		w19
 
 #define alpha0_R	s10
 #define alphaV0_R	v10.s[0]
diff --git a/kernel/arm64/cgemm_kernel_sve_v1x4.S b/kernel/arm64/cgemm_kernel_sve_v1x4.S
index 38770f66b..2136ebbee 100644
--- a/kernel/arm64/cgemm_kernel_sve_v1x4.S
+++ b/kernel/arm64/cgemm_kernel_sve_v1x4.S
@@ -240,7 +240,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
     add pB, pB, 32
 
-	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE+64]
 .endm
 
 .macro KERNELv1x4_M1
@@ -276,9 +275,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     ld1rw  z15.s, p0/z,  [pB, 28]
 
     add pB, pB, 32
-	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE]
-
-	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE+64]
 .endm
 
 .macro KERNELv1x4_M2
@@ -313,11 +309,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	OP_ri	z23.s, p1/m, z2.s, z15.s
     ld1rw  z15.s, p0/z,  [pB, 28]
 
-	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE]
-
     add pB, pB, 32
-
-	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE+64]
 .endm
 
 .macro KERNELv1x4_E
@@ -341,10 +333,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	OP_ii	z22.s, p1/m, z3.s, z15.s
 	OP_ri	z23.s, p1/m, z2.s, z15.s
 
-	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE]
-
-	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE+64]
-
 .endm
 
 .macro KERNELv1x4_SUB
@@ -383,13 +371,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	OP_ii	z22.s, p1/m, z1.s, z15.s
 	OP_ri	z23.s, p1/m, z0.s, z15.s
 
-	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE]
-	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE]
 .endm
 
 .macro SAVEv1x4
-	prfm	PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
-
 	ld2w	{z24.s, z25.s}, p1/z, [pCRow0]
 	fmla	z24.s, p1/m, z16.s, alphaz_R
 	fmls	z24.s, p1/m, z17.s, alphaz_I
@@ -407,8 +391,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	st2w 	{z26.s, z27.s}, p1, [pCRow1]
 
 	add	pCRow1, pCRow1, lanes, lsl #3
-	prfm	PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
-
 	ld2w	{z28.s, z29.s}, p1/z, [pCRow2]
 	fmla	z28.s, p1/m, z20.s, alphaz_R
 	fmls	z28.s, p1/m, z21.s, alphaz_I
@@ -425,12 +407,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	fmla	z31.s, p1/m, z23.s, alphaz_R
 	st2w 	{z30.s, z31.s}, p1, [pCRow3]
 
-	prfm	PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
-
 	add	pCRow3, pCRow3, lanes, lsl #3	// pC = pC + lanes  * 2 *4
 
-	prfm	PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
-
 .endm
 
 /******************************************************************************/
@@ -466,8 +444,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVEv1x2
-	prfm	PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
-
 	ld2w	{z24.s, z25.s}, p1/z, [pCRow0]
 	fmla	z24.s, p1/m, z16.s, alphaz_R
 	fmls	z24.s, p1/m, z17.s, alphaz_I
@@ -485,10 +461,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	st2w 	{z26.s, z27.s}, p1, [pCRow1]
 
 	add	pCRow1, pCRow1, lanes, lsl #3
-	prfm	PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
-
-	prfm	PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
-
 .endm
 
 /******************************************************************************/
@@ -516,8 +488,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVEv1x1
-	prfm	PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
-
 	ld2w	{z24.s, z25.s}, p1/z, [pCRow0]
 	fmla	z24.s, p1/m, z16.s, alphaz_R
 	fmls	z24.s, p1/m, z17.s, alphaz_I
@@ -527,8 +497,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	add	pCRow0, pCRow0, lanes, lsl #3	// pC = pC + lanes  * 2 *4
 
-	prfm	PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
-
 .endm
 
 /******************************************************************************/
@@ -553,9 +521,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stp	x26, x27, [sp, #(9 * 16)]
 	str	x28, [sp, #(10 * 16)]
 
-	prfm	PLDL1KEEP, [origPB]
-	prfm	PLDL1KEEP, [origPA]
-
 	fmov	alphaR, s0
 	dup	    alphaz_R, alphaR
 	fmov	alphaI, s1
@@ -676,10 +641,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	bne	.Lcgemm_kernel_L4_Mv1_46
 
 .Lcgemm_kernel_L4_Mv1_100:
-	prfm	PLDL1KEEP, [pA]
-	prfm	PLDL1KEEP, [pA, #64]
-	prfm	PLDL1KEEP, [origPB]
-
 	SAVEv1x4
 
 .Lcgemm_kernel_L4_Mv1_END:
diff --git a/kernel/arm64/cgemm_ncopy_sve_v1.c b/kernel/arm64/cgemm_ncopy_sve_v1.c
index 6aa44a8f6..2fdaf5fcd 100644
--- a/kernel/arm64/cgemm_ncopy_sve_v1.c
+++ b/kernel/arm64/cgemm_ncopy_sve_v1.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
     boffset = b;
 
     j = 0;
-    svbool_t pg = svwhilelt_b32(j, n);
+    svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
     uint32_t active = svcntp_b32(svptrue_b32(), pg);
     do {
 
@@ -69,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
         aoffset += active * lda * 2;
 
         j += svcntw();
-        pg = svwhilelt_b32(j, n);
+        pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
         active = svcntp_b32(svptrue_b32(), pg);
 
 
diff --git a/kernel/arm64/cgemm_tcopy_sve_v1.c b/kernel/arm64/cgemm_tcopy_sve_v1.c
index 748cd954e..086a2fed1 100644
--- a/kernel/arm64/cgemm_tcopy_sve_v1.c
+++ b/kernel/arm64/cgemm_tcopy_sve_v1.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -50,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
     boffset = b;
 
     j = 0;
-    svbool_t pg = svwhilelt_b32(j, n);
+    svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
     uint32_t active = svcntp_b32(svptrue_b32(), pg);
     do {
 
@@ -66,7 +67,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
         aoffset += active * 2;
 
         j += svcntw();
-        pg = svwhilelt_b32(j, n);
+        pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
         active = svcntp_b32(svptrue_b32(), pg);
 
     } while (svptest_any(svptrue_b32(), pg));
diff --git a/kernel/arm64/csum_thunderx2t99.c b/kernel/arm64/csum_thunderx2t99.c
new file mode 100644
index 000000000..874f4eb5a
--- /dev/null
+++ b/kernel/arm64/csum_thunderx2t99.c
@@ -0,0 +1,247 @@
+/***************************************************************************
+Copyright (c) 2017, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#include <arm_neon.h>
+
+#define	N	"x0"	/* vector length */
+#define	X	"x1"	/* "X" vector address */
+#define	INC_X	"x2"	/* "X" stride */
+#define J	"x5"	/* loop variable */
+
+#define REG0	"wzr"
+#define SUMF	"s0"
+#define SUMFD	"d0"
+
+/******************************************************************************/
+/* KERNEL_F1: one contiguous complex element - load (re, im), add both into SUMF. */
+#define KERNEL_F1					\
+	"ldr	d1, ["X"]			\n"	\
+	"add	"X", "X", #8			\n"	\
+	"ext	v2.8b, v1.8b, v1.8b, #4		\n"	\
+	"fadd	s1, s1, s2			\n"	\
+	"fadd	"SUMF", "SUMF", s1		\n"
+/* KERNEL_F32: 32 contiguous complex elements (256 bytes) summed into v0-v7; writes v16-v31 and prefetches ahead of X. */
+#define KERNEL_F32					\
+	"ldr	q16, ["X"]			\n"	\
+	"ldr	q17, ["X", #16]			\n"	\
+	"ldr	q18, ["X", #32]			\n"	\
+	"ldr	q19, ["X", #48]			\n"	\
+	"ldp	q20, q21, ["X", #64]		\n"	\
+	"ldp	q22, q23, ["X", #96]		\n"	\
+	"ldp	q24, q25, ["X", #128]		\n"	\
+	"ldp	q26, q27, ["X", #160]		\n"	\
+	"fadd	v16.4s, v16.4s, v17.4s		\n"	\
+	"fadd	v18.4s, v18.4s, v19.4s		\n"	\
+	"ldp	q28, q29, ["X", #192]		\n"	\
+	"ldp	q30, q31, ["X", #224]		\n"	\
+	"add	"X", "X", #256			\n"	\
+	"fadd	v20.4s, v20.4s, v21.4s		\n"	\
+	"fadd	v22.4s, v22.4s, v23.4s		\n"	\
+	"PRFM	PLDL1KEEP, ["X", #1024]		\n"	\
+	"PRFM	PLDL1KEEP, ["X", #1024+64]	\n"	\
+	"fadd	v24.4s, v24.4s, v25.4s		\n"	\
+	"fadd	v26.4s, v26.4s, v27.4s		\n"	\
+	"fadd	v0.4s, v0.4s, v16.4s		\n"	\
+	"fadd	v1.4s, v1.4s, v18.4s		\n"	\
+	"fadd	v2.4s, v2.4s, v20.4s		\n"	\
+	"fadd	v3.4s, v3.4s, v22.4s		\n"	\
+	"PRFM	PLDL1KEEP, ["X", #1024+128]	\n"	\
+	"PRFM	PLDL1KEEP, ["X", #1024+192]	\n"	\
+	"fadd	v28.4s, v28.4s, v29.4s		\n"	\
+	"fadd	v30.4s, v30.4s, v31.4s		\n"	\
+	"fadd	v4.4s, v4.4s, v24.4s		\n"	\
+	"fadd	v5.4s, v5.4s, v26.4s		\n"	\
+	"fadd	v6.4s, v6.4s, v28.4s		\n"	\
+	"fadd	v7.4s, v7.4s, v30.4s		\n"
+/* KERNEL_F32_FINALIZE: reduce the eight vector accumulators v0-v7 to the scalar SUMF. */
+#define KERNEL_F32_FINALIZE				\
+	"fadd	v0.4s, v0.4s, v1.4s		\n"	\
+	"fadd	v2.4s, v2.4s, v3.4s		\n"	\
+	"fadd	v4.4s, v4.4s, v5.4s		\n"	\
+	"fadd	v6.4s, v6.4s, v7.4s		\n"	\
+	"fadd	v0.4s, v0.4s, v2.4s		\n"	\
+	"fadd	v4.4s, v4.4s, v6.4s		\n"	\
+	"fadd	v0.4s, v0.4s, v4.4s		\n"	\
+	"ext	v1.16b, v0.16b, v0.16b, #8	\n"	\
+	"fadd	v0.2s, v0.2s, v1.2s		\n"	\
+	"faddp	"SUMF", v0.2s			\n"
+/* INIT_S: scale the stride by 8 (lsl #3), the byte offset KERNEL_S1 adds to X per element. */
+#define INIT_S						\
+	"lsl	"INC_X", "INC_X", #3		\n"
+/* KERNEL_S1: same as KERNEL_F1, but X advances by INC_X instead of 8. */
+#define KERNEL_S1					\
+	"ldr	d1, ["X"]			\n"	\
+	"add	"X", "X", "INC_X"		\n"	\
+	"ext	v2.8b, v1.8b, v1.8b, #4		\n"	\
+	"fadd	s1, s1, s2			\n"	\
+	"fadd	"SUMF", "SUMF", s1		\n"
+
+
+#if defined(SMP)
+extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
+	BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
+	void *c, BLASLONG ldc, int (*function)(), int nthreads);
+#endif
+
+
+static FLOAT casum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	FLOAT  asum = 0.0 ;
+	/* Returns the plain sum of the real and imaginary parts of n complex elements of x (no absolute values); 0 when n <= 0 or inc_x <= 0. */
+	if ( n < 0 )  return(asum);
+	/* The template uses fixed registers (see the N, X, INC_X, J and SUMF macros), so the operands are copied into them first. */
+	__asm__ __volatile__ (
+	"	mov	"N", %[N_]			\n"
+	"	mov	"X", %[X_]			\n"
+	"	mov	"INC_X", %[INCX_]		\n"
+	"	fmov	"SUMF", "REG0"			\n"
+	"	fmov	s1, "REG0"			\n"
+	"	fmov	s2, "REG0"			\n"
+	"	fmov	s3, "REG0"			\n"
+	"	fmov	s4, "REG0"			\n"
+	"	fmov	s5, "REG0"			\n"
+	"	fmov	s6, "REG0"			\n"
+	"	fmov	s7, "REG0"			\n"
+	"	cmp	"N", xzr			\n"
+	"	ble	9f //asum_kernel_L999		\n"
+	"	cmp	"INC_X", xzr			\n"
+	"	ble	9f //asum_kernel_L999		\n"
+	"	cmp	"INC_X", #1			\n"
+	"	bne	5f //asum_kernel_S_BEGIN	\n"
+	/* Unit stride: blocks of 32 elements (label 2), then the n & 31 leftovers one at a time (label 4). */
+	"1: //asum_kernel_F_BEGIN:			\n"
+	"	asr	"J", "N", #5			\n"
+	"	cmp	"J", xzr			\n"
+	"	beq	3f //asum_kernel_F1		\n"
+
+	"2: //asum_kernel_F32:				\n"
+	"	"KERNEL_F32"				\n"
+	"	subs	"J", "J", #1			\n"
+	"	bne	2b //asum_kernel_F32		\n"
+	"	"KERNEL_F32_FINALIZE"			\n"
+
+	"3: //asum_kernel_F1:				\n"
+	"	ands	"J", "N", #31			\n"
+	"	ble	9f //asum_kernel_L999		\n"
+
+	"4: //asum_kernel_F10:				\n"
+	"	"KERNEL_F1"				\n"
+	"	subs    "J", "J", #1			\n"
+	"	bne	4b //asum_kernel_F10		\n"
+	"	b	9f //asum_kernel_L999		\n"
+	/* Non-unit stride: four elements per iteration (label 6), then the n & 3 leftovers (label 8). */
+	"5: //asum_kernel_S_BEGIN:			\n"
+	"	"INIT_S"				\n"
+	"	asr	"J", "N", #2			\n"
+	"	cmp	"J", xzr			\n"
+	"	ble	7f //asum_kernel_S1		\n"
+
+	"6: //asum_kernel_S4:				\n"
+	"	"KERNEL_S1"				\n"
+	"	"KERNEL_S1"				\n"
+	"	"KERNEL_S1"				\n"
+	"	"KERNEL_S1"				\n"
+	"	subs	"J", "J", #1			\n"
+	"	bne	6b //asum_kernel_S4		\n"
+
+	"7: //asum_kernel_S1:				\n"
+	"	ands	"J", "N", #3			\n"
+	"	ble	9f //asum_kernel_L999		\n"
+
+	"8: //asum_kernel_S10:				\n"
+	"	"KERNEL_S1"				\n"
+	"	subs	"J", "J", #1			\n"
+	"	bne	8b //asum_kernel_S10		\n"
+
+	"9: //asum_kernel_L999:				\n"
+	"	fmov	%[ASUM_], "SUMFD"		\n"
+	/* KERNEL_F32 loads into q16-q31, so v16-v31 are listed as clobbers alongside the accumulators d0-d7. */
+	: [ASUM_] "=r" (asum)		//%0
+	: [N_]    "r"  (n),		//%1
+	  [X_]    "r"  (x),		//%2
+	  [INCX_] "r"  (inc_x)		//%3
+	: "cc", "memory", "x0", "x1", "x2", "x3", "x4", "x5",
+	  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+	  "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+	  "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+	);
+
+	return asum;
+}
+
+#if defined(SMP)	/* worker passed to blas_level1_thread_with_return_value by CNAME */
+static int casum_thread_function(BLASLONG n, BLASLONG dummy0,
+	BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
+	BLASLONG inc_y, FLOAT *result, BLASLONG dummy3)
+{
+	*result = casum_compute(n, x, inc_x);	/* sum for this call's n elements; dummy0-3, y and inc_y are not used */
+	/* Always returns 0. */
+	return 0;
+}
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+#if defined(SMP)
+	int nthreads;
+	FLOAT dummy_alpha;
+#endif
+	FLOAT asum = 0.0;
+	/* Entry point: returns casum_compute's sum over x, split across threads in SMP builds when n is large enough. */
+#if defined(SMP)
+	if (inc_x == 0 || n <= 10000)
+		nthreads = 1;
+	else
+		nthreads = num_cpu_avail(1);
+	/* Single-threaded whenever inc_x == 0 or n <= 10000; otherwise num_cpu_avail() decides. */
+	if (nthreads == 1) {
+		asum = casum_compute(n, x, inc_x);
+	} else {
+		int mode, i;
+		char result[MAX_CPU_NUMBER * sizeof(double) * 2];
+		FLOAT *ptr;
+		/* result[] holds one slot of 2 * sizeof(double) bytes per possible CPU. */
+		mode = BLAS_SINGLE  | BLAS_COMPLEX;
+		/* dummy_alpha is passed by address without being initialised; casum_thread_function never reads it. */
+		blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
+				   x, inc_x, NULL, 0, result, 0,
+				   ( void *)casum_thread_function, nthreads);
+		/* NOTE(review): assumes the driver stores each worker's result at a 2 * sizeof(double) byte stride in result[] - confirm against the driver. */
+		ptr = (FLOAT *)result;
+		for (i = 0; i < nthreads; i++) {
+			asum = asum + (*ptr);
+			ptr = (FLOAT *)(((char *)ptr) + sizeof(double) * 2);
+		}
+	}
+#else
+	asum = casum_compute(n, x, inc_x);
+#endif
+
+	return asum;
+}
diff --git a/kernel/arm64/ctrmm_kernel_8x4.S b/kernel/arm64/ctrmm_kernel_8x4.S
index 5c0827397..e8f1d8cf3 100644
--- a/kernel/arm64/ctrmm_kernel_8x4.S
+++ b/kernel/arm64/ctrmm_kernel_8x4.S
@@ -49,10 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define pCRow3		x15
 #define pA		x16
 #define alphaR		w17
-#define alphaI		w18
-#define temp		x19
-#define tempOffset	x20
-#define tempK		x21
+#define alphaI		w19
+#define temp		x20
+#define tempOffset	x21
+#define tempK		x22
 
 #define alpha0_R	s10
 #define alphaV0_R	v10.s[0]
diff --git a/kernel/arm64/dgemm_ncopy_sve_v1.c b/kernel/arm64/dgemm_ncopy_sve_v1.c
deleted file mode 100644
index 1f812c775..000000000
--- a/kernel/arm64/dgemm_ncopy_sve_v1.c
+++ /dev/null
@@ -1,79 +0,0 @@
-/*********************************************************************/
-/* Copyright 2009, 2010 The University of Texas at Austin.           */
-/* All rights reserved.                                              */
-/*                                                                   */
-/* Redistribution and use in source and binary forms, with or        */
-/* without modification, are permitted provided that the following   */
-/* conditions are met:                                               */
-/*                                                                   */
-/*   1. Redistributions of source code must retain the above         */
-/*      copyright notice, this list of conditions and the following  */
-/*      disclaimer.                                                  */
-/*                                                                   */
-/*   2. Redistributions in binary form must reproduce the above      */
-/*      copyright notice, this list of conditions and the following  */
-/*      disclaimer in the documentation and/or other materials       */
-/*      provided with the distribution.                              */
-/*                                                                   */
-/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
-/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
-/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
-/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
-/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
-/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
-/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
-/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
-/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
-/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
-/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
-/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
-/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
-/*    POSSIBILITY OF SUCH DAMAGE.                                    */
-/*                                                                   */
-/* The views and conclusions contained in the software and           */
-/* documentation are those of the authors and should not be          */
-/* interpreted as representing official policies, either expressed   */
-/* or implied, of The University of Texas at Austin.                 */
-/*********************************************************************/
-
-#include <stdio.h>
-#include "common.h"
-#include <arm_sve.h>
-
-// TODO: write in assembly with proper unrolling of inner loop
-int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
-
-    BLASLONG j;
-    IFLOAT *aoffset, *aoffset1, *boffset;
-
-    svint64_t lda_vec = svindex_s64(0LL, lda);
-    uint64_t sve_size = svcntd();
-
-    aoffset = a;
-    boffset = b;
-
-    j = 0;
-    svbool_t pg = svwhilelt_b64(j, n);
-    uint64_t active = svcntp_b64(svptrue_b64(), pg);
-    do {
-
-        aoffset1 = aoffset;
-
-        uint64_t i_cnt = m;
-        while (i_cnt--) {
-            svfloat64_t a_vec = svld1_gather_index(pg, (double *) aoffset1, lda_vec);
-            svst1_f64(pg, (double *) boffset, a_vec);
-            aoffset1++;
-            boffset += active;
-        }
-        aoffset += sve_size * lda;
-
-        j += svcntd();
-        pg = svwhilelt_b64(j, n);
-        active = svcntp_b64(svptrue_b64(), pg);
-
-
-    } while (svptest_any(svptrue_b64(), pg));
-
-    return 0;
-}
diff --git a/kernel/arm64/dgemm_tcopy_sve_v1.c b/kernel/arm64/dgemm_tcopy_sve_v1.c
deleted file mode 100644
index cb645a1b6..000000000
--- a/kernel/arm64/dgemm_tcopy_sve_v1.c
+++ /dev/null
@@ -1,77 +0,0 @@
-/*********************************************************************/
-/* Copyright 2009, 2010 The University of Texas at Austin.           */
-/* All rights reserved.                                              */
-/*                                                                   */
-/* Redistribution and use in source and binary forms, with or        */
-/* without modification, are permitted provided that the following   */
-/* conditions are met:                                               */
-/*                                                                   */
-/*   1. Redistributions of source code must retain the above         */
-/*      copyright notice, this list of conditions and the following  */
-/*      disclaimer.                                                  */
-/*                                                                   */
-/*   2. Redistributions in binary form must reproduce the above      */
-/*      copyright notice, this list of conditions and the following  */
-/*      disclaimer in the documentation and/or other materials       */
-/*      provided with the distribution.                              */
-/*                                                                   */
-/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
-/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
-/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
-/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
-/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
-/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
-/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
-/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
-/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
-/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
-/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
-/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
-/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
-/*    POSSIBILITY OF SUCH DAMAGE.                                    */
-/*                                                                   */
-/* The views and conclusions contained in the software and           */
-/* documentation are those of the authors and should not be          */
-/* interpreted as representing official policies, either expressed   */
-/* or implied, of The University of Texas at Austin.                 */
-/*********************************************************************/
-
-#include <stdio.h>
-#include "common.h"
-#include <arm_sve.h>
-
-// TODO: write in assembly with proper unrolling of inner loop
-int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
-
-    BLASLONG j;
-    IFLOAT *aoffset, *aoffset1, *boffset;
-
-    uint64_t sve_size = svcntd();
-
-    aoffset = a;
-    boffset = b;
-
-    j = 0;
-    svbool_t pg = svwhilelt_b64(j, n);
-    uint64_t active = svcntp_b64(svptrue_b64(), pg);
-    do {
-
-        aoffset1 = aoffset;
-
-        uint64_t i_cnt = m;
-        while (i_cnt--) {
-            svfloat64_t a_vec = svld1(pg, (double *)aoffset1);
-            svst1_f64(pg, (double *) boffset, a_vec);
-            aoffset1 += lda;
-            boffset += active;
-        }
-        aoffset += sve_size;
-
-        j += svcntd();
-        pg = svwhilelt_b64(j, n);
-        active = svcntp_b64(svptrue_b64(), pg);
-
-    } while (svptest_any(svptrue_b64(), pg));
-
-    return 0;
-}
diff --git a/kernel/arm64/dot_kernel_sve.c b/kernel/arm64/dot_kernel_sve.c
index 8460e0d5e..16f4cd537 100644
--- a/kernel/arm64/dot_kernel_sve.c
+++ b/kernel/arm64/dot_kernel_sve.c
@@ -1,4 +1,5 @@
 /***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
 Copyright (c) 2022, Arm Ltd
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
@@ -30,37 +31,84 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <arm_sve.h>
 
 #ifdef DOUBLE
-#define SVE_TYPE svfloat64_t
-#define SVE_ZERO svdup_f64(0.0)
-#define SVE_WHILELT svwhilelt_b64
-#define SVE_ALL svptrue_b64()
-#define SVE_WIDTH svcntd()
+#define DTYPE "d"
+#define WIDTH "d"
+#define SHIFT "3"
 #else
-#define SVE_TYPE svfloat32_t
-#define SVE_ZERO svdup_f32(0.0)
-#define SVE_WHILELT svwhilelt_b32
-#define SVE_ALL svptrue_b32()
-#define SVE_WIDTH svcntw()
+#define DTYPE "s"
+#define WIDTH "w"
+#define SHIFT "2"
 #endif
 
-static FLOAT dot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) {
-        SVE_TYPE acc_a = SVE_ZERO;
-        SVE_TYPE acc_b = SVE_ZERO;
+#define COUNT /* x9 = number of elements per SVE vector */ \
+"        cnt"WIDTH"    x9                                   \n"
+#define SETUP_TRUE /* p0 = all-true predicate */ \
+"        ptrue   p0."DTYPE"                              \n"
+#define OFFSET_INPUTS /* x12/x13 = x/y advanced by one vector */ \
+"        add     x12, %[X_], x9, lsl #"SHIFT"               \n" \
+"        add     x13, %[Y_], x9, lsl #"SHIFT"               \n"
+#define TAIL_WHILE /* p1 = lanes for elements x8..n-1 */ \
+"        whilelo p1."DTYPE", x8, %[N_]                      \n"
+#define UPDATE(pg, x,y,out) /* out += x[x8..] * y[x8..] under predicate pg */ \
+"        ld1"WIDTH"    { z2."DTYPE" }, "pg"/z, ["x", x8, lsl #"SHIFT"]  \n" \
+"        ld1"WIDTH"    { z3."DTYPE" }, "pg"/z, ["y", x8, lsl #"SHIFT"]  \n" \
+"        fmla    "out"."DTYPE", "pg"/m, z2."DTYPE", z3."DTYPE"      \n"
+#define SUM_VECTOR(v) /* horizontal sum of z<v> into scalar register <v> */ \
+"        faddv   "DTYPE""v", p0, z"v"."DTYPE"                     \n"
+#define RET /* result = scalar 1 + scalar 0 */ \
+"        fadd    %"DTYPE"[RET_], "DTYPE"1, "DTYPE"0                     \n"
 
-        BLASLONG sve_width = SVE_WIDTH;
+#define DOT_KERNEL /* n is read via %[N_]; it is not guaranteed to be in x0 */ \
+        COUNT                                             \
+"        mov     z1.d, #0                             \n" \
+"        mov     z0.d, #0                             \n" \
+"        mov     x8, #0                               \n" \
+"        movi    d1, #0x0                             \n" \
+        SETUP_TRUE                                        \
+"        neg     x10, x9, lsl #1                      \n" \
+"        ands    x11, x10, %[N_]                      \n" \
+"        b.eq    2f // skip_2x                        \n" \
+        OFFSET_INPUTS                                     \
+"1: // vector_2x                                      \n" \
+        UPDATE("p0", "%[X_]", "%[Y_]", "z1") \
+        UPDATE("p0", "x12", "x13", "z0") \
+"        sub     x8, x8, x10                          \n" \
+"        cmp     x8, x11                              \n" \
+"        b.lo    1b // vector_2x                      \n" \
+        SUM_VECTOR("1") \
+"2: // skip_2x                                        \n" \
+"        neg     x10, x9                              \n" \
+"        and     x10, x10, %[N_]                      \n" \
+"        cmp     x8, x10                              \n" \
+"        b.hs    4f // tail                           \n" \
+"3: // vector_1x                                      \n" \
+        UPDATE("p0", "%[X_]", "%[Y_]", "z0")              \
+"        add     x8, x8, x9                           \n" \
+"        cmp     x8, x10                              \n" \
+"        b.lo    3b // vector_1x                      \n" \
+"4: // tail                                           \n" \
+"        cmp     x10, %[N_]                           \n" \
+"        b.eq    5f // end                            \n" \
+        TAIL_WHILE                                        \
+        UPDATE("p1", "%[X_]", "%[Y_]", "z0")              \
+"5: // end                                            \n" \
+        SUM_VECTOR("0") \
+        RET
 
-        for (BLASLONG i = 0; i < n; i += sve_width * 2) {
-                svbool_t pg_a = SVE_WHILELT(i, n);
-                svbool_t pg_b = SVE_WHILELT(i + sve_width, n);
+/* Returns sum(x[i] * y[i]) for i in [0, n); x and y are read contiguously. */
+static FLOAT
+dot_kernel_sve(BLASLONG n, FLOAT* x, FLOAT* y)
+{
+  FLOAT ret;
 
-                SVE_TYPE x_vec_a = svld1(pg_a, &x[i]);
-                SVE_TYPE y_vec_a = svld1(pg_a, &y[i]);
-                SVE_TYPE x_vec_b = svld1(pg_b, &x[i + sve_width]);
-                SVE_TYPE y_vec_b = svld1(pg_b, &y[i + sve_width]);
+  asm(DOT_KERNEL
+      : [RET_] "=&w" (ret)
+      : [N_] "r" (n), [X_] "r" (x), [Y_] "r" (y)
+      /* Every register the template writes is declared as clobbered;
+         "memory" records that x[] and y[] are read through the pointers. */
+      : "cc", "memory",
+        "x8", "x9", "x10", "x11", "x12", "x13",
+        "p0", "p1", "z0", "z1", "z2", "z3");
 
-                acc_a = svmla_m(pg_a, acc_a, x_vec_a, y_vec_a);
-                acc_b = svmla_m(pg_b, acc_b, x_vec_b, y_vec_b);
-        }
-
-        return svaddv(SVE_ALL, acc_a) + svaddv(SVE_ALL, acc_b);
+  return ret;
 }
diff --git a/kernel/arm64/dznrm2_thunderx2t99.c b/kernel/arm64/dznrm2_thunderx2t99.c
index e342b0b63..6077c85dd 100644
--- a/kernel/arm64/dznrm2_thunderx2t99.c
+++ b/kernel/arm64/dznrm2_thunderx2t99.c
@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include "common.h"
-
+#include <float.h>
 #include <arm_neon.h>
 
 #if defined(SMP)
@@ -404,7 +404,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 #else
 	nrm2_compute(n, x, inc_x, &ssq, &scale);
 #endif
-	if (fabs(scale) <1.e-300) return 0.;
+	volatile FLOAT sca = fabs(scale);
+	if (sca < DBL_MIN) return 0.;
 	ssq = sqrt(ssq) * scale;
 
 	return ssq;
diff --git a/kernel/arm64/gemm_ncopy_complex_sve_v1x4.c b/kernel/arm64/gemm_ncopy_complex_sve_v1x4.c
new file mode 100644
index 000000000..90f867b44
--- /dev/null
+++ b/kernel/arm64/gemm_ncopy_complex_sve_v1x4.c
@@ -0,0 +1,121 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdint.h>
+#include <stdio.h>
+#include <arm_sve.h>
+
+#include "common.h"
+
+#ifdef DOUBLE
+#define COUNT "cntd"
+#define SV_TYPE svfloat64_t
+#define SV_INDEX svuint64_t
+#define SV_INDEXER svindex_u64
+#define SV_TRUE svptrue_b64
+#define SV_WHILE svwhilelt_b64
+#else
+#define COUNT "cntw"
+#define SV_TYPE svfloat32_t
+#define SV_INDEX svuint32_t
+#define SV_INDEXER svindex_u32
+#define SV_TRUE svptrue_b32
+#define SV_WHILE svwhilelt_b32
+#endif
+
+#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active)           \
+    a_vec_real = svld1_gather_index(pg, a_offset_inner, lda_vec);       \
+    a_vec_imag = svld1_gather_index(pg, a_offset_inner + 1, lda_vec);   \
+    svst2(pg, b_offset, svcreate2(a_vec_real, a_vec_imag));             \
+    a_offset_inner += 2;                                                \
+    b_offset += active * 2;
+/* INNER_COPY (above) gathers one real and one imaginary value per active lane at the lda_vec stride, stores them interleaved at b_offset, then advances the source by 2 and b_offset by 2 * active. It expands to several statements and uses the caller's lda_vec, a_vec_real and a_vec_imag; its lda argument is unused. */
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
+    uint64_t sve_size;
+    asm(COUNT" %[SIZE_]" : [SIZE_]  "=r" (sve_size) : : ); /* COUNT is cntd or cntw: elements per SVE vector */
+    /* Copies m x n complex values from a (leading dimension lda) into b, sve_size lanes at a time; always returns 0. */
+    IFLOAT *a_offset, *a_offset_inner, *b_offset;
+    a_offset = a;
+    b_offset = b;
+    /* Gather indices 0, 2*lda, 4*lda, ...: each lane reads from a location 2*lda elements further into a. */
+    SV_INDEX lda_vec = SV_INDEXER(0LL, lda * 2);
+    SV_TYPE a_vec_real;
+    SV_TYPE a_vec_imag;
+    svbool_t pg_true = SV_TRUE();
+    /* n rounded down to a multiple of sve_size (for power-of-two sve_size): these groups use a full predicate. */
+    BLASLONG single_vectors_n = n & -sve_size;
+    for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) {
+        a_offset_inner = a_offset;
+        /* m copies, unrolled by 4, then the m & 2 and m & 1 leftovers. */
+        svbool_t pg = pg_true;
+        uint64_t active = sve_size;
+        uint64_t i_cnt = m >> 2;
+        while (i_cnt--) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 2) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 1) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        a_offset += sve_size * lda * 2;
+    }
+    /* Remaining n - single_vectors_n lanes: same copy with a partial predicate and active = remaining_n. */
+    BLASLONG remaining_n = n - single_vectors_n;
+    if (remaining_n) {
+        a_offset_inner = a_offset;
+        svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n);
+        uint64_t active = remaining_n;
+        uint64_t i_cnt = m >> 2;
+        while (i_cnt--) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 2) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 1) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+    }
+
+    return 0;
+}
+
diff --git a/kernel/arm64/gemm_ncopy_sve_v1x8.c b/kernel/arm64/gemm_ncopy_sve_v1x8.c
new file mode 100644
index 000000000..7b2a2e767
--- /dev/null
+++ b/kernel/arm64/gemm_ncopy_sve_v1x8.c
@@ -0,0 +1,131 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdint.h>
+#include <stdio.h>
+#include <arm_sve.h>
+
+#include "common.h"
+
+#ifdef DOUBLE
+#define COUNT "cntd"
+#define SV_TYPE svfloat64_t
+#define SV_INDEX svuint64_t
+#define SV_INDEXER svindex_u64
+#define SV_TRUE svptrue_b64
+#define SV_WHILE svwhilelt_b64
+#define SV_PREFETCH svprfd_gather_index
+#else
+#define COUNT "cntw"
+#define SV_TYPE svfloat32_t
+#define SV_INDEX svuint32_t
+#define SV_INDEXER svindex_u32
+#define SV_TRUE svptrue_b32
+#define SV_WHILE svwhilelt_b32
+#define SV_PREFETCH svprfw_gather_index
+#endif
+
+#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active)   \
+    a_vec = svld1_gather_index(pg, a_offset_inner, lda_vec);    \
+    svst1(pg, b_offset, a_vec);                                 \
+    a_offset_inner++;                                           \
+    b_offset += active;
+/* INNER_COPY (above) gathers one value per active lane at the lda_vec stride, stores them contiguously at b_offset, then advances the source by 1 and b_offset by active. It expands to several statements and uses the caller's lda_vec and a_vec; its lda argument is unused. */
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
+    uint64_t sve_size;
+    asm(COUNT" %[SIZE_]" : [SIZE_]  "=r" (sve_size) : : ); /* COUNT is cntd or cntw: elements per SVE vector */
+    /* Copies m x n values from a (leading dimension lda) into b, sve_size lanes at a time; always returns 0. */
+    IFLOAT *a_offset, *a_offset_inner, *b_offset;
+    a_offset = a;
+    b_offset = b;
+    /* Gather indices 0, lda, 2*lda, ...: each lane reads from a location lda elements further into a. */
+    SV_INDEX lda_vec = SV_INDEXER(0LL, lda);
+    SV_TYPE a_vec;
+    svbool_t pg_true = SV_TRUE();
+    /* n rounded down to a multiple of sve_size (for power-of-two sve_size): these groups use a full predicate. */
+    BLASLONG single_vectors_n = n & -sve_size;
+    for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) {
+        a_offset_inner = a_offset;
+        /* m copies, unrolled by 8, then the m & 4, m & 2 and m & 1 leftovers. */
+        svbool_t pg = pg_true;
+        uint64_t active = sve_size;
+        uint64_t i_cnt = m >> 3;
+        while (i_cnt--) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 4) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 2) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 1) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        a_offset += sve_size * lda;
+    }
+    /* Remaining n - single_vectors_n lanes: partial predicate, active = remaining_n; this pass is unrolled by 4 rather than 8. */
+    BLASLONG remaining_n = n - single_vectors_n;
+    if (remaining_n) {
+        a_offset_inner = a_offset;
+        svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n);
+        uint64_t active = remaining_n;
+        uint64_t i_cnt = m >> 2;
+        while (i_cnt--) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 2) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 1) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/arm64/gemm_tcopy_complex_sve_v1x4.c b/kernel/arm64/gemm_tcopy_complex_sve_v1x4.c
new file mode 100644
index 000000000..975166a2e
--- /dev/null
+++ b/kernel/arm64/gemm_tcopy_complex_sve_v1x4.c
@@ -0,0 +1,115 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdint.h>
+#include <stdio.h>
+#include <arm_sve.h>
+
+#include "common.h"
+
+#ifdef DOUBLE  /* pick the 64-bit or 32-bit flavour of every SVE name used below */
+#define COUNT "cntd"  /* mnemonic pasted into the inline asm that fills sve_size */
+#define SV_TYPE svfloat64x2_t  /* two-vector tuple, as produced by svld2 */
+#define SV_TRUE svptrue_b64
+#define SV_WHILE svwhilelt_b64
+#else
+#define COUNT "cntw"
+#define SV_TYPE svfloat32x2_t
+#define SV_TRUE svptrue_b32
+#define SV_WHILE svwhilelt_b32
+#endif
+/* Copy one predicated svld2/svst2 tuple, then step source by lda*2 and destination by active*2; writes the caller's a_vec. */
+#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active)   \
+    do { a_vec = svld2((pg), (a_offset_inner));                 \
+         svst2((pg), (b_offset), a_vec);                        \
+         (a_offset_inner) += (lda) * 2;                         \
+         (b_offset) += (active) * 2; } while (0)
+
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){  /* always returns 0 */
+    uint64_t sve_size = svcntw();  /* initial value is overwritten by the asm on the next line */
+    asm(COUNT" %[SIZE_]" : [SIZE_]  "=r" (sve_size) : : );
+    /* Packs a into b: for each group of up to sve_size element pairs along n, m slices spaced lda pairs apart are stored back to back. */
+    IFLOAT *a_offset, *a_offset_inner, *b_offset;
+    a_offset = a;
+    b_offset = b;
+    /* a_vec is the scratch tuple that INNER_COPY assigns to. */
+    SV_TYPE a_vec;
+    svbool_t pg_true = SV_TRUE();
+    /* Round n down to a multiple of sve_size; the mask form relies on sve_size being a power of two. */
+    BLASLONG single_vectors_n = n & -sve_size;
+    for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) {
+        a_offset_inner = a_offset;
+        /* Full group: all lanes active, so each copy moves sve_size pairs. */
+        svbool_t pg = pg_true;
+        uint64_t active = sve_size;
+        uint64_t i_cnt = m >> 2;  /* number of 4x-unrolled passes over m */
+        while (i_cnt--) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+        /* Leftover m % 4 slices: handle two, then one. */
+        if (m & 2) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 1) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+        /* Next group starts sve_size pairs (sve_size * 2 IFLOATs) further along a. */
+        a_offset += sve_size * 2;
+    }
+    /* Tail: fewer than sve_size pairs remain, so use a partial predicate and a narrower stride in b. */
+    BLASLONG remaining_n = n - single_vectors_n;
+    if (remaining_n) {
+        a_offset_inner = a_offset;
+        svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n);  /* first remaining_n lanes active */
+        uint64_t active = remaining_n;
+        uint64_t i_cnt = m >> 2;
+        while (i_cnt--) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 2) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 1) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+    }
+
+    return 0;
+}
+
+
diff --git a/kernel/arm64/gemm_tcopy_sve_v1x8.c b/kernel/arm64/gemm_tcopy_sve_v1x8.c
new file mode 100644
index 000000000..9a93b6cb7
--- /dev/null
+++ b/kernel/arm64/gemm_tcopy_sve_v1x8.c
@@ -0,0 +1,125 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdint.h>
+#include <stdio.h>
+#include <arm_sve.h>
+
+#include "common.h"
+
+#ifdef DOUBLE  /* pick the 64-bit or 32-bit flavour of every SVE name used below */
+#define COUNT "cntd"  /* mnemonic pasted into the inline asm that fills sve_size */
+#define SV_TYPE svfloat64_t
+#define SV_TRUE svptrue_b64
+#define SV_WHILE svwhilelt_b64
+#else
+#define COUNT "cntw"
+#define SV_TYPE svfloat32_t
+#define SV_TRUE svptrue_b32
+#define SV_WHILE svwhilelt_b32
+#endif
+/* Copy one predicated vector with svld1/svst1, then step source by lda and destination by active; writes the caller's a_vec. */
+#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active)   \
+    do { a_vec = svld1((pg), (a_offset_inner));                 \
+         svst1((pg), (b_offset), a_vec);                        \
+         (a_offset_inner) += (lda);                             \
+         (b_offset) += (active); } while (0)
+
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){  /* always returns 0 */
+    uint64_t sve_size = svcntw();  /* initial value is overwritten by the asm on the next line */
+    asm(COUNT" %[SIZE_]" : [SIZE_]  "=r" (sve_size) : : );
+    /* Packs a into b: for each group of up to sve_size elements along n, m slices spaced lda apart are stored back to back. */
+    IFLOAT *a_offset, *a_offset_inner, *b_offset;
+    a_offset = a;
+    b_offset = b;
+    /* a_vec is the scratch vector that INNER_COPY assigns to. */
+    SV_TYPE a_vec;
+    svbool_t pg_true = SV_TRUE();
+    /* Round n down to a multiple of sve_size; the mask form relies on sve_size being a power of two. */
+    BLASLONG single_vectors_n = n & -sve_size;
+    for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) {
+        a_offset_inner = a_offset;
+        /* Full group: all lanes active, so each copy moves sve_size elements. */
+        svbool_t pg = pg_true;
+        uint64_t active = sve_size;
+        uint64_t i_cnt = m >> 3;  /* number of 8x-unrolled passes over m */
+        while (i_cnt--) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+        /* Leftover m % 8 slices: handle four, then two, then one. */
+        if (m & 4) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 2) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 1) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+        /* Next group starts sve_size elements further along a. */
+        a_offset += sve_size;
+    }
+    /* Tail: fewer than sve_size elements remain, so use a partial predicate and a narrower stride in b. */
+    BLASLONG remaining_n = n - single_vectors_n;
+    if (remaining_n) {
+        a_offset_inner = a_offset;
+        svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n);  /* first remaining_n lanes active */
+        uint64_t active = remaining_n;
+        uint64_t i_cnt = m >> 2;  /* tail is unrolled by 4, not 8; the m & 2 and m & 1 steps below cover the rest */
+        while (i_cnt--) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 2) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+
+        if (m & 1) {
+            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
+        }
+    }
+
+    return 0;
+}
+
diff --git a/kernel/arm64/sgemm_beta.S b/kernel/arm64/sgemm_beta.S
old mode 100755
new mode 100644
diff --git a/kernel/arm64/sgemm_ncopy_sve_v1.c b/kernel/arm64/sgemm_ncopy_sve_v1.c
deleted file mode 100644
index 1bc186335..000000000
--- a/kernel/arm64/sgemm_ncopy_sve_v1.c
+++ /dev/null
@@ -1,78 +0,0 @@
-/*********************************************************************/
-/* Copyright 2009, 2010 The University of Texas at Austin.           */
-/* All rights reserved.                                              */
-/*                                                                   */
-/* Redistribution and use in source and binary forms, with or        */
-/* without modification, are permitted provided that the following   */
-/* conditions are met:                                               */
-/*                                                                   */
-/*   1. Redistributions of source code must retain the above         */
-/*      copyright notice, this list of conditions and the following  */
-/*      disclaimer.                                                  */
-/*                                                                   */
-/*   2. Redistributions in binary form must reproduce the above      */
-/*      copyright notice, this list of conditions and the following  */
-/*      disclaimer in the documentation and/or other materials       */
-/*      provided with the distribution.                              */
-/*                                                                   */
-/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
-/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
-/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
-/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
-/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
-/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
-/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
-/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
-/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
-/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
-/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
-/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
-/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
-/*    POSSIBILITY OF SUCH DAMAGE.                                    */
-/*                                                                   */
-/* The views and conclusions contained in the software and           */
-/* documentation are those of the authors and should not be          */
-/* interpreted as representing official policies, either expressed   */
-/* or implied, of The University of Texas at Austin.                 */
-/*********************************************************************/
-
-#include <stdio.h>
-#include "common.h"
-#include <arm_sve.h>
-
-// TODO: write in assembly with proper unrolling of inner loop
-int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
-
-    BLASLONG j;
-    IFLOAT *aoffset, *aoffset1, *boffset;
-
-    svint32_t lda_vec = svindex_s32(0LL, lda);
-    uint32_t sve_size = svcntw();
-
-    aoffset = a;
-    boffset = b;
-
-    j = 0;
-    svbool_t pg = svwhilelt_b32(j, n);
-    uint32_t active = svcntp_b32(svptrue_b32(), pg);
-    do {
-
-        aoffset1 = aoffset;
-
-        uint32_t i_cnt = m;
-        while (i_cnt--) {
-            svfloat32_t a_vec = svld1_gather_index(pg, (float *) aoffset1, lda_vec);
-            svst1_f32(pg, (float *) boffset, a_vec);
-            aoffset1++;
-            boffset += active;
-        }
-        aoffset += sve_size * lda;
-
-        j += svcntw();
-        pg = svwhilelt_b32(j, n);
-        active = svcntp_b32(svptrue_b32(), pg);
-
-    } while (svptest_any(svptrue_b32(), pg));
-
-    return 0;
-}
diff --git a/kernel/arm64/sgemm_tcopy_sve_v1.c b/kernel/arm64/sgemm_tcopy_sve_v1.c
deleted file mode 100644
index 9f8cf502a..000000000
--- a/kernel/arm64/sgemm_tcopy_sve_v1.c
+++ /dev/null
@@ -1,77 +0,0 @@
-/*********************************************************************/
-/* Copyright 2009, 2010 The University of Texas at Austin.           */
-/* All rights reserved.                                              */
-/*                                                                   */
-/* Redistribution and use in source and binary forms, with or        */
-/* without modification, are permitted provided that the following   */
-/* conditions are met:                                               */
-/*                                                                   */
-/*   1. Redistributions of source code must retain the above         */
-/*      copyright notice, this list of conditions and the following  */
-/*      disclaimer.                                                  */
-/*                                                                   */
-/*   2. Redistributions in binary form must reproduce the above      */
-/*      copyright notice, this list of conditions and the following  */
-/*      disclaimer in the documentation and/or other materials       */
-/*      provided with the distribution.                              */
-/*                                                                   */
-/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
-/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
-/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
-/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
-/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
-/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
-/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
-/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
-/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
-/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
-/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
-/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
-/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
-/*    POSSIBILITY OF SUCH DAMAGE.                                    */
-/*                                                                   */
-/* The views and conclusions contained in the software and           */
-/* documentation are those of the authors and should not be          */
-/* interpreted as representing official policies, either expressed   */
-/* or implied, of The University of Texas at Austin.                 */
-/*********************************************************************/
-
-#include <stdio.h>
-#include "common.h"
-#include <arm_sve.h>
-
-// TODO: write in assembly with proper unrolling of inner loop
-int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
-
-    BLASLONG j;
-    IFLOAT *aoffset, *aoffset1, *boffset;
-
-    uint32_t sve_size = svcntw();
-
-    aoffset = a;
-    boffset = b;
-
-    j = 0;
-    svbool_t pg = svwhilelt_b32(j, n);
-    uint32_t active = svcntp_b32(svptrue_b32(), pg);
-    do {
-
-        aoffset1 = aoffset;
-
-        uint32_t i_cnt = m;
-        while (i_cnt--) {
-            svfloat32_t a_vec = svld1(pg, (float *) aoffset1);
-            svst1_f32(pg, (float *) boffset, a_vec);
-            aoffset1 += lda;
-            boffset += active;
-        }
-        aoffset += sve_size;
-
-        j += svcntw();
-        pg = svwhilelt_b32(j, n);
-        active = svcntp_b32(svptrue_b32(), pg);
-
-    } while (svptest_any(svptrue_b32(), pg));
-
-    return 0;
-}
diff --git a/kernel/arm64/symm_lcopy_sve.c b/kernel/arm64/symm_lcopy_sve.c
index 6ba4afc8b..e138f0647 100644
--- a/kernel/arm64/symm_lcopy_sve.c
+++ b/kernel/arm64/symm_lcopy_sve.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
   svint64_t one_vec = svdup_s64(1LL);
 
   int64_t j = 0;
-  svbool_t pg = svwhilelt_b64(j, n);
+  svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
   int64_t active = svcntp_b64(svptrue_b64(), pg);
   svint64_t index_neg = svindex_s64(0LL, -1LL);
   svint64_t index = svindex_s64(0LL, 1LL);
@@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     posX += sve_size;
     posX_vec = svdup_s64(posX);
     j += sve_size;
-    pg = svwhilelt_b64(j, n);
+    pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
     active = svcntp_b64(svptrue_b64(), pg);
   } while (svptest_any(svptrue_b64(), pg));
 
@@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 
   int32_t N = n;
   int32_t j = 0;
-  svbool_t pg = svwhilelt_b32(j, N);
+  svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
   int32_t active = svcntp_b32(svptrue_b32(), pg);
   svint32_t index_neg = svindex_s32(0, -1);
   svint32_t index = svindex_s32(0, 1);
@@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     posX += sve_size;
     posX_vec = svdup_s32(posX);
     j += sve_size;
-    pg = svwhilelt_b32(j, N);
+    pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
     active = svcntp_b32(svptrue_b32(), pg);
   } while (svptest_any(svptrue_b32(), pg));
 
diff --git a/kernel/arm64/symm_ucopy_sve.c b/kernel/arm64/symm_ucopy_sve.c
index 32da5bd16..9a4cb6d4f 100644
--- a/kernel/arm64/symm_ucopy_sve.c
+++ b/kernel/arm64/symm_ucopy_sve.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
   svint64_t one_vec = svdup_s64(1LL);
 
   int64_t j = 0;
-  svbool_t pg = svwhilelt_b64(j, n);
+  svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
   int64_t active = svcntp_b64(svptrue_b64(), pg);
   svint64_t index_neg = svindex_s64(0LL, -1LL);
   svint64_t index = svindex_s64(0LL, 1LL);
@@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     posX += sve_size;
     posX_vec = svdup_s64(posX);
     j += sve_size;
-    pg = svwhilelt_b64(j, n);
+    pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
     active = svcntp_b64(svptrue_b64(), pg);
   } while (svptest_any(svptrue_b64(), pg));
 
@@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 
   int32_t N = n;
   int32_t j = 0;
-  svbool_t pg = svwhilelt_b32(j, N);
+  svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
   int32_t active = svcntp_b32(svptrue_b32(), pg);
   svint32_t index_neg = svindex_s32(0, -1);
   svint32_t index = svindex_s32(0, 1);
@@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     posX += sve_size;
     posX_vec = svdup_s32(posX);
     j += sve_size;
-    pg = svwhilelt_b32(j, N);
+    pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
     active = svcntp_b32(svptrue_b32(), pg);
   } while (svptest_any(svptrue_b32(), pg));
 
diff --git a/kernel/arm64/trmm_lncopy_sve_v1.c b/kernel/arm64/trmm_lncopy_sve_v1.c
index 918e945ac..c7f79e3fd 100644
--- a/kernel/arm64/trmm_lncopy_sve_v1.c
+++ b/kernel/arm64/trmm_lncopy_sve_v1.c
@@ -52,11 +52,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     FLOAT *ao;
 #ifdef DOUBLE
     svint64_t index = svindex_s64(0LL, lda);
-    svbool_t pn = svwhilelt_b64(js, n);
+    svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
     int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
     svint32_t index = svindex_s32(0, lda);
-    svbool_t pn = svwhilelt_b32(js, n);
+    svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
     int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
     do
@@ -123,11 +123,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
         posY += n_active;
         js += n_active;
 #ifdef DOUBLE
-        pn = svwhilelt_b64(js, n);
+        pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
         n_active = svcntp_b64(svptrue_b64(), pn);
     } while (svptest_any(svptrue_b64(), pn));
 #else
-        pn = svwhilelt_b32(js, n);
+        pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
         n_active = svcntp_b32(svptrue_b32(), pn);
     } while (svptest_any(svptrue_b32(), pn));
 #endif
diff --git a/kernel/arm64/trmm_ltcopy_sve_v1.c b/kernel/arm64/trmm_ltcopy_sve_v1.c
index b76cc56de..b3ba68973 100644
--- a/kernel/arm64/trmm_ltcopy_sve_v1.c
+++ b/kernel/arm64/trmm_ltcopy_sve_v1.c
@@ -51,10 +51,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     FLOAT *ao;
     js = 0;
 #ifdef DOUBLE
-    svbool_t pn = svwhilelt_b64(js, n);
+    svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
     int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
-    svbool_t pn = svwhilelt_b32(js, n);
+    svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
     int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
     do
@@ -122,11 +122,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
         posY += n_active;
         js += n_active;
 #ifdef DOUBLE
-        pn = svwhilelt_b64(js, n);
+        pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
         n_active = svcntp_b64(svptrue_b64(), pn);
     } while (svptest_any(svptrue_b64(), pn));
 #else
-        pn = svwhilelt_b32(js, n);
+        pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
         n_active = svcntp_b32(svptrue_b32(), pn);
     } while (svptest_any(svptrue_b32(), pn));
 #endif
diff --git a/kernel/arm64/trmm_uncopy_sve_v1.c b/kernel/arm64/trmm_uncopy_sve_v1.c
index 75fa163ae..a47d2096c 100644
--- a/kernel/arm64/trmm_uncopy_sve_v1.c
+++ b/kernel/arm64/trmm_uncopy_sve_v1.c
@@ -52,11 +52,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     FLOAT *ao;
 #ifdef DOUBLE
     svint64_t index = svindex_s64(0LL, lda);
-    svbool_t pn = svwhilelt_b64(js, n);
+    svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
     int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
     svint32_t index = svindex_s32(0, lda);
-    svbool_t pn = svwhilelt_b32(js, n);
+    svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
     int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
     do
@@ -123,11 +123,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
         posY += n_active;
         js += n_active;
 #ifdef DOUBLE
-        pn = svwhilelt_b64(js, n);
+        pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
         n_active = svcntp_b64(svptrue_b64(), pn);
     } while (svptest_any(svptrue_b64(), pn));
 #else
-        pn = svwhilelt_b32(js, n);
+        pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
         n_active = svcntp_b32(svptrue_b32(), pn);
     } while (svptest_any(svptrue_b32(), pn));
 #endif
diff --git a/kernel/arm64/trmm_utcopy_sve_v1.c b/kernel/arm64/trmm_utcopy_sve_v1.c
index 36a03242a..c5188beb4 100644
--- a/kernel/arm64/trmm_utcopy_sve_v1.c
+++ b/kernel/arm64/trmm_utcopy_sve_v1.c
@@ -51,10 +51,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     FLOAT *ao;
     js = 0;
 #ifdef DOUBLE
-    svbool_t pn = svwhilelt_b64(js, n);
+    svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
     int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
-    svbool_t pn = svwhilelt_b32(js, n);
+    svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
     int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
     do
@@ -121,11 +121,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
         posY += n_active;
         js += n_active;
 #ifdef DOUBLE
-        pn = svwhilelt_b64(js, n);
+        pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
         n_active = svcntp_b64(svptrue_b64(), pn);
     } while (svptest_any(svptrue_b64(), pn));
 #else
-        pn = svwhilelt_b32(js, n);
+        pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
         n_active = svcntp_b32(svptrue_b32(), pn);
     } while (svptest_any(svptrue_b32(), pn));
 #endif
diff --git a/kernel/arm64/trsm_lncopy_sve.c b/kernel/arm64/trsm_lncopy_sve.c
index 5a9d4194a..2895eb85d 100644
--- a/kernel/arm64/trsm_lncopy_sve.c
+++ b/kernel/arm64/trsm_lncopy_sve.c
@@ -56,13 +56,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
 #ifdef DOUBLE
   int64_t js = 0;
   svint64_t index = svindex_s64(0LL, lda);
-  svbool_t pn = svwhilelt_b64(js, n);
+  svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
   int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
   int32_t N = n;
   int32_t js = 0;
   svint32_t index = svindex_s32(0, lda);
-  svbool_t pn = svwhilelt_b32(js, N);
+  svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
   int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
   do {
@@ -106,11 +106,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
 
     js += n_active;
 #ifdef DOUBLE
-    pn = svwhilelt_b64(js, n);
+    pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
     n_active = svcntp_b64(svptrue_b64(), pn);
   } while (svptest_any(svptrue_b64(), pn));
 #else
-    pn = svwhilelt_b32(js, N);
+    pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
     n_active = svcntp_b32(svptrue_b32(), pn);
   } while (svptest_any(svptrue_b32(), pn));
 #endif
diff --git a/kernel/arm64/trsm_ltcopy_sve.c b/kernel/arm64/trsm_ltcopy_sve.c
index ac4019e26..fdda992e0 100644
--- a/kernel/arm64/trsm_ltcopy_sve.c
+++ b/kernel/arm64/trsm_ltcopy_sve.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -55,12 +56,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
   jj = offset;
 #ifdef DOUBLE
   int64_t js = 0;
-  svbool_t pn = svwhilelt_b64(js, n);
+  svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
   int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
   int32_t N = n;
   int32_t js = 0;
-  svbool_t pn = svwhilelt_b32(js, N);
+  svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
   int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
   do {
@@ -104,11 +105,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
 
     js += n_active;
 #ifdef DOUBLE
-    pn = svwhilelt_b64(js, n);
+    pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
     n_active = svcntp_b64(svptrue_b64(), pn);
   } while (svptest_any(svptrue_b64(), pn));
 #else
-    pn = svwhilelt_b32(js, N);
+    pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
     n_active = svcntp_b32(svptrue_b32(), pn);
   } while (svptest_any(svptrue_b32(), pn));
 #endif
diff --git a/kernel/arm64/trsm_uncopy_sve.c b/kernel/arm64/trsm_uncopy_sve.c
index 8fdcd0f4b..1a03aa974 100644
--- a/kernel/arm64/trsm_uncopy_sve.c
+++ b/kernel/arm64/trsm_uncopy_sve.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -56,13 +57,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
 #ifdef DOUBLE
   int64_t js = 0;
   svint64_t index = svindex_s64(0LL, lda);
-  svbool_t pn = svwhilelt_b64(js, n);
+  svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
   int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
   int32_t N = n;
   int32_t js = 0;
   svint32_t index = svindex_s32(0, lda);
-  svbool_t pn = svwhilelt_b32(js, N);
+  svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
   int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
   do {
@@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
 
     js += n_active;
 #ifdef DOUBLE
-    pn = svwhilelt_b64(js, n);
+    pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
     n_active = svcntp_b64(svptrue_b64(), pn);
   } while (svptest_any(svptrue_b64(), pn));
 #else
-    pn = svwhilelt_b32(js, N);
+    pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
     n_active = svcntp_b32(svptrue_b32(), pn);
   } while (svptest_any(svptrue_b32(), pn));
 #endif
diff --git a/kernel/arm64/trsm_utcopy_sve.c b/kernel/arm64/trsm_utcopy_sve.c
index 0f5f0dccd..b06166f36 100644
--- a/kernel/arm64/trsm_utcopy_sve.c
+++ b/kernel/arm64/trsm_utcopy_sve.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -55,12 +56,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
   jj = offset;
 #ifdef DOUBLE
   int64_t js = 0;
-  svbool_t pn = svwhilelt_b64(js, n);
+  svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
   int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
   int32_t N = n;
   int32_t js = 0;
-  svbool_t pn = svwhilelt_b32(js, N);
+  svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
   int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
   do {
@@ -104,11 +105,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
 
     js += n_active;
 #ifdef DOUBLE
-    pn = svwhilelt_b64(js, n);
+    pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
     n_active = svcntp_b64(svptrue_b64(), pn);
   } while (svptest_any(svptrue_b64(), pn));
 #else
-    pn = svwhilelt_b32(js, N);
+    pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
     n_active = svcntp_b32(svptrue_b32(), pn);
   } while (svptest_any(svptrue_b32(), pn));
 #endif
diff --git a/kernel/arm64/zdot_thunderx2t99.c b/kernel/arm64/zdot_thunderx2t99.c
index 728f97fb3..6f65e5cfd 100644
--- a/kernel/arm64/zdot_thunderx2t99.c
+++ b/kernel/arm64/zdot_thunderx2t99.c
@@ -24,7 +24,12 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
-
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) /* major * 100 + minor, e.g. 23.9 -> 2309 */
+#if (NVCOMPVERS < 2309)
+#pragma opt 1
+#endif /* NVCOMPVERS < 2309 */
+#endif /* __NVCOMPILER */
 
 #include "common.h"
 
diff --git a/kernel/arm64/zgemm_kernel_sve_v1x4.S b/kernel/arm64/zgemm_kernel_sve_v1x4.S
index d5b35775c..a043948d6 100644
--- a/kernel/arm64/zgemm_kernel_sve_v1x4.S
+++ b/kernel/arm64/zgemm_kernel_sve_v1x4.S
@@ -239,8 +239,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     ld1rd  z15.d, p0/z,  [pB, 56]
 
     add pB, pB, 64
-
-	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE+64]
 .endm
 
 .macro KERNELv1x4_M1
@@ -276,9 +274,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     ld1rd  z15.d, p0/z,  [pB, 56]
 
     add pB, pB, 64
-	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE]
-
-	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE+64]
 .endm
 
 .macro KERNELv1x4_M2
@@ -313,11 +308,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	OP_ri	z23.d, p1/m, z2.d, z15.d
     ld1rd  z15.d, p0/z,  [pB, 56]
 
-	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE]
-
     add pB, pB, 64
-
-	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE+64]
 .endm
 
 .macro KERNELv1x4_E
@@ -340,11 +331,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	OP_ir	z23.d, p1/m, z3.d, z14.d
 	OP_ii	z22.d, p1/m, z3.d, z15.d
 	OP_ri	z23.d, p1/m, z2.d, z15.d
-
-	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE]
-
-	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE+64]
-
 .endm
 
 .macro KERNELv1x4_SUB
@@ -382,14 +368,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	OP_ir	z23.d, p1/m, z1.d, z14.d
 	OP_ii	z22.d, p1/m, z1.d, z15.d
 	OP_ri	z23.d, p1/m, z0.d, z15.d
-
-	prfm	PLDL1KEEP, [pB, #B_PRE_SIZE]
-	prfm	PLDL1KEEP, [pA, #A_PRE_SIZE]
 .endm
 
 .macro SAVEv1x4
-	prfm	PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
-
 	ld2d	{z24.d, z25.d}, p1/z, [pCRow0]
 	fmla	z24.d, p1/m, z16.d, alphaz_R
 	fmls	z24.d, p1/m, z17.d, alphaz_I
@@ -407,7 +388,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	st2d 	{z26.d, z27.d}, p1, [pCRow1]
 
 	add	pCRow1, pCRow1, lanes, lsl #4
-	prfm	PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
 
 	ld2d	{z28.d, z29.d}, p1/z, [pCRow2]
 	fmla	z28.d, p1/m, z20.d, alphaz_R
@@ -425,12 +405,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	fmla	z31.d, p1/m, z23.d, alphaz_R
 	st2d 	{z30.d, z31.d}, p1, [pCRow3]
 
-	prfm	PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
-
 	add	pCRow3, pCRow3, lanes, lsl #4	// pC = pC + lanes  * 2 *8
 
-	prfm	PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
-
 .endm
 
 /******************************************************************************/
@@ -466,8 +442,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVEv1x2
-	prfm	PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
-
 	ld2d	{z24.d, z25.d}, p1/z, [pCRow0]
 	fmla	z24.d, p1/m, z16.d, alphaz_R
 	fmls	z24.d, p1/m, z17.d, alphaz_I
@@ -485,10 +459,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	st2d 	{z26.d, z27.d}, p1, [pCRow1]
 
 	add	pCRow1, pCRow1, lanes, lsl #4
-	prfm	PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
-
-	prfm	PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
-
 .endm
 
 /******************************************************************************/
@@ -516,8 +486,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .endm
 
 .macro SAVEv1x1
-	prfm	PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
-
 	ld2d	{z24.d, z25.d}, p1/z, [pCRow0]
 	fmla	z24.d, p1/m, z16.d, alphaz_R
 	fmls	z24.d, p1/m, z17.d, alphaz_I
@@ -527,8 +495,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	add	pCRow0, pCRow0, lanes, lsl #4	// pC = pC + lanes  * 2 *8
 
-	prfm	PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
-
 .endm
 
 /******************************************************************************/
@@ -553,9 +519,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stp	x26, x27, [sp, #(9 * 16)]
 	str	x28, [sp, #(10 * 16)]
 
-	prfm	PLDL1KEEP, [origPB]
-	prfm	PLDL1KEEP, [origPA]
-
 	fmov	alphaR, d0
 	dup	    alphaz_R, alphaR
 	fmov	alphaI, d1
@@ -676,10 +639,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	bne	.Lzgemm_kernel_L4_Mv1_46
 
 .Lzgemm_kernel_L4_Mv1_100:
-	prfm	PLDL1KEEP, [pA]
-	prfm	PLDL1KEEP, [pA, #64]
-	prfm	PLDL1KEEP, [origPB]
-
 	SAVEv1x4
 
 .Lzgemm_kernel_L4_Mv1_END:
diff --git a/kernel/arm64/zgemm_ncopy_sve_v1.c b/kernel/arm64/zgemm_ncopy_sve_v1.c
index 8f9b4268a..6b8c93baf 100644
--- a/kernel/arm64/zgemm_ncopy_sve_v1.c
+++ b/kernel/arm64/zgemm_ncopy_sve_v1.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
     boffset = b;
 
     j = 0;
-    svbool_t pg = svwhilelt_b64(j, n);
+    svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
     uint64_t active = svcntp_b64(svptrue_b64(), pg);
     do {
 
@@ -69,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
         aoffset += active * lda * 2;
 
         j += svcntd();
-        pg = svwhilelt_b64(j, n);
+        pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
         active = svcntp_b64(svptrue_b64(), pg);
 
 
diff --git a/kernel/arm64/zgemm_tcopy_sve_v1.c b/kernel/arm64/zgemm_tcopy_sve_v1.c
index c6e50bc1c..fd8d2190f 100644
--- a/kernel/arm64/zgemm_tcopy_sve_v1.c
+++ b/kernel/arm64/zgemm_tcopy_sve_v1.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -50,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
     boffset = b;
 
     j = 0;
-    svbool_t pg = svwhilelt_b64(j, n);
+    svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
     uint64_t active = svcntp_b64(svptrue_b64(), pg);
     do {
 
@@ -66,7 +67,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
         aoffset += active * 2;
 
         j += svcntd();
-        pg = svwhilelt_b64(j, n);
+        pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
         active = svcntp_b64(svptrue_b64(), pg);
 
     } while (svptest_any(svptrue_b64(), pg));
diff --git a/kernel/arm64/zhemm_ltcopy_sve.c b/kernel/arm64/zhemm_ltcopy_sve.c
index 37dbfe4e1..fcf7e7073 100644
--- a/kernel/arm64/zhemm_ltcopy_sve.c
+++ b/kernel/arm64/zhemm_ltcopy_sve.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -54,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
   svint64_t one_vec = svdup_s64(1LL);
 
   int64_t j = 0;
-  svbool_t pg = svwhilelt_b64(j, n);
+  svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
   int64_t active = svcntp_b64(svptrue_b64(), pg);
   svint64_t index_neg = svindex_s64(0LL, -1LL);
   svint64_t index = svindex_s64(0LL, 1LL);
@@ -79,7 +80,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
         gat_ind = svadd_m(cmp, gat_ind, lda_vec);
         gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2);
         if (offset <= 0) {
-            svbool_t off_g = svwhilelt_b64(offset, 0LL);
+            svbool_t off_g = svwhilelt_b64((int64_t)offset, (int64_t)0LL);
             data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
         }
 
@@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     posX += sve_size;
     posX_vec = svdup_s64(posX);
     j += sve_size;
-    pg = svwhilelt_b64(j, n);
+    pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
     active = svcntp_b64(svptrue_b64(), pg);
   } while (svptest_any(svptrue_b64(), pg));
 
@@ -117,7 +118,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 
   int32_t j = 0;
   int32_t N = n;
-  svbool_t pg = svwhilelt_b32(j, N);
+  svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
   int32_t active = svcntp_b32(svptrue_b32(), pg);
   svint32_t index_neg = svindex_s32(0, -1);
   svint32_t index = svindex_s32(0, 1);
@@ -142,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
         gat_ind = svadd_m(cmp, gat_ind, lda_vec);
         gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2);
         if (offset <= 0) {
-            svbool_t off_g = svwhilelt_b32(offset, 0);
+            svbool_t off_g = svwhilelt_b32((int32_t)offset, (int32_t)0);
             data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
         }
 
@@ -162,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     posX += sve_size;
     posX_vec = svdup_s32(posX);
     j += sve_size;
-    pg = svwhilelt_b32(j, N);
+    pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
     active = svcntp_b32(svptrue_b32(), pg);
   } while (svptest_any(svptrue_b32(), pg));
 
diff --git a/kernel/arm64/zhemm_utcopy_sve.c b/kernel/arm64/zhemm_utcopy_sve.c
index 21e03b7be..056c9824e 100644
--- a/kernel/arm64/zhemm_utcopy_sve.c
+++ b/kernel/arm64/zhemm_utcopy_sve.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -54,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
   svint64_t one_vec = svdup_s64(1LL);
 
   int64_t j = 0;
-  svbool_t pg = svwhilelt_b64(j, n);
+  svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
   int64_t active = svcntp_b64(svptrue_b64(), pg);
   svint64_t index_neg = svindex_s64(0LL, -1LL);
   svint64_t index = svindex_s64(0LL, 1LL);
@@ -80,7 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
         gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec);
         data_vec_imag = svneg_z(pg, data_vec_imag);
         if (offset <= 0) {
-            svbool_t off_g = svwhilelt_b64(offset, 0LL);
+            svbool_t off_g = svwhilelt_b64((int64_t)offset, (int64_t)0LL);
             data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
         }
 
@@ -100,7 +101,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     posX += sve_size;
     posX_vec = svdup_s64(posX);
     j += sve_size;
-    pg = svwhilelt_b64(j, n);
+    pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
     active = svcntp_b64(svptrue_b64(), pg);
   } while (svptest_any(svptrue_b64(), pg));
 #else
@@ -116,7 +117,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 
   int32_t j = 0;
   int32_t N = n;
-  svbool_t pg = svwhilelt_b32(j, N);
+  svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
   int32_t active = svcntp_b32(svptrue_b32(), pg);
   svint32_t index_neg = svindex_s32(0, -1);
   svint32_t index = svindex_s32(0, 1);
@@ -142,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
         gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec);
         data_vec_imag = svneg_z(pg, data_vec_imag);
         if (offset <= 0) {
-            svbool_t off_g = svwhilelt_b32(offset, 0);
+            svbool_t off_g = svwhilelt_b32((int32_t)offset, (int32_t)0);
             data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
         }
 
@@ -162,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     posX += sve_size;
     posX_vec = svdup_s32(posX);
     j += sve_size;
-    pg = svwhilelt_b32(j, N);
+    pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
     active = svcntp_b32(svptrue_b32(), pg);
   } while (svptest_any(svptrue_b32(), pg));
 
diff --git a/kernel/arm64/zscal.S b/kernel/arm64/zscal.S
index 929455975..4bd43320d 100644
--- a/kernel/arm64/zscal.S
+++ b/kernel/arm64/zscal.S
@@ -223,7 +223,7 @@ zscal_begin:
 	fcmp	DA_I, #0.0
 	beq	.Lzscal_kernel_RI_zero
 
-	b	.Lzscal_kernel_R_zero
+//	b	.Lzscal_kernel_R_zero
 
 .Lzscal_kernel_R_non_zero:
 
diff --git a/kernel/arm64/zsum_thunderx2t99.c b/kernel/arm64/zsum_thunderx2t99.c
new file mode 100644
index 000000000..087dae2fe
--- /dev/null
+++ b/kernel/arm64/zsum_thunderx2t99.c
@@ -0,0 +1,244 @@
+/***************************************************************************
+Copyright (c) 2017, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#include <arm_neon.h>
+
+#define	N	"x0"	/* element count (copied from the n argument) */
+#define	X	"x1"	/* "X" vector address, advanced as elements are consumed */
+#define	INC_X	"x2"	/* "X" stride; INIT_S shifts it left by 4 before the strided loops */
+#define J	"x5"	/* loop counter */
+
+#define REG0	"xzr"	/* zero register, used to clear the FP accumulators */
+#define SUMF	"d0"	/* scalar running sum; moved to the result at label 9 */
+#define TMPF	"d1"	/* not referenced below: the kernels spell d1 literally */
+
+/******************************************************************************/
+
+#define KERNEL_F1	/* unit stride: add both doubles of the 16-byte element at X to SUMF, then X += 16 */	\
+	"ldr	q1, ["X"]			\n"	\
+	"add	"X", "X", #16			\n"	\
+	"faddp	d1, v1.2d			\n"	\
+	"fadd	"SUMF", "SUMF", d1		\n"
+
+#define KERNEL_F16	/* unit stride: load 256 bytes into v16-v31, fold them into accumulators v0-v7, prefetch ahead */	\
+	"ldr	q16, ["X"]			\n"	\
+	"ldr	q17, ["X", #16]			\n"	\
+	"ldr	q18, ["X", #32]			\n"	\
+	"ldr	q19, ["X", #48]			\n"	\
+	"ldp	q20, q21, ["X", #64]		\n"	\
+	"ldp	q22, q23, ["X", #96]		\n"	\
+	"ldp	q24, q25, ["X", #128]		\n"	\
+	"ldp	q26, q27, ["X", #160]		\n"	\
+	"fadd	v16.2d, v16.2d, v17.2d		\n"	\
+	"fadd	v18.2d, v18.2d, v19.2d		\n"	\
+	"ldp	q28, q29, ["X", #192]		\n"	\
+	"ldp	q30, q31, ["X", #224]		\n"	\
+	"add	"X", "X", #256			\n"	\
+	"fadd	v20.2d, v20.2d, v21.2d		\n"	\
+	"fadd	v22.2d, v22.2d, v23.2d		\n"	\
+	"PRFM	PLDL1KEEP, ["X", #1024]		\n"	\
+	"PRFM	PLDL1KEEP, ["X", #1024+64]	\n"	\
+	"fadd	v24.2d, v24.2d, v25.2d		\n"	\
+	"fadd	v26.2d, v26.2d, v27.2d		\n"	\
+	"fadd	v28.2d, v28.2d, v29.2d		\n"	\
+	"fadd	v30.2d, v30.2d, v31.2d		\n"	\
+	"fadd	v0.2d, v0.2d, v16.2d		\n"	\
+	"fadd	v1.2d, v1.2d, v18.2d		\n"	\
+	"fadd	v2.2d, v2.2d, v20.2d		\n"	\
+	"fadd	v3.2d, v3.2d, v22.2d		\n"	\
+	"PRFM	PLDL1KEEP, ["X", #1024+128]	\n"	\
+	"PRFM	PLDL1KEEP, ["X", #1024+192]	\n"	\
+	"fadd	v4.2d, v4.2d, v24.2d		\n"	\
+	"fadd	v5.2d, v5.2d, v26.2d		\n"	\
+	"fadd	v6.2d, v6.2d, v28.2d		\n"	\
+	"fadd	v7.2d, v7.2d, v30.2d		\n"
+
+#define KERNEL_F16_FINALIZE	/* reduce accumulators v0-v7 into v0, then add its two lanes into SUMF */	\
+	"fadd	v0.2d, v0.2d, v1.2d		\n"	\
+	"fadd	v2.2d, v2.2d, v3.2d		\n"	\
+	"fadd	v4.2d, v4.2d, v5.2d		\n"	\
+	"fadd	v6.2d, v6.2d, v7.2d		\n"	\
+	"fadd	v0.2d, v0.2d, v2.2d		\n"	\
+	"fadd	v4.2d, v4.2d, v6.2d		\n"	\
+	"fadd	v0.2d, v0.2d, v4.2d		\n"	\
+	"faddp	"SUMF", v0.2d			\n"
+
+#define INIT_S	/* multiply INC_X by 16 so KERNEL_S1 can add it straight to the X address */	\
+	"lsl	"INC_X", "INC_X", #4		\n"
+
+#define KERNEL_S1	/* strided: add both doubles of the 16-byte element at X to SUMF, then X += INC_X */	\
+	"ldr	q1, ["X"]			\n"	\
+	"add	"X", "X", "INC_X"		\n"	\
+	"faddp	d1, v1.2d			\n"	\
+	"fadd	"SUMF", "SUMF", d1		\n"
+
+
+#if defined(SMP)	/* prototype is declared only when SMP is defined */
+extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
+	BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
+	void *c, BLASLONG ldc, int (*function)(), int nthreads);
+#endif	/* SMP */
+
+
+static FLOAT zasum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	FLOAT  asum = 0.0 ;
+	/* Sums (no fabs) both doubles of each of the n 16-byte elements of x, stepping inc_x elements. */
+	if ( n < 0 )  return(asum);
+	/* The asm also returns 0 for n <= 0 or inc_x <= 0: both branch to label 9 with SUMF cleared. */
+	__asm__ __volatile__ (
+	"	mov	"N", %[N_]			\n"
+	"	mov	"X", %[X_]			\n"
+	"	mov	"INC_X", %[INCX_]		\n"
+	"	fmov	"SUMF", "REG0"			\n"
+	"	fmov	d1, "REG0"			\n"
+	"	fmov	d2, "REG0"			\n"
+	"	fmov	d3, "REG0"			\n"
+	"	fmov	d4, "REG0"			\n"
+	"	fmov	d5, "REG0"			\n"
+	"	fmov	d6, "REG0"			\n"
+	"	fmov	d7, "REG0"			\n"
+	"	cmp	"N", xzr			\n"
+	"	ble	9f //asum_kernel_L999		\n"
+	"	cmp	"INC_X", xzr			\n"
+	"	ble	9f //asum_kernel_L999		\n"
+	"	cmp	"INC_X", #1			\n"
+	"	bne	5f //asum_kernel_S_BEGIN	\n"
+	/* Unit-stride path (INC_X == 1): blocks of 16 elements, then the N & 15 remainder. */
+	"1: //asum_kernel_F_BEGIN:			\n"
+	"	asr	"J", "N", #4			\n"
+	"	cmp	"J", xzr			\n"
+	"	beq	3f //asum_kernel_F1		\n"
+
+	".align 5					\n"
+	"2: //asum_kernel_F16:				\n"
+	"	"KERNEL_F16"				\n"
+	"	subs	"J", "J", #1			\n"
+	"	bne	2b //asum_kernel_F16		\n"
+	"	"KERNEL_F16_FINALIZE"			\n"
+
+	"3: //asum_kernel_F1:				\n"
+	"	ands	"J", "N", #15			\n"
+	"	ble	9f //asum_kernel_L999		\n"
+
+	"4: //asum_kernel_F10:				\n"
+	"	"KERNEL_F1"				\n"
+	"	subs    "J", "J", #1			\n"
+	"	bne	4b //asum_kernel_F10		\n"
+	"	b	9f //asum_kernel_L999		\n"
+	/* Strided path: groups of 4 elements, then the N & 3 remainder. */
+	"5: //asum_kernel_S_BEGIN:			\n"
+	"	"INIT_S"				\n"
+	"	asr	"J", "N", #2			\n"
+	"	cmp	"J", xzr			\n"
+	"	ble	7f //asum_kernel_S1		\n"
+
+	"6: //asum_kernel_S4:				\n"
+	"	"KERNEL_S1"				\n"
+	"	"KERNEL_S1"				\n"
+	"	"KERNEL_S1"				\n"
+	"	"KERNEL_S1"				\n"
+	"	subs	"J", "J", #1			\n"
+	"	bne	6b //asum_kernel_S4		\n"
+
+	"7: //asum_kernel_S1:				\n"
+	"	ands	"J", "N", #3			\n"
+	"	ble	9f //asum_kernel_L999		\n"
+
+	"8: //asum_kernel_S10:				\n"
+	"	"KERNEL_S1"				\n"
+	"	subs	"J", "J", #1			\n"
+	"	bne	8b //asum_kernel_S10		\n"
+
+	"9: //asum_kernel_L999:				\n"
+	"	fmov	%[ASUM_], "SUMF"		\n"
+	: [ASUM_] "=r" (asum)		//%0
+	: [N_]    "r"  (n),		//%1
+	  [X_]    "r"  (x),		//%2
+	  [INCX_] "r"  (inc_x)		//%3
+	: "cc", "memory",
+	  "x0", "x1", "x2", "x3", "x4", "x5",
+	  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
+	  "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",	/* KERNEL_F16 overwrites */
+	  "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"	/* v16-v31 as scratch    */
+	);
+
+	return asum;
+}
+
+#if defined(SMP)
+static int zasum_thread_function(BLASLONG n, BLASLONG dummy0,
+	BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
+	BLASLONG inc_y, FLOAT *result, BLASLONG dummy3)
+{
+	/* Stores zasum_compute(n, x, inc_x) in *result; dummy0-3, y and inc_y are ignored. Always returns 0. */
+	*result = zasum_compute(n, x, inc_x);
+	return 0;
+}
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	/* Entry point. Delegates to zasum_compute; in SMP builds the work goes
+	 * through blas_level1_thread_with_return_value when inc_x != 0, n > 10000
+	 * and num_cpu_avail(1) reports a thread count other than 1. */
+	FLOAT total = 0.0;
+
+#if defined(SMP)
+	FLOAT dummy_alpha;	/* never initialised; only its address is handed on */
+	/* One thread for inc_x == 0 or n <= 10000; otherwise ask num_cpu_avail. */
+	int nthreads = (inc_x == 0 || n <= 10000) ? 1 : num_cpu_avail(1);
+
+	if (nthreads != 1) {
+		/* Per-thread result slots, each sizeof(double) * 2 bytes wide. */
+		char result[MAX_CPU_NUMBER * sizeof(double) * 2];
+		const int mode = BLAS_DOUBLE | BLAS_COMPLEX;
+		char *slot = result;
+		int i;
+
+		blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
+				   x, inc_x, NULL, 0, result, 0,
+				   ( void *)zasum_thread_function, nthreads);
+
+		/* Add up the leading FLOAT of each of the first nthreads slots. */
+		for (i = 0; i < nthreads; i++) {
+			total = total + *(FLOAT *)slot;
+			slot += sizeof(double) * 2;
+		}
+	} else {
+		/* Single thread: one direct call with the caller's arguments. */
+		total = zasum_compute(n, x, inc_x);
+	}
+#else
+	/* Non-SMP build: always a single direct call. */
+	total = zasum_compute(n, x, inc_x);
+#endif
+
+	return total;
+}
diff --git a/kernel/arm64/zsymm_lcopy_sve.c b/kernel/arm64/zsymm_lcopy_sve.c
index 6f18aa956..5a17d3b19 100644
--- a/kernel/arm64/zsymm_lcopy_sve.c
+++ b/kernel/arm64/zsymm_lcopy_sve.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -53,7 +54,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
   svint64_t one_vec = svdup_s64(1LL);
 
   int64_t j = 0;
-  svbool_t pg = svwhilelt_b64(j, n);
+  svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
   int64_t active = svcntp_b64(svptrue_b64(), pg);
   svint64_t index_neg = svindex_s64(0LL, -1LL);
   svint64_t index = svindex_s64(0LL, 1LL);
@@ -90,7 +91,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     posX += sve_size;
     posX_vec = svdup_s64(posX);
     j += sve_size;
-    pg = svwhilelt_b64(j, n);
+    pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
     active = svcntp_b64(svptrue_b64(), pg);
   } while (svptest_any(svptrue_b64(), pg));
 
@@ -103,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 
   int32_t N = n;
   int32_t j = 0;
-  svbool_t pg = svwhilelt_b32(j, N);
+  svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
   int32_t active = svcntp_b32(svptrue_b32(), pg);
   svint32_t index_neg = svindex_s32(0, -1);
   svint32_t index = svindex_s32(0, 1);
@@ -140,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     posX += sve_size;
     posX_vec = svdup_s32(posX);
     j += sve_size;
-    pg = svwhilelt_b32(j, N);
+    pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
     active = svcntp_b32(svptrue_b32(), pg);
   } while (svptest_any(svptrue_b32(), pg));
 
diff --git a/kernel/arm64/zsymm_ucopy_sve.c b/kernel/arm64/zsymm_ucopy_sve.c
index 6be48cdaf..06989e3aa 100644
--- a/kernel/arm64/zsymm_ucopy_sve.c
+++ b/kernel/arm64/zsymm_ucopy_sve.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -53,7 +54,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
   svint64_t one_vec = svdup_s64(1LL);
 
   int64_t j = 0;
-  svbool_t pg = svwhilelt_b64(j, n);
+  svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
   int64_t active = svcntp_b64(svptrue_b64(), pg);
   svint64_t index_neg = svindex_s64(0LL, -1LL);
   svint64_t index = svindex_s64(0LL, 1LL);
@@ -90,7 +91,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     posX += sve_size;
     posX_vec = svdup_s64(posX);
     j += sve_size;
-    pg = svwhilelt_b64(j, n);
+    pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
     active = svcntp_b64(svptrue_b64(), pg);
   } while (svptest_any(svptrue_b64(), pg));
 
@@ -103,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 
   int32_t N = n;
   int32_t j = 0;
-  svbool_t pg = svwhilelt_b32(j, N);
+  svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
   int32_t active = svcntp_b32(svptrue_b32(), pg);
   svint32_t index_neg = svindex_s32(0, -1);
   svint32_t index = svindex_s32(0, 1);
@@ -140,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     posX += sve_size;
     posX_vec = svdup_s32(posX);
     j += sve_size;
-    pg = svwhilelt_b32(j, N);
+    pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
     active = svcntp_b32(svptrue_b32(), pg);
   } while (svptest_any(svptrue_b32(), pg));
 
diff --git a/kernel/arm64/ztrmm_lncopy_sve_v1.c b/kernel/arm64/ztrmm_lncopy_sve_v1.c
index d34f607ab..5a7171d9d 100644
--- a/kernel/arm64/ztrmm_lncopy_sve_v1.c
+++ b/kernel/arm64/ztrmm_lncopy_sve_v1.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -54,11 +55,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     FLOAT *ao;
 #ifdef DOUBLE
     svint64_t index = svindex_s64(0LL, lda);
-    svbool_t pn = svwhilelt_b64(js, n);
+    svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
     int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
     svint32_t index = svindex_s32(0, lda);
-    svbool_t pn = svwhilelt_b32(js, n);
+    svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
     int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
     do
@@ -132,11 +133,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
         posY += n_active;
         js += n_active;
 #ifdef DOUBLE
-        pn = svwhilelt_b64(js, n);
+        pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
         n_active = svcntp_b64(svptrue_b64(), pn);
     } while (svptest_any(svptrue_b64(), pn));
 #else
-        pn = svwhilelt_b32(js, n);
+        pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
         n_active = svcntp_b32(svptrue_b32(), pn);
     } while (svptest_any(svptrue_b32(), pn));
 #endif
diff --git a/kernel/arm64/ztrmm_ltcopy_sve_v1.c b/kernel/arm64/ztrmm_ltcopy_sve_v1.c
index 7f34c9857..3a88f26b2 100644
--- a/kernel/arm64/ztrmm_ltcopy_sve_v1.c
+++ b/kernel/arm64/ztrmm_ltcopy_sve_v1.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -53,10 +54,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     FLOAT *ao;
     js = 0;
 #ifdef DOUBLE
-    svbool_t pn = svwhilelt_b64(js, n);
+    svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
     int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
-    svbool_t pn = svwhilelt_b32(js, n);
+    svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
     int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
     do
@@ -129,11 +130,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
         posY += n_active;
         js += n_active;
 #ifdef DOUBLE
-        pn = svwhilelt_b64(js, n);
+        pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
         n_active = svcntp_b64(svptrue_b64(), pn);
     } while (svptest_any(svptrue_b64(), pn));
 #else
-        pn = svwhilelt_b32(js, n);
+        pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
         n_active = svcntp_b32(svptrue_b32(), pn);
     } while (svptest_any(svptrue_b32(), pn));
 #endif
diff --git a/kernel/arm64/ztrmm_uncopy_sve_v1.c b/kernel/arm64/ztrmm_uncopy_sve_v1.c
index 7eb9452c9..c3dbdcbe3 100644
--- a/kernel/arm64/ztrmm_uncopy_sve_v1.c
+++ b/kernel/arm64/ztrmm_uncopy_sve_v1.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -54,11 +55,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     FLOAT *ao;
 #ifdef DOUBLE
     svint64_t index = svindex_s64(0LL, lda);
-    svbool_t pn = svwhilelt_b64(js, n);
+    svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
     int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
     svint32_t index = svindex_s32(0, lda);
-    svbool_t pn = svwhilelt_b32(js, n);
+    svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
     int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
     do
@@ -132,11 +133,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
         posY += n_active;
         js += n_active;
 #ifdef DOUBLE
-        pn = svwhilelt_b64(js, n);
+        pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
         n_active = svcntp_b64(svptrue_b64(), pn);
     } while (svptest_any(svptrue_b64(), pn));
 #else
-        pn = svwhilelt_b32(js, n);
+        pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
         n_active = svcntp_b32(svptrue_b32(), pn);
     } while (svptest_any(svptrue_b32(), pn));
 #endif
diff --git a/kernel/arm64/ztrmm_utcopy_sve_v1.c b/kernel/arm64/ztrmm_utcopy_sve_v1.c
index 60c8ff3b4..ddfa7ba4e 100644
--- a/kernel/arm64/ztrmm_utcopy_sve_v1.c
+++ b/kernel/arm64/ztrmm_utcopy_sve_v1.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -53,10 +54,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
     FLOAT *ao;
     js = 0;
 #ifdef DOUBLE
-    svbool_t pn = svwhilelt_b64(js, n);
+    svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
     int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
-    svbool_t pn = svwhilelt_b32(js, n);
+    svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
     int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
     do
@@ -128,11 +129,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
         posY += n_active;
         js += n_active;
 #ifdef DOUBLE
-        pn = svwhilelt_b64(js, n);
+        pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
         n_active = svcntp_b64(svptrue_b64(), pn);
     } while (svptest_any(svptrue_b64(), pn));
 #else
-        pn = svwhilelt_b32(js, n);
+        pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
         n_active = svcntp_b32(svptrue_b32(), pn);
     } while (svptest_any(svptrue_b32(), pn));
 #endif
diff --git a/kernel/arm64/ztrsm_lncopy_sve.c b/kernel/arm64/ztrsm_lncopy_sve.c
index eb7cd0294..f81ba14c2 100644
--- a/kernel/arm64/ztrsm_lncopy_sve.c
+++ b/kernel/arm64/ztrsm_lncopy_sve.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -52,13 +53,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
 #ifdef DOUBLE
   int64_t js = 0;
   svint64_t index = svindex_s64(0LL, lda);
-  svbool_t pn = svwhilelt_b64(js, n);
+  svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
   int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
   int32_t N = n;
   int32_t js = 0;
   svint32_t index = svindex_s32(0, lda);
-  svbool_t pn = svwhilelt_b32(js, N);
+  svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
   int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
   do {
@@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
 
     js += n_active;
 #ifdef DOUBLE
-    pn = svwhilelt_b64(js, n);
+    pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
     n_active = svcntp_b64(svptrue_b64(), pn);
   } while (svptest_any(svptrue_b64(), pn));
 #else
-    pn = svwhilelt_b32(js, N);
+    pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
     n_active = svcntp_b32(svptrue_b32(), pn);
   } while (svptest_any(svptrue_b32(), pn));
 #endif
diff --git a/kernel/arm64/ztrsm_ltcopy_sve.c b/kernel/arm64/ztrsm_ltcopy_sve.c
index 34dbf8a30..46a11abed 100644
--- a/kernel/arm64/ztrsm_ltcopy_sve.c
+++ b/kernel/arm64/ztrsm_ltcopy_sve.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -51,12 +52,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
   jj = offset;
 #ifdef DOUBLE
   int64_t js = 0;
-  svbool_t pn = svwhilelt_b64(js, n);
+  svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
   int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
   int32_t N = n;
   int32_t js = 0;
-  svbool_t pn = svwhilelt_b32(js, N);
+  svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
   int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
   do {
@@ -102,11 +103,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
 
     js += n_active;
 #ifdef DOUBLE
-    pn = svwhilelt_b64(js, n);
+    pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
     n_active = svcntp_b64(svptrue_b64(), pn);
   } while (svptest_any(svptrue_b64(), pn));
 #else
-    pn = svwhilelt_b32(js, N);
+    pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
     n_active = svcntp_b32(svptrue_b32(), pn);
   } while (svptest_any(svptrue_b32(), pn));
 #endif
diff --git a/kernel/arm64/ztrsm_uncopy_sve.c b/kernel/arm64/ztrsm_uncopy_sve.c
index 92e086b75..436112130 100644
--- a/kernel/arm64/ztrsm_uncopy_sve.c
+++ b/kernel/arm64/ztrsm_uncopy_sve.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -52,13 +53,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
 #ifdef DOUBLE
   int64_t js = 0;
   svint64_t index = svindex_s64(0LL, lda);
-  svbool_t pn = svwhilelt_b64(js, n);
+  svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
   int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
   int32_t N = n;
   int32_t js = 0;
   svint32_t index = svindex_s32(0, lda);
-  svbool_t pn = svwhilelt_b32(js, N);
+  svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
   int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
   do {
@@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
 
     js += n_active;
 #ifdef DOUBLE
-    pn = svwhilelt_b64(js, n);
+    pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
     n_active = svcntp_b64(svptrue_b64(), pn);
   } while (svptest_any(svptrue_b64(), pn));
 #else
-    pn = svwhilelt_b32(js, N);
+    pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
     n_active = svcntp_b32(svptrue_b32(), pn);
   } while (svptest_any(svptrue_b32(), pn));
 #endif
diff --git a/kernel/arm64/ztrsm_utcopy_sve.c b/kernel/arm64/ztrsm_utcopy_sve.c
index ccb942e1b..ddf3e265f 100644
--- a/kernel/arm64/ztrsm_utcopy_sve.c
+++ b/kernel/arm64/ztrsm_utcopy_sve.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project                               */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -51,12 +52,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
   jj = offset;
 #ifdef DOUBLE
   int64_t js = 0;
-  svbool_t pn = svwhilelt_b64(js, n);
+  svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
   int n_active = svcntp_b64(svptrue_b64(), pn);
 #else
   int32_t N = n;
   int32_t js = 0;
-  svbool_t pn = svwhilelt_b32(js, N);
+  svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
   int n_active = svcntp_b32(svptrue_b32(), pn);
 #endif
   do {
@@ -102,11 +103,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
 
     js += n_active;
 #ifdef DOUBLE
-    pn = svwhilelt_b64(js, n);
+    pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
     n_active = svcntp_b64(svptrue_b64(), pn);
   } while (svptest_any(svptrue_b64(), pn));
 #else
-    pn = svwhilelt_b32(js, N);
+    pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
     n_active = svcntp_b32(svptrue_b32(), pn);
   } while (svptest_any(svptrue_b32(), pn));
 #endif
diff --git a/kernel/csky/KERNEL b/kernel/csky/KERNEL
new file mode 100644
index 000000000..afa8a0881
--- /dev/null
+++ b/kernel/csky/KERNEL
@@ -0,0 +1,149 @@
+# Kernel sources for csky: one C file per routine; each S*/D* pair and each C*/Z* pair shares a file.
+SAMAXKERNEL  = ../arm/amax.c
+DAMAXKERNEL  = ../arm/amax.c
+CAMAXKERNEL  = ../arm/zamax.c
+ZAMAXKERNEL  = ../arm/zamax.c
+
+SAMINKERNEL  = ../arm/amin.c
+DAMINKERNEL  = ../arm/amin.c
+CAMINKERNEL  = ../arm/zamin.c
+ZAMINKERNEL  = ../arm/zamin.c
+
+SMAXKERNEL   = ../arm/max.c
+DMAXKERNEL   = ../arm/max.c
+
+SMINKERNEL   = ../arm/min.c
+DMINKERNEL   = ../arm/min.c
+
+ISAMAXKERNEL = ../arm/iamax.c
+IDAMAXKERNEL = ../arm/iamax.c
+ICAMAXKERNEL = ../arm/izamax.c
+IZAMAXKERNEL = ../arm/izamax.c
+
+ISAMINKERNEL = ../arm/iamin.c
+IDAMINKERNEL = ../arm/iamin.c
+ICAMINKERNEL = ../arm/izamin.c
+IZAMINKERNEL = ../arm/izamin.c
+
+ISMAXKERNEL  = ../arm/imax.c
+IDMAXKERNEL  = ../arm/imax.c
+
+ISMINKERNEL  = ../arm/imin.c
+IDMINKERNEL  = ../arm/imin.c
+
+SASUMKERNEL  = ../arm/asum.c
+DASUMKERNEL  = ../arm/asum.c
+CASUMKERNEL  = ../arm/zasum.c
+ZASUMKERNEL  = ../arm/zasum.c
+
+SSUMKERNEL  = ../arm/sum.c
+DSUMKERNEL  = ../arm/sum.c
+CSUMKERNEL  = ../arm/zsum.c
+ZSUMKERNEL  = ../arm/zsum.c
+
+SAXPYKERNEL  = ../arm/axpy.c
+DAXPYKERNEL  = ../arm/axpy.c
+CAXPYKERNEL  = ../arm/zaxpy.c
+ZAXPYKERNEL  = ../arm/zaxpy.c
+
+SCOPYKERNEL  = ../arm/copy.c
+DCOPYKERNEL  = ../arm/copy.c
+CCOPYKERNEL  = ../arm/zcopy.c
+ZCOPYKERNEL  = ../arm/zcopy.c
+# DSDOTKERNEL is the one dot entry taken from ../generic rather than ../arm.
+SDOTKERNEL   = ../arm/dot.c
+DDOTKERNEL   = ../arm/dot.c
+CDOTKERNEL   = ../arm/zdot.c
+ZDOTKERNEL   = ../arm/zdot.c
+DSDOTKERNEL  = ../generic/dot.c
+
+SNRM2KERNEL  = ../arm/nrm2.c
+DNRM2KERNEL  = ../arm/nrm2.c
+CNRM2KERNEL  = ../arm/znrm2.c
+ZNRM2KERNEL  = ../arm/znrm2.c
+
+SROTKERNEL   = ../arm/rot.c
+DROTKERNEL   = ../arm/rot.c
+CROTKERNEL   = ../arm/zrot.c
+ZROTKERNEL   = ../arm/zrot.c
+
+SSCALKERNEL  = ../arm/scal.c
+DSCALKERNEL  = ../arm/scal.c
+CSCALKERNEL  = ../arm/zscal.c
+ZSCALKERNEL  = ../arm/zscal.c
+
+SSWAPKERNEL  = ../arm/swap.c
+DSWAPKERNEL  = ../arm/swap.c
+CSWAPKERNEL  = ../arm/zswap.c
+ZSWAPKERNEL  = ../arm/zswap.c
+
+SGEMVNKERNEL = ../arm/gemv_n.c
+DGEMVNKERNEL = ../arm/gemv_n.c
+CGEMVNKERNEL = ../arm/zgemv_n.c
+ZGEMVNKERNEL = ../arm/zgemv_n.c
+
+SGEMVTKERNEL = ../arm/gemv_t.c
+DGEMVTKERNEL = ../arm/gemv_t.c
+CGEMVTKERNEL = ../arm/zgemv_t.c
+ZGEMVTKERNEL = ../arm/zgemv_t.c
+# Everything up to here comes from ../arm (DSDOT aside); the remaining sources come from ../generic.
+STRMMKERNEL	= ../generic/trmmkernel_2x2.c
+DTRMMKERNEL	= ../generic/trmmkernel_2x2.c
+CTRMMKERNEL	= ../generic/ztrmmkernel_2x2.c
+ZTRMMKERNEL	= ../generic/ztrmmkernel_2x2.c
+# GEMM: *_2x2.c kernels plus *_ncopy_2.c/*_tcopy_2.c copy sources; $(TSUFFIX)/$(SUFFIX) are not defined in this file.
+SGEMMKERNEL    =  ../generic/gemmkernel_2x2.c
+SGEMMONCOPY    =  ../generic/gemm_ncopy_2.c
+SGEMMOTCOPY    =  ../generic/gemm_tcopy_2.c
+SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+DGEMMKERNEL    =  ../generic/gemmkernel_2x2.c
+DGEMMONCOPY    = ../generic/gemm_ncopy_2.c
+DGEMMOTCOPY    = ../generic/gemm_tcopy_2.c
+DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+CGEMMKERNEL    = ../generic/zgemmkernel_2x2.c
+CGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
+CGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
+CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+ZGEMMKERNEL    = ../generic/zgemmkernel_2x2.c
+ZGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
+ZGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
+ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
+# TRSM: all four prefixes (S, D, C, Z) point at the same ../generic/trsm_kernel_{LN,LT,RN,RT}.c files.
+STRSMKERNEL_LN	=  ../generic/trsm_kernel_LN.c
+STRSMKERNEL_LT	=  ../generic/trsm_kernel_LT.c
+STRSMKERNEL_RN	=  ../generic/trsm_kernel_RN.c
+STRSMKERNEL_RT	=  ../generic/trsm_kernel_RT.c
+
+DTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
+
+CTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
+
+ZTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
+# cabs/lsame helper sources, then the GEMM beta sources; all from ../generic.
+SCABS_KERNEL	= ../generic/cabs.c
+DCABS_KERNEL	= ../generic/cabs.c
+QCABS_KERNEL	= ../generic/cabs.c
+LSAME_KERNEL	= ../generic/lsame.c
+
+SGEMM_BETA = ../generic/gemm_beta.c
+DGEMM_BETA = ../generic/gemm_beta.c
+CGEMM_BETA = ../generic/zgemm_beta.c
+ZGEMM_BETA = ../generic/zgemm_beta.c
+
+
diff --git a/kernel/csky/Makefile b/kernel/csky/Makefile
new file mode 100644
index 000000000..520349bd6
--- /dev/null
+++ b/kernel/csky/Makefile
@@ -0,0 +1 @@
+clean ::
diff --git a/kernel/generic/trmmkernel_16x8.c b/kernel/generic/trmmkernel_16x8.c
new file mode 100644
index 000000000..5412eab70
--- /dev/null
+++ b/kernel/generic/trmmkernel_16x8.c
@@ -0,0 +1,3676 @@
+#include "common.h"
+
+int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
+{
+   BLASLONG i,j,k;
+   FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7,*ptrba,*ptrbb;
+
+   FLOAT res0_0;
+   FLOAT res0_1;
+   FLOAT res0_2;
+   FLOAT res0_3;
+   FLOAT res0_4;
+   FLOAT res0_5;
+   FLOAT res0_6;
+   FLOAT res0_7;
+
+   FLOAT res0_8;
+   FLOAT res0_9;
+   FLOAT res0_10;
+   FLOAT res0_11;
+   FLOAT res0_12;
+   FLOAT res0_13;
+   FLOAT res0_14;
+   FLOAT res0_15;
+
+   FLOAT res1_0;
+   FLOAT res1_1;
+   FLOAT res1_2;
+   FLOAT res1_3;
+   FLOAT res1_4;
+   FLOAT res1_5;
+   FLOAT res1_6;
+   FLOAT res1_7;
+
+   FLOAT res1_8;
+   FLOAT res1_9;
+   FLOAT res1_10;
+   FLOAT res1_11;
+   FLOAT res1_12;
+   FLOAT res1_13;
+   FLOAT res1_14;
+   FLOAT res1_15;
+
+   FLOAT res2_0;
+   FLOAT res2_1;
+   FLOAT res2_2;
+   FLOAT res2_3;
+   FLOAT res2_4;
+   FLOAT res2_5;
+   FLOAT res2_6;
+   FLOAT res2_7;
+
+   FLOAT res2_8;
+   FLOAT res2_9;
+   FLOAT res2_10;
+   FLOAT res2_11;
+   FLOAT res2_12;
+   FLOAT res2_13;
+   FLOAT res2_14;
+   FLOAT res2_15;
+
+   FLOAT res3_0;
+   FLOAT res3_1;
+   FLOAT res3_2;
+   FLOAT res3_3;
+   FLOAT res3_4;
+   FLOAT res3_5;
+   FLOAT res3_6;
+   FLOAT res3_7;
+
+   FLOAT res3_8;
+   FLOAT res3_9;
+   FLOAT res3_10;
+   FLOAT res3_11;
+   FLOAT res3_12;
+   FLOAT res3_13;
+   FLOAT res3_14;
+   FLOAT res3_15;
+
+   FLOAT res4_0;
+   FLOAT res4_1;
+   FLOAT res4_2;
+   FLOAT res4_3;
+   FLOAT res4_4;
+   FLOAT res4_5;
+   FLOAT res4_6;
+   FLOAT res4_7;
+
+   FLOAT res4_8;
+   FLOAT res4_9;
+   FLOAT res4_10;
+   FLOAT res4_11;
+   FLOAT res4_12;
+   FLOAT res4_13;
+   FLOAT res4_14;
+   FLOAT res4_15;
+
+   FLOAT res5_0;
+   FLOAT res5_1;
+   FLOAT res5_2;
+   FLOAT res5_3;
+   FLOAT res5_4;
+   FLOAT res5_5;
+   FLOAT res5_6;
+   FLOAT res5_7;
+
+   FLOAT res5_8;
+   FLOAT res5_9;
+   FLOAT res5_10;
+   FLOAT res5_11;
+   FLOAT res5_12;
+   FLOAT res5_13;
+   FLOAT res5_14;
+   FLOAT res5_15;
+
+   FLOAT res6_0;
+   FLOAT res6_1;
+   FLOAT res6_2;
+   FLOAT res6_3;
+   FLOAT res6_4;
+   FLOAT res6_5;
+   FLOAT res6_6;
+   FLOAT res6_7;
+
+   FLOAT res6_8;
+   FLOAT res6_9;
+   FLOAT res6_10;
+   FLOAT res6_11;
+   FLOAT res6_12;
+   FLOAT res6_13;
+   FLOAT res6_14;
+   FLOAT res6_15;
+
+   FLOAT res7_0;
+   FLOAT res7_1;
+   FLOAT res7_2;
+   FLOAT res7_3;
+   FLOAT res7_4;
+   FLOAT res7_5;
+   FLOAT res7_6;
+   FLOAT res7_7;
+
+   FLOAT res7_8;
+   FLOAT res7_9;
+   FLOAT res7_10;
+   FLOAT res7_11;
+   FLOAT res7_12;
+   FLOAT res7_13;
+   FLOAT res7_14;
+   FLOAT res7_15;
+
+   FLOAT a0;
+   FLOAT a1;
+
+   FLOAT b0;
+   FLOAT b1;
+   FLOAT b2;
+   FLOAT b3;
+   FLOAT b4;
+   FLOAT b5;
+   FLOAT b6;
+   FLOAT b7;
+
+   BLASLONG off, temp;
+
+#if !defined(LEFT)
+   off = -offset;
+#else
+   off = 0;
+#endif
+
+   for (j=0; j<bn/8; j+=1)
+   {
+        C0 = C;
+        C1 = C0+ldc;
+        C2 = C0+2*ldc;
+        C3 = C0+3*ldc;
+        C4 = C0+4*ldc;
+        C5 = C0+5*ldc;
+        C6 = C0+6*ldc;
+        C7 = C0+7*ldc;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+	off = offset;
+#endif
+
+
+        ptrba = ba;
+
+
+        for (i=0; i<bm/16; i+=1)
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*16;
+		ptrbb = bb + off*8;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+		res0_4 = 0;
+		res0_5 = 0;
+		res0_6 = 0;
+		res0_7 = 0;
+
+		res0_8  = 0;
+		res0_9  = 0;
+		res0_10 = 0;
+		res0_11 = 0;
+		res0_12 = 0;
+		res0_13 = 0;
+		res0_14 = 0;
+		res0_15 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		res1_2 = 0;
+		res1_3 = 0;
+		res1_4 = 0;
+		res1_5 = 0;
+		res1_6 = 0;
+		res1_7 = 0;
+
+		res1_8  = 0;
+		res1_9  = 0;
+		res1_10 = 0;
+		res1_11 = 0;
+		res1_12 = 0;
+		res1_13 = 0;
+		res1_14 = 0;
+		res1_15 = 0;
+
+		res2_0 = 0;
+		res2_1 = 0;
+		res2_2 = 0;
+		res2_3 = 0;
+		res2_4 = 0;
+		res2_5 = 0;
+		res2_6 = 0;
+		res2_7 = 0;
+
+		res2_8  = 0;
+		res2_9  = 0;
+		res2_10 = 0;
+		res2_11 = 0;
+		res2_12 = 0;
+		res2_13 = 0;
+		res2_14 = 0;
+		res2_15 = 0;
+
+		res3_0 = 0;
+		res3_1 = 0;
+		res3_2 = 0;
+		res3_3 = 0;
+		res3_4 = 0;
+		res3_5 = 0;
+		res3_6 = 0;
+		res3_7 = 0;
+
+		res3_8  = 0;
+		res3_9  = 0;
+		res3_10 = 0;
+		res3_11 = 0;
+		res3_12 = 0;
+		res3_13 = 0;
+		res3_14 = 0;
+		res3_15 = 0;
+
+
+		res4_0 = 0;
+		res4_1 = 0;
+		res4_2 = 0;
+		res4_3 = 0;
+		res4_4 = 0;
+		res4_5 = 0;
+		res4_6 = 0;
+		res4_7 = 0;
+
+		res4_8  = 0;
+		res4_9  = 0;
+		res4_10 = 0;
+		res4_11 = 0;
+		res4_12 = 0;
+		res4_13 = 0;
+		res4_14 = 0;
+		res4_15 = 0;
+
+		res5_0 = 0;
+		res5_1 = 0;
+		res5_2 = 0;
+		res5_3 = 0;
+		res5_4 = 0;
+		res5_5 = 0;
+		res5_6 = 0;
+		res5_7 = 0;
+
+		res5_8  = 0;
+		res5_9  = 0;
+		res5_10 = 0;
+		res5_11 = 0;
+		res5_12 = 0;
+		res5_13 = 0;
+		res5_14 = 0;
+		res5_15 = 0;
+
+		res6_0 = 0;
+		res6_1 = 0;
+		res6_2 = 0;
+		res6_3 = 0;
+		res6_4 = 0;
+		res6_5 = 0;
+		res6_6 = 0;
+		res6_7 = 0;
+
+		res6_8  = 0;
+		res6_9  = 0;
+		res6_10 = 0;
+		res6_11 = 0;
+		res6_12 = 0;
+		res6_13 = 0;
+		res6_14 = 0;
+		res6_15 = 0;
+
+		res7_0 = 0;
+		res7_1 = 0;
+		res7_2 = 0;
+		res7_3 = 0;
+		res7_4 = 0;
+		res7_5 = 0;
+		res7_6 = 0;
+		res7_7 = 0;
+
+		res7_8  = 0;
+		res7_9  = 0;
+		res7_10 = 0;
+		res7_11 = 0;
+		res7_12 = 0;
+		res7_13 = 0;
+		res7_14 = 0;
+		res7_15 = 0;
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+16;	// number of values in A
+#else
+		temp = off+8;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+			b4 = ptrbb[4];
+			b5 = ptrbb[5];
+			b6 = ptrbb[6];
+			b7 = ptrbb[7];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+			res4_0 += a0*b4;
+			res5_0 += a0*b5;
+			res6_0 += a0*b6;
+			res7_0 += a0*b7;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+			res2_1 += a1*b2;
+			res3_1 += a1*b3;
+			res4_1 += a1*b4;
+			res5_1 += a1*b5;
+			res6_1 += a1*b6;
+			res7_1 += a1*b7;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+			res1_2 += a0*b1;
+			res2_2 += a0*b2;
+			res3_2 += a0*b3;
+			res4_2 += a0*b4;
+			res5_2 += a0*b5;
+			res6_2 += a0*b6;
+			res7_2 += a0*b7;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+			res1_3 += a1*b1;
+			res2_3 += a1*b2;
+			res3_3 += a1*b3;
+			res4_3 += a1*b4;
+			res5_3 += a1*b5;
+			res6_3 += a1*b6;
+			res7_3 += a1*b7;
+
+			a0 = ptrba[4];
+			res0_4 += a0*b0;
+			res1_4 += a0*b1;
+			res2_4 += a0*b2;
+			res3_4 += a0*b3;
+			res4_4 += a0*b4;
+			res5_4 += a0*b5;
+			res6_4 += a0*b6;
+			res7_4 += a0*b7;
+
+			a1 = ptrba[5];
+			res0_5 += a1*b0;
+			res1_5 += a1*b1;
+			res2_5 += a1*b2;
+			res3_5 += a1*b3;
+			res4_5 += a1*b4;
+			res5_5 += a1*b5;
+			res6_5 += a1*b6;
+			res7_5 += a1*b7;
+
+			a0 = ptrba[6];
+			res0_6 += a0*b0;
+			res1_6 += a0*b1;
+			res2_6 += a0*b2;
+			res3_6 += a0*b3;
+			res4_6 += a0*b4;
+			res5_6 += a0*b5;
+			res6_6 += a0*b6;
+			res7_6 += a0*b7;
+
+			a1 = ptrba[7];
+			res0_7 += a1*b0;
+			res1_7 += a1*b1;
+			res2_7 += a1*b2;
+			res3_7 += a1*b3;
+			res4_7 += a1*b4;
+			res5_7 += a1*b5;
+			res6_7 += a1*b6;
+			res7_7 += a1*b7;
+
+			a0 = ptrba[8];
+			res0_8 += a0*b0;
+			res1_8 += a0*b1;
+			res2_8 += a0*b2;
+			res3_8 += a0*b3;
+			res4_8 += a0*b4;
+			res5_8 += a0*b5;
+			res6_8 += a0*b6;
+			res7_8 += a0*b7;
+
+			a1 = ptrba[9];
+			res0_9 += a1*b0;
+			res1_9 += a1*b1;
+			res2_9 += a1*b2;
+			res3_9 += a1*b3;
+			res4_9 += a1*b4;
+			res5_9 += a1*b5;
+			res6_9 += a1*b6;
+			res7_9 += a1*b7;
+
+			a0 = ptrba[10];
+			res0_10 += a0*b0;
+			res1_10 += a0*b1;
+			res2_10 += a0*b2;
+			res3_10 += a0*b3;
+			res4_10 += a0*b4;
+			res5_10 += a0*b5;
+			res6_10 += a0*b6;
+			res7_10 += a0*b7;
+
+			a1 = ptrba[11];
+			res0_11 += a1*b0;
+			res1_11 += a1*b1;
+			res2_11 += a1*b2;
+			res3_11 += a1*b3;
+			res4_11 += a1*b4;
+			res5_11 += a1*b5;
+			res6_11 += a1*b6;
+			res7_11 += a1*b7;
+
+			a0 = ptrba[12];
+			res0_12 += a0*b0;
+			res1_12 += a0*b1;
+			res2_12 += a0*b2;
+			res3_12 += a0*b3;
+			res4_12 += a0*b4;
+			res5_12 += a0*b5;
+			res6_12 += a0*b6;
+			res7_12 += a0*b7;
+
+			a1 = ptrba[13];
+			res0_13 += a1*b0;
+			res1_13 += a1*b1;
+			res2_13 += a1*b2;
+			res3_13 += a1*b3;
+			res4_13 += a1*b4;
+			res5_13 += a1*b5;
+			res6_13 += a1*b6;
+			res7_13 += a1*b7;
+
+			a0 = ptrba[14];
+			res0_14 += a0*b0;
+			res1_14 += a0*b1;
+			res2_14 += a0*b2;
+			res3_14 += a0*b3;
+			res4_14 += a0*b4;
+			res5_14 += a0*b5;
+			res6_14 += a0*b6;
+			res7_14 += a0*b7;
+
+			a1 = ptrba[15];
+			res0_15 += a1*b0;
+			res1_15 += a1*b1;
+			res2_15 += a1*b2;
+			res3_15 += a1*b3;
+			res4_15 += a1*b4;
+			res5_15 += a1*b5;
+			res6_15 += a1*b6;
+			res7_15 += a1*b7;
+
+
+			ptrba = ptrba+16;
+			ptrbb = ptrbb+8;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+		res0_4 *= alpha;
+		res0_5 *= alpha;
+		res0_6 *= alpha;
+		res0_7 *= alpha;
+
+		res0_8  *= alpha;
+		res0_9  *= alpha;
+		res0_10 *= alpha;
+		res0_11 *= alpha;
+		res0_12 *= alpha;
+		res0_13 *= alpha;
+		res0_14 *= alpha;
+		res0_15 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		res1_2 *= alpha;
+		res1_3 *= alpha;
+		res1_4 *= alpha;
+		res1_5 *= alpha;
+		res1_6 *= alpha;
+		res1_7 *= alpha;
+
+		res1_8  *= alpha;
+		res1_9  *= alpha;
+		res1_10 *= alpha;
+		res1_11 *= alpha;
+		res1_12 *= alpha;
+		res1_13 *= alpha;
+		res1_14 *= alpha;
+		res1_15 *= alpha;
+		
+		res2_0 *= alpha;
+		res2_1 *= alpha;
+		res2_2 *= alpha;
+		res2_3 *= alpha;
+		res2_4 *= alpha;
+		res2_5 *= alpha;
+		res2_6 *= alpha;
+		res2_7 *= alpha;
+
+		res2_8  *= alpha;
+		res2_9  *= alpha;
+		res2_10 *= alpha;
+		res2_11 *= alpha;
+		res2_12 *= alpha;
+		res2_13 *= alpha;
+		res2_14 *= alpha;
+		res2_15 *= alpha;
+
+		res3_0 *= alpha;
+		res3_1 *= alpha;
+		res3_2 *= alpha;
+		res3_3 *= alpha;
+		res3_4 *= alpha;
+		res3_5 *= alpha;
+		res3_6 *= alpha;
+		res3_7 *= alpha;
+
+		res3_8  *= alpha;
+		res3_9  *= alpha;
+		res3_10 *= alpha;
+		res3_11 *= alpha;
+		res3_12 *= alpha;
+		res3_13 *= alpha;
+		res3_14 *= alpha;
+		res3_15 *= alpha;
+
+		res4_0 *= alpha;
+		res4_1 *= alpha;
+		res4_2 *= alpha;
+		res4_3 *= alpha;
+		res4_4 *= alpha;
+		res4_5 *= alpha;
+		res4_6 *= alpha;
+		res4_7 *= alpha;
+
+		res4_8  *= alpha;
+		res4_9  *= alpha;
+		res4_10 *= alpha;
+		res4_11 *= alpha;
+		res4_12 *= alpha;
+		res4_13 *= alpha;
+		res4_14 *= alpha;
+		res4_15 *= alpha;
+
+		res5_0 *= alpha;
+		res5_1 *= alpha;
+		res5_2 *= alpha;
+		res5_3 *= alpha;
+		res5_4 *= alpha;
+		res5_5 *= alpha;
+		res5_6 *= alpha;
+		res5_7 *= alpha;
+
+		res5_8  *= alpha;
+		res5_9  *= alpha;
+		res5_10 *= alpha;
+		res5_11 *= alpha;
+		res5_12 *= alpha;
+		res5_13 *= alpha;
+		res5_14 *= alpha;
+		res5_15 *= alpha;
+		
+		res6_0 *= alpha;
+		res6_1 *= alpha;
+		res6_2 *= alpha;
+		res6_3 *= alpha;
+		res6_4 *= alpha;
+		res6_5 *= alpha;
+		res6_6 *= alpha;
+		res6_7 *= alpha;
+
+		res6_8  *= alpha;
+		res6_9  *= alpha;
+		res6_10 *= alpha;
+		res6_11 *= alpha;
+		res6_12 *= alpha;
+		res6_13 *= alpha;
+		res6_14 *= alpha;
+		res6_15 *= alpha;
+
+		res7_0 *= alpha;
+		res7_1 *= alpha;
+		res7_2 *= alpha;
+		res7_3 *= alpha;
+		res7_4 *= alpha;
+		res7_5 *= alpha;
+		res7_6 *= alpha;
+		res7_7 *= alpha;
+
+		res7_8  *= alpha;
+		res7_9  *= alpha;
+		res7_10 *= alpha;
+		res7_11 *= alpha;
+		res7_12 *= alpha;
+		res7_13 *= alpha;
+		res7_14 *= alpha;
+		res7_15 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+		C0[4] = res0_4;
+		C0[5] = res0_5;
+		C0[6] = res0_6;
+		C0[7] = res0_7;
+
+		C0[8]  = res0_8;
+		C0[9]  = res0_9;
+		C0[10] = res0_10;
+		C0[11] = res0_11;
+		C0[12] = res0_12;
+		C0[13] = res0_13;
+		C0[14] = res0_14;
+		C0[15] = res0_15;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+		C1[2] = res1_2;
+		C1[3] = res1_3;
+		C1[4] = res1_4;
+		C1[5] = res1_5;
+		C1[6] = res1_6;
+		C1[7] = res1_7;
+
+		C1[8]  = res1_8;
+		C1[9]  = res1_9;
+		C1[10] = res1_10;
+		C1[11] = res1_11;
+		C1[12] = res1_12;
+		C1[13] = res1_13;
+		C1[14] = res1_14;
+		C1[15] = res1_15;
+
+		C2[0] = res2_0;
+		C2[1] = res2_1;
+		C2[2] = res2_2;
+		C2[3] = res2_3;
+		C2[4] = res2_4;
+		C2[5] = res2_5;
+		C2[6] = res2_6;
+		C2[7] = res2_7;
+
+		C2[8]  = res2_8;
+		C2[9]  = res2_9;
+		C2[10] = res2_10;
+		C2[11] = res2_11;
+		C2[12] = res2_12;
+		C2[13] = res2_13;
+		C2[14] = res2_14;
+		C2[15] = res2_15;
+
+		C3[0] = res3_0;
+		C3[1] = res3_1;
+		C3[2] = res3_2;
+		C3[3] = res3_3;
+		C3[4] = res3_4;
+		C3[5] = res3_5;
+		C3[6] = res3_6;
+		C3[7] = res3_7;
+
+		C3[8]  = res3_8;
+		C3[9]  = res3_9;
+		C3[10] = res3_10;
+		C3[11] = res3_11;
+		C3[12] = res3_12;
+		C3[13] = res3_13;
+		C3[14] = res3_14;
+		C3[15] = res3_15;
+
+		C4[0] = res4_0;
+		C4[1] = res4_1;
+		C4[2] = res4_2;
+		C4[3] = res4_3;
+		C4[4] = res4_4;
+		C4[5] = res4_5;
+		C4[6] = res4_6;
+		C4[7] = res4_7;
+
+		C4[8]  = res4_8;
+		C4[9]  = res4_9;
+		C4[10] = res4_10;
+		C4[11] = res4_11;
+		C4[12] = res4_12;
+		C4[13] = res4_13;
+		C4[14] = res4_14;
+		C4[15] = res4_15;
+
+		C5[0] = res5_0;
+		C5[1] = res5_1;
+		C5[2] = res5_2;
+		C5[3] = res5_3;
+		C5[4] = res5_4;
+		C5[5] = res5_5;
+		C5[6] = res5_6;
+		C5[7] = res5_7;
+
+		C5[8]  = res5_8;
+		C5[9]  = res5_9;
+		C5[10] = res5_10;
+		C5[11] = res5_11;
+		C5[12] = res5_12;
+		C5[13] = res5_13;
+		C5[14] = res5_14;
+		C5[15] = res5_15;
+
+		C6[0] = res6_0;
+		C6[1] = res6_1;
+		C6[2] = res6_2;
+		C6[3] = res6_3;
+		C6[4] = res6_4;
+		C6[5] = res6_5;
+		C6[6] = res6_6;
+		C6[7] = res6_7;
+
+		C6[8]  = res6_8;
+		C6[9]  = res6_9;
+		C6[10] = res6_10;
+		C6[11] = res6_11;
+		C6[12] = res6_12;
+		C6[13] = res6_13;
+		C6[14] = res6_14;
+		C6[15] = res6_15;
+
+		C7[0] = res7_0;
+		C7[1] = res7_1;
+		C7[2] = res7_2;
+		C7[3] = res7_3;
+		C7[4] = res7_4;
+		C7[5] = res7_5;
+		C7[6] = res7_6;
+		C7[7] = res7_7;
+
+		C7[8]  = res7_8;
+		C7[9]  = res7_9;
+		C7[10] = res7_10;
+		C7[11] = res7_11;
+		C7[12] = res7_12;
+		C7[13] = res7_13;
+		C7[14] = res7_14;
+		C7[15] = res7_15;
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 16; // number of values in A
+#else
+		temp -= 8; // number of values in B
+#endif
+		ptrba += temp*16;
+		ptrbb += temp*8;
+#endif
+
+#ifdef LEFT
+		off += 16; // number of values in A
+#endif
+
+		C0 = C0+16;
+		C1 = C1+16;
+		C2 = C2+16;
+		C3 = C3+16;
+		C4 = C4+16;
+		C5 = C5+16;
+		C6 = C6+16;
+		C7 = C7+16;
+	}
+
+
+        if ( bm & 8)
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*8;
+		ptrbb = bb + off*8;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+		res0_4 = 0;
+		res0_5 = 0;
+		res0_6 = 0;
+		res0_7 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		res1_2 = 0;
+		res1_3 = 0;
+		res1_4 = 0;
+		res1_5 = 0;
+		res1_6 = 0;
+		res1_7 = 0;
+
+		res2_0 = 0;
+		res2_1 = 0;
+		res2_2 = 0;
+		res2_3 = 0;
+		res2_4 = 0;
+		res2_5 = 0;
+		res2_6 = 0;
+		res2_7 = 0;
+
+		res3_0 = 0;
+		res3_1 = 0;
+		res3_2 = 0;
+		res3_3 = 0;
+		res3_4 = 0;
+		res3_5 = 0;
+		res3_6 = 0;
+		res3_7 = 0;
+
+		res4_0 = 0;
+		res4_1 = 0;
+		res4_2 = 0;
+		res4_3 = 0;
+		res4_4 = 0;
+		res4_5 = 0;
+		res4_6 = 0;
+		res4_7 = 0;
+
+		res5_0 = 0;
+		res5_1 = 0;
+		res5_2 = 0;
+		res5_3 = 0;
+		res5_4 = 0;
+		res5_5 = 0;
+		res5_6 = 0;
+		res5_7 = 0;
+
+		res6_0 = 0;
+		res6_1 = 0;
+		res6_2 = 0;
+		res6_3 = 0;
+		res6_4 = 0;
+		res6_5 = 0;
+		res6_6 = 0;
+		res6_7 = 0;
+
+		res7_0 = 0;
+		res7_1 = 0;
+		res7_2 = 0;
+		res7_3 = 0;
+		res7_4 = 0;
+		res7_5 = 0;
+		res7_6 = 0;
+		res7_7 = 0;
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+8;	// number of values in A
+#else
+		temp = off+8;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+			b4 = ptrbb[4];
+			b5 = ptrbb[5];
+			b6 = ptrbb[6];
+			b7 = ptrbb[7];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+			res4_0 += a0*b4;
+			res5_0 += a0*b5;
+			res6_0 += a0*b6;
+			res7_0 += a0*b7;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+			res2_1 += a1*b2;
+			res3_1 += a1*b3;
+			res4_1 += a1*b4;
+			res5_1 += a1*b5;
+			res6_1 += a1*b6;
+			res7_1 += a1*b7;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+			res1_2 += a0*b1;
+			res2_2 += a0*b2;
+			res3_2 += a0*b3;
+			res4_2 += a0*b4;
+			res5_2 += a0*b5;
+			res6_2 += a0*b6;
+			res7_2 += a0*b7;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+			res1_3 += a1*b1;
+			res2_3 += a1*b2;
+			res3_3 += a1*b3;
+			res4_3 += a1*b4;
+			res5_3 += a1*b5;
+			res6_3 += a1*b6;
+			res7_3 += a1*b7;
+
+			a0 = ptrba[4];
+			res0_4 += a0*b0;
+			res1_4 += a0*b1;
+			res2_4 += a0*b2;
+			res3_4 += a0*b3;
+			res4_4 += a0*b4;
+			res5_4 += a0*b5;
+			res6_4 += a0*b6;
+			res7_4 += a0*b7;
+
+			a1 = ptrba[5];
+			res0_5 += a1*b0;
+			res1_5 += a1*b1;
+			res2_5 += a1*b2;
+			res3_5 += a1*b3;
+			res4_5 += a1*b4;
+			res5_5 += a1*b5;
+			res6_5 += a1*b6;
+			res7_5 += a1*b7;
+
+			a0 = ptrba[6];
+			res0_6 += a0*b0;
+			res1_6 += a0*b1;
+			res2_6 += a0*b2;
+			res3_6 += a0*b3;
+			res4_6 += a0*b4;
+			res5_6 += a0*b5;
+			res6_6 += a0*b6;
+			res7_6 += a0*b7;
+
+			a1 = ptrba[7];
+			res0_7 += a1*b0;
+			res1_7 += a1*b1;
+			res2_7 += a1*b2;
+			res3_7 += a1*b3;
+			res4_7 += a1*b4;
+			res5_7 += a1*b5;
+			res6_7 += a1*b6;
+			res7_7 += a1*b7;
+
+			ptrba = ptrba+8;
+			ptrbb = ptrbb+8;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+		res0_4 *= alpha;
+		res0_5 *= alpha;
+		res0_6 *= alpha;
+		res0_7 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		res1_2 *= alpha;
+		res1_3 *= alpha;
+		res1_4 *= alpha;
+		res1_5 *= alpha;
+		res1_6 *= alpha;
+		res1_7 *= alpha;
+		
+		res2_0 *= alpha;
+		res2_1 *= alpha;
+		res2_2 *= alpha;
+		res2_3 *= alpha;
+		res2_4 *= alpha;
+		res2_5 *= alpha;
+		res2_6 *= alpha;
+		res2_7 *= alpha;
+
+		res3_0 *= alpha;
+		res3_1 *= alpha;
+		res3_2 *= alpha;
+		res3_3 *= alpha;
+		res3_4 *= alpha;
+		res3_5 *= alpha;
+		res3_6 *= alpha;
+		res3_7 *= alpha;
+
+		res4_0 *= alpha;
+		res4_1 *= alpha;
+		res4_2 *= alpha;
+		res4_3 *= alpha;
+		res4_4 *= alpha;
+		res4_5 *= alpha;
+		res4_6 *= alpha;
+		res4_7 *= alpha;
+
+		res5_0 *= alpha;
+		res5_1 *= alpha;
+		res5_2 *= alpha;
+		res5_3 *= alpha;
+		res5_4 *= alpha;
+		res5_5 *= alpha;
+		res5_6 *= alpha;
+		res5_7 *= alpha;
+		
+		res6_0 *= alpha;
+		res6_1 *= alpha;
+		res6_2 *= alpha;
+		res6_3 *= alpha;
+		res6_4 *= alpha;
+		res6_5 *= alpha;
+		res6_6 *= alpha;
+		res6_7 *= alpha;
+
+		res7_0 *= alpha;
+		res7_1 *= alpha;
+		res7_2 *= alpha;
+		res7_3 *= alpha;
+		res7_4 *= alpha;
+		res7_5 *= alpha;
+		res7_6 *= alpha;
+		res7_7 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+		C0[4] = res0_4;
+		C0[5] = res0_5;
+		C0[6] = res0_6;
+		C0[7] = res0_7;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+		C1[2] = res1_2;
+		C1[3] = res1_3;
+		C1[4] = res1_4;
+		C1[5] = res1_5;
+		C1[6] = res1_6;
+		C1[7] = res1_7;
+
+		C2[0] = res2_0;
+		C2[1] = res2_1;
+		C2[2] = res2_2;
+		C2[3] = res2_3;
+		C2[4] = res2_4;
+		C2[5] = res2_5;
+		C2[6] = res2_6;
+		C2[7] = res2_7;
+
+		C3[0] = res3_0;
+		C3[1] = res3_1;
+		C3[2] = res3_2;
+		C3[3] = res3_3;
+		C3[4] = res3_4;
+		C3[5] = res3_5;
+		C3[6] = res3_6;
+		C3[7] = res3_7;
+
+		C4[0] = res4_0;
+		C4[1] = res4_1;
+		C4[2] = res4_2;
+		C4[3] = res4_3;
+		C4[4] = res4_4;
+		C4[5] = res4_5;
+		C4[6] = res4_6;
+		C4[7] = res4_7;
+
+		C5[0] = res5_0;
+		C5[1] = res5_1;
+		C5[2] = res5_2;
+		C5[3] = res5_3;
+		C5[4] = res5_4;
+		C5[5] = res5_5;
+		C5[6] = res5_6;
+		C5[7] = res5_7;
+
+		C6[0] = res6_0;
+		C6[1] = res6_1;
+		C6[2] = res6_2;
+		C6[3] = res6_3;
+		C6[4] = res6_4;
+		C6[5] = res6_5;
+		C6[6] = res6_6;
+		C6[7] = res6_7;
+
+		C7[0] = res7_0;
+		C7[1] = res7_1;
+		C7[2] = res7_2;
+		C7[3] = res7_3;
+		C7[4] = res7_4;
+		C7[5] = res7_5;
+		C7[6] = res7_6;
+		C7[7] = res7_7;
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 8; // number of values in A
+#else
+		temp -= 8; // number of values in B
+#endif
+		ptrba += temp*8;
+		ptrbb += temp*8;
+#endif
+
+#ifdef LEFT
+		off += 8; // number of values in A
+#endif
+
+		C0 = C0+8;
+		C1 = C1+8;
+		C2 = C2+8;
+		C3 = C3+8;
+		C4 = C4+8;
+		C5 = C5+8;
+		C6 = C6+8;
+		C7 = C7+8;
+	}
+
+	if ( bm & 4 )
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*4;
+		ptrbb = bb + off*8;
+#endif
+
+		res0_0 = 0;
+		res1_0 = 0;
+		res2_0 = 0;
+		res3_0 = 0;
+		res4_0 = 0;
+		res5_0 = 0;
+		res6_0 = 0;
+		res7_0 = 0;
+
+		res0_1 = 0;
+		res1_1 = 0;
+		res2_1 = 0;
+		res3_1 = 0;
+		res4_1 = 0;
+		res5_1 = 0;
+		res6_1 = 0;
+		res7_1 = 0;
+
+		res0_2 = 0;
+		res1_2 = 0;
+		res2_2 = 0;
+		res3_2 = 0;
+		res4_2 = 0;
+		res5_2 = 0;
+		res6_2 = 0;
+		res7_2 = 0;
+
+		res0_3 = 0;
+		res1_3 = 0;
+		res2_3 = 0;
+		res3_3 = 0;
+		res4_3 = 0;
+		res5_3 = 0;
+		res6_3 = 0;
+		res7_3 = 0;
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+4;	// number of values in A
+#else
+		temp = off+8;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+			b4 = ptrbb[4];
+			b5 = ptrbb[5];
+			b6 = ptrbb[6];
+			b7 = ptrbb[7];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+			res4_0 += a0*b4;
+			res5_0 += a0*b5;
+			res6_0 += a0*b6;
+			res7_0 += a0*b7;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+			res2_1 += a1*b2;
+			res3_1 += a1*b3;
+			res4_1 += a1*b4;
+			res5_1 += a1*b5;
+			res6_1 += a1*b6;
+			res7_1 += a1*b7;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+			res1_2 += a0*b1;
+			res2_2 += a0*b2;
+			res3_2 += a0*b3;
+			res4_2 += a0*b4;
+			res5_2 += a0*b5;
+			res6_2 += a0*b6;
+			res7_2 += a0*b7;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+			res1_3 += a1*b1;
+			res2_3 += a1*b2;
+			res3_3 += a1*b3;
+			res4_3 += a1*b4;
+			res5_3 += a1*b5;
+			res6_3 += a1*b6;
+			res7_3 += a1*b7;
+
+			ptrba = ptrba+4;
+			ptrbb = ptrbb+8;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		res1_2 *= alpha;
+		res1_3 *= alpha;
+		
+		res2_0 *= alpha;
+		res2_1 *= alpha;
+		res2_2 *= alpha;
+		res2_3 *= alpha;
+
+		res3_0 *= alpha;
+		res3_1 *= alpha;
+		res3_2 *= alpha;
+		res3_3 *= alpha;
+
+		res4_0 *= alpha;
+		res4_1 *= alpha;
+		res4_2 *= alpha;
+		res4_3 *= alpha;
+
+		res5_0 *= alpha;
+		res5_1 *= alpha;
+		res5_2 *= alpha;
+		res5_3 *= alpha;
+		
+		res6_0 *= alpha;
+		res6_1 *= alpha;
+		res6_2 *= alpha;
+		res6_3 *= alpha;
+
+		res7_0 *= alpha;
+		res7_1 *= alpha;
+		res7_2 *= alpha;
+		res7_3 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+		C1[2] = res1_2;
+		C1[3] = res1_3;
+
+		C2[0] = res2_0;
+		C2[1] = res2_1;
+		C2[2] = res2_2;
+		C2[3] = res2_3;
+
+		C3[0] = res3_0;
+		C3[1] = res3_1;
+		C3[2] = res3_2;
+		C3[3] = res3_3;
+
+		C4[0] = res4_0;
+		C4[1] = res4_1;
+		C4[2] = res4_2;
+		C4[3] = res4_3;
+
+		C5[0] = res5_0;
+		C5[1] = res5_1;
+		C5[2] = res5_2;
+		C5[3] = res5_3;
+
+		C6[0] = res6_0;
+		C6[1] = res6_1;
+		C6[2] = res6_2;
+		C6[3] = res6_3;
+
+		C7[0] = res7_0;
+		C7[1] = res7_1;
+		C7[2] = res7_2;
+		C7[3] = res7_3;
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 4; // number of values in A
+#else
+		temp -= 8; // number of values in B
+#endif
+		ptrba += temp*4;
+		ptrbb += temp*8;
+#endif
+
+#ifdef LEFT
+		off += 4; // number of values in A
+#endif
+
+		C0 = C0+4;
+		C1 = C1+4;
+		C2 = C2+4;
+		C3 = C3+4;
+		C4 = C4+4;
+		C5 = C5+4;
+		C6 = C6+4;
+		C7 = C7+4;
+	}
+
+	if ( bm & 2 )
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*2;
+		ptrbb = bb + off*8;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		
+		res2_0 = 0;
+		res2_1 = 0;
+
+		res3_0 = 0;
+		res3_1 = 0;
+
+		res4_0 = 0;
+		res4_1 = 0;
+
+		res5_0 = 0;
+		res5_1 = 0;
+		
+		res6_0 = 0;
+		res6_1 = 0;
+
+		res7_0 = 0;
+		res7_1 = 0;
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+2;	// number of values in A
+#else
+		temp = off+8;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+			b4 = ptrbb[4];
+			b5 = ptrbb[5];
+			b6 = ptrbb[6];
+			b7 = ptrbb[7];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+			res4_0 += a0*b4;
+			res5_0 += a0*b5;
+			res6_0 += a0*b6;
+			res7_0 += a0*b7;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+			res2_1 += a1*b2;
+			res3_1 += a1*b3;
+			res4_1 += a1*b4;
+			res5_1 += a1*b5;
+			res6_1 += a1*b6;
+			res7_1 += a1*b7;
+
+			ptrba = ptrba+2;
+			ptrbb = ptrbb+8;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		
+		res2_0 *= alpha;
+		res2_1 *= alpha;
+
+		res3_0 *= alpha;
+		res3_1 *= alpha;
+
+		res4_0 *= alpha;
+		res4_1 *= alpha;
+
+		res5_0 *= alpha;
+		res5_1 *= alpha;
+		
+		res6_0 *= alpha;
+		res6_1 *= alpha;
+
+		res7_0 *= alpha;
+		res7_1 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+
+		C2[0] = res2_0;
+		C2[1] = res2_1;
+
+		C3[0] = res3_0;
+		C3[1] = res3_1;
+
+		C4[0] = res4_0;
+		C4[1] = res4_1;
+
+		C5[0] = res5_0;
+		C5[1] = res5_1;
+
+		C6[0] = res6_0;
+		C6[1] = res6_1;
+
+		C7[0] = res7_0;
+		C7[1] = res7_1;
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 2; // number of values in A
+#else
+		temp -= 8; // number of values in B
+#endif
+		ptrba += temp*2;
+		ptrbb += temp*8;
+#endif
+
+#ifdef LEFT
+		off += 2; // number of values in A
+#endif
+
+		C0 = C0+2;
+		C1 = C1+2;
+		C2 = C2+2;
+		C3 = C3+2;
+		C4 = C4+2;
+		C5 = C5+2;
+		C6 = C6+2;
+		C7 = C7+2;
+	}
+	
+	if ( bm & 1 )
+	{
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*1;
+		ptrbb = bb + off*8;
+#endif
+
+		res0_0 = 0;
+		res1_0 = 0;
+		res2_0 = 0;
+		res3_0 = 0;
+		res4_0 = 0;
+		res5_0 = 0;
+		res6_0 = 0;
+		res7_0 = 0;
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+1;	// number of values in A
+#else
+		temp = off+8;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+			b4 = ptrbb[4];
+			b5 = ptrbb[5];
+			b6 = ptrbb[6];
+			b7 = ptrbb[7];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+			res4_0 += a0*b4;
+			res5_0 += a0*b5;
+			res6_0 += a0*b6;
+			res7_0 += a0*b7;
+
+			ptrba = ptrba+1;
+			ptrbb = ptrbb+8;
+                }
+
+		res0_0 *= alpha;
+
+		res1_0 *= alpha;
+		
+		res2_0 *= alpha;
+
+		res3_0 *= alpha;
+
+		res4_0 *= alpha;
+
+		res5_0 *= alpha;
+		
+		res6_0 *= alpha;
+
+		res7_0 *= alpha;
+
+		C0[0] = res0_0;
+
+		C1[0] = res1_0;
+
+		C2[0] = res2_0;
+
+		C3[0] = res3_0;
+
+		C4[0] = res4_0;
+
+		C5[0] = res5_0;
+
+		C6[0] = res6_0;
+
+		C7[0] = res7_0;
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 1; // number of values in A
+#else
+		temp -= 8; // number of values in B
+#endif
+		ptrba += temp*1;
+		ptrbb += temp*8;
+#endif
+
+#ifdef LEFT
+		off += 1; // number of values in A
+#endif
+
+		C0 = C0+1;
+		C1 = C1+1;
+		C2 = C2+1;
+		C3 = C3+1;
+		C4 = C4+1;
+		C5 = C5+1;
+		C6 = C6+1;
+		C7 = C7+1;
+	}
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+		off += 8;
+#endif
+
+        k = (bk<<3);
+        bb = bb+k;
+        i = (ldc<<3);
+        C = C+i;
+    }
+
+
+   if( bn & 4 )
+   {
+        C0 = C;
+        C1 = C0+ldc;
+        C2 = C0+2*ldc;
+        C3 = C0+3*ldc;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+	off = offset;
+#endif
+
+
+        ptrba = ba;
+
+
+        for (i=0; i<bm/16; i+=1)
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*16;
+		ptrbb = bb + off*4;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+		res0_4 = 0;
+		res0_5 = 0;
+		res0_6 = 0;
+		res0_7 = 0;
+
+		res0_8  = 0;
+		res0_9  = 0;
+		res0_10 = 0;
+		res0_11 = 0;
+		res0_12 = 0;
+		res0_13 = 0;
+		res0_14 = 0;
+		res0_15 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		res1_2 = 0;
+		res1_3 = 0;
+		res1_4 = 0;
+		res1_5 = 0;
+		res1_6 = 0;
+		res1_7 = 0;
+
+		res1_8  = 0;
+		res1_9  = 0;
+		res1_10 = 0;
+		res1_11 = 0;
+		res1_12 = 0;
+		res1_13 = 0;
+		res1_14 = 0;
+		res1_15 = 0;
+
+		res2_0 = 0;
+		res2_1 = 0;
+		res2_2 = 0;
+		res2_3 = 0;
+		res2_4 = 0;
+		res2_5 = 0;
+		res2_6 = 0;
+		res2_7 = 0;
+
+		res2_8  = 0;
+		res2_9  = 0;
+		res2_10 = 0;
+		res2_11 = 0;
+		res2_12 = 0;
+		res2_13 = 0;
+		res2_14 = 0;
+		res2_15 = 0;
+
+		res3_0 = 0;
+		res3_1 = 0;
+		res3_2 = 0;
+		res3_3 = 0;
+		res3_4 = 0;
+		res3_5 = 0;
+		res3_6 = 0;
+		res3_7 = 0;
+
+		res3_8  = 0;
+		res3_9  = 0;
+		res3_10 = 0;
+		res3_11 = 0;
+		res3_12 = 0;
+		res3_13 = 0;
+		res3_14 = 0;
+		res3_15 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+16;	// number of values in A
+#else
+		temp = off+4;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+			res2_1 += a1*b2;
+			res3_1 += a1*b3;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+			res1_2 += a0*b1;
+			res2_2 += a0*b2;
+			res3_2 += a0*b3;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+			res1_3 += a1*b1;
+			res2_3 += a1*b2;
+			res3_3 += a1*b3;
+
+			a0 = ptrba[4];
+			res0_4 += a0*b0;
+			res1_4 += a0*b1;
+			res2_4 += a0*b2;
+			res3_4 += a0*b3;
+
+			a1 = ptrba[5];
+			res0_5 += a1*b0;
+			res1_5 += a1*b1;
+			res2_5 += a1*b2;
+			res3_5 += a1*b3;
+
+			a0 = ptrba[6];
+			res0_6 += a0*b0;
+			res1_6 += a0*b1;
+			res2_6 += a0*b2;
+			res3_6 += a0*b3;
+
+			a1 = ptrba[7];
+			res0_7 += a1*b0;
+			res1_7 += a1*b1;
+			res2_7 += a1*b2;
+			res3_7 += a1*b3;
+
+			a0 = ptrba[8];
+			res0_8 += a0*b0;
+			res1_8 += a0*b1;
+			res2_8 += a0*b2;
+			res3_8 += a0*b3;
+
+			a1 = ptrba[9];
+			res0_9 += a1*b0;
+			res1_9 += a1*b1;
+			res2_9 += a1*b2;
+			res3_9 += a1*b3;
+
+			a0 = ptrba[10];
+			res0_10 += a0*b0;
+			res1_10 += a0*b1;
+			res2_10 += a0*b2;
+			res3_10 += a0*b3;
+
+			a1 = ptrba[11];
+			res0_11 += a1*b0;
+			res1_11 += a1*b1;
+			res2_11 += a1*b2;
+			res3_11 += a1*b3;
+
+			a0 = ptrba[12];
+			res0_12 += a0*b0;
+			res1_12 += a0*b1;
+			res2_12 += a0*b2;
+			res3_12 += a0*b3;
+
+			a1 = ptrba[13];
+			res0_13 += a1*b0;
+			res1_13 += a1*b1;
+			res2_13 += a1*b2;
+			res3_13 += a1*b3;
+
+			a0 = ptrba[14];
+			res0_14 += a0*b0;
+			res1_14 += a0*b1;
+			res2_14 += a0*b2;
+			res3_14 += a0*b3;
+
+			a1 = ptrba[15];
+			res0_15 += a1*b0;
+			res1_15 += a1*b1;
+			res2_15 += a1*b2;
+			res3_15 += a1*b3;
+
+
+			ptrba = ptrba+16;
+			ptrbb = ptrbb+4;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+		res0_4 *= alpha;
+		res0_5 *= alpha;
+		res0_6 *= alpha;
+		res0_7 *= alpha;
+
+		res0_8  *= alpha;
+		res0_9  *= alpha;
+		res0_10 *= alpha;
+		res0_11 *= alpha;
+		res0_12 *= alpha;
+		res0_13 *= alpha;
+		res0_14 *= alpha;
+		res0_15 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		res1_2 *= alpha;
+		res1_3 *= alpha;
+		res1_4 *= alpha;
+		res1_5 *= alpha;
+		res1_6 *= alpha;
+		res1_7 *= alpha;
+
+		res1_8  *= alpha;
+		res1_9  *= alpha;
+		res1_10 *= alpha;
+		res1_11 *= alpha;
+		res1_12 *= alpha;
+		res1_13 *= alpha;
+		res1_14 *= alpha;
+		res1_15 *= alpha;
+		
+		res2_0 *= alpha;
+		res2_1 *= alpha;
+		res2_2 *= alpha;
+		res2_3 *= alpha;
+		res2_4 *= alpha;
+		res2_5 *= alpha;
+		res2_6 *= alpha;
+		res2_7 *= alpha;
+
+		res2_8  *= alpha;
+		res2_9  *= alpha;
+		res2_10 *= alpha;
+		res2_11 *= alpha;
+		res2_12 *= alpha;
+		res2_13 *= alpha;
+		res2_14 *= alpha;
+		res2_15 *= alpha;
+
+		res3_0 *= alpha;
+		res3_1 *= alpha;
+		res3_2 *= alpha;
+		res3_3 *= alpha;
+		res3_4 *= alpha;
+		res3_5 *= alpha;
+		res3_6 *= alpha;
+		res3_7 *= alpha;
+
+		res3_8  *= alpha;
+		res3_9  *= alpha;
+		res3_10 *= alpha;
+		res3_11 *= alpha;
+		res3_12 *= alpha;
+		res3_13 *= alpha;
+		res3_14 *= alpha;
+		res3_15 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+		C0[4] = res0_4;
+		C0[5] = res0_5;
+		C0[6] = res0_6;
+		C0[7] = res0_7;
+
+		C0[8]  = res0_8;
+		C0[9]  = res0_9;
+		C0[10] = res0_10;
+		C0[11] = res0_11;
+		C0[12] = res0_12;
+		C0[13] = res0_13;
+		C0[14] = res0_14;
+		C0[15] = res0_15;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+		C1[2] = res1_2;
+		C1[3] = res1_3;
+		C1[4] = res1_4;
+		C1[5] = res1_5;
+		C1[6] = res1_6;
+		C1[7] = res1_7;
+
+		C1[8]  = res1_8;
+		C1[9]  = res1_9;
+		C1[10] = res1_10;
+		C1[11] = res1_11;
+		C1[12] = res1_12;
+		C1[13] = res1_13;
+		C1[14] = res1_14;
+		C1[15] = res1_15;
+
+		C2[0] = res2_0;
+		C2[1] = res2_1;
+		C2[2] = res2_2;
+		C2[3] = res2_3;
+		C2[4] = res2_4;
+		C2[5] = res2_5;
+		C2[6] = res2_6;
+		C2[7] = res2_7;
+
+		C2[8]  = res2_8;
+		C2[9]  = res2_9;
+		C2[10] = res2_10;
+		C2[11] = res2_11;
+		C2[12] = res2_12;
+		C2[13] = res2_13;
+		C2[14] = res2_14;
+		C2[15] = res2_15;
+
+		C3[0] = res3_0;
+		C3[1] = res3_1;
+		C3[2] = res3_2;
+		C3[3] = res3_3;
+		C3[4] = res3_4;
+		C3[5] = res3_5;
+		C3[6] = res3_6;
+		C3[7] = res3_7;
+
+		C3[8]  = res3_8;
+		C3[9]  = res3_9;
+		C3[10] = res3_10;
+		C3[11] = res3_11;
+		C3[12] = res3_12;
+		C3[13] = res3_13;
+		C3[14] = res3_14;
+		C3[15] = res3_15;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 16; // number of values in A
+#else
+		temp -= 4; // number of values in B
+#endif
+		ptrba += temp*16;
+		ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+		off += 16; // number of values in A
+#endif
+
+		C0 = C0+16;
+		C1 = C1+16;
+		C2 = C2+16;
+		C3 = C3+16;
+	}
+
+
+        if ( bm & 8)
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*8;
+		ptrbb = bb + off*4;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+		res0_4 = 0;
+		res0_5 = 0;
+		res0_6 = 0;
+		res0_7 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		res1_2 = 0;
+		res1_3 = 0;
+		res1_4 = 0;
+		res1_5 = 0;
+		res1_6 = 0;
+		res1_7 = 0;
+
+		res2_0 = 0;
+		res2_1 = 0;
+		res2_2 = 0;
+		res2_3 = 0;
+		res2_4 = 0;
+		res2_5 = 0;
+		res2_6 = 0;
+		res2_7 = 0;
+
+		res3_0 = 0;
+		res3_1 = 0;
+		res3_2 = 0;
+		res3_3 = 0;
+		res3_4 = 0;
+		res3_5 = 0;
+		res3_6 = 0;
+		res3_7 = 0;
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+8;	// number of values in A
+#else
+		temp = off+4;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+			res2_1 += a1*b2;
+			res3_1 += a1*b3;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+			res1_2 += a0*b1;
+			res2_2 += a0*b2;
+			res3_2 += a0*b3;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+			res1_3 += a1*b1;
+			res2_3 += a1*b2;
+			res3_3 += a1*b3;
+
+			a0 = ptrba[4];
+			res0_4 += a0*b0;
+			res1_4 += a0*b1;
+			res2_4 += a0*b2;
+			res3_4 += a0*b3;
+
+			a1 = ptrba[5];
+			res0_5 += a1*b0;
+			res1_5 += a1*b1;
+			res2_5 += a1*b2;
+			res3_5 += a1*b3;
+
+			a0 = ptrba[6];
+			res0_6 += a0*b0;
+			res1_6 += a0*b1;
+			res2_6 += a0*b2;
+			res3_6 += a0*b3;
+
+			a1 = ptrba[7];
+			res0_7 += a1*b0;
+			res1_7 += a1*b1;
+			res2_7 += a1*b2;
+			res3_7 += a1*b3;
+
+			ptrba = ptrba+8;
+			ptrbb = ptrbb+4;
+
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+		res0_4 *= alpha;
+		res0_5 *= alpha;
+		res0_6 *= alpha;
+		res0_7 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		res1_2 *= alpha;
+		res1_3 *= alpha;
+		res1_4 *= alpha;
+		res1_5 *= alpha;
+		res1_6 *= alpha;
+		res1_7 *= alpha;
+
+		res2_0 *= alpha;
+		res2_1 *= alpha;
+		res2_2 *= alpha;
+		res2_3 *= alpha;
+		res2_4 *= alpha;
+		res2_5 *= alpha;
+		res2_6 *= alpha;
+		res2_7 *= alpha;
+
+		res3_0 *= alpha;
+		res3_1 *= alpha;
+		res3_2 *= alpha;
+		res3_3 *= alpha;
+		res3_4 *= alpha;
+		res3_5 *= alpha;
+		res3_6 *= alpha;
+		res3_7 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+		C0[4] = res0_4;
+		C0[5] = res0_5;
+		C0[6] = res0_6;
+		C0[7] = res0_7;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+		C1[2] = res1_2;
+		C1[3] = res1_3;
+		C1[4] = res1_4;
+		C1[5] = res1_5;
+		C1[6] = res1_6;
+		C1[7] = res1_7;
+
+		C2[0] = res2_0;
+		C2[1] = res2_1;
+		C2[2] = res2_2;
+		C2[3] = res2_3;
+		C2[4] = res2_4;
+		C2[5] = res2_5;
+		C2[6] = res2_6;
+		C2[7] = res2_7;
+
+		C3[0] = res3_0;
+		C3[1] = res3_1;
+		C3[2] = res3_2;
+		C3[3] = res3_3;
+		C3[4] = res3_4;
+		C3[5] = res3_5;
+		C3[6] = res3_6;
+		C3[7] = res3_7;
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 8; // number of values in A
+#else
+		temp -= 4; // number of values in B
+#endif
+		ptrba += temp*8;
+		ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+		off += 8; // number of values in A
+#endif
+
+		C0 = C0+8;
+		C1 = C1+8;
+		C2 = C2+8;
+		C3 = C3+8;
+	}
+
+	if ( bm & 4 )
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*4;
+		ptrbb = bb + off*4;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		res1_2 = 0;
+		res1_3 = 0;
+		
+		res2_0 = 0;
+		res2_1 = 0;
+		res2_2 = 0;
+		res2_3 = 0;
+
+		res3_0 = 0;
+		res3_1 = 0;
+		res3_2 = 0;
+		res3_3 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+4;	// number of values in A
+#else
+		temp = off+4;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+			res2_1 += a1*b2;
+			res3_1 += a1*b3;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+			res1_2 += a0*b1;
+			res2_2 += a0*b2;
+			res3_2 += a0*b3;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+			res1_3 += a1*b1;
+			res2_3 += a1*b2;
+			res3_3 += a1*b3;
+
+			ptrba = ptrba+4;
+			ptrbb = ptrbb+4;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		res1_2 *= alpha;
+		res1_3 *= alpha;
+
+		res2_0 *= alpha;
+		res2_1 *= alpha;
+		res2_2 *= alpha;
+		res2_3 *= alpha;
+
+		res3_0 *= alpha;
+		res3_1 *= alpha;
+		res3_2 *= alpha;
+		res3_3 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+		C1[2] = res1_2;
+		C1[3] = res1_3;
+
+
+		C2[0] = res2_0;
+		C2[1] = res2_1;
+		C2[2] = res2_2;
+		C2[3] = res2_3;
+
+		C3[0] = res3_0;
+		C3[1] = res3_1;
+		C3[2] = res3_2;
+		C3[3] = res3_3;
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 4; // number of values in A
+#else
+		temp -= 4; // number of values in B
+#endif
+		ptrba += temp*4;
+		ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+		off += 4; // number of values in A
+#endif
+
+		C0 = C0+4;
+		C1 = C1+4;
+		C2 = C2+4;
+		C3 = C3+4;
+	}
+
+	if ( bm & 2 )
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*2;
+		ptrbb = bb + off*4;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		
+		res2_0 = 0;
+		res2_1 = 0;
+
+		res3_0 = 0;
+		res3_1 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+2;	// number of values in A
+#else
+		temp = off+4;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+			res2_1 += a1*b2;
+			res3_1 += a1*b3;
+
+			ptrba = ptrba+2;
+			ptrbb = ptrbb+4;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+
+		res2_0 *= alpha;
+		res2_1 *= alpha;
+
+		res3_0 *= alpha;
+		res3_1 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+
+		C2[0] = res2_0;
+		C2[1] = res2_1;
+
+		C3[0] = res3_0;
+		C3[1] = res3_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 2; // number of values in A
+#else
+		temp -= 4; // number of values in B
+#endif
+		ptrba += temp*2;
+		ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+		off += 2; // number of values in A
+#endif
+
+		C0 = C0+2;
+		C1 = C1+2;
+		C2 = C2+2;
+		C3 = C3+2;
+	}
+	
+	if ( bm & 1 )
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*1;
+		ptrbb = bb + off*4;
+#endif
+
+		res0_0 = 0;
+		res1_0 = 0;
+		res2_0 = 0;
+		res3_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+1;	// number of values in A
+#else
+		temp = off+4;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+
+			ptrba = ptrba+1;
+			ptrbb = ptrbb+4;
+                }
+		res0_0 *= alpha;
+		res1_0 *= alpha;
+		res2_0 *= alpha;
+		res3_0 *= alpha;
+
+		C0[0] = res0_0;
+		C1[0] = res1_0;
+		C2[0] = res2_0;
+		C3[0] = res3_0;
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 1; // number of values in A
+#else
+		temp -= 4; // number of values in B
+#endif
+		ptrba += temp*1;
+		ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+		off += 1; // number of values in A
+#endif
+
+		C0 = C0+1;
+		C1 = C1+1;
+		C2 = C2+1;
+		C3 = C3+1;
+
+	}
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+		off += 4;
+#endif
+
+        k = (bk<<2);
+        bb = bb+k;
+        i = (ldc<<2);
+        C = C+i;
+    }
+
+
+   if(bn&2)
+   {
+        C0 = C;
+        C1 = C0+ldc;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+	off = offset;
+#endif
+
+
+        ptrba = ba;
+
+
+        for (i=0; i<bm/16; i+=1)
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*16;
+		ptrbb = bb + off*2;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+		res0_4 = 0;
+		res0_5 = 0;
+		res0_6 = 0;
+		res0_7 = 0;
+
+		res0_8  = 0;
+		res0_9  = 0;
+		res0_10 = 0;
+		res0_11 = 0;
+		res0_12 = 0;
+		res0_13 = 0;
+		res0_14 = 0;
+		res0_15 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		res1_2 = 0;
+		res1_3 = 0;
+		res1_4 = 0;
+		res1_5 = 0;
+		res1_6 = 0;
+		res1_7 = 0;
+
+		res1_8  = 0;
+		res1_9  = 0;
+		res1_10 = 0;
+		res1_11 = 0;
+		res1_12 = 0;
+		res1_13 = 0;
+		res1_14 = 0;
+		res1_15 = 0;
+
+
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+16;	// number of values in A
+#else
+		temp = off+2;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+			res1_2 += a0*b1;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+			res1_3 += a1*b1;
+
+			a0 = ptrba[4];
+			res0_4 += a0*b0;
+			res1_4 += a0*b1;
+
+			a1 = ptrba[5];
+			res0_5 += a1*b0;
+			res1_5 += a1*b1;
+
+			a0 = ptrba[6];
+			res0_6 += a0*b0;
+			res1_6 += a0*b1;
+
+			a1 = ptrba[7];
+			res0_7 += a1*b0;
+			res1_7 += a1*b1;
+
+			a0 = ptrba[8];
+			res0_8 += a0*b0;
+			res1_8 += a0*b1;
+
+			a1 = ptrba[9];
+			res0_9 += a1*b0;
+			res1_9 += a1*b1;
+
+			a0 = ptrba[10];
+			res0_10 += a0*b0;
+			res1_10 += a0*b1;
+
+			a1 = ptrba[11];
+			res0_11 += a1*b0;
+			res1_11 += a1*b1;
+
+			a0 = ptrba[12];
+			res0_12 += a0*b0;
+			res1_12 += a0*b1;
+
+			a1 = ptrba[13];
+			res0_13 += a1*b0;
+			res1_13 += a1*b1;
+
+			a0 = ptrba[14];
+			res0_14 += a0*b0;
+			res1_14 += a0*b1;
+
+			a1 = ptrba[15];
+			res0_15 += a1*b0;
+			res1_15 += a1*b1;
+
+
+			ptrba = ptrba+16;
+			ptrbb = ptrbb+2;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+		res0_4 *= alpha;
+		res0_5 *= alpha;
+		res0_6 *= alpha;
+		res0_7 *= alpha;
+
+		res0_8  *= alpha;
+		res0_9  *= alpha;
+		res0_10 *= alpha;
+		res0_11 *= alpha;
+		res0_12 *= alpha;
+		res0_13 *= alpha;
+		res0_14 *= alpha;
+		res0_15 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		res1_2 *= alpha;
+		res1_3 *= alpha;
+		res1_4 *= alpha;
+		res1_5 *= alpha;
+		res1_6 *= alpha;
+		res1_7 *= alpha;
+
+		res1_8  *= alpha;
+		res1_9  *= alpha;
+		res1_10 *= alpha;
+		res1_11 *= alpha;
+		res1_12 *= alpha;
+		res1_13 *= alpha;
+		res1_14 *= alpha;
+		res1_15 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+		C0[4] = res0_4;
+		C0[5] = res0_5;
+		C0[6] = res0_6;
+		C0[7] = res0_7;
+
+		C0[8]  = res0_8;
+		C0[9]  = res0_9;
+		C0[10] = res0_10;
+		C0[11] = res0_11;
+		C0[12] = res0_12;
+		C0[13] = res0_13;
+		C0[14] = res0_14;
+		C0[15] = res0_15;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+		C1[2] = res1_2;
+		C1[3] = res1_3;
+		C1[4] = res1_4;
+		C1[5] = res1_5;
+		C1[6] = res1_6;
+		C1[7] = res1_7;
+
+		C1[8]  = res1_8;
+		C1[9]  = res1_9;
+		C1[10] = res1_10;
+		C1[11] = res1_11;
+		C1[12] = res1_12;
+		C1[13] = res1_13;
+		C1[14] = res1_14;
+		C1[15] = res1_15;
+
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 16; // number of values in A
+#else
+		temp -= 2; // number of values in B
+#endif
+		ptrba += temp*16;
+		ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+		off += 16; // number of values in A
+#endif
+
+		C0 = C0+16;
+		C1 = C1+16;
+	}
+
+
+
+
+        if ( bm & 8)
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*8;
+		ptrbb = bb + off*2;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+		res0_4 = 0;
+		res0_5 = 0;
+		res0_6 = 0;
+		res0_7 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		res1_2 = 0;
+		res1_3 = 0;
+		res1_4 = 0;
+		res1_5 = 0;
+		res1_6 = 0;
+		res1_7 = 0;
+
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+8;	// number of values in A
+#else
+		temp = off+2;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+			res1_2 += a0*b1;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+			res1_3 += a1*b1;
+
+			a0 = ptrba[4];
+			res0_4 += a0*b0;
+			res1_4 += a0*b1;
+
+			a1 = ptrba[5];
+			res0_5 += a1*b0;
+			res1_5 += a1*b1;
+
+			a0 = ptrba[6];
+			res0_6 += a0*b0;
+			res1_6 += a0*b1;
+
+			a1 = ptrba[7];
+			res0_7 += a1*b0;
+			res1_7 += a1*b1;
+
+			ptrba = ptrba+8;
+			ptrbb = ptrbb+2;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+		res0_4 *= alpha;
+		res0_5 *= alpha;
+		res0_6 *= alpha;
+		res0_7 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		res1_2 *= alpha;
+		res1_3 *= alpha;
+		res1_4 *= alpha;
+		res1_5 *= alpha;
+		res1_6 *= alpha;
+		res1_7 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+		C0[4] = res0_4;
+		C0[5] = res0_5;
+		C0[6] = res0_6;
+		C0[7] = res0_7;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+		C1[2] = res1_2;
+		C1[3] = res1_3;
+		C1[4] = res1_4;
+		C1[5] = res1_5;
+		C1[6] = res1_6;
+		C1[7] = res1_7;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 8; // number of values in A
+#else
+		temp -= 2; // number of values in B
+#endif
+		ptrba += temp*8;
+		ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+		off += 8; // number of values in A
+#endif
+
+		C0 = C0+8;
+		C1 = C1+8;
+	}
+
+	if ( bm & 4 )
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*4;
+		ptrbb = bb + off*2;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		res1_2 = 0;
+		res1_3 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+4;	// number of values in A
+#else
+		temp = off+2;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+			res1_2 += a0*b1;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+			res1_3 += a1*b1;
+
+			ptrba = ptrba+4;
+			ptrbb = ptrbb+2;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		res1_2 *= alpha;
+		res1_3 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+		C1[2] = res1_2;
+		C1[3] = res1_3;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 4; // number of values in A
+#else
+		temp -= 2; // number of values in B
+#endif
+		ptrba += temp*4;
+		ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+		off += 4; // number of values in A
+#endif
+
+		C0 = C0+4;
+		C1 = C1+4;
+
+	}
+
+	if ( bm & 2 )
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*2;
+		ptrbb = bb + off*2;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+2;	// number of values in A
+#else
+		temp = off+2;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+
+			ptrba = ptrba+2;
+			ptrbb = ptrbb+2;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 2; // number of values in A
+#else
+		temp -= 2; // number of values in B
+#endif
+		ptrba += temp*2;
+		ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+		off += 2; // number of values in A
+#endif
+
+		C0 = C0+2;
+		C1 = C1+2;
+
+	}
+
+	if ( bm & 1 )
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*1;
+		ptrbb = bb + off*2;
+#endif
+
+		res0_0 = 0;
+
+		res1_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+1;	// number of values in A
+#else
+		temp = off+2;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+
+			ptrba = ptrba+1;
+			ptrbb = ptrbb+2;
+                }
+
+		res0_0 *= alpha;
+
+		res1_0 *= alpha;
+
+		C0[0] = res0_0;
+
+		C1[0] = res1_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 1; // number of values in A
+#else
+		temp -= 2; // number of values in B
+#endif
+		ptrba += temp*1;
+		ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+		off += 1; // number of values in A
+#endif
+
+		C0 = C0+1;
+		C1 = C1+1;
+
+	}
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+		off += 2;
+#endif
+
+        k = (bk<<1);
+        bb = bb+k;
+        i = (ldc<<1);
+        C = C+i;
+    }
+
+
+   for (j=0; j<(bn&1); j+=1)
+   {
+        C0 = C;
+
+#if defined(TRMMKERNEL) &&  defined(LEFT)
+	off = offset;
+#endif
+
+        ptrba = ba;
+
+
+        for (i=0; i<bm/16; i+=1)
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*16;
+		ptrbb = bb + off*1;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+		res0_4 = 0;
+		res0_5 = 0;
+		res0_6 = 0;
+		res0_7 = 0;
+
+		res0_8  = 0;
+		res0_9  = 0;
+		res0_10 = 0;
+		res0_11 = 0;
+		res0_12 = 0;
+		res0_13 = 0;
+		res0_14 = 0;
+		res0_15 = 0;
+
+
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+16;	// number of values in A
+#else
+		temp = off+1;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+
+			a0 = ptrba[4];
+			res0_4 += a0*b0;
+
+			a1 = ptrba[5];
+			res0_5 += a1*b0;
+
+			a0 = ptrba[6];
+			res0_6 += a0*b0;
+
+			a1 = ptrba[7];
+			res0_7 += a1*b0;
+
+			a0 = ptrba[8];
+			res0_8 += a0*b0;
+
+			a1 = ptrba[9];
+			res0_9 += a1*b0;
+
+			a0 = ptrba[10];
+			res0_10 += a0*b0;
+
+			a1 = ptrba[11];
+			res0_11 += a1*b0;
+
+			a0 = ptrba[12];
+			res0_12 += a0*b0;
+
+			a1 = ptrba[13];
+			res0_13 += a1*b0;
+
+			a0 = ptrba[14];
+			res0_14 += a0*b0;
+
+			a1 = ptrba[15];
+			res0_15 += a1*b0;
+
+
+			ptrba = ptrba+16;
+			ptrbb = ptrbb+1;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+		res0_4 *= alpha;
+		res0_5 *= alpha;
+		res0_6 *= alpha;
+		res0_7 *= alpha;
+
+		res0_8  *= alpha;
+		res0_9  *= alpha;
+		res0_10 *= alpha;
+		res0_11 *= alpha;
+		res0_12 *= alpha;
+		res0_13 *= alpha;
+		res0_14 *= alpha;
+		res0_15 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+		C0[4] = res0_4;
+		C0[5] = res0_5;
+		C0[6] = res0_6;
+		C0[7] = res0_7;
+
+		C0[8]  = res0_8;
+		C0[9]  = res0_9;
+		C0[10] = res0_10;
+		C0[11] = res0_11;
+		C0[12] = res0_12;
+		C0[13] = res0_13;
+		C0[14] = res0_14;
+		C0[15] = res0_15;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 16; // number of values in A
+#else
+		temp -= 1; // number of values in B
+#endif
+		ptrba += temp*16;
+		ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+		off += 16; // number of values in A
+#endif
+
+		C0 = C0+16;
+	}
+
+
+
+
+        if ( bm & 8 )
+        {
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*8;
+		ptrbb = bb + off*1;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+		res0_4 = 0;
+		res0_5 = 0;
+		res0_6 = 0;
+		res0_7 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+8;	// number of values in A
+#else
+		temp = off+1;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+
+			a0 = ptrba[4];
+			res0_4 += a0*b0;
+
+			a1 = ptrba[5];
+			res0_5 += a1*b0;
+
+			a0 = ptrba[6];
+			res0_6 += a0*b0;
+
+			a1 = ptrba[7];
+			res0_7 += a1*b0;
+
+			ptrba = ptrba+8;
+			ptrbb = ptrbb+1;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+		res0_4 *= alpha;
+		res0_5 *= alpha;
+		res0_6 *= alpha;
+		res0_7 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+		C0[4] = res0_4;
+		C0[5] = res0_5;
+		C0[6] = res0_6;
+		C0[7] = res0_7;
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 8; // number of values in A
+#else
+		temp -= 1; // number of values in B
+#endif
+		ptrba += temp*8;
+		ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+		off += 8; // number of values in A
+#endif
+
+		C0 = C0+8;
+	}
+
+	if ( bm & 4 )
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*4;
+		ptrbb = bb + off*1;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+4;	// number of values in A
+#else
+		temp = off+1;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+
+			ptrba = ptrba+4;
+			ptrbb = ptrbb+1;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 4; // number of values in A
+#else
+		temp -= 1; // number of values in B
+#endif
+		ptrba += temp*4;
+		ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+		off += 4; // number of values in A
+#endif
+
+		C0 = C0+4;
+
+	}
+
+	if ( bm & 2 )
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*2;
+		ptrbb = bb + off*1;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+2;	// number of values in A
+#else
+		temp = off+1;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+
+			ptrba = ptrba+2;
+			ptrbb = ptrbb+1;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 2; // number of values in A
+#else
+		temp -= 1; // number of values in B
+#endif
+		ptrba += temp*2;
+		ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+		off += 2; // number of values in A
+#endif
+
+		C0 = C0+2;
+
+	}
+
+	if ( bm & 1 )
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*1;
+		ptrbb = bb + off*1;
+#endif
+
+		res0_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+1;	// number of values in A
+#else
+		temp = off+1;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+
+			ptrba = ptrba+1;
+			ptrbb = ptrbb+1;
+                }
+
+		res0_0 *= alpha;
+
+		C0[0] = res0_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 1; // number of values in A
+#else
+		temp -= 1; // number of values in B
+#endif
+		ptrba += temp*1;
+		ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+		off += 1; // number of values in A
+#endif
+
+		C0 = C0+1;
+
+	}
+
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+		off += 1;
+#endif
+
+        k = (bk<<0);
+        bb = bb+k;
+        C = C+ldc;
+   }
+
+   return 0;
+}
diff --git a/kernel/generic/zhemm_ltcopy_16.c b/kernel/generic/zhemm_ltcopy_16.c
new file mode 100644
index 000000000..8797891ea
--- /dev/null
+++ b/kernel/generic/zhemm_ltcopy_16.c
@@ -0,0 +1,1170 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+  BLASLONG i, js, offset;
+
+  FLOAT data01, data02, data03, data04, data05, data06, data07, data08;
+  FLOAT data09, data10, data11, data12, data13, data14, data15, data16;
+  FLOAT data17, data18, data19, data20, data21, data22, data23, data24;
+  FLOAT data25, data26, data27, data28, data29, data30, data31, data32;
+
+  FLOAT *ao1, *ao2,  *ao3,  *ao4,  *ao5,  *ao6,  *ao7,  *ao8;
+  FLOAT *ao9, *ao10, *ao11, *ao12, *ao13, *ao14, *ao15, *ao16;
+
+  lda *= 2;
+
+  js = (n >> 4);
+  while (js > 0){
+
+    offset = posX - posY;
+
+    if (offset >  0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
+    if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
+    if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda;
+    if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda;
+    if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda;
+    if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda;
+    if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda;
+    if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda;
+	if (offset > -8) ao9 = a + (posX + 8) * 2 + posY * lda; else ao9 = a + posY * 2 + (posX + 8) * lda;
+    if (offset > -9) ao10 = a + (posX + 9) * 2 + posY * lda; else ao10 = a + posY * 2 + (posX + 9) * lda;
+    if (offset > -10) ao11 = a + (posX + 10) * 2 + posY * lda; else ao11 = a + posY * 2 + (posX + 10) * lda;
+    if (offset > -11) ao12 = a + (posX + 11) * 2 + posY * lda; else ao12 = a + posY * 2 + (posX + 11) * lda;
+    if (offset > -12) ao13 = a + (posX + 12) * 2 + posY * lda; else ao13 = a + posY * 2 + (posX + 12) * lda;
+    if (offset > -13) ao14 = a + (posX + 13) * 2 + posY * lda; else ao14 = a + posY * 2 + (posX + 13) * lda;
+    if (offset > -14) ao15 = a + (posX + 14) * 2 + posY * lda; else ao15 = a + posY * 2 + (posX + 14) * lda;
+    if (offset > -15) ao16 = a + (posX + 15) * 2 + posY * lda; else ao16 = a + posY * 2 + (posX + 15) * lda;
+
+    i     = m;
+
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+      data03 = *(ao2 + 0);
+      data04 = *(ao2 + 1);
+      data05 = *(ao3 + 0);
+      data06 = *(ao3 + 1);
+      data07 = *(ao4 + 0);
+      data08 = *(ao4 + 1);
+      data09 = *(ao5 + 0);
+      data10 = *(ao5 + 1);
+      data11 = *(ao6 + 0);
+      data12 = *(ao6 + 1);
+      data13 = *(ao7 + 0);
+      data14 = *(ao7 + 1);
+      data15 = *(ao8 + 0);
+      data16 = *(ao8 + 1);
+	  data17 = *(ao9 + 0);
+      data18 = *(ao9 + 1);
+      data19 = *(ao10 + 0);
+      data20 = *(ao10 + 1);
+      data21 = *(ao11 + 0);
+      data22 = *(ao11 + 1);
+      data23 = *(ao12 + 0);
+      data24 = *(ao12 + 1);
+      data25 = *(ao13 + 0);
+      data26 = *(ao13 + 1);
+      data27 = *(ao14 + 0);
+      data28 = *(ao14 + 1);
+      data29 = *(ao15 + 0);
+      data30 = *(ao15 + 1);
+      data31 = *(ao16 + 0);
+      data32 = *(ao16 + 1);
+
+      if (offset >   0) ao1 += lda; else ao1 += 2;
+      if (offset >  -1) ao2 += lda; else ao2 += 2;
+      if (offset >  -2) ao3 += lda; else ao3 += 2;
+      if (offset >  -3) ao4 += lda; else ao4 += 2;
+      if (offset >  -4) ao5 += lda; else ao5 += 2;
+      if (offset >  -5) ao6 += lda; else ao6 += 2;
+      if (offset >  -6) ao7 += lda; else ao7 += 2;
+      if (offset >  -7) ao8 += lda; else ao8 += 2;
+	  if (offset >  -8) ao9 += lda; else ao9 += 2;
+      if (offset >  -9) ao10 += lda; else ao10 += 2;
+      if (offset >  -10) ao11 += lda; else ao11 += 2;
+      if (offset >  -11) ao12 += lda; else ao12 += 2;
+      if (offset >  -12) ao13 += lda; else ao13 += 2;
+      if (offset >  -13) ao14 += lda; else ao14 += 2;
+      if (offset >  -14) ao15 += lda; else ao15 += 2;
+      if (offset >  -15) ao16 += lda; else ao16 += 2;
+
+      if (offset > 0) {
+	b[ 0] = data01;
+	b[ 1] = data02;
+	b[ 2] = data03;
+	b[ 3] = data04;
+	b[ 4] = data05;
+	b[ 5] = data06;
+	b[ 6] = data07;
+	b[ 7] = data08;
+	b[ 8] = data09;
+	b[ 9] = data10;
+	b[10] = data11;
+	b[11] = data12;
+	b[12] = data13;
+	b[13] = data14;
+	b[14] = data15;
+	b[15] = data16;
+	b[16] = data17;
+	b[17] = data18;
+	b[18] = data19;
+	b[19] = data20;
+	b[20] = data21;
+	b[21] = data22;
+	b[22] = data23;
+	b[23] = data24;
+	b[24] = data25;
+	b[25] = data26;
+	b[26] = data27;
+	b[27] = data28;
+	b[28] = data29;
+	b[29] = data30;
+	b[30] = data31;
+	b[31] = data32;
+      } else
+	if (offset < -15) {
+	  b[ 0] = data01;
+	  b[ 1] = -data02;
+	  b[ 2] = data03;
+	  b[ 3] = -data04;
+	  b[ 4] = data05;
+	  b[ 5] = -data06;
+	  b[ 6] = data07;
+	  b[ 7] = -data08;
+	  b[ 8] = data09;
+	  b[ 9] = -data10;
+	  b[10] = data11;
+	  b[11] = -data12;
+	  b[12] = data13;
+	  b[13] = -data14;
+	  b[14] = data15;
+	  b[15] = -data16;
+	b[16] = data17;
+	b[17] = -data18;
+	b[18] = data19;
+	b[19] = -data20;
+	b[20] = data21;
+	b[21] = -data22;
+	b[22] = data23;
+	b[23] = -data24;
+	b[24] = data25;
+	b[25] = -data26;
+	b[26] = data27;
+	b[27] = -data28;
+	b[28] = data29;
+	b[29] = -data30;
+	b[30] = data31;
+	b[31] = -data32;
+	} else {
+	  switch (offset) {
+	  case  0 :
+	    b[ 0] = data01;
+	    b[ 1] = ZERO;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+		b[16] = data17;
+		b[17] = data18;
+		b[18] = data19;
+		b[19] = data20;
+		b[20] = data21;
+		b[21] = data22;
+		b[22] = data23;
+		b[23] = data24;
+		b[24] = data25;
+		b[25] = data26;
+		b[26] = data27;
+		b[27] = data28;
+		b[28] = data29;
+		b[29] = data30;
+		b[30] = data31;
+		b[31] = data32;
+	    break;
+	  case -1 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = ZERO;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+		b[16] = data17;
+		b[17] = data18;
+		b[18] = data19;
+		b[19] = data20;
+		b[20] = data21;
+		b[21] = data22;
+		b[22] = data23;
+		b[23] = data24;
+		b[24] = data25;
+		b[25] = data26;
+		b[26] = data27;
+		b[27] = data28;
+		b[28] = data29;
+		b[29] = data30;
+		b[30] = data31;
+		b[31] = data32;
+	    break;
+	  case -2 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = ZERO;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+		b[16] = data17;
+		b[17] = data18;
+		b[18] = data19;
+		b[19] = data20;
+		b[20] = data21;
+		b[21] = data22;
+		b[22] = data23;
+		b[23] = data24;
+		b[24] = data25;
+		b[25] = data26;
+		b[26] = data27;
+		b[27] = data28;
+		b[28] = data29;
+		b[29] = data30;
+		b[30] = data31;
+		b[31] = data32;
+	    break;
+	  case -3 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = ZERO;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+		b[16] = data17;
+		b[17] = data18;
+		b[18] = data19;
+		b[19] = data20;
+		b[20] = data21;
+		b[21] = data22;
+		b[22] = data23;
+		b[23] = data24;
+		b[24] = data25;
+		b[25] = data26;
+		b[26] = data27;
+		b[27] = data28;
+		b[28] = data29;
+		b[29] = data30;
+		b[30] = data31;
+		b[31] = data32;
+	    break;
+	  case -4 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = ZERO;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+		b[16] = data17;
+		b[17] = data18;
+		b[18] = data19;
+		b[19] = data20;
+		b[20] = data21;
+		b[21] = data22;
+		b[22] = data23;
+		b[23] = data24;
+		b[24] = data25;
+		b[25] = data26;
+		b[26] = data27;
+		b[27] = data28;
+		b[28] = data29;
+		b[29] = data30;
+		b[30] = data31;
+		b[31] = data32;
+	    break;
+	  case -5 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = ZERO;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+		b[16] = data17;
+		b[17] = data18;
+		b[18] = data19;
+		b[19] = data20;
+		b[20] = data21;
+		b[21] = data22;
+		b[22] = data23;
+		b[23] = data24;
+		b[24] = data25;
+		b[25] = data26;
+		b[26] = data27;
+		b[27] = data28;
+		b[28] = data29;
+		b[29] = data30;
+		b[30] = data31;
+		b[31] = data32;
+	    break;
+	  case -6 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = ZERO;
+	    b[14] = data15;
+	    b[15] = data16;
+		b[16] = data17;
+		b[17] = data18;
+		b[18] = data19;
+		b[19] = data20;
+		b[20] = data21;
+		b[21] = data22;
+		b[22] = data23;
+		b[23] = data24;
+		b[24] = data25;
+		b[25] = data26;
+		b[26] = data27;
+		b[27] = data28;
+		b[28] = data29;
+		b[29] = data30;
+		b[30] = data31;
+		b[31] = data32;
+	    break;
+	  case -7 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = ZERO;
+		b[16] = data17;
+		b[17] = data18;
+		b[18] = data19;
+		b[19] = data20;
+		b[20] = data21;
+		b[21] = data22;
+		b[22] = data23;
+		b[23] = data24;
+		b[24] = data25;
+		b[25] = data26;
+		b[26] = data27;
+		b[27] = data28;
+		b[28] = data29;
+		b[29] = data30;
+		b[30] = data31;
+		b[31] = data32;
+	    break;
+	case -8 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+		b[16] = data17;
+		b[17] = ZERO;
+		b[18] = data19;
+		b[19] = data20;
+		b[20] = data21;
+		b[21] = data22;
+		b[22] = data23;
+		b[23] = data24;
+		b[24] = data25;
+		b[25] = data26;
+		b[26] = data27;
+		b[27] = data28;
+		b[28] = data29;
+		b[29] = data30;
+		b[30] = data31;
+		b[31] = data32;
+	    break;
+	case -9 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+		b[16] = data17;
+		b[17] = -data18;
+		b[18] = data19;
+		b[19] = ZERO;
+		b[20] = data21;
+		b[21] = data22;
+		b[22] = data23;
+		b[23] = data24;
+		b[24] = data25;
+		b[25] = data26;
+		b[26] = data27;
+		b[27] = data28;
+		b[28] = data29;
+		b[29] = data30;
+		b[30] = data31;
+		b[31] = data32;
+	    break;
+	case -10 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+		b[16] = data17;
+		b[17] = -data18;
+		b[18] = data19;
+		b[19] = -data20;
+		b[20] = data21;
+		b[21] = ZERO;
+		b[22] = data23;
+		b[23] = data24;
+		b[24] = data25;
+		b[25] = data26;
+		b[26] = data27;
+		b[27] = data28;
+		b[28] = data29;
+		b[29] = data30;
+		b[30] = data31;
+		b[31] = data32;
+	    break;
+	case -11 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+		b[16] = data17;
+		b[17] = -data18;
+		b[18] = data19;
+		b[19] = -data20;
+		b[20] = data21;
+		b[21] = -data22;
+		b[22] = data23;
+		b[23] = ZERO;
+		b[24] = data25;
+		b[25] = data26;
+		b[26] = data27;
+		b[27] = data28;
+		b[28] = data29;
+		b[29] = data30;
+		b[30] = data31;
+		b[31] = data32;
+	    break;
+	case -12 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+		b[16] = data17;
+		b[17] = -data18;
+		b[18] = data19;
+		b[19] = -data20;
+		b[20] = data21;
+		b[21] = -data22;
+		b[22] = data23;
+		b[23] = -data24;
+		b[24] = data25;
+		b[25] = ZERO;
+		b[26] = data27;
+		b[27] = data28;
+		b[28] = data29;
+		b[29] = data30;
+		b[30] = data31;
+		b[31] = data32;
+	    break;
+	case -13 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+		b[16] = data17;
+		b[17] = -data18;
+		b[18] = data19;
+		b[19] = -data20;
+		b[20] = data21;
+		b[21] = -data22;
+		b[22] = data23;
+		b[23] = -data24;
+		b[24] = data25;
+		b[25] = -data26;
+		b[26] = data27;
+		b[27] = ZERO;
+		b[28] = data29;
+		b[29] = data30;
+		b[30] = data31;
+		b[31] = data32;
+	    break;
+	case -14 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+		b[16] = data17;
+		b[17] = -data18;
+		b[18] = data19;
+		b[19] = -data20;
+		b[20] = data21;
+		b[21] = -data22;
+		b[22] = data23;
+		b[23] = -data24;
+		b[24] = data25;
+		b[25] = -data26;
+		b[26] = data27;
+		b[27] = -data28;
+		b[28] = data29;
+		b[29] = ZERO;
+		b[30] = data31;
+		b[31] = data32;
+	    break;
+	case -15 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+		b[16] = data17;
+		b[17] = -data18;
+		b[18] = data19;
+		b[19] = -data20;
+		b[20] = data21;
+		b[21] = -data22;
+		b[22] = data23;
+		b[23] = -data24;
+		b[24] = data25;
+		b[25] = -data26;
+		b[26] = data27;
+		b[27] = -data28;
+		b[28] = data29;
+		b[29] = -data30;
+		b[30] = data31;
+		b[31] = ZERO;
+	    break;
+	  }
+	}
+
+      b += 32;
+
+      offset --;
+      i --;
+    }
+
+    posX += 16;
+    js --;
+  }
+
+  if (n & 8) {
+	offset = posX - posY;
+
+    if (offset >  0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
+    if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
+    if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda;
+    if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda;
+    if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda;
+    if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda;
+    if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda;
+    if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda;
+
+    i     = m;
+
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+      data03 = *(ao2 + 0);
+      data04 = *(ao2 + 1);
+      data05 = *(ao3 + 0);
+      data06 = *(ao3 + 1);
+      data07 = *(ao4 + 0);
+      data08 = *(ao4 + 1);
+      data09 = *(ao5 + 0);
+      data10 = *(ao5 + 1);
+      data11 = *(ao6 + 0);
+      data12 = *(ao6 + 1);
+      data13 = *(ao7 + 0);
+      data14 = *(ao7 + 1);
+      data15 = *(ao8 + 0);
+      data16 = *(ao8 + 1);
+
+      if (offset >   0) ao1 += lda; else ao1 += 2;
+      if (offset >  -1) ao2 += lda; else ao2 += 2;
+      if (offset >  -2) ao3 += lda; else ao3 += 2;
+      if (offset >  -3) ao4 += lda; else ao4 += 2;
+      if (offset >  -4) ao5 += lda; else ao5 += 2;
+      if (offset >  -5) ao6 += lda; else ao6 += 2;
+      if (offset >  -6) ao7 += lda; else ao7 += 2;
+      if (offset >  -7) ao8 += lda; else ao8 += 2;
+
+      if (offset > 0) {
+	b[ 0] = data01;
+	b[ 1] = data02;
+	b[ 2] = data03;
+	b[ 3] = data04;
+	b[ 4] = data05;
+	b[ 5] = data06;
+	b[ 6] = data07;
+	b[ 7] = data08;
+	b[ 8] = data09;
+	b[ 9] = data10;
+	b[10] = data11;
+	b[11] = data12;
+	b[12] = data13;
+	b[13] = data14;
+	b[14] = data15;
+	b[15] = data16;
+      } else
+	if (offset < -7) {
+	  b[ 0] = data01;
+	  b[ 1] = -data02;
+	  b[ 2] = data03;
+	  b[ 3] = -data04;
+	  b[ 4] = data05;
+	  b[ 5] = -data06;
+	  b[ 6] = data07;
+	  b[ 7] = -data08;
+	  b[ 8] = data09;
+	  b[ 9] = -data10;
+	  b[10] = data11;
+	  b[11] = -data12;
+	  b[12] = data13;
+	  b[13] = -data14;
+	  b[14] = data15;
+	  b[15] = -data16;
+	} else {
+	  switch (offset) {
+	  case  0 :
+	    b[ 0] = data01;
+	    b[ 1] = ZERO;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+	    break;
+	  case -1 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = ZERO;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+	    break;
+	  case -2 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = ZERO;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+	    break;
+	  case -3 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = ZERO;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+	    break;
+	  case -4 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = ZERO;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+	    break;
+	  case -5 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = ZERO;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+	    break;
+	  case -6 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = ZERO;
+	    b[14] = data15;
+	    b[15] = data16;
+	    break;
+	  case -7 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = ZERO;
+	    break;
+	  }
+	}
+
+      b += 16;
+
+      offset --;
+      i --;
+    }
+
+    posX += 8;
+  }
+
+  if (n & 4) {
+
+    offset = posX - posY;
+
+    if (offset >  0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
+    if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
+    if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda;
+    if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda;
+
+    i     = m;
+
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+      data03 = *(ao2 + 0);
+      data04 = *(ao2 + 1);
+      data05 = *(ao3 + 0);
+      data06 = *(ao3 + 1);
+      data07 = *(ao4 + 0);
+      data08 = *(ao4 + 1);
+
+      if (offset >   0) ao1 += lda; else ao1 += 2;
+      if (offset >  -1) ao2 += lda; else ao2 += 2;
+      if (offset >  -2) ao3 += lda; else ao3 += 2;
+      if (offset >  -3) ao4 += lda; else ao4 += 2;
+
+      if (offset > 0) {
+	b[ 0] = data01;
+	b[ 1] = data02;
+	b[ 2] = data03;
+	b[ 3] = data04;
+	b[ 4] = data05;
+	b[ 5] = data06;
+	b[ 6] = data07;
+	b[ 7] = data08;
+      } else
+	if (offset < -3) {
+	  b[ 0] = data01;
+	  b[ 1] = -data02;
+	  b[ 2] = data03;
+	  b[ 3] = -data04;
+	  b[ 4] = data05;
+	  b[ 5] = -data06;
+	  b[ 6] = data07;
+	  b[ 7] = -data08;
+	} else {
+	  switch (offset) {
+	  case  0 :
+	    b[ 0] = data01;
+	    b[ 1] = ZERO;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    break;
+	  case -1 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = ZERO;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    break;
+	  case -2 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = ZERO;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    break;
+	  case -3 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = ZERO;
+	    break;
+	  }
+	}
+
+      b += 8;
+
+      offset --;
+      i --;
+    }
+
+    posX += 4;
+  }
+
+  if (n & 2) {
+
+    offset = posX - posY;
+
+    if (offset >  0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
+    if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
+
+    i     = m;
+
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+      data03 = *(ao2 + 0);
+      data04 = *(ao2 + 1);
+
+      if (offset >   0) ao1 += lda; else ao1 += 2;
+      if (offset >  -1) ao2 += lda; else ao2 += 2;
+
+      if (offset > 0) {
+	b[ 0] = data01;
+	b[ 1] = data02;
+	b[ 2] = data03;
+	b[ 3] = data04;
+      } else
+	if (offset < -1) {
+	  b[ 0] = data01;
+	  b[ 1] = -data02;
+	  b[ 2] = data03;
+	  b[ 3] = -data04;
+	} else {
+	  switch (offset) {
+	  case  0 :
+	    b[ 0] = data01;
+	    b[ 1] = ZERO;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    break;
+	  case -1 :
+	    b[ 0] = data01;
+	    b[ 1] = -data02;
+	    b[ 2] = data03;
+	    b[ 3] = ZERO;
+	    break;
+	  }
+	}
+
+      b += 4;
+
+      offset --;
+      i --;
+    }
+
+    posX += 2;
+
+  }
+
+  if (n & 1) {
+
+    offset = posX - posY;
+
+    if (offset >  0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
+
+    i     = m;
+
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+
+      if (offset >   0) ao1 += lda; else ao1 += 2;
+
+      if (offset > 0) {
+	b[ 0] = data01;
+	b[ 1] = data02;
+      } else
+	if (offset < 0) {
+	  b[ 0] = data01;
+	  b[ 1] = -data02;
+	} else {
+	    b[ 0] = data01;
+	    b[ 1] = ZERO;
+	}
+
+      b += 2;
+
+      offset --;
+      i --;
+    }
+
+  }
+
+  return 0;
+}
diff --git a/kernel/generic/zhemm_utcopy_16.c b/kernel/generic/zhemm_utcopy_16.c
new file mode 100644
index 000000000..822483a83
--- /dev/null
+++ b/kernel/generic/zhemm_utcopy_16.c
@@ -0,0 +1,1168 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+  /* Packs m steps x n columns of FLOAT pairs from a into b, in panels of 16, then 8/4/2/1 columns; always returns 0. */
+  BLASLONG i, js, offset;
+  /* dataNN: odd NN holds the first FLOAT of a pair, even NN the second (presumably re/im of a complex entry). */
+  FLOAT data01, data02, data03, data04, data05, data06, data07, data08;
+  FLOAT data09, data10, data11, data12, data13, data14, data15, data16;
+  FLOAT data17, data18, data19, data20, data21, data22, data23, data24;
+  FLOAT data25, data26, data27, data28, data29, data30, data31, data32;
+  /* aoK is the read cursor for column K-1 of the current panel. */
+  FLOAT *ao1, *ao2,  *ao3,  *ao4,  *ao5,  *ao6,  *ao7,  *ao8;
+  FLOAT *ao9, *ao10, *ao11, *ao12, *ao13, *ao14, *ao15, *ao16;
+  /* Elements are read as FLOAT pairs, so lda is doubled; assumes the caller passes lda in complex elements (TODO confirm). */
+  lda *= 2;
+
+  js = (n >> 4);  /* number of full 16-column panels */
+  while (js > 0){
+
+    offset = posX - posY;
+    /* Column k starts at a + posY*2 + (posX+k)*lda while offset > -k, otherwise at the index-swapped address. */
+    if (offset >  0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
+    if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
+    if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda;
+    if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda;
+    if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda;
+    if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda;
+    if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda;
+    if (offset > -7) ao8 = a + posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda;
+	if (offset > -8) ao9 = a + posY * 2 + (posX + 8) * lda; else ao9 = a + (posX + 8) * 2 + posY * lda;
+    if (offset > -9) ao10 = a + posY * 2 + (posX + 9) * lda; else ao10 = a + (posX + 9) * 2 + posY * lda;
+    if (offset > -10) ao11 = a + posY * 2 + (posX + 10) * lda; else ao11 = a + (posX + 10) * 2 + posY * lda;
+    if (offset > -11) ao12 = a + posY * 2 + (posX + 11) * lda; else ao12 = a + (posX + 11) * 2 + posY * lda;
+    if (offset > -12) ao13 = a + posY * 2 + (posX + 12) * lda; else ao13 = a + (posX + 12) * 2 + posY * lda;
+    if (offset > -13) ao14 = a + posY * 2 + (posX + 13) * lda; else ao14 = a + (posX + 13) * 2 + posY * lda;
+    if (offset > -14) ao15 = a + posY * 2 + (posX + 14) * lda; else ao15 = a + (posX + 14) * 2 + posY * lda;
+    if (offset > -15) ao16 = a + posY * 2 + (posX + 15) * lda; else ao16 = a + (posX + 15) * 2 + posY * lda;
+
+    i     = m;
+
+    while (i > 0) {  /* each iteration emits 16 FLOAT pairs, one per column */
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+      data03 = *(ao2 + 0);
+      data04 = *(ao2 + 1);
+      data05 = *(ao3 + 0);
+      data06 = *(ao3 + 1);
+      data07 = *(ao4 + 0);
+      data08 = *(ao4 + 1);
+      data09 = *(ao5 + 0);
+      data10 = *(ao5 + 1);
+      data11 = *(ao6 + 0);
+      data12 = *(ao6 + 1);
+      data13 = *(ao7 + 0);
+      data14 = *(ao7 + 1);
+      data15 = *(ao8 + 0);
+      data16 = *(ao8 + 1);
+	  data17 = *(ao9 + 0);
+      data18 = *(ao9 + 1);
+      data19 = *(ao10 + 0);
+      data20 = *(ao10 + 1);
+      data21 = *(ao11 + 0);
+      data22 = *(ao11 + 1);
+      data23 = *(ao12 + 0);
+      data24 = *(ao12 + 1);
+      data25 = *(ao13 + 0);
+      data26 = *(ao13 + 1);
+      data27 = *(ao14 + 0);
+      data28 = *(ao14 + 1);
+      data29 = *(ao15 + 0);
+      data30 = *(ao15 + 1);
+      data31 = *(ao16 + 0);
+      data32 = *(ao16 + 1);
+      /* Advance each cursor: by one pair (2 FLOATs) while offset > -k, by lda once offset <= -k. */
+      if (offset >   0) ao1 += 2; else ao1 += lda;
+      if (offset >  -1) ao2 += 2; else ao2 += lda;
+      if (offset >  -2) ao3 += 2; else ao3 += lda;
+      if (offset >  -3) ao4 += 2; else ao4 += lda;
+      if (offset >  -4) ao5 += 2; else ao5 += lda;
+      if (offset >  -5) ao6 += 2; else ao6 += lda;
+      if (offset >  -6) ao7 += 2; else ao7 += lda;
+      if (offset >  -7) ao8 += 2; else ao8 += lda;
+	  if (offset >  -8) ao9 += 2; else ao9 += lda;
+      if (offset >  -9) ao10 += 2; else ao10 += lda;
+      if (offset >  -10) ao11 += 2; else ao11 += lda;
+      if (offset >  -11) ao12 += 2; else ao12 += lda;
+      if (offset >  -12) ao13 += 2; else ao13 += lda;
+      if (offset >  -13) ao14 += 2; else ao14 += lda;
+      if (offset >  -14) ao15 += 2; else ao15 += lda;
+      if (offset >  -15) ao16 += 2; else ao16 += lda;
+      /* Store rule for column k: offset > -k negates the second FLOAT, offset == -k writes ZERO there, offset < -k copies it. */
+      if (offset > 0) {  /* offset > -k for every column: negate all second FLOATs */
+	b[ 0] = data01;
+	b[ 1] = -data02;
+	b[ 2] = data03;
+	b[ 3] = -data04;
+	b[ 4] = data05;
+	b[ 5] = -data06;
+	b[ 6] = data07;
+	b[ 7] = -data08;
+	b[ 8] = data09;
+	b[ 9] = -data10;
+	b[10] = data11;
+	b[11] = -data12;
+	b[12] = data13;
+	b[13] = -data14;
+	b[14] = data15;
+	b[15] = -data16;
+	b[16] = data17;
+	b[17] = -data18;
+	b[18] = data19;
+	b[19] = -data20;
+	b[20] = data21;
+	b[21] = -data22;
+	b[22] = data23;
+	b[23] = -data24;
+	b[24] = data25;
+	b[25] = -data26;
+	b[26] = data27;
+	b[27] = -data28;
+	b[28] = data29;
+	b[29] = -data30;
+	b[30] = data31;
+	b[31] = -data32;
+      } else
+	if (offset < -15) {  /* offset < -k for every column: plain copy */
+	  b[ 0] = data01;
+	  b[ 1] = data02;
+	  b[ 2] = data03;
+	  b[ 3] = data04;
+	  b[ 4] = data05;
+	  b[ 5] = data06;
+	  b[ 6] = data07;
+	  b[ 7] = data08;
+	  b[ 8] = data09;
+	  b[ 9] = data10;
+	  b[10] = data11;
+	  b[11] = data12;
+	  b[12] = data13;
+	  b[13] = data14;
+	  b[14] = data15;
+	  b[15] = data16;
+	  b[16] = data17;
+	b[17] = data18;
+	b[18] = data19;
+	b[19] = data20;
+	b[20] = data21;
+	b[21] = data22;
+	b[22] = data23;
+	b[23] = data24;
+	b[24] = data25;
+	b[25] = data26;
+	b[26] = data27;
+	b[27] = data28;
+	b[28] = data29;
+	b[29] = data30;
+	b[30] = data31;
+	b[31] = data32;
+	} else {
+	  switch (offset) {  /* case -j: columns before j copied, column j gets ZERO, columns after j negated */
+	  case  0 :
+	    b[ 0] = data01;
+	    b[ 1] = ZERO;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+		b[16] = data17;
+		b[17] = -data18;
+		b[18] = data19;
+		b[19] = -data20;
+		b[20] = data21;
+		b[21] = -data22;
+		b[22] = data23;
+		b[23] = -data24;
+		b[24] = data25;
+		b[25] = -data26;
+		b[26] = data27;
+		b[27] = -data28;
+		b[28] = data29;
+		b[29] = -data30;
+		b[30] = data31;
+		b[31] = -data32;
+	    break;
+	  case -1 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = ZERO;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+		b[16] = data17;
+		b[17] = -data18;
+		b[18] = data19;
+		b[19] = -data20;
+		b[20] = data21;
+		b[21] = -data22;
+		b[22] = data23;
+		b[23] = -data24;
+		b[24] = data25;
+		b[25] = -data26;
+		b[26] = data27;
+		b[27] = -data28;
+		b[28] = data29;
+		b[29] = -data30;
+		b[30] = data31;
+		b[31] = -data32;
+	    break;
+	  case -2 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = ZERO;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+		b[16] = data17;
+		b[17] = -data18;
+		b[18] = data19;
+		b[19] = -data20;
+		b[20] = data21;
+		b[21] = -data22;
+		b[22] = data23;
+		b[23] = -data24;
+		b[24] = data25;
+		b[25] = -data26;
+		b[26] = data27;
+		b[27] = -data28;
+		b[28] = data29;
+		b[29] = -data30;
+		b[30] = data31;
+		b[31] = -data32;
+	    break;
+	  case -3 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = ZERO;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+		b[16] = data17;
+		b[17] = -data18;
+		b[18] = data19;
+		b[19] = -data20;
+		b[20] = data21;
+		b[21] = -data22;
+		b[22] = data23;
+		b[23] = -data24;
+		b[24] = data25;
+		b[25] = -data26;
+		b[26] = data27;
+		b[27] = -data28;
+		b[28] = data29;
+		b[29] = -data30;
+		b[30] = data31;
+		b[31] = -data32;
+	    break;
+	  case -4 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = ZERO;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+		b[16] = data17;
+		b[17] = -data18;
+		b[18] = data19;
+		b[19] = -data20;
+		b[20] = data21;
+		b[21] = -data22;
+		b[22] = data23;
+		b[23] = -data24;
+		b[24] = data25;
+		b[25] = -data26;
+		b[26] = data27;
+		b[27] = -data28;
+		b[28] = data29;
+		b[29] = -data30;
+		b[30] = data31;
+		b[31] = -data32;
+	    break;
+	  case -5 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = ZERO;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+		b[16] = data17;
+		b[17] = -data18;
+		b[18] = data19;
+		b[19] = -data20;
+		b[20] = data21;
+		b[21] = -data22;
+		b[22] = data23;
+		b[23] = -data24;
+		b[24] = data25;
+		b[25] = -data26;
+		b[26] = data27;
+		b[27] = -data28;
+		b[28] = data29;
+		b[29] = -data30;
+		b[30] = data31;
+		b[31] = -data32;
+	    break;
+	  case -6 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = ZERO;
+	    b[14] = data15;
+	    b[15] = -data16;
+		b[16] = data17;
+		b[17] = -data18;
+		b[18] = data19;
+		b[19] = -data20;
+		b[20] = data21;
+		b[21] = -data22;
+		b[22] = data23;
+		b[23] = -data24;
+		b[24] = data25;
+		b[25] = -data26;
+		b[26] = data27;
+		b[27] = -data28;
+		b[28] = data29;
+		b[29] = -data30;
+		b[30] = data31;
+		b[31] = -data32;
+	    break;
+	  case -7 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = ZERO;
+		b[16] = data17;
+		b[17] = -data18;
+		b[18] = data19;
+		b[19] = -data20;
+		b[20] = data21;
+		b[21] = -data22;
+		b[22] = data23;
+		b[23] = -data24;
+		b[24] = data25;
+		b[25] = -data26;
+		b[26] = data27;
+		b[27] = -data28;
+		b[28] = data29;
+		b[29] = -data30;
+		b[30] = data31;
+		b[31] = -data32;
+	    break;
+	case -8 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+		b[16] = data17;
+		b[17] = ZERO;
+		b[18] = data19;
+		b[19] = -data20;
+		b[20] = data21;
+		b[21] = -data22;
+		b[22] = data23;
+		b[23] = -data24;
+		b[24] = data25;
+		b[25] = -data26;
+		b[26] = data27;
+		b[27] = -data28;
+		b[28] = data29;
+		b[29] = -data30;
+		b[30] = data31;
+		b[31] = -data32;
+	    break;
+	case -9 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+		b[16] = data17;
+		b[17] = data18;
+		b[18] = data19;
+		b[19] = ZERO;
+		b[20] = data21;
+		b[21] = -data22;
+		b[22] = data23;
+		b[23] = -data24;
+		b[24] = data25;
+		b[25] = -data26;
+		b[26] = data27;
+		b[27] = -data28;
+		b[28] = data29;
+		b[29] = -data30;
+		b[30] = data31;
+		b[31] = -data32;
+	    break;
+	case -10 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+		b[16] = data17;
+		b[17] = data18;
+		b[18] = data19;
+		b[19] = data20;
+		b[20] = data21;
+		b[21] = ZERO;
+		b[22] = data23;
+		b[23] = -data24;
+		b[24] = data25;
+		b[25] = -data26;
+		b[26] = data27;
+		b[27] = -data28;
+		b[28] = data29;
+		b[29] = -data30;
+		b[30] = data31;
+		b[31] = -data32;
+	    break;
+	case -11 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+		b[16] = data17;
+		b[17] = data18;
+		b[18] = data19;
+		b[19] = data20;
+		b[20] = data21;
+		b[21] = data22;
+		b[22] = data23;
+		b[23] = ZERO;
+		b[24] = data25;
+		b[25] = -data26;
+		b[26] = data27;
+		b[27] = -data28;
+		b[28] = data29;
+		b[29] = -data30;
+		b[30] = data31;
+		b[31] = -data32;
+	    break;
+	case -12 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+		b[16] = data17;
+		b[17] = data18;
+		b[18] = data19;
+		b[19] = data20;
+		b[20] = data21;
+		b[21] = data22;
+		b[22] = data23;
+		b[23] = data24;
+		b[24] = data25;
+		b[25] = ZERO;
+		b[26] = data27;
+		b[27] = -data28;
+		b[28] = data29;
+		b[29] = -data30;
+		b[30] = data31;
+		b[31] = -data32;
+	    break;
+	case -13 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+		b[16] = data17;
+		b[17] = data18;
+		b[18] = data19;
+		b[19] = data20;
+		b[20] = data21;
+		b[21] = data22;
+		b[22] = data23;
+		b[23] = data24;
+		b[24] = data25;
+		b[25] = data26;
+		b[26] = data27;
+		b[27] = ZERO;
+		b[28] = data29;
+		b[29] = -data30;
+		b[30] = data31;
+		b[31] = -data32;
+	    break;
+	case -14 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+		b[16] = data17;
+		b[17] = data18;
+		b[18] = data19;
+		b[19] = data20;
+		b[20] = data21;
+		b[21] = data22;
+		b[22] = data23;
+		b[23] = data24;
+		b[24] = data25;
+		b[25] = data26;
+		b[26] = data27;
+		b[27] = data28;
+		b[28] = data29;
+		b[29] = ZERO;
+		b[30] = data31;
+		b[31] = -data32;
+	    break;
+	case -15 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = data16;
+		b[16] = data17;
+		b[17] = data18;
+		b[18] = data19;
+		b[19] = data20;
+		b[20] = data21;
+		b[21] = data22;
+		b[22] = data23;
+		b[23] = data24;
+		b[24] = data25;
+		b[25] = data26;
+		b[26] = data27;
+		b[27] = data28;
+		b[28] = data29;
+		b[29] = data30;
+		b[30] = data31;
+		b[31] = ZERO;
+	    break;
+	  }
+	}
+
+      b += 32;  /* 16 FLOAT pairs were written */
+
+      offset --;
+      i --;
+    }
+
+    posX += 16;
+    js --;
+  }
+
+  if (n & 8) {  /* leftover 8-column panel, same scheme as the 16-column loop */
+	offset = posX - posY;
+
+    if (offset >  0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
+    if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
+    if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda;
+    if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda;
+    if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda;
+    if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda;
+    if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda;
+    if (offset > -7) ao8 = a + posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda;
+
+    i     = m;
+
+	while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+      data03 = *(ao2 + 0);
+      data04 = *(ao2 + 1);
+      data05 = *(ao3 + 0);
+      data06 = *(ao3 + 1);
+      data07 = *(ao4 + 0);
+      data08 = *(ao4 + 1);
+      data09 = *(ao5 + 0);
+      data10 = *(ao5 + 1);
+      data11 = *(ao6 + 0);
+      data12 = *(ao6 + 1);
+      data13 = *(ao7 + 0);
+      data14 = *(ao7 + 1);
+      data15 = *(ao8 + 0);
+      data16 = *(ao8 + 1);
+
+	  if (offset >   0) ao1 += 2; else ao1 += lda;
+      if (offset >  -1) ao2 += 2; else ao2 += lda;
+      if (offset >  -2) ao3 += 2; else ao3 += lda;
+      if (offset >  -3) ao4 += 2; else ao4 += lda;
+      if (offset >  -4) ao5 += 2; else ao5 += lda;
+      if (offset >  -5) ao6 += 2; else ao6 += lda;
+      if (offset >  -6) ao7 += 2; else ao7 += lda;
+      if (offset >  -7) ao8 += 2; else ao8 += lda;
+
+	  if (offset > 0) {
+	b[ 0] = data01;
+	b[ 1] = -data02;
+	b[ 2] = data03;
+	b[ 3] = -data04;
+	b[ 4] = data05;
+	b[ 5] = -data06;
+	b[ 6] = data07;
+	b[ 7] = -data08;
+	b[ 8] = data09;
+	b[ 9] = -data10;
+	b[10] = data11;
+	b[11] = -data12;
+	b[12] = data13;
+	b[13] = -data14;
+	b[14] = data15;
+	b[15] = -data16;
+      } else
+	if (offset < -7) {
+	  b[ 0] = data01;
+	  b[ 1] = data02;
+	  b[ 2] = data03;
+	  b[ 3] = data04;
+	  b[ 4] = data05;
+	  b[ 5] = data06;
+	  b[ 6] = data07;
+	  b[ 7] = data08;
+	  b[ 8] = data09;
+	  b[ 9] = data10;
+	  b[10] = data11;
+	  b[11] = data12;
+	  b[12] = data13;
+	  b[13] = data14;
+	  b[14] = data15;
+	  b[15] = data16;
+	} else {
+	switch (offset) {
+	  case  0 :
+	    b[ 0] = data01;
+	    b[ 1] = ZERO;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+	    break;
+	case -1 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = ZERO;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+	    break;
+	case -2 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = ZERO;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+	    break;
+	case -3 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = ZERO;
+	    b[ 8] = data09;
+	    b[ 9] = -data10;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+	    break;
+	case -4 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = ZERO;
+	    b[10] = data11;
+	    b[11] = -data12;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+	    break;
+	case -5 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = ZERO;
+	    b[12] = data13;
+	    b[13] = -data14;
+	    b[14] = data15;
+	    b[15] = -data16;
+	    break;
+	case -6 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = ZERO;
+	    b[14] = data15;
+	    b[15] = -data16;
+	    break;
+	case -7 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = data08;
+	    b[ 8] = data09;
+	    b[ 9] = data10;
+	    b[10] = data11;
+	    b[11] = data12;
+	    b[12] = data13;
+	    b[13] = data14;
+	    b[14] = data15;
+	    b[15] = ZERO;
+	    break;
+		}
+	}
+
+      b += 16;
+
+      offset --;
+      i --;
+    }
+
+    posX += 8;
+  }
+
+  if (n & 4) {  /* leftover 4-column panel */
+    offset = posX - posY;
+
+    if (offset >  0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
+    if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
+    if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda;
+    if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda;
+
+    i     = m;
+
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+      data03 = *(ao2 + 0);
+      data04 = *(ao2 + 1);
+      data05 = *(ao3 + 0);
+      data06 = *(ao3 + 1);
+      data07 = *(ao4 + 0);
+      data08 = *(ao4 + 1);
+
+      if (offset >   0) ao1 += 2; else ao1 += lda;
+      if (offset >  -1) ao2 += 2; else ao2 += lda;
+      if (offset >  -2) ao3 += 2; else ao3 += lda;
+      if (offset >  -3) ao4 += 2; else ao4 += lda;
+
+      if (offset > 0) {
+	b[ 0] = data01;
+	b[ 1] = -data02;
+	b[ 2] = data03;
+	b[ 3] = -data04;
+	b[ 4] = data05;
+	b[ 5] = -data06;
+	b[ 6] = data07;
+	b[ 7] = -data08;
+      } else
+	if (offset < -3) {
+	  b[ 0] = data01;
+	  b[ 1] = data02;
+	  b[ 2] = data03;
+	  b[ 3] = data04;
+	  b[ 4] = data05;
+	  b[ 5] = data06;
+	  b[ 6] = data07;
+	  b[ 7] = data08;
+	} else {
+	  switch (offset) {
+	  case  0 :
+	    b[ 0] = data01;
+	    b[ 1] = ZERO;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    break;
+	  case -1 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = ZERO;
+	    b[ 4] = data05;
+	    b[ 5] = -data06;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    break;
+	  case -2 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = ZERO;
+	    b[ 6] = data07;
+	    b[ 7] = -data08;
+	    break;
+	  case -3 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = data04;
+	    b[ 4] = data05;
+	    b[ 5] = data06;
+	    b[ 6] = data07;
+	    b[ 7] = ZERO;
+	    break;
+	  }
+	}
+
+      b += 8;
+
+      offset --;
+      i --;
+    }
+
+    posX += 4;
+  }
+
+  if (n & 2) {  /* leftover 2-column panel */
+
+    offset = posX - posY;
+
+    if (offset >  0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
+    if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
+
+    i     = m;
+
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+      data03 = *(ao2 + 0);
+      data04 = *(ao2 + 1);
+
+      if (offset >   0) ao1 += 2; else ao1 += lda;
+      if (offset >  -1) ao2 += 2; else ao2 += lda;
+
+      if (offset > 0) {
+	b[ 0] = data01;
+	b[ 1] = -data02;
+	b[ 2] = data03;
+	b[ 3] = -data04;
+      } else
+	if (offset < -1) {
+	  b[ 0] = data01;
+	  b[ 1] = data02;
+	  b[ 2] = data03;
+	  b[ 3] = data04;
+	} else {
+	  switch (offset) {
+	  case  0 :
+	    b[ 0] = data01;
+	    b[ 1] = ZERO;
+	    b[ 2] = data03;
+	    b[ 3] = -data04;
+	    break;
+	  case -1 :
+	    b[ 0] = data01;
+	    b[ 1] = data02;
+	    b[ 2] = data03;
+	    b[ 3] = ZERO;
+	    break;
+	  }
+	}
+
+      b += 4;
+
+      offset --;
+      i --;
+    }
+
+    posX += 2;
+  }
+
+  if (n & 1) {  /* final single column */
+
+    offset = posX - posY;
+
+    if (offset >  0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
+
+    i     = m;
+
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+
+      if (offset >   0) ao1 += 2; else ao1 += lda;
+
+      if (offset > 0) {
+	b[ 0] = data01;
+	b[ 1] = -data02;
+      } else
+	if (offset < 0) {
+	  b[ 0] = data01;
+	  b[ 1] = data02;
+	} else {  /* offset == 0: second FLOAT is forced to ZERO */
+	    b[ 0] = data01;
+	    b[ 1] = ZERO;
+	}
+
+      b += 2;
+
+      offset --;
+      i --;
+    }
+
+  }
+
+  return 0;
+}
diff --git a/kernel/generic/zimatcopy_cnc.c b/kernel/generic/zimatcopy_cnc.c
index 8e772bd8a..6426cffc0 100644
--- a/kernel/generic/zimatcopy_cnc.c
+++ b/kernel/generic/zimatcopy_cnc.c
@@ -40,7 +40,6 @@ int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a,
 
 	if ( rows <= 0     )  return(0);
 	if ( cols <= 0     )  return(0);
-    if ( alpha_r == 1.0 && alpha_i == 0.0 ) return (0); 
 
 	aptr = a;
 	lda *= 2;
diff --git a/kernel/generic/zlaswp_ncopy_8.c b/kernel/generic/zlaswp_ncopy_8.c
new file mode 100644
index 000000000..8bd41749d
--- /dev/null
+++ b/kernel/generic/zlaswp_ncopy_8.c
@@ -0,0 +1,1051 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+/* Shorthand for "two FLOATs past the odd-numbered partner pointer".  Each
+   macro expands to an expression on a1, a3, a5 or a7, so a FLOAT pointer
+   of that name must be in scope wherever the macro is used.  With the
+   interleaved (re, im) storage this file reads, +2 is the next complex
+   element in the same column. */
+#define a2	(a1 + 2)
+#define a4	(a3 + 2)
+#define a6	(a5 + 2)
+#define a8	(a7 + 2)
+
+/*
+ * Apply a run of row interchanges to one panel of `ncols` adjacent complex
+ * columns and write the interchanged rows to `buffer`.
+ *
+ *   ncols   number of columns in the panel
+ *   nrows   number of consecutive rows to process (nothing is done if <= 0)
+ *   base    address that a pivot value p is added to (as 2 * p FLOATs) to
+ *           reach pivot row p in the panel's first column
+ *   row     address of the first row to process, in the panel's first column
+ *   lda     distance between columns, in FLOATs
+ *   piv     pivot for the first processed row; following rows use piv[1], ...
+ *   buffer  destination; each processed row becomes `ncols` consecutive
+ *           (re, im) pairs, rows following each other without gaps
+ *
+ * Returns the buffer position just past the last value written.
+ *
+ * Stores into the matrix only ever target pivot rows; the rows being
+ * processed are not rewritten with their interchanged contents, the
+ * interchanged values exist in `buffer` only.
+ */
+static FLOAT *zlaswp_ncopy_panel(BLASLONG ncols, BLASLONG nrows,
+                                 FLOAT *base, FLOAT *row, BLASLONG lda,
+                                 blasint *piv, FLOAT *buffer) {
+
+  const BLASLONG rowlen = 2 * ncols;
+  BLASLONG c;
+
+  /* Rows are taken two at a time so that a pivot pair which refers to one
+     of the two current rows, or to the same target twice, can be resolved
+     from values loaded before anything is stored. */
+  while (nrows >= 2) {
+    FLOAT *next = row + 2;
+    FLOAT *p0   = base + (BLASLONG)piv[0] * 2;
+    FLOAT *p1   = base + (BLASLONG)piv[1] * 2;
+
+    for (c = 0; c < ncols; c++) {
+      const BLASLONG off = c * lda;
+      FLOAT *x0 = row  + off;          /* current row               */
+      FLOAT *x1 = next + off;          /* row after it              */
+      FLOAT *y0 = p0   + off;          /* pivot row of current row  */
+      FLOAT *y1 = p1   + off;          /* pivot row of the next row */
+      FLOAT *o0 = buffer + 2 * c;
+      FLOAT *o1 = o0 + rowlen;
+
+      /* All four elements are read before the first store. */
+      const FLOAT x0r = x0[0], x0i = x0[1];
+      const FLOAT x1r = x1[0], x1i = x1[1];
+      const FLOAT y0r = y0[0], y0i = y0[1];
+      const FLOAT y1r = y1[0], y1i = y1[1];
+
+      if (p0 == row) {
+        /* First interchange is a no-op. */
+        o0[0] = x0r;  o0[1] = x0i;
+        if (p1 == next) {
+          o1[0] = x1r;  o1[1] = x1i;
+        } else {
+          o1[0] = y1r;  o1[1] = y1i;
+          y1[0] = x1r;  y1[1] = x1i;
+        }
+      } else if (p0 == next) {
+        /* First interchange exchanges the two current rows, so the second
+           one sees the original current row in the "next" slot. */
+        o0[0] = x1r;  o0[1] = x1i;
+        if (p1 == next) {
+          o1[0] = x0r;  o1[1] = x0i;
+        } else {
+          o1[0] = y1r;  o1[1] = y1i;
+          y1[0] = x0r;  y1[1] = x0i;
+        }
+      } else {
+        /* First interchange reaches outside the current pair. */
+        o0[0] = y0r;  o0[1] = y0i;
+        if (p1 == next) {
+          o1[0] = x1r;  o1[1] = x1i;
+          y0[0] = x0r;  y0[1] = x0i;
+        } else if (p1 == p0) {
+          /* Both interchanges hit the same row: it ends up holding the
+             original "next" row, and the original current row goes out. */
+          o1[0] = x0r;  o1[1] = x0i;
+          y0[0] = x1r;  y0[1] = x1i;
+        } else {
+          o1[0] = y1r;  o1[1] = y1i;
+          y0[0] = x0r;  y0[1] = x0i;
+          y1[0] = x1r;  y1[1] = x1i;
+        }
+      }
+    }
+
+    buffer += 2 * rowlen;
+    row    += 4;
+    piv    += 2;
+    nrows  -= 2;
+  }
+
+  /* One row left over when the row count is odd. */
+  if (nrows > 0) {
+    FLOAT *p0 = base + (BLASLONG)piv[0] * 2;
+
+    for (c = 0; c < ncols; c++) {
+      const BLASLONG off = c * lda;
+      FLOAT *x = row + off;
+      FLOAT *y = p0  + off;
+      FLOAT *o = buffer + 2 * c;
+
+      const FLOAT xr = x[0], xi = x[1];
+      const FLOAT yr = y[0], yi = y[1];
+
+      if (p0 == row) {
+        o[0] = xr;  o[1] = xi;
+      } else {
+        o[0] = yr;  o[1] = yi;
+        y[0] = xr;  y[1] = xi;
+      }
+    }
+
+    buffer += rowlen;
+  }
+
+  return buffer;
+}
+
+/*
+ * Row-interchange-and-pack for a complex matrix, in panels of up to eight
+ * columns.
+ *
+ *   n       number of columns to process (nothing is done if n <= 0)
+ *   k1, k2  first and last row to process; k2 - k1 + 1 rows are handled
+ *   a       matrix, complex elements stored as interleaved (re, im) FLOATs
+ *   lda     column stride in complex elements (doubled internally)
+ *   ipiv    pivot rows; entry k1 - 1 belongs to row k1.  Row and pivot
+ *           numbers are used as offsets from a - 1 element, i.e. they
+ *           behave as 1-based indices.
+ *   buffer  receives the interchanged rows k1..k2, panel after panel
+ *
+ * The columns are consumed as floor(n / 8) panels of eight, followed by at
+ * most one panel each of four, two and one columns.  Inside a panel every
+ * row is stored as that panel's width in consecutive (re, im) pairs.
+ *
+ * All widths share one routine.  The eight-wide panels in particular are
+ * packed and stepped as full eight-column units (eight values per row,
+ * including a trailing odd row, and `a` advanced by eight columns), in the
+ * same way as the narrower panels.
+ *
+ * Always returns 0.
+ */
+int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint *ipiv, FLOAT *buffer){
+
+  BLASLONG j, nrows;
+
+  a   -= 2;      /* lets a + 2 * p address row p directly */
+  lda *= 2;      /* complex elements -> FLOATs */
+  k1 --;
+
+  ipiv += k1;
+
+  if (n <= 0) return 0;
+
+  nrows = k2 - k1;
+
+  for (j = (n >> 3); j > 0; j --) {
+    buffer = zlaswp_ncopy_panel(8, nrows, a, a + (k1 + 1) * 2, lda, ipiv, buffer);
+    a += 8 * lda;
+  }
+
+  if (n & 4) {
+    buffer = zlaswp_ncopy_panel(4, nrows, a, a + (k1 + 1) * 2, lda, ipiv, buffer);
+    a += 4 * lda;
+  }
+
+  if (n & 2) {
+    buffer = zlaswp_ncopy_panel(2, nrows, a, a + (k1 + 1) * 2, lda, ipiv, buffer);
+    a += 2 * lda;
+  }
+
+  if (n & 1) {
+    zlaswp_ncopy_panel(1, nrows, a, a + (k1 + 1) * 2, lda, ipiv, buffer);
+  }
+
+  return 0;
+}
+
diff --git a/kernel/generic/zneg_tcopy_16.c b/kernel/generic/zneg_tcopy_16.c
new file mode 100644
index 000000000..50f5a3d37
--- /dev/null
+++ b/kernel/generic/zneg_tcopy_16.c
@@ -0,0 +1,587 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
+
+  BLASLONG i, j;
+
+  FLOAT *aoffset;
+  FLOAT *aoffset1, *aoffset2;
+
+  FLOAT *boffset;
+
+  FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
+  FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
+  FLOAT ctemp09, ctemp10, ctemp11, ctemp12;
+  FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
+  FLOAT ctemp17, ctemp18, ctemp19, ctemp20;
+  FLOAT ctemp21, ctemp22, ctemp23, ctemp24;
+  FLOAT ctemp25, ctemp26, ctemp27, ctemp28;
+  FLOAT ctemp29, ctemp30, ctemp31, ctemp32;
+
+  FLOAT ctemp33, ctemp34, ctemp35, ctemp36;
+  FLOAT ctemp37, ctemp38, ctemp39, ctemp40;
+  FLOAT ctemp41, ctemp42, ctemp43, ctemp44;
+  FLOAT ctemp45, ctemp46, ctemp47, ctemp48;
+  FLOAT ctemp49, ctemp50, ctemp51, ctemp52;
+  FLOAT ctemp53, ctemp54, ctemp55, ctemp56;
+  FLOAT ctemp57, ctemp58, ctemp59, ctemp60;
+  FLOAT ctemp61, ctemp62, ctemp63, ctemp64;
+
+  aoffset   = a;
+  boffset   = b;
+  lda *= 2;
+
+#if 0
+  fprintf(stderr, "M = %d N = %d\n", m, n);
+#endif
+
+  j = (n >> 4);
+  if (j > 0){
+    do{
+      aoffset1  = aoffset;
+      aoffset2  = aoffset + lda;
+      aoffset += 32;
+
+      i = (m >> 1);
+      if (i > 0){
+	do{
+	  ctemp01 = *(aoffset1 +  0);
+	  ctemp02 = *(aoffset1 +  1);
+	  ctemp03 = *(aoffset1 +  2);
+	  ctemp04 = *(aoffset1 +  3);
+	  ctemp05 = *(aoffset1 +  4);
+	  ctemp06 = *(aoffset1 +  5);
+	  ctemp07 = *(aoffset1 +  6);
+	  ctemp08 = *(aoffset1 +  7);
+	  ctemp09 = *(aoffset1 +  8);
+	  ctemp10 = *(aoffset1 +  9);
+	  ctemp11 = *(aoffset1 + 10);
+	  ctemp12 = *(aoffset1 + 11);
+	  ctemp13 = *(aoffset1 + 12);
+	  ctemp14 = *(aoffset1 + 13);
+	  ctemp15 = *(aoffset1 + 14);
+	  ctemp16 = *(aoffset1 + 15);
+	  ctemp17 = *(aoffset1 + 16);
+	  ctemp18 = *(aoffset1 + 17);
+	  ctemp19 = *(aoffset1 + 18);
+	  ctemp20 = *(aoffset1 + 19);
+	  ctemp21 = *(aoffset1 + 20);
+	  ctemp22 = *(aoffset1 + 21);
+	  ctemp23 = *(aoffset1 + 22);
+	  ctemp24 = *(aoffset1 + 23);
+	  ctemp25 = *(aoffset1 + 24);
+	  ctemp26 = *(aoffset1 + 25);
+	  ctemp27 = *(aoffset1 + 26);
+	  ctemp28 = *(aoffset1 + 27);
+	  ctemp29 = *(aoffset1 + 28);
+	  ctemp30 = *(aoffset1 + 29);
+	  ctemp31 = *(aoffset1 + 30);
+	  ctemp32 = *(aoffset1 + 31);
+
+	  ctemp33 = *(aoffset2 +  0);
+	  ctemp34 = *(aoffset2 +  1);
+	  ctemp35 = *(aoffset2 +  2);
+	  ctemp36 = *(aoffset2 +  3);
+	  ctemp37 = *(aoffset2 +  4);
+	  ctemp38 = *(aoffset2 +  5);
+	  ctemp39 = *(aoffset2 +  6);
+	  ctemp40 = *(aoffset2 +  7);
+	  ctemp41 = *(aoffset2 +  8);
+	  ctemp42 = *(aoffset2 +  9);
+	  ctemp43 = *(aoffset2 + 10);
+	  ctemp44 = *(aoffset2 + 11);
+	  ctemp45 = *(aoffset2 + 12);
+	  ctemp46 = *(aoffset2 + 13);
+	  ctemp47 = *(aoffset2 + 14);
+	  ctemp48 = *(aoffset2 + 15);
+	  ctemp49 = *(aoffset2 + 16);
+	  ctemp50 = *(aoffset2 + 17);
+	  ctemp51 = *(aoffset2 + 18);
+	  ctemp52 = *(aoffset2 + 19);
+	  ctemp53 = *(aoffset2 + 20);
+	  ctemp54 = *(aoffset2 + 21);
+	  ctemp55 = *(aoffset2 + 22);
+	  ctemp56 = *(aoffset2 + 23);
+	  ctemp57 = *(aoffset2 + 24);
+	  ctemp58 = *(aoffset2 + 25);
+	  ctemp59 = *(aoffset2 + 26);
+	  ctemp60 = *(aoffset2 + 27);
+	  ctemp61 = *(aoffset2 + 28);
+	  ctemp62 = *(aoffset2 + 29);
+	  ctemp63 = *(aoffset2 + 30);
+	  ctemp64 = *(aoffset2 + 31);
+
+	  *(boffset +  0) = -ctemp01;
+	  *(boffset +  1) = -ctemp02;
+	  *(boffset +  2) = -ctemp03;
+	  *(boffset +  3) = -ctemp04;
+	  *(boffset +  4) = -ctemp05;
+	  *(boffset +  5) = -ctemp06;
+	  *(boffset +  6) = -ctemp07;
+	  *(boffset +  7) = -ctemp08;
+
+	  *(boffset +  8) = -ctemp09;
+	  *(boffset +  9) = -ctemp10;
+	  *(boffset + 10) = -ctemp11;
+	  *(boffset + 11) = -ctemp12;
+	  *(boffset + 12) = -ctemp13;
+	  *(boffset + 13) = -ctemp14;
+	  *(boffset + 14) = -ctemp15;
+	  *(boffset + 15) = -ctemp16;
+
+	  *(boffset + 16) = -ctemp17;
+	  *(boffset + 17) = -ctemp18;
+	  *(boffset + 18) = -ctemp19;
+	  *(boffset + 19) = -ctemp20;
+	  *(boffset + 20) = -ctemp21;
+	  *(boffset + 21) = -ctemp22;
+	  *(boffset + 22) = -ctemp23;
+	  *(boffset + 23) = -ctemp24;
+
+	  *(boffset + 24) = -ctemp25;
+	  *(boffset + 25) = -ctemp26;
+	  *(boffset + 26) = -ctemp27;
+	  *(boffset + 27) = -ctemp28;
+	  *(boffset + 28) = -ctemp29;
+	  *(boffset + 29) = -ctemp30;
+	  *(boffset + 30) = -ctemp31;
+	  *(boffset + 31) = -ctemp32;
+
+	  *(boffset + 32) = -ctemp33;
+	  *(boffset + 33) = -ctemp34;
+	  *(boffset + 34) = -ctemp35;
+	  *(boffset + 35) = -ctemp36;
+	  *(boffset + 36) = -ctemp37;
+	  *(boffset + 37) = -ctemp38;
+	  *(boffset + 38) = -ctemp39;
+	  *(boffset + 39) = -ctemp40;
+
+	  *(boffset + 40) = -ctemp41;
+	  *(boffset + 41) = -ctemp42;
+	  *(boffset + 42) = -ctemp43;
+	  *(boffset + 43) = -ctemp44;
+	  *(boffset + 44) = -ctemp45;
+	  *(boffset + 45) = -ctemp46;
+	  *(boffset + 46) = -ctemp47;
+	  *(boffset + 47) = -ctemp48;
+
+	  *(boffset + 48) = -ctemp49;
+	  *(boffset + 49) = -ctemp50;
+	  *(boffset + 50) = -ctemp51;
+	  *(boffset + 51) = -ctemp52;
+	  *(boffset + 52) = -ctemp53;
+	  *(boffset + 53) = -ctemp54;
+	  *(boffset + 54) = -ctemp55;
+	  *(boffset + 55) = -ctemp56;
+
+	  *(boffset + 56) = -ctemp57;
+	  *(boffset + 57) = -ctemp58;
+	  *(boffset + 58) = -ctemp59;
+	  *(boffset + 59) = -ctemp60;
+	  *(boffset + 60) = -ctemp61;
+	  *(boffset + 61) = -ctemp62;
+	  *(boffset + 62) = -ctemp63;
+	  *(boffset + 63) = -ctemp64;
+
+	  aoffset1 +=  2 * lda;
+	  aoffset2 +=  2 * lda;
+	  boffset   += 64;
+
+	  i --;
+	}while(i > 0);
+      }
+
+      if (m & 1){
+	ctemp01 = *(aoffset1 +  0);
+	ctemp02 = *(aoffset1 +  1);
+	ctemp03 = *(aoffset1 +  2);
+	ctemp04 = *(aoffset1 +  3);
+	ctemp05 = *(aoffset1 +  4);
+	ctemp06 = *(aoffset1 +  5);
+	ctemp07 = *(aoffset1 +  6);
+	ctemp08 = *(aoffset1 +  7);
+	ctemp09 = *(aoffset1 +  8);
+	ctemp10 = *(aoffset1 +  9);
+	ctemp11 = *(aoffset1 + 10);
+	ctemp12 = *(aoffset1 + 11);
+	ctemp13 = *(aoffset1 + 12);
+	ctemp14 = *(aoffset1 + 13);
+	ctemp15 = *(aoffset1 + 14);
+	ctemp16 = *(aoffset1 + 15);
+	ctemp17 = *(aoffset1 + 16);
+	ctemp18 = *(aoffset1 + 17);
+	ctemp19 = *(aoffset1 + 18);
+	ctemp20 = *(aoffset1 + 19);
+	ctemp21 = *(aoffset1 + 20);
+	ctemp22 = *(aoffset1 + 21);
+	ctemp23 = *(aoffset1 + 22);
+	ctemp24 = *(aoffset1 + 23);
+	ctemp25 = *(aoffset1 + 24);
+	ctemp26 = *(aoffset1 + 25);
+	ctemp27 = *(aoffset1 + 26);
+	ctemp28 = *(aoffset1 + 27);
+	ctemp29 = *(aoffset1 + 28);
+	ctemp30 = *(aoffset1 + 29);
+	ctemp31 = *(aoffset1 + 30);
+	ctemp32 = *(aoffset1 + 31);
+
+	*(boffset +  0) = -ctemp01;
+	*(boffset +  1) = -ctemp02;
+	*(boffset +  2) = -ctemp03;
+	*(boffset +  3) = -ctemp04;
+	*(boffset +  4) = -ctemp05;
+	*(boffset +  5) = -ctemp06;
+	*(boffset +  6) = -ctemp07;
+	*(boffset +  7) = -ctemp08;
+
+	*(boffset +  8) = -ctemp09;
+	*(boffset +  9) = -ctemp10;
+	*(boffset + 10) = -ctemp11;
+	*(boffset + 11) = -ctemp12;
+	*(boffset + 12) = -ctemp13;
+	*(boffset + 13) = -ctemp14;
+	*(boffset + 14) = -ctemp15;
+	*(boffset + 15) = -ctemp16;
+
+	*(boffset + 16) = -ctemp17;
+	*(boffset + 17) = -ctemp18;
+	*(boffset + 18) = -ctemp19;
+	*(boffset + 19) = -ctemp20;
+	*(boffset + 20) = -ctemp21;
+	*(boffset + 21) = -ctemp22;
+	*(boffset + 22) = -ctemp23;
+	*(boffset + 23) = -ctemp24;
+
+	*(boffset + 24) = -ctemp25;
+	*(boffset + 25) = -ctemp26;
+	*(boffset + 26) = -ctemp27;
+	*(boffset + 27) = -ctemp28;
+	*(boffset + 28) = -ctemp29;
+	*(boffset + 29) = -ctemp30;
+	*(boffset + 30) = -ctemp31;
+	*(boffset + 31) = -ctemp32;
+
+	boffset   += 32;
+      }
+
+      j--;
+    }while(j > 0);
+  } /* end of if(j > 0) */
+
+  if (n & 8){
+    aoffset1  = aoffset;
+    aoffset2  = aoffset + lda;
+    aoffset += 16;
+
+    i = (m >> 1);
+    if (i > 0){
+      do{
+	  ctemp01 = *(aoffset1 +  0);
+	  ctemp02 = *(aoffset1 +  1);
+	  ctemp03 = *(aoffset1 +  2);
+	  ctemp04 = *(aoffset1 +  3);
+	  ctemp05 = *(aoffset1 +  4);
+	  ctemp06 = *(aoffset1 +  5);
+	  ctemp07 = *(aoffset1 +  6);
+	  ctemp08 = *(aoffset1 +  7);
+	  ctemp09 = *(aoffset1 +  8);
+	  ctemp10 = *(aoffset1 +  9);
+	  ctemp11 = *(aoffset1 + 10);
+	  ctemp12 = *(aoffset1 + 11);
+	  ctemp13 = *(aoffset1 + 12);
+	  ctemp14 = *(aoffset1 + 13);
+	  ctemp15 = *(aoffset1 + 14);
+	  ctemp16 = *(aoffset1 + 15);
+
+	  ctemp17 = *(aoffset2 +  0);
+	  ctemp18 = *(aoffset2 +  1);
+	  ctemp19 = *(aoffset2 +  2);
+	  ctemp20 = *(aoffset2 +  3);
+	  ctemp21 = *(aoffset2 +  4);
+	  ctemp22 = *(aoffset2 +  5);
+	  ctemp23 = *(aoffset2 +  6);
+	  ctemp24 = *(aoffset2 +  7);
+	  ctemp25 = *(aoffset2 +  8);
+	  ctemp26 = *(aoffset2 +  9);
+	  ctemp27 = *(aoffset2 + 10);
+	  ctemp28 = *(aoffset2 + 11);
+	  ctemp29 = *(aoffset2 + 12);
+	  ctemp30 = *(aoffset2 + 13);
+	  ctemp31 = *(aoffset2 + 14);
+	  ctemp32 = *(aoffset2 + 15);
+
+	  *(boffset +  0) = -ctemp01;
+	  *(boffset +  1) = -ctemp02;
+	  *(boffset +  2) = -ctemp03;
+	  *(boffset +  3) = -ctemp04;
+	  *(boffset +  4) = -ctemp05;
+	  *(boffset +  5) = -ctemp06;
+	  *(boffset +  6) = -ctemp07;
+	  *(boffset +  7) = -ctemp08;
+
+	  *(boffset +  8) = -ctemp09;
+	  *(boffset +  9) = -ctemp10;
+	  *(boffset + 10) = -ctemp11;
+	  *(boffset + 11) = -ctemp12;
+	  *(boffset + 12) = -ctemp13;
+	  *(boffset + 13) = -ctemp14;
+	  *(boffset + 14) = -ctemp15;
+	  *(boffset + 15) = -ctemp16;
+
+	  *(boffset + 16) = -ctemp17;
+	  *(boffset + 17) = -ctemp18;
+	  *(boffset + 18) = -ctemp19;
+	  *(boffset + 19) = -ctemp20;
+	  *(boffset + 20) = -ctemp21;
+	  *(boffset + 21) = -ctemp22;
+	  *(boffset + 22) = -ctemp23;
+	  *(boffset + 23) = -ctemp24;
+
+	  *(boffset + 24) = -ctemp25;
+	  *(boffset + 25) = -ctemp26;
+	  *(boffset + 26) = -ctemp27;
+	  *(boffset + 27) = -ctemp28;
+	  *(boffset + 28) = -ctemp29;
+	  *(boffset + 29) = -ctemp30;
+	  *(boffset + 30) = -ctemp31;
+	  *(boffset + 31) = -ctemp32;
+
+	aoffset1 +=  2 * lda;
+	aoffset2 +=  2 * lda;
+	boffset   += 32;
+
+	i --;
+      }while(i > 0);
+    }
+
+    if (m & 1){
+    ctemp01 = *(aoffset1 +  0);
+	ctemp02 = *(aoffset1 +  1);
+	ctemp03 = *(aoffset1 +  2);
+	ctemp04 = *(aoffset1 +  3);
+	ctemp05 = *(aoffset1 +  4);
+	ctemp06 = *(aoffset1 +  5);
+	ctemp07 = *(aoffset1 +  6);
+	ctemp08 = *(aoffset1 +  7);
+	ctemp09 = *(aoffset1 +  8);
+	ctemp10 = *(aoffset1 +  9);
+	ctemp11 = *(aoffset1 + 10);
+	ctemp12 = *(aoffset1 + 11);
+	ctemp13 = *(aoffset1 + 12);
+	ctemp14 = *(aoffset1 + 13);
+	ctemp15 = *(aoffset1 + 14);
+	ctemp16 = *(aoffset1 + 15);
+
+    *(boffset +  0) = -ctemp01;
+	*(boffset +  1) = -ctemp02;
+	*(boffset +  2) = -ctemp03;
+	*(boffset +  3) = -ctemp04;
+	*(boffset +  4) = -ctemp05;
+	*(boffset +  5) = -ctemp06;
+	*(boffset +  6) = -ctemp07;
+	*(boffset +  7) = -ctemp08;
+
+	*(boffset +  8) = -ctemp09;
+	*(boffset +  9) = -ctemp10;
+	*(boffset + 10) = -ctemp11;
+	*(boffset + 11) = -ctemp12;
+	*(boffset + 12) = -ctemp13;
+	*(boffset + 13) = -ctemp14;
+	*(boffset + 14) = -ctemp15;
+	*(boffset + 15) = -ctemp16;
+
+      boffset   += 16;
+    }
+  }
+
+  if (n & 4){
+    aoffset1  = aoffset;
+    aoffset2  = aoffset + lda;
+    aoffset += 8;
+
+    i = (m >> 1);
+    if (i > 0){
+      do{
+	ctemp01 = *(aoffset1 +  0);
+	ctemp02 = *(aoffset1 +  1);
+	ctemp03 = *(aoffset1 +  2);
+	ctemp04 = *(aoffset1 +  3);
+	ctemp05 = *(aoffset1 +  4);
+	ctemp06 = *(aoffset1 +  5);
+	ctemp07 = *(aoffset1 +  6);
+	ctemp08 = *(aoffset1 +  7);
+
+	ctemp09 = *(aoffset2 +  0);
+	ctemp10 = *(aoffset2 +  1);
+	ctemp11 = *(aoffset2 +  2);
+	ctemp12 = *(aoffset2 +  3);
+	ctemp13 = *(aoffset2 +  4);
+	ctemp14 = *(aoffset2 +  5);
+	ctemp15 = *(aoffset2 +  6);
+	ctemp16 = *(aoffset2 +  7);
+
+	*(boffset +  0) = -ctemp01;
+	*(boffset +  1) = -ctemp02;
+	*(boffset +  2) = -ctemp03;
+	*(boffset +  3) = -ctemp04;
+	*(boffset +  4) = -ctemp05;
+	*(boffset +  5) = -ctemp06;
+	*(boffset +  6) = -ctemp07;
+	*(boffset +  7) = -ctemp08;
+
+	*(boffset +  8) = -ctemp09;
+	*(boffset +  9) = -ctemp10;
+	*(boffset + 10) = -ctemp11;
+	*(boffset + 11) = -ctemp12;
+	*(boffset + 12) = -ctemp13;
+	*(boffset + 13) = -ctemp14;
+	*(boffset + 14) = -ctemp15;
+	*(boffset + 15) = -ctemp16;
+
+	aoffset1 +=  2 * lda;
+	aoffset2 +=  2 * lda;
+	boffset   += 16;
+
+	i --;
+      }while(i > 0);
+    }
+
+    if (m & 1){
+      ctemp01 = *(aoffset1 +  0);
+      ctemp02 = *(aoffset1 +  1);
+      ctemp03 = *(aoffset1 +  2);
+      ctemp04 = *(aoffset1 +  3);
+      ctemp05 = *(aoffset1 +  4);
+      ctemp06 = *(aoffset1 +  5);
+      ctemp07 = *(aoffset1 +  6);
+      ctemp08 = *(aoffset1 +  7);
+
+      *(boffset +  0) = -ctemp01;
+      *(boffset +  1) = -ctemp02;
+      *(boffset +  2) = -ctemp03;
+      *(boffset +  3) = -ctemp04;
+      *(boffset +  4) = -ctemp05;
+      *(boffset +  5) = -ctemp06;
+      *(boffset +  6) = -ctemp07;
+      *(boffset +  7) = -ctemp08;
+
+      boffset   += 8;
+    }
+  }
+
+  if (n & 2){
+    aoffset1  = aoffset;
+    aoffset2  = aoffset + lda;
+    aoffset += 4;
+
+    i = (m >> 1);
+    if (i > 0){
+      do{
+	ctemp01 = *(aoffset1 +  0);
+	ctemp02 = *(aoffset1 +  1);
+	ctemp03 = *(aoffset1 +  2);
+	ctemp04 = *(aoffset1 +  3);
+
+	ctemp05 = *(aoffset2 +  0);
+	ctemp06 = *(aoffset2 +  1);
+	ctemp07 = *(aoffset2 +  2);
+	ctemp08 = *(aoffset2 +  3);
+
+	*(boffset +  0) = -ctemp01;
+	*(boffset +  1) = -ctemp02;
+	*(boffset +  2) = -ctemp03;
+	*(boffset +  3) = -ctemp04;
+	*(boffset +  4) = -ctemp05;
+	*(boffset +  5) = -ctemp06;
+	*(boffset +  6) = -ctemp07;
+	*(boffset +  7) = -ctemp08;
+
+	aoffset1 +=  2 * lda;
+	aoffset2 +=  2 * lda;
+	boffset   += 8;
+
+	i --;
+      }while(i > 0);
+    }
+
+    if (m & 1){
+      ctemp01 = *(aoffset1 +  0);
+      ctemp02 = *(aoffset1 +  1);
+      ctemp03 = *(aoffset1 +  2);
+      ctemp04 = *(aoffset1 +  3);
+
+      *(boffset +  0) = -ctemp01;
+      *(boffset +  1) = -ctemp02;
+      *(boffset +  2) = -ctemp03;
+      *(boffset +  3) = -ctemp04;
+
+      boffset   += 4;
+    }
+  }
+
+  if (n & 1){
+    aoffset1  = aoffset;
+    aoffset2  = aoffset + lda;
+    // aoffset += 2;
+
+    i = (m >> 1);
+    if (i > 0){
+      do{
+	ctemp01 = *(aoffset1 +  0);
+	ctemp02 = *(aoffset1 +  1);
+	ctemp03 = *(aoffset2 +  0);
+	ctemp04 = *(aoffset2 +  1);
+
+	*(boffset +  0) = -ctemp01;
+	*(boffset +  1) = -ctemp02;
+	*(boffset +  2) = -ctemp03;
+	*(boffset +  3) = -ctemp04;
+
+	aoffset1 +=  2 * lda;
+	aoffset2 +=  2 * lda;
+	boffset   += 4;
+
+	i --;
+      }while(i > 0);
+    }
+
+    if (m & 1){
+      ctemp01 = *(aoffset1 +  0);
+      ctemp02 = *(aoffset1 +  1);
+
+      *(boffset +  0) = -ctemp01;
+      *(boffset +  1) = -ctemp02;
+      // boffset   += 2;
+    }
+  }
+
+  return 0;
+}
diff --git a/kernel/generic/zsymm_lcopy_16.c b/kernel/generic/zsymm_lcopy_16.c
new file mode 100644
index 000000000..b32374a5e
--- /dev/null
+++ b/kernel/generic/zsymm_lcopy_16.c
@@ -0,0 +1,333 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+  /* Packs FLOAT pairs from a into b in column panels of 16, 8, 4, 2 and 1 (chosen by the bits of n), m pairs per column; always returns 0. */
+  BLASLONG i, js, offset;
+  /* Staging for one packed row: ao1 fills data01/data02, ao2 fills data03/data04, and so on up to ao16. */
+  FLOAT data01, data02, data03, data04, data05, data06, data07, data08;
+  FLOAT data09, data10, data11, data12, data13, data14, data15, data16;
+  FLOAT data17, data18, data19, data20, data21, data22, data23, data24;
+  FLOAT data25, data26, data27, data28, data29, data30, data31, data32;
+  /* One read cursor per column of the current panel. */
+  FLOAT *ao1, *ao2,  *ao3,  *ao4,  *ao5,  *ao6,  *ao7,  *ao8;
+  FLOAT *ao9, *ao10, *ao11, *ao12, *ao13, *ao14, *ao15, *ao16;
+  /* Every element is read as two FLOATs (presumably real and imaginary part), so the stride is doubled to FLOAT units. */
+  lda *= 2;
+
+  js = (n >> 4); /* number of full 16-column panels */
+  while (js > 0){
+
+    offset = posX - posY;
+    /* Cursor k (0-based) starts at pair (posX + k) of line posY while offset > -k, otherwise at the swapped position: pair posY of line (posX + k). */
+    if (offset >  0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
+    if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
+    if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda;
+    if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda;
+    if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda;
+    if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda;
+    if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda;
+    if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda;
+    if (offset > -8) ao9 = a + (posX + 8) * 2 + posY * lda; else ao9 = a + posY * 2 + (posX + 8) * lda;
+    if (offset > -9) ao10 = a + (posX + 9) * 2 + posY * lda; else ao10 = a + posY * 2 + (posX + 9) * lda;
+    if (offset > -10) ao11 = a + (posX + 10) * 2 + posY * lda; else ao11 = a + posY * 2 + (posX + 10) * lda;
+    if (offset > -11) ao12 = a + (posX + 11) * 2 + posY * lda; else ao12 = a + posY * 2 + (posX + 11) * lda;
+    if (offset > -12) ao13 = a + (posX + 12) * 2 + posY * lda; else ao13 = a + posY * 2 + (posX + 12) * lda;
+    if (offset > -13) ao14 = a + (posX + 13) * 2 + posY * lda; else ao14 = a + posY * 2 + (posX + 13) * lda;
+    if (offset > -14) ao15 = a + (posX + 14) * 2 + posY * lda; else ao15 = a + posY * 2 + (posX + 14) * lda;
+    if (offset > -15) ao16 = a + (posX + 15) * 2 + posY * lda; else ao16 = a + posY * 2 + (posX + 15) * lda;
+    /* NOTE(review): if a is column-major, this selection keeps every read on or below the diagonal (row >= column) - confirm against the caller. */
+
+    i     = m;
+    /* Each pass emits one packed row: read 16 pairs, advance the cursors, store 32 FLOATs. */
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+      data03 = *(ao2 + 0);
+      data04 = *(ao2 + 1);
+      data05 = *(ao3 + 0);
+      data06 = *(ao3 + 1);
+      data07 = *(ao4 + 0);
+      data08 = *(ao4 + 1);
+      data09 = *(ao5 + 0);
+      data10 = *(ao5 + 1);
+      data11 = *(ao6 + 0);
+      data12 = *(ao6 + 1);
+      data13 = *(ao7 + 0);
+      data14 = *(ao7 + 1);
+      data15 = *(ao8 + 0);
+      data16 = *(ao8 + 1);
+      data17 = *(ao9 + 0);
+      data18 = *(ao9 + 1);
+      data19 = *(ao10 + 0);
+      data20 = *(ao10 + 1);
+      data21 = *(ao11 + 0);
+      data22 = *(ao11 + 1);
+      data23 = *(ao12 + 0);
+      data24 = *(ao12 + 1);
+      data25 = *(ao13 + 0);
+      data26 = *(ao13 + 1);
+      data27 = *(ao14 + 0);
+      data28 = *(ao14 + 1);
+      data29 = *(ao15 + 0);
+      data30 = *(ao15 + 1);
+      data31 = *(ao16 + 0);
+      data32 = *(ao16 + 1);
+      /* Cursor k steps by lda while offset > -k, then by 2 (next pair); offset drops by one per pass, so ao1 switches first, then ao2, and so on. */
+      if (offset >   0) ao1 += lda; else ao1 += 2;
+      if (offset >  -1) ao2 += lda; else ao2 += 2;
+      if (offset >  -2) ao3 += lda; else ao3 += 2;
+      if (offset >  -3) ao4 += lda; else ao4 += 2;
+      if (offset >  -4) ao5 += lda; else ao5 += 2;
+      if (offset >  -5) ao6 += lda; else ao6 += 2;
+      if (offset >  -6) ao7 += lda; else ao7 += 2;
+      if (offset >  -7) ao8 += lda; else ao8 += 2;
+      if (offset >  -8) ao9 += lda; else ao9 += 2;
+      if (offset >  -9) ao10 += lda; else ao10 += 2;
+      if (offset >  -10) ao11 += lda; else ao11 += 2;
+      if (offset >  -11) ao12 += lda; else ao12 += 2;
+      if (offset >  -12) ao13 += lda; else ao13 += 2;
+      if (offset >  -13) ao14 += lda; else ao14 += 2;
+      if (offset >  -14) ao15 += lda; else ao15 += 2;
+      if (offset >  -15) ao16 += lda; else ao16 += 2;
+      /* Store the staged row contiguously into the packed buffer, in cursor order. */
+      b[ 0] = data01;
+      b[ 1] = data02;
+      b[ 2] = data03;
+      b[ 3] = data04;
+      b[ 4] = data05;
+      b[ 5] = data06;
+      b[ 6] = data07;
+      b[ 7] = data08;
+      b[ 8] = data09;
+      b[ 9] = data10;
+      b[10] = data11;
+      b[11] = data12;
+      b[12] = data13;
+      b[13] = data14;
+      b[14] = data15;
+      b[15] = data16;
+      b[16] = data17;
+      b[17] = data18;
+      b[18] = data19;
+      b[19] = data20;
+      b[20] = data21;
+      b[21] = data22;
+      b[22] = data23;
+      b[23] = data24;
+      b[24] = data25;
+      b[25] = data26;
+      b[26] = data27;
+      b[27] = data28;
+      b[28] = data29;
+      b[29] = data30;
+      b[30] = data31;
+      b[31] = data32;
+
+      b += 32; /* 16 pairs were written */
+
+      offset --;
+      i --;
+    }
+
+    posX += 16; /* move on to the next 16-column panel */
+    js --;
+  }
+
+  if (n & 8) { /* leftover panel of 8 columns: same scheme with cursors ao1..ao8 */
+    offset = posX - posY;
+
+    if (offset >  0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
+    if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
+    if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda;
+    if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda;
+    if (offset > -4) ao5 = a + (posX + 4) * 2 + posY * lda; else ao5 = a + posY * 2 + (posX + 4) * lda;
+    if (offset > -5) ao6 = a + (posX + 5) * 2 + posY * lda; else ao6 = a + posY * 2 + (posX + 5) * lda;
+    if (offset > -6) ao7 = a + (posX + 6) * 2 + posY * lda; else ao7 = a + posY * 2 + (posX + 6) * lda;
+    if (offset > -7) ao8 = a + (posX + 7) * 2 + posY * lda; else ao8 = a + posY * 2 + (posX + 7) * lda;
+
+    i     = m;
+
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+      data03 = *(ao2 + 0);
+      data04 = *(ao2 + 1);
+      data05 = *(ao3 + 0);
+      data06 = *(ao3 + 1);
+      data07 = *(ao4 + 0);
+      data08 = *(ao4 + 1);
+      data09 = *(ao5 + 0);
+      data10 = *(ao5 + 1);
+      data11 = *(ao6 + 0);
+      data12 = *(ao6 + 1);
+      data13 = *(ao7 + 0);
+      data14 = *(ao7 + 1);
+      data15 = *(ao8 + 0);
+      data16 = *(ao8 + 1);
+
+      if (offset >   0) ao1 += lda; else ao1 += 2;
+      if (offset >  -1) ao2 += lda; else ao2 += 2;
+      if (offset >  -2) ao3 += lda; else ao3 += 2;
+      if (offset >  -3) ao4 += lda; else ao4 += 2;
+      if (offset >  -4) ao5 += lda; else ao5 += 2;
+      if (offset >  -5) ao6 += lda; else ao6 += 2;
+      if (offset >  -6) ao7 += lda; else ao7 += 2;
+      if (offset >  -7) ao8 += lda; else ao8 += 2;
+
+      b[ 0] = data01;
+      b[ 1] = data02;
+      b[ 2] = data03;
+      b[ 3] = data04;
+      b[ 4] = data05;
+      b[ 5] = data06;
+      b[ 6] = data07;
+      b[ 7] = data08;
+      b[ 8] = data09;
+      b[ 9] = data10;
+      b[10] = data11;
+      b[11] = data12;
+      b[12] = data13;
+      b[13] = data14;
+      b[14] = data15;
+      b[15] = data16;
+
+      b += 16; /* 8 pairs were written */
+
+      offset --;
+      i --;
+    }
+
+    posX += 8;
+  }
+
+  if (n & 4) { /* leftover panel of 4 columns */
+    offset = posX - posY;
+
+    if (offset >  0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
+    if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
+    if (offset > -2) ao3 = a + (posX + 2) * 2 + posY * lda; else ao3 = a + posY * 2 + (posX + 2) * lda;
+    if (offset > -3) ao4 = a + (posX + 3) * 2 + posY * lda; else ao4 = a + posY * 2 + (posX + 3) * lda;
+
+    i     = m;
+
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+      data03 = *(ao2 + 0);
+      data04 = *(ao2 + 1);
+      data05 = *(ao3 + 0);
+      data06 = *(ao3 + 1);
+      data07 = *(ao4 + 0);
+      data08 = *(ao4 + 1);
+
+      if (offset >   0) ao1 += lda; else ao1 += 2;
+      if (offset >  -1) ao2 += lda; else ao2 += 2;
+      if (offset >  -2) ao3 += lda; else ao3 += 2;
+      if (offset >  -3) ao4 += lda; else ao4 += 2;
+
+      b[ 0] = data01;
+      b[ 1] = data02;
+      b[ 2] = data03;
+      b[ 3] = data04;
+      b[ 4] = data05;
+      b[ 5] = data06;
+      b[ 6] = data07;
+      b[ 7] = data08;
+
+      b += 8; /* 4 pairs were written */
+
+      offset --;
+      i --;
+    }
+
+    posX += 4;
+  }
+
+  if (n & 2) { /* leftover panel of 2 columns */
+
+    offset = posX - posY;
+
+    if (offset >  0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
+    if (offset > -1) ao2 = a + (posX + 1) * 2 + posY * lda; else ao2 = a + posY * 2 + (posX + 1) * lda;
+
+    i     = m;
+
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+      data03 = *(ao2 + 0);
+      data04 = *(ao2 + 1);
+
+      if (offset >   0) ao1 += lda; else ao1 += 2;
+      if (offset >  -1) ao2 += lda; else ao2 += 2;
+
+      b[ 0] = data01;
+      b[ 1] = data02;
+      b[ 2] = data03;
+      b[ 3] = data04;
+
+      b += 4; /* 2 pairs were written */
+
+      offset --;
+      i --;
+    }
+
+    posX += 2;
+
+  }
+
+  if (n & 1) { /* final single column; posX is not advanced because nothing follows */
+
+    offset = posX - posY;
+
+    if (offset >  0) ao1 = a + (posX + 0) * 2 + posY * lda; else ao1 = a + posY * 2 + (posX + 0) * lda;
+
+    i     = m;
+
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+
+      if (offset >   0) ao1 += lda; else ao1 += 2;
+
+      b[ 0] = data01;
+      b[ 1] = data02;
+
+      b += 2; /* 1 pair was written */
+
+      offset --;
+      i --;
+    }
+
+  }
+
+  return 0;
+}
diff --git a/kernel/generic/zsymm_ucopy_16.c b/kernel/generic/zsymm_ucopy_16.c
new file mode 100644
index 000000000..cb19bea47
--- /dev/null
+++ b/kernel/generic/zsymm_ucopy_16.c
@@ -0,0 +1,332 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+  /* Packs FLOAT pairs from a into b in column panels of 16, 8, 4, 2 and 1 (chosen by the bits of n), m pairs per column; always returns 0. */
+  BLASLONG i, js, offset;
+  /* Staging for one packed row: ao1 fills data01/data02, ao2 fills data03/data04, and so on up to ao16. */
+  FLOAT data01, data02, data03, data04, data05, data06, data07, data08;
+  FLOAT data09, data10, data11, data12, data13, data14, data15, data16;
+  FLOAT data17, data18, data19, data20, data21, data22, data23, data24;
+  FLOAT data25, data26, data27, data28, data29, data30, data31, data32;
+  /* One read cursor per column of the current panel. */
+  FLOAT *ao1, *ao2,  *ao3,  *ao4,  *ao5,  *ao6,  *ao7,  *ao8;
+  FLOAT *ao9, *ao10, *ao11, *ao12, *ao13, *ao14, *ao15, *ao16;
+  /* Every element is read as two FLOATs (presumably real and imaginary part), so the stride is doubled to FLOAT units. */
+  lda *= 2;
+
+  js = (n >> 4); /* number of full 16-column panels */
+  while (js > 0){
+
+    offset = posX - posY;
+    /* Cursor k (0-based) starts at pair posY of line (posX + k) while offset > -k, otherwise at the swapped position: pair (posX + k) of line posY. */
+    if (offset >  0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
+    if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
+    if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda;
+    if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda;
+    if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda;
+    if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda;
+    if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda;
+    if (offset > -7) ao8 = a + posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda;
+    if (offset > -8) ao9 = a + posY * 2 + (posX + 8) * lda; else ao9 = a + (posX + 8) * 2 + posY * lda;
+    if (offset > -9) ao10 = a + posY * 2 + (posX + 9) * lda; else ao10 = a + (posX + 9) * 2 + posY * lda;
+    if (offset > -10) ao11 = a + posY * 2 + (posX + 10) * lda; else ao11 = a + (posX + 10) * 2 + posY * lda;
+    if (offset > -11) ao12 = a + posY * 2 + (posX + 11) * lda; else ao12 = a + (posX + 11) * 2 + posY * lda;
+    if (offset > -12) ao13 = a + posY * 2 + (posX + 12) * lda; else ao13 = a + (posX + 12) * 2 + posY * lda;
+    if (offset > -13) ao14 = a + posY * 2 + (posX + 13) * lda; else ao14 = a + (posX + 13) * 2 + posY * lda;
+    if (offset > -14) ao15 = a + posY * 2 + (posX + 14) * lda; else ao15 = a + (posX + 14) * 2 + posY * lda;
+    if (offset > -15) ao16 = a + posY * 2 + (posX + 15) * lda; else ao16 = a + (posX + 15) * 2 + posY * lda;
+    /* NOTE(review): if a is column-major, this selection keeps every read on or above the diagonal (row <= column) - confirm against the caller. */
+    i     = m;
+    /* Each pass emits one packed row: read 16 pairs, advance the cursors, store 32 FLOATs. */
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+      data03 = *(ao2 + 0);
+      data04 = *(ao2 + 1);
+      data05 = *(ao3 + 0);
+      data06 = *(ao3 + 1);
+      data07 = *(ao4 + 0);
+      data08 = *(ao4 + 1);
+      data09 = *(ao5 + 0);
+      data10 = *(ao5 + 1);
+      data11 = *(ao6 + 0);
+      data12 = *(ao6 + 1);
+      data13 = *(ao7 + 0);
+      data14 = *(ao7 + 1);
+      data15 = *(ao8 + 0);
+      data16 = *(ao8 + 1);
+      data17 = *(ao9 + 0);
+      data18 = *(ao9 + 1);
+      data19 = *(ao10 + 0);
+      data20 = *(ao10 + 1);
+      data21 = *(ao11 + 0);
+      data22 = *(ao11 + 1);
+      data23 = *(ao12 + 0);
+      data24 = *(ao12 + 1);
+      data25 = *(ao13 + 0);
+      data26 = *(ao13 + 1);
+      data27 = *(ao14 + 0);
+      data28 = *(ao14 + 1);
+      data29 = *(ao15 + 0);
+      data30 = *(ao15 + 1);
+      data31 = *(ao16 + 0);
+      data32 = *(ao16 + 1);
+      /* Cursor k steps by 2 (next pair) while offset > -k, then by lda; offset drops by one per pass, so ao1 switches first, then ao2, and so on. */
+      if (offset >   0) ao1 += 2; else ao1 += lda;
+      if (offset >  -1) ao2 += 2; else ao2 += lda;
+      if (offset >  -2) ao3 += 2; else ao3 += lda;
+      if (offset >  -3) ao4 += 2; else ao4 += lda;
+      if (offset >  -4) ao5 += 2; else ao5 += lda;
+      if (offset >  -5) ao6 += 2; else ao6 += lda;
+      if (offset >  -6) ao7 += 2; else ao7 += lda;
+      if (offset >  -7) ao8 += 2; else ao8 += lda;
+      if (offset >  -8) ao9 += 2; else ao9 += lda;
+      if (offset >  -9) ao10 += 2; else ao10 += lda;
+      if (offset >  -10) ao11 += 2; else ao11 += lda;
+      if (offset >  -11) ao12 += 2; else ao12 += lda;
+      if (offset >  -12) ao13 += 2; else ao13 += lda;
+      if (offset >  -13) ao14 += 2; else ao14 += lda;
+      if (offset >  -14) ao15 += 2; else ao15 += lda;
+      if (offset >  -15) ao16 += 2; else ao16 += lda;
+      /* Store the staged row contiguously into the packed buffer, in cursor order. */
+      b[ 0] = data01;
+      b[ 1] = data02;
+      b[ 2] = data03;
+      b[ 3] = data04;
+      b[ 4] = data05;
+      b[ 5] = data06;
+      b[ 6] = data07;
+      b[ 7] = data08;
+      b[ 8] = data09;
+      b[ 9] = data10;
+      b[10] = data11;
+      b[11] = data12;
+      b[12] = data13;
+      b[13] = data14;
+      b[14] = data15;
+      b[15] = data16;
+      b[16] = data17;
+      b[17] = data18;
+      b[18] = data19;
+      b[19] = data20;
+      b[20] = data21;
+      b[21] = data22;
+      b[22] = data23;
+      b[23] = data24;
+      b[24] = data25;
+      b[25] = data26;
+      b[26] = data27;
+      b[27] = data28;
+      b[28] = data29;
+      b[29] = data30;
+      b[30] = data31;
+      b[31] = data32;
+
+      b += 32; /* 16 pairs were written */
+
+      offset --;
+      i --;
+    }
+
+    posX += 16; /* move on to the next 16-column panel */
+    js --;
+  }
+
+  if (n & 8) { /* leftover panel of 8 columns: same scheme with cursors ao1..ao8 */
+    offset = posX - posY;
+
+    if (offset >  0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
+    if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
+    if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda;
+    if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda;
+    if (offset > -4) ao5 = a + posY * 2 + (posX + 4) * lda; else ao5 = a + (posX + 4) * 2 + posY * lda;
+    if (offset > -5) ao6 = a + posY * 2 + (posX + 5) * lda; else ao6 = a + (posX + 5) * 2 + posY * lda;
+    if (offset > -6) ao7 = a + posY * 2 + (posX + 6) * lda; else ao7 = a + (posX + 6) * 2 + posY * lda;
+    if (offset > -7) ao8 = a + posY * 2 + (posX + 7) * lda; else ao8 = a + (posX + 7) * 2 + posY * lda;
+
+    i     = m;
+
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+      data03 = *(ao2 + 0);
+      data04 = *(ao2 + 1);
+      data05 = *(ao3 + 0);
+      data06 = *(ao3 + 1);
+      data07 = *(ao4 + 0);
+      data08 = *(ao4 + 1);
+      data09 = *(ao5 + 0);
+      data10 = *(ao5 + 1);
+      data11 = *(ao6 + 0);
+      data12 = *(ao6 + 1);
+      data13 = *(ao7 + 0);
+      data14 = *(ao7 + 1);
+      data15 = *(ao8 + 0);
+      data16 = *(ao8 + 1);
+
+      if (offset >   0) ao1 += 2; else ao1 += lda;
+      if (offset >  -1) ao2 += 2; else ao2 += lda;
+      if (offset >  -2) ao3 += 2; else ao3 += lda;
+      if (offset >  -3) ao4 += 2; else ao4 += lda;
+      if (offset >  -4) ao5 += 2; else ao5 += lda;
+      if (offset >  -5) ao6 += 2; else ao6 += lda;
+      if (offset >  -6) ao7 += 2; else ao7 += lda;
+      if (offset >  -7) ao8 += 2; else ao8 += lda;
+
+      b[ 0] = data01;
+      b[ 1] = data02;
+      b[ 2] = data03;
+      b[ 3] = data04;
+      b[ 4] = data05;
+      b[ 5] = data06;
+      b[ 6] = data07;
+      b[ 7] = data08;
+      b[ 8] = data09;
+      b[ 9] = data10;
+      b[10] = data11;
+      b[11] = data12;
+      b[12] = data13;
+      b[13] = data14;
+      b[14] = data15;
+      b[15] = data16;
+
+      b += 16; /* 8 pairs were written */
+
+      offset --;
+      i --;
+    }
+
+    posX += 8;
+  }
+
+  if (n & 4) { /* leftover panel of 4 columns */
+
+    offset = posX - posY;
+
+    if (offset >  0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
+    if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
+    if (offset > -2) ao3 = a + posY * 2 + (posX + 2) * lda; else ao3 = a + (posX + 2) * 2 + posY * lda;
+    if (offset > -3) ao4 = a + posY * 2 + (posX + 3) * lda; else ao4 = a + (posX + 3) * 2 + posY * lda;
+
+    i     = m;
+
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+      data03 = *(ao2 + 0);
+      data04 = *(ao2 + 1);
+      data05 = *(ao3 + 0);
+      data06 = *(ao3 + 1);
+      data07 = *(ao4 + 0);
+      data08 = *(ao4 + 1);
+
+      if (offset >   0) ao1 += 2; else ao1 += lda;
+      if (offset >  -1) ao2 += 2; else ao2 += lda;
+      if (offset >  -2) ao3 += 2; else ao3 += lda;
+      if (offset >  -3) ao4 += 2; else ao4 += lda;
+
+      b[ 0] = data01;
+      b[ 1] = data02;
+      b[ 2] = data03;
+      b[ 3] = data04;
+      b[ 4] = data05;
+      b[ 5] = data06;
+      b[ 6] = data07;
+      b[ 7] = data08;
+
+      b += 8; /* 4 pairs were written */
+
+      offset --;
+      i --;
+    }
+
+    posX += 4;
+  }
+
+  if (n & 2) { /* leftover panel of 2 columns */
+
+    offset = posX - posY;
+
+    if (offset >  0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
+    if (offset > -1) ao2 = a + posY * 2 + (posX + 1) * lda; else ao2 = a + (posX + 1) * 2 + posY * lda;
+
+    i     = m;
+
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+      data03 = *(ao2 + 0);
+      data04 = *(ao2 + 1);
+
+      if (offset >   0) ao1 += 2; else ao1 += lda;
+      if (offset >  -1) ao2 += 2; else ao2 += lda;
+
+      b[ 0] = data01;
+      b[ 1] = data02;
+      b[ 2] = data03;
+      b[ 3] = data04;
+
+      b += 4; /* 2 pairs were written */
+
+      offset --;
+      i --;
+    }
+
+    posX += 2;
+  }
+
+  if (n & 1) { /* final single column; posX is not advanced because nothing follows */
+
+    offset = posX - posY;
+
+    if (offset >  0) ao1 = a + posY * 2 + (posX + 0) * lda; else ao1 = a + (posX + 0) * 2 + posY * lda;
+
+    i     = m;
+
+    while (i > 0) {
+      data01 = *(ao1 + 0);
+      data02 = *(ao1 + 1);
+
+      if (offset >   0) ao1 += 2; else ao1 += lda;
+
+      b[ 0] = data01;
+      b[ 1] = data02;
+
+      b += 2; /* 1 pair was written */
+
+      offset --;
+      i --;
+    }
+
+  }
+
+  return 0;
+}
diff --git a/kernel/generic/ztrmm_lncopy_16.c b/kernel/generic/ztrmm_lncopy_16.c
new file mode 100644
index 000000000..d7fb23176
--- /dev/null
+++ b/kernel/generic/ztrmm_lncopy_16.c
@@ -0,0 +1,2310 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+  BLASLONG i, js;
+  BLASLONG X, ii;
+
+  FLOAT *a01, *a02, *a03, *a04, *a05, *a06, *a07, *a08;
+  FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16;
+
+  lda += lda;
+
+  js = (n >> 4);
+
+  if (js > 0){
+    do {
+      X = posX;
+
+      if (posX <= posY) {
+	a01 = a + posY * 2 + (posX + 0) * lda;
+	a02 = a + posY * 2 + (posX + 1) * lda;
+	a03 = a + posY * 2 + (posX + 2) * lda;
+	a04 = a + posY * 2 + (posX + 3) * lda;
+	a05 = a + posY * 2 + (posX + 4) * lda;
+	a06 = a + posY * 2 + (posX + 5) * lda;
+	a07 = a + posY * 2 + (posX + 6) * lda;
+	a08 = a + posY * 2 + (posX + 7) * lda;
+	a09 = a + posY * 2 + (posX + 8) * lda;
+	a10 = a + posY * 2 + (posX + 9) * lda;
+	a11 = a + posY * 2 + (posX + 10) * lda;
+	a12 = a + posY * 2 + (posX + 11) * lda;
+	a13 = a + posY * 2 + (posX + 12) * lda;
+	a14 = a + posY * 2 + (posX + 13) * lda;
+	a15 = a + posY * 2 + (posX + 14) * lda;
+	a16 = a + posY * 2 + (posX + 15) * lda;
+      } else {
+	a01 = a + posX * 2 + (posY + 0) * lda;
+	a02 = a + posX * 2 + (posY + 1) * lda;
+	a03 = a + posX * 2 + (posY + 2) * lda;
+	a04 = a + posX * 2 + (posY + 3) * lda;
+	a05 = a + posX * 2 + (posY + 4) * lda;
+	a06 = a + posX * 2 + (posY + 5) * lda;
+	a07 = a + posX * 2 + (posY + 6) * lda;
+	a08 = a + posX * 2 + (posY + 7) * lda;
+	a09 = a + posX * 2 + (posY + 8) * lda;
+	a10 = a + posX * 2 + (posY + 9) * lda;
+	a11 = a + posX * 2 + (posY + 10) * lda;
+	a12 = a + posX * 2 + (posY + 11) * lda;
+	a13 = a + posX * 2 + (posY + 12) * lda;
+	a14 = a + posX * 2 + (posY + 13) * lda;
+	a15 = a + posX * 2 + (posY + 14) * lda;
+	a16 = a + posX * 2 + (posY + 15) * lda;
+      }
+
+      i = (m >> 4);
+      if (i > 0) {
+	do {
+	  if (X > posY) {
+	    for (ii = 0; ii < 16; ii++){
+
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+	      b[  2] = *(a02 +  0);
+	      b[  3] = *(a02 +  1);
+	      b[  4] = *(a03 +  0);
+	      b[  5] = *(a03 +  1);
+	      b[  6] = *(a04 +  0);
+	      b[  7] = *(a04 +  1);
+
+	      b[  8] = *(a05 +  0);
+	      b[  9] = *(a05 +  1);
+	      b[ 10] = *(a06 +  0);
+	      b[ 11] = *(a06 +  1);
+	      b[ 12] = *(a07 +  0);
+	      b[ 13] = *(a07 +  1);
+	      b[ 14] = *(a08 +  0);
+	      b[ 15] = *(a08 +  1);
+
+		  b[ 16] = *(a09 +  0);
+	      b[ 17] = *(a09 +  1);
+	      b[ 18] = *(a10 +  0);
+	      b[ 19] = *(a10 +  1);
+	      b[ 20] = *(a11 +  0);
+	      b[ 21] = *(a11 +  1);
+	      b[ 22] = *(a12 +  0);
+	      b[ 23] = *(a12 +  1);
+
+	      b[ 24] = *(a13 +  0);
+	      b[ 25] = *(a13 +  1);
+	      b[ 26] = *(a14 +  0);
+	      b[ 27] = *(a14 +  1);
+	      b[ 28] = *(a15 +  0);
+	      b[ 29] = *(a15 +  1);
+	      b[ 30] = *(a16 +  0);
+	      b[ 31] = *(a16 +  1);
+
+	      a01 += 2;
+	      a02 += 2;
+	      a03 += 2;
+	      a04 += 2;
+	      a05 += 2;
+	      a06 += 2;
+	      a07 += 2;
+	      a08 += 2;
+		  a09 += 2;
+	      a10 += 2;
+	      a11 += 2;
+	      a12 += 2;
+	      a13 += 2;
+	      a14 += 2;
+	      a15 += 2;
+	      a16 += 2;
+	      b += 32;
+	    }
+	  } else
+	    if (X < posY) {
+	      a01 += 16 * lda;
+	      a02 += 16 * lda;
+	      a03 += 16 * lda;
+	      a04 += 16 * lda;
+	      a05 += 16 * lda;
+	      a06 += 16 * lda;
+	      a07 += 16 * lda;
+	      a08 += 16 * lda;
+		  a09 += 16 * lda;
+	      a10 += 16 * lda;
+	      a11 += 16 * lda;
+	      a12 += 16 * lda;
+	      a13 += 16 * lda;
+	      a14 += 16 * lda;
+	      a15 += 16 * lda;
+	      a16 += 16 * lda;
+
+	      b += 512;
+
+	    } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = ZERO;
+	      b[  3] = ZERO;
+	      b[  4] = ZERO;
+	      b[  5] = ZERO;
+	      b[  6] = ZERO;
+	      b[  7] = ZERO;
+	      b[  8] = ZERO;
+	      b[  9] = ZERO;
+	      b[ 10] = ZERO;
+	      b[ 11] = ZERO;
+	      b[ 12] = ZERO;
+	      b[ 13] = ZERO;
+	      b[ 14] = ZERO;
+	      b[ 15] = ZERO;
+		  b[ 16] = ZERO;
+	      b[ 17] = ZERO;
+	      b[ 18] = ZERO;
+	      b[ 19] = ZERO;
+	      b[ 20] = ZERO;
+	      b[ 21] = ZERO;
+	      b[ 22] = ZERO;
+	      b[ 23] = ZERO;
+	      b[ 24] = ZERO;
+	      b[ 25] = ZERO;
+	      b[ 26] = ZERO;
+	      b[ 27] = ZERO;
+	      b[ 28] = ZERO;
+	      b[ 29] = ZERO;
+		  b[ 30] = ZERO;
+	      b[ 31] = ZERO;
+
+	      b[ 32] = *(a01 +  2);
+	      b[ 33] = *(a01 +  3);
+#ifdef UNIT
+	      b[ 34] = ONE;
+	      b[ 35] = ZERO;
+#else
+	      b[ 34] = *(a02 +  2);
+	      b[ 35] = *(a02 +  3);
+#endif
+	      b[ 36] = ZERO;
+	      b[ 37] = ZERO;
+	      b[ 38] = ZERO;
+	      b[ 39] = ZERO;
+	      b[ 40] = ZERO;
+	      b[ 41] = ZERO;
+	      b[ 42] = ZERO;
+	      b[ 43] = ZERO;
+	      b[ 44] = ZERO;
+	      b[ 45] = ZERO;
+	      b[ 46] = ZERO;
+	      b[ 47] = ZERO;
+	      b[ 48] = ZERO;
+	      b[ 49] = ZERO;
+	      b[ 50] = ZERO;
+	      b[ 51] = ZERO;
+	      b[ 52] = ZERO;
+	      b[ 53] = ZERO;
+	      b[ 54] = ZERO;
+	      b[ 55] = ZERO;
+	      b[ 56] = ZERO;
+	      b[ 57] = ZERO;
+	      b[ 58] = ZERO;
+	      b[ 59] = ZERO;
+	      b[ 60] = ZERO;
+	      b[ 61] = ZERO;
+	      b[ 62] = ZERO;
+	      b[ 63] = ZERO;
+
+	      b[ 64] = *(a01 +  4);
+	      b[ 65] = *(a01 +  5);
+	      b[ 66] = *(a02 +  4);
+	      b[ 67] = *(a02 +  5);
+#ifdef UNIT
+	      b[ 68] = ONE;
+	      b[ 69] = ZERO;
+#else
+	      b[ 68] = *(a03 +  4);
+	      b[ 69] = *(a03 +  5);
+#endif
+	      b[ 70] = ZERO;
+	      b[ 71] = ZERO;
+	      b[ 72] = ZERO;
+	      b[ 73] = ZERO;
+	      b[ 74] = ZERO;
+	      b[ 75] = ZERO;
+	      b[ 76] = ZERO;
+	      b[ 77] = ZERO;
+	      b[ 78] = ZERO;
+	      b[ 79] = ZERO;
+	      b[ 80] = ZERO;
+	      b[ 81] = ZERO;
+	      b[ 82] = ZERO;
+		  b[ 83] = ZERO;
+	      b[ 84] = ZERO;
+	      b[ 85] = ZERO;
+	      b[ 86] = ZERO;
+	      b[ 87] = ZERO;
+	      b[ 88] = ZERO;
+	      b[ 89] = ZERO;
+	      b[ 90] = ZERO;
+	      b[ 91] = ZERO;
+	      b[ 92] = ZERO;
+	      b[ 93] = ZERO;
+	      b[ 94] = ZERO;
+	      b[ 95] = ZERO;
+
+	      b[ 96] = *(a01 +  6);
+	      b[ 97] = *(a01 +  7);
+	      b[ 98] = *(a02 +  6);
+	      b[ 99] = *(a02 +  7);
+	      b[100] = *(a03 +  6);
+	      b[101] = *(a03 +  7);
+#ifdef UNIT
+	      b[102] = ONE;
+	      b[103] = ZERO;
+#else
+	      b[102] = *(a04 +  6);
+	      b[103] = *(a04 +  7);
+#endif
+	      b[104] = ZERO;
+	      b[105] = ZERO;
+	      b[106] = ZERO;
+	      b[107] = ZERO;
+	      b[108] = ZERO;
+	      b[109] = ZERO;
+	      b[110] = ZERO;
+	      b[111] = ZERO;
+	      b[112] = ZERO;
+	      b[113] = ZERO;
+	      b[114] = ZERO;
+	      b[115] = ZERO;
+		  b[116] = ZERO;
+	      b[117] = ZERO;
+	      b[118] = ZERO;
+	      b[119] = ZERO;
+	      b[120] = ZERO;
+	      b[121] = ZERO;
+	      b[122] = ZERO;
+	      b[123] = ZERO;
+	      b[124] = ZERO;
+	      b[125] = ZERO;
+	      b[126] = ZERO;
+	      b[127] = ZERO;
+
+	      b[128] = *(a01 +  8);
+	      b[129] = *(a01 +  9);
+	      b[130] = *(a02 +  8);
+	      b[131] = *(a02 +  9);
+	      b[132] = *(a03 +  8);
+	      b[133] = *(a03 +  9);
+	      b[134] = *(a04 +  8);
+	      b[135] = *(a04 +  9);
+#ifdef UNIT
+	      b[136] = ONE;
+	      b[137] = ZERO;
+#else
+	      b[136] = *(a05 +  8);
+	      b[137] = *(a05 +  9);
+#endif
+	      b[138] = ZERO;
+	      b[139] = ZERO;
+	      b[140] = ZERO;
+	      b[141] = ZERO;
+	      b[142] = ZERO;
+	      b[143] = ZERO;
+	      b[144] = ZERO;
+	      b[145] = ZERO;
+	      b[146] = ZERO;
+	      b[147] = ZERO;
+	      b[148] = ZERO;
+		  b[149] = ZERO;
+	      b[150] = ZERO;
+	      b[151] = ZERO;
+	      b[152] = ZERO;
+	      b[153] = ZERO;
+	      b[154] = ZERO;
+	      b[155] = ZERO;
+	      b[156] = ZERO;
+	      b[157] = ZERO;
+	      b[158] = ZERO;
+	      b[159] = ZERO;
+
+	      b[160] = *(a01 + 10);
+	      b[161] = *(a01 + 11);
+	      b[162] = *(a02 + 10);
+	      b[163] = *(a02 + 11);
+	      b[164] = *(a03 + 10);
+	      b[165] = *(a03 + 11);
+	      b[166] = *(a04 + 10);
+	      b[167] = *(a04 + 11);
+	      b[168] = *(a05 + 10);
+	      b[169] = *(a05 + 11);
+#ifdef UNIT
+	      b[170] = ONE;
+	      b[171] = ZERO;
+#else
+	      b[170] = *(a06 + 10);
+	      b[171] = *(a06 + 11);
+#endif
+	      b[172] = ZERO;
+	      b[173] = ZERO;
+	      b[174] = ZERO;
+	      b[175] = ZERO;
+	      b[176] = ZERO;
+	      b[177] = ZERO;
+	      b[178] = ZERO;
+	      b[179] = ZERO;
+	      b[180] = ZERO;
+	      b[181] = ZERO;
+		  b[182] = ZERO;
+	      b[183] = ZERO;
+	      b[184] = ZERO;
+	      b[185] = ZERO;
+	      b[186] = ZERO;
+	      b[187] = ZERO;
+	      b[188] = ZERO;
+	      b[189] = ZERO;
+	      b[190] = ZERO;
+	      b[191] = ZERO;
+
+	      b[192] = *(a01 + 12);
+	      b[193] = *(a01 + 13);
+	      b[194] = *(a02 + 12);
+	      b[195] = *(a02 + 13);
+	      b[196] = *(a03 + 12);
+	      b[197] = *(a03 + 13);
+	      b[198] = *(a04 + 12);
+	      b[199] = *(a04 + 13);
+	      b[200] = *(a05 + 12);
+	      b[201] = *(a05 + 13);
+	      b[202] = *(a06 + 12);
+	      b[203] = *(a06 + 13);
+#ifdef UNIT
+	      b[204] = ONE;
+	      b[205] = ZERO;
+#else
+	      b[204] = *(a07 + 12);
+	      b[205] = *(a07 + 13);
+#endif
+	      b[206] = ZERO;
+	      b[207] = ZERO;
+	      b[208] = ZERO;
+	      b[209] = ZERO;
+	      b[210] = ZERO;
+	      b[211] = ZERO;
+	      b[212] = ZERO;
+	      b[213] = ZERO;
+	      b[214] = ZERO;
+		  b[215] = ZERO;
+	      b[216] = ZERO;
+	      b[217] = ZERO;
+	      b[218] = ZERO;
+	      b[219] = ZERO;
+	      b[220] = ZERO;
+	      b[221] = ZERO;
+	      b[222] = ZERO;
+	      b[223] = ZERO;
+
+	      b[224] = *(a01 + 14);
+	      b[225] = *(a01 + 15);
+	      b[226] = *(a02 + 14);
+	      b[227] = *(a02 + 15);
+	      b[228] = *(a03 + 14);
+	      b[229] = *(a03 + 15);
+	      b[230] = *(a04 + 14);
+	      b[231] = *(a04 + 15);
+	      b[232] = *(a05 + 14);
+	      b[233] = *(a05 + 15);
+	      b[234] = *(a06 + 14);
+	      b[235] = *(a06 + 15);
+	      b[236] = *(a07 + 14);
+	      b[237] = *(a07 + 15);
+#ifdef UNIT
+	      b[238] = ONE;
+	      b[239] = ZERO;
+#else
+	      b[238] = *(a08 + 14);
+	      b[239] = *(a08 + 15);
+#endif
+		  b[240] = ZERO;
+	      b[241] = ZERO;
+	      b[242] = ZERO;
+	      b[243] = ZERO;
+	      b[244] = ZERO;
+	      b[245] = ZERO;
+	      b[246] = ZERO;
+	      b[247] = ZERO;
+		  b[248] = ZERO;
+	      b[249] = ZERO;
+	      b[250] = ZERO;
+	      b[251] = ZERO;
+	      b[252] = ZERO;
+	      b[253] = ZERO;
+	      b[254] = ZERO;
+	      b[255] = ZERO;
+
+	      b[256] = *(a01 + 16);
+	      b[257] = *(a01 + 17);
+	      b[258] = *(a02 + 16);
+	      b[259] = *(a02 + 17);
+	      b[260] = *(a03 + 16);
+	      b[261] = *(a03 + 17);
+	      b[262] = *(a04 + 16);
+	      b[263] = *(a04 + 17);
+	      b[264] = *(a05 + 16);
+	      b[265] = *(a05 + 17);
+	      b[266] = *(a06 + 16);
+	      b[267] = *(a06 + 17);
+	      b[268] = *(a07 + 16);
+	      b[269] = *(a07 + 17);
+	      b[270] = *(a08 + 16);
+	      b[271] = *(a08 + 17);
+#ifdef UNIT
+	      b[272] = ONE;
+		  b[273] = ZERO;
+#else
+	      b[272] = *(a09 + 16);
+		  b[273] = *(a09 + 17);
+#endif
+		  b[274] = ZERO;
+	      b[275] = ZERO;
+	      b[276] = ZERO;
+	      b[277] = ZERO;
+	      b[278] = ZERO;
+	      b[279] = ZERO;
+	      b[280] = ZERO;
+		  b[281] = ZERO;
+	      b[282] = ZERO;
+	      b[283] = ZERO;
+	      b[284] = ZERO;
+	      b[285] = ZERO;
+	      b[286] = ZERO;
+	      b[287] = ZERO;
+
+		  b[288] = *(a01 + 18);
+		  b[289] = *(a01 + 19);
+	      b[290] = *(a02 + 18);
+	      b[291] = *(a02 + 19);
+	      b[292] = *(a03 + 18);
+	      b[293] = *(a03 + 19);
+	      b[294] = *(a04 + 18);
+	      b[295] = *(a04 + 19);
+	      b[296] = *(a05 + 18);
+	      b[297] = *(a05 + 19);
+	      b[298] = *(a06 + 18);
+	      b[299] = *(a06 + 19);
+	      b[300] = *(a07 + 18);
+	      b[301] = *(a07 + 19);
+	      b[302] = *(a08 + 18);
+	      b[303] = *(a08 + 19);
+	      b[304] = *(a09 + 18);
+	      b[305] = *(a09 + 19);
+#ifdef UNIT
+	      b[306] = ONE;
+		  b[307] = ZERO;
+#else
+	      b[306] = *(a10 + 18);
+		  b[307] = *(a10 + 19);
+#endif
+		  b[308] = ZERO;
+	      b[309] = ZERO;
+	      b[310] = ZERO;
+	      b[311] = ZERO;
+	      b[312] = ZERO;
+	      b[313] = ZERO;
+		  b[314] = ZERO;
+	      b[315] = ZERO;
+	      b[316] = ZERO;
+	      b[317] = ZERO;
+	      b[318] = ZERO;
+	      b[319] = ZERO;
+
+		  b[320] = *(a01 + 20);
+		  b[321] = *(a01 + 21);
+	      b[322] = *(a02 + 20);
+	      b[323] = *(a02 + 21);
+	      b[324] = *(a03 + 20);
+	      b[325] = *(a03 + 21);
+	      b[326] = *(a04 + 20);
+	      b[327] = *(a04 + 21);
+	      b[328] = *(a05 + 20);
+	      b[329] = *(a05 + 21);
+	      b[330] = *(a06 + 20);
+	      b[331] = *(a06 + 21);
+	      b[332] = *(a07 + 20);
+	      b[333] = *(a07 + 21);
+	      b[334] = *(a08 + 20);
+	      b[335] = *(a08 + 21);
+	      b[336] = *(a09 + 20);
+	      b[337] = *(a09 + 21);
+	      b[338] = *(a10 + 20);
+	      b[339] = *(a10 + 21);
+#ifdef UNIT
+	      b[340] = ONE;
+	      b[341] = ZERO;
+#else
+	      b[340] = *(a11 + 20);
+	      b[341] = *(a11 + 21);
+#endif
+		  b[342] = ZERO;
+	      b[343] = ZERO;
+	      b[344] = ZERO;
+	      b[345] = ZERO;
+	      b[346] = ZERO;
+		  b[347] = ZERO;
+	      b[348] = ZERO;
+	      b[349] = ZERO;
+	      b[350] = ZERO;
+	      b[351] = ZERO;
+
+		  b[352] = *(a01 + 22);
+		  b[353] = *(a01 + 23);
+	      b[354] = *(a02 + 22);
+	      b[355] = *(a02 + 23);
+	      b[356] = *(a03 + 22);
+	      b[357] = *(a03 + 23);
+	      b[358] = *(a04 + 22);
+	      b[359] = *(a04 + 23);
+	      b[360] = *(a05 + 22);
+	      b[361] = *(a05 + 23);
+	      b[362] = *(a06 + 22);
+	      b[363] = *(a06 + 23);
+	      b[364] = *(a07 + 22);
+	      b[365] = *(a07 + 23);
+	      b[366] = *(a08 + 22);
+	      b[367] = *(a08 + 23);
+	      b[368] = *(a09 + 22);
+	      b[369] = *(a09 + 23);
+	      b[370] = *(a10 + 22);
+	      b[371] = *(a10 + 23);
+	      b[372] = *(a11 + 22);
+	      b[373] = *(a11 + 23);
+#ifdef UNIT
+	      b[374] = ONE;
+	      b[375] = ZERO;
+#else
+	      b[374] = *(a12 + 22);
+	      b[375] = *(a12 + 23);
+#endif
+		  b[376] = ZERO;
+	      b[377] = ZERO;
+	      b[378] = ZERO;
+	      b[379] = ZERO;
+		  b[380] = ZERO;
+	      b[381] = ZERO;
+	      b[382] = ZERO;
+	      b[383] = ZERO;
+
+		  b[384] = *(a01 + 24);
+		  b[385] = *(a01 + 25);
+	      b[386] = *(a02 + 24);
+	      b[387] = *(a02 + 25);
+	      b[388] = *(a03 + 24);
+	      b[389] = *(a03 + 25);
+	      b[390] = *(a04 + 24);
+	      b[391] = *(a04 + 25);
+	      b[392] = *(a05 + 24);
+	      b[393] = *(a05 + 25);
+	      b[394] = *(a06 + 24);
+	      b[395] = *(a06 + 25);
+	      b[396] = *(a07 + 24);
+	      b[397] = *(a07 + 25);
+	      b[398] = *(a08 + 24);
+	      b[399] = *(a08 + 25);
+	      b[400] = *(a09 + 24);
+	      b[401] = *(a09 + 25);
+	      b[402] = *(a10 + 24);
+	      b[403] = *(a10 + 25);
+	      b[404] = *(a11 + 24);
+	      b[405] = *(a11 + 25);
+	      b[406] = *(a12 + 24);
+	      b[407] = *(a12 + 25);
+#ifdef UNIT
+	      b[408] = ONE;
+	      b[409] = ZERO;
+#else
+	      b[408] = *(a13 + 24);
+	      b[409] = *(a13 + 25);
+#endif
+		  b[410] = ZERO;
+	      b[411] = ZERO;
+	      b[412] = ZERO;
+		  b[413] = ZERO;
+	      b[414] = ZERO;
+	      b[415] = ZERO;
+
+		  b[416] = *(a01 + 26);
+		  b[417] = *(a01 + 27);
+	      b[418] = *(a02 + 26);
+	      b[419] = *(a02 + 27);
+	      b[420] = *(a03 + 26);
+	      b[421] = *(a03 + 27);
+	      b[422] = *(a04 + 26);
+	      b[423] = *(a04 + 27);
+	      b[424] = *(a05 + 26);
+	      b[425] = *(a05 + 27);
+	      b[426] = *(a06 + 26);
+	      b[427] = *(a06 + 27);
+	      b[428] = *(a07 + 26);
+	      b[429] = *(a07 + 27);
+	      b[430] = *(a08 + 26);
+	      b[431] = *(a08 + 27);
+	      b[432] = *(a09 + 26);
+	      b[433] = *(a09 + 27);
+	      b[434] = *(a10 + 26);
+	      b[435] = *(a10 + 27);
+	      b[436] = *(a11 + 26);
+	      b[437] = *(a11 + 27);
+	      b[438] = *(a12 + 26);
+	      b[439] = *(a12 + 27);
+	      b[440] = *(a13 + 26);
+	      b[441] = *(a13 + 27);
+#ifdef UNIT
+	      b[442] = ONE;
+	      b[443] = ZERO;
+#else
+	      b[442] = *(a14 + 26);
+	      b[443] = *(a14 + 27);
+#endif
+		  b[444] = ZERO;
+	      b[445] = ZERO;
+		  b[446] = ZERO;
+	      b[447] = ZERO;
+
+		  b[448] = *(a01 + 28);
+		  b[449] = *(a01 + 29);
+	      b[450] = *(a02 + 28);
+	      b[451] = *(a02 + 29);
+	      b[452] = *(a03 + 28);
+	      b[453] = *(a03 + 29);
+	      b[454] = *(a04 + 28);
+	      b[455] = *(a04 + 29);
+	      b[456] = *(a05 + 28);
+	      b[457] = *(a05 + 29);
+	      b[458] = *(a06 + 28);
+	      b[459] = *(a06 + 29);
+	      b[460] = *(a07 + 28);
+	      b[461] = *(a07 + 29);
+	      b[462] = *(a08 + 28);
+	      b[463] = *(a08 + 29);
+	      b[464] = *(a09 + 28);
+	      b[465] = *(a09 + 29);
+	      b[466] = *(a10 + 28);
+	      b[467] = *(a10 + 29);
+	      b[468] = *(a11 + 28);
+	      b[469] = *(a11 + 29);
+	      b[470] = *(a12 + 28);
+	      b[471] = *(a12 + 29);
+	      b[472] = *(a13 + 28);
+	      b[473] = *(a13 + 29);
+	      b[474] = *(a14 + 28);
+	      b[475] = *(a14 + 29);
+#ifdef UNIT
+	      b[476] = ONE;
+	      b[477] = ZERO;
+#else
+	      b[476] = *(a15 + 28);
+	      b[477] = *(a15 + 29);
+#endif
+		  b[478] = ZERO;
+		  b[479] = ZERO;
+
+		  b[480] = *(a01 + 30);
+		  b[481] = *(a01 + 31);
+	      b[482] = *(a02 + 30);
+	      b[483] = *(a02 + 31);
+	      b[484] = *(a03 + 30);
+	      b[485] = *(a03 + 31);
+	      b[486] = *(a04 + 30);
+	      b[487] = *(a04 + 31);
+	      b[488] = *(a05 + 30);
+	      b[489] = *(a05 + 31);
+	      b[490] = *(a06 + 30);
+	      b[491] = *(a06 + 31);
+	      b[492] = *(a07 + 30);
+	      b[493] = *(a07 + 31);
+	      b[494] = *(a08 + 30);
+	      b[495] = *(a08 + 31);
+	      b[496] = *(a09 + 30);
+	      b[497] = *(a09 + 31);
+	      b[498] = *(a10 + 30);
+	      b[499] = *(a10 + 31);
+	      b[500] = *(a11 + 30);
+	      b[501] = *(a11 + 31);
+	      b[502] = *(a12 + 30);
+	      b[503] = *(a12 + 31);
+	      b[504] = *(a13 + 30);
+	      b[505] = *(a13 + 31);
+	      b[506] = *(a14 + 30);
+	      b[507] = *(a14 + 31);
+	      b[508] = *(a15 + 30);
+	      b[509] = *(a15 + 31);
+#ifdef UNIT
+	      b[510] = ONE;
+	      b[511] = ZERO;
+#else
+	      b[510] = *(a16 + 30);
+	      b[511] = *(a16 + 31);
+#endif
+
+	      a01 += 32;
+	      a02 += 32;
+	      a03 += 32;
+	      a04 += 32;
+	      a05 += 32;
+	      a06 += 32;
+	      a07 += 32;
+	      a08 += 32;
+	      a09 += 32;
+	      a10 += 32;
+	      a11 += 32;
+	      a12 += 32;
+	      a13 += 32;
+	      a14 += 32;
+	      a15 += 32;
+	      a16 += 32;
+	      b += 512;
+	    }
+
+	  X += 16;
+	  i --;
+	} while (i > 0);
+      }
+
+      i = (m & 15);
+      if (i) {
+
+	if (X > posY) {
+
+	  for (ii = 0; ii < i; ii++){
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+		b[  2] = *(a02 +  0);
+		b[  3] = *(a02 +  1);
+		b[  4] = *(a03 +  0);
+		b[  5] = *(a03 +  1);
+		b[  6] = *(a04 +  0);
+		b[  7] = *(a04 +  1);
+		b[  8] = *(a05 +  0);
+		b[  9] = *(a05 +  1);
+		b[ 10] = *(a06 +  0);
+		b[ 11] = *(a06 +  1);
+		b[ 12] = *(a07 +  0);
+		b[ 13] = *(a07 +  1);
+		b[ 14] = *(a08 +  0);
+		b[ 15] = *(a08 +  1);
+
+		b[ 16] = *(a09 +  0);
+		b[ 17] = *(a09 +  1);
+		b[ 18] = *(a10 +  0);
+		b[ 19] = *(a10 +  1);
+		b[ 20] = *(a11 +  0);
+		b[ 21] = *(a11 +  1);
+		b[ 22] = *(a12 +  0);
+		b[ 23] = *(a12 +  1);
+		b[ 24] = *(a13 +  0);
+		b[ 25] = *(a13 +  1);
+		b[ 26] = *(a14 +  0);
+		b[ 27] = *(a14 +  1);
+		b[ 28] = *(a15 +  0);
+		b[ 29] = *(a15 +  1);
+		b[ 30] = *(a16 +  0);
+		b[ 31] = *(a16 +  1);
+
+	    a01 += 2;
+	    a02 += 2;
+	    a03 += 2;
+	    a04 += 2;
+	    a05 += 2;
+	    a06 += 2;
+	    a07 += 2;
+	    a08 += 2;
+		a09 += 2;
+	    a10 += 2;
+	    a11 += 2;
+	    a12 += 2;
+	    a13 += 2;
+	    a14 += 2;
+	    a15 += 2;
+	    a16 += 2;
+	    b += 32;
+	  }
+	} else
+	  if (X < posY) {
+	    /* a01 += i * lda;
+	      a02 += i * lda;
+	      a03 += i * lda;
+	      a04 += i * lda;
+	      a05 += i * lda;
+	      a06 += i * lda;
+	      a07 += i * lda;
+	      a08 += i * lda;
+	      a09 += i * lda;
+	      a10 += i * lda;
+	      a11 += i * lda;
+	      a12 += i * lda;
+	      a13 += i * lda;
+	      a14 += i * lda;
+	      a15 += i * lda;
+	      a16 += i * lda; */
+	    b += 32 * i;
+	  } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = ZERO;
+	      b[  3] = ZERO;
+	      b[  4] = ZERO;
+	      b[  5] = ZERO;
+	      b[  6] = ZERO;
+	      b[  7] = ZERO;
+	      b[  8] = ZERO;
+	      b[  9] = ZERO;
+	      b[ 10] = ZERO;
+	      b[ 11] = ZERO;
+	      b[ 12] = ZERO;
+	      b[ 13] = ZERO;
+	      b[ 14] = ZERO;
+	      b[ 15] = ZERO;
+		  b[ 16] = ZERO;
+	      b[ 17] = ZERO;
+	      b[ 18] = ZERO;
+	      b[ 19] = ZERO;
+	      b[ 20] = ZERO;
+	      b[ 21] = ZERO;
+	      b[ 22] = ZERO;
+	      b[ 23] = ZERO;
+	      b[ 24] = ZERO;
+	      b[ 25] = ZERO;
+	      b[ 26] = ZERO;
+	      b[ 27] = ZERO;
+	      b[ 28] = ZERO;
+	      b[ 29] = ZERO;
+		  b[ 30] = ZERO;
+	      b[ 31] = ZERO;
+	      b += 32;
+
+	      if (i >= 2) {
+		b[ 0] = *(a01 +  2);
+		b[ 1] = *(a01 +  3);
+#ifdef UNIT
+		b[ 2] = ONE;
+		b[ 3] = ZERO;
+#else
+		b[ 2] = *(a02 +  2);
+		b[ 3] = *(a02 +  3);
+#endif
+		b[  4] = ZERO;
+		b[  5] = ZERO;
+		b[  6] = ZERO;
+		b[  7] = ZERO;
+		b[  8] = ZERO;
+		b[  9] = ZERO;
+		b[ 10] = ZERO;
+		b[ 11] = ZERO;
+		b[ 12] = ZERO;
+		b[ 13] = ZERO;
+		b[ 14] = ZERO;
+		b[ 15] = ZERO;
+		b[ 16] = ZERO;
+		b[ 17] = ZERO;
+		b[ 18] = ZERO;
+		b[ 19] = ZERO;
+		b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+		b += 32;
+	      }
+
+	      if (i >= 3) {
+		b[ 0] = *(a01 +  4);
+		b[ 1] = *(a01 +  5);
+		b[ 2] = *(a02 +  4);
+		b[ 3] = *(a02 +  5);
+#ifdef UNIT
+		b[ 4] = ONE;
+		b[ 5] = ZERO;
+#else
+		b[ 4] = *(a03 +  4);
+		b[ 5] = *(a03 +  5);
+#endif
+		b[  6] = ZERO;
+		b[  7] = ZERO;
+		b[  8] = ZERO;
+		b[  9] = ZERO;
+		b[ 10] = ZERO;
+		b[ 11] = ZERO;
+		b[ 12] = ZERO;
+		b[ 13] = ZERO;
+		b[ 14] = ZERO;
+		b[ 15] = ZERO;
+		b[ 16] = ZERO;
+		b[ 17] = ZERO;
+		b[ 18] = ZERO;
+		b[ 19] = ZERO;
+		b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+		b += 32;
+	      }
+
+	      if (i >= 4) {
+		b[ 0] = *(a01 +  6);
+		b[ 1] = *(a01 +  7);
+		b[ 2] = *(a02 +  6);
+		b[ 3] = *(a02 +  7);
+		b[ 4] = *(a03 +  6);
+		b[ 5] = *(a03 +  7);
+#ifdef UNIT
+		b[ 6] = ONE;
+		b[ 7] = ZERO;
+#else
+		b[ 6] = *(a04 +  6);
+		b[ 7] = *(a04 +  7);
+#endif
+		b[  8] = ZERO;
+		b[  9] = ZERO;
+		b[ 10] = ZERO;
+		b[ 11] = ZERO;
+		b[ 12] = ZERO;
+		b[ 13] = ZERO;
+		b[ 14] = ZERO;
+		b[ 15] = ZERO;
+		b[ 16] = ZERO;
+		b[ 17] = ZERO;
+		b[ 18] = ZERO;
+		b[ 19] = ZERO;
+		b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+		b += 32;
+	      }
+
+	      if (i >= 5) {
+		b[ 0] = *(a01 +  8);
+		b[ 1] = *(a01 +  9);
+		b[ 2] = *(a02 +  8);
+		b[ 3] = *(a02 +  9);
+		b[ 4] = *(a03 +  8);
+		b[ 5] = *(a03 +  9);
+		b[ 6] = *(a04 +  8);
+		b[ 7] = *(a04 +  9);
+#ifdef UNIT
+		b[ 8] = ONE;
+		b[ 9] = ZERO;
+#else
+		b[ 8] = *(a05 +  8);
+		b[ 9] = *(a05 +  9);
+#endif
+		b[ 10] = ZERO;
+		b[ 11] = ZERO;
+		b[ 12] = ZERO;
+		b[ 13] = ZERO;
+		b[ 14] = ZERO;
+		b[ 15] = ZERO;
+		b[ 16] = ZERO;
+		b[ 17] = ZERO;
+		b[ 18] = ZERO;
+		b[ 19] = ZERO;
+		b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+		b += 32;
+	      }
+
+	      if (i >= 6) {
+		b[ 0] = *(a01 + 10);
+		b[ 1] = *(a01 + 11);
+		b[ 2] = *(a02 + 10);
+		b[ 3] = *(a02 + 11);
+		b[ 4] = *(a03 + 10);
+		b[ 5] = *(a03 + 11);
+		b[ 6] = *(a04 + 10);
+		b[ 7] = *(a04 + 11);
+		b[ 8] = *(a05 + 10);
+		b[ 9] = *(a05 + 11);
+#ifdef UNIT
+		b[10] = ONE;
+		b[11] = ZERO;
+#else
+		b[10] = *(a06 + 10);
+		b[11] = *(a06 + 11);
+#endif
+		b[ 12] = ZERO;
+		b[ 13] = ZERO;
+		b[ 14] = ZERO;
+		b[ 15] = ZERO;
+		b[ 16] = ZERO;
+		b[ 17] = ZERO;
+		b[ 18] = ZERO;
+		b[ 19] = ZERO;
+		b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+		b += 32;
+	      }
+
+	      if (i >= 7) {
+		b[ 0] = *(a01 + 12);
+		b[ 1] = *(a01 + 13);
+		b[ 2] = *(a02 + 12);
+		b[ 3] = *(a02 + 13);
+		b[ 4] = *(a03 + 12);
+		b[ 5] = *(a03 + 13);
+		b[ 6] = *(a04 + 12);
+		b[ 7] = *(a04 + 13);
+		b[ 8] = *(a05 + 12);
+		b[ 9] = *(a05 + 13);
+		b[10] = *(a06 + 12);
+		b[11] = *(a06 + 13);
+#ifdef UNIT
+		b[12] = ONE;
+		b[13] = ZERO;
+#else
+		b[12] = *(a07 + 12);
+		b[13] = *(a07 + 13);
+#endif
+		b[ 14] = ZERO;
+		b[ 15] = ZERO;
+		b[ 16] = ZERO;
+		b[ 17] = ZERO;
+		b[ 18] = ZERO;
+		b[ 19] = ZERO;
+		b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+		b += 32;
+	    }
+
+		if (i >= 8) {
+	      b[  0] = *(a01 + 14);
+	      b[  1] = *(a01 + 15);
+	      b[  2] = *(a02 + 14);
+	      b[  3] = *(a02 + 15);
+	      b[  4] = *(a03 + 14);
+	      b[  5] = *(a03 + 15);
+	      b[  6] = *(a04 + 14);
+	      b[  7] = *(a04 + 15);
+	      b[  8] = *(a05 + 14);
+	      b[  9] = *(a05 + 15);
+	      b[ 10] = *(a06 + 14);
+	      b[ 11] = *(a06 + 15);
+	      b[ 12] = *(a07 + 14);
+	      b[ 13] = *(a07 + 15);
+#ifdef UNIT
+	      b[ 14] = ONE;
+	      b[ 15] = ZERO;
+#else
+	      b[ 14] = *(a08 +  14);
+	      b[ 15] = *(a08 +  15);
+#endif
+		b[ 16] = ZERO;
+		b[ 17] = ZERO;
+		b[ 18] = ZERO;
+		b[ 19] = ZERO;
+		b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+	      b += 32;
+	    }
+
+		if (i >= 9) {
+	      b[  0] = *(a01 + 16);
+	      b[  1] = *(a01 + 17);
+	      b[  2] = *(a02 + 16);
+	      b[  3] = *(a02 + 17);
+	      b[  4] = *(a03 + 16);
+	      b[  5] = *(a03 + 17);
+	      b[  6] = *(a04 + 16);
+	      b[  7] = *(a04 + 17);
+	      b[  8] = *(a05 + 16);
+	      b[  9] = *(a05 + 17);
+	      b[ 10] = *(a06 + 16);
+	      b[ 11] = *(a06 + 17);
+	      b[ 12] = *(a07 + 16);
+	      b[ 13] = *(a07 + 17);
+	      b[ 14] = *(a08 + 16);
+	      b[ 15] = *(a08 + 17);
+#ifdef UNIT
+	      b[ 16] = ONE;
+	      b[ 17] = ZERO;
+#else
+	      b[ 16] = *(a09 + 16);
+	      b[ 17] = *(a09 + 17);
+#endif
+	    b[ 18] = ZERO;
+		b[ 19] = ZERO;
+		b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+	      b += 32;
+	    }
+
+		if (i >= 10) {
+	      b[  0] = *(a01 + 18);
+	      b[  1] = *(a01 + 19);
+	      b[  2] = *(a02 + 18);
+	      b[  3] = *(a02 + 19);
+	      b[  4] = *(a03 + 18);
+	      b[  5] = *(a03 + 19);
+	      b[  6] = *(a04 + 18);
+	      b[  7] = *(a04 + 19);
+	      b[  8] = *(a05 + 18);
+	      b[  9] = *(a05 + 19);
+	      b[ 10] = *(a06 + 18);
+	      b[ 11] = *(a06 + 19);
+	      b[ 12] = *(a07 + 18);
+	      b[ 13] = *(a07 + 19);
+	      b[ 14] = *(a08 + 18);
+	      b[ 15] = *(a08 + 19);
+	      b[ 16] = *(a09 + 18);
+	      b[ 17] = *(a09 + 19);
+#ifdef UNIT
+	      b[ 18] = ONE;
+	      b[ 19] = ZERO;
+#else
+	      b[ 18] = *(a10 + 18);
+	      b[ 19] = *(a10 + 19);
+#endif
+	    b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+	      b += 32;
+	    }
+
+		if (i >= 11) {
+	      b[  0] = *(a01 + 20);
+	      b[  1] = *(a01 + 21);
+	      b[  2] = *(a02 + 20);
+	      b[  3] = *(a02 + 21);
+	      b[  4] = *(a03 + 20);
+	      b[  5] = *(a03 + 21);
+	      b[  6] = *(a04 + 20);
+	      b[  7] = *(a04 + 21);
+	      b[  8] = *(a05 + 20);
+	      b[  9] = *(a05 + 21);
+	      b[ 10] = *(a06 + 20);
+	      b[ 11] = *(a06 + 21);
+	      b[ 12] = *(a07 + 20);
+	      b[ 13] = *(a07 + 21);
+	      b[ 14] = *(a08 + 20);
+	      b[ 15] = *(a08 + 21);
+	      b[ 16] = *(a09 + 20);
+	      b[ 17] = *(a09 + 21);
+	      b[ 18] = *(a10 + 20);
+	      b[ 19] = *(a10 + 21);
+#ifdef UNIT
+	      b[ 20] = ONE;
+	      b[ 21] = ZERO;
+#else
+	      b[ 20] = *(a11 + 20);
+	      b[ 21] = *(a11 + 21);
+#endif
+	    b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+	      b += 32;
+	    }
+
+		if (i >= 12) {
+	      b[  0] = *(a01 + 22);
+	      b[  1] = *(a01 + 23);
+	      b[  2] = *(a02 + 22);
+	      b[  3] = *(a02 + 23);
+	      b[  4] = *(a03 + 22);
+	      b[  5] = *(a03 + 23);
+	      b[  6] = *(a04 + 22);
+	      b[  7] = *(a04 + 23);
+	      b[  8] = *(a05 + 22);
+	      b[  9] = *(a05 + 23);
+	      b[ 10] = *(a06 + 22);
+	      b[ 11] = *(a06 + 23);
+	      b[ 12] = *(a07 + 22);
+	      b[ 13] = *(a07 + 23);
+	      b[ 14] = *(a08 + 22);
+	      b[ 15] = *(a08 + 23);
+	      b[ 16] = *(a09 + 22);
+	      b[ 17] = *(a09 + 23);
+	      b[ 18] = *(a10 + 22);
+	      b[ 19] = *(a10 + 23);
+	      b[ 20] = *(a11 + 22);
+	      b[ 21] = *(a11 + 23);
+#ifdef UNIT
+	      b[ 22] = ONE;
+	      b[ 23] = ZERO;
+#else
+	      b[ 22] = *(a12 + 22);
+	      b[ 23] = *(a12 + 23);
+#endif
+	    b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+	      b += 32;
+	    }
+
+		if (i >= 13) {
+	      b[  0] = *(a01 + 24);
+	      b[  1] = *(a01 + 25);
+	      b[  2] = *(a02 + 24);
+	      b[  3] = *(a02 + 25);
+	      b[  4] = *(a03 + 24);
+	      b[  5] = *(a03 + 25);
+	      b[  6] = *(a04 + 24);
+	      b[  7] = *(a04 + 25);
+	      b[  8] = *(a05 + 24);
+	      b[  9] = *(a05 + 25);
+	      b[ 10] = *(a06 + 24);
+	      b[ 11] = *(a06 + 25);
+	      b[ 12] = *(a07 + 24);
+	      b[ 13] = *(a07 + 25);
+	      b[ 14] = *(a08 + 24);
+	      b[ 15] = *(a08 + 25);
+	      b[ 16] = *(a09 + 24);
+	      b[ 17] = *(a09 + 25);
+	      b[ 18] = *(a10 + 24);
+	      b[ 19] = *(a10 + 25);
+	      b[ 20] = *(a11 + 24);
+	      b[ 21] = *(a11 + 25);
+	      b[ 22] = *(a12 + 24);
+	      b[ 23] = *(a12 + 25);
+#ifdef UNIT
+	      b[ 24] = ONE;
+	      b[ 25] = ZERO;
+#else
+	      b[ 24] = *(a13 + 24);
+	      b[ 25] = *(a13 + 25);
+#endif
+	    b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+	      b += 32;
+	    }
+
+		if (i >= 14) {
+	      b[  0] = *(a01 + 26);
+	      b[  1] = *(a01 + 27);
+	      b[  2] = *(a02 + 26);
+	      b[  3] = *(a02 + 27);
+	      b[  4] = *(a03 + 26);
+	      b[  5] = *(a03 + 27);
+	      b[  6] = *(a04 + 26);
+	      b[  7] = *(a04 + 27);
+	      b[  8] = *(a05 + 26);
+	      b[  9] = *(a05 + 27);
+	      b[ 10] = *(a06 + 26);
+	      b[ 11] = *(a06 + 27);
+	      b[ 12] = *(a07 + 26);
+	      b[ 13] = *(a07 + 27);
+	      b[ 14] = *(a08 + 26);
+	      b[ 15] = *(a08 + 27);
+	      b[ 16] = *(a09 + 26);
+	      b[ 17] = *(a09 + 27);
+	      b[ 18] = *(a10 + 26);
+	      b[ 19] = *(a10 + 27);
+	      b[ 20] = *(a11 + 26);
+	      b[ 21] = *(a11 + 27);
+	      b[ 22] = *(a12 + 26);
+	      b[ 23] = *(a12 + 27);
+	      b[ 24] = *(a13 + 26);
+	      b[ 25] = *(a13 + 27);
+#ifdef UNIT
+	      b[ 26] = ONE;
+	      b[ 27] = ZERO;
+#else
+	      b[ 26] = *(a14 + 26);
+	      b[ 27] = *(a14 + 27);
+#endif
+	    b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+	      b += 32;
+	    }
+
+		if (i >= 15) {
+	      b[  0] = *(a01 + 28);
+	      b[  1] = *(a01 + 29);
+	      b[  2] = *(a02 + 28);
+	      b[  3] = *(a02 + 29);
+	      b[  4] = *(a03 + 28);
+	      b[  5] = *(a03 + 29);
+	      b[  6] = *(a04 + 28);
+	      b[  7] = *(a04 + 29);
+	      b[  8] = *(a05 + 28);
+	      b[  9] = *(a05 + 29);
+	      b[ 10] = *(a06 + 28);
+	      b[ 11] = *(a06 + 29);
+	      b[ 12] = *(a07 + 28);
+	      b[ 13] = *(a07 + 29);
+	      b[ 14] = *(a08 + 28);
+	      b[ 15] = *(a08 + 29);
+	      b[ 16] = *(a09 + 28);
+	      b[ 17] = *(a09 + 29);
+	      b[ 18] = *(a10 + 28);
+	      b[ 19] = *(a10 + 29);
+	      b[ 20] = *(a11 + 28);
+	      b[ 21] = *(a11 + 29);
+	      b[ 22] = *(a12 + 28);
+	      b[ 23] = *(a12 + 29);
+	      b[ 24] = *(a13 + 28);
+	      b[ 25] = *(a13 + 29);
+	      b[ 26] = *(a14 + 28);
+	      b[ 27] = *(a14 + 29);
+#ifdef UNIT
+	      b[ 28] = ONE;
+	      b[ 29] = ZERO;
+#else
+	      b[ 28] = *(a15 + 28);
+	      b[ 29] = *(a15 + 29);
+#endif
+	      b[ 30] = ZERO;
+	      b[ 31] = ZERO;
+	      b += 32;
+	    }
+	  }
+      }
+
+      posY += 16;
+      js --;
+    } while (js > 0);
+  } /* End of main loop */
+
+
+  if (n & 8){
+      X = posX;
+
+      if (posX <= posY) {
+	a01 = a + posY * 2 + (posX + 0) * lda;
+	a02 = a + posY * 2 + (posX + 1) * lda;
+	a03 = a + posY * 2 + (posX + 2) * lda;
+	a04 = a + posY * 2 + (posX + 3) * lda;
+	a05 = a + posY * 2 + (posX + 4) * lda;
+	a06 = a + posY * 2 + (posX + 5) * lda;
+	a07 = a + posY * 2 + (posX + 6) * lda;
+	a08 = a + posY * 2 + (posX + 7) * lda;
+      } else {
+	a01 = a + posX * 2 + (posY + 0) * lda;
+	a02 = a + posX * 2 + (posY + 1) * lda;
+	a03 = a + posX * 2 + (posY + 2) * lda;
+	a04 = a + posX * 2 + (posY + 3) * lda;
+	a05 = a + posX * 2 + (posY + 4) * lda;
+	a06 = a + posX * 2 + (posY + 5) * lda;
+	a07 = a + posX * 2 + (posY + 6) * lda;
+	a08 = a + posX * 2 + (posY + 7) * lda;
+      }
+
+      i = (m >> 3);
+      if (i > 0) {
+	do {
+	  if (X > posY) {
+	    for (ii = 0; ii < 8; ii++){
+
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+	      b[  2] = *(a02 +  0);
+	      b[  3] = *(a02 +  1);
+	      b[  4] = *(a03 +  0);
+	      b[  5] = *(a03 +  1);
+	      b[  6] = *(a04 +  0);
+	      b[  7] = *(a04 +  1);
+
+	      b[  8] = *(a05 +  0);
+	      b[  9] = *(a05 +  1);
+	      b[ 10] = *(a06 +  0);
+	      b[ 11] = *(a06 +  1);
+	      b[ 12] = *(a07 +  0);
+	      b[ 13] = *(a07 +  1);
+	      b[ 14] = *(a08 +  0);
+	      b[ 15] = *(a08 +  1);
+
+	      a01 += 2;
+	      a02 += 2;
+	      a03 += 2;
+	      a04 += 2;
+	      a05 += 2;
+	      a06 += 2;
+	      a07 += 2;
+	      a08 += 2;
+	      b += 16;
+	    }
+	  } else
+	    if (X < posY) {
+	      a01 += 8 * lda;
+	      a02 += 8 * lda;
+	      a03 += 8 * lda;
+	      a04 += 8 * lda;
+	      a05 += 8 * lda;
+	      a06 += 8 * lda;
+	      a07 += 8 * lda;
+	      a08 += 8 * lda;
+
+	      b += 128;
+	    } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = ZERO;
+	      b[  3] = ZERO;
+	      b[  4] = ZERO;
+	      b[  5] = ZERO;
+	      b[  6] = ZERO;
+	      b[  7] = ZERO;
+	      b[  8] = ZERO;
+	      b[  9] = ZERO;
+	      b[ 10] = ZERO;
+	      b[ 11] = ZERO;
+	      b[ 12] = ZERO;
+	      b[ 13] = ZERO;
+	      b[ 14] = ZERO;
+	      b[ 15] = ZERO;
+
+	      b[ 16] = *(a01 +  2);
+	      b[ 17] = *(a01 +  3);
+#ifdef UNIT
+	      b[ 18] = ONE;
+	      b[ 19] = ZERO;
+#else
+	      b[ 18] = *(a02 +  2);
+	      b[ 19] = *(a02 +  3);
+#endif
+	      b[ 20] = ZERO;
+	      b[ 21] = ZERO;
+	      b[ 22] = ZERO;
+	      b[ 23] = ZERO;
+	      b[ 24] = ZERO;
+	      b[ 25] = ZERO;
+	      b[ 26] = ZERO;
+	      b[ 27] = ZERO;
+	      b[ 28] = ZERO;
+	      b[ 29] = ZERO;
+	      b[ 30] = ZERO;
+	      b[ 31] = ZERO;
+
+	      b[ 32] = *(a01 +  4);
+	      b[ 33] = *(a01 +  5);
+	      b[ 34] = *(a02 +  4);
+	      b[ 35] = *(a02 +  5);
+#ifdef UNIT
+	      b[ 36] = ONE;
+	      b[ 37] = ZERO;
+#else
+	      b[ 36] = *(a03 +  4);
+	      b[ 37] = *(a03 +  5);
+#endif
+	      b[ 38] = ZERO;
+	      b[ 39] = ZERO;
+	      b[ 40] = ZERO;
+	      b[ 41] = ZERO;
+	      b[ 42] = ZERO;
+	      b[ 43] = ZERO;
+	      b[ 44] = ZERO;
+	      b[ 45] = ZERO;
+	      b[ 46] = ZERO;
+	      b[ 47] = ZERO;
+
+	      b[ 48] = *(a01 +  6);
+	      b[ 49] = *(a01 +  7);
+	      b[ 50] = *(a02 +  6);
+	      b[ 51] = *(a02 +  7);
+	      b[ 52] = *(a03 +  6);
+	      b[ 53] = *(a03 +  7);
+#ifdef UNIT
+	      b[ 54] = ONE;
+	      b[ 55] = ZERO;
+#else
+	      b[ 54] = *(a04 +  6);
+	      b[ 55] = *(a04 +  7);
+#endif
+	      b[ 56] = ZERO;
+	      b[ 57] = ZERO;
+	      b[ 58] = ZERO;
+	      b[ 59] = ZERO;
+	      b[ 60] = ZERO;
+	      b[ 61] = ZERO;
+	      b[ 62] = ZERO;
+	      b[ 63] = ZERO;
+
+	      b[ 64] = *(a01 +  8);
+	      b[ 65] = *(a01 +  9);
+	      b[ 66] = *(a02 +  8);
+	      b[ 67] = *(a02 +  9);
+	      b[ 68] = *(a03 +  8);
+	      b[ 69] = *(a03 +  9);
+	      b[ 70] = *(a04 +  8);
+	      b[ 71] = *(a04 +  9);
+#ifdef UNIT
+	      b[ 72] = ONE;
+	      b[ 73] = ZERO;
+#else
+	      b[ 72] = *(a05 +  8);
+	      b[ 73] = *(a05 +  9);
+#endif
+	      b[ 74] = ZERO;
+	      b[ 75] = ZERO;
+	      b[ 76] = ZERO;
+	      b[ 77] = ZERO;
+	      b[ 78] = ZERO;
+	      b[ 79] = ZERO;
+
+	      b[ 80] = *(a01 + 10);
+	      b[ 81] = *(a01 + 11);
+	      b[ 82] = *(a02 + 10);
+	      b[ 83] = *(a02 + 11);
+	      b[ 84] = *(a03 + 10);
+	      b[ 85] = *(a03 + 11);
+	      b[ 86] = *(a04 + 10);
+	      b[ 87] = *(a04 + 11);
+	      b[ 88] = *(a05 + 10);
+	      b[ 89] = *(a05 + 11);
+#ifdef UNIT
+	      b[ 90] = ONE;
+	      b[ 91] = ZERO;
+#else
+	      b[ 90] = *(a06 + 10);
+	      b[ 91] = *(a06 + 11);
+#endif
+	      b[ 92] = ZERO;
+	      b[ 93] = ZERO;
+	      b[ 94] = ZERO;
+	      b[ 95] = ZERO;
+
+	      b[ 96] = *(a01 + 12);
+	      b[ 97] = *(a01 + 13);
+	      b[ 98] = *(a02 + 12);
+	      b[ 99] = *(a02 + 13);
+	      b[100] = *(a03 + 12);
+	      b[101] = *(a03 + 13);
+	      b[102] = *(a04 + 12);
+	      b[103] = *(a04 + 13);
+	      b[104] = *(a05 + 12);
+	      b[105] = *(a05 + 13);
+	      b[106] = *(a06 + 12);
+	      b[107] = *(a06 + 13);
+#ifdef UNIT
+	      b[108] = ONE;
+	      b[109] = ZERO;
+#else
+	      b[108] = *(a07 + 12);
+	      b[109] = *(a07 + 13);
+#endif
+	      b[110] = ZERO;
+	      b[111] = ZERO;
+
+	      b[112] = *(a01 + 14);
+	      b[113] = *(a01 + 15);
+	      b[114] = *(a02 + 14);
+	      b[115] = *(a02 + 15);
+	      b[116] = *(a03 + 14);
+	      b[117] = *(a03 + 15);
+	      b[118] = *(a04 + 14);
+	      b[119] = *(a04 + 15);
+	      b[120] = *(a05 + 14);
+	      b[121] = *(a05 + 15);
+	      b[122] = *(a06 + 14);
+	      b[123] = *(a06 + 15);
+	      b[124] = *(a07 + 14);
+	      b[125] = *(a07 + 15);
+#ifdef UNIT
+	      b[126] = ONE;
+	      b[127] = ZERO;
+#else
+	      b[126] = *(a08 + 14);
+	      b[127] = *(a08 + 15);
+#endif
+
+	      a01 += 16;
+	      a02 += 16;
+	      a03 += 16;
+	      a04 += 16;
+	      a05 += 16;
+	      a06 += 16;
+	      a07 += 16;
+	      a08 += 16;
+	      b += 128;
+	    }
+
+	  X += 8;
+	  i --;
+	} while (i > 0);
+      }
+
+      i = (m & 7);
+      if (i) {
+
+	if (X > posY) {
+	  for (ii = 0; ii < i; ii++){
+
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+	    b[  2] = *(a02 +  0);
+	    b[  3] = *(a02 +  1);
+	    b[  4] = *(a03 +  0);
+	    b[  5] = *(a03 +  1);
+	    b[  6] = *(a04 +  0);
+	    b[  7] = *(a04 +  1);
+
+	    b[  8] = *(a05 +  0);
+	    b[  9] = *(a05 +  1);
+	    b[ 10] = *(a06 +  0);
+	    b[ 11] = *(a06 +  1);
+	    b[ 12] = *(a07 +  0);
+	    b[ 13] = *(a07 +  1);
+	    b[ 14] = *(a08 +  0);
+	    b[ 15] = *(a08 +  1);
+
+	    a01 += 2;
+	    a02 += 2;
+	    a03 += 2;
+	    a04 += 2;
+	    a05 += 2;
+	    a06 += 2;
+	    a07 += 2;
+	    a08 += 2;
+	    b += 16;
+	    }
+	} else
+	  if (X < posY) {
+	      /* a01 += i * lda;
+	      a02 += i * lda;
+	      a03 += i * lda;
+	      a04 += i * lda;
+	      a05 += i * lda;
+	      a06 += i * lda;
+	      a07 += i * lda;
+	      a08 += i * lda; */
+	      b += 16 * i;
+	  } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = ZERO;
+	      b[  3] = ZERO;
+	      b[  4] = ZERO;
+	      b[  5] = ZERO;
+	      b[  6] = ZERO;
+	      b[  7] = ZERO;
+	      b[  8] = ZERO;
+	      b[  9] = ZERO;
+	      b[ 10] = ZERO;
+	      b[ 11] = ZERO;
+	      b[ 12] = ZERO;
+	      b[ 13] = ZERO;
+	      b[ 14] = ZERO;
+	      b[ 15] = ZERO;
+	      b += 16;
+
+	    if (i >= 2) {
+	    b[ 0] = *(a01 +  2);
+		b[ 1] = *(a01 +  3);
+#ifdef UNIT
+		b[ 2] = ONE;
+		b[ 3] = ZERO;
+#else
+		b[ 2] = *(a02 +  2);
+		b[ 3] = *(a02 +  3);
+#endif
+	    b[ 4] = ZERO;
+		b[ 5] = ZERO;
+		b[ 6] = ZERO;
+		b[ 7] = ZERO;
+		b[ 8] = ZERO;
+		b[ 9] = ZERO;
+		b[10] = ZERO;
+		b[11] = ZERO;
+		b[12] = ZERO;
+		b[13] = ZERO;
+		b[14] = ZERO;
+		b[15] = ZERO;
+		b += 16;
+	    }
+
+	    if (i >= 3) {
+		b[ 0] = *(a01 +  4);
+		b[ 1] = *(a01 +  5);
+		b[ 2] = *(a02 +  4);
+		b[ 3] = *(a02 +  5);
+#ifdef UNIT
+		b[ 4] = ONE;
+		b[ 5] = ZERO;
+#else
+		b[ 4] = *(a03 +  4);
+		b[ 5] = *(a03 +  5);
+#endif
+		b[ 6] = ZERO;
+		b[ 7] = ZERO;
+		b[ 8] = ZERO;
+		b[ 9] = ZERO;
+		b[10] = ZERO;
+		b[11] = ZERO;
+		b[12] = ZERO;
+		b[13] = ZERO;
+		b[14] = ZERO;
+		b[15] = ZERO;
+		b += 16;
+	    }
+
+	    if (i >= 4) {
+		b[ 0] = *(a01 +  6);
+		b[ 1] = *(a01 +  7);
+		b[ 2] = *(a02 +  6);
+		b[ 3] = *(a02 +  7);
+		b[ 4] = *(a03 +  6);
+		b[ 5] = *(a03 +  7);
+#ifdef UNIT
+		b[ 6] = ONE;
+		b[ 7] = ZERO;
+#else
+		b[ 6] = *(a04 +  6);
+		b[ 7] = *(a04 +  7);
+#endif
+		b[ 8] = ZERO;
+		b[ 9] = ZERO;
+		b[10] = ZERO;
+		b[11] = ZERO;
+		b[12] = ZERO;
+		b[13] = ZERO;
+		b[14] = ZERO;
+		b[15] = ZERO;
+		b += 16;
+	    }
+
+	    if (i >= 5) {
+		b[ 0] = *(a01 +  8);
+		b[ 1] = *(a01 +  9);
+		b[ 2] = *(a02 +  8);
+		b[ 3] = *(a02 +  9);
+		b[ 4] = *(a03 +  8);
+		b[ 5] = *(a03 +  9);
+		b[ 6] = *(a04 +  8);
+		b[ 7] = *(a04 +  9);
+#ifdef UNIT
+		b[ 8] = ONE;
+		b[ 9] = ZERO;
+#else
+		b[ 8] = *(a05 +  8);
+		b[ 9] = *(a05 +  9);
+#endif
+		b[10] = ZERO;
+		b[11] = ZERO;
+		b[12] = ZERO;
+		b[13] = ZERO;
+		b[14] = ZERO;
+		b[15] = ZERO;
+		b += 16;
+	    }
+
+	    if (i >= 6) {
+		b[ 0] = *(a01 + 10);
+		b[ 1] = *(a01 + 11);
+		b[ 2] = *(a02 + 10);
+		b[ 3] = *(a02 + 11);
+		b[ 4] = *(a03 + 10);
+		b[ 5] = *(a03 + 11);
+		b[ 6] = *(a04 + 10);
+		b[ 7] = *(a04 + 11);
+		b[ 8] = *(a05 + 10);
+		b[ 9] = *(a05 + 11);
+#ifdef UNIT
+		b[10] = ONE;
+		b[11] = ZERO;
+#else
+		b[10] = *(a06 + 10);
+		b[11] = *(a06 + 11);
+#endif
+		b[12] = ZERO;
+		b[13] = ZERO;
+		b[14] = ZERO;
+		b[15] = ZERO;
+		b += 16;
+	    }
+
+	    if (i >= 7) {
+		b[ 0] = *(a01 + 12);
+		b[ 1] = *(a01 + 13);
+		b[ 2] = *(a02 + 12);
+		b[ 3] = *(a02 + 13);
+		b[ 4] = *(a03 + 12);
+		b[ 5] = *(a03 + 13);
+		b[ 6] = *(a04 + 12);
+		b[ 7] = *(a04 + 13);
+		b[ 8] = *(a05 + 12);
+		b[ 9] = *(a05 + 13);
+		b[10] = *(a06 + 12);
+		b[11] = *(a06 + 13);
+#ifdef UNIT
+		b[12] = ONE;
+		b[13] = ZERO;
+#else
+		b[12] = *(a07 + 12);
+		b[13] = *(a07 + 13);
+#endif
+		b[14] = ZERO;
+		b[15] = ZERO;
+		b += 16;
+	    }
+	  }
+      }
+
+      posY += 8;
+  }
+
+
+  if (n & 4){
+      X = posX;
+
+      if (posX <= posY) {
+	a01 = a + posY * 2 + (posX + 0) * lda;
+	a02 = a + posY * 2 + (posX + 1) * lda;
+	a03 = a + posY * 2 + (posX + 2) * lda;
+	a04 = a + posY * 2 + (posX + 3) * lda;
+      } else {
+	a01 = a + posX * 2 + (posY + 0) * lda;
+	a02 = a + posX * 2 + (posY + 1) * lda;
+	a03 = a + posX * 2 + (posY + 2) * lda;
+	a04 = a + posX * 2 + (posY + 3) * lda;
+      }
+
+      i = (m >> 2);
+      if (i > 0) {
+	do {
+	  if (X > posY) {
+	    for (ii = 0; ii < 4; ii++){
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+	      b[  2] = *(a02 +  0);
+	      b[  3] = *(a02 +  1);
+	      b[  4] = *(a03 +  0);
+	      b[  5] = *(a03 +  1);
+	      b[  6] = *(a04 +  0);
+	      b[  7] = *(a04 +  1);
+
+	      a01 += 2;
+	      a02 += 2;
+	      a03 += 2;
+	      a04 += 2;
+	      b += 8;
+	    }
+	  } else
+	    if (X < posY) {
+	      a01 += 4 * lda;
+	      a02 += 4 * lda;
+	      a03 += 4 * lda;
+	      a04 += 4 * lda;
+	      b += 32;
+	    } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = ZERO;
+	      b[  3] = ZERO;
+	      b[  4] = ZERO;
+	      b[  5] = ZERO;
+	      b[  6] = ZERO;
+	      b[  7] = ZERO;
+
+	      b[  8] = *(a01 +  2);
+	      b[  9] = *(a01 +  3);
+#ifdef UNIT
+	      b[ 10] = ONE;
+	      b[ 11] = ZERO;
+#else
+	      b[ 10] = *(a02 +  2);
+	      b[ 11] = *(a02 +  3);
+#endif
+	      b[ 12] = ZERO;
+	      b[ 13] = ZERO;
+	      b[ 14] = ZERO;
+	      b[ 15] = ZERO;
+
+	      b[ 16] = *(a01 +  4);
+	      b[ 17] = *(a01 +  5);
+	      b[ 18] = *(a02 +  4);
+	      b[ 19] = *(a02 +  5);
+#ifdef UNIT
+	      b[ 20] = ONE;
+	      b[ 21] = ZERO;
+#else
+	      b[ 20] = *(a03 +  4);
+	      b[ 21] = *(a03 +  5);
+#endif
+	      b[ 22] = ZERO;
+	      b[ 23] = ZERO;
+
+	      b[ 24] = *(a01 +  6);
+	      b[ 25] = *(a01 +  7);
+	      b[ 26] = *(a02 +  6);
+	      b[ 27] = *(a02 +  7);
+	      b[ 28] = *(a03 +  6);
+	      b[ 29] = *(a03 +  7);
+#ifdef UNIT
+	      b[ 30] = ONE;
+	      b[ 31] = ZERO;
+#else
+	      b[ 30] = *(a04 +  6);
+	      b[ 31] = *(a04 +  7);
+#endif
+
+	      a01 += 8;
+	      a02 += 8;
+	      a03 += 8;
+	      a04 += 8;
+	      b += 32;
+	    }
+
+	  X += 4;
+	  i --;
+	} while (i > 0);
+      }
+
+      i = (m & 3);
+      if (i) {
+
+	if (X > posY) {
+
+	  for (ii = 0; ii < i; ii++){
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+	    b[  2] = *(a02 +  0);
+	    b[  3] = *(a02 +  1);
+	    b[  4] = *(a03 +  0);
+	    b[  5] = *(a03 +  1);
+	    b[  6] = *(a04 +  0);
+	    b[  7] = *(a04 +  1);
+
+	    a01 += 2;
+	    a02 += 2;
+	    a03 += 2;
+	    a04 += 2;
+	    b += 8;
+	  }
+	} else
+	  if (X < posY) {
+	    /* a01 += i * lda;
+	    a02 += i * lda;
+	    a03 += i * lda;
+	    a04 += i * lda; */
+	    b += 8 * i;
+	  } else {
+#ifdef UNIT
+	    b[  0] = ONE;
+	    b[  1] = ZERO;
+#else
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+#endif
+	    b[  2] = ZERO;
+	    b[  3] = ZERO;
+	    b[  4] = ZERO;
+	    b[  5] = ZERO;
+	    b[  6] = ZERO;
+	    b[  7] = ZERO;
+	    b += 8;
+
+	    if (i >= 2) {
+	      b[ 0] = *(a01 +  2);
+	      b[ 1] = *(a01 +  3);
+#ifdef UNIT
+	      b[ 2] = ONE;
+	      b[ 3] = ZERO;
+#else
+	      b[ 2] = *(a02 +  2);
+	      b[ 3] = *(a02 +  3);
+#endif
+	      b[ 4] = ZERO;
+	      b[ 5] = ZERO;
+	      b[ 6] = ZERO;
+	      b[ 7] = ZERO;
+	      b += 8;
+	    }
+
+	    if (i >= 3) {
+	      b[ 0] = *(a01 +  4);
+	      b[ 1] = *(a01 +  5);
+	      b[ 2] = *(a02 +  4);
+	      b[ 3] = *(a02 +  5);
+#ifdef UNIT
+	      b[ 4] = ONE;
+	      b[ 5] = ZERO;
+#else
+	      b[ 4] = *(a03 +  4);
+	      b[ 5] = *(a03 +  5);
+#endif
+	      b[ 6] = ZERO;
+	      b[ 7] = ZERO;
+	      b += 8;
+	    }
+	  }
+      }
+
+      posY += 4;
+  }
+
+  if (n & 2){
+      X = posX;
+
+      if (posX <= posY) {
+	a01 = a + posY * 2 + (posX + 0) * lda;
+	a02 = a + posY * 2 + (posX + 1) * lda;
+      } else {
+	a01 = a + posX * 2 + (posY + 0) * lda;
+	a02 = a + posX * 2 + (posY + 1) * lda;
+      }
+
+      i = (m >> 1);
+      if (i > 0) {
+	do {
+	  if (X > posY) {
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+	    b[  2] = *(a02 +  0);
+	    b[  3] = *(a02 +  1);
+	    b[  4] = *(a01 +  2);
+	    b[  5] = *(a01 +  3);
+	    b[  6] = *(a02 +  2);
+	    b[  7] = *(a02 +  3);
+
+	    a01 += 4;
+	    a02 += 4;
+	    b += 8;
+	  } else
+	    if (X < posY) {
+	      a01 += 2 * lda;
+	      a02 += 2 * lda;
+	      b += 8;
+	    } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = ZERO;
+	      b[  3] = ZERO;
+
+	      b[  4] = *(a01 +  2);
+	      b[  5] = *(a01 +  3);
+#ifdef UNIT
+	      b[  6] = ONE;
+	      b[  7] = ZERO;
+#else
+	      b[  6] = *(a02 +  2);
+	      b[  7] = *(a02 +  3);
+#endif
+	      a01 += 4;
+	      a02 += 4;
+	      b += 8;
+	    }
+
+	  X += 2;
+	  i --;
+	} while (i > 0);
+      }
+
+      if (m & 1) {
+
+	if (X > posY) {
+	  b[  0] = *(a01 +  0);
+	  b[  1] = *(a01 +  1);
+	  b[  2] = *(a02 +  0);
+	  b[  3] = *(a02 +  1);
+	  /* a01 += 2;
+	  a02 += 2; */
+	  b += 4;
+	} else
+	  if (X < posY) {
+	    /* a01 += 2 * lda;
+	    a02 += 2 * lda; */
+	    b += 4;
+	  } else {
+#ifdef UNIT
+	    b[  0] = ONE;
+	    b[  1] = ZERO;
+#else
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+#endif
+	    b[  2] = ZERO;
+	    b[  3] = ZERO;
+	    b += 4;
+	  }
+      }
+      posY += 2;
+  }
+
+  if (n & 1){
+      X = posX;
+
+      if (posX <= posY) {
+	a01 = a + posY * 2 + (posX + 0) * lda;
+      } else {
+	a01 = a + posX * 2 + (posY + 0) * lda;
+      }
+
+      i = m;
+      if (m > 0) {
+	do {
+	  if (X > posY) {
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+	    a01 += 2;
+	    b += 2;
+	  } else
+	    if (X < posY) {
+	      a01 += lda;
+	      b += 2;
+	    } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      a01 += 2;
+	      b += 2;
+	    }
+
+	  X += 1;
+	  i --;
+	} while (i > 0);
+      }
+  }
+
+  return 0;
+}
diff --git a/kernel/generic/ztrmm_ltcopy_16.c b/kernel/generic/ztrmm_ltcopy_16.c
new file mode 100644
index 000000000..8d585e70b
--- /dev/null
+++ b/kernel/generic/ztrmm_ltcopy_16.c
@@ -0,0 +1,2313 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+  BLASLONG i, js;
+  BLASLONG X, ii;
+
+  FLOAT *a01, *a02, *a03, *a04, *a05, *a06, *a07, *a08;
+  FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16;
+
+  lda += lda;
+
+  js = (n >> 4);
+
+  if (js > 0){
+    do {
+      X = posX;
+
+      if (posX <= posY) {
+	a01 = a + posY * 2 + (posX + 0) * lda;
+	a02 = a + posY * 2 + (posX + 1) * lda;
+	a03 = a + posY * 2 + (posX + 2) * lda;
+	a04 = a + posY * 2 + (posX + 3) * lda;
+	a05 = a + posY * 2 + (posX + 4) * lda;
+	a06 = a + posY * 2 + (posX + 5) * lda;
+	a07 = a + posY * 2 + (posX + 6) * lda;
+	a08 = a + posY * 2 + (posX + 7) * lda;
+	a09 = a + posY * 2 + (posX + 8) * lda;
+	a10 = a + posY * 2 + (posX + 9) * lda;
+	a11 = a + posY * 2 + (posX + 10) * lda;
+	a12 = a + posY * 2 + (posX + 11) * lda;
+	a13 = a + posY * 2 + (posX + 12) * lda;
+	a14 = a + posY * 2 + (posX + 13) * lda;
+	a15 = a + posY * 2 + (posX + 14) * lda;
+	a16 = a + posY * 2 + (posX + 15) * lda;
+      } else {
+	a01 = a + posX * 2 + (posY + 0) * lda;
+	a02 = a + posX * 2 + (posY + 1) * lda;
+	a03 = a + posX * 2 + (posY + 2) * lda;
+	a04 = a + posX * 2 + (posY + 3) * lda;
+	a05 = a + posX * 2 + (posY + 4) * lda;
+	a06 = a + posX * 2 + (posY + 5) * lda;
+	a07 = a + posX * 2 + (posY + 6) * lda;
+	a08 = a + posX * 2 + (posY + 7) * lda;
+	a09 = a + posX * 2 + (posY + 8) * lda;
+	a10 = a + posX * 2 + (posY + 9) * lda;
+	a11 = a + posX * 2 + (posY + 10) * lda;
+	a12 = a + posX * 2 + (posY + 11) * lda;
+	a13 = a + posX * 2 + (posY + 12) * lda;
+	a14 = a + posX * 2 + (posY + 13) * lda;
+	a15 = a + posX * 2 + (posY + 14) * lda;
+	a16 = a + posX * 2 + (posY + 15) * lda;
+      }
+
+      i = (m >> 4);
+      if (i > 0) {
+	do {
+	  if (X > posY) {
+		a01 += 32;
+		a02 += 32;
+		a03 += 32;
+		a04 += 32;
+		a05 += 32;
+		a06 += 32;
+		a07 += 32;
+		a08 += 32;
+		a09 += 32;
+		a10 += 32;
+		a11 += 32;
+		a12 += 32;
+		a13 += 32;
+		a14 += 32;
+		a15 += 32;
+		a16 += 32;
+		b += 512;
+	  } else
+	    if (X < posY) {
+	      for (ii = 0; ii < 16; ii++){
+
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+	      b[  2] = *(a01 +  2);
+	      b[  3] = *(a01 +  3);
+	      b[  4] = *(a01 +  4);
+	      b[  5] = *(a01 +  5);
+	      b[  6] = *(a01 +  6);
+	      b[  7] = *(a01 +  7);
+
+	      b[  8] = *(a01 +  8);
+	      b[  9] = *(a01 +  9);
+	      b[ 10] = *(a01 + 10);
+	      b[ 11] = *(a01 + 11);
+	      b[ 12] = *(a01 + 12);
+	      b[ 13] = *(a01 + 13);
+	      b[ 14] = *(a01 + 14);
+	      b[ 15] = *(a01 + 15);
+
+		  b[ 16] = *(a01 + 16);
+	      b[ 17] = *(a01 + 17);
+	      b[ 18] = *(a01 + 18);
+	      b[ 19] = *(a01 + 19);
+	      b[ 20] = *(a01 + 20);
+	      b[ 21] = *(a01 + 21);
+	      b[ 22] = *(a01 + 22);
+	      b[ 23] = *(a01 + 23);
+
+	      b[ 24] = *(a01 + 24);
+	      b[ 25] = *(a01 + 25);
+	      b[ 26] = *(a01 + 26);
+	      b[ 27] = *(a01 + 27);
+	      b[ 28] = *(a01 + 28);
+	      b[ 29] = *(a01 + 29);
+	      b[ 30] = *(a01 + 30);
+	      b[ 31] = *(a01 + 31);
+
+	      a01 += lda;
+	      b += 32;
+	    }
+		a02 += 16 * lda;
+		a03 += 16 * lda;
+		a04 += 16 * lda;
+		a05 += 16 * lda;
+		a06 += 16 * lda;
+		a07 += 16 * lda;
+		a08 += 16 * lda;
+		a09 += 16 * lda;
+		a10 += 16 * lda;
+		a11 += 16 * lda;
+		a12 += 16 * lda;
+		a13 += 16 * lda;
+		a14 += 16 * lda;
+		a15 += 16 * lda;
+		a16 += 16 * lda;
+
+	    } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = *(a01 +  2);
+	      b[  3] = *(a01 +  3);
+	      b[  4] = *(a01 +  4);
+	      b[  5] = *(a01 +  5);
+	      b[  6] = *(a01 +  6);
+	      b[  7] = *(a01 +  7);
+	      b[  8] = *(a01 +  8);
+	      b[  9] = *(a01 +  9);
+	      b[ 10] = *(a01 + 10);
+	      b[ 11] = *(a01 + 11);
+	      b[ 12] = *(a01 + 12);
+	      b[ 13] = *(a01 + 13);
+	      b[ 14] = *(a01 + 14);
+	      b[ 15] = *(a01 + 15);
+		  b[ 16] = *(a01 + 16);
+	      b[ 17] = *(a01 + 17);
+	      b[ 18] = *(a01 + 18);
+	      b[ 19] = *(a01 + 19);
+	      b[ 20] = *(a01 + 20);
+	      b[ 21] = *(a01 + 21);
+	      b[ 22] = *(a01 + 22);
+	      b[ 23] = *(a01 + 23);
+	      b[ 24] = *(a01 + 24);
+	      b[ 25] = *(a01 + 25);
+	      b[ 26] = *(a01 + 26);
+	      b[ 27] = *(a01 + 27);
+	      b[ 28] = *(a01 + 28);
+	      b[ 29] = *(a01 + 29);
+		  b[ 30] = *(a01 + 30);
+	      b[ 31] = *(a01 + 31);
+
+	      b[ 32] = ZERO;
+	      b[ 33] = ZERO;
+#ifdef UNIT
+	      b[ 34] = ONE;
+	      b[ 35] = ZERO;
+#else
+	      b[ 34] = *(a02 +  2);
+	      b[ 35] = *(a02 +  3);
+#endif
+	      b[ 36] = *(a02 +  4);
+	      b[ 37] = *(a02 +  5);
+	      b[ 38] = *(a02 +  6);
+	      b[ 39] = *(a02 +  7);
+	      b[ 40] = *(a02 +  8);
+	      b[ 41] = *(a02 +  9);
+	      b[ 42] = *(a02 + 10);
+	      b[ 43] = *(a02 + 11);
+	      b[ 44] = *(a02 + 12);
+	      b[ 45] = *(a02 + 13);
+	      b[ 46] = *(a02 + 14);
+	      b[ 47] = *(a02 + 15);
+	      b[ 48] = *(a02 + 16);
+	      b[ 49] = *(a02 + 17);
+	      b[ 50] = *(a02 + 18);
+	      b[ 51] = *(a02 + 19);
+	      b[ 52] = *(a02 + 20);
+	      b[ 53] = *(a02 + 21);
+	      b[ 54] = *(a02 + 22);
+	      b[ 55] = *(a02 + 23);
+	      b[ 56] = *(a02 + 24);
+	      b[ 57] = *(a02 + 25);
+	      b[ 58] = *(a02 + 26);
+	      b[ 59] = *(a02 + 27);
+	      b[ 60] = *(a02 + 28);
+	      b[ 61] = *(a02 + 29);
+	      b[ 62] = *(a02 + 30);
+	      b[ 63] = *(a02 + 31);
+
+	      b[ 64] = ZERO;
+	      b[ 65] = ZERO;
+	      b[ 66] = ZERO;
+	      b[ 67] = ZERO;
+#ifdef UNIT
+	      b[ 68] = ONE;
+	      b[ 69] = ZERO;
+#else
+	      b[ 68] = *(a03 +  4);
+	      b[ 69] = *(a03 +  5);
+#endif
+	      b[ 70] = *(a03 +  6);
+	      b[ 71] = *(a03 +  7);
+	      b[ 72] = *(a03 +  8);
+	      b[ 73] = *(a03 +  9);
+	      b[ 74] = *(a03 + 10);
+	      b[ 75] = *(a03 + 11);
+	      b[ 76] = *(a03 + 12);
+	      b[ 77] = *(a03 + 13);
+	      b[ 78] = *(a03 + 14);
+	      b[ 79] = *(a03 + 15);
+	      b[ 80] = *(a03 + 16);
+	      b[ 81] = *(a03 + 17);
+	      b[ 82] = *(a03 + 18);
+		  b[ 83] = *(a03 + 19);
+	      b[ 84] = *(a03 + 20);
+	      b[ 85] = *(a03 + 21);
+	      b[ 86] = *(a03 + 22);
+	      b[ 87] = *(a03 + 23);
+	      b[ 88] = *(a03 + 24);
+	      b[ 89] = *(a03 + 25);
+	      b[ 90] = *(a03 + 26);
+	      b[ 91] = *(a03 + 27);
+	      b[ 92] = *(a03 + 28);
+	      b[ 93] = *(a03 + 29);
+	      b[ 94] = *(a03 + 30);
+	      b[ 95] = *(a03 + 31);
+
+	      b[ 96] = ZERO;
+	      b[ 97] = ZERO;
+	      b[ 98] = ZERO;
+	      b[ 99] = ZERO;
+	      b[100] = ZERO;
+	      b[101] = ZERO;
+#ifdef UNIT
+	      b[102] = ONE;
+	      b[103] = ZERO;
+#else
+	      b[102] = *(a04 +  6);
+	      b[103] = *(a04 +  7);
+#endif
+	      b[104] = *(a04 +  8);
+	      b[105] = *(a04 +  9);
+	      b[106] = *(a04 + 10);
+	      b[107] = *(a04 + 11);
+	      b[108] = *(a04 + 12);
+	      b[109] = *(a04 + 13);
+	      b[110] = *(a04 + 14);
+	      b[111] = *(a04 + 15);
+	      b[112] = *(a04 + 16);
+	      b[113] = *(a04 + 17);
+	      b[114] = *(a04 + 18);
+	      b[115] = *(a04 + 19);
+		  b[116] = *(a04 + 20);
+	      b[117] = *(a04 + 21);
+	      b[118] = *(a04 + 22);
+	      b[119] = *(a04 + 23);
+	      b[120] = *(a04 + 24);
+	      b[121] = *(a04 + 25);
+	      b[122] = *(a04 + 26);
+	      b[123] = *(a04 + 27);
+	      b[124] = *(a04 + 28);
+	      b[125] = *(a04 + 29);
+	      b[126] = *(a04 + 30);
+	      b[127] = *(a04 + 31);
+
+	      b[128] = ZERO;
+	      b[129] = ZERO;
+	      b[130] = ZERO;
+	      b[131] = ZERO;
+	      b[132] = ZERO;
+	      b[133] = ZERO;
+	      b[134] = ZERO;
+	      b[135] = ZERO;
+#ifdef UNIT
+	      b[136] = ONE;
+	      b[137] = ZERO;
+#else
+	      b[136] = *(a05 +  8);
+	      b[137] = *(a05 +  9);
+#endif
+	      b[138] = *(a05 + 10);
+	      b[139] = *(a05 + 11);
+	      b[140] = *(a05 + 12);
+	      b[141] = *(a05 + 13);
+	      b[142] = *(a05 + 14);
+	      b[143] = *(a05 + 15);
+	      b[144] = *(a05 + 16);
+	      b[145] = *(a05 + 17);
+	      b[146] = *(a05 + 18);
+	      b[147] = *(a05 + 19);
+	      b[148] = *(a05 + 20);
+		  b[149] = *(a05 + 21);
+	      b[150] = *(a05 + 22);
+	      b[151] = *(a05 + 23);
+	      b[152] = *(a05 + 24);
+	      b[153] = *(a05 + 25);
+	      b[154] = *(a05 + 26);
+	      b[155] = *(a05 + 27);
+	      b[156] = *(a05 + 28);
+	      b[157] = *(a05 + 29);
+	      b[158] = *(a05 + 30);
+	      b[159] = *(a05 + 31);
+
+	      b[160] = ZERO;
+	      b[161] = ZERO;
+	      b[162] = ZERO;
+	      b[163] = ZERO;
+	      b[164] = ZERO;
+	      b[165] = ZERO;
+	      b[166] = ZERO;
+	      b[167] = ZERO;
+	      b[168] = ZERO;
+	      b[169] = ZERO;
+#ifdef UNIT
+	      b[170] = ONE;
+	      b[171] = ZERO;
+#else
+	      b[170] = *(a06 + 10);
+	      b[171] = *(a06 + 11);
+#endif
+	      b[172] = *(a06 + 12);
+	      b[173] = *(a06 + 13);
+	      b[174] = *(a06 + 14);
+	      b[175] = *(a06 + 15);
+	      b[176] = *(a06 + 16);
+	      b[177] = *(a06 + 17);
+	      b[178] = *(a06 + 18);
+	      b[179] = *(a06 + 19);
+	      b[180] = *(a06 + 20);
+	      b[181] = *(a06 + 21);
+		  b[182] = *(a06 + 22);
+	      b[183] = *(a06 + 23);
+	      b[184] = *(a06 + 24);
+	      b[185] = *(a06 + 25);
+	      b[186] = *(a06 + 26);
+	      b[187] = *(a06 + 27);
+	      b[188] = *(a06 + 28);
+	      b[189] = *(a06 + 29);
+	      b[190] = *(a06 + 30);
+	      b[191] = *(a06 + 31);
+
+	      b[192] = ZERO;
+	      b[193] = ZERO;
+	      b[194] = ZERO;
+	      b[195] = ZERO;
+	      b[196] = ZERO;
+	      b[197] = ZERO;
+	      b[198] = ZERO;
+	      b[199] = ZERO;
+	      b[200] = ZERO;
+	      b[201] = ZERO;
+	      b[202] = ZERO;
+	      b[203] = ZERO;
+#ifdef UNIT
+	      b[204] = ONE;
+	      b[205] = ZERO;
+#else
+	      b[204] = *(a07 + 12);
+	      b[205] = *(a07 + 13);
+#endif
+	      b[206] = *(a07 + 14);
+	      b[207] = *(a07 + 15);
+	      b[208] = *(a07 + 16);
+	      b[209] = *(a07 + 17);
+	      b[210] = *(a07 + 18);
+	      b[211] = *(a07 + 19);
+	      b[212] = *(a07 + 20);
+	      b[213] = *(a07 + 21);
+	      b[214] = *(a07 + 22);
+		  b[215] = *(a07 + 23);
+	      b[216] = *(a07 + 24);
+	      b[217] = *(a07 + 25);
+	      b[218] = *(a07 + 26);
+	      b[219] = *(a07 + 27);
+	      b[220] = *(a07 + 28);
+	      b[221] = *(a07 + 29);
+	      b[222] = *(a07 + 30);
+	      b[223] = *(a07 + 31);
+
+	      b[224] = ZERO;
+	      b[225] = ZERO;
+	      b[226] = ZERO;
+	      b[227] = ZERO;
+	      b[228] = ZERO;
+	      b[229] = ZERO;
+	      b[230] = ZERO;
+	      b[231] = ZERO;
+	      b[232] = ZERO;
+	      b[233] = ZERO;
+	      b[234] = ZERO;
+	      b[235] = ZERO;
+	      b[236] = ZERO;
+	      b[237] = ZERO;
+#ifdef UNIT
+	      b[238] = ONE;
+	      b[239] = ZERO;
+#else
+	      b[238] = *(a08 + 14);
+	      b[239] = *(a08 + 15);
+#endif
+		  b[240] = *(a08 + 16);
+	      b[241] = *(a08 + 17);
+	      b[242] = *(a08 + 18);
+	      b[243] = *(a08 + 19);
+	      b[244] = *(a08 + 20);
+	      b[245] = *(a08 + 21);
+	      b[246] = *(a08 + 22);
+	      b[247] = *(a08 + 23);
+		  b[248] = *(a08 + 24);
+	      b[249] = *(a08 + 25);
+	      b[250] = *(a08 + 26);
+	      b[251] = *(a08 + 27);
+	      b[252] = *(a08 + 28);
+	      b[253] = *(a08 + 29);
+	      b[254] = *(a08 + 30);
+	      b[255] = *(a08 + 31);
+
+	      b[256] = ZERO;
+	      b[257] = ZERO;
+	      b[258] = ZERO;
+	      b[259] = ZERO;
+	      b[260] = ZERO;
+	      b[261] = ZERO;
+	      b[262] = ZERO;
+	      b[263] = ZERO;
+	      b[264] = ZERO;
+	      b[265] = ZERO;
+	      b[266] = ZERO;
+	      b[267] = ZERO;
+	      b[268] = ZERO;
+	      b[269] = ZERO;
+	      b[270] = ZERO;
+	      b[271] = ZERO;
+#ifdef UNIT
+	      b[272] = ONE;
+		  b[273] = ZERO;
+#else
+	      b[272] = *(a09 + 16);
+		  b[273] = *(a09 + 17);
+#endif
+		  b[274] = *(a09 + 18);
+	      b[275] = *(a09 + 19);
+	      b[276] = *(a09 + 20);
+	      b[277] = *(a09 + 21);
+	      b[278] = *(a09 + 22);
+	      b[279] = *(a09 + 23);
+	      b[280] = *(a09 + 24);
+		  b[281] = *(a09 + 25);
+	      b[282] = *(a09 + 26);
+	      b[283] = *(a09 + 27);
+	      b[284] = *(a09 + 28);
+	      b[285] = *(a09 + 29);
+	      b[286] = *(a09 + 30);
+	      b[287] = *(a09 + 31);
+
+		  b[288] = ZERO;
+		  b[289] = ZERO;
+	      b[290] = ZERO;
+	      b[291] = ZERO;
+	      b[292] = ZERO;
+	      b[293] = ZERO;
+	      b[294] = ZERO;
+	      b[295] = ZERO;
+	      b[296] = ZERO;
+	      b[297] = ZERO;
+	      b[298] = ZERO;
+	      b[299] = ZERO;
+	      b[300] = ZERO;
+	      b[301] = ZERO;
+	      b[302] = ZERO;
+	      b[303] = ZERO;
+	      b[304] = ZERO;
+	      b[305] = ZERO;
+#ifdef UNIT
+	      b[306] = ONE;
+		  b[307] = ZERO;
+#else
+	      b[306] = *(a10 + 18);
+		  b[307] = *(a10 + 19);
+#endif
+		  b[308] = *(a10 + 20);
+	      b[309] = *(a10 + 21);
+	      b[310] = *(a10 + 22);
+	      b[311] = *(a10 + 23);
+	      b[312] = *(a10 + 24);
+	      b[313] = *(a10 + 25);
+		  b[314] = *(a10 + 26);
+	      b[315] = *(a10 + 27);
+	      b[316] = *(a10 + 28);
+	      b[317] = *(a10 + 29);
+	      b[318] = *(a10 + 30);
+	      b[319] = *(a10 + 31);
+
+		  b[320] = ZERO;
+		  b[321] = ZERO;
+	      b[322] = ZERO;
+	      b[323] = ZERO;
+	      b[324] = ZERO;
+	      b[325] = ZERO;
+	      b[326] = ZERO;
+	      b[327] = ZERO;
+	      b[328] = ZERO;
+	      b[329] = ZERO;
+	      b[330] = ZERO;
+	      b[331] = ZERO;
+	      b[332] = ZERO;
+	      b[333] = ZERO;
+	      b[334] = ZERO;
+	      b[335] = ZERO;
+	      b[336] = ZERO;
+	      b[337] = ZERO;
+	      b[338] = ZERO;
+	      b[339] = ZERO;
+#ifdef UNIT
+	      b[340] = ONE;
+	      b[341] = ZERO;
+#else
+	      b[340] = *(a11 + 20);
+	      b[341] = *(a11 + 21);
+#endif
+		  b[342] = *(a11 + 22);
+	      b[343] = *(a11 + 23);
+	      b[344] = *(a11 + 24);
+	      b[345] = *(a11 + 25);
+	      b[346] = *(a11 + 26);
+		  b[347] = *(a11 + 27);
+	      b[348] = *(a11 + 28);
+	      b[349] = *(a11 + 29);
+	      b[350] = *(a11 + 30);
+	      b[351] = *(a11 + 31);
+
+		  b[352] = ZERO;
+		  b[353] = ZERO;
+	      b[354] = ZERO;
+	      b[355] = ZERO;
+	      b[356] = ZERO;
+	      b[357] = ZERO;
+	      b[358] = ZERO;
+	      b[359] = ZERO;
+	      b[360] = ZERO;
+	      b[361] = ZERO;
+	      b[362] = ZERO;
+	      b[363] = ZERO;
+	      b[364] = ZERO;
+	      b[365] = ZERO;
+	      b[366] = ZERO;
+	      b[367] = ZERO;
+	      b[368] = ZERO;
+	      b[369] = ZERO;
+	      b[370] = ZERO;
+	      b[371] = ZERO;
+	      b[372] = ZERO;
+	      b[373] = ZERO;
+#ifdef UNIT
+	      b[374] = ONE;
+	      b[375] = ZERO;
+#else
+	      b[374] = *(a12 + 22);
+	      b[375] = *(a12 + 23);
+#endif
+		  b[376] = *(a12 + 24);
+	      b[377] = *(a12 + 25);
+	      b[378] = *(a12 + 26);
+	      b[379] = *(a12 + 27);
+		  b[380] = *(a12 + 28);
+	      b[381] = *(a12 + 29);
+	      b[382] = *(a12 + 30);
+	      b[383] = *(a12 + 31);
+
+		  b[384] = ZERO;
+		  b[385] = ZERO;
+	      b[386] = ZERO;
+	      b[387] = ZERO;
+	      b[388] = ZERO;
+	      b[389] = ZERO;
+	      b[390] = ZERO;
+	      b[391] = ZERO;
+	      b[392] = ZERO;
+	      b[393] = ZERO;
+	      b[394] = ZERO;
+	      b[395] = ZERO;
+	      b[396] = ZERO;
+	      b[397] = ZERO;
+	      b[398] = ZERO;
+	      b[399] = ZERO;
+	      b[400] = ZERO;
+	      b[401] = ZERO;
+	      b[402] = ZERO;
+	      b[403] = ZERO;
+	      b[404] = ZERO;
+	      b[405] = ZERO;
+	      b[406] = ZERO;
+	      b[407] = ZERO;
+#ifdef UNIT
+	      b[408] = ONE;
+	      b[409] = ZERO;
+#else
+	      b[408] = *(a13 + 24);
+	      b[409] = *(a13 + 25);
+#endif
+		  b[410] = *(a13 + 26);
+	      b[411] = *(a13 + 27);
+	      b[412] = *(a13 + 28);
+		  b[413] = *(a13 + 29);
+	      b[414] = *(a13 + 30);
+	      b[415] = *(a13 + 31);
+
+		  b[416] = ZERO;
+		  b[417] = ZERO;
+	      b[418] = ZERO;
+	      b[419] = ZERO;
+	      b[420] = ZERO;
+	      b[421] = ZERO;
+	      b[422] = ZERO;
+	      b[423] = ZERO;
+	      b[424] = ZERO;
+	      b[425] = ZERO;
+	      b[426] = ZERO;
+	      b[427] = ZERO;
+	      b[428] = ZERO;
+	      b[429] = ZERO;
+	      b[430] = ZERO;
+	      b[431] = ZERO;
+	      b[432] = ZERO;
+	      b[433] = ZERO;
+	      b[434] = ZERO;
+	      b[435] = ZERO;
+	      b[436] = ZERO;
+	      b[437] = ZERO;
+	      b[438] = ZERO;
+	      b[439] = ZERO;
+	      b[440] = ZERO;
+	      b[441] = ZERO;
+#ifdef UNIT
+	      b[442] = ONE;
+	      b[443] = ZERO;
+#else
+	      b[442] = *(a14 + 26);
+	      b[443] = *(a14 + 27);
+#endif
+		  b[444] = *(a14 + 28);
+	      b[445] = *(a14 + 29);
+		  b[446] = *(a14 + 30);
+	      b[447] = *(a14 + 31);
+
+		  b[448] = ZERO;
+		  b[449] = ZERO;
+	      b[450] = ZERO;
+	      b[451] = ZERO;
+	      b[452] = ZERO;
+	      b[453] = ZERO;
+	      b[454] = ZERO;
+	      b[455] = ZERO;
+	      b[456] = ZERO;
+	      b[457] = ZERO;
+	      b[458] = ZERO;
+	      b[459] = ZERO;
+	      b[460] = ZERO;
+	      b[461] = ZERO;
+	      b[462] = ZERO;
+	      b[463] = ZERO;
+	      b[464] = ZERO;
+	      b[465] = ZERO;
+	      b[466] = ZERO;
+	      b[467] = ZERO;
+	      b[468] = ZERO;
+	      b[469] = ZERO;
+	      b[470] = ZERO;
+	      b[471] = ZERO;
+	      b[472] = ZERO;
+	      b[473] = ZERO;
+	      b[474] = ZERO;
+	      b[475] = ZERO;
+#ifdef UNIT
+	      b[476] = ONE;
+	      b[477] = ZERO;
+#else
+	      b[476] = *(a15 + 28);
+	      b[477] = *(a15 + 29);
+#endif
+		  b[478] = *(a15 + 30);
+		  b[479] = *(a15 + 31);
+
+		  b[480] = ZERO;
+		  b[481] = ZERO;
+	      b[482] = ZERO;
+	      b[483] = ZERO;
+	      b[484] = ZERO;
+	      b[485] = ZERO;
+	      b[486] = ZERO;
+	      b[487] = ZERO;
+	      b[488] = ZERO;
+	      b[489] = ZERO;
+	      b[490] = ZERO;
+	      b[491] = ZERO;
+	      b[492] = ZERO;
+	      b[493] = ZERO;
+	      b[494] = ZERO;
+	      b[495] = ZERO;
+	      b[496] = ZERO;
+	      b[497] = ZERO;
+	      b[498] = ZERO;
+	      b[499] = ZERO;
+	      b[500] = ZERO;
+	      b[501] = ZERO;
+	      b[502] = ZERO;
+	      b[503] = ZERO;
+	      b[504] = ZERO;
+	      b[505] = ZERO;
+	      b[506] = ZERO;
+	      b[507] = ZERO;
+	      b[508] = ZERO;
+	      b[509] = ZERO;
+#ifdef UNIT
+	      b[510] = ONE;
+	      b[511] = ZERO;
+#else
+	      b[510] = *(a16 + 30);
+	      b[511] = *(a16 + 31);
+#endif
+
+	      a01 += 32;
+	      a02 += 32;
+	      a03 += 32;
+	      a04 += 32;
+	      a05 += 32;
+	      a06 += 32;
+	      a07 += 32;
+	      a08 += 32;
+	      a09 += 32;
+	      a10 += 32;
+	      a11 += 32;
+	      a12 += 32;
+	      a13 += 32;
+	      a14 += 32;
+	      a15 += 32;
+	      a16 += 32;
+	      b += 512;
+	    }
+
+	  X += 16;
+	  i --;
+	} while (i > 0);
+      }
+
+      i = (m & 15);
+      if (i) {
+
+	if (X > posY) {
+		/* a01 += i * lda;
+	      a02 += i * lda;
+	      a03 += i * lda;
+	      a04 += i * lda;
+	      a05 += i * lda;
+	      a06 += i * lda;
+	      a07 += i * lda;
+	      a08 += i * lda;
+	      a09 += i * lda;
+	      a10 += i * lda;
+	      a11 += i * lda;
+	      a12 += i * lda;
+	      a13 += i * lda;
+	      a14 += i * lda;
+	      a15 += i * lda;
+	      a16 += i * lda; */
+	    b += 32 * i;
+	} else
+	  if (X < posY) {
+	    for (ii = 0; ii < i; ii++){
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+		b[  2] = *(a01 +  2);
+		b[  3] = *(a01 +  3);
+		b[  4] = *(a01 +  4);
+		b[  5] = *(a01 +  5);
+		b[  6] = *(a01 +  6);
+		b[  7] = *(a01 +  7);
+		b[  8] = *(a01 +  8);
+		b[  9] = *(a01 +  9);
+		b[ 10] = *(a01 + 10);
+		b[ 11] = *(a01 + 11);
+		b[ 12] = *(a01 + 12);
+		b[ 13] = *(a01 + 13);
+		b[ 14] = *(a01 + 14);
+		b[ 15] = *(a01 + 15);
+
+		b[ 16] = *(a01 + 16);
+		b[ 17] = *(a01 + 17);
+		b[ 18] = *(a01 + 18);
+		b[ 19] = *(a01 + 19);
+		b[ 20] = *(a01 + 20);
+		b[ 21] = *(a01 + 21);
+		b[ 22] = *(a01 + 22);
+		b[ 23] = *(a01 + 23);
+		b[ 24] = *(a01 + 24);
+		b[ 25] = *(a01 + 25);
+		b[ 26] = *(a01 + 26);
+		b[ 27] = *(a01 + 27);
+		b[ 28] = *(a01 + 28);
+		b[ 29] = *(a01 + 29);
+		b[ 30] = *(a01 + 30);
+		b[ 31] = *(a01 + 31);
+
+	    a01 += lda;
+	    a02 += lda;
+	    a03 += lda;
+	    a04 += lda;
+	    a05 += lda;
+	    a06 += lda;
+	    a07 += lda;
+	    a08 += lda;
+		a09 += lda;
+	    a10 += lda;
+	    a11 += lda;
+	    a12 += lda;
+	    a13 += lda;
+	    a14 += lda;
+	    a15 += lda;
+	    a16 += lda;
+	    b += 32;
+	  }
+	  } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = *(a01 +  2);
+	      b[  3] = *(a01 +  3);
+	      b[  4] = *(a01 +  4);
+	      b[  5] = *(a01 +  5);
+	      b[  6] = *(a01 +  6);
+	      b[  7] = *(a01 +  7);
+	      b[  8] = *(a01 +  8);
+	      b[  9] = *(a01 +  9);
+	      b[ 10] = *(a01 + 10);
+	      b[ 11] = *(a01 + 11);
+	      b[ 12] = *(a01 + 12);
+	      b[ 13] = *(a01 + 13);
+	      b[ 14] = *(a01 + 14);
+	      b[ 15] = *(a01 + 15);
+		  b[ 16] = *(a01 + 16);
+	      b[ 17] = *(a01 + 17);
+	      b[ 18] = *(a01 + 18);
+	      b[ 19] = *(a01 + 19);
+	      b[ 20] = *(a01 + 20);
+	      b[ 21] = *(a01 + 21);
+	      b[ 22] = *(a01 + 22);
+	      b[ 23] = *(a01 + 23);
+	      b[ 24] = *(a01 + 24);
+	      b[ 25] = *(a01 + 25);
+	      b[ 26] = *(a01 + 26);
+	      b[ 27] = *(a01 + 27);
+	      b[ 28] = *(a01 + 28);
+	      b[ 29] = *(a01 + 29);
+		  b[ 30] = *(a01 + 30);
+	      b[ 31] = *(a01 + 31);
+	      b += 32;
+
+	      if (i >= 2) {
+		b[ 0] = ZERO;
+		b[ 1] = ZERO;
+#ifdef UNIT
+		b[ 2] = ONE;
+		b[ 3] = ZERO;
+#else
+		b[ 2] = *(a02 +  2);
+		b[ 3] = *(a02 +  3);
+#endif
+		b[  4] = *(a02 +  4);
+		b[  5] = *(a02 +  5);
+		b[  6] = *(a02 +  6);
+		b[  7] = *(a02 +  7);
+		b[  8] = *(a02 +  8);
+		b[  9] = *(a02 +  9);
+		b[ 10] = *(a02 + 10);
+		b[ 11] = *(a02 + 11);
+		b[ 12] = *(a02 + 12);
+		b[ 13] = *(a02 + 13);
+		b[ 14] = *(a02 + 14);
+		b[ 15] = *(a02 + 15);
+		b[ 16] = *(a02 + 16);
+		b[ 17] = *(a02 + 17);
+		b[ 18] = *(a02 + 18);
+		b[ 19] = *(a02 + 19);
+		b[ 20] = *(a02 + 20);
+		b[ 21] = *(a02 + 21);
+		b[ 22] = *(a02 + 22);
+		b[ 23] = *(a02 + 23);
+		b[ 24] = *(a02 + 24);
+		b[ 25] = *(a02 + 25);
+		b[ 26] = *(a02 + 26);
+		b[ 27] = *(a02 + 27);
+		b[ 28] = *(a02 + 28);
+		b[ 29] = *(a02 + 29);
+		b[ 30] = *(a02 + 30);
+		b[ 31] = *(a02 + 31);
+		b += 32;
+	      }
+
+	      if (i >= 3) {
+		b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+#ifdef UNIT
+		b[ 4] = ONE;
+		b[ 5] = ZERO;
+#else
+		b[ 4] = *(a03 +  4);
+		b[ 5] = *(a03 +  5);
+#endif
+		b[  6] = *(a03 +  6);
+		b[  7] = *(a03 +  7);
+		b[  8] = *(a03 +  8);
+		b[  9] = *(a03 +  9);
+		b[ 10] = *(a03 + 10);
+		b[ 11] = *(a03 + 11);
+		b[ 12] = *(a03 + 12);
+		b[ 13] = *(a03 + 13);
+		b[ 14] = *(a03 + 14);
+		b[ 15] = *(a03 + 15);
+		b[ 16] = *(a03 + 16);
+		b[ 17] = *(a03 + 17);
+		b[ 18] = *(a03 + 18);
+		b[ 19] = *(a03 + 19);
+		b[ 20] = *(a03 + 20);
+		b[ 21] = *(a03 + 21);
+		b[ 22] = *(a03 + 22);
+		b[ 23] = *(a03 + 23);
+		b[ 24] = *(a03 + 24);
+		b[ 25] = *(a03 + 25);
+		b[ 26] = *(a03 + 26);
+		b[ 27] = *(a03 + 27);
+		b[ 28] = *(a03 + 28);
+		b[ 29] = *(a03 + 29);
+		b[ 30] = *(a03 + 30);
+		b[ 31] = *(a03 + 31);
+		b += 32;
+	      }
+
+	      if (i >= 4) {
+		b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+		b[ 4] = ZERO;
+		b[ 5] = ZERO;
+#ifdef UNIT
+		b[ 6] = ONE;
+		b[ 7] = ZERO;
+#else
+		b[ 6] = *(a04 +  6);
+		b[ 7] = *(a04 +  7);
+#endif
+		b[  8] = *(a04 +  8);
+		b[  9] = *(a04 +  9);
+		b[ 10] = *(a04 + 10);
+		b[ 11] = *(a04 + 11);
+		b[ 12] = *(a04 + 12);
+		b[ 13] = *(a04 + 13);
+		b[ 14] = *(a04 + 14);
+		b[ 15] = *(a04 + 15);
+		b[ 16] = *(a04 + 16);
+		b[ 17] = *(a04 + 17);
+		b[ 18] = *(a04 + 18);
+		b[ 19] = *(a04 + 19);
+		b[ 20] = *(a04 + 20);
+		b[ 21] = *(a04 + 21);
+		b[ 22] = *(a04 + 22);
+		b[ 23] = *(a04 + 23);
+		b[ 24] = *(a04 + 24);
+		b[ 25] = *(a04 + 25);
+		b[ 26] = *(a04 + 26);
+		b[ 27] = *(a04 + 27);
+		b[ 28] = *(a04 + 28);
+		b[ 29] = *(a04 + 29);
+		b[ 30] = *(a04 + 30);
+		b[ 31] = *(a04 + 31);
+		b += 32;
+	      }
+
+	      if (i >= 5) {
+		b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+		b[ 4] = ZERO;
+		b[ 5] = ZERO;
+		b[ 6] = ZERO;
+		b[ 7] = ZERO;
+#ifdef UNIT
+		b[ 8] = ONE;
+		b[ 9] = ZERO;
+#else
+		b[ 8] = *(a05 +  8);
+		b[ 9] = *(a05 +  9);
+#endif
+		b[ 10] = *(a05 + 10);
+		b[ 11] = *(a05 + 11);
+		b[ 12] = *(a05 + 12);
+		b[ 13] = *(a05 + 13);
+		b[ 14] = *(a05 + 14);
+		b[ 15] = *(a05 + 15);
+		b[ 16] = *(a05 + 16);
+		b[ 17] = *(a05 + 17);
+		b[ 18] = *(a05 + 18);
+		b[ 19] = *(a05 + 19);
+		b[ 20] = *(a05 + 20);
+		b[ 21] = *(a05 + 21);
+		b[ 22] = *(a05 + 22);
+		b[ 23] = *(a05 + 23);
+		b[ 24] = *(a05 + 24);
+		b[ 25] = *(a05 + 25);
+		b[ 26] = *(a05 + 26);
+		b[ 27] = *(a05 + 27);
+		b[ 28] = *(a05 + 28);
+		b[ 29] = *(a05 + 29);
+		b[ 30] = *(a05 + 30);
+		b[ 31] = *(a05 + 31);
+		b += 32;
+	      }
+
+	      if (i >= 6) {
+		b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+		b[ 4] = ZERO;
+		b[ 5] = ZERO;
+		b[ 6] = ZERO;
+		b[ 7] = ZERO;
+		b[ 8] = ZERO;
+		b[ 9] = ZERO;
+#ifdef UNIT
+		b[10] = ONE;
+		b[11] = ZERO;
+#else
+		b[10] = *(a06 + 10);
+		b[11] = *(a06 + 11);
+#endif
+		b[ 12] = *(a06 + 12);
+		b[ 13] = *(a06 + 13);
+		b[ 14] = *(a06 + 14);
+		b[ 15] = *(a06 + 15);
+		b[ 16] = *(a06 + 16);
+		b[ 17] = *(a06 + 17);
+		b[ 18] = *(a06 + 18);
+		b[ 19] = *(a06 + 19);
+		b[ 20] = *(a06 + 20);
+		b[ 21] = *(a06 + 21);
+		b[ 22] = *(a06 + 22);
+		b[ 23] = *(a06 + 23);
+		b[ 24] = *(a06 + 24);
+		b[ 25] = *(a06 + 25);
+		b[ 26] = *(a06 + 26);
+		b[ 27] = *(a06 + 27);
+		b[ 28] = *(a06 + 28);
+		b[ 29] = *(a06 + 29);
+		b[ 30] = *(a06 + 30);
+		b[ 31] = *(a06 + 31);
+		b += 32;
+	      }
+
+	      if (i >= 7) {
+		b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+		b[ 4] = ZERO;
+		b[ 5] = ZERO;
+		b[ 6] = ZERO;
+		b[ 7] = ZERO;
+		b[ 8] = ZERO;
+		b[ 9] = ZERO;
+		b[10] = ZERO;
+		b[11] = ZERO;
+#ifdef UNIT
+		b[12] = ONE;
+		b[13] = ZERO;
+#else
+		b[12] = *(a07 + 12);
+		b[13] = *(a07 + 13);
+#endif
+		b[ 14] = *(a07 + 14);
+		b[ 15] = *(a07 + 15);
+		b[ 16] = *(a07 + 16);
+		b[ 17] = *(a07 + 17);
+		b[ 18] = *(a07 + 18);
+		b[ 19] = *(a07 + 19);
+		b[ 20] = *(a07 + 20);
+		b[ 21] = *(a07 + 21);
+		b[ 22] = *(a07 + 22);
+		b[ 23] = *(a07 + 23);
+		b[ 24] = *(a07 + 24);
+		b[ 25] = *(a07 + 25);
+		b[ 26] = *(a07 + 26);
+		b[ 27] = *(a07 + 27);
+		b[ 28] = *(a07 + 28);
+		b[ 29] = *(a07 + 29);
+		b[ 30] = *(a07 + 30);
+		b[ 31] = *(a07 + 31);
+		b += 32;
+	    }
+
+		if (i >= 8) {
+	      b[  0] = ZERO;
+	      b[  1] = ZERO;
+	      b[  2] = ZERO;
+	      b[  3] = ZERO;
+	      b[  4] = ZERO;
+	      b[  5] = ZERO;
+	      b[  6] = ZERO;
+	      b[  7] = ZERO;
+	      b[  8] = ZERO;
+	      b[  9] = ZERO;
+	      b[ 10] = ZERO;
+	      b[ 11] = ZERO;
+	      b[ 12] = ZERO;
+	      b[ 13] = ZERO;
+#ifdef UNIT
+	      b[ 14] = ONE;
+	      b[ 15] = ZERO;
+#else
+	      b[ 14] = *(a08 +  14);
+	      b[ 15] = *(a08 +  15);
+#endif
+		b[ 16] = *(a08 + 16);
+		b[ 17] = *(a08 + 17);
+		b[ 18] = *(a08 + 18);
+		b[ 19] = *(a08 + 19);
+		b[ 20] = *(a08 + 20);
+		b[ 21] = *(a08 + 21);
+		b[ 22] = *(a08 + 22);
+		b[ 23] = *(a08 + 23);
+		b[ 24] = *(a08 + 24);
+		b[ 25] = *(a08 + 25);
+		b[ 26] = *(a08 + 26);
+		b[ 27] = *(a08 + 27);
+		b[ 28] = *(a08 + 28);
+		b[ 29] = *(a08 + 29);
+		b[ 30] = *(a08 + 30);
+		b[ 31] = *(a08 + 31);
+	      b += 32;
+	    }
+
+		if (i >= 9) {
+	      b[  0] = ZERO;
+	      b[  1] = ZERO;
+	      b[  2] = ZERO;
+	      b[  3] = ZERO;
+	      b[  4] = ZERO;
+	      b[  5] = ZERO;
+	      b[  6] = ZERO;
+	      b[  7] = ZERO;
+	      b[  8] = ZERO;
+	      b[  9] = ZERO;
+	      b[ 10] = ZERO;
+	      b[ 11] = ZERO;
+	      b[ 12] = ZERO;
+	      b[ 13] = ZERO;
+	      b[ 14] = ZERO;
+	      b[ 15] = ZERO;
+#ifdef UNIT
+	      b[ 16] = ONE;
+	      b[ 17] = ZERO;
+#else
+	      b[ 16] = *(a09 + 16);
+	      b[ 17] = *(a09 + 17);
+#endif
+	    b[ 18] = *(a09 + 18);
+		b[ 19] = *(a09 + 19);
+		b[ 20] = *(a09 + 20);
+		b[ 21] = *(a09 + 21);
+		b[ 22] = *(a09 + 22);
+		b[ 23] = *(a09 + 23);
+		b[ 24] = *(a09 + 24);
+		b[ 25] = *(a09 + 25);
+		b[ 26] = *(a09 + 26);
+		b[ 27] = *(a09 + 27);
+		b[ 28] = *(a09 + 28);
+		b[ 29] = *(a09 + 29);
+		b[ 30] = *(a09 + 30);
+		b[ 31] = *(a09 + 31);
+	      b += 32;
+	    }
+
+		if (i >= 10) {
+	      b[  0] = ZERO;
+	      b[  1] = ZERO;
+	      b[  2] = ZERO;
+	      b[  3] = ZERO;
+	      b[  4] = ZERO;
+	      b[  5] = ZERO;
+	      b[  6] = ZERO;
+	      b[  7] = ZERO;
+	      b[  8] = ZERO;
+	      b[  9] = ZERO;
+	      b[ 10] = ZERO;
+	      b[ 11] = ZERO;
+	      b[ 12] = ZERO;
+	      b[ 13] = ZERO;
+	      b[ 14] = ZERO;
+	      b[ 15] = ZERO;
+	      b[ 16] = ZERO;
+	      b[ 17] = ZERO;
+#ifdef UNIT
+	      b[ 18] = ONE;
+	      b[ 19] = ZERO;
+#else
+	      b[ 18] = *(a10 + 18);
+	      b[ 19] = *(a10 + 19);
+#endif
+	    b[ 20] = *(a10 + 20);
+		b[ 21] = *(a10 + 21);
+		b[ 22] = *(a10 + 22);
+		b[ 23] = *(a10 + 23);
+		b[ 24] = *(a10 + 24);
+		b[ 25] = *(a10 + 25);
+		b[ 26] = *(a10 + 26);
+		b[ 27] = *(a10 + 27);
+		b[ 28] = *(a10 + 28);
+		b[ 29] = *(a10 + 29);
+		b[ 30] = *(a10 + 30);
+		b[ 31] = *(a10 + 31);
+	      b += 32;
+	    }
+
+		if (i >= 11) {
+	      b[  0] = ZERO;
+	      b[  1] = ZERO;
+	      b[  2] = ZERO;
+	      b[  3] = ZERO;
+	      b[  4] = ZERO;
+	      b[  5] = ZERO;
+	      b[  6] = ZERO;
+	      b[  7] = ZERO;
+	      b[  8] = ZERO;
+	      b[  9] = ZERO;
+	      b[ 10] = ZERO;
+	      b[ 11] = ZERO;
+	      b[ 12] = ZERO;
+	      b[ 13] = ZERO;
+	      b[ 14] = ZERO;
+	      b[ 15] = ZERO;
+	      b[ 16] = ZERO;
+	      b[ 17] = ZERO;
+	      b[ 18] = ZERO;
+	      b[ 19] = ZERO;
+#ifdef UNIT
+	      b[ 20] = ONE;
+	      b[ 21] = ZERO;
+#else
+	      b[ 20] = *(a11 + 20);
+	      b[ 21] = *(a11 + 21);
+#endif
+	    b[ 22] = *(a11 + 22);
+		b[ 23] = *(a11 + 23);
+		b[ 24] = *(a11 + 24);
+		b[ 25] = *(a11 + 25);
+		b[ 26] = *(a11 + 26);
+		b[ 27] = *(a11 + 27);
+		b[ 28] = *(a11 + 28);
+		b[ 29] = *(a11 + 29);
+		b[ 30] = *(a11 + 30);
+		b[ 31] = *(a11 + 31);
+	      b += 32;
+	    }
+
+		if (i >= 12) {
+	      b[  0] = ZERO;
+	      b[  1] = ZERO;
+	      b[  2] = ZERO;
+	      b[  3] = ZERO;
+	      b[  4] = ZERO;
+	      b[  5] = ZERO;
+	      b[  6] = ZERO;
+	      b[  7] = ZERO;
+	      b[  8] = ZERO;
+	      b[  9] = ZERO;
+	      b[ 10] = ZERO;
+	      b[ 11] = ZERO;
+	      b[ 12] = ZERO;
+	      b[ 13] = ZERO;
+	      b[ 14] = ZERO;
+	      b[ 15] = ZERO;
+	      b[ 16] = ZERO;
+	      b[ 17] = ZERO;
+	      b[ 18] = ZERO;
+	      b[ 19] = ZERO;
+	      b[ 20] = ZERO;
+	      b[ 21] = ZERO;
+#ifdef UNIT
+	      b[ 22] = ONE;
+	      b[ 23] = ZERO;
+#else
+	      b[ 22] = *(a12 + 22);
+	      b[ 23] = *(a12 + 23);
+#endif
+	    b[ 24] = *(a12 + 24);
+		b[ 25] = *(a12 + 25);
+		b[ 26] = *(a12 + 26);
+		b[ 27] = *(a12 + 27);
+		b[ 28] = *(a12 + 28);
+		b[ 29] = *(a12 + 29);
+		b[ 30] = *(a12 + 30);
+		b[ 31] = *(a12 + 31);
+	      b += 32;
+	    }
+
+		if (i >= 13) { /* 13th remainder row: 32 values written to b, all sourced from a13 */
+	      b[  0] = ZERO; /* entries 0..23 are zero-filled */
+	      b[  1] = ZERO;
+	      b[  2] = ZERO;
+	      b[  3] = ZERO;
+	      b[  4] = ZERO;
+	      b[  5] = ZERO;
+	      b[  6] = ZERO;
+	      b[  7] = ZERO;
+	      b[  8] = ZERO;
+	      b[  9] = ZERO;
+	      b[ 10] = ZERO;
+	      b[ 11] = ZERO;
+	      b[ 12] = ZERO;
+	      b[ 13] = ZERO;
+	      b[ 14] = ZERO;
+	      b[ 15] = ZERO;
+	      b[ 16] = ZERO;
+	      b[ 17] = ZERO;
+	      b[ 18] = ZERO;
+	      b[ 19] = ZERO;
+	      b[ 20] = ZERO;
+	      b[ 21] = ZERO;
+	      b[ 22] = ZERO;
+	      b[ 23] = ZERO;
+#ifdef UNIT
+	      b[ 24] = ONE;  /* UNIT build: entries 24/25 are forced to (ONE, ZERO) instead of being read */
+	      b[ 25] = ZERO;
+#else
+	      b[ 24] = *(a13 + 24);
+	      b[ 25] = *(a13 + 25);
+#endif
+	      b[ 26] = *(a13 + 26); /* entries 26..31 are copied straight from a13 */
+	      b[ 27] = *(a13 + 27);
+	      b[ 28] = *(a13 + 28);
+	      b[ 29] = *(a13 + 29); /* was a12: wrong source pointer; the full-tile path reads a13 + 29 here */
+	      b[ 30] = *(a13 + 30);
+	      b[ 31] = *(a13 + 31);
+	      b += 32; /* advance past the 32 values just written */
+	    }
+
+		if (i >= 14) {
+	      b[  0] = ZERO;
+	      b[  1] = ZERO;
+	      b[  2] = ZERO;
+	      b[  3] = ZERO;
+	      b[  4] = ZERO;
+	      b[  5] = ZERO;
+	      b[  6] = ZERO;
+	      b[  7] = ZERO;
+	      b[  8] = ZERO;
+	      b[  9] = ZERO;
+	      b[ 10] = ZERO;
+	      b[ 11] = ZERO;
+	      b[ 12] = ZERO;
+	      b[ 13] = ZERO;
+	      b[ 14] = ZERO;
+	      b[ 15] = ZERO;
+	      b[ 16] = ZERO;
+	      b[ 17] = ZERO;
+	      b[ 18] = ZERO;
+	      b[ 19] = ZERO;
+	      b[ 20] = ZERO;
+	      b[ 21] = ZERO;
+	      b[ 22] = ZERO;
+	      b[ 23] = ZERO;
+	      b[ 24] = ZERO;
+	      b[ 25] = ZERO;
+#ifdef UNIT
+	      b[ 26] = ONE;
+	      b[ 27] = ZERO;
+#else
+	      b[ 26] = *(a14 + 26);
+	      b[ 27] = *(a14 + 27);
+#endif
+	    b[ 28] = *(a14 + 28);
+		b[ 29] = *(a14 + 29);
+		b[ 30] = *(a14 + 30);
+		b[ 31] = *(a14 + 31);
+	      b += 32;
+	    }
+
+		if (i >= 15) {
+	      b[  0] = ZERO;
+	      b[  1] = ZERO;
+	      b[  2] = ZERO;
+	      b[  3] = ZERO;
+	      b[  4] = ZERO;
+	      b[  5] = ZERO;
+	      b[  6] = ZERO;
+	      b[  7] = ZERO;
+	      b[  8] = ZERO;
+	      b[  9] = ZERO;
+	      b[ 10] = ZERO;
+	      b[ 11] = ZERO;
+	      b[ 12] = ZERO;
+	      b[ 13] = ZERO;
+	      b[ 14] = ZERO;
+	      b[ 15] = ZERO;
+	      b[ 16] = ZERO;
+	      b[ 17] = ZERO;
+	      b[ 18] = ZERO;
+	      b[ 19] = ZERO;
+	      b[ 20] = ZERO;
+	      b[ 21] = ZERO;
+	      b[ 22] = ZERO;
+	      b[ 23] = ZERO;
+	      b[ 24] = ZERO;
+	      b[ 25] = ZERO;
+	      b[ 26] = ZERO;
+	      b[ 27] = ZERO;
+#ifdef UNIT
+	      b[ 28] = ONE;
+	      b[ 29] = ZERO;
+#else
+	      b[ 28] = *(a15 + 28);
+	      b[ 29] = *(a15 + 29);
+#endif
+	      b[ 30] = *(a15 + 30);
+		  b[ 31] = *(a15 + 31);
+	      b += 32;
+	    }
+	  }
+      }
+
+      posY += 16;
+      js --;
+    } while (js > 0);
+  } /* End of main loop */
+
+
+  if (n & 8){
+      X = posX;
+
+      if (posX <= posY) {
+	a01 = a + posY * 2 + (posX +  0) * lda;
+	a02 = a + posY * 2 + (posX +  1) * lda;
+	a03 = a + posY * 2 + (posX +  2) * lda;
+	a04 = a + posY * 2 + (posX +  3) * lda;
+	a05 = a + posY * 2 + (posX +  4) * lda;
+	a06 = a + posY * 2 + (posX +  5) * lda;
+	a07 = a + posY * 2 + (posX +  6) * lda;
+	a08 = a + posY * 2 + (posX +  7) * lda;
+      } else {
+	a01 = a + posX * 2 + (posY +  0) * lda;
+	a02 = a + posX * 2 + (posY +  1) * lda;
+	a03 = a + posX * 2 + (posY +  2) * lda;
+	a04 = a + posX * 2 + (posY +  3) * lda;
+	a05 = a + posX * 2 + (posY +  4) * lda;
+	a06 = a + posX * 2 + (posY +  5) * lda;
+	a07 = a + posX * 2 + (posY +  6) * lda;
+	a08 = a + posX * 2 + (posY +  7) * lda;
+      }
+
+      i = (m >> 3);
+      if (i > 0) {
+	do {
+	  if (X > posY) {
+	    a01 += 16;
+	    a02 += 16;
+	    a03 += 16;
+	    a04 += 16;
+	    a05 += 16;
+	    a06 += 16;
+	    a07 += 16;
+	    a08 += 16;
+	    b += 128;
+	  } else
+	    if (X < posY) {
+	      for (ii = 0; ii < 8; ii++){
+
+		b[  0] = *(a01 +  0);
+		b[  1] = *(a01 +  1);
+		b[  2] = *(a01 +  2);
+		b[  3] = *(a01 +  3);
+		b[  4] = *(a01 +  4);
+		b[  5] = *(a01 +  5);
+		b[  6] = *(a01 +  6);
+		b[  7] = *(a01 +  7);
+
+		b[  8] = *(a01 +  8);
+		b[  9] = *(a01 +  9);
+		b[ 10] = *(a01 + 10);
+		b[ 11] = *(a01 + 11);
+		b[ 12] = *(a01 + 12);
+		b[ 13] = *(a01 + 13);
+		b[ 14] = *(a01 + 14);
+		b[ 15] = *(a01 + 15);
+
+		a01 += lda;
+		b += 16;
+	      }
+		  a02 += 8 * lda;
+	      a03 += 8 * lda;
+	      a04 += 8 * lda;
+	      a05 += 8 * lda;
+	      a06 += 8 * lda;
+	      a07 += 8 * lda;
+	      a08 += 8 * lda;
+	    } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = *(a01 +  2);
+	      b[  3] = *(a01 +  3);
+	      b[  4] = *(a01 +  4);
+	      b[  5] = *(a01 +  5);
+	      b[  6] = *(a01 +  6);
+	      b[  7] = *(a01 +  7);
+	      b[  8] = *(a01 +  8);
+	      b[  9] = *(a01 +  9);
+	      b[ 10] = *(a01 + 10);
+	      b[ 11] = *(a01 + 11);
+	      b[ 12] = *(a01 + 12);
+	      b[ 13] = *(a01 + 13);
+	      b[ 14] = *(a01 + 14);
+	      b[ 15] = *(a01 + 15);
+
+	      b[ 16] = ZERO;
+	      b[ 17] = ZERO;
+#ifdef UNIT
+	      b[ 18] = ONE;
+	      b[ 19] = ZERO;
+#else
+	      b[ 18] = *(a02 +  2);
+	      b[ 19] = *(a02 +  3);
+#endif
+	      b[ 20] = *(a02 +  4);
+	      b[ 21] = *(a02 +  5);
+	      b[ 22] = *(a02 +  6);
+	      b[ 23] = *(a02 +  7);
+	      b[ 24] = *(a02 +  8);
+	      b[ 25] = *(a02 +  9);
+	      b[ 26] = *(a02 + 10);
+	      b[ 27] = *(a02 + 11);
+	      b[ 28] = *(a02 + 12);
+	      b[ 29] = *(a02 + 13);
+	      b[ 30] = *(a02 + 14);
+	      b[ 31] = *(a02 + 15);
+
+	      b[ 32] = ZERO;
+	      b[ 33] = ZERO;
+	      b[ 34] = ZERO;
+	      b[ 35] = ZERO;
+#ifdef UNIT
+	      b[ 36] = ONE;
+	      b[ 37] = ZERO;
+#else
+	      b[ 36] = *(a03 +  4);
+	      b[ 37] = *(a03 +  5);
+#endif
+	      b[ 38] = *(a03 +  6);
+	      b[ 39] = *(a03 +  7);
+	      b[ 40] = *(a03 +  8);
+	      b[ 41] = *(a03 +  9);
+	      b[ 42] = *(a03 + 10);
+	      b[ 43] = *(a03 + 11);
+	      b[ 44] = *(a03 + 12);
+	      b[ 45] = *(a03 + 13);
+	      b[ 46] = *(a03 + 14);
+	      b[ 47] = *(a03 + 15);
+
+	      b[ 48] = ZERO;
+	      b[ 49] = ZERO;
+	      b[ 50] = ZERO;
+	      b[ 51] = ZERO;
+	      b[ 52] = ZERO;
+	      b[ 53] = ZERO;
+#ifdef UNIT
+	      b[ 54] = ONE;
+	      b[ 55] = ZERO;
+#else
+	      b[ 54] = *(a04 +  6);
+	      b[ 55] = *(a04 +  7);
+#endif
+	      b[ 56] = *(a04 +  8);
+	      b[ 57] = *(a04 +  9);
+	      b[ 58] = *(a04 + 10);
+	      b[ 59] = *(a04 + 11);
+	      b[ 60] = *(a04 + 12);
+	      b[ 61] = *(a04 + 13);
+	      b[ 62] = *(a04 + 14);
+	      b[ 63] = *(a04 + 15);
+
+	      b[ 64] = ZERO;
+	      b[ 65] = ZERO;
+	      b[ 66] = ZERO;
+	      b[ 67] = ZERO;
+	      b[ 68] = ZERO;
+	      b[ 69] = ZERO;
+	      b[ 70] = ZERO;
+	      b[ 71] = ZERO;
+#ifdef UNIT
+	      b[ 72] = ONE;
+	      b[ 73] = ZERO;
+#else
+	      b[ 72] = *(a05 +  8);
+	      b[ 73] = *(a05 +  9);
+#endif
+	      b[ 74] = *(a05 + 10);
+	      b[ 75] = *(a05 + 11);
+	      b[ 76] = *(a05 + 12);
+	      b[ 77] = *(a05 + 13);
+	      b[ 78] = *(a05 + 14);
+	      b[ 79] = *(a05 + 15);
+
+	      b[ 80] = ZERO;
+	      b[ 81] = ZERO;
+	      b[ 82] = ZERO;
+	      b[ 83] = ZERO;
+	      b[ 84] = ZERO;
+	      b[ 85] = ZERO;
+	      b[ 86] = ZERO;
+	      b[ 87] = ZERO;
+	      b[ 88] = ZERO;
+	      b[ 89] = ZERO;
+#ifdef UNIT
+	      b[ 90] = ONE;
+	      b[ 91] = ZERO;
+#else
+	      b[ 90] = *(a06 + 10);
+	      b[ 91] = *(a06 + 11);
+#endif
+	      b[ 92] = *(a06 + 12);
+	      b[ 93] = *(a06 + 13);
+	      b[ 94] = *(a06 + 14);
+	      b[ 95] = *(a06 + 15);
+
+	      b[ 96] = ZERO;
+	      b[ 97] = ZERO;
+	      b[ 98] = ZERO;
+	      b[ 99] = ZERO;
+	      b[100] = ZERO;
+	      b[101] = ZERO;
+	      b[102] = ZERO;
+	      b[103] = ZERO;
+	      b[104] = ZERO;
+	      b[105] = ZERO;
+	      b[106] = ZERO;
+	      b[107] = ZERO;
+#ifdef UNIT
+	      b[108] = ONE;
+	      b[109] = ZERO;
+#else
+	      b[108] = *(a07 + 12);
+	      b[109] = *(a07 + 13);
+#endif
+	      b[110] = *(a07 + 14);
+	      b[111] = *(a07 + 15);
+
+	      b[112] = ZERO;
+	      b[113] = ZERO;
+	      b[114] = ZERO;
+	      b[115] = ZERO;
+	      b[116] = ZERO;
+	      b[117] = ZERO;
+	      b[118] = ZERO;
+	      b[119] = ZERO;
+	      b[120] = ZERO;
+	      b[121] = ZERO;
+	      b[122] = ZERO;
+	      b[123] = ZERO;
+	      b[124] = ZERO;
+	      b[125] = ZERO;
+#ifdef UNIT
+	      b[126] = ONE;
+	      b[127] = ZERO;
+#else
+	      b[126] = *(a08 + 14);
+	      b[127] = *(a08 + 15);
+#endif
+
+	      a01 += 16;
+	      a02 += 16;
+	      a03 += 16;
+	      a04 += 16;
+	      a05 += 16;
+	      a06 += 16;
+	      a07 += 16;
+	      a08 += 16;
+	      b += 128;
+	    }
+
+	  X += 8;
+	  i --;
+	} while (i > 0);
+      }
+
+      i = (m & 7);
+      if (i) {
+
+	if (X > posY) {
+	  /* a01 += 2 * i;
+	  a02 += 2 * i;
+	  a03 += 2 * i;
+	  a04 += 2 * i;
+	  a05 += 2 * i;
+	  a06 += 2 * i;
+	  a07 += 2 * i;
+	  a08 += 2 * i; */
+	  b += 16 * i;
+	} else
+	  if (X < posY) {
+	      for (ii = 0; ii < i; ii++){
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+	      b[  2] = *(a01 +  2);
+	      b[  3] = *(a01 +  3);
+	      b[  4] = *(a01 +  4);
+	      b[  5] = *(a01 +  5);
+	      b[  6] = *(a01 +  6);
+	      b[  7] = *(a01 +  7);
+
+	      b[  8] = *(a01 +  8);
+	      b[  9] = *(a01 +  9);
+	      b[ 10] = *(a01 + 10);
+	      b[ 11] = *(a01 + 11);
+	      b[ 12] = *(a01 + 12);
+	      b[ 13] = *(a01 + 13);
+	      b[ 14] = *(a01 + 14);
+	      b[ 15] = *(a01 + 15);
+
+	      a01 += lda;
+	      a02 += lda;
+	      a03 += lda;
+	      a04 += lda;
+	      a05 += lda;
+	      a06 += lda;
+	      a07 += lda;
+	      a08 += lda;
+	      b += 16;
+	    }
+	  } else {
+#ifdef UNIT
+	    b[  0] = ONE;
+	    b[  1] = ZERO;
+#else
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+#endif
+	    b[  2] = *(a01 +  2);
+	    b[  3] = *(a01 +  3);
+	    b[  4] = *(a01 +  4);
+	    b[  5] = *(a01 +  5);
+	    b[  6] = *(a01 +  6);
+	    b[  7] = *(a01 +  7);
+
+	    b[  8] = *(a01 +  8);
+	    b[  9] = *(a01 +  9);
+	    b[ 10] = *(a01 + 10);
+	    b[ 11] = *(a01 + 11);
+	    b[ 12] = *(a01 + 12);
+	    b[ 13] = *(a01 + 13);
+	    b[ 14] = *(a01 + 14);
+	    b[ 15] = *(a01 + 15);
+	    b += 16;
+
+	    if (i >= 2) {
+	      b[ 0] = ZERO;
+	      b[ 1] = ZERO;
+#ifdef UNIT
+	      b[ 2] = ONE;
+	      b[ 3] = ZERO;
+#else
+	      b[ 2] = *(a02 +  2);
+	      b[ 3] = *(a02 +  3);
+#endif
+	      b[ 4] = *(a02 +  4);
+	      b[ 5] = *(a02 +  5);
+	      b[ 6] = *(a02 +  6);
+	      b[ 7] = *(a02 +  7);
+
+	      b[ 8] = *(a02 +  8);
+	      b[ 9] = *(a02 +  9);
+	      b[10] = *(a02 + 10);
+	      b[11] = *(a02 + 11);
+	      b[12] = *(a02 + 12);
+	      b[13] = *(a02 + 13);
+	      b[14] = *(a02 + 14);
+	      b[15] = *(a02 + 15);
+	      b += 16;
+	    }
+
+	    if (i >= 3) {
+	      b[ 0] = ZERO;
+	      b[ 1] = ZERO;
+	      b[ 2] = ZERO;
+	      b[ 3] = ZERO;
+#ifdef UNIT
+	      b[ 4] = ONE;
+	      b[ 5] = ZERO;
+#else
+	      b[ 4] = *(a03 +  4);
+	      b[ 5] = *(a03 +  5);
+#endif
+	      b[ 6] = *(a03 +  6);
+	      b[ 7] = *(a03 +  7);
+
+	      b[ 8] = *(a03 +  8);
+	      b[ 9] = *(a03 +  9);
+	      b[10] = *(a03 + 10);
+	      b[11] = *(a03 + 11);
+	      b[12] = *(a03 + 12);
+	      b[13] = *(a03 + 13);
+	      b[14] = *(a03 + 14);
+	      b[15] = *(a03 + 15);
+	      b += 16;
+	    }
+
+	    if (i >= 4) {
+	      b[ 0] = ZERO;
+	      b[ 1] = ZERO;
+	      b[ 2] = ZERO;
+	      b[ 3] = ZERO;
+	      b[ 4] = ZERO;
+	      b[ 5] = ZERO;
+#ifdef UNIT
+	      b[ 6] = ONE;
+	      b[ 7] = ZERO;
+#else
+	      b[ 6] = *(a04 +  6);
+	      b[ 7] = *(a04 +  7);
+#endif
+
+	      b[ 8] = *(a04 +  8);
+	      b[ 9] = *(a04 +  9);
+	      b[10] = *(a04 + 10);
+	      b[11] = *(a04 + 11);
+	      b[12] = *(a04 + 12);
+	      b[13] = *(a04 + 13);
+	      b[14] = *(a04 + 14);
+	      b[15] = *(a04 + 15);
+	      b += 16;
+	    }
+
+	    if (i >= 5) {
+	      b[ 0] = ZERO;
+	      b[ 1] = ZERO;
+	      b[ 2] = ZERO;
+	      b[ 3] = ZERO;
+	      b[ 4] = ZERO;
+	      b[ 5] = ZERO;
+	      b[ 6] = ZERO;
+	      b[ 7] = ZERO;
+
+#ifdef UNIT
+	      b[ 8] = ONE;
+	      b[ 9] = ZERO;
+#else
+	      b[ 8] = *(a05 +  8);
+	      b[ 9] = *(a05 +  9);
+#endif
+	      b[10] = *(a05 + 10);
+	      b[11] = *(a05 + 11);
+	      b[12] = *(a05 + 12);
+	      b[13] = *(a05 + 13);
+	      b[14] = *(a05 + 14);
+	      b[15] = *(a05 + 15);
+	      b += 16;
+	    }
+
+	    if (i >= 6) {
+	      b[ 0] = ZERO;
+	      b[ 1] = ZERO;
+	      b[ 2] = ZERO;
+	      b[ 3] = ZERO;
+	      b[ 4] = ZERO;
+	      b[ 5] = ZERO;
+	      b[ 6] = ZERO;
+	      b[ 7] = ZERO;
+
+	      b[ 8] = ZERO;
+	      b[ 9] = ZERO;
+#ifdef UNIT
+	      b[10] = ONE;
+	      b[11] = ZERO;
+#else
+	      b[10] = *(a06 + 10);
+	      b[11] = *(a06 + 11);
+#endif
+	      b[12] = *(a06 + 12);
+	      b[13] = *(a06 + 13);
+	      b[14] = *(a06 + 14);
+	      b[15] = *(a06 + 15);
+	      b += 16;
+	    }
+
+	    if (i >= 7) {
+	      b[ 0] = ZERO;
+	      b[ 1] = ZERO;
+	      b[ 2] = ZERO;
+	      b[ 3] = ZERO;
+	      b[ 4] = ZERO;
+	      b[ 5] = ZERO;
+	      b[ 6] = ZERO;
+	      b[ 7] = ZERO;
+
+	      b[ 8] = ZERO;
+	      b[ 9] = ZERO;
+	      b[10] = ZERO;
+	      b[11] = ZERO;
+#ifdef UNIT
+	      b[12] = ONE;
+	      b[13] = ZERO;
+#else
+	      b[12] = *(a07 + 12);
+	      b[13] = *(a07 + 13);
+#endif
+	      b[14] = *(a07 + 14);
+	      b[15] = *(a07 + 15);
+	      b += 16;
+	    }
+	  }
+      }
+
+      posY += 8;
+  }
+
+
+  if (n & 4){
+    X = posX;
+
+    if (posX <= posY) {
+      a01 = a + posY * 2 + (posX +  0) * lda;
+      a02 = a + posY * 2 + (posX +  1) * lda;
+      a03 = a + posY * 2 + (posX +  2) * lda;
+      a04 = a + posY * 2 + (posX +  3) * lda;
+    } else {
+      a01 = a + posX * 2 + (posY +  0) * lda;
+      a02 = a + posX * 2 + (posY +  1) * lda;
+      a03 = a + posX * 2 + (posY +  2) * lda;
+      a04 = a + posX * 2 + (posY +  3) * lda;
+    }
+
+    i = (m >> 2);
+    if (i > 0) {
+      do {
+	if (X > posY) {
+	  a01 += 8;
+	  a02 += 8;
+	  a03 += 8;
+	  a04 += 8;
+	  b += 32;
+	} else
+	  if (X < posY) {
+	    for (ii = 0; ii < 4; ii++){
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+	      b[  2] = *(a01 +  2);
+	      b[  3] = *(a01 +  3);
+	      b[  4] = *(a01 +  4);
+	      b[  5] = *(a01 +  5);
+	      b[  6] = *(a01 +  6);
+	      b[  7] = *(a01 +  7);
+
+	      a01 += lda;
+	      b += 8;
+	      }
+
+	    a02 += 4 * lda;
+	    a03 += 4 * lda;
+	    a04 += 4 * lda;
+	    } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = *(a01 +  2);
+	      b[  3] = *(a01 +  3);
+	      b[  4] = *(a01 +  4);
+	      b[  5] = *(a01 +  5);
+	      b[  6] = *(a01 +  6);
+	      b[  7] = *(a01 +  7);
+
+	      b[  8] = ZERO;
+	      b[  9] = ZERO;
+#ifdef UNIT
+	      b[ 10] = ONE;
+	      b[ 11] = ZERO;
+#else
+	      b[ 10] = *(a02 +  2);
+	      b[ 11] = *(a02 +  3);
+#endif
+	      b[ 12] = *(a02 +  4);
+	      b[ 13] = *(a02 +  5);
+	      b[ 14] = *(a02 +  6);
+	      b[ 15] = *(a02 +  7);
+
+	      b[ 16] = ZERO;
+	      b[ 17] = ZERO;
+	      b[ 18] = ZERO;
+	      b[ 19] = ZERO;
+#ifdef UNIT
+	      b[ 20] = ONE;
+	      b[ 21] = ZERO;
+#else
+	      b[ 20] = *(a03 +  4);
+	      b[ 21] = *(a03 +  5);
+#endif
+	      b[ 22] = *(a03 +  6);
+	      b[ 23] = *(a03 +  7);
+
+	      b[ 24] = ZERO;
+	      b[ 25] = ZERO;
+	      b[ 26] = ZERO;
+	      b[ 27] = ZERO;
+	      b[ 28] = ZERO;
+	      b[ 29] = ZERO;
+#ifdef UNIT
+	      b[ 30] = ONE;
+	      b[ 31] = ZERO;
+#else
+	      b[ 30] = *(a04 +  6);
+	      b[ 31] = *(a04 +  7);
+#endif
+
+	      a01 += 8;
+	      a02 += 8;
+	      a03 += 8;
+	      a04 += 8;
+	      b += 32;
+	    }
+
+	X += 4;
+	i --;
+      } while (i > 0);
+    }
+
+    i = (m & 3);
+    if (i > 0) {
+      if (X > posY) {
+	/* a01 += 2 * i;
+	a02 += 2 * i;
+	a03 += 2 * i;
+	a04 += 2 * i; */
+	b += 8 * i;
+      } else
+	if (X < posY) {
+	  for (ii = 0; ii < i; ii++){
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+	    b[  2] = *(a01 +  2);
+	    b[  3] = *(a01 +  3);
+	    b[  4] = *(a01 +  4);
+	    b[  5] = *(a01 +  5);
+	    b[  6] = *(a01 +  6);
+	    b[  7] = *(a01 +  7);
+
+	    a01 += lda;
+	    a02 += lda;
+	    a03 += lda;
+	    a04 += lda;
+	    b += 8;
+	  }
+	} else {
+#ifdef UNIT
+	  b[  0] = ONE;
+	  b[  1] = ZERO;
+#else
+	  b[  0] = *(a01 +  0);
+	  b[  1] = *(a01 +  1);
+#endif
+	  b[  2] = *(a01 +  2);
+	  b[  3] = *(a01 +  3);
+	  b[  4] = *(a01 +  4);
+	  b[  5] = *(a01 +  5);
+	  b[  6] = *(a01 +  6);
+	  b[  7] = *(a01 +  7);
+	  b += 8;
+
+	  if (i >= 2) {
+	    b[ 0] = ZERO;
+	    b[ 1] = ZERO;
+#ifdef UNIT
+	    b[ 2] = ONE;
+	    b[ 3] = ZERO;
+#else
+	    b[ 2] = *(a02 +  2);
+	    b[ 3] = *(a02 +  3);
+#endif
+	    b[ 4] = *(a02 +  4);
+	    b[ 5] = *(a02 +  5);
+	    b[ 6] = *(a02 +  6);
+	    b[ 7] = *(a02 +  7);
+	    b += 8;
+	  }
+
+	  if (i >= 3) {
+	    b[ 0] = ZERO;
+	    b[ 1] = ZERO;
+	    b[ 2] = ZERO;
+	    b[ 3] = ZERO;
+#ifdef UNIT
+	    b[ 4] = ONE;
+	    b[ 5] = ZERO;
+#else
+	    b[ 4] = *(a03 +  4);
+	    b[ 5] = *(a03 +  5);
+#endif
+	    b[ 6] = *(a03 +  6);
+	    b[ 7] = *(a03 +  7);
+	    b += 8;
+	  }
+	}
+    }
+    posY += 4;
+  }
+
+  if (n & 2){
+    X = posX;
+
+    if (posX <= posY) {
+      a01 = a + posY * 2 + (posX +  0) * lda;
+      a02 = a + posY * 2 + (posX +  1) * lda;
+    } else {
+      a01 = a + posX * 2 + (posY +  0) * lda;
+      a02 = a + posX * 2 + (posY +  1) * lda;
+    }
+
+    i = (m >> 1);
+    if (i > 0) {
+      do {
+	if (X > posY) {
+	  a01 += 4;
+	  a02 += 4;
+	  b += 8;
+	} else
+	  if (X < posY) {
+	    b[0] = *(a01 +  0);
+	    b[1] = *(a01 +  1);
+	    b[2] = *(a01 +  2);
+	    b[3] = *(a01 +  3);
+	    b[4] = *(a02 +  0);
+	    b[5] = *(a02 +  1);
+	    b[6] = *(a02 +  2);
+	    b[7] = *(a02 +  3);
+	    a01 += 2 * lda;
+	    a02 += 2 * lda;
+	    b += 8;
+	  } else {
+#ifdef UNIT
+	    b[0] = ONE;
+	    b[1] = ZERO;
+#else
+	    b[0] = *(a01 +  0);
+	    b[1] = *(a01 +  1);
+#endif
+	    b[2] = *(a01 +  2);
+	    b[3] = *(a01 +  3);
+
+	    b[4] = ZERO;
+	    b[5] = ZERO;
+#ifdef UNIT
+	    b[6] = ONE;
+	    b[7] = ZERO;
+#else
+	    b[6] = *(a02 +  2);
+	    b[7] = *(a02 +  3);
+#endif
+	    a01 += 4;
+	    a02 += 4;
+	    b += 8;
+	  }
+
+	X += 2;
+	i --;
+      } while (i > 0);
+    }
+
+    i = (m & 1);
+    if (i > 0) {
+      if (X > posY) {
+	/* a01 += 2;
+	a02 += 2; */
+	b += 4;
+      } else
+	if (X < posY) {
+	  b[  0] = *(a01 +  0);
+	  b[  1] = *(a01 +  1);
+	  b[  2] = *(a01 +  2);
+	  b[  3] = *(a01 +  3);
+
+	  /* a01 += lda;
+	  a02 += lda; */
+	  b += 4;
+	} else {
+#ifdef UNIT
+	  b[  0] = ONE;
+	  b[  1] = ZERO;
+#else
+	  b[  0] = *(a01 +  0);
+	  b[  1] = *(a01 +  1);
+#endif
+	  b[  2] = *(a01 +  2);
+	  b[  3] = *(a01 +  3);
+	  b += 4;
+	}
+    }
+    posY += 2;
+  }
+
+  if (n & 1){
+    X = posX;
+
+    if (posX <= posY) {
+      a01 = a + posY * 2 + (posX +  0) * lda;
+    } else {
+      a01 = a + posX * 2 + (posY +  0) * lda;
+    }
+
+    i = m;
+    if (i > 0) {
+      do {
+
+	if (X > posY) {
+	  a01 += 2;
+	  b += 2;
+	} else
+	  if (X < posY) {
+	    b[0] = *(a01 + 0);
+	    b[1] = *(a01 + 1);
+	    a01 += lda;
+	    b += 2;
+	  } else {
+#ifdef UNIT
+	    b[0] = ONE;
+	    b[1] = ZERO;
+#else
+	    b[0] = *(a01 + 0);
+	    b[1] = *(a01 + 1);
+#endif
+	    a01 += 2;
+	    b += 2;
+	  }
+
+	X += 1;
+	i --;
+      } while (i > 0);
+    }
+    // posY += 1;
+  }
+
+  return 0;
+}
diff --git a/kernel/generic/ztrmm_uncopy_16.c b/kernel/generic/ztrmm_uncopy_16.c
new file mode 100644
index 000000000..40b85db38
--- /dev/null
+++ b/kernel/generic/ztrmm_uncopy_16.c
@@ -0,0 +1,2316 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+  BLASLONG i, js;
+  BLASLONG X, ii;
+
+  FLOAT *a01, *a02, *a03, *a04, *a05, *a06, *a07, *a08;
+  FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16;
+
+  lda += lda;
+
+  js = (n >> 4);
+
+  if (js > 0){
+    do {
+      X = posX;
+
+      if (posX <= posY) {
+	a01 = a + posX * 2 + (posY + 0) * lda;
+	a02 = a + posX * 2 + (posY + 1) * lda;
+	a03 = a + posX * 2 + (posY + 2) * lda;
+	a04 = a + posX * 2 + (posY + 3) * lda;
+	a05 = a + posX * 2 + (posY + 4) * lda;
+	a06 = a + posX * 2 + (posY + 5) * lda;
+	a07 = a + posX * 2 + (posY + 6) * lda;
+	a08 = a + posX * 2 + (posY + 7) * lda;
+	a09 = a + posX * 2 + (posY + 8) * lda;
+	a10 = a + posX * 2 + (posY + 9) * lda;
+	a11 = a + posX * 2 + (posY + 10) * lda;
+	a12 = a + posX * 2 + (posY + 11) * lda;
+	a13 = a + posX * 2 + (posY + 12) * lda;
+	a14 = a + posX * 2 + (posY + 13) * lda;
+	a15 = a + posX * 2 + (posY + 14) * lda;
+	a16 = a + posX * 2 + (posY + 15) * lda;
+      } else {
+	a01 = a + posY * 2 + (posX + 0) * lda;
+	a02 = a + posY * 2 + (posX + 1) * lda;
+	a03 = a + posY * 2 + (posX + 2) * lda;
+	a04 = a + posY * 2 + (posX + 3) * lda;
+	a05 = a + posY * 2 + (posX + 4) * lda;
+	a06 = a + posY * 2 + (posX + 5) * lda;
+	a07 = a + posY * 2 + (posX + 6) * lda;
+	a08 = a + posY * 2 + (posX + 7) * lda;
+	a09 = a + posY * 2 + (posX + 8) * lda;
+	a10 = a + posY * 2 + (posX + 9) * lda;
+	a11 = a + posY * 2 + (posX + 10) * lda;
+	a12 = a + posY * 2 + (posX + 11) * lda;
+	a13 = a + posY * 2 + (posX + 12) * lda;
+	a14 = a + posY * 2 + (posX + 13) * lda;
+	a15 = a + posY * 2 + (posX + 14) * lda;
+	a16 = a + posY * 2 + (posX + 15) * lda;
+      }
+
+      i = (m >> 4);
+      if (i > 0) {
+	do {
+	  if (X < posY) {
+	    for (ii = 0; ii < 16; ii++){
+
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+	      b[  2] = *(a02 +  0);
+	      b[  3] = *(a02 +  1);
+	      b[  4] = *(a03 +  0);
+	      b[  5] = *(a03 +  1);
+	      b[  6] = *(a04 +  0);
+	      b[  7] = *(a04 +  1);
+
+	      b[  8] = *(a05 +  0);
+	      b[  9] = *(a05 +  1);
+	      b[ 10] = *(a06 +  0);
+	      b[ 11] = *(a06 +  1);
+	      b[ 12] = *(a07 +  0);
+	      b[ 13] = *(a07 +  1);
+	      b[ 14] = *(a08 +  0);
+	      b[ 15] = *(a08 +  1);
+
+		  b[ 16] = *(a09 +  0);
+	      b[ 17] = *(a09 +  1);
+	      b[ 18] = *(a10 +  0);
+	      b[ 19] = *(a10 +  1);
+	      b[ 20] = *(a11 +  0);
+	      b[ 21] = *(a11 +  1);
+	      b[ 22] = *(a12 +  0);
+	      b[ 23] = *(a12 +  1);
+
+	      b[ 24] = *(a13 +  0);
+	      b[ 25] = *(a13 +  1);
+	      b[ 26] = *(a14 +  0);
+	      b[ 27] = *(a14 +  1);
+	      b[ 28] = *(a15 +  0);
+	      b[ 29] = *(a15 +  1);
+	      b[ 30] = *(a16 +  0);
+	      b[ 31] = *(a16 +  1);
+
+	      a01 += 2;
+	      a02 += 2;
+	      a03 += 2;
+	      a04 += 2;
+	      a05 += 2;
+	      a06 += 2;
+	      a07 += 2;
+	      a08 += 2;
+		  a09 += 2;
+	      a10 += 2;
+	      a11 += 2;
+	      a12 += 2;
+	      a13 += 2;
+	      a14 += 2;
+	      a15 += 2;
+	      a16 += 2;
+	      b += 32;
+	    }
+	  } else
+	    if (X > posY) {
+	      a01 += 16 * lda;
+	      a02 += 16 * lda;
+	      a03 += 16 * lda;
+	      a04 += 16 * lda;
+	      a05 += 16 * lda;
+	      a06 += 16 * lda;
+	      a07 += 16 * lda;
+	      a08 += 16 * lda;
+		  a09 += 16 * lda;
+	      a10 += 16 * lda;
+	      a11 += 16 * lda;
+	      a12 += 16 * lda;
+	      a13 += 16 * lda;
+	      a14 += 16 * lda;
+	      a15 += 16 * lda;
+	      a16 += 16 * lda;
+
+	      b += 512;
+
+	    } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = *(a02 +  0);
+	      b[  3] = *(a02 +  1);
+	      b[  4] = *(a03 +  0);
+	      b[  5] = *(a03 +  1);
+	      b[  6] = *(a04 +  0);
+	      b[  7] = *(a04 +  1);
+	      b[  8] = *(a05 +  0);
+	      b[  9] = *(a05 +  1);
+	      b[ 10] = *(a06 +  0);
+	      b[ 11] = *(a06 +  1);
+	      b[ 12] = *(a07 +  0);
+	      b[ 13] = *(a07 +  1);
+	      b[ 14] = *(a08 +  0);
+	      b[ 15] = *(a08 +  1);
+		  b[ 16] = *(a09 +  0);
+	      b[ 17] = *(a09 +  1);
+	      b[ 18] = *(a10 +  0);
+	      b[ 19] = *(a10 +  1);
+	      b[ 20] = *(a11 +  0);
+	      b[ 21] = *(a11 +  1);
+	      b[ 22] = *(a12 +  0);
+	      b[ 23] = *(a12 +  1);
+	      b[ 24] = *(a13 +  0);
+	      b[ 25] = *(a13 +  1);
+	      b[ 26] = *(a14 +  0);
+	      b[ 27] = *(a14 +  1);
+	      b[ 28] = *(a15 +  0);
+	      b[ 29] = *(a15 +  1);
+		  b[ 30] = *(a16 +  0);
+	      b[ 31] = *(a16 +  1);
+
+	      b[ 32] = ZERO;
+	      b[ 33] = ZERO;
+#ifdef UNIT
+	      b[ 34] = ONE;
+	      b[ 35] = ZERO;
+#else
+	      b[ 34] = *(a02 +  2);
+	      b[ 35] = *(a02 +  3);
+#endif
+	      b[ 36] = *(a03 +  2);
+	      b[ 37] = *(a03 +  3);
+	      b[ 38] = *(a04 +  2);
+	      b[ 39] = *(a04 +  3);
+	      b[ 40] = *(a05 +  2);
+	      b[ 41] = *(a05 +  3);
+	      b[ 42] = *(a06 +  2);
+	      b[ 43] = *(a06 +  3);
+	      b[ 44] = *(a07 +  2);
+	      b[ 45] = *(a07 +  3);
+	      b[ 46] = *(a08 +  2);
+	      b[ 47] = *(a08 +  3);
+	      b[ 48] = *(a09 +  2);
+	      b[ 49] = *(a09 +  3);
+	      b[ 50] = *(a10 +  2);
+	      b[ 51] = *(a10 +  3);
+	      b[ 52] = *(a11 +  2);
+	      b[ 53] = *(a11 +  3);
+	      b[ 54] = *(a12 +  2);
+	      b[ 55] = *(a12 +  3);
+	      b[ 56] = *(a13 +  2);
+	      b[ 57] = *(a13 +  3);
+	      b[ 58] = *(a14 +  2);
+	      b[ 59] = *(a14 +  3);
+	      b[ 60] = *(a15 +  2);
+	      b[ 61] = *(a15 +  3);
+	      b[ 62] = *(a16 +  2);
+	      b[ 63] = *(a16 +  3);
+
+	      b[ 64] = ZERO;
+	      b[ 65] = ZERO;
+	      b[ 66] = ZERO;
+	      b[ 67] = ZERO;
+#ifdef UNIT
+	      b[ 68] = ONE;
+	      b[ 69] = ZERO;
+#else
+	      b[ 68] = *(a03 +  4);
+	      b[ 69] = *(a03 +  5);
+#endif
+	      b[ 70] = *(a04 +  4);
+	      b[ 71] = *(a04 +  5);
+	      b[ 72] = *(a05 +  4);
+	      b[ 73] = *(a05 +  5);
+	      b[ 74] = *(a06 +  4);
+	      b[ 75] = *(a06 +  5);
+	      b[ 76] = *(a07 +  4);
+	      b[ 77] = *(a07 +  5);
+	      b[ 78] = *(a08 +  4);
+	      b[ 79] = *(a08 +  5);
+	      b[ 80] = *(a09 +  4);
+	      b[ 81] = *(a09 +  5);
+	      b[ 82] = *(a10 +  4);
+		  b[ 83] = *(a10 +  5);
+	      b[ 84] = *(a11 +  4);
+	      b[ 85] = *(a11 +  5);
+	      b[ 86] = *(a12 +  4);
+	      b[ 87] = *(a12 +  5);
+	      b[ 88] = *(a13 +  4);
+	      b[ 89] = *(a13 +  5);
+	      b[ 90] = *(a14 +  4);
+	      b[ 91] = *(a14 +  5);
+	      b[ 92] = *(a15 +  4);
+	      b[ 93] = *(a15 +  5);
+	      b[ 94] = *(a16 +  4);
+	      b[ 95] = *(a16 +  5);
+
+	      b[ 96] = ZERO;
+	      b[ 97] = ZERO;
+	      b[ 98] = ZERO;
+	      b[ 99] = ZERO;
+	      b[100] = ZERO;
+	      b[101] = ZERO;
+#ifdef UNIT
+	      b[102] = ONE;
+	      b[103] = ZERO;
+#else
+	      b[102] = *(a04 +  6);
+	      b[103] = *(a04 +  7);
+#endif
+	      b[104] = *(a05 +  6);
+	      b[105] = *(a05 +  7);
+	      b[106] = *(a06 +  6);
+	      b[107] = *(a06 +  7);
+	      b[108] = *(a07 +  6);
+	      b[109] = *(a07 +  7);
+	      b[110] = *(a08 +  6);
+	      b[111] = *(a08 +  7);
+	      b[112] = *(a09 +  6);
+	      b[113] = *(a09 +  7);
+	      b[114] = *(a10 +  6);
+	      b[115] = *(a10 +  7);
+		  b[116] = *(a11 +  6);
+	      b[117] = *(a11 +  7);
+	      b[118] = *(a12 +  6);
+	      b[119] = *(a12 +  7);
+	      b[120] = *(a13 +  6);
+	      b[121] = *(a13 +  7);
+	      b[122] = *(a14 +  6);
+	      b[123] = *(a14 +  7);
+	      b[124] = *(a15 +  6);
+	      b[125] = *(a15 +  7);
+	      b[126] = *(a16 +  6);
+	      b[127] = *(a16 +  7);
+
+	      b[128] = ZERO;
+	      b[129] = ZERO;
+	      b[130] = ZERO;
+	      b[131] = ZERO;
+	      b[132] = ZERO;
+	      b[133] = ZERO;
+	      b[134] = ZERO;
+	      b[135] = ZERO;
+#ifdef UNIT
+	      b[136] = ONE;
+	      b[137] = ZERO;
+#else
+	      b[136] = *(a05 +  8);
+	      b[137] = *(a05 +  9);
+#endif
+	      b[138] = *(a06 +  8);
+	      b[139] = *(a06 +  9);
+	      b[140] = *(a07 +  8);
+	      b[141] = *(a07 +  9);
+	      b[142] = *(a08 +  8);
+	      b[143] = *(a08 +  9);
+	      b[144] = *(a09 +  8);
+	      b[145] = *(a09 +  9);
+	      b[146] = *(a10 +  8);
+	      b[147] = *(a10 +  9);
+	      b[148] = *(a11 +  8);
+		  b[149] = *(a11 +  9);
+	      b[150] = *(a12 +  8);
+	      b[151] = *(a12 +  9);
+	      b[152] = *(a13 +  8);
+	      b[153] = *(a13 +  9);
+	      b[154] = *(a14 +  8);
+	      b[155] = *(a14 +  9);
+	      b[156] = *(a15 +  8);
+	      b[157] = *(a15 +  9);
+	      b[158] = *(a16 +  8);
+	      b[159] = *(a16 +  9);
+
+	      b[160] = ZERO;
+	      b[161] = ZERO;
+	      b[162] = ZERO;
+	      b[163] = ZERO;
+	      b[164] = ZERO;
+	      b[165] = ZERO;
+	      b[166] = ZERO;
+	      b[167] = ZERO;
+	      b[168] = ZERO;
+	      b[169] = ZERO;
+#ifdef UNIT
+	      b[170] = ONE;
+	      b[171] = ZERO;
+#else
+	      b[170] = *(a06 + 10);
+	      b[171] = *(a06 + 11);
+#endif
+	      b[172] = *(a07 + 10);
+	      b[173] = *(a07 + 11);
+	      b[174] = *(a08 + 10);
+	      b[175] = *(a08 + 11);
+	      b[176] = *(a09 + 10);
+	      b[177] = *(a09 + 11);
+	      b[178] = *(a10 + 10);
+	      b[179] = *(a10 + 11);
+	      b[180] = *(a11 + 10);
+	      b[181] = *(a11 + 11);
+		  b[182] = *(a12 + 10);
+	      b[183] = *(a12 + 11);
+	      b[184] = *(a13 + 10);
+	      b[185] = *(a13 + 11);
+	      b[186] = *(a14 + 10);
+	      b[187] = *(a14 + 11);
+	      b[188] = *(a15 + 10);
+	      b[189] = *(a15 + 11);
+	      b[190] = *(a16 + 10);
+	      b[191] = *(a16 + 11);
+
+	      b[192] = ZERO;
+	      b[193] = ZERO;
+	      b[194] = ZERO;
+	      b[195] = ZERO;
+	      b[196] = ZERO;
+	      b[197] = ZERO;
+	      b[198] = ZERO;
+	      b[199] = ZERO;
+	      b[200] = ZERO;
+	      b[201] = ZERO;
+	      b[202] = ZERO;
+	      b[203] = ZERO;
+#ifdef UNIT
+	      b[204] = ONE;
+	      b[205] = ZERO;
+#else
+	      b[204] = *(a07 + 12);
+	      b[205] = *(a07 + 13);
+#endif
+	      b[206] = *(a08 + 12);
+	      b[207] = *(a08 + 13);
+	      b[208] = *(a09 + 12);
+	      b[209] = *(a09 + 13);
+	      b[210] = *(a10 + 12);
+	      b[211] = *(a10 + 13);
+	      b[212] = *(a11 + 12);
+	      b[213] = *(a11 + 13);
+	      b[214] = *(a12 + 12);
+		  b[215] = *(a12 + 13);
+	      b[216] = *(a13 + 12);
+	      b[217] = *(a13 + 13);
+	      b[218] = *(a14 + 12);
+	      b[219] = *(a14 + 13);
+	      b[220] = *(a15 + 12);
+	      b[221] = *(a15 + 13);
+	      b[222] = *(a16 + 12);
+	      b[223] = *(a16 + 13);
+
+	      b[224] = ZERO;
+	      b[225] = ZERO;
+	      b[226] = ZERO;
+	      b[227] = ZERO;
+	      b[228] = ZERO;
+	      b[229] = ZERO;
+	      b[230] = ZERO;
+	      b[231] = ZERO;
+	      b[232] = ZERO;
+	      b[233] = ZERO;
+	      b[234] = ZERO;
+	      b[235] = ZERO;
+	      b[236] = ZERO;
+	      b[237] = ZERO;
+#ifdef UNIT
+	      b[238] = ONE;
+	      b[239] = ZERO;
+#else
+	      b[238] = *(a08 + 14);
+	      b[239] = *(a08 + 15);
+#endif
+		  b[240] = *(a09 + 14);
+	      b[241] = *(a09 + 15);
+	      b[242] = *(a10 + 14);
+	      b[243] = *(a10 + 15);
+	      b[244] = *(a11 + 14);
+	      b[245] = *(a11 + 15);
+	      b[246] = *(a12 + 14);
+	      b[247] = *(a12 + 15);
+		  b[248] = *(a13 + 14);
+	      b[249] = *(a13 + 15);
+	      b[250] = *(a14 + 14);
+	      b[251] = *(a14 + 15);
+	      b[252] = *(a15 + 14);
+	      b[253] = *(a15 + 15);
+	      b[254] = *(a16 + 14);
+	      b[255] = *(a16 + 15);
+
+	      b[256] = ZERO;
+	      b[257] = ZERO;
+	      b[258] = ZERO;
+	      b[259] = ZERO;
+	      b[260] = ZERO;
+	      b[261] = ZERO;
+	      b[262] = ZERO;
+	      b[263] = ZERO;
+	      b[264] = ZERO;
+	      b[265] = ZERO;
+	      b[266] = ZERO;
+	      b[267] = ZERO;
+	      b[268] = ZERO;
+	      b[269] = ZERO;
+	      b[270] = ZERO;
+	      b[271] = ZERO;
+#ifdef UNIT
+	      b[272] = ONE;
+		  b[273] = ZERO;
+#else
+	      b[272] = *(a09 + 16);
+		  b[273] = *(a09 + 17);
+#endif
+		  b[274] = *(a10 + 16);
+	      b[275] = *(a10 + 17);
+	      b[276] = *(a11 + 16);
+	      b[277] = *(a11 + 17);
+	      b[278] = *(a12 + 16);
+	      b[279] = *(a12 + 17);
+	      b[280] = *(a13 + 16);
+		  b[281] = *(a13 + 17);
+	      b[282] = *(a14 + 16);
+	      b[283] = *(a14 + 17);
+	      b[284] = *(a15 + 16);
+	      b[285] = *(a15 + 17);
+	      b[286] = *(a16 + 16);
+	      b[287] = *(a16 + 17);
+
+		  b[288] = ZERO;
+		  b[289] = ZERO;
+	      b[290] = ZERO;
+	      b[291] = ZERO;
+	      b[292] = ZERO;
+	      b[293] = ZERO;
+	      b[294] = ZERO;
+	      b[295] = ZERO;
+	      b[296] = ZERO;
+	      b[297] = ZERO;
+	      b[298] = ZERO;
+	      b[299] = ZERO;
+	      b[300] = ZERO;
+	      b[301] = ZERO;
+	      b[302] = ZERO;
+	      b[303] = ZERO;
+	      b[304] = ZERO;
+	      b[305] = ZERO;
+#ifdef UNIT
+	      b[306] = ONE;
+		  b[307] = ZERO;
+#else
+	      b[306] = *(a10 + 18);
+		  b[307] = *(a10 + 19);
+#endif
+		  b[308] = *(a11 + 18);
+	      b[309] = *(a11 + 19);
+	      b[310] = *(a12 + 18);
+	      b[311] = *(a12 + 19);
+	      b[312] = *(a13 + 18);
+	      b[313] = *(a13 + 19);
+		  b[314] = *(a14 + 18);
+	      b[315] = *(a14 + 19);
+	      b[316] = *(a15 + 18);
+	      b[317] = *(a15 + 19);
+	      b[318] = *(a16 + 18);
+	      b[319] = *(a16 + 19);
+
+		  b[320] = ZERO;
+		  b[321] = ZERO;
+	      b[322] = ZERO;
+	      b[323] = ZERO;
+	      b[324] = ZERO;
+	      b[325] = ZERO;
+	      b[326] = ZERO;
+	      b[327] = ZERO;
+	      b[328] = ZERO;
+	      b[329] = ZERO;
+	      b[330] = ZERO;
+	      b[331] = ZERO;
+	      b[332] = ZERO;
+	      b[333] = ZERO;
+	      b[334] = ZERO;
+	      b[335] = ZERO;
+	      b[336] = ZERO;
+	      b[337] = ZERO;
+	      b[338] = ZERO;
+	      b[339] = ZERO;
+#ifdef UNIT
+	      b[340] = ONE;
+	      b[341] = ZERO;
+#else
+	      b[340] = *(a11 + 20);
+	      b[341] = *(a11 + 21);
+#endif
+		  b[342] = *(a12 + 20);
+	      b[343] = *(a12 + 21);
+	      b[344] = *(a13 + 20);
+	      b[345] = *(a13 + 21);
+	      b[346] = *(a14 + 20);
+		  b[347] = *(a14 + 21);
+	      b[348] = *(a15 + 20);
+	      b[349] = *(a15 + 21);
+	      b[350] = *(a16 + 20);
+	      b[351] = *(a16 + 21);
+
+		  b[352] = ZERO;
+		  b[353] = ZERO;
+	      b[354] = ZERO;
+	      b[355] = ZERO;
+	      b[356] = ZERO;
+	      b[357] = ZERO;
+	      b[358] = ZERO;
+	      b[359] = ZERO;
+	      b[360] = ZERO;
+	      b[361] = ZERO;
+	      b[362] = ZERO;
+	      b[363] = ZERO;
+	      b[364] = ZERO;
+	      b[365] = ZERO;
+	      b[366] = ZERO;
+	      b[367] = ZERO;
+	      b[368] = ZERO;
+	      b[369] = ZERO;
+	      b[370] = ZERO;
+	      b[371] = ZERO;
+	      b[372] = ZERO;
+	      b[373] = ZERO;
+#ifdef UNIT
+	      b[374] = ONE;
+	      b[375] = ZERO;
+#else
+	      b[374] = *(a12 + 22);
+	      b[375] = *(a12 + 23);
+#endif
+		  b[376] = *(a13 + 22);
+	      b[377] = *(a13 + 23);
+	      b[378] = *(a14 + 22);
+	      b[379] = *(a14 + 23);
+		  b[380] = *(a15 + 22);
+	      b[381] = *(a15 + 23);
+	      b[382] = *(a16 + 22);
+	      b[383] = *(a16 + 23);
+
+		  b[384] = ZERO;
+		  b[385] = ZERO;
+	      b[386] = ZERO;
+	      b[387] = ZERO;
+	      b[388] = ZERO;
+	      b[389] = ZERO;
+	      b[390] = ZERO;
+	      b[391] = ZERO;
+	      b[392] = ZERO;
+	      b[393] = ZERO;
+	      b[394] = ZERO;
+	      b[395] = ZERO;
+	      b[396] = ZERO;
+	      b[397] = ZERO;
+	      b[398] = ZERO;
+	      b[399] = ZERO;
+	      b[400] = ZERO;
+	      b[401] = ZERO;
+	      b[402] = ZERO;
+	      b[403] = ZERO;
+	      b[404] = ZERO;
+	      b[405] = ZERO;
+	      b[406] = ZERO;
+	      b[407] = ZERO;
+#ifdef UNIT
+	      b[408] = ONE;
+	      b[409] = ZERO;
+#else
+	      b[408] = *(a13 + 24);
+	      b[409] = *(a13 + 25);
+#endif
+		  b[410] = *(a14 + 24);
+	      b[411] = *(a14 + 25);
+	      b[412] = *(a15 + 24);
+		  b[413] = *(a15 + 25);
+	      b[414] = *(a16 + 24);
+	      b[415] = *(a16 + 25);
+
+		  b[416] = ZERO;
+		  b[417] = ZERO;
+	      b[418] = ZERO;
+	      b[419] = ZERO;
+	      b[420] = ZERO;
+	      b[421] = ZERO;
+	      b[422] = ZERO;
+	      b[423] = ZERO;
+	      b[424] = ZERO;
+	      b[425] = ZERO;
+	      b[426] = ZERO;
+	      b[427] = ZERO;
+	      b[428] = ZERO;
+	      b[429] = ZERO;
+	      b[430] = ZERO;
+	      b[431] = ZERO;
+	      b[432] = ZERO;
+	      b[433] = ZERO;
+	      b[434] = ZERO;
+	      b[435] = ZERO;
+	      b[436] = ZERO;
+	      b[437] = ZERO;
+	      b[438] = ZERO;
+	      b[439] = ZERO;
+	      b[440] = ZERO;
+	      b[441] = ZERO;
+#ifdef UNIT
+	      b[442] = ONE;
+	      b[443] = ZERO;
+#else
+	      b[442] = *(a14 + 26);
+	      b[443] = *(a14 + 27);
+#endif
+		  b[444] = *(a15 + 26);
+	      b[445] = *(a15 + 27);
+		  b[446] = *(a16 + 26);
+	      b[447] = *(a16 + 27);
+
+		  b[448] = ZERO;
+		  b[449] = ZERO;
+	      b[450] = ZERO;
+	      b[451] = ZERO;
+	      b[452] = ZERO;
+	      b[453] = ZERO;
+	      b[454] = ZERO;
+	      b[455] = ZERO;
+	      b[456] = ZERO;
+	      b[457] = ZERO;
+	      b[458] = ZERO;
+	      b[459] = ZERO;
+	      b[460] = ZERO;
+	      b[461] = ZERO;
+	      b[462] = ZERO;
+	      b[463] = ZERO;
+	      b[464] = ZERO;
+	      b[465] = ZERO;
+	      b[466] = ZERO;
+	      b[467] = ZERO;
+	      b[468] = ZERO;
+	      b[469] = ZERO;
+	      b[470] = ZERO;
+	      b[471] = ZERO;
+	      b[472] = ZERO;
+	      b[473] = ZERO;
+	      b[474] = ZERO;
+	      b[475] = ZERO;
+#ifdef UNIT
+	      b[476] = ONE;
+	      b[477] = ZERO;
+#else
+	      b[476] = *(a15 + 28);
+	      b[477] = *(a15 + 29);
+#endif
+		  b[478] = *(a16 + 28);
+		  b[479] = *(a16 + 29);
+
+		  b[480] = ZERO;
+		  b[481] = ZERO;
+	      b[482] = ZERO;
+	      b[483] = ZERO;
+	      b[484] = ZERO;
+	      b[485] = ZERO;
+	      b[486] = ZERO;
+	      b[487] = ZERO;
+	      b[488] = ZERO;
+	      b[489] = ZERO;
+	      b[490] = ZERO;
+	      b[491] = ZERO;
+	      b[492] = ZERO;
+	      b[493] = ZERO;
+	      b[494] = ZERO;
+	      b[495] = ZERO;
+	      b[496] = ZERO;
+	      b[497] = ZERO;
+	      b[498] = ZERO;
+	      b[499] = ZERO;
+	      b[500] = ZERO;
+	      b[501] = ZERO;
+	      b[502] = ZERO;
+	      b[503] = ZERO;
+	      b[504] = ZERO;
+	      b[505] = ZERO;
+	      b[506] = ZERO;
+	      b[507] = ZERO;
+	      b[508] = ZERO;
+	      b[509] = ZERO;
+#ifdef UNIT
+	      b[510] = ONE;
+	      b[511] = ZERO;
+#else
+	      b[510] = *(a16 + 30);
+	      b[511] = *(a16 + 31);
+#endif
+
+	      a01 += 16 * lda;
+	      a02 += 16 * lda;
+	      a03 += 16 * lda;
+	      a04 += 16 * lda;
+	      a05 += 16 * lda;
+	      a06 += 16 * lda;
+	      a07 += 16 * lda;
+	      a08 += 16 * lda;
+	      a09 += 16 * lda;
+	      a10 += 16 * lda;
+	      a11 += 16 * lda;
+	      a12 += 16 * lda;
+	      a13 += 16 * lda;
+	      a14 += 16 * lda;
+	      a15 += 16 * lda;
+	      a16 += 16 * lda;
+	      b += 512;
+	    }
+
+	  X += 16;
+	  i --;
+	} while (i > 0);
+      }
+
+      i = (m & 15);
+      if (i) {
+
+	if (X < posY) {
+
+	  for (ii = 0; ii < i; ii++){
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+		b[  2] = *(a02 +  0);
+		b[  3] = *(a02 +  1);
+		b[  4] = *(a03 +  0);
+		b[  5] = *(a03 +  1);
+		b[  6] = *(a04 +  0);
+		b[  7] = *(a04 +  1);
+		b[  8] = *(a05 +  0);
+		b[  9] = *(a05 +  1);
+		b[ 10] = *(a06 +  0);
+		b[ 11] = *(a06 +  1);
+		b[ 12] = *(a07 +  0);
+		b[ 13] = *(a07 +  1);
+		b[ 14] = *(a08 +  0);
+		b[ 15] = *(a08 +  1);
+
+		b[ 16] = *(a09 +  0);
+		b[ 17] = *(a09 +  1);
+		b[ 18] = *(a10 +  0);
+		b[ 19] = *(a10 +  1);
+		b[ 20] = *(a11 +  0);
+		b[ 21] = *(a11 +  1);
+		b[ 22] = *(a12 +  0);
+		b[ 23] = *(a12 +  1);
+		b[ 24] = *(a13 +  0);
+		b[ 25] = *(a13 +  1);
+		b[ 26] = *(a14 +  0);
+		b[ 27] = *(a14 +  1);
+		b[ 28] = *(a15 +  0);
+		b[ 29] = *(a15 +  1);
+		b[ 30] = *(a16 +  0);
+		b[ 31] = *(a16 +  1);
+
+	    a01 += 2;
+	    a02 += 2;
+	    a03 += 2;
+	    a04 += 2;
+	    a05 += 2;
+	    a06 += 2;
+	    a07 += 2;
+	    a08 += 2;
+		a09 += 2;
+	    a10 += 2;
+	    a11 += 2;
+	    a12 += 2;
+	    a13 += 2;
+	    a14 += 2;
+	    a15 += 2;
+	    a16 += 2;
+	    b += 32;
+	  }
+	} else
+	  if (X > posY) {
+	    /* a01 += i * lda;
+	      a02 += i * lda;
+	      a03 += i * lda;
+	      a04 += i * lda;
+	      a05 += i * lda;
+	      a06 += i * lda;
+	      a07 += i * lda;
+	      a08 += i * lda;
+	      a09 += i * lda;
+	      a10 += i * lda;
+	      a11 += i * lda;
+	      a12 += i * lda;
+	      a13 += i * lda;
+	      a14 += i * lda;
+	      a15 += i * lda;
+	      a16 += i * lda; */
+	    b += 32 * i;
+	  } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = *(a02 +  0);
+	      b[  3] = *(a02 +  1);
+	      b[  4] = *(a03 +  0);
+	      b[  5] = *(a03 +  1);
+	      b[  6] = *(a04 +  0);
+	      b[  7] = *(a04 +  1);
+	      b[  8] = *(a05 +  0);
+	      b[  9] = *(a05 +  1);
+	      b[ 10] = *(a06 +  0);
+	      b[ 11] = *(a06 +  1);
+	      b[ 12] = *(a07 +  0);
+	      b[ 13] = *(a07 +  1);
+	      b[ 14] = *(a08 +  0);
+	      b[ 15] = *(a08 +  1);
+		  b[ 16] = *(a09 +  0);
+	      b[ 17] = *(a09 +  1);
+	      b[ 18] = *(a10 +  0);
+	      b[ 19] = *(a10 +  1);
+	      b[ 20] = *(a11 +  0);
+	      b[ 21] = *(a11 +  1);
+	      b[ 22] = *(a12 +  0);
+	      b[ 23] = *(a12 +  1);
+	      b[ 24] = *(a13 +  0);
+	      b[ 25] = *(a13 +  1);
+	      b[ 26] = *(a14 +  0);
+	      b[ 27] = *(a14 +  1);
+	      b[ 28] = *(a15 +  0);
+	      b[ 29] = *(a15 +  1);
+		  b[ 30] = *(a16 +  0);
+	      b[ 31] = *(a16 +  1);
+	      b += 32;
+
+	      if (i >= 2) {
+		b[ 0] = ZERO;
+		b[ 1] = ZERO;
+#ifdef UNIT
+		b[ 2] = ONE;
+		b[ 3] = ZERO;
+#else
+		b[ 2] = *(a02 +  2);
+		b[ 3] = *(a02 +  3);
+#endif
+		b[  4] = *(a03 +  2);
+		b[  5] = *(a03 +  3);
+		b[  6] = *(a04 +  2);
+		b[  7] = *(a04 +  3);
+		b[  8] = *(a05 +  2);
+		b[  9] = *(a05 +  3);
+		b[ 10] = *(a06 +  2);
+		b[ 11] = *(a06 +  3);
+		b[ 12] = *(a07 +  2);
+		b[ 13] = *(a07 +  3);
+		b[ 14] = *(a08 +  2);
+		b[ 15] = *(a08 +  3);
+		b[ 16] = *(a09 +  2);
+		b[ 17] = *(a09 +  3);
+		b[ 18] = *(a10 +  2);
+		b[ 19] = *(a10 +  3);
+		b[ 20] = *(a11 +  2);
+		b[ 21] = *(a11 +  3);
+		b[ 22] = *(a12 +  2);
+		b[ 23] = *(a12 +  3);
+		b[ 24] = *(a13 +  2);
+		b[ 25] = *(a13 +  3);
+		b[ 26] = *(a14 +  2);
+		b[ 27] = *(a14 +  3);
+		b[ 28] = *(a15 +  2);
+		b[ 29] = *(a15 +  3);
+		b[ 30] = *(a16 +  2);
+		b[ 31] = *(a16 +  3);
+		b += 32;
+	      }
+
+	      if (i >= 3) {
+		b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+#ifdef UNIT
+		b[ 4] = ONE;
+		b[ 5] = ZERO;
+#else
+		b[ 4] = *(a03 +  4);
+		b[ 5] = *(a03 +  5);
+#endif
+		b[  6] = *(a04 +  4);
+		b[  7] = *(a04 +  5);
+		b[  8] = *(a05 +  4);
+		b[  9] = *(a05 +  5);
+		b[ 10] = *(a06 +  4);
+		b[ 11] = *(a06 +  5);
+		b[ 12] = *(a07 +  4);
+		b[ 13] = *(a07 +  5);
+		b[ 14] = *(a08 +  4);
+		b[ 15] = *(a08 +  5);
+		b[ 16] = *(a09 +  4);
+		b[ 17] = *(a09 +  5);
+		b[ 18] = *(a10 +  4);
+		b[ 19] = *(a10 +  5);
+		b[ 20] = *(a11 +  4);
+		b[ 21] = *(a11 +  5);
+		b[ 22] = *(a12 +  4);
+		b[ 23] = *(a12 +  5);
+		b[ 24] = *(a13 +  4);
+		b[ 25] = *(a13 +  5);
+		b[ 26] = *(a14 +  4);
+		b[ 27] = *(a14 +  5);
+		b[ 28] = *(a15 +  4);
+		b[ 29] = *(a15 +  5);
+		b[ 30] = *(a16 +  4);
+		b[ 31] = *(a16 +  5);
+		b += 32;
+	      }
+
+	      if (i >= 4) {
+		b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+		b[ 4] = ZERO;
+		b[ 5] = ZERO;
+#ifdef UNIT
+		b[ 6] = ONE;
+		b[ 7] = ZERO;
+#else
+		b[ 6] = *(a04 +  6);
+		b[ 7] = *(a04 +  7);
+#endif
+		b[  8] = *(a05 +  6);
+		b[  9] = *(a05 +  7);
+		b[ 10] = *(a06 +  6);
+		b[ 11] = *(a06 +  7);
+		b[ 12] = *(a07 +  6);
+		b[ 13] = *(a07 +  7);
+		b[ 14] = *(a08 +  6);
+		b[ 15] = *(a08 +  7);
+		b[ 16] = *(a09 +  6);
+		b[ 17] = *(a09 +  7);
+		b[ 18] = *(a10 +  6);
+		b[ 19] = *(a10 +  7);
+		b[ 20] = *(a11 +  6);
+		b[ 21] = *(a11 +  7);
+		b[ 22] = *(a12 +  6);
+		b[ 23] = *(a12 +  7);
+		b[ 24] = *(a13 +  6);
+		b[ 25] = *(a13 +  7);
+		b[ 26] = *(a14 +  6);
+		b[ 27] = *(a14 +  7);
+		b[ 28] = *(a15 +  6);
+		b[ 29] = *(a15 +  7);
+		b[ 30] = *(a16 +  6);
+		b[ 31] = *(a16 +  7);
+		b += 32;
+	      }
+
+	      if (i >= 5) {
+		b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+		b[ 4] = ZERO;
+		b[ 5] = ZERO;
+		b[ 6] = ZERO;
+		b[ 7] = ZERO;
+#ifdef UNIT
+		b[ 8] = ONE;
+		b[ 9] = ZERO;
+#else
+		b[ 8] = *(a05 +  8);
+		b[ 9] = *(a05 +  9);
+#endif
+		b[ 10] = *(a06 +  8);
+		b[ 11] = *(a06 +  9);
+		b[ 12] = *(a07 +  8);
+		b[ 13] = *(a07 +  9);
+		b[ 14] = *(a08 +  8);
+		b[ 15] = *(a08 +  9);
+		b[ 16] = *(a09 +  8);
+		b[ 17] = *(a09 +  9);
+		b[ 18] = *(a10 +  8);
+		b[ 19] = *(a10 +  9);
+		b[ 20] = *(a11 +  8);
+		b[ 21] = *(a11 +  9);
+		b[ 22] = *(a12 +  8);
+		b[ 23] = *(a12 +  9);
+		b[ 24] = *(a13 +  8);
+		b[ 25] = *(a13 +  9);
+		b[ 26] = *(a14 +  8);
+		b[ 27] = *(a14 +  9);
+		b[ 28] = *(a15 +  8);
+		b[ 29] = *(a15 +  9);
+		b[ 30] = *(a16 +  8);
+		b[ 31] = *(a16 +  9);
+		b += 32;
+	      }
+
+	      if (i >= 6) {
+		b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+		b[ 4] = ZERO;
+		b[ 5] = ZERO;
+		b[ 6] = ZERO;
+		b[ 7] = ZERO;
+		b[ 8] = ZERO;
+		b[ 9] = ZERO;
+#ifdef UNIT
+		b[10] = ONE;
+		b[11] = ZERO;
+#else
+		b[10] = *(a06 + 10);
+		b[11] = *(a06 + 11);
+#endif
+		b[ 12] = *(a07 + 10);
+		b[ 13] = *(a07 + 11);
+		b[ 14] = *(a08 + 10);
+		b[ 15] = *(a08 + 11);
+		b[ 16] = *(a09 + 10);
+		b[ 17] = *(a09 + 11);
+		b[ 18] = *(a10 + 10);
+		b[ 19] = *(a10 + 11);
+		b[ 20] = *(a11 + 10);
+		b[ 21] = *(a11 + 11);
+		b[ 22] = *(a12 + 10);
+		b[ 23] = *(a12 + 11);
+		b[ 24] = *(a13 + 10);
+		b[ 25] = *(a13 + 11);
+		b[ 26] = *(a14 + 10);
+		b[ 27] = *(a14 + 11);
+		b[ 28] = *(a15 + 10);
+		b[ 29] = *(a15 + 11);
+		b[ 30] = *(a16 + 10);
+		b[ 31] = *(a16 + 11);
+		b += 32;
+	      }
+
+	      if (i >= 7) {
+		b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+		b[ 4] = ZERO;
+		b[ 5] = ZERO;
+		b[ 6] = ZERO;
+		b[ 7] = ZERO;
+		b[ 8] = ZERO;
+		b[ 9] = ZERO;
+		b[10] = ZERO;
+		b[11] = ZERO;
+#ifdef UNIT
+		b[12] = ONE;
+		b[13] = ZERO;
+#else
+		b[12] = *(a07 + 12);
+		b[13] = *(a07 + 13);
+#endif
+		b[ 14] = *(a08 + 12);
+		b[ 15] = *(a08 + 13);
+		b[ 16] = *(a09 + 12);
+		b[ 17] = *(a09 + 13);
+		b[ 18] = *(a10 + 12);
+		b[ 19] = *(a10 + 13);
+		b[ 20] = *(a11 + 12);
+		b[ 21] = *(a11 + 13);
+		b[ 22] = *(a12 + 12);
+		b[ 23] = *(a12 + 13);
+		b[ 24] = *(a13 + 12);
+		b[ 25] = *(a13 + 13);
+		b[ 26] = *(a14 + 12);
+		b[ 27] = *(a14 + 13);
+		b[ 28] = *(a15 + 12);
+		b[ 29] = *(a15 + 13);
+		b[ 30] = *(a16 + 12);
+		b[ 31] = *(a16 + 13);
+		b += 32;
+	    }
+
+		if (i >= 8) {
+	    b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+		b[ 4] = ZERO;
+		b[ 5] = ZERO;
+		b[ 6] = ZERO;
+		b[ 7] = ZERO;
+		b[ 8] = ZERO;
+		b[ 9] = ZERO;
+		b[10] = ZERO;
+		b[11] = ZERO;
+	    b[12] = ZERO;
+	    b[13] = ZERO;
+#ifdef UNIT
+	      b[ 14] = ONE;
+	      b[ 15] = ZERO;
+#else
+	      b[ 14] = *(a08 +  14);
+	      b[ 15] = *(a08 +  15);
+#endif
+		b[ 16] = *(a09 + 14);
+		b[ 17] = *(a09 + 15);
+		b[ 18] = *(a10 + 14);
+		b[ 19] = *(a10 + 15);
+		b[ 20] = *(a11 + 14);
+		b[ 21] = *(a11 + 15);
+		b[ 22] = *(a12 + 14);
+		b[ 23] = *(a12 + 15);
+		b[ 24] = *(a13 + 14);
+		b[ 25] = *(a13 + 15);
+		b[ 26] = *(a14 + 14);
+		b[ 27] = *(a14 + 15);
+		b[ 28] = *(a15 + 14);
+		b[ 29] = *(a15 + 15);
+		b[ 30] = *(a16 + 14);
+		b[ 31] = *(a16 + 15);
+	      b += 32;
+	    }
+
+		if (i >= 9) {
+	    b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+		b[ 4] = ZERO;
+		b[ 5] = ZERO;
+		b[ 6] = ZERO;
+		b[ 7] = ZERO;
+		b[ 8] = ZERO;
+		b[ 9] = ZERO;
+		b[10] = ZERO;
+		b[11] = ZERO;
+	    b[12] = ZERO;
+	    b[13] = ZERO;
+	    b[ 14] = ZERO;
+	    b[ 15] = ZERO;
+#ifdef UNIT
+	      b[ 16] = ONE;
+	      b[ 17] = ZERO;
+#else
+	      b[ 16] = *(a09 + 16);
+	      b[ 17] = *(a09 + 17);
+#endif
+	    b[ 18] = *(a10 + 16);
+		b[ 19] = *(a10 + 17);
+		b[ 20] = *(a11 + 16);
+		b[ 21] = *(a11 + 17);
+		b[ 22] = *(a12 + 16);
+		b[ 23] = *(a12 + 17);
+		b[ 24] = *(a13 + 16);
+		b[ 25] = *(a13 + 17);
+		b[ 26] = *(a14 + 16);
+		b[ 27] = *(a14 + 17);
+		b[ 28] = *(a15 + 16);
+		b[ 29] = *(a15 + 17);
+		b[ 30] = *(a16 + 16);
+		b[ 31] = *(a16 + 17);
+	      b += 32;
+	    }
+
+		if (i >= 10) {
+	    b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+		b[ 4] = ZERO;
+		b[ 5] = ZERO;
+		b[ 6] = ZERO;
+		b[ 7] = ZERO;
+		b[ 8] = ZERO;
+		b[ 9] = ZERO;
+		b[10] = ZERO;
+		b[11] = ZERO;
+	    b[12] = ZERO;
+	    b[13] = ZERO;
+	    b[ 14] = ZERO;
+	    b[ 15] = ZERO;
+	    b[ 16] = ZERO;
+	    b[ 17] = ZERO;
+#ifdef UNIT
+	      b[ 18] = ONE;
+	      b[ 19] = ZERO;
+#else
+	      b[ 18] = *(a10 + 18);
+	      b[ 19] = *(a10 + 19);
+#endif
+	    b[ 20] = *(a11 + 18);
+		b[ 21] = *(a11 + 19);
+		b[ 22] = *(a12 + 18);
+		b[ 23] = *(a12 + 19);
+		b[ 24] = *(a13 + 18);
+		b[ 25] = *(a13 + 19);
+		b[ 26] = *(a14 + 18);
+		b[ 27] = *(a14 + 19);
+		b[ 28] = *(a15 + 18);
+		b[ 29] = *(a15 + 19);
+		b[ 30] = *(a16 + 18);
+		b[ 31] = *(a16 + 19);
+	      b += 32;
+	    }
+
+		if (i >= 11) {
+	    b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+		b[ 4] = ZERO;
+		b[ 5] = ZERO;
+		b[ 6] = ZERO;
+		b[ 7] = ZERO;
+		b[ 8] = ZERO;
+		b[ 9] = ZERO;
+		b[10] = ZERO;
+		b[11] = ZERO;
+	    b[12] = ZERO;
+	    b[13] = ZERO;
+	    b[ 14] = ZERO;
+	    b[ 15] = ZERO;
+	    b[ 16] = ZERO;
+	    b[ 17] = ZERO;
+	    b[ 18] = ZERO;
+	    b[ 19] = ZERO;
+#ifdef UNIT
+	      b[ 20] = ONE;
+	      b[ 21] = ZERO;
+#else
+	      b[ 20] = *(a11 + 20);
+	      b[ 21] = *(a11 + 21);
+#endif
+	    b[ 22] = *(a12 + 20);
+		b[ 23] = *(a12 + 21);
+		b[ 24] = *(a13 + 20);
+		b[ 25] = *(a13 + 21);
+		b[ 26] = *(a14 + 20);
+		b[ 27] = *(a14 + 21);
+		b[ 28] = *(a15 + 20);
+		b[ 29] = *(a15 + 21);
+		b[ 30] = *(a16 + 20);
+		b[ 31] = *(a16 + 21);
+	      b += 32;
+	    }
+
+		if (i >= 12) {
+	    b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+		b[ 4] = ZERO;
+		b[ 5] = ZERO;
+		b[ 6] = ZERO;
+		b[ 7] = ZERO;
+		b[ 8] = ZERO;
+		b[ 9] = ZERO;
+		b[10] = ZERO;
+		b[11] = ZERO;
+	    b[12] = ZERO;
+	    b[13] = ZERO;
+	    b[ 14] = ZERO;
+	    b[ 15] = ZERO;
+	    b[ 16] = ZERO;
+	    b[ 17] = ZERO;
+	    b[ 18] = ZERO;
+	    b[ 19] = ZERO;
+	    b[ 20] = ZERO;
+	    b[ 21] = ZERO;
+#ifdef UNIT
+	      b[ 22] = ONE;
+	      b[ 23] = ZERO;
+#else
+	      b[ 22] = *(a12 + 22);
+	      b[ 23] = *(a12 + 23);
+#endif
+	    b[ 24] = *(a13 + 22);
+		b[ 25] = *(a13 + 23);
+		b[ 26] = *(a14 + 22);
+		b[ 27] = *(a14 + 23);
+		b[ 28] = *(a15 + 22);
+		b[ 29] = *(a15 + 23);
+		b[ 30] = *(a16 + 22);
+		b[ 31] = *(a16 + 23);
+	      b += 32;
+	    }
+
+		if (i >= 13) {
+	    b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+		b[ 4] = ZERO;
+		b[ 5] = ZERO;
+		b[ 6] = ZERO;
+		b[ 7] = ZERO;
+		b[ 8] = ZERO;
+		b[ 9] = ZERO;
+		b[10] = ZERO;
+		b[11] = ZERO;
+	    b[12] = ZERO;
+	    b[13] = ZERO;
+	    b[ 14] = ZERO;
+	    b[ 15] = ZERO;
+	    b[ 16] = ZERO;
+	    b[ 17] = ZERO;
+	    b[ 18] = ZERO;
+	    b[ 19] = ZERO;
+	    b[ 20] = ZERO;
+	    b[ 21] = ZERO;
+	    b[ 22] = ZERO;
+	    b[ 23] = ZERO;
+#ifdef UNIT
+	      b[ 24] = ONE;
+	      b[ 25] = ZERO;
+#else
+	      b[ 24] = *(a13 + 24);
+	      b[ 25] = *(a13 + 25);
+#endif
+	    b[ 26] = *(a14 + 24);
+		b[ 27] = *(a14 + 25);
+		b[ 28] = *(a15 + 24);
+		b[ 29] = *(a15 + 25);
+		b[ 30] = *(a16 + 24);
+		b[ 31] = *(a16 + 25);
+	      b += 32;
+	    }
+
+		if (i >= 14) {
+	    b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+		b[ 4] = ZERO;
+		b[ 5] = ZERO;
+		b[ 6] = ZERO;
+		b[ 7] = ZERO;
+		b[ 8] = ZERO;
+		b[ 9] = ZERO;
+		b[10] = ZERO;
+		b[11] = ZERO;
+	    b[12] = ZERO;
+	    b[13] = ZERO;
+	    b[ 14] = ZERO;
+	    b[ 15] = ZERO;
+	    b[ 16] = ZERO;
+	    b[ 17] = ZERO;
+	    b[ 18] = ZERO;
+	    b[ 19] = ZERO;
+	    b[ 20] = ZERO;
+	    b[ 21] = ZERO;
+	    b[ 22] = ZERO;
+	    b[ 23] = ZERO;
+	    b[ 24] = ZERO;
+	    b[ 25] = ZERO;
+#ifdef UNIT
+	      b[ 26] = ONE;
+	      b[ 27] = ZERO;
+#else
+	      b[ 26] = *(a14 + 26);
+	      b[ 27] = *(a14 + 27);
+#endif
+	    b[ 28] = *(a15 + 26);
+		b[ 29] = *(a15 + 27);
+		b[ 30] = *(a16 + 26);
+		b[ 31] = *(a16 + 27);
+	      b += 32;
+	    }
+
+		if (i >= 15) {
+	    b[ 0] = ZERO;
+		b[ 1] = ZERO;
+		b[ 2] = ZERO;
+		b[ 3] = ZERO;
+		b[ 4] = ZERO;
+		b[ 5] = ZERO;
+		b[ 6] = ZERO;
+		b[ 7] = ZERO;
+		b[ 8] = ZERO;
+		b[ 9] = ZERO;
+		b[10] = ZERO;
+		b[11] = ZERO;
+	    b[12] = ZERO;
+	    b[13] = ZERO;
+	    b[ 14] = ZERO;
+	    b[ 15] = ZERO;
+	    b[ 16] = ZERO;
+	    b[ 17] = ZERO;
+	    b[ 18] = ZERO;
+	    b[ 19] = ZERO;
+	    b[ 20] = ZERO;
+	    b[ 21] = ZERO;
+	    b[ 22] = ZERO;
+	    b[ 23] = ZERO;
+	    b[ 24] = ZERO;
+	    b[ 25] = ZERO;
+	    b[ 26] = ZERO;
+	    b[ 27] = ZERO;
+#ifdef UNIT
+	      b[ 28] = ONE;
+	      b[ 29] = ZERO;
+#else
+	      b[ 28] = *(a15 + 28);
+	      b[ 29] = *(a15 + 29);
+#endif
+	    b[ 30] = *(a16 + 28);
+		b[ 31] = *(a16 + 29);
+	      b += 32;
+	    }
+	  }
+      }
+
+      posY += 16;
+      js --;
+    } while (js > 0);
+  } /* End of main loop */
+
+
+  if (n & 8){
+      X = posX;
+
+      if (posX <= posY) {
+	a01 = a + posX * 2 + (posY + 0) * lda;
+	a02 = a + posX * 2 + (posY + 1) * lda;
+	a03 = a + posX * 2 + (posY + 2) * lda;
+	a04 = a + posX * 2 + (posY + 3) * lda;
+	a05 = a + posX * 2 + (posY + 4) * lda;
+	a06 = a + posX * 2 + (posY + 5) * lda;
+	a07 = a + posX * 2 + (posY + 6) * lda;
+	a08 = a + posX * 2 + (posY + 7) * lda;
+      } else {
+	a01 = a + posY * 2 + (posX + 0) * lda;
+	a02 = a + posY * 2 + (posX + 1) * lda;
+	a03 = a + posY * 2 + (posX + 2) * lda;
+	a04 = a + posY * 2 + (posX + 3) * lda;
+	a05 = a + posY * 2 + (posX + 4) * lda;
+	a06 = a + posY * 2 + (posX + 5) * lda;
+	a07 = a + posY * 2 + (posX + 6) * lda;
+	a08 = a + posY * 2 + (posX + 7) * lda;
+      }
+
+      i = (m >> 3);
+      if (i > 0) {
+	do {
+	  if (X < posY) {
+	    for (ii = 0; ii < 8; ii++){
+
+	    b[  0] = *(a01 +  0);
+		b[  1] = *(a01 +  1);
+		b[  2] = *(a02 +  0);
+		b[  3] = *(a02 +  1);
+		b[  4] = *(a03 +  0);
+		b[  5] = *(a03 +  1);
+		b[  6] = *(a04 +  0);
+		b[  7] = *(a04 +  1);
+
+		b[  8] = *(a05 +  0);
+		b[  9] = *(a05 +  1);
+		b[ 10] = *(a06 +  0);
+		b[ 11] = *(a06 +  1);
+		b[ 12] = *(a07 +  0);
+		b[ 13] = *(a07 +  1);
+		b[ 14] = *(a08 +  0);
+		b[ 15] = *(a08 +  1);
+
+		a01 += 2;
+		a02 += 2;
+		a03 += 2;
+		a04 += 2;
+		a05 += 2;
+		a06 += 2;
+		a07 += 2;
+		a08 += 2;
+		b += 16;
+	    }
+	  } else
+	    if (X > posY) {
+	      a01 += 8 * lda;
+	      a02 += 8 * lda;
+	      a03 += 8 * lda;
+	      a04 += 8 * lda;
+	      a05 += 8 * lda;
+	      a06 += 8 * lda;
+	      a07 += 8 * lda;
+	      a08 += 8 * lda;
+
+	      b += 128;
+	    } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = *(a02 +  0);
+	      b[  3] = *(a02 +  1);
+	      b[  4] = *(a03 +  0);
+	      b[  5] = *(a03 +  1);
+	      b[  6] = *(a04 +  0);
+	      b[  7] = *(a04 +  1);
+
+	      b[  8] = *(a05 +  0);
+	      b[  9] = *(a05 +  1);
+	      b[ 10] = *(a06 +  0);
+	      b[ 11] = *(a06 +  1);
+	      b[ 12] = *(a07 +  0);
+	      b[ 13] = *(a07 +  1);
+	      b[ 14] = *(a08 +  0);
+	      b[ 15] = *(a08 +  1);
+
+	      b[ 16] = ZERO;
+	      b[ 17] = ZERO;
+#ifdef UNIT
+	      b[ 18] = ONE;
+	      b[ 19] = ZERO;
+#else
+	      b[ 18] = *(a02 +  2);
+	      b[ 19] = *(a02 +  3);
+#endif
+	      b[ 20] = *(a03 +  2);
+	      b[ 21] = *(a03 +  3);
+	      b[ 22] = *(a04 +  2);
+	      b[ 23] = *(a04 +  3);
+	      b[ 24] = *(a05 +  2);
+	      b[ 25] = *(a05 +  3);
+	      b[ 26] = *(a06 +  2);
+	      b[ 27] = *(a06 +  3);
+	      b[ 28] = *(a07 +  2);
+	      b[ 29] = *(a07 +  3);
+	      b[ 30] = *(a08 +  2);
+	      b[ 31] = *(a08 +  3);
+
+	      b[ 32] = ZERO;
+	      b[ 33] = ZERO;
+	      b[ 34] = ZERO;
+	      b[ 35] = ZERO;
+#ifdef UNIT
+	      b[ 36] = ONE;
+	      b[ 37] = ZERO;
+#else
+	      b[ 36] = *(a03 +  4);
+	      b[ 37] = *(a03 +  5);
+#endif
+	      b[ 38] = *(a04 +  4);
+	      b[ 39] = *(a04 +  5);
+	      b[ 40] = *(a05 +  4);
+	      b[ 41] = *(a05 +  5);
+	      b[ 42] = *(a06 +  4);
+	      b[ 43] = *(a06 +  5);
+	      b[ 44] = *(a07 +  4);
+	      b[ 45] = *(a07 +  5);
+	      b[ 46] = *(a08 +  4);
+	      b[ 47] = *(a08 +  5);
+
+	      b[ 48] = ZERO;
+	      b[ 49] = ZERO;
+	      b[ 50] = ZERO;
+	      b[ 51] = ZERO;
+	      b[ 52] = ZERO;
+	      b[ 53] = ZERO;
+#ifdef UNIT
+	      b[ 54] = ONE;
+	      b[ 55] = ZERO;
+#else
+	      b[ 54] = *(a04 +  6);
+	      b[ 55] = *(a04 +  7);
+#endif
+	      b[ 56] = *(a05 +  6);
+	      b[ 57] = *(a05 +  7);
+	      b[ 58] = *(a06 +  6);
+	      b[ 59] = *(a06 +  7);
+	      b[ 60] = *(a07 +  6);
+	      b[ 61] = *(a07 +  7);
+	      b[ 62] = *(a08 +  6);
+	      b[ 63] = *(a08 +  7);
+
+	      b[ 64] = ZERO;
+	      b[ 65] = ZERO;
+	      b[ 66] = ZERO;
+	      b[ 67] = ZERO;
+	      b[ 68] = ZERO;
+	      b[ 69] = ZERO;
+	      b[ 70] = ZERO;
+	      b[ 71] = ZERO;
+#ifdef UNIT
+	      b[ 72] = ONE;
+	      b[ 73] = ZERO;
+#else
+	      b[ 72] = *(a05 +  8);
+	      b[ 73] = *(a05 +  9);
+#endif
+	      b[ 74] = *(a06 +  8);
+	      b[ 75] = *(a06 +  9);
+	      b[ 76] = *(a07 +  8);
+	      b[ 77] = *(a07 +  9);
+	      b[ 78] = *(a08 +  8);
+	      b[ 79] = *(a08 +  9);
+
+	      b[ 80] = ZERO;
+	      b[ 81] = ZERO;
+	      b[ 82] = ZERO;
+	      b[ 83] = ZERO;
+	      b[ 84] = ZERO;
+	      b[ 85] = ZERO;
+	      b[ 86] = ZERO;
+	      b[ 87] = ZERO;
+	      b[ 88] = ZERO;
+	      b[ 89] = ZERO;
+#ifdef UNIT
+	      b[ 90] = ONE;
+	      b[ 91] = ZERO;
+#else
+	      b[ 90] = *(a06 + 10);
+	      b[ 91] = *(a06 + 11);
+#endif
+	      b[ 92] = *(a07 + 10);
+	      b[ 93] = *(a07 + 11);
+	      b[ 94] = *(a08 + 10);
+	      b[ 95] = *(a08 + 11);
+
+	      b[ 96] = ZERO;
+	      b[ 97] = ZERO;
+	      b[ 98] = ZERO;
+	      b[ 99] = ZERO;
+	      b[100] = ZERO;
+	      b[101] = ZERO;
+	      b[102] = ZERO;
+	      b[103] = ZERO;
+	      b[104] = ZERO;
+	      b[105] = ZERO;
+	      b[106] = ZERO;
+	      b[107] = ZERO;
+#ifdef UNIT
+	      b[108] = ONE;
+	      b[109] = ZERO;
+#else
+	      b[108] = *(a07 + 12);
+	      b[109] = *(a07 + 13);
+#endif
+	      b[110] = *(a08 + 12);
+	      b[111] = *(a08 + 13);
+
+	      b[112] = ZERO;
+	      b[113] = ZERO;
+	      b[114] = ZERO;
+	      b[115] = ZERO;
+	      b[116] = ZERO;
+	      b[117] = ZERO;
+	      b[118] = ZERO;
+	      b[119] = ZERO;
+	      b[120] = ZERO;
+	      b[121] = ZERO;
+	      b[122] = ZERO;
+	      b[123] = ZERO;
+	      b[124] = ZERO;
+	      b[125] = ZERO;
+#ifdef UNIT
+	      b[126] = ONE;
+	      b[127] = ZERO;
+#else
+	      b[126] = *(a08 + 14);
+	      b[127] = *(a08 + 15);
+#endif
+
+	      a01 += 8 * lda;
+	      a02 += 8 * lda;
+	      a03 += 8 * lda;
+	      a04 += 8 * lda;
+	      a05 += 8 * lda;
+	      a06 += 8 * lda;
+	      a07 += 8 * lda;
+	      a08 += 8 * lda;
+	      b += 128;
+	    }
+
+	  X += 8;
+	  i --;
+	} while (i > 0);
+      }
+
+      i = (m & 7);
+      if (i) {
+
+	if (X < posY) {
+	  for (ii = 0; ii < i; ii++){
+
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+	    b[  2] = *(a02 +  0);
+	    b[  3] = *(a02 +  1);
+	    b[  4] = *(a03 +  0);
+	    b[  5] = *(a03 +  1);
+	    b[  6] = *(a04 +  0);
+	    b[  7] = *(a04 +  1);
+
+	    b[  8] = *(a05 +  0);
+	    b[  9] = *(a05 +  1);
+	    b[ 10] = *(a06 +  0);
+	    b[ 11] = *(a06 +  1);
+	    b[ 12] = *(a07 +  0);
+	    b[ 13] = *(a07 +  1);
+	    b[ 14] = *(a08 +  0);
+	    b[ 15] = *(a08 +  1);
+
+	    a01 += 2;
+	    a02 += 2;
+	    a03 += 2;
+	    a04 += 2;
+	    a05 += 2;
+	    a06 += 2;
+	    a07 += 2;
+	    a08 += 2;
+	    b += 16;
+	    }
+	} else
+	  if (X > posY) {
+	      /* a01 += i * lda;
+	      a02 += i * lda;
+	      a03 += i * lda;
+	      a04 += i * lda;
+	      a05 += i * lda;
+	      a06 += i * lda;
+	      a07 += i * lda;
+	      a08 += i * lda; */
+	      b += 16 * i;
+	  } else {
+#ifdef UNIT
+	    b[ 0] = ONE;
+	    b[ 1] = ZERO;
+#else
+	    b[ 0] = *(a01 +  0);
+	    b[ 1] = *(a01 +  1);
+#endif
+	    b[ 2] = *(a02 +  0);
+	    b[ 3] = *(a02 +  1);
+	    b[ 4] = *(a03 +  0);
+	    b[ 5] = *(a03 +  1);
+	    b[ 6] = *(a04 +  0);
+	    b[ 7] = *(a04 +  1);
+	    b[ 8] = *(a05 +  0);
+	    b[ 9] = *(a05 +  1);
+	    b[10] = *(a06 +  0);
+	    b[11] = *(a06 +  1);
+	    b[12] = *(a07 +  0);
+	    b[13] = *(a07 +  1);
+	    b[14] = *(a08 +  0);
+	    b[15] = *(a08 +  1);
+	    b += 16;
+
+	    if(i >= 2) {
+	      b[ 0] = ZERO;
+	      b[ 1] = ZERO;
+#ifdef UNIT
+	      b[ 2] = ONE;
+	      b[ 3] = ZERO;
+#else
+	      b[ 2] = *(a02 +  2);
+	      b[ 3] = *(a02 +  3);
+#endif
+	      b[ 4] = *(a03 +  2);
+	      b[ 5] = *(a03 +  3);
+	      b[ 6] = *(a04 +  2);
+	      b[ 7] = *(a04 +  3);
+	      b[ 8] = *(a05 +  2);
+	      b[ 9] = *(a05 +  3);
+	      b[10] = *(a06 +  2);
+	      b[11] = *(a06 +  3);
+	      b[12] = *(a07 +  2);
+	      b[13] = *(a07 +  3);
+	      b[14] = *(a08 +  2);
+	      b[15] = *(a08 +  3);
+	      b += 16;
+	    }
+
+	    if (i >= 3) {
+	      b[ 0] = ZERO;
+	      b[ 1] = ZERO;
+	      b[ 2] = ZERO;
+	      b[ 3] = ZERO;
+#ifdef UNIT
+	      b[ 4] = ONE;
+	      b[ 5] = ZERO;
+#else
+	      b[ 4] = *(a03 +  4);
+	      b[ 5] = *(a03 +  5);
+#endif
+	      b[ 6] = *(a04 +  4);
+	      b[ 7] = *(a04 +  5);
+	      b[ 8] = *(a05 +  4);
+	      b[ 9] = *(a05 +  5);
+	      b[10] = *(a06 +  4);
+	      b[11] = *(a06 +  5);
+	      b[12] = *(a07 +  4);
+	      b[13] = *(a07 +  5);
+	      b[14] = *(a08 +  4);
+	      b[15] = *(a08 +  5);
+	      b += 16;
+	    }
+
+	    if (i >= 4) {
+	      b[ 0] = ZERO;
+	      b[ 1] = ZERO;
+	      b[ 2] = ZERO;
+	      b[ 3] = ZERO;
+	      b[ 4] = ZERO;
+	      b[ 5] = ZERO;
+#ifdef UNIT
+	      b[ 6] = ONE;
+	      b[ 7] = ZERO;
+#else
+	      b[ 6] = *(a04 +  6);
+	      b[ 7] = *(a04 +  7);
+#endif
+	      b[ 8] = *(a05 +  6);
+	      b[ 9] = *(a05 +  7);
+	      b[10] = *(a06 +  6);
+	      b[11] = *(a06 +  7);
+	      b[12] = *(a07 +  6);
+	      b[13] = *(a07 +  7);
+	      b[14] = *(a08 +  6);
+	      b[15] = *(a08 +  7);
+	      b += 16;
+	    }
+
+	    if (i >= 5) {
+	      b[ 0] = ZERO;
+	      b[ 1] = ZERO;
+	      b[ 2] = ZERO;
+	      b[ 3] = ZERO;
+	      b[ 4] = ZERO;
+	      b[ 5] = ZERO;
+	      b[ 6] = ZERO;
+	      b[ 7] = ZERO;
+#ifdef UNIT
+	      b[ 8] = ONE;
+	      b[ 9] = ZERO;
+#else
+	      b[ 8] = *(a05 +  8);
+	      b[ 9] = *(a05 +  9);
+#endif
+	      b[10] = *(a06 +  8);
+	      b[11] = *(a06 +  9);
+	      b[12] = *(a07 +  8);
+	      b[13] = *(a07 +  9);
+	      b[14] = *(a08 +  8);
+	      b[15] = *(a08 +  9);
+	      b += 16;
+	    }
+
+	    if (i >= 6) {
+	      b[ 0] = ZERO;
+	      b[ 1] = ZERO;
+	      b[ 2] = ZERO;
+	      b[ 3] = ZERO;
+	      b[ 4] = ZERO;
+	      b[ 5] = ZERO;
+	      b[ 6] = ZERO;
+	      b[ 7] = ZERO;
+	      b[ 8] = ZERO;
+	      b[ 9] = ZERO;
+#ifdef UNIT
+	      b[10] = ONE;
+	      b[11] = ZERO;
+#else
+	      b[10] = *(a06 + 10);
+	      b[11] = *(a06 + 11);
+#endif
+	      b[12] = *(a07 + 10);
+	      b[13] = *(a07 + 11);
+	      b[14] = *(a08 + 10);
+	      b[15] = *(a08 + 11);
+	      b += 16;
+	    }
+
+	    if (i >= 7) {
+	      b[ 0] = ZERO;
+	      b[ 1] = ZERO;
+	      b[ 2] = ZERO;
+	      b[ 3] = ZERO;
+	      b[ 4] = ZERO;
+	      b[ 5] = ZERO;
+	      b[ 6] = ZERO;
+	      b[ 7] = ZERO;
+	      b[ 8] = ZERO;
+	      b[ 9] = ZERO;
+	      b[10] = ZERO;
+	      b[11] = ZERO;
+#ifdef UNIT
+	      b[12] = ONE;
+	      b[13] = ZERO;
+#else
+	      b[12] = *(a07 + 12);
+	      b[13] = *(a07 + 13);
+#endif
+	      b[14] = *(a08 + 12);
+	      b[15] = *(a08 + 13);
+	      b += 16;
+	    }
+	  }
+      }
+
+      posY += 8;
+  }
+
+
+  if (n & 4){
+      X = posX;
+
+      if (posX <= posY) {
+	a01 = a + posX * 2 + (posY + 0) * lda;
+	a02 = a + posX * 2 + (posY + 1) * lda;
+	a03 = a + posX * 2 + (posY + 2) * lda;
+	a04 = a + posX * 2 + (posY + 3) * lda;
+      } else {
+	a01 = a + posY * 2 + (posX + 0) * lda;
+	a02 = a + posY * 2 + (posX + 1) * lda;
+	a03 = a + posY * 2 + (posX + 2) * lda;
+	a04 = a + posY * 2 + (posX + 3) * lda;
+      }
+
+      i = (m >> 2);
+      if (i > 0) {
+	do {
+	  if (X < posY) {
+	      for (ii = 0; ii < 4; ii++){
+
+		b[  0] = *(a01 +  0);
+		b[  1] = *(a01 +  1);
+		b[  2] = *(a02 +  0);
+		b[  3] = *(a02 +  1);
+		b[  4] = *(a03 +  0);
+		b[  5] = *(a03 +  1);
+		b[  6] = *(a04 +  0);
+		b[  7] = *(a04 +  1);
+
+		a01 += 2;
+		a02 += 2;
+		a03 += 2;
+		a04 += 2;
+		b += 8;
+	      }
+	  } else
+	    if (X > posY) {
+	      a01 += 4 * lda;
+	      a02 += 4 * lda;
+	      a03 += 4 * lda;
+	      a04 += 4 * lda;
+	      b += 32;
+	    } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = *(a02 +  0);
+	      b[  3] = *(a02 +  1);
+	      b[  4] = *(a03 +  0);
+	      b[  5] = *(a03 +  1);
+	      b[  6] = *(a04 +  0);
+	      b[  7] = *(a04 +  1);
+
+	      b[  8] = ZERO;
+	      b[  9] = ZERO;
+#ifdef UNIT
+	      b[ 10] = ONE;
+	      b[ 11] = ZERO;
+#else
+	      b[ 10] = *(a02 +  2);
+	      b[ 11] = *(a02 +  3);
+#endif
+	      b[ 12] = *(a03 +  2);
+	      b[ 13] = *(a03 +  3);
+	      b[ 14] = *(a04 +  2);
+	      b[ 15] = *(a04 +  3);
+
+	      b[ 16] = ZERO;
+	      b[ 17] = ZERO;
+	      b[ 18] = ZERO;
+	      b[ 19] = ZERO;
+#ifdef UNIT
+	      b[ 20] = ONE;
+	      b[ 21] = ZERO;
+#else
+	      b[ 20] = *(a03 +  4);
+	      b[ 21] = *(a03 +  5);
+#endif
+	      b[ 22] = *(a04 +  4);
+	      b[ 23] = *(a04 +  5);
+
+	      b[ 24] = ZERO;
+	      b[ 25] = ZERO;
+	      b[ 26] = ZERO;
+	      b[ 27] = ZERO;
+	      b[ 28] = ZERO;
+	      b[ 29] = ZERO;
+#ifdef UNIT
+	      b[ 30] = ONE;
+	      b[ 31] = ZERO;
+#else
+	      b[ 30] = *(a04 +  6);
+	      b[ 31] = *(a04 +  7);
+#endif
+
+	      a01 += 4 * lda;
+	      a02 += 4 * lda;
+	      a03 += 4 * lda;
+	      a04 += 4 * lda;
+
+	      b += 32;
+	    }
+
+	  X += 4;
+	  i --;
+	} while (i > 0);
+      }
+
+      i = (m & 3);
+      if (i) {
+
+	if (X < posY) {
+
+	  for (ii = 0; ii < i; ii++){
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+	    b[  2] = *(a02 +  0);
+	    b[  3] = *(a02 +  1);
+	    b[  4] = *(a03 +  0);
+	    b[  5] = *(a03 +  1);
+	    b[  6] = *(a04 +  0);
+	    b[  7] = *(a04 +  1);
+
+	    a01 += 2;
+	    a02 += 2;
+	    a03 += 2;
+	    a04 += 2;
+	    b += 8;
+	  }
+	} else
+	  if (X > posY) {
+	    /* a01 += i * lda;
+	    a02 += i * lda;
+	    a03 += i * lda;
+	    a04 += i * lda; */
+	    b += 8 * i;
+	  } else {
+#ifdef UNIT
+	    b[ 0] = ONE;
+	    b[ 1] = ZERO;
+#else
+	    b[ 0] = *(a01 +  0);
+	    b[ 1] = *(a01 +  1);
+#endif
+	    b[ 2] = *(a02 +  0);
+	    b[ 3] = *(a02 +  1);
+	    b[ 4] = *(a03 +  0);
+	    b[ 5] = *(a03 +  1);
+	    b[ 6] = *(a04 +  0);
+	    b[ 7] = *(a04 +  1);
+	    b += 8;
+
+	    if(i >= 2) {
+	      b[ 0] = ZERO;
+	      b[ 1] = ZERO;
+#ifdef UNIT
+	      b[ 2] = ONE;
+	      b[ 3] = ZERO;
+#else
+	      b[ 2] = *(a02 +  2);
+	      b[ 3] = *(a02 +  3);
+#endif
+	      b[ 4] = *(a03 +  2);
+	      b[ 5] = *(a03 +  3);
+	      b[ 6] = *(a04 +  2);
+	      b[ 7] = *(a04 +  3);
+	      b += 8;
+	    }
+
+	    if (i >= 3) {
+	      b[ 0] = ZERO;
+	      b[ 1] = ZERO;
+	      b[ 2] = ZERO;
+	      b[ 3] = ZERO;
+#ifdef UNIT
+	      b[ 4] = ONE;
+	      b[ 5] = ZERO;
+#else
+	      b[ 4] = *(a03 +  4);
+	      b[ 5] = *(a03 +  5);
+#endif
+	      b[ 6] = *(a04 +  4);
+	      b[ 7] = *(a04 +  5);
+	      b += 8;
+	    }
+	  }
+      }
+
+      posY += 4;
+  }
+
+  if (n & 2){
+      X = posX;
+
+      if (posX <= posY) {
+	a01 = a + posX * 2 + (posY + 0) * lda;
+	a02 = a + posX * 2 + (posY + 1) * lda;
+      } else {
+	a01 = a + posY * 2 + (posX + 0) * lda;
+	a02 = a + posY * 2 + (posX + 1) * lda;
+      }
+
+      i = (m >> 1);
+      if (i > 0) {
+	do {
+	  if (X < posY) {
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+	    b[  2] = *(a02 +  0);
+	    b[  3] = *(a02 +  1);
+	    b[  4] = *(a01 +  2);
+	    b[  5] = *(a01 +  3);
+	    b[  6] = *(a02 +  2);
+	    b[  7] = *(a02 +  3);
+
+	    a01 += 4;
+	    a02 += 4;
+	    b += 8;
+	  } else
+	    if (X > posY) {
+	      a01 += 2 * lda;
+	      a02 += 2 * lda;
+	      b += 8;
+	    } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = *(a02 +  0);
+	      b[  3] = *(a02 +  1);
+
+	      b[  4] = ZERO;
+	      b[  5] = ZERO;
+#ifdef UNIT
+	      b[  6] = ONE;
+	      b[  7] = ZERO;
+#else
+	      b[  6] = *(a02 +  2);
+	      b[  7] = *(a02 +  3);
+#endif
+
+	      a01 += 2 * lda;
+	      a02 += 2 * lda;
+	      b += 8;
+	    }
+
+	  X += 2;
+	  i --;
+	} while (i > 0);
+      }
+
+      if (m & 1) {
+
+	if (X < posY) {
+	  b[  0] = *(a01 +  0);
+	  b[  1] = *(a01 +  1);
+	  b[  2] = *(a02 +  0);
+	  b[  3] = *(a02 +  1);
+	  /* a01 += 2;
+	  a02 += 2; */
+	  b += 4;
+	} else
+	  if (X > posY) {
+	    /* a01 += 2 * lda;
+	    a02 += 2 * lda; */
+	    b += 4;
+	  } else {
+#ifdef UNIT
+	    b[  0] = ONE;
+	    b[  1] = ZERO;
+	    b[  2] = *(a02 +  0);
+	    b[  3] = *(a02 +  1);
+#else
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+	    b[  2] = *(a02 +  0);
+	    b[  3] = *(a02 +  1);
+#endif
+	    b += 2;
+	  }
+      }
+      posY += 2;
+  }
+
+  if (n & 1){
+      X = posX;
+
+      if (posX <= posY) {
+	a01 = a + posX * 2 + (posY + 0) * lda;
+      } else {
+	a01 = a + posY * 2 + (posX + 0) * lda;
+      }
+
+      i = m;
+      if (m > 0) {
+	do {
+	  if (X < posY) {
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+	    a01 += 2;
+	    b += 2;
+	  } else
+	    if (X > posY) {
+	      a01 += lda;
+	      b += 2;
+	    } else {
+#ifdef UNIT
+	      b[ 0] = ONE;
+	      b[ 1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      a01 += lda;
+	      b += 2;
+	    }
+
+	  X += 1;
+	  i --;
+	} while (i > 0);
+      }
+  }
+
+  return 0;
+}
diff --git a/kernel/generic/ztrmm_utcopy_16.c b/kernel/generic/ztrmm_utcopy_16.c
new file mode 100644
index 000000000..5aba3727a
--- /dev/null
+++ b/kernel/generic/ztrmm_utcopy_16.c
@@ -0,0 +1,2318 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+
+  BLASLONG i, js;
+  BLASLONG X, ii;
+
+  FLOAT *a01, *a02, *a03, *a04, *a05, *a06, *a07, *a08;
+  FLOAT *a09, *a10, *a11, *a12, *a13, *a14, *a15, *a16;
+
+  lda += lda;
+
+  js = (n >> 4);
+
+  if (js > 0){
+    do {
+      X = posX;
+
+      if (posX <= posY) {
+	a01 = a + posX * 2 + (posY + 0) * lda;
+	a02 = a + posX * 2 + (posY + 1) * lda;
+	a03 = a + posX * 2 + (posY + 2) * lda;
+	a04 = a + posX * 2 + (posY + 3) * lda;
+	a05 = a + posX * 2 + (posY + 4) * lda;
+	a06 = a + posX * 2 + (posY + 5) * lda;
+	a07 = a + posX * 2 + (posY + 6) * lda;
+	a08 = a + posX * 2 + (posY + 7) * lda;
+	a09 = a + posX * 2 + (posY + 8) * lda;
+	a10 = a + posX * 2 + (posY + 9) * lda;
+	a11 = a + posX * 2 + (posY + 10) * lda;
+	a12 = a + posX * 2 + (posY + 11) * lda;
+	a13 = a + posX * 2 + (posY + 12) * lda;
+	a14 = a + posX * 2 + (posY + 13) * lda;
+	a15 = a + posX * 2 + (posY + 14) * lda;
+	a16 = a + posX * 2 + (posY + 15) * lda;
+      } else {
+	a01 = a + posY * 2 + (posX + 0) * lda;
+	a02 = a + posY * 2 + (posX + 1) * lda;
+	a03 = a + posY * 2 + (posX + 2) * lda;
+	a04 = a + posY * 2 + (posX + 3) * lda;
+	a05 = a + posY * 2 + (posX + 4) * lda;
+	a06 = a + posY * 2 + (posX + 5) * lda;
+	a07 = a + posY * 2 + (posX + 6) * lda;
+	a08 = a + posY * 2 + (posX + 7) * lda;
+	a09 = a + posY * 2 + (posX + 8) * lda;
+	a10 = a + posY * 2 + (posX + 9) * lda;
+	a11 = a + posY * 2 + (posX + 10) * lda;
+	a12 = a + posY * 2 + (posX + 11) * lda;
+	a13 = a + posY * 2 + (posX + 12) * lda;
+	a14 = a + posY * 2 + (posX + 13) * lda;
+	a15 = a + posY * 2 + (posX + 14) * lda;
+	a16 = a + posY * 2 + (posX + 15) * lda;
+      }
+
+      i = (m >> 4);
+      if (i > 0) {
+	do {
+	  if (X < posY) {
+		a01 += 32;
+	    a02 += 32;
+	    a03 += 32;
+	    a04 += 32;
+	    a05 += 32;
+	    a06 += 32;
+	    a07 += 32;
+	    a08 += 32;
+	    a09 += 32;
+	    a10 += 32;
+	    a11 += 32;
+	    a12 += 32;
+	    a13 += 32;
+	    a14 += 32;
+	    a15 += 32;
+	    a16 += 32;
+
+	    b += 512;
+	  } else
+	    if (X > posY) {
+			for (ii = 0; ii < 16; ii++){
+
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+	      b[  2] = *(a01 +  2);
+	      b[  3] = *(a01 +  3);
+	      b[  4] = *(a01 +  4);
+	      b[  5] = *(a01 +  5);
+	      b[  6] = *(a01 +  6);
+	      b[  7] = *(a01 +  7);
+
+	      b[  8] = *(a01 +  8);
+	      b[  9] = *(a01 +  9);
+	      b[ 10] = *(a01 + 10);
+	      b[ 11] = *(a01 + 11);
+	      b[ 12] = *(a01 + 12);
+	      b[ 13] = *(a01 + 13);
+	      b[ 14] = *(a01 + 14);
+	      b[ 15] = *(a01 + 15);
+
+		  b[ 16] = *(a01 + 16);
+	      b[ 17] = *(a01 + 17);
+	      b[ 18] = *(a01 + 18);
+	      b[ 19] = *(a01 + 19);
+	      b[ 20] = *(a01 + 20);
+	      b[ 21] = *(a01 + 21);
+	      b[ 22] = *(a01 + 22);
+	      b[ 23] = *(a01 + 23);
+
+	      b[ 24] = *(a01 + 24);
+	      b[ 25] = *(a01 + 25);
+	      b[ 26] = *(a01 + 26);
+	      b[ 27] = *(a01 + 27);
+	      b[ 28] = *(a01 + 28);
+	      b[ 29] = *(a01 + 29);
+	      b[ 30] = *(a01 + 30);
+	      b[ 31] = *(a01 + 31);
+
+	      a01 += lda;
+	      b += 32;
+	    }
+
+		a02 += 16 * lda;
+		a03 += 16 * lda;
+		a04 += 16 * lda;
+		a05 += 16 * lda;
+		a06 += 16 * lda;
+		a07 += 16 * lda;
+		a08 += 16 * lda;
+		a09 += 16 * lda;
+		a10 += 16 * lda;
+		a11 += 16 * lda;
+		a12 += 16 * lda;
+		a13 += 16 * lda;
+		a14 += 16 * lda;
+		a15 += 16 * lda;
+		a16 += 16 * lda;
+
+	    } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = ZERO;
+	      b[  3] = ZERO;
+	      b[  4] = ZERO;
+	      b[  5] = ZERO;
+	      b[  6] = ZERO;
+	      b[  7] = ZERO;
+	      b[  8] = ZERO;
+	      b[  9] = ZERO;
+	      b[ 10] = ZERO;
+	      b[ 11] = ZERO;
+	      b[ 12] = ZERO;
+	      b[ 13] = ZERO;
+	      b[ 14] = ZERO;
+	      b[ 15] = ZERO;
+		  b[ 16] = ZERO;
+	      b[ 17] = ZERO;
+	      b[ 18] = ZERO;
+	      b[ 19] = ZERO;
+	      b[ 20] = ZERO;
+	      b[ 21] = ZERO;
+	      b[ 22] = ZERO;
+	      b[ 23] = ZERO;
+	      b[ 24] = ZERO;
+	      b[ 25] = ZERO;
+	      b[ 26] = ZERO;
+	      b[ 27] = ZERO;
+	      b[ 28] = ZERO;
+	      b[ 29] = ZERO;
+		  b[ 30] = ZERO;
+	      b[ 31] = ZERO;
+
+	      b[ 32] = *(a02 +  0);
+	      b[ 33] = *(a02 +  1);
+#ifdef UNIT
+	      b[ 34] = ONE;
+	      b[ 35] = ZERO;
+#else
+	      b[ 34] = *(a02 +  2);
+	      b[ 35] = *(a02 +  3);
+#endif
+	      b[ 36] = ZERO;
+	      b[ 37] = ZERO;
+	      b[ 38] = ZERO;
+	      b[ 39] = ZERO;
+	      b[ 40] = ZERO;
+	      b[ 41] = ZERO;
+	      b[ 42] = ZERO;
+	      b[ 43] = ZERO;
+	      b[ 44] = ZERO;
+	      b[ 45] = ZERO;
+	      b[ 46] = ZERO;
+	      b[ 47] = ZERO;
+	      b[ 48] = ZERO;
+	      b[ 49] = ZERO;
+	      b[ 50] = ZERO;
+	      b[ 51] = ZERO;
+	      b[ 52] = ZERO;
+	      b[ 53] = ZERO;
+	      b[ 54] = ZERO;
+	      b[ 55] = ZERO;
+	      b[ 56] = ZERO;
+	      b[ 57] = ZERO;
+	      b[ 58] = ZERO;
+	      b[ 59] = ZERO;
+	      b[ 60] = ZERO;
+	      b[ 61] = ZERO;
+	      b[ 62] = ZERO;
+	      b[ 63] = ZERO;
+
+	      b[ 64] = *(a03 +  0);
+	      b[ 65] = *(a03 +  1);
+	      b[ 66] = *(a03 +  2);
+	      b[ 67] = *(a03 +  3);
+#ifdef UNIT
+	      b[ 68] = ONE;
+	      b[ 69] = ZERO;
+#else
+	      b[ 68] = *(a03 +  4);
+	      b[ 69] = *(a03 +  5);
+#endif
+	      b[ 70] = ZERO;
+	      b[ 71] = ZERO;
+	      b[ 72] = ZERO;
+	      b[ 73] = ZERO;
+	      b[ 74] = ZERO;
+	      b[ 75] = ZERO;
+	      b[ 76] = ZERO;
+	      b[ 77] = ZERO;
+	      b[ 78] = ZERO;
+	      b[ 79] = ZERO;
+	      b[ 80] = ZERO;
+	      b[ 81] = ZERO;
+	      b[ 82] = ZERO;
+		  b[ 83] = ZERO;
+	      b[ 84] = ZERO;
+	      b[ 85] = ZERO;
+	      b[ 86] = ZERO;
+	      b[ 87] = ZERO;
+	      b[ 88] = ZERO;
+	      b[ 89] = ZERO;
+	      b[ 90] = ZERO;
+	      b[ 91] = ZERO;
+	      b[ 92] = ZERO;
+	      b[ 93] = ZERO;
+	      b[ 94] = ZERO;
+	      b[ 95] = ZERO;
+
+	      b[ 96] = *(a04 +  0);
+	      b[ 97] = *(a04 +  1);
+	      b[ 98] = *(a04 +  2);
+	      b[ 99] = *(a04 +  3);
+	      b[100] = *(a04 +  4);
+	      b[101] = *(a04 +  5);
+#ifdef UNIT
+	      b[102] = ONE;
+	      b[103] = ZERO;
+#else
+	      b[102] = *(a04 +  6);
+	      b[103] = *(a04 +  7);
+#endif
+	      b[104] = ZERO;
+	      b[105] = ZERO;
+	      b[106] = ZERO;
+	      b[107] = ZERO;
+	      b[108] = ZERO;
+	      b[109] = ZERO;
+	      b[110] = ZERO;
+	      b[111] = ZERO;
+	      b[112] = ZERO;
+	      b[113] = ZERO;
+	      b[114] = ZERO;
+	      b[115] = ZERO;
+		  b[116] = ZERO;
+	      b[117] = ZERO;
+	      b[118] = ZERO;
+	      b[119] = ZERO;
+	      b[120] = ZERO;
+	      b[121] = ZERO;
+	      b[122] = ZERO;
+	      b[123] = ZERO;
+	      b[124] = ZERO;
+	      b[125] = ZERO;
+	      b[126] = ZERO;
+	      b[127] = ZERO;
+
+	      b[128] = *(a05 +  0);
+	      b[129] = *(a05 +  1);
+	      b[130] = *(a05 +  2);
+	      b[131] = *(a05 +  3);
+	      b[132] = *(a05 +  4);
+	      b[133] = *(a05 +  5);
+	      b[134] = *(a05 +  6);
+	      b[135] = *(a05 +  7);
+#ifdef UNIT
+	      b[136] = ONE;
+	      b[137] = ZERO;
+#else
+	      b[136] = *(a05 +  8);
+	      b[137] = *(a05 +  9);
+#endif
+	      b[138] = ZERO;
+	      b[139] = ZERO;
+	      b[140] = ZERO;
+	      b[141] = ZERO;
+	      b[142] = ZERO;
+	      b[143] = ZERO;
+	      b[144] = ZERO;
+	      b[145] = ZERO;
+	      b[146] = ZERO;
+	      b[147] = ZERO;
+	      b[148] = ZERO;
+		  b[149] = ZERO;
+	      b[150] = ZERO;
+	      b[151] = ZERO;
+	      b[152] = ZERO;
+	      b[153] = ZERO;
+	      b[154] = ZERO;
+	      b[155] = ZERO;
+	      b[156] = ZERO;
+	      b[157] = ZERO;
+	      b[158] = ZERO;
+	      b[159] = ZERO;
+
+	      b[160] = *(a06 +  0);
+	      b[161] = *(a06 +  1);
+	      b[162] = *(a06 +  2);
+	      b[163] = *(a06 +  3);
+	      b[164] = *(a06 +  4);
+	      b[165] = *(a06 +  5);
+	      b[166] = *(a06 +  6);
+	      b[167] = *(a06 +  7);
+	      b[168] = *(a06 +  8);
+	      b[169] = *(a06 +  9);
+#ifdef UNIT
+	      b[170] = ONE;
+	      b[171] = ZERO;
+#else
+	      b[170] = *(a06 + 10);
+	      b[171] = *(a06 + 11);
+#endif
+	      b[172] = ZERO;
+	      b[173] = ZERO;
+	      b[174] = ZERO;
+	      b[175] = ZERO;
+	      b[176] = ZERO;
+	      b[177] = ZERO;
+	      b[178] = ZERO;
+	      b[179] = ZERO;
+	      b[180] = ZERO;
+	      b[181] = ZERO;
+		  b[182] = ZERO;
+	      b[183] = ZERO;
+	      b[184] = ZERO;
+	      b[185] = ZERO;
+	      b[186] = ZERO;
+	      b[187] = ZERO;
+	      b[188] = ZERO;
+	      b[189] = ZERO;
+	      b[190] = ZERO;
+	      b[191] = ZERO;
+
+	      b[192] = *(a07 + 0);
+	      b[193] = *(a07 + 1);
+	      b[194] = *(a07 + 2);
+	      b[195] = *(a07 + 3);
+	      b[196] = *(a07 + 4);
+	      b[197] = *(a07 + 5);
+	      b[198] = *(a07 + 6);
+	      b[199] = *(a07 + 7);
+	      b[200] = *(a07 + 8);
+	      b[201] = *(a07 + 9);
+	      b[202] = *(a07 + 10);
+	      b[203] = *(a07 + 11);
+#ifdef UNIT
+	      b[204] = ONE;
+	      b[205] = ZERO;
+#else
+	      b[204] = *(a07 + 12);
+	      b[205] = *(a07 + 13);
+#endif
+	      b[206] = ZERO;
+	      b[207] = ZERO;
+	      b[208] = ZERO;
+	      b[209] = ZERO;
+	      b[210] = ZERO;
+	      b[211] = ZERO;
+	      b[212] = ZERO;
+	      b[213] = ZERO;
+	      b[214] = ZERO;
+		  b[215] = ZERO;
+	      b[216] = ZERO;
+	      b[217] = ZERO;
+	      b[218] = ZERO;
+	      b[219] = ZERO;
+	      b[220] = ZERO;
+	      b[221] = ZERO;
+	      b[222] = ZERO;
+	      b[223] = ZERO;
+
+	      b[224] = *(a08 + 0);
+	      b[225] = *(a08 + 1);
+	      b[226] = *(a08 + 2);
+	      b[227] = *(a08 + 3);
+	      b[228] = *(a08 + 4);
+	      b[229] = *(a08 + 5);
+	      b[230] = *(a08 + 6);
+	      b[231] = *(a08 + 7);
+	      b[232] = *(a08 + 8);
+	      b[233] = *(a08 + 9);
+	      b[234] = *(a08 + 10);
+	      b[235] = *(a08 + 11);
+	      b[236] = *(a08 + 12);
+	      b[237] = *(a08 + 13);
+#ifdef UNIT
+	      b[238] = ONE;
+	      b[239] = ZERO;
+#else
+	      b[238] = *(a08 + 14);
+	      b[239] = *(a08 + 15);
+#endif
+		  b[240] = ZERO;
+	      b[241] = ZERO;
+	      b[242] = ZERO;
+	      b[243] = ZERO;
+	      b[244] = ZERO;
+	      b[245] = ZERO;
+	      b[246] = ZERO;
+	      b[247] = ZERO;
+		  b[248] = ZERO;
+	      b[249] = ZERO;
+	      b[250] = ZERO;
+	      b[251] = ZERO;
+	      b[252] = ZERO;
+	      b[253] = ZERO;
+	      b[254] = ZERO;
+	      b[255] = ZERO;
+
+	      b[256] = *(a09 + 0);
+	      b[257] = *(a09 + 1);
+	      b[258] = *(a09 + 2);
+	      b[259] = *(a09 + 3);
+	      b[260] = *(a09 + 4);
+	      b[261] = *(a09 + 5);
+	      b[262] = *(a09 + 6);
+	      b[263] = *(a09 + 7);
+	      b[264] = *(a09 + 8);
+	      b[265] = *(a09 + 9);
+	      b[266] = *(a09 + 10);
+	      b[267] = *(a09 + 11);
+	      b[268] = *(a09 + 12);
+	      b[269] = *(a09 + 13);
+	      b[270] = *(a09 + 14);
+	      b[271] = *(a09 + 15);
+#ifdef UNIT
+	      b[272] = ONE;
+		  b[273] = ZERO;
+#else
+	      b[272] = *(a09 + 16);
+		  b[273] = *(a09 + 17);
+#endif
+		  b[274] = ZERO;
+	      b[275] = ZERO;
+	      b[276] = ZERO;
+	      b[277] = ZERO;
+	      b[278] = ZERO;
+	      b[279] = ZERO;
+	      b[280] = ZERO;
+		  b[281] = ZERO;
+	      b[282] = ZERO;
+	      b[283] = ZERO;
+	      b[284] = ZERO;
+	      b[285] = ZERO;
+	      b[286] = ZERO;
+	      b[287] = ZERO;
+
+		  b[288] = *(a10 + 0);
+		  b[289] = *(a10 + 1);
+	      b[290] = *(a10 + 2);
+	      b[291] = *(a10 + 3);
+	      b[292] = *(a10 + 4);
+	      b[293] = *(a10 + 5);
+	      b[294] = *(a10 + 6);
+	      b[295] = *(a10 + 7);
+	      b[296] = *(a10 + 8);
+	      b[297] = *(a10 + 9);
+	      b[298] = *(a10 + 10);
+	      b[299] = *(a10 + 11);
+	      b[300] = *(a10 + 12);
+	      b[301] = *(a10 + 13);
+	      b[302] = *(a10 + 14);
+	      b[303] = *(a10 + 15);
+	      b[304] = *(a10 + 16);
+	      b[305] = *(a10 + 17);
+#ifdef UNIT
+	      b[306] = ONE;
+		  b[307] = ZERO;
+#else
+	      b[306] = *(a10 + 18);
+		  b[307] = *(a10 + 19);
+#endif
+		  b[308] = ZERO;
+	      b[309] = ZERO;
+	      b[310] = ZERO;
+	      b[311] = ZERO;
+	      b[312] = ZERO;
+	      b[313] = ZERO;
+		  b[314] = ZERO;
+	      b[315] = ZERO;
+	      b[316] = ZERO;
+	      b[317] = ZERO;
+	      b[318] = ZERO;
+	      b[319] = ZERO;
+
+		  b[320] = *(a11 + 0);
+		  b[321] = *(a11 + 1);
+	      b[322] = *(a11 + 2);
+	      b[323] = *(a11 + 3);
+	      b[324] = *(a11 + 4);
+	      b[325] = *(a11 + 5);
+	      b[326] = *(a11 + 6);
+	      b[327] = *(a11 + 7);
+	      b[328] = *(a11 + 8);
+	      b[329] = *(a11 + 9);
+	      b[330] = *(a11 + 10);
+	      b[331] = *(a11 + 11);
+	      b[332] = *(a11 + 12);
+	      b[333] = *(a11 + 13);
+	      b[334] = *(a11 + 14);
+	      b[335] = *(a11 + 15);
+	      b[336] = *(a11 + 16);
+	      b[337] = *(a11 + 17);
+	      b[338] = *(a11 + 18);
+	      b[339] = *(a11 + 19);
+#ifdef UNIT
+	      b[340] = ONE;
+	      b[341] = ZERO;
+#else
+	      b[340] = *(a11 + 20);
+	      b[341] = *(a11 + 21);
+#endif
+		  b[342] = ZERO;
+	      b[343] = ZERO;
+	      b[344] = ZERO;
+	      b[345] = ZERO;
+	      b[346] = ZERO;
+		  b[347] = ZERO;
+	      b[348] = ZERO;
+	      b[349] = ZERO;
+	      b[350] = ZERO;
+	      b[351] = ZERO;
+
+		  b[352] = *(a12 + 0);
+		  b[353] = *(a12 + 1);
+	      b[354] = *(a12 + 2);
+	      b[355] = *(a12 + 3);
+	      b[356] = *(a12 + 4);
+	      b[357] = *(a12 + 5);
+	      b[358] = *(a12 + 6);
+	      b[359] = *(a12 + 7);
+	      b[360] = *(a12 + 8);
+	      b[361] = *(a12 + 9);
+	      b[362] = *(a12 + 10);
+	      b[363] = *(a12 + 11);
+	      b[364] = *(a12 + 12);
+	      b[365] = *(a12 + 13);
+	      b[366] = *(a12 + 14);
+	      b[367] = *(a12 + 15);
+	      b[368] = *(a12 + 16);
+	      b[369] = *(a12 + 17);
+	      b[370] = *(a12 + 18);
+	      b[371] = *(a12 + 19);
+	      b[372] = *(a12 + 20);
+	      b[373] = *(a12 + 21);
+#ifdef UNIT
+	      b[374] = ONE;
+	      b[375] = ZERO;
+#else
+	      b[374] = *(a12 + 22);
+	      b[375] = *(a12 + 23);
+#endif
+		  b[376] = ZERO;
+	      b[377] = ZERO;
+	      b[378] = ZERO;
+	      b[379] = ZERO;
+		  b[380] = ZERO;
+	      b[381] = ZERO;
+	      b[382] = ZERO;
+	      b[383] = ZERO;
+
+		  b[384] = *(a13 + 0);
+		  b[385] = *(a13 + 1);
+	      b[386] = *(a13 + 2);
+	      b[387] = *(a13 + 3);
+	      b[388] = *(a13 + 4);
+	      b[389] = *(a13 + 5);
+	      b[390] = *(a13 + 6);
+	      b[391] = *(a13 + 7);
+	      b[392] = *(a13 + 8);
+	      b[393] = *(a13 + 9);
+	      b[394] = *(a13 + 10);
+	      b[395] = *(a13 + 11);
+	      b[396] = *(a13 + 12);
+	      b[397] = *(a13 + 13);
+	      b[398] = *(a13 + 14);
+	      b[399] = *(a13 + 15);
+	      b[400] = *(a13 + 16);
+	      b[401] = *(a13 + 17);
+	      b[402] = *(a13 + 18);
+	      b[403] = *(a13 + 19);
+	      b[404] = *(a13 + 20);
+	      b[405] = *(a13 + 21);
+	      b[406] = *(a13 + 22);
+	      b[407] = *(a13 + 23);
+#ifdef UNIT
+	      b[408] = ONE;
+	      b[409] = ZERO;
+#else
+	      b[408] = *(a13 + 24);
+	      b[409] = *(a13 + 25);
+#endif
+		  b[410] = ZERO;
+	      b[411] = ZERO;
+	      b[412] = ZERO;
+		  b[413] = ZERO;
+	      b[414] = ZERO;
+	      b[415] = ZERO;
+
+		  b[416] = *(a14 + 0);
+		  b[417] = *(a14 + 1);
+	      b[418] = *(a14 + 2);
+	      b[419] = *(a14 + 3);
+	      b[420] = *(a14 + 4);
+	      b[421] = *(a14 + 5);
+	      b[422] = *(a14 + 6);
+	      b[423] = *(a14 + 7);
+	      b[424] = *(a14 + 8);
+	      b[425] = *(a14 + 9);
+	      b[426] = *(a14 + 10);
+	      b[427] = *(a14 + 11);
+	      b[428] = *(a14 + 12);
+	      b[429] = *(a14 + 13);
+	      b[430] = *(a14 + 14);
+	      b[431] = *(a14 + 15);
+	      b[432] = *(a14 + 16);
+	      b[433] = *(a14 + 17);
+	      b[434] = *(a14 + 18);
+	      b[435] = *(a14 + 19);
+	      b[436] = *(a14 + 20);
+	      b[437] = *(a14 + 21);
+	      b[438] = *(a14 + 22);
+	      b[439] = *(a14 + 23);
+	      b[440] = *(a14 + 24);
+	      b[441] = *(a14 + 25);
+#ifdef UNIT
+	      b[442] = ONE;
+	      b[443] = ZERO;
+#else
+	      b[442] = *(a14 + 26);
+	      b[443] = *(a14 + 27);
+#endif
+		  b[444] = ZERO;
+	      b[445] = ZERO;
+		  b[446] = ZERO;
+	      b[447] = ZERO;
+
+		  b[448] = *(a15 + 0);
+		  b[449] = *(a15 + 1);
+	      b[450] = *(a15 + 2);
+	      b[451] = *(a15 + 3);
+	      b[452] = *(a15 + 4);
+	      b[453] = *(a15 + 5);
+	      b[454] = *(a15 + 6);
+	      b[455] = *(a15 + 7);
+	      b[456] = *(a15 + 8);
+	      b[457] = *(a15 + 9);
+	      b[458] = *(a15 + 10);
+	      b[459] = *(a15 + 11);
+	      b[460] = *(a15 + 12);
+	      b[461] = *(a15 + 13);
+	      b[462] = *(a15 + 14);
+	      b[463] = *(a15 + 15);
+	      b[464] = *(a15 + 16);
+	      b[465] = *(a15 + 17);
+	      b[466] = *(a15 + 18);
+	      b[467] = *(a15 + 19);
+	      b[468] = *(a15 + 20);
+	      b[469] = *(a15 + 21);
+	      b[470] = *(a15 + 22);
+	      b[471] = *(a15 + 23);
+	      b[472] = *(a15 + 24);
+	      b[473] = *(a15 + 25);
+	      b[474] = *(a15 + 26);
+	      b[475] = *(a15 + 27);
+#ifdef UNIT
+	      b[476] = ONE;
+	      b[477] = ZERO;
+#else
+	      b[476] = *(a15 + 28);
+	      b[477] = *(a15 + 29);
+#endif
+		  b[478] = ZERO;
+		  b[479] = ZERO;
+
+		  b[480] = *(a16 + 0);
+		  b[481] = *(a16 + 1);
+	      b[482] = *(a16 + 2);
+	      b[483] = *(a16 + 3);
+	      b[484] = *(a16 + 4);
+	      b[485] = *(a16 + 5);
+	      b[486] = *(a16 + 6);
+	      b[487] = *(a16 + 7);
+	      b[488] = *(a16 + 8);
+	      b[489] = *(a16 + 9);
+	      b[490] = *(a16 + 10);
+	      b[491] = *(a16 + 11);
+	      b[492] = *(a16 + 12);
+	      b[493] = *(a16 + 13);
+	      b[494] = *(a16 + 14);
+	      b[495] = *(a16 + 15);
+	      b[496] = *(a16 + 16);
+	      b[497] = *(a16 + 17);
+	      b[498] = *(a16 + 18);
+	      b[499] = *(a16 + 19);
+	      b[500] = *(a16 + 20);
+	      b[501] = *(a16 + 21);
+	      b[502] = *(a16 + 22);
+	      b[503] = *(a16 + 23);
+	      b[504] = *(a16 + 24);
+	      b[505] = *(a16 + 25);
+	      b[506] = *(a16 + 26);
+	      b[507] = *(a16 + 27);
+	      b[508] = *(a16 + 28);
+	      b[509] = *(a16 + 29);
+#ifdef UNIT
+	      b[510] = ONE;
+	      b[511] = ZERO;
+#else
+	      b[510] = *(a16 + 30);
+	      b[511] = *(a16 + 31);
+#endif
+
+	      a01 += 16 * lda;
+	      a02 += 16 * lda;
+	      a03 += 16 * lda;
+	      a04 += 16 * lda;
+	      a05 += 16 * lda;
+	      a06 += 16 * lda;
+	      a07 += 16 * lda;
+	      a08 += 16 * lda;
+	      a09 += 16 * lda;
+	      a10 += 16 * lda;
+	      a11 += 16 * lda;
+	      a12 += 16 * lda;
+	      a13 += 16 * lda;
+	      a14 += 16 * lda;
+	      a15 += 16 * lda;
+	      a16 += 16 * lda;
+	      b += 512;
+	    }
+
+	  X += 16;
+	  i --;
+	} while (i > 0);
+      }
+
+      i = (m & 15);
+      if (i) {
+
+	if (X < posY) {
+	//   a01 += 2 * i;
+	//   a02 += 2 * i;
+	//   a03 += 2 * i;
+	//   a04 += 2 * i;
+	//   a05 += 2 * i;
+	//   a06 += 2 * i;
+	//   a07 += 2 * i;
+	//   a08 += 2 * i;
+	//   a09 += 2 * i;
+	//   a10 += 2 * i;
+	//   a11 += 2 * i;
+	//   a12 += 2 * i;
+	//   a13 += 2 * i;
+	//   a14 += 2 * i;
+	//   a15 += 2 * i;
+	//   a16 += 2 * i;
+	  b += 32 * i;
+
+	} else
+	  if (X > posY) {
+	    for (ii = 0; ii < i; ii++){
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+		b[  2] = *(a01 +  2);
+		b[  3] = *(a01 +  3);
+		b[  4] = *(a01 +  4);
+		b[  5] = *(a01 +  5);
+		b[  6] = *(a01 +  6);
+		b[  7] = *(a01 +  7);
+		b[  8] = *(a01 +  8);
+		b[  9] = *(a01 +  9);
+		b[ 10] = *(a01 + 10);
+		b[ 11] = *(a01 + 11);
+		b[ 12] = *(a01 + 12);
+		b[ 13] = *(a01 + 13);
+		b[ 14] = *(a01 + 14);
+		b[ 15] = *(a01 + 15);
+
+		b[ 16] = *(a01 + 16);
+		b[ 17] = *(a01 + 17);
+		b[ 18] = *(a01 + 18);
+		b[ 19] = *(a01 + 19);
+		b[ 20] = *(a01 + 20);
+		b[ 21] = *(a01 + 21);
+		b[ 22] = *(a01 + 22);
+		b[ 23] = *(a01 + 23);
+		b[ 24] = *(a01 + 24);
+		b[ 25] = *(a01 + 25);
+		b[ 26] = *(a01 + 26);
+		b[ 27] = *(a01 + 27);
+		b[ 28] = *(a01 + 28);
+		b[ 29] = *(a01 + 29);
+		b[ 30] = *(a01 + 30);
+		b[ 31] = *(a01 + 31);
+
+	    a01 += lda;
+	    a02 += lda;
+	    a03 += lda;
+	    a04 += lda;
+	    a05 += lda;
+	    a06 += lda;
+	    a07 += lda;
+	    a08 += lda;
+		a09 += lda;
+	    a10 += lda;
+	    a11 += lda;
+	    a12 += lda;
+	    a13 += lda;
+	    a14 += lda;
+	    a15 += lda;
+	    a16 += lda;
+	    b += 32;
+	  }
+	  } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = ZERO;
+	      b[  3] = ZERO;
+	      b[  4] = ZERO;
+	      b[  5] = ZERO;
+	      b[  6] = ZERO;
+	      b[  7] = ZERO;
+	      b[  8] = ZERO;
+	      b[  9] = ZERO;
+	      b[ 10] = ZERO;
+	      b[ 11] = ZERO;
+	      b[ 12] = ZERO;
+	      b[ 13] = ZERO;
+	      b[ 14] = ZERO;
+	      b[ 15] = ZERO;
+		  b[ 16] = ZERO;
+	      b[ 17] = ZERO;
+	      b[ 18] = ZERO;
+	      b[ 19] = ZERO;
+	      b[ 20] = ZERO;
+	      b[ 21] = ZERO;
+	      b[ 22] = ZERO;
+	      b[ 23] = ZERO;
+	      b[ 24] = ZERO;
+	      b[ 25] = ZERO;
+	      b[ 26] = ZERO;
+	      b[ 27] = ZERO;
+	      b[ 28] = ZERO;
+	      b[ 29] = ZERO;
+		  b[ 30] = ZERO;
+	      b[ 31] = ZERO;
+	      b += 32;
+
+	      if (i >= 2) {
+		b[ 0] = *(a02 +  0);
+		b[ 1] = *(a02 +  1);
+#ifdef UNIT
+		b[ 2] = ONE;
+		b[ 3] = ZERO;
+#else
+		b[ 2] = *(a02 +  2);
+		b[ 3] = *(a02 +  3);
+#endif
+		b[  4] = ZERO;
+		b[  5] = ZERO;
+		b[  6] = ZERO;
+		b[  7] = ZERO;
+		b[  8] = ZERO;
+		b[  9] = ZERO;
+		b[ 10] = ZERO;
+		b[ 11] = ZERO;
+		b[ 12] = ZERO;
+		b[ 13] = ZERO;
+		b[ 14] = ZERO;
+		b[ 15] = ZERO;
+		b[ 16] = ZERO;
+		b[ 17] = ZERO;
+		b[ 18] = ZERO;
+		b[ 19] = ZERO;
+		b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+		b += 32;
+	      }
+
+	      if (i >= 3) {
+		b[ 0] = *(a03 +  0);
+		b[ 1] = *(a03 +  1);
+		b[ 2] = *(a03 +  2);
+		b[ 3] = *(a03 +  3);
+#ifdef UNIT
+		b[ 4] = ONE;
+		b[ 5] = ZERO;
+#else
+		b[ 4] = *(a03 +  4);
+		b[ 5] = *(a03 +  5);
+#endif
+		b[  6] = ZERO;
+		b[  7] = ZERO;
+		b[  8] = ZERO;
+		b[  9] = ZERO;
+		b[ 10] = ZERO;
+		b[ 11] = ZERO;
+		b[ 12] = ZERO;
+		b[ 13] = ZERO;
+		b[ 14] = ZERO;
+		b[ 15] = ZERO;
+		b[ 16] = ZERO;
+		b[ 17] = ZERO;
+		b[ 18] = ZERO;
+		b[ 19] = ZERO;
+		b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+		b += 32;
+	      }
+
+	      if (i >= 4) {
+		b[ 0] = *(a04 +  0);
+		b[ 1] = *(a04 +  1);
+		b[ 2] = *(a04 +  2);
+		b[ 3] = *(a04 +  3);
+		b[ 4] = *(a04 +  4);
+		b[ 5] = *(a04 +  5);
+#ifdef UNIT
+		b[ 6] = ONE;
+		b[ 7] = ZERO;
+#else
+		b[ 6] = *(a04 +  6);
+		b[ 7] = *(a04 +  7);
+#endif
+		b[  8] = ZERO;
+		b[  9] = ZERO;
+		b[ 10] = ZERO;
+		b[ 11] = ZERO;
+		b[ 12] = ZERO;
+		b[ 13] = ZERO;
+		b[ 14] = ZERO;
+		b[ 15] = ZERO;
+		b[ 16] = ZERO;
+		b[ 17] = ZERO;
+		b[ 18] = ZERO;
+		b[ 19] = ZERO;
+		b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+		b += 32;
+	      }
+
+	      if (i >= 5) {
+		b[ 0] = *(a05 +  0);
+		b[ 1] = *(a05 +  1);
+		b[ 2] = *(a05 +  2);
+		b[ 3] = *(a05 +  3);
+		b[ 4] = *(a05 +  4);
+		b[ 5] = *(a05 +  5);
+		b[ 6] = *(a05 +  6);
+		b[ 7] = *(a05 +  7);
+#ifdef UNIT
+		b[ 8] = ONE;
+		b[ 9] = ZERO;
+#else
+		b[ 8] = *(a05 +  8);
+		b[ 9] = *(a05 +  9);
+#endif
+		b[ 10] = ZERO;
+		b[ 11] = ZERO;
+		b[ 12] = ZERO;
+		b[ 13] = ZERO;
+		b[ 14] = ZERO;
+		b[ 15] = ZERO;
+		b[ 16] = ZERO;
+		b[ 17] = ZERO;
+		b[ 18] = ZERO;
+		b[ 19] = ZERO;
+		b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+		b += 32;
+	      }
+
+	      if (i >= 6) {
+		b[ 0] = *(a06 + 0);
+		b[ 1] = *(a06 + 1);
+		b[ 2] = *(a06 + 2);
+		b[ 3] = *(a06 + 3);
+		b[ 4] = *(a06 + 4);
+		b[ 5] = *(a06 + 5);
+		b[ 6] = *(a06 + 6);
+		b[ 7] = *(a06 + 7);
+		b[ 8] = *(a06 + 8);
+		b[ 9] = *(a06 + 9);
+#ifdef UNIT
+		b[10] = ONE;
+		b[11] = ZERO;
+#else
+		b[10] = *(a06 + 10);
+		b[11] = *(a06 + 11);
+#endif
+		b[ 12] = ZERO;
+		b[ 13] = ZERO;
+		b[ 14] = ZERO;
+		b[ 15] = ZERO;
+		b[ 16] = ZERO;
+		b[ 17] = ZERO;
+		b[ 18] = ZERO;
+		b[ 19] = ZERO;
+		b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+		b += 32;
+	      }
+
+	      if (i >= 7) {
+		b[ 0] = *(a07 + 0);
+		b[ 1] = *(a07 + 1);
+		b[ 2] = *(a07 + 2);
+		b[ 3] = *(a07 + 3);
+		b[ 4] = *(a07 + 4);
+		b[ 5] = *(a07 + 5);
+		b[ 6] = *(a07 + 6);
+		b[ 7] = *(a07 + 7);
+		b[ 8] = *(a07 + 8);
+		b[ 9] = *(a07 + 9);
+		b[10] = *(a07 + 10);
+		b[11] = *(a07 + 11);
+#ifdef UNIT
+		b[12] = ONE;
+		b[13] = ZERO;
+#else
+		b[12] = *(a07 + 12);
+		b[13] = *(a07 + 13);
+#endif
+		b[ 14] = ZERO;
+		b[ 15] = ZERO;
+		b[ 16] = ZERO;
+		b[ 17] = ZERO;
+		b[ 18] = ZERO;
+		b[ 19] = ZERO;
+		b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+		b += 32;
+	    }
+
+		if (i >= 8) {
+	      b[  0] = *(a08 + 0);
+	      b[  1] = *(a08 + 1);
+	      b[  2] = *(a08 + 2);
+	      b[  3] = *(a08 + 3);
+	      b[  4] = *(a08 + 4);
+	      b[  5] = *(a08 + 5);
+	      b[  6] = *(a08 + 6);
+	      b[  7] = *(a08 + 7);
+	      b[  8] = *(a08 + 8);
+	      b[  9] = *(a08 + 9);
+	      b[ 10] = *(a08 + 10);
+	      b[ 11] = *(a08 + 11);
+	      b[ 12] = *(a08 + 12);
+	      b[ 13] = *(a08 + 13);
+#ifdef UNIT
+	      b[ 14] = ONE;
+	      b[ 15] = ZERO;
+#else
+	      b[ 14] = *(a08 +  14);
+	      b[ 15] = *(a08 +  15);
+#endif
+		b[ 16] = ZERO;
+		b[ 17] = ZERO;
+		b[ 18] = ZERO;
+		b[ 19] = ZERO;
+		b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+	      b += 32;
+	    }
+
+		if (i >= 9) {
+	      b[  0] = *(a09 + 0);
+	      b[  1] = *(a09 + 1);
+	      b[  2] = *(a09 + 2);
+	      b[  3] = *(a09 + 3);
+	      b[  4] = *(a09 + 4);
+	      b[  5] = *(a09 + 5);
+	      b[  6] = *(a09 + 6);
+	      b[  7] = *(a09 + 7);
+	      b[  8] = *(a09 + 8);
+	      b[  9] = *(a09 + 9);
+	      b[ 10] = *(a09 + 10);
+	      b[ 11] = *(a09 + 11);
+	      b[ 12] = *(a09 + 12);
+	      b[ 13] = *(a09 + 13);
+	      b[ 14] = *(a09 + 14);
+	      b[ 15] = *(a09 + 15);
+#ifdef UNIT
+	      b[ 16] = ONE;
+	      b[ 17] = ZERO;
+#else
+	      b[ 16] = *(a09 + 16);
+	      b[ 17] = *(a09 + 17);
+#endif
+	    b[ 18] = ZERO;
+		b[ 19] = ZERO;
+		b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+	      b += 32;
+	    }
+
+		if (i >= 10) {
+	      b[  0] = *(a10 + 0);
+	      b[  1] = *(a10 + 1);
+	      b[  2] = *(a10 + 2);
+	      b[  3] = *(a10 + 3);
+	      b[  4] = *(a10 + 4);
+	      b[  5] = *(a10 + 5);
+	      b[  6] = *(a10 + 6);
+	      b[  7] = *(a10 + 7);
+	      b[  8] = *(a10 + 8);
+	      b[  9] = *(a10 + 9);
+	      b[ 10] = *(a10 + 10);
+	      b[ 11] = *(a10 + 11);
+	      b[ 12] = *(a10 + 12);
+	      b[ 13] = *(a10 + 13);
+	      b[ 14] = *(a10 + 14);
+	      b[ 15] = *(a10 + 15);
+	      b[ 16] = *(a10 + 16);
+	      b[ 17] = *(a10 + 17);
+#ifdef UNIT
+	      b[ 18] = ONE;
+	      b[ 19] = ZERO;
+#else
+	      b[ 18] = *(a10 + 18);
+	      b[ 19] = *(a10 + 19);
+#endif
+	    b[ 20] = ZERO;
+		b[ 21] = ZERO;
+		b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+	      b += 32;
+	    }
+
+		if (i >= 11) {
+	      b[  0] = *(a11 + 0);
+	      b[  1] = *(a11 + 1);
+	      b[  2] = *(a11 + 2);
+	      b[  3] = *(a11 + 3);
+	      b[  4] = *(a11 + 4);
+	      b[  5] = *(a11 + 5);
+	      b[  6] = *(a11 + 6);
+	      b[  7] = *(a11 + 7);
+	      b[  8] = *(a11 + 8);
+	      b[  9] = *(a11 + 9);
+	      b[ 10] = *(a11 + 10);
+	      b[ 11] = *(a11 + 11);
+	      b[ 12] = *(a11 + 12);
+	      b[ 13] = *(a11 + 13);
+	      b[ 14] = *(a11 + 14);
+	      b[ 15] = *(a11 + 15);
+	      b[ 16] = *(a11 + 16);
+	      b[ 17] = *(a11 + 17);
+	      b[ 18] = *(a11 + 18);
+	      b[ 19] = *(a11 + 19);
+#ifdef UNIT
+	      b[ 20] = ONE;
+	      b[ 21] = ZERO;
+#else
+	      b[ 20] = *(a11 + 20);
+	      b[ 21] = *(a11 + 21);
+#endif
+	    b[ 22] = ZERO;
+		b[ 23] = ZERO;
+		b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+	      b += 32;
+	    }
+
+		if (i >= 12) {
+	      b[  0] = *(a12 + 0);
+	      b[  1] = *(a12 + 1);
+	      b[  2] = *(a12 + 2);
+	      b[  3] = *(a12 + 3);
+	      b[  4] = *(a12 + 4);
+	      b[  5] = *(a12 + 5);
+	      b[  6] = *(a12 + 6);
+	      b[  7] = *(a12 + 7);
+	      b[  8] = *(a12 + 8);
+	      b[  9] = *(a12 + 9);
+	      b[ 10] = *(a12 + 10);
+	      b[ 11] = *(a12 + 11);
+	      b[ 12] = *(a12 + 12);
+	      b[ 13] = *(a12 + 13);
+	      b[ 14] = *(a12 + 14);
+	      b[ 15] = *(a12 + 15);
+	      b[ 16] = *(a12 + 16);
+	      b[ 17] = *(a12 + 17);
+	      b[ 18] = *(a12 + 18);
+	      b[ 19] = *(a12 + 19);
+	      b[ 20] = *(a12 + 20);
+	      b[ 21] = *(a12 + 21);
+#ifdef UNIT
+	      b[ 22] = ONE;
+	      b[ 23] = ZERO;
+#else
+	      b[ 22] = *(a12 + 22);
+	      b[ 23] = *(a12 + 23);
+#endif
+	    b[ 24] = ZERO;
+		b[ 25] = ZERO;
+		b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+	      b += 32;
+	    }
+
+		if (i >= 13) {
+	      b[  0] = *(a13 + 0);
+	      b[  1] = *(a13 + 1);
+	      b[  2] = *(a13 + 2);
+	      b[  3] = *(a13 + 3);
+	      b[  4] = *(a13 + 4);
+	      b[  5] = *(a13 + 5);
+	      b[  6] = *(a13 + 6);
+	      b[  7] = *(a13 + 7);
+	      b[  8] = *(a13 + 8);
+	      b[  9] = *(a13 + 9);
+	      b[ 10] = *(a13 + 10);
+	      b[ 11] = *(a13 + 11);
+	      b[ 12] = *(a13 + 12);
+	      b[ 13] = *(a13 + 13);
+	      b[ 14] = *(a13 + 14);
+	      b[ 15] = *(a13 + 15);
+	      b[ 16] = *(a13 + 16);
+	      b[ 17] = *(a13 + 17);
+	      b[ 18] = *(a13 + 18);
+	      b[ 19] = *(a13 + 19);
+	      b[ 20] = *(a13 + 20);
+	      b[ 21] = *(a13 + 21);
+	      b[ 22] = *(a13 + 22);
+	      b[ 23] = *(a13 + 23);
+#ifdef UNIT
+	      b[ 24] = ONE;
+	      b[ 25] = ZERO;
+#else
+	      b[ 24] = *(a13 + 24);
+	      b[ 25] = *(a13 + 25);
+#endif
+	    b[ 26] = ZERO;
+		b[ 27] = ZERO;
+		b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+	      b += 32;
+	    }
+
+		if (i >= 14) {
+	      b[  0] = *(a14 + 0);
+	      b[  1] = *(a14 + 1);
+	      b[  2] = *(a14 + 2);
+	      b[  3] = *(a14 + 3);
+	      b[  4] = *(a14 + 4);
+	      b[  5] = *(a14 + 5);
+	      b[  6] = *(a14 + 6);
+	      b[  7] = *(a14 + 7);
+	      b[  8] = *(a14 + 8);
+	      b[  9] = *(a14 + 9);
+	      b[ 10] = *(a14 + 10);
+	      b[ 11] = *(a14 + 11);
+	      b[ 12] = *(a14 + 12);
+	      b[ 13] = *(a14 + 13);
+	      b[ 14] = *(a14 + 14);
+	      b[ 15] = *(a14 + 15);
+	      b[ 16] = *(a14 + 16);
+	      b[ 17] = *(a14 + 17);
+	      b[ 18] = *(a14 + 18);
+	      b[ 19] = *(a14 + 19);
+	      b[ 20] = *(a14 + 20);
+	      b[ 21] = *(a14 + 21);
+	      b[ 22] = *(a14 + 22);
+	      b[ 23] = *(a14 + 23);
+	      b[ 24] = *(a14 + 24);
+	      b[ 25] = *(a14 + 25);
+#ifdef UNIT
+	      b[ 26] = ONE;
+	      b[ 27] = ZERO;
+#else
+	      b[ 26] = *(a14 + 26);
+	      b[ 27] = *(a14 + 27);
+#endif
+	    b[ 28] = ZERO;
+		b[ 29] = ZERO;
+		b[ 30] = ZERO;
+		b[ 31] = ZERO;
+	      b += 32;
+	    }
+
+		if (i >= 15) {
+	      b[  0] = *(a15 + 0);
+	      b[  1] = *(a15 + 1);
+	      b[  2] = *(a15 + 2);
+	      b[  3] = *(a15 + 3);
+	      b[  4] = *(a15 + 4);
+	      b[  5] = *(a15 + 5);
+	      b[  6] = *(a15 + 6);
+	      b[  7] = *(a15 + 7);
+	      b[  8] = *(a15 + 8);
+	      b[  9] = *(a15 + 9);
+	      b[ 10] = *(a15 + 10);
+	      b[ 11] = *(a15 + 11);
+	      b[ 12] = *(a15 + 12);
+	      b[ 13] = *(a15 + 13);
+	      b[ 14] = *(a15 + 14);
+	      b[ 15] = *(a15 + 15);
+	      b[ 16] = *(a15 + 16);
+	      b[ 17] = *(a15 + 17);
+	      b[ 18] = *(a15 + 18);
+	      b[ 19] = *(a15 + 19);
+	      b[ 20] = *(a15 + 20);
+	      b[ 21] = *(a15 + 21);
+	      b[ 22] = *(a15 + 22);
+	      b[ 23] = *(a15 + 23);
+	      b[ 24] = *(a15 + 24);
+	      b[ 25] = *(a15 + 25);
+	      b[ 26] = *(a15 + 26);
+	      b[ 27] = *(a15 + 27);
+#ifdef UNIT
+	      b[ 28] = ONE;
+	      b[ 29] = ZERO;
+#else
+	      b[ 28] = *(a15 + 28);
+	      b[ 29] = *(a15 + 29);
+#endif
+	      b[ 30] = ZERO;
+	      b[ 31] = ZERO;
+	      b += 32;
+	    }
+	  }
+      }
+
+      posY += 16;
+      js --;
+    } while (js > 0);
+  } /* End of main loop */
+
+
+  if (n & 8){
+      X = posX;
+
+      if (posX <= posY) {
+	a01 = a + posX * 2 + (posY +  0) * lda;
+	a02 = a + posX * 2 + (posY +  1) * lda;
+	a03 = a + posX * 2 + (posY +  2) * lda;
+	a04 = a + posX * 2 + (posY +  3) * lda;
+	a05 = a + posX * 2 + (posY +  4) * lda;
+	a06 = a + posX * 2 + (posY +  5) * lda;
+	a07 = a + posX * 2 + (posY +  6) * lda;
+	a08 = a + posX * 2 + (posY +  7) * lda;
+      } else {
+	a01 = a + posY * 2 + (posX +  0) * lda;
+	a02 = a + posY * 2 + (posX +  1) * lda;
+	a03 = a + posY * 2 + (posX +  2) * lda;
+	a04 = a + posY * 2 + (posX +  3) * lda;
+	a05 = a + posY * 2 + (posX +  4) * lda;
+	a06 = a + posY * 2 + (posX +  5) * lda;
+	a07 = a + posY * 2 + (posX +  6) * lda;
+	a08 = a + posY * 2 + (posX +  7) * lda;
+      }
+
+      i = (m >> 3);
+      if (i > 0) {
+	do {
+	  if (X < posY) {
+	    a01 += 16;
+	    a02 += 16;
+	    a03 += 16;
+	    a04 += 16;
+	    a05 += 16;
+	    a06 += 16;
+	    a07 += 16;
+	    a08 += 16;
+	    b += 128;
+	  } else
+	    if (X > posY) {
+	      for (ii = 0; ii < 8; ii++){
+
+		b[  0] = *(a01 +  0);
+		b[  1] = *(a01 +  1);
+		b[  2] = *(a01 +  2);
+		b[  3] = *(a01 +  3);
+		b[  4] = *(a01 +  4);
+		b[  5] = *(a01 +  5);
+		b[  6] = *(a01 +  6);
+		b[  7] = *(a01 +  7);
+
+		b[  8] = *(a01 +  8);
+		b[  9] = *(a01 +  9);
+		b[ 10] = *(a01 + 10);
+		b[ 11] = *(a01 + 11);
+		b[ 12] = *(a01 + 12);
+		b[ 13] = *(a01 + 13);
+		b[ 14] = *(a01 + 14);
+		b[ 15] = *(a01 + 15);
+
+		a01 += lda;
+		b += 16;
+	      }
+		  a02 += 8 * lda;
+	      a03 += 8 * lda;
+	      a04 += 8 * lda;
+	      a05 += 8 * lda;
+	      a06 += 8 * lda;
+	      a07 += 8 * lda;
+	      a08 += 8 * lda;
+
+	    } else {
+#ifdef UNIT
+	      b[  0] = ONE;
+	      b[  1] = ZERO;
+#else
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+#endif
+	      b[  2] = ZERO;
+	      b[  3] = ZERO;
+	      b[  4] = ZERO;
+	      b[  5] = ZERO;
+	      b[  6] = ZERO;
+	      b[  7] = ZERO;
+
+	      b[  8] = ZERO;
+	      b[  9] = ZERO;
+	      b[ 10] = ZERO;
+	      b[ 11] = ZERO;
+	      b[ 12] = ZERO;
+	      b[ 13] = ZERO;
+	      b[ 14] = ZERO;
+	      b[ 15] = ZERO;
+
+	      b[ 16] = *(a02 +  0);
+	      b[ 17] = *(a02 +  1);
+#ifdef UNIT
+	      b[ 18] = ONE;
+	      b[ 19] = ZERO;
+#else
+	      b[ 18] = *(a02 +  2);
+	      b[ 19] = *(a02 +  3);
+#endif
+	      b[ 20] = ZERO;
+	      b[ 21] = ZERO;
+	      b[ 22] = ZERO;
+	      b[ 23] = ZERO;
+	      b[ 24] = ZERO;
+	      b[ 25] = ZERO;
+	      b[ 26] = ZERO;
+	      b[ 27] = ZERO;
+	      b[ 28] = ZERO;
+	      b[ 29] = ZERO;
+	      b[ 30] = ZERO;
+	      b[ 31] = ZERO;
+
+	      b[ 32] = *(a03 +  0);
+	      b[ 33] = *(a03 +  1);
+	      b[ 34] = *(a03 +  2);
+	      b[ 35] = *(a03 +  3);
+#ifdef UNIT
+	      b[ 36] = ONE;
+	      b[ 37] = ZERO;
+#else
+	      b[ 36] = *(a03 +  4);
+	      b[ 37] = *(a03 +  5);
+#endif
+	      b[ 38] = ZERO;
+	      b[ 39] = ZERO;
+	      b[ 40] = ZERO;
+	      b[ 41] = ZERO;
+	      b[ 42] = ZERO;
+	      b[ 43] = ZERO;
+	      b[ 44] = ZERO;
+	      b[ 45] = ZERO;
+	      b[ 46] = ZERO;
+	      b[ 47] = ZERO;
+
+	      b[ 48] = *(a04 +  0);
+	      b[ 49] = *(a04 +  1);
+	      b[ 50] = *(a04 +  2);
+	      b[ 51] = *(a04 +  3);
+	      b[ 52] = *(a04 +  4);
+	      b[ 53] = *(a04 +  5);
+#ifdef UNIT
+	      b[ 54] = ONE;
+	      b[ 55] = ZERO;
+#else
+	      b[ 54] = *(a04 +  6);
+	      b[ 55] = *(a04 +  7);
+#endif
+	      b[ 56] = ZERO;
+	      b[ 57] = ZERO;
+	      b[ 58] = ZERO;
+	      b[ 59] = ZERO;
+	      b[ 60] = ZERO;
+	      b[ 61] = ZERO;
+	      b[ 62] = ZERO;
+	      b[ 63] = ZERO;
+
+	      b[ 64] = *(a05 +  0);
+	      b[ 65] = *(a05 +  1);
+	      b[ 66] = *(a05 +  2);
+	      b[ 67] = *(a05 +  3);
+	      b[ 68] = *(a05 +  4);
+	      b[ 69] = *(a05 +  5);
+	      b[ 70] = *(a05 +  6);
+	      b[ 71] = *(a05 +  7);
+#ifdef UNIT
+	      b[ 72] = ONE;
+	      b[ 73] = ZERO;
+#else
+	      b[ 72] = *(a05 +  8);
+	      b[ 73] = *(a05 +  9);
+#endif
+	      b[ 74] = ZERO;
+	      b[ 75] = ZERO;
+	      b[ 76] = ZERO;
+	      b[ 77] = ZERO;
+	      b[ 78] = ZERO;
+	      b[ 79] = ZERO;
+
+	      b[ 80] = *(a06 +  0);
+	      b[ 81] = *(a06 +  1);
+	      b[ 82] = *(a06 +  2);
+	      b[ 83] = *(a06 +  3);
+	      b[ 84] = *(a06 +  4);
+	      b[ 85] = *(a06 +  5);
+	      b[ 86] = *(a06 +  6);
+	      b[ 87] = *(a06 +  7);
+	      b[ 88] = *(a06 +  8);
+	      b[ 89] = *(a06 +  9);
+#ifdef UNIT
+	      b[ 90] = ONE;
+	      b[ 91] = ZERO;
+#else
+	      b[ 90] = *(a06 + 10);
+	      b[ 91] = *(a06 + 11);
+#endif
+	      b[ 92] = ZERO;
+	      b[ 93] = ZERO;
+	      b[ 94] = ZERO;
+	      b[ 95] = ZERO;
+
+	      b[ 96] = *(a07 +  0);
+	      b[ 97] = *(a07 +  1);
+	      b[ 98] = *(a07 +  2);
+	      b[ 99] = *(a07 +  3);
+	      b[100] = *(a07 +  4);
+	      b[101] = *(a07 +  5);
+	      b[102] = *(a07 +  6);
+	      b[103] = *(a07 +  7);
+	      b[104] = *(a07 +  8);
+	      b[105] = *(a07 +  9);
+	      b[106] = *(a07 + 10);
+	      b[107] = *(a07 + 11);
+#ifdef UNIT
+	      b[108] = ONE;
+	      b[109] = ZERO;
+#else
+	      b[108] = *(a07 + 12);
+	      b[109] = *(a07 + 13);
+#endif
+	      b[110] = ZERO;
+	      b[111] = ZERO;
+
+	      b[112] = *(a08 +  0);
+	      b[113] = *(a08 +  1);
+	      b[114] = *(a08 +  2);
+	      b[115] = *(a08 +  3);
+	      b[116] = *(a08 +  4);
+	      b[117] = *(a08 +  5);
+	      b[118] = *(a08 +  6);
+	      b[119] = *(a08 +  7);
+	      b[120] = *(a08 +  8);
+	      b[121] = *(a08 +  9);
+	      b[122] = *(a08 + 10);
+	      b[123] = *(a08 + 11);
+	      b[124] = *(a08 + 12);
+	      b[125] = *(a08 + 13);
+#ifdef UNIT
+	      b[126] = ONE;
+	      b[127] = ZERO;
+#else
+	      b[126] = *(a08 + 14);
+	      b[127] = *(a08 + 15);
+#endif
+
+	      a01 += 8 * lda;
+	      a02 += 8 * lda;
+	      a03 += 8 * lda;
+	      a04 += 8 * lda;
+	      a05 += 8 * lda;
+	      a06 += 8 * lda;
+	      a07 += 8 * lda;
+	      a08 += 8 * lda;
+	      b += 128;
+	    }
+
+	  X += 8;
+	  i --;
+	} while (i > 0);
+      }
+
+      i = (m & 7);
+      if (i) {
+
+	if (X < posY) {
+	  /* a01 += 2 * i;
+	    a02 += 2 * i;
+	    a03 += 2 * i;
+	    a04 += 2 * i;
+	    a05 += 2 * i;
+	    a06 += 2 * i;
+	    a07 += 2 * i;
+	    a08 += 2 * i; */
+	    b += 16 * i;
+	} else
+	  if (X > posY) {
+	      for (ii = 0; ii < i; ii++){
+
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+	      b[  2] = *(a01 +  2);
+	      b[  3] = *(a01 +  3);
+	      b[  4] = *(a01 +  4);
+	      b[  5] = *(a01 +  5);
+	      b[  6] = *(a01 +  6);
+	      b[  7] = *(a01 +  7);
+
+	      b[  8] = *(a01 +  8);
+	      b[  9] = *(a01 +  9);
+	      b[ 10] = *(a01 + 10);
+	      b[ 11] = *(a01 + 11);
+	      b[ 12] = *(a01 + 12);
+	      b[ 13] = *(a01 + 13);
+	      b[ 14] = *(a01 + 14);
+	      b[ 15] = *(a01 + 15);
+
+	      a01 += lda;
+	      a02 += lda;
+	      a03 += lda;
+	      a04 += lda;
+	      a05 += lda;
+	      a06 += lda;
+	      a07 += lda;
+	      a08 += lda;
+	      b += 16;
+	    }
+	  } else {
+#ifdef UNIT
+	    b[ 0] = ONE;
+	    b[ 1] = ZERO;
+#else
+	    b[ 0] = *(a01 +  0);
+	    b[ 1] = *(a01 +  1);
+#endif
+	    b[ 2] = ZERO;
+	    b[ 3] = ZERO;
+	    b[ 4] = ZERO;
+	    b[ 5] = ZERO;
+	    b[ 6] = ZERO;
+	    b[ 7] = ZERO;
+	    b[ 8] = ZERO;
+	    b[ 9] = ZERO;
+	    b[10] = ZERO;
+	    b[11] = ZERO;
+	    b[12] = ZERO;
+	    b[13] = ZERO;
+	    b[14] = ZERO;
+	    b[15] = ZERO;
+	    b += 16;
+
+	    if(i >= 2) {
+	      b[ 0] = *(a02 +  0);
+	      b[ 1] = *(a02 +  1);
+#ifdef UNIT
+	      b[ 2] = ONE;
+	      b[ 3] = ZERO;
+#else
+	      b[ 2] = *(a02 +  2);
+	      b[ 3] = *(a02 +  3);
+#endif
+	      b[ 4] = ZERO;
+	      b[ 5] = ZERO;
+	      b[ 6] = ZERO;
+	      b[ 7] = ZERO;
+	      b[ 8] = ZERO;
+	      b[ 9] = ZERO;
+	      b[10] = ZERO;
+	      b[11] = ZERO;
+	      b[12] = ZERO;
+	      b[13] = ZERO;
+	      b[14] = ZERO;
+	      b[15] = ZERO;
+	      b += 16;
+	    }
+
+	    if (i >= 3) {
+	      b[ 0] = *(a03 +  0);
+	      b[ 1] = *(a03 +  1);
+	      b[ 2] = *(a03 +  2);
+	      b[ 3] = *(a03 +  3);
+#ifdef UNIT
+	      b[ 4] = ONE;
+	      b[ 5] = ZERO;
+#else
+	      b[ 4] = *(a03 +  4);
+	      b[ 5] = *(a03 +  5);
+#endif
+	      b[ 6] = ZERO;
+	      b[ 7] = ZERO;
+	      b[ 8] = ZERO;
+	      b[ 9] = ZERO;
+	      b[10] = ZERO;
+	      b[11] = ZERO;
+	      b[12] = ZERO;
+	      b[13] = ZERO;
+	      b[14] = ZERO;
+	      b[15] = ZERO;
+	      b += 16;
+	    }
+
+	    if (i >= 4) {
+	      b[ 0] = *(a04 +  0);
+	      b[ 1] = *(a04 +  1);
+	      b[ 2] = *(a04 +  2);
+	      b[ 3] = *(a04 +  3);
+	      b[ 4] = *(a04 +  4);
+	      b[ 5] = *(a04 +  5);
+#ifdef UNIT
+	      b[ 6] = ONE;
+	      b[ 7] = ZERO;
+#else
+	      b[ 6] = *(a04 +  6);
+	      b[ 7] = *(a04 +  7);
+#endif
+	      b[ 8] = ZERO;
+	      b[ 9] = ZERO;
+	      b[10] = ZERO;
+	      b[11] = ZERO;
+	      b[12] = ZERO;
+	      b[13] = ZERO;
+	      b[14] = ZERO;
+	      b[15] = ZERO;
+	      b += 16;
+	    }
+
+	    if (i >= 5) {
+	      b[ 0] = *(a05 +  0);
+	      b[ 1] = *(a05 +  1);
+	      b[ 2] = *(a05 +  2);
+	      b[ 3] = *(a05 +  3);
+	      b[ 4] = *(a05 +  4);
+	      b[ 5] = *(a05 +  5);
+	      b[ 6] = *(a05 +  6);
+	      b[ 7] = *(a05 +  7);
+#ifdef UNIT
+	      b[ 8] = ONE;
+	      b[ 9] = ZERO;
+#else
+	      b[ 8] = *(a05 +  8);
+	      b[ 9] = *(a05 +  9);
+#endif
+	      b[10] = ZERO;
+	      b[11] = ZERO;
+	      b[12] = ZERO;
+	      b[13] = ZERO;
+	      b[14] = ZERO;
+	      b[15] = ZERO;
+	      b += 16;
+	    }
+
+	    if (i >= 6) {
+	      b[ 0] = *(a06 +  0);
+	      b[ 1] = *(a06 +  1);
+	      b[ 2] = *(a06 +  2);
+	      b[ 3] = *(a06 +  3);
+	      b[ 4] = *(a06 +  4);
+	      b[ 5] = *(a06 +  5);
+	      b[ 6] = *(a06 +  6);
+	      b[ 7] = *(a06 +  7);
+	      b[ 8] = *(a06 +  8);
+	      b[ 9] = *(a06 +  9);
+#ifdef UNIT
+	      b[10] = ONE;
+	      b[11] = ZERO;
+#else
+	      b[10] = *(a06 + 10);
+	      b[11] = *(a06 + 11);
+#endif
+	      b[12] = ZERO;
+	      b[13] = ZERO;
+	      b[14] = ZERO;
+	      b[15] = ZERO;
+	      b += 16;
+	    }
+
+	    if (i >= 7) {
+	      b[ 0] = *(a07 +  0);
+	      b[ 1] = *(a07 +  1);
+	      b[ 2] = *(a07 +  2);
+	      b[ 3] = *(a07 +  3);
+	      b[ 4] = *(a07 +  4);
+	      b[ 5] = *(a07 +  5);
+	      b[ 6] = *(a07 +  6);
+	      b[ 7] = *(a07 +  7);
+	      b[ 8] = *(a07 +  8);
+	      b[ 9] = *(a07 +  9);
+	      b[10] = *(a07 + 10);
+	      b[11] = *(a07 + 11);
+#ifdef UNIT
+	      b[12] = ONE;
+	      b[13] = ZERO;
+#else
+	      b[12] = *(a07 + 12);
+	      b[13] = *(a07 + 13);
+#endif
+	      b[14] = ZERO;
+	      b[15] = ZERO;
+	      b += 16;
+	    }
+	  }
+      }
+
+      posY += 8;
+  }
+
+
+  if (n & 4){
+    X = posX;
+
+    if (posX <= posY) {
+      a01 = a + posX * 2 + (posY +  0) * lda;
+      a02 = a + posX * 2 + (posY +  1) * lda;
+      a03 = a + posX * 2 + (posY +  2) * lda;
+      a04 = a + posX * 2 + (posY +  3) * lda;
+    } else {
+      a01 = a + posY * 2 + (posX +  0) * lda;
+      a02 = a + posY * 2 + (posX +  1) * lda;
+      a03 = a + posY * 2 + (posX +  2) * lda;
+      a04 = a + posY * 2 + (posX +  3) * lda;
+    }
+
+    i = (m >> 2);
+    if (i > 0) {
+      do {
+	if (X < posY) {
+	  a01 += 8;
+	  a02 += 8;
+	  a03 += 8;
+	  a04 += 8;
+	  b += 32;
+	} else
+	  if (X > posY) {
+
+	    for (ii = 0; ii < 4; ii++){
+
+	      b[  0] = *(a01 +  0);
+	      b[  1] = *(a01 +  1);
+	      b[  2] = *(a01 +  2);
+	      b[  3] = *(a01 +  3);
+	      b[  4] = *(a01 +  4);
+	      b[  5] = *(a01 +  5);
+	      b[  6] = *(a01 +  6);
+	      b[  7] = *(a01 +  7);
+
+	      a01 += lda;
+	      b += 8;
+	    }
+
+	    a02 += 4 * lda;
+	    a03 += 4 * lda;
+	    a04 += 4 * lda;
+
+	  } else {
+#ifdef UNIT
+	    b[  0] = ONE;
+	    b[  1] = ZERO;
+#else
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+#endif
+	    b[  2] = ZERO;
+	    b[  3] = ZERO;
+	    b[  4] = ZERO;
+	    b[  5] = ZERO;
+	    b[  6] = ZERO;
+	    b[  7] = ZERO;
+
+	    b[  8] = *(a02 +  0);
+	    b[  9] = *(a02 +  1);
+#ifdef UNIT
+	    b[ 10] = ONE;
+	    b[ 11] = ZERO;
+#else
+	    b[ 10] = *(a02 +  2);
+	    b[ 11] = *(a02 +  3);
+#endif
+	    b[ 12] = ZERO;
+	    b[ 13] = ZERO;
+	    b[ 14] = ZERO;
+	    b[ 15] = ZERO;
+
+	    b[ 16] = *(a03 +  0);
+	    b[ 17] = *(a03 +  1);
+	    b[ 18] = *(a03 +  2);
+	    b[ 19] = *(a03 +  3);
+#ifdef UNIT
+	    b[ 20] = ONE;
+	    b[ 21] = ZERO;
+#else
+	    b[ 20] = *(a03 +  4);
+	    b[ 21] = *(a03 +  5);
+#endif
+	    b[ 22] = ZERO;
+	    b[ 23] = ZERO;
+
+	    b[ 24] = *(a04 +  0);
+	    b[ 25] = *(a04 +  1);
+	    b[ 26] = *(a04 +  2);
+	    b[ 27] = *(a04 +  3);
+	    b[ 28] = *(a04 +  4);
+	    b[ 29] = *(a04 +  5);
+#ifdef UNIT
+	    b[ 30] = ONE;
+	    b[ 31] = ZERO;
+#else
+	    b[ 30] = *(a04 +  6);
+	    b[ 31] = *(a04 +  7);
+#endif
+
+	    a01 += 4 * lda;
+	    a02 += 4 * lda;
+	    a03 += 4 * lda;
+	    a04 += 4 * lda;
+	    b += 32;
+	  }
+
+	X += 4;
+	i --;
+      } while (i > 0);
+    }
+
+    i = (m & 3);
+    if (i) {
+
+      if (X < posY) {
+	/* a01 += 2 * i;
+	a02 += 2 * i;
+	a03 += 2 * i;
+	a04 += 2 * i; */
+	b += 8 * i;
+      } else
+	if (X > posY) {
+
+	  for (ii = 0; ii < i; ii++){
+
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+	    b[  2] = *(a01 +  2);
+	    b[  3] = *(a01 +  3);
+	    b[  4] = *(a01 +  4);
+	    b[  5] = *(a01 +  5);
+	    b[  6] = *(a01 +  6);
+	    b[  7] = *(a01 +  7);
+
+	    a01 += lda;
+	    a02 += lda;
+	    a03 += lda;
+	    a04 += lda;
+	    b += 8;
+	  }
+	} else {
+
+#ifdef UNIT
+	  b[ 0] = ONE;
+	  b[ 1] = ZERO;
+#else
+	  b[ 0] = *(a01 +  0);
+	  b[ 1] = *(a01 +  1);
+#endif
+	  b[ 2] = ZERO;
+	  b[ 3] = ZERO;
+	  b[ 4] = ZERO;
+	  b[ 5] = ZERO;
+	  b[ 6] = ZERO;
+	  b[ 7] = ZERO;
+	  b += 8;
+
+	  if(i >= 2) {
+	    b[ 0] = *(a02 +  0);
+	    b[ 1] = *(a02 +  1);
+#ifdef UNIT
+	    b[ 2] = ONE;
+	    b[ 3] = ZERO;
+#else
+	    b[ 2] = *(a02 +  2);
+	    b[ 3] = *(a02 +  3);
+#endif
+	    b[ 4] = ZERO;
+	    b[ 5] = ZERO;
+	    b[ 6] = ZERO;
+	    b[ 7] = ZERO;
+	    b += 8;
+	  }
+
+	  if (i >= 3) {
+	    b[ 0] = *(a03 +  0);
+	    b[ 1] = *(a03 +  1);
+	    b[ 2] = *(a03 +  2);
+	    b[ 3] = *(a03 +  3);
+#ifdef UNIT
+	    b[ 4] = ONE;
+	    b[ 5] = ZERO;
+#else
+	    b[ 4] = *(a03 +  4);
+	    b[ 5] = *(a03 +  5);
+#endif
+	    b[ 6] = ZERO;
+	    b[ 7] = ZERO;
+	    b += 8;
+	  }
+	}
+    }
+
+    posY += 4;
+  }
+
+
+  if (n & 2){
+    X = posX;
+
+    if (posX <= posY) {
+      a01 = a + posX * 2 + (posY +  0) * lda;
+      a02 = a + posX * 2 + (posY +  1) * lda;
+    } else {
+      a01 = a + posY * 2 + (posX +  0) * lda;
+      a02 = a + posY * 2 + (posX +  1) * lda;
+    }
+
+    i = (m >> 1);
+    if (i > 0) {
+      do {
+	if (X < posY) {
+	    a01 += 4;
+	    a02 += 4;
+	    b += 8;
+	} else
+	  if (X > posY) {
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+	    b[  2] = *(a01 +  2);
+	    b[  3] = *(a01 +  3);
+	    b[  4] = *(a02 +  0);
+	    b[  5] = *(a02 +  1);
+	    b[  6] = *(a02 +  2);
+	    b[  7] = *(a02 +  3);
+
+	    a01 += 2 * lda;
+	    a02 += 2 * lda;
+	    b += 8;
+	  } else {
+
+#ifdef UNIT
+	    b[  0] = ONE;
+	    b[  1] = ZERO;
+#else
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+#endif
+	    b[  2] = ZERO;
+	    b[  3] = ZERO;
+
+	    b[  4] = *(a02 +  0);
+	    b[  5] = *(a02 +  1);
+#ifdef UNIT
+	    b[  6] = ONE;
+	    b[  7] = ZERO;
+#else
+	    b[  6] = *(a02 +  2);
+	    b[  7] = *(a02 +  3);
+#endif
+
+	    a01 += 2 * lda;
+	    a02 += 2 * lda;
+	    b += 8;
+	  }
+
+	X += 2;
+	i --;
+      } while (i > 0);
+    }
+
+    i = (m & 1);
+    if (i) {
+
+      if (X < posY) {
+	b += 4;
+      } else
+	if (X > posY) {
+	  b[  0] = *(a01 +  0);
+	  b[  1] = *(a01 +  1);
+	  b[  2] = *(a01 +  2);
+	  b[  3] = *(a01 +  3);
+	  b += 4;
+	  }
+#if 1
+	}
+#else
+	} else {
+#ifdef UNIT
+	  b[ 0] = ONE;
+	  b[ 1] = ZERO;
+#else
+	  b[ 0] = *(a01 +  0);
+	  b[ 1] = *(a01 +  1);
+#endif
+	  b[ 2] = *(a02 +  0);
+	  b[ 3] = *(a02 +  1);
+	  b += 4;
+	}
+#endif
+    posY += 2;
+  }
+
+  if (n & 1){
+    X = posX;
+
+    if (posX <= posY) {
+      a01 = a + posX * 2 + (posY +  0) * lda;
+    } else {
+      a01 = a + posY * 2 + (posX +  0) * lda;
+    }
+
+    i = m;
+    if (m > 0) {
+      do {
+	if (X < posY) {
+	  a01 += 2;
+	} else {
+#ifdef UNIT
+	  if (X > posY) {
+#endif
+	    b[  0] = *(a01 +  0);
+	    b[  1] = *(a01 +  1);
+#ifdef UNIT
+	  } else {
+	    b[  0] = ONE;
+	    b[  1] = ZERO;
+	  }
+#endif
+	  a01 += lda;
+	}
+	b += 2;
+	X ++;
+	i --;
+      } while (i > 0);
+    }
+  }
+
+  return 0;
+}
diff --git a/kernel/generic/ztrmmkernel_4x4.c b/kernel/generic/ztrmmkernel_4x4.c
old mode 100755
new mode 100644
diff --git a/kernel/generic/ztrsm_lncopy_16.c b/kernel/generic/ztrsm_lncopy_16.c
new file mode 100644
index 000000000..4fd72c13e
--- /dev/null
+++ b/kernel/generic/ztrsm_lncopy_16.c
@@ -0,0 +1,308 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+  /* Packs m rows of a into b, taking the n columns in panels of 16, then 8, 4, 2 and 1; always returns 0. */
+  BLASLONG i, ii, j, jj, k;
+  /* a1..a16 address the current row in each column of the panel being packed. */
+  FLOAT *a1, *a2,  *a3,  *a4,  *a5,  *a6,  *a7,  *a8;
+  FLOAT *a9, *a10, *a11, *a12, *a13, *a14, *a15, *a16;
+  /* data1/data2 hold the two FLOATs of a diagonal element (presumably real and imaginary part). */
+  FLOAT data1, data2;
+  /* Every element occupies two FLOATs, so lda is doubled to give the column stride in FLOATs. */
+  lda *= 2;
+  jj = offset;  /* jj: row index at which the current panel's diagonal block begins */
+
+  j = (n >> 4);
+  while (j > 0){
+
+    a1  = a +  0 * lda;
+    a2  = a +  1 * lda;
+    a3  = a +  2 * lda;
+    a4  = a +  3 * lda;
+    a5  = a +  4 * lda;
+    a6  = a +  5 * lda;
+    a7  = a +  6 * lda;
+    a8  = a +  7 * lda;
+    a9  = a +  8 * lda;
+    a10 = a +  9 * lda;
+    a11 = a + 10 * lda;
+    a12 = a + 11 * lda;
+    a13 = a + 12 * lda;
+    a14 = a + 13 * lda;
+    a15 = a + 14 * lda;
+    a16 = a + 15 * lda;
+
+    a += 16 * lda;  /* a now points at the first column of the next panel */
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+      /* Row inside the diagonal block: copy the ii - jj elements left of the diagonal, then handle the diagonal. */
+      if ((ii >= jj ) && (ii - jj < 16)) {
+	for (k = 0; k < ii - jj; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * lda + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * lda + 1);
+	}
+	data1 = *(a1 + (ii - jj) * lda + 0);
+	data2 = *(a1 + (ii - jj) * lda + 1);
+	/* NOTE(review): compinv is defined outside this block; presumably it stores the inverted diagonal -- confirm. */
+	compinv(b +  (ii - jj) * 2, data1, data2);
+      }
+      /* Row below the diagonal block: copy all 16 elements. Rows with ii < jj leave their slot in b unwritten. */
+      if (ii - jj >= 16) {
+	*(b +  0) = *(a1  +  0);
+	*(b +  1) = *(a1  +  1);
+	*(b +  2) = *(a2  +  0);
+	*(b +  3) = *(a2  +  1);
+	*(b +  4) = *(a3  +  0);
+	*(b +  5) = *(a3  +  1);
+	*(b +  6) = *(a4  +  0);
+	*(b +  7) = *(a4  +  1);
+	*(b +  8) = *(a5  +  0);
+	*(b +  9) = *(a5  +  1);
+	*(b + 10) = *(a6  +  0);
+	*(b + 11) = *(a6  +  1);
+	*(b + 12) = *(a7  +  0);
+	*(b + 13) = *(a7  +  1);
+	*(b + 14) = *(a8  +  0);
+	*(b + 15) = *(a8  +  1);
+  *(b + 16) = *(a9  +  0);
+	*(b + 17) = *(a9  +  1);
+	*(b + 18) = *(a10  +  0);
+	*(b + 19) = *(a10  +  1);
+	*(b + 20) = *(a11  +  0);
+	*(b + 21) = *(a11  +  1);
+	*(b + 22) = *(a12  +  0);
+	*(b + 23) = *(a12  +  1);
+	*(b + 24) = *(a13  +  0);
+	*(b + 25) = *(a13  +  1);
+	*(b + 26) = *(a14  +  0);
+	*(b + 27) = *(a14  +  1);
+	*(b + 28) = *(a15  +  0);
+	*(b + 29) = *(a15  +  1);
+	*(b + 30) = *(a16  +  0);
+	*(b + 31) = *(a16  +  1);
+      }
+      /* Move every column pointer down one row; b advances a full packed row whether or not it was written. */
+      a1  += 2;
+      a2  += 2;
+      a3  += 2;
+      a4  += 2;
+      a5  += 2;
+      a6  += 2;
+      a7  += 2;
+      a8  += 2;
+      a9  += 2;
+      a10  += 2;
+      a11  += 2;
+      a12  += 2;
+      a13  += 2;
+      a14  += 2;
+      a15  += 2;
+      a16  += 2;
+      b  += 32;
+      ii ++;
+    }
+
+    jj += 16;
+    j --;
+  }
+  /* Leftover columns: the same scheme is repeated with panel widths 8, 4, 2 and 1. */
+  if (n & 8) {
+    a1  = a +  0 * lda;
+    a2  = a +  1 * lda;
+    a3  = a +  2 * lda;
+    a4  = a +  3 * lda;
+    a5  = a +  4 * lda;
+    a6  = a +  5 * lda;
+    a7  = a +  6 * lda;
+    a8  = a +  7 * lda;
+
+    a += 8 * lda;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+
+      if ((ii >= jj ) && (ii - jj < 8)) {
+	for (k = 0; k < ii - jj; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * lda + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * lda + 1);
+	}
+	data1 = *(a1 + (ii - jj) * lda + 0);
+	data2 = *(a1 + (ii - jj) * lda + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+      }
+
+      if (ii - jj >= 8) {
+	*(b +  0) = *(a1  +  0);
+	*(b +  1) = *(a1  +  1);
+	*(b +  2) = *(a2  +  0);
+	*(b +  3) = *(a2  +  1);
+	*(b +  4) = *(a3  +  0);
+	*(b +  5) = *(a3  +  1);
+	*(b +  6) = *(a4  +  0);
+	*(b +  7) = *(a4  +  1);
+	*(b +  8) = *(a5  +  0);
+	*(b +  9) = *(a5  +  1);
+	*(b + 10) = *(a6  +  0);
+	*(b + 11) = *(a6  +  1);
+	*(b + 12) = *(a7  +  0);
+	*(b + 13) = *(a7  +  1);
+	*(b + 14) = *(a8  +  0);
+	*(b + 15) = *(a8  +  1);
+      }
+
+      a1  += 2;
+      a2  += 2;
+      a3  += 2;
+      a4  += 2;
+      a5  += 2;
+      a6  += 2;
+      a7  += 2;
+      a8  += 2;
+      b  += 16;
+      ii ++;
+    }
+
+    jj += 8;
+  }
+
+  if (n & 4) {
+
+    a1  = a +  0 * lda;
+    a2  = a +  1 * lda;
+    a3  = a +  2 * lda;
+    a4  = a +  3 * lda;
+    a += 4 * lda;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+
+      if ((ii >= jj ) && (ii - jj < 4)) {
+	for (k = 0; k < ii - jj; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * lda + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * lda + 1);
+	}
+	data1 = *(a1 + (ii - jj) * lda + 0);
+	data2 = *(a1 + (ii - jj) * lda + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+      }
+
+      if (ii - jj >= 4) {
+	*(b +  0) = *(a1  +  0);
+	*(b +  1) = *(a1  +  1);
+	*(b +  2) = *(a2  +  0);
+	*(b +  3) = *(a2  +  1);
+	*(b +  4) = *(a3  +  0);
+	*(b +  5) = *(a3  +  1);
+	*(b +  6) = *(a4  +  0);
+	*(b +  7) = *(a4  +  1);
+      }
+
+      a1  += 2;
+      a2  += 2;
+      a3  += 2;
+      a4  += 2;
+      b  += 8;
+      ii ++;
+    }
+
+    jj += 4;
+  }
+
+  if (n & 2) {
+
+    a1  = a +  0 * lda;
+    a2  = a +  1 * lda;
+    a += 2 * lda;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+
+      if ((ii >= jj ) && (ii - jj < 2)) {
+	for (k = 0; k < ii - jj; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * lda + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * lda + 1);
+	}
+	data1 = *(a1 + (ii - jj) * lda + 0);
+	data2 = *(a1 + (ii - jj) * lda + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+      }
+
+      if (ii - jj >= 2) {
+	*(b +  0) = *(a1  +  0);
+	*(b +  1) = *(a1  +  1);
+	*(b +  2) = *(a2  +  0);
+	*(b +  3) = *(a2  +  1);
+      }
+
+      a1  += 2;
+      a2  += 2;
+      b  += 4;
+      ii ++;
+    }
+
+    jj += 2;
+  }
+
+  if (n & 1) {
+
+    a1  = a +  0 * lda;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+      /* The condition below only holds for ii == jj, so the k loop inside it never iterates. */
+      if ((ii >= jj ) && (ii - jj < 1)) {
+	for (k = 0; k < ii - jj; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * lda + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * lda + 1);
+	}
+	data1 = *(a1 + (ii - jj) * lda + 0);
+	data2 = *(a1 + (ii - jj) * lda + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+      }
+
+      if (ii - jj >= 1) {
+	*(b +  0) = *(a1  +  0);
+	*(b +  1) = *(a1  +  1);
+      }
+
+      a1  += 2;
+      b  += 2;
+      ii ++;
+    }
+  }
+
+  return 0;
+}
diff --git a/kernel/generic/ztrsm_ltcopy_16.c b/kernel/generic/ztrsm_ltcopy_16.c
new file mode 100644
index 000000000..e9aeae1ad
--- /dev/null
+++ b/kernel/generic/ztrsm_ltcopy_16.c
@@ -0,0 +1,264 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+  /* Packs a into b in panels of 16, then 8, 4, 2 and 1 consecutive elements, m steps of stride lda each; returns 0. */
+  BLASLONG i, ii, j, jj, k;
+  /* a1 walks the current panel; data1/data2 hold the two FLOATs of a diagonal element (presumably re and im). */
+  FLOAT *a1;
+  FLOAT data1, data2;
+  /* Every element occupies two FLOATs, so lda is doubled to give the stride between steps in FLOATs. */
+  lda *= 2;
+  jj = offset;  /* jj: step index at which the current panel's diagonal block begins */
+
+  j = (n >> 4);
+  while (j > 0){
+    /* This panel starts at a; a then moves on by 16 elements (32 FLOATs) to the next panel. */
+    a1 = a;
+    a += 32;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+      /* Step inside the diagonal block: diagonal goes through compinv, elements after it (k > ii - jj) are copied. */
+      if ((ii >= jj ) && (ii - jj < 16)) {
+
+	data1 = *(a1 + (ii - jj) * 2 + 0);
+	data2 = *(a1 + (ii - jj) * 2 + 1);
+	/* NOTE(review): compinv is defined outside this block; presumably it stores the inverted diagonal -- confirm. */
+	compinv(b +  (ii - jj) * 2, data1, data2);
+
+	for (k = ii - jj + 1; k < 16; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * 2 + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * 2 + 1);
+	}
+
+      }
+      /* Step before the diagonal block (ii < jj): copy all 16 elements. Steps with ii - jj >= 16 write nothing. */
+      if (ii - jj < 0) {
+	*(b +  0) = *(a1 +  0);
+	*(b +  1) = *(a1 +  1);
+	*(b +  2) = *(a1 +  2);
+	*(b +  3) = *(a1 +  3);
+	*(b +  4) = *(a1 +  4);
+	*(b +  5) = *(a1 +  5);
+	*(b +  6) = *(a1 +  6);
+	*(b +  7) = *(a1 +  7);
+	*(b +  8) = *(a1 +  8);
+	*(b +  9) = *(a1 +  9);
+	*(b + 10) = *(a1 + 10);
+	*(b + 11) = *(a1 + 11);
+	*(b + 12) = *(a1 + 12);
+	*(b + 13) = *(a1 + 13);
+	*(b + 14) = *(a1 + 14);
+	*(b + 15) = *(a1 + 15);
+  *(b + 16) = *(a1 + 16);
+	*(b + 17) = *(a1 + 17);
+	*(b + 18) = *(a1 + 18);
+	*(b + 19) = *(a1 + 19);
+	*(b + 20) = *(a1 + 20);
+	*(b + 21) = *(a1 + 21);
+	*(b + 22) = *(a1 + 22);
+	*(b + 23) = *(a1 + 23);
+	*(b + 24) = *(a1 + 24);
+	*(b + 25) = *(a1 + 25);
+	*(b + 26) = *(a1 + 26);
+	*(b + 27) = *(a1 + 27);
+	*(b + 28) = *(a1 + 28);
+	*(b + 29) = *(a1 + 29);
+	*(b + 30) = *(a1 + 30);
+	*(b + 31) = *(a1 + 31);
+      }
+      /* b advances a full packed row whether or not it was written; a1 moves on by one lda stride. */
+      b  += 32;
+      a1 += lda;
+      ii ++;
+    }
+
+    jj += 16;
+    j --;
+  }
+  /* Leftover columns: the same scheme is repeated with panel widths 8, 4, 2 and 1. */
+  j = (n & 8);
+  if (j > 0) {
+    a1 = a;
+    a += 16;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+
+      if ((ii >= jj ) && (ii - jj < 8)) {
+
+	data1 = *(a1 + (ii - jj) * 2 + 0);
+	data2 = *(a1 + (ii - jj) * 2 + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+
+	for (k = ii - jj + 1; k < 8; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * 2 + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * 2 + 1);
+	}
+
+      }
+
+      if (ii - jj < 0) {
+	*(b +  0) = *(a1 +  0);
+	*(b +  1) = *(a1 +  1);
+	*(b +  2) = *(a1 +  2);
+	*(b +  3) = *(a1 +  3);
+	*(b +  4) = *(a1 +  4);
+	*(b +  5) = *(a1 +  5);
+	*(b +  6) = *(a1 +  6);
+	*(b +  7) = *(a1 +  7);
+	*(b +  8) = *(a1 +  8);
+	*(b +  9) = *(a1 +  9);
+	*(b + 10) = *(a1 + 10);
+	*(b + 11) = *(a1 + 11);
+	*(b + 12) = *(a1 + 12);
+	*(b + 13) = *(a1 + 13);
+	*(b + 14) = *(a1 + 14);
+	*(b + 15) = *(a1 + 15);
+      }
+
+      b  += 16;
+      a1 += lda;
+      ii ++;
+    }
+
+    jj += 8;
+  }
+
+  j = (n & 4);
+  if (j > 0) {
+
+    a1 = a;
+    a += 8;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+
+      if ((ii >= jj ) && (ii - jj < 4)) {
+
+	data1 = *(a1 + (ii - jj) * 2 + 0);
+	data2 = *(a1 + (ii - jj) * 2 + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+
+	for (k = ii - jj + 1; k < 4; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * 2 + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * 2 + 1);
+	}
+
+      }
+
+      if (ii - jj < 0) {
+	*(b +  0) = *(a1 +  0);
+	*(b +  1) = *(a1 +  1);
+	*(b +  2) = *(a1 +  2);
+	*(b +  3) = *(a1 +  3);
+	*(b +  4) = *(a1 +  4);
+	*(b +  5) = *(a1 +  5);
+	*(b +  6) = *(a1 +  6);
+	*(b +  7) = *(a1 +  7);
+      }
+
+      b  += 8;
+      a1 += lda;
+      ii ++;
+    }
+
+    jj += 4;
+  }
+
+  j = (n & 2);
+  if (j > 0) {
+
+    a1 = a;
+    a += 4;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+
+      if ((ii >= jj ) && (ii - jj < 2)) {
+
+	data1 = *(a1 + (ii - jj) * 2 + 0);
+	data2 = *(a1 + (ii - jj) * 2 + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+
+	for (k = ii - jj + 1; k < 2; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * 2 + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * 2 + 1);
+	}
+
+      }
+
+      if (ii - jj < 0) {
+	*(b +  0) = *(a1 +  0);
+	*(b +  1) = *(a1 +  1);
+	*(b +  2) = *(a1 +  2);
+	*(b +  3) = *(a1 +  3);
+      }
+
+      b  += 4;
+      a1 += lda;
+      ii ++;
+    }
+
+    jj += 2;
+  }
+
+  j = (n & 1);
+  if (j > 0) {
+
+    a1 = a;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+      /* Width-1 panel: only the step with ii == jj touches the diagonal; there is nothing after it to copy. */
+      if ((ii >= jj ) && (ii - jj < 1)) {
+	data1 = *(a1 + (ii - jj) * 2 + 0);
+	data2 = *(a1 + (ii - jj) * 2 + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+      }
+
+      if (ii - jj < 0) {
+	*(b +  0) = *(a1 +  0);
+	*(b +  1) = *(a1 +  1);
+      }
+
+      b  += 2;
+      a1 += lda;
+      ii ++;
+    }
+  }
+
+  return 0;
+}
diff --git a/kernel/generic/ztrsm_uncopy_16.c b/kernel/generic/ztrsm_uncopy_16.c
new file mode 100644
index 000000000..e84d96891
--- /dev/null
+++ b/kernel/generic/ztrsm_uncopy_16.c
@@ -0,0 +1,313 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+  /* Packs m rows by n columns of a into b, one panel of 16/8/4/2/1 columns at a time; always returns 0. */
+  BLASLONG i, ii, j, jj, k;
+  /* a1..a16: read cursors, one per column (stride lda) of the panel being packed. */
+  FLOAT *a1, *a2,  *a3,  *a4,  *a5,  *a6,  *a7,  *a8;
+  FLOAT *a9, *a10, *a11, *a12, *a13, *a14, *a15, *a16;
+  /* data1/data2: the two FLOATs of the entry that is passed to compinv(). */
+  FLOAT data1, data2;
+  /* Each entry occupies two consecutive FLOATs, so the column stride is doubled. */
+  lda *= 2;
+  jj = offset;
+  /* jj: first row of the current panel that receives a compinv() entry; grows by the panel width. */
+  j = (n >> 4);
+  while (j > 0){
+
+    a1  = a +  0 * lda;
+    a2  = a +  1 * lda;
+    a3  = a +  2 * lda;
+    a4  = a +  3 * lda;
+    a5  = a +  4 * lda;
+    a6  = a +  5 * lda;
+    a7  = a +  6 * lda;
+    a8  = a +  7 * lda;
+    a9  = a +  8 * lda;
+    a10 = a +  9 * lda;
+    a11 = a + 10 * lda;
+    a12 = a + 11 * lda;
+    a13 = a + 12 * lda;
+    a14 = a + 13 * lda;
+    a15 = a + 14 * lda;
+    a16 = a + 15 * lda;
+
+    a += 16 * lda;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+      /* Rows jj..jj+15: slot (ii - jj) is produced by compinv(), slots to its right are plain copies. */
+      if ((ii >= jj ) && (ii - jj < 16)) {
+	/* a1 has advanced ii rows, so a1 + k * lda is row ii of panel column k. */
+	data1 = *(a1 + (ii - jj) * lda + 0);
+	data2 = *(a1 + (ii - jj) * lda + 1);
+	/* compinv() comes from outside this file; presumably it stores the inverse of (data1, data2) - confirm. */
+	compinv(b +  (ii - jj) * 2, data1, data2);
+
+	for (k = ii - jj + 1; k < 16; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * lda + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * lda + 1);
+	}
+      }
+      /* Rows before jj: copy one entry from each of the 16 columns. */
+      if (ii - jj < 0) {
+	*(b +  0) = *(a1  +  0);
+	*(b +  1) = *(a1  +  1);
+	*(b +  2) = *(a2  +  0);
+	*(b +  3) = *(a2  +  1);
+	*(b +  4) = *(a3  +  0);
+	*(b +  5) = *(a3  +  1);
+	*(b +  6) = *(a4  +  0);
+	*(b +  7) = *(a4  +  1);
+	*(b +  8) = *(a5  +  0);
+	*(b +  9) = *(a5  +  1);
+	*(b + 10) = *(a6  +  0);
+	*(b + 11) = *(a6  +  1);
+	*(b + 12) = *(a7  +  0);
+	*(b + 13) = *(a7  +  1);
+	*(b + 14) = *(a8  +  0);
+	*(b + 15) = *(a8  +  1);
+  *(b + 16) = *(a9  +  0);
+	*(b + 17) = *(a9  +  1);
+	*(b + 18) = *(a10  +  0);
+	*(b + 19) = *(a10  +  1);
+	*(b + 20) = *(a11  +  0);
+	*(b + 21) = *(a11  +  1);
+	*(b + 22) = *(a12  +  0);
+	*(b + 23) = *(a12  +  1);
+	*(b + 24) = *(a13  +  0);
+	*(b + 25) = *(a13  +  1);
+	*(b + 26) = *(a14  +  0);
+	*(b + 27) = *(a14  +  1);
+	*(b + 28) = *(a15  +  0);
+	*(b + 29) = *(a15  +  1);
+	*(b + 30) = *(a16  +  0);
+	*(b + 31) = *(a16  +  1);
+      }
+      /* Slots not written above are left untouched; b still advances by a full row of 16 entries. */
+      a1  += 2;
+      a2  += 2;
+      a3  += 2;
+      a4  += 2;
+      a5  += 2;
+      a6  += 2;
+      a7  += 2;
+      a8  += 2;
+      a9  += 2;
+      a10  += 2;
+      a11  += 2;
+      a12  += 2;
+      a13  += 2;
+      a14  += 2;
+      a15  += 2;
+      a16  += 2;
+      b  += 32;
+      ii ++;
+    }
+
+    jj += 16;
+    j --;
+  }
+  /* Leftover columns: the same scheme for panel widths 8, 4, 2 and 1, chosen by the bits of n. */
+  if (n & 8) {
+    a1  = a +  0 * lda;
+    a2  = a +  1 * lda;
+    a3  = a +  2 * lda;
+    a4  = a +  3 * lda;
+    a5  = a +  4 * lda;
+    a6  = a +  5 * lda;
+    a7  = a +  6 * lda;
+    a8  = a +  7 * lda;
+
+    a += 8 * lda;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+      /* Rows jj..jj+7: compinv() at slot (ii - jj), plain copies to its right. */
+      if ((ii >= jj ) && (ii - jj < 8)) {
+
+	data1 = *(a1 + (ii - jj) * lda + 0);
+	data2 = *(a1 + (ii - jj) * lda + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+
+	for (k = ii - jj + 1; k < 8; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * lda + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * lda + 1);
+	}
+    }
+      /* Rows before jj: copy one entry from each of the 8 columns. */
+      if (ii - jj < 0) {
+	*(b +  0) = *(a1  +  0);
+	*(b +  1) = *(a1  +  1);
+	*(b +  2) = *(a2  +  0);
+	*(b +  3) = *(a2  +  1);
+	*(b +  4) = *(a3  +  0);
+	*(b +  5) = *(a3  +  1);
+	*(b +  6) = *(a4  +  0);
+	*(b +  7) = *(a4  +  1);
+	*(b +  8) = *(a5  +  0);
+	*(b +  9) = *(a5  +  1);
+	*(b + 10) = *(a6  +  0);
+	*(b + 11) = *(a6  +  1);
+	*(b + 12) = *(a7  +  0);
+	*(b + 13) = *(a7  +  1);
+	*(b + 14) = *(a8  +  0);
+	*(b + 15) = *(a8  +  1);
+      }
+
+      a1  += 2;
+      a2  += 2;
+      a3  += 2;
+      a4  += 2;
+      a5  += 2;
+      a6  += 2;
+      a7  += 2;
+      a8  += 2;
+      b  += 16;
+      ii ++;
+    }
+
+    jj += 8;
+  }
+
+  if (n & 4) {
+    /* Panel of 4 columns. */
+    a1  = a +  0 * lda;
+    a2  = a +  1 * lda;
+    a3  = a +  2 * lda;
+    a4  = a +  3 * lda;
+    a += 4 * lda;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+      /* Rows jj..jj+3: compinv() at slot (ii - jj), plain copies to its right. */
+      if ((ii >= jj ) && (ii - jj < 4)) {
+	data1 = *(a1 + (ii - jj) * lda + 0);
+	data2 = *(a1 + (ii - jj) * lda + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+
+	for (k = ii - jj + 1; k < 4; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * lda + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * lda + 1);
+	}
+      }
+      /* Rows before jj: copy one entry from each of the 4 columns. */
+      if (ii - jj < 0) {
+	*(b +  0) = *(a1  +  0);
+	*(b +  1) = *(a1  +  1);
+	*(b +  2) = *(a2  +  0);
+	*(b +  3) = *(a2  +  1);
+	*(b +  4) = *(a3  +  0);
+	*(b +  5) = *(a3  +  1);
+	*(b +  6) = *(a4  +  0);
+	*(b +  7) = *(a4  +  1);
+      }
+
+      a1  += 2;
+      a2  += 2;
+      a3  += 2;
+      a4  += 2;
+      b  += 8;
+      ii ++;
+    }
+
+    jj += 4;
+  }
+
+  if (n & 2) {
+    /* Panel of 2 columns. */
+    a1  = a +  0 * lda;
+    a2  = a +  1 * lda;
+    a += 2 * lda;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+      /* Rows jj..jj+1: compinv() at slot (ii - jj), plain copy to its right. */
+      if ((ii >= jj ) && (ii - jj < 2)) {
+	data1 = *(a1 + (ii - jj) * lda + 0);
+	data2 = *(a1 + (ii - jj) * lda + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+	for (k = ii - jj + 1; k < 2; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * lda + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * lda + 1);
+	}
+      }
+      /* Rows before jj: copy one entry from each of the 2 columns. */
+      if (ii - jj < 0) {
+	*(b +  0) = *(a1  +  0);
+	*(b +  1) = *(a1  +  1);
+	*(b +  2) = *(a2  +  0);
+	*(b +  3) = *(a2  +  1);
+      }
+
+      a1  += 2;
+      a2  += 2;
+      b  += 4;
+      ii ++;
+    }
+
+    jj += 2;
+  }
+
+  if (n & 1) {
+    /* Final single column; jj is not advanced afterwards because no panel follows. */
+    a1  = a +  0 * lda;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+      /* Only row ii == jj satisfies this test, so ii - jj is 0 inside the branch. */
+      if ((ii >= jj ) && (ii - jj < 1)) {
+	data1 = *(a1 + (ii - jj) * lda + 0);
+	data2 = *(a1 + (ii - jj) * lda + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+	for (k = ii - jj + 1; k < 1; k ++) {  /* never iterates: k starts at 1 */
+	  *(b +  k * 2 + 0) = *(a1 +  k * lda + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * lda + 1);
+	}
+      }
+      /* Rows before jj: copy the single entry. */
+      if (ii - jj < 0) {
+	*(b +  0) = *(a1  +  0);
+	*(b +  1) = *(a1  +  1);
+      }
+
+      a1  += 2;
+      b  += 2;
+      ii ++;
+    }
+  }
+
+  return 0;
+}
diff --git a/kernel/generic/ztrsm_utcopy_1.c b/kernel/generic/ztrsm_utcopy_1.c
index 08f85e891..5833a64ef 100644
--- a/kernel/generic/ztrsm_utcopy_1.c
+++ b/kernel/generic/ztrsm_utcopy_1.c
@@ -43,7 +43,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
 
   BLASLONG i, ii, j, jj;
 
-  FLOAT data01, data02;
+  FLOAT data01=0.0, data02=0.0;
   FLOAT *a1;
 
   lda *= 2;
diff --git a/kernel/generic/ztrsm_utcopy_16.c b/kernel/generic/ztrsm_utcopy_16.c
new file mode 100644
index 000000000..efcea5c3f
--- /dev/null
+++ b/kernel/generic/ztrsm_utcopy_16.c
@@ -0,0 +1,261 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+  /* Packs m lda-strided lines of a into b in panels of 16/8/4/2/1 contiguous entries; always returns 0. */
+  BLASLONG i, ii, j, jj, k;
+  /* a1: start of the current line within the panel; data1/data2: the two FLOATs passed to compinv(). */
+  FLOAT *a1, data1, data2;
+  /* Each entry occupies two consecutive FLOATs, so the line stride is doubled. */
+  lda *= 2;
+
+  jj = offset;
+  /* jj: first line of the current panel that receives a compinv() entry; grows by the panel width. */
+  j = (n >> 4);
+  while (j > 0){
+    /* A panel is 16 contiguous entries (32 FLOATs) wide; a moves on to the next panel. */
+    a1 = a;
+    a += 32;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+      /* Lines jj..jj+15: copy the (ii - jj) entries on the left, then compinv() at slot (ii - jj). */
+      if ((ii >= jj ) && (ii - jj < 16)) {
+	for (k = 0; k < ii - jj; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * 2 + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * 2 + 1);
+	}
+
+	data1 = *(a1 + (ii - jj) * 2 + 0);
+	data2 = *(a1 + (ii - jj) * 2 + 1);
+	/* compinv() comes from outside this file; presumably it stores the inverse of (data1, data2) - confirm. */
+	compinv(b +  (ii - jj) * 2, data1, data2);
+      }
+      /* Lines at or beyond jj+16: copy all 16 entries (32 FLOATs) unchanged. */
+      if (ii - jj >= 16) {
+	*(b +  0) = *(a1 +  0);
+	*(b +  1) = *(a1 +  1);
+	*(b +  2) = *(a1 +  2);
+	*(b +  3) = *(a1 +  3);
+	*(b +  4) = *(a1 +  4);
+	*(b +  5) = *(a1 +  5);
+	*(b +  6) = *(a1 +  6);
+	*(b +  7) = *(a1 +  7);
+	*(b +  8) = *(a1 +  8);
+	*(b +  9) = *(a1 +  9);
+	*(b + 10) = *(a1 + 10);
+	*(b + 11) = *(a1 + 11);
+	*(b + 12) = *(a1 + 12);
+	*(b + 13) = *(a1 + 13);
+	*(b + 14) = *(a1 + 14);
+	*(b + 15) = *(a1 + 15);
+  *(b + 16) = *(a1 + 16);
+	*(b + 17) = *(a1 + 17);
+	*(b + 18) = *(a1 + 18);
+	*(b + 19) = *(a1 + 19);
+	*(b + 20) = *(a1 + 20);
+	*(b + 21) = *(a1 + 21);
+	*(b + 22) = *(a1 + 22);
+	*(b + 23) = *(a1 + 23);
+	*(b + 24) = *(a1 + 24);
+	*(b + 25) = *(a1 + 25);
+	*(b + 26) = *(a1 + 26);
+	*(b + 27) = *(a1 + 27);
+	*(b + 28) = *(a1 + 28);
+	*(b + 29) = *(a1 + 29);
+	*(b + 30) = *(a1 + 30);
+	*(b + 31) = *(a1 + 31);
+      }
+      /* Lines before jj write nothing; b still advances by a full row of 16 entries. */
+      b  += 32;
+      a1 += lda;
+      ii ++;
+    }
+
+    jj += 16;
+    j --;
+  }
+  /* Leftover entries: the same scheme for panel widths 8, 4, 2 and 1, chosen by the bits of n. */
+  j = (n & 8);
+  if (j > 0) {
+    a1 = a;
+    a += 16;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+      /* Lines jj..jj+7: left part copied, compinv() at slot (ii - jj). */
+      if ((ii >= jj ) && (ii - jj < 8)) {
+	for (k = 0; k < ii - jj; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * 2 + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * 2 + 1);
+	}
+
+	data1 = *(a1 + (ii - jj) * 2 + 0);
+	data2 = *(a1 + (ii - jj) * 2 + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+      }
+      /* Lines at or beyond jj+8: copy all 8 entries unchanged. */
+      if (ii - jj >= 8) {
+	*(b +  0) = *(a1 +  0);
+	*(b +  1) = *(a1 +  1);
+	*(b +  2) = *(a1 +  2);
+	*(b +  3) = *(a1 +  3);
+	*(b +  4) = *(a1 +  4);
+	*(b +  5) = *(a1 +  5);
+	*(b +  6) = *(a1 +  6);
+	*(b +  7) = *(a1 +  7);
+	*(b +  8) = *(a1 +  8);
+	*(b +  9) = *(a1 +  9);
+	*(b + 10) = *(a1 + 10);
+	*(b + 11) = *(a1 + 11);
+	*(b + 12) = *(a1 + 12);
+	*(b + 13) = *(a1 + 13);
+	*(b + 14) = *(a1 + 14);
+	*(b + 15) = *(a1 + 15);
+      }
+
+      b  += 16;
+      a1 += lda;
+      ii ++;
+    }
+
+    jj += 8;
+  }
+
+  j = (n & 4);
+  if (j > 0) {
+    /* Panel of 4 entries (8 FLOATs). */
+    a1 = a;
+    a += 8;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+      /* Lines jj..jj+3: left part copied, compinv() at slot (ii - jj). */
+      if ((ii >= jj ) && (ii - jj < 4)) {
+	for (k = 0; k < ii - jj; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * 2 + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * 2 + 1);
+	}
+
+	data1 = *(a1 + (ii - jj) * 2 + 0);
+	data2 = *(a1 + (ii - jj) * 2 + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+      }
+      /* Lines at or beyond jj+4: copy all 4 entries unchanged. */
+      if (ii - jj >= 4) {
+	*(b +  0) = *(a1 +  0);
+	*(b +  1) = *(a1 +  1);
+	*(b +  2) = *(a1 +  2);
+	*(b +  3) = *(a1 +  3);
+	*(b +  4) = *(a1 +  4);
+	*(b +  5) = *(a1 +  5);
+	*(b +  6) = *(a1 +  6);
+	*(b +  7) = *(a1 +  7);
+      }
+
+      b  += 8;
+      a1 += lda;
+      ii ++;
+    }
+
+    jj += 4;
+  }
+
+  j = (n & 2);
+  if (j > 0) {
+    /* Panel of 2 entries (4 FLOATs). */
+    a1 = a;
+    a += 4;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+      /* Lines jj..jj+1: left part copied, compinv() at slot (ii - jj). */
+      if ((ii >= jj ) && (ii - jj < 2)) {
+	for (k = 0; k < ii - jj; k ++) {
+	  *(b +  k * 2 + 0) = *(a1 +  k * 2 + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * 2 + 1);
+	}
+
+	data1 = *(a1 + (ii - jj) * 2 + 0);
+	data2 = *(a1 + (ii - jj) * 2 + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+      }
+      /* Lines at or beyond jj+2: copy both entries unchanged. */
+      if (ii - jj >= 2) {
+	*(b +  0) = *(a1 +  0);
+	*(b +  1) = *(a1 +  1);
+	*(b +  2) = *(a1 +  2);
+	*(b +  3) = *(a1 +  3);
+      }
+
+      b  += 4;
+      a1 += lda;
+      ii ++;
+    }
+
+    jj += 2;
+  }
+
+  j = (n & 1);
+  if (j > 0) {
+    /* Final single entry; jj is not advanced afterwards because no panel follows. */
+    a1 = a;
+    ii = 0;
+
+    for (i = 0; i < m; i++) {
+      /* Only line ii == jj satisfies this test, so ii - jj is 0 inside the branch. */
+      if ((ii >= jj ) && (ii - jj < 1)) {
+	for (k = 0; k < ii - jj; k ++) {  /* never iterates: the bound is 0 */
+	  *(b +  k * 2 + 0) = *(a1 +  k * 2 + 0);
+	  *(b +  k * 2 + 1) = *(a1 +  k * 2 + 1);
+	}
+
+	data1 = *(a1 + (ii - jj) * 2 + 0);
+	data2 = *(a1 + (ii - jj) * 2 + 1);
+
+	compinv(b +  (ii - jj) * 2, data1, data2);
+      }
+      /* Lines after jj: copy the single entry unchanged. */
+      if (ii - jj >= 1) {
+	*(b +  0) = *(a1 +  0);
+	*(b +  1) = *(a1 +  1);
+      }
+
+      b  += 2;
+      a1 += lda;
+      ii ++;
+    }
+  }
+
+  return 0;
+}
diff --git a/kernel/generic/ztrsm_utcopy_2.c b/kernel/generic/ztrsm_utcopy_2.c
index 387bb2532..bc495f7c6 100644
--- a/kernel/generic/ztrsm_utcopy_2.c
+++ b/kernel/generic/ztrsm_utcopy_2.c
@@ -47,6 +47,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
   FLOAT data05, data06, data07, data08;
   FLOAT *a1, *a2;
 
+  data01=data02=data07=data08=0.0;
   lda *= 2;
 
   jj = offset;
diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000
new file mode 100644
index 000000000..5b54a2ada
--- /dev/null
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@@ -0,0 +1,140 @@
+ifndef NO_LSX
+# Kernel selection below is skipped entirely when NO_LSX is defined (see the closing endif).
+SDOTKERNEL  = dot_lsx.S
+DSDOTKERNEL = dot_lsx.S
+DDOTKERNEL  = dot_lsx.S
+CDOTKERNEL  = cdot_lsx.S
+ZDOTKERNEL  = cdot_lsx.S
+# Pattern used throughout: S/D variants share one source, C/Z variants share a c-prefixed one.
+SSCALKERNEL  = scal_lsx.S
+DSCALKERNEL  = scal_lsx.S
+CSCALKERNEL  = cscal_lsx.S
+ZSCALKERNEL  = cscal_lsx.S
+
+SAMAXKERNEL =  amax_lsx.S
+DAMAXKERNEL =  amax_lsx.S
+CAMAXKERNEL =  camax_lsx.S
+ZAMAXKERNEL =  camax_lsx.S
+
+SAMINKERNEL =  amin_lsx.S
+DAMINKERNEL =  amin_lsx.S
+CAMINKERNEL =  camin_lsx.S
+ZAMINKERNEL =  camin_lsx.S
+
+SMAXKERNEL  =  max_lsx.S
+DMAXKERNEL  =  max_lsx.S
+
+SMINKERNEL  =  min_lsx.S
+DMINKERNEL  =  min_lsx.S
+
+ISMAXKERNEL =  imax_lsx.S
+IDMAXKERNEL =  imax_lsx.S
+
+ISMINKERNEL =  imin_lsx.S
+IDMINKERNEL =  imin_lsx.S
+
+ISAMAXKERNEL = iamax_lsx.S
+IDAMAXKERNEL = iamax_lsx.S
+ICAMAXKERNEL = icamax_lsx.S
+IZAMAXKERNEL = icamax_lsx.S
+
+ISAMINKERNEL = iamin_lsx.S
+IDAMINKERNEL = iamin_lsx.S
+ICAMINKERNEL = icamin_lsx.S
+IZAMINKERNEL = icamin_lsx.S
+
+SCOPYKERNEL =  copy_lsx.S
+DCOPYKERNEL =  copy_lsx.S
+CCOPYKERNEL =  ccopy_lsx.S
+ZCOPYKERNEL =  ccopy_lsx.S
+
+SSWAPKERNEL =  swap_lsx.S
+DSWAPKERNEL =  swap_lsx.S
+
+SAXPYKERNEL =  axpy_lsx.S
+DAXPYKERNEL =  axpy_lsx.S
+CAXPYKERNEL =  caxpy_lsx.S
+ZAXPYKERNEL =  caxpy_lsx.S
+
+SAXPBYKERNEL = axpby_lsx.S
+DAXPBYKERNEL = axpby_lsx.S
+CAXPBYKERNEL = caxpby_lsx.S
+ZAXPBYKERNEL = caxpby_lsx.S
+
+SSUMKERNEL  =  sum_lsx.S
+DSUMKERNEL  =  sum_lsx.S
+
+SASUMKERNEL =  asum_lsx.S
+DASUMKERNEL =  asum_lsx.S
+CASUMKERNEL =  casum_lsx.S
+ZASUMKERNEL =  casum_lsx.S
+
+SROTKERNEL  =  rot_lsx.S
+DROTKERNEL  =  rot_lsx.S
+CROTKERNEL  =  crot_lsx.S
+ZROTKERNEL  =  crot_lsx.S
+# NRM2 is the exception to the sharing pattern: each of S/D/C/Z has its own source.
+SNRM2KERNEL =  snrm2_lsx.S
+DNRM2KERNEL =  dnrm2_lsx.S
+CNRM2KERNEL =  cnrm2_lsx.S
+ZNRM2KERNEL =  znrm2_lsx.S
+
+CSWAPKERNEL = cswap_lsx.S
+ZSWAPKERNEL = cswap_lsx.S
+
+CSUMKERNEL = csum_lsx.S
+ZSUMKERNEL = csum_lsx.S
+# GEMV: only the S and D variants are assigned here; C and Z follow further down.
+SGEMVNKERNEL = sgemv_n_lsx.S
+SGEMVTKERNEL = sgemv_t_lsx.S
+
+DGEMVNKERNEL = dgemv_n_lsx.S
+DGEMVTKERNEL = dgemv_t_lsx.S
+# DGEMM: 8x4 kernel with 8-wide INCOPY/ITCOPY and 4-wide ONCOPY/OTCOPY sources.
+DGEMMKERNEL    = dgemm_kernel_8x4.S
+DGEMMINCOPY    = dgemm_ncopy_8_lsx.S
+DGEMMITCOPY    = dgemm_tcopy_8_lsx.S
+DGEMMONCOPY    = dgemm_ncopy_4_lsx.S
+DGEMMOTCOPY    = dgemm_tcopy_4_lsx.S
+DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
+DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
+DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+# DTRSM uses the C kernels from ../generic.
+DTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
+
+CGEMVNKERNEL = cgemv_n_4_lsx.S
+CGEMVTKERNEL = cgemv_t_4_lsx.S
+# CGEMM: 8x4 kernel with 8-wide INCOPY/ITCOPY and 4-wide ONCOPY/OTCOPY sources.
+CGEMMKERNEL  = cgemm_kernel_8x4_lsx.S
+CGEMMINCOPY  = cgemm_ncopy_8_lsx.S
+CGEMMITCOPY  = cgemm_tcopy_8_lsx.S
+CGEMMONCOPY  = cgemm_ncopy_4_lsx.S
+CGEMMOTCOPY  = cgemm_tcopy_4_lsx.S
+CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
+CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
+CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+CTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
+
+ZGEMVNKERNEL = zgemv_n_2_lsx.S
+ZGEMVTKERNEL = zgemv_t_2_lsx.S
+# ZGEMM: 4x4 kernel; only ONCOPY/OTCOPY sources are assigned (no INCOPY/ITCOPY).
+ZGEMMKERNEL  = zgemm_kernel_4x4_lsx.S
+ZGEMMONCOPY  = zgemm_ncopy_4_lsx.S
+ZGEMMOTCOPY  = zgemm_tcopy_4_lsx.S
+ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
+# ZTRSM, like DTRSM and CTRSM, uses the C kernels from ../generic.
+ZTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
+endif
diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5
index cda359040..9b55d1bbb 100644
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@@ -1,3 +1,90 @@
+ifndef NO_LASX
+# Assignments added by this patch are guarded by NO_LASX; the matching endif is the last added line.
+SDOTKERNEL  = dot_lasx.S
+DSDOTKERNEL = dot_lasx.S
+DDOTKERNEL  = dot_lasx.S
+CDOTKERNEL  = cdot_lasx.S
+ZDOTKERNEL  = cdot_lasx.S
+
+SSCALKERNEL  = scal_lasx.S
+DSCALKERNEL  = scal_lasx.S
+CSCALKERNEL  = cscal_lasx.S
+ZSCALKERNEL  = cscal_lasx.S
+
+SAMAXKERNEL =  amax_lasx.S
+DAMAXKERNEL =  amax_lasx.S
+CAMAXKERNEL =  camax_lasx.S
+ZAMAXKERNEL =  camax_lasx.S
+
+SAMINKERNEL =  amin_lasx.S
+DAMINKERNEL =  amin_lasx.S
+CAMINKERNEL =  camin_lasx.S
+ZAMINKERNEL =  camin_lasx.S
+# Note: MAX/MIN use the *_lsx.S sources, whereas the neighbouring groups use *_lasx.S.
+SMAXKERNEL  =  max_lsx.S
+DMAXKERNEL =   max_lsx.S
+
+SMINKERNEL =   min_lsx.S
+DMINKERNEL =   min_lsx.S
+
+ISMAXKERNEL =  imax_lasx.S
+IDMAXKERNEL =  imax_lasx.S
+
+ISMINKERNEL =  imin_lasx.S
+IDMINKERNEL =  imin_lasx.S
+
+ISAMAXKERNEL = iamax_lasx.S
+IDAMAXKERNEL = iamax_lasx.S
+ICAMAXKERNEL = icamax_lasx.S
+IZAMAXKERNEL = icamax_lasx.S
+
+ISAMINKERNEL = iamin_lasx.S
+IDAMINKERNEL = iamin_lasx.S
+ICAMINKERNEL = icamin_lasx.S
+IZAMINKERNEL = icamin_lasx.S
+
+SCOPYKERNEL =  copy_lasx.S
+DCOPYKERNEL =  copy_lasx.S
+CCOPYKERNEL =  ccopy_lasx.S
+ZCOPYKERNEL =  ccopy_lasx.S
+
+SSWAPKERNEL =  swap_lasx.S
+DSWAPKERNEL =  swap_lasx.S
+
+SAXPYKERNEL =  axpy_lasx.S
+DAXPYKERNEL =  axpy_lasx.S
+CAXPYKERNEL =  caxpy_lasx.S
+ZAXPYKERNEL =  caxpy_lasx.S
+
+SAXPBYKERNEL = axpby_lasx.S
+DAXPBYKERNEL = axpby_lasx.S
+CAXPBYKERNEL = caxpby_lasx.S
+ZAXPBYKERNEL = caxpby_lasx.S
+
+SSUMKERNEL  =  sum_lasx.S
+DSUMKERNEL  =  sum_lasx.S
+
+SASUMKERNEL =  asum_lasx.S
+DASUMKERNEL =  asum_lasx.S
+CASUMKERNEL =  casum_lasx.S
+ZASUMKERNEL =  casum_lasx.S
+
+SROTKERNEL  =  rot_lasx.S
+DROTKERNEL  =  rot_lasx.S
+CROTKERNEL  =  crot_lasx.S
+ZROTKERNEL  =  crot_lasx.S
+# NRM2: each of S/D/C/Z has its own source instead of sharing one per real/complex pair.
+SNRM2KERNEL =  snrm2_lasx.S
+DNRM2KERNEL =  dnrm2_lasx.S
+CNRM2KERNEL =  cnrm2_lasx.S
+ZNRM2KERNEL =  znrm2_lasx.S
+
+CSWAPKERNEL = cswap_lasx.S
+ZSWAPKERNEL = cswap_lasx.S
+
+CSUMKERNEL = csum_lasx.S
+ZSUMKERNEL = csum_lasx.S
+
 DGEMMKERNEL    = dgemm_kernel_16x4.S
 DGEMMINCOPY    = dgemm_ncopy_16.S
 DGEMMITCOPY    = dgemm_tcopy_16.S
@@ -8,7 +95,65 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
 DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
 DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
-DTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
-DTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
-DTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
-DTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
+DGEMVNKERNEL = dgemv_n_8_lasx.S
+DGEMVTKERNEL = dgemv_t_8_lasx.S
+# SGEMM: 16x8 kernel with 16-wide INCOPY/ITCOPY and 8-wide ONCOPY/OTCOPY sources.
+SGEMMKERNEL  = sgemm_kernel_16x8_lasx.S
+SGEMMINCOPY  = sgemm_ncopy_16_lasx.S
+SGEMMITCOPY  = sgemm_tcopy_16_lasx.S
+SGEMMONCOPY  = sgemm_ncopy_8_lasx.S
+SGEMMOTCOPY  = sgemm_tcopy_8_lasx.S
+SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+SGEMVNKERNEL = sgemv_n_8_lasx.S
+SGEMVTKERNEL = sgemv_t_8_lasx.S
+# CGEMM: 16x4 kernel with 16-wide INCOPY/ITCOPY and 4-wide ONCOPY/OTCOPY sources.
+CGEMMKERNEL  = cgemm_kernel_16x4_lasx.S
+CGEMMINCOPY  = cgemm_ncopy_16_lasx.S
+CGEMMITCOPY  = cgemm_tcopy_16_lasx.S
+CGEMMONCOPY  = cgemm_ncopy_4_lasx.S
+CGEMMOTCOPY  = cgemm_tcopy_4_lasx.S
+CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
+CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
+CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+CGEMVNKERNEL = cgemv_n_8_lasx.S
+CGEMVTKERNEL = cgemv_t_8_lasx.S
+
+CTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
+# ZGEMM: 8x4 kernel with 8-wide INCOPY/ITCOPY and 4-wide ONCOPY/OTCOPY sources.
+ZGEMMKERNEL  = zgemm_kernel_8x4_lasx.S
+ZGEMMINCOPY  = zgemm_ncopy_8_lasx.S
+ZGEMMITCOPY  = zgemm_tcopy_8_lasx.S
+ZGEMMONCOPY  = zgemm_ncopy_4_lasx.S
+ZGEMMOTCOPY  = zgemm_tcopy_4_lasx.S
+ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
+ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
+ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+ZGEMVNKERNEL = zgemv_n_4_lasx.S
+ZGEMVTKERNEL = zgemv_t_4_lasx.S
+
+ZTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
+# DTRSM moves from the ../generic C kernels (removed above) to 16x4 LASX assembly.
+DTRSMKERNEL_LN  = dtrsm_kernel_LN_16x4_lasx.S
+DTRSMKERNEL_LT  = dtrsm_kernel_LT_16x4_lasx.S
+DTRSMKERNEL_RN  = dtrsm_kernel_RN_16x4_lasx.S
+DTRSMKERNEL_RT  = dtrsm_kernel_RT_16x4_lasx.S
+# STRSM, like CTRSM and ZTRSM, uses the C kernels from ../generic.
+STRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
+STRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
+STRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
+STRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
+endif
diff --git a/kernel/loongarch64/KERNEL.generic b/kernel/loongarch64/KERNEL.generic
index b772a6f82..213add9ee 100644
--- a/kernel/loongarch64/KERNEL.generic
+++ b/kernel/loongarch64/KERNEL.generic
@@ -132,12 +132,16 @@ CSWAPKERNEL  = ../arm/zswap.c
 ZSWAPKERNEL  = ../arm/zswap.c
 
 SGEMVNKERNEL = ../arm/gemv_n.c
+ifndef DGEMVNKERNEL
 DGEMVNKERNEL = ../arm/gemv_n.c
+endif
 CGEMVNKERNEL = ../arm/zgemv_n.c
 ZGEMVNKERNEL = ../arm/zgemv_n.c
 
 SGEMVTKERNEL = ../arm/gemv_t.c
+ifndef DGEMVTKERNEL
 DGEMVTKERNEL = ../arm/gemv_t.c
+endif
 CGEMVTKERNEL = ../arm/zgemv_t.c
 ZGEMVTKERNEL = ../arm/zgemv_t.c
 
diff --git a/kernel/loongarch64/amax_lasx.S b/kernel/loongarch64/amax_lasx.S
new file mode 100644
index 000000000..e964d4ddb
--- /dev/null
+++ b/kernel/loongarch64/amax_lasx.S
@@ -0,0 +1,232 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N     $r4
+#define X     $r5
+#define INCX  $r6
+/* N, X, INCX: incoming arguments (first three integer argument registers). I, TEMP: scratch. */
+#define I     $r12
+#define TEMP  $r13
+/* VM0: running magnitude-max (its low element is $f0, the result); VM1/VM2: partials; VX0-VX3: data. */
+#define VM0 $xr0
+#define VM1 $xr1
+#define VM2 $xr2
+#define VX0 $xr3
+#define VX1 $xr4
+#define VX2 $xr5
+#define VX3 $xr6
+/* t1-t4: integer temporaries used to gather strided elements into vector lanes. */
+#define t1 $r14
+#define t2 $r15
+#define t3 $r16
+#define t4 $r17
+
+    PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT   N,     0(N)
+    LDINT   INCX,  0(INCX)
+#endif
+    movgr2fr.d $f0, $r0  /* result = 0, so the early exits below return 0 instead of stale $f0 */
+    bge $r0, N, .L999  /* N <= 0: nothing to scan */
+    bge $r0, INCX, .L999  /* INCX <= 0: not handled, return 0 */
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT  /* TEMP = 1 << BASE_SHIFT, the byte stride of a unit increment */
+    slli.d INCX, INCX, BASE_SHIFT  /* INCX is a byte stride from here on */
+#ifdef DOUBLE
+    xvldrepl.d VM0, X, 0
+#else
+    xvldrepl.w VM0, X, 0
+#endif
+    XVFSUB VM0, VM0, VM0  /* VM0 = x[0] - x[0] in every lane: starting value of the running result */
+    bne INCX, TEMP, .L20  /* non-unit stride takes the gather path */
+
+    srai.d I, N, 4  /* I = N / 16: number of 16-element blocks */
+    bge $r0, I, .L11
+    .align 3
+
+.L10: /* unit stride, 16 elements per iteration */
+#ifdef DOUBLE
+    xvld VX0, X, 0
+    xvld VX1, X, 32
+    xvld VX2, X, 64
+    xvld VX3, X, 96
+    addi.d I, I, -1
+    addi.d X, X, 128
+    XVFMAXA   VM1, VX0, VX1
+    XVFMAXA   VM2, VX2, VX3
+    XVFMAXA   VM0, VM0, VM1
+    XVFMAXA   VM0, VM0, VM2
+#else
+    xvld VX0, X, 0
+    xvld VX1, X, 32
+    addi.d I, I, -1
+    addi.d X, X, 64
+    XVFMAXA   VM1, VX0, VX1
+    XVFMAXA   VM0, VM0, VM1
+#endif
+    blt $r0, I, .L10
+    /* Horizontal reduction: first within each 128-bit half, then across the two halves. */
+#ifdef DOUBLE
+    xvrepl128vei.d VX0, VM0, 0
+    xvrepl128vei.d VX1, VM0, 1
+    XVFMAXA   VM0, VX0, VX1
+#else
+    xvrepl128vei.w VX0, VM0, 0
+    xvrepl128vei.w VX1, VM0, 1
+    xvrepl128vei.w VX2, VM0, 2
+    xvrepl128vei.w VX3, VM0, 3
+    XVFMAXA   VM1, VX0, VX1
+    XVFMAXA   VM2, VX2, VX3
+    XVFMAXA   VM0, VM1, VM2
+#endif
+    xvpermi.q VM1, VM0, 0x1
+    XVFMAXA   VM0, VM0, VM1  /* after this the low element of VM0 ($f0) holds the block result */
+    .align 3
+
+.L11:
+    andi  I, N, 0x0f  /* I = N mod 16: leftover elements */
+    bge $r0, I, .L13
+    .align 3
+
+.L12: /* unit stride: scalar loop over the N mod 16 leftover elements */
+    LD    $f1, X, 0
+    addi.d  I, I, -1
+    addi.d  X, X, SIZE
+    FMAXA $f0, $f0, $f1
+    bnez    I, .L12
+    .align 3
+
+.L13:
+    FABS $f0, $f0  /* the magnitude-max keeps the sign of the winner, so take |.| once at the end */
+    jirl $r0,  $r1, 0x0
+    .align 3
+
+.L20: // INCX!=1
+    srai.d I, N, 3  /* I = N / 8: number of 8-element blocks */
+    bge $r0, I, .L23
+    .align 3
+
+.L21: /* strided: gather 8 elements through integer registers per iteration */
+#ifdef DOUBLE
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvfmaxa.d VM1, VX0, VX1
+    xvfmaxa.d VM0, VM0, VM1
+#else
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VM1, t1, 0
+    xvinsgr2vr.w VM1, t2, 1
+    xvinsgr2vr.w VM1, t3, 2
+    xvinsgr2vr.w VM1, t4, 3
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VM1, t1, 4
+    xvinsgr2vr.w VM1, t2, 5
+    xvinsgr2vr.w VM1, t3, 6
+    xvinsgr2vr.w VM1, t4, 7
+    xvfmaxa.s VM0, VM0, VM1
+#endif
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22: /* same horizontal reduction as on the unit-stride path */
+#ifdef DOUBLE
+    xvrepl128vei.d VX0, VM0, 0
+    xvrepl128vei.d VX1, VM0, 1
+    XVFMAXA   VM0, VX0, VX1
+#else
+    xvrepl128vei.w VX0, VM0, 0
+    xvrepl128vei.w VX1, VM0, 1
+    xvrepl128vei.w VX2, VM0, 2
+    xvrepl128vei.w VX3, VM0, 3
+    XVFMAXA   VM1, VX0, VX1
+    XVFMAXA   VM2, VX2, VX3
+    XVFMAXA   VM0, VM1, VM2
+#endif
+    xvpermi.q VM1, VM0, 1
+    XVFMAXA   VM0, VM0, VM1
+    .align 3
+
+.L23: /* INCX != 1: N mod 8 leftover elements (also reached directly when N < 8) */
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24: /* strided: scalar loop over the N mod 8 leftover elements */
+    LD   $f1, X, 0
+    addi.d  I, I, -1
+    add.d   X, X, INCX
+    FMAXA $f0, $f0, $f1
+    bnez    I, .L24
+    .align 3
+
+.L999: /* common exit; also the target of the N <= 0 and INCX <= 0 guards */
+    FABS $f0, $f0
+    jirl $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/amax_lsx.S b/kernel/loongarch64/amax_lsx.S
new file mode 100644
index 000000000..fb3b77a0e
--- /dev/null
+++ b/kernel/loongarch64/amax_lsx.S
@@ -0,0 +1,231 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N     $r4   // number of elements to scan
+#define X     $r5   // address of the current element
+#define INCX  $r6   // element stride; converted to a byte stride below
+
+#define I     $r12  // loop counter
+#define TEMP  $r13  // size of one element in bytes
+
+#define VM0 $vr0    // running result; scalar code reads it through $f0
+#define VM1 $vr1    // VM1/VM2: partial results
+#define VM2 $vr2
+#define VX0 $vr3    // VX0-VX3: input data
+#define VX1 $vr4
+#define VX2 $vr5
+#define VX3 $vr6
+
+#define t1 $r14     // t1-t4: integer staging registers for strided loads
+#define t2 $r15
+#define t3 $r16
+#define t4 $r17
+
+    PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT   N,     0(N)            // arguments are passed by reference here
+    LDINT   INCX,  0(INCX)
+#endif
+
+    vxor.v VM0, VM0, VM0           // $f0 = 0, so the early exits return 0 instead of stale data
+    bge $r0, N, .L999              // N <= 0: nothing to scan
+    bge $r0, INCX, .L999           // INCX <= 0: not handled by this kernel
+    li.d TEMP, SIZE                // bytes per element (replaces li 1 + slli BASE_SHIFT)
+    slli.d INCX, INCX, BASE_SHIFT  // stride in bytes
+#ifdef DOUBLE
+    vldrepl.d VM0, X, 0
+#else
+    vldrepl.w VM0, X, 0
+#endif
+    VFSUB VM0, VM0, VM0            // X[0] - X[0]: zero for any finite X[0]
+    bne INCX, TEMP, .L20           // non-unit stride: gather path
+
+    srai.d I, N, 3                 // I = N / 8 full blocks
+    bge $r0, I, .L11               // fewer than 8 elements: scalar loop only
+    .align 3
+
+.L10: // unit stride: 8 elements per iteration
+#ifdef DOUBLE
+    vld VX0, X, 0                  // 4 x 16 bytes = 8 doubles
+    vld VX1, X, 16
+    vld VX2, X, 32
+    vld VX3, X, 48
+    addi.d I, I, -1
+    addi.d X, X, 64
+    VFMAXA   VM1, VX0, VX1         // lane-wise larger-magnitude operand
+    VFMAXA   VM2, VX2, VX3
+    VFMAXA   VM0, VM0, VM1         // fold both partial results into VM0
+    VFMAXA   VM0, VM0, VM2
+#else
+    vld VX0, X, 0                  // 2 x 16 bytes = 8 floats
+    vld VX1, X, 16
+    addi.d I, I, -1
+    addi.d X, X, 32
+    VFMAXA   VM1, VX0, VX1
+    VFMAXA   VM0, VM0, VM1
+#endif
+    blt $r0, I, .L10
+
+#ifdef DOUBLE
+    vreplvei.d VX0, VM0, 0         // horizontal reduction: broadcast each lane ...
+    vreplvei.d VX1, VM0, 1
+    VFMAXA   VM0, VX0, VX1         // ... and combine, result in every lane
+#else
+    vreplvei.w VX0, VM0, 0         // horizontal reduction over the 4 float lanes
+    vreplvei.w VX1, VM0, 1
+    vreplvei.w VX2, VM0, 2
+    vreplvei.w VX3, VM0, 3
+    VFMAXA   VM1, VX0, VX1
+    VFMAXA   VM2, VX2, VX3
+    VFMAXA   VM0, VM1, VM2
+#endif
+    .align 3
+
+.L11: // unit stride: remaining N % 8 elements
+    andi I, N, 7
+    bge $r0, I, .L13
+    .align 3
+
+.L12: // scalar loop, one element per iteration
+    LD   $f1, X, 0
+    addi.d  I, I, -1
+    addi.d  X, X, SIZE
+    FMAXA $f0, $f0, $f1            // $f0 carries the vector result from above
+    bnez    I, .L12
+    .align 3
+
+.L13:
+    FABS $f0, $f0                  // return the magnitude
+    jirl $r0,  $r1, 0x0            // return to the address in $r1
+    .align 3
+
+.L20: // INCX != 1: gather elements through integer registers
+    srai.d I, N, 3                 // I = N / 8 full blocks
+    bge $r0, I, .L23
+    .align 3
+
+.L21: // 8 strided elements per iteration
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0         // VX0 = elements 0,1
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0         // VX1 = elements 2,3
+    vinsgr2vr.d VX1, t4, 1
+    vfmaxa.d VM1, VX0, VX1         // partial result for elements 0-3
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0         // VX0 = elements 4,5
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0         // VX1 = elements 6,7
+    vinsgr2vr.d VX1, t4, 1
+    vfmaxa.d VM2, VX0, VX1         // partial result for elements 4-7
+    vfmaxa.d VM1, VM1, VM2
+    vfmaxa.d VM0, VM0, VM1         // fold into the running result
+#else
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0         // VX0 = elements 0-3
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0         // VX1 = elements 4-7
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vfmaxa.s VM1, VX0, VX1
+    vfmaxa.s VM0, VM0, VM1         // fold into the running result
+#endif
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22: // horizontal reduction of VM0 into every lane
+#ifdef DOUBLE
+    vreplvei.d VX0, VM0, 0
+    vreplvei.d VX1, VM0, 1
+    VFMAXA   VM0, VX0, VX1
+#else
+    vreplvei.w VX0, VM0, 0
+    vreplvei.w VX1, VM0, 1
+    vreplvei.w VX2, VM0, 2
+    vreplvei.w VX3, VM0, 3
+    VFMAXA   VM1, VX0, VX1
+    VFMAXA   VM2, VX2, VX3
+    VFMAXA   VM0, VM1, VM2
+#endif
+    .align 3
+
+.L23: // INCX != 1: remaining N % 8 elements (the whole vector when N < 8)
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24: // scalar loop, one strided element per iteration
+    LD   $f1, X, 0
+    addi.d  I, I, -1
+    add.d   X, X, INCX
+    FMAXA $f0, $f0, $f1
+    bnez    I, .L24
+    .align 3
+
+.L999: // common exit, also the target of the early exits at entry
+    FABS $f0, $f0                  // return the magnitude
+    jirl $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/amin_lasx.S b/kernel/loongarch64/amin_lasx.S
new file mode 100644
index 000000000..c91a33006
--- /dev/null
+++ b/kernel/loongarch64/amin_lasx.S
@@ -0,0 +1,231 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N     $r4   // number of elements to scan
+#define X     $r5   // address of the current element
+#define INCX  $r6   // element stride; converted to a byte stride below
+
+#define I     $r12  // loop counter
+#define TEMP  $r13  // size of one element in bytes
+
+#define VM0 $xr0    // running result; scalar code reads it through $f0
+#define VM1 $xr1    // VM1/VM2: partial results
+#define VM2 $xr2
+#define VX0 $xr3    // VX0-VX3: input data
+#define VX1 $xr4
+#define VX2 $xr5
+#define VX3 $xr6
+
+#define t1 $r14     // t1-t4: integer staging registers for strided loads
+#define t2 $r15
+#define t3 $r16
+#define t4 $r17
+
+    PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT   N,     0(N)            // arguments are passed by reference here
+    LDINT   INCX,  0(INCX)
+#endif
+
+    xvxor.v VM0, VM0, VM0          // $f0 = 0, so the early exits return 0 instead of stale data
+    bge $r0, N, .L999              // N <= 0: nothing to scan
+    bge $r0, INCX, .L999           // INCX <= 0: not handled by this kernel
+    li.d TEMP, SIZE                // bytes per element (replaces li 1 + slli BASE_SHIFT)
+    slli.d INCX, INCX, BASE_SHIFT  // stride in bytes
+#ifdef DOUBLE
+    xvldrepl.d VM0, X, 0           // seed every lane of the running minimum with X[0]
+#else
+    xvldrepl.w VM0, X, 0           // seed every lane of the running minimum with X[0]
+#endif
+    bne INCX, TEMP, .L20           // non-unit stride: gather path
+
+    srai.d I, N, 4                 // I = N / 16 full blocks
+    bge $r0, I, .L11               // fewer than 16 elements: scalar loop only
+    .align 3
+
+.L10: // unit stride: 16 elements per iteration
+#ifdef DOUBLE
+    xvld VX0, X, 0                 // 4 x 32 bytes = 16 doubles
+    xvld VX1, X, 32
+    xvld VX2, X, 64
+    xvld VX3, X, 96
+    addi.d I, I, -1
+    addi.d X, X, 128
+    XVFMINA   VM1, VX0, VX1        // lane-wise smaller-magnitude operand
+    XVFMINA   VM2, VX2, VX3
+    XVFMINA   VM0, VM0, VM1        // fold both partial results into VM0
+    XVFMINA   VM0, VM0, VM2
+#else
+    xvld VX0, X, 0                 // 2 x 32 bytes = 16 floats
+    xvld VX1, X, 32
+    addi.d I, I, -1
+    addi.d X, X, 64
+    XVFMINA   VM1, VX0, VX1
+    XVFMINA   VM0, VM0, VM1
+#endif
+    blt $r0, I, .L10
+
+#ifdef DOUBLE
+    xvrepl128vei.d VX0, VM0, 0     // reduce within each 128-bit half
+    xvrepl128vei.d VX1, VM0, 1
+    XVFMINA   VM0, VX0, VX1
+#else
+    xvrepl128vei.w VX0, VM0, 0     // reduce within each 128-bit half
+    xvrepl128vei.w VX1, VM0, 1
+    xvrepl128vei.w VX2, VM0, 2
+    xvrepl128vei.w VX3, VM0, 3
+    XVFMINA   VM1, VX0, VX1
+    XVFMINA   VM2, VX2, VX3
+    XVFMINA   VM0, VM1, VM2
+#endif
+    xvpermi.q VM1, VM0, 0x1        // bring the other 128-bit half alongside ...
+    XVFMINA   VM0, VM0, VM1        // ... and combine the two halves
+    .align 3
+
+.L11: // unit stride: remaining N % 16 elements
+    andi  I, N, 0x0f
+    bge $r0, I, .L13
+    .align 3
+
+.L12: /* 0 < N < 16 */
+    LD    $f1, X, 0
+    addi.d  I, I, -1
+    addi.d  X, X, SIZE
+    FMINA $f0, $f0, $f1            // $f0 carries the vector result from above
+    bnez    I, .L12
+    .align 3
+
+.L13:
+    FABS $f0, $f0                  // return the magnitude
+    jirl $r0,  $r1, 0x0            // return to the address in $r1
+    .align 3
+
+.L20: // INCX != 1: gather elements through integer registers
+    srai.d I, N, 3                 // I = N / 8 full blocks
+    bge $r0, I, .L23
+    .align 3
+
+.L21: // 8 strided elements per iteration
+#ifdef DOUBLE
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0        // VX0 = elements 0-3
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0        // VX1 = elements 4-7
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvfmina.d VM1, VX0, VX1        // min-magnitude, not max: this is the amin kernel
+    xvfmina.d VM0, VM0, VM1        // fold into the running minimum
+#else
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VM1, t1, 0        // VM1 lanes 0-3 = elements 0-3
+    xvinsgr2vr.w VM1, t2, 1
+    xvinsgr2vr.w VM1, t3, 2
+    xvinsgr2vr.w VM1, t4, 3
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VM1, t1, 4        // VM1 lanes 4-7 = elements 4-7
+    xvinsgr2vr.w VM1, t2, 5
+    xvinsgr2vr.w VM1, t3, 6
+    xvinsgr2vr.w VM1, t4, 7
+    xvfmina.s VM0, VM0, VM1        // min-magnitude, not max: this is the amin kernel
+#endif
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22: // horizontal reduction of VM0
+#ifdef DOUBLE
+    xvrepl128vei.d VX0, VM0, 0     // reduce within each 128-bit half
+    xvrepl128vei.d VX1, VM0, 1
+    XVFMINA   VM0, VX0, VX1
+#else
+    xvrepl128vei.w VX0, VM0, 0     // reduce within each 128-bit half
+    xvrepl128vei.w VX1, VM0, 1
+    xvrepl128vei.w VX2, VM0, 2
+    xvrepl128vei.w VX3, VM0, 3
+    XVFMINA   VM1, VX0, VX1
+    XVFMINA   VM2, VX2, VX3
+    XVFMINA   VM0, VM1, VM2
+#endif
+    xvpermi.q VM1, VM0, 1          // bring the other 128-bit half alongside ...
+    XVFMINA   VM0, VM0, VM1        // ... and combine the two halves
+    .align 3
+
+.L23: // INCX != 1: remaining N % 8 elements (the whole vector when N < 8)
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24: /* 0 < N < 8 */
+    LD   $f1, X, 0
+    addi.d  I, I, -1
+    add.d   X, X, INCX
+    FMINA $f0, $f0, $f1
+    bnez    I, .L24
+    .align 3
+
+.L999: // common exit, also the target of the early exits at entry
+    FABS $f0, $f0                  // return the magnitude
+    jirl $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/amin_lsx.S b/kernel/loongarch64/amin_lsx.S
new file mode 100644
index 000000000..47701b6e4
--- /dev/null
+++ b/kernel/loongarch64/amin_lsx.S
@@ -0,0 +1,230 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N     $r4   // number of elements to scan
+#define X     $r5   // address of the current element
+#define INCX  $r6   // element stride; converted to a byte stride below
+
+#define I     $r12  // loop counter
+#define TEMP  $r13  // size of one element in bytes
+
+#define VM0 $vr0    // running result; scalar code reads it through $f0
+#define VM1 $vr1    // VM1/VM2: partial results
+#define VM2 $vr2
+#define VX0 $vr3    // VX0-VX3: input data
+#define VX1 $vr4
+#define VX2 $vr5
+#define VX3 $vr6
+
+#define t1 $r14     // t1-t4: integer staging registers for strided loads
+#define t2 $r15
+#define t3 $r16
+#define t4 $r17
+
+    PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT   N,     0(N)            // arguments are passed by reference here
+    LDINT   INCX,  0(INCX)
+#endif
+
+    vxor.v VM0, VM0, VM0           // $f0 = 0, so the early exits return 0 instead of stale data
+    bge $r0, N, .L999              // N <= 0: nothing to scan
+    bge $r0, INCX, .L999           // INCX <= 0: not handled by this kernel
+    li.d TEMP, SIZE                // bytes per element (replaces li 1 + slli BASE_SHIFT)
+    slli.d INCX, INCX, BASE_SHIFT  // stride in bytes
+#ifdef DOUBLE
+    vldrepl.d VM0, X, 0            // seed every lane of the running minimum with X[0]
+#else
+    vldrepl.w VM0, X, 0            // seed every lane of the running minimum with X[0]
+#endif
+    bne INCX, TEMP, .L20           // non-unit stride: gather path
+
+    srai.d I, N, 3                 // I = N / 8 full blocks
+    bge $r0, I, .L11               // fewer than 8 elements: scalar loop only
+    .align 3
+
+.L10: // unit stride: 8 elements per iteration
+#ifdef DOUBLE
+    vld VX0, X, 0                  // 4 x 16 bytes = 8 doubles
+    vld VX1, X, 16
+    vld VX2, X, 32
+    vld VX3, X, 48
+    addi.d I, I, -1
+    addi.d X, X, 64
+    VFMINA   VM1, VX0, VX1         // lane-wise smaller-magnitude operand
+    VFMINA   VM2, VX2, VX3
+    VFMINA   VM0, VM0, VM1         // fold both partial results into VM0
+    VFMINA   VM0, VM0, VM2
+#else
+    vld VX0, X, 0                  // 2 x 16 bytes = 8 floats
+    vld VX1, X, 16
+    addi.d I, I, -1
+    addi.d X, X, 32
+    VFMINA   VM1, VX0, VX1
+    VFMINA   VM0, VM0, VM1
+#endif
+    blt $r0, I, .L10
+
+#ifdef DOUBLE
+    vreplvei.d VX0, VM0, 0         // horizontal reduction: broadcast each lane ...
+    vreplvei.d VX1, VM0, 1
+    VFMINA   VM0, VX0, VX1         // ... and combine, result in every lane
+#else
+    vreplvei.w VX0, VM0, 0         // horizontal reduction over the 4 float lanes
+    vreplvei.w VX1, VM0, 1
+    vreplvei.w VX2, VM0, 2
+    vreplvei.w VX3, VM0, 3
+    VFMINA   VM1, VX0, VX1
+    VFMINA   VM2, VX2, VX3
+    VFMINA   VM0, VM1, VM2
+#endif
+    .align 3
+
+.L11: // unit stride: remaining N % 8 elements
+    andi I, N, 7
+    bge $r0, I, .L13
+    .align 3
+
+.L12: // scalar loop, one element per iteration
+    LD   $f1, X, 0
+    addi.d  I, I, -1
+    addi.d  X, X, SIZE
+    FMINA $f0, $f0, $f1            // $f0 carries the vector result from above
+    bnez    I, .L12
+    .align 3
+
+.L13:
+    FABS $f0, $f0                  // return the magnitude
+    jirl $r0,  $r1, 0x0            // return to the address in $r1
+    .align 3
+
+.L20: // INCX != 1: gather elements through integer registers
+    srai.d I, N, 3                 // I = N / 8 full blocks
+    bge $r0, I, .L23
+    .align 3
+
+.L21: // 8 strided elements per iteration
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0         // VX0 = elements 0,1
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0         // VX1 = elements 2,3
+    vinsgr2vr.d VX1, t4, 1
+    vfmina.d VM1, VX0, VX1         // min-magnitude, not max: this is the amin kernel
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0         // VX0 = elements 4,5
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0         // VX1 = elements 6,7
+    vinsgr2vr.d VX1, t4, 1
+    vfmina.d VM2, VX0, VX1         // partial minimum for elements 4-7
+    vfmina.d VM1, VM1, VM2
+    vfmina.d VM0, VM0, VM1         // fold into the running minimum
+#else
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0         // VX0 = elements 0-3
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0         // VX1 = elements 4-7
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vfmina.s VM1, VX0, VX1         // min-magnitude, not max: this is the amin kernel
+    vfmina.s VM0, VM0, VM1         // fold into the running minimum
+#endif
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22: // horizontal reduction of VM0 into every lane
+#ifdef DOUBLE
+    vreplvei.d VX0, VM0, 0
+    vreplvei.d VX1, VM0, 1
+    VFMINA   VM0, VX0, VX1
+#else
+    vreplvei.w VX0, VM0, 0
+    vreplvei.w VX1, VM0, 1
+    vreplvei.w VX2, VM0, 2
+    vreplvei.w VX3, VM0, 3
+    VFMINA   VM1, VX0, VX1
+    VFMINA   VM2, VX2, VX3
+    VFMINA   VM0, VM1, VM2
+#endif
+    .align 3
+
+.L23: // INCX != 1: remaining N % 8 elements (the whole vector when N < 8)
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24: // scalar loop, one strided element per iteration
+    LD   $f1, X, 0
+    addi.d  I, I, -1
+    add.d   X, X, INCX
+    FMINA $f0, $f0, $f1
+    bnez    I, .L24
+    .align 3
+
+.L999: // common exit, also the target of the early exits at entry
+    FABS $f0, $f0                  // return the magnitude
+    jirl $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/asum_lasx.S b/kernel/loongarch64/asum_lasx.S
new file mode 100644
index 000000000..9a2c031f3
--- /dev/null
+++ b/kernel/loongarch64/asum_lasx.S
@@ -0,0 +1,257 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4    // number of elements to sum
+#define X      $r5    // address of the current element
+#define INCX   $r6    // element stride; converted to a byte stride below
+#define I      $r17   // loop counter
+#define TEMP   $r18   // size of one element in bytes
+#define t1     $r15   // t1-t4: integer staging registers
+#define t2     $r12
+#define t3     $r13
+#define t4     $r14
+#define VX0    $xr12  // VX0/VX1: input data, then |data|
+#define VX1    $xr13
+#define VX2    $xr14  // VX2/VX3: negated copies of the input
+#define VX3    $xr15
+#define VT0    $xr23  // VT0/VT1: "element < 0" lane masks
+#define VT1    $xr22
+#define res1   $xr16  // accumulator; scalar code uses its low lane as $f16
+#define res2   $xr17  // scratch sum
+#define res0   $xr18  // constant zero
+#define neg1   $xr19  // constant -1.0 in every lane
+
+    PROLOGUE
+    xvxor.v res1, res1, res1       // accumulator = 0 (also the result for the early exits)
+    xvxor.v res2, res2, res2
+    xvxor.v res0, res0, res0
+    bge $r0, N, .L999              // N <= 0: return 0
+    bge $r0, INCX, .L999           // INCX <= 0: return 0
+#ifdef DOUBLE
+    li.d t1, -1
+    xvreplgr2vr.d neg1, t1         // integer -1 in every lane ...
+    xvffint.d.l neg1, neg1         // ... converted to floating-point -1.0
+#else
+    li.w t1, -1
+    xvreplgr2vr.w neg1, t1         // integer -1 in every lane ...
+    xvffint.s.w neg1, neg1         // ... converted to floating-point -1.0
+#endif
+    li.d  TEMP, SIZE
+    slli.d  INCX, INCX, BASE_SHIFT // stride in bytes
+    srai.d I, N, 3                 // I = N / 8 full blocks; used by both paths
+    bne INCX, TEMP, .L20           // non-unit stride: gather path
+    bge $r0, I, .L13               // fewer than 8 elements: scalar loop only
+    .align 3
+
+.L11: // unit stride: 8 elements per iteration
+#ifdef DOUBLE
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvfmul.d VX2, neg1, VX0        // VX2/VX3 = -x
+    xvfmul.d VX3, neg1, VX1
+    xvfcmp.clt.d VT0, VX0, res0    // mask lanes where x < 0
+    xvfcmp.clt.d VT1, VX1, res0
+    xvbitsel.v VX0, VX0, VX2, VT0  // take -x in masked lanes: |x|
+    xvbitsel.v VX1, VX1, VX3, VT1
+    xvfadd.d res2, VX0, VX1
+    xvfadd.d res1, res1, res2      // accumulate lane-wise
+#else
+    xvld VX0, X, 0 * SIZE
+    xvfmul.s VX2, neg1, VX0        // VX2 = -x
+    xvfcmp.clt.s VT0, VX0, res0    // mask lanes where x < 0
+    xvbitsel.v VX0, VX0, VX2, VT0  // take -x in masked lanes: |x|
+    xvfadd.s res1, VX0, res1       // accumulate lane-wise
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L11
+    .align 3
+
+.L12: // horizontal sum of res1 into its lane 0
+#ifdef DOUBLE
+    xvpickve.d VX1, res1, 1
+    xvpickve.d VX2, res1, 2
+    xvpickve.d VX3, res1, 3
+    xvfadd.d res1, VX1, res1
+    xvfadd.d res1, VX2, res1
+    xvfadd.d res1, VX3, res1
+#else
+    xvfadd.s res2, res1, res2      // res2 = copy of res1 (res2 is still zero here)
+    xvpickve.w VX1, res1, 1
+    xvpickve.w VX2, res1, 2
+    xvpickve.w VX3, res1, 3
+    xvfadd.s res1, VX1, res1       // lane 0 += lanes 1..3
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+    xvpickve.w VX0, res2, 4
+    xvpickve.w VX1, res2, 5
+    xvpickve.w VX2, res2, 6
+    xvpickve.w VX3, res2, 7
+    xvfadd.s res1, VX0, res1       // lane 0 += lanes 4..7
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1       // lane 7 (was VX2 again: lane 6 counted twice, lane 7 lost)
+#endif
+    .align 3
+
+.L13: // unit stride: remaining N % 8 elements
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L14: // scalar loop accumulating into $f16 (low lane of res1)
+    LD    $f12, X, 0 * SIZE
+    FABS  $f12, $f12
+    ADD   $f16, $f12,  $f16
+    addi.d I, I, -1
+    addi.d  X, X, SIZE
+    blt $r0, I, .L14
+    b .L999
+    .align 3
+
+.L20: // INCX != 1: gather elements through integer registers
+    bge $r0, I, .L23               // I = N / 8 was computed at entry
+    .align 3
+
+.L21: // 8 strided elements per iteration
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0        // VX0 = elements 0-3
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0        // VX1 = elements 4-7
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvfmul.d VX2, neg1, VX0        // VX2/VX3 = -x
+    xvfmul.d VX3, neg1, VX1
+    xvfcmp.clt.d VT0, VX0, res0    // mask lanes where x < 0
+    xvfcmp.clt.d VT1, VX1, res0
+    xvbitsel.v VX0, VX0, VX2, VT0  // take -x in masked lanes: |x|
+    xvbitsel.v VX1, VX1, VX3, VT1
+    xvfadd.d res2, VX0, VX1
+    xvfadd.d res1, res1, res2      // accumulate lane-wise
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0        // VX0 lanes 0-3 = elements 0-3
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4        // VX0 lanes 4-7 = elements 4-7
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    xvfmul.s VX2, neg1, VX0        // VX2 = -x
+    xvfcmp.clt.s VT0, VX0, res0    // mask lanes where x < 0
+    xvbitsel.v VX0, VX0, VX2, VT0  // take -x in masked lanes: |x|
+    xvfadd.s res1, VX0, res1       // accumulate lane-wise
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22: // horizontal sum of res1 into its lane 0
+#ifdef DOUBLE
+    xvpickve.d VX1, res1, 1
+    xvpickve.d VX2, res1, 2
+    xvpickve.d VX3, res1, 3
+    xvfadd.d res1, VX1, res1
+    xvfadd.d res1, VX2, res1
+    xvfadd.d res1, VX3, res1
+#else
+    xvfadd.s res2, res1, res2      // res2 = copy of res1 (res2 is still zero here)
+    xvpickve.w VX1, res1, 1
+    xvpickve.w VX2, res1, 2
+    xvpickve.w VX3, res1, 3
+    xvfadd.s res1, VX1, res1       // lane 0 += lanes 1..3
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+    xvpickve.w VX0, res2, 4
+    xvpickve.w VX1, res2, 5
+    xvpickve.w VX2, res2, 6
+    xvpickve.w VX3, res2, 7
+    xvfadd.s res1, VX0, res1       // lane 0 += lanes 4..7
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1       // lane 7 (was VX2 again: lane 6 counted twice, lane 7 lost)
+#endif
+    .align 3
+
+.L23: // INCX != 1: remaining N % 8 elements
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24: // scalar loop accumulating into $f16 (low lane of res1)
+    LD   $f12, X, 0 * SIZE
+    FABS $f12, $f12
+    ADD  $f16, $f12, $f16
+    addi.d I, I, -1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999: // common exit, also the target of the early exits at entry
+    MOV    $f0, $f16               // return the accumulated sum
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/asum_lsx.S b/kernel/loongarch64/asum_lsx.S
new file mode 100644
index 000000000..512b01404
--- /dev/null
+++ b/kernel/loongarch64/asum_lsx.S
@@ -0,0 +1,258 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4    // number of elements to sum
+#define X      $r5    // address of the current element
+#define INCX   $r6    // element stride; converted to a byte stride below
+#define I      $r17   // loop counter
+#define TEMP   $r18   // size of one element in bytes
+#define t1     $r15   // t1-t4: integer staging registers
+#define t2     $r12
+#define t3     $r13
+#define t4     $r14
+#define VX0    $vr12  // VX0/VX1: input data, then |data|
+#define VX1    $vr13
+#define VX2    $vr14  // VX2/VX3: negated copies of the input
+#define VX3    $vr15
+#define VT0    $vr23  // VT0/VT1: "element < 0" lane masks
+#define VT1    $vr22
+#define res1   $vr16  // accumulator; scalar code uses its low lane as $f16
+#define res2   $vr17  // scratch sum
+#define res0   $vr18  // constant zero
+#define neg1   $vr19  // constant -1.0 in every lane
+
+    PROLOGUE
+    vxor.v res1, res1, res1        // accumulator = 0 (also the result for the early exits)
+    vxor.v res2, res2, res2
+    vxor.v res0, res0, res0
+    bge $r0, N, .L999              // N <= 0: return 0
+    bge $r0, INCX, .L999           // INCX <= 0: return 0
+#ifdef DOUBLE
+    li.d t1, -1
+    vreplgr2vr.d neg1, t1          // integer -1 in every lane ...
+    vffint.d.l neg1, neg1          // ... converted to floating-point -1.0
+#else
+    li.w t1, -1
+    vreplgr2vr.w neg1, t1          // integer -1 in every lane ...
+    vffint.s.w neg1, neg1          // ... converted to floating-point -1.0
+#endif
+    li.d  TEMP, SIZE
+    slli.d  INCX, INCX, BASE_SHIFT // stride in bytes
+    srai.d I, N, 3                 // I = N / 8 full blocks; used by both paths
+    bne INCX, TEMP, .L20           // non-unit stride: gather path
+    bge $r0, I, .L13               // fewer than 8 elements: scalar loop only
+    .align 3
+
+.L11: // unit stride: 8 elements per iteration
+#ifdef DOUBLE
+    vld VX0, X, 0 * SIZE           // elements 0-3
+    vld VX1, X, 2 * SIZE
+    vfmul.d VX2, neg1, VX0         // VX2/VX3 = -x
+    vfmul.d VX3, neg1, VX1
+    vfcmp.clt.d VT0, VX0, res0     // mask lanes where x < 0
+    vfcmp.clt.d VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0   // take -x in masked lanes: |x|
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2       // accumulate lane-wise
+    vld VX0, X, 4 * SIZE           // elements 4-7, same sequence
+    vld VX1, X, 6 * SIZE
+    vfmul.d VX2, neg1, VX0
+    vfmul.d VX3, neg1, VX1
+    vfcmp.clt.d VT0, VX0, res0
+    vfcmp.clt.d VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+#else
+    vld VX0, X, 0 * SIZE           // elements 0-3
+    vld VX1, X, 4 * SIZE           // elements 4-7
+    vfmul.s VX2, neg1, VX0         // VX2/VX3 = -x
+    vfmul.s VX3, neg1, VX1
+    vfcmp.clt.s VT0, VX0, res0     // mask lanes where x < 0
+    vfcmp.clt.s VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0   // take -x in masked lanes: |x|
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.s res2, VX0, VX1
+    vfadd.s res1, res1, res2       // accumulate lane-wise
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L11
+    .align 3
+
+.L12: // horizontal sum of res1 into its lane 0
+#ifdef DOUBLE
+    vreplvei.d VX1, res1, 1
+    vfadd.d res1, VX1, res1
+#else
+    vreplvei.w VX1, res1, 1
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1
+    vfadd.s res1, VX2, res1
+    vfadd.s res1, VX3, res1
+#endif
+    .align 3
+
+.L13: // unit stride: remaining N % 8 elements
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L14: // scalar loop accumulating into $f16 (low lane of res1)
+    LD   $f12, X, 0 * SIZE
+    FABS $f12, $f12
+    ADD  $f16, $f12, $f16
+    addi.d I, I, -1
+    addi.d  X, X, SIZE
+    blt $r0, I, .L14
+    b .L999
+    .align 3
+
+.L20: // INCX != 1: gather elements through integer registers
+    bge $r0, I, .L23               // I = N / 8 was computed at entry
+    .align 3
+
+.L21: // 8 strided elements per iteration
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0         // VX0 = elements 0,1
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    vinsgr2vr.d VX1, t1, 0         // VX1 = elements 2,3
+    vinsgr2vr.d VX1, t2, 1
+    add.d X, X, INCX
+    vfmul.d VX2, neg1, VX0         // VX2/VX3 = -x
+    vfmul.d VX3, neg1, VX1
+    vfcmp.clt.d VT0, VX0, res0     // mask lanes where x < 0
+    vfcmp.clt.d VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0   // take -x in masked lanes: |x|
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2       // accumulate lane-wise
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t3, 0         // VX0 = elements 4,5
+    vinsgr2vr.d VX0, t4, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    vinsgr2vr.d VX1, t3, 0         // VX1 = elements 6,7
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vfmul.d VX2, neg1, VX0         // same |x| and accumulate sequence
+    vfmul.d VX3, neg1, VX1
+    vfcmp.clt.d VT0, VX0, res0
+    vfcmp.clt.d VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0         // VX0 = elements 0-3
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0         // VX1 = elements 4-7
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vfmul.s VX2, neg1, VX0         // VX2/VX3 = -x
+    vfmul.s VX3, neg1, VX1
+    vfcmp.clt.s VT0, VX0, res0     // mask lanes where x < 0
+    vfcmp.clt.s VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0   // take -x in masked lanes: |x|
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.s res2, VX0, VX1
+    vfadd.s res1, res1, res2       // accumulate lane-wise
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22: // horizontal sum of res1 into its lane 0
+#ifdef DOUBLE
+    vreplvei.d VX1, res1, 1
+    vfadd.d res1, VX1, res1
+#else
+    vreplvei.w VX1, res1, 1
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1
+    vfadd.s res1, VX2, res1
+    vfadd.s res1, VX3, res1
+#endif
+    .align 3
+
+.L23: // INCX != 1: remaining N % 8 elements
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24: // scalar loop accumulating into $f16 (low lane of res1)
+    LD   $f12, X, 0 * SIZE
+    FABS $f12, $f12
+    ADD  $f16, $f12, $f16
+    addi.d I, I, -1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999: // common exit, also the target of the early exits at entry
+    MOV    $f0, $f16               // return the accumulated sum
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/axpby_lasx.S b/kernel/loongarch64/axpby_lasx.S
new file mode 100644
index 000000000..7a246ca5c
--- /dev/null
+++ b/kernel/loongarch64/axpby_lasx.S
@@ -0,0 +1,1055 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4 // element count; tested against zero in the prologue
+#define ALPHA  $f0 // scalar applied to X
+#define X      $r5 // X read pointer
+#define INCX   $r6 // X stride; shifted left by BASE_SHIFT in the prologue
+#define BETA   $f1 // scalar applied to Y
+#define Y      $r7 // Y read pointer (also the store pointer on contiguous and tail paths)
+#define INCY   $r8 // Y stride; shifted left by BASE_SHIFT in the prologue
+
+#define I      $r12 // loop counter (8-element blocks, then tail elements)
+#define TEMP   $r13 // scratch for the stride checks
+#define t1     $r14 // t1..t4: integer staging registers for strided element loads
+#define t2     $r16
+#define t3     $r15
+#define t4     $r17
+#define XX     $r18 // not referenced by this kernel
+#define YY     $r19 // store cursor used by the strided-Y loops
+#define a1     $f12 // holds 0.0 for the ALPHA/BETA comparisons; $f12 is reused as scratch at .L998
+#define a2     $f13 // not referenced by name; $f13 is used directly at .L998
+#define VX0    $xr8 // X data (first vector)
+#define VX1    $xr20 // X data (second vector, DOUBLE only)
+#define VX2    $xr21 // Y data (first vector)
+#define VX3    $xr22 // Y data (second vector, DOUBLE only)
+#define VXA    $xr23 // ALPHA replicated to every lane
+#define VXB    $xr9 // BETA replicated to every lane
+#define VXZ    $xr19 // zero replicated to every lane
+
+    PROLOGUE
+
+    bge $r0, N, .L999 // nothing to do when N <= 0
+    movgr2fr.d a1, $r0 // a1 = integer 0 ...
+    ffint.s.l a1, a1 // ... converted to floating-point zero
+    slli.d  INCX, INCX, BASE_SHIFT // scale strides by BASE_SHIFT (presumably log2 of the element size - confirm in common.h)
+    slli.d  INCY, INCY, BASE_SHIFT
+    MTG  t1, ALPHA // MTG is a common.h macro; presumably moves FP register bits to a GPR - confirm
+    MTG  t2, BETA
+    MTG  t3, a1
+#ifdef DOUBLE
+    xvreplgr2vr.d VXA, t1 // broadcast ALPHA, BETA and zero to every lane
+    xvreplgr2vr.d VXB, t2
+    xvreplgr2vr.d VXZ, t3
+#else
+    xvreplgr2vr.w VXA, t1 // broadcast ALPHA, BETA and zero to every lane
+    xvreplgr2vr.w VXB, t2
+    xvreplgr2vr.w VXZ, t3
+#endif
+    // Scalar path when (INCX & INCY) == 0. This is a bitwise AND, so it covers incx == 0 or incy == 0 and also any stride pair with no common bits.
+    and TEMP, INCX, INCY
+    or  I,    N,    N // I = N: the scalar loop at .L998 handles every element
+    beqz TEMP, .L998
+
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, BASE_SHIFT // TEMP = scaled stride of a unit increment
+    srai.d I, N, 3 // I = number of 8-element blocks
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11: // INCX==1 and INCY==1: pick a loop by whether ALPHA / BETA equal zero
+    bge $r0, I, .L997 // fewer than 8 elements: tail only
+    CMPEQ $fcc0, ALPHA, a1 // CMPEQ is a common.h macro; presumably an FP equality compare - confirm
+    bcnez $fcc0, .L110
+    CMPEQ $fcc0, BETA, a1
+    bcnez $fcc0, .L112 // ALPHA!=0 BETA==0
+    b .L111 // ALPHA!=0 BETA!=0
+    .align 3
+
+.L110: // ALPHA==0: choose by BETA
+    CMPEQ $fcc0, BETA, a1
+    bcnez $fcc0, .L114 // ALPHA==0 BETA==0
+    b .L113 // ALPHA==0 BETA!=0
+    .align 3
+
+.L111: // ALPHA!=0 BETA!=0, contiguous X and Y: 8 elements per iteration
+    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    xvld VX2, Y, 0 * SIZE
+    xvld VX1, X, 4 * SIZE // DOUBLE needs two vectors to cover 8 elements
+    xvld VX3, Y, 4 * SIZE
+    xvfmul.d VX0, VX0, VXA // VX0 = x * ALPHA
+    xvfmul.d VX1, VX1, VXA
+    xvfmadd.d VX2, VX2, VXB, VX0 // VX2 = y * BETA + x * ALPHA
+    xvfmadd.d VX3, VX3, VXB, VX1
+    addi.d  I, I, -1
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 4 * SIZE
+#else
+    xvld VX2, Y, 0 * SIZE
+    xvfmul.s VX0, VX0, VXA // VX0 = x * ALPHA
+    addi.d  I, I, -1
+    xvfmadd.s VX2, VX2, VXB, VX0 // VX2 = y * BETA + x * ALPHA
+    xvst VX2, Y, 0 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L111
+    b .L997 // remaining N & 7 elements
+    .align 3
+
+.L112: // ALPHA!=0 BETA==0, contiguous X and Y: y = ALPHA * x, 8 elements per iteration
+    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    xvld VX1, X, 4 * SIZE // DOUBLE needs two vectors to cover 8 elements
+    xvfmul.d VX0, VX0, VXA
+    xvfmul.d VX1, VX1, VXA
+    xvst VX0, Y, 0 * SIZE
+    xvst VX1, Y, 4 * SIZE
+#else
+    xvfmul.s VX0, VX0, VXA
+    xvst VX0, Y, 0 * SIZE
+#endif
+    addi.d  I, I, -1 // shared by both precisions; the DOUBLE branch previously never decremented I
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L112
+    b .L997 // remaining N & 7 elements
+    .align 3
+
+.L113: // ALPHA==0 BETA!=0, contiguous Y: y = BETA * y, 8 elements per iteration
+    xvld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
+    xvld VX3, Y, 4 * SIZE // DOUBLE needs two vectors to cover 8 elements
+    xvfmul.d VX2, VX2, VXB
+    xvfmul.d VX3, VX3, VXB
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 4 * SIZE
+#else
+    xvfmul.s VX2, VX2, VXB
+    xvst VX2, Y, 0 * SIZE
+#endif
+    addi.d  I, I, -1
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L113
+    b .L997 // NOTE(review): X was not advanced here; .L998 reads it from its entry position
+    .align 3
+
+.L114: // ALPHA==0 BETA==0, contiguous Y: store zeros, 8 elements per iteration
+    xvst VXZ, Y, 0 * SIZE
+#ifdef DOUBLE
+    xvst VXZ, Y, 4 * SIZE // DOUBLE needs a second store to cover 8 elements
+#endif
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L114
+    b .L997 // NOTE(review): X was not advanced here; .L998 reads it from its entry position
+    .align 3
+
+.L12: // INCX==1 and INCY!=1
+    bge $r0, I, .L997 // fewer than 8 elements: tail only
+    move YY, Y // YY = store cursor; Y stays the read cursor in the loops that load Y
+    CMPEQ $fcc0, ALPHA, a1 // CMPEQ is a common.h macro; presumably an FP equality compare - confirm
+    bcnez $fcc0, .L120
+    CMPEQ $fcc0, BETA, a1
+    bcnez $fcc0, .L122 // ALPHA!=0 BETA==0
+    b .L121 // ALPHA!=0 BETA!=0
+    .align 3
+
+.L120: // ALPHA==0: choose by BETA
+    CMPEQ $fcc0, BETA, a1
+    bcnez $fcc0, .L124 // ALPHA==0 BETA==0
+    b .L123 // ALPHA==0 BETA!=0
+    .align 3
+
+.L121: // ALPHA!=0 BETA!=0, contiguous X, strided Y: 8 elements per iteration
+    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    ld.d t1, Y, 0 * SIZE // gather 4 strided Y elements into VX2
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX2, t1, 0
+    xvinsgr2vr.d VX2, t2, 1
+    xvinsgr2vr.d VX2, t3, 2
+    xvinsgr2vr.d VX2, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VX0, VX0, VXA // VX0 = x * ALPHA
+    xvld VX1, X, 4 * SIZE
+    xvfmadd.d VX2, VX2, VXB, VX0 // VX2 = y * BETA + x * ALPHA
+    ld.d t1, Y, 0 * SIZE // gather the next 4 strided Y elements into VX3
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d VX3, t1, 0
+    xvinsgr2vr.d VX3, t2, 1
+    xvinsgr2vr.d VX3, t3, 2
+    xvinsgr2vr.d VX3, t4, 3
+    xvstelm.d VX2, YY, 0, 0 // scatter the first 4 results through the store cursor
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    xvfmul.d VX1, VX1, VXA
+    xvfmadd.d VX3, VX3, VXB, VX1
+    addi.d  I, I, -1 // the only decrement on the DOUBLE path
+    xvstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 3
+#else
+    ld.w t1, Y, 0 * SIZE // gather 8 strided Y elements into VX2
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s VX0, VX0, VXA // VX0 = x * ALPHA
+    xvfmadd.s VX2, VX2, VXB, VX0 // VX2 = y * BETA + x * ALPHA
+    addi.d  I, I, -1 // decrement kept per branch so each precision decrements exactly once
+    xvstelm.w VX2, YY, 0, 0 // scatter the 8 results through the store cursor
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 7
+#endif
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L121
+    b .L997 // remaining N & 7 elements; Y already points past the processed blocks
+    .align 3
+
+.L122: // ALPHA!=0 BETA==0, contiguous X, strided Y: y = ALPHA * x, 8 elements per iteration
+    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    xvld VX1, X, 4 * SIZE // DOUBLE needs two vectors to cover 8 elements
+    xvfmul.d VX0, VX0, VXA
+    xvfmul.d VX1, VX1, VXA
+    xvstelm.d VX0, Y, 0, 0 // store through Y itself so the tail at .L997 resumes at the right element
+    add.d Y, Y, INCY
+    xvstelm.d VX0, Y, 0, 1
+    add.d Y, Y, INCY
+    xvstelm.d VX0, Y, 0, 2
+    add.d Y, Y, INCY
+    xvstelm.d VX0, Y, 0, 3
+    add.d Y, Y, INCY
+    xvstelm.d VX1, Y, 0, 0
+    add.d Y, Y, INCY
+    xvstelm.d VX1, Y, 0, 1
+    add.d Y, Y, INCY
+    xvstelm.d VX1, Y, 0, 2
+    add.d Y, Y, INCY
+    xvstelm.d VX1, Y, 0, 3
+#else
+    xvfmul.s VX0, VX0, VXA
+    xvstelm.w VX0, Y, 0, 0 // store through Y itself so the tail at .L997 resumes at the right element
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0, 1
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0, 2
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0, 3
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0, 4
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0, 5
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0, 6
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0, 7
+#endif
+    add.d Y, Y, INCY
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1 // shared by both precisions; the DOUBLE branch previously never decremented I
+    blt $r0, I, .L122
+    b .L997 // remaining N & 7 elements
+    .align 3
+
+.L123: // ALPHA==0 BETA!=0, strided Y: y = BETA * y, 8 elements per iteration
+#ifdef DOUBLE
+    ld.d t1, Y, 0 * SIZE // gather 4 strided Y elements into VX2
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX2, t1, 0
+    xvinsgr2vr.d VX2, t2, 1
+    xvinsgr2vr.d VX2, t3, 2
+    xvinsgr2vr.d VX2, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VX2, VX2, VXB
+    ld.d t1, Y, 0 * SIZE // gather the next 4 strided Y elements into VX3
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d VX3, t1, 0
+    xvinsgr2vr.d VX3, t2, 1
+    xvinsgr2vr.d VX3, t3, 2
+    xvinsgr2vr.d VX3, t4, 3
+    xvstelm.d VX2, YY, 0, 0 // scatter results through the store cursor
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    xvfmul.d VX3, VX3, VXB
+    xvstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 3
+#else
+    ld.w t1, Y, 0 * SIZE // gather 8 strided Y elements into VX2
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s VX2, VX2, VXB
+    xvstelm.w VX2, YY, 0, 0 // scatter results through the store cursor
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 7
+#endif
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L123
+    b .L997 // NOTE(review): X was not advanced here; .L998 reads it from its entry position
+    .align 3
+
+.L124: // ALPHA==0 BETA==0, strided Y: store zeros, 8 elements per iteration
+#ifdef DOUBLE
+    xvstelm.d VXZ, Y, 0, 0 // store through Y itself so the tail at .L997 resumes at the right element
+    add.d Y, Y, INCY
+    xvstelm.d VXZ, Y, 0, 1
+    add.d Y, Y, INCY
+    xvstelm.d VXZ, Y, 0, 2
+    add.d Y, Y, INCY
+    xvstelm.d VXZ, Y, 0, 3
+    add.d Y, Y, INCY
+    xvstelm.d VXZ, Y, 0, 0
+    add.d Y, Y, INCY
+    xvstelm.d VXZ, Y, 0, 1
+    add.d Y, Y, INCY
+    xvstelm.d VXZ, Y, 0, 2
+    add.d Y, Y, INCY
+    xvstelm.d VXZ, Y, 0, 3
+#else
+    xvstelm.w VXZ, Y, 0, 0 // store through Y itself so the tail at .L997 resumes at the right element
+    add.d Y, Y, INCY
+    xvstelm.w VXZ, Y, 0, 1
+    add.d Y, Y, INCY
+    xvstelm.w VXZ, Y, 0, 2
+    add.d Y, Y, INCY
+    xvstelm.w VXZ, Y, 0, 3
+    add.d Y, Y, INCY
+    xvstelm.w VXZ, Y, 0, 4
+    add.d Y, Y, INCY
+    xvstelm.w VXZ, Y, 0, 5
+    add.d Y, Y, INCY
+    xvstelm.w VXZ, Y, 0, 6
+    add.d Y, Y, INCY
+    xvstelm.w VXZ, Y, 0, 7
+#endif
+    add.d Y, Y, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L124
+    b .L997 // NOTE(review): X was not advanced here; .L998 reads it from its entry position
+    .align 3
+
+.L21:// INCX!=1 and INCY==1
+    bge $r0, I, .L997 // fewer than 8 elements: tail only
+    CMPEQ $fcc0, ALPHA, a1 // CMPEQ is a common.h macro; presumably an FP equality compare - confirm
+    bcnez $fcc0, .L210
+    CMPEQ $fcc0, BETA, a1
+    bcnez $fcc0, .L212 // ALPHA!=0 BETA==0
+    b .L211 // ALPHA!=0 BETA!=0
+    .align 3
+
+.L210: // ALPHA==0: choose by BETA
+    CMPEQ $fcc0, BETA, a1
+    bcnez $fcc0, .L214 // ALPHA==0 BETA==0
+    b .L213 // ALPHA==0 BETA!=0
+    .align 3
+
+.L211: // ALPHA!=0 BETA!=0, strided X, contiguous Y: 8 elements per iteration
+    xvld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE // gather 4 strided X elements into VX0
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    add.d X, X, INCX
+    xvfmul.d VX0, VXA, VX0 // VX0 = ALPHA * x
+    xvfmadd.d VX2, VX2, VXB, VX0 // VX2 = y * BETA + ALPHA * x
+    xvld VX3, Y, 4 * SIZE
+    xvst VX2, Y, 0 * SIZE
+    ld.d t1, X, 0 * SIZE // gather the next 4 strided X elements into VX1
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    add.d X, X, INCX
+    xvfmul.d VX1, VX1, VXA
+    xvfmadd.d VX3, VX3, VXB, VX1
+    addi.d  I, I, -1
+    xvst VX3, Y, 4 * SIZE
+#else
+    ld.w t1, X, 0 * SIZE // gather 8 strided X elements into VX0
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    add.d X, X, INCX
+    xvfmul.s VX0, VXA, VX0 // VX0 = ALPHA * x
+    xvfmadd.s VX2, VX2, VXB, VX0 // VX2 = y * BETA + ALPHA * x
+    addi.d  I, I, -1
+    xvst VX2, Y, 0 * SIZE
+#endif
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L211
+    b .L997 // remaining N & 7 elements
+    .align 3
+
+.L212: // ALPHA!=0 BETA==0, strided X, contiguous Y: y = ALPHA * x, 8 elements per iteration
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE // gather 4 strided X elements into VX0
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    add.d X, X, INCX
+    xvfmul.d VX0, VXA, VX0
+    ld.d t1, X, 0 * SIZE // gather the next 4 strided X elements into VX1
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvst VX0, Y, 0 * SIZE
+    xvfmul.d VX1, VX1, VXA
+    addi.d  I, I, -1
+    xvst VX1, Y, 4 * SIZE
+#else
+    ld.w t1, X, 0 * SIZE // gather 8 strided X elements into VX0
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    add.d X, X, INCX
+    xvfmul.s VX0, VXA, VX0
+    addi.d  I, I, -1
+    xvst VX0, Y, 0 * SIZE
+#endif
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L212
+    b .L997 // remaining N & 7 elements
+    .align 3
+
+.L213: // ALPHA==0 BETA!=0, contiguous Y: y = BETA * y, 8 elements per iteration
+    xvld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
+    xvld VX3, Y, 4 * SIZE // DOUBLE needs two vectors to cover 8 elements
+    xvfmul.d VX2, VX2, VXB
+    xvfmul.d VX3, VX3, VXB
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 4 * SIZE
+#else
+    xvfmul.s VX2, VX2, VXB
+    xvst VX2, Y, 0 * SIZE
+#endif
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L213
+    b .L997 // NOTE(review): X was not advanced here; .L998 reads it from its entry position
+    .align 3
+
+.L214: // ALPHA==0 BETA==0, contiguous Y: store zeros, 8 elements per iteration
+    xvst VXZ, Y, 0 * SIZE
+#ifdef DOUBLE
+    xvst VXZ, Y, 4 * SIZE // DOUBLE needs a second store to cover 8 elements
+#endif
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L214
+    b .L997 // NOTE(review): X was not advanced here; .L998 reads it from its entry position
+    .align 3
+
+.L22: // INCX!=1 and INCY!=1
+    bge $r0, I, .L997 // fewer than 8 elements: tail only
+    move YY, Y // YY = store cursor; Y stays the read cursor in the loops that load Y
+    CMPEQ $fcc0, ALPHA, a1 // CMPEQ is a common.h macro; presumably an FP equality compare - confirm
+    bcnez $fcc0, .L220
+    CMPEQ $fcc0, BETA, a1
+    bcnez $fcc0, .L222 // ALPHA!=0 BETA==0
+    b .L221 // ALPHA!=0 BETA!=0
+    .align 3
+
+.L220: // ALPHA==0: choose by BETA
+    CMPEQ $fcc0, BETA, a1
+    bcnez $fcc0, .L224 // ALPHA==0 BETA==0
+    b .L223 // ALPHA==0 BETA!=0
+    .align 3
+
+.L221: // ALPHA!=0 BETA!=0, strided X and Y: 8 elements per iteration
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE // gather 4 strided X elements into VX0
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, Y, 0 * SIZE // gather 4 strided Y elements into VX2
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX2, t1, 0
+    xvinsgr2vr.d VX2, t2, 1
+    xvinsgr2vr.d VX2, t3, 2
+    xvinsgr2vr.d VX2, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VX0, VX0, VXA // VX0 = x * ALPHA
+    ld.d t1, X, 0 * SIZE // start gathering the next 4 X elements into VX1
+    add.d X, X, INCX
+    xvfmadd.d VX2, VX2, VXB, VX0 // VX2 = y * BETA + x * ALPHA
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvstelm.d VX2, YY, 0, 0 // scatter the first 4 results through the store cursor
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    ld.d t1, Y, 0 * SIZE // gather the next 4 strided Y elements into VX3
+    xvinsgr2vr.d VX3, t1, 0
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX3, t2, 1
+    xvinsgr2vr.d VX3, t3, 2
+    xvinsgr2vr.d VX3, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VX1, VX1, VXA
+    xvfmadd.d VX3, VX3, VXB, VX1
+    addi.d  I, I, -1
+    xvstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 3
+#else
+    ld.w t1, X, 0 * SIZE // gather 8 strided X elements into VX0
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    add.d X, X, INCX
+    ld.w t1, Y, 0 * SIZE // gather 8 strided Y elements into VX2
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s VX0, VX0, VXA // VX0 = x * ALPHA
+    xvfmadd.s VX2, VX2, VXB, VX0 // VX2 = y * BETA + x * ALPHA
+    addi.d  I, I, -1
+    xvstelm.w VX2, YY, 0, 0 // scatter the 8 results through the store cursor
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 7
+#endif
+    add.d YY, YY, INCY
+    blt $r0, I, .L221
+    b .L997 // remaining N & 7 elements
+    .align 3
+
+.L222: // ALPHA!=0 BETA==0, strided X and Y: y = ALPHA * x, 8 elements per iteration
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE // gather 4 strided X elements into VX0
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    add.d X, X, INCX
+    xvfmul.d VX0, VX0, VXA
+    ld.d t1, X, 0 * SIZE // gather the next 4 strided X elements into VX1
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvstelm.d VX0, Y, 0, 0 // store through Y itself so the tail at .L997 resumes at the right element
+    add.d Y, Y, INCY
+    xvstelm.d VX0, Y, 0, 1
+    add.d Y, Y, INCY
+    xvstelm.d VX0, Y, 0, 2
+    add.d Y, Y, INCY
+    xvstelm.d VX0, Y, 0, 3
+    add.d Y, Y, INCY
+    xvfmul.d VX1, VX1, VXA
+    addi.d  I, I, -1
+    xvstelm.d VX1, Y, 0, 0
+    add.d Y, Y, INCY
+    xvstelm.d VX1, Y, 0, 1
+    add.d Y, Y, INCY
+    xvstelm.d VX1, Y, 0, 2
+    add.d Y, Y, INCY
+    xvstelm.d VX1, Y, 0, 3
+#else
+    ld.w t1, X, 0 * SIZE // gather 8 strided X elements into VX0
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    add.d X, X, INCX
+    xvfmul.s VX0, VX0, VXA
+    addi.d  I, I, -1
+    xvstelm.w VX0, Y, 0, 0 // store through Y itself so the tail at .L997 resumes at the right element
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0, 1
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0, 2
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0, 3
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0, 4
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0, 5
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0, 6
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0, 7
+#endif
+    add.d Y, Y, INCY
+    blt $r0, I, .L222
+    b .L997 // remaining N & 7 elements
+    .align 3
+
+.L223: // ALPHA==0 BETA!=0, strided Y: y = BETA * y, 8 elements per iteration
+#ifdef DOUBLE
+    ld.d t1, Y, 0 * SIZE // gather 4 strided Y elements into VX2
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX2, t1, 0
+    xvinsgr2vr.d VX2, t2, 1
+    xvinsgr2vr.d VX2, t3, 2
+    xvinsgr2vr.d VX2, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VX2, VX2, VXB
+    ld.d t1, Y, 0 * SIZE // gather the next 4 strided Y elements into VX3
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d VX3, t1, 0
+    xvinsgr2vr.d VX3, t2, 1
+    xvinsgr2vr.d VX3, t3, 2
+    xvinsgr2vr.d VX3, t4, 3
+    xvstelm.d VX2, YY, 0, 0 // scatter results through the store cursor
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    xvfmul.d VX3, VX3, VXB
+    addi.d  I, I, -1
+    xvstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 3
+#else
+    ld.w t1, Y, 0 * SIZE // gather 8 strided Y elements into VX2
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s VX2, VX2, VXB
+    addi.d  I, I, -1
+    xvstelm.w VX2, YY, 0, 0 // scatter results through the store cursor
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 7
+#endif
+    add.d YY, YY, INCY
+    blt $r0, I, .L223
+    b .L997 // NOTE(review): X was not advanced here; .L998 reads it from its entry position
+    .align 3
+
+.L224: // ALPHA==0 BETA==0, strided Y: store zeros, 8 elements per iteration
+#ifdef DOUBLE
+    xvstelm.d VXZ, Y, 0, 0 // store through Y itself so the tail at .L997 resumes at the right element
+    add.d Y, Y, INCY
+    xvstelm.d VXZ, Y, 0, 1
+    add.d Y, Y, INCY
+    xvstelm.d VXZ, Y, 0, 2
+    add.d Y, Y, INCY
+    xvstelm.d VXZ, Y, 0, 3
+    add.d Y, Y, INCY
+    xvstelm.d VXZ, Y, 0, 0
+    add.d Y, Y, INCY
+    xvstelm.d VXZ, Y, 0, 1
+    add.d Y, Y, INCY
+    xvstelm.d VXZ, Y, 0, 2
+    add.d Y, Y, INCY
+    xvstelm.d VXZ, Y, 0, 3
+#else
+    xvstelm.w VXZ, Y, 0, 0 // store through Y itself so the tail at .L997 resumes at the right element
+    add.d Y, Y, INCY
+    xvstelm.w VXZ, Y, 0, 1
+    add.d Y, Y, INCY
+    xvstelm.w VXZ, Y, 0, 2
+    add.d Y, Y, INCY
+    xvstelm.w VXZ, Y, 0, 3
+    add.d Y, Y, INCY
+    xvstelm.w VXZ, Y, 0, 4
+    add.d Y, Y, INCY
+    xvstelm.w VXZ, Y, 0, 5
+    add.d Y, Y, INCY
+    xvstelm.w VXZ, Y, 0, 6
+    add.d Y, Y, INCY
+    xvstelm.w VXZ, Y, 0, 7
+#endif
+    add.d Y, Y, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L224
+    b .L997 // NOTE(review): X was not advanced here; .L998 reads it from its entry position
+    .align 3
+
+.L997: // tail after the 8-element loops
+    andi I, N, 7 // I = N mod 8 leftover elements
+    bge $r0, I, .L999
+    .align 3
+
+.L998: // scalar loop, one element per iteration; also the whole-vector path when (INCX & INCY) == 0
+    LD   $f12, X, 0 * SIZE // LD/ST/MUL/MADD are common.h macros; presumably precision-selected scalar ops - confirm
+    LD   $f13, Y, 0 * SIZE
+    addi.d I, I, -1
+    MUL  $f12, $f12, ALPHA // $f12 = x * ALPHA (overwrites the zero held in a1, which is no longer needed)
+    MADD $f13, $f13, BETA, $f12 // presumably $f13 = y * BETA + $f12 - confirm macro operand order
+    ST   $f13, Y, 0 * SIZE
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+    move $r4, $r12 // return the loop counter I; NOTE(review): $r12 is not initialised on the N <= 0 early exit
+    jirl $r0, $r1, 0x0 // return to the address in $r1
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/axpby_lsx.S b/kernel/loongarch64/axpby_lsx.S
new file mode 100644
index 000000000..e50d4cdcc
--- /dev/null
+++ b/kernel/loongarch64/axpby_lsx.S
@@ -0,0 +1,1153 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4
+#define ALPHA  $f0
+#define X      $r5
+#define INCX   $r6
+#define BETA   $f1
+#define Y      $r7
+#define INCY   $r8
+
+#define I      $r12
+#define TEMP   $r13
+#define t1     $r14
+#define t2     $r16
+#define t3     $r15
+#define t4     $r17
+#define XX     $r18
+#define YY     $r19
+#define a1     $f12
+#define a2     $f13
+#define VX0    $vr8
+#define VX1    $vr20
+#define VX2    $vr21
+#define VX3    $vr22
+#define VXA    $vr23
+#define VXB    $vr9
+#define VXZ    $vr19
+
+    PROLOGUE
+
+    bge $r0, N, .L999 // nothing to do when N <= 0
+    movgr2fr.d a1, $r0 // a1 = all-zero bits
+    ffint.s.l a1, a1 // a1 = 0.0; NOTE(review): single-precision convert is used even when DOUBLE is defined - confirm upper half of a1 stays zero
+    slli.d  INCX, INCX, BASE_SHIFT // scale the x stride by 2^BASE_SHIFT (presumably elements -> bytes; BASE_SHIFT comes from common.h)
+    slli.d  INCY, INCY, BASE_SHIFT // same for the y stride
+    MTG  t1, ALPHA // MTG is defined in common.h; presumably moves the raw FP bits into a general register
+    MTG  t2, BETA
+    MTG  t3, a1
+#ifdef DOUBLE
+    vreplgr2vr.d VXA, t1 // VXA = ALPHA in every lane
+    vreplgr2vr.d VXB, t2 // VXB = BETA in every lane
+    vreplgr2vr.d VXZ, t3 // VXZ = zero in every lane
+#else
+    vreplgr2vr.w VXA, t1 // VXA = ALPHA in every lane
+    vreplgr2vr.w VXB, t2 // VXB = BETA in every lane
+    vreplgr2vr.w VXZ, t3 // VXZ = zero in every lane
+#endif
+    // Scalar fallback when (INCX & INCY) == 0: covers incx == 0 or incy == 0, and also any pair of strides sharing no set bit
+    and TEMP, INCX, INCY
+    or  I,    N,    N // I = N, so .L998 processes every element one by one
+    beqz TEMP, .L998
+
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, BASE_SHIFT // TEMP = scaled stride corresponding to an increment of 1
+    srai.d I, N, 3 // I = N / 8 = number of 8-element passes
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11: // INCX==1 and INCY==1: pick the loop variant from ALPHA==0 / BETA==0
+    bge $r0, I, .L997 // fewer than 8 elements: scalar tail only
+    CMPEQ $fcc0, ALPHA, a1 // a1 holds zero
+    bcnez $fcc0, .L110 // ALPHA==0
+    CMPEQ $fcc0, BETA, a1
+    bcnez $fcc0, .L112 // ALPHA!=0 BETA==0
+    b .L111 // ALPHA!=0 BETA!=0
+    .align 3
+
+.L110: // ALPHA==0: split on BETA
+    CMPEQ $fcc0, BETA, a1
+    bcnez $fcc0, .L114 // ALPHA==0 BETA==0
+    b .L113 // ALPHA==0 BETA!=0
+    .align 3
+
+.L111: // ALPHA!=0 BETA!=0, contiguous x and y: y = x * ALPHA + y * BETA, 8 elements per pass
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE
+    vld VX3, Y, 2 * SIZE
+    vfmul.d VX0, VX0, VXA // x * ALPHA
+    vfmul.d VX1, VX1, VXA
+    vfmadd.d VX2, VX2, VXB, VX0 // y * BETA + x * ALPHA
+    vfmadd.d VX3, VX3, VXB, VX1
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 2 * SIZE
+    vld VX0, X, 4 * SIZE // second half of the pass: elements 4..7
+    vld VX2, Y, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vld VX3, Y, 6 * SIZE
+    vfmul.d VX0, VX0, VXA
+    vfmul.d VX1, VX1, VXA
+    vfmadd.d VX2, VX2, VXB, VX0
+    vfmadd.d VX3, VX3, VXB, VX1
+    vst VX2, Y, 4 * SIZE
+    vst VX3, Y, 6 * SIZE
+#else
+    vld VX1, X, 4 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vfmul.s VX0, VX0, VXA // x * ALPHA
+    vfmul.s VX1, VX1, VXA
+    vfmadd.s VX2, VX2, VXB, VX0 // y * BETA + x * ALPHA
+    vfmadd.s VX3, VX3, VXB, VX1
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L111 // loop while I > 0
+    b .L997 // scalar tail for the remaining N & 7 elements
+    .align 3
+
+.L112: // ALPHA!=0 BETA==0, contiguous x and y: y = x * ALPHA (y is never loaded), 8 elements per pass
+    vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE
+    vfmul.d VX0, VX0, VXA
+    vfmul.d VX1, VX1, VXA
+    vst VX0, Y, 0 * SIZE
+    vst VX1, Y, 2 * SIZE
+    vld VX2, X, 4 * SIZE // second half of the pass: elements 4..7
+    vld VX3, X, 6 * SIZE
+    vfmul.d VX2, VX2, VXA
+    vfmul.d VX3, VX3, VXA
+    vst VX2, Y, 4 * SIZE
+    vst VX3, Y, 6 * SIZE
+#else
+    vld VX1, X, 4 * SIZE
+    vfmul.s VX0, VX0, VXA
+    vfmul.s VX1, VX1, VXA
+    vst VX0, Y, 0 * SIZE
+    vst VX1, Y, 4 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L112 // loop while I > 0
+    b .L997 // scalar tail for the remaining N & 7 elements
+    .align 3
+
+.L113: // ALPHA==0 BETA!=0, contiguous y: y = y * BETA in place, 8 elements per pass (x is not read)
+#ifdef DOUBLE
+    vld VX0, Y, 0 * SIZE
+    vld VX1, Y, 2 * SIZE
+    vfmul.d VX0, VX0, VXB
+    vfmul.d VX1, VX1, VXB
+    vst VX0, Y, 0 * SIZE
+    vst VX1, Y, 2 * SIZE
+    vld VX2, Y, 4 * SIZE // second half of the pass: elements 4..7
+    vld VX3, Y, 6 * SIZE
+    vfmul.d VX2, VX2, VXB
+    vfmul.d VX3, VX3, VXB
+    vst VX2, Y, 4 * SIZE
+    vst VX3, Y, 6 * SIZE
+#else
+    vld VX2, Y, 0 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vfmul.s VX2, VX2, VXB
+    vfmul.s VX3, VX3, VXB
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+#endif
+    addi.d Y, Y, 8 * SIZE // only Y advances; NOTE(review): X keeps its entry value, so the .L997 tail reads x from the start
+    addi.d  I, I, -1
+    blt $r0, I, .L113 // loop while I > 0
+    b .L997 // scalar tail for the remaining N & 7 elements
+    .align 3
+
+.L114: // ALPHA==0 BETA==0, contiguous y: overwrite 8 elements of y with zeros (VXZ) per pass
+    vst VXZ, Y, 0 * SIZE
+#ifdef DOUBLE
+    vst VXZ, Y, 2 * SIZE
+    vst VXZ, Y, 4 * SIZE
+    vst VXZ, Y, 6 * SIZE
+#else
+    vst VXZ, Y, 4 * SIZE
+#endif
+    addi.d Y, Y, 8 * SIZE // only Y advances; X is not touched in this loop
+    addi.d  I, I, -1
+    blt $r0, I, .L114 // loop while I > 0
+    b .L997 // scalar tail for the remaining N & 7 elements
+    .align 3
+
+.L12: // INCX==1 and INCY!=1: pick the loop variant from ALPHA==0 / BETA==0
+    bge $r0, I, .L997 // fewer than 8 elements: scalar tail only
+    move YY, Y // YY = second cursor over y, used by the loops below for stores
+    CMPEQ $fcc0, ALPHA, a1 // a1 holds zero
+    bcnez $fcc0, .L120 // ALPHA==0
+    CMPEQ $fcc0, BETA, a1
+    bcnez $fcc0, .L122 // ALPHA!=0 BETA==0
+    b .L121 // ALPHA!=0 BETA!=0
+    .align 3
+
+.L120: // ALPHA==0: split on BETA
+    CMPEQ $fcc0, BETA, a1
+    bcnez $fcc0, .L124 // ALPHA==0 BETA==0
+    b .L123 // ALPHA==0 BETA!=0
+    .align 3
+
+.L121: // ALPHA!=0 BETA!=0, contiguous x, strided y: y = x * ALPHA + y * BETA, 8 elements per pass
+    vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    ld.d t1, Y, 0 * SIZE // gather two strided y values through the read cursor Y
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    vfmul.d VX0, VX0, VXA // x * ALPHA
+    vld VX1, X, 2 * SIZE
+    vfmadd.d VX2, VX2, VXB, VX0 // y * BETA + x * ALPHA
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    vstelm.d VX2, YY, 0, 0 // scatter results through the write cursor YY
+    add.d YY, YY, INCY
+    vstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vfmul.d VX1, VX1, VXA
+    vld VX0, X, 4 * SIZE
+    vfmadd.d VX3, VX3, VXB, VX1
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    vstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    vfmul.d VX0, VX0, VXA
+    vld VX1, X, 6 * SIZE
+    vfmadd.d VX2, VX2, VXB, VX0
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    vstelm.d VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vfmul.d VX1, VX1, VXA
+    vfmadd.d VX3, VX3, VXB, VX1
+    addi.d  I, I, -1
+    vstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX3, YY, 0, 1
+#else
+    ld.w t1, Y, 0 * SIZE // gather four strided y values through the read cursor Y
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VX0, VX0, VXA // x * ALPHA
+    vld VX1, X, 4 * SIZE
+    vfmadd.s VX2, VX2, VXB, VX0 // y * BETA + x * ALPHA
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    vstelm.w VX2, YY, 0, 0 // scatter results through the write cursor YY
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    vfmul.s VX1, VX1, VXA
+    vfmadd.s VX3, VX3, VXB, VX1
+    addi.d  I, I, -1
+    vstelm.w VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 3
+#endif
+    add.d YY, YY, INCY // step past the 8th stored element
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L121 // loop while I > 0
+    b .L997 // scalar tail; Y was advanced by the loads above
+    .align 3
+
+.L122: // ALPHA!=0 BETA==0, contiguous x, strided y: y = x * ALPHA (y is never loaded), 8 elements per pass
+    vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE
+    vfmul.d VX0, VX0, VXA
+    vfmul.d VX1, VX1, VXA
+    vstelm.d VX0, Y, 0, 0 // store through Y itself (not YY): the .L997 tail continues from Y, so Y must advance here
+    add.d Y, Y, INCY
+    vstelm.d VX0, Y, 0, 1
+    add.d Y, Y, INCY
+    vstelm.d VX1, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.d VX1, Y, 0, 1
+    add.d Y, Y, INCY
+    vld VX0, X, 4 * SIZE // second half of the pass: elements 4..7
+    vld VX1, X, 6 * SIZE
+    vfmul.d VX0, VX0, VXA
+    vfmul.d VX1, VX1, VXA
+    vstelm.d VX0, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.d VX0, Y, 0, 1
+    add.d Y, Y, INCY
+    vstelm.d VX1, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.d VX1, Y, 0, 1
+#else
+    vld VX1, X, 4 * SIZE
+    vfmul.s VX0, VX0, VXA
+    vfmul.s VX1, VX1, VXA
+    vstelm.w VX0, Y, 0, 0 // store through Y itself (not YY): the .L997 tail continues from Y, so Y must advance here
+    add.d Y, Y, INCY
+    vstelm.w VX0, Y, 0, 1
+    add.d Y, Y, INCY
+    vstelm.w VX0, Y, 0, 2
+    add.d Y, Y, INCY
+    vstelm.w VX0, Y, 0, 3
+    add.d Y, Y, INCY
+    vstelm.w VX1, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.w VX1, Y, 0, 1
+    add.d Y, Y, INCY
+    vstelm.w VX1, Y, 0, 2
+    add.d Y, Y, INCY
+    vstelm.w VX1, Y, 0, 3
+#endif
+    add.d Y, Y, INCY // step past the 8th stored element
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L122 // loop while I > 0
+    b .L997 // scalar tail for the remaining N & 7 elements
+    .align 3
+
+.L123: // ALPHA==0 BETA!=0, strided y: y = y * BETA, 8 elements per pass (x is not read, X is not advanced)
+#ifdef DOUBLE
+    ld.d t1, Y, 0 * SIZE // gather two strided y values through the read cursor Y
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    add.d Y, Y, INCY
+    vfmul.d VX2, VX2, VXB // y * BETA
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    vstelm.d VX2, YY, 0, 0 // scatter results through the write cursor YY
+    add.d YY, YY, INCY
+    vstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vfmul.d VX3, VX3, VXB
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    vstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    vfmul.d VX2, VX2, VXB
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    vstelm.d VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vfmul.d VX3, VX3, VXB
+    addi.d  I, I, -1
+    vstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX3, YY, 0, 1
+#else
+    ld.w t1, Y, 0 * SIZE // gather four strided y values through the read cursor Y
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VX2, VX2, VXB // y * BETA
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    vstelm.w VX2, YY, 0, 0 // scatter results through the write cursor YY
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    vfmul.s VX3, VX3, VXB
+    addi.d  I, I, -1
+    vstelm.w VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 3
+#endif
+    add.d YY, YY, INCY // step past the 8th stored element
+    blt $r0, I, .L123 // loop while I > 0
+    b .L997 // scalar tail; Y was advanced by the loads above
+    .align 3
+
+.L124: // ALPHA==0 BETA==0, strided y: store zeros (VXZ) to 8 elements of y per pass
+#ifdef DOUBLE
+    vstelm.d VXZ, Y, 0, 0 // store through Y itself (not YY): the .L997 tail continues from Y, so Y must advance here
+    add.d Y, Y, INCY
+    vstelm.d VXZ, Y, 0, 1
+    add.d Y, Y, INCY
+    vstelm.d VXZ, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.d VXZ, Y, 0, 1
+    add.d Y, Y, INCY
+    vstelm.d VXZ, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.d VXZ, Y, 0, 1
+    add.d Y, Y, INCY
+    vstelm.d VXZ, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.d VXZ, Y, 0, 1
+#else
+    vstelm.w VXZ, Y, 0, 0 // store through Y itself (not YY): the .L997 tail continues from Y, so Y must advance here
+    add.d Y, Y, INCY
+    vstelm.w VXZ, Y, 0, 1
+    add.d Y, Y, INCY
+    vstelm.w VXZ, Y, 0, 2
+    add.d Y, Y, INCY
+    vstelm.w VXZ, Y, 0, 3
+    add.d Y, Y, INCY
+    vstelm.w VXZ, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.w VXZ, Y, 0, 1
+    add.d Y, Y, INCY
+    vstelm.w VXZ, Y, 0, 2
+    add.d Y, Y, INCY
+    vstelm.w VXZ, Y, 0, 3
+#endif
+    add.d Y, Y, INCY // step past the 8th stored element
+    addi.d  I, I, -1
+    blt $r0, I, .L124 // loop while I > 0
+    b .L997 // scalar tail for the remaining N & 7 elements
+    .align 3
+
+.L21: // INCX!=1 and INCY==1: pick the loop variant from ALPHA==0 / BETA==0
+    bge $r0, I, .L997 // fewer than 8 elements: scalar tail only
+    CMPEQ $fcc0, ALPHA, a1 // a1 holds zero
+    bcnez $fcc0, .L210 // ALPHA==0
+    CMPEQ $fcc0, BETA, a1
+    bcnez $fcc0, .L212 // ALPHA!=0 BETA==0
+    b .L211 // ALPHA!=0 BETA!=0
+    .align 3
+
+.L210: // ALPHA==0: split on BETA
+    CMPEQ $fcc0, BETA, a1
+    bcnez $fcc0, .L214 // ALPHA==0 BETA==0
+    b .L213 // ALPHA==0 BETA!=0
+    .align 3
+
+.L211: // ALPHA!=0 BETA!=0, strided x, contiguous y: y = x * ALPHA + y * BETA, 8 elements per pass
+    vld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE // gather two strided x values
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    add.d X, X, INCX
+    vfmul.d VX0, VXA, VX0 // x * ALPHA
+    vld VX3, Y, 2 * SIZE
+    vfmadd.d VX2, VX2, VXB, VX0 // y * BETA + x * ALPHA
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vst VX2, Y, 0 * SIZE
+    vfmul.d VX1, VXA, VX1
+    vld VX2, Y, 4 * SIZE
+    vfmadd.d VX3, VX3, VXB, VX1
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vst VX3, Y, 2 * SIZE
+    vfmul.d VX0, VX0, VXA
+    vld VX3, Y, 6 * SIZE
+    vfmadd.d VX2, VX2, VXB, VX0
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vst VX2, Y, 4 * SIZE
+    vfmul.d VX1, VX1, VXA
+    vfmadd.d VX3, VX3, VXB, VX1
+    addi.d  I, I, -1
+    vst VX3, Y, 6 * SIZE
+#else
+    ld.w t1, X, 0 * SIZE // gather four strided x values
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    vfmul.s VX0, VXA, VX0 // x * ALPHA
+    vld VX3, Y, 4 * SIZE
+    vfmadd.s VX2, VX2, VXB, VX0 // y * BETA + x * ALPHA
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vst VX2, Y, 0 * SIZE
+    vfmul.s VX1, VX1, VXA
+    vfmadd.s VX3, VX3, VXB, VX1
+    addi.d  I, I, -1
+    vst VX3, Y, 4 * SIZE
+#endif
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L211 // loop while I > 0
+    b .L997 // scalar tail for the remaining N & 7 elements
+    .align 3
+
+.L212: // ALPHA!=0 BETA==0, strided x, contiguous y: y = x * ALPHA (y is never loaded), 8 elements per pass
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE // gather two strided x values
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    add.d X, X, INCX
+    vfmul.d VX0, VXA, VX0 // x * ALPHA
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vst VX0, Y, 0 * SIZE
+    vfmul.d VX1, VXA, VX1
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vst VX1, Y, 2 * SIZE
+    vfmul.d VX0, VX0, VXA
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vst VX0, Y, 4 * SIZE
+    vfmul.d VX1, VX1, VXA
+    addi.d  I, I, -1
+    vst VX1, Y, 6 * SIZE
+#else
+    ld.w t1, X, 0 * SIZE // gather four strided x values
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    vfmul.s VX0, VXA, VX0 // x * ALPHA
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vst VX0, Y, 0 * SIZE
+    vfmul.s VX1, VX1, VXA
+    addi.d  I, I, -1
+    vst VX1, Y, 4 * SIZE
+#endif
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L212 // loop while I > 0
+    b .L997 // scalar tail for the remaining N & 7 elements
+    .align 3
+
+.L213: // ALPHA==0 BETA!=0, contiguous y: y = y * BETA in place, 8 elements per pass (x is not read)
+    vld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
+    vld VX3, Y, 2 * SIZE
+    vfmul.d VX2, VX2, VXB
+    vfmul.d VX3, VX3, VXB
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 2 * SIZE
+    vld VX2, Y, 4 * SIZE // second half of the pass: elements 4..7
+    vld VX3, Y, 6 * SIZE
+    vfmul.d VX2, VX2, VXB
+    vfmul.d VX3, VX3, VXB
+    vst VX2, Y, 4 * SIZE
+    vst VX3, Y, 6 * SIZE
+#else
+    vld VX3, Y, 4 * SIZE
+    vfmul.s VX2, VX2, VXB
+    vfmul.s VX3, VX3, VXB
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+#endif
+    addi.d Y, Y, 8 * SIZE // only Y advances; NOTE(review): X keeps its entry value, so the .L997 tail reads x from the start
+    addi.d  I, I, -1
+    blt $r0, I, .L213 // loop while I > 0
+    b .L997 // scalar tail for the remaining N & 7 elements
+    .align 3
+
+.L214: // ALPHA==0 BETA==0, contiguous y: overwrite 8 elements of y with zeros (VXZ) per pass
+    vst VXZ, Y, 0 * SIZE
+#ifdef DOUBLE
+    vst VXZ, Y, 2 * SIZE
+    vst VXZ, Y, 4 * SIZE
+    vst VXZ, Y, 6 * SIZE
+#else
+    vst VXZ, Y, 4 * SIZE
+#endif
+    addi.d Y, Y, 8 * SIZE // only Y advances; X is not touched in this loop
+    addi.d  I, I, -1
+    blt $r0, I, .L214 // loop while I > 0
+    b .L997 // scalar tail for the remaining N & 7 elements
+    .align 3
+
+.L22: // INCX!=1 and INCY!=1: pick the loop variant from ALPHA==0 / BETA==0
+    bge $r0, I, .L997 // fewer than 8 elements: scalar tail only
+    move YY, Y // YY = second cursor over y, used by the loops below for stores
+    CMPEQ $fcc0, ALPHA, a1 // a1 holds zero
+    bcnez $fcc0, .L220 // ALPHA==0
+    CMPEQ $fcc0, BETA, a1
+    bcnez $fcc0, .L222 // ALPHA!=0 BETA==0
+    b .L221 // ALPHA!=0 BETA!=0
+    .align 3
+
+.L220: // ALPHA==0: split on BETA
+    CMPEQ $fcc0, BETA, a1
+    bcnez $fcc0, .L224 // ALPHA==0 BETA==0
+    b .L223 // ALPHA==0 BETA!=0
+    .align 3
+
+.L221: // ALPHA!=0 BETA!=0, strided x and y: y = x * ALPHA + y * BETA, 8 elements per pass
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE // gather two strided x values
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t3, Y, 0 * SIZE // gather two strided y values through the read cursor Y
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t3, 0
+    vinsgr2vr.d VX2, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d VX0, VX0, VXA // x * ALPHA
+    vfmadd.d VX2, VX2, VXB, VX0 // y * BETA + x * ALPHA
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    vstelm.d VX2, YY, 0, 0 // scatter results through the write cursor YY
+    add.d YY, YY, INCY
+    vstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d VX1, VX1, VXA
+    vfmadd.d VX3, VX3, VXB, VX1
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t3, 0
+    vinsgr2vr.d VX2, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d VX0, VX0, VXA
+    vfmadd.d VX2, VX2, VXB, VX0
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vstelm.d VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t1, 0
+    vinsgr2vr.d VX3, t2, 1
+    add.d Y, Y, INCY
+    vfmul.d VX1, VX1, VXA
+    vfmadd.d VX3, VX3, VXB, VX1
+    addi.d  I, I, -1
+    vstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX3, YY, 0, 1
+#else
+    ld.w t1, X, 0 * SIZE // gather four strided x values
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, Y, 0 * SIZE // gather four strided y values through the read cursor Y
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VX0, VX0, VXA // x * ALPHA
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vfmadd.s VX2, VX2, VXB, VX0 // y * BETA + x * ALPHA
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vstelm.w VX2, YY, 0, 0 // scatter results through the write cursor YY
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VX1, VX1, VXA
+    addi.d  I, I, -1
+    vfmadd.s VX3, VX3, VXB, VX1
+    vstelm.w VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 3
+#endif
+    add.d YY, YY, INCY // step past the 8th stored element
+    blt $r0, I, .L221 // loop while I > 0
+    b .L997 // scalar tail; X and Y were advanced by the loads above
+    .align 3
+
+.L222: // ALPHA!=0 BETA==0, strided x and y: y = x * ALPHA (y is never loaded), 8 elements per pass
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE // gather two strided x values
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    add.d X, X, INCX
+    vfmul.d VX0, VX0, VXA // x * ALPHA
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vstelm.d VX0, Y, 0, 0 // store through Y itself (not YY): the .L997 tail continues from Y, so Y must advance here
+    add.d Y, Y, INCY
+    vstelm.d VX0, Y, 0, 1
+    add.d Y, Y, INCY
+    vfmul.d VX1, VX1, VXA
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vstelm.d VX1, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.d VX1, Y, 0, 1
+    add.d Y, Y, INCY
+    vfmul.d VX0, VX0, VXA
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vstelm.d VX0, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.d VX0, Y, 0, 1
+    add.d Y, Y, INCY
+    vfmul.d VX1, VX1, VXA
+    addi.d  I, I, -1
+    vstelm.d VX1, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.d VX1, Y, 0, 1
+#else
+    ld.w t1, X, 0 * SIZE // gather four strided x values
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    vfmul.s VX0, VX0, VXA // x * ALPHA
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vstelm.w VX0, Y, 0, 0 // store through Y itself (not YY): the .L997 tail continues from Y, so Y must advance here
+    add.d Y, Y, INCY
+    vstelm.w VX0, Y, 0, 1
+    add.d Y, Y, INCY
+    vstelm.w VX0, Y, 0, 2
+    add.d Y, Y, INCY
+    vstelm.w VX0, Y, 0, 3
+    add.d Y, Y, INCY
+    vfmul.s VX1, VX1, VXA
+    addi.d  I, I, -1
+    vstelm.w VX1, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.w VX1, Y, 0, 1
+    add.d Y, Y, INCY
+    vstelm.w VX1, Y, 0, 2
+    add.d Y, Y, INCY
+    vstelm.w VX1, Y, 0, 3
+#endif
+    add.d Y, Y, INCY // step past the 8th stored element
+    blt $r0, I, .L222 // loop while I > 0
+    b .L997 // scalar tail for the remaining N & 7 elements
+    .align 3
+
+.L223: // ALPHA==0 BETA!=0, strided y: y = y * BETA, 8 elements per pass (x is not read, X is not advanced)
+#ifdef DOUBLE
+    ld.d t1, Y, 0 * SIZE // gather two strided y values through the read cursor Y
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    add.d Y, Y, INCY
+    vfmul.d VX2, VX2, VXB // y * BETA
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    vstelm.d VX2, YY, 0, 0 // scatter results through the write cursor YY
+    add.d YY, YY, INCY
+    vstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vfmul.d VX3, VX3, VXB
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    vstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    vfmul.d VX2, VX2, VXB
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    vstelm.d VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vfmul.d VX3, VX3, VXB
+    addi.d  I, I, -1
+    vstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX3, YY, 0, 1
+#else
+    ld.w t1, Y, 0 * SIZE // gather four strided y values through the read cursor Y
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VX2, VX2, VXB // y * BETA
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    vstelm.w VX2, YY, 0, 0 // scatter results through the write cursor YY
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    vfmul.s VX3, VX3, VXB
+    addi.d  I, I, -1
+    vstelm.w VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 3
+#endif
+    add.d YY, YY, INCY // step past the 8th stored element
+    blt $r0, I, .L223 // loop while I > 0
+    b .L997 // scalar tail; Y was advanced by the loads above
+    .align 3
+
+.L224: // ALPHA==0 BETA==0, strided y: store zeros (VXZ) to 8 elements of y per pass
+#ifdef DOUBLE
+    vstelm.d VXZ, Y, 0, 0 // store through Y itself (not YY): the .L997 tail continues from Y, so Y must advance here
+    add.d Y, Y, INCY
+    vstelm.d VXZ, Y, 0, 1
+    add.d Y, Y, INCY
+    vstelm.d VXZ, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.d VXZ, Y, 0, 1
+    add.d Y, Y, INCY
+    vstelm.d VXZ, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.d VXZ, Y, 0, 1
+    add.d Y, Y, INCY
+    vstelm.d VXZ, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.d VXZ, Y, 0, 1
+#else
+    vstelm.w VXZ, Y, 0, 0 // store through Y itself (not YY): the .L997 tail continues from Y, so Y must advance here
+    add.d Y, Y, INCY
+    vstelm.w VXZ, Y, 0, 1
+    add.d Y, Y, INCY
+    vstelm.w VXZ, Y, 0, 2
+    add.d Y, Y, INCY
+    vstelm.w VXZ, Y, 0, 3
+    add.d Y, Y, INCY
+    vstelm.w VXZ, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.w VXZ, Y, 0, 1
+    add.d Y, Y, INCY
+    vstelm.w VXZ, Y, 0, 2
+    add.d Y, Y, INCY
+    vstelm.w VXZ, Y, 0, 3
+#endif
+    add.d Y, Y, INCY // step past the 8th stored element
+    addi.d  I, I, -1
+    blt $r0, I, .L224 // loop while I > 0
+    b .L997 // scalar tail for the remaining N & 7 elements
+    .align 3
+
+.L997: // scalar tail: handle the N & 7 elements left over by the 8-wide loops
+    andi I, N, 7 // I = N mod 8
+    bge $r0, I, .L999 // nothing left
+    .align 3
+
+.L998: // one element per pass: y = x * ALPHA + y * BETA; also entered from the prologue with I = N
+    LD   $f12, X, 0 * SIZE
+    LD   $f13, Y, 0 * SIZE
+    addi.d I, I, -1
+    MUL  $f12, $f12, ALPHA // f12 = x * ALPHA
+    MADD $f13, $f13, BETA, $f12 // f13 = y * BETA + f12 (BETA is applied even when it is zero)
+    ST   $f13, Y, 0 * SIZE
+    add.d  X, X, INCX // INCX/INCY were scaled by BASE_SHIFT in the prologue
+    add.d  Y, Y, INCY
+    blt $r0, I, .L998 // loop while I > 0
+    .align 3
+
+.L999: // common exit
+    move $r4, $r12 // copy I ($r12) into $r4 (presumably the return value register - confirm)
+    jirl $r0, $r1, 0x0 // jump to the address in $r1 without linking, i.e. return
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/axpy_lasx.S b/kernel/loongarch64/axpy_lasx.S
new file mode 100644
index 000000000..707fd09b5
--- /dev/null
+++ b/kernel/loongarch64/axpy_lasx.S
@@ -0,0 +1,529 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4
+#define XX     $r5
+#define YY     $r6
+#define ALPHA  $f0
+#define X      $r7
+#define INCX   $r8
+#define Y      $r9
+#define INCY   $r10
+
+#define I      $r12
+#define TEMP   $r13
+#define t1     $r14
+#define t2     $r16
+#define t3     $r15
+#define t4     $r17
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define b1     $f16
+#define b2     $f17
+#define b3     $f18
+#define b4     $f19
+#define VX0    $xr8
+#define VX1    $xr20
+#define VX2    $xr21
+#define VX3    $xr22
+#define VXA    $xr23
+
+    PROLOGUE
+
+    // axpy kernel (LASX): y := ALPHA * x + y over N elements.
+    // Register roles follow the #defines above: N=$r4, ALPHA=$f0, X=$r7,
+    // INCX=$r8, Y=$r9, INCY=$r10.
+    bge $r0, N, .L999 // nothing to do for N <= 0
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    FFINT   a1,  a1 // a1 = 0 converted to floating point
+    movgr2fr.d a2, TEMP
+    FFINT   a2,  a2 // a2 = 1 converted to floating point
+    CMPEQ   $fcc0, ALPHA, a1
+    bcnez $fcc0, .L999 // ALPHA==0: y is left untouched
+    // From here on TEMP is one element shifted by BASE_SHIFT and both strides
+    // are scaled the same way, so "stride == TEMP" means unit stride.
+    slli.d  TEMP, TEMP, BASE_SHIFT
+    slli.d  INCX, INCX, BASE_SHIFT
+    slli.d  INCY, INCY, BASE_SHIFT
+    MTG t1, ALPHA
+    // Replicate ALPHA into every lane of VXA for the vector multiply-adds.
+#ifdef DOUBLE
+    xvreplgr2vr.d VXA, t1
+#else
+    xvreplgr2vr.w VXA, t1
+#endif
+
+    // INCY==0 makes every update hit the same y element. The gather/scatter
+    // loops read eight copies of y before storing any of them, which would
+    // keep only the last update of each group, so accumulate one by one.
+    beqz INCY, .L30
+    srai.d I, N, 3 // I = number of full groups of eight
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L30: // INCY==0
+    move I, N
+    .align 3
+
+.L31:
+    // y[0] += ALPHA * x[i], strictly in order; Y never moves.
+    LD  $f12, X, 0 * SIZE
+    LD  $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    MADD $f14, $f12, $f0, $f14
+    ST  $f14, Y, 0 * SIZE
+    add.d X, X, INCX
+    blt $r0, I, .L31
+    b .L999
+    .align 3
+
+.L11:
+    // Unit-stride path (INCX==1, INCY==1). Go to the scalar tail when there is
+    // no full group of eight, and to .L112 unless ALPHA compares equal to a2
+    // (the constant built from integer 1 in the prologue).
+    bge $r0, I, .L113
+    CMPEQ $fcc0, ALPHA, a2
+    bceqz $fcc0, .L112
+    .align 3
+
+.L111:
+    // ALPHA equal to a2: plain y += x, eight elements per pass, no multiply.
+#ifdef DOUBLE
+    // Two 256-bit loads per operand cover eight doubles.
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    xvfadd.d VX2, VX0, VX2
+    xvfadd.d VX3, VX1, VX3
+    addi.d  I, I, -1
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 4 * SIZE
+#else
+    // One 256-bit load per operand covers eight floats.
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    addi.d  I, I, -1
+    xvfadd.s VX2, VX0, VX2
+    xvst VX2, Y, 0 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L111
+    b .L113
+    .align 3
+
+.L112:
+    // Unit-stride loop for general ALPHA: y = x * VXA + y, eight elements per
+    // pass, where VXA carries ALPHA in every lane. Falls through to the
+    // .L113 tail when the group counter I runs out.
+#ifdef DOUBLE
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    xvfmadd.d VX2, VX0, VXA, VX2
+    xvfmadd.d VX3, VX1, VXA, VX3
+    addi.d  I, I, -1
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 4 * SIZE
+#else
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    addi.d  I, I, -1
+    xvfmadd.s VX2, VX0, VXA, VX2
+    xvst VX2, Y, 0 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L112
+    .align 3
+
+.L113:
+    // Scalar tail of the unit-stride path: the N & 7 leftover elements.
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L114:
+    // One element per pass: $f14 = x * ALPHA ($f0) + y, stored back to y.
+    LD  $f12, X, 0 * SIZE
+    LD  $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    MADD $f14, $f12, $f0, $f14
+    ST  $f14, Y, 0 * SIZE
+    addi.d  X, X, SIZE
+    addi.d  Y, Y, SIZE
+    blt $r0, I, .L114
+    b .L999
+    .align 3
+
+.L12: // INCX==1 and INCY!=1
+    // x is contiguous, y is strided. Y is the read cursor for y and YY the
+    // write cursor; both start at the same place and step by INCY.
+    bge $r0, I, .L122
+    move YY, Y
+    .align 3
+
+.L121:
+    // Eight elements per pass: vector-load x, gather y lane by lane through
+    // the integer temporaries t1..t4, multiply-add with VXA (ALPHA in every
+    // lane), then scatter the result one lane at a time through YY.
+#ifdef DOUBLE
+    // First four doubles.
+    xvld VX0, X, 0 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX2, t1, 0
+    xvinsgr2vr.d VX2, t2, 1
+    xvinsgr2vr.d VX2, t3, 2
+    xvinsgr2vr.d VX2, t4, 3
+    add.d Y, Y, INCY
+    xvfmadd.d VX2, VX0, VXA, VX2
+    xvld VX1, X, 4 * SIZE
+    xvstelm.d VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    // Second four doubles.
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX3, t1, 0
+    xvinsgr2vr.d VX3, t2, 1
+    xvinsgr2vr.d VX3, t3, 2
+    xvinsgr2vr.d VX3, t4, 3
+    add.d Y, Y, INCY
+    xvfmadd.d VX3, VX1, VXA, VX3
+    addi.d  I, I, -1
+    xvstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 3
+#else
+    // All eight floats fit one vector: gather lanes 0-3, then lanes 4-7.
+    xvld VX0, X, 0 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmadd.s VX2, VX0, VXA, VX2
+    addi.d  I, I, -1
+    xvstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 7
+#endif
+    // Shared by both precisions: step past the eighth store and the eight x.
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L121
+    .align 3
+
+.L122:
+    // Scalar tail for contiguous x / strided y: the N & 7 leftover elements.
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L123:
+    // One element per pass: $f14 = x * ALPHA ($f0) + y, stored back to y.
+    // X steps by one element, Y by its stride.
+    LD  $f12, X, 0 * SIZE
+    LD  $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    MADD $f14, $f12, $f0, $f14
+    ST  $f14, Y, 0 * SIZE
+    addi.d  X, X, SIZE
+    add.d  Y, Y, INCY
+    blt $r0, I, .L123
+    b .L999
+    .align 3
+
+.L21:// INCX!=1 and INCY==1
+    // x is strided, y is contiguous: x is gathered lane by lane, y is loaded
+    // and stored as whole vectors.
+    bge $r0, I, .L212
+    .align 3
+
+.L211:
+    // Eight elements per pass: y = gathered x * VXA + y.
+#ifdef DOUBLE
+    // First four doubles.
+    xvld VX2, Y, 0 * SIZE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    add.d X, X, INCX
+    xvfmadd.d VX2, VX0, VXA, VX2
+    xvld VX3, Y, 4 * SIZE
+    xvst VX2, Y, 0 * SIZE
+    // Second four doubles.
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    add.d X, X, INCX
+    xvfmadd.d VX3, VX1, VXA, VX3
+    addi.d  I, I, -1
+    xvst VX3, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#else
+    // All eight floats fit one vector: gather lanes 0-3, then lanes 4-7.
+    xvld VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    xvfmadd.s VX2, VX0, VXA, VX2
+    addi.d  I, I, -1
+    xvst VX2, Y, 0 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#endif
+    blt $r0, I, .L211
+    .align 3
+
+.L212:
+    // Scalar tail for strided x / contiguous y: the N & 7 leftover elements.
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L213:
+    // One element per pass: $f14 = x * ALPHA ($f0) + y, stored back to y.
+    // X steps by its stride, Y by one element.
+    LD  $f12, X, 0 * SIZE
+    LD  $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    MADD $f14, $f12, $f0, $f14
+    ST  $f14, Y, 0 * SIZE
+    add.d X, X, INCX
+    addi.d Y, Y, SIZE
+    blt $r0, I, .L213
+    b .L999
+    .align 3
+
+.L22:
+    // Both x and y are strided. Y is the read cursor for y and YY the write
+    // cursor; both start at the same place and step by INCY.
+    bge $r0, I, .L223
+    move YY, Y
+    .align 3
+
+.L222:
+    // Eight elements per pass: gather x and y lane by lane through t1..t4,
+    // multiply-add with VXA (ALPHA in every lane), scatter through YY.
+#ifdef DOUBLE
+    // First four doubles: gather x into VX0 and y into VX2.
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    add.d X, X, INCX
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX2, t1, 0
+    xvinsgr2vr.d VX2, t2, 1
+    xvinsgr2vr.d VX2, t3, 2
+    xvinsgr2vr.d VX2, t4, 3
+    add.d Y, Y, INCY
+    xvfmadd.d VX2, VX0, VXA, VX2
+    // Gather the second four x values into VX1 before storing the first result.
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvstelm.d VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    // Second four doubles of y into VX3.
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX3, t1, 0
+    xvinsgr2vr.d VX3, t2, 1
+    xvinsgr2vr.d VX3, t3, 2
+    xvinsgr2vr.d VX3, t4, 3
+    add.d Y, Y, INCY
+    xvfmadd.d VX3, VX1, VXA, VX3
+    addi.d  I, I, -1
+    xvstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VX3, YY, 0, 3
+#else
+    // All eight floats fit one vector each: lanes 0-3 of x and y first,
+    // then lanes 4-7 of x and y.
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmadd.s VX2, VX0, VXA, VX2
+    addi.d  I, I, -1
+    xvstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 7
+#endif
+    // Shared by both precisions: step past the eighth store.
+    add.d YY, YY, INCY
+    blt $r0, I, .L222
+    .align 3
+
+.L223:
+    // Scalar tail for strided x / strided y: the N & 7 leftover elements.
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L224:
+    // One element per pass: $f14 = x * ALPHA ($f0) + y, stored back to y.
+    // Both pointers step by their strides; falls through to .L999 when done.
+    LD  $f12, X, 0 * SIZE
+    LD  $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    MADD $f14, $f12, $f0, $f14
+    ST  $f14, Y, 0 * SIZE
+    add.d X, X, INCX
+    add.d Y, Y, INCY
+    blt $r0, I, .L224
+    .align 3
+
+.L999:
+    // Common exit: copy $r12 (the loop counter I) into $r4 and jump to the
+    // address held in $r1 without linking ($r0 discards the link value).
+    // NOTE(review): on the early exits from the prologue I has not been
+    // written yet, so the value left in $r4 looks incidental - confirm that
+    // callers ignore it.
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/axpy_lsx.S b/kernel/loongarch64/axpy_lsx.S
new file mode 100644
index 000000000..0d74e2bce
--- /dev/null
+++ b/kernel/loongarch64/axpy_lsx.S
@@ -0,0 +1,573 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4
+#define XX     $r5
+#define YY     $r6
+#define ALPHA  $f0
+#define X      $r7
+#define INCX   $r8
+#define Y      $r9
+#define INCY   $r10
+
+#define I      $r12
+#define TEMP   $r13
+#define t1     $r14
+#define t2     $r16
+#define t3     $r15
+#define t4     $r17
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define b1     $f16
+#define b2     $f17
+#define b3     $f18
+#define b4     $f19
+#define VX0    $vr8
+#define VX1    $vr20
+#define VX2    $vr21
+#define VX3    $vr22
+#define VXA    $vr23
+
+    PROLOGUE
+
+    // axpy kernel (LSX): y := ALPHA * x + y over N elements.
+    // Register roles follow the #defines above: N=$r4, ALPHA=$f0, X=$r7,
+    // INCX=$r8, Y=$r9, INCY=$r10.
+    bge $r0, N, .L999 // nothing to do for N <= 0
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    FFINT   a1,  a1 // a1 = 0 converted to floating point
+    movgr2fr.d a2, TEMP
+    FFINT   a2,  a2 // a2 = 1 converted to floating point
+    CMPEQ   $fcc0, ALPHA, a1
+    bcnez $fcc0, .L999 // ALPHA==0: y is left untouched
+    // From here on TEMP is one element shifted by BASE_SHIFT and both strides
+    // are scaled the same way, so "stride == TEMP" means unit stride.
+    slli.d  TEMP, TEMP, BASE_SHIFT
+    slli.d  INCX, INCX, BASE_SHIFT
+    slli.d  INCY, INCY, BASE_SHIFT
+    MTG t1, ALPHA
+    // Replicate ALPHA into every lane of VXA for the vector multiply-adds.
+#ifdef DOUBLE
+    vreplgr2vr.d VXA, t1
+#else
+    vreplgr2vr.w VXA, t1
+#endif
+
+    // INCY==0 makes every update hit the same y element. The gather/scatter
+    // loops read several copies of y before storing any of them, which would
+    // keep only the last update of each group, so accumulate one by one.
+    beqz INCY, .L30
+    srai.d I, N, 3 // I = number of full groups of eight
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L30: // INCY==0
+    move I, N
+    .align 3
+
+.L31:
+    // y[0] += ALPHA * x[i], strictly in order; Y never moves.
+    LD  $f12, X, 0 * SIZE
+    LD  $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    MADD $f14, $f12, $f0, $f14
+    ST  $f14, Y, 0 * SIZE
+    add.d X, X, INCX
+    blt $r0, I, .L31
+    b .L999
+    .align 3
+
+.L11:
+    // Unit-stride path (INCX==1, INCY==1). Go to the scalar tail when there is
+    // no full group of eight, and to .L112 unless ALPHA compares equal to a2
+    // (the constant built from integer 1 in the prologue).
+    bge $r0, I, .L113
+    CMPEQ $fcc0, ALPHA, a2
+    bceqz $fcc0, .L112
+    .align 3
+
+.L111:
+    // ALPHA equal to a2: plain y += x, eight elements per pass, no multiply.
+#ifdef DOUBLE
+    // 128-bit vectors hold two doubles, so eight elements take four pairs
+    // of loads, done as two rounds of two.
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vld VX3, Y, 2 * SIZE
+    vfadd.d VX2, VX0, VX2
+    vfadd.d VX3, VX1, VX3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 2 * SIZE
+    vld VX0, X, 4 * SIZE
+    vld VX2, Y, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vld VX3, Y, 6 * SIZE
+    vfadd.d VX2, VX0, VX2
+    vfadd.d VX3, VX1, VX3
+    vst VX2, Y, 4 * SIZE
+    vst VX3, Y, 6 * SIZE
+#else
+    // 128-bit vectors hold four floats: two vectors per operand.
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vfadd.s VX2, VX0, VX2
+    vfadd.s VX3, VX1, VX3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L111
+    b .L113
+    .align 3
+
+.L112:
+    // Unit-stride loop for general ALPHA: y = x * VXA + y, eight elements per
+    // pass, where VXA carries ALPHA in every lane. Each precision branch does
+    // its own pointer and counter updates. Falls through to the .L113 tail.
+#ifdef DOUBLE
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vld VX3, Y, 2 * SIZE
+    vfmadd.d VX2, VX0, VXA, VX2
+    vfmadd.d VX3, VX1, VXA, VX3
+    addi.d  I, I, -1
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 2 * SIZE
+    vld VX0, X, 4 * SIZE
+    vld VX2, Y, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vld VX3, Y, 6 * SIZE
+    // X is advanced as soon as its last load is issued; Y must wait until
+    // the stores at offsets 4 and 6 below are done.
+    addi.d X, X, 8 * SIZE
+    vfmadd.d VX2, VX0, VXA, VX2
+    vfmadd.d VX3, VX1, VXA, VX3
+    vst VX2, Y, 4 * SIZE
+    vst VX3, Y, 6 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#else
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vfmadd.s VX2, VX0, VXA, VX2
+    vfmadd.s VX3, VX1, VXA, VX3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+#endif
+    blt $r0, I, .L112
+    .align 3
+
+.L113:
+    // Scalar tail of the unit-stride path: the N & 7 leftover elements.
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L114:
+    // One element per pass: $f14 = x * ALPHA ($f0) + y, stored back to y.
+    LD  $f12, X, 0 * SIZE
+    LD  $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    MADD $f14, $f12, $f0, $f14
+    ST  $f14, Y, 0 * SIZE
+    addi.d  X, X, SIZE
+    addi.d  Y, Y, SIZE
+    blt $r0, I, .L114
+    b .L999
+    .align 3
+
+.L12: // INCX==1 and INCY!=1
+    // x is contiguous, y is strided. Y is the read cursor for y and YY the
+    // write cursor; both start at the same place and step by INCY.
+    bge $r0, I, .L122
+    move YY, Y
+    .align 3
+
+.L121:
+    // Eight elements per pass: vector-load x, gather y lane by lane through
+    // the integer temporaries, multiply-add with VXA (ALPHA in every lane),
+    // then scatter the result one lane at a time through YY.
+#ifdef DOUBLE
+    // Two doubles per 128-bit vector: four rounds, alternating VX0/VX2 and
+    // VX1/VX3. The next x vector is loaded before the current stores.
+    vld VX0, X, 0 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    add.d Y, Y, INCY
+    vfmadd.d VX2, VX0, VXA, VX2
+    vld VX1, X, 2 * SIZE
+    vstelm.d VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    vfmadd.d VX3, VX1, VXA, VX3
+    vld VX0, X, 4 * SIZE
+    vstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    add.d Y, Y, INCY
+    vfmadd.d VX2, VX0, VXA, VX2
+    vld VX1, X, 6 * SIZE
+    vstelm.d VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    vfmadd.d VX3, VX1, VXA, VX3
+    vstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+#else
+    // Four floats per 128-bit vector: two rounds (VX0/VX2, then VX1/VX3).
+    vld VX0, X, 0 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    vfmadd.s VX2, VX0, VXA, VX2
+    vld VX1, X, 4 * SIZE
+    vstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    add.d Y, Y, INCY
+    vfmadd.s VX3, VX1, VXA, VX3
+    addi.d  I, I, -1
+    vstelm.w VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 3
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+#endif
+    blt $r0, I, .L121
+    .align 3
+
+.L122:
+    // Scalar tail for contiguous x / strided y: the N & 7 leftover elements.
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L123:
+    // One element per pass: $f14 = x * ALPHA ($f0) + y, stored back to y.
+    // X steps by one element, Y by its stride.
+    LD  $f12, X, 0 * SIZE
+    LD  $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    MADD $f14, $f12, $f0, $f14
+    ST  $f14, Y, 0 * SIZE
+    addi.d  X, X, SIZE
+    add.d  Y, Y, INCY
+    blt $r0, I, .L123
+    b .L999
+    .align 3
+
+.L21:// INCX!=1 and INCY==1
+    // x is strided, y is contiguous: x is gathered lane by lane, y is loaded
+    // and stored as whole vectors.
+    bge $r0, I, .L212
+    .align 3
+
+.L211:
+    // Eight elements per pass: y = gathered x * VXA + y.
+#ifdef DOUBLE
+    // Two doubles per 128-bit vector: four rounds, alternating VX0/VX2 and
+    // VX1/VX3. The next y vector is loaded before the current one is stored.
+    vld VX2, Y, 0 * SIZE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    add.d X, X, INCX
+    vfmadd.d VX2, VX0, VXA, VX2
+    vld VX3, Y, 2 * SIZE
+    vst VX2, Y, 0 * SIZE
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vfmadd.d VX3, VX1, VXA, VX3
+    vld VX2, Y, 4 * SIZE
+    vst VX3, Y, 2 * SIZE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    add.d X, X, INCX
+    vfmadd.d VX2, VX0, VXA, VX2
+    vld VX3, Y, 6 * SIZE
+    vst VX2, Y, 4 * SIZE
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vfmadd.d VX3, VX1, VXA, VX3
+    addi.d  I, I, -1
+    vst VX3, Y, 6 * SIZE
+#else
+    // Four floats per 128-bit vector: two rounds (VX0/VX2, then VX1/VX3).
+    vld VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    vfmadd.s VX2, VX0, VXA, VX2
+    vld VX3, Y, 4 * SIZE
+    vst VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    add.d X, X, INCX
+    vfmadd.s VX3, VX1, VXA, VX3
+    addi.d  I, I, -1
+    vst VX3, Y, 4 * SIZE
+#endif
+    // Shared by both precisions: step past the eight y elements just written.
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L211
+    .align 3
+
+.L212:
+    // Scalar tail for strided x / contiguous y: the N & 7 leftover elements.
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L213:
+    // One element per pass: $f14 = x * ALPHA ($f0) + y, stored back to y.
+    // X steps by its stride, Y by one element.
+    LD  $f12, X, 0 * SIZE
+    LD  $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    MADD $f14, $f12, $f0, $f14
+    ST  $f14, Y, 0 * SIZE
+    add.d X, X, INCX
+    addi.d Y, Y, SIZE
+    blt $r0, I, .L213
+    b .L999
+    .align 3
+
+.L22:
+    // Both x and y are strided. Y is the read cursor for y and YY the write
+    // cursor; both start at the same place and step by INCY.
+    bge $r0, I, .L223
+    move YY, Y
+    .align 3
+
+.L222:
+    // Eight elements per pass: gather x and y lane by lane through t1..t4,
+    // multiply-add with VXA (ALPHA in every lane), scatter through YY.
+#ifdef DOUBLE
+    // Two doubles per 128-bit vector: four rounds, alternating VX0/VX2 and
+    // VX1/VX3. The next x pair is gathered before the current stores.
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    add.d Y, Y, INCY
+    vfmadd.d VX2, VX0, VXA, VX2
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vstelm.d VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    vfmadd.d VX3, VX1, VXA, VX3
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    add.d Y, Y, INCY
+    vfmadd.d VX2, VX0, VXA, VX2
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vstelm.d VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t1, 0
+    vinsgr2vr.d VX3, t2, 1
+    add.d Y, Y, INCY
+    vfmadd.d VX3, VX1, VXA, VX3
+    addi.d  I, I, -1
+    vstelm.d VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VX3, YY, 0, 1
+#else
+    // Four floats per 128-bit vector: two rounds (VX0/VX2, then VX1/VX3).
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    vfmadd.s VX2, VX0, VXA, VX2
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    add.d Y, Y, INCY
+    vfmadd.s VX3, VX1, VXA, VX3
+    addi.d  I, I, -1
+    vstelm.w VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 3
+#endif
+    // Shared by both precisions: step past the eighth store.
+    add.d YY, YY, INCY
+    blt $r0, I, .L222
+    .align 3
+
+.L223:
+    // Scalar tail for strided x / strided y: the N & 7 leftover elements.
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L224:
+    // One element per pass: $f14 = x * ALPHA ($f0) + y, stored back to y.
+    // Both pointers step by their strides; falls through to .L999 when done.
+    LD  $f12, X, 0 * SIZE
+    LD  $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    MADD $f14, $f12, $f0, $f14
+    ST  $f14, Y, 0 * SIZE
+    add.d X, X, INCX
+    add.d Y, Y, INCY
+    blt $r0, I, .L224
+    .align 3
+
+.L999:
+    // Common exit: copy $r12 (the loop counter I) into $r4 and jump to the
+    // address held in $r1 without linking ($r0 discards the link value).
+    // NOTE(review): on the early exits from the prologue I has not been
+    // written yet, so the value left in $r4 looks incidental - confirm that
+    // callers ignore it.
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/camax_lasx.S b/kernel/loongarch64/camax_lasx.S
new file mode 100644
index 000000000..f9a4e9012
--- /dev/null
+++ b/kernel/loongarch64/camax_lasx.S
@@ -0,0 +1,212 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N    $r4
+#define X    $r5
+#define INCX $r6
+#define I    $r12
+#define t1   $f14
+#define t2   $f18
+#define t3   $f15
+#define t4   $f17
+#define s1   $f22
+#define s2   $f9
+#define s3   $f10
+#define s4   $f11
+#define TEMP $r16
+#define a0   $f20
+#define a1   $f21
+#define x1   $xr9
+#define x2   $xr10
+#define x3   $xr11
+#define x4   $xr12
+#define VT0  $xr13
+#define VT1  $xr14
+#define res0 $xr18
+#define neg1 $xr19
+#define VX0  $xr20
+#define VX1  $xr21
+#define VM0  $xr22
+#define VM1  $xr23
+
+    PROLOGUE
+    // Complex amax kernel (LASX): largest |re| + |im| over N complex elements.
+    // VM0 ($xr22) is the running vector maximum; its lowest lane is the scalar
+    // s1 ($f22) that .L999 returns, so clearing VM0 also makes the early exits
+    // below return zero. res0 stays zero and is used to negate by subtraction.
+    xvxor.v VM0, VM0, VM0
+    xvxor.v res0, res0, res0
+    bge $r0, N, .L999 // N <= 0
+    bge $r0, INCX, .L999 // INCX <= 0
+    li.d TEMP, 1
+    // TEMP and INCX are both shifted by ZBASE_SHIFT, so "INCX == TEMP" below
+    // means unit stride.
+    slli.d TEMP, TEMP, ZBASE_SHIFT
+    slli.d INCX, INCX, ZBASE_SHIFT
+    srai.d I, N, 3 // I = number of full groups of eight elements
+    bne INCX, TEMP, .L20
+    bge $r0, I, .L23
+    .align 3
+
+.L10:
+    // Unit-stride main loop, eight complex elements per pass. The even/odd
+    // picks separate the two interleaved components into x1 and x2; the
+    // absolute value is taken as max(v, 0 - v) using the zero vector res0;
+    // the per-element sum of the two is folded into the running maximum VM0.
+    xvld VX0, X, 0
+    xvld VX1, X, 32
+#ifdef DOUBLE
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+#else
+    xvpickev.w x1, VX1, VX0
+    xvpickod.w x2, VX1, VX0
+#endif
+    XVFSUB x3, res0, x1
+    XVFSUB x4, res0, x2
+    XVFMAX x1, x1, x3
+    XVFMAX x2, x2, x4
+    XVFADD VM1, x1, x2
+    XVFMAX VM0, VM0, VM1
+#ifdef DOUBLE
+    // In double precision the two loads above cover only four elements, so
+    // repeat the same steps on the next 64 bytes.
+    xvld VX0, X, 64
+    xvld VX1, X, 96
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+    XVFSUB x3, res0, x1
+    XVFSUB x4, res0, x2
+    XVFMAX x1, x1, x3
+    XVFMAX x2, x2, x4
+    XVFADD VM1, x1, x2
+    XVFMAX VM0, VM0, VM1
+#endif
+    addi.d I, I, -1
+    addi.d X, X, 16 * SIZE
+    blt $r0, I, .L10
+    .align 3
+
+.L11:
+    // Horizontal reduction of the running maximum VM0 into its lowest lane,
+    // which is the scalar s1 used by the tail loop and returned at .L999.
+    // Every lane has to take part: VM0 holds four independent doubles or
+    // eight independent floats. xvpickve copies the selected lane into the
+    // lowest lane of its destination, so lane 0 carries each partial result.
+#ifdef DOUBLE
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    XVFMAX x1, x1, x2
+    XVFMAX x3, x3, x4
+    XVFMAX VM0, x1, x3
+#else
+    // Lanes 0-3 first; the partial maximum is parked in VM1.
+    xvpickve.w x1, VM0, 0
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+    XVFMAX x1, x1, x2
+    XVFMAX x3, x3, x4
+    XVFMAX VM1, x1, x3
+    // Then lanes 4-7, combined with the partial maximum.
+    xvpickve.w x1, VM0, 4
+    xvpickve.w x2, VM0, 5
+    xvpickve.w x3, VM0, 6
+    xvpickve.w x4, VM0, 7
+    XVFMAX x1, x1, x2
+    XVFMAX x3, x3, x4
+    XVFMAX x1, x1, x3
+    XVFMAX VM0, VM1, x1
+#endif
+    b .L23
+    .align 3
+
+.L20: // INCX!=1
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+    // Strided main loop: eight complex elements per pass, taken two at a
+    // time. For each pair, |re| + |im| is formed for both elements and the
+    // larger one is folded into the running maximum s1. s1 ($f22) is the
+    // lowest lane of VM0, which the prologue cleared, so it starts at zero
+    // and carries the maximum across passes.
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
+    add.d X, X, INCX
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX t1, t1, t3
+    FMAX s1, s1, t1
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
+    add.d X, X, INCX
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX t1, t1, t3
+    FMAX s1, s1, t1
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
+    add.d X, X, INCX
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    addi.d I, I, -1
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX t1, t1, t3
+    FMAX s1, s1, t1
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
+    add.d X, X, INCX
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX t1, t1, t3
+    FMAX s1, s1, t1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    // Nothing left to combine: s1 already holds the maximum over every
+    // element processed by .L21. s2..s4 are not used on this path.
+    .align 3
+
+.L23: //N<8
+    // Scalar tail shared by both stride paths: the N & 7 leftover elements
+    // (all of them when N < 8).
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    // One complex element per pass: a0 = |re| + |im|, folded into the running
+    // maximum s1. X steps by INCX, which the prologue scaled by ZBASE_SHIFT.
+    LD a0, X, 0 * SIZE
+    LD a1, X, 1 * SIZE
+    addi.d I, I, -1
+    FABS a0, a0
+    FABS a1, a1
+    ADD a0, a0, a1
+    add.d  X, X, INCX
+    FMAX s1, a0, s1
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    // Exit: move the running maximum s1 ($f22) into $f0 and jump to the
+    // address held in $r1 without linking ($r0 discards the link value).
+    MOV $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/camax_lsx.S b/kernel/loongarch64/camax_lsx.S
new file mode 100644
index 000000000..cf46cb016
--- /dev/null
+++ b/kernel/loongarch64/camax_lsx.S
@@ -0,0 +1,239 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Register aliases.  N, X and INCX are read as inputs from $r4/$r5/$r6;
+   I and TEMP are integer scratch; t1-t4, a0 and a1 are scalar FP
+   temporaries.  s1 ($f22) has the same register number as VM0 ($vr22),
+   so the vector loop's result and the scalar running maximum share one
+   register.  VT0, VT1 and neg1 are not used in this file. */
+#define N    $r4
+#define X    $r5
+#define INCX $r6
+#define I    $r12
+#define t1   $f14
+#define t2   $f18
+#define t3   $f15
+#define t4   $f17
+#define s1   $f22
+#define s2   $f9
+#define s3   $f10
+#define s4   $f11
+#define TEMP $r16
+#define a0   $f20
+#define a1   $f21
+#define x1   $vr9
+#define x2   $vr10
+#define x3   $vr11
+#define x4   $vr12
+#define VT0  $vr13
+#define VT1  $vr14
+#define res0 $vr18
+#define neg1 $vr19
+#define VX0  $vr20
+#define VX1  $vr21
+#define VM0  $vr22
+#define VM1  $vr23
+
+    /* Entry: clear the accumulator, reject N <= 0 or INCX <= 0 (returning
+       the cleared value), then pick the unit-stride vector loop, the
+       strided scalar loop or the scalar tail. */
+    PROLOGUE
+    vxor.v VM0, VM0, VM0             /* running maximum = 0 */
+    vxor.v res0, res0, res0          /* constant zero, used for 0 - x */
+    bge $r0, N, .L999                /* N <= 0 */
+    bge $r0, INCX, .L999             /* INCX <= 0 */
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, ZBASE_SHIFT   /* TEMP = 1 << ZBASE_SHIFT (unit stride after scaling) */
+    slli.d INCX, INCX, ZBASE_SHIFT   /* scale INCX the same way */
+    srai.d I, N, 3                   /* I = number of 8-element groups */
+    bne INCX, TEMP, .L20             /* non-unit stride */
+    bge $r0, I, .L23                 /* fewer than 8 elements */
+    .align 3
+
+.L10:
+    /* Unit-stride main loop.  One pass covers 16 * SIZE bytes (8 elements
+       of two values each) and keeps per-lane maxima of |v0| + |v1| in VM0.
+       Absolute values are formed as max(x, 0 - x). */
+    vld VX0, X, 0
+    vld VX1, X, 16
+#ifdef DOUBLE
+    vpickev.d x1, VX1, VX0     /* even-indexed values */
+    vpickod.d x2, VX1, VX0     /* odd-indexed values */
+#else
+    vpickev.w x1, VX1, VX0     /* even-indexed values */
+    vpickod.w x2, VX1, VX0     /* odd-indexed values */
+#endif
+    VFSUB x3, res0, x1         /* 0 - x1 */
+    VFSUB x4, res0, x2         /* 0 - x2 */
+    VFMAX x1, x1, x3           /* |x1| */
+    VFMAX x2, x2, x4           /* |x2| */
+    VFADD VM1, x1, x2          /* per-element sums */
+
+    vld VX0, X, 32
+    vld VX1, X, 48
+#ifdef DOUBLE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+#else
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+#endif
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD x1, x1, x2
+    VFMAX VM1, x1, VM1         /* merge the two halves ... */
+    VFMAX VM0, VM0, VM1        /* ... into the running maximum */
+#ifdef DOUBLE
+    /* Doubles need four more 16-byte loads to cover the same 8 elements. */
+    vld VX0, X, 64
+    vld VX1, X, 80
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD VM1, x1, x2
+
+    vld VX0, X, 96
+    vld VX1, X, 112
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD x1, x1, x2
+    VFMAX VM1, x1, VM1
+    VFMAX VM0, VM0, VM1
+#endif
+    addi.d X, X, 16 * SIZE
+    addi.d I, I, -1
+    blt $r0, I, .L10
+    .align 3
+
+.L11:
+    /* Horizontal reduction: broadcast every lane of VM0 and take the
+       maximum, so lane 0 (read later as s1) holds the overall maximum. */
+#ifdef DOUBLE
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    VFMAX VM0, x1, x2
+#else
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    VFMAX VM1, x1, x2
+    VFMAX VM0, x3, x4
+    VFMAX VM0, VM0, VM1
+#endif
+    b .L23                     /* continue with the N % 8 leftovers */
+    .align 3
+
+.L20: // INCX!=1
+    /* Strided path: X advances by INCX (scaled in the prologue) after each
+       element; I counts groups of 8 elements. */
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+    /* One pass consumes 8 elements, two per group.  Every group computes
+       |v0| + |v1| for both elements and folds the larger one into the
+       running maximum s1.  (Previously each group overwrote s1/s3/s4 and
+       the results were then merged with s2, which was never written, so
+       all but the last few elements were dropped.) */
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
+    add.d X, X, INCX
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX t1, t1, t3
+    FMAX s1, s1, t1
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
+    add.d X, X, INCX
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX t1, t1, t3
+    FMAX s1, s1, t1
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
+    add.d X, X, INCX
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    addi.d I, I, -1
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX t1, t1, t3
+    FMAX s1, s1, t1
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
+    add.d X, X, INCX
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX t1, t1, t3
+    FMAX s1, s1, t1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    /* Nothing left to merge: s1 already holds the maximum seen so far. */
+    .align 3
+
+.L23: //N<8
+    /* Scalar tail: handle the N % 8 elements the 8-wide loops left over. */
+    andi I, N, 7
+    bge $r0, I, .L999          /* nothing left */
+    .align 3
+
+.L24:
+    LD a0, X, 0 * SIZE         /* first value of the element */
+    LD a1, X, 1 * SIZE         /* second value of the element */
+    addi.d I, I, -1
+    FABS a0, a0
+    FABS a1, a1
+    ADD a0, a0, a1             /* |v0| + |v1| */
+    add.d  X, X, INCX          /* step to the next element */
+    FMAX s1, a0, s1            /* fold into the running maximum */
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    /* Return s1 ($f22) in $f0 and jump back through $r1. */
+    MOV $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/camin_lasx.S b/kernel/loongarch64/camin_lasx.S
new file mode 100644
index 000000000..c1c4c98c8
--- /dev/null
+++ b/kernel/loongarch64/camin_lasx.S
@@ -0,0 +1,221 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+/* Register aliases.  N, X and INCX are read as inputs from $r4/$r5/$r6;
+   I and TEMP are integer scratch; t1-t4, a0 and a1 are scalar FP
+   temporaries.  s1 ($f22) has the same register number as VM0 ($xr22),
+   so the vector loop's result and the scalar running minimum share one
+   register.  VT0, VT1 and neg1 are not used in this file. */
+#define N    $r4
+#define X    $r5
+#define INCX $r6
+#define I    $r12
+#define TEMP $r16
+#define t1   $f14
+#define t2   $f18
+#define t3   $f15
+#define t4   $f17
+#define s1   $f22
+#define s2   $f9
+#define s3   $f10
+#define s4   $f11
+#define a0   $f20
+#define a1   $f21
+#define x1   $xr9
+#define x2   $xr10
+#define x3   $xr11
+#define x4   $xr12
+#define VT0  $xr13
+#define VT1  $xr14
+#define res0 $xr18
+#define neg1 $xr19
+#define VX0  $xr20
+#define VX1  $xr21
+#define VM0  $xr22
+#define VM1  $xr23
+
+    /* Entry: return 0 for N <= 0 or INCX <= 0; otherwise seed the minimum
+       with the first element's |v0| + |v1|, broadcast it to every lane of
+       VM0, and pick the unit-stride vector loop, the strided scalar loop
+       or the scalar tail. */
+    PROLOGUE
+    MTC s1, $r0                      /* s1 = 0, the early-exit result */
+    xvxor.v res0, res0, res0         /* constant zero, used for 0 - x */
+    bge $r0, N, .L999                /* N <= 0 */
+    bge $r0, INCX, .L999             /* INCX <= 0 */
+    LD a0, X, 0 * SIZE
+    LD a1, X, 1 * SIZE
+    FABS a0, a0
+    FABS a1, a1
+    ADD s1, a1, a0                   /* seed: first element's sum */
+#ifdef DOUBLE
+    xvreplve0.d VM0, VM0             /* copy lane 0 (= s1) to all lanes */
+#else
+    xvreplve0.w VM0, VM0             /* copy lane 0 (= s1) to all lanes */
+#endif
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, ZBASE_SHIFT   /* TEMP = 1 << ZBASE_SHIFT (unit stride after scaling) */
+    slli.d INCX, INCX, ZBASE_SHIFT   /* scale INCX the same way */
+    srai.d I, N, 3                   /* I = number of 8-element groups */
+    bne INCX, TEMP, .L20             /* non-unit stride */
+    bge $r0, I, .L23                 /* fewer than 8 elements */
+    .align 3
+
+.L10:
+    /* Unit-stride main loop.  One pass covers 16 * SIZE bytes (8 elements
+       of two values each) and keeps per-lane minima of |v0| + |v1| in VM0.
+       Absolute values are formed as max(x, 0 - x). */
+    xvld VX0, X, 0
+    xvld VX1, X, 32
+#ifdef DOUBLE
+    xvpickev.d x1, VX1, VX0    /* even-indexed values */
+    xvpickod.d x2, VX1, VX0    /* odd-indexed values */
+#else
+    xvpickev.w x1, VX1, VX0    /* even-indexed values */
+    xvpickod.w x2, VX1, VX0    /* odd-indexed values */
+#endif
+    XVFSUB x3, res0, x1        /* 0 - x1 */
+    XVFSUB x4, res0, x2        /* 0 - x2 */
+    XVFMAX x1, x1, x3          /* |x1| */
+    XVFMAX x2, x2, x4          /* |x2| */
+    XVFADD VM1, x1, x2         /* per-element sums */
+    XVFMIN VM0, VM0, VM1       /* fold into the running minimum */
+#ifdef DOUBLE
+    /* Doubles need two more 32-byte loads to cover the same 8 elements. */
+    xvld VX0, X, 64
+    xvld VX1, X, 96
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+    XVFSUB x3, res0, x1
+    XVFSUB x4, res0, x2
+    XVFMAX x1, x1, x3
+    XVFMAX x2, x2, x4
+    XVFADD VM1, x1, x2
+    XVFMIN VM0, VM0, VM1
+#endif
+    addi.d I, I, -1
+    addi.d X, X, 16 * SIZE
+    blt $r0, I, .L10
+    .align 3
+
+.L11:
+    /* Horizontal reduction of VM0 into its lane 0 (read later as s1).
+       VM0 is a 256-bit register, i.e. 4 doubles or 8 floats, and every
+       lane carries a partial minimum from .L10, so all of them must be
+       folded; the earlier code only looked at lanes 0-1 (double) or 0-3
+       (float) and ignored the upper half.  Each xvpickve extracts one
+       lane into lane 0 of its destination. */
+#ifdef DOUBLE
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    XVFMIN x1, x1, x2
+    XVFMIN x3, x3, x4
+    XVFMIN VM0, x1, x3
+#else
+    xvpickve.w x1, VM0, 0
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+    XVFMIN x1, x1, x2
+    XVFMIN x3, x3, x4
+    XVFMIN VM1, x1, x3         /* minimum of lanes 0-3 */
+    xvpickve.w x1, VM0, 4
+    xvpickve.w x2, VM0, 5
+    xvpickve.w x3, VM0, 6
+    xvpickve.w x4, VM0, 7
+    XVFMIN x1, x1, x2
+    XVFMIN x3, x3, x4
+    XVFMIN x1, x1, x3          /* minimum of lanes 4-7 */
+    XVFMIN VM0, VM1, x1
+#endif
+    b .L23                     /* continue with the N % 8 leftovers */
+    .align 3
+
+.L20: // INCX!=1
+    /* Strided path: X advances by INCX (scaled in the prologue) after each
+       element; I counts groups of 8 elements. */
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+    /* One pass consumes 8 elements, two per group.  Every group computes
+       |v0| + |v1| for both elements and folds the smaller one into the
+       running minimum s1.  (Previously each group overwrote s1/s3/s4 and
+       the results were then merged with s2, which was never written, so
+       all but the last few elements were dropped.) */
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
+    add.d X, X, INCX
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN t1, t1, t3
+    FMIN s1, s1, t1
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
+    add.d X, X, INCX
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN t1, t1, t3
+    FMIN s1, s1, t1
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
+    add.d X, X, INCX
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    addi.d I, I, -1
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN t1, t1, t3
+    FMIN s1, s1, t1
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
+    add.d X, X, INCX
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN t1, t1, t3
+    FMIN s1, s1, t1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    /* Nothing left to merge: s1 already holds the minimum seen so far. */
+    .align 3
+
+.L23: //N<8
+    /* Scalar tail: handle the N % 8 elements the 8-wide loops left over. */
+    andi I, N, 7
+    bge $r0, I, .L999          /* nothing left */
+    .align 3
+
+.L24:
+    LD a0, X, 0 * SIZE         /* first value of the element */
+    LD a1, X, 1 * SIZE         /* second value of the element */
+    addi.d I, I, -1
+    FABS a0, a0
+    FABS a1, a1
+    ADD a0, a0, a1             /* |v0| + |v1| */
+    add.d  X, X, INCX          /* step to the next element */
+    FMIN s1, a0, s1            /* fold into the running minimum */
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    /* Return s1 ($f22) in $f0 and jump back through $r1. */
+    MOV $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/camin_lsx.S b/kernel/loongarch64/camin_lsx.S
new file mode 100644
index 000000000..ff666ea8f
--- /dev/null
+++ b/kernel/loongarch64/camin_lsx.S
@@ -0,0 +1,248 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+/* Register aliases.  N, X and INCX are read as inputs from $r4/$r5/$r6;
+   I and TEMP are integer scratch; t1-t4, a0 and a1 are scalar FP
+   temporaries.  s1 ($f22) has the same register number as VM0 ($vr22),
+   so the vector loop's result and the scalar running minimum share one
+   register.  VT0, VT1 and neg1 are not used in this file. */
+#define N    $r4
+#define X    $r5
+#define INCX $r6
+#define I    $r12
+#define t1   $f14
+#define t2   $f18
+#define t3   $f15
+#define t4   $f17
+#define s1   $f22
+#define s2   $f9
+#define s3   $f10
+#define s4   $f11
+#define TEMP $r16
+#define a0   $f20
+#define a1   $f21
+#define x1   $vr9
+#define x2   $vr10
+#define x3   $vr11
+#define x4   $vr12
+#define VT0  $vr13
+#define VT1  $vr14
+#define res0 $vr18
+#define neg1 $vr19
+#define VX0  $vr20
+#define VX1  $vr21
+#define VM0  $vr22
+#define VM1  $vr23
+
+    /* Entry: return 0 for N <= 0 or INCX <= 0; otherwise seed the minimum
+       with the first element's |v0| + |v1|, broadcast it to every lane of
+       VM0, and pick the unit-stride vector loop, the strided scalar loop
+       or the scalar tail. */
+    PROLOGUE
+    MTC s1, $r0                      /* s1 = 0, the early-exit result */
+    vxor.v res0, res0, res0          /* constant zero, used for 0 - x */
+    bge $r0, N, .L999                /* N <= 0 */
+    bge $r0, INCX, .L999             /* INCX <= 0 */
+    LD a0, X, 0 * SIZE
+    LD a1, X, 1 * SIZE
+    FABS a0, a0
+    FABS a1, a1
+    ADD s1, a1, a0                   /* seed: first element's sum */
+#ifdef DOUBLE
+    vreplvei.d VM0, VM0, 0           /* copy lane 0 (= s1) to all lanes */
+#else
+    vreplvei.w VM0, VM0, 0           /* copy lane 0 (= s1) to all lanes */
+#endif
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, ZBASE_SHIFT   /* TEMP = 1 << ZBASE_SHIFT (unit stride after scaling) */
+    slli.d INCX, INCX, ZBASE_SHIFT   /* scale INCX the same way */
+    srai.d I, N, 3                   /* I = number of 8-element groups */
+    bne INCX, TEMP, .L20             /* non-unit stride */
+    bge $r0, I, .L23                 /* fewer than 8 elements */
+    .align 3
+
+.L10:
+    /* Unit-stride main loop.  One pass covers 16 * SIZE bytes (8 elements
+       of two values each) and keeps per-lane minima of |v0| + |v1| in VM0.
+       Absolute values are formed as max(x, 0 - x). */
+    vld VX0, X, 0
+    vld VX1, X, 16
+#ifdef DOUBLE
+    vpickev.d x1, VX1, VX0     /* even-indexed values */
+    vpickod.d x2, VX1, VX0     /* odd-indexed values */
+#else
+    vpickev.w x1, VX1, VX0     /* even-indexed values */
+    vpickod.w x2, VX1, VX0     /* odd-indexed values */
+#endif
+    VFSUB x3, res0, x1         /* 0 - x1 */
+    VFSUB x4, res0, x2         /* 0 - x2 */
+    VFMAX x1, x1, x3           /* |x1| */
+    VFMAX x2, x2, x4           /* |x2| */
+    VFADD VM1, x1, x2          /* per-element sums */
+
+    vld VX0, X, 32
+    vld VX1, X, 48
+#ifdef DOUBLE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+#else
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+#endif
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD x1, x1, x2
+    VFMIN VM1, x1, VM1         /* merge the two halves ... */
+    VFMIN VM0, VM0, VM1        /* ... into the running minimum */
+#ifdef DOUBLE
+    /* Doubles need four more 16-byte loads to cover the same 8 elements. */
+    vld VX0, X, 64
+    vld VX1, X, 80
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD VM1, x1, x2
+
+    vld VX0, X, 96
+    vld VX1, X, 112
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD x1, x1, x2
+    VFMIN VM1, x1, VM1
+    VFMIN VM0, VM0, VM1
+#endif
+    addi.d I, I, -1
+    addi.d X, X, 16 * SIZE
+    blt $r0, I, .L10
+    .align 3
+
+.L11:
+    /* Horizontal reduction: broadcast every lane of VM0 and take the
+       minimum, so lane 0 (read later as s1) holds the overall minimum. */
+#ifdef DOUBLE
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    VFMIN VM0, x1, x2
+#else
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    VFMIN VM1, x1, x2
+    VFMIN VM0, x3, x4
+    VFMIN VM0, VM0, VM1
+#endif
+    b .L23                     /* continue with the N % 8 leftovers */
+    .align 3
+
+.L20: // INCX!=1
+    /* Strided path: X advances by INCX (scaled in the prologue) after each
+       element; I counts groups of 8 elements. */
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+    /* One pass consumes 8 elements, two per group.  Every group computes
+       |v0| + |v1| for both elements and folds the smaller one into the
+       running minimum s1.  (Previously each group overwrote s1/s3/s4 and
+       the results were then merged with s2, which was never written, so
+       all but the last few elements were dropped.) */
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
+    add.d X, X, INCX
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN t1, t1, t3
+    FMIN s1, s1, t1
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
+    add.d X, X, INCX
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN t1, t1, t3
+    FMIN s1, s1, t1
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
+    add.d X, X, INCX
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    addi.d I, I, -1
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN t1, t1, t3
+    FMIN s1, s1, t1
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
+    add.d X, X, INCX
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN t1, t1, t3
+    FMIN s1, s1, t1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    /* Nothing left to merge: s1 already holds the minimum seen so far. */
+    .align 3
+
+.L23: //N<8
+    /* Scalar tail: handle the N % 8 elements the 8-wide loops left over. */
+    andi I, N, 7
+    bge $r0, I, .L999          /* nothing left */
+    .align 3
+
+.L24:
+    LD a0, X, 0 * SIZE         /* first value of the element */
+    LD a1, X, 1 * SIZE         /* second value of the element */
+    addi.d I, I, -1
+    FABS a0, a0
+    FABS a1, a1
+    ADD a0, a0, a1             /* |v0| + |v1| */
+    add.d  X, X, INCX          /* step to the next element */
+    FMIN s1, a0, s1            /* fold into the running minimum */
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    /* Return s1 ($f22) in $f0 and jump back through $r1. */
+    MOV $f0, $f22
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/casum_lasx.S b/kernel/loongarch64/casum_lasx.S
new file mode 100644
index 000000000..caf0ff969
--- /dev/null
+++ b/kernel/loongarch64/casum_lasx.S
@@ -0,0 +1,329 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+/* Register aliases.  N, X and INCX are read as inputs from $r4/$r5/$r6;
+   I and TEMP are integer scratch; t1-t4 are integer registers used to
+   move raw values into vector lanes; a1/a2 are scalar FP temporaries
+   (a3, a4 and res3 are not used in the code visible here).  s1 ($f16) has
+   the same register number as res1 ($xr16), so the vector sum and the
+   scalar running sum share one register. */
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r15
+#define t2     $r12
+#define t3     $r13
+#define t4     $r14
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define s1     $f16
+#define VX0    $xr12
+#define VX1    $xr13
+#define VX2    $xr14
+#define VX3    $xr15
+#define res1   $xr16
+#define res2   $xr17
+#define res3   $xr18
+#define res0   $xr19
+#define neg1   $xr20
+#define VT0    $xr21
+#define VT1    $xr22
+
+    /* Entry: clear the accumulators, return 0 for N <= 0 or INCX <= 0,
+       build a vector of -1.0, then pick the unit-stride loop, the strided
+       loop or one of the scalar tails. */
+    PROLOGUE
+    xvxor.v res1, res1, res1         /* running sum = 0 */
+    xvxor.v res2, res2, res2
+    xvxor.v res0, res0, res0         /* constant zero for the sign test */
+    bge $r0, N, .L999                /* N <= 0 */
+    bge $r0, INCX, .L999             /* INCX <= 0 */
+#ifdef DOUBLE
+    li.d t1, -1
+    xvreplgr2vr.d neg1, t1           /* integer -1 in every lane */
+    xvffint.d.l neg1, neg1           /* converted to floating-point -1.0 */
+#else
+    li.w t1, -1
+    xvreplgr2vr.w neg1, t1           /* integer -1 in every lane */
+    xvffint.s.w neg1, neg1           /* converted to floating-point -1.0 */
+#endif
+    li.d  TEMP, 1
+    slli.d  TEMP, TEMP, ZBASE_SHIFT  /* TEMP = 1 << ZBASE_SHIFT (unit stride after scaling) */
+    slli.d  INCX, INCX, ZBASE_SHIFT  /* scale INCX the same way */
+    srai.d I, N, 3                   /* I = number of 8-element groups */
+    bne INCX, TEMP, .L20             /* non-unit stride */
+    bge $r0, I, .L13                 /* fewer than 8 elements */
+    .align 3
+
+.L11:
+    /* Unit-stride main loop.  One pass covers 16 * SIZE bytes (8 elements
+       of two values each) and adds the absolute value of every loaded
+       value into the lanes of res1.  |x| is built by multiplying by -1.0
+       and selecting the negated copy wherever x < 0. */
+#ifdef DOUBLE
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvfmul.d VX2, neg1, VX0          /* -x */
+    xvfmul.d VX3, neg1, VX1
+    xvfcmp.clt.d VT0, VX0, res0      /* mask: x < 0 */
+    xvfcmp.clt.d VT1, VX1, res0
+    xvbitsel.v VX0, VX0, VX2, VT0    /* take -x where the mask is set */
+    xvbitsel.v VX1, VX1, VX3, VT1
+    xvfadd.d res2, VX0, VX1
+    xvfadd.d res1, res1, res2        /* accumulate */
+    xvld VX2, X, 8 * SIZE
+    xvld VX3, X, 12 * SIZE
+    xvfmul.d VX0, neg1, VX2
+    xvfmul.d VX1, neg1, VX3
+    xvfcmp.clt.d VT0, VX2, res0
+    xvfcmp.clt.d VT1, VX3, res0
+    xvbitsel.v VX2, VX2, VX0, VT0
+    xvbitsel.v VX3, VX3, VX1, VT1
+    xvfadd.d res2, VX2, VX3
+    xvfadd.d res1, res1, res2
+#else
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 8 * SIZE
+    xvfmul.s VX2, neg1, VX0          /* -x */
+    xvfmul.s VX3, neg1, VX1
+    xvfcmp.clt.s VT0, VX0, res0      /* mask: x < 0 */
+    xvfcmp.clt.s VT1, VX1, res0
+    xvbitsel.v VX0, VX0, VX2, VT0    /* take -x where the mask is set */
+    xvbitsel.v VX1, VX1, VX3, VT1
+    xvfadd.s res2, VX0, VX1
+    xvfadd.s res1, res2, res1        /* accumulate */
+#endif
+    addi.d X, X, 16 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L11
+    .align 3
+
+.L12:
+    /* Horizontal reduction: add every lane of res1 into lane 0, which is
+       read afterwards as the scalar s1.  Each xvpickve extracts one lane
+       into lane 0 of its destination.  All lanes are extracted before
+       res1 is modified.
+       The float path used to add res2 (the last loop partial, already
+       contained in res1) a second time for lanes 4-7, and added lane 6
+       twice while skipping lane 7. */
+#ifdef DOUBLE
+    xvpickve.d VX1, res1, 1
+    xvpickve.d VX2, res1, 2
+    xvpickve.d VX3, res1, 3
+    xvfadd.d res1, VX1, res1
+    xvfadd.d res1, VX2, res1
+    xvfadd.d res1, VX3, res1
+#else
+    xvpickve.w VX1, res1, 1
+    xvpickve.w VX2, res1, 2
+    xvpickve.w VX3, res1, 3
+    xvpickve.w VX0, res1, 4
+    xvpickve.w VT0, res1, 5
+    xvpickve.w VT1, res1, 6
+    xvpickve.w res2, res1, 7
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+    xvfadd.s res1, VX0, res1
+    xvfadd.s res1, VT0, res1
+    xvfadd.s res1, VT1, res1
+    xvfadd.s res1, res2, res1
+#endif
+    .align 3
+
+.L13:
+    /* Unit-stride scalar tail: the N % 8 elements left after .L11. */
+    andi I, N, 7
+    bge $r0, I, .L999          /* nothing left */
+    .align 3
+
+.L14:
+    LD   a1, X, 0 * SIZE       /* first value of the element */
+    LD   a2, X, 1 * SIZE       /* second value of the element */
+    FABS a1, a1
+    FABS a2, a2
+    addi.d I, I, -1
+    ADD  a1, a1, a2            /* |v0| + |v1| */
+    ADD  s1, a1, s1            /* add to the running sum */
+    addi.d  X, X, 2 * SIZE     /* next element (unit stride) */
+    blt $r0, I, .L14
+    b .L999
+    .align 3
+
+.L20:
+    /* Strided path: X advances by INCX (scaled in the prologue) after each
+       element; I counts groups of 8 elements. */
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+    /* One pass gathers 8 elements (two values each) through integer
+       registers into vector lanes, then takes absolute values and
+       accumulates into res1 exactly as the unit-stride loop does
+       (multiply by -1.0, select the negated copy where x < 0, add). */
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0          /* elements 0-1 -> VX0 */
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0          /* elements 2-3 -> VX1 */
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvfmul.d VX2, neg1, VX0          /* -x */
+    xvfmul.d VX3, neg1, VX1
+    xvfcmp.clt.d VT0, VX0, res0      /* mask: x < 0 */
+    xvfcmp.clt.d VT1, VX1, res0
+    xvbitsel.v VX0, VX0, VX2, VT0    /* take -x where the mask is set */
+    xvbitsel.v VX1, VX1, VX3, VT1
+    xvfadd.d res2, VX0, VX1
+    xvfadd.d res1, res1, res2        /* accumulate */
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0          /* elements 4-5 -> VX0 */
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0          /* elements 6-7 -> VX1 */
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvfmul.d VX2, neg1, VX0
+    xvfmul.d VX3, neg1, VX1
+    xvfcmp.clt.d VT0, VX0, res0
+    xvfcmp.clt.d VT1, VX1, res0
+    xvbitsel.v VX0, VX0, VX2, VT0
+    xvbitsel.v VX1, VX1, VX3, VT1
+    xvfadd.d res2, VX0, VX1
+    xvfadd.d res1, res1, res2
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0          /* elements 0-1 -> VX0 lanes 0-3 */
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4          /* elements 2-3 -> VX0 lanes 4-7 */
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX1, t1, 0          /* elements 4-5 -> VX1 lanes 0-3 */
+    xvinsgr2vr.w VX1, t2, 1
+    xvinsgr2vr.w VX1, t3, 2
+    xvinsgr2vr.w VX1, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX1, t1, 4          /* elements 6-7 -> VX1 lanes 4-7 */
+    xvinsgr2vr.w VX1, t2, 5
+    xvinsgr2vr.w VX1, t3, 6
+    xvinsgr2vr.w VX1, t4, 7
+    xvfmul.s VX2, neg1, VX0          /* -x */
+    xvfmul.s VX3, neg1, VX1
+    xvfcmp.clt.s VT0, VX0, res0      /* mask: x < 0 */
+    xvfcmp.clt.s VT1, VX1, res0
+    xvbitsel.v VX0, VX0, VX2, VT0    /* take -x where the mask is set */
+    xvbitsel.v VX1, VX1, VX3, VT1
+    xvfadd.s res2, VX0, VX1
+    xvfadd.s res1, res2, res1        /* accumulate */
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    /* Horizontal reduction: add every lane of res1 into lane 0, which is
+       read afterwards as the scalar s1.  Each xvpickve extracts one lane
+       into lane 0 of its destination.  All lanes are extracted before
+       res1 is modified.
+       The float path used to add res2 (the last loop partial, already
+       contained in res1) a second time for lanes 4-7, and added lane 6
+       twice while skipping lane 7. */
+#ifdef DOUBLE
+    xvpickve.d VX1, res1, 1
+    xvpickve.d VX2, res1, 2
+    xvpickve.d VX3, res1, 3
+    xvfadd.d res1, VX1, res1
+    xvfadd.d res1, VX2, res1
+    xvfadd.d res1, VX3, res1
+#else
+    xvpickve.w VX1, res1, 1
+    xvpickve.w VX2, res1, 2
+    xvpickve.w VX3, res1, 3
+    xvpickve.w VX0, res1, 4
+    xvpickve.w VT0, res1, 5
+    xvpickve.w VT1, res1, 6
+    xvpickve.w res2, res1, 7
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+    xvfadd.s res1, VX0, res1
+    xvfadd.s res1, VT0, res1
+    xvfadd.s res1, VT1, res1
+    xvfadd.s res1, res2, res1
+#endif
+    .align 3
+
+.L23:
+    /* Strided scalar tail: the N % 8 elements left after .L21. */
+    andi I, N, 7
+    bge $r0, I, .L999          /* nothing left */
+    .align 3
+
+.L24:
+    LD   a1, X, 0 * SIZE       /* first value of the element */
+    LD   a2, X, 1 * SIZE       /* second value of the element */
+    FABS a1, a1
+    FABS a2, a2
+    addi.d I, I, -1
+    ADD  a1, a1, a2            /* |v0| + |v1| */
+    ADD  s1, a1, s1            /* add to the running sum */
+    add.d  X, X, INCX          /* step to the next element */
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    /* Return s1 ($f16) in $f0 and jump back through $r1. */
+    MOV  $f0, $f16
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/casum_lsx.S b/kernel/loongarch64/casum_lsx.S
new file mode 100644
index 000000000..4822f2080
--- /dev/null
+++ b/kernel/loongarch64/casum_lsx.S
@@ -0,0 +1,358 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r15
+#define t2     $r12
+#define t3     $r13
+#define t4     $r14
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define s1     $f16
+#define VX0    $vr12
+#define VX1    $vr13
+#define VX2    $vr14
+#define VX3    $vr15
+#define res1   $vr16
+#define res2   $vr17
+#define res3   $vr18
+#define res0   $vr19
+#define neg1   $vr20
+#define VT0    $vr21
+#define VT1    $vr22
+
+    PROLOGUE
+    vxor.v res1, res1, res1
+    vxor.v res2, res2, res2
+    vxor.v res0, res0, res0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+#ifdef DOUBLE
+    li.d t1, -1
+    vreplgr2vr.d neg1, t1
+    vffint.d.l neg1, neg1
+#else
+    li.w t1, -1
+    vreplgr2vr.w neg1, t1
+    vffint.s.w neg1, neg1
+#endif
+    li.d  TEMP, 1
+    slli.d  TEMP, TEMP, ZBASE_SHIFT
+    slli.d  INCX, INCX, ZBASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bge $r0, I, .L13
+    .align 3
+
+.L11:
+#ifdef DOUBLE
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vfmul.d VX2, neg1, VX0
+    vfmul.d VX3, neg1, VX1
+    vfcmp.clt.d VT0, VX0, res0
+    vfcmp.clt.d VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    vld VX2, X, 4 * SIZE
+    vld VX3, X, 6 * SIZE
+    vfmul.d VX0, neg1, VX2
+    vfmul.d VX1, neg1, VX3
+    vfcmp.clt.d VT0, VX2, res0
+    vfcmp.clt.d VT1, VX3, res0
+    vbitsel.v VX2, VX2, VX0, VT0
+    vbitsel.v VX3, VX3, VX1, VT1
+    vfadd.d res2, VX2, VX3
+    vfadd.d res1, res1, res2
+    vld VX0, X, 8 * SIZE
+    vld VX1, X, 10 * SIZE
+    vfmul.d VX2, neg1, VX0
+    vfmul.d VX3, neg1, VX1
+    vfcmp.clt.d VT0, VX0, res0
+    vfcmp.clt.d VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    vld VX2, X, 12 * SIZE
+    vld VX3, X, 14 * SIZE
+    vfmul.d VX0, neg1, VX2
+    vfmul.d VX1, neg1, VX3
+    vfcmp.clt.d VT0, VX2, res0
+    vfcmp.clt.d VT1, VX3, res0
+    vbitsel.v VX2, VX2, VX0, VT0
+    vbitsel.v VX3, VX3, VX1, VT1
+    vfadd.d res2, VX2, VX3
+    vfadd.d res1, res1, res2
+    addi.d  I, I, -1
+#else
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vfmul.s VX2, neg1, VX0
+    vfmul.s VX3, neg1, VX1
+    vfcmp.clt.s VT0, VX0, res0
+    vfcmp.clt.s VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.s res2, VX0, VX1
+    vld VX0, X, 8 * SIZE
+    vld VX1, X, 12 * SIZE
+    addi.d  I, I, -1
+    vfmul.s VX2, neg1, VX0
+    vfmul.s VX3, neg1, VX1
+    vfcmp.clt.s VT0, VX0, res0
+    vfcmp.clt.s VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.s res3, VX1, VX0
+    vfadd.s res2, res3, res2
+    vfadd.s res1, res1, res2
+#endif
+    addi.d X, X, 16 * SIZE
+    blt $r0, I, .L11
+    .align 3
+
+.L12:
+#ifdef DOUBLE
+    vreplvei.d VX1, res1, 1
+    vfadd.d res1, VX1, res1
+#else
+    vreplvei.w VX1, res1, 1
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1
+    vfadd.s res1, VX2, res1
+    vfadd.s res1, VX3, res1
+#endif
+    .align 3
+
+.L13:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L14:
+    LD   a1, X, 0 * SIZE
+    LD   a2, X, 1 * SIZE
+    FABS a1, a1
+    FABS a2, a2
+    addi.d I, I, -1
+    ADD  a1, a1, a2
+    ADD  s1, a1, s1
+    addi.d  X, X, 2 * SIZE
+    blt $r0, I, .L14
+    b .L999
+    .align 3
+
+.L20:
+    bge $r0, I, .L23
+    .align 3
+
+.L21:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    add.d X, X, INCX
+    vfmul.d VX2, neg1, VX0
+    vfmul.d VX3, neg1, VX1
+    vfcmp.clt.d VT0, VX0, res0
+    vfcmp.clt.d VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t3, 0
+    vinsgr2vr.d VX0, t4, 1
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vfmul.d VX2, neg1, VX0
+    vfmul.d VX3, neg1, VX1
+    vfcmp.clt.d VT0, VX0, res0
+    vfcmp.clt.d VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    add.d X, X, INCX
+    vfmul.d VX2, neg1, VX0
+    vfmul.d VX3, neg1, VX1
+    vfcmp.clt.d VT0, VX0, res0
+    vfcmp.clt.d VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t3, 0
+    vinsgr2vr.d VX0, t4, 1
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vfmul.d VX2, neg1, VX0
+    vfmul.d VX3, neg1, VX1
+    vfcmp.clt.d VT0, VX0, res0
+    vfcmp.clt.d VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vfmul.s VX2, neg1, VX0
+    vfmul.s VX3, neg1, VX1
+    vfcmp.clt.s VT0, VX0, res0
+    vfcmp.clt.s VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.s res2, VX0, VX1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    vfmul.s VX0, neg1, VX2
+    vfmul.s VX1, neg1, VX3
+    vfcmp.clt.s VT0, VX2, res0
+    vfcmp.clt.s VT1, VX3, res0
+    vbitsel.v VX2, VX2, VX0, VT0
+    vbitsel.v VX3, VX3, VX1, VT1
+    vfadd.s res3, VX2, VX3
+    vfadd.s res2, res3, res2
+    vfadd.s res1, res1, res2
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+#ifdef DOUBLE
+    vreplvei.d VX1, res1, 1
+    vfadd.d res1, VX1, res1
+#else
+    vreplvei.w VX1, res1, 1
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1
+    vfadd.s res1, VX2, res1
+    vfadd.s res1, VX3, res1
+#endif
+    .align 3
+
+.L23:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    LD   a1, X, 0 * SIZE
+    LD   a2, X, 1 * SIZE
+    FABS a1, a1
+    FABS a2, a2
+    addi.d I, I, -1
+    ADD  a1, a1, a2
+    ADD  s1, a1, s1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    MOV  $f0, $f16
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/caxpby_lasx.S b/kernel/loongarch64/caxpby_lasx.S
new file mode 100644
index 000000000..c5802092e
--- /dev/null
+++ b/kernel/loongarch64/caxpby_lasx.S
@@ -0,0 +1,1046 @@
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4
+#define ALPHAR $f0
+#define ALPHAI $f1
+#define X      $r5
+#define INCX   $r6
+#define BETAR  $f2
+#define BETAI  $f3
+#define Y      $r7
+#define INCY   $r8
+
+#define I      $r12
+#define TEMP   $r13
+#define t1     $r14
+#define t2     $r16
+#define t3     $r15
+#define t4     $r17
+#define XX     $r18
+#define YY     $r19
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define s1     $f16
+#define s2     $f17
+#define s3     $f18
+#define s4     $f19
+#define VX0    $xr8
+#define VX1    $xr20
+#define VX2    $xr21
+#define VX3    $xr22
+#define VXAR   $xr23
+#define VXAI   $xr19
+#define VXBR   $xr14
+#define VXBI   $xr13
+#define VXZ    $xr12
+#define x1     $xr18
+#define x2     $xr17
+#define x3     $xr16
+#define x4     $xr15
+
+    PROLOGUE
+
+    bge $r0, N, .L999 // nothing to do when N <= 0
+    movgr2fr.d a1, $r0
+    FFINT a1, a1 // a1 = 0.0 (FFINT presumably converts the zero integer bits); reference value for the zero tests below
+    slli.d  INCX, INCX, ZBASE_SHIFT // scale both strides by ZBASE_SHIFT (presumably elements -> bytes)
+    slli.d  INCY, INCY, ZBASE_SHIFT
+    MTG t1, ALPHAR // copy the four scalar coefficients into GPRs so they can be broadcast
+    MTG t2, ALPHAI
+    MTG t3, BETAR
+    MTG t4, BETAI
+#ifdef DOUBLE
+    xvreplgr2vr.d VXAR, t1 // replicate each coefficient into every 64-bit lane
+    xvreplgr2vr.d VXAI, t2
+    xvreplgr2vr.d VXBR, t3
+    xvreplgr2vr.d VXBI, t4
+#else
+    xvreplgr2vr.w VXAR, t1 // replicate each coefficient into every 32-bit lane
+    xvreplgr2vr.w VXAI, t2
+    xvreplgr2vr.w VXBR, t3
+    xvreplgr2vr.w VXBI, t4
+#endif
+    xvxor.v VXZ, VXZ, VXZ // VXZ = all-zero vector
+    // Scalar fallback when (INCX & INCY) == 0: covers incx == 0 or incy == 0, and also any stride pair with no common set bit
+    and TEMP, INCX, INCY
+    or  I,    N,    N // I = N, so the scalar loop at .L998 handles every element
+    beqz TEMP, .L998
+
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, ZBASE_SHIFT // TEMP = scaled value of a unit stride, compared with INCX/INCY below
+#ifdef DOUBLE
+    srai.d I, N, 2 // vector loops consume 4 complex elements per iteration
+#else
+    srai.d I, N, 3 // vector loops consume 8 complex elements per iteration
+#endif
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11: // unit stride in X and Y: pick a specialised loop from the zero-ness of alpha and beta
+    bge $r0, I, .L997 // no full vector iteration: go straight to the leftover handling
+    CMPEQ $fcc0, BETAR, a1 // a1 holds 0.0; each flag records whether one coefficient part equals zero
+    CMPEQ $fcc1, BETAI, a1
+    CMPEQ $fcc2, ALPHAR, a1
+    CMPEQ $fcc3, ALPHAI, a1
+    bceqz $fcc0, .L13 // beta_r != 0.0
+    bceqz $fcc1, .L13 // beta_i != 0.0
+    b .L14 // beta == 0
+    .align 3
+
+.L13: // beta != 0: split on alpha
+    bceqz $fcc2, .L114
+    bceqz $fcc3, .L114 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
+    b .L113 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
+
+.L14: // beta == 0: split on alpha
+    bceqz $fcc2, .L112
+    bceqz $fcc3, .L112 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
+    b .L111 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
+    .align 3
+
+.L111:  //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
+    xvst VXZ, Y, 0 * SIZE // both coefficients are zero: overwrite Y with zeros, X is never read
+#ifdef DOUBLE
+    xvst VXZ, Y, 4 * SIZE // second 32-byte vector of this iteration
+    addi.d Y, Y, 8 * SIZE // 4 complex doubles written
+#else
+    xvst VXZ, Y, 8 * SIZE // second 32-byte vector of this iteration
+    addi.d Y, Y, 16 * SIZE // 8 complex floats written
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L111
+    b .L997 // NOTE(review): X is not advanced in this loop although the tail at .L998 still loads from X - confirm that is acceptable
+    .align 3
+
+.L112:  //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
+#ifdef DOUBLE
+    xvld VX0, X, 0 * SIZE // two contiguous vectors of interleaved (re, im) pairs from X
+    xvld VX1, X, 4 * SIZE
+    xvpickev.d x1, VX1, VX0 // even lanes: real parts
+    xvpickod.d x2, VX1, VX0 // odd lanes: imaginary parts
+#else
+    xvld VX0, X, 0 * SIZE // two contiguous vectors of interleaved (re, im) pairs from X
+    xvld VX1, X, 8 * SIZE
+    xvpickev.w x1, VX1, VX0 // even lanes: real parts
+    xvpickod.w x2, VX1, VX0 // odd lanes: imaginary parts
+#endif
+    XVFMUL x3, VXAI, x2 // x3 = alpha_i * x_im
+    XVFMUL x4, VXAI, x1 // x4 = alpha_i * x_re
+    XVMSUB x3, VXAR, x1, x3 // presumably x3 = alpha_r * x_re - x3 (real part of alpha * x)
+    XVFMADD x4, VXAR, x2, x4 // presumably x4 = alpha_r * x_im + x4 (imaginary part of alpha * x)
+#ifdef DOUBLE
+    xvilvl.d VX2, x4 ,x3 // re-interleave the results into (re, im) pairs for the contiguous store
+    xvilvh.d VX3, x4, x3
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#else
+    xvilvl.w VX2, x4 ,x3 // re-interleave the results into (re, im) pairs for the contiguous store
+    xvilvh.w VX3, x4, x3
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 8 * SIZE
+    addi.d X, X, 16 * SIZE
+    addi.d Y, Y, 16 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L112
+    b .L997 // leftover elements are handled by the scalar tail
+    .align 3
+
+.L113: //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
+#ifdef DOUBLE
+    xvld VX0, Y, 0 * SIZE // alpha == 0: only Y is read, result is beta * y
+    xvld VX1, Y, 4 * SIZE
+    xvpickev.d x1, VX1, VX0 // even lanes: real parts of y
+    xvpickod.d x2, VX1, VX0 // odd lanes: imaginary parts of y
+#else
+    xvld VX0, Y, 0 * SIZE // alpha == 0: only Y is read, result is beta * y
+    xvld VX1, Y, 8 * SIZE
+    xvpickev.w x1, VX1, VX0 // even lanes: real parts of y
+    xvpickod.w x2, VX1, VX0 // odd lanes: imaginary parts of y
+#endif
+    XVFMUL x3, VXBI, x2 // x3 = beta_i * y_im
+    XVFMUL x4, VXBI, x1 // x4 = beta_i * y_re
+    XVMSUB x3, VXBR, x1, x3 // presumably x3 = beta_r * y_re - x3 (real part)
+    XVFMADD x4, VXBR, x2, x4 // presumably x4 = beta_r * y_im + x4 (imaginary part)
+#ifdef DOUBLE
+    xvilvl.d VX2, x4 ,x3 // re-interleave into (re, im) pairs
+    xvilvh.d VX3, x4, x3
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE // advance Y itself; writing X here left Y fixed and rescaled the same block every iteration
+#else
+    xvilvl.w VX2, x4 ,x3 // re-interleave into (re, im) pairs
+    xvilvh.w VX3, x4, x3
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 8 * SIZE
+    addi.d Y, Y, 16 * SIZE // advance Y itself; writing X here left Y fixed and rescaled the same block every iteration
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L113
+    b .L997 // NOTE(review): X is not advanced in this loop (same as .L111) although the tail still loads from X
+    .align 3
+
+.L114: // unit strides, alpha != 0 and beta != 0: y = alpha * x + beta * y
+#ifdef DOUBLE
+    xvld VX0, X, 0 * SIZE // two vectors of interleaved (re, im) pairs from X and from Y
+    xvld VX1, X, 4 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    xvpickev.d x1, VX1, VX0 // x1 = x real parts
+    xvpickod.d x2, VX1, VX0 // x2 = x imaginary parts
+    xvpickev.d x3, VX3, VX2 // x3 = y real parts
+    xvpickod.d x4, VX3, VX2 // x4 = y imaginary parts
+#else
+    xvld VX0, X, 0 * SIZE // two vectors of interleaved (re, im) pairs from X and from Y
+    xvld VX1, X, 8 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    xvld VX3, Y, 8 * SIZE
+    xvpickev.w x1, VX1, VX0 // x1 = x real parts
+    xvpickod.w x2, VX1, VX0 // x2 = x imaginary parts
+    xvpickev.w x3, VX3, VX2 // x3 = y real parts
+    xvpickod.w x4, VX3, VX2 // x4 = y imaginary parts
+#endif
+    XVFMUL VX0, VXAI, x2 // alpha_i * x_im
+    XVFMUL VX1, VXAI, x1 // alpha_i * x_re
+    XVFMUL VX2, VXBI, x4 // beta_i * y_im
+    XVFMUL VX3, VXBI, x3 // beta_i * y_re
+    XVMSUB VX0, VXAR, x1, VX0 // presumably alpha_r * x_re - VX0: real part of alpha * x
+    XVFMADD VX1, VXAR, x2, VX1 // presumably alpha_r * x_im + VX1: imaginary part of alpha * x
+    XVMSUB VX2, VXBR, x3, VX2 // real part of beta * y
+    XVFMADD VX3, VXBR, x4, VX3 // imaginary part of beta * y
+    XVFADD x3, VX0, VX2 // summed real parts
+    XVFADD x4, VX1, VX3 // summed imaginary parts
+#ifdef DOUBLE
+    xvilvl.d VX2, x4 ,x3 // re-interleave into (re, im) pairs for the contiguous store
+    xvilvh.d VX3, x4, x3
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#else
+    xvilvl.w VX2, x4 ,x3 // re-interleave into (re, im) pairs for the contiguous store
+    xvilvh.w VX3, x4, x3
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 8 * SIZE
+    addi.d X, X, 16 * SIZE
+    addi.d Y, Y, 16 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L114
+    b .L997 // leftover elements are handled by the scalar tail
+    .align 3
+
+.L12: // INCX==1 and INCY!=1
+    bge $r0, I, .L997
+    move YY, Y
+    .align 3
+
+.L121:
+#ifdef DOUBLE
+    xvld VX0, X, 0 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d x3, t1, 0
+    xvinsgr2vr.d x4, t2, 0
+    xvinsgr2vr.d x3, t3, 2
+    xvinsgr2vr.d x4, t4, 2
+
+    xvld VX1, X, 4 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    xvinsgr2vr.d x3, t1, 1
+    xvinsgr2vr.d x4, t2, 1
+    xvinsgr2vr.d x3, t3, 3
+    xvinsgr2vr.d x4, t4, 3
+    add.d Y, Y, INCY
+
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+    xvfmul.d VX0, VXAI, x2
+    xvfmul.d VX1, VXAI, x1
+    xvfmul.d VX2, VXBI, x4
+    xvfmul.d VX3, VXBI, x3
+    xvfmsub.d VX0, VXAR, x1, VX0
+    xvfmadd.d VX1, VXAR, x2, VX1
+    xvfmsub.d VX2, VXBR, x3, VX2
+    xvfmadd.d VX3, VXBR, x4, VX3
+    xvfadd.d x3, VX0, VX2
+    xvfadd.d x4, VX1, VX3
+    addi.d  I, I, -1
+    xvstelm.d x3, YY, 0 * SIZE, 0
+    xvstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 2
+    xvstelm.d x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 1
+    xvstelm.d x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 3
+    xvstelm.d x4, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L121
+    b .L997
+    .align 3
+#else
+    xvld VX0, X, 0 * SIZE // first contiguous vector of X (4 complex floats)
+    ld.w t1, Y, 0 * SIZE // 32-bit loads: an 8-byte load at 1 * SIZE is misaligned and reads past the element
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 0 // x3 collects real parts of y, x4 imaginary parts
+    xvinsgr2vr.w x4, t2, 0
+    xvinsgr2vr.w x3, t3, 1
+    xvinsgr2vr.w x4, t4, 1
+    xvld VX1, X, 8 * SIZE // second contiguous vector of X
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    xvinsgr2vr.w x3, t1, 4 // lane order 0,1,4,5,2,3,6,7 mirrors the per-128-bit layout produced by xvpickev/xvpickod on X
+    xvinsgr2vr.w x4, t2, 4
+    xvinsgr2vr.w x3, t3, 5
+    xvinsgr2vr.w x4, t4, 5
+    add.d Y, Y, INCY
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 2
+    xvinsgr2vr.w x4, t2, 2
+    xvinsgr2vr.w x3, t3, 3
+    xvinsgr2vr.w x4, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    xvinsgr2vr.w x3, t1, 6
+    xvinsgr2vr.w x4, t2, 6
+    xvinsgr2vr.w x3, t3, 7
+    xvinsgr2vr.w x4, t4, 7
+    add.d Y, Y, INCY // Y now points at the next block of 8 strided elements
+
+    xvpickev.w x1, VX1, VX0
+    xvpickod.w x2, VX1, VX0
+    XVFMUL VX0, VXAI, x2
+    XVFMUL VX1, VXAI, x1
+    XVFMUL VX2, VXBI, x4
+    XVFMUL VX3, VXBI, x3
+    XVMSUB VX0, VXAR, x1, VX0
+    XVFMADD VX1, VXAR, x2, VX1
+    XVMSUB VX2, VXBR, x3, VX2
+    XVFMADD VX3, VXBR, x4, VX3
+    XVFADD x3, VX0, VX2
+    XVFADD x4, VX1, VX3
+    addi.d  I, I, -1
+    xvstelm.w x3, YY, 0 * SIZE, 0
+    xvstelm.w x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 1
+    xvstelm.w x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 4
+    xvstelm.w x4, YY, 1 * SIZE, 4
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 5
+    xvstelm.w x4, YY, 1 * SIZE, 5
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 2
+    xvstelm.w x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 3
+    xvstelm.w x4, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 6
+    xvstelm.w x4, YY, 1 * SIZE, 6
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 7
+    xvstelm.w x4, YY, 1 * SIZE, 7
+    add.d YY, YY, INCY
+    addi.d X, X, 16 * SIZE
+    blt $r0, I, .L121
+    b .L997
+    .align 3
+#endif
+
+.L21:// INCX!=1 and INCY==1
+    bge $r0, I, .L997
+    .align 3
+
+.L211:
+#ifdef DOUBLE
+    xvld VX2, Y, 0 * SIZE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 0
+    xvinsgr2vr.d x2, t2, 0
+    xvinsgr2vr.d x1, t3, 2
+    xvinsgr2vr.d x2, t4, 2
+    xvld VX3, Y, 4 * SIZE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    xvinsgr2vr.d x1, t1, 1
+    xvinsgr2vr.d x2, t2, 1
+    xvinsgr2vr.d x1, t3, 3
+    xvinsgr2vr.d x2, t4, 3
+    add.d X, X, INCX
+
+    xvpickev.d x3, VX3, VX2
+    xvpickod.d x4, VX3, VX2
+    xvfmul.d VX0, VXAI, x2
+    xvfmul.d VX1, VXAI, x1
+    xvfmul.d VX2, VXBI, x4
+    xvfmul.d VX3, VXBI, x3
+    xvfmsub.d VX0, VXAR, x1, VX0
+    xvfmadd.d VX1, VXAR, x2, VX1
+    xvfmsub.d VX2, VXBR, x3, VX2
+    xvfmadd.d VX3, VXBR, x4, VX3
+    xvfadd.d x3, VX0, VX2
+    xvfadd.d x4, VX1, VX3
+    xvilvl.d VX2, x4 ,x3
+    xvilvh.d VX3, x4, x3
+    addi.d  I, I, -1
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L211
+    b .L997
+    .align 3
+#else
+    xvld VX2, Y, 0 * SIZE // first contiguous vector of Y (4 complex floats)
+    ld.w t1, X, 0 * SIZE // 32-bit loads: an 8-byte load at 1 * SIZE is misaligned and reads past the element
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 0 // x1 collects real parts of x, x2 imaginary parts
+    xvinsgr2vr.w x2, t2, 0
+    xvinsgr2vr.w x1, t3, 1
+    xvinsgr2vr.w x2, t4, 1
+    xvld VX3, Y, 8 * SIZE // second contiguous vector of Y
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 4 // lane order 0,1,4,5,2,3,6,7 mirrors the per-128-bit layout produced by xvpickev/xvpickod on Y
+    xvinsgr2vr.w x2, t2, 4
+    xvinsgr2vr.w x1, t3, 5
+    xvinsgr2vr.w x2, t4, 5
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 2
+    xvinsgr2vr.w x2, t2, 2
+    xvinsgr2vr.w x1, t3, 3
+    xvinsgr2vr.w x2, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    xvinsgr2vr.w x1, t1, 6
+    xvinsgr2vr.w x2, t2, 6
+    xvinsgr2vr.w x1, t3, 7
+    xvinsgr2vr.w x2, t4, 7
+    add.d X, X, INCX // X now points at the next block of 8 strided elements
+
+    xvpickev.w x3, VX3, VX2
+    xvpickod.w x4, VX3, VX2
+    XVFMUL VX0, VXAI, x2
+    XVFMUL VX1, VXAI, x1
+    XVFMUL VX2, VXBI, x4
+    XVFMUL VX3, VXBI, x3
+    XVMSUB VX0, VXAR, x1, VX0
+    XVFMADD VX1, VXAR, x2, VX1
+    XVMSUB VX2, VXBR, x3, VX2
+    XVFMADD VX3, VXBR, x4, VX3
+    XVFADD x3, VX0, VX2
+    XVFADD x4, VX1, VX3
+    xvilvl.w VX2, x4 ,x3
+    xvilvh.w VX3, x4, x3
+    addi.d  I, I, -1
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 8 * SIZE
+    addi.d Y, Y, 16 * SIZE
+    blt $r0, I, .L211
+    b .L997
+    .align 3
+#endif
+
+.L22: // non-unit stride in X and Y: pick a specialised loop from the zero-ness of alpha and beta
+    bge $r0, I, .L997 // no full vector iteration: go straight to the leftover handling
+    move YY, Y // YY = separate store cursor for loops that also read Y through Y
+    CMPEQ $fcc0, BETAR, a1 // a1 holds 0.0; each flag records whether one coefficient part equals zero
+    CMPEQ $fcc1, BETAI, a1
+    CMPEQ $fcc2, ALPHAR, a1
+    CMPEQ $fcc3, ALPHAI, a1
+    bceqz $fcc0, .L23 // beta_r != 0.0
+    bceqz $fcc1, .L23 // beta_i != 0.0
+    b .L24 // beta == 0
+    .align 3
+
+.L23: // beta != 0: split on alpha
+    bceqz $fcc2, .L224
+    bceqz $fcc3, .L224 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
+    b .L223 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
+    .align 3
+
+.L24: // beta == 0: split on alpha
+    bceqz $fcc2, .L222
+    bceqz $fcc3, .L222 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
+    b .L221 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
+    .align 3
+
+.L221:  //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
+#ifdef DOUBLE
+    xvstelm.d VXZ, Y, 0 * SIZE, 0 // zero the real part of the strided element
+    xvstelm.d VXZ, Y, 1 * SIZE, 0 // zero the imaginary part (offset 0 here stored the real part twice)
+    add.d Y, Y, INCY
+    xvstelm.d VXZ, Y, 0 * SIZE, 0
+    xvstelm.d VXZ, Y, 1 * SIZE, 0
+    add.d Y, Y, INCY
+    xvstelm.d VXZ, Y, 0 * SIZE, 0
+    xvstelm.d VXZ, Y, 1 * SIZE, 0
+    add.d Y, Y, INCY
+    xvstelm.d VXZ, Y, 0 * SIZE, 0
+    xvstelm.d VXZ, Y, 1 * SIZE, 0
+    add.d Y, Y, INCY
+    addi.d I, I, -1 // 4 elements zeroed per iteration
+    blt $r0, I, .L221
+    b .L997
+    .align 3
+#else
+    xvstelm.w VXZ, Y, 0 * SIZE, 0 // zero the real part of the strided element
+    xvstelm.w VXZ, Y, 1 * SIZE, 0 // zero the imaginary part (offset 0 here stored the real part twice)
+    add.d Y, Y, INCY
+    xvstelm.w VXZ, Y, 0 * SIZE, 0
+    xvstelm.w VXZ, Y, 1 * SIZE, 0
+    add.d Y, Y, INCY
+    xvstelm.w VXZ, Y, 0 * SIZE, 0
+    xvstelm.w VXZ, Y, 1 * SIZE, 0
+    add.d Y, Y, INCY
+    xvstelm.w VXZ, Y, 0 * SIZE, 0 // store through Y: YY is never advanced in this loop and still points at the first element
+    xvstelm.w VXZ, Y, 1 * SIZE, 0
+    add.d Y, Y, INCY
+    xvstelm.w VXZ, Y, 0 * SIZE, 0
+    xvstelm.w VXZ, Y, 1 * SIZE, 0
+    add.d Y, Y, INCY
+    xvstelm.w VXZ, Y, 0 * SIZE, 0
+    xvstelm.w VXZ, Y, 1 * SIZE, 0
+    add.d Y, Y, INCY
+    xvstelm.w VXZ, Y, 0 * SIZE, 0
+    xvstelm.w VXZ, Y, 1 * SIZE, 0
+    add.d Y, Y, INCY
+    xvstelm.w VXZ, Y, 0 * SIZE, 0
+    xvstelm.w VXZ, Y, 1 * SIZE, 0
+    add.d Y, Y, INCY
+    addi.d I, I, -1 // 8 elements zeroed per iteration
+    blt $r0, I, .L221
+    b .L997
+    .align 3
+#endif
+
+.L222:  //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE // gather 4 strided complex doubles from X
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 0 // x1 collects real parts, x2 imaginary parts
+    xvinsgr2vr.d x2, t2, 0
+    xvinsgr2vr.d x1, t3, 1
+    xvinsgr2vr.d x2, t4, 1
+
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    xvinsgr2vr.d x1, t1, 2
+    xvinsgr2vr.d x2, t2, 2
+    xvinsgr2vr.d x1, t3, 3
+    xvinsgr2vr.d x2, t4, 3
+    add.d X, X, INCX
+    xvfmul.d x3, VXAI, x2 // x3 = alpha_i * x_im
+    xvfmul.d x4, VXAI, x1 // x4 = alpha_i * x_re
+    xvfmsub.d x3, VXAR, x1, x3 // x3 = alpha_r * x_re - x3 (real part)
+    xvfmadd.d x4, VXAR, x2, x4 // x4 = alpha_r * x_im + x4 (imaginary part)
+    addi.d  I, I, -1
+    xvstelm.d x3, Y, 0 * SIZE, 0 // store through Y (not YY) so Y is advanced for the tail at .L997/.L998
+    xvstelm.d x4, Y, 1 * SIZE, 0
+    add.d Y, Y, INCY
+    xvstelm.d x3, Y, 0 * SIZE, 1
+    xvstelm.d x4, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    xvstelm.d x3, Y, 0 * SIZE, 2
+    xvstelm.d x4, Y, 1 * SIZE, 2
+    add.d Y, Y, INCY
+    xvstelm.d x3, Y, 0 * SIZE, 3
+    xvstelm.d x4, Y, 1 * SIZE, 3
+    add.d Y, Y, INCY
+    blt $r0, I, .L222
+    b .L997
+    .align 3
+#else
+    ld.w t1, X, 0 * SIZE // 32-bit loads: an 8-byte load at 1 * SIZE is misaligned and reads past the element
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 0 // x1 collects real parts, x2 imaginary parts
+    xvinsgr2vr.w x2, t2, 0
+    xvinsgr2vr.w x1, t3, 1
+    xvinsgr2vr.w x2, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 2
+    xvinsgr2vr.w x2, t2, 2
+    xvinsgr2vr.w x1, t3, 3
+    xvinsgr2vr.w x2, t4, 3
+
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 4
+    xvinsgr2vr.w x2, t2, 4
+    xvinsgr2vr.w x1, t3, 5
+    xvinsgr2vr.w x2, t4, 5
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    xvinsgr2vr.w x1, t1, 6
+    xvinsgr2vr.w x2, t2, 6
+    xvinsgr2vr.w x1, t3, 7
+    xvinsgr2vr.w x2, t4, 7
+    add.d X, X, INCX
+    XVFMUL x3, VXAI, x2 // x3 = alpha_i * x_im
+    XVFMUL x4, VXAI, x1 // x4 = alpha_i * x_re
+    XVMSUB x3, VXAR, x1, x3 // presumably x3 = alpha_r * x_re - x3 (real part)
+    XVFMADD x4, VXAR, x2, x4 // presumably x4 = alpha_r * x_im + x4 (imaginary part)
+    addi.d  I, I, -1
+    xvstelm.w x3, Y, 0 * SIZE, 0 // store through Y (not YY) so Y is advanced for the tail at .L997/.L998
+    xvstelm.w x4, Y, 1 * SIZE, 0
+    add.d Y, Y, INCY
+    xvstelm.w x3, Y, 0 * SIZE, 1
+    xvstelm.w x4, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    xvstelm.w x3, Y, 0 * SIZE, 2
+    xvstelm.w x4, Y, 1 * SIZE, 2
+    add.d Y, Y, INCY
+    xvstelm.w x3, Y, 0 * SIZE, 3
+    xvstelm.w x4, Y, 1 * SIZE, 3
+    add.d Y, Y, INCY
+    xvstelm.w x3, Y, 0 * SIZE, 4
+    xvstelm.w x4, Y, 1 * SIZE, 4
+    add.d Y, Y, INCY
+    xvstelm.w x3, Y, 0 * SIZE, 5
+    xvstelm.w x4, Y, 1 * SIZE, 5
+    add.d Y, Y, INCY
+    xvstelm.w x3, Y, 0 * SIZE, 6
+    xvstelm.w x4, Y, 1 * SIZE, 6
+    add.d Y, Y, INCY
+    xvstelm.w x3, Y, 0 * SIZE, 7
+    xvstelm.w x4, Y, 1 * SIZE, 7
+    add.d Y, Y, INCY
+    blt $r0, I, .L222
+    b .L997
+    .align 3
+#endif
+
+.L223:
+#ifdef DOUBLE
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d x1, t1, 0
+    xvinsgr2vr.d x2, t2, 0
+    xvinsgr2vr.d x1, t3, 1
+    xvinsgr2vr.d x2, t4, 1
+
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    xvinsgr2vr.d x1, t1, 2
+    xvinsgr2vr.d x2, t2, 2
+    xvinsgr2vr.d x1, t3, 3
+    xvinsgr2vr.d x2, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d x3, VXBI, x2
+    xvfmul.d x4, VXBI, x1
+    xvfmsub.d x3, VXBR, x1, x3
+    xvfmadd.d x4, VXBR, x2, x4
+
+    addi.d  I, I, -1
+    xvstelm.d x3, YY, 0 * SIZE, 0
+    xvstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 1
+    xvstelm.d x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 2
+    xvstelm.d x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 3
+    xvstelm.d x4, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    blt $r0, I, .L223
+    b .L997
+    .align 3
+#else
+    ld.w t1, Y, 0 * SIZE // 32-bit loads: an 8-byte load at 1 * SIZE is misaligned and reads past the element
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x1, t1, 0 // x1 collects real parts of y, x2 imaginary parts
+    xvinsgr2vr.w x2, t2, 0
+    xvinsgr2vr.w x1, t3, 1
+    xvinsgr2vr.w x2, t4, 1
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x1, t1, 2
+    xvinsgr2vr.w x2, t2, 2
+    xvinsgr2vr.w x1, t3, 3
+    xvinsgr2vr.w x2, t4, 3
+
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x1, t1, 4
+    xvinsgr2vr.w x2, t2, 4
+    xvinsgr2vr.w x1, t3, 5
+    xvinsgr2vr.w x2, t4, 5
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    xvinsgr2vr.w x1, t1, 6
+    xvinsgr2vr.w x2, t2, 6
+    xvinsgr2vr.w x1, t3, 7
+    xvinsgr2vr.w x2, t4, 7
+    add.d Y, Y, INCY // Y now points at the next block of 8 strided elements
+
+    XVFMUL x3, VXBI, x2
+    XVFMUL x4, VXBI, x1
+    XVMSUB x3, VXBR, x1, x3
+    XVFMADD x4, VXBR, x2, x4
+    addi.d  I, I, -1
+    xvstelm.w x3, YY, 0 * SIZE, 0
+    xvstelm.w x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 1
+    xvstelm.w x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 2
+    xvstelm.w x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 3
+    xvstelm.w x4, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 4
+    xvstelm.w x4, YY, 1 * SIZE, 4
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 5
+    xvstelm.w x4, YY, 1 * SIZE, 5
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 6
+    xvstelm.w x4, YY, 1 * SIZE, 6
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 7
+    xvstelm.w x4, YY, 1 * SIZE, 7
+    add.d YY, YY, INCY
+    blt $r0, I, .L223
+    b .L997
+    .align 3
+#endif
+
+.L224:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 0
+    xvinsgr2vr.d x2, t2, 0
+    xvinsgr2vr.d x1, t3, 1
+    xvinsgr2vr.d x2, t4, 1
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 2
+    xvinsgr2vr.d x2, t2, 2
+    xvinsgr2vr.d x1, t3, 3
+    xvinsgr2vr.d x2, t4, 3
+
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d x3, t1, 0
+    xvinsgr2vr.d x4, t2, 0
+    xvinsgr2vr.d x3, t3, 1
+    xvinsgr2vr.d x4, t4, 1
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    xvinsgr2vr.d x3, t1, 2
+    xvinsgr2vr.d x4, t2, 2
+    xvinsgr2vr.d x3, t3, 3
+    xvinsgr2vr.d x4, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VX0, VXAI, x2
+    xvfmul.d VX1, VXAI, x1
+    xvfmul.d VX2, VXBI, x4
+    xvfmul.d VX3, VXBI, x3
+    xvfmsub.d VX0, VXAR, x1, VX0
+    xvfmadd.d VX1, VXAR, x2, VX1
+    xvfmsub.d VX2, VXBR, x3, VX2
+    xvfmadd.d VX3, VXBR, x4, VX3
+    xvfadd.d x3, VX0, VX2
+    xvfadd.d x4, VX1, VX3
+    addi.d  I, I, -1
+
+    xvstelm.d x3, YY, 0 * SIZE, 0
+    xvstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 1
+    xvstelm.d x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 2
+    xvstelm.d x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 3
+    xvstelm.d x4, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    blt $r0, I, .L224
+    b .L997
+    .align 3
+#else
+    ld.w t1, X, 0 * SIZE // 32-bit loads: an 8-byte load at 1 * SIZE is misaligned and reads past the element
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 0 // x1 collects real parts of x, x2 imaginary parts
+    xvinsgr2vr.w x2, t2, 0
+    xvinsgr2vr.w x1, t3, 1
+    xvinsgr2vr.w x2, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 2
+    xvinsgr2vr.w x2, t2, 2
+    xvinsgr2vr.w x1, t3, 3
+    xvinsgr2vr.w x2, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 4
+    xvinsgr2vr.w x2, t2, 4
+    xvinsgr2vr.w x1, t3, 5
+    xvinsgr2vr.w x2, t4, 5
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 6
+    xvinsgr2vr.w x2, t2, 6
+    xvinsgr2vr.w x1, t3, 7
+    xvinsgr2vr.w x2, t4, 7
+
+    ld.w t1, Y, 0 * SIZE // same gather for y: x3 collects real parts, x4 imaginary parts
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 0
+    xvinsgr2vr.w x4, t2, 0
+    xvinsgr2vr.w x3, t3, 1
+    xvinsgr2vr.w x4, t4, 1
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 2
+    xvinsgr2vr.w x4, t2, 2
+    xvinsgr2vr.w x3, t3, 3
+    xvinsgr2vr.w x4, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 4
+    xvinsgr2vr.w x4, t2, 4
+    xvinsgr2vr.w x3, t3, 5
+    xvinsgr2vr.w x4, t4, 5
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    xvinsgr2vr.w x3, t1, 6
+    xvinsgr2vr.w x4, t2, 6
+    xvinsgr2vr.w x3, t3, 7
+    xvinsgr2vr.w x4, t4, 7
+    add.d Y, Y, INCY // Y now points at the next block of 8 strided elements
+
+    XVFMUL VX0, VXAI, x2
+    XVFMUL VX1, VXAI, x1
+    XVFMUL VX2, VXBI, x4
+    XVFMUL VX3, VXBI, x3
+    XVMSUB VX0, VXAR, x1, VX0
+    XVFMADD VX1, VXAR, x2, VX1
+    XVMSUB VX2, VXBR, x3, VX2
+    XVFMADD VX3, VXBR, x4, VX3
+    XVFADD x3, VX0, VX2
+    XVFADD x4, VX1, VX3
+    addi.d  I, I, -1
+
+    xvstelm.w x3, YY, 0 * SIZE, 0
+    xvstelm.w x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 1
+    xvstelm.w x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 2
+    xvstelm.w x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 3
+    xvstelm.w x4, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 4
+    xvstelm.w x4, YY, 1 * SIZE, 4
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 5
+    xvstelm.w x4, YY, 1 * SIZE, 5
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 6
+    xvstelm.w x4, YY, 1 * SIZE, 6
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 7
+    xvstelm.w x4, YY, 1 * SIZE, 7
+    add.d YY, YY, INCY
+    blt $r0, I, .L224
+    b .L997
+    .align 3
+#endif
+
+.L997: // leftover count after the vector loops
+    andi I, N, (32 / SIZE) - 1 // elements per vector iteration minus one: N & 3 for DOUBLE (SIZE 8), N & 7 for single (SIZE 4); a fixed 7 overran in DOUBLE
+    bge $r0, I, .L999 // nothing left over
+    .align 3
+
+.L998: // scalar loop: one complex element per iteration, I iterations, any stride
+    LD a1, X, 0 * SIZE // a1 / a2 = real / imaginary part of x
+    LD a2, X, 1 * SIZE
+    LD a3, Y, 0 * SIZE // a3 / a4 = real / imaginary part of y
+    LD a4, Y, 1 * SIZE
+    addi.d I, I, -1
+    MUL s1, ALPHAI, a2 // s1 = alpha_i * x_im
+    MUL s2, ALPHAI, a1 // s2 = alpha_i * x_re
+    MUL s3, BETAI, a4 // s3 = beta_i * y_im
+    MUL s4, BETAI, a3 // s4 = beta_i * y_re
+    MSUB s1, ALPHAR, a1, s1 // presumably s1 = alpha_r * x_re - s1: real part of alpha * x
+    MADD s2, a2, ALPHAR, s2 // presumably s2 = x_im * alpha_r + s2: imaginary part of alpha * x
+    MSUB s3, BETAR, a3, s3 // real part of beta * y
+    MADD s4, a4, BETAR, s4 // imaginary part of beta * y
+    ADD s3, s3, s1 // real part of the result
+    ADD s4, s4, s2 // imaginary part of the result
+    ST s3, Y, 0 * SIZE
+    ST s4, Y, 1 * SIZE
+    add.d X, X, INCX // strides were pre-scaled by ZBASE_SHIFT in the prologue
+    add.d Y, Y, INCY
+    blt $r0, I, .L998
+    .align 3
+
+.L999: // common exit
+    move $r4, $r0 // return 0; $r12 (I) is 0 on every loop exit but uninitialised on the N <= 0 early exit
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/caxpby_lsx.S b/kernel/loongarch64/caxpby_lsx.S
new file mode 100644
index 000000000..247ae428e
--- /dev/null
+++ b/kernel/loongarch64/caxpby_lsx.S
@@ -0,0 +1,1029 @@
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4
+#define ALPHAR $f0
+#define ALPHAI $f1
+#define X      $r5
+#define INCX   $r6
+#define BETAR  $f2
+#define BETAI  $f3
+#define Y      $r7
+#define INCY   $r8
+
+#define I      $r12
+#define TEMP   $r13
+#define t1     $r14
+#define t2     $r16
+#define t3     $r15
+#define t4     $r17
+#define XX     $r18
+#define YY     $r19
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define s1     $f16
+#define s2     $f17
+#define s3     $f18
+#define s4     $f19
+#define VX0    $vr8
+#define VX1    $vr20
+#define VX2    $vr21
+#define VX3    $vr22
+#define VXAR   $vr23
+#define VXAI   $vr19
+#define VXBR   $vr14
+#define VXBI   $vr13
+#define VXZ    $vr12
+#define x1     $vr18
+#define x2     $vr17
+#define x3     $vr16
+#define x4     $vr15
+
+    PROLOGUE
+    // Entry: leave at once for N <= 0, broadcast alpha/beta, then pick a loop by stride.
+    bge $r0, N, .L999
+    movgr2fr.d a1, $r0 // a1 <- all-zero bit pattern taken from $r0
+#ifdef DOUBLE
+    ffint.d.l a1, a1 // integer 0 -> 0.0; a1 is the reference for the alpha/beta zero tests
+#else
+    ffint.s.l a1, a1 // integer 0 -> 0.0f; a1 is the reference for the alpha/beta zero tests
+#endif
+    slli.d  INCX, INCX, ZBASE_SHIFT // scale the strides; they are added directly to X/Y below
+    slli.d  INCY, INCY, ZBASE_SHIFT
+#ifdef DOUBLE
+    movfr2gr.d t1, ALPHAR
+    vreplgr2vr.d VXAR, t1 // VXAR = alpha_r replicated in every 64-bit lane
+    movfr2gr.d t2, ALPHAI
+    vreplgr2vr.d VXAI, t2 // VXAI = alpha_i replicated
+    movfr2gr.d t3, BETAR
+    vreplgr2vr.d VXBR, t3 // VXBR = beta_r replicated
+    movfr2gr.d t4, BETAI
+    vreplgr2vr.d VXBI, t4 // VXBI = beta_i replicated
+#else
+    movfr2gr.s t1, ALPHAR
+    vreplgr2vr.w VXAR, t1 // VXAR = alpha_r replicated in every 32-bit lane
+    movfr2gr.s t2, ALPHAI
+    vreplgr2vr.w VXAI, t2 // VXAI = alpha_i replicated
+    movfr2gr.s t3, BETAR
+    vreplgr2vr.w VXBR, t3 // VXBR = beta_r replicated
+    movfr2gr.s t4, BETAI
+    vreplgr2vr.w VXBI, t4 // VXBI = beta_i replicated
+#endif
+    vxor.v VXZ, VXZ, VXZ // VXZ = 0, the value stored by the zero-fill loops .L111/.L221
+    // (INCX & INCY) == 0 selects the scalar loop; this covers incx == 0 or incy == 0
+    and TEMP, INCX, INCY
+    or  I,    N,    N // I = N so that .L998 walks every element
+    beqz TEMP, .L998
+    // TEMP = scaled unit stride; I = N >> 2 trips of the 4-element vector loops
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, ZBASE_SHIFT
+    srai.d I, N, 2
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11: // INCX==1 and INCY==1: pick a contiguous loop from the alpha/beta zero tests
+    bge $r0, I, .L997 // no full group of 4 elements: scalar tail only
+#ifdef DOUBLE
+    fcmp.ceq.d $fcc0, BETAR, a1 // fcc0 <- beta_r == 0.0
+    fcmp.ceq.d $fcc1, BETAI, a1 // fcc1 <- beta_i == 0.0
+    fcmp.ceq.d $fcc2, ALPHAR, a1 // fcc2 <- alpha_r == 0.0
+    fcmp.ceq.d $fcc3, ALPHAI, a1 // fcc3 <- alpha_i == 0.0
+#else
+    fcmp.ceq.s $fcc0, BETAR, a1 // fcc0 <- beta_r == 0.0
+    fcmp.ceq.s $fcc1, BETAI, a1 // fcc1 <- beta_i == 0.0
+    fcmp.ceq.s $fcc2, ALPHAR, a1 // fcc2 <- alpha_r == 0.0
+    fcmp.ceq.s $fcc3, ALPHAI, a1 // fcc3 <- alpha_i == 0.0
+#endif
+    bceqz $fcc0, .L13 // beta_r != 0.0
+    bceqz $fcc1, .L13 // beta_i != 0.0
+    b .L14 // beta_r == 0.0 && beta_i == 0.0
+    .align 3
+
+.L13: // beta is non-zero: choose by alpha
+    bceqz $fcc2, .L114
+    bceqz $fcc3, .L114 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
+    b .L113 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
+
+.L14: // beta is zero: choose by alpha
+    bceqz $fcc2, .L112
+    bceqz $fcc3, .L112 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
+    b .L111 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
+    .align 3
+
+.L111:  //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
+#ifdef DOUBLE
+    vst VXZ, Y, 0 * SIZE // zero-fill: four vector stores cover Y[0 .. 8 * SIZE)
+    vst VXZ, Y, 2 * SIZE
+    vst VXZ, Y, 4 * SIZE
+    vst VXZ, Y, 6 * SIZE
+    addi.d Y, Y, 8 * SIZE // only Y advances; X is neither read nor moved in this loop
+    addi.d  I, I, -1
+    blt $r0, I, .L111
+    b .L997
+    .align 3
+#else
+    vst VXZ, Y, 0 * SIZE // zero-fill: two vector stores cover Y[0 .. 8 * SIZE)
+    vst VXZ, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE // only Y advances; X is neither read nor moved in this loop
+    addi.d  I, I, -1
+    blt $r0, I, .L111
+    b .L997
+    .align 3
+#endif
+
+.L112:  //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
+#ifdef DOUBLE
+    vld VX0, X, 0 * SIZE // contiguous x; Y is only written in this loop (y = alpha * x)
+    vld VX1, X, 2 * SIZE
+    vpickev.d x1, VX1, VX0 // x1 = even lanes (real parts)
+    vpickod.d x2, VX1, VX0 // x2 = odd lanes (imaginary parts)
+    vfmul.d x3, VXAI, x2 // alpha_i * x_i
+    vfmul.d x4, VXAI, x1 // alpha_i * x_r
+    vfmsub.d x3, VXAR, x1, x3 // re = alpha_r * x_r - alpha_i * x_i
+    vfmadd.d x4, VXAR, x2, x4 // im = alpha_r * x_i + alpha_i * x_r
+    vilvl.d VX2, x4 ,x3 // interleave re/im again for the store
+    vilvh.d VX3, x4, x3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 2 * SIZE
+    // second pair of elements, same sequence at offsets 4 and 6
+    vld VX0, X, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vfmul.d x3, VXAI, x2
+    vfmul.d x4, VXAI, x1
+    vfmsub.d x3, VXAR, x1, x3
+    vfmadd.d x4, VXAR, x2, x4
+    vilvl.d VX2, x4 ,x3
+    vilvh.d VX3, x4, x3
+    vst VX2, Y, 4 * SIZE
+    vst VX3, Y, 6 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L112
+    b .L997
+    .align 3
+#else
+    vld VX0, X, 0 * SIZE // contiguous x; Y is only written in this loop (y = alpha * x)
+    vld VX1, X, 4 * SIZE
+    vpickev.w x1, VX1, VX0 // x1 = even lanes (real parts)
+    vpickod.w x2, VX1, VX0 // x2 = odd lanes (imaginary parts)
+    vfmul.s x3, VXAI, x2 // alpha_i * x_i
+    vfmul.s x4, VXAI, x1 // alpha_i * x_r
+    vfmsub.s x3, VXAR, x1, x3 // re = alpha_r * x_r - alpha_i * x_i
+    vfmadd.s x4, VXAR, x2, x4 // im = alpha_r * x_i + alpha_i * x_r
+    vilvl.w VX2, x4 ,x3 // interleave re/im again for the store
+    vilvh.w VX3, x4, x3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L112
+    b .L997
+    .align 3
+#endif
+
+.L113: //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
+#ifdef DOUBLE
+    vld VX0, Y, 0 * SIZE // in-place y = beta * y; X is neither read nor moved in this loop
+    vld VX1, Y, 2 * SIZE
+    vpickev.d x1, VX1, VX0 // x1 = even lanes (real parts of y)
+    vpickod.d x2, VX1, VX0 // x2 = odd lanes (imaginary parts of y)
+    vfmul.d x3, VXBI, x2 // beta_i * y_i
+    vfmul.d x4, VXBI, x1 // beta_i * y_r
+    vfmsub.d x3, VXBR, x1, x3 // re = beta_r * y_r - beta_i * y_i
+    vfmadd.d x4, VXBR, x2, x4 // im = beta_r * y_i + beta_i * y_r
+    vilvl.d VX2, x4 ,x3 // interleave re/im again for the store
+    vilvh.d VX3, x4, x3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 2 * SIZE
+    vld VX0, Y, 4 * SIZE // second pair of elements, same sequence
+    vld VX1, Y, 6 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vfmul.d x3, VXBI, x2
+    vfmul.d x4, VXBI, x1
+    vfmsub.d x3, VXBR, x1, x3
+    vfmadd.d x4, VXBR, x2, x4
+    vilvl.d VX2, x4 ,x3
+    vilvh.d VX3, x4, x3
+    vst VX2, Y, 4 * SIZE
+    vst VX3, Y, 6 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L113
+    b .L997
+    .align 3
+#else
+    vld VX0, Y, 0 * SIZE // in-place y = beta * y; X is neither read nor moved in this loop
+    vld VX1, Y, 4 * SIZE
+    vpickev.w x1, VX1, VX0 // x1 = even lanes (real parts of y)
+    vpickod.w x2, VX1, VX0 // x2 = odd lanes (imaginary parts of y)
+    vfmul.s x3, VXBI, x2 // beta_i * y_i
+    vfmul.s x4, VXBI, x1 // beta_i * y_r
+    vfmsub.s x3, VXBR, x1, x3 // re = beta_r * y_r - beta_i * y_i
+    vfmadd.s x4, VXBR, x2, x4 // im = beta_r * y_i + beta_i * y_r
+    vilvl.w VX2, x4 ,x3 // interleave re/im again for the store
+    vilvh.w VX3, x4, x3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L113
+    b .L997
+    .align 3
+#endif
+
+.L114: // contiguous X and Y, alpha and beta both non-zero: y = alpha * x + beta * y
+#ifdef DOUBLE
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX3, Y, 2 * SIZE
+    vpickev.d x1, VX1, VX0 // x1 = x real parts (even lanes)
+    vpickod.d x2, VX1, VX0 // x2 = x imaginary parts (odd lanes)
+    vpickev.d x3, VX3, VX2 // x3 = y real parts
+    vpickod.d x4, VX3, VX2 // x4 = y imaginary parts
+    vfmul.d VX0, VXAI, x2 // alpha_i * x_i
+    vfmul.d VX1, VXAI, x1 // alpha_i * x_r
+    vfmul.d VX2, VXBI, x4 // beta_i * y_i
+    vfmul.d VX3, VXBI, x3 // beta_i * y_r
+    vfmsub.d VX0, VXAR, x1, VX0 // re(alpha * x) = alpha_r * x_r - alpha_i * x_i
+    vfmadd.d VX1, VXAR, x2, VX1 // im(alpha * x) = alpha_r * x_i + alpha_i * x_r
+    vfmsub.d VX2, VXBR, x3, VX2 // re(beta * y) = beta_r * y_r - beta_i * y_i
+    vfmadd.d VX3, VXBR, x4, VX3 // im(beta * y) = beta_r * y_i + beta_i * y_r
+    vfadd.d x3, VX0, VX2 // real part of the result
+    vfadd.d x4, VX1, VX3 // imaginary part of the result
+    vilvl.d VX2, x4 ,x3 // interleave re/im again for the store
+    vilvh.d VX3, x4, x3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 2 * SIZE
+    // second pair of elements, same sequence at offsets 4 and 6
+    vld VX0, X, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vld VX2, Y, 4 * SIZE
+    vld VX3, Y, 6 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vpickev.d x3, VX3, VX2
+    vpickod.d x4, VX3, VX2
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX1, VXAI, x1
+    vfmul.d VX2, VXBI, x4
+    vfmul.d VX3, VXBI, x3
+    vfmsub.d VX0, VXAR, x1, VX0
+    vfmadd.d VX1, VXAR, x2, VX1
+    vfmsub.d VX2, VXBR, x3, VX2
+    vfmadd.d VX3, VXBR, x4, VX3
+    vfadd.d x3, VX0, VX2
+    vfadd.d x4, VX1, VX3
+    vilvl.d VX2, x4 ,x3
+    vilvh.d VX3, x4, x3
+    vst VX2, Y, 4 * SIZE
+    vst VX3, Y, 6 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L114
+    b .L997
+    .align 3
+#else
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vpickev.w x1, VX1, VX0 // x1 = x real parts (even lanes)
+    vpickod.w x2, VX1, VX0 // x2 = x imaginary parts (odd lanes)
+    vpickev.w x3, VX3, VX2 // x3 = y real parts
+    vpickod.w x4, VX3, VX2 // x4 = y imaginary parts
+    vfmul.s VX0, VXAI, x2 // alpha_i * x_i
+    vfmul.s VX1, VXAI, x1 // alpha_i * x_r
+    vfmul.s VX2, VXBI, x4 // beta_i * y_i
+    vfmul.s VX3, VXBI, x3 // beta_i * y_r
+    vfmsub.s VX0, VXAR, x1, VX0 // re(alpha * x) = alpha_r * x_r - alpha_i * x_i
+    vfmadd.s VX1, VXAR, x2, VX1 // im(alpha * x) = alpha_r * x_i + alpha_i * x_r
+    vfmsub.s VX2, VXBR, x3, VX2 // re(beta * y) = beta_r * y_r - beta_i * y_i
+    vfmadd.s VX3, VXBR, x4, VX3 // im(beta * y) = beta_r * y_i + beta_i * y_r
+    vfadd.s x3, VX0, VX2 // real part of the result
+    vfadd.s x4, VX1, VX3 // imaginary part of the result
+    vilvl.w VX2, x4 ,x3 // interleave re/im again for the store
+    vilvh.w VX3, x4, x3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L114
+    b .L997
+    .align 3
+#endif
+
+.L12: // INCX==1 and INCY!=1
+    bge $r0, I, .L997 // no full group of 4 elements: scalar tail only
+    move YY, Y // YY = store cursor; Y itself is the load cursor
+    .align 3
+
+.L121: // contiguous X, strided Y: y = alpha * x + beta * y (no alpha/beta zero shortcut here)
+#ifdef DOUBLE
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    ld.d t1, Y, 0 * SIZE // gather two strided y elements as raw 64-bit words
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x3, t1, 0 // x3 = y real parts
+    vinsgr2vr.d x4, t2, 0 // x4 = y imaginary parts
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+    add.d Y, Y, INCY
+    vpickev.d x1, VX1, VX0 // x1 = x real parts (even lanes)
+    vpickod.d x2, VX1, VX0 // x2 = x imaginary parts (odd lanes)
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX1, VXAI, x1
+    vfmul.d VX2, VXBI, x4
+    vfmul.d VX3, VXBI, x3
+    vfmsub.d VX0, VXAR, x1, VX0 // re(alpha * x)
+    vfmadd.d VX1, VXAR, x2, VX1 // im(alpha * x)
+    vfmsub.d VX2, VXBR, x3, VX2 // re(beta * y)
+    vfmadd.d VX3, VXBR, x4, VX3 // im(beta * y)
+    vfadd.d x3, VX0, VX2
+    vfadd.d x4, VX1, VX3
+    vstelm.d x3, YY, 0 * SIZE, 0 // scatter lane 0 (re, im) to the first strided slot
+    vstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d x3, YY, 0 * SIZE, 1 // lane 1 to the next slot
+    vstelm.d x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    // second pair of elements, same sequence with x taken from offsets 4 and 6
+    vld VX0, X, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x3, t1, 0
+    vinsgr2vr.d x4, t2, 0
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+    add.d Y, Y, INCY
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX1, VXAI, x1
+    vfmul.d VX2, VXBI, x4
+    vfmul.d VX3, VXBI, x3
+    vfmsub.d VX0, VXAR, x1, VX0
+    vfmadd.d VX1, VXAR, x2, VX1
+    vfmsub.d VX2, VXBR, x3, VX2
+    vfmadd.d VX3, VXBR, x4, VX3
+    vfadd.d x3, VX0, VX2
+    vfadd.d x4, VX1, VX3
+    addi.d  I, I, -1
+    vstelm.d x3, YY, 0 * SIZE, 0
+    vstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d x3, YY, 0 * SIZE, 1
+    vstelm.d x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L121
+    b .L997
+    .align 3
+#else
+    vld VX0, X, 0 * SIZE
+    ld.w t1, Y, 0 * SIZE // gather four strided y elements as raw 32-bit words
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w x3, t1, 0 // x3 = y real parts
+    vinsgr2vr.w x4, t2, 0 // x4 = y imaginary parts
+    vinsgr2vr.w x3, t3, 1
+    vinsgr2vr.w x4, t4, 1
+
+    vld VX1, X, 4 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    vinsgr2vr.w x3, t1, 2
+    vinsgr2vr.w x4, t2, 2
+    vinsgr2vr.w x3, t3, 3
+    vinsgr2vr.w x4, t4, 3
+    add.d Y, Y, INCY
+
+    vpickev.w x1, VX1, VX0 // x1 = x real parts (even lanes)
+    vpickod.w x2, VX1, VX0 // x2 = x imaginary parts (odd lanes)
+    vfmul.s VX0, VXAI, x2
+    vfmul.s VX1, VXAI, x1
+    vfmul.s VX2, VXBI, x4
+    vfmul.s VX3, VXBI, x3
+    vfmsub.s VX0, VXAR, x1, VX0 // re(alpha * x)
+    vfmadd.s VX1, VXAR, x2, VX1 // im(alpha * x)
+    vfmsub.s VX2, VXBR, x3, VX2 // re(beta * y)
+    vfmadd.s VX3, VXBR, x4, VX3 // im(beta * y)
+    vfadd.s x3, VX0, VX2
+    vfadd.s x4, VX1, VX3
+    addi.d  I, I, -1
+    vstelm.w x3, YY, 0 * SIZE, 0 // scatter lanes 0..3 (re, im) to four strided slots
+    vstelm.w x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 1
+    vstelm.w x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 2
+    vstelm.w x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 3
+    vstelm.w x4, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L121
+    b .L997
+    .align 3
+#endif
+
+.L21:// INCX!=1 and INCY==1
+    bge $r0, I, .L997 // no full group of 4 elements: scalar tail only
+    .align 3
+
+.L211: // strided X, contiguous Y: y = alpha * x + beta * y, 4 elements per trip
+#ifdef DOUBLE
+    vld VX2, Y, 0 * SIZE
+    vld VX3, Y, 2 * SIZE
+    ld.d t1, X, 0 * SIZE // gather two strided x elements as raw 64-bit words
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d x1, t1, 0 // x1 = x real parts
+    vinsgr2vr.d x2, t2, 0 // x2 = x imaginary parts
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    add.d X, X, INCX
+    vpickev.d x3, VX3, VX2 // x3 = y real parts (even lanes)
+    vpickod.d x4, VX3, VX2 // x4 = y imaginary parts (odd lanes)
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX1, VXAI, x1
+    vfmul.d VX2, VXBI, x4
+    vfmul.d VX3, VXBI, x3
+    vfmsub.d VX0, VXAR, x1, VX0 // re(alpha * x)
+    vfmadd.d VX1, VXAR, x2, VX1 // im(alpha * x)
+    vfmsub.d VX2, VXBR, x3, VX2 // re(beta * y)
+    vfmadd.d VX3, VXBR, x4, VX3 // im(beta * y)
+    vfadd.d x3, VX0, VX2
+    vfadd.d x4, VX1, VX3
+    vilvl.d VX2, x4 ,x3 // VX2 = (re, im) of the first element of this pair
+    vilvh.d VX3, x4, x3 // VX3 = (re, im) of the second element of this pair
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 2 * SIZE
+    // second pair of elements, same sequence with y taken from offsets 4 and 6
+    vld VX2, Y, 4 * SIZE
+    vld VX3, Y, 6 * SIZE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    add.d X, X, INCX
+    vpickev.d x3, VX3, VX2
+    vpickod.d x4, VX3, VX2
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX1, VXAI, x1
+    vfmul.d VX2, VXBI, x4
+    vfmul.d VX3, VXBI, x3
+    vfmsub.d VX0, VXAR, x1, VX0
+    vfmadd.d VX1, VXAR, x2, VX1
+    vfmsub.d VX2, VXBR, x3, VX2
+    vfmadd.d VX3, VXBR, x4, VX3
+    vfadd.d x3, VX0, VX2
+    vfadd.d x4, VX1, VX3
+    vilvl.d VX2, x4 ,x3
+    vilvh.d VX3, x4, x3
+    addi.d  I, I, -1
+    vst VX2, Y, 4 * SIZE // low interleave goes to offset 4, matching the first pair (0 / 2)
+    vst VX3, Y, 6 * SIZE // high interleave goes to offset 6
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L211
+    b .L997
+    .align 3
+#else
+    vld VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE // gather four strided x elements as raw 32-bit words
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 0 // x1 = x real parts
+    vinsgr2vr.w x2, t2, 0 // x2 = x imaginary parts
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+    vld VX3, Y, 4 * SIZE
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    add.d X, X, INCX
+
+    vpickev.w x3, VX3, VX2 // x3 = y real parts (even lanes)
+    vpickod.w x4, VX3, VX2 // x4 = y imaginary parts (odd lanes)
+    vfmul.s VX0, VXAI, x2
+    vfmul.s VX1, VXAI, x1
+    vfmul.s VX2, VXBI, x4
+    vfmul.s VX3, VXBI, x3
+    vfmsub.s VX0, VXAR, x1, VX0 // re(alpha * x)
+    vfmadd.s VX1, VXAR, x2, VX1 // im(alpha * x)
+    vfmsub.s VX2, VXBR, x3, VX2 // re(beta * y)
+    vfmadd.s VX3, VXBR, x4, VX3 // im(beta * y)
+    vfadd.s x3, VX0, VX2
+    vfadd.s x4, VX1, VX3
+    vilvl.w VX2, x4 ,x3 // interleave re/im again for the store
+    vilvh.w VX3, x4, x3
+    addi.d  I, I, -1
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L211
+    b .L997
+    .align 3
+#endif
+
+.L22: // INCX!=1 and INCY!=1: pick a strided loop from the alpha/beta zero tests
+    bge $r0, I, .L997 // no full group of 4 elements: scalar tail only
+    move YY, Y // YY starts as a copy of Y; the loops below use it as a store cursor
+#ifdef DOUBLE
+    fcmp.ceq.d $fcc0, BETAR, a1 // fcc0 <- beta_r == 0.0
+    fcmp.ceq.d $fcc1, BETAI, a1 // fcc1 <- beta_i == 0.0
+    fcmp.ceq.d $fcc2, ALPHAR, a1 // fcc2 <- alpha_r == 0.0
+    fcmp.ceq.d $fcc3, ALPHAI, a1 // fcc3 <- alpha_i == 0.0
+#else
+    fcmp.ceq.s $fcc0, BETAR, a1 // fcc0 <- beta_r == 0.0
+    fcmp.ceq.s $fcc1, BETAI, a1 // fcc1 <- beta_i == 0.0
+    fcmp.ceq.s $fcc2, ALPHAR, a1 // fcc2 <- alpha_r == 0.0
+    fcmp.ceq.s $fcc3, ALPHAI, a1 // fcc3 <- alpha_i == 0.0
+#endif
+    bceqz $fcc0, .L23 // beta_r != 0.0
+    bceqz $fcc1, .L23 // beta_i != 0.0
+    b .L24 // beta_r == 0.0 && beta_i == 0.0
+    .align 3
+
+.L23: // beta is non-zero: choose by alpha
+    bceqz $fcc2, .L224
+    bceqz $fcc3, .L224 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
+    b .L223 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
+    .align 3
+
+.L24: // beta is zero: choose by alpha
+    bceqz $fcc2, .L222
+    bceqz $fcc3, .L222 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
+    b .L221 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
+    .align 3
+
+.L221:  //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
+#ifdef DOUBLE
+    vstelm.d VXZ, Y, 0 * SIZE, 0 // zero the real part of the strided y element
+    vstelm.d VXZ, Y, 1 * SIZE, 0 // zero the imaginary part as well
+    add.d Y, Y, INCY
+    vstelm.d VXZ, Y, 0 * SIZE, 0
+    vstelm.d VXZ, Y, 1 * SIZE, 0
+    add.d Y, Y, INCY
+    vstelm.d VXZ, Y, 0 * SIZE, 0
+    vstelm.d VXZ, Y, 1 * SIZE, 0
+    add.d Y, Y, INCY
+    vstelm.d VXZ, Y, 0 * SIZE, 0
+    vstelm.d VXZ, Y, 1 * SIZE, 0
+    add.d Y, Y, INCY // four elements per trip; X is neither read nor moved in this loop
+    addi.d I, I, -1
+    blt $r0, I, .L221
+    b .L997
+    .align 3
+#else
+    vstelm.w VXZ, Y, 0 * SIZE, 0 // zero the real part of the strided y element
+    vstelm.w VXZ, Y, 1 * SIZE, 0 // zero the imaginary part as well
+    add.d Y, Y, INCY
+    vstelm.w VXZ, Y, 0 * SIZE, 0
+    vstelm.w VXZ, Y, 1 * SIZE, 0
+    add.d Y, Y, INCY
+    vstelm.w VXZ, Y, 0 * SIZE, 0
+    vstelm.w VXZ, Y, 1 * SIZE, 0
+    add.d Y, Y, INCY
+    vstelm.w VXZ, Y, 0 * SIZE, 0
+    vstelm.w VXZ, Y, 1 * SIZE, 0
+    add.d Y, Y, INCY // four elements per trip; X is neither read nor moved in this loop
+    addi.d I, I, -1
+    blt $r0, I, .L221
+    b .L997
+    .align 3
+#endif
+
+.L222:  //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE // gather two strided x elements as raw 64-bit words
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0 // x1 = x real parts
+    vinsgr2vr.d x2, t2, 0 // x2 = x imaginary parts
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    vfmul.d x3, VXAI, x2 // alpha_i * x_i
+    vfmul.d x4, VXAI, x1 // alpha_i * x_r
+    vfmsub.d x3, VXAR, x1, x3 // re = alpha_r * x_r - alpha_i * x_i
+    vfmadd.d x4, VXAR, x2, x4 // im = alpha_r * x_i + alpha_i * x_r
+    vstelm.d x3, Y, 0 * SIZE, 0 // store through Y itself: Y is never read here, and
+    vstelm.d x4, Y, 1 * SIZE, 0 // advancing it keeps it valid for the scalar tail .L998
+    add.d Y, Y, INCY
+    vstelm.d x3, Y, 0 * SIZE, 1
+    vstelm.d x4, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    // second pair of elements, same sequence
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    add.d X, X, INCX
+    vfmul.d x3, VXAI, x2
+    vfmul.d x4, VXAI, x1
+    vfmsub.d x3, VXAR, x1, x3
+    vfmadd.d x4, VXAR, x2, x4
+    addi.d  I, I, -1
+    vstelm.d x3, Y, 0 * SIZE, 0
+    vstelm.d x4, Y, 1 * SIZE, 0
+    add.d Y, Y, INCY
+    vstelm.d x3, Y, 0 * SIZE, 1
+    vstelm.d x4, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    blt $r0, I, .L222
+    b .L997
+    .align 3
+#else
+    ld.w t1, X, 0 * SIZE // gather four strided x elements as raw 32-bit words
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 0 // x1 = x real parts
+    vinsgr2vr.w x2, t2, 0 // x2 = x imaginary parts
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    add.d X, X, INCX
+    vfmul.s x3, VXAI, x2 // alpha_i * x_i
+    vfmul.s x4, VXAI, x1 // alpha_i * x_r
+    vfmsub.s x3, VXAR, x1, x3 // re = alpha_r * x_r - alpha_i * x_i
+    vfmadd.s x4, VXAR, x2, x4 // im = alpha_r * x_i + alpha_i * x_r
+    addi.d  I, I, -1
+    vstelm.w x3, Y, 0 * SIZE, 0 // store through Y itself: Y is never read here, and
+    vstelm.w x4, Y, 1 * SIZE, 0 // advancing it keeps it valid for the scalar tail .L998
+    add.d Y, Y, INCY
+    vstelm.w x3, Y, 0 * SIZE, 1
+    vstelm.w x4, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    vstelm.w x3, Y, 0 * SIZE, 2
+    vstelm.w x4, Y, 1 * SIZE, 2
+    add.d Y, Y, INCY
+    vstelm.w x3, Y, 0 * SIZE, 3
+    vstelm.w x4, Y, 1 * SIZE, 3
+    add.d Y, Y, INCY
+    blt $r0, I, .L222
+    b .L997
+    .align 3
+#endif
+
+.L223: // strided Y, alpha == 0 and beta != 0: y = beta * y; X is neither read nor moved
+#ifdef DOUBLE
+    ld.d t1, Y, 0 * SIZE // gather two strided y elements through Y (load cursor)
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x1, t1, 0 // x1 = y real parts
+    vinsgr2vr.d x2, t2, 0 // x2 = y imaginary parts
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d x3, VXBI, x2 // beta_i * y_i
+    vfmul.d x4, VXBI, x1 // beta_i * y_r
+    vfmsub.d x3, VXBR, x1, x3 // re = beta_r * y_r - beta_i * y_i
+    vfmadd.d x4, VXBR, x2, x4 // im = beta_r * y_i + beta_i * y_r
+    vstelm.d x3, YY, 0 * SIZE, 0 // scatter through YY (store cursor)
+    vstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d x3, YY, 0 * SIZE, 1
+    vstelm.d x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    ld.d t1, Y, 0 * SIZE // second pair of elements, same sequence
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d x3, VXBI, x2
+    vfmul.d x4, VXBI, x1
+    vfmsub.d x3, VXBR, x1, x3
+    vfmadd.d x4, VXBR, x2, x4
+    addi.d  I, I, -1
+    vstelm.d x3, YY, 0 * SIZE, 0
+    vstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d x3, YY, 0 * SIZE, 1
+    vstelm.d x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    blt $r0, I, .L223
+    b .L997
+    .align 3
+#else
+    ld.w t1, Y, 0 * SIZE // gather four strided y elements through Y (load cursor)
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w x1, t1, 0 // x1 = y real parts
+    vinsgr2vr.w x2, t2, 0 // x2 = y imaginary parts
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s x3, VXBI, x2 // beta_i * y_i
+    vfmul.s x4, VXBI, x1 // beta_i * y_r
+    vfmsub.s x3, VXBR, x1, x3 // re = beta_r * y_r - beta_i * y_i
+    vfmadd.s x4, VXBR, x2, x4 // im = beta_r * y_i + beta_i * y_r
+
+    addi.d  I, I, -1
+    vstelm.w x3, YY, 0 * SIZE, 0 // scatter through YY (store cursor)
+    vstelm.w x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 1
+    vstelm.w x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 2
+    vstelm.w x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 3
+    vstelm.w x4, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    blt $r0, I, .L223
+    b .L997
+    .align 3
+#endif
+
+.L224: // strided X and Y, alpha and beta both non-zero: y = alpha * x + beta * y
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE // gather two strided x elements as raw 64-bit words
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0 // x1 = x real parts
+    vinsgr2vr.d x2, t2, 0 // x2 = x imaginary parts
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    ld.d t1, Y, 0 * SIZE // gather the matching y elements through Y (load cursor)
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x3, t1, 0 // x3 = y real parts
+    vinsgr2vr.d x4, t2, 0 // x4 = y imaginary parts
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX1, VXAI, x1
+    vfmul.d VX2, VXBI, x4
+    vfmul.d VX3, VXBI, x3
+    vfmsub.d VX0, VXAR, x1, VX0 // re(alpha * x)
+    vfmadd.d VX1, VXAR, x2, VX1 // im(alpha * x)
+    vfmsub.d VX2, VXBR, x3, VX2 // re(beta * y)
+    vfmadd.d VX3, VXBR, x4, VX3 // im(beta * y)
+    vfadd.d x3, VX0, VX2
+    vfadd.d x4, VX1, VX3
+    vstelm.d x3, YY, 0 * SIZE, 0 // scatter through YY (store cursor)
+    vstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d x3, YY, 0 * SIZE, 1
+    vstelm.d x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    // second pair of elements, same sequence
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x3, t1, 0
+    vinsgr2vr.d x4, t2, 0
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX1, VXAI, x1
+    vfmul.d VX2, VXBI, x4
+    vfmul.d VX3, VXBI, x3
+    vfmsub.d VX0, VXAR, x1, VX0
+    vfmadd.d VX1, VXAR, x2, VX1
+    vfmsub.d VX2, VXBR, x3, VX2
+    vfmadd.d VX3, VXBR, x4, VX3
+    vfadd.d x3, VX0, VX2
+    vfadd.d x4, VX1, VX3
+    vstelm.d x3, YY, 0 * SIZE, 0
+    vstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d x3, YY, 0 * SIZE, 1
+    vstelm.d x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L224
+    b .L997
+    .align 3
+#else
+    ld.w t1, X, 0 * SIZE // gather four strided x elements as raw 32-bit words
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 0 // x1 = x real parts
+    vinsgr2vr.w x2, t2, 0 // x2 = x imaginary parts
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    // gather the matching four y elements through Y (load cursor)
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w x3, t1, 0 // x3 = y real parts
+    vinsgr2vr.w x4, t2, 0 // x4 = y imaginary parts
+    vinsgr2vr.w x3, t3, 1
+    vinsgr2vr.w x4, t4, 1
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    vinsgr2vr.w x3, t1, 2
+    vinsgr2vr.w x4, t2, 2
+    vinsgr2vr.w x3, t3, 3
+    vinsgr2vr.w x4, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VX0, VXAI, x2
+    vfmul.s VX1, VXAI, x1
+    vfmul.s VX2, VXBI, x4
+    vfmul.s VX3, VXBI, x3
+    vfmsub.s VX0, VXAR, x1, VX0 // re(alpha * x)
+    vfmadd.s VX1, VXAR, x2, VX1 // im(alpha * x)
+    vfmsub.s VX2, VXBR, x3, VX2 // re(beta * y)
+    vfmadd.s VX3, VXBR, x4, VX3 // im(beta * y)
+    vfadd.s x3, VX0, VX2
+    vfadd.s x4, VX1, VX3
+    addi.d  I, I, -1
+    // scatter lanes 0..3 through YY (store cursor)
+    vstelm.w x3, YY, 0 * SIZE, 0
+    vstelm.w x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 1
+    vstelm.w x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 2
+    vstelm.w x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 3
+    vstelm.w x4, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    blt $r0, I, .L224
+    b .L997
+    .align 3
+#endif
+
+.L997: // scalar tail after a vector loop: I = N & 3 leftover elements
+    andi I, N, 3
+    bge $r0, I, .L999
+    .align 3
+
+.L998: // one complex element per trip; also entered from the prologue with I = N
+#ifdef DOUBLE
+    fld.d a1, X, 0 * SIZE // a1 = x_r (overwrites the 0.0 reference, no longer needed here)
+    fld.d a2, X, 1 * SIZE // a2 = x_i
+    fld.d a3, Y, 0 * SIZE // a3 = y_r
+    fld.d a4, Y, 1 * SIZE // a4 = y_i
+    addi.d I, I, -1
+    fmul.d s1, ALPHAI, a2 // alpha_i * x_i
+    fmul.d s2, ALPHAI, a1 // alpha_i * x_r
+    fmul.d s3, BETAI, a4 // beta_i * y_i
+    fmul.d s4, BETAI, a3 // beta_i * y_r
+    fmsub.d s1, ALPHAR, a1, s1 // re(alpha * x) = alpha_r * x_r - alpha_i * x_i
+    fmadd.d s2, a2, ALPHAR, s2 // im(alpha * x) = x_i * alpha_r + alpha_i * x_r
+    fmsub.d s3, BETAR, a3, s3 // re(beta * y) = beta_r * y_r - beta_i * y_i
+    fmadd.d s4, a4, BETAR, s4 // im(beta * y) = y_i * beta_r + beta_i * y_r
+    fadd.d s3, s3, s1
+    fadd.d s4, s4, s2
+    fst.d s3, Y, 0 * SIZE
+    fst.d s4, Y, 1 * SIZE
+    add.d X, X, INCX // both pointers step by their scaled strides
+    add.d Y, Y, INCY
+    blt $r0, I, .L998
+    .align 3
+#else
+    fld.s a1, X, 0 * SIZE // a1 = x_r (overwrites the 0.0 reference, no longer needed here)
+    fld.s a2, X, 1 * SIZE // a2 = x_i
+    fld.s a3, Y, 0 * SIZE // a3 = y_r
+    fld.s a4, Y, 1 * SIZE // a4 = y_i
+    addi.d I, I, -1
+    fmul.s s1, ALPHAI, a2 // alpha_i * x_i
+    fmul.s s2, ALPHAI, a1 // alpha_i * x_r
+    fmul.s s3, BETAI, a4 // beta_i * y_i
+    fmul.s s4, BETAI, a3 // beta_i * y_r
+    fmsub.s s1, ALPHAR, a1, s1 // re(alpha * x) = alpha_r * x_r - alpha_i * x_i
+    fmadd.s s2, a2, ALPHAR, s2 // im(alpha * x) = x_i * alpha_r + alpha_i * x_r
+    fmsub.s s3, BETAR, a3, s3 // re(beta * y) = beta_r * y_r - beta_i * y_i
+    fmadd.s s4, a4, BETAR, s4 // im(beta * y) = y_i * beta_r + beta_i * y_r
+    fadd.s s3, s3, s1
+    fadd.s s4, s4, s2
+    fst.s s3, Y, 0 * SIZE
+    fst.s s4, Y, 1 * SIZE
+    add.d X, X, INCX // both pointers step by their scaled strides
+    add.d Y, Y, INCY
+    blt $r0, I, .L998
+    .align 3
+#endif
+.L999: // common exit, also taken directly when N <= 0
+    move $r4, $r12 // $r4 <- I ($r12)
+    jirl $r0, $r1, 0x0 // jump to the address held in $r1 without linking
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/caxpy_lasx.S b/kernel/loongarch64/caxpy_lasx.S
new file mode 100644
index 000000000..2b970fe70
--- /dev/null
+++ b/kernel/loongarch64/caxpy_lasx.S
@@ -0,0 +1,707 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N      $r4
+#define XX     $r5
+#define YY     $r6
+#define ALPHAR $f0
+#define ALPHAI $f1
+#define X      $r7
+#define INCX   $r8
+#define Y      $r9
+#define INCY   $r10
+
+#define I      $r12
+#define TEMP   $r13
+#define t1     $r14
+#define t2     $r16
+#define t3     $r15
+#define t4     $r17
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define s1     $f16
+#define s2     $f17
+#define s3     $f18
+#define s4     $f19
+#define VX0    $xr8
+#define VX1    $xr20
+#define VX2    $xr21
+#define VX3    $xr22
+#define VXAR   $xr23
+#define VXAI   $xr19
+#define x1     $xr18
+#define x2     $xr17
+#define x3     $xr16
+#define x4     $xr15
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    FFINT  a1, a1
+    CMPEQ  $fcc0, ALPHAR, a1
+    CMPEQ  $fcc1, ALPHAI, a1
+    bceqz $fcc0, .L10
+    bcnez $fcc1, .L999
+.L10:
+    slli.d  TEMP, TEMP, ZBASE_SHIFT
+    slli.d  INCX, INCX, ZBASE_SHIFT
+    slli.d  INCY, INCY, ZBASE_SHIFT
+    MTG  t1, ALPHAR
+    MTG  t2, ALPHAI
+#ifdef DOUBLE
+    xvreplgr2vr.d VXAR, t1
+    xvreplgr2vr.d VXAI, t2
+    srai.d I, N, 2
+#else
+    xvreplgr2vr.w VXAR, t1
+    xvreplgr2vr.w VXAI, t2
+    srai.d I, N, 3
+#endif
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:
+    bge $r0, I, .L997
+    .align 3
+
+.L111:
+#ifdef DOUBLE
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+    xvpickev.d x3, VX3, VX2
+    xvpickod.d x4, VX3, VX2
+#else
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    xvld VX1, X, 8 * SIZE
+    xvld VX3, Y, 8 * SIZE
+    xvpickev.w x1, VX1, VX0
+    xvpickod.w x2, VX1, VX0
+    xvpickev.w x3, VX3, VX2
+    xvpickod.w x4, VX3, VX2
+#endif
+#if !defined(CONJ)
+#ifdef DOUBLE
+    xvfmul.d VX0, VXAI, x2
+    xvfmul.d VX2, VXAI, x1
+    xvfmsub.d VX1, VXAR, x1, VX0
+    xvfmadd.d VX3, x2, VXAR, VX2
+    xvfadd.d x3, x3, VX1
+    xvfadd.d x4, x4, VX3
+#else
+    xvfmul.s VX0, VXAI, x2
+    xvfmul.s VX2, VXAI, x1
+    xvfmsub.s VX1, VXAR, x1, VX0
+    xvfmadd.s VX3, x2, VXAR, VX2
+    xvfadd.s x3, x3, VX1
+    xvfadd.s x4, x4, VX3
+#endif
+#else
+#ifdef DOUBLE
+    xvfmul.d VX0, VXAI, x2
+    xvfmul.d VX2, VXAI, x1
+    xvfmadd.d VX1, VXAR, x1, VX0
+    xvfmsub.d VX3, x2, VXAR, VX2
+    xvfadd.d x3, x3, VX1
+    xvfsub.d x4, x4, VX3
+#else
+    xvfmul.s VX0, VXAI, x2
+    xvfmul.s VX2, VXAI, x1
+    xvfmadd.s VX1, VXAR, x1, VX0
+    xvfmsub.s VX3, x2, VXAR, VX2
+    xvfadd.s x3, x3, VX1
+    xvfsub.s x4, x4, VX3
+#endif
+#endif
+#ifdef DOUBLE
+    xvilvl.d VX2, x4 ,x3
+    xvilvh.d VX3, x4, x3
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#else
+    xvilvl.w VX2, x4 ,x3
+    xvilvh.w VX3, x4, x3
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 8 * SIZE
+    addi.d X, X, 16 * SIZE
+    addi.d Y, Y, 16 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L111
+    b .L997
+    .align 3
+
+.L12: // INCX==1 and INCY!=1: X is contiguous, Y is strided
+    bge $r0, I, .L997 // no full vector batch (I <= 0): go to the scalar tail
+    move YY, Y // YY is the store cursor; Y itself is advanced by the gather loads below
+    .align 3
+
+.L121: // one pass = 4 complex elements (DOUBLE) or 8 (single). NOTE(review): a whole batch of Y is loaded before it is stored, so INCY == 0 would not accumulate per element - confirm callers
+#ifdef DOUBLE
+    xvld VX0, X, 0 * SIZE // x[0..1], interleaved re/im
+    xvld VX1, X, 4 * SIZE // x[2..3], interleaved re/im
+    ld.d t1, Y, 0 * SIZE // gather Y one element at a time: t1/t3 = re, t2/t4 = im
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d x3, t1, 0 // x3 collects Y real parts, x4 collects Y imaginary parts
+    xvinsgr2vr.d x4, t2, 0
+    xvinsgr2vr.d x3, t3, 2 // lane order is 0, 2, 1, 3; the stores below read lanes in the same order
+    xvinsgr2vr.d x4, t4, 2
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    xvinsgr2vr.d x3, t1, 1
+    xvinsgr2vr.d x4, t2, 1
+    xvinsgr2vr.d x3, t3, 3
+    xvinsgr2vr.d x4, t4, 3
+    add.d Y, Y, INCY
+    xvpickev.d x1, VX1, VX0 // x1 = even (real) elements of the X data
+    xvpickod.d x2, VX1, VX0 // x2 = odd (imaginary) elements; presumably in the same 0,2,1,3 element order - confirm against the LASX manual
+#else // single precision
+    xvld VX0, X, 0 * SIZE // x[0..3], interleaved re/im
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 0 // lane order is 0, 1, 4, 5, 2, 3, 6, 7; the stores below use the same order
+    xvinsgr2vr.w x4, t2, 0
+    xvinsgr2vr.w x3, t3, 1
+    xvinsgr2vr.w x4, t4, 1
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 4
+    xvinsgr2vr.w x4, t2, 4
+    xvinsgr2vr.w x3, t3, 5
+    xvinsgr2vr.w x4, t4, 5
+    xvld VX1, X, 8 * SIZE // x[4..7], interleaved re/im
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 2
+    xvinsgr2vr.w x4, t2, 2
+    xvinsgr2vr.w x3, t3, 3
+    xvinsgr2vr.w x4, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    xvinsgr2vr.w x3, t1, 6
+    xvinsgr2vr.w x4, t2, 6
+    xvinsgr2vr.w x3, t3, 7
+    xvinsgr2vr.w x4, t4, 7
+    add.d Y, Y, INCY
+    xvpickev.w x1, VX1, VX0 // x1 = even (real) elements of the X data
+    xvpickod.w x2, VX1, VX0 // x2 = odd (imaginary) elements of the X data
+#endif
+#if !defined(CONJ) // plain update: y += alpha * x
+#ifdef DOUBLE
+    xvfmul.d VX0, VXAI, x2 // VXAR/VXAI are set up before this section; presumably alpha re/im broadcast - confirm
+    xvfmul.d VX2, VXAI, x1
+    xvfmsub.d VX1, VXAR, x1, VX0 // VX1 = VXAR*x1 - VXAI*x2 (real part of the product)
+    xvfmadd.d VX3, x2, VXAR, VX2 // VX3 = VXAR*x2 + VXAI*x1 (imaginary part of the product)
+    xvfadd.d x3, x3, VX1
+    xvfadd.d x4, x4, VX3
+#else
+    xvfmul.s VX0, VXAI, x2
+    xvfmul.s VX2, VXAI, x1
+    xvfmsub.s VX1, VXAR, x1, VX0
+    xvfmadd.s VX3, x2, VXAR, VX2
+    xvfadd.s x3, x3, VX1
+    xvfadd.s x4, x4, VX3
+#endif
+#else // CONJ variant: y += alpha * conj(x)
+#ifdef DOUBLE
+    xvfmul.d VX0, VXAI, x2
+    xvfmul.d VX2, VXAI, x1
+    xvfmadd.d VX1, VXAR, x1, VX0 // VX1 = VXAR*x1 + VXAI*x2
+    xvfmsub.d VX3, x2, VXAR, VX2 // VX3 = VXAR*x2 - VXAI*x1
+    xvfadd.d x3, x3, VX1
+    xvfsub.d x4, x4, VX3 // imaginary part is subtracted in the CONJ variant
+#else
+    xvfmul.s VX0, VXAI, x2
+    xvfmul.s VX2, VXAI, x1
+    xvfmadd.s VX1, VXAR, x1, VX0
+    xvfmsub.s VX3, x2, VXAR, VX2
+    xvfadd.s x3, x3, VX1
+    xvfsub.s x4, x4, VX3
+#endif
+#endif
+#ifdef DOUBLE
+    xvstelm.d x3, YY, 0 * SIZE, 0 // scatter re/im back through YY, lanes 0, 2, 1, 3 (mirrors the gather)
+    xvstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 2
+    xvstelm.d x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 1
+    xvstelm.d x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 3
+    xvstelm.d x4, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE // 4 complex elements consumed from X
+    addi.d  I, I, -1
+#else
+    addi.d  I, I, -1
+    xvstelm.w x3, YY, 0 * SIZE, 0 // scatter in lane order 0, 1, 4, 5, 2, 3, 6, 7 (mirrors the gather)
+    xvstelm.w x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 1
+    xvstelm.w x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 4
+    xvstelm.w x4, YY, 1 * SIZE, 4
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 5
+    xvstelm.w x4, YY, 1 * SIZE, 5
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 2
+    xvstelm.w x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 3
+    xvstelm.w x4, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 6
+    xvstelm.w x4, YY, 1 * SIZE, 6
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 7
+    xvstelm.w x4, YY, 1 * SIZE, 7
+    add.d YY, YY, INCY
+    addi.d X, X, 16 * SIZE // 8 complex elements consumed from X
+#endif
+    blt $r0, I, .L121 // loop while batches remain
+    b .L997 // leftover elements are handled by the scalar tail
+    .align 3
+
+.L21:// INCX!=1 and INCY==1: X is strided, Y is contiguous
+    bge $r0, I, .L997 // no full vector batch (I <= 0): go to the scalar tail
+    .align 3
+
+.L211: // one pass = 4 complex elements (DOUBLE) or 8 complex elements (single)
+#ifdef DOUBLE
+    xvld VX2, Y, 0 * SIZE // y[0..1], interleaved re/im
+    xvld VX3, Y, 4 * SIZE // y[2..3], interleaved re/im
+    ld.d t1, X, 0 * SIZE // gather X one element at a time: t1/t3 = re, t2/t4 = im
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 0 // x1 collects X real parts, x2 collects X imaginary parts
+    xvinsgr2vr.d x2, t2, 0
+    xvinsgr2vr.d x1, t3, 2 // lane order is 0, 2, 1, 3; presumably matches the element order xvpickev.d gives x3/x4 - confirm against the LASX manual
+    xvinsgr2vr.d x2, t4, 2
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    xvinsgr2vr.d x1, t1, 1
+    xvinsgr2vr.d x2, t2, 1
+    xvinsgr2vr.d x1, t3, 3
+    xvinsgr2vr.d x2, t4, 3
+    add.d X, X, INCX
+    xvpickev.d x3, VX3, VX2 // x3 = even (real) elements of the Y data
+    xvpickod.d x4, VX3, VX2 // x4 = odd (imaginary) elements of the Y data
+#else // single precision
+    xvld VX2, Y, 0 * SIZE // y[0..3], interleaved re/im
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 0 // lane order is 0, 1, 4, 5, 2, 3, 6, 7
+    xvinsgr2vr.w x2, t2, 0
+    xvinsgr2vr.w x1, t3, 1
+    xvinsgr2vr.w x2, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 4
+    xvinsgr2vr.w x2, t2, 4
+    xvinsgr2vr.w x1, t3, 5
+    xvinsgr2vr.w x2, t4, 5
+    xvld VX3, Y, 8 * SIZE // y[4..7], interleaved re/im
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 2
+    xvinsgr2vr.w x2, t2, 2
+    xvinsgr2vr.w x1, t3, 3
+    xvinsgr2vr.w x2, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 6
+    xvinsgr2vr.w x2, t2, 6
+    xvinsgr2vr.w x1, t3, 7
+    xvinsgr2vr.w x2, t4, 7
+    xvpickev.w x3, VX3, VX2 // x3 = even (real) elements of the Y data
+    xvpickod.w x4, VX3, VX2 // x4 = odd (imaginary) elements of the Y data
+#endif
+#if !defined(CONJ) // plain update: y += alpha * x
+#ifdef DOUBLE
+    xvfmul.d VX0, VXAI, x2 // VXAR/VXAI are set up before this section; presumably alpha re/im broadcast - confirm
+    xvfmul.d VX2, VXAI, x1
+    xvfmsub.d VX1, VXAR, x1, VX0 // VX1 = VXAR*x1 - VXAI*x2 (real part of the product)
+    xvfmadd.d VX3, x2, VXAR, VX2 // VX3 = VXAR*x2 + VXAI*x1 (imaginary part of the product)
+    xvfadd.d x3, x3, VX1
+    xvfadd.d x4, x4, VX3
+#else
+    xvfmul.s VX0, VXAI, x2
+    xvfmul.s VX2, VXAI, x1
+    xvfmsub.s VX1, VXAR, x1, VX0
+    xvfmadd.s VX3, x2, VXAR, VX2
+    xvfadd.s x3, x3, VX1
+    xvfadd.s x4, x4, VX3
+#endif
+#else // CONJ variant: y += alpha * conj(x)
+#ifdef DOUBLE
+    xvfmul.d VX0, VXAI, x2
+    xvfmul.d VX2, VXAI, x1
+    xvfmadd.d VX1, VXAR, x1, VX0 // VX1 = VXAR*x1 + VXAI*x2
+    xvfmsub.d VX3, x2, VXAR, VX2 // VX3 = VXAR*x2 - VXAI*x1
+    xvfadd.d x3, x3, VX1
+    xvfsub.d x4, x4, VX3 // imaginary part is subtracted in the CONJ variant
+#else
+    xvfmul.s VX0, VXAI, x2
+    xvfmul.s VX2, VXAI, x1
+    xvfmadd.s VX1, VXAR, x1, VX0
+    xvfmsub.s VX3, x2, VXAR, VX2
+    xvfadd.s x3, x3, VX1
+    xvfsub.s x4, x4, VX3
+#endif
+#endif
+#ifdef DOUBLE
+    xvilvl.d VX2, x4 ,x3 // re-interleave re/im into memory layout before the contiguous stores
+    xvilvh.d VX3, x4, x3
+    addi.d  I, I, -1
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE // 4 complex elements written to Y
+#else
+    xvilvl.w VX2, x4 ,x3 // re-interleave re/im into memory layout before the contiguous stores
+    xvilvh.w VX3, x4, x3
+    addi.d  I, I, -1
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 8 * SIZE
+    addi.d Y, Y, 16 * SIZE // 8 complex elements written to Y
+#endif
+    blt $r0, I, .L211 // loop while batches remain
+    b .L997 // leftover elements are handled by the scalar tail
+    .align 3
+
+.L22: // reached when neither the contiguous-Y (.L21) nor the contiguous-X (.L12) path applies: both vectors are gathered element by element
+    bge $r0, I, .L997 // no full vector batch (I <= 0): go to the scalar tail
+    move YY, Y // YY is the store cursor; Y itself is advanced by the gather loads below
+    .align 3
+
+.L222: // one pass = 4 complex elements (DOUBLE) or 8 (single). NOTE(review): a whole batch of Y is loaded before it is stored, so INCY == 0 would not accumulate per element - confirm callers
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE // gather X: t1/t3 = re, t2/t4 = im
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 0 // x1 = X real parts, x2 = X imaginary parts, natural lane order 0..3
+    xvinsgr2vr.d x2, t2, 0
+    xvinsgr2vr.d x1, t3, 1
+    xvinsgr2vr.d x2, t4, 1
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 2
+    xvinsgr2vr.d x2, t2, 2
+    xvinsgr2vr.d x1, t3, 3
+    xvinsgr2vr.d x2, t4, 3
+    ld.d t1, Y, 0 * SIZE // gather Y the same way
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d x3, t1, 0 // x3 = Y real parts, x4 = Y imaginary parts
+    xvinsgr2vr.d x4, t2, 0
+    xvinsgr2vr.d x3, t3, 1
+    xvinsgr2vr.d x4, t4, 1
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d x3, t1, 2
+    xvinsgr2vr.d x4, t2, 2
+    xvinsgr2vr.d x3, t3, 3
+    xvinsgr2vr.d x4, t4, 3
+#else // single precision: lanes 0..3 first, then lanes 4..7
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 0 // x1 = X real parts, x2 = X imaginary parts
+    xvinsgr2vr.w x2, t2, 0
+    xvinsgr2vr.w x1, t3, 1
+    xvinsgr2vr.w x2, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 2
+    xvinsgr2vr.w x2, t2, 2
+    xvinsgr2vr.w x1, t3, 3
+    xvinsgr2vr.w x2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 0 // x3 = Y real parts, x4 = Y imaginary parts
+    xvinsgr2vr.w x4, t2, 0
+    xvinsgr2vr.w x3, t3, 1
+    xvinsgr2vr.w x4, t4, 1
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 2
+    xvinsgr2vr.w x4, t2, 2
+    xvinsgr2vr.w x3, t3, 3
+    xvinsgr2vr.w x4, t4, 3
+
+    ld.w t1, X, 0 * SIZE // second half of the batch: elements 4..7
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 4
+    xvinsgr2vr.w x2, t2, 4
+    xvinsgr2vr.w x1, t3, 5
+    xvinsgr2vr.w x2, t4, 5
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 6
+    xvinsgr2vr.w x2, t2, 6
+    xvinsgr2vr.w x1, t3, 7
+    xvinsgr2vr.w x2, t4, 7
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 4
+    xvinsgr2vr.w x4, t2, 4
+    xvinsgr2vr.w x3, t3, 5
+    xvinsgr2vr.w x4, t4, 5
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 6
+    xvinsgr2vr.w x4, t2, 6
+    xvinsgr2vr.w x3, t3, 7
+    xvinsgr2vr.w x4, t4, 7
+#endif
+#if !defined(CONJ) // plain update: y += alpha * x
+#ifdef DOUBLE
+    xvfmul.d VX0, VXAI, x2 // VXAR/VXAI are set up before this section; presumably alpha re/im broadcast - confirm
+    xvfmul.d VX2, VXAI, x1
+    xvfmsub.d VX1, VXAR, x1, VX0 // VX1 = VXAR*x1 - VXAI*x2 (real part of the product)
+    xvfmadd.d VX3, x2, VXAR, VX2 // VX3 = VXAR*x2 + VXAI*x1 (imaginary part of the product)
+    xvfadd.d x3, x3, VX1
+    xvfadd.d x4, x4, VX3
+#else
+    xvfmul.s VX0, VXAI, x2
+    xvfmul.s VX2, VXAI, x1
+    xvfmsub.s VX1, VXAR, x1, VX0
+    xvfmadd.s VX3, x2, VXAR, VX2
+    xvfadd.s x3, x3, VX1
+    xvfadd.s x4, x4, VX3
+#endif
+#else // CONJ variant: y += alpha * conj(x)
+#ifdef DOUBLE
+    xvfmul.d VX0, VXAI, x2
+    xvfmul.d VX2, VXAI, x1
+    xvfmadd.d VX1, VXAR, x1, VX0 // VX1 = VXAR*x1 + VXAI*x2
+    xvfmsub.d VX3, x2, VXAR, VX2 // VX3 = VXAR*x2 - VXAI*x1
+    xvfadd.d x3, x3, VX1
+    xvfsub.d x4, x4, VX3 // imaginary part is subtracted in the CONJ variant
+#else
+    xvfmul.s VX0, VXAI, x2
+    xvfmul.s VX2, VXAI, x1
+    xvfmadd.s VX1, VXAR, x1, VX0
+    xvfmsub.s VX3, x2, VXAR, VX2
+    xvfadd.s x3, x3, VX1
+    xvfsub.s x4, x4, VX3
+#endif
+#endif
+    addi.d  I, I, -1
+#ifdef DOUBLE
+    xvstelm.d x3, YY, 0 * SIZE, 0 // scatter re/im back through YY in lane order 0..3
+    xvstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 1
+    xvstelm.d x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 2
+    xvstelm.d x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 3
+    xvstelm.d x4, YY, 1 * SIZE, 3
+#else
+    xvstelm.w x3, YY, 0 * SIZE, 0 // scatter re/im back through YY in lane order 0..7
+    xvstelm.w x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 1
+    xvstelm.w x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 2
+    xvstelm.w x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 3
+    xvstelm.w x4, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 4
+    xvstelm.w x4, YY, 1 * SIZE, 4
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 5
+    xvstelm.w x4, YY, 1 * SIZE, 5
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 6
+    xvstelm.w x4, YY, 1 * SIZE, 6
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 7
+    xvstelm.w x4, YY, 1 * SIZE, 7
+#endif
+    add.d YY, YY, INCY // final YY step, shared by both precisions
+    blt $r0, I, .L222 // loop while batches remain; otherwise fall through into the scalar tail
+    .align 3
+
+.L997: // scalar tail: handles the elements left over after the vector loops
+#ifdef DOUBLE
+    andi I, N, 3 // leftover count = N mod 4 (DOUBLE vector loops take 4 elements per pass)
+#else
+    andi I, N, 7 // leftover count = N mod 8 (single-precision vector loops take 8 elements per pass)
+#endif
+    bge $r0, I, .L999 // nothing left over: return
+    .align 3
+
+.L998: // one complex element per pass; X and Y already point past the vectorised part
+    LD  a1, X, 0 * SIZE // a1 = x re, a2 = x im
+    LD  a2, X, 1 * SIZE
+    LD  a3, Y, 0 * SIZE // a3 = y re, a4 = y im
+    LD  a4, Y, 1 * SIZE
+    addi.d I, I, -1
+#if !defined(CONJ) // plain update: y += alpha * x
+    MUL  s1, ALPHAI, a2
+    MUL  s2, ALPHAI, a1
+    MSUB s3, ALPHAR, a1, s1 // s3 = ALPHAR*a1 - ALPHAI*a2 (MSUB assumed to be a*b - c, as in the vector code)
+    MADD s4, a2, ALPHAR, s2 // s4 = ALPHAR*a2 + ALPHAI*a1
+    ADD  s3, s3, a3
+    ADD  s4, s4, a4
+#else // CONJ variant: y += alpha * conj(x)
+    MUL  s1, ALPHAI, a2
+    MUL  s2, ALPHAI, a1
+    MADD s3, ALPHAR, a1, s1 // s3 = ALPHAR*a1 + ALPHAI*a2
+    MSUB s4, a2, ALPHAR, s2 // s4 = ALPHAR*a2 - ALPHAI*a1
+    ADD  s3, s3, a3
+    SUB  s4, a4, s4 // imaginary part is subtracted in the CONJ variant
+#endif
+    ST  s3, Y, 0 * SIZE
+    ST  s4, Y, 1 * SIZE
+    add.d X, X, INCX // INCX/INCY are added directly to the pointers, i.e. they are byte strides here
+    add.d Y, Y, INCY
+    blt $r0, I, .L998
+    .align 3
+
+.L999: // common exit
+    move $r4, $r12 // copies $r12 into $r4; presumably the return-value register - callers likely ignore it, confirm
+    jirl $r0, $r1, 0x0 // return: jump to the address in $r1 without linking
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/caxpy_lsx.S b/kernel/loongarch64/caxpy_lsx.S
new file mode 100644
index 000000000..85598d0b9
--- /dev/null
+++ b/kernel/loongarch64/caxpy_lsx.S
@@ -0,0 +1,679 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N      $r4 // element count
+#define XX     $r5 // not referenced by the code in this file
+#define YY     $r6 // store cursor for Y in the strided-Y loops
+#define ALPHAR $f0 // alpha, real part
+#define ALPHAI $f1 // alpha, imaginary part
+#define X      $r7 // read pointer into x
+#define INCX   $r8 // x increment; shifted by ZBASE_SHIFT at .L10 and then added directly to X
+#define Y      $r9 // read pointer into y (also the store pointer when Y is contiguous)
+#define INCY   $r10 // y increment; shifted by ZBASE_SHIFT at .L10 and then added directly to Y/YY
+
+#define I      $r12 // loop counter (vector batches, then leftover elements)
+#define TEMP   $r13 // shifted stride of a unit increment, used to detect contiguous vectors
+#define t1     $r14 // t1..t4: integer temporaries for element-wise gathers
+#define t2     $r16
+#define t3     $r15
+#define t4     $r17
+#define a1     $f12 // a1..a4: scalar operands of the tail loop (a1 also holds 0.0 in the prologue)
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define s1     $f16 // s1..s4: scalar temporaries/results of the tail loop
+#define s2     $f17
+#define s3     $f18
+#define s4     $f19
+#define VX0    $vr8 // VX0..VX3: vector scratch (raw loads, partial products, interleaved results)
+#define VX1    $vr20
+#define VX2    $vr21
+#define VX3    $vr22
+#define VXAR   $vr23 // alpha real part replicated to every lane
+#define VXAI   $vr19 // alpha imaginary part replicated to every lane
+#define x1     $vr18 // x real parts
+#define x2     $vr17 // x imaginary parts
+#define x3     $vr16 // y real parts (updated in place)
+#define x4     $vr15 // y imaginary parts (updated in place)
+
+    PROLOGUE
+
+    bge $r0, N, .L999 // nothing to do when N <= 0
+    li.d TEMP, 1 // TEMP = 1; shifted below to the stride of a unit increment
+    movgr2fr.d a1, $r0 // a1 = integer zero moved into an FP register ...
+    FFINT  a1, a1 // ... then converted, so a1 = 0.0
+    CMPEQ  $fcc0, ALPHAR, a1 // fcc0 = (alpha_r == 0)
+    CMPEQ  $fcc1, ALPHAI, a1 // fcc1 = (alpha_i == 0)
+    bceqz $fcc0, .L10 // alpha_r != 0: do the update
+    bcnez $fcc1, .L999 // alpha_r == 0 and alpha_i == 0: y is unchanged, return
+.L10:
+    slli.d  TEMP, TEMP, ZBASE_SHIFT // ZBASE_SHIFT comes from common.h; presumably log2 of the complex element size in bytes
+    slli.d  INCX, INCX, ZBASE_SHIFT // from here on INCX/INCY are added directly to the pointers
+    slli.d  INCY, INCY, ZBASE_SHIFT
+    MTG  t1, ALPHAR // MTG comes from common.h; presumably copies the FP bit pattern into a GPR
+    MTG  t2, ALPHAI
+#ifdef DOUBLE
+    vreplgr2vr.d VXAR, t1 // replicate alpha_r / alpha_i into every lane
+    vreplgr2vr.d VXAI, t2
+#else
+    vreplgr2vr.w VXAR, t1 // replicate alpha_r / alpha_i into every lane
+    vreplgr2vr.w VXAI, t2
+#endif
+    srai.d I, N, 2 // I = N / 4: every vector loop handles 4 complex elements per pass
+    bne INCX, TEMP, .L20 // dispatch on which vectors are contiguous
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11: // INCX==1 and INCY==1: both vectors contiguous
+    bge $r0, I, .L997 // fewer than 4 elements: go to the scalar tail
+    .align 3
+
+.L111: // one pass = 4 complex elements (DOUBLE: two 2-element halves; single: one 4-element step)
+#ifdef DOUBLE
+    vld VX0, X, 0 * SIZE // x[0]
+    vld VX2, Y, 0 * SIZE // y[0]
+    vld VX1, X, 2 * SIZE // x[1]
+    vld VX3, Y, 2 * SIZE // y[1]
+    vpickev.d x1, VX1, VX0 // de-interleave: x1 = x real parts, x2 = x imaginary parts
+    vpickod.d x2, VX1, VX0
+    vpickev.d x3, VX3, VX2 // x3 = y real parts, x4 = y imaginary parts
+    vpickod.d x4, VX3, VX2
+#else
+    vld VX0, X, 0 * SIZE // x[0..1]
+    vld VX2, Y, 0 * SIZE // y[0..1]
+    vld VX1, X, 4 * SIZE // x[2..3]
+    vld VX3, Y, 4 * SIZE // y[2..3]
+    vpickev.w x1, VX1, VX0 // de-interleave: x1 = x real parts, x2 = x imaginary parts
+    vpickod.w x2, VX1, VX0
+    vpickev.w x3, VX3, VX2 // x3 = y real parts, x4 = y imaginary parts
+    vpickod.w x4, VX3, VX2
+#endif
+#if !defined(CONJ) // plain update: y += alpha * x
+#ifdef DOUBLE
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmsub.d VX1, VXAR, x1, VX0 // VX1 = VXAR*x1 - VXAI*x2 (real part of alpha*x)
+    vfmadd.d VX3, x2, VXAR, VX2 // VX3 = VXAR*x2 + VXAI*x1 (imaginary part of alpha*x)
+    vfadd.d x3, x3, VX1
+    vfadd.d x4, x4, VX3
+#else
+    vfmul.s VX0, VXAI, x2
+    vfmul.s VX2, VXAI, x1
+    vfmsub.s VX1, VXAR, x1, VX0
+    vfmadd.s VX3, x2, VXAR, VX2
+    vfadd.s x3, x3, VX1
+    vfadd.s x4, x4, VX3
+#endif
+#else // CONJ variant: y += alpha * conj(x)
+#ifdef DOUBLE
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmadd.d VX1, VXAR, x1, VX0 // VX1 = VXAR*x1 + VXAI*x2
+    vfmsub.d VX3, x2, VXAR, VX2 // VX3 = VXAR*x2 - VXAI*x1
+    vfadd.d x3, x3, VX1
+    vfsub.d x4, x4, VX3 // imaginary part is subtracted in the CONJ variant
+#else
+    vfmul.s VX0, VXAI, x2
+    vfmul.s VX2, VXAI, x1
+    vfmadd.s VX1, VXAR, x1, VX0
+    vfmsub.s VX3, x2, VXAR, VX2
+    vfadd.s x3, x3, VX1
+    vfsub.s x4, x4, VX3
+#endif
+#endif
+#ifdef DOUBLE
+    vilvl.d VX2, x4 ,x3 // re-interleave re/im into memory layout
+    vilvh.d VX3, x4, x3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 2 * SIZE
+    vld VX0, X, 4 * SIZE // second half of the batch: elements 2 and 3
+    vld VX2, Y, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vld VX3, Y, 6 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vpickev.d x3, VX3, VX2
+    vpickod.d x4, VX3, VX2
+#if !defined(CONJ)
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmsub.d VX1, VXAR, x1, VX0
+    vfmadd.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfadd.d x4, x4, VX3
+#else
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmadd.d VX1, VXAR, x1, VX0
+    vfmsub.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfsub.d x4, x4, VX3
+#endif
+    vilvl.d VX2, x4 ,x3
+    vilvh.d VX3, x4, x3
+    vst VX2, Y, 4 * SIZE
+    vst VX3, Y, 6 * SIZE
+#else
+    vilvl.w VX2, x4 ,x3 // re-interleave re/im into memory layout
+    vilvh.w VX3, x4, x3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE // 4 complex elements consumed from each vector
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L111 // loop while batches remain
+    b .L997 // leftover elements are handled by the scalar tail
+    .align 3
+
+.L12: // INCX==1 and INCY!=1: X is contiguous, Y is strided
+    bge $r0, I, .L997 // fewer than 4 elements: go to the scalar tail
+    move YY, Y // YY is the store cursor; Y itself is advanced by the gather loads below
+    .align 3
+
+.L121: // one pass = 4 complex elements. NOTE(review): each group of Y is loaded before it is stored, so INCY == 0 would not accumulate per element - confirm callers
+#ifdef DOUBLE
+    vld VX0, X, 0 * SIZE // x[0]
+    vld VX1, X, 2 * SIZE // x[1]
+    ld.d t1, Y, 0 * SIZE // gather Y: t1/t3 = re, t2/t4 = im
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x3, t1, 0 // x3 = y real parts, x4 = y imaginary parts
+    vinsgr2vr.d x4, t2, 0
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+    add.d Y, Y, INCY
+    vpickev.d x1, VX1, VX0 // x1 = x real parts, x2 = x imaginary parts
+    vpickod.d x2, VX1, VX0
+#else
+    vld VX0, X, 0 * SIZE // x[0..1]
+    ld.w t1, Y, 0 * SIZE // gather Y: t1/t3 = re, t2/t4 = im
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w x3, t1, 0 // x3 = y real parts, x4 = y imaginary parts
+    vinsgr2vr.w x4, t2, 0
+    vinsgr2vr.w x3, t3, 1
+    vinsgr2vr.w x4, t4, 1
+    vld VX1, X, 4 * SIZE // x[2..3]
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    vinsgr2vr.w x3, t1, 2
+    vinsgr2vr.w x4, t2, 2
+    vinsgr2vr.w x3, t3, 3
+    vinsgr2vr.w x4, t4, 3
+    add.d Y, Y, INCY
+    vpickev.w x1, VX1, VX0 // x1 = x real parts, x2 = x imaginary parts
+    vpickod.w x2, VX1, VX0
+#endif
+#if !defined(CONJ) // plain update: y += alpha * x
+#ifdef DOUBLE
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmsub.d VX1, VXAR, x1, VX0 // VX1 = VXAR*x1 - VXAI*x2 (real part of alpha*x)
+    vfmadd.d VX3, x2, VXAR, VX2 // VX3 = VXAR*x2 + VXAI*x1 (imaginary part of alpha*x)
+    vfadd.d x3, x3, VX1
+    vfadd.d x4, x4, VX3
+#else
+    vfmul.s VX0, VXAI, x2
+    vfmul.s VX2, VXAI, x1
+    vfmsub.s VX1, VXAR, x1, VX0
+    vfmadd.s VX3, x2, VXAR, VX2
+    vfadd.s x3, x3, VX1
+    vfadd.s x4, x4, VX3
+#endif
+#else // CONJ variant: y += alpha * conj(x)
+#ifdef DOUBLE
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmadd.d VX1, VXAR, x1, VX0 // VX1 = VXAR*x1 + VXAI*x2
+    vfmsub.d VX3, x2, VXAR, VX2 // VX3 = VXAR*x2 - VXAI*x1
+    vfadd.d x3, x3, VX1
+    vfsub.d x4, x4, VX3 // imaginary part is subtracted in the CONJ variant
+#else
+    vfmul.s VX0, VXAI, x2
+    vfmul.s VX2, VXAI, x1
+    vfmadd.s VX1, VXAR, x1, VX0
+    vfmsub.s VX3, x2, VXAR, VX2
+    vfadd.s x3, x3, VX1
+    vfsub.s x4, x4, VX3
+#endif
+#endif
+#ifdef DOUBLE
+    vstelm.d x3, YY, 0 * SIZE, 0 // scatter the first two results back through YY
+    vstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d x3, YY, 0 * SIZE, 1
+    vstelm.d x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+
+    vld VX0, X, 4 * SIZE // second half of the batch: elements 2 and 3
+    vld VX1, X, 6 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x3, t1, 0
+    vinsgr2vr.d x4, t2, 0
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+    add.d Y, Y, INCY
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+#if !defined(CONJ)
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmsub.d VX1, VXAR, x1, VX0
+    vfmadd.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfadd.d x4, x4, VX3
+#else
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmadd.d VX1, VXAR, x1, VX0
+    vfmsub.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfsub.d x4, x4, VX3
+#endif
+    addi.d  I, I, -1
+    vstelm.d x3, YY, 0 * SIZE, 0
+    vstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d x3, YY, 0 * SIZE, 1
+    vstelm.d x4, YY, 1 * SIZE, 1
+#else
+    addi.d  I, I, -1
+    vstelm.w x3, YY, 0 * SIZE, 0 // scatter all four results back through YY
+    vstelm.w x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 1
+    vstelm.w x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 2
+    vstelm.w x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 3
+    vstelm.w x4, YY, 1 * SIZE, 3
+#endif
+    add.d YY, YY, INCY // final YY step, shared by both precisions
+    addi.d X, X, 8 * SIZE // 4 complex elements consumed from X
+    blt $r0, I, .L121 // loop while batches remain
+    b .L997 // leftover elements are handled by the scalar tail
+    .align 3
+
+.L21:// INCX!=1 and INCY==1: X is strided, Y is contiguous
+    bge $r0, I, .L997 // fewer than 4 elements: go to the scalar tail
+    .align 3
+
+.L211: // one pass = 4 complex elements (DOUBLE: two 2-element halves; single: one 4-element step)
+#ifdef DOUBLE
+    vld VX2, Y, 0 * SIZE // y[0]
+    vld VX3, Y, 2 * SIZE // y[1]
+    ld.d t1, X, 0 * SIZE // gather X: t1/t3 = re, t2/t4 = im
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d x1, t1, 0 // x1 = x real parts, x2 = x imaginary parts
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    add.d X, X, INCX
+    vpickev.d x3, VX3, VX2 // x3 = y real parts, x4 = y imaginary parts
+    vpickod.d x4, VX3, VX2
+#else
+    vld VX2, Y, 0 * SIZE // y[0..1]
+    ld.w t1, X, 0 * SIZE // gather X: t1/t3 = re, t2/t4 = im
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 0 // x1 = x real parts, x2 = x imaginary parts
+    vinsgr2vr.w x2, t2, 0
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+    vld VX3, Y, 4 * SIZE // y[2..3]
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    add.d X, X, INCX
+    vpickev.w x3, VX3, VX2 // x3 = y real parts, x4 = y imaginary parts
+    vpickod.w x4, VX3, VX2
+#endif
+#if !defined(CONJ) // plain update: y += alpha * x
+#ifdef DOUBLE
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmsub.d VX1, VXAR, x1, VX0 // VX1 = VXAR*x1 - VXAI*x2 (real part of alpha*x)
+    vfmadd.d VX3, x2, VXAR, VX2 // VX3 = VXAR*x2 + VXAI*x1 (imaginary part of alpha*x)
+    vfadd.d x3, x3, VX1
+    vfadd.d x4, x4, VX3
+#else
+    vfmul.s VX0, VXAI, x2
+    vfmul.s VX2, VXAI, x1
+    vfmsub.s VX1, VXAR, x1, VX0
+    vfmadd.s VX3, x2, VXAR, VX2
+    vfadd.s x3, x3, VX1
+    vfadd.s x4, x4, VX3
+#endif
+#else // CONJ variant: y += alpha * conj(x)
+#ifdef DOUBLE
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmadd.d VX1, VXAR, x1, VX0 // VX1 = VXAR*x1 + VXAI*x2
+    vfmsub.d VX3, x2, VXAR, VX2 // VX3 = VXAR*x2 - VXAI*x1
+    vfadd.d x3, x3, VX1
+    vfsub.d x4, x4, VX3 // imaginary part is subtracted in the CONJ variant
+#else
+    vfmul.s VX0, VXAI, x2
+    vfmul.s VX2, VXAI, x1
+    vfmadd.s VX1, VXAR, x1, VX0
+    vfmsub.s VX3, x2, VXAR, VX2
+    vfadd.s x3, x3, VX1
+    vfsub.s x4, x4, VX3
+#endif
+#endif
+#ifdef DOUBLE
+    vilvl.d VX2, x4 ,x3 // re-interleave re/im into memory layout
+    vilvh.d VX3, x4, x3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 2 * SIZE
+
+    vld VX2, Y, 4 * SIZE // second half of the batch: elements 2 and 3
+    vld VX3, Y, 6 * SIZE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    add.d X, X, INCX
+    vpickev.d x3, VX3, VX2
+    vpickod.d x4, VX3, VX2
+#if !defined(CONJ)
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmsub.d VX1, VXAR, x1, VX0
+    vfmadd.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfadd.d x4, x4, VX3
+#else
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmadd.d VX1, VXAR, x1, VX0
+    vfmsub.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfsub.d x4, x4, VX3
+#endif
+    vilvl.d VX2, x4 ,x3
+    vilvh.d VX3, x4, x3
+    addi.d  I, I, -1
+    vst VX2, Y, 4 * SIZE
+    vst VX3, Y, 6 * SIZE
+#else
+    vilvl.w VX2, x4 ,x3 // re-interleave re/im into memory layout
+    vilvh.w VX3, x4, x3
+    addi.d  I, I, -1
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+#endif
+    addi.d Y, Y, 8 * SIZE // 4 complex elements written to Y
+    blt $r0, I, .L211 // loop while batches remain
+    b .L997 // leftover elements are handled by the scalar tail
+    .align 3
+
+.L22: // INCX!=1 and INCY!=1: both vectors strided, everything is gathered element by element
+    bge $r0, I, .L997 // fewer than 4 elements: go to the scalar tail
+    move YY, Y // YY is the store cursor; Y itself is advanced by the gather loads below
+    .align 3
+
+.L222: // one pass = 4 complex elements. NOTE(review): each group of Y is loaded before it is stored, so INCY == 0 would not accumulate per element - confirm callers
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE // gather X: t1/t3 = re, t2/t4 = im
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0 // x1 = x real parts, x2 = x imaginary parts
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    ld.d t1, Y, 0 * SIZE // gather Y the same way
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x3, t1, 0 // x3 = y real parts, x4 = y imaginary parts
+    vinsgr2vr.d x4, t2, 0
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+#else
+    ld.w t1, X, 0 * SIZE // gather X: t1/t3 = re, t2/t4 = im
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 0 // x1 = x real parts, x2 = x imaginary parts
+    vinsgr2vr.w x2, t2, 0
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+    ld.w t1, Y, 0 * SIZE // gather Y the same way
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w x3, t1, 0 // x3 = y real parts, x4 = y imaginary parts
+    vinsgr2vr.w x4, t2, 0
+    vinsgr2vr.w x3, t3, 1
+    vinsgr2vr.w x4, t4, 1
+    ld.w t1, X, 0 * SIZE // elements 2 and 3 go to lanes 2 and 3
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    vinsgr2vr.w x3, t1, 2
+    vinsgr2vr.w x4, t2, 2
+    vinsgr2vr.w x3, t3, 3
+    vinsgr2vr.w x4, t4, 3
+#endif
+    add.d Y, Y, INCY // step past the last Y element gathered above (shared by both precisions)
+#if !defined(CONJ) // plain update: y += alpha * x
+#ifdef DOUBLE
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmsub.d VX1, VXAR, x1, VX0 // VX1 = VXAR*x1 - VXAI*x2 (real part of alpha*x)
+    vfmadd.d VX3, x2, VXAR, VX2 // VX3 = VXAR*x2 + VXAI*x1 (imaginary part of alpha*x)
+    vfadd.d x3, x3, VX1
+    vfadd.d x4, x4, VX3
+#else
+    vfmul.s VX0, VXAI, x2
+    vfmul.s VX2, VXAI, x1
+    vfmsub.s VX1, VXAR, x1, VX0
+    vfmadd.s VX3, x2, VXAR, VX2
+    vfadd.s x3, x3, VX1
+    vfadd.s x4, x4, VX3
+#endif
+#else // CONJ variant: y += alpha * conj(x)
+#ifdef DOUBLE
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmadd.d VX1, VXAR, x1, VX0 // VX1 = VXAR*x1 + VXAI*x2
+    vfmsub.d VX3, x2, VXAR, VX2 // VX3 = VXAR*x2 - VXAI*x1
+    vfadd.d x3, x3, VX1
+    vfsub.d x4, x4, VX3 // imaginary part is subtracted in the CONJ variant
+#else
+    vfmul.s VX0, VXAI, x2
+    vfmul.s VX2, VXAI, x1
+    vfmadd.s VX1, VXAR, x1, VX0
+    vfmsub.s VX3, x2, VXAR, VX2
+    vfadd.s x3, x3, VX1
+    vfsub.s x4, x4, VX3
+#endif
+#endif
+#ifdef DOUBLE
+    vstelm.d x3, YY, 0 * SIZE, 0 // scatter the first two results back through YY
+    vstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d x3, YY, 0 * SIZE, 1
+    vstelm.d x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    ld.d t1, X, 0 * SIZE // second half of the batch: elements 2 and 3
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x3, t1, 0
+    vinsgr2vr.d x4, t2, 0
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+    add.d Y, Y, INCY
+#if !defined(CONJ)
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmsub.d VX1, VXAR, x1, VX0
+    vfmadd.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfadd.d x4, x4, VX3
+#else
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmadd.d VX1, VXAR, x1, VX0
+    vfmsub.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfsub.d x4, x4, VX3
+#endif
+    addi.d  I, I, -1
+    vstelm.d x3, YY, 0 * SIZE, 0
+    vstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d x3, YY, 0 * SIZE, 1
+    vstelm.d x4, YY, 1 * SIZE, 1
+#else
+    addi.d  I, I, -1
+    vstelm.w x3, YY, 0 * SIZE, 0 // scatter all four results back through YY
+    vstelm.w x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 1
+    vstelm.w x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 2
+    vstelm.w x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 3
+    vstelm.w x4, YY, 1 * SIZE, 3
+#endif
+    add.d YY, YY, INCY // final YY step, shared by both precisions
+    blt $r0, I, .L222 // loop while batches remain; otherwise fall through into the scalar tail
+    .align 3
+
+.L997: // scalar tail: handles the elements left over after the vector loops
+    andi I, N, 3 // leftover count = N mod 4 (the vector loops take 4 elements per pass)
+    bge $r0, I, .L999 // nothing left over: return
+    .align 3
+
+.L998: // one complex element per pass; X and Y already point past the vectorised part
+    LD  a1, X, 0 * SIZE // a1 = x re, a2 = x im
+    LD  a2, X, 1 * SIZE
+    LD  a3, Y, 0 * SIZE // a3 = y re, a4 = y im
+    LD  a4, Y, 1 * SIZE
+    addi.d I, I, -1
+#if !defined(CONJ) // plain update: y += alpha * x
+    MUL  s1, ALPHAI, a2
+    MUL  s2, ALPHAI, a1
+    MSUB s3, ALPHAR, a1, s1 // s3 = ALPHAR*a1 - ALPHAI*a2 (MSUB assumed to be a*b - c, as in the vector code)
+    MADD s4, a2, ALPHAR, s2 // s4 = ALPHAR*a2 + ALPHAI*a1
+    ADD  s3, s3, a3
+    ADD  s4, s4, a4
+#else // CONJ variant: y += alpha * conj(x)
+    MUL  s1, ALPHAI, a2
+    MUL  s2, ALPHAI, a1
+    MADD s3, ALPHAR, a1, s1 // s3 = ALPHAR*a1 + ALPHAI*a2
+    MSUB s4, a2, ALPHAR, s2 // s4 = ALPHAR*a2 - ALPHAI*a1
+    ADD  s3, s3, a3
+    SUB  s4, a4, s4 // imaginary part is subtracted in the CONJ variant
+#endif
+    ST  s3, Y, 0 * SIZE
+    ST  s4, Y, 1 * SIZE
+    add.d X, X, INCX // strides were shifted by ZBASE_SHIFT in the prologue, so they are added directly
+    add.d Y, Y, INCY
+    blt $r0, I, .L998
+    .align 3
+
+.L999: // common exit, also the target of the N <= 0 and alpha == 0 early outs
+    move $r4, $r12 // $r4 <- I. NOTE(review): on the N <= 0 early out I was never written here - confirm callers ignore the result
+    jirl $r0, $r1, 0x0 // return: jump to the address in $r1 without linking
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/ccopy_lasx.S b/kernel/loongarch64/ccopy_lasx.S
new file mode 100644
index 000000000..fbc5d96bc
--- /dev/null
+++ b/kernel/loongarch64/ccopy_lasx.S
@@ -0,0 +1,386 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4   // element count; loop trip counts are derived from it
+#define X      $r5   // source pointer
+#define INCX   $r6   // source stride (scaled by ZBASE_SHIFT in the prologue)
+#define Y      $r7   // destination pointer
+#define INCY   $r8   // destination stride (scaled by ZBASE_SHIFT in the prologue)
+#define I      $r17  // loop counter
+#define TEMP   $r18  // scaled stride that corresponds to INC == 1
+#define t1     $r14  // t1-t4: integer scratch used to gather strided source data
+#define t2     $r15
+#define t3     $r16
+#define t4     $r19
+#define a1     $f12  // a1-a4: scalar FP scratch for the element-at-a-time loops
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define VX0    $xr12 // VX0-VX3: LASX vector registers holding the data in flight
+#define VX1    $xr13
+#define VX2    $xr14
+#define VX3    $xr15
+
+    PROLOGUE
+    bge $r0, N, .L999 // nothing to copy when N <= 0
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, ZBASE_SHIFT // TEMP = 1 << ZBASE_SHIFT, the scaled form of a unit stride
+    slli.d  INCX, INCX, ZBASE_SHIFT // scale both strides the same way so they can be added to X / Y
+    slli.d  INCY, INCY, ZBASE_SHIFT
+    srai.d I, N, 3 // I = N / 8: every main loop below handles 8 elements per pass
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11: // X and Y both contiguous
+    bge $r0, I, .L112 // no full group of 8 elements: only the remainder loop runs
+    .align 3
+
+.L111: // main loop: 8 elements (16 scalars) per pass using whole-vector loads/stores
+    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    xvld VX1, X, 4 * SIZE
+    xvld VX2, X, 8 * SIZE
+    xvld VX3, X, 12 * SIZE
+    xvst VX0, Y, 0 * SIZE
+    xvst VX1, Y, 4 * SIZE
+    xvst VX2, Y, 8 * SIZE
+    xvst VX3, Y, 12 * SIZE
+#else
+    xvld VX1, X, 8 * SIZE
+    xvst VX0, Y, 0 * SIZE
+    xvst VX1, Y, 8 * SIZE
+#endif
+    addi.d  I, I, -1
+    addi.d X, X, 16 * SIZE // both pointers step over the 16 scalars just copied
+    addi.d Y, Y, 16 * SIZE
+    blt $r0, I, .L111
+    .align 3
+
+.L112: // remainder: the N & 7 elements the main loop did not cover
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L113: // copy one element (two scalars) per pass
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    addi.d I, I, -1
+    addi.d  X, X, 2 * SIZE
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    addi.d  Y, Y, 2 * SIZE
+    blt $r0, I, .L113
+    b .L999
+    .align 3
+
+.L12: // X contiguous, Y strided
+    bge $r0, I, .L122 // no full group of 8 elements: only the remainder loop runs
+    .align 3
+
+.L121: // main loop: vector loads from X, per-lane stores scattered to Y
+    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    xvld VX1, X, 4 * SIZE
+    xvld VX2, X, 8 * SIZE
+    xvld VX3, X, 12 * SIZE
+    xvstelm.d VX0, Y, 0 * SIZE, 0 // lanes (0,1) and (2,3) of each vector are one element each
+    xvstelm.d VX0, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    xvstelm.d VX0, Y, 0 * SIZE, 2
+    xvstelm.d VX0, Y, 1 * SIZE, 3
+    add.d Y, Y, INCY
+    xvstelm.d VX1, Y, 0 * SIZE, 0
+    xvstelm.d VX1, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    xvstelm.d VX1, Y, 0 * SIZE, 2
+    xvstelm.d VX1, Y, 1 * SIZE, 3
+    add.d Y, Y, INCY
+    xvstelm.d VX2, Y, 0 * SIZE, 0
+    xvstelm.d VX2, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    xvstelm.d VX2, Y, 0 * SIZE, 2
+    xvstelm.d VX2, Y, 1 * SIZE, 3
+    add.d Y, Y, INCY
+    xvstelm.d VX3, Y, 0 * SIZE, 0
+    xvstelm.d VX3, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    xvstelm.d VX3, Y, 0 * SIZE, 2
+    xvstelm.d VX3, Y, 1 * SIZE, 3
+#else
+    xvld VX1, X, 8 * SIZE
+    xvstelm.w VX0, Y, 0 * SIZE, 0 // each vector holds four elements: lane pairs (0,1) (2,3) (4,5) (6,7)
+    xvstelm.w VX0, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0 * SIZE, 2
+    xvstelm.w VX0, Y, 1 * SIZE, 3
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0 * SIZE, 4
+    xvstelm.w VX0, Y, 1 * SIZE, 5
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0 * SIZE, 6
+    xvstelm.w VX0, Y, 1 * SIZE, 7
+    add.d Y, Y, INCY
+    xvstelm.w VX1, Y, 0 * SIZE, 0
+    xvstelm.w VX1, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    xvstelm.w VX1, Y, 0 * SIZE, 2
+    xvstelm.w VX1, Y, 1 * SIZE, 3
+    add.d Y, Y, INCY
+    xvstelm.w VX1, Y, 0 * SIZE, 4
+    xvstelm.w VX1, Y, 1 * SIZE, 5
+    add.d Y, Y, INCY
+    xvstelm.w VX1, Y, 0 * SIZE, 6
+    xvstelm.w VX1, Y, 1 * SIZE, 7
+#endif
+    add.d Y, Y, INCY // shared by both branches: step past the 8th element stored
+    addi.d X, X, 16 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    .align 3
+
+.L122: // remainder: the N & 7 elements the main loop did not cover
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L123: // one element per pass: contiguous read, strided write
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    addi.d I, I, -1
+    addi.d  X, X, 2 * SIZE
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d  Y, Y, INCY
+    blt $r0, I, .L123
+    b .L999
+    .align 3
+
+.L21: // X strided, Y contiguous
+    bge $r0, I, .L212 // no full group of 8 elements: only the remainder loop runs
+    .align 3
+
+.L211: // main loop: gather 8 elements from X through integer registers, store as whole vectors
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE // raw 64-bit loads: the data is only moved, never interpreted
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0 // two elements fill one vector
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX2, t1, 0
+    xvinsgr2vr.d VX2, t2, 1
+    xvinsgr2vr.d VX2, t3, 2
+    xvinsgr2vr.d VX2, t4, 3
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX3, t1, 0
+    xvinsgr2vr.d VX3, t2, 1
+    xvinsgr2vr.d VX3, t3, 2
+    xvinsgr2vr.d VX3, t4, 3
+    xvst VX0, Y, 0 * SIZE
+    xvst VX1, Y, 4 * SIZE
+    xvst VX2, Y, 8 * SIZE
+    xvst VX3, Y, 12 * SIZE
+#else
+    ld.w t1, X, 0 * SIZE // raw 32-bit loads: the data is only moved, never interpreted
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0 // four elements fill one vector (lanes 0-3 here, 4-7 below)
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX1, t1, 0
+    xvinsgr2vr.w VX1, t2, 1
+    xvinsgr2vr.w VX1, t3, 2
+    xvinsgr2vr.w VX1, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX1, t1, 4
+    xvinsgr2vr.w VX1, t2, 5
+    xvinsgr2vr.w VX1, t3, 6
+    xvinsgr2vr.w VX1, t4, 7
+    xvst VX0, Y, 0 * SIZE
+    xvst VX1, Y, 8 * SIZE
+#endif
+    addi.d  I, I, -1
+    addi.d Y, Y, 16 * SIZE // Y steps over the 16 scalars just stored; X was advanced during the gather
+    blt $r0, I, .L211
+    .align 3
+
+.L212: // remainder: the N & 7 elements the main loop did not cover
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L213: // one element per pass: strided read, contiguous write
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    addi.d I, I, -1
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d  X, X, INCX
+    addi.d  Y, Y, 2 * SIZE
+    blt $r0, I, .L213
+    b .L999
+    .align 3
+
+.L22: // X and Y both strided: no vector instructions, scalar copy unrolled by 8
+    bge $r0, I, .L223 // no full group of 8 elements: only the remainder loop runs
+    .align 3
+
+.L222: // main loop: four groups of two elements each
+    LD  a1, X, 0 * SIZE // elements 0 and 1
+    LD  a2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD  a3, X, 0 * SIZE
+    LD  a4, X, 1 * SIZE
+    add.d X, X, INCX
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ST  a3, Y, 0 * SIZE
+    ST  a4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+
+    LD  a1, X, 0 * SIZE // elements 2 and 3
+    LD  a2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD  a3, X, 0 * SIZE
+    LD  a4, X, 1 * SIZE
+    add.d X, X, INCX
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ST  a3, Y, 0 * SIZE
+    ST  a4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+
+    LD  a1, X, 0 * SIZE // elements 4 and 5
+    LD  a2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD  a3, X, 0 * SIZE
+    LD  a4, X, 1 * SIZE
+    add.d X, X, INCX
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ST  a3, Y, 0 * SIZE
+    ST  a4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+
+    LD  a1, X, 0 * SIZE // elements 6 and 7
+    LD  a2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD  a3, X, 0 * SIZE
+    LD  a4, X, 1 * SIZE
+    add.d X, X, INCX
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ST  a3, Y, 0 * SIZE
+    ST  a4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    .align 3
+
+.L223: // remainder: the N & 7 elements the main loop did not cover
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L224: // one element per pass; falls through to .L999 when done
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    addi.d I, I, -1
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L224
+    .align 3
+
+.L999: // common exit for every path, including the N <= 0 early-out
+    move $r4, $r12 // NOTE(review): $r12 is never written in this kernel, so the value placed in $r4 is whatever $r12 held on entry - confirm callers ignore it
+    jirl $r0, $r1, 0x0 // return to the address in $r1
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/ccopy_lsx.S b/kernel/loongarch64/ccopy_lsx.S
new file mode 100644
index 000000000..4c4d880f1
--- /dev/null
+++ b/kernel/loongarch64/ccopy_lsx.S
@@ -0,0 +1,411 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4   // element count; loop trip counts are derived from it
+#define X      $r5   // source pointer
+#define INCX   $r6   // source stride (scaled by ZBASE_SHIFT in the prologue)
+#define Y      $r7   // destination pointer
+#define INCY   $r8   // destination stride (scaled by ZBASE_SHIFT in the prologue)
+#define I      $r17  // loop counter
+#define TEMP   $r18  // scaled stride that corresponds to INC == 1
+#define t1     $r14  // t1-t4: integer scratch used to gather strided source data
+#define t2     $r15
+#define t3     $r16
+#define t4     $r19
+#define a1     $f12  // a1-a4: scalar FP scratch for the element-at-a-time loops
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define VX0    $vr12 // VX0-VX3: LSX vector registers holding the data in flight
+#define VX1    $vr13
+#define VX2    $vr14
+#define VX3    $vr15
+
+    PROLOGUE
+    bge $r0, N, .L999 // nothing to copy when N <= 0
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, ZBASE_SHIFT // TEMP = 1 << ZBASE_SHIFT, the scaled form of a unit stride
+    slli.d  INCX, INCX, ZBASE_SHIFT // scale both strides the same way so they can be added to X / Y
+    slli.d  INCY, INCY, ZBASE_SHIFT
+    srai.d I, N, 3 // I = N / 8: every main loop below handles 8 elements per pass
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:// INCX==1 and INCY==1
+    bge $r0, I, .L112 // no full group of 8 elements: only the remainder loop runs
+    .align 3
+
+.L111: // main loop: 8 elements (16 scalars) per pass using whole-vector loads/stores
+    vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE // one element per vector, so the 8 elements go in two batches of four
+    vld VX2, X, 4 * SIZE
+    vld VX3, X, 6 * SIZE
+    vst VX0, Y, 0 * SIZE
+    vst VX1, Y, 2 * SIZE
+    vst VX2, Y, 4 * SIZE
+    vst VX3, Y, 6 * SIZE
+    vld VX0, X, 8 * SIZE
+    vld VX1, X, 10 * SIZE
+    vld VX2, X, 12 * SIZE
+    vld VX3, X, 14 * SIZE
+    addi.d  I, I, -1
+    vst VX0, Y, 8 * SIZE
+    vst VX1, Y, 10 * SIZE
+    vst VX2, Y, 12 * SIZE
+    vst VX3, Y, 14 * SIZE
+#else
+    vld VX1, X, 4 * SIZE // two elements per vector, so four vectors cover all 8
+    vld VX2, X, 8 * SIZE
+    vld VX3, X, 12 * SIZE
+    addi.d  I, I, -1
+    vst VX0, Y, 0 * SIZE
+    vst VX1, Y, 4 * SIZE
+    vst VX2, Y, 8 * SIZE
+    vst VX3, Y, 12 * SIZE
+#endif
+    addi.d X, X, 16 * SIZE // both pointers step over the 16 scalars just copied
+    addi.d Y, Y, 16 * SIZE
+    blt $r0, I, .L111
+    .align 3
+
+.L112: // remainder: the N & 7 elements the main loop did not cover
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L113: // copy one element (two scalars) per pass
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    addi.d I, I, -1
+    addi.d X, X, 2 * SIZE
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    addi.d Y, Y, 2 * SIZE
+    blt $r0, I, .L113
+    b .L999
+    .align 3
+
+.L12:   // INCX==1 and INCY!=1
+    bge $r0, I, .L122 // no full group of 8 elements: only the remainder loop runs
+    .align 3
+
+.L121: // main loop: vector loads from X, per-lane stores scattered to Y
+    vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE
+    vld VX2, X, 4 * SIZE
+    vld VX3, X, 6 * SIZE
+    vstelm.d VX0, Y, 0 * SIZE, 0 // each vector is one element: lane 0 then lane 1
+    vstelm.d VX0, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    vstelm.d VX1, Y, 0 * SIZE, 0
+    vstelm.d VX1, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    vstelm.d VX2, Y, 0 * SIZE, 0
+    vstelm.d VX2, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    vstelm.d VX3, Y, 0 * SIZE, 0
+    vstelm.d VX3, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    vld VX0, X, 8 * SIZE // second batch: elements 4-7
+    vld VX1, X, 10 * SIZE
+    vld VX2, X, 12 * SIZE
+    vld VX3, X, 14 * SIZE
+    vstelm.d VX0, Y, 0 * SIZE, 0
+    vstelm.d VX0, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    vstelm.d VX1, Y, 0 * SIZE, 0
+    vstelm.d VX1, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    vstelm.d VX2, Y, 0 * SIZE, 0
+    vstelm.d VX2, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    vstelm.d VX3, Y, 0 * SIZE, 0
+    vstelm.d VX3, Y, 1 * SIZE, 1
+#else
+    vld VX1, X, 4 * SIZE
+    vld VX2, X, 8 * SIZE
+    vld VX3, X, 12 * SIZE
+    vstelm.w VX0, Y, 0 * SIZE, 0 // each vector holds two elements: lane pairs (0,1) and (2,3)
+    vstelm.w VX0, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    vstelm.w VX0, Y, 0 * SIZE, 2
+    vstelm.w VX0, Y, 1 * SIZE, 3
+    add.d Y, Y, INCY
+    vstelm.w VX1, Y, 0 * SIZE, 0
+    vstelm.w VX1, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    vstelm.w VX1, Y, 0 * SIZE, 2
+    vstelm.w VX1, Y, 1 * SIZE, 3
+    add.d Y, Y, INCY
+    vstelm.w VX2, Y, 0 * SIZE, 0
+    vstelm.w VX2, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    vstelm.w VX2, Y, 0 * SIZE, 2
+    vstelm.w VX2, Y, 1 * SIZE, 3
+    add.d Y, Y, INCY
+    vstelm.w VX3, Y, 0 * SIZE, 0
+    vstelm.w VX3, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    vstelm.w VX3, Y, 0 * SIZE, 2
+    vstelm.w VX3, Y, 1 * SIZE, 3
+#endif
+    add.d Y, Y, INCY // shared by both branches: step past the 8th element stored
+    addi.d X, X, 16 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    .align 3
+
+.L122: // remainder: the N & 7 elements the main loop did not cover
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L123: // one element per pass: contiguous read, strided write
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    addi.d I, I, -1
+    addi.d  X, X, 2 * SIZE
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d  Y, Y, INCY
+    blt $r0, I, .L123
+    b .L999
+    .align 3
+
+.L21: // X strided, Y contiguous
+    bge $r0, I, .L212 // no full group of 8 elements: only the remainder loop runs
+    .align 3
+
+.L211: // main loop: gather 8 elements from X through integer registers, store as whole vectors
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE // raw 64-bit loads: the data is only moved, never interpreted
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0 // one element per vector; each pair is stored right after it is built
+    vinsgr2vr.d VX0, t2, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vst VX0, Y, 0 * SIZE
+    vst VX1, Y, 2 * SIZE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vst VX0, Y, 4 * SIZE
+    vst VX1, Y, 6 * SIZE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vst VX0, Y, 8 * SIZE
+    vst VX1, Y, 10 * SIZE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vst VX0, Y, 12 * SIZE
+    vst VX1, Y, 14 * SIZE
+#else
+    ld.w t1, X, 0 * SIZE // raw 32-bit loads: the data is only moved, never interpreted
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0 // two elements fill one vector; all four are stored at the end
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    vst VX0, Y, 0 * SIZE
+    vst VX1, Y, 4 * SIZE
+    vst VX2, Y, 8 * SIZE
+    vst VX3, Y, 12 * SIZE
+#endif
+    addi.d Y, Y, 16 * SIZE // Y steps over the 16 scalars just stored; X was advanced during the gather
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    .align 3
+
+.L212: // remainder: the N & 7 elements the main loop did not cover
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L213: // one element per pass: strided read, contiguous write
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    addi.d I, I, -1
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d  X, X, INCX
+    addi.d  Y, Y, 2 * SIZE
+    blt $r0, I, .L213
+    b .L999
+    .align 3
+
+.L22: // X and Y both strided: no vector instructions, scalar copy unrolled by 8
+    bge $r0, I, .L223 // no full group of 8 elements: only the remainder loop runs
+    .align 3
+
+.L222: // main loop: four groups of two elements each
+    LD  a1, X, 0 * SIZE // elements 0 and 1
+    LD  a2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD  a3, X, 0 * SIZE
+    LD  a4, X, 1 * SIZE
+    add.d X, X, INCX
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ST  a3, Y, 0 * SIZE
+    ST  a4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+
+    LD  a1, X, 0 * SIZE // elements 2 and 3
+    LD  a2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD  a3, X, 0 * SIZE
+    LD  a4, X, 1 * SIZE
+    add.d X, X, INCX
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ST  a3, Y, 0 * SIZE
+    ST  a4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+
+    LD  a1, X, 0 * SIZE // elements 4 and 5
+    LD  a2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD  a3, X, 0 * SIZE
+    LD  a4, X, 1 * SIZE
+    add.d X, X, INCX
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ST  a3, Y, 0 * SIZE
+    ST  a4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+
+    LD  a1, X, 0 * SIZE // elements 6 and 7
+    LD  a2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD  a3, X, 0 * SIZE
+    LD  a4, X, 1 * SIZE
+    add.d X, X, INCX
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ST  a3, Y, 0 * SIZE
+    ST  a4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    .align 3
+
+.L223: // remainder: the N & 7 elements the main loop did not cover
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L224: // one element per pass; falls through to .L999 when done
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    addi.d I, I, -1
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L224
+    .align 3
+
+.L999: // common exit for every path, including the N <= 0 early-out
+    move $r4, $r12 // NOTE(review): $r12 is never written in this kernel, so the value placed in $r4 is whatever $r12 held on entry - confirm callers ignore it
+    jirl $r0, $r1, 0x0 // return to the address in $r1
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/cdot_lasx.S b/kernel/loongarch64/cdot_lasx.S
new file mode 100644
index 000000000..0583e56ea
--- /dev/null
+++ b/kernel/loongarch64/cdot_lasx.S
@@ -0,0 +1,565 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4   // element count; loop trip counts are derived from it
+#define X      $r5   // first input vector pointer
+#define INCX   $r6   // X stride (scaled by ZBASE_SHIFT in the prologue)
+#define Y      $r7   // second input vector pointer
+#define INCY   $r8   // Y stride (scaled by ZBASE_SHIFT in the prologue)
+#define I      $r19  // loop counter
+#define TEMP   $r10  // 2 * SIZE, the scaled stride of contiguous data
+#define t1     $r11  // t1-t4: integer scratch used to gather strided data into vector lanes
+#define t2     $r12
+#define t3     $r13
+#define t4     $r14
+#define a1     $f12  // a1/a2: first and second scalar of an X element in the scalar tail
+#define a2     $f13
+#define a3     $f14  // a3/a4: first and second scalar of a Y element in the scalar tail
+#define a4     $f15
+#define s1     $f16  // s1-s4: scalar accumulators; same register numbers as res1-res4
+#define s2     $f17
+#define s3     $f18
+#define s4     $f19
+#define res1   $xr16 // accumulates x1 * x3
+#define res2   $xr17 // accumulates x2 * x3
+#define res3   $xr18 // accumulates x1 * x4
+#define res4   $xr19 // accumulates x2 * x4
+#define VX0    $xr12 // VX0-VX3: raw vector loads before de-interleaving
+#define VX1    $xr13
+#define VX2    $xr14
+#define VX3    $xr15
+#define x1     $xr20 // first scalars of the X elements
+#define x2     $xr21 // second scalars of the X elements
+#define x3     $xr22 // first scalars of the Y elements
+#define x4     $xr23 // second scalars of the Y elements
+
+    PROLOGUE
+    xvxor.v res1, res1, res1 // zero all four accumulators before the N <= 0 check so .L999 sees zeros
+    xvxor.v res2, res2, res2
+    xvxor.v res3, res3, res3
+    xvxor.v res4, res4, res4
+    bge $r0, N, .L999
+    li.d  TEMP, 2 * SIZE
+    slli.d  INCX, INCX, ZBASE_SHIFT // scale both strides so they can be added to X / Y and compared with TEMP
+    slli.d  INCY, INCY, ZBASE_SHIFT
+#ifdef DOUBLE
+    srai.d I, N, 2 // double: 4 elements per vector pass
+#else
+    srai.d I, N, 3 // single: 8 elements per vector pass
+#endif
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11: // X and Y both contiguous
+    bge $r0, I, .L997 // no full vector group: only the scalar tail runs
+    .align 3
+
+.L111: // main loop: load two vectors from each input, split even/odd scalars, accumulate four products
+    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    xvld VX1, X, 4 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    xvpickev.d x1, VX1, VX0 // even scalars of X
+    xvpickod.d x2, VX1, VX0 // odd scalars of X
+    xvpickev.d x3, VX3, VX2 // even scalars of Y, same lane order as x1
+    xvpickod.d x4, VX3, VX2 // odd scalars of Y, same lane order as x2
+    xvfmadd.d res1, x1, x3, res1
+    xvfmadd.d res2, x2, x3, res2
+    xvfmadd.d res3, x1, x4, res3
+    xvfmadd.d res4, x2, x4, res4
+    addi.d X, X, 8 * SIZE // 4 elements = 8 scalars consumed from each input
+    addi.d Y, Y, 8 * SIZE
+#else
+    xvld VX1, X, 8 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    xvld VX3, Y, 8 * SIZE
+    xvpickev.w x1, VX1, VX0 // even scalars of X
+    xvpickod.w x2, VX1, VX0 // odd scalars of X
+    xvpickev.w x3, VX3, VX2 // even scalars of Y, same lane order as x1
+    xvpickod.w x4, VX3, VX2 // odd scalars of Y, same lane order as x2
+    xvfmadd.s res1, x1, x3, res1
+    xvfmadd.s res2, x2, x3, res2
+    xvfmadd.s res3, x1, x4, res3
+    xvfmadd.s res4, x2, x4, res4
+    addi.d X, X, 16 * SIZE // 8 elements = 16 scalars consumed from each input
+    addi.d Y, Y, 16 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L111
+    b .L996 // reduce the vector accumulators, then handle the tail
+    .align 3
+
+.L12: // X contiguous, Y strided
+    bge $r0, I, .L997 // no full vector group: only the scalar tail runs
+    .align 3
+
+.L121: // main loop: X via vector loads + pick even/odd, Y gathered lane by lane
+    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d x3, t1, 0 // Y elements go to lanes 0, 2, 1, 3 (in that order) to line up with the lane order xvpickev/xvpickod give x1/x2
+    xvinsgr2vr.d x4, t2, 0
+    xvinsgr2vr.d x3, t3, 2
+    xvinsgr2vr.d x4, t4, 2
+    xvld VX1, X, 4 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d x3, t1, 1
+    xvinsgr2vr.d x4, t2, 1
+    xvinsgr2vr.d x3, t3, 3
+    xvinsgr2vr.d x4, t4, 3
+    addi.d X, X, 8 * SIZE // 4 elements = 8 scalars consumed from X
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+    xvfmadd.d res1, x1, x3, res1
+    xvfmadd.d res2, x2, x3, res2
+    xvfmadd.d res3, x1, x4, res3
+    xvfmadd.d res4, x2, x4, res4
+#else
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 0 // Y elements go to lanes 0,1,4,5,2,3,6,7 (in that order) to line up with the lane order xvpickev/xvpickod give x1/x2
+    xvinsgr2vr.w x4, t2, 0
+    xvinsgr2vr.w x3, t3, 1
+    xvinsgr2vr.w x4, t4, 1
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 4
+    xvinsgr2vr.w x4, t2, 4
+    xvinsgr2vr.w x3, t3, 5
+    xvinsgr2vr.w x4, t4, 5
+    xvld VX1, X, 8 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 2
+    xvinsgr2vr.w x4, t2, 2
+    xvinsgr2vr.w x3, t3, 3
+    xvinsgr2vr.w x4, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 6
+    xvinsgr2vr.w x4, t2, 6
+    xvinsgr2vr.w x3, t3, 7
+    xvinsgr2vr.w x4, t4, 7
+    addi.d X, X, 16 * SIZE // 8 elements = 16 scalars consumed from X
+    xvpickev.w x1, VX1, VX0
+    xvpickod.w x2, VX1, VX0
+    xvfmadd.s res1, x1, x3, res1
+    xvfmadd.s res2, x2, x3, res2
+    xvfmadd.s res3, x1, x4, res3
+    xvfmadd.s res4, x2, x4, res4
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    b .L996 // reduce the vector accumulators, then handle the tail
+    .align 3
+
+.L21: // X strided, Y contiguous
+    bge $r0, I, .L997 // no full vector group: only the scalar tail runs
+    .align 3
+
+.L211: // main loop: Y via vector loads + pick even/odd, X gathered lane by lane
+    xvld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 0 // X elements go to lanes 0, 2, 1, 3 (in that order) to line up with the lane order xvpickev/xvpickod give x3/x4
+    xvinsgr2vr.d x2, t2, 0
+    xvinsgr2vr.d x1, t3, 2
+    xvinsgr2vr.d x2, t4, 2
+    xvld VX3, Y, 4 * SIZE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 1
+    xvinsgr2vr.d x2, t2, 1
+    xvinsgr2vr.d x1, t3, 3
+    xvinsgr2vr.d x2, t4, 3
+    addi.d Y, Y, 8 * SIZE // 4 elements = 8 scalars consumed from Y
+    xvpickev.d x3, VX3, VX2
+    xvpickod.d x4, VX3, VX2
+    xvfmadd.d res1, x1, x3, res1
+    xvfmadd.d res2, x2, x3, res2
+    xvfmadd.d res3, x1, x4, res3
+    xvfmadd.d res4, x2, x4, res4
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 0 // X elements go to lanes 0,1,4,5,2,3,6,7 (in that order) to line up with the lane order xvpickev/xvpickod give x3/x4
+    xvinsgr2vr.w x2, t2, 0
+    xvinsgr2vr.w x1, t3, 1
+    xvinsgr2vr.w x2, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 4
+    xvinsgr2vr.w x2, t2, 4
+    xvinsgr2vr.w x1, t3, 5
+    xvinsgr2vr.w x2, t4, 5
+    xvld VX3, Y, 8 * SIZE
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 2
+    xvinsgr2vr.w x2, t2, 2
+    xvinsgr2vr.w x1, t3, 3
+    xvinsgr2vr.w x2, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 6
+    xvinsgr2vr.w x2, t2, 6
+    xvinsgr2vr.w x1, t3, 7
+    xvinsgr2vr.w x2, t4, 7
+    addi.d Y, Y, 16 * SIZE // VX2 + VX3 consumed 16 scalars (8 elements) of Y; a step of 8 would re-read half of them on the next pass
+    xvpickev.w x3, VX3, VX2
+    xvpickod.w x4, VX3, VX2
+    xvfmadd.s res1, x1, x3, res1
+    xvfmadd.s res2, x2, x3, res2
+    xvfmadd.s res3, x1, x4, res3
+    xvfmadd.s res4, x2, x4, res4
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    b .L996 // reduce the vector accumulators, then handle the tail
+    .align 3
+
+.L22: // X and Y both strided
+    bge $r0, I, .L997 // no full vector group: only the scalar tail runs
+    .align 3
+
+.L222: // main loop: both inputs gathered lane by lane, in matching lane order
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 0 // X elements 0-1 -> lanes 0-1
+    xvinsgr2vr.d x2, t2, 0
+    xvinsgr2vr.d x1, t3, 1
+    xvinsgr2vr.d x2, t4, 1
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d x3, t1, 0 // Y elements 0-1 -> lanes 0-1
+    xvinsgr2vr.d x4, t2, 0
+    xvinsgr2vr.d x3, t3, 1
+    xvinsgr2vr.d x4, t4, 1
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 2 // X elements 2-3 -> lanes 2-3
+    xvinsgr2vr.d x2, t2, 2
+    xvinsgr2vr.d x1, t3, 3
+    xvinsgr2vr.d x2, t4, 3
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d x3, t1, 2 // Y elements 2-3 -> lanes 2-3
+    xvinsgr2vr.d x4, t2, 2
+    xvinsgr2vr.d x3, t3, 3
+    xvinsgr2vr.d x4, t4, 3
+    xvfmadd.d res1, x1, x3, res1
+    xvfmadd.d res2, x2, x3, res2
+    xvfmadd.d res3, x1, x4, res3
+    xvfmadd.d res4, x2, x4, res4
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 0 // X elements 0-1 -> lanes 0-1
+    xvinsgr2vr.w x2, t2, 0
+    xvinsgr2vr.w x1, t3, 1
+    xvinsgr2vr.w x2, t4, 1
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 0 // Y elements 0-1 -> lanes 0-1
+    xvinsgr2vr.w x4, t2, 0
+    xvinsgr2vr.w x3, t3, 1
+    xvinsgr2vr.w x4, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 2 // X elements 2-3 -> lanes 2-3
+    xvinsgr2vr.w x2, t2, 2
+    xvinsgr2vr.w x1, t3, 3
+    xvinsgr2vr.w x2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 2 // Y elements 2-3 -> lanes 2-3
+    xvinsgr2vr.w x4, t2, 2
+    xvinsgr2vr.w x3, t3, 3
+    xvinsgr2vr.w x4, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 4 // X elements 4-5 -> lanes 4-5
+    xvinsgr2vr.w x2, t2, 4
+    xvinsgr2vr.w x1, t3, 5
+    xvinsgr2vr.w x2, t4, 5
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 4 // Y elements 4-5 -> lanes 4-5
+    xvinsgr2vr.w x4, t2, 4
+    xvinsgr2vr.w x3, t3, 5
+    xvinsgr2vr.w x4, t4, 5
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 6 // X elements 6-7 -> lanes 6-7
+    xvinsgr2vr.w x2, t2, 6
+    xvinsgr2vr.w x1, t3, 7
+    xvinsgr2vr.w x2, t4, 7
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 6 // Y elements 6-7 -> lanes 6-7
+    xvinsgr2vr.w x4, t2, 6
+    xvinsgr2vr.w x3, t3, 7
+    xvinsgr2vr.w x4, t4, 7
+    xvfmadd.s res1, x1, x3, res1
+    xvfmadd.s res2, x2, x3, res2
+    xvfmadd.s res3, x1, x4, res3
+    xvfmadd.s res4, x2, x4, res4
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L222 // falls through to .L996 when the loop ends
+    .align 3
+
+.L996: // horizontal reduction: fold every lane of res1-res4 into lane 0, which the scalar code reads as s1-s4
+#ifdef DOUBLE
+    xvpickve.d VX1, res1, 1 // copy lanes 1..3 of res1 into lane 0 of scratch registers
+    xvpickve.d VX2, res1, 2
+    xvpickve.d VX3, res1, 3
+    xvfadd.d res1, VX1, res1 // only lane 0 of the running sum is used afterwards
+    xvfadd.d res1, VX2, res1
+    xvfadd.d res1, VX3, res1
+    xvpickve.d VX1, res2, 1 // same fold for res2
+    xvpickve.d VX2, res2, 2
+    xvpickve.d VX3, res2, 3
+    xvfadd.d res2, VX1, res2
+    xvfadd.d res2, VX2, res2
+    xvfadd.d res2, VX3, res2
+    xvpickve.d VX1, res3, 1 // same fold for res3
+    xvpickve.d VX2, res3, 2
+    xvpickve.d VX3, res3, 3
+    xvfadd.d res3, VX1, res3
+    xvfadd.d res3, VX2, res3
+    xvfadd.d res3, VX3, res3
+    xvpickve.d VX1, res4, 1 // same fold for res4
+    xvpickve.d VX2, res4, 2
+    xvpickve.d VX3, res4, 3
+    xvfadd.d res4, VX1, res4
+    xvfadd.d res4, VX2, res4
+    xvfadd.d res4, VX3, res4
+#else
+    xvpickve.w VX0, res1, 1 // copy lanes 1..7 of res1 into lane 0 of scratch registers (x1-x3 are free here)
+    xvpickve.w VX1, res1, 2
+    xvpickve.w VX2, res1, 3
+    xvpickve.w VX3, res1, 4
+    xvpickve.w x1, res1, 5
+    xvpickve.w x2, res1, 6
+    xvpickve.w x3, res1, 7
+    xvfadd.s res1, VX0, res1 // only lane 0 of the running sum is used afterwards
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+    xvfadd.s res1, x1, res1
+    xvfadd.s res1, x2, res1
+    xvfadd.s res1, x3, res1
+    xvpickve.w VX0, res2, 1 // same fold for res2
+    xvpickve.w VX1, res2, 2
+    xvpickve.w VX2, res2, 3
+    xvpickve.w VX3, res2, 4
+    xvpickve.w x1, res2, 5
+    xvpickve.w x2, res2, 6
+    xvpickve.w x3, res2, 7
+    xvfadd.s res2, VX0, res2
+    xvfadd.s res2, VX1, res2
+    xvfadd.s res2, VX2, res2
+    xvfadd.s res2, VX3, res2
+    xvfadd.s res2, x1, res2
+    xvfadd.s res2, x2, res2
+    xvfadd.s res2, x3, res2
+    xvpickve.w VX0, res3, 1 // same fold for res3
+    xvpickve.w VX1, res3, 2
+    xvpickve.w VX2, res3, 3
+    xvpickve.w VX3, res3, 4
+    xvpickve.w x1, res3, 5
+    xvpickve.w x2, res3, 6
+    xvpickve.w x3, res3, 7
+    xvfadd.s res3, VX0, res3
+    xvfadd.s res3, VX1, res3
+    xvfadd.s res3, VX2, res3
+    xvfadd.s res3, VX3, res3
+    xvfadd.s res3, x1, res3
+    xvfadd.s res3, x2, res3
+    xvfadd.s res3, x3, res3
+    xvpickve.w VX0, res4, 1 // same fold for res4
+    xvpickve.w VX1, res4, 2
+    xvpickve.w VX2, res4, 3
+    xvpickve.w VX3, res4, 4
+    xvpickve.w x1, res4, 5
+    xvpickve.w x2, res4, 6
+    xvpickve.w x3, res4, 7
+    xvfadd.s res4, VX0, res4
+    xvfadd.s res4, VX1, res4
+    xvfadd.s res4, VX2, res4
+    xvfadd.s res4, VX3, res4
+    xvfadd.s res4, x1, res4
+    xvfadd.s res4, x2, res4
+    xvfadd.s res4, x3, res4
+#endif
+    .align 3
+
+.L997: // scalar tail: elements the vector loops did not cover
+#ifdef DOUBLE
+    andi I, N, 3 // vector loops take 4 elements per pass in double
+#else
+    andi I, N, 7 // vector loops take 8 elements per pass in single
+#endif
+    bge $r0, I, .L999
+    .align 3
+
+.L998: // one element per pass, accumulating the same four partial products as the vector code
+    LD   a1, X, 0 * SIZE
+    LD   a2, X, 1 * SIZE
+    LD   a3, Y, 0 * SIZE
+    LD   a4, Y, 1 * SIZE
+    MADD s1, a1, a3, s1 // s1 += x[0] * y[0]
+    MADD s2, a2, a3, s2 // s2 += x[1] * y[0]
+    MADD s3, a1, a4, s3 // s3 += x[0] * y[1]
+    MADD s4, a2, a4, s4 // s4 += x[1] * y[1]
+    addi.d I, I, -1
+    add.d  X, X, INCX // scaled strides work for every path, contiguous or not
+    add.d  Y, Y, INCY
+    blt $r0, I, .L998
+    .align 3
+
+.L999: // combine the four partial sums into the two result scalars in $f0 / $f1
+#ifndef CONJ
+    SUB $f0, s1, s4 // $f0 = s1 - s4
+    ADD $f1, s3, s2 // $f1 = s3 + s2
+#else
+    ADD $f0, s1, s4 // CONJ build: the signs of the s4 and s2 terms are flipped
+    SUB $f1, s3, s2
+#endif
+    jirl    $r0, $r1, 0x0 // return to the address in $r1
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/cdot_lsx.S b/kernel/loongarch64/cdot_lsx.S
new file mode 100644
index 000000000..5feea12be
--- /dev/null
+++ b/kernel/loongarch64/cdot_lsx.S
@@ -0,0 +1,397 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4  // element count; tested for <= 0 and split into vector/tail trip counts
+#define X      $r5  // current read pointer into the first vector
+#define INCX   $r6  // X stride; shifted by ZBASE_SHIFT in the prologue, then added to X as a pointer step
+#define Y      $r7  // current read pointer into the second vector
+#define INCY   $r8  // Y stride; shifted by ZBASE_SHIFT in the prologue, then added to Y as a pointer step
+#define I      $r19  // loop counter (vector iterations first, scalar remainder afterwards)
+#define TEMP   $r10  // holds 2 * SIZE, the scaled stride that selects the contiguous paths
+#define t1     $r11  // t1..t4: integer scratch used to gather strided values into vector lanes
+#define t2     $r12
+#define t3     $r13
+#define t4     $r14
+#define a1     $f12  // scalar tail: value at X + 0 * SIZE
+#define a2     $f13  // scalar tail: value at X + 1 * SIZE
+#define a3     $f14  // scalar tail: value at Y + 0 * SIZE
+#define a4     $f15  // scalar tail: value at Y + 1 * SIZE
+#define s1     $f16  // s1..s4 use the same register numbers as res1..res4; the tail loop and
+#define s2     $f17  // epilogue read the vector sums through these names with no explicit move,
+#define s3     $f18  // i.e. the code relies on $fN overlaying the low lane of $vrN
+#define s4     $f19
+#define res1   $vr16  // accumulates x1 * x3
+#define res2   $vr17  // accumulates x2 * x3
+#define res3   $vr18  // accumulates x1 * x4
+#define res4   $vr19  // accumulates x2 * x4
+#define VX0    $vr12  // VX0..VX3: raw vector loads (same register numbers as a1..a4)
+#define VX1    $vr13
+#define VX2    $vr14
+#define VX3    $vr15
+#define x1     $vr20  // first value of each X pair (even lanes)
+#define x2     $vr21  // second value of each X pair (odd lanes)
+#define x3     $vr22  // first value of each Y pair (even lanes)
+#define x4     $vr23  // second value of each Y pair (odd lanes)
+
+    PROLOGUE  // kernel entry: clear accumulators, scale strides, pick one of four stride-specific loops
+    vxor.v res1, res1, res1  // zero all four vector accumulators (x ^ x == 0)
+    vxor.v res2, res2, res2
+    vxor.v res3, res3, res3
+    vxor.v res4, res4, res4
+    bge $r0, N, .L999  // N <= 0: go straight to the epilogue with zeroed accumulators
+    li.d  TEMP, 2 * SIZE  // scaled stride of a contiguous vector (two values per element)
+    slli.d  INCX, INCX, ZBASE_SHIFT  // scale the stride so it can be added to X directly
+    slli.d  INCY, INCY, ZBASE_SHIFT  // same scaling for the Y stride
+#ifdef DOUBLE
+    srai.d I, N, 1  // vector trip count: each iteration consumes 2 elements
+#else
+    srai.d I, N, 2  // vector trip count: each iteration consumes 4 elements
+#endif
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:  // path for contiguous X and contiguous Y: plain vector loads on both sides
+    bge $r0, I, .L997  // no full vector iteration available: only the scalar tail runs
+    .align 3
+
+.L111:  // main loop of the contiguous/contiguous path
+    vld VX0, X, 0 * SIZE  // first 16 bytes of X (shared by both precisions)
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE  // second element of X
+    vld VX2, Y, 0 * SIZE
+    vld VX3, Y, 2 * SIZE
+    vpickev.d x1, VX1, VX0  // even lanes: first value of each X pair
+    vpickod.d x2, VX1, VX0  // odd lanes: second value of each X pair
+    vpickev.d x3, VX3, VX2  // even lanes: first value of each Y pair
+    vpickod.d x4, VX3, VX2  // odd lanes: second value of each Y pair
+    vfmadd.d res1, x1, x3, res1  // res1 += x1 * x3
+    vfmadd.d res2, x2, x3, res2  // res2 += x2 * x3
+    vfmadd.d res3, x1, x4, res3  // res3 += x1 * x4
+    vfmadd.d res4, x2, x4, res4  // res4 += x2 * x4
+    addi.d X, X, 4 * SIZE  // step past the 2 elements (4 values) just consumed
+    addi.d Y, Y, 4 * SIZE
+#else
+    vld VX1, X, 4 * SIZE  // elements 2..3 of X
+    vld VX2, Y, 0 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vpickev.w x1, VX1, VX0  // even lanes: first value of each X pair
+    vpickod.w x2, VX1, VX0  // odd lanes: second value of each X pair
+    vpickev.w x3, VX3, VX2  // even lanes: first value of each Y pair
+    vpickod.w x4, VX3, VX2  // odd lanes: second value of each Y pair
+    vfmadd.s res1, x1, x3, res1  // res1 += x1 * x3
+    vfmadd.s res2, x2, x3, res2  // res2 += x2 * x3
+    vfmadd.s res3, x1, x4, res3  // res3 += x1 * x4
+    vfmadd.s res4, x2, x4, res4  // res4 += x2 * x4
+    addi.d X, X, 8 * SIZE  // step past the 4 elements (8 values) just consumed
+    addi.d Y, Y, 8 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L111  // repeat while iterations remain
+    b .L996  // reduce the vector accumulators, then handle the remainder
+    .align 3
+
+.L12:  // path for contiguous X and strided Y: X is vector-loaded, Y is gathered lane by lane
+    bge $r0, I, .L997  // no full vector iteration available: only the scalar tail runs
+    .align 3
+
+.L121:  // main loop of the contiguous-X / strided-Y path
+    vld VX0, X, 0 * SIZE  // first 16 bytes of X (shared by both precisions)
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE  // second element of X
+    ld.d t1, Y, 0 * SIZE  // raw 64-bit pattern of the first value of the Y pair
+    ld.d t2, Y, 1 * SIZE  // raw 64-bit pattern of the second value of the Y pair
+    add.d Y, Y, INCY  // advance Y by one (scaled) stride
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.d x3, t1, 0  // x3 collects the first values of the Y pairs
+    vinsgr2vr.d x4, t2, 0  // x4 collects the second values of the Y pairs
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+    addi.d X, X, 4 * SIZE  // step X past the 2 elements just loaded
+    vpickev.d x1, VX1, VX0  // even lanes: first value of each X pair
+    vpickod.d x2, VX1, VX0  // odd lanes: second value of each X pair
+    vfmadd.d res1, x1, x3, res1  // res1 += x1 * x3
+    vfmadd.d res2, x2, x3, res2  // res2 += x2 * x3
+    vfmadd.d res3, x1, x4, res3  // res3 += x1 * x4
+    vfmadd.d res4, x2, x4, res4  // res4 += x2 * x4
+#else
+    ld.w t1, Y, 0 * SIZE  // raw 32-bit pattern of the first value of the Y pair
+    ld.w t2, Y, 1 * SIZE  // raw 32-bit pattern of the second value of the Y pair
+    add.d Y, Y, INCY  // advance Y by one (scaled) stride
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w x3, t1, 0  // lanes 0..1 of x3/x4 come from the first two Y elements
+    vinsgr2vr.w x4, t2, 0
+    vinsgr2vr.w x3, t3, 1
+    vinsgr2vr.w x4, t4, 1
+    vld VX1, X, 4 * SIZE  // elements 2..3 of X
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w x3, t1, 2  // lanes 2..3 of x3/x4 come from the next two Y elements
+    vinsgr2vr.w x4, t2, 2
+    vinsgr2vr.w x3, t3, 3
+    vinsgr2vr.w x4, t4, 3
+    addi.d X, X, 8 * SIZE  // step X past the 4 elements just loaded
+    vpickev.w x1, VX1, VX0  // even lanes: first value of each X pair
+    vpickod.w x2, VX1, VX0  // odd lanes: second value of each X pair
+    vfmadd.s res1, x1, x3, res1  // res1 += x1 * x3
+    vfmadd.s res2, x2, x3, res2  // res2 += x2 * x3
+    vfmadd.s res3, x1, x4, res3  // res3 += x1 * x4
+    vfmadd.s res4, x2, x4, res4  // res4 += x2 * x4
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L121  // repeat while iterations remain
+    b .L996  // reduce the vector accumulators, then handle the remainder
+    .align 3
+
+.L21:  // path for strided X and contiguous Y: Y is vector-loaded, X is gathered lane by lane
+    bge $r0, I, .L997  // no full vector iteration available: only the scalar tail runs
+    .align 3
+
+.L211:  // main loop of the strided-X / contiguous-Y path
+    vld VX2, Y, 0 * SIZE  // first 16 bytes of Y (shared by both precisions)
+#ifdef DOUBLE
+    vld VX3, Y, 2 * SIZE  // second element of Y
+    ld.d t1, X, 0 * SIZE  // raw 64-bit pattern of the first value of the X pair
+    ld.d t2, X, 1 * SIZE  // raw 64-bit pattern of the second value of the X pair
+    add.d X, X, INCX  // advance X by one (scaled) stride
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0  // x1 collects the first values of the X pairs
+    vinsgr2vr.d x2, t2, 0  // x2 collects the second values of the X pairs
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    addi.d Y, Y, 4 * SIZE  // step Y past the 2 elements just loaded
+    vpickev.d x3, VX3, VX2  // even lanes: first value of each Y pair
+    vpickod.d x4, VX3, VX2  // odd lanes: second value of each Y pair
+    vfmadd.d res1, x1, x3, res1  // res1 += x1 * x3
+    vfmadd.d res2, x2, x3, res2  // res2 += x2 * x3
+    vfmadd.d res3, x1, x4, res3  // res3 += x1 * x4
+    vfmadd.d res4, x2, x4, res4  // res4 += x2 * x4
+#else
+    ld.w t1, X, 0 * SIZE  // raw 32-bit pattern of the first value of the X pair
+    ld.w t2, X, 1 * SIZE  // raw 32-bit pattern of the second value of the X pair
+    add.d X, X, INCX  // advance X by one (scaled) stride
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 0  // lanes 0..1 of x1/x2 come from the first two X elements
+    vinsgr2vr.w x2, t2, 0
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+    vld VX3, Y, 4 * SIZE  // elements 2..3 of Y
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 2  // lanes 2..3 of x1/x2 come from the next two X elements
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    addi.d Y, Y, 8 * SIZE  // step Y past the 4 elements just loaded
+    vpickev.w x3, VX3, VX2  // even lanes: first value of each Y pair
+    vpickod.w x4, VX3, VX2  // odd lanes: second value of each Y pair
+    vfmadd.s res1, x1, x3, res1  // res1 += x1 * x3
+    vfmadd.s res2, x2, x3, res2  // res2 += x2 * x3
+    vfmadd.s res3, x1, x4, res3  // res3 += x1 * x4
+    vfmadd.s res4, x2, x4, res4  // res4 += x2 * x4
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L211  // repeat while iterations remain
+    b .L996  // reduce the vector accumulators, then handle the remainder
+    .align 3
+
+.L22:  // path for strided X and strided Y: both operands are gathered lane by lane
+    bge $r0, I, .L997  // no full vector iteration available: only the scalar tail runs
+    .align 3
+
+.L222:  // main loop of the strided/strided path
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE  // raw 64-bit pattern of the first value of the X pair
+    ld.d t2, X, 1 * SIZE  // raw 64-bit pattern of the second value of the X pair
+    add.d X, X, INCX  // advance X by one (scaled) stride
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0  // x1 collects the first values of the X pairs
+    vinsgr2vr.d x2, t2, 0  // x2 collects the second values of the X pairs
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    ld.d t1, Y, 0 * SIZE  // same gather for two Y elements (t1..t4 are reused)
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY  // advance Y by one (scaled) stride
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.d x3, t1, 0  // x3 collects the first values of the Y pairs
+    vinsgr2vr.d x4, t2, 0  // x4 collects the second values of the Y pairs
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+    vfmadd.d res1, x1, x3, res1  // res1 += x1 * x3
+    vfmadd.d res2, x2, x3, res2  // res2 += x2 * x3
+    vfmadd.d res3, x1, x4, res3  // res3 += x1 * x4
+    vfmadd.d res4, x2, x4, res4  // res4 += x2 * x4
+#else
+    ld.w t1, X, 0 * SIZE  // raw 32-bit pattern of the first value of the X pair
+    ld.w t2, X, 1 * SIZE  // raw 32-bit pattern of the second value of the X pair
+    add.d X, X, INCX  // advance X by one (scaled) stride
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 0  // lanes 0..1 of x1/x2 come from the first two X elements
+    vinsgr2vr.w x2, t2, 0
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+    ld.w t1, Y, 0 * SIZE  // first two Y elements (t1..t4 are reused)
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY  // advance Y by one (scaled) stride
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w x3, t1, 0  // lanes 0..1 of x3/x4 come from the first two Y elements
+    vinsgr2vr.w x4, t2, 0
+    vinsgr2vr.w x3, t3, 1
+    vinsgr2vr.w x4, t4, 1
+    ld.w t1, X, 0 * SIZE  // next two X elements
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 2  // lanes 2..3 of x1/x2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    ld.w t1, Y, 0 * SIZE  // next two Y elements
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w x3, t1, 2  // lanes 2..3 of x3/x4
+    vinsgr2vr.w x4, t2, 2
+    vinsgr2vr.w x3, t3, 3
+    vinsgr2vr.w x4, t4, 3
+    vfmadd.s res1, x1, x3, res1  // res1 += x1 * x3
+    vfmadd.s res2, x2, x3, res2  // res2 += x2 * x3
+    vfmadd.s res3, x1, x4, res3  // res3 += x1 * x4
+    vfmadd.s res4, x2, x4, res4  // res4 += x2 * x4
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L222  // repeat while iterations remain; otherwise fall through into .L996
+    .align 3
+
+.L996:  // horizontal reduction: fold every lane of res1..res4 into lane 0 of the same register
+#ifdef DOUBLE
+    vreplvei.d VX1, res1, 1  // broadcast lane 1 of res1
+    vfadd.d res1, VX1, res1  // lane 0 of res1 now holds lane0 + lane1
+    vreplvei.d VX1, res2, 1
+    vfadd.d res2, VX1, res2
+    vreplvei.d VX1, res3, 1
+    vfadd.d res3, VX1, res3
+    vreplvei.d VX1, res4, 1
+    vfadd.d res4, VX1, res4
+#else
+    vreplvei.w VX1, res1, 1  // broadcast lanes 1, 2 and 3 of res1 before res1 is modified
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1  // lane 0 of res1 accumulates lane 1 ...
+    vfadd.s res1, VX2, res1  // ... then lane 2 ...
+    vfadd.s res1, VX3, res1  // ... then lane 3
+    vreplvei.w VX1, res2, 1  // same three-step fold for res2
+    vreplvei.w VX2, res2, 2
+    vreplvei.w VX3, res2, 3
+    vfadd.s res2, VX1, res2
+    vfadd.s res2, VX2, res2
+    vfadd.s res2, VX3, res2
+    vreplvei.w VX1, res3, 1  // same three-step fold for res3
+    vreplvei.w VX2, res3, 2
+    vreplvei.w VX3, res3, 3
+    vfadd.s res3, VX1, res3
+    vfadd.s res3, VX2, res3
+    vfadd.s res3, VX3, res3
+    vreplvei.w VX1, res4, 1  // same three-step fold for res4
+    vreplvei.w VX2, res4, 2
+    vreplvei.w VX3, res4, 3
+    vfadd.s res4, VX1, res4
+    vfadd.s res4, VX2, res4
+    vfadd.s res4, VX3, res4
+#endif
+    .align 3  // execution continues into .L997
+
+.L997:  // scalar remainder: process the elements left over after the vector loops
+#ifdef DOUBLE
+    andi I, N, 1  // leftover count when the vector loop consumes 2 elements per iteration
+#else
+    andi I, N, 3  // leftover count when the vector loop consumes 4 elements per iteration
+#endif
+    bge $r0, I, .L999  // nothing left over: go combine the sums
+    .align 3
+
+.L998:  // one element per iteration, accumulating into s1..s4 (same register numbers as res1..res4)
+    LD   a1, X, 0 * SIZE  // first value of the X pair
+    LD   a2, X, 1 * SIZE  // second value of the X pair
+    LD   a3, Y, 0 * SIZE  // first value of the Y pair
+    LD   a4, Y, 1 * SIZE  // second value of the Y pair
+    MADD s1, a1, a3, s1  // same operand pairing as res1 in the vector loops (x1 * x3)
+    MADD s2, a2, a3, s2  // same pairing as res2 (x2 * x3)
+    MADD s3, a1, a4, s3  // same pairing as res3 (x1 * x4)
+    MADD s4, a2, a4, s4  // same pairing as res4 (x2 * x4)
+    addi.d I, I, -1
+    add.d  X, X, INCX  // INCX/INCY were scaled in the prologue, so this also covers the contiguous case
+    add.d  Y, Y, INCY
+    blt $r0, I, .L998  // repeat while leftover elements remain
+    .align 3
+
+.L999:  // combine the four partial sums into the two result values in $f0 and $f1, then return
+#ifndef CONJ
+    SUB $f0, s1, s4  // $f0 = sum(X0*Y0) - sum(X1*Y1), where 0/1 index the two values of each pair
+    ADD $f1, s3, s2  // $f1 = sum(X0*Y1) + sum(X1*Y0)
+#else
+    ADD $f0, s1, s4  // CONJ build: $f0 = sum(X0*Y0) + sum(X1*Y1)
+    SUB $f1, s3, s2  // CONJ build: $f1 = sum(X0*Y1) - sum(X1*Y0)
+#endif
+    jirl    $r0, $r1, 0x0  // jump to the address held in $r1, discarding the link value
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/cgemm_kernel_16x4_lasx.S b/kernel/loongarch64/cgemm_kernel_16x4_lasx.S
new file mode 100644
index 000000000..249abe102
--- /dev/null
+++ b/kernel/loongarch64/cgemm_kernel_16x4_lasx.S
@@ -0,0 +1,3757 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+
+/* Function parameters */
+#define M      $r4   // param 1: bm
+#define N      $r5   // param 2: bn
+#define K      $r6   // param 3: bk
+#define ALPHA_R $f0   // param 4: alphar
+#define ALPHA_I $f1   // param 5: alphai
+#define A      $r7   // param 6: ba
+#define B      $r8  // param 7: bb
+#define C      $r9  // param 8: bc
+#define LDC    $r10  // param 9: ldc
+
+#if defined (TRMMKERNEL)
+#define OFFSET $r11  // param 10: offset
+#endif
+#define OFF    $r26  // working offset for TRMM builds, zeroed otherwise; NOTE: same register as T2 below
+
+#define I      $r12  // loop counter over blocks of M
+#define J      $r13  // loop counter over blocks of N
+#define L      $r14  // loop counter over k
+#define TL     $r15  // scratch: k-loop trip count, also reused for the C column step
+#define A0     $r16  // current read pointer into A
+#define B0     $r17  // current read pointer into B
+#define C0     $r18  // C0..C3: write pointers for four C columns (C1..C3 derived from C0 at .L10)
+#define C1     $r19
+#define C2     $r20
+#define C3     $r23
+#define T0     $r24  // T0..T3: integer temporaries
+#define T1     $r25
+#define T2     $r26  // NOTE(review): aliases OFF ($r26) - confirm T2 is never live while OFF is needed
+#define T3     $r27
+
+#define a1     $f2  // a1..a8, b1..b8, c11..c42: scalar floating-point register names
+#define a2     $f3
+#define a3     $f4
+#define a4     $f5
+#define a5     $f6
+#define a6     $f7
+#define a7     $f8
+#define a8     $f9
+#define b1     $f10
+#define b2     $f11
+#define b3     $f12
+#define b4     $f13
+#define b5     $f14
+#define b6     $f15
+#define b7     $f16
+#define b8     $f17
+#define c11    $f18
+#define c12    $f19
+#define c21    $f20
+#define c22    $f21
+#define c31    $f22
+#define c32    $f23
+#define c41    $f24
+#define c42    $f25
+
+/* LASX vectors */
+#define U0     $xr30  // U0..U15: accumulators, zeroed before each k loop; U0/U1 use $xr30/$xr31 rather
+#define U1     $xr31  // than $xr0/$xr1 - presumably because $f0/$f1 carry ALPHA_R/ALPHA_I (TODO confirm)
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define U8     $xr8
+#define U9     $xr9
+#define U10    $xr10
+#define U11    $xr11
+#define U12    $xr12
+#define U13    $xr13
+#define U14    $xr14
+#define U15    $xr15
+#define D0     $xr16  // D0..D13: working vectors; the k loop also writes D0/D3/D4/D5 through
+#define D1     $xr17  // their 128-bit names $vr16/$vr19/$vr20/$vr21
+#define D2     $xr18
+#define D3     $xr19
+#define D4     $xr20
+#define D5     $xr21
+#define D6     $xr22
+#define D7     $xr23
+#define D8     $xr24
+#define D9     $xr25
+#define D10    $xr26
+#define D11    $xr27
+#define D12    $xr28  // NOTE(review): same register as VALPHAR - writing D12 would destroy alpha
+#define D13    $xr29  // NOTE(review): same register as VALPHAI - writing D13 would destroy alpha
+#define VALPHAR $xr28  // alpha real part replicated to every 32-bit lane (loaded with xvldrepl.w)
+#define VALPHAI $xr29  // alpha imaginary part replicated to every 32-bit lane (loaded with xvldrepl.w)
+
+
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)  // re += ar*br - ai*bi, im += ai*br + ar*bi, i.e. a * b
+#define    XVMADD1       XVFMADD  // step 1: re += a_re * b_re
+#define    XVMADD2       XVFMADD  // step 2: im += a_im * b_re
+#define    XVMADD3       XVNMSUB  // step 3: re -= a_im * b_im (assumes *NMSUB computes acc - x*y; macro defined outside this file)
+#define    XVMADD4       XVFMADD  // step 4: im += a_re * b_im
+
+#define    VMADD1       VFMADD  // 128-bit counterparts, same sign pattern
+#define    VMADD2       VFMADD
+#define    VMADD3       VNMSUB
+#define    VMADD4       VFMADD
+
+#define    MADD1       MADD  // scalar counterparts, same sign pattern
+#define    MADD2       MADD
+#define    MADD3       NMSUB
+#define    MADD4       MADD
+#endif
+
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)  // re += ar*br + ai*bi, im += ai*br - ar*bi, i.e. a * conj(b)
+#define    XVMADD1       XVFMADD
+#define    XVMADD2       XVFMADD
+#define    XVMADD3       XVFMADD  // sign of a_im * b_im flipped relative to the plain product
+#define    XVMADD4       XVNMSUB  // sign of a_re * b_im flipped relative to the plain product
+
+#define    VMADD1       VFMADD  // 128-bit counterparts, same sign pattern
+#define    VMADD2       VFMADD
+#define    VMADD3       VFMADD
+#define    VMADD4       VNMSUB
+
+#define    MADD1       MADD  // scalar counterparts, same sign pattern
+#define    MADD2       MADD
+#define    MADD3       MADD
+#define    MADD4       NMSUB
+#endif
+
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)  // re += ar*br + ai*bi, im += ar*bi - ai*br, i.e. conj(a) * b
+#define    XVMADD1       XVFMADD
+#define    XVMADD2       XVNMSUB  // sign of a_im * b_re flipped relative to the plain product
+#define    XVMADD3       XVFMADD  // sign of a_im * b_im flipped relative to the plain product
+#define    XVMADD4       XVFMADD
+
+#define    VMADD1       VFMADD  // 128-bit counterparts, same sign pattern
+#define    VMADD2       VNMSUB
+#define    VMADD3       VFMADD
+#define    VMADD4       VFMADD
+
+#define    MADD1       MADD  // scalar counterparts, same sign pattern
+#define    MADD2       NMSUB
+#define    MADD3       MADD
+#define    MADD4       MADD
+#endif
+
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)  // re += ar*br - ai*bi, im += -(ai*br + ar*bi), i.e. conj(a) * conj(b)
+#define    XVMADD1       XVFMADD
+#define    XVMADD2       XVNMSUB  // both imaginary-part products are subtracted
+#define    XVMADD3       XVNMSUB
+#define    XVMADD4       XVNMSUB
+
+#define    VMADD1       VFMADD  // 128-bit counterparts, same sign pattern
+#define    VMADD2       VNMSUB
+#define    VMADD3       VNMSUB
+#define    VMADD4       VNMSUB
+
+#define    MADD1       MADD  // scalar counterparts, same sign pattern
+#define    MADD2       NMSUB
+#define    MADD3       NMSUB
+#define    MADD4       NMSUB
+#endif
+
+    PROLOGUE
+
+    addi.d     $sp,    $sp,   -128
+    SDARG      $r23,   $sp,   0
+    SDARG      $r24,   $sp,   8
+    SDARG      $r25,   $sp,   16
+    SDARG      $r26,   $sp,   24
+    SDARG      $r27,   $sp,   32
+    ST         $f23,   $sp,   40
+    ST         $f24,   $sp,   48
+    ST         $f25,   $sp,   56
+    ST         $f26,   $sp,   64
+    ST         $f27,   $sp,   72
+    ST         $f28,   $sp,   80
+    ST         $f29,   $sp,   88
+    ST         $f30,   $sp,   96
+    ST         $f31,   $sp,   104
+    ST         ALPHA_R,$sp,   112
+    ST         ALPHA_I,$sp,   120
+
+    xvldrepl.w  VALPHAR, $sp, 112
+    xvldrepl.w  VALPHAI, $sp, 120
+
+#if defined (TRMMKERNEL) && !defined(LEFT)
+    sub.d      OFF,    $r0,   OFFSET
+#else
+    xor        OFF,    OFF,   OFF
+#endif
+
+    slli.d     LDC,    LDC,   2
+
+    move       J,      $r0
+    srai.d     T0,     N,     2  //bn/4
+    beq        J,      T0,    .L19
+
+.L10:  /* for(j=0; j<bn/4; j+=1) */
+    move       C0,     C
+    slli.d     TL,     LDC,   1
+    add.d      C1,     C0,    TL
+    add.d      C2,     C1,    TL
+    add.d      C3,     C2,    TL
+    move       A0,     A    //ptrba
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET
+#endif
+
+    move       I,      $r0
+    srai.d     T0,     M,     4  //bm/16
+    beq        I,      T0,    .L11
+
+.L101:  /* for(i=0; i<bm/16; i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B     //ptrbb
+#else
+    slli.d     T3,     OFF,   0x07
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x05
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF   //temp
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   16
+#else
+    addi.d     TL,     OFF,   4
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    xvxor.v    U0,     U0,   U0
+    xvxor.v    U1,     U1,   U1
+    xvxor.v    U2,     U2,   U2
+    xvxor.v    U3,     U3,   U3
+    xvxor.v    U4,     U4,   U4
+    xvxor.v    U5,     U5,   U5
+    xvxor.v    U6,     U6,   U6
+    xvxor.v    U7,     U7,   U7
+    xvxor.v    U8,     U8,   U8
+    xvxor.v    U9,     U9,   U9
+    xvxor.v    U10,     U10,   U10
+    xvxor.v    U11,     U11,   U11
+    xvxor.v    U12,     U12,   U12
+    xvxor.v    U13,     U13,   U13
+    xvxor.v    U14,     U14,   U14
+    xvxor.v    U15,     U15,   U15
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L103
+    blt        TL,     L,     .L103
+
+.L102:  /* for(k=0; k<temp; k+=1) */
+    xvld       D2,     B0,    0x00  // b0ri b1ri b2ri b3ri
+
+    vldrepl.w  $vr20,  A0,    0x00
+    vldrepl.w  $vr16,  A0,    0x08
+
+    vldrepl.w  $vr21,  A0,    0x04
+    vldrepl.w  $vr19,  A0,    0x0c
+
+    xvpermi.q  D4,     D0,    0x02  //a0rrrr a1rrrr
+    xvpermi.q  D5,     D3,    0x02  //a0iiii a1iiii
+
+    xvpickev.w D6,     D2,    D2    //b0r b1r b0r b1r b2r b3r b2r b3r
+    xvpermi.d  D6,     D6,    0xd8  //b0r b1r b2r b3r b0r b1r b2r b3r
+
+    xvpickod.w D7,     D2,    D2    //b0i b1i b0i b1i b2i b3i b2i b3i
+    xvpermi.d  D7,     D7,    0xd8  //b0i b1i b2i b3i b0i b1i b2i b3i
+
+    XVMADD1    U0,     D4,    D6,     U0  //00r 10r 20r 30r 01r 11r 21r 31r
+    XVMADD2    U1,     D5,    D6,     U1  //00i 10i 20i 30i 01i 11i 21i 31i
+    XVMADD3    U0,     D5,    D7,     U0
+    XVMADD4    U1,     D4,    D7,     U1
+
+    vldrepl.w  $vr20,  A0,    0x10
+    vldrepl.w  $vr16,  A0,    0x18
+
+    vldrepl.w  $vr21,  A0,    0x14
+    vldrepl.w  $vr19,  A0,    0x1c
+
+    xvpermi.q  D4,     D0,    0x02
+    xvpermi.q  D5,     D3,    0x02
+
+    XVMADD1    U2,     D4,    D6,     U2  //02r 12r 22r 32r 03r 13r 23r 33r
+    XVMADD2    U3,     D5,    D6,     U3  //02i 12i 22i 32i 03i 13i 23i 33i
+    XVMADD3    U2,     D5,    D7,     U2
+    XVMADD4    U3,     D4,    D7,     U3
+
+    vldrepl.w  $vr20,  A0,    0x20
+    vldrepl.w  $vr16,  A0,    0x28
+
+    vldrepl.w  $vr21,  A0,    0x24
+    vldrepl.w  $vr19,  A0,    0x2c
+
+    xvpermi.q  D4,     D0,    0x02
+    xvpermi.q  D5,     D3,    0x02
+
+    XVMADD1    U4,     D4,    D6,     U4  //04r 14r 24r 34r 05r 15r 25r 35r
+    XVMADD2    U5,     D5,    D6,     U5  //04i 14i 24i 34i 05i 15i 25i 35i
+    XVMADD3    U4,     D5,    D7,     U4
+    XVMADD4    U5,     D4,    D7,     U5
+
+    vldrepl.w  $vr20,  A0,    0x30
+    vldrepl.w  $vr16,  A0,    0x38
+
+    vldrepl.w  $vr21,  A0,    0x34
+    vldrepl.w  $vr19,  A0,    0x3c
+
+    xvpermi.q  D4,     D0,    0x02
+    xvpermi.q  D5,     D3,    0x02
+
+    XVMADD1    U6,     D4,    D6,     U6  //06r 16r 26r 36r 07r 17r 27r 37r
+    XVMADD2    U7,     D5,    D6,     U7  //06i 16i 26i 36i 07i 17i 27i 37i
+    XVMADD3    U6,     D5,    D7,     U6
+    XVMADD4    U7,     D4,    D7,     U7
+
+    vldrepl.w  $vr20,  A0,    0x40
+    vldrepl.w  $vr16,  A0,    0x48
+
+    vldrepl.w  $vr21,  A0,    0x44
+    vldrepl.w  $vr19,  A0,    0x4c
+
+    xvpermi.q  D4,     D0,    0x02
+    xvpermi.q  D5,     D3,    0x02
+
+    XVMADD1    U8,     D4,    D6,     U8  //08r 18r 28r 38r 09r 19r 29r 39r
+    XVMADD2    U9,     D5,    D6,     U9  //08i 18i 28i 38i 09i 19i 29i 39i
+    XVMADD3    U8,     D5,    D7,     U8
+    XVMADD4    U9,     D4,    D7,     U9
+
+    vldrepl.w  $vr20,  A0,    0x50
+    vldrepl.w  $vr16,  A0,    0x58
+
+    vldrepl.w  $vr21,  A0,    0x54
+    vldrepl.w  $vr19,  A0,    0x5c
+
+    xvpermi.q  D4,     D0,    0x02
+    xvpermi.q  D5,     D3,    0x02
+
+    XVMADD1    U10,     D4,    D6,     U10  //0ar 1ar 2ar 3ar 0br 1br 2br 3br
+    XVMADD2    U11,     D5,    D6,     U11  //0ai 1ai 2ai 3ai 0bi 1bi 2bi 3bi
+    XVMADD3    U10,     D5,    D7,     U10
+    XVMADD4    U11,     D4,    D7,     U11
+
+    vldrepl.w  $vr20,  A0,    0x60
+    vldrepl.w  $vr16,  A0,    0x68
+
+    vldrepl.w  $vr21,  A0,    0x64
+    vldrepl.w  $vr19,  A0,    0x6c
+
+    xvpermi.q  D4,     D0,    0x02
+    xvpermi.q  D5,     D3,    0x02
+
+    XVMADD1    U12,     D4,    D6,     U12  //0cr 1cr 2cr 3cr 0dr 1dr 2dr 3dr
+    XVMADD2    U13,     D5,    D6,     U13  //0ci 1ci 2ci 3ci 0di 1di 2di 3di
+    XVMADD3    U12,     D5,    D7,     U12
+    XVMADD4    U13,     D4,    D7,     U13
+
+    vldrepl.w  $vr20,  A0,    0x70
+    vldrepl.w  $vr16,  A0,    0x78
+
+    vldrepl.w  $vr21,  A0,    0x74
+    vldrepl.w  $vr19,  A0,    0x7c
+
+    xvpermi.q  D4,     D0,    0x02
+    xvpermi.q  D5,     D3,    0x02
+
+    XVMADD1    U14,     D4,    D6,     U14  //0er 1er 2er 3er 0fr 1fr 2fr 3fr
+    XVMADD2    U15,     D5,    D6,     U15  //0ei 1ei 2ei 3ei 0fi 1fi 2fi 3fi
+    XVMADD3    U14,     D5,    D7,     U14
+    XVMADD4    U15,     D4,    D7,     U15
+
+    addi.d     A0,     A0,    0x80
+    addi.d     B0,     B0,    0x20
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L102
+
+.L103:
+#if defined(TRMMKERNEL)
+    //res00 res10 res20 res30 res01 res11 res21 res31
+    //res02 res12 res22 res32 res03 res13 res23 res33
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3 4 5 6 7
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3 4 5 6 7
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3 4 5 6 7
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3 4 5 6 7
+
+    xvpackev.w D4,     D1,    D0   //c0[0] c1[0] c0[2] c1[2] c0[4] c1[4] c0[6] c1[6]
+    xvpackev.w D5,     D3,    D2   //c2[0] c3[0] c2[2] c3[2] c2[4] c3[4] c2[6] c3[6]
+
+    xvand.v    D6,     D4,    D4
+    xvpermi.q  D6,     D5,    0x02 //c0[0] c1[0] c0[2] c1[2] c2[0] c3[0] c2[2] 03[2]
+    xvpermi.d  D6,     D6,    0xd8 //c0[0] c1[0] c2[0] c3[0] c0[2] c1[2] c2[2] 03[2]
+
+    xvand.v    D7,     D5,    D5
+    xvpermi.q  D7,     D4,    0x31 //c0[4] c1[4] c0[6] c1[6] c2[4] c3[4] c2[6] 03[6]
+    xvpermi.d  D7,     D7,    0xd8 //c0[4] c1[4] c2[4] c3[4] c0[6] c1[6] c2[6] 03[6]
+
+    xvpackod.w D4,     D1,    D0   //c0[1] c1[1] c0[3] c1[3] c0[5] c1[5] c0[7] c1[7]
+    xvpackod.w D5,     D3,    D2   //c2[1] c3[1] c2[3] c3[3] c2[5] c3[5] c2[7] c3[7]
+
+    xvand.v    D8,     D4,    D4
+    xvpermi.q  D8,     D5,    0x02 //c0[1] c1[1] c0[3] c1[3] c2[1] c3[1] c2[3] 03[3]
+    xvpermi.d  D8,     D8,    0xd8 //c0[1] c1[1] c2[1] c3[1] c0[3] c1[3] c2[3] 03[3]
+
+    xvand.v    D9,     D5,    D5
+    xvpermi.q  D9,     D4,    0x31 //c0[5] c1[5] c0[7] c1[7] c2[5] c3[5] c2[7] 03[7]
+    xvpermi.d  D9,     D9,    0xd8 //c0[5] c1[5] c2[5] c3[5] c0[7] c1[7] c2[7] 03[7]
+
+    xvfmul.s      D6,    U0,    VALPHAR
+    xvfmul.s      D8,    U1,    VALPHAR
+    XVNMSUB      D6,    U1,    VALPHAI, D6
+    XVFMADD      D8,    U0,    VALPHAI, D8
+
+    xvfmul.s      D7,    U2,    VALPHAR
+    xvfmul.s      D9,    U3,    VALPHAR
+    XVNMSUB      D7,    U3,    VALPHAI, D7
+    XVFMADD      D9,    U2,    VALPHAI, D9
+
+    xvpackev.w D4,     D8,    D6 //c0[0] c0[1] c2[0] c2[1] c0[2] c0[3] c2[2] c2[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[0] c0[1] c0[2] c0[3] c2[0] c2[1] c2[2] c2[3]
+    xvpackev.w D5,     D9,    D7 //c0[4] c0[5] c2[4] c2[5] c0[6] c0[7] c2[6] c2[7]
+    xvpermi.d  D5,     D5,    0xd8 //c0[4] c0[5] c0[6] c0[7] c2[4] c2[5] c2[6] c2[7]
+
+    xvand.v    D0,     D4,    D4
+    xvand.v    D1,     D5,    D5
+    xvpermi.q  D0,     D1,    0x02 //c0: 0 1 2 3 4 5 6 7
+    xvpermi.q  D1,     D4,    0x31 //c2: 0 1 2 3 4 5 6 7
+
+    xvpackod.w D4,     D8,    D6 //c1[0] c1[1] c3[0] c3[1] c1[2] c1[3] c3[2] c3[3]
+    xvpermi.d  D4,     D4,    0xd8 //c1[0] c1[1] c1[2] c1[3] c3[0] c3[1] c3[2] c3[3]
+    xvpackod.w D5,     D9,    D7 //c1[4] c1[5] c3[4] c3[5] c1[6] c1[7] c3[6] c3[7]
+    xvpermi.d  D5,     D5,    0xd8 //c1[4] c1[5] c1[6] c1[7] c3[4] c3[5] c3[6] c3[7]
+
+    xvand.v    D2,     D4,    D4
+    xvand.v    D3,     D5,    D5
+    xvpermi.q  D2,     D3,    0x02 //c1: 0 1 2 3 4 5 6 7
+    xvpermi.q  D3,     D4,    0x31 //c3: 0 1 2 3 4 5 6 7
+
+    xvst       D0,     C0,    0x00
+    xvst       D2,     C1,    0x00
+    xvst       D1,     C2,    0x00
+    xvst       D3,     C3,    0x00
+
+    //res04 res14 res24 res34 res05 res15 res25 res35
+    //res06 res16 res26 res36 res07 res17 res27 res37
+    xvld       D0,     C0,    0x20
+    xvld       D1,     C1,    0x20
+    xvld       D2,     C2,    0x20
+    xvld       D3,     C3,    0x20
+
+    xvpackev.w D4,     D1,    D0
+    xvpackev.w D5,     D3,    D2
+
+    xvand.v    D6,     D4,    D4
+    xvpermi.q  D6,     D5,    0x02
+    xvpermi.d  D6,     D6,    0xd8
+
+    xvand.v    D7,     D5,    D5
+    xvpermi.q  D7,     D4,    0x31
+    xvpermi.d  D7,     D7,    0xd8
+
+    xvpackod.w D4,     D1,    D0
+    xvpackod.w D5,     D3,    D2
+
+    xvand.v    D8,     D4,    D4
+    xvpermi.q  D8,     D5,    0x02
+    xvpermi.d  D8,     D8,    0xd8
+
+    xvand.v    D9,     D5,    D5
+    xvpermi.q  D9,     D4,    0x31
+    xvpermi.d  D9,     D9,    0xd8
+
+    xvfmul.s      D6,    U4,    VALPHAR
+    xvfmul.s      D8,    U5,    VALPHAR
+    XVNMSUB      D6,    U5,    VALPHAI, D6
+    XVFMADD      D8,    U4,    VALPHAI, D8
+
+    xvfmul.s      D7,    U6,    VALPHAR
+    xvfmul.s      D9,    U7,    VALPHAR
+    XVNMSUB      D7,    U7,    VALPHAI, D7
+    XVFMADD      D9,    U6,    VALPHAI, D9
+
+    xvpackev.w D4,     D8,    D6
+    xvpermi.d  D4,     D4,    0xd8
+    xvpackev.w D5,     D9,    D7
+    xvpermi.d  D5,     D5,    0xd8
+
+    xvand.v    D0,     D4,    D4
+    xvand.v    D1,     D5,    D5
+    xvpermi.q  D0,     D1,    0x02
+    xvpermi.q  D1,     D4,    0x31
+
+    xvpackod.w D4,     D8,    D6
+    xvpermi.d  D4,     D4,    0xd8
+    xvpackod.w D5,     D9,    D7
+    xvpermi.d  D5,     D5,    0xd8
+
+    xvand.v    D2,     D4,    D4
+    xvand.v    D3,     D5,    D5
+    xvpermi.q  D2,     D3,    0x02
+    xvpermi.q  D3,     D4,    0x31
+
+    xvst       D0,     C0,    0x20
+    xvst       D2,     C1,    0x20
+    xvst       D1,     C2,    0x20
+    xvst       D3,     C3,    0x20
+
+    //res08 res18 res28 res38 res09 res19 res29 res39
+    //res0a res1a res2a res3a res0b res1b res2b res3b
+    xvld       D0,     C0,    0x40
+    xvld       D1,     C1,    0x40
+    xvld       D2,     C2,    0x40
+    xvld       D3,     C3,    0x40
+
+    xvpackev.w D4,     D1,    D0
+    xvpackev.w D5,     D3,    D2
+
+    xvand.v    D6,     D4,    D4
+    xvpermi.q  D6,     D5,    0x02
+    xvpermi.d  D6,     D6,    0xd8
+
+    xvand.v    D7,     D5,    D5
+    xvpermi.q  D7,     D4,    0x31
+    xvpermi.d  D7,     D7,    0xd8
+
+    xvpackod.w D4,     D1,    D0
+    xvpackod.w D5,     D3,    D2
+
+    xvand.v    D8,     D4,    D4
+    xvpermi.q  D8,     D5,    0x02
+    xvpermi.d  D8,     D8,    0xd8
+
+    xvand.v    D9,     D5,    D5
+    xvpermi.q  D9,     D4,    0x31
+    xvpermi.d  D9,     D9,    0xd8
+
+    xvfmul.s      D6,    U8,    VALPHAR
+    xvfmul.s      D8,    U9,    VALPHAR
+    XVNMSUB      D6,    U9,    VALPHAI, D6
+    XVFMADD      D8,    U8,    VALPHAI, D8
+
+    xvfmul.s      D7,    U10,    VALPHAR
+    xvfmul.s      D9,    U11,    VALPHAR
+    XVNMSUB      D7,    U11,    VALPHAI, D7
+    XVFMADD      D9,    U10,    VALPHAI, D9
+
+    xvpackev.w D4,     D8,    D6
+    xvpermi.d  D4,     D4,    0xd8
+    xvpackev.w D5,     D9,    D7
+    xvpermi.d  D5,     D5,    0xd8
+
+    xvand.v    D0,     D4,    D4
+    xvand.v    D1,     D5,    D5
+    xvpermi.q  D0,     D1,    0x02
+    xvpermi.q  D1,     D4,    0x31
+
+    xvpackod.w D4,     D8,    D6
+    xvpermi.d  D4,     D4,    0xd8
+    xvpackod.w D5,     D9,    D7
+    xvpermi.d  D5,     D5,    0xd8
+
+    xvand.v    D2,     D4,    D4
+    xvand.v    D3,     D5,    D5
+    xvpermi.q  D2,     D3,    0x02
+    xvpermi.q  D3,     D4,    0x31
+
+    xvst       D0,     C0,    0x40
+    xvst       D2,     C1,    0x40
+    xvst       D1,     C2,    0x40
+    xvst       D3,     C3,    0x40
+
+    //res0c res1c res2c res3c res0d res1d res2d res3d
+    //res0e res1e res2e res3e res0f res1f res2f res3f
+    xvld       D0,     C0,    0x60
+    xvld       D1,     C1,    0x60
+    xvld       D2,     C2,    0x60
+    xvld       D3,     C3,    0x60
+
+    xvpackev.w D4,     D1,    D0
+    xvpackev.w D5,     D3,    D2
+
+    xvand.v    D6,     D4,    D4
+    xvpermi.q  D6,     D5,    0x02
+    xvpermi.d  D6,     D6,    0xd8
+
+    xvand.v    D7,     D5,    D5
+    xvpermi.q  D7,     D4,    0x31
+    xvpermi.d  D7,     D7,    0xd8
+
+    xvpackod.w D4,     D1,    D0
+    xvpackod.w D5,     D3,    D2
+
+    xvand.v    D8,     D4,    D4
+    xvpermi.q  D8,     D5,    0x02
+    xvpermi.d  D8,     D8,    0xd8
+
+    xvand.v    D9,     D5,    D5
+    xvpermi.q  D9,     D4,    0x31
+    xvpermi.d  D9,     D9,    0xd8
+
+    xvfmul.s      D6,    U12,    VALPHAR
+    xvfmul.s      D8,    U13,    VALPHAR
+    XVNMSUB      D6,    U13,    VALPHAI, D6
+    XVFMADD      D8,    U12,    VALPHAI, D8
+
+    xvfmul.s      D7,    U14,    VALPHAR
+    xvfmul.s      D9,    U15,    VALPHAR
+    XVNMSUB      D7,    U15,    VALPHAI, D7
+    XVFMADD      D9,    U14,    VALPHAI, D9
+
+    xvpackev.w D4,     D8,    D6
+    xvpermi.d  D4,     D4,    0xd8
+    xvpackev.w D5,     D9,    D7
+    xvpermi.d  D5,     D5,    0xd8
+
+    xvand.v    D0,     D4,    D4
+    xvand.v    D1,     D5,    D5
+    xvpermi.q  D0,     D1,    0x02
+    xvpermi.q  D1,     D4,    0x31
+
+    xvpackod.w D4,     D8,    D6
+    xvpermi.d  D4,     D4,    0xd8
+    xvpackod.w D5,     D9,    D7
+    xvpermi.d  D5,     D5,    0xd8
+
+    xvand.v    D2,     D4,    D4
+    xvand.v    D3,     D5,    D5
+    xvpermi.q  D2,     D3,    0x02
+    xvpermi.q  D3,     D4,    0x31
+
+    xvst       D0,     C0,    0x60
+    xvst       D2,     C1,    0x60
+    xvst       D1,     C2,    0x60
+    xvst       D3,     C3,    0x60
+
+    addi.d     C0,     C0,    0x80
+    addi.d     C1,     C1,    0x80
+    addi.d     C2,     C2,    0x80
+    addi.d     C3,     C3,    0x80
+#else
+    //res00 res10 res20 res30 res01 res11 res21 res31
+    //res02 res12 res22 res32 res03 res13 res23 res33
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3 4 5 6 7
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3 4 5 6 7
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3 4 5 6 7
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3 4 5 6 7
+
+    xvpackev.w D4,     D1,    D0   //c0[0] c1[0] c0[2] c1[2] c0[4] c1[4] c0[6] c1[6]
+    xvpackev.w D5,     D3,    D2   //c2[0] c3[0] c2[2] c3[2] c2[4] c3[4] c2[6] c3[6]
+
+    xvand.v    D6,     D4,    D4
+    xvpermi.q  D6,     D5,    0x02 //c0[0] c1[0] c0[2] c1[2] c2[0] c3[0] c2[2] 03[2]
+    xvpermi.d  D6,     D6,    0xd8 //c0[0] c1[0] c2[0] c3[0] c0[2] c1[2] c2[2] 03[2]
+
+    xvand.v    D7,     D5,    D5
+    xvpermi.q  D7,     D4,    0x31 //c0[4] c1[4] c0[6] c1[6] c2[4] c3[4] c2[6] 03[6]
+    xvpermi.d  D7,     D7,    0xd8 //c0[4] c1[4] c2[4] c3[4] c0[6] c1[6] c2[6] 03[6]
+
+    xvpackod.w D4,     D1,    D0   //c0[1] c1[1] c0[3] c1[3] c0[5] c1[5] c0[7] c1[7]
+    xvpackod.w D5,     D3,    D2   //c2[1] c3[1] c2[3] c3[3] c2[5] c3[5] c2[7] c3[7]
+
+    xvand.v    D8,     D4,    D4
+    xvpermi.q  D8,     D5,    0x02 //c0[1] c1[1] c0[3] c1[3] c2[1] c3[1] c2[3] 03[3]
+    xvpermi.d  D8,     D8,    0xd8 //c0[1] c1[1] c2[1] c3[1] c0[3] c1[3] c2[3] 03[3]
+
+    xvand.v    D9,     D5,    D5
+    xvpermi.q  D9,     D4,    0x31 //c0[5] c1[5] c0[7] c1[7] c2[5] c3[5] c2[7] 03[7]
+    xvpermi.d  D9,     D9,    0xd8 //c0[5] c1[5] c2[5] c3[5] c0[7] c1[7] c2[7] 03[7]
+
+    XVFMADD      D6,    U0,    VALPHAR, D6
+    XVFMADD      D8,    U1,    VALPHAR, D8
+    XVNMSUB      D6,    U1,    VALPHAI, D6
+    XVFMADD      D8,    U0,    VALPHAI, D8
+
+    XVFMADD      D7,    U2,    VALPHAR, D7
+    XVFMADD      D9,    U3,    VALPHAR, D9
+    XVNMSUB      D7,    U3,    VALPHAI, D7
+    XVFMADD      D9,    U2,    VALPHAI, D9
+
+    xvpackev.w D4,     D8,    D6 //c0[0] c0[1] c2[0] c2[1] c0[2] c0[3] c2[2] c2[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[0] c0[1] c0[2] c0[3] c2[0] c2[1] c2[2] c2[3]
+    xvpackev.w D5,     D9,    D7 //c0[4] c0[5] c2[4] c2[5] c0[6] c0[7] c2[6] c2[7]
+    xvpermi.d  D5,     D5,    0xd8 //c0[4] c0[5] c0[6] c0[7] c2[4] c2[5] c2[6] c2[7]
+
+    xvand.v    D0,     D4,    D4
+    xvand.v    D1,     D5,    D5
+    xvpermi.q  D0,     D1,    0x02 //c0: 0 1 2 3 4 5 6 7
+    xvpermi.q  D1,     D4,    0x31 //c2: 0 1 2 3 4 5 6 7
+
+    xvpackod.w D4,     D8,    D6 //c1[0] c1[1] c3[0] c3[1] c1[2] c1[3] c3[2] c3[3]
+    xvpermi.d  D4,     D4,    0xd8 //c1[0] c1[1] c1[2] c1[3] c3[0] c3[1] c3[2] c3[3]
+    xvpackod.w D5,     D9,    D7 //c1[4] c1[5] c3[4] c3[5] c1[6] c1[7] c3[6] c3[7]
+    xvpermi.d  D5,     D5,    0xd8 //c1[4] c1[5] c1[6] c1[7] c3[4] c3[5] c3[6] c3[7]
+
+    xvand.v    D2,     D4,    D4
+    xvand.v    D3,     D5,    D5
+    xvpermi.q  D2,     D3,    0x02 //c1: 0 1 2 3 4 5 6 7
+    xvpermi.q  D3,     D4,    0x31 //c3: 0 1 2 3 4 5 6 7
+
+    xvst       D0,     C0,    0x00
+    xvst       D2,     C1,    0x00
+    xvst       D1,     C2,    0x00
+    xvst       D3,     C3,    0x00
+
+    //res04 res14 res24 res34 res05 res15 res25 res35
+    //res06 res16 res26 res36 res07 res17 res27 res37
+    xvld       D0,     C0,    0x20
+    xvld       D1,     C1,    0x20
+    xvld       D2,     C2,    0x20
+    xvld       D3,     C3,    0x20
+
+    xvpackev.w D4,     D1,    D0
+    xvpackev.w D5,     D3,    D2
+
+    xvand.v    D6,     D4,    D4
+    xvpermi.q  D6,     D5,    0x02
+    xvpermi.d  D6,     D6,    0xd8
+
+    xvand.v    D7,     D5,    D5
+    xvpermi.q  D7,     D4,    0x31
+    xvpermi.d  D7,     D7,    0xd8
+
+    xvpackod.w D4,     D1,    D0
+    xvpackod.w D5,     D3,    D2
+
+    xvand.v    D8,     D4,    D4
+    xvpermi.q  D8,     D5,    0x02
+    xvpermi.d  D8,     D8,    0xd8
+
+    xvand.v    D9,     D5,    D5
+    xvpermi.q  D9,     D4,    0x31
+    xvpermi.d  D9,     D9,    0xd8
+
+    XVFMADD      D6,    U4,    VALPHAR, D6
+    XVFMADD      D8,    U5,    VALPHAR, D8
+    XVNMSUB      D6,    U5,    VALPHAI, D6
+    XVFMADD      D8,    U4,    VALPHAI, D8
+
+    XVFMADD      D7,    U6,    VALPHAR, D7
+    XVFMADD      D9,    U7,    VALPHAR, D9
+    XVNMSUB      D7,    U7,    VALPHAI, D7
+    XVFMADD      D9,    U6,    VALPHAI, D9
+
+    xvpackev.w D4,     D8,    D6
+    xvpermi.d  D4,     D4,    0xd8
+    xvpackev.w D5,     D9,    D7
+    xvpermi.d  D5,     D5,    0xd8
+
+    xvand.v    D0,     D4,    D4
+    xvand.v    D1,     D5,    D5
+    xvpermi.q  D0,     D1,    0x02
+    xvpermi.q  D1,     D4,    0x31
+
+    xvpackod.w D4,     D8,    D6
+    xvpermi.d  D4,     D4,    0xd8
+    xvpackod.w D5,     D9,    D7
+    xvpermi.d  D5,     D5,    0xd8
+
+    xvand.v    D2,     D4,    D4
+    xvand.v    D3,     D5,    D5
+    xvpermi.q  D2,     D3,    0x02
+    xvpermi.q  D3,     D4,    0x31
+
+    xvst       D0,     C0,    0x20
+    xvst       D2,     C1,    0x20
+    xvst       D1,     C2,    0x20
+    xvst       D3,     C3,    0x20
+
+    //res08 res18 res28 res38 res09 res19 res29 res39
+    //res0a res1a res2a res3a res0b res1b res2b res3b
+    xvld       D0,     C0,    0x40
+    xvld       D1,     C1,    0x40
+    xvld       D2,     C2,    0x40
+    xvld       D3,     C3,    0x40
+
+    xvpackev.w D4,     D1,    D0
+    xvpackev.w D5,     D3,    D2
+
+    xvand.v    D6,     D4,    D4
+    xvpermi.q  D6,     D5,    0x02
+    xvpermi.d  D6,     D6,    0xd8
+
+    xvand.v    D7,     D5,    D5
+    xvpermi.q  D7,     D4,    0x31
+    xvpermi.d  D7,     D7,    0xd8
+
+    xvpackod.w D4,     D1,    D0
+    xvpackod.w D5,     D3,    D2
+
+    xvand.v    D8,     D4,    D4
+    xvpermi.q  D8,     D5,    0x02
+    xvpermi.d  D8,     D8,    0xd8
+
+    xvand.v    D9,     D5,    D5
+    xvpermi.q  D9,     D4,    0x31
+    xvpermi.d  D9,     D9,    0xd8
+
+    XVFMADD      D6,    U8,    VALPHAR, D6
+    XVFMADD      D8,    U9,    VALPHAR, D8
+    XVNMSUB      D6,    U9,    VALPHAI, D6
+    XVFMADD      D8,    U8,    VALPHAI, D8
+
+    XVFMADD      D7,    U10,    VALPHAR, D7
+    XVFMADD      D9,    U11,    VALPHAR, D9
+    XVNMSUB      D7,    U11,    VALPHAI, D7
+    XVFMADD      D9,    U10,    VALPHAI, D9
+
+    xvpackev.w D4,     D8,    D6
+    xvpermi.d  D4,     D4,    0xd8
+    xvpackev.w D5,     D9,    D7
+    xvpermi.d  D5,     D5,    0xd8
+
+    xvand.v    D0,     D4,    D4
+    xvand.v    D1,     D5,    D5
+    xvpermi.q  D0,     D1,    0x02
+    xvpermi.q  D1,     D4,    0x31
+
+    xvpackod.w D4,     D8,    D6
+    xvpermi.d  D4,     D4,    0xd8
+    xvpackod.w D5,     D9,    D7
+    xvpermi.d  D5,     D5,    0xd8
+
+    xvand.v    D2,     D4,    D4
+    xvand.v    D3,     D5,    D5
+    xvpermi.q  D2,     D3,    0x02
+    xvpermi.q  D3,     D4,    0x31
+
+    xvst       D0,     C0,    0x40
+    xvst       D2,     C1,    0x40
+    xvst       D1,     C2,    0x40
+    xvst       D3,     C3,    0x40
+
+    //res0c res1c res2c res3c res0d res1d res2d res3d
+    //res0e res1e res2e res3e res0f res1f res2f res3f
+    xvld       D0,     C0,    0x60
+    xvld       D1,     C1,    0x60
+    xvld       D2,     C2,    0x60
+    xvld       D3,     C3,    0x60
+
+    xvpackev.w D4,     D1,    D0
+    xvpackev.w D5,     D3,    D2
+
+    xvand.v    D6,     D4,    D4
+    xvpermi.q  D6,     D5,    0x02
+    xvpermi.d  D6,     D6,    0xd8
+
+    xvand.v    D7,     D5,    D5
+    xvpermi.q  D7,     D4,    0x31
+    xvpermi.d  D7,     D7,    0xd8
+
+    xvpackod.w D4,     D1,    D0
+    xvpackod.w D5,     D3,    D2
+
+    xvand.v    D8,     D4,    D4
+    xvpermi.q  D8,     D5,    0x02
+    xvpermi.d  D8,     D8,    0xd8
+
+    xvand.v    D9,     D5,    D5
+    xvpermi.q  D9,     D4,    0x31
+    xvpermi.d  D9,     D9,    0xd8
+
+    XVFMADD      D6,    U12,    VALPHAR, D6
+    XVFMADD      D8,    U13,    VALPHAR, D8
+    XVNMSUB      D6,    U13,    VALPHAI, D6
+    XVFMADD      D8,    U12,    VALPHAI, D8
+
+    XVFMADD      D7,    U14,    VALPHAR, D7
+    XVFMADD      D9,    U15,    VALPHAR, D9
+    XVNMSUB      D7,    U15,    VALPHAI, D7
+    XVFMADD      D9,    U14,    VALPHAI, D9
+
+    xvpackev.w D4,     D8,    D6
+    xvpermi.d  D4,     D4,    0xd8
+    xvpackev.w D5,     D9,    D7
+    xvpermi.d  D5,     D5,    0xd8
+
+    xvand.v    D0,     D4,    D4
+    xvand.v    D1,     D5,    D5
+    xvpermi.q  D0,     D1,    0x02
+    xvpermi.q  D1,     D4,    0x31
+
+    xvpackod.w D4,     D8,    D6
+    xvpermi.d  D4,     D4,    0xd8
+    xvpackod.w D5,     D9,    D7
+    xvpermi.d  D5,     D5,    0xd8
+
+    xvand.v    D2,     D4,    D4
+    xvand.v    D3,     D5,    D5
+    xvpermi.q  D2,     D3,    0x02
+    xvpermi.q  D3,     D4,    0x31
+
+    xvst       D0,     C0,    0x60
+    xvst       D2,     C1,    0x60
+    xvst       D1,     C2,    0x60
+    xvst       D3,     C3,    0x60
+
+    addi.d     C0,     C0,    0x80
+    addi.d     C1,     C1,    0x80
+    addi.d     C2,     C2,    0x80
+    addi.d     C3,     C3,    0x80
+#endif
+
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -16
+#else
+    addi.d     TL,     TL,   -4
+#endif
+    slli.d     T3,     TL,   0x07
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x05
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   16
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L101
+
+.L11:  /* if ( bm & 8 ): set up the 8-row remainder tile (accumulators U0..U7) */
+    move       I,      $r0
+    andi       T0,     M,     8       // T0 = M & 8
+    beq        I,      T0,    .L150   // bit 3 of M clear: go to the M & 4 test
+
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B     //ptrbb
+#else
+    slli.d     T3,     OFF,   0x06    // T3 = OFF * 64 (A0 advances 0x40 bytes per k in .L12)
+    add.d      A0,     A0,    T3      // skip the first OFF k-steps of A
+    slli.d     T3,     OFF,   0x05    // T3 = OFF * 32 (B0 advances 0x20 bytes per k in .L12)
+    add.d      B0,     B,     T3      // B0 = B + OFF k-steps
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF   //temp = K - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   8       // temp = OFF + 8 (rows in this tile)
+#else
+    addi.d     TL,     OFF,   4       // temp = OFF + 4 (columns in this tile)
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    xvxor.v    U0,     U0,   U0       // x ^ x = 0: clear the eight accumulators
+    xvxor.v    U1,     U1,   U1
+    xvxor.v    U2,     U2,   U2
+    xvxor.v    U3,     U3,   U3
+    xvxor.v    U4,     U4,   U4
+    xvxor.v    U5,     U5,   U5
+    xvxor.v    U6,     U6,   U6
+    xvxor.v    U7,     U7,   U7
+
+    move       L,      $r0   //cycle param k, starts at 0
+    beq        L,      TL,    .L13    // TL == 0: no k iterations, go straight to the store
+    blt        TL,     L,     .L13    // TL < 0: likewise
+
+.L12:  /* for(k=0; k<temp; k+=1): one k step of the 8x4 tile, accumulated into U0..U7 */
+    xvld       D0,     A0,    0x00  // a0ri a1ri a2ri a3ri
+    xvld       D2,     B0,    0x00  // b0ri b1ri b2ri b3ri
+
+    xvand.v    D1,     D0,    D0    // D1 = D0 (x & x is a plain copy)
+    xvpermi.q  D1,     D0,    0x02  //a0ri a1ri a0ri a1ri
+    xvpermi.d  D1,     D1,    0xd8  //a0ri a0ri a1ri a1ri
+
+    xvand.v    D4,     D1,    D1
+    xvand.v    D5,     D1,    D1
+    xvpermi.w  D4,     D4,    0x00  //a0rrrr a1rrrr
+    xvpermi.w  D5,     D5,    0x55  //a0iiii a1iiii
+
+    xvpackev.w D6,     D2,    D2    //b0rr b1rr b2rr b3rr
+    xvpermi.w  D6,     D6,    0x88  //b0r b1r b0r b1r b2r b3r b2r b3r
+    xvpermi.d  D6,     D6,    0xd8  //b0r b1r b2r b3r b0r b1r b2r b3r
+
+    xvpackod.w D7,     D2,    D2    //b0ii b1ii b2ii b3ii
+    xvpermi.w  D7,     D7,    0x88  //b0i b1i b0i b1i b2i b3i b2i b3i
+    xvpermi.d  D7,     D7,    0xd8  //b0i b1i b2i b3i b0i b1i b2i b3i
+
+    XVMADD1    U0,     D4,    D6,     U0  //00r 10r 20r 30r 01r 11r 21r 31r
+    XVMADD2    U1,     D5,    D6,     U1  //00i 10i 20i 30i 01i 11i 21i 31i
+    XVMADD3    U0,     D5,    D7,     U0  // a_i * b_i term; sign comes from the XVMADD3 macro (defined outside this view)
+    XVMADD4    U1,     D4,    D7,     U1  // a_r * b_i term; sign comes from the XVMADD4 macro (defined outside this view)
+
+    xvand.v    D1,     D0,    D0
+    xvpermi.q  D1,     D0,    0x31  //a2ri a3ri a2ri a3ri
+    xvpermi.d  D1,     D1,    0xd8  //a2ri a2ri a3ri a3ri
+
+    xvand.v    D4,     D1,    D1
+    xvand.v    D5,     D1,    D1
+    xvpermi.w  D4,     D4,    0x00  //a2rrrr a3rrrr
+    xvpermi.w  D5,     D5,    0x55  //a2iiii a3iiii
+
+    XVMADD1    U2,     D4,    D6,     U2  //02r 12r 22r 32r 03r 13r 23r 33r
+    XVMADD2    U3,     D5,    D6,     U3  //02i 12i 22i 32i 03i 13i 23i 33i
+    XVMADD3    U2,     D5,    D7,     U2
+    XVMADD4    U3,     D4,    D7,     U3
+
+    xvld       D0,     A0,    0x20  // a4ri a5ri a6ri a7ri
+
+    xvand.v    D1,     D0,    D0
+    xvpermi.q  D1,     D0,    0x02  //a4ri a5ri a4ri a5ri
+    xvpermi.d  D1,     D1,    0xd8  //a4ri a4ri a5ri a5ri
+
+    xvand.v    D4,     D1,    D1
+    xvand.v    D5,     D1,    D1
+    xvpermi.w  D4,     D4,    0x00  //a4rrrr a5rrrr
+    xvpermi.w  D5,     D5,    0x55  //a4iiii a5iiii
+
+    XVMADD1    U4,     D4,    D6,     U4  //04r 14r 24r 34r 05r 15r 25r 35r
+    XVMADD2    U5,     D5,    D6,     U5  //04i 14i 24i 34i 05i 15i 25i 35i
+    XVMADD3    U4,     D5,    D7,     U4
+    XVMADD4    U5,     D4,    D7,     U5
+
+    xvand.v    D1,     D0,    D0
+    xvpermi.q  D1,     D0,    0x31  //a6ri a7ri a6ri a7ri
+    xvpermi.d  D1,     D1,    0xd8  //a6ri a6ri a7ri a7ri
+
+    xvand.v    D4,     D1,    D1
+    xvand.v    D5,     D1,    D1
+    xvpermi.w  D4,     D4,    0x00  //a6rrrr a7rrrr
+    xvpermi.w  D5,     D5,    0x55  //a6iiii a7iiii
+
+    XVMADD1    U6,     D4,    D6,     U6  //06r 16r 26r 36r 07r 17r 27r 37r
+    XVMADD2    U7,     D5,    D6,     U7  //06i 16i 26i 36i 07i 17i 27i 37i
+    XVMADD3    U6,     D5,    D7,     U6
+    XVMADD4    U7,     D4,    D7,     U7
+
+    addi.d     A0,     A0,    0x40  // next k step of A: 64 bytes (two 32-byte loads above)
+    addi.d     B0,     B0,    0x20  // next k step of B: 32 bytes (one 32-byte load above)
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L12  // loop while L < TL
+
+.L13:  /* combine U0..U7 with VALPHAR/VALPHAI, store 0x40 bytes to each of C0..C3, then fix up TRMM pointers */
+#if defined(TRMMKERNEL)
+    //res00 res10 res20 res30 res01 res11 res21 res31
+    //res02 res12 res22 res32 res03 res13 res23 res33
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3 4 5 6 7
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3 4 5 6 7
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3 4 5 6 7
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3 4 5 6 7
+
+    xvpackev.w D4,     D1,    D0   //c0[0] c1[0] c0[2] c1[2] c0[4] c1[4] c0[6] c1[6]
+    xvpackev.w D5,     D3,    D2   //c2[0] c3[0] c2[2] c3[2] c2[4] c3[4] c2[6] c3[6]
+
+    xvand.v    D6,     D4,    D4
+    xvpermi.q  D6,     D5,    0x02 //c0[0] c1[0] c0[2] c1[2] c2[0] c3[0] c2[2] c3[2]
+    xvpermi.d  D6,     D6,    0xd8 //c0[0] c1[0] c2[0] c3[0] c0[2] c1[2] c2[2] c3[2]
+
+    xvand.v    D7,     D5,    D5
+    xvpermi.q  D7,     D4,    0x31 //c0[4] c1[4] c0[6] c1[6] c2[4] c3[4] c2[6] c3[6]
+    xvpermi.d  D7,     D7,    0xd8 //c0[4] c1[4] c2[4] c3[4] c0[6] c1[6] c2[6] c3[6]
+
+    xvpackod.w D4,     D1,    D0   //c0[1] c1[1] c0[3] c1[3] c0[5] c1[5] c0[7] c1[7]
+    xvpackod.w D5,     D3,    D2   //c2[1] c3[1] c2[3] c3[3] c2[5] c3[5] c2[7] c3[7]
+
+    xvand.v    D8,     D4,    D4
+    xvpermi.q  D8,     D5,    0x02 //c0[1] c1[1] c0[3] c1[3] c2[1] c3[1] c2[3] c3[3]
+    xvpermi.d  D8,     D8,    0xd8 //c0[1] c1[1] c2[1] c3[1] c0[3] c1[3] c2[3] c3[3]
+
+    xvand.v    D9,     D5,    D5
+    xvpermi.q  D9,     D4,    0x31 //c0[5] c1[5] c0[7] c1[7] c2[5] c3[5] c2[7] c3[7]
+    xvpermi.d  D9,     D9,    0xd8 //c0[5] c1[5] c2[5] c3[5] c0[7] c1[7] c2[7] c3[7]
+
+    xvfmul.s      D6,    U0,    VALPHAR   // D6 = U0 * VALPHAR; NOTE(review): this overwrites the C data gathered into D6 above
+    xvfmul.s      D8,    U1,    VALPHAR   // D8 = U1 * VALPHAR (same: previous D8 contents discarded)
+    XVNMSUB      D6,    U1,    VALPHAI, D6   // presumably D6 -= U1 * VALPHAI (macro defined outside this view)
+    XVFMADD      D8,    U0,    VALPHAI, D8   // presumably D8 += U0 * VALPHAI (macro defined outside this view)
+
+    xvfmul.s      D7,    U2,    VALPHAR
+    xvfmul.s      D9,    U3,    VALPHAR
+    XVNMSUB      D7,    U3,    VALPHAI, D7
+    XVFMADD      D9,    U2,    VALPHAI, D9
+
+    xvpackev.w D4,     D8,    D6 //c0[0] c0[1] c2[0] c2[1] c0[2] c0[3] c2[2] c2[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[0] c0[1] c0[2] c0[3] c2[0] c2[1] c2[2] c2[3]
+    xvpackev.w D5,     D9,    D7 //c0[4] c0[5] c2[4] c2[5] c0[6] c0[7] c2[6] c2[7]
+    xvpermi.d  D5,     D5,    0xd8 //c0[4] c0[5] c0[6] c0[7] c2[4] c2[5] c2[6] c2[7]
+
+    xvand.v    D0,     D4,    D4
+    xvand.v    D1,     D5,    D5
+    xvpermi.q  D0,     D1,    0x02 //c0: 0 1 2 3 4 5 6 7
+    xvpermi.q  D1,     D4,    0x31 //c2: 0 1 2 3 4 5 6 7
+
+    xvpackod.w D4,     D8,    D6 //c1[0] c1[1] c3[0] c3[1] c1[2] c1[3] c3[2] c3[3]
+    xvpermi.d  D4,     D4,    0xd8 //c1[0] c1[1] c1[2] c1[3] c3[0] c3[1] c3[2] c3[3]
+    xvpackod.w D5,     D9,    D7 //c1[4] c1[5] c3[4] c3[5] c1[6] c1[7] c3[6] c3[7]
+    xvpermi.d  D5,     D5,    0xd8 //c1[4] c1[5] c1[6] c1[7] c3[4] c3[5] c3[6] c3[7]
+
+    xvand.v    D2,     D4,    D4
+    xvand.v    D3,     D5,    D5
+    xvpermi.q  D2,     D3,    0x02 //c1: 0 1 2 3 4 5 6 7
+    xvpermi.q  D3,     D4,    0x31 //c3: 0 1 2 3 4 5 6 7
+
+    xvst       D0,     C0,    0x00
+    xvst       D2,     C1,    0x00 // note the register order: D2 holds the c1 row, D1 the c2 row
+    xvst       D1,     C2,    0x00
+    xvst       D3,     C3,    0x00
+
+    //res04 res14 res24 res34 res05 res15 res25 res35
+    //res06 res16 res26 res36 res07 res17 res27 res37
+    xvld       D0,     C0,    0x20 // same sequence as above for the next 32 bytes of each C row, using U4..U7
+    xvld       D1,     C1,    0x20
+    xvld       D2,     C2,    0x20
+    xvld       D3,     C3,    0x20
+
+    xvpackev.w D4,     D1,    D0
+    xvpackev.w D5,     D3,    D2
+
+    xvand.v    D6,     D4,    D4
+    xvpermi.q  D6,     D5,    0x02
+    xvpermi.d  D6,     D6,    0xd8
+
+    xvand.v    D7,     D5,    D5
+    xvpermi.q  D7,     D4,    0x31
+    xvpermi.d  D7,     D7,    0xd8
+
+    xvpackod.w D4,     D1,    D0
+    xvpackod.w D5,     D3,    D2
+
+    xvand.v    D8,     D4,    D4
+    xvpermi.q  D8,     D5,    0x02
+    xvpermi.d  D8,     D8,    0xd8
+
+    xvand.v    D9,     D5,    D5
+    xvpermi.q  D9,     D4,    0x31
+    xvpermi.d  D9,     D9,    0xd8
+
+    xvfmul.s      D6,    U4,    VALPHAR   // overwrites D6, as in the first half
+    xvfmul.s      D8,    U5,    VALPHAR
+    XVNMSUB      D6,    U5,    VALPHAI, D6
+    XVFMADD      D8,    U4,    VALPHAI, D8
+
+    xvfmul.s      D7,    U6,    VALPHAR
+    xvfmul.s      D9,    U7,    VALPHAR
+    XVNMSUB      D7,    U7,    VALPHAI, D7
+    XVFMADD      D9,    U6,    VALPHAI, D9
+
+    xvpackev.w D4,     D8,    D6
+    xvpermi.d  D4,     D4,    0xd8
+    xvpackev.w D5,     D9,    D7
+    xvpermi.d  D5,     D5,    0xd8
+
+    xvand.v    D0,     D4,    D4
+    xvand.v    D1,     D5,    D5
+    xvpermi.q  D0,     D1,    0x02
+    xvpermi.q  D1,     D4,    0x31
+
+    xvpackod.w D4,     D8,    D6
+    xvpermi.d  D4,     D4,    0xd8
+    xvpackod.w D5,     D9,    D7
+    xvpermi.d  D5,     D5,    0xd8
+
+    xvand.v    D2,     D4,    D4
+    xvand.v    D3,     D5,    D5
+    xvpermi.q  D2,     D3,    0x02
+    xvpermi.q  D3,     D4,    0x31
+
+    xvst       D0,     C0,    0x20
+    xvst       D2,     C1,    0x20
+    xvst       D1,     C2,    0x20
+    xvst       D3,     C3,    0x20
+
+    addi.d     C0,     C0,    0x40 // advance each C pointer past the 64 bytes just written
+    addi.d     C1,     C1,    0x40
+    addi.d     C2,     C2,    0x40
+    addi.d     C3,     C3,    0x40
+#else
+    //res00 res10 res20 res30 res01 res11 res21 res31
+    //res02 res12 res22 res32 res03 res13 res23 res33
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3 4 5 6 7
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3 4 5 6 7
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3 4 5 6 7
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3 4 5 6 7
+
+    xvpackev.w D4,     D1,    D0   //c0[0] c1[0] c0[2] c1[2] c0[4] c1[4] c0[6] c1[6]
+    xvpackev.w D5,     D3,    D2   //c2[0] c3[0] c2[2] c3[2] c2[4] c3[4] c2[6] c3[6]
+
+    xvand.v    D6,     D4,    D4
+    xvpermi.q  D6,     D5,    0x02 //c0[0] c1[0] c0[2] c1[2] c2[0] c3[0] c2[2] c3[2]
+    xvpermi.d  D6,     D6,    0xd8 //c0[0] c1[0] c2[0] c3[0] c0[2] c1[2] c2[2] c3[2]
+
+    xvand.v    D7,     D5,    D5
+    xvpermi.q  D7,     D4,    0x31 //c0[4] c1[4] c0[6] c1[6] c2[4] c3[4] c2[6] c3[6]
+    xvpermi.d  D7,     D7,    0xd8 //c0[4] c1[4] c2[4] c3[4] c0[6] c1[6] c2[6] c3[6]
+
+    xvpackod.w D4,     D1,    D0   //c0[1] c1[1] c0[3] c1[3] c0[5] c1[5] c0[7] c1[7]
+    xvpackod.w D5,     D3,    D2   //c2[1] c3[1] c2[3] c3[3] c2[5] c3[5] c2[7] c3[7]
+
+    xvand.v    D8,     D4,    D4
+    xvpermi.q  D8,     D5,    0x02 //c0[1] c1[1] c0[3] c1[3] c2[1] c3[1] c2[3] c3[3]
+    xvpermi.d  D8,     D8,    0xd8 //c0[1] c1[1] c2[1] c3[1] c0[3] c1[3] c2[3] c3[3]
+
+    xvand.v    D9,     D5,    D5
+    xvpermi.q  D9,     D4,    0x31 //c0[5] c1[5] c0[7] c1[7] c2[5] c3[5] c2[7] c3[7]
+    xvpermi.d  D9,     D9,    0xd8 //c0[5] c1[5] c2[5] c3[5] c0[7] c1[7] c2[7] c3[7]
+
+    XVFMADD      D6,    U0,    VALPHAR, D6   // presumably D6 += U0 * VALPHAR, on top of the gathered C values (macro not visible here)
+    XVFMADD      D8,    U1,    VALPHAR, D8   // presumably D8 += U1 * VALPHAR
+    XVNMSUB      D6,    U1,    VALPHAI, D6   // presumably D6 -= U1 * VALPHAI
+    XVFMADD      D8,    U0,    VALPHAI, D8   // presumably D8 += U0 * VALPHAI
+
+    XVFMADD      D7,    U2,    VALPHAR, D7
+    XVFMADD      D9,    U3,    VALPHAR, D9
+    XVNMSUB      D7,    U3,    VALPHAI, D7
+    XVFMADD      D9,    U2,    VALPHAI, D9
+
+    xvpackev.w D4,     D8,    D6 //c0[0] c0[1] c2[0] c2[1] c0[2] c0[3] c2[2] c2[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[0] c0[1] c0[2] c0[3] c2[0] c2[1] c2[2] c2[3]
+    xvpackev.w D5,     D9,    D7 //c0[4] c0[5] c2[4] c2[5] c0[6] c0[7] c2[6] c2[7]
+    xvpermi.d  D5,     D5,    0xd8 //c0[4] c0[5] c0[6] c0[7] c2[4] c2[5] c2[6] c2[7]
+
+    xvand.v    D0,     D4,    D4
+    xvand.v    D1,     D5,    D5
+    xvpermi.q  D0,     D1,    0x02 //c0: 0 1 2 3 4 5 6 7
+    xvpermi.q  D1,     D4,    0x31 //c2: 0 1 2 3 4 5 6 7
+
+    xvpackod.w D4,     D8,    D6 //c1[0] c1[1] c3[0] c3[1] c1[2] c1[3] c3[2] c3[3]
+    xvpermi.d  D4,     D4,    0xd8 //c1[0] c1[1] c1[2] c1[3] c3[0] c3[1] c3[2] c3[3]
+    xvpackod.w D5,     D9,    D7 //c1[4] c1[5] c3[4] c3[5] c1[6] c1[7] c3[6] c3[7]
+    xvpermi.d  D5,     D5,    0xd8 //c1[4] c1[5] c1[6] c1[7] c3[4] c3[5] c3[6] c3[7]
+
+    xvand.v    D2,     D4,    D4
+    xvand.v    D3,     D5,    D5
+    xvpermi.q  D2,     D3,    0x02 //c1: 0 1 2 3 4 5 6 7
+    xvpermi.q  D3,     D4,    0x31 //c3: 0 1 2 3 4 5 6 7
+
+    xvst       D0,     C0,    0x00
+    xvst       D2,     C1,    0x00 // note the register order: D2 holds the c1 row, D1 the c2 row
+    xvst       D1,     C2,    0x00
+    xvst       D3,     C3,    0x00
+
+    //res04 res14 res24 res34 res05 res15 res25 res35
+    //res06 res16 res26 res36 res07 res17 res27 res37
+    xvld       D0,     C0,    0x20 // same sequence as above for the next 32 bytes of each C row, using U4..U7
+    xvld       D1,     C1,    0x20
+    xvld       D2,     C2,    0x20
+    xvld       D3,     C3,    0x20
+
+    xvpackev.w D4,     D1,    D0
+    xvpackev.w D5,     D3,    D2
+
+    xvand.v    D6,     D4,    D4
+    xvpermi.q  D6,     D5,    0x02
+    xvpermi.d  D6,     D6,    0xd8
+
+    xvand.v    D7,     D5,    D5
+    xvpermi.q  D7,     D4,    0x31
+    xvpermi.d  D7,     D7,    0xd8
+
+    xvpackod.w D4,     D1,    D0
+    xvpackod.w D5,     D3,    D2
+
+    xvand.v    D8,     D4,    D4
+    xvpermi.q  D8,     D5,    0x02
+    xvpermi.d  D8,     D8,    0xd8
+
+    xvand.v    D9,     D5,    D5
+    xvpermi.q  D9,     D4,    0x31
+    xvpermi.d  D9,     D9,    0xd8
+
+    XVFMADD      D6,    U4,    VALPHAR, D6
+    XVFMADD      D8,    U5,    VALPHAR, D8
+    XVNMSUB      D6,    U5,    VALPHAI, D6
+    XVFMADD      D8,    U4,    VALPHAI, D8
+
+    XVFMADD      D7,    U6,    VALPHAR, D7
+    XVFMADD      D9,    U7,    VALPHAR, D9
+    XVNMSUB      D7,    U7,    VALPHAI, D7
+    XVFMADD      D9,    U6,    VALPHAI, D9
+
+    xvpackev.w D4,     D8,    D6
+    xvpermi.d  D4,     D4,    0xd8
+    xvpackev.w D5,     D9,    D7
+    xvpermi.d  D5,     D5,    0xd8
+
+    xvand.v    D0,     D4,    D4
+    xvand.v    D1,     D5,    D5
+    xvpermi.q  D0,     D1,    0x02
+    xvpermi.q  D1,     D4,    0x31
+
+    xvpackod.w D4,     D8,    D6
+    xvpermi.d  D4,     D4,    0xd8
+    xvpackod.w D5,     D9,    D7
+    xvpermi.d  D5,     D5,    0xd8
+
+    xvand.v    D2,     D4,    D4
+    xvand.v    D3,     D5,    D5
+    xvpermi.q  D2,     D3,    0x02
+    xvpermi.q  D3,     D4,    0x31
+
+    xvst       D0,     C0,    0x20
+    xvst       D2,     C1,    0x20
+    xvst       D1,     C2,    0x20
+    xvst       D3,     C3,    0x20
+
+    addi.d     C0,     C0,    0x40 // advance each C pointer past the 64 bytes just written
+    addi.d     C1,     C1,    0x40
+    addi.d     C2,     C2,    0x40
+    addi.d     C3,     C3,    0x40
+#endif
+
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF    // TL = K - OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -8     // TL = K - OFF - 8
+#else
+    addi.d     TL,     TL,   -4     // TL = K - OFF - 4
+#endif
+    slli.d     T3,     TL,   0x06   // T3 = TL * 64 (A bytes per k step)
+    add.d      A0,     A0,   T3     // move A0 past the TL k-steps not consumed by this tile
+    slli.d     T3,     TL,   0x05   // T3 = TL * 32 (B bytes per k step)
+    add.d      B0,     B0,   T3     // same for B0
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   8     // OFF += 8 after the 8-row tile
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+.L150:  /* test bit 2 of M: is there a 4-row remainder tile? */
+    move       I,      $r0
+    andi       T0,     M,     4       // T0 = M & 4
+    beq        I,      T0,    .L18    // bit clear: skip the 4-row tile
+
+.L15:  /* if (bm & 4): set up the 4-row remainder tile (accumulators U0..U3) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,  0x05   // T3 = OFF * 32 (A0 advances 0x20 bytes per k in .L16)
+    add.d      A0,     A0,   T3     // skip the first OFF k-steps of A
+    slli.d     T3,     OFF,  0x05   // T3 = OFF * 32 (B0 advances 0x20 bytes per k in .L16)
+    add.d      B0,     B,    T3     // B0 = B + OFF k-steps
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF   // temp = K - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   4     // temp = OFF + 4; NOTE(review): identical to the #else arm below
+#else
+    addi.d     TL,     OFF,   4     // temp = OFF + 4
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    xvxor.v    U0,     U0,   U0     // x ^ x = 0: clear the four accumulators
+    xvxor.v    U1,     U1,   U1
+    xvxor.v    U2,     U2,   U2
+    xvxor.v    U3,     U3,   U3
+
+    move       L,      $r0   //cycle param k, starts at 0
+    beq        L,      TL,    .L17  // TL == 0: no k iterations, go straight to the store
+    blt        TL,     L,     .L17  // TL < 0: likewise
+
+.L16:  /* for (k=0; k<temp; k++): one k step of the 4x4 tile, accumulated into U0..U3 */
+    xvld       D0,     A0,    0x00  // a0ri a1ri a2ri a3ri
+    xvld       D2,     B0,    0x00  // b0ri b1ri b2ri b3ri
+
+    xvand.v    D1,     D0,    D0    // D1 = D0 (x & x is a plain copy)
+    xvpermi.q  D1,     D0,    0x02  //a0ri a1ri a0ri a1ri
+    xvpermi.d  D1,     D1,    0xd8  //a0ri a0ri a1ri a1ri
+
+    xvand.v    D4,     D1,    D1
+    xvand.v    D5,     D1,    D1
+    xvpermi.w  D4,     D4,    0x00  //a0rrrr a1rrrr
+    xvpermi.w  D5,     D5,    0x55  //a0iiii a1iiii
+
+    xvpackev.w D6,     D2,    D2    //b0rr b1rr b2rr b3rr
+    xvpermi.w  D6,     D6,    0x88  //b0r b1r b0r b1r b2r b3r b2r b3r
+    xvpermi.d  D6,     D6,    0xd8  //b0r b1r b2r b3r b0r b1r b2r b3r
+
+    xvpackod.w D7,     D2,    D2    //b0ii b1ii b2ii b3ii
+    xvpermi.w  D7,     D7,    0x88  //b0i b1i b0i b1i b2i b3i b2i b3i
+    xvpermi.d  D7,     D7,    0xd8  //b0i b1i b2i b3i b0i b1i b2i b3i
+
+    XVMADD1    U0,     D4,    D6,     U0  //00r 10r 20r 30r 01r 11r 21r 31r
+    XVMADD2    U1,     D5,    D6,     U1  //00i 10i 20i 30i 01i 11i 21i 31i
+    XVMADD3    U0,     D5,    D7,     U0  // a_i * b_i term; sign comes from the XVMADD3 macro (defined outside this view)
+    XVMADD4    U1,     D4,    D7,     U1  // a_r * b_i term; sign comes from the XVMADD4 macro (defined outside this view)
+
+    xvand.v    D1,     D0,    D0
+    xvpermi.q  D1,     D0,    0x31  //a2ri a3ri a2ri a3ri
+    xvpermi.d  D1,     D1,    0xd8  //a2ri a2ri a3ri a3ri
+
+    xvand.v    D4,     D1,    D1
+    xvand.v    D5,     D1,    D1
+    xvpermi.w  D4,     D4,    0x00  //a2rrrr a3rrrr
+    xvpermi.w  D5,     D5,    0x55  //a2iiii a3iiii
+
+    XVMADD1    U2,     D4,    D6,     U2  //02r 12r 22r 32r 03r 13r 23r 33r
+    XVMADD2    U3,     D5,    D6,     U3  //02i 12i 22i 32i 03i 13i 23i 33i
+    XVMADD3    U2,     D5,    D7,     U2
+    XVMADD4    U3,     D4,    D7,     U3
+
+    addi.d     A0,     A0,    0x20  // next k step of A: 32 bytes (one 32-byte load above)
+    addi.d     B0,     B0,    0x20  // next k step of B: 32 bytes
+
+    addi.d     L,      L,     1
+    blt        L,      TL,     .L16 // loop while L < TL
+
+.L17:
+#if defined(TRMMKERNEL)
+    //res00 res10 res20 res30 res01 res11 res21 res31
+    //res02 res12 res22 res32 res03 res13 res23 res33
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3 4 5 6 7
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3 4 5 6 7
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3 4 5 6 7
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3 4 5 6 7
+
+    xvpackev.w D4,     D1,    D0   //c0[0] c1[0] c0[2] c1[2] c0[4] c1[4] c0[6] c1[6]
+    xvpackev.w D5,     D3,    D2   //c2[0] c3[0] c2[2] c3[2] c2[4] c3[4] c2[6] c3[6]
+
+    xvand.v    D6,     D4,    D4
+    xvpermi.q  D6,     D5,    0x02 //c0[0] c1[0] c0[2] c1[2] c2[0] c3[0] c2[2] 03[2]
+    xvpermi.d  D6,     D6,    0xd8 //c0[0] c1[0] c2[0] c3[0] c0[2] c1[2] c2[2] 03[2]
+
+    xvand.v    D7,     D5,    D5
+    xvpermi.q  D7,     D4,    0x31 //c0[4] c1[4] c0[6] c1[6] c2[4] c3[4] c2[6] 03[6]
+    xvpermi.d  D7,     D7,    0xd8 //c0[4] c1[4] c2[4] c3[4] c0[6] c1[6] c2[6] 03[6]
+
+    xvpackod.w D4,     D1,    D0   //c0[1] c1[1] c0[3] c1[3] c0[5] c1[5] c0[7] c1[7]
+    xvpackod.w D5,     D3,    D2   //c2[1] c3[1] c2[3] c3[3] c2[5] c3[5] c2[7] c3[7]
+
+    xvand.v    D8,     D4,    D4
+    xvpermi.q  D8,     D5,    0x02 //c0[1] c1[1] c0[3] c1[3] c2[1] c3[1] c2[3] 03[3]
+    xvpermi.d  D8,     D8,    0xd8 //c0[1] c1[1] c2[1] c3[1] c0[3] c1[3] c2[3] 03[3]
+
+    xvand.v    D9,     D5,    D5
+    xvpermi.q  D9,     D4,    0x31 //c0[5] c1[5] c0[7] c1[7] c2[5] c3[5] c2[7] 03[7]
+    xvpermi.d  D9,     D9,    0xd8 //c0[5] c1[5] c2[5] c3[5] c0[7] c1[7] c2[7] 03[7]
+
+    xvfmul.s      D6,    U0,    VALPHAR
+    xvfmul.s      D8,    U1,    VALPHAR
+    XVNMSUB      D6,    U1,    VALPHAI, D6
+    XVFMADD      D8,    U0,    VALPHAI, D8
+
+    xvfmul.s      D7,    U2,    VALPHAR
+    xvfmul.s      D9,    U3,    VALPHAR
+    XVNMSUB      D7,    U3,    VALPHAI, D7
+    XVFMADD      D9,    U2,    VALPHAI, D9
+
+    xvpackev.w D4,     D8,    D6 //c0[0] c0[1] c2[0] c2[1] c0[2] c0[3] c2[2] c2[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[0] c0[1] c0[2] c0[3] c2[0] c2[1] c2[2] c2[3]
+    xvpackev.w D5,     D9,    D7 //c0[4] c0[5] c2[4] c2[5] c0[6] c0[7] c2[6] c2[7]
+    xvpermi.d  D5,     D5,    0xd8 //c0[4] c0[5] c0[6] c0[7] c2[4] c2[5] c2[6] c2[7]
+
+    xvand.v    D0,     D4,    D4
+    xvand.v    D1,     D5,    D5
+    xvpermi.q  D0,     D1,    0x02 //c0: 0 1 2 3 4 5 6 7
+    xvpermi.q  D1,     D4,    0x31 //c2: 0 1 2 3 4 5 6 7
+
+    xvpackod.w D4,     D8,    D6 //c1[0] c1[1] c3[0] c3[1] c1[2] c1[3] c3[2] c3[3]
+    xvpermi.d  D4,     D4,    0xd8 //c1[0] c1[1] c1[2] c1[3] c3[0] c3[1] c3[2] c3[3]
+    xvpackod.w D5,     D9,    D7 //c1[4] c1[5] c3[4] c3[5] c1[6] c1[7] c3[6] c3[7]
+    xvpermi.d  D5,     D5,    0xd8 //c1[4] c1[5] c1[6] c1[7] c3[4] c3[5] c3[6] c3[7]
+
+    xvand.v    D2,     D4,    D4
+    xvand.v    D3,     D5,    D5
+    xvpermi.q  D2,     D3,    0x02 //c1: 0 1 2 3 4 5 6 7
+    xvpermi.q  D3,     D4,    0x31 //c3: 0 1 2 3 4 5 6 7
+
+    xvst       D0,     C0,    0x00
+    xvst       D2,     C1,    0x00
+    xvst       D1,     C2,    0x00
+    xvst       D3,     C3,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+    addi.d     C2,     C2,    0x20
+    addi.d     C3,     C3,    0x20
+#else
+    //res00 res10 res20 res30 res01 res11 res21 res31
+    //res02 res12 res22 res32 res03 res13 res23 res33
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3 4 5 6 7
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3 4 5 6 7
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3 4 5 6 7
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3 4 5 6 7
+
+    xvpackev.w D4,     D1,    D0   //c0[0] c1[0] c0[2] c1[2] c0[4] c1[4] c0[6] c1[6]
+    xvpackev.w D5,     D3,    D2   //c2[0] c3[0] c2[2] c3[2] c2[4] c3[4] c2[6] c3[6]
+
+    xvand.v    D6,     D4,    D4
+    xvpermi.q  D6,     D5,    0x02 //c0[0] c1[0] c0[2] c1[2] c2[0] c3[0] c2[2] 03[2]
+    xvpermi.d  D6,     D6,    0xd8 //c0[0] c1[0] c2[0] c3[0] c0[2] c1[2] c2[2] 03[2]
+
+    xvand.v    D7,     D5,    D5
+    xvpermi.q  D7,     D4,    0x31 //c0[4] c1[4] c0[6] c1[6] c2[4] c3[4] c2[6] 03[6]
+    xvpermi.d  D7,     D7,    0xd8 //c0[4] c1[4] c2[4] c3[4] c0[6] c1[6] c2[6] 03[6]
+
+    xvpackod.w D4,     D1,    D0   //c0[1] c1[1] c0[3] c1[3] c0[5] c1[5] c0[7] c1[7]
+    xvpackod.w D5,     D3,    D2   //c2[1] c3[1] c2[3] c3[3] c2[5] c3[5] c2[7] c3[7]
+
+    xvand.v    D8,     D4,    D4
+    xvpermi.q  D8,     D5,    0x02 //c0[1] c1[1] c0[3] c1[3] c2[1] c3[1] c2[3] 03[3]
+    xvpermi.d  D8,     D8,    0xd8 //c0[1] c1[1] c2[1] c3[1] c0[3] c1[3] c2[3] 03[3]
+
+    xvand.v    D9,     D5,    D5
+    xvpermi.q  D9,     D4,    0x31 //c0[5] c1[5] c0[7] c1[7] c2[5] c3[5] c2[7] 03[7]
+    xvpermi.d  D9,     D9,    0xd8 //c0[5] c1[5] c2[5] c3[5] c0[7] c1[7] c2[7] 03[7]
+
+    XVFMADD      D6,    U0,    VALPHAR, D6
+    XVFMADD      D8,    U1,    VALPHAR, D8
+    XVNMSUB      D6,    U1,    VALPHAI, D6
+    XVFMADD      D8,    U0,    VALPHAI, D8
+
+    XVFMADD      D7,    U2,    VALPHAR, D7
+    XVFMADD      D9,    U3,    VALPHAR, D9
+    XVNMSUB      D7,    U3,    VALPHAI, D7
+    XVFMADD      D9,    U2,    VALPHAI, D9
+
+    xvpackev.w D4,     D8,    D6 //c0[0] c0[1] c2[0] c2[1] c0[2] c0[3] c2[2] c2[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[0] c0[1] c0[2] c0[3] c2[0] c2[1] c2[2] c2[3]
+    xvpackev.w D5,     D9,    D7 //c0[4] c0[5] c2[4] c2[5] c0[6] c0[7] c2[6] c2[7]
+    xvpermi.d  D5,     D5,    0xd8 //c0[4] c0[5] c0[6] c0[7] c2[4] c2[5] c2[6] c2[7]
+
+    xvand.v    D0,     D4,    D4
+    xvand.v    D1,     D5,    D5
+    xvpermi.q  D0,     D1,    0x02 //c0: 0 1 2 3 4 5 6 7
+    xvpermi.q  D1,     D4,    0x31 //c2: 0 1 2 3 4 5 6 7
+
+    xvpackod.w D4,     D8,    D6 //c1[0] c1[1] c3[0] c3[1] c1[2] c1[3] c3[2] c3[3]
+    xvpermi.d  D4,     D4,    0xd8 //c1[0] c1[1] c1[2] c1[3] c3[0] c3[1] c3[2] c3[3]
+    xvpackod.w D5,     D9,    D7 //c1[4] c1[5] c3[4] c3[5] c1[6] c1[7] c3[6] c3[7]
+    xvpermi.d  D5,     D5,    0xd8 //c1[4] c1[5] c1[6] c1[7] c3[4] c3[5] c3[6] c3[7]
+
+    xvand.v    D2,     D4,    D4
+    xvand.v    D3,     D5,    D5
+    xvpermi.q  D2,     D3,    0x02 //c1: 0 1 2 3 4 5 6 7
+    xvpermi.q  D3,     D4,    0x31 //c3: 0 1 2 3 4 5 6 7
+
+    xvst       D0,     C0,    0x00
+    xvst       D2,     C1,    0x00
+    xvst       D1,     C2,    0x00
+    xvst       D3,     C3,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+    addi.d     C2,     C2,    0x20
+    addi.d     C3,     C3,    0x20
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -4
+#else
+    addi.d     TL,     TL,   -4
+#endif
+    slli.d     T3,     TL,   0x05
+    add.d      A0,     A0,   T3
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   4
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L18:   /* if (bm & 2): set up the 2-row x 4-column tile; skipped when bit 1 of M is clear */
+    move       I,      $r0
+    andi       T0,     M,     2
+    beq        I,      T0,    .L183    //(M & 2) == 0 -> go to the (bm & 1) tile
+
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb (repeats the unconditional move above)
+#else
+    slli.d     T3,     OFF,  0x04    //OFF * 0x10: A0 advances 0x10 per k step in .L181
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x05    //OFF * 0x20: B0 advances 0x20 per k step in .L181
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF    //trip count = K - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2      //trip count = OFF + 2
+#else
+    addi.d     TL,     OFF,   4      //trip count = OFF + 4
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    $vr2,     $vr2,   $vr2    //zero the four accumulators used by .L181
+    vxor.v    $vr3,     $vr3,   $vr3
+    vxor.v    $vr4,     $vr4,   $vr4
+    vxor.v    $vr5,     $vr5,   $vr5
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L182    //TL == 0 -> no k iterations
+    blt        TL,     L,     .L182    //TL <  0 -> no k iterations
+
+.L181:  /* for (k=0; k<temp; k++): one k step of the 2x4 tile, temp is TL */
+    vld       $vr16,     A0,    0x00  // a0ri a1ri
+    vld       $vr18,     B0,    0x00  // b0ri b1ri
+    vld       $vr19,     B0,    0x10  // b2ri b3ri
+
+    vshuf4i.w  $vr20,     $vr16,    0x00  //a0r
+    vshuf4i.w  $vr21,     $vr16,    0x55  //a0i
+
+    vpackev.w  $vr22,     $vr19,    $vr18  //even words: b0r b2r b1r b3r
+    vshuf4i.w  $vr22,     $vr22,    0xd8  //b0r b1r b2r b3r
+
+    vpackod.w  $vr23,     $vr19,    $vr18  //odd words: b0i b2i b1i b3i
+    vshuf4i.w  $vr23,     $vr23,    0xd8  //b0i b1i b2i b3i
+
+    VMADD1    $vr2,     $vr20,    $vr22,     $vr2  //00r 10r 20r 30r
+    VMADD2    $vr3,     $vr21,    $vr22,     $vr3  //00i 10i 20i 30i
+    VMADD3    $vr2,     $vr21,    $vr23,     $vr2  //a0i * b*i term folded into $vr2
+    VMADD4    $vr3,     $vr20,    $vr23,     $vr3  //a0r * b*i term folded into $vr3
+
+    vshuf4i.w  $vr20,     $vr16,    0xaa  //a1r
+    vshuf4i.w  $vr21,     $vr16,    0xff  //a1i
+
+    VMADD1    $vr4,     $vr20,    $vr22,     $vr4  //01r 11r 21r 31r
+    VMADD2    $vr5,     $vr21,    $vr22,     $vr5  //01i 11i 21i 31i
+    VMADD3    $vr4,     $vr21,    $vr23,     $vr4  //a1i * b*i term folded into $vr4
+    VMADD4    $vr5,     $vr20,    $vr23,     $vr5  //a1r * b*i term folded into $vr5
+
+    addi.d     A0,     A0,    0x10    //next k step of A (0x10 bytes loaded above)
+    addi.d     B0,     B0,    0x20    //next k step of B (0x20 bytes loaded above)
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L181
+
+.L182:  //write back the 2x4 tile: scale $vr2..$vr5 by $vr28/$vr29 and store 0x10 bytes to each of C0..C3
+#if defined(TRMMKERNEL)
+    //res00 res10 res20 res30 (accumulators $vr2 = real parts, $vr3 = imaginary parts)
+    vld       $vr16,     C0,    0x00 //c0: 0 1 2 3
+    vld       $vr17,     C1,    0x00 //c1: 0 1 2 3
+    vld       $vr18,     C2,    0x00 //c2: 0 1 2 3
+    vld       $vr19,     C3,    0x00 //c3: 0 1 2 3
+
+    vand.v    $vr20,     $vr17,    $vr17 //copy of c1 (vand of a register with itself)
+    vpermi.w  $vr20,     $vr16,    0x44 //c0:0 1, c1:0 1
+    vshuf4i.w  $vr22,     $vr20,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  $vr23,     $vr20,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    vand.v    $vr21,     $vr19,    $vr19 //copy of c3
+    vpermi.w  $vr21,     $vr18,    0x44 //c2:0 1, c3:0 1
+    vshuf4i.w  $vr24,     $vr21,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  $vr25,     $vr21,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    vpermi.w  $vr24,     $vr22,    0x44 //c0[0] c1[0] c2[0] c3[0]
+    vpermi.w  $vr25,     $vr23,    0x44 //c0[1] c1[1] c2[1] c3[1]
+
+    vfmul.s      $vr24,    $vr2,    $vr28 //overwrites $vr24: the C values gathered above are not added in this branch
+    vfmul.s      $vr25,    $vr3,    $vr28 //overwrites $vr25 likewise
+    VNMSUB      $vr24,    $vr3,    $vr29, $vr24
+    VFMADD      $vr25,    $vr2,    $vr29, $vr25
+
+    vand.v    $vr26,     $vr25,    $vr25 //c0[1] c1[1] c2[1] c3[1]
+    vand.v    $vr27,     $vr25,    $vr25 //c0[1] c1[1] c2[1] c3[1] (second copy of $vr25)
+
+    vpermi.w  $vr26,     $vr24,    0x44 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  $vr26,     $vr26,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    vpermi.w  $vr27,     $vr24,    0xee //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  $vr27,     $vr27,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res01 res11 res21 res31 (accumulators $vr4 = real parts, $vr5 = imaginary parts)
+    vand.v    $vr20,     $vr17,    $vr17
+    vpermi.w  $vr20,     $vr16,    0xee //c0:2 3, c1:2 3
+    vshuf4i.w  $vr22,     $vr20,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w  $vr23,     $vr20,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    vand.v    $vr21,     $vr19,    $vr19
+    vpermi.w  $vr21,     $vr18,    0xee //c2:2 3, c3:2 3
+    vshuf4i.w  $vr24,     $vr21,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w  $vr25,     $vr21,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    vpermi.w  $vr24,     $vr22,    0x44 //c0[2] c1[2] c2[2] c3[2]
+    vpermi.w  $vr25,     $vr23,    0x44 //c0[3] c1[3] c2[3] c3[3]
+
+    vfmul.s      $vr24,    $vr4,    $vr28 //overwrites $vr24 (no accumulation into C)
+    vfmul.s      $vr25,    $vr5,    $vr28 //overwrites $vr25 (no accumulation into C)
+    VNMSUB      $vr24,    $vr5,    $vr29, $vr24
+    VFMADD      $vr25,    $vr4,    $vr29, $vr25
+
+    vand.v    $vr20,     $vr25,    $vr25
+    vpermi.w  $vr20,     $vr24,    0x44 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w $vr20,     $vr20,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+    vand.v    $vr18,     $vr20,    $vr20 //keep a copy: $vr20 becomes the c0 row, $vr18 the c1 row
+
+    vand.v    $vr21,     $vr25,    $vr25
+    vpermi.w  $vr21,     $vr24,    0xee //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w $vr21,     $vr21,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+    vand.v    $vr19,     $vr21,    $vr21 //keep a copy: $vr21 becomes the c2 row, $vr19 the c3 row
+
+    vand.v    $vr16,     $vr26,    $vr26 //c0[0] c0[1] c1[0] c1[1]
+    vand.v    $vr17,     $vr27,    $vr27 //c2[0] c2[1] c3[0] c3[1]
+
+    vpermi.w  $vr20,     $vr16,     0x44 //c0: 0 1 2 3
+    vpermi.w  $vr18,     $vr16,     0xee //c1: 0 1 2 3
+    vpermi.w  $vr21,     $vr17,     0x44 //c2: 0 1 2 3
+    vpermi.w  $vr19,     $vr17,     0xee //c3: 0 1 2 3
+
+    vst       $vr20,     C0,    0x00
+    vst       $vr18,     C1,    0x00
+    vst       $vr21,     C2,    0x00
+    vst       $vr19,     C3,    0x00
+
+    addi.d     C0,     C0,    0x10    //step each column pointer past the 0x10 bytes just stored
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+#else
+    //res00 res10 res20 res30 (accumulators $vr2 = real parts, $vr3 = imaginary parts)
+    vld       $vr16,     C0,    0x00 //c0: 0 1 2 3
+    vld       $vr17,     C1,    0x00 //c1: 0 1 2 3
+    vld       $vr18,     C2,    0x00 //c2: 0 1 2 3
+    vld       $vr19,     C3,    0x00 //c3: 0 1 2 3
+
+    vand.v    $vr20,     $vr17,    $vr17 //copy of c1 (vand of a register with itself)
+    vpermi.w  $vr20,     $vr16,    0x44 //c0:0 1, c1:0 1
+    vshuf4i.w  $vr22,     $vr20,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  $vr23,     $vr20,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    vand.v    $vr21,     $vr19,    $vr19 //copy of c3
+    vpermi.w  $vr21,     $vr18,    0x44 //c2:0 1, c3:0 1
+    vshuf4i.w  $vr24,     $vr21,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  $vr25,     $vr21,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    vpermi.w  $vr24,     $vr22,    0x44 //c0[0] c1[0] c2[0] c3[0]
+    vpermi.w  $vr25,     $vr23,    0x44 //c0[1] c1[1] c2[1] c3[1]
+
+    VFMADD      $vr24,    $vr2,    $vr28, $vr24 //accumulates onto the C values gathered above
+    VFMADD      $vr25,    $vr3,    $vr28, $vr25
+    VNMSUB      $vr24,    $vr3,    $vr29, $vr24
+    VFMADD      $vr25,    $vr2,    $vr29, $vr25
+
+    vand.v    $vr26,     $vr25,    $vr25 //c0[1] c1[1] c2[1] c3[1]
+    vand.v    $vr27,     $vr25,    $vr25 //c0[1] c1[1] c2[1] c3[1] (second copy of $vr25)
+
+    vpermi.w  $vr26,     $vr24,    0x44 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  $vr26,     $vr26,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    vpermi.w  $vr27,     $vr24,    0xee //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  $vr27,     $vr27,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res01 res11 res21 res31 (accumulators $vr4 = real parts, $vr5 = imaginary parts)
+    vand.v    $vr20,     $vr17,    $vr17
+    vpermi.w  $vr20,     $vr16,    0xee //c0:2 3, c1:2 3
+    vshuf4i.w  $vr22,     $vr20,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w  $vr23,     $vr20,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    vand.v    $vr21,     $vr19,    $vr19
+    vpermi.w  $vr21,     $vr18,    0xee //c2:2 3, c3:2 3
+    vshuf4i.w  $vr24,     $vr21,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w  $vr25,     $vr21,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    vpermi.w  $vr24,     $vr22,    0x44 //c0[2] c1[2] c2[2] c3[2]
+    vpermi.w  $vr25,     $vr23,    0x44 //c0[3] c1[3] c2[3] c3[3]
+
+    VFMADD      $vr24,    $vr4,    $vr28, $vr24
+    VFMADD      $vr25,    $vr5,    $vr28, $vr25
+    VNMSUB      $vr24,    $vr5,    $vr29, $vr24
+    VFMADD      $vr25,    $vr4,    $vr29, $vr25
+
+    vand.v    $vr20,     $vr25,    $vr25
+    vpermi.w  $vr20,     $vr24,    0x44 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w $vr20,     $vr20,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+    vand.v    $vr18,     $vr20,    $vr20 //keep a copy: $vr20 becomes the c0 row, $vr18 the c1 row
+
+    vand.v    $vr21,     $vr25,    $vr25
+    vpermi.w  $vr21,     $vr24,    0xee //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w $vr21,     $vr21,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+    vand.v    $vr19,     $vr21,    $vr21 //keep a copy: $vr21 becomes the c2 row, $vr19 the c3 row
+
+    vand.v    $vr16,     $vr26,    $vr26 //c0[0] c0[1] c1[0] c1[1]
+    vand.v    $vr17,     $vr27,    $vr27 //c2[0] c2[1] c3[0] c3[1]
+
+    vpermi.w  $vr20,     $vr16,     0x44 //c0: 0 1 2 3
+    vpermi.w  $vr18,     $vr16,     0xee //c1: 0 1 2 3
+    vpermi.w  $vr21,     $vr17,     0x44 //c2: 0 1 2 3
+    vpermi.w  $vr19,     $vr17,     0xee //c3: 0 1 2 3
+
+    vst       $vr20,     C0,    0x00
+    vst       $vr18,     C1,    0x00
+    vst       $vr21,     C2,    0x00
+    vst       $vr19,     C3,    0x00
+
+    addi.d     C0,     C0,    0x10    //step each column pointer past the 0x10 bytes just stored
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2    //TL = K - (OFF + 2): k steps the loop above did not run
+#else
+    addi.d     TL,     TL,   -4    //TL = K - (OFF + 4): k steps the loop above did not run
+#endif
+    slli.d     T3,     TL,   0x04    //0x10 bytes of A per k step
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x05    //0x20 bytes of B per k step
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2    //two more rows handled
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L183:   /* if (bm & 1): set up the 1-row x 4-column tile (scalar path); skipped when bit 0 of M is clear */
+    move       I,      $r0
+    andi       T0,     M,     1
+    beq        I,      T0,    .L186    //(M & 1) == 0 -> finish this column panel
+
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb (repeats the unconditional move above)
+#else
+    slli.d     T3,     OFF,  0x03    //OFF * 0x08: A0 advances 0x08 per k step in .L184
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x05    //OFF * 0x20: B0 advances 0x20 per k step in .L184
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF    //trip count = K - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1      //trip count = OFF + 1
+#else
+    addi.d     TL,     OFF,   4      //trip count = OFF + 4
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0    //load the eight scalar accumulators from $r0
+    MTC        c12,    $r0
+    MTC        c21,    $r0
+    MTC        c22,    $r0
+    MTC        c31,    $r0
+    MTC        c32,    $r0
+    MTC        c41,    $r0
+    MTC        c42,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L185    //TL == 0 -> no k iterations
+    blt        TL,     L,     .L185    //TL <  0 -> no k iterations
+
+.L184:  /* for (k=0; k<temp; k++): one k step of the 1x4 tile, temp is TL */
+    LD         a1,     A0,    0x00        //a0r
+    LD         a2,     A0,    0x04        //a0i
+
+    LD         b1,     B0,    0x00        //b0r
+    LD         b2,     B0,    0x04        //b0i
+    LD         b3,     B0,    0x08        //b1r
+    LD         b4,     B0,    0x0c        //b1i
+    LD         b5,     B0,    0x10        //b2r
+    LD         b6,     B0,    0x14        //b2i
+    LD         b7,     B0,    0x18        //b3r
+    LD         b8,     B0,    0x1c        //b3i
+
+    MADD1      c11,    a1,    b1,     c11  //res00r
+    MADD2      c12,    a2,    b1,     c12  //res00i
+    MADD3      c11,    a2,    b2,     c11  //a0i * b0i term folded into c11
+    MADD4      c12,    a1,    b2,     c12  //a0r * b0i term folded into c12
+
+    MADD1      c21,    a1,    b3,     c21  //res10r
+    MADD2      c22,    a2,    b3,     c22  //res10i
+    MADD3      c21,    a2,    b4,     c21
+    MADD4      c22,    a1,    b4,     c22
+
+    MADD1      c31,    a1,    b5,     c31  //res20r
+    MADD2      c32,    a2,    b5,     c32  //res20i
+    MADD3      c31,    a2,    b6,     c31
+    MADD4      c32,    a1,    b6,     c32
+
+    MADD1      c41,    a1,    b7,     c41  //res30r
+    MADD2      c42,    a2,    b7,     c42  //res30i
+    MADD3      c41,    a2,    b8,     c41
+    MADD4      c42,    a1,    b8,     c42
+
+    addi.d     A0,     A0,    0x08    //next k step of A (0x08 bytes loaded above)
+    addi.d     B0,     B0,    0x20    //next k step of B (0x20 bytes loaded above)
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L184
+
+.L185:  //write back the 1x4 tile: one real/imaginary pair to each of C0..C3
+#if defined(TRMMKERNEL)
+    //res00 res10 res20 res30
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+
+    MUL       a5,     c11,   ALPHA_R    //overwrites a5: the value loaded from C0 is not added in this branch
+    MUL       a6,     c12,   ALPHA_R    //overwrites a6 likewise
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+
+    LD         a5,     C1,    0x00    //C1[0]
+    LD         a6,     C1,    0x04    //C1[1]
+
+    MUL       a5,     c21,   ALPHA_R
+    MUL       a6,     c22,   ALPHA_R
+    NMSUB      a5,     c22,   ALPHA_I, a5
+    MADD       a6,     c21,   ALPHA_I, a6
+
+    ST         a5,     C1,    0x00
+    ST         a6,     C1,    0x04
+
+    LD         a5,     C2,    0x00    //C2[0]
+    LD         a6,     C2,    0x04    //C2[1]
+
+    MUL       a5,     c31,   ALPHA_R
+    MUL       a6,     c32,   ALPHA_R
+    NMSUB      a5,     c32,   ALPHA_I, a5
+    MADD       a6,     c31,   ALPHA_I, a6
+
+    ST         a5,     C2,    0x00
+    ST         a6,     C2,    0x04
+
+    LD         a5,     C3,    0x00    //C3[0]
+    LD         a6,     C3,    0x04    //C3[1]
+
+    MUL       a5,     c41,   ALPHA_R
+    MUL       a6,     c42,   ALPHA_R
+    NMSUB      a5,     c42,   ALPHA_I, a5
+    MADD       a6,     c41,   ALPHA_I, a6
+
+    ST         a5,     C3,    0x00
+    ST         a6,     C3,    0x04
+
+    addi.d     C0,     C0,    0x08    //step each column pointer past the 0x08 bytes just stored
+    addi.d     C1,     C1,    0x08
+    addi.d     C2,     C2,    0x08
+    addi.d     C3,     C3,    0x08
+#else
+    //res00 res10 res20 res30
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5    //accumulates onto the value loaded from C0
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+
+    LD         a5,     C1,    0x00    //C1[0]
+    LD         a6,     C1,    0x04    //C1[1]
+
+    MADD       a5,     c21,   ALPHA_R, a5
+    MADD       a6,     c22,   ALPHA_R, a6
+    NMSUB      a5,     c22,   ALPHA_I, a5
+    MADD       a6,     c21,   ALPHA_I, a6
+
+    ST         a5,     C1,    0x00
+    ST         a6,     C1,    0x04
+
+    LD         a5,     C2,    0x00    //C2[0]
+    LD         a6,     C2,    0x04    //C2[1]
+
+    MADD       a5,     c31,   ALPHA_R, a5
+    MADD       a6,     c32,   ALPHA_R, a6
+    NMSUB      a5,     c32,   ALPHA_I, a5
+    MADD       a6,     c31,   ALPHA_I, a6
+
+    ST         a5,     C2,    0x00
+    ST         a6,     C2,    0x04
+
+    LD         a5,     C3,    0x00    //C3[0]
+    LD         a6,     C3,    0x04    //C3[1]
+
+    MADD       a5,     c41,   ALPHA_R, a5
+    MADD       a6,     c42,   ALPHA_R, a6
+    NMSUB      a5,     c42,   ALPHA_I, a5
+    MADD       a6,     c41,   ALPHA_I, a6
+
+    ST         a5,     C3,    0x00
+    ST         a6,     C3,    0x04
+
+    addi.d     C0,     C0,    0x08    //step each column pointer past the 0x08 bytes just stored
+    addi.d     C1,     C1,    0x08
+    addi.d     C2,     C2,    0x08
+    addi.d     C3,     C3,    0x08
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1    //TL = K - (OFF + 1): k steps the loop above did not run
+#else
+    addi.d     TL,     TL,   -4    //TL = K - (OFF + 4): k steps the loop above did not run
+#endif
+    slli.d     T3,     TL,   0x03    //0x08 bytes of A per k step
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x05    //0x20 bytes of B per k step
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1    //one more row handled
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+
+.L186:  //end of one 4-column panel: advance B and C, then repeat .L10 while J < N >> 2
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi.d     OFF,    OFF,   4    //four more columns handled
+#endif
+
+    slli.d     L,      K,     0x05    //K * 0x20: B bytes consumed per k step for 4 columns
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   0x03    //LDC * 8; .L20 uses LDC * 2 as the one-column stride, so this skips 4 columns
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     1
+    srai.d     T0,     N,     2    //T0 = N >> 2 (recomputed each pass)
+    blt        J,      T0,    .L10
+
+.L19:  //2-column remainder panel: skipped entirely when (N & 2) == 0
+    move       J,      $r0
+    andi       T0,     N,     2
+    beq        J,      T0,    .L30    //(N & 2) == 0 -> go to .L30
+
+.L20: /* for (j=0; j<(bn&2); j+=2) */
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET    //restart the offset for this panel
+#endif
+
+    move       C0,     C
+    slli.d     TL,     LDC,   1    //LDC * 2 = distance from column C0 to column C1
+    add.d      C1,     C0,    TL
+    move       A0,     A    //ptrba
+
+    move       I,      $r0
+    srai.d     T0,     M,     4  //bm/16
+    beq        I,      T0,    .L21    //no full 16-row tiles -> go to the (bm & 8) tile
+
+.L201:  /* for (i=0; i<bm/16; i+=1): set up one 16-row x 2-column tile */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb (repeats the unconditional move above)
+#else
+    slli.d     T3,     OFF,  0x07    //OFF * 0x80: A0 advances 0x80 per k step in .L202
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x04    //OFF * 0x10: B0 advances 0x10 per k step in .L202
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF    //trip count = K - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   16     //trip count = OFF + 16
+#else
+    addi.d     TL,     OFF,   2      //trip count = OFF + 2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    xvxor.v    U0,     U0,   U0    //zero the eight accumulators used by .L202
+    xvxor.v    U1,     U1,   U1
+    xvxor.v    U2,     U2,   U2
+    xvxor.v    U3,     U3,   U3
+    xvxor.v    U4,     U4,   U4
+    xvxor.v    U5,     U5,   U5
+    xvxor.v    U6,     U6,   U6
+    xvxor.v    U7,     U7,   U7
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L203    //TL == 0 -> no k iterations
+    blt        TL,     L,     .L203    //TL <  0 -> no k iterations
+
+.L202:  /* for (k=0; k<temp; k++): one k step of the 16x2 tile, temp is TL */
+    xvld       D0,     A0,    0x00  //a0ri a1ri a2ri a3ri
+    vld       $vr18,     B0,    0x00  //b0ri b1ri
+
+    xvpackev.w D4,     D0,    D0  //a0rr a1rr a2rr a3rr
+    xvpackod.w D5,     D0,    D0  //a0ii a1ii a2ii a3ii
+
+    vpackev.w $vr22,     $vr18,    $vr18 //b0rr b1rr
+    vpackod.w $vr23,     $vr18,    $vr18 //b0ii b1ii
+
+    vpermi.w  $vr22,     $vr22,    0x88 //b0r b1r b0r b1r
+    vpermi.w  $vr23,     $vr23,    0x88 //b0i b1i b0i b1i
+
+    //NOTE(review): the two broadcasts below presumably rely on $vr22/$vr23 being the low halves of D6/D7 - confirm against the register #defines
+    xvpermi.d  D6,     D6,    0x00 //b0r b1r b0r b1r b0r b1r b0r b1r
+    xvpermi.d  D7,     D7,    0x00 //b0i b1i b0i b1i b0i b1i b0i b1i
+
+    XVMADD1    U0,     D4,    D6,     U0  //00r 10r 01r 11r 02r 12r 03r 13r
+    XVMADD2    U1,     D5,    D6,     U1  //00i 10i 01i 11i 02i 12i 03i 13i
+    XVMADD3    U0,     D5,    D7,     U0
+    XVMADD4    U1,     D4,    D7,     U1
+
+    xvld       D0,     A0,    0x20  //next 0x20 bytes of A; D6/D7 are reused unchanged
+
+    xvpackev.w D4,     D0,    D0
+    xvpackod.w D5,     D0,    D0
+
+    XVMADD1    U2,     D4,    D6,     U2  //04r 14r 05r 15r 06r 16r 07r 17r
+    XVMADD2    U3,     D5,    D6,     U3  //04i 14i 05i 15i 06i 16i 07i 17i
+    XVMADD3    U2,     D5,    D7,     U2
+    XVMADD4    U3,     D4,    D7,     U3
+
+    xvld       D0,     A0,    0x40  //third 0x20-byte slice of A for this k step
+
+    xvpackev.w D4,     D0,    D0
+    xvpackod.w D5,     D0,    D0
+
+    XVMADD1    U4,     D4,    D6,     U4  //08r 18r 09r 19r 0ar 1ar 0br 1br
+    XVMADD2    U5,     D5,    D6,     U5  //08i 18i 09i 19i 0ai 1ai 0bi 1bi
+    XVMADD3    U4,     D5,    D7,     U4
+    XVMADD4    U5,     D4,    D7,     U5
+
+    xvld       D0,     A0,    0x60  //fourth 0x20-byte slice of A for this k step
+
+    xvpackev.w D4,     D0,    D0
+    xvpackod.w D5,     D0,    D0
+
+    XVMADD1    U6,     D4,    D6,     U6  //0cr 1cr 0dr 1dr 0er 1er 0fr 1fr
+    XVMADD2    U7,     D5,    D6,     U7  //0ci 1ci 0di 1di 0ei 1ei 0fi 1fi
+    XVMADD3    U6,     D5,    D7,     U6
+    XVMADD4    U7,     D4,    D7,     U7
+
+    addi.d     A0,     A0,    0x80    //next k step of A (four 0x20-byte loads above)
+    addi.d     B0,     B0,    0x10    //next k step of B (0x10 bytes loaded above)
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L202
+
+.L203:  //write back the 16x2 tile in four 0x20-byte slices per column (U0/U1, U2/U3, U4/U5, U6/U7)
+#if defined(TRMMKERNEL)
+    //res00 res10 res01 res11 res02 res12 res03 res13
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3 4 5 6 7
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3 4 5 6 7
+
+    xvpackev.w D4,     D1,    D0   //c0[0] c1[0] c0[2] c1[2] c0[4] c1[4] c0[6] c1[6]
+    xvpackod.w D5,     D1,    D0   //c0[1] c1[1] c0[3] c1[3] c0[5] c1[5] c0[7] c1[7]
+
+    xvfmul.s      D4,    U0,    VALPHAR //overwrites D4: the C values gathered above are not added in this branch
+    xvfmul.s      D5,    U1,    VALPHAR //overwrites D5 likewise
+    XVNMSUB      D4,    U1,    VALPHAI, D4
+    XVFMADD      D5,    U0,    VALPHAI, D5
+
+    xvpackev.w D0,     D5,    D4   //c0: 0 1 2 3 4 5 6 7
+    xvpackod.w D1,     D5,    D4   //c1: 0 1 2 3 4 5 6 7
+
+    xvst       D0,     C0,    0x00
+    xvst       D1,     C1,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+
+    //res04 res14 res05 res15 res06 res16 res07 res17
+    xvld       D0,     C0,    0x00
+    xvld       D1,     C1,    0x00
+
+    xvpackev.w D4,     D1,    D0
+    xvpackod.w D5,     D1,    D0
+
+    xvfmul.s      D4,    U2,    VALPHAR
+    xvfmul.s      D5,    U3,    VALPHAR
+    XVNMSUB      D4,    U3,    VALPHAI, D4
+    XVFMADD      D5,    U2,    VALPHAI, D5
+
+    xvpackev.w D0,     D5,    D4
+    xvpackod.w D1,     D5,    D4
+
+    xvst       D0,     C0,    0x00
+    xvst       D1,     C1,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+
+    //res08 res18 res09 res19 res0a res1a res0b res1b
+    xvld       D0,     C0,    0x00
+    xvld       D1,     C1,    0x00
+
+    xvpackev.w D4,     D1,    D0
+    xvpackod.w D5,     D1,    D0
+
+    xvfmul.s      D4,    U4,    VALPHAR
+    xvfmul.s      D5,    U5,    VALPHAR
+    XVNMSUB      D4,    U5,    VALPHAI, D4
+    XVFMADD      D5,    U4,    VALPHAI, D5
+
+    xvpackev.w D0,     D5,    D4
+    xvpackod.w D1,     D5,    D4
+
+    xvst       D0,     C0,    0x00
+    xvst       D1,     C1,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+
+    //res0c res1c res0d res1d res0e res1e res0f res1f
+    xvld       D0,     C0,    0x00
+    xvld       D1,     C1,    0x00
+
+    xvpackev.w D4,     D1,    D0
+    xvpackod.w D5,     D1,    D0
+
+    xvfmul.s      D4,    U6,    VALPHAR
+    xvfmul.s      D5,    U7,    VALPHAR
+    XVNMSUB      D4,    U7,    VALPHAI, D4
+    XVFMADD      D5,    U6,    VALPHAI, D5
+
+    xvpackev.w D0,     D5,    D4
+    xvpackod.w D1,     D5,    D4
+
+    xvst       D0,     C0,    0x00
+    xvst       D1,     C1,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+#else
+    //res00 res10 res01 res11 res02 res12 res03 res13
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3 4 5 6 7
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3 4 5 6 7
+
+    xvpackev.w D4,     D1,    D0   //c0[0] c1[0] c0[2] c1[2] c0[4] c1[4] c0[6] c1[6]
+    xvpackod.w D5,     D1,    D0   //c0[1] c1[1] c0[3] c1[3] c0[5] c1[5] c0[7] c1[7]
+
+    XVFMADD      D4,    U0,    VALPHAR, D4 //accumulates onto the C values gathered above
+    XVFMADD      D5,    U1,    VALPHAR, D5
+    XVNMSUB      D4,    U1,    VALPHAI, D4
+    XVFMADD      D5,    U0,    VALPHAI, D5
+
+    xvpackev.w D0,     D5,    D4   //c0: 0 1 2 3 4 5 6 7
+    xvpackod.w D1,     D5,    D4   //c1: 0 1 2 3 4 5 6 7
+
+    xvst       D0,     C0,    0x00
+    xvst       D1,     C1,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+
+    //res04 res14 res05 res15 res06 res16 res07 res17
+    xvld       D0,     C0,    0x00
+    xvld       D1,     C1,    0x00
+
+    xvpackev.w D4,     D1,    D0
+    xvpackod.w D5,     D1,    D0
+
+    XVFMADD      D4,    U2,    VALPHAR, D4
+    XVFMADD      D5,    U3,    VALPHAR, D5
+    XVNMSUB      D4,    U3,    VALPHAI, D4
+    XVFMADD      D5,    U2,    VALPHAI, D5
+
+    xvpackev.w D0,     D5,    D4
+    xvpackod.w D1,     D5,    D4
+
+    xvst       D0,     C0,    0x00
+    xvst       D1,     C1,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+
+    //res08 res18 res09 res19 res0a res1a res0b res1b
+    xvld       D0,     C0,    0x00
+    xvld       D1,     C1,    0x00
+
+    xvpackev.w D4,     D1,    D0
+    xvpackod.w D5,     D1,    D0
+
+    XVFMADD      D4,    U4,    VALPHAR, D4
+    XVFMADD      D5,    U5,    VALPHAR, D5
+    XVNMSUB      D4,    U5,    VALPHAI, D4
+    XVFMADD      D5,    U4,    VALPHAI, D5
+
+    xvpackev.w D0,     D5,    D4
+    xvpackod.w D1,     D5,    D4
+
+    xvst       D0,     C0,    0x00
+    xvst       D1,     C1,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+
+    //res0c res1c res0d res1d res0e res1e res0f res1f
+    xvld       D0,     C0,    0x00
+    xvld       D1,     C1,    0x00
+
+    xvpackev.w D4,     D1,    D0
+    xvpackod.w D5,     D1,    D0
+
+    XVFMADD      D4,    U6,    VALPHAR, D4
+    XVFMADD      D5,    U7,    VALPHAR, D5
+    XVNMSUB      D4,    U7,    VALPHAI, D4
+    XVFMADD      D5,    U6,    VALPHAI, D5
+
+    xvpackev.w D0,     D5,    D4
+    xvpackod.w D1,     D5,    D4
+
+    xvst       D0,     C0,    0x00
+    xvst       D1,     C1,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -16    //TL = K - (OFF + 16): k steps the loop above did not run
+#else
+    addi.d     TL,     TL,   -2     //TL = K - (OFF + 2): k steps the loop above did not run
+#endif
+    slli.d     T3,     TL,   0x07    //0x80 bytes of A per k step
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x04    //0x10 bytes of B per k step
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   16    //sixteen more rows handled
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L201    //T0 = M >> 4, set before .L201
+
+.L21:  /* if (bm & 8): set up the 8-row x 2-column tile; skipped when bit 3 of M is clear */
+    move       I,      $r0
+    andi       T1,     M,     8    //bm&8
+    beq        I,      T1,    .L24    //(M & 8) == 0 -> go to the (bm & 4) tile
+
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb (repeats the unconditional move above)
+#else
+    slli.d     T3,     OFF,  0x06    //OFF * 0x40: A0 advances 0x40 per k step in .L22
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x04    //OFF * 0x10: B0 advances 0x10 per k step in .L22
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF    //trip count = K - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   8      //trip count = OFF + 8
+#else
+    addi.d     TL,     OFF,   2      //trip count = OFF + 2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    xvxor.v    U0,     U0,   U0    //zero the four accumulators used by .L22
+    xvxor.v    U1,     U1,   U1
+    xvxor.v    U2,     U2,   U2
+    xvxor.v    U3,     U3,   U3
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L23    //TL == 0 -> no k iterations
+    blt        TL,     L,     .L23    //TL <  0 -> no k iterations
+
+.L22:  /* for (k=0; k<temp; k++): one k step of the 8x2 tile, temp is TL */
+    xvld       D0,     A0,    0x00  //a0ri a1ri a2ri a3ri
+    vld       $vr18,     B0,    0x00  //b0ri b1ri
+
+    xvpackev.w D4,     D0,    D0  //a0rr a1rr a2rr a3rr
+    xvpackod.w D5,     D0,    D0  //a0ii a1ii a2ii a3ii
+
+    vpackev.w $vr22,     $vr18,    $vr18 //b0rr b1rr
+    vpackod.w $vr23,     $vr18,    $vr18 //b0ii b1ii
+
+    vpermi.w  $vr22,     $vr22,    0x88 //b0r b1r b0r b1r
+    vpermi.w  $vr23,     $vr23,    0x88 //b0i b1i b0i b1i
+
+    //NOTE(review): the two broadcasts below presumably rely on $vr22/$vr23 being the low halves of D6/D7 - confirm against the register #defines
+    xvpermi.d  D6,     D6,    0x00 //b0r b1r b0r b1r b0r b1r b0r b1r
+    xvpermi.d  D7,     D7,    0x00 //b0i b1i b0i b1i b0i b1i b0i b1i
+
+    XVMADD1    U0,     D4,    D6,     U0  //00r 10r 01r 11r 02r 12r 03r 13r
+    XVMADD2    U1,     D5,    D6,     U1  //00i 10i 01i 11i 02i 12i 03i 13i
+    XVMADD3    U0,     D5,    D7,     U0
+    XVMADD4    U1,     D4,    D7,     U1
+
+    xvld       D0,     A0,    0x20  //second 0x20-byte slice of A; D6/D7 are reused unchanged
+
+    xvpackev.w D4,     D0,    D0
+    xvpackod.w D5,     D0,    D0
+
+    XVMADD1    U2,     D4,    D6,     U2  //04r 14r 05r 15r 06r 16r 07r 17r
+    XVMADD2    U3,     D5,    D6,     U3  //04i 14i 05i 15i 06i 16i 07i 17i
+    XVMADD3    U2,     D5,    D7,     U2
+    XVMADD4    U3,     D4,    D7,     U3
+
+    addi.d     A0,     A0,    0x40    //next k step of A (two 0x20-byte loads above)
+    addi.d     B0,     B0,    0x10    //next k step of B (0x10 bytes loaded above)
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L22
+
+.L23:  //write back the 8x2 tile in two 0x20-byte slices per column (U0/U1, then U2/U3)
+#if defined(TRMMKERNEL)
+    //res00 res10 res01 res11 res02 res12 res03 res13
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3 4 5 6 7
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3 4 5 6 7
+
+    xvpackev.w D4,     D1,    D0   //c0[0] c1[0] c0[2] c1[2] c0[4] c1[4] c0[6] c1[6]
+    xvpackod.w D5,     D1,    D0   //c0[1] c1[1] c0[3] c1[3] c0[5] c1[5] c0[7] c1[7]
+
+    xvfmul.s      D4,    U0,    VALPHAR //overwrites D4: the C values gathered above are not added in this branch
+    xvfmul.s      D5,    U1,    VALPHAR //overwrites D5 likewise
+    XVNMSUB      D4,    U1,    VALPHAI, D4
+    XVFMADD      D5,    U0,    VALPHAI, D5
+
+    xvpackev.w D0,     D5,    D4   //c0: 0 1 2 3 4 5 6 7
+    xvpackod.w D1,     D5,    D4   //c1: 0 1 2 3 4 5 6 7
+
+    xvst       D0,     C0,    0x00
+    xvst       D1,     C1,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+
+    //res04 res14 res05 res15 res06 res16 res07 res17
+    xvld       D0,     C0,    0x00
+    xvld       D1,     C1,    0x00
+
+    xvpackev.w D4,     D1,    D0
+    xvpackod.w D5,     D1,    D0
+
+    xvfmul.s      D4,    U2,    VALPHAR
+    xvfmul.s      D5,    U3,    VALPHAR
+    XVNMSUB      D4,    U3,    VALPHAI, D4
+    XVFMADD      D5,    U2,    VALPHAI, D5
+
+    xvpackev.w D0,     D5,    D4
+    xvpackod.w D1,     D5,    D4
+
+    xvst       D0,     C0,    0x00
+    xvst       D1,     C1,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+#else
+    //res00 res10 res01 res11 res02 res12 res03 res13
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3 4 5 6 7
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3 4 5 6 7
+
+    xvpackev.w D4,     D1,    D0   //c0[0] c1[0] c0[2] c1[2] c0[4] c1[4] c0[6] c1[6]
+    xvpackod.w D5,     D1,    D0   //c0[1] c1[1] c0[3] c1[3] c0[5] c1[5] c0[7] c1[7]
+
+    XVFMADD      D4,    U0,    VALPHAR, D4 //accumulates onto the C values gathered above
+    XVFMADD      D5,    U1,    VALPHAR, D5
+    XVNMSUB      D4,    U1,    VALPHAI, D4
+    XVFMADD      D5,    U0,    VALPHAI, D5
+
+    xvpackev.w D0,     D5,    D4   //c0: 0 1 2 3 4 5 6 7
+    xvpackod.w D1,     D5,    D4   //c1: 0 1 2 3 4 5 6 7
+
+    xvst       D0,     C0,    0x00
+    xvst       D1,     C1,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+
+    //res04 res14 res05 res15 res06 res16 res07 res17
+    xvld       D0,     C0,    0x00
+    xvld       D1,     C1,    0x00
+
+    xvpackev.w D4,     D1,    D0
+    xvpackod.w D5,     D1,    D0
+
+    XVFMADD      D4,    U2,    VALPHAR, D4
+    XVFMADD      D5,    U3,    VALPHAR, D5
+    XVNMSUB      D4,    U3,    VALPHAI, D4
+    XVFMADD      D5,    U2,    VALPHAI, D5
+
+    xvpackev.w D0,     D5,    D4
+    xvpackod.w D1,     D5,    D4
+
+    xvst       D0,     C0,    0x00
+    xvst       D1,     C1,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -8    //TL = K - (OFF + 8): k steps the loop above did not run
+#else
+    addi.d     TL,     TL,   -2    //TL = K - (OFF + 2): k steps the loop above did not run
+#endif
+    slli.d     T3,     TL,   0x06    //0x40 bytes of A per k step
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x04    //0x10 bytes of B per k step
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   8    //eight more rows handled
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L24:   /* if ( bm & 4 ): 4-row x 2-column tile of C (complex single) */
+    move       I,      $r0
+    andi       T1,     M,     4    //bm&4
+    beq        I,      T1,    .L280   // bit clear: skip to the bm&2 tile
+
+.L25:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb (repeats the move above)
+#else
+    slli.d     T3,     OFF,   0x05   // T3 = OFF * 32 (A advances 0x20 bytes per k in .L26)
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x04   // T3 = OFF * 16 (B advances 0x10 bytes per k in .L26)
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF   // TL = K - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   4     // TL = OFF + 4 (rows in this tile)
+#else
+    addi.d     TL,     OFF,   2     // TL = OFF + 2 (columns in this tile)
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    xvxor.v    U0,     U0,   U0   // U0 = 0: accumulator for the real parts
+    xvxor.v    U1,     U1,   U1   // U1 = 0: accumulator for the imaginary parts
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L27   // TL == 0: skip the k-loop
+    blt        TL,     L,     .L27   // TL < 0: skip the k-loop
+
+.L26:  /* for (k=0; k<temp; k++): one rank-1 update of the 4x2 tile per pass */
+    xvld       D0,     A0,    0x00  //a0ri a1ri a2ri a3ri
+    vld       $vr18,     B0,    0x00  //b0ri b1ri
+
+    xvpackev.w D4,     D0,    D0  //a0rr a1rr a2rr a3rr
+    xvpackod.w D5,     D0,    D0  //a0ii a1ii a2ii a3ii
+
+    vpackev.w $vr22,     $vr18,    $vr18 //b0rr b1rr
+    vpackod.w $vr23,     $vr18,    $vr18 //b0ii b1ii
+
+    vpermi.w  $vr22,     $vr22,    0x88 //b0r b1r b0r b1r
+    vpermi.w  $vr23,     $vr23,    0x88 //b0i b1i b0i b1i
+
+    xvpermi.d  D6,     D6,    0x00 //b0r b1r b0r b1r b0r b1r b0r b1r; NOTE(review): relies on D6 naming the same register as $vr22 above - confirm against the #defines
+    xvpermi.d  D7,     D7,    0x00 //b0i b1i b0i b1i b0i b1i b0i b1i; NOTE(review): relies on D7 naming the same register as $vr23 above - confirm
+
+    XVMADD1    U0,     D4,    D6,     U0  //00r 10r 01r 11r 02r 12r 03r 13r
+    XVMADD2    U1,     D5,    D6,     U1  //00i 10i 01i 11i 02i 12i 03i 13i
+    XVMADD3    U0,     D5,    D7,     U0  // a.imag * b.imag term into U0 (sign is chosen by the XVMADD3 macro, defined outside this chunk)
+    XVMADD4    U1,     D4,    D7,     U1  // a.real * b.imag term into U1 (sign is chosen by the XVMADD4 macro)
+
+    addi.d     A0,     A0,    0x20   // next k: 4 complex values of A
+    addi.d     B0,     B0,    0x10   // next k: 2 complex values of B
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L26
+
+.L27:   // write back the 4x2 tile: combine U0/U1 with VALPHAR/VALPHAI and store to C0/C1
+#if defined(TRMMKERNEL)
+    //res00 res10 res01 res11 res02 res12 res03 res13
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3 4 5 6 7
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3 4 5 6 7
+
+    xvpackev.w D4,     D1,    D0   //c0[0] c1[0] c0[2] c1[2] c0[4] c1[4] c0[6] c1[6]
+    xvpackod.w D5,     D1,    D0   //c0[1] c1[1] c0[3] c1[3] c0[5] c1[5] c0[7] c1[7]
+
+    xvfmul.s      D4,    U0,    VALPHAR   // D4 = U0 * VALPHAR; overwrites the packed C values, so the loads above are unused in this path
+    xvfmul.s      D5,    U1,    VALPHAR   // D5 = U1 * VALPHAR
+    XVNMSUB      D4,    U1,    VALPHAI, D4   // real part: presumably D4 - U1 * VALPHAI (XVNMSUB is defined outside this chunk)
+    XVFMADD      D5,    U0,    VALPHAI, D5   // imaginary part: presumably D5 + U0 * VALPHAI
+
+    xvpackev.w D0,     D5,    D4   //c0: 0 1 2 3 4 5 6 7
+    xvpackod.w D1,     D5,    D4   //c1: 0 1 2 3 4 5 6 7
+
+    xvst       D0,     C0,    0x00
+    xvst       D1,     C1,    0x00
+
+    addi.d     C0,     C0,    0x20   // 4 complex values stored per column
+    addi.d     C1,     C1,    0x20
+#else
+    //res00 res10 res01 res11 res02 res12 res03 res13
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3 4 5 6 7
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3 4 5 6 7
+
+    xvpackev.w D4,     D1,    D0   //c0[0] c1[0] c0[2] c1[2] c0[4] c1[4] c0[6] c1[6]
+    xvpackod.w D5,     D1,    D0   //c0[1] c1[1] c0[3] c1[3] c0[5] c1[5] c0[7] c1[7]
+
+    XVFMADD      D4,    U0,    VALPHAR, D4   // accumulate onto the existing C real parts
+    XVFMADD      D5,    U1,    VALPHAR, D5   // accumulate onto the existing C imaginary parts
+    XVNMSUB      D4,    U1,    VALPHAI, D4
+    XVFMADD      D5,    U0,    VALPHAI, D5
+
+    xvpackev.w D0,     D5,    D4   //c0: 0 1 2 3 4 5 6 7
+    xvpackod.w D1,     D5,    D4   //c1: 0 1 2 3 4 5 6 7
+
+    xvst       D0,     C0,    0x00
+    xvst       D1,     C1,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF   // TL = K - OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -4    // ... minus the 4 rows of this tile
+#else
+    addi.d     TL,     TL,   -2    // ... minus the 2 columns of this tile
+#endif
+    slli.d     T3,     TL,   0x05  // TL * 32 bytes of A
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x04  // TL * 16 bytes of B
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   4    // OFF moves down by the 4 rows just processed
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L280:   /* if ( bm & 2 ): 2-row x 2-column tile of C, 128-bit LSX registers */
+    move       I,      $r0
+    andi       T1,     M,     2    //bm&2
+    beq        I,      T1,    .L284   // bit clear: skip to the bm&1 tile
+
+.L281:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb (repeats the move above)
+#else
+    slli.d     T3,     OFF,   0x04   // T3 = OFF * 16; A and B both advance 0x10 bytes per k in .L282
+    add.d      A0,     A0,    T3
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF   // TL = K - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2     // TL = OFF + 2 (rows in this tile)
+#else
+    addi.d     TL,     OFF,   2     // TL = OFF + 2 (columns in this tile)
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    $vr2,     $vr2,   $vr2   // $vr2 = 0: accumulator for the real parts
+    vxor.v    $vr3,     $vr3,   $vr3   // $vr3 = 0: accumulator for the imaginary parts
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L283   // TL == 0: skip the k-loop
+    blt        TL,     L,     .L283   // TL < 0: skip the k-loop
+
+.L282:  /* for (k=0; k<temp; k++): one rank-1 update of the 2x2 tile per pass */
+    vld       $vr16,     A0,    0x00  // a0ri a1ri
+    vld       $vr18,     B0,    0x00  // b0ri b1ri
+
+    vpackev.w $vr20,     $vr16,    $vr16  //a0rr a1rr
+    vpackod.w $vr21,     $vr16,    $vr16  //a0ii a1ii
+
+    vpackev.w $vr22,     $vr18,    $vr18 //b0rr b1rr
+    vpackod.w $vr23,     $vr18,    $vr18 //b0ii b1ii
+
+    vpermi.w  $vr22,     $vr22,    0x88 //b0r b1r b0r b1r
+    vpermi.w  $vr23,     $vr23,    0x88 //b0i b1i b0i b1i
+
+    VMADD1    $vr2,     $vr20,    $vr22,     $vr2  //00r 10r 01r 11r
+    VMADD2    $vr3,     $vr21,    $vr22,     $vr3  //00i 10i 01i 11i
+    VMADD3    $vr2,     $vr21,    $vr23,     $vr2  // a.imag * b.imag term (sign is chosen by the VMADD3 macro, defined outside this chunk)
+    VMADD4    $vr3,     $vr20,    $vr23,     $vr3  // a.real * b.imag term (sign is chosen by the VMADD4 macro)
+
+    addi.d     A0,     A0,    0x10   // next k: 2 complex values of A
+    addi.d     B0,     B0,    0x10   // next k: 2 complex values of B
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L282
+
+.L283:   // write back the 2x2 tile: combine $vr2/$vr3 with $vr28/$vr29 and store to C0/C1
+#if defined(TRMMKERNEL)
+    //res00 res10 res01 res11
+    vld       $vr16,     C0,    0x00 //c0: 0 1 2 3
+    vld       $vr17,     C1,    0x00 //c1: 0 1 2 3
+
+    vpackev.w $vr18,     $vr17,    $vr16  //c0[0] c1[0] c0[2] c1[2]
+    vpackod.w $vr19,     $vr17,    $vr16  //c0[1] c1[1] c0[3] c1[3]
+
+    vfmul.s      $vr18,    $vr2,    $vr28   // overwrites the packed C values, so the loads above are unused in this path
+    vfmul.s      $vr19,    $vr3,    $vr28
+    VNMSUB      $vr18,    $vr3,    $vr29, $vr18   // real part: presumably $vr18 - $vr3 * $vr29 (VNMSUB is defined outside this chunk)
+    VFMADD      $vr19,    $vr2,    $vr29, $vr19   // imaginary part: presumably $vr19 + $vr2 * $vr29
+
+    vpackev.w $vr20,     $vr19,    $vr18   // re-interleave real/imag for column c0
+    vpackod.w $vr21,     $vr19,    $vr18   // re-interleave real/imag for column c1
+
+    vst       $vr20,     C0,    0x00 //c0: 0 1 2 3
+    vst       $vr21,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x10   // 2 complex values stored per column
+    addi.d     C1,     C1,    0x10
+#else
+    //res00 res10 res01 res11
+    vld       $vr16,     C0,    0x00 //c0: 0 1 2 3
+    vld       $vr17,     C1,    0x00 //c1: 0 1 2 3
+
+    vpackev.w $vr18,     $vr17,    $vr16  //c0[0] c1[0] c0[2] c1[2]
+    vpackod.w $vr19,     $vr17,    $vr16  //c0[1] c1[1] c0[3] c1[3]
+
+    VFMADD      $vr18,    $vr2,    $vr28, $vr18   // accumulate onto the existing C real parts
+    VFMADD      $vr19,    $vr3,    $vr28, $vr19   // accumulate onto the existing C imaginary parts
+    VNMSUB      $vr18,    $vr3,    $vr29, $vr18
+    VFMADD      $vr19,    $vr2,    $vr29, $vr19
+
+    vpackev.w $vr20,     $vr19,    $vr18   // re-interleave real/imag for column c0
+    vpackod.w $vr21,     $vr19,    $vr18   // re-interleave real/imag for column c1
+
+    vst       $vr20,     C0,    0x00 //c0: 0 1 2 3
+    vst       $vr21,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF   // TL = K - OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2    // ... minus the 2 rows of this tile
+#else
+    addi.d     TL,     TL,   -2    // ... minus the 2 columns of this tile
+#endif
+    slli.d     T3,     TL,   0x04  // TL * 16 bytes of A
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x04  // TL * 16 bytes of B
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2    // OFF moves down by the 2 rows just processed
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L284:   /* if ( bm & 1 ): 1-row x 2-column tile of C, scalar registers */
+    move       I,      $r0
+    andi       T1,     M,     1    //bm&1
+    beq        I,      T1,    .L288   // bit clear: this column panel is done
+
+.L285:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb (repeats the move above)
+#else
+    slli.d     T3,     OFF,   0x03   // T3 = OFF * 8 (A advances 0x08 bytes per k in .L286)
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x04   // T3 = OFF * 16 (B advances 0x10 bytes per k in .L286)
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF   // TL = K - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1     // TL = OFF + 1 (rows in this tile)
+#else
+    addi.d     TL,     OFF,   2     // TL = OFF + 2 (columns in this tile)
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0   // c11 = 0: res00 real
+    MTC        c12,    $r0   // c12 = 0: res00 imaginary
+    MTC        c21,    $r0   // c21 = 0: res10 real
+    MTC        c22,    $r0   // c22 = 0: res10 imaginary
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L287   // TL == 0: skip the k-loop
+    blt        TL,     L,     .L287   // TL < 0: skip the k-loop
+
+.L286:  /* for (k=0; k<temp; k++): one complex A value times two complex B values */
+    LD         a1,     A0,    0x00        //a0r
+    LD         a2,     A0,    0x04        //a0i
+
+    LD         b1,     B0,    0x00        //b0r
+    LD         b2,     B0,    0x04        //b0i
+    LD         b3,     B0,    0x08        //b1r
+    LD         b4,     B0,    0x0c        //b1i
+
+    MADD1      c11,    a1,    b1,     c11  //res00r
+    MADD2      c12,    a2,    b1,     c12  //res00i
+    MADD3      c11,    a2,    b2,     c11  // a0i * b0i term (sign is chosen by the MADD3 macro, defined outside this chunk)
+    MADD4      c12,    a1,    b2,     c12  // a0r * b0i term (sign is chosen by the MADD4 macro)
+
+    MADD1      c21,    a1,    b3,     c21  //res10r
+    MADD2      c22,    a2,    b3,     c22  //res10i
+    MADD3      c21,    a2,    b4,     c21  // a0i * b1i term
+    MADD4      c22,    a1,    b4,     c22  // a0r * b1i term
+
+    addi.d     A0,     A0,    0x08   // next k: 1 complex value of A
+    addi.d     B0,     B0,    0x10   // next k: 2 complex values of B
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L286
+
+.L287:   // write back the 1x2 tile: c11/c12 go to C0, c21/c22 go to C1
+#if defined(TRMMKERNEL)
+    //res00 res10
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+
+    MUL       a5,     c11,   ALPHA_R   // replaces the value just loaded (assuming MUL is a plain multiply; macro defined outside this chunk)
+    MUL       a6,     c12,   ALPHA_R
+    NMSUB      a5,     c12,   ALPHA_I, a5   // real part: presumably a5 - c12 * ALPHA_I
+    MADD       a6,     c11,   ALPHA_I, a6   // imaginary part: presumably a6 + c11 * ALPHA_I
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+
+    LD         a5,     C1,    0x00    //C1[0]
+    LD         a6,     C1,    0x04    //C1[1]
+
+    MUL       a5,     c21,   ALPHA_R
+    MUL       a6,     c22,   ALPHA_R
+    NMSUB      a5,     c22,   ALPHA_I, a5
+    MADD       a6,     c21,   ALPHA_I, a6
+
+    ST         a5,     C1,    0x00
+    ST         a6,     C1,    0x04
+
+    addi.d     C0,     C0,    0x08   // 1 complex value stored per column
+    addi.d     C1,     C1,    0x08
+#else
+    //res00 res10
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5   // accumulate onto the existing C real part
+    MADD       a6,     c12,   ALPHA_R, a6   // accumulate onto the existing C imaginary part
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+
+    LD         a5,     C1,    0x00    //C1[0]
+    LD         a6,     C1,    0x04    //C1[1]
+
+    MADD       a5,     c21,   ALPHA_R, a5
+    MADD       a6,     c22,   ALPHA_R, a6
+    NMSUB      a5,     c22,   ALPHA_I, a5
+    MADD       a6,     c21,   ALPHA_I, a6
+
+    ST         a5,     C1,    0x00
+    ST         a6,     C1,    0x04
+
+    addi.d     C0,     C0,    0x08
+    addi.d     C1,     C1,    0x08
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF   // TL = K - OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1    // ... minus the 1 row of this tile
+#else
+    addi.d     TL,     TL,   -2    // ... minus the 2 columns of this tile
+#endif
+    slli.d     T3,     TL,   0x03  // TL * 8 bytes of A
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x04  // TL * 16 bytes of B
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1    // OFF moves down by the 1 row just processed
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L288:   // end of one 2-column panel: advance B and C, then test the J loop
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi.d     OFF,    OFF,   2    // OFF moves right by the 2 columns just processed
+#endif
+    slli.d     L,      K,     4    // L = K * 16: bytes of B used by this panel (0x10 per k)
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   2    // I = LDC * 4; NOTE(review): LDC's unit is set outside this chunk, presumably this spans two columns - confirm
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     2
+    andi       T0,     N,     2    // T0 = bn & 2
+    blt        J,      T0,    .L20 // repeat the panel while J < (bn & 2)
+
+.L30:   // single-column remainder: entered only when bn & 1 is set
+    move       J,      $r0
+    andi       T0,     N,     1    // T0 = bn & 1
+    beq        J,      T0,    .L999   // no odd column: jump to .L999 (defined outside this chunk)
+
+.L300:  /* for (j=0; j<(bn&1); j+=1) */
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET   // restart the TRMM offset for this column
+#endif
+
+    move       C0,     C    // only one C column pointer is used in this panel
+    move       A0,     A    //ptrba
+
+    move       I,      $r0
+    srai.d     T0,     M,     4  //bm/16
+    beq        I,      T0,    .L31   // no full 16-row tiles: go to the bm&8 tile
+
+.L301:  /* for (i=0; i<bm/16; i+=1): set up one 16-row x 1-column tile */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb (repeats the move above)
+#else
+    slli.d     T3,     OFF,  0x07   // T3 = OFF * 128 (A advances 0x80 bytes per k in .L302)
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x03   // T3 = OFF * 8 (B advances 0x08 bytes per k in .L302)
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF   // TL = K - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   16    // TL = OFF + 16 (rows in this tile)
+#else
+    addi.d     TL,     OFF,   1     // TL = OFF + 1 (columns in this tile)
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    xvxor.v    U0,     U0,   U0   // U0 = 0: real parts, rows 0-7
+    xvxor.v    U1,     U1,   U1   // U1 = 0: imaginary parts, rows 0-7
+    xvxor.v    U2,     U2,   U2   // U2 = 0: real parts, rows 8-15
+    xvxor.v    U3,     U3,   U3   // U3 = 0: imaginary parts, rows 8-15
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L303   // TL == 0: skip the k-loop
+    blt        TL,     L,     .L303   // TL < 0: skip the k-loop
+
+.L302:  /* for (k=0; k<temp; k++): 16 complex A values times one complex B value */
+    xvld       D0,     A0,    0x00  //a0ri a1ri a2ri a3ri
+    xvld       D1,     A0,    0x20  //a4ri a5ri a6ri a7ri
+
+    xvldrepl.w D2,     B0,    0x00 //b0r
+    xvldrepl.w D3,     B0,    0x04 //b0i
+
+    xvpackev.w D4,     D1,    D0    //a0r a4r a1r a5r a2r a6r a3r a7r
+    xvpermi.w  D4,     D4,    0xd8  //a0r a1r a4r a5r a2r a3r a6r a7r
+    xvpermi.d  D4,     D4,    0xd8  //a0r a1r a2r a3r a4r a5r a6r a7r
+
+    xvpackod.w D5,     D1,    D0    //a0i a4i a1i a5i a2i a6i a3i a7i
+    xvpermi.w  D5,     D5,    0xd8  //a0i a1i a4i a5i a2i a3i a6i a7i
+    xvpermi.d  D5,     D5,    0xd8  //a0i a1i a2i a3i a4i a5i a6i a7i
+
+    XVMADD1    U0,     D4,    D2,     U0  //00r 01r 02r 03r 04r 05r 06r 07r
+    XVMADD2    U1,     D5,    D2,     U1  //00i 01i 02i 03i 04i 05i 06i 07i
+    XVMADD3    U0,     D5,    D3,     U0  // a.imag * b0i term (sign is chosen by the XVMADD3 macro, defined outside this chunk)
+    XVMADD4    U1,     D4,    D3,     U1  // a.real * b0i term (sign is chosen by the XVMADD4 macro)
+
+    xvld       D0,     A0,    0x40  //a8ri a9ri a10ri a11ri
+    xvld       D1,     A0,    0x60  //a12ri a13ri a14ri a15ri
+
+    xvpackev.w D4,     D1,    D0    //a8r a12r a9r a13r a10r a14r a11r a15r
+    xvpermi.w  D4,     D4,    0xd8  //a8r a9r a12r a13r a10r a11r a14r a15r
+    xvpermi.d  D4,     D4,    0xd8  //a8r a9r a10r a11r a12r a13r a14r a15r
+
+    xvpackod.w D5,     D1,    D0    //a8i a12i a9i a13i a10i a14i a11i a15i
+    xvpermi.w  D5,     D5,    0xd8  //a8i a9i a12i a13i a10i a11i a14i a15i
+    xvpermi.d  D5,     D5,    0xd8  //a8i a9i a10i a11i a12i a13i a14i a15i
+
+    XVMADD1    U2,     D4,    D2,     U2  //08r 09r 0ar 0br 0cr 0dr 0er 0fr
+    XVMADD2    U3,     D5,    D2,     U3  //08i 09i 0ai 0bi 0ci 0di 0ei 0fi
+    XVMADD3    U2,     D5,    D3,     U2
+    XVMADD4    U3,     D4,    D3,     U3
+
+    addi.d     A0,     A0,    0x80   // next k: 16 complex values of A
+    addi.d     B0,     B0,    0x08   // next k: 1 complex value of B
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L302
+
+.L303:   // write back the 16x1 tile in two halves of 8 complex values (U0/U1, then U2/U3)
+#if defined(TRMMKERNEL)
+    //res00 res01 res02 res03 res04 res05 res06 res07
+    xvld       D0,     C0,    0x00 //c0:0 1 2 3 4 5 6 7
+    xvld       D1,     C0,    0x20 //c0:8 9 10 11 12 13 14 15
+
+    xvpackev.w D2,     D1,    D0  //0 8 2 10 4 12 6 14
+    xvpermi.w  D2,     D2,    0xd8  //0 2 8 10 4 6 12 14
+    xvpermi.d  D2,     D2,    0xd8  //0 2 4 6 8 10 12 14
+
+    xvpackod.w D3,     D1,    D0    //1 9 3 11 5 13 7 15
+    xvpermi.w  D3,     D3,    0xd8  //1 3 9 11 5 7 13 15
+    xvpermi.d  D3,     D3,    0xd8 //1 3 5 7 9 11 13 15
+
+    xvfmul.s      D2,    U0,    VALPHAR   // D2 = U0 * VALPHAR; overwrites the packed C values, so the loads above are unused in this path
+    xvfmul.s      D3,    U1,    VALPHAR   // D3 = U1 * VALPHAR
+    XVNMSUB      D2,    U1,    VALPHAI, D2   // real part: presumably D2 - U1 * VALPHAI (XVNMSUB is defined outside this chunk)
+    XVFMADD      D3,    U0,    VALPHAI, D3   // imaginary part: presumably D3 + U0 * VALPHAI
+
+    xvand.v    D4,     D2,   D2  //0 2 4 6 8 10 12 14
+    xvpermi.q  D4,     D3,   0x02 //0 2 4 6 1 3 5 7
+    xvpermi.d  D4,     D4,   0xd8 //0 2 1 3 4 6 5 7
+    xvpermi.w  D4,     D4,   0xd8 //0 1 2 3 4 5 6 7
+
+    xvand.v    D5,     D3,   D3   //1 3 5 7 9 11 13 15
+    xvpermi.q  D5,     D2,   0x31 //8 10 12 14 9 11 13 15
+    xvpermi.d  D5,     D5,   0xd8 //8 10 9 11 12 14 13 15
+    xvpermi.w  D5,     D5,   0xd8 //8 9 10 11 12 13 14 15
+
+    xvst       D4,     C0,    0x00
+    xvst       D5,     C0,    0x20
+
+    addi.d     C0,     C0,    0x40   // 8 complex values stored
+
+    //res08 res09 res0a res0b res0c res0d res0e res0f
+    xvld       D0,     C0,    0x00 //c0:0 1 2 3 4 5 6 7
+    xvld       D1,     C0,    0x20 //c0:8 9 10 11 12 13 14 15
+
+    xvpackev.w D2,     D1,    D0  //0 8 2 10 4 12 6 14
+    xvpermi.w  D2,     D2,    0xd8  //0 2 8 10 4 6 12 14
+    xvpermi.d  D2,     D2,    0xd8  //0 2 4 6 8 10 12 14
+
+    xvpackod.w D3,     D1,    D0    //1 9 3 11 5 13 7 15
+    xvpermi.w  D3,     D3,    0xd8  //1 3 9 11 5 7 13 15
+    xvpermi.d  D3,     D3,    0xd8 //1 3 5 7 9 11 13 15
+
+    xvfmul.s      D2,    U2,    VALPHAR   // same as the first half, using U2/U3
+    xvfmul.s      D3,    U3,    VALPHAR
+    XVNMSUB      D2,    U3,    VALPHAI, D2
+    XVFMADD      D3,    U2,    VALPHAI, D3
+
+    xvand.v    D4,     D2,   D2  //0 2 4 6 8 10 12 14
+    xvpermi.q  D4,     D3,   0x02 //0 2 4 6 1 3 5 7
+    xvpermi.d  D4,     D4,   0xd8 //0 2 1 3 4 6 5 7
+    xvpermi.w  D4,     D4,   0xd8 //0 1 2 3 4 5 6 7
+
+    xvand.v    D5,     D3,   D3   //1 3 5 7 9 11 13 15
+    xvpermi.q  D5,     D2,   0x31 //8 10 12 14 9 11 13 15
+    xvpermi.d  D5,     D5,   0xd8 //8 10 9 11 12 14 13 15
+    xvpermi.w  D5,     D5,   0xd8 //8 9 10 11 12 13 14 15
+
+    xvst       D4,     C0,    0x00
+    xvst       D5,     C0,    0x20
+
+    addi.d     C0,     C0,    0x40
+#else
+    //res00 res01 res02 res03 res04 res05 res06 res07
+    xvld       D0,     C0,    0x00 //c0:0 1 2 3 4 5 6 7
+    xvld       D1,     C0,    0x20 //c0:8 9 10 11 12 13 14 15
+
+    xvpackev.w D2,     D1,    D0  //0 8 2 10 4 12 6 14
+    xvpermi.w  D2,     D2,    0xd8  //0 2 8 10 4 6 12 14
+    xvpermi.d  D2,     D2,    0xd8  //0 2 4 6 8 10 12 14
+
+    xvpackod.w D3,     D1,    D0    //1 9 3 11 5 13 7 15
+    xvpermi.w  D3,     D3,    0xd8  //1 3 9 11 5 7 13 15
+    xvpermi.d  D3,     D3,    0xd8 //1 3 5 7 9 11 13 15
+
+    XVFMADD      D2,    U0,    VALPHAR, D2   // accumulate onto the existing C real parts
+    XVFMADD      D3,    U1,    VALPHAR, D3   // accumulate onto the existing C imaginary parts
+    XVNMSUB      D2,    U1,    VALPHAI, D2
+    XVFMADD      D3,    U0,    VALPHAI, D3
+
+    xvand.v    D4,     D2,   D2  //0 2 4 6 8 10 12 14
+    xvpermi.q  D4,     D3,   0x02 //0 2 4 6 1 3 5 7
+    xvpermi.d  D4,     D4,   0xd8 //0 2 1 3 4 6 5 7
+    xvpermi.w  D4,     D4,   0xd8 //0 1 2 3 4 5 6 7
+
+    xvand.v    D5,     D3,   D3   //1 3 5 7 9 11 13 15
+    xvpermi.q  D5,     D2,   0x31 //8 10 12 14 9 11 13 15
+    xvpermi.d  D5,     D5,   0xd8 //8 10 9 11 12 14 13 15
+    xvpermi.w  D5,     D5,   0xd8 //8 9 10 11 12 13 14 15
+
+    xvst       D4,     C0,    0x00
+    xvst       D5,     C0,    0x20
+
+    addi.d     C0,     C0,    0x40
+
+    //res08 res09 res0a res0b res0c res0d res0e res0f
+    xvld       D0,     C0,    0x00 //c0:0 1 2 3 4 5 6 7
+    xvld       D1,     C0,    0x20 //c0:8 9 10 11 12 13 14 15
+
+    xvpackev.w D2,     D1,    D0  //0 8 2 10 4 12 6 14
+    xvpermi.w  D2,     D2,    0xd8  //0 2 8 10 4 6 12 14
+    xvpermi.d  D2,     D2,    0xd8  //0 2 4 6 8 10 12 14
+
+    xvpackod.w D3,     D1,    D0    //1 9 3 11 5 13 7 15
+    xvpermi.w  D3,     D3,    0xd8  //1 3 9 11 5 7 13 15
+    xvpermi.d  D3,     D3,    0xd8 //1 3 5 7 9 11 13 15
+
+    XVFMADD      D2,    U2,    VALPHAR, D2   // same as the first half, using U2/U3
+    XVFMADD      D3,    U3,    VALPHAR, D3
+    XVNMSUB      D2,    U3,    VALPHAI, D2
+    XVFMADD      D3,    U2,    VALPHAI, D3
+
+    xvand.v    D4,     D2,   D2  //0 2 4 6 8 10 12 14
+    xvpermi.q  D4,     D3,   0x02 //0 2 4 6 1 3 5 7
+    xvpermi.d  D4,     D4,   0xd8 //0 2 1 3 4 6 5 7
+    xvpermi.w  D4,     D4,   0xd8 //0 1 2 3 4 5 6 7
+
+    xvand.v    D5,     D3,   D3   //1 3 5 7 9 11 13 15
+    xvpermi.q  D5,     D2,   0x31 //8 10 12 14 9 11 13 15
+    xvpermi.d  D5,     D5,   0xd8 //8 10 9 11 12 14 13 15
+    xvpermi.w  D5,     D5,   0xd8 //8 9 10 11 12 13 14 15
+
+    xvst       D4,     C0,    0x00
+    xvst       D5,     C0,    0x20
+
+    addi.d     C0,     C0,    0x40
+#endif
+
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF   // TL = K - OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -16   // ... minus the 16 rows of this tile
+#else
+    addi.d     TL,     TL,   -1    // ... minus the 1 column of this tile
+#endif
+    slli.d     T3,     TL,   0x07  // TL * 128 bytes of A
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x03  // TL * 8 bytes of B
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   16   // OFF moves down by the 16 rows just processed
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L301   // T0 still holds bm/16 from .L300
+
+.L31:  /* if ( bm & 8 ): 8-row x 1-column tile of C */
+    move       I,      $r0
+    andi       T1,     M,     8    //bm&8
+    beq        I,      T1,    .L34   // bit clear: skip to the bm&4 tile
+
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb (repeats the move above)
+#else
+    slli.d     T3,     OFF,  0x06   // T3 = OFF * 64 (A advances 0x40 bytes per k in .L32)
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x03   // T3 = OFF * 8 (B advances 0x08 bytes per k in .L32)
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF   // TL = K - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   8     // TL = OFF + 8 (rows in this tile)
+#else
+    addi.d     TL,     OFF,   1     // TL = OFF + 1 (columns in this tile)
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    xvxor.v    U0,     U0,   U0   // U0 = 0: accumulator for the real parts
+    xvxor.v    U1,     U1,   U1   // U1 = 0: accumulator for the imaginary parts
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L33   // TL == 0: skip the k-loop
+    blt        TL,     L,     .L33   // TL < 0: skip the k-loop
+
+.L32:  /* for (k=0; k<temp; k++): 8 complex A values times one complex B value */
+    xvld       D0,     A0,    0x00  //a0ri a1ri a2ri a3ri
+    xvld       D1,     A0,    0x20  //a4ri a5ri a6ri a7ri
+
+    xvldrepl.w D2,     B0,    0x00 //b0r
+    xvldrepl.w D3,     B0,    0x04 //b0i
+
+    xvpackev.w D4,     D1,    D0    //a0r a4r a1r a5r a2r a6r a3r a7r
+    xvpermi.w  D4,     D4,    0xd8  //a0r a1r a4r a5r a2r a3r a6r a7r
+    xvpermi.d  D4,     D4,    0xd8  //a0r a1r a2r a3r a4r a5r a6r a7r
+
+    xvpackod.w D5,     D1,    D0    //a0i a4i a1i a5i a2i a6i a3i a7i
+    xvpermi.w  D5,     D5,    0xd8  //a0i a1i a4i a5i a2i a3i a6i a7i
+    xvpermi.d  D5,     D5,    0xd8  //a0i a1i a2i a3i a4i a5i a6i a7i
+
+    XVMADD1    U0,     D4,    D2,     U0  //00r 01r 02r 03r 04r 05r 06r 07r
+    XVMADD2    U1,     D5,    D2,     U1  //00i 01i 02i 03i 04i 05i 06i 07i
+    XVMADD3    U0,     D5,    D3,     U0  // a.imag * b0i term (sign is chosen by the XVMADD3 macro, defined outside this chunk)
+    XVMADD4    U1,     D4,    D3,     U1  // a.real * b0i term (sign is chosen by the XVMADD4 macro)
+
+    addi.d     A0,     A0,    0x40   // next k: 8 complex values of A
+    addi.d     B0,     B0,    0x08   // next k: 1 complex value of B
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L32
+
+.L33:   // write back the 8x1 tile: combine U0/U1 with VALPHAR/VALPHAI and store to C0
+#if defined(TRMMKERNEL)
+    //res00 res01 res02 res03 res04 res05 res06 res07
+    xvld       D0,     C0,    0x00 //c0:0 1 2 3 4 5 6 7
+    xvld       D1,     C0,    0x20 //c0:8 9 10 11 12 13 14 15
+
+    xvpackev.w D2,     D1,    D0  //0 8 2 10 4 12 6 14
+    xvpermi.w  D2,     D2,    0xd8  //0 2 8 10 4 6 12 14
+    xvpermi.d  D2,     D2,    0xd8  //0 2 4 6 8 10 12 14
+
+    xvpackod.w D3,     D1,    D0    //1 9 3 11 5 13 7 15
+    xvpermi.w  D3,     D3,    0xd8  //1 3 9 11 5 7 13 15
+    xvpermi.d  D3,     D3,    0xd8 //1 3 5 7 9 11 13 15
+
+    xvfmul.s      D2,    U0,    VALPHAR   // D2 = U0 * VALPHAR; overwrites the packed C values, so the loads above are unused in this path
+    xvfmul.s      D3,    U1,    VALPHAR   // D3 = U1 * VALPHAR
+    XVNMSUB      D2,    U1,    VALPHAI, D2   // real part: presumably D2 - U1 * VALPHAI (XVNMSUB is defined outside this chunk)
+    XVFMADD      D3,    U0,    VALPHAI, D3   // imaginary part: presumably D3 + U0 * VALPHAI
+
+    xvand.v    D4,     D2,   D2  //0 2 4 6 8 10 12 14
+    xvpermi.q  D4,     D3,   0x02 //0 2 4 6 1 3 5 7
+    xvpermi.d  D4,     D4,   0xd8 //0 2 1 3 4 6 5 7
+    xvpermi.w  D4,     D4,   0xd8 //0 1 2 3 4 5 6 7
+
+    xvand.v    D5,     D3,   D3   //1 3 5 7 9 11 13 15
+    xvpermi.q  D5,     D2,   0x31 //8 10 12 14 9 11 13 15
+    xvpermi.d  D5,     D5,   0xd8 //8 10 9 11 12 14 13 15
+    xvpermi.w  D5,     D5,   0xd8 //8 9 10 11 12 13 14 15
+
+    xvst       D4,     C0,    0x00
+    xvst       D5,     C0,    0x20
+
+    addi.d     C0,     C0,    0x40   // 8 complex values stored
+#else
+    //res00 res01 res02 res03 res04 res05 res06 res07
+    xvld       D0,     C0,    0x00 //c0:0 1 2 3 4 5 6 7
+    xvld       D1,     C0,    0x20 //c0:8 9 10 11 12 13 14 15
+
+    xvpackev.w D2,     D1,    D0  //0 8 2 10 4 12 6 14
+    xvpermi.w  D2,     D2,    0xd8  //0 2 8 10 4 6 12 14
+    xvpermi.d  D2,     D2,    0xd8  //0 2 4 6 8 10 12 14
+
+    xvpackod.w D3,     D1,    D0    //1 9 3 11 5 13 7 15
+    xvpermi.w  D3,     D3,    0xd8  //1 3 9 11 5 7 13 15
+    xvpermi.d  D3,     D3,    0xd8 //1 3 5 7 9 11 13 15
+
+    XVFMADD      D2,    U0,    VALPHAR, D2   // accumulate onto the existing C real parts
+    XVFMADD      D3,    U1,    VALPHAR, D3   // accumulate onto the existing C imaginary parts
+    XVNMSUB      D2,    U1,    VALPHAI, D2
+    XVFMADD      D3,    U0,    VALPHAI, D3
+
+    xvand.v    D4,     D2,   D2  //0 2 4 6 8 10 12 14
+    xvpermi.q  D4,     D3,   0x02 //0 2 4 6 1 3 5 7
+    xvpermi.d  D4,     D4,   0xd8 //0 2 1 3 4 6 5 7
+    xvpermi.w  D4,     D4,   0xd8 //0 1 2 3 4 5 6 7
+
+    xvand.v    D5,     D3,   D3   //1 3 5 7 9 11 13 15
+    xvpermi.q  D5,     D2,   0x31 //8 10 12 14 9 11 13 15
+    xvpermi.d  D5,     D5,   0xd8 //8 10 9 11 12 14 13 15
+    xvpermi.w  D5,     D5,   0xd8 //8 9 10 11 12 13 14 15
+
+    xvst       D4,     C0,    0x00
+    xvst       D5,     C0,    0x20
+
+    addi.d     C0,     C0,    0x40
+#endif
+
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF   // TL = K - OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -8    // ... minus the 8 rows of this tile
+#else
+    addi.d     TL,     TL,   -1    // ... minus the 1 column of this tile
+#endif
+    slli.d     T3,     TL,   0x06  // TL * 64 bytes of A
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x03  // TL * 8 bytes of B
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   8    // OFF moves down by the 8 rows just processed
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+.L34:   /* if ( bm & 4 ): 4-row x 1-column tile of C, 128-bit LSX registers */
+    move       I,      $r0
+    andi       T1,     M,     4    //bm&4
+    beq        I,      T1,    .L38   // bit clear: skip to the bm&2 tile
+
+.L35:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb (repeats the move above)
+#else
+    slli.d     T3,     OFF,   0x05   // T3 = OFF * 32 (A advances 0x20 bytes per k in .L36)
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x03   // T3 = OFF * 8 (B advances 0x08 bytes per k in .L36)
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF   // TL = K - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   4     // TL = OFF + 4 (rows in this tile)
+#else
+    addi.d     TL,     OFF,   1     // TL = OFF + 1 (columns in this tile)
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    $vr2,     $vr2,   $vr2   // $vr2 = 0: accumulator for the real parts
+    vxor.v    $vr3,     $vr3,   $vr3   // $vr3 = 0: accumulator for the imaginary parts
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L37   // TL == 0: skip the k-loop
+    blt        TL,     L,     .L37   // TL < 0: skip the k-loop
+
+.L36:  /* for (k=0; k<temp; k++): 4 complex A values times one complex B value */
+    vld       $vr16,     A0,    0x00  // a0ri a1ri
+    vld       $vr17,     A0,    0x10  // a2ri a3ri
+
+    vldrepl.w $vr18,     B0,    0x00 //b0r
+    vldrepl.w $vr19,     B0,    0x04 //b0i
+
+    vpackev.w $vr20,     $vr17,    $vr16      //a0r a2r a1r a3r
+    vshuf4i.w  $vr20,     $vr20,    0xd8  //a0r a1r a2r a3r
+
+    vpackod.w $vr21,     $vr17,    $vr16      //a0i a2i a1i a3i
+    vshuf4i.w  $vr21,     $vr21,    0xd8  //a0i a1i a2i a3i
+
+    VMADD1    $vr2,     $vr20,    $vr18,     $vr2  //00r 01r 02r 03r
+    VMADD2    $vr3,     $vr21,    $vr18,     $vr3  //00i 01i 02i 03i
+    VMADD3    $vr2,     $vr21,    $vr19,     $vr2  // a.imag * b0i term (sign is chosen by the VMADD3 macro, defined outside this chunk)
+    VMADD4    $vr3,     $vr20,    $vr19,     $vr3  // a.real * b0i term (sign is chosen by the VMADD4 macro)
+
+    addi.d     A0,     A0,    0x20   // next k: 4 complex values of A
+    addi.d     B0,     B0,    0x08   // next k: 1 complex value of B
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L36
+
+.L37:   // write back the 4x1 tile: combine $vr2/$vr3 with $vr28/$vr29 and store to C0
+#if defined(TRMMKERNEL)
+    //res00 res01 res02 res03
+    vld       $vr16,     C0,    0x00 //c0: 0 1 2 3
+    vld       $vr17,     C0,    0x10 //c0: 4 5 6 7
+
+    vpackev.w $vr18,     $vr17,    $vr16      //0 4 2 6
+    vshuf4i.w  $vr18,     $vr18,    0xd8  //0 2 4 6
+    vpackod.w $vr19,     $vr17,    $vr16      //1 5 3 7
+    vshuf4i.w  $vr19,     $vr19,    0xd8  //1 3 5 7
+
+    vfmul.s      $vr18,    $vr2,    $vr28   // overwrites the packed C values, so the loads above are unused in this path
+    vfmul.s      $vr19,    $vr3,    $vr28
+    VNMSUB      $vr18,    $vr3,    $vr29, $vr18   // real part: presumably $vr18 - $vr3 * $vr29 (VNMSUB is defined outside this chunk)
+    VFMADD      $vr19,    $vr2,    $vr29, $vr19   // imaginary part: presumably $vr19 + $vr2 * $vr29
+
+    vand.v    $vr20,     $vr19,   $vr19  //1 3 5 7
+    vpermi.w  $vr20,     $vr18,   0x44 //0 2 1 3
+    vshuf4i.w  $vr20,     $vr20,   0xd8 //0 1 2 3
+
+    vand.v    $vr21,     $vr19,   $vr19  //1 3 5 7
+    vpermi.w  $vr21,     $vr18,   0xee //4 6 5 7
+    vshuf4i.w  $vr21,     $vr21,   0xd8 //4 5 6 7
+
+    vst       $vr20,     C0,    0x00
+    vst       $vr21,     C0,    0x10
+
+    addi.d     C0,     C0,    0x20   // 4 complex values stored
+#else
+    //res00 res01 res02 res03
+    vld       $vr16,     C0,    0x00 //c0: 0 1 2 3
+    vld       $vr17,     C0,    0x10 //c0: 4 5 6 7
+
+    vpackev.w $vr18,     $vr17,    $vr16      //0 4 2 6
+    vshuf4i.w  $vr18,     $vr18,    0xd8  //0 2 4 6
+    vpackod.w $vr19,     $vr17,    $vr16      //1 5 3 7
+    vshuf4i.w  $vr19,     $vr19,    0xd8  //1 3 5 7
+
+    VFMADD      $vr18,    $vr2,    $vr28, $vr18   // accumulate onto the existing C real parts
+    VFMADD      $vr19,    $vr3,    $vr28, $vr19   // accumulate onto the existing C imaginary parts
+    VNMSUB      $vr18,    $vr3,    $vr29, $vr18
+    VFMADD      $vr19,    $vr2,    $vr29, $vr19
+
+    vand.v    $vr20,     $vr19,   $vr19  //1 3 5 7
+    vpermi.w  $vr20,     $vr18,   0x44 //0 2 1 3
+    vshuf4i.w  $vr20,     $vr20,   0xd8 //0 1 2 3
+
+    vand.v    $vr21,     $vr19,   $vr19  //1 3 5 7
+    vpermi.w  $vr21,     $vr18,   0xee //4 6 5 7
+    vshuf4i.w  $vr21,     $vr21,   0xd8 //4 5 6 7
+
+    vst       $vr20,     C0,    0x00
+    vst       $vr21,     C0,    0x10
+
+    addi.d     C0,     C0,    0x20
+#endif
+
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF   // TL = K - OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -4    // ... minus the 4 rows of this tile
+#else
+    addi.d     TL,     TL,   -1    // ... minus the 1 column of this tile
+#endif
+    slli.d     T3,     TL,   0x05  // TL * 32 bytes of A
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x03  // TL * 8 bytes of B
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   4    // OFF moves down by the 4 rows just processed
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+.L38:   /* if ( bm & 2 ): 2-row x 1-column tile of C, scalar registers */
+    move       I,      $r0
+    andi       T1,     M,     2    //bm&2
+    beq        I,      T1,    .L312   // bit clear: jump to .L312 (defined outside this chunk)
+
+.L39:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb (repeats the move above)
+#else
+    slli.d     T3,     OFF,   0x04   // T3 = OFF * 16 (A advances 0x10 bytes per k in .L310)
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x03   // T3 = OFF * 8 (B advances 0x08 bytes per k in .L310)
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF   // TL = K - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2     // TL = OFF + 2 (rows in this tile)
+#else
+    addi.d     TL,     OFF,   1     // TL = OFF + 1 (columns in this tile)
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0   // c11 = 0: res00 real
+    MTC        c12,    $r0   // c12 = 0: res00 imaginary
+    MTC        c21,    $r0   // c21 = 0: res01 real
+    MTC        c22,    $r0   // c22 = 0: res01 imaginary
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L311   // TL == 0: skip the k-loop
+    blt        TL,     L,     .L311   // TL < 0: skip the k-loop
+
+.L310:  /* for (k=0; k<temp; k++): two complex A values times one complex B value */
+    LD         a1,     A0,    0x00        //a0r
+    LD         a2,     A0,    0x04        //a0i
+    LD         a3,     A0,    0x08        //a1r
+    LD         a4,     A0,    0x0c        //a1i
+
+    LD         b1,     B0,    0x00        //b0r
+    LD         b2,     B0,    0x04        //b0i
+
+    MADD1      c11,    a1,    b1,     c11  //res00r
+    MADD2      c12,    a2,    b1,     c12  //res00i
+    MADD3      c11,    a2,    b2,     c11  // a0i * b0i term (sign is chosen by the MADD3 macro, defined outside this chunk)
+    MADD4      c12,    a1,    b2,     c12  // a0r * b0i term (sign is chosen by the MADD4 macro)
+
+    MADD1      c21,    a3,    b1,     c21  //res01r
+    MADD2      c22,    a4,    b1,     c22  //res01i
+    MADD3      c21,    a4,    b2,     c21  // a1i * b0i term
+    MADD4      c22,    a3,    b2,     c22  // a1r * b0i term
+
+    addi.d     A0,     A0,    0x10   // next k: 2 complex values of A
+    addi.d     B0,     B0,    0x08   // next k: 1 complex value of B
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L310
+
+.L311:
+#if defined(TRMMKERNEL)
+    //res00 res01
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+
+    MUL       a5,     c11,   ALPHA_R
+    MUL       a6,     c12,   ALPHA_R
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+
+    LD         a5,     C0,    0x08    //C0[2]
+    LD         a6,     C0,    0x0c    //C0[3]
+
+    MUL       a5,     c21,   ALPHA_R
+    MUL       a6,     c22,   ALPHA_R
+    NMSUB      a5,     c22,   ALPHA_I, a5
+    MADD       a6,     c21,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x08
+    ST         a6,     C0,    0x0c
+
+    addi.d     C0,     C0,    0x10
+#else
+    //res00 res01
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+
+    LD         a5,     C0,    0x08    //C0[2]
+    LD         a6,     C0,    0x0c    //C0[3]
+
+    MADD       a5,     c21,   ALPHA_R, a5
+    MADD       a6,     c22,   ALPHA_R, a6
+    NMSUB      a5,     c22,   ALPHA_I, a5
+    MADD       a6,     c21,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x08
+    ST         a6,     C0,    0x0c
+
+    addi.d     C0,     C0,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     T3,     TL,   0x04
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x03
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L312:   /* if ( bm & 1 )*/
+    move       I,      $r0
+    andi       T1,     M,     1    //bm&1
+    beq        I,      T1,    .L316
+
+.L313:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x03
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x03
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1
+#else
+    addi.d     TL,     OFF,   1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0
+    MTC        c12,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L315
+    blt        TL,     L,     .L315
+
+.L314:  /* for (k=0; k<temp; k++) */
+    LD         a1,     A0,    0x00
+    LD         a2,     A0,    0x04
+
+    LD         b1,     B0,    0x00
+    LD         b2,     B0,    0x04
+
+    MADD1      c11,    a1,    b1,     c11
+    MADD2      c12,    a2,    b1,     c12
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+
+    addi.d     A0,     A0,    0x08
+    addi.d     B0,     B0,    0x08
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L314
+
+.L315:
+#if defined(TRMMKERNEL)
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+
+    MUL       a5,     c11,   ALPHA_R
+    MUL       a6,     c12,   ALPHA_R
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+
+    addi.d     C0,     C0,    0x08
+#else
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+
+    addi.d     C0,     C0,    0x08
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     T3,     TL,   0x03
+    add.d      A0,     A0,   T3
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L316:
+    slli.d     L,      K,     3
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   1
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     1
+    andi       T0,     N,     1
+    blt        J,      T0,    .L300
+
+.L999:
+    LDARG      $r23,   $sp,   0
+    LDARG      $r24,   $sp,   8
+    LDARG      $r25,   $sp,   16
+    LDARG      $r26,   $sp,   24
+    LDARG      $r27,   $sp,   32
+    LD         $f23,   $sp,   40
+    LD         $f24,   $sp,   48
+    LD         $f25,   $sp,   56
+    LD         $f26,   $sp,   64
+    LD         $f27,   $sp,   72
+    LD         $f28,   $sp,   80
+    LD         $f29,   $sp,   88
+    LD         $f30,   $sp,   96
+    LD         $f31,   $sp,   104
+
+    addi.d     $sp,    $sp,   128
+    jirl       $r0,    $r1,   0x0
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/cgemm_kernel_2x2_lasx.S b/kernel/loongarch64/cgemm_kernel_2x2_lasx.S
new file mode 100644
index 000000000..e07f7dc64
--- /dev/null
+++ b/kernel/loongarch64/cgemm_kernel_2x2_lasx.S
@@ -0,0 +1,857 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+
+/* Function parameters */
+#define M      $r4   // param 1: bm
+#define N      $r5   // param 2: bn
+#define K      $r6   // param 3: bk
+#define ALPHA_R $f0   // param 4: alphar
+#define ALPHA_I $f1   // param 5: alphai
+#define A      $r7   // param 6: ba
+#define B      $r8  // param 7: bb
+#define C      $r9  // param 8: bc
+#define LDC    $r10  // param 9: ldc
+
+#if defined (TRMMKERNEL)
+#define OFFSET $r11  // param 10: offset
+#endif
+#define OFF    $r26  // running TRMM offset; zeroed in the prologue unless TRMMKERNEL && !LEFT
+
+#define I      $r12  // row-block (M direction) loop counter
+#define J      $r13  // column-block (N direction) loop counter
+#define L      $r14  // k-loop counter; also scratch for the B advance at the end of a column block
+#define TL     $r15  // k trip count for the current tile; also scratch for 2*LDC
+#define A0     $r16  // current read pointer into packed A
+#define B0     $r17  // current read pointer into packed B
+#define C0     $r18  // write pointer for the first output column of the tile
+#define C1     $r19  // write pointer for the second output column of the tile
+#define C2     $r20  // k-loop bound in the 2x2 tile (TL>>2, then TL&3)
+#define C3     $r23  // scratch byte offset for TRMM pointer adjustments
+#define T0     $r24  // loop bound (N>>1, M>>1, M&1 or N&1 depending on the loop)
+#define T1     $r25  // loop bound M&1 in the single-column tail
+
+#define a1     $f2   // a1..a4: scalar operands loaded from packed A
+#define a2     $f3
+#define a3     $f4
+#define a4     $f5
+#define a5     $f6   // a5..a8: scalar temporaries for C values / scaled results
+#define a6     $f7
+#define a7     $f8
+#define a8     $f9
+#define b1     $f10  // b1..b4: scalar operands loaded from packed B
+#define b2     $f11
+#define b3     $f12
+#define b4     $f13
+#define b5     $f14  // b5, b6: scalar temporaries for the second output column
+#define b6     $f15
+#define b7     $f16  // b7, b8 are not referenced in this file
+#define b8     $f17
+#define c11    $f18  // c11/c12: real/imaginary accumulator of the first result
+#define c12    $f19
+#define c21    $f20  // c21/c22: real/imaginary accumulator of the second result
+#define c22    $f21
+#define c31    $f22  // c31..c42 are not referenced in this file
+#define c32    $f23
+#define c41    $f24
+#define c42    $f25
+
+/* LASX vectors */
+#define U0     $xr30  // accumulator fed by XVMADD1/XVMADD3; low half is used as $vr30
+#define U1     $xr31  // accumulator fed by XVMADD2/XVMADD4; low half is used as $vr31
+#define U2     $xr2   // U2..U15 are not referenced in this file
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define U8     $xr8
+#define U9     $xr9
+#define U10    $xr10
+#define U11    $xr11
+#define U12    $xr12
+#define U13    $xr13
+#define U14    $xr14
+#define U15    $xr15
+#define D0     $xr16  // D0, D1: raw 32-byte loads from A
+#define D1     $xr17
+#define D2     $xr18  // D2, D3: raw 32-byte loads from B
+#define D3     $xr19
+#define D4     $xr20  // D4..D7: shuffled A operands
+#define D5     $xr21
+#define D6     $xr22
+#define D7     $xr23
+#define D8     $xr24  // D8..D11: shuffled B operands
+#define D9     $xr25
+#define D10    $xr26
+#define D11    $xr27
+#define VALPHAR    $xr28  // alpha real part replicated to every 32-bit lane
+#define VALPHAI    $xr29  // alpha imaginary part replicated to every 32-bit lane
+
+
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#define    XVMADD1       XVFMADD  // slot 1: first-word(A) * first-word(B) into the U0/c?1 accumulator
+#define    XVMADD2       XVFMADD  // slot 2: second-word(A) * first-word(B) into the U1/c?2 accumulator
+#define    XVMADD3       XVNMSUB  // slot 3: second-word(A) * second-word(B) into the U0/c?1 accumulator
+#define    XVMADD4       XVFMADD  // slot 4: first-word(A) * second-word(B) into the U1/c?2 accumulator
+    // The 128-bit (VMADDn) and scalar (MADDn) aliases below use the same slot numbering.
+#define    VMADD1       VFMADD
+#define    VMADD2       VFMADD
+#define    VMADD3       VNMSUB
+#define    VMADD4       VFMADD
+    // XVFADD1..4 are defined for every variant but are not referenced in this file.
+#define    XVFADD1       XVFADD
+#define    XVFADD2       XVFADD
+#define    XVFADD3       XVFSUB
+#define    XVFADD4       XVFADD
+    // MADD/NMSUB and the V*/XV* forms are defined outside this file (presumably via common.h).
+#define    MADD1       MADD
+#define    MADD2       MADD
+#define    MADD3       NMSUB
+#define    MADD4       MADD
+#endif
+
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+#define    XVMADD1       XVFMADD
+#define    XVMADD2       XVFMADD
+#define    XVMADD3       XVFMADD  // slot 3 is added instead of subtracted in this variant
+#define    XVMADD4       XVNMSUB  // slot 4 is subtracted in this variant
+
+#define    VMADD1       VFMADD
+#define    VMADD2       VFMADD
+#define    VMADD3       VFMADD
+#define    VMADD4       VNMSUB
+
+#define    XVFADD1       XVFADD
+#define    XVFADD2       XVFADD
+#define    XVFADD3       XVFADD
+#define    XVFADD4       XVFSUB
+
+#define    MADD1       MADD
+#define    MADD2       MADD
+#define    MADD3       MADD
+#define    MADD4       NMSUB
+#endif
+
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+#define    XVMADD1       XVFMADD
+#define    XVMADD2       XVNMSUB  // slot 2 is subtracted in this variant
+#define    XVMADD3       XVFMADD  // slot 3 is added in this variant
+#define    XVMADD4       XVFMADD
+
+#define    VMADD1       VFMADD
+#define    VMADD2       VNMSUB
+#define    VMADD3       VFMADD
+#define    VMADD4       VFMADD
+
+#define    XVFADD1       XVFADD
+#define    XVFADD2       XVFSUB
+#define    XVFADD3       XVFADD
+#define    XVFADD4       XVFADD
+
+#define    MADD1       MADD
+#define    MADD2       NMSUB
+#define    MADD3       MADD
+#define    MADD4       MADD
+#endif
+
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+#define    XVMADD1       XVFMADD
+#define    XVMADD2       XVNMSUB  // slots 2, 3 and 4 are all subtracted in this variant
+#define    XVMADD3       XVNMSUB
+#define    XVMADD4       XVNMSUB
+
+#define    VMADD1       VFMADD
+#define    VMADD2       VNMSUB
+#define    VMADD3       VNMSUB
+#define    VMADD4       VNMSUB
+
+#define    XVFADD1       XVFADD
+#define    XVFADD2       XVFSUB
+#define    XVFADD3       XVFSUB
+#define    XVFADD4       XVFSUB
+
+#define    MADD1       MADD
+#define    MADD2       NMSUB
+#define    MADD3       NMSUB
+#define    MADD4       NMSUB
+#endif
+
+    PROLOGUE
+    // 128-byte frame: integer saves at 0..32, FP saves at 40..104, alpha_r/alpha_i at 112/120.
+    addi.d     $sp,    $sp,   -128
+    SDARG      $r23,   $sp,   0
+    SDARG      $r24,   $sp,   8
+    SDARG      $r25,   $sp,   16
+    SDARG      $r26,   $sp,   24
+    SDARG      $r27,   $sp,   32  // $r27 has no alias in this file; it is saved and restored only
+    ST         $f23,   $sp,   40
+    ST         $f24,   $sp,   48  // NOTE(review): $f24-$f31 overlap $xr24-$xr31, which this kernel overwrites
+    ST         $f25,   $sp,   56  // (D8-D11, VALPHAR, VALPHAI, U0, U1). ST/LD are defined outside this file;
+    ST         $f26,   $sp,   64  // confirm they save/restore all 64 bits in a single-precision build.
+    ST         $f27,   $sp,   72
+    ST         $f28,   $sp,   80
+    ST         $f29,   $sp,   88
+    ST         $f30,   $sp,   96
+    ST         $f31,   $sp,   104
+    ST         ALPHA_R,$sp,   112  // spill alpha so it can be broadcast from memory below
+    ST         ALPHA_I,$sp,   120
+    // Replicate the 32-bit alpha_r / alpha_i words into every lane of VALPHAR / VALPHAI.
+    xvldrepl.w  VALPHAR, $sp, 112
+    xvldrepl.w  VALPHAI, $sp, 120
+
+#if defined (TRMMKERNEL) && !defined(LEFT)
+    sub.d      OFF,    $r0,   OFFSET  // OFF = -OFFSET
+#else
+    xor        OFF,    OFF,   OFF     // OFF = 0
+#endif
+
+    slli.d     LDC,    LDC,   2       // LDC *= 4; later code shifts by one more bit to step one column of C
+
+    move       J,      $r0
+    srai.d     T0,     N,     1       // T0 = bn / 2 column pairs
+    beq        J,      T0,    .L19    // no column pairs: go to the bn&1 tail
+
+.L10:  /* for(j=0; j<bn/2; j+=1) */
+    move       C0,     C              // C0 = first column of this column pair
+    slli.d     TL,     LDC,   1       // TL = byte stride of one column (LDC was pre-scaled by 4)
+    add.d      C1,     C0,    TL      // C1 = second column of this column pair
+    move       A0,     A    //ptrba
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET         // restart the offset for every column pair
+#endif
+
+    move       I,      $r0
+    srai.d     T0,     M,     1       // T0 = bm / 2 row pairs
+    beq        I,      T0,    .L150   // no row pairs: go to the bm&1 tail
+
+.L11:  /* for(i=0; i<bm/2; i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B     //ptrbb
+#else
+    slli.d     C3,     OFF,   0x04    // OFF k-steps * 16 bytes (A and B both advance 0x10 per k here)
+    add.d      A0,     A0,    C3      // skip the first OFF k-steps of A
+    add.d      B0,     B,     C3      // skip the first OFF k-steps of B
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF   //temp
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2       // k trip count = OFF + 2
+#else
+    addi.d     TL,     OFF,   2       // same value as the LEFT branch for the 2x2 tile
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    xvxor.v    U0,     U0,   U0       // clear the U0 accumulator
+    xvxor.v    U1,     U1,   U1       // clear the U1 accumulator
+
+    move       L,      $r0   //cycle param k
+    srai.d     C2,     TL,     2      // C2 = number of 4-step k blocks
+    beq        L,      C2,    .L130   // zero blocks: go to the k remainder
+    blt        C2,     L,     .L130   // negative count: also skip the unrolled loop
+
+.L12:  /* for(k=0; k<bk/4; k+=1) */
+    xvld       D0,     A0,    0x00  //a 0-7
+    xvld       D1,     A0,    0x20  //a 8-15
+    xvld       D2,     B0,    0x00  //b 0-7
+    xvld       D3,     B0,    0x20  //b 8-15
+    // xvand.v X, Y, Y is used as a register copy before each in-place permute.
+    xvand.v    D4,     D0,    D0
+    xvpermi.q  D4,     D1,    0x02  //a 0 1 2 3 8 9 10 11
+    xvand.v    D5,     D4,    D4
+    xvshuf4i.w D4,     D4,    0x88  //a 0 2 0 2 8 10 8 10
+    xvshuf4i.w D5,     D5,    0xdd  //a 1 3 1 3 9 11 9 11
+
+    xvand.v    D6,     D1,    D1
+    xvpermi.q  D6,     D0,    0x31  //a 4 5 6 7 12 13 14 15
+    xvand.v    D7,     D6,    D6
+    xvshuf4i.w D6,     D6,    0x88  //a 4 6 4 6 12 14 12 14
+    xvshuf4i.w D7,     D7,    0xdd  //a 5 7 5 7 13 15 13 15
+
+    xvand.v    D8,     D2,    D2
+    xvpermi.q  D8,     D3,    0x02  //b 0 1 2 3 8 9 10 11
+    xvand.v    D9,     D8,    D8
+    xvshuf4i.w D8,     D8,    0xa0  //b 0 0 2 2 8 8 10 10
+    xvshuf4i.w D9,     D9,    0xf5  //b 1 1 3 3 9 9 11 11
+
+    xvand.v    D10,     D3,    D3
+    xvpermi.q  D10,     D2,    0x31  //b 4 5 6 7 12 13 14 15
+    xvand.v    D11,     D10,    D10
+    xvshuf4i.w D10,     D10,    0xa0  //b 4 4 6 6 12 12 14 14
+    xvshuf4i.w D11,     D11,    0xf5  //b 5 5 7 7 13 13 15 15
+    // Each MADD pair is issued twice around a half swap so both 128-bit halves end with the same sum.
+    XVMADD1      U0,     D4,    D8,     U0  //res0 2 4 6 0 2 4 6
+    XVMADD2      U1,     D5,    D8,     U1  //res1 3 5 7 1 3 5 7
+
+    xvpermi.q    U0,     U0,    0x01  // swap the two 128-bit halves of U0
+    xvpermi.q    U1,     U1,    0x01  // swap the two 128-bit halves of U1
+    XVMADD1      U0,     D4,    D8,     U0
+    XVMADD2      U1,     D5,    D8,     U1
+
+    XVMADD3      U0,     D5,    D9,     U0
+    XVMADD4      U1,     D4,    D9,     U1
+
+    xvpermi.q    U0,     U0,    0x01
+    xvpermi.q    U1,     U1,    0x01
+    XVMADD3      U0,     D5,    D9,     U0
+    XVMADD4      U1,     D4,    D9,     U1
+
+    XVMADD1      U0,     D6,    D10,     U0  //res0 2 4 6 0 2 4 6
+    XVMADD2      U1,     D7,    D10,     U1  //res1 3 5 7 1 3 5 7
+
+    xvpermi.q    U0,     U0,    0x01
+    xvpermi.q    U1,     U1,    0x01
+    XVMADD1      U0,     D6,    D10,     U0
+    XVMADD2      U1,     D7,    D10,     U1
+
+    XVMADD3      U0,     D7,    D11,     U0
+    XVMADD4      U1,     D6,    D11,     U1
+
+    xvpermi.q    U0,     U0,    0x01
+    xvpermi.q    U1,     U1,    0x01
+    XVMADD3      U0,     D7,    D11,     U0
+    XVMADD4      U1,     D6,    D11,     U1
+
+    addi.d     A0,     A0,    0x40  // consumed 64 bytes of A (four k-steps)
+    addi.d     B0,     B0,    0x40  // consumed 64 bytes of B (four k-steps)
+
+    addi.d     L,      L,     1
+    blt        L,      C2,    .L12
+
+.L130:
+    move       L,      $r0
+    andi       C2,     TL,     3      // C2 = k-steps left over after the 4-step blocks
+    beq        L,      C2,    .L14    // none left: go straight to the write-back
+
+.L13:  /* for(k=0; k<(bk&3); k+=1) */
+    vld       $vr16,     A0,    0x00  //a0 a1 a2 a3
+    vld       $vr17,     B0,    0x00  //b0 b1 b2 b3
+
+    vshuf4i.w $vr20,     $vr17,    0xa0    //b0 b0 b2 b2
+    vshuf4i.w $vr21,     $vr17,    0xf5    //b1 b1 b3 b3
+
+    vshuf4i.w  $vr18,     $vr16,    0x88  //a0 a2 a0 a2
+    vshuf4i.w  $vr19,     $vr16,    0xdd  //a1 a3 a1 a3
+    // $vr30/$vr31 are the low 128 bits of U0/U1, so this continues the sums from .L12.
+    VMADD1      $vr30,     $vr18,    $vr20,     $vr30  //res0 2 4 6
+    VMADD2      $vr31,     $vr19,    $vr20,     $vr31  //res1 3 5 7
+    VMADD3      $vr30,     $vr19,    $vr21,     $vr30
+    VMADD4      $vr31,     $vr18,    $vr21,     $vr31
+
+    addi.d     A0,     A0,    0x10  // one k-step of A (16 bytes)
+    addi.d     B0,     B0,    0x10  // one k-step of B (16 bytes)
+
+    addi.d     L,      L,     1
+    blt        L,      C2,    .L13
+
+.L14:
+#if defined(TRMMKERNEL)
+    vld       $vr8,     C0,    0x00  //0 1 2 3
+    vld       $vr9,     C1,    0x00  //4 5 6 7
+    // NOTE(review): the C values gathered into $vr10/$vr11 here are overwritten by vfmul.s below.
+    vpackev.w $vr10,    $vr9,    $vr8    //0 4 2 6
+    vpermi.w  $vr10,    $vr10,   0xd8    //0 2 4 6
+
+    vpackod.w $vr11,    $vr9,    $vr8    //1 5 3 7
+    vpermi.w  $vr11,    $vr11,   0xd8    //1 3 5 7
+    // TRMM result = alpha * acc, with $vr28/$vr29 being the low halves of VALPHAR/VALPHAI.
+    vfmul.s      $vr10,    $vr30,    $vr28
+    vfmul.s      $vr11,    $vr31,    $vr28
+    VNMSUB    $vr10,    $vr31,    $vr29, $vr10
+    VFMADD     $vr11,    $vr30,    $vr29, $vr11
+    // Re-interleave the $vr10/$vr11 words into the memory layout of each column.
+    vilvl.w   $vr8,     $vr11,   $vr10  //0 1 2 3
+
+    vilvh.w   $vr9,     $vr11,   $vr10  //4 5 6 7
+
+    vst       $vr8,     C0,    0x00
+    vst       $vr9,     C1,    0x00
+#else
+    vld       $vr8,     C0,    0x00  //0 1 2 3
+    vld       $vr9,     C1,    0x00  //4 5 6 7
+    // Gather even words of both columns into $vr10 and odd words into $vr11.
+    vpackev.w $vr10,    $vr9,    $vr8    //0 4 2 6
+    vpermi.w  $vr10,    $vr10,   0xd8    //0 2 4 6
+
+    vpackod.w $vr11,    $vr9,    $vr8    //1 5 3 7
+    vpermi.w  $vr11,    $vr11,   0xd8    //1 3 5 7
+    // GEMM result = C + alpha * acc.
+    VFMADD      $vr10,    $vr30,    $vr28, $vr10
+    VFMADD      $vr11,    $vr31,    $vr28, $vr11
+    VNMSUB     $vr10,    $vr31,    $vr29, $vr10
+    VFMADD      $vr11,    $vr30,    $vr29, $vr11
+
+    vilvl.w   $vr8,     $vr11,   $vr10  //0 1 2 3
+
+    vilvh.w   $vr9,     $vr11,   $vr10  //4 5 6 7
+
+    vst       $vr8,     C0,    0x00
+    vst       $vr9,     C1,    0x00
+#endif
+
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     C3,     TL,   0x04    // (K - OFF - 2) k-steps * 16 bytes
+    add.d      A0,     A0,   C3      // skip the k-steps this tile did not consume
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2      // two more rows handled
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x10   // advance both column pointers by 16 bytes
+    addi.d     C1,     C1,    0x10
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L11
+
+.L150:
+    move       I,      $r0
+    andi       T0,     M,     1       // T0 = bm & 1 (overwrites the bm/2 bound)
+    beq        I,      T0,    .L18    // bm is even: this column pair is done
+
+.L15:  /* for(i=0; i<(bm&1); i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     C3,     OFF,  0x03     // A advances 8 bytes per k-step in this tile
+    add.d      A0,     A0,   C3
+    slli.d     C3,     OFF,  0x04     // B advances 16 bytes per k-step in this tile
+    add.d      B0,     B,    C3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1       // k trip count = OFF + 1
+#else
+    addi.d     TL,     OFF,   2       // k trip count = OFF + 2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0            // clear the four scalar accumulators
+    MTC        c12,    $r0
+    MTC        c21,    $r0
+    MTC        c22,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L17    // zero trip count: skip the loop
+    blt        TL,     L,     .L17    // negative trip count: skip the loop
+
+.L16:  /* for (k=0; k<bk; k+=1) */
+    LD         a1,     A0,    0x00        //load0
+    LD         b1,     B0,    0x00        //load1
+    MADD1      c11,    a1,    b1,     c11  //res0
+    LD         a2,     A0,    0x04        //load2
+    MADD2      c12,    a2,    b1,     c12  //res1
+    LD         b2,     B0,    0x04        //load3
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+    LD         b3,     B0,    0x08        //load4
+    MADD1      c21,    a1,    b3,     c21  //res2
+    MADD2      c22,    a2,    b3,     c22  //res3
+    LD         b4,     B0,    0x0c        //load5
+    MADD3      c21,    a2,    b4,     c21
+    MADD4      c22,    a1,    b4,     c22
+
+    addi.d     A0,     A0,    0x08        // one k-step of A (8 bytes)
+    addi.d     B0,     B0,    0x10        // one k-step of B (16 bytes)
+
+    addi.d     L,      L,     1
+    blt        L,      TL,     .L16
+
+.L17:
+#if defined(TRMMKERNEL)
+    MUL        a5,     c11,   ALPHA_R
+    MUL        a6,     c12,   ALPHA_I
+    SUB        a5,     a5,    a6          // c11*alpha_r - c12*alpha_i
+    ST         a5,     C0,    0x00
+
+    MUL        a5,     c12,   ALPHA_R
+    MUL        a6,     c11,   ALPHA_I
+    ADD        a6,     a5,    a6          // c12*alpha_r + c11*alpha_i
+    ST         a6,     C0,    0x04
+
+    MUL        b5,     c21,   ALPHA_R
+    MUL        b6,     c22,   ALPHA_I
+    SUB        b5,     b5,    b6          // c21*alpha_r - c22*alpha_i
+    ST         b5,     C1,    0x00
+
+    MUL        b5,     c22,   ALPHA_R
+    MUL        b6,     c21,   ALPHA_I
+    ADD        b6,     b5,    b6          // c22*alpha_r + c21*alpha_i
+    ST         b6,     C1,    0x04
+#else
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+    LD         b5,     C1,    0x00    //C1[0]
+    LD         b6,     C1,    0x04    //C1[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5  // accumulate alpha * (c11, c12) onto the loaded C0 pair
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+
+    MADD       b5,     c21,   ALPHA_R, b5  // accumulate alpha * (c21, c22) onto the loaded C1 pair
+    MADD       b6,     c22,   ALPHA_R, b6
+    NMSUB      b5,     c22,   ALPHA_I, b5
+    MADD       b6,     c21,   ALPHA_I, b6
+    ST         b5,     C1,    0x00
+    ST         b6,     C1,    0x04
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     C3,     TL,   0x03     // remaining k-steps * 8 bytes of A
+    add.d      A0,     A0,   C3
+    slli.d     C3,     TL,   0x04     // remaining k-steps * 16 bytes of B
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1       // one more row handled
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x08    // advance both column pointers by 8 bytes
+    addi.d     C1,     C1,    0x08
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L15
+
+.L18:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi.d     OFF,    OFF,   2       // two more columns handled
+#endif
+
+    slli.d     L,      K,     0x04    // K k-steps * 16 bytes of B for this column pair
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   0x02    // 4 * LDC = two column strides (one stride is 2 * LDC, see .L10)
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     1
+    srai.d     T0,     N,     1       // recompute bn / 2; T0 was reused as the row bound
+    blt        J,      T0,    .L10
+
+.L19:
+    move       J,      $r0
+    andi       T0,     N,     1       // T0 = bn & 1
+    beq        J,      T0,    .L30    // bn is even: nothing left, go to the epilogue
+
+.L20: /* for (j=0; j<(bn&1); j+=1) */
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET         // restart the offset for the last column
+#endif
+
+    move       C0,     C              // only one output column in this tail
+    move       A0,     A    //ptrba
+
+    move       I,      $r0
+    srai.d     T0,     M,     1       // T0 = bm / 2 row pairs
+    beq        I,      T0,    .L24    // no row pairs: go to the bm&1 tail
+
+.L21:  /* for (i=0; i<bm/2; i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     C3,     OFF,  0x04     // A advances 16 bytes per k-step in this tile
+    add.d      A0,     A0,   C3
+    slli.d     C3,     OFF,  0x03     // B advances 8 bytes per k-step in this tile
+    add.d      B0,     B,    C3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2       // k trip count = OFF + 2
+#else
+    addi.d     TL,     OFF,   1       // k trip count = OFF + 1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0            // clear the four scalar accumulators
+    MTC        c12,    $r0
+    MTC        c21,    $r0
+    MTC        c22,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L23    // zero trip count: skip the loop
+    blt        TL,     L,     .L23    // negative trip count: skip the loop
+
+.L22:  /* for (k=0; k<bk; k+=1) */
+    LD         a1,     A0,    0x00        //load0
+    LD         b1,     B0,    0x00        //load1
+    MADD1      c11,    a1,    b1,     c11  //res0
+    LD         a2,     A0,    0x04        //load2
+    MADD2      c12,    a2,    b1,     c12  //res1
+    LD         b2,     B0,    0x04        //load3
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+    LD         a3,     A0,    0x08        //load4
+    MADD1      c21,    a3,    b1,     c21  //res2
+    LD         a4,     A0,    0x0c        //load5
+    MADD2      c22,    a4,    b1,     c22  //res3
+    MADD3      c21,    a4,    b2,     c21
+    MADD4      c22,    a3,    b2,     c22
+
+    addi.d     A0,     A0,    0x10        // one k-step of A (16 bytes)
+    addi.d     B0,     B0,    0x08        // one k-step of B (8 bytes)
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L22
+
+.L23:
+#if defined(TRMMKERNEL)
+    MUL        a5,     c11,   ALPHA_R
+    MUL        a6,     c12,   ALPHA_I
+    SUB        a5,     a5,    a6          // c11*alpha_r - c12*alpha_i
+    ST         a5,     C0,    0x00
+
+    MUL        a5,     c12,   ALPHA_R
+    MUL        a6,     c11,   ALPHA_I
+    ADD        a6,     a5,    a6          // c12*alpha_r + c11*alpha_i
+    ST         a6,     C0,    0x04
+
+    MUL        a7,     c21,   ALPHA_R
+    MUL        a8,     c22,   ALPHA_I
+    SUB        a7,     a7,    a8          // c21*alpha_r - c22*alpha_i
+    ST         a7,     C0,    0x08
+
+    MUL        a7,     c22,   ALPHA_R
+    MUL        a8,     c21,   ALPHA_I
+    ADD        a8,     a7,    a8          // c22*alpha_r + c21*alpha_i
+    ST         a8,     C0,    0x0c
+#else
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+    LD         a7,     C0,    0x08    //C0[2]
+    LD         a8,     C0,    0x0c    //C0[3]
+
+    MADD       a5,     c11,   ALPHA_R, a5  // accumulate alpha * (c11, c12) onto C0[0..1]
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+    MADD       a7,     c21,   ALPHA_R, a7  // accumulate alpha * (c21, c22) onto C0[2..3]
+    MADD       a8,     c22,   ALPHA_R, a8
+    NMSUB      a7,     c22,   ALPHA_I, a7
+    MADD       a8,     c21,   ALPHA_I, a8
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+    ST         a7,     C0,    0x08
+    ST         a8,     C0,    0x0c
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     C3,     TL,   0x04     // remaining k-steps * 16 bytes of A
+    add.d      A0,     A0,   C3
+    slli.d     C3,     TL,   0x03     // remaining k-steps * 8 bytes of B
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2       // two more rows handled
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x10    // advance the column pointer by 16 bytes
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L21
+
+.L24:
+    move       I,      $r0
+    andi       T1,     M,     1    //bm&1
+    beq        I,      T1,    .L28    // bm is even: the last column is done
+
+.L25:  /* for (i=0; i<(bm&1); i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     C3,     OFF,   0x03    // A and B both advance 8 bytes per k-step in this tile
+    add.d      A0,     A0,   C3
+    add.d      B0,     B,    C3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1       // k trip count = OFF + 1
+#else
+    addi.d     TL,     OFF,   1       // same value as the LEFT branch for the 1x1 tile
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0            // clear the two scalar accumulators
+    MTC        c12,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L27    // zero trip count: skip the loop
+    blt        TL,     L,     .L27    // negative trip count: skip the loop
+
+.L26:  /* for (k=0; k<bk; k+=1) */
+    LD         a1,     A0,    0x00        //load0
+    LD         b1,     B0,    0x00        //load1
+    MADD1      c11,    a1,    b1,     c11  //res0
+    LD         a2,     A0,    0x04        //load2
+    MADD2      c12,    a2,    b1,     c12  //res1
+    LD         b2,     B0,    0x04        //load3
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+
+    addi.d     A0,     A0,    0x08        // one k-step of A (8 bytes)
+    addi.d     B0,     B0,    0x08        // one k-step of B (8 bytes)
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L26
+
+.L27:
+#if defined(TRMMKERNEL)
+    MUL        a5,     c11,   ALPHA_R
+    MUL        a6,     c12,   ALPHA_I
+    SUB        a5,     a5,    a6          // c11*alpha_r - c12*alpha_i
+    ST         a5,     C0,    0x00
+
+    MUL        a5,     c12,   ALPHA_R
+    MUL        a6,     c11,   ALPHA_I
+    ADD        a6,     a5,    a6          // c12*alpha_r + c11*alpha_i
+    ST         a6,     C0,    0x04
+#else
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5  // accumulate alpha * (c11, c12) onto the loaded C0 pair
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     C3,     TL,   0x03     // remaining k-steps * 8 bytes, same for A and B
+    add.d      A0,     A0,   C3
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1       // one more row handled
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x08    // advance the column pointer by 8 bytes
+
+    addi.d     I,      I,     1
+    blt        I,      T1,    .L25
+
+.L28:
+    slli.d     L,      K,     3       // K k-steps * 8 bytes of B for this single column
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   1       // 2 * LDC = one column stride (LDC was pre-scaled by 4)
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     1
+    andi       T0,     N,     1       // recompute bn & 1; T0 was reused as the row bound
+    blt        J,      T0,    .L20
+
+.L30:
+    LDARG      $r23,   $sp,   0       // restore the registers saved at the same offsets in the prologue
+    LDARG      $r24,   $sp,   8
+    LDARG      $r25,   $sp,   16
+    LDARG      $r26,   $sp,   24
+    LDARG      $r27,   $sp,   32
+    LD         $f23,   $sp,   40
+    LD         $f24,   $sp,   48      // NOTE(review): LD is defined outside this file; confirm it reloads
+    LD         $f25,   $sp,   56      // all 64 bits of $f24-$f31, since the kernel overwrites the
+    LD         $f26,   $sp,   64      // overlapping $xr24-$xr31 registers.
+    LD         $f27,   $sp,   72
+    LD         $f28,   $sp,   80
+    LD         $f29,   $sp,   88
+    LD         $f30,   $sp,   96
+    LD         $f31,   $sp,   104
+    // Release the 128-byte frame and return through $r1.
+    addi.d     $sp,    $sp,   128
+    jirl       $r0,    $r1,   0x0
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/cgemm_kernel_2x2_lsx.S b/kernel/loongarch64/cgemm_kernel_2x2_lsx.S
new file mode 100644
index 000000000..ebacf4e93
--- /dev/null
+++ b/kernel/loongarch64/cgemm_kernel_2x2_lsx.S
@@ -0,0 +1,812 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+
+/* Function parameters */
+#define M      $r4   // param 1: bm, bound of the i loops
+#define N      $r5   // param 2: bn, bound of the j loops
+#define K      $r6   // param 3: bk, bound of the k loops
+#define ALPHA_R $f0   // param 4: alphar
+#define ALPHA_I $f1   // param 5: alphai
+#define A      $r7   // param 6: ba, base pointer reloaded into A0
+#define B      $r8  // param 7: bb, base pointer reloaded into B0
+#define C      $r9  // param 8: bc, base pointer reloaded into C0
+#define LDC    $r10  // param 9: ldc, multiplied by 4 in the prologue
+
+#if defined (TRMMKERNEL)
+#define OFFSET $r11  // param 10: offset, only named in TRMMKERNEL builds
+#endif
+#define OFF    $r26  // running offset of the TRMM paths, initialised in the prologue
+
+#define I      $r12  // i loop counter; also scratch for the C advance
+#define J      $r13  // j loop counter
+#define L      $r14  // k loop counter; also scratch for the B advance
+#define TL     $r15  // k trip count; also scratch for pointer arithmetic
+#define A0     $r16  // moving pointer into A
+#define B0     $r17  // moving pointer into B
+#define C0     $r18  // moving pointer into C
+#define C1     $r19  // C0 + (scaled LDC << 1)
+#define C2     $r20  // in this file: trip count of the vector k loops
+#define C3     $r23  // in this file: byte-offset scratch of the TRMM paths
+#define T0     $r24  // bound of the current i or j loop
+#define T1     $r25  // bound (bm&1) of the last i loop
+
+#define a1     $f2   // a1-a8: scalar A operands and C temporaries
+#define a2     $f3
+#define a3     $f4
+#define a4     $f5
+#define a5     $f6
+#define a6     $f7
+#define a7     $f8
+#define a8     $f9
+#define b1     $f10  // b1-b8: scalar B operands and C temporaries
+#define b2     $f11
+#define b3     $f12
+#define b4     $f13
+#define b5     $f14
+#define b6     $f15
+#define b7     $f16
+#define b8     $f17
+#define c11    $f18  // c11-c22: scalar accumulators of the edge tiles
+#define c12    $f19
+#define c21    $f20
+#define c22    $f21
+#define c31    $f22  // c31-c42 are defined but not referenced in this file
+#define c32    $f23
+#define c41    $f24
+#define c42    $f25
+
+/* LSX vectors ($vr registers) */
+#define U0     $vr30  // U0/U1: the two accumulators of the vector 2x2 tile
+#define U1     $vr31
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define U8     $vr8   // U8-U11: load, shuffle and store registers for the C tile
+#define U9     $vr9
+#define U10    $vr10
+#define U11    $vr11
+#define U12    $vr12
+#define U13    $vr13
+#define U14    $vr14
+#define U15    $vr15
+#define D0     $vr16  // D0/D1: raw A and B loads; D2-D5: their shuffles
+#define D1     $vr17
+#define D2     $vr18
+#define D3     $vr19
+#define D4     $vr20
+#define D5     $vr21
+#define D6     $vr22  // D6-D11 are defined but not referenced in this file
+#define D7     $vr23
+#define D8     $vr24
+#define D9     $vr25
+#define D10    $vr26
+#define D11    $vr27
+#define VALPHAR    $vr28  // alphar replicated into every 32-bit lane
+#define VALPHAI    $vr29  // alphai replicated into every 32-bit lane
+
+
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)  // NMSUB form in slot 3 only
+#define    VMADD1       VFMADD  // slot 1: even floats of A times even floats of B, into U0
+#define    VMADD2       VFMADD  // slot 2: odd floats of A times even floats of B, into U1
+#define    VMADD3       VNMSUB  // slot 3: odd floats of A times odd floats of B, into U0
+#define    VMADD4       VFMADD  // slot 4: even floats of A times odd floats of B, into U1
+// NOTE(review): VFMADD/VNMSUB/MADD/NMSUB are defined outside this file; NMSUB presumably yields acc - product - confirm
+#define    MADD1       MADD   // scalar slot 1: A[0] * B[0] into the first accumulator
+#define    MADD2       MADD   // scalar slot 2: A[1] * B[0] into the second accumulator
+#define    MADD3       NMSUB  // scalar slot 3: A[1] * B[1] into the first accumulator
+#define    MADD4       MADD   // scalar slot 4: A[0] * B[1] into the second accumulator
+#endif
+
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)  // NMSUB form in slot 4 only
+#define    VMADD1       VFMADD
+#define    VMADD2       VFMADD
+#define    VMADD3       VFMADD
+#define    VMADD4       VNMSUB
+
+#define    MADD1       MADD
+#define    MADD2       MADD
+#define    MADD3       MADD
+#define    MADD4       NMSUB
+#endif
+
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)  // NMSUB form in slot 2 only
+#define    VMADD1       VFMADD
+#define    VMADD2       VNMSUB
+#define    VMADD3       VFMADD
+#define    VMADD4       VFMADD
+
+#define    MADD1       MADD
+#define    MADD2       NMSUB
+#define    MADD3       MADD
+#define    MADD4       MADD
+#endif
+
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)  // NMSUB form in slots 2, 3 and 4
+#define    VMADD1       VFMADD
+#define    VMADD2       VNMSUB
+#define    VMADD3       VNMSUB
+#define    VMADD4       VNMSUB
+
+#define    MADD1       MADD
+#define    MADD2       NMSUB
+#define    MADD3       NMSUB
+#define    MADD4       NMSUB
+#endif
+
+    PROLOGUE   // entry macro, defined outside this file
+
+    addi.d     $sp,    $sp,   -128   // open a 128-byte stack frame
+    SDARG      $r23,   $sp,   0      // spill $r23-$r27; this file writes $r23-$r26 (C3, T0, T1, OFF)
+    SDARG      $r24,   $sp,   8
+    SDARG      $r25,   $sp,   16
+    SDARG      $r26,   $sp,   24
+    SDARG      $r27,   $sp,   32
+    ST         $f23,   $sp,   40     // spill $f23-$f31 into 8-byte slots
+    ST         $f24,   $sp,   48     // NOTE(review): ST is defined outside this file; confirm its
+    ST         $f25,   $sp,   56     // store width preserves what the caller expects, because
+    ST         $f26,   $sp,   64     // $vr28-$vr31 are overwritten below as whole vectors
+    ST         $f27,   $sp,   72
+    ST         $f28,   $sp,   80
+    ST         $f29,   $sp,   88
+    ST         $f30,   $sp,   96
+    ST         $f31,   $sp,   104
+    ST         ALPHA_R,$sp,   112    // spill alpha so it can be broadcast from memory
+    ST         ALPHA_I,$sp,   120
+
+    vldrepl.w  VALPHAR, $sp, 112     // VALPHAR = alphar in every 32-bit lane
+    vldrepl.w  VALPHAI, $sp, 120     // VALPHAI = alphai in every 32-bit lane
+
+#if defined (TRMMKERNEL) && !defined(LEFT)
+    sub.d      OFF,    $r0,   OFFSET   // OFF = -offset
+#else
+    xor        OFF,    OFF,   OFF      // OFF = 0
+#endif
+
+    slli.d     LDC,    LDC,   2        // LDC *= 4; the loops shift it again to step through C
+
+    move       J,      $r0             // j = 0
+    srai.d     T0,     N,     1        // T0 = bn / 2
+    beq        J,      T0,    .L19     // no column pair: go straight to the bn&1 pass
+
+.L10:  /* for(j=0; j<bn/2; j+=1): set up pointers for two columns of C */
+    move       C0,     C
+    slli.d     TL,     LDC,   1        // TL = scaled LDC * 2, the byte distance from C0 to C1
+    add.d      C1,     C0,    TL
+    move       A0,     A    //ptrba
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET          // restart OFF from offset for every column pair
+#endif
+
+    move       I,      $r0             // i = 0
+    srai.d     T0,     M,     1        // T0 = bm / 2
+    beq        I,      T0,    .L150    // no row pair: go to the bm&1 tile
+
+.L11:  /* for(i=0; i<bm/2; i+=1): one 2x2 tile of C per pass */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B     //ptrbb (repeats the move above)
+#else
+    slli.d     C3,     OFF,   0x04   // C3 = OFF * 16 bytes, i.e. OFF k-steps of this tile
+    add.d      A0,     A0,    C3     // skip that many k-steps of A
+    add.d      B0,     B,     C3     // and of B
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF   //temp = bk - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2     // temp = OFF + 2
+#else
+    addi.d     TL,     OFF,   2     // temp = OFF + 2 (same value as the LEFT case)
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    U0,     U0,   U0   // clear both accumulators
+    vxor.v    U1,     U1,   U1
+
+    move       L,      $r0   //cycle param k
+    srai.d     C2,     TL,     2   // C2 = TL / 4, trip count of the 4x unrolled loop
+    beq        L,      C2,    .L130   // C2 == 0: remainder loop only
+    blt        C2,     L,     .L130   // C2 < 0: likewise
+
+.L12:  /* for(k=0; k<bk/4; k+=1): four k-steps per pass */
+    vld       D0,     A0,    0x00  //a0 a1 a2 a3
+    vld       D1,     B0,    0x00  //b0 b1 b2 b3
+
+    vshuf4i.w D4,     D1,    0xa0    //b0 b0 b2 b2
+    vshuf4i.w D5,     D1,    0xf5    //b1 b1 b3 b3
+
+    vshuf4i.w  D2,     D0,    0x88  //a0 a2 a0 a2
+    vshuf4i.w  D3,     D0,    0xdd  //a1 a3 a1 a3
+
+    VMADD1      U0,     D2,    D4,     U0  //res0 2 4 6
+    VMADD2      U1,     D3,    D4,     U1  //res1 3 5 7
+    VMADD3      U0,     D3,    D5,     U0
+    VMADD4      U1,     D2,    D5,     U1
+
+    vld       D0,     A0,    0x10  //a0 a1 a2 a3 of the 2nd k-step
+    vld       D1,     B0,    0x10  //b0 b1 b2 b3 of the 2nd k-step
+
+    vshuf4i.w D4,     D1,    0xa0    //b0 b0 b2 b2
+    vshuf4i.w D5,     D1,    0xf5    //b1 b1 b3 b3
+
+    vshuf4i.w  D2,     D0,    0x88  //a0 a2 a0 a2
+    vshuf4i.w  D3,     D0,    0xdd  //a1 a3 a1 a3
+
+    VMADD1      U0,     D2,    D4,     U0  //res0 2 4 6
+    VMADD2      U1,     D3,    D4,     U1  //res1 3 5 7
+    VMADD3      U0,     D3,    D5,     U0
+    VMADD4      U1,     D2,    D5,     U1
+
+    vld       D0,     A0,    0x20  //a0 a1 a2 a3 of the 3rd k-step
+    vld       D1,     B0,    0x20  //b0 b1 b2 b3 of the 3rd k-step
+
+    vshuf4i.w D4,     D1,    0xa0    //b0 b0 b2 b2
+    vshuf4i.w D5,     D1,    0xf5    //b1 b1 b3 b3
+
+    vshuf4i.w  D2,     D0,    0x88  //a0 a2 a0 a2
+    vshuf4i.w  D3,     D0,    0xdd  //a1 a3 a1 a3
+
+    VMADD1      U0,     D2,    D4,     U0  //res0 2 4 6
+    VMADD2      U1,     D3,    D4,     U1  //res1 3 5 7
+    VMADD3      U0,     D3,    D5,     U0
+    VMADD4      U1,     D2,    D5,     U1
+
+    vld       D0,     A0,    0x30  //a0 a1 a2 a3 of the 4th k-step
+    vld       D1,     B0,    0x30  //b0 b1 b2 b3 of the 4th k-step
+
+    vshuf4i.w D4,     D1,    0xa0    //b0 b0 b2 b2
+    vshuf4i.w D5,     D1,    0xf5    //b1 b1 b3 b3
+
+    vshuf4i.w  D2,     D0,    0x88  //a0 a2 a0 a2
+    vshuf4i.w  D3,     D0,    0xdd  //a1 a3 a1 a3
+
+    VMADD1      U0,     D2,    D4,     U0  //res0 2 4 6
+    VMADD2      U1,     D3,    D4,     U1  //res1 3 5 7
+    VMADD3      U0,     D3,    D5,     U0
+    VMADD4      U1,     D2,    D5,     U1
+
+    addi.d     A0,     A0,    0x40   // four k-steps of 16 bytes each
+    addi.d     B0,     B0,    0x40
+
+    addi.d     L,      L,     1
+    blt        L,      C2,    .L12
+
+.L130:  /* set up the remainder loop */
+    move       L,      $r0
+    andi       C2,     TL,     3     // C2 = TL & 3, the leftover k-steps
+    beq        L,      C2,    .L14
+
+.L13:  /* for(k=0; k<(bk&3); k+=1) */
+    vld       D0,     A0,    0x00  //a0 a1 a2 a3
+    vld       D1,     B0,    0x00  //b0 b1 b2 b3
+
+    vshuf4i.w D4,     D1,    0xa0    //b0 b0 b2 b2
+    vshuf4i.w D5,     D1,    0xf5    //b1 b1 b3 b3
+
+    vshuf4i.w  D2,     D0,    0x88  //a0 a2 a0 a2
+    vshuf4i.w  D3,     D0,    0xdd  //a1 a3 a1 a3
+
+    VMADD1      U0,     D2,    D4,     U0  //res0 2 4 6
+    VMADD2      U1,     D3,    D4,     U1  //res1 3 5 7
+    VMADD3      U0,     D3,    D5,     U0
+    VMADD4      U1,     D2,    D5,     U1
+
+    addi.d     A0,     A0,    0x10
+    addi.d     B0,     B0,    0x10
+
+    addi.d     L,      L,     1
+    blt        L,      C2,    .L13
+
+.L14:  /* combine the accumulators with alpha and store the tile through C0/C1 */
+#if defined(TRMMKERNEL)
+    vld       U8,     C0,    0x00  //0 1 2 3
+    vld       U9,     C1,    0x00  //4 5 6 7
+
+    vpackev.w U10,    U9,    U8    //0 4 2 6
+    vpermi.w  U10,    U10,   0xd8  //0 2 4 6
+
+    vpackod.w U11,    U9,    U8    //1 5 3 7
+    vpermi.w  U11,    U11,   0xd8  //1 3 5 7
+
+    vfmul.s      U10,    U0,    VALPHAR   // overwrites U10: the C values loaded above are not added in
+    vfmul.s      U11,    U1,    VALPHAR   // overwrites U11 likewise
+    VNMSUB     U10,    U1,    VALPHAI, U10
+    VFMADD      U11,    U0,    VALPHAI, U11
+
+    vilvl.w   U8,     U11,   U10  //0 1 2 3
+
+    vilvh.w   U9,     U11,   U10  //4 5 6 7
+
+    vst       U8,     C0,    0x00
+    vst       U9,     C1,    0x00
+#else
+    vld       U8,     C0,    0x00  //0 1 2 3
+    vld       U9,     C1,    0x00  //4 5 6 7
+
+    vpackev.w U10,    U9,    U8    //0 4 2 6
+    vpermi.w  U10,    U10,   0xd8  //0 2 4 6
+
+    vpackod.w U11,    U9,    U8    //1 5 3 7
+    vpermi.w  U11,    U11,   0xd8  //1 3 5 7
+
+    VFMADD      U10,    U0,    VALPHAR, U10   // existing C values are the addend here
+    VFMADD      U11,    U1,    VALPHAR, U11
+    VNMSUB     U10,    U1,    VALPHAI, U10
+    VFMADD      U11,    U0,    VALPHAI, U11
+
+    vilvl.w   U8,     U11,   U10  //0 1 2 3
+
+    vilvh.w   U9,     U11,   U10  //4 5 6 7
+
+    vst       U8,     C0,    0x00
+    vst       U9,     C1,    0x00
+#endif
+
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2     // TL = bk - OFF - 2
+#else
+    addi.d     TL,     TL,   -2     // same value as the LEFT case
+#endif
+    slli.d     C3,     TL,   0x04   // C3 = TL * 16 bytes
+    add.d      A0,     A0,   C3     // advance A0 and B0 by TL further k-steps
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x10   // next 16 bytes of both C columns
+    addi.d     C1,     C1,    0x10
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L11
+
+.L150:  /* leftover row of the current column pair */
+    move       I,      $r0
+    andi       T0,     M,     1      // T0 = bm & 1
+    beq        I,      T0,    .L18
+
+.L15:  /* for(i=0; i<(bm&1); i+=1): scalar tile, 8 bytes of A against 16 bytes of B per k-step */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb (repeats the move above)
+#else
+    slli.d     C3,     OFF,  0x03    // OFF * 8 bytes: OFF k-steps of A
+    add.d      A0,     A0,   C3
+    slli.d     C3,     OFF,  0x04    // OFF * 16 bytes: OFF k-steps of B
+    add.d      B0,     B,    C3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF    // TL = bk - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1      // TL = OFF + 1
+#else
+    addi.d     TL,     OFF,   2      // TL = OFF + 2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0           // clear the four scalar accumulators
+    MTC        c12,    $r0
+    MTC        c21,    $r0
+    MTC        c22,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L17   // TL == 0: nothing to accumulate
+    blt        TL,     L,     .L17   // TL < 0: likewise
+
+.L16:  /* for (k=0; k<bk; k+=1) */
+    LD         a1,     A0,    0x00        //load0
+    LD         b1,     B0,    0x00        //load1
+    MADD1      c11,    a1,    b1,     c11  //res0
+    LD         a2,     A0,    0x04        //load2
+    MADD2      c12,    a2,    b1,     c12  //res1
+    LD         b2,     B0,    0x04        //load3
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+    LD         b3,     B0,    0x08        //load4
+    MADD1      c21,    a1,    b3,     c21  //res2
+    MADD2      c22,    a2,    b3,     c22  //res3
+    LD         b4,     B0,    0x0c        //load5
+    MADD3      c21,    a2,    b4,     c21
+    MADD4      c22,    a1,    b4,     c22
+
+    addi.d     A0,     A0,    0x08
+    addi.d     B0,     B0,    0x10
+
+    addi.d     L,      L,     1
+    blt        L,      TL,     .L16
+
+.L17:  /* combine the accumulators with alpha; c11/c12 go to C0, c21/c22 to C1 */
+#if defined(TRMMKERNEL)
+    MUL        a5,     c11,   ALPHA_R
+    MUL        a6,     c12,   ALPHA_I
+    SUB        a5,     a5,    a6
+    ST         a5,     C0,    0x00       // c11*alphar - c12*alphai
+
+    MUL        a5,     c12,   ALPHA_R
+    MUL        a6,     c11,   ALPHA_I
+    ADD        a6,     a5,    a6
+    ST         a6,     C0,    0x04       // c12*alphar + c11*alphai
+
+    MUL        b5,     c21,   ALPHA_R
+    MUL        b6,     c22,   ALPHA_I
+    SUB        b5,     b5,    b6
+    ST         b5,     C1,    0x00       // c21*alphar - c22*alphai
+
+    MUL        b5,     c22,   ALPHA_R
+    MUL        b6,     c21,   ALPHA_I
+    ADD        b6,     b5,    b6
+    ST         b6,     C1,    0x04       // c22*alphar + c21*alphai
+#else
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+    LD         b5,     C1,    0x00    //C1[0]
+    LD         b6,     C1,    0x04    //C1[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+
+    MADD       b5,     c21,   ALPHA_R, b5
+    MADD       b6,     c22,   ALPHA_R, b6
+    NMSUB      b5,     c22,   ALPHA_I, b5
+    MADD       b6,     c21,   ALPHA_I, b6
+    ST         b5,     C1,    0x00
+    ST         b6,     C1,    0x04
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1      // TL = bk - OFF - 1
+#else
+    addi.d     TL,     TL,   -2      // TL = bk - OFF - 2
+#endif
+    slli.d     C3,     TL,   0x03    // TL k-steps of A (8 bytes each)
+    add.d      A0,     A0,   C3
+    slli.d     C3,     TL,   0x04    // TL k-steps of B (16 bytes each)
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x08
+    addi.d     C1,     C1,    0x08
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L15
+
+.L18:  /* end of one column pair: advance OFF, B and C, then loop on j */
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi.d     OFF,    OFF,   2
+#endif
+
+    slli.d     L,      K,     0x04   // L = bk * 16 bytes
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   0x02   // I = scaled LDC * 4, twice the C0-to-C1 distance
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     1
+    srai.d     T0,     N,     1      // T0 = bn / 2 again; T0 was reused as the i bound
+    blt        J,      T0,    .L10
+
+.L19:  /* leftover column */
+    move       J,      $r0
+    andi       T0,     N,     1      // T0 = bn & 1
+    beq        J,      T0,    .L30   // even bn: nothing left, go to the epilogue
+
+.L20: /* for (j=0; j<(bn&1); j+=1) */
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET
+#endif
+
+    move       C0,     C             // only C0 is used for the single column
+    move       A0,     A    //ptrba
+
+    move       I,      $r0
+    srai.d     T0,     M,     1      // T0 = bm / 2
+    beq        I,      T0,    .L24   // no row pair: go to the bm&1 tile
+
+.L21:  /* for (i=0; i<bm/2; i+=1): scalar tile, 16 bytes of A against 8 bytes of B per k-step */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb (repeats the move above)
+#else
+    slli.d     C3,     OFF,  0x04    // OFF * 16 bytes: OFF k-steps of A
+    add.d      A0,     A0,   C3
+    slli.d     C3,     OFF,  0x03    // OFF * 8 bytes: OFF k-steps of B
+    add.d      B0,     B,    C3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF    // TL = bk - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2      // TL = OFF + 2
+#else
+    addi.d     TL,     OFF,   1      // TL = OFF + 1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0           // clear the four scalar accumulators
+    MTC        c12,    $r0
+    MTC        c21,    $r0
+    MTC        c22,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L23   // TL == 0: nothing to accumulate
+    blt        TL,     L,     .L23   // TL < 0: likewise
+
+.L22:  /* for (k=0; k<bk; k+=1) */
+    LD         a1,     A0,    0x00        //load0
+    LD         b1,     B0,    0x00        //load1
+    MADD1      c11,    a1,    b1,     c11  //res0
+    LD         a2,     A0,    0x04        //load2
+    MADD2      c12,    a2,    b1,     c12  //res1
+    LD         b2,     B0,    0x04        //load3
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+    LD         a3,     A0,    0x08        //load4
+    MADD1      c21,    a3,    b1,     c21  //res2
+    LD         a4,     A0,    0x0c        //load5
+    MADD2      c22,    a4,    b1,     c22  //res3
+    MADD3      c21,    a4,    b2,     c21
+    MADD4      c22,    a3,    b2,     c22
+
+    addi.d     A0,     A0,    0x10
+    addi.d     B0,     B0,    0x08
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L22
+
+.L23:  /* combine the accumulators with alpha; all four results go to C0 */
+#if defined(TRMMKERNEL)
+    MUL        a5,     c11,   ALPHA_R
+    MUL        a6,     c12,   ALPHA_I
+    SUB        a5,     a5,    a6
+    ST         a5,     C0,    0x00       // c11*alphar - c12*alphai
+
+    MUL        a5,     c12,   ALPHA_R
+    MUL        a6,     c11,   ALPHA_I
+    ADD        a6,     a5,    a6
+    ST         a6,     C0,    0x04       // c12*alphar + c11*alphai
+
+    MUL        a7,     c21,   ALPHA_R
+    MUL        a8,     c22,   ALPHA_I
+    SUB        a7,     a7,    a8
+    ST         a7,     C0,    0x08       // c21*alphar - c22*alphai
+
+    MUL        a7,     c22,   ALPHA_R
+    MUL        a8,     c21,   ALPHA_I
+    ADD        a8,     a7,    a8
+    ST         a8,     C0,    0x0c       // c22*alphar + c21*alphai
+#else
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+    LD         a7,     C0,    0x08    //C0[2]
+    LD         a8,     C0,    0x0c    //C0[3]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+    MADD       a7,     c21,   ALPHA_R, a7
+    MADD       a8,     c22,   ALPHA_R, a8
+    NMSUB      a7,     c22,   ALPHA_I, a7
+    MADD       a8,     c21,   ALPHA_I, a8
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+    ST         a7,     C0,    0x08
+    ST         a8,     C0,    0x0c
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2      // TL = bk - OFF - 2
+#else
+    addi.d     TL,     TL,   -1      // TL = bk - OFF - 1
+#endif
+    slli.d     C3,     TL,   0x04    // TL k-steps of A (16 bytes each)
+    add.d      A0,     A0,   C3
+    slli.d     C3,     TL,   0x03    // TL k-steps of B (8 bytes each)
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x10
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L21
+
+.L24:  /* leftover row of the leftover column */
+    move       I,      $r0
+    andi       T1,     M,     1    //bm&1
+    beq        I,      T1,    .L28
+
+.L25:  /* for (i=0; i<(bm&1); i+=1): scalar tile, 8 bytes of A and of B per k-step */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb (repeats the move above)
+#else
+    slli.d     C3,     OFF,   0x03   // OFF * 8 bytes: OFF k-steps of A and of B
+    add.d      A0,     A0,   C3
+    add.d      B0,     B,    C3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF    // TL = bk - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1      // TL = OFF + 1
+#else
+    addi.d     TL,     OFF,   1      // same value as the LEFT case
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0           // clear the two scalar accumulators
+    MTC        c12,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L27   // TL == 0: nothing to accumulate
+    blt        TL,     L,     .L27   // TL < 0: likewise
+
+.L26:  /* for (k=0; k<bk; k+=1) */
+    LD         a1,     A0,    0x00        //load0
+    LD         b1,     B0,    0x00        //load1
+    MADD1      c11,    a1,    b1,     c11  //res0
+    LD         a2,     A0,    0x04        //load2
+    MADD2      c12,    a2,    b1,     c12  //res1
+    LD         b2,     B0,    0x04        //load3
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+
+    addi.d     A0,     A0,    0x08
+    addi.d     B0,     B0,    0x08
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L26
+
+.L27:  /* combine the accumulators with alpha and store the pair through C0 */
+#if defined(TRMMKERNEL)
+    MUL        a5,     c11,   ALPHA_R
+    MUL        a6,     c12,   ALPHA_I
+    SUB        a5,     a5,    a6
+    ST         a5,     C0,    0x00       // c11*alphar - c12*alphai
+
+    MUL        a5,     c12,   ALPHA_R
+    MUL        a6,     c11,   ALPHA_I
+    ADD        a6,     a5,    a6
+    ST         a6,     C0,    0x04       // c12*alphar + c11*alphai
+#else
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1      // TL = bk - OFF - 1
+#else
+    addi.d     TL,     TL,   -1      // same value as the LEFT case
+#endif
+    slli.d     C3,     TL,   0x03    // TL k-steps of 8 bytes for both A and B
+    add.d      A0,     A0,   C3
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x08
+
+    addi.d     I,      I,     1
+    blt        I,      T1,    .L25
+
+.L28:  /* end of the leftover column: advance B and C, then loop on j */
+    slli.d     L,      K,     3      // L = bk * 8 bytes
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   1      // I = scaled LDC * 2, the C0-to-C1 distance of the pair loop
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     1
+    andi       T0,     N,     1      // T0 = bn & 1 again; T0 was reused as the i bound
+    blt        J,      T0,    .L20
+
+.L30:  /* restore the spilled registers, release the frame and return */
+    LDARG      $r23,   $sp,   0      // reload $r23-$r27 from offsets 0..32
+    LDARG      $r24,   $sp,   8
+    LDARG      $r25,   $sp,   16
+    LDARG      $r26,   $sp,   24
+    LDARG      $r27,   $sp,   32
+    LD         $f23,   $sp,   40     // reload $f23-$f31 from offsets 40..104
+    LD         $f24,   $sp,   48     // NOTE(review): LD is defined outside this file; confirm its
+    LD         $f25,   $sp,   56     // width matches the ST used for the spill at function entry
+    LD         $f26,   $sp,   64
+    LD         $f27,   $sp,   72
+    LD         $f28,   $sp,   80
+    LD         $f29,   $sp,   88
+    LD         $f30,   $sp,   96
+    LD         $f31,   $sp,   104
+
+    addi.d     $sp,    $sp,   128    // release the 128-byte frame
+    jirl       $r0,    $r1,   0x0    // jump to the address in $r1, discarding the link value
+
+    EPILOGUE   // exit macro, defined outside this file
\ No newline at end of file
diff --git a/kernel/loongarch64/cgemm_kernel_8x4_lsx.S b/kernel/loongarch64/cgemm_kernel_8x4_lsx.S
new file mode 100644
index 000000000..1e9fd8524
--- /dev/null
+++ b/kernel/loongarch64/cgemm_kernel_8x4_lsx.S
@@ -0,0 +1,3313 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+
+/* Function parameters */
+#define M      $r4   // param 1: bm, bound of the i loops
+#define N      $r5   // param 2: bn, bound of the j loops
+#define K      $r6   // param 3: bk
+#define ALPHA_R $f0   // param 4: alphar
+#define ALPHA_I $f1   // param 5: alphai
+#define A      $r7   // param 6: ba, base pointer reloaded into A0
+#define B      $r8  // param 7: bb, base pointer reloaded into B0
+#define C      $r9  // param 8: bc, base pointer reloaded into C0
+#define LDC    $r10  // param 9: ldc, multiplied by 4 in the prologue
+
+#if defined (TRMMKERNEL)
+#define OFFSET $r11  // param 10: offset, only named in TRMMKERNEL builds
+#endif
+#define OFF    $r26  // running offset of the TRMM paths; NOTE(review): same register as T2 below
+
+#define I      $r12  // i loop counter
+#define J      $r13  // j loop counter
+#define L      $r14  // k loop counter
+#define TL     $r15  // k trip count; also scratch for the C column stride
+#define A0     $r16  // moving pointer into A
+#define B0     $r17  // moving pointer into B
+#define C0     $r18  // C0-C3: four C pointers, each (scaled LDC << 1) bytes after the previous
+#define C1     $r19
+#define C2     $r20
+#define C3     $r23
+#define T0     $r24  // bound of the current i or j loop
+#define T1     $r25
+#define T2     $r26  // NOTE(review): aliases OFF ($r26) - confirm the two are never live together
+#define T3     $r27  // byte-offset scratch of the TRMM pointer setup
+
+#define a1     $f2
+#define a2     $f3
+#define a3     $f4
+#define a4     $f5
+#define a5     $f6
+#define a6     $f7
+#define a7     $f8
+#define a8     $f9
+#define b1     $f10
+#define b2     $f11
+#define b3     $f12
+#define b4     $f13
+#define b5     $f14
+#define b6     $f15
+#define b7     $f16
+#define b8     $f17
+#define c11    $f18
+#define c12    $f19
+#define c21    $f20
+#define c22    $f21
+#define c31    $f22
+#define c32    $f23
+#define c41    $f24
+#define c42    $f25
+
+/* LSX vectors */
+#define U0     $vr30  // U0-U15: accumulators, all cleared before the k loop of the 8x4 tile
+#define U1     $vr31
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define U8     $vr8
+#define U9     $vr9
+#define U10    $vr10
+#define U11    $vr11
+#define U12    $vr12
+#define U13    $vr13
+#define U14    $vr14
+#define U15    $vr15
+#define D0     $vr16  // D0: A load; D2/D3: B loads; D4-D7: their shuffles
+#define D1     $vr17
+#define D2     $vr18
+#define D3     $vr19
+#define D4     $vr20
+#define D5     $vr21
+#define D6     $vr22
+#define D7     $vr23
+#define D8     $vr24
+#define D9     $vr25
+#define D10    $vr26
+#define D11    $vr27
+#define D12    $vr28  // NOTE(review): same register as VALPHAR - confirm D12 is never written while alpha is needed
+#define D13    $vr29  // NOTE(review): same register as VALPHAI - confirm likewise
+#define VALPHAR $vr28  // alphar replicated into every 32-bit lane
+#define VALPHAI $vr29  // alphai replicated into every 32-bit lane
+
+
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)  // NMSUB form in slot 3 only
+#define    VMADD1       VFMADD  // slot 1: A real lanes times B real lanes, into the "r" accumulator
+#define    VMADD2       VFMADD  // slot 2: A imag lanes times B real lanes, into the "i" accumulator
+#define    VMADD3       VNMSUB  // slot 3: A imag lanes times B imag lanes, into the "r" accumulator
+#define    VMADD4       VFMADD  // slot 4: A real lanes times B imag lanes, into the "i" accumulator
+// NOTE(review): VFMADD/VNMSUB/MADD/NMSUB are defined outside this file; NMSUB presumably yields acc - product - confirm
+#define    MADD1       MADD   // MADD1-MADD4: scalar counterparts of the four vector slots
+#define    MADD2       MADD
+#define    MADD3       NMSUB
+#define    MADD4       MADD
+#endif
+
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)  // NMSUB form in slot 4 only
+#define    VMADD1       VFMADD
+#define    VMADD2       VFMADD
+#define    VMADD3       VFMADD
+#define    VMADD4       VNMSUB
+
+#define    MADD1       MADD
+#define    MADD2       MADD
+#define    MADD3       MADD
+#define    MADD4       NMSUB
+#endif
+
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)  // NMSUB form in slot 2 only
+#define    VMADD1       VFMADD
+#define    VMADD2       VNMSUB
+#define    VMADD3       VFMADD
+#define    VMADD4       VFMADD
+
+#define    MADD1       MADD
+#define    MADD2       NMSUB
+#define    MADD3       MADD
+#define    MADD4       MADD
+#endif
+
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)  // NMSUB form in slots 2, 3 and 4
+#define    VMADD1       VFMADD
+#define    VMADD2       VNMSUB
+#define    VMADD3       VNMSUB
+#define    VMADD4       VNMSUB
+
+#define    MADD1       MADD
+#define    MADD2       NMSUB
+#define    MADD3       NMSUB
+#define    MADD4       NMSUB
+#endif
+
+    PROLOGUE
+
+    addi.d     $sp,    $sp,   -128
+    SDARG      $r23,   $sp,   0
+    SDARG      $r24,   $sp,   8
+    SDARG      $r25,   $sp,   16
+    SDARG      $r26,   $sp,   24
+    SDARG      $r27,   $sp,   32
+    ST         $f23,   $sp,   40
+    ST         $f24,   $sp,   48
+    ST         $f25,   $sp,   56
+    ST         $f26,   $sp,   64
+    ST         $f27,   $sp,   72
+    ST         $f28,   $sp,   80
+    ST         $f29,   $sp,   88
+    ST         $f30,   $sp,   96
+    ST         $f31,   $sp,   104
+    ST         ALPHA_R,$sp,   112
+    ST         ALPHA_I,$sp,   120
+
+    vldrepl.w  VALPHAR, $sp, 112
+    vldrepl.w  VALPHAI, $sp, 120
+
+#if defined (TRMMKERNEL) && !defined(LEFT)
+    sub.d      OFF,    $r0,   OFFSET
+#else
+    xor        OFF,    OFF,   OFF
+#endif
+
+    slli.d     LDC,    LDC,   2
+
+    move       J,      $r0
+    srai.d     T0,     N,     2  //bn/4
+    beq        J,      T0,    .L19
+
+.L10:  /* for(j=0; j<bn/4; j+=1) */
+    move       C0,     C
+    slli.d     TL,     LDC,   1
+    add.d      C1,     C0,    TL
+    add.d      C2,     C1,    TL
+    add.d      C3,     C2,    TL
+    move       A0,     A    //ptrba
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET
+#endif
+
+    move       I,      $r0
+    srai.d     T0,     M,     3  //bm/8
+    beq        I,      T0,    .L150
+
+.L11:  /* for(i=0; i<bm/8; i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B     //ptrbb
+#else
+    slli.d     T3,     OFF,   0x06
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x05
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF   //temp
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   8
+#else
+    addi.d     TL,     OFF,   4
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    U0,     U0,   U0
+    vxor.v    U1,     U1,   U1
+    vxor.v    U2,     U2,   U2
+    vxor.v    U3,     U3,   U3
+    vxor.v    U4,     U4,   U4
+    vxor.v    U5,     U5,   U5
+    vxor.v    U6,     U6,   U6
+    vxor.v    U7,     U7,   U7
+    vxor.v    U8,     U8,   U8
+    vxor.v    U9,     U9,   U9
+    vxor.v    U10,    U10,  U10
+    vxor.v    U11,    U11,  U11
+    vxor.v    U12,    U12,  U12
+    vxor.v    U13,    U13,  U13
+    vxor.v    U14,    U14,  U14
+    vxor.v    U15,    U15,  U15
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L13
+    blt        TL,     L,     .L13
+
+.L12:  /* for(k=0; k<temp; k+=1) -- one k step: 4 vectors of A (a0..a7) times 2 vectors of B (b0..b3) */
+    vld       D0,     A0,    0x00  // a0ri a1ri
+    vld       D2,     B0,    0x00  // b0ri b1ri
+    vld       D3,     B0,    0x10  // b2ri b3ri
+
+    vshuf4i.w  D4,     D0,    0x00  //a0r
+    vshuf4i.w  D5,     D0,    0x55  //a0i
+
+    vpackev.w  D6,     D3,    D2
+    vshuf4i.w  D6,     D6,    0xd8  //b0r b1r b2r b3r
+
+    vpackod.w  D7,     D3,    D2
+    vshuf4i.w  D7,     D7,    0xd8  //b0i b1i b2i b3i
+
+    VMADD1    U0,     D4,    D6,     U0  //00r 10r 20r 30r
+    VMADD2    U1,     D5,    D6,     U1  //00i 10i 20i 30i
+    VMADD3    U0,     D5,    D7,     U0
+    VMADD4    U1,     D4,    D7,     U1
+
+    vshuf4i.w  D4,     D0,    0xaa  //a1r
+    vshuf4i.w  D5,     D0,    0xff  //a1i
+
+    VMADD1    U2,     D4,    D6,     U2  //01r 11r 21r 31r
+    VMADD2    U3,     D5,    D6,     U3  //01i 11i 21i 31i
+    VMADD3    U2,     D5,    D7,     U2
+    VMADD4    U3,     D4,    D7,     U3
+
+    vld       D0,     A0,    0x10  // a2ri a3ri
+
+    vshuf4i.w  D4,     D0,    0x00  //a2r
+    vshuf4i.w  D5,     D0,    0x55  //a2i
+
+    VMADD1    U4,     D4,    D6,     U4  //02r 12r 22r 32r
+    VMADD2    U5,     D5,    D6,     U5  //02i 12i 22i 32i
+    VMADD3    U4,     D5,    D7,     U4
+    VMADD4    U5,     D4,    D7,     U5
+
+    vshuf4i.w  D4,     D0,    0xaa  //a3r
+    vshuf4i.w  D5,     D0,    0xff  //a3i
+
+    VMADD1    U6,     D4,    D6,     U6  //03r 13r 23r 33r
+    VMADD2    U7,     D5,    D6,     U7  //03i 13i 23i 33i
+    VMADD3    U6,     D5,    D7,     U6
+    VMADD4    U7,     D4,    D7,     U7
+
+    vld       D0,     A0,    0x20  // a4ri a5ri
+
+    vshuf4i.w  D4,     D0,    0x00  //a4r
+    vshuf4i.w  D5,     D0,    0x55  //a4i
+
+    VMADD1    U8,     D4,    D6,     U8  //04r 14r 24r 34r
+    VMADD2    U9,     D5,    D6,     U9  //04i 14i 24i 34i
+    VMADD3    U8,     D5,    D7,     U8
+    VMADD4    U9,     D4,    D7,     U9
+
+    vshuf4i.w  D4,     D0,    0xaa  //a5r
+    vshuf4i.w  D5,     D0,    0xff  //a5i
+
+    VMADD1    U10,     D4,    D6,     U10  //05r 15r 25r 35r
+    VMADD2    U11,     D5,    D6,     U11  //05i 15i 25i 35i
+    VMADD3    U10,     D5,    D7,     U10
+    VMADD4    U11,     D4,    D7,     U11
+
+    vld       D0,     A0,    0x30  // a6ri a7ri
+
+    vshuf4i.w  D4,     D0,    0x00  //a6r
+    vshuf4i.w  D5,     D0,    0x55  //a6i
+
+    VMADD1    U12,     D4,    D6,     U12  //06r 16r 26r 36r
+    VMADD2    U13,     D5,    D6,     U13  //06i 16i 26i 36i
+    VMADD3    U12,     D5,    D7,     U12
+    VMADD4    U13,     D4,    D7,     U13
+
+    vshuf4i.w  D4,     D0,    0xaa  //a7r
+    vshuf4i.w  D5,     D0,    0xff  //a7i
+
+    VMADD1    U14,     D4,    D6,     U14  //07r 17r 27r 37r
+    VMADD2    U15,     D5,    D6,     U15  //07i 17i 27i 37i
+    VMADD3    U14,     D5,    D7,     U14
+    VMADD4    U15,     D4,    D7,     U15
+
+    addi.d     A0,     A0,    0x40  // past the four 16-byte A vectors loaded above
+    addi.d     B0,     B0,    0x20  // past the two 16-byte B vectors loaded above
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L12
+
+.L13:  /* write back the tile accumulated in U0..U15 to C0..C3 */
+#if defined(TRMMKERNEL)
+    //res00 res10 res20 res30 (from U0/U1). NOTE(review): in this TRMM path the C values loaded and gathered below are overwritten by vfmul.s before use
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+    vld       D2,     C2,    0x00 //c2: 0 1 2 3
+    vld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0x44 //c0:0 1, c1:0 1
+    vshuf4i.w  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0x44 //c2:0 1, c3:0 1
+    vshuf4i.w  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    vpermi.w  D8,     D6,    0x44 //c0[0] c1[0] c2[0] c3[0]
+    vpermi.w  D9,     D7,    0x44 //c0[1] c1[1] c2[1] c3[1]
+
+    vfmul.s      D8,    U0,    VALPHAR // D8 = U0 * VALPHAR (previous D8 contents discarded)
+    vfmul.s      D9,    U1,    VALPHAR // D9 = U1 * VALPHAR (previous D9 contents discarded)
+    VNMSUB      D8,    U1,    VALPHAI, D8 // presumably D8 -= U1 * VALPHAI (macro defined elsewhere -- confirm)
+    VFMADD      D9,    U0,    VALPHAI, D9 // presumably D9 += U0 * VALPHAI (macro defined elsewhere -- confirm)
+
+    vand.v    D10,     D9,    D9 //c0[1] c1[1] c2[1] c3[1]
+    vand.v    D11,     D9,    D9 //c0[1] c1[1] c2[1] c3[1] (second copy of D9)
+
+    vpermi.w  D10,     D8,    0x44 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    vpermi.w  D11,     D8,    0xee //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res01 res11 res21 res31 (from U2/U3)
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0xee //c0:2 3, c1:2 3
+    vshuf4i.w  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0xee //c2:2 3, c3:2 3
+    vshuf4i.w  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    vpermi.w  D8,     D6,    0x44 //c0[2] c1[2] c2[2] c3[2]
+    vpermi.w  D9,     D7,    0x44 //c0[3] c1[3] c2[3] c3[3]
+
+    vfmul.s      D8,    U2,    VALPHAR // D8 = U2 * VALPHAR (previous D8 contents discarded)
+    vfmul.s      D9,    U3,    VALPHAR // D9 = U3 * VALPHAR (previous D9 contents discarded)
+    VNMSUB      D8,    U3,    VALPHAI, D8 // presumably D8 -= U3 * VALPHAI -- confirm
+    VFMADD      D9,    U2,    VALPHAI, D9 // presumably D9 += U2 * VALPHAI -- confirm
+
+    vand.v    D4,     D9,    D9 //c0[3] c1[3] c2[3] c3[3]
+    vpermi.w  D4,     D8,    0x44 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+    vand.v    D2,     D4,    D4 // D2 = copy of D4
+
+    vand.v    D5,     D9,    D9 //c0[3] c1[3] c2[3] c3[3]
+    vpermi.w  D5,     D8,    0xee //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+    vand.v    D3,     D5,    D5 // D3 = copy of D5
+
+    vand.v    D0,     D10,    D10 //c0[0] c0[1] c1[0] c1[1]
+    vand.v    D1,     D11,    D11 //c2[0] c2[1] c3[0] c3[1]
+
+    vpermi.w  D4,     D0,     0x44 //c0: 0 1 2 3
+    vpermi.w  D2,     D0,     0xee //c1: 0 1 2 3
+    vpermi.w  D5,     D1,     0x44 //c2: 0 1 2 3
+    vpermi.w  D3,     D1,     0xee //c3: 0 1 2 3
+
+    vst       D4,     C0,    0x00
+    vst       D2,     C1,    0x00
+    vst       D5,     C2,    0x00
+    vst       D3,     C3,    0x00
+
+    addi.d     C0,     C0,    0x10 // step each C pointer past the 16 bytes just stored
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+
+    //res02 res12 res22 res32 (from U4/U5). NOTE(review): in this TRMM path the C values loaded and gathered below are overwritten by vfmul.s before use
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+    vld       D2,     C2,    0x00 //c2: 0 1 2 3
+    vld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0x44 //c0:0 1, c1:0 1
+    vshuf4i.w  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0x44 //c2:0 1, c3:0 1
+    vshuf4i.w  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    vpermi.w  D8,     D6,    0x44 //c0[0] c1[0] c2[0] c3[0]
+    vpermi.w  D9,     D7,    0x44 //c0[1] c1[1] c2[1] c3[1]
+
+    vfmul.s      D8,    U4,    VALPHAR // D8 = U4 * VALPHAR (previous D8 contents discarded)
+    vfmul.s      D9,    U5,    VALPHAR // D9 = U5 * VALPHAR (previous D9 contents discarded)
+    VNMSUB      D8,    U5,    VALPHAI, D8 // presumably D8 -= U5 * VALPHAI (macro defined elsewhere -- confirm)
+    VFMADD      D9,    U4,    VALPHAI, D9 // presumably D9 += U4 * VALPHAI (macro defined elsewhere -- confirm)
+
+    vand.v    D10,     D9,    D9 //c0[1] c1[1] c2[1] c3[1]
+    vand.v    D11,     D9,    D9 //c0[1] c1[1] c2[1] c3[1] (second copy of D9)
+
+    vpermi.w  D10,     D8,    0x44 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    vpermi.w  D11,     D8,    0xee //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res03 res13 res23 res33 (from U6/U7)
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0xee //c0:2 3, c1:2 3
+    vshuf4i.w  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0xee //c2:2 3, c3:2 3
+    vshuf4i.w  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    vpermi.w  D8,     D6,    0x44 //c0[2] c1[2] c2[2] c3[2]
+    vpermi.w  D9,     D7,    0x44 //c0[3] c1[3] c2[3] c3[3]
+
+    vfmul.s      D8,    U6,    VALPHAR // D8 = U6 * VALPHAR (previous D8 contents discarded)
+    vfmul.s      D9,    U7,    VALPHAR // D9 = U7 * VALPHAR (previous D9 contents discarded)
+    VNMSUB      D8,    U7,    VALPHAI, D8 // presumably D8 -= U7 * VALPHAI -- confirm
+    VFMADD      D9,    U6,    VALPHAI, D9 // presumably D9 += U6 * VALPHAI -- confirm
+
+    vand.v    D4,     D9,    D9 //c0[3] c1[3] c2[3] c3[3]
+    vpermi.w  D4,     D8,    0x44 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+    vand.v    D2,     D4,    D4 // D2 = copy of D4
+
+    vand.v    D5,     D9,    D9 //c0[3] c1[3] c2[3] c3[3]
+    vpermi.w  D5,     D8,    0xee //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+    vand.v    D3,     D5,    D5 // D3 = copy of D5
+
+    vand.v    D0,     D10,    D10 //c0[0] c0[1] c1[0] c1[1]
+    vand.v    D1,     D11,    D11 //c2[0] c2[1] c3[0] c3[1]
+
+    vpermi.w  D4,     D0,     0x44 //c0: 0 1 2 3
+    vpermi.w  D2,     D0,     0xee //c1: 0 1 2 3
+    vpermi.w  D5,     D1,     0x44 //c2: 0 1 2 3
+    vpermi.w  D3,     D1,     0xee //c3: 0 1 2 3
+
+    vst       D4,     C0,    0x00
+    vst       D2,     C1,    0x00
+    vst       D5,     C2,    0x00
+    vst       D3,     C3,    0x00
+
+    addi.d     C0,     C0,    0x10 // step each C pointer past the 16 bytes just stored
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+
+    //res04 res14 res24 res34 (from U8/U9). NOTE(review): in this TRMM path the C values loaded and gathered below are overwritten by vfmul.s before use
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+    vld       D2,     C2,    0x00 //c2: 0 1 2 3
+    vld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0x44 //c0:0 1, c1:0 1
+    vshuf4i.w  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0x44 //c2:0 1, c3:0 1
+    vshuf4i.w  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    vpermi.w  D8,     D6,    0x44 //c0[0] c1[0] c2[0] c3[0]
+    vpermi.w  D9,     D7,    0x44 //c0[1] c1[1] c2[1] c3[1]
+
+    vfmul.s      D8,    U8,    VALPHAR // D8 = U8 * VALPHAR (previous D8 contents discarded)
+    vfmul.s      D9,    U9,    VALPHAR // D9 = U9 * VALPHAR (previous D9 contents discarded)
+    VNMSUB      D8,    U9,    VALPHAI, D8 // presumably D8 -= U9 * VALPHAI (macro defined elsewhere -- confirm)
+    VFMADD      D9,    U8,    VALPHAI, D9 // presumably D9 += U8 * VALPHAI (macro defined elsewhere -- confirm)
+
+    vand.v    D10,     D9,    D9 //c0[1] c1[1] c2[1] c3[1]
+    vand.v    D11,     D9,    D9 //c0[1] c1[1] c2[1] c3[1] (second copy of D9)
+
+    vpermi.w  D10,     D8,    0x44 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    vpermi.w  D11,     D8,    0xee //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res05 res15 res25 res35 (from U10/U11)
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0xee //c0:2 3, c1:2 3
+    vshuf4i.w  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0xee //c2:2 3, c3:2 3
+    vshuf4i.w  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    vpermi.w  D8,     D6,    0x44 //c0[2] c1[2] c2[2] c3[2]
+    vpermi.w  D9,     D7,    0x44 //c0[3] c1[3] c2[3] c3[3]
+
+    vfmul.s      D8,    U10,    VALPHAR // D8 = U10 * VALPHAR (previous D8 contents discarded)
+    vfmul.s      D9,    U11,    VALPHAR // D9 = U11 * VALPHAR (previous D9 contents discarded)
+    VNMSUB      D8,    U11,    VALPHAI, D8 // presumably D8 -= U11 * VALPHAI -- confirm
+    VFMADD      D9,    U10,    VALPHAI, D9 // presumably D9 += U10 * VALPHAI -- confirm
+
+    vand.v    D4,     D9,    D9 //c0[3] c1[3] c2[3] c3[3]
+    vpermi.w  D4,     D8,    0x44 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+    vand.v    D2,     D4,    D4 // D2 = copy of D4
+
+    vand.v    D5,     D9,    D9 //c0[3] c1[3] c2[3] c3[3]
+    vpermi.w  D5,     D8,    0xee //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+    vand.v    D3,     D5,    D5 // D3 = copy of D5
+
+    vand.v    D0,     D10,    D10 //c0[0] c0[1] c1[0] c1[1]
+    vand.v    D1,     D11,    D11 //c2[0] c2[1] c3[0] c3[1]
+
+    vpermi.w  D4,     D0,     0x44 //c0: 0 1 2 3
+    vpermi.w  D2,     D0,     0xee //c1: 0 1 2 3
+    vpermi.w  D5,     D1,     0x44 //c2: 0 1 2 3
+    vpermi.w  D3,     D1,     0xee //c3: 0 1 2 3
+
+    vst       D4,     C0,    0x00
+    vst       D2,     C1,    0x00
+    vst       D5,     C2,    0x00
+    vst       D3,     C3,    0x00
+
+    addi.d     C0,     C0,    0x10 // step each C pointer past the 16 bytes just stored
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+
+    //res06 res16 res26 res36 (from U12/U13). NOTE(review): in this TRMM path the C values loaded and gathered below are overwritten by vfmul.s before use
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+    vld       D2,     C2,    0x00 //c2: 0 1 2 3
+    vld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0x44 //c0:0 1, c1:0 1
+    vshuf4i.w  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0x44 //c2:0 1, c3:0 1
+    vshuf4i.w  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    vpermi.w  D8,     D6,    0x44 //c0[0] c1[0] c2[0] c3[0]
+    vpermi.w  D9,     D7,    0x44 //c0[1] c1[1] c2[1] c3[1]
+
+    vfmul.s      D8,    U12,    VALPHAR // D8 = U12 * VALPHAR (previous D8 contents discarded)
+    vfmul.s      D9,    U13,    VALPHAR // D9 = U13 * VALPHAR (previous D9 contents discarded)
+    VNMSUB      D8,    U13,    VALPHAI, D8 // presumably D8 -= U13 * VALPHAI (macro defined elsewhere -- confirm)
+    VFMADD      D9,    U12,    VALPHAI, D9 // presumably D9 += U12 * VALPHAI (macro defined elsewhere -- confirm)
+
+    vand.v    D10,     D9,    D9 //c0[1] c1[1] c2[1] c3[1]
+    vand.v    D11,     D9,    D9 //c0[1] c1[1] c2[1] c3[1] (second copy of D9)
+
+    vpermi.w  D10,     D8,    0x44 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    vpermi.w  D11,     D8,    0xee //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res07 res17 res27 res37 (from U14/U15)
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0xee //c0:2 3, c1:2 3
+    vshuf4i.w  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0xee //c2:2 3, c3:2 3
+    vshuf4i.w  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    vpermi.w  D8,     D6,    0x44 //c0[2] c1[2] c2[2] c3[2]
+    vpermi.w  D9,     D7,    0x44 //c0[3] c1[3] c2[3] c3[3]
+
+    vfmul.s      D8,    U14,    VALPHAR // D8 = U14 * VALPHAR (previous D8 contents discarded)
+    vfmul.s      D9,    U15,    VALPHAR // D9 = U15 * VALPHAR (previous D9 contents discarded)
+    VNMSUB      D8,    U15,    VALPHAI, D8 // presumably D8 -= U15 * VALPHAI -- confirm
+    VFMADD      D9,    U14,    VALPHAI, D9 // presumably D9 += U14 * VALPHAI -- confirm
+
+    vand.v    D4,     D9,    D9 //c0[3] c1[3] c2[3] c3[3]
+    vpermi.w  D4,     D8,    0x44 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+    vand.v    D2,     D4,    D4 // D2 = copy of D4
+
+    vand.v    D5,     D9,    D9 //c0[3] c1[3] c2[3] c3[3]
+    vpermi.w  D5,     D8,    0xee //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+    vand.v    D3,     D5,    D5 // D3 = copy of D5
+
+    vand.v    D0,     D10,    D10 //c0[0] c0[1] c1[0] c1[1]
+    vand.v    D1,     D11,    D11 //c2[0] c2[1] c3[0] c3[1]
+
+    vpermi.w  D4,     D0,     0x44 //c0: 0 1 2 3
+    vpermi.w  D2,     D0,     0xee //c1: 0 1 2 3
+    vpermi.w  D5,     D1,     0x44 //c2: 0 1 2 3
+    vpermi.w  D3,     D1,     0xee //c3: 0 1 2 3
+
+    vst       D4,     C0,    0x00
+    vst       D2,     C1,    0x00
+    vst       D5,     C2,    0x00
+    vst       D3,     C3,    0x00
+
+    addi.d     C0,     C0,    0x10 // step each C pointer past the 16 bytes just stored
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+#else  // !defined(TRMMKERNEL): the loaded C values feed the VFMADD/VNMSUB updates below
+    //res00 res10 res20 res30 (from U0/U1)
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+    vld       D2,     C2,    0x00 //c2: 0 1 2 3
+    vld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0x44 //c0:0 1, c1:0 1
+    vshuf4i.w  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0x44 //c2:0 1, c3:0 1
+    vshuf4i.w  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    vpermi.w  D8,     D6,    0x44 //c0[0] c1[0] c2[0] c3[0]
+    vpermi.w  D9,     D7,    0x44 //c0[1] c1[1] c2[1] c3[1]
+
+    VFMADD      D8,    U0,    VALPHAR, D8 // presumably D8 += U0 * VALPHAR (macro defined elsewhere -- confirm)
+    VFMADD      D9,    U1,    VALPHAR, D9 // presumably D9 += U1 * VALPHAR -- confirm
+    VNMSUB      D8,    U1,    VALPHAI, D8 // presumably D8 -= U1 * VALPHAI -- confirm
+    VFMADD      D9,    U0,    VALPHAI, D9 // presumably D9 += U0 * VALPHAI -- confirm
+
+    vand.v    D10,     D9,    D9 //c0[1] c1[1] c2[1] c3[1]
+    vand.v    D11,     D9,    D9 //c0[1] c1[1] c2[1] c3[1] (second copy of D9)
+
+    vpermi.w  D10,     D8,    0x44 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    vpermi.w  D11,     D8,    0xee //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res01 res11 res21 res31 (from U2/U3)
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0xee //c0:2 3, c1:2 3
+    vshuf4i.w  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0xee //c2:2 3, c3:2 3
+    vshuf4i.w  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    vpermi.w  D8,     D6,    0x44 //c0[2] c1[2] c2[2] c3[2]
+    vpermi.w  D9,     D7,    0x44 //c0[3] c1[3] c2[3] c3[3]
+
+    VFMADD      D8,    U2,    VALPHAR, D8 // presumably D8 += U2 * VALPHAR -- confirm
+    VFMADD      D9,    U3,    VALPHAR, D9 // presumably D9 += U3 * VALPHAR -- confirm
+    VNMSUB      D8,    U3,    VALPHAI, D8 // presumably D8 -= U3 * VALPHAI -- confirm
+    VFMADD      D9,    U2,    VALPHAI, D9 // presumably D9 += U2 * VALPHAI -- confirm
+
+    vand.v    D4,     D9,    D9 //c0[3] c1[3] c2[3] c3[3]
+    vpermi.w  D4,     D8,    0x44 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+    vand.v    D2,     D4,    D4 // D2 = copy of D4
+
+    vand.v    D5,     D9,    D9 //c0[3] c1[3] c2[3] c3[3]
+    vpermi.w  D5,     D8,    0xee //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+    vand.v    D3,     D5,    D5 // D3 = copy of D5
+
+    vand.v    D0,     D10,    D10 //c0[0] c0[1] c1[0] c1[1]
+    vand.v    D1,     D11,    D11 //c2[0] c2[1] c3[0] c3[1]
+
+    vpermi.w  D4,     D0,     0x44 //c0: 0 1 2 3
+    vpermi.w  D2,     D0,     0xee //c1: 0 1 2 3
+    vpermi.w  D5,     D1,     0x44 //c2: 0 1 2 3
+    vpermi.w  D3,     D1,     0xee //c3: 0 1 2 3
+
+    vst       D4,     C0,    0x00
+    vst       D2,     C1,    0x00
+    vst       D5,     C2,    0x00
+    vst       D3,     C3,    0x00
+
+    addi.d     C0,     C0,    0x10 // step each C pointer past the 16 bytes just stored
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+
+    //res02 res12 res22 res32 (from U4/U5)
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+    vld       D2,     C2,    0x00 //c2: 0 1 2 3
+    vld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0x44 //c0:0 1, c1:0 1
+    vshuf4i.w  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0x44 //c2:0 1, c3:0 1
+    vshuf4i.w  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    vpermi.w  D8,     D6,    0x44 //c0[0] c1[0] c2[0] c3[0]
+    vpermi.w  D9,     D7,    0x44 //c0[1] c1[1] c2[1] c3[1]
+
+    VFMADD      D8,    U4,    VALPHAR, D8 // presumably D8 += U4 * VALPHAR (macro defined elsewhere -- confirm)
+    VFMADD      D9,    U5,    VALPHAR, D9 // presumably D9 += U5 * VALPHAR -- confirm
+    VNMSUB      D8,    U5,    VALPHAI, D8 // presumably D8 -= U5 * VALPHAI -- confirm
+    VFMADD      D9,    U4,    VALPHAI, D9 // presumably D9 += U4 * VALPHAI -- confirm
+
+    vand.v    D10,     D9,    D9 //c0[1] c1[1] c2[1] c3[1]
+    vand.v    D11,     D9,    D9 //c0[1] c1[1] c2[1] c3[1] (second copy of D9)
+
+    vpermi.w  D10,     D8,    0x44 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    vpermi.w  D11,     D8,    0xee //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res03 res13 res23 res33 (from U6/U7)
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0xee //c0:2 3, c1:2 3
+    vshuf4i.w  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0xee //c2:2 3, c3:2 3
+    vshuf4i.w  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    vpermi.w  D8,     D6,    0x44 //c0[2] c1[2] c2[2] c3[2]
+    vpermi.w  D9,     D7,    0x44 //c0[3] c1[3] c2[3] c3[3]
+
+    VFMADD      D8,    U6,    VALPHAR, D8 // presumably D8 += U6 * VALPHAR -- confirm
+    VFMADD      D9,    U7,    VALPHAR, D9 // presumably D9 += U7 * VALPHAR -- confirm
+    VNMSUB      D8,    U7,    VALPHAI, D8 // presumably D8 -= U7 * VALPHAI -- confirm
+    VFMADD      D9,    U6,    VALPHAI, D9 // presumably D9 += U6 * VALPHAI -- confirm
+
+    vand.v    D4,     D9,    D9 //c0[3] c1[3] c2[3] c3[3]
+    vpermi.w  D4,     D8,    0x44 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+    vand.v    D2,     D4,    D4 // D2 = copy of D4
+
+    vand.v    D5,     D9,    D9 //c0[3] c1[3] c2[3] c3[3]
+    vpermi.w  D5,     D8,    0xee //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+    vand.v    D3,     D5,    D5 // D3 = copy of D5
+
+    vand.v    D0,     D10,    D10 //c0[0] c0[1] c1[0] c1[1]
+    vand.v    D1,     D11,    D11 //c2[0] c2[1] c3[0] c3[1]
+
+    vpermi.w  D4,     D0,     0x44 //c0: 0 1 2 3
+    vpermi.w  D2,     D0,     0xee //c1: 0 1 2 3
+    vpermi.w  D5,     D1,     0x44 //c2: 0 1 2 3
+    vpermi.w  D3,     D1,     0xee //c3: 0 1 2 3
+
+    vst       D4,     C0,    0x00
+    vst       D2,     C1,    0x00
+    vst       D5,     C2,    0x00
+    vst       D3,     C3,    0x00
+
+    addi.d     C0,     C0,    0x10 // step each C pointer past the 16 bytes just stored
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+
+    //res04 res14 res24 res34 (from U8/U9)
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+    vld       D2,     C2,    0x00 //c2: 0 1 2 3
+    vld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0x44 //c0:0 1, c1:0 1
+    vshuf4i.w  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0x44 //c2:0 1, c3:0 1
+    vshuf4i.w  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    vpermi.w  D8,     D6,    0x44 //c0[0] c1[0] c2[0] c3[0]
+    vpermi.w  D9,     D7,    0x44 //c0[1] c1[1] c2[1] c3[1]
+
+    VFMADD      D8,    U8,    VALPHAR, D8 // presumably D8 += U8 * VALPHAR (macro defined elsewhere -- confirm)
+    VFMADD      D9,    U9,    VALPHAR, D9 // presumably D9 += U9 * VALPHAR -- confirm
+    VNMSUB      D8,    U9,    VALPHAI, D8 // presumably D8 -= U9 * VALPHAI -- confirm
+    VFMADD      D9,    U8,    VALPHAI, D9 // presumably D9 += U8 * VALPHAI -- confirm
+
+    vand.v    D10,     D9,    D9 //c0[1] c1[1] c2[1] c3[1]
+    vand.v    D11,     D9,    D9 //c0[1] c1[1] c2[1] c3[1] (second copy of D9)
+
+    vpermi.w  D10,     D8,    0x44 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    vpermi.w  D11,     D8,    0xee //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res05 res15 res25 res35 (from U10/U11)
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0xee //c0:2 3, c1:2 3
+    vshuf4i.w  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0xee //c2:2 3, c3:2 3
+    vshuf4i.w  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    vpermi.w  D8,     D6,    0x44 //c0[2] c1[2] c2[2] c3[2]
+    vpermi.w  D9,     D7,    0x44 //c0[3] c1[3] c2[3] c3[3]
+
+    VFMADD      D8,    U10,    VALPHAR, D8 // presumably D8 += U10 * VALPHAR -- confirm
+    VFMADD      D9,    U11,    VALPHAR, D9 // presumably D9 += U11 * VALPHAR -- confirm
+    VNMSUB      D8,    U11,    VALPHAI, D8 // presumably D8 -= U11 * VALPHAI -- confirm
+    VFMADD      D9,    U10,    VALPHAI, D9 // presumably D9 += U10 * VALPHAI -- confirm
+
+    vand.v    D4,     D9,    D9 //c0[3] c1[3] c2[3] c3[3]
+    vpermi.w  D4,     D8,    0x44 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+    vand.v    D2,     D4,    D4 // D2 = copy of D4
+
+    vand.v    D5,     D9,    D9 //c0[3] c1[3] c2[3] c3[3]
+    vpermi.w  D5,     D8,    0xee //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+    vand.v    D3,     D5,    D5 // D3 = copy of D5
+
+    vand.v    D0,     D10,    D10 //c0[0] c0[1] c1[0] c1[1]
+    vand.v    D1,     D11,    D11 //c2[0] c2[1] c3[0] c3[1]
+
+    vpermi.w  D4,     D0,     0x44 //c0: 0 1 2 3
+    vpermi.w  D2,     D0,     0xee //c1: 0 1 2 3
+    vpermi.w  D5,     D1,     0x44 //c2: 0 1 2 3
+    vpermi.w  D3,     D1,     0xee //c3: 0 1 2 3
+
+    vst       D4,     C0,    0x00
+    vst       D2,     C1,    0x00
+    vst       D5,     C2,    0x00
+    vst       D3,     C3,    0x00
+
+    addi.d     C0,     C0,    0x10 // step each C pointer past the 16 bytes just stored
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+
+    //res06 res16 res26 res36 (from U12/U13)
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+    vld       D2,     C2,    0x00 //c2: 0 1 2 3
+    vld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0x44 //c0:0 1, c1:0 1
+    vshuf4i.w  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0x44 //c2:0 1, c3:0 1
+    vshuf4i.w  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    vpermi.w  D8,     D6,    0x44 //c0[0] c1[0] c2[0] c3[0]
+    vpermi.w  D9,     D7,    0x44 //c0[1] c1[1] c2[1] c3[1]
+
+    VFMADD      D8,    U12,    VALPHAR, D8 // presumably D8 += U12 * VALPHAR (macro defined elsewhere -- confirm)
+    VFMADD      D9,    U13,    VALPHAR, D9 // presumably D9 += U13 * VALPHAR -- confirm
+    VNMSUB      D8,    U13,    VALPHAI, D8 // presumably D8 -= U13 * VALPHAI -- confirm
+    VFMADD      D9,    U12,    VALPHAI, D9 // presumably D9 += U12 * VALPHAI -- confirm
+
+    vand.v    D10,     D9,    D9 //c0[1] c1[1] c2[1] c3[1]
+    vand.v    D11,     D9,    D9 //c0[1] c1[1] c2[1] c3[1] (second copy of D9)
+
+    vpermi.w  D10,     D8,    0x44 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    vpermi.w  D11,     D8,    0xee //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res07 res17 res27 res37 (from U14/U15)
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0xee //c0:2 3, c1:2 3
+    vshuf4i.w  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0xee //c2:2 3, c3:2 3
+    vshuf4i.w  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    vpermi.w  D8,     D6,    0x44 //c0[2] c1[2] c2[2] c3[2]
+    vpermi.w  D9,     D7,    0x44 //c0[3] c1[3] c2[3] c3[3]
+
+    VFMADD      D8,    U14,    VALPHAR, D8 // presumably D8 += U14 * VALPHAR -- confirm
+    VFMADD      D9,    U15,    VALPHAR, D9 // presumably D9 += U15 * VALPHAR -- confirm
+    VNMSUB      D8,    U15,    VALPHAI, D8 // presumably D8 -= U15 * VALPHAI -- confirm
+    VFMADD      D9,    U14,    VALPHAI, D9 // presumably D9 += U14 * VALPHAI -- confirm
+
+    vand.v    D4,     D9,    D9 //c0[3] c1[3] c2[3] c3[3]
+    vpermi.w  D4,     D8,    0x44 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+    vand.v    D2,     D4,    D4 // D2 = copy of D4
+
+    vand.v    D5,     D9,    D9 //c0[3] c1[3] c2[3] c3[3]
+    vpermi.w  D5,     D8,    0xee //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+    vand.v    D3,     D5,    D5 // D3 = copy of D5
+
+    vand.v    D0,     D10,    D10 //c0[0] c0[1] c1[0] c1[1]
+    vand.v    D1,     D11,    D11 //c2[0] c2[1] c3[0] c3[1]
+
+    vpermi.w  D4,     D0,     0x44 //c0: 0 1 2 3
+    vpermi.w  D2,     D0,     0xee //c1: 0 1 2 3
+    vpermi.w  D5,     D1,     0x44 //c2: 0 1 2 3
+    vpermi.w  D3,     D1,     0xee //c3: 0 1 2 3
+
+    vst       D4,     C0,    0x00
+    vst       D2,     C1,    0x00
+    vst       D5,     C2,    0x00
+    vst       D3,     C3,    0x00
+
+    addi.d     C0,     C0,    0x10 // step each C pointer past the 16 bytes just stored
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+#endif  // defined(TRMMKERNEL) write-back variants
+
+#if defined(TRMMKERNEL)  // TRMM only: move A0/B0 past the k steps not covered by the loop, then advance OFF
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF    // TL = K - OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -8     // TL = K - OFF - 8
+#else
+    addi.d     TL,     TL,   -4     // TL = K - OFF - 4
+#endif
+    slli.d     T3,     TL,   0x06   // T3 = TL * 64 (64 = bytes .L12 advances A0 per k step)
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x05   // T3 = TL * 32 (32 = bytes .L12 advances B0 per k step)
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   8     // OFF += 8 for the next iteration of the .L11 loop
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L11  // loop while I < T0 (T0 = M >> 3, set before .L11)
+
+.L150:  /* decide whether a 4-wide remainder (bit 2 of M) has to be processed */
+    move       I,      $r0
+    andi       T0,     M,     4     // T0 = M & 4
+    beq        I,      T0,    .L18  // I is 0 here, so this branches when (M & 4) == 0
+
+.L15:  /* if (bm & 4) -- set up pointers and trip count, clear U0..U7 */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb (same value as already set above)
+#else
+    slli.d     T3,     OFF,  0x05   // T3 = OFF * 32 (32 = bytes .L16 advances A0 per k step)
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x05   // T3 = OFF * 32 (32 = bytes .L16 advances B0 per k step)
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF   // TL = K - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   4     // TL = OFF + 4
+#else
+    addi.d     TL,     OFF,   4     // TL = OFF + 4 (identical to the LEFT arm above)
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    U0,     U0,   U0      // x ^ x = 0: clear the eight accumulators used by .L16
+    vxor.v    U1,     U1,   U1
+    vxor.v    U2,     U2,   U2
+    vxor.v    U3,     U3,   U3
+    vxor.v    U4,     U4,   U4
+    vxor.v    U5,     U5,   U5
+    vxor.v    U6,     U6,   U6
+    vxor.v    U7,     U7,   U7
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L17  // TL == 0: skip the k loop
+    blt        TL,     L,     .L17  // TL < 0: skip the k loop
+
+.L16:  /* for (k=0; k<temp; k++) -- one k step: 2 vectors of A (a0..a3) times 2 vectors of B (b0..b3) */
+    vld       D0,     A0,    0x00  // a0ri a1ri
+    vld       D2,     B0,    0x00  // b0ri b1ri
+    vld       D3,     B0,    0x10  // b2ri b3ri
+
+    vshuf4i.w  D4,     D0,    0x00  //a0r
+    vshuf4i.w  D5,     D0,    0x55  //a0i
+
+    vpackev.w  D6,     D3,    D2
+    vshuf4i.w  D6,     D6,    0xd8  //b0r b1r b2r b3r
+
+    vpackod.w  D7,     D3,    D2
+    vshuf4i.w  D7,     D7,    0xd8  //b0i b1i b2i b3i
+
+    VMADD1    U0,     D4,    D6,     U0  //00r 10r 20r 30r
+    VMADD2    U1,     D5,    D6,     U1  //00i 10i 20i 30i
+    VMADD3    U0,     D5,    D7,     U0
+    VMADD4    U1,     D4,    D7,     U1
+
+    vshuf4i.w  D4,     D0,    0xaa  //a1r
+    vshuf4i.w  D5,     D0,    0xff  //a1i
+
+    VMADD1    U2,     D4,    D6,     U2  //01r 11r 21r 31r
+    VMADD2    U3,     D5,    D6,     U3  //01i 11i 21i 31i
+    VMADD3    U2,     D5,    D7,     U2
+    VMADD4    U3,     D4,    D7,     U3
+
+    vld       D0,     A0,    0x10  // a2ri a3ri
+
+    vshuf4i.w  D4,     D0,    0x00  //a2r
+    vshuf4i.w  D5,     D0,    0x55  //a2i
+
+    VMADD1    U4,     D4,    D6,     U4  //02r 12r 22r 32r
+    VMADD2    U5,     D5,    D6,     U5  //02i 12i 22i 32i
+    VMADD3    U4,     D5,    D7,     U4
+    VMADD4    U5,     D4,    D7,     U5
+
+    vshuf4i.w  D4,     D0,    0xaa  //a3r
+    vshuf4i.w  D5,     D0,    0xff  //a3i
+
+    VMADD1    U6,     D4,    D6,     U6  //03r 13r 23r 33r
+    VMADD2    U7,     D5,    D6,     U7  //03i 13i 23i 33i
+    VMADD3    U6,     D5,    D7,     U6
+    VMADD4    U7,     D4,    D7,     U7
+
+    addi.d     A0,     A0,    0x20  // past the two 16-byte A vectors loaded above
+    addi.d     B0,     B0,    0x20  // past the two 16-byte B vectors loaded above
+
+    addi.d     L,      L,     1
+    blt        L,      TL,     .L16
+
+.L17:
+#if defined(TRMMKERNEL)
+    //res00 res10 res20 res30
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+    vld       D2,     C2,    0x00 //c2: 0 1 2 3
+    vld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0x44 //c0:0 1, c1:0 1
+    vshuf4i.w  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0x44 //c2:0 1, c3:0 1
+    vshuf4i.w  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    vpermi.w  D8,     D6,    0x44 //c0[0] c1[0] c2[0] c3[0]
+    vpermi.w  D9,     D7,    0x44 //c0[1] c1[1] c2[1] c3[1]
+
+    vfmul.s      D8,    U0,    VALPHAR
+    vfmul.s      D9,    U1,    VALPHAR
+    VNMSUB      D8,    U1,    VALPHAI, D8
+    VFMADD      D9,    U0,    VALPHAI, D9
+
+    vand.v    D10,     D9,    D9 //c0[1] c1[1] c2[1] c3[1]
+    vand.v    D11,     D9,    D9 //c0[0] c1[0] c2[0] c3[0]
+
+    vpermi.w  D10,     D8,    0x44 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    vpermi.w  D11,     D8,    0xee //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res01 res11 res21 res31
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0xee //c0:2 3, c1:2 3
+    vshuf4i.w  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0xee //c2:2 3, c3:2 3
+    vshuf4i.w  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    vpermi.w  D8,     D6,    0x44 //c0[2] c1[2] c2[2] c3[2]
+    vpermi.w  D9,     D7,    0x44 //c0[3] c1[3] c2[3] c3[3]
+
+    vfmul.s      D8,    U2,    VALPHAR
+    vfmul.s      D9,    U3,    VALPHAR
+    VNMSUB      D8,    U3,    VALPHAI, D8
+    VFMADD      D9,    U2,    VALPHAI, D9
+
+    vand.v    D4,     D9,    D9
+    vpermi.w  D4,     D8,    0x44 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+    vand.v    D2,     D4,    D4
+
+    vand.v    D5,     D9,    D9
+    vpermi.w  D5,     D8,    0xee //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+    vand.v    D3,     D5,    D5
+
+    vand.v    D0,     D10,    D10 //c0[0] c0[1] c1[0] c1[1]
+    vand.v    D1,     D11,    D11 //c2[0] c2[1] c3[0] c3[1]
+
+    vpermi.w  D4,     D0,     0x44 //c0: 0 1 2 3
+    vpermi.w  D2,     D0,     0xee //c1: 0 1 2 3
+    vpermi.w  D5,     D1,     0x44 //c2: 0 1 2 3
+    vpermi.w  D3,     D1,     0xee //c3: 0 1 2 3
+
+    vst       D4,     C0,    0x00
+    vst       D2,     C1,    0x00
+    vst       D5,     C2,    0x00
+    vst       D3,     C3,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+
+    //res02 res12 res22 res32
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+    vld       D2,     C2,    0x00 //c2: 0 1 2 3
+    vld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0x44 //c0:0 1, c1:0 1
+    vshuf4i.w  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0x44 //c2:0 1, c3:0 1
+    vshuf4i.w  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    vpermi.w  D8,     D6,    0x44 //c0[0] c1[0] c2[0] c3[0]
+    vpermi.w  D9,     D7,    0x44 //c0[1] c1[1] c2[1] c3[1]
+
+    vfmul.s      D8,    U4,    VALPHAR
+    vfmul.s      D9,    U5,    VALPHAR
+    VNMSUB      D8,    U5,    VALPHAI, D8
+    VFMADD      D9,    U4,    VALPHAI, D9
+
+    vand.v    D10,     D9,    D9 //copy of D9: c0[1] c1[1] c2[1] c3[1]
+    vand.v    D11,     D9,    D9 //second copy of D9: c0[1] c1[1] c2[1] c3[1]
+
+    vpermi.w  D10,     D8,    0x44 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    vpermi.w  D11,     D8,    0xee //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res03 res13 res23 res33
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0xee //c0:2 3, c1:2 3
+    vshuf4i.w  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0xee //c2:2 3, c3:2 3
+    vshuf4i.w  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    vpermi.w  D8,     D6,    0x44 //c0[2] c1[2] c2[2] c3[2]
+    vpermi.w  D9,     D7,    0x44 //c0[3] c1[3] c2[3] c3[3]
+
+    vfmul.s      D8,    U6,    VALPHAR
+    vfmul.s      D9,    U7,    VALPHAR
+    VNMSUB      D8,    U7,    VALPHAI, D8
+    VFMADD      D9,    U6,    VALPHAI, D9
+
+    vand.v    D4,     D9,    D9
+    vpermi.w  D4,     D8,    0x44 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+    vand.v    D2,     D4,    D4
+
+    vand.v    D5,     D9,    D9
+    vpermi.w  D5,     D8,    0xee //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+    vand.v    D3,     D5,    D5
+
+    vand.v    D0,     D10,    D10 //c0[0] c0[1] c1[0] c1[1]
+    vand.v    D1,     D11,    D11 //c2[0] c2[1] c3[0] c3[1]
+
+    vpermi.w  D4,     D0,     0x44 //c0: 0 1 2 3
+    vpermi.w  D2,     D0,     0xee //c1: 0 1 2 3
+    vpermi.w  D5,     D1,     0x44 //c2: 0 1 2 3
+    vpermi.w  D3,     D1,     0xee //c3: 0 1 2 3
+
+    vst       D4,     C0,    0x00
+    vst       D2,     C1,    0x00
+    vst       D5,     C2,    0x00
+    vst       D3,     C3,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+#else
+    //res00 res10 res20 res30
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+    vld       D2,     C2,    0x00 //c2: 0 1 2 3
+    vld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0x44 //c0:0 1, c1:0 1
+    vshuf4i.w  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0x44 //c2:0 1, c3:0 1
+    vshuf4i.w  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    vpermi.w  D8,     D6,    0x44 //c0[0] c1[0] c2[0] c3[0]
+    vpermi.w  D9,     D7,    0x44 //c0[1] c1[1] c2[1] c3[1]
+
+    // Apply alpha to the accumulators and add into the C values held in D8/D9
+    // (VFMADD/VNMSUB are macros defined outside this excerpt). Nothing is stored
+    // here: C0..C3 are written once, after the lanes are re-interleaved.
+    VFMADD      D8,    U0,    VALPHAR, D8
+    VFMADD      D9,    U1,    VALPHAR, D9
+    VNMSUB      D8,    U1,    VALPHAI, D8
+    VFMADD      D9,    U0,    VALPHAI, D9
+
+    vand.v    D10,     D9,    D9 //copy of D9: c0[1] c1[1] c2[1] c3[1]
+    vand.v    D11,     D9,    D9 //second copy of D9: c0[1] c1[1] c2[1] c3[1]
+
+    vpermi.w  D10,     D8,    0x44 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    vpermi.w  D11,     D8,    0xee //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res01 res11 res21 res31
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0xee //c0:2 3, c1:2 3
+    vshuf4i.w  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0xee //c2:2 3, c3:2 3
+    vshuf4i.w  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    vpermi.w  D8,     D6,    0x44 //c0[2] c1[2] c2[2] c3[2]
+    vpermi.w  D9,     D7,    0x44 //c0[3] c1[3] c2[3] c3[3]
+
+    VFMADD      D8,    U2,    VALPHAR, D8
+    VFMADD      D9,    U3,    VALPHAR, D9
+    VNMSUB      D8,    U3,    VALPHAI, D8
+    VFMADD      D9,    U2,    VALPHAI, D9
+
+    vand.v    D4,     D9,    D9
+    vpermi.w  D4,     D8,    0x44 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+    vand.v    D2,     D4,    D4
+
+    vand.v    D5,     D9,    D9
+    vpermi.w  D5,     D8,    0xee //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+    vand.v    D3,     D5,    D5
+
+    vand.v    D0,     D10,    D10 //c0[0] c0[1] c1[0] c1[1]
+    vand.v    D1,     D11,    D11 //c2[0] c2[1] c3[0] c3[1]
+
+    vpermi.w  D4,     D0,     0x44 //c0: 0 1 2 3
+    vpermi.w  D2,     D0,     0xee //c1: 0 1 2 3
+    vpermi.w  D5,     D1,     0x44 //c2: 0 1 2 3
+    vpermi.w  D3,     D1,     0xee //c3: 0 1 2 3
+
+    // vst       VALPHAR,C0,    0x00
+    // LD        $f15,   C0,    0x00
+    // LD        $f15,   C0,    0x04
+    // LD        $f15,   C0,    0x08
+    // LD        $f15,   C0,    0x0c
+
+    // vst       VALPHAI,C0,    0x00
+    // LD        $f15,   C0,    0x00
+    // LD        $f15,   C0,    0x04
+    // LD        $f15,   C0,    0x08
+    // LD        $f15,   C0,    0x0c
+
+    vst       D4,     C0,    0x00
+    vst       D2,     C1,    0x00
+    vst       D5,     C2,    0x00
+    vst       D3,     C3,    0x00
+
+    // LD        $f15,   C0,    0x00
+    // LD        $f15,   C0,    0x04
+    // LD        $f15,   C0,    0x08
+    // LD        $f15,   C0,    0x0c
+
+    // LD        $f15,   C1,    0x00
+    // LD        $f15,   C1,    0x04
+    // LD        $f15,   C1,    0x08
+    // LD        $f15,   C1,    0x0c
+
+    // LD        $f15,   C2,    0x00
+    // LD        $f15,   C2,    0x04
+    // LD        $f15,   C2,    0x08
+    // LD        $f15,   C2,    0x0c
+
+    // LD        $f15,   C3,    0x00
+    // LD        $f15,   C3,    0x04
+    // LD        $f15,   C3,    0x08
+    // LD        $f15,   C3,    0x0c
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+
+    //res02 res12 res22 res32
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+    vld       D2,     C2,    0x00 //c2: 0 1 2 3
+    vld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0x44 //c0:0 1, c1:0 1
+    vshuf4i.w  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0x44 //c2:0 1, c3:0 1
+    vshuf4i.w  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    vpermi.w  D8,     D6,    0x44 //c0[0] c1[0] c2[0] c3[0]
+    vpermi.w  D9,     D7,    0x44 //c0[1] c1[1] c2[1] c3[1]
+
+    VFMADD      D8,    U4,    VALPHAR, D8
+    VFMADD      D9,    U5,    VALPHAR, D9
+    VNMSUB      D8,    U5,    VALPHAI, D8
+    VFMADD      D9,    U4,    VALPHAI, D9
+
+    vand.v    D10,     D9,    D9 //copy of D9: c0[1] c1[1] c2[1] c3[1]
+    vand.v    D11,     D9,    D9 //second copy of D9: c0[1] c1[1] c2[1] c3[1]
+
+    vpermi.w  D10,     D8,    0x44 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    vpermi.w  D11,     D8,    0xee //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res03 res13 res23 res33
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0xee //c0:2 3, c1:2 3
+    vshuf4i.w  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0xee //c2:2 3, c3:2 3
+    vshuf4i.w  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    vpermi.w  D8,     D6,    0x44 //c0[2] c1[2] c2[2] c3[2]
+    vpermi.w  D9,     D7,    0x44 //c0[3] c1[3] c2[3] c3[3]
+
+    VFMADD      D8,    U6,    VALPHAR, D8
+    VFMADD      D9,    U7,    VALPHAR, D9
+    VNMSUB      D8,    U7,    VALPHAI, D8
+    VFMADD      D9,    U6,    VALPHAI, D9
+
+    vand.v    D4,     D9,    D9
+    vpermi.w  D4,     D8,    0x44 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+    vand.v    D2,     D4,    D4
+
+    vand.v    D5,     D9,    D9
+    vpermi.w  D5,     D8,    0xee //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+    vand.v    D3,     D5,    D5
+
+    vand.v    D0,     D10,    D10 //c0[0] c0[1] c1[0] c1[1]
+    vand.v    D1,     D11,    D11 //c2[0] c2[1] c3[0] c3[1]
+
+    vpermi.w  D4,     D0,     0x44 //c0: 0 1 2 3
+    vpermi.w  D2,     D0,     0xee //c1: 0 1 2 3
+    vpermi.w  D5,     D1,     0x44 //c2: 0 1 2 3
+    vpermi.w  D3,     D1,     0xee //c3: 0 1 2 3
+
+    vst       D4,     C0,    0x00
+    vst       D2,     C1,    0x00
+    vst       D5,     C2,    0x00
+    vst       D3,     C3,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -4
+#else
+    addi.d     TL,     TL,   -4
+#endif
+    slli.d     T3,     TL,   0x05
+    add.d      A0,     A0,   T3
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   4
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L18:   /* if (bm & 2): set up the 2-row remainder of M, then run .L181 */
+    move       I,      $r0
+    andi       T0,     M,     2
+    beq        I,      T0,    .L183  // bit 1 of M clear: go to the (bm & 1) case
+
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk (k trip count when not TRMM) */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb (repeats the move above)
+#else
+    slli.d     T3,     OFF,  0x04  // OFF * 16 bytes (A0 moves 0x10 per k in .L181)
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x05  // OFF * 32 bytes (B0 moves 0x20 per k in .L181)
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF  // TL = K - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2  // TL = OFF + 2
+#else
+    addi.d     TL,     OFF,   4  // TL = OFF + 4
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    U0,     U0,   U0  // zero the accumulators U0..U3
+    vxor.v    U1,     U1,   U1
+    vxor.v    U2,     U2,   U2
+    vxor.v    U3,     U3,   U3
+
+    move       L,      $r0   //k loop counter, starts at 0
+    beq        L,      TL,    .L182  // TL == 0: skip the k loop
+    blt        TL,     L,     .L182  // TL < 0: skip the k loop
+
+.L181:  /* for (k=0; k<temp; k++): one 16-byte A load and two 16-byte B loads per step */
+    vld       D0,     A0,    0x00  // a0r a0i a1r a1i
+    vld       D2,     B0,    0x00  // b0r b0i b1r b1i
+    vld       D3,     B0,    0x10  // b2r b2i b3r b3i
+
+    vshuf4i.w  D4,     D0,    0x00  //a0r broadcast to all four words
+    vshuf4i.w  D5,     D0,    0x55  //a0i broadcast to all four words
+
+    vpackev.w  D6,     D3,    D2  //even words: b0r b2r b1r b3r
+    vshuf4i.w  D6,     D6,    0xd8  //b0r b1r b2r b3r
+
+    vpackod.w  D7,     D3,    D2  //odd words: b0i b2i b1i b3i
+    vshuf4i.w  D7,     D7,    0xd8  //b0i b1i b2i b3i
+
+    VMADD1    U0,     D4,    D6,     U0  //00r 10r 20r 30r
+    VMADD2    U1,     D5,    D6,     U1  //00i 10i 20i 30i
+    VMADD3    U0,     D5,    D7,     U0  //VMADD1..4 are macros defined outside this excerpt
+    VMADD4    U1,     D4,    D7,     U1
+
+    vshuf4i.w  D4,     D0,    0xaa  //a1r broadcast to all four words
+    vshuf4i.w  D5,     D0,    0xff  //a1i broadcast to all four words
+
+    VMADD1    U2,     D4,    D6,     U2  //01r 11r 21r 31r
+    VMADD2    U3,     D5,    D6,     U3  //01i 11i 21i 31i
+    VMADD3    U2,     D5,    D7,     U2
+    VMADD4    U3,     D4,    D7,     U3
+
+    addi.d     A0,     A0,    0x10  // step A0 past the 16 bytes just read
+    addi.d     B0,     B0,    0x20  // step B0 past the 32 bytes just read
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L181  // loop while L < TL
+
+.L182:
+#if defined(TRMMKERNEL)
+    //res00 res10 res20 res30
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+    vld       D2,     C2,    0x00 //c2: 0 1 2 3
+    vld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0x44 //c0:0 1, c1:0 1
+    vshuf4i.w  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0x44 //c2:0 1, c3:0 1
+    vshuf4i.w  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    vpermi.w  D8,     D6,    0x44 //c0[0] c1[0] c2[0] c3[0]
+    vpermi.w  D9,     D7,    0x44 //c0[1] c1[1] c2[1] c3[1]
+
+    vfmul.s      D8,    U0,    VALPHAR
+    vfmul.s      D9,    U1,    VALPHAR
+    VNMSUB      D8,    U1,    VALPHAI, D8
+    VFMADD      D9,    U0,    VALPHAI, D9
+
+    vand.v    D10,     D9,    D9 //copy of D9: c0[1] c1[1] c2[1] c3[1]
+    vand.v    D11,     D9,    D9 //second copy of D9: c0[1] c1[1] c2[1] c3[1]
+
+    vpermi.w  D10,     D8,    0x44 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    vpermi.w  D11,     D8,    0xee //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res01 res11 res21 res31
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0xee //c0:2 3, c1:2 3
+    vshuf4i.w  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0xee //c2:2 3, c3:2 3
+    vshuf4i.w  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    vpermi.w  D8,     D6,    0x44 //c0[2] c1[2] c2[2] c3[2]
+    vpermi.w  D9,     D7,    0x44 //c0[3] c1[3] c2[3] c3[3]
+
+    vfmul.s      D8,    U2,    VALPHAR
+    vfmul.s      D9,    U3,    VALPHAR
+    VNMSUB      D8,    U3,    VALPHAI, D8
+    VFMADD      D9,    U2,    VALPHAI, D9
+
+    vand.v    D4,     D9,    D9
+    vpermi.w  D4,     D8,    0x44 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+    vand.v    D2,     D4,    D4
+
+    vand.v    D5,     D9,    D9
+    vpermi.w  D5,     D8,    0xee //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+    vand.v    D3,     D5,    D5
+
+    vand.v    D0,     D10,    D10 //c0[0] c0[1] c1[0] c1[1]
+    vand.v    D1,     D11,    D11 //c2[0] c2[1] c3[0] c3[1]
+
+    vpermi.w  D4,     D0,     0x44 //c0: 0 1 2 3
+    vpermi.w  D2,     D0,     0xee //c1: 0 1 2 3
+    vpermi.w  D5,     D1,     0x44 //c2: 0 1 2 3
+    vpermi.w  D3,     D1,     0xee //c3: 0 1 2 3
+
+    vst       D4,     C0,    0x00
+    vst       D2,     C1,    0x00
+    vst       D5,     C2,    0x00
+    vst       D3,     C3,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+#else
+    //res00 res10 res20 res30
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+    vld       D2,     C2,    0x00 //c2: 0 1 2 3
+    vld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0x44 //c0:0 1, c1:0 1
+    vshuf4i.w  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0x44 //c2:0 1, c3:0 1
+    vshuf4i.w  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    vpermi.w  D8,     D6,    0x44 //c0[0] c1[0] c2[0] c3[0]
+    vpermi.w  D9,     D7,    0x44 //c0[1] c1[1] c2[1] c3[1]
+
+    VFMADD      D8,    U0,    VALPHAR, D8
+    VFMADD      D9,    U1,    VALPHAR, D9
+    VNMSUB      D8,    U1,    VALPHAI, D8
+    VFMADD      D9,    U0,    VALPHAI, D9
+
+    vand.v    D10,     D9,    D9 //copy of D9: c0[1] c1[1] c2[1] c3[1]
+    vand.v    D11,     D9,    D9 //second copy of D9: c0[1] c1[1] c2[1] c3[1]
+
+    vpermi.w  D10,     D8,    0x44 //c0[0] c1[0] c0[1] c1[1]
+    vshuf4i.w  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    vpermi.w  D11,     D8,    0xee //c2[0] c3[0] c2[1] c3[1]
+    vshuf4i.w  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res01 res11 res21 res31
+    vand.v    D4,     D1,    D1
+    vpermi.w  D4,     D0,    0xee //c0:2 3, c1:2 3
+    vshuf4i.w  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    vand.v    D5,     D3,    D3
+    vpermi.w  D5,     D2,    0xee //c2:2 3, c3:2 3
+    vshuf4i.w  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    vpermi.w  D8,     D6,    0x44 //c0[2] c1[2] c2[2] c3[2]
+    vpermi.w  D9,     D7,    0x44 //c0[3] c1[3] c2[3] c3[3]
+
+    VFMADD      D8,    U2,    VALPHAR, D8
+    VFMADD      D9,    U3,    VALPHAR, D9
+    VNMSUB      D8,    U3,    VALPHAI, D8
+    VFMADD      D9,    U2,    VALPHAI, D9
+
+    vand.v    D4,     D9,    D9
+    vpermi.w  D4,     D8,    0x44 //c0[2] c1[2] c0[3] c1[3]
+    vshuf4i.w D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+    vand.v    D2,     D4,    D4
+
+    vand.v    D5,     D9,    D9
+    vpermi.w  D5,     D8,    0xee //c2[2] c3[2] c2[3] c3[3]
+    vshuf4i.w D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+    vand.v    D3,     D5,    D5
+
+    vand.v    D0,     D10,    D10 //c0[0] c0[1] c1[0] c1[1]
+    vand.v    D1,     D11,    D11 //c2[0] c2[1] c3[0] c3[1]
+
+    vpermi.w  D4,     D0,     0x44 //c0: 0 1 2 3
+    vpermi.w  D2,     D0,     0xee //c1: 0 1 2 3
+    vpermi.w  D5,     D1,     0x44 //c2: 0 1 2 3
+    vpermi.w  D3,     D1,     0xee //c3: 0 1 2 3
+
+    vst       D4,     C0,    0x00
+    vst       D2,     C1,    0x00
+    vst       D5,     C2,    0x00
+    vst       D3,     C3,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2
+#else
+    addi.d     TL,     TL,   -4
+#endif
+    slli.d     T3,     TL,   0x04
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x05
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L183:   /* if (bm & 1): set up the last single row of M, then run .L184 */
+    move       I,      $r0
+    andi       T0,     M,     1
+    beq        I,      T0,    .L186  // bit 0 of M clear: nothing left for this j step
+
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk (k trip count when not TRMM) */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb (repeats the move above)
+#else
+    slli.d     T3,     OFF,  0x03  // OFF * 8 bytes (A0 moves 0x08 per k in .L184)
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x05  // OFF * 32 bytes (B0 moves 0x20 per k in .L184)
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF  // TL = K - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1  // TL = OFF + 1
+#else
+    addi.d     TL,     OFF,   4  // TL = OFF + 4
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0  // zero the eight scalar accumulators c11..c42
+    MTC        c12,    $r0
+    MTC        c21,    $r0
+    MTC        c22,    $r0
+    MTC        c31,    $r0
+    MTC        c32,    $r0
+    MTC        c41,    $r0
+    MTC        c42,    $r0
+
+    move       L,      $r0   //k loop counter, starts at 0
+    beq        L,      TL,    .L185  // TL == 0: skip the k loop
+    blt        TL,     L,     .L185  // TL < 0: skip the k loop
+
+.L184:  /* for (k=0; k<temp; k++): scalar loop, one A element times four B elements */
+    LD         a1,     A0,    0x00        //a0r
+    LD         a2,     A0,    0x04        //a0i
+
+    LD         b1,     B0,    0x00        //b0r
+    LD         b2,     B0,    0x04        //b0i
+    LD         b3,     B0,    0x08        //b1r
+    LD         b4,     B0,    0x0c        //b1i
+    LD         b5,     B0,    0x10        //b2r
+    LD         b6,     B0,    0x14        //b2i
+    LD         b7,     B0,    0x18        //b3r
+    LD         b8,     B0,    0x1c        //b3i
+
+    MADD1      c11,    a1,    b1,     c11  //res00r
+    MADD2      c12,    a2,    b1,     c12  //res00i
+    MADD3      c11,    a2,    b2,     c11  //MADD1..4 are macros defined outside this excerpt
+    MADD4      c12,    a1,    b2,     c12
+
+    MADD1      c21,    a1,    b3,     c21  //res10r
+    MADD2      c22,    a2,    b3,     c22  //res10i
+    MADD3      c21,    a2,    b4,     c21
+    MADD4      c22,    a1,    b4,     c22
+
+    MADD1      c31,    a1,    b5,     c31  //res20r
+    MADD2      c32,    a2,    b5,     c32  //res20i
+    MADD3      c31,    a2,    b6,     c31
+    MADD4      c32,    a1,    b6,     c32
+
+    MADD1      c41,    a1,    b7,     c41  //res30r
+    MADD2      c42,    a2,    b7,     c42  //res30i
+    MADD3      c41,    a2,    b8,     c41
+    MADD4      c42,    a1,    b8,     c42
+
+    addi.d     A0,     A0,    0x08  // step A0 past the 8 bytes just read
+    addi.d     B0,     B0,    0x20  // step B0 past the 32 bytes just read
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L184  // loop while L < TL
+
+.L185:
+#if defined(TRMMKERNEL)
+    //res00 res10 res20 res30
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+
+    MUL       a5,     c11,   ALPHA_R
+    MUL       a6,     c12,   ALPHA_R
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+
+    LD         a5,     C1,    0x00    //C1[0]
+    LD         a6,     C1,    0x04    //C1[1]
+
+    MUL       a5,     c21,   ALPHA_R
+    MUL       a6,     c22,   ALPHA_R
+    NMSUB      a5,     c22,   ALPHA_I, a5
+    MADD       a6,     c21,   ALPHA_I, a6
+
+    ST         a5,     C1,    0x00
+    ST         a6,     C1,    0x04
+
+    LD         a5,     C2,    0x00    //C2[0]
+    LD         a6,     C2,    0x04    //C2[1]
+
+    MUL       a5,     c31,   ALPHA_R
+    MUL       a6,     c32,   ALPHA_R
+    NMSUB      a5,     c32,   ALPHA_I, a5
+    MADD       a6,     c31,   ALPHA_I, a6
+
+    ST         a5,     C2,    0x00
+    ST         a6,     C2,    0x04
+
+    LD         a5,     C3,    0x00    //C3[0]
+    LD         a6,     C3,    0x04    //C3[1]
+
+    MUL       a5,     c41,   ALPHA_R
+    MUL       a6,     c42,   ALPHA_R
+    NMSUB      a5,     c42,   ALPHA_I, a5
+    MADD       a6,     c41,   ALPHA_I, a6
+
+    ST         a5,     C3,    0x00
+    ST         a6,     C3,    0x04
+
+    addi.d     C0,     C0,    0x08
+    addi.d     C1,     C1,    0x08
+    addi.d     C2,     C2,    0x08
+    addi.d     C3,     C3,    0x08
+#else
+    //res00 res10 res20 res30
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+
+    LD         a5,     C1,    0x00    //C1[0]
+    LD         a6,     C1,    0x04    //C1[1]
+
+    MADD       a5,     c21,   ALPHA_R, a5
+    MADD       a6,     c22,   ALPHA_R, a6
+    NMSUB      a5,     c22,   ALPHA_I, a5
+    MADD       a6,     c21,   ALPHA_I, a6
+
+    ST         a5,     C1,    0x00
+    ST         a6,     C1,    0x04
+
+    LD         a5,     C2,    0x00    //C2[0]
+    LD         a6,     C2,    0x04    //C2[1]
+
+    MADD       a5,     c31,   ALPHA_R, a5
+    MADD       a6,     c32,   ALPHA_R, a6
+    NMSUB      a5,     c32,   ALPHA_I, a5
+    MADD       a6,     c31,   ALPHA_I, a6
+
+    ST         a5,     C2,    0x00
+    ST         a6,     C2,    0x04
+
+    LD         a5,     C3,    0x00    //C3[0]
+    LD         a6,     C3,    0x04    //C3[1]
+
+    MADD       a5,     c41,   ALPHA_R, a5
+    MADD       a6,     c42,   ALPHA_R, a6
+    NMSUB      a5,     c42,   ALPHA_I, a5
+    MADD       a6,     c41,   ALPHA_I, a6
+
+    ST         a5,     C3,    0x00
+    ST         a6,     C3,    0x04
+
+    addi.d     C0,     C0,    0x08
+    addi.d     C1,     C1,    0x08
+    addi.d     C2,     C2,    0x08
+    addi.d     C3,     C3,    0x08
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF  // TL = K - OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1
+#else
+    addi.d     TL,     TL,   -4
+#endif
+    slli.d     T3,     TL,   0x03  // TL * 8 bytes: the per-k A0 step of .L184
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x05  // TL * 32 bytes: the per-k B0 step of .L184; T3 is the scratch, C3 stays intact
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+
+.L186:  // end of one j step: advance OFF/B/C, then loop back to .L10
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi.d     OFF,    OFF,   4
+#endif
+
+    slli.d     L,      K,     0x05  // K * 32 bytes
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   0x03  // LDC * 8; assumes LDC was pre-scaled before this excerpt - TODO confirm
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     1
+    srai.d     T0,     N,     2  // T0 = N / 4
+    blt        J,      T0,    .L10  // repeat while J < N / 4
+
+.L19:  // N remainder: test bit 1 of N
+    move       J,      $r0
+    andi       T0,     N,     2
+    beq        J,      T0,    .L30  // (N & 2) == 0: skip the two-column pass
+
+.L20: /* for (j=0; j<(bn&2); j+=2): set up C0/C1 and A0 for the two-column pass */
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET
+#endif
+
+    move       C0,     C
+    slli.d     TL,     LDC,   1  // TL = LDC * 2, the distance from C0 to C1
+    add.d      C1,     C0,    TL
+    move       A0,     A    //ptrba
+
+    move       I,      $r0
+    srai.d     T0,     M,     3  //bm/8
+    beq        I,      T0,    .L24  // M / 8 == 0: no full 8-row block
+
+.L21:  /* for (i=0; i<bm/8; i+=1): set up one 8-row block, then run .L22 */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk (k trip count when not TRMM) */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb (repeats the move above)
+#else
+    slli.d     T3,     OFF,  0x06  // OFF * 64 bytes (A0 moves 0x40 per k in .L22)
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x04  // OFF * 16 bytes (B0 moves 0x10 per k in .L22)
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF  // TL = K - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   8  // TL = OFF + 8
+#else
+    addi.d     TL,     OFF,   2  // TL = OFF + 2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    U0,     U0,   U0  // zero the accumulators U0..U7
+    vxor.v    U1,     U1,   U1
+    vxor.v    U2,     U2,   U2
+    vxor.v    U3,     U3,   U3
+    vxor.v    U4,     U4,   U4
+    vxor.v    U5,     U5,   U5
+    vxor.v    U6,     U6,   U6
+    vxor.v    U7,     U7,   U7
+
+    move       L,      $r0   //k loop counter, starts at 0
+    beq        L,      TL,    .L23  // TL == 0: skip the k loop
+    blt        TL,     L,     .L23  // TL < 0: skip the k loop
+
+.L22:  /* for (k=0; k<temp; k++): four 16-byte A loads against one 16-byte B load per step */
+    vld       D0,     A0,    0x00  // a0r a0i a1r a1i
+    vld       D2,     B0,    0x00  // b0r b0i b1r b1i
+
+    vshuf4i.w  D4,     D0,    0xa0  //a0r a0r a1r a1r
+    vshuf4i.w  D5,     D0,    0xf5  //a0i a0i a1i a1i
+
+    vshuf4i.w  D6,     D2,    0x88  //b0r b1r b0r b1r
+    vshuf4i.w  D7,     D2,    0xdd  //b0i b1i b0i b1i
+
+    VMADD1    U0,     D4,    D6,     U0  //00r 10r 01r 11r
+    VMADD2    U1,     D5,    D6,     U1  //00i 10i 01i 11i
+    VMADD3    U0,     D5,    D7,     U0  //VMADD1..4 are macros defined outside this excerpt
+    VMADD4    U1,     D4,    D7,     U1
+
+    vld       D0,     A0,    0x10  // a2r a2i a3r a3i
+
+    vshuf4i.w  D4,     D0,    0xa0  //a2r a2r a3r a3r
+    vshuf4i.w  D5,     D0,    0xf5  //a2i a2i a3i a3i
+
+    VMADD1    U2,     D4,    D6,     U2  //02r 12r 03r 13r
+    VMADD2    U3,     D5,    D6,     U3  //02i 12i 03i 13i
+    VMADD3    U2,     D5,    D7,     U2
+    VMADD4    U3,     D4,    D7,     U3
+
+    vld       D0,     A0,    0x20  // a4r a4i a5r a5i
+
+    vshuf4i.w  D4,     D0,    0xa0  //a4r a4r a5r a5r
+    vshuf4i.w  D5,     D0,    0xf5  //a4i a4i a5i a5i
+
+    VMADD1    U4,     D4,    D6,     U4  //04r 14r 05r 15r
+    VMADD2    U5,     D5,    D6,     U5  //04i 14i 05i 15i
+    VMADD3    U4,     D5,    D7,     U4
+    VMADD4    U5,     D4,    D7,     U5
+
+    vld       D0,     A0,    0x30  // a6r a6i a7r a7i
+
+    vshuf4i.w  D4,     D0,    0xa0  //a6r a6r a7r a7r
+    vshuf4i.w  D5,     D0,    0xf5  //a6i a6i a7i a7i
+
+    VMADD1    U6,     D4,    D6,     U6  //06r 16r 07r 17r
+    VMADD2    U7,     D5,    D6,     U7  //06i 16i 07i 17i
+    VMADD3    U6,     D5,    D7,     U6
+    VMADD4    U7,     D4,    D7,     U7
+
+    addi.d     A0,     A0,    0x40  // step A0 past the 64 bytes just read
+    addi.d     B0,     B0,    0x10  // step B0 past the 16 bytes just read
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L22  // loop while L < TL
+
+.L23:
+#if defined(TRMMKERNEL)
+    //res00 res10 res01 res11
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+
+    vpackev.w D2,     D1,    D0  //0 4 2 6
+    vpackod.w D3,     D1,    D0  //1 5 3 7
+
+    vfmul.s      D2,    U0,    VALPHAR
+    vfmul.s      D3,    U1,    VALPHAR
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.w D4,     D3,    D2  //0 1 2 3
+    vpackod.w D5,     D3,    D2  //4 5 6 7
+
+    vst       D4,     C0,    0x00 //c0: 0 1 2 3
+    vst       D5,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res02 res12 res03 res13
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+
+    vpackev.w D2,     D1,    D0  //0 4 2 6
+    vpackod.w D3,     D1,    D0  //1 5 3 7
+
+    vfmul.s      D2,    U2,    VALPHAR
+    vfmul.s      D3,    U3,    VALPHAR
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vpackev.w D4,     D3,    D2  //0 1 2 3
+    vpackod.w D5,     D3,    D2  //4 5 6 7
+
+    vst       D4,     C0,    0x00 //c0: 0 1 2 3
+    vst       D5,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res04 res14 res05 res15
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+
+    vpackev.w D2,     D1,    D0  //0 4 2 6
+    vpackod.w D3,     D1,    D0  //1 5 3 7
+
+    vfmul.s      D2,    U4,    VALPHAR
+    vfmul.s      D3,    U5,    VALPHAR
+    VNMSUB      D2,    U5,    VALPHAI, D2
+    VFMADD      D3,    U4,    VALPHAI, D3
+
+    vpackev.w D4,     D3,    D2  //0 1 2 3
+    vpackod.w D5,     D3,    D2  //4 5 6 7
+
+    vst       D4,     C0,    0x00 //c0: 0 1 2 3
+    vst       D5,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res06 res16 res07 res17
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+
+    vpackev.w D2,     D1,    D0  //0 4 2 6
+    vpackod.w D3,     D1,    D0  //1 5 3 7
+
+    vfmul.s      D2,    U6,    VALPHAR
+    vfmul.s      D3,    U7,    VALPHAR
+    VNMSUB      D2,    U7,    VALPHAI, D2
+    VFMADD      D3,    U6,    VALPHAI, D3
+
+    vpackev.w D4,     D3,    D2  //0 1 2 3
+    vpackod.w D5,     D3,    D2  //4 5 6 7
+
+    vst       D4,     C0,    0x00 //c0: 0 1 2 3
+    vst       D5,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#else
+    //res00 res10 res01 res11
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+
+    vpackev.w D2,     D1,    D0  //0 4 2 6
+    vpackod.w D3,     D1,    D0  //1 5 3 7
+
+    VFMADD      D2,    U0,    VALPHAR, D2
+    VFMADD      D3,    U1,    VALPHAR, D3
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.w D4,     D3,    D2  //0 1 2 3
+    vpackod.w D5,     D3,    D2  //4 5 6 7
+
+    vst       D4,     C0,    0x00 //c0: 0 1 2 3
+    vst       D5,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res02 res12 res03 res13
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+
+    vpackev.w D2,     D1,    D0  //0 4 2 6
+    vpackod.w D3,     D1,    D0  //1 5 3 7
+
+    VFMADD      D2,    U2,    VALPHAR, D2
+    VFMADD      D3,    U3,    VALPHAR, D3
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vpackev.w D4,     D3,    D2  //0 1 2 3
+    vpackod.w D5,     D3,    D2  //4 5 6 7
+
+    vst       D4,     C0,    0x00 //c0: 0 1 2 3
+    vst       D5,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res04 res14 res05 res15
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+
+    vpackev.w D2,     D1,    D0  //0 4 2 6
+    vpackod.w D3,     D1,    D0  //1 5 3 7
+
+    VFMADD      D2,    U4,    VALPHAR, D2
+    VFMADD      D3,    U5,    VALPHAR, D3
+    VNMSUB      D2,    U5,    VALPHAI, D2
+    VFMADD      D3,    U4,    VALPHAI, D3
+
+    vpackev.w D4,     D3,    D2  //0 1 2 3
+    vpackod.w D5,     D3,    D2  //4 5 6 7
+
+    vst       D4,     C0,    0x00 //c0: 0 1 2 3
+    vst       D5,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res06 res16 res07 res17
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+
+    vpackev.w D2,     D1,    D0  //0 4 2 6
+    vpackod.w D3,     D1,    D0  //1 5 3 7
+
+    VFMADD      D2,    U6,    VALPHAR, D2
+    VFMADD      D3,    U7,    VALPHAR, D3
+    VNMSUB      D2,    U7,    VALPHAI, D2
+    VFMADD      D3,    U6,    VALPHAI, D3
+
+    vpackev.w D4,     D3,    D2  //0 1 2 3
+    vpackod.w D5,     D3,    D2  //4 5 6 7
+
+    vst       D4,     C0,    0x00 //c0: 0 1 2 3
+    vst       D5,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -8
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     T3,     TL,   0x06
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x04
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   8
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L21
+
+.L24:   /* if ( bm & 4 ): 4-row remainder of M for the two-column pass */
+    move       I,      $r0         // I = 0, used as the zero operand of the beq below
+    andi       T1,     M,     4    //bm&4
+    beq        I,      T1,    .L280   // (M & 4) == 0: go to the bm&2 check at .L280
+
+.L25:   /* set up B0 and TL, then clear the accumulators for this tile */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk; the TRMM branches below may replace it */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb (repeats the unconditional move above)
+#else
+    slli.d     T3,     OFF,   0x05   // T3 = OFF * 32 bytes (A0 advances 0x20 per k step in .L26)
+    add.d      A0,     A0,    T3     // skip the first OFF k steps of A
+    slli.d     T3,     OFF,   0x04   // T3 = OFF * 16 bytes (B0 advances 0x10 per k step in .L26)
+    add.d      B0,     B,     T3     // B0 = B + OFF k steps
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF    // TL = K - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   4      // TL = OFF + 4 (row count of this tile)
+#else
+    addi.d     TL,     OFF,   2      // TL = OFF + 2 (column count of this tile)
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    U0,     U0,   U0   // x ^ x = 0: clear the four accumulators used by .L26
+    vxor.v    U1,     U1,   U1
+    vxor.v    U2,     U2,   U2
+    vxor.v    U3,     U3,   U3
+
+    move       L,      $r0   //cycle param k (L = 0)
+    beq        L,      TL,    .L27   // TL == 0: no k iterations, go straight to write-back
+    blt        TL,     L,     .L27   // TL < 0: likewise
+
+.L26:  /* for (k=0; k<temp; k++): one k step of the 4-row x 2-column tile; temp is TL */
+    vld       D0,     A0,    0x00  // a0ri a1ri
+    vld       D2,     B0,    0x00  // b0ri b1ri
+
+    vshuf4i.w  D4,     D0,    0xa0  //a0rr a1rr (word lanes 0,0,2,2)
+    vshuf4i.w  D5,     D0,    0xf5  //a0ii a1ii (word lanes 1,1,3,3)
+
+    vshuf4i.w  D6,     D2,    0x88  //b0r b1r b0r b1r (word lanes 0,2,0,2)
+    vshuf4i.w  D7,     D2,    0xdd  //b0i b1i b0i b1i (word lanes 1,3,1,3)
+
+    VMADD1    U0,     D4,    D6,     U0  //00r 10r 01r 11r; VMADD1..4 are macros defined outside this section, presumably sign variants of a fused multiply-add -- confirm
+    VMADD2    U1,     D5,    D6,     U1  //00i 10i 01i 11i
+    VMADD3    U0,     D5,    D7,     U0  // imag(a) x imag(b) term folded into U0
+    VMADD4    U1,     D4,    D7,     U1  // real(a) x imag(b) term folded into U1
+
+    vld       D0,     A0,    0x10  // a2ri a3ri
+
+    vshuf4i.w  D4,     D0,    0xa0  //a2rr a3rr
+    vshuf4i.w  D5,     D0,    0xf5  //a2ii a3ii
+
+    VMADD1    U2,     D4,    D6,     U2  //02r 12r 03r 13r (D6/D7 reused from the B load above)
+    VMADD2    U3,     D5,    D6,     U3  //02i 12i 03i 13i
+    VMADD3    U2,     D5,    D7,     U2
+    VMADD4    U3,     D4,    D7,     U3
+
+    addi.d     A0,     A0,    0x20   // A0 += 32 bytes (the two 16-byte A loads above)
+    addi.d     B0,     B0,    0x10   // B0 += 16 bytes (the one 16-byte B load above)
+
+    addi.d     L,      L,     1      // k++
+    blt        L,      TL,    .L26   // loop while k < TL
+
+.L27:
+#if defined(TRMMKERNEL)
+    //res00 res10 res01 res11
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+
+    vpackev.w D2,     D1,    D0  //0 4 2 6
+    vpackod.w D3,     D1,    D0  //1 5 3 7
+
+    vfmul.s      D2,    U0,    VALPHAR
+    vfmul.s      D3,    U1,    VALPHAR
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.w D4,     D3,    D2  //0 1 2 3
+    vpackod.w D5,     D3,    D2  //4 5 6 7
+
+    vst       D4,     C0,    0x00 //c0: 0 1 2 3
+    vst       D5,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res02 res12 res03 res13
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+
+    vpackev.w D2,     D1,    D0  //0 4 2 6
+    vpackod.w D3,     D1,    D0  //1 5 3 7
+
+    vfmul.s      D2,    U2,    VALPHAR
+    vfmul.s      D3,    U3,    VALPHAR
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vpackev.w D4,     D3,    D2  //0 1 2 3
+    vpackod.w D5,     D3,    D2  //4 5 6 7
+
+    vst       D4,     C0,    0x00 //c0: 0 1 2 3
+    vst       D5,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#else
+    //res00 res10 res01 res11
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+
+    vpackev.w D2,     D1,    D0  //0 4 2 6
+    vpackod.w D3,     D1,    D0  //1 5 3 7
+
+    VFMADD      D2,    U0,    VALPHAR, D2
+    VFMADD      D3,    U1,    VALPHAR, D3
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.w D4,     D3,    D2  //0 1 2 3
+    vpackod.w D5,     D3,    D2  //4 5 6 7
+
+    vst       D4,     C0,    0x00 //c0: 0 1 2 3
+    vst       D5,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res02 res12 res03 res13
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+
+    vpackev.w D2,     D1,    D0  //0 4 2 6
+    vpackod.w D3,     D1,    D0  //1 5 3 7
+
+    VFMADD      D2,    U2,    VALPHAR, D2
+    VFMADD      D3,    U3,    VALPHAR, D3
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vpackev.w D4,     D3,    D2  //0 1 2 3
+    vpackod.w D5,     D3,    D2  //4 5 6 7
+
+    vst       D4,     C0,    0x00 //c0: 0 1 2 3
+    vst       D5,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -4
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     T3,     TL,   0x05
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x04
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   4
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L280:   /* if ( bm & 2 )*/
+    move       I,      $r0
+    andi       T1,     M,     2    //bm&2
+    beq        I,      T1,    .L284
+
+.L281:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x04
+    add.d      A0,     A0,    T3
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2
+#else
+    addi.d     TL,     OFF,   2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    U0,     U0,   U0
+    vxor.v    U1,     U1,   U1
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L283
+    blt        TL,     L,     .L283
+
+.L282:  /* for (k=0; k<temp; k++) */
+    vld       D0,     A0,    0x00  // a0ri a1ri
+    vld       D2,     B0,    0x00  // b0ri b1ri
+
+    vshuf4i.w  D4,     D0,    0xa0  //a0rr a1rr
+    vshuf4i.w  D5,     D0,    0xf5  //a0ii a1ii
+
+    vshuf4i.w  D6,     D2,    0x88  //b0r b1r b0r b1r
+    vshuf4i.w  D7,     D2,    0xdd  //b0i b1i b0i b1i
+
+    VMADD1    U0,     D4,    D6,     U0  //00r 10r 01r 11r
+    VMADD2    U1,     D5,    D6,     U1  //00i 10i 01i 11i
+    VMADD3    U0,     D5,    D7,     U0
+    VMADD4    U1,     D4,    D7,     U1
+
+    addi.d     A0,     A0,    0x10
+    addi.d     B0,     B0,    0x10
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L282
+
+.L283:
+#if defined(TRMMKERNEL)
+    //res00 res10 res01 res11
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+
+    vpackev.w D2,     D1,    D0  //0 4 2 6
+    vpackod.w D3,     D1,    D0  //1 5 3 7
+
+    vfmul.s      D2,    U0,    VALPHAR
+    vfmul.s      D3,    U1,    VALPHAR
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.w D4,     D3,    D2  //0 1 2 3
+    vpackod.w D5,     D3,    D2  //4 5 6 7
+
+    vst       D4,     C0,    0x00 //c0: 0 1 2 3
+    vst       D5,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#else
+    //res00 res10 res01 res11
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C1,    0x00 //c1: 0 1 2 3
+
+    vpackev.w D2,     D1,    D0  //0 4 2 6
+    vpackod.w D3,     D1,    D0  //1 5 3 7
+
+    VFMADD      D2,    U0,    VALPHAR, D2
+    VFMADD      D3,    U1,    VALPHAR, D3
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.w D4,     D3,    D2  //0 1 2 3
+    vpackod.w D5,     D3,    D2  //4 5 6 7
+
+    vst       D4,     C0,    0x00 //c0: 0 1 2 3
+    vst       D5,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     T3,     TL,   0x04
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x04
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L284:   /* if ( bm & 1 ): last single row of M for the two-column pass */
+    move       I,      $r0         // I = 0, used as the zero operand of the beq below
+    andi       T1,     M,     1    //bm&1
+    beq        I,      T1,    .L288   // M is even: nothing left in this pass
+
+.L285:   /* set up B0 and TL, then clear the scalar accumulators */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk; the TRMM branches below may replace it */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb (repeats the unconditional move above)
+#else
+    slli.d     T3,     OFF,   0x03   // T3 = OFF * 8 bytes (A0 advances 0x08 per k step in .L286)
+    add.d      A0,     A0,    T3     // skip the first OFF k steps of A
+    slli.d     T3,     OFF,   0x04   // T3 = OFF * 16 bytes (B0 advances 0x10 per k step in .L286)
+    add.d      B0,     B,     T3     // B0 = B + OFF k steps
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF    // TL = K - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1      // TL = OFF + 1 (row count of this tile)
+#else
+    addi.d     TL,     OFF,   2      // TL = OFF + 2 (column count of this tile)
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0   // MTC is defined outside this section; presumably moves integer zero into the FP accumulator -- confirm
+    MTC        c12,    $r0
+    MTC        c21,    $r0
+    MTC        c22,    $r0
+
+    move       L,      $r0   //cycle param k (L = 0)
+    beq        L,      TL,    .L287   // TL == 0: no k iterations, go straight to write-back
+    blt        TL,     L,     .L287   // TL < 0: likewise
+
+.L286:  /* for (k=0; k<temp; k++): scalar k step for one row of A times two columns of B; temp is TL */
+    LD         a1,     A0,    0x00        //a0r
+    LD         a2,     A0,    0x04        //a0i
+
+    LD         b1,     B0,    0x00        //b0r
+    LD         b2,     B0,    0x04        //b0i
+    LD         b3,     B0,    0x08        //b1r
+    LD         b4,     B0,    0x0c        //b1i
+
+    MADD1      c11,    a1,    b1,     c11  //res00r; MADD1..4 are macros defined outside this section -- sign conventions not visible here
+    MADD2      c12,    a2,    b1,     c12  //res00i
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+
+    MADD1      c21,    a1,    b3,     c21  //res10r: a0 x b1, written through C1 at .L287
+    MADD2      c22,    a2,    b3,     c22  //res10i
+    MADD3      c21,    a2,    b4,     c21
+    MADD4      c22,    a1,    b4,     c22
+
+    addi.d     A0,     A0,    0x08   // A0 += 8 bytes (the a0r/a0i pair loaded above)
+    addi.d     B0,     B0,    0x10   // B0 += 16 bytes (the b0/b1 pairs loaded above)
+
+    addi.d     L,      L,     1      // k++
+    blt        L,      TL,    .L286  // loop while k < TL
+
+.L287:   /* write back the 1-row x 2-column tile, scaled by ALPHA_R/ALPHA_I */
+#if defined(TRMMKERNEL)
+    //res00 res10 (TRMM: the result replaces C instead of accumulating into it)
+    LD         a5,     C0,    0x00    //C0[0]; a5..a8 are overwritten by the MUL lines below, so these loaded values are not used
+    LD         a6,     C0,    0x04    //C0[1]
+    LD         a7,     C1,    0x00    //C1[0]
+    LD         a8,     C1,    0x04    //C1[1]
+
+    MUL       a5,     c11,   ALPHA_R           // a5 = c11 * ALPHA_R
+    MUL       a6,     c12,   ALPHA_R           // a6 = c12 * ALPHA_R
+    NMSUB      a5,     c12,   ALPHA_I, a5      // presumably a5 -= c12 * ALPHA_I (macro defined outside this section -- confirm)
+    MADD       a6,     c11,   ALPHA_I, a6      // presumably a6 += c11 * ALPHA_I
+
+    MUL       a7,     c21,   ALPHA_R           // same computation for the C1 element
+    MUL       a8,     c22,   ALPHA_R
+    NMSUB      a7,     c22,   ALPHA_I, a7
+    MADD       a8,     c21,   ALPHA_I, a8
+
+    ST         a5,     C0,    0x00   // real part to C0
+    ST         a6,     C0,    0x04   // imaginary part to C0
+
+    ST         a7,     C1,    0x00   // real part to C1
+    ST         a8,     C1,    0x04   // imaginary part to C1
+
+    addi.d     C0,     C0,    0x08   // advance both output cursors by 8 bytes
+    addi.d     C1,     C1,    0x08
+#else
+    //res00 res10 (GEMM: accumulate into the existing C values)
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+    LD         a7,     C1,    0x00    //C1[0]
+    LD         a8,     C1,    0x04    //C1[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5      // presumably a5 += c11 * ALPHA_R (macro defined outside this section -- confirm)
+    MADD       a6,     c12,   ALPHA_R, a6      // presumably a6 += c12 * ALPHA_R
+    NMSUB      a5,     c12,   ALPHA_I, a5      // presumably a5 -= c12 * ALPHA_I
+    MADD       a6,     c11,   ALPHA_I, a6      // presumably a6 += c11 * ALPHA_I
+
+    MADD       a7,     c21,   ALPHA_R, a7      // same computation for the C1 element
+    MADD       a8,     c22,   ALPHA_R, a8
+    NMSUB      a7,     c22,   ALPHA_I, a7
+    MADD       a8,     c21,   ALPHA_I, a8
+
+    ST         a5,     C0,    0x00   // real part to C0
+    ST         a6,     C0,    0x04   // imaginary part to C0
+
+    ST         a7,     C1,    0x00   // real part to C1
+    ST         a8,     C1,    0x04   // imaginary part to C1
+
+    addi.d     C0,     C0,    0x08   // advance both output cursors by 8 bytes
+    addi.d     C1,     C1,    0x08
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     T3,     TL,   0x03
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x04
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L288:   /* end of the two-column pass: advance OFF (TRMM, !LEFT), B, C and J */
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi.d     OFF,    OFF,   2      // OFF += 2
+#endif
+    slli.d     L,      K,     4      // L = K * 16 bytes
+    add.d      B,      B,     L      // B += K * 16 (the k loops of this pass step B0 by 0x10)
+
+    slli.d     I,      LDC,   2      // I = LDC << 2
+    add.d      C,      C,     I      // presumably two columns of C, with LDC pre-scaled before this section -- confirm against the prologue
+
+    addi.d     J,      J,     2      // J += 2
+    andi       T0,     N,     2      // T0 = N & 2 (0 or 2)
+    blt        J,      T0,    .L20   // loop while J < (N & 2)
+
+.L30:   /* single-column pass, taken only when N & 1 */
+    move       J,      $r0           // J = 0
+    andi       T0,     N,     1      // T0 = N & 1
+    beq        J,      T0,    .L999  // N is even: no column left, go to the epilogue
+
+.L300:  /* for (j=0; j<(bn&1); j+=1) */
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET        // restart OFF from OFFSET for this column
+#endif
+
+    move       C0,     C             // C0 = output cursor for this column
+    move       A0,     A    //ptrba
+
+    move       I,      $r0           // I = 0: 8-row tile counter
+    srai.d     T0,     M,     3  //bm/8 (T0 is reused here; .L316 recomputes N & 1)
+    beq        I,      T0,    .L34   // no full 8-row tile: go to the bm&4 check
+
+.L31:  /* for (i=0; i<bm/8; i+=1): set up B0 and TL, then clear the accumulators */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk; the TRMM branches below may replace it */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb (repeats the unconditional move above)
+#else
+    slli.d     T3,     OFF,  0x06    // T3 = OFF * 64 bytes (A0 advances 0x40 per k step in .L32)
+    add.d      A0,     A0,   T3      // skip the first OFF k steps of A
+    slli.d     T3,     OFF,  0x03    // T3 = OFF * 8 bytes (B0 advances 0x08 per k step in .L32)
+    add.d      B0,     B,    T3      // B0 = B + OFF k steps
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF    // TL = K - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   8      // TL = OFF + 8 (row count of this tile)
+#else
+    addi.d     TL,     OFF,   1      // TL = OFF + 1 (column count of this tile)
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    U0,     U0,   U0   // x ^ x = 0: clear the four accumulators used by .L32
+    vxor.v    U1,     U1,   U1
+    vxor.v    U2,     U2,   U2
+    vxor.v    U3,     U3,   U3
+
+    move       L,      $r0   //cycle param k (L = 0)
+    beq        L,      TL,    .L33   // TL == 0: no k iterations, go straight to write-back
+    blt        TL,     L,     .L33   // TL < 0: likewise
+
+.L32:  /* for (k=0; k<temp; k++): one k step of the 8-row x 1-column tile; temp is TL */
+    vld       D0,     A0,    0x00  // a0ri a1ri
+    vld       D1,     A0,    0x10  // a2ri a3ri
+
+    vldrepl.w D2,     B0,    0x00 //b0r replicated to every word lane
+    vldrepl.w D3,     B0,    0x04 //b0i replicated to every word lane
+
+    vpackev.w D4,     D1,    D0            // even words interleaved: a0r a2r a1r a3r
+    vshuf4i.w  D4,     D4,    0xd8  //a0r a1r a2r a3r (lanes 0,2,1,3)
+
+    vpackod.w D5,     D1,    D0            // odd words interleaved: a0i a2i a1i a3i
+    vshuf4i.w  D5,     D5,    0xd8  //a0i a1i a2i a3i
+
+    VMADD1    U0,     D4,    D2,     U0  //00r 01r 02r 03r; VMADD1..4 are macros defined outside this section, presumably sign variants of a fused multiply-add -- confirm
+    VMADD2    U1,     D5,    D2,     U1  //00i 01i 02i 03i
+    VMADD3    U0,     D5,    D3,     U0  // imag(a) x imag(b) term folded into U0
+    VMADD4    U1,     D4,    D3,     U1  // real(a) x imag(b) term folded into U1
+
+    vld       D0,     A0,    0x20  // a4ri a5ri
+    vld       D1,     A0,    0x30  // a6ri a7ri
+
+    vpackev.w D4,     D1,    D0            // a4r a6r a5r a7r
+    vshuf4i.w  D4,     D4,    0xd8  //a4r a5r a6r a7r
+
+    vpackod.w D5,     D1,    D0            // a4i a6i a5i a7i
+    vshuf4i.w  D5,     D5,    0xd8  //a4i a5i a6i a7i
+
+    VMADD1    U2,     D4,    D2,     U2  //04r 05r 06r 07r
+    VMADD2    U3,     D5,    D2,     U3  //04i 05i 06i 07i
+    VMADD3    U2,     D5,    D3,     U2
+    VMADD4    U3,     D4,    D3,     U3
+
+    addi.d     A0,     A0,    0x40   // A0 += 64 bytes (the four 16-byte A loads above)
+    addi.d     B0,     B0,    0x08   // B0 += 8 bytes (the b0r/b0i pair loaded above)
+
+    addi.d     L,      L,     1      // k++
+    blt        L,      TL,    .L32   // loop while k < TL
+
+.L33:   /* write back the 8-row x 1-column tile (16 floats at C0), scaled by VALPHAR/VALPHAI */
+#if defined(TRMMKERNEL)
+    //res00 res01 res02 res03 (TRMM: the result replaces C instead of accumulating into it)
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C0,    0x10 //c0: 4 5 6 7
+
+    vpackev.w D2,     D1,    D0
+    vshuf4i.w  D2,     D2,    0xd8  //0 2 4 6
+    vpackod.w D3,     D1,    D0
+    vshuf4i.w  D3,     D3,    0xd8  //1 3 5 7
+
+    vfmul.s      D2,    U0,    VALPHAR          // overwrites D2, so the C values loaded above are not used
+    vfmul.s      D3,    U1,    VALPHAR          // overwrites D3 likewise
+    VNMSUB      D2,    U1,    VALPHAI, D2       // presumably D2 -= U1 * VALPHAI (macro defined outside this section -- confirm)
+    VFMADD      D3,    U0,    VALPHAI, D3       // presumably D3 += U0 * VALPHAI
+
+    vand.v    D4,     D3,   D3  //1 3 5 7 (x & x: plain copy of D3)
+    vpermi.w  D4,     D2,   0x44 //0 2 1 3
+    vshuf4i.w  D4,     D4,   0xd8 //0 1 2 3
+
+    vand.v    D5,     D3,   D3  //1 3 5 7 (plain copy of D3)
+    vpermi.w  D5,     D2,   0xee //4 6 5 7
+    vshuf4i.w  D5,     D5,   0xd8 //4 5 6 7
+
+    vst       D4,     C0,    0x00
+    vst       D5,     C0,    0x10
+
+    //res04 res05 res06 res07
+    vld       D0,     C0,    0x20 //c0: 8 9 10 11
+    vld       D1,     C0,    0x30 //c0: 12 13 14 15
+
+    vpackev.w D2,     D1,    D0
+    vshuf4i.w  D2,     D2,    0xd8  //8 10 12 14
+    vpackod.w D3,     D1,    D0
+    vshuf4i.w  D3,     D3,    0xd8  //9 11 13 15
+
+    vfmul.s      D2,    U2,    VALPHAR          // overwrites D2, so the C values loaded above are not used
+    vfmul.s      D3,    U3,    VALPHAR          // overwrites D3 likewise
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vand.v    D4,     D3,   D3  //9 11 13 15 (plain copy of D3)
+    vpermi.w  D4,     D2,   0x44 //8 10 9 11
+    vshuf4i.w  D4,     D4,   0xd8 //8 9 10 11
+
+    vand.v    D5,     D3,   D3  //9 11 13 15
+    vpermi.w  D5,     D2,   0xee //12 14 13 15
+    vshuf4i.w  D5,     D5,   0xd8 //12 13 14 15
+
+    vst       D4,     C0,    0x20
+    vst       D5,     C0,    0x30
+
+    addi.d     C0,     C0,    0x40   // advance the output cursor past the 64 bytes just written
+#else
+    //res00 res01 res02 res03 (GEMM: accumulate into the existing C values)
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C0,    0x10 //c0: 4 5 6 7
+
+    vpackev.w D2,     D1,    D0
+    vshuf4i.w  D2,     D2,    0xd8  //0 2 4 6
+    vpackod.w D3,     D1,    D0
+    vshuf4i.w  D3,     D3,    0xd8  //1 3 5 7
+
+    VFMADD      D2,    U0,    VALPHAR, D2       // presumably D2 += U0 * VALPHAR (macro defined outside this section -- confirm)
+    VFMADD      D3,    U1,    VALPHAR, D3       // presumably D3 += U1 * VALPHAR
+    VNMSUB      D2,    U1,    VALPHAI, D2       // presumably D2 -= U1 * VALPHAI
+    VFMADD      D3,    U0,    VALPHAI, D3       // presumably D3 += U0 * VALPHAI
+
+    vand.v    D4,     D3,   D3  //1 3 5 7 (x & x: plain copy of D3)
+    vpermi.w  D4,     D2,   0x44 //0 2 1 3
+    vshuf4i.w  D4,     D4,   0xd8 //0 1 2 3
+
+    vand.v    D5,     D3,   D3  //1 3 5 7 (plain copy of D3)
+    vpermi.w  D5,     D2,   0xee //4 6 5 7
+    vshuf4i.w  D5,     D5,   0xd8 //4 5 6 7
+
+    vst       D4,     C0,    0x00
+    vst       D5,     C0,    0x10
+
+    //res04 res05 res06 res07
+    vld       D0,     C0,    0x20 //c0: 8 9 10 11
+    vld       D1,     C0,    0x30 //c0: 12 13 14 15
+
+    vpackev.w D2,     D1,    D0
+    vshuf4i.w  D2,     D2,    0xd8  //8 10 12 14
+    vpackod.w D3,     D1,    D0
+    vshuf4i.w  D3,     D3,    0xd8  //9 11 13 15
+
+    VFMADD      D2,    U2,    VALPHAR, D2
+    VFMADD      D3,    U3,    VALPHAR, D3
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vand.v    D4,     D3,   D3  //9 11 13 15 (plain copy of D3)
+    vpermi.w  D4,     D2,   0x44 //8 10 9 11
+    vshuf4i.w  D4,     D4,   0xd8 //8 9 10 11
+
+    vand.v    D5,     D3,   D3  //9 11 13 15
+    vpermi.w  D5,     D2,   0xee //12 14 13 15
+    vshuf4i.w  D5,     D5,   0xd8 //12 13 14 15
+
+    vst       D4,     C0,    0x20
+    vst       D5,     C0,    0x30
+
+    addi.d     C0,     C0,    0x40   // advance the output cursor past the 64 bytes just written
+#endif
+
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF     // TL = K - OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -8      // LEFT: TL = K - OFF - 8, the k steps the .L32 loop did not run
+#else
+    addi.d     TL,     TL,   -1      // !LEFT: TL = K - OFF - 1, the k steps the .L32 loop did not run
+#endif
+    slli.d     T3,     TL,   0x06    // T3 = TL * 64 bytes (A0 advances 0x40 per k step in .L32)
+    add.d      A0,     A0,   T3      // move A0 past the k steps the loop did not consume
+    slli.d     T3,     TL,   0x03    // T3 = TL * 8 bytes (B0 advances 0x08 per k step in .L32)
+    add.d      B0,     B0,   T3      // move B0 likewise
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   8      // OFF += 8
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     I,      I,     1      // i++
+    blt        I,      T0,    .L31   // loop while i < bm/8 (T0 was set before .L31)
+
+.L34:   /* if ( bm & 4 ) */
+    move       I,      $r0
+    andi       T1,     M,     4    //bm&4
+    beq        I,      T1,    .L38
+
+.L35:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x05
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x03
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   4
+#else
+    addi.d     TL,     OFF,   1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    U0,     U0,   U0
+    vxor.v    U1,     U1,   U1
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L37
+    blt        TL,     L,     .L37
+
+.L36:  /* for (k=0; k<temp; k++) */
+    vld       D0,     A0,    0x00  // a0ri a1ri
+    vld       D1,     A0,    0x10  // a2ri a3ri
+
+    vldrepl.w D2,     B0,    0x00 //b0r
+    vldrepl.w D3,     B0,    0x04 //b0i
+
+    vpackev.w D4,     D1,    D0
+    vshuf4i.w  D4,     D4,    0xd8  //a0r a1r a2r a3r
+
+    vpackod.w D5,     D1,    D0
+    vshuf4i.w  D5,     D5,    0xd8  //a0i a1i a2i a3i
+
+    VMADD1    U0,     D4,    D2,     U0  //00r 01r 02r 03r
+    VMADD2    U1,     D5,    D2,     U1  //00i 01i 02i 03i
+    VMADD3    U0,     D5,    D3,     U0
+    VMADD4    U1,     D4,    D3,     U1
+
+    addi.d     A0,     A0,    0x20
+    addi.d     B0,     B0,    0x08
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L36
+
+.L37:
+#if defined(TRMMKERNEL)
+    //res00 res01 res02 res03
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C0,    0x10 //c0: 4 5 6 7
+
+    vpackev.w D2,     D1,    D0
+    vshuf4i.w  D2,     D2,    0xd8  //0 2 4 6
+    vpackod.w D3,     D1,    D0
+    vshuf4i.w  D3,     D3,    0xd8  //1 3 5 7
+
+    vfmul.s      D2,    U0,    VALPHAR
+    vfmul.s      D3,    U1,    VALPHAR
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vand.v    D4,     D3,   D3  //1 3 5 7
+    vpermi.w  D4,     D2,   0x44 //0 2 1 3
+    vshuf4i.w  D4,     D4,   0xd8 //0 1 2 3
+
+    vand.v    D5,     D3,   D3  //1 3 5 7
+    vpermi.w  D5,     D2,   0xee //4 6 5 7
+    vshuf4i.w  D5,     D5,   0xd8 //4 5 6 7
+
+    vst       D4,     C0,    0x00
+    vst       D5,     C0,    0x10
+
+    addi.d     C0,     C0,    0x20
+#else
+    //res00 res01 res02 res03
+    vld       D0,     C0,    0x00 //c0: 0 1 2 3
+    vld       D1,     C0,    0x10 //c0: 4 5 6 7
+
+    vpackev.w D2,     D1,    D0
+    vshuf4i.w  D2,     D2,    0xd8  //0 2 4 6
+    vpackod.w D3,     D1,    D0
+    vshuf4i.w  D3,     D3,    0xd8  //1 3 5 7
+
+    VFMADD      D2,    U0,    VALPHAR, D2
+    VFMADD      D3,    U1,    VALPHAR, D3
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vand.v    D4,     D3,   D3  //1 3 5 7
+    vpermi.w  D4,     D2,   0x44 //0 2 1 3
+    vshuf4i.w  D4,     D4,   0xd8 //0 1 2 3
+
+    vand.v    D5,     D3,   D3  //1 3 5 7
+    vpermi.w  D5,     D2,   0xee //4 6 5 7
+    vshuf4i.w  D5,     D5,   0xd8 //4 5 6 7
+
+    vst       D4,     C0,    0x00
+    vst       D5,     C0,    0x10
+
+    addi.d     C0,     C0,    0x20
+#endif
+
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -4
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     T3,     TL,   0x05
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x03
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   4
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+.L38:   /* if ( bm & 2 ) */
+    move       I,      $r0
+    andi       T1,     M,     2    //bm&2
+    beq        I,      T1,    .L312
+
+.L39:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x04
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x03
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2
+#else
+    addi.d     TL,     OFF,   1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0
+    MTC        c12,    $r0
+    MTC        c21,    $r0
+    MTC        c22,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L311
+    blt        TL,     L,     .L311
+
+.L310:  /* for (k=0; k<temp; k++) */
+    LD         a1,     A0,    0x00        //a0r
+    LD         a2,     A0,    0x04        //a0i
+    LD         a3,     A0,    0x08        //a1r
+    LD         a4,     A0,    0x0c        //a1i
+
+    LD         b1,     B0,    0x00        //b0r
+    LD         b2,     B0,    0x04        //b0i
+
+    MADD1      c11,    a1,    b1,     c11  //res00r
+    MADD2      c12,    a2,    b1,     c12  //res00i
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+
+    MADD1      c21,    a3,    b1,     c21  //res10r
+    MADD2      c22,    a4,    b1,     c22  //res10i
+    MADD3      c21,    a4,    b2,     c21
+    MADD4      c22,    a3,    b2,     c22
+
+    addi.d     A0,     A0,    0x10
+    addi.d     B0,     B0,    0x08
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L310
+
+.L311:
+#if defined(TRMMKERNEL)
+    //res00 res10
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+    LD         a7,     C0,    0x08    //C0[2]
+    LD         a8,     C0,    0x0c    //C0[3]
+
+    MUL       a5,     c11,   ALPHA_R
+    MUL       a6,     c12,   ALPHA_R
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    MUL       a7,     c21,   ALPHA_R
+    MUL       a8,     c22,   ALPHA_R
+    NMSUB      a7,     c22,   ALPHA_I, a7
+    MADD       a8,     c21,   ALPHA_I, a8
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+
+    ST         a7,     C0,    0x08
+    ST         a8,     C0,    0x0c
+
+    addi.d     C0,     C0,    0x10
+#else
+    //res00 res10
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+    LD         a7,     C0,    0x08    //C0[2]
+    LD         a8,     C0,    0x0c    //C0[3]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    MADD       a7,     c21,   ALPHA_R, a7
+    MADD       a8,     c22,   ALPHA_R, a8
+    NMSUB      a7,     c22,   ALPHA_I, a7
+    MADD       a8,     c21,   ALPHA_I, a8
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+
+    ST         a7,     C0,    0x08
+    ST         a8,     C0,    0x0c
+
+    addi.d     C0,     C0,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     T3,     TL,   0x04
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x03
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L312:   /* if ( bm & 1 )*/
+    move       I,      $r0
+    andi       T1,     M,     1    //bm&1
+    beq        I,      T1,    .L316
+
+.L313:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x03
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x03
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1
+#else
+    addi.d     TL,     OFF,   1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0
+    MTC        c12,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L315
+    blt        TL,     L,     .L315
+
+.L314:  /* for (k=0; k<temp; k++) */
+    LD         a1,     A0,    0x00
+    LD         a2,     A0,    0x04
+
+    LD         b1,     B0,    0x00
+    LD         b2,     B0,    0x04
+
+    MADD1      c11,    a1,    b1,     c11
+    MADD2      c12,    a2,    b1,     c12
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+
+    addi.d     A0,     A0,    0x08
+    addi.d     B0,     B0,    0x08
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L314
+
+.L315:   /* write back the single complex element accumulated in c11 (real) and c12 (imaginary) */
+#if defined(TRMMKERNEL)
+    MUL        a5,     c11,   ALPHA_R    // a5 = c11 * ALPHA_R
+    MUL        a6,     c12,   ALPHA_I    // a6 = c12 * ALPHA_I
+    SUB        a5,     a5,    a6         // real part: a5 - a6
+    ST         a5,     C0,    0x00       // C is overwritten here, not accumulated into
+
+    MUL        a5,     c12,   ALPHA_R    // a5 = c12 * ALPHA_R
+    MUL        a6,     c11,   ALPHA_I    // a6 = c11 * ALPHA_I
+    ADD        a6,     a5,    a6         // imaginary part: a5 + a6
+    ST         a6,     C0,    0x04
+
+    addi.d     C0,     C0,    0x08       // advance the output cursor by 8 bytes
+#else
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5      // presumably a5 += c11 * ALPHA_R (macro defined outside this section -- confirm)
+    MADD       a6,     c12,   ALPHA_R, a6      // presumably a6 += c12 * ALPHA_R
+    NMSUB      a5,     c12,   ALPHA_I, a5      // presumably a5 -= c12 * ALPHA_I
+    MADD       a6,     c11,   ALPHA_I, a6      // presumably a6 += c11 * ALPHA_I
+
+    ST         a5,     C0,    0x00   // real part
+    ST         a6,     C0,    0x04   // imaginary part
+
+    addi.d     C0,     C0,    0x08   // advance the output cursor by 8 bytes
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     T3,     TL,   0x03
+    add.d      A0,     A0,   T3
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L316:   /* end of the single-column pass: advance B, C and J */
+    slli.d     L,      K,     3      // L = K * 8 bytes
+    add.d      B,      B,     L      // B += K * 8 (the k loops of this pass step B0 by 0x08)
+
+    slli.d     I,      LDC,   1      // I = LDC << 1
+    add.d      C,      C,     I      // presumably one column of C, with LDC pre-scaled before this section -- confirm against the prologue
+
+    addi.d     J,      J,     1      // J += 1
+    andi       T0,     N,     1      // T0 = N & 1 (0 or 1)
+    blt        J,      T0,    .L300  // loop while J < (N & 1)
+
+.L999:   /* common exit: reload saved registers from the stack, release the frame and return */
+    LDARG      $r23,   $sp,   0      // integer registers $r23..$r27 from offsets 0..32
+    LDARG      $r24,   $sp,   8
+    LDARG      $r25,   $sp,   16
+    LDARG      $r26,   $sp,   24
+    LDARG      $r27,   $sp,   32
+    LD         $f23,   $sp,   40     // FP registers $f23..$f31 from offsets 40..104; NOTE(review): LD is defined outside this section -- if it is a 32-bit load in this build, only the low word of each register is reloaded; confirm it matches the prologue's store width
+    LD         $f24,   $sp,   48
+    LD         $f25,   $sp,   56
+    LD         $f26,   $sp,   64
+    LD         $f27,   $sp,   72
+    LD         $f28,   $sp,   80
+    LD         $f29,   $sp,   88
+    LD         $f30,   $sp,   96
+    LD         $f31,   $sp,   104
+
+    addi.d     $sp,    $sp,   128    // release the 128-byte frame
+    jirl       $r0,    $r1,   0x0    // jump to the address in $r1; the link value is discarded into $r0
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/cgemm_ncopy_16_lasx.S b/kernel/loongarch64/cgemm_ncopy_16_lasx.S
new file mode 100644
index 000000000..7c2d0ac64
--- /dev/null
+++ b/kernel/loongarch64/cgemm_ncopy_16_lasx.S
@@ -0,0 +1,691 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9    // inner (row) loop counter
+#define J      $r10   // outer (column-group) loop counter, also holds N bit tests
+#define S1     $r12   // S1..S16: read pointers, one per source column
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define S8     $r19
+#define S9     $r20
+#define S10    $r23
+#define S11    $r24
+#define S12    $r25
+#define S13    $r26
+#define S14    $r27
+#define S15    $r28
+#define S16    $r29
+#define TD     $r30   // running destination pointer
+#define TS     $r31   // source pointer for the next group of columns
+#define TL     $r7    // column stride in bytes; reuses LDA's register
+#define T0     $r6    // two column strides in bytes; reuses SRC's register
+#define ZERO   $r0
+
+#define F0     $f0    // F0..F7: scalar temporaries for the row-remainder loops
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+/* LASX vectors */
+#define U0     $xr0   // U0..U15: loaded column data, later reused as copies
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define U8     $xr8
+#define U9     $xr9
+#define U10    $xr10
+#define U11    $xr11
+#define U12    $xr12
+#define U13    $xr13
+#define U14    $xr14
+#define U15    $xr15
+#define D0     $xr16  // D0..D15: rearranged vectors that get stored to TD
+#define D1     $xr17
+#define D2     $xr18
+#define D3     $xr19
+#define D4     $xr20
+#define D5     $xr21
+#define D6     $xr22
+#define D7     $xr23
+#define D8     $xr24
+#define D9     $xr25
+#define D10    $xr26
+#define D11    $xr27
+#define D12    $xr28
+#define D13    $xr29
+#define D14    $xr30
+#define D15    $xr31
+
+    PROLOGUE
+    // Save $r23-$r31 and $f23-$f31 in a 0x90-byte frame, then derive strides.
+    addi.d     $sp,  $sp,  -0x90
+    SDARG      $r23, $sp,  0x00
+    SDARG      $r24, $sp,  0x08
+    SDARG      $r25, $sp,  0x10
+    SDARG      $r26, $sp,  0x18
+    SDARG      $r27, $sp,  0x20
+    SDARG      $r28, $sp,  0x28
+    SDARG      $r29, $sp,  0x30
+    SDARG      $r30, $sp,  0x38
+    SDARG      $r31, $sp,  0x40
+    ST         $f23, $sp,  0x48
+    ST         $f24, $sp,  0x50
+    ST         $f25, $sp,  0x58
+    ST         $f26, $sp,  0x60
+    ST         $f27, $sp,  0x68
+    ST         $f28, $sp,  0x70
+    ST         $f29, $sp,  0x78
+    ST         $f30, $sp,  0x80
+    ST         $f31, $sp,  0x88
+    // SRC and LDA are consumed here; their registers become T0 and TL.
+    move       TD,   DST
+    move       TS,   SRC
+    slli.d     TL,   LDA,  0x03  // TL = lda * 8 bytes (one column)
+    slli.d     T0,   TL,   0x01  // T0 = 2 * TL (two columns)
+    srai.d     J,    N,    0x04  // J = n / 16 full 16-column groups
+    beq        J,    ZERO, .L_N8
+
+.L_J1: /* J--: set up S1..S16 = TS + k*TL (k = 0..15) for one 16-column group */
+    move       S1,   TS
+    add.d      S2,   TS,   TL
+    srai.d     I,    M,    0x03  // I = m / 8 iterations of the vector loop
+    add.d      S3,   S2,   TL
+    addi.d     J,    J,    -1
+    add.d      S4,   S3,   TL
+    add.d      S5,   S3,   T0    // from here on each pointer is "two columns further"
+    add.d      S6,   S4,   T0
+    add.d      S7,   S5,   T0
+    add.d      S8,   S6,   T0
+    add.d      S9,   S7,   T0
+    add.d      S10,  S8,   T0
+    add.d      S11,  S9,   T0
+    add.d      S12,  S10,  T0
+    add.d      S13,  S11,  T0
+    add.d      S14,  S12,  T0
+    add.d      S15,  S13,  T0
+    add.d      S16,  S14,  T0
+    add.d      TS,   S15,  T0    // TS now points at the column after this group
+    beq        I,    ZERO, .L_I7
+
+.L_I1: /* I--: 8 rows of 16 columns per iteration, as two passes of 32 bytes per column */
+    xvld       U0,   S1,   0x00
+    xvld       U1,   S2,   0x00
+    xvld       U2,   S3,   0x00
+    xvld       U3,   S4,   0x00
+    xvld       U4,   S5,   0x00
+    xvld       U5,   S6,   0x00
+    xvld       U6,   S7,   0x00
+    xvld       U7,   S8,   0x00
+    xvld       U8,   S9,   0x00
+    xvld       U9,   S10,  0x00
+    xvld       U10,  S11,  0x00
+    xvld       U11,  S12,  0x00
+    xvld       U12,  S13,  0x00
+    xvld       U13,  S14,  0x00
+    xvld       U14,  S15,  0x00
+    xvld       U15,  S16,  0x00
+    // Pair up adjacent columns: even doublewords into D(2k), odd into D(2k+1).
+    xvpackev.d D0,   U1,   U0
+    xvpackod.d D1,   U1,   U0
+    xvpackev.d D2,   U3,   U2
+    xvpackod.d D3,   U3,   U2
+    xvpackev.d D4,   U5,   U4
+    xvpackod.d D5,   U5,   U4
+    xvpackev.d D6,   U7,   U6
+    xvpackod.d D7,   U7,   U6
+
+    xvpackev.d D8,   U9,   U8
+    xvpackod.d D9,   U9,   U8
+    xvpackev.d D10,  U11,  U10
+    xvpackod.d D11,  U11,  U10
+    xvpackev.d D12,  U13,  U12
+    xvpackod.d D13,  U13,  U12
+    xvpackev.d D14,  U15,  U14
+    xvpackod.d D15,  U15,  U14
+    // xvand.v X, Y, Y copies Y; trailing numbers are the 32-byte store slot.
+    xvand.v    U0,   D0,   D0
+    xvpermi.q  D0,   D2,   0x02  // 0
+    xvand.v    U4,   D4,   D4
+    xvpermi.q  D4,   D6,   0x02  // 1
+    xvand.v    U1,   D1,   D1
+    xvpermi.q  D1,   D3,   0x02  // 4
+    xvand.v    U5,   D5,   D5
+    xvpermi.q  D5,   D7,   0x02  // 5
+    xvpermi.q  D2,   U0,   0x31  // 8
+    xvpermi.q  D6,   U4,   0x31  // 9
+    xvpermi.q  D3,   U1,   0x31  // 12
+    xvpermi.q  D7,   U5,   0x31  // 13
+
+    xvand.v    U8,   D8,   D8
+    xvpermi.q  D8,   D10,  0x02  // 2
+    xvand.v    U12,  D12,  D12
+    xvpermi.q  D12,  D14,  0x02  // 3
+    xvand.v    U9,   D9,   D9
+    xvpermi.q  D9,   D11,  0x02  // 6
+    xvand.v    U13,  D13,  D13
+    xvpermi.q  D13,  D15,  0x02  // 7
+    xvpermi.q  D10,  U8,   0x31  // 10
+    xvpermi.q  D14,  U12,  0x31  // 11
+    xvpermi.q  D11,  U9,   0x31  // 14
+    xvpermi.q  D15,  U13,  0x31  // 15
+    // First pass: 16 stores of 32 bytes = 0x200 bytes of output.
+    xvst       D0,   TD,   0x00  // 0
+    xvst       D4,   TD,   0x20  // 1
+    xvst       D8,   TD,   0x40  // 2
+    xvst       D12,  TD,   0x60  // 3
+    xvst       D1,   TD,   0x80  // 4
+    xvst       D5,   TD,   0xA0  // 5
+    xvst       D9,   TD,   0xC0  // 6
+    xvst       D13,  TD,   0xE0  // 7
+    addi.d     TD,   TD,   0x100
+    xvst       D2,   TD,   0x00  // 8
+    xvst       D6,   TD,   0x20  // 9
+    xvst       D10,  TD,   0x40  // 10
+    xvst       D14,  TD,   0x60  // 11
+    xvst       D3,   TD,   0x80  // 12
+    xvst       D7,   TD,   0xA0  // 13
+    xvst       D11,  TD,   0xC0  // 14
+    xvst       D15,  TD,   0xE0  // 15
+    addi.d     TD,   TD,   0x100
+    // Second pass: same sequence on the next 32 bytes of every column.
+    xvld       U0,   S1,   0x20
+    xvld       U1,   S2,   0x20
+    xvld       U2,   S3,   0x20
+    xvld       U3,   S4,   0x20
+    xvld       U4,   S5,   0x20
+    xvld       U5,   S6,   0x20
+    xvld       U6,   S7,   0x20
+    xvld       U7,   S8,   0x20
+    xvld       U8,   S9,   0x20
+    xvld       U9,   S10,  0x20
+    xvld       U10,  S11,  0x20
+    xvld       U11,  S12,  0x20
+    xvld       U12,  S13,  0x20
+    xvld       U13,  S14,  0x20
+    xvld       U14,  S15,  0x20
+    xvld       U15,  S16,  0x20
+
+    xvpackev.d D0,   U1,   U0
+    xvpackod.d D1,   U1,   U0
+    xvpackev.d D2,   U3,   U2
+    xvpackod.d D3,   U3,   U2
+    xvpackev.d D4,   U5,   U4
+    xvpackod.d D5,   U5,   U4
+    xvpackev.d D6,   U7,   U6
+    xvpackod.d D7,   U7,   U6
+
+    xvpackev.d D8,   U9,   U8
+    xvpackod.d D9,   U9,   U8
+    xvpackev.d D10,  U11,  U10
+    xvpackod.d D11,  U11,  U10
+    xvpackev.d D12,  U13,  U12
+    xvpackod.d D13,  U13,  U12
+    xvpackev.d D14,  U15,  U14
+    xvpackod.d D15,  U15,  U14
+
+    xvand.v    U0,   D0,   D0
+    xvpermi.q  D0,   D2,   0x02  // 0
+    xvand.v    U4,   D4,   D4
+    xvpermi.q  D4,   D6,   0x02  // 1
+    xvand.v    U1,   D1,   D1
+    xvpermi.q  D1,   D3,   0x02  // 4
+    xvand.v    U5,   D5,   D5
+    xvpermi.q  D5,   D7,   0x02  // 5
+    xvpermi.q  D2,   U0,   0x31  // 8
+    xvpermi.q  D6,   U4,   0x31  // 9
+    xvpermi.q  D3,   U1,   0x31  // 12
+    xvpermi.q  D7,   U5,   0x31  // 13
+
+    xvand.v    U8,   D8,   D8
+    xvpermi.q  D8,   D10,  0x02  // 2
+    xvand.v    U12,  D12,  D12
+    xvpermi.q  D12,  D14,  0x02  // 3
+    xvand.v    U9,   D9,   D9
+    xvpermi.q  D9,   D11,  0x02  // 6
+    xvand.v    U13,  D13,  D13
+    xvpermi.q  D13,  D15,  0x02  // 7
+    xvpermi.q  D10,  U8,   0x31  // 10
+    xvpermi.q  D14,  U12,  0x31  // 11
+    xvpermi.q  D11,  U9,   0x31  // 14
+    xvpermi.q  D15,  U13,  0x31  // 15
+
+    xvst       D0,   TD,   0x00  // 0
+    xvst       D4,   TD,   0x20  // 1
+    xvst       D8,   TD,   0x40  // 2
+    xvst       D12,  TD,   0x60  // 3
+    xvst       D1,   TD,   0x80  // 4
+    xvst       D5,   TD,   0xA0  // 5
+    xvst       D9,   TD,   0xC0  // 6
+    xvst       D13,  TD,   0xE0  // 7
+    addi.d     TD,   TD,   0x100
+    xvst       D2,   TD,   0x00  // 8
+    xvst       D6,   TD,   0x20  // 9
+    xvst       D10,  TD,   0x40  // 10
+    xvst       D14,  TD,   0x60  // 11
+    xvst       D3,   TD,   0x80  // 12
+    xvst       D7,   TD,   0xA0  // 13
+    xvst       D11,  TD,   0xC0  // 14
+    xvst       D15,  TD,   0xE0  // 15
+    addi.d     TD,   TD,   0x100
+
+    // Advance every column pointer past the 0x40 bytes consumed above.
+    addi.d     S1,   S1,   0x40
+    addi.d     S2,   S2,   0x40
+    addi.d     S3,   S3,   0x40
+    addi.d     S4,   S4,   0x40
+    addi.d     S5,   S5,   0x40
+    addi.d     S6,   S6,   0x40
+    addi.d     S7,   S7,   0x40
+    addi.d     S8,   S8,   0x40
+    addi.d     S9,   S9,   0x40
+    addi.d     S10,  S10,  0x40
+    addi.d     S11,  S11,  0x40
+    addi.d     S12,  S12,  0x40
+    addi.d     S13,  S13,  0x40
+    addi.d     S14,  S14,  0x40
+    addi.d     S15,  S15,  0x40
+    addi.d     S16,  S16,  0x40
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_I1
+
+.L_I7: /* rows left over after the 8-row vector loop of this 16-column group */
+    andi      I,     M,    0x07
+    beq       I,     ZERO, .L_I0
+
+.L_II1: /* I--: copy one 8-byte element from each of the 16 columns */
+    fld.d     F0,    S1,  0x00
+    fld.d     F1,    S2,  0x00
+    fld.d     F2,    S3,  0x00
+    fld.d     F3,    S4,  0x00
+    fld.d     F4,    S5,  0x00
+    fld.d     F5,    S6,  0x00
+    fld.d     F6,    S7,  0x00
+    fld.d     F7,    S8,  0x00
+    // Stores are interleaved with the pointer bumps for columns 1..8.
+    fst.d     F0,    TD,  0x00
+    addi.d    S1,    S1,  0x08
+    fst.d     F1,    TD,  0x08
+    addi.d    S2,    S2,  0x08
+    fst.d     F2,    TD,  0x10
+    addi.d    S3,    S3,  0x08
+    fst.d     F3,    TD,  0x18
+    addi.d    S4,    S4,  0x08
+    fst.d     F4,    TD,  0x20
+    addi.d    S5,    S5,  0x08
+    fst.d     F5,    TD,  0x28
+    addi.d    S6,    S6,  0x08
+    fst.d     F6,    TD,  0x30
+    addi.d    S7,    S7,  0x08
+    fst.d     F7,    TD,  0x38
+    addi.d    S8,    S8,  0x08
+    addi.d    TD,    TD,  0x40
+    // Same again for columns 9..16.
+    fld.d     F0,    S9,  0x00
+    fld.d     F1,    S10, 0x00
+    fld.d     F2,    S11, 0x00
+    fld.d     F3,    S12, 0x00
+    fld.d     F4,    S13, 0x00
+    fld.d     F5,    S14, 0x00
+    fld.d     F6,    S15, 0x00
+    fld.d     F7,    S16, 0x00
+
+    fst.d     F0,    TD,  0x00
+    addi.d    S9,    S9,  0x08
+    fst.d     F1,    TD,  0x08
+    addi.d    S10,   S10, 0x08
+    fst.d     F2,    TD,  0x10
+    addi.d    S11,   S11, 0x08
+    fst.d     F3,    TD,  0x18
+    addi.d    S12,   S12, 0x08
+    fst.d     F4,    TD,  0x20
+    addi.d    S13,   S13, 0x08
+    fst.d     F5,    TD,  0x28
+    addi.d    S14,   S14, 0x08
+    fst.d     F6,    TD,  0x30
+    addi.d    S15,   S15, 0x08
+    fst.d     F7,    TD,  0x38
+    addi.d    S16,   S16, 0x08
+    addi.d    TD,    TD,  0x40
+
+    addi.d    I,     I,   -1
+    blt       ZERO,  I,   .L_II1
+
+.L_I0: /* next 16-column group while J > 0 */
+    blt       ZERO,  J,   .L_J1
+
+.L_N8: /* one 8-column group when bit 3 of n is set */
+    andi      J,     N,   0x08
+    beq       ZERO,  J,   .L_N4
+    // S1..S8 = TS + k*TL (k = 0..7); TS then moves 8 columns forward.
+    move       S1,   TS
+    add.d      S2,   TS,   TL
+    srai.d     I,    M,    0x03  // I = m / 8
+    add.d      S3,   S2,   TL
+    add.d      S4,   S2,   T0
+    add.d      S5,   S3,   T0
+    add.d      S6,   S4,   T0
+    add.d      S7,   S5,   T0
+    add.d      S8,   S6,   T0
+    add.d      TS,   S7,   T0
+    beq        I,    ZERO, .L_8I3
+
+.L_8I1:  /* I--: 8 rows of 8 columns per iteration, in two 32-byte passes */
+    xvld       U0,   S1,   0x00
+    xvld       U1,   S2,   0x00
+    xvld       U2,   S3,   0x00
+    xvld       U3,   S4,   0x00
+    xvld       U4,   S5,   0x00
+    xvld       U5,   S6,   0x00
+    xvld       U6,   S7,   0x00
+    xvld       U7,   S8,   0x00
+
+    xvpackev.d D0,   U1,   U0
+    xvpackod.d D1,   U1,   U0
+    xvpackev.d D2,   U3,   U2
+    xvpackod.d D3,   U3,   U2
+    xvpackev.d D4,   U5,   U4
+    xvpackod.d D5,   U5,   U4
+    xvpackev.d D6,   U7,   U6
+    xvpackod.d D7,   U7,   U6
+    // xvand.v X, Y, Y copies Y; trailing numbers are the 32-byte store slot.
+    xvand.v    U0,   D0,   D0
+    xvpermi.q  D0,   D2,   0x02  // 0
+    xvand.v    U4,   D4,   D4
+    xvpermi.q  D4,   D6,   0x02  // 1
+    xvand.v    U1,   D1,   D1
+    xvpermi.q  D1,   D3,   0x02  // 2
+    xvand.v    U5,   D5,   D5
+    xvpermi.q  D5,   D7,   0x02  // 3
+    xvpermi.q  D2,   U0,   0x31  // 4
+    xvpermi.q  D6,   U4,   0x31  // 5
+    xvpermi.q  D3,   U1,   0x31  // 6
+    xvpermi.q  D7,   U5,   0x31  // 7
+
+    xvst       D0,   TD,   0x00
+    xvst       D4,   TD,   0x20
+    xvst       D1,   TD,   0x40
+    xvst       D5,   TD,   0x60
+    xvst       D2,   TD,   0x80
+    xvst       D6,   TD,   0xA0
+    xvst       D3,   TD,   0xC0
+    xvst       D7,   TD,   0xE0
+    addi.d     TD,   TD,   0x100
+    // Second pass: same sequence on the next 32 bytes of every column.
+    xvld       U0,   S1,   0x20
+    xvld       U1,   S2,   0x20
+    xvld       U2,   S3,   0x20
+    xvld       U3,   S4,   0x20
+    xvld       U4,   S5,   0x20
+    xvld       U5,   S6,   0x20
+    xvld       U6,   S7,   0x20
+    xvld       U7,   S8,   0x20
+
+    xvpackev.d D0,   U1,   U0
+    xvpackod.d D1,   U1,   U0
+    xvpackev.d D2,   U3,   U2
+    xvpackod.d D3,   U3,   U2
+    xvpackev.d D4,   U5,   U4
+    xvpackod.d D5,   U5,   U4
+    xvpackev.d D6,   U7,   U6
+    xvpackod.d D7,   U7,   U6
+
+    xvand.v    U0,   D0,   D0
+    xvpermi.q  D0,   D2,   0x02  // 0
+    xvand.v    U4,   D4,   D4
+    xvpermi.q  D4,   D6,   0x02  // 1
+    xvand.v    U1,   D1,   D1
+    xvpermi.q  D1,   D3,   0x02  // 2
+    xvand.v    U5,   D5,   D5
+    xvpermi.q  D5,   D7,   0x02  // 3
+    xvpermi.q  D2,   U0,   0x31  // 4
+    xvpermi.q  D6,   U4,   0x31  // 5
+    xvpermi.q  D3,   U1,   0x31  // 6
+    xvpermi.q  D7,   U5,   0x31  // 7
+
+    xvst       D0,   TD,   0x00
+    xvst       D4,   TD,   0x20
+    xvst       D1,   TD,   0x40
+    xvst       D5,   TD,   0x60
+    xvst       D2,   TD,   0x80
+    xvst       D6,   TD,   0xA0
+    xvst       D3,   TD,   0xC0
+    xvst       D7,   TD,   0xE0
+    addi.d     TD,   TD,   0x100
+
+    addi.d     S1,   S1,   0x40
+    addi.d     S2,   S2,   0x40
+    addi.d     S3,   S3,   0x40
+    addi.d     S4,   S4,   0x40
+    addi.d     S5,   S5,   0x40
+    addi.d     S6,   S6,   0x40
+    addi.d     S7,   S7,   0x40
+    addi.d     S8,   S8,   0x40
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_8I1
+
+.L_8I3: /* rows left over after the 8-row loop */
+    andi      I,     M,    0x07
+    beq       I,     ZERO, .L_N4
+
+.L_8I11: /* I--: copy one 8-byte element from each of the 8 columns */
+    fld.d     F0,    S1,  0x00
+    fld.d     F1,    S2,  0x00
+    fld.d     F2,    S3,  0x00
+    fld.d     F3,    S4,  0x00
+    fld.d     F4,    S5,  0x00
+    fld.d     F5,    S6,  0x00
+    fld.d     F6,    S7,  0x00
+    fld.d     F7,    S8,  0x00
+
+    fst.d     F0,    TD,  0x00
+    addi.d    S1,    S1,  0x08
+    fst.d     F1,    TD,  0x08
+    addi.d    S2,    S2,  0x08
+    fst.d     F2,    TD,  0x10
+    addi.d    S3,    S3,  0x08
+    fst.d     F3,    TD,  0x18
+    addi.d    S4,    S4,  0x08
+    fst.d     F4,    TD,  0x20
+    addi.d    S5,    S5,  0x08
+    fst.d     F5,    TD,  0x28
+    addi.d    S6,    S6,  0x08
+    fst.d     F6,    TD,  0x30
+    addi.d    S7,    S7,  0x08
+    fst.d     F7,    TD,  0x38
+    addi.d    S8,    S8,  0x08
+
+    addi.d    TD,    TD,  0x40
+    addi.d    I,     I,   -1
+    blt       ZERO,  I,   .L_8I11
+
+.L_N4: /* one 4-column group when bit 2 of n is set */
+    andi      J,     N,   0x04
+    beq       ZERO,  J,   .L_N2
+    // S1..S4 = TS + k*TL (k = 0..3); TS then moves 4 columns forward.
+    move       S1,   TS
+    add.d      S2,   TS,   TL
+    srai.d     I,    M,    0x02  // I = m / 4
+    add.d      S3,   S2,   TL
+    add.d      S4,   S2,   T0
+    add.d      TS,   S3,   T0
+    beq        I,    ZERO, .L_I3
+
+.L_4I1: /* I--: 4 rows of 4 columns per iteration (32 bytes per column) */
+    xvld       U0,   S1,   0x00
+    xvld       U1,   S2,   0x00
+    xvld       U2,   S3,   0x00
+    xvld       U3,   S4,   0x00
+
+    xvpackev.d D0,   U1,   U0
+    xvpackod.d D1,   U1,   U0
+    xvpackev.d D2,   U3,   U2
+    xvpackod.d D3,   U3,   U2
+    // xvand.v X, Y, Y copies Y; trailing numbers are the 32-byte store slot.
+    xvand.v    U0,   D0,   D0
+    xvpermi.q  D0,   D2,   0x02  // 0
+    xvand.v    U1,   D1,   D1
+    xvpermi.q  D1,   D3,   0x02  // 1
+    xvpermi.q  D2,   U0,   0x31  // 2
+    xvpermi.q  D3,   U1,   0x31  // 3
+
+    xvst       D0,   TD,   0x00
+    xvst       D1,   TD,   0x20
+    xvst       D2,   TD,   0x40
+    xvst       D3,   TD,   0x60
+
+    addi.d     S1,   S1,   0x20
+    addi.d     S2,   S2,   0x20
+    addi.d     S3,   S3,   0x20
+    addi.d     S4,   S4,   0x20
+    addi.d     TD,   TD,   0x80
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_4I1
+
+.L_I3: /* rows left over after the 4-row loop */
+    andi      I,     M,    0x03
+    beq       I,     ZERO, .L_N2
+
+.L_4II1: /* I--: copy one 8-byte element from each of the 4 columns */
+    fld.d     F0,    S1,  0x00
+    fld.d     F1,    S2,  0x00
+    fld.d     F2,    S3,  0x00
+    fld.d     F3,    S4,  0x00
+
+    fst.d     F0,    TD,  0x00
+    addi.d    S1,    S1,  0x08
+    fst.d     F1,    TD,  0x08
+    addi.d    S2,    S2,  0x08
+    fst.d     F2,    TD,  0x10
+    addi.d    S3,    S3,  0x08
+    fst.d     F3,    TD,  0x18
+    addi.d    S4,    S4,  0x08
+
+    addi.d    TD,    TD,  0x20
+    addi.d    I,     I,   -1
+    blt       ZERO,  I,   .L_4II1
+
+.L_N2: /* one 2-column group when bit 1 of n is set */
+    andi      J,     N,   0x02
+    beq       ZERO,  J,   .L_N1
+
+    move       S1,   TS
+    add.d      S2,   TS,   TL
+    srai.d     I,    M,    0x01  // I = m / 2
+    add.d      TS,   S2,   TL    // TS moves 2 columns forward
+    beq        I,    ZERO, .L_NI1
+
+.L_2I1: /* I--: 2 rows of 2 columns per iteration (0x10 bytes per column) */
+    // 128-bit loads on purpose: only 0x10 bytes per column are consumed, so a
+    // 256-bit xvld would read 16 bytes beyond the last row of each column.
+    vld        $vr0,  S1,   0x00   // column 1: rows r, r+1
+    vld        $vr1,  S2,   0x00   // column 2: rows r, r+1
+    vpackev.d  $vr16, $vr1, $vr0   // row r:   column 1, column 2
+    vpackod.d  $vr17, $vr1, $vr0   // row r+1: column 1, column 2
+
+    vst        $vr16, TD,   0x00
+    vst        $vr17, TD,   0x10
+
+    addi.d     S1,   S1,   0x10
+    addi.d     S2,   S2,   0x10
+    addi.d     TD,   TD,   0x20
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_2I1
+
+.L_NI1: /* odd m: one last row of the 2-column group */
+    andi      I,     M,    0x01
+    beq       I,     ZERO, .L_N1
+
+
+    fld.d     F0,    S1,  0x00
+    fld.d     F1,    S2,  0x00
+
+    fst.d     F0,    TD,  0x00
+    addi.d    S1,    S1,  0x08
+    fst.d     F1,    TD,  0x08
+    addi.d    S2,    S2,  0x08
+    addi.d    TD,    TD,  0x10
+
+.L_N1: /* last single column: present only when n is odd */
+    andi      J,     N,   0x01
+    beq       ZERO,  J,   .L_N0   // even n: no leftover column, nothing to copy
+    beq       ZERO,  M,   .L_N0
+.L_M1: /* M--: copy one 8-byte element of the column at TS to TD */
+    fld.d     F0,    TS,  0x00
+    addi.d    TS,    TS,  0x08
+    fst.d     F0,    TD,  0x00
+    addi.d    TD,    TD,  0x08
+    addi.d    M,     M,   -1
+    blt       ZERO,  M,   .L_M1
+
+.L_N0: /* restore the registers saved in the prologue, release the frame, return */
+    LDARG      $r23, $sp,  0x00
+    LDARG      $r24, $sp,  0x08
+    LDARG      $r25, $sp,  0x10
+    LDARG      $r26, $sp,  0x18
+    LDARG      $r27, $sp,  0x20
+    LDARG      $r28, $sp,  0x28
+    LDARG      $r29, $sp,  0x30
+    LDARG      $r30, $sp,  0x38
+    LDARG      $r31, $sp,  0x40
+    LD         $f23, $sp,  0x48
+    LD         $f24, $sp,  0x50
+    LD         $f25, $sp,  0x58
+    LD         $f26, $sp,  0x60
+    LD         $f27, $sp,  0x68
+    LD         $f28, $sp,  0x70
+    LD         $f29, $sp,  0x78
+    LD         $f30, $sp,  0x80
+    LD         $f31, $sp,  0x88
+    addi.d     $sp,  $sp,  0x90
+    jirl       $r0,  $r1,  0x00  // jump to the address in $r1, discarding the link
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/cgemm_ncopy_2_lasx.S b/kernel/loongarch64/cgemm_ncopy_2_lasx.S
new file mode 100644
index 000000000..10dc48c85
--- /dev/null
+++ b/kernel/loongarch64/cgemm_ncopy_2_lasx.S
@@ -0,0 +1,193 @@
+/*******************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9    // outer loop counter (column pairs)
+#define J      $r10   // inner loop counter (rows)
+#define S1     $r12   // read pointer, first column of the pair
+#define S2     $r13   // read pointer, second column of the pair
+#define S3     $r14   // S3..S8 are not referenced by the code below
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define S8     $r19
+#define TD     $r20   // running destination pointer
+#define TS     $r11   // source pointer for the next column pair
+#define TL     $r7    // column stride in bytes; reuses LDA's register
+#define T0     $r23   // two column strides in bytes ($r23 is saved in the prologue)
+#define ZERO   $r0
+
+#define F0     $f0    // F0..F3: scalar temporaries for the row-remainder loops
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LASX vectors */
+#define U0     $xr0   // only U0..U2 are referenced by the code below
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define D0     $xr8
+#define D1     $xr9
+#define D2     $xr10
+#define D3     $xr11
+#define D4     $xr12
+#define D5     $xr13
+#define D6     $xr14
+#define D7     $xr15
+#define D8     $xr16
+
+    PROLOGUE
+    // NOTE(review): 8-byte frame; confirm whether the ABI expects 16-byte $sp alignment here.
+    addi.d     $sp,  $sp,  -8
+    SDARG      $r23, $sp,  0     // $r23 is used as T0 below
+
+    move       TD,   DST   //boffset
+    move       TS,   SRC   //aoffset
+
+    slli.d     TL,   LDA,  0x02  //lda
+    slli.d     TL,   TL,   0x01  // TL = lda * 8 bytes (one column)
+    slli.d     T0,   TL,   0x01  // T0 = 2 * TL (two columns)
+
+    srai.d     I,    N,    0x01  // I = n / 2 column pairs
+    beq        I,    ZERO,  .L_N0
+
+.L_J1: /* if (i > 0) I--: start one pair of columns */
+    move       S1,   TS         //a_offset1
+    add.d      S2,   TS,   TL   //a_offset2
+    srai.d     J,    M,    0x02 // J = m / 4
+    add.d      TS,   TS,   T0   // TS moves 2 columns forward
+
+    beq        J,    ZERO,  .L_I3
+
+.L_I1: /* if (j > 0) J--: 4 rows (32 bytes) of both columns per iteration */
+    xvld       U0,   S1,   0x00
+    xvld       U1,   S1,   0x00  // second copy of the S1 data; U0 is overwritten next
+    xvld       U2,   S2,   0x00
+    // Presumably leaves the two columns interleaved element by element - TODO confirm.
+    xvpermi.q  U0,   U2,   0x02
+    xvpermi.q  U2,   U1,   0x31
+
+    xvpermi.d  U0,   U0,   0xd8
+    xvpermi.d  U2,   U2,   0xd8
+
+    xvst       U0,   TD,   0x00
+    xvst       U2,   TD,   0x20
+
+    addi.d     S1,   S1,   0x20   // a_offset1
+    addi.d     S2,   S2,   0x20
+    addi.d     TD,   TD,   0x40  // b_offset
+
+    addi.d     J,    J,    -1
+    blt        ZERO, J,    .L_I1
+
+.L_I3: /* rows left over after the 4-row loop */
+    andi       J,    M,    0x03
+    beq        J,    ZERO, .L_II20
+
+.L_II1:  /* j = (m & 3)  if (j > 0): one element (two 4-byte floats) per column */
+    fld.s      F0,   S1,   0x00
+    fld.s      F1,   S1,   0x04
+    fld.s      F2,   S2,   0x00
+    fld.s      F3,   S2,   0x04
+
+    fst.s      F0,   TD,   0x00
+    fst.s      F1,   TD,   0x04
+    fst.s      F2,   TD,   0x08
+    fst.s      F3,   TD,   0x0c
+
+    addi.d     S1,   S1,   0x08
+    addi.d     S2,   S2,   0x08
+    addi.d     TD,   TD,   0x10
+
+    addi.d     J,    J,    -1
+    blt        ZERO, J,    .L_II1
+
+.L_II20: /* next column pair while I > 0 */
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_J1
+
+.L_N0:  /* if(n&1): one leftover column, copied straight from TS to TD */
+    andi       I,     N,   0x01
+    beq        ZERO,  I,   .L_N00
+
+.L_N1:
+    srai.d     J,     M,   0x02
+    beq        ZERO,  J,   .L_N10
+
+.L_N11: /* j = (m >> 2) if (j > 0): 32 bytes per iteration */
+    xvld       U0,    TS,   0x00
+
+    xvst       U0,    TD,   0x00
+
+    addi.d     TS,    TS,   0x20   // a_offset
+    addi.d     TD,    TD,   0x20   // b_offset
+
+    addi.d     J,     J,   -1
+    blt        ZERO,  J,   .L_N11
+
+.L_N10: /* rows left over after the 4-row loop */
+    andi       J,     M,    0x03
+    beq        J,     ZERO, .L_N00
+
+.L_N12:  /* j = (m & 3)  if (j > 0): two 4-byte floats per iteration */
+    fld.s      F0,    TS,   0x00
+    fld.s      F1,    TS,   0x04
+
+    fst.s      F0,    TD,   0x00
+    fst.s      F1,    TD,   0x04
+
+    addi.d     TS,    TS,   0x08   // a_offset
+    addi.d     TD,    TD,   0x08   // b_offset
+
+    addi.d     J,     J,   -1
+    blt        ZERO,  J,   .L_N12
+
+.L_N00: /* restore $r23, release the 8-byte frame, return */
+    LDARG     $r23,  $sp, 0
+    addi.d    $sp,   $sp, 8
+    jirl      $r0,   $r1, 0x00  // jump to the address in $r1, discarding the link
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/cgemm_ncopy_2_lsx.S b/kernel/loongarch64/cgemm_ncopy_2_lsx.S
new file mode 100644
index 000000000..1cf4d87dc
--- /dev/null
+++ b/kernel/loongarch64/cgemm_ncopy_2_lsx.S
@@ -0,0 +1,202 @@
+/*******************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9    // outer loop counter (column pairs)
+#define J      $r10   // inner loop counter (rows)
+#define S1     $r12   // read pointer, first column of the pair
+#define S2     $r13   // read pointer, second column of the pair
+#define S3     $r14   // S3..S8 are not referenced by the code below
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define S8     $r19
+#define TD     $r20   // running destination pointer
+#define TS     $r11   // source pointer for the next column pair
+#define TL     $r7    // column stride in bytes; reuses LDA's register
+#define T0     $r23   // two column strides in bytes ($r23 is saved in the prologue)
+#define ZERO   $r0
+
+#define F0     $f0    // F0..F3: scalar temporaries for the row-remainder loops
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LSX vectors (the aliases below name $vr registers) */
+#define U0     $vr0   // U0..U3: loaded column data
+#define U1     $vr1
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define D0     $vr8   // D0..D3: rearranged vectors that get stored to TD
+#define D1     $vr9
+#define D2     $vr10
+#define D3     $vr11
+#define D4     $vr12
+#define D5     $vr13
+#define D6     $vr14
+#define D7     $vr15
+#define D8     $vr16
+
+    PROLOGUE
+    // NOTE(review): 8-byte frame; confirm whether the ABI expects 16-byte $sp alignment here.
+    addi.d     $sp,  $sp,  -8
+    SDARG      $r23, $sp,  0     // $r23 is used as T0 below
+
+    move       TD,   DST   //boffset
+    move       TS,   SRC   //aoffset
+
+    slli.d     TL,   LDA,  0x02  //lda
+    slli.d     TL,   TL,   0x01  // TL = lda * 8 bytes (one column)
+    slli.d     T0,   TL,   0x01  // T0 = 2 * TL (two columns)
+
+    srai.d     I,    N,    0x01  // I = n / 2 column pairs
+    beq        I,    ZERO,  .L_N0
+
+.L_J1: /* if (i > 0) I--: start one pair of columns */
+    move       S1,   TS         //a_offset1
+    add.d      S2,   TS,   TL   //a_offset2
+    srai.d     J,    M,    0x02 // J = m / 4
+    add.d      TS,   TS,   T0   // TS moves 2 columns forward
+
+    beq        J,    ZERO,  .L_I3
+
+.L_I1: /* if (j > 0) J--: 4 rows (2 x 16 bytes) of both columns per iteration */
+    vld       U0,   S1,   0x00
+    vld       U1,   S1,   0x10
+    vld       U2,   S2,   0x00
+    vld       U3,   S2,   0x10
+    // vand.v X, Y, Y copies Y: D0/D2 start as U2, D1/D3 start as U3.
+    vand.v    D0,   U2,   U2
+    vand.v    D1,   U3,   U3
+    vand.v    D2,   U2,   U2
+    vand.v    D3,   U3,   U3
+    // Presumably merges S1 data into the S2 copies, one element of each - TODO confirm.
+    vpermi.w  D0,   U0,   0x44
+    vpermi.w  D2,   U0,   0xee
+    vpermi.w  D1,   U1,   0x44
+    vpermi.w  D3,   U1,   0xee
+
+    vst       D0,   TD,   0x00
+    vst       D2,   TD,   0x10
+    vst       D1,   TD,   0x20
+    vst       D3,   TD,   0x30
+
+    addi.d     S1,   S1,   0x20   // a_offset1
+    addi.d     S2,   S2,   0x20
+    addi.d     TD,   TD,   0x40  // b_offset
+
+    addi.d     J,    J,    -1
+    blt        ZERO, J,    .L_I1
+
+.L_I3: /* rows left over after the 4-row loop */
+    andi       J,    M,    0x03
+    beq        J,    ZERO, .L_II20
+
+.L_II1:  /* j = (m & 3)  if (j > 0): one element (two 4-byte floats) per column */
+    fld.s      F0,   S1,   0x00
+    fld.s      F1,   S1,   0x04
+    fld.s      F2,   S2,   0x00
+    fld.s      F3,   S2,   0x04
+
+    fst.s      F0,   TD,   0x00
+    fst.s      F1,   TD,   0x04
+    fst.s      F2,   TD,   0x08
+    fst.s      F3,   TD,   0x0c
+
+    addi.d     S1,   S1,   0x08
+    addi.d     S2,   S2,   0x08
+    addi.d     TD,   TD,   0x10
+
+    addi.d     J,    J,    -1
+    blt        ZERO, J,    .L_II1
+
+.L_II20: /* next column pair while I > 0 */
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_J1
+
+.L_N0:  /* if(n&1): one leftover column, copied straight from TS to TD */
+    andi       I,     N,   0x01
+    beq        ZERO,  I,   .L_N00
+
+.L_N1:
+    srai.d     J,     M,   0x02
+    beq        ZERO,  J,   .L_N10
+
+.L_N11: /* j = (m >> 2) if (j > 0): 2 x 16 bytes per iteration */
+    vld       U0,    TS,   0x00
+    vld       U1,    TS,   0x10
+
+    vst       U0,    TD,   0x00
+    vst       U1,    TD,   0x10
+
+    addi.d     TS,    TS,   0x20   // a_offset
+    addi.d     TD,    TD,   0x20   // b_offset
+
+    addi.d     J,     J,   -1
+    blt        ZERO,  J,   .L_N11
+
+.L_N10: /* rows left over after the 4-row loop */
+    andi       J,     M,    0x03
+    beq        J,     ZERO, .L_N00
+
+.L_N12:  /* j = (m & 3)  if (j > 0): two 4-byte floats per iteration */
+    fld.s      F0,    TS,   0x00
+    fld.s      F1,    TS,   0x04
+
+    fst.s      F0,    TD,   0x00
+    fst.s      F1,    TD,   0x04
+
+    addi.d     TS,    TS,   0x08   // a_offset
+    addi.d     TD,    TD,   0x08   // b_offset
+
+    addi.d     J,     J,   -1
+    blt        ZERO,  J,   .L_N12
+
+.L_N00: /* restore $r23, release the 8-byte frame, return */
+    LDARG     $r23,  $sp, 0
+    addi.d    $sp,   $sp, 8
+    jirl      $r0,   $r1, 0x00  // jump to the address in $r1, discarding the link
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/cgemm_ncopy_4_lasx.S b/kernel/loongarch64/cgemm_ncopy_4_lasx.S
new file mode 100644
index 000000000..24c2e6748
--- /dev/null
+++ b/kernel/loongarch64/cgemm_ncopy_4_lasx.S
@@ -0,0 +1,325 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters (incoming argument registers) */
+#define M      $r4    // param 1: m, number of elements copied along each source vector
+#define N      $r5    // param 2: n, number of source vectors
+#define SRC    $r6    // param 3: src, start of the source data
+#define LDA    $r7    // param 4: lda, multiplied by 8 below to form the byte stride TL
+#define DST    $r8    // param 5: dst, start of the packed output
+
+#define I      $r9    // inner loop counter, also scratch for the m/n bit tests
+#define J      $r10   // outer loop counter (n / 4)
+#define S1     $r12   // S1..S4: source pointers, TL bytes apart
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16   // S5..S7 are not referenced in this file
+#define S6     $r17
+#define S7     $r18
+#define TD     $r20   // running destination pointer
+#define TS     $r11   // running source pointer (start of the next vector group)
+#define TL     $r19   // byte stride between source vectors (LDA * 8)
+#define T0     $r23   // scratch for group strides; $r23 is spilled to the stack around the body
+#define ZERO   $r0    // used as the zero operand of compares and as the discarded link of the final jirl
+
+#define F0     $f0    // F0..F7: scalar registers for the m & 1 tail (two 4-byte words per element)
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LASX vectors (each xvld/xvst below moves 32 bytes); the m & 2 paths name $vr registers directly */
+#define U0     $xr0   // U0..U3: raw loads; U4, U5: copies used by the lane permutes
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6   // U6, U7 are not referenced in this file
+#define U7     $xr7
+#define D0     $xr8   // D0..D3: shuffle destinations
+#define D1     $xr9
+#define D2     $xr10
+#define D3     $xr11
+#define D4     $xr12  // D4..D8 are not referenced in this file
+#define D5     $xr13
+#define D6     $xr14
+#define D7     $xr15
+#define D8     $xr16
+
+    PROLOGUE   /* cgemm ncopy, unroll 4, LASX: packs m x n 8-byte elements from SRC into DST, interleaving 4, then 2, then 1 source vectors; PROLOGUE/EPILOGUE are macros from common.h */
+
+    addi.d     $sp,  $sp,  -8   // make room for one 8-byte stack slot
+    SDARG      $r23, $sp,  0    // spill $r23 (used as T0); reloaded at .L_N00. SDARG is a common.h macro, presumably a doubleword store
+
+    move       TD,   DST   // TD = running destination pointer (boffset)
+    move       TS,   SRC   // TS = running source pointer (aoffset)
+
+    slli.d     TL,   LDA,  0x02
+    slli.d     TL,   TL,   0x01   // TL = LDA * 8: byte stride between consecutive source vectors
+
+    srai.d     J,    N,    0x02   // J = n / 4: number of four-vector groups
+    beq        J,    ZERO,  .L_N0 // no full group: go straight to the n & 2 remainder
+
+.L_J1: /* outer loop: one group of four source vectors per pass (J counts down) */
+    move       S1,   TS           // S1..S4 = starts of the four vectors, TL bytes apart
+    add.d      S2,   S1,   TL
+    add.d      S3,   S2,   TL
+    add.d      S4,   S3,   TL
+
+    slli.d     T0,   TL,   0x02   // T0 = 4 * TL
+    add.d      TS,   TS,   T0     // TS now points at the next group
+
+    srai.d     I,    M,    0x02   // I = m / 4: number of 4-element tiles
+    beq        I,    ZERO,  .L_I3
+
+.L_I1: /* inner loop: 4 elements from each of 4 vectors -> 128 bytes of DST (I counts down); numbers below index 4-byte words */
+    xvld       U0,   S1,   0x00 //1 2 3 4 5 6 7 8
+    xvld       U1,   S2,   0x00 //9 10 11 12 13 14 15 16
+    xvld       U2,   S3,   0x00 //17 18 19 20 21 22 23 24
+    xvld       U3,   S4,   0x00 //25 26 27 28 29 30 31 32
+
+    xvand.v    D0,   U0,   U0   // D0..D3 = copies of U0..U3 (x AND x == x); the shuffles below combine
+    xvand.v    D1,   U1,   U1   // the old destination contents with the second operand
+    xvand.v    D2,   U2,   U2
+    xvand.v    D3,   U3,   U3
+
+    xvshuf4i.d D0,   U1,  0x88 //1 2 9 10 5 6 13 14
+    xvshuf4i.d D2,   U3,  0x88 //17 18 25 26 21 22 29 30
+    xvshuf4i.d D1,   U0,  0x77 //3 4 11 12 7 8 15 16
+    xvshuf4i.d D3,   U2,  0x77 //19 20 27 28 23 24 31 32
+
+    xvand.v    U4,   D0,   D0   // work on copies: D0 and D1 are still read by the 0x31 permutes below
+    xvand.v    U5,   D1,   D1
+
+    xvpermi.q  U4,   D2,  0x02 //1 2 9 10 17 18 25 26
+    xvpermi.q  U5,   D3,  0x02 //3 4 11 12 19 20 27 28
+    xvpermi.q  D2,   D0,  0x31 //5 6 13 14 21 22 29 30
+    xvpermi.q  D3,   D1,  0x31 //7 8 15 16 23 24 31 32
+
+    xvst       U4,   TD,   0x00   // element 0 of S1..S4
+    xvst       U5,   TD,   0x20   // element 1 of S1..S4
+    xvst       D2,   TD,   0x40   // element 2 of S1..S4
+    xvst       D3,   TD,   0x60   // element 3 of S1..S4
+
+    addi.d     S1,   S1,   0x20   // a_offset: each source advances by 4 elements (32 bytes)
+    addi.d     S2,   S2,   0x20
+    addi.d     S3,   S3,   0x20
+    addi.d     S4,   S4,   0x20
+    addi.d     TD,   TD,   0x80  // b_offset: 16 elements (128 bytes) written
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_I1  // repeat while I > 0
+
+.L_I3:  /* if(m&2): two leftover elements per vector, handled with 128-bit registers */
+    andi       I,    M,    0x02
+    beq        I,    ZERO, .L_II20
+
+    vld       $vr0,   S1,   0x00   // 16 bytes = 2 elements from each of S1..S4
+    vld       $vr1,   S2,   0x00
+    vld       $vr2,   S3,   0x00
+    vld       $vr3,   S4,   0x00
+
+    vand.v    $vr8,   $vr1,   $vr1     // $vr8, $vr9 = copies of the S2 data
+    vand.v    $vr9,   $vr1,   $vr1
+    vand.v    $vr10,   $vr3,   $vr3    // $vr10, $vr11 = copies of the S4 data
+    vand.v    $vr11,   $vr3,   $vr3
+
+    vpermi.w  $vr8,   $vr0,   0x44     // $vr8  = { S1[0], S2[0] }
+    vpermi.w  $vr10,   $vr2,   0x44    // $vr10 = { S3[0], S4[0] }
+    vpermi.w  $vr9,   $vr0,   0xee     // $vr9  = { S1[1], S2[1] }
+    vpermi.w  $vr11,   $vr2,   0xee    // $vr11 = { S3[1], S4[1] }
+
+    vst       $vr8,   TD,   0x00       // element 0 of S1..S4 at 0x00..0x1f
+    vst       $vr10,   TD,   0x10
+    vst       $vr9,   TD,   0x20       // element 1 of S1..S4 at 0x20..0x3f
+    vst       $vr11,   TD,   0x30
+
+    addi.d     S1,   S1,   0x10   // sources advance by 2 elements
+    addi.d     S2,   S2,   0x10
+    addi.d     S3,   S3,   0x10
+    addi.d     S4,   S4,   0x10
+    addi.d     TD,   TD,   0x40   // 8 elements written
+
+.L_II20:  /* if(m&1): last element of each vector, copied as two 4-byte words */
+    andi       I,    M,    0x01
+    beq        I,    ZERO, .L_J0
+
+    fld.s      F0,   S1,   0x00
+    fld.s      F1,   S1,   0x04
+
+    fld.s      F2,   S2,   0x00
+    fld.s      F3,   S2,   0x04
+
+    fld.s      F4,   S3,   0x00
+    fld.s      F5,   S3,   0x04
+
+    fld.s      F6,   S4,   0x00
+    fld.s      F7,   S4,   0x04
+
+    fst.s      F0,   TD,   0x00   // output order: S1, S2, S3, S4
+    fst.s      F1,   TD,   0x04
+    fst.s      F2,   TD,   0x08
+    fst.s      F3,   TD,   0x0c
+    fst.s      F4,   TD,   0x10
+    fst.s      F5,   TD,   0x14
+    fst.s      F6,   TD,   0x18
+    fst.s      F7,   TD,   0x1c
+
+    addi.d     TD,   TD,   0x20   // S1..S4 are not advanced: they are reloaded from TS for the next group
+
+.L_J0:  /* end of one four-vector group */
+    addi.d    J,     J,    -1
+    blt       ZERO,  J,    .L_J1  // repeat while J > 0
+
+.L_N0:  /* if(n&2): a remaining pair of source vectors */
+    andi       I,     N,   0x02
+    beq        ZERO,  I,   .L_N20
+
+    move       S1,    TS
+    add.d      S2,    S1,   TL
+
+    slli.d     T0,    TL,   0x01   // T0 = 2 * TL
+    add.d      TS,    TS,   T0     // TS now points past the pair
+
+    srai.d     I,     M,    0x02   // I = m / 4
+    beq        ZERO,  I,    .L_N10
+
+.L_N11: /* loop while I > 0: 4 elements from each of 2 vectors -> 64 bytes of DST */
+    xvld       U0,   S1,   0x00 //1 2 3 4 5 6 7 8
+    xvld       U1,   S2,   0x00 //9 10 11 12 13 14 15 16
+
+    xvand.v    D0,   U0,   U0   // D0, D1 = copies of U0, U1 for the in-place shuffles
+    xvand.v    D1,   U1,   U1
+
+    xvshuf4i.d D0,   U1,  0x88 //1 2 9 10 5 6 13 14
+    xvshuf4i.d D1,   U0,  0x77 //3 4 11 12 7 8 15 16
+
+    xvand.v    U4,   D0,   D0   // copy: D0 is still read by the 0x31 permute below
+
+    xvpermi.q  U4,   D1,  0x02 //1 2 9 10 3 4 11 12
+    xvpermi.q  D1,   D0,  0x31 //5 6 13 14 7 8 15 16
+
+    xvst       U4,   TD,   0x00   // elements 0 and 1 of S1, S2
+    xvst       D1,   TD,   0x20   // elements 2 and 3 of S1, S2
+
+    addi.d     S1,    S1,   0x20   // a_offset
+    addi.d     S2,    S2,   0x20
+    addi.d     TD,    TD,   0x40   // b_offset
+
+    addi.d     I,     I,   -1
+    blt        ZERO,  I,   .L_N11
+
+.L_N10:  /* if(m&2): two leftover elements from each of the 2 vectors */
+    andi       I,     M,    0x02
+    beq        I,     ZERO, .L_N130
+
+    vld       $vr0,    S1,   0x00
+    vld       $vr1,    S2,   0x00
+    vand.v    $vr8,    $vr1,   $vr1   // $vr8 = copy of the S2 data
+
+    vpermi.w  $vr8,    $vr0,   0x44   // $vr8 = { S1[0], S2[0] }
+    vpermi.w  $vr1,    $vr0,   0xee   // $vr1 = { S1[1], S2[1] }
+
+    vst       $vr8,    TD,   0x00
+    vst       $vr1,    TD,   0x10
+
+    addi.d     S1,    S1,   0x10   // a_offset
+    addi.d     S2,    S2,   0x10
+    addi.d     TD,    TD,   0x20   // b_offset
+
+.L_N130:  /* if(m&1): last element of each of the 2 vectors */
+    andi       I,     M,    0x01
+    beq        I,     ZERO, .L_N20
+
+    fld.s      F0,     S1,   0x00
+    fld.s      F1,     S1,   0x04
+
+    fld.s      F2,     S2,   0x00
+    fld.s      F3,     S2,   0x04
+
+    fst.s      F0,     TD,   0x00
+    fst.s      F1,     TD,   0x04
+    fst.s      F2,     TD,   0x08
+    fst.s      F3,     TD,   0x0c
+
+    addi.d     TD,    TD,   0x10
+
+.L_N20:   /* if(n&1): one remaining source vector, copied straight through */
+    andi       I,     N,    0x01
+    beq        I,     ZERO, .L_N00
+
+    move       S1,   TS
+    srai.d     I,    M,    0x02   // I = m / 4
+
+    beq        I,    ZERO, .L_N30
+
+.L_N21:  /* loop while I > 0: copy 4 elements (32 bytes) */
+    xvld       U0,   S1,   0x00
+
+    xvst       U0,   TD,   0x00
+
+    addi.d     S1,   S1,   0x20   // aoffset1
+    addi.d     TD,   TD,   0x20   // b_offset
+
+    addi.d     I,     I,   -1
+    blt        ZERO,  I,   .L_N21
+
+.L_N30:  /* if(m&2): copy 2 elements (16 bytes) */
+    andi       I,     M,    0x02
+    beq        I,     ZERO, .L_N330
+
+    vld       $vr0,   S1,   0x00
+
+    vst       $vr0,   TD,   0x00
+
+    addi.d     S1,   S1,   0x10   // aoffset1
+    addi.d     TD,   TD,   0x10   // b_offset
+
+.L_N330:  /* if(m&1): copy the final element */
+    andi       I,     M,    0x01
+    beq        I,     ZERO, .L_N00
+
+    fld.s      F0,   S1,   0x00
+    fld.s      F1,   S1,   0x04
+
+    fst.s      F0,   TD,   0x00
+    fst.s      F1,   TD,   0x04
+
+.L_N00:  /* common exit */
+    LDARG     $r23,  $sp, 0     // reload the $r23 value spilled on entry
+    addi.d    $sp,   $sp, 8     // release the stack slot
+    jirl      $r0,   $r1, 0x00  // return: jump to the address in $r1, link value discarded into $r0
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/cgemm_ncopy_4_lsx.S b/kernel/loongarch64/cgemm_ncopy_4_lsx.S
new file mode 100644
index 000000000..bfc712fb2
--- /dev/null
+++ b/kernel/loongarch64/cgemm_ncopy_4_lsx.S
@@ -0,0 +1,341 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters (incoming argument registers) */
+#define M      $r4    // param 1: m, number of elements copied along each source vector
+#define N      $r5    // param 2: n, number of source vectors
+#define SRC    $r6    // param 3: src, start of the source data
+#define LDA    $r7    // param 4: lda, multiplied by 8 below to form the byte stride TL
+#define DST    $r8    // param 5: dst, start of the packed output
+
+#define I      $r9    // inner loop counter, also scratch for the m/n bit tests
+#define J      $r10   // outer loop counter (n / 4)
+#define S1     $r12   // S1..S4: source pointers, TL bytes apart
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16   // S5..S7 are not referenced in this file
+#define S6     $r17
+#define S7     $r18
+#define TD     $r20   // running destination pointer
+#define TS     $r11   // running source pointer (start of the next vector group)
+#define TL     $r19   // byte stride between source vectors (LDA * 8)
+#define T0     $r23   // scratch for group strides; $r23 is spilled to the stack around the body
+#define ZERO   $r0    // used as the zero operand of compares and as the discarded link of the final jirl
+
+#define F0     $f0    // F0..F7: scalar registers for the m & 1 tail (two 4-byte words per element)
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LSX vectors (each vld/vst below moves 16 bytes, i.e. two 8-byte elements) */
+#define U0     $vr0   // U0..U7: raw loads
+#define U1     $vr1
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define D0     $vr8   // D0..D7: interleaved results that get stored
+#define D1     $vr9
+#define D2     $vr10
+#define D3     $vr11
+#define D4     $vr12
+#define D5     $vr13
+#define D6     $vr14
+#define D7     $vr15
+#define D8     $vr16  // D8 is not referenced in this file
+
+    PROLOGUE   /* cgemm ncopy, unroll 4, LSX: packs m x n 8-byte elements from SRC into DST, interleaving 4, then 2, then 1 source vectors; PROLOGUE/EPILOGUE are macros from common.h */
+
+    addi.d     $sp,  $sp,  -8   // make room for one 8-byte stack slot
+    SDARG      $r23, $sp,  0    // spill $r23 (used as T0); reloaded at .L_N00. SDARG is a common.h macro, presumably a doubleword store
+
+    move       TD,   DST   // TD = running destination pointer (boffset)
+    move       TS,   SRC   // TS = running source pointer (aoffset)
+
+    slli.d     TL,   LDA,  0x02
+    slli.d     TL,   TL,   0x01   // TL = LDA * 8: byte stride between consecutive source vectors
+
+    srai.d     J,    N,    0x02   // J = n / 4: number of four-vector groups
+    beq        J,    ZERO,  .L_N0 // no full group: go straight to the n & 2 remainder
+
+.L_J1: /* outer loop: one group of four source vectors per pass (J counts down) */
+    move       S1,   TS           // S1..S4 = starts of the four vectors, TL bytes apart
+    add.d      S2,   S1,   TL
+    add.d      S3,   S2,   TL
+    add.d      S4,   S3,   TL
+
+    slli.d     T0,   TL,   0x02   // T0 = 4 * TL
+    add.d      TS,   TS,   T0     // TS now points at the next group
+
+    srai.d     I,    M,    0x02   // I = m / 4: number of 4-element tiles
+    beq        I,    ZERO,  .L_I3
+
+.L_I1: /* inner loop: 4 elements from each of 4 vectors -> 128 bytes of DST (I counts down) */
+    vld       U0,   S1,   0x00   // U0 = S1[0..1], U1 = S1[2..3]
+    vld       U1,   S1,   0x10
+
+    vld       U2,   S2,   0x00   // U2 = S2[0..1], U3 = S2[2..3]
+    vld       U3,   S2,   0x10
+
+    vld       U4,   S3,   0x00   // U4 = S3[0..1], U5 = S3[2..3]
+    vld       U5,   S3,   0x10
+
+    vld       U6,   S4,   0x00   // U6 = S4[0..1], U7 = S4[2..3]
+    vld       U7,   S4,   0x10
+
+    vand.v    D0,   U2,   U2     // seed each destination with the S2 (or S4) data via x AND x == x;
+    vand.v    D1,   U3,   U3     // vpermi.w then merges in the S1 (or S3) half
+    vand.v    D2,   U2,   U2
+    vand.v    D3,   U3,   U3
+    vand.v    D4,   U6,   U6
+    vand.v    D5,   U7,   U7
+    vand.v    D6,   U6,   U6
+    vand.v    D7,   U7,   U7
+
+    vpermi.w  D0,   U0,   0x44   // D0 = { S1[0], S2[0] }
+    vpermi.w  D4,   U4,   0x44   // D4 = { S3[0], S4[0] }
+    vpermi.w  D2,   U0,   0xee   // D2 = { S1[1], S2[1] }
+    vpermi.w  D6,   U4,   0xee   // D6 = { S3[1], S4[1] }
+    vpermi.w  D1,   U1,   0x44   // D1 = { S1[2], S2[2] }
+    vpermi.w  D5,   U5,   0x44   // D5 = { S3[2], S4[2] }
+    vpermi.w  D3,   U1,   0xee   // D3 = { S1[3], S2[3] }
+    vpermi.w  D7,   U5,   0xee   // D7 = { S3[3], S4[3] }
+
+    vst       D0,   TD,   0x00   // element 0 of S1..S4
+    vst       D4,   TD,   0x10
+    vst       D2,   TD,   0x20   // element 1 of S1..S4
+    vst       D6,   TD,   0x30
+    vst       D1,   TD,   0x40   // element 2 of S1..S4
+    vst       D5,   TD,   0x50
+    vst       D3,   TD,   0x60   // element 3 of S1..S4
+    vst       D7,   TD,   0x70
+
+    addi.d     S1,   S1,   0x20   // a_offset: each source advances by 4 elements (32 bytes)
+    addi.d     S2,   S2,   0x20
+    addi.d     S3,   S3,   0x20
+    addi.d     S4,   S4,   0x20
+    addi.d     TD,   TD,   0x80  // b_offset: 16 elements (128 bytes) written
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_I1  // repeat while I > 0
+
+.L_I3:  /* if(m&2): two leftover elements per vector */
+    andi       I,    M,    0x02
+    beq        I,    ZERO, .L_II20
+
+    vld       U0,   S1,   0x00   // 16 bytes = 2 elements from each of S1..S4
+    vld       U1,   S2,   0x00
+    vld       U2,   S3,   0x00
+    vld       U3,   S4,   0x00
+
+    vand.v    D0,   U1,   U1     // D0, D1 = copies of the S2 data
+    vand.v    D1,   U1,   U1
+    vand.v    D2,   U3,   U3     // D2, D3 = copies of the S4 data
+    vand.v    D3,   U3,   U3
+
+    vpermi.w  D0,   U0,   0x44   // D0 = { S1[0], S2[0] }
+    vpermi.w  D2,   U2,   0x44   // D2 = { S3[0], S4[0] }
+    vpermi.w  D1,   U0,   0xee   // D1 = { S1[1], S2[1] }
+    vpermi.w  D3,   U2,   0xee   // D3 = { S3[1], S4[1] }
+
+    vst       D0,   TD,   0x00   // element 0 of S1..S4 at 0x00..0x1f
+    vst       D2,   TD,   0x10
+    vst       D1,   TD,   0x20   // element 1 of S1..S4 at 0x20..0x3f
+    vst       D3,   TD,   0x30
+
+    addi.d     S1,   S1,   0x10   // sources advance by 2 elements
+    addi.d     S2,   S2,   0x10
+    addi.d     S3,   S3,   0x10
+    addi.d     S4,   S4,   0x10
+    addi.d     TD,   TD,   0x40   // 8 elements written
+
+.L_II20:  /* if(m&1): last element of each vector, copied as two 4-byte words */
+    andi       I,    M,    0x01
+    beq        I,    ZERO, .L_J0
+
+    fld.s      F0,   S1,   0x00
+    fld.s      F1,   S1,   0x04
+
+    fld.s      F2,   S2,   0x00
+    fld.s      F3,   S2,   0x04
+
+    fld.s      F4,   S3,   0x00
+    fld.s      F5,   S3,   0x04
+
+    fld.s      F6,   S4,   0x00
+    fld.s      F7,   S4,   0x04
+
+    fst.s      F0,   TD,   0x00   // output order: S1, S2, S3, S4
+    fst.s      F1,   TD,   0x04
+    fst.s      F2,   TD,   0x08
+    fst.s      F3,   TD,   0x0c
+    fst.s      F4,   TD,   0x10
+    fst.s      F5,   TD,   0x14
+    fst.s      F6,   TD,   0x18
+    fst.s      F7,   TD,   0x1c
+
+    addi.d     TD,   TD,   0x20   // S1..S4 are not advanced: they are reloaded from TS for the next group
+
+.L_J0:  /* end of one four-vector group */
+    addi.d    J,     J,    -1
+    blt       ZERO,  J,    .L_J1  // repeat while J > 0
+
+.L_N0:  /* if(n&2): a remaining pair of source vectors */
+    andi       I,     N,   0x02
+    beq        ZERO,  I,   .L_N20
+
+    move       S1,    TS
+    add.d      S2,    S1,   TL
+
+    slli.d     T0,    TL,   0x01   // T0 = 2 * TL
+    add.d      TS,    TS,   T0     // TS now points past the pair
+
+    srai.d     I,     M,    0x02   // I = m / 4
+    beq        ZERO,  I,    .L_N10
+
+.L_N11: /* loop while I > 0: 4 elements from each of 2 vectors -> 64 bytes of DST */
+    vld       U0,    S1,   0x00   // U0 = S1[0..1], U1 = S1[2..3]
+    vld       U1,    S1,   0x10
+    vld       U2,    S2,   0x00   // U2 = S2[0..1], U3 = S2[2..3]
+    vld       U3,    S2,   0x10
+
+    vand.v    D0,    U2,   U2     // seed destinations with the S2 data
+    vand.v    D1,    U3,   U3
+    vand.v    D2,    U2,   U2
+    vand.v    D3,    U3,   U3
+
+    vpermi.w  D0,    U0,   0x44   // D0 = { S1[0], S2[0] }
+    vpermi.w  D2,    U0,   0xee   // D2 = { S1[1], S2[1] }
+    vpermi.w  D1,    U1,   0x44   // D1 = { S1[2], S2[2] }
+    vpermi.w  D3,    U1,   0xee   // D3 = { S1[3], S2[3] }
+
+    vst       D0,    TD,   0x00
+    vst       D2,    TD,   0x10
+    vst       D1,    TD,   0x20
+    vst       D3,    TD,   0x30
+
+    addi.d     S1,    S1,   0x20   // a_offset
+    addi.d     S2,    S2,   0x20
+    addi.d     TD,    TD,   0x40   // b_offset
+
+    addi.d     I,     I,   -1
+    blt        ZERO,  I,   .L_N11
+
+.L_N10:  /* if(m&2): two leftover elements from each of the 2 vectors */
+    andi       I,     M,    0x02
+    beq        I,     ZERO, .L_N130
+
+    vld       U0,    S1,   0x00
+    vld       U1,    S2,   0x00
+    vand.v    D0,    U1,   U1     // D0 = copy of the S2 data
+
+    vpermi.w  D0,    U0,   0x44   // D0 = { S1[0], S2[0] }
+    vpermi.w  U1,    U0,   0xee   // U1 = { S1[1], S2[1] }
+
+    vst       D0,    TD,   0x00
+    vst       U1,    TD,   0x10
+
+    addi.d     S1,    S1,   0x10   // a_offset
+    addi.d     S2,    S2,   0x10
+    addi.d     TD,    TD,   0x20   // b_offset
+
+.L_N130:  /* if(m&1): last element of each of the 2 vectors */
+    andi       I,     M,    0x01
+    beq        I,     ZERO, .L_N20
+
+    fld.s      F0,     S1,   0x00
+    fld.s      F1,     S1,   0x04
+
+    fld.s      F2,     S2,   0x00
+    fld.s      F3,     S2,   0x04
+
+    fst.s      F0,     TD,   0x00
+    fst.s      F1,     TD,   0x04
+    fst.s      F2,     TD,   0x08
+    fst.s      F3,     TD,   0x0c
+
+    addi.d     TD,    TD,   0x10
+
+.L_N20:   /* if(n&1): one remaining source vector, copied straight through */
+    andi       I,     N,    0x01
+    beq        I,     ZERO, .L_N00
+
+    move       S1,   TS
+    srai.d     I,    M,    0x02   // I = m / 4
+
+    beq        I,    ZERO, .L_N30
+
+.L_N21:  /* loop while I > 0: copy 4 elements (32 bytes) */
+    vld       U0,   S1,   0x00
+    vld       U1,   S1,   0x10
+
+    vst       U0,   TD,   0x00
+    vst       U1,   TD,   0x10
+
+    addi.d     S1,   S1,   0x20   // aoffset1
+    addi.d     TD,   TD,   0x20   // b_offset
+
+    addi.d     I,     I,   -1
+    blt        ZERO,  I,   .L_N21
+
+.L_N30:  /* if(m&2): copy 2 elements (16 bytes) */
+    andi       I,     M,    0x02
+    beq        I,     ZERO, .L_N330
+
+    vld       U0,   S1,   0x00
+
+    vst       U0,   TD,   0x00
+
+    addi.d     S1,   S1,   0x10   // aoffset1
+    addi.d     TD,   TD,   0x10   // b_offset
+
+.L_N330:  /* if(m&1): copy the final element */
+    andi       I,     M,    0x01
+    beq        I,     ZERO, .L_N00
+
+    fld.s      F0,   S1,   0x00
+    fld.s      F1,   S1,   0x04
+
+    fst.s      F0,   TD,   0x00
+    fst.s      F1,   TD,   0x04
+
+.L_N00:  /* common exit */
+    LDARG     $r23,  $sp, 0     // reload the $r23 value spilled on entry
+    addi.d    $sp,   $sp, 8     // release the stack slot
+    jirl      $r0,   $r1, 0x00  // return: jump to the address in $r1, link value discarded into $r0
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/cgemm_ncopy_8_lsx.S b/kernel/loongarch64/cgemm_ncopy_8_lsx.S
new file mode 100644
index 000000000..87a88e37d
--- /dev/null
+++ b/kernel/loongarch64/cgemm_ncopy_8_lsx.S
@@ -0,0 +1,263 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters (incoming argument registers) */
+#define M      $r4    // param 1: m, number of elements copied along each source vector
+#define N      $r5    // param 2: n, number of source vectors
+#define SRC    $r6    // param 3: src, start of the source data
+#define LDA    $r7    // param 4: lda, scaled in place into the byte stride (see TL)
+#define DST    $r8    // param 5: dst, start of the packed output
+
+#define I      $r9    // inner loop counter, also scratch for the n bit tests
+#define J      $r10   // outer loop counter (n / 8)
+#define S1     $r12   // S1..S8: source pointers, TL bytes apart
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define S8     $r19
+#define TD     $r20   // running destination pointer
+#define TS     $r11   // running source pointer (start of the next vector group)
+#define TL     $r7    // same register as LDA: holds LDA * 8 once the body has scaled it
+#define T0     $r23   // 8 * TL, the byte size of one eight-vector group; $r23 is spilled to the stack around the body
+#define ZERO   $r0    // used as the zero operand of compares and as the discarded link of the final jirl
+
+#define F0     $f0    // F0..F7: scalar registers; every copy in this file goes through them
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LSX vectors: none of the U* / D* aliases below is referenced in this file */
+#define U0     $vr0
+#define U1     $vr1
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define D0     $vr8
+#define D1     $vr9
+#define D2     $vr10
+#define D3     $vr11
+#define D4     $vr12
+#define D5     $vr13
+#define D6     $vr14
+#define D7     $vr15
+#define D8     $vr16
+
+    PROLOGUE   /* cgemm ncopy, unroll 8: packs m x n 8-byte elements from SRC into DST, interleaving 8, then 4, 2, 1 source vectors, one element per vector per step, using scalar loads/stores only; PROLOGUE/EPILOGUE are macros from common.h */
+
+    addi.d     $sp,  $sp,  -8   // make room for one 8-byte stack slot
+    SDARG      $r23, $sp,  0    // spill $r23 (used as T0); reloaded at .L_N0. SDARG is a common.h macro, presumably a doubleword store
+
+    move       TD,   DST  // TD = running destination pointer (boffset)
+    move       TS,   SRC  // TS = running source pointer (aoffset)
+
+    slli.d     TL,   LDA,  0x02  // TL and LDA share $r7: scale lda in place
+    slli.d     TL,   TL,   0x01  // TL = lda * 8: byte stride between consecutive source vectors
+
+    slli.d     T0,   TL,   0x03  // T0 = 8 * TL: byte size of one eight-vector group
+    srai.d     J,    N,    0x03  // J = n / 8
+
+    beq        J,    ZERO, .L_N1 // no full group of eight: go to the n & 4 remainder
+
+.L_J1:  /* outer loop: one group of eight source vectors per pass (J counts down) */
+    move       S1,   TS          // S1..S8 = starts of the eight vectors, TL bytes apart
+    add.d      S2,   TS,   TL
+    move       I,    M           // I = m: one element per vector per inner iteration
+    add.d      S3,   S2,   TL
+    add.d      S4,   S3,   TL
+    add.d      S5,   S4,   TL
+    add.d      S6,   S5,   TL
+    add.d      S7,   S6,   TL
+    add.d      S8,   S7,   TL
+    add.d      TS,   TS,   T0    // TS now points at the next group
+
+    beq        I,    ZERO, .L_J11
+
+.L_I1:  /* inner loop: one element from each of S1..S8 -> 64 bytes of DST (I counts down) */
+    fld.s      F0,   S1,   0x00  // first half: elements from S1..S4, two 4-byte words each
+    fld.s      F1,   S1,   0x04
+    fld.s      F2,   S2,   0x00
+    fld.s      F3,   S2,   0x04
+    fld.s      F4,   S3,   0x00
+    fld.s      F5,   S3,   0x04
+    fld.s      F6,   S4,   0x00
+    fld.s      F7,   S4,   0x04
+
+    fst.s      F0,   TD,   0x00
+    fst.s      F1,   TD,   0x04
+    fst.s      F2,   TD,   0x08
+    fst.s      F3,   TD,   0x0c
+    fst.s      F4,   TD,   0x10
+    fst.s      F5,   TD,   0x14
+    fst.s      F6,   TD,   0x18
+    fst.s      F7,   TD,   0x1c
+
+    fld.s      F0,   S5,   0x00  // second half: elements from S5..S8, reusing F0..F7
+    fld.s      F1,   S5,   0x04
+    fld.s      F2,   S6,   0x00
+    fld.s      F3,   S6,   0x04
+    fld.s      F4,   S7,   0x00
+    fld.s      F5,   S7,   0x04
+    fld.s      F6,   S8,   0x00
+    fld.s      F7,   S8,   0x04
+
+    fst.s      F0,   TD,   0x20
+    fst.s      F1,   TD,   0x24
+    fst.s      F2,   TD,   0x28
+    fst.s      F3,   TD,   0x2c
+    fst.s      F4,   TD,   0x30
+    fst.s      F5,   TD,   0x34
+    fst.s      F6,   TD,   0x38
+    fst.s      F7,   TD,   0x3c
+
+    addi.d     S1,   S1,   0x08  // every source advances by one element
+    addi.d     S2,   S2,   0x08
+    addi.d     S3,   S3,   0x08
+    addi.d     S4,   S4,   0x08
+    addi.d     S5,   S5,   0x08
+    addi.d     S6,   S6,   0x08
+    addi.d     S7,   S7,   0x08
+    addi.d     S8,   S8,   0x08
+    addi.d     TD,   TD,   0x40  // 8 elements (64 bytes) written
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_I1 // repeat while I > 0
+
+.L_J11: /* end of one eight-vector group */
+    addi.d     J,    J,    -1
+    blt        ZERO, J,    .L_J1 // repeat while J > 0
+
+.L_N1:  /* if(n&4): a remaining group of four source vectors */
+    andi       I,     N,    0x04
+    beq        I,     ZERO, .L_N2
+
+    move       S1,   TS
+    add.d      S2,   TS,   TL
+    move       I,    M           // I = m
+    add.d      S3,   S2,   TL
+    add.d      S4,   S3,   TL
+    add.d      TS,   S4,   TL    // TS now points past the four vectors
+
+    beq        I,     ZERO, .L_N2
+
+.L_N11:  /* loop while I > 0: one element from each of S1..S4 -> 32 bytes of DST */
+    fld.s      F0,   S1,   0x00
+    fld.s      F1,   S1,   0x04
+    fld.s      F2,   S2,   0x00
+    fld.s      F3,   S2,   0x04
+    fld.s      F4,   S3,   0x00
+    fld.s      F5,   S3,   0x04
+    fld.s      F6,   S4,   0x00
+    fld.s      F7,   S4,   0x04
+
+    fst.s      F0,   TD,   0x00
+    fst.s      F1,   TD,   0x04
+    fst.s      F2,   TD,   0x08
+    fst.s      F3,   TD,   0x0c
+    fst.s      F4,   TD,   0x10
+    fst.s      F5,   TD,   0x14
+    fst.s      F6,   TD,   0x18
+    fst.s      F7,   TD,   0x1c
+
+    addi.d     S1,   S1,   0x08
+    addi.d     S2,   S2,   0x08
+    addi.d     S3,   S3,   0x08
+    addi.d     S4,   S4,   0x08
+    addi.d     TD,   TD,   0x20
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_N11
+
+.L_N2:  /* if(n&2): a remaining pair of source vectors */
+    andi       I,     N,    0x02
+    beq        I,     ZERO, .L_N3
+
+    move       S1,   TS
+    add.d      S2,   TS,   TL
+    move       I,    M           // I = m
+    add.d      TS,   S2,   TL    // TS now points past the pair
+
+    beq        I,    ZERO, .L_N3
+
+.L_N21:  /* loop while I > 0: one element from each of S1, S2 -> 16 bytes of DST */
+    fld.s      F0,   S1,   0x00
+    fld.s      F1,   S1,   0x04
+    fld.s      F2,   S2,   0x00
+    fld.s      F3,   S2,   0x04
+
+    fst.s      F0,   TD,   0x00
+    fst.s      F1,   TD,   0x04
+    fst.s      F2,   TD,   0x08
+    fst.s      F3,   TD,   0x0c
+
+    addi.d     S1,   S1,   0x08
+    addi.d     S2,   S2,   0x08
+    addi.d     TD,   TD,   0x10
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_N21
+
+.L_N3:  /* if(n&1): one remaining source vector, copied straight through */
+    andi       I,    N,    0x01
+    beq        I,    ZERO, .L_N0
+
+    move       S1,   TS
+    move       I,    M           // I = m
+
+    beq        I,    ZERO, .L_N0
+
+.L_N31:  /* loop while I > 0: copy one element (8 bytes) */
+    fld.s      F0,   S1,   0x00
+    fld.s      F1,   S1,   0x04
+
+    fst.s      F0,   TD,   0x00
+    fst.s      F1,   TD,   0x04
+
+    addi.d     S1,   S1,   0x08
+    addi.d     TD,   TD,   0x08
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_N31
+
+.L_N0:  /* common exit */
+    LDARG     $r23,  $sp, 0     // reload the $r23 value spilled on entry
+    addi.d    $sp,   $sp, 8     // release the stack slot
+    jirl      $r0,   $r1, 0x00  // return: jump to the address in $r1, link value discarded into $r0
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/cgemm_tcopy_16_lasx.S b/kernel/loongarch64/cgemm_tcopy_16_lasx.S
new file mode 100644
index 000000000..7d9eb94c8
--- /dev/null
+++ b/kernel/loongarch64/cgemm_tcopy_16_lasx.S
@@ -0,0 +1,741 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+/* tcopy packing kernel: copies M source lines of N 8-byte elements from SRC into DST as 16-, 8-, 4-, 2- and 1-wide panels. */
+#include "common.h"
+/* Function parameters */
+#define M      $r4    // param 1: m (number of source lines; consumed 8/4/2/1 at a time)
+#define N      $r5    // param 2: n (elements per source line; consumed 16/8/4/2/1 at a time)
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda (in elements; converted to bytes in place, see TL)
+#define DST    $r8    // param 5: dst
+
+#define I      $r9    // inner loop counter, also scratch for the N bit tests
+#define J      $r10   // outer loop counter, also scratch for the M bit tests
+#define S0     $r11   // first source line not yet handed to a line group
+#define S1     $r12   // S1..S8: read cursors for up to 8 consecutive source lines
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define S8     $r19
+#define P0     $r20   // DST position of the current line group inside the first 16-wide panel
+#define P1     $r23   // write cursor used inside the 16-wide loops
+#define P2     $r24   // write cursor for the 8-wide remainder panel (N & 8)
+#define P3     $r25   // write cursor for the 4-wide remainder panel (N & 4)
+#define P4     $r26   // write cursor for the 2-wide remainder panel (N & 2)
+#define P5     $r27   // write cursor for the 1-wide remainder panel (N & 1)
+#define T0     $r28   // scratch in the prologue, then 2 * TL
+#define T1     $r29   // scratch in the prologue, then M * 128 (bytes between 16-wide panels)
+#define TL     $r7    // byte stride between source lines; shares the register of LDA
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+#define F8     $f8
+#define F9     $f9
+#define F10     $f10
+#define F11     $f11
+#define F12     $f12
+#define F13     $f13
+#define F14     $f14
+#define F15     $f15
+/* LASX vectors (256-bit; $vrK used in the N & 2 tails is the low 128 bits of $xrK) */
+#define U0     $xr0
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+/* PROLOGUE, EPILOGUE, SDARG and LDARG are macros expected from common.h. */
+    PROLOGUE
+/* Spill $r23-$r29 (used as P1-P5, T0, T1); they are reloaded at .L_M0. */
+    addi.d     $sp,    $sp,   -56
+    SDARG      $r23,   $sp,   0
+    SDARG      $r24,   $sp,   8
+    SDARG      $r25,   $sp,   16
+    SDARG      $r26,   $sp,   24
+    SDARG      $r27,   $sp,   32
+    SDARG      $r28,   $sp,   40
+    SDARG      $r29,   $sp,   48
+
+    move       S0,     SRC
+    move       P0,     DST
+
+    srai.d     T0,     N,     0x04
+    srai.d     T1,     N,     0x03
+    slli.d     T0,     T0,    0x04   /* T0 = N rounded down to a multiple of 16 */
+    slli.d     T1,     T1,    0x03   /* T1 = N rounded down to a multiple of 8 */
+    mul.d      P2,     M,     T0
+    mul.d      P3,     M,     T1
+    slli.d     P2,     P2,    0x03   /* element count -> bytes (8 bytes per element) */
+    slli.d     P3,     P3,    0x03
+    add.d      P2,     DST,   P2     /* P2 = start of the 8-wide remainder panel */
+    add.d      P3,     DST,   P3     /* P3 = start of the 4-wide remainder panel */
+
+    srai.d     T0,     N,     0x02
+    srai.d     T1,     N,     0x01
+    slli.d     T0,     T0,    0x02   /* T0 = N rounded down to a multiple of 4 */
+    slli.d     T1,     T1,    0x01   /* T1 = N rounded down to a multiple of 2 */
+    mul.d      P4,     M,     T0
+    mul.d      P5,     M,     T1
+    slli.d     P4,     P4,    0x03
+    slli.d     P5,     P5,    0x03
+    add.d      P4,     DST,   P4     /* P4 = start of the 2-wide remainder panel */
+    add.d      P5,     DST,   P5     /* P5 = start of the 1-wide remainder panel */
+
+    slli.d     TL,     LDA,   0x03   /* TL = lda * 8 bytes; LDA is overwritten (same register) */
+    srai.d     J,      M,     0x03   /* J = number of 8-line groups */
+    slli.d     T0,     TL,    0x01   /* T0 = 2 * TL */
+    slli.d     T1,     M,     0x07   /* T1 = M * 16 elements * 8 bytes = size of one 16-wide panel */
+    beq        ZERO,   J,     .L_M7
+
+.L_J1: /* J-- : one group of 8 source lines per iteration */
+    move       S1,     S0
+    add.d      S2,     S0,    TL
+    add.d      S3,     S1,    T0
+    add.d      S4,     S2,    T0
+    add.d      S5,     S3,    T0
+    add.d      S6,     S4,    T0
+    add.d      S7,     S5,    T0
+    add.d      S8,     S6,    T0     /* S8 = S0 + 7 * TL */
+    add.d      S0,     S7,    T0     /* S0 advances by 8 lines */
+
+    move       P1,     P0
+    addi.d     P0,     P0,    0x400  /* 8 lines * 16 elements * 8 bytes */
+
+    srai.d     I,      N,     0x04   /* I = number of full 16-element chunks */
+    addi.d     J,      J,     -1
+    beq        ZERO,   I,     .L_N15
+
+.L_I1: /* I-- : copy 16 elements (0x80 bytes) from each of the 8 lines */
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S1,    0x20
+    xvld       U2,     S1,    0x40
+    xvld       U3,     S1,    0x60
+    xvld       U4,     S2,    0x00
+    xvld       U5,     S2,    0x20
+    xvld       U6,     S2,    0x40
+    xvld       U7,     S2,    0x60
+
+    xvst       U0,     P1,    0x00
+    xvst       U1,     P1,    0x20
+    xvst       U2,     P1,    0x40
+    xvst       U3,     P1,    0x60
+    xvst       U4,     P1,    0x80
+    xvst       U5,     P1,    0xA0
+    xvst       U6,     P1,    0xC0
+    xvst       U7,     P1,    0xE0
+
+    xvld       U0,     S3,    0x00
+    xvld       U1,     S3,    0x20
+    xvld       U2,     S3,    0x40
+    xvld       U3,     S3,    0x60
+    xvld       U4,     S4,    0x00
+    xvld       U5,     S4,    0x20
+    xvld       U6,     S4,    0x40
+    xvld       U7,     S4,    0x60
+
+    xvst       U0,     P1,    0x100
+    xvst       U1,     P1,    0x120
+    xvst       U2,     P1,    0x140
+    xvst       U3,     P1,    0x160
+    xvst       U4,     P1,    0x180
+    xvst       U5,     P1,    0x1A0
+    xvst       U6,     P1,    0x1C0
+    xvst       U7,     P1,    0x1E0
+
+    xvld       U0,     S5,    0x00
+    xvld       U1,     S5,    0x20
+    xvld       U2,     S5,    0x40
+    xvld       U3,     S5,    0x60
+    xvld       U4,     S6,    0x00
+    xvld       U5,     S6,    0x20
+    xvld       U6,     S6,    0x40
+    xvld       U7,     S6,    0x60
+
+    xvst       U0,     P1,    0x200
+    xvst       U1,     P1,    0x220
+    xvst       U2,     P1,    0x240
+    xvst       U3,     P1,    0x260
+    xvst       U4,     P1,    0x280
+    xvst       U5,     P1,    0x2A0
+    xvst       U6,     P1,    0x2C0
+    xvst       U7,     P1,    0x2E0
+
+    xvld       U0,     S7,    0x00
+    xvld       U1,     S7,    0x20
+    xvld       U2,     S7,    0x40
+    xvld       U3,     S7,    0x60
+    xvld       U4,     S8,    0x00
+    xvld       U5,     S8,    0x20
+    xvld       U6,     S8,    0x40
+    xvld       U7,     S8,    0x60
+
+    xvst       U0,     P1,    0x300
+    xvst       U1,     P1,    0x320
+    xvst       U2,     P1,    0x340
+    xvst       U3,     P1,    0x360
+    xvst       U4,     P1,    0x380
+    xvst       U5,     P1,    0x3A0
+    xvst       U6,     P1,    0x3C0
+    xvst       U7,     P1,    0x3E0
+
+    addi.d     S1,     S1,    0x80
+    addi.d     S2,     S2,    0x80
+    addi.d     S3,     S3,    0x80
+    addi.d     S4,     S4,    0x80
+    addi.d     S5,     S5,    0x80
+    addi.d     S6,     S6,    0x80
+    addi.d     S7,     S7,    0x80
+    addi.d     S8,     S8,    0x80
+    addi.d     I,      I,     -1
+    add.d      P1,     P1,    T1     /* same line group, next 16-wide panel */
+    blt        ZERO,   I,     .L_I1
+
+.L_N15: /* if (N & 8): 8 elements (0x40 bytes) from each of the 8 lines */
+    andi       I,      N,     0x08
+    beq        ZERO,   I,     .L_N7
+
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S1,    0x20
+    xvld       U2,     S2,    0x00
+    xvld       U3,     S2,    0x20
+    xvld       U4,     S3,    0x00
+    xvld       U5,     S3,    0x20
+    xvld       U6,     S4,    0x00
+    xvld       U7,     S4,    0x20
+
+    xvst       U0,     P2,    0x00
+    xvst       U1,     P2,    0x20
+    xvst       U2,     P2,    0x40
+    xvst       U3,     P2,    0x60
+    xvst       U4,     P2,    0x80
+    xvst       U5,     P2,    0xA0
+    xvst       U6,     P2,    0xC0
+    xvst       U7,     P2,    0xE0
+
+    xvld       U0,     S5,    0x00
+    xvld       U1,     S5,    0x20
+    xvld       U2,     S6,    0x00
+    xvld       U3,     S6,    0x20
+    xvld       U4,     S7,    0x00
+    xvld       U5,     S7,    0x20
+    xvld       U6,     S8,    0x00
+    xvld       U7,     S8,    0x20
+
+    xvst       U0,     P2,    0x100
+    xvst       U1,     P2,    0x120
+    xvst       U2,     P2,    0x140
+    xvst       U3,     P2,    0x160
+    xvst       U4,     P2,    0x180
+    xvst       U5,     P2,    0x1A0
+    xvst       U6,     P2,    0x1C0
+    xvst       U7,     P2,    0x1E0
+
+    addi.d     S1,     S1,    0x40
+    addi.d     S2,     S2,    0x40
+    addi.d     S3,     S3,    0x40
+    addi.d     S4,     S4,    0x40
+    addi.d     S5,     S5,    0x40
+    addi.d     S6,     S6,    0x40
+    addi.d     S7,     S7,    0x40
+    addi.d     S8,     S8,    0x40
+    addi.d     P2,     P2,    0x200  /* 8 lines * 8 elements * 8 bytes */
+
+.L_N7: /* if (N & 4): 4 elements (0x20 bytes) from each of the 8 lines */
+    andi       I,      N,     0x04
+    beq        ZERO,   I,     .L_N3
+
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S2,    0x00
+    xvld       U2,     S3,    0x00
+    xvld       U3,     S4,    0x00
+    xvld       U4,     S5,    0x00
+    xvld       U5,     S6,    0x00
+    xvld       U6,     S7,    0x00
+    xvld       U7,     S8,    0x00
+
+    xvst       U0,     P3,    0x00
+    xvst       U1,     P3,    0x20
+    xvst       U2,     P3,    0x40
+    xvst       U3,     P3,    0x60
+    xvst       U4,     P3,    0x80
+    xvst       U5,     P3,    0xA0
+    xvst       U6,     P3,    0xC0
+    xvst       U7,     P3,    0xE0
+
+    addi.d     S1,     S1,    0x20
+    addi.d     S2,     S2,    0x20
+    addi.d     S3,     S3,    0x20
+    addi.d     S4,     S4,    0x20
+    addi.d     S5,     S5,    0x20
+    addi.d     S6,     S6,    0x20
+    addi.d     S7,     S7,    0x20
+    addi.d     S8,     S8,    0x20
+    addi.d     P3,     P3,    0x100  /* 8 lines * 4 elements * 8 bytes */
+
+.L_N3: /* if (N & 2): 2 elements (0x10 bytes) from each of the 8 lines */
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_N1
+
+    vld        $vr0,   S1,    0x00   /* 128-bit loads: only 16 bytes per line belong to this tail, */
+    vld        $vr1,   S2,    0x00   /* so a 256-bit xvld would read 16 bytes beyond the needed data */
+    vld        $vr2,   S3,    0x00
+    vld        $vr3,   S4,    0x00
+    vld        $vr4,   S5,    0x00
+    vld        $vr5,   S6,    0x00
+    vld        $vr6,   S7,    0x00
+    vld        $vr7,   S8,    0x00
+
+    xvpermi.q  U0,     U1,    0x02   /* U0 = { low128(U0), low128(U1) }; high halves are not consumed */
+    xvpermi.q  U2,     U3,    0x02
+    xvpermi.q  U4,     U5,    0x02
+    xvpermi.q  U6,     U7,    0x02
+
+    xvst       U0,     P4,    0x00
+    xvst       U2,     P4,    0x20
+    xvst       U4,     P4,    0x40
+    xvst       U6,     P4,    0x60
+
+    addi.d     S1,     S1,    0x10
+    addi.d     S2,     S2,    0x10
+    addi.d     S3,     S3,    0x10
+    addi.d     S4,     S4,    0x10
+    addi.d     S5,     S5,    0x10
+    addi.d     S6,     S6,    0x10
+    addi.d     S7,     S7,    0x10
+    addi.d     S8,     S8,    0x10
+    addi.d     P4,     P4,    0x80   /* 8 lines * 2 elements * 8 bytes */
+
+.L_N1: /* if (N & 1): 1 element (two 32-bit floats) from each of the 8 lines */
+    andi       I,      N,     0x01
+    beq        ZERO,   I,     .L_N0
+
+    fld.s      F0,     S1,    0x00
+    fld.s      F1,     S1,    0x04
+
+    fld.s      F2,     S2,    0x00
+    fld.s      F3,     S2,    0x04
+
+    fld.s      F4,     S3,    0x00
+    fld.s      F5,     S3,    0x04
+
+    fld.s      F6,     S4,    0x00
+    fld.s      F7,     S4,    0x04
+
+    fld.s      F8,     S5,    0x00
+    fld.s      F9,     S5,    0x04
+
+    fld.s      F10,     S6,    0x00
+    fld.s      F11,     S6,    0x04
+
+    fld.s      F12,     S7,    0x00
+    fld.s      F13,     S7,    0x04
+
+    fld.s      F14,     S8,    0x00
+    fld.s      F15,     S8,    0x04
+
+    fst.s      F0,     P5,    0x00
+    fst.s      F1,     P5,    0x04
+    fst.s      F2,     P5,    0x08
+    fst.s      F3,     P5,    0x0c
+    fst.s      F4,     P5,    0x10
+    fst.s      F5,     P5,    0x14
+    fst.s      F6,     P5,    0x18
+    fst.s      F7,     P5,    0x1c
+    fst.s      F8,     P5,    0x20
+    fst.s      F9,     P5,    0x24
+    fst.s      F10,     P5,    0x28
+    fst.s      F11,     P5,    0x2c
+    fst.s      F12,     P5,    0x30
+    fst.s      F13,     P5,    0x34
+    fst.s      F14,     P5,    0x38
+    fst.s      F15,     P5,    0x3c
+
+    addi.d     S1,     S1,    0x08
+    addi.d     S2,     S2,    0x08
+    addi.d     S3,     S3,    0x08
+    addi.d     S4,     S4,    0x08
+    addi.d     S5,     S5,    0x08
+    addi.d     S6,     S6,    0x08
+    addi.d     S7,     S7,    0x08
+    addi.d     S8,     S8,    0x08
+    addi.d     P5,     P5,    0x40   /* 8 lines * 1 element * 8 bytes */
+
+.L_N0: /* J was already decremented at the top of .L_J1 */
+    blt        ZERO,   J,     .L_J1
+
+.L_M7: /* if (M & 4): one group of 4 source lines */
+    andi       J,      M,     0x04
+    beq        ZERO,   J,     .L_M3
+
+    move       S1,     S0
+    add.d      S2,     S0,    TL
+    add.d      S3,     S1,    T0
+    add.d      S4,     S2,    T0
+    add.d      S0,     S3,    T0     /* S0 advances by 4 lines */
+
+    move       P1,     P0
+    addi.d     P0,     P0,    0x200  /* 4 lines * 16 elements * 8 bytes */
+
+    srai.d     I,      N,     0x04
+    beq        ZERO,   I,     .L_4N15
+
+.L_4I1: /* I-- : copy 16 elements from each of the 4 lines */
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S1,    0x20
+    xvld       U2,     S1,    0x40
+    xvld       U3,     S1,    0x60
+    xvld       U4,     S2,    0x00
+    xvld       U5,     S2,    0x20
+    xvld       U6,     S2,    0x40
+    xvld       U7,     S2,    0x60
+
+    xvst       U0,     P1,    0x00
+    xvst       U1,     P1,    0x20
+    xvst       U2,     P1,    0x40
+    xvst       U3,     P1,    0x60
+    xvst       U4,     P1,    0x80
+    xvst       U5,     P1,    0xA0
+    xvst       U6,     P1,    0xC0
+    xvst       U7,     P1,    0xE0
+
+    xvld       U0,     S3,    0x00
+    xvld       U1,     S3,    0x20
+    xvld       U2,     S3,    0x40
+    xvld       U3,     S3,    0x60
+    xvld       U4,     S4,    0x00
+    xvld       U5,     S4,    0x20
+    xvld       U6,     S4,    0x40
+    xvld       U7,     S4,    0x60
+
+    xvst       U0,     P1,    0x100
+    xvst       U1,     P1,    0x120
+    xvst       U2,     P1,    0x140
+    xvst       U3,     P1,    0x160
+    xvst       U4,     P1,    0x180
+    xvst       U5,     P1,    0x1A0
+    xvst       U6,     P1,    0x1C0
+    xvst       U7,     P1,    0x1E0
+
+    addi.d     S1,     S1,    0x80
+    addi.d     S2,     S2,    0x80
+    addi.d     S3,     S3,    0x80
+    addi.d     S4,     S4,    0x80
+    addi.d     I,      I,     -1
+    add.d      P1,     P1,    T1     /* same line group, next 16-wide panel */
+    blt        ZERO,   I,     .L_4I1
+
+.L_4N15: /* if (N & 8), 4 lines */
+    andi       I,      N,     0x08
+    beq        ZERO,   I,     .L_4N7
+
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S1,    0x20
+    xvld       U2,     S2,    0x00
+    xvld       U3,     S2,    0x20
+    xvld       U4,     S3,    0x00
+    xvld       U5,     S3,    0x20
+    xvld       U6,     S4,    0x00
+    xvld       U7,     S4,    0x20
+
+    xvst       U0,     P2,    0x00
+    xvst       U1,     P2,    0x20
+    xvst       U2,     P2,    0x40
+    xvst       U3,     P2,    0x60
+    xvst       U4,     P2,    0x80
+    xvst       U5,     P2,    0xA0
+    xvst       U6,     P2,    0xC0
+    xvst       U7,     P2,    0xE0
+
+    addi.d     S1,     S1,    0x40
+    addi.d     S2,     S2,    0x40
+    addi.d     S3,     S3,    0x40
+    addi.d     S4,     S4,    0x40
+    addi.d     P2,     P2,    0x100  /* 4 lines * 8 elements * 8 bytes */
+
+.L_4N7: /* if (N & 4), 4 lines */
+    andi       I,      N,     0x04
+    beq        ZERO,   I,     .L_4N3
+
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S2,    0x00
+    xvld       U2,     S3,    0x00
+    xvld       U3,     S4,    0x00
+
+    xvst       U0,     P3,    0x00
+    xvst       U1,     P3,    0x20
+    xvst       U2,     P3,    0x40
+    xvst       U3,     P3,    0x60
+
+    addi.d     S1,     S1,    0x20
+    addi.d     S2,     S2,    0x20
+    addi.d     S3,     S3,    0x20
+    addi.d     S4,     S4,    0x20
+    addi.d     P3,     P3,    0x80   /* 4 lines * 4 elements * 8 bytes */
+
+.L_4N3: /* if (N & 2), 4 lines */
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_4N1
+
+    vld        $vr0,   S1,    0x00   /* 128-bit loads: only 16 bytes per line belong to this tail */
+    vld        $vr1,   S2,    0x00
+    vld        $vr2,   S3,    0x00
+    vld        $vr3,   S4,    0x00
+
+    xvpermi.q  U0,     U1,    0x02   /* U0 = { low128(U0), low128(U1) } */
+    xvpermi.q  U2,     U3,    0x02
+
+    xvst       U0,     P4,    0x00
+    xvst       U2,     P4,    0x20
+
+    addi.d     S1,     S1,    0x10
+    addi.d     S2,     S2,    0x10
+    addi.d     S3,     S3,    0x10
+    addi.d     S4,     S4,    0x10
+    addi.d     P4,     P4,    0x40   /* 4 lines * 2 elements * 8 bytes */
+
+.L_4N1: /* if (N & 1), 4 lines: each element moved as one 8-byte fld.d/fst.d */
+    andi        I,      N,     0x01
+    beq         ZERO,   I,     .L_M3
+
+    fld.d      F0,     S1,    0x00
+    fld.d      F1,     S2,    0x00
+    fld.d      F2,     S3,    0x00
+    fld.d      F3,     S4,    0x00
+
+    fst.d      F0,     P5,    0x00
+    fst.d      F1,     P5,    0x08
+    fst.d      F2,     P5,    0x10
+    fst.d      F3,     P5,    0x18
+
+    addi.d     S1,     S1,    0x08
+    addi.d     S2,     S2,    0x08
+    addi.d     S3,     S3,    0x08
+    addi.d     S4,     S4,    0x08
+    addi.d     P5,     P5,    0x20   /* 4 lines * 1 element * 8 bytes */
+
+.L_M3: /* if (M & 2): one group of 2 source lines */
+    andi       J,      M,     0x02
+    beq        ZERO,   J,     .L_M1
+
+    move       S1,     S0
+    add.d      S2,     S0,    TL
+    add.d      S0,     S0,    T0     /* S0 advances by 2 lines */
+
+    move       P1,     P0
+    addi.d     P0,     P0,    0x100  /* 2 lines * 16 elements * 8 bytes */
+
+    srai.d     I,      N,     0x04
+    beq        ZERO,   I,     .L_2N15
+
+.L_2I1: /* I-- : copy 16 elements from each of the 2 lines */
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S1,    0x20
+    xvld       U2,     S1,    0x40
+    xvld       U3,     S1,    0x60
+    xvld       U4,     S2,    0x00
+    xvld       U5,     S2,    0x20
+    xvld       U6,     S2,    0x40
+    xvld       U7,     S2,    0x60
+
+    xvst       U0,     P1,    0x00
+    xvst       U1,     P1,    0x20
+    xvst       U2,     P1,    0x40
+    xvst       U3,     P1,    0x60
+    xvst       U4,     P1,    0x80
+    xvst       U5,     P1,    0xA0
+    xvst       U6,     P1,    0xC0
+    xvst       U7,     P1,    0xE0
+
+    addi.d     S1,     S1,    0x80
+    addi.d     S2,     S2,    0x80
+    addi.d     I,      I,     -1
+    add.d      P1,     P1,    T1     /* same line group, next 16-wide panel */
+    blt        ZERO,   I,     .L_2I1
+
+.L_2N15: /* if (N & 8), 2 lines */
+    andi       I,      N,     0x08
+    beq        ZERO,   I,     .L_2N7
+
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S1,    0x20
+    xvld       U2,     S2,    0x00
+    xvld       U3,     S2,    0x20
+
+    xvst       U0,     P2,    0x00
+    xvst       U1,     P2,    0x20
+    xvst       U2,     P2,    0x40
+    xvst       U3,     P2,    0x60
+
+    addi.d     S1,     S1,    0x40
+    addi.d     S2,     S2,    0x40
+    addi.d     P2,     P2,    0x80   /* 2 lines * 8 elements * 8 bytes */
+
+.L_2N7: /* if (N & 4), 2 lines */
+    andi       I,      N,     0x04
+    beq        ZERO,   I,     .L_2N3
+
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S2,    0x00
+
+    xvst       U0,     P3,    0x00
+    xvst       U1,     P3,    0x20
+
+    addi.d     S1,     S1,    0x20
+    addi.d     S2,     S2,    0x20
+    addi.d     P3,     P3,    0x40   /* 2 lines * 4 elements * 8 bytes */
+
+.L_2N3: /* if (N & 2), 2 lines */
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_2N1
+
+    vld        $vr0,   S1,    0x00   /* 128-bit loads: only 16 bytes per line belong to this tail */
+    vld        $vr1,   S2,    0x00
+
+    xvpermi.q  U0,     U1,    0x02   /* U0 = { low128(U0), low128(U1) } */
+
+    xvst       U0,     P4,    0x00
+
+    addi.d     S1,     S1,    0x10
+    addi.d     S2,     S2,    0x10
+    addi.d     P4,     P4,    0x20   /* 2 lines * 2 elements * 8 bytes */
+
+.L_2N1: /* if (N & 1), 2 lines */
+    andi       I,      N,     0x01
+    beq        ZERO,   I,     .L_M1
+
+    fld.d      F0,     S1,    0x00
+    fld.d      F1,     S2,    0x00
+
+    fst.d      F0,     P5,    0x00
+    fst.d      F1,     P5,    0x08
+
+    addi.d     S1,     S1,    0x08
+    addi.d     S2,     S2,    0x08
+    addi.d     P5,     P5,    0x10   /* 2 lines * 1 element * 8 bytes */
+
+.L_M1: /* if (M & 1): the final single source line */
+    andi       J,      M,     0x01
+    beq        ZERO,   J,     .L_M0
+
+    move       S1,     S0
+    add.d      S2,     S0,    TL     /* S2 is not read anywhere in the single-line path below */
+
+    move       P1,     P0
+    addi.d     P0,     P0,    0x80
+
+    srai.d     I,      N,     0x04
+    beq        ZERO,   I,     .L_1N15
+
+.L_1I1: /* I-- : copy 16 elements from the single line */
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S1,    0x20
+    xvld       U2,     S1,    0x40
+    xvld       U3,     S1,    0x60
+
+    xvst       U0,     P1,    0x00
+    xvst       U1,     P1,    0x20
+    xvst       U2,     P1,    0x40
+    xvst       U3,     P1,    0x60
+
+    addi.d     S1,     S1,    0x80
+    addi.d     I,      I,     -1
+    add.d      P1,     P1,    T1     /* next 16-wide panel */
+    blt        ZERO,   I,     .L_1I1
+
+.L_1N15: /* if (N & 8), 1 line */
+    andi       I,      N,     0x08
+    beq        ZERO,   I,     .L_1N7
+
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S1,    0x20
+
+    xvst       U0,     P2,    0x00
+    xvst       U1,     P2,    0x20
+
+    addi.d     S1,     S1,    0x40
+    addi.d     P2,     P2,    0x40
+
+.L_1N7: /* if (N & 4), 1 line */
+    andi       I,      N,     0x04
+    beq        ZERO,   I,     .L_1N3
+
+    xvld       U0,     S1,    0x00
+
+    xvst       U0,     P3,    0x00
+
+    addi.d     S1,     S1,    0x20
+    addi.d     P3,     P3,    0x20
+
+.L_1N3: /* if (N & 2), 1 line: two 8-byte elements */
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_1N1
+
+    fld.d      F0,     S1,    0x00
+    fld.d      F1,     S1,    0x08
+
+    fst.d      F0,     P4,    0x00
+    fst.d      F1,     P4,    0x08
+
+    addi.d     S1,     S1,    0x10
+    addi.d     P4,     P4,    0x10
+
+.L_1N1: /* if (N & 1), 1 line: the last element */
+    andi       I,      N,     0x01
+    beq        ZERO,   I,     .L_M0
+
+    fld.d      F0,     S1,    0x00
+
+    fst.d      F0,     P5,    0x00
+
+    addi.d     S1,     S1,    0x08
+    addi.d     P5,     P5,    0x08
+
+.L_M0: /* reload $r23-$r29, release the 56-byte frame and return through $r1 */
+    LDARG      $r23,   $sp,   0
+    LDARG      $r24,   $sp,   8
+    LDARG      $r25,   $sp,   16
+    LDARG      $r26,   $sp,   24
+    LDARG      $r27,   $sp,   32
+    LDARG      $r28,   $sp,   40
+    LDARG      $r29,   $sp,   48
+    addi.d     $sp,    $sp,   56
+    jirl       $r0,    $r1,   0x00
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/cgemm_tcopy_2_lasx.S b/kernel/loongarch64/cgemm_tcopy_2_lasx.S
new file mode 100644
index 000000000..e2245e412
--- /dev/null
+++ b/kernel/loongarch64/cgemm_tcopy_2_lasx.S
@@ -0,0 +1,218 @@
+/*******************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+/* tcopy packing kernel: copies M source lines of N 8-byte elements from SRC into DST as 2-wide panels plus a 1-wide tail. */
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4    // param 1: m (number of source lines; consumed 2 then 1 at a time)
+#define N      $r5    // param 2: n (elements per source line; consumed 4, 2, 1 at a time)
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda (in elements; converted to bytes in place, see TL)
+#define DST    $r8    // param 5: dst
+
+#define I      $r9    // inner loop counter, also scratch for bit tests
+#define J      $r10   // outer loop counter (pairs of source lines)
+#define S1     $r12   // read cursor, first line of the pair
+#define S2     $r13   // read cursor, second line of the pair
+#define S3     $r14   // not referenced by the code in this file
+#define S4     $r15   // not referenced by the code in this file
+#define TD     $r16   // DST position of the current line pair inside the first 2-wide panel
+#define TS     $r17   // first source line not yet processed
+#define TL     $r7    // byte stride between source lines; shares the register of LDA
+#define T0     $r18   // scratch
+#define S8     $r19   // write cursor inside the 2-wide panels
+#define S9     $r20   // write cursor for the 1-wide tail panel (N & 1)
+#define S10    $r23   // not referenced by the code in this file ($r23 is still saved/restored)
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LASX vectors (256-bit); only U0-U2 are referenced below */
+#define U0     $xr0
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define D0     $xr8
+#define D1     $xr9
+#define D2     $xr10
+#define D3     $xr11
+#define D4     $xr12
+#define D5     $xr13
+#define D6     $xr14
+#define D7     $xr15
+
+// PROLOGUE, EPILOGUE, SDARG and LDARG are macros expected from common.h.
+    PROLOGUE
+
+    addi.d     $sp,  $sp,  -8       // 8-byte frame holding $r23
+    SDARG      $r23, $sp,  0
+
+    move       TS,     SRC          //aoffset
+    move       TD,     DST          //boffset
+    slli.d     TL,     LDA,   0x02  //lda
+    slli.d     TL,     TL,    0x01  // TL = lda * 8 bytes in total
+
+    ori        T0,     ZERO,  0x01
+    andn       T0,     N,     T0    // T0 = N & ~1
+    mul.d      T0,     M,     T0
+    slli.d     T0,     T0,    0x01
+    slli.d     T0,     T0,    0x02  // T0 = M * (N & ~1) * 8 bytes
+    add.d      S9,     DST,   T0  //boffset2
+
+    srai.d     J,      M,     0x01  //j
+
+    beq        J,      ZERO,  .L_M1
+
+.L_J1:  /* if(j>0) j--*/
+    move       S1,     TS     //aoffset1
+    slli.d     T0,     TL,    0x01  // T0 = 2 * TL
+    add.d      S2,     S1,    TL   //aoffset2
+    add.d      TS,     TS,    T0    // TS advances by 2 lines
+
+    move       S8,     TD     //boffset1
+    addi.d     TD,     TD,    0x20  // 2 lines * 2 elements * 8 bytes
+
+    srai.d     I,      N,     0x02  // I = number of 4-element chunks
+    beq        ZERO,   I,     .L_JN1
+
+.L_JI1:  /* if(i>0) i--*/
+    xvld       U0,   S1,   0x00     // U0 and U1 both hold the same 32 bytes of line 1
+    xvld       U1,   S1,   0x00
+    xvld       U2,   S2,   0x00     // 32 bytes of line 2
+
+    xvpermi.q  U0,     U2,   0x02   // U0 = { low128(U0), low128(U2) }
+    xvpermi.q  U2,     U1,   0x31   // U2 = { high128(U1), high128(U2) }
+
+    xvst       U0,   S8,   0x00
+
+    slli.d     T0,   M,    0x04     // T0 = M * 2 elements * 8 bytes = size of one 2-wide panel
+    add.d      S8,   S8,   T0
+
+    xvst       U2,   S8,   0x00
+
+    add.d      S8,   S8,   T0
+    addi.d     S1,   S1,   0x20
+    addi.d     S2,   S2,   0x20
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_JI1
+
+.L_JN1:  /* if(n&2) */
+    andi       I,    N,    0x02
+    beq        ZERO, I,    .L_JN2
+
+    vld        $vr0,   S1,   0x00   // 128-bit loads: 2 elements from each line
+    vld        $vr1,   S2,   0x00
+
+    vst        $vr0,   S8,   0x00
+    vst        $vr1,   S8,   0x10
+
+    addi.d     S1,   S1,   0x10
+    addi.d     S2,   S2,   0x10
+
+.L_JN2:  /* if(n&1) */
+    andi       I,    N,    0x01
+    beq        ZERO, I,    .L_J0
+
+    fld.s      F0,   S1,   0x00     // one element = two 32-bit floats per line
+    fld.s      F1,   S1,   0x04
+    fld.s      F2,   S2,   0x00
+    fld.s      F3,   S2,   0x04
+
+    fst.s      F0,   S9,   0x00
+    fst.s      F1,   S9,   0x04
+    fst.s      F2,   S9,   0x08
+    fst.s      F3,   S9,   0x0c
+
+    addi.d     S9,   S9,   0x10
+
+.L_J0:
+    addi.d     J,    J,    -1
+    blt        ZERO, J,   .L_J1
+
+.L_M1:  /* if(m&1) */
+    andi       I,      M,    0x01
+    beq        ZERO,   I,    .L_M0
+
+    srai.d     I,      N,     0x02
+    beq        ZERO,   I,     .L_M1N1
+
+.L_M1I1:  /* if(i>0) */
+    vld        $vr0,     TS,    0x00  // 4 elements of the last line, as two 128-bit halves
+    vld        $vr1,     TS,    0x10
+
+    vst       $vr0,     TD,    0x00
+
+    slli.d     T0,     M,     0x04   // T0 = size of one 2-wide panel in bytes
+    add.d      TD,     TD,    T0
+
+    vst       $vr1,     TD,    0x00
+
+    add.d      TD,     TD,    T0
+    addi.d     TS,     TS,    0x20
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_M1I1
+
+.L_M1N1:  /* if(n&2) */
+    andi       I,      N,    0x02
+    beq        ZERO,   I,    .L_M1N2
+
+    vld        $vr0,     TS,    0x00
+
+    vst        $vr0,     TD,    0x00
+
+    addi.d     TS,     TS,    0x10
+
+.L_M1N2:  /* if(n&1) */
+    andi       I,      N,    0x01
+    beq        ZERO,   I,    .L_M0
+
+    fld.s      F0,     TS,    0x00
+    fld.s      F1,     TS,    0x04
+
+    fst.s      F0,     S9,    0x00
+    fst.s      F1,     S9,    0x04
+
+.L_M0:
+    LDARG     $r23,  $sp, 0
+    addi.d    $sp,   $sp, 8
+    jirl      $r0,   $r1, 0x00     // return through $r1
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/cgemm_tcopy_2_lsx.S b/kernel/loongarch64/cgemm_tcopy_2_lsx.S
new file mode 100644
index 000000000..15c0fde8f
--- /dev/null
+++ b/kernel/loongarch64/cgemm_tcopy_2_lsx.S
@@ -0,0 +1,218 @@
+/*******************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+/* tcopy packing kernel: copies M source lines of N 8-byte elements from SRC into DST as 2-wide panels plus a 1-wide tail. */
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4    // param 1: m (number of source lines; consumed 2 then 1 at a time)
+#define N      $r5    // param 2: n (elements per source line; consumed 4, 2, 1 at a time)
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda (in elements; converted to bytes in place, see TL)
+#define DST    $r8    // param 5: dst
+
+#define I      $r9    // inner loop counter, also scratch for bit tests
+#define J      $r10   // outer loop counter (pairs of source lines)
+#define S1     $r12   // read cursor, first line of the pair
+#define S2     $r13   // read cursor, second line of the pair
+#define S3     $r14   // not referenced by the code in this file
+#define S4     $r15   // not referenced by the code in this file
+#define TD     $r16   // DST position of the current line pair inside the first 2-wide panel
+#define TS     $r17   // first source line not yet processed
+#define TL     $r7    // byte stride between source lines; shares the register of LDA
+#define T0     $r18   // scratch
+#define S8     $r19   // write cursor inside the 2-wide panels
+#define S9     $r20   // write cursor for the 1-wide tail panel (N & 1)
+#define S10    $r23   // not referenced by the code in this file ($r23 is still saved/restored)
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LSX vectors (128-bit); only U0-U3 are referenced below */
+#define U0     $vr0
+#define U1     $vr1
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define D0     $vr8
+#define D1     $vr9
+#define D2     $vr10
+#define D3     $vr11
+#define D4     $vr12
+#define D5     $vr13
+#define D6     $vr14
+#define D7     $vr15
+
+// PROLOGUE, EPILOGUE, SDARG and LDARG are macros expected from common.h.
+    PROLOGUE
+
+    addi.d     $sp,  $sp,  -8       // 8-byte frame holding $r23
+    SDARG      $r23, $sp,  0
+
+    move       TS,     SRC          //aoffset
+    move       TD,     DST          //boffset
+    slli.d     TL,     LDA,   0x02  //lda
+    slli.d     TL,     TL,    0x01  // TL = lda * 8 bytes in total
+
+    ori        T0,     ZERO,  0x01
+    andn       T0,     N,     T0    // T0 = N & ~1
+    mul.d      T0,     M,     T0
+    slli.d     T0,     T0,    0x01
+    slli.d     T0,     T0,    0x02  // T0 = M * (N & ~1) * 8 bytes
+    add.d      S9,     DST,   T0  //boffset2
+
+    srai.d     J,      M,     0x01  //j
+
+    beq        J,      ZERO,  .L_M1
+
+.L_J1:  /* if(j>0) j--*/
+    move       S1,     TS     //aoffset1
+    slli.d     T0,     TL,    0x01  // T0 = 2 * TL
+    add.d      S2,     S1,    TL   //aoffset2
+    add.d      TS,     TS,    T0    // TS advances by 2 lines
+
+    move       S8,     TD     //boffset1
+    addi.d     TD,     TD,    0x20  // 2 lines * 2 elements * 8 bytes
+
+    srai.d     I,      N,     0x02  // I = number of 4-element chunks
+    beq        ZERO,   I,     .L_JN1
+
+.L_JI1:  /* if(i>0) i--*/
+    vld       U0,   S1,   0x00      // line 1, elements 0-1
+    vld       U1,   S1,   0x10      // line 1, elements 2-3
+    vld       U2,   S2,   0x00      // line 2, elements 0-1
+    vld       U3,   S2,   0x10      // line 2, elements 2-3
+
+    vst       U0,   S8,   0x00
+    vst       U2,   S8,   0x10
+
+    slli.d     T0,   M,    0x04     // T0 = M * 2 elements * 8 bytes = size of one 2-wide panel
+    add.d      S8,   S8,   T0
+
+    vst       U1,   S8,   0x00
+    vst       U3,   S8,   0x10
+
+    add.d      S8,   S8,   T0
+    addi.d     S1,   S1,   0x20
+    addi.d     S2,   S2,   0x20
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_JI1
+
+.L_JN1:  /* if(n&2) */
+    andi       I,    N,    0x02
+    beq        ZERO, I,    .L_JN2
+
+    vld       U0,   S1,   0x00      // 2 elements from each line
+    vld       U1,   S2,   0x00
+
+    vst       U0,   S8,   0x00
+    vst       U1,   S8,   0x10
+
+    addi.d     S1,   S1,   0x10
+    addi.d     S2,   S2,   0x10
+
+.L_JN2:  /* if(n&1) */
+    andi       I,    N,    0x01
+    beq        ZERO, I,    .L_J0
+
+    fld.s      F0,   S1,   0x00     // one element = two 32-bit floats per line
+    fld.s      F1,   S1,   0x04
+    fld.s      F2,   S2,   0x00
+    fld.s      F3,   S2,   0x04
+
+    fst.s      F0,   S9,   0x00
+    fst.s      F1,   S9,   0x04
+    fst.s      F2,   S9,   0x08
+    fst.s      F3,   S9,   0x0c
+
+    addi.d     S9,   S9,   0x10
+
+.L_J0:
+    addi.d     J,    J,    -1
+    blt        ZERO, J,   .L_J1
+
+.L_M1:  /* if(m&1) */
+    andi       I,      M,    0x01
+    beq        ZERO,   I,    .L_M0
+
+    srai.d     I,      N,     0x02
+    beq        ZERO,   I,     .L_M1N1
+
+.L_M1I1:  /* if(i>0) */
+    vld       U0,     TS,    0x00    // 4 elements of the last line, as two 128-bit halves
+    vld       U1,     TS,    0x10
+
+    vst       U0,     TD,    0x00
+
+    slli.d     T0,     M,     0x04   // T0 = size of one 2-wide panel in bytes
+    add.d      TD,     TD,    T0
+
+    vst       U1,     TD,    0x00
+
+    add.d      TD,     TD,    T0
+    addi.d     TS,     TS,    0x20
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_M1I1
+
+.L_M1N1:  /* if(n&2) */
+    andi       I,      N,    0x02
+    beq        ZERO,   I,    .L_M1N2
+
+    vld       U0,     TS,    0x00
+
+    vst       U0,     TD,    0x00
+
+    addi.d     TS,     TS,    0x10
+
+.L_M1N2:  /* if(n&1) */
+    andi       I,      N,    0x01
+    beq        ZERO,   I,    .L_M0
+
+    fld.s      F0,     TS,    0x00
+    fld.s      F1,     TS,    0x04
+
+    fst.s      F0,     S9,    0x00
+    fst.s      F1,     S9,    0x04
+
+.L_M0:
+    LDARG     $r23,  $sp, 0
+    addi.d    $sp,   $sp, 8
+    jirl      $r0,   $r1, 0x00     // return through $r1
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/cgemm_tcopy_4_lasx.S b/kernel/loongarch64/cgemm_tcopy_4_lasx.S
new file mode 100644
index 000000000..9ff8a35b8
--- /dev/null
+++ b/kernel/loongarch64/cgemm_tcopy_4_lasx.S
@@ -0,0 +1,306 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9    // inner trip count (N >> 2); also scratch for the M/N bit tests
+#define J      $r10   // outer trip count (M >> 2)
+#define S1     $r12   // source row cursor: aoffset1
+#define S2     $r13   // source row cursor: aoffset1 + TL
+#define S3     $r14   // source row cursor: aoffset1 + 2*TL
+#define S4     $r15   // source row cursor: aoffset1 + 3*TL
+#define TD     $r16   // boffset
+#define TS     $r17   // aoffset
+#define TL     $r18   // LDA << 3
+#define T0     $r19   // scratch
+#define S8     $r20   // boffset1
+#define S9     $r23   // boffset2; $r23 is saved on entry and reloaded at .L_M0
+#define S10    $r11   // boffset3
+#define ZERO   $r0
+
+#define F0     $f0    // F0-F7: scalar staging for the (n & 1) tails
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LASX vectors */
+#define U0     $xr0   // U0-U3 carry the 32-byte copies of the main loops
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4   // U4-U15 are not referenced in this file
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define U8     $xr8
+#define U9     $xr9
+#define U10    $xr10
+#define U11    $xr11
+#define U12    $xr12
+#define U13    $xr13
+#define U14    $xr14
+#define U15    $xr15
+
+
+    PROLOGUE   /* cgemm tcopy, unroll 4, LASX: copy M rows x N 8-byte elements of SRC into the blocked layout at DST */
+
+    addi.d     $sp,  $sp,  -8
+    SDARG      $r23, $sp,  0        // spill $r23 (used as S9); reloaded at .L_M0
+
+    move       TS,     SRC          //aoffset
+    move       TD,     DST          //boffset
+    slli.d     TL,     LDA,   0x02  //lda
+    slli.d     TL,     TL,    0x01  //lda: TL = LDA * 8, the row stride in bytes
+
+    ori        T0,     ZERO,  0x03
+    andn       T0,     N,     T0    // T0 = N & ~3
+    mul.d      T0,     M,     T0    // full 64-bit product; M and N are treated as 64-bit everywhere else
+    slli.d     T0,     T0,    0x01
+    slli.d     T0,     T0,    0x02  // T0 = M * (N & ~3) * 8 bytes
+    add.d      S9,     DST,   T0  //boffset2: destination of the (n & 2) column pairs
+
+    ori        T0,     ZERO,  0x01
+    andn       T0,     N,     T0    // T0 = N & ~1
+    mul.d      T0,     M,     T0    // full 64-bit product, as above
+    slli.d     T0,     T0,    0x01
+    slli.d     T0,     T0,    0x02  // T0 = M * (N & ~1) * 8 bytes
+    add.d      S10,    DST,   T0  //boffset3: destination of the (n & 1) last column
+
+    srai.d     J,      M,     0x02  //j = number of 4-row groups
+
+    beq        J,      ZERO,  .L_M1
+
+.L_J1:  /* if(j>0) j--*/
+    move       S1,     TS     //aoffset1
+    add.d      S2,     S1,    TL    // rows 2..4 follow at one row stride each
+    add.d      S3,     S2,    TL
+    add.d      S4,     S3,    TL
+
+    slli.d     T0,     TL,    0x02
+    add.d      TS,     TS,    T0    // aoffset += 4 rows
+
+    move       S8,     TD     //boffset1
+    addi.d     TD,     TD,    0x80  // boffset += 4 rows * 32 bytes
+
+    srai.d     I,      N,     0x02  // i = number of 4-element column groups
+
+    beq        ZERO,   I,     .L_JN1
+
+.L_JI1:  /* if(i>0) i--*/
+    xvld       U0,   S1,   0x00     // 32 bytes (4 elements) from each of the 4 rows
+    xvld       U1,   S2,   0x00
+    xvld       U2,   S3,   0x00
+    xvld       U3,   S4,   0x00
+
+    xvst       U0,   S8,   0x00     // stored back to back: 128 bytes at boffset1
+    xvst       U1,   S8,   0x20
+    xvst       U2,   S8,   0x40
+    xvst       U3,   S8,   0x60
+
+    addi.d     S1,   S1,   0x20
+    addi.d     S2,   S2,   0x20
+    addi.d     S3,   S3,   0x20
+    addi.d     S4,   S4,   0x20
+    slli.d     T0,   M,    0x05
+    add.d      S8,   S8,   T0       // boffset1 += M * 32 bytes (next column group)
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_JI1
+
+.L_JN1:  /* if(n&2) */
+    andi       I,    N,    0x02
+    beq        ZERO, I,    .L_JN2
+
+    vld       $vr0,   S1,   0x00    // 16 bytes (2 elements) from each of the 4 rows
+    vld       $vr1,   S2,   0x00
+    vld       $vr2,   S3,   0x00
+    vld       $vr3,   S4,   0x00
+
+    vst       $vr0,   S9,   0x00
+    vst       $vr1,   S9,   0x10
+    vst       $vr2,   S9,   0x20
+    vst       $vr3,   S9,   0x30
+
+    addi.d     S1,   S1,   0x10
+    addi.d     S2,   S2,   0x10
+    addi.d     S3,   S3,   0x10
+    addi.d     S4,   S4,   0x10
+    addi.d     S9,   S9,   0x40     // boffset2 += 64 bytes
+
+.L_JN2:  /* if(n&1) */
+    andi       I,    N,    0x01
+    beq        ZERO, I,    .L_J0
+
+    fld.s      F0,   S1,   0x00     // two floats (one 8-byte element) from each row
+    fld.s      F1,   S1,   0x04
+
+    fld.s      F2,   S2,   0x00
+    fld.s      F3,   S2,   0x04
+
+    fld.s      F4,   S3,   0x00
+    fld.s      F5,   S3,   0x04
+
+    fld.s      F6,   S4,   0x00
+    fld.s      F7,   S4,   0x04
+
+    fst.s      F0,   S10,   0x00
+    fst.s      F1,   S10,   0x04
+    fst.s      F2,   S10,   0x08
+    fst.s      F3,   S10,   0x0c
+    fst.s      F4,   S10,   0x10
+    fst.s      F5,   S10,   0x14
+    fst.s      F6,   S10,   0x18
+    fst.s      F7,   S10,   0x1c
+
+    addi.d     S10,  S10,   0x20    // boffset3 += 32 bytes
+
+.L_J0:
+    addi.d     J,      J,     -1
+    blt        ZERO, J,   .L_J1
+
+.L_M1:  /* if(m&2) */
+    andi       I,      M,    0x02
+    beq        ZERO,   I,    .L_M2
+
+    move       S1,     TS     //aoffset1
+    add.d      S2,     S1,    TL
+
+    slli.d     T0,     TL,    0x01
+    add.d      TS,     TS,    T0    // aoffset += 2 rows
+
+    move       S8,     TD     //boffset1
+    addi.d     TD,     TD,    0x40  // boffset += 2 rows * 32 bytes
+
+    srai.d     I,      N,     0x02
+    beq        ZERO,   I,     .L_M1N1
+
+.L_M1I1:  /* if(i>0) */
+    xvld       U0,     S1,    0x00  // 32 bytes from each of the 2 rows
+    xvld       U1,     S2,    0x00
+
+    xvst       U0,     S8,    0x00
+    xvst       U1,     S8,    0x20
+
+    addi.d     S1,     S1,    0x20
+    addi.d     S2,     S2,    0x20
+    slli.d     T0,     M,     0x05
+    add.d      S8,     S8,    T0    // boffset1 += M * 32 bytes
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_M1I1
+
+.L_M1N1:  /* if(n&2) */
+    andi       I,      N,    0x02
+    beq        ZERO,   I,    .L_M1N2
+
+    vld       $vr0,     S1,    0x00 // 16 bytes from each of the 2 rows
+    vld       $vr1,     S2,    0x00
+
+    vst       $vr0,     S9,    0x00
+    vst       $vr1,     S9,    0x10
+
+    addi.d     S1,     S1,    0x10
+    addi.d     S2,     S2,    0x10
+    addi.d     S9,     S9,    0x20  // boffset2 += 32 bytes
+
+.L_M1N2:  /* if(n&1) */
+    andi       I,      N,    0x01
+    beq        ZERO,   I,    .L_M2
+
+    fld.s      F0,      S1,    0x00
+    fld.s      F1,      S1,    0x04
+
+    fld.s      F2,      S2,    0x00
+    fld.s      F3,      S2,    0x04
+
+    fst.s      F0,      S10,   0x00
+    fst.s      F1,      S10,   0x04
+    fst.s      F2,      S10,   0x08
+    fst.s      F3,      S10,   0x0c
+
+    addi.d     S10,    S10,   0x10  // boffset3 += 16 bytes
+
+.L_M2:  /* if(m&1) */
+    andi       I,      M,    0x01
+    beq        ZERO,   I,    .L_M0
+
+    move       S1,     TS     //aoffset1
+    move       S8,     TD     //boffset1
+
+    srai.d     I,      N,     0x02
+    beq        ZERO,   I,     .L_M2N1
+
+.L_M2I1:  /* if(i>0) */
+    xvld       U0,     S1,    0x00  // 32 bytes from the single remaining row
+
+    xvst       U0,     S8,    0x00
+
+    addi.d     S1,     S1,    0x20
+    slli.d     T0,     M,     0x05
+    add.d      S8,     S8,    T0    // boffset1 += M * 32 bytes
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_M2I1
+
+.L_M2N1:  /* if(n&2) */
+    andi       I,      N,    0x02
+    beq        ZERO,   I,    .L_M2N2
+
+    vld       $vr0,     S1,    0x00
+
+    vst       $vr0,     S9,    0x00 // last write through boffset2, so no pointer update
+
+    addi.d     S1,     S1,    0x10
+
+.L_M2N2:  /* if(n&1) */
+    andi       I,      N,    0x01
+    beq        ZERO,   I,    .L_M0
+
+    fld.s      F0,     S1,    0x00
+    fld.s      F1,     S1,    0x04
+
+    fst.s      F0,     S10,    0x00
+    fst.s      F1,     S10,    0x04
+
+.L_M0:
+    LDARG     $r23,  $sp, 0         // reload the $r23 value spilled on entry
+    addi.d    $sp,   $sp, 8
+    jirl      $r0,   $r1, 0x00      // return through $r1
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/cgemm_tcopy_4_lsx.S b/kernel/loongarch64/cgemm_tcopy_4_lsx.S
new file mode 100644
index 000000000..6d63d62e7
--- /dev/null
+++ b/kernel/loongarch64/cgemm_tcopy_4_lsx.S
@@ -0,0 +1,324 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9    // inner trip count (N >> 2); also scratch for the M/N bit tests
+#define J      $r10   // outer trip count (M >> 2)
+#define S1     $r12   // source row cursor: aoffset1
+#define S2     $r13   // source row cursor: aoffset1 + TL
+#define S3     $r14   // source row cursor: aoffset1 + 2*TL
+#define S4     $r15   // source row cursor: aoffset1 + 3*TL
+#define TD     $r16   // boffset
+#define TS     $r17   // aoffset
+#define TL     $r18   // LDA << 3
+#define T0     $r19   // scratch
+#define S8     $r20   // boffset1
+#define S9     $r23   // boffset2; $r23 is saved on entry and reloaded at .L_M0
+#define S10    $r11   // boffset3
+#define ZERO   $r0
+
+#define F0     $f0    // F0-F7: scalar staging for the (n & 1) tails
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LSX vectors */
+#define U0     $vr0   // U0-U7 carry the 16-byte copies of the main loops
+#define U1     $vr1
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define U8     $vr8   // U8-U15 are not referenced in this file
+#define U9     $vr9
+#define U10    $vr10
+#define U11    $vr11
+#define U12    $vr12
+#define U13    $vr13
+#define U14    $vr14
+#define U15    $vr15
+
+
+    PROLOGUE   /* cgemm tcopy, unroll 4, LSX: copy M rows x N 8-byte elements of SRC into the blocked layout at DST */
+
+    addi.d     $sp,  $sp,  -8
+    SDARG      $r23, $sp,  0        // spill $r23 (used as S9); reloaded at .L_M0
+
+    move       TS,     SRC          //aoffset
+    move       TD,     DST          //boffset
+    slli.d     TL,     LDA,   0x02  //lda
+    slli.d     TL,     TL,    0x01  //lda: TL = LDA * 8, the row stride in bytes
+
+    ori        T0,     ZERO,  0x03
+    andn       T0,     N,     T0    // T0 = N & ~3
+    mul.d      T0,     M,     T0    // full 64-bit product; M and N are treated as 64-bit everywhere else
+    slli.d     T0,     T0,    0x01
+    slli.d     T0,     T0,    0x02  // T0 = M * (N & ~3) * 8 bytes
+    add.d      S9,     DST,   T0  //boffset2: destination of the (n & 2) column pairs
+
+    ori        T0,     ZERO,  0x01
+    andn       T0,     N,     T0    // T0 = N & ~1
+    mul.d      T0,     M,     T0    // full 64-bit product, as above
+    slli.d     T0,     T0,    0x01
+    slli.d     T0,     T0,    0x02  // T0 = M * (N & ~1) * 8 bytes
+    add.d      S10,    DST,   T0  //boffset3: destination of the (n & 1) last column
+
+    srai.d     J,      M,     0x02  //j = number of 4-row groups
+
+    beq        J,      ZERO,  .L_M1
+
+.L_J1:  /* if(j>0) j--*/
+    move       S1,     TS     //aoffset1
+    add.d      S2,     S1,    TL    // rows 2..4 follow at one row stride each
+    add.d      S3,     S2,    TL
+    add.d      S4,     S3,    TL
+
+    slli.d     T0,     TL,    0x02
+    add.d      TS,     TS,    T0    // aoffset += 4 rows
+
+    move       S8,     TD     //boffset1
+    addi.d     TD,     TD,    0x80  // boffset += 4 rows * 32 bytes
+
+    srai.d     I,      N,     0x02  // i = number of 4-element column groups
+
+    beq        ZERO,   I,     .L_JN1
+
+.L_JI1:  /* if(i>0) i--*/
+    vld       U0,   S1,   0x00      // 2 x 16 bytes (4 elements) from each of the 4 rows
+    vld       U1,   S1,   0x10
+
+    vld       U2,   S2,   0x00
+    vld       U3,   S2,   0x10
+
+    vld       U4,   S3,   0x00
+    vld       U5,   S3,   0x10
+
+    vld       U6,   S4,   0x00
+    vld       U7,   S4,   0x10
+
+    vst       U0,   S8,   0x00      // stored back to back: 128 bytes at boffset1
+    vst       U1,   S8,   0x10
+    vst       U2,   S8,   0x20
+    vst       U3,   S8,   0x30
+    vst       U4,   S8,   0x40
+    vst       U5,   S8,   0x50
+    vst       U6,   S8,   0x60
+    vst       U7,   S8,   0x70
+
+    addi.d     S1,   S1,   0x20
+    addi.d     S2,   S2,   0x20
+    addi.d     S3,   S3,   0x20
+    addi.d     S4,   S4,   0x20
+    slli.d     T0,   M,    0x05
+    add.d      S8,   S8,   T0       // boffset1 += M * 32 bytes (next column group)
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_JI1
+
+.L_JN1:  /* if(n&2) */
+    andi       I,    N,    0x02
+    beq        ZERO, I,    .L_JN2
+
+    vld       U0,   S1,   0x00      // 16 bytes (2 elements) from each of the 4 rows
+    vld       U1,   S2,   0x00
+    vld       U2,   S3,   0x00
+    vld       U3,   S4,   0x00
+
+    vst       U0,   S9,   0x00
+    vst       U1,   S9,   0x10
+    vst       U2,   S9,   0x20
+    vst       U3,   S9,   0x30
+
+    addi.d     S1,   S1,   0x10
+    addi.d     S2,   S2,   0x10
+    addi.d     S3,   S3,   0x10
+    addi.d     S4,   S4,   0x10
+    addi.d     S9,   S9,   0x40     // boffset2 += 64 bytes
+
+.L_JN2:  /* if(n&1) */
+    andi       I,    N,    0x01
+    beq        ZERO, I,    .L_J0
+
+    fld.s      F0,   S1,   0x00     // two floats (one 8-byte element) from each row
+    fld.s      F1,   S1,   0x04
+
+    fld.s      F2,   S2,   0x00
+    fld.s      F3,   S2,   0x04
+
+    fld.s      F4,   S3,   0x00
+    fld.s      F5,   S3,   0x04
+
+    fld.s      F6,   S4,   0x00
+    fld.s      F7,   S4,   0x04
+
+    fst.s      F0,   S10,   0x00
+    fst.s      F1,   S10,   0x04
+    fst.s      F2,   S10,   0x08
+    fst.s      F3,   S10,   0x0c
+    fst.s      F4,   S10,   0x10
+    fst.s      F5,   S10,   0x14
+    fst.s      F6,   S10,   0x18
+    fst.s      F7,   S10,   0x1c
+
+    addi.d     S10,  S10,   0x20    // boffset3 += 32 bytes
+
+.L_J0:
+    addi.d     J,      J,     -1
+    blt        ZERO, J,   .L_J1
+
+.L_M1:  /* if(m&2) */
+    andi       I,      M,    0x02
+    beq        ZERO,   I,    .L_M2
+
+    move       S1,     TS     //aoffset1
+    add.d      S2,     S1,    TL
+
+    slli.d     T0,     TL,    0x01
+    add.d      TS,     TS,    T0    // aoffset += 2 rows
+
+    move       S8,     TD     //boffset1
+    addi.d     TD,     TD,    0x40  // boffset += 2 rows * 32 bytes
+
+    srai.d     I,      N,     0x02
+    beq        ZERO,   I,     .L_M1N1
+
+.L_M1I1:  /* if(i>0) */
+    vld       U0,     S1,    0x00   // 2 x 16 bytes from each of the 2 rows
+    vld       U1,     S1,    0x10
+
+    vld       U2,     S2,    0x00
+    vld       U3,     S2,    0x10
+
+    vst       U0,     S8,    0x00
+    vst       U1,     S8,    0x10
+    vst       U2,     S8,    0x20
+    vst       U3,     S8,    0x30
+
+    addi.d     S1,     S1,    0x20
+    addi.d     S2,     S2,    0x20
+    slli.d     T0,     M,     0x05
+    add.d      S8,     S8,    T0    // boffset1 += M * 32 bytes
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_M1I1
+
+.L_M1N1:  /* if(n&2) */
+    andi       I,      N,    0x02
+    beq        ZERO,   I,    .L_M1N2
+
+    vld       U0,     S1,    0x00   // 16 bytes from each of the 2 rows
+    vld       U1,     S2,    0x00
+
+    vst       U0,     S9,    0x00
+    vst       U1,     S9,    0x10
+
+    addi.d     S1,     S1,    0x10
+    addi.d     S2,     S2,    0x10
+    addi.d     S9,     S9,    0x20  // boffset2 += 32 bytes
+
+.L_M1N2:  /* if(n&1) */
+    andi       I,      N,    0x01
+    beq        ZERO,   I,    .L_M2
+
+    fld.s      F0,      S1,    0x00
+    fld.s      F1,      S1,    0x04
+
+    fld.s      F2,      S2,    0x00
+    fld.s      F3,      S2,    0x04
+
+    fst.s      F0,      S10,   0x00
+    fst.s      F1,      S10,   0x04
+    fst.s      F2,      S10,   0x08
+    fst.s      F3,      S10,   0x0c
+
+    addi.d     S10,    S10,   0x10  // boffset3 += 16 bytes
+
+.L_M2:  /* if(m&1) */
+    andi       I,      M,    0x01
+    beq        ZERO,   I,    .L_M0
+
+    move       S1,     TS     //aoffset1
+    move       S8,     TD     //boffset1
+
+    srai.d     I,      N,     0x02
+    beq        ZERO,   I,     .L_M2N1
+
+.L_M2I1:  /* if(i>0) */
+    vld       U0,     S1,    0x00   // 2 x 16 bytes from the single remaining row
+    vld       U1,     S1,    0x10
+
+    vst       U0,     S8,    0x00
+    vst       U1,     S8,    0x10
+
+    addi.d     S1,     S1,    0x20
+    slli.d     T0,     M,     0x05
+    add.d      S8,     S8,    T0    // boffset1 += M * 32 bytes
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_M2I1
+
+.L_M2N1:  /* if(n&2) */
+    andi       I,      N,    0x02
+    beq        ZERO,   I,    .L_M2N2
+
+    vld       U0,     S1,    0x00
+
+    vst       U0,     S9,    0x00   // last write through boffset2, so no pointer update
+
+    addi.d     S1,     S1,    0x10
+
+.L_M2N2:  /* if(n&1) */
+    andi       I,      N,    0x01
+    beq        ZERO,   I,    .L_M0
+
+    fld.s      F0,     S1,    0x00
+    fld.s      F1,     S1,    0x04
+
+    fst.s      F0,     S10,    0x00
+    fst.s      F1,     S10,    0x04
+
+.L_M0:
+    LDARG     $r23,  $sp, 0         // reload the $r23 value spilled on entry
+    addi.d    $sp,   $sp, 8
+    jirl      $r0,   $r1, 0x00      // return through $r1
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/cgemm_tcopy_8_lsx.S b/kernel/loongarch64/cgemm_tcopy_8_lsx.S
new file mode 100644
index 000000000..2935bbc07
--- /dev/null
+++ b/kernel/loongarch64/cgemm_tcopy_8_lsx.S
@@ -0,0 +1,277 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9    // inner trip count (M >> 1); also scratch for the M/N bit tests
+#define J      $r10   // outer trip count (N >> 3)
+#define S1     $r12   // source row cursor: aoffset1
+#define S2     $r13   // source row cursor: aoffset1 + TL
+#define S3     $r14   // S3-S8 are not referenced in this file
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define S8     $r19
+#define TD     $r20   // boffset
+#define TS     $r11   // aoffset
+#define TL     $r7    // same register as LDA: scaled in place to LDA << 3
+#define T0     $r23   // 2 * TL; $r23 is saved on entry and reloaded at .L_N0
+#define ZERO   $r0
+
+#define F0     $f0    // F0-F3: scalar staging for the (n & 1) column
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4    // F4-F7 are not referenced in this file
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LSX vectors (128-bit $vr registers) */
+#define U0     $vr0
+#define U1     $vr1
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define D0     $vr8   // D0-D7 are not referenced in this file
+#define D1     $vr9
+#define D2     $vr10
+#define D3     $vr11
+#define D4     $vr12
+#define D5     $vr13
+#define D6     $vr14
+#define D7     $vr15
+
+
+    PROLOGUE   /* cgemm tcopy, unroll 8, LSX: copy M rows x N 8-byte elements of SRC to DST, written sequentially */
+
+    addi.d     $sp,  $sp,  -8
+    SDARG      $r23, $sp,  0        // spill $r23 (used as T0); reloaded at .L_N0
+
+    move       TS,     SRC          //aoffset
+    move       TD,     DST          //boffset
+    slli.d     TL,     LDA,   0x02  //lda
+    slli.d     TL,     TL,    0x01  // TL = LDA * 8, the row stride in bytes
+
+    srai.d     J,      N,     0x03  //j = number of 8-element column groups
+
+    beq        J,      ZERO,  .L_N1
+
+.L_J1:  /* if(j>0) j--*/
+    move       S1,     TS     //aoffset1
+    slli.d     T0,     TL,    0x01  //2*lda
+    add.d      S2,     TS,    TL    // aoffset2 = the row after aoffset1
+    addi.d     TS,     TS,    0x40  // aoffset += 8 elements (64 bytes)
+
+    srai.d     I,      M,     0x01  // i = number of row pairs
+    beq        ZERO,   I,     .L_J1M1
+
+.L_J1I1:  /* if(i>0) i--*/
+    vld       U0,     S1,    0x00   // 64 bytes from the first row of the pair
+    vld       U1,     S1,    0x10
+    vld       U2,     S1,    0x20
+    vld       U3,     S1,    0x30
+
+    vld       U4,     S2,    0x00   // 64 bytes from the second row of the pair
+    vld       U5,     S2,    0x10
+    vld       U6,     S2,    0x20
+    vld       U7,     S2,    0x30
+
+    vst       U0,     TD,    0x00
+    vst       U1,     TD,    0x10
+    vst       U2,     TD,    0x20
+    vst       U3,     TD,    0x30
+    vst       U4,     TD,    0x40
+    vst       U5,     TD,    0x50
+    vst       U6,     TD,    0x60
+    vst       U7,     TD,    0x70
+
+    add.d      S1,     S1,    T0    // both cursors move down two rows
+    add.d      S2,     S2,    T0
+    addi.d     TD,     TD,    0x80
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_J1I1
+
+.L_J1M1:  /* if(m&1) */
+    andi       I,      M,     0x01
+    beq        ZERO,   I,     .L_J0
+
+    vld       U0,     S1,    0x00   // odd last row: 64 bytes
+    vld       U1,     S1,    0x10
+    vld       U2,     S1,    0x20
+    vld       U3,     S1,    0x30
+
+    vst       U0,     TD,    0x00
+    vst       U1,     TD,    0x10
+    vst       U2,     TD,    0x20
+    vst       U3,     TD,    0x30
+
+    addi.d     TD,     TD,    0x40
+
+.L_J0:
+    addi.d     J,      J,     -1
+    blt        ZERO,   J,     .L_J1
+
+.L_N1:  /* if(n&4) */
+    andi       I,      N,     0x04
+    beq        ZERO,   I,     .L_N2
+
+    move       S1,     TS     //aoffset1
+    slli.d     T0,     TL,    0x01  //2*lda
+    add.d      S2,     TS,    TL
+    addi.d     TS,     TS,    0x20  // aoffset += 4 elements (32 bytes)
+
+    srai.d     I,      M,     0x01
+    beq        ZERO,   I,     .L_N1M1
+
+.L_N1I1:   /* if(i>0) i-- */
+    vld       U0,     S1,    0x00   // 32 bytes from each row of the pair
+    vld       U1,     S1,    0x10
+
+    vld       U2,     S2,    0x00
+    vld       U3,     S2,    0x10
+
+    vst       U0,     TD,    0x00
+    vst       U1,     TD,    0x10
+    vst       U2,     TD,    0x20
+    vst       U3,     TD,    0x30
+
+    add.d      S1,     S1,    T0
+    add.d      S2,     S2,    T0
+    addi.d     TD,     TD,    0x40
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_N1I1
+
+.L_N1M1:  /* if(m&1) */
+    andi       I,      M,     0x01
+    beq        ZERO,   I,     .L_N2
+
+    vld       U0,     S1,    0x00   // odd last row: 32 bytes
+    vld       U1,     S1,    0x10
+
+    vst       U0,     TD,    0x00
+    vst       U1,     TD,    0x10
+
+    addi.d     TD,     TD,    0x20
+
+.L_N2:  /* if(n&2) */
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_N3
+
+    move       S1,     TS     //aoffset1
+    slli.d     T0,     TL,    0x01  //2*lda
+    add.d      S2,     TS,    TL
+    addi.d     TS,     TS,    0x10  // aoffset += 2 elements (16 bytes)
+
+    srai.d     I,      M,     0x01
+    beq        ZERO,   I,     .L_N2M1
+
+.L_N2I1:  /* if(i>0) i-- */
+    vld       U0,     S1,    0x00   // 16 bytes from each row of the pair
+    vld       U1,     S2,    0x00
+
+    vst       U0,     TD,    0x00
+    vst       U1,     TD,    0x10
+
+    add.d      S1,     S1,    T0
+    add.d      S2,     S2,    T0
+
+    addi.d     TD,     TD,    0x20
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_N2I1
+
+.L_N2M1:   /* if(m&1) */
+    andi       I,      M,     0x01
+    beq        ZERO,   I,     .L_N3
+
+    vld       U0,     S1,    0x00   // odd last row: 16 bytes
+
+    vst       U0,     TD,    0x00
+
+    addi.d     TD,     TD,    0x10
+
+.L_N3:   /* if(n&1) */
+    andi       I,      N,     0x01
+    beq        ZERO,   I,     .L_N0
+
+    move       S1,     TS     //aoffset1
+    slli.d     T0,     TL,    0x01  //2*lda
+    add.d      S2,     TS,    TL
+
+    srai.d     I,      M,     0x01
+    beq        ZERO,   I,     .L_N3M1
+
+.L_N3I1:  /* if(i>0) i-- */
+    fld.s      F0,     S1,    0x00  // two floats (one 8-byte element) from each row of the pair
+    fld.s      F1,     S1,    0x04
+
+    fld.s      F2,     S2,    0x00
+    fld.s      F3,     S2,    0x04
+
+    fst.s      F0,     TD,    0x00
+    fst.s      F1,     TD,    0x04
+    fst.s      F2,     TD,    0x08
+    fst.s      F3,     TD,    0x0c
+
+    add.d      S1,     S1,    T0
+    add.d      S2,     S2,    T0
+    addi.d     TD,     TD,    0x10
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_N3I1
+
+.L_N3M1:  /* if(m&1) */
+    andi       I,      M,     0x01
+    beq        ZERO,   I,     .L_N0
+
+    fld.s      F0,     S1,    0x00  // odd last row: one 8-byte element
+    fld.s      F1,     S1,    0x04
+
+    fst.s      F0,     TD,    0x00
+    fst.s      F1,     TD,    0x04
+
+.L_N0:
+    LDARG      $r23,   $sp,   0     // reload the $r23 value spilled on entry
+    addi.d     $sp,    $sp,   8
+    jirl       $r0,    $r1,   0x00  // return through $r1
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/cgemv_n_4_lsx.S b/kernel/loongarch64/cgemv_n_4_lsx.S
new file mode 100644
index 000000000..cf8273797
--- /dev/null
+++ b/kernel/loongarch64/cgemv_n_4_lsx.S
@@ -0,0 +1,323 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
+ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+ */
+#define M       $r4
+#define N       $r5
+#define ALPHA_R $f0
+#define ALPHA_I $f1
+#define A       $r7
+#define LDA     $r8
+#define X       $r9
+#define INC_X   $r10
+#define Y       $r11
+#define INC_Y   $r6    // NOTE(review): $r6 is the slot of dummy1 in the signature above; presumably inc_y is loaded into it by setup code outside this view - confirm
+
+#define J       $r12   // column-loop counter in CGEMV_N_LSX
+#define I       $r13   // row-loop counter in CGEMV_N_LSX
+#define K       $r14   // cleared per column block and bumped per processed row in CGEMV_N_LSX
+#define Y_ORG   $r15   // Y is reset from this at the start of every column block
+#define OFFSET  $r16
+#define K_LDA   $r17   // (LDA << 2) - M8, added to PA0..PA3 after each 4-column block
+#define M8      $r18   // NOTE(review): presumably the byte length of one column (m * 8); set outside this view - confirm
+#define T0      $r19   // scratch pointer for the strided x/y accesses
+#define PA0     $r20   // PA0..PA3: cursors into four columns of A
+#define PA1     $r23   // NOTE(review): $r23-$r29 are callee-saved in the LoongArch psABI; their save/restore is not visible in this chunk - confirm
+#define PA2     $r24
+#define PA3     $r25
+#define PA4     $r26
+#define PA5     $r27
+#define PA6     $r28
+#define PA7     $r29
+
+#define VALPHA  $vr1   // shares its low lanes with $f1 (ALPHA_I)
+#define X0      $vr2
+#define X1      $vr3
+#define X2      $vr4
+#define X3      $vr5
+#define X4      $vr6
+#define X5      $vr7
+#define X6      $vr8
+#define X7      $vr9
+#define Y0      $vr10  // low 64 bits are $f10, which CLOAD_Y_1 / CSTORE_Y_1 access directly
+#define Y1      $vr11  // low 64 bits are $f11 (see CLOAD_Y_4_GAP)
+#define A0      $vr12  // low 64 bits are $f12; A1/$f13, A2/$f14, ... are paired the same way in the macros below
+#define A1      $vr13
+#define A2      $vr14
+#define A3      $vr15
+#define A4      $vr16
+#define A5      $vr17
+#define A6      $vr18
+#define A7      $vr19
+#define A8      $vr20
+#define A9      $vr21
+#define A10     $vr22
+#define A11     $vr23
+#define A12     $vr24
+#define A13     $vr25
+#define A14     $vr26
+#define A15     $vr27
+#define TMP0    $vr28  // TMP0..TMP2: scratch handed to GCOMPLEXMUL / GCOMPLEXMADD
+#define TMP1    $vr29
+#define TMP2    $vr30
+
+/*
+ * Variant selectors passed to GCOMPLEXMUL / GCOMPLEXMADD below.
+ *   GXCONJ is 1 when XCONJ is defined, otherwise 0.
+ *   GCONJ  is 1 when CONJ  is defined, otherwise 0.
+ * The two settings are independent of each other.
+ */
+#if defined(XCONJ)
+#define GXCONJ 1
+#else
+#define GXCONJ 0
+#endif
+
+#if defined(CONJ)
+#define GCONJ  1
+#else
+#define GCONJ  0
+#endif
+
+.macro CLOAD_X_4    /* unit-stride x: fill X0..X3 from X+0x00/0x08/0x10/0x18, then run each through GCOMPLEXMUL together with VALPHA */
+    GLDREPL v, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18  // NOTE(review): GLDREPL is not defined in this file; presumably a replicating load like the vldrepl.d in CLOAD_X_4_GAP - confirm
+    GCOMPLEXMUL GXCONJ, \
+    vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
+    X1, VALPHA, X1, TMP0, TMP1, TMP2, \
+    X2, VALPHA, X2, TMP0, TMP1, TMP2, \
+    X3, VALPHA, X3, TMP0, TMP1, TMP2
+.endm
+
+.macro CLOAD_X_4_GAP    /* strided x: same result registers as CLOAD_X_4, but the four elements are INC_X bytes apart */
+    vldrepl.d   X0,     X,      0x00    // 8 bytes at X, replicated into both 64-bit lanes
+    PTR_ADD     T0,     X,      INC_X   // T0 walks the next three elements; X itself is left unchanged
+    vldrepl.d   X1,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    vldrepl.d   X2,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    vldrepl.d   X3,     T0,     0x00
+
+    GCOMPLEXMUL GXCONJ, \
+    vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
+    X1, VALPHA, X1, TMP0, TMP1, TMP2, \
+    X2, VALPHA, X2, TMP0, TMP1, TMP2, \
+    X3, VALPHA, X3, TMP0, TMP1, TMP2
+.endm
+
+.macro CLOAD_X_1    /* single x element: fill X0 from X+0, then run it through GCOMPLEXMUL together with VALPHA */
+    GLDREPL v, d, X0, X, 0x00    // NOTE(review): GLDREPL is not defined in this file; presumably a replicating load - confirm
+    GCOMPLEXMUL GXCONJ, \
+    vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2
+.endm
+
+.macro CLOAD_Y_4    /* unit-stride y: Y0 from Y+0 and Y1 from Y+0x10 through the GLD helper (defined outside this file) */
+    GLD v, , Y0, Y, 0, Y1, Y, 0x10
+.endm
+
+.macro CLOAD_Y_4_GAP    /* strided y: gather four 8-byte elements, INC_Y bytes apart, into Y0 (elements 0,1) and Y1 (elements 2,3) */
+    fld.d   $f10,   Y,  0               // element 0 -> low 64 bits of Y0 ($vr10)
+    fldx.d  $f13,   Y,  INC_Y           // element 1 -> low 64 bits of A1 ($vr13), used as a temporary
+    PTR_ALSL  T0,   INC_Y,  Y,  1       // T0 = Y + 2 * INC_Y
+    fld.d   $f11,   T0, 0               // element 2 -> low 64 bits of Y1 ($vr11)
+    fldx.d  $f17,   T0, INC_Y           // element 3 -> low 64 bits of A5 ($vr17), used as a temporary
+    vpackev.d Y0, A1, Y0                // Y0 = { Y0[0], A1[0] }
+    vpackev.d Y1, A5, Y1                // Y1 = { Y1[0], A5[0] }
+.endm
+
+.macro CLOAD_Y_1    /* load one 8-byte y element from Y into $f10, the low 64 bits of Y0 */
+    fld.d  $f10,   Y,  0
+.endm
+
+.macro CSTORE_Y_4    /* unit-stride y: write Y0 to Y+0 and Y1 to Y+0x10 through the GST helper (defined outside this file) */
+    GST v, , Y0, Y, 0, Y1, Y, 0x10
+.endm
+
+.macro CSTORE_Y_4_GAP    /* strided y: scatter the four 64-bit lanes of Y0/Y1 to addresses INC_Y bytes apart, starting at Y */
+    vstelm.d    Y0,     Y,      0,      0   // lane 0 of Y0 -> element 0
+    PTR_ADD     T0,     Y,      INC_Y       // T0 walks the remaining elements; Y itself is left unchanged
+    vstelm.d    Y0,     T0,     0,      1   // lane 1 of Y0 -> element 1
+    PTR_ADD     T0,     T0,     INC_Y
+    vstelm.d    Y1,     T0,     0,      0   // lane 0 of Y1 -> element 2
+    PTR_ADD     T0,     T0,     INC_Y
+    vstelm.d    Y1,     T0,     0,      1   // lane 1 of Y1 -> element 3
+.endm
+
+.macro CSTORE_Y_1    /* store $f10, the low 64 bits of Y0, as one 8-byte y element at Y */
+    fst.d   $f10,   Y,  0
+.endm
+
+.macro CGEMV_N_4x4    /* 4 rows x 4 columns: fetch A0..A7 through PA0..PA3 and accumulate them with X0..X3 into Y0/Y1 */
+    GLD_INC v, , 0x10,        \
+    A0,  PA0, 0, A1,  PA0, 0, \
+    A2,  PA1, 0, A3,  PA1, 0, \
+    A4,  PA2, 0, A5,  PA2, 0, \
+    A6,  PA3, 0, A7,  PA3, 0
+
+    GCOMPLEXMADD GXCONJ, GCONJ, \
+    vf, s, Y0, X0, A0,  Y0, TMP0, TMP1, TMP2, Y1, X0, A1,  Y1, TMP0, TMP1, TMP2, \
+    Y0, X1, A2,  Y0, TMP0, TMP1, TMP2, Y1, X1, A3,  Y1, TMP0, TMP1, TMP2, \
+    Y0, X2, A4,  Y0, TMP0, TMP1, TMP2, Y1, X2, A5,  Y1, TMP0, TMP1, TMP2, \
+    Y0, X3, A6,  Y0, TMP0, TMP1, TMP2, Y1, X3, A7,  Y1, TMP0, TMP1, TMP2
+.endm    /* NOTE(review): GLD_INC and GCOMPLEXMADD are defined outside this file; GLD_INC presumably advances each pointer by 0x10 after every load - confirm */
+
+.macro CGEMV_N_1x4    /* 1 row x 4 columns: one 8-byte element per column pointer, accumulated with X0..X3 into Y0 */
+    GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0  // $f12/$f14/$f16/$f18 are the low 64 bits of A0/A2/A4/A6
+    GCOMPLEXMADD GXCONJ, GCONJ, \
+    vf, s, Y0, X0,  A0, Y0, TMP0, TMP1, TMP2, \
+    Y0, X1, A2,  Y0, TMP0, TMP1, TMP2, \
+    Y0, X2, A4,  Y0, TMP0, TMP1, TMP2, \
+    Y0, X3, A6,  Y0, TMP0, TMP1, TMP2
+.endm
+
+.macro CGEMV_N_1x1 /* Column remainder: one element of the single column at PA0, accumulated into Y0 against X0. */
+    fld.d   $f12,    PA0,    0 /* NOTE(review): $f12 is presumably the low half of A0 */
+    PTR_ADDI PA0,   PA0,    0x08 /* step PA0 to the next row (8 bytes per element) */
+    GCOMPLEXMADD GXCONJ, GCONJ, \
+    vf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2
+.endm
+
+.macro CGEMV_N_LSX XW:req, X_4:req, X_1:req, Y_4:req, Y_1:req /* Loop nest for one stride variant: XW tags the local labels; the other arguments are suffixes pasted after CLOAD_/CSTORE_ to pick the x/y access macros. */
+    PTR_SRLI  J,      N,      2 /* J = N / 4: number of 4-column panels */
+    beqz      J,      .L_\XW\()_N_3
+    PTR_SLLI  K_LDA,  LDA,    2
+    PTR_SUB   K_LDA,  K_LDA,  M8 /* K_LDA = 4 * LDA - M8: step from the end of one row sweep to the start of the next panel */
+.L_\XW\()_N_L4: /* per panel: load four x elements, then sweep every row of y */
+    CLOAD_\X_4
+    xor     K,      K,      K /* K = 0; K only counts rows, nothing in this chunk reads it back */
+    move    Y,      Y_ORG
+    PTR_SRLI  I,      M,       2 /* I = M / 4 row blocks */
+    beqz      I,      .L_\XW\()_M_3
+.align 5
+.L_\XW\()_M_L4: /* 4 rows x 4 columns per iteration: load y, accumulate, store y */
+    CLOAD_\Y_4
+    CGEMV_N_4x4
+    CSTORE_\Y_4
+    PTR_ADDI    I,      I,      -1
+    PTR_ALSL    Y,      INC_Y,  Y,  2 /* Y += 4 * INC_Y */
+    PTR_ADDI    K,      K,      4
+    bnez        I,      .L_\XW\()_M_L4
+.L_\XW\()_M_3:
+    andi        I,      M,      3 /* remaining M % 4 rows, one at a time */
+    beqz        I,      .L_\XW\()_M_END
+.align 5
+.L_\XW\()_M_L1:
+    CLOAD_\Y_1
+    CGEMV_N_1x4
+    CSTORE_\Y_1
+    PTR_ADDI    I,      I,      -1
+    PTR_ADD     Y,      Y,      INC_Y
+    PTR_ADDI    K,      K,      1
+    bnez        I,      .L_\XW\()_M_L1
+.L_\XW\()_M_END: /* panel finished: move the four column pointers and X to the next panel */
+    PTR_ADDI    J,      J,      -1
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#endif
+    PTR_ALSL    X,      INC_X,  X,  2 /* X += 4 * INC_X */
+    bnez        J,      .L_\XW\()_N_L4
+.L_\XW\()_N_3:
+    andi        J,      N,      3 /* remaining N % 4 columns, one at a time through PA0 */
+    beqz        J,      .L_END
+.L_\XW\()_N_L1:
+    CLOAD_\X_1
+    xor     K,      K,      K
+    move    Y,      Y_ORG
+    move    I,      M
+    beqz    I,      .L_END /* M == 0: nothing to update */
+.align 5
+.L_\XW\()_N_1_M_L1:
+    CLOAD_\Y_1
+    CGEMV_N_1x1
+    CSTORE_\Y_1
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   Y,      Y,      INC_Y
+    PTR_ADDI  K,      K,      1
+    bnez    I,      .L_\XW\()_N_1_M_L1
+.L_\XW\()_N_1_M_END:
+    PTR_ADDI    J,      J,      -1
+    PTR_SUB     K_LDA,  LDA,    M8 /* single-column step: LDA minus the M8 bytes already walked */
+    PTR_ADD     PA0,    PA0,    K_LDA
+    PTR_ADD     X,      X,      INC_X
+    bnez        J,      .L_\XW\()_N_L1
+
+    b .L_END /* never fall through into the next expansion of this macro */
+.endm
+
+    PROLOGUE
+    PTR_LD     INC_Y,  $sp,    0 /* inc_y is read from the stack, before push_if_used runs */
+    push_if_used 17 + 7, 31 /* NOTE(review): register-save helper from loongarch64_asm.S; the exact save set is not visible here */
+    PTR_ADDI   K,      $r0,     0x01 /* K = 1 */
+    PTR_SUB    I,      INC_X,   K
+    PTR_SUB    J,      INC_Y,   K
+    maskeqz    I,      K,       I  /* if(inc_x == 1) I = 0; else I = 1; */
+    maskeqz    J,      K,       J  /* if(inc_y == 1) j = 0; else j = 1; */
+    PTR_ALSL   I,      I,       J,      1 /* I = 2 * I + J: index into .L_GAP_TABLE */
+    GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 /* NOTE(review): presumably left shifts by 3, giving byte strides (8 bytes per element) and M8 = M * 8 */
+    // Init VALPHA
+    vpackev.w      $vr0,   $vr1,   $vr0 /* word 0 from $vr0, word 1 from $vr1; NOTE(review): assumes alpha_r/alpha_i arrive in $f0/$f1 */
+    vpackev.d      VALPHA, $vr0,   $vr0 /* both 64-bit lanes of VALPHA = low 64 bits of $vr0 */
+    move     Y_ORG,  Y /* keep the start of y; the kernel rewinds Y for every column panel */
+    move     PA0,    A
+#if __loongarch_grlen == 64
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
+#else
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
+#endif
+    la.local    T0,     .L_GAP_TABLE
+    PTR_ALSL    I,      I,      T0,     1 /* I = T0 + 2 * I: address of the 16-bit table entry */
+    ld.h        K,      I,      0 // Obtain the offset address
+    PTR_ADD     T0,     T0,     K
+    jirl        $r0,    T0,     0 /* indirect jump, no link: dispatch to the selected variant */
+.L_GAP_TABLE: /* 16-bit offsets of the four variants, relative to this label */
+    .hword  .L_GAP_0_0 - .L_GAP_TABLE
+    .hword  .L_GAP_0_1 - .L_GAP_TABLE
+    .hword  .L_GAP_1_0 - .L_GAP_TABLE
+    .hword  .L_GAP_1_1 - .L_GAP_TABLE
+.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */
+    CGEMV_N_LSX GAP_0_0, X_4, X_1, Y_4, Y_1
+.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */
+    CGEMV_N_LSX GAP_0_1, X_4, X_1, Y_4_GAP, Y_1
+.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */
+    CGEMV_N_LSX GAP_1_0, X_4_GAP, X_1, Y_4, Y_1
+.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
+    CGEMV_N_LSX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1
+.L_END: /* common exit: every variant ends by branching here */
+    pop_if_used 17 + 7, 31
+    jirl    $r0, $r1, 0x0 /* return through $r1 */
+    EPILOGUE
diff --git a/kernel/loongarch64/cgemv_n_8_lasx.S b/kernel/loongarch64/cgemv_n_8_lasx.S
new file mode 100644
index 000000000..ba38a9573
--- /dev/null
+++ b/kernel/loongarch64/cgemv_n_8_lasx.S
@@ -0,0 +1,383 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2024/02/20 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*
+*
+*********************************************************************/
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
+ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+ */
+#define M       $r4
+#define N       $r5
+#define ALPHA_R $f0
+#define ALPHA_I $f1
+#define A       $r7
+#define LDA     $r8
+#define X       $r9
+#define INC_X   $r10
+#define Y       $r11
+#define INC_Y   $r6 /* filled from the stack by the prologue (PTR_LD INC_Y, $sp, 0) */
+
+#define J       $r12 /* column-loop counter; scratch during stride dispatch */
+#define I       $r13 /* row-loop counter; jump-table index during stride dispatch */
+#define K       $r14 /* row count inside the loops (never read back); scratch in the prologue */
+#define Y_ORG   $r15 /* start of y, reloaded into Y for every column panel */
+#define OFFSET  $r16 /* not referenced anywhere else in this file */
+#define K_LDA   $r17 /* column-pointer step between panels */
+#define M8      $r18 /* set next to the strides by the prologue GSLLI (M, 3) */
+#define T0      $r19 /* scratch address register */
+#define PA0     $r20 /* PA0-PA7: pointers into eight consecutive columns of A */
+#define PA1     $r23
+#define PA2     $r24
+#define PA3     $r25
+#define PA4     $r26
+#define PA5     $r27
+#define PA6     $r28
+#define PA7     $r29
+
+#define VALPHA  $xr1 /* same register number as ALPHA_I; the prologue packs alpha into $xr0 before writing VALPHA */
+#define X0      $xr2 /* X0-X7: eight x elements, one per column of the current panel */
+#define X1      $xr3
+#define X2      $xr4
+#define X3      $xr5
+#define X4      $xr6
+#define X5      $xr7
+#define X6      $xr8
+#define X7      $xr9
+#define Y0      $xr10 /* Y0/Y1: y accumulators; the scalar paths reach Y0 through $f10 */
+#define Y1      $xr11
+#define A0      $xr12 /* A0-A15: loaded pieces of A; several also serve as gather temporaries in CLOAD_Y_8_GAP */
+#define A1      $xr13
+#define A2      $xr14
+#define A3      $xr15
+#define A4      $xr16
+#define A5      $xr17
+#define A6      $xr18
+#define A7      $xr19
+#define A8      $xr20
+#define A9      $xr21
+#define A10     $xr22
+#define A11     $xr23
+#define A12     $xr24
+#define A13     $xr25
+#define A14     $xr26
+#define A15     $xr27
+#define TMP0    $xr28 /* TMP0-TMP2: scratch handed to GCOMPLEXMUL / GCOMPLEXMADD */
+#define TMP1    $xr29
+#define TMP2    $xr30
+
+#if !defined(CONJ) /* net effect of this block: GXCONJ is 1 exactly when XCONJ is defined, GCONJ is 1 exactly when CONJ is defined */
+#if !defined(XCONJ)
+#define GXCONJ 0
+#define GCONJ  0
+#else
+#define GXCONJ 1
+#define GCONJ  0
+#endif
+#else
+#if !defined(XCONJ)
+#define GXCONJ 0
+#define GCONJ  1
+#else
+#define GXCONJ 1
+#define GCONJ  1
+#endif
+#endif /* both flags are passed as leading arguments to GCOMPLEXMUL / GCOMPLEXMADD below */
+
+.macro CLOAD_X_8 /* Unit-stride x: load eight 8-byte elements from X + 0x00..0x38 into X0-X7, then multiply each with VALPHA. */
+    GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \
+                   X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38
+    GCOMPLEXMUL GXCONJ, \
+    xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
+    X1, VALPHA, X1, TMP0, TMP1, TMP2, \
+    X2, VALPHA, X2, TMP0, TMP1, TMP2, \
+    X3, VALPHA, X3, TMP0, TMP1, TMP2, \
+    X4, VALPHA, X4, TMP0, TMP1, TMP2, \
+    X5, VALPHA, X5, TMP0, TMP1, TMP2, \
+    X6, VALPHA, X6, TMP0, TMP1, TMP2, \
+    X7, VALPHA, X7, TMP0, TMP1, TMP2
+.endm /* NOTE(review): GLDREPL presumably replicates each element across its vector (cf. xvldrepl.d in CLOAD_X_8_GAP); GCOMPLEXMUL presumably yields Xn = VALPHA * Xn - confirm in loongarch64_asm.S */
+
+.macro CLOAD_X_8_GAP /* Strided x: same result as CLOAD_X_8, but the eight elements are INC_X apart; X itself is left unchanged. */
+    xvldrepl.d  X0,     X,      0x00 /* 64-bit load replicated to every lane of X0 */
+    PTR_ADD     T0,     X,      INC_X /* T0 walks the strided elements */
+    xvldrepl.d  X1,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X2,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X3,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X4,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X5,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X6,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X7,     T0,     0x00
+    /* NOTE(review): GCOMPLEXMUL presumably yields Xn = VALPHA * Xn, GXCONJ choosing the conjugated form */
+    GCOMPLEXMUL GXCONJ, \
+    xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
+    X1, VALPHA, X1, TMP0, TMP1, TMP2, \
+    X2, VALPHA, X2, TMP0, TMP1, TMP2, \
+    X3, VALPHA, X3, TMP0, TMP1, TMP2, \
+    X4, VALPHA, X4, TMP0, TMP1, TMP2, \
+    X5, VALPHA, X5, TMP0, TMP1, TMP2, \
+    X6, VALPHA, X6, TMP0, TMP1, TMP2, \
+    X7, VALPHA, X7, TMP0, TMP1, TMP2
+.endm
+
+.macro CLOAD_Y_8 /* Unit-stride y: fill Y0 from Y + 0 and Y1 from Y + 0x20. */
+    GLD xv, , Y0, Y, 0, Y1, Y, 0x20 /* NOTE(review): GLD is a multi-load helper from loongarch64_asm.S, presumably one vector load per (reg, base, offset) triple */
+.endm
+
+.macro CLOAD_Y_8_GAP /* Strided y: gather eight 8-byte elements spaced INC_Y apart into Y0 (elements 0-3) and Y1 (elements 4-7). */
+    fld.d   $f10,   Y,  0 /* element 0 -> low 64 bits of Y0 ($xr10) */
+    fldx.d  $f13,   Y,  INC_Y /* element 1 -> A1 ($xr13), used as a temporary */
+    PTR_ALSL  T0,   INC_Y,  Y,  1 /* T0 = Y + 2 * INC_Y */
+    fld.d   $f14,   T0, 0 /* element 2 -> A2 */
+    fldx.d  $f15,   T0, INC_Y /* element 3 -> A3 */
+    PTR_ALSL  T0,   INC_Y,  Y,  2 /* T0 = Y + 4 * INC_Y */
+    fld.d   $f11,   T0, 0 /* element 4 -> low 64 bits of Y1 ($xr11) */
+    fldx.d  $f17,   T0, INC_Y /* element 5 -> A5 */
+    PTR_ADD   T0,   T0, INC_Y
+    PTR_ADD   T0,   T0, INC_Y /* T0 = Y + 6 * INC_Y */
+    fld.d   $f18,   T0, 0 /* element 6 -> A6 */
+    fldx.d  $f19,   T0, INC_Y /* element 7 -> A7 */
+    GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3, Y1, A5, 1, Y1, A6, 2, Y1, A7, 3 /* NOTE(review): presumably xvinsve0.d per (dst, src, lane) triple, i.e. lane 0 of src inserted into the given lane of dst */
+.endm
+
+.macro CSTORE_Y_8_GAP /* Strided y: scatter the eight 64-bit lanes of Y0/Y1 to addresses INC_Y apart; Y itself is left unchanged. */
+    xvstelm.d   Y0,     Y,      0,      0 /* lane 0 of Y0 -> Y */
+    PTR_ADD     T0,     Y,      INC_Y /* T0 walks the strided destinations */
+    xvstelm.d   Y0,     T0,     0,      1
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y0,     T0,     0,      2
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y0,     T0,     0,      3
+    /* second half: lanes 0-3 of Y1 go to Y + 4 * INC_Y .. Y + 7 * INC_Y */
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y1,     T0,     0,      0
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y1,     T0,     0,      1
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y1,     T0,     0,      2
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y1,     T0,     0,      3
+.endm
+
+.macro CGEMV_N_8x8 /* One 8-row x 8-column step: two vectors from each of PA0-PA7, accumulated into Y0/Y1 against X0-X7. */
+    GLD_INC xv, , 0x20,       \
+    A0,  PA0, 0, A1,  PA0, 0, \
+    A2,  PA1, 0, A3,  PA1, 0, \
+    A4,  PA2, 0, A5,  PA2, 0, \
+    A6,  PA3, 0, A7,  PA3, 0, \
+    A8,  PA4, 0, A9,  PA4, 0, \
+    A10, PA5, 0, A11, PA5, 0, \
+    A12, PA6, 0, A13, PA6, 0, \
+    A14, PA7, 0, A15, PA7, 0
+    /* NOTE(review): GLD_INC presumably post-increments the base by 0x20 after each load, so every PAx advances 0x40 bytes here - confirm in loongarch64_asm.S */
+    GCOMPLEXMADD GXCONJ, GCONJ, \
+    xvf, s, Y0, X0, A0,  Y0, TMP0, TMP1, TMP2, Y1, X0, A1,  Y1, TMP0, TMP1, TMP2, \
+    Y0, X1, A2,  Y0, TMP0, TMP1, TMP2, Y1, X1, A3,  Y1, TMP0, TMP1, TMP2, \
+    Y0, X2, A4,  Y0, TMP0, TMP1, TMP2, Y1, X2, A5,  Y1, TMP0, TMP1, TMP2, \
+    Y0, X3, A6,  Y0, TMP0, TMP1, TMP2, Y1, X3, A7,  Y1, TMP0, TMP1, TMP2, \
+    Y0, X4, A8,  Y0, TMP0, TMP1, TMP2, Y1, X4, A9,  Y1, TMP0, TMP1, TMP2, \
+    Y0, X5, A10, Y0, TMP0, TMP1, TMP2, Y1, X5, A11, Y1, TMP0, TMP1, TMP2, \
+    Y0, X6, A12, Y0, TMP0, TMP1, TMP2, Y1, X6, A13, Y1, TMP0, TMP1, TMP2, \
+    Y0, X7, A14, Y0, TMP0, TMP1, TMP2, Y1, X7, A15, Y1, TMP0, TMP1, TMP2
+.endm /* NOTE(review): each GCOMPLEXMADD group (out, a, b, c, tmp0-2) presumably computes out = a * b + c as a complex multiply-add, with GXCONJ/GCONJ picking the conjugation */
+
+.macro CSTORE_Y_8 /* Unit-stride y: write Y0 to Y + 0 and Y1 to Y + 0x20 (counterpart of CLOAD_Y_8). */
+    GST xv, , Y0, Y, 0, Y1, Y, 0x20 /* NOTE(review): GST is a multi-store helper from loongarch64_asm.S */
+.endm
+
+.macro CLOAD_X_1 /* Load the single x element at X into X0 and multiply it with VALPHA (column-remainder path). */
+    GLDREPL xv, d, X0, X, 0x00 /* NOTE(review): presumably xvldrepl.d, as written out in CLOAD_X_8_GAP */
+    GCOMPLEXMUL GXCONJ, \
+    xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2
+.endm /* NOTE(review): GCOMPLEXMUL presumably leaves X0 = VALPHA * X0 - confirm in loongarch64_asm.S */
+
+.macro CLOAD_Y_1 /* Load one 8-byte y element; shared by the unit-stride and strided variants. */
+    fld.d  $f10,   Y,  0 /* $f10 is the low 64 bits of Y0 ($xr10) */
+.endm
+
+.macro CGEMV_N_1x8 /* Row remainder: one 8-byte element from each of PA0-PA7 (into the low halves of A0, A2, ..., A14), accumulated into Y0 against X0-X7. */
+    GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0, \
+                        $f20, PA4, 0, $f22, PA5, 0, $f24, PA6, 0, $f26, PA7, 0
+    GCOMPLEXMADD GXCONJ, GCONJ, \
+    xvf, s, Y0, X0,  A0, Y0, TMP0, TMP1, TMP2, \
+    Y0, X1, A2,  Y0, TMP0, TMP1, TMP2, \
+    Y0, X2, A4,  Y0, TMP0, TMP1, TMP2, \
+    Y0, X3, A6,  Y0, TMP0, TMP1, TMP2, \
+    Y0, X4, A8,  Y0, TMP0, TMP1, TMP2, \
+    Y0, X5, A10, Y0, TMP0, TMP1, TMP2, \
+    Y0, X6, A12, Y0, TMP0, TMP1, TMP2, \
+    Y0, X7, A14, Y0, TMP0, TMP1, TMP2
+.endm /* NOTE(review): GLD_INC presumably advances every PAx by 8 bytes; only the 8 bytes later stored from $f10 by CSTORE_Y_1 are kept */
+
+.macro CSTORE_Y_1 /* Store the low 64 bits of Y0 ($f10) back to Y (counterpart of CLOAD_Y_1). */
+    fst.d   $f10,   Y,  0
+.endm
+
+.macro CGEMV_N_1x1 /* Column remainder: one element of the single column at PA0, accumulated into Y0 against X0. */
+    fld.d   $f12,    PA0,    0 /* $f12 is the low 64 bits of A0 ($xr12) */
+    PTR_ADDI PA0,   PA0,    0x08 /* step PA0 to the next row (8 bytes per element) */
+    GCOMPLEXMADD GXCONJ, GCONJ, \
+    xvf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2
+.endm
+
+.macro CGEMV_N_LASX XW:req, X_8:req, X_1:req, Y_8:req, Y_1:req /* Loop nest for one stride variant: XW tags the local labels; the other arguments are suffixes pasted after CLOAD_/CSTORE_ to pick the x/y access macros. */
+    PTR_SRLI  J,      N,      3 /* J = N / 8: number of 8-column panels */
+    beqz      J,      .L_\XW\()_N_7
+    PTR_SLLI  K_LDA,  LDA,    3
+    PTR_SUB   K_LDA,  K_LDA,  M8 /* K_LDA = 8 * LDA - M8: step from the end of one row sweep to the start of the next panel */
+.L_\XW\()_N_L8: /* per panel: load eight x elements, then sweep every row of y */
+    CLOAD_\X_8
+    xor     K,      K,      K /* K = 0; K only counts rows and is never read back in this file */
+    move    Y,      Y_ORG
+    PTR_SRLI  I,      M,       3 /* I = M / 8 row blocks */
+    beqz      I,      .L_\XW\()_M_7
+.align 5
+.L_\XW\()_M_L8: /* 8 rows x 8 columns per iteration: load y, accumulate, store y */
+    CLOAD_\Y_8
+    CGEMV_N_8x8
+    CSTORE_\Y_8
+    PTR_ADDI    I,      I,      -1
+    PTR_ALSL    Y,      INC_Y,  Y,  3 /* Y += 8 * INC_Y */
+    PTR_ADDI    K,      K,      8
+    bnez        I,      .L_\XW\()_M_L8
+.L_\XW\()_M_7:
+    andi        I,      M,      7 /* remaining M % 8 rows, one at a time */
+    beqz        I,      .L_\XW\()_M_END
+.align 5
+.L_\XW\()_M_L1:
+    CLOAD_\Y_1
+    CGEMV_N_1x8
+    CSTORE_\Y_1
+    PTR_ADDI    I,      I,      -1
+    PTR_ADD     Y,      Y,      INC_Y
+    PTR_ADDI    K,      K,      1
+    bnez        I,      .L_\XW\()_M_L1
+.L_\XW\()_M_END: /* panel finished: move the eight column pointers and X to the next panel */
+    PTR_ADDI    J,      J,      -1
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#endif
+    PTR_ALSL    X,      INC_X,  X,  3 /* X += 8 * INC_X */
+    bnez        J,      .L_\XW\()_N_L8
+.L_\XW\()_N_7:
+    andi        J,      N,      7 /* remaining N % 8 columns, one at a time through PA0 */
+    beqz        J,      .L_END
+.L_\XW\()_N_L1:
+    CLOAD_\X_1
+    xor     K,      K,      K
+    move    Y,      Y_ORG
+    move    I,      M
+    beqz    I,      .L_END /* M == 0: nothing to update */
+.align 5
+.L_\XW\()_N_1_M_L1:
+    CLOAD_\Y_1
+    CGEMV_N_1x1
+    CSTORE_\Y_1
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   Y,      Y,      INC_Y
+    PTR_ADDI  K,      K,      1
+    bnez    I,      .L_\XW\()_N_1_M_L1
+.L_\XW\()_N_1_M_END:
+    PTR_ADDI    J,      J,      -1
+    PTR_SUB     K_LDA,  LDA,    M8 /* single-column step: LDA minus the M8 bytes already walked */
+    PTR_ADD     PA0,    PA0,    K_LDA
+    PTR_ADD     X,      X,      INC_X
+    bnez        J,      .L_\XW\()_N_L1
+
+    b .L_END /* never fall through into the next expansion of this macro */
+.endm
+
+    PROLOGUE
+    PTR_LD     INC_Y,  $sp,    0 /* inc_y is read from the stack, before push_if_used runs */
+    push_if_used 17 + 7, 31 /* NOTE(review): register-save helper from loongarch64_asm.S; the exact save set is not visible here */
+    PTR_ADDI   K,      $r0,     0x01 /* K = 1 */
+    PTR_SUB    I,      INC_X,   K
+    PTR_SUB    J,      INC_Y,   K
+    maskeqz    I,      K,       I  /* if(inc_x == 1) I = 0; else I = 1; */
+    maskeqz    J,      K,       J  /* if(inc_y == 1) j = 0; else j = 1; */
+    PTR_ALSL   I,      I,       J,      1 /* I = 2 * I + J: index into .L_GAP_TABLE */
+    GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 /* NOTE(review): presumably left shifts by 3, giving byte strides (8 bytes per element) and M8 = M * 8 */
+    // Init VALPHA
+    xvpackev.w      $xr0,   $xr1,   $xr0 /* word 0 = alpha_r (from $f0), word 1 = alpha_i (from $f1) */
+    xvreplve0.d     VALPHA, $xr0 /* replicate that 64-bit pair into every lane of VALPHA */
+    move     Y_ORG,  Y /* keep the start of y; the kernel rewinds Y for every column panel */
+    move     PA0,    A
+#if __loongarch_grlen == 64
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#else
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#endif
+    la.local    T0,     .L_GAP_TABLE
+    PTR_ALSL    I,      I,      T0,     1 /* I = T0 + 2 * I: address of the 16-bit table entry */
+    ld.h        K,      I,      0 // Obtain the offset address
+    PTR_ADD     T0,     T0,     K
+    jirl        $r0,    T0,     0 /* indirect jump, no link: dispatch to the selected variant */
+.L_GAP_TABLE: /* 16-bit offsets of the four variants, relative to this label */
+    .hword  .L_GAP_0_0 - .L_GAP_TABLE
+    .hword  .L_GAP_0_1 - .L_GAP_TABLE
+    .hword  .L_GAP_1_0 - .L_GAP_TABLE
+    .hword  .L_GAP_1_1 - .L_GAP_TABLE
+.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */
+    CGEMV_N_LASX GAP_0_0, X_8, X_1, Y_8, Y_1
+.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */
+    CGEMV_N_LASX GAP_0_1, X_8, X_1, Y_8_GAP, Y_1
+.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */
+    CGEMV_N_LASX GAP_1_0, X_8_GAP, X_1, Y_8, Y_1
+.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
+    CGEMV_N_LASX GAP_1_1, X_8_GAP, X_1, Y_8_GAP, Y_1
+.L_END: /* common exit: every variant ends by branching here */
+    pop_if_used 17 + 7, 31
+    jirl    $r0, $r1, 0x0 /* return through $r1 */
+    EPILOGUE
diff --git a/kernel/loongarch64/cgemv_t_4_lsx.S b/kernel/loongarch64/cgemv_t_4_lsx.S
new file mode 100644
index 000000000..ada349364
--- /dev/null
+++ b/kernel/loongarch64/cgemv_t_4_lsx.S
@@ -0,0 +1,290 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
+ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+ */
+#define M       $r4
+#define N       $r5
+#define ALPHA_R $f0
+#define ALPHA_I $f1
+#define A       $r7
+#define LDA     $r8
+#define X       $r9
+#define INC_X   $r10
+#define Y       $r11
+#define INC_Y   $r6 /* filled from the stack by the prologue (PTR_LD INC_Y, $sp, 0) */
+
+#define J       $r12 /* column-loop counter */
+#define I       $r13 /* row-loop counter; jump-table index during stride dispatch */
+#define K       $r14 /* prologue scratch only */
+#define PY0     $r14 /* same register as K: K is used in the prologue, PY0 in the kernel body */
+#define X_ORG   $r15 /* start of x, reloaded into X for every column panel */
+#define PY1     $r16 /* not referenced anywhere else in this file */
+#define K_LDA   $r17 /* column-pointer step between panels */
+#define PY2     $r18 /* not referenced anywhere else in this file */
+#define T0      $r19 /* scratch address register */
+#define PA0     $r20 /* the kernel reads columns through PA0-PA3; PA4 is only written in the prologue */
+#define PA1     $r23
+#define PA2     $r24
+#define PA3     $r25
+#define PA4     $r26
+#define PA5     $r27
+#define PA6     $r28
+#define PA7     $r29
+#define M8      $r30 /* set next to the strides by the prologue GSLLI (M, 3) */
+
+#define VALPHA  $vr0
+#define X0      $vr1 /* same register number as ALPHA_I; the prologue packs alpha into $vr0 before X0 is loaded */
+#define X1      $vr2
+#define A0      $vr3
+#define A1      $vr4
+#define A2      $vr5
+#define A3      $vr6
+#define A4      $vr7
+#define A5      $vr8
+#define A6      $vr9
+#define A7      $vr10
+#define A8      $vr11
+#define A9      $vr12
+#define A10     $vr13
+#define A11     $vr14
+#define A12     $vr15
+#define A13     $vr16
+#define A14     $vr17
+#define A15     $vr18
+#define TP0     $vr19 /* TP0-TP3: per-column partial sums */
+#define TP1     $vr20
+#define TP2     $vr21
+#define TP3     $vr22
+#define TP4     $vr23
+#define TP5     $vr24
+#define TP6     $vr25
+#define TP7     $vr26
+#define TMP0    $vr27 /* TMP0-TMP2: scratch handed to GCOMPLEXMADD */
+#define TMP1    $vr28
+#define TMP2    $vr29
+#define Y0      $vr3 /* Y0-Y7 reuse the registers of A0-A7 */
+#define Y1      $vr4
+#define Y2      $vr5
+#define Y3      $vr6
+#define Y4      $vr7
+#define Y5      $vr8
+#define Y6      $vr9
+#define Y7      $vr10
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) /* flags for the A * x accumulation: GXCONJ1 is 0 when CONJ and XCONJ agree, 1 otherwise; GCONJ1 is always 0 */
+#define GXCONJ1 0
+#define GCONJ1  0
+#else
+#define GXCONJ1 1
+#define GCONJ1  0
+#endif
+
+#if !defined(XCONJ) /* flags for the final alpha * sum + y step: GCONJ2 is 1 exactly when XCONJ is defined; GXCONJ2 is always 0 */
+#define GXCONJ2 0
+#define GCONJ2  0
+#else
+#define GXCONJ2 0
+#define GCONJ2  1
+#endif
+
+.macro ZERO_Y4 /* Clear the four per-column accumulators TP0-TP3. */
+    GXOR v, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3 /* NOTE(review): presumably a vector xor of each register with itself; GXOR lives in loongarch64_asm.S */
+.endm
+
+.macro ZERO_Y1 /* Clear the single accumulator TP0 used by the column-remainder loop. */
+    GXOR v, v, TP0, TP0, TP0 /* NOTE(review): presumably TP0 xor TP0; GXOR lives in loongarch64_asm.S */
+.endm
+
+.macro CLOAD_X4 /* Unit-stride x: fill X0 from X + 0x00 and X1 from X + 0x10. */
+    GLD v, , X0, X, 0x00, X1, X, 0x10 /* NOTE(review): GLD is a multi-load helper from loongarch64_asm.S, presumably one vector load per (reg, base, offset) triple */
+.endm
+
+.macro CLOAD_X4_GAP /* Strided x: gather four 8-byte elements spaced INC_X apart and pack them pairwise into X0 and X1. */
+    fld.d       $f1,    X,    0x00 /* element 0 -> low 64 bits of X0 ($vr1) */
+    fldx.d      $f3,    X,    INC_X /* element 1 -> A0 ($vr3), used as a temporary */
+    PTR_ALSL    T0,     INC_X,      X,      1 /* T0 = X + 2 * INC_X */
+    fld.d       $f2,    T0,   0x00 /* element 2 -> low 64 bits of X1 ($vr2) */
+    fldx.d      $f4,    T0,   INC_X /* element 3 -> A1 ($vr4), used as a temporary */
+    vpackev.d   X0, A0, X0 /* X0 = { element 0, element 1 } */
+    vpackev.d   X1, A1, X1 /* X1 = { element 2, element 3 } */
+.endm
+
+.macro CGEMV_T_4x4 /* One 4-row x 4-column step: two vectors from each of PA0-PA3, accumulated against X0/X1 into the per-column sums TP0-TP3. */
+    GLD_INC v, , 0x10,       \
+    A0,  PA0, 0, A1,  PA0, 0, \
+    A2,  PA1, 0, A3,  PA1, 0, \
+    A4,  PA2, 0, A5,  PA2, 0, \
+    A6,  PA3, 0, A7,  PA3, 0
+    GCOMPLEXMADD GXCONJ1, GCONJ1, \
+    vf, s, TP0, A0,  X0, TP0, TMP0, TMP1, TMP2, TP0, A1,  X1, TP0, TMP0, TMP1, TMP2, \
+    TP1, A2,  X0, TP1, TMP0, TMP1, TMP2, TP1, A3,  X1, TP1, TMP0, TMP1, TMP2, \
+    TP2, A4,  X0, TP2, TMP0, TMP1, TMP2, TP2, A5,  X1, TP2, TMP0, TMP1, TMP2, \
+    TP3, A6,  X0, TP3, TMP0, TMP1, TMP2, TP3, A7,  X1, TP3, TMP0, TMP1, TMP2
+.endm /* NOTE(review): GLD_INC presumably post-increments each base by 0x10 per load (0x20 per column); each GCOMPLEXMADD group presumably computes out = a * b + c - confirm in loongarch64_asm.S */
+
+.macro CGEMV_T_LSX XW:req, X4:req /* Loop nest for one x-stride variant: XW tags the local labels, X4 is the suffix pasted after CLOAD_ to pick the x loader. */
+    PTR_SRLI  J,      N,      2 /* J = N / 4: number of 4-column panels */
+    beqz      J,      .L_\XW\()_N_3
+    PTR_SLLI  K_LDA,  LDA,    2
+    PTR_SUB   K_LDA,  K_LDA,  M8 /* K_LDA = 4 * LDA - M8: step from the end of one row sweep to the start of the next panel */
+.L_\XW\()_N_L4: /* per panel: clear the sums, then sweep every row of x */
+    ZERO_Y4
+    move      X,      X_ORG
+    PTR_SRLI  I,      M,       2 /* I = M / 4 row blocks */
+    beqz      I,      .L_\XW\()_M_3
+.align 5
+.L_\XW\()_M_L4: /* 4 rows x 4 columns per iteration */
+    CLOAD_\X4
+    CGEMV_T_4x4
+    PTR_ADDI    I,  I,  -1
+    PTR_ALSL    X,  INC_X,  X,  2 /* X += 4 * INC_X */
+    bnez        I,  .L_\XW\()_M_L4
+.L_\XW\()_M_3:
+    // Accumulated
+    GCOMPLEXACC vf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3 /* NOTE(review): presumably folds the lanes of each TPn into one complex sum in Yn (Y0-Y3 share registers with A0-A3) */
+    andi        I,      M,      3 /* remaining M % 4 rows, one at a time */
+    beqz        I,      .L_\XW\()_M_END
+.align 5
+.L_\XW\()_M_L1:
+    fld.d   $f1,    X,      0x00 /* one x element -> X0 */
+    fld.d   $f11,   PA0,    0x00 /* one element per column -> A8..A11 ($vr11..$vr14) */
+    fld.d   $f12,   PA1,    0x00
+    fld.d   $f13,   PA2,    0x00
+    fld.d   $f14,   PA3,    0x00
+#if __loongarch_grlen == 64
+    GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08
+#elif __loongarch_grlen == 32
+    GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08
+#else
+    GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08
+#endif
+    /* the running sums now live in A0-A3, i.e. the Y0-Y3 written by GCOMPLEXACC */
+    GCOMPLEXMADD GXCONJ1, GCONJ1, \
+    vf, s, A0, A8,  X0, A0, TMP0, TMP1, TMP2, A1, A9,  X0, A1, TMP0, TMP1, TMP2, \
+    A2, A10, X0, A2, TMP0, TMP1, TMP2, A3, A11, X0, A3, TMP0, TMP1, TMP2
+
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    bnez      I,      .L_\XW\()_M_L1
+.L_\XW\()_M_END: /* fold the four column sums into y at Y, Y + INC_Y, Y + 2 * INC_Y, Y + 3 * INC_Y */
+    fld.d   $f11,   Y,  0x00 /* current y values -> A8..A11 */
+    fldx.d  $f12,   Y,  INC_Y
+    PTR_ALSL    PY0, INC_Y,  Y,  1 /* PY0 = Y + 2 * INC_Y */
+    fld.d   $f13,   PY0,    0x00
+    fldx.d  $f14,   PY0,    INC_Y
+    /* NOTE(review): presumably A8..A11 = VALPHA * (A0..A3) + A8..A11, i.e. y += alpha * column sum */
+    GCOMPLEXMADD GXCONJ2, GCONJ2, \
+    vf, s, A8,  VALPHA, A0, A8,  TMP0, TMP1, TMP2, A9,  VALPHA, A1, A9,  TMP0, TMP1, TMP2,\
+    A10, VALPHA, A2, A10, TMP0, TMP1, TMP2, A11, VALPHA, A3, A11, TMP0, TMP1, TMP2
+
+    PTR_ADDI    J,      J,      -1
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#endif
+    fst.d   $f11,   Y,      0x00 /* write the four updated y values back */
+    fstx.d  $f12,   Y,      INC_Y
+    fst.d   $f13,   PY0,    0x00
+    fstx.d  $f14,   PY0,    INC_Y
+    PTR_ALSL    Y,      INC_Y,  Y,  2 /* Y += 4 * INC_Y */
+    bnez        J,      .L_\XW\()_N_L4
+.L_\XW\()_N_3:
+    andi        J,      N,      3 /* remaining N % 4 columns, one at a time through PA0 */
+    beqz        J,      .L_END
+    PTR_SUB     K_LDA,  LDA,    M8 /* single-column step: LDA minus the M8 bytes walked per column */
+.L_\XW\()_N_1:
+    ZERO_Y1
+    move    X,      X_ORG
+    move    I,      M
+    beqz    I,      .L_END /* M == 0: y is left untouched */
+.align 5
+.L_\XW\()_N_1_M_L1:
+    fld.d   $f3,    PA0,    0x00 /* one element of the column -> A0 ($vr3) */
+    fld.d   $f1,    X,      0x00 /* matching x element -> X0 ($vr1) */
+    GCOMPLEXMADD GXCONJ1, GCONJ1, \
+    vf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    PTR_ADDI  PA0,    PA0,    0x08
+    bnez      I,      .L_\XW\()_N_1_M_L1
+.L_\XW\()_N_1_M_END:
+    PTR_ADDI  J,      J,      -1
+    fld.d     $f3,    Y,      0x00 /* current y value -> A0 */
+    GCOMPLEXMADD GXCONJ2, GCONJ2, \
+    vf, s, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2
+    fst.d     $f3,    Y,      0x00 /* store the updated y value */
+    PTR_ADD   PA0,    PA0,    K_LDA
+    PTR_ADD   Y,      Y,      INC_Y
+    bnez      J,      .L_\XW\()_N_1
+
+    b .L_END /* never fall through into the next expansion of this macro */
+.endm
+
+    PROLOGUE
+    PTR_LD     INC_Y,  $sp,    0 /* inc_y is read from the stack, before push_if_used runs */
+    push_if_used 17 + 8, 30 /* NOTE(review): register-save helper from loongarch64_asm.S; the exact save set is not visible here */
+    PTR_ADDI   K,      $r0,     0x01 /* K = 1 */
+    PTR_SUB    I,      INC_X,   K
+    maskeqz    I,      K,       I  /* if(inc_x == 1) I = 0; else I = 1; */
+    GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 /* NOTE(review): presumably left shifts by 3, giving byte strides (8 bytes per element) and M8 = M * 8 */
+    // Init VALPHA
+    vpackev.w      $vr0,   $vr1,   $vr0 /* word 0 = alpha_r (from $f0), word 1 = alpha_i (from $f1) */
+    vpackev.d      VALPHA, $vr0,   $vr0 /* both 64-bit lanes of VALPHA = that (alpha_r, alpha_i) pair */
+    move     X_ORG,  X /* keep the start of x; the kernel rewinds X for every column panel */
+    move     PA0,    A
+#if __loongarch_grlen == 64
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
+#else
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
+#endif
+    la.local    T0,     .L_GAP_TABLE
+    PTR_ALSL    I,      I,      T0,     1 /* I = T0 + 2 * I: address of the 16-bit table entry */
+    ld.h        K,      I,      0 /* K = offset of the selected variant from .L_GAP_TABLE */
+    PTR_ADD     T0,     T0,     K
+    jirl        $r0,    T0,     0 /* indirect jump, no link: dispatch to the selected variant */
+.L_GAP_TABLE: /* 16-bit offsets of the two variants, relative to this label */
+    .hword .L_GAP_0 - .L_GAP_TABLE
+    .hword .L_GAP_1 - .L_GAP_TABLE
+.L_GAP_0: /* if (incx == 1) */
+    CGEMV_T_LSX GAP_0, X4
+.L_GAP_1: /* if (incx != 1) */
+    CGEMV_T_LSX GAP_1, X4_GAP
+.L_END: /* common exit: both variants end by branching here */
+    pop_if_used 17 + 8, 30
+    jirl    $r0, $r1, 0x0 /* return through $r1 */
+    EPILOGUE
diff --git a/kernel/loongarch64/cgemv_t_8_lasx.S b/kernel/loongarch64/cgemv_t_8_lasx.S
new file mode 100644
index 000000000..94e4bd2eb
--- /dev/null
+++ b/kernel/loongarch64/cgemv_t_8_lasx.S
@@ -0,0 +1,342 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2022/02/20 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*
+*
+*********************************************************************/
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
+ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+ */
+#define M       $r4  /* arguments, as named in the prototype comment above */
+#define N       $r5
+#define ALPHA_R $f0
+#define ALPHA_I $f1
+#define A       $r7
+#define LDA     $r8
+#define X       $r9
+#define INC_X   $r10
+#define Y       $r11
+#define INC_Y   $r6  /* filled from the stack in the prologue */
+
+#define J       $r12  /* loop counter over N */
+#define I       $r13  /* loop counter over M; also a scratch in the prologue */
+#define K       $r14  /* prologue scratch */
+#define PY0     $r14  /* same register as K */
+#define X_ORG   $r15  /* initial value of X */
+#define PY1     $r16
+#define K_LDA   $r17  /* step added to the PA pointers after each pass over N */
+#define PY2     $r18
+#define T0      $r19  /* scratch address */
+#define PA0     $r20  /* PA0..PA7: eight pointers into A, LDA apart */
+#define PA1     $r23
+#define PA2     $r24
+#define PA3     $r25
+#define PA4     $r26
+#define PA5     $r27
+#define PA6     $r28
+#define PA7     $r29
+#define M8      $r30  /* M << 3, set in the prologue */
+
+#define VALPHA  $xr0
+#define X0      $xr1
+#define X1      $xr2
+#define A0      $xr3
+#define A1      $xr4
+#define A2      $xr5
+#define A3      $xr6
+#define A4      $xr7
+#define A5      $xr8
+#define A6      $xr9
+#define A7      $xr10
+#define A8      $xr11
+#define A9      $xr12
+#define A10     $xr13
+#define A11     $xr14
+#define A12     $xr15
+#define A13     $xr16
+#define A14     $xr17
+#define A15     $xr18
+#define TP0     $xr19  /* TP0..TP7: accumulators cleared by ZERO_Y8 */
+#define TP1     $xr20
+#define TP2     $xr21
+#define TP3     $xr22
+#define TP4     $xr23
+#define TP5     $xr24
+#define TP6     $xr25
+#define TP7     $xr26
+#define TMP0    $xr27  /* TMP0..TMP2: passed to GCOMPLEXMADD as scratch operands */
+#define TMP1    $xr28
+#define TMP2    $xr29
+#define Y0      $xr3  /* Y0..Y7 are the same registers as A0..A7 */
+#define Y1      $xr4
+#define Y2      $xr5
+#define Y3      $xr6
+#define Y4      $xr7
+#define Y5      $xr8
+#define Y6      $xr9
+#define Y7      $xr10
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+#define GXCONJ1 0  /* GXCONJ1/GCONJ1: flags passed to GCOMPLEXMADD for the A * x accumulation */
+#define GCONJ1  0
+#else
+#define GXCONJ1 1  /* exactly one of CONJ / XCONJ is defined */
+#define GCONJ1  0
+#endif
+
+#if !defined(XCONJ)
+#define GXCONJ2 0  /* GXCONJ2/GCONJ2: flags passed to GCOMPLEXMADD for the VALPHA update of y */
+#define GCONJ2  0
+#else
+#define GXCONJ2 0
+#define GCONJ2  1
+#endif
+
+.macro ZERO_Y8  /* Clear the eight accumulators TP0..TP7 (each is XORed with itself). */
+    GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \
+                TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7
+.endm
+
+.macro ZERO_Y1  /* Clear the single accumulator TP0. */
+    GXOR xv, v, TP0, TP0, TP0
+.endm
+
+.macro CLOAD_X8  /* Load x at byte offsets 0x00 and 0x20 from X into X0 and X1 (GLD is defined elsewhere). */
+    GLD xv, , X0, X, 0x00, X1, X, 0x20
+.endm
+
+.macro CLOAD_X8_GAP  /* Gather eight 64-bit x elements spaced INC_X bytes apart into X0 and X1. */
+    fld.d       $f1,    X,    0x00  /* $f1 is the low part of X0 */
+    fldx.d      $f2,    X,    INC_X  /* $f2 is the low part of X1 */
+    PTR_ALSL    T0,     INC_X,      X,      1  /* T0 = X + 2 * INC_X */
+    fld.d       $f3,    T0,   0x00  /* $f3 is the low part of A0, used here as scratch */
+    fldx.d      $f4,    T0,   INC_X  /* $f4 is the low part of A1, used here as scratch */
+    GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3
+    PTR_ALSL    T0,     INC_X,      X,      2  /* T0 = X + 4 * INC_X */
+    fld.d       $f2,    T0,   0x00
+    fldx.d      $f3,    T0,   INC_X
+    PTR_ALSL    T0,     INC_X,      T0,     1  /* T0 = X + 6 * INC_X */
+    fld.d       $f4,    T0,   0x00
+    fldx.d      $f5,    T0,   INC_X  /* $f5 is the low part of A2, used here as scratch */
+    GINSVE0 xv, d, X1, A0, 1, X1, A1, 2, X1, A2, 3
+.endm
+
+.macro CGEMV_T_8x8  /* Load two vectors from each of PA0..PA7 into A0..A15, then accumulate A * X0/X1 into TP0..TP7; GLD_INC presumably advances each pointer by 0x20 per load (defined elsewhere). */
+    GLD_INC xv, , 0x20,       \
+    A0,  PA0, 0, A1,  PA0, 0, \
+    A2,  PA1, 0, A3,  PA1, 0, \
+    A4,  PA2, 0, A5,  PA2, 0, \
+    A6,  PA3, 0, A7,  PA3, 0, \
+    A8,  PA4, 0, A9,  PA4, 0, \
+    A10, PA5, 0, A11, PA5, 0, \
+    A12, PA6, 0, A13, PA6, 0, \
+    A14, PA7, 0, A15, PA7, 0
+    GCOMPLEXMADD GXCONJ1, GCONJ1, \
+    xvf, s, TP0, A0,  X0, TP0, TMP0, TMP1, TMP2, TP0, A1,  X1, TP0, TMP0, TMP1, TMP2, \
+    TP1, A2,  X0, TP1, TMP0, TMP1, TMP2, TP1, A3,  X1, TP1, TMP0, TMP1, TMP2, \
+    TP2, A4,  X0, TP2, TMP0, TMP1, TMP2, TP2, A5,  X1, TP2, TMP0, TMP1, TMP2, \
+    TP3, A6,  X0, TP3, TMP0, TMP1, TMP2, TP3, A7,  X1, TP3, TMP0, TMP1, TMP2, \
+    TP4, A8,  X0, TP4, TMP0, TMP1, TMP2, TP4, A9,  X1, TP4, TMP0, TMP1, TMP2, \
+    TP5, A10, X0, TP5, TMP0, TMP1, TMP2, TP5, A11, X1, TP5, TMP0, TMP1, TMP2, \
+    TP6, A12, X0, TP6, TMP0, TMP1, TMP2, TP6, A13, X1, TP6, TMP0, TMP1, TMP2, \
+    TP7, A14, X0, TP7, TMP0, TMP1, TMP2, TP7, A15, X1, TP7, TMP0, TMP1, TMP2
+.endm
+
+.macro CGEMV_T_LASX XW:req, X8:req  /* Kernel body: XW tags the local labels, X8 selects which CLOAD_ macro loads x. */
+    PTR_SRLI  J,      N,      3  /* J = N >> 3: passes that handle eight y elements each */
+    beqz      J,      .L_\XW\()_N_7
+    PTR_SLLI  K_LDA,  LDA,    3
+    PTR_SUB   K_LDA,  K_LDA,  M8  /* K_LDA = 8 * LDA - M8 */
+.L_\XW\()_N_L8:
+    ZERO_Y8
+    move      X,      X_ORG  /* restart x for this pass */
+    PTR_SRLI  I,      M,       3  /* I = M >> 3 */
+    beqz      I,      .L_\XW\()_M_7
+.align 5
+.L_\XW\()_M_L8:
+    CLOAD_\X8
+    CGEMV_T_8x8
+    PTR_ADDI    I,  I,  -1
+    PTR_ALSL    X,  INC_X,  X,  3  /* X += 8 * INC_X */
+    bnez        I,  .L_\XW\()_M_L8
+.L_\XW\()_M_7:
+    // Accumulated: presumably folds each TPn into Yn (GCOMPLEXACC is defined elsewhere)
+    GCOMPLEXACC xvf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3, Y4, TP4, \
+                        Y5, TP5, Y6, TP6, Y7, TP7
+    andi        I,      M,      7  /* I = M & 7: leftover elements */
+    beqz        I,      .L_\XW\()_M_END
+.align 5
+.L_\XW\()_M_L1:
+    fld.d   $f1,    X,      0x00  /* low 64 bits of X0 = current x element */
+    fld.d   $f11,   PA0,    0x00  /* $f11..$f18 are the low parts of A8..A15 */
+    fld.d   $f12,   PA1,    0x00
+    fld.d   $f13,   PA2,    0x00
+    fld.d   $f14,   PA3,    0x00
+    fld.d   $f15,   PA4,    0x00
+    fld.d   $f16,   PA5,    0x00
+    fld.d   $f17,   PA6,    0x00
+    fld.d   $f18,   PA7,    0x00
+#if __loongarch_grlen == 64
+    GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
+               PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
+#elif __loongarch_grlen == 32
+    GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
+               PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
+#else
+    GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
+               PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
+#endif
+
+    GCOMPLEXMADD GXCONJ1, GCONJ1, \
+    xvf, s, A0, A8,  X0, A0, TMP0, TMP1, TMP2, A1, A9,  X0, A1, TMP0, TMP1, TMP2, \
+    A2, A10, X0, A2, TMP0, TMP1, TMP2, A3, A11, X0, A3, TMP0, TMP1, TMP2, \
+    A4, A12, X0, A4, TMP0, TMP1, TMP2, A5, A13, X0, A5, TMP0, TMP1, TMP2, \
+    A6, A14, X0, A6, TMP0, TMP1, TMP2, A7, A15, X0, A7, TMP0, TMP1, TMP2
+
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    bnez      I,      .L_\XW\()_M_L1
+.L_\XW\()_M_END:
+    fld.d   $f11,   Y,  0x00  /* load eight y elements into the low parts of A8..A15 */
+    fldx.d  $f12,   Y,  INC_Y
+    PTR_ALSL    PY0, INC_Y,  Y,  1  /* PY0 = Y + 2 * INC_Y (PY0 reuses the K register) */
+    fld.d   $f13,   PY0,    0x00
+    fldx.d  $f14,   PY0,    INC_Y
+    PTR_ALSL    PY1, INC_Y,  Y,  2  /* PY1 = Y + 4 * INC_Y */
+    fld.d   $f15,   PY1,    0x00
+    fldx.d  $f16,   PY1,    INC_Y
+    PTR_ALSL    PY2, INC_Y,  PY1, 1  /* PY2 = Y + 6 * INC_Y */
+    fld.d   $f17,   PY2,    0x00
+    fldx.d  $f18,   PY2,    INC_Y
+
+    GCOMPLEXMADD GXCONJ2, GCONJ2, \
+    xvf, s, A8,  VALPHA, A0, A8,  TMP0, TMP1, TMP2, A9,  VALPHA, A1, A9,  TMP0, TMP1, TMP2,\
+    A10, VALPHA, A2, A10, TMP0, TMP1, TMP2, A11, VALPHA, A3, A11, TMP0, TMP1, TMP2,\
+    A12, VALPHA, A4, A12, TMP0, TMP1, TMP2, A13, VALPHA, A5, A13, TMP0, TMP1, TMP2,\
+    A14, VALPHA, A6, A14, TMP0, TMP1, TMP2, A15, VALPHA, A7, A15, TMP0, TMP1, TMP2
+
+    PTR_ADDI    J,      J,      -1
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#endif
+    fst.d   $f11,   Y,      0x00  /* write the eight updated y elements back */
+    fstx.d  $f12,   Y,      INC_Y
+    fst.d   $f13,   PY0,    0x00
+    fstx.d  $f14,   PY0,    INC_Y
+    fst.d   $f15,   PY1,    0x00
+    fstx.d  $f16,   PY1,    INC_Y
+    fst.d   $f17,   PY2,    0x00
+    fstx.d  $f18,   PY2,    INC_Y
+    PTR_ALSL    Y,      INC_Y,  Y,  3  /* Y += 8 * INC_Y */
+    bnez        J,      .L_\XW\()_N_L8
+.L_\XW\()_N_7:
+    andi        J,      N,      7  /* J = N & 7: leftover y elements, one per pass */
+    beqz        J,      .L_END
+    PTR_SUB     K_LDA,  LDA,    M8  /* single-pointer step: LDA - M8 */
+.L_\XW\()_N_1:
+    ZERO_Y1
+    move    X,      X_ORG
+    move    I,      M
+    beqz    I,      .L_END  /* M == 0: leave without touching y */
+.align 5
+.L_\XW\()_N_1_M_L1:
+    fld.d   $f3,    PA0,    0x00  /* low 64 bits of A0 = current element of A */
+    fld.d   $f1,    X,      0x00  /* low 64 bits of X0 = current x element */
+    GCOMPLEXMADD GXCONJ1, GCONJ1, \
+    xvf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    PTR_ADDI  PA0,    PA0,    0x08
+    bnez      I,      .L_\XW\()_N_1_M_L1
+.L_\XW\()_N_1_M_END:
+    PTR_ADDI  J,      J,      -1
+    fld.d     $f3,    Y,      0x00  /* A0 low = current y element */
+    GCOMPLEXMADD GXCONJ2, GCONJ2, \
+    xvf, s, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2
+    fst.d     $f3,    Y,      0x00  /* only the low 64 bits are stored */
+    PTR_ADD   PA0,    PA0,    K_LDA
+    PTR_ADD   Y,      Y,      INC_Y
+    bnez      J,      .L_\XW\()_N_1
+
+    b .L_END
+.endm
+
+    PROLOGUE  /* Entry: prepare strides and pointers, then jump through .L_GAP_TABLE on inc_x. */
+    PTR_LD     INC_Y,  $sp,    0  /* INC_Y is loaded from offset 0 of the incoming stack pointer */
+    push_if_used 17 + 8, 30
+    PTR_ADDI   K,      $r0,     0x01  /* K = 1 */
+    PTR_SUB    I,      INC_X,   K  /* I = inc_x - 1 */
+    maskeqz    I,      K,       I  /* if(inc_x == 1) I = 0; else I = 1; */
+    GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3  /* presumably LDA, INC_X, INC_Y <<= 3 and M8 = M << 3; GSLLI is defined elsewhere */
+    // Init VALPHA: word 0 = ALPHA_R, word 1 = ALPHA_I, then that 64-bit pair is replicated
+    xvpackev.w      $xr0,   $xr1,   $xr0
+    xvreplve0.d     VALPHA, $xr0
+    move     X_ORG,  X  /* save the initial x pointer; the kernel body reloads X from it */
+    move     PA0,    A
+#if __loongarch_grlen == 64  /* all three branches build PA1..PA7 as successive PAn + LDA */
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#else
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#endif
+    la.local    T0,     .L_GAP_TABLE  /* T0 = address of the offset table */
+    PTR_ALSL    I,      I,      T0,     1  /* I = T0 + 2 * I: address of the selected 16-bit entry */
+    ld.h        K,      I,      0  /* K = offset of the target label from .L_GAP_TABLE */
+    PTR_ADD     T0,     T0,     K
+    jirl        $r0,    T0,     0  /* indirect jump, no link */
+.L_GAP_TABLE:
+    .hword .L_GAP_0 - .L_GAP_TABLE
+    .hword .L_GAP_1 - .L_GAP_TABLE
+.L_GAP_0: /* if (incx == 1) */
+    CGEMV_T_LASX GAP_0, X8
+.L_GAP_1: /* if (incx != 1) */
+    CGEMV_T_LASX GAP_1, X8_GAP
+.L_END:
+    pop_if_used 17 + 8, 30
+    jirl    $r0, $r1, 0x0  /* return through $r1 */
+    EPILOGUE
diff --git a/kernel/loongarch64/cnrm2.S b/kernel/loongarch64/cnrm2.S
index 9d27987e1..41667485a 100644
--- a/kernel/loongarch64/cnrm2.S
+++ b/kernel/loongarch64/cnrm2.S
@@ -61,7 +61,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    fmov.d s2, s1
    bge $r0,    N, .L999
    slli.d INCX, INCX, ZBASE_SHIFT
-   bge $r0,    INCX, .L999
+   beq $r0,    INCX, .L999
    srai.d  I, N, 2
    bge $r0,    I, .L25
    LD a1,  X,   0 * SIZE
diff --git a/kernel/loongarch64/cnrm2_lasx.S b/kernel/loongarch64/cnrm2_lasx.S
new file mode 100644
index 000000000..3a60069ac
--- /dev/null
+++ b/kernel/loongarch64/cnrm2_lasx.S
@@ -0,0 +1,147 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4  /* element count */
+#define X      $r5  /* input pointer */
+#define INCX   $r6  /* stride; shifted left by ZBASE_SHIFT in the function body */
+#define I      $r17  /* loop counter */
+#define TEMP   $r18  /* value INCX is compared against to pick the contiguous loop */
+#define t1     $r12  /* t1..t4: integer scratch for the strided gather */
+#define t2     $r13
+#define t3     $r14
+#define t4     $r15
+#define a1     $f15  /* a1/a2: scalar scratch for the tail loop */
+#define a2     $f16
+#define res    $f19  /* scalar accumulator; same register number as res1 */
+#define VX0    $xr15
+#define VX1    $xr16
+#define VX2    $xr17
+#define VX3    $xr18
+#define VX4    $xr21  /* not referenced in this file */
+#define res1   $xr19  /* vector accumulators */
+#define res2   $xr20
+
+   PROLOGUE  /* Sum the squares of the real and imaginary parts of N elements of X in double precision, then return the square root as a float in $f0. */
+
+#ifdef F_INTERFACE
+   LDINT   N,     0(N)
+   LDINT   INCX,  0(INCX)
+#endif
+
+   xvxor.v res1, res1, res1
+   xvxor.v res2, res2, res2
+   bge $r0, N, .L999  /* N <= 0: result is sqrt(0) */
+   beq $r0, INCX, .L999  /* INCX == 0: result is sqrt(0) */
+   addi.d  TEMP, $r0, 1 << ZBASE_SHIFT  /* TEMP = scaled stride of contiguous data, same scale as INCX below */
+   slli.d INCX, INCX, ZBASE_SHIFT
+   srai.d I, N, 2  /* I = N / 4: four elements per vector iteration */
+   bne INCX, TEMP, .L20  /* strided input takes the gather loop */
+   bge $r0,  I, .L997
+   .align 3
+
+.L10:  /* contiguous input: one 256-bit load holds four elements */
+   xvld VX0, X, 0 * SIZE
+   xvfcvtl.d.s VX1, VX0  /* widen the low float pair of each 128-bit half */
+   xvfcvth.d.s VX2, VX0  /* widen the high float pair of each 128-bit half */
+   xvfmadd.d res1, VX1, VX1, res1
+   xvfmadd.d res2, VX2, VX2, res2
+   addi.d I, I, -1
+   addi.d X, X, 8 * SIZE
+   blt $r0, I, .L10
+   .align 3
+   b .L996
+
+.L20:
+   bge $r0, I, .L997
+   .align 3
+
+.L21:  /* strided input: gather four elements word by word into VX0 */
+   ld.w t1, X, 0 * SIZE
+   ld.w t2, X, 1 * SIZE
+   add.d X, X, INCX
+   ld.w t3, X, 0 * SIZE
+   ld.w t4, X, 1 * SIZE
+   add.d X, X, INCX
+   xvinsgr2vr.w VX0, t1, 0
+   xvinsgr2vr.w VX0, t2, 1
+   xvinsgr2vr.w VX0, t3, 2
+   xvinsgr2vr.w VX0, t4, 3
+   ld.w t1, X, 0 * SIZE
+   ld.w t2, X, 1 * SIZE
+   add.d X, X, INCX
+   ld.w t3, X, 0 * SIZE
+   ld.w t4, X, 1 * SIZE
+   xvinsgr2vr.w VX0, t1, 4
+   xvinsgr2vr.w VX0, t2, 5
+   xvinsgr2vr.w VX0, t3, 6
+   xvinsgr2vr.w VX0, t4, 7
+   add.d X, X, INCX
+   xvfcvtl.d.s VX1, VX0
+   xvfcvth.d.s VX2, VX0
+   xvfmadd.d res1, VX1, VX1, res1
+   xvfmadd.d res2, VX2, VX2, res2
+   addi.d  I, I, -1
+   blt $r0, I, .L21
+   b .L996
+
+.L996:  /* horizontal sum: lane 0 of res1 (read as res) ends up holding the total */
+   xvfadd.d res1, res1, res2
+   xvpickve.d VX1, res1, 1
+   xvpickve.d VX2, res1, 2
+   xvpickve.d VX3, res1, 3
+   xvfadd.d res1, VX1, res1
+   xvfadd.d res1, VX2, res1
+   xvfadd.d res1, VX3, res1
+   .align 3
+
+.L997:
+   andi I, N, 3  /* I = N % 4: leftover elements */
+   bge $r0, I, .L999
+   .align 3
+
+.L998:  /* scalar tail: one element per iteration */
+   fld.s a1, X, 0 * SIZE
+   fld.s a2, X, 1 * SIZE
+   addi.d I, I, -1
+   fcvt.d.s a1, a1
+   fcvt.d.s a2, a2
+   fmadd.d res, a1, a1, res
+   fmadd.d res, a2, a2, res
+   add.d X, X, INCX
+   blt $r0, I, .L998
+   .align 3
+
+.L999:
+   fsqrt.d res, res
+   move $r4, $r17
+   fcvt.s.d $f0, res  /* narrow the result to single precision for the return value */
+   jirl $r0, $r1, 0x0
+
+   EPILOGUE
diff --git a/kernel/loongarch64/cnrm2_lsx.S b/kernel/loongarch64/cnrm2_lsx.S
new file mode 100644
index 000000000..20950ba17
--- /dev/null
+++ b/kernel/loongarch64/cnrm2_lsx.S
@@ -0,0 +1,155 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4  /* element count */
+#define X      $r5  /* input pointer */
+#define INCX   $r6  /* stride; shifted left by ZBASE_SHIFT in the function body */
+#define I      $r17  /* loop counter */
+#define TEMP   $r18  /* holds 1 << ZBASE_SHIFT for the contiguous-stride test */
+#define t1     $r12  /* t1..t4: integer scratch for the strided gather */
+#define t2     $r13
+#define t3     $r14
+#define t4     $r15
+#define a1     $f15  /* a1/a2: scalar scratch for the tail loop */
+#define a2     $f16
+#define res    $f19  /* scalar accumulator; same register number as res1 */
+#define VX0    $vr15
+#define VX1    $vr16
+#define VX2    $vr17
+#define VX3    $vr18
+#define VX4    $vr21
+#define res1   $vr19  /* vector accumulators */
+#define res2   $vr20
+
+   PROLOGUE  /* Sum the squares of the real and imaginary parts of N elements of X in double precision, then return the square root as a float in $f0. */
+
+#ifdef F_INTERFACE
+   LDINT   N,     0(N)
+   LDINT   INCX,  0(INCX)
+#endif
+
+   vxor.v res1, res1, res1
+   vxor.v res2, res2, res2
+   bge $r0, N, .L999  /* N <= 0: result is sqrt(0) */
+   beq $r0, INCX, .L999  /* INCX == 0: result is sqrt(0) */
+   li.d  TEMP, 1
+   slli.d TEMP, TEMP, ZBASE_SHIFT  /* TEMP = scaled stride of contiguous data */
+   slli.d INCX, INCX, ZBASE_SHIFT
+   srai.d I, N, 2  /* I = N / 4: four elements per iteration */
+   bne INCX, TEMP, .L20  /* strided input takes the gather loop */
+   bge $r0,  I, .L997
+   .align 3
+
+.L10:  /* contiguous input: two 128-bit loads, two elements each */
+   vld VX0, X, 0 * SIZE
+   vfcvtl.d.s VX1, VX0  /* widen the low float pair */
+   vfcvth.d.s VX2, VX0  /* widen the high float pair */
+   vfmadd.d res1, VX1, VX1, res1
+   vfmadd.d res2, VX2, VX2, res2
+   vld VX0, X, 4 * SIZE
+   vfcvtl.d.s VX3, VX0
+   vfcvth.d.s VX4, VX0
+   vfmadd.d res1, VX3, VX3, res1
+   vfmadd.d res2, VX4, VX4, res2
+   addi.d I, I, -1
+   addi.d X, X, 8 * SIZE
+   blt $r0, I, .L10
+   b .L996
+   .align 3
+
+.L20:
+   bge $r0, I, .L997
+   .align 3
+
+.L21:  /* strided input: gather two elements at a time into VX0 */
+   ld.w t1, X, 0 * SIZE
+   ld.w t2, X, 1 * SIZE
+   add.d X, X, INCX
+   ld.w t3, X, 0 * SIZE
+   ld.w t4, X, 1 * SIZE
+   vinsgr2vr.w VX0, t1, 0
+   vinsgr2vr.w VX0, t2, 1
+   vinsgr2vr.w VX0, t3, 2
+   vinsgr2vr.w VX0, t4, 3
+   add.d X, X, INCX
+   vfcvtl.d.s VX1, VX0
+   vfcvth.d.s VX2, VX0
+   vfmadd.d res1, VX1, VX1, res1
+   vfmadd.d res2, VX2, VX2, res2
+   ld.w t1, X, 0 * SIZE
+   ld.w t2, X, 1 * SIZE
+   add.d X, X, INCX
+   ld.w t3, X, 0 * SIZE
+   ld.w t4, X, 1 * SIZE
+   vinsgr2vr.w VX0, t1, 0
+   vinsgr2vr.w VX0, t2, 1
+   vinsgr2vr.w VX0, t3, 2
+   vinsgr2vr.w VX0, t4, 3
+   add.d X, X, INCX
+   vfcvtl.d.s VX3, VX0
+   vfcvth.d.s VX4, VX0
+   vfmadd.d res1, VX3, VX3, res1
+   vfmadd.d res2, VX4, VX4, res2
+   addi.d  I, I, -1
+   blt $r0, I, .L21
+   b .L996
+   .align 3
+
+.L996:  /* horizontal sum: lane 0 of res1 (read as res) ends up holding the total */
+   vfadd.d res1, res1, res2
+   vreplvei.d VX1, res1, 1
+   vfadd.d res1, VX1, res1
+   .align 3
+
+.L997:
+   andi I, N, 3  /* I = N % 4: leftover elements */
+   bge $r0, I, .L999
+   .align 3
+
+.L998:  /* scalar tail: one element per iteration */
+    fld.s a1, X, 0 * SIZE
+    fld.s a2, X, 1 * SIZE
+    addi.d I, I, -1
+    fcvt.d.s a1, a1
+    fcvt.d.s a2, a2
+    fmadd.d res, a1, a1, res
+    fmadd.d res, a2, a2, res
+    add.d X, X, INCX
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+    fsqrt.d res, res
+    move $r4, $r17
+    fcvt.s.d $f0, $f19  /* $f19 is res; narrow to single precision for the return value */
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/copy_lasx.S b/kernel/loongarch64/copy_lasx.S
new file mode 100644
index 000000000..31f91cec1
--- /dev/null
+++ b/kernel/loongarch64/copy_lasx.S
@@ -0,0 +1,306 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4  /* element count */
+#define X      $r5  /* source pointer */
+#define INCX   $r6  /* source stride; shifted left by BASE_SHIFT in the function body */
+#define Y      $r7  /* destination pointer */
+#define INCY   $r8  /* destination stride; shifted left by BASE_SHIFT in the function body */
+#define I      $r17  /* loop counter */
+#define TEMP   $r18  /* holds 1 << BASE_SHIFT for the unit-stride tests */
+#define t1     $r14  /* t1..t4: integer scratch for the strided gather */
+#define t2     $r15
+#define t3     $r16
+#define t4     $r19
+#define a1     $f12  /* a1..a4: scalar scratch for the fully strided loop */
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define VX0    $xr12  /* same register number as a1 */
+#define VX1    $xr13  /* same register number as a2 */
+
+    PROLOGUE  /* Copy N elements from X (stride INCX) to Y (stride INCY); one code path per unit / non-unit stride combination. */
+    bge $r0, N, .L999  /* nothing to do for N <= 0 */
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, BASE_SHIFT  /* TEMP = scaled stride of contiguous data */
+    slli.d  INCX, INCX, BASE_SHIFT
+    slli.d  INCY, INCY, BASE_SHIFT
+    srai.d I, N, 3  /* I = N / 8: eight elements per main-loop iteration */
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+/* INCX==1 and INCY==1 */
+.L11:
+    bge $r0, I, .L112
+    .align 3
+
+.L111:
+    xvld VX0, X, 0
+    addi.d  I, I, -1
+    xvst VX0, Y, 0
+#ifdef DOUBLE
+    xvld VX0, X, 32  /* DOUBLE: a second 32-byte vector completes the eight elements */
+    xvst VX0, Y, 32
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L111
+    .align 3
+
+.L112:
+    andi I, N, 7  /* I = N % 8: leftover elements */
+    bge $r0, I, .L999
+    .align 3
+
+.L113:
+    LD    $f12, X, 0
+    addi.d I, I, -1
+    addi.d  X, X, SIZE
+    ST    $f12, Y, 0
+    addi.d  Y, Y, SIZE
+    blt $r0, I, .L113
+    b .L999
+    .align 3
+
+/* INCX==1 and INCY!=1 */
+.L12:
+    bge $r0, I, .L122
+    .align 3
+
+.L121:
+#ifdef DOUBLE
+    xvld VX0, X, 0  /* contiguous load, then scatter one lane per destination step */
+    xvld VX1, X, 32
+    xvstelm.d VX0, Y, 0, 0
+    add.d Y, Y, INCY
+    xvstelm.d VX0, Y, 0, 1
+    add.d Y, Y, INCY
+    xvstelm.d VX0, Y, 0, 2
+    add.d Y, Y, INCY
+    xvstelm.d VX0, Y, 0, 3
+    add.d Y, Y, INCY
+    xvstelm.d VX1, Y, 0, 0
+    add.d Y, Y, INCY
+    xvstelm.d VX1, Y, 0, 1
+    add.d Y, Y, INCY
+    xvstelm.d VX1, Y, 0, 2
+    add.d Y, Y, INCY
+    xvstelm.d VX1, Y, 0, 3
+    add.d Y, Y, INCY
+#else
+    xvld VX0, X, 0  /* contiguous load, then scatter one lane per destination step */
+    xvstelm.w VX0, Y, 0, 0
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0, 1
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0, 2
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0, 3
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0, 4
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0, 5
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0, 6
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0, 7
+    add.d Y, Y, INCY
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    .align 3
+
+.L122:
+    andi I, N, 7  /* I = N % 8: leftover elements */
+    bge $r0, I, .L999
+    .align 3
+
+.L123:
+    LD    $f12, X, 0
+    addi.d I, I, -1
+    addi.d  X, X, SIZE
+    ST    $f12, Y, 0
+    add.d  Y, Y, INCY
+    blt $r0, I, .L123
+    b .L999
+    .align 3
+
+/* INCX!=1 and INCY==1 */
+.L21:
+    bge $r0, I, .L212
+    .align 3
+
+.L211:
+#ifdef DOUBLE
+    ld.d t1, X, 0  /* gather four strided elements, then store them as one vector */
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    xvst VX0, Y, 0
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvst VX1, Y, 32
+#else
+    ld.w t1, X, 0  /* gather eight strided elements, then store them as one vector */
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    xvst VX0, Y, 0
+#endif
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    .align 3
+
+.L212:
+    andi I, N, 7  /* I = N % 8: leftover elements */
+    bge $r0, I, .L999
+    .align 3
+
+.L213:
+    LD    $f12, X, 0
+    addi.d I, I, -1
+    ST    $f12, Y, 0
+    add.d  X, X, INCX
+    addi.d  Y, Y, SIZE
+    blt $r0, I, .L213
+    b .L999
+    .align 3
+
+/* INCX!=1 and INCY!=1 */
+.L22:
+    bge $r0, I, .L223
+    .align 3
+
+.L222:
+    LD    a1, X, 0  /* load four strided source elements */
+    add.d X, X, INCX
+    LD    a2, X, 0
+    add.d X, X, INCX
+    LD    a3, X, 0
+    add.d X, X, INCX
+    LD    a4, X, 0
+    add.d X, X, INCX
+    ST    a1, Y, 0  /* store all four to the destination Y */
+    add.d Y, Y, INCY
+    ST    a2, Y, 0
+    add.d Y, Y, INCY
+    ST    a3, Y, 0
+    add.d Y, Y, INCY
+    ST    a4, Y, 0
+    add.d Y, Y, INCY
+    LD    a1, X, 0  /* second group of four */
+    add.d X, X, INCX
+    LD    a2, X, 0
+    add.d X, X, INCX
+    LD    a3, X, 0
+    add.d X, X, INCX
+    LD    a4, X, 0
+    add.d X, X, INCX
+    ST    a1, Y, 0
+    add.d Y, Y, INCY
+    ST    a2, Y, 0
+    add.d Y, Y, INCY
+    ST    a3, Y, 0
+    add.d Y, Y, INCY
+    ST    a4, Y, 0
+    add.d Y, Y, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    .align 3
+
+.L223:
+    andi I, N, 7  /* I = N % 8: leftover elements */
+    bge $r0, I, .L999
+    .align 3
+
+.L224:
+    LD    $f12, X, 0
+    addi.d I, I, -1
+    ST    $f12, Y, 0
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L224
+    .align 3
+
+.L999:
+    move $r4, $r12
+    jirl $r0, $r1, 0x0  /* return through $r1 */
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/copy_lsx.S b/kernel/loongarch64/copy_lsx.S
new file mode 100644
index 000000000..bb10f3565
--- /dev/null
+++ b/kernel/loongarch64/copy_lsx.S
@@ -0,0 +1,316 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define Y      $r7
+#define INCY   $r8
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r14
+#define t2     $r15
+#define t3     $r16
+#define t4     $r19
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define VX0    $vr12
+#define VX1    $vr13
+
+    PROLOGUE
+    bge $r0, N, .L999 // N <= 0: nothing to copy
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, BASE_SHIFT // TEMP = 1 << BASE_SHIFT, i.e. what a stride of 1 becomes after the scaling below
+    slli.d  INCX, INCX, BASE_SHIFT // scale both strides by the same shift
+    slli.d  INCY, INCY, BASE_SHIFT
+    srai.d I, N, 3 // I = N / 8: trip count of the 8-element main loops
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+/* INCX==1 and INCY==1: contiguous copy with whole-vector loads/stores, 8 elements per pass */
+.L11:
+    bge $r0, I, .L112 // no full group of 8: go to the remainder loop
+    .align 3
+
+.L111:
+    vld VX0, X, 0 // bytes 0..15 of the source
+    vld VX1, X, 16 // bytes 16..31 of the source
+    addi.d  I, I, -1
+    vst VX0, Y, 0
+    vst VX1, Y, 16
+#ifdef DOUBLE
+    vld VX0, X, 32 // DOUBLE build: copy bytes 32..63 as well
+    vld VX1, X, 48
+    vst VX0, Y, 32
+    vst VX1, Y, 48
+#endif
+    addi.d X, X, 8 * SIZE // advance both pointers by 8 elements
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L111 // loop while I > 0
+    .align 3
+
+.L112: // elements left over after the 8-per-pass loop .L111
+    andi I, N, 7 // I = N & 7
+    bge $r0, I, .L999 // nothing left over: exit
+    .align 3
+
+.L113: // one element per pass; X and Y both step by SIZE
+    LD    $f12, X, 0
+    addi.d I, I, -1
+    addi.d  X, X, SIZE
+    ST    $f12, Y, 0
+    addi.d  Y, Y, SIZE
+    blt $r0, I, .L113 // loop while I > 0
+    b .L999
+    .align 3
+
+/* INCX==1 and INCY!=1: vector loads from X, element-by-element stores to strided Y, 8 elements per pass */
+.L12:
+    bge $r0, I, .L122 // no full group of 8: go to the remainder loop
+    .align 3
+
+.L121:
+#ifdef DOUBLE
+    vld VX0, X, 0 // two 64-bit elements per vector register
+    vld VX1, X, 16
+    vstelm.d VX0, Y, 0, 0 // store element 0 of VX0, then step Y by INCY
+    add.d Y, Y, INCY
+    vstelm.d VX0, Y, 0, 1
+    add.d Y, Y, INCY
+    vstelm.d VX1, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.d VX1, Y, 0, 1
+    add.d Y, Y, INCY
+    vld VX0, X, 32 // second half of the group of 8
+    vld VX1, X, 48
+    vstelm.d VX0, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.d VX0, Y, 0, 1
+    add.d Y, Y, INCY
+    vstelm.d VX1, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.d VX1, Y, 0, 1
+    add.d Y, Y, INCY
+#else
+    vld VX0, X, 0 // four 32-bit elements per vector register
+    vld VX1, X, 16
+    vstelm.w VX0, Y, 0, 0 // store element 0 of VX0, then step Y by INCY
+    add.d Y, Y, INCY
+    vstelm.w VX0, Y, 0, 1
+    add.d Y, Y, INCY
+    vstelm.w VX0, Y, 0, 2
+    add.d Y, Y, INCY
+    vstelm.w VX0, Y, 0, 3
+    add.d Y, Y, INCY
+    vstelm.w VX1, Y, 0, 0
+    add.d Y, Y, INCY
+    vstelm.w VX1, Y, 0, 1
+    add.d Y, Y, INCY
+    vstelm.w VX1, Y, 0, 2
+    add.d Y, Y, INCY
+    vstelm.w VX1, Y, 0, 3
+    add.d Y, Y, INCY
+#endif
+    addi.d X, X, 8 * SIZE // X advances by 8 elements in one step
+    addi.d  I, I, -1
+    blt $r0, I, .L121 // loop while I > 0
+    .align 3
+
+.L122: // elements left over after the 8-per-pass loop .L121
+    andi I, N, 7 // I = N & 7
+    bge $r0, I, .L999 // nothing left over: exit
+    .align 3
+
+.L123: // one element per pass; X steps by SIZE, Y by INCY
+    LD    $f12, X, 0
+    addi.d I, I, -1
+    addi.d  X, X, SIZE
+    ST    $f12, Y, 0
+    add.d  Y, Y, INCY
+    blt $r0, I, .L123 // loop while I > 0
+    b .L999
+    .align 3
+
+/* INCX!=1 and INCY==1: gather strided X through integer registers, vector stores to contiguous Y, 8 elements per pass */
+.L21:
+    bge $r0, I, .L212 // no full group of 8: go to the remainder loop
+    .align 3
+
+.L211:
+#ifdef DOUBLE
+    ld.d t1, X, 0 // load four 64-bit elements into t1..t4, stepping X by INCX
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0 // pack them two per vector register
+    vinsgr2vr.d VX0, t2, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vst VX0, Y, 0
+    vst VX1, Y, 16
+    ld.d t1, X, 0 // second group of four
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vst VX0, Y, 32
+    vst VX1, Y, 48
+#else
+    ld.w t1, X, 0 // load four 32-bit elements into t1..t4, stepping X by INCX
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0 // pack all four into VX0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    vst VX0, Y, 0
+    ld.w t1, X, 0 // second group of four goes through VX1
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vst VX1, Y, 16
+#endif
+    addi.d Y, Y, 8 * SIZE // Y advances by 8 elements in one step
+    addi.d  I, I, -1
+    blt $r0, I, .L211 // loop while I > 0
+    .align 3
+
+.L212: // elements left over after the 8-per-pass loop .L211
+    andi I, N, 7 // I = N & 7
+    bge $r0, I, .L999 // nothing left over: exit
+    .align 3
+
+.L213: // one element per pass; X steps by INCX, Y by SIZE
+    LD    $f12, X, 0
+    addi.d I, I, -1
+    ST    $f12, Y, 0
+    add.d  X, X, INCX
+    addi.d  Y, Y, SIZE
+    blt $r0, I, .L213 // loop while I > 0
+    b .L999
+    .align 3
+
+/* INCX!=1 and INCY!=1: scalar copy, 8 elements per pass, X stepped by INCX and Y by INCY */
+.L22:
+    bge $r0, I, .L223 // no full group of 8: go to the remainder loop
+    .align 3
+
+.L222:
+    LD    a1, X, 0 // load four source elements into a1..a4
+    add.d X, X, INCX
+    LD    a2, X, 0
+    add.d X, X, INCX
+    LD    a3, X, 0
+    add.d X, X, INCX
+    LD    a4, X, 0
+    add.d X, X, INCX
+    ST    a1, Y, 0 // store them to four consecutive destination slots
+    add.d Y, Y, INCY
+    ST    a2, Y, 0
+    add.d Y, Y, INCY
+    ST    a3, Y, 0 // fix: previously stored through X, overwriting the source and skipping this Y slot
+    add.d Y, Y, INCY
+    ST    a4, Y, 0 // fix: same as above
+    add.d Y, Y, INCY
+    LD    a1, X, 0 // second group of four
+    add.d X, X, INCX
+    LD    a2, X, 0
+    add.d X, X, INCX
+    LD    a3, X, 0
+    add.d X, X, INCX
+    LD    a4, X, 0
+    add.d X, X, INCX
+    ST    a1, Y, 0
+    add.d Y, Y, INCY
+    ST    a2, Y, 0
+    add.d Y, Y, INCY
+    ST    a3, Y, 0 // fix: destination is Y, not X
+    add.d Y, Y, INCY
+    ST    a4, Y, 0 // fix: destination is Y, not X
+    add.d Y, Y, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L222 // loop while I > 0
+    .align 3
+
+.L223: // elements left over after the 8-per-pass loop .L222
+    andi I, N, 7 // I = N & 7
+    bge $r0, I, .L999 // nothing left over: exit
+    .align 3
+
+.L224: // one element per pass; X steps by INCX, Y by INCY
+    LD    $f12, X, 0
+    addi.d I, I, -1
+    ST    $f12, Y, 0
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L224 // loop while I > 0, then fall through to .L999
+    .align 3
+
+.L999: // common exit for every path above
+    move $r4, $r12 // $r4 = $r12; NOTE(review): no instruction in this file writes $r12 - confirm callers ignore the value left in $r4
+    jirl $r0, $r1, 0x0 // jump to the address in $r1, discarding the link value (return)
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/crot_lasx.S b/kernel/loongarch64/crot_lasx.S
new file mode 100644
index 000000000..d4ec1e22c
--- /dev/null
+++ b/kernel/loongarch64/crot_lasx.S
@@ -0,0 +1,1079 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define Y      $r7
+#define INCY   $r8
+#define C      $f0
+#define S      $f1
+
+#define I      $r12
+#define TEMP   $r13
+#define t1     $r14
+#define t2     $r16
+#define t3     $r15
+#define t4     $r17
+#define XX     $r18
+#define YY     $r19
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define s1     $f16
+#define s2     $f17
+#define s3     $f18
+#define s4     $f19
+#define VX0    $xr8
+#define VX1    $xr20
+#define VX2    $xr21
+#define VX3    $xr22
+#define VT0    $xr10
+#define VT1    $xr18
+#define VXC    $xr23
+#define VXS    $xr9
+#define VXZ    $xr11
+#define x1     $xr12
+#define x2     $xr13
+#define x3     $xr14
+#define x4     $xr15
+
+    PROLOGUE
+
+    bge $r0, N, .L999 // N <= 0: nothing to do
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0 // a1 = integer zero bits ...
+    FFINT  a1, a1 // ... converted by FFINT (macro from outside this file, presumably int->float) so a1 can serve as 0.0
+    slli.d  TEMP, TEMP, ZBASE_SHIFT // TEMP = 1 << ZBASE_SHIFT, i.e. what a stride of 1 becomes after the scaling below
+    slli.d  INCX, INCX, ZBASE_SHIFT // scale both strides by the same shift
+    slli.d  INCY, INCY, ZBASE_SHIFT
+    MTG  t1, C // MTG is defined outside this file; presumably moves the FP register bits into a GPR
+    MTG  t2, S
+    MTG  t3, a1
+#ifdef DOUBLE
+    xvreplgr2vr.d VXC, t1 // replicate C, S and the zero into every 64-bit lane
+    xvreplgr2vr.d VXS, t2
+    xvreplgr2vr.d VXZ, t3
+    srai.d I, N, 2 // I = N / 4: main loops handle 4 elements per pass
+#else
+    xvreplgr2vr.w VXC, t1 // replicate C, S and the zero into every 32-bit lane
+    xvreplgr2vr.w VXS, t2
+    xvreplgr2vr.w VXZ, t3
+    srai.d I, N, 3 // I = N / 8: main loops handle 8 elements per pass
+#endif
+    beq INCX, $r0, .L996 // a zero stride on either vector is routed to .L996
+    beq INCY, $r0, .L996
+    bne INCX, TEMP, .L22 // INCX!=1 or INCY!=1
+    bne INCY, TEMP, .L22 // otherwise fall through to the unit-stride path .L11
+
+.L11: // unit-stride path: choose a loop according to which of C and S equal a1 (zero)
+    bge $r0, I, .L997 // no full vector group: go to .L997
+    CMPEQ $fcc0, C, a1
+    bcnez $fcc0, .L110 // C==0
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L112 // C!=0 S==0
+    b .L111 // C!=0 S!=0
+    .align 3
+
+.L110: // C==0: split on S
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L114 // C==0 S==0
+    b .L113 // C==0 S!=0
+    .align 3
+
+.L111: // C!=0 S!=0: unit stride, full rotation x' = x*C + y*S, y' = y*C - x*S
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
+    xvld VX1, X, 4 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    xvpickev.d x1, VX1, VX0 // x1 = even-indexed values of the X data
+    xvpickod.d x2, VX1, VX0 // x2 = odd-indexed values of the X data
+    xvpickev.d x3, VX3, VX2 // x3 = even-indexed values of the Y data
+    xvpickod.d x4, VX3, VX2 // x4 = odd-indexed values of the Y data
+    xvfmul.d VX0, x1, VXC
+    xvfmadd.d VX0, x3, VXS, VX0 // VX0 = x3*S + x1*C
+    xvfmul.d VX1, x1, VXS
+    xvfmsub.d VX1, x3, VXC, VX1 // VX1 = x3*C - x1*S
+    xvfmul.d VX2, x2, VXC
+    xvfmadd.d VX2, x4, VXS, VX2 // VX2 = x4*S + x2*C
+    xvfmul.d VX3, x2, VXS
+    xvfmsub.d VX3, x4, VXC, VX3 // VX3 = x4*C - x2*S
+    xvilvl.d x1, VX2 ,VX0 // re-interleave even/odd results back into memory order
+    xvilvh.d x2, VX2, VX0
+    xvilvl.d x3, VX3 ,VX1
+    xvilvh.d x4, VX3, VX1
+    xvst x1, X, 0 * SIZE // results overwrite both inputs in place
+    xvst x3, Y, 0 * SIZE
+    xvst x2, X, 4 * SIZE
+    xvst x4, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#else
+    xvld VX1, X, 8 * SIZE
+    xvld VX3, Y, 8 * SIZE
+    xvpickev.w x1, VX1, VX0 // x1 = even-indexed values of the X data
+    xvpickod.w x2, VX1, VX0 // x2 = odd-indexed values of the X data
+    xvpickev.w x3, VX3, VX2 // x3 = even-indexed values of the Y data
+    xvpickod.w x4, VX3, VX2 // x4 = odd-indexed values of the Y data
+    xvfmul.s VX0, x1, VXC
+    xvfmadd.s VX0, x3, VXS, VX0 // VX0 = x3*S + x1*C
+    xvfmul.s VX1, x1, VXS
+    xvfmsub.s VX1, x3, VXC, VX1 // VX1 = x3*C - x1*S
+    xvfmul.s VX2, x2, VXC
+    xvfmadd.s VX2, x4, VXS, VX2 // VX2 = x4*S + x2*C
+    xvfmul.s VX3, x2, VXS
+    xvfmsub.s VX3, x4, VXC, VX3 // VX3 = x4*C - x2*S
+    xvilvl.w x1, VX2 ,VX0 // re-interleave even/odd results back into memory order
+    xvilvh.w x2, VX2, VX0
+    xvilvl.w x3, VX3 ,VX1
+    xvilvh.w x4, VX3, VX1
+    xvst x1, X, 0 * SIZE // results overwrite both inputs in place
+    xvst x3, Y, 0 * SIZE
+    xvst x2, X, 8 * SIZE
+    xvst x4, Y, 8 * SIZE
+    addi.d X, X, 16 * SIZE
+    addi.d Y, Y, 16 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L111 // loop while I > 0
+    b .L997
+    .align 3
+
+.L112: // C!=0 S==0: unit stride, both vectors are just scaled by C
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
+    xvld VX1, X, 4 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    xvpickev.d x1, VX1, VX0 // split X and Y data into even-indexed (x1, x3) and odd-indexed (x2, x4) values
+    xvpickod.d x2, VX1, VX0
+    xvpickev.d x3, VX3, VX2
+    xvpickod.d x4, VX3, VX2
+    xvfmul.d VX0, x1, VXC // VX0 = x1*C
+    xvfmul.d VX1, x3, VXC // VX1 = x3*C
+    xvfmul.d VX2, x2, VXC // VX2 = x2*C
+    xvfmul.d VX3, x4, VXC // VX3 = x4*C
+    xvilvl.d x1, VX2 ,VX0 // re-interleave back into memory order
+    xvilvh.d x2, VX2, VX0
+    xvilvl.d x3, VX3 ,VX1
+    xvilvh.d x4, VX3, VX1
+    xvst x1, X, 0 * SIZE
+    xvst x3, Y, 0 * SIZE
+    xvst x2, X, 4 * SIZE
+    xvst x4, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#else
+    xvld VX1, X, 8 * SIZE
+    xvld VX3, Y, 8 * SIZE
+    xvpickev.w x1, VX1, VX0 // split X and Y data into even-indexed (x1, x3) and odd-indexed (x2, x4) values
+    xvpickod.w x2, VX1, VX0
+    xvpickev.w x3, VX3, VX2
+    xvpickod.w x4, VX3, VX2
+    xvfmul.s VX0, x1, VXC // VX0 = x1*C
+    xvfmul.s VX1, x3, VXC // VX1 = x3*C
+    xvfmul.s VX2, x2, VXC // VX2 = x2*C
+    xvfmul.s VX3, x4, VXC // VX3 = x4*C
+    xvilvl.w x1, VX2 ,VX0 // re-interleave back into memory order
+    xvilvh.w x2, VX2, VX0
+    xvilvl.w x3, VX3 ,VX1
+    xvilvh.w x4, VX3, VX1
+    xvst x1, X, 0 * SIZE
+    xvst x3, Y, 0 * SIZE
+    xvst x2, X, 8 * SIZE
+    xvst x4, Y, 8 * SIZE
+    addi.d X, X, 16 * SIZE
+    addi.d Y, Y, 16 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L112 // loop while I > 0
+    b .L997
+    .align 3
+
+.L113: // C==0 S!=0: unit stride, x' = y*S and y' = 0 - x*S
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
+    xvld VX1, X, 4 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    xvpickev.d x1, VX1, VX0 // split X and Y data into even-indexed (x1, x3) and odd-indexed (x2, x4) values
+    xvpickod.d x2, VX1, VX0
+    xvpickev.d x3, VX3, VX2
+    xvpickod.d x4, VX3, VX2
+    xvfmul.d VX0, x3, VXS // VX0 = x3*S
+    xvfmul.d VX1, x1, VXS
+    xvfsub.d VX1, VXZ, VX1 // VX1 = 0 - x1*S
+    xvfmul.d VX2, x4, VXS // VX2 = x4*S
+    xvfmul.d VX3, x2, VXS
+    xvfsub.d VX3, VXZ, VX3 // VX3 = 0 - x2*S
+    xvilvl.d x1, VX2 ,VX0 // re-interleave back into memory order
+    xvilvh.d x2, VX2, VX0
+    xvilvl.d x3, VX3 ,VX1
+    xvilvh.d x4, VX3, VX1
+    xvst x1, X, 0 * SIZE
+    xvst x3, Y, 0 * SIZE
+    xvst x2, X, 4 * SIZE
+    xvst x4, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#else
+    xvld VX1, X, 8 * SIZE
+    xvld VX3, Y, 8 * SIZE
+    xvpickev.w x1, VX1, VX0 // split X and Y data into even-indexed (x1, x3) and odd-indexed (x2, x4) values
+    xvpickod.w x2, VX1, VX0
+    xvpickev.w x3, VX3, VX2
+    xvpickod.w x4, VX3, VX2
+    xvfmul.s VX0, x3, VXS // VX0 = x3*S
+    xvfmul.s VX1, x1, VXS
+    xvfsub.s VX1, VXZ, VX1 // VX1 = 0 - x1*S
+    xvfmul.s VX2, x4, VXS // VX2 = x4*S
+    xvfmul.s VX3, x2, VXS
+    xvfsub.s VX3, VXZ, VX3 // VX3 = 0 - x2*S
+    xvilvl.w x1, VX2 ,VX0 // re-interleave back into memory order
+    xvilvh.w x2, VX2, VX0
+    xvilvl.w x3, VX3 ,VX1
+    xvilvh.w x4, VX3, VX1
+    xvst x1, X, 0 * SIZE
+    xvst x3, Y, 0 * SIZE
+    xvst x2, X, 8 * SIZE
+    xvst x4, Y, 8 * SIZE
+    addi.d X, X, 16 * SIZE
+    addi.d Y, Y, 16 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L113 // loop while I > 0
+    b .L997
+    .align 3
+
+.L114: // C==0 S==0: unit stride, both vectors are overwritten with the zero vector VXZ
+    xvst VXZ, X, 0 * SIZE
+    xvst VXZ, Y, 0 * SIZE
+#ifdef DOUBLE
+    xvst VXZ, X, 4 * SIZE
+    xvst VXZ, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#else
+    xvst VXZ, X, 8 * SIZE
+    xvst VXZ, Y, 8 * SIZE
+    addi.d X, X, 16 * SIZE
+    addi.d Y, Y, 16 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L114 // loop while I > 0
+    b .L997
+    .align 3
+
+.L22: // non-unit-stride path: choose a loop according to which of C and S equal a1 (zero)
+    bge $r0, I, .L997 // no full vector group: go to .L997
+    move YY, Y // YY/XX are the write-back cursors; Y/X are advanced by the loads
+    move XX, X
+    CMPEQ $fcc0, C, a1
+    bcnez $fcc0, .L220 // C==0
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L222 // C!=0 S==0
+    b .L221 // C!=0 S!=0
+    .align 3
+
+.L220: // C==0: split on S
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L224 // C==0 S==0
+    b .L223 // C==0 S!=0
+    .align 3
+
+.L221: // C!=0 S!=0: strided, full rotation x' = x*C + y*S, y' = y*C - x*S
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE // gather: value at offset 0 goes to x1, value at offset 1*SIZE goes to x2
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 0
+    xvinsgr2vr.d x2, t2, 0
+    xvinsgr2vr.d x1, t3, 1
+    xvinsgr2vr.d x2, t4, 1
+    ld.d t1, Y, 0 * SIZE // same for Y: offset 0 goes to x3, offset 1*SIZE goes to x4
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d x3, t1, 0
+    xvinsgr2vr.d x4, t2, 0
+    xvinsgr2vr.d x3, t3, 1
+    xvinsgr2vr.d x4, t4, 1
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    xvinsgr2vr.d x1, t1, 2
+    xvinsgr2vr.d x2, t2, 2
+    xvinsgr2vr.d x1, t3, 3
+    xvinsgr2vr.d x2, t4, 3
+    add.d X, X, INCX
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    xvinsgr2vr.d x3, t1, 2
+    xvinsgr2vr.d x4, t2, 2
+    xvinsgr2vr.d x3, t3, 3
+    xvinsgr2vr.d x4, t4, 3
+    add.d Y, Y, INCY
+
+    xvfmul.d VX0, x1, VXC
+    xvfmadd.d VX0, x3, VXS, VX0 // VX0 = x3*S + x1*C
+    xvfmul.d VX1, x1, VXS
+    xvfmsub.d VX1, x3, VXC, VX1 // VX1 = x3*C - x1*S
+    xvfmul.d VX2, x2, VXC
+    xvfmadd.d VX2, x4, VXS, VX2 // VX2 = x4*S + x2*C
+    xvfmul.d VX3, x2, VXS
+    xvfmsub.d VX3, x4, VXC, VX3 // VX3 = x4*C - x2*S
+    xvstelm.d VX0, XX, 0, 0 // scatter back through XX: VX0 lane to offset 0, VX2 lane to offset 1*SIZE
+    xvstelm.d VX2, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.d VX0, XX, 0, 1
+    xvstelm.d VX2, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    xvstelm.d VX0, XX, 0, 2
+    xvstelm.d VX2, XX, 1 * SIZE, 2
+    add.d XX, XX, INCX
+    xvstelm.d VX0, XX, 0, 3
+    xvstelm.d VX2, XX, 1 * SIZE, 3
+    add.d XX, XX, INCX
+    xvstelm.d VX1, YY, 0, 0 // scatter back through YY: VX1 lane to offset 0, VX3 lane to offset 1*SIZE
+    xvstelm.d VX3, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.d VX1, YY, 0, 1
+    xvstelm.d VX3, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.d VX1, YY, 0, 2
+    xvstelm.d VX3, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.d VX1, YY, 0, 3
+    xvstelm.d VX3, YY, 1 * SIZE, 3
+#else
+    ld.w t1, X, 0 * SIZE // gather: value at offset 0 goes to x1, value at offset 1*SIZE goes to x2
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 0
+    xvinsgr2vr.w x2, t2, 0
+    xvinsgr2vr.w x1, t3, 1
+    xvinsgr2vr.w x2, t4, 1
+    ld.w t1, Y, 0 * SIZE // same for Y: offset 0 goes to x3, offset 1*SIZE goes to x4
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 0
+    xvinsgr2vr.w x4, t2, 0
+    xvinsgr2vr.w x3, t3, 1
+    xvinsgr2vr.w x4, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    xvinsgr2vr.w x1, t1, 2
+    xvinsgr2vr.w x2, t2, 2
+    xvinsgr2vr.w x1, t3, 3
+    xvinsgr2vr.w x2, t4, 3
+    add.d X, X, INCX
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    xvinsgr2vr.w x3, t1, 2
+    xvinsgr2vr.w x4, t2, 2
+    xvinsgr2vr.w x3, t3, 3
+    xvinsgr2vr.w x4, t4, 3
+    add.d Y, Y, INCY
+
+    ld.w t1, X, 0 * SIZE // lanes 4..7: second half of the group of 8
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 4
+    xvinsgr2vr.w x2, t2, 4
+    xvinsgr2vr.w x1, t3, 5
+    xvinsgr2vr.w x2, t4, 5
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 4
+    xvinsgr2vr.w x4, t2, 4
+    xvinsgr2vr.w x3, t3, 5
+    xvinsgr2vr.w x4, t4, 5
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    xvinsgr2vr.w x1, t1, 6
+    xvinsgr2vr.w x2, t2, 6
+    xvinsgr2vr.w x1, t3, 7
+    xvinsgr2vr.w x2, t4, 7
+    add.d X, X, INCX
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    xvinsgr2vr.w x3, t1, 6
+    xvinsgr2vr.w x4, t2, 6
+    xvinsgr2vr.w x3, t3, 7
+    xvinsgr2vr.w x4, t4, 7
+    add.d Y, Y, INCY
+
+    xvfmul.s VX0, x1, VXC
+    xvfmadd.s VX0, x3, VXS, VX0 // VX0 = x3*S + x1*C
+    xvfmul.s VX1, x1, VXS
+    xvfmsub.s VX1, x3, VXC, VX1 // VX1 = x3*C - x1*S
+    xvfmul.s VX2, x2, VXC
+    xvfmadd.s VX2, x4, VXS, VX2 // VX2 = x4*S + x2*C
+    xvfmul.s VX3, x2, VXS
+    xvfmsub.s VX3, x4, VXC, VX3 // VX3 = x4*C - x2*S
+    xvstelm.w VX0, XX, 0, 0 // scatter back through XX: VX0 lane to offset 0, VX2 lane to offset 1*SIZE
+    xvstelm.w VX2, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 1
+    xvstelm.w VX2, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 2
+    xvstelm.w VX2, XX, 1 * SIZE, 2
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 3
+    xvstelm.w VX2, XX, 1 * SIZE, 3
+    add.d XX, XX, INCX
+    xvstelm.w VX1, YY, 0, 0 // scatter back through YY: VX1 lane to offset 0, VX3 lane to offset 1*SIZE
+    xvstelm.w VX3, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 1
+    xvstelm.w VX3, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 2
+    xvstelm.w VX3, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 3
+    xvstelm.w VX3, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX0, XX, 0, 4 // lanes 4..7
+    xvstelm.w VX2, XX, 1 * SIZE, 4
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 5
+    xvstelm.w VX2, XX, 1 * SIZE, 5
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 6
+    xvstelm.w VX2, XX, 1 * SIZE, 6
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 7
+    xvstelm.w VX2, XX, 1 * SIZE, 7
+    add.d XX, XX, INCX
+    xvstelm.w VX1, YY, 0, 4
+    xvstelm.w VX3, YY, 1 * SIZE, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 5
+    xvstelm.w VX3, YY, 1 * SIZE, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 6
+    xvstelm.w VX3, YY, 1 * SIZE, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 7
+    xvstelm.w VX3, YY, 1 * SIZE, 7
+#endif
+    add.d YY, YY, INCY // final YY step, shared by both branches above
+    addi.d  I, I, -1
+    blt $r0, I, .L221 // loop while I > 0
+    b .L997
+    .align 3
+
+.L222: // C!=0 S==0: strided, both vectors are just scaled by C
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE // gather: value at offset 0 goes to x1, value at offset 1*SIZE goes to x2
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 0
+    xvinsgr2vr.d x2, t2, 0
+    xvinsgr2vr.d x1, t3, 1
+    xvinsgr2vr.d x2, t4, 1
+    ld.d t1, Y, 0 * SIZE // same for Y: offset 0 goes to x3, offset 1*SIZE goes to x4
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d x3, t1, 0
+    xvinsgr2vr.d x4, t2, 0
+    xvinsgr2vr.d x3, t3, 1
+    xvinsgr2vr.d x4, t4, 1
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    xvinsgr2vr.d x1, t1, 2
+    xvinsgr2vr.d x2, t2, 2
+    xvinsgr2vr.d x1, t3, 3
+    xvinsgr2vr.d x2, t4, 3
+    add.d X, X, INCX
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    xvinsgr2vr.d x3, t1, 2
+    xvinsgr2vr.d x4, t2, 2
+    xvinsgr2vr.d x3, t3, 3
+    xvinsgr2vr.d x4, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VX0, x1, VXC // VX0 = x1*C
+    xvfmul.d VX1, x3, VXC // VX1 = x3*C
+    xvfmul.d VX2, x2, VXC // VX2 = x2*C
+    xvfmul.d VX3, x4, VXC // VX3 = x4*C
+    xvstelm.d VX0, XX, 0, 0 // scatter back through XX: VX0 lane to offset 0, VX2 lane to offset 1*SIZE
+    xvstelm.d VX2, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.d VX0, XX, 0, 1
+    xvstelm.d VX2, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    xvstelm.d VX0, XX, 0, 2
+    xvstelm.d VX2, XX, 1 * SIZE, 2
+    add.d XX, XX, INCX
+    xvstelm.d VX0, XX, 0, 3
+    xvstelm.d VX2, XX, 1 * SIZE, 3
+    add.d XX, XX, INCX
+    xvstelm.d VX1, YY, 0, 0 // scatter back through YY: VX1 lane to offset 0, VX3 lane to offset 1*SIZE
+    xvstelm.d VX3, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.d VX1, YY, 0, 1
+    xvstelm.d VX3, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.d VX1, YY, 0, 2
+    xvstelm.d VX3, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.d VX1, YY, 0, 3
+    xvstelm.d VX3, YY, 1 * SIZE, 3
+#else
+    ld.w t1, X, 0 * SIZE // gather: value at offset 0 goes to x1, value at offset 1*SIZE goes to x2
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 0
+    xvinsgr2vr.w x2, t2, 0
+    xvinsgr2vr.w x1, t3, 1
+    xvinsgr2vr.w x2, t4, 1
+    ld.w t1, Y, 0 * SIZE // same for Y: offset 0 goes to x3, offset 1*SIZE goes to x4
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 0
+    xvinsgr2vr.w x4, t2, 0
+    xvinsgr2vr.w x3, t3, 1
+    xvinsgr2vr.w x4, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    xvinsgr2vr.w x1, t1, 2
+    xvinsgr2vr.w x2, t2, 2
+    xvinsgr2vr.w x1, t3, 3
+    xvinsgr2vr.w x2, t4, 3
+    add.d X, X, INCX
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    xvinsgr2vr.w x3, t1, 2
+    xvinsgr2vr.w x4, t2, 2
+    xvinsgr2vr.w x3, t3, 3
+    xvinsgr2vr.w x4, t4, 3
+    add.d Y, Y, INCY
+
+    ld.w t1, X, 0 * SIZE // lanes 4..7: second half of the group of 8
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 4
+    xvinsgr2vr.w x2, t2, 4
+    xvinsgr2vr.w x1, t3, 5
+    xvinsgr2vr.w x2, t4, 5
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 4
+    xvinsgr2vr.w x4, t2, 4
+    xvinsgr2vr.w x3, t3, 5
+    xvinsgr2vr.w x4, t4, 5
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    xvinsgr2vr.w x1, t1, 6
+    xvinsgr2vr.w x2, t2, 6
+    xvinsgr2vr.w x1, t3, 7
+    xvinsgr2vr.w x2, t4, 7
+    add.d X, X, INCX
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    xvinsgr2vr.w x3, t1, 6
+    xvinsgr2vr.w x4, t2, 6
+    xvinsgr2vr.w x3, t3, 7
+    xvinsgr2vr.w x4, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s VX0, x1, VXC // VX0 = x1*C
+    xvfmul.s VX1, x3, VXC // VX1 = x3*C
+    xvfmul.s VX2, x2, VXC // VX2 = x2*C
+    xvfmul.s VX3, x4, VXC // VX3 = x4*C
+    xvstelm.w VX0, XX, 0, 0 // scatter back through XX: VX0 lane to offset 0, VX2 lane to offset 1*SIZE
+    xvstelm.w VX2, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 1
+    xvstelm.w VX2, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 2
+    xvstelm.w VX2, XX, 1 * SIZE, 2
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 3
+    xvstelm.w VX2, XX, 1 * SIZE, 3
+    add.d XX, XX, INCX
+    xvstelm.w VX1, YY, 0, 0 // scatter back through YY: VX1 lane to offset 0, VX3 lane to offset 1*SIZE
+    xvstelm.w VX3, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 1
+    xvstelm.w VX3, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 2
+    xvstelm.w VX3, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 3
+    xvstelm.w VX3, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX0, XX, 0, 4 // lanes 4..7
+    xvstelm.w VX2, XX, 1 * SIZE, 4
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 5
+    xvstelm.w VX2, XX, 1 * SIZE, 5
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 6
+    xvstelm.w VX2, XX, 1 * SIZE, 6
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 7
+    xvstelm.w VX2, XX, 1 * SIZE, 7
+    add.d XX, XX, INCX
+    xvstelm.w VX1, YY, 0, 4
+    xvstelm.w VX3, YY, 1 * SIZE, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 5
+    xvstelm.w VX3, YY, 1 * SIZE, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 6
+    xvstelm.w VX3, YY, 1 * SIZE, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 7
+    xvstelm.w VX3, YY, 1 * SIZE, 7
+#endif
+    add.d YY, YY, INCY // final YY step, shared by both branches above
+    addi.d  I, I, -1
+    blt $r0, I, .L222 // loop while I > 0
+    b .L997
+    .align 3
+
+.L223: // C==0 S!=0: strided, x' = y*S and y' = 0 - x*S
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE // gather: value at offset 0 goes to x1, value at offset 1*SIZE goes to x2
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 0
+    xvinsgr2vr.d x2, t2, 0
+    xvinsgr2vr.d x1, t3, 1
+    xvinsgr2vr.d x2, t4, 1
+    ld.d t1, Y, 0 * SIZE // same for Y: offset 0 goes to x3, offset 1*SIZE goes to x4
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d x3, t1, 0
+    xvinsgr2vr.d x4, t2, 0
+    xvinsgr2vr.d x3, t3, 1
+    xvinsgr2vr.d x4, t4, 1
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    xvinsgr2vr.d x1, t1, 2
+    xvinsgr2vr.d x2, t2, 2
+    xvinsgr2vr.d x1, t3, 3
+    xvinsgr2vr.d x2, t4, 3
+    add.d X, X, INCX
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    xvinsgr2vr.d x3, t1, 2
+    xvinsgr2vr.d x4, t2, 2
+    xvinsgr2vr.d x3, t3, 3
+    xvinsgr2vr.d x4, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VX0, x3, VXS // VX0 = x3*S
+    xvfmul.d VX1, x1, VXS
+    xvfsub.d VX1, VXZ, VX1 // VX1 = 0 - x1*S
+    xvfmul.d VX2, x4, VXS // VX2 = x4*S
+    xvfmul.d VX3, x2, VXS
+    xvfsub.d VX3, VXZ, VX3 // VX3 = 0 - x2*S
+    xvstelm.d VX0, XX, 0, 0 // scatter back through XX: VX0 lane to offset 0, VX2 lane to offset 1*SIZE
+    xvstelm.d VX2, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.d VX0, XX, 0, 1
+    xvstelm.d VX2, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    xvstelm.d VX0, XX, 0, 2
+    xvstelm.d VX2, XX, 1 * SIZE, 2
+    add.d XX, XX, INCX
+    xvstelm.d VX0, XX, 0, 3
+    xvstelm.d VX2, XX, 1 * SIZE, 3
+    add.d XX, XX, INCX
+    xvstelm.d VX1, YY, 0, 0 // scatter back through YY: VX1 lane to offset 0, VX3 lane to offset 1*SIZE
+    xvstelm.d VX3, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.d VX1, YY, 0, 1
+    xvstelm.d VX3, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.d VX1, YY, 0, 2
+    xvstelm.d VX3, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.d VX1, YY, 0, 3
+    xvstelm.d VX3, YY, 1 * SIZE, 3
+#else
+    ld.w t1, X, 0 * SIZE // gather: value at offset 0 goes to x1, value at offset 1*SIZE goes to x2
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 0
+    xvinsgr2vr.w x2, t2, 0
+    xvinsgr2vr.w x1, t3, 1
+    xvinsgr2vr.w x2, t4, 1
+    ld.w t1, Y, 0 * SIZE // same for Y: offset 0 goes to x3, offset 1*SIZE goes to x4
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 0
+    xvinsgr2vr.w x4, t2, 0
+    xvinsgr2vr.w x3, t3, 1
+    xvinsgr2vr.w x4, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    xvinsgr2vr.w x1, t1, 2
+    xvinsgr2vr.w x2, t2, 2
+    xvinsgr2vr.w x1, t3, 3
+    xvinsgr2vr.w x2, t4, 3
+    add.d X, X, INCX
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    xvinsgr2vr.w x3, t1, 2
+    xvinsgr2vr.w x4, t2, 2
+    xvinsgr2vr.w x3, t3, 3
+    xvinsgr2vr.w x4, t4, 3
+    add.d Y, Y, INCY
+
+    ld.w t1, X, 0 * SIZE // lanes 4..7: second half of the group of 8
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 4
+    xvinsgr2vr.w x2, t2, 4
+    xvinsgr2vr.w x1, t3, 5
+    xvinsgr2vr.w x2, t4, 5
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 4
+    xvinsgr2vr.w x4, t2, 4
+    xvinsgr2vr.w x3, t3, 5
+    xvinsgr2vr.w x4, t4, 5
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    xvinsgr2vr.w x1, t1, 6
+    xvinsgr2vr.w x2, t2, 6
+    xvinsgr2vr.w x1, t3, 7
+    xvinsgr2vr.w x2, t4, 7
+    add.d X, X, INCX
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    xvinsgr2vr.w x3, t1, 6
+    xvinsgr2vr.w x4, t2, 6
+    xvinsgr2vr.w x3, t3, 7
+    xvinsgr2vr.w x4, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s VX0, x3, VXS // VX0 = x3*S
+    xvfmul.s VX1, x1, VXS
+    xvfsub.s VX1, VXZ, VX1 // VX1 = 0 - x1*S
+    xvfmul.s VX2, x4, VXS // VX2 = x4*S
+    xvfmul.s VX3, x2, VXS
+    xvfsub.s VX3, VXZ, VX3 // VX3 = 0 - x2*S
+    xvstelm.w VX0, XX, 0, 0 // scatter back through XX: VX0 lane to offset 0, VX2 lane to offset 1*SIZE
+    xvstelm.w VX2, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 1
+    xvstelm.w VX2, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 2
+    xvstelm.w VX2, XX, 1 * SIZE, 2
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 3
+    xvstelm.w VX2, XX, 1 * SIZE, 3
+    add.d XX, XX, INCX
+    xvstelm.w VX1, YY, 0, 0 // scatter back through YY: VX1 lane to offset 0, VX3 lane to offset 1*SIZE
+    xvstelm.w VX3, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 1
+    xvstelm.w VX3, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 2
+    xvstelm.w VX3, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 3
+    xvstelm.w VX3, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX0, XX, 0, 4 // lanes 4..7
+    xvstelm.w VX2, XX, 1 * SIZE, 4
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 5
+    xvstelm.w VX2, XX, 1 * SIZE, 5
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 6
+    xvstelm.w VX2, XX, 1 * SIZE, 6
+    add.d XX, XX, INCX
+    xvstelm.w VX0, XX, 0, 7
+    xvstelm.w VX2, XX, 1 * SIZE, 7
+    add.d XX, XX, INCX
+    xvstelm.w VX1, YY, 0, 4
+    xvstelm.w VX3, YY, 1 * SIZE, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 5
+    xvstelm.w VX3, YY, 1 * SIZE, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 6
+    xvstelm.w VX3, YY, 1 * SIZE, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX1, YY, 0, 7
+    xvstelm.w VX3, YY, 1 * SIZE, 7
+#endif
+    add.d YY, YY, INCY // final YY step, shared by both branches above
+    addi.d  I, I, -1
+    blt $r0, I, .L223 // loop while I > 0
+    b .L997
+    .align 3
+
+.L224: // C==0 S==0: overwrite X and Y through the XX/YY cursors with lane 0 of VXZ (presumably the zero vector - confirm against the prologue)
+#ifdef DOUBLE
+    xvstelm.d VXZ, XX, 0, 0 // DOUBLE: 4 X elements per pass, two stores each (offsets 0 and 1 * SIZE)
+    xvstelm.d VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, XX, 0, 0
+    xvstelm.d VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, XX, 0, 0
+    xvstelm.d VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, XX, 0, 0
+    xvstelm.d VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, YY, 0, 0 // then the same 4 elements of Y
+    xvstelm.d VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 0
+    xvstelm.d VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 0
+    xvstelm.d VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 0
+    xvstelm.d VXZ, YY, 1 * SIZE, 0
+#else
+    xvstelm.w VXZ, XX, 0, 0 // single precision: 8 elements per pass, done as two groups of 4 X + 4 Y
+    xvstelm.w VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 0
+    xvstelm.w VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 0
+    xvstelm.w VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 0
+    xvstelm.w VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, YY, 0, 0
+    xvstelm.w VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 0
+    xvstelm.w VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 0
+    xvstelm.w VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 0
+    xvstelm.w VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+
+    xvstelm.w VXZ, XX, 0, 0 // second group of 4 elements
+    xvstelm.w VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 0
+    xvstelm.w VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 0
+    xvstelm.w VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 0
+    xvstelm.w VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, YY, 0, 0
+    xvstelm.w VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 0
+    xvstelm.w VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 0
+    xvstelm.w VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 0
+    xvstelm.w VXZ, YY, 1 * SIZE, 0
+#endif
+    add.d YY, YY, INCY // shared final YY advance for both precisions
+    addi.d  I, I, -1
+    blt $r0, I, .L224 // repeat while I > 0
+    move X, XX // X and Y are not advanced inside this loop; resync them with the
+    move Y, YY // store cursors so the scalar tail continues from the right place
+    b .L997
+    .align 3
+
+.L996: // run the scalar loop over all N elements (no vector passes)
+    move I, N
+    b .L998
+    .align 3
+
+.L997: // tail count for the scalar loop: the elements left over after the vector passes
+#ifdef DOUBLE
+    andi I, N, 3 // DOUBLE: N mod 4
+#else
+    andi I, N, 7 // single precision: N mod 8
+#endif
+    bge $r0, I, .L999 // nothing left: skip the scalar loop
+    .align 3
+
+.L998: // scalar loop: rotate one element (two values at offsets 0 and 1 * SIZE) of X and Y per pass
+    LD   a1, X, 0 * SIZE
+    LD   a2, X, 1 * SIZE
+    LD   a3, Y, 0 * SIZE
+    LD   a4, Y, 1 * SIZE
+    MUL  s1, a1, C
+    MADD s1, a3, S, s1 // s1 = a1*C + a3*S (assuming MADD is a fused d = j*k + a; macro comes from common.h)
+    MUL  s2, a1, S
+    MSUB s2, a3, C, s2 // s2 = a3*C - a1*S (assuming MSUB is a fused d = j*k - a)
+    MUL  s3, a2, C
+    MADD s3, a4, S, s3 // same pair of formulas for the offset-1 values
+    MUL  s4, a2, S
+    MSUB s4, a4, C, s4
+    addi.d I, I, -1
+    ST   s1, X, 0 * SIZE // all loads above precede these stores
+    ST   s2, Y, 0 * SIZE
+    ST   s3, X, 1 * SIZE
+    ST   s4, Y, 1 * SIZE
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L998 // repeat while I > 0
+    .align 3
+
+.L999: // common exit
+    move $r4, $r12 // copy $r12 into $r4 before returning (NOTE(review): $r12 looks like the loop counter - confirm callers ignore this value)
+    jirl $r0, $r1, 0x0 // jump to the address in $r1, discarding the link value (function return)
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/crot_lsx.S b/kernel/loongarch64/crot_lsx.S
new file mode 100644
index 000000000..126257edc
--- /dev/null
+++ b/kernel/loongarch64/crot_lsx.S
@@ -0,0 +1,907 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4 /* element count; the kernel exits immediately when N <= 0 */
+#define X      $r5 /* read cursor into X; also used for the scalar tail */
+#define INCX   $r6 /* X stride; shifted left by ZBASE_SHIFT in the prologue */
+#define Y      $r7 /* read cursor into Y; also used for the scalar tail */
+#define INCY   $r8 /* Y stride; shifted left by ZBASE_SHIFT in the prologue */
+#define C      $f0 /* rotation coefficient c (scalar) */
+#define S      $f1 /* rotation coefficient s (scalar) */
+
+#define I      $r12 /* loop counter (vector passes, then scalar tail) */
+#define TEMP   $r13 /* 1 << ZBASE_SHIFT: the scaled stride value that means "unit stride" */
+#define t1     $r14 /* t1..t4: integer scratch for moving values between memory/FP and vector lanes */
+#define t2     $r16
+#define t3     $r15
+#define t4     $r17
+#define XX     $r18 /* store cursor for X in the strided loops */
+#define YY     $r19 /* store cursor for Y in the strided loops */
+#define a1     $f12 /* a1..a4: scalar inputs; a1 also holds 0.0 for the C/S zero tests */
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define s1     $f16 /* s1..s4: scalar results of the tail loop */
+#define s2     $f17
+#define s3     $f18
+#define s4     $f19
+#define VX0    $vr8 /* VX0..VX3: vector inputs as loaded, then reused for the results */
+#define VX1    $vr20
+#define VX2    $vr21
+#define VX3    $vr22
+#define VT0    $vr10 /* not referenced in this file */
+#define VT1    $vr18 /* not referenced in this file */
+#define VXC    $vr23 /* C replicated to every lane */
+#define VXS    $vr9 /* S replicated to every lane */
+#define VXZ    $vr11 /* 0.0 replicated to every lane */
+#define x1     $vr12 /* x1/x2: offset-0 / offset-1 values gathered from X */
+#define x2     $vr13
+#define x3     $vr14 /* x3/x4: offset-0 / offset-1 values gathered from Y */
+#define x4     $vr15
+
+    PROLOGUE
+
+    // Entry: exit early for N <= 0, scale the strides by ZBASE_SHIFT, broadcast
+    // C, S and 0.0 into vector registers, set the unit-stride vector pass count
+    // in I, then choose between the scalar-only path (.L996, a zero stride),
+    // the strided path (.L22) and the unit-stride path (.L11, fall-through).
+    bge $r0, N, .L999 // nothing to do when N <= 0
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    FFINT a1, a1 // a1 = integer 0 converted to floating point; compared against C and S later
+    slli.d  TEMP, TEMP, ZBASE_SHIFT // TEMP = 1 << ZBASE_SHIFT, i.e. what a stride of 1 becomes after scaling
+    slli.d  INCX, INCX, ZBASE_SHIFT // scale both strides the same way
+    slli.d  INCY, INCY, ZBASE_SHIFT
+    MTG  t1, C // MTG comes from common.h; presumably moves the FP bits into a GPR for the broadcast
+    MTG  t2, S
+    MTG  t3, a1
+#ifdef DOUBLE
+    vreplgr2vr.d VXC, t1
+    vreplgr2vr.d VXS, t2
+    vreplgr2vr.d VXZ, t3
+    // The unit-stride loops (.L111-.L114) consume 2 elements per pass in the
+    // DOUBLE build and the .L997 tail is N & 1, so I must be N >> 1 here.
+    // Without this, .L11 tests an uninitialised register. The strided path
+    // recomputes I at .L22.
+    srai.d I, N, 1
+#else
+    vreplgr2vr.w VXC, t1
+    vreplgr2vr.w VXS, t2
+    vreplgr2vr.w VXZ, t3
+    srai.d I, N, 2 // 4 elements per vector pass in single precision
+#endif
+    beq INCX, $r0, .L996 // a zero stride goes through the scalar loop only
+    beq INCY, $r0, .L996
+    bne INCX, TEMP, .L22 // INCX!=1 or INCY!=1
+    bne INCY, TEMP, .L22
+
+.L11: // unit-stride path: pick a loop from which of C and S compare equal to 0.0 (a1)
+    bge $r0, I, .L997 // no full vector pass: go straight to the tail (NOTE(review): I must already hold the pass count here - confirm for the DOUBLE build)
+    CMPEQ $fcc0, C, a1
+    bcnez $fcc0, .L110 // C == 0: second-level test on S
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L112 // C!=0 S==0
+    b .L111 // C!=0 S!=0
+    .align 3
+
+.L110: // unit stride, C == 0: choose the loop from S
+    CMPEQ $fcc0, S, a1
+    bceqz $fcc0, .L113 // compare false -> C==0 S!=0
+    b .L114 // compare true -> C==0 S==0
+    .align 3
+
+.L111: // C!=0 S!=0: unit-stride full rotation, two 16-byte vectors of X and of Y per pass
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE
+    vld VX3, Y, 2 * SIZE
+    vpickev.d x1, VX1, VX0 // de-interleave: x1/x2 = even/odd lanes of X,
+    vpickod.d x2, VX1, VX0
+    vpickev.d x3, VX3, VX2 // x3/x4 = even/odd lanes of Y
+    vpickod.d x4, VX3, VX2
+    vfmul.d VX0, x1, VXC
+    vfmadd.d VX0, x3, VXS, VX0 // VX0 = x1*C + x3*S (new X, even lanes)
+    vfmul.d VX1, x1, VXS
+    vfmsub.d VX1, x3, VXC, VX1 // VX1 = x3*C - x1*S (new Y, even lanes)
+    vfmul.d VX2, x2, VXC
+    vfmadd.d VX2, x4, VXS, VX2 // same for the odd lanes
+    vfmul.d VX3, x2, VXS
+    vfmsub.d VX3, x4, VXC, VX3
+    vilvl.d x1, VX2 ,VX0 // re-interleave even/odd results back into memory order
+    vilvh.d x2, VX2, VX0
+    vilvl.d x3, VX3 ,VX1
+    vilvh.d x4, VX3, VX1
+    vst x1, X, 0 * SIZE
+    vst x3, Y, 0 * SIZE
+    vst x2, X, 2 * SIZE
+    vst x4, Y, 2 * SIZE
+    addi.d X, X, 4 * SIZE // advance by the 4 values just processed
+    addi.d Y, Y, 4 * SIZE
+#else
+    vld VX1, X, 4 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vpickev.w x1, VX1, VX0 // de-interleave as above, on 32-bit lanes
+    vpickod.w x2, VX1, VX0
+    vpickev.w x3, VX3, VX2
+    vpickod.w x4, VX3, VX2
+    vfmul.s VX0, x1, VXC
+    vfmadd.s VX0, x3, VXS, VX0 // VX0 = x1*C + x3*S
+    vfmul.s VX1, x1, VXS
+    vfmsub.s VX1, x3, VXC, VX1 // VX1 = x3*C - x1*S
+    vfmul.s VX2, x2, VXC
+    vfmadd.s VX2, x4, VXS, VX2
+    vfmul.s VX3, x2, VXS
+    vfmsub.s VX3, x4, VXC, VX3
+    vilvl.w x1, VX2 ,VX0 // re-interleave for the stores
+    vilvh.w x2, VX2, VX0
+    vilvl.w x3, VX3 ,VX1
+    vilvh.w x4, VX3, VX1
+    vst x1, X, 0 * SIZE
+    vst x3, Y, 0 * SIZE
+    vst x2, X, 4 * SIZE
+    vst x4, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE // advance by the 8 values just processed
+    addi.d Y, Y, 8 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L111 // repeat while I > 0
+    b .L997
+    .align 3
+
+.L112: // C!=0 S==0: unit stride; both vectors are only scaled by C
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE
+    vld VX3, Y, 2 * SIZE
+    vpickev.d x1, VX1, VX0 // x1/x2 = even/odd lanes of X, x3/x4 = even/odd lanes of Y
+    vpickod.d x2, VX1, VX0
+    vpickev.d x3, VX3, VX2
+    vpickod.d x4, VX3, VX2
+    vfmul.d VX0, x1, VXC // new X = X * C
+    vfmul.d VX1, x3, VXC // new Y = Y * C
+    vfmul.d VX2, x2, VXC
+    vfmul.d VX3, x4, VXC
+    vilvl.d x1, VX2 ,VX0 // re-interleave for the stores
+    vilvh.d x2, VX2, VX0
+    vilvl.d x3, VX3 ,VX1
+    vilvh.d x4, VX3, VX1
+    vst x1, X, 0 * SIZE
+    vst x3, Y, 0 * SIZE
+    vst x2, X, 2 * SIZE
+    vst x4, Y, 2 * SIZE
+    addi.d X, X, 4 * SIZE
+    addi.d Y, Y, 4 * SIZE
+#else
+    vld VX1, X, 4 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+    vpickev.w x3, VX3, VX2
+    vpickod.w x4, VX3, VX2
+    vfmul.s VX0, x1, VXC // new X = X * C
+    vfmul.s VX1, x3, VXC // new Y = Y * C
+    vfmul.s VX2, x2, VXC
+    vfmul.s VX3, x4, VXC
+    vilvl.w x1, VX2 ,VX0
+    vilvh.w x2, VX2, VX0
+    vilvl.w x3, VX3 ,VX1
+    vilvh.w x4, VX3, VX1
+    vst x1, X, 0 * SIZE
+    vst x3, Y, 0 * SIZE
+    vst x2, X, 4 * SIZE
+    vst x4, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L112 // repeat while I > 0
+    b .L997
+    .align 3
+
+.L113: // C==0 S!=0: unit stride; new X = Y*S, new Y = 0 - X*S
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE
+    vld VX3, Y, 2 * SIZE
+    vpickev.d x1, VX1, VX0 // x1/x2 = even/odd lanes of X, x3/x4 = even/odd lanes of Y
+    vpickod.d x2, VX1, VX0
+    vpickev.d x3, VX3, VX2
+    vpickod.d x4, VX3, VX2
+    vfmul.d VX0, x3, VXS // new X = Y * S
+    vfmul.d VX1, x1, VXS
+    vfsub.d VX1, VXZ, VX1 // new Y = VXZ - X * S
+    vfmul.d VX2, x4, VXS
+    vfmul.d VX3, x2, VXS
+    vfsub.d VX3, VXZ, VX3
+    vilvl.d x1, VX2 ,VX0 // re-interleave for the stores
+    vilvh.d x2, VX2, VX0
+    vilvl.d x3, VX3 ,VX1
+    vilvh.d x4, VX3, VX1
+    vst x1, X, 0 * SIZE
+    vst x3, Y, 0 * SIZE
+    vst x2, X, 2 * SIZE
+    vst x4, Y, 2 * SIZE
+    addi.d X, X, 4 * SIZE
+    addi.d Y, Y, 4 * SIZE
+#else
+    vld VX1, X, 4 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+    vpickev.w x3, VX3, VX2
+    vpickod.w x4, VX3, VX2
+    vfmul.s VX0, x3, VXS // new X = Y * S
+    vfmul.s VX1, x1, VXS
+    vfsub.s VX1, VXZ, VX1 // new Y = VXZ - X * S
+    vfmul.s VX2, x4, VXS
+    vfmul.s VX3, x2, VXS
+    vfsub.s VX3, VXZ, VX3
+    vilvl.w x1, VX2 ,VX0
+    vilvh.w x2, VX2, VX0
+    vilvl.w x3, VX3 ,VX1
+    vilvh.w x4, VX3, VX1
+    vst x1, X, 0 * SIZE
+    vst x3, Y, 0 * SIZE
+    vst x2, X, 4 * SIZE
+    vst x4, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L113 // repeat while I > 0
+    b .L997
+    .align 3
+
+.L114: // C==0 S==0: unit stride; overwrite X and Y with VXZ (0.0 in every lane), no loads needed
+    vst VXZ, X, 0 * SIZE
+    vst VXZ, Y, 0 * SIZE
+#ifdef DOUBLE
+    vst VXZ, X, 2 * SIZE
+    vst VXZ, Y, 2 * SIZE
+    addi.d X, X, 4 * SIZE // 4 values cleared per pass
+    addi.d Y, Y, 4 * SIZE
+#else
+    vst VXZ, X, 4 * SIZE
+    vst VXZ, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE // 8 values cleared per pass
+    addi.d Y, Y, 8 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L114 // repeat while I > 0
+    b .L997
+    .align 3
+
+.L22: // strided path (INCX != 1 or INCY != 1): set up store cursors, then pick a loop from C and S
+#ifdef DOUBLE
+    srai.d I, N, 2 // the strided DOUBLE loops handle 4 elements per pass
+    // With no full pass (N < 4) all N elements belong to the scalar tail.
+    // That tail must be N & 3 (.L995), the same one the strided DOUBLE loops
+    // use when they finish. .L997 would compute N & 1 and drop elements
+    // (N = 2 would rotate nothing, N = 3 only one element).
+    bge $r0, I, .L995
+#else
+    bge $r0, I, .L997 // I = N >> 2 from the prologue; .L997 computes the N & 3 tail
+#endif
+    move YY, Y // XX/YY trail X/Y as store cursors while X/Y run ahead as load cursors
+    move XX, X
+    CMPEQ $fcc0, C, a1
+    bcnez $fcc0, .L220 // C == 0: second-level test on S
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L222 // C!=0 S==0
+    b .L221 // C!=0 S!=0
+    .align 3
+
+.L220: // strided path, C == 0: choose the loop from S
+    CMPEQ $fcc0, S, a1
+    bceqz $fcc0, .L223 // compare false -> C==0 S!=0
+    b .L224 // compare true -> C==0 S==0
+    .align 3
+
+.L221: // C!=0 S!=0: strided full rotation, 4 elements per pass; loads walk X/Y, stores walk XX/YY
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE // DOUBLE: first pair of elements; gather through GPRs into vector lanes
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0 // x1 = offset-0 values of X, x2 = offset-1 values of X
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x3, t1, 0 // x3/x4 = the same for Y
+    vinsgr2vr.d x4, t2, 0
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d VX0, x1, VXC
+    vfmadd.d VX0, x3, VXS, VX0 // VX0 = x1*C + x3*S (new X, offset 0)
+    vfmul.d VX1, x1, VXS
+    vfmsub.d VX1, x3, VXC, VX1 // VX1 = x3*C - x1*S (new Y, offset 0)
+    vfmul.d VX2, x2, VXC
+    vfmadd.d VX2, x4, VXS, VX2 // same for the offset-1 values
+    vfmul.d VX3, x2, VXS
+    vfmsub.d VX3, x4, VXC, VX3
+    vstelm.d VX0, XX, 0, 0 // scatter one lane at a time back to the strided locations
+    vstelm.d VX2, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.d VX0, XX, 0, 1
+    vstelm.d VX2, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    vstelm.d VX1, YY, 0, 0
+    vstelm.d VX3, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d VX1, YY, 0, 1
+    vstelm.d VX3, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+
+    ld.d t1, X, 0 * SIZE // DOUBLE: second pair of elements, same sequence
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    add.d X, X, INCX
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x3, t1, 0
+    vinsgr2vr.d x4, t2, 0
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d VX0, x1, VXC
+    vfmadd.d VX0, x3, VXS, VX0
+    vfmul.d VX1, x1, VXS
+    vfmsub.d VX1, x3, VXC, VX1
+    vfmul.d VX2, x2, VXC
+    vfmadd.d VX2, x4, VXS, VX2
+    vfmul.d VX3, x2, VXS
+    vfmsub.d VX3, x4, VXC, VX3
+    vstelm.d VX0, XX, 0, 0
+    vstelm.d VX2, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.d VX0, XX, 0, 1
+    vstelm.d VX2, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    vstelm.d VX1, YY, 0, 0
+    vstelm.d VX3, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d VX1, YY, 0, 1
+    vstelm.d VX3, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L221 // repeat while I > 0
+    b .L995 // DOUBLE strided tail (N & 3)
+#else
+    ld.w t1, X, 0 * SIZE // single precision: gather 4 elements of X and Y into 4-lane vectors
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 0
+    vinsgr2vr.w x2, t2, 0
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w x3, t1, 0
+    vinsgr2vr.w x4, t2, 0
+    vinsgr2vr.w x3, t3, 1
+    vinsgr2vr.w x4, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    add.d X, X, INCX
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    vinsgr2vr.w x3, t1, 2
+    vinsgr2vr.w x4, t2, 2
+    vinsgr2vr.w x3, t3, 3
+    vinsgr2vr.w x4, t4, 3
+    add.d Y, Y, INCY
+
+    vfmul.s VX0, x1, VXC
+    vfmadd.s VX0, x3, VXS, VX0 // VX0 = x1*C + x3*S (new X, offset 0)
+    vfmul.s VX1, x1, VXS
+    vfmsub.s VX1, x3, VXC, VX1 // VX1 = x3*C - x1*S (new Y, offset 0)
+    vfmul.s VX2, x2, VXC
+    vfmadd.s VX2, x4, VXS, VX2 // same for the offset-1 values
+    vfmul.s VX3, x2, VXS
+    vfmsub.s VX3, x4, VXC, VX3
+    vstelm.w VX0, XX, 0, 0 // scatter lanes 0..3 to four strided X elements, then four Y elements
+    vstelm.w VX2, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.w VX0, XX, 0, 1
+    vstelm.w VX2, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    vstelm.w VX0, XX, 0, 2
+    vstelm.w VX2, XX, 1 * SIZE, 2
+    add.d XX, XX, INCX
+    vstelm.w VX0, XX, 0, 3
+    vstelm.w VX2, XX, 1 * SIZE, 3
+    add.d XX, XX, INCX
+    vstelm.w VX1, YY, 0, 0
+    vstelm.w VX3, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 1
+    vstelm.w VX3, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 2
+    vstelm.w VX3, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 3
+    vstelm.w VX3, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L221 // repeat while I > 0
+    b .L997
+#endif
+    .align 3
+
+.L222: // C!=0 S==0: strided; X and Y are only scaled by C, 4 elements per pass
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE // DOUBLE: first pair of elements
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0 // x1/x2 = offset-0 / offset-1 values of X
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x3, t1, 0 // x3/x4 = offset-0 / offset-1 values of Y
+    vinsgr2vr.d x4, t2, 0
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d VX0, x1, VXC // new X = X * C
+    vfmul.d VX1, x3, VXC // new Y = Y * C
+    vfmul.d VX2, x2, VXC
+    vfmul.d VX3, x4, VXC
+    vstelm.d VX0, XX, 0, 0
+    vstelm.d VX2, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.d VX0, XX, 0, 1
+    vstelm.d VX2, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    vstelm.d VX1, YY, 0, 0
+    vstelm.d VX3, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d VX1, YY, 0, 1
+    vstelm.d VX3, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+
+    ld.d t1, X, 0 * SIZE // DOUBLE: second pair of elements, same sequence
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    add.d X, X, INCX
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x3, t1, 0
+    vinsgr2vr.d x4, t2, 0
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d VX0, x1, VXC
+    vfmul.d VX1, x3, VXC
+    vfmul.d VX2, x2, VXC
+    vfmul.d VX3, x4, VXC
+    vstelm.d VX0, XX, 0, 0
+    vstelm.d VX2, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.d VX0, XX, 0, 1
+    vstelm.d VX2, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    vstelm.d VX1, YY, 0, 0
+    vstelm.d VX3, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d VX1, YY, 0, 1
+    vstelm.d VX3, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L222 // repeat while I > 0
+    b .L995 // DOUBLE strided tail (N & 3)
+#else
+    ld.w t1, X, 0 * SIZE // single precision: gather 4 elements of X and Y into 4-lane vectors
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 0
+    vinsgr2vr.w x2, t2, 0
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w x3, t1, 0
+    vinsgr2vr.w x4, t2, 0
+    vinsgr2vr.w x3, t3, 1
+    vinsgr2vr.w x4, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    add.d X, X, INCX
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    vinsgr2vr.w x3, t1, 2
+    vinsgr2vr.w x4, t2, 2
+    vinsgr2vr.w x3, t3, 3
+    vinsgr2vr.w x4, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VX0, x1, VXC // new X = X * C
+    vfmul.s VX1, x3, VXC // new Y = Y * C
+    vfmul.s VX2, x2, VXC
+    vfmul.s VX3, x4, VXC
+    vstelm.w VX0, XX, 0, 0 // scatter lanes 0..3 to four strided X elements, then four Y elements
+    vstelm.w VX2, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.w VX0, XX, 0, 1
+    vstelm.w VX2, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    vstelm.w VX0, XX, 0, 2
+    vstelm.w VX2, XX, 1 * SIZE, 2
+    add.d XX, XX, INCX
+    vstelm.w VX0, XX, 0, 3
+    vstelm.w VX2, XX, 1 * SIZE, 3
+    add.d XX, XX, INCX
+    vstelm.w VX1, YY, 0, 0
+    vstelm.w VX3, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 1
+    vstelm.w VX3, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 2
+    vstelm.w VX3, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 3
+    vstelm.w VX3, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L222 // repeat while I > 0
+    b .L997
+#endif
+    .align 3
+
+.L223: // C==0 S!=0: strided; new X = Y*S, new Y = VXZ - X*S, 4 elements per pass
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE // DOUBLE: first pair of elements
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0 // x1/x2 = offset-0 / offset-1 values of X
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x3, t1, 0 // x3/x4 = offset-0 / offset-1 values of Y
+    vinsgr2vr.d x4, t2, 0
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d VX0, x3, VXS // new X = Y * S
+    vfmul.d VX1, x1, VXS
+    vfsub.d VX1, VXZ, VX1 // new Y = VXZ - X * S
+    vfmul.d VX2, x4, VXS
+    vfmul.d VX3, x2, VXS
+    vfsub.d VX3, VXZ, VX3
+    vstelm.d VX0, XX, 0, 0
+    vstelm.d VX2, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.d VX0, XX, 0, 1
+    vstelm.d VX2, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    vstelm.d VX1, YY, 0, 0
+    vstelm.d VX3, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d VX1, YY, 0, 1
+    vstelm.d VX3, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+
+    ld.d t1, X, 0 * SIZE // DOUBLE: second pair of elements, same sequence
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    add.d X, X, INCX
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x3, t1, 0
+    vinsgr2vr.d x4, t2, 0
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+    add.d Y, Y, INCY
+    vfmul.d VX0, x3, VXS
+    vfmul.d VX1, x1, VXS
+    vfsub.d VX1, VXZ, VX1
+    vfmul.d VX2, x4, VXS
+    vfmul.d VX3, x2, VXS
+    vfsub.d VX3, VXZ, VX3
+    vstelm.d VX0, XX, 0, 0
+    vstelm.d VX2, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.d VX0, XX, 0, 1
+    vstelm.d VX2, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    vstelm.d VX1, YY, 0, 0
+    vstelm.d VX3, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d VX1, YY, 0, 1
+    vstelm.d VX3, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L223 // repeat while I > 0
+    b .L995 // DOUBLE strided tail (N & 3)
+#else
+    ld.w t1, X, 0 * SIZE // single precision: gather 4 elements of X and Y into 4-lane vectors
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 0
+    vinsgr2vr.w x2, t2, 0
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w x3, t1, 0
+    vinsgr2vr.w x4, t2, 0
+    vinsgr2vr.w x3, t3, 1
+    vinsgr2vr.w x4, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    add.d X, X, INCX
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    vinsgr2vr.w x3, t1, 2
+    vinsgr2vr.w x4, t2, 2
+    vinsgr2vr.w x3, t3, 3
+    vinsgr2vr.w x4, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VX0, x3, VXS // new X = Y * S
+    vfmul.s VX1, x1, VXS
+    vfsub.s VX1, VXZ, VX1 // new Y = VXZ - X * S
+    vfmul.s VX2, x4, VXS
+    vfmul.s VX3, x2, VXS
+    vfsub.s VX3, VXZ, VX3
+    vstelm.w VX0, XX, 0, 0 // scatter lanes 0..3 to four strided X elements, then four Y elements
+    vstelm.w VX2, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.w VX0, XX, 0, 1
+    vstelm.w VX2, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    vstelm.w VX0, XX, 0, 2
+    vstelm.w VX2, XX, 1 * SIZE, 2
+    add.d XX, XX, INCX
+    vstelm.w VX0, XX, 0, 3
+    vstelm.w VX2, XX, 1 * SIZE, 3
+    add.d XX, XX, INCX
+    vstelm.w VX1, YY, 0, 0
+    vstelm.w VX3, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 1
+    vstelm.w VX3, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 2
+    vstelm.w VX3, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 3
+    vstelm.w VX3, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L223 // repeat while I > 0
+    b .L997
+#endif
+    .align 3
+
+.L224: // C==0 S==0: strided; overwrite 4 elements of X and of Y per pass with lane 0 of VXZ (0.0)
+#ifdef DOUBLE
+    vstelm.d VXZ, XX, 0, 0 // two stores per element: offsets 0 and 1 * SIZE
+    vstelm.d VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 0
+    vstelm.d VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 0
+    vstelm.d VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 0
+    vstelm.d VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.d VXZ, YY, 0, 0
+    vstelm.d VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 0
+    vstelm.d VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 0
+    vstelm.d VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 0
+    vstelm.d VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L224 // repeat while I > 0
+    move X, XX // X and Y are not advanced inside this loop; resync them with the
+    move Y, YY // store cursors so the scalar tail continues from the right place
+    b .L995 // DOUBLE strided tail (N & 3)
+#else
+    vstelm.w VXZ, XX, 0, 0 // two stores per element: offsets 0 and 1 * SIZE
+    vstelm.w VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 0
+    vstelm.w VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 0
+    vstelm.w VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 0
+    vstelm.w VXZ, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.w VXZ, YY, 0, 0
+    vstelm.w VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 0
+    vstelm.w VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 0
+    vstelm.w VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 0
+    vstelm.w VXZ, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L224 // repeat while I > 0
+    move X, XX // resync the load cursors with the store cursors before the tail
+    move Y, YY
+    b .L997
+#endif
+    .align 3
+
+#ifdef DOUBLE
+    .L995: // DOUBLE only: tail for the strided loops, which handle 4 elements per pass
+    andi I, N, 3 // leftover = N mod 4
+    bge $r0, I, .L999 // nothing left: return
+    b .L998 // otherwise finish in the scalar loop
+    .align 3
+
+#endif
+.L996: // reached from the prologue when INCX or INCY is zero: run the scalar loop over all N elements
+    move I, N
+    b .L998
+    .align 3
+
+.L997: // tail count for the scalar loop after the vector loops that branch here
+#ifdef DOUBLE
+    andi I, N, 1 // DOUBLE: N mod 2, matching the 2-element passes of .L111-.L114
+#else
+    andi I, N, 3 // single precision: N mod 4, matching the 4-element passes
+#endif
+    bge $r0, I, .L999 // nothing left: skip the scalar loop
+    .align 3
+
+.L998: // scalar loop: rotate one element (values at offsets 0 and 1 * SIZE) of X and Y per pass, I passes
+    LD   a1, X, 0 * SIZE
+    LD   a2, X, 1 * SIZE
+    LD   a3, Y, 0 * SIZE
+    LD   a4, Y, 1 * SIZE
+    MUL  s1, a1, C
+    MADD s1, a3, S, s1 // s1 = a1*C + a3*S (assuming MADD is a fused d = j*k + a, like vfmadd above; macro comes from common.h)
+    MUL  s2, a1, S
+    MSUB s2, a3, C, s2 // s2 = a3*C - a1*S (assuming MSUB is a fused d = j*k - a, like vfmsub above)
+    MUL  s3, a2, C
+    MADD s3, a4, S, s3 // same pair of formulas for the offset-1 values
+    MUL  s4, a2, S
+    MSUB s4, a4, C, s4
+    addi.d I, I, -1
+    ST   s1, X, 0 * SIZE // all loads above precede these stores
+    ST   s2, Y, 0 * SIZE
+    ST   s3, X, 1 * SIZE
+    ST   s4, Y, 1 * SIZE
+    add.d  X, X, INCX // strides were scaled by ZBASE_SHIFT in the prologue
+    add.d  Y, Y, INCY
+    blt $r0, I, .L998 // repeat while I > 0
+    .align 3
+
+.L999: // common exit, also the target of the N <= 0 early-out
+    move $r4, $r12 // copy I ($r12) into $r4 before returning (NOTE(review): I is not initialised on the N <= 0 path - confirm callers ignore this value)
+    jirl $r0, $r1, 0x0 // jump to the address in $r1, discarding the link value (function return)
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/cscal_lasx.S b/kernel/loongarch64/cscal_lasx.S
new file mode 100644
index 000000000..f53526663
--- /dev/null
+++ b/kernel/loongarch64/cscal_lasx.S
@@ -0,0 +1,500 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4   /* number of complex elements to scale */
+#define ALPHAR $f0   /* real part of the scale factor alpha */
+#define ALPHAI $f1   /* imaginary part of the scale factor alpha */
+#define X      $r7   /* pointer to the vector (interleaved re, im) */
+#define INCX   $r8   /* stride between elements; shifted by ZBASE_SHIFT in the prologue */
+
+#define I      $r12  /* loop counter (vector blocks, then tail elements) */
+#define TEMP   $r13  /* holds 1 << ZBASE_SHIFT, the unit-stride value of INCX */
+#define t1     $r14  /* integer scratch: raw bits of alpha / loaded elements */
+#define t2     $r16  /* integer scratch */
+#define t3     $r15  /* integer scratch */
+#define t4     $r17  /* integer scratch */
+#define XX     $r18  /* store pointer for the strided paths (X is the load pointer) */
+#define a1     $f12  /* 0.0 for the alpha compares, later a loaded real part */
+#define a2     $f13  /* loaded imaginary part in the scalar tail */
+#define a3     $f14  /* not referenced in this kernel */
+#define a4     $f15  /* not referenced in this kernel */
+#define s1     $f16  /* scalar result, real part */
+#define s2     $f17  /* scalar result, imaginary part */
+#define s3     $f18  /* not referenced in this kernel */
+#define s4     $f19  /* not referenced in this kernel */
+#define VX0    $xr8  /* vector load / temporary */
+#define VX1    $xr20 /* vector load / temporary */
+#define VX2    $xr21 /* re-interleaved result, first vector */
+#define VX3    $xr22 /* re-interleaved result, second vector */
+#define VXAR   $xr23 /* alpha_r broadcast to every lane */
+#define VXAI   $xr19 /* alpha_i broadcast to every lane */
+#define VXZ    $xr12 /* all-zero vector; same register number as a1 ($f12) */
+#define x1     $xr18 /* de-interleaved real parts */
+#define x2     $xr17 /* de-interleaved imaginary parts */
+#define x3     $xr16 /* scaled real parts; same register number as s1 */
+#define x4     $xr15 /* scaled imaginary parts */
+
+    PROLOGUE
+
+    bge $r0, N, .L999 // nothing to do when N <= 0
+    bge $r0, INCX, .L999 // non-positive strides are not processed
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0 // a1 = integer zero bits
+    FFINT    a1, a1 // presumably int-to-float conversion, leaving a1 = 0.0 for the compares
+    slli.d  TEMP, TEMP, ZBASE_SHIFT // TEMP = value INCX takes for a contiguous vector
+    slli.d  INCX, INCX, ZBASE_SHIFT // scale INCX by the same shift (presumably to bytes)
+    MTG  t1, ALPHAR // raw bits of alpha_r into a GPR for the broadcast
+#ifdef DOUBLE
+    xvreplgr2vr.d VXAR, t1 // VXAR = {alpha_r x4}
+    movfr2gr.d t2, ALPHAI
+    xvreplgr2vr.d VXAI, t2 // VXAI = {alpha_i x4}
+    xvxor.v VXZ, VXZ, VXZ // VXZ = 0 (also keeps a1 at 0.0, same register number)
+    srai.d I, N, 2 // 4 complex doubles per vector iteration
+#else
+    xvreplgr2vr.w VXAR, t1 // VXAR = {alpha_r x8}
+    movfr2gr.s t2, ALPHAI
+    xvreplgr2vr.w VXAI, t2 // VXAI = {alpha_i x8}
+    xvxor.v VXZ, VXZ, VXZ // VXZ = 0 (also keeps a1 at 0.0, same register number)
+    srai.d I, N, 3 // 8 complex floats per vector iteration
+#endif
+    bne INCX, TEMP, .L22 // non-unit stride goes to the gather/scatter paths
+
+.L11: // unit stride: pick a loop according to which parts of alpha are zero
+    bge $r0, I, .L997 // no full vector block, go straight to the scalar tail
+    CMPEQ  $fcc0, ALPHAR, a1 // fcc0 = (alpha_r == 0.0)
+    CMPEQ  $fcc1, ALPHAI, a1 // fcc1 = (alpha_i == 0.0)
+    bceqz $fcc0, .L13 // alpha_r != 0.0
+    b .L14 // alpha_r == 0.0
+    .align 3
+
+.L13:
+    bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0
+    b .L113 //alpha_r != 0.0 && alpha_i == 0.0
+
+.L14:
+    bceqz $fcc1, .L114  //alpha_r == 0.0 && alpha_i != 0.0 (general loop handles it)
+    b .L111 //alpha_r == 0.0 && alpha_i == 0.0
+    .align 3
+
+.L111:  //alpha_r == 0.0 && alpha_i == 0.0: overwrite one block with zeros, X is not read
+    xvst VXZ, X, 0 * SIZE
+#ifdef DOUBLE
+    xvst VXZ, X, 4 * SIZE // second 256-bit store: 8 doubles = 4 complex in total
+    addi.d X, X, 8 * SIZE
+#else
+    xvst VXZ, X, 8 * SIZE // second 256-bit store: 16 floats = 8 complex in total
+    addi.d X, X, 16 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L111 // loop while blocks remain
+    b .L997 // leftover elements are handled by the scalar tail
+    .align 3
+
+.L113: //alpha_r != 0.0 && alpha_i == 0.0: multiply both parts by alpha_r only
+    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    xvld VX1, X, 4 * SIZE
+    xvpickev.d x1, VX1, VX0 // even-indexed elements (real parts) of both loads
+    xvpickod.d x2, VX1, VX0 // odd-indexed elements (imaginary parts)
+    xvfmul.d x3, VXAR, x1 // re * alpha_r
+    xvfmul.d x4, VXAR, x2 // im * alpha_r
+    xvilvl.d VX2, x4 ,x3 // interleave back to (re, im) order for the first store
+    xvilvh.d VX3, x4, x3 // interleave back to (re, im) order for the second store
+    xvst VX2, X, 0 * SIZE
+    xvst VX3, X, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+#else
+    xvld VX1, X, 8 * SIZE
+    xvpickev.w x1, VX1, VX0 // even-indexed elements (real parts) of both loads
+    xvpickod.w x2, VX1, VX0 // odd-indexed elements (imaginary parts)
+    xvfmul.s x3, VXAR, x1 // re * alpha_r
+    xvfmul.s x4, VXAR, x2 // im * alpha_r
+    xvilvl.w VX2, x4 ,x3 // interleave back to (re, im) order for the first store
+    xvilvh.w VX3, x4, x3 // interleave back to (re, im) order for the second store
+    xvst VX2, X, 0 * SIZE
+    xvst VX3, X, 8 * SIZE
+    addi.d X, X, 16 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L113 // loop while blocks remain
+    b .L997 // leftover elements are handled by the scalar tail
+    .align 3
+
+.L114:  //general complex multiply; reached whenever alpha_i != 0.0
+    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    xvld VX1, X, 4 * SIZE
+    xvpickev.d x1, VX1, VX0 // real parts
+    xvpickod.d x2, VX1, VX0 // imaginary parts
+    xvfmul.d VX0, VXAI, x2 // alpha_i * im
+    xvfmsub.d x3, VXAR, x1, VX0 // new re = alpha_r * re - alpha_i * im
+    xvfmul.d VX1, VXAI, x1 // alpha_i * re
+    xvfmadd.d x4, VXAR, x2, VX1 // new im = alpha_r * im + alpha_i * re
+    xvilvl.d VX2, x4 ,x3 // back to interleaved (re, im) order
+    xvilvh.d VX3, x4, x3
+    xvst VX2, X, 0 * SIZE
+    xvst VX3, X, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+#else
+    xvld VX1, X, 8 * SIZE
+    xvpickev.w x1, VX1, VX0 // real parts
+    xvpickod.w x2, VX1, VX0 // imaginary parts
+    xvfmul.s VX0, VXAI, x2 // alpha_i * im
+    xvfmsub.s x3, VXAR, x1, VX0 // new re = alpha_r * re - alpha_i * im
+    xvfmul.s VX1, VXAI, x1 // alpha_i * re
+    xvfmadd.s x4, VXAR, x2, VX1 // new im = alpha_r * im + alpha_i * re
+    xvilvl.w VX2, x4 ,x3 // back to interleaved (re, im) order
+    xvilvh.w VX3, x4, x3
+    xvst VX2, X, 0 * SIZE
+    xvst VX3, X, 8 * SIZE
+    addi.d X, X, 16 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L114 // loop while blocks remain
+    b .L997 // leftover elements are handled by the scalar tail
+    .align 3
+
+.L22: // non-unit stride: same three-way dispatch as .L11, targeting the strided loops
+    bge $r0, I, .L997 // no full block, go straight to the scalar tail
+    move XX, X // XX trails X as the store pointer in .L223 / .L224
+    CMPEQ  $fcc0, ALPHAR, a1 // fcc0 = (alpha_r == 0.0)
+    CMPEQ  $fcc1, ALPHAI, a1 // fcc1 = (alpha_i == 0.0)
+    bceqz $fcc0, .L23 // alpha_r != 0.0
+    b .L24 // alpha_r == 0.0
+    .align 3
+
+.L23:
+    bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0
+    b .L223 //alpha_r != 0.0 && alpha_i == 0.0
+
+.L24:
+    bceqz $fcc1, .L224  //alpha_r == 0.0 && alpha_i != 0.0 (general loop handles it)
+    b .L221 //alpha_r == 0.0 && alpha_i == 0.0
+    .align 3
+
+.L221:  //alpha_r == 0.0 && alpha_i == 0.0: strided zero fill, X is not read
+#ifdef DOUBLE
+    xvstelm.d VXZ, X, 0, 0 // element 1 of 4: real part = 0
+    xvstelm.d VXZ, X, 1 * SIZE, 0 // imaginary part = 0
+    add.d X, X, INCX
+    xvstelm.d VXZ, X, 0, 0
+    xvstelm.d VXZ, X, 1 * SIZE, 0
+    add.d X, X, INCX
+    xvstelm.d VXZ, X, 0, 0
+    xvstelm.d VXZ, X, 1 * SIZE, 0
+    add.d X, X, INCX
+    xvstelm.d VXZ, X, 0, 0
+    xvstelm.d VXZ, X, 1 * SIZE, 0
+#else
+    xvstelm.w VXZ, X, 0, 0 // element 1 of 8: real part = 0
+    xvstelm.w VXZ, X, 1 * SIZE, 0 // imaginary part = 0
+    add.d X, X, INCX
+    xvstelm.w VXZ, X, 0, 0
+    xvstelm.w VXZ, X, 1 * SIZE, 0
+    add.d X, X, INCX
+    xvstelm.w VXZ, X, 0, 0
+    xvstelm.w VXZ, X, 1 * SIZE, 0
+    add.d X, X, INCX
+    xvstelm.w VXZ, X, 0, 0
+    xvstelm.w VXZ, X, 1 * SIZE, 0
+    add.d X, X, INCX
+    xvstelm.w VXZ, X, 0, 0
+    xvstelm.w VXZ, X, 1 * SIZE, 0
+    add.d X, X, INCX
+    xvstelm.w VXZ, X, 0, 0
+    xvstelm.w VXZ, X, 1 * SIZE, 0
+    add.d X, X, INCX
+    xvstelm.w VXZ, X, 0, 0
+    xvstelm.w VXZ, X, 1 * SIZE, 0
+    add.d X, X, INCX
+    xvstelm.w VXZ, X, 0, 0
+    xvstelm.w VXZ, X, 1 * SIZE, 0
+#endif
+    add.d X, X, INCX // step past the last element of the block (shared by both branches)
+    addi.d  I, I, -1
+    blt $r0, I, .L221 // loop while blocks remain
+    b .L997 // leftover elements are handled by the scalar tail
+    .align 3
+
+.L223: //alpha_r != 0.0 && alpha_i == 0.0: strided gather, scale by alpha_r, scatter via XX
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE // raw bits of re
+    ld.d t2, X, 1 * SIZE // raw bits of im
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 0 // x1 collects real parts, one lane per element
+    xvinsgr2vr.d x2, t2, 0 // x2 collects imaginary parts
+    xvinsgr2vr.d x1, t3, 1
+    xvinsgr2vr.d x2, t4, 1
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    xvinsgr2vr.d x1, t1, 2
+    xvinsgr2vr.d x2, t2, 2
+    xvinsgr2vr.d x1, t3, 3
+    xvinsgr2vr.d x2, t4, 3
+    add.d X, X, INCX
+
+    xvfmul.d x3, VXAR, x1 // re * alpha_r
+    xvfmul.d x4, VXAR, x2 // im * alpha_r
+    addi.d  I, I, -1
+    xvstelm.d x3, XX, 0 * SIZE, 0 // write lane k back to element k through XX
+    xvstelm.d x4, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.d x3, XX, 0 * SIZE, 1
+    xvstelm.d x4, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    xvstelm.d x3, XX, 0 * SIZE, 2
+    xvstelm.d x4, XX, 1 * SIZE, 2
+    add.d XX, XX, INCX
+    xvstelm.d x3, XX, 0 * SIZE, 3
+    xvstelm.d x4, XX, 1 * SIZE, 3
+#else
+    ld.w t1, X, 0 * SIZE // raw bits of re
+    ld.w t2, X, 1 * SIZE // raw bits of im
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 0 // x1 collects real parts, one lane per element
+    xvinsgr2vr.w x2, t2, 0 // x2 collects imaginary parts
+    xvinsgr2vr.w x1, t3, 1
+    xvinsgr2vr.w x2, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    xvinsgr2vr.w x1, t1, 2
+    xvinsgr2vr.w x2, t2, 2
+    xvinsgr2vr.w x1, t3, 3
+    xvinsgr2vr.w x2, t4, 3
+    add.d X, X, INCX
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 4
+    xvinsgr2vr.w x2, t2, 4
+    xvinsgr2vr.w x1, t3, 5
+    xvinsgr2vr.w x2, t4, 5
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    xvinsgr2vr.w x1, t1, 6
+    xvinsgr2vr.w x2, t2, 6
+    xvinsgr2vr.w x1, t3, 7
+    xvinsgr2vr.w x2, t4, 7
+    add.d X, X, INCX
+
+    xvfmul.s x3, VXAR, x1 // re * alpha_r
+    xvfmul.s x4, VXAR, x2 // im * alpha_r
+    addi.d  I, I, -1
+    xvstelm.w x3, XX, 0 * SIZE, 0 // write lane k back to element k through XX
+    xvstelm.w x4, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.w x3, XX, 0 * SIZE, 1
+    xvstelm.w x4, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    xvstelm.w x3, XX, 0 * SIZE, 2
+    xvstelm.w x4, XX, 1 * SIZE, 2
+    add.d XX, XX, INCX
+    xvstelm.w x3, XX, 0 * SIZE, 3
+    xvstelm.w x4, XX, 1 * SIZE, 3
+    add.d XX, XX, INCX
+    xvstelm.w x3, XX, 0 * SIZE, 4
+    xvstelm.w x4, XX, 1 * SIZE, 4
+    add.d XX, XX, INCX
+    xvstelm.w x3, XX, 0 * SIZE, 5
+    xvstelm.w x4, XX, 1 * SIZE, 5
+    add.d XX, XX, INCX
+    xvstelm.w x3, XX, 0 * SIZE, 6
+    xvstelm.w x4, XX, 1 * SIZE, 6
+    add.d XX, XX, INCX
+    xvstelm.w x3, XX, 0 * SIZE, 7
+    xvstelm.w x4, XX, 1 * SIZE, 7
+#endif
+    add.d XX, XX, INCX // XX catches up with X at the end of every block
+    blt $r0, I, .L223 // I was already decremented above
+    b .L997 // the scalar tail continues from X
+    .align 3
+
+.L224:  //general complex multiply on a strided vector; reached whenever alpha_i != 0.0
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE // raw bits of re
+    ld.d t2, X, 1 * SIZE // raw bits of im
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 0 // x1 collects real parts, one lane per element
+    xvinsgr2vr.d x2, t2, 0 // x2 collects imaginary parts
+    xvinsgr2vr.d x1, t3, 1
+    xvinsgr2vr.d x2, t4, 1
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    xvinsgr2vr.d x1, t1, 2
+    xvinsgr2vr.d x2, t2, 2
+    xvinsgr2vr.d x1, t3, 3
+    xvinsgr2vr.d x2, t4, 3
+    add.d X, X, INCX
+
+    xvfmul.d VX0, VXAI, x2 // alpha_i * im
+    xvfmsub.d x3, VXAR, x1, VX0 // new re = alpha_r * re - alpha_i * im
+    xvfmul.d VX1, VXAI, x1 // alpha_i * re
+    xvfmadd.d x4, VXAR, x2, VX1 // new im = alpha_r * im + alpha_i * re
+    addi.d  I, I, -1
+    xvstelm.d x3, XX, 0 * SIZE, 0 // write lane k back to element k through XX
+    xvstelm.d x4, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.d x3, XX, 0 * SIZE, 1
+    xvstelm.d x4, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    xvstelm.d x3, XX, 0 * SIZE, 2
+    xvstelm.d x4, XX, 1 * SIZE, 2
+    add.d XX, XX, INCX
+    xvstelm.d x3, XX, 0 * SIZE, 3
+    xvstelm.d x4, XX, 1 * SIZE, 3
+#else
+    ld.w t1, X, 0 * SIZE // raw bits of re
+    ld.w t2, X, 1 * SIZE // raw bits of im
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 0 // x1 collects real parts, one lane per element
+    xvinsgr2vr.w x2, t2, 0 // x2 collects imaginary parts
+    xvinsgr2vr.w x1, t3, 1
+    xvinsgr2vr.w x2, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    xvinsgr2vr.w x1, t1, 2
+    xvinsgr2vr.w x2, t2, 2
+    xvinsgr2vr.w x1, t3, 3
+    xvinsgr2vr.w x2, t4, 3
+    add.d X, X, INCX
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 4
+    xvinsgr2vr.w x2, t2, 4
+    xvinsgr2vr.w x1, t3, 5
+    xvinsgr2vr.w x2, t4, 5
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    xvinsgr2vr.w x1, t1, 6
+    xvinsgr2vr.w x2, t2, 6
+    xvinsgr2vr.w x1, t3, 7
+    xvinsgr2vr.w x2, t4, 7
+    add.d X, X, INCX
+
+    xvfmul.s VX0, VXAI, x2 // alpha_i * im
+    xvfmsub.s x3, VXAR, x1, VX0 // new re = alpha_r * re - alpha_i * im
+    xvfmul.s VX1, VXAI, x1 // alpha_i * re
+    xvfmadd.s x4, VXAR, x2, VX1 // new im = alpha_r * im + alpha_i * re
+    addi.d  I, I, -1
+    xvstelm.w x3, XX, 0 * SIZE, 0 // write lane k back to element k through XX
+    xvstelm.w x4, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    xvstelm.w x3, XX, 0 * SIZE, 1
+    xvstelm.w x4, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    xvstelm.w x3, XX, 0 * SIZE, 2
+    xvstelm.w x4, XX, 1 * SIZE, 2
+    add.d XX, XX, INCX
+    xvstelm.w x3, XX, 0 * SIZE, 3
+    xvstelm.w x4, XX, 1 * SIZE, 3
+    add.d XX, XX, INCX
+    xvstelm.w x3, XX, 0 * SIZE, 4
+    xvstelm.w x4, XX, 1 * SIZE, 4
+    add.d XX, XX, INCX
+    xvstelm.w x3, XX, 0 * SIZE, 5
+    xvstelm.w x4, XX, 1 * SIZE, 5
+    add.d XX, XX, INCX
+    xvstelm.w x3, XX, 0 * SIZE, 6
+    xvstelm.w x4, XX, 1 * SIZE, 6
+    add.d XX, XX, INCX
+    xvstelm.w x3, XX, 0 * SIZE, 7
+    xvstelm.w x4, XX, 1 * SIZE, 7
+#endif
+    add.d XX, XX, INCX // XX catches up with X at the end of every block
+    blt $r0, I, .L224 // I was already decremented above
+    b .L997 // the scalar tail continues from X
+    .align 3
+
+.L997: // scalar tail: elements left over after the vector blocks
+#ifdef DOUBLE
+    andi I, N, 3 // N mod 4, matching the 4-element DOUBLE block
+#else
+    andi I, N, 7 // N mod 8, matching the 8-element single-precision block
+#endif
+    bge $r0, I, .L999
+    .align 3
+
+.L998: // one complex element per iteration, used for every alpha case
+    LD   a1, X, 0 * SIZE // re (this overwrites the 0.0 kept in a1 for the dispatch)
+    LD   a2, X, 1 * SIZE // im
+    addi.d I, I, -1
+    MUL  s1, ALPHAI, a2 // alpha_i * im
+    MUL  s2, ALPHAI, a1 // alpha_i * re
+    MSUB s1, ALPHAR, a1, s1 // presumably alpha_r * re - s1 (MSUB comes from common.h)
+    MADD s2, ALPHAR, a2, s2 // presumably alpha_r * im + s2 (MADD comes from common.h)
+    ST   s1, X, 0 * SIZE // NOTE(review): with alpha == 0 this multiplies instead of storing 0 as .L111/.L221 do
+    ST   s2, X, 1 * SIZE
+    add.d X, X, INCX // INCX was already shifted in the prologue
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+    move $r4, $r12 // return value = I; NOTE(review): $r12 is not yet written on the early exits
+    jirl $r0, $r1, 0x0 // return to caller
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/cscal_lsx.S b/kernel/loongarch64/cscal_lsx.S
new file mode 100644
index 000000000..241d3d16e
--- /dev/null
+++ b/kernel/loongarch64/cscal_lsx.S
@@ -0,0 +1,445 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4   /* number of complex elements to scale */
+#define ALPHAR $f0   /* real part of the scale factor alpha */
+#define ALPHAI $f1   /* imaginary part of the scale factor alpha */
+#define X      $r7   /* pointer to the vector (interleaved re, im) */
+#define INCX   $r8   /* stride between elements; shifted by ZBASE_SHIFT in the prologue */
+
+#define I      $r12  /* loop counter (vector blocks, then tail elements) */
+#define TEMP   $r13  /* holds 1 << ZBASE_SHIFT, the unit-stride value of INCX */
+#define t1     $r14  /* integer scratch: raw bits of alpha / loaded elements */
+#define t2     $r16  /* integer scratch */
+#define t3     $r15  /* integer scratch */
+#define t4     $r17  /* integer scratch */
+#define XX     $r18  /* store pointer for the strided paths (X is the load pointer) */
+#define a1     $f12  /* 0.0 for the alpha compares, later a loaded real part */
+#define a2     $f13  /* loaded imaginary part in the scalar tail */
+#define a3     $f14  /* not referenced in this kernel */
+#define a4     $f15  /* not referenced in this kernel */
+#define s1     $f16  /* scalar result, real part */
+#define s2     $f17  /* scalar result, imaginary part */
+#define s3     $f18  /* not referenced in this kernel */
+#define s4     $f19  /* not referenced in this kernel */
+#define VX0    $vr8  /* vector load / temporary */
+#define VX1    $vr20 /* vector load / temporary */
+#define VX2    $vr21 /* re-interleaved result, first vector */
+#define VX3    $vr22 /* re-interleaved result, second vector */
+#define VXAR   $vr23 /* alpha_r broadcast to every lane */
+#define VXAI   $vr19 /* alpha_i broadcast to every lane */
+#define VXZ    $vr12 /* all-zero vector; same register number as a1 ($f12) */
+#define x1     $vr18 /* de-interleaved real parts */
+#define x2     $vr17 /* de-interleaved imaginary parts */
+#define x3     $vr16 /* scaled real parts; same register number as s1 */
+#define x4     $vr15 /* scaled imaginary parts */
+
+    PROLOGUE
+
+    bge $r0, N, .L999 // nothing to do when N <= 0
+    bge $r0, INCX, .L999 // non-positive strides are not processed
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0 // a1 = integer zero bits
+    FFINT    a1, a1 // presumably int-to-float conversion, leaving a1 = 0.0 for the compares
+    slli.d  TEMP, TEMP, ZBASE_SHIFT // TEMP = value INCX takes for a contiguous vector
+    slli.d  INCX, INCX, ZBASE_SHIFT // scale INCX by the same shift (presumably to bytes)
+    MTG  t1, ALPHAR // raw bits of alpha_r into a GPR for the broadcast
+#ifdef DOUBLE
+    vreplgr2vr.d VXAR, t1 // VXAR = {alpha_r x2}
+    movfr2gr.d t2, ALPHAI
+    vreplgr2vr.d VXAI, t2 // VXAI = {alpha_i x2}
+#else
+    vreplgr2vr.w VXAR, t1 // VXAR = {alpha_r x4}
+    movfr2gr.s t2, ALPHAI
+    vreplgr2vr.w VXAI, t2 // VXAI = {alpha_i x4}
+#endif
+    vxor.v VXZ, VXZ, VXZ // VXZ = 0 (also keeps a1 at 0.0, same register number)
+    srai.d I, N, 2 // 4 complex elements per block in both precisions
+    bne INCX, TEMP, .L22 // non-unit stride goes to the gather/scatter paths
+
+.L11: // unit stride: pick a loop according to which parts of alpha are zero
+    bge $r0, I, .L997 // no full vector block, go straight to the scalar tail
+    CMPEQ  $fcc0, ALPHAR, a1 // fcc0 = (alpha_r == 0.0)
+    CMPEQ  $fcc1, ALPHAI, a1 // fcc1 = (alpha_i == 0.0)
+    bceqz $fcc0, .L13 // alpha_r != 0.0
+    b .L14 // alpha_r == 0.0
+    .align 3
+
+.L13:
+    bceqz $fcc1, .L114 //alpha_r != 0.0 && alpha_i != 0.0
+    b .L113 //alpha_r != 0.0 && alpha_i == 0.0
+
+.L14:
+    bceqz $fcc1, .L114  //alpha_r == 0.0 && alpha_i != 0.0 (general loop handles it)
+    b .L111 //alpha_r == 0.0 && alpha_i == 0.0
+    .align 3
+
+.L111:  //alpha_r == 0.0 && alpha_i == 0.0: overwrite 4 complex elements with zeros, X is not read
+    vst VXZ, X, 0 * SIZE
+#ifdef DOUBLE
+    vst VXZ, X, 2 * SIZE // four 128-bit stores cover 8 doubles
+    vst VXZ, X, 4 * SIZE
+    vst VXZ, X, 6 * SIZE
+#else
+    vst VXZ, X, 4 * SIZE // two 128-bit stores cover 8 floats
+#endif
+    addi.d X, X, 8 * SIZE // 8 scalars = 4 complex elements in either precision
+    addi.d  I, I, -1
+    blt $r0, I, .L111 // loop while blocks remain
+    b .L997 // leftover elements are handled by the scalar tail
+    .align 3
+
+.L113: //alpha_r != 0.0 && alpha_i == 0.0: multiply both parts by alpha_r only
+    vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE // first half of the block: elements 0 and 1
+    vpickev.d x1, VX1, VX0 // even-indexed elements (real parts)
+    vpickod.d x2, VX1, VX0 // odd-indexed elements (imaginary parts)
+    vfmul.d x3, VXAR, x1 // re * alpha_r
+    vfmul.d x4, VXAR, x2 // im * alpha_r
+    vilvl.d VX2, x4 ,x3 // back to interleaved (re, im) order
+    vilvh.d VX3, x4, x3
+    vst VX2, X, 0 * SIZE
+    vst VX3, X, 2 * SIZE
+    vld VX0, X, 4 * SIZE // second half of the block: elements 2 and 3
+    vld VX1, X, 6 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vfmul.d x3, VXAR, x1
+    vfmul.d x4, VXAR, x2
+    vilvl.d VX2, x4 ,x3
+    vilvh.d VX3, x4, x3
+    vst VX2, X, 4 * SIZE
+    vst VX3, X, 6 * SIZE
+#else
+    vld VX1, X, 4 * SIZE
+    vpickev.w x1, VX1, VX0 // even-indexed elements (real parts)
+    vpickod.w x2, VX1, VX0 // odd-indexed elements (imaginary parts)
+    vfmul.s x3, VXAR, x1 // re * alpha_r
+    vfmul.s x4, VXAR, x2 // im * alpha_r
+    vilvl.w VX2, x4 ,x3 // back to interleaved (re, im) order
+    vilvh.w VX3, x4, x3
+    vst VX2, X, 0 * SIZE
+    vst VX3, X, 4 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE // 8 scalars = 4 complex elements in either precision
+    addi.d  I, I, -1
+    blt $r0, I, .L113 // loop while blocks remain
+    b .L997 // leftover elements are handled by the scalar tail
+    .align 3
+
+.L114:  //general complex multiply; reached whenever alpha_i != 0.0
+    vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE // first half of the block: elements 0 and 1
+    vpickev.d x1, VX1, VX0 // real parts
+    vpickod.d x2, VX1, VX0 // imaginary parts
+    vfmul.d VX0, VXAI, x2 // alpha_i * im
+    vfmsub.d x3, VXAR, x1, VX0 // new re = alpha_r * re - alpha_i * im
+    vfmul.d VX1, VXAI, x1 // alpha_i * re
+    vfmadd.d x4, VXAR, x2, VX1 // new im = alpha_r * im + alpha_i * re
+    vilvl.d VX2, x4 ,x3 // back to interleaved (re, im) order
+    vilvh.d VX3, x4, x3
+    vst VX2, X, 0 * SIZE
+    vst VX3, X, 2 * SIZE
+    vld VX0, X, 4 * SIZE // second half of the block: elements 2 and 3
+    vld VX1, X, 6 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vfmul.d VX0, VXAI, x2
+    vfmsub.d x3, VXAR, x1, VX0
+    vfmul.d VX1, VXAI, x1
+    vfmadd.d x4, VXAR, x2, VX1
+    vilvl.d VX2, x4 ,x3
+    vilvh.d VX3, x4, x3
+    vst VX2, X, 4 * SIZE
+    vst VX3, X, 6 * SIZE
+#else
+    vld VX1, X, 4 * SIZE
+    vpickev.w x1, VX1, VX0 // real parts
+    vpickod.w x2, VX1, VX0 // imaginary parts
+    vfmul.s VX0, VXAI, x2 // alpha_i * im
+    vfmsub.s x3, VXAR, x1, VX0 // new re = alpha_r * re - alpha_i * im
+    vfmul.s VX1, VXAI, x1 // alpha_i * re
+    vfmadd.s x4, VXAR, x2, VX1 // new im = alpha_r * im + alpha_i * re
+    vilvl.w VX2, x4 ,x3 // back to interleaved (re, im) order
+    vilvh.w VX3, x4, x3
+    vst VX2, X, 0 * SIZE
+    vst VX3, X, 4 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE // 8 scalars = 4 complex elements in either precision
+    addi.d  I, I, -1
+    blt $r0, I, .L114 // loop while blocks remain
+    b .L997 // leftover elements are handled by the scalar tail
+    .align 3
+
+.L22: // non-unit stride: same three-way dispatch as .L11, targeting the strided loops
+    bge $r0, I, .L997 // no full block, go straight to the scalar tail
+    move XX, X // XX trails X as the store pointer in .L223 / .L224
+    CMPEQ  $fcc0, ALPHAR, a1 // fcc0 = (alpha_r == 0.0)
+    CMPEQ  $fcc1, ALPHAI, a1 // fcc1 = (alpha_i == 0.0)
+    bceqz $fcc0, .L23 // alpha_r != 0.0
+    b .L24 // alpha_r == 0.0
+    .align 3
+
+.L23:
+    bceqz $fcc1, .L224 //alpha_r != 0.0 && alpha_i != 0.0
+    b .L223 //alpha_r != 0.0 && alpha_i == 0.0
+
+.L24:
+    bceqz $fcc1, .L224  //alpha_r == 0.0 && alpha_i != 0.0 (general loop handles it)
+    b .L221 //alpha_r == 0.0 && alpha_i == 0.0
+    .align 3
+
+.L221:  //alpha_r == 0.0 && alpha_i == 0.0: strided zero fill of 4 elements, X is not read
+#ifdef DOUBLE
+    vstelm.d VXZ, X, 0, 0 // real part = 0
+    vstelm.d VXZ, X, 1 * SIZE, 0 // imaginary part = 0
+    add.d X, X, INCX
+    vstelm.d VXZ, X, 0, 0
+    vstelm.d VXZ, X, 1 * SIZE, 0
+    add.d X, X, INCX
+    vstelm.d VXZ, X, 0, 0
+    vstelm.d VXZ, X, 1 * SIZE, 0
+    add.d X, X, INCX
+    vstelm.d VXZ, X, 0, 0
+    vstelm.d VXZ, X, 1 * SIZE, 0
+#else
+    vstelm.w VXZ, X, 0, 0 // real part = 0
+    vstelm.w VXZ, X, 1 * SIZE, 0 // imaginary part = 0
+    add.d X, X, INCX
+    vstelm.w VXZ, X, 0, 0
+    vstelm.w VXZ, X, 1 * SIZE, 0
+    add.d X, X, INCX
+    vstelm.w VXZ, X, 0, 0
+    vstelm.w VXZ, X, 1 * SIZE, 0
+    add.d X, X, INCX
+    vstelm.w VXZ, X, 0, 0
+    vstelm.w VXZ, X, 1 * SIZE, 0
+#endif
+    add.d X, X, INCX // step past the last element of the block (shared by both branches)
+    addi.d  I, I, -1
+    blt $r0, I, .L221 // loop while blocks remain
+    b .L997 // leftover elements are handled by the scalar tail
+    .align 3
+
+.L223: //alpha_r != 0.0 && alpha_i == 0.0: strided gather, scale by alpha_r, scatter via XX
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE // first pair of the block: raw bits of re
+    ld.d t2, X, 1 * SIZE // raw bits of im
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0 // x1 collects real parts
+    vinsgr2vr.d x2, t2, 0 // x2 collects imaginary parts
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    vfmul.d x3, VXAR, x1 // re * alpha_r
+    vfmul.d x4, VXAR, x2 // im * alpha_r
+    vstelm.d x3, XX, 0 * SIZE, 0 // write lane k back to element k through XX
+    vstelm.d x4, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.d x3, XX, 0 * SIZE, 1
+    vstelm.d x4, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+
+    ld.d t1, X, 0 * SIZE // second pair of the block
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    add.d X, X, INCX
+    vfmul.d x3, VXAR, x1
+    vfmul.d x4, VXAR, x2
+    addi.d  I, I, -1
+    vstelm.d x3, XX, 0 * SIZE, 0
+    vstelm.d x4, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.d x3, XX, 0 * SIZE, 1
+    vstelm.d x4, XX, 1 * SIZE, 1
+#else
+    ld.w t1, X, 0 * SIZE // raw bits of re
+    ld.w t2, X, 1 * SIZE // raw bits of im
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 0 // x1 collects real parts
+    vinsgr2vr.w x2, t2, 0 // x2 collects imaginary parts
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    add.d X, X, INCX
+
+    vfmul.s x3, VXAR, x1 // re * alpha_r
+    vfmul.s x4, VXAR, x2 // im * alpha_r
+    addi.d  I, I, -1
+    vstelm.w x3, XX, 0 * SIZE, 0 // write lane k back to element k through XX
+    vstelm.w x4, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.w x3, XX, 0 * SIZE, 1
+    vstelm.w x4, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    vstelm.w x3, XX, 0 * SIZE, 2
+    vstelm.w x4, XX, 1 * SIZE, 2
+    add.d XX, XX, INCX
+    vstelm.w x3, XX, 0 * SIZE, 3
+    vstelm.w x4, XX, 1 * SIZE, 3
+#endif
+    add.d XX, XX, INCX // XX catches up with X at the end of every block
+    blt $r0, I, .L223 // I was already decremented above
+    b .L997 // the scalar tail continues from X
+    .align 3
+
+.L224:  //general complex multiply on a strided vector; reached whenever alpha_i != 0.0
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE // first pair of the block: raw bits of re
+    ld.d t2, X, 1 * SIZE // raw bits of im
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0 // x1 collects real parts
+    vinsgr2vr.d x2, t2, 0 // x2 collects imaginary parts
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    vfmul.d VX0, VXAI, x2 // alpha_i * im
+    vfmsub.d x3, VXAR, x1, VX0 // new re = alpha_r * re - alpha_i * im
+    vfmul.d VX1, VXAI, x1 // alpha_i * re
+    vfmadd.d x4, VXAR, x2, VX1 // new im = alpha_r * im + alpha_i * re
+    vstelm.d x3, XX, 0 * SIZE, 0 // write lane k back to element k through XX
+    vstelm.d x4, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.d x3, XX, 0 * SIZE, 1
+    vstelm.d x4, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+
+    ld.d t1, X, 0 * SIZE // second pair of the block
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    add.d X, X, INCX
+    vfmul.d VX0, VXAI, x2
+    vfmsub.d x3, VXAR, x1, VX0
+    vfmul.d VX1, VXAI, x1
+    vfmadd.d x4, VXAR, x2, VX1
+    addi.d  I, I, -1
+    vstelm.d x3, XX, 0 * SIZE, 0
+    vstelm.d x4, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.d x3, XX, 0 * SIZE, 1
+    vstelm.d x4, XX, 1 * SIZE, 1
+#else
+    ld.w t1, X, 0 * SIZE // raw bits of re
+    ld.w t2, X, 1 * SIZE // raw bits of im
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 0 // x1 collects real parts
+    vinsgr2vr.w x2, t2, 0 // x2 collects imaginary parts
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    add.d X, X, INCX
+
+    vfmul.s VX0, VXAI, x2 // alpha_i * im
+    vfmsub.s x3, VXAR, x1, VX0 // new re = alpha_r * re - alpha_i * im
+    vfmul.s VX1, VXAI, x1 // alpha_i * re
+    vfmadd.s x4, VXAR, x2, VX1 // new im = alpha_r * im + alpha_i * re
+    addi.d  I, I, -1
+    vstelm.w x3, XX, 0 * SIZE, 0 // write lane k back to element k through XX
+    vstelm.w x4, XX, 1 * SIZE, 0
+    add.d XX, XX, INCX
+    vstelm.w x3, XX, 0 * SIZE, 1
+    vstelm.w x4, XX, 1 * SIZE, 1
+    add.d XX, XX, INCX
+    vstelm.w x3, XX, 0 * SIZE, 2
+    vstelm.w x4, XX, 1 * SIZE, 2
+    add.d XX, XX, INCX
+    vstelm.w x3, XX, 0 * SIZE, 3
+    vstelm.w x4, XX, 1 * SIZE, 3
+#endif
+    add.d XX, XX, INCX // XX catches up with X at the end of every block
+    blt $r0, I, .L224 // I was already decremented above
+    b .L997 // the scalar tail continues from X
+    .align 3
+
+.L997: // scalar tail: elements left over after the 4-element vector blocks
+    andi I, N, 3 // N mod 4
+    bge $r0, I, .L999
+    .align 3
+
+.L998: // one complex element per iteration, used for every alpha case
+    LD   a1, X, 0 * SIZE // re (this overwrites the 0.0 kept in a1 for the dispatch)
+    LD   a2, X, 1 * SIZE // im
+    addi.d I, I, -1
+    MUL  s1, ALPHAI, a2 // alpha_i * im
+    MUL  s2, ALPHAI, a1 // alpha_i * re
+    MSUB s1, ALPHAR, a1, s1 // presumably alpha_r * re - s1 (MSUB comes from common.h)
+    MADD s2, ALPHAR, a2, s2 // presumably alpha_r * im + s2 (MADD comes from common.h)
+    ST   s1, X, 0 * SIZE // NOTE(review): with alpha == 0 this multiplies instead of storing 0 as .L111/.L221 do
+    ST   s2, X, 1 * SIZE
+    add.d X, X, INCX // INCX was already shifted in the prologue
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+    move $r4, $r12 // return value = I; NOTE(review): $r12 is not yet written on the early exits
+    jirl $r0, $r1, 0x0 // return to caller
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/csum_lasx.S b/kernel/loongarch64/csum_lasx.S
new file mode 100644
index 000000000..3e65f2c15
--- /dev/null
+++ b/kernel/loongarch64/csum_lasx.S
@@ -0,0 +1,274 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#define N      $r4   /* number of complex elements */
+#define X      $r5   /* pointer to the vector (interleaved re, im) */
+#define INCX   $r6   /* stride between elements; shifted by ZBASE_SHIFT in the prologue */
+#define I      $r17  /* loop counter (8-element blocks, then tail elements) */
+#define TEMP   $r18  /* holds 1 << ZBASE_SHIFT, the unit-stride value of INCX */
+#define t1     $r15  /* integer scratch for strided loads */
+#define t2     $r12  /* integer scratch for strided loads */
+#define t3     $r13  /* integer scratch for strided loads */
+#define t4     $r14  /* integer scratch for strided loads */
+#define a1     $f12  /* scalar tail: loaded real part, then re + im */
+#define a2     $f13  /* scalar tail: loaded imaginary part */
+#define a3     $f14  /* not referenced in this kernel */
+#define a4     $f15  /* not referenced in this kernel */
+#define s1     $f16  /* scalar accumulator; same register number as res1 */
+#define VX0    $xr12 /* vector load / temporary; same register number as a1 */
+#define VX1    $xr13 /* vector load / temporary; same register number as a2 */
+#define VX2    $xr14 /* vector load / temporary */
+#define VX3    $xr15 /* vector load / temporary */
+#define res1   $xr16 /* running vector accumulator */
+#define res2   $xr17 /* per-iteration pairwise sum / reduction scratch */
+    PROLOGUE
+    xvxor.v res1, res1, res1 // accumulator = 0, so the early exits return 0
+    xvxor.v res2, res2, res2
+    bge $r0, N, .L999 // N <= 0: result stays 0
+    bge $r0, INCX, .L999 // non-positive strides are not processed
+    li.d  TEMP, 1
+    slli.d  TEMP, TEMP, ZBASE_SHIFT // TEMP = value INCX takes for a contiguous vector
+    slli.d  INCX, INCX, ZBASE_SHIFT // scale INCX by the same shift (presumably to bytes)
+    srai.d I, N, 3 // 8 complex elements per block in both precisions
+    bne INCX, TEMP, .L20 // non-unit stride goes to the gather path
+    bge $r0, I, .L13 // fewer than 8 elements: scalar tail only
+    .align 3
+
+.L11: // unit stride: add 8 complex elements (16 scalars) into res1 per iteration
+#ifdef DOUBLE
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvfadd.d res2, VX0, VX1 // pairwise sum of the first two vectors
+    xvfadd.d res1, res1, res2
+    xvld VX2, X, 8 * SIZE
+    xvld VX3, X, 12 * SIZE
+    xvfadd.d res2, VX2, VX3 // pairwise sum of the last two vectors
+    xvfadd.d res1, res1, res2
+#else
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 8 * SIZE
+    xvfadd.s res2, VX0, VX1 // res2 keeps this pairwise sum when the loop exits
+    xvfadd.s res1, res2, res1
+#endif
+    addi.d X, X, 16 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L11
+    .align 3
+
+.L12: // horizontal reduction: fold every lane of res1 into lane 0 (read later as s1)
+#ifdef DOUBLE
+    xvpickve.d VX1, res1, 1 // lane k of res1 into lane 0 of a scratch vector
+    xvpickve.d VX2, res1, 2
+    xvpickve.d VX3, res1, 3
+    xvfadd.d res1, VX1, res1
+    xvfadd.d res1, VX2, res1
+    xvfadd.d res1, VX3, res1
+#else
+    xvor.v res2, res1, res1 // plain copy of the accumulator; res2 still held the last pairwise sum from .L11
+    xvpickve.w VX1, res1, 1 // lane k of res1 into lane 0 of a scratch vector
+    xvpickve.w VX2, res1, 2
+    xvpickve.w VX3, res1, 3
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+    xvpickve.w VX0, res2, 4 // upper four lanes are taken from the copy
+    xvpickve.w VX1, res2, 5
+    xvpickve.w VX2, res2, 6
+    xvpickve.w VX3, res2, 7
+    xvfadd.s res1, VX0, res1
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1 // lane 7; each of the eight lanes is added exactly once
+#endif
+    .align 3
+
+.L13: // unit-stride scalar tail
+    andi I, N, 7 // N mod 8, matching the 8-element block
+    bge $r0, I, .L999
+    .align 3
+
+.L14: // one complex element per iteration
+    LD a1, X, 0 * SIZE // re
+    LD a2, X, 1 * SIZE // im
+    ADD a1, a1, a2 // re + im
+    ADD s1, a1, s1 // accumulate into s1 (lane 0 of res1)
+
+    addi.d I, I, -1
+    addi.d  X, X, 2 * SIZE // next contiguous complex element
+    blt $r0, I, .L14
+    b .L999
+    .align 3
+
+.L20: // non-unit stride entry
+    bge $r0, I, .L23 // fewer than 8 elements: scalar tail only
+    .align 3
+
+.L21: // gather 8 strided complex elements into vectors and add them into res1
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE // raw bits of re
+    ld.d t2, X, 1 * SIZE // raw bits of im
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0 // VX0 = elements 0 and 1, interleaved
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0 // VX1 = elements 2 and 3
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvfadd.d res2, VX0, VX1
+    xvfadd.d res1, res1, res2
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0 // VX0 = elements 4 and 5
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0 // VX1 = elements 6 and 7
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvfadd.d res2, VX0, VX1
+    xvfadd.d res1, res1, res2
+#else
+    ld.w t1, X, 0 * SIZE // raw bits of re
+    ld.w t2, X, 1 * SIZE // raw bits of im
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0 // VX0 lanes 0..3 = elements 0 and 1
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4 // VX0 lanes 4..7 = elements 2 and 3
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX1, t1, 0 // VX1 lanes 0..3 = elements 4 and 5
+    xvinsgr2vr.w VX1, t2, 1
+    xvinsgr2vr.w VX1, t3, 2
+    xvinsgr2vr.w VX1, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX1, t1, 4 // VX1 lanes 4..7 = elements 6 and 7
+    xvinsgr2vr.w VX1, t2, 5
+    xvinsgr2vr.w VX1, t3, 6
+    xvinsgr2vr.w VX1, t4, 7
+    xvfadd.s res2, VX0, VX1 // res2 keeps this pairwise sum when the loop exits
+    xvfadd.s res1, res2, res1
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22: // horizontal reduction: fold every lane of res1 into lane 0 (read later as s1)
+#ifdef DOUBLE
+    xvpickve.d VX1, res1, 1 // lane k of res1 into lane 0 of a scratch vector
+    xvpickve.d VX2, res1, 2
+    xvpickve.d VX3, res1, 3
+    xvfadd.d res1, VX1, res1
+    xvfadd.d res1, VX2, res1
+    xvfadd.d res1, VX3, res1
+#else
+    xvor.v res2, res1, res1 // plain copy of the accumulator; res2 still held the last pairwise sum from .L21
+    xvpickve.w VX1, res1, 1 // lane k of res1 into lane 0 of a scratch vector
+    xvpickve.w VX2, res1, 2
+    xvpickve.w VX3, res1, 3
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+    xvpickve.w VX0, res2, 4 // upper four lanes are taken from the copy
+    xvpickve.w VX1, res2, 5
+    xvpickve.w VX2, res2, 6
+    xvpickve.w VX3, res2, 7
+    xvfadd.s res1, VX0, res1
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1 // lane 7; each of the eight lanes is added exactly once
+#endif
+    .align 3
+
+.L23: // strided scalar tail
+    andi I, N, 7 // N mod 8, matching the 8-element block
+    bge $r0, I, .L999
+    .align 3
+
+.L24: // one complex element per iteration
+    LD a1, X, 0 * SIZE // re
+    LD a2, X, 1 * SIZE // im
+    ADD a1, a1, a2 // re + im
+    ADD s1, a1, s1 // accumulate into s1 (lane 0 of res1)
+
+    addi.d I, I, -1
+    add.d  X, X, INCX // INCX was already shifted in the prologue
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    fmov.s $f0, $f16 // return s1 in $f0; NOTE(review): fmov.s is also used in the DOUBLE build - confirm it carries the full 64-bit value
+    jirl $r0, $r1, 0x0 // return to caller
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/csum_lsx.S b/kernel/loongarch64/csum_lsx.S
new file mode 100644
index 000000000..8de8e27ca
--- /dev/null
+++ b/kernel/loongarch64/csum_lsx.S
@@ -0,0 +1,266 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+#define N      $r4 // number of complex elements to sum
+#define X      $r5 // pointer to the current input element
+#define INCX   $r6 // element stride; shifted left by ZBASE_SHIFT in the prologue
+#define I      $r17 // loop counter
+#define TEMP   $r18 // holds 1 << ZBASE_SHIFT, the stride value that selects the contiguous path
+#define t1     $r15 // t1..t4: integer scratch for the strided loads
+#define t2     $r12
+#define t3     $r13
+#define t4     $r14
+#define a1     $f12 // a1/a2: scalar scratch for the tail loops
+#define a2     $f13
+#define a3     $f14 // a3/a4 are not referenced in this file
+#define a4     $f15
+#define s1     $f16 // scalar accumulator; same register number as res1, so the tail loops continue from the vector total
+#define VX0    $vr12 // VX0..VX3: vector scratch
+#define VX1    $vr13
+#define VX2    $vr14
+#define VX3    $vr15
+#define res1   $vr16 // running vector accumulator
+#define res2   $vr17 // res2/res3: per-iteration partial sums
+#define res3   $vr18
+    PROLOGUE // entry: sums the real and imaginary parts of N complex elements of X
+    vxor.v res1, res1, res1 // accumulator = 0 (this is also the value returned on the early exits below)
+    vxor.v res2, res2, res2
+    bge $r0, N, .L999 // N <= 0: nothing to sum
+    bge $r0, INCX, .L999 // INCX <= 0: nothing to sum
+    li.d  TEMP, 1
+    slli.d  TEMP, TEMP, ZBASE_SHIFT // TEMP = scaled stride of 1 (ZBASE_SHIFT comes from common.h; presumably log2 of the complex element size in bytes)
+    slli.d  INCX, INCX, ZBASE_SHIFT // INCX is added directly to the X pointer from here on
+    srai.d I, N, 3 // I = number of 8-element blocks
+    bne INCX, TEMP, .L20 // non-unit stride: strided path
+    bge $r0, I, .L13 // fewer than 8 elements: scalar tail only
+    .align 3
+
+.L11: // contiguous main loop: accumulate 8 complex elements (16 scalars) per pass into res1
+#ifdef DOUBLE
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    vld VX2, X, 4 * SIZE
+    vld VX3, X, 6 * SIZE
+    vfadd.d res2, VX2, VX3
+    vfadd.d res1, res1, res2
+    vld VX0, X, 8 * SIZE
+    vld VX1, X, 10 * SIZE
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    vld VX2, X, 12 * SIZE
+    vld VX3, X, 14 * SIZE
+    vfadd.d res2, VX2, VX3
+    vfadd.d res1, res1, res2
+#else
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vfadd.s res2, VX0, VX1
+    vld VX2, X, 8 * SIZE
+    vld VX3, X, 12 * SIZE
+    vfadd.s res3, VX2, VX3
+    vfadd.s res2, res3, res2
+    vfadd.s res1, res1, res2
+#endif
+
+    addi.d  I, I, -1
+    addi.d X, X, 16 * SIZE // advance past the 16 scalars just consumed
+    blt $r0, I, .L11
+    .align 3
+
+.L12: // horizontal reduction: fold the remaining lanes of res1 into lane 0
+#ifdef DOUBLE
+    vreplvei.d VX1, res1, 1 // broadcast lane 1 so it can be added onto lane 0
+    vfadd.d res1, VX1, res1
+#else
+    vreplvei.w VX1, res1, 1
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1
+    vfadd.s res1, VX2, res1
+    vfadd.s res1, VX3, res1
+#endif
+    .align 3
+
+.L13: // contiguous tail: the N % 8 elements left after the vector loop
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L14: // one complex element per pass
+    LD a1, X, 0 * SIZE // first scalar of the element
+    LD a2, X, 1 * SIZE // second scalar of the element
+    ADD a1, a1, a2
+    ADD s1, a1, s1 // s1 carries the reduced vector total from .L12 (or 0 if the vector loop was skipped)
+    addi.d I, I, -1
+    addi.d  X, X, 2 * SIZE
+    blt $r0, I, .L14
+    b .L999
+    .align 3
+
+.L20: // strided path (scaled INCX differs from one element)
+    bge $r0, I, .L23 // fewer than 8 elements: scalar tail only
+    .align 3
+
+.L21: // strided main loop: gather 8 complex elements per pass through integer registers
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE // raw 64-bit loads; the bits are inserted into vector lanes unchanged
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    add.d X, X, INCX
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t3, 0
+    vinsgr2vr.d VX0, t4, 1
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    add.d X, X, INCX
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t3, 0
+    vinsgr2vr.d VX0, t4, 1
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+#else
+    ld.w t1, X, 0 * SIZE // elements 0-1 -> VX0
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE // elements 2-3 -> VX1
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vfadd.s res2, VX0, VX1
+    ld.w t1, X, 0 * SIZE // elements 4-5 -> VX2
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    ld.w t1, X, 0 * SIZE // elements 6-7 -> VX3
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    vfadd.s res3, VX2, VX3
+    vfadd.s res2, res3, res2
+    vfadd.s res1, res1, res2
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22: // horizontal reduction: fold the remaining lanes of res1 into lane 0
+#ifdef DOUBLE
+    vreplvei.d VX1, res1, 1
+    vfadd.d res1, VX1, res1
+#else
+    vreplvei.w VX1, res1, 1
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1
+    vfadd.s res1, VX2, res1
+    vfadd.s res1, VX3, res1
+#endif
+    .align 3
+
+.L23: // strided tail: the N % 8 elements left after the vector loop
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24: // one complex element per pass, stepping by the scaled stride
+    LD a1, X, 0 * SIZE
+    LD a2, X, 1 * SIZE
+    ADD a1, a1, a2
+    ADD s1, a1, s1
+    addi.d I, I, -1
+    add.d  X, X, INCX
+    blt $r0, I, .L24 // falls through to .L999 when done
+    .align 3
+
+.L999: // common exit: return the accumulated sum (s1 = $f16) in $f0
+    fmov.d $f0, $f16 // 64-bit move is valid for both builds; fmov.s on a double (DOUBLE defined) has an unspecified result
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/cswap_lasx.S b/kernel/loongarch64/cswap_lasx.S
new file mode 100644
index 000000000..d53773d5a
--- /dev/null
+++ b/kernel/loongarch64/cswap_lasx.S
@@ -0,0 +1,394 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4 // number of complex elements to swap
+#define X      $r7 // pointer to the current element of the first vector
+#define INCX   $r8 // X stride; shifted left by ZBASE_SHIFT in the prologue
+#define Y      $r9 // pointer to the current element of the second vector
+#define INCY   $r10 // Y stride; shifted left by ZBASE_SHIFT in the prologue
+
+#define I      $r17 // loop counter
+#define TEMP   $r18 // holds 1 << ZBASE_SHIFT, the stride value that selects a contiguous path
+#define XX     $r5 // trailing store cursor into X for the doubly-strided loop (.L222)
+#define YY     $r6 // not referenced in this file
+#define t1     $r14 // t1..t4: integer scratch for strided loads
+#define t2     $r15
+#define t3     $r16
+#define t4     $r19
+#define a1     $f12 // a1..a4: scalar scratch (tail loops, and X values in .L222)
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define b1     $f16 // b1..b4: scalar scratch for Y values in .L222
+#define b2     $f17
+#define b3     $f18
+#define b4     $f19
+#define VX0    $xr12 // VX0/VX1 hold X data, VX2/VX3 hold Y data in the vector loops
+#define VX1    $xr13
+#define VX2    $xr14
+#define VX3    $xr15
+
+
+    PROLOGUE // entry: exchanges N complex elements between X and Y
+    bge $r0, N, .L999 // N <= 0: nothing to do
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, ZBASE_SHIFT // TEMP = scaled stride of 1 (ZBASE_SHIFT comes from common.h; presumably log2 of the complex element size in bytes)
+    slli.d  INCX, INCX, ZBASE_SHIFT // strides are added directly to the pointers from here on
+    slli.d  INCY, INCY, ZBASE_SHIFT
+    srai.d I, N, 2 // I = number of 4-element blocks
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11: // both vectors contiguous
+    bge $r0, I, .L112 // fewer than 4 elements: scalar tail only
+    .align 3
+
+.L111: // swap 4 complex elements (8 scalars) per pass with whole-vector loads and stores
+#ifdef DOUBLE
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    xvst VX2, X, 0 * SIZE // all loads are issued before any store
+    xvst VX3, X, 4 * SIZE
+    xvst VX0, Y, 0 * SIZE
+    xvst VX1, Y, 4 * SIZE
+#else
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    xvst VX2, X, 0 * SIZE
+    xvst VX0, Y, 0 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L111
+    .align 3
+
+.L112: // tail: the N % 4 elements left after the vector loop
+    andi I, N, 3
+    bge $r0, I, .L999
+    .align 3
+.L113: // swap one complex element per pass
+    LD a1, X, 0 * SIZE
+    LD a2, X, 1 * SIZE
+    LD a3, Y, 0 * SIZE
+    LD a4, Y, 1 * SIZE
+    ST a1, Y, 0 * SIZE
+    ST a2, Y, 1 * SIZE
+    ST a3, X, 0 * SIZE
+    ST a4, X, 1 * SIZE
+    addi.d I, I, -1
+    addi.d  X, X, 2 * SIZE
+    addi.d  Y, Y, 2 * SIZE
+    blt $r0, I, .L113
+    b .L999
+    .align 3
+
+.L12: // INCX==1 and INCY!=1
+    bge $r0, I, .L122 // fewer than 4 elements: scalar tail only
+    .align 3
+
+.L121: // X is loaded/stored as whole vectors; Y is read into integer regs and overwritten lane by lane
+#ifdef DOUBLE
+    xvld VX0, X, 0 * SIZE // X elements 0-1
+    ld.d t1, Y, 0 * SIZE // read the old Y value before the lane store overwrites it
+    xvstelm.d VX0, Y, 0 * SIZE, 0
+    ld.d t2, Y, 1 * SIZE
+    xvstelm.d VX0, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    xvstelm.d VX0, Y, 0 * SIZE, 2
+    ld.d t4, Y, 1 * SIZE
+    xvstelm.d VX0, Y, 1 * SIZE, 3
+    xvinsgr2vr.d VX2, t1, 0 // rebuild the old Y values as a vector
+    xvinsgr2vr.d VX2, t2, 1
+    xvinsgr2vr.d VX2, t3, 2
+    xvinsgr2vr.d VX2, t4, 3
+    add.d Y, Y, INCY
+    xvst VX2, X, 0 * SIZE
+    xvld VX1, X, 4 * SIZE // X elements 2-3
+    ld.d t1, Y, 0 * SIZE
+    xvstelm.d VX1, Y, 0 * SIZE, 0
+    ld.d t2, Y, 1 * SIZE
+    xvstelm.d VX1, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    xvstelm.d VX1, Y, 0 * SIZE, 2
+    ld.d t4, Y, 1 * SIZE
+    xvstelm.d VX1, Y, 1 * SIZE, 3
+    xvinsgr2vr.d VX3, t1, 0
+    xvinsgr2vr.d VX3, t2, 1
+    xvinsgr2vr.d VX3, t3, 2
+    xvinsgr2vr.d VX3, t4, 3
+    add.d Y, Y, INCY
+    xvst VX3, X, 4 * SIZE
+#else
+    xvld VX0, X, 0 * SIZE // X elements 0-3 in one vector
+    ld.w t1, Y, 0 * SIZE
+    xvstelm.w VX0, Y, 0 * SIZE, 0
+    ld.w t2, Y, 1 * SIZE
+    xvstelm.w VX0, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    xvstelm.w VX0, Y, 0 * SIZE, 2
+    ld.w t4, Y, 1 * SIZE
+    xvstelm.w VX0, Y, 1 * SIZE, 3
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    ld.w t1, Y, 0 * SIZE
+    xvstelm.w VX0, Y, 0 * SIZE, 4
+    ld.w t2, Y, 1 * SIZE
+    xvstelm.w VX0, Y, 1 * SIZE, 5
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    xvstelm.w VX0, Y, 0 * SIZE, 6
+    ld.w t4, Y, 1 * SIZE
+    xvstelm.w VX0, Y, 1 * SIZE, 7
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvst VX2, X, 0 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    .align 3
+
+.L122: // tail: the N % 4 elements left after the vector loop
+    andi I, N, 3
+    bge $r0, I, .L999
+    .align 3
+
+.L123: // swap one complex element per pass; X is contiguous, Y steps by the scaled stride
+    LD a1, X, 0 * SIZE
+    LD a2, X, 1 * SIZE
+    LD a3, Y, 0 * SIZE
+    LD a4, Y, 1 * SIZE
+    ST a1, Y, 0 * SIZE
+    ST a2, Y, 1 * SIZE
+    ST a3, X, 0 * SIZE
+    ST a4, X, 1 * SIZE
+    addi.d I, I, -1
+    addi.d  X, X, 2 * SIZE
+    add.d  Y, Y, INCY
+    blt $r0, I, .L123
+    b .L999
+    .align 3
+
+.L21: // INCX!=1 and INCY==1: mirror image of .L12 with the roles of X and Y exchanged
+    bge $r0, I, .L212 // fewer than 4 elements: scalar tail only
+    .align 3
+
+.L211: // Y is loaded/stored as whole vectors; X is read into integer regs and overwritten lane by lane
+#ifdef DOUBLE
+    xvld VX2, Y, 0 * SIZE // Y elements 0-1
+    ld.d t1, X, 0 * SIZE // read the old X value before the lane store overwrites it
+    xvstelm.d VX2, X, 0 * SIZE, 0
+    ld.d t2, X, 1 * SIZE
+    xvstelm.d VX2, X, 1 * SIZE, 1
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    xvstelm.d VX2, X, 0 * SIZE, 2
+    ld.d t4, X, 1 * SIZE
+    xvstelm.d VX2, X, 1 * SIZE, 3
+    xvinsgr2vr.d VX0, t1, 0 // rebuild the old X values as a vector
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    add.d X, X, INCX
+    xvst VX0, Y, 0 * SIZE
+    xvld VX3, Y, 4 * SIZE // Y elements 2-3
+    ld.d t1, X, 0 * SIZE
+    xvstelm.d VX3, X, 0 * SIZE, 0
+    ld.d t2, X, 1 * SIZE
+    xvstelm.d VX3, X, 1 * SIZE, 1
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    xvstelm.d VX3, X, 0 * SIZE, 2
+    ld.d t4, X, 1 * SIZE
+    xvstelm.d VX3, X, 1 * SIZE, 3
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    add.d X, X, INCX
+    xvst VX1, Y, 4 * SIZE
+#else
+    xvld VX2, Y, 0 * SIZE // Y elements 0-3 in one vector
+    ld.w t1, X, 0 * SIZE
+    xvstelm.w VX2, X, 0 * SIZE, 0
+    ld.w t2, X, 1 * SIZE
+    xvstelm.w VX2, X, 1 * SIZE, 1
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    xvstelm.w VX2, X, 0 * SIZE, 2
+    ld.w t4, X, 1 * SIZE
+    xvstelm.w VX2, X, 1 * SIZE, 3
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    ld.w t1, X, 0 * SIZE
+    xvstelm.w VX2, X, 0 * SIZE, 4
+    ld.w t2, X, 1 * SIZE
+    xvstelm.w VX2, X, 1 * SIZE, 5
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    xvstelm.w VX2, X, 0 * SIZE, 6
+    ld.w t4, X, 1 * SIZE
+    xvstelm.w VX2, X, 1 * SIZE, 7
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    add.d X, X, INCX
+    xvst VX0, Y, 0 * SIZE
+#endif
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    .align 3
+
+.L212: // tail: the N % 4 elements left after the vector loop
+    andi I, N, 3
+    bge $r0, I, .L999
+    .align 3
+
+.L213: // swap one complex element per pass; X steps by the scaled stride, Y is contiguous
+    LD a1, X, 0 * SIZE
+    LD a2, X, 1 * SIZE
+    LD a3, Y, 0 * SIZE
+    LD a4, Y, 1 * SIZE
+    ST a1, Y, 0 * SIZE
+    ST a2, Y, 1 * SIZE
+    ST a3, X, 0 * SIZE
+    ST a4, X, 1 * SIZE
+    addi.d I, I, -1
+    add.d  X, X, INCX
+    addi.d  Y, Y, 2 * SIZE
+    blt $r0, I, .L213
+    b .L999
+    .align 3
+
+.L22: // INCX!=1 and INCY!=1: scalar-only path
+    bge $r0, I, .L223 // fewer than 4 elements: tail loop only
+    .align 3
+    move XX, X // XX is the store cursor into X; X itself runs ahead as the load cursor
+
+.L222: // swap 4 complex elements per pass, two at a time
+    LD a1, X, 0 * SIZE // X elements 0-1 -> a1..a4
+    LD a2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD a3, X, 0 * SIZE
+    LD a4, X, 1 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE // read old Y into b*, then overwrite it with the X value
+    ST a1, Y, 0 * SIZE
+    LD b2, Y, 1 * SIZE
+    ST a2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    LD b3, Y, 0 * SIZE
+    ST a3, Y, 0 * SIZE
+    LD b4, Y, 1 * SIZE
+    ST a4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+
+    LD a1, X, 0 * SIZE // load X elements 2-3 while storing old Y values behind at XX
+    ST b1, XX, 0 * SIZE
+    LD a2, X, 1 * SIZE
+    add.d X, X, INCX
+    ST b2, XX, 1 * SIZE
+    add.d XX, XX, INCX
+    LD a3, X, 0 * SIZE
+    ST b3, XX, 0 * SIZE
+    LD a4, X, 1 * SIZE
+    add.d X, X, INCX
+    ST b4, XX, 1 * SIZE
+    add.d XX, XX, INCX
+
+    LD b1, Y, 0 * SIZE
+    ST a1, Y, 0 * SIZE
+    LD b2, Y, 1 * SIZE
+    ST a2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    LD b3, Y, 0 * SIZE
+    ST a3, Y, 0 * SIZE
+    LD b4, Y, 1 * SIZE
+    ST a4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+
+    ST b1, XX, 0 * SIZE
+    ST b2, XX, 1 * SIZE
+    add.d XX, XX, INCX
+    ST b3, XX, 0 * SIZE
+    ST b4, XX, 1 * SIZE
+
+    add.d XX, XX, INCX // XX has now caught up with X
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    .align 3
+
+.L223: // tail: the N % 4 elements left after the unrolled loop
+    andi I, N, 3
+    bge $r0, I, .L999
+    .align 3
+
+.L224: // swap one complex element per pass; both pointers step by their scaled strides
+    LD a1, X, 0 * SIZE
+    LD a2, X, 1 * SIZE
+    LD a3, Y, 0 * SIZE
+    LD a4, Y, 1 * SIZE
+    ST a1, Y, 0 * SIZE
+    ST a2, Y, 1 * SIZE
+    ST a3, X, 0 * SIZE
+    ST a4, X, 1 * SIZE
+    addi.d I, I, -1
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L224 // falls through to .L999 when done
+    .align 3
+
+.L999: // common exit
+    move $r4, $r0 // return 0; $r12 is never written in this file, so the old "move $r4, $r12" returned an indeterminate value
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/cswap_lsx.S b/kernel/loongarch64/cswap_lsx.S
new file mode 100644
index 000000000..62a869066
--- /dev/null
+++ b/kernel/loongarch64/cswap_lsx.S
@@ -0,0 +1,421 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4 // number of complex elements to swap
+#define X      $r7 // pointer to the current element of the first vector
+#define INCX   $r8 // X stride; shifted left by ZBASE_SHIFT in the prologue
+#define Y      $r9 // pointer to the current element of the second vector
+#define INCY   $r10 // Y stride; shifted left by ZBASE_SHIFT in the prologue
+
+#define I      $r17 // loop counter
+#define TEMP   $r18 // holds 1 << ZBASE_SHIFT, the stride value that selects a contiguous path
+#define XX     $r5 // trailing store cursor into X for the doubly-strided loop (.L222)
+#define YY     $r6 // not referenced in this file
+#define t1     $r14 // t1..t4: integer scratch for strided loads
+#define t2     $r15
+#define t3     $r16
+#define t4     $r19
+#define a1     $f12 // a1..a4: scalar scratch (tail loops, and X values in .L222)
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define b1     $f16 // b1..b4: scalar scratch for Y values in .L222
+#define b2     $f17
+#define b3     $f18
+#define b4     $f19
+#define VX0    $vr12 // VX0/VX1 hold X data, VX2/VX3 hold Y data in the vector loops
+#define VX1    $vr13
+#define VX2    $vr14
+#define VX3    $vr15
+
+
+    PROLOGUE // entry: exchanges N complex elements between X and Y
+    bge $r0, N, .L999 // N <= 0: nothing to do
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, ZBASE_SHIFT // TEMP = scaled stride of 1 (ZBASE_SHIFT comes from common.h; presumably log2 of the complex element size in bytes)
+    slli.d  INCX, INCX, ZBASE_SHIFT // strides are added directly to the pointers from here on
+    slli.d  INCY, INCY, ZBASE_SHIFT
+    srai.d I, N, 2 // I = number of 4-element blocks
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11: // both vectors contiguous
+    bge $r0, I, .L112 // fewer than 4 elements: scalar tail only
+    .align 3
+
+.L111: // swap 4 complex elements (8 scalars) per pass with whole-vector loads and stores
+#ifdef DOUBLE
+    vld VX0, X, 0 * SIZE // elements 0-1
+    vld VX1, X, 2 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX3, Y, 2 * SIZE
+    vst VX2, X, 0 * SIZE
+    vst VX3, X, 2 * SIZE
+    vst VX0, Y, 0 * SIZE
+    vst VX1, Y, 2 * SIZE
+    vld VX0, X, 4 * SIZE // elements 2-3
+    vld VX1, X, 6 * SIZE
+    vld VX2, Y, 4 * SIZE
+    vld VX3, Y, 6 * SIZE
+    vst VX2, X, 4 * SIZE
+    vst VX3, X, 6 * SIZE
+    vst VX0, Y, 4 * SIZE
+    vst VX1, Y, 6 * SIZE
+#else
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vst VX2, X, 0 * SIZE
+    vst VX3, X, 4 * SIZE
+    vst VX0, Y, 0 * SIZE
+    vst VX1, Y, 4 * SIZE
+#endif
+    addi.d  I, I, -1
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L111
+    .align 3
+
+.L112: // tail: the N % 4 elements left after the vector loop
+    andi I, N, 3
+    bge $r0, I, .L999
+    .align 3
+
+.L113: // swap one complex element per pass
+    LD a1, X, 0 * SIZE
+    LD a2, X, 1 * SIZE
+    LD a3, Y, 0 * SIZE
+    LD a4, Y, 1 * SIZE
+    ST a1, Y, 0 * SIZE
+    ST a2, Y, 1 * SIZE
+    ST a3, X, 0 * SIZE
+    ST a4, X, 1 * SIZE
+    addi.d I, I, -1
+    addi.d  X, X, 2 * SIZE
+    addi.d  Y, Y, 2 * SIZE
+    blt $r0, I, .L113
+    b .L999
+    .align 3
+
+.L12: // INCX==1 and INCY!=1
+    bge $r0, I, .L122 // fewer than 4 elements: scalar tail only
+    .align 3
+
+.L121: // X is loaded/stored as whole vectors; Y is read into integer regs and overwritten lane by lane
+#ifdef DOUBLE
+    vld VX0, X, 0 * SIZE // element 0
+    ld.d t1, Y, 0 * SIZE // read the old Y value before the lane store overwrites it
+    vstelm.d VX0, Y, 0 * SIZE, 0
+    ld.d t2, Y, 1 * SIZE
+    vstelm.d VX0, Y, 1 * SIZE, 1
+    vinsgr2vr.d VX2, t1, 0 // rebuild the old Y values as a vector
+    vinsgr2vr.d VX2, t2, 1
+    add.d Y, Y, INCY
+    vst VX2, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE // element 1
+    ld.d t3, Y, 0 * SIZE
+    vstelm.d VX1, Y, 0 * SIZE, 0
+    ld.d t4, Y, 1 * SIZE
+    vstelm.d VX1, Y, 1 * SIZE, 1
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    vst VX3, X, 2 * SIZE
+    vld VX0, X, 4 * SIZE // element 2
+    ld.d t1, Y, 0 * SIZE
+    vstelm.d VX0, Y, 0 * SIZE, 0
+    ld.d t2, Y, 1 * SIZE
+    vstelm.d VX0, Y, 1 * SIZE, 1
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    add.d Y, Y, INCY
+    vst VX2, X, 4 * SIZE
+    vld VX1, X, 6 * SIZE // element 3
+    ld.d t3, Y, 0 * SIZE
+    vstelm.d VX1, Y, 0 * SIZE, 0
+    ld.d t4, Y, 1 * SIZE
+    vstelm.d VX1, Y, 1 * SIZE, 1
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    vst VX3, X, 6 * SIZE
+#else
+    vld VX0, X, 0 * SIZE // elements 0-1
+    ld.w t1, Y, 0 * SIZE
+    vstelm.w VX0, Y, 0 * SIZE, 0
+    ld.w t2, Y, 1 * SIZE
+    vstelm.w VX0, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    vstelm.w VX0, Y, 0 * SIZE, 2
+    ld.w t4, Y, 1 * SIZE
+    vstelm.w VX0, Y, 1 * SIZE, 3
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    vst VX2, X, 0 * SIZE
+
+    vld VX1, X, 4 * SIZE // elements 2-3
+    ld.w t1, Y, 0 * SIZE
+    vstelm.w VX1, Y, 0 * SIZE, 0
+    ld.w t2, Y, 1 * SIZE
+    vstelm.w VX1, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    vstelm.w VX1, Y, 0 * SIZE, 2
+    ld.w t4, Y, 1 * SIZE
+    vstelm.w VX1, Y, 1 * SIZE, 3
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    add.d Y, Y, INCY
+    vst VX3, X, 4 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    .align 3
+
+.L122: // tail: the N % 4 elements left after the vector loop
+    andi I, N, 3
+    bge $r0, I, .L999
+    .align 3
+
+.L123: // swap one complex element per pass; X is contiguous, Y steps by the scaled stride
+    LD a1, X, 0 * SIZE
+    LD a2, X, 1 * SIZE
+    LD a3, Y, 0 * SIZE
+    LD a4, Y, 1 * SIZE
+    ST a1, Y, 0 * SIZE
+    ST a2, Y, 1 * SIZE
+    ST a3, X, 0 * SIZE
+    ST a4, X, 1 * SIZE
+    addi.d I, I, -1
+    addi.d  X, X, 2 * SIZE
+    add.d  Y, Y, INCY
+    blt $r0, I, .L123
+    b .L999
+    .align 3
+
+.L21:// INCX!=1 and INCY==1
+    bge $r0, I, .L212 // fewer than 4 elements: scalar tail only
+    .align 3
+
+.L211: // Y is loaded/stored as whole vectors; X is read into integer regs and overwritten lane by lane
+#ifdef DOUBLE
+    vld VX2, Y, 0 * SIZE // element 0
+    ld.d t1, X, 0 * SIZE // read the old X value before the lane store overwrites it
+    vstelm.d VX2, X, 0 * SIZE, 0
+    ld.d t2, X, 1 * SIZE
+    vstelm.d VX2, X, 1 * SIZE, 1
+    vinsgr2vr.d VX0, t1, 0 // rebuild the old X values as a vector
+    vinsgr2vr.d VX0, t2, 1
+    add.d X, X, INCX
+    vst VX0, Y, 0 * SIZE
+    vld VX3, Y, 2 * SIZE // element 1
+    ld.d t3, X, 0 * SIZE
+    vstelm.d VX3, X, 0 * SIZE, 0
+    ld.d t4, X, 1 * SIZE
+    vstelm.d VX3, X, 1 * SIZE, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vst VX1, Y, 2 * SIZE
+    vld VX2, Y, 4 * SIZE // element 2
+    ld.d t1, X, 0 * SIZE
+    vstelm.d VX2, X, 0 * SIZE, 0
+    ld.d t2, X, 1 * SIZE
+    vstelm.d VX2, X, 1 * SIZE, 1
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    add.d X, X, INCX
+    vst VX0, Y, 4 * SIZE
+    vld VX3, Y, 6 * SIZE // element 3
+    ld.d t3, X, 0 * SIZE
+    vstelm.d VX3, X, 0 * SIZE, 0
+    ld.d t4, X, 1 * SIZE
+    vstelm.d VX3, X, 1 * SIZE, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vst VX1, Y, 6 * SIZE
+#else
+    vld VX2, Y, 0 * SIZE // elements 0-1
+    ld.w t1, X, 0 * SIZE
+    vstelm.w VX2, X, 0 * SIZE, 0
+    ld.w t2, X, 1 * SIZE
+    vstelm.w VX2, X, 1 * SIZE, 1
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    vstelm.w VX2, X, 0 * SIZE, 2
+    ld.w t4, X, 1 * SIZE
+    vstelm.w VX2, X, 1 * SIZE, 3
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    vst VX0, Y, 0 * SIZE
+
+    vld VX3, Y, 4 * SIZE // elements 2-3
+    ld.w t1, X, 0 * SIZE
+    vstelm.w VX3, X, 0 * SIZE, 0
+    ld.w t2, X, 1 * SIZE
+    vstelm.w VX3, X, 1 * SIZE, 1
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    vstelm.w VX3, X, 0 * SIZE, 2
+    ld.w t4, X, 1 * SIZE
+    vstelm.w VX3, X, 1 * SIZE, 3
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    add.d X, X, INCX
+    vst VX1, Y, 4 * SIZE
+#endif
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    .align 3
+
+.L212: // tail: the N % 4 elements left after the vector loop
+    andi I, N, 3
+    bge $r0, I, .L999
+    .align 3
+
+.L213: // swap one complex element per pass; X steps by the scaled stride, Y is contiguous
+    LD a1, X, 0 * SIZE
+    LD a2, X, 1 * SIZE
+    LD a3, Y, 0 * SIZE
+    LD a4, Y, 1 * SIZE
+    ST a1, Y, 0 * SIZE
+    ST a2, Y, 1 * SIZE
+    ST a3, X, 0 * SIZE
+    ST a4, X, 1 * SIZE
+    addi.d I, I, -1
+    add.d  X, X, INCX
+    addi.d  Y, Y, 2 * SIZE
+    blt $r0, I, .L213
+    b .L999
+    .align 3
+
+.L22: // INCX!=1 and INCY!=1: scalar-only path
+    bge $r0, I, .L223 // fewer than 4 elements: tail loop only
+    .align 3
+    move XX, X // XX is the store cursor into X; X itself runs ahead as the load cursor
+
+.L222: // swap 4 complex elements per pass, two at a time
+    LD a1, X, 0 * SIZE // X elements 0-1 -> a1..a4
+    LD a2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD a3, X, 0 * SIZE
+    LD a4, X, 1 * SIZE
+    add.d X, X, INCX
+    LD b1, Y, 0 * SIZE // read old Y into b*, then overwrite it with the X value
+    ST a1, Y, 0 * SIZE
+    LD b2, Y, 1 * SIZE
+    ST a2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    LD b3, Y, 0 * SIZE
+    ST a3, Y, 0 * SIZE
+    LD b4, Y, 1 * SIZE
+    ST a4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+
+    LD a1, X, 0 * SIZE // load X elements 2-3 while storing old Y values behind at XX
+    ST b1, XX, 0 * SIZE
+    LD a2, X, 1 * SIZE
+    add.d X, X, INCX
+    ST b2, XX, 1 * SIZE
+    add.d XX, XX, INCX
+    LD a3, X, 0 * SIZE
+    ST b3, XX, 0 * SIZE
+    LD a4, X, 1 * SIZE
+    add.d X, X, INCX
+    ST b4, XX, 1 * SIZE
+    add.d XX, XX, INCX
+
+    LD b1, Y, 0 * SIZE
+    ST a1, Y, 0 * SIZE
+    LD b2, Y, 1 * SIZE
+    ST a2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    LD b3, Y, 0 * SIZE
+    ST a3, Y, 0 * SIZE
+    LD b4, Y, 1 * SIZE
+    ST a4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+
+    ST b1, XX, 0 * SIZE
+    ST b2, XX, 1 * SIZE
+    add.d XX, XX, INCX
+    ST b3, XX, 0 * SIZE
+    ST b4, XX, 1 * SIZE
+    add.d XX, XX, INCX // XX has now caught up with X
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    .align 3
+
+.L223: // tail: the N % 4 elements left after the unrolled loop
+    andi I, N, 3
+    bge $r0, I, .L999
+    .align 3
+
+.L224: // swap one complex element per pass; both pointers step by their scaled strides
+    LD a1, X, 0 * SIZE
+    LD a2, X, 1 * SIZE
+    LD a3, Y, 0 * SIZE
+    LD a4, Y, 1 * SIZE
+    ST a1, Y, 0 * SIZE
+    ST a2, Y, 1 * SIZE
+    ST a3, X, 0 * SIZE
+    ST a4, X, 1 * SIZE
+
+    addi.d I, I, -1
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L224 // falls through to .L999 when done
+    .align 3
+
+.L999: // common exit
+    move $r4, $r0 // return 0; $r12 is never written in this file, so the old "move $r4, $r12" returned an indeterminate value
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/dgemm_kernel_16x4.S b/kernel/loongarch64/dgemm_kernel_16x4.S
index 13faa977e..f8e26fda2 100644
--- a/kernel/loongarch64/dgemm_kernel_16x4.S
+++ b/kernel/loongarch64/dgemm_kernel_16x4.S
@@ -28,6 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
+/*********************************************************************
+* 2023/06/28 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*
+*
+* 2023/06/28 guxiwei
+* Parameter:
+*       DGEMM_DEFAULT_UNROLL_N  4
+*       DGEMM_DEFAULT_UNROLL_M  16
+*       DGEMM_DEFAULT_P         32
+*       DGEMM_DEFAULT_Q         152
+*       DGEMM_DEFAULT_R         858
+*       A_PR1                   1024
+*       B_PR1                   256
+*
+*
+* Performance at Loongson 3A5000 2.5GHz with 5000x5000x5000:
+*       1 thread:       36.0 GFLOPS
+*       2 threads:      71.6 GFLOPS
+*       3 threads:     101.5 GFLOPS
+*       4 threads:     132.8 GFLOPS
+*********************************************************************/
+
 /* Function parameters */
 #define M      $r4   // param 1: bm
 #define N      $r5   // param 2: bn
@@ -68,1290 +93,1331 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define U4     $xr4
 #define U5     $xr5
 #define U6     $xr6
-#define D0     $xr7
-#define D1     $xr8
-#define D2     $xr9
-#define D3     $xr10
-#define D4     $xr11
-#define D5     $xr12
-#define D6     $xr13
-#define D7     $xr14
-#define D8     $xr15
-#define D9     $xr16
-#define D10    $xr17
-#define D11    $xr18
-#define D12    $xr19
-#define D13    $xr20
-#define D14    $xr21
-#define D15    $xr22
-#define VALPHA $xr23
+#define U7     $xr7
+#define U8     $xr8
+#define U9     $xr9
+#define U10    $xr10
+#define U11    $xr11
+#define U12    $xr12
+#define U13    $xr13
+#define U14    $xr14
+#define U15    $xr15  /* NOTE(review): VALPHA is defined as this same register */
+#define D0     $xr16  /* D0-D15 occupy $xr16-$xr31 */
+#define D1     $xr17
+#define D2     $xr18
+#define D3     $xr19
+#define D4     $xr20
+#define D5     $xr21
+#define D6     $xr22
+#define D7     $xr23
+#define D8     $xr24
+#define D9     $xr25
+#define D10    $xr26
+#define D11    $xr27
+#define D12    $xr28
+#define D13    $xr29
+#define D14    $xr30
+#define D15    $xr31
+#define VALPHA $xr15  /* aliases U15 ($xr15); presumably only live once U15 is dead -- verify at the use sites */
 
 /* Prefetch interval */
-#define A_PRE  0x200
+#define A_PRE  0x400
 #define B_PRE  0x100
 
-    PROLOGUE
-
-    addi.d   $sp,   $sp,   -56
-    /* Store regs */
-    SDARG    $r23,  $sp,   0
-    SDARG    $r24,  $sp,   8
-    SDARG    $r25,  $sp,   16
-    SDARG    $r26,  $sp,   24
-    SDARG    $r27,  $sp,   32
-    ST       $f23,  $sp,   40
-    ST       ALPHA, $sp,   48
-
-    /* VALPHA = {ALPHA, ALPHA, ALPHA, ALPHA} */
-    xvld         VALPHA, $sp,  48
-    xvreplve0.d  VALPHA, VALPHA
-
-#if defined (TRMMKERNEL) && !defined(LEFT)
-    sub.d   OFF,   ZERO,  OFFSET
-#else
-    xor     OFF,   OFF,   OFF
-#endif
-
-    /* if (!(N >> 2)) goto L_N3 */
-    srai.d   J,     N,     2     /* J = bn >> 2 */
-    andi     N,     N,     0x03
-    beq      ZERO,  J,     .L_N3
-
-.L_J1: /* J-- && This loop include Condition 1 */
-
-/************************* Condition 1 if((N >> 2) && (M >> 4)) START !!! *************************
-*                                                   dgemm_core_16x4                                */
-    move     C0,    C
-    move     A0,    A
-    slli.d   T0,    LDC,   3
-    add.d    C1,    C0,    T0
-    addi.d   J,     J,     -1   /* J-- */
-    add.d    C2,    C1,    T0
-    add.d    C3,    C2,    T0
-
-#if defined(TRMMKERNEL) && defined(LEFT)
-    move     OFF,   OFFSET
-#endif
-
-    /* if (!(M >> 4)) goto L_M8 */
-    srai.d   I,     M,     4     /* I = bm >> 4 */
-    beq      ZERO,  I,     .L_M8
-
-.L_I1: /* I-- */
-#if defined(TRMMKERNEL)
-#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-    move     B0,    B
-#else
-    slli.d   T0,    OFF,  0x07
-    add.d    A0,    A0,   T0
-    slli.d   T0,    OFF,  0x05
-    add.d    B0,    B,    T0
-#endif
-
-#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-    sub.d    L,     K,    OFF
-#elif defined(LEFT)
-    /* number of values in A */
-    addi.d   L,     OFF,  16
-#else
-    /* number of values in B */
-    addi.d   L,     OFF,  4
-#endif
-#else   // #if !defined(TRMMKERNEL)
-    move     B0,    B
-    move     L,     K /* L = bk */
-#endif
-    /* Calculate the first set of D0~D15,
-     * avoidig set 0 operation
-     * Load 16 * 64 from A0
-     * U0 = {a3,  a2,  a1,  a0}
-     * U1 = {a7,  a6,  a5,  a4}
-     * U2 = {a11, a10, a9,  a8}
-     * U3 = {a15, a14, a13, a12}
-     */
+.macro KERNEL2x16x4 /* Two multiply-accumulate steps into D0-D15, alternating operand sets U8-U15 and U0-U7 */
     xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
-
-    xvldrepl.d     U4, B0, 0x00
-    preld          0,  C0, 0x00
-    /* line 1 */
-    xvfmul.d  D0,  U0, U4
-    xvfmul.d  D1,  U1, U4
-    preld     0,   C0, 0x40
-    xvfmul.d  D2,  U2, U4
-    xvfmul.d  D3,  U3, U4
-
-    xvldrepl.d     U4, B0, 0x08
-    preld          0,   C1,    0x00
-    /* line 2 */
-    xvfmul.d  D4,  U0, U4
-    xvfmul.d  D5,  U1, U4
-    preld     0,   C1,    0x40
-    xvfmul.d  D6,  U2, U4
-    xvfmul.d  D7,  U3, U4
-
-    xvldrepl.d     U4, B0, 0x10
-    preld          0,   C2,    0x00
-    /* line 3 */
-    xvfmul.d  D8,  U0, U4
-    xvfmul.d  D9,  U1, U4
-    preld     0,   C2,    0x40
-    xvfmul.d  D10, U2, U4
-    xvfmul.d  D11, U3, U4
-
-    xvldrepl.d     U4, B0, 0x18
-    preld          0,   C3,    0x00
-    /* line 4 */
-    xvfmul.d  D12, U0, U4
-    xvfmul.d  D13, U1, U4
-    preld     0,   C3,    0x40
-    xvfmul.d  D14, U2, U4
-    xvfmul.d  D15, U3, U4
-
-    /* Add stride for A0 and B0 */
-    addi.d    A0,  A0, 0x80
-    addi.d    B0,  B0, 0x20
-    /* Reduce L */
-    addi.d    L,   L,  -1
-    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
-    /* if (TL < 1) goto L_L7 */
-    beq       ZERO,TL, .L_L7
+    xvfmadd.d  D0,  U8, U12, D0 /* step 1: consumes U8-U11 and U12-U15, which are read before being written here, so they must be loaded on entry */
+    xvfmadd.d  D1,  U9, U12, D1
 
-    /* Calculate 8 sets of D0~D15 */
-.L_TL1: /* TL-- */
-           /***8-1***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
     xvld     U1,   A0,    0x20
+    xvfmadd.d  D2,  U10, U12, D2
+    xvfmadd.d  D3,  U11, U12, D3
+
     xvld     U2,   A0,    0x40
+    xvfmadd.d  D4,  U8, U13, D4
+    xvfmadd.d  D5,  U9, U13, D5
+
     xvld     U3,   A0,    0x60
+    xvfmadd.d  D6,  U10, U13, D6
+    xvfmadd.d  D7,  U11, U13, D7
 
-    /* Cumulative D0~D15 */
     xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
+    xvfmadd.d  D8,  U8, U14, D8
+    xvfmadd.d  D9,  U9, U14, D9
+
     preld      0,   B0, B_PRE
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D10, U10, U14, D10
+    xvfmadd.d  D11, U11, U14, D11
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
     preld      0,   A0, A_PRE
+    xvldrepl.d U6,  B0, 0x10
+    xvfmadd.d  D12, U8, U15, D12
+    xvfmadd.d  D13, U9, U15, D13
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
-    xvfmadd.d  D10, U2, U4, D10
-    xvfmadd.d  D11, U3, U4, D11
     preld      0,   A0, A_PRE + 0x40
-
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
-    xvfmadd.d  D14, U2, U4, D14
-    xvfmadd.d  D15, U3, U4, D15
+    xvldrepl.d U7,  B0, 0x18
+    xvfmadd.d  D14, U10, U15, D14
+    xvfmadd.d  D15, U11, U15, D15
 
     addi.d     A0,  A0, 0x80
     addi.d     B0,  B0, 0x20
 
-           /***8-2***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
-
-    /* Cumulative D0~D15 */
-    xvldrepl.d U4,  B0, 0x00
+    xvld     U8,   A0,    0x00 /* step 2: refill U8-U11 / U12-U15 for the next invocation while consuming U0-U3 / U4-U7 */
     xvfmadd.d  D0,  U0, U4, D0
     xvfmadd.d  D1,  U1, U4, D1
+
+    xvld     U9,   A0,    0x20
     xvfmadd.d  D2,  U2, U4, D2
     xvfmadd.d  D3,  U3, U4, D3
+
+    xvld     U10,   A0,    0x40
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
+
+    xvld     U11,   A0,    0x60
+    xvfmadd.d  D6,  U2, U5, D6
+    xvfmadd.d  D7,  U3, U5, D7
+
+    xvldrepl.d U12,  B0, 0x00
+    xvfmadd.d  D8,  U0, U6, D8
+    xvfmadd.d  D9,  U1, U6, D9
+
     preld      0,   B0, B_PRE
+    xvldrepl.d U13,  B0, 0x08
+    xvfmadd.d  D10, U2, U6, D10
+    xvfmadd.d  D11, U3, U6, D11
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
     preld      0,   A0, A_PRE
+    xvldrepl.d U14,  B0, 0x10
+    xvfmadd.d  D12, U0, U7, D12
+    xvfmadd.d  D13, U1, U7, D13
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
-    xvfmadd.d  D10, U2, U4, D10
-    xvfmadd.d  D11, U3, U4, D11
     preld      0,   A0, A_PRE + 0x40
-
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
-    xvfmadd.d  D14, U2, U4, D14
-    xvfmadd.d  D15, U3, U4, D15
+    xvldrepl.d U15,  B0, 0x18
+    xvfmadd.d  D14, U2, U7, D14
+    xvfmadd.d  D15, U3, U7, D15
 
     addi.d     A0,  A0, 0x80
     addi.d     B0,  B0, 0x20
+.endm /* net effect per invocation: A0 += 0x100, B0 += 0x40 */
 
-           /***8-3***/
-    /* Load 16 * 64 from A0 */
+.macro KERNEL2x16x4_END /* Final variant of KERNEL2x16x4: same two accumulation steps, but step 2 loads nothing and A0/B0 advance only once */
     xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0 /* step 1: consumes U8-U11 / U12-U15 (must be loaded on entry) while U0-U7 are loaded */
+    xvfmadd.d  D1,  U9, U12, D1
+
     xvld     U1,   A0,    0x20
+    xvfmadd.d  D2,  U10, U12, D2
+    xvfmadd.d  D3,  U11, U12, D3
+
     xvld     U2,   A0,    0x40
+    xvfmadd.d  D4,  U8, U13, D4
+    xvfmadd.d  D5,  U9, U13, D5
+
     xvld     U3,   A0,    0x60
+    xvfmadd.d  D6,  U10, U13, D6
+    xvfmadd.d  D7,  U11, U13, D7
 
-    /* Cumulative D0~D15 */
     xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
+    xvfmadd.d  D8,  U8, U14, D8
+    xvfmadd.d  D9,  U9, U14, D9
+
     preld      0,   B0, B_PRE
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D10, U10, U14, D10
+    xvfmadd.d  D11, U11, U14, D11
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
     preld      0,   A0, A_PRE
+    xvldrepl.d U6,  B0, 0x10
+    xvfmadd.d  D12, U8, U15, D12
+    xvfmadd.d  D13, U9, U15, D13
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
-    xvfmadd.d  D10, U2, U4, D10
-    xvfmadd.d  D11, U3, U4, D11
     preld      0,   A0, A_PRE + 0x40
-
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
-    xvfmadd.d  D14, U2, U4, D14
-    xvfmadd.d  D15, U3, U4, D15
+    xvldrepl.d U7,  B0, 0x18
+    xvfmadd.d  D14, U10, U15, D14
+    xvfmadd.d  D15, U11, U15, D15
 
     addi.d     A0,  A0, 0x80
     addi.d     B0,  B0, 0x20
 
-           /***8-4***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
-
-    /* Cumulative D0~D15 */
-    xvldrepl.d U4,  B0, 0x00
     xvfmadd.d  D0,  U0, U4, D0
     xvfmadd.d  D1,  U1, U4, D1
+
     xvfmadd.d  D2,  U2, U4, D2
     xvfmadd.d  D3,  U3, U4, D3
+
+    xvfmadd.d  D4,  U0, U5, D4 /* step 2 continues on U0-U3 / U4-U7 only; U8-U15 are not reloaded */
+    xvfmadd.d  D5,  U1, U5, D5
+
+    xvfmadd.d  D6,  U2, U5, D6
+    xvfmadd.d  D7,  U3, U5, D7
+
+    xvfmadd.d  D8,  U0, U6, D8
+    xvfmadd.d  D9,  U1, U6, D9
+
     preld      0,   B0, B_PRE
+    xvfmadd.d  D10, U2, U6, D10
+    xvfmadd.d  D11, U3, U6, D11
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
     preld      0,   A0, A_PRE
+    xvfmadd.d  D12, U0, U7, D12
+    xvfmadd.d  D13, U1, U7, D13
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
-    xvfmadd.d  D10, U2, U4, D10
-    xvfmadd.d  D11, U3, U4, D11
     preld      0,   A0, A_PRE + 0x40
+    xvfmadd.d  D14, U2, U7, D14
+    xvfmadd.d  D15, U3, U7, D15
+.endm /* net effect per invocation: A0 += 0x80, B0 += 0x20 */
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
-    xvfmadd.d  D14, U2, U4, D14
-    xvfmadd.d  D15, U3, U4, D15
+.macro KERNEL8x16x4 /* Expands KERNEL2x16x4 four times back to back */
+.rept 4
+    KERNEL2x16x4
+.endr
+.endm
 
-    addi.d     A0,  A0, 0x80
-    addi.d     B0,  B0, 0x20
+.macro KERNEL8x16x4_END /* Expands KERNEL2x16x4 three times, then KERNEL2x16x4_END once */
+.rept 3
+    KERNEL2x16x4
+.endr
+    KERNEL2x16x4_END /* closing variant: issues no loads for a following invocation */
+.endm
 
-           /***8-5***/
-    /* Load 16 * 64 from A0 */
+.macro KERNEL2x8x4 /* Two multiply-accumulate steps into D0,D1,D4,D5,D8,D9,D12,D13, alternating operand sets U8/U9/U12-U15 and U0/U1/U4-U7 */
     xvld     U0,   A0,    0x00
     xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
 
-    /* Cumulative D0~D15 */
     xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-    preld      0,   B0, B_PRE
+    xvfmadd.d  D0,  U8, U12, D0 /* step 1: consumes U8/U9 and U12-U15, which must be loaded on entry */
+    xvfmadd.d  D1,  U9, U12, D1
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
-    preld      0,   A0, A_PRE
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U8, U13, D4
+    xvfmadd.d  D5,  U9, U13, D5
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
-    xvfmadd.d  D10, U2, U4, D10
-    xvfmadd.d  D11, U3, U4, D11
-    preld      0,   A0, A_PRE + 0x40
+    xvldrepl.d U6,  B0, 0x10
+    xvfmadd.d  D8,  U8, U14, D8
+    xvfmadd.d  D9,  U9, U14, D9
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
-    xvfmadd.d  D14, U2, U4, D14
-    xvfmadd.d  D15, U3, U4, D15
+    xvldrepl.d U7,  B0, 0x18
+    xvfmadd.d  D12, U8, U15, D12
+    xvfmadd.d  D13, U9, U15, D13
 
-    addi.d     A0,  A0, 0x80
+    addi.d     A0,  A0, 0x40 /* A0 advances 0x40 per step in this macro */
     addi.d     B0,  B0, 0x20
 
-           /***8-6***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
+    xvld     U8,   A0,    0x00 /* step 2: refill U8/U9 and U12-U15 for the next invocation while consuming U0/U1 and U4-U7 */
+    xvld     U9,   A0,    0x20
 
-    /* Cumulative D0~D15 */
-    xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U12, B0, 0x00
     xvfmadd.d  D0,  U0, U4, D0
     xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-    preld      0,   B0, B_PRE
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
-    preld      0,   A0, A_PRE
+    xvldrepl.d U13, B0, 0x08
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
-    xvfmadd.d  D10, U2, U4, D10
-    xvfmadd.d  D11, U3, U4, D11
-    preld      0,   A0, A_PRE + 0x40
+    xvldrepl.d U14, B0, 0x10
+    xvfmadd.d  D8,  U0, U6, D8
+    xvfmadd.d  D9,  U1, U6, D9
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
-    xvfmadd.d  D14, U2, U4, D14
-    xvfmadd.d  D15, U3, U4, D15
+    xvldrepl.d U15,  B0, 0x18
+    xvfmadd.d  D12, U0, U7, D12
+    xvfmadd.d  D13, U1, U7, D13
 
-    addi.d     A0,  A0, 0x80
+    addi.d     A0,  A0, 0x40
     addi.d     B0,  B0, 0x20
+.endm /* net effect per invocation: A0 += 0x80, B0 += 0x40 */
 
-           /***8-7***/
-    /* Load 16 * 64 from A0 */
+.macro KERNEL2x8x4_END /* Final variant of KERNEL2x8x4: step 2 loads nothing and A0/B0 advance only once */
     xvld     U0,   A0,    0x00
     xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
 
-    /* Cumulative D0~D15 */
     xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U8, U12, D0 /* step 1: consumes U8/U9 and U12-U15, which must be loaded on entry */
+    xvfmadd.d  D1,  U9, U12, D1
+
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U8, U13, D4
+    xvfmadd.d  D5,  U9, U13, D5
+
+    xvldrepl.d U6,  B0, 0x10
+    xvfmadd.d  D8,  U8, U14, D8
+    xvfmadd.d  D9,  U9, U14, D9
+
+    xvldrepl.d U7,  B0, 0x18
+    xvfmadd.d  D12, U8, U15, D12
+    xvfmadd.d  D13, U9, U15, D13
+
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x20
+
     xvfmadd.d  D0,  U0, U4, D0
     xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-    preld      0,   B0, B_PRE
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
-    preld      0,   A0, A_PRE
+    xvfmadd.d  D4,  U0, U5, D4 /* step 2 uses only U0/U1 and U4-U7; no further loads are issued */
+    xvfmadd.d  D5,  U1, U5, D5
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
-    xvfmadd.d  D10, U2, U4, D10
-    xvfmadd.d  D11, U3, U4, D11
-    preld      0,   A0, A_PRE + 0x40
+    xvfmadd.d  D8,  U0, U6, D8
+    xvfmadd.d  D9,  U1, U6, D9
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
-    xvfmadd.d  D14, U2, U4, D14
-    xvfmadd.d  D15, U3, U4, D15
+    xvfmadd.d  D12, U0, U7, D12
+    xvfmadd.d  D13, U1, U7, D13
+.endm /* net effect per invocation: A0 += 0x40, B0 += 0x20 */
 
-    addi.d     A0,  A0, 0x80
-    addi.d     B0,  B0, 0x20
+.macro KERNEL8x8x4 /* Expands KERNEL2x8x4 four times back to back */
+.rept 4
+    KERNEL2x8x4
+.endr
+.endm
 
-           /***8-8***/
-    /* Load 16 * 64 from A0 */
+.macro KERNEL8x8x4_END /* Expands KERNEL2x8x4 three times, then KERNEL2x8x4_END once */
+.rept 3
+    KERNEL2x8x4
+.endr
+    KERNEL2x8x4_END /* closing variant: issues no loads for a following invocation */
+.endm
+
+.macro KERNEL2x4x4 /* Two multiply-accumulate steps into D0,D4,D8,D12, alternating operand sets U8/U12-U15 and U0/U4-U7 */
     xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
 
-    /* Cumulative D0~D15 */
     xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-    preld      0,   B0, B_PRE
+    xvfmadd.d  D0,  U8, U12, D0 /* step 1: consumes U8 and U12-U15, which must be loaded on entry */
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
-    preld      0,   A0, A_PRE
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U8, U13, D4
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
-    xvfmadd.d  D10, U2, U4, D10
-    xvfmadd.d  D11, U3, U4, D11
-    preld      0,   A0, A_PRE + 0x40
+    xvldrepl.d U6,  B0, 0x10
+    xvfmadd.d  D8,  U8, U14, D8
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
-    xvfmadd.d  D14, U2, U4, D14
-    xvfmadd.d  D15, U3, U4, D15
+    xvldrepl.d U7,  B0, 0x18
+    xvfmadd.d  D12, U8, U15, D12
 
-    addi.d     A0,  A0, 0x80
+    addi.d     A0,  A0, 0x20 /* A0 advances 0x20 per step in this macro */
     addi.d     B0,  B0, 0x20
 
-    addi.d    TL,  TL, -1 /* TL-- */
-    blt       ZERO,TL, .L_TL1
+    xvld     U8,   A0,    0x00 /* step 2: refill U8 and U12-U15 for the next invocation while consuming U0 and U4-U7 */
 
-   /* Maybe we need calculate the last
-    * 7 sets of D0~D15?
-    */
-.L_L7:
-    /* if (!(L & 7)) goto L_L0 */
-    andi      TL,  L,   7
-    beq       TL,  ZERO,.L_L0
+    xvldrepl.d U12, B0, 0x00
+    xvfmadd.d  D0,  U0, U4, D0
 
-.L_L71:
-    /* Load 16 * 64 from A0 */
+    xvldrepl.d U13, B0, 0x08
+    xvfmadd.d  D4,  U0, U5, D4
+
+    xvldrepl.d U14, B0, 0x10
+    xvfmadd.d  D8,  U0, U6, D8
+
+    xvldrepl.d U15,  B0, 0x18
+    xvfmadd.d  D12, U0, U7, D12
+
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x20
+.endm /* net effect per invocation: A0 += 0x40, B0 += 0x40 */
+
+.macro KERNEL2x4x4_END /* Final variant of KERNEL2x4x4: step 2 loads nothing and A0/B0 advance only once */
     xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
 
-    /* Cumulative D0~D15 */
     xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
+    xvfmadd.d  D0,  U8, U12, D0 /* step 1: consumes U8 and U12-U15, which must be loaded on entry */
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U8, U13, D4
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
-    xvfmadd.d  D10, U2, U4, D10
-    xvfmadd.d  D11, U3, U4, D11
+    xvldrepl.d U6,  B0, 0x10
+    xvfmadd.d  D8,  U8, U14, D8
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
-    xvfmadd.d  D14, U2, U4, D14
-    xvfmadd.d  D15, U3, U4, D15
+    xvldrepl.d U7,  B0, 0x18
+    xvfmadd.d  D12, U8, U15, D12
 
-    /* Add stride for A0, B0 */
-    addi.d     A0,  A0, 0x80
+    addi.d     A0,  A0, 0x20
     addi.d     B0,  B0, 0x20
 
-    addi.d     TL,  TL, -1
-    blt        ZERO,TL, .L_L71
+    xvfmadd.d  D0,  U0, U4, D0 /* step 2 uses only U0 and U4-U7; no further loads are issued */
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D8,  U0, U6, D8
+    xvfmadd.d  D12, U0, U7, D12
+.endm /* net effect per invocation: A0 += 0x20, B0 += 0x20 */
 
-.L_L0:
-#if defined(TRMMKERNEL)
-    xvfmul.d  D0,   D0,  VALPHA
-    xvfmul.d  D1,   D1,  VALPHA
-    xvfmul.d  D2,   D2,  VALPHA
-    xvfmul.d  D3,   D3,  VALPHA
-    xvfmul.d  D4,   D4,  VALPHA
-    xvfmul.d  D5,   D5,  VALPHA
-    xvfmul.d  D6,   D6,  VALPHA
-    xvfmul.d  D7,   D7,  VALPHA
-    xvfmul.d  D8,   D8,  VALPHA
-    xvfmul.d  D9,   D9,  VALPHA
-    xvfmul.d  D10,  D10, VALPHA
-    xvfmul.d  D11,  D11, VALPHA
-    xvfmul.d  D12,  D12, VALPHA
-    xvfmul.d  D13,  D13, VALPHA
-    xvfmul.d  D14,  D14, VALPHA
-    xvfmul.d  D15,  D15, VALPHA
-#else
-    /* Load C0  */
-    xvld      U0,  C0,  0x00
-    xvld      U1,  C0,  0x20
-    xvld      U2,  C0,  0x40
-    xvld      U3,  C0,  0x60
-    xvfmadd.d D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
-    xvfmadd.d D1,  D1,  VALPHA,  U1
-    xvfmadd.d D2,  D2,  VALPHA,  U2
-    xvfmadd.d D3,  D3,  VALPHA,  U3
+.macro KERNEL8x4x4 /* Expands KERNEL2x4x4 four times back to back */
+.rept 4
+    KERNEL2x4x4
+.endr
+.endm
 
-    /* Load C1  */
-    xvld      U0,  C1,  0x00
-    xvld      U1,  C1,  0x20
-    xvld      U2,  C1,  0x40
-    xvld      U3,  C1,  0x60
-    xvfmadd.d D4,  D4,  VALPHA,  U0
-    xvfmadd.d D5,  D5,  VALPHA,  U1
-    xvfmadd.d D6,  D6,  VALPHA,  U2
-    xvfmadd.d D7,  D7,  VALPHA,  U3
+.macro KERNEL8x4x4_END /* Expands KERNEL2x4x4 three times, then KERNEL2x4x4_END once */
+.rept 3
+    KERNEL2x4x4
+.endr
+    KERNEL2x4x4_END /* closing variant: issues no loads for a following invocation */
+.endm
 
-    /* Load C2  */
-    xvld      U0,  C2,  0x00
-    xvld      U1,  C2,  0x20
-    xvld      U2,  C2,  0x40
-    xvld      U3,  C2,  0x60
-    xvfmadd.d D8,  D8,  VALPHA,  U0
-    xvfmadd.d D9,  D9,  VALPHA,  U1
-    xvfmadd.d D10, D10, VALPHA,  U2
-    xvfmadd.d D11, D11, VALPHA,  U3
+.macro KERNEL2x2x4 /* Two multiply-accumulate steps into D0/D1; here the A0 values are loaded with xvldrepl.d and the B0 row with xvld */
+    xvldrepl.d     U0,   A0,    0x00
+    xvldrepl.d     U1,   A0,    0x08
 
-    /* Load C3  */
-    xvld      U0,  C3,  0x00
-    xvld      U1,  C3,  0x20
-    xvld      U2,  C3,  0x40
-    xvld      U3,  C3,  0x60
-    xvfmadd.d D12, D12, VALPHA,  U0
-    xvfmadd.d D13, D13, VALPHA,  U1
-    xvfmadd.d D14, D14, VALPHA,  U2
-    xvfmadd.d D15, D15, VALPHA,  U3
-#endif // #if defined(TRMMKERNEL)
+    xvfmadd.d   D0,     U8,     U12,    D0 /* step 1: consumes U8/U9 and U12, which must be loaded on entry */
+    xvfmadd.d   D1,     U9,     U12,    D1
 
-    /* Store C0 */
-    xvst      D0,  C0,  0x00
-    xvst      D1,  C0,  0x20
-    xvst      D2,  C0,  0x40
-    xvst      D3,  C0,  0x60
-    /* Store C1 */
-    xvst      D4,  C1,  0x00
-    xvst      D5,  C1,  0x20
-    xvst      D6,  C1,  0x40
-    xvst      D7,  C1,  0x60
-    /* Store C2 */
-    xvst      D8,  C2,  0x00
-    xvst      D9,  C2,  0x20
-    xvst      D10, C2,  0x40
-    xvst      D11, C2,  0x60
-    /* Store C3 */
-    xvst      D12, C3,  0x00
-    xvst      D13, C3,  0x20
-    xvst      D14, C3,  0x40
-    xvst      D15, C3,  0x60
+    xvld    U4,  B0, 0x00
+    addi.d  A0,  A0, 0x10
+    addi.d  B0,  B0, 0x20
 
-    /* Add stride for C */
-    addi.d    C0,  C0,  0x80
-    addi.d    C1,  C1,  0x80
-    addi.d    C2,  C2,  0x80
-    addi.d    C3,  C3,  0x80
+    xvldrepl.d     U8,   A0,    0x00 /* step 2: refill U8/U9 and U12 for the next invocation while consuming U0/U1 and U4 */
+    xvldrepl.d     U9,   A0,    0x08
 
-#if defined(TRMMKERNEL)
-#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-    sub.d     L,   K,   OFF
-#ifdef LEFT
-    /* number of values  in A */
-    addi.d    L,   L,   -16
-#else
-    /* number of values in B */
-    addi.d    L,   L,   -4
-#endif
-    slli.d    T0,  L,  0x07
-    add.d     A0,  A0, T0
-    slli.d    T0,  L,  0x05
-    add.d     B0,  B0, T0
-#endif
+    xvfmadd.d   D0,     U0,     U4,    D0
+    xvfmadd.d   D1,     U1,     U4,    D1
 
-#ifdef LEFT
-    addi.d    OFF, OFF, 0x10
-#endif
-#endif   // #if defined(TRMMKERNEL)
+    xvld       U12, B0, 0x00
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x20
+.endm /* net effect per invocation: A0 += 0x20, B0 += 0x40 */
 
-    addi.d    I,   I,   -1  /* I-- */
-    blt       ZERO,I,   .L_I1
+.macro KERNEL2x2x4_END /* Final variant of KERNEL2x2x4: step 2 loads nothing and A0/B0 advance only once */
+    xvldrepl.d     U0,   A0,    0x00
+    xvldrepl.d     U1,   A0,    0x08
 
-.L_M8:
-    /* We have done M & 16, considering M=8/4/2/1 */
-    andi      I,   M,   15
-    beq       ZERO,I,   .L_M0
+    xvfmadd.d   D0,     U8,     U12,    D0 /* step 1: consumes U8/U9 and U12, which must be loaded on entry */
+    xvfmadd.d   D1,     U9,     U12,    D1
 
-    andi      I,   M,   8
-    beq       ZERO,I,   .L_M4
+    xvld    U4,  B0, 0x00
+    addi.d  A0,  A0, 0x10
+    addi.d  B0,  B0, 0x20
 
-#if defined(TRMMKERNEL)
-#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-    move     B0,    B
-#else
-    slli.d   T0,    OFF,  0x06
-    add.d    A0,    A0,   T0
-    slli.d   T0,    OFF,  0x05
-    add.d    B0,    B,    T0
-#endif
+    xvfmadd.d   D0,     U0,     U4,    D0 /* step 2 uses only U0/U1 and U4; no further loads are issued */
+    xvfmadd.d   D1,     U1,     U4,    D1
+.endm /* net effect per invocation: A0 += 0x10, B0 += 0x20 */
 
-#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-    sub.d    L,     K,    OFF
-#elif defined(LEFT)
-    /* number of values in A */
-    addi.d   L,     OFF,  8
-#else
-    /* number of values in B */
-    addi.d   L,     OFF,  4
-#endif
-#else   // #if !defined(TRMMKERNEL)
-    move     B0,    B
-    move     L,     K /* L = bk */
-#endif  // #if defined(TRMMKERNEL)
+.macro KERNEL8x2x4 /* Expands KERNEL2x2x4 four times back to back */
+.rept 4
+    KERNEL2x2x4
+.endr
+.endm
 
-    /* Load 8 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
+.macro KERNEL8x2x4_END /* Expands KERNEL2x2x4 three times, then KERNEL2x2x4_END once */
+.rept 3
+    KERNEL2x2x4
+.endr
+    KERNEL2x2x4_END /* closing variant: issues no loads for a following invocation */
+.endm
 
-    xvldrepl.d     U4, B0, 0x00
-    /* line 1 */
-    xvfmul.d  D0,  U0, U4
-    xvfmul.d  D1,  U1, U4
+.macro KERNEL2x1x4 /* Two multiply-accumulate steps into D0 only; one A0 value is loaded with xvldrepl.d and the B0 row with xvld */
+    xvldrepl.d  U0,     A0,     0x00
+    xvfmadd.d   D0,     U8,     U12,    D0 /* step 1: consumes U8 and U12, which must be loaded on entry */
+    xvld        U4,     B0,     0x00
 
-    xvldrepl.d     U4, B0, 0x08
-    /* line 2 */
-    xvfmul.d  D4,  U0, U4
-    xvfmul.d  D5,  U1, U4
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x20
 
-    xvldrepl.d     U4, B0, 0x10
-    /* line 3 */
-    xvfmul.d  D8,  U0, U4
-    xvfmul.d  D9,  U1, U4
+    xvldrepl.d  U8,     A0,     0x00 /* step 2: refill U8 and U12 for the next invocation while consuming U0 and U4 */
+    xvfmadd.d   D0,     U0,     U4,     D0
+    xvld        U12,    B0,     0x00
 
-    xvldrepl.d     U4, B0, 0x18
-    /* line 4 */
-    xvfmul.d  D12, U0, U4
-    xvfmul.d  D13, U1, U4
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x20
+.endm /* net effect per invocation: A0 += 0x10, B0 += 0x40 */
 
-    /* Add stride for A0 and B0 */
-    addi.d    A0,  A0, 0x40
-    addi.d    B0,  B0, 0x20
-    /* Reduce L */
-    addi.d    L,   L,  -1
-    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
-    /* if (TL < 1) goto L_M8_L7 */
-    beq       ZERO,TL, .L_M8_L7
+.macro KERNEL2x1x4_END /* Final variant of KERNEL2x1x4: step 2 loads nothing and A0/B0 advance only once */
+    xvldrepl.d  U0,     A0,     0x00
+    xvfmadd.d   D0,     U8,     U12,    D0 /* step 1: consumes U8 and U12, which must be loaded on entry */
+    xvld        U4,     B0,     0x00
 
-.L_M8_TL1: /* TL-- */
-           /***8-1***/
-    /* Load 16 * 64 from A0 */
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x20
+
+    xvfmadd.d   D0,     U0,     U4,     D0 /* step 2 uses only U0 and U4; no further loads are issued */
+.endm
+
+.macro KERNEL8x1x4 /* Expands KERNEL2x1x4 four times back to back */
+.rept 4
+    KERNEL2x1x4
+.endr
+.endm
+
+.macro KERNEL8x1x4_END /* Expands KERNEL2x1x4 three times, then KERNEL2x1x4_END once */
+.rept 3
+    KERNEL2x1x4
+.endr
+    KERNEL2x1x4_END /* closing variant: issues no loads for a following invocation */
+.endm
+
+.macro KERNEL2x16x2 /* Two multiply-accumulate steps into D0-D7, alternating operand sets U8-U13 and U0-U5 */
     xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0 /* step 1: consumes U8-U11 and U12/U13, which must be loaded on entry */
+    xvfmadd.d  D1,  U9, U12, D1
+
     xvld     U1,   A0,    0x20
+    xvfmadd.d  D2,  U10, U12, D2
+    xvfmadd.d  D3,  U11, U12, D3
+
+    xvld     U2,   A0,    0x40
+    xvfmadd.d  D4,  U8, U13, D4
+    xvfmadd.d  D5,  U9, U13, D5
+
+    xvld     U3,   A0,    0x60
+    xvfmadd.d  D6,  U10, U13, D6
+    xvfmadd.d  D7,  U11, U13, D7
 
     xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
+
+    addi.d     A0,  A0, 0x80
+    addi.d     B0,  B0, 0x10 /* B0 advances 0x10 per step in this macro */
+
+    xvld     U8,   A0,    0x00 /* step 2: refill U8-U11 and U12/U13 for the next invocation while consuming U0-U3 and U4/U5 */
     xvfmadd.d  D0,  U0, U4, D0
     xvfmadd.d  D1,  U1, U4, D1
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
+    xvld     U9,   A0,    0x20
+    xvfmadd.d  D2,  U2, U4, D2
+    xvfmadd.d  D3,  U3, U4, D3
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
+    xvld     U10,   A0,    0x40
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
+    xvld     U11,   A0,    0x60
+    xvfmadd.d  D6,  U2, U5, D6
+    xvfmadd.d  D7,  U3, U5, D7
 
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x20
+    xvldrepl.d U12,  B0, 0x00
+    xvldrepl.d U13,  B0, 0x08
+
+    addi.d     A0,  A0, 0x80
+    addi.d     B0,  B0, 0x10
+.endm /* net effect per invocation: A0 += 0x100, B0 += 0x20 */
 
-           /***8-2***/
+.macro KERNEL2x16x2_END /* Final variant of KERNEL2x16x2: step 2 loads nothing and A0/B0 advance only once */
     xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0 /* step 1: consumes U8-U11 and U12/U13, which must be loaded on entry */
+    xvfmadd.d  D1,  U9, U12, D1
+
     xvld     U1,   A0,    0x20
+    xvfmadd.d  D2,  U10, U12, D2
+    xvfmadd.d  D3,  U11, U12, D3
+
+    xvld     U2,   A0,    0x40
+    xvfmadd.d  D4,  U8, U13, D4
+    xvfmadd.d  D5,  U9, U13, D5
+
+    xvld     U3,   A0,    0x60
+    xvfmadd.d  D6,  U10, U13, D6
+    xvfmadd.d  D7,  U11, U13, D7
 
     xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
+
+    addi.d     A0,  A0, 0x80
+    addi.d     B0,  B0, 0x10
+
     xvfmadd.d  D0,  U0, U4, D0
     xvfmadd.d  D1,  U1, U4, D1
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
+    xvfmadd.d  D2,  U2, U4, D2 /* step 2 uses only U0-U3 and U4/U5; no further loads are issued */
+    xvfmadd.d  D3,  U3, U4, D3
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
+    xvfmadd.d  D6,  U2, U5, D6
+    xvfmadd.d  D7,  U3, U5, D7
+.endm /* net effect per invocation: A0 += 0x80, B0 += 0x10 */
 
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x20
+.macro KERNEL8x16x2 /* Expands KERNEL2x16x2 four times back to back */
+.rept 4
+    KERNEL2x16x2
+.endr
+.endm
+
+.macro KERNEL8x16x2_END /* Expands KERNEL2x16x2 three times, then KERNEL2x16x2_END once */
+.rept 3
+    KERNEL2x16x2
+.endr
+    KERNEL2x16x2_END /* closing variant: issues no loads for a following invocation */
+.endm
 
-           /***8-3***/
+.macro KERNEL2x8x2 /* Two multiply-accumulate steps into D0,D1,D4,D5, alternating operand sets U8/U9/U12/U13 and U0/U1/U4/U5 */
     xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0 /* step 1: consumes U8/U9 and U12/U13, which must be loaded on entry */
+    xvfmadd.d  D1,  U9, U12, D1
+
     xvld     U1,   A0,    0x20
+    xvfmadd.d  D4,  U8, U13, D4
+    xvfmadd.d  D5,  U9, U13, D5
 
     xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
+
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x10
+
+    xvld     U8,   A0,    0x00 /* step 2: refill U8/U9 and U12/U13 for the next invocation while consuming U0/U1 and U4/U5 */
     xvfmadd.d  D0,  U0, U4, D0
     xvfmadd.d  D1,  U1, U4, D1
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
+    xvld     U9,   A0,    0x20
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
+    xvldrepl.d U12,  B0, 0x00
+    xvldrepl.d U13,  B0, 0x08
 
     addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x20
+    addi.d     B0,  B0, 0x10
+.endm /* net effect per invocation: A0 += 0x80, B0 += 0x20 */
 
-           /***8-4***/
+.macro KERNEL2x8x2_END /* Final variant of KERNEL2x8x2: step 2 loads nothing and A0/B0 advance only once */
     xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0 /* step 1: consumes U8/U9 and U12/U13, which must be loaded on entry */
+    xvfmadd.d  D1,  U9, U12, D1
+
     xvld     U1,   A0,    0x20
+    xvfmadd.d  D4,  U8, U13, D4
+    xvfmadd.d  D5,  U9, U13, D5
 
     xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
+
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x10
+
     xvfmadd.d  D0,  U0, U4, D0
     xvfmadd.d  D1,  U1, U4, D1
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
+    xvfmadd.d  D4,  U0, U5, D4 /* step 2 uses only U0/U1 and U4/U5; no further loads are issued */
+    xvfmadd.d  D5,  U1, U5, D5
+.endm /* net effect per invocation: A0 += 0x40, B0 += 0x10 */
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
+.macro KERNEL8x8x2 /* Expands KERNEL2x8x2 four times back to back */
+.rept 4
+    KERNEL2x8x2
+.endr
+.endm
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
+.macro KERNEL8x8x2_END /* Expands KERNEL2x8x2 three times, then KERNEL2x8x2_END once */
+.rept 3
+    KERNEL2x8x2
+.endr
+    KERNEL2x8x2_END /* closing variant: issues no loads for a following invocation */
+.endm
 
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x20
-
-           /***8-5***/
+.macro KERNEL2x4x2 /* Two multiply-accumulate steps into D0 and D4, alternating operand sets U8/U12/U13 and U0/U4/U5 */
     xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
+    xvfmadd.d  D0,  U8, U12, D0 /* step 1: consumes U8 and U12/U13, which must be loaded on entry */
+    xvfmadd.d  D4,  U8, U13, D4
 
     xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
+    xvldrepl.d U5,  B0, 0x08
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x10
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
+    xvld     U8,   A0,    0x00 /* step 2: refill U8 and U12/U13 for the next invocation while consuming U0 and U4/U5 */
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D4,  U0, U5, D4
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
+    xvldrepl.d U12,  B0, 0x00
+    xvldrepl.d U13,  B0, 0x08
 
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x20
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x10
+.endm /* net effect per invocation: A0 += 0x40, B0 += 0x20 */
 
-           /***8-6***/
+.macro KERNEL2x4x2_END /* Final variant of KERNEL2x4x2: step 2 loads nothing and A0/B0 advance only once */
     xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
+    xvfmadd.d  D0,  U8, U12, D0 /* step 1: consumes U8 and U12/U13, which must be loaded on entry */
+    xvfmadd.d  D4,  U8, U13, D4
 
     xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
+    xvldrepl.d U5,  B0, 0x08
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x10
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
+    xvfmadd.d  D0,  U0, U4, D0 /* step 2 uses only U0 and U4/U5; no further loads are issued */
+    xvfmadd.d  D4,  U0, U5, D4
+.endm /* net effect per invocation: A0 += 0x20, B0 += 0x10 */
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
+.macro KERNEL8x4x2 /* unrolled body: 4 x KERNEL2x4x2, each reloading U8/U12/U13 for the next */
+.rept 4
+    KERNEL2x4x2
+.endr
+.endm
 
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x20
+.macro KERNEL8x4x2_END /* 3 x KERNEL2x4x2, then the non-reloading tail */
+.rept 3
+    KERNEL2x4x2
+.endr
+    KERNEL2x4x2_END /* tail: does not load U8/U12/U13 again */
+.endm
 
-           /***8-7***/
+.macro KERNEL2x2x2 /* two pipelined steps; register sets U0/U4/U5 and U8/U12/U13 alternate */
     xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
+    xvfmadd.d  D0,  U8, U12, D0 /* D0 += U8 * U12; U8/U12/U13 must already be loaded */
+    xvfmadd.d  D4,  U8, U13, D4
 
     xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
+
+    addi.d     A0,  A0, 0x10 /* A0 += 16 bytes (2 doubles); NOTE(review): xvld still reads a full vector - confirm upper lanes are unused */
+    addi.d     B0,  B0, 0x10 /* B0 += 16 bytes (2 doubles) */
+
+    xvld     U8,   A0,    0x00 /* reload U8 for the step that follows */
     xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
+    xvfmadd.d  D4,  U0, U5, D4
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
+    xvldrepl.d U12,  B0, 0x00 /* reload U12/U13 for the step that follows */
+    xvldrepl.d U13,  B0, 0x08
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x10
+.endm
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
+.macro KERNEL2x2x2_END /* like KERNEL2x2x2 but without reloading U8/U12/U13 */
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0 /* D0 += U8 * U12; U8/U12/U13 must already be loaded */
+    xvfmadd.d  D4,  U8, U13, D4
 
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x20
+    xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
+
+    addi.d     A0,  A0, 0x10 /* A0 += 16 bytes (2 doubles) */
+    addi.d     B0,  B0, 0x10 /* B0 += 16 bytes (2 doubles) */
+
+    xvfmadd.d  D0,  U0, U4, D0 /* last accumulation; A0/B0 are advanced only once here */
+    xvfmadd.d  D4,  U0, U5, D4
+.endm
 
-           /***8-8***/
+.macro KERNEL8x2x2 /* unrolled body: 4 x KERNEL2x2x2, each reloading U8/U12/U13 for the next */
+.rept 4
+    KERNEL2x2x2
+.endr
+.endm
+
+.macro KERNEL8x2x2_END /* 3 x KERNEL2x2x2, then the non-reloading tail */
+.rept 3
+    KERNEL2x2x2
+.endr
+    KERNEL2x2x2_END /* tail: does not load U8/U12/U13 again */
+.endm
+
+.macro KERNEL2x1x2 /* two pipelined steps; register sets U0/U4/U5 and U8/U12/U13 alternate */
     xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
+    xvfmadd.d  D0,  U8, U12, D0 /* D0 += U8 * U12; U8/U12/U13 must already be loaded */
+    xvfmadd.d  D4,  U8, U13, D4
 
     xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
+
+    addi.d     A0,  A0, 0x08 /* A0 += 8 bytes (1 double); NOTE(review): xvld still reads a full vector - confirm upper lanes are unused */
+    addi.d     B0,  B0, 0x10 /* B0 += 16 bytes (2 doubles) */
+
+    xvld     U8,   A0,    0x00 /* reload U8 for the step that follows */
     xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
+    xvfmadd.d  D4,  U0, U5, D4
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
+    xvldrepl.d U12,  B0, 0x00 /* reload U12/U13 for the step that follows */
+    xvldrepl.d U13,  B0, 0x08
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x10
+.endm
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
+.macro KERNEL2x1x2_END /* like KERNEL2x1x2 but without reloading U8/U12/U13 */
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0 /* D0 += U8 * U12; U8/U12/U13 must already be loaded */
+    xvfmadd.d  D4,  U8, U13, D4
 
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x20
+    xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
 
-    addi.d    TL,  TL, -1 /* TL-- */
-    blt       ZERO,TL, .L_M8_TL1
+    addi.d     A0,  A0, 0x08 /* A0 += 8 bytes (1 double) */
+    addi.d     B0,  B0, 0x10 /* B0 += 16 bytes (2 doubles) */
 
-.L_M8_L7:
-    /* if (!(L & 7)) goto L_M8_L0 */
-    andi      TL,  L,   7
-    beq       TL,  ZERO,.L_M8_L0
+    xvfmadd.d  D0,  U0, U4, D0 /* last accumulation; A0/B0 are advanced only once here */
+    xvfmadd.d  D4,  U0, U5, D4
+.endm
 
-.L_M8_L71:
+.macro KERNEL8x1x2 /* unrolled body: 4 x KERNEL2x1x2, each reloading U8/U12/U13 for the next */
+.rept 4
+    KERNEL2x1x2
+.endr
+.endm
+
+.macro KERNEL8x1x2_END /* 3 x KERNEL2x1x2, then the non-reloading tail */
+.rept 3
+    KERNEL2x1x2
+.endr
+    KERNEL2x1x2_END /* tail: does not load U8/U12/U13 again */
+.endm
+
+.macro KERNEL2x16x1 /* two pipelined steps; register sets U0-U3/U4 and U8-U11/U12 alternate */
     xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0 /* D0 += U8 * U12; U8-U12 must already be loaded */
+    xvfmadd.d  D1,  U9, U12, D1
+
     xvld     U1,   A0,    0x20
+    xvfmadd.d  D2,  U10, U12, D2
+    xvfmadd.d  D3,  U11, U12, D3
+
+    xvld     U2,   A0,    0x40
+    xvld     U3,   A0,    0x60
 
     xvldrepl.d U4,  B0, 0x00
+
+    addi.d     A0,  A0, 0x80 /* A0 += 128 bytes (16 doubles) */
+    addi.d     B0,  B0, 0x08 /* B0 += 8 bytes (1 double) */
+
+    xvld     U8,   A0,    0x00 /* start reloading U8-U11 for the step that follows */
     xvfmadd.d  D0,  U0, U4, D0
     xvfmadd.d  D1,  U1, U4, D1
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
+    xvld     U9,   A0,    0x20
+    xvfmadd.d  D2,  U2, U4, D2
+    xvfmadd.d  D3,  U3, U4, D3
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-    xvfmadd.d  D9,  U1, U4, D9
+    xvld     U10,   A0,    0x40
+    xvld     U11,   A0,    0x60
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-    xvfmadd.d  D13, U1, U4, D13
+    xvldrepl.d U12,  B0, 0x00 /* reload U12 for the step that follows */
 
-    /* Add stride for A0, B0 */
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x20
+    addi.d     A0,  A0, 0x80
+    addi.d     B0,  B0, 0x08
+.endm
 
-    addi.d     TL,  TL, -1
-    blt        ZERO,TL, .L_M8_L71
+.macro KERNEL2x16x1_END /* like KERNEL2x16x1 but without reloading U8-U12 */
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0 /* D0 += U8 * U12; U8-U12 must already be loaded */
+    xvfmadd.d  D1,  U9, U12, D1
 
-.L_M8_L0:
-#if defined(TRMMKERNEL)
-    xvfmul.d  D0,   D0,  VALPHA
-    xvfmul.d  D1,   D1,  VALPHA
-    xvfmul.d  D4,   D4,  VALPHA
-    xvfmul.d  D5,   D5,  VALPHA
-    xvfmul.d  D8,   D8,  VALPHA
-    xvfmul.d  D9,   D9,  VALPHA
-    xvfmul.d  D12,  D12, VALPHA
-    xvfmul.d  D13,  D13, VALPHA
-#else
-    /* Load C0  */
-    xvld      U0,  C0,  0x00
-    xvld      U1,  C0,  0x20
-    xvfmadd.d D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
-    xvfmadd.d D1,  D1,  VALPHA,  U1
+    xvld     U1,   A0,    0x20
+    xvfmadd.d  D2,  U10, U12, D2
+    xvfmadd.d  D3,  U11, U12, D3
 
-    /* Load C1  */
-    xvld      U0,  C1,  0x00
-    xvld      U1,  C1,  0x20
-    xvfmadd.d D4,  D4,  VALPHA,  U0
-    xvfmadd.d D5,  D5,  VALPHA,  U1
+    xvld     U2,   A0,    0x40
+    xvld     U3,   A0,    0x60
 
-    /* Load C2  */
-    xvld      U0,  C2,  0x00
-    xvld      U1,  C2,  0x20
-    xvfmadd.d D8,  D8,  VALPHA,  U0
-    xvfmadd.d D9,  D9,  VALPHA,  U1
+    xvldrepl.d U4,  B0, 0x00
 
-    /* Load C3  */
-    xvld      U0,  C3,  0x00
-    xvld      U1,  C3,  0x20
-    xvfmadd.d D12, D12, VALPHA,  U0
-    xvfmadd.d D13, D13, VALPHA,  U1
-#endif   // #if defined(TRMMKERNEL)
+    addi.d     A0,  A0, 0x80 /* A0 += 128 bytes (16 doubles) */
+    addi.d     B0,  B0, 0x08 /* B0 += 8 bytes (1 double) */
 
-    /* Store C0 */
-    xvst      D0,  C0,  0x00
-    xvst      D1,  C0,  0x20
-    /* Store C1 */
-    xvst      D4,  C1,  0x00
-    xvst      D5,  C1,  0x20
-    /* Store C2 */
-    xvst      D8,  C2,  0x00
-    xvst      D9,  C2,  0x20
-    /* Store C3 */
-    xvst      D12, C3,  0x00
-    xvst      D13, C3,  0x20
+    xvfmadd.d  D0,  U0, U4, D0 /* last accumulation; A0/B0 are advanced only once here */
+    xvfmadd.d  D1,  U1, U4, D1
 
-    /* Add stride for C */
-    addi.d    C0,  C0,  0x40
-    addi.d    C1,  C1,  0x40
-    addi.d    C2,  C2,  0x40
-    addi.d    C3,  C3,  0x40
+    xvfmadd.d  D2,  U2, U4, D2
+    xvfmadd.d  D3,  U3, U4, D3
+.endm
 
-#if defined(TRMMKERNEL)
-#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-    sub.d     L,   K,   OFF
-#ifdef LEFT
-    /* number of values in A */
-    addi.d    L,   L,   -8
-#else
-    /* number of values in B */
-    addi.d    L,   L,   -4
-#endif
-    slli.d    T0,  L,  0x06
-    add.d     A0,  A0, T0
-    slli.d    T0,  L,  0x05
-    add.d     B0,  B0, T0
-#endif
+.macro KERNEL8x16x1 /* unrolled body: 4 x KERNEL2x16x1, each reloading U8-U12 for the next */
+.rept 4
+    KERNEL2x16x1
+.endr
+.endm
 
-#ifdef LEFT
-    /* number of values in A */
-    addi.d    OFF,   OFF,  0x08
-#endif
-#endif   // #if defined(TRMMKERNEL)
+.macro KERNEL8x16x1_END /* 3 x KERNEL2x16x1, then the non-reloading tail */
+.rept 3
+    KERNEL2x16x1
+.endr
+    KERNEL2x16x1_END /* tail: does not load U8-U12 again */
+.endm
 
-/********LOOP (if(N >> 2 ) && (M & 8)) End************/
+.macro KERNEL2x8x1 /* two pipelined steps; register sets U0/U1/U4 and U8/U9/U12 alternate */
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0 /* D0 += U8 * U12; U8/U9/U12 must already be loaded */
+    xvfmadd.d  D1,  U9, U12, D1
+    xvld     U1,   A0,    0x20
+    xvldrepl.d U4,  B0, 0x00
 
-.L_M4:
-    andi      I,   M,   4
-    beq       ZERO,I,   .L_M2
+    addi.d     A0,  A0, 0x40 /* A0 += 64 bytes (8 doubles) */
+    addi.d     B0,  B0, 0x08 /* B0 += 8 bytes (1 double) */
 
-#if defined(TRMMKERNEL)
-#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-    move     B0,    B
-#else
-    slli.d   T0,    OFF,  0x05
-    add.d    A0,    A0,   T0
-    add.d    B0,    B,    T0
-#endif
+    xvld     U8,   A0,    0x00 /* reload U8/U9 for the step that follows */
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+    xvld     U9,   A0,    0x20
+    xvldrepl.d U12,  B0, 0x00 /* reload U12 for the step that follows */
 
-#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-    sub.d    L,     K,    OFF
-#elif defined(LEFT)
-    /* number of values in A */
-    addi.d   L,     OFF,  4
-#else
-    /* number of values in B */
-    addi.d   L,     OFF,  4
-#endif
-#else   // #if !defined(TRMMKERNEL)
-    move     B0,    B
-    move     L,     K /* L = bk */
-#endif
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x08
+.endm
 
-    /* Load 4 * 64 from A0 */
+.macro KERNEL2x8x1_END /* like KERNEL2x8x1 but without reloading U8/U9/U12 */
     xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0 /* D0 += U8 * U12; U8/U9/U12 must already be loaded */
+    xvfmadd.d  D1,  U9, U12, D1
+    xvld     U1,   A0,    0x20
+    xvldrepl.d U4,  B0, 0x00
 
-    xvldrepl.d     U4, B0, 0x00
-    /* line 1 */
-    xvfmul.d  D0,  U0, U4
-
-    xvldrepl.d     U4, B0, 0x08
-    /* line 2 */
-    xvfmul.d  D4,  U0, U4
+    addi.d     A0,  A0, 0x40 /* A0 += 64 bytes (8 doubles) */
+    addi.d     B0,  B0, 0x08 /* B0 += 8 bytes (1 double) */
 
-    xvldrepl.d     U4, B0, 0x10
-    /* line 3 */
-    xvfmul.d  D8,  U0, U4
+    xvfmadd.d  D0,  U0, U4, D0 /* last accumulation; A0/B0 are advanced only once here */
+    xvfmadd.d  D1,  U1, U4, D1
+.endm
 
-    xvldrepl.d     U4, B0, 0x18
-    /* line 4 */
-    xvfmul.d  D12, U0, U4
+.macro KERNEL8x8x1 /* unrolled body: 4 x KERNEL2x8x1, each reloading U8/U9/U12 for the next */
+.rept 4
+    KERNEL2x8x1
+.endr
+.endm
 
-    /* Add stride for A0 and B0 */
-    addi.d    A0,  A0, 0x20
-    addi.d    B0,  B0, 0x20
-    /* Reduce L */
-    addi.d    L,   L,  -1
-    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
-    /* if (TL < 1) goto L_M4_L7 */
-    beq       ZERO,TL, .L_M4_L7
+.macro KERNEL8x8x1_END /* 3 x KERNEL2x8x1, then the non-reloading tail */
+.rept 3
+    KERNEL2x8x1
+.endr
+    KERNEL2x8x1_END /* tail: does not load U8/U9/U12 again */
+.endm
 
-.L_M4_TL1: /* TL-- */
-           /***8-1***/
+.macro KERNEL2x4x1 /* two pipelined steps; register pairs U0/U4 and U8/U12 alternate */
     xvld     U0,   A0,    0x00
-
+    xvfmadd.d  D0,  U8, U12, D0 /* D0 += U8 * U12; U8/U12 must already be loaded */
     xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+    addi.d     A0,  A0, 0x20 /* A0 += 32 bytes (4 doubles) */
+    addi.d     B0,  B0, 0x08 /* B0 += 8 bytes (1 double) */
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+    xvld     U8,   A0,    0x00 /* reload U8 for the step that follows */
+    xvfmadd.d  D0,  U0, U4, D0
+    xvldrepl.d U12,  B0, 0x00 /* reload U12 for the step that follows */
 
     addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x20
+    addi.d     B0,  B0, 0x08
+.endm
 
-           /***8-2***/
+.macro KERNEL2x4x1_END /* like KERNEL2x4x1 but without reloading U8/U12 */
     xvld     U0,   A0,    0x00
-
+    xvfmadd.d  D0,  U8, U12, D0 /* D0 += U8 * U12; U8/U12 must already be loaded */
     xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    addi.d     A0,  A0, 0x20 /* A0 += 32 bytes (4 doubles) */
+    addi.d     B0,  B0, 0x08 /* B0 += 8 bytes (1 double) */
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+    xvfmadd.d  D0,  U0, U4, D0 /* last accumulation; A0/B0 are advanced only once here */
+.endm
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+.macro KERNEL8x4x1 /* unrolled body: 4 x KERNEL2x4x1, each reloading U8/U12 for the next */
+.rept 4
+    KERNEL2x4x1
+.endr
+.endm
 
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x20
+.macro KERNEL8x4x1_END /* 3 x KERNEL2x4x1, then the non-reloading tail */
+.rept 3
+    KERNEL2x4x1
+.endr
+    KERNEL2x4x1_END /* tail: does not load U8/U12 again */
+.endm
 
-           /***8-3***/
+.macro KERNEL2x2x1 /* two pipelined steps; register pairs U0/U4 and U8/U12 alternate */
     xvld     U0,   A0,    0x00
-
+    xvfmadd.d  D0,  U8, U12, D0 /* D0 += U8 * U12; U8/U12 must already be loaded */
     xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+    addi.d     A0,  A0, 0x10 /* A0 += 16 bytes (2 doubles); NOTE(review): xvld still reads a full vector - confirm upper lanes are unused */
+    addi.d     B0,  B0, 0x08 /* B0 += 8 bytes (1 double) */
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+    xvld     U8,   A0,    0x00 /* reload U8 for the step that follows */
+    xvfmadd.d  D0,  U0, U4, D0
+    xvldrepl.d U12,  B0, 0x00 /* reload U12 for the step that follows */
 
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x20
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x08
+.endm
 
-           /***8-4***/
+.macro KERNEL2x2x1_END /* like KERNEL2x2x1 but without reloading U8/U12 */
     xvld     U0,   A0,    0x00
-
+    xvfmadd.d  D0,  U8, U12, D0 /* D0 += U8 * U12; U8/U12 must already be loaded */
     xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    addi.d     A0,  A0, 0x10 /* A0 += 16 bytes (2 doubles) */
+    addi.d     B0,  B0, 0x08 /* B0 += 8 bytes (1 double) */
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+    xvfmadd.d  D0,  U0, U4, D0 /* last accumulation; A0/B0 are advanced only once here */
+.endm
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+.macro KERNEL8x2x1 /* unrolled body: 4 x KERNEL2x2x1, each reloading U8/U12 for the next */
+.rept 4
+    KERNEL2x2x1
+.endr
+.endm
 
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x20
+.macro KERNEL8x2x1_END /* 3 x KERNEL2x2x1, then the non-reloading tail */
+.rept 3
+    KERNEL2x2x1
+.endr
+    KERNEL2x2x1_END /* tail: does not load U8/U12 again */
+.endm
 
-           /***8-5***/
+.macro KERNEL2x1x1 /* two pipelined steps; register pairs U0/U4 and U8/U12 alternate */
     xvld     U0,   A0,    0x00
-
+    xvfmadd.d  D0,  U8, U12, D0 /* D0 += U8 * U12; U8/U12 must already be loaded */
     xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+    addi.d     A0,  A0, 0x08 /* A0 += 8 bytes (1 double); NOTE(review): xvld still reads a full vector - confirm upper lanes are unused */
+    addi.d     B0,  B0, 0x08 /* B0 += 8 bytes (1 double) */
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+    xvld     U8,   A0,    0x00 /* reload U8 for the step that follows */
+    xvfmadd.d  D0,  U0, U4, D0
+    xvldrepl.d U12,  B0, 0x00 /* reload U12 for the step that follows */
 
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x20
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x08
+.endm
 
-           /***8-6***/
+.macro KERNEL2x1x1_END /* like KERNEL2x1x1 but without reloading U8/U12 */
     xvld     U0,   A0,    0x00
-
+    xvfmadd.d  D0,  U8, U12, D0 /* D0 += U8 * U12; U8/U12 must already be loaded */
     xvldrepl.d U4,  B0, 0x00
+
+    addi.d     A0,  A0, 0x08 /* A0 += 8 bytes (1 double) */
+    addi.d     B0,  B0, 0x08 /* B0 += 8 bytes (1 double) */
+
     xvfmadd.d  D0,  U0, U4, D0
+.endm
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+.macro KERNEL8x1x1 /* unrolled body: 4 x KERNEL2x1x1, each reloading U8/U12 for the next */
+.rept 4
+    KERNEL2x1x1
+.endr
+.endm
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+.macro KERNEL8x1x1_END /* 3 x KERNEL2x1x1, then the non-reloading tail */
+.rept 3
+    KERNEL2x1x1
+.endr
+    KERNEL2x1x1_END /* tail: does not load U8/U12 again */
+.endm
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
 
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x20
+    PROLOGUE
 
-           /***8-7***/
-    xvld     U0,   A0,    0x00
+    addi.d   $sp,   $sp,   -120
+    /* Store regs */
+    SDARG    $r23,  $sp,   0
+    SDARG    $r24,  $sp,   8
+    SDARG    $r25,  $sp,   16
+    SDARG    $r26,  $sp,   24
+    SDARG    $r27,  $sp,   32
+    ST       $f23,  $sp,   40
+    ST       $f24,  $sp,   48
+    ST       $f25,  $sp,   56
+    ST       $f26,  $sp,   64
+    ST       $f27,  $sp,   72
+    ST       $f28,  $sp,   80
+    ST       $f29,  $sp,   88
+    ST       $f30,  $sp,   96
+    ST       $f31,  $sp,   104
+    ST       ALPHA, $sp,   112
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+#if defined (TRMMKERNEL) && !defined(LEFT)
+    sub.d   OFF,   ZERO,  OFFSET
+#else
+    xor     OFF,   OFF,   OFF
+#endif
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    /* if (!(N >> 2)) goto L_N3 */
+    srai.d   J,     N,     2     /* J = bn >> 2 */
+    andi     N,     N,     0x03
+    xvldrepl.d  VALPHA, $sp, 112 /* When N < 4, VALPHA will not be changed */
+    beq      ZERO,  J,     .L_N3
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+.L_J1: /* J-- && This loop includes Condition 1 */
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+/************************* Condition 1 if((N >> 2) && (M >> 4)) START !!! *************************
+*                                                   dgemm_core_16x4                                */
+    move     C0,    C
+    move     A0,    A
+    slli.d   T0,    LDC,   3
+    add.d    C1,    C0,    T0
+    addi.d   J,     J,     -1   /* J-- */
+    add.d    C2,    C1,    T0
+    add.d    C3,    C2,    T0
 
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x20
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move     OFF,   OFFSET
+#endif
+
+    /* if (!(M >> 4)) goto L_M8 */
+    srai.d   I,     M,     4     /* I = bm >> 4 */
+    beq      ZERO,  I,     .L_M8
 
-           /***8-8***/
+.L_I1: /* I-- */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    slli.d   T0,    OFF,  0x07
+    add.d    A0,    A0,   T0
+    slli.d   T0,    OFF,  0x05
+    add.d    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d   L,     OFF,  16
+#else
+    /* number of values in B */
+    addi.d   L,     OFF,  4
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    /* Calculate the first set of D0~D15,
+     * avoiding a set-to-zero operation
+     * Load 16 * 64 from A0
+     * U0 = {a3,  a2,  a1,  a0}
+     * U1 = {a7,  a6,  a5,  a4}
+     * U2 = {a11, a10, a9,  a8}
+     * U3 = {a15, a14, a13, a12}
+     */
     xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
+    xvld     U2,   A0,    0x40
+    xvld     U3,   A0,    0x60
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+    xvldrepl.d     U4, B0, 0x00
+    preld          0,  C0, 0x00
+    /* line 1 */
+    xvfmul.d  D0,  U0, U4
+    xvfmul.d  D1,  U1, U4
+    preld     0,   C0, 0x40
+    xvfmul.d  D2,  U2, U4
+    xvfmul.d  D3,  U3, U4
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    xvldrepl.d     U5, B0, 0x08
+    preld          0,   C1,    0x00
+    /* line 2 */
+    xvfmul.d  D4,  U0, U5
+    xvfmul.d  D5,  U1, U5
+    preld     0,   C1,    0x40
+    xvfmul.d  D6,  U2, U5
+    xvfmul.d  D7,  U3, U5
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+    xvldrepl.d     U6, B0, 0x10
+    preld          0,   C2,    0x00
+    /* line 3 */
+    xvfmul.d  D8,  U0, U6
+    xvfmul.d  D9,  U1, U6
+    preld     0,   C2,    0x40
+    xvfmul.d  D10, U2, U6
+    xvfmul.d  D11, U3, U6
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+    xvldrepl.d     U7, B0, 0x18
+    preld          0,   C3,    0x00
+    /* line 4 */
+    xvfmul.d  D12, U0, U7
+    xvfmul.d  D13, U1, U7
+    preld     0,   C3,    0x40
+    xvfmul.d  D14, U2, U7
+    xvfmul.d  D15, U3, U7
 
-    addi.d     A0,  A0, 0x20
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x80
+    addi.d    B0,  B0, 0x20
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_L7 */
+    beq       ZERO,TL, .L_L7
+
+    xvld     U8,   A0,    0x00
+    xvld     U9,   A0,    0x20
+    xvld     U10,  A0,    0x40
+    xvld     U11,  A0,    0x60
+
+    addi.d    TL,  TL,  -1
+
+    xvldrepl.d  U12,   B0,  0x00
+    xvldrepl.d  U13,   B0,  0x08
+    xvldrepl.d  U14,   B0,  0x10
+    xvldrepl.d  U15,   B0,  0x18
+    addi.d     A0,  A0, 0x80
     addi.d     B0,  B0, 0x20
 
+    beq    ZERO,    TL,  .L_TL1_END
+.L_TL1: /* TL-- */
+    KERNEL8x16x4
     addi.d    TL,  TL, -1 /* TL-- */
-    blt       ZERO,TL, .L_M4_TL1
+    blt       ZERO,TL, .L_TL1
 
-.L_M4_L7:
-    /* if (!(L & 7)) goto L_M4_L0 */
+.L_TL1_END:
+    KERNEL8x16x4_END
+
+   /* We may still need to calculate up to
+    * 7 remaining sets of D0~D15.
+    */
+.L_L7:
+    /* if (!(L & 7)) goto L_L0 */
     andi      TL,  L,   7
-    beq       TL,  ZERO,.L_M4_L0
+    beq       TL,  ZERO,.L_L0
 
-.L_M4_L71:
+.L_L71:
+    /* Load 16 * 64 from A0 */
     xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
+    xvld     U2,   A0,    0x40
+    xvld     U3,   A0,    0x60
 
+    /* Cumulative D0~D15 */
     xvldrepl.d U4,  B0, 0x00
     xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+    xvfmadd.d  D2,  U2, U4, D2
+    xvfmadd.d  D3,  U3, U4, D3
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
+    xvfmadd.d  D6,  U2, U5, D6
+    xvfmadd.d  D7,  U3, U5, D7
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+    xvldrepl.d U6,  B0, 0x10
+    xvfmadd.d  D8,  U0, U6, D8
+    xvfmadd.d  D9,  U1, U6, D9
+    xvfmadd.d  D10, U2, U6, D10
+    xvfmadd.d  D11, U3, U6, D11
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+    xvldrepl.d U7,  B0, 0x18
+    xvfmadd.d  D12, U0, U7, D12
+    xvfmadd.d  D13, U1, U7, D13
+    xvfmadd.d  D14, U2, U7, D14
+    xvfmadd.d  D15, U3, U7, D15
 
     /* Add stride for A0, B0 */
-    addi.d     A0,  A0, 0x20
+    addi.d     A0,  A0, 0x80
     addi.d     B0,  B0, 0x20
 
     addi.d     TL,  TL, -1
-    blt        ZERO,TL, .L_M4_L71
+    blt        ZERO,TL, .L_L71
 
-.L_M4_L0:
+.L_L0:
+    xvldrepl.d  VALPHA, $sp, 112
 #if defined(TRMMKERNEL)
     xvfmul.d  D0,   D0,  VALPHA
+    xvfmul.d  D1,   D1,  VALPHA
+    xvfmul.d  D2,   D2,  VALPHA
+    xvfmul.d  D3,   D3,  VALPHA
     xvfmul.d  D4,   D4,  VALPHA
+    xvfmul.d  D5,   D5,  VALPHA
+    xvfmul.d  D6,   D6,  VALPHA
+    xvfmul.d  D7,   D7,  VALPHA
     xvfmul.d  D8,   D8,  VALPHA
+    xvfmul.d  D9,   D9,  VALPHA
+    xvfmul.d  D10,  D10, VALPHA
+    xvfmul.d  D11,  D11, VALPHA
     xvfmul.d  D12,  D12, VALPHA
+    xvfmul.d  D13,  D13, VALPHA
+    xvfmul.d  D14,  D14, VALPHA
+    xvfmul.d  D15,  D15, VALPHA
 #else
     /* Load C0  */
     xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+    xvld      U2,  C0,  0x40
+    xvld      U3,  C0,  0x60
     xvfmadd.d D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
+    xvfmadd.d D1,  D1,  VALPHA,  U1
+    xvfmadd.d D2,  D2,  VALPHA,  U2
+    xvfmadd.d D3,  D3,  VALPHA,  U3
 
     /* Load C1  */
-    xvld      U0,  C1,  0x00
-    xvfmadd.d D4,  D4,  VALPHA,  U0
+    xvld      U4,  C1,  0x00
+    xvld      U5,  C1,  0x20
+    xvld      U6,  C1,  0x40
+    xvld      U7,  C1,  0x60
+    xvfmadd.d D4,  D4,  VALPHA,  U4
+    xvfmadd.d D5,  D5,  VALPHA,  U5
+    xvfmadd.d D6,  D6,  VALPHA,  U6
+    xvfmadd.d D7,  D7,  VALPHA,  U7
 
     /* Load C2  */
-    xvld      U0,  C2,  0x00
-    xvfmadd.d D8,  D8,  VALPHA,  U0
+    xvld      U8,  C2,  0x00
+    xvld      U9,  C2,  0x20
+    xvld      U10, C2,  0x40
+    xvld      U11, C2,  0x60
+    xvfmadd.d D8,  D8,  VALPHA,  U8
+    xvfmadd.d D9,  D9,  VALPHA,  U9
+    xvfmadd.d D10, D10, VALPHA,  U10
+    xvfmadd.d D11, D11, VALPHA,  U11
 
     /* Load C3  */
     xvld      U0,  C3,  0x00
+    xvld      U1,  C3,  0x20
+    xvld      U2,  C3,  0x40
+    xvld      U3,  C3,  0x60
     xvfmadd.d D12, D12, VALPHA,  U0
-#endif   // #if defined(TRMMKERNEL)
+    xvfmadd.d D13, D13, VALPHA,  U1
+    xvfmadd.d D14, D14, VALPHA,  U2
+    xvfmadd.d D15, D15, VALPHA,  U3
+#endif // #if defined(TRMMKERNEL)
 
     /* Store C0 */
     xvst      D0,  C0,  0x00
+    xvst      D1,  C0,  0x20
+    xvst      D2,  C0,  0x40
+    xvst      D3,  C0,  0x60
     /* Store C1 */
     xvst      D4,  C1,  0x00
+    xvst      D5,  C1,  0x20
+    xvst      D6,  C1,  0x40
+    xvst      D7,  C1,  0x60
     /* Store C2 */
     xvst      D8,  C2,  0x00
+    xvst      D9,  C2,  0x20
+    xvst      D10, C2,  0x40
+    xvst      D11, C2,  0x60
     /* Store C3 */
     xvst      D12, C3,  0x00
+    xvst      D13, C3,  0x20
+    xvst      D14, C3,  0x40
+    xvst      D15, C3,  0x60
 
     /* Add stride for C */
-    addi.d    C0,  C0,  0x20
-    addi.d    C1,  C1,  0x20
-    addi.d    C2,  C2,  0x20
-    addi.d    C3,  C3,  0x20
+    addi.d    C0,  C0,  0x80
+    addi.d    C1,  C1,  0x80
+    addi.d    C2,  C2,  0x80
+    addi.d    C3,  C3,  0x80
 
 #if defined(TRMMKERNEL)
 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
     sub.d     L,   K,   OFF
 #ifdef LEFT
-    /* number of values in A */
-    addi.d    L,   L,   -4
+    /* number of values  in A */
+    addi.d    L,   L,   -16
 #else
     /* number of values in B */
     addi.d    L,   L,   -4
 #endif
-    slli.d    T0,  L,  0x05
+    slli.d    T0,  L,  0x07
     add.d     A0,  A0, T0
+    slli.d    T0,  L,  0x05
     add.d     B0,  B0, T0
 #endif
 
 #ifdef LEFT
-    /* number of values in A */
-    addi.d    OFF,   OFF,  0x04
+    addi.d    OFF, OFF, 0x10
 #endif
 #endif   // #if defined(TRMMKERNEL)
 
-/********LOOP (if(N >> 2 ) && (M & 4) ) End************/
+    addi.d    I,   I,   -1  /* I-- */
+    blt       ZERO,I,   .L_I1
 
-.L_M2:
-    andi      I,   M,   2
-    beq       ZERO,I,   .L_M1
+.L_M8:
+    /* We have done M & 16, considering M=8/4/2/1 */
+    andi      I,   M,   15
+    beq       ZERO,I,   .L_M0
+
+    andi      I,   M,   8
+    beq       ZERO,I,   .L_M4
 
 #if defined(TRMMKERNEL)
 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
     move     B0,    B
 #else
-    slli.d   T0,    OFF,  0x04
+    slli.d   T0,    OFF,  0x06
     add.d    A0,    A0,   T0
     slli.d   T0,    OFF,  0x05
     add.d    B0,    B,    T0
@@ -1361,7 +1427,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     sub.d    L,     K,    OFF
 #elif defined(LEFT)
     /* number of values in A */
-    addi.d   L,     OFF,  2
+    addi.d   L,     OFF,  8
 #else
     /* number of values in B */
     addi.d   L,     OFF,  4
@@ -1369,262 +1435,163 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else   // #if !defined(TRMMKERNEL)
     move     B0,    B
     move     L,     K /* L = bk */
-#endif
+#endif  // #if defined(TRMMKERNEL)
 
-    /* Load 2 * 64 from A0 */
+    /* Load 8 * 64 from A0 */
     xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
 
     xvldrepl.d     U4, B0, 0x00
     /* line 1 */
     xvfmul.d  D0,  U0, U4
+    xvfmul.d  D1,  U1, U4
 
-    xvldrepl.d     U4, B0, 0x08
+    xvldrepl.d     U5, B0, 0x08
     /* line 2 */
-    xvfmul.d  D4,  U0, U4
+    xvfmul.d  D4,  U0, U5
+    xvfmul.d  D5,  U1, U5
 
-    xvldrepl.d     U4, B0, 0x10
+    xvldrepl.d     U6, B0, 0x10
     /* line 3 */
-    xvfmul.d  D8,  U0, U4
+    xvfmul.d  D8,  U0, U6
+    xvfmul.d  D9,  U1, U6
 
-    xvldrepl.d     U4, B0, 0x18
+    xvldrepl.d     U7, B0, 0x18
     /* line 4 */
-    xvfmul.d  D12, U0, U4
+    xvfmul.d  D12, U0, U7
+    xvfmul.d  D13, U1, U7
 
     /* Add stride for A0 and B0 */
-    addi.d    A0,  A0, 0x10
+    addi.d    A0,  A0, 0x40
     addi.d    B0,  B0, 0x20
     /* Reduce L */
     addi.d    L,   L,  -1
     srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
-    /* if (TL < 1) goto L_M2_L7 */
-    beq       ZERO,TL, .L_M2_L7
-
-.L_M2_TL1: /* TL-- */
-           /***8-1***/
-    /* Load 2 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x20
-
-           /***8-2***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x20
-
-           /***8-3***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x20
-
-           /***8-4***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x20
-
-           /***8-5***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x20
-
-           /***8-6***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x20
-
-           /***8-7***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    /* if (TL < 1) goto L_M8_L7 */
+    beq       ZERO,TL, .L_M8_L7
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+    xvld     U8,   A0,    0x00
+    xvld     U9,   A0,    0x20
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+    addi.d    TL,  TL,  -1
 
-    addi.d     A0,  A0, 0x10
+    xvldrepl.d  U12,   B0,  0x00
+    xvldrepl.d  U13,   B0,  0x08
+    xvldrepl.d  U14,   B0,  0x10
+    xvldrepl.d  U15,   B0,  0x18
+    addi.d     A0,  A0, 0x40
     addi.d     B0,  B0, 0x20
 
-           /***8-8***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+    beq    ZERO,    TL,  .L_M8_TL1_END
 
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x20
+.L_M8_TL1: /* TL-- */
+    KERNEL8x8x4
 
     addi.d    TL,  TL, -1 /* TL-- */
-    blt       ZERO,TL, .L_M2_TL1
+    blt       ZERO,TL, .L_M8_TL1
 
-.L_M2_L7:
-    /* if (!(L & 7)) goto L_M2_L0 */
+.L_M8_TL1_END:
+    KERNEL8x8x4_END
+
+.L_M8_L7:
+    /* if (!(L & 7)) goto L_M8_L0 */
     andi      TL,  L,   7
-    beq       TL,  ZERO,.L_M2_L0
+    beq       TL,  ZERO,.L_M8_L0
 
-.L_M2_L71:
+.L_M8_L71:
     xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
 
     xvldrepl.d U4,  B0, 0x00
     xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+    xvldrepl.d U6,  B0, 0x10
+    xvfmadd.d  D8,  U0, U6, D8
+    xvfmadd.d  D9,  U1, U6, D9
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+    xvldrepl.d U7,  B0, 0x18
+    xvfmadd.d  D12, U0, U7, D12
+    xvfmadd.d  D13, U1, U7, D13
 
     /* Add stride for A0, B0 */
-    addi.d     A0,  A0, 0x10
+    addi.d     A0,  A0, 0x40
     addi.d     B0,  B0, 0x20
 
     addi.d     TL,  TL, -1
-    blt        ZERO,TL, .L_M2_L71
+    blt        ZERO,TL, .L_M8_L71
 
-.L_M2_L0:
+.L_M8_L0:
+    xvldrepl.d  VALPHA, $sp, 112
 #if defined(TRMMKERNEL)
     xvfmul.d  D0,   D0,  VALPHA
+    xvfmul.d  D1,   D1,  VALPHA
     xvfmul.d  D4,   D4,  VALPHA
+    xvfmul.d  D5,   D5,  VALPHA
     xvfmul.d  D8,   D8,  VALPHA
+    xvfmul.d  D9,   D9,  VALPHA
     xvfmul.d  D12,  D12, VALPHA
+    xvfmul.d  D13,  D13, VALPHA
 #else
     /* Load C0  */
     xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
     xvfmadd.d D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
+    xvfmadd.d D1,  D1,  VALPHA,  U1
 
     /* Load C1  */
-    xvld      U0,  C1,  0x00
-    xvfmadd.d D4,  D4,  VALPHA,  U0
+    xvld      U2,  C1,  0x00
+    xvld      U3,  C1,  0x20
+    xvfmadd.d D4,  D4,  VALPHA,  U2
+    xvfmadd.d D5,  D5,  VALPHA,  U3
 
     /* Load C2  */
-    xvld      U0,  C2,  0x00
-    xvfmadd.d D8,  D8,  VALPHA,  U0
+    xvld      U4,  C2,  0x00
+    xvld      U5,  C2,  0x20
+    xvfmadd.d D8,  D8,  VALPHA,  U4
+    xvfmadd.d D9,  D9,  VALPHA,  U5
 
     /* Load C3  */
-    xvld      U0,  C3,  0x00
-    xvfmadd.d D12, D12, VALPHA,  U0
+    xvld      U6,  C3,  0x00
+    xvld      U7,  C3,  0x20
+    xvfmadd.d D12, D12, VALPHA,  U6
+    xvfmadd.d D13, D13, VALPHA,  U7
 #endif   // #if defined(TRMMKERNEL)
 
-    xvstelm.d D0,  C0,  0x00,    0x00
-    xvstelm.d D4,  C1,  0x00,    0x00
-    xvstelm.d D8,  C2,  0x00,    0x00
-    xvstelm.d D12, C3,  0x00,    0x00
-    xvstelm.d D0,  C0,  0x08,    0x01
-    xvstelm.d D4,  C1,  0x08,    0x01
-    xvstelm.d D8,  C2,  0x08,    0x01
-    xvstelm.d D12, C3,  0x08,    0x01
+    /* Store C0 */
+    xvst      D0,  C0,  0x00
+    xvst      D1,  C0,  0x20
+    /* Store C1 */
+    xvst      D4,  C1,  0x00
+    xvst      D5,  C1,  0x20
+    /* Store C2 */
+    xvst      D8,  C2,  0x00
+    xvst      D9,  C2,  0x20
+    /* Store C3 */
+    xvst      D12, C3,  0x00
+    xvst      D13, C3,  0x20
 
     /* Add stride for C */
-    addi.d    C0,  C0,  0x10
-    addi.d    C1,  C1,  0x10
-    addi.d    C2,  C2,  0x10
-    addi.d    C3,  C3,  0x10
+    addi.d    C0,  C0,  0x40
+    addi.d    C1,  C1,  0x40
+    addi.d    C2,  C2,  0x40
+    addi.d    C3,  C3,  0x40
 
 #if defined(TRMMKERNEL)
 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
     sub.d     L,   K,   OFF
 #ifdef LEFT
     /* number of values in A */
-    addi.d    L,   L,   -2
+    addi.d    L,   L,   -8
 #else
     /* number of values in B */
     addi.d    L,   L,   -4
 #endif
-    slli.d    T0,  L,  0x04
+    slli.d    T0,  L,  0x06
     add.d     A0,  A0, T0
     slli.d    T0,  L,  0x05
     add.d     B0,  B0, T0
@@ -1632,23 +1599,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #ifdef LEFT
     /* number of values in A */
-    addi.d    OFF,   OFF,  0x02
+    addi.d    OFF,   OFF,  0x08
 #endif
 #endif   // #if defined(TRMMKERNEL)
 
-/********LOOP (if(N >> 2 ) && (M & 2) ) End************/
+/********LOOP (if(N >> 2 ) && (M & 8)) End************/
 
-.L_M1:
-    andi      I,   M,   1
-    beq       ZERO,I,   .L_M0
+.L_M4: /* (N >> 2) && (M & 4) case: 4 doubles of A per k-step against 4 values of B */
+    andi      I,   M,   4 /* I = M & 4 */
+    beq       ZERO,I,   .L_M2 /* I == 0: nothing to do here, try the M & 2 case */
 
 #if defined(TRMMKERNEL)
 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
     move     B0,    B
 #else
-    slli.d   T0,    OFF,  0x03
-    add.d    A0,    A0,   T0
     slli.d   T0,    OFF,  0x05
+    add.d    A0,    A0,   T0 /* A0 += OFF << 5; the same T0 is reused for B0 on the next line */
     add.d    B0,    B,    T0
 #endif
 
@@ -1656,7 +1622,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     sub.d    L,     K,    OFF
 #elif defined(LEFT)
     /* number of values in A */
-    addi.d   L,     OFF,  1
+    addi.d   L,     OFF,  4 /* L = OFF + 4 */
 #else
     /* number of values in B */
     addi.d   L,     OFF,  4
@@ -1666,55 +1632,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     move     L,     K /* L = bk */
 #endif
 
-    /* Load 1 * 64 from A0 */
+    /* Load 4 * 64 bits (four doubles) from A0; first k-step uses xvfmul to initialise D0/D4/D8/D12 */
     xvld     U0,   A0,    0x00
 
     xvldrepl.d     U4, B0, 0x00
     /* line 1 */
     xvfmul.d  D0,  U0, U4
 
-    xvldrepl.d     U4, B0, 0x08
+    xvldrepl.d     U5, B0, 0x08 /* U5 = double at B0 + 8, replicated to every lane */
     /* line 2 */
-    xvfmul.d  D4,  U0, U4
+    xvfmul.d  D4,  U0, U5 /* D4 = U0 * U5 */
 
-    xvldrepl.d     U4, B0, 0x10
+    xvldrepl.d     U6, B0, 0x10 /* U6 = double at B0 + 16, replicated */
     /* line 3 */
-    xvfmul.d  D8,  U0, U4
+    xvfmul.d  D8,  U0, U6 /* D8 = U0 * U6 */
 
-    xvldrepl.d     U4, B0, 0x18
+    xvldrepl.d     U7, B0, 0x18 /* U7 = double at B0 + 24, replicated */
     /* line 4 */
-    xvfmul.d  D12, U0, U4
+    xvfmul.d  D12, U0, U7 /* D12 = U0 * U7 */
 
     /* Add stride for A0 and B0 */
-    addi.d    A0,  A0, 0x08
+    addi.d    A0,  A0, 0x20 /* 32 bytes: the four doubles loaded into U0 */
     addi.d    B0,  B0, 0x20
     /* Reduce L */
     addi.d    L,   L,  -1
     srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
-    /* if (TL < 1) goto L_M1_L7 */
-    beq       ZERO,TL, .L_M1_L7
+    /* if (TL < 1) goto L_M4_L7: fewer than 8 k-steps left, only the remainder loop runs */
+    beq       ZERO,TL, .L_M4_L7
 
-.L_M1_TL1: /* TL-- */
-           /***8-1***/
-    /* Load 1 * 64 from A0 */
-    xvld     U0,   A0,    0x00
+    xvld     U8,   A0,    0x00 /* preload next A vector; NOTE(review): U8/U12..U15 are not read in the visible code, presumably consumed by KERNEL8x4x4 - confirm against the macro */
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+    addi.d    TL,  TL,  -1 /* one block of the count is handled by KERNEL8x4x4_END below */
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    xvldrepl.d  U12,   B0,  0x00 /* preload the four B values of the next k-step, one per register */
+    xvldrepl.d  U13,   B0,  0x08
+    xvldrepl.d  U14,   B0,  0x10
+    xvldrepl.d  U15,   B0,  0x18
+    addi.d     A0,  A0, 0x20 /* step past the preloaded A data */
+    addi.d     B0,  B0, 0x20 /* step past the preloaded B data */
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+    beq    ZERO,    TL,  .L_M4_TL1_END /* TL was 1: skip the loop, run only the _END variant */
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+.L_M4_TL1: /* loop body is one KERNEL8x4x4 expansion; repeats while TL > 0 after the decrement */
+    KERNEL8x4x4
 
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x20
+    addi.d     TL,  TL, -1 /* TL-- */
+    blt        ZERO,TL, .L_M4_TL1 /* loop while 0 < TL */
+
+.L_M4_TL1_END: /* final unrolled block; macro body is defined elsewhere in the file */
+    KERNEL8x4x4_END
+
+.L_M4_L7:
+    /* if (!(L & 7)) goto L_M4_L0; TL is reused as the remainder counter */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_M4_L0
 
-           /***8-2***/
+.L_M4_L71: /* remainder loop: one k-step per pass, TL = L & 7 passes */
     xvld     U0,   A0,    0x00
 
     xvldrepl.d U4,  B0, 0x00
@@ -1729,119 +1702,287 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvldrepl.d U4,  B0, 0x18
     xvfmadd.d  D12, U0, U4, D12
 
-    addi.d     A0,  A0, 0x08
+    /* Add stride for A0, B0: 32 bytes each per k-step in this tile */
+    addi.d     A0,  A0, 0x20
     addi.d     B0,  B0, 0x20
 
-           /***8-3***/
-    xvld     U0,   A0,    0x00
+    addi.d     TL,  TL, -1 /* TL-- */
+    blt        ZERO,TL, .L_M4_L71 /* loop while 0 < TL */
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+.L_M4_L0: /* scale accumulators D0/D4/D8/D12 by alpha and write them to C0..C3 */
+    xvldrepl.d  VALPHA, $sp, 112 /* VALPHA = double at $sp + 112, replicated; NOTE(review): presumably the saved alpha - confirm against the prologue */
+#if defined(TRMMKERNEL)
+    xvfmul.d  D0,   D0,  VALPHA /* TRMM path: D = D * VALPHA, C is not read */
+    xvfmul.d  D4,   D4,  VALPHA
+    xvfmul.d  D8,   D8,  VALPHA
+    xvfmul.d  D12,  D12, VALPHA
+#else
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvfmadd.d D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+    xvfmadd.d D4,  D4,  VALPHA,  U1 /* D4 = U1 + (D4 * VALPHA) */
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+    /* Load C2  */
+    xvld      U2,  C2,  0x00
+    xvfmadd.d D8,  D8,  VALPHA,  U2 /* D8 = U2 + (D8 * VALPHA) */
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+    /* Load C3  */
+    xvld      U3,  C3,  0x00
+    xvfmadd.d D12, D12, VALPHA,  U3 /* D12 = U3 + (D12 * VALPHA) */
+#endif   // #if defined(TRMMKERNEL)
 
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x20
+    /* Store C0: full 256-bit vector (four doubles) */
+    xvst      D0,  C0,  0x00
+    /* Store C1 */
+    xvst      D4,  C1,  0x00
+    /* Store C2 */
+    xvst      D8,  C2,  0x00
+    /* Store C3 */
+    xvst      D12, C3,  0x00
 
-           /***8-4***/
-    xvld     U0,   A0,    0x00
+    /* Add stride for C: 0x20 = the 32 bytes just stored through each pointer */
+    addi.d    C0,  C0,  0x20
+    addi.d    C1,  C1,  0x20
+    addi.d    C2,  C2,  0x20
+    addi.d    C3,  C3,  0x20
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d     L,   K,   OFF /* L = K - OFF */
+#ifdef LEFT
+    /* number of values in A */
+    addi.d    L,   L,   -4
+#else
+    /* number of values in B (also 4 in this tile, so both branches subtract 4) */
+    addi.d    L,   L,   -4
+#endif
+    slli.d    T0,  L,  0x05 /* T0 = L << 5; one shift serves both pointers here */
+    add.d     A0,  A0, T0
+    add.d     B0,  B0, T0
+#endif
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+#ifdef LEFT
+    /* number of values in A */
+    addi.d    OFF,   OFF,  0x04 /* OFF += 4 */
+#endif
+#endif   // #if defined(TRMMKERNEL)
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+/********LOOP (if(N >> 2 ) && (M & 4) ) End************/
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+.L_M2: /* (N >> 2) && (M & 2) case: A values are broadcast, B is loaded as a 4-double vector */
+    andi      I,   M,   2 /* I = M & 2 */
+    beq       ZERO,I,   .L_M1 /* I == 0: skip to the M & 1 case */
 
-    addi.d     A0,  A0, 0x08
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    slli.d   T0,    OFF,  0x04 /* T0 = OFF << 4 */
+    add.d    A0,    A0,   T0
+    slli.d   T0,    OFF,  0x05 /* T0 = OFF << 5 */
+    add.d    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d    L,     K,    OFF /* L = K - OFF */
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d   L,     OFF,  2
+#else
+    /* number of values in B */
+    addi.d   L,     OFF,  4
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+
+    /* Load 2 * 64 bits from A0, each double replicated across a whole vector */
+    xvldrepl.d     U0,   A0,    0x00
+    xvldrepl.d     U1,   A0,    0x08
+
+    xvld    U4,     B0,     0x00 /* U4 = four doubles from B0 */
+
+    xvfmul.d    D0,     U0,     U4 /* D0 = A0[0] * U4: first k-step initialises the accumulator */
+    xvfmul.d    D1,     U1,     U4 /* D1 = A0[1] * U4 */
+
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x10
+    addi.d    B0,  B0, 0x20
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = L >> 3, L being the k count minus the step already done */
+    /* if (TL < 1) goto L_M2_L7 */
+    beq       ZERO,TL, .L_M2_L7
+
+    xvldrepl.d     U8,   A0,    0x00 /* preload next A pair; NOTE(review): U8/U9/U12 are not read in the visible code, presumably consumed by KERNEL8x2x4 - confirm */
+    xvldrepl.d     U9,   A0,    0x08
+
+    addi.d    TL,  TL,  -1 /* one block of the count is handled by KERNEL8x2x4_END */
+
+    xvld       U12, B0, 0x00 /* preload next B vector */
+    addi.d     A0,  A0, 0x10
     addi.d     B0,  B0, 0x20
 
-           /***8-5***/
-    xvld     U0,   A0,    0x00
+    beq    ZERO,    TL,  .L_M2_TL1_END /* TL was 1: run only the _END variant */
+.L_M2_TL1: /* loop body is one KERNEL8x2x4 expansion per pass */
+    KERNEL8x2x4
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+    addi.d    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_M2_TL1 /* loop while 0 < TL */
+.L_M2_TL1_END: /* final unrolled block; macro body is defined elsewhere in the file */
+    KERNEL8x2x4_END
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+.L_M2_L7:
+    /* if (!(L & 7)) goto L_M2_L0; TL is reused as the remainder counter */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_M2_L0
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+.L_M2_L71: /* remainder loop: one k-step per pass */
+    xvldrepl.d     U0,   A0,    0x00
+    xvldrepl.d     U1,   A0,    0x08
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+    xvld    U4,  B0, 0x00 /* U4 = four doubles from B0 */
 
-    addi.d     A0,  A0, 0x08
+    xvfmadd.d   D0,     U0,     U4,    D0 /* D0 += U0 * U4 */
+    xvfmadd.d   D1,     U1,     U4,    D1 /* D1 += U1 * U4 */
+    /* Add stride for A0, B0 */
+    addi.d     A0,  A0, 0x10
     addi.d     B0,  B0, 0x20
 
-           /***8-6***/
-    xvld     U0,   A0,    0x00
+    addi.d     TL,  TL, -1 /* TL-- */
+    blt        ZERO,TL, .L_M2_L71 /* loop while 0 < TL */
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+.L_M2_L0: /* write-back: lane j of D0/D1 goes to the first/second double behind C<j> */
+    xvldrepl.d  VALPHA, $sp, 112 /* VALPHA = double at $sp + 112, replicated; NOTE(review): presumably the saved alpha - confirm against the prologue */
+#if defined(TRMMKERNEL)
+    xvfmul.d  D0,   D0,  VALPHA /* TRMM path: D = D * VALPHA, C is not read */
+    xvfmul.d  D1,   D1,  VALPHA
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    xvstelm.d D0,  C0,  0x00,    0x00 /* lane 0 of D0 -> [C0 + 0] */
+    xvstelm.d D0,  C1,  0x00,    0x01 /* lane 1 of D0 -> [C1 + 0] */
+    xvstelm.d D0,  C2,  0x00,    0x02 /* lane 2 of D0 -> [C2 + 0] */
+    xvstelm.d D0,  C3,  0x00,    0x03 /* lane 3 of D0 -> [C3 + 0] */
+    xvstelm.d D1,  C0,  0x08,    0x00 /* lane 0 of D1 -> [C0 + 8] */
+    xvstelm.d D1,  C1,  0x08,    0x01 /* lane 1 of D1 -> [C1 + 8] */
+    xvstelm.d D1,  C2,  0x08,    0x02 /* lane 2 of D1 -> [C2 + 8] */
+    xvstelm.d D1,  C3,  0x08,    0x03 /* lane 3 of D1 -> [C3 + 8] */
+#else
+    xvpackev.d  D4,     D1,     D0 /* D4 = {D0[0], D1[0], D0[2], D1[2]}: the C0 pair then the C2 pair */
+    xvpackod.d  D5,     D1,     D0 /* D5 = {D0[1], D1[1], D0[3], D1[3]}: the C1 pair then the C3 pair */
+    /* Load C0: 32-byte load, only the low 16 bytes are used below. NOTE(review): reads 16 bytes past this 2-element tile - confirm that is always readable */
+    xvld      U0,  C0,  0x00
+    /* Load C1 (same 32-byte load, low half used) */
+    xvld      U1,  C1,  0x00
+    /* Load C2 (same 32-byte load, low half used) */
+    xvld      U2,  C2,  0x00
+    /* Load C3 (same 32-byte load, low half used) */
+    xvld      U3,  C3,  0x00
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+    xvpermi.q   U2, U0, 0x20 /* U2 = {low 128 bits of U0, low 128 bits of U2} = C0 pair, C2 pair */
+    xvpermi.q   U3, U1, 0x20 /* U3 = {low 128 bits of U1, low 128 bits of U3} = C1 pair, C3 pair */
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+    xvfmadd.d   D0, D4, VALPHA, U2 /* D0 = D4 * VALPHA + U2 */
+    xvfmadd.d   D1, D5, VALPHA, U3 /* D1 = D5 * VALPHA + U3 */
 
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x20
+    vst       $vr16,    C0,      0x00 /* 128-bit store to C0; NOTE(review): assumes D0 aliases $xr16 (register #defines not shown) - confirm */
+    vst       $vr17,    C1,      0x00 /* 128-bit store to C1; NOTE(review): assumes D1 aliases $xr17 - confirm */
+    xvstelm.d D0,  C2,  0x00,    0x02 /* lane 2 of D0 -> [C2 + 0] */
+    xvstelm.d D1,  C3,  0x00,    0x02 /* lane 2 of D1 -> [C3 + 0] */
+    xvstelm.d D0,  C2,  0x08,    0x03 /* lane 3 of D0 -> [C2 + 8] */
+    xvstelm.d D1,  C3,  0x08,    0x03 /* lane 3 of D1 -> [C3 + 8] */
+#endif   // #if defined(TRMMKERNEL)
 
-           /***8-7***/
-    xvld     U0,   A0,    0x00
+    /* Add stride for C: 0x10 = the two doubles written through each pointer */
+    addi.d    C0,  C0,  0x10
+    addi.d    C1,  C1,  0x10
+    addi.d    C2,  C2,  0x10
+    addi.d    C3,  C3,  0x10
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d     L,   K,   OFF /* L = K - OFF */
+#ifdef LEFT
+    /* number of values in A */
+    addi.d    L,   L,   -2
+#else
+    /* number of values in B */
+    addi.d    L,   L,   -4
+#endif
+    slli.d    T0,  L,  0x04 /* T0 = L << 4, byte step for A0 */
+    add.d     A0,  A0, T0
+    slli.d    T0,  L,  0x05 /* T0 = L << 5, byte step for B0 */
+    add.d     B0,  B0, T0
+#endif
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+#ifdef LEFT
+    /* number of values in A */
+    addi.d    OFF,   OFF,  0x02 /* OFF += 2 */
+#endif
+#endif   // #if defined(TRMMKERNEL)
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+/********LOOP (if(N >> 2 ) && (M & 2) ) End************/
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+.L_M1: /* M & 1 case inside the N >> 2 loop: one broadcast A value against a 4-double B vector */
+    andi      I,   M,   1 /* I = M & 1 */
+    beq       ZERO,I,   .L_M0 /* I == 0: no single leftover element */
 
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x20
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    slli.d   T0,    OFF,  0x03 /* T0 = OFF << 3 */
+    add.d    A0,    A0,   T0
+    slli.d   T0,    OFF,  0x05 /* T0 = OFF << 5 */
+    add.d    B0,    B,    T0
+#endif
 
-           /***8-8***/
-    xvld     U0,   A0,    0x00
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d    L,     K,    OFF /* L = K - OFF */
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d   L,     OFF,  1
+#else
+    /* number of values in B */
+    addi.d   L,     OFF,  4
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+    xvldrepl.d  U0,     A0,     0x00 /* U0 = double at A0, replicated to every lane */
+    xvld        U4,     B0,     0x00 /* U4 = four doubles from B0 */
+    xvfmul.d    D0,     U0,     U4 /* D0 = U0 * U4: first k-step initialises the accumulator */
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x08
+    addi.d    B0,  B0, 0x20
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = L >> 3, L being the k count minus the step already done */
+    /* if (TL < 1) goto L_M1_L7 */
+    beq       ZERO,TL, .L_M1_L7
 
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
+    xvldrepl.d  U8,     A0,     0x00 /* preload next A value; NOTE(review): U8/U12 are not read in the visible code, presumably consumed by KERNEL8x1x4 - confirm */
 
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+    addi.d     TL,  TL,  -1 /* one block of the count is handled by KERNEL8x1x4_END */
+    xvld       U12, B0,  0x00 /* preload next B vector */
+    addi.d     A0,  A0,  0x08
+    addi.d     B0,  B0,  0x20
 
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x20
+    beq    ZERO,    TL,  .L_M1_TL1_END /* TL was 1: run only the _END variant */
+
+.L_M1_TL1: /* loop body is one KERNEL8x1x4 expansion per pass */
+    KERNEL8x1x4
 
     addi.d    TL,  TL, -1 /* TL-- */
     blt       ZERO,TL, .L_M1_TL1
+.L_M1_TL1_END: /* final unrolled block; macro body is defined elsewhere in the file */
+    KERNEL8x1x4_END
 
 .L_M1_L7:
     /* if (!(L & 7)) goto L_M1_L0 */
@@ -1849,19 +1990,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     beq       TL,  ZERO,.L_M1_L0
 
 .L_M1_L71:
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    xvldrepl.d U4,  B0, 0x10
-    xvfmadd.d  D8,  U0, U4, D8
-
-    xvldrepl.d U4,  B0, 0x18
-    xvfmadd.d  D12, U0, U4, D12
+    xvldrepl.d  U0,     A0,     0x00 /* U0 = double at A0, replicated to every lane */
+    xvld        U4,     B0,     0x00 /* U4 = four doubles from B0 */
+    xvfmadd.d   D0,     U0,     U4,    D0 /* D0 += U0 * U4: one k-step for all four B values at once */
 
     /* Add stride for A0, B0 */
     addi.d     A0,  A0, 0x08
@@ -1871,33 +2002,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     blt        ZERO,TL, .L_M1_L71
 
 .L_M1_L0:
+    xvldrepl.d  VALPHA, $sp, 112 /* VALPHA = double at $sp + 112, replicated; NOTE(review): presumably the saved alpha - confirm against the prologue */
 #if defined(TRMMKERNEL)
     xvfmul.d  D0,   D0,  VALPHA
-    xvfmul.d  D4,   D4,  VALPHA
-    xvfmul.d  D8,   D8,  VALPHA
-    xvfmul.d  D12,  D12, VALPHA
+
+    xvstelm.d D0,  C0,  0x00,    0x00 /* lane 0 of D0 -> [C0] */
+    xvstelm.d D0,  C1,  0x00,    0x01 /* lane 1 of D0 -> [C1] */
+    xvstelm.d D0,  C2,  0x00,    0x02 /* lane 2 of D0 -> [C2] */
+    xvstelm.d D0,  C3,  0x00,    0x03 /* lane 3 of D0 -> [C3] */
 #else
     /* Load C0  */
-    xvld      U0,  C0,  0x00
-    xvfmadd.d D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
+    xvldrepl.d     U0,  C0,  0x00 /* 8-byte load of [C0], replicated to every lane */
+    xvfmadd.d D4,  D0,  VALPHA,  U0 /* D4 = D0 * VALPHA + [C0]; only lane 0 is stored below */
 
     /* Load C1  */
-    xvld      U0,  C1,  0x00
-    xvfmadd.d D4,  D4,  VALPHA,  U0
+    xvldrepl.d     U1,  C1,  0x00 /* 8-byte load of [C1], replicated */
+    xvfmadd.d D5,  D0,  VALPHA,  U1 /* D5 = D0 * VALPHA + [C1]; only lane 1 is stored below */
 
     /* Load C2  */
-    xvld      U0,  C2,  0x00
-    xvfmadd.d D8,  D8,  VALPHA,  U0
+    xvldrepl.d     U2,  C2,  0x00 /* 8-byte load of [C2], replicated */
+    xvfmadd.d D6,  D0,  VALPHA,  U2 /* D6 = D0 * VALPHA + [C2]; only lane 2 is stored below */
 
     /* Load C3  */
-    xvld      U0,  C3,  0x00
-    xvfmadd.d D12, D12, VALPHA,  U0
-#endif   // #if defined(TRMMKERNEL)
+    xvldrepl.d     U3,  C3,  0x00 /* 8-byte load of [C3], replicated */
+    xvfmadd.d D7,  D0,  VALPHA,  U3 /* D7 = D0 * VALPHA + [C3]; only lane 3 is stored below */
 
-    xvstelm.d D0,  C0,  0x00,    0x00
-    xvstelm.d D4,  C1,  0x00,    0x00
-    xvstelm.d D8,  C2,  0x00,    0x00
-    xvstelm.d D12, C3,  0x00,    0x00
+    xvstelm.d D4,  C0,  0x00,    0x00 /* lane 0 of D4 -> [C0] */
+    xvstelm.d D5,  C1,  0x00,    0x01 /* lane 1 of D5 -> [C1] */
+    xvstelm.d D6,  C2,  0x00,    0x02 /* lane 2 of D6 -> [C2] */
+    xvstelm.d D7,  C3,  0x00,    0x03 /* lane 3 of D7 -> [C3] */
+#endif   // #if defined(TRMMKERNEL)
 
     /* Add stride for C */
     addi.d    C0,  C0,  0x08
@@ -1952,6 +2086,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 /////////////////////////////////////////////////
 /************************ Condition 1 if((N >> 2) && (M >> 4)) END !!! ************************/
 
+    xvldrepl.d  VALPHA, $sp, 112 /* reload VALPHA from $sp + 112 on fall-through; NOTE(review): this sits above the .L_N3 label, so a direct branch to .L_N3 skips it - confirm VALPHA is valid on that path */
+
 .L_N3:
     andi     J,    N,   2
     beq      ZERO, J,   .L_N1
@@ -1993,223 +2129,65 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     addi.d   L,     OFF,  2
 #endif
 #else   // #if !defined(TRMMKERNEL)
-    move     B0,    B
-    move     L,     K /* L = bk */
-#endif
-
-    /* Load 16 * 64 from A0
-     * U0 = {a3,  a2,  a1,  a0}
-     * U1 = {a7,  a6,  a5,  a4}
-     * U2 = {a11, a10, a9,  a8}
-     * U3 = {a15, a14, a13, a12}
-     */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
-
-    xvldrepl.d     U4, B0, 0x00
-    /* line 1 */
-    xvfmul.d  D0,  U0, U4
-    xvfmul.d  D1,  U1, U4
-    xvfmul.d  D2,  U2, U4
-    xvfmul.d  D3,  U3, U4
-
-    xvldrepl.d     U4, B0, 0x08
-    /* line 2 */
-    xvfmul.d  D4,  U0, U4
-    xvfmul.d  D5,  U1, U4
-    xvfmul.d  D6,  U2, U4
-    xvfmul.d  D7,  U3, U4
-
-    /* Add stride for A0 and B0 */
-    addi.d    A0,  A0, 0x80
-    addi.d    B0,  B0, 0x10
-    /* Reduce L */
-    addi.d    L,   L,  -1
-    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
-    /* if (TL < 1) goto L_N3_L7 */
-    beq       ZERO,TL, .L_N3_L7
-
-.L_N3_TL1: /* TL-- */
-           /***8-1***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
-
-    addi.d     A0,  A0, 0x80
-    addi.d     B0,  B0, 0x10
-
-           /***8-2***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
-
-    addi.d     A0,  A0, 0x80
-    addi.d     B0,  B0, 0x10
-
-           /***8-3***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
-
-    addi.d     A0,  A0, 0x80
-    addi.d     B0,  B0, 0x10
-
-           /***8-4***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
-
-    addi.d     A0,  A0, 0x80
-    addi.d     B0,  B0, 0x10
-
-           /***8-5***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
-
-    addi.d     A0,  A0, 0x80
-    addi.d     B0,  B0, 0x10
-
-           /***8-6***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
-
-    addi.d     A0,  A0, 0x80
-    addi.d     B0,  B0, 0x10
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
 
-           /***8-7***/
-    /* Load 16 * 64 from A0 */
+    /* Load 16 * 64 bits (sixteen doubles) from A0, four per vector
+     * U0 = {a3,  a2,  a1,  a0}
+     * U1 = {a7,  a6,  a5,  a4}
+     * U2 = {a11, a10, a9,  a8}
+     * U3 = {a15, a14, a13, a12}
+     */
     xvld     U0,   A0,    0x00
     xvld     U1,   A0,    0x20
     xvld     U2,   A0,    0x40
     xvld     U3,   A0,    0x60
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
+    xvldrepl.d     U4, B0, 0x00 /* U4 = double at B0, replicated to every lane */
+    /* line 1: first k-step initialises D0..D3 with xvfmul (no accumulate) */
+    xvfmul.d  D0,  U0, U4
+    xvfmul.d  D1,  U1, U4
+    xvfmul.d  D2,  U2, U4
+    xvfmul.d  D3,  U3, U4
 
-    addi.d     A0,  A0, 0x80
-    addi.d     B0,  B0, 0x10
+    xvldrepl.d     U5, B0, 0x08 /* U5 = double at B0 + 8, replicated */
+    /* line 2: first k-step initialises D4..D7 */
+    xvfmul.d  D4,  U0, U5
+    xvfmul.d  D5,  U1, U5
+    xvfmul.d  D6,  U2, U5
+    xvfmul.d  D7,  U3, U5
 
-           /***8-8***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
+    /* Add stride for A0 and B0: 0x80 = 16 doubles of A, 0x10 = 2 doubles of B */
+    addi.d    A0,  A0, 0x80
+    addi.d    B0,  B0, 0x10
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = L >> 3, L being the k count minus the step already done */
+    /* if (TL < 1) goto L_N3_L7 */
+    beq       ZERO,TL, .L_N3_L7
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
+    xvld     U8,   A0,    0x00 /* preload next 16 doubles of A into U8..U11; NOTE(review): U8..U13 are not read in the visible code, presumably consumed by KERNEL8x16x2 - confirm */
+    xvld     U9,   A0,    0x20
+    xvld     U10,  A0,    0x40
+    xvld     U11,  A0,    0x60
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
+    addi.d    TL,  TL,  -1 /* one block of the count is handled by KERNEL8x16x2_END */
 
+    xvldrepl.d  U12,   B0,  0x00 /* preload the two B values of the next k-step */
+    xvldrepl.d  U13,   B0,  0x08
     addi.d     A0,  A0, 0x80
     addi.d     B0,  B0, 0x10
 
+    beq    ZERO,    TL,  .L_N3_TL1_END /* TL was 1: run only the _END variant */
+
+.L_N3_TL1: /* loop body is one KERNEL8x16x2 expansion per pass */
+    KERNEL8x16x2
+
     addi.d    TL,  TL, -1 /* TL-- */
     blt       ZERO,TL, .L_N3_TL1
+.L_N3_TL1_END: /* final unrolled block; macro body is defined elsewhere in the file */
+    KERNEL8x16x2_END
 
 .L_N3_L7:
     /* if (!(L & 7)) goto L_N3_L0 */
@@ -2229,12 +2207,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvfmadd.d  D2,  U2, U4, D2
     xvfmadd.d  D3,  U3, U4, D3
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-    xvfmadd.d  D6,  U2, U4, D6
-    xvfmadd.d  D7,  U3, U4, D7
-
+    xvldrepl.d U5,  B0, 0x08 /* second B value gets its own register U5; U4 keeps the first one */
+    xvfmadd.d  D4,  U0, U5, D4 /* D4 += U0 * U5 */
+    xvfmadd.d  D5,  U1, U5, D5 /* D5 += U1 * U5 */
+    xvfmadd.d  D6,  U2, U5, D6 /* D6 += U2 * U5 */
+    xvfmadd.d  D7,  U3, U5, D7 /* D7 += U3 * U5 */
     /* Add stride for A0, B0 */
     addi.d     A0,  A0, 0x80
     addi.d     B0,  B0, 0x10
@@ -2264,14 +2241,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvfmadd.d D3,  D3,  VALPHA,  U3
 
     /* Load C1  */
-    xvld      U0,  C1,  0x00
-    xvld      U1,  C1,  0x20
-    xvld      U2,  C1,  0x40
-    xvld      U3,  C1,  0x60
-    xvfmadd.d D4,  D4,  VALPHA,  U0
-    xvfmadd.d D5,  D5,  VALPHA,  U1
-    xvfmadd.d D6,  D6,  VALPHA,  U2
-    xvfmadd.d D7,  D7,  VALPHA,  U3
+    xvld      U4,  C1,  0x00 /* the C1 data goes to U4..U7, so the C0 data in U0..U3 is not overwritten */
+    xvld      U5,  C1,  0x20
+    xvld      U6,  C1,  0x40
+    xvld      U7,  C1,  0x60
+    xvfmadd.d D4,  D4,  VALPHA,  U4 /* D4 = D4 * VALPHA + U4 */
+    xvfmadd.d D5,  D5,  VALPHA,  U5 /* D5 = D5 * VALPHA + U5 */
+    xvfmadd.d D6,  D6,  VALPHA,  U6 /* D6 = D6 * VALPHA + U6 */
+    xvfmadd.d D7,  D7,  VALPHA,  U7 /* D7 = D7 * VALPHA + U7 */
 #endif // #if defined(TRMMKERNEL)
 
     /* Store C0 */
@@ -2352,10 +2329,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvfmul.d  D0,  U0, U4
     xvfmul.d  D1,  U1, U4
 
-    xvldrepl.d     U4, B0, 0x08
+    xvldrepl.d     U5, B0, 0x08 /* U5 = double at B0 + 8, replicated; U4 keeps the first B value */
     /* line 2 */
-    xvfmul.d  D4,  U0, U4
-    xvfmul.d  D5,  U1, U4
+    xvfmul.d  D4,  U0, U5 /* D4 = U0 * U5 */
+    xvfmul.d  D5,  U1, U5 /* D5 = U1 * U5 */
 
     /* Add stride for A0 and B0 */
     addi.d    A0,  A0, 0x40
@@ -2366,131 +2343,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     /* if (TL < 1) goto L_N3_M8_L7 */
     beq       ZERO,TL, .L_N3_M8_L7
 
-.L_N3_M8_TL1: /* TL-- */
-           /***8-1***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x10
-
-           /***8-2***/
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x10
-
-           /***8-3***/
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x10
-
-           /***8-4***/
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x10
-
-           /***8-5***/
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-
-    /* Cumulative D0~D15 */
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x10
-
-           /***8-6***/
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
-
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x10
-
-           /***8-7***/
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
+    xvld     U8,   A0,    0x00 /* preload next 8 doubles of A into U8/U9; NOTE(review): U8/U9/U12/U13 are not read in the visible code, presumably consumed by KERNEL8x8x2 - confirm */
+    xvld     U9,   A0,    0x20
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
+    addi.d    TL,  TL,  -1 /* one block of the count is handled by KERNEL8x8x2_END */
 
+    xvldrepl.d  U12,   B0,  0x00 /* preload the two B values of the next k-step */
+    xvldrepl.d  U13,   B0,  0x08
     addi.d     A0,  A0, 0x40
     addi.d     B0,  B0, 0x10
 
-           /***8-8***/
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
+    beq    ZERO,    TL,  .L_N3_M8_TL1_END /* TL was 1: run only the _END variant */
 
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x10
+.L_N3_M8_TL1: /* loop body is one KERNEL8x8x2 expansion per pass */
+    KERNEL8x8x2
 
     addi.d    TL,  TL, -1 /* TL-- */
     blt       ZERO,TL, .L_N3_M8_TL1
+.L_N3_M8_TL1_END: /* final unrolled block; macro body is defined elsewhere in the file */
+    KERNEL8x8x2_END
 
 .L_N3_M8_L7:
     /* if (!(L & 7)) goto L_N3_M8_L0 */
@@ -2505,9 +2376,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvfmadd.d  D0,  U0, U4, D0
     xvfmadd.d  D1,  U1, U4, D1
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-    xvfmadd.d  D5,  U1, U4, D5
+    xvldrepl.d U5,  B0, 0x08 /* second B value gets its own register U5; U4 keeps the first one */
+    xvfmadd.d  D4,  U0, U5, D4 /* D4 += U0 * U5 */
+    xvfmadd.d  D5,  U1, U5, D5 /* D5 += U1 * U5 */
 
     /* Add stride for A0, B0 */
     addi.d     A0,  A0, 0x40
@@ -2530,10 +2401,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvfmadd.d D1,  D1,  VALPHA,  U1
 
     /* Load C1  */
-    xvld      U0,  C1,  0x00
-    xvld      U1,  C1,  0x20
-    xvfmadd.d D4,  D4,  VALPHA,  U0
-    xvfmadd.d D5,  D5,  VALPHA,  U1
+    xvld      U2,  C1,  0x00 /* the C1 data goes to U2/U3, so the C0 data in U0/U1 is not overwritten */
+    xvld      U3,  C1,  0x20
+    xvfmadd.d D4,  D4,  VALPHA,  U2 /* D4 = D4 * VALPHA + U2 */
+    xvfmadd.d D5,  D5,  VALPHA,  U3 /* D5 = D5 * VALPHA + U3 */
 #endif // #if defined(TRMMKERNEL)
 
     /* Store C0 */
@@ -2561,162 +2432,79 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     add.d     B0,  B0, T0
 #endif
 
-#ifdef LEFT
-    addi.d    OFF,   OFF,  0x08
-#endif
-#endif   // #if defined(TRMMKERNEL)
-
-/********LOOP (if(N & 2) && (M & 8) ) End************/
-
-.L_N3_M4:
-    andi      I,   M,   4
-    beq       ZERO,I,   .L_N3_M2
-
-#if defined(TRMMKERNEL)
-#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-    move     B0,    B
-#else
-    slli.d   T0,    OFF,  0x05
-    add.d    A0,    A0,   T0
-    slli.d   T0,    OFF,  0x04
-    add.d    B0,    B,    T0
-#endif
-
-#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-    sub.d    L,     K,    OFF
-#elif defined(LEFT)
-    /* number of values in A */
-    addi.d   L,     OFF,  4
-#else
-    /* number of values in B */
-    addi.d   L,     OFF,  2
-#endif
-#else   // #if !defined(TRMMKERNEL)
-    move     B0,    B
-    move     L,     K /* L = bk */
-#endif
-
-    /* Load 4 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d     U4, B0, 0x00
-    /* line 1 */
-    xvfmul.d  D0,  U0, U4
-
-    xvldrepl.d     U4, B0, 0x08
-    /* line 2 */
-    xvfmul.d  D4,  U0, U4
-
-    /* Add stride for A0 and B0 */
-    addi.d    A0,  A0, 0x20
-    addi.d    B0,  B0, 0x10
-    /* Reduce L */
-    addi.d    L,   L,  -1
-    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
-    /* if (TL < 1) goto L_N3_M4_L7 */
-    beq       ZERO,TL, .L_N3_M4_L7
-
-.L_N3_M4_TL1: /* TL-- */
-           /***8-1***/
-    /* Load 8 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x10
-
-           /***8-2***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x10
-
-           /***8-3***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x10
-
-           /***8-4***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x10
+#ifdef LEFT
+    addi.d    OFF,   OFF,  0x08
+#endif
+#endif   // #if defined(TRMMKERNEL)
 
-           /***8-5***/
-    xvld     U0,   A0,    0x00
+/********LOOP (if(N & 2) && (M & 8) ) End************/
 
-    /* Cumulative D0~D15 */
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+.L_N3_M4: /* M & 4 case under the N & 2 test at .L_N3: 4 doubles of A per k-step against 2 values of B */
+    andi      I,   M,   4 /* I = M & 4 */
+    beq       ZERO,I,   .L_N3_M2 /* I == 0: skip to the M & 2 case */
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    slli.d   T0,    OFF,  0x05 /* T0 = OFF << 5 */
+    add.d    A0,    A0,   T0
+    slli.d   T0,    OFF,  0x04 /* T0 = OFF << 4 */
+    add.d    B0,    B,    T0
+#endif
 
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x10
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d    L,     K,    OFF /* L = K - OFF */
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d   L,     OFF,  4
+#else
+    /* number of values in B */
+    addi.d   L,     OFF,  2
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
 
-           /***8-6***/
+    /* Load 4 * 64 bits (four doubles) from A0 */
     xvld     U0,   A0,    0x00
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    xvldrepl.d     U4, B0, 0x00 /* U4 = double at B0, replicated to every lane */
+    /* line 1: first k-step initialises D0 with xvfmul (no accumulate) */
+    xvfmul.d  D0,  U0, U4
 
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x10
+    xvldrepl.d     U5, B0, 0x08 /* U5 = double at B0 + 8, replicated */
+    /* line 2: first k-step initialises D4 */
+    xvfmul.d  D4,  U0, U5
 
-           /***8-7***/
-    xvld     U0,   A0,    0x00
+    /* Add stride for A0 and B0: 0x20 = 4 doubles of A, 0x10 = 2 doubles of B */
+    addi.d    A0,  A0, 0x20
+    addi.d    B0,  B0, 0x10
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = L >> 3, L being the k count minus the step already done */
+    /* if (TL < 1) goto L_N3_M4_L7 */
+    beq       ZERO,TL, .L_N3_M4_L7
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+    xvld     U8,   A0,    0x00 /* preload next A vector; NOTE(review): U8/U12/U13 are not read in the visible code, presumably consumed by KERNEL8x4x2 - confirm */
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    addi.d    TL,  TL,  -1 /* one block of the count is handled by KERNEL8x4x2_END */
 
+    xvldrepl.d  U12,   B0,  0x00 /* preload the two B values of the next k-step */
+    xvldrepl.d  U13,   B0,  0x08
     addi.d     A0,  A0, 0x20
     addi.d     B0,  B0, 0x10
 
-           /***8-8***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    beq    ZERO,    TL,  .L_N3_M4_TL1_END /* TL was 1: run only the _END variant */
 
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x10
+.L_N3_M4_TL1: /* loop body is one KERNEL8x4x2 expansion per pass */
+    KERNEL8x4x2
 
     addi.d    TL,  TL, -1 /* TL-- */
     blt       ZERO,TL, .L_N3_M4_TL1
+.L_N3_M4_TL1_END: /* final unrolled block; macro body is defined elsewhere in the file */
+    KERNEL8x4x2_END
 
 .L_N3_M4_L7:
     /* if (!(L & 7)) goto L_N3_M4_L0 */
@@ -2729,8 +2517,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvldrepl.d U4,  B0, 0x00
     xvfmadd.d  D0,  U0, U4, D0
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U0, U5, D4
 
     /* Add stride for A0, B0 */
     addi.d     A0,  A0, 0x20
@@ -2749,8 +2537,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvfmadd.d D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
 
     /* Load C1  */
-    xvld      U0,  C1,  0x00
-    xvfmadd.d D4,  D4,  VALPHA,  U0
+    xvld      U1,  C1,  0x00
+    xvfmadd.d D4,  D4,  VALPHA,  U1
 #endif // #if defined(TRMMKERNEL)
 
     /* Store C0 */
@@ -2830,106 +2618,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     /* if (TL < 1) goto L_N3_M2_L7 */
     beq       ZERO,TL, .L_N3_M2_L7
 
-.L_N3_M2_TL1: /* TL-- */
-           /***8-1***/
-    /* Load 2 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x10
-
-           /***8-2***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x10
-
-           /***8-3***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x10
-
-           /***8-4***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x10
-
-           /***8-5***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x10
-
-           /***8-6***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x10
+    xvld     U8,   A0,    0x00
 
-           /***8-7***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    addi.d    TL,  TL,  -1
 
+    xvldrepl.d  U12,   B0,  0x00
+    xvldrepl.d  U13,   B0,  0x08
     addi.d     A0,  A0, 0x10
     addi.d     B0,  B0, 0x10
 
-           /***8-8***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    beq    ZERO,    TL,  .L_N3_M2_TL1_END
 
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x10
+.L_N3_M2_TL1: /* TL-- */
+    KERNEL8x2x2
 
     addi.d    TL,  TL, -1 /* TL-- */
     blt       ZERO,TL, .L_N3_M2_TL1
+.L_N3_M2_TL1_END:
+    KERNEL8x2x2_END
 
 .L_N3_M2_L7:
     /* if (!(L & 7)) goto L_N3_M2_L0 */
@@ -2942,8 +2648,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvldrepl.d U4,  B0, 0x00
     xvfmadd.d  D0,  U0, U4, D0
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U0, U5, D4
 
     /* Add stride for A0, B0 */
     addi.d     A0,  A0, 0x10
@@ -2962,8 +2668,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvfmadd.d D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
 
     /* Load C1  */
-    xvld      U0,  C1,  0x00
-    xvfmadd.d D4,  D4,  VALPHA,  U0
+    xvld      U1,  C1,  0x00
+    xvfmadd.d D4,  D4,  VALPHA,  U1
 #endif // #if defined(TRMMKERNEL)
 
     xvstelm.d D0,  C0,  0x00,    0x00
@@ -3017,132 +2723,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else
     /* number of values in B */
     addi.d   L,     OFF,  2
-#endif
-#else   // #if !defined(TRMMKERNEL)
-    move     B0,    B
-    move     L,     K /* L = bk */
-#endif
-
-    /* Load 1 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d     U4, B0, 0x00
-    /* line 1 */
-    xvfmul.d  D0,  U0, U4
-
-    xvldrepl.d     U4, B0, 0x08
-    /* line 2 */
-    xvfmul.d  D4,  U0, U4
-
-    /* Add stride for A0 and B0 */
-    addi.d    A0,  A0, 0x08
-    addi.d    B0,  B0, 0x10
-    /* Reduce L */
-    addi.d    L,   L,  -1
-    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
-    /* if (TL < 1) goto L_N3_M1_L7 */
-    beq       ZERO,TL, .L_N3_M1_L7
-
-.L_N3_M1_TL1: /* TL-- */
-           /***8-1***/
-    /* Load 1 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x10
-
-           /***8-2***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x10
-
-           /***8-3***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x10
-
-           /***8-4***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x10
-
-           /***8-5***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x10
-
-           /***8-6***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
-
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x10
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
 
-           /***8-7***/
+    /* Load 1 * 64 from A0 */
     xvld     U0,   A0,    0x00
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    xvldrepl.d     U4, B0, 0x00
+    /* line 1 */
+    xvfmul.d  D0,  U0, U4
 
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x10
+    xvldrepl.d     U4, B0, 0x08
+    /* line 2 */
+    xvfmul.d  D4,  U0, U4
 
-           /***8-8***/
-    xvld     U0,   A0,    0x00
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x08
+    addi.d    B0,  B0, 0x10
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N3_M1_L7 */
+    beq       ZERO,TL, .L_N3_M1_L7
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+    xvld     U8,   A0,    0x00
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    addi.d    TL,  TL,  -1
 
+    xvldrepl.d  U12,   B0,  0x00
+    xvldrepl.d  U13,   B0,  0x08
     addi.d     A0,  A0, 0x08
     addi.d     B0,  B0, 0x10
 
+    beq    ZERO,    TL,  .L_N3_M1_TL1_END
+
+.L_N3_M1_TL1: /* TL-- */
+    KERNEL8x1x2
+
     addi.d    TL,  TL, -1 /* TL-- */
     blt       ZERO,TL, .L_N3_M1_TL1
+.L_N3_M1_TL1_END:
+    KERNEL8x1x2_END
 
 .L_N3_M1_L7:
     /* if (!(L & 7)) goto L_N3_M1_L0 */
@@ -3155,8 +2779,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvldrepl.d U4,  B0, 0x00
     xvfmadd.d  D0,  U0, U4, D0
 
-    xvldrepl.d U4,  B0, 0x08
-    xvfmadd.d  D4,  U0, U4, D4
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U0, U5, D4
 
     /* Add stride for A0, B0 */
     addi.d     A0,  A0, 0x08
@@ -3175,8 +2799,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     xvfmadd.d D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
 
     /* Load C1  */
-    xvld      U0,  C1,  0x00
-    xvfmadd.d D4,  D4,  VALPHA,  U0
+    xvld      U1,  C1,  0x00
+    xvfmadd.d D4,  D4,  VALPHA,  U1
 #endif // #if defined(TRMMKERNEL)
 
     xvstelm.d D0,  C0,  0x00,    0x00
@@ -3300,137 +2924,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     /* if (TL < 1) goto L_N1_L7 */
     beq       ZERO,TL, .L_N1_L7
 
-.L_N1_TL1: /* TL-- */
-           /***8-1***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-
-    addi.d     A0,  A0, 0x80
-    addi.d     B0,  B0, 0x08
-
-           /***8-2***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-
-    addi.d     A0,  A0, 0x80
-    addi.d     B0,  B0, 0x08
-
-           /***8-3***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-
-    addi.d     A0,  A0, 0x80
-    addi.d     B0,  B0, 0x08
-
-           /***8-4***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-
-    addi.d     A0,  A0, 0x80
-    addi.d     B0,  B0, 0x08
-
-           /***8-5***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-
-    addi.d     A0,  A0, 0x80
-    addi.d     B0,  B0, 0x08
-
-           /***8-6***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-
-    addi.d     A0,  A0, 0x80
-    addi.d     B0,  B0, 0x08
-
-           /***8-7***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
+    xvld     U8,   A0,    0x00
+    xvld     U9,   A0,    0x20
+    xvld     U10,  A0,    0x40
+    xvld     U11,  A0,    0x60
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
+    addi.d    TL,  TL,  -1
 
+    xvldrepl.d  U12,   B0,  0x00
     addi.d     A0,  A0, 0x80
     addi.d     B0,  B0, 0x08
 
-           /***8-8***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-    xvld     U2,   A0,    0x40
-    xvld     U3,   A0,    0x60
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-    xvfmadd.d  D2,  U2, U4, D2
-    xvfmadd.d  D3,  U3, U4, D3
-
-    addi.d     A0,  A0, 0x80
-    addi.d     B0,  B0, 0x08
+    beq    ZERO,    TL,  .L_N1_TL1_END
+.L_N1_TL1: /* TL-- */
+    KERNEL8x16x1
 
     addi.d    TL,  TL, -1 /* TL-- */
     blt       ZERO,TL, .L_N1_TL1
+.L_N1_TL1_END:
+    KERNEL8x16x1_END
 
 .L_N1_L7:
     /* if (!(L & 7)) goto L_N1_L0 */
@@ -3494,161 +3006,87 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
     slli.d    T0,  L,  0x07
     add.d     A0,  A0, T0
-    slli.d    T0,  L,  0x03
-    add.d     B0,  B0, T0
-#endif
-
-#ifdef LEFT
-    addi.d    OFF,   OFF,  0x10
-#endif
-#endif   // #if defined(TRMMKERNEL)
-
-    addi.d    I,   I,   -1  /* I-- */
-    blt       ZERO,I,   .L_N1_I1
-
-.L_N1_M8:
-    /* We have done M & 16, considering M=8/4/2/1 */
-    andi      I,   M,   15
-    beq       ZERO,I,   .L_N1_M0
-
-    andi      I,   M,   8
-    beq       ZERO,I,   .L_N1_M4
-
-#if defined(TRMMKERNEL)
-#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
-    move     B0,    B
-#else
-    slli.d   T0,    OFF,  0x06
-    add.d    A0,    A0,   T0
-    slli.d   T0,    OFF,  0x03
-    add.d    B0,    B,    T0
-#endif
-
-#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
-    sub.d    L,     K,    OFF
-#elif defined(LEFT)
-    /* number of values in A */
-    addi.d   L,     OFF,  8
-#else
-    /* number of values in B */
-    addi.d   L,     OFF,  1
-#endif
-#else   // #if !defined(TRMMKERNEL)
-    move     B0,    B
-    move     L,     K /* L = bk */
-#endif
-
-    /* Load 8 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-
-    xvldrepl.d     U4, B0, 0x00
-    /* line 1 */
-    xvfmul.d  D0,  U0, U4
-    xvfmul.d  D1,  U1, U4
-
-    /* Add stride for A0 and B0 */
-    addi.d    A0,  A0, 0x40
-    addi.d    B0,  B0, 0x08
-    /* Reduce L */
-    addi.d    L,   L,  -1
-    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
-    /* if (TL < 1) goto L_N1_M8_L7 */
-    beq       ZERO,TL, .L_N1_M8_L7
-
-.L_N1_M8_TL1: /* TL-- */
-           /***8-1***/
-    /* Load 16 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x08
-
-           /***8-2***/
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x08
-
-           /***8-3***/
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x08
-
-           /***8-4***/
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
-
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x08
+    slli.d    T0,  L,  0x03
+    add.d     B0,  B0, T0
+#endif
 
-           /***8-5***/
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
+#ifdef LEFT
+    addi.d    OFF,   OFF,  0x10
+#endif
+#endif   // #if defined(TRMMKERNEL)
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
+    addi.d    I,   I,   -1  /* I-- */
+    blt       ZERO,I,   .L_N1_I1
 
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x08
+.L_N1_M8:
+    /* We have done M & 16, considering M=8/4/2/1 */
+    andi      I,   M,   15
+    beq       ZERO,I,   .L_N1_M0
 
-           /***8-6***/
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
+    andi      I,   M,   8
+    beq       ZERO,I,   .L_N1_M4
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    slli.d   T0,    OFF,  0x06
+    add.d    A0,    A0,   T0
+    slli.d   T0,    OFF,  0x03
+    add.d    B0,    B,    T0
+#endif
 
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x08
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d   L,     OFF,  8
+#else
+    /* number of values in B */
+    addi.d   L,     OFF,  1
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
 
-           /***8-7***/
+    /* Load 8 * 64 from A0 */
     xvld     U0,   A0,    0x00
     xvld     U1,   A0,    0x20
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
+    xvldrepl.d     U4, B0, 0x00
+    /* line 1 */
+    xvfmul.d  D0,  U0, U4
+    xvfmul.d  D1,  U1, U4
 
-    addi.d     A0,  A0, 0x40
-    addi.d     B0,  B0, 0x08
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x40
+    addi.d    B0,  B0, 0x08
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N1_M8_L7 */
+    beq       ZERO,TL, .L_N1_M8_L7
 
-           /***8-8***/
-    xvld     U0,   A0,    0x00
-    xvld     U1,   A0,    0x20
+    xvld     U8,   A0,    0x00
+    xvld     U9,   A0,    0x20
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-    xvfmadd.d  D1,  U1, U4, D1
+    addi.d    TL,  TL,  -1
 
+    xvldrepl.d  U12,   B0,  0x00
     addi.d     A0,  A0, 0x40
     addi.d     B0,  B0, 0x08
 
+    beq    ZERO,    TL,  .L_N1_M8_TL1_END
+.L_N1_M8_TL1: /* TL-- */
+    KERNEL8x8x1
+
     addi.d    TL,  TL, -1 /* TL-- */
     blt       ZERO,TL, .L_N1_M8_TL1
 
+.L_N1_M8_TL1_END:
+    KERNEL8x8x1_END
+
 .L_N1_M8_L7:
     /* if (!(L & 7)) goto L_N1_M8_L0 */
     andi      TL,  L,   7
@@ -3753,81 +3191,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     /* if (TL < 1) goto L_N1_M4_L7 */
     beq       ZERO,TL, .L_N1_M4_L7
 
-.L_N1_M4_TL1: /* TL-- */
-           /***8-1***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x08
-
-           /***8-2***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x08
-
-           /***8-3***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x08
-
-           /***8-4***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x08
-
-           /***8-5***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x08
-
-           /***8-6***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x08
-
-           /***8-7***/
-    xvld     U0,   A0,    0x00
+    xvld     U8,   A0,    0x00
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+    addi.d    TL,  TL,  -1
 
+    xvldrepl.d  U12,   B0,  0x00
     addi.d     A0,  A0, 0x20
     addi.d     B0,  B0, 0x08
 
-           /***8-8***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+    beq    ZERO,    TL,  .L_N1_M4_TL1_END
 
-    addi.d     A0,  A0, 0x20
-    addi.d     B0,  B0, 0x08
+.L_N1_M4_TL1: /* TL-- */
+    KERNEL8x4x1
 
     addi.d    TL,  TL, -1 /* TL-- */
     blt       ZERO,TL, .L_N1_M4_TL1
+.L_N1_M4_TL1_END:
+    KERNEL8x4x1_END
 
 .L_N1_M4_L7:
     /* if (!(L & 7)) goto L_N1_M4_L0 */
@@ -3927,82 +3307,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     /* if (TL < 1) goto L_N1_M2_L7 */
     beq       ZERO,TL, .L_N1_M2_L7
 
-.L_N1_M2_TL1: /* TL-- */
-           /***8-1***/
-    /* Load 2 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x08
-
-           /***8-2***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x08
-
-           /***8-3***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x08
-
-           /***8-4***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x08
-
-           /***8-5***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x08
-
-           /***8-6***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x08
-
-           /***8-7***/
-    xvld     U0,   A0,    0x00
+    xvld     U8,   A0,    0x00
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+    addi.d    TL,  TL,  -1
 
+    xvldrepl.d  U12,   B0,  0x00
     addi.d     A0,  A0, 0x10
     addi.d     B0,  B0, 0x08
 
-           /***8-8***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+    beq    ZERO,    TL,  .L_N1_M2_TL1_END
 
-    addi.d     A0,  A0, 0x10
-    addi.d     B0,  B0, 0x08
+.L_N1_M2_TL1: /* TL-- */
+    KERNEL8x2x1
 
     addi.d    TL,  TL, -1 /* TL-- */
     blt       ZERO,TL, .L_N1_M2_TL1
+.L_N1_M2_TL1_END:
+    KERNEL8x2x1_END
 
 .L_N1_M2_L7:
     /* if (!(L & 7)) goto L_N1_M2_L0 */
@@ -4101,82 +3422,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     /* if (TL < 1) goto L_N1_M1_L7 */
     beq       ZERO,TL, .L_N1_M1_L7
 
-.L_N1_M1_TL1: /* TL-- */
-           /***8-1***/
-    /* Load 1 * 64 from A0 */
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x08
-
-           /***8-2***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x08
-
-           /***8-3***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x08
-
-           /***8-4***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x08
-
-           /***8-5***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x08
-
-           /***8-6***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
-
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x08
-
-           /***8-7***/
-    xvld     U0,   A0,    0x00
+    xvld     U8,   A0,    0x00
 
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+    addi.d    TL,  TL,  -1
 
+    xvldrepl.d  U12,   B0,  0x00
     addi.d     A0,  A0, 0x08
     addi.d     B0,  B0, 0x08
 
-           /***8-8***/
-    xvld     U0,   A0,    0x00
-
-    xvldrepl.d U4,  B0, 0x00
-    xvfmadd.d  D0,  U0, U4, D0
+    beq    ZERO,    TL,  .L_N1_M1_TL1_END
 
-    addi.d     A0,  A0, 0x08
-    addi.d     B0,  B0, 0x08
+.L_N1_M1_TL1: /* TL-- */
+    KERNEL8x1x1
 
     addi.d    TL,  TL, -1 /* TL-- */
     blt       ZERO,TL, .L_N1_M1_TL1
+.L_N1_M1_TL1_END:
+    KERNEL8x1x1_END
 
 .L_N1_M1_L7:
     /* if (!(L & 7)) goto L_N1_M1_L0 */
@@ -4243,7 +3505,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     LDARG    $r26,  $sp,   24
     LDARG    $r27,  $sp,   32
     LD       $f23,  $sp,   40
-    addi.d   $sp,   $sp,   56
+    LD       $f24,  $sp,   48
+    LD       $f25,  $sp,   56
+    LD       $f26,  $sp,   64
+    LD       $f27,  $sp,   72
+    LD       $f28,  $sp,   80
+    LD       $f29,  $sp,   88
+    LD       $f30,  $sp,   96
+    LD       $f31,  $sp,   104
+    addi.d   $sp,   $sp,   120
 
     jirl    $r0, $r1, 0x0
 
diff --git a/kernel/loongarch64/dgemm_kernel_8x4.S b/kernel/loongarch64/dgemm_kernel_8x4.S
new file mode 100644
index 000000000..405f1bd97
--- /dev/null
+++ b/kernel/loongarch64/dgemm_kernel_8x4.S
@@ -0,0 +1,2894 @@
+/*******************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4   // param 1: bm
+#define N      $r5   // param 2: bn
+#define K      $r6   // param 3: bk
+#define ALPHA  $f0   // param 4: alpha
+#define A      $r7   // param 5: ba
+#define B      $r8   // param 6: bb
+#define C      $r9   // param 7: bc
+#define LDC    $r10  // param 8: ldc
+
+#ifdef TRMMKERNEL
+#define OFFSET $r11  // param 9: offset
+#endif
+#define OFF    $r12
+
+/* Cycle control parameters */
+#define I      $r13
+#define J      $r14
+#define L      $r15
+#define TL     $r16
+/* Matrix address */
+#define A0     $r17
+#define B0     $r18
+#define C0     $r19
+#define C1     $r20
+#define C2     $r23
+#define C3     $r24
+#define T0     $r25 /* !! DO NOT USE $r21 and $r22 !! */
+#define T1     $r26
+#define T2     $r27
+#define ZERO   $r0
+
+/* LSX vectors */
+#define U0     $vr0   /* U0-U15: operand registers; KERNEL2x8x4 loads A into U0-U3 / U8-U11 and broadcasts B into U4-U7 / U12-U15 */
+#define U1     $vr1
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define U8     $vr8
+#define U9     $vr9
+#define U10    $vr10
+#define U11    $vr11
+#define U12    $vr12
+#define U13    $vr13
+#define U14    $vr14
+#define U15    $vr15
+#define D0     $vr16  /* D0-D15: accumulators, updated by vfmadd.d in the KERNEL2x* macros */
+#define D1     $vr17
+#define D2     $vr18
+#define D3     $vr19
+#define D4     $vr20
+#define D5     $vr21
+#define D6     $vr22
+#define D7     $vr23
+#define D8     $vr24
+#define D9     $vr25
+#define D10    $vr26
+#define D11    $vr27
+#define D12    $vr28
+#define D13    $vr29
+#define D14    $vr30
+#define D15    $vr31
+#define VALPHA $vr15  /* NOTE(review): same register as U15, which KERNEL2x8x4 and KERNEL2x4x4 overwrite with B values; presumably alpha is re-broadcast before use - confirm */
+
+/* Prefetch interval */
+#define A_PRE  0x200
+#define B_PRE  0x100
+
+.macro KERNEL2x8x4 /* two K steps of the 8x4 tile; reads U8-U15 loaded before entry and reloads them before exit */
+    vld     U0,   A0,    0x00  /* half 1: load A at A0+0x00..0x30 into U0-U3 while computing with U8-U15 */
+    vfmadd.d  D0,  U8, U12, D0  /* D0-D3 += U8-U11 * U12 */
+    vfmadd.d  D1,  U9, U12, D1
+
+    vld     U1,   A0,    0x10
+    vfmadd.d  D2,  U10, U12, D2
+    vfmadd.d  D3,  U11, U12, D3
+
+    vld     U2,   A0,    0x20
+    vfmadd.d  D4,  U8, U13, D4  /* D4-D7 += U8-U11 * U13 */
+    vfmadd.d  D5,  U9, U13, D5
+
+    vld     U3,   A0,    0x30
+    vfmadd.d  D6,  U10, U13, D6
+    vfmadd.d  D7,  U11, U13, D7
+
+    vldrepl.d U4,  B0, 0x00  /* broadcast the four B values at B0+0x00..0x18 into U4-U7 */
+    vfmadd.d  D8,  U8, U14, D8  /* D8-D11 += U8-U11 * U14 */
+    vfmadd.d  D9,  U9, U14, D9
+
+    preld     0,   B0, B_PRE  /* prefetch hint at B0 + B_PRE */
+    vldrepl.d U5,  B0, 0x08
+    vfmadd.d  D10, U10, U14, D10
+    vfmadd.d  D11, U11, U14, D11
+
+    preld     0,   A0, A_PRE  /* prefetch hint at A0 + A_PRE */
+    vldrepl.d U6,  B0, 0x10
+    vfmadd.d  D12, U8, U15, D12  /* D12-D15 += U8-U11 * U15 */
+    vfmadd.d  D13, U9, U15, D13
+
+    preld     0,   A0, A_PRE + 0x40  /* second A prefetch hint, 0x40 bytes further on */
+    vldrepl.d U7,  B0, 0x18
+    vfmadd.d  D14, U10, U15, D14
+    vfmadd.d  D15, U11, U15, D15
+
+    addi.d     A0,  A0, 0x40  /* step A0 past the 0x40 bytes just loaded */
+    addi.d     B0,  B0, 0x20  /* step B0 past the 0x20 bytes just broadcast */
+
+    vld     U8,   A0,    0x00  /* half 2: roles swapped; reload U8-U11 from A while computing with U0-U7 */
+    vfmadd.d  D0,  U0, U4, D0  /* D0-D3 += U0-U3 * U4 */
+    vfmadd.d  D1,  U1, U4, D1
+
+    vld     U9,   A0,    0x10
+    vfmadd.d  D2,  U2, U4, D2
+    vfmadd.d  D3,  U3, U4, D3
+
+    vld     U10,  A0,    0x20
+    vfmadd.d  D4,  U0, U5, D4  /* D4-D7 += U0-U3 * U5 */
+    vfmadd.d  D5,  U1, U5, D5
+
+    vld     U11,  A0,    0x30
+    vfmadd.d  D6,  U2, U5, D6
+    vfmadd.d  D7,  U3, U5, D7
+
+    vldrepl.d U12,  B0, 0x00  /* reload U12-U15 with the next four B values */
+    vfmadd.d  D8,  U0, U6, D8  /* D8-D11 += U0-U3 * U6 */
+    vfmadd.d  D9,  U1, U6, D9
+
+    preld     0,   B0, B_PRE
+    vldrepl.d U13,  B0, 0x08
+    vfmadd.d  D10, U2, U6, D10
+    vfmadd.d  D11, U3, U6, D11
+
+    preld     0,   A0, A_PRE
+    vldrepl.d U14,  B0, 0x10
+    vfmadd.d  D12, U0, U7, D12  /* D12-D15 += U0-U3 * U7 */
+    vfmadd.d  D13, U1, U7, D13
+
+    preld     0,   A0, A_PRE + 0x40
+    vldrepl.d U15,  B0, 0x18  /* overwrites $vr15, which is also defined as VALPHA */
+    vfmadd.d  D14, U2, U7, D14
+    vfmadd.d  D15, U3, U7, D15
+
+    addi.d     A0,  A0, 0x40  /* A0 and B0 end 0x80 and 0x40 bytes past their entry values */
+    addi.d     B0,  B0, 0x20
+.endm
+
+.macro KERNEL2x8x4_END /* closing variant of KERNEL2x8x4: same two K steps, but U8-U15 are not reloaded */
+    vld     U0,   A0,    0x00  /* half 1 is identical to KERNEL2x8x4: load U0-U3 from A while computing with U8-U15 */
+    vfmadd.d  D0,  U8, U12, D0  /* D0-D3 += U8-U11 * U12 */
+    vfmadd.d  D1,  U9, U12, D1
+
+    vld     U1,   A0,    0x10
+    vfmadd.d  D2,  U10, U12, D2
+    vfmadd.d  D3,  U11, U12, D3
+
+    vld     U2,   A0,    0x20
+    vfmadd.d  D4,  U8, U13, D4  /* D4-D7 += U8-U11 * U13 */
+    vfmadd.d  D5,  U9, U13, D5
+
+    vld     U3,   A0,    0x30
+    vfmadd.d  D6,  U10, U13, D6
+    vfmadd.d  D7,  U11, U13, D7
+
+    vldrepl.d U4,  B0, 0x00  /* broadcast the four B values at B0+0x00..0x18 into U4-U7 */
+    vfmadd.d  D8,  U8, U14, D8  /* D8-D11 += U8-U11 * U14 */
+    vfmadd.d  D9,  U9, U14, D9
+
+    preld     0,   B0, B_PRE  /* prefetch hint at B0 + B_PRE */
+    vldrepl.d U5,  B0, 0x08
+    vfmadd.d  D10, U10, U14, D10
+    vfmadd.d  D11, U11, U14, D11
+
+    preld     0,   A0, A_PRE  /* prefetch hint at A0 + A_PRE */
+    vldrepl.d U6,  B0, 0x10
+    vfmadd.d  D12, U8, U15, D12  /* D12-D15 += U8-U11 * U15 */
+    vfmadd.d  D13, U9, U15, D13
+
+    preld     0,   A0, A_PRE + 0x40
+    vldrepl.d U7,  B0, 0x18
+    vfmadd.d  D14, U10, U15, D14
+    vfmadd.d  D15, U11, U15, D15
+
+    addi.d     A0,  A0, 0x40  /* only pointer advance in this macro: A0 += 0x40, B0 += 0x20 */
+    addi.d     B0,  B0, 0x20
+
+    vfmadd.d  D0,  U0, U4, D0  /* half 2: arithmetic only, no vld/vldrepl.d; D0-D3 += U0-U3 * U4 */
+    vfmadd.d  D1,  U1, U4, D1
+
+    vfmadd.d  D2,  U2, U4, D2
+    vfmadd.d  D3,  U3, U4, D3
+
+    vfmadd.d  D4,  U0, U5, D4  /* D4-D7 += U0-U3 * U5 */
+    vfmadd.d  D5,  U1, U5, D5
+
+    vfmadd.d  D6,  U2, U5, D6
+    vfmadd.d  D7,  U3, U5, D7
+
+    vfmadd.d  D8,  U0, U6, D8  /* D8-D11 += U0-U3 * U6 */
+    vfmadd.d  D9,  U1, U6, D9
+
+    preld     0,   B0, B_PRE  /* prefetch hints are still issued in the closing half */
+    vfmadd.d  D10, U2, U6, D10
+    vfmadd.d  D11, U3, U6, D11
+
+    preld     0,   A0, A_PRE
+    vfmadd.d  D12, U0, U7, D12  /* D12-D15 += U0-U3 * U7 */
+    vfmadd.d  D13, U1, U7, D13
+
+    preld     0,   A0, A_PRE + 0x40
+    vfmadd.d  D14, U2, U7, D14
+    vfmadd.d  D15, U3, U7, D15
+.endm
+
+.macro KERNEL8x8x4 /* eight K steps of the 8x4 tile */
+.rept 4 /* 4 expansions x 2 K steps each */
+    KERNEL2x8x4
+.endr
+.endm
+
+.macro KERNEL8x8x4_END /* eight K steps of the 8x4 tile, ending with the variant that does not reload U8-U15 */
+.rept 3 /* first six K steps use the reloading macro */
+    KERNEL2x8x4
+.endr
+    KERNEL2x8x4_END /* last two K steps */
+.endm
+
+.macro KERNEL2x4x4 /* two K steps of the 4x4 tile; reads U8, U9, U12-U15 loaded before entry and reloads them before exit */
+    vld     U0,   A0,    0x00  /* half 1: load A at A0+0x00 and A0+0x10 into U0, U1 */
+    vld     U1,   A0,    0x10
+
+    vldrepl.d U4,  B0, 0x00  /* broadcast the four B values at B0+0x00..0x18 into U4-U7 */
+    vfmadd.d  D0,  U8, U12, D0  /* D0, D1 += U8, U9 * U12 */
+    vfmadd.d  D1,  U9, U12, D1
+
+    vldrepl.d U5,  B0, 0x08
+    vfmadd.d  D4,  U8, U13, D4  /* D4, D5 += U8, U9 * U13 */
+    vfmadd.d  D5,  U9, U13, D5
+
+    vldrepl.d U6,  B0, 0x10
+    vfmadd.d  D8,  U8, U14, D8  /* D8, D9 += U8, U9 * U14 */
+    vfmadd.d  D9,  U9, U14, D9
+
+    vldrepl.d U7,  B0, 0x18
+    vfmadd.d  D12, U8, U15, D12  /* D12, D13 += U8, U9 * U15 */
+    vfmadd.d  D13, U9, U15, D13
+
+    addi.d     A0,  A0, 0x20  /* step A0 past the 0x20 bytes just loaded */
+    addi.d     B0,  B0, 0x20  /* step B0 past the 0x20 bytes just broadcast */
+
+    vld     U8,   A0,    0x00  /* half 2: roles swapped; reload U8, U9 from A */
+    vld     U9,   A0,    0x10
+
+    vldrepl.d U12, B0, 0x00  /* reload U12-U15 with the next four B values */
+    vfmadd.d  D0,  U0, U4, D0  /* D0, D1 += U0, U1 * U4 */
+    vfmadd.d  D1,  U1, U4, D1
+
+    vldrepl.d U13, B0, 0x08
+    vfmadd.d  D4,  U0, U5, D4  /* D4, D5 += U0, U1 * U5 */
+    vfmadd.d  D5,  U1, U5, D5
+
+    vldrepl.d U14, B0, 0x10
+    vfmadd.d  D8,  U0, U6, D8  /* D8, D9 += U0, U1 * U6 */
+    vfmadd.d  D9,  U1, U6, D9
+
+    vldrepl.d U15,  B0, 0x18  /* overwrites $vr15, which is also defined as VALPHA */
+    vfmadd.d  D12, U0, U7, D12  /* D12, D13 += U0, U1 * U7 */
+    vfmadd.d  D13, U1, U7, D13
+
+    addi.d     A0,  A0, 0x20  /* A0 and B0 both end 0x40 bytes past their entry values */
+    addi.d     B0,  B0, 0x20
+.endm
+
+.macro KERNEL2x4x4_END /* Final two K-steps of the 4x4 tile; U8/U9 and U12-U15 are not reloaded */
+    vld     U0,   A0,    0x00
+    vld     U1,   A0,    0x10
+    /* Step 1: accumulate the already-loaded U8/U9 times U12-U15 while loading the step-2 operands */
+    vldrepl.d U4,  B0, 0x00
+    vfmadd.d  D0,  U8, U12, D0
+    vfmadd.d  D1,  U9, U12, D1
+
+    vldrepl.d U5,  B0, 0x08
+    vfmadd.d  D4,  U8, U13, D4
+    vfmadd.d  D5,  U9, U13, D5
+
+    vldrepl.d U6,  B0, 0x10
+    vfmadd.d  D8,  U8, U14, D8
+    vfmadd.d  D9,  U9, U14, D9
+
+    vldrepl.d U7,  B0, 0x18
+    vfmadd.d  D12, U8, U15, D12
+    vfmadd.d  D13, U9, U15, D13
+    /* Step-2 operands are loaded: advance A0 and B0 by 4 doubles each */
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x20
+    /* Step 2: accumulate U0/U1 times U4-U7 */
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D1,  U1, U4, D1
+    vfmadd.d  D4,  U0, U5, D4
+    vfmadd.d  D5,  U1, U5, D5
+    vfmadd.d  D8,  U0, U6, D8
+    vfmadd.d  D9,  U1, U6, D9
+    vfmadd.d  D12, U0, U7, D12
+    vfmadd.d  D13, U1, U7, D13
+.endm
+
+.macro KERNEL8x4x4 /* eight K-steps of the 4x4 tile: four back-to-back two-step expansions */
+    KERNEL2x4x4
+    KERNEL2x4x4
+    KERNEL2x4x4
+    KERNEL2x4x4
+.endm
+
+.macro KERNEL8x4x4_END /* eight K-steps: three pipelined expansions, then the closing variant */
+    KERNEL2x4x4
+    KERNEL2x4x4
+    KERNEL2x4x4
+    KERNEL2x4x4_END
+.endm
+
+.macro KERNEL2x2x4 /* Two K-steps of the 2x4 tile: A values are replicated, B is read as two 2-double vectors */
+    vldrepl.d     U0,   A0,    0x00
+    vldrepl.d     U1,   A0,    0x08
+    /* Step 1 on the previously loaded set: D0/D1 += U8 * U12/U13, D2/D3 += U9 * U12/U13 */
+    vfmadd.d   D0,     U8,     U12,    D0
+    vfmadd.d   D1,     U8,     U13,    D1
+    vfmadd.d   D2,     U9,     U12,    D2
+    vfmadd.d   D3,     U9,     U13,    D3
+    /* Load B for step 2, then advance A0 by 2 doubles and B0 by 4 doubles */
+    vld    U4,  B0, 0x00
+    vld    U5,  B0, 0x10
+    addi.d  A0,  A0, 0x10
+    addi.d  B0,  B0, 0x20
+    /* Reload U8/U9 here (and U12/U13 below) for the following step */
+    vldrepl.d     U8,   A0,    0x00
+    vldrepl.d     U9,   A0,    0x08
+    /* Step 2: the same accumulation using U0/U1 and U4/U5 */
+    vfmadd.d   D0,     U0,     U4,    D0
+    vfmadd.d   D1,     U0,     U5,    D1
+    vfmadd.d   D2,     U1,     U4,    D2
+    vfmadd.d   D3,     U1,     U5,    D3
+
+    vld       U12, B0, 0x00
+    vld       U13, B0, 0x10
+    addi.d    A0,  A0, 0x10
+    addi.d    B0,  B0, 0x20
+.endm
+
+.macro KERNEL2x2x4_END /* Final two K-steps of the 2x4 tile; U8/U9 and U12/U13 are not reloaded */
+    vldrepl.d     U0,   A0,    0x00
+    vldrepl.d     U1,   A0,    0x08
+    /* Step 1: D0/D1 += U8 * U12/U13, D2/D3 += U9 * U12/U13 (operands loaded before entry) */
+    vfmadd.d   D0,     U8,     U12,    D0
+    vfmadd.d   D1,     U8,     U13,    D1
+    vfmadd.d   D2,     U9,     U12,    D2
+    vfmadd.d   D3,     U9,     U13,    D3
+    /* Load B for step 2, then advance A0 by 2 doubles and B0 by 4 doubles */
+    vld     U4,  B0, 0x00
+    vld     U5,  B0, 0x10
+    addi.d  A0,  A0, 0x10
+    addi.d  B0,  B0, 0x20
+    /* Step 2: accumulate U0/U1 times U4/U5 */
+    vfmadd.d   D0,     U0,     U4,    D0
+    vfmadd.d   D1,     U0,     U5,    D1
+    vfmadd.d   D2,     U1,     U4,    D2
+    vfmadd.d   D3,     U1,     U5,    D3
+.endm
+
+.macro KERNEL8x2x4 /* eight K-steps of the 2x4 tile: four back-to-back two-step expansions */
+    KERNEL2x2x4
+    KERNEL2x2x4
+    KERNEL2x2x4
+    KERNEL2x2x4
+.endm
+
+.macro KERNEL8x2x4_END /* eight K-steps: three pipelined expansions, then the closing variant */
+    KERNEL2x2x4
+    KERNEL2x2x4
+    KERNEL2x2x4
+    KERNEL2x2x4_END
+.endm
+
+.macro KERNEL2x1x4 /* Two K-steps of the 1x4 tile: D0/D1 += one replicated A value * two B vectors */
+    vldrepl.d  U0,     A0,     0x00
+    vfmadd.d   D0,     U8,     U12,    D0
+    vfmadd.d   D1,     U8,     U13,    D1
+    vld        U4,     B0,     0x00
+    vld        U5,     B0,     0x10
+    /* Step 2 uses U0 with U4/U5; U8 and U12/U13 are reloaded from the next K position */
+    vldrepl.d  U8,     A0,     0x08
+    vfmadd.d   D0,     U0,     U4,     D0
+    vfmadd.d   D1,     U0,     U5,     D1
+    vld        U12,    B0,     0x20
+    vld        U13,    B0,     0x30
+    /* Both loads of this invocation together cover 2 doubles of A and 8 doubles of B */
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x40
+.endm
+
+.macro KERNEL2x1x4_END /* Final two K-steps of the 1x4 tile; U8 and U12/U13 are not reloaded */
+    vldrepl.d  U0,     A0,     0x00
+    vfmadd.d   D0,     U8,     U12,    D0
+    vfmadd.d   D1,     U8,     U13,    D1
+    vld        U4,     B0,     0x00
+    vld        U5,     B0,     0x10
+    /* Only one new K position was loaded: advance A0 by 1 double and B0 by 4 doubles */
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x20
+    /* Step 2: accumulate U0 times U4/U5 */
+    vfmadd.d   D0,     U0,     U4,     D0
+    vfmadd.d   D1,     U0,     U5,     D1
+.endm
+
+.macro KERNEL8x1x4 /* eight K-steps of the 1x4 tile: four back-to-back two-step expansions */
+    KERNEL2x1x4
+    KERNEL2x1x4
+    KERNEL2x1x4
+    KERNEL2x1x4
+.endm
+
+.macro KERNEL8x1x4_END /* eight K-steps: three pipelined expansions, then the closing variant */
+    KERNEL2x1x4
+    KERNEL2x1x4
+    KERNEL2x1x4
+    KERNEL2x1x4_END
+.endm
+
+.macro KERNEL2x8x2 /* Two K-steps of the 8x2 tile: D0-D3 use the first B value, D4-D7 the second */
+    vld     U0,   A0,    0x00
+    vfmadd.d  D0,  U8, U12, D0
+    vfmadd.d  D1,  U9, U12, D1
+    /* Step 1: accumulate U8-U11 (A) times U12/U13 (replicated B) while loading U0-U3 */
+    vld     U1,   A0,    0x10
+    vfmadd.d  D2,  U10, U12, D2
+    vfmadd.d  D3,  U11, U12, D3
+
+    vld     U2,   A0,    0x20
+    vfmadd.d  D4,  U8, U13, D4
+    vfmadd.d  D5,  U9, U13, D5
+
+    vld     U3,   A0,    0x30
+    vfmadd.d  D6,  U10, U13, D6
+    vfmadd.d  D7,  U11, U13, D7
+    /* The two B values for step 2, each replicated into both lanes */
+    vldrepl.d U4,  B0, 0x00
+    vldrepl.d U5,  B0, 0x08
+    /* One K-step consumed: 8 doubles of A and 2 doubles of B */
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x10
+    /* Step 2: accumulate U0-U3 times U4/U5 while reloading U8-U11 and U12/U13 */
+    vld     U8,   A0,    0x00
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D1,  U1, U4, D1
+
+    vld     U9,   A0,    0x10
+    vfmadd.d  D2,  U2, U4, D2
+    vfmadd.d  D3,  U3, U4, D3
+
+    vld     U10,   A0,   0x20
+    vfmadd.d  D4,  U0, U5, D4
+    vfmadd.d  D5,  U1, U5, D5
+
+    vld     U11,   A0,   0x30
+    vfmadd.d  D6,  U2, U5, D6
+    vfmadd.d  D7,  U3, U5, D7
+
+    vldrepl.d U12,  B0, 0x00
+    vldrepl.d U13,  B0, 0x08
+
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x10
+.endm
+
+.macro KERNEL2x8x2_END /* Final two K-steps of the 8x2 tile; U8-U11 and U12/U13 are not reloaded */
+    vld     U0,   A0,    0x00
+    vfmadd.d  D0,  U8, U12, D0
+    vfmadd.d  D1,  U9, U12, D1
+    /* Step 1: accumulate the already-loaded U8-U11 times U12/U13 while loading U0-U3 */
+    vld     U1,   A0,    0x10
+    vfmadd.d  D2,  U10, U12, D2
+    vfmadd.d  D3,  U11, U12, D3
+
+    vld     U2,   A0,    0x20
+    vfmadd.d  D4,  U8, U13, D4
+    vfmadd.d  D5,  U9, U13, D5
+
+    vld     U3,   A0,    0x30
+    vfmadd.d  D6,  U10, U13, D6
+    vfmadd.d  D7,  U11, U13, D7
+    /* The two B values for step 2, each replicated into both lanes */
+    vldrepl.d U4,  B0, 0x00
+    vldrepl.d U5,  B0, 0x08
+    /* Step-2 operands are loaded: advance A0 by 8 doubles and B0 by 2 doubles */
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x10
+    /* Step 2: accumulate U0-U3 times U4/U5 */
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D1,  U1, U4, D1
+
+    vfmadd.d  D2,  U2, U4, D2
+    vfmadd.d  D3,  U3, U4, D3
+
+    vfmadd.d  D4,  U0, U5, D4
+    vfmadd.d  D5,  U1, U5, D5
+
+    vfmadd.d  D6,  U2, U5, D6
+    vfmadd.d  D7,  U3, U5, D7
+.endm
+
+.macro KERNEL8x8x2 /* eight K-steps of the 8x2 tile: four back-to-back two-step expansions */
+    KERNEL2x8x2
+    KERNEL2x8x2
+    KERNEL2x8x2
+    KERNEL2x8x2
+.endm
+
+.macro KERNEL8x8x2_END /* eight K-steps: three pipelined expansions, then the closing variant */
+    KERNEL2x8x2
+    KERNEL2x8x2
+    KERNEL2x8x2
+    KERNEL2x8x2_END
+.endm
+
+.macro KERNEL2x4x2 /* Two K-steps of the 4x2 tile: D0/D1 use the first B value, D4/D5 the second */
+    vld     U0,   A0,    0x00
+    vld     U1,   A0,    0x10
+    vfmadd.d  D0,  U8, U12, D0
+    vfmadd.d  D1,  U9, U12, D1
+    vfmadd.d  D4,  U8, U13, D4
+    vfmadd.d  D5,  U9, U13, D5
+    /* Step 1 above used the preloaded U8/U9 and U12/U13; U4/U5 are the replicated B values of step 2 */
+    vldrepl.d U4,  B0, 0x00
+    vldrepl.d U5,  B0, 0x08
+    /* Step 2 accumulates U0/U1 times U4/U5 while U8/U9 are reloaded from the next K position */
+    vld     U8,   A0,    0x20
+    vld     U9,   A0,    0x30
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D1,  U1, U4, D1
+    vfmadd.d  D4,  U0, U5, D4
+    vfmadd.d  D5,  U1, U5, D5
+
+    vldrepl.d U12,  B0, 0x10
+    vldrepl.d U13,  B0, 0x18
+    /* Both loads of this invocation together cover 8 doubles of A and 4 doubles of B */
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x20
+.endm
+
+.macro KERNEL2x4x2_END /* Final two K-steps of the 4x2 tile; U8/U9 and U12/U13 are not reloaded */
+    vld     U0,   A0,    0x00
+    vld     U1,   A0,    0x10
+    vfmadd.d  D0,  U8, U12, D0
+    vfmadd.d  D1,  U9, U12, D1
+    vfmadd.d  D4,  U8, U13, D4
+    vfmadd.d  D5,  U9, U13, D5
+    /* Step 1 above used the preloaded set; U4/U5 are the replicated B values of step 2 */
+    vldrepl.d U4,  B0, 0x00
+    vldrepl.d U5,  B0, 0x08
+    /* Only one new K position was loaded: advance A0 by 4 doubles and B0 by 2 doubles */
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x10
+    /* Step 2: accumulate U0/U1 times U4/U5 */
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D1,  U1, U4, D1
+    vfmadd.d  D4,  U0, U5, D4
+    vfmadd.d  D5,  U1, U5, D5
+.endm
+
+.macro KERNEL8x4x2 /* eight K-steps of the 4x2 tile: four back-to-back two-step expansions */
+    KERNEL2x4x2
+    KERNEL2x4x2
+    KERNEL2x4x2
+    KERNEL2x4x2
+.endm
+
+.macro KERNEL8x4x2_END /* eight K-steps: three pipelined expansions, then the closing variant */
+    KERNEL2x4x2
+    KERNEL2x4x2
+    KERNEL2x4x2
+    KERNEL2x4x2_END
+.endm
+
+.macro KERNEL2x2x2 /* Two K-steps of the 2x2 tile: D0 uses the first B value, D4 the second */
+    vld     U0,   A0,    0x00
+    vfmadd.d  D0,  U8, U12, D0
+    vfmadd.d  D4,  U8, U13, D4
+    /* Step 1 above used the preloaded U8 and U12/U13; U4/U5 are the replicated B values of step 2 */
+    vldrepl.d U4,  B0, 0x00
+    vldrepl.d U5,  B0, 0x08
+    /* Reload U8 and U12/U13 from the next K position for the following step */
+    vld     U8,   A0,    0x10
+    vldrepl.d U12,  B0, 0x10
+    vldrepl.d U13,  B0, 0x18
+    /* Step 2: accumulate U0 times U4/U5 */
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D4,  U0, U5, D4
+    /* Both loads of this invocation together cover 4 doubles of A and 4 doubles of B */
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x20
+.endm
+
+.macro KERNEL2x2x2_END /* Final two K-steps of the 2x2 tile; U8 and U12/U13 are not reloaded */
+    vld     U0,   A0,    0x00
+    vfmadd.d  D0,  U8, U12, D0
+    vfmadd.d  D4,  U8, U13, D4
+    /* Step 1 above used the preloaded set; U4/U5 are the replicated B values of step 2 */
+    vldrepl.d U4,  B0, 0x00
+    vldrepl.d U5,  B0, 0x08
+    /* Only one new K position was loaded: advance A0 and B0 by 2 doubles each */
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x10
+    /* Step 2: accumulate U0 times U4/U5 */
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D4,  U0, U5, D4
+.endm
+
+.macro KERNEL8x2x2 /* eight K-steps of the 2x2 tile: four back-to-back two-step expansions */
+    KERNEL2x2x2
+    KERNEL2x2x2
+    KERNEL2x2x2
+    KERNEL2x2x2
+.endm
+
+.macro KERNEL8x2x2_END /* eight K-steps: three pipelined expansions, then the closing variant */
+    KERNEL2x2x2
+    KERNEL2x2x2
+    KERNEL2x2x2
+    KERNEL2x2x2_END
+.endm
+
+.macro KERNEL2x1x2 /* Two K-steps of the 1x2 tile: D0 += one replicated A value * a 2-double B vector */
+    vldrepl.d U0,  A0, 0x00
+    vfmadd.d  D0,  U8, U12, D0
+    /* Step 1 above used the preloaded U8/U12; step 2 uses U0/U4 while U8/U12 are reloaded */
+    vld       U4,  B0, 0x00
+    vldrepl.d U8,  A0, 0x08
+    vld       U12, B0, 0x10
+    vfmadd.d  D0,  U0, U4, D0
+    /* Both loads of this invocation together cover 2 doubles of A and 4 doubles of B */
+    addi.d    A0,  A0, 0x10
+    addi.d    B0,  B0, 0x20
+.endm
+
+.macro KERNEL2x1x2_END /* Final two K-steps of the 1x2 tile; U8 and U12 are not reloaded */
+    vldrepl.d U0,  A0, 0x00
+    vfmadd.d  D0,  U8, U12, D0
+    /* Only one new K position is loaded: advance A0 by 1 double and B0 by 2 doubles */
+    vld       U4,  B0, 0x00
+    addi.d    A0,  A0, 0x08
+    addi.d    B0,  B0, 0x10
+    /* Step 2: accumulate U0 times U4 */
+    vfmadd.d  D0,  U0, U4, D0
+.endm
+
+.macro KERNEL8x1x2 /* eight K-steps of the 1x2 tile: four back-to-back two-step expansions */
+    KERNEL2x1x2
+    KERNEL2x1x2
+    KERNEL2x1x2
+    KERNEL2x1x2
+.endm
+
+.macro KERNEL8x1x2_END /* eight K-steps: three pipelined expansions, then the closing variant */
+    KERNEL2x1x2
+    KERNEL2x1x2
+    KERNEL2x1x2
+    KERNEL2x1x2_END
+.endm
+
+.macro KERNEL2x8x1 /* Two K-steps of the 8x1 tile: D0-D3 += four A vectors * one replicated B value */
+    vld     U0,   A0,    0x00
+    vfmadd.d  D0,  U8, U12, D0
+    vfmadd.d  D1,  U9, U12, D1
+    /* Step 1: accumulate the preloaded U8-U11 times U12 while loading U0-U3 */
+    vld     U1,   A0,    0x10
+    vfmadd.d  D2,  U10, U12, D2
+    vfmadd.d  D3,  U11, U12, D3
+    /* U4 is the B value of step 2, replicated into both lanes */
+    vldrepl.d U4,  B0, 0x00
+    vld     U2,   A0,    0x20
+    vld     U3,   A0,    0x30
+    /* Step 2: accumulate U0-U3 times U4 while reloading U8-U11 from the next K position (A0 + 0x40) */
+    vld     U8,   A0,    0x40
+
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D1,  U1, U4, D1
+
+    vld     U9,   A0,    0x50
+    vfmadd.d  D2,  U2, U4, D2
+    vfmadd.d  D3,  U3, U4, D3
+
+    vld     U10,   A0,    0x60
+    vld     U11,   A0,    0x70
+
+    vldrepl.d U12,  B0, 0x08
+    /* Both loads of this invocation together cover 16 doubles of A and 2 doubles of B */
+    addi.d     A0,  A0, 0x80
+    addi.d     B0,  B0, 0x10
+.endm
+
+.macro KERNEL2x8x1_END /* Final two K-steps of the 8x1 tile; U8-U11 and U12 are not reloaded */
+    vld     U0,   A0,    0x00
+    vfmadd.d  D0,  U8, U12, D0
+    vfmadd.d  D1,  U9, U12, D1
+    /* Step 1: accumulate the preloaded U8-U11 times U12 while loading U0-U3 */
+    vld     U1,   A0,    0x10
+    vfmadd.d  D2,  U10, U12, D2
+    vfmadd.d  D3,  U11, U12, D3
+
+    vld     U2,   A0,    0x20
+    vld     U3,   A0,    0x30
+    /* U4 is the B value of step 2, replicated into both lanes */
+    vldrepl.d U4,  B0, 0x00
+    /* Only one new K position was loaded: advance A0 by 8 doubles and B0 by 1 double */
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x08
+    /* Step 2: accumulate U0-U3 times U4 */
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D1,  U1, U4, D1
+
+    vfmadd.d  D2,  U2, U4, D2
+    vfmadd.d  D3,  U3, U4, D3
+.endm
+
+.macro KERNEL8x8x1 /* eight K-steps of the 8x1 tile: four back-to-back two-step expansions */
+    KERNEL2x8x1
+    KERNEL2x8x1
+    KERNEL2x8x1
+    KERNEL2x8x1
+.endm
+
+.macro KERNEL8x8x1_END /* eight K-steps: three pipelined expansions, then the closing variant */
+    KERNEL2x8x1
+    KERNEL2x8x1
+    KERNEL2x8x1
+    KERNEL2x8x1_END
+.endm
+
+.macro KERNEL2x4x1 /* Two K-steps of the 4x1 tile: D0/D1 += two A vectors * one replicated B value */
+    vld     U0,   A0,    0x00
+    vld     U1,   A0,    0x10
+    vfmadd.d  D0,  U8, U12, D0
+    vfmadd.d  D1,  U9, U12, D1
+    vldrepl.d U4,  B0, 0x00
+    /* Step 1 above used the preloaded U8/U9 and U12; step 2 uses U0/U1 and U4 while those are reloaded */
+    vld     U8,   A0,    0x20
+    vld     U9,   A0,    0x30
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D1,  U1, U4, D1
+    vldrepl.d U12,  B0, 0x08
+    /* Both loads of this invocation together cover 8 doubles of A and 2 doubles of B */
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x10
+.endm
+
+.macro KERNEL2x4x1_END /* Final two K-steps of the 4x1 tile; U8/U9 and U12 are not reloaded */
+    vld     U0,   A0,    0x00
+    vld     U1,   A0,    0x10
+    vfmadd.d  D0,  U8, U12, D0
+    vfmadd.d  D1,  U9, U12, D1
+    vldrepl.d U4,  B0, 0x00
+    /* Only one new K position was loaded: advance A0 by 4 doubles and B0 by 1 double */
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x08
+    /* Step 2: accumulate U0/U1 times U4 */
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D1,  U1, U4, D1
+.endm
+
+.macro KERNEL8x4x1 /* eight K-steps of the 4x1 tile: four back-to-back two-step expansions */
+    KERNEL2x4x1
+    KERNEL2x4x1
+    KERNEL2x4x1
+    KERNEL2x4x1
+.endm
+
+.macro KERNEL8x4x1_END /* eight K-steps: three pipelined expansions, then the closing variant */
+    KERNEL2x4x1
+    KERNEL2x4x1
+    KERNEL2x4x1
+    KERNEL2x4x1_END
+.endm
+
+.macro KERNEL2x2x1 /* Two K-steps of the 2x1 tile: D0 += one A vector * one replicated B value */
+    vld     U0,   A0,    0x00
+    vfmadd.d  D0,  U8, U12, D0
+    vldrepl.d U4,  B0, 0x00
+    /* Step 1 above used the preloaded U8/U12; advance A0 by 2 doubles and B0 by 1 double */
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x08
+    /* Step 2: accumulate U0 times U4 while reloading U8 and U12 for the following step */
+    vld     U8,   A0,    0x00
+    vfmadd.d  D0,  U0, U4, D0
+    vldrepl.d U12,  B0, 0x00
+
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x08
+.endm
+
+.macro KERNEL2x2x1_END /* Final two K-steps of the 2x1 tile; U8 and U12 are not reloaded */
+    vld     U0,   A0,    0x00
+    vfmadd.d  D0,  U8, U12, D0
+    vldrepl.d U4,  B0, 0x00
+    /* Only one new K position was loaded: advance A0 by 2 doubles and B0 by 1 double */
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x08
+    /* Step 2: accumulate U0 times U4 */
+    vfmadd.d  D0,  U0, U4, D0
+.endm
+
+.macro KERNEL8x2x1 /* eight K-steps of the 2x1 tile: four back-to-back two-step expansions */
+    KERNEL2x2x1
+    KERNEL2x2x1
+    KERNEL2x2x1
+    KERNEL2x2x1
+.endm
+
+.macro KERNEL8x2x1_END /* eight K-steps: three pipelined expansions, then the closing variant */
+    KERNEL2x2x1
+    KERNEL2x2x1
+    KERNEL2x2x1
+    KERNEL2x2x1_END
+.endm
+
+.macro KERNEL2x1x1 /* Two K-steps of the 1x1 tile: both the A and the B value are loaded replicated */
+    vldrepl.d  U0,  A0,    0x00
+    vfmadd.d   D0,  U8, U12, D0
+    vldrepl.d  U4,  B0, 0x00
+    /* Step 1 above used the preloaded U8/U12; advance A0 and B0 by 1 double each */
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x08
+    /* Step 2: accumulate U0 times U4 while reloading U8 and U12 for the following step */
+    vldrepl.d  U8,  A0,    0x00
+    vfmadd.d   D0,  U0, U4, D0
+    vldrepl.d  U12, B0, 0x00
+
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x08
+.endm
+
+.macro KERNEL2x1x1_END /* Final two K-steps of the 1x1 tile; U8 and U12 are not reloaded */
+    vldrepl.d  U0,  A0,    0x00
+    vfmadd.d   D0,  U8, U12, D0
+    vldrepl.d  U4,  B0, 0x00
+    /* Only one new K position was loaded: advance A0 and B0 by 1 double each */
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x08
+    /* Step 2: accumulate U0 times U4 */
+    vfmadd.d  D0,  U0, U4, D0
+.endm
+
+.macro KERNEL8x1x1 /* eight K-steps of the 1x1 tile: four back-to-back two-step expansions */
+    KERNEL2x1x1
+    KERNEL2x1x1
+    KERNEL2x1x1
+    KERNEL2x1x1
+.endm
+
+.macro KERNEL8x1x1_END /* eight K-steps: three pipelined expansions, then the closing variant */
+    KERNEL2x1x1
+    KERNEL2x1x1
+    KERNEL2x1x1
+    KERNEL2x1x1_END
+.endm
+
+
+    PROLOGUE
+
+    addi.d   $sp,   $sp,   -112
+    /* Store regs */
+    SDARG    $r23,  $sp,   0
+    SDARG    $r24,  $sp,   8
+    SDARG    $r25,  $sp,   16
+    SDARG    $r26,  $sp,   24
+    SDARG    $r27,  $sp,   32
+    ST       $f24,  $sp,   40
+    ST       $f25,  $sp,   48
+    ST       $f26,  $sp,   56
+    ST       $f27,  $sp,   64
+    ST       $f28,  $sp,   72
+    ST       $f29,  $sp,   80
+    ST       $f30,  $sp,   88
+    ST       $f31,  $sp,   96
+    ST       ALPHA, $sp,   104
+
+#if defined (TRMMKERNEL) && !defined(LEFT)
+    sub.d   OFF,   ZERO,  OFFSET
+#else
+    xor     OFF,   OFF,   OFF
+#endif
+
+    /* if (!(N >> 2)) goto L_N3 */
+    srai.d   J,     N,     2     /* J = bn >> 2 */
+    andi     N,     N,     0x03
+    vldrepl.d  VALPHA, $sp, 104 /* When N < 4, VALPHA will not changed */
+    beq      ZERO,  J,     .L_N3
+
+.L_J1: /* J-- && This loop include Condition 1 */
+
+/************************* Condition 1 if((N >> 2) && (M >> 3)) START !!! *************************
+*                                                   dgemm_core_8x4                                 */
+    move     C0,    C
+    move     A0,    A
+    slli.d   T0,    LDC,   3
+    add.d    C1,    C0,    T0
+    addi.d   J,     J,     -1   /* J-- */
+    add.d    C2,    C1,    T0
+    add.d    C3,    C2,    T0
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move     OFF,   OFFSET
+#endif
+
+    /* if (!(M >> 3)) goto L_M8 */
+    srai.d   I,     M,     3     /* I = bm >> 3 */
+    beq      ZERO,  I,     .L_M8
+
+.L_I1: /* I-- */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    slli.d   T0,    OFF,  0x06
+    add.d    A0,    A0,   T0
+    slli.d   T0,    OFF,  0x05
+    add.d    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d   L,     OFF,  8
+#else
+    /* number of values in B */
+    addi.d   L,     OFF,  4
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    /* Calculate the first set of D0~D15 with plain multiplies,
+     * avoiding a separate zeroing of the accumulators.
+     * Load 8 doubles (8 * 64 bits) from A0:
+     * U0 = {a1,  a0}
+     * U1 = {a3,  a2}
+     * U2 = {a5,  a4}
+     * U3 = {a7,  a6}
+     */
+    vld     U0,   A0,    0x00
+    vld     U1,   A0,    0x10
+    vld     U2,   A0,    0x20
+    vld     U3,   A0,    0x30
+
+    vldrepl.d     U4, B0, 0x00
+    preld         0,  C0, 0x00
+    /* line 1 */
+    vfmul.d  D0,  U0, U4
+    vfmul.d  D1,  U1, U4
+    preld    0,   C0, 0x20
+    vfmul.d  D2,  U2, U4
+    vfmul.d  D3,  U3, U4
+
+    vldrepl.d     U5, B0, 0x08
+    preld         0,  C1, 0x00
+    /* line 2 */
+    vfmul.d  D4,  U0, U5
+    vfmul.d  D5,  U1, U5
+    preld    0,   C1, 0x20
+    vfmul.d  D6,  U2, U5
+    vfmul.d  D7,  U3, U5
+
+    vldrepl.d     U6, B0, 0x10
+    preld         0,  C2, 0x00
+    /* line 3 */
+    vfmul.d  D8,  U0, U6
+    vfmul.d  D9,  U1, U6
+    preld    0,   C2, 0x20
+    vfmul.d  D10, U2, U6
+    vfmul.d  D11, U3, U6
+
+    vldrepl.d     U7, B0, 0x18
+    preld         0,  C3, 0x00
+    /* line 4 */
+    vfmul.d  D12, U0, U7
+    vfmul.d  D13, U1, U7
+    preld    0,   C3, 0x20
+    vfmul.d  D14, U2, U7
+    vfmul.d  D15, U3, U7
+
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x40
+    addi.d    B0,  B0, 0x20
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_L7 */
+    beq       ZERO,TL, .L_L7
+
+    vld     U8,   A0,    0x00
+    vld     U9,   A0,    0x10
+    vld     U10,  A0,    0x20
+    vld     U11,  A0,    0x30
+
+    addi.d    TL,  TL,  -1
+
+    vldrepl.d  U12,   B0,  0x00
+    vldrepl.d  U13,   B0,  0x08
+    vldrepl.d  U14,   B0,  0x10
+    vldrepl.d  U15,   B0,  0x18
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x20
+
+    beq    ZERO,    TL,  .L_TL1_END
+.L_TL1: /* TL-- */
+    KERNEL8x8x4
+    addi.d    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_TL1
+
+.L_TL1_END:
+    KERNEL8x8x4_END
+
+   /* Handle the remaining (L & 7) K-steps, at most 7,
+    * accumulating into D0~D15 one step at a time.
+    */
+.L_L7:
+    /* if (!(L & 7)) goto L_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_L0
+
+.L_L71:
+    /* Load 8 * 64 from A0 */
+    vld     U0,   A0,    0x00
+    vld     U1,   A0,    0x10
+    vld     U2,   A0,    0x20
+    vld     U3,   A0,    0x30
+
+    /* Cumulative D0~D15 */
+    vldrepl.d U4,  B0, 0x00
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D1,  U1, U4, D1
+    vfmadd.d  D2,  U2, U4, D2
+    vfmadd.d  D3,  U3, U4, D3
+
+    vldrepl.d U5,  B0, 0x08
+    vfmadd.d  D4,  U0, U5, D4
+    vfmadd.d  D5,  U1, U5, D5
+    vfmadd.d  D6,  U2, U5, D6
+    vfmadd.d  D7,  U3, U5, D7
+
+    vldrepl.d U6,  B0, 0x10
+    vfmadd.d  D8,  U0, U6, D8
+    vfmadd.d  D9,  U1, U6, D9
+    vfmadd.d  D10, U2, U6, D10
+    vfmadd.d  D11, U3, U6, D11
+
+    vldrepl.d U7,  B0, 0x18
+    vfmadd.d  D12, U0, U7, D12
+    vfmadd.d  D13, U1, U7, D13
+    vfmadd.d  D14, U2, U7, D14
+    vfmadd.d  D15, U3, U7, D15
+
+    /* Add stride for A0, B0 */
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x20
+
+    addi.d     TL,  TL, -1
+    blt        ZERO,TL, .L_L71
+
+.L_L0:
+    vldrepl.d  VALPHA, $sp, 104
+#if defined(TRMMKERNEL)
+    vfmul.d  D0,   D0,  VALPHA
+    vfmul.d  D1,   D1,  VALPHA
+    vfmul.d  D2,   D2,  VALPHA
+    vfmul.d  D3,   D3,  VALPHA
+    vfmul.d  D4,   D4,  VALPHA
+    vfmul.d  D5,   D5,  VALPHA
+    vfmul.d  D6,   D6,  VALPHA
+    vfmul.d  D7,   D7,  VALPHA
+    vfmul.d  D8,   D8,  VALPHA
+    vfmul.d  D9,   D9,  VALPHA
+    vfmul.d  D10,  D10, VALPHA
+    vfmul.d  D11,  D11, VALPHA
+    vfmul.d  D12,  D12, VALPHA
+    vfmul.d  D13,  D13, VALPHA
+    vfmul.d  D14,  D14, VALPHA
+    vfmul.d  D15,  D15, VALPHA
+#else
+    /* Load C0  */
+    vld      U0,  C0,  0x00
+    vld      U1,  C0,  0x10
+    vld      U2,  C0,  0x20
+    vld      U3,  C0,  0x30
+    vfmadd.d D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
+    vfmadd.d D1,  D1,  VALPHA,  U1
+    vfmadd.d D2,  D2,  VALPHA,  U2
+    vfmadd.d D3,  D3,  VALPHA,  U3
+
+    /* Load C1  */
+    vld      U4,  C1,  0x00
+    vld      U5,  C1,  0x10
+    vld      U6,  C1,  0x20
+    vld      U7,  C1,  0x30
+    vfmadd.d D4,  D4,  VALPHA,  U4
+    vfmadd.d D5,  D5,  VALPHA,  U5
+    vfmadd.d D6,  D6,  VALPHA,  U6
+    vfmadd.d D7,  D7,  VALPHA,  U7
+
+    /* Load C2  */
+    vld      U8,  C2,  0x00
+    vld      U9,  C2,  0x10
+    vld      U10, C2,  0x20
+    vld      U11, C2,  0x30
+    vfmadd.d D8,  D8,  VALPHA,  U8
+    vfmadd.d D9,  D9,  VALPHA,  U9
+    vfmadd.d D10, D10, VALPHA,  U10
+    vfmadd.d D11, D11, VALPHA,  U11
+
+    /* Load C3  */
+    vld      U0,  C3,  0x00
+    vld      U1,  C3,  0x10
+    vld      U2,  C3,  0x20
+    vld      U3,  C3,  0x30
+    vfmadd.d D12, D12, VALPHA,  U0
+    vfmadd.d D13, D13, VALPHA,  U1
+    vfmadd.d D14, D14, VALPHA,  U2
+    vfmadd.d D15, D15, VALPHA,  U3
+#endif // #if defined(TRMMKERNEL)
+
+    /* Store C0 */
+    vst      D0,  C0,  0x00
+    vst      D1,  C0,  0x10
+    vst      D2,  C0,  0x20
+    vst      D3,  C0,  0x30
+    /* Store C1 */
+    vst      D4,  C1,  0x00
+    vst      D5,  C1,  0x10
+    vst      D6,  C1,  0x20
+    vst      D7,  C1,  0x30
+    /* Store C2 */
+    vst      D8,  C2,  0x00
+    vst      D9,  C2,  0x10
+    vst      D10, C2,  0x20
+    vst      D11, C2,  0x30
+    /* Store C3 */
+    vst      D12, C3,  0x00
+    vst      D13, C3,  0x10
+    vst      D14, C3,  0x20
+    vst      D15, C3,  0x30
+
+    /* Add stride for C */
+    addi.d    C0,  C0,  0x40
+    addi.d    C1,  C1,  0x40
+    addi.d    C2,  C2,  0x40
+    addi.d    C3,  C3,  0x40
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d     L,   K,   OFF
+#ifdef LEFT
+    /* number of values  in A */
+    addi.d    L,   L,   -8
+#else
+    /* number of values in B */
+    addi.d    L,   L,   -4
+#endif
+    slli.d    T0,  L,  0x06
+    add.d     A0,  A0, T0
+    slli.d    T0,  L,  0x05
+    add.d     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    addi.d    OFF, OFF, 0x08
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d    I,   I,   -1  /* I-- */
+    blt       ZERO,I,   .L_I1
+
+.L_M8:
+    /* All full blocks of 8 rows (M >> 3) are done; handle the M & 7 remainder as 4/2/1 */
+    andi      I,   M,   7
+    beq       ZERO,I,   .L_M0
+
+    andi      I,   M,   4
+    beq       ZERO,I,   .L_M2
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    slli.d   T0,    OFF,  0x05
+    add.d    A0,    A0,   T0
+    add.d    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d   L,     OFF,  4
+#else
+    /* number of values in B */
+    addi.d   L,     OFF,  4
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+
+    /* Load 4 * 64 from A0 */
+    vld     U0,   A0,    0x00
+    vld     U1,   A0,    0x10
+
+    vldrepl.d     U4, B0, 0x00
+    /* line 1 */
+    vfmul.d  D0,  U0, U4
+    vfmul.d  D1,  U1, U4
+
+    vldrepl.d     U5, B0, 0x08
+    /* line 2 */
+    vfmul.d  D4,  U0, U5
+    vfmul.d  D5,  U1, U5
+
+    vldrepl.d     U6, B0, 0x10
+    /* line 3 */
+    vfmul.d  D8,  U0, U6
+    vfmul.d  D9,  U1, U6
+
+    vldrepl.d     U7, B0, 0x18
+    /* line 4 */
+    vfmul.d  D12, U0, U7
+    vfmul.d  D13, U1, U7
+
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x20
+    addi.d    B0,  B0, 0x20
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_M4_L7 */
+    beq       ZERO,TL, .L_M4_L7
+
+    vld     U8,   A0,    0x00
+    vld     U9,   A0,    0x10
+
+    addi.d    TL,  TL,  -1
+
+    vldrepl.d  U12,   B0,  0x00
+    vldrepl.d  U13,   B0,  0x08
+    vldrepl.d  U14,   B0,  0x10
+    vldrepl.d  U15,   B0,  0x18
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x20
+
+    beq    ZERO,    TL,  .L_M4_TL1_END
+
+.L_M4_TL1: /* TL-- */
+    KERNEL8x4x4
+
+    addi.d     TL,  TL, -1
+    blt        ZERO,TL, .L_M4_TL1
+
+.L_M4_TL1_END:
+    KERNEL8x4x4_END
+
+.L_M4_L7:
+    /* if (!(L & 7)) goto L_M4_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_M4_L0
+
+.L_M4_L71:
+    vld     U0,   A0,    0x00
+    vld     U1,   A0,    0x10
+
+    vldrepl.d U4,  B0, 0x00
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D1,  U1, U4, D1
+
+    vldrepl.d U5,  B0, 0x08
+    vfmadd.d  D4,  U0, U5, D4
+    vfmadd.d  D5,  U1, U5, D5
+
+    vldrepl.d U6,  B0, 0x10
+    vfmadd.d  D8,  U0, U6, D8
+    vfmadd.d  D9,  U1, U6, D9
+
+    vldrepl.d U7,  B0, 0x18
+    vfmadd.d  D12, U0, U7, D12
+    vfmadd.d  D13, U1, U7, D13
+
+    /* Add stride for A0, B0 */
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x20
+
+    addi.d     TL,  TL, -1
+    blt        ZERO,TL, .L_M4_L71
+
+.L_M4_L0:
+    vldrepl.d  VALPHA, $sp, 104
+#if defined(TRMMKERNEL)
+    vfmul.d  D0,   D0,  VALPHA
+    vfmul.d  D1,   D1,  VALPHA
+    vfmul.d  D4,   D4,  VALPHA
+    vfmul.d  D5,   D5,  VALPHA
+    vfmul.d  D8,   D8,  VALPHA
+    vfmul.d  D9,   D9,  VALPHA
+    vfmul.d  D12,  D12, VALPHA
+    vfmul.d  D13,  D13, VALPHA
+#else
+    /* Load C0  */
+    vld      U0,  C0,  0x00
+    vld      U1,  C0,  0x10
+    vfmadd.d D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
+    vfmadd.d D1,  D1,  VALPHA,  U1
+
+    /* Load C1  */
+    vld      U2,  C1,  0x00
+    vld      U3,  C1,  0x10
+    vfmadd.d D4,  D4,  VALPHA,  U2
+    vfmadd.d D5,  D5,  VALPHA,  U3
+
+    /* Load C2  */
+    vld      U4,  C2,  0x00
+    vld      U5,  C2,  0x10
+    vfmadd.d D8,  D8,  VALPHA,  U4
+    vfmadd.d D9,  D9,  VALPHA,  U5
+
+    /* Load C3  */
+    vld      U6,  C3,  0x00
+    vld      U7,  C3,  0x10
+    vfmadd.d D12, D12, VALPHA,  U6
+    vfmadd.d D13, D13, VALPHA,  U7
+#endif   // #if defined(TRMMKERNEL)
+
+    /* Store C0 */
+    vst      D0,  C0,  0x00
+    vst      D1,  C0,  0x10
+    /* Store C1 */
+    vst      D4,  C1,  0x00
+    vst      D5,  C1,  0x10
+    /* Store C2 */
+    vst      D8,  C2,  0x00
+    vst      D9,  C2,  0x10
+    /* Store C3 */
+    vst      D12, C3,  0x00
+    vst      D13, C3,  0x10
+
+    /* Add stride for C */
+    addi.d    C0,  C0,  0x20
+    addi.d    C1,  C1,  0x20
+    addi.d    C2,  C2,  0x20
+    addi.d    C3,  C3,  0x20
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d     L,   K,   OFF
+#ifdef LEFT
+    /* number of values in A */
+    addi.d    L,   L,   -4
+#else
+    /* number of values in B */
+    addi.d    L,   L,   -4
+#endif
+    slli.d    T0,  L,  0x05
+    add.d     A0,  A0, T0
+    add.d     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    /* number of values in A */
+    addi.d    OFF,   OFF,  0x04
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+/********LOOP (if(N >> 2 ) && (M & 4) ) End************/
+
+.L_M2:
+    andi      I,   M,   2
+    beq       ZERO,I,   .L_M1
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    slli.d   T0,    OFF,  0x04
+    add.d    A0,    A0,   T0
+    slli.d   T0,    OFF,  0x05
+    add.d    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d   L,     OFF,  2
+#else
+    /* number of values in B */
+    addi.d   L,     OFF,  4
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+
+    /* Load 2 * 64 from A0 */
+    vldrepl.d     U0,   A0,    0x00
+    vldrepl.d     U1,   A0,    0x08
+
+    vld    U4,     B0,     0x00
+    vld    U5,     B0,     0x10
+
+    vfmul.d    D0,     U0,     U4
+    vfmul.d    D1,     U0,     U5
+    vfmul.d    D2,     U1,     U4
+    vfmul.d    D3,     U1,     U5
+
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x10
+    addi.d    B0,  B0, 0x20
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_M2_L7 */
+    beq       ZERO,TL, .L_M2_L7
+
+    vldrepl.d     U8,   A0,    0x00
+    vldrepl.d     U9,   A0,    0x08
+
+    addi.d    TL,  TL,  -1
+
+    vld       U12, B0, 0x00
+    vld       U13, B0, 0x10
+    addi.d    A0,  A0, 0x10
+    addi.d    B0,  B0, 0x20
+
+    beq    ZERO,    TL,  .L_M2_TL1_END
+.L_M2_TL1: /* TL-- */
+    KERNEL8x2x4
+
+    addi.d    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_M2_TL1
+.L_M2_TL1_END:
+    KERNEL8x2x4_END
+
+.L_M2_L7:
+    /* if (!(L & 7)) goto L_M2_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_M2_L0
+
+.L_M2_L71:
+    vldrepl.d     U0,   A0,    0x00
+    vldrepl.d     U1,   A0,    0x08
+
+    vld    U4,  B0, 0x00
+    vld    U5,  B0, 0x10
+
+    vfmadd.d   D0,     U0,     U4,    D0
+    vfmadd.d   D1,     U0,     U5,    D1
+    vfmadd.d   D2,     U1,     U4,    D2
+    vfmadd.d   D3,     U1,     U5,    D3
+    /* Add stride for A0, B0 */
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x20
+
+    addi.d     TL,  TL, -1
+    blt        ZERO,TL, .L_M2_L71
+
+.L_M2_L0:
+    vldrepl.d  VALPHA, $sp, 104
+#if defined(TRMMKERNEL)
+    vfmul.d  D0,   D0,  VALPHA
+    vfmul.d  D1,   D1,  VALPHA
+    vfmul.d  D2,   D2,  VALPHA
+    vfmul.d  D3,   D3,  VALPHA
+
+    vstelm.d  D0,  C0,  0x00,    0x00
+    vstelm.d  D0,  C1,  0x00,    0x01
+    vstelm.d  D1,  C2,  0x00,    0x00
+    vstelm.d  D1,  C3,  0x00,    0x01
+    vstelm.d  D2,  C0,  0x08,    0x00
+    vstelm.d  D2,  C1,  0x08,    0x01
+    vstelm.d  D3,  C2,  0x08,    0x00
+    vstelm.d  D3,  C3,  0x08,    0x01
+#else
+    /* Load C0  */
+    vld      U0,  C0,  0x00
+    /* Load C1  */
+    vld      U1,  C1,  0x00
+    /* Load C2  */
+    vld      U2,  C2,  0x00
+    /* Load C3  */
+    vld      U3,  C3,  0x00
+
+    vilvl.d  D4,  D2,  D0   //C0
+    vilvh.d  D5,  D2,  D0   //C1
+    vilvl.d  D6,  D3,  D1   //C2
+    vilvh.d  D7,  D3,  D1   //C3
+
+    vfmadd.d D0,  D4,  VALPHA,  U0
+    vfmadd.d D2,  D5,  VALPHA,  U1
+    vfmadd.d D1,  D6,  VALPHA,  U2
+    vfmadd.d D3,  D7,  VALPHA,  U3
+
+    vst      D0,  C0,  0x00
+    vst      D2,  C1,  0x00
+    vst      D1,  C2,  0x00
+    vst      D3,  C3,  0x00
+#endif   // #if defined(TRMMKERNEL)
+
+    /* Add stride for C */
+    addi.d    C0,  C0,  0x10
+    addi.d    C1,  C1,  0x10
+    addi.d    C2,  C2,  0x10
+    addi.d    C3,  C3,  0x10
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d     L,   K,   OFF
+#ifdef LEFT
+    /* number of values in A */
+    addi.d    L,   L,   -2
+#else
+    /* number of values in B */
+    addi.d    L,   L,   -4
+#endif
+    slli.d    T0,  L,  0x04
+    add.d     A0,  A0, T0
+    slli.d    T0,  L,  0x05
+    add.d     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    /* number of values in A */
+    addi.d    OFF,   OFF,  0x02
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+/********LOOP (if(N >> 2 ) && (M & 2) ) End************/
+
+.L_M1:
+    andi      I,   M,   1
+    beq       ZERO,I,   .L_M0
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    slli.d   T0,    OFF,  0x03
+    add.d    A0,    A0,   T0
+    slli.d   T0,    OFF,  0x05
+    add.d    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d   L,     OFF,  1
+#else
+    /* number of values in B */
+    addi.d   L,     OFF,  4
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+
+    vldrepl.d  U0,     A0,     0x00
+    vld        U4,     B0,     0x00
+    vld        U5,     B0,     0x10
+    vfmul.d    D0,     U0,     U4
+    vfmul.d    D1,     U0,     U5
+
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x08
+    addi.d    B0,  B0, 0x20
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_M1_L7 */
+    beq       ZERO,TL, .L_M1_L7
+
+    vldrepl.d  U8,     A0,     0x00
+
+    addi.d     TL,  TL,  -1
+    vld        U12, B0,  0x00
+    vld        U13, B0,  0x10
+    addi.d     A0,  A0,  0x08
+    addi.d     B0,  B0,  0x20
+
+    beq    ZERO,    TL,  .L_M1_TL1_END
+
+.L_M1_TL1: /* TL-- */
+    KERNEL8x1x4
+
+    addi.d    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_M1_TL1
+.L_M1_TL1_END:
+    KERNEL8x1x4_END
+
+.L_M1_L7:
+    /* if (!(L & 7)) goto L_M1_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_M1_L0
+
+.L_M1_L71:
+    vldrepl.d  U0,     A0,     0x00
+    vld        U4,     B0,     0x00
+    vld        U5,     B0,     0x10
+    vfmadd.d   D0,     U0,     U4,    D0
+    vfmadd.d   D1,     U0,     U5,    D1
+
+    /* Add stride for A0, B0 */
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x20
+
+    addi.d     TL,  TL, -1
+    blt        ZERO,TL, .L_M1_L71
+
+.L_M1_L0:
+    vldrepl.d  VALPHA, $sp, 104
+#if defined(TRMMKERNEL)
+    vfmul.d  D0,   D0,  VALPHA
+    vfmul.d  D1,   D1,  VALPHA
+
+    vstelm.d D0,  C0,  0x00,    0x00
+    vstelm.d D0,  C1,  0x00,    0x01
+    vstelm.d D1,  C2,  0x00,    0x00
+    vstelm.d D1,  C3,  0x00,    0x01
+#else
+    /* Load C0  */
+    vldrepl.d     U0,  C0,  0x00
+    vldrepl.d     U1,  C1,  0x00
+    vilvl.d       D4,  U1,  U0
+    vfmadd.d D6,  D0,  VALPHA,  D4
+
+    vldrepl.d     U2,  C2,  0x00
+    vldrepl.d     U3,  C3,  0x00
+    vilvl.d       D5,  U3,  U2
+    vfmadd.d D7,  D1,  VALPHA,  D5
+
+    vstelm.d D6,  C0,  0x00,    0x00
+    vstelm.d D6,  C1,  0x00,    0x01
+    vstelm.d D7,  C2,  0x00,    0x00
+    vstelm.d D7,  C3,  0x00,    0x01
+#endif   // #if defined(TRMMKERNEL)
+
+    /* Add stride for C */
+    addi.d    C0,  C0,  0x08
+    addi.d    C1,  C1,  0x08
+    addi.d    C2,  C2,  0x08
+    addi.d    C3,  C3,  0x08
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d     L,   K,   OFF
+#ifdef LEFT
+    /* number of values in A */
+    addi.d    L,   L,   -1
+#else
+    /* number of values in B */
+    addi.d    L,   L,   -4
+#endif
+    slli.d    T0,  L,  0x03
+    add.d     A0,  A0, T0
+    slli.d    T0,  L,  0x05
+    add.d     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    /* number of values in A */
+    addi.d    OFF,   OFF,  0x01
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+/********LOOP (if(N >> 2 ) && (M & 1) ) End************/
+
+.L_M0:
+    /* Add stride for B and C
+     * B += (K * 32)
+     * C += (LDC * 32)
+     */
+    /* This pass covered 4 columns of 8-byte doubles,
+     * so the byte stride is 4 * 8 = 32 per K / LDC step
+     */
+    slli.d    T0,   K,   5
+    slli.d    T1,   LDC, 5
+    add.d     B,    B,   T0
+    add.d     C,    C,   T1
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi.d    OFF,  OFF, 0x04
+#endif
+
+    blt      ZERO,  J,   .L_J1
+
+//////////////// go back to L_J1 /////////////////
+/////////////////////////////////////////////////
+/************************ Condition 1 if((N >> 2) && (M >> 3)) END !!! ************************/
+
+    vldrepl.d  VALPHA, $sp, 104
+
+.L_N3:
+    andi     J,    N,   2
+    beq      ZERO, J,   .L_N1
+
+/************************* Condition 2 if((N & 2) && (M >> 3)) START !!! *************************
+*                                                   dgemm_core_8x2                                 */
+
+    move     C0,    C
+    move     A0,    A
+    slli.d   T0,    LDC,   3
+    add.d    C1,    C0,    T0
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move     OFF,   OFFSET
+#endif
+
+    /* if (!(M >> 3)) goto L_N3_M8 */
+    srai.d   I,     M,     3     /* I = bm >> 3 */
+    beq      ZERO,  I,     .L_N3_M8
+
+.L_N3_I1:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    slli.d   T0,    OFF,  0x06
+    add.d    A0,    A0,   T0
+    slli.d   T0,    OFF,  0x04
+    add.d    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d   L,     OFF,  8
+#else
+    /* number of values in B */
+    addi.d   L,     OFF,  2
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+
+    /* Load 8 * 64 from A0
+     * U0 = {a1,  a0}
+     * U1 = {a3,  a2}
+     * U2 = {a5,  a4}
+     * U3 = {a7,  a6}
+     */
+    vld     U0,   A0,    0x00
+    vld     U1,   A0,    0x10
+    vld     U2,   A0,    0x20
+    vld     U3,   A0,    0x30
+
+    vldrepl.d     U4, B0, 0x00
+    /* line 1 */
+    vfmul.d  D0,  U0, U4
+    vfmul.d  D1,  U1, U4
+    vfmul.d  D2,  U2, U4
+    vfmul.d  D3,  U3, U4
+
+    vldrepl.d     U5, B0, 0x08
+    /* line 2 */
+    vfmul.d  D4,  U0, U5
+    vfmul.d  D5,  U1, U5
+    vfmul.d  D6,  U2, U5
+    vfmul.d  D7,  U3, U5
+
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x40
+    addi.d    B0,  B0, 0x10
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N3_L7 */
+    beq       ZERO,TL, .L_N3_L7
+
+    vld     U8,   A0,    0x00
+    vld     U9,   A0,    0x10
+    vld     U10,  A0,    0x20
+    vld     U11,  A0,    0x30
+
+    addi.d    TL,  TL,  -1
+
+    vldrepl.d  U12,   B0,  0x00
+    vldrepl.d  U13,   B0,  0x08
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x10
+
+    beq    ZERO,    TL,  .L_N3_TL1_END
+
+.L_N3_TL1: /* TL-- */
+    KERNEL8x8x2
+
+    addi.d    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N3_TL1
+.L_N3_TL1_END:
+    KERNEL8x8x2_END
+
+.L_N3_L7:
+    /* if (!(L & 7)) goto L_N3_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N3_L0
+
+.L_N3_L71:
+    /* Load 8 * 64 from A0 */
+    vld     U0,   A0,    0x00
+    vld     U1,   A0,    0x10
+    vld     U2,   A0,    0x20
+    vld     U3,   A0,    0x30
+
+    vldrepl.d U4,  B0, 0x00
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D1,  U1, U4, D1
+    vfmadd.d  D2,  U2, U4, D2
+    vfmadd.d  D3,  U3, U4, D3
+
+    vldrepl.d U5,  B0, 0x08
+    vfmadd.d  D4,  U0, U5, D4
+    vfmadd.d  D5,  U1, U5, D5
+    vfmadd.d  D6,  U2, U5, D6
+    vfmadd.d  D7,  U3, U5, D7
+    /* Add stride for A0, B0 */
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x10
+
+    addi.d     TL,  TL, -1
+    blt        ZERO,TL, .L_N3_L71
+
+.L_N3_L0:
+#if defined(TRMMKERNEL)
+    vfmul.d  D0,   D0,  VALPHA
+    vfmul.d  D1,   D1,  VALPHA
+    vfmul.d  D2,   D2,  VALPHA
+    vfmul.d  D3,   D3,  VALPHA
+    vfmul.d  D4,   D4,  VALPHA
+    vfmul.d  D5,   D5,  VALPHA
+    vfmul.d  D6,   D6,  VALPHA
+    vfmul.d  D7,   D7,  VALPHA
+#else
+    /* Load C0  */
+    vld      U0,  C0,  0x00
+    vld      U1,  C0,  0x10
+    vld      U2,  C0,  0x20
+    vld      U3,  C0,  0x30
+    vfmadd.d D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
+    vfmadd.d D1,  D1,  VALPHA,  U1
+    vfmadd.d D2,  D2,  VALPHA,  U2
+    vfmadd.d D3,  D3,  VALPHA,  U3
+
+    /* Load C1  */
+    vld      U4,  C1,  0x00
+    vld      U5,  C1,  0x10
+    vld      U6,  C1,  0x20
+    vld      U7,  C1,  0x30
+    vfmadd.d D4,  D4,  VALPHA,  U4
+    vfmadd.d D5,  D5,  VALPHA,  U5
+    vfmadd.d D6,  D6,  VALPHA,  U6
+    vfmadd.d D7,  D7,  VALPHA,  U7
+#endif // #if defined(TRMMKERNEL)
+
+    /* Store C0 */
+    vst      D0,  C0,  0x00
+    vst      D1,  C0,  0x10
+    vst      D2,  C0,  0x20
+    vst      D3,  C0,  0x30
+    /* Store C1 */
+    vst      D4,  C1,  0x00
+    vst      D5,  C1,  0x10
+    vst      D6,  C1,  0x20
+    vst      D7,  C1,  0x30
+
+    /* Add stride for C */
+    addi.d    C0,  C0,  0x40
+    addi.d    C1,  C1,  0x40
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d     L,   K,   OFF
+#ifdef LEFT
+    addi.d    L,   L,   -8
+#else
+    addi.d    L,   L,   -2
+#endif
+    slli.d    T0,  L,  0x06
+    add.d     A0,  A0, T0
+    slli.d    T0,  L,  0x04
+    add.d     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    addi.d    OFF,   OFF,  0x8
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d    I,   I,   -1  /* I-- */
+    blt       ZERO,I,   .L_N3_I1
+
+.L_N3_M8:
+    /* The M >> 3 blocks of 8 rows are done; handle the M & 7 remainder (4/2/1) */
+    andi      I,   M,   7
+    beq       ZERO,I,   .L_N3_M0
+
+    andi      I,   M,   4
+    beq       ZERO,I,   .L_N3_M2
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    slli.d   T0,    OFF,  0x05
+    add.d    A0,    A0,   T0
+    slli.d   T0,    OFF,  0x04
+    add.d    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d   L,     OFF,  4
+#else
+    /* number of values in B */
+    addi.d   L,     OFF,  2
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+
+    /* Load 4 * 64 from A0 */
+    vld     U0,   A0,    0x00
+    vld     U1,   A0,    0x10
+
+    vldrepl.d     U4, B0, 0x00
+    /* line 1 */
+    vfmul.d  D0,  U0, U4
+    vfmul.d  D1,  U1, U4
+
+    vldrepl.d     U5, B0, 0x08
+    /* line 2 */
+    vfmul.d  D4,  U0, U5
+    vfmul.d  D5,  U1, U5
+
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x20
+    addi.d    B0,  B0, 0x10
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N3_M4_L7 */
+    beq       ZERO,TL, .L_N3_M4_L7
+
+    vld     U8,   A0,    0x00
+    vld     U9,   A0,    0x10
+
+    addi.d    TL,  TL,  -1
+
+    vldrepl.d  U12,   B0,  0x00
+    vldrepl.d  U13,   B0,  0x08
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x10
+
+    beq    ZERO,    TL,  .L_N3_M4_TL1_END
+
+.L_N3_M4_TL1: /* TL-- */
+    KERNEL8x4x2
+
+    addi.d    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N3_M4_TL1
+.L_N3_M4_TL1_END:
+    KERNEL8x4x2_END
+
+.L_N3_M4_L7:
+    /* if (!(L & 7)) goto L_N3_M4_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N3_M4_L0
+
+.L_N3_M4_L71:
+    vld     U0,   A0,    0x00
+    vld     U1,   A0,    0x10
+
+    vldrepl.d U4,  B0, 0x00
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D1,  U1, U4, D1
+
+    vldrepl.d U5,  B0, 0x08
+    vfmadd.d  D4,  U0, U5, D4
+    vfmadd.d  D5,  U1, U5, D5
+
+    /* Add stride for A0, B0 */
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x10
+
+    addi.d     TL,  TL, -1
+    blt        ZERO,TL, .L_N3_M4_L71
+
+.L_N3_M4_L0:
+#if defined(TRMMKERNEL)
+    vfmul.d  D0,   D0,  VALPHA
+    vfmul.d  D1,   D1,  VALPHA
+    vfmul.d  D4,   D4,  VALPHA
+    vfmul.d  D5,   D5,  VALPHA
+#else
+    /* Load C0  */
+    vld      U0,  C0,  0x00
+    vld      U1,  C0,  0x10
+    vfmadd.d D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
+    vfmadd.d D1,  D1,  VALPHA,  U1
+
+    /* Load C1  */
+    vld      U2,  C1,  0x00
+    vld      U3,  C1,  0x10
+    vfmadd.d D4,  D4,  VALPHA,  U2
+    vfmadd.d D5,  D5,  VALPHA,  U3
+#endif // #if defined(TRMMKERNEL)
+
+    /* Store C0 */
+    vst      D0,  C0,  0x00
+    vst      D1,  C0,  0x10
+    /* Store C1 */
+    vst      D4,  C1,  0x00
+    vst      D5,  C1,  0x10
+
+    /* Add stride for C */
+    addi.d    C0,  C0,  0x20
+    addi.d    C1,  C1,  0x20
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d     L,   K,   OFF
+#ifdef LEFT
+    addi.d    L,   L,   -4
+#else
+    addi.d    L,   L,   -2
+#endif
+    slli.d    T0,  L,  0x05
+    add.d     A0,  A0, T0
+    slli.d    T0,  L,  0x04
+    add.d     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    addi.d    OFF,   OFF,  0x04
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+/********LOOP (if(N & 2 ) && (M & 4) ) End************/
+
+.L_N3_M2:
+    andi      I,   M,   2
+    beq       ZERO,I,   .L_N3_M1
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    slli.d   T0,    OFF,  0x04
+    add.d    A0,    A0,   T0
+    add.d    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d   L,     OFF,  2
+#else
+    /* number of values in B */
+    addi.d   L,     OFF,  2
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+
+    /* Load 2 * 64 from A0 */
+    vld     U0,   A0,    0x00
+
+    vldrepl.d     U4, B0, 0x00
+    /* line 1 */
+    vfmul.d  D0,  U0, U4
+
+    vldrepl.d     U4, B0, 0x08
+    /* line 2 */
+    vfmul.d  D4,  U0, U4
+
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x10
+    addi.d    B0,  B0, 0x10
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N3_M2_L7 */
+    beq       ZERO,TL, .L_N3_M2_L7
+
+    vld     U8,   A0,    0x00
+
+    addi.d    TL,  TL,  -1
+
+    vldrepl.d  U12,   B0,  0x00
+    vldrepl.d  U13,   B0,  0x08
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x10
+
+    beq    ZERO,    TL,  .L_N3_M2_TL1_END
+
+.L_N3_M2_TL1: /* TL-- */
+    KERNEL8x2x2
+
+    addi.d    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N3_M2_TL1
+.L_N3_M2_TL1_END:
+    KERNEL8x2x2_END
+
+.L_N3_M2_L7:
+    /* if (!(L & 7)) goto L_N3_M2_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N3_M2_L0
+
+.L_N3_M2_L71:
+    vld     U0,   A0,    0x00
+
+    vldrepl.d U4,  B0, 0x00
+    vldrepl.d U5,  B0, 0x08
+    vfmadd.d  D0,  U0, U4, D0
+
+    vfmadd.d  D4,  U0, U5, D4
+
+    /* Add stride for A0, B0 */
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x10
+
+    addi.d     TL,  TL, -1
+    blt        ZERO,TL, .L_N3_M2_L71
+
+.L_N3_M2_L0:
+#if defined(TRMMKERNEL)
+    vfmul.d  D0,   D0,  VALPHA
+    vfmul.d  D4,   D4,  VALPHA
+#else
+    /* Load C0  */
+    vld      U0,  C0,  0x00
+    vfmadd.d D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
+
+    /* Load C1  */
+    vld      U1,  C1,  0x00
+    vfmadd.d D4,  D4,  VALPHA,  U1
+#endif // #if defined(TRMMKERNEL)
+
+    vst       D0,  C0,  0x00
+    vst       D4,  C1,  0x00
+
+    /* Add stride for C */
+    addi.d    C0,  C0,  0x10
+    addi.d    C1,  C1,  0x10
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d     L,   K,   OFF
+#ifdef LEFT
+    addi.d    L,   L,   -2
+#else
+    addi.d    L,   L,   -2
+#endif
+    slli.d    T0,  L,  0x04
+    add.d     A0,  A0, T0
+    add.d     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    addi.d    OFF,   OFF,  0x02
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+/********LOOP (if(N & 2 ) && (M & 2) ) End************/
+
+.L_N3_M1:
+    andi      I,   M,   1
+    beq       ZERO,I,   .L_N3_M0
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    slli.d   T0,    OFF,  0x03
+    add.d    A0,    A0,   T0
+    slli.d   T0,    OFF,  0x04
+    add.d    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d   L,     OFF,  1
+#else
+    /* number of values in B */
+    addi.d   L,     OFF,  2
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+
+    /* Load 1 * 64 from A0 */
+    vldrepl.d   U0,   A0,  0x00
+
+    vld         U4,   B0,  0x00
+    /* line 1 */
+    vfmul.d  D0,  U0, U4
+
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x08
+    addi.d    B0,  B0, 0x10
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N3_M1_L7 */
+    beq       ZERO,TL, .L_N3_M1_L7
+
+    vldrepl.d  U8,   A0,    0x00
+
+    addi.d    TL,  TL,  -1
+
+    vld        U12,  B0,    0x00
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x10
+
+    beq    ZERO,    TL,  .L_N3_M1_TL1_END
+
+.L_N3_M1_TL1: /* TL-- */
+    KERNEL8x1x2
+
+    addi.d    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N3_M1_TL1
+.L_N3_M1_TL1_END:
+    KERNEL8x1x2_END
+
+.L_N3_M1_L7:
+    /* if (!(L & 7)) goto L_N3_M1_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N3_M1_L0
+
+.L_N3_M1_L71:
+    vldrepl.d  U0,  A0,    0x00
+
+    vld        U4,  B0,    0x00
+    vfmadd.d   D0,  U0, U4, D0
+
+    /* Add stride for A0, B0 */
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x10
+
+    addi.d     TL,  TL, -1
+    blt        ZERO,TL, .L_N3_M1_L71
+
+.L_N3_M1_L0:
+#if defined(TRMMKERNEL)
+    vfmul.d  D0,   D0,  VALPHA
+#else
+    /* Load exactly one double from each of C0 and C1: a 16-byte vld would read past the last row */
+    vldrepl.d  U0,  C0,  0x00
+    vldrepl.d  U1,  C1,  0x00
+    vilvl.d  U2,  U1,  U0 /* U2 = {C0[0], C1[0]} (low lanes of U0 and U1) */
+    vfmadd.d D0,  D0,  VALPHA,  U2 /* D0 = U2 + (D0 * VALPHA) */
+#endif // #if defined(TRMMKERNEL)
+
+    vstelm.d D0,  C0,  0x00,  0x00 /* lane 0 -> C0[0] */
+    vstelm.d D0,  C1,  0x00,  0x01 /* lane 1 -> C1[0] */
+
+    /* Add stride for C */
+    addi.d    C0,  C0,  0x08
+    addi.d    C1,  C1,  0x08
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d     L,   K,   OFF
+#ifdef LEFT
+    addi.d    L,   L,   -1
+#else
+    addi.d    L,   L,   -2
+#endif
+    slli.d    T0,  L,  0x03
+    add.d     A0,  A0, T0
+    slli.d    T0,  L,  0x04
+    add.d     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    addi.d    OFF,   OFF,  0x01
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+/********LOOP (if(N & 2 ) && (M & 1) ) End************/
+
+.L_N3_M0:
+    /* Add stride for B and C
+     * B += (K * 16)
+     * C += (LDC * 16)
+     */
+    /* This pass covered 2 columns of 8-byte doubles,
+     * so the byte stride is 2 * 8 = 16 per K / LDC step
+     */
+    slli.d    T0,   K,   4
+    slli.d    T1,   LDC, 4
+    add.d     B,    B,   T0
+    add.d     C,    C,   T1
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi.d    OFF,  OFF, 0x02
+#endif
+
+    /* NOTE(review): I is recomputed as M >> 3 right after .L_N1, so this value looks unused */
+    srai.d   I,     M,   4     /* I = bm >> 4 */
+
+/************************* Condition 2 if((N & 2) && (M >> 3)) End !!! *************************
+*                                                   dgemm_core_8x2                                 */
+
+.L_N1:
+    andi     J,    N,   1
+    beq      ZERO, J,   .L_N0
+
+/************************* Condition 3 if((N & 1) && (M >> 3)) START !!! *************************
+*                                                   dgemm_core_8x1                                 */
+
+    move     C0,    C
+    move     A0,    A
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move     OFF,   OFFSET
+#endif
+
+    /* if (!(M >> 3)) goto L_N1_M8 */
+    srai.d   I,     M,     3     /* I = bm >> 3 */
+    beq      ZERO,  I,     .L_N1_M8
+
+.L_N1_I1:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    slli.d   T0,    OFF,  0x06
+    add.d    A0,    A0,   T0
+    slli.d   T0,    OFF,  0x03
+    add.d    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d   L,     OFF,  8
+#else
+    /* number of values in B */
+    addi.d   L,     OFF,  1
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+
+    /* Load 8 * 64 from A0
+     * U0 = {a1,  a0}
+     * U1 = {a3,  a2}
+     * U2 = {a5,  a4}
+     * U3 = {a7,  a6}
+     */
+
+    vld     U0,   A0,    0x00
+    vld     U1,   A0,    0x10
+    vld     U2,   A0,    0x20
+    vld     U3,   A0,    0x30
+
+    vldrepl.d   U4, B0, 0x00
+    /* line 1 */
+    vfmul.d  D0,  U0, U4
+    vfmul.d  D1,  U1, U4
+    vfmul.d  D2,  U2, U4
+    vfmul.d  D3,  U3, U4
+
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x40
+    addi.d    B0,  B0, 0x08
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N1_L7 */
+    beq       ZERO,TL, .L_N1_L7
+
+    vld     U8,   A0,    0x00
+    vld     U9,   A0,    0x10
+    vld     U10,  A0,    0x20
+    vld     U11,  A0,    0x30
+
+    addi.d    TL,  TL,  -1
+
+    vldrepl.d  U12,   B0,  0x00
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x08
+
+    beq    ZERO,    TL,  .L_N1_TL1_END
+.L_N1_TL1: /* TL-- */
+    KERNEL8x8x1
+
+    addi.d    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N1_TL1
+.L_N1_TL1_END:
+    KERNEL8x8x1_END
+
+.L_N1_L7:
+    /* if (!(L & 7)) goto L_N1_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N1_L0
+
+.L_N1_L71:
+    /* Load 8 * 64 from A0 */
+    vld     U0,   A0,    0x00
+    vld     U1,   A0,    0x10
+    vld     U2,   A0,    0x20
+    vld     U3,   A0,    0x30
+
+    vldrepl.d U4,  B0, 0x00
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D1,  U1, U4, D1
+    vfmadd.d  D2,  U2, U4, D2
+    vfmadd.d  D3,  U3, U4, D3
+
+    /* Add stride for A0, B0 */
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x08
+
+    addi.d     TL,  TL, -1
+    blt        ZERO,TL, .L_N1_L71
+
+.L_N1_L0:
+#if defined(TRMMKERNEL)
+    vfmul.d  D0,   D0,  VALPHA
+    vfmul.d  D1,   D1,  VALPHA
+    vfmul.d  D2,   D2,  VALPHA
+    vfmul.d  D3,   D3,  VALPHA
+#else
+    /* Load C0  */
+    vld      U0,  C0,  0x00
+    vld      U1,  C0,  0x10
+    vld      U2,  C0,  0x20
+    vld      U3,  C0,  0x30
+    vfmadd.d D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
+    vfmadd.d D1,  D1,  VALPHA,  U1
+    vfmadd.d D2,  D2,  VALPHA,  U2
+    vfmadd.d D3,  D3,  VALPHA,  U3
+#endif // #if defined(TRMMKERNEL)
+
+    /* Store C0 */
+    vst      D0,  C0,  0x00
+    vst      D1,  C0,  0x10
+    vst      D2,  C0,  0x20
+    vst      D3,  C0,  0x30
+
+    /* Add stride for C */
+    addi.d    C0,  C0,  0x40
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d     L,   K,   OFF
+#ifdef LEFT
+    addi.d    L,   L,   -8
+#else
+    addi.d    L,   L,   -1
+#endif
+    slli.d    T0,  L,  0x06
+    add.d     A0,  A0, T0
+    slli.d    T0,  L,  0x03
+    add.d     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    addi.d    OFF,   OFF,  0x8
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d    I,   I,   -1  /* I-- */
+    blt       ZERO,I,   .L_N1_I1
+
+.L_N1_M8:
+    /* The M >> 3 blocks of 8 rows are done; handle the M & 7 remainder (4/2/1) */
+    andi      I,   M,   7
+    beq       ZERO,I,   .L_N1_M0
+
+    andi      I,   M,   4
+    beq       ZERO,I,   .L_N1_M2
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    slli.d   T0,    OFF,  0x05
+    add.d    A0,    A0,   T0
+    slli.d   T0,    OFF,  0x03
+    add.d    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d   L,     OFF,  4
+#else
+    /* number of values in B */
+    addi.d   L,     OFF,  1
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+
+    /* Load 4 * 64 from A0 */
+    vld     U0,   A0,    0x00
+    vld     U1,   A0,    0x10
+
+    vldrepl.d     U4, B0, 0x00
+    /* line 1 */
+    vfmul.d  D0,  U0, U4
+    vfmul.d  D1,  U1, U4
+
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x20
+    addi.d    B0,  B0, 0x08
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N1_M4_L7 */
+    beq       ZERO,TL, .L_N1_M4_L7
+
+    vld     U8,   A0,    0x00
+    vld     U9,   A0,    0x10
+
+    addi.d    TL,  TL,  -1
+
+    vldrepl.d  U12,   B0,  0x00
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x08
+
+    beq    ZERO,    TL,  .L_N1_M4_TL1_END
+
+.L_N1_M4_TL1: /* TL-- */
+    KERNEL8x4x1
+
+    addi.d    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N1_M4_TL1
+.L_N1_M4_TL1_END:
+    KERNEL8x4x1_END
+
+.L_N1_M4_L7:
+    /* if (!(L & 7)) goto L_N1_M4_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N1_M4_L0
+
+.L_N1_M4_L71:
+    vld     U0,   A0,    0x00
+    vld     U1,   A0,    0x10
+
+    vldrepl.d U4,  B0, 0x00
+    vfmadd.d  D0,  U0, U4, D0
+    vfmadd.d  D1,  U1, U4, D1
+
+    /* Add stride for A0, B0 */
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x08
+
+    addi.d     TL,  TL, -1
+    blt        ZERO,TL, .L_N1_M4_L71
+
+.L_N1_M4_L0:
+#if defined(TRMMKERNEL)
+    vfmul.d  D0,   D0,  VALPHA
+    vfmul.d  D1,   D1,  VALPHA
+#else
+    /* Load C0  */
+    vld      U0,  C0,  0x00
+    vld      U1,  C0,  0x10
+    vfmadd.d D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
+    vfmadd.d D1,  D1,  VALPHA,  U1
+#endif // #if defined(TRMMKERNEL)
+
+    /* Store C0 */
+    vst      D0,  C0,  0x00
+    vst      D1,  C0,  0x10
+
+    /* Add stride for C */
+    addi.d    C0,  C0,  0x20
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d     L,   K,   OFF
+#ifdef LEFT
+    addi.d    L,   L,   -4
+#else
+    addi.d    L,   L,   -1
+#endif
+    slli.d    T0,  L,  0x05
+    add.d     A0,  A0, T0
+    slli.d    T0,  L,  0x03
+    add.d     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    addi.d    OFF,   OFF,  0x04
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+/********LOOP (if(N & 1) && (M & 4) ) End************/
+
+.L_N1_M2:
+    andi      I,   M,   2
+    beq       ZERO,I,   .L_N1_M1
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    slli.d   T0,    OFF,  0x04
+    add.d    A0,    A0,   T0
+    slli.d   T0,    OFF,  0x03
+    add.d    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d   L,     OFF,  2
+#else
+    /* number of values in B */
+    addi.d   L,     OFF,  1
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+
+    /* Load 2 * 64 from A0 */
+    vld     U0,   A0,    0x00
+
+    vldrepl.d     U4, B0, 0x00
+    /* line 1 */
+    vfmul.d  D0,  U0, U4
+
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x10
+    addi.d    B0,  B0, 0x08
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N1_M2_L7 */
+    beq       ZERO,TL, .L_N1_M2_L7
+
+    vld     U8,   A0,    0x00
+
+    addi.d    TL,  TL,  -1
+
+    vldrepl.d  U12,   B0,  0x00
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x08
+
+    beq    ZERO,    TL,  .L_N1_M2_TL1_END
+
+.L_N1_M2_TL1: /* TL-- */
+    KERNEL8x2x1
+
+    addi.d    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N1_M2_TL1
+.L_N1_M2_TL1_END:
+    KERNEL8x2x1_END
+
+.L_N1_M2_L7:
+    /* if (!(L & 7)) goto L_N1_M2_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N1_M2_L0
+
+.L_N1_M2_L71:
+    vld     U0,   A0,    0x00
+
+    vldrepl.d U4,  B0, 0x00
+    vfmadd.d  D0,  U0, U4, D0
+
+    /* Add stride for A0, B0 */
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x08
+
+    addi.d     TL,  TL, -1
+    blt        ZERO,TL, .L_N1_M2_L71
+
+.L_N1_M2_L0:
+#if defined(TRMMKERNEL)
+    vfmul.d  D0,   D0,  VALPHA
+#else
+    /* Load C0  */
+    vld      U0,  C0,  0x00
+    vfmadd.d D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
+#endif // #if defined(TRMMKERNEL)
+
+    vstelm.d D0,  C0,  0x00,    0x00
+    vstelm.d D0,  C0,  0x08,    0x01
+
+    /* Add stride for C */
+    addi.d    C0,  C0,  0x10
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d     L,   K,   OFF
+#ifdef LEFT
+    addi.d    L,   L,   -2
+#else
+    addi.d    L,   L,   -1
+#endif
+    slli.d    T0,  L,  0x04
+    add.d     A0,  A0, T0
+    slli.d    T0,  L,  0x03
+    add.d     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    addi.d    OFF,   OFF,  0x02
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+/********LOOP (if(N & 1 ) && (M & 2) ) End************/
+
+.L_N1_M1:
+    andi      I,   M,   1
+    beq       ZERO,I,   .L_N1_M0
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    slli.d   T0,    OFF,  0x03
+    add.d    A0,    A0,   T0
+    add.d    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    addi.d   L,     OFF,  1
+#else
+    /* number of values in B */
+    addi.d   L,     OFF,  1
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+
+    /* Load 1 * 64 from A0 */
+    vldrepl.d  U0,   A0,    0x00
+
+    vldrepl.d  U4, B0, 0x00
+    /* line 1 */
+    vfmul.d  D0,  U0, U4
+
+    /* Add stride for A0 and B0 */
+    addi.d    A0,  A0, 0x08
+    addi.d    B0,  B0, 0x08
+    /* Reduce L */
+    addi.d    L,   L,  -1
+    srai.d    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N1_M1_L7 */
+    beq       ZERO,TL, .L_N1_M1_L7
+
+    vldrepl.d  U8,   A0,    0x00
+
+    addi.d    TL,  TL,  -1
+
+    vldrepl.d  U12,   B0,  0x00
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x08
+
+    beq    ZERO,    TL,  .L_N1_M1_TL1_END
+
+.L_N1_M1_TL1: /* TL-- */
+    KERNEL8x1x1
+
+    addi.d    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N1_M1_TL1
+.L_N1_M1_TL1_END:
+    KERNEL8x1x1_END
+
+.L_N1_M1_L7:
+    /* if (!(L & 7)) goto L_N1_M1_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N1_M1_L0
+
+.L_N1_M1_L71:
+    vldrepl.d  U0,   A0,    0x00
+
+    vldrepl.d  U4,  B0, 0x00
+    vfmadd.d   D0,  U0, U4, D0
+
+    /* Add stride for A0, B0 */
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x08
+
+    addi.d     TL,  TL, -1
+    blt        ZERO,TL, .L_N1_M1_L71
+
+.L_N1_M1_L0:
+#if defined(TRMMKERNEL)
+    vfmul.d  D0,   D0,  VALPHA
+#else
+    /* Load C0  */
+    vldrepl.d  U0,  C0,  0x00
+    vfmadd.d  D0,  D0,  VALPHA,  U0 /* D0 = U0 + (D0 * VALPHA) */
+#endif // #if defined(TRMMKERNEL)
+
+    vstelm.d D0,  C0,  0x00,  0x00
+
+    /* Add stride for C */
+    addi.d    C0,  C0,  0x08
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d     L,   K,   OFF
+#ifdef LEFT
+    addi.d    L,   L,   -1
+#else
+    addi.d    L,   L,   -1
+#endif
+    slli.d    T0,  L,  0x03
+    add.d     A0,  A0, T0
+    add.d     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    addi.d    OFF,   OFF,  0x01
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+/********LOOP (if(N & 1 ) && (M & 1) ) End************/
+
+.L_N1_M0:
+
+/************************* Condition 3 if((N & 1) && (M >> 3)) End !!! *************************
+*                                                   dgemm_core_8x1                                 */
+
+.L_N0:
+    /* Restore regs */
+    LDARG    $r23,  $sp,   0
+    LDARG    $r24,  $sp,   8
+    LDARG    $r25,  $sp,   16
+    LDARG    $r26,  $sp,   24
+    LDARG    $r27,  $sp,   32
+    LD       $f24,  $sp,   40
+    LD       $f25,  $sp,   48
+    LD       $f26,  $sp,   56
+    LD       $f27,  $sp,   64
+    LD       $f28,  $sp,   72
+    LD       $f29,  $sp,   80
+    LD       $f30,  $sp,   88
+    LD       $f31,  $sp,   96
+    addi.d   $sp,   $sp,   112
+
+    jirl    $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/dgemm_ncopy_4_lsx.S b/kernel/loongarch64/dgemm_ncopy_4_lsx.S
new file mode 100644
index 000000000..048a49af6
--- /dev/null
+++ b/kernel/loongarch64/dgemm_ncopy_4_lsx.S
@@ -0,0 +1,197 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9
+#define J      $r10
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define S8     $r19
+#define TD     $r20
+#define TS     $r21
+#define TL     $r7
+#define T0     $r6
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+/* LSX vectors */
+#define U0     $vr0
+#define U1     $vr1
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define D0     $vr8
+#define D1     $vr9
+#define D2     $vr10
+#define D3     $vr11
+#define D4     $vr12
+#define D5     $vr13
+#define D6     $vr14
+#define D7     $vr15
+
+/*
+ * DGEMM "ncopy" packing routine, unroll 4, LSX version.
+ *
+ * Reads an M x N block of doubles from SRC (consecutive columns are LDA
+ * elements apart) and writes it to DST.  Columns are consumed in groups of
+ * 4, then 2, then 1.  The scalar tail loops store, row by row, that row's
+ * element from every column of the group; the vector loops are expected to
+ * produce the same layout through the GLD/GINTERLACE/GST macros of
+ * loongarch64_asm.S, which are not visible in this file.
+ */
+    PROLOGUE
+
+    move       TD,   DST
+    move       TS,   SRC           /* save SRC first: T0 reuses $r6 */
+    slli.d     TL,   LDA,  0x03    /* TL = LDA * 8, column step in bytes (in place, TL is $r7) */
+    slli.d     T0,   TL,   0x01    /* T0 = 2 * TL */
+    srai.d     J,    N,    0x02    /* J = N >> 2, number of 4-column groups */
+    beq        J,    ZERO, .L_N2
+.L_J1: /* J-- */
+    move       S1,   TS
+    add.d      S2,   TS,   TL
+    srai.d     I,    M,    0x02    /* I = M >> 2, 4 rows per iteration */
+    add.d      S3,   S2,   TL
+    add.d      S4,   S2,   T0
+    add.d      TS,   S3,   T0      /* TS = start of the next column group */
+    addi.d     J,    J,    -1
+    beq        I,    ZERO, .L_I3
+.L_I1: /* I-- */
+    GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00
+    GINTERLACE v, d, D0, D2, U1, U0
+    GINTERLACE v, d, D1, D3, U3, U2
+    GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30
+    addi.d     TD,   TD,   0x40
+
+    GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10
+    GINTERLACE v, d, D0, D2, U1, U0
+    GINTERLACE v, d, D1, D3, U3, U2
+    GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30
+
+    addi.d     S1,   S1,   0x20
+    addi.d     S2,   S2,   0x20
+    addi.d     S3,   S3,   0x20
+    addi.d     S4,   S4,   0x20
+    addi.d     TD,   TD,   0x40
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_I1
+.L_I3: /* remaining M & 3 rows of this 4-column group */
+    andi      I,     M,    0x03
+    beq       I,     ZERO, .L_I0
+.L_II1:
+    fld.d     F0,    S1,  0x00
+    fld.d     F1,    S2,  0x00
+    fld.d     F2,    S3,  0x00
+    fld.d     F3,    S4,  0x00
+
+    fst.d     F0,    TD,  0x00
+    addi.d    S1,    S1,  0x08
+    fst.d     F1,    TD,  0x08
+    addi.d    S2,    S2,  0x08
+    fst.d     F2,    TD,  0x10
+    addi.d    S3,    S3,  0x08
+    fst.d     F3,    TD,  0x18
+    addi.d    S4,    S4,  0x08
+
+    addi.d    TD,    TD,  0x20
+    addi.d    I,     I,   -1
+    blt       ZERO,  I,   .L_II1
+.L_I0:
+    blt       ZERO,  J,   .L_J1
+.L_N2: /* 2-column group, taken only when N & 2 */
+    andi      J,     N,   0x02
+    beq       ZERO,  J,   .L_N1
+
+    move       S1,   TS
+    add.d      S2,   TS,   TL
+    srai.d     I,    M,    0x01    /* I = M >> 1, 2 rows per iteration */
+    add.d      TS,   S2,   TL
+    beq        I,    ZERO, .L_2I3
+.L_2I1: /* I-- */
+    GLD v, , U0, S1, 0x00, U1, S2, 0x00
+    GINTERLACE v, d, D0, D1, U1, U0
+    GST v, , D0, TD, 0x00, D1, TD, 0x10
+    addi.d     S1,   S1,   0x10
+    addi.d     S2,   S2,   0x10
+    addi.d     TD,   TD,   0x20
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_2I1
+.L_2I3: /* odd last row of the 2-column group */
+    andi       I,    M,    0x01
+    beq        ZERO, I,    .L_N1
+.L_2II1: /* I-- */
+    fld.d      F0,   S1,   0x00
+    fld.d      F1,   S2,   0x00
+    fst.d      F0,   TD,   0x00
+    addi.d     I,    I,    -1
+    fst.d      F1,   TD,   0x08
+    addi.d     S1,   S1,   0x08
+    addi.d     S2,   S2,   0x08
+    addi.d     TD,   TD,   0x10
+    blt        ZERO, I,    .L_2II1
+.L_N1: /* last single column: must run only when N is odd, like the N & 2 tail above */
+    andi      J,     N,   0x01
+    beq       ZERO,  J,   .L_N0
+    move      S1,    TS
+    beq       ZERO,  M,   .L_N0
+.L_M1:
+    fld.d     F0,    S1,  0x00
+    addi.d    S1,    S1,  0x08
+    fst.d     F0,    TD,  0x00
+    addi.d    TD,    TD,  0x08
+    addi.d    M,     M,   -1
+    blt       ZERO,  M,   .L_M1
+.L_N0:
+    jirl       $r0,  $r1,  0x00
+    EPILOGUE
diff --git a/kernel/loongarch64/dgemm_ncopy_8_lsx.S b/kernel/loongarch64/dgemm_ncopy_8_lsx.S
new file mode 100644
index 000000000..30bebe8df
--- /dev/null
+++ b/kernel/loongarch64/dgemm_ncopy_8_lsx.S
@@ -0,0 +1,295 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9
+#define J      $r10
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define S8     $r19
+#define TD     $r20
+#define TS     $r21
+#define TL     $r7
+#define T0     $r6
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+/* LSX vectors */
+#define U0     $vr0
+#define U1     $vr1
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define D0     $vr8
+#define D1     $vr9
+#define D2     $vr10
+#define D3     $vr11
+#define D4     $vr12
+#define D5     $vr13
+#define D6     $vr14
+#define D7     $vr15
+
+/*
+ * DGEMM "ncopy" packing routine, unroll 8, LSX version.
+ *
+ * Reads an M x N block of doubles from SRC (consecutive columns are LDA
+ * elements apart) and writes it to DST.  Columns are consumed in groups of
+ * 8, then 4, then 2, then 1.  The scalar tail code stores, row by row, that
+ * row's element from every column of the group; the vector loops are
+ * expected to produce the same layout through the GLD/GINTERLACE/GST macros
+ * of loongarch64_asm.S, which are not visible in this file.
+ */
+    PROLOGUE
+    push_if_used 26, 32
+    move       TD,   DST
+    move       TS,   SRC           /* save SRC first: T0 reuses $r6 */
+    slli.d     TL,   LDA,  0x03    /* TL = LDA * 8, column step in bytes (in place, TL is $r7) */
+    slli.d     T0,   TL,   0x01    /* T0 = 2 * TL */
+    srai.d     J,    N,    0x03    /* J = N >> 3, number of 8-column groups */
+    beq        J,    ZERO, .L_N4
+.L_J1:
+    move       S1,   TS
+    add.d      S2,   TS,   TL
+    srai.d     I,    M,    0x03    /* I = M >> 3, 8 rows per iteration */
+    add.d      S3,   S2,   TL
+    addi.d     J,    J,    -1
+    add.d      S4,   S3,   TL
+    add.d      S5,   S3,   T0
+    add.d      S6,   S4,   T0
+    add.d      S7,   S5,   T0
+    add.d      S8,   S6,   T0
+    add.d      TS,   S7,   T0      /* TS = start of the next column group */
+    beq        I,    ZERO, .L_I7
+.L_I1:
+    GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00, \
+             U4, S5, 0x00, U5, S6, 0x00, U6, S7, 0x00, U7, S8, 0x00
+    GINTERLACE v, d, D0, D4, U1, U0
+    GINTERLACE v, d, D1, D5, U3, U2
+    GINTERLACE v, d, D2, D6, U5, U4
+    GINTERLACE v, d, D3, D7, U7, U6
+    GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \
+             D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70
+    addi.d    TD,    TD,   0x80
+    GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10, \
+             U4, S5, 0x10, U5, S6, 0x10, U6, S7, 0x10, U7, S8, 0x10
+    GINTERLACE v, d, D0, D4, U1, U0
+    GINTERLACE v, d, D1, D5, U3, U2
+    GINTERLACE v, d, D2, D6, U5, U4
+    GINTERLACE v, d, D3, D7, U7, U6
+    GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \
+             D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70
+    addi.d    TD,    TD,   0x80
+    GLD v, , U0, S1, 0x20, U1, S2, 0x20, U2, S3, 0x20, U3, S4, 0x20, \
+             U4, S5, 0x20, U5, S6, 0x20, U6, S7, 0x20, U7, S8, 0x20
+    GINTERLACE v, d, D0, D4, U1, U0
+    GINTERLACE v, d, D1, D5, U3, U2
+    GINTERLACE v, d, D2, D6, U5, U4
+    GINTERLACE v, d, D3, D7, U7, U6
+    GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \
+             D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70
+    addi.d    TD,    TD,   0x80
+    GLD v, , U0, S1, 0x30, U1, S2, 0x30, U2, S3, 0x30, U3, S4, 0x30, \
+             U4, S5, 0x30, U5, S6, 0x30, U6, S7, 0x30, U7, S8, 0x30
+    GINTERLACE v, d, D0, D4, U1, U0
+    GINTERLACE v, d, D1, D5, U3, U2
+    GINTERLACE v, d, D2, D6, U5, U4
+    GINTERLACE v, d, D3, D7, U7, U6
+    GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30, \
+             D4, TD, 0x40, D5, TD, 0x50, D6, TD, 0x60, D7, TD, 0x70
+    addi.d    TD,    TD,   0x80
+
+    addi.d     S1,   S1,   0x40
+    addi.d     S2,   S2,   0x40
+    addi.d     S3,   S3,   0x40
+    addi.d     S4,   S4,   0x40
+    addi.d     S5,   S5,   0x40
+    addi.d     S6,   S6,   0x40
+    addi.d     S7,   S7,   0x40
+    addi.d     S8,   S8,   0x40
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_I1
+.L_I7: /* remaining M & 7 rows of this 8-column group */
+    andi      I,     M,    0x07
+    beq       I,     ZERO, .L_I0
+.L_II1: /* I-- */
+    fld.d     F0,    S1,  0x00
+    fld.d     F1,    S2,  0x00
+    fld.d     F2,    S3,  0x00
+    fld.d     F3,    S4,  0x00
+    fld.d     F4,    S5,  0x00
+    fld.d     F5,    S6,  0x00
+    fld.d     F6,    S7,  0x00
+    fld.d     F7,    S8,  0x00
+
+    fst.d     F0,    TD,  0x00
+    addi.d    S1,    S1,  0x08
+    fst.d     F1,    TD,  0x08
+    addi.d    S2,    S2,  0x08
+    fst.d     F2,    TD,  0x10
+    addi.d    S3,    S3,  0x08
+    fst.d     F3,    TD,  0x18
+    addi.d    S4,    S4,  0x08
+    fst.d     F4,    TD,  0x20
+    addi.d    S5,    S5,  0x08
+    fst.d     F5,    TD,  0x28
+    addi.d    S6,    S6,  0x08
+    fst.d     F6,    TD,  0x30
+    addi.d    S7,    S7,  0x08
+    fst.d     F7,    TD,  0x38
+    addi.d    S8,    S8,  0x08
+    addi.d    TD,    TD,  0x40
+
+    addi.d    I,     I,   -1
+    blt       ZERO,  I,   .L_II1
+.L_I0:
+    blt       ZERO,  J,   .L_J1
+.L_N4: /* 4-column group, taken only when N & 4 */
+    andi      J,     N,   0x04
+    beq       ZERO,  J,   .L_N2
+
+    move       S1,   TS
+    add.d      S2,   TS,   TL
+    srai.d     I,    M,    0x02    /* I = M >> 2, 4 rows per iteration */
+    add.d      S3,   S2,   TL
+    add.d      S4,   S2,   T0
+    add.d      TS,   S3,   T0
+    beq        I,    ZERO, .L_I3
+.L_4I1: /* I-- */
+    GLD v, , U0, S1, 0x00, U1, S2, 0x00, U2, S3, 0x00, U3, S4, 0x00
+    GINTERLACE v, d, D0, D2, U1, U0
+    GINTERLACE v, d, D1, D3, U3, U2
+    GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30
+    addi.d     TD,   TD,   0x40
+
+    GLD v, , U0, S1, 0x10, U1, S2, 0x10, U2, S3, 0x10, U3, S4, 0x10
+    GINTERLACE v, d, D0, D2, U1, U0
+    GINTERLACE v, d, D1, D3, U3, U2
+    GST v, , D0, TD, 0x00, D1, TD, 0x10, D2, TD, 0x20, D3, TD, 0x30
+
+    addi.d     S1,   S1,   0x20
+    addi.d     S2,   S2,   0x20
+    addi.d     S3,   S3,   0x20
+    addi.d     S4,   S4,   0x20
+    addi.d     TD,   TD,   0x40
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_4I1
+.L_I3: /* remaining M & 3 rows of the 4-column group */
+    andi      I,     M,    0x03
+    beq       I,     ZERO, .L_N2
+.L_4II1:
+    fld.d     F0,    S1,  0x00
+    fld.d     F1,    S2,  0x00
+    fld.d     F2,    S3,  0x00
+    fld.d     F3,    S4,  0x00
+
+    fst.d     F0,    TD,  0x00
+    addi.d    S1,    S1,  0x08
+    fst.d     F1,    TD,  0x08
+    addi.d    S2,    S2,  0x08
+    fst.d     F2,    TD,  0x10
+    addi.d    S3,    S3,  0x08
+    fst.d     F3,    TD,  0x18
+    addi.d    S4,    S4,  0x08
+
+    addi.d    TD,    TD,  0x20
+    addi.d    I,     I,   -1
+    blt       ZERO,  I,   .L_4II1
+.L_N2: /* 2-column group, taken only when N & 2 */
+    andi      J,     N,   0x02
+    beq       ZERO,  J,   .L_N1
+
+    move       S1,   TS
+    add.d      S2,   TS,   TL
+    srai.d     I,    M,    0x01    /* I = M >> 1, 2 rows per iteration */
+    add.d      TS,   S2,   TL
+    beq        I,    ZERO, .L_NI1
+.L_2I1: /* I-- */
+    GLD v, , U0, S1, 0x00, U1, S2, 0x00
+    GINTERLACE v, d, D0, D1, U1, U0
+    GST v, , D0, TD, 0x00, D1, TD, 0x10
+
+    addi.d     S1,   S1,   0x10
+    addi.d     S2,   S2,   0x10
+    addi.d     TD,   TD,   0x20
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_2I1
+.L_NI1: /* odd last row of the 2-column group */
+    andi      I,     M,    0x01
+    beq       I,     ZERO, .L_N1
+
+    fld.d     F0,    S1,  0x00
+    fld.d     F1,    S2,  0x00
+
+    fst.d     F0,    TD,  0x00
+    addi.d    S1,    S1,  0x08
+    fst.d     F1,    TD,  0x08
+    addi.d    S2,    S2,  0x08
+    addi.d    TD,    TD,  0x10
+.L_N1: /* last single column: must run only when N is odd, like the other N tails above */
+    andi      J,     N,   0x01
+    beq       ZERO,  J,   .L_N0
+    move      S1,    TS
+    beq       ZERO,  M,   .L_N0
+.L_M1:
+    fld.d     F0,    S1,  0x00
+    addi.d    S1,    S1,  0x08
+    fst.d     F0,    TD,  0x00
+    addi.d    TD,    TD,  0x08
+    addi.d    M,     M,   -1
+    blt       ZERO,  M,   .L_M1
+.L_N0:
+    pop_if_used 26, 32
+    jirl       $r0,  $r1,  0x00
+    EPILOGUE
diff --git a/kernel/loongarch64/dgemm_tcopy_4_lsx.S b/kernel/loongarch64/dgemm_tcopy_4_lsx.S
new file mode 100644
index 000000000..134066471
--- /dev/null
+++ b/kernel/loongarch64/dgemm_tcopy_4_lsx.S
@@ -0,0 +1,291 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9
+#define J      $r10
+#define S0     $r11
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define P0     $r16
+#define P1     $r17
+#define P2     $r18
+#define P3     $r19
+#define T0     $r20
+#define T1     $r23
+#define TL     $r7
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+/* LSX vectors */
+#define U0     $vr0
+#define U1     $vr1
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+
+/*
+ * DGEMM "tcopy" packing routine, unroll 4, LSX version.
+ *
+ * Walks the source in strips of 4, 2 and 1 lines (lines are LDA elements
+ * apart) and, along each line, in chunks of 4, 2 and 1 contiguous doubles.
+ * Every chunk width has its own output region:
+ *   P0/P1 - 4-wide chunks, starting at DST; successive chunks of the same
+ *           strip are T1 bytes (4 * M doubles) apart
+ *   P2    - 2-wide chunks, starting at DST + M * (N & ~3) doubles
+ *   P3    - 1-wide chunks, starting at DST + M * (N & ~1) doubles
+ */
+    PROLOGUE
+    push_if_used 18, 8
+
+    move       S0,     SRC
+    move       P0,     DST
+
+    // Find P0, P2, P3
+    srai.d     T0,     N,     0x02
+    slli.d     T0,     T0,    0x02    /* T0 = N & ~3 */
+    srai.d     T1,     N,     0x01
+    slli.d     T1,     T1,    0x01    /* T1 = N & ~1 */
+    mul.d      T0,     M,     T0
+    mul.d      T1,     M,     T1
+    slli.d     T0,     T0,    0x03    /* element counts -> byte offsets */
+    slli.d     T1,     T1,    0x03
+    add.d      P2,     DST,   T0
+    add.d      P3,     DST,   T1
+
+    slli.d     TL,     LDA,   0x03    /* TL = LDA * 8, line step in bytes (in place, TL is $r7) */
+    srai.d     J,      M,     0x02    /* J = M >> 2, number of 4-line strips */
+    slli.d     T0,     TL,    0x01    /* T0 = 2 * TL */
+    slli.d     T1,     M,     0x05    /* T1 = M * 32 bytes, step between 4-wide chunks in DST */
+    beq        ZERO,   J,     .L_M3
+.L_J1: /* J-- */
+    move       S1,     S0
+    add.d      S2,     S0,    TL
+    add.d      S3,     S1,    T0
+    add.d      S4,     S2,    T0
+    add.d      S0,     S3,    T0
+
+    move       P1,     P0
+    addi.d     P0,     P0,    0x80
+
+    srai.d     I,      N,     0x02
+    addi.d     J,      J,     -1
+    beq        ZERO,   I,     .L_N3
+.L_I1: /* I-- */
+    vld       U0,     S1,    0x00
+    vld       U1,     S1,    0x10
+    vld       U2,     S2,    0x00
+    vld       U3,     S2,    0x10
+    vld       U4,     S3,    0x00
+    vld       U5,     S3,    0x10
+    vld       U6,     S4,    0x00
+    vld       U7,     S4,    0x10
+
+    vst       U0,     P1,    0x00
+    vst       U1,     P1,    0x10
+    vst       U2,     P1,    0x20
+    vst       U3,     P1,    0x30
+    vst       U4,     P1,    0x40
+    vst       U5,     P1,    0x50
+    vst       U6,     P1,    0x60
+    vst       U7,     P1,    0x70
+
+    addi.d     S1,     S1,    0x20
+    addi.d     S2,     S2,    0x20
+    addi.d     S3,     S3,    0x20
+    addi.d     S4,     S4,    0x20
+    add.d      P1,     P1,    T1
+
+    addi.d     I,      I,    -1
+    blt        ZERO,   I,    .L_I1
+.L_N3: /* 2-wide chunk of the 4-line strip, only when N & 2 */
+    andi       I,      N,    0x02
+    beq        ZERO,   I,    .L_N1
+
+    vld       U0,     S1,    0x00
+    vld       U1,     S2,    0x00
+    vld       U2,     S3,    0x00
+    vld       U3,     S4,    0x00
+
+    vst       U0,     P2,    0x00
+    vst       U1,     P2,    0x10
+    vst       U2,     P2,    0x20
+    vst       U3,     P2,    0x30
+
+    addi.d     S1,     S1,    0x10
+    addi.d     S2,     S2,    0x10
+    addi.d     S3,     S3,    0x10
+    addi.d     S4,     S4,    0x10
+    addi.d     P2,     P2,    0x40
+.L_N1: /* 1-wide chunk of the 4-line strip, only when N & 1 */
+    andi       I,      N,     0x01
+    beq        ZERO,   I,     .L_N0
+
+    fld.d      F0,     S1,    0x00
+    fld.d      F1,     S2,    0x00
+    fld.d      F2,     S3,    0x00
+    fld.d      F3,     S4,    0x00
+
+    fst.d      F0,     P3,    0x00
+    fst.d      F1,     P3,    0x08
+    fst.d      F2,     P3,    0x10
+    fst.d      F3,     P3,    0x18
+
+    addi.d     S1,     S1,    0x08
+    addi.d     S2,     S2,    0x08
+    addi.d     S3,     S3,    0x08
+    addi.d     S4,     S4,    0x08
+    addi.d     P3,     P3,    0x20
+
+.L_N0:
+    blt        ZERO,   J,     .L_J1
+
+.L_M3: /* 2-line strip, only when M & 2 */
+    andi       J,      M,      0x02
+    beq        ZERO,   J,      .L_M1
+
+    move       S1,     S0
+    add.d      S2,     S0,     TL
+    add.d      S0,     S0,     T0
+
+    move       P1,     P0
+    addi.d     P0,     P0,     0x40
+
+    srai.d     I,      N,      0x02
+    beq        ZERO,   I,      .L_2N3
+
+.L_2I1:   /* I-- */
+    vld       U0,     S1,     0x00
+    vld       U1,     S1,     0x10
+    vld       U2,     S2,     0x00
+    vld       U3,     S2,     0x10
+
+    vst       U0,     P1,     0x00
+    vst       U1,     P1,     0x10
+    vst       U2,     P1,     0x20
+    vst       U3,     P1,     0x30
+
+    addi.d     S1,     S1,     0x20
+    addi.d     S2,     S2,     0x20
+    addi.d     I,      I,      -1
+    add.d      P1,     P1,     T1
+
+    blt        ZERO,   I,     .L_2I1
+
+.L_2N3:
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_2N1
+
+    vld       U0,     S1,     0x00
+    vld       U1,     S2,     0x00
+
+    vst       U0,     P2,     0x00
+    vst       U1,     P2,     0x10
+
+    addi.d     S1,     S1,     0x10
+    addi.d     S2,     S2,     0x10
+    addi.d     P2,     P2,     0x20
+
+.L_2N1: /* 1-wide chunk: test bit 0 of N (andi), as .L_N1 and .L_1N1 do; N + 1 would never be zero */
+    andi       I,      N,      0x01
+    beq        ZERO,   I,      .L_M1
+
+    fld.d      F0,     S1,     0x00
+    fld.d      F1,     S2,     0x00
+
+    fst.d      F0,     P3,     0x00
+    fst.d      F1,     P3,     0x08
+
+    addi.d     S1,     S1,     0x08
+    addi.d     S2,     S2,     0x08
+    addi.d     P3,     P3,     0x10
+.L_M1: /* last single line, only when M & 1 */
+    andi       J,      M,      0x01
+    beq        ZERO,   J,      .L_M0
+
+    move       S1,     S0
+    move       P1,     P0
+
+    srai.d     I,      N,      0x02
+    beq        ZERO,   I,      .L_1N3
+
+.L_1I1:
+    vld       U0,    S1,      0x00
+    vld       U1,    S1,      0x10
+
+    vst       U0,    P1,      0x00
+    vst       U1,    P1,      0x10
+
+    addi.d     S1,    S1,      0x20
+    addi.d     I,     I,       -1
+    add.d      P1,    P1,      T1
+
+    blt        ZERO,  I,       .L_1I1
+
+.L_1N3:
+    andi       I,     N,       0x02
+    beq        I,     ZERO,    .L_1N1
+
+    fld.d      F0,    S1,      0x00
+    fld.d      F1,    S1,      0x08
+
+    fst.d      F0,    P2,      0x00
+    fst.d      F1,    P2,      0x08
+
+    addi.d     S1,    S1,      0x10
+    addi.d     P2,    P2,      0x10
+
+.L_1N1:
+    andi       I,     N,       0x01
+    beq        I,     ZERO,    .L_M0
+
+    fld.d      F0,    S1,      0x00
+
+    fst.d      F0,    P3,      0x00
+
+.L_M0:
+    pop_if_used 18, 8
+    jirl       $r0,    $r1,   0x00
+
+    EPILOGUE
diff --git a/kernel/loongarch64/dgemm_tcopy_8_lsx.S b/kernel/loongarch64/dgemm_tcopy_8_lsx.S
new file mode 100644
index 000000000..a7e3ef69c
--- /dev/null
+++ b/kernel/loongarch64/dgemm_tcopy_8_lsx.S
@@ -0,0 +1,597 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9
+#define J      $r10
+#define S0     $r11
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define S8     $r19
+#define P0     $r20
+#define P1     $r23
+#define P2     $r24
+#define P3     $r25
+#define P4     $r26
+#define P5     $r27
+#define T0     $r28
+#define T1     $r29
+#define TL     $r7
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+/* LSX vectors */
+#define U0     $vr0
+#define U1     $vr1
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+
+    PROLOGUE
+    push_if_used 24, 8
+
+    move       S0,     SRC
+    move       P0,     DST
+
+    srai.d     T0,     N,     0x03
+    srai.d     T1,     N,     0x02
+    slli.d     T0,     T0,    0x03
+    slli.d     T1,     T1,    0x02
+    mul.d      P2,     M,     T0
+    mul.d      P3,     M,     T1
+    slli.d     P2,     P2,    0x03
+    slli.d     P3,     P3,    0x03
+    add.d      P2,     DST,   P2
+    add.d      P3,     DST,   P3
+
+    srai.d     T0,     N,     0x01
+    slli.d     T0,     T0,    0x01
+    mul.d      P4,     M,     T0
+    slli.d     P4,     P4,    0x03
+    add.d      P4,     DST,   P4
+
+    slli.d     TL,     LDA,   0x03
+    srai.d     J,      M,     0x03
+    slli.d     T0,     TL,    0x01
+    slli.d     T1,     M,     0x06
+    beq        ZERO,   J,     .L_M7
+.L_J1: /* J-- */
+    move       S1,     S0
+    add.d      S2,     S0,    TL
+    add.d      S3,     S1,    T0
+    add.d      S4,     S2,    T0
+    add.d      S5,     S3,    T0
+    add.d      S6,     S4,    T0
+    add.d      S7,     S5,    T0
+    add.d      S8,     S6,    T0
+    add.d      S0,     S7,    T0
+
+    move       P1,     P0
+    addi.d     P0,     P0,    0x200
+
+    srai.d     I,      N,     0x03
+    addi.d     J,      J,     -1
+    beq        ZERO,   I,     .L_N7
+
+.L_I1: /* I-- */
+    vld       U0,     S1,    0x00
+    vld       U1,     S1,    0x10
+    vld       U2,     S1,    0x20
+    vld       U3,     S1,    0x30
+    vld       U4,     S2,    0x00
+    vld       U5,     S2,    0x10
+    vld       U6,     S2,    0x20
+    vld       U7,     S2,    0x30
+
+    vst       U0,     P1,    0x00
+    vst       U1,     P1,    0x10
+    vst       U2,     P1,    0x20
+    vst       U3,     P1,    0x30
+    vst       U4,     P1,    0x40
+    vst       U5,     P1,    0x50
+    vst       U6,     P1,    0x60
+    vst       U7,     P1,    0x70
+
+    vld       U0,     S3,    0x00
+    vld       U1,     S3,    0x10
+    vld       U2,     S3,    0x20
+    vld       U3,     S3,    0x30
+    vld       U4,     S4,    0x00
+    vld       U5,     S4,    0x10
+    vld       U6,     S4,    0x20
+    vld       U7,     S4,    0x30
+
+    vst       U0,     P1,    0x80
+    vst       U1,     P1,    0x90
+    vst       U2,     P1,    0xa0
+    vst       U3,     P1,    0xb0
+    vst       U4,     P1,    0xc0
+    vst       U5,     P1,    0xd0
+    vst       U6,     P1,    0xe0
+    vst       U7,     P1,    0xf0
+
+    vld       U0,     S5,    0x00
+    vld       U1,     S5,    0x10
+    vld       U2,     S5,    0x20
+    vld       U3,     S5,    0x30
+    vld       U4,     S6,    0x00
+    vld       U5,     S6,    0x10
+    vld       U6,     S6,    0x20
+    vld       U7,     S6,    0x30
+
+    vst       U0,     P1,    0x100
+    vst       U1,     P1,    0x110
+    vst       U2,     P1,    0x120
+    vst       U3,     P1,    0x130
+    vst       U4,     P1,    0x140
+    vst       U5,     P1,    0x150
+    vst       U6,     P1,    0x160
+    vst       U7,     P1,    0x170
+
+    vld       U0,     S7,    0x00
+    vld       U1,     S7,    0x10
+    vld       U2,     S7,    0x20
+    vld       U3,     S7,    0x30
+    vld       U4,     S8,    0x00
+    vld       U5,     S8,    0x10
+    vld       U6,     S8,    0x20
+    vld       U7,     S8,    0x30
+
+    vst       U0,     P1,    0x180
+    vst       U1,     P1,    0x190
+    vst       U2,     P1,    0x1a0
+    vst       U3,     P1,    0x1b0
+    vst       U4,     P1,    0x1c0
+    vst       U5,     P1,    0x1d0
+    vst       U6,     P1,    0x1e0
+    vst       U7,     P1,    0x1f0
+
+    addi.d     S1,     S1,    0x40
+    addi.d     S2,     S2,    0x40
+    addi.d     S3,     S3,    0x40
+    addi.d     S4,     S4,    0x40
+    addi.d     S5,     S5,    0x40
+    addi.d     S6,     S6,    0x40
+    addi.d     S7,     S7,    0x40
+    addi.d     S8,     S8,    0x40
+    addi.d     I,      I,     -1
+    add.d      P1,     P1,    T1
+    blt        ZERO,   I,     .L_I1
+.L_N7:
+    andi       I,      N,     0x04
+    beq        ZERO,   I,     .L_N3
+
+    vld       U0,     S1,    0x00
+    vld       U1,     S1,    0x10
+    vld       U2,     S2,    0x00
+    vld       U3,     S2,    0x10
+    vld       U4,     S3,    0x00
+    vld       U5,     S3,    0x10
+    vld       U6,     S4,    0x00
+    vld       U7,     S4,    0x10
+
+    vst       U0,     P2,    0x00
+    vst       U1,     P2,    0x10
+    vst       U2,     P2,    0x20
+    vst       U3,     P2,    0x30
+    vst       U4,     P2,    0x40
+    vst       U5,     P2,    0x50
+    vst       U6,     P2,    0x60
+    vst       U7,     P2,    0x70
+
+    vld       U0,     S5,    0x00
+    vld       U1,     S5,    0x10
+    vld       U2,     S6,    0x00
+    vld       U3,     S6,    0x10
+    vld       U4,     S7,    0x00
+    vld       U5,     S7,    0x10
+    vld       U6,     S8,    0x00
+    vld       U7,     S8,    0x10
+
+    vst       U0,     P2,    0x80
+    vst       U1,     P2,    0x90
+    vst       U2,     P2,    0xa0
+    vst       U3,     P2,    0xb0
+    vst       U4,     P2,    0xc0
+    vst       U5,     P2,    0xd0
+    vst       U6,     P2,    0xe0
+    vst       U7,     P2,    0xf0
+
+    addi.d     S1,     S1,    0x20
+    addi.d     S2,     S2,    0x20
+    addi.d     S3,     S3,    0x20
+    addi.d     S4,     S4,    0x20
+    addi.d     S5,     S5,    0x20
+    addi.d     S6,     S6,    0x20
+    addi.d     S7,     S7,    0x20
+    addi.d     S8,     S8,    0x20
+    addi.d     P2,     P2,    0x100
+
+.L_N3:
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_N1
+
+    vld       U0,     S1,    0x00
+    vld       U1,     S2,    0x00
+    vld       U2,     S3,    0x00
+    vld       U3,     S4,    0x00
+    vld       U4,     S5,    0x00
+    vld       U5,     S6,    0x00
+    vld       U6,     S7,    0x00
+    vld       U7,     S8,    0x00
+
+    vst       U0,     P3,    0x00
+    vst       U1,     P3,    0x10
+    vst       U2,     P3,    0x20
+    vst       U3,     P3,    0x30
+    vst       U4,     P3,    0x40
+    vst       U5,     P3,    0x50
+    vst       U6,     P3,    0x60
+    vst       U7,     P3,    0x70
+
+    addi.d     S1,     S1,    0x10
+    addi.d     S2,     S2,    0x10
+    addi.d     S3,     S3,    0x10
+    addi.d     S4,     S4,    0x10
+    addi.d     S5,     S5,    0x10
+    addi.d     S6,     S6,    0x10
+    addi.d     S7,     S7,    0x10
+    addi.d     S8,     S8,    0x10
+    addi.d     P3,     P3,    0x80
+
+.L_N1:
+    andi       I,      N,     0x01
+    beq        ZERO,   I,     .L_N0
+
+    fld.d      F0,     S1,    0x00
+    fld.d      F1,     S2,    0x00
+    fld.d      F2,     S3,    0x00
+    fld.d      F3,     S4,    0x00
+    fld.d      F4,     S5,    0x00
+    fld.d      F5,     S6,    0x00
+    fld.d      F6,     S7,    0x00
+    fld.d      F7,     S8,    0x00
+
+    fst.d      F0,     P4,    0x00
+    fst.d      F1,     P4,    0x08
+    fst.d      F2,     P4,    0x10
+    fst.d      F3,     P4,    0x18
+    fst.d      F4,     P4,    0x20
+    fst.d      F5,     P4,    0x28
+
+    fst.d      F6,     P4,    0x30
+    fst.d      F7,     P4,    0x38
+
+    addi.d     S1,     S1,    0x08
+    addi.d     S2,     S2,    0x08
+    addi.d     S3,     S3,    0x08
+    addi.d     S4,     S4,    0x08
+    addi.d     S5,     S5,    0x08
+    addi.d     S6,     S6,    0x08
+    addi.d     S7,     S7,    0x08
+    addi.d     S8,     S8,    0x08
+    addi.d     P4,     P4,    0x40
+
+.L_N0:
+    blt        ZERO,   J,     .L_J1
+.L_M7:
+    andi       J,      M,     0x04
+    beq        ZERO,   J,     .L_M3
+
+    move       S1,     S0
+    add.d      S2,     S0,    TL
+    add.d      S3,     S1,    T0
+    add.d      S4,     S2,    T0
+    add.d      S0,     S3,    T0
+
+    move       P1,     P0
+    addi.d     P0,     P0,    0x100
+
+    srai.d     I,      N,     0x03
+    beq        ZERO,   I,     .L_4N7
+.L_4I1: /* I-- */
+    vld       U0,     S1,    0x00
+    vld       U1,     S1,    0x10
+    vld       U2,     S1,    0x20
+    vld       U3,     S1,    0x30
+    vld       U4,     S2,    0x00
+    vld       U5,     S2,    0x10
+    vld       U6,     S2,    0x20
+    vld       U7,     S2,    0x30
+
+    vst       U0,     P1,    0x00
+    vst       U1,     P1,    0x10
+    vst       U2,     P1,    0x20
+    vst       U3,     P1,    0x30
+    vst       U4,     P1,    0x40
+    vst       U5,     P1,    0x50
+    vst       U6,     P1,    0x60
+    vst       U7,     P1,    0x70
+
+    vld       U0,     S3,    0x00
+    vld       U1,     S3,    0x10
+    vld       U2,     S3,    0x20
+    vld       U3,     S3,    0x30
+    vld       U4,     S4,    0x00
+    vld       U5,     S4,    0x10
+    vld       U6,     S4,    0x20
+    vld       U7,     S4,    0x30
+
+    vst       U0,     P1,    0x80
+    vst       U1,     P1,    0x90
+    vst       U2,     P1,    0xa0
+    vst       U3,     P1,    0xb0
+    vst       U4,     P1,    0xc0
+    vst       U5,     P1,    0xd0
+    vst       U6,     P1,    0xe0
+    vst       U7,     P1,    0xf0
+
+    addi.d     S1,     S1,    0x40
+    addi.d     S2,     S2,    0x40
+    addi.d     S3,     S3,    0x40
+    addi.d     S4,     S4,    0x40
+    addi.d     I,      I,     -1
+    add.d      P1,     P1,    T1
+    blt        ZERO,   I,     .L_4I1
+.L_4N7:
+    andi       I,      N,     0x04
+    beq        ZERO,   I,     .L_4N3
+
+    vld       U0,     S1,    0x00
+    vld       U1,     S1,    0x10
+    vld       U2,     S2,    0x00
+    vld       U3,     S2,    0x10
+    vld       U4,     S3,    0x00
+    vld       U5,     S3,    0x10
+    vld       U6,     S4,    0x00
+    vld       U7,     S4,    0x10
+
+    vst       U0,     P2,    0x00
+    vst       U1,     P2,    0x10
+    vst       U2,     P2,    0x20
+    vst       U3,     P2,    0x30
+    vst       U4,     P2,    0x40
+    vst       U5,     P2,    0x50
+    vst       U6,     P2,    0x60
+    vst       U7,     P2,    0x70
+
+    addi.d     S1,     S1,    0x20
+    addi.d     S2,     S2,    0x20
+    addi.d     S3,     S3,    0x20
+    addi.d     S4,     S4,    0x20
+    addi.d     P2,     P2,    0x80
+
+.L_4N3:
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_4N1
+
+    vld       U0,     S1,    0x00
+    vld       U1,     S2,    0x00
+    vld       U2,     S3,    0x00
+    vld       U3,     S4,    0x00
+
+    vst       U0,     P3,    0x00
+    vst       U1,     P3,    0x10
+    vst       U2,     P3,    0x20
+    vst       U3,     P3,    0x30
+
+    addi.d     S1,     S1,    0x10
+    addi.d     S2,     S2,    0x10
+    addi.d     S3,     S3,    0x10
+    addi.d     S4,     S4,    0x10
+    addi.d     P3,     P3,    0x40
+
+.L_4N1:
+    andi        I,      N,     0x01
+    beq         ZERO,   I,     .L_M3
+
+    fld.d      F0,     S1,    0x00
+    fld.d      F1,     S2,    0x00
+    fld.d      F2,     S3,    0x00
+    fld.d      F3,     S4,    0x00
+
+    fst.d      F0,     P4,    0x00
+    fst.d      F1,     P4,    0x08
+    fst.d      F2,     P4,    0x10
+    fst.d      F3,     P4,    0x18
+
+    addi.d     S1,     S1,    0x08
+    addi.d     S2,     S2,    0x08
+    addi.d     S3,     S3,    0x08
+    addi.d     S4,     S4,    0x08
+    addi.d     P4,     P4,    0x20
+.L_M3:
+    andi       J,      M,     0x02
+    beq        ZERO,   J,     .L_M1
+
+    move       S1,     S0
+    add.d      S2,     S0,    TL
+    add.d      S0,     S0,    T0
+
+    move       P1,     P0
+    addi.d     P0,     P0,    0x80
+
+    srai.d     I,      N,     0x03
+    beq        ZERO,   I,     .L_2N7
+.L_2I1: /* I-- */
+    vld       U0,     S1,    0x00
+    vld       U1,     S1,    0x10
+    vld       U2,     S1,    0x20
+    vld       U3,     S1,    0x30
+    vld       U4,     S2,    0x00
+    vld       U5,     S2,    0x10
+    vld       U6,     S2,    0x20
+    vld       U7,     S2,    0x30
+
+    vst       U0,     P1,    0x00
+    vst       U1,     P1,    0x10
+    vst       U2,     P1,    0x20
+    vst       U3,     P1,    0x30
+    vst       U4,     P1,    0x40
+    vst       U5,     P1,    0x50
+    vst       U6,     P1,    0x60
+    vst       U7,     P1,    0x70
+
+    addi.d     S1,     S1,    0x40
+    addi.d     S2,     S2,    0x40
+    addi.d     I,      I,     -1
+    add.d      P1,     P1,    T1
+    blt        ZERO,   I,     .L_2I1
+.L_2N7:
+    andi       I,      N,     0x04
+    beq        ZERO,   I,     .L_2N3
+
+    vld       U0,     S1,    0x00
+    vld       U1,     S1,    0x10
+    vld       U2,     S2,    0x00
+    vld       U3,     S2,    0x10
+
+    vst       U0,     P2,    0x00
+    vst       U1,     P2,    0x10
+    vst       U2,     P2,    0x20
+    vst       U3,     P2,    0x30
+
+    addi.d     S1,     S1,    0x20
+    addi.d     S2,     S2,    0x20
+    addi.d     P2,     P2,    0x40
+
+.L_2N3:
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_2N1
+
+    vld       U0,     S1,    0x00
+    vld       U1,     S2,    0x00
+
+    vst       U0,     P3,    0x00
+    vst       U1,     P3,    0x10
+
+    addi.d     S1,     S1,    0x10
+    addi.d     S2,     S2,    0x10
+    addi.d     P3,     P3,    0x20
+
+.L_2N1:
+    andi       I,      N,     0x01
+    beq        ZERO,   I,     .L_M1
+
+    fld.d      F0,     S1,    0x00
+    fld.d      F1,     S2,    0x00
+
+    fst.d      F0,     P4,    0x00
+    fst.d      F1,     P4,    0x08
+
+    addi.d     S1,     S1,    0x08
+    addi.d     S2,     S2,    0x08
+    addi.d     P4,     P4,    0x10
+.L_M1:
+    andi       J,      M,     0x01
+    beq        ZERO,   J,     .L_M0
+
+    move       S1,     S0
+    add.d      S2,     S0,    TL
+
+    move       P1,     P0
+    addi.d     P0,     P0,    0x40
+
+    srai.d     I,      N,     0x03
+    beq        ZERO,   I,     .L_1N7
+.L_1I1: /* I-- */
+    vld       U0,     S1,    0x00
+    vld       U1,     S1,    0x10
+    vld       U2,     S1,    0x20
+    vld       U3,     S1,    0x30
+
+    vst       U0,     P1,    0x00
+    vst       U1,     P1,    0x10
+    vst       U2,     P1,    0x20
+    vst       U3,     P1,    0x30
+
+    addi.d     S1,     S1,    0x40
+    addi.d     I,      I,     -1
+    add.d      P1,     P1,    T1
+    blt        ZERO,   I,     .L_1I1
+
+.L_1N7:
+    andi       I,      N,     0x04
+    beq        ZERO,   I,     .L_1N3
+
+    vld       U0,     S1,    0x00
+    vld       U1,     S1,    0x10
+
+    vst       U0,     P2,    0x00
+    vst       U1,     P2,    0x10
+
+    addi.d     S1,     S1,    0x20
+    addi.d     P2,     P2,    0x20
+
+.L_1N3:
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_1N1
+
+    vld       U0,     S1,    0x00
+    vst       U0,     P3,    0x00
+
+    addi.d     S1,     S1,    0x10
+    addi.d     P3,     P3,    0x10
+
+.L_1N1:
+    andi       I,      N,     0x01
+    beq        ZERO,   I,     .L_M0
+
+    fld.d      F0,     S1,    0x00
+
+    fst.d      F0,     P4,    0x00
+
+    addi.d     S1,     S1,    0x08
+    addi.d     P4,     P4,    0x08
+.L_M0:
+    pop_if_used 24, 8
+    jirl       $r0,    $r1,   0x00
+    EPILOGUE
diff --git a/kernel/loongarch64/dgemv_n_8_lasx.S b/kernel/loongarch64/dgemv_n_8_lasx.S
new file mode 100644
index 000000000..a49bf9bb1
--- /dev/null
+++ b/kernel/loongarch64/dgemv_n_8_lasx.S
@@ -0,0 +1,554 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2023/07/14 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*
+*
+*********************************************************************/
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha,
+ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+ */
+#define M       $r4   /* arg m: number of y elements updated per column */
+#define N       $r5   /* arg n: number of columns / x elements consumed */
+#define ALPHA   $f0   /* arg alpha; the prologue broadcasts it into VALPHA */
+#define A       $r7   /* arg a: matrix base pointer */
+#define LDA     $r8   /* arg lda; shifted left by 3 in the prologue (byte stride) */
+#define X       $r9   /* arg x; advanced after each column group */
+#define INC_X   $r10  /* arg inc_x; shifted left by 3 in the prologue */
+#define Y       $r11  /* arg y; running pointer, reset from Y_ORG per column group */
+#define INC_Y   $r6   /* arg inc_y; loaded from the stack in the prologue, then << 3 */
+/* Integer working registers. */
+#define J       $r12  /* column-group counter */
+#define I       $r13  /* row counter; also holds the jump-table index in the prologue */
+#define K       $r14  /* rows-processed counter (written only); prologue scratch */
+#define Y_ORG   $r15  /* copy of the incoming y pointer */
+#define OFFSET  $r16  /* not referenced in this file */
+#define K_LDA   $r17  /* cols * LDA - M8: byte step to the next column group */
+#define M8      $r18  /* M << 3 */
+#define T0      $r19  /* scratch address register */
+#define PA0     $r20  /* PA0..PA7: pointers into eight consecutive columns of a */
+#define PA1     $r23
+#define PA2     $r24
+#define PA3     $r25
+#define PA4     $r26
+#define PA5     $r27
+#define PA6     $r28
+#define PA7     $r29
+/* LASX registers; scalar paths use the same-numbered $fN as the low lane. */
+#define VALPHA  $xr1  /* alpha in every lane */
+#define X0      $xr2  /* X0..X7: alpha * x[k], one x element broadcast per register */
+#define X1      $xr3
+#define X2      $xr4
+#define X3      $xr5
+#define X4      $xr6
+#define X5      $xr7
+#define X6      $xr8
+#define X7      $xr9
+#define Y0      $xr10 /* y accumulator, elements 0-3 ($f10 in scalar paths) */
+#define Y1      $xr11 /* y accumulator, elements 4-7 */
+#define A0      $xr12 /* A0..A15: matrix tiles; A1-A3/A5-A7 also stage strided y loads */
+#define A1      $xr13
+#define A2      $xr14
+#define A3      $xr15
+#define A4      $xr16
+#define A5      $xr17
+#define A6      $xr18
+#define A7      $xr19
+#define A8      $xr20
+#define A9      $xr21
+#define A10     $xr22
+#define A11     $xr23
+#define A12     $xr24
+#define A13     $xr25
+#define A14     $xr26
+#define A15     $xr27
+
+.macro DLOAD_X_8 /* unit-stride x: X0..X7 = alpha * x[0..7], each replicated across lanes */
+    GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \
+                   X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38
+    GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \
+                 X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA
+.endm
+
+.macro DLOAD_X_4 /* unit-stride x: X0..X3 = alpha * x[0..3] */
+    GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18
+    GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA
+.endm
+
+.macro DLOAD_X_2 /* unit-stride x: X0..X1 = alpha * x[0..1] */
+    GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08
+    GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA
+.endm
+
+.macro DLOAD_X_1 /* X0 = alpha * x[0]; used for both unit and strided x */
+    GLDREPL xv, d, X0, X, 0x00
+    GMUL xvf, d, X0, X0, VALPHA
+.endm
+
+.macro DLOAD_Y_8 /* contiguous y: Y0 from Y+0, Y1 from Y+0x20 */
+    GLD xv, , Y0, Y, 0, Y1, Y, 0x20
+.endm
+
+.macro DLOAD_Y_4 /* contiguous y: Y0 from Y+0 */
+    GLD xv, , Y0, Y, 0
+.endm
+
+.macro DLOAD_Y_1 /* one y element into $f10, the low lane of Y0 */
+    fld.d  $f10,   Y,  0
+.endm
+
+.macro DSTORE_Y_8 /* contiguous y: write Y0 to Y+0 and Y1 to Y+0x20 */
+    GST xv, , Y0, Y, 0, Y1, Y, 0x20
+.endm
+
+.macro DSTORE_Y_4 /* contiguous y: write Y0 to Y+0 */
+    GST xv, , Y0, Y, 0
+.endm
+
+.macro DSTORE_Y_1 /* write $f10 (low lane of Y0) back to y */
+    fst.d   $f10,   Y,  0
+.endm
+
+/* Strided y (inc_y != 1): elements are not contiguous, so gather them with scalar loads instead of vector loads. */
+.macro DLOAD_Y_8_GAP
+    fld.d   $f10,   Y,  0           /* y[0] -> low lane of Y0 */
+    fldx.d  $f13,   Y,  INC_Y       /* y[1] -> low lane of A1 (staging) */
+    PTR_ALSL  T0,   INC_Y,  Y,  1   /* T0 = Y + 2 * INC_Y */
+    fld.d   $f14,   T0, 0           /* y[2] -> low lane of A2 */
+    fldx.d  $f15,   T0, INC_Y       /* y[3] -> low lane of A3 */
+    PTR_ALSL  T0,   INC_Y,  Y,  2   /* T0 = Y + 4 * INC_Y */
+    fld.d   $f11,   T0, 0           /* y[4] -> low lane of Y1 */
+    fldx.d  $f17,   T0, INC_Y       /* y[5] -> low lane of A5 */
+    PTR_ADD   T0,   T0, INC_Y
+    PTR_ADD   T0,   T0, INC_Y       /* T0 = Y + 6 * INC_Y */
+    fld.d   $f18,   T0, 0           /* y[6] -> low lane of A6 */
+    fldx.d  $f19,   T0, INC_Y       /* y[7] -> low lane of A7 */
+    GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3, Y1, A5, 1, Y1, A6, 2, Y1, A7, 3 /* pack staged values into lanes 1-3 of Y0 and Y1 */
+.endm
+
+.macro DLOAD_Y_4_GAP /* four-element form of DLOAD_Y_8_GAP: fills Y0 only */
+    fld.d   $f10,   Y,  0           /* y[0] -> low lane of Y0 */
+    fldx.d  $f13,   Y,  INC_Y       /* y[1] -> low lane of A1 */
+    PTR_ALSL  T0,   INC_Y,  Y,  1   /* T0 = Y + 2 * INC_Y */
+    fld.d   $f14,   T0, 0           /* y[2] -> low lane of A2 */
+    fldx.d  $f15,   T0, INC_Y       /* y[3] -> low lane of A3 */
+    GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3 /* pack into lanes 1-3 of Y0 */
+.endm
+
+.macro DSTORE_Y_8_GAP /* strided y: scatter the 8 lanes of Y0/Y1, stepping T0 by INC_Y */
+    xvstelm.d   Y0,     Y,      0,      0   /* lane 0 -> y[0] */
+    PTR_ADD     T0,     Y,      INC_Y
+    xvstelm.d   Y0,     T0,     0,      1   /* lane 1 -> y[1] */
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y0,     T0,     0,      2   /* lane 2 -> y[2] */
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y0,     T0,     0,      3   /* lane 3 -> y[3] */
+/* Second half: lanes of Y1 go to y[4..7]. */
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y1,     T0,     0,      0
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y1,     T0,     0,      1
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y1,     T0,     0,      2
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y1,     T0,     0,      3
+.endm
+
+.macro DSTORE_Y_4_GAP /* strided y: scatter the 4 lanes of Y0; Y itself is left unchanged */
+    xvstelm.d   Y0,     Y,      0,      0   /* lane 0 -> y[0] */
+    PTR_ADD     T0,     Y,      INC_Y
+    xvstelm.d   Y0,     T0,     0,      1   /* lane 1 -> y[1] */
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y0,     T0,     0,      2   /* lane 2 -> y[2] */
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y0,     T0,     0,      3   /* lane 3 -> y[3] */
+.endm
+
+.macro DLOAD_X_8_GAP /* strided x: X0..X7 = alpha * x[0..7]; T0 walks x in INC_X steps */
+    xvldrepl.d  X0,     X,      0x00    /* x[0] replicated across lanes */
+    PTR_ADD     T0,     X,      INC_X
+    xvldrepl.d  X1,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X2,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X3,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X4,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X5,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X6,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X7,     T0,     0x00
+    GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \
+                 X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA
+.endm
+
+.macro DLOAD_X_4_GAP /* strided x: X0..X3 = alpha * x[0..3] */
+    xvldrepl.d  X0,     X,      0x00
+    PTR_ADD     T0,     X,      INC_X
+    xvldrepl.d  X1,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X2,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.d  X3,     T0,     0x00
+    GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA
+.endm
+
+.macro DLOAD_X_2_GAP /* strided x: X0..X1 = alpha * x[0..1] */
+    xvldrepl.d  X0,     X,      0x00
+    PTR_ADD     T0,     X,      INC_X
+    xvldrepl.d  X1,     T0,     0x00
+    GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA
+.endm
+
+.macro DGEMV_N_8x8 /* 8 rows x 8 columns: Y0/Y1 += column tile * X0..X7 */
+    GLD_INC xv, , 0x20,       \
+    A0,  PA0, 0, A1,  PA0, 0, \
+    A2,  PA1, 0, A3,  PA1, 0, \
+    A4,  PA2, 0, A5,  PA2, 0, \
+    A6,  PA3, 0, A7,  PA3, 0, \
+    A8,  PA4, 0, A9,  PA4, 0, \
+    A10, PA5, 0, A11, PA5, 0, \
+    A12, PA6, 0, A13, PA6, 0, \
+    A14, PA7, 0, A15, PA7, 0
+/* NOTE(review): GLD_INC is defined in loongarch64_asm.S; it is expected to advance the pointer by 0x20 after each load, so each PAn yields two consecutive 4-double tiles. */
+    GMADD xvf, d, Y0, A0,  X0, Y0, Y1, A1,  X0, Y1, \
+                  Y0, A2,  X1, Y0, Y1, A3,  X1, Y1, \
+                  Y0, A4,  X2, Y0, Y1, A5,  X2, Y1, \
+                  Y0, A6,  X3, Y0, Y1, A7,  X3, Y1, \
+                  Y0, A8,  X4, Y0, Y1, A9,  X4, Y1, \
+                  Y0, A10, X5, Y0, Y1, A11, X5, Y1, \
+                  Y0, A12, X6, Y0, Y1, A13, X6, Y1, \
+                  Y0, A14, X7, Y0, Y1, A15, X7, Y1
+.endm
+
+.macro DGEMV_N_4x8 /* 4 rows x 8 columns: Y0 += one tile per column * X0..X7 */
+    GLD_INC xv, , 0x20, A0,  PA0, 0, \
+                        A2,  PA1, 0, \
+                        A4,  PA2, 0, \
+                        A6,  PA3, 0, \
+                        A8,  PA4, 0, \
+                        A10, PA5, 0, \
+                        A12, PA6, 0, \
+                        A14, PA7, 0
+/* Only the even-numbered A registers are used; each column contributes to Y0. */
+    GMADD xvf, d, Y0, A0,  X0, Y0, \
+                  Y0, A2,  X1, Y0, \
+                  Y0, A4,  X2, Y0, \
+                  Y0, A6,  X3, Y0, \
+                  Y0, A8,  X4, Y0, \
+                  Y0, A10, X5, Y0, \
+                  Y0, A12, X6, Y0, \
+                  Y0, A14, X7, Y0
+.endm
+
+.macro DGEMV_N_1x8 /* 1 row x 8 columns, scalar: $f10 += a[row][c] * $f(2+c); $f2..$f9 are the low lanes of X0..X7 */
+    GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0, \
+                        $f20, PA4, 0, $f22, PA5, 0, $f24, PA6, 0, $f26, PA7, 0
+    GMADD f, d, $f10, $f12, $f2, $f10, \
+                $f10, $f14, $f3, $f10, \
+                $f10, $f16, $f4, $f10, \
+                $f10, $f18, $f5, $f10, \
+                $f10, $f20, $f6, $f10, \
+                $f10, $f22, $f7, $f10, \
+                $f10, $f24, $f8, $f10, \
+                $f10, $f26, $f9, $f10
+.endm
+
+.macro DGEMV_N_8x4 /* 8 rows x 4 columns: Y0/Y1 += tiles from PA0..PA3 * X0..X3 */
+    GLD_INC xv, , 0x20,       \
+    A0,  PA0, 0, A1,  PA0, 0, \
+    A2,  PA1, 0, A3,  PA1, 0, \
+    A4,  PA2, 0, A5,  PA2, 0, \
+    A6,  PA3, 0, A7,  PA3, 0
+
+    GMADD xvf, d, Y0, A0,  X0, Y0, Y1, A1,  X0, Y1, \
+                  Y0, A2,  X1, Y0, Y1, A3,  X1, Y1, \
+                  Y0, A4,  X2, Y0, Y1, A5,  X2, Y1, \
+                  Y0, A6,  X3, Y0, Y1, A7,  X3, Y1
+.endm
+
+.macro DGEMV_N_4x4 /* 4 rows x 4 columns: Y0 += tiles from PA0..PA3 * X0..X3 */
+    GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0
+
+    GMADD xvf, d, Y0, A0,  X0, Y0, Y0, A2,  X1, Y0, \
+                  Y0, A4,  X2, Y0, Y0, A6,  X3, Y0
+.endm
+
+.macro DGEMV_N_1x4 /* 1 row x 4 columns, scalar: $f10 += a * $f2..$f5 (low lanes of X0..X3) */
+    GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0
+    GMADD f, d, $f10, $f12, $f2, $f10, $f10, $f14, $f3, $f10, \
+                $f10, $f16, $f4, $f10, $f10, $f18, $f5, $f10
+.endm
+
+.macro DGEMV_N_8x2 /* 8 rows x 2 columns: Y0/Y1 += tiles from PA0..PA1 * X0..X1 */
+    GLD_INC xv, , 0x20,       \
+    A0,  PA0, 0, A1,  PA0, 0, \
+    A2,  PA1, 0, A3,  PA1, 0
+    GMADD xvf, d, Y0, A0,  X0, Y0, Y1, A1,  X0, Y1, \
+                  Y0, A2,  X1, Y0, Y1, A3,  X1, Y1
+.endm
+
+.macro DGEMV_N_4x2 /* 4 rows x 2 columns: Y0 += tiles from PA0..PA1 * X0..X1 */
+    GLD_INC xv, , 0x20, A0,  PA0, 0, A2,  PA1, 0
+    GMADD xvf, d, Y0, A0,  X0, Y0, \
+                  Y0, A2,  X1, Y0
+.endm
+
+.macro DGEMV_N_1x2 /* 1 row x 2 columns, scalar: $f10 += a * $f2, $f3 */
+    GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0
+    GMADD f, d, $f10, $f12, $f2, $f10, \
+                $f10, $f14, $f3, $f10
+.endm
+
+.macro DGEMV_N_1x1 /* 1 row x 1 column, scalar, written out without the G* helpers */
+    fld.d   $f12,    PA0,    0              /* a element */
+    PTR_ADDI PA0,   PA0,    0x08            /* step to the next row of this column */
+    fmadd.d $f10,   $f12,   $f2,    $f10    /* $f10 = a * (alpha * x) + y */
+.endm
+
+.macro DGEMV_N_LASX XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req /* XW: tag making labels unique per expansion; X_n and Y_n: suffixes selecting the DLOAD_ and DSTORE_ variants */
+    PTR_SRLI  J,      N,      3               /* J = N / 8 column groups */
+    beqz      J,      .L_\XW\()_N_7
+    PTR_SLLI  K_LDA,  LDA,    3
+    PTR_SUB   K_LDA,  K_LDA,  M8              /* K_LDA = 8 * LDA - M8 */
+.L_\XW\()_N_L8:                               /* ---- 8 columns per pass ---- */
+    DLOAD_\X_8
+    xor     K,      K,      K                 /* K = 0; K is not read within this macro */
+    move    Y,      Y_ORG                     /* restart at the top of y */
+    PTR_SRLI  I,      M,       3              /* I = M / 8 row blocks */
+    beqz      I,      .L_\XW\()_M_7
+.align 5
+.L_\XW\()_M_L8:
+    DLOAD_\Y_8
+    DGEMV_N_8x8
+    DSTORE_\Y_8
+    PTR_ADDI    I,      I,      -1
+    PTR_ALSL    Y,      INC_Y,  Y,  3         /* Y += 8 * INC_Y */
+    PTR_ADDI    K,      K,      8
+    bnez        I,      .L_\XW\()_M_L8
+.L_\XW\()_M_7:
+    andi        I,      M,      4             /* one 4-row block if bit 2 of M is set */
+    beqz        I,      .L_\XW\()_M_3
+    DLOAD_\Y_4
+    DGEMV_N_4x8
+    DSTORE_\Y_4
+    PTR_ALSL    Y,      INC_Y,  Y,  2         /* Y += 4 * INC_Y */
+    PTR_ADDI    K,      K,      4
+.L_\XW\()_M_3:
+    andi        I,      M,      3             /* remaining 0-3 rows, one at a time */
+    beqz        I,      .L_\XW\()_M_END
+.align 5
+.L_\XW\()_M_L1:
+    DLOAD_\Y_1
+    DGEMV_N_1x8
+    DSTORE_\Y_1
+    PTR_ADDI    I,      I,      -1
+    PTR_ADD     Y,      Y,      INC_Y
+    PTR_ADDI    K,      K,      1
+    bnez        I,      .L_\XW\()_M_L1
+.L_\XW\()_M_END:
+    PTR_ADDI    J,      J,      -1
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#endif
+    PTR_ALSL    X,      INC_X,  X,  3         /* X += 8 * INC_X */
+    bnez        J,      .L_\XW\()_N_L8
+.L_\XW\()_N_7:                                /* ---- 4 columns if bit 2 of N is set ---- */
+    andi        J,      N,      4
+    beqz        J,      .L_\XW\()_N_3
+    DLOAD_\X_4
+    xor         K,      K,      K
+    move        Y,      Y_ORG
+/* Same row schedule as above (8-row blocks, one 4-row block, single rows) using PA0..PA3. */
+    PTR_SRLI  I,      M,       3
+    beqz      I,      .L_\XW\()_N_4_M_7
+.align 5
+.L_\XW\()_N_4_M_L8:
+    DLOAD_\Y_8
+    DGEMV_N_8x4
+    DSTORE_\Y_8
+    PTR_ADDI  I,      I,      -1
+    PTR_ADDI  K,      K,      8
+    PTR_ALSL  Y,      INC_Y,  Y,  3
+    bnez    I,      .L_\XW\()_N_4_M_L8
+.L_\XW\()_N_4_M_7:
+    andi    I,      M,      4
+    beqz    I,      .L_\XW\()_N_4_M_3
+    DLOAD_\Y_4
+    DGEMV_N_4x4
+    DSTORE_\Y_4
+    PTR_ALSL  Y,    INC_Y,  Y,  2
+    PTR_ADDI  K,    K,      4
+.L_\XW\()_N_4_M_3:
+    andi        I,      M,      3
+    beqz        I,      .L_\XW\()_N_4_M_END
+.align 5
+.L_\XW\()_N_4_M_L1:
+    DLOAD_\Y_1
+    DGEMV_N_1x4
+    DSTORE_\Y_1
+    PTR_ADDI    I,      I,      -1
+    PTR_ADD     Y,      Y,      INC_Y
+    PTR_ADDI    K,      K,      1
+    bnez        I,      .L_\XW\()_N_4_M_L1
+.L_\XW\()_N_4_M_END:
+    PTR_SLLI    K_LDA,  LDA,    2
+    PTR_SUB     K_LDA,  K_LDA,  M8            /* K_LDA = 4 * LDA - M8 */
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#endif
+    PTR_ALSL    X,      INC_X,  X,  2         /* X += 4 * INC_X */
+.L_\XW\()_N_3:                                /* ---- 2 columns if bit 1 of N is set ---- */
+    andi        J,      N,      2
+    beqz        J,      .L_\XW\()_N_1
+    DLOAD_\X_2
+    xor         K,      K,      K
+    move        Y,      Y_ORG
+    PTR_SRLI    I,      M,       3
+    beqz    I,      .L_\XW\()_N_2_M_7
+.align 5
+.L_\XW\()_N_2_M_L8:
+    DLOAD_\Y_8
+    DGEMV_N_8x2
+    DSTORE_\Y_8
+    PTR_ADDI  I,      I,      -1
+    PTR_ADDI  K,      K,      8
+    PTR_ALSL  Y,      INC_Y,  Y,  3
+    bnez    I,      .L_\XW\()_N_2_M_L8
+.L_\XW\()_N_2_M_7:
+    andi    I,      M,      4
+    beqz    I,      .L_\XW\()_N_2_M_3
+    DLOAD_\Y_4
+    DGEMV_N_4x2
+    DSTORE_\Y_4
+    PTR_ALSL  Y,        INC_Y,      Y,      2
+    PTR_ADDI  K,        K,          4
+.L_\XW\()_N_2_M_3:
+    andi    I,      M,      3
+    beqz    I,      .L_\XW\()_N_2_M_END
+.align 5
+.L_\XW\()_N_2_M_L1:
+    DLOAD_\Y_1
+    DGEMV_N_1x2
+    DSTORE_\Y_1
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   Y,      Y,      INC_Y
+    PTR_ADDI  K,      K,      1
+    bnez    I,      .L_\XW\()_N_2_M_L1
+.L_\XW\()_N_2_M_END:
+    PTR_SLLI    K_LDA,  LDA,    1
+    PTR_SUB     K_LDA,  K_LDA,  M8            /* K_LDA = 2 * LDA - M8 */
+    PTR_ADD     PA0,    PA0,    K_LDA
+    PTR_ADD     PA1,    PA1,    K_LDA
+    PTR_ALSL    X,      INC_X,  X,  1         /* X += 2 * INC_X */
+.L_\XW\()_N_1:                                /* ---- last column if N is odd ---- */
+    andi    J,      N,      1
+    beqz    J,      .L_END
+    DLOAD_\X_1
+    xor     K,      K,      K
+    move    Y,      Y_ORG
+    move    I,      M                         /* all M rows are handled one at a time */
+    beqz    I,      .L_END
+.align 5
+.L_\XW\()_N_1_M_L1:
+    DLOAD_\Y_1
+    DGEMV_N_1x1
+    DSTORE_\Y_1
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   Y,      Y,      INC_Y
+    PTR_ADDI  K,      K,      1
+    bnez    I,      .L_\XW\()_N_1_M_L1
+    b .L_END                                  /* skip the expansions that follow this one */
+.endm
+
+    PROLOGUE
+    PTR_LD     INC_Y,  $sp,    0   /* inc_y is read from the stack before push_if_used runs */
+    push_if_used 17 + 7, 24 + 4
+    PTR_ADDI   K,      $r0,     0x01   /* K = 1 */
+    PTR_SUB    I,      INC_X,   K      /* I = inc_x - 1 */
+    PTR_SUB    J,      INC_Y,   K      /* J = inc_y - 1 */
+    maskeqz    I,      K,       I  /* if(inc_x == 1) I = 0; else I = 1; */
+    maskeqz    J,      K,       J  /* if(inc_y == 1) j = 0; else j = 1; */
+    PTR_ALSL   I,      I,       J,      1   /* I = 2 * I + J: index into .L_GAP_TABLE */
+    GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
+    xvreplve0.d     VALPHA, $xr0   /* replicate lane 0 of $xr0 (alpha, $f0) across VALPHA */
+    move     Y_ORG,  Y
+    move     PA0,    A
+#if __loongarch_grlen == 64
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#else
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#endif
+    la.local    T0,     .L_GAP_TABLE
+    PTR_ALSL    I,      I,      T0,     1   /* I = address of the 2-byte table entry */
+    ld.h        K,      I,      0           /* K = offset of the variant from the table */
+    PTR_ADD     T0,     T0,     K
+    jirl        $r0,    T0,     0           /* jump to the selected variant */
+.L_GAP_TABLE:                               /* 16-bit offsets relative to this label */
+    .hword  .L_GAP_0_0 - .L_GAP_TABLE
+    .hword  .L_GAP_0_1 - .L_GAP_TABLE
+    .hword  .L_GAP_1_0 - .L_GAP_TABLE
+    .hword  .L_GAP_1_1 - .L_GAP_TABLE
+.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */
+    DGEMV_N_LASX GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1
+.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */
+    DGEMV_N_LASX GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1
+.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */
+    DGEMV_N_LASX GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1
+.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
+    DGEMV_N_LASX GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1
+.L_END:                                     /* common exit for every variant */
+    pop_if_used 17 + 7, 24 + 4
+    jirl    $r0, $r1, 0x0
+    EPILOGUE
diff --git a/kernel/loongarch64/dgemv_n_lsx.S b/kernel/loongarch64/dgemv_n_lsx.S
new file mode 100644
index 000000000..9a0141fb1
--- /dev/null
+++ b/kernel/loongarch64/dgemv_n_lsx.S
@@ -0,0 +1,229 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Incoming arguments (INCY and BUFFER are loaded from the stack in the prologue) */
+#define M      $r4    /* number of y elements updated per column */
+#define N      $r5    /* number of columns / x elements consumed */
+#define A      $r7    /* matrix base pointer */
+#define LDA    $r8    /* shifted by BASE_SHIFT in the prologue */
+#define X      $r9
+#define INCX   $r10   /* shifted by BASE_SHIFT in the prologue */
+#define Y      $r11
+#define INCY   $r6    /* shifted by BASE_SHIFT in the prologue */
+#define BUFFER $r16   /* loaded but not otherwise referenced in this file */
+#define ALPHA  $f0
+/* Working registers */
+#define YORIG  $r18   /* not referenced in this file */
+#define T0     $r19   /* scratch: row-block bound and remainder tests */
+#define T1     $r20   /* T1..T4: byte offsets of up to four consecutive y elements */
+#define XX     $r12   /* copy of X */
+#define YY     $r13   /* copy of Y */
+#define I      $r14   /* row-block counter */
+#define J      $r15   /* column counter */
+#define AO1    $r23   /* pointer to the current column of a */
+#define AO2    $r24   /* not referenced in this file */
+#define IX     $r25   /* byte offset of the current x element */
+#define IY     $r26   /* byte offset of the current y element */
+#define II     $r27   /* byte offset of the current row within the column */
+#define T2     $r28
+#define T3     $r29
+#define T4     $r30
+
+/* LSX vectors */
+#define U0     $vr11  /* current x element in both lanes */
+#define U1     $vr12  /* alpha * x; the scalar tail reads its low lane as $f12 */
+#define U2     $vr2   /* matrix rows i, i+1 */
+#define U3     $vr3   /* U3..U6: y staging/accumulators, low lanes aliased by a1..a4 */
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7   /* matrix rows i+2, i+3 */
+#define U8     $vr8   /* not referenced in this file */
+#define U9     $vr9   /* not referenced in this file */
+#define VALPHA $vr10  /* alpha in both lanes */
+/* Scalar views: a1..a4 are the low lanes of U3..U6; a5..a8 are not referenced in this file. */
+#define a1     $f3
+#define a2     $f4
+#define a3     $f5
+#define a4     $f6
+#define a5     $f7
+#define a6     $f8
+#define a7     $f9
+#define a8     $f10
+
+
+    PROLOGUE
+/* dgemv_n (LSX): for each column j, y[i] += (alpha * x[j]) * a[i][j], i = 0..M-1. */
+    LDARG  INCY,    $sp,  0     /* stack-passed arguments, read before $sp is moved */
+    LDARG  BUFFER,  $sp,  8
+
+    addi.d  $sp, $sp, -80       /* frame: saved $r23-$r30 plus one slot for alpha */
+
+    SDARG  $r23,  $sp,    0
+    SDARG  $r24,  $sp,    8
+    SDARG  $r25,  $sp,    16
+    SDARG  $r26,  $sp,    32    /* the slot at offset 24 is left unused */
+    SDARG  $r27,  $sp,    40
+    SDARG  $r28,  $sp,    48
+    SDARG  $r29,  $sp,    56
+    SDARG  $r30,  $sp,    64
+    ST     ALPHA, $sp,    72    /* spill alpha so it can be broadcast from memory */
+
+    vldrepl.d  VALPHA, $sp, 72  /* VALPHA = {alpha, alpha} */
+
+    slli.d     LDA,     LDA,  BASE_SHIFT
+    slli.d     INCX,    INCX, BASE_SHIFT
+    slli.d     INCY,    INCY, BASE_SHIFT
+
+    bge    $r0,    M,    .L999  /* nothing to do when M <= 0 */
+    bge    $r0,    N,    .L999  /* nothing to do when N <= 0 */
+
+    move   J,      $r0
+    move   IX,     $r0
+
+    move   AO1,    A  //a_ptr
+    move   XX,     X
+    move   YY,     Y
+
+    beq    J,      M,     .L999
+/* ---- column loop: one column of a and one element of x per iteration ---- */
+.L01:
+    add.d       T0,    XX,    IX    /* address of x[j]; T0 is reloaded below before its next use */
+    vldrepl.d   U0,    T0,    0     /* load exactly one double into both lanes; a 16-byte vector load here would read past the last x element */
+
+    vfmul.d     U1,    VALPHA,  U0  //temp1
+
+    move        IY,    $r0
+    move        II,    $r0
+    move        I,     $r0
+
+    srai.d      T0,    M,     2     /* T0 = M / 4: number of 4-row blocks */
+    beq         I,     T0,    .L03
+/* ---- four rows per iteration ---- */
+.L02:
+    vldx        U2,    AO1,    II   /* a rows i, i+1 */
+    addi.d      II,    II,     16
+    vldx        U7,    AO1,    II   /* a rows i+2, i+3 */
+
+    move        T1,    IY           /* T1..T4 = byte offsets of four consecutive y elements */
+    add.d       T2,    T1,    INCY
+    add.d       T3,    T2,    INCY
+    add.d       T4,    T3,    INCY
+
+    fldx.d      a1,    YY,    T1    /* gather y into the low lanes of U3..U6 */
+    fldx.d      a2,    YY,    T2
+    fldx.d      a3,    YY,    T3
+    fldx.d      a4,    YY,    T4
+
+    vextrins.d  U3,    U4,    0x10  /* U3 = {y0, y1} */
+    vextrins.d  U5,    U6,    0x10  /* U5 = {y2, y3} */
+
+    vfmadd.d    U3,    U1,    U2,    U3
+    vfmadd.d    U5,    U1,    U7,    U5
+
+    vextrins.d  U4,    U3,    0x01  /* move the high lanes back so a2/a4 can be stored as scalars */
+    vextrins.d  U6,    U5,    0x01
+
+    fstx.d     a1,     YY,    T1
+    fstx.d     a2,     YY,    T2
+    fstx.d     a3,     YY,    T3
+    fstx.d     a4,     YY,    T4
+
+    add.d      IY,     T4,    INCY
+    addi.d     II,     II,    16
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L02
+/* ---- two remaining rows when bit 1 of M is set ---- */
+.L03:
+    andi       T0,     M,     2
+    beq        $r0,    T0,    .L04
+
+    addi.d     T1,     $r0,   4
+    mod.d      T1,     M,     T1
+    sub.d      II,     M,     T1            /* II = M - M % 4: first row of the tail */
+    slli.d     II,     II,    BASE_SHIFT
+
+    move       T1,    IY
+    add.d      T2,    T1,    INCY
+
+    vldx       U2,    AO1,    II
+
+    fldx.d     a1,    YY,    T1
+    fldx.d     a2,    YY,    T2
+
+    vextrins.d  U3,    U4,    0x10
+
+    vfmadd.d    U3,    U1,    U2,    U3
+
+    vextrins.d  U4,    U3,    0x01
+
+    fstx.d     a1,     YY,    T1
+    fstx.d     a2,     YY,    T2
+
+    add.d      IY,     T2,    INCY
+/* ---- last row when M is odd ---- */
+.L04:
+    andi       T0,     M,     1
+    beq        $r0,    T0,    .L05
+
+    addi.d     II,    M,     -1
+    slli.d     II,    II,    BASE_SHIFT     /* byte offset of row M - 1 */
+
+    fldx.d     a1,    AO1,   II
+    fldx.d     a3,    YY,    IY
+
+    fmadd.d    a3,    $f12,  a1,    a3      /* $f12 is the low lane of U1 (alpha * x) */
+
+    fstx.d     a3,    YY,    IY
+
+    add.d      IY,    IY,    INCY
+/* ---- advance to the next column ---- */
+.L05:
+    add.d      AO1,   AO1,   LDA
+    add.d      IX,    IX,    INCX
+
+    addi.d     J,     J,     1
+    blt        J,     N,    .L01
+/* ---- restore saved registers and return ---- */
+.L999:
+    LDARG  $r23,  $sp,    0
+    LDARG  $r24,  $sp,    8
+    LDARG  $r25,  $sp,    16
+    LDARG  $r26,  $sp,    32
+    LDARG  $r27,  $sp,    40
+    LDARG  $r28,  $sp,    48
+    LDARG  $r29,  $sp,    56
+    LDARG  $r30,  $sp,    64
+    LD     ALPHA, $sp,    72
+    addi.d  $sp, $sp, 80
+    jirl    $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/dgemv_t_8_lasx.S b/kernel/loongarch64/dgemv_t_8_lasx.S
new file mode 100644
index 000000000..71f942b0f
--- /dev/null
+++ b/kernel/loongarch64/dgemv_t_8_lasx.S
@@ -0,0 +1,481 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2023/07/17 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*
+*
+*********************************************************************/
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha,
+ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+ */
+#define M       $r4     /* m argument */
+#define N       $r5     /* n argument */
+#define ALPHA   $f0     /* alpha argument (scalar) */
+#define A       $r7     /* a argument */
+#define LDA     $r8     /* lda; shifted left by 3 in the prologue */
+#define X       $r9     /* x cursor; reloaded from X_ORG for every column group */
+#define INC_X   $r10    /* inc_x; shifted left by 3 in the prologue */
+#define Y       $r11    /* y cursor */
+#define INC_Y   $r6     /* inc_y; read from the stack in the prologue, then shifted left by 3 */
+/* General-purpose scratch registers. */
+#define J       $r12    /* countdown over column groups */
+#define I       $r13    /* countdown over rows; also the jump-table index in the prologue */
+#define K       $r14    /* prologue scratch; same register as PY0 */
+#define PY0     $r14    /* Y + 2 * INC_Y */
+#define X_ORG   $r15    /* copy of the incoming x pointer */
+#define PY1     $r16    /* Y + 4 * INC_Y */
+#define K_LDA   $r17    /* (columns in the group) * LDA - M8 */
+#define PY2     $r18    /* PY1 + 2 * INC_Y */
+#define T0      $r19    /* temporary address */
+#define PA0     $r20    /* PA0..PA7: cursors into a; PA(k+1) = PA(k) + LDA in the prologue */
+#define PA1     $r23
+#define PA2     $r24
+#define PA3     $r25
+#define PA4     $r26
+#define PA5     $r27
+#define PA6     $r28
+#define PA7     $r29
+#define M8      $r30    /* M shifted left by 3 in the prologue */
+/* LASX vector registers. */
+#define VALPHA  $xr0    /* same register number as ALPHA ($f0) */
+#define X0      $xr1    /* x operands */
+#define X1      $xr2
+#define A0      $xr3    /* A0..A15: operands loaded from a */
+#define A1      $xr4
+#define A2      $xr5
+#define A3      $xr6
+#define A4      $xr7
+#define A5      $xr8
+#define A6      $xr9
+#define A7      $xr10
+#define A8      $xr11
+#define A9      $xr12
+#define A10     $xr13
+#define A11     $xr14
+#define A12     $xr15
+#define A13     $xr16
+#define A14     $xr17
+#define A15     $xr18
+#define TP0     $xr19   /* TP0..TP7: per-column vector accumulators */
+#define TP1     $xr20
+#define TP2     $xr21
+#define TP3     $xr22
+#define TP4     $xr23
+#define TP5     $xr24
+#define TP6     $xr25
+#define TP7     $xr26
+#define Y0      $xr3    /* Y0..Y7 reuse the registers of A0..A7 */
+#define Y1      $xr4
+#define Y2      $xr5
+#define Y3      $xr6
+#define Y4      $xr7
+#define Y5      $xr8
+#define Y6      $xr9
+#define Y7      $xr10
+
+.macro ZERO_Y8 /* hands TP0..TP7 to GXOR, each as (dst, src, src) = (TPk, TPk, TPk) */
+    GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \
+                TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7
+.endm
+/* Same as ZERO_Y8, restricted to TP0..TP3 (used by the four-column group). */
+.macro ZERO_Y4
+    GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3
+.endm
+/* Same as ZERO_Y8, restricted to TP0..TP1 (used by the two-column group). */
+.macro ZERO_Y2
+    GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1
+.endm
+/* Same as ZERO_Y8, restricted to TP0 (used by the single-column tail). */
+.macro ZERO_Y1
+    GXOR xv, v, TP0, TP0, TP0
+.endm
+
+.macro DLOAD_X8 /* unit-stride x: X0 from X + 0x00 and X1 from X + 0x20, via the GLD helper */
+    GLD xv, , X0, X, 0x00, X1, X, 0x20
+.endm
+/* Unit-stride x, four-row step: only X0 is loaded, from X + 0x00. */
+.macro DLOAD_X4
+    GLD xv, , X0, X, 0x00
+.endm
+
+.macro DLOAD_X8_GAP /* strided x: gather eight elements, INC_X bytes apart, into X0 and X1 */
+    fld.d       $f1,    X,    0x00      /* element 0 -> $f1 (low lane of X0) */
+    fldx.d      $f2,    X,    INC_X     /* element 1 -> $f2 */
+    PTR_ALSL    T0,     INC_X,      X,      1   /* T0 = X + 2 * INC_X */
+    fld.d       $f3,    T0,   0x00      /* element 2 -> $f3 (A0 used as scratch) */
+    fldx.d      $f4,    T0,   INC_X     /* element 3 -> $f4 (A1 used as scratch) */
+    GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3   /* X0 lanes 1..3 <- X1, A0, A1 */
+    PTR_ALSL    T0,     INC_X,      X,      2   /* T0 = X + 4 * INC_X */
+    fld.d       $f2,    T0,   0x00      /* element 4 -> $f2 (low lane of X1) */
+    fldx.d      $f3,    T0,   INC_X     /* element 5 -> $f3 */
+    PTR_ALSL    T0,     INC_X,      T0,     1   /* T0 = X + 6 * INC_X */
+    fld.d       $f4,    T0,   0x00      /* element 6 -> $f4 */
+    fldx.d      $f5,    T0,   INC_X     /* element 7 -> $f5 (A2 used as scratch) */
+    GINSVE0 xv, d, X1, A0, 1, X1, A1, 2, X1, A2, 3   /* X1 lanes 1..3 <- A0, A1, A2 */
+.endm
+
+.macro DLOAD_X4_GAP /* strided x: gather four elements, INC_X bytes apart, into X0 */
+    fld.d       $f1,    X,    0x00      /* element 0 -> $f1 (low lane of X0) */
+    fldx.d      $f2,    X,    INC_X     /* element 1 -> $f2 (X1 used as scratch) */
+    PTR_ALSL    T0,     INC_X,      X,      1   /* T0 = X + 2 * INC_X */
+    fld.d       $f3,    T0,   0x00      /* element 2 -> $f3 (A0 used as scratch) */
+    fldx.d      $f4,    T0,   INC_X     /* element 3 -> $f4 (A1 used as scratch) */
+    GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3   /* X0 lanes 1..3 <- X1, A0, A1 */
+.endm
+
+.macro DGEMV_T_8x8 /* eight rows of eight columns: two loads per PA cursor, FMA into TP0..TP7 */
+    GLD_INC xv, , 0x20,       \
+    A0,  PA0, 0, A1,  PA0, 0, \
+    A2,  PA1, 0, A3,  PA1, 0, \
+    A4,  PA2, 0, A5,  PA2, 0, \
+    A6,  PA3, 0, A7,  PA3, 0, \
+    A8,  PA4, 0, A9,  PA4, 0, \
+    A10, PA5, 0, A11, PA5, 0, \
+    A12, PA6, 0, A13, PA6, 0, \
+    A14, PA7, 0, A15, PA7, 0
+/* For column k: TPk gets A(2k) with X0, then A(2k+1) with X1, each added onto TPk. */
+    GMADD xvf, d, TP0, A0,  X0, TP0, TP0, A1,  X1, TP0, \
+                  TP1, A2,  X0, TP1, TP1, A3,  X1, TP1, \
+                  TP2, A4,  X0, TP2, TP2, A5,  X1, TP2, \
+                  TP3, A6,  X0, TP3, TP3, A7,  X1, TP3, \
+                  TP4, A8,  X0, TP4, TP4, A9,  X1, TP4, \
+                  TP5, A10, X0, TP5, TP5, A11, X1, TP5, \
+                  TP6, A12, X0, TP6, TP6, A13, X1, TP6, \
+                  TP7, A14, X0, TP7, TP7, A15, X1, TP7
+.endm
+
+.macro DGEMV_T_8x4 /* four rows of eight columns: one load per PA cursor, FMA with X0 into TP0..TP7 */
+    GLD_INC xv, , 0x20, A0,  PA0, 0, A2,  PA1, 0, A4,  PA2, 0, A6,  PA3, 0, \
+                        A8,  PA4, 0, A10, PA5, 0, A12, PA6, 0, A14, PA7, 0
+/* The argument list ends without a trailing comma, matching the sibling DGEMV_T_* macros. */
+    GMADD xvf, d, TP0, A0,  X0, TP0, TP1, A2,  X0, TP1, \
+                  TP2, A4,  X0, TP2, TP3, A6,  X0, TP3, \
+                  TP4, A8,  X0, TP4, TP5, A10, X0, TP5, \
+                  TP6, A12, X0, TP6, TP7, A14, X0, TP7
+.endm
+
+.macro DGEMV_T_4x8 /* eight rows of four columns: two loads per cursor PA0..PA3, FMA into TP0..TP3 */
+    GLD_INC xv, , 0x20,       \
+    A0,  PA0, 0, A1,  PA0, 0, \
+    A2,  PA1, 0, A3,  PA1, 0, \
+    A4,  PA2, 0, A5,  PA2, 0, \
+    A6,  PA3, 0, A7,  PA3, 0
+/* For column k: TPk gets A(2k) with X0, then A(2k+1) with X1. */
+    GMADD xvf, d, TP0, A0,  X0, TP0, TP0, A1,  X1, TP0, \
+                  TP1, A2,  X0, TP1, TP1, A3,  X1, TP1, \
+                  TP2, A4,  X0, TP2, TP2, A5,  X1, TP2, \
+                  TP3, A6,  X0, TP3, TP3, A7,  X1, TP3
+.endm
+/* Four rows of four columns: one load per cursor PA0..PA3, FMA with X0 into TP0..TP3. */
+.macro DGEMV_T_4x4
+    GLD_INC xv, , 0x20, A0,  PA0, 0, A2,  PA1, 0, A4,  PA2, 0, A6,  PA3, 0
+/* TPk gets A(2k) with X0, for k = 0..3. */
+    GMADD xvf, d, TP0, A0,  X0, TP0, TP1, A2,  X0, TP1, \
+                  TP2, A4,  X0, TP2, TP3, A6,  X0, TP3
+.endm
+/* Eight rows of two columns: two loads per cursor PA0..PA1, FMA into TP0..TP1. */
+.macro DGEMV_T_2x8
+    GLD_INC xv, , 0x20, A0,  PA0, 0, A1,  PA0, 0, A2, PA1, 0, A3,  PA1, 0
+/* TP0 gets A0 with X0 then A1 with X1; TP1 gets A2 with X0 then A3 with X1. */
+    GMADD xvf, d, TP0, A0,  X0, TP0, TP0, A1,  X1, TP0, \
+                  TP1, A2,  X0, TP1, TP1, A3,  X1, TP1
+.endm
+/* Four rows of two columns: one load per cursor PA0..PA1, FMA with X0 into TP0..TP1. */
+.macro DGEMV_T_2x4
+    GLD_INC xv, , 0x20, A0,  PA0, 0, A2,  PA1, 0
+/* TP0 gets A0 with X0; TP1 gets A2 with X0. */
+    GMADD xvf, d, TP0, A0,  X0, TP0, TP1, A2,  X0, TP1
+.endm
+
+.macro DGEMV_T_LASX XW:req X8:req, X4:req /* XW: label infix; X8, X4: suffixes selecting the DLOAD_ macros */
+    PTR_SRLI  J,      N,      3             /* J = N >> 3: number of eight-column groups */
+    beqz      J,      .L_\XW\()_N_7
+    PTR_SLLI  K_LDA,  LDA,    3
+    PTR_SUB   K_LDA,  K_LDA,  M8            /* K_LDA = 8 * LDA - M8 */
+.L_\XW\()_N_L8: /* one iteration per group of eight columns */
+    ZERO_Y8
+    move      X,      X_ORG                 /* restart x for this column group */
+    PTR_SRLI  I,      M,       3            /* I = M >> 3: number of eight-row steps */
+    beqz      I,      .L_\XW\()_M_7
+.align 5
+.L_\XW\()_M_L8: /* eight rows per iteration */
+    DLOAD_\X8
+    DGEMV_T_8x8
+    PTR_ADDI    I,  I,  -1
+    PTR_ALSL    X,  INC_X,  X,  3           /* X += 8 * INC_X */
+    bnez        I,  .L_\XW\()_M_L8
+.L_\XW\()_M_7: /* optional four-row step when bit 2 of M is set */
+    andi        I,      M,      4
+    beqz        I,      .L_\XW\()_M_3
+    DLOAD_\X4
+    DGEMV_T_8x4
+    PTR_ALSL    X,  INC_X,  X,  2           /* X += 4 * INC_X */
+.L_\XW\()_M_3:
+    // Accumulated
+    GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3, Y4, TP4, \
+                 Y5, TP5, Y6, TP6, Y7, TP7
+    andi        I,      M,      3           /* I = M & 3: rows left for the scalar loop */
+    beqz        I,      .L_\XW\()_M_END
+.align 5
+.L_\XW\()_M_L1: /* one row per iteration; $f3..$f10 are the low lanes of Y0..Y7 */
+    fld.d   $f1,    X,      0x00
+    fld.d   $f11,   PA0,    0x00
+    fld.d   $f12,   PA1,    0x00
+    fld.d   $f13,   PA2,    0x00
+    fld.d   $f14,   PA3,    0x00
+    fld.d   $f15,   PA4,    0x00
+    fld.d   $f16,   PA5,    0x00
+    fld.d   $f17,   PA6,    0x00
+    fld.d   $f18,   PA7,    0x00
+#if __loongarch_grlen == 64
+    GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
+               PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
+#elif __loongarch_grlen == 32
+    GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
+               PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
+#else
+    GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
+               PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
+#endif
+    GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6, \
+                $f7, $f15, $f1, $f7, $f8, $f16, $f1, $f8, $f9, $f17, $f1, $f9, $f10, $f18, $f1, $f10
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    bnez      I,      .L_\XW\()_M_L1
+.L_\XW\()_M_END: /* load eight y elements into $f11..$f18, INC_Y bytes apart */
+    fld.d   $f11,   Y,  0x00
+    fldx.d  $f12,   Y,  INC_Y
+    PTR_ALSL    PY0, INC_Y,  Y,  1          /* PY0 = Y + 2 * INC_Y */
+    fld.d   $f13,   PY0,    0x00
+    fldx.d  $f14,   PY0,    INC_Y
+    PTR_ALSL    PY1, INC_Y,  Y,  2          /* PY1 = Y + 4 * INC_Y */
+    fld.d   $f15,   PY1,    0x00
+    fldx.d  $f16,   PY1,    INC_Y
+    PTR_ALSL    PY2, INC_Y,  PY1, 1         /* PY2 = PY1 + 2 * INC_Y */
+    fld.d   $f17,   PY2,    0x00
+    fldx.d  $f18,   PY2,    INC_Y
+/* Each loaded y element gets ALPHA times its accumulator ($f3..$f10) added to it. */
+    GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14, \
+                $f15, ALPHA, $f7, $f15, $f16, ALPHA, $f8, $f16, $f17, ALPHA, $f9, $f17, $f18, ALPHA, $f10, $f18
+/* Every PA cursor is then bumped by K_LDA, i.e. 8 * LDA - M8. */
+    PTR_ADDI    J,      J,      -1
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#endif
+    fst.d   $f11,   Y,      0x00
+    fstx.d  $f12,   Y,      INC_Y
+    fst.d   $f13,   PY0,    0x00
+    fstx.d  $f14,   PY0,    INC_Y
+    fst.d   $f15,   PY1,    0x00
+    fstx.d  $f16,   PY1,    INC_Y
+    fst.d   $f17,   PY2,    0x00
+    fstx.d  $f18,   PY2,    INC_Y
+    PTR_ALSL    Y,      INC_Y,  Y,  3       /* Y += 8 * INC_Y */
+    bnez        J,      .L_\XW\()_N_L8
+.L_\XW\()_N_7: /* four-column group, taken when bit 2 of N is set */
+    andi        J,      N,      4
+    beqz        J,      .L_\XW\()_N_3
+    ZERO_Y4
+    move        X,      X_ORG
+    PTR_SRLI    I,      M,       3
+    beqz        I,      .L_\XW\()_N_4_M_7
+.align 5
+.L_\XW\()_N_4_M_L8: /* eight rows per iteration */
+    DLOAD_\X8
+    DGEMV_T_4x8
+    PTR_ADDI  I,      I,      -1
+    PTR_ALSL  X,      INC_X,  X,  3         /* X += 8 * INC_X */
+    bnez      I,      .L_\XW\()_N_4_M_L8
+.L_\XW\()_N_4_M_7: /* optional four-row step when bit 2 of M is set */
+    andi    I,      M,      4
+    beqz    I,      .L_\XW\()_N_4_M_3
+    DLOAD_\X4
+    DGEMV_T_4x4
+    PTR_ALSL  X,      INC_X,  X,  2         /* X += 4 * INC_X */
+.L_\XW\()_N_4_M_3:
+    // Accumulated
+    GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3
+    andi        I,      M,      3
+    beqz        I,      .L_\XW\()_N_4_M_END
+.align 5
+.L_\XW\()_N_4_M_L1: /* one row per iteration; accumulates into $f3..$f6 */
+    fld.d   $f1,    X,      0x00
+    GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00, $f13, PA2, 0x00, $f14, PA3, 0x00
+    GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    bnez      I,      .L_\XW\()_N_4_M_L1
+.L_\XW\()_N_4_M_END: /* load four y elements into $f11..$f14 */
+    fld.d   $f11,   Y,  0x00
+    fldx.d  $f12,   Y,  INC_Y
+    PTR_ALSL    PY0, INC_Y,  Y,  1          /* PY0 = Y + 2 * INC_Y */
+    fld.d   $f13,   PY0,    0x00
+    fldx.d  $f14,   PY0,    INC_Y
+/* Each loaded y element gets ALPHA times its accumulator ($f3..$f6) added to it. */
+    GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14
+/* K_LDA is recomputed for a four-column step. */
+    PTR_SLLI    K_LDA,  LDA,    2
+    PTR_SUB     K_LDA,  K_LDA,  M8          /* K_LDA = 4 * LDA - M8 */
+
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#endif
+    fst.d   $f11,   Y,      0x00
+    fstx.d  $f12,   Y,      INC_Y
+    fst.d   $f13,   PY0,    0x00
+    fstx.d  $f14,   PY0,    INC_Y
+    PTR_ALSL    Y,      INC_Y,  Y,  2       /* Y += 4 * INC_Y */
+.L_\XW\()_N_3: /* two-column group, taken when bit 1 of N is set */
+    andi        J,      N,      2
+    beqz        J,      .L_\XW\()_N_1
+    ZERO_Y2
+    move        X,      X_ORG
+    PTR_SRLI    I,      M,       3
+    beqz        I,      .L_\XW\()_N_2_M_7
+.align 5
+.L_\XW\()_N_2_M_L8: /* eight rows per iteration */
+    DLOAD_\X8
+    DGEMV_T_2x8
+    PTR_ADDI  I,      I,      -1
+    PTR_ALSL  X,      INC_X,  X,  3         /* X += 8 * INC_X */
+    bnez      I,      .L_\XW\()_N_2_M_L8
+.L_\XW\()_N_2_M_7: /* optional four-row step when bit 2 of M is set */
+    andi    I,      M,      4
+    beqz    I,      .L_\XW\()_N_2_M_3
+    DLOAD_\X4
+    DGEMV_T_2x4
+    PTR_ALSL  X,        INC_X,      X,      2   /* X += 4 * INC_X */
+.L_\XW\()_N_2_M_3:
+    // Accumulated
+    GACC xvf, d, Y0, TP0, Y1, TP1
+    andi        I,      M,      3
+    beqz        I,      .L_\XW\()_N_2_M_END
+.align 5
+.L_\XW\()_N_2_M_L1: /* one row per iteration; accumulates into $f3 and $f4 */
+    fld.d   $f1,    X,      0x00
+    GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00
+    GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    bnez      I,      .L_\XW\()_N_2_M_L1
+.L_\XW\()_N_2_M_END: /* load two y elements into $f11 and $f12 */
+    fld.d   $f11,   Y,  0x00
+    fldx.d  $f12,   Y,  INC_Y
+/* Each loaded y element gets ALPHA times its accumulator ($f3, $f4) added to it. */
+    GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12
+/* K_LDA is recomputed for a two-column step. */
+    PTR_SLLI    K_LDA,  LDA,    1
+    PTR_SUB     K_LDA,  K_LDA,  M8          /* K_LDA = 2 * LDA - M8 */
+
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA
+#endif
+    fst.d   $f11,   Y,      0x00
+    fstx.d  $f12,   Y,      INC_Y
+    PTR_ALSL        Y,      INC_Y,  Y,  1   /* Y += 2 * INC_Y */
+.L_\XW\()_N_1: /* single remaining column, taken when bit 0 of N is set */
+    andi    J,      N,      1
+    beqz    J,      .L_END
+    ZERO_Y1
+    move    X,      X_ORG
+    move    I,      M
+    beqz    I,      .L_END
+.align 5
+.L_\XW\()_N_1_M_L1: /* scalar loop over all M rows; $f19 is the low lane of TP0 */
+    fld.d   $f3,    PA0,    0x00
+    fld.d   $f1,    X,      0x00
+    fmadd.d $f19,   $f3,    $f1,    $f19
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    PTR_ADDI  PA0,    PA0,    0x08
+    bnez      I,      .L_\XW\()_N_1_M_L1
+    fld.d     $f3,    Y,      0x00
+    fmadd.d   $f3,    ALPHA,  $f19,  $f3    /* y element = ALPHA * $f19 + y element */
+    fst.d     $f3,    Y,      0x00
+    b .L_END
+.endm
+
+    PROLOGUE
+    PTR_LD     INC_Y,  $sp,    0            /* inc_y is read from the stack */
+    push_if_used 17 + 8, 24 + 3             /* helper defined outside this file; paired with pop_if_used at .L_END */
+    PTR_ADDI   K,      $r0,     0x01        /* K = 1 */
+    PTR_SUB    I,      INC_X,   K           /* I = inc_x - 1 */
+    maskeqz    I,      K,       I  /* if(inc_x == 1) I = 0; else I = 1; */
+    GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
+    xvreplve0.d     VALPHA, $xr0            /* VALPHA is $xr0; the code visible in this file only reads the scalar ALPHA afterwards */
+    move     X_ORG,  X                      /* keep the start of x so each column group can restart from it */
+    move     PA0,    A
+#if __loongarch_grlen == 64
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#else
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#endif
+    la.local    T0,     .L_GAP_TABLE        /* T0 = address of the two-entry offset table */
+    PTR_ALSL    I,      I,      T0,     1   /* I = T0 + 2 * I: address of the selected 16-bit entry */
+    ld.h        K,      I,      0           /* K = offset of the chosen variant from the table base */
+    PTR_ADD     T0,     T0,     K
+    jirl        $r0,    T0,     0           /* jump to .L_GAP_0 or .L_GAP_1 */
+.L_GAP_TABLE:
+    .hword .L_GAP_0 - .L_GAP_TABLE
+    .hword .L_GAP_1 - .L_GAP_TABLE
+.L_GAP_0: /* if (incx == 1) */
+    DGEMV_T_LASX GAP_0, X8, X4
+.L_GAP_1: /* if (incx != 1) */
+    DGEMV_T_LASX GAP_1, X8_GAP, X4_GAP
+.L_END:
+    pop_if_used 17 + 8, 24 + 3
+    jirl    $r0, $r1, 0x0
+    EPILOGUE
diff --git a/kernel/loongarch64/dgemv_t_lsx.S b/kernel/loongarch64/dgemv_t_lsx.S
new file mode 100644
index 000000000..76f0d9bdc
--- /dev/null
+++ b/kernel/loongarch64/dgemv_t_lsx.S
@@ -0,0 +1,279 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Param */
+#define M      $r4      /* row count: bound of the inner loops */
+#define N      $r5      /* column count: drives the n/4, n&2 and n&1 sections */
+#define A      $r7      /* matrix pointer; copied to AO1 */
+#define LDA    $r8      /* shifted left by BASE_SHIFT on entry */
+#define X      $r9      /* base of x; indexed with IX */
+#define INCX   $r10     /* shifted left by BASE_SHIFT on entry */
+#define Y      $r11     /* base of y; indexed with IY */
+#define INCY   $r6      /* loaded from the stack on entry, then shifted left by BASE_SHIFT */
+#define BUFFER $r16     /* loaded from the stack on entry; not referenced again in this file */
+#define ALPHA  $f0
+/* General-purpose scratch registers. */
+#define YORIG  $r18     /* not referenced in this file */
+#define T0     $r19
+#define T1     $r20
+#define AO3    $r12     /* AO2 + LDA */
+#define AO4    $r13     /* AO3 + LDA */
+#define I      $r14     /* row counter */
+#define J      $r15     /* counter over groups of four columns */
+#define AO1    $r23     /* start of the current column of a */
+#define AO2    $r24     /* AO1 + LDA */
+#define IX     $r25     /* byte offset into x */
+#define IY     $r26     /* byte offset into y */
+#define II     $r27     /* byte offset into the current columns of a */
+#define T2     $r28
+#define T3     $r29
+#define T4     $r30
+
+/* LSX vectors */
+#define U0     $vr11    /* accumulator for the first column pair */
+#define U1     $vr12    /* x element broadcast to both lanes */
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7     /* accumulator for the second column pair */
+#define U8     $vr8     /* not referenced in this file */
+#define U9     $vr9     /* not referenced in this file */
+#define VALPHA $vr10    /* alpha replicated into both lanes */
+/* Scalar names a1..a8 use $f3..$f10, the same register numbers as U3..U8, U9 and VALPHA. */
+#define a1     $f3
+#define a2     $f4
+#define a3     $f5
+#define a4     $f6
+#define a5     $f7
+#define a6     $f8
+#define a7     $f9
+#define a8     $f10
+
+
+    PROLOGUE
+/* The two trailing arguments are read from the stack before the frame is allocated. */
+    LDARG  INCY,    $sp,  0
+    LDARG  BUFFER,  $sp,  8
+/* 80-byte frame: $r23..$r30 plus a copy of alpha at offset 72 (offset 24 is left unused). */
+    addi.d  $sp, $sp, -80
+
+    SDARG  $r23,  $sp,    0
+    SDARG  $r24,  $sp,    8
+    SDARG  $r25,  $sp,    16
+    SDARG  $r26,  $sp,    32
+    SDARG  $r27,  $sp,    40
+    SDARG  $r28,  $sp,    48
+    SDARG  $r29,  $sp,    56
+    SDARG  $r30,  $sp,    64
+    ST     ALPHA, $sp,    72
+/* Replicate the stored alpha into both lanes of VALPHA. */
+    vldrepl.d  VALPHA, $sp, 72
+/* Convert lda, incx and incy from element counts to byte strides. */
+    slli.d     LDA,     LDA,  BASE_SHIFT
+    slli.d     INCX,    INCX, BASE_SHIFT
+    slli.d     INCY,    INCY, BASE_SHIFT
+/* Nothing to do when m <= 0 or n <= 0. */
+    bge    $r0,    M,    .L999
+    bge    $r0,    N,    .L999
+
+    move   J,      $r0
+    move   IY,     $r0
+
+    move   AO1,    A  //a_ptr1
+
+    srai.d     T0,     N,     2  //n/4
+    beq        J,      T0,    .L04
+
+.L01: /* j<n/4 */
+    vxor.v    U0,     U0,   U0
+    vxor.v    U7,     U7,   U7
+    add.d    AO2,    AO1,  LDA
+    add.d    AO3,    AO2,  LDA
+    add.d    AO4,    AO3,  LDA
+
+    move      IX,     $r0
+    move      I,      $r0
+    move      II,     $r0
+
+    beq       $r0,    M,    .L03
+
+.L02:  /* i<m */
+    fldx.d      $f12,  X,    IX     /* x[i] -> low lane of U1 ($f12); scalar load so nothing past x[i] is read */
+    fldx.d      $f2,   AO1,   II
+    fldx.d      $f3,   AO2,   II
+    fldx.d      $f4,   AO3,   II
+    fldx.d      $f5,   AO4,   II
+/* Broadcast x[i] to both lanes; pack the four a values into U2 and U4. */
+    vshuf4i.d   U1,    U1,    0x00
+    vextrins.d  U2,    U3,    0x10
+    vextrins.d  U4,    U5,    0x10
+
+    vfmadd.d    U0,    U2,    U1,    U0  //temp1,2
+    vfmadd.d    U7,    U4,    U1,    U7  //temp3,4
+
+    add.d       IX,    IX,    INCX
+
+    addi.d     II,     II,    8
+    addi.d     I,      I,     1
+    blt        I,      M,    .L02
+
+.L03: /* y[j..j+3] += alpha * temp1..4 */
+    move        T1,    IY
+    add.d       T2,    T1,    INCY
+    add.d       T3,    T2,    INCY
+    add.d       T4,    T3,    INCY
+
+    fldx.d      $f3,    Y,    T1
+    fldx.d      $f4,    Y,    T2
+    fldx.d      $f5,    Y,    T3
+    fldx.d      $f6,    Y,    T4
+/* Pack the four y values pairwise into U3 and U5. */
+    vextrins.d  U3,    U4,    0x10
+    vextrins.d  U5,    U6,    0x10
+
+    vfmadd.d    U3,    VALPHA,    U0,    U3
+    vfmadd.d    U5,    VALPHA,    U7,    U5
+/* Move the high lanes back to $f4 and $f6 for the scalar stores. */
+    vextrins.d  U4,    U3,    0x01
+    vextrins.d  U6,    U5,    0x01
+
+    fstx.d     $f3,     Y,    T1
+    fstx.d     $f4,     Y,    T2
+    fstx.d     $f5,     Y,    T3
+    fstx.d     $f6,     Y,    T4
+/* Advance four columns of a and four elements of y. */
+    slli.d     T1,     LDA,     2
+    add.d      AO1,    AO1,     T1
+    add.d      IY,     T4,      INCY
+
+    addi.d     J,      J,     1
+    blt        J,      T0,    .L01
+
+.L04: /* if(n&2) */
+    andi       T0,     N,     2
+    beq        $r0,    T0,    .L07
+
+    vxor.v    U0,     U0,   U0
+
+    add.d    AO2,    AO1,  LDA
+
+    move      IX,     $r0
+    move      I,      $r0
+    move      II,     $r0
+
+    beq       $r0,    M,    .L06
+
+.L05:  /* i<m */
+    fldx.d      $f12,  X,    IX     /* x[i] -> low lane of U1 ($f12); scalar load so nothing past x[i] is read */
+    fldx.d      $f2,   AO1,   II
+    fldx.d      $f3,   AO2,   II
+/* Broadcast x[i] to both lanes; pack the two a values into U2. */
+    vshuf4i.d   U1,    U1,    0x00
+    vextrins.d  U2,    U3,    0x10
+
+    vfmadd.d    U0,    U2,    U1,    U0  //temp1,2
+
+    add.d       IX,    IX,    INCX
+
+    addi.d     II,     II,    8
+    addi.d     I,      I,     1
+    blt        I,      M,    .L05
+
+.L06: /* y[j..j+1] += alpha * temp1..2 */
+    move        T1,    IY
+    add.d       T2,    T1,    INCY
+
+    fldx.d      a1,    Y,    T1
+    fldx.d      a2,    Y,    T2
+
+    vextrins.d  U3,    U4,    0x10
+
+    vfmadd.d    U3,    VALPHA,    U0,    U3
+
+    vextrins.d  U4,    U3,    0x01
+
+    fstx.d     a1,     Y,    T1
+    fstx.d     a2,     Y,    T2
+/* Advance two columns of a and two elements of y. */
+    slli.d     T0,     LDA,     1
+    add.d      AO1,    AO1,     T0
+    add.d      IY,     T2,      INCY
+
+.L07: /* if(n&1) */
+    andi       T0,     N,     1
+    beq        $r0,    T0,    .L999
+
+    MTC       a1,     $r0
+
+    move      IX,     $r0
+    move      I,      $r0
+    move      II,      $r0
+
+    beq       $r0,    M,    .L09
+
+.L08:  /* i<m */
+    fldx.d      a3,    X,     IX
+    fldx.d      a4,    AO1,   II
+
+    fmadd.d     a1,    a4,    a3,    a1  //temp1
+
+    add.d       IX,    IX,    INCX
+
+    addi.d     II,     II,    8
+    addi.d     I,      I,     1
+    blt        I,      M,    .L08
+
+.L09: /* y[j] += alpha * temp1 */
+    fldx.d      a3,    Y,    IY
+
+    fmadd.d    a3,    ALPHA,    a1,    a3
+
+    fstx.d     a3,     Y,    IY
+
+    add.d      AO1,    AO1,     LDA
+    add.d      IY,     IY,      INCY
+
+.L999: /* restore the saved registers and alpha, release the frame, return */
+    LDARG  $r23,  $sp,    0
+    LDARG  $r24,  $sp,    8
+    LDARG  $r25,  $sp,    16
+    LDARG  $r26,  $sp,    32
+    LDARG  $r27,  $sp,    40
+    LDARG  $r28,  $sp,    48
+    LDARG  $r29,  $sp,    56
+    LDARG  $r30,  $sp,    64
+    LD     ALPHA, $sp,    72
+    addi.d  $sp, $sp, 80
+    jirl    $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/dnrm2.S b/kernel/loongarch64/dnrm2.S
index ff937ae53..2160b93a6 100644
--- a/kernel/loongarch64/dnrm2.S
+++ b/kernel/loongarch64/dnrm2.S
@@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    MTC  s1, $r0
    bge $r0,    N, .L999
    slli.d INCX, INCX, BASE_SHIFT
-   bge $r0,    INCX, .L999
+   beq $r0,    INCX, .L999
    move    XX, X
    NOP
    LD a1,  X,   0 * SIZE
diff --git a/kernel/loongarch64/dnrm2_lasx.S b/kernel/loongarch64/dnrm2_lasx.S
new file mode 100644
index 000000000..5a6f7cf1e
--- /dev/null
+++ b/kernel/loongarch64/dnrm2_lasx.S
@@ -0,0 +1,257 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N      $r4      /* element count */
+#define X      $r5      /* cursor for the first pass (largest magnitude) */
+#define INCX   $r6      /* stride; shifted left by BASE_SHIFT before the loops */
+#define XX     $r19     /* cursor for the second pass (scaled sum of squares) */
+#define I      $r17     /* loop counter */
+#define TEMP   $r18
+#define t1     $r12
+#define t2     $r13
+#define t3     $r14
+#define t4     $r15
+
+/* Don't change following FR unless you know the effects. */
+#define VX0    $xr15    /* the code also uses $f15 directly */
+#define VX1    $xr16    /* the code also uses $f16 directly */
+#define VM0    $xr17    /* running maximum; same register number as max ($f17) */
+#define VM1    $xr18
+#define VM2    $xr13    /* the code also uses $f13 directly */
+#define VM3    $xr14    /* the code also uses $f14 directly */
+#define res1   $xr19    /* sum-of-squares accumulator; the code also uses $f19 directly */
+#define res2   $xr20    /* second sum-of-squares accumulator */
+#define VALPHA $xr21    /* ALPHA replicated to all lanes */
+#define INF    $f23     /* built from the bit pattern 0x7FF << 52 */
+#define a1     $f22     /* holds 0.0 for the comparison against max */
+#define max    $f17
+#define ALPHA  $f12     /* 1.0 / max */
+
+    PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT   N,     0(N)
+    LDINT   INCX,  0(INCX)
+#endif
+/* Clear both accumulators and the running maximum; n <= 0 or incx == 0 goes straight to .L999. */
+    xvxor.v   res1, res1, res1
+    xvxor.v   res2, res2, res2
+    xvxor.v VM0, VM0, VM0
+    bge $r0, N, .L999
+    beq $r0, INCX, .L999
+    move  XX, X
+    // Init INF
+    addi.d  TEMP, $r0,  0x7FF
+    slli.d  TEMP, TEMP, 52
+    MTC  INF, TEMP
+    li.d  TEMP, SIZE
+    slli.d INCX, INCX, BASE_SHIFT
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bge $r0,  I, .L97
+    .align 3
+/* Pass 1, unit stride: largest magnitude over eight elements per iteration. */
+.L10:
+    xvld VX0, X, 0
+    xvld VX1, X, 4 * SIZE
+    xvfmaxa.d VM1, VX1, VX0
+    xvfmaxa.d VM0, VM0, VM1
+    addi.d I, I, -1
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L10
+    b .L96
+    .align 3
+
+.L20: // INCX!=1
+    bge $r0, I, .L97
+    .align 3
+/* Pass 1, strided: gather eight elements through integer registers. */
+.L21:
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t2, 1
+    ld.d t3, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t3, 2
+    ld.d t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t2, 1
+    ld.d t3, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t3, 2
+    ld.d t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t4, 3
+    xvfmaxa.d VM1, VX0, VX1
+    xvfmaxa.d VM0, VM0, VM1
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    b .L96
+    .align 3
+/* Fold lanes 1..3 of VM0 into lane 0, which is max ($f17). */
+.L96:
+    xvpickve.d VX0, VM0, 1
+    xvpickve.d VX1, VM0, 2
+    xvpickve.d VM3, VM0, 3
+    fmaxa.d $f17, $f17, $f14
+    fmaxa.d $f17, $f17, $f15
+    fmaxa.d $f17, $f17, $f16
+    .align 3
+
+.L97:
+    andi I, N, 7
+    bge $r0, I, .L99
+    .align 3
+/* Pass 1 tail: one element per iteration, scalar load so nothing past the element is read. */
+.L98:
+    fld.d $f15, X, 0
+    fmaxa.d max, max, $f15
+    addi.d I, I, -1
+    add.d X, X, INCX
+    blt $r0, I, .L98
+    .align 3
+/* ALPHA = 1.0 / |max|; leave via .L999 when max is zero or the reciprocal equals INF. */
+.L99:
+    fabs.d  max, max
+    lu12i.w  TEMP, 0x3f800 // 1
+    movgr2fr.d  a1, $r0
+    movgr2fr.w  ALPHA, TEMP
+    CMPEQ  $fcc0, max, a1
+    fcvt.d.s  ALPHA, ALPHA
+    bcnez  $fcc0, .L999
+    fdiv.d  ALPHA, ALPHA, max
+    CMPEQ  $fcc0, INF, ALPHA
+    bcnez  $fcc0, .L999
+    movfr2gr.d TEMP, ALPHA
+    xvreplgr2vr.d VALPHA, TEMP
+
+.L100:
+    li.d TEMP, SIZE
+    bne INCX, TEMP, .L120
+    srai.d  I, N, 3
+    bge $r0, I, .L997
+    .align 3
+/* Pass 2, unit stride: accumulate (x * ALPHA)^2, eight elements per iteration. */
+.L110:
+    xvld VX0, XX, 0
+    xvld VX1, XX, 4 * SIZE
+    xvfmul.d VM2, VX0, VALPHA
+    xvfmul.d VM3, VX1, VALPHA
+    xvfmadd.d res1, VM2, VM2, res1
+    xvfmadd.d res2, VM3, VM3, res2
+    addi.d XX, XX, 8 * SIZE
+    addi.d I, I, -1
+    blt $r0, I, .L110
+    b .L996
+    .align 3
+/* Pass 2, strided: same accumulation with gathered inputs. */
+.L120:
+    srai.d I, N, 3
+    bge $r0, I, .L997
+
+.L121:
+    ld.d t1, XX,  0
+    add.d XX, XX, INCX
+    ld.d t2, XX, 0
+    add.d XX, XX, INCX
+    ld.d t3, XX, 0
+    add.d XX, XX, INCX
+    ld.d t4, XX, 0
+    add.d XX, XX, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, XX, 0
+    add.d XX, XX, INCX
+    ld.d t2, XX, 0
+    add.d XX, XX, INCX
+    ld.d t3, XX, 0
+    add.d XX, XX, INCX
+    ld.d t4, XX, 0
+    add.d XX, XX, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvfmul.d VM2, VX0, VALPHA
+    xvfmul.d VM3, VX1, VALPHA
+    xvfmadd.d res1, VM2, VM2, res1
+    xvfmadd.d res2, VM3, VM3, res2
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    b .L996
+    .align 3
+/* Add res2 into res1, then fold lanes 1..3 into lane 0 ($f19). */
+.L996:
+    xvfadd.d res1, res1, res2
+    xvpickve.d VX0, res1, 1
+    xvpickve.d VX1, res1, 2
+    xvpickve.d VM2, res1, 3
+    fadd.d $f19, $f19, $f15
+    fadd.d $f19, $f19, $f16
+    fadd.d $f19, $f19, $f13
+    .align 3
+
+.L997:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+/* Pass 2 tail: one element per iteration, scalar. */
+.L998:
+    fld.d $f15, XX, 0
+    addi.d I, I, -1
+    fmul.d $f15, $f15, ALPHA
+    fmadd.d $f19, $f15, $f15, $f19
+    add.d XX, XX , INCX
+    blt $r0, I, .L998
+/* Result in $f0 = max * sqrt(accumulated sum). */
+.L999:
+    fsqrt.d $f19, $f19
+    fmul.d $f0, max, $f19
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/dnrm2_lsx.S b/kernel/loongarch64/dnrm2_lsx.S
new file mode 100644
index 000000000..fce4260e2
--- /dev/null
+++ b/kernel/loongarch64/dnrm2_lsx.S
@@ -0,0 +1,284 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define XX     $r19
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r12
+#define t2     $r13
+#define t3     $r14
+#define t4     $r15
+
+/* Don't change following FR unless you know the effects. */
+#define VX0    $vr15
+#define VX1    $vr16
+#define VM0    $vr17
+#define VM1    $vr18
+#define VM2    $vr13
+#define VM3    $vr14
+#define res1   $vr19
+#define res2   $vr20
+#define VALPHA $vr21
+#define INF    $f23
+#define a1     $f22
+#define max    $f17
+#define ALPHA  $f12
+
+/*
+ * dnrm2, LSX variant: Euclidean norm of N doubles read from X.
+ *
+ * INCX is shifted left by BASE_SHIFT below and then added to the pointer
+ * directly.  The result is returned in $f0.
+ *
+ * Pass 1 (.L10/.L21/.L98) finds the element of largest magnitude (max).
+ * Pass 2 (.L110/.L121/.L998) restarts from XX, the saved copy of X, and
+ * accumulates sum((x[i] * ALPHA)^2) with ALPHA = 1 / max.
+ * .L999 returns max * sqrt(sum).  N <= 0, INCX == 0, max == 0 and an
+ * infinite ALPHA all branch to .L999 while the sum is still zero.
+ */
+
+    PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT   N,     0(N)
+    LDINT   INCX,  0(INCX)
+#endif
+
+    vxor.v   res1, res1, res1
+    vxor.v   res2, res2, res2
+    vxor.v VM0, VM0, VM0 // running maximum; max ($f17) is its low lane
+    bge $r0, N, .L999
+    beq $r0, INCX, .L999
+    move  XX, X // keep the start of X for pass 2
+    // Init INF
+    addi.d  TEMP, $r0,  0x7FF
+    slli.d  TEMP, TEMP, 52
+    MTC  INF, TEMP
+    li.d  TEMP, SIZE
+    slli.d INCX, INCX, BASE_SHIFT
+    srai.d I, N, 3 // I = number of 8-element groups
+    bne INCX, TEMP, .L20
+
+    bge $r0,  I, .L97
+    .align 3
+
+.L10: // pass 1, contiguous X: 8 doubles per iteration
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vfmaxa.d VM1, VX1, VX0
+    vld VX0, X, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vfmaxa.d VM2, VX1, VX0
+    vfmaxa.d VM3, VM1, VM2
+    vfmaxa.d VM0, VM0, VM3
+    addi.d I, I, -1
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L10
+    b .L96
+    .align 3
+
+.L20: // INCX!=1
+    bge $r0, I, .L97
+    .align 3
+
+.L21: // pass 1, strided X: gather 8 doubles through t1-t4
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t4, 1
+    vfmaxa.d VM1, VX0, VX1
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t4, 1
+    vfmaxa.d VM2, VX0, VX1
+    vfmaxa.d VM3, VM1, VM2
+    vfmaxa.d VM0, VM0, VM3
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    b .L96
+    .align 3
+
+.L96: // fold the two lanes of VM0 into one value
+    vreplvei.d VX0, VM0, 0
+    vreplvei.d VX1, VM0, 1
+    vfmaxa.d VM0, VX0, VX1
+    .align 3
+
+.L97:
+    andi I, N, 7
+    bge $r0, I, .L99
+    .align 3
+
+.L98:
+    // Tail of pass 1: load exactly one double per iteration. A 128-bit vld
+    // here would also fetch the 8 bytes after the element, i.e. read past
+    // the end of X on the last one. $f16 is the low lane of VX1 ($vr16).
+    fld.d $f16, X, 0
+    fmaxa.d max, max, $f16
+    addi.d I, I, -1
+    add.d X, X, INCX
+    blt $r0, I, .L98
+    .align 3
+
+.L99:
+    fabs.d  max, max
+    lu12i.w  TEMP, 0x3f800 // 1
+    movgr2fr.d  a1, $r0
+    movgr2fr.w  ALPHA, TEMP
+    CMPEQ  $fcc0, max, a1
+    fcvt.d.s  ALPHA, ALPHA
+    bcnez  $fcc0, .L999 // max == 0
+    fdiv.d  ALPHA, ALPHA, max
+    CMPEQ  $fcc0, INF, ALPHA
+    bcnez  $fcc0, .L999 // 1 / max is infinite
+    movfr2gr.d TEMP, ALPHA
+    vreplgr2vr.d VALPHA, TEMP
+
+.L100:
+    li.d TEMP, SIZE
+    bne INCX, TEMP, .L120
+    srai.d  I, N, 3
+    bge $r0, I, .L997
+    .align 3
+
+.L110: // pass 2, contiguous X
+    vld VX0, XX, 0 * SIZE
+    vld VX1, XX, 2 * SIZE
+    vfmul.d VM2, VX0, VALPHA
+    vfmul.d VM3, VX1, VALPHA
+    vfmadd.d res1, VM2, VM2, res1
+    vfmadd.d res2, VM3, VM3, res2
+    vld VX0, XX, 4 * SIZE
+    vld VX1, XX, 6 * SIZE
+    vfmul.d VM2, VX0, VALPHA
+    vfmul.d VM3, VX1, VALPHA
+    vfmadd.d res1, VM2, VM2, res1
+    vfmadd.d res2, VM3, VM3, res2
+    addi.d XX, XX, 8 * SIZE
+    addi.d I, I, -1
+    blt $r0, I, .L110
+    b .L996
+    .align 3
+
+.L120:
+    srai.d I, N, 3
+    bge $r0, I, .L997
+    .align 3
+
+.L121: // pass 2, strided X
+    ld.d t1, XX,  0 * SIZE
+    add.d XX, XX, INCX
+    ld.d t2, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    ld.d t3, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    ld.d t4, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vfmul.d VM2, VX0, VALPHA
+    ld.d t1, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    vfmul.d VM3, VX1, VALPHA
+    ld.d t2, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    vfmadd.d res1, VM2, VM2, res1
+    vfmadd.d res2, VM3, VM3, res2
+    ld.d t3, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    ld.d t4, XX, 0 * SIZE
+    add.d XX, XX, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vfmul.d VM2, VX0, VALPHA
+    vfmul.d VM3, VX1, VALPHA
+    vfmadd.d res1, VM2, VM2, res1
+    vfmadd.d res2, VM3, VM3, res2
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    b .L996
+    .align 3
+
+.L996: // horizontal add: lane 0 of res1 ($f19) = sum of all four partials
+    vfadd.d res1, res1, res2
+    vreplvei.d VX1, res1, 1
+    vfadd.d res1, VX1, res1
+    .align 3
+
+.L997:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L998: // tail of pass 2, one double per iteration
+    fld.d $f15, XX, 0 * SIZE
+    addi.d I, I, -1
+    fmul.d $f15, $f15, ALPHA
+    fmadd.d $f19, $f15, $f15, $f19
+    add.d XX, XX , INCX
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+    fsqrt.d $f19, $f19
+    fmul.d $f0, max, $f19
+    jirl $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/dot_lasx.S b/kernel/loongarch64/dot_lasx.S
new file mode 100644
index 000000000..0715b6311
--- /dev/null
+++ b/kernel/loongarch64/dot_lasx.S
@@ -0,0 +1,370 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define Y      $r7
+#define INCY   $r8
+
+#define I      $r17
+#define TEMP   $r18
+
+/* Don't change following FR unless you know the effects. */
+#define s1     $f8
+#define s2     $f9
+#define a1     $f10
+#define b1     $f11
+
+/*
+ * dot, LASX variant: returns sum(x[i] * y[i]) over N elements in $f0.
+ *
+ * The DOUBLE and DSDOT macros select the element type.  With DSDOT the
+ * operands are loaded as single precision and converted to double before
+ * they are multiplied and summed.
+ *
+ * Unit stride on both vectors takes the vector path (.L11 - .L16), which
+ * accumulates in $xr8/$xr9; their low elements are s1/s2.  Every other
+ * stride combination takes the scalar path (.L20 - .L26).
+ */
+
+PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT     N,      0(N)
+    LDINT     INCX,   0(INCX)
+    LDINT     INCY,   0(INCY)
+#endif
+
+    /* Zero both accumulators bitwise.  s1/s2 are the low elements of
+     * $xr8/$xr9, so this clears the scalar sums as well.  Subtracting a
+     * register from itself is not a safe zero: it gives NaN for NaN/Inf. */
+    xvxor.v   $xr8,   $xr8,    $xr8
+    xvxor.v   $xr9,   $xr9,    $xr9
+    slli.d    INCX,   INCX,    BASE_SHIFT
+    li.d      TEMP,   SIZE
+    slli.d    INCY,   INCY,    BASE_SHIFT
+    bge       $r0,    N,       .L999
+    bne       INCX,   TEMP,    .L20   /* inc_x != 1 */
+    bne       INCY,   TEMP,    .L20   /* inc_y != 1 */
+
+    /* (inc_x == 1) && (inc_y == 1) */
+
+#ifdef DOUBLE
+    srai.d    I,      N,       4
+#else
+    srai.d    I,      N,       5
+#endif
+    bge       $r0,    I,       .L12   /* FLOAT: <32 ; DOUBLE: <16 */
+    .align  3
+.L11:
+    /* FLOAT: 32~ ; DOUBLE: 16~ */
+    xvld      $xr0,   X,       0
+    xvld      $xr1,   X,       32
+    xvld      $xr2,   X,       64
+    xvld      $xr3,   X,       96
+    xvld      $xr4,   Y,       0
+    xvld      $xr5,   Y,       32
+    xvld      $xr6,   Y,       64
+    xvld      $xr7,   Y,       96
+    addi.w    I,      I,       -1
+    addi.d    X,      X,       128
+    addi.d    Y,      Y,       128
+#ifdef DSDOT
+    /* $xr10/$xr11: x/y from xvfcvtl, $xr12/$xr13: x/y from xvfcvth */
+    xvfcvtl.d.s       $xr10,   $xr0
+    xvfcvtl.d.s       $xr11,   $xr4
+    xvfcvth.d.s       $xr12,   $xr0
+    xvfcvth.d.s       $xr13,   $xr4
+    xvfmadd.d $xr8,   $xr10,   $xr11,  $xr8
+    xvfmadd.d $xr9,   $xr12,   $xr13,  $xr9
+    xvfcvtl.d.s       $xr10,   $xr1
+    xvfcvtl.d.s       $xr11,   $xr5
+    xvfcvth.d.s       $xr12,   $xr1
+    xvfcvth.d.s       $xr13,   $xr5
+    xvfmadd.d $xr8,   $xr10,   $xr11,  $xr8
+    xvfmadd.d $xr9,   $xr12,   $xr13,  $xr9
+    xvfcvtl.d.s       $xr10,   $xr2
+    xvfcvtl.d.s       $xr11,   $xr6
+    xvfcvth.d.s       $xr12,   $xr2
+    xvfcvth.d.s       $xr13,   $xr6
+    xvfmadd.d $xr8,   $xr10,   $xr11,  $xr8
+    xvfmadd.d $xr9,   $xr12,   $xr13,  $xr9
+    xvfcvtl.d.s       $xr10,   $xr3
+    xvfcvtl.d.s       $xr11,   $xr7
+    xvfcvth.d.s       $xr12,   $xr3
+    xvfcvth.d.s       $xr13,   $xr7
+    xvfmadd.d $xr8,   $xr10,   $xr11,  $xr8
+    xvfmadd.d $xr9,   $xr12,   $xr13,  $xr9
+#else
+    XVFMADD   $xr8,   $xr0,    $xr4,   $xr8
+    XVFMADD   $xr9,   $xr1,    $xr5,   $xr9
+    XVFMADD   $xr8,   $xr2,    $xr6,   $xr8
+    XVFMADD   $xr9,   $xr3,    $xr7,   $xr9
+#endif
+    bnez      I,      .L11
+    .align  3
+.L12:
+#ifdef DOUBLE
+    andi      I,      N,       0xf
+    srai.d    I,      I,       2
+#else
+    andi      I,      N,       0x1f
+    srai.d    I,      I,       3
+#endif
+    bge       $r0,    I,       .L14   /* DOUBLE: <4 ; FLOAT: <8 */
+    .align  3
+.L13:
+    /* FLOAT: 8~31 ; DOUBLE: 4~15 */
+    xvld      $xr0,   X,       0
+    xvld      $xr4,   Y,       0
+    addi.w    I,      I,       -1
+    addi.d    X,      X,       32
+    addi.d    Y,      Y,       32
+#ifdef DSDOT
+    xvfcvtl.d.s       $xr10,   $xr0
+    xvfcvtl.d.s       $xr11,   $xr4
+    xvfcvth.d.s       $xr12,   $xr0
+    xvfcvth.d.s       $xr13,   $xr4
+    xvfmadd.d $xr8,   $xr10,   $xr11,  $xr8
+    xvfmadd.d $xr9,   $xr12,   $xr13,  $xr9
+#else
+    XVFMADD   $xr8,   $xr0,    $xr4,   $xr8
+#endif
+    bnez      I,      .L13
+    .align  3
+.L14:
+    /* store dot in s1 $f8 */
+#ifdef DSDOT
+    xvfadd.d  $xr8,   $xr8,    $xr9
+    movgr2fr.d s2,    $r0           /* set s2 to 0.0 */
+    xvpermi.q $xr0,   $xr8,    0x1
+    vfadd.d   $vr8,   $vr8,    $vr0
+    vpackod.d $vr0,   $vr8,    $vr8
+    vfadd.d   $vr8,   $vr8,    $vr0
+#else
+    XVFADD    $xr8,   $xr8,    $xr9
+    movgr2fr.d s2,    $r0           /* set s2 to 0.0 */
+    xvpermi.q $xr0,   $xr8,    0x1
+    VFADD     $vr8,   $vr8,    $vr0
+    vpackod.d $vr0,   $vr8,    $vr8
+#ifdef DOUBLE
+    VFADD     $vr8,   $vr8,    $vr0
+#else
+    VFADD     $vr8,   $vr8,    $vr0
+    vpackod.w $vr0,   $vr8,    $vr8
+    VFADD     $vr8,   $vr8,    $vr0
+#endif /* defined DOUBLE */
+#endif /* defined DSDOT */
+    .align  3
+.L15:
+#ifdef DOUBLE
+    andi      I,      N,       0x3
+#else
+    andi      I,      N,       0x7
+#endif
+    bge       $r0,    I,       .L999  /* =0 */
+   .align  3
+.L16:
+    /* FLOAT: 1~7 ; DOUBLE: 1~3 */
+    LD        a1,     X,       0
+    LD        b1,     Y,       0
+#ifdef DSDOT
+    fcvt.d.s  a1,     a1
+    fcvt.d.s  b1,     b1
+    fmadd.d   s1,     b1,      a1,     s1
+#else
+    MADD      s1,     b1,      a1,     s1
+#endif
+    addi.d    I,      I,       -1
+    addi.d    X,      X,       SIZE
+    addi.d    Y,      Y,       SIZE
+    bnez      I,      .L16
+    b         .L999
+    .align 3
+
+.L20:
+/* !((inc_x == 1) && (inc_y == 1)) */
+    srai.d    I,      N,       3
+#ifdef F_INTERFACE
+    /* NOTE(review): mult/mflo/dsub look like MIPS mnemonics; confirm this
+     * branch assembles for LoongArch before enabling F_INTERFACE. */
+    bgez    INCX, .L21
+    addi.d  TEMP, N, -1
+    mult    TEMP, INCX
+    mflo    TEMP
+    dsub    X, X, TEMP
+    .align 3
+
+.L21:
+    bgez    INCY, .L22
+    addi.d  TEMP, N, -1
+    mult    TEMP, INCY
+    mflo    TEMP
+    dsub    Y, Y, TEMP
+    .align 3
+
+.L22:
+#endif
+    bge $r0,    I, .L25  /* <8 */
+    .align 3
+
+.L23:
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s1, b1, a1, s1
+#else
+    MADD  s1, b1, a1, s1
+#endif
+
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s2, b1, a1, s2
+#else
+    MADD  s2, b1, a1, s2
+#endif
+
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s1, b1, a1, s1
+#else
+    MADD  s1, b1, a1, s1
+#endif
+
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s2, b1, a1, s2
+#else
+    MADD  s2, b1, a1, s2
+#endif
+
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s1, b1, a1, s1
+#else
+    MADD  s1, b1, a1, s1
+#endif
+
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s2, b1, a1, s2
+#else
+    MADD  s2, b1, a1, s2
+#endif
+
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s1, b1, a1, s1
+#else
+    MADD  s1, b1, a1, s1
+#endif
+
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+    addi.d  I, I, -1
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s2, b1, a1, s2
+#else
+    MADD  s2, b1, a1, s2
+#endif
+    blt $r0,    I, .L23
+    .align 3
+
+.L25:
+    andi    I,  N, 7
+    bge $r0,    I, .L999
+    .align  3
+
+.L26:
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+    addi.d  I, I, -1
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s1, b1, a1, s1
+#else
+    MADD  s1, b1, a1, s1
+#endif
+    blt $r0,    I, .L26
+    .align 3
+
+.L999:
+#ifdef DSDOT
+    fadd.d    $f0,    s1,      s2
+#else
+    ADD       $f0,    s1,      s2
+#endif
+    move      $r4,    $r17
+    jirl      $r0,    $r1,     0x0
+
+EPILOGUE
diff --git a/kernel/loongarch64/dot_lsx.S b/kernel/loongarch64/dot_lsx.S
new file mode 100644
index 000000000..f7f613553
--- /dev/null
+++ b/kernel/loongarch64/dot_lsx.S
@@ -0,0 +1,366 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define Y      $r7
+#define INCY   $r8
+
+#define I      $r17
+#define TEMP   $r18
+
+/* Don't change following FR unless you know the effects. */
+#define s1     $f8
+#define s2     $f9
+#define a1     $f10
+#define b1     $f11
+
+/*
+ * dot, LSX variant: returns sum(x[i] * y[i]) over N elements in $f0.
+ *
+ * The DOUBLE and DSDOT macros select the element type.  With DSDOT the
+ * operands are loaded as single precision and converted to double before
+ * they are multiplied and summed.
+ *
+ * Unit stride on both vectors takes the vector path (.L11 - .L16), which
+ * accumulates in $vr8/$vr9; their low elements are s1/s2.  Every other
+ * stride combination takes the scalar path (.L20 - .L26).
+ */
+
+PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT     N,      0(N)
+    LDINT     INCX,   0(INCX)
+    LDINT     INCY,   0(INCY)
+#endif
+
+    /* Zero both accumulators bitwise.  s1/s2 are the low elements of
+     * $vr8/$vr9, so this clears the scalar sums as well.  Subtracting a
+     * register from itself is not a safe zero: it gives NaN for NaN/Inf. */
+    vxor.v    $vr8,   $vr8,    $vr8
+    vxor.v    $vr9,   $vr9,    $vr9
+    slli.d    INCX,   INCX,    BASE_SHIFT
+    li.d      TEMP,   SIZE
+    slli.d    INCY,   INCY,    BASE_SHIFT
+    bge       $r0,    N,       .L999
+    bne       INCX,   TEMP,    .L20   /* inc_x != 1 */
+    bne       INCY,   TEMP,    .L20   /* inc_y != 1 */
+
+    /* (inc_x == 1) && (inc_y == 1) */
+
+#ifdef DOUBLE
+    srai.d    I,      N,       3
+#else
+    srai.d    I,      N,       4
+#endif
+    bge       $r0,    I,       .L12   /* FLOAT: <16 ; DOUBLE: <8 */
+    .align  3
+.L11:
+    /* FLOAT: 16~ ; DOUBLE: 8~ */
+    vld       $vr0,   X,       0
+    vld       $vr1,   X,       16
+    vld       $vr2,   X,       32
+    vld       $vr3,   X,       48
+    vld       $vr4,   Y,       0
+    vld       $vr5,   Y,       16
+    vld       $vr6,   Y,       32
+    vld       $vr7,   Y,       48
+    addi.w    I,      I,       -1
+    addi.d    X,      X,       64
+    addi.d    Y,      Y,       64
+#ifdef DSDOT
+    /* $vr10/$vr11: x/y from vfcvtl, $vr12/$vr13: x/y from vfcvth */
+    vfcvtl.d.s        $vr10,   $vr0
+    vfcvtl.d.s        $vr11,   $vr4
+    vfcvth.d.s        $vr12,   $vr0
+    vfcvth.d.s        $vr13,   $vr4
+    vfmadd.d $vr8,    $vr10,   $vr11,  $vr8
+    vfmadd.d $vr9,    $vr12,   $vr13,  $vr9
+    vfcvtl.d.s        $vr10,   $vr1
+    vfcvtl.d.s        $vr11,   $vr5
+    vfcvth.d.s        $vr12,   $vr1
+    vfcvth.d.s        $vr13,   $vr5
+    vfmadd.d $vr8,    $vr10,   $vr11,  $vr8
+    vfmadd.d $vr9,    $vr12,   $vr13,  $vr9
+    vfcvtl.d.s        $vr10,   $vr2
+    vfcvtl.d.s        $vr11,   $vr6
+    vfcvth.d.s        $vr12,   $vr2
+    vfcvth.d.s        $vr13,   $vr6
+    vfmadd.d $vr8,    $vr10,   $vr11,  $vr8
+    vfmadd.d $vr9,    $vr12,   $vr13,  $vr9
+    vfcvtl.d.s        $vr10,   $vr3
+    vfcvtl.d.s        $vr11,   $vr7
+    vfcvth.d.s        $vr12,   $vr3
+    vfcvth.d.s        $vr13,   $vr7
+    vfmadd.d $vr8,    $vr10,   $vr11,  $vr8
+    vfmadd.d $vr9,    $vr12,   $vr13,  $vr9
+#else
+    VFMADD    $vr8,   $vr0,    $vr4,   $vr8
+    VFMADD    $vr9,   $vr1,    $vr5,   $vr9
+    VFMADD    $vr8,   $vr2,    $vr6,   $vr8
+    VFMADD    $vr9,   $vr3,    $vr7,   $vr9
+#endif
+    bnez      I,      .L11
+    .align  3
+.L12:
+#ifdef DOUBLE
+    andi      I,      N,       0x7
+    srai.d    I,      I,       1
+#else
+    andi      I,      N,       0xf
+    srai.d    I,      I,       2
+#endif
+    bge       $r0,    I,       .L14   /* DOUBLE: <2 ; FLOAT: <4 */
+    .align  3
+.L13:
+    /* FLOAT: 4~15 ; DOUBLE: 2~7 */
+    vld       $vr0,   X,       0
+    vld       $vr4,   Y,       0
+    addi.w    I,      I,       -1
+    addi.d    X,      X,       16
+    addi.d    Y,      Y,       16
+#ifdef DSDOT
+    vfcvtl.d.s        $vr10,   $vr0
+    vfcvtl.d.s        $vr11,   $vr4
+    vfcvth.d.s        $vr12,   $vr0
+    vfcvth.d.s        $vr13,   $vr4
+    vfmadd.d  $vr8,   $vr10,   $vr11,  $vr8
+    vfmadd.d  $vr9,   $vr12,   $vr13,  $vr9
+#else
+    VFMADD    $vr8,   $vr0,    $vr4,   $vr8
+#endif
+    bnez      I,      .L13
+    .align  3
+.L14:
+    /* store dot in s1 $f8 */
+#ifdef DSDOT
+    vfadd.d   $vr8,   $vr8,    $vr9
+    movgr2fr.d s2,    $r0           /* set s2 to 0.0 */
+    vpackod.d $vr0,   $vr8,    $vr8
+    vfadd.d   $vr8,   $vr8,    $vr0
+#else
+    VFADD     $vr8,   $vr8,    $vr9
+    movgr2fr.d s2,    $r0           /* set s2 to 0.0 */
+    vpackod.d $vr0,   $vr8,    $vr8
+#ifdef DOUBLE
+    VFADD     $vr8,   $vr8,    $vr0
+#else
+    VFADD     $vr8,   $vr8,    $vr0
+    vpackod.w $vr0,   $vr8,    $vr8
+    VFADD     $vr8,   $vr8,    $vr0
+#endif /* defined DOUBLE */
+#endif /* defined DSDOT */
+    .align  3
+.L15:
+#ifdef DOUBLE
+    andi      I,      N,       0x1
+#else
+    andi      I,      N,       0x3
+#endif
+    bge       $r0,    I,       .L999  /* =0 */
+   .align  3
+.L16:
+    /* DOUBLE: 1 ; FLOAT: 1~3 */
+    LD        a1,     X,       0
+    LD        b1,     Y,       0
+#ifdef DSDOT
+    fcvt.d.s  a1,     a1
+    fcvt.d.s  b1,     b1
+    fmadd.d   s1,     b1,      a1,     s1
+#else
+    MADD      s1,     b1,      a1,     s1
+#endif
+    addi.d    I,      I,       -1
+    addi.d    X,      X,       SIZE
+    addi.d    Y,      Y,       SIZE
+    bnez      I,      .L16
+    b         .L999
+    .align 3
+
+.L20:
+/* !((inc_x == 1) && (inc_y == 1)) */
+    srai.d    I,      N,       3
+#ifdef F_INTERFACE
+    /* NOTE(review): mult/mflo/dsub look like MIPS mnemonics; confirm this
+     * branch assembles for LoongArch before enabling F_INTERFACE. */
+    bgez    INCX, .L21
+    addi.d  TEMP, N, -1
+    mult    TEMP, INCX
+    mflo    TEMP
+    dsub    X, X, TEMP
+    .align 3
+
+.L21:
+    bgez    INCY, .L22
+    addi.d  TEMP, N, -1
+    mult    TEMP, INCY
+    mflo    TEMP
+    dsub    Y, Y, TEMP
+    .align 3
+
+.L22:
+#endif
+    bge $r0,    I, .L25  /* <8 */
+    .align 3
+
+.L23:
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s1, b1, a1, s1
+#else
+    MADD  s1, b1, a1, s1
+#endif
+
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s2, b1, a1, s2
+#else
+    MADD  s2, b1, a1, s2
+#endif
+
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s1, b1, a1, s1
+#else
+    MADD  s1, b1, a1, s1
+#endif
+
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s2, b1, a1, s2
+#else
+    MADD  s2, b1, a1, s2
+#endif
+
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s1, b1, a1, s1
+#else
+    MADD  s1, b1, a1, s1
+#endif
+
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s2, b1, a1, s2
+#else
+    MADD  s2, b1, a1, s2
+#endif
+
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s1, b1, a1, s1
+#else
+    MADD  s1, b1, a1, s1
+#endif
+
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+    addi.d  I, I, -1
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s2, b1, a1, s2
+#else
+    MADD  s2, b1, a1, s2
+#endif
+    blt $r0,    I, .L23
+    .align 3
+
+.L25:
+    andi    I,  N, 7
+    bge $r0,    I, .L999
+    .align  3
+
+.L26:
+    LD a1,  X,   0 * SIZE
+    add.d   X, X, INCX
+    LD b1,  Y,   0 * SIZE
+    add.d   Y, Y, INCY
+    addi.d  I, I, -1
+#ifdef DSDOT
+    fcvt.d.s  a1, a1
+    fcvt.d.s  b1, b1
+    fmadd.d  s1, b1, a1, s1
+#else
+    MADD  s1, b1, a1, s1
+#endif
+    blt $r0,    I, .L26
+    .align 3
+
+.L999:
+#ifdef DSDOT
+    fadd.d    $f0,    s1,      s2
+#else
+    ADD       $f0,    s1,      s2
+#endif
+    move      $r4,    $r17
+    jirl      $r0,    $r1,     0x0
+
+EPILOGUE
diff --git a/kernel/loongarch64/dscal_lasx.S b/kernel/loongarch64/dscal_lasx.S
new file mode 100644
index 000000000..153662378
--- /dev/null
+++ b/kernel/loongarch64/dscal_lasx.S
@@ -0,0 +1,209 @@
+#define ASSEMBLER
+#include "common.h"
+
+#define N $r4
+#define ALPHA $f0
+#define X  $r7
+#define INCX $r8
+#define I $r12
+#define TEMP $r13
+#define t1 $r14
+#define t2 $r18
+#define t3 $r15
+#define t4 $r17
+#define XX $r16
+#define VX0 $xr12
+#define VX1 $xr13
+#define VT0 $xr14
+#define VT1 $xr15
+#define VALPHA $xr19
+#define a1 $f8
+#define a2 $f23
+
+/*
+ * dscal, LASX variant: x[i] = ALPHA * x[i] for N doubles, in place.
+ *
+ * N ($r4) is the element count, ALPHA arrives in $f0, X ($r7) is the
+ * vector.  INCX ($r8) is shifted left by BASE_SHIFT below and then added
+ * to the pointer directly.
+ *
+ *   N <= 0 or INCX <= 0  -> return, nothing is written
+ *   ALPHA == 0           -> store zeros without reading X (.L20 - .L27)
+ *   ALPHA == 1           -> return, X is left untouched
+ *   otherwise            -> .L11 (strided) or .L31 (unit stride), eight
+ *                           elements per iteration, then the tail .L33
+ */
+
+    PROLOGUE
+
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    ffint.d.l a1, a1    // a1 = 0.0
+    movgr2fr.d a2, TEMP
+    ffint.d.l a2, a2    // a2 = 1.0
+    slli.d TEMP, TEMP, BASE_SHIFT   // TEMP = INCX value of a unit stride
+    slli.d INCX, INCX, BASE_SHIFT
+    fcmp.ceq.d $fcc0, ALPHA, a1
+    bcnez $fcc0, .L20   //ALPHA==0
+    fcmp.ceq.d $fcc0, ALPHA, a2
+    bcnez $fcc0, .L999  //ALPHA==1 return
+    srai.d I, N, 3
+    beq INCX, TEMP, .L30    //ALPHA!=0|1 and INCX==1
+    movfr2gr.d TEMP, ALPHA
+    xvreplgr2vr.d VALPHA, TEMP
+    move XX, X  // XX is the store pointer, X the load pointer
+    .align 3
+
+.L10: //ALPHA!=0|1 and INCX!=1
+    bge $r0, I, .L32
+    .align 3
+.L11:
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    xvfmul.d VT0, VX0, VALPHA
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvfmul.d VT1, VX1, VALPHA
+    xvstelm.d VT1, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.d VT1, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VT1, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VT1, XX, 0, 3
+    add.d XX, XX, INCX
+    addi.d  I, I, -1
+    blt $r0, I, .L11
+    b .L32
+    .align 3
+
+.L20: //ALPHA==0
+    srai.d I, N, 3
+    beq INCX, TEMP, .L24
+    bge $r0, I, .L22
+    .align 3
+
+.L21:
+    fst.d a1, X, 0
+    add.d X, X, INCX
+    fst.d a1, X, 0
+    add.d X, X, INCX
+    fst.d a1, X, 0
+    add.d X, X, INCX
+    fst.d a1, X, 0
+    add.d X, X, INCX
+    fst.d a1, X, 0
+    add.d X, X, INCX
+    fst.d a1, X, 0
+    add.d X, X, INCX
+    fst.d a1, X, 0
+    add.d X, X, INCX
+    fst.d a1, X, 0
+    add.d X, X, INCX
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+.L23:
+    fst.d a1, X, 0 * SIZE
+    addi.d I, I, -1
+    add.d X, X, INCX
+    blt $r0, I, .L23
+    jirl $r0, $r1, 0
+    .align 3
+
+.L24:
+    bge $r0, I, .L26 /*N<8 INCX==1*/
+    // VX0 = 0 for the vector stores; set once, outside the loop
+    xvxor.v VX0, VX0, VX0
+    .align 3
+.L25:
+    xvst VX0, X, 0 * SIZE
+    xvst VX0, X, 4 * SIZE
+    addi.d I, I, -1
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L25
+    .align 3
+
+.L26:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+.L27:
+    fst.d a1, X, 0 * SIZE
+    addi.d I, I, -1
+    addi.d X, X, SIZE
+    blt $r0, I, .L27
+    jirl $r0, $r1, 0
+    .align 3
+
+.L30:
+    bge $r0, I, .L32/*N<8 INCX==1*/
+    movfr2gr.d TEMP, ALPHA
+    xvreplgr2vr.d VALPHA , TEMP
+    .align 3
+
+.L31:
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvfmul.d VT0, VX0, VALPHA
+    xvfmul.d VT1, VX1, VALPHA
+    addi.d I, I, -1
+    xvst VT0, X, 0 * SIZE
+    xvst VT1, X, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L31
+    .align 3
+
+.L32:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+.L33:   // scalar tail; a1 is reused as scratch (its 0.0 is no longer needed)
+    fld.d a1, X, 0 * SIZE
+    addi.d I, I, -1
+    fmul.d a1, ALPHA, a1
+    fst.d a1, X, 0 * SIZE
+    add.d X, X, INCX
+    blt $r0, I, .L33
+    jirl $r0, $r1, 0
+    .align 3
+
+.L999:
+    jirl    $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/dscal_lsx.S b/kernel/loongarch64/dscal_lsx.S
new file mode 100644
index 000000000..55f497752
--- /dev/null
+++ b/kernel/loongarch64/dscal_lsx.S
@@ -0,0 +1,205 @@
+#define ASSEMBLER
+#include "common.h"
+
+#define N $r4 // number of elements to scale
+#define ALPHA $f0 // scalar multiplier
+#define X  $r7 // pointer to the current element of the vector
+#define INCX $r8 // stride between elements; shifted by BASE_SHIFT in the prologue
+#define I $r12 // loop counter
+#define TEMP $r13 // scratch: unit stride for the INCX test, later the raw bits of ALPHA
+#define t1 $r14 // t1..t4: integer staging registers for the strided loads
+#define t2 $r18
+#define t3 $r15
+#define t4 $r17
+#define XX $r16 // store pointer of the strided loop (X is its load pointer)
+#define VX0 $vr12 // VX0/VX1: pairs of loaded input elements
+#define VX1 $vr13
+#define VT0 $vr14 // VT0/VT1: pairs of scaled results
+#define VT1 $vr15
+#define VALPHA $vr19 // ALPHA replicated into every 64-bit lane
+#define a1 $f8 // 0.0 for the ALPHA == 0 test; reused as scalar scratch in the tails
+#define a2 $f23 // 1.0 for the ALPHA == 1 test
+
+    PROLOGUE // scale in place: x[i] = ALPHA * x[i] for N doubles with stride INCX (LSX version)
+
+    bge $r0, N, .L999 // nothing to do when N <= 0
+    bge $r0, INCX, .L999 // a stride <= 0 returns without touching X
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    ffint.d.l a1, a1 // a1 = 0.0
+    movgr2fr.d a2, TEMP
+    ffint.d.l a2, a2 // a2 = 1.0
+    slli.d TEMP, TEMP, BASE_SHIFT // TEMP = 1 << BASE_SHIFT, the scaled stride of a unit-increment vector
+    slli.d INCX, INCX, BASE_SHIFT // INCX scaled the same way (presumably to bytes; BASE_SHIFT is defined outside this file)
+    fcmp.ceq.d $fcc0, ALPHA, a1
+    bcnez $fcc0, .L20   // ALPHA == 0: zero-fill path
+    fcmp.ceq.d $fcc0, ALPHA, a2
+    bcnez $fcc0, .L999  // ALPHA == 1: X already holds the result, return
+    srai.d I, N, 3 // I = number of full 8-element groups
+    beq INCX, TEMP, .L30    // ALPHA != 0, 1 and INCX == 1: contiguous path
+    movfr2gr.d TEMP, ALPHA
+    vreplgr2vr.d VALPHA, TEMP // broadcast the bits of ALPHA to every lane
+    move XX, X // XX follows X as the store pointer
+    .align 3
+
+.L10: // ALPHA != 0, 1 and INCX != 1: strided path
+    bge $r0, I, .L32 // fewer than 8 elements: scalar tail only
+    .align 3
+
+.L11: // 8 strided elements per iteration, handled as 4 pairs; loads run ahead of stores
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0 // gather elements 0 and 1 into one vector
+    vinsgr2vr.d VX0, t2, 1
+    vfmul.d VT0, VX0, VALPHA
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0 // elements 2 and 3
+    vinsgr2vr.d VX1, t4, 1
+    vstelm.d VT0, XX, 0, 0 // scatter scaled elements 0 and 1 back through XX
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vfmul.d VT1, VX1, VALPHA
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0 // elements 4 and 5
+    vinsgr2vr.d VX0, t2, 1
+    vstelm.d VT1, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT1, XX, 0, 1
+    add.d XX, XX, INCX
+    vfmul.d VT0, VX0, VALPHA
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0 // elements 6 and 7
+    vinsgr2vr.d VX1, t4, 1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vfmul.d VT1, VX1, VALPHA
+    vstelm.d VT1, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT1, XX, 0, 1
+    add.d XX, XX, INCX
+    addi.d  I, I, -1
+    blt $r0, I, .L11
+    b .L32 // X now points at the first remaining element
+    .align 3
+
+.L20: // ALPHA == 0: store 0.0 to every element; X is never read on this path
+    srai.d I, N, 3
+    beq INCX, TEMP, .L24 // contiguous zero fill
+    bge $r0, I, .L22 // fewer than 8 elements: tail only
+    .align 3
+
+.L21: // strided zero fill, 8 elements per iteration
+    fst.d a1, X, 0
+    add.d X, X, INCX
+    fst.d a1, X, 0
+    add.d X, X, INCX
+    fst.d a1, X, 0
+    add.d X, X, INCX
+    fst.d a1, X, 0
+    add.d X, X, INCX
+    fst.d a1, X, 0
+    add.d X, X, INCX
+    fst.d a1, X, 0
+    add.d X, X, INCX
+    fst.d a1, X, 0
+    add.d X, X, INCX
+    fst.d a1, X, 0
+    add.d X, X, INCX
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22: // strided zero fill, remaining N & 7 elements
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+.L23:
+    fst.d a1, X, 0 * SIZE
+    addi.d I, I, -1
+    add.d X, X, INCX
+    blt $r0, I, .L23
+    jirl $r0, $r1, 0 // return
+    .align 3
+
+.L24: // ALPHA == 0 and INCX == 1
+    bge $r0, I, .L26 /* fewer than 8 elements, INCX == 1 */
+    .align 3
+.L25: // contiguous zero fill, 8 elements (four 128-bit stores) per iteration
+    vxor.v VX0, VX0, VX0 // VX0 = 0 (recomputed on every iteration)
+    vst VX0, X, 0 * SIZE
+    vst VX0, X, 2 * SIZE
+    vst VX0, X, 4 * SIZE
+    vst VX0, X, 6 * SIZE
+    addi.d I, I, -1
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L25
+    .align 3
+
+.L26: // contiguous zero fill, remaining N & 7 elements
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+.L27:
+    fst.d a1, X, 0 * SIZE
+    addi.d I, I, -1
+    addi.d X, X, SIZE
+    blt $r0, I, .L27
+    jirl $r0, $r1, 0 // return
+    .align 3
+
+.L30: // ALPHA != 0, 1 and INCX == 1
+    bge $r0, I, .L32/* fewer than 8 elements, INCX == 1 */
+    movfr2gr.d TEMP, ALPHA
+    vreplgr2vr.d VALPHA , TEMP // broadcast the bits of ALPHA to every lane
+    .align 3
+
+.L31: // contiguous scale, 8 elements per iteration as four 128-bit vectors
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vfmul.d VT0, VX0, VALPHA
+    vfmul.d VT1, VX1, VALPHA
+    vld VX0, X, 4 * SIZE
+    vst VT0, X, 0 * SIZE
+    vst VT1, X, 2 * SIZE
+    vfmul.d VT0, VX0, VALPHA
+    vld VX1, X, 6 * SIZE
+    vst VT0, X, 4 * SIZE
+    vfmul.d VT1, VX1, VALPHA
+    vst VT1, X, 6 * SIZE
+    addi.d I, I, -1
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L31
+    .align 3
+
+.L32: // scalar tail shared by the strided and contiguous scaling paths
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+.L33:
+    fld.d a1, X, 0 * SIZE
+    addi.d I, I, -1
+    fmul.d a1, ALPHA, a1
+    fst.d a1, X, 0 * SIZE
+    add.d X, X, INCX // on the contiguous path INCX equals the unit stride, so this steps one element
+    blt $r0, I, .L33
+    jirl $r0, $r1, 0 // return
+    .align 3
+
+.L999: // common return
+    jirl    $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/dtrsm_kernel_LN_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_LN_16x4_lasx.S
new file mode 100644
index 000000000..3315daccb
--- /dev/null
+++ b/kernel/loongarch64/dtrsm_kernel_LN_16x4_lasx.S
@@ -0,0 +1,1366 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2023/07/26 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*
+*
+*********************************************************************/
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
+ *          FLOAT *c, BLASLONG ldc, BLASLONG offset)
+ */
+#define M      $r4   // param 1: bm
+#define N      $r5   // param 2: bn. NOTE(review): the dsolve_* macros also name their parameter N, so cpp rewrites it there too - confirm this is intended
+#define K      $r6   // param 3: bk
+#define A      $r7   // param 5: ba
+#define B      $r8   // param 6: bb
+#define C      $r9   // param 7: bc
+#define LDC    $r10  // param 8: ldc
+#define OFFSET $r11  // param 9: offset
+
+/* Loop control registers */
+#define I      $r13
+#define J      $r14
+#define L      $r15
+#define TL     $r16
+/* Matrix addresses: A0/B0 are the working A and B pointers, C0..C3 the four C column pointers */
+#define A0     $r17
+#define B0     $r18
+#define C0     $r19
+#define C1     $r20
+#define C2     $r23
+#define C3     $r24
+#define T0     $r25  // scratch address register used by the dsolve_* macros
+#define T1     $r26  // holds A0 as saved on entry to a dgemm_dsolve_* macro
+#define T2     $r27  // holds B0 as saved on entry to a dgemm_dsolve_* macro
+#define KK     $r28
+#define AA     $r29
+#define CC     $r30
+#undef  ZERO
+#define ZERO   $r0   // integer zero register
+
+#define U0     $xr0  // U0..U15: the tile of C being solved
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define U8     $xr8
+#define U9     $xr9
+#define U10    $xr10
+#define U11    $xr11
+#define U12    $xr12
+#define U13    $xr13
+#define U14    $xr14
+#define U15    $xr15
+#define D0     $xr16 // D0..D15: replicated coefficients of A and transpose outputs/scratch
+#define D1     $xr17
+#define D2     $xr18
+#define D3     $xr19
+#define D4     $xr20
+#define D5     $xr21
+#define D6     $xr22
+#define D7     $xr23
+#define D8     $xr24
+#define D9     $xr25
+#define D10    $xr26
+#define D11    $xr27
+#define D12    $xr28
+#define D13    $xr29
+#define D14    $xr30
+#define D15    $xr31
+
+/* Prefetch interval */
+#define A_PRE  0x400
+#define B_PRE  0x100
+
+#include "dtrsm_kernel_macro.S"
+
+// By integrating the dgemm and dsolve processes, the following advantages can be obtained:
+// 1. Avoid the overhead of function calls (by not invoking dgemm_kernel)
+// 2. Reduce the storage and retrieval of C data
+// 3. Vectorization of dsolve
+// DGEMM_UNROLL_M x DGEMM_UNROLL_N is 16x4, which is a fairly large size.
+// To achieve finer-grained optimization, 15 scenarios have been addressed:
+// 16x4, 16x2, 16x1, 8x4, 8x2, 8x1, 4x4, 4x2, 4x1, 2x4, 2x2, 2x1, 1x4, 1x2, 1x1.
+
+.macro dsolve_16 N
+// Solve a 16-row tile of C against the 16x16 block of A at A0. If N = 4 the data layout of C is:
+// U0  U1  U2  U3
+// U4  U5  U6  U7
+// U8  U9  U10 U11
+// U12 U13 U14 U15
+// if N = 2 the data layout of C is as follows:
+// U0 U1 U2 U3
+// U4 U5 U6 U7
+// if N = 1 the data layout of C is as follows:
+// U0 U1 U2 U3
+// The matrix A has dimensions of 16x16, and
+// it will be divided into 4 segments for processing (rows 12..15 first, rows 0..3 last).
+
+#define G12 U3 // G12..G15: rows 12..15 of the tile after the transpose below (assumes GTRANSPOSE4x4_D is a plain 4x4 transpose)
+#define G13 U7
+#define G14 U11
+#define G15 U15
+    GTRANSPOSE4x4_D U3, U7, U11, U15, G12, G13, G14, G15, D0, D1
+    // A (element indices into the 16x16 block, 16 elements per row)
+    // G12 G13 G14 G15
+    // -----------------
+    // 204             | D9
+    // 220 221         | D8 D7
+    // 236 237 238     | D6 D5 D4
+    // 252 253 254 255 | D3 D2 D1 D0
+    PTR_ADDI T0, A0, 252 * 8
+    GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
+    PTR_ADDI T0, A0, 236 * 8
+    GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8
+    PTR_ADDI T0, A0, 220 * 8
+    GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8
+    PTR_ADDI T0, A0, 204 * 8
+    GLDREPL xv, d, D9, T0, 0
+
+    xvfmul.d    G15,    G15,    D0 // scale by the diagonal entry A[255] (multiplied, so presumably stored pre-inverted - confirm)
+    GNMSUB xvf, d, G14, G15, D1, G14
+    xvfmul.d    G14,    G14,    D4
+    GNMSUB xvf, d, G13, G15, D2, G13, G13, G14, D5, G13
+    xvfmul.d    G13,    G13,    D7
+    GNMSUB xvf, d, G12, G15, D3, G12, G12, G14, D6, G12, G12, G13, D8, G12
+    xvfmul.d    G12,    G12,    D9
+    // Store B (for N = 2 / N = 1 only the low 128 / 64 bits of each G register are written, via its $vr / $f alias)
+.if \N == 4
+    // x x x x ... x x x x
+    // x x x x ... x x x x
+    // x x x x ... x x x x
+    // b48 b49 b50 b51 ... b60 b61 b62 b63
+    GST xv, , G12, B0, 48 * 8, G13, B0, 52 * 8, G14, B0, 56 * 8, G15, B0, 60 * 8
+.elseif \N == 2
+    // x x x x ... x x x x
+    // x x x x ... x x x x
+    // x x x x ... x x x x
+    // b24 b25 b26 b27 b28 b29 b30 b31
+    GST v, , $vr3, B0, 24 * 8, $vr7, B0, 26 * 8, $vr11, B0, 28 * 8, $vr15, B0, 30 * 8
+.elseif \N == 1
+    // x x x x
+    // x x x x
+    // x x x x
+    // b12 b13 b14 b15
+    GST f, d, $f3, B0, 12 * 8, $f7, B0, 13 * 8, $f11, B0, 14 * 8, $f15, B0, 15 * 8
+.endif
+    // Transpose G15 G14 G13 G12 into D0..D3; the G registers are read again by the later segments
+    GTRANSPOSE4x4_D G12, G13, G14, G15, D0, D1, D2, D3, D4, D5
+    // Store C
+.if \N == 4
+    // x x x x ... c12 c13 c14 c15
+    // x x x x ... c28 c29 c30 c31
+    // x x x x ... c44 c45 c46 c47
+    // x x x x ... c60 c61 c62 c63
+    GST xv, , D0, C0, 12 * 8, D1, C1, 12 * 8, D2, C2, 12 * 8, D3, C3, 12 * 8
+.elseif \N == 2
+    // x x x x ... c12 c13 c14 c15
+    // x x x x ... c28 c29 c30 c31
+    GST xv, , D0, C0, 12 * 8, D1, C1, 12 * 8
+.elseif \N == 1
+    // Single column of C:
+    // x x x x ... c12 c13 c14 c15
+    GST xv, , D0, C0, 12 * 8
+.endif
+
+#define G8  U2 // G8..G11: rows 8..11 of the tile after the transpose below
+#define G9  U6
+#define G10 U10
+#define G11 U14
+    GTRANSPOSE4x4_D U2, U6, U10, U14, G8, G9, G10, G11, D0, D1
+    // A
+    // G8  G9  G10  G11
+    // -----------------
+    // 136             | D9
+    // 152 153         | D8  D7
+    // 168 169 170     | D6  D5  D4
+    // 184 185 186 187 | D3  D2  D1  D0
+    // 200 201 202 203 | D15 D14 D13 D12
+    // 216 217 218 219 | D11 D10 D9  D8
+    // 232 233 234 235 | D7  D6  D5  D4
+    // 248 249 250 251 | D3  D2  D1  D0
+    PTR_ADDI  T0, A0, 248 * 8
+    GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
+    PTR_ADDI  T0, A0, 232 * 8
+    GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8
+    PTR_ADDI  T0, A0, 216 * 8
+    GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8
+    PTR_ADDI  T0, A0, 200 * 8
+    GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8
+    GNMSUB xvf, d, G11, G15, D0,  G11, G10, G15, D1,  G10, G9, G15, D2,  G9, G8, G15, D3,  G8, \
+                   G11, G14, D4,  G11, G10, G14, D5,  G10, G9, G14, D6,  G9, G8, G14, D7,  G8, \
+                   G11, G13, D8,  G11, G10, G13, D9,  G10, G9, G13, D10, G9, G8, G13, D11, G8, \
+                   G11, G12, D12, G11, G10, G12, D13, G10, G9, G12, D14, G9, G8, G12, D15, G8
+    PTR_ADDI T0, A0, 184 * 8
+    GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
+    PTR_ADDI T0, A0, 168 * 8
+    GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8
+    PTR_ADDI T0, A0, 152 * 8
+    GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8
+    PTR_ADDI T0, A0, 136 * 8
+    GLDREPL xv, d, D9, T0, 0
+
+    xvfmul.d    G11,    G11,    D0
+    GNMSUB xvf, d, G10, G11, D1, G10, G9, G11, D2, G9, G8, G11, D3, G8
+    xvfmul.d    G10,    G10,    D4
+    GNMSUB xvf, d, G9, G10, D5, G9, G8, G10, D6, G8
+    xvfmul.d    G9,     G9,     D7
+    GNMSUB xvf, d, G8, G9, D8, G8
+    xvfmul.d    G8,     G8,     D9
+    // Store B
+.if \N == 4
+    // x x x x ... x x x x
+    // x x x x ... x x x x
+    // b32 b33 b34 b35 ... b44 b45 b46 b47
+    // b48 b49 b50 b51 ... b60 b61 b62 b63
+    GST xv, , G8, B0, 32 * 8, G9, B0, 36 * 8, G10, B0, 40 * 8, G11, B0, 44 * 8
+.elseif \N == 2
+    // x x x x ... x x x x
+    // x x x x ... x x x x
+    // b16 b17 b18 b19 b20 b21 b22 b23
+    // b24 b25 b26 b27 b28 b29 b30 b31
+    GST v, , $vr2, B0, 16 * 8, $vr6, B0, 18 * 8, $vr10, B0, 20 * 8, $vr14, B0, 22 * 8
+.elseif \N == 1
+    // x x x x
+    // x x x x
+    // b8  b9  b10 b11
+    // b12 b13 b14 b15
+    GST f, d, $f2, B0, 8 * 8, $f6, B0, 9 * 8, $f10, B0, 10 * 8, $f14, B0, 11 * 8
+.endif
+    // Transpose G11 G10 G9 G8
+    GTRANSPOSE4x4_D G8, G9, G10, G11, D0, D1, D2, D3, D4, D5
+    // Store C
+.if \N == 4
+    // x x x x ... c8  c9  c10 c11 c12 c13 c14 c15
+    // x x x x ... c24 c25 c26 c27 c28 c29 c30 c31
+    // x x x x ... c40 c41 c42 c43 c44 c45 c46 c47
+    // x x x x ... c56 c57 c58 c59 c60 c61 c62 c63
+    GST xv, , D0, C0, 8 * 8, D1, C1, 8 * 8, D2, C2, 8 * 8, D3, C3, 8 * 8
+.elseif \N == 2
+    // x x x x ... c8  c9  c10 c11 c12 c13 c14 c15
+    // x x x x ... c24 c25 c26 c27 c28 c29 c30 c31
+    GST xv, , D0, C0, 8 * 8, D1, C1, 8 * 8
+.elseif \N == 1
+    // x x x x ... c8  c9  c10 c11 c12 c13 c14 c15
+    GST xv, , D0, C0, 8 * 8
+.endif
+
+#define G4 U1 // G4..G7: rows 4..7 of the tile after the transpose below
+#define G5 U5
+#define G6 U9
+#define G7 U13
+    GTRANSPOSE4x4_D U1, U5, U9, U13, G4, G5, G6, G7, D0, D1
+    // A
+    // G4   G5  G6   G7
+    // ------------------
+    // 68               | D9
+    // 84  85           | D8  D7
+    // 100 101 102      | D6  D5  D4
+    // 116 117 118 119  | D3  D2  D1  D0
+    // 132 133 134 135  | D15 D14 D13 D12
+    // 148 149 150 151  | D11 D10 D9  D8
+    // 164 165 166 167  | D7  D6  D5  D4
+    // 180 181 182 183  | D3  D2  D1  D0
+    // 196 197 198 199  | D15 D14 D13 D12
+    // 212 213 214 215  | D11 D10 D9  D8
+    // 228 229 230 231  | D7  D6  D5  D4
+    // 244 245 246 247  | D3  D2  D1  D0
+    PTR_ADDI  T0, A0, 244 * 8
+    GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
+    PTR_ADDI  T0, A0, 228 * 8
+    GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8
+    PTR_ADDI  T0, A0, 212 * 8
+    GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8
+    PTR_ADDI  T0, A0, 196 * 8
+    GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8
+    GNMSUB xvf, d, G7, G15, D0,  G7, G6, G15, D1,  G6, G5, G15, D2,  G5, G4, G15, D3,  G4, \
+                   G7, G14, D4,  G7, G6, G14, D5,  G6, G5, G14, D6,  G5, G4, G14, D7,  G4, \
+                   G7, G13, D8,  G7, G6, G13, D9,  G6, G5, G13, D10, G5, G4, G13, D11, G4, \
+                   G7, G12, D12, G7, G6, G12, D13, G6, G5, G12, D14, G5, G4, G12, D15, G4
+    PTR_ADDI  T0, A0, 180 * 8
+    GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
+    PTR_ADDI  T0, A0, 164 * 8
+    GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8
+    PTR_ADDI  T0, A0, 148 * 8
+    GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8
+    PTR_ADDI  T0, A0, 132 * 8
+    GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8
+    GNMSUB xvf, d, G7, G11, D0,  G7, G6, G11, D1,  G6, G5, G11, D2,  G5, G4, G11, D3,  G4, \
+                   G7, G10, D4,  G7, G6, G10, D5,  G6, G5, G10, D6,  G5, G4, G10, D7,  G4, \
+                   G7, G9,  D8,  G7, G6, G9,  D9,  G6, G5, G9,  D10, G5, G4, G9,  D11, G4, \
+                   G7, G8,  D12, G7, G6, G8,  D13, G6, G5, G8,  D14, G5, G4, G8,  D15, G4
+    PTR_ADDI T0, A0, 116 * 8
+    GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
+    PTR_ADDI T0, A0, 100 * 8
+    GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8
+    PTR_ADDI T0, A0, 84 * 8
+    GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8
+    PTR_ADDI T0, A0, 68 * 8
+    GLDREPL xv, d, D9, T0, 0
+    xvfmul.d    G7,     G7,     D0
+    GNMSUB xvf, d, G6, G7, D1, G6, G5, G7, D2, G5, G4, G7, D3, G4
+    xvfmul.d    G6,     G6,     D4
+    GNMSUB xvf, d, G5, G6, D5, G5, G4, G6, D6, G4
+    xvfmul.d    G5,     G5,     D7
+    GNMSUB xvf, d, G4, G5, D8, G4
+    xvfmul.d    G4,     G4,     D9
+    // Store B
+.if \N == 4
+    // x x x x ... x x x x
+    // b16 b17 b18 b19 ... b28 b29 b30 b31
+    // b32 b33 b34 b35 ... b44 b45 b46 b47
+    // b48 b49 b50 b51 ... b60 b61 b62 b63
+    GST xv, , G4, B0, 16 * 8, G5, B0, 20 * 8, G6, B0, 24 * 8, G7, B0, 28 * 8
+.elseif \N == 2
+    // x x x x ... x x x x
+    // b8  b9  b10 b11 b12 b13 b14 b15
+    // b16 b17 b18 b19 b20 b21 b22 b23
+    // b24 b25 b26 b27 b28 b29 b30 b31
+    GST v, , $vr1, B0, 8 * 8, $vr5, B0, 10 * 8, $vr9, B0, 12 * 8, $vr13, B0, 14 * 8
+.elseif \N == 1
+    // x x x x
+    // b4  b5  b6  b7
+    // b8  b9  b10 b11
+    // b12 b13 b14 b15
+    GST f, d, $f1, B0, 4 * 8, $f5, B0, 5 * 8, $f9, B0, 6 * 8, $f13, B0, 7 * 8
+.endif
+    // Transpose G7 G6 G5 G4
+    GTRANSPOSE4x4_D G4, G5, G6, G7, D0, D1, D2, D3, D4, D5
+    // Store C
+.if \N == 4
+    // x x x x c4  c5  c6  c7  c8  c9  c10 c11 c12 c13 c14 c15
+    // x x x x c20 c21 c22 c23 c24 c25 c26 c27 c28 c29 c30 c31
+    // x x x x c36 c37 c38 c39 c40 c41 c42 c43 c44 c45 c46 c47
+    // x x x x c52 c53 c54 c55 c56 c57 c58 c59 c60 c61 c62 c63
+    GST xv, , D0, C0, 4 * 8, D1, C1, 4 * 8, D2, C2, 4 * 8, D3, C3, 4 * 8
+.elseif \N == 2
+    // x x x x c4  c5  c6  c7  c8  c9  c10 c11 c12 c13 c14 c15
+    // x x x x c20 c21 c22 c23 c24 c25 c26 c27 c28 c29 c30 c31
+    GST xv, , D0, C0, 4 * 8, D1, C1, 4 * 8
+.elseif \N == 1
+    // x x x x c4  c5  c6  c7  c8  c9  c10 c11 c12 c13 c14 c15
+    GST xv, , D0, C0, 4 * 8
+.endif
+
+#define G0 U0 // G0..G3: rows 0..3 of the tile after the transpose below
+#define G1 U4
+#define G2 U8
+#define G3 U12
+    GTRANSPOSE4x4_D U0, U4, U8, U12, G0, G1, G2, G3, D0, D1
+    // A
+    // G0   G1  G2   G3
+    // ------------------
+    // 0                | D9
+    // 16  17           | D8  D7
+    // 32  33  34       | D6  D5  D4
+    // 48  49  50  51   | D3  D2  D1  D0
+    // 64  65  66  67   | D15 D14 D13 D12
+    // 80  81  82  83   | D11 D10 D9  D8
+    // 96  97  98  99   | D7  D6  D5  D4
+    // 112 113 114 115  | D3  D2  D1  D0
+    // 128 129 130 131  | D15 D14 D13 D12
+    // 144 145 146 147  | D11 D10 D9  D8
+    // 160 161 162 163  | D7  D6  D5  D4
+    // 176 177 178 179  | D3  D2  D1  D0
+    // 192 193 194 195  | D15 D14 D13 D12
+    // 208 209 210 211  | D11 D10 D9  D8
+    // 224 225 226 227  | D7  D6  D5  D4
+    // 240 241 242 243  | D3  D2  D1  D0
+    PTR_ADDI  T0, A0, 240 * 8
+    GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
+    PTR_ADDI  T0, A0, 224 * 8
+    GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8
+    PTR_ADDI  T0, A0, 208 * 8
+    GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8
+    PTR_ADDI  T0, A0, 192 * 8
+    GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8
+    GNMSUB xvf, d, G3, G15, D0,  G3, G2, G15, D1,  G2, G1, G15, D2,  G1, G0, G15, D3,  G0, \
+                   G3, G14, D4,  G3, G2, G14, D5,  G2, G1, G14, D6,  G1, G0, G14, D7,  G0, \
+                   G3, G13, D8,  G3, G2, G13, D9,  G2, G1, G13, D10, G1, G0, G13, D11, G0, \
+                   G3, G12, D12, G3, G2, G12, D13, G2, G1, G12, D14, G1, G0, G12, D15, G0
+    PTR_ADDI  T0, A0, 176 * 8
+    GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
+    PTR_ADDI  T0, A0, 160 * 8
+    GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8
+    PTR_ADDI  T0, A0, 144 * 8
+    GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8
+    PTR_ADDI  T0, A0, 128 * 8
+    GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8
+    GNMSUB xvf, d, G3, G11, D0,  G3, G2, G11, D1,  G2, G1, G11, D2,  G1, G0, G11, D3,  G0, \
+                   G3, G10, D4,  G3, G2, G10, D5,  G2, G1, G10, D6,  G1, G0, G10, D7,  G0, \
+                   G3, G9,  D8,  G3, G2, G9,  D9,  G2, G1, G9,  D10, G1, G0,  G9, D11, G0, \
+                   G3, G8,  D12, G3, G2, G8,  D13, G2, G1, G8,  D14, G1, G0,  G8, D15, G0
+    PTR_ADDI  T0, A0, 112 * 8
+    GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
+    PTR_ADDI  T0, A0, 96 * 8
+    GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8
+    PTR_ADDI  T0, A0, 80 * 8
+    GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8
+    PTR_ADDI  T0, A0, 64 * 8
+    GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8
+    GNMSUB xvf, d, G3, G7, D0,  G3, G2, G7, D1,  G2, G1, G7, D2,  G1, G0, G7, D3,  G0, \
+                   G3, G6, D4,  G3, G2, G6, D5,  G2, G1, G6, D6,  G1, G0, G6, D7,  G0, \
+                   G3, G5, D8,  G3, G2, G5, D9,  G2, G1, G5, D10, G1, G0, G5, D11, G0, \
+                   G3, G4, D12, G3, G2, G4, D13, G2, G1, G4, D14, G1, G0, G4, D15, G0
+    PTR_ADDI T0, A0, 48 * 8
+    GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
+    PTR_ADDI T0, A0, 32 * 8
+    GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8
+    PTR_ADDI T0, A0, 16 * 8
+    GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8
+    PTR_ADDI T0, A0, 0 * 8
+    GLDREPL xv, d, D9, T0, 0
+
+    xvfmul.d    G3,     G3,     D0
+    GNMSUB xvf, d, G2, G3, D1, G2, G1, G3, D2, G1, G0, G3, D3, G0
+    xvfmul.d    G2,     G2,     D4
+    GNMSUB xvf, d, G1, G2, D5, G1, G0, G2, D6, G0
+    xvfmul.d    G1,     G1,     D7
+    GNMSUB  xvf, d, G0, G1, D8, G0
+    xvfmul.d    G0,     G0,     D9
+    // Store B
+.if \N == 4
+    // b0  b1  b2  b3  ... b12 b13 b14 b15
+    // b16 b17 b18 b19 ... b28 b29 b30 b31
+    // b32 b33 b34 b35 ... b44 b45 b46 b47
+    // b48 b49 b50 b51 ... b60 b61 b62 b63
+    GST xv, , G0, B0, 0, G1, B0, 4 * 8, G2, B0, 8 * 8, G3, B0, 12 * 8
+.elseif \N == 2
+    // b0  b1  b2  b3  b4  b5  b6  b7
+    // b8  b9  b10 b11 b12 b13 b14 b15
+    // b16 b17 b18 b19 b20 b21 b22 b23
+    // b24 b25 b26 b27 b28 b29 b30 b31
+    GST v, , $vr0, B0, 0, $vr4, B0, 2 * 8, $vr8, B0, 4 * 8, $vr12, B0, 6 * 8
+.elseif \N == 1
+    // b0  b1  b2  b3
+    // b4  b5  b6  b7
+    // b8  b9  b10 b11
+    // b12 b13 b14 b15
+    GST f, d, $f0, B0, 0, $f4, B0, 1 * 8, $f8, B0, 2 * 8, $f12, B0, 3 * 8
+.endif
+    // Transpose G3 G2 G1 G0
+    GTRANSPOSE4x4_D G0, G1, G2, G3, D0, D1, D2, D3, D4, D5
+    // Store C
+.if \N == 4
+    // c0  c1  c2  c3  ... c12 c13 c14 c15
+    // c16 c17 c18 c19 ... c28 c29 c30 c31
+    // c32 c33 c34 c35 ... c44 c45 c46 c47
+    // c48 c49 c50 c51 ... c60 c61 c62 c63
+    GST xv, , D0, C0, 0, D1, C1, 0, D2, C2, 0, D3, C3, 0
+.elseif \N == 2
+    // c0  c1  c2  c3  ... c12 c13 c14 c15
+    // c16 c17 c18 c19 ... c28 c29 c30 c31
+    GST xv, , D0, C0, 0, D1, C1, 0
+.elseif \N == 1
+    // c0  c1  c2  c3  ... c12 c13 c14 c15
+    GST xv, , D0, C0, 0
+.endif
+
+#undef G0
+#undef G1
+#undef G2
+#undef G3
+#undef G4
+#undef G5
+#undef G6
+#undef G7
+#undef G8
+#undef G9
+#undef G10
+#undef G11
+#undef G12
+#undef G13
+#undef G14
+#undef G15
+.endm
+
+.macro dsolve_8 N
+// Solve an 8-row tile of C against the 8x8 block of A at A0. If N = 4 the data layout of C is:
+// U0  U1
+// U2  U3
+// U4  U5
+// U6  U7
+// if N = 2 the data layout of C is as follows:
+// U0  U1
+// U2  U3
+// if N = 1 the data layout of C is as follows:
+// U0  U1
+// The matrix A has dimensions of 8x8, and
+// it will be divided into 2 segments for processing (rows 4..7 first, then rows 0..3).
+
+#define G4 U1 // G4..G7: rows 4..7 of the tile after the transpose below (assumes GTRANSPOSE4x4_D is a plain 4x4 transpose)
+#define G5 U3
+#define G6 U5
+#define G7 U7
+    // Transpose U7 U5 U3 U1
+    GTRANSPOSE4x4_D U1, U3, U5, U7, G4, G5, G6, G7, D0, D1
+    // A (element indices into the 8x8 block, 8 elements per row)
+    // G4   G5  G6  G7
+    // ---------------
+    // 36              | D9
+    // 44  45          | D8 D7
+    // 52  53  54      | D6 D5 D4
+    // 60  61  62  63  | D3 D2 D1 D0
+    PTR_ADDI      T0,     A0,     60 * 8
+    GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
+    PTR_ADDI      T0,     A0,     52 * 8
+    GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8
+    PTR_ADDI      T0,     A0,     44 * 8
+    GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8
+    PTR_ADDI      T0,     A0,     36 * 8
+    GLDREPL xv, d, D9, T0, 0
+
+    xvfmul.d    G7,     G7,     D0 // scale by the diagonal entry A[63] (multiplied, so presumably stored pre-inverted - confirm)
+    GNMSUB xvf, d, G6, G7, D1, G6, G5, G7, D2, G5, G4, G7, D3, G4
+    xvfmul.d    G6,     G6,     D4
+    GNMSUB xvf, d, G5, G6, D5, G5, G4, G6, D6, G4
+    xvfmul.d    G5,     G5,     D7
+    GNMSUB xvf, d, G4, G5, D8, G4
+    xvfmul.d    G4,     G4,     D9
+    // Store B (for N = 2 / N = 1 only the low 128 / 64 bits of each G register are written, via its $vr / $f alias)
+.if \N == 4
+    GST xv, , G4, B0, 16 * 8, G5, B0, 20 * 8, G6, B0, 24 * 8, G7, B0, 28 * 8
+.elseif \N == 2
+    GST v, , $vr1, B0, 8 * 8, $vr3, B0, 10 * 8, $vr5, B0, 12 * 8, $vr7, B0, 14 * 8
+.elseif \N == 1
+    GST f, d, $f1, B0, 4 * 8, $f3, B0, 5 * 8, $f5, B0, 6 * 8, $f7, B0, 7 * 8
+.endif
+    // Transpose into D4..D7; G4..G7 are read again by the second segment
+    GTRANSPOSE4x4_D G4, G5, G6, G7, D4, D5, D6, D7, D8, D9
+    // Store C
+.if \N == 4
+    GST xv, , D4, C0, 4 * 8, D5, C1, 4 * 8, D6, C2, 4 * 8, D7, C3, 4 * 8
+.elseif \N == 2
+    GST xv, , D4, C0, 4 * 8, D5, C1, 4 * 8
+.elseif \N == 1
+    GST xv, , D4, C0, 4 * 8
+.endif
+
+#define G0 U0 // G0..G3: rows 0..3 of the tile after the transpose below
+#define G1 U2
+#define G2 U4
+#define G3 U6
+    // Transpose U6 U4 U2 U0
+    GTRANSPOSE4x4_D U0, U2, U4, U6, G0, G1, G2, G3, D0, D1
+    // A
+    // G0  G1   G2  G3
+    //-----------------
+    // 0               | D9
+    // 8   9           | D8  D7
+    // 16  17  18      | D6  D5  D4
+    // 24  25  26  27  | D3  D2  D1  D0
+    // 32  33  34  35  | D15 D14 D13 D12
+    // 40  41  42  43  | D11 D10 D9  D8
+    // 48  49  50  51  | D7  D6  D5  D4
+    // 56  57  58  59  | D3  D2  D1  D0
+    PTR_ADDI  T0, A0, 56 * 8
+    GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
+    PTR_ADDI  T0, A0, 48 * 8
+    GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8
+    PTR_ADDI  T0, A0, 40 * 8
+    GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8
+    PTR_ADDI  T0, A0, 32 * 8
+    GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8
+    GNMSUB xvf, d, G3, G7, D0,  G3, G2, G7, D1,  G2, G1, G7, D2,  G1, G0, G7, D3,  G0, \
+                   G3, G6, D4,  G3, G2, G6, D5,  G2, G1, G6, D6,  G1, G0, G6, D7,  G0, \
+                   G3, G5, D8,  G3, G2, G5, D9,  G2, G1, G5, D10, G1, G0, G5, D11, G0, \
+                   G3, G4, D12, G3, G2, G4, D13, G2, G1, G4, D14, G1, G0, G4, D15, G0
+    PTR_ADDI T0, A0, 24 * 8
+    GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
+    PTR_ADDI T0, A0, 16 * 8
+    GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8
+    PTR_ADDI T0, A0, 8 * 8
+    GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8
+    PTR_ADDI T0, A0, 0 * 8
+    GLDREPL xv, d, D9, T0, 0
+
+    xvfmul.d    G3,     G3,     D0
+    GNMSUB xvf, d, G2, G3, D1, G2, G1, G3, D2, G1, G0, G3, D3, G0
+    xvfmul.d    G2,     G2,     D4
+    GNMSUB xvf, d, G1, G2, D5, G1, G0, G2, D6, G0
+    xvfmul.d    G1,     G1,     D7
+    GNMSUB xvf, d, G0, G1, D8, G0
+    xvfmul.d    G0,     G0,     D9
+    // Store B
+.if \N == 4
+    GST xv, , G0, B0, 0, G1, B0, 4 * 8, G2, B0, 8 * 8, G3, B0, 12 * 8
+.elseif \N == 2
+    GST v, , $vr0, B0, 0, $vr2, B0, 2 * 8, $vr4, B0, 4 * 8, $vr6, B0, 6 * 8
+.elseif \N == 1
+    GST f, d, $f0, B0, 0, $f2, B0, 1 * 8, $f4, B0, 2 * 8, $f6, B0, 3 * 8
+.endif
+    // Transpose G3 G2 G1 G0 into D0..D3
+    GTRANSPOSE4x4_D G0, G1, G2, G3, D0, D1, D2, D3, D4, D5
+    // Store C
+.if \N == 4
+    GST xv, , D0, C0, 0, D1, C1, 0, D2, C2, 0, D3, C3, 0
+.elseif \N == 2
+    GST xv, , D0, C0, 0, D1, C1, 0
+.elseif \N == 1
+    GST xv, , D0, C0, 0
+.endif
+
+#undef G0
+#undef G1
+#undef G2
+#undef G3
+#undef G4
+#undef G5
+#undef G6
+#undef G7
+.endm
+
+.macro dsolve_4 N
+// Solve a 4-row tile of C against the 4x4 block of A at A0. If N = 4 the data layout of C is:
+// U0
+// U1
+// U2
+// U3
+// if N = 2 the data layout of C is as follows:
+// U0
+// U1
+// if N = 1 the data layout of C is as follows:
+// U0
+// The matrix A has dimensions of 4x4, and
+// it is processed as a single segment.
+
+#define G0 U0 // G0..G3: rows 0..3 of the tile after the transpose below (assumes GTRANSPOSE4x4_D is a plain 4x4 transpose)
+#define G1 U1
+#define G2 U2
+#define G3 U3
+    // Transpose U3 U2 U1 U0
+    GTRANSPOSE4x4_D U0, U1, U2, U3, G0, G1, G2, G3, D0, D1
+    // A (element indices into the 4x4 block, 4 elements per row)
+    // G0 G1 G2 G3
+    //-------------
+    // 0           | D9
+    // 4  5        | D8 D7
+    // 8  9  10    | D6 D5 D4
+    // 12 13 14 15 | D3 D2 D1 D0
+    GLDREPL xv, d, D3, A0, 12 * 8, D2, A0, 13 * 8, D1, A0, 14 * 8, D0, A0, 15 * 8, \
+                   D6, A0, 8 * 8,  D5, A0, 9 * 8,  D4, A0, 10 * 8, \
+                   D8, A0, 4 * 8,  D7, A0, 5 * 8, \
+                   D9, A0, 0 * 8
+    xvfmul.d    G3,     G3,     D0 // scale by the diagonal entry A[15] (multiplied, so presumably stored pre-inverted - confirm)
+    GNMSUB xvf, d, G2, G3, D1, G2, G1, G3, D2, G1, G0, G3, D3, G0
+    xvfmul.d    G2,     G2,     D4
+    GNMSUB xvf, d, G1, G2, D5, G1, G0, G2, D6, G0
+    xvfmul.d    G1,     G1,     D7
+    GNMSUB xvf, d, G0, G1, D8, G0
+    xvfmul.d    G0,     G0,     D9
+    // Store B (for N = 2 / N = 1 only the low 128 / 64 bits of each G register are written, via its $vr / $f alias)
+.if \N == 4
+    GST xv, , G0, B0, 0, G1, B0, 4 * 8, G2, B0, 8 * 8, G3, B0, 12 * 8
+.elseif \N == 2
+    GST v, , $vr0, B0, 0, $vr1, B0, 2 * 8, $vr2, B0, 4 * 8, $vr3, B0, 6 * 8
+.elseif \N == 1
+    GST f, d, $f0, B0, 0, $f1, B0, 1 * 8, $f2, B0, 2 * 8, $f3, B0, 3 * 8
+.endif
+    // Transpose G3 G2 G1 G0 into D0..D3
+    GTRANSPOSE4x4_D G0, G1, G2, G3, D0, D1, D2, D3, D4, D5
+    // Store C
+.if \N == 4
+    GST xv, , D0, C0, 0, D1, C1, 0, D2, C2, 0, D3, C3, 0
+.elseif \N == 2
+    GST xv, , D0, C0, 0, D1, C1, 0
+.elseif \N == 1
+    GST xv, , D0, C0, 0
+.endif
+
+#undef G0
+#undef G1
+#undef G2
+#undef G3
+.endm
+
+.macro dsolve_2 N
+#define G0  U2 // G0/G1: the two rows of the 2-row tile, built from U0/U1 by the butterfly below
+#define G1  U3
+    // Transpose (GSBUTTERFLY is defined outside this file; presumably an interleave-low / interleave-high pair - confirm)
+    GSBUTTERFLY xv, d, G0, G1, U1, U0
+    // A (element indices into the 2x2 block)
+    // G0 G1
+    // ------
+    // 0    | D2
+    // 2  3 | D1 D0
+    GLDREPL xv, d, D2, A0, 0, D1, A0, 2 * 8, D0, A0, 3 * 8
+    xvfmul.d    G1,     G1,     D0 // scale by the diagonal entry A[3] (multiplied, so presumably stored pre-inverted - confirm)
+    GNMSUB xvf, d, G0, G1, D1, G0
+    xvfmul.d    G0,     G0,     D2
+    // Store B ($vr2/$vr3 and $f2/$f3 are the low 128 / 64 bits of G0/G1)
+.if \N == 4
+    GST xv, , G0, B0, 0, G1, B0, 4 * 8
+.elseif \N == 2
+    GST v, , $vr2, B0, 0, $vr3, B0, 2 * 8
+.elseif \N == 1
+    GST f, d, $f2, B0, 0, $f3, B0, 8
+.endif
+    // Transpose back into D0/D1
+    GSBUTTERFLY xv, d, D0, D1, G1, G0
+    // Store C ($vr16/$vr17 are the low 128 bits of D0/D1)
+.if \N == 4
+    vst       $vr16,    C0,      0x00
+    vst       $vr17,    C1,      0x00
+    xvstelm.d D0,  C2,  0x00,    0x02
+    xvstelm.d D1,  C3,  0x00,    0x02
+    xvstelm.d D0,  C2,  0x08,    0x03
+    xvstelm.d D1,  C3,  0x08,    0x03
+.elseif \N == 2
+    GST v, , $vr16, C0, 0, $vr17, C1, 0
+.elseif \N == 1
+    GST v, , $vr16, C0, 0
+.endif
+
+#undef G0
+#undef G1
+.endm
+
+.macro dgemm_dsolve_16x4
+    or    T1,   A0,     A0 // save A0/B0; the solver below rebuilds both pointers from T1/T2
+    or    T2,   B0,     B0
+    bge   ZERO, L,	.L_dsolve_16x4_load // L <= 0: skip dgemm_16x4 and solve C as loaded from memory
+    dgemm_16x4
+    b	.L_dsolve_16x4
+.L_dsolve_16x4_load:
+    // Load C: four 256-bit vectors (16 doubles) from each of the columns C0..C3
+    GLD xv, , U0,  C0, 0x00, U1,  C0, 0x20, U2,  C0, 0x40, U3,  C0, 0x60
+    GLD xv, , U4,  C1, 0x00, U5,  C1, 0x20, U6,  C1, 0x40, U7,  C1, 0x60
+    GLD xv, , U8,  C2, 0x00, U9,  C2, 0x20, U10, C2, 0x40, U11, C2, 0x60
+    GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60
+/********************** solver ******************/
+.L_dsolve_16x4: // NOTE(review): labels are not made unique (no \@), so this macro can be expanded only once per file
+    PTR_ADDI    A0,    T1,    -(16 * 8 * 8) // A0 = T1 - 16*16 doubles, applied in two equal steps
+    PTR_ADDI    A0,    A0,    -(16 * 8 * 8) // (presumably split because -2048 bytes is the limit of one add-immediate - confirm)
+    PTR_ADDI    B0,    T2,    -(16 * 4 * 8) // B0 = T2 - 16*4 doubles
+    dsolve_16 4
+.endm
+
+.macro dgemm_dsolve_1x4
+    or    T1,   A0,     A0      // T1 = A0, restored into A0 before the solve
+    or    T2,   B0,     B0      // T2 = B0, restored into B0 before the solve
+    bge   ZERO, L,    .L_dsolve_1x4_load  // L <= 0: skip the GEMM update and load C as-is
+    dgemm_1x4                   // defined outside this chunk
+    b   .L_dsolve_1x4
+.L_dsolve_1x4_load:
+    // Load C: one double from each of C0..C3 (assumes $f0-$f3 are the low lanes of U0-U3 - TODO confirm)
+    fld.d       $f0,    C0,     0x00
+    fld.d       $f1,    C1,     0x00
+    fld.d       $f2,    C2,     0x00
+    fld.d       $f3,    C3,     0x00
+    xvinsve0.d  U0,     U1,     0x01    // U0[1] = U1[0]
+    xvinsve0.d  U0,     U2,     0x02    // U0[2] = U2[0]
+    xvinsve0.d  U0,     U3,     0x03    // U0[3] = U3[0]
+.L_dsolve_1x4:
+    or      A0,     T1,     T1          // A0 = value saved on entry
+    or      B0,     T2,     T2          // B0 = value saved on entry
+    GLDREPL xv, d, D0, A0, -1 * 8       // D0 = the double just below A0
+    GMUL xvf, d, U0, U0, D0             // U0 *= D0
+    // Store C: element i of U0 goes to Ci[0]
+    xvstelm.d   U0,     C0,     0x00,       0x00
+    xvstelm.d   U0,     C1,     0x00,       0x01
+    xvstelm.d   U0,     C2,     0x00,       0x02
+    xvstelm.d   U0,     C3,     0x00,       0x03
+    // Store B: all four results to the 32 bytes just below B0
+    xvst    U0,     B0,     -32
+.endm
+
+.macro dgemm_dsolve_2x4
+    or    T1,   A0,     A0      // T1 = A0, kept so A0 can be rebased for the solve
+    or    T2,   B0,     B0      // T2 = B0, kept so B0 can be rebased for the solve
+    bge   ZERO, L,    .L_dsolve_2x4_load  // L <= 0: skip the GEMM update and load C as-is
+    dgemm_2x4                   // defined outside this chunk
+    b   .L_dsolve_2x4
+.L_dsolve_2x4_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+    /* Load C2  */
+    xvld      U2,  C2,  0x00
+    /* Load C3  */
+    xvld      U3,  C3,  0x00
+
+    xvpermi.q   U0, U2, 0x02    // presumably packs the low 128 bits of U2 into the high half of U0 - TODO confirm
+    xvpermi.q   U1, U3, 0x02    // same packing for U3 into U1
+/********************** solver ******************/
+.L_dsolve_2x4:
+    PTR_ADDI      A0,     T1,     -(2 * 2 * 8)  // A0 = T1 - 2 * 2 * 8
+    PTR_ADDI      B0,     T2,     -(2 * 4 * 8)  // B0 = T2 - 2 * 4 * 8
+    dsolve_2 4
+.endm
+
+.macro dgemm_dsolve_4x4
+    or    T1,   A0,     A0      // T1 = A0, kept so A0 can be rebased for the solve
+    or    T2,   B0,     B0      // T2 = B0, kept so B0 can be rebased for the solve
+    bge   ZERO, L,    .L_dsolve_4x4_load  // L <= 0: skip the GEMM update and load C as-is
+    dgemm_4x4                   // defined outside this chunk
+    b .L_dsolve_4x4
+.L_dsolve_4x4_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+    /* Load C2  */
+    xvld      U2,  C2,  0x00
+    /* Load C3  */
+    xvld      U3,  C3,  0x00
+/************** solver *****************/
+.L_dsolve_4x4:
+    PTR_ADDI      A0,     T1,     -(4 * 4 * 8)  // A0 = T1 - 4 * 4 * 8
+    PTR_ADDI      B0,     T2,     -(4 * 4 * 8)  // B0 = T2 - 4 * 4 * 8
+
+    dsolve_4 4
+.endm
+
+.macro dgemm_dsolve_8x4
+    or    T1,   A0,     A0      // T1 = A0, kept so A0 can be rebased for the solve
+    or    T2,   B0,     B0      // T2 = B0, kept so B0 can be rebased for the solve
+    bge   ZERO, L,	.L_dsolve_8x4_load  // L <= 0: skip the GEMM update and load C as-is
+    dgemm_8x4                   // defined outside this chunk
+    b .L_dsolve_8x4
+.L_dsolve_8x4_load:
+    /* Load C0: two 32-byte vectors */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+
+    /* Load C1  */
+    xvld      U2,  C1,  0x00
+    xvld      U3,  C1,  0x20
+
+    /* Load C2  */
+    xvld      U4,  C2,  0x00
+    xvld      U5,  C2,  0x20
+
+    /* Load C3  */
+    xvld      U6,  C3,  0x00
+    xvld      U7,  C3,  0x20
+/********* solver *********/
+.L_dsolve_8x4:
+    PTR_ADDI      A0,     T1,     -(8 * 8 * 8)  // A0 = T1 - 8 * 8 * 8
+    PTR_ADDI      B0,     T2,     -(8 * 4 * 8)  // B0 = T2 - 8 * 4 * 8
+
+    dsolve_8 4
+.endm
+
+.macro dgemm_dsolve_4x2
+    or    T1,   A0,     A0      // T1 = A0, kept so A0 can be rebased for the solve
+    or    T2,   B0,     B0      // T2 = B0, kept so B0 can be rebased for the solve
+    bge   ZERO, L,	.L_dsolve_4x2_load  // L <= 0: skip the GEMM update and load C as-is
+    dgemm_4x2                   // defined outside this chunk
+    b .L_dsolve_4x2
+.L_dsolve_4x2_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+.L_dsolve_4x2:
+    PTR_ADDI      A0,     T1,     -(4 * 4 * 8)  // A0 = T1 - 4 * 4 * 8
+    PTR_ADDI      B0,     T2,     -(4 * 2 * 8)  // B0 = T2 - 4 * 2 * 8
+
+    dsolve_4 2
+.endm
+
+.macro dgemm_dsolve_2x2
+    or    T1,   A0,     A0      // T1 = A0, kept so A0 can be rebased for the solve
+    or    T2,   B0,     B0      // T2 = B0, kept so B0 can be rebased for the solve
+    bge   ZERO, L,	.L_dsolve_2x2_load  // L <= 0: skip the GEMM update and load C as-is
+    dgemm_2x2                   // defined outside this chunk
+    b .L_dsolve_2x2
+.L_dsolve_2x2_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+.L_dsolve_2x2:
+    PTR_ADDI     A0,  T1,     -(2 * 2 * 8)  // A0 = T1 - 2 * 2 * 8
+    PTR_ADDI     B0,  T2,     -(2 * 2 * 8)  // B0 = T2 - 2 * 2 * 8
+
+    dsolve_2 2
+.endm
+
+.macro dgemm_dsolve_8x2
+    or    T1,   A0,     A0      // T1 = A0, kept so A0 can be rebased for the solve
+    or    T2,   B0,     B0      // T2 = B0, kept so B0 can be rebased for the solve
+    bge   ZERO, L,	.L_dsolve_8x2_load  // L <= 0: skip the GEMM update and load C as-is
+    dgemm_8x2                   // defined outside this chunk
+    b .L_dsolve_8x2
+.L_dsolve_8x2_load:
+    /* Load C0: two 32-byte vectors */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+    /* Load C1  */
+    xvld      U2,  C1,  0x00
+    xvld      U3,  C1,  0x20
+.L_dsolve_8x2:
+    PTR_ADDI     A0,  T1,     -(8 * 8 * 8)  // A0 = T1 - 8 * 8 * 8
+    PTR_ADDI     B0,  T2,     -(8 * 2 * 8)  // B0 = T2 - 8 * 2 * 8
+
+    dsolve_8 2
+.endm
+
+.macro dgemm_dsolve_16x2
+    or    T1,   A0,     A0      // T1 = A0, kept so A0 can be rebased for the solve
+    or    T2,   B0,     B0      // T2 = B0, kept so B0 can be rebased for the solve
+    bge   ZERO, L,	.L_dsolve_16x2_load // L <= 0: skip the GEMM update and load C as-is
+    dgemm_16x2                  // defined outside this chunk
+    b .L_dsolve_16x2
+.L_dsolve_16x2_load:
+    /* Load C0: four 32-byte vectors */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+    xvld      U2,  C0,  0x40
+    xvld      U3,  C0,  0x60
+    /* Load C1  */
+    xvld      U4,  C1,  0x00
+    xvld      U5,  C1,  0x20
+    xvld      U6,  C1,  0x40
+    xvld      U7,  C1,  0x60
+.L_dsolve_16x2:
+    PTR_ADDI    A0,    T1,    -(16 * 8 * 8)    // A0 = T1 - 16 * 16 * 8 in total,
+    PTR_ADDI    A0,    A0,    -(16 * 8 * 8)    // applied as two equal steps
+    PTR_ADDI    B0,    T2,    -(16 * 2 * 8)    // B0 = T2 - 16 * 2 * 8
+
+    dsolve_16 2
+.endm
+
+.macro dgemm_dsolve_2x1
+    or    T1,   A0,     A0      // T1 = A0, kept so A0 can be rebased for the solve
+    or    T2,   B0,     B0      // T2 = B0, kept so B0 can be rebased for the solve
+    bge   ZERO, L,	.L_dsolve_2x1_load  // L <= 0: skip the GEMM update and load C as-is
+    dgemm_2x1                   // defined outside this chunk
+    b .L_dsolve_2x1
+.L_dsolve_2x1_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+.L_dsolve_2x1:
+    PTR_ADDI     A0,  T1,     -(2 * 2 * 8)  // A0 = T1 - 2 * 2 * 8
+    PTR_ADDI     B0,  T2,     -(2 * 1 * 8)  // B0 = T2 - 2 * 1 * 8
+
+    dsolve_2 1
+.endm
+
+.macro dgemm_dsolve_4x1
+    or    T1,   A0,     A0      // T1 = A0, kept so A0 can be rebased for the solve
+    or    T2,   B0,     B0      // T2 = B0, kept so B0 can be rebased for the solve
+    bge   ZERO, L,	.L_dsolve_4x1_load  // L <= 0: skip the GEMM update and load C as-is
+    dgemm_4x1                   // defined outside this chunk
+    b .L_dsolve_4x1
+.L_dsolve_4x1_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+.L_dsolve_4x1:
+    PTR_ADDI      A0,     T1,     -(4 * 4 * 8)  // A0 = T1 - 4 * 4 * 8
+    PTR_ADDI      B0,     T2,     -(4 * 1 * 8)  // B0 = T2 - 4 * 1 * 8
+
+    dsolve_4 1
+.endm
+
+.macro dgemm_dsolve_8x1
+    or    T1,   A0,     A0      // T1 = A0, kept so A0 can be rebased for the solve
+    or    T2,   B0,     B0      // T2 = B0, kept so B0 can be rebased for the solve
+    bge   ZERO, L,	.L_dsolve_8x1_load  // L <= 0: skip the GEMM update and load C as-is
+    dgemm_8x1                   // defined outside this chunk
+    b .L_dsolve_8x1
+.L_dsolve_8x1_load:
+    /* Load C0: two 32-byte vectors */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+.L_dsolve_8x1:
+    PTR_ADDI     A0,  T1,     -(8 * 8 * 8)  // A0 = T1 - 8 * 8 * 8
+    PTR_ADDI     B0,  T2,     -(8 * 1 * 8)  // B0 = T2 - 8 * 1 * 8
+
+    dsolve_8 1
+.endm
+
+.macro dgemm_dsolve_16x1
+    or    T1,   A0,     A0      // T1 = A0, kept so A0 can be rebased for the solve
+    or    T2,   B0,     B0      // T2 = B0, kept so B0 can be rebased for the solve
+    bge   ZERO, L,	.L_dsolve_16x1_load // L <= 0: skip the GEMM update and load C as-is
+    dgemm_16x1                  // defined outside this chunk
+    b .L_dsolve_16x1
+.L_dsolve_16x1_load:
+    /* Load C0: four 32-byte vectors */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+    xvld      U2,  C0,  0x40
+    xvld      U3,  C0,  0x60
+.L_dsolve_16x1:
+    PTR_ADDI    A0,    T1,    -(16 * 8 * 8)    // A0 = T1 - 16 * 16 * 8 in total,
+    PTR_ADDI    A0,    A0,    -(16 * 8 * 8)    // applied as two equal steps
+    PTR_ADDI    B0,    T2,    -(16 * 1 * 8)    // B0 = T2 - 16 * 1 * 8
+
+    dsolve_16 1
+.endm
+
+    PROLOGUE
+    push_if_used 26, 32
+    PTR_SLLI   LDC,   LDC,   3     /* LDC *= 8 (presumably elements -> bytes - TODO confirm) */
+    /* if (!(N >> 2)) goto L_N3 */
+    PTR_SRAI   J,     N,     2     /* J = bn >> 2 */
+    andi     N,     N,     0x03    /* N = bn & 3, consumed at .L_N3 and .L_N1 */
+    beq      ZERO,  J,     .L_N3
+.align 5
+.L_J1:
+    PTR_ADDI   J,     J,     -1
+    PTR_ADD    KK,    M,     OFFSET /* kk = m + offset */
+
+    andi      I,    M,      15
+    beq       ZERO, I,      .L_M16  /* m % 16 == 0: no 1/2/4/8-wide tail blocks */
+    andi      I,    M,      1
+    beqz      I,    .L_M2
+.L_M1:
+    PTR_ADDI    T0,   M,      -1
+    PTR_SLLI    T0,   T0,     3
+    PTR_MUL     AA,   T0,     K
+    PTR_ADD     AA,   AA,     A
+    PTR_ALSL    A0,   KK,     AA,     3 /* a + (m - 1) * k + kk */
+    PTR_ADD     CC,   T0,     C         /* c + (m - 1) */
+
+    PTR_SLLI   T0,    KK,     5
+    PTR_ADD    B0,    B,      T0 /* b + 4 * kk */
+    PTR_SUB    L,     K,      KK /* L = k - kk, tested by the dgemm_dsolve macro */
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    dgemm_dsolve_1x4
+    PTR_ADDI   KK,    KK,     -1
+.L_M2:
+    andi    I,      M,      2
+    beqz    I,      .L_M4
+    PTR_SRLI  T0,     M,      1
+    PTR_SLLI  T0,     T0,     1
+    PTR_ADDI  T0,     T0,     -2
+    PTR_SLLI  T0,     T0,     3 /* ((m & -2) - 2), scaled by 8 */
+    PTR_ADD   CC,     T0,     C /* c + ((m & -2) - 2)*/
+    PTR_SLLI  T1,     KK,     4
+    PTR_MUL   AA,     T0,     K
+    PTR_ADD   AA,     AA,     A
+    PTR_ADD   A0,     AA,     T1 /* a + ((m & -2) - 2) * k + 2 * kk */
+    PTR_SLLI  T0,     KK,     5
+    PTR_ADD   B0,     B,      T0 /* b + 4 * kk */
+    PTR_SUB   L,      K,      KK
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    dgemm_dsolve_2x4
+    PTR_ADDI  KK,     KK,     -2
+.L_M4:
+    andi    I,      M,      4
+    beqz    I,      .L_M8
+    PTR_SRLI  T0,     M,      2
+    PTR_SLLI  T0,     T0,     2
+    PTR_ADDI  T0,     T0,     -4
+    PTR_SLLI  T0,     T0,     3 /* ((m & -4) - 4), scaled by 8 */
+    PTR_ADD   CC,     T0,     C /* c + ((m & -4) - 4)*/
+    PTR_SLLI  T1,     KK,     5
+    PTR_MUL   AA,     T0,     K
+    PTR_ADD   AA,     AA,     A
+    PTR_ADD   A0,     AA,     T1 /* a + ((m & -4) - 4) * k + 4 * kk */
+    PTR_SLLI  T0,     KK,     5
+    PTR_ADD   B0,     B,      T0 /* b + 4 * kk */
+    PTR_SUB   L,      K,      KK
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    dgemm_dsolve_4x4
+    PTR_ADDI  KK,     KK,     -4
+.L_M8:
+    andi    I,      M,      8
+    beqz    I,      .L_M16
+    PTR_SRLI  T0,     M,      3
+    PTR_SLLI  T0,     T0,     3
+    PTR_ADDI  T0,     T0,     -8
+    PTR_SLLI  T0,     T0,     3 /* ((m & -8) - 8), scaled by 8 */
+    PTR_ADD   CC,     T0,     C /* c + ((m & -8) - 8)*/
+    PTR_SLLI  T1,     KK,     6
+    PTR_MUL   AA,     T0,     K
+    PTR_ADD   AA,     AA,     A
+    PTR_ADD   A0,     AA,     T1 /* a + ((m & -8) - 8) * k + 8 * kk */
+    PTR_SLLI  T0,     KK,     5
+    PTR_ADD   B0,     B,      T0 /* b + 4 * kk */
+    PTR_SUB   L,      K,      KK
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    dgemm_dsolve_8x4
+    PTR_ADDI  KK,     KK,     -8
+.L_M16:
+    PTR_SRAI   I,     M,     4     /* I = bm >> 4 */
+    beq      ZERO,  I,     .L_M0
+
+    PTR_SRLI   T0,    M,      4
+    PTR_SLLI   T0,    T0,     4
+    PTR_ADDI   T0,    T0,     -16 /* ((M & -16) - 16) */
+    PTR_SLLI   T0,    T0,     3
+    PTR_MUL    AA,    T0,     K
+    PTR_ADD    AA,    A,      AA
+    PTR_ADD    CC,    C,      T0
+.align 5
+.L_I1:
+    PTR_SLLI   T0,    KK,     5
+    PTR_ADD    B0,    B,      T0   /* b + 4 * kk */
+    PTR_SUB    L,     K,      KK
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    PTR_SLLI   T0,    KK,     7
+    PTR_ADD    A0,    AA,     T0   /* aa + 16 * kk */
+    dgemm_dsolve_16x4
+    PTR_ADDI   I,     I,      -1
+    PTR_ADDI   KK,    KK,     -16
+    PTR_ADDI   CC,    CC,     -(16 * 8)
+    PTR_SLLI   T0,    K,      7
+    PTR_SUB    AA,    AA,     T0   /* aa -= 16 * k */
+    blt      ZERO,  I,      .L_I1
+.L_M0:
+    PTR_SLLI   T0,    K,      3
+    PTR_ALSL   B,     T0,     B,      2 // b += 4 * k;
+    PTR_ALSL   C,     LDC,    C,      2 // c += 4 * ldc
+    blt      ZERO,  J,      .L_J1
+.L_N3:
+    andi    J,      N,      2      /* bit 1 of the remainder: one group of 2 */
+    beq     ZERO,   J,      .L_N1
+
+    PTR_ADD    KK,    M,     OFFSET /* kk = m + offset */
+    andi      I,    M,      15
+    beq       ZERO, I,      .L_N3_M16
+    andi      I,    M,      1
+    beqz      I,    .L_N3_M2
+.L_N3_M1:
+    PTR_ADDI    KK,   KK,     -1
+
+    PTR_ADDI    T0,   M,      -1
+    PTR_SLLI    T0,   T0,     3
+    PTR_MUL     AA,   T0,     K
+    PTR_ADD     AA,   AA,     A
+    PTR_ALSL    A0,   KK,     AA,     3 /* a + (m - 1) * k + kk */
+    PTR_ADD     CC,   T0,     C         /* c + (m - 1) */
+
+    PTR_SLLI   T0,    KK,     4
+    PTR_ADD    B0,    B,      T0 /* b + 2 * kk */
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    // dgemm_dsolve_1x2, inlined as a plain scale; NOTE(review): no GEMM update here and KK is decremented first, unlike the 1x4 path - confirm
+    GLD f, d, $f0, A0, 0, $f1, C0, 0, $f2, C1, 0
+    GMUL f, d, $f1, $f1, $f0, $f2, $f2, $f0
+    GST f, d, $f1, C0, 0, $f2, C1, 0, $f1, B0, 0, $f2, B0, 8
+.L_N3_M2:
+    andi    I,      M,      2
+    beqz    I,      .L_N3_M4
+    PTR_SRLI  T0,     M,      1
+    PTR_SLLI  T0,     T0,     1
+    PTR_ADDI  T0,     T0,     -2
+    PTR_SLLI  T0,     T0,     3 /* ((m & -2) - 2), scaled by 8 */
+    PTR_ADD   CC,     T0,     C /* c + ((m & -2) - 2)*/
+    PTR_SLLI  T1,     KK,     4
+    PTR_MUL   AA,     T0,     K
+    PTR_ADD   AA,     AA,     A
+    PTR_ADD   A0,     AA,     T1 /* a + ((m & -2) - 2) * k + 2 * kk */
+    PTR_SLLI  T0,     KK,     4
+    PTR_ADD   B0,     B,      T0 /* b + 2 * kk */
+    PTR_SUB   L,      K,      KK
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    dgemm_dsolve_2x2
+    PTR_ADDI  KK,     KK,     -2
+.L_N3_M4:
+    andi    I,      M,      4
+    beqz    I,      .L_N3_M8
+    PTR_SRLI  T0,     M,      2
+    PTR_SLLI  T0,     T0,     2
+    PTR_ADDI  T0,     T0,     -4
+    PTR_SLLI  T0,     T0,     3 /* ((m & -4) - 4), scaled by 8 */
+    PTR_ADD   CC,     T0,     C /* c + ((m & -4) - 4)*/
+    PTR_SLLI  T1,     KK,     5
+    PTR_MUL   AA,     T0,     K
+    PTR_ADD   AA,     AA,     A
+    PTR_ADD   A0,     AA,     T1 /* a + ((m & -4) - 4) * k + 4 * kk */
+    PTR_SLLI  T0,     KK,     4
+    PTR_ADD   B0,     B,      T0 /* b + 2 * kk */
+    PTR_SUB   L,      K,      KK
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    dgemm_dsolve_4x2
+    PTR_ADDI  KK,     KK,     -4
+.L_N3_M8:
+    andi    I,      M,      8
+    beqz    I,      .L_N3_M16
+    PTR_SRLI  T0,     M,      3
+    PTR_SLLI  T0,     T0,     3
+    PTR_ADDI  T0,     T0,     -8
+    PTR_SLLI  T0,     T0,     3 /* ((m & -8) - 8), scaled by 8 */
+    PTR_ADD   CC,     T0,     C /* c + ((m & -8) - 8)*/
+    PTR_SLLI  T1,     KK,     6
+    PTR_MUL   AA,     T0,     K
+    PTR_ADD   AA,     AA,     A
+    PTR_ADD   A0,     AA,     T1 /* a + ((m & -8) - 8) * k + 8 * kk */
+    PTR_SLLI  T0,     KK,     4
+    PTR_ADD   B0,     B,      T0 /* b + 2 * kk */
+    PTR_SUB   L,      K,      KK
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    dgemm_dsolve_8x2
+    PTR_ADDI  KK,     KK,     -8
+.L_N3_M16:
+    PTR_SRAI   I,     M,     4     /* I = bm >> 4 */
+    beq      ZERO,  I,     .L_N3_M0
+
+    PTR_SRLI   T0,    M,      4
+    PTR_SLLI   T0,    T0,     4
+    PTR_ADDI   T0,    T0,     -16 /* ((M & -16) - 16) */
+    PTR_SLLI   T0,    T0,     3
+    PTR_MUL    AA,    T0,     K
+    PTR_ADD    AA,    A,      AA
+    PTR_ADD    CC,    C,      T0
+.align 5
+.L_N3_I1:
+    PTR_SLLI   T0,    KK,     4
+    PTR_ADD    B0,    B,      T0   /* b + 2 * kk */
+    PTR_SUB    L,     K,      KK
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    PTR_SLLI   T0,    KK,     7
+    PTR_ADD    A0,    AA,     T0   /* aa + 16 * kk */
+    dgemm_dsolve_16x2
+    PTR_ADDI   I,     I,      -1
+    PTR_ADDI   KK,    KK,     -16
+    PTR_ADDI   CC,    CC,     -(16 * 8)
+    PTR_SLLI   T0,    K,      7
+    PTR_SUB    AA,    AA,     T0   /* aa -= 16 * k */
+    blt      ZERO,  I,      .L_N3_I1
+.L_N3_M0:
+    PTR_SLLI   T0,    K,      3
+    PTR_ALSL   B,     T0,     B,      1 // b += 2 * k;
+    PTR_ALSL   C,     LDC,    C,      1 // c += 2 * ldc
+.L_N1:
+    andi    J,      N,      1      /* bit 0 of the remainder: one last single */
+    beq     ZERO,   J,      .L_N0
+
+    PTR_ADD    KK,    M,     OFFSET /* kk = m + offset */
+    andi      I,    M,      15
+    beq       ZERO, I,      .L_N1_M16
+    andi      I,    M,      1
+    beqz      I,    .L_N1_M2
+.L_N1_M1:
+    PTR_ADDI    KK,   KK,     -1
+
+    PTR_ADDI    T0,   M,      -1
+    PTR_SLLI    T0,   T0,     3
+    PTR_MUL     AA,   T0,     K
+    PTR_ADD     AA,   AA,     A
+    PTR_ALSL    A0,   KK,     AA,     3 /* a + (m - 1) * k + kk */
+    PTR_ADD     CC,   T0,     C         /* c + (m - 1) */
+
+    PTR_SLLI   T0,    KK,     3
+    PTR_ADD    B0,    B,      T0 /* b + kk */
+    GADD , d, C0, CC, ZERO
+    // dgemm_dsolve_1x1, inlined as a plain scale; NOTE(review): no GEMM update here and KK is decremented first, unlike the 1x4 path - confirm
+    GLD f, d, $f0, A0, 0, $f1, C0, 0
+    GMUL f, d, $f1, $f1, $f0
+    GST f, d, $f1, C0, 0, $f1, B0, 0
+.L_N1_M2:
+    andi    I,      M,      2
+    beqz    I,      .L_N1_M4
+    PTR_SRLI  T0,     M,      1
+    PTR_SLLI  T0,     T0,     1
+    PTR_ADDI  T0,     T0,     -2
+    PTR_SLLI  T0,     T0,     3 /* ((m & -2) - 2), scaled by 8 */
+    PTR_ADD   CC,     T0,     C /* c + ((m & -2) - 2)*/
+    PTR_SLLI  T1,     KK,     4
+    PTR_MUL   AA,     T0,     K
+    PTR_ADD   AA,     AA,     A
+    PTR_ADD   A0,     AA,     T1 /* a + ((m & -2) - 2) * k + 2 * kk */
+    PTR_SLLI  T0,     KK,     3
+    PTR_ADD   B0,     B,      T0 /* b + kk */
+    PTR_SUB   L,      K,      KK
+    GADD , d, C0, CC, ZERO
+    dgemm_dsolve_2x1
+    PTR_ADDI  KK,     KK,     -2
+.L_N1_M4:
+    andi    I,      M,      4
+    beqz    I,      .L_N1_M8
+    PTR_SRLI  T0,     M,      2
+    PTR_SLLI  T0,     T0,     2
+    PTR_ADDI  T0,     T0,     -4
+    PTR_SLLI  T0,     T0,     3 /* ((m & -4) - 4), scaled by 8 */
+    PTR_ADD   CC,     T0,     C /* c + ((m & -4) - 4)*/
+    PTR_SLLI  T1,     KK,     5
+    PTR_MUL   AA,     T0,     K
+    PTR_ADD   AA,     AA,     A
+    PTR_ADD   A0,     AA,     T1 /* a + ((m & -4) - 4) * k + 4 * kk */
+    PTR_SLLI  T0,     KK,     3
+    PTR_ADD   B0,     B,      T0 /* b + kk */
+    PTR_SUB   L,      K,      KK
+    GADD , d, C0, CC, ZERO
+    dgemm_dsolve_4x1
+    PTR_ADDI  KK,     KK,     -4
+.L_N1_M8:
+    andi    I,      M,      8
+    beqz    I,      .L_N1_M16
+    PTR_SRLI  T0,     M,      3
+    PTR_SLLI  T0,     T0,     3
+    PTR_ADDI  T0,     T0,     -8
+    PTR_SLLI  T0,     T0,     3 /* ((m & -8) - 8), scaled by 8 */
+    PTR_ADD   CC,     T0,     C /* c + ((m & -8) - 8)*/
+    PTR_SLLI  T1,     KK,     6
+    PTR_MUL   AA,     T0,     K
+    PTR_ADD   AA,     AA,     A
+    PTR_ADD   A0,     AA,     T1 /* a + ((m & -8) - 8) * k + 8 * kk */
+    PTR_SLLI  T0,     KK,     3
+    PTR_ADD   B0,     B,      T0 /* b + kk */
+    PTR_SUB   L,      K,      KK
+    GADD , d, C0, CC, ZERO
+    dgemm_dsolve_8x1
+    PTR_ADDI  KK,     KK,     -8
+.L_N1_M16:
+    PTR_SRAI   I,     M,     4     /* I = bm >> 4 */
+    beq      ZERO,  I,     .L_N1_M0
+
+    PTR_SRLI   T0,    M,      4
+    PTR_SLLI   T0,    T0,     4
+    PTR_ADDI   T0,    T0,     -16 /* ((M & -16) - 16) */
+    PTR_SLLI   T0,    T0,     3
+    PTR_MUL    AA,    T0,     K
+    PTR_ADD    AA,    A,      AA
+    PTR_ADD    CC,    C,      T0
+.align 5
+.L_N1_I1:
+    PTR_SLLI   T0,    KK,     3
+    PTR_ADD    B0,    B,      T0   /* b + kk */
+    PTR_SUB    L,     K,      KK
+    GADD , d, C0, CC, ZERO
+    PTR_SLLI   T0,    KK,     7
+    PTR_ADD    A0,    AA,     T0   /* aa + 16 * kk */
+    dgemm_dsolve_16x1
+    PTR_ADDI   I,     I,      -1
+    PTR_ADDI   KK,    KK,     -16
+    PTR_ADDI   CC,    CC,     -(16 * 8)
+    PTR_SLLI   T0,    K,      7
+    PTR_SUB    AA,    AA,     T0   /* aa -= 16 * k */
+    blt      ZERO,  I,      .L_N1_I1
+.L_N1_M0:
+.L_N0:
+    pop_if_used 26, 32
+    jirl    $r0, $r1, 0x0          /* jump to the address in $r1; the link value is written to $r0 */
+    EPILOGUE
diff --git a/kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S
new file mode 100644
index 000000000..0e2cacccf
--- /dev/null
+++ b/kernel/loongarch64/dtrsm_kernel_LT_16x4_lasx.S
@@ -0,0 +1,959 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2023/08/26 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*
+*
+*********************************************************************/
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
+ *          FLOAT *c, BLASLONG ldc, BLASLONG offset)
+ */
+
+#define M      $r4   // param 1: bm
+#define N      $r5   // param 2: bn
+#define K      $r6   // param 3: bk
+#define A      $r7   // param 5: ba
+#define B      $r8   // param 6: bb
+#define C      $r9   // param 7: bc
+#define LDC    $r10  // param 8: ldc
+#define OFFSET $r11  // param 9: offset
+
+/* Cycle control parameters */
+#define I      $r13
+#define J      $r14
+#define L      $r15
+#define TL     $r16
+/* Matrix address */
+#define A0     $r17
+#define B0     $r18
+#define C0     $r19
+#define C1     $r20
+#define C2     $r23
+#define C3     $r24
+#define T0     $r25
+#define T1     $r26
+#define T2     $r27
+#define KK     $r28
+#define AA     $r29
+#define CC     $r30
+#define BB     B0
+#undef  ZERO
+#define ZERO   $r0
+
+#define U0     $xr0
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define U8     $xr8
+#define U9     $xr9
+#define U10    $xr10
+#define U11    $xr11
+#define U12    $xr12
+#define U13    $xr13
+#define U14    $xr14
+#define U15    $xr15
+#define D0     $xr16
+#define D1     $xr17
+#define D2     $xr18
+#define D3     $xr19
+#define D4     $xr20
+#define D5     $xr21
+#define D6     $xr22
+#define D7     $xr23
+#define D8     $xr24
+#define D9     $xr25
+#define D10    $xr26
+#define D11    $xr27
+#define D12    $xr28
+#define D13    $xr29
+#define D14    $xr30
+#define D15    $xr31
+#define G0     D0
+#define G1     D1
+#define G2     D2
+#define G3     D3
+#define G4     D4
+#define G5     D5
+#define G6     D6
+#define G7     D7
+#define G8     D8
+#define G9     D9
+#define G10    D10
+#define G11    D11
+#define G12    D12
+#define G13    D13
+#define G14    D14
+#define G15    D15
+
+/* Prefetch interval */
+#define A_PRE  0x400
+#define B_PRE  0x100
+
+#include "dtrsm_kernel_macro.S"
+
+.macro ldrepl_macro start, end, stride
+// Recursive: one GLDREPL per register $xr(start)..$xr(end), reading A0 + stride * 8, A0 + (stride + 1) * 8, ...
+.if \start <= \end
+    GLDREPL xv, d, $xr\start, A0, \stride * 8
+    ldrepl_macro %start + 1, \end, %stride + 1
+.endif
+.endm
+.macro nmsub_macro start0, end0, start1, reg
+// Recursive: for i = 0..(end0 - start0), $xr(start0 + i) -= reg * $xr(start1 + i)   (Gx -= reg * Ux)
+.if \start0 <= \end0
+    xvfnmsub.d  $xr\start0, \reg, $xr\start1, $xr\start0
+    nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg
+.endif
+.endm
+.macro B_st_macro start, end, stride, N
+// Recursive: store $xr(start)..$xr(end) to consecutive slots of B0; N picks the slot size (4: 0x20, 2: 0x10, 1: 0x08)
+.if \start <= \end
+.if \N == 4
+    xvst    $xr\start, B0, \stride * 0x20
+.elseif \N == 2
+    vst     $vr\start, B0, \stride * 0x10
+.elseif \N == 1
+    fst.d   $f\start, B0, \stride * 0x08
+.endif
+    B_st_macro %start + 1, \end, %stride + 1, \N
+.endif
+.endm
+
+.macro dsolve_16 N
+// The data layout of C (4x16) is as follows (store 4 data in each register):
+// U0  U1  U2  U3
+// U4  U5  U6  U7
+// U8  U9  U10 U11
+// U12 U13 U14 U15
+// The first step is to transpose the result of C
+    GTRANSPOSE4x4_D U3, U7, U11, U15, G12, G13, G14, G15, D0, D1   // last two operands look like scratch - TODO confirm in the macro definition
+    GTRANSPOSE4x4_D U2, U6, U10, U14, G8, G9, G10, G11, D0, D1
+    GTRANSPOSE4x4_D U1, U5, U9, U13, G4, G5, G6, G7, U3, U7        // U3/U7 were consumed by the first transpose above
+    GTRANSPOSE4x4_D U0, U4, U8, U12, G0, G1, G2, G3, U3, U7
+// Now we have the following memory layout of C:
+//     0     1    2   3    ...    15
+// 0 |    |    |    |    |     |     |
+// 1 | G0 | G1 | G2 | G3 | ... | G15 |
+// 2 |    |    |    |    |     |     |
+// 3 |    |    |    |    |     |     |
+// Next we are going to process matrix A with a size of 16x16,
+// using only the upper triangular portion. The memory layout of
+// matrix A is as follows, quite large.
+//0	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15
+//	17	18	19	20	21	22	23	24	25	26	27	28	29	30	31
+//		34	35	36	37	38	39	40	41	42	43	44	45	46	47
+//			51	52	53	54	55	56	57	58	59	60	61	62	63
+//				68	69	70	71	72	73	74	75	76	77	78	79
+//					85	86	87	88	89	90	91	92	93	94	95
+//						102	103	104	105	106	107	108	109	110	111
+//							119	120	121	122	123	124	125	126	127
+//								136	137	138	139	140	141	142	143
+//									153	154	155	156	157	158	159
+//										170	171	172	173	174	175
+//											187	188	189	190	191
+//												204	205	206	207
+//													221	222	223
+//														238	239
+//															255
+// Sequentially extract data from A in row order
+// Row 0: load elements 0..15 into U0..U15, scale G0, then eliminate it from G1..G15
+    ldrepl_macro 0, 15, 0
+    GMUL xvf, d, G0, G0, U0         // G0 *= U0 (the row-0 diagonal entry as stored in A)
+    nmsub_macro 17, 31, 1, G0       // G1..G15 -= G0 * U1..U15
+    PTR_ADDI    A0,      A0,      17 * 8   // step to the next diagonal entry (16 + 1 elements on)
+// Row 1: same pattern starting at diagonal element 17
+    ldrepl_macro 1, 15, 0
+    GMUL xvf, d, G1, G1, U1
+    nmsub_macro 18, 31, 2, G1
+    PTR_ADDI    A0,      A0,      17 * 8
+// Row 2
+    ldrepl_macro 2, 15, 0
+    GMUL xvf, d, G2, G2, U2
+    nmsub_macro 19, 31, 3, G2
+    PTR_ADDI    A0,      A0,      17 * 8
+// Row 3
+    ldrepl_macro 3, 15, 0
+    GMUL xvf, d, G3, G3, U3
+    nmsub_macro 20, 31, 4, G3
+    PTR_ADDI    A0,      A0,      17 * 8
+// Row 4
+    ldrepl_macro 4, 15, 0
+    GMUL xvf, d, G4, G4, U4
+    nmsub_macro 21, 31, 5, G4
+    PTR_ADDI    A0,      A0,      17 * 8
+// Row 5
+    ldrepl_macro 5, 15, 0
+    GMUL xvf, d, G5, G5, U5
+    nmsub_macro 22, 31, 6, G5
+    PTR_ADDI    A0,      A0,      17 * 8
+// Row 6
+    ldrepl_macro 6, 15, 0
+    GMUL xvf, d, G6, G6, U6
+    nmsub_macro 23, 31, 7, G6
+    PTR_ADDI    A0,      A0,      17 * 8
+// Row 7
+    ldrepl_macro 7, 15, 0
+    GMUL xvf, d, G7, G7, U7
+    nmsub_macro 24, 31, 8, G7
+    PTR_ADDI    A0,      A0,      17 * 8
+// Row 8
+    ldrepl_macro 8, 15, 0
+    GMUL xvf, d, G8, G8, U8
+    nmsub_macro 25, 31, 9, G8
+    PTR_ADDI    A0,      A0,      17 * 8
+// Row 9
+    ldrepl_macro 9, 15, 0
+    GMUL xvf, d, G9, G9, U9
+    nmsub_macro 26, 31, 10, G9
+    PTR_ADDI    A0,      A0,      17 * 8
+// Row 10
+    ldrepl_macro 10, 15, 0
+    GMUL xvf, d, G10, G10, U10
+    nmsub_macro 27, 31, 11, G10
+    PTR_ADDI    A0,      A0,      17 * 8
+// Row 11
+    ldrepl_macro 11, 15, 0
+    GMUL xvf, d, G11, G11, U11
+    nmsub_macro 28, 31, 12, G11
+    PTR_ADDI    A0,      A0,      17 * 8
+// Row 12
+    ldrepl_macro 12, 15, 0
+    GMUL xvf, d, G12, G12, U12
+    nmsub_macro 29, 31, 13, G12
+    PTR_ADDI    A0,      A0,      17 * 8
+// Row 13
+    ldrepl_macro 13, 15, 0
+    GMUL xvf, d, G13, G13, U13
+    nmsub_macro 30, 31, 14, G13
+    PTR_ADDI    A0,      A0,      17 * 8
+// Row 14
+    ldrepl_macro 14, 15, 0
+    GMUL xvf, d, G14, G14, U14
+    nmsub_macro 31, 31, 15, G14
+    PTR_ADDI    A0,      A0,      17 * 8
+// Row 15: only the diagonal entry (element 255) remains, so there is nothing left to eliminate
+    ldrepl_macro 15, 15, 0
+    GMUL xvf, d, G15, G15, U15
+// Finally, We can store the result.
+// For B, stored sequentially, and  C, first transpose and then store
+    B_st_macro 16, 31, 0, \N
+    GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1         // transposed in place; U0/U1 passed as the trailing operands
+    GTRANSPOSE4x4_D G4, G5, G6, G7, G4, G5, G6, G7, U0, U1
+    GTRANSPOSE4x4_D G8, G9, G10, G11, G8, G9, G10, G11, U0, U1
+    GTRANSPOSE4x4_D G12, G13, G14, G15, G12, G13, G14, G15, U0, U1
+.if \N == 4
+    GST xv, , G0, C0, 0x00, G4, C0, 0x20, G8,  C0, 0x40, G12, C0, 0x60, \
+              G1, C1, 0x00, G5, C1, 0x20, G9,  C1, 0x40, G13, C1, 0x60, \
+              G2, C2, 0x00, G6, C2, 0x20, G10, C2, 0x40, G14, C2, 0x60, \
+              G3, C3, 0x00, G7, C3, 0x20, G11, C3, 0x40, G15, C3, 0x60
+.elseif \N == 2
+    GST xv, , G0, C0, 0x00, G4, C0, 0x20, G8,  C0, 0x40, G12, C0, 0x60, \
+              G1, C1, 0x00, G5, C1, 0x20, G9,  C1, 0x40, G13, C1, 0x60
+.elseif \N == 1
+    GST xv, , G0, C0, 0x00, G4, C0, 0x20, G8,  C0, 0x40, G12, C0, 0x60
+.endif
+.endm
+
+.macro dgemm_dsolve_16x4
+    bge     ZERO,       KK,     .L_dsolve_16x4_load // KK <= 0: skip the GEMM update; NOTE(review): the 8x4 and 4x4 variants test L here - confirm the caller keeps L == KK
+    dgemm_16x4                  // provided by an included file, not visible in this chunk
+    b	.L_dsolve_16x4
+.L_dsolve_16x4_load:
+    // Load C: four 32-byte vectors from each of C0..C3 into U0..U15
+    GLD xv, , U0,  C0, 0x00, U1,  C0, 0x20, U2,  C0, 0x40, U3,  C0, 0x60
+    GLD xv, , U4,  C1, 0x00, U5,  C1, 0x20, U6,  C1, 0x40, U7,  C1, 0x60
+    GLD xv, , U8,  C2, 0x00, U9,  C2, 0x20, U10, C2, 0x40, U11, C2, 0x60
+    GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60
+/********************** solver ******************/
+.L_dsolve_16x4:
+    dsolve_16 4
+.endm
+
+.macro dsolve_8 N
+// The data layout of C (4x8) is as follows (store 4 data in each register):
+// U0  U1
+// U2  U3
+// U4  U5
+// U6  U7
+// The first step is to transpose the result of C
+    GTRANSPOSE4x4_D U1, U3, U5, U7, G4, G5, G6, G7, G8, G9   // G8/G9 are not otherwise used by this macro
+    GTRANSPOSE4x4_D U0, U2, U4, U6, G0, G1, G2, G3, G8, G9
+// Now we have the following memory layout of C:
+//     0     1    2   3    ...   7
+// 0 |    |    |    |    |     |    |
+// 1 | G0 | G1 | G2 | G3 | ... | G7 |
+// 2 |    |    |    |    |     |    |
+// 3 |    |    |    |    |     |    |
+// Next we are going to process matrix A with a size of 8x8,
+// using only the upper triangular portion. The memory layout of
+// matrix A is as follows:
+//0	1	2	3	4	5	6	7
+//	9	10	11	12	13	14	15
+//		18	19	20	21	22	23
+//			27	28	29	30	31
+//				36	37	38	39
+//					45	46	47
+//						54	55
+//							63
+// Sequentially extract data from A in row order
+// Row 0: load elements 0..7 into U0..U7, scale G0, then eliminate it from G1..G7
+    ldrepl_macro 0, 7, 0
+    GMUL xvf, d, G0, G0, U0         // G0 *= U0 (the row-0 diagonal entry as stored in A)
+    nmsub_macro 17, 23, 1, G0       // G1..G7 -= G0 * U1..U7
+    PTR_ADDI    A0,      A0,      9 * 8    // step to the next diagonal entry (8 + 1 elements on)
+// Row 1: same pattern starting at diagonal element 9
+    ldrepl_macro 1, 7, 0
+    GMUL xvf, d, G1, G1, U1
+    nmsub_macro 18, 23, 2, G1
+    PTR_ADDI    A0,      A0,      9 * 8
+// Row 2
+    ldrepl_macro 2, 7, 0
+    GMUL xvf, d, G2, G2, U2
+    nmsub_macro 19, 23, 3, G2
+    PTR_ADDI    A0,      A0,      9 * 8
+// Row 3
+    ldrepl_macro 3, 7, 0
+    GMUL xvf, d, G3, G3, U3
+    nmsub_macro 20, 23, 4, G3
+    PTR_ADDI    A0,      A0,      9 * 8
+// Row 4
+    ldrepl_macro 4, 7, 0
+    GMUL xvf, d, G4, G4, U4
+    nmsub_macro 21, 23, 5, G4
+    PTR_ADDI    A0,      A0,      9 * 8
+// Row 5
+    ldrepl_macro 5, 7, 0
+    GMUL xvf, d, G5, G5, U5
+    nmsub_macro 22, 23, 6, G5
+    PTR_ADDI    A0,      A0,      9 * 8
+// Row 6
+    ldrepl_macro 6, 7, 0
+    GMUL xvf, d, G6, G6, U6
+    nmsub_macro 23, 23, 7, G6
+    PTR_ADDI    A0,      A0,      9 * 8
+// Row 7: only the diagonal entry (element 63) remains, so there is nothing left to eliminate
+    ldrepl_macro 7, 7, 0
+    GMUL xvf, d, G7, G7, U7
+// Finally, We can store the result.
+// For B, stored sequentially, and  C, first transpose and then store
+    B_st_macro 16, 23, 0, \N
+    GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1   // transposed in place; U0/U1 passed as the trailing operands
+    GTRANSPOSE4x4_D G4, G5, G6, G7, G4, G5, G6, G7, U0, U1
+.if \N == 4
+    GST xv, , G0, C0, 0x00, G4, C0, 0x20, \
+              G1, C1, 0x00, G5, C1, 0x20, \
+              G2, C2, 0x00, G6, C2, 0x20, \
+              G3, C3, 0x00, G7, C3, 0x20
+.elseif \N == 2
+    GST xv, , G0, C0, 0x00, G4, C0, 0x20, \
+              G1, C1, 0x00, G5, C1, 0x20
+.elseif \N == 1
+    GST xv, , G0, C0, 0x00, G4, C0, 0x20
+.endif
+.endm
+
+.macro dgemm_dsolve_8x4
+    bge   ZERO, L,	.L_dsolve_8x4_load  // L <= 0: skip the GEMM update and load C as-is
+    dgemm_8x4                   // provided by an included file, not visible in this chunk
+    b .L_dsolve_8x4
+.L_dsolve_8x4_load:
+    /* Load C0: two 32-byte vectors */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+
+    /* Load C1  */
+    xvld      U2,  C1,  0x00
+    xvld      U3,  C1,  0x20
+
+    /* Load C2  */
+    xvld      U4,  C2,  0x00
+    xvld      U5,  C2,  0x20
+
+    /* Load C3  */
+    xvld      U6,  C3,  0x00
+    xvld      U7,  C3,  0x20
+/********* solver *********/
+.L_dsolve_8x4:
+    dsolve_8 4
+.endm
+
+.macro dsolve_4 N
+// The data layout of C (4x4) is as follows (store 4 data in each register):
+// U0
+// U1
+// U2
+// U3
+// The first step is to transpose the result of C
+    GTRANSPOSE4x4_D U0, U1, U2, U3, G0, G1, G2, G3, G4, G5   // G4/G5 are not otherwise used by this macro
+// Now we have the following memory layout of C:
+//     0     1    2   3
+// 0 |    |    |    |    |
+// 1 | G0 | G1 | G2 | G3 |
+// 2 |    |    |    |    |
+// 3 |    |    |    |    |
+// Next we are going to process matrix A with a size of 4x4,
+// using only the upper triangular portion. The memory layout of
+// matrix A is as follows:
+//0	1	2	3
+//	5	6	7
+//		10	11
+//			15
+// Sequentially extract data from A in row order
+// Row 0: load elements 0..3 into U0..U3, scale G0, then eliminate it from G1..G3
+    ldrepl_macro 0, 3, 0
+    GMUL xvf, d, G0, G0, U0         // G0 *= U0 (the row-0 diagonal entry as stored in A)
+    nmsub_macro 17, 19, 1, G0       // G1..G3 -= G0 * U1..U3
+    PTR_ADDI    A0,      A0,      5 * 8    // step to the next diagonal entry (4 + 1 elements on)
+// Row 1: same pattern starting at diagonal element 5
+    ldrepl_macro 1, 3, 0
+    GMUL xvf, d, G1, G1, U1
+    nmsub_macro 18, 19, 2, G1
+    PTR_ADDI    A0,      A0,      5 * 8
+// Row 2
+    ldrepl_macro 2, 3, 0
+    GMUL xvf, d, G2, G2, U2
+    nmsub_macro 19, 19, 3, G2
+    PTR_ADDI    A0,      A0,      5 * 8
+// Row 3: only the diagonal entry (element 15) remains, so there is nothing left to eliminate
+    ldrepl_macro 3, 3, 0
+    GMUL xvf, d, G3, G3, U3
+// Finally, We can store the result.
+// For B, stored sequentially, and  C, first transpose and then store
+    B_st_macro 16, 19, 0, \N
+    GTRANSPOSE4x4_D G0, G1, G2, G3, G0, G1, G2, G3, U0, U1   // transposed in place; U0/U1 passed as the trailing operands
+.if \N == 4
+    GST xv, , G0, C0, 0x00, G1, C1, 0x00, G2, C2, 0x00, G3, C3, 0x00
+.elseif \N == 2
+    GST xv, , G0, C0, 0x00, G1, C1, 0x00
+.elseif \N == 1
+    GST xv, , G0, C0, 0x00
+.endif
+.endm
+
+// dgemm_dsolve_4x4: obtain the 4x4 tile of C in U0..U3 and solve it.
+// L > 0: run dgemm_4x4 (defined elsewhere). L <= 0: load the tile from C0..C3.
+.macro dgemm_dsolve_4x4
+    bge   ZERO, L,    .L_dsolve_4x4_load
+    dgemm_4x4
+    b .L_dsolve_4x4
+.L_dsolve_4x4_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+    /* Load C2  */
+    xvld      U2,  C2,  0x00
+    /* Load C3  */
+    xvld      U3,  C3,  0x00
+/************** solver *****************/
+.L_dsolve_4x4:
+    dsolve_4 4
+.endm
+
+// dsolve_2 N: triangular solve for a tile with 2 rows and N columns (N = 4, 2 or 1).
+// Input is U0/U1; the tile is rearranged into G0/G1 (one register per row),
+// solved against the 2x2 triangular block read through A0, written to B via
+// B_st_macro, rearranged back into U0/U1 and stored to C.
+// ldrepl_macro / nmsub_macro / B_st_macro are defined outside this view.
+.macro dsolve_2 N
+// Transpose
+    GSBUTTERFLY xv, d, G0, G1, U1, U0
+// Now we have the following memory layout of C:
+//     0     1
+// 0 |    |    |
+// 1 | G0 | G1 |
+// 2 |    |    |
+// 3 |    |    |
+// Next we are going to process matrix A with a size of 2x2,
+// using only the upper triangular portion. The memory layout of
+// matrix A is as follows:
+//0	1
+//	3
+// Sequentially extract data from A in row order
+// Load 0
+    ldrepl_macro 0, 1, 0
+    GMUL xvf, d, G0, G0, U0
+    nmsub_macro 17, 17, 1, G0
+    // Step to the diagonal element of the next row: 2 + 1 doubles ahead.
+    PTR_ADDI    A0,      A0,      3 * 8
+// Load 1
+    ldrepl_macro 1, 1, 0
+    GMUL xvf, d, G1, G1, U1
+// Finally, we can store the result.
+// B is stored sequentially; C is transposed back first and then stored.
+    B_st_macro 16, 17, 0, \N
+    GSBUTTERFLY xv, d, U0, U1, G1, G0
+.if \N == 4
+    // Lanes 0-1 of U0/U1 go to columns C0/C1 as 128-bit stores.
+    vst       $vr0,     C0,      0x00
+    vst       $vr1,     C1,      0x00
+    // Lanes 2-3 of U0/U1 go to columns C2/C3, one double at a time.
+    xvstelm.d U0,  C2,  0x00,    0x02
+    xvstelm.d U1,  C3,  0x00,    0x02
+    xvstelm.d U0,  C2,  0x08,    0x03
+    xvstelm.d U1,  C3,  0x08,    0x03
+.elseif \N == 2
+    vst       $vr0,     C0,      0x00
+    vst       $vr1,     C1,      0x00
+.elseif \N == 1
+    vst       $vr0,     C0,      0x00
+.endif
+.endm
+
+// dgemm_dsolve_2x4: obtain the 2x4 tile of C packed in U0/U1 and solve it.
+// L > 0: run dgemm_2x4 (defined elsewhere). L <= 0: load the tile from C0..C3
+// and pack it so that U0 carries the C0 and C2 pairs and U1 the C1 and C3 pairs
+// (the N == 4 store path of dsolve_2 writes lanes 2-3 of U0/U1 back to C2/C3).
+// NOTE(review): xvld reads 32 bytes per column although this tile has only
+// 2 rows (16 bytes); confirm the extra read always stays inside the C buffer.
+.macro dgemm_dsolve_2x4
+    bge   ZERO, L,    .L_dsolve_2x4_load
+    dgemm_2x4
+    b   .L_dsolve_2x4
+.L_dsolve_2x4_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+    /* Load C2  */
+    xvld      U2,  C2,  0x00
+    /* Load C3  */
+    xvld      U3,  C3,  0x00
+
+    /* Merge the C2/C3 data into the upper 128 bits of U0/U1 */
+    xvpermi.q   U0, U2, 0x02
+    xvpermi.q   U1, U3, 0x02
+/********************** solver ******************/
+.L_dsolve_2x4:
+    dsolve_2 4
+.endm
+
+// dgemm_dsolve_1x4: obtain the 1x4 tile of C in the four lanes of U0 and solve it.
+// L > 0: run dgemm_1x4 (defined elsewhere). L <= 0: load one double from each
+// of C0..C3 and gather them into lanes 0..3 of U0.
+// The solve is a single multiply by the value read at A0 (presumably the
+// pre-inverted diagonal element -- confirm); the result goes to C and to B0.
+.macro dgemm_dsolve_1x4
+    bge   ZERO, L,    .L_dsolve_1x4_load
+    dgemm_1x4
+    b   .L_dsolve_1x4
+.L_dsolve_1x4_load:
+    // Load C
+    fld.d       $f0,    C0,     0x00
+    fld.d       $f1,    C1,     0x00
+    fld.d       $f2,    C2,     0x00
+    fld.d       $f3,    C3,     0x00
+    // Insert lane 0 of U1/U2/U3 into lanes 1/2/3 of U0
+    xvinsve0.d  U0,     U1,     0x01
+    xvinsve0.d  U0,     U2,     0x02
+    xvinsve0.d  U0,     U3,     0x03
+.L_dsolve_1x4:
+    GLDREPL xv, d, D0, A0, 0x00
+    GMUL xvf, d, U0, U0, D0
+    // Store C
+    xvstelm.d   U0,     C0,     0x00,       0x00
+    xvstelm.d   U0,     C1,     0x00,       0x01
+    xvstelm.d   U0,     C2,     0x00,       0x02
+    xvstelm.d   U0,     C3,     0x00,       0x03
+    // Store B
+    xvst    U0,     B0,     0x00
+.endm
+
+// dgemm_dsolve_16x2: obtain the 16x2 tile of C in U0..U7 and solve it.
+// L > 0: run dgemm_16x2 (defined elsewhere). L <= 0: load the tile from C0/C1.
+.macro dgemm_dsolve_16x2
+    bge   ZERO, L,	.L_dsolve_16x2_load
+    dgemm_16x2
+    b .L_dsolve_16x2
+.L_dsolve_16x2_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+    xvld      U2,  C0,  0x40
+    xvld      U3,  C0,  0x60
+    /* Load C1  */
+    xvld      U4,  C1,  0x00
+    xvld      U5,  C1,  0x20
+    xvld      U6,  C1,  0x40
+    xvld      U7,  C1,  0x60
+.L_dsolve_16x2:
+    dsolve_16 2
+.endm
+
+// dgemm_dsolve_8x2: obtain the 8x2 tile of C in U0..U3 and solve it.
+// L > 0: run dgemm_8x2 (defined elsewhere). L <= 0: load the tile from C0/C1.
+.macro dgemm_dsolve_8x2
+    bge   ZERO, L,	.L_dsolve_8x2_load
+    dgemm_8x2
+    b .L_dsolve_8x2
+.L_dsolve_8x2_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+    /* Load C1  */
+    xvld      U2,  C1,  0x00
+    xvld      U3,  C1,  0x20
+.L_dsolve_8x2:
+    dsolve_8 2
+.endm
+
+// dgemm_dsolve_4x2: obtain the 4x2 tile of C in U0/U1 and solve it.
+// L > 0: run dgemm_4x2 (defined elsewhere). L <= 0: load the tile from C0/C1.
+.macro dgemm_dsolve_4x2
+    bge   ZERO, L,	.L_dsolve_4x2_load
+    dgemm_4x2
+    b .L_dsolve_4x2
+.L_dsolve_4x2_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+.L_dsolve_4x2:
+    dsolve_4 2
+.endm
+
+// dgemm_dsolve_1x2: obtain the 1x2 tile of C in lanes 0-1 of U0 and solve it.
+// L > 0: run dgemm_1x2 (defined elsewhere). L <= 0: load one double from each
+// of C0/C1 and gather them into U0. The solve is a single multiply by the
+// value read at A0; the result goes to C and, as a 128-bit store, to B0.
+.macro dgemm_dsolve_1x2
+    bge   ZERO, L,    .L_dsolve_1x2_load
+    dgemm_1x2
+    b   .L_dsolve_1x2
+.L_dsolve_1x2_load:
+    // Load C
+    fld.d       $f0,    C0,     0x00
+    fld.d       $f1,    C1,     0x00
+    // Insert lane 0 of U1 into lane 1 of U0
+    xvinsve0.d  U0,     U1,     0x01
+.L_dsolve_1x2:
+    GLDREPL xv, d, D0, A0, 0x00
+    GMUL xvf, d, U0, U0, D0
+    // Store C
+    xvstelm.d   U0,     C0,     0x00,       0x00
+    xvstelm.d   U0,     C1,     0x00,       0x01
+    // Store B
+    vst    $vr0,     B0,     0x00
+.endm
+
+// dgemm_dsolve_2x2: obtain the 2x2 tile of C in U0/U1 and solve it.
+// L > 0: run dgemm_2x2 (defined elsewhere). L <= 0: load the tile from C0/C1.
+// NOTE(review): xvld reads 32 bytes per column although this tile has only
+// 2 rows (16 bytes); confirm the extra read always stays inside the C buffer.
+.macro dgemm_dsolve_2x2
+    bge   ZERO, L,	.L_dsolve_2x2_load
+    dgemm_2x2
+    b .L_dsolve_2x2
+.L_dsolve_2x2_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+.L_dsolve_2x2:
+    dsolve_2 2
+.endm
+
+// dgemm_dsolve_16x1: obtain the 16x1 tile of C in U0..U3 and solve it.
+// L > 0: run dgemm_16x1 (defined elsewhere). L <= 0: load the tile from C0.
+.macro dgemm_dsolve_16x1
+    bge   ZERO, L,	.L_dsolve_16x1_load
+    dgemm_16x1
+    b .L_dsolve_16x1
+.L_dsolve_16x1_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+    xvld      U2,  C0,  0x40
+    xvld      U3,  C0,  0x60
+.L_dsolve_16x1:
+    dsolve_16 1
+.endm
+
+// dgemm_dsolve_8x1: obtain the 8x1 tile of C in U0/U1 and solve it.
+// L > 0: run dgemm_8x1 (defined elsewhere). L <= 0: load the tile from C0.
+.macro dgemm_dsolve_8x1
+    bge   ZERO, L,	.L_dsolve_8x1_load
+    dgemm_8x1
+    b .L_dsolve_8x1
+.L_dsolve_8x1_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+.L_dsolve_8x1:
+    dsolve_8 1
+.endm
+
+// dgemm_dsolve_4x1: obtain the 4x1 tile of C in U0 and solve it.
+// L > 0: run dgemm_4x1 (defined elsewhere). L <= 0: load the tile from C0.
+.macro dgemm_dsolve_4x1
+    bge   ZERO, L,	.L_dsolve_4x1_load
+    dgemm_4x1
+    b .L_dsolve_4x1
+.L_dsolve_4x1_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+.L_dsolve_4x1:
+    dsolve_4 1
+.endm
+
+// dgemm_dsolve_2x1: obtain the 2x1 tile of C in U0 and solve it.
+// L > 0: run dgemm_2x1 (defined elsewhere). L <= 0: load the tile from C0.
+// NOTE(review): xvld reads 32 bytes although this tile has only 2 rows
+// (16 bytes); confirm the extra read always stays inside the C buffer.
+.macro dgemm_dsolve_2x1
+    bge   ZERO, L,	.L_dsolve_2x1_load
+    dgemm_2x1
+    b .L_dsolve_2x1
+.L_dsolve_2x1_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+.L_dsolve_2x1:
+    dsolve_2 1
+.endm
+
+// dgemm_dsolve_1x1: obtain the single element of C in lane 0 of U0 and solve it.
+// L > 0: run dgemm_1x1 (defined elsewhere). L <= 0: load the element from C0.
+// The solve is a single multiply by the value read at A0; only lane 0 of the
+// product is written back, once to C0 and once to B0.
+.macro dgemm_dsolve_1x1
+    bge   ZERO, L,    .L_dsolve_1x1_load
+    dgemm_1x1
+    b .L_dsolve_1x1
+.L_dsolve_1x1_load:
+    // Load C
+    fld.d       $f0,    C0,     0x00
+.L_dsolve_1x1:
+    GLDREPL xv, d, D0, A0, 0x00
+    GMUL xvf, d, U0, U0, D0
+    // Store C
+    xvstelm.d   U0,     C0,     0x00,       0x00
+    // Store B
+    xvstelm.d   U0,     B0,     0x00,       0x00
+.endm
+
+    // Kernel entry point.
+    // Columns of C are processed in panels of 4 (J loop), then 2, then 1.
+    // Inside each panel the rows are processed in tiles of 16 (I loop), then
+    // 8, 4, 2 and 1, selected from the bits of M. For every tile:
+    //   C0..C3 = column pointers of the tile, A0 = AA, B0 = B, L = KK,
+    // then the matching dgemm_dsolve_MxN macro runs and CC / KK / AA advance
+    // by the tile height. KK restarts from OFFSET at the top of every panel.
+    PROLOGUE
+    // push_if_used / pop_if_used are defined elsewhere; presumably they save
+    // and restore the callee-saved registers used here -- confirm.
+    push_if_used 26, 32
+    // Convert ldc from elements to bytes (8-byte doubles).
+    PTR_SLLI   LDC,   LDC,   3
+    /* if (!(N >> 2)) goto L_N3 */
+    PTR_SRAI   J,     N,     2     /* J = bn >> 2 */
+    andi       N,     N,     0x03
+    beq        ZERO,  J,     .L_N3
+.align 5
+.L_J1:
+    PTR_ADDI    J,      J,     -1
+    move        KK,     OFFSET
+    move        AA,     A
+    move        CC,     C
+    PTR_SRAI    I,      M,      4 // M >> 4
+    beqz        I,      .L_M15
+.align 4
+.L_I1:
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_16x4
+    PTR_ADDI    I,      I,      -1
+    PTR_SLLI    T0,     K,      7
+    PTR_ADDI    CC,     CC,     0x80 // cc += 16
+    PTR_ADDI    KK,     KK,     0x10 // kk += 16
+    PTR_ADD     AA,     AA,     T0 // aa += 16 * k
+    bnez        I,      .L_I1
+.L_M15:
+    andi        I,      M,      8
+    beqz        I,      .L_M7
+.L_M8:
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_8x4
+    PTR_SLLI    T0,     K,      6
+    PTR_ADDI    CC,     CC,     0x40 // cc += 8
+    PTR_ADDI    KK,     KK,     0x08 // kk += 8
+    PTR_ADD     AA,     AA,     T0 // aa += 8 * k
+.L_M7:
+    andi        I,      M,      4
+    beqz        I,      .L_M3
+.L_M4:
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_4x4
+    PTR_SLLI    T0,     K,      5
+    PTR_ADDI    CC,     CC,     0x20 // cc += 4
+    PTR_ADDI    KK,     KK,     0x04 // kk += 4
+    PTR_ADD     AA,     AA,     T0 // aa += 4 * k
+.L_M3:
+    andi        I,      M,      2
+    beqz        I,      .L_M1
+.L_M2:
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_2x4
+    PTR_SLLI    T0,     K,      4
+    PTR_ADDI    CC,     CC,     0x10 // cc += 2
+    PTR_ADDI    KK,     KK,     0x02 // kk += 2
+    PTR_ADD     AA,     AA,     T0 // aa += 2 * k
+.L_M1:
+    andi        I,      M,      1
+    beqz        I,      .L_M0
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_1x4
+    PTR_SLLI    T0,     K,      3
+    PTR_ADDI    CC,     CC,     0x08 // cc += 1
+    PTR_ADDI    KK,     KK,     0x01 // kk += 1
+    PTR_ADD     AA,     AA,     T0 // aa += 1 * k
+.L_M0:
+    // End of a 4-column panel: advance B and C to the next panel.
+    PTR_SLLI    T0,     K,      5
+    PTR_SLLI    T1,     LDC,    2
+    PTR_ADD     B,      B,      T0 // b += 4 * k
+    PTR_ADD     C,      C,      T1 // c += 4 * ldc
+    bnez        J,      .L_J1
+.L_N3:
+    // Remaining columns: N now holds bn & 3. Handle a 2-column panel if bit 1 is set.
+    andi    J,      N,      2
+    beq     ZERO,   J,      .L_N1
+.L_N2:
+    move        KK,     OFFSET
+    move        AA,     A
+    move        CC,     C
+    PTR_SRAI    I,      M,      4 // M >> 4
+    beqz        I,      .L_N2_M15
+.align 4
+.L_N2_I1:
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_16x2
+    PTR_ADDI    I,      I,      -1
+    PTR_SLLI    T0,     K,      7
+    PTR_ADDI    CC,     CC,     0x80 // cc += 16
+    PTR_ADDI    KK,     KK,     0x10 // kk += 16
+    PTR_ADD     AA,     AA,     T0 // aa += 16 * k
+    bnez        I,      .L_N2_I1
+.L_N2_M15:
+    andi        I,      M,      8
+    beqz        I,      .L_N2_M7
+.L_N2_M8:
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_8x2
+    PTR_SLLI    T0,     K,      6
+    PTR_ADDI    CC,     CC,     0x40 // cc += 8
+    PTR_ADDI    KK,     KK,     0x08 // kk += 8
+    PTR_ADD     AA,     AA,     T0 // aa += 8 * k
+.L_N2_M7:
+    andi        I,      M,      4
+    beqz        I,      .L_N2_M3
+.L_N2_M4:
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_4x2
+    PTR_SLLI    T0,     K,      5
+    PTR_ADDI    CC,     CC,     0x20 // cc += 4
+    PTR_ADDI    KK,     KK,     0x04 // kk += 4
+    PTR_ADD     AA,     AA,     T0 // aa += 4 * k
+.L_N2_M3:
+    andi        I,      M,      2
+    beqz        I,      .L_N2_M1
+.L_N2_M2:
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_2x2
+    PTR_SLLI    T0,     K,      4
+    PTR_ADDI    CC,     CC,     0x10 // cc += 2
+    PTR_ADDI    KK,     KK,     0x02 // kk += 2
+    PTR_ADD     AA,     AA,     T0 // aa += 2 * k
+.L_N2_M1:
+    andi        I,      M,      1
+    beqz        I,      .L_N2_M0
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_1x2
+    PTR_SLLI    T0,     K,      3
+    PTR_ADDI    CC,     CC,     0x08 // cc += 1
+    PTR_ADDI    KK,     KK,     0x01 // kk += 1
+    PTR_ADD     AA,     AA,     T0 // aa += 1 * k
+.L_N2_M0:
+    // End of the 2-column panel: advance B and C.
+    PTR_SLLI    T0,     K,      4
+    PTR_SLLI    T1,     LDC,    1
+    PTR_ADD     B,      B,      T0 // b += 2 * k
+    PTR_ADD     C,      C,      T1 // c += 2 * ldc
+.L_N1:
+    // Handle the last single column if bit 0 of the remaining N is set.
+    andi    J,      N,      1
+    beq     ZERO,   J,      .L_N0
+
+    move        KK,     OFFSET
+    move        AA,     A
+    move        CC,     C
+    PTR_SRAI    I,      M,      4 // M >> 4
+    beqz        I,      .L_N1_M15
+.align 4
+.L_N1_I1:
+    GADD , d, C0, CC, ZERO
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_16x1
+    PTR_ADDI    I,      I,      -1
+    PTR_SLLI    T0,     K,      7
+    PTR_ADDI    CC,     CC,     0x80 // cc += 16
+    PTR_ADDI    KK,     KK,     0x10 // kk += 16
+    PTR_ADD     AA,     AA,     T0 // aa += 16 * k
+    bnez        I,      .L_N1_I1
+.L_N1_M15:
+    andi        I,      M,      8
+    beqz        I,      .L_N1_M7
+.L_N1_M8:
+    GADD , d, C0, CC, ZERO
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_8x1
+    PTR_SLLI    T0,     K,      6
+    PTR_ADDI    CC,     CC,     0x40 // cc += 8
+    PTR_ADDI    KK,     KK,     0x08 // kk += 8
+    PTR_ADD     AA,     AA,     T0 // aa += 8 * k
+.L_N1_M7:
+    andi        I,      M,      4
+    beqz        I,      .L_N1_M3
+.L_N1_M4:
+    GADD , d, C0, CC, ZERO
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_4x1
+    PTR_SLLI    T0,     K,      5
+    PTR_ADDI    CC,     CC,     0x20 // cc += 4
+    PTR_ADDI    KK,     KK,     0x04 // kk += 4
+    PTR_ADD     AA,     AA,     T0 // aa += 4 * k
+.L_N1_M3:
+    andi        I,      M,      2
+    beqz        I,      .L_N1_M1
+.L_N1_M2:
+    GADD , d, C0, CC, ZERO
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_2x1
+    PTR_SLLI    T0,     K,      4
+    PTR_ADDI    CC,     CC,     0x10 // cc += 2
+    PTR_ADDI    KK,     KK,     0x02 // kk += 2
+    PTR_ADD     AA,     AA,     T0 // aa += 2 * k
+.L_N1_M1:
+    andi        I,      M,      1
+    beqz        I,      .L_N1_M0
+    GADD , d, C0, CC, ZERO
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_1x1
+    PTR_SLLI    T0,     K,      3
+    PTR_ADDI    CC,     CC,     0x08 // cc += 1
+    PTR_ADDI    KK,     KK,     0x01 // kk += 1
+    PTR_ADD     AA,     AA,     T0 // aa += 1 * k
+.L_N1_M0:
+.L_N0:
+    pop_if_used 26, 32
+    // Return: jump to the address in $r1, discarding the link value into $r0.
+    jirl    $r0, $r1, 0x0
+    EPILOGUE
diff --git a/kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S
new file mode 100644
index 000000000..421339736
--- /dev/null
+++ b/kernel/loongarch64/dtrsm_kernel_RN_16x4_lasx.S
@@ -0,0 +1,882 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2023/09/26 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*
+*
+*********************************************************************/
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
+ *          FLOAT *c, BLASLONG ldc, BLASLONG offset)
+ */
+
+/* Integer argument registers. Parameter 4 (dummy1) is a FLOAT and has no
+ * integer-register name here. */
+#define M      $r4   // param 1: bm
+#define N      $r5   // param 2: bn
+#define K      $r6   // param 3: bk
+#define A      $r7   // param 5: ba
+#define B      $r8   // param 6: bb
+#define C      $r9   // param 7: bc
+#define LDC    $r10  // param 8: ldc
+#define OFFSET $r11  // param 9: offset
+
+/* Cycle control parameters */
+#define I      $r13
+#define J      $r14
+#define L      $r15
+#define TL     $r16
+/* Matrix address */
+#define A0     $r17
+#define B0     $r18
+/* C0..C3: column pointers of the current C tile */
+#define C0     $r19
+#define C1     $r20
+#define C2     $r23
+#define C3     $r24
+/* T0..T2: scratch registers */
+#define T0     $r25
+#define T1     $r26
+#define T2     $r27
+/* KK is initialised to -OFFSET in the prologue; AA / CC are the per-panel
+ * working copies of A / C. */
+#define KK     $r28
+#define AA     $r29
+#define CC     $r30
+#define BB     B0
+/* ZERO is redefined here as the hard-wired zero register $r0. */
+#undef  ZERO
+#define ZERO   $r0
+
+/* Vector register names.
+ *   U0..U15  = $xr0..$xr15
+ *   D0..D15  = $xr16..$xr31
+ *   G0..G15  = aliases of D0..D15
+ * The helper macros below take raw register numbers, so e.g. register
+ * number 16 means D0 and number 25 means D9. */
+#define U0     $xr0
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define U8     $xr8
+#define U9     $xr9
+#define U10    $xr10
+#define U11    $xr11
+#define U12    $xr12
+#define U13    $xr13
+#define U14    $xr14
+#define U15    $xr15
+#define D0     $xr16
+#define D1     $xr17
+#define D2     $xr18
+#define D3     $xr19
+#define D4     $xr20
+#define D5     $xr21
+#define D6     $xr22
+#define D7     $xr23
+#define D8     $xr24
+#define D9     $xr25
+#define D10    $xr26
+#define D11    $xr27
+#define D12    $xr28
+#define D13    $xr29
+#define D14    $xr30
+#define D15    $xr31
+#define G0     D0
+#define G1     D1
+#define G2     D2
+#define G3     D3
+#define G4     D4
+#define G5     D5
+#define G6     D6
+#define G7     D7
+#define G8     D8
+#define G9     D9
+#define G10    D10
+#define G11    D11
+#define G12    D12
+#define G13    D13
+#define G14    D14
+#define G15    D15
+
+/* Prefetch interval */
+#define A_PRE  0x400
+#define B_PRE  0x100
+
+#include "dtrsm_kernel_macro.S"
+
+// ldrepl_macro start, end, stride
+// Recursively emits one GLDREPL per register number in start..end. Register
+// number start + i is loaded from B0 + (stride + i) * 8, i.e. from
+// consecutive doubles of B beginning at element index stride.
+// GLDREPL is defined elsewhere; presumably it loads one double and replicates
+// it to every lane -- confirm. The percent-expression arguments presumably
+// rely on altmacro mode being enabled by an included header -- confirm.
+.macro ldrepl_macro start, end, stride
+// Load $xr<start> ... $xr<end> (callers in this file pass 16..25, i.e. D0..D9)
+.if \start <= \end
+    GLDREPL xv, d, $xr\start, B0, \stride * 8
+    ldrepl_macro %start + 1, \end, %stride + 1
+.endif
+.endm
+
+// nmsub_macro start0, end0, start1, reg
+// Recursively emits one xvfnmsub.d per register number in start0..end0.
+// The source register number start1 advances in step with start0, and reg is
+// the same multiplier for every step.
+.macro nmsub_macro start0, end0, start1, reg
+// $xr<start0 + i> -= reg * $xr<start1 + i>
+.if \start0 <= \end0
+    xvfnmsub.d  $xr\start0, \reg, $xr\start1, $xr\start0
+    nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg
+.endif
+.endm
+
+// A_st_macro start, end, stride, N
+// Recursively stores register numbers start..end to consecutive slots at A0.
+// N is the number of doubles written per register and selects the store:
+//   N == 4: xvst  (32 bytes per slot, full $xr register)
+//   N == 2: vst   (16 bytes per slot, $vr register)
+//   N == 1: fst.d ( 8 bytes per slot, $f register)
+// stride is the index of the first slot and advances by one per register.
+.macro A_st_macro start, end, stride, N
+// Store Ux(x = 0...15)
+.if \start <= \end
+.if \N == 4
+    xvst    $xr\start, A0, \stride * 0x20
+.elseif \N == 2
+    vst     $vr\start, A0, \stride * 0x10
+.elseif \N == 1
+    fst.d   $f\start, A0, \stride * 0x08
+.endif
+    A_st_macro %start + 1, \end, %stride + 1, \N
+.endif
+.endm
+
+// dsolve_16x4: solve a 16x4 tile held in U0..U15 (four registers per column:
+// U0..U3, U4..U7, U8..U11, U12..U15) against the 4x4 triangular block of B.
+// B elements are broadcast into D registers as follows:
+//   D0..D3 = b[0..3], D4..D6 = b[5..7], D7..D8 = b[10..11], D9 = b[15]
+// Each column is first reduced by the already solved columns and then scaled
+// by its diagonal entry (a multiply, so the diagonal is presumably stored
+// pre-inverted -- confirm). Loads are interleaved with the arithmetic.
+// The solved tile is written to A0 and to C0..C3.
+.macro dsolve_16x4
+// We are going to process matrix B with a size of 4x4,
+// using only the upper triangular portion. The memory layout of
+// matrix B is as follows:
+//0	1	2	3
+//	5	6	7
+//		10	11
+//			15
+// Sequentially extract data from B in row order
+    ldrepl_macro 16, 19, 0
+    // Column 0: scale by b[0]
+    GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
+    ldrepl_macro 20, 22, 5
+    // Column 1: subtract b[1] * column 0, then scale by b[5]
+    nmsub_macro 4, 7, 0, D1
+    ldrepl_macro 23, 24, 10
+    GMUL xvf, d, U4, D4, U4, U5, D4, U5, U6, D4, U6, U7, D4, U7
+    ldrepl_macro 25, 25, 15
+    // Column 2: subtract b[2] * column 0 and b[6] * column 1, scale by b[10]
+    nmsub_macro 8, 11, 0, D2
+    nmsub_macro 8, 11, 4, D5
+    GMUL xvf, d, U8, D7, U8, U9, D7, U9, U10, D7, U10, U11, D7, U11
+    // Column 3: subtract b[3], b[7], b[11] times columns 0, 1, 2, scale by b[15]
+    nmsub_macro 12, 15, 0, D3
+    nmsub_macro 12, 15, 4, D6
+    nmsub_macro 12, 15, 8, D8
+    GMUL xvf, d, U12, D9, U12, U13, D9, U13, U14, D9, U14, U15, D9, U15
+// Store A
+    A_st_macro 0, 15, 0, 4
+// Store C
+    GST xv, , U0,  C0, 0x00, U1,  C0, 0x20, U2,  C0, 0x40, U3,  C0, 0x60, \
+              U4,  C1, 0x00, U5,  C1, 0x20, U6,  C1, 0x40, U7,  C1, 0x60, \
+              U8,  C2, 0x00, U9,  C2, 0x20, U10, C2, 0x40, U11, C2, 0x60, \
+              U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60
+.endm
+
+// dsolve_16x2: solve a 16x2 tile held in U0..U7 (U0..U3 = column 0,
+// U4..U7 = column 1) against the 2x2 triangular block of B.
+// D0..D1 = b[0..1], D2 = b[3]. The result is written to A0 and to C0/C1.
+.macro dsolve_16x2
+// We are going to process matrix B with a size of 2x2,
+// using only the upper triangular portion. The memory layout of
+// matrix B is as follows:
+//0	1
+//	3
+// Sequentially extract data from B in row order
+    ldrepl_macro 16, 17, 0
+    // Column 0: scale by b[0]
+    GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
+    ldrepl_macro 18, 18, 3
+    // Column 1: subtract b[1] * column 0, then scale by b[3]
+    nmsub_macro 4, 7, 0, D1
+    GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7
+// Store A
+    A_st_macro 0, 7, 0, 4
+// Store C
+    GST xv, , U0,  C0, 0x00, U1,  C0, 0x20, U2,  C0, 0x40, U3,  C0, 0x60, \
+              U4,  C1, 0x00, U5,  C1, 0x20, U6,  C1, 0x40, U7,  C1, 0x60
+.endm
+
+// dsolve_8x4: solve an 8x4 tile held in U0..U7 (two registers per column:
+// U0..U1, U2..U3, U4..U5, U6..U7) against the 4x4 triangular block of B.
+//   D0..D3 = b[0..3], D4..D6 = b[5..7], D7..D8 = b[10..11], D9 = b[15]
+// The result is written to A0 and to C0..C3.
+.macro dsolve_8x4
+// We are going to process matrix B with a size of 4x4,
+// using only the upper triangular portion. The memory layout of
+// matrix B is as follows:
+//0	1	2	3
+//	5	6	7
+//		10	11
+//			15
+// Sequentially extract data from B in row order
+    ldrepl_macro 16, 19, 0
+    // Column 0: scale by b[0]
+    GMUL xvf, d, U0, D0, U0, U1, D0, U1
+    ldrepl_macro 20, 22, 5
+    // Column 1: subtract b[1] * column 0, then scale by b[5]
+    nmsub_macro 2, 3, 0, D1
+    ldrepl_macro 23, 24, 10
+    GMUL xvf, d, U2, D4, U2, U3, D4, U3
+    ldrepl_macro 25, 25, 15
+    // Column 2: subtract b[2] * column 0 and b[6] * column 1, scale by b[10]
+    nmsub_macro 4, 5, 0, D2
+    nmsub_macro 4, 5, 2, D5
+    GMUL xvf, d, U4, D7, U4, U5, D7, U5
+    // Column 3: subtract b[3], b[7], b[11] times columns 0, 1, 2, scale by b[15]
+    nmsub_macro 6, 7, 0, D3
+    nmsub_macro 6, 7, 2, D6
+    nmsub_macro 6, 7, 4, D8
+    GMUL xvf, d, U6, D9, U6, U7, D9, U7
+// Store A
+    A_st_macro 0, 7, 0, 4
+// Store C
+    GST xv, , U0, C0, 0x00, U1, C0, 0x20, \
+              U2, C1, 0x00, U3, C1, 0x20, \
+              U4, C2, 0x00, U5, C2, 0x20, \
+              U6, C3, 0x00, U7, C3, 0x20
+.endm
+
+// dsolve_8x2: solve an 8x2 tile held in U0..U3 (U0..U1 = column 0,
+// U2..U3 = column 1) against the 2x2 triangular block of B.
+// D0..D1 = b[0..1], D2 = b[3]. The result is written to A0 and to C0/C1.
+.macro dsolve_8x2
+// We are going to process matrix B with a size of 2x2,
+// using only the upper triangular portion. The memory layout of
+// matrix B is as follows:
+//0	1
+//	3
+// Sequentially extract data from B in row order
+    ldrepl_macro 16, 17, 0
+    // Column 0: scale by b[0]
+    GMUL xvf, d, U0, D0, U0, U1, D0, U1
+    ldrepl_macro 18, 18, 3
+    // Column 1: subtract b[1] * column 0, then scale by b[3]
+    nmsub_macro 2, 3, 0, D1
+    GMUL xvf, d, U2, D2, U2, U3, D2, U3
+// Store A
+    A_st_macro 0, 3, 0, 4
+// Store C
+    GST xv, , U0,  C0, 0x00, U1,  C0, 0x20, \
+              U2,  C1, 0x00, U3,  C1, 0x20
+.endm
+
+// dsolve_4x4: solve a 4x4 tile held in U0..U3 (one register per column)
+// against the 4x4 triangular block of B.
+//   D0..D3 = b[0..3], D4..D6 = b[5..7], D7..D8 = b[10..11], D9 = b[15]
+// The result is written to A0 and to C0..C3.
+.macro dsolve_4x4
+// We are going to process matrix B with a size of 4x4,
+// using only the upper triangular portion. The memory layout of
+// matrix B is as follows:
+//0	1	2	3
+//	5	6	7
+//		10	11
+//			15
+// Sequentially extract data from B in row order
+    ldrepl_macro 16, 19, 0
+    // Column 0: scale by b[0]
+    GMUL xvf, d, U0, D0, U0
+    ldrepl_macro 20, 22, 5
+    // Column 1: subtract b[1] * column 0, then scale by b[5]
+    nmsub_macro 1, 1, 0, D1
+    ldrepl_macro 23, 24, 10
+    GMUL xvf, d, U1, D4, U1
+    ldrepl_macro 25, 25, 15
+    // Column 2: subtract b[2] * column 0 and b[6] * column 1, scale by b[10]
+    nmsub_macro 2, 2, 0, D2
+    nmsub_macro 2, 2, 1, D5
+    GMUL xvf, d, U2, D7, U2
+    // Column 3: subtract b[3], b[7], b[11] times columns 0, 1, 2, scale by b[15]
+    nmsub_macro 3, 3, 0, D3
+    nmsub_macro 3, 3, 1, D6
+    nmsub_macro 3, 3, 2, D8
+    GMUL xvf, d, U3, D9, U3
+// Store A
+    A_st_macro 0, 3, 0, 4
+// Store C
+    GST xv, , U0, C0, 0x00, U1, C1, 0x00, U2, C2, 0x00, U3, C3, 0x00
+.endm
+
+// dsolve_4x2: solve a 4x2 tile held in U0/U1 (one register per column)
+// against the 2x2 triangular block of B.
+// D0..D1 = b[0..1], D2 = b[3]. The result is written to A0 and to C0/C1.
+.macro dsolve_4x2
+// We are going to process matrix B with a size of 2x2,
+// using only the upper triangular portion. The memory layout of
+// matrix B is as follows:
+//0	1
+//	3
+// Sequentially extract data from B in row order
+    ldrepl_macro 16, 17, 0
+    // Column 0: scale by b[0]
+    GMUL xvf, d, U0, D0, U0
+    ldrepl_macro 18, 18, 3
+    // Column 1: subtract b[1] * column 0, then scale by b[3]
+    nmsub_macro 1, 1, 0, D1
+    GMUL xvf, d, U1, D2, U1
+// Store A
+    A_st_macro 0, 1, 0, 4
+// Store C
+    GST xv, , U0, C0, 0x00, U1, C1, 0x00
+.endm
+
+// dsolve_2x4: solve a 2x4 tile held in the low 128 bits of U0..U3 (one
+// register per column) against the 4x4 triangular block of B.
+//   D0..D3 = b[0..3], D4..D6 = b[5..7], D7..D8 = b[10..11], D9 = b[15]
+// The arithmetic runs on full 256-bit registers, but only the low two
+// doubles of each column are stored (vst), to A0 and to C0..C3.
+.macro dsolve_2x4
+// We are going to process matrix B with a size of 4x4,
+// using only the upper triangular portion. The memory layout of
+// matrix B is as follows:
+//0	1	2	3
+//	5	6	7
+//		10	11
+//			15
+// Sequentially extract data from B in row order
+    ldrepl_macro 16, 19, 0
+    // Column 0: scale by b[0]
+    GMUL xvf, d, U0, D0, U0
+    ldrepl_macro 20, 22, 5
+    // Column 1: subtract b[1] * column 0, then scale by b[5]
+    nmsub_macro 1, 1, 0, D1
+    ldrepl_macro 23, 24, 10
+    GMUL xvf, d, U1, D4, U1
+
+    ldrepl_macro 25, 25, 15
+    // Column 2: subtract b[2] * column 0 and b[6] * column 1, scale by b[10]
+    nmsub_macro 2, 2, 0, D2
+    nmsub_macro 2, 2, 1, D5
+    GMUL xvf, d, U2, D7, U2
+    // Column 3: subtract b[3], b[7], b[11] times columns 0, 1, 2, scale by b[15]
+    nmsub_macro 3, 3, 0, D3
+    nmsub_macro 3, 3, 1, D6
+    nmsub_macro 3, 3, 2, D8
+    GMUL xvf, d, U3, D9, U3
+// Store A
+    A_st_macro 0, 3, 0, 2
+// Store C
+    GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00, $vr2, C2, 0x00, $vr3, C3, 0x00
+.endm
+
+// dsolve_2x2: solve a 2x2 tile held in the low 128 bits of U0/U1 (one
+// register per column) against the 2x2 triangular block of B.
+// D0..D1 = b[0..1], D2 = b[3]. Only the low two doubles of each column are
+// stored (vst), to A0 and to C0/C1.
+.macro dsolve_2x2
+// We are going to process matrix B with a size of 2x2,
+// using only the upper triangular portion. The memory layout of
+// matrix B is as follows:
+//0	1
+//	3
+// Sequentially extract data from B in row order
+    ldrepl_macro 16, 17, 0
+    // Column 0: scale by b[0]
+    GMUL xvf, d, U0, D0, U0
+    ldrepl_macro 18, 18, 3
+    // Column 1: subtract b[1] * column 0, then scale by b[3]
+    nmsub_macro 1, 1, 0, D1
+    GMUL xvf, d, U1, D2, U1
+// Store A
+    A_st_macro 0, 1, 0, 2
+// Store C
+    GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00
+.endm
+
+// dsolve_1x4: solve a 1x4 tile held in lane 0 of U0..U3 (one register per
+// column) against the 4x4 triangular block of B.
+//   D0..D3 = b[0..3], D4..D6 = b[5..7], D7..D8 = b[10..11], D9 = b[15]
+// The arithmetic runs on full 256-bit registers, but only the low double of
+// each column is stored (fst.d), to A0 and to C0..C3.
+.macro dsolve_1x4
+// We are going to process matrix B with a size of 4x4,
+// using only the upper triangular portion. The memory layout of
+// matrix B is as follows:
+//0	1	2	3
+//	5	6	7
+//		10	11
+//			15
+// Sequentially extract data from B in row order
+    ldrepl_macro 16, 19, 0
+    // Column 0: scale by b[0]
+    GMUL xvf, d, U0, D0, U0
+    ldrepl_macro 20, 22, 5
+    // Column 1: subtract b[1] * column 0, then scale by b[5]
+    nmsub_macro 1, 1, 0, D1
+    ldrepl_macro 23, 24, 10
+    GMUL xvf, d, U1, D4, U1
+
+    ldrepl_macro 25, 25, 15
+    // Column 2: subtract b[2] * column 0 and b[6] * column 1, scale by b[10]
+    nmsub_macro 2, 2, 0, D2
+    nmsub_macro 2, 2, 1, D5
+    GMUL xvf, d, U2, D7, U2
+    // Column 3: subtract b[3], b[7], b[11] times columns 0, 1, 2, scale by b[15]
+    nmsub_macro 3, 3, 0, D3
+    nmsub_macro 3, 3, 1, D6
+    nmsub_macro 3, 3, 2, D8
+    GMUL xvf, d, U3, D9, U3
+// Store A
+    A_st_macro 0, 3, 0, 1
+// Store C
+    GST f, d, $f0, C0, 0x00, $f1, C1, 0x00, $f2, C2, 0x00, $f3, C3, 0x00
+.endm
+
+// dsolve_1x2: solve a 1x2 tile held in lane 0 of U0/U1 (one register per
+// column) against the 2x2 triangular block of B.
+// D0..D1 = b[0..1], D2 = b[3]. Only the low double of each column is stored
+// (fst.d), to A0 and to C0/C1.
+.macro dsolve_1x2
+// We are going to process matrix B with a size of 2x2,
+// using only the upper triangular portion. The memory layout of
+// matrix B is as follows:
+//0	1
+//	3
+// Sequentially extract data from B in row order
+    ldrepl_macro 16, 17, 0
+    // Column 0: scale by b[0]
+    GMUL xvf, d, U0, D0, U0
+    ldrepl_macro 18, 18, 3
+    // Column 1: subtract b[1] * column 0, then scale by b[3]
+    nmsub_macro 1, 1, 0, D1
+    GMUL xvf, d, U1, D2, U1
+// Store A
+    A_st_macro 0, 1, 0, 1
+// Store C
+    GST f, d, $f0, C0, 0x00, $f1, C1, 0x00
+.endm
+
+// dgemm_dsolve_16x4: obtain the 16x4 tile of C in U0..U15 and solve it.
+// L > 0: run dgemm_16x4 (defined elsewhere; presumably it leaves its result in
+// U0..U15 -- confirm). L <= 0: load the tile straight from C0..C3.
+.macro dgemm_dsolve_16x4
+    bge   ZERO, L,	.L_dsolve_16x4_load
+    dgemm_16x4
+    b	.L_dsolve_16x4
+.L_dsolve_16x4_load:
+    // Load C
+    GLD xv, , U0,  C0, 0x00, U1,  C0, 0x20, U2,  C0, 0x40, U3,  C0, 0x60
+    GLD xv, , U4,  C1, 0x00, U5,  C1, 0x20, U6,  C1, 0x40, U7,  C1, 0x60
+    GLD xv, , U8,  C2, 0x00, U9,  C2, 0x20, U10, C2, 0x40, U11, C2, 0x60
+    GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60
+/********************** solver ******************/
+.L_dsolve_16x4:
+    dsolve_16x4
+.endm
+
+// dgemm_dsolve_8x4: obtain the 8x4 tile of C in U0..U7 and solve it.
+// L > 0: run dgemm_8x4 (defined elsewhere). L <= 0: load the tile from C0..C3.
+.macro dgemm_dsolve_8x4
+    bge   ZERO, L,	.L_dsolve_8x4_load
+    dgemm_8x4
+    b .L_dsolve_8x4
+.L_dsolve_8x4_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+
+    /* Load C1  */
+    xvld      U2,  C1,  0x00
+    xvld      U3,  C1,  0x20
+
+    /* Load C2  */
+    xvld      U4,  C2,  0x00
+    xvld      U5,  C2,  0x20
+
+    /* Load C3  */
+    xvld      U6,  C3,  0x00
+    xvld      U7,  C3,  0x20
+/********* solver *********/
+.L_dsolve_8x4:
+    dsolve_8x4
+.endm
+
+// dgemm_dsolve_4x4: obtain the 4x4 tile of C in U0..U3 and solve it.
+// L > 0: run dgemm_4x4 (defined elsewhere). L <= 0: load the tile from C0..C3.
+.macro dgemm_dsolve_4x4
+    bge   ZERO, L,    .L_dsolve_4x4_load
+    dgemm_4x4
+    b .L_dsolve_4x4
+.L_dsolve_4x4_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+    /* Load C2  */
+    xvld      U2,  C2,  0x00
+    /* Load C3  */
+    xvld      U3,  C3,  0x00
+/************** solver *****************/
+.L_dsolve_4x4:
+    dsolve_4x4
+.endm
+
+// dgemm_dsolve_2x4: obtain the 2x4 tile of C in the low halves of U0..U3 and
+// solve it.
+// L > 0: run dgemm_2x4 (defined elsewhere), then unpack its result so that
+// each column sits in its own register; presumably dgemm_2x4 returns columns
+// 2 and 3 in the upper 128 bits of U0 and U1 -- confirm.
+// L <= 0: load the four columns directly from C0..C3.
+// NOTE(review): xvld reads 32 bytes per column although this tile has only
+// 2 rows (16 bytes); confirm the extra read always stays inside the C buffer.
+.macro dgemm_dsolve_2x4
+    bge   ZERO, L,    .L_dsolve_2x4_load
+    dgemm_2x4
+    xvpermi.q   U2,     U0,     0x01
+    xvpermi.q   U3,     U1,     0x01
+    b   .L_dsolve_2x4
+.L_dsolve_2x4_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+    /* Load C2  */
+    xvld      U2,  C2,  0x00
+    /* Load C3  */
+    xvld      U3,  C3,  0x00
+/********************** solver ******************/
+.L_dsolve_2x4:
+    dsolve_2x4
+.endm
+
+// dgemm_dsolve_1x4: obtain the 1x4 tile of C in lane 0 of U0..U3 and solve it.
+// L > 0: run dgemm_1x4 (defined elsewhere), then spread its result over
+// U0..U3; presumably dgemm_1x4 returns the four values in the lanes of U0 and
+// the three shuffles move lanes 1, 2, 3 into lane 0 of U1, U2, U3 -- confirm.
+// L <= 0: load one double from each of C0..C3 into $f0..$f3.
+.macro dgemm_dsolve_1x4
+    bge   ZERO, L,    .L_dsolve_1x4_load
+    dgemm_1x4
+    xvpackod.d  U1,     U0,     U0
+    xvpermi.q   U2,     U0,     0x01
+    xvpermi.q   U3,     U1,     0x01
+    b   .L_dsolve_1x4
+.L_dsolve_1x4_load:
+    // Load C
+    fld.d       $f0,    C0,     0x00
+    fld.d       $f1,    C1,     0x00
+    fld.d       $f2,    C2,     0x00
+    fld.d       $f3,    C3,     0x00
+.L_dsolve_1x4:
+    dsolve_1x4
+.endm
+
+// dgemm_dsolve_16x2: obtain the 16x2 tile of C in U0..U7 and solve it.
+// L > 0: run dgemm_16x2 (defined elsewhere). L <= 0: load the tile from C0/C1.
+.macro dgemm_dsolve_16x2
+    bge   ZERO, L,	.L_dsolve_16x2_load
+    dgemm_16x2
+    b .L_dsolve_16x2
+.L_dsolve_16x2_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+    xvld      U2,  C0,  0x40
+    xvld      U3,  C0,  0x60
+    /* Load C1  */
+    xvld      U4,  C1,  0x00
+    xvld      U5,  C1,  0x20
+    xvld      U6,  C1,  0x40
+    xvld      U7,  C1,  0x60
+.L_dsolve_16x2:
+    dsolve_16x2
+.endm
+
+// dgemm_dsolve_8x2: obtain the 8x2 tile of C in U0..U3 and solve it.
+// L > 0: run dgemm_8x2 (defined elsewhere). L <= 0: load the tile from C0/C1.
+.macro dgemm_dsolve_8x2
+    bge   ZERO, L,	.L_dsolve_8x2_load
+    dgemm_8x2
+    b .L_dsolve_8x2
+.L_dsolve_8x2_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+    /* Load C1  */
+    xvld      U2,  C1,  0x00
+    xvld      U3,  C1,  0x20
+.L_dsolve_8x2:
+    dsolve_8x2
+.endm
+
+// dgemm_dsolve_4x2: obtain the 4x2 tile of C in U0/U1 and solve it.
+// L > 0: run dgemm_4x2 (defined elsewhere). L <= 0: load the tile from C0/C1.
+.macro dgemm_dsolve_4x2
+    bge   ZERO, L,	.L_dsolve_4x2_load
+    dgemm_4x2
+    b .L_dsolve_4x2
+.L_dsolve_4x2_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+.L_dsolve_4x2:
+    dsolve_4x2
+.endm
+
+// dgemm_dsolve_2x2: obtain the 2x2 tile of C in U0/U1 and solve it.
+// L > 0: run dgemm_2x2 (defined elsewhere). L <= 0: load the tile from C0/C1.
+// NOTE(review): xvld reads 32 bytes per column although this tile has only
+// 2 rows (16 bytes); confirm the extra read always stays inside the C buffer.
+.macro dgemm_dsolve_2x2
+    bge   ZERO, L,	.L_dsolve_2x2_load
+    dgemm_2x2
+    b .L_dsolve_2x2
+.L_dsolve_2x2_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+.L_dsolve_2x2:
+    dsolve_2x2
+.endm
+
+// dgemm_dsolve_1x2: obtain the 1x2 tile of C in lane 0 of U0/U1 and solve it.
+// L > 0: run dgemm_1x2 (defined elsewhere), then copy the odd lanes of U0 into
+// the even lanes of U1; presumably dgemm_1x2 returns both values in lanes 0
+// and 1 of U0 -- confirm.
+// L <= 0: load one double from each of C0/C1 into $f0/$f1.
+.macro dgemm_dsolve_1x2
+    bge   ZERO, L,    .L_dsolve_1x2_load
+    dgemm_1x2
+    xvpackod.d  U1,     U0,     U0
+    b   .L_dsolve_1x2
+.L_dsolve_1x2_load:
+    // Load C
+    fld.d       $f0,    C0,     0x00
+    fld.d       $f1,    C1,     0x00
+.L_dsolve_1x2:
+    dsolve_1x2
+.endm
+
+// dgemm_dsolve_16x1: obtain the 16x1 tile of C in U0..U3 and solve it inline.
+// L > 0: run dgemm_16x1 (defined elsewhere). L <= 0: load the tile from C0.
+// With a single column the solve is one multiply by the broadcast b[0].
+.macro dgemm_dsolve_16x1
+    bge   ZERO, L,	.L_dsolve_16x1_load
+    dgemm_16x1
+    b .L_dsolve_16x1
+.L_dsolve_16x1_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+    xvld      U2,  C0,  0x40
+    xvld      U3,  C0,  0x60
+.L_dsolve_16x1:
+    // D0 = b[0]
+    ldrepl_macro 16, 16, 0
+    GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
+    // Store A
+    A_st_macro 0, 3, 0, 4
+    // Store C
+    GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60
+.endm
+
+// dgemm_dsolve_8x1: obtain the 8x1 tile of C in U0/U1 and solve it inline.
+// L > 0: run dgemm_8x1 (defined elsewhere). L <= 0: load the tile from C0.
+// With a single column the solve is one multiply by the broadcast b[0].
+.macro dgemm_dsolve_8x1
+    bge   ZERO, L,	.L_dsolve_8x1_load
+    dgemm_8x1
+    b .L_dsolve_8x1
+.L_dsolve_8x1_load:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+.L_dsolve_8x1:
+    // D0 = b[0]
+    ldrepl_macro 16, 16, 0
+    GMUL xvf, d, U0, D0, U0, U1, D0, U1
+    // Store A
+    A_st_macro 0, 1, 0, 4
+    // Store C
+    GST xv, , U0, C0, 0x00, U1, C0, 0x20
+.endm
+
+.macro dgemm_dsolve_4x1
+    bge   ZERO, L,	.L_dsolve_4x1_load // L <= 0: skip the GEMM update
+    dgemm_4x1
+    b .L_dsolve_4x1
+.L_dsolve_4x1_load:
+    /* No update ran: read the 4 doubles at C0 into U0 */
+    xvld      U0,  C0,  0x00
+.L_dsolve_4x1:
+    ldrepl_macro 16, 16, 0 // defined outside this view; presumably fills D0 from B0 - confirm
+    GMUL xvf, d, U0, D0, U0
+    // Store A
+    A_st_macro 0, 0, 0, 4
+    // Store C
+    GST xv, , U0, C0, 0x00
+.endm
+
+.macro dgemm_dsolve_2x1
+    bge   ZERO, L,	.L_dsolve_2x1_load // L <= 0: skip the GEMM update
+    dgemm_2x1
+    b .L_dsolve_2x1
+.L_dsolve_2x1_load:
+    /* Load C0. NOTE(review): xvld reads 32 bytes but this tile is 2 doubles (16 bytes) - confirm the over-read is safe */
+    xvld      U0,  C0,  0x00
+.L_dsolve_2x1:
+    ldrepl_macro 16, 16, 0 // defined outside this view; presumably fills D0 from B0 - confirm
+    GMUL xvf, d, U0, D0, U0
+    // Store A
+    A_st_macro 0, 0, 0, 2
+    // Store C (128-bit store of $vr0 only)
+    GST v, , $vr0, C0, 0x00
+.endm
+
+.macro dgemm_dsolve_1x1
+    bge   ZERO, L,    .L_dsolve_1x1_load // L <= 0: skip the GEMM update
+    dgemm_1x1
+    b .L_dsolve_1x1
+.L_dsolve_1x1_load:
+    // No update ran: read the single double at C0
+    fld.d       $f0,    C0,     0x00
+.L_dsolve_1x1:
+    ldrepl_macro 16, 16, 0 // defined outside this view; presumably fills D0 from B0 - confirm
+    GMUL xvf, d, U0, D0, U0
+    // Store A
+    A_st_macro 0, 0, 0, 1
+    // Store C (scalar store of $f0 only)
+    GST f, d, $f0, C0, 0x00
+.endm
+
+    PROLOGUE
+    push_if_used 26, 32
+    PTR_SLLI   LDC,   LDC,   3 // ldc *= 8 (presumably elements -> bytes for doubles)
+    PTR_SUB    KK,    ZERO,  OFFSET // kk = -offset
+    /* if (!(N >> 2)) goto L_N3 */
+    PTR_SRAI   J,     N,     2     /* J = bn >> 2 */
+    andi       N,     N,     0x03 // keep only n % 4; the .L_N3 and .L_N1 tails test its bits
+    beq        ZERO,  J,     .L_N3
+.align 5
+.L_J1:
+    PTR_ADDI    J,      J,     -1
+    move        AA,     A
+    move        CC,     C
+    PTR_SRAI    I,      M,      4 // M >> 4
+    beqz        I,      .L_M15
+.align 4
+.L_I1:
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_16x4
+    PTR_ADDI    I,      I,      -1
+    PTR_SLLI    T0,     K,      7
+    PTR_ADDI    CC,     CC,     0x80 // cc += 16
+    PTR_ADD     AA,     AA,     T0 // aa += 16 * k
+    bnez        I,      .L_I1
+.L_M15:
+    andi        I,      M,      8
+    beqz        I,      .L_M7
+.L_M8:
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_8x4
+    PTR_SLLI    T0,     K,      6
+    PTR_ADDI    CC,     CC,     0x40 // cc += 8
+    PTR_ADD     AA,     AA,     T0 // aa += 8 * k
+.L_M7:
+    andi        I,      M,      4
+    beqz        I,      .L_M3
+.L_M4:
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_4x4
+    PTR_SLLI    T0,     K,      5
+    PTR_ADDI    CC,     CC,     0x20 // cc += 4
+    PTR_ADD     AA,     AA,     T0 // aa += 4 * k
+.L_M3:
+    andi        I,      M,      2
+    beqz        I,      .L_M1
+.L_M2:
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_2x4
+    PTR_SLLI    T0,     K,      4
+    PTR_ADDI    CC,     CC,     0x10 // cc += 2
+    PTR_ADD     AA,     AA,     T0 // aa += 2 * k
+.L_M1:
+    andi        I,      M,      1
+    beqz        I,      .L_M0
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_1x4
+    PTR_SLLI    T0,     K,      3
+    PTR_ADDI    CC,     CC,     0x08 // cc += 1
+    PTR_ADD     AA,     AA,     T0 // aa += 1 * k
+.L_M0:
+    PTR_SLLI    T0,     K,      5
+    PTR_SLLI    T1,     LDC,    2
+    PTR_ADD     B,      B,      T0 // b += 4 * k
+    PTR_ADD     C,      C,      T1 // c += 4 * ldc
+    PTR_ADDI    KK,     KK,     4 // kk += 4
+    bnez        J,      .L_J1
+.L_N3:
+    andi    J,      N,      2
+    beq     ZERO,   J,      .L_N1
+.L_N2:
+    move        AA,     A
+    move        CC,     C
+    PTR_SRAI    I,      M,      4 // M >> 4
+    beqz        I,      .L_N2_M15
+.align 4
+.L_N2_I1:
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_16x2
+    PTR_ADDI    I,      I,      -1
+    PTR_SLLI    T0,     K,      7
+    PTR_ADDI    CC,     CC,     0x80 // cc += 16
+    PTR_ADD     AA,     AA,     T0 // aa += 16 * k
+    bnez        I,      .L_N2_I1
+.L_N2_M15:
+    andi        I,      M,      8
+    beqz        I,      .L_N2_M7
+.L_N2_M8:
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_8x2
+    PTR_SLLI    T0,     K,      6
+    PTR_ADDI    CC,     CC,     0x40 // cc += 8
+    PTR_ADD     AA,     AA,     T0 // aa += 8 * k
+.L_N2_M7:
+    andi        I,      M,      4
+    beqz        I,      .L_N2_M3
+.L_N2_M4:
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_4x2
+    PTR_SLLI    T0,     K,      5
+    PTR_ADDI    CC,     CC,     0x20 // cc += 4
+    PTR_ADD     AA,     AA,     T0 // aa += 4 * k
+.L_N2_M3:
+    andi        I,      M,      2
+    beqz        I,      .L_N2_M1
+.L_N2_M2:
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_2x2
+    PTR_SLLI    T0,     K,      4
+    PTR_ADDI    CC,     CC,     0x10 // cc += 2
+    PTR_ADD     AA,     AA,     T0 // aa += 2 * k
+.L_N2_M1:
+    andi        I,      M,      1
+    beqz        I,      .L_N2_M0
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_1x2
+    PTR_SLLI    T0,     K,      3
+    PTR_ADDI    CC,     CC,     0x08 // cc += 1
+    PTR_ADD     AA,     AA,     T0 // aa += 1 * k
+.L_N2_M0:
+    PTR_SLLI    T0,     K,      4
+    PTR_SLLI    T1,     LDC,    1
+    PTR_ADD     B,      B,      T0 // b += 2 * k
+    PTR_ADD     C,      C,      T1 // c += 2 * ldc
+    PTR_ADDI    KK,     KK,     2 // kk += 2
+.L_N1:
+    andi    J,      N,      1
+    beq     ZERO,   J,      .L_N0
+    move        AA,     A
+    move        CC,     C
+    PTR_SRAI    I,      M,      4 // M >> 4
+    beqz        I,      .L_N1_M15
+.align 4
+.L_N1_I1:
+    GADD , d, C0, CC, ZERO
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_16x1
+    PTR_ADDI    I,      I,      -1
+    PTR_SLLI    T0,     K,      7
+    PTR_ADDI    CC,     CC,     0x80 // cc += 16
+    PTR_ADD     AA,     AA,     T0 // aa += 16 * k
+    bnez        I,      .L_N1_I1
+.L_N1_M15:
+    andi        I,      M,      8
+    beqz        I,      .L_N1_M7
+.L_N1_M8:
+    GADD , d, C0, CC, ZERO
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_8x1
+    PTR_SLLI    T0,     K,      6
+    PTR_ADDI    CC,     CC,     0x40 // cc += 8
+    PTR_ADD     AA,     AA,     T0 // aa += 8 * k
+.L_N1_M7:
+    andi        I,      M,      4
+    beqz        I,      .L_N1_M3
+.L_N1_M4:
+    GADD , d, C0, CC, ZERO
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_4x1
+    PTR_SLLI    T0,     K,      5
+    PTR_ADDI    CC,     CC,     0x20 // cc += 4
+    PTR_ADD     AA,     AA,     T0 // aa += 4 * k
+.L_N1_M3:
+    andi        I,      M,      2
+    beqz        I,      .L_N1_M1
+.L_N1_M2:
+    GADD , d, C0, CC, ZERO
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_2x1
+    PTR_SLLI    T0,     K,      4
+    PTR_ADDI    CC,     CC,     0x10 // cc += 2
+    PTR_ADD     AA,     AA,     T0 // aa += 2 * k
+.L_N1_M1:
+    andi        I,      M,      1
+    beqz        I,      .L_N1_M0
+    GADD , d, C0, CC, ZERO
+    move        A0,     AA
+    move        B0,     B
+    move        L,      KK
+    dgemm_dsolve_1x1
+    PTR_SLLI    T0,     K,      3
+    PTR_ADDI    CC,     CC,     0x08 // cc += 1
+    PTR_ADD     AA,     AA,     T0 // aa += 1 * k
+.L_N1_M0:
+.L_N0:
+    pop_if_used 26, 32
+    jirl    $r0, $r1, 0x0
+    EPILOGUE
diff --git a/kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S b/kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S
new file mode 100644
index 000000000..5f86d75b5
--- /dev/null
+++ b/kernel/loongarch64/dtrsm_kernel_RT_16x4_lasx.S
@@ -0,0 +1,953 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2023/09/26 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*
+*
+*********************************************************************/
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
+ *          FLOAT *c, BLASLONG ldc, BLASLONG offset)
+ */
+#define M      $r4   // param 1: bm
+#define N      $r5   // param 2: bn
+#define K      $r6   // param 3: bk
+#define A      $r7   // param 5: ba
+#define B      $r8   // param 6: bb
+#define C      $r9   // param 7: bc
+#define LDC    $r10  // param 8: ldc
+#define OFFSET $r11  // param 9: offset
+
+/* Cycle control parameters */
+#define I      $r13
+#define J      $r14
+#define L      $r15
+#define TL     $r16
+/* Matrix address */
+#define A0     $r17
+#define B0     $r18
+#define C0     $r19
+#define C1     $r20
+#define C2     $r23
+#define C3     $r24
+#define T0     $r25
+#define T1     $r26
+#define T2     $r27
+#define KK     $r28
+#define AA     $r29
+#define CC     $r30
+#define BB     $r31
+#undef  ZERO
+#define ZERO   $r0
+
+#define U0     $xr0
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define U8     $xr8
+#define U9     $xr9
+#define U10    $xr10
+#define U11    $xr11
+#define U12    $xr12
+#define U13    $xr13
+#define U14    $xr14
+#define U15    $xr15
+#define D0     $xr16
+#define D1     $xr17
+#define D2     $xr18
+#define D3     $xr19
+#define D4     $xr20
+#define D5     $xr21
+#define D6     $xr22
+#define D7     $xr23
+#define D8     $xr24
+#define D9     $xr25
+#define D10    $xr26
+#define D11    $xr27
+#define D12    $xr28
+#define D13    $xr29
+#define D14    $xr30
+#define D15    $xr31
+
+/* Prefetch interval */
+#define A_PRE  0x400
+#define B_PRE  0x100
+
+#include "dtrsm_kernel_macro.S"
+
+.macro ldrepl_macro start, end, stride
+// Recursively fill $xr<start>..$xr<end> from consecutive doubles at B0, beginning at element <stride>. Visible callers pass 16..25, i.e. D0..D9.
+.if \start <= \end
+    GLDREPL xv, d, $xr\start, B0, \stride * 8 // byte offset = element index * 8
+    ldrepl_macro %start + 1, \end, %stride + 1 // next register, next double
+.endif
+.endm
+
+.macro nmsub_macro start0, end0, start1, reg
+// For each i: $xr<start0 + i> -= reg * $xr<start1 + i>, until start0 + i passes end0
+.if \start0 <= \end0
+    xvfnmsub.d  $xr\start0, \reg, $xr\start1, $xr\start0 // dst = dst - reg * src
+    nmsub_macro %start0 + 1, \end0, %start1 + 1, \reg
+.endif
+.endm
+
+.macro A_st_macro start, end, stride, width
+// Store $xr<start>..$xr<end> to consecutive slots at A0; width = doubles per store (4, 2 or 1). Not named N because N is a cpp define ($r5) in this file.
+.if \start <= \end
+.if \width == 4
+    xvst    $xr\start, A0, \stride * 0x20
+.elseif \width == 2
+    vst     $vr\start, A0, \stride * 0x10
+.elseif \width == 1
+    fst.d   $f\start, A0, \stride * 0x08
+.endif
+    A_st_macro %start + 1, \end, %stride + 1, \width
+.endif
+.endm
+
+.macro dsolve_16x2
+// Solve step for a 16x2 tile: U0-U3 hold the C0 data, U4-U7 the C1 data.
+// Reads elements 0, 2 and 3 of the 2x2 block at B0 (memory indices below);
+// element 1 is never loaded:
+//0
+//2	 3
+// The bottom row is applied first: C1 is scaled, then C0 is updated.
+    ldrepl_macro 16, 16, 0 // D0 <- B0[0]
+    ldrepl_macro 17, 18, 2 // D1 <- B0[2], D2 <- B0[3]
+    GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7
+    nmsub_macro 0, 3, 4, D1 // U0-U3 -= D1 * U4-U7
+    GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
+// Store the solved tile to A0 (8 vectors, 32 bytes each)
+    A_st_macro 0, 7, 0, 4
+// Store C
+    GST xv, , U0,  C0, 0x00, U1,  C0, 0x20, U2,  C0, 0x40, U3,  C0, 0x60, \
+              U4,  C1, 0x00, U5,  C1, 0x20, U6,  C1, 0x40, U7,  C1, 0x60
+.endm
+
+.macro dsolve_8x2
+// Solve step for an 8x2 tile: U0-U1 hold the C0 data, U2-U3 the C1 data.
+// Reads elements 0, 2 and 3 of the 2x2 block at B0 (memory indices below);
+// element 1 is never loaded:
+//0
+//2	 3
+// The bottom row is applied first: C1 is scaled, then C0 is updated.
+    ldrepl_macro 16, 16, 0 // D0 <- B0[0]
+    ldrepl_macro 17, 18, 2 // D1 <- B0[2], D2 <- B0[3]
+    GMUL xvf, d, U2, D2, U2, U3, D2, U3
+    nmsub_macro 0, 1, 2, D1 // U0-U1 -= D1 * U2-U3
+    GMUL xvf, d, U0, D0, U0, U1, D0, U1
+// Store the solved tile to A0 (4 vectors, 32 bytes each)
+    A_st_macro 0, 3, 0, 4
+// Store C
+    GST xv, , U0,  C0, 0x00, U1,  C0, 0x20, \
+              U2,  C1, 0x00, U3,  C1, 0x20
+.endm
+
+.macro dsolve_4x2
+// Solve step for a 4x2 tile: U0 holds the C0 data, U1 the C1 data.
+// Reads elements 0, 2 and 3 of the 2x2 block at B0 (memory indices below);
+// element 1 is never loaded:
+//0
+//2	 3
+// The bottom row is applied first: C1 is scaled, then C0 is updated.
+    ldrepl_macro 16, 16, 0 // D0 <- B0[0]
+    ldrepl_macro 17, 18, 2 // D1 <- B0[2], D2 <- B0[3]
+    GMUL xvf, d, U1, D2, U1
+    nmsub_macro 0, 0, 1, D1 // U0 -= D1 * U1
+    GMUL xvf, d, U0, D0, U0
+// Store the solved tile to A0 (2 vectors, 32 bytes each)
+    A_st_macro 0, 1, 0, 4
+// Store C
+    GST xv, , U0, C0, 0x00, U1, C1, 0x00
+.endm
+
+.macro dsolve_2x2
+// Solve step for a 2x2 tile: the low 128 bits of U0 / U1 hold the C0 / C1 data.
+// Reads elements 0, 2 and 3 of the 2x2 block at B0 (memory indices below);
+// element 1 is never loaded:
+//0
+//2	 3
+// The bottom row is applied first: C1 is scaled, then C0 is updated.
+    ldrepl_macro 16, 16, 0 // D0 <- B0[0]
+    ldrepl_macro 17, 18, 2 // D1 <- B0[2], D2 <- B0[3]
+    GMUL xvf, d, U1, D2, U1
+    nmsub_macro 0, 0, 1, D1 // U0 -= D1 * U1
+    GMUL xvf, d, U0, D0, U0
+// Store the solved tile to A0 (2 stores of 16 bytes; upper lanes are not written)
+    A_st_macro 0, 1, 0, 2
+// Store C (16-byte stores)
+    GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00
+.endm
+
+.macro dsolve_1x2
+// Solve step for a 1x2 tile: lane 0 of U0 / U1 ($f0 / $f1) holds the C0 / C1 value.
+// Reads elements 0, 2 and 3 of the 2x2 block at B0 (memory indices below);
+// element 1 is never loaded:
+//0
+//2	 3
+// The bottom row is applied first: C1 is scaled, then C0 is updated.
+    ldrepl_macro 16, 16, 0 // D0 <- B0[0]
+    ldrepl_macro 17, 18, 2 // D1 <- B0[2], D2 <- B0[3]
+    GMUL xvf, d, U1, D2, U1
+    nmsub_macro 0, 0, 1, D1 // U0 -= D1 * U1
+    GMUL xvf, d, U0, D0, U0
+// Store the solved tile to A0 (2 scalar stores; only lane 0 is written)
+    A_st_macro 0, 1, 0, 1
+// Store C (scalar stores)
+    GST f, d, $f0, C0, 0x00, $f1, C1, 0x00
+.endm
+
+.macro dsolve_16x4
+// Solve step for a 16x4 tile: U0-U3 = C0, U4-U7 = C1, U8-U11 = C2, U12-U15 = C3.
+// Reads the ten elements of the 4x4 block at B0 listed below (memory
+// indices); the other six are never loaded:
+//0
+//4	 5
+//8	 9	10
+//12 13	14	15
+// Rows are consumed bottom-up: C3 is finished first, C0 last.
+    ldrepl_macro 22, 25, 12 // D6..D9 <- B0[12..15]
+    GMUL xvf, d, U12, D9, U12, U13, D9, U13, U14, D9, U14, U15, D9, U15
+    ldrepl_macro 19, 21, 8 // D3..D5 <- B0[8..10]
+    nmsub_macro 8, 11, 12, D8 // C2 -= B0[14] * C3
+    ldrepl_macro 17, 18, 4 // D1, D2 <- B0[4], B0[5]
+    GMUL xvf, d, U8, D5, U8, U9, D5, U9, U10, D5, U10, U11, D5, U11
+    ldrepl_macro 16, 16, 0 // D0 <- B0[0]
+    nmsub_macro 4, 7, 12, D7 // C1 -= B0[13] * C3
+    nmsub_macro 4, 7, 8, D4 // C1 -= B0[9] * C2
+    GMUL xvf, d, U4, D2, U4, U5, D2, U5, U6, D2, U6, U7, D2, U7
+    nmsub_macro 0, 3, 12, D6 // C0 -= B0[12] * C3
+    nmsub_macro 0, 3, 8, D3 // C0 -= B0[8] * C2
+    nmsub_macro 0, 3, 4, D1 // C0 -= B0[4] * C1
+    GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
+// Store the solved tile to A0 (16 vectors, 32 bytes each)
+    A_st_macro 0, 15, 0, 4
+// Store C
+    GST xv, , U0,  C0, 0x00, U1,  C0, 0x20, U2,  C0, 0x40, U3,  C0, 0x60, \
+              U4,  C1, 0x00, U5,  C1, 0x20, U6,  C1, 0x40, U7,  C1, 0x60, \
+              U8,  C2, 0x00, U9,  C2, 0x20, U10, C2, 0x40, U11, C2, 0x60, \
+              U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60
+.endm
+
+.macro dsolve_8x4
+// Solve step for an 8x4 tile: U0-U1 = C0, U2-U3 = C1, U4-U5 = C2, U6-U7 = C3.
+// Reads the ten elements of the 4x4 block at B0 listed below (memory
+// indices); the other six are never loaded:
+//0
+//4	 5
+//8	 9	10
+//12 13	14	15
+// Rows are consumed bottom-up: C3 is finished first, C0 last.
+    ldrepl_macro 22, 25, 12 // D6..D9 <- B0[12..15]
+    GMUL xvf, d, U6, D9, U6, U7, D9, U7
+    ldrepl_macro 19, 21, 8 // D3..D5 <- B0[8..10]
+    nmsub_macro 4, 5, 6, D8 // C2 -= B0[14] * C3
+    ldrepl_macro 17, 18, 4 // D1, D2 <- B0[4], B0[5]
+    GMUL xvf, d, U4, D5, U4, U5, D5, U5
+    ldrepl_macro 16, 16, 0 // D0 <- B0[0]
+    nmsub_macro 2, 3, 6, D7 // C1 -= B0[13] * C3
+    nmsub_macro 2, 3, 4, D4 // C1 -= B0[9] * C2
+    GMUL xvf, d, U2, D2, U2, U3, D2, U3
+    nmsub_macro 0, 1, 6, D6 // C0 -= B0[12] * C3
+    nmsub_macro 0, 1, 4, D3 // C0 -= B0[8] * C2
+    nmsub_macro 0, 1, 2, D1 // C0 -= B0[4] * C1
+    GMUL xvf, d, U0, D0, U0, U1, D0, U1
+// Store the solved tile to A0 (8 vectors, 32 bytes each)
+    A_st_macro 0, 7, 0, 4
+// Store C
+    GST xv, , U0, C0, 0x00, U1, C0, 0x20, \
+              U2, C1, 0x00, U3, C1, 0x20, \
+              U4, C2, 0x00, U5, C2, 0x20, \
+              U6, C3, 0x00, U7, C3, 0x20
+.endm
+
+.macro dsolve_4x4
+// Solve step for a 4x4 tile: U0 = C0, U1 = C1, U2 = C2, U3 = C3.
+// Reads the ten elements of the 4x4 block at B0 listed below (memory
+// indices); the other six are never loaded:
+//0
+//4	 5
+//8	 9	10
+//12 13	14	15
+// Rows are consumed bottom-up: C3 is finished first, C0 last.
+    ldrepl_macro 22, 25, 12 // D6..D9 <- B0[12..15]
+    GMUL xvf, d, U3, D9, U3
+    ldrepl_macro 19, 21, 8 // D3..D5 <- B0[8..10]
+    nmsub_macro 2, 2, 3, D8 // C2 -= B0[14] * C3
+    ldrepl_macro 17, 18, 4 // D1, D2 <- B0[4], B0[5]
+    GMUL xvf, d, U2, D5, U2
+    ldrepl_macro 16, 16, 0 // D0 <- B0[0]
+    nmsub_macro 1, 1, 3, D7 // C1 -= B0[13] * C3
+    nmsub_macro 1, 1, 2, D4 // C1 -= B0[9] * C2
+    GMUL xvf, d, U1, D2, U1
+    nmsub_macro 0, 0, 3, D6 // C0 -= B0[12] * C3
+    nmsub_macro 0, 0, 2, D3 // C0 -= B0[8] * C2
+    nmsub_macro 0, 0, 1, D1 // C0 -= B0[4] * C1
+    GMUL xvf, d, U0, D0, U0
+// Store the solved tile to A0 (4 vectors, 32 bytes each)
+    A_st_macro 0, 3, 0, 4
+// Store C
+    GST xv, , U0, C0, 0x00, U1, C1, 0x00, U2, C2, 0x00, U3, C3, 0x00
+.endm
+
+.macro dsolve_2x4
+// Solve step for a 2x4 tile: the low 128 bits of U0..U3 hold C0..C3.
+// Reads the ten elements of the 4x4 block at B0 listed below (memory
+// indices); the other six are never loaded:
+//0
+//4	 5
+//8	 9	10
+//12 13	14	15
+// Rows are consumed bottom-up: C3 is finished first, C0 last.
+    ldrepl_macro 22, 25, 12 // D6..D9 <- B0[12..15]
+    GMUL xvf, d, U3, D9, U3
+    ldrepl_macro 19, 21, 8 // D3..D5 <- B0[8..10]
+    nmsub_macro 2, 2, 3, D8 // C2 -= B0[14] * C3
+    ldrepl_macro 17, 18, 4 // D1, D2 <- B0[4], B0[5]
+    GMUL xvf, d, U2, D5, U2
+    ldrepl_macro 16, 16, 0 // D0 <- B0[0]
+    nmsub_macro 1, 1, 3, D7 // C1 -= B0[13] * C3
+    nmsub_macro 1, 1, 2, D4 // C1 -= B0[9] * C2
+    GMUL xvf, d, U1, D2, U1
+    nmsub_macro 0, 0, 3, D6 // C0 -= B0[12] * C3
+    nmsub_macro 0, 0, 2, D3 // C0 -= B0[8] * C2
+    nmsub_macro 0, 0, 1, D1 // C0 -= B0[4] * C1
+    GMUL xvf, d, U0, D0, U0
+// Store the solved tile to A0 (4 stores of 16 bytes; upper lanes are not written)
+    A_st_macro 0, 3, 0, 2
+// Store C (16-byte stores)
+    GST v, , $vr0, C0, 0x00, $vr1, C1, 0x00, $vr2, C2, 0x00, $vr3, C3, 0x00
+.endm
+
+.macro dsolve_1x4
+// Solve step for a 1x4 tile: lane 0 of U0..U3 ($f0..$f3) holds C0..C3.
+// Reads the ten elements of the 4x4 block at B0 listed below (memory
+// indices); the other six are never loaded:
+//0
+//4	 5
+//8	 9	10
+//12 13	14	15
+// Rows are consumed bottom-up: C3 is finished first, C0 last.
+    ldrepl_macro 22, 25, 12 // D6..D9 <- B0[12..15]
+    GMUL xvf, d, U3, D9, U3
+    ldrepl_macro 19, 21, 8 // D3..D5 <- B0[8..10]
+    nmsub_macro 2, 2, 3, D8 // C2 -= B0[14] * C3
+    ldrepl_macro 17, 18, 4 // D1, D2 <- B0[4], B0[5]
+    GMUL xvf, d, U2, D5, U2
+    ldrepl_macro 16, 16, 0 // D0 <- B0[0]
+    nmsub_macro 1, 1, 3, D7 // C1 -= B0[13] * C3
+    nmsub_macro 1, 1, 2, D4 // C1 -= B0[9] * C2
+    GMUL xvf, d, U1, D2, U1
+    nmsub_macro 0, 0, 3, D6 // C0 -= B0[12] * C3
+    nmsub_macro 0, 0, 2, D3 // C0 -= B0[8] * C2
+    nmsub_macro 0, 0, 1, D1 // C0 -= B0[4] * C1
+    GMUL xvf, d, U0, D0, U0
+// Store the solved tile to A0 (4 scalar stores; only lane 0 is written)
+    A_st_macro 0, 3, 0, 1
+// Store C (scalar stores; the argument list must not end in a comma)
+    GST f, d, $f0, C0, 0x00, $f1, C1, 0x00, $f2, C2, 0x00, $f3, C3, 0x00
+.endm
+
+.macro dgemm_dsolve_16x1
+    or    T1,   A0,     A0 // T1 = A0 on entry
+    or    T2,   B0,     B0 // T2 = B0 on entry
+    bge   ZERO, L,	.L_dsolve_16x1_load // L <= 0: skip the GEMM update
+    dgemm_16x1
+    b .L_dsolve_16x1
+.L_dsolve_16x1_load:
+    /* No update ran: read the 16 doubles at C0 into U0-U3 */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+    xvld      U2,  C0,  0x40
+    xvld      U3,  C0,  0x60
+.L_dsolve_16x1:
+    PTR_ADDI    A0,     T1,     -16 * 8 // A0 = entry A0 - 16 doubles
+    PTR_ADDI    B0,     T2,     -1 * 8 // B0 = entry B0 - 1 double
+    ldrepl_macro 16, 16, 0 // D0 <- B0[0]
+    GMUL xvf, d, U0, D0, U0, U1, D0, U1, U2, D0, U2, U3, D0, U3
+    // Store A
+    A_st_macro 0, 3, 0, 4
+    // Store C
+    GST xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60
+.endm
+
+.macro dgemm_dsolve_8x1
+    or    T1,   A0,     A0 // T1 = A0 on entry
+    or    T2,   B0,     B0 // T2 = B0 on entry
+    bge   ZERO, L,	.L_dsolve_8x1_load // L <= 0: skip the GEMM update
+    dgemm_8x1
+    b .L_dsolve_8x1
+.L_dsolve_8x1_load:
+    /* No update ran: read the 8 doubles at C0 into U0-U1 */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+.L_dsolve_8x1:
+    PTR_ADDI    A0,     T1,     -8 * 8 // A0 = entry A0 - 8 doubles
+    PTR_ADDI    B0,     T2,     -1 * 8 // B0 = entry B0 - 1 double
+    ldrepl_macro 16, 16, 0 // D0 <- B0[0]
+    GMUL xvf, d, U0, D0, U0, U1, D0, U1
+    // Store A
+    A_st_macro 0, 1, 0, 4
+    // Store C
+    GST xv, , U0, C0, 0x00, U1, C0, 0x20
+.endm
+
+.macro dgemm_dsolve_4x1
+    or    T1,   A0,     A0 // T1 = A0 on entry
+    or    T2,   B0,     B0 // T2 = B0 on entry
+    bge   ZERO, L,	.L_dsolve_4x1_load // L <= 0: skip the GEMM update
+    dgemm_4x1
+    b .L_dsolve_4x1
+.L_dsolve_4x1_load:
+    /* No update ran: read the 4 doubles at C0 into U0 */
+    xvld      U0,  C0,  0x00
+.L_dsolve_4x1:
+    PTR_ADDI    A0,     T1,     -4 * 8 // A0 = entry A0 - 4 doubles
+    PTR_ADDI    B0,     T2,     -1 * 8 // B0 = entry B0 - 1 double
+    ldrepl_macro 16, 16, 0 // D0 <- B0[0]
+    GMUL xvf, d, U0, D0, U0
+    // Store A
+    A_st_macro 0, 0, 0, 4
+    // Store C
+    GST xv, , U0, C0, 0x00
+.endm
+
+.macro dgemm_dsolve_2x1
+    or    T1,   A0,     A0 // T1 = A0 on entry
+    or    T2,   B0,     B0 // T2 = B0 on entry
+    bge   ZERO, L,	.L_dsolve_2x1_load // L <= 0: skip the GEMM update
+    dgemm_2x1
+    b .L_dsolve_2x1
+.L_dsolve_2x1_load:
+    /* Load C0 with a 128-bit load: the tile is 2 doubles, xvld would read 16 bytes past it */
+    vld       $vr0, C0,  0x00
+.L_dsolve_2x1:
+    PTR_ADDI    A0,     T1,     -2 * 8 // A0 = entry A0 - 2 doubles
+    PTR_ADDI    B0,     T2,     -1 * 8 // B0 = entry B0 - 1 double
+    ldrepl_macro 16, 16, 0 // D0 <- B0[0]
+    GMUL xvf, d, U0, D0, U0
+    // Store A (16-byte store; the upper lanes of U0 are never written out)
+    A_st_macro 0, 0, 0, 2
+    // Store C (16-byte store)
+    GST v, , $vr0, C0, 0x00
+.endm
+
+.macro dgemm_dsolve_1x1
+    or    T1,   A0,     A0 // T1 = A0 on entry
+    or    T2,   B0,     B0 // T2 = B0 on entry
+    bge   ZERO, L,    .L_dsolve_1x1_load // L <= 0: skip the GEMM update
+    dgemm_1x1
+    b .L_dsolve_1x1
+.L_dsolve_1x1_load:
+    // No update ran: read the single double at C0
+    fld.d       $f0,    C0,     0x00
+.L_dsolve_1x1:
+    PTR_ADDI    A0,     T1,     -1 * 8 // A0 = entry A0 - 1 double
+    PTR_ADDI    B0,     T2,     -1 * 8 // B0 = entry B0 - 1 double
+    ldrepl_macro 16, 16, 0 // D0 <- B0[0]
+    GMUL xvf, d, U0, D0, U0
+    // Store A (scalar store of lane 0)
+    A_st_macro 0, 0, 0, 1
+    // Store C (scalar store of $f0)
+    GST f, d, $f0, C0, 0x00
+.endm
+
+.macro dgemm_dsolve_16x2
+    or    T1,   A0,     A0 // T1 = A0 on entry
+    or    T2,   B0,     B0 // T2 = B0 on entry
+    bge   ZERO, L,	.L_dsolve_16x2_load // L <= 0: skip the GEMM update
+    dgemm_16x2
+    b .L_dsolve_16x2
+.L_dsolve_16x2_load:
+    /* No update ran: read 16 doubles from C0 into U0-U3 */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+    xvld      U2,  C0,  0x40
+    xvld      U3,  C0,  0x60
+    /* ... and 16 doubles from C1 into U4-U7 */
+    xvld      U4,  C1,  0x00
+    xvld      U5,  C1,  0x20
+    xvld      U6,  C1,  0x40
+    xvld      U7,  C1,  0x60
+.L_dsolve_16x2:
+    PTR_ADDI    A0,     T1,     -(16 * 2) * 8 // A0 = entry A0 - 32 doubles
+    PTR_ADDI    B0,     T2,     -(2 * 2) * 8 // B0 = entry B0 - 4 doubles
+    dsolve_16x2
+.endm
+
+.macro dgemm_dsolve_8x2
+    or    T1,   A0,     A0 // T1 = A0 on entry
+    or    T2,   B0,     B0 // T2 = B0 on entry
+    bge   ZERO, L,	.L_dsolve_8x2_load // L <= 0: skip the GEMM update
+    dgemm_8x2
+    b .L_dsolve_8x2
+.L_dsolve_8x2_load:
+    /* No update ran: read 8 doubles from C0 into U0-U1 */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+    /* ... and 8 doubles from C1 into U2-U3 */
+    xvld      U2,  C1,  0x00
+    xvld      U3,  C1,  0x20
+.L_dsolve_8x2:
+    PTR_ADDI    A0,     T1,     -(8 * 2) * 8 // A0 = entry A0 - 16 doubles
+    PTR_ADDI    B0,     T2,     -(2 * 2) * 8 // B0 = entry B0 - 4 doubles
+    dsolve_8x2
+.endm
+
+.macro dgemm_dsolve_4x2
+    or    T1,   A0,     A0 // T1 = A0 on entry
+    or    T2,   B0,     B0 // T2 = B0 on entry
+    bge   ZERO, L,	.L_dsolve_4x2_load // L <= 0: skip the GEMM update
+    dgemm_4x2
+    b .L_dsolve_4x2
+.L_dsolve_4x2_load:
+    /* No update ran: read 4 doubles from C0 into U0 */
+    xvld      U0,  C0,  0x00
+    /* ... and 4 doubles from C1 into U1 */
+    xvld      U1,  C1,  0x00
+.L_dsolve_4x2:
+    PTR_ADDI    A0,     T1,     -(4 * 2) * 8 // A0 = entry A0 - 8 doubles
+    PTR_ADDI    B0,     T2,     -(2 * 2) * 8 // B0 = entry B0 - 4 doubles
+    dsolve_4x2
+.endm
+
+.macro dgemm_dsolve_2x2
+    or    T1,   A0,     A0 // T1 = A0 on entry
+    or    T2,   B0,     B0 // T2 = B0 on entry
+    bge   ZERO, L,	.L_dsolve_2x2_load // L <= 0: skip the GEMM update
+    dgemm_2x2
+    b .L_dsolve_2x2
+.L_dsolve_2x2_load:
+    /* Load C0 with a 128-bit load: the tile is 2 doubles, xvld would read 16 bytes past it */
+    vld       $vr0, C0,  0x00
+    /* Load C1 the same way; dsolve_2x2 only stores the low 128 bits of U0/U1 */
+    vld       $vr1, C1,  0x00
+.L_dsolve_2x2:
+    PTR_ADDI    A0,     T1,     -(2 * 2) * 8 // A0 = entry A0 - 4 doubles
+    PTR_ADDI    B0,     T2,     -(2 * 2) * 8 // B0 = entry B0 - 4 doubles
+    dsolve_2x2
+.endm
+
+.macro dgemm_dsolve_1x2
+    or    T1,   A0,     A0 // T1 = A0 on entry
+    or    T2,   B0,     B0 // T2 = B0 on entry
+    bge   ZERO, L,    .L_dsolve_1x2_load // L <= 0: skip the GEMM update
+    dgemm_1x2
+    xvpackod.d  U1,     U0,     U0 // U1 lane 0 <- U0 lane 1 (presumably the C1 result - confirm)
+    b   .L_dsolve_1x2
+.L_dsolve_1x2_load:
+    // No update ran: read one double from C0 and one from C1
+    fld.d       $f0,    C0,     0x00
+    fld.d       $f1,    C1,     0x00
+.L_dsolve_1x2:
+    PTR_ADDI    A0,     T1,     -(1 * 2) * 8 // A0 = entry A0 - 2 doubles
+    PTR_ADDI    B0,     T2,     -(2 * 2) * 8 // B0 = entry B0 - 4 doubles
+    dsolve_1x2
+.endm
+
+.macro dgemm_dsolve_16x4
+    or    T1,   A0,     A0 // T1 = A0 on entry
+    or    T2,   B0,     B0 // T2 = B0 on entry
+    bge   ZERO, L,	.L_dsolve_16x4_load // L <= 0: skip the GEMM update
+    dgemm_16x4
+    b	.L_dsolve_16x4
+.L_dsolve_16x4_load:
+    // No update ran: read 16 doubles from each of C0..C3 into U0-U15
+    GLD xv, , U0,  C0, 0x00, U1,  C0, 0x20, U2,  C0, 0x40, U3,  C0, 0x60
+    GLD xv, , U4,  C1, 0x00, U5,  C1, 0x20, U6,  C1, 0x40, U7,  C1, 0x60
+    GLD xv, , U8,  C2, 0x00, U9,  C2, 0x20, U10, C2, 0x40, U11, C2, 0x60
+    GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60
+/********************** solver ******************/
+.L_dsolve_16x4:
+    PTR_ADDI    A0,     T1,     -(16 * 4) * 8 // A0 = entry A0 - 64 doubles
+    PTR_ADDI    B0,     T2,     -(4 * 4) * 8 // B0 = entry B0 - 16 doubles
+    dsolve_16x4
+.endm
+
+.macro dgemm_dsolve_8x4
+    or    T1,   A0,     A0 // T1 = A0 on entry
+    or    T2,   B0,     B0 // T2 = B0 on entry
+    bge   ZERO, L,	.L_dsolve_8x4_load // L <= 0: skip the GEMM update
+    dgemm_8x4
+    b .L_dsolve_8x4
+.L_dsolve_8x4_load:
+    /* No update ran: read 8 doubles from C0 into U0-U1 */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+
+    /* 8 doubles from C1 into U2-U3 */
+    xvld      U2,  C1,  0x00
+    xvld      U3,  C1,  0x20
+
+    /* 8 doubles from C2 into U4-U5 */
+    xvld      U4,  C2,  0x00
+    xvld      U5,  C2,  0x20
+
+    /* 8 doubles from C3 into U6-U7 */
+    xvld      U6,  C3,  0x00
+    xvld      U7,  C3,  0x20
+/********* solver *********/
+.L_dsolve_8x4:
+    PTR_ADDI    A0,     T1,     -(8 * 4) * 8 // A0 = entry A0 - 32 doubles
+    PTR_ADDI    B0,     T2,     -(4 * 4) * 8 // B0 = entry B0 - 16 doubles
+    dsolve_8x4
+.endm
+
+.macro dgemm_dsolve_4x4
+    or    T1,   A0,     A0 // T1 = A0 on entry
+    or    T2,   B0,     B0 // T2 = B0 on entry
+    bge   ZERO, L,    .L_dsolve_4x4_load // L <= 0: skip the GEMM update
+    dgemm_4x4
+    b .L_dsolve_4x4
+.L_dsolve_4x4_load:
+    /* No update ran: read 4 doubles from C0 into U0 */
+    xvld      U0,  C0,  0x00
+    /* 4 doubles from C1 into U1 */
+    xvld      U1,  C1,  0x00
+    /* 4 doubles from C2 into U2 */
+    xvld      U2,  C2,  0x00
+    /* 4 doubles from C3 into U3 */
+    xvld      U3,  C3,  0x00
+/************** solver *****************/
+.L_dsolve_4x4:
+    PTR_ADDI    A0,     T1,     -(4 * 4) * 8 // A0 = entry A0 - 16 doubles
+    PTR_ADDI    B0,     T2,     -(4 * 4) * 8 // B0 = entry B0 - 16 doubles
+    dsolve_4x4
+.endm
+
+.macro dgemm_dsolve_2x4
+    or    T1,   A0,     A0 // T1 = A0 on entry
+    or    T2,   B0,     B0 // T2 = B0 on entry
+    bge   ZERO, L,    .L_dsolve_2x4_load // L <= 0: skip the GEMM update
+    dgemm_2x4
+    xvpermi.q   U2,     U0,     0x01 // U2 low half <- U0 high half
+    xvpermi.q   U3,     U1,     0x01 // U3 low half <- U1 high half
+    b   .L_dsolve_2x4
+.L_dsolve_2x4_load:
+    /* Load C0 with a 128-bit load: the tile is 2 doubles, xvld would read 16 bytes past it */
+    vld       $vr0, C0,  0x00
+    /* Load C1 (128-bit, as for C0) */
+    vld       $vr1, C1,  0x00
+    /* Load C2 (128-bit, as for C0) */
+    vld       $vr2, C2,  0x00
+    /* Load C3; dsolve_2x4 only stores the low 128 bits of U0-U3 */
+    vld       $vr3, C3,  0x00
+/********************** solver ******************/
+.L_dsolve_2x4:
+    PTR_ADDI    A0,     T1,     -(2 * 4) * 8 // A0 = entry A0 - 8 doubles
+    PTR_ADDI    B0,     T2,     -(4 * 4) * 8 // B0 = entry B0 - 16 doubles
+    dsolve_2x4
+.endm
+
+.macro dgemm_dsolve_1x4
+    or    T1,   A0,     A0 // T1 = A0 on entry
+    or    T2,   B0,     B0 // T2 = B0 on entry
+    bge   ZERO, L,    .L_dsolve_1x4_load // L <= 0: skip the GEMM update
+    dgemm_1x4
+    xvpackod.d  U1,     U0,     U0 // U1 lane 0 <- U0 lane 1
+    xvpermi.q   U2,     U0,     0x01 // U2 low half <- U0 high half
+    xvpermi.q   U3,     U1,     0x01 // U3 low half <- U1 high half
+    b   .L_dsolve_1x4
+.L_dsolve_1x4_load:
+    // No update ran: read one double from each of C0..C3
+    fld.d       $f0,    C0,     0x00
+    fld.d       $f1,    C1,     0x00
+    fld.d       $f2,    C2,     0x00
+    fld.d       $f3,    C3,     0x00
+.L_dsolve_1x4:
+    PTR_ADDI    A0,     T1,     -(1 * 4) * 8 // A0 = entry A0 - 4 doubles
+    PTR_ADDI    B0,     T2,     -(4 * 4) * 8 // B0 = entry B0 - 16 doubles
+    dsolve_1x4
+.endm
+
+    PROLOGUE
+    push_if_used 26, 32
+    PTR_SLLI    LDC,    LDC,    3
+    PTR_SUB     KK,     N,      OFFSET
+    PTR_MUL     T0,     N,      LDC
+    PTR_MUL     T1,     N,      K
+    PTR_ADD     C,      C,      T0 // c += n * ldc
+    PTR_SLLI    T1,     T1,     3
+    PTR_ADD     B,      B,      T1
+
+    andi        J,      N,      1
+    beqz        J,      .L_N2
+.L_N1:
+    move        AA,     A
+    PTR_SUB     C,      C,      LDC // c -= ldc
+    PTR_SLLI    T0,     K,      3
+    PTR_SLLI    T1,     KK,     3
+    PTR_SUB     B,      B,      T0 // b -= k
+    PTR_ADD     BB,     B,      T1 // bb = b + kk
+    move        CC,     C
+
+    PTR_SRAI    I,      M,      4 // M >> 4
+    beqz        I,      .L_N1_M15
+.align 4
+.L_N1_I1:
+    PTR_SLLI    T1,     KK,     7
+    GADD , d, C0, CC, ZERO
+    PTR_ADD     A0,     AA,     T1 // a0 = aa + 16 * kk
+    move        B0,     BB
+    PTR_SUB     L,      K,      KK // L = K - KK
+    dgemm_dsolve_16x1
+    PTR_ADDI    I,      I,      -1
+    PTR_SLLI    T0,     K,      7
+    PTR_ADDI    CC,     CC,     0x80 // cc += 16
+    PTR_ADD     AA,     AA,     T0 // aa += 16 * k
+    bnez        I,      .L_N1_I1
+.L_N1_M15:
+    andi        I,      M,      8
+    beqz        I,      .L_N1_M7
+.L_N1_M8:
+    PTR_SLLI    T1,     KK,     6
+    GADD , d, C0, CC, ZERO
+    PTR_ADD     A0,     AA,     T1 // a0 = aa + 8 * kk
+    move        B0,     BB
+    PTR_SUB     L,      K,      KK // L = K - KK
+    dgemm_dsolve_8x1
+    PTR_SLLI    T0,     K,      6
+    PTR_ADDI    CC,     CC,     0x40 // cc += 8
+    PTR_ADD     AA,     AA,     T0 // aa += 8 * k
+.L_N1_M7:
+    andi        I,      M,      4
+    beqz        I,      .L_N1_M3
+.L_N1_M4:
+    PTR_SLLI    T1,     KK,     5
+    GADD , d, C0, CC, ZERO
+    PTR_ADD     A0,     AA,     T1 // a0 = aa + 4 * kk
+    move        B0,     BB
+    PTR_SUB     L,      K,      KK // L = K - KK
+    dgemm_dsolve_4x1
+    PTR_SLLI    T0,     K,      5
+    PTR_ADDI    CC,     CC,     0x20 // cc += 4
+    PTR_ADD     AA,     AA,     T0 // aa += 4 * k
+.L_N1_M3:
+    andi        I,      M,      2
+    beqz        I,      .L_N1_M1
+.L_N1_M2:
+    PTR_SLLI    T1,     KK,     4
+    GADD , d, C0, CC, ZERO
+    PTR_ADD     A0,     AA,     T1 // a0 = aa + 2 * kk
+    move        B0,     BB
+    PTR_SUB     L,      K,      KK // L = K - KK
+    dgemm_dsolve_2x1
+    PTR_SLLI    T0,     K,      4
+    PTR_ADDI    CC,     CC,     0x10 // cc += 2
+    PTR_ADD     AA,     AA,     T0 // aa += 2 * k
+.L_N1_M1:
+    andi        I,      M,      1
+    beqz        I,      .L_N1_M0
+    PTR_SLLI    T1,     KK,     3
+    GADD , d, C0, CC, ZERO
+    PTR_ADD     A0,     AA,     T1 // a0 = aa + kk
+    move        B0,     BB
+    PTR_SUB     L,      K,      KK // L = K - KK
+    dgemm_dsolve_1x1
+    PTR_SLLI    T0,     K,      3
+    PTR_ADDI    CC,     CC,     0x08 // cc += 1
+    PTR_ADD     AA,     AA,     T0 // aa += 1 * k
+.L_N1_M0:
+    PTR_ADDI    KK,     KK,     -1
+.L_N2:
+    andi    J,      N,      2
+    beq     ZERO,   J,      .L_N4
+    move    AA,     A
+    PTR_SLLI    T0, LDC,    1
+    PTR_SLLI    T1, K,      4
+    PTR_SLLI    T2, KK,     4
+    PTR_SUB     B,  B,      T1
+    PTR_SUB     C,  C,      T0
+    PTR_ADD     BB, B,      T2
+    move    CC,     C
+    PTR_SRAI    I,      M,      4 // M >> 4
+    beqz        I,      .L_N2_M15
+.align 4
+.L_N2_I1:
+    PTR_SLLI    T1,     KK,     7
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    PTR_ADD     A0,     AA,     T1 // a0 = aa + 16 * kk
+    move        B0,     BB
+    PTR_SUB     L,      K,      KK // L = K - KK
+    dgemm_dsolve_16x2
+    PTR_ADDI    I,      I,      -1
+    PTR_SLLI    T0,     K,      7
+    PTR_ADDI    CC,     CC,     0x80 // cc += 16
+    PTR_ADD     AA,     AA,     T0 // aa += 16 * k
+    bnez        I,      .L_N2_I1
+.L_N2_M15:
+    andi        I,      M,      8
+    beqz        I,      .L_N2_M7
+.L_N2_M8:
+    PTR_SLLI    T1,     KK,     6
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    PTR_ADD     A0,     AA,     T1 // a0 = aa + 8 * kk
+    move        B0,     BB
+    PTR_SUB     L,      K,      KK // L = K - KK
+    dgemm_dsolve_8x2
+    PTR_SLLI    T0,     K,      6
+    PTR_ADDI    CC,     CC,     0x40 // cc += 8
+    PTR_ADD     AA,     AA,     T0 // aa += 8 * k
+.L_N2_M7:
+    andi        I,      M,      4
+    beqz        I,      .L_N2_M3
+.L_N2_M4:
+    PTR_SLLI    T1,     KK,     5
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    PTR_ADD     A0,     AA,     T1 // a0 = aa + 4 * kk
+    move        B0,     BB
+    PTR_SUB     L,      K,      KK // L = K - KK
+    dgemm_dsolve_4x2
+    PTR_SLLI    T0,     K,      5
+    PTR_ADDI    CC,     CC,     0x20 // cc += 4
+    PTR_ADD     AA,     AA,     T0 // aa += 4 * k
+.L_N2_M3:
+    andi        I,      M,      2
+    beqz        I,      .L_N2_M1
+.L_N2_M2:
+    PTR_SLLI    T1,     KK,     4
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    PTR_ADD     A0,     AA,     T1 // a0 = aa + 2 * kk
+    move        B0,     BB
+    PTR_SUB     L,      K,      KK // L = K - KK
+    dgemm_dsolve_2x2
+    PTR_SLLI    T0,     K,      4
+    PTR_ADDI    CC,     CC,     0x10 // cc += 2
+    PTR_ADD     AA,     AA,     T0 // aa += 2 * k
+.L_N2_M1:
+    andi        I,      M,      1
+    beqz        I,      .L_N2_M0
+    PTR_SLLI    T1,     KK,     3
+    GADD , d, C0, CC, ZERO, C1, C0, LDC
+    PTR_ADD     A0,     AA,     T1 // a0 = aa + kk
+    move        B0,     BB
+    PTR_SUB     L,      K,      KK // L = K - KK
+    dgemm_dsolve_1x2
+    PTR_SLLI    T0,     K,      3
+    PTR_ADDI    CC,     CC,     0x08 // cc += 1
+    PTR_ADD     AA,     AA,     T0 // aa += 1 * k
+.L_N2_M0:
+    PTR_ADDI    KK,     KK,     -2
+.L_N4:
+    PTR_SRAI   J,     N,     2     /* J = bn >> 2 */
+    beq        ZERO,  J,     .L_N0
+.align 5
+.L_J1:
+    PTR_ADDI    J,      J,     -1
+    move    AA,     A
+    PTR_SLLI    T0, LDC,    2
+    PTR_SLLI    T1, K,      5
+    PTR_SLLI    T2, KK,     5
+    PTR_SUB     B,  B,      T1
+    PTR_SUB     C,  C,      T0
+    PTR_ADD     BB, B,      T2
+    move    CC,     C
+    PTR_SRAI    I,      M,      4 // M >> 4
+    beqz        I,      .L_M15
+.align 4
+.L_I1:
+    PTR_SLLI    T1,     KK,     7
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    PTR_ADD     A0,     AA,     T1 // a0 = aa + 16 * kk
+    move        B0,     BB
+    PTR_SUB     L,      K,      KK // L = K - KK
+    dgemm_dsolve_16x4
+    PTR_ADDI    I,      I,      -1
+    PTR_SLLI    T0,     K,      7
+    PTR_ADDI    CC,     CC,     0x80 // cc += 16
+    PTR_ADD     AA,     AA,     T0 // aa += 16 * k
+    bnez        I,      .L_I1
+.L_M15:
+    andi        I,      M,      8
+    beqz        I,      .L_M7
+.L_M8:
+    PTR_SLLI    T1,     KK,     6
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    PTR_ADD     A0,     AA,     T1 // a0 = aa + 8 * kk
+    move        B0,     BB
+    PTR_SUB     L,      K,      KK // L = K - KK
+    dgemm_dsolve_8x4
+    PTR_SLLI    T0,     K,      6
+    PTR_ADDI    CC,     CC,     0x40 // cc += 8
+    PTR_ADD     AA,     AA,     T0 // aa += 8 * k
+.L_M7:
+    andi        I,      M,      4
+    beqz        I,      .L_M3
+.L_M4:
+    PTR_SLLI    T1,     KK,     5
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    PTR_ADD     A0,     AA,     T1 // a0 = aa + 4 * kk
+    move        B0,     BB
+    PTR_SUB     L,      K,      KK // L = K - KK
+    dgemm_dsolve_4x4
+    PTR_SLLI    T0,     K,      5
+    PTR_ADDI    CC,     CC,     0x20 // cc += 4
+    PTR_ADD     AA,     AA,     T0 // aa += 4 * k
+.L_M3:
+    andi        I,      M,      2
+    beqz        I,      .L_M1
+.L_M2:
+    PTR_SLLI    T1,     KK,     4
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    PTR_ADD     A0,     AA,     T1 // a0 = aa + 2 * kk
+    move        B0,     BB
+    PTR_SUB     L,      K,      KK // L = K - KK
+    dgemm_dsolve_2x4
+    PTR_SLLI    T0,     K,      4
+    PTR_ADDI    CC,     CC,     0x10 // cc += 2
+    PTR_ADD     AA,     AA,     T0 // aa += 2 * k
+.L_M1:
+    andi        I,      M,      1
+    beqz        I,      .L_M0
+    PTR_SLLI    T1,     KK,     3
+    GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
+    PTR_ADD     A0,     AA,     T1 // a0 = aa + kk
+    move        B0,     BB
+    PTR_SUB     L,      K,      KK // L = K - KK
+    dgemm_dsolve_1x4
+    PTR_SLLI    T0,     K,      3
+    PTR_ADDI    CC,     CC,     0x08 // cc += 1
+    PTR_ADD     AA,     AA,     T0 // aa += 1 * k
+.L_M0:
+    PTR_ADDI    KK,     KK,     -4
+    bnez        J,      .L_J1
+.L_N0:
+    pop_if_used 26, 32
+    jirl    $r0, $r1, 0x0
+    EPILOGUE
diff --git a/kernel/loongarch64/dtrsm_kernel_macro.S b/kernel/loongarch64/dtrsm_kernel_macro.S
new file mode 100644
index 000000000..88b7121d1
--- /dev/null
+++ b/kernel/loongarch64/dtrsm_kernel_macro.S
@@ -0,0 +1,2147 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+/************** Dgemm Kernel 16x4 ****************/
+.macro KERNEL2x16x4 /* two pipelined 16x4 FMA steps; reads U8-U11 and U12-U15 loaded beforehand */
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
+
+    xvld     U1,   A0,    0x20
+    xvfmadd.d  D2,  U10, U12, D2
+    xvfmadd.d  D3,  U11, U12, D3
+
+    xvld     U2,   A0,    0x40
+    xvfmadd.d  D4,  U8, U13, D4
+    xvfmadd.d  D5,  U9, U13, D5
+
+    xvld     U3,   A0,    0x60
+    xvfmadd.d  D6,  U10, U13, D6
+    xvfmadd.d  D7,  U11, U13, D7
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D8,  U8, U14, D8
+    xvfmadd.d  D9,  U9, U14, D9
+
+    preld      0,   B0, B_PRE /* prefetch hint; A_PRE/B_PRE are defined outside this macro */
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D10, U10, U14, D10
+    xvfmadd.d  D11, U11, U14, D11
+
+    preld      0,   A0, A_PRE
+    xvldrepl.d U6,  B0, 0x10
+    xvfmadd.d  D12, U8, U15, D12
+    xvfmadd.d  D13, U9, U15, D13
+
+    preld      0,   A0, A_PRE + 0x40
+    xvldrepl.d U7,  B0, 0x18
+    xvfmadd.d  D14, U10, U15, D14
+    xvfmadd.d  D15, U11, U15, D15
+    /* first step done: move A0 by 16 doubles and B0 by 4 doubles, past what U0-U7 now hold */
+    addi.d     A0,  A0, 0x80
+    addi.d     B0,  B0, 0x20
+    /* second step: consume U0-U3 / U4-U7 while refilling U8-U11 / U12-U15 */
+    xvld     U8,   A0,    0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+
+    xvld     U9,   A0,    0x20
+    xvfmadd.d  D2,  U2, U4, D2
+    xvfmadd.d  D3,  U3, U4, D3
+
+    xvld     U10,   A0,    0x40
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
+
+    xvld     U11,   A0,    0x60
+    xvfmadd.d  D6,  U2, U5, D6
+    xvfmadd.d  D7,  U3, U5, D7
+
+    xvldrepl.d U12,  B0, 0x00
+    xvfmadd.d  D8,  U0, U6, D8
+    xvfmadd.d  D9,  U1, U6, D9
+
+    preld      0,   B0, B_PRE
+    xvldrepl.d U13,  B0, 0x08
+    xvfmadd.d  D10, U2, U6, D10
+    xvfmadd.d  D11, U3, U6, D11
+
+    preld      0,   A0, A_PRE
+    xvldrepl.d U14,  B0, 0x10
+    xvfmadd.d  D12, U0, U7, D12
+    xvfmadd.d  D13, U1, U7, D13
+
+    preld      0,   A0, A_PRE + 0x40
+    xvldrepl.d U15,  B0, 0x18
+    xvfmadd.d  D14, U2, U7, D14
+    xvfmadd.d  D15, U3, U7, D15
+    /* leave A0/B0 just past the operands now waiting in U8-U15 */
+    addi.d     A0,  A0, 0x80
+    addi.d     B0,  B0, 0x20
+.endm
+
+.macro KERNEL2x16x4_END /* last two 16x4 steps: same first half as KERNEL2x16x4, no refill after it */
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
+
+    xvld     U1,   A0,    0x20
+    xvfmadd.d  D2,  U10, U12, D2
+    xvfmadd.d  D3,  U11, U12, D3
+
+    xvld     U2,   A0,    0x40
+    xvfmadd.d  D4,  U8, U13, D4
+    xvfmadd.d  D5,  U9, U13, D5
+
+    xvld     U3,   A0,    0x60
+    xvfmadd.d  D6,  U10, U13, D6
+    xvfmadd.d  D7,  U11, U13, D7
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D8,  U8, U14, D8
+    xvfmadd.d  D9,  U9, U14, D9
+
+    preld      0,   B0, B_PRE
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D10, U10, U14, D10
+    xvfmadd.d  D11, U11, U14, D11
+
+    preld      0,   A0, A_PRE
+    xvldrepl.d U6,  B0, 0x10
+    xvfmadd.d  D12, U8, U15, D12
+    xvfmadd.d  D13, U9, U15, D13
+
+    preld      0,   A0, A_PRE + 0x40
+    xvldrepl.d U7,  B0, 0x18
+    xvfmadd.d  D14, U10, U15, D14
+    xvfmadd.d  D15, U11, U15, D15
+
+    addi.d     A0,  A0, 0x80
+    addi.d     B0,  B0, 0x20
+    /* final step: FMAs on U0-U7 only; no operand loads and no pointer updates follow */
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+
+    xvfmadd.d  D2,  U2, U4, D2
+    xvfmadd.d  D3,  U3, U4, D3
+
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
+
+    xvfmadd.d  D6,  U2, U5, D6
+    xvfmadd.d  D7,  U3, U5, D7
+
+    xvfmadd.d  D8,  U0, U6, D8
+    xvfmadd.d  D9,  U1, U6, D9
+
+    preld      0,   B0, B_PRE
+    xvfmadd.d  D10, U2, U6, D10
+    xvfmadd.d  D11, U3, U6, D11
+
+    preld      0,   A0, A_PRE
+    xvfmadd.d  D12, U0, U7, D12
+    xvfmadd.d  D13, U1, U7, D13
+
+    preld      0,   A0, A_PRE + 0x40
+    xvfmadd.d  D14, U2, U7, D14
+    xvfmadd.d  D15, U3, U7, D15
+.endm
+
+.macro KERNEL8x16x4 /* 8 steps: four back-to-back KERNEL2x16x4 expansions */
+.rept 4
+    KERNEL2x16x4
+.endr
+.endm
+
+.macro KERNEL8x16x4_END /* 8 steps: three KERNEL2x16x4, then the non-reloading KERNEL2x16x4_END */
+.rept 3
+    KERNEL2x16x4
+.endr
+    KERNEL2x16x4_END
+.endm
+
+.macro KERNEL2x8x4 /* two pipelined 8x4 FMA steps; reads U8/U9 and U12-U15 loaded beforehand */
+    xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
+
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U8, U13, D4
+    xvfmadd.d  D5,  U9, U13, D5
+
+    xvldrepl.d U6,  B0, 0x10
+    xvfmadd.d  D8,  U8, U14, D8
+    xvfmadd.d  D9,  U9, U14, D9
+
+    xvldrepl.d U7,  B0, 0x18
+    xvfmadd.d  D12, U8, U15, D12
+    xvfmadd.d  D13, U9, U15, D13
+    /* first step done: move A0 by 8 doubles and B0 by 4 doubles */
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x20
+    /* second step: consume U0/U1 and U4-U7 while refilling U8/U9 and U12-U15 */
+    xvld     U8,   A0,    0x00
+    xvld     U9,   A0,    0x20
+
+    xvldrepl.d U12, B0, 0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+
+    xvldrepl.d U13, B0, 0x08
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
+
+    xvldrepl.d U14, B0, 0x10
+    xvfmadd.d  D8,  U0, U6, D8
+    xvfmadd.d  D9,  U1, U6, D9
+
+    xvldrepl.d U15,  B0, 0x18
+    xvfmadd.d  D12, U0, U7, D12
+    xvfmadd.d  D13, U1, U7, D13
+
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x20
+.endm
+
+.macro KERNEL2x8x4_END /* last two 8x4 steps: same first half as KERNEL2x8x4, no refill after it */
+    xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
+
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U8, U13, D4
+    xvfmadd.d  D5,  U9, U13, D5
+
+    xvldrepl.d U6,  B0, 0x10
+    xvfmadd.d  D8,  U8, U14, D8
+    xvfmadd.d  D9,  U9, U14, D9
+
+    xvldrepl.d U7,  B0, 0x18
+    xvfmadd.d  D12, U8, U15, D12
+    xvfmadd.d  D13, U9, U15, D13
+
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x20
+    /* final step: FMAs on U0/U1 and U4-U7 only; no loads or pointer updates follow */
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
+
+    xvfmadd.d  D8,  U0, U6, D8
+    xvfmadd.d  D9,  U1, U6, D9
+
+    xvfmadd.d  D12, U0, U7, D12
+    xvfmadd.d  D13, U1, U7, D13
+.endm
+
+.macro KERNEL8x8x4 /* 8 steps: four back-to-back KERNEL2x8x4 expansions */
+.rept 4
+    KERNEL2x8x4
+.endr
+.endm
+
+.macro KERNEL8x8x4_END /* 8 steps: three KERNEL2x8x4, then the non-reloading KERNEL2x8x4_END */
+.rept 3
+    KERNEL2x8x4
+.endr
+    KERNEL2x8x4_END
+.endm
+
+.macro KERNEL2x4x4 /* two pipelined 4x4 FMA steps; reads U8 and U12-U15 loaded beforehand */
+    xvld     U0,   A0,    0x00
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U8, U12, D0
+
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U8, U13, D4
+
+    xvldrepl.d U6,  B0, 0x10
+    xvfmadd.d  D8,  U8, U14, D8
+
+    xvldrepl.d U7,  B0, 0x18
+    xvfmadd.d  D12, U8, U15, D12
+    /* first step done: move A0 and B0 by 4 doubles each */
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x20
+    /* second step: consume U0 and U4-U7 while refilling U8 and U12-U15 */
+    xvld     U8,   A0,    0x00
+
+    xvldrepl.d U12, B0, 0x00
+    xvfmadd.d  D0,  U0, U4, D0
+
+    xvldrepl.d U13, B0, 0x08
+    xvfmadd.d  D4,  U0, U5, D4
+
+    xvldrepl.d U14, B0, 0x10
+    xvfmadd.d  D8,  U0, U6, D8
+
+    xvldrepl.d U15,  B0, 0x18
+    xvfmadd.d  D12, U0, U7, D12
+
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x20
+.endm
+
+.macro KERNEL2x4x4_END /* last two 4x4 steps: same first half as KERNEL2x4x4, no refill after it */
+    xvld     U0,   A0,    0x00
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U8, U12, D0
+
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U8, U13, D4
+
+    xvldrepl.d U6,  B0, 0x10
+    xvfmadd.d  D8,  U8, U14, D8
+
+    xvldrepl.d U7,  B0, 0x18
+    xvfmadd.d  D12, U8, U15, D12
+
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x20
+    /* final step: FMAs on U0 and U4-U7 only; no loads or pointer updates follow */
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D8,  U0, U6, D8
+    xvfmadd.d  D12, U0, U7, D12
+.endm
+
+.macro KERNEL8x4x4 /* 8 steps: four back-to-back KERNEL2x4x4 expansions */
+.rept 4
+    KERNEL2x4x4
+.endr
+.endm
+
+.macro KERNEL8x4x4_END /* 8 steps: three KERNEL2x4x4, then the non-reloading KERNEL2x4x4_END */
+.rept 3
+    KERNEL2x4x4
+.endr
+    KERNEL2x4x4_END
+.endm
+
+.macro KERNEL2x2x4 /* two pipelined 2x4 FMA steps; reads U8/U9 and U12 loaded beforehand */
+    xvldrepl.d     U0,   A0,    0x00
+    xvldrepl.d     U1,   A0,    0x08
+    /* in this shape each A0 value is broadcast (xvldrepl.d) and B0 is read as one full vector (xvld) */
+    xvfmadd.d   D0,     U8,     U12,    D0
+    xvfmadd.d   D1,     U9,     U12,    D1
+
+    xvld    U4,  B0, 0x00
+    addi.d  A0,  A0, 0x10
+    addi.d  B0,  B0, 0x20
+    /* second step: consume U0/U1/U4 while refilling U8/U9/U12 */
+    xvldrepl.d     U8,   A0,    0x00
+    xvldrepl.d     U9,   A0,    0x08
+
+    xvfmadd.d   D0,     U0,     U4,    D0
+    xvfmadd.d   D1,     U1,     U4,    D1
+
+    xvld       U12, B0, 0x00
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x20
+.endm
+
+.macro KERNEL2x2x4_END /* last two 2x4 steps: same first half as KERNEL2x2x4, no refill after it */
+    xvldrepl.d     U0,   A0,    0x00
+    xvldrepl.d     U1,   A0,    0x08
+
+    xvfmadd.d   D0,     U8,     U12,    D0
+    xvfmadd.d   D1,     U9,     U12,    D1
+
+    xvld    U4,  B0, 0x00
+    addi.d  A0,  A0, 0x10
+    addi.d  B0,  B0, 0x20
+    /* final step: FMAs on U0/U1/U4 only; no loads or pointer updates follow */
+    xvfmadd.d   D0,     U0,     U4,    D0
+    xvfmadd.d   D1,     U1,     U4,    D1
+.endm
+
+.macro KERNEL8x2x4 /* 8 steps: four back-to-back KERNEL2x2x4 expansions */
+.rept 4
+    KERNEL2x2x4
+.endr
+.endm
+
+.macro KERNEL8x2x4_END /* 8 steps: three KERNEL2x2x4, then the non-reloading KERNEL2x2x4_END */
+.rept 3
+    KERNEL2x2x4
+.endr
+    KERNEL2x2x4_END
+.endm
+
+.macro KERNEL2x1x4 /* two pipelined 1x4 FMA steps; reads U8 (broadcast A0 value) and U12 loaded beforehand */
+    xvldrepl.d  U0,     A0,     0x00
+    xvfmadd.d   D0,     U8,     U12,    D0
+    xvld        U4,     B0,     0x00
+    /* first step done: move A0 by 1 double and B0 by 4 doubles */
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x20
+    /* second step: consume U0/U4 while refilling U8/U12 */
+    xvldrepl.d  U8,     A0,     0x00
+    xvfmadd.d   D0,     U0,     U4,     D0
+    xvld        U12,    B0,     0x00
+
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x20
+.endm
+
+.macro KERNEL2x1x4_END /* last two 1x4 steps: same first half as KERNEL2x1x4, no refill after it */
+    xvldrepl.d  U0,     A0,     0x00
+    xvfmadd.d   D0,     U8,     U12,    D0
+    xvld        U4,     B0,     0x00
+
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x20
+    /* final step: one FMA on U0/U4; no loads or pointer updates follow */
+    xvfmadd.d   D0,     U0,     U4,     D0
+.endm
+
+.macro KERNEL8x1x4 /* 8 steps: four back-to-back KERNEL2x1x4 expansions */
+.rept 4
+    KERNEL2x1x4
+.endr
+.endm
+
+.macro KERNEL8x1x4_END /* 8 steps: three KERNEL2x1x4, then the non-reloading KERNEL2x1x4_END */
+.rept 3
+    KERNEL2x1x4
+.endr
+    KERNEL2x1x4_END
+.endm
+
+.macro KERNEL2x16x2 /* two pipelined 16x2 FMA steps; reads U8-U11 and U12/U13 loaded beforehand */
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
+
+    xvld     U1,   A0,    0x20
+    xvfmadd.d  D2,  U10, U12, D2
+    xvfmadd.d  D3,  U11, U12, D3
+
+    xvld     U2,   A0,    0x40
+    xvfmadd.d  D4,  U8, U13, D4
+    xvfmadd.d  D5,  U9, U13, D5
+
+    xvld     U3,   A0,    0x60
+    xvfmadd.d  D6,  U10, U13, D6
+    xvfmadd.d  D7,  U11, U13, D7
+
+    xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
+    /* first step done: move A0 by 16 doubles and B0 by 2 doubles */
+    addi.d     A0,  A0, 0x80
+    addi.d     B0,  B0, 0x10
+    /* second step: consume U0-U3 and U4/U5 while refilling U8-U11 and U12/U13 */
+    xvld     U8,   A0,    0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+
+    xvld     U9,   A0,    0x20
+    xvfmadd.d  D2,  U2, U4, D2
+    xvfmadd.d  D3,  U3, U4, D3
+
+    xvld     U10,   A0,    0x40
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
+
+    xvld     U11,   A0,    0x60
+    xvfmadd.d  D6,  U2, U5, D6
+    xvfmadd.d  D7,  U3, U5, D7
+
+    xvldrepl.d U12,  B0, 0x00
+    xvldrepl.d U13,  B0, 0x08
+
+    addi.d     A0,  A0, 0x80
+    addi.d     B0,  B0, 0x10
+.endm
+
+.macro KERNEL2x16x2_END /* last two 16x2 steps: same first half as KERNEL2x16x2, no refill after it */
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
+
+    xvld     U1,   A0,    0x20
+    xvfmadd.d  D2,  U10, U12, D2
+    xvfmadd.d  D3,  U11, U12, D3
+
+    xvld     U2,   A0,    0x40
+    xvfmadd.d  D4,  U8, U13, D4
+    xvfmadd.d  D5,  U9, U13, D5
+
+    xvld     U3,   A0,    0x60
+    xvfmadd.d  D6,  U10, U13, D6
+    xvfmadd.d  D7,  U11, U13, D7
+
+    xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
+
+    addi.d     A0,  A0, 0x80
+    addi.d     B0,  B0, 0x10
+    /* final step: FMAs on U0-U3 and U4/U5 only; no loads or pointer updates follow */
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+
+    xvfmadd.d  D2,  U2, U4, D2
+    xvfmadd.d  D3,  U3, U4, D3
+
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
+
+    xvfmadd.d  D6,  U2, U5, D6
+    xvfmadd.d  D7,  U3, U5, D7
+.endm
+
+.macro KERNEL8x16x2 /* 8 steps: four back-to-back KERNEL2x16x2 expansions */
+.rept 4
+    KERNEL2x16x2
+.endr
+.endm
+
+.macro KERNEL8x16x2_END /* 8 steps: three KERNEL2x16x2, then the non-reloading KERNEL2x16x2_END */
+.rept 3
+    KERNEL2x16x2
+.endr
+    KERNEL2x16x2_END
+.endm
+
+.macro KERNEL2x8x2 /* two pipelined 8x2 FMA steps; reads U8/U9 and U12/U13 loaded beforehand */
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
+
+    xvld     U1,   A0,    0x20
+    xvfmadd.d  D4,  U8, U13, D4
+    xvfmadd.d  D5,  U9, U13, D5
+
+    xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
+    /* first step done: move A0 by 8 doubles and B0 by 2 doubles */
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x10
+    /* second step: consume U0/U1 and U4/U5 while refilling U8/U9 and U12/U13 */
+    xvld     U8,   A0,    0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+
+    xvld     U9,   A0,    0x20
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
+
+    xvldrepl.d U12,  B0, 0x00
+    xvldrepl.d U13,  B0, 0x08
+
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x10
+.endm
+
+.macro KERNEL2x8x2_END /* last two 8x2 steps: same first half as KERNEL2x8x2, no refill after it */
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
+
+    xvld     U1,   A0,    0x20
+    xvfmadd.d  D4,  U8, U13, D4
+    xvfmadd.d  D5,  U9, U13, D5
+
+    xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
+
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x10
+    /* final step: FMAs on U0/U1 and U4/U5 only; no loads or pointer updates follow */
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
+.endm
+
+.macro KERNEL8x8x2 /* 8 steps: four back-to-back KERNEL2x8x2 expansions */
+.rept 4
+    KERNEL2x8x2
+.endr
+.endm
+
+.macro KERNEL8x8x2_END /* 8 steps: three KERNEL2x8x2, then the non-reloading KERNEL2x8x2_END */
+.rept 3
+    KERNEL2x8x2
+ .endr
+    KERNEL2x8x2_END
+.endm
+
+.macro KERNEL2x4x2 /* two pipelined 4x2 FMA steps; reads U8 and U12/U13 loaded beforehand */
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D4,  U8, U13, D4
+
+    xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
+    /* first step done: move A0 by 4 doubles and B0 by 2 doubles */
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x10
+    /* second step: consume U0 and U4/U5 while refilling U8 and U12/U13 */
+    xvld     U8,   A0,    0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D4,  U0, U5, D4
+
+    xvldrepl.d U12,  B0, 0x00
+    xvldrepl.d U13,  B0, 0x08
+
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x10
+.endm
+
+.macro KERNEL2x4x2_END /* last two 4x2 steps: same first half as KERNEL2x4x2, no refill after it */
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D4,  U8, U13, D4
+
+    xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
+
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x10
+    /* final step: FMAs on U0 and U4/U5 only; no loads or pointer updates follow */
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D4,  U0, U5, D4
+.endm
+
+.macro KERNEL8x4x2 /* 8 steps: four back-to-back KERNEL2x4x2 expansions */
+.rept 4
+    KERNEL2x4x2
+.endr
+.endm
+
+.macro KERNEL8x4x2_END /* 8 steps: three KERNEL2x4x2, then the non-reloading KERNEL2x4x2_END */
+.rept 3
+    KERNEL2x4x2
+.endr
+    KERNEL2x4x2_END
+.endm
+
+.macro KERNEL2x2x2 /* two pipelined 2x2 FMA steps; reads U8 and U12/U13 loaded beforehand */
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D4,  U8, U13, D4
+    /* NOTE(review): xvld reads a full 256-bit vector although A0 advances by only 0x10; presumably only the low lanes are used -- confirm */
+    xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
+
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x10
+    /* second step: consume U0 and U4/U5 while refilling U8 and U12/U13 */
+    xvld     U8,   A0,    0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D4,  U0, U5, D4
+
+    xvldrepl.d U12,  B0, 0x00
+    xvldrepl.d U13,  B0, 0x08
+
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x10
+.endm
+
+.macro KERNEL2x2x2_END /* last two 2x2 steps: same first half as KERNEL2x2x2, no refill after it */
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D4,  U8, U13, D4
+
+    xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
+
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x10
+    /* final step: FMAs on U0 and U4/U5 only; no loads or pointer updates follow */
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D4,  U0, U5, D4
+.endm
+
+.macro KERNEL8x2x2 /* 8 steps: four back-to-back KERNEL2x2x2 expansions */
+.rept 4
+    KERNEL2x2x2
+.endr
+.endm
+
+.macro KERNEL8x2x2_END /* 8 steps: three KERNEL2x2x2, then the non-reloading KERNEL2x2x2_END */
+.rept 3
+    KERNEL2x2x2
+.endr
+    KERNEL2x2x2_END
+.endm
+
+.macro KERNEL2x1x2 /* two pipelined 1x2 FMA steps; reads U8 and U12/U13 loaded beforehand */
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D4,  U8, U13, D4
+    /* NOTE(review): xvld reads a full 256-bit vector although A0 advances by only 0x08; presumably only lane 0 is used -- confirm */
+    xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
+
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x10
+    /* second step: consume U0 and U4/U5 while refilling U8 and U12/U13 */
+    xvld     U8,   A0,    0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D4,  U0, U5, D4
+
+    xvldrepl.d U12,  B0, 0x00
+    xvldrepl.d U13,  B0, 0x08
+
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x10
+.endm
+
+.macro KERNEL2x1x2_END /* last two 1x2 steps: same first half as KERNEL2x1x2, no refill after it */
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D4,  U8, U13, D4
+
+    xvldrepl.d U4,  B0, 0x00
+    xvldrepl.d U5,  B0, 0x08
+
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x10
+    /* final step: FMAs on U0 and U4/U5 only; no loads or pointer updates follow */
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D4,  U0, U5, D4
+.endm
+
+.macro KERNEL8x1x2 /* 8 steps: four back-to-back KERNEL2x1x2 expansions */
+.rept 4
+    KERNEL2x1x2
+.endr
+.endm
+
+.macro KERNEL8x1x2_END /* 8 steps: three KERNEL2x1x2, then the non-reloading KERNEL2x1x2_END */
+.rept 3
+    KERNEL2x1x2
+.endr
+    KERNEL2x1x2_END
+.endm
+
+.macro KERNEL2x16x1 /* two pipelined 16x1 FMA steps; reads U8-U11 and U12 loaded beforehand */
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
+
+    xvld     U1,   A0,    0x20
+    xvfmadd.d  D2,  U10, U12, D2
+    xvfmadd.d  D3,  U11, U12, D3
+
+    xvld     U2,   A0,    0x40
+    xvld     U3,   A0,    0x60
+
+    xvldrepl.d U4,  B0, 0x00
+    /* first step done: move A0 by 16 doubles and B0 by 1 double */
+    addi.d     A0,  A0, 0x80
+    addi.d     B0,  B0, 0x08
+    /* second step: consume U0-U3 and U4 while refilling U8-U11 and U12 */
+    xvld     U8,   A0,    0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+
+    xvld     U9,   A0,    0x20
+    xvfmadd.d  D2,  U2, U4, D2
+    xvfmadd.d  D3,  U3, U4, D3
+
+    xvld     U10,   A0,    0x40
+    xvld     U11,   A0,    0x60
+
+    xvldrepl.d U12,  B0, 0x00
+
+    addi.d     A0,  A0, 0x80
+    addi.d     B0,  B0, 0x08
+.endm
+
+.macro KERNEL2x16x1_END /* last two 16x1 steps: same first half as KERNEL2x16x1, no refill after it */
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
+
+    xvld     U1,   A0,    0x20
+    xvfmadd.d  D2,  U10, U12, D2
+    xvfmadd.d  D3,  U11, U12, D3
+
+    xvld     U2,   A0,    0x40
+    xvld     U3,   A0,    0x60
+
+    xvldrepl.d U4,  B0, 0x00
+
+    addi.d     A0,  A0, 0x80
+    addi.d     B0,  B0, 0x08
+    /* final step: FMAs on U0-U3 and U4 only; no loads or pointer updates follow */
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+
+    xvfmadd.d  D2,  U2, U4, D2
+    xvfmadd.d  D3,  U3, U4, D3
+.endm
+
+.macro KERNEL8x16x1 /* 8 steps: four back-to-back KERNEL2x16x1 expansions */
+.rept 4
+    KERNEL2x16x1
+.endr
+.endm
+
+.macro KERNEL8x16x1_END /* 8 steps: three KERNEL2x16x1, then the non-reloading KERNEL2x16x1_END */
+.rept 3
+    KERNEL2x16x1
+.endr
+    KERNEL2x16x1_END
+.endm
+
+.macro KERNEL2x8x1 /* two pipelined 8x1 FMA steps; reads U8/U9 and U12 loaded beforehand */
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
+    xvld     U1,   A0,    0x20
+    xvldrepl.d U4,  B0, 0x00
+    /* first step done: move A0 by 8 doubles and B0 by 1 double */
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x08
+    /* second step: consume U0/U1 and U4 while refilling U8/U9 and U12 */
+    xvld     U8,   A0,    0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+    xvld     U9,   A0,    0x20
+    xvldrepl.d U12,  B0, 0x00
+
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x08
+.endm
+
+.macro KERNEL2x8x1_END /* last two 8x1 steps: same first half as KERNEL2x8x1, no refill after it */
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvfmadd.d  D1,  U9, U12, D1
+    xvld     U1,   A0,    0x20
+    xvldrepl.d U4,  B0, 0x00
+
+    addi.d     A0,  A0, 0x40
+    addi.d     B0,  B0, 0x08
+    /* final step: FMAs on U0/U1 and U4 only; no loads or pointer updates follow */
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+.endm
+
+.macro KERNEL8x8x1 /* 8 steps: four back-to-back KERNEL2x8x1 expansions */
+.rept 4
+    KERNEL2x8x1
+.endr
+.endm
+
+.macro KERNEL8x8x1_END /* 8 steps: three KERNEL2x8x1, then the non-reloading KERNEL2x8x1_END */
+.rept 3
+    KERNEL2x8x1
+.endr
+    KERNEL2x8x1_END
+.endm
+
+.macro KERNEL2x4x1 /* two pipelined 4x1 FMA steps; reads U8 and U12 loaded beforehand */
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvldrepl.d U4,  B0, 0x00
+    /* first step done: move A0 by 4 doubles and B0 by 1 double */
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x08
+    /* second step: consume U0/U4 while refilling U8/U12 */
+    xvld     U8,   A0,    0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvldrepl.d U12,  B0, 0x00
+
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x08
+.endm
+
+.macro KERNEL2x4x1_END /* last two 4x1 steps: same first half as KERNEL2x4x1, no refill after it */
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvldrepl.d U4,  B0, 0x00
+
+    addi.d     A0,  A0, 0x20
+    addi.d     B0,  B0, 0x08
+    /* final step: one FMA on U0/U4; no loads or pointer updates follow */
+    xvfmadd.d  D0,  U0, U4, D0
+.endm
+
+.macro KERNEL8x4x1 /* 8 steps: four back-to-back KERNEL2x4x1 expansions */
+.rept 4
+    KERNEL2x4x1
+.endr
+.endm
+
+.macro KERNEL8x4x1_END /* 8 steps: three KERNEL2x4x1, then the non-reloading KERNEL2x4x1_END */
+.rept 3
+    KERNEL2x4x1
+.endr
+    KERNEL2x4x1_END
+.endm
+
+.macro KERNEL2x2x1 /* two pipelined 2x1 FMA steps; reads U8 and U12 loaded beforehand */
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvldrepl.d U4,  B0, 0x00
+    /* NOTE(review): xvld reads a full 256-bit vector although A0 advances by only 0x10; presumably only the low lanes are used -- confirm */
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x08
+    /* second step: consume U0/U4 while refilling U8/U12 */
+    xvld     U8,   A0,    0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvldrepl.d U12,  B0, 0x00
+
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x08
+.endm
+
+.macro KERNEL2x2x1_END /* last two 2x1 steps: same first half as KERNEL2x2x1, no refill after it */
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvldrepl.d U4,  B0, 0x00
+
+    addi.d     A0,  A0, 0x10
+    addi.d     B0,  B0, 0x08
+    /* final step: one FMA on U0/U4; no loads or pointer updates follow */
+    xvfmadd.d  D0,  U0, U4, D0
+.endm
+
+.macro KERNEL8x2x1 /* 8 steps: four back-to-back KERNEL2x2x1 expansions */
+.rept 4
+    KERNEL2x2x1
+.endr
+.endm
+
+.macro KERNEL8x2x1_END /* 8 steps: three KERNEL2x2x1, then the non-reloading KERNEL2x2x1_END */
+.rept 3
+    KERNEL2x2x1
+.endr
+    KERNEL2x2x1_END
+.endm
+
+.macro KERNEL2x1x1 /* two pipelined 1x1 FMA steps; reads U8 and U12 loaded beforehand */
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvldrepl.d U4,  B0, 0x00
+    /* NOTE(review): xvld reads a full 256-bit vector although A0 advances by only 0x08; presumably only lane 0 is used -- confirm */
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x08
+    /* second step: consume U0/U4 while refilling U8/U12 */
+    xvld     U8,   A0,    0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvldrepl.d U12,  B0, 0x00
+
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x08
+.endm
+
+.macro KERNEL2x1x1_END /* last two 1x1 steps: same first half as KERNEL2x1x1, no refill after it */
+    xvld     U0,   A0,    0x00
+    xvfmadd.d  D0,  U8, U12, D0
+    xvldrepl.d U4,  B0, 0x00
+
+    addi.d     A0,  A0, 0x08
+    addi.d     B0,  B0, 0x08
+    /* final step: one FMA on U0/U4; no loads or pointer updates follow */
+    xvfmadd.d  D0,  U0, U4, D0
+.endm
+
+.macro KERNEL8x1x1 /* 8 steps: four back-to-back KERNEL2x1x1 expansions */
+.rept 4
+    KERNEL2x1x1
+.endr
+.endm
+
+.macro KERNEL8x1x1_END /* 8 steps: three KERNEL2x1x1, then the non-reloading KERNEL2x1x1_END */
+.rept 3
+    KERNEL2x1x1
+.endr
+    KERNEL2x1x1_END
+.endm
+
+.macro dgemm_16x4 /* D0-D15 = 16x4 products of A0/B0 over L steps, then U0-U15 = C - D; nothing is stored here */
+.L_dgemm_16x4: // See dgemm_kernel_16x4.S
+    xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
+    xvld     U2,   A0,    0x40
+    xvld     U3,   A0,    0x60
+    /* the first step below runs unconditionally (xvfmul initialises D0-D15), so L >= 1 is assumed on entry */
+    xvldrepl.d     U4, B0, 0x00
+    /* line 1 */
+    xvfmul.d  D0,  U0, U4
+    xvfmul.d  D1,  U1, U4
+    xvfmul.d  D2,  U2, U4
+    xvfmul.d  D3,  U3, U4
+
+    xvldrepl.d     U5, B0, 0x08
+    /* line 2 */
+    xvfmul.d  D4,  U0, U5
+    xvfmul.d  D5,  U1, U5
+    xvfmul.d  D6,  U2, U5
+    xvfmul.d  D7,  U3, U5
+
+    xvldrepl.d     U6, B0, 0x10
+    /* line 3 */
+    xvfmul.d  D8,  U0, U6
+    xvfmul.d  D9,  U1, U6
+    xvfmul.d  D10, U2, U6
+    xvfmul.d  D11, U3, U6
+
+    xvldrepl.d     U7, B0, 0x18
+    /* line 4 */
+    xvfmul.d  D12, U0, U7
+    xvfmul.d  D13, U1, U7
+    xvfmul.d  D14, U2, U7
+    xvfmul.d  D15, U3, U7
+
+    /* Add stride for A0 and B0 */
+    PTR_ADDI    A0,  A0, 0x80
+    PTR_ADDI    B0,  B0, 0x20
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_L7 */
+    beq       ZERO,TL, .L_dgemm_16x4_L7
+    /* preload U8-U11 / U12-U15, which the pipelined KERNEL8x16x4 body reads first */
+    xvld     U8,   A0,    0x00
+    xvld     U9,   A0,    0x20
+    xvld     U10,  A0,    0x40
+    xvld     U11,  A0,    0x60
+
+    PTR_ADDI    TL,  TL,  -1
+
+    xvldrepl.d  U12,   B0,  0x00
+    xvldrepl.d  U13,   B0,  0x08
+    xvldrepl.d  U14,   B0,  0x10
+    xvldrepl.d  U15,   B0,  0x18
+    PTR_ADDI     A0,  A0, 0x80
+    PTR_ADDI     B0,  B0, 0x20
+    /* TL now counts the 8-step groups minus one; the last group always uses the _END form */
+    beq    ZERO,    TL,  .L_dgemm_16x4_TL1_END
+.align 5
+.L_dgemm_16x4_TL1:
+    KERNEL8x16x4
+    PTR_ADDI  TL,    TL,     -1
+    blt     ZERO,  TL,     .L_dgemm_16x4_TL1
+.L_dgemm_16x4_TL1_END:
+    KERNEL8x16x4_END
+.L_dgemm_16x4_L7:
+    andi      TL,  L,    7 /* remaining steps after the 8-step groups */
+    beq       TL,  ZERO, .L_dgemm_16x4_L0
+.align 5
+.L_dgemm_16x4_L71:
+    xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
+    xvld     U2,   A0,    0x40
+    xvld     U3,   A0,    0x60
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+    xvfmadd.d  D2,  U2, U4, D2
+    xvfmadd.d  D3,  U3, U4, D3
+
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
+    xvfmadd.d  D6,  U2, U5, D6
+    xvfmadd.d  D7,  U3, U5, D7
+
+    xvldrepl.d U6,  B0, 0x10
+    xvfmadd.d  D8,  U0, U6, D8
+    xvfmadd.d  D9,  U1, U6, D9
+    xvfmadd.d  D10, U2, U6, D10
+    xvfmadd.d  D11, U3, U6, D11
+
+    xvldrepl.d U7,  B0, 0x18
+    xvfmadd.d  D12, U0, U7, D12
+    xvfmadd.d  D13, U1, U7, D13
+    xvfmadd.d  D14, U2, U7, D14
+    xvfmadd.d  D15, U3, U7, D15
+
+    PTR_ADDI     A0,  A0, 0x80
+    PTR_ADDI     B0,  B0, 0x20
+
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_dgemm_16x4_L71
+.L_dgemm_16x4_L0:
+    // Load C (16 doubles from each of C0-C3), then subtract the products; GLD/GSUB are defined elsewhere, presumably expanding to xvld / xvfsub.d
+    GLD xv, , U0,  C0, 0x00, U1,  C0, 0x20, U2,  C0, 0x40, U3,  C0, 0x60
+    GLD xv, , U4,  C1, 0x00, U5,  C1, 0x20, U6,  C1, 0x40, U7,  C1, 0x60
+    GLD xv, , U8,  C2, 0x00, U9,  C2, 0x20, U10, C2, 0x40, U11, C2, 0x60
+    GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60
+    GSUB xvf, d, U0,  U0,  D0,  U1,  U1,  D1,  U2,  U2,  D2,  U3,  U3,  D3,  \
+                 U4,  U4,  D4,  U5,  U5,  D5,  U6,  U6,  D6,  U7,  U7,  D7,  \
+                 U8,  U8,  D8,  U9,  U9,  D9,  U10, U10, D10, U11, U11, D11, \
+                 U12, U12, D12, U13, U13, D13, U14, U14, D14, U15, U15, D15
+.endm
+
+.macro dgemm_1x4 /* D0 = 1x4 products of A0/B0 over L steps (L >= 1 assumed), then U0 = C - D0; nothing is stored here */
+.L_dgemm_1x4: // See dgemm_kernel_16x4.S
+    xvldrepl.d  U0,     A0,     0x00
+    xvld        U4,     B0,     0x00
+    xvfmul.d    D0,     U0,     U4
+
+    /* Add stride for A0 and B0 */
+    PTR_ADDI    A0,  A0, 0x08
+    PTR_ADDI    B0,  B0, 0x20
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_M1_L7 */
+    beq       ZERO,TL, .L_dgemm_1x4_M1_L7
+    xvldrepl.d  U8,     A0,     0x00 /* preload U8/U12 for the pipelined KERNEL8x1x4 body */
+
+    PTR_ADDI     TL,  TL,  -1
+    xvld       U12, B0,  0x00
+    PTR_ADDI     A0,  A0,  0x08
+    PTR_ADDI     B0,  B0,  0x20
+    /* TL now counts the 8-step groups minus one; the last group always uses the _END form */
+    beq    ZERO,    TL,  .L_dgemm_1x4_M1_TL1_END
+.align 5
+.L_dgemm_1x4_M1_TL1:
+    KERNEL8x1x4
+    PTR_ADDI  TL,     TL,     -1
+    blt     ZERO,TL, .L_dgemm_1x4_M1_TL1
+.L_dgemm_1x4_M1_TL1_END:
+    KERNEL8x1x4_END
+.L_dgemm_1x4_M1_L7:
+    /* if (!(L & 7)) goto L_M1_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_dgemm_1x4_M1_L0
+.align 5
+.L_dgemm_1x4_M1_L71:
+    xvldrepl.d  U0,     A0,     0x00
+    xvld        U4,     B0,     0x00
+    xvfmadd.d   D0,     U0,     U4,    D0
+
+    /* Add stride for A0, B0 */
+    PTR_ADDI     A0,  A0, 0x08
+    PTR_ADDI     B0,  B0, 0x20
+
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_dgemm_1x4_M1_L71
+.L_dgemm_1x4_M1_L0:
+    // Load one double from each of C0-C3, then gather them into lanes 0-3 of U0
+    // NOTE(review): this relies on U0-U3 naming the same registers as $f0-$f3 -- confirm against the register defines
+    fld.d       $f0,    C0,     0x00
+    fld.d       $f1,    C1,     0x00
+    fld.d       $f2,    C2,     0x00
+    fld.d       $f3,    C3,     0x00
+    xvinsve0.d  U0,     U1,     0x01
+    xvinsve0.d  U0,     U2,     0x02
+    xvinsve0.d  U0,     U3,     0x03
+    GSUB xvf, d, U0, U0, D0
+.endm
+
+.macro dgemm_2x4 /* D0/D1 = 2x4 products of A0/B0 over L steps (L >= 1 assumed), then U0/U1 = C - products; nothing is stored here */
+.L_dgemm_2x4:
+    /* Load 2 * 64 from A0 */
+    xvldrepl.d     U0,   A0,    0x00
+    xvldrepl.d     U1,   A0,    0x08
+    xvld    U4,     B0,     0x00
+    xvfmul.d    D0,     U0,     U4
+    xvfmul.d    D1,     U1,     U4
+    /* Add stride for A0 and B0 */
+    PTR_ADDI    A0,  A0, 0x10
+    PTR_ADDI    B0,  B0, 0x20
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_M2_L7 */
+    beq       ZERO,TL, .L_dgemm_2x4_M2_L7
+    /* preload U8/U9 and U12, which the pipelined KERNEL8x2x4 body reads first */
+    xvldrepl.d     U8,   A0,    0x00
+    xvldrepl.d     U9,   A0,    0x08
+
+    PTR_ADDI    TL,  TL,  -1
+
+    xvld       U12, B0, 0x00
+    PTR_ADDI     A0,  A0, 0x10
+    PTR_ADDI     B0,  B0, 0x20
+    /* TL now counts the 8-step groups minus one; the last group always uses the _END form */
+    beq    ZERO,    TL,  .L_dgemm_2x4_M2_TL1_END
+.align 5
+.L_dgemm_2x4_M2_TL1:
+    KERNEL8x2x4
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_dgemm_2x4_M2_TL1
+.L_dgemm_2x4_M2_TL1_END:
+    KERNEL8x2x4_END
+
+.L_dgemm_2x4_M2_L7:
+    /* if (!(L & 7)) goto L_M2_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_dgemm_2x4_M2_L0
+.align 5
+.L_dgemm_2x4_M2_L71:
+    xvldrepl.d     U0,   A0,    0x00
+    xvldrepl.d     U1,   A0,    0x08
+
+    xvld    U4,  B0, 0x00
+
+    xvfmadd.d   D0,     U0,     U4,    D0
+    xvfmadd.d   D1,     U1,     U4,    D1
+    /* Add stride for A0, B0 */
+    PTR_ADDI     A0,  A0, 0x10
+    PTR_ADDI     B0,  B0, 0x20
+
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_dgemm_2x4_M2_L71
+.L_dgemm_2x4_M2_L0:
+    xvpackev.d  D4,     D1,     D0 /* D4 = even-indexed lanes of D0 and D1, interleaved */
+    xvpackod.d  D5,     D1,     D0 /* D5 = odd-indexed lanes of D0 and D1, interleaved */
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+    /* Load C2  */
+    xvld      U2,  C2,  0x00
+    /* Load C3  */
+    xvld      U3,  C3,  0x00
+    /* NOTE(review): presumably keeps the low half of U0 (U1) and puts the low half of U2 (U3) in its high half -- confirm xvpermi.q 0x02 */
+    xvpermi.q   U0, U2, 0x02
+    xvpermi.q   U1, U3, 0x02
+
+    GSUB xvf, d, U0, U0, D4, U1, U1, D5
+.endm
+
+.macro dgemm_4x4
+.L_dgemm_4x4: /* 4x4 tile: D0/D4/D8/D12 accumulate A[0..3]*b0..b3 over L k-steps, then are subtracted from C0..C3. Requires L >= 1. */
+    /* Load 4 * 64 from A0 (one k-step of 4 rows) */
+    xvld     U0,   A0,    0x00
+
+    xvldrepl.d     U4, B0, 0x00
+    /* line 1: column 0; fmul on the first k-step so the accumulator needs no zeroing */
+    xvfmul.d  D0,  U0, U4
+
+    xvldrepl.d     U5, B0, 0x08
+    /* line 2: column 1 */
+    xvfmul.d  D4,  U0, U5
+
+    xvldrepl.d     U6, B0, 0x10
+    /* line 3: column 2 */
+    xvfmul.d  D8,  U0, U6
+
+    xvldrepl.d     U7, B0, 0x18
+    /* line 4: column 3 */
+    xvfmul.d  D12, U0, U7
+
+    /* Add stride for A0 and B0: 4 doubles each per k-step */
+    PTR_ADDI    A0,  A0, 0x20
+    PTR_ADDI    B0,  B0, 0x20
+    /* Reduce L: one k-step is done; L stays decremented for the rest of the macro */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (original L - 1) >> 3 = number of 8-step groups left */
+    /* if (TL == 0) goto L_M4_L7; a negative TL (L == 0 on entry) is not caught by this test */
+    beq       ZERO,TL, .L_dgemm_4x4_M4_L7
+    /* Preload the next k-step into U8 and U12..U15 (presumably consumed by KERNEL8x4x4 -- confirm) */
+    xvld     U8,   A0,    0x00
+
+    PTR_ADDI    TL,  TL,  -1  /* the last group is run by KERNEL8x4x4_END, not by the loop */
+
+    xvldrepl.d  U12,   B0,  0x00
+    xvldrepl.d  U13,   B0,  0x08
+    xvldrepl.d  U14,   B0,  0x10
+    xvldrepl.d  U15,   B0,  0x18
+    PTR_ADDI     A0,  A0, 0x20
+    PTR_ADDI     B0,  B0, 0x20
+
+    beq    ZERO,    TL,  .L_dgemm_4x4_M4_TL1_END  /* exactly one group: skip the loop */
+.align 5
+.L_dgemm_4x4_M4_TL1: /* KERNEL8x4x4 is defined elsewhere; presumably 8 unrolled k-steps -- confirm */
+    KERNEL8x4x4
+
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_dgemm_4x4_M4_TL1
+.L_dgemm_4x4_M4_TL1_END: /* KERNEL8x4x4_END is defined elsewhere; presumably the final group -- confirm */
+    KERNEL8x4x4_END
+.L_dgemm_4x4_M4_L7:
+    /* if (!(L & 7)) goto L_M4_L0; otherwise TL = leftover k-steps (1..7) */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_dgemm_4x4_M4_L0
+.align 5
+.L_dgemm_4x4_M4_L71: /* remainder loop: one k-step per iteration, U4 reused for each B value */
+    xvld     U0,   A0,    0x00
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U0, U4, D0
+
+    xvldrepl.d U4,  B0, 0x08
+    xvfmadd.d  D4,  U0, U4, D4
+
+    xvldrepl.d U4,  B0, 0x10
+    xvfmadd.d  D8,  U0, U4, D8
+
+    xvldrepl.d U4,  B0, 0x18
+    xvfmadd.d  D12, U0, U4, D12
+
+    /* Add stride for A0, B0 */
+    PTR_ADDI     A0,  A0, 0x20
+    PTR_ADDI     B0,  B0, 0x20
+
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_dgemm_4x4_M4_L71
+ .L_dgemm_4x4_M4_L0:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1  */
+    xvld      U1,  C1,  0x00
+    /* Load C2  */
+    xvld      U2,  C2,  0x00
+    /* Load C3  */
+    xvld      U3,  C3,  0x00
+    /* GSUB is defined elsewhere; presumably Un = Un - D(4n) for each column -- confirm */
+    GSUB xvf, d, U0, U0, D0, U1, U1, D4, U2, U2, D8, U3, U3, D12
+.endm
+
+.macro dgemm_8x4
+.L_dgemm_8x4: /* 8x4 tile: two accumulators per column (D0/D1, D4/D5, D8/D9, D12/D13) over L k-steps, then subtracted from C0..C3. Requires L >= 1. */
+    /* Load 8 * 64 from A0 (one k-step of 8 rows: U0 = rows 0-3, U1 = rows 4-7) */
+    xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
+
+    xvldrepl.d     U4, B0, 0x00
+    /* line 1: column 0; fmul on the first k-step so the accumulators need no zeroing */
+    xvfmul.d  D0,  U0, U4
+    xvfmul.d  D1,  U1, U4
+
+    xvldrepl.d     U5, B0, 0x08
+    /* line 2: column 1 */
+    xvfmul.d  D4,  U0, U5
+    xvfmul.d  D5,  U1, U5
+
+    xvldrepl.d     U6, B0, 0x10
+    /* line 3: column 2 */
+    xvfmul.d  D8,  U0, U6
+    xvfmul.d  D9,  U1, U6
+
+    xvldrepl.d     U7, B0, 0x18
+    /* line 4: column 3 */
+    xvfmul.d  D12, U0, U7
+    xvfmul.d  D13, U1, U7
+
+    /* Add stride for A0 and B0: 8 doubles of A and 4 doubles of B per k-step */
+    PTR_ADDI    A0,  A0, 0x40
+    PTR_ADDI    B0,  B0, 0x20
+    /* Reduce L: one k-step is done; L stays decremented for the rest of the macro */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (original L - 1) >> 3 = number of 8-step groups left */
+    /* if (TL == 0) goto L_M8_L7; a negative TL (L == 0 on entry) is not caught by this test */
+    beq       ZERO,TL, .L_dgemm_8x4_M8_L7
+    /* Preload the next k-step into U8/U9 and U12..U15 (presumably consumed by KERNEL8x8x4 -- confirm) */
+    xvld     U8,   A0,    0x00
+    xvld     U9,   A0,    0x20
+
+    PTR_ADDI    TL,  TL,  -1  /* the last group is run by KERNEL8x8x4_END, not by the loop */
+
+    xvldrepl.d  U12,   B0,  0x00
+    xvldrepl.d  U13,   B0,  0x08
+    xvldrepl.d  U14,   B0,  0x10
+    xvldrepl.d  U15,   B0,  0x18
+    PTR_ADDI     A0,  A0, 0x40
+    PTR_ADDI     B0,  B0, 0x20
+
+    beq    ZERO,    TL,  .L_dgemm_8x4_M8_TL1_END  /* exactly one group: skip the loop */
+.align 5
+.L_dgemm_8x4_M8_TL1: /* KERNEL8x8x4 is defined elsewhere; presumably 8 unrolled k-steps -- confirm */
+    KERNEL8x8x4
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_dgemm_8x4_M8_TL1
+
+.L_dgemm_8x4_M8_TL1_END: /* KERNEL8x8x4_END is defined elsewhere; presumably the final group -- confirm */
+    KERNEL8x8x4_END
+
+.L_dgemm_8x4_M8_L7:
+    /* if (!(L & 7)) goto L_M8_L0; otherwise TL = leftover k-steps (1..7) */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_dgemm_8x4_M8_L0
+.align 5
+.L_dgemm_8x4_M8_L71: /* remainder loop: one k-step per iteration */
+    xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
+
+    xvldrepl.d U6,  B0, 0x10
+    xvfmadd.d  D8,  U0, U6, D8
+    xvfmadd.d  D9,  U1, U6, D9
+
+    xvldrepl.d U7,  B0, 0x18
+    xvfmadd.d  D12, U0, U7, D12
+    xvfmadd.d  D13, U1, U7, D13
+
+    /* Add stride for A0, B0 */
+    PTR_ADDI     A0,  A0, 0x40
+    PTR_ADDI     B0,  B0, 0x20
+
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_dgemm_8x4_M8_L71
+.L_dgemm_8x4_M8_L0:
+    /* Load C0 (8 doubles: rows 0-3 and rows 4-7) */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+
+    /* Load C1  */
+    xvld      U2,  C1,  0x00
+    xvld      U3,  C1,  0x20
+
+    /* Load C2  */
+    xvld      U4,  C2,  0x00
+    xvld      U5,  C2,  0x20
+
+    /* Load C3  */
+    xvld      U6,  C3,  0x00
+    xvld      U7,  C3,  0x20
+    /* GSUB is defined elsewhere; presumably each U = U - D pairwise as listed -- confirm */
+    GSUB xvf, d, U0, U0, D0,  U1, U1, D1, \
+                 U2, U2, D4,  U3, U3, D5, \
+                 U4, U4, D8,  U5, U5, D9, \
+                 U6, U6, D12, U7, U7, D13
+.endm
+
+.macro dgemm_4x2
+.L_dgemm_4x2: /* 4x2 tile: D0/D4 accumulate A[0..3]*b0 and A[0..3]*b1 over L k-steps, then are subtracted from C0/C1. Requires L >= 1. */
+    /* Load 4 * 64 from A0 (one k-step of 4 rows) */
+    xvld     U0,   A0,    0x00
+
+    xvldrepl.d     U4, B0, 0x00
+    /* line 1: column 0; fmul on the first k-step so the accumulator needs no zeroing */
+    xvfmul.d  D0,  U0, U4
+
+    xvldrepl.d     U5, B0, 0x08
+    /* line 2: column 1 */
+    xvfmul.d  D4,  U0, U5
+
+    /* Add stride for A0 and B0: 4 doubles of A and 2 doubles of B per k-step */
+    PTR_ADDI    A0,  A0, 0x20
+    PTR_ADDI    B0,  B0, 0x10
+    /* Reduce L: one k-step is done; L stays decremented for the rest of the macro */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (original L - 1) >> 3 = number of 8-step groups left */
+    /* if (TL == 0) goto L_dgemm_4x2_N3_M4_L7; a negative TL (L == 0 on entry) is not caught */
+    beq       ZERO,TL, .L_dgemm_4x2_N3_M4_L7
+    /* Preload the next k-step into U8/U12/U13 (presumably consumed by KERNEL8x4x2 -- confirm) */
+    xvld     U8,   A0,    0x00
+
+    PTR_ADDI    TL,  TL,  -1  /* the last group is run by KERNEL8x4x2_END, not by the loop */
+
+    xvldrepl.d  U12,   B0,  0x00
+    xvldrepl.d  U13,   B0,  0x08
+    PTR_ADDI     A0,  A0, 0x20
+    PTR_ADDI     B0,  B0, 0x10
+
+    beq    ZERO,    TL,  .L_dgemm_4x2_N3_M4_TL1_END  /* exactly one group: skip the loop */
+.align 5
+.L_dgemm_4x2_N3_M4_TL1: /* KERNEL8x4x2 is defined elsewhere; presumably 8 unrolled k-steps -- confirm */
+    KERNEL8x4x2
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_dgemm_4x2_N3_M4_TL1
+.L_dgemm_4x2_N3_M4_TL1_END: /* KERNEL8x4x2_END is defined elsewhere; presumably the final group -- confirm */
+    KERNEL8x4x2_END
+
+.L_dgemm_4x2_N3_M4_L7:
+    /* if (!(L & 7)) goto L_dgemm_4x2_N3_M4_L0; otherwise TL = leftover k-steps (1..7) */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_dgemm_4x2_N3_M4_L0
+.align 5
+.L_dgemm_4x2_N3_M4_L71: /* remainder loop: one k-step per iteration */
+    xvld     U0,   A0,    0x00
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U0, U4, D0
+
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U0, U5, D4
+
+    /* Add stride for A0, B0 */
+    PTR_ADDI     A0,  A0, 0x20
+    PTR_ADDI     B0,  B0, 0x10
+
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_dgemm_4x2_N3_M4_L71
+
+.L_dgemm_4x2_N3_M4_L0:
+    /* Load C0  */
+    xvld      U0,  C0,  0x00
+    /* Load C1; GSUB below is defined elsewhere, presumably U0 -= D0 and U1 -= D4 -- confirm */
+    xvld      U1,  C1,  0x00
+    GSUB xvf, d, U0, U0, D0, U1, U1, D4
+.endm
+
+.macro dgemm_2x2
+.L_dgemm_2x2: /* 2x2 tile: D0/D4 accumulate A*b0 and A*b1 over L k-steps, then are subtracted from C0/C1. Requires L >= 1. */
+    /* Load 2 * 64 from A0; xvld actually reads 4 doubles, only the low 2 lanes belong to this k-step */
+    xvld     U0,   A0,    0x00
+
+    xvldrepl.d     U4, B0, 0x00
+    /* line 1: column 0; fmul on the first k-step so the accumulator needs no zeroing */
+    xvfmul.d  D0,  U0, U4
+
+    xvldrepl.d     U4, B0, 0x08
+    /* line 2: column 1 (U4 is reused for the second B value) */
+    xvfmul.d  D4,  U0, U4
+
+    /* Add stride for A0 and B0: 2 doubles each per k-step */
+    PTR_ADDI    A0,  A0, 0x10
+    PTR_ADDI    B0,  B0, 0x10
+    /* Reduce L: one k-step is done; L stays decremented for the rest of the macro */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (original L - 1) >> 3 = number of 8-step groups left */
+    /* if (TL == 0) goto L_dgemm_2x2_N3_M2_L7; a negative TL (L == 0 on entry) is not caught */
+    beq       ZERO,TL, .L_dgemm_2x2_N3_M2_L7
+    /* Preload the next k-step into U8/U12/U13 (presumably consumed by KERNEL8x2x2 -- confirm) */
+    xvld     U8,   A0,    0x00
+
+    PTR_ADDI    TL,  TL,  -1  /* the last group is run by KERNEL8x2x2_END, not by the loop */
+
+    xvldrepl.d  U12,   B0,  0x00
+    xvldrepl.d  U13,   B0,  0x08
+    PTR_ADDI     A0,  A0, 0x10
+    PTR_ADDI     B0,  B0, 0x10
+
+    beq    ZERO,    TL,  .L_dgemm_2x2_N3_M2_TL1_END  /* exactly one group: skip the loop */
+.align 5
+.L_dgemm_2x2_N3_M2_TL1: /* KERNEL8x2x2 is defined elsewhere; presumably 8 unrolled k-steps -- confirm */
+    KERNEL8x2x2
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_dgemm_2x2_N3_M2_TL1
+.L_dgemm_2x2_N3_M2_TL1_END: /* KERNEL8x2x2_END is defined elsewhere; presumably the final group -- confirm */
+    KERNEL8x2x2_END
+
+.L_dgemm_2x2_N3_M2_L7:
+    /* if (!(L & 7)) goto L_dgemm_2x2_N3_M2_L0; otherwise TL = leftover k-steps (1..7) */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_dgemm_2x2_N3_M2_L0
+.align 5
+.L_dgemm_2x2_N3_M2_L71: /* remainder loop: one k-step per iteration */
+    xvld     U0,   A0,    0x00
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U0, U4, D0
+
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U0, U5, D4
+
+    /* Add stride for A0, B0 */
+    PTR_ADDI     A0,  A0, 0x10
+    PTR_ADDI     B0,  B0, 0x10
+
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_dgemm_2x2_N3_M2_L71
+.L_dgemm_2x2_N3_M2_L0:
+    /* Load C0 (4 doubles read; only the low 2 lanes correspond to this tile) */
+    xvld      U0,  C0,  0x00
+    /* Load C1; GSUB below is defined elsewhere, presumably U0 -= D0 and U1 -= D4 -- confirm */
+    xvld      U1,  C1,  0x00
+    GSUB xvf, d, U0, U0, D0, U1, U1, D4
+.endm
+
+.macro dgemm_8x2
+.L_dgemm_8x2: /* 8x2 tile: D0/D1 (column 0) and D4/D5 (column 1) accumulate over L k-steps, then are subtracted from C0/C1. Requires L >= 1. */
+    /* Load 8 * 64 from A0 (one k-step of 8 rows: U0 = rows 0-3, U1 = rows 4-7) */
+    xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
+
+    xvldrepl.d     U4, B0, 0x00
+    /* line 1: column 0; fmul on the first k-step so the accumulators need no zeroing */
+    xvfmul.d  D0,  U0, U4
+    xvfmul.d  D1,  U1, U4
+
+    xvldrepl.d     U5, B0, 0x08
+    /* line 2: column 1 */
+    xvfmul.d  D4,  U0, U5
+    xvfmul.d  D5,  U1, U5
+
+    /* Add stride for A0 and B0: 8 doubles of A and 2 doubles of B per k-step */
+    PTR_ADDI    A0,  A0, 0x40
+    PTR_ADDI    B0,  B0, 0x10
+    /* Reduce L: one k-step is done; L stays decremented for the rest of the macro */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (original L - 1) >> 3 = number of 8-step groups left */
+    /* if (TL == 0) goto L_dgemm_8x2_N3_M8_L7; a negative TL (L == 0 on entry) is not caught */
+    beq       ZERO,TL, .L_dgemm_8x2_N3_M8_L7
+    /* Preload the next k-step into U8/U9/U12/U13 (presumably consumed by KERNEL8x8x2 -- confirm) */
+    xvld     U8,   A0,    0x00
+    xvld     U9,   A0,    0x20
+
+    PTR_ADDI    TL,  TL,  -1  /* the last group is run by KERNEL8x8x2_END, not by the loop */
+
+    xvldrepl.d  U12,   B0,  0x00
+    xvldrepl.d  U13,   B0,  0x08
+    PTR_ADDI     A0,  A0, 0x40
+    PTR_ADDI     B0,  B0, 0x10
+
+    beq    ZERO,    TL,  .L_dgemm_8x2_N3_M8_TL1_END  /* exactly one group: skip the loop */
+.align 5
+.L_dgemm_8x2_N3_M8_TL1: /* KERNEL8x8x2 is defined elsewhere; presumably 8 unrolled k-steps -- confirm */
+    KERNEL8x8x2
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_dgemm_8x2_N3_M8_TL1
+.L_dgemm_8x2_N3_M8_TL1_END: /* KERNEL8x8x2_END is defined elsewhere; presumably the final group -- confirm */
+    KERNEL8x8x2_END
+
+.L_dgemm_8x2_N3_M8_L7:
+    /* if (!(L & 7)) goto L_dgemm_8x2_N3_M8_L0; otherwise TL = leftover k-steps (1..7) */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_dgemm_8x2_N3_M8_L0
+.align 5
+.L_dgemm_8x2_N3_M8_L71: /* remainder loop: one k-step per iteration */
+    xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
+
+    /* Add stride for A0, B0 */
+    PTR_ADDI     A0,  A0, 0x40
+    PTR_ADDI     B0,  B0, 0x10
+
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_dgemm_8x2_N3_M8_L71
+
+.L_dgemm_8x2_N3_M8_L0:
+    /* Load C0 (8 doubles: rows 0-3 and rows 4-7) */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+    /* Load C1; GSUB below is defined elsewhere, presumably each U = U - D pairwise -- confirm */
+    xvld      U2,  C1,  0x00
+    xvld      U3,  C1,  0x20
+    GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D4, U3, U3, D5
+.endm
+
+.macro dgemm_16x2
+.L_dgemm_16x2: /* 16x2 tile: D0-D3 (column 0) and D4-D7 (column 1) accumulate over L k-steps, then are subtracted from C0/C1. Requires L >= 1. */
+    /* Load 16 * 64 from A0
+     * U0 = {a3,  a2,  a1,  a0}
+     * U1 = {a7,  a6,  a5,  a4}
+     * U2 = {a11, a10, a9,  a8}
+     * U3 = {a15, a14, a13, a12}
+     */
+    xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
+    xvld     U2,   A0,    0x40
+    xvld     U3,   A0,    0x60
+
+    xvldrepl.d     U4, B0, 0x00
+    /* line 1: column 0; fmul on the first k-step so the accumulators need no zeroing */
+    xvfmul.d  D0,  U0, U4
+    xvfmul.d  D1,  U1, U4
+    xvfmul.d  D2,  U2, U4
+    xvfmul.d  D3,  U3, U4
+
+    xvldrepl.d     U5, B0, 0x08
+    /* line 2: column 1 */
+    xvfmul.d  D4,  U0, U5
+    xvfmul.d  D5,  U1, U5
+    xvfmul.d  D6,  U2, U5
+    xvfmul.d  D7,  U3, U5
+
+    /* Add stride for A0 and B0: 16 doubles of A and 2 doubles of B per k-step */
+    PTR_ADDI    A0,  A0, 0x80
+    PTR_ADDI    B0,  B0, 0x10
+    /* Reduce L: one k-step is done; L stays decremented for the rest of the macro */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (original L - 1) >> 3 = number of 8-step groups left */
+    /* if (TL == 0) goto L_N3_L7; a negative TL (L == 0 on entry) is not caught by this test */
+    beq       ZERO,TL, .L_dgemm_16x2_N3_L7
+    /* Preload the next k-step into U8-U11 and U12/U13 (presumably consumed by KERNEL8x16x2 -- confirm) */
+    xvld     U8,   A0,    0x00
+    xvld     U9,   A0,    0x20
+    xvld     U10,  A0,    0x40
+    xvld     U11,  A0,    0x60
+
+    PTR_ADDI    TL,  TL,  -1  /* the last group is run by KERNEL8x16x2_END, not by the loop */
+
+    xvldrepl.d  U12,   B0,  0x00
+    xvldrepl.d  U13,   B0,  0x08
+    PTR_ADDI     A0,  A0, 0x80
+    PTR_ADDI     B0,  B0, 0x10
+
+    beq    ZERO,    TL,  .L_dgemm_16x2_N3_TL1_END  /* exactly one group: skip the loop */
+.align 5
+.L_dgemm_16x2_N3_TL1: /* KERNEL8x16x2 is defined elsewhere; presumably 8 unrolled k-steps -- confirm */
+    KERNEL8x16x2
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_dgemm_16x2_N3_TL1
+.L_dgemm_16x2_N3_TL1_END: /* KERNEL8x16x2_END is defined elsewhere; presumably the final group -- confirm */
+    KERNEL8x16x2_END
+
+.L_dgemm_16x2_N3_L7:
+    /* if (!(L & 7)) goto L_dgemm_16x2_N3_L0; otherwise TL = leftover k-steps (1..7) */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_dgemm_16x2_N3_L0
+.align 5
+.L_dgemm_16x2_N3_L71: /* remainder loop: one k-step per iteration */
+    /* Load 16 * 64 from A0 */
+    xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
+    xvld     U2,   A0,    0x40
+    xvld     U3,   A0,    0x60
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+    xvfmadd.d  D2,  U2, U4, D2
+    xvfmadd.d  D3,  U3, U4, D3
+
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U0, U5, D4
+    xvfmadd.d  D5,  U1, U5, D5
+    xvfmadd.d  D6,  U2, U5, D6
+    xvfmadd.d  D7,  U3, U5, D7
+    /* Add stride for A0, B0 */
+    PTR_ADDI     A0,  A0, 0x80
+    PTR_ADDI     B0,  B0, 0x10
+
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_dgemm_16x2_N3_L71
+
+.L_dgemm_16x2_N3_L0:
+    /* Load C0 (16 doubles in U0-U3) */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+    xvld      U2,  C0,  0x40
+    xvld      U3,  C0,  0x60
+    /* Load C1 (U4-U7); GSUB below is defined elsewhere, presumably each U = U - D pairwise -- confirm */
+    xvld      U4,  C1,  0x00
+    xvld      U5,  C1,  0x20
+    xvld      U6,  C1,  0x40
+    xvld      U7,  C1,  0x60
+    GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D2, U3, U3, D3, \
+                 U4, U4, D4, U5, U5, D5, U6, U6, D6, U7, U7, D7
+.endm
+
+.macro dgemm_2x1
+.L_dgemm_2x1: /* 2x1 tile: D0 accumulates A*b0 over L k-steps, then is subtracted from C0. Requires L >= 1. */
+    /* Load 2 * 64 from A0; xvld actually reads 4 doubles, only the low 2 lanes belong to this k-step */
+    xvld     U0,   A0,    0x00
+
+    xvldrepl.d     U4, B0, 0x00
+    /* line 1: the single column; fmul on the first k-step so D0 needs no zeroing */
+    xvfmul.d  D0,  U0, U4
+
+    /* Add stride for A0 and B0: 2 doubles of A and 1 double of B per k-step */
+    PTR_ADDI    A0,  A0, 0x10
+    PTR_ADDI    B0,  B0, 0x08
+    /* Reduce L: one k-step is done; L stays decremented for the rest of the macro */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (original L - 1) >> 3 = number of 8-step groups left */
+    /* if (TL == 0) goto L_dgemm_2x1_N1_M2_L7; a negative TL (L == 0 on entry) is not caught */
+    beq       ZERO,TL, .L_dgemm_2x1_N1_M2_L7
+    /* Preload the next k-step into U8/U12 (presumably consumed by KERNEL8x2x1 -- confirm) */
+    xvld     U8,   A0,    0x00
+
+    PTR_ADDI    TL,  TL,  -1  /* the last group is run by KERNEL8x2x1_END, not by the loop */
+
+    xvldrepl.d  U12,   B0,  0x00
+    PTR_ADDI     A0,  A0, 0x10
+    PTR_ADDI     B0,  B0, 0x08
+
+    beq    ZERO,    TL,  .L_dgemm_2x1_N1_M2_TL1_END  /* exactly one group: skip the loop */
+.align 5
+.L_dgemm_2x1_N1_M2_TL1: /* KERNEL8x2x1 is defined elsewhere; presumably 8 unrolled k-steps -- confirm */
+    KERNEL8x2x1
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_dgemm_2x1_N1_M2_TL1
+.L_dgemm_2x1_N1_M2_TL1_END: /* KERNEL8x2x1_END is defined elsewhere; presumably the final group -- confirm */
+    KERNEL8x2x1_END
+
+.L_dgemm_2x1_N1_M2_L7:
+    /* if (!(L & 7)) goto L_dgemm_2x1_N1_M2_L0; otherwise TL = leftover k-steps (1..7) */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_dgemm_2x1_N1_M2_L0
+.align 5
+.L_dgemm_2x1_N1_M2_L71: /* remainder loop: one k-step per iteration */
+    xvld     U0,   A0,    0x00
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U0, U4, D0
+
+    /* Add stride for A0, B0 */
+    PTR_ADDI     A0,  A0, 0x10
+    PTR_ADDI     B0,  B0, 0x08
+
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_dgemm_2x1_N1_M2_L71
+.L_dgemm_2x1_N1_M2_L0:
+    /* Load C0; GSUB below is defined elsewhere, presumably U0 = U0 - D0 -- confirm */
+    xvld      U0,  C0,  0x00
+    GSUB xvf, d, U0, U0, D0
+.endm
+
+.macro dgemm_4x1
+.L_dgemm_4x1: /* 4x1 tile: D0 accumulates A[0..3]*b0 over L k-steps, then is subtracted from C0. Requires L >= 1. */
+    /* Load 4 * 64 from A0 (one k-step of 4 rows) */
+    xvld     U0,   A0,    0x00
+
+    xvldrepl.d     U4, B0, 0x00
+    /* line 1: the single column; fmul on the first k-step so D0 needs no zeroing */
+    xvfmul.d  D0,  U0, U4
+
+    /* Add stride for A0 and B0: 4 doubles of A and 1 double of B per k-step */
+    PTR_ADDI    A0,  A0, 0x20
+    PTR_ADDI    B0,  B0, 0x08
+    /* Reduce L: one k-step is done; L stays decremented for the rest of the macro */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (original L - 1) >> 3 = number of 8-step groups left */
+    /* if (TL == 0) goto L_dgemm_4x1_N1_M4_L7; a negative TL (L == 0 on entry) is not caught */
+    beq       ZERO,TL, .L_dgemm_4x1_N1_M4_L7
+    /* Preload the next k-step into U8/U12 (presumably consumed by KERNEL8x4x1 -- confirm) */
+    xvld     U8,   A0,    0x00
+
+    PTR_ADDI    TL,  TL,  -1  /* the last group is run by KERNEL8x4x1_END, not by the loop */
+
+    xvldrepl.d  U12,   B0,  0x00
+    PTR_ADDI     A0,  A0, 0x20
+    PTR_ADDI     B0,  B0, 0x08
+
+    beq    ZERO,    TL,  .L_dgemm_4x1_N1_M4_TL1_END  /* exactly one group: skip the loop */
+.align 5
+.L_dgemm_4x1_N1_M4_TL1: /* KERNEL8x4x1 is defined elsewhere; presumably 8 unrolled k-steps -- confirm */
+    KERNEL8x4x1
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_dgemm_4x1_N1_M4_TL1
+.L_dgemm_4x1_N1_M4_TL1_END: /* KERNEL8x4x1_END is defined elsewhere; presumably the final group -- confirm */
+    KERNEL8x4x1_END
+
+.L_dgemm_4x1_N1_M4_L7:
+    /* if (!(L & 7)) goto L_dgemm_4x1_N1_M4_L0; otherwise TL = leftover k-steps (1..7) */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_dgemm_4x1_N1_M4_L0
+.align 5
+.L_dgemm_4x1_N1_M4_L71: /* remainder loop: one k-step per iteration */
+    xvld     U0,   A0,    0x00
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U0, U4, D0
+
+    /* Add stride for A0, B0 */
+    PTR_ADDI     A0,  A0, 0x20
+    PTR_ADDI     B0,  B0, 0x08
+
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_dgemm_4x1_N1_M4_L71
+.L_dgemm_4x1_N1_M4_L0:
+    /* Load C0; GSUB below is defined elsewhere, presumably U0 = U0 - D0 -- confirm */
+    xvld      U0,  C0,  0x00
+    GSUB xvf, d, U0, U0, D0
+.endm
+
+.macro dgemm_8x1
+.L_dgemm_8x1: /* 8x1 tile: D0/D1 accumulate A[0..7]*b0 over L k-steps, then are subtracted from C0. Requires L >= 1. */
+    /* Load 8 * 64 from A0 (one k-step of 8 rows: U0 = rows 0-3, U1 = rows 4-7) */
+    xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
+
+    xvldrepl.d     U4, B0, 0x00
+    /* line 1: the single column; fmul on the first k-step so D0/D1 need no zeroing */
+    xvfmul.d  D0,  U0, U4
+    xvfmul.d  D1,  U1, U4
+
+    /* Add stride for A0 and B0: 8 doubles of A and 1 double of B per k-step */
+    PTR_ADDI    A0,  A0, 0x40
+    PTR_ADDI    B0,  B0, 0x08
+    /* Reduce L: one k-step is done; L stays decremented for the rest of the macro */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (original L - 1) >> 3 = number of 8-step groups left */
+    /* if (TL == 0) goto L_dgemm_8x1_N1_M8_L7; a negative TL (L == 0 on entry) is not caught */
+    beq       ZERO,TL, .L_dgemm_8x1_N1_M8_L7
+    /* Preload the next k-step into U8/U9/U12 (presumably consumed by KERNEL8x8x1 -- confirm) */
+    xvld     U8,   A0,    0x00
+    xvld     U9,   A0,    0x20
+
+    PTR_ADDI    TL,  TL,  -1  /* the last group is run by KERNEL8x8x1_END, not by the loop */
+
+    xvldrepl.d  U12,   B0,  0x00
+    PTR_ADDI     A0,  A0, 0x40
+    PTR_ADDI     B0,  B0, 0x08
+
+    beq    ZERO,    TL,  .L_dgemm_8x1_N1_M8_TL1_END  /* exactly one group: skip the loop */
+.align 5
+.L_dgemm_8x1_N1_M8_TL1: /* KERNEL8x8x1 is defined elsewhere; presumably 8 unrolled k-steps -- confirm */
+    KERNEL8x8x1
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_dgemm_8x1_N1_M8_TL1
+
+.L_dgemm_8x1_N1_M8_TL1_END: /* KERNEL8x8x1_END is defined elsewhere; presumably the final group -- confirm */
+    KERNEL8x8x1_END
+
+.L_dgemm_8x1_N1_M8_L7:
+    /* if (!(L & 7)) goto L_dgemm_8x1_N1_M8_L0; otherwise TL = leftover k-steps (1..7) */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_dgemm_8x1_N1_M8_L0
+.align 5
+.L_dgemm_8x1_N1_M8_L71: /* remainder loop: one k-step per iteration */
+    xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+
+    /* Add stride for A0, B0 */
+    PTR_ADDI     A0,  A0, 0x40
+    PTR_ADDI     B0,  B0, 0x08
+
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_dgemm_8x1_N1_M8_L71
+.L_dgemm_8x1_N1_M8_L0:
+    /* Load C0 (8 doubles); GSUB below is defined elsewhere, presumably U0 -= D0 and U1 -= D1 -- confirm */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+    GSUB xvf, d, U0, U0, D0, U1, U1, D1
+.endm
+
+.macro dgemm_16x1
+.L_dgemm_16x1: /* 16x1 tile: D0-D3 accumulate A[0..15]*b0 over L k-steps, then are subtracted from C0. Requires L >= 1. */
+    /* Load 16 * 64 from A0
+     * U0 = {a3,  a2,  a1,  a0}
+     * U1 = {a7,  a6,  a5,  a4}
+     * U2 = {a11, a10, a9,  a8}
+     * U3 = {a15, a14, a13, a12}
+     */
+    xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
+    xvld     U2,   A0,    0x40
+    xvld     U3,   A0,    0x60
+
+    xvldrepl.d     U4, B0, 0x00
+    /* line 1: the single column; fmul on the first k-step so D0-D3 need no zeroing */
+    xvfmul.d  D0,  U0, U4
+    xvfmul.d  D1,  U1, U4
+    xvfmul.d  D2,  U2, U4
+    xvfmul.d  D3,  U3, U4
+
+    /* Add stride for A0 and B0: 16 doubles of A and 1 double of B per k-step */
+    PTR_ADDI    A0,  A0, 0x80
+    PTR_ADDI    B0,  B0, 0x08
+    /* Reduce L: one k-step is done; L stays decremented for the rest of the macro */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (original L - 1) >> 3 = number of 8-step groups left */
+    /* if (TL == 0) goto L_dgemm_16x1_N1_L7; a negative TL (L == 0 on entry) is not caught */
+    beq       ZERO,TL, .L_dgemm_16x1_N1_L7
+    /* Preload the next k-step into U8-U11 and U12 (presumably consumed by KERNEL8x16x1 -- confirm) */
+    xvld     U8,   A0,    0x00
+    xvld     U9,   A0,    0x20
+    xvld     U10,  A0,    0x40
+    xvld     U11,  A0,    0x60
+
+    PTR_ADDI    TL,  TL,  -1  /* the last group is run by KERNEL8x16x1_END, not by the loop */
+
+    xvldrepl.d  U12,   B0,  0x00
+    PTR_ADDI     A0,  A0, 0x80
+    PTR_ADDI     B0,  B0, 0x08
+
+    beq    ZERO,    TL,  .L_dgemm_16x1_N1_TL1_END  /* exactly one group: skip the loop */
+.align 5
+.L_dgemm_16x1_N1_TL1: /* KERNEL8x16x1 is defined elsewhere; presumably 8 unrolled k-steps -- confirm */
+    KERNEL8x16x1
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_dgemm_16x1_N1_TL1
+.L_dgemm_16x1_N1_TL1_END: /* KERNEL8x16x1_END is defined elsewhere; presumably the final group -- confirm */
+    KERNEL8x16x1_END
+
+.L_dgemm_16x1_N1_L7:
+    /* if (!(L & 7)) goto L_dgemm_16x1_N1_L0; otherwise TL = leftover k-steps (1..7) */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_dgemm_16x1_N1_L0
+.align 5
+.L_dgemm_16x1_N1_L71: /* remainder loop: one k-step per iteration */
+    /* Load 16 * 64 from A0 */
+    xvld     U0,   A0,    0x00
+    xvld     U1,   A0,    0x20
+    xvld     U2,   A0,    0x40
+    xvld     U3,   A0,    0x60
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U0, U4, D0
+    xvfmadd.d  D1,  U1, U4, D1
+    xvfmadd.d  D2,  U2, U4, D2
+    xvfmadd.d  D3,  U3, U4, D3
+
+    /* Add stride for A0, B0 */
+    PTR_ADDI     A0,  A0, 0x80
+    PTR_ADDI     B0,  B0, 0x08
+
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_dgemm_16x1_N1_L71
+.L_dgemm_16x1_N1_L0:
+    /* Load C0 (16 doubles); GSUB below is defined elsewhere, presumably each U = U - D pairwise -- confirm */
+    xvld      U0,  C0,  0x00
+    xvld      U1,  C0,  0x20
+    xvld      U2,  C0,  0x40
+    xvld      U3,  C0,  0x60
+    GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D2, U3, U3, D3
+.endm
+
+.macro dgemm_1x2
+.L_dgemm_1x2: // See dgemm_kernel_16x4.S. 1x2 tile: lane 0 of D0/D4 accumulates a0*b0 and a0*b1 over L k-steps. Requires L >= 1.
+    /* Load 1 * 64 from A0; xvld actually reads 4 doubles, only lane 0 belongs to this k-step */
+    xvld     U0,   A0,    0x00
+
+    xvldrepl.d     U4, B0, 0x00
+    /* line 1: column 0; fmul on the first k-step so the accumulator needs no zeroing */
+    xvfmul.d  D0,  U0, U4
+
+    xvldrepl.d     U4, B0, 0x08
+    /* line 2: column 1 (U4 is reused for the second B value) */
+    xvfmul.d  D4,  U0, U4
+
+    /* Add stride for A0 and B0 (PTR_* macros are used for consistency with the other tile macros) */
+    PTR_ADDI    A0,  A0, 0x08
+    PTR_ADDI    B0,  B0, 0x10
+    /* Reduce L: one k-step is done; L stays decremented for the rest of the macro */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (original L - 1) >> 3 = number of 8-step groups left */
+    /* if (TL == 0) goto L_N3_M1_L7; a negative TL (L == 0 on entry) is not caught by this test */
+    beq       ZERO,TL, .L_dgemm_1x2_N3_M1_L7
+    /* Preload the next k-step into U8/U12/U13 (presumably consumed by KERNEL8x1x2 -- confirm) */
+    xvld     U8,   A0,    0x00
+
+    PTR_ADDI    TL,  TL,  -1  /* the last group is run by KERNEL8x1x2_END, not by the loop */
+
+    xvldrepl.d  U12,   B0,  0x00
+    xvldrepl.d  U13,   B0,  0x08
+    PTR_ADDI     A0,  A0, 0x08
+    PTR_ADDI     B0,  B0, 0x10
+    beq    ZERO,    TL,  .L_dgemm_1x2_N3_M1_TL1_END  /* exactly one group: skip the loop */
+.L_dgemm_1x2_N3_M1_TL1: /* KERNEL8x1x2 is defined elsewhere; presumably 8 unrolled k-steps -- confirm */
+    KERNEL8x1x2
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_dgemm_1x2_N3_M1_TL1
+.L_dgemm_1x2_N3_M1_TL1_END: /* KERNEL8x1x2_END is defined elsewhere; presumably the final group -- confirm */
+    KERNEL8x1x2_END
+.L_dgemm_1x2_N3_M1_L7:
+    /* if (!(L & 7)) goto L_dgemm_1x2_N3_M1_L0; otherwise TL = leftover k-steps (1..7) */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_dgemm_1x2_N3_M1_L0
+.L_dgemm_1x2_N3_M1_L71: /* remainder loop: one k-step per iteration */
+    xvld     U0,   A0,    0x00
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U0, U4, D0
+
+    xvldrepl.d U5,  B0, 0x08
+    xvfmadd.d  D4,  U0, U5, D4
+
+    /* Add stride for A0, B0 */
+    PTR_ADDI     A0,  A0, 0x08
+    PTR_ADDI     B0,  B0, 0x10
+
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_dgemm_1x2_N3_M1_L71
+.L_dgemm_1x2_N3_M1_L0:
+    xvld      U0,  C0,  0x00  /* lane 0 = C0[0] */
+    xvld      U1,  C1,  0x00  /* lane 0 = C1[0] */
+    xvinsve0.d  U0,     U1,     0x01  /* U0 = {C0[0], C1[0], ...} */
+    xvinsve0.d  D0,     D4,     0x01  /* D0 = {a*b0 sum, a*b1 sum, ...}, same layout as U0 */
+    GSUB xvf, d, U0, U0, D0
+.endm
+
+.macro dgemm_1x1
+.L_dgemm_1x1: /* 1x1 tile: lane 0 of D0 accumulates a0*b0 over L k-steps, then is subtracted from C0. Requires L >= 1. */
+    /* Load 1 * 64 from A0; xvld actually reads 4 doubles, only lane 0 belongs to this k-step */
+    xvld     U0,   A0,    0x00
+
+    xvldrepl.d     U4, B0, 0x00
+    /* line 1: the single column; fmul on the first k-step so D0 needs no zeroing */
+    xvfmul.d  D0,  U0, U4
+
+    /* Add stride for A0 and B0 (PTR_* macros are used for consistency with the other tile macros) */
+    PTR_ADDI    A0,  A0, 0x08
+    PTR_ADDI    B0,  B0, 0x08
+    /* Reduce L: one k-step is done; L stays decremented for the rest of the macro */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (original L - 1) >> 3 = number of 8-step groups left */
+    /* if (TL == 0) goto L_N1_M1_L7; a negative TL (L == 0 on entry) is not caught by this test */
+    beq       ZERO,TL, .L_N1_M1_L7
+    /* Preload the next k-step into U8/U12 (presumably consumed by KERNEL8x1x1 -- confirm) */
+    xvld     U8,   A0,    0x00
+
+    PTR_ADDI    TL,  TL,  -1  /* the last group is run by KERNEL8x1x1_END, not by the loop */
+
+    xvldrepl.d  U12,   B0,  0x00
+    PTR_ADDI     A0,  A0, 0x08
+    PTR_ADDI     B0,  B0, 0x08
+
+    beq    ZERO,    TL,  .L_N1_M1_TL1_END  /* exactly one group: skip the loop */
+.L_N1_M1_TL1: /* KERNEL8x1x1 is defined elsewhere; presumably 8 unrolled k-steps -- confirm */
+    KERNEL8x1x1
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N1_M1_TL1
+.L_N1_M1_TL1_END: /* KERNEL8x1x1_END is defined elsewhere; presumably the final group -- confirm */
+    KERNEL8x1x1_END
+.L_N1_M1_L7:
+    /* if (!(L & 7)) goto L_N1_M1_L0; otherwise TL = leftover k-steps (1..7) */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N1_M1_L0
+    /* NOTE(review): these .L_N1_M1_* labels lack the .L_dgemm_1x1_ prefix used by sibling macros; kept in case of outside references */
+.L_N1_M1_L71:
+    xvld     U0,   A0,    0x00
+
+    xvldrepl.d U4,  B0, 0x00
+    xvfmadd.d  D0,  U0, U4, D0
+
+    /* Add stride for A0, B0 */
+    PTR_ADDI     A0,  A0, 0x08
+    PTR_ADDI     B0,  B0, 0x08
+
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_N1_M1_L71
+.L_N1_M1_L0:
+    /* Load C0 (4 doubles read; only lane 0 is this tile); GSUB is defined elsewhere, presumably U0 = U0 - D0 -- confirm */
+    xvld      U0,  C0,  0x00
+    GSUB xvf, d, U0, U0, D0
+.endm
diff --git a/kernel/loongarch64/iamax_lasx.S b/kernel/loongarch64/iamax_lasx.S
new file mode 100644
index 000000000..090da3004
--- /dev/null
+++ b/kernel/loongarch64/iamax_lasx.S
@@ -0,0 +1,542 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4 /* element count; the routine branches to .L999 when N <= 0 */
+#define X $r5 /* pointer to the current position in the input vector */
+#define INCX $r6 /* stride; shifted by BASE_SHIFT on entry, and .L999 is taken when INCX <= 0 */
+#define I $r12 /* loop counter, initialised to N >> 3 (blocks of 8 elements) */
+#define t1 $r13 /* t1-t4: scalar temporaries for element-by-element loads in the strided path */
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17 /* scalar scratch used to build the index and increment vectors; zeroed at entry */
+#define i1 $r14
+#define TEMP $r19 /* 1 << BASE_SHIFT for the unit-stride test, then a scratch pointer in .L20 */
+#define x1 $xr9 /* x1-x4: lanes 0-3 of VM0 extracted in .L15 for the cross-lane reduction */
+#define x2 $xr10
+#define x3 $xr11
+#define x4 $xr12
+#define VX0 $xr13 /* VX0/VX1: data vectors loaded from X */
+#define VX1 $xr14
+#define VM0 $xr15 /* running per-lane result of xvfmaxa (largest magnitude so far) */
+#define VM1 $xr16 /* xvfmaxa result for the current iteration */
+#define VINC4 $xr17 /* index increment of 4 (DOUBLE build only); reused as scratch in .L15 */
+#define VINC8 $xr18 /* index increment of 8; reused as scratch in .L15 */
+#define VI0 $xr20 /* indices that accompany the values held in VM0 */
+#define VI1 $xr21 /* indices of the lanes currently in VX0 */
+#define VI2 $xr22 /* DOUBLE: VI1 + VINC4 (indices for VX1), then the selected candidate indices */
+#define VI3 $xr8 /* VI3/VI4: lanes 2 and 3 of VI0 extracted in .L15 */
+#define VI4 $xr19
+#define VT0 $xr23 /* comparison mask consumed by xvbitsel.v */
+
+    PROLOGUE
+    li.d  i0, 0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT
+    bne INCX, TEMP, .L20
+    xvld VM0, X, 0
+#ifdef DOUBLE
+    addi.d i0, i0, 1
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    slli.d i0, i0, 2 //4
+    xvreplgr2vr.d VINC4, i0
+    slli.d i0, i0, 1 //8
+    xvreplgr2vr.d VINC8, i0
+    addi.d i0, i0, -15
+    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 3
+    addi.d i0, i0, 5
+    xvinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 1 //2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 2 //3
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 3 //4
+#else
+    addi.w i0, i0, 1
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    slli.w i0, i0, 3 //8
+    xvreplgr2vr.w VINC8, i0
+    addi.w i0, i0, -15
+    xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 3 //4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 4 //5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 5 //6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 6 //7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 7 //8
+#endif
+    .align 3
+
+.L10:
+#ifdef DOUBLE
+    xvld VX0, X, 0 * SIZE
+    xvadd.d VI1, VI1, VINC8
+    xvld VX1, X, 4 * SIZE
+    xvadd.d VI2, VI1, VINC4
+    xvfmaxa.d VM1, VX0, VX1
+    xvfcmp.ceq.d VT0, VX0, VM1
+    addi.d I, I, -1
+    xvbitsel.v VI2, VI2, VI1, VT0
+    xvfmaxa.d VM1, VM0, VM1
+    xvfcmp.ceq.d VT0, VM0, VM1
+    addi.d X, X, 8 * SIZE
+    xvbitsel.v VM0, VM1, VM0, VT0
+    xvbitsel.v VI0, VI2, VI0, VT0
+#else
+    xvld VX0, X, 0 * SIZE
+    addi.d I, I, -1
+    xvadd.w VI1, VI1, VINC8
+    xvfmaxa.s VM1, VX0, VM0
+    xvfcmp.ceq.s VT0, VM0, VM1
+    addi.d X, X, 8 * SIZE
+    xvbitsel.v VM0, VM1, VM0, VT0
+    xvbitsel.v VI0, VI1, VI0, VT0
+#endif
+    blt $r0, I, .L10
+    .align 3
+
+.L15:
+#ifdef DOUBLE
+    xvpickve.d VI1, VI0, 0
+    xvpickve.d VI2, VI0, 1
+    xvpickve.d VI3, VI0, 2
+    xvpickve.d VI4, VI0, 3
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+#else
+    xvxor.v VX0, VX0, VX0
+    xvor.v VX0, VI0, VX0
+    xvxor.v VX1, VX1, VX1
+    xvor.v VX1, VM0, VX1
+    xvpickve.w VI1, VI0, 0
+    xvpickve.w VI2, VI0, 1
+    xvpickve.w VI3, VI0, 2
+    xvpickve.w VI4, VI0, 3
+    xvpickve.w x1, VM0, 0
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+#endif
+    XVFMAXA VM1, x1, x2
+    XVCMPEQ VT0, x1, VM1
+    xvbitsel.v VINC4, VI2, VI1, VT0
+    XVFMAXA VM0, x3, x4
+    XVCMPEQ VT0, x3, VM0
+    xvbitsel.v VINC8, VI4, VI3, VT0
+    XVFMAXA VM0, VM0, VM1
+    XVCMPEQ VT0, VM0, VM1
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+    CMPEQ $fcc0, $f15, $f9
+    bceqz $fcc0, .L26
+    XVCMPLT VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    b .L26
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X
+#ifdef DOUBLE
+    addi.d i0, i0, 1
+    ld.d t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.d VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.d t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.d t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.d t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.d VM0, t2, 1
+    xvinsgr2vr.d VM0, t3, 2
+    xvinsgr2vr.d VM0, t4, 3
+    slli.d i0, i0, 2 //4
+    xvreplgr2vr.d VINC4, i0
+    slli.d i0, i0, 1 //8
+    xvreplgr2vr.d VINC8, i0
+    addi.d i0, i0, -15
+    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 3
+    addi.d i0, i0, 5
+    xvinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 1 //2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 2 //3
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 3 //4
+#else
+    addi.w i0, i0, 1
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.w VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.w VM0, t2, 1
+    xvinsgr2vr.w VM0, t3, 2
+    xvinsgr2vr.w VM0, t4, 3
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.w VM0, t1, 4
+    xvinsgr2vr.w VM0, t2, 5
+    xvinsgr2vr.w VM0, t3, 6
+    xvinsgr2vr.w VM0, t4, 7
+    slli.w i0, i0, 3 //8
+    xvreplgr2vr.w VINC8, i0
+    addi.w i0, i0, -15
+    xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 3 //4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 4 //5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 5 //6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 6 //7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 7 //8
+#endif
+    .align 3
+
+.L24:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    xvadd.d VI1, VI1, VINC8
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvadd.d VI2, VI1, VINC4
+    xvfmaxa.d VM1, VX0, VX1
+    xvfcmp.ceq.d VT0, VX0, VM1
+    addi.d I, I, -1
+    xvbitsel.v VI2, VI2, VI1, VT0
+    xvfmaxa.d VM1, VM0, VM1
+    xvfcmp.ceq.d VT0, VM0, VM1
+    xvbitsel.v VM0, VM1, VM0, VT0
+    xvbitsel.v VI0, VI2, VI0, VT0
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    xvadd.w VI1, VI1, VINC8
+    xvfmaxa.s VM1, VX0, VM0
+    xvfcmp.ceq.s VT0, VM1, VM0
+    addi.d I, I, -1
+    xvbitsel.v VM0, VM1, VM0, VT0
+    xvbitsel.v VI0, VI1, VI0, VT0
+#endif
+    blt $r0, I, .L24
+    .align 3
+
+.L25:
+#ifdef DOUBLE
+    xvpickve.d VI1, VI0, 0
+    xvpickve.d VI2, VI0, 1
+    xvpickve.d VI3, VI0, 2
+    xvpickve.d VI4, VI0, 3
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    xvfmaxa.d VM1, x1, x2
+    xvfcmp.ceq.d VT0, x1, VM1
+    xvbitsel.v VINC4, VI2, VI1, VT0
+    xvfmaxa.d VM0, x4, x3
+    xvfcmp.ceq.d VT0, x3, VM0
+    xvbitsel.v VINC8, VI4, VI3, VT0
+    xvfmaxa.d VM0, VM0, VM1
+    xvfcmp.ceq.d VT0, VM0, VM1
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+#else
+    xvxor.v VX0, VX0, VX0
+    xvor.v VX0, VI0, VX0
+    xvxor.v VX1, VX1, VX1
+    xvor.v VX1, VM0, VX1
+    xvpickve.w VI1, VI0, 0
+    xvpickve.w VI2, VI0, 1
+    xvpickve.w VI3, VI0, 2
+    xvpickve.w VI4, VI0, 3
+    xvpickve.w x1, VM0, 0
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+    xvfmaxa.s VM1, x1, x2
+    xvfcmp.ceq.s VT0, x1, VM1
+    xvbitsel.v VINC4, VI2, VI1, VT0
+    xvfmaxa.s VM0, x3, x4
+    xvfcmp.ceq.s VT0, x3, VM0
+    xvbitsel.v VINC8, VI3, VI4, VT0
+    xvfmaxa.s VM0, VM0, VM1
+    xvfcmp.ceq.s VT0, VM0, VM1
+    xvbitsel.v VM0, VM0, VM1, VT0
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+#endif
+    CMPEQ $fcc0, $f15, $f9
+    bceqz $fcc0, .L26
+    XVCMPLT VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    .align 3
+
+.L26:
+    fcmp.ceq.d $fcc0, $f15, $f10
+    bceqz $fcc0, .L27
+    XVCMPLT  VT0, VI2, VI0
+    xvbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L27:
+    fcmp.ceq.d $fcc0, $f15, $f11
+    bceqz $fcc0, .L28
+    XVCMPLT  VT0, VI3, VI0
+    xvbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28:
+    fcmp.ceq.d $fcc0, $f15, $f12
+    bceqz $fcc0, .L29
+    XVCMPLT  VT0, VI4, VI0
+    xvbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+#ifdef DOUBLE
+    movfr2gr.d i0, $f20
+#else
+    fmov.s $f16, $f20
+#endif
+    .align 3
+
+#ifdef DOUBLE
+
+#else
+.L252:
+    xvxor.v VI0, VI0, VI0
+    xvor.v VI0, VI0, VX0
+    fmov.s $f13, $f15
+    xvxor.v VM0, VM0, VM0
+    xvor.v VM0, VM0, VX1
+    xvpickve.w VI1, VI0, 4
+    xvpickve.w VI2, VI0, 5
+    xvpickve.w VI3, VI0, 6
+    xvpickve.w VI4, VI0, 7
+    xvpickve.w x1, VM0, 4
+    xvpickve.w x2, VM0, 5
+    xvpickve.w x3, VM0, 6
+    xvpickve.w x4, VM0, 7
+    xvfmaxa.s VM1, x1, x2
+    xvfcmp.ceq.s VT0, x1, VM1
+    xvbitsel.v VINC4, VI2, VI1, VT0
+    xvfmaxa.s VM0, x3, x4
+    xvfcmp.ceq.s VT0, x3, VM0
+    xvbitsel.v VINC8, VI4, VI3, VT0
+    xvfmaxa.s VM0, VM0, VM1
+    xvfcmp.ceq.s VT0, VM0, VM1
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+    fcmp.ceq.d $fcc0, $f15, $f9
+    bceqz $fcc0, .L262
+    xvfcmp.clt.s VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    .align 3
+
+.L262:
+    fcmp.ceq.d $fcc0, $f15, $f10
+    bceqz $fcc0, .L272
+    xvfcmp.clt.s VT0, VI2, VI0
+    xvbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+.L272:
+    fcmp.ceq.d $fcc0, $f15, $f11
+    bceqz $fcc0, .L282
+    xvfcmp.clt.s VT0, VI3, VI0
+    xvbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L282:
+    fcmp.ceq.d $fcc0, $f15, $f12
+    bceqz $fcc0, .L292
+    xvfcmp.clt.s VT0, VI4, VI0
+    xvbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L292:
+    xvfmaxa.s VM0, VX0, VM0
+    xvfcmp.ceq.s VT0, VM0, VX0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    movfr2gr.s i0, $f20
+#endif
+
+.L21: //N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+
+.L22:
+    LD  $f9, X, 0
+    addi.d I, I, -1
+    XVFMAXA VM1, x1, VM0
+    XVCMPEQ VT0, VM0, VM1
+    add.d  X, X, INCX
+    xvbitsel.v VM0, VM1, VM0, VT0
+    xvbitsel.v VI0, VI1, VI0, VT0
+    addi.d i1, i1, 1
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22
+    MTG     i0,   $f20
+    .align 3
+
+.L999:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/iamax_lsx.S b/kernel/loongarch64/iamax_lsx.S
new file mode 100644
index 000000000..ce5b3c724
--- /dev/null
+++ b/kernel/loongarch64/iamax_lsx.S
@@ -0,0 +1,482 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r13
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17
+#define i1 $r14
+#define TEMP $r19
+#define x1 $vr9
+#define x2 $vr10
+#define x3 $vr11
+#define x4 $vr12
+#define VX0 $vr13
+#define VX1 $vr14
+#define VM0 $vr15
+#define VM1 $vr16
+#define VINC2 $vr17
+#define VINC4 $vr18
+#define VI0 $vr20
+#define VI1 $vr21
+#define VI2 $vr22
+#define VI3 $vr8
+#define VI4 $vr19
+#define VT0 $vr23
+
+    PROLOGUE
+    li.d  i0, 0 // i0 carries the result; it is still 0 on the early exits below
+    bge $r0, N, .L999 // N <= 0: nothing to scan
+    bge $r0, INCX, .L999 // INCX <= 0: exit with i0 = 0
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT // TEMP = 1 << BASE_SHIFT (BASE_SHIFT is defined outside this file; presumably log2 of the element size)
+    slli.d INCX, INCX, BASE_SHIFT // scale INCX by the same shift so it can be added to X directly
+    bne INCX, TEMP, .L20 // stride differs from one element: strided path
+    vld VM0, X, 0 // VM0 = first vector of X, the initial per-lane candidates
+#ifdef DOUBLE
+    addi.d i0, i0, 1 // i0 = 1
+    srai.d I, N, 3 // I = N >> 3, number of 8-element passes
+    bge $r0, I, .L11 // no full pass: scalar loop only
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d VINC2, i0 // VINC2 = {2, 2}
+    slli.d i0, i0, 1 //4
+    vreplgr2vr.d VINC4, i0 // VINC4 = {4, 4}
+    addi.d i0, i0, -7 // i0 = -3
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI1, i0, 1 // VI1 = {-3, -2}; .L10 adds VINC4 before the first use
+    addi.d i0, i0, 3
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+#else
+    addi.w i0, i0, 1 // i0 = 1
+    srai.d I, N, 3 // I = N >> 3, number of 8-element passes
+    bge $r0, I, .L21 // no full pass: scalar loop only
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w VINC2, i0 // note: in this branch VINC2 holds 4, not 2
+    slli.w i0, i0, 1 //8
+    vreplgr2vr.w VINC4, i0 // and VINC4 holds 8
+    addi.w i0, i0, -15 // i0 = -7
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3 // VI1 = {-7, -6, -5, -4}; .L10 adds VINC4 before the first use
+    addi.w i0, i0, 5
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+#endif
+    .align 3
+
+.L10: // unit-stride main loop: 8 elements per pass, I passes
+#ifdef DOUBLE
+    vld VX0, X, 0 * SIZE
+    vadd.d VI1, VI1, VINC4 // VI1 = indices of the two elements in VX0
+    vld VX1, X, 2 * SIZE
+    vadd.d VI2, VI1, VINC2 // VI2 = indices of the two elements in VX1
+    vfmaxa.d x1, VX0, VX1 // per-lane operand of larger magnitude
+    vfcmp.ceq.d VT0, VX0, x1
+    vbitsel.v x2, VI2, VI1, VT0 // x2 = VI1 where VX0 was kept, VI2 elsewhere
+    vld VX0, X, 4 * SIZE
+    vadd.d VI1, VI2, VINC2
+    vld VX1, X, 6 * SIZE
+    vadd.d VI2, VI1, VINC2
+    vfmaxa.d x3, VX0, VX1
+    vfcmp.ceq.d VT0, VX0, x3
+    vbitsel.v x4, VI2, VI1, VT0 // x4 = index paired with x3
+    vfmaxa.d x3, x1, x3 // combine the two halves of this pass
+    vfcmp.ceq.d VT0, x1, x3
+    vbitsel.v x2, x4, x2, VT0 // keep x2 where x1 was kept, else x4
+    vfmaxa.d VM1, VM0, x3 // compare against the running result
+    vfcmp.ceq.d VT0, VM0, VM1 // mask: lanes where the running value is kept
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, x2, VI0, VT0 // running index changes only where the value changed
+    addi.d I, I, -1
+    addi.d X, X, 8 * SIZE
+#else
+    vld VX0, X, 0 * SIZE
+    vadd.w VI1, VI1, VINC4 // VI1 = indices of the four elements in VX0 (VINC4 holds 8 here)
+    vld VX1, X, 4 * SIZE
+    vadd.w VI2, VI1, VINC2 // VI2 = indices of the four elements in VX1 (VINC2 holds 4 here)
+    vfmaxa.s VM1, VX0, VX1 // per-lane operand of larger magnitude
+    vfcmp.ceq.s VT0, VX0, VM1
+    addi.d I, I, -1
+    vbitsel.v VI2, VI2, VI1, VT0 // VI2 = index paired with VM1
+    vfmaxa.s VM1, VM0, VM1 // compare against the running result
+    vfcmp.ceq.s VT0, VM0, VM1 // mask: lanes where the running value is kept
+    addi.d X, X, 8 * SIZE
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, VI2, VI0, VT0
+#endif
+    blt $r0, I, .L10
+    .align 3
+
+.L15: // reduce the per-lane results of the unit-stride loop
+#ifdef DOUBLE
+    vreplvei.d VI1, VI0, 0 // broadcast lane 0 / lane 1 of the index vector
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d x1, VM0, 0 // broadcast lane 0 / lane 1 of the value vector
+    vreplvei.d x2, VM0, 1
+    fcmp.ceq.d $fcc0, $f10, $f9 // $f10/$f9 are the low lanes of x2/x1
+    bceqz $fcc0, .L16 // values differ: pick by value at .L16
+    vfcmp.clt.d VT0, VI1, VI2 // equal values: compare the indices (integer bit patterns compared as doubles)
+    vbitsel.v VI0, VI2, VI1, VT0 // VI0 = VI1 where VI1 < VI2, else VI2
+    b .L17
+#else
+    vreplvei.w VI1, VI0, 0 // broadcast each index lane
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w x1, VM0, 0 // broadcast each value lane
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfmaxa.s VM1, x1, x2
+    vfcmp.ceq.s VT0, VM1, x1
+    vbitsel.v VINC2, VI2, VI1, VT0 // VINC2 reused as scratch: index paired with VM1
+    vfmaxa.s VM0, x3, x4
+    vfcmp.ceq.s VT0, x3, VM0
+    vbitsel.v VINC4, VI4, VI3, VT0 // VINC4 reused as scratch: index paired with VM0
+    vfmaxa.s VM0, VM0, VM1
+    vfcmp.ceq.s VT0, VM0, VM1
+    vbitsel.v VI0, VINC4, VINC2, VT0 // index paired with the final VM0
+    fcmp.ceq.d $fcc0, $f15, $f9 // NOTE(review): a .d compare on single-precision data - confirm this is intended
+    bceqz $fcc0, .L26
+    vfcmp.clt.s VT0, VI1, VI0 // x1 equals the result: keep the smaller of VI1 and VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+    b .L26 // .L26 is defined in the non-DOUBLE strided section below
+#endif
+    .align 3
+
+#ifdef DOUBLE
+.L16: // DOUBLE, lanes differ: take the index of the lane kept by vfmaxa
+    vfmaxa.d VM0, x1, x2
+    vfcmp.ceq.d VT0, x1, VM0
+    vbitsel.v VI0, VI2, VI1, VT0 // VI0 = VI1 where x1 was kept, else VI2
+    .align 3
+
+.L17:
+    movfr2gr.d i0, $f20 // $f20 is the low lane of VI0 ($vr20): move the index to i0
+    .align 3
+
+.L11: // INCX==1 (DOUBLE): scalar loop over the N % 8 trailing elements; reached directly when N < 8
+    andi I, N, 7 // I = N % 8
+    bge $r0, I, .L14 // no remainder: return i0
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1 // $f21 is the low lane of VI1: index of the element about to be loaded
+    movgr2fr.d $f20, i0 // $f20 is the low lane of VI0: index found so far
+    .align 3
+
+.L13:
+    fld.d $f9, X, 0 // $f9 is the low lane of x1
+    vfmaxa.d VM1, x1, VM0
+    vfcmp.ceq.d VT0, VM0, VM1 // mask: running value is kept
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, VI1, VI0, VT0 // take the current index only when the value changed
+    addi.d I, I, -1
+    addi.d i1, i1, 1
+    addi.d  X, X, SIZE
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L13
+    movfr2gr.d i0, $f20 // result index back to the integer register
+    .align 3
+
+.L14:
+    move $r4, $r17 // copy i0 ($r17) into $r4 (presumably the return-value register)
+    jirl $r0, $r1, 0x0 // jump to the address in $r1 without linking
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X // TEMP walks ahead to seed VM0; X itself is not advanced here
+    addi.d i0, i0, 1 // i0 = 1
+    ld.d t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d VM0, t1, 0 // VM0 lane 0 = first element
+    srai.d I, N, 3 // I = N >> 3, number of 8-element passes
+    bge $r0, I, .L21 // no full pass: scalar loop only
+    ld.d t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d VM0, t2, 1 // VM0 lane 1 = second element
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d VINC2, i0
+    slli.d i0, i0, 1 //4
+    vreplgr2vr.d VINC4, i0
+    addi.d i0, i0, -7 // i0 = -3
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI1, i0, 1 // VI1 = {-3, -2}; .L24 adds VINC4 before the first use
+    addi.d i0, i0, 3
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+    .align 3
+
+.L24: // strided main loop (DOUBLE): 8 elements per pass, gathered two at a time
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t2, 1
+    vadd.d VI1, VI1, VINC4 // VI1 = indices of the pair in VX0
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t2, 1
+    vadd.d VI2, VI1, VINC2 // VI2 = indices of the pair in VX1
+    vfmaxa.d x1, VX0, VX1 // per-lane operand of larger magnitude
+    vfcmp.ceq.d VT0, VX0, x1
+    vbitsel.v x2, VI2, VI1, VT0 // x2 = index paired with x1
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t2, 1
+    vadd.d VI1, VI2, VINC2
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t2, 1
+    vadd.d VI2, VI1, VINC2
+    vfmaxa.d x3, VX0, VX1
+    vfcmp.ceq.d VT0, VX0, x3
+    vbitsel.v x4, VI2, VI1, VT0 // x4 = index paired with x3
+    vfmaxa.d x3, x1, x3 // combine the two halves of this pass
+    vfcmp.ceq.d VT0, x1, x3
+    vbitsel.v x2, x4, x2, VT0
+    vfmaxa.d VM1, VM0, x3 // compare against the running result
+    vfcmp.ceq.d VT0, VM0, VM1 // build the keep-mask BEFORE VM0 is overwritten (same order as .L10)
+    vbitsel.v VM0, VM1, VM0, VT0 // merging VM0 with the stale x1/x3 mask would corrupt the running value
+    vbitsel.v VI0, x2, VI0, VT0 // running index changes only where the value changed
+    addi.d I, I, -1
+    blt $r0, I, .L24
+    .align 3
+
+.L25: // DOUBLE, strided: reduce the two lanes of VM0/VI0 (same steps as .L15/.L16/.L17)
+    vreplvei.d VI1, VI0, 0
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    fcmp.ceq.d $fcc0, $f10, $f9 // $f10/$f9 are the low lanes of x2/x1
+    bceqz $fcc0, .L26 // values differ: pick by value
+    vfcmp.clt.d VT0, VI1, VI2 // equal values: compare the indices (integer bit patterns compared as doubles)
+    vbitsel.v VI0, VI2, VI1, VT0 // VI0 = VI1 where VI1 < VI2, else VI2
+    b .L27
+    .align 3
+
+.L26:
+    vfmaxa.d VM0, x1, x2
+    vfcmp.ceq.d VT0, x1, VM0
+    vbitsel.v VI0, VI2, VI1, VT0 // VI0 = VI1 where x1 was kept, else VI2
+    .align 3
+
+.L27:
+    movfr2gr.d i0, $f20 // $f20 is the low lane of VI0: move the index to i0
+    .align 3
+
+#else
+.L20: // INCX!=1
+    move TEMP, X // TEMP walks ahead to seed VM0; X itself is not advanced here
+    addi.w i0, i0, 1 // i0 = 1
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t1, 0 // VM0 lane 0 = first element
+    srai.d I, N, 3 // I = N >> 3, number of 8-element passes
+    bge $r0, I, .L21 // no full pass: scalar loop only
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t2, 1 // VM0 lanes 1..3 = elements 2..4
+    vinsgr2vr.w VM0, t3, 2
+    vinsgr2vr.w VM0, t4, 3
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w VINC2, i0 // note: in this branch VINC2 holds 4, not 2
+    slli.w i0, i0, 1 //8
+    vreplgr2vr.w VINC4, i0 // and VINC4 holds 8
+    addi.w i0, i0, -15 // i0 = -7
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3 // VI1 = {-7, -6, -5, -4}; .L24 adds VINC4 before the first use
+    addi.w i0, i0, 5
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+    .align 3
+
+.L24: // strided main loop (single precision): 8 elements per pass, gathered four at a time
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    vadd.w VI1, VI1, VINC4 // VI1 = indices of the four elements in VX0 (VINC4 holds 8 here)
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vadd.w VI2, VI1, VINC2 // VI2 = indices of the four elements in VX1 (VINC2 holds 4 here)
+    vfmaxa.s VM1, VX0, VX1 // per-lane operand of larger magnitude
+    vfcmp.ceq.s VT0, VX0, VM1
+    vbitsel.v VI2, VI2, VI1, VT0 // VI2 = index paired with VM1
+    vfmaxa.s VM1, VM0, VM1 // compare against the running result
+    vfcmp.ceq.s VT0, VM0, VM1 // mask: lanes where the running value is kept
+    addi.d I, I, -1
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, VI2, VI0, VT0 // running index changes only where the value changed
+    blt $r0, I, .L24
+    .align 3
+
+.L25: // single precision: reduce the four lanes of VM0/VI0
+    vreplvei.w VI1, VI0, 0 // broadcast each index lane
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w x1, VM0, 0 // broadcast each value lane
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfmaxa.s VM1, x1, x2
+    vfcmp.ceq.s VT0, VM1, x1
+    vbitsel.v VINC2, VI2, VI1, VT0 // VINC2 reused as scratch: index paired with VM1
+    vfmaxa.s VM0, x3, x4
+    vfcmp.ceq.s VT0, x3, VM0
+    vbitsel.v VINC4, VI4, VI3, VT0 // VINC4 reused as scratch: index paired with VM0
+    vfmaxa.s VM0, VM0, VM1
+    vfcmp.ceq.s VT0, VM0, VM1
+    vbitsel.v VI0, VINC4, VINC2, VT0 // index paired with the final VM0
+    fcmp.ceq.d $fcc0, $f15, $f9 // NOTE(review): a .d compare on single-precision data - confirm this is intended
+    bceqz $fcc0, .L26
+    vfcmp.clt.s VT0, VI1, VI0 // x1 equals the result: keep the smaller of VI1 and VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+    .align 3
+
+.L26: // same tie check for lane 1 (x2 is $f10)
+    fcmp.ceq.d $fcc0, $f15, $f10
+    bceqz $fcc0, .L27
+    vfcmp.clt.s VT0, VI2, VI0
+    vbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L27: // same tie check for lane 2 (x3 is $f11)
+    fcmp.ceq.d $fcc0, $f15, $f11
+    bceqz $fcc0, .L28
+    vfcmp.clt.s VT0, VI3, VI0
+    vbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28: // same tie check for lane 3 (x4 is $f12)
+    fcmp.ceq.d $fcc0, $f15, $f12
+    bceqz $fcc0, .L29
+    vfcmp.clt.s VT0, VI4, VI0
+    vbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+    movfr2gr.s i0, $f20 // $f20 is the low lane of VI0: move the index to i0
+    .align 3
+
+#endif
+.L21: // scalar loop over the N % 8 trailing elements; reached directly when N < 8
+    andi I, N, 7 // I = N % 8
+    bge $r0, I, .L999 // no remainder: return i0
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1 // $f21 is the low lane of VI1: index of the element about to be loaded
+    movgr2fr.d $f20, i0 // $f20 is the low lane of VI0: index found so far
+    .align 3
+
+.L22:
+    LD $f9, X, 0 // LD/VFMAXA/VCMPEQ/MTG are macros defined outside this file (presumably precision-dependent)
+    VFMAXA VM1, x1, VM0 // $f9 is the low lane of x1
+    VCMPEQ VT0, VM0, VM1 // mask: running value is kept
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, VI1, VI0, VT0 // take the current index only when the value changed
+    addi.d I, I, -1
+    addi.d i1, i1, 1
+    add.d  X, X, INCX
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22
+    MTG   i0,  $f20 // presumably moves the index in $f20 back to i0 - confirm against the macro definition
+    .align 3
+
+.L999:
+    move $r4, $r17 // copy i0 ($r17) into $r4 (presumably the return-value register)
+    jirl $r0, $r1, 0x0 // jump to the address in $r1 without linking
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/iamin_lasx.S b/kernel/loongarch64/iamin_lasx.S
new file mode 100644
index 000000000..6ea117907
--- /dev/null
+++ b/kernel/loongarch64/iamin_lasx.S
@@ -0,0 +1,486 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r13
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17
+#define i1 $r14
+#define TEMP $r19
+#define x1 $xr9
+#define x2 $xr10
+#define x3 $xr11
+#define x4 $xr12
+#define VX0 $xr13
+#define VX1 $xr14
+#define VM0 $xr15
+#define VM1 $xr16
+#define VINC4 $xr17
+#define VINC8 $xr18
+#define VI0 $xr20
+#define VI1 $xr21
+#define VI2 $xr22
+#define VI3 $xr8
+#define VI4 $xr19
+#define VT0 $xr23
+
+    PROLOGUE
+    li.d  i0, 0 // i0 carries the result; it is still 0 on the early exits below
+    bge $r0, N, .L999 // N <= 0: nothing to scan
+    bge $r0, INCX, .L999 // INCX <= 0: exit with i0 = 0
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT // TEMP = 1 << BASE_SHIFT (BASE_SHIFT is defined outside this file; presumably log2 of the element size)
+    slli.d INCX, INCX, BASE_SHIFT // scale INCX by the same shift so it can be added to X directly
+    bne INCX, TEMP, .L20 // stride differs from one element: strided path
+    xvld VM0, X, 0 // VM0 = first vector of X, the initial per-lane candidates
+#ifdef DOUBLE
+    addi.d i0, i0, 1 // i0 = 1
+    srai.d I, N, 3 // I = N >> 3, number of 8-element passes
+    bge $r0, I, .L21 // no full pass: scalar loop only
+    slli.d i0, i0, 2 //4
+    xvreplgr2vr.d VINC4, i0
+    slli.d i0, i0, 1 //8
+    xvreplgr2vr.d VINC8, i0
+    addi.d i0, i0, -15 // i0 = -7
+    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 3 // VI1 = {-7, -6, -5, -4}; .L10 adds VINC8 before the first use
+    addi.d i0, i0, 5
+    xvinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 1 //2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 2 //3
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 3 //4
+#else
+    addi.w i0, i0, 1 // i0 = 1
+    srai.d I, N, 3 // I = N >> 3, number of 8-element passes
+    bge $r0, I, .L21 // no full pass: scalar loop only
+    slli.w i0, i0, 3 //8
+    xvreplgr2vr.w VINC8, i0
+    addi.w i0, i0, -15 // i0 = -7
+    xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 7 // VI1 = {-7 .. 0}
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 3 //4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 4 //5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 5 //6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 6 //7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 7 //8
+#endif
+    .align 3
+
+.L10: // unit-stride main loop: 8 elements per pass, I passes
+    xvld VX0, X, 0 * SIZE
+    addi.d I, I, -1 // loop counter, common to both precisions
+#ifdef DOUBLE
+    xvadd.d VI1, VI1, VINC8 // VI1 = indices of the four elements in VX0
+    xvld VX1, X, 4 * SIZE
+    xvadd.d VI2, VI1, VINC4 // VI2 = indices of the four elements in VX1
+    xvfmina.d VM1, VX0, VX1 // per-lane operand of smaller magnitude
+    xvfcmp.ceq.d VT0, VX0, VM1
+    xvbitsel.v VI2, VI2, VI1, VT0 // VI2 = index paired with VM1
+    xvfmina.d VM1, VM0, VM1 // compare against the running result
+#else
+    xvadd.w VI1, VI1, VINC8 // advance the persistent index vector each pass (was left stale)
+    xvor.v VI2, VI1, VI1 // VI2 = indices of the eight elements in VX0, consumed by the shared tail
+    xvfmina.s VM1, VX0, VM0 // per-lane operand of smaller magnitude vs the running result
+#endif
+    XVCMPEQ VT0, VM0, VM1 // mask: lanes where the running value is kept
+    addi.d X, X, 8 * SIZE
+    xvbitsel.v VM0, VM1, VM0, VT0
+    xvbitsel.v VI0, VI2, VI0, VT0 // running index changes only where the value changed
+    blt $r0, I, .L10
+    .align 3
+
+.L15: // reduce the per-lane results of the unit-stride loop (lanes 0..3)
+#ifdef DOUBLE
+    xvpickve.d VI1, VI0, 0 // extract each index lane
+    xvpickve.d VI2, VI0, 1
+    xvpickve.d VI3, VI0, 2
+    xvpickve.d VI4, VI0, 3
+    xvpickve.d x1, VM0, 0 // extract each value lane
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    xvfmina.d VM1, x1, x2
+    xvfcmp.ceq.d VT0, x1, VM1
+#else
+    xvxor.v VX0, VX0, VX0
+    xvor.v  VX0, VI0, VX0 // VX0 = copy of VI0, read back at .L252 for lanes 4..7
+    xvxor.v VX1, VX1, VX1
+    xvor.v  VX1, VM0, VX1 // VX1 = copy of VM0, read back at .L252 for lanes 4..7
+    xvpickve.w VI1, VI0, 0 // extract index lanes 0..3
+    xvpickve.w VI2, VI0, 1
+    xvpickve.w VI3, VI0, 2
+    xvpickve.w VI4, VI0, 3
+    xvpickve.w x1, VM0, 0 // extract value lanes 0..3
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+    xvfmina.s VM1, x1, x2
+    xvfcmp.ceq.s VT0, x1, VM1
+#endif
+    xvbitsel.v VINC4, VI2, VI1, VT0 // VINC4 reused as scratch: index paired with VM1
+    XVFMINA VM0, x4, x3
+    XVCMPEQ VT0, x3, VM0
+    xvbitsel.v VINC8, VI4, VI3, VT0 // VINC8 reused as scratch: index paired with VM0
+    XVFMINA VM0, VM0, VM1
+    XVCMPEQ VT0, VM0, VM1
+    xvbitsel.v VI0, VINC8, VINC4, VT0 // index paired with the final VM0
+    fcmp.ceq.d $fcc0, $f15, $f9 // $f15/$f9 are the low lanes of VM0/x1; NOTE(review): a .d compare is also used for single precision - confirm
+    bceqz $fcc0, .L26
+    XVCMPLT VT0, VI1, VI0 // x1 equals the result: keep the smaller of VI1 and VI0 (XVCMPLT is a macro defined outside this file)
+    xvbitsel.v VI0, VI0, VI1, VT0
+    b .L26
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X // TEMP walks ahead to seed VM0; X itself is not advanced here
+#ifdef DOUBLE
+    addi.d  i0,   i0, 1 // i0 = 1
+    ld.d    t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.d VM0, t1, 0 // VM0 lane 0 = first element
+    srai.d I, N, 3 // I = N >> 3, number of 8-element passes
+    bge $r0, I, .L21 // no full pass: scalar loop only
+    ld.d t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.d t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.d t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.d VM0, t2, 1 // VM0 lanes 1..3 = elements 2..4
+    xvinsgr2vr.d VM0, t3, 2
+    xvinsgr2vr.d VM0, t4, 3
+    slli.d i0, i0, 2 //4
+    xvreplgr2vr.d VINC4, i0
+    slli.d i0, i0, 1 //8
+    xvreplgr2vr.d VINC8, i0
+    addi.d i0, i0, -15 // i0 = -7
+    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 3 // VI1 = {-7, -6, -5, -4}; .L24 adds VINC8 before the first use
+    addi.d i0, i0, 5
+    xvinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 1 //2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 2 //3
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 3 //4
+#else
+    addi.w  i0, i0, 1 // i0 = 1
+    ld.w    t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.w VM0, t1, 0 // VM0 lane 0 = first element
+    srai.d I, N, 3 // I = N >> 3, number of 8-element passes
+    bge $r0, I, .L21 // no full pass: scalar loop only
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.w VM0, t2, 1 // VM0 lanes 1..3 = elements 2..4
+    xvinsgr2vr.w VM0, t3, 2
+    xvinsgr2vr.w VM0, t4, 3
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.w VM0, t1, 4 // VM0 lanes 4..7 = elements 5..8
+    xvinsgr2vr.w VM0, t2, 5
+    xvinsgr2vr.w VM0, t3, 6
+    xvinsgr2vr.w VM0, t4, 7
+    slli.w i0, i0, 3 //8
+    xvreplgr2vr.w VINC8, i0
+    addi.w i0, i0, -15 // i0 = -7
+    xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 7 // VI1 = {-7 .. 0}
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 3 //4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 4 //5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 5 //6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 6 //7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 7 //8
+#endif
+    .align 3
+
+.L24: // strided main loop: 8 elements per pass, gathered through t1..t4
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    xvadd.d VI1, VI1, VINC8 // VI1 = indices of the four elements in VX0
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvadd.d VI2, VI1, VINC4 // VI2 = indices of the four elements in VX1
+    xvfmina.d VM1, VX0, VX1 // per-lane operand of smaller magnitude
+    xvfcmp.ceq.d VT0, VX0, VM1
+    xvbitsel.v VI2, VI2, VI1, VT0 // VI2 = index paired with VM1
+    xvfmina.d VM1, VM0, VM1 // compare against the running result
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    xvadd.w VI1, VI1, VINC8 // advance the persistent index vector each pass (was left stale)
+    xvor.v VI2, VI1, VI1 // VI2 = indices of the eight elements in VX0, consumed by the shared tail
+    xvfmina.s VM1, VX0, VM0 // per-lane operand of smaller magnitude vs the running result
+#endif
+    XVCMPEQ VT0, VM0, VM1 // mask: running value is kept (same shared compare as .L10)
+    addi.d I, I, -1
+    xvbitsel.v VM0, VM1, VM0, VT0
+    xvbitsel.v VI0, VI2, VI0, VT0 // running index changes only where the value changed
+    blt $r0, I, .L24
+    .align 3
+
+.L26: // tie checks: if lane value x2 ($f10) equals the result in $f15, keep the smaller index
+    fcmp.ceq.d $fcc0, $f15, $f10 // NOTE(review): .L24 falls through to here with no reduction like .L15, so x1..x4 / VI1..VI4 are not derived from VM0/VI0 on the strided path - confirm
+    bceqz $fcc0, .L27
+    XVCMPLT VT0, VI2, VI0 // XVCMPLT is a macro defined outside this file
+    xvbitsel.v VI0, VI0, VI2, VT0 // VI0 = VI2 where VI2 < VI0
+    .align 3
+
+.L27: // same check for x3 ($f11)
+    fcmp.ceq.d $fcc0, $f15, $f11
+    bceqz $fcc0, .L28
+    XVCMPLT VT0, VI3, VI0
+    xvbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28: // same check for x4 ($f12)
+    fcmp.ceq.d $fcc0, $f15, $f12
+    bceqz $fcc0, .L29
+    XVCMPLT VT0, VI4, VI0
+    xvbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+#ifdef DOUBLE
+    movfr2gr.d i0, $f20 // $f20 is the low lane of VI0: move the index to i0
+    .align 3
+#else
+    fmov.s $f16, $f20 // copy the low lane of VI0 into $f16 (low lane of VM1); NOTE(review): VM1 is overwritten by xvfmina.s below before $f16 is read - confirm intent
+    .align 3
+
+.L252: // single precision: repeat the reduction for lanes 4..7 using the copies saved at .L15
+    xvxor.v VI0, VI0, VI0
+    xvor.v VI0, VI0, VX0 // VI0 = VX0 (copy of the index vector made at .L15)
+    fmov.s $f13, $f15 // $f13 is the low lane of VX0: keep the lanes 0..3 result value there
+    xvxor.v VM0, VM0, VM0
+    xvor.v VM0, VM0, VX1 // VM0 = VX1 (copy of the value vector made at .L15)
+    xvpickve.w VI1, VI0, 4 // extract index lanes 4..7
+    xvpickve.w VI2, VI0, 5
+    xvpickve.w VI3, VI0, 6
+    xvpickve.w VI4, VI0, 7
+    xvpickve.w x1, VM0, 4 // extract value lanes 4..7
+    xvpickve.w x2, VM0, 5
+    xvpickve.w x3, VM0, 6
+    xvpickve.w x4, VM0, 7
+    xvfmina.s VM1, x1, x2
+    xvfcmp.ceq.s VT0, x1, VM1
+    xvbitsel.v VINC4, VI2, VI1, VT0 // VINC4 reused as scratch: index paired with VM1
+    xvfmina.s VM0, x3, x4
+    xvfcmp.ceq.s VT0, x3, VM0
+    xvbitsel.v VINC8, VI4, VI3, VT0 // VINC8 reused as scratch: index paired with VM0
+    xvfmina.s VM0, VM0, VM1
+    xvfcmp.ceq.s VT0, VM0, VM1
+    xvbitsel.v VI0, VINC8, VINC4, VT0 // index paired with the lanes 4..7 result
+    fcmp.ceq.d $fcc0, $f15, $f9 // tie checks as in .L26-.L28, now for lanes 4..7
+    bceqz $fcc0, .L262
+    xvfcmp.clt.s VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    .align 3
+
+.L262:
+    fcmp.ceq.d $fcc0, $f15, $f10
+    bceqz $fcc0, .L272
+    xvfcmp.clt.s VT0, VI2, VI0
+    xvbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L272:
+    fcmp.ceq.d $fcc0, $f15, $f11
+    bceqz $fcc0, .L282
+    xvfcmp.clt.s VT0, VI3, VI0
+    xvbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L282:
+    fcmp.ceq.d $fcc0, $f15, $f12
+    bceqz $fcc0, .L292
+    xvfcmp.clt.s VT0, VI4, VI0
+    xvbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L292: // merge the lanes 0..3 result (low lane of VX0) with the lanes 4..7 result (VM0)
+    xvfmina.s VM0, VX0, VM0
+    xvfcmp.ceq.s VT0, VM0, VX0
+    xvbitsel.v VI0, VI0, VI1, VT0 // NOTE(review): selects VI1 (lane 4 index), not the lanes 0..3 index saved at .L29 - confirm
+    movfr2gr.s i0, $f20 // $f20 is the low lane of VI0: move the index to i0
+
+#endif
+
+.L21: // scalar loop over the N % 8 trailing elements; reached directly when N < 8
+    andi    I, N, 7 // I = N % 8
+    bge   $r0, I, .L999 // no remainder: return i0
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1 // $f21 is the low lane of VI1: index of the element about to be loaded
+    movgr2fr.d $f20, i0 // $f20 is the low lane of VI0: index found so far
+    .align 3
+
+.L22:
+    LD $f9, X, 0 // LD/XVFMINA/XVCMPEQ/MTG are macros defined outside this file (presumably precision-dependent)
+    addi.d I, I, -1
+    XVFMINA    VM1,  x1, VM0 // $f9 is the low lane of x1
+    XVCMPEQ    VT0, VM0, VM1 // mask: running value is kept
+    add.d        X,   X, INCX
+    xvbitsel.v VM0, VM1, VM0, VT0
+    xvbitsel.v VI0, VI1, VI0, VT0 // take the current index only when the value changed
+    addi.d i1, i1, 1
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22
+    MTG i0, $f20 // presumably moves the index in $f20 back to i0 - confirm against the macro definition
+    .align 3
+
+.L999:
+    move $r4, $r17 // copy i0 ($r17) into $r4 (presumably the return-value register)
+    jirl $r0, $r1, 0x0 // jump to the address in $r1 without linking
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/iamin_lsx.S b/kernel/loongarch64/iamin_lsx.S
new file mode 100644
index 000000000..ce885fd88
--- /dev/null
+++ b/kernel/loongarch64/iamin_lsx.S
@@ -0,0 +1,446 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r13
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17
+#define i1 $r14
+#define TEMP $r19
+#define x1 $vr9
+#define x2 $vr10
+#define x3 $vr11
+#define x4 $vr12
+#define VX0 $vr13
+#define VX1 $vr14
+#define VM0 $vr15
+#define VM1 $vr16
+#ifdef DOUBLE
+#define VINC2 $vr17
+#define VINC4 $vr18
+#else
+#define VINC4 $vr17
+#define VINC8 $vr18
+#endif
+#define VI0 $vr20
+#define VI1 $vr21
+#define VI2 $vr22
+#define VI3 $vr8
+#define VI4 $vr19
+#define VT0 $vr23
+
+    PROLOGUE // iamin (LSX): the result index is built in i0 and returned through .L999
+    li.d  i0, 0 // i0 = 0 is what gets returned for the early exits below
+    bge $r0, N, .L999 // N <= 0: nothing to search
+    bge $r0, INCX, .L999 // INCX <= 0: return 0 as well
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT // TEMP = 1 << BASE_SHIFT (presumably one element in bytes; BASE_SHIFT is defined outside this file)
+    slli.d INCX, INCX, BASE_SHIFT // INCX is scaled the same way from here on
+    bne INCX, TEMP, .L20 // stride differs from one element: strided path
+    vld VM0, X, 0 // seed the running candidates VM0 with the first 16 bytes at X
+#ifdef DOUBLE
+    addi.d  i0, i0, 1 // i0 = 1
+    srai.d   I, N, 3 // I = N / 8 vector passes
+    bge $r0, I, .L21 // no full pass: scalar loop only
+
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d VINC2, i0 // VINC2 = {2, 2}
+    slli.d i0, i0, 1 //4
+    vreplgr2vr.d VINC4, i0 // VINC4 = {4, 4}
+    addi.d i0, i0, -7 // 4 - 7 = -3
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1 // -2
+    vinsgr2vr.d VI1, i0, 1 // VI1 = {-3, -2}; .L10 adds VINC4 before first use, giving {1, 2}
+    addi.d i0, i0, 3 // -2 + 3 = 1
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+#else
+    addi.w i0, i0, 1 // i0 = 1
+    srai.d  I, N, 3 // I = N / 8 vector passes
+    bge   $r0, I, .L21 // no full pass: scalar loop only
+
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w VINC4, i0 // VINC4 = {4, 4, 4, 4}
+    slli.w i0, i0, 1 //8
+    vreplgr2vr.w VINC8, i0 // VINC8 = {8, 8, 8, 8}
+    addi.w i0, i0, -15 // 8 - 15 = -7
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3 // VI1 = {-7, -6, -5, -4}; .L10 adds VINC8 before first use, giving {1, 2, 3, 4}
+    addi.w i0, i0, 5 // -4 + 5 = 1
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+#endif
+    .align 3
+
+.L10: // unit-stride main loop: 8 elements per pass; VM0 = per-lane value of smallest magnitude so far, VI0 = its 1-based index
+    vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    vadd.d VI1, VI1, VINC4 // VI1 = indices of the two elements in VX0
+    vld VX1, X, 2 * SIZE
+    vadd.d VI2, VI1, VINC2 // VI2 = indices of the two elements in VX1
+    vfmina.d x1, VX0, VX1 // x1 = per-lane operand with the smaller absolute value
+    vfcmp.ceq.d VT0, VX0, x1 // mask set where VX0 was the one kept
+    vbitsel.v x2, VI2, VI1, VT0 // x2 = VI1 where the mask is set, VI2 elsewhere
+    vld VX0, X, 4 * SIZE
+    vadd.d VI1, VI2, VINC2 // indices for the third vector
+    vld VX1, X, 6 * SIZE
+    vadd.d VI2, VI1, VINC2 // indices for the fourth vector
+    vfmina.d x3, VX0, VX1
+    vfcmp.ceq.d VT0, VX0, x3
+    vbitsel.v x4, VI2, VI1, VT0 // x4 = index matching x3
+    vfmina.d x3, x1, x3 // x3 = smallest magnitude over all four vectors
+    vfcmp.ceq.d VT0, x1, x3
+    addi.d I, I, -1
+    vbitsel.v x2, x4, x2, VT0 // x2 = index matching x3 (first half preferred when equal)
+    vfmina.d VM1, VM0, x3 // VM1 = candidate for the new running value
+#else
+    vadd.w VI1, VI1, VINC8 // VI1 = indices of the four elements in VX0
+    vld VX1, X, 4 * SIZE
+    vadd.w VI2, VI1, VINC4 // VI2 = indices of the four elements in VX1
+    vfmina.s VM1, VX0, VX1 // per-lane operand with the smaller absolute value
+    vfcmp.ceq.s VT0, VX0, VM1 // mask set where VX0 was the one kept
+    addi.d I, I, -1
+    vbitsel.v x2, VI2, VI1, VT0 // x2 = VI1 where the mask is set, VI2 elsewhere
+    vfmina.s VM1, VM0, VM1 // VM1 = candidate for the new running value
+#endif
+    VCMPEQ VT0, VM0, VM1 // mask set where the running value is unchanged (VCMPEQ is a macro from outside this file, presumably the precision-matched vfcmp.ceq)
+    addi.d X, X, 8 * SIZE // advance past the 8 elements just read
+    vbitsel.v VM0, VM1, VM0, VT0 // keep VM0 where unchanged, else take VM1
+    vbitsel.v VI0, x2, VI0, VT0 // keep the old index where unchanged, else take the new one
+    blt $r0, I, .L10
+    .align 3
+
+.L15: // unit-stride path: start reducing the per-lane results in VM0/VI0 to a single index
+#ifdef DOUBLE
+    vreplvei.d VI1, VI0, 0 // VI1 = lane-0 index in every lane
+    vreplvei.d VI2, VI0, 1 // VI2 = lane-1 index in every lane
+    vreplvei.d  x1, VM0, 0 // x1 = lane-0 value
+    vreplvei.d  x2, VM0, 1 // x2 = lane-1 value
+    fcmp.ceq.d $fcc0, $f10, $f9 // $f10/$f9 are the low elements of x2/x1
+    bceqz $fcc0, .L26 // values differ: .L26 picks by magnitude
+    vfcmp.clt.d VT0, VI1, VI2 // equal values: compare the two indices (as raw lane contents)
+    vbitsel.v   VI0, VI2, VI1, VT0 // VI0 = the smaller index
+    b .L27
+#else
+    vreplvei.w VI1, VI0, 0 // VI1..VI4 = index of lane 0..3, replicated
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w  x1, VM0, 0 // x1..x4 = value of lane 0..3, replicated
+    vreplvei.w  x2, VM0, 1
+    vreplvei.w  x3, VM0, 2
+    vreplvei.w  x4, VM0, 3
+    vfmina.s   VM1, x1, x2 // VM1 = smaller magnitude of lanes 0/1
+    vfcmp.ceq.s  VT0, VM1, x1
+    vbitsel.v  VINC4, VI2, VI1, VT0 // VINC4 reused as scratch: index matching VM1
+    vfmina.s     VM0,  x3, x4 // VM0 = smaller magnitude of lanes 2/3
+    vfcmp.ceq.s  VT0,  x3, VM0
+    vbitsel.v  VINC8, VI4, VI3, VT0 // VINC8 reused as scratch: index matching that
+    vfmina.s     VM0, VM0, VM1 // VM0 = overall result value
+    vfcmp.ceq.s  VT0, VM0, VM1
+    vbitsel.v    VI0, VINC8, VINC4, VT0 // VI0 = provisional index
+    fcmp.ceq.d $fcc0, $f15, $f9 // does lane 0 hold the result value? ($f15/$f9 = low parts of VM0/x1)
+    bceqz $fcc0, .L26
+    vfcmp.clt.s VT0, VI1, VI0
+    vbitsel.v   VI0, VI0, VI1, VT0 // prefer the lane-0 index when it is smaller
+    b .L26 // remaining lanes are checked from .L26 on
+#endif
+    .align 3
+
+.L20: // INCX!=1: seed VM0 from the leading elements (read through TEMP, X itself is not advanced) and set up the index vectors
+    move TEMP, X
+#ifdef DOUBLE
+    addi.d    i0,   i0, 1 // i0 = 1
+    ld.d      t1, TEMP, 0 * SIZE
+    add.d   TEMP, TEMP, INCX
+    vinsgr2vr.d    VM0, t1, 0 // VM0 lane 0 = first element
+    srai.d     I,    N, 3 // I = N / 8 vector passes
+    bge $r0,   I, .L21 // no full pass: scalar loop only
+    ld.d      t2, TEMP, 0 * SIZE
+    add.d   TEMP, TEMP, INCX
+
+    vinsgr2vr.d VM0, t2, 1 // VM0 lane 1 = second element
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d VINC2, i0 // VINC2 = {2, 2}
+    slli.d i0, i0, 1 //4
+    vreplgr2vr.d VINC4, i0 // VINC4 = {4, 4}
+    addi.d i0, i0, -7 // 4 - 7 = -3
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI1, i0, 1 // VI1 = {-3, -2}; .L24 adds VINC4 before first use, giving {1, 2}
+    addi.d i0, i0, 3 // -2 + 3 = 1
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+#else
+    addi.w    i0,   i0, 1 // i0 = 1
+    ld.w      t1, TEMP, 0 * SIZE
+    add.d   TEMP, TEMP, INCX
+    vinsgr2vr.w    VM0, t1, 0 // VM0 lane 0 = first element
+    srai.d     I,    N, 3 // I = N / 8 vector passes
+    bge $r0,   I, .L21 // no full pass: scalar loop only
+    ld.w      t2, TEMP, 0 * SIZE
+    add.d   TEMP, TEMP, INCX
+
+    vreplvei.d VI1, VI0, 0 // NOTE(review): appears to have no effect, all four lanes of VI1 are rewritten below
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t2, 1 // VM0 lanes 1..3 = second..fourth element
+    vinsgr2vr.w VM0, t3, 2
+    vinsgr2vr.w VM0, t4, 3
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w VINC4, i0 // VINC4 = {4, 4, 4, 4}
+    slli.w i0, i0, 1 //8
+    vreplgr2vr.w VINC8, i0 // VINC8 = {8, 8, 8, 8}
+    addi.w i0, i0, -15 // 8 - 15 = -7
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3 // VI1 = {-7, -6, -5, -4}; .L24 adds VINC8 before first use, giving {1, 2, 3, 4}
+    addi.w i0, i0, 5 // -4 + 5 = 1
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+#endif
+    .align 3
+
+.L24: // strided main loop: gathers 8 elements per pass; VM0 = per-lane value of smallest magnitude so far, VI0 = its 1-based index
+#ifdef DOUBLE
+    ld.d  t1, X, 0 * SIZE
+    add.d X,  X, INCX
+    ld.d  t2, X, 0 * SIZE
+    add.d X,  X, INCX
+
+    vinsgr2vr.d VX0, t1, 0 // VX0 = elements 1, 2 of this pass
+    vinsgr2vr.d VX0, t2, 1
+    vadd.d VI1, VI1, VINC4 // VI1 = indices of the elements in VX0
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0 // VX1 = elements 3, 4
+    vinsgr2vr.d VX1, t2, 1
+    vadd.d VI2, VI1, VINC2 // VI2 = indices of the elements in VX1
+    vfmina.d x1, VX0, VX1 // per-lane operand with the smaller absolute value
+    vfcmp.ceq.d VT0, VX0, x1
+    vbitsel.v x2, VI2, VI1, VT0 // x2 = index matching x1
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0 // VX0 = elements 5, 6
+    vinsgr2vr.d VX0, t2, 1
+    vadd.d VI1, VI2, VINC2
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0 // VX1 = elements 7, 8
+    vinsgr2vr.d VX1, t2, 1
+    vadd.d VI2, VI1, VINC2
+    vfmina.d x3, VX0, VX1
+    vfcmp.ceq.d VT0, VX0, x3
+    vbitsel.v x4, VI2, VI1, VT0 // x4 = index matching x3
+    vfmina.d x3, x1, x3 // x3 = smallest magnitude of the whole pass
+    vfcmp.ceq.d VT0, x1, x3
+    addi.d I, I, -1
+    vbitsel.v x2, x4, x2, VT0 // x2 = index matching x3
+    vfmina.d VM1, VM0, x3 // VM1 = candidate for the new running value
+    vfcmp.ceq.d VT0, VM0, VM1 // mask must be computed BEFORE VM0 is updated (same order as .L10)
+    vbitsel.v VM0, VM1, VM0, VT0 // keep VM0 where unchanged, else take VM1
+    vbitsel.v VI0, x2, VI0, VT0 // same mask selects the matching index
+#else
+    ld.w  t1, X, 0 * SIZE
+    add.d X,  X, INCX
+    ld.w  t2, X, 0 * SIZE
+    add.d X,  X, INCX
+
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0 // VX0 = elements 1..4 of this pass
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    vadd.w VI1, VI1, VINC8 // VI1 = indices of the elements in VX0
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0 // VX1 = elements 5..8
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vadd.w VI2, VI1, VINC4 // VI2 = indices of the elements in VX1
+    vfmina.s VM1, VX0, VX1 // per-lane operand with the smaller absolute value
+    vfcmp.ceq.s VT0, VX0, VM1
+    vbitsel.v VI2, VI2, VI1, VT0 // VI2 = index matching VM1
+    vfmina.s VM1, VM0, VM1 // VM1 = candidate for the new running value
+    vfcmp.ceq.s VT0, VM0, VM1 // mask set where the running value is unchanged
+    addi.d I, I, -1
+    vbitsel.v VM0, VM1, VM0, VT0
+    vbitsel.v VI0, VI2, VI0, VT0
+#endif
+    blt $r0, I, .L24
+    .align 3
+
+.L25: // strided path: reduce the per-lane results in VM0/VI0; continues into .L26 and following
+#ifdef DOUBLE
+    vreplvei.d VI1, VI0, 0 // VI1 / VI2 = index of lane 0 / lane 1, replicated
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d  x1, VM0, 0 // x1 / x2 = value of lane 0 / lane 1, replicated
+    vreplvei.d  x2, VM0, 1
+    fcmp.ceq.d $fcc0, $f10, $f9 // $f10/$f9 are the low elements of x2/x1
+    bceqz $fcc0, .L26 // values differ: .L26 picks by magnitude
+    vfcmp.clt.d VT0, VI1, VI2 // equal values: compare the two indices (as raw lane contents)
+    vbitsel.v VI0, VI2, VI1, VT0 // VI0 = the smaller index
+    b .L27
+#else
+    vreplvei.w VI1, VI0, 0 // VI1..VI4 = index of lane 0..3, replicated
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w x1, VM0, 0 // x1..x4 = value of lane 0..3, replicated
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfmina.s VM1, x1, x2 // VM1 = smaller magnitude of lanes 0/1
+    vfcmp.ceq.s VT0, VM1, x1
+    vbitsel.v VINC4, VI2, VI1, VT0 // VINC4 reused as scratch: index matching VM1
+    vfmina.s VM0, x3, x4 // VM0 = smaller magnitude of lanes 2/3
+    vfcmp.ceq.s VT0, x3, VM0
+    vbitsel.v VINC8, VI4, VI3, VT0 // VINC8 reused as scratch: index matching that
+    vfmina.s VM0, VM0, VM1 // VM0 = overall result value
+    vfcmp.ceq.s VT0, VM0, VM1
+    vbitsel.v VI0, VINC8, VINC4, VT0 // VI0 = provisional index
+    fcmp.ceq.d $fcc0, $f15, $f9 // does lane 0 hold the result value? ($f15/$f9 = low parts of VM0/x1)
+    bceqz $fcc0, .L26
+    vfcmp.clt.s VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0 // prefer the lane-0 index when it is smaller
+#endif
+    .align 3
+
+.L26: // DOUBLE: choose between the two lanes by magnitude; single: check lane 1
+#ifdef DOUBLE
+    vfmina.d VM0, x1, x2 // VM0 = value with the smaller magnitude
+    vfcmp.ceq.d VT0, x1, VM0
+    vbitsel.v VI0, VI2, VI1, VT0 // VI0 = VI1 if x1 was kept, else VI2
+    .align 3
+
+.L27:
+    movfr2gr.d i0, $f20 // i0 = low element of VI0 ($f20 aliases VI0)
+
+#else
+    fcmp.ceq.d $fcc0, $f15, $f10 // does lane 1 hold the result value?
+    bceqz $fcc0, .L27
+    vfcmp.clt.s VT0, VI2, VI0
+    vbitsel.v VI0, VI0, VI2, VT0 // prefer its index when smaller
+    .align 3
+
+.L27: // same check for lane 2
+    fcmp.ceq.d $fcc0, $f15, $f11
+    bceqz $fcc0, .L28
+    vfcmp.clt.s VT0, VI3, VI0
+    vbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28: // same check for lane 3
+    fcmp.ceq.d $fcc0, $f15, $f12
+    bceqz $fcc0, .L29
+    vfcmp.clt.s VT0, VI4, VI0
+    vbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+    movfr2gr.s i0, $f20 // i0 = low element of VI0 ($f20 aliases VI0)
+#endif
+    .align 3
+
+.L21: //N<8, or the N % 8 elements left after the vector loop
+    andi I, N, 7 // I = number of elements still to examine
+    bge $r0, I, .L999 // none left: i0 already holds the answer
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1 // low element of VI1 = index of the element about to be read
+    movgr2fr.d $f20, i0 // low element of VI0 = best index so far
+    .align 3
+
+.L22: // scalar loop; only the low element of each vector register is meaningful here
+    LD $f9, X, 0 // low element of x1 = current element (LD/VFMINA/VCMPEQ/MTC/MTG are macros defined outside this file, presumably precision-selected)
+    addi.d I, I, -1
+    VFMINA VM1, x1, VM0 // smaller magnitude of current element and running value
+    VCMPEQ VT0, VM0, VM1 // mask set when the running value is unchanged
+    add.d  X, X, INCX
+    vbitsel.v VM0, VM1, VM0, VT0 // keep VM0 when unchanged, else take the new value
+    vbitsel.v VI0, VI1, VI0, VT0 // same for the index
+    addi.d i1, i1, 1
+    MTC $f21, i1 // next index into the low element of VI1
+    blt $r0, I, .L22
+    MTG i0, $f20 // read the index back at the build's element width; a fixed 32-bit move would truncate the 64-bit lane used when DOUBLE is defined
+    .align 3
+
+.L999:
+    move $r4, $r17 // return value = i0 ($r17)
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/icamax_lasx.S b/kernel/loongarch64/icamax_lasx.S
new file mode 100644
index 000000000..7800cb917
--- /dev/null
+++ b/kernel/loongarch64/icamax_lasx.S
@@ -0,0 +1,562 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r13
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17
+#define i1 $r14
+#define TEMP $r19
+#define a0 $f12
+#define a1 $f13
+#define s1 $f15
+#define x1 $xr9
+#define x2 $xr10
+#define x3 $xr11
+#define x4 $xr12
+#define VX0 $xr13
+#define VX1 $xr14
+#define VM0 $xr15
+#define VM1 $xr16
+#define VINC4 $xr17
+#define VINC8 $xr18
+#define VI0 $xr20
+#define VI1 $xr21
+#define VI2 $xr22
+#define VI3 $xr8
+#define VI4 $xr19
+#define VT0 $xr23
+
+    PROLOGUE // icamax (LASX): the result index is built in i0 and returned through .L999
+    li.d  i0, 0 // i0 = 0 is what gets returned for the early exits below
+    bge $r0, N, .L999 // N <= 0: nothing to search
+    bge $r0, INCX, .L999 // INCX <= 0: return 0 as well
+    li.d TEMP, 1
+    xvxor.v VM0, VM0, VM0 // running maxima start at zero
+    slli.d TEMP, TEMP, ZBASE_SHIFT // TEMP = 1 << ZBASE_SHIFT (presumably one complex element in bytes; defined outside this file)
+    slli.d INCX, INCX, ZBASE_SHIFT // INCX is scaled the same way from here on
+    xvxor.v VI3, VI3, VI3 // 0
+#ifdef DOUBLE
+    li.d I, -1
+    xvreplgr2vr.d VI4, I
+    xvffint.d.l VI4, VI4 // -1
+    bne INCX, TEMP, .L20 // stride differs from one element: strided path
+    addi.d i0, i0, 1 // i0 = 1
+    srai.d I, N, 2 // I = N / 4 vector passes
+    bge $r0, I, .L21 // no full pass: scalar loop only
+    slli.d i0, i0, 2 //4
+    xvreplgr2vr.d VINC4, i0 // index step per pass
+    addi.d i0, i0, -7 // 4 - 7 = -3
+    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 2 // -1
+    xvinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, -1 // -2
+    xvinsgr2vr.d VI1, i0, 2
+    addi.d i0, i0, 2 // 0
+    xvinsgr2vr.d VI1, i0, 3 // VI1 = {-3, -1, -2, 0}; after the first VINC4 add: {1, 3, 2, 4}
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 2
+    xvinsgr2vr.d VI0, i0, 1 //3
+    addi.d i0, i0, -1
+    xvinsgr2vr.d VI0, i0, 2 //2
+    addi.d i0, i0, 2
+    xvinsgr2vr.d VI0, i0, 3 //4
+#else
+    li.w I, -1
+    xvreplgr2vr.w VI4, I
+    xvffint.s.w VI4, VI4 // -1
+    bne INCX, TEMP, .L20 // stride differs from one element: strided path
+    addi.w i0, i0, 1 // i0 = 1
+    srai.d I, N, 3 // I = N / 8 vector passes
+    bge $r0, I, .L21 // no full pass: scalar loop only
+    slli.w i0, i0, 3 //8
+    xvreplgr2vr.w VINC8, i0 // index step per pass
+    addi.w i0, i0, -15 // 8 - 15 = -7
+    xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1 // -6
+    xvinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 3 // -3
+    xvinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1 // -2
+    xvinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, -3 // -5
+    xvinsgr2vr.w VI1, i0, 4
+    addi.w i0, i0, 1 // -4
+    xvinsgr2vr.w VI1, i0, 5
+    addi.w i0, i0, 3 // -1
+    xvinsgr2vr.w VI1, i0, 6
+    addi.w i0, i0, 1 // 0
+    xvinsgr2vr.w VI1, i0, 7 // after the first VINC8 add: {1, 2, 5, 6, 3, 4, 7, 8}
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 3
+    xvinsgr2vr.w VI0, i0, 2 //5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 3 //6
+    addi.w i0, i0, -3
+    xvinsgr2vr.w VI0, i0, 4 //3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 5 //4
+    addi.w i0, i0, 3
+    xvinsgr2vr.w VI0, i0, 6 //7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 7 //8
+#endif
+    .align 3
+
+.L10: // unit-stride main loop: two 32-byte vectors per pass; VM0 = per-lane largest |re|+|im| so far, VI0 = its 1-based index
+    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    xvadd.d VI1, VI1, VINC4 // indices of the 4 complex elements of this pass
+    xvld VX1, X, 4 * SIZE
+    addi.d X, X, 8 * SIZE // 4 complex doubles consumed
+    xvpickev.d x1, VX1, VX0 // x1 = real parts (even positions)
+    xvpickod.d x2, VX1, VX0 // x2 = imaginary parts (odd positions)
+    xvfmul.d x3, VI4, x1 // x3 = -x1 (VI4 holds -1 in every lane)
+    xvfmul.d x4, VI4, x2
+    xvfcmp.clt.d VT0, x1, VI3 // mask set where x1 < 0 (VI3 is all zero)
+    xvfcmp.clt.d VINC8, x2, VI3 // VINC8 is only a scratch mask in this branch
+    xvbitsel.v x1, x1, x3, VT0 // x1 = |re|
+    xvbitsel.v x2, x2, x4, VINC8 // x2 = |im|
+#else
+    xvadd.w VI1, VI1, VINC8 // indices of the 8 complex elements of this pass
+    xvld VX1, X, 8 * SIZE
+    addi.d X, X, 16 * SIZE // 8 complex floats consumed: both vectors, not just the first one
+    xvpickev.w x1, VX1, VX0 // x1 = real parts (even positions)
+    xvpickod.w x2, VX1, VX0 // x2 = imaginary parts (odd positions)
+    xvfmul.s x3, VI4, x1 // x3 = -x1 (VI4 holds -1 in every lane)
+    xvfmul.s x4, VI4, x2
+    xvfcmp.clt.s VT0, x1, VI3 // mask set where x1 < 0 (VI3 is all zero)
+    xvfcmp.clt.s VINC4, x2, VI3 // VINC4 is only a scratch mask in this branch
+    xvbitsel.v x1, x1, x3, VT0 // x1 = |re|
+    xvbitsel.v x2, x2, x4, VINC4 // x2 = |im|
+#endif
+    XVFADD  x1,  x1,  x2 // |re| + |im| (XVFADD/XVFMAX/XVCMPEQ are macros defined outside this file, presumably precision-selected)
+    XVFMAX  x3,  VM0, x1 // candidate for the new running maxima
+    XVCMPEQ VT0, x3,  VM0 // mask set where the running maximum is unchanged
+    addi.d I, I, -1
+    xvbitsel.v VM0, x3, VM0, VT0 // keep VM0 where unchanged, else take x3
+    xvbitsel.v VI0, VI1, VI0, VT0 // same mask selects the matching index
+    blt $r0, I, .L10
+    .align 3
+
+.L15: // unit-stride path: reduce lanes 0..3 of VM0/VI0, then join the shared checks at .L26
+#ifdef DOUBLE
+    xvpickve.d VI1, VI0, 0 // VI1..VI4 = index held in lane 0..3 (overwrites the constants in VI3/VI4)
+    xvpickve.d VI2, VI0, 1
+    xvpickve.d VI3, VI0, 2
+    xvpickve.d VI4, VI0, 3
+    xvpickve.d x1, VM0, 0 // x1..x4 = value held in lane 0..3
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    xvfmax.d VM1, x1, x2 // VM1 = larger of lanes 0/1
+    xvfcmp.ceq.d VT0, VM1, x1
+    xvbitsel.v VINC4, VI2, VI1, VT0 // VINC4 reused as scratch: index matching VM1
+    xvfmax.d VM0, x3, x4 // VM0 = larger of lanes 2/3
+    xvfcmp.ceq.d VT0, x3, VM0
+    xvbitsel.v VINC8, VI4, VI3, VT0 // VINC8 reused as scratch: index matching that
+    xvfmax.d VM0, VM0, VM1 // VM0 = overall maximum
+    xvfcmp.ceq.d VT0, VM0, VM1
+    xvbitsel.v VI0, VINC8, VINC4, VT0 // VI0 = provisional index
+#else
+    xvxor.v VX0, VX0, VX0
+    xvor.v VX0, VI0, VX0 // VX0 = copy of VI0, kept for the lanes 4..7 pass at .L252
+    xvxor.v VX1, VX1, VX1
+    xvor.v VX1, VM0, VX1 // VX1 = copy of VM0, kept for the same reason
+    xvpickve.w VI1, VI0, 0 // VI1..VI4 = index held in lane 0..3 (overwrites the constants in VI3/VI4)
+    xvpickve.w VI2, VI0, 1
+    xvpickve.w VI3, VI0, 2
+    xvpickve.w VI4, VI0, 3
+    xvpickve.w x1, VM0, 0 // x1..x4 = value held in lane 0..3
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+    xvfcmp.clt.s VT0, x1, x2
+    xvbitsel.v VM1, x1, x2, VT0 // VM1 = larger of lanes 0/1
+    xvbitsel.v VINC4, VI1, VI2, VT0 // VINC4 reused as scratch: index matching VM1
+    xvfcmp.clt.s VT0, x3, x4
+    xvbitsel.v VM0, x3, x4, VT0 // VM0 = larger of lanes 2/3
+    xvbitsel.v VINC8, VI3, VI4, VT0 // VINC8 reused as scratch: index matching that
+    xvfcmp.clt.s VT0, VM0, VM1
+    xvbitsel.v VM0, VM0, VM1, VT0 // VM0 = maximum of lanes 0..3
+    xvbitsel.v VI0, VINC8, VINC4, VT0 // VI0 = provisional index
+#endif
+    fcmp.ceq.d $fcc0, $f15, $f9 // does lane 0 hold the maximum? ($f15/$f9 = low parts of VM0/x1)
+    bceqz $fcc0, .L26
+    XVCMPLT  VT0, VI1, VI0 // XVCMPLT is a macro defined outside this file, presumably a precision-matched less-than compare
+    xvbitsel.v VI0, VI0, VI1, VT0 // prefer the lane-0 index when it is smaller
+    b .L26
+    .align 3
+
+.L20: // INCX!=1: set up the index vectors for the strided loop at .L24
+#ifdef DOUBLE
+    addi.d i0, i0, 1 // i0 = 1
+    srai.d I, N, 2 // I = N / 4 vector passes
+    bge $r0, I, .L21 // no full pass: scalar loop only
+    slli.d i0, i0, 2 //4
+    xvreplgr2vr.d VINC4, i0 // index step per pass
+    addi.d i0, i0, -7 // 4 - 7 = -3
+    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 2 // -1
+    xvinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, -1 // -2
+    xvinsgr2vr.d VI1, i0, 2
+    addi.d i0, i0, 2 // 0
+    xvinsgr2vr.d VI1, i0, 3 // after the first VINC4 add the lanes carry indices {1, 3, 2, 4}
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 2
+    xvinsgr2vr.d VI0, i0, 1 //3
+    addi.d i0, i0, -1
+    xvinsgr2vr.d VI0, i0, 2 //2
+    addi.d i0, i0, 2
+    xvinsgr2vr.d VI0, i0, 3 //4
+#else
+    addi.w i0, i0, 1 // i0 = 1
+    srai.d I, N, 3 // I = N / 8 vector passes
+    bge $r0, I, .L21 // no full pass: scalar loop only
+    slli.w i0, i0, 3 //8
+    xvreplgr2vr.w VINC8, i0 // index step per pass
+    addi.w i0, i0, -15 // 8 - 15 = -7
+    xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1 // -6
+    xvinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 3 // -3
+    xvinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1 // -2
+    xvinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, -3 // -5
+    xvinsgr2vr.w VI1, i0, 4
+    addi.w i0, i0, 1 // -4
+    xvinsgr2vr.w VI1, i0, 5
+    addi.w i0, i0, 3 // -1
+    xvinsgr2vr.w VI1, i0, 6
+    addi.w i0, i0, 1 // 0
+    xvinsgr2vr.w VI1, i0, 7 // after the first VINC8 add the lanes carry indices {1, 2, 5, 6, 3, 4, 7, 8}
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 3
+    xvinsgr2vr.w VI0, i0, 2 //5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 3 //6
+    addi.w i0, i0, -3
+    xvinsgr2vr.w VI0, i0, 4 //3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 5 //4
+    addi.w i0, i0, 3
+    xvinsgr2vr.w VI0, i0, 6 //7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 7 //8
+#endif
+    .align 3
+
+.L24: // strided main loop; VM0 = per-lane largest |re|+|im| so far, VI0 = its 1-based index
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 0 // element 1 -> lane 0 (index lanes set up at .L20 are ordered {1, 3, 2, 4})
+    xvinsgr2vr.d x2, t2, 0
+    xvinsgr2vr.d x1, t3, 2 // element 2 -> lane 2, the lane that carries index 2
+    xvinsgr2vr.d x2, t4, 2
+    xvadd.d VI1, VI1, VINC4 // indices of the 4 complex elements of this pass
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 1 // element 3 -> lane 1, the lane that carries index 3
+    xvinsgr2vr.d x2, t2, 1
+    xvinsgr2vr.d x1, t3, 3 // element 4 -> lane 3
+    xvinsgr2vr.d x2, t4, 3
+    addi.d I, I, -1
+    xvfmul.d x3, VI4, x1 // x3 = -x1 (VI4 holds -1 in every lane)
+    xvfmul.d x4, VI4, x2
+    xvfcmp.clt.d VT0, x1, VI3 // mask set where x1 < 0 (VI3 is all zero)
+    xvfcmp.clt.d VINC8, x2, VI3 // VINC8 is only a scratch mask in this branch
+    xvbitsel.v x1, x1, x3, VT0 // x1 = |re|
+    xvbitsel.v x2, x2, x4, VINC8 // x2 = |im|
+    xvfadd.d x1, x1, x2 // |re| + |im|
+    xvfmax.d x3, VM0, x1 // candidate for the new running maxima
+    xvfcmp.ceq.d VT0, x3, VM0 // mask set where the running maximum is unchanged
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0 // build VX0/VX1 interleaved (re, im, re, im, ...) exactly as a contiguous load would,
+    xvinsgr2vr.w VX0, t2, 1 // so the xvpickev/xvpickod below yield the lane order the index vectors expect
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4 // VX0 = elements 1..4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    xvadd.w VI1, VI1, VINC8 // indices of the 8 complex elements of this pass (advanced once per pass)
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX1, t1, 0 // VX1 = elements 5..8
+    xvinsgr2vr.w VX1, t2, 1
+    xvinsgr2vr.w VX1, t3, 2
+    xvinsgr2vr.w VX1, t4, 3
+    // last two elements of the pass go to VX1 lanes 4..7
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX1, t1, 4
+    xvinsgr2vr.w VX1, t2, 5
+    xvinsgr2vr.w VX1, t3, 6
+    xvinsgr2vr.w VX1, t4, 7
+    addi.d I, I, -1
+    xvpickev.w x1, VX1, VX0 // x1 = real parts (even positions)
+    xvpickod.w x2, VX1, VX0 // x2 = imaginary parts (odd positions)
+    xvfmul.s x3, VI4, x1 // x3 = -x1 (VI4 holds -1 in every lane)
+    xvfmul.s x4, VI4, x2
+    xvfcmp.clt.s VT0, x1, VI3 // mask set where x1 < 0 (VI3 is all zero)
+    xvfcmp.clt.s VINC8, x2, VI3 // VINC8 doubles as a scratch mask here, as in the original code
+    xvbitsel.v x1, x1, x3, VT0 // x1 = |re|
+    xvbitsel.v x2, x2, x4, VINC8 // x2 = |im|
+    xvfadd.s x1, x1, x2 // |re| + |im|
+    xvfmax.s x3, VM0, x1 // candidate for the new running maxima
+    xvfcmp.ceq.s VT0, x3, VM0 // mask set where the running maximum is unchanged
+#endif
+    xvbitsel.v VM0, x3, VM0, VT0 // keep VM0 where unchanged, else take x3
+    xvbitsel.v VI0, VI1, VI0, VT0 // same mask selects the matching index
+    blt $r0, I, .L24
+    .align 3
+
+.L25: // strided path: reduce lanes 0..3 of VM0/VI0, then run the per-lane checks .L26 to .L28
+#ifdef DOUBLE
+    xvpickve.d VI1, VI0, 0 // VI1..VI4 = index held in lane 0..3 (overwrites the constants in VI3/VI4)
+    xvpickve.d VI2, VI0, 1
+    xvpickve.d VI3, VI0, 2
+    xvpickve.d VI4, VI0, 3
+    xvpickve.d x1, VM0, 0 // x1..x4 = value held in lane 0..3
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    xvfmaxa.d VM1, x1, x2 // VM1 = larger magnitude of lanes 0/1
+    xvfcmp.ceq.d VT0, VM1, x1
+    xvbitsel.v VINC4, VI2, VI1, VT0 // VINC4 reused as scratch: index matching VM1
+    xvfmaxa.d VM0, x3, x4 // VM0 = larger magnitude of lanes 2/3
+    xvfcmp.ceq.d VT0, x3, VM0
+    xvbitsel.v VINC8, VI4, VI3, VT0 // VINC8 reused as scratch: index matching that
+    xvfmaxa.d VM0, VM0, VM1 // VM0 = overall maximum
+    xvfcmp.ceq.d VT0, VM0, VM1
+    xvbitsel.v VI0, VINC8, VINC4, VT0 // VI0 = provisional index
+#else
+    xvxor.v VX0, VX0, VX0
+    xvor.v VX0, VI0, VX0 // VX0 = copy of VI0, kept for the lanes 4..7 pass at .L252
+    xvxor.v VX1, VX1, VX1
+    xvor.v VX1, VM0, VX1 // VX1 = copy of VM0, kept for the same reason
+    xvpickve.w VI1, VI0, 0 // VI1..VI4 = index held in lane 0..3 (overwrites the constants in VI3/VI4)
+    xvpickve.w VI2, VI0, 1
+    xvpickve.w VI3, VI0, 2
+    xvpickve.w VI4, VI0, 3
+    xvpickve.w x1, VM0, 0 // x1..x4 = value held in lane 0..3
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+    xvfcmp.clt.s VT0, x1, x2
+    xvbitsel.v VM1, x1, x2, VT0 // VM1 = larger of lanes 0/1
+    xvbitsel.v VINC4, VI1, VI2, VT0 // VINC4 reused as scratch: index matching VM1
+    xvfcmp.clt.s VT0, x3, x4
+    xvbitsel.v VM0, x3, x4, VT0 // VM0 = larger of lanes 2/3
+    xvbitsel.v VINC8, VI3, VI4, VT0 // VINC8 reused as scratch: index matching that
+    xvfcmp.clt.s VT0, VM0, VM1
+    xvbitsel.v VM0, VM0, VM1, VT0 // VM0 = maximum of lanes 0..3
+    xvbitsel.v VI0, VINC8, VINC4, VT0 // VI0 = provisional index
+#endif
+    fcmp.ceq.d $fcc0, $f15, $f9 // does lane 0 hold the maximum? ($f15/$f9 = low parts of VM0/x1)
+    bceqz $fcc0, .L26
+    XVCMPLT  VT0, VI1, VI0 // XVCMPLT is a macro defined outside this file, presumably a precision-matched less-than compare
+    xvbitsel.v VI0, VI0, VI1, VT0 // prefer the lane-0 index when it is smaller
+    .align 3
+
+.L26: // same check for lane 1 (also entered from .L15)
+    fcmp.ceq.d $fcc0, $f15, $f10
+    bceqz $fcc0, .L27
+    XVCMPLT  VT0, VI2, VI0
+    xvbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L27: // same check for lane 2
+    fcmp.ceq.d $fcc0, $f15, $f11
+    bceqz $fcc0, .L28
+    XVCMPLT  VT0, VI3, VI0
+    xvbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28: // same check for lane 3
+    fcmp.ceq.d $fcc0, $f15, $f12
+    bceqz $fcc0, .L29
+    XVCMPLT  VT0, VI4, VI0
+    xvbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+#ifdef DOUBLE
+    movfr2gr.d i0, $f20 // all four lanes handled: i0 = low element of VI0
+#else
+    fmov.s $f16, $f20 // park the lanes 0..3 winner index in $f16; lanes 4..7 are handled at .L252
+#endif
+    .align 3
+
+#ifndef DOUBLE
+.L252: // single precision only: reduce lanes 4..7, then merge with the lanes 0..3 result ($f13 = its maximum, $f16 = its index)
+    xvor.v VI0, VX0, VX0 // VI0 = saved per-lane indices (must happen before $f13, the low part of VX0, is overwritten)
+    fmov.s $f13, $f15 // $f13 = maximum of lanes 0..3
+    xvor.v VM0, VX1, VX1 // VM0 = saved per-lane maxima; VX1 is free from here on
+    xvpickve.w VI1, VI0, 4 // VI1..VI4 = index held in lane 4..7
+    xvpickve.w VI2, VI0, 5
+    xvpickve.w VI3, VI0, 6
+    xvpickve.w VI4, VI0, 7
+    xvpickve.w x1, VM0, 4 // x1..x4 = value held in lane 4..7
+    xvpickve.w x2, VM0, 5
+    xvpickve.w x3, VM0, 6
+    xvpickve.w x4, VM0, 7
+    xvfcmp.clt.s VT0, x1, x2
+    xvbitsel.v VX1, x1, x2, VT0 // pair maximum goes to VX1 so x1 stays intact for the lane-4 equality check below
+    xvbitsel.v VINC4, VI1, VI2, VT0 // VINC4 reused as scratch: index matching the pair maximum
+    xvfcmp.clt.s VT0, x3, x4
+    xvbitsel.v VM0, x3, x4, VT0 // VM0 = larger of lanes 6/7
+    xvbitsel.v VINC8, VI3, VI4, VT0 // VINC8 reused as scratch: index matching that
+    xvfcmp.clt.s VT0, VM0, VX1
+    xvbitsel.v VM0, VM0, VX1, VT0 // VM0 = maximum of lanes 4..7
+    xvbitsel.v VI0, VINC8, VINC4, VT0 // VI0 = provisional index
+    fcmp.ceq.d $fcc0, $f15, $f9 // does lane 4 hold the maximum?
+    bceqz $fcc0, .L262
+    xvfcmp.clt.s VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0 // prefer its index when smaller
+    .align 3
+
+.L262: // same check for lane 5
+    fcmp.ceq.d $fcc0, $f15, $f10
+    bceqz $fcc0, .L272
+    xvfcmp.clt.s VT0, VI2, VI0
+    xvbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L272: // same check for lane 6
+    fcmp.ceq.d $fcc0, $f15, $f11
+    bceqz $fcc0, .L282
+    xvfcmp.clt.s VT0, VI3, VI0
+    xvbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L282: // same check for lane 7
+    fcmp.ceq.d $fcc0, $f15, $f12
+    bceqz $fcc0, .L292
+    xvfcmp.clt.s VT0, VI4, VI0
+    xvbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L292: // merge the halves: $f15/$f20 = lanes 4..7 maximum/index, $f13/$f16 = lanes 0..3 maximum/index
+    fcmp.clt.s $fcc0, $f15, $f13 // lanes 0..3 hold the strictly larger maximum
+    fcmp.ceq.s $fcc1, $f15, $f13 // both halves hold the same maximum
+    fcmp.clt.s $fcc2, $f16, $f20 // lanes 0..3 index is the smaller one (compared as raw lane contents, like the vector checks above)
+    fsel $f15, $f15, $f13, $fcc0 // overall maximum
+    fsel $f14, $f20, $f16, $fcc2 // $f14 = smaller of the two candidate indices
+    fsel $f20, $f20, $f16, $fcc0 // strictly larger lower half: take its index
+    fsel $f20, $f20, $f14, $fcc1 // tie: take the smaller index so the first occurrence is reported
+    movfr2gr.s i0, $f20
+#endif
+.L21: //N<8 (N<4 when DOUBLE), or the elements left over after the vector loop
+#ifdef DOUBLE
+    andi I, N, 3 // I = number of elements still to examine
+    bge $r0, I, .L999 // none left: i0 already holds the answer
+    srai.d i1, N, 2
+    slli.d i1, i1, 2 // i1 = number of elements covered by the vector loop
+#else
+    andi I, N, 7 // I = number of elements still to examine
+    bge $r0, I, .L999 // none left: i0 already holds the answer
+    srai.d i1, N, 3
+    slli.d i1, i1, 3 // i1 = number of elements covered by the vector loop
+#endif
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1 // $f21 = index of the element about to be read
+    movgr2fr.d $f20, i0 // $f20 = best index so far
+    .align 3
+
+.L22: // scalar loop; s1 ($f15) holds the running maximum
+    LD a0, X, 0 * SIZE // real part (LD/FABS/ADD/FMAX/CMPEQ/MTG are macros defined outside this file, presumably precision-selected)
+    LD a1, X, 1 * SIZE // imaginary part
+    addi.d I, I, -1
+    FABS a0, a0
+    FABS a1, a1
+    ADD a0, a0, a1 // a0 = |re| + |im|
+    FMAX a1, s1, a0 // a1 = candidate for the new maximum
+    CMPEQ $fcc0, s1, a1 // set when the running maximum is unchanged
+    add.d  X, X, INCX
+    fsel s1, a1, s1, $fcc0 // keep s1 when unchanged, else take a1
+    fsel $f20, $f21, $f20, $fcc0 // keep the old index when unchanged, else take the current one
+    addi.d i1, i1, 1
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22
+    MTG i0, $f20 // i0 = final index
+    .align 3
+
+
+.L999:
+    move $r4, $r17 // return value = i0 ($r17)
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/icamax_lsx.S b/kernel/loongarch64/icamax_lsx.S
new file mode 100644
index 000000000..a2fc9dbbd
--- /dev/null
+++ b/kernel/loongarch64/icamax_lsx.S
@@ -0,0 +1,434 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r13
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17
+#define i1 $r14
+#define TEMP $r19
+#define a0 $f12
+#define a1 $f13
+#define s1 $f15
+#define x1 $vr9
+#define x2 $vr10
+#define x3 $vr11
+#define x4 $vr12
+#define VX0 $vr13
+#define VX1 $vr14
+#define VM0 $vr15
+#define VM1 $vr16
+#define VINC4 $vr17
+#define VINC8 $vr18
+#define VI0 $vr20
+#define VI1 $vr21
+#define VI2 $vr22
+#define VI3 $vr8
+#define VI4 $vr19
+#define VT0 $vr23
+
+    PROLOGUE
+    li.d  i0, 0 // i0 carries the result index; 0 is what .L999 returns on the early exits below
+    bge $r0, N, .L999 // N <= 0: return 0
+    bge $r0, INCX, .L999 // INCX <= 0: return 0
+    li.d TEMP, 1
+    vxor.v VM0, VM0, VM0 // running maximum starts at 0 in every lane
+    slli.d TEMP, TEMP, ZBASE_SHIFT // TEMP = 1 << ZBASE_SHIFT, the scaled value of INCX == 1
+    slli.d INCX, INCX, ZBASE_SHIFT // INCX is scaled the same way and later added to X directly
+    vxor.v VI3, VI3, VI3 // 0
+#ifdef DOUBLE
+    li.d I, -1
+    vreplgr2vr.d VI4, I
+    vffint.d.l VI4, VI4 // -1
+    bne INCX, TEMP, .L20 // non-unit stride: use the gather loop
+    addi.d i0, i0, 1 // i0 = 1, the index reported when only the scalar tail runs
+    srai.d I, N, 2 // I = N / 4 vector iterations
+    bge $r0, I, .L21 // fewer than 4 elements: scalar tail only
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d VINC4, i0 // per-step index increment (2 per half iteration)
+    addi.d i0, i0, -3
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI1, i0, 1 // VI1 = {-1, 0}; the first vadd in the loop makes it {1, 2}
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+#else
+    li.w I, -1
+    vreplgr2vr.w VI4, I
+    vffint.s.w VI4, VI4 // -1
+    bne INCX, TEMP, .L20 // non-unit stride: use the gather loop
+    addi.w i0, i0, 1 // i0 = 1, the index reported when only the scalar tail runs
+    srai.d I, N, 2 // I = N / 4 vector iterations
+    bge $r0, I, .L21 // fewer than 4 elements: scalar tail only
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w VINC4, i0 // per-iteration index increment
+    addi.w i0, i0, -7
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3 // VI1 = {-3, -2, -1, 0}; the first vadd in the loop makes it {1, 2, 3, 4}
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+#endif
+    .align 3
+
+.L10: // unit-stride main loop: 4 elements per iteration, tracks per-lane maximum of |a|+|b| and its index
+    vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    vadd.d VI1, VI1, VINC4 // candidate indices for this half
+    vld VX1, X, 2 * SIZE
+    addi.d I, I, -1
+    vpickev.d x1, VX1, VX0 // even-position values (presumably the real parts)
+    vpickod.d x2, VX1, VX0 // odd-position values (presumably the imaginary parts)
+    vfmul.d x3, VI4, x1 // negated copies, VI4 holds -1
+    vfmul.d x4, VI4, x2
+    vfcmp.clt.d VT0, x1, VI3 // mask of lanes below zero (VI3 is 0)
+    vfcmp.clt.d VINC8, x2, VI3 // VINC8 is only a scratch mask in this file
+    vbitsel.v x1, x1, x3, VT0 // take the negated value where the mask is set: absolute value
+    vbitsel.v x2, x2, x4, VINC8
+    vfadd.d x1, x1, x2 // x1 = sum of the two absolute values
+    vfmax.d x3, VM0, x1
+    vfcmp.ceq.d VT0, x3, VM0 // mask set where the old maximum is kept
+    vbitsel.v VM0, x3, VM0, VT0
+    vbitsel.v VI0, VI1, VI0, VT0 // index follows the value: unchanged where the old maximum stays
+    vld VX0, X, 4 * SIZE // second half of the iteration
+    vadd.d VI1, VI1, VINC4
+    vld VX1, X, 6 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vfmul.d x3, VI4, x1
+    vfmul.d x4, VI4, x2
+#else
+    vadd.w VI1, VI1, VINC4 // candidate indices for this iteration
+    vld VX1, X, 4 * SIZE
+    addi.d I, I, -1
+    vpickev.w x1, VX1, VX0 // even-position values (presumably the real parts)
+    vpickod.w x2, VX1, VX0 // odd-position values (presumably the imaginary parts)
+    vfmul.s x3, VI4, x1 // negated copies, VI4 holds -1
+    vfmul.s x4, VI4, x2
+#endif
+    VCMPLT  VT0, x1, VI3 // mask of lanes below zero (VI3 is 0)
+    VCMPLT  VINC8, x2, VI3 // VINC8 is only a scratch mask in this file
+    vbitsel.v x1, x1, x3, VT0 // absolute value by selecting the negated copy
+    vbitsel.v x2, x2, x4, VINC8
+    VFADD  x1, x1, x2 // x1 = sum of the two absolute values
+    VFMAX  x3, VM0, x1
+    VCMPEQ VT0, x3, VM0 // mask set where the old maximum is kept
+    addi.d X, X, 8 * SIZE
+    vbitsel.v VM0, x3, VM0, VT0
+    vbitsel.v VI0, VI1, VI0, VT0 // index follows the value
+    blt $r0, I, .L10
+    .align 3
+
+.L15: // unit-stride exit: reduce the per-lane maxima in VM0 and indices in VI0 to one result
+#ifdef DOUBLE
+    vreplvei.d VI1, VI0, 0 // broadcast each lane index
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d x1, VM0, 0 // broadcast each lane maximum
+    vreplvei.d x2, VM0, 1
+    fcmp.ceq.d $fcc0, $f10, $f9 // x1 and x2 are vr9 and vr10; the scalar compare reads them as f9 and f10
+    bceqz $fcc0, .L26 // lanes differ: .L26 picks the larger one
+    vfcmp.clt.d VT0, VI1, VI2 // equal maxima: keep the smaller index
+    vbitsel.v VI0, VI2, VI1, VT0
+    b .L27
+#else
+    vreplvei.w VI1, VI0, 0 // broadcast each lane index (VI3 and VI4 are reused; the loop constants are no longer needed)
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w x1, VM0, 0 // broadcast each lane maximum
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfmaxa.s VM1, x1, x2 // winner of lanes 0 and 1
+    vfcmp.ceq.s VT0, VM1, x1
+    vbitsel.v VINC4, VI2, VI1, VT0 // its index (VI1 when x1 won)
+    vfmaxa.s VM0, x3, x4 // winner of lanes 2 and 3
+    vfcmp.ceq.s VT0, x3, VM0
+    vbitsel.v VINC8, VI4, VI3, VT0 // its index (VI3 when x3 won)
+    vfmaxa.s VM0, VM0, VM1 // overall maximum
+    vfcmp.ceq.s VT0, VM0, VM1
+    vbitsel.v VI0, VINC8, VINC4, VT0 // index of the overall maximum
+    fcmp.ceq.d $fcc0, $f15, $f9 // tie check against lane 0 (f15 is VM0, f9 is x1)
+    bceqz $fcc0, .L26
+    vfcmp.clt.s VT0, VI1, VI0 // lane 0 also holds the maximum: keep the smaller index
+    vbitsel.v VI0, VI0, VI1, VT0
+    b .L26 // continue with the tie checks for lanes 1..3
+#endif
+    .align 3
+
+.L20: // INCX!=1
+#ifdef DOUBLE
+    addi.d i0, i0, 1 // i0 = 1, the index reported when only the scalar tail runs
+    srai.d I, N, 2 // I = N / 4 iterations of the gather loop
+    bge $r0, I, .L21 // fewer than 4 elements: scalar tail only
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d VINC4, i0 // index increment per half iteration
+    addi.d i0, i0, -3
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI1, i0, 1 // VI1 = {-1, 0}; the first vadd in .L24 makes it {1, 2}
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+#else
+    addi.w i0, i0, 1 // i0 = 1, the index reported when only the scalar tail runs
+    srai.d I, N, 2 // I = N / 4 iterations of the gather loop
+    bge $r0, I, .L21 // fewer than 4 elements: scalar tail only
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w VINC4, i0 // index increment per iteration
+    addi.w i0, i0, -7
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3 // VI1 = {-3, -2, -1, 0}; the first vadd in .L24 makes it {1, 2, 3, 4}
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+#endif
+    .align 3
+
+.L24: // strided main loop: gathers 4 elements per iteration through GPR loads, then same max/index update as .L10
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0 // x1 collects the values at offset 0, x2 those at offset 1
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    vadd.d VI1, VI1, VINC4 // candidate indices for this half
+    vfmul.d x3, VI4, x1 // negated copies, VI4 holds -1
+    vfmul.d x4, VI4, x2
+    vfcmp.clt.d VT0, x1, VI3 // mask of lanes below zero (VI3 is 0)
+    vfcmp.clt.d VINC8, x2, VI3 // VINC8 is only a scratch mask in this file
+    vbitsel.v x1, x1, x3, VT0 // absolute value by selecting the negated copy
+    vbitsel.v x2, x2, x4, VINC8
+    vfadd.d x1, x1, x2
+    vfmax.d x3, VM0, x1
+    ld.d t1, X, 0 * SIZE // loads for the second half are interleaved with the update
+    vfcmp.ceq.d VT0, x3, VM0
+    ld.d t2, X, 1 * SIZE
+    vbitsel.v VM0, x3, VM0, VT0
+    vbitsel.v VI0, VI1, VI0, VT0
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    vadd.d VI1, VI1, VINC4
+    addi.d I, I, -1
+    vfmul.d x3, VI4, x1
+    vfmul.d x4, VI4, x2
+    vfcmp.clt.d VT0, x1, VI3
+    vfcmp.clt.d VINC8, x2, VI3
+    vbitsel.v x1, x1, x3, VT0
+    vbitsel.v x2, x2, x4, VINC8
+    vfadd.d x1, x1, x2
+    vfmax.d x3, VM0, x1
+    vfcmp.ceq.d VT0, x3, VM0
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 0 // x1 collects the values at offset 0, x2 those at offset 1
+    vinsgr2vr.w x2, t2, 0
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+    vadd.w VI1, VI1, VINC4 // candidate indices for this iteration
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    addi.d I, I, -1
+    // x1 and x2 are already split by the inserts above. The former vpickev.w/vpickod.w
+    // from VX0/VX1 were removed: VX0/VX1 are never loaded on this path and clobbered x1/x2.
+    vfmul.s x3, VI4, x1 // negated copies, VI4 holds -1
+    vfmul.s x4, VI4, x2
+    vfcmp.clt.s VT0, x1, VI3 // mask of lanes below zero (VI3 is 0)
+    vfcmp.clt.s VINC8, x2, VI3 // VINC8 is only a scratch mask in this file
+    vbitsel.v x1, x1, x3, VT0 // absolute value by selecting the negated copy
+    vbitsel.v x2, x2, x4, VINC8
+    vfadd.s x1, x1, x2
+    vfmax.s x3, VM0, x1
+    vfcmp.ceq.s VT0, x3, VM0 // mask set where the old maximum is kept
+#endif
+    vbitsel.v VM0, x3, VM0, VT0
+    vbitsel.v VI0, VI1, VI0, VT0 // index follows the value
+    blt $r0, I, .L24
+    .align 3
+
+.L25: // strided exit: same lane reduction as .L15, followed by the shared tie-break labels
+#ifdef DOUBLE
+    vreplvei.d VI1, VI0, 0 // broadcast each lane index
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d x1, VM0, 0 // broadcast each lane maximum
+    vreplvei.d x2, VM0, 1
+    fcmp.ceq.d $fcc0, $f10, $f9 // x1 and x2 are vr9 and vr10; the scalar compare reads them as f9 and f10
+    bceqz $fcc0, .L26 // lanes differ: .L26 picks the larger one
+    vfcmp.clt.d VT0, VI1, VI2 // equal maxima: keep the smaller index
+    vbitsel.v VI0, VI2, VI1, VT0
+    b .L27
+#else
+    vreplvei.w VI1, VI0, 0 // broadcast each lane index
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w x1, VM0, 0 // broadcast each lane maximum
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfmaxa.s VM1, x1, x2 // winner of lanes 0 and 1
+    vfcmp.ceq.s VT0, VM1, x1
+    vbitsel.v VINC4, VI2, VI1, VT0
+    vfmaxa.s VM0, x3, x4 // winner of lanes 2 and 3
+    vfcmp.ceq.s VT0, x3, VM0
+    vbitsel.v VINC8, VI4, VI3, VT0
+    vfmaxa.s VM0, VM0, VM1 // overall maximum
+    vfcmp.ceq.s VT0, VM0, VM1
+    vbitsel.v VI0, VINC8, VINC4, VT0 // index of the overall maximum
+    fcmp.ceq.d $fcc0, $f15, $f9 // tie check against lane 0 (f15 is VM0, f9 is x1)
+    bceqz $fcc0, .L26
+    vfcmp.clt.s VT0, VI1, VI0 // lane 0 also holds the maximum: keep the smaller index
+    vbitsel.v VI0, VI0, VI1, VT0
+#endif
+    .align 3
+
+#ifdef DOUBLE
+.L26: // the two lanes differ: take the larger one and its index
+    vfmaxa.d VM0, x1, x2
+    vfcmp.ceq.d VT0, x1, VM0
+    vbitsel.v VI0, VI2, VI1, VT0 // VI1 when lane 0 won, else VI2
+    .align 3
+
+.L27:
+    movfr2gr.d i0, $f20 // f20 is the low element of VI0 (vr20): the index found by the vector stage
+    .align 3
+#else
+.L26: // tie check against lane 1 (f10 is x2)
+    fcmp.ceq.d $fcc0, $f15, $f10
+    bceqz $fcc0, .L27
+    vfcmp.clt.s VT0, VI2, VI0 // keep the smaller index
+    vbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L27: // tie check against lane 2 (f11 is x3)
+    fcmp.ceq.d $fcc0, $f15, $f11
+    bceqz $fcc0, .L28
+    vfcmp.clt.s VT0, VI3, VI0
+    vbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28: // tie check against lane 3 (f12 is x4)
+    fcmp.ceq.d $fcc0, $f15, $f12
+    bceqz $fcc0, .L29
+    vfcmp.clt.s VT0, VI4, VI0
+    vbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+    movfr2gr.s i0, $f20 // f20 is the low element of VI0 (vr20): the index found by the vector stage
+    .align 3
+
+#endif
+.L21: //N<4
+    andi I, N, 3 // I = number of leftover elements (N mod 4)
+    bge $r0, I, .L999 // none left: return i0 as it stands
+    srai.d i1, N, 2
+    slli.d i1, i1, 2 // i1 = N rounded down to a multiple of 4
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1 // f21 = index of the element under test
+    movgr2fr.d $f20, i0 // f20 = best index so far
+    .align 3
+
+.L22: // scalar tail: one element per iteration, s1 holds the running maximum
+    LD a0, X, 0 * SIZE
+    LD a1, X, 1 * SIZE
+    addi.d I, I, -1
+    FABS a0, a0
+    FABS a1, a1
+    ADD a0, a0, a1 // a0 = sum of the two absolute values
+    FMAX a1, s1, a0
+    CMPEQ $fcc0, s1, a1 // set when the old maximum is kept
+    add.d  X, X, INCX // INCX was scaled in the prologue
+    fsel s1, a1, s1, $fcc0 // s1 stays when the flag is set, else takes the new value
+    fsel $f20, $f21, $f20, $fcc0 // best index stays when the flag is set, else takes the current index
+    addi.d i1, i1, 1
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22
+    MTG  i0, $f20 // move the final index back to the integer result register
+    .align 3
+
+.L999:
+    move $r4, $r17 // return value: i0 is r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/icamin_lasx.S b/kernel/loongarch64/icamin_lasx.S
new file mode 100644
index 000000000..01abd45b2
--- /dev/null
+++ b/kernel/loongarch64/icamin_lasx.S
@@ -0,0 +1,555 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r13
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17
+#define i1 $r14
+#define TEMP $r19
+#define a0 $f12
+#define a1 $f13
+#define s1 $f15
+#define x1 $xr9
+#define x2 $xr10
+#define x3 $xr11
+#define x4 $xr12
+#define VX0 $xr13
+#define VX1 $xr14
+#define VM0 $xr15
+#define VM1 $xr16
+#define VINC4 $xr17
+#define VINC8 $xr18
+#define VI0 $xr20
+#define VI1 $xr21
+#define VI2 $xr22
+#define VI3 $xr8
+#define VI4 $xr19
+#define VT0 $xr23
+
+    PROLOGUE
+    li.d  i0, 0 // i0 carries the result index; 0 is what .L999 returns on the early exits below
+    bge $r0, N, .L999 // N <= 0: return 0
+    bge $r0, INCX, .L999 // INCX <= 0: return 0
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, ZBASE_SHIFT // TEMP = 1 << ZBASE_SHIFT, the scaled value of INCX == 1
+    slli.d INCX, INCX, ZBASE_SHIFT // INCX is scaled the same way and later added to X directly
+    LD   a0, X, 0 * SIZE // seed the running minimum from the first element
+    LD   a1, X, 1 * SIZE
+    FABS a0, a0
+    FABS a1, a1
+    ADD  s1, a1, a0 // s1 (f15) = sum of the two absolute values of element 1
+#ifdef DOUBLE
+    xvreplve0.d VM0, VM0 // broadcast element 0 of VM0 (xr15, written through f15 above) to all lanes
+    xvxor.v VI3, VI3, VI3 // 0
+    li.d I, -1
+    xvreplgr2vr.d VI4, I
+    xvffint.d.l VI4, VI4 // -1
+    bne INCX, TEMP, .L20 // non-unit stride: use the gather loop
+    addi.d i0, i0, 1 // i0 = 1, the index reported when only the scalar tail runs
+    srai.d I, N, 2 // I = N / 4 vector iterations
+    bge $r0, I, .L21 // fewer than 4 elements: scalar tail only
+    slli.d i0, i0, 2 //4
+    xvreplgr2vr.d VINC4, i0 // per-iteration index increment
+    addi.d i0, i0, -7
+    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 2
+    xvinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, -1
+    xvinsgr2vr.d VI1, i0, 2
+    addi.d i0, i0, 2
+    xvinsgr2vr.d VI1, i0, 3 // VI1 = {-3, -1, -2, 0}; after the first add {1, 3, 2, 4}, presumably the lane order produced by xvpickev.d in .L10
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 2
+    xvinsgr2vr.d VI0, i0, 1 //3
+    addi.d i0, i0, -1
+    xvinsgr2vr.d VI0, i0, 2 //2
+    addi.d i0, i0, 2
+    xvinsgr2vr.d VI0, i0, 3 //4
+#else
+    xvreplve0.w VM0, VM0 // broadcast element 0 of VM0 (xr15, written through f15 above) to all lanes
+    xvxor.v VI3, VI3, VI3 // 0
+    li.w I, -1
+    xvreplgr2vr.w VI4, I
+    xvffint.s.w VI4, VI4 // -1
+    bne INCX, TEMP, .L20 // non-unit stride: use the gather loop
+    addi.w i0, i0, 1 // i0 = 1, the index reported when only the scalar tail runs
+    srai.d I, N, 3 // I = N / 8 vector iterations
+    bge $r0, I, .L21 // fewer than 8 elements: scalar tail only
+    slli.w i0, i0, 3 //8
+    xvreplgr2vr.w VINC8, i0 // per-iteration index increment
+    addi.w i0, i0, -15
+    xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 3
+    xvinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, -3
+    xvinsgr2vr.w VI1, i0, 4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 5
+    addi.w i0, i0, 3
+    xvinsgr2vr.w VI1, i0, 6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 7 // VI1 = {-7, -6, -3, -2, -5, -4, -1, 0}; after the first add {1, 2, 5, 6, 3, 4, 7, 8}
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 3
+    xvinsgr2vr.w VI0, i0, 2 //5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 3 //6
+    addi.w i0, i0, -3
+    xvinsgr2vr.w VI0, i0, 4 //3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 5 //4
+    addi.w i0, i0, 3
+    xvinsgr2vr.w VI0, i0, 6 //7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 7 //8
+#endif
+    .align 3
+
+.L10: // unit-stride main loop: two 32-byte vectors per iteration, tracks per-lane minimum of |a|+|b| and its index
+    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    xvadd.d VI1, VI1, VINC4 // candidate indices for this iteration
+    xvld VX1, X, 4 * SIZE
+    addi.d I, I, -1
+    xvpickev.d x1, VX1, VX0 // even-position values (presumably the real parts)
+    xvpickod.d x2, VX1, VX0 // odd-position values (presumably the imaginary parts)
+    xvfmul.d x3, VI4, x1 // negated copies, VI4 holds -1
+    xvfmul.d x4, VI4, x2
+    xvfcmp.clt.d VT0, x1, VI3 // mask of lanes below zero (VI3 is 0)
+    xvfcmp.clt.d VINC8, x2, VI3 // VINC8 is free as a scratch mask in double precision
+    xvbitsel.v x1, x1, x3, VT0 // absolute value by selecting the negated copy
+    xvbitsel.v x2, x2, x4, VINC8
+#else
+    xvadd.w VI1, VI1, VINC8 // candidate indices for this iteration
+    xvld VX1, X, 8 * SIZE
+    addi.d I, I, -1
+    xvpickev.w x1, VX1, VX0 // even-position values (presumably the real parts)
+    xvpickod.w x2, VX1, VX0 // odd-position values (presumably the imaginary parts)
+    xvfmul.s x3, VI4, x1 // negated copies, VI4 holds -1
+    xvfmul.s x4, VI4, x2
+    xvfcmp.clt.s VT0, x1, VI3 // mask of lanes below zero (VI3 is 0)
+    xvfcmp.clt.s VINC4, x2, VI3 // VINC4 is free as a scratch mask in single precision
+    xvbitsel.v x1, x1, x3, VT0 // absolute value by selecting the negated copy
+    xvbitsel.v x2, x2, x4, VINC4
+#endif
+    XVFADD  x1, x1, x2 // x1 = sum of the two absolute values
+    XVFMIN  x3, VM0, x1
+    XVCMPEQ VT0, x3, VM0 // mask set where the old minimum is kept
+    addi.d X, X, 64 // both precisions consume two 32-byte vectors; 8 * SIZE covered only one of them in single precision
+    xvbitsel.v VM0, x3, VM0, VT0
+    xvbitsel.v VI0, VI1, VI0, VT0 // index follows the value
+    blt $r0, I, .L10
+    .align 3
+
+.L15: // unit-stride exit: reduce lanes 0..3 of VM0/VI0 to one minimum and its index
+#ifdef DOUBLE
+    xvpickve.d VI1, VI0, 0 // isolate each lane index
+    xvpickve.d VI2, VI0, 1
+    xvpickve.d VI3, VI0, 2
+    xvpickve.d VI4, VI0, 3
+    xvpickve.d x1, VM0, 0 // isolate each lane minimum
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    xvfmin.d VM1, x1, x2 // smaller of lanes 0 and 1
+    xvfcmp.ceq.d VT0, VM1, x1
+    xvbitsel.v VINC4, VI2, VI1, VT0 // its index (VI1 when x1 won)
+    xvfmin.d VM0, x3, x4 // smaller of lanes 2 and 3
+    xvfcmp.ceq.d VT0, x3, VM0
+    xvbitsel.v VINC8, VI4, VI3, VT0 // its index (VI3 when x3 won)
+    xvfmin.d VM0, VM0, VM1 // overall minimum
+    xvfcmp.ceq.d VT0, VM0, VM1
+    xvbitsel.v VI0, VINC8, VINC4, VT0 // index of the overall minimum
+#else
+    xvxor.v VX0, VX0, VX0
+    xvor.v VX0, VI0, VX0 // VX0 = copy of VI0, kept for the upper lanes handled at .L252
+    xvxor.v VX1, VX1, VX1
+    xvor.v VX1, VM0, VX1 // VX1 = copy of VM0, kept for the upper lanes handled at .L252
+    xvpickve.w VI1, VI0, 0 // isolate each lane index
+    xvpickve.w VI2, VI0, 1
+    xvpickve.w VI3, VI0, 2
+    xvpickve.w VI4, VI0, 3
+    xvpickve.w x1, VM0, 0 // isolate each lane minimum
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+    xvfcmp.clt.s VT0, x2, x1 // operands swapped versus the max kernel so the smaller value is selected
+    xvbitsel.v VM1, x1, x2, VT0 // VM1 = x2 when x2 < x1, else x1
+    xvbitsel.v VINC4, VI1, VI2, VT0 // matching index
+    xvfcmp.clt.s VT0, x4, x3
+    xvbitsel.v VM0, x3, x4, VT0 // VM0 = x4 when x4 < x3, else x3
+    xvbitsel.v VINC8, VI3, VI4, VT0 // matching index
+    xvfcmp.clt.s VT0, VM1, VM0
+    xvbitsel.v VM0, VM0, VM1, VT0 // overall minimum of lanes 0..3
+    xvbitsel.v VI0, VINC8, VINC4, VT0 // its index (VINC4 when VM1 won)
+#endif
+    fcmp.ceq.d $fcc0, $f15, $f9 // tie check against lane 0 (f15 is VM0, f9 is x1)
+    bceqz $fcc0, .L26
+    XVCMPLT  VT0, VI1, VI0 // lane 0 also holds the minimum: keep the smaller index
+    xvbitsel.v VI0, VI0, VI1, VT0
+    b .L26 // continue with the tie checks for lanes 1..3
+    .align 3
+
+.L20: // INCX!=1
+#ifdef DOUBLE
+    addi.d i0, i0, 1 // i0 = 1, the index reported when only the scalar tail runs
+    srai.d I, N, 2 // I = N / 4 iterations of the gather loop
+    bge $r0, I, .L21 // fewer than 4 elements: scalar tail only
+    slli.d i0, i0, 2 //4
+    xvreplgr2vr.d VINC4, i0 // per-iteration index increment
+    addi.d i0, i0, -7
+    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1 // lanes are numbered in plain order: .L24 inserts element k into lane k
+    xvinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 3 // VI1 = {-3, -2, -1, 0}; the first add in .L24 makes it {1, 2, 3, 4}
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 1 //2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 2 //3
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 3 //4
+#else
+    addi.w i0, i0, 1 // i0 = 1, the index reported when only the scalar tail runs
+    srai.d I, N, 3 // I = N / 8 iterations of the gather loop
+    bge $r0, I, .L21 // fewer than 8 elements: scalar tail only
+    slli.w i0, i0, 3 //8
+    xvreplgr2vr.w VINC8, i0 // per-iteration index increment
+    addi.w i0, i0, -15
+    xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1 // lanes are numbered in plain order: .L24 inserts element k into lane k
+    xvinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 7 // VI1 = {-7 .. 0}; the add in .L24 makes it {1 .. 8}
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 3 //4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 4 //5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 5 //6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 6 //7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 7 //8
+#endif
+    .align 3
+
+.L24: // strided main loop: gathers one vector of elements through GPR loads, then same min/index update as .L10
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 0 // x1 collects the values at offset 0, x2 those at offset 1
+    xvinsgr2vr.d x2, t2, 0
+    xvinsgr2vr.d x1, t3, 1
+    xvinsgr2vr.d x2, t4, 1
+    xvadd.d VI1, VI1, VINC4 // candidate indices for this iteration
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 2
+    xvinsgr2vr.d x2, t2, 2
+    xvinsgr2vr.d x1, t3, 3
+    xvinsgr2vr.d x2, t4, 3
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 0 // x1 collects the values at offset 0, x2 those at offset 1
+    xvinsgr2vr.w x2, t2, 0
+    xvinsgr2vr.w x1, t3, 1
+    xvinsgr2vr.w x2, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 2
+    xvinsgr2vr.w x2, t2, 2
+    xvinsgr2vr.w x1, t3, 3
+    xvinsgr2vr.w x2, t4, 3
+    xvadd.w VI1, VI1, VINC8 // candidate indices: advanced once per 8-element iteration
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 4
+    xvinsgr2vr.w x2, t2, 4
+    xvinsgr2vr.w x1, t3, 5
+    xvinsgr2vr.w x2, t4, 5
+    // (a second xvadd.w of VINC8 was removed here: it made the indices advance by 16 per iteration)
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 6
+    xvinsgr2vr.w x2, t2, 6
+    xvinsgr2vr.w x1, t3, 7
+    xvinsgr2vr.w x2, t4, 7
+    // x1 and x2 are already split by the inserts above. The former xvpickev.w/xvpickod.w
+    // from VX0/VX1 were removed: VX0/VX1 are never loaded on this path and clobbered x1/x2.
+#endif
+    addi.d I, I, -1
+    XVFMUL x3, VI4, x1 // negated copies, VI4 holds -1
+    XVFMUL x4, VI4, x2
+    XVCMPLT VT0, x1, VI3 // mask of lanes below zero (VI3 is 0)
+    XVCMPLT VM1, x2, VI3 // VM1 is the scratch mask: VINC8 is the live index step in single precision
+    xvbitsel.v x1, x1, x3, VT0 // absolute value by selecting the negated copy
+    xvbitsel.v x2, x2, x4, VM1
+    XVFADD x1, x1, x2 // x1 = sum of the two absolute values
+    XVFMIN x3, VM0, x1
+    XVCMPEQ VT0, x3, VM0 // mask set where the old minimum is kept
+    xvbitsel.v VM0, x3, VM0, VT0
+    xvbitsel.v VI0, VI1, VI0, VT0 // index follows the value
+    blt $r0, I, .L24
+    .align 3
+
+.L25: // strided exit: reduce lanes 0..3 of VM0/VI0 to one minimum and its index, then fall into the tie checks
+#ifdef DOUBLE
+    xvpickve.d VI1, VI0, 0 // isolate each lane index
+    xvpickve.d VI2, VI0, 1
+    xvpickve.d VI3, VI0, 2
+    xvpickve.d VI4, VI0, 3
+    xvpickve.d x1, VM0, 0 // isolate each lane minimum
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    xvfmina.d VM1, x1, x2 // smaller of lanes 0 and 1
+    xvfcmp.ceq.d VT0, VM1, x1
+    xvbitsel.v VINC4, VI2, VI1, VT0 // its index (VI1 when x1 won)
+    xvfmina.d VM0, x3, x4 // smaller of lanes 2 and 3
+    xvfcmp.ceq.d VT0, x3, VM0
+    xvbitsel.v VINC8, VI4, VI3, VT0 // its index (VI3 when x3 won)
+    xvfmina.d VM0, VM0, VM1 // overall minimum
+    xvfcmp.ceq.d VT0, VM0, VM1 // mask consumed by the shared select after the conditional block
+#else
+    xvxor.v VX0, VX0, VX0
+    xvor.v VX0, VI0, VX0 // VX0 = copy of VI0, kept for the upper lanes handled at .L252
+    xvxor.v VX1, VX1, VX1
+    xvor.v VX1, VM0, VX1 // VX1 = copy of VM0, kept for the upper lanes handled at .L252
+    xvpickve.w VI1, VI0, 0 // isolate each lane index
+    xvpickve.w VI2, VI0, 1
+    xvpickve.w VI3, VI0, 2
+    xvpickve.w VI4, VI0, 3
+    xvpickve.w x1, VM0, 0 // isolate each lane minimum
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+    xvfcmp.clt.s VT0, x2, x1 // operands swapped versus the max kernel so the smaller value is selected
+    xvbitsel.v VM1, x1, x2, VT0 // VM1 = x2 when x2 < x1, else x1
+    xvbitsel.v VINC4, VI1, VI2, VT0 // matching index
+    xvfcmp.clt.s VT0, x4, x3
+    xvbitsel.v VM0, x3, x4, VT0 // VM0 = x4 when x4 < x3, else x3
+    xvbitsel.v VINC8, VI3, VI4, VT0 // matching index
+    xvfcmp.clt.s VT0, VM1, VM0
+    xvbitsel.v VM0, VM0, VM1, VT0 // overall minimum of lanes 0..3
+#endif
+    xvbitsel.v VI0, VINC8, VINC4, VT0 // index of the overall minimum (VINC4 when VM1 won)
+    fcmp.ceq.d $fcc0, $f15, $f9 // tie check against lane 0 (f15 is VM0, f9 is x1)
+    bceqz $fcc0, .L26
+    XVCMPLT  VT0, VI1, VI0 // lane 0 also holds the minimum: keep the smaller index
+    xvbitsel.v VI0, VI0, VI1, VT0
+    .align 3
+
+.L26: // tie check against lane 1 (f10 is x2): if it equals the minimum in f15, keep the smaller index
+    fcmp.ceq.d $fcc0, $f15, $f10
+    bceqz $fcc0, .L27
+    XVCMPLT  VT0, VI2, VI0
+    xvbitsel.v VI0, VI0, VI2, VT0 // VI0 = VI2 where VI2 < VI0
+    .align 3
+
+.L27: // tie check against lane 2 (f11 is x3)
+    fcmp.ceq.d $fcc0, $f15, $f11
+    bceqz $fcc0, .L28
+    XVCMPLT  VT0, VI3, VI0
+    xvbitsel.v VI0, VI0, VI3, VT0 // VI0 = VI3 where VI3 < VI0
+    .align 3
+
+.L28: // tie check against lane 3 (f12 is x4)
+    fcmp.ceq.d $fcc0, $f15, $f12
+    bceqz $fcc0, .L29
+    XVCMPLT  VT0, VI4, VI0
+    xvbitsel.v VI0, VI0, VI4, VT0 // VI0 = VI4 where VI4 < VI0
+    .align 3
+
+.L29: // end of the vector stage: move the index to i0 (double) or first reduce the upper four lanes (single)
+#ifdef DOUBLE
+    movfr2gr.d i0, $f20 // f20 is the low element of VI0 (xr20)
+    .align 3
+
+.L21: //N<4
+    andi I, N, 3 // I = number of leftover elements (N mod 4)
+    bge $r0, I, .L999
+    srai.d i1, N, 2
+    slli.d i1, i1, 2 // i1 = N rounded down to a multiple of 4
+#else
+    fmov.s $f16, $f20 // f16 = index found in lanes 0..3
+    .align 3
+
+.L252: // repeat the reduction for lanes 4..7 using the copies saved in VX0/VX1
+    xvxor.v VI0, VI0, VI0
+    xvor.v VI0, VI0, VX0 // VI0 = saved per-lane indices
+    fmov.s $f13, $f15 // f13 = minimum found in lanes 0..3 (VX0 is no longer needed)
+    xvxor.v VM0, VM0, VM0
+    xvor.v VM0, VM0, VX1 // VM0 = saved per-lane minima
+    xvpickve.w VI1, VI0, 4
+    xvpickve.w VI2, VI0, 5
+    xvpickve.w VI3, VI0, 6
+    xvpickve.w VI4, VI0, 7
+    xvpickve.w x1, VM0, 4
+    xvpickve.w x2, VM0, 5
+    xvpickve.w x3, VM0, 6
+    xvpickve.w x4, VM0, 7
+    xvfcmp.clt.s VT0, x2, x1 // operands swapped versus the max kernel so the smaller value is selected
+    xvbitsel.v VX1, x1, x2, VT0 // pair minimum goes to VX1 (free now) so x1 stays intact for the tie check below
+    xvbitsel.v VINC4, VI1, VI2, VT0 // matching index
+    xvfcmp.clt.s VT0, x4, x3
+    xvbitsel.v VM0, x3, x4, VT0 // VM0 = x4 when x4 < x3, else x3
+    xvbitsel.v VINC8, VI3, VI4, VT0 // matching index
+    xvfcmp.clt.s VT0, VX1, VM0
+    xvbitsel.v VM0, VM0, VX1, VT0 // minimum of lanes 4..7
+    xvbitsel.v VI0, VINC8, VINC4, VT0 // its index (VINC4 when VX1 won)
+    fcmp.ceq.d $fcc0, $f15, $f9 // tie check against lane 4
+    bceqz $fcc0, .L262
+    xvfcmp.clt.s VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    .align 3
+
+.L262: // tie check against lane 5
+    fcmp.ceq.d $fcc0, $f15, $f10
+    bceqz $fcc0, .L272
+    xvfcmp.clt.s VT0, VI2, VI0
+    xvbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L272: // tie check against lane 6
+    fcmp.ceq.d $fcc0, $f15, $f11
+    bceqz $fcc0, .L282
+    xvfcmp.clt.s VT0, VI3, VI0
+    xvbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L282: // tie check against lane 7
+    fcmp.ceq.d $fcc0, $f15, $f12
+    bceqz $fcc0, .L292
+    xvfcmp.clt.s VT0, VI4, VI0
+    xvbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L292: // merge the two halves: take the lanes 0..3 result when it is strictly smaller
+    fcmp.clt.s $fcc0, $f13, $f15 // NOTE(review): on an exact tie the lanes 4..7 index is kept, which may not be the smallest
+    fsel $f15, $f15, $f13, $fcc0
+    fsel $f20, $f20, $f16, $fcc0
+    movfr2gr.s i0, $f20
+
+.L21: //N<8
+    andi I, N, 7 // I = number of leftover elements (N mod 8)
+    bge $r0, I, .L999
+    srai.d i1, N, 3
+    slli.d i1, i1, 3 // i1 = N rounded down to a multiple of 8
+#endif
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1 // f21 = index of the element under test
+    movgr2fr.d $f20, i0 // f20 = best index so far
+    .align 3
+
+.L22: // scalar tail: one element per iteration, s1 holds the running minimum
+    LD a0, X, 0 * SIZE
+    LD a1, X, 1 * SIZE
+    addi.d I, I, -1
+    FABS a0, a0
+    FABS a1, a1
+    ADD a0, a0, a1 // a0 = sum of the two absolute values
+    FMIN a1, s1, a0
+    CMPEQ $fcc0, s1, a1 // set when the old minimum is kept
+    add.d  X, X, INCX // INCX was scaled in the prologue
+    fsel s1, a1, s1, $fcc0 // s1 stays when the flag is set, else takes the new value
+    fsel $f20, $f21, $f20, $fcc0 // best index stays when the flag is set, else takes the current index
+    addi.d i1, i1, 1
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22
+    MTG  i0, $f20 // move the final index back to the integer result register
+    .align 3
+
+
+.L999:
+    move $r4, $r17 // return value: i0 is r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/icamin_lsx.S b/kernel/loongarch64/icamin_lsx.S
new file mode 100644
index 000000000..a08cd33c5
--- /dev/null
+++ b/kernel/loongarch64/icamin_lsx.S
@@ -0,0 +1,425 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r13
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17
+#define i1 $r14
+#define TEMP $r19
+#define a0 $f12
+#define a1 $f13
+#define s1 $f15
+#define x1 $vr9
+#define x2 $vr10
+#define x3 $vr11
+#define x4 $vr12
+#define VX0 $vr13
+#define VX1 $vr14
+#define VM0 $vr15
+#define VM1 $vr16
+#define VINC4 $vr17
+#define VINC8 $vr18
+#define VI0 $vr20
+#define VI1 $vr21
+#define VI2 $vr22
+#define VI3 $vr8
+#define VI4 $vr19
+#define VT0 $vr23
+
+    PROLOGUE
+    li.d  i0, 0
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, ZBASE_SHIFT
+    slli.d INCX, INCX, ZBASE_SHIFT
+    LD   a0, X, 0 * SIZE
+    LD   a1, X, 1 * SIZE
+    FABS a0, a0
+    FABS a1, a1
+    ADD  s1, a1, a0
+    vreplvei.w VM0, VM0, 0
+    vxor.v VI3, VI3, VI3 // 0
+#ifdef DOUBLE
+    li.d I, -1
+    vreplgr2vr.d VI4, I
+    vffint.d.l VI4, VI4 // -1
+    bne INCX, TEMP, .L20
+    addi.d i0, i0, 1
+    srai.d I, N, 2
+    bge $r0, I, .L21
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d VINC4, i0
+    addi.d i0, i0, -3
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+#else
+    li.w I, -1
+    vreplgr2vr.w VI4, I
+    vffint.s.w VI4, VI4 // -1
+    bne INCX, TEMP, .L20
+    addi.w i0, i0, 1
+    srai.d I, N, 2
+    bge $r0, I, .L21
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w VINC4, i0
+    addi.w i0, i0, -7
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+#endif
+    .align 3
+
+.L10:
+    vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    vadd.d VI1, VI1, VINC4
+    vld VX1, X, 2 * SIZE
+    addi.d I, I, -1
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vfmul.d x3, VI4, x1
+    vfmul.d x4, VI4, x2
+    vfcmp.clt.d VT0, x1, VI3
+    vfcmp.clt.d VINC8, x2, VI3
+    vbitsel.v x1, x1, x3, VT0
+    vbitsel.v x2, x2, x4, VINC8
+    vfadd.d x1, x1, x2
+    vfmin.d x3, VM0, x1
+    vfcmp.ceq.d VT0, x3, VM0
+    vbitsel.v VM0, x3, VM0, VT0
+    vbitsel.v VI0, VI1, VI0, VT0
+    vld VX0, X, 4 * SIZE
+    vadd.d VI1, VI1, VINC4
+    vld VX1, X, 6 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+#else
+    vadd.w VI1, VI1, VINC4
+    vld VX1, X, 4 * SIZE
+    addi.d I, I, -1
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+#endif
+    VFMUL  x3, VI4, x1
+    VFMUL  x4, VI4, x2
+    VCMPLT VT0, x1, VI3
+    VCMPLT VINC8, x2, VI3
+    vbitsel.v x1, x1, x3, VT0
+    vbitsel.v x2, x2, x4, VINC8
+    VFADD  x1, x1, x2
+    VFMIN  x3, VM0, x1
+    VCMPEQ VT0, x3, VM0
+    addi.d X, X, 8 * SIZE
+    vbitsel.v VM0, x3, VM0, VT0
+    vbitsel.v VI0, VI1, VI0, VT0
+    blt $r0, I, .L10
+    .align 3
+
+.L15:
+#ifdef DOUBLE
+    vreplvei.d VI1, VI0, 0
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    fcmp.ceq.d $fcc0, $f10, $f9
+    bceqz $fcc0, .L26
+    vfcmp.clt.d VT0, VI1, VI2
+    vbitsel.v VI0, VI2, VI1, VT0
+    b .L27
+#else
+    vreplvei.w VI1, VI0, 0
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfmina.s VM1, x1, x2
+    vfcmp.ceq.s VT0, VM1, x1
+    vbitsel.v VINC4, VI2, VI1, VT0
+    vfmina.s VM0, x3, x4
+    vfcmp.ceq.s VT0, x3, VM0
+    vbitsel.v VINC8, VI4, VI3, VT0
+    vfmina.s VM0, VM0, VM1
+    vfcmp.ceq.s VT0, VM0, VM1
+    vbitsel.v VI0, VINC8, VINC4, VT0
+    fcmp.ceq.d $fcc0, $f15, $f9
+    bceqz $fcc0, .L26
+    vfcmp.clt.s VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+    b .L26
+#endif
+    .align 3
+
+.L20: // INCX!=1
+#ifdef DOUBLE
+    addi.d i0, i0, 1
+    srai.d I, N, 2
+    bge $r0, I, .L21
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d VINC4, i0
+    addi.d i0, i0, -3
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+#else
+    addi.w i0, i0, 1
+    srai.d I, N, 2
+    bge $r0, I, .L21
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w VINC4, i0
+    addi.w i0, i0, -7
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+#endif
+    .align 3
+
+.L24:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    vadd.d VI1, VI1, VINC4
+    vfmul.d x3, VI4, x1
+    vfmul.d x4, VI4, x2
+    vfcmp.clt.d VT0, x1, VI3
+    vfcmp.clt.d VINC8, x2, VI3
+    vbitsel.v x1, x1, x3, VT0
+    vbitsel.v x2, x2, x4, VINC8
+    vfadd.d x1, x1, x2
+    vfmin.d x3, VM0, x1
+    ld.d t1, X, 0 * SIZE
+    vfcmp.ceq.d VT0, x3, VM0
+    ld.d t2, X, 1 * SIZE
+    vbitsel.v VM0, x3, VM0, VT0
+    vbitsel.v VI0, VI1, VI0, VT0
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    vadd.d VI1, VI1, VINC4
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 0
+    vinsgr2vr.w x2, t2, 0
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+    vadd.w VI1, VI1, VINC4
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+#endif
+    addi.d I, I, -1
+    VFMUL  x3, VI4, x1
+    VFMUL  x4, VI4, x2
+    VCMPLT VT0, x1, VI3
+    VCMPLT VINC8, x2, VI3
+    vbitsel.v x1, x1, x3, VT0
+    vbitsel.v x2, x2, x4, VINC8
+    VFADD  x1, x1, x2
+    VFMIN  x3, VM0, x1
+    VCMPEQ VT0, x3, VM0
+    vbitsel.v VM0, x3, VM0, VT0
+    vbitsel.v VI0, VI1, VI0, VT0
+    blt $r0, I, .L24
+    .align 3
+
+.L25:
+#ifdef DOUBLE
+    vreplvei.d VI1, VI0, 0
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    fcmp.ceq.d $fcc0, $f10, $f9
+    bceqz $fcc0, .L26
+    vfcmp.clt.d VT0, VI1, VI2
+    vbitsel.v VI0, VI2, VI1, VT0
+    b .L27
+#else
+    vreplvei.w VI1, VI0, 0
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    vfmina.s VM1, x1, x2
+    vfcmp.ceq.s VT0, VM1, x1
+    vbitsel.v VINC4, VI2, VI1, VT0
+    vfmina.s VM0, x3, x4
+    vfcmp.ceq.s VT0, x3, VM0
+    vbitsel.v VINC8, VI4, VI3, VT0
+    vfmina.s VM0, VM0, VM1
+    vfcmp.ceq.s VT0, VM0, VM1
+    vbitsel.v VI0, VINC8, VINC4, VT0
+    fcmp.ceq.d $fcc0, $f15, $f9
+    bceqz $fcc0, .L26
+    vfcmp.clt.s VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0
+#endif
+    .align 3
+
+.L26:
+#ifdef DOUBLE
+    vfmina.d VM0, x1, x2
+    vfcmp.ceq.d VT0, x1, VM0
+#else
+    fcmp.ceq.d $fcc0, $f15, $f10
+    bceqz $fcc0, .L27
+    vfcmp.clt.s VT0, VI2, VI0
+#endif
+    vbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L27:
+#ifdef DOUBLE
+    movfr2gr.d i0, $f20
+    .align 3
+#else
+    fcmp.ceq.d $fcc0, $f15, $f11
+    bceqz $fcc0, .L28
+    vfcmp.clt.s VT0, VI3, VI0
+    vbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28:
+    fcmp.ceq.d $fcc0, $f15, $f12
+    bceqz $fcc0, .L29
+    vfcmp.clt.s VT0, VI4, VI0
+    vbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+    movfr2gr.s i0, $f20
+    .align 3
+
+#endif
+.L21: //N<4
+    andi I, N, 3
+    bge $r0, I, .L999
+    srai.d i1, N, 2
+    slli.d i1, i1, 2
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+
+.L22:
+    LD a0, X, 0 * SIZE
+    LD a1, X, 1 * SIZE
+    addi.d I, I, -1
+    FABS a0, a0
+    FABS a1, a1
+    ADD a0, a0, a1
+    FMIN a1, s1, a0
+    CMPEQ $fcc0, s1, a1
+    add.d  X, X, INCX
+    fsel s1, a1, s1, $fcc0
+    fsel $f20, $f21, $f20, $fcc0
+    addi.d i1, i1, 1
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22
+    MTG  i0, $f20
+    .align 3
+
+.L999:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/imax_lasx.S b/kernel/loongarch64/imax_lasx.S
new file mode 100644
index 000000000..2d3d5e9d3
--- /dev/null
+++ b/kernel/loongarch64/imax_lasx.S
@@ -0,0 +1,533 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r13
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17
+#define i1 $r14
+#define TEMP $r19
+#define x1 $xr9
+#define x2 $xr10
+#define x3 $xr11
+#define x4 $xr12
+#define VX0 $xr13
+#define VX1 $xr14
+#define VM0 $xr15
+#define VM1 $xr16
+#define VINC4 $xr17
+#define VINC8 $xr18
+#define VI0 $xr20
+#define VI1 $xr21
+#define VI2 $xr22
+#define VI3 $xr8
+#define VI4 $xr19
+#define VT0 $xr23
+// LASX kernel: returns in $r4 the 1-based index of the largest element of X, or 0 when N <= 0 or INCX <= 0.
+    PROLOGUE
+    li.d  i0, 0 // default result, returned as-is by the early exits below
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT // TEMP = 1 << BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT // INCX scaled the same way so X can be advanced with add.d
+    bne INCX, TEMP, .L20 // any other stride uses the element-by-element gather path
+    xvld VM0, X, 0 // running per-lane maxima start as the first vector of X
+#ifdef DOUBLE
+    addi.d i0, i0, 1
+    srai.d I, N, 3 // I = number of full 8-element passes
+    bge $r0, I, .L21
+    slli.d i0, i0, 2 //4
+    xvreplgr2vr.d VINC4, i0
+    slli.d i0, i0, 1 //8
+    xvreplgr2vr.d VINC8, i0
+    addi.d i0, i0, -15
+    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 3
+    addi.d i0, i0, 5
+    xvinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 1 //2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 2 //3
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 3 //4
+#else
+    addi.w i0, i0, 1
+    srai.d I, N, 3 // I = number of full 8-element passes
+    bge $r0, I, .L21
+    slli.w i0, i0, 3 //8
+    xvreplgr2vr.w VINC8, i0
+    addi.w i0, i0, -15
+    xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 3 //4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 4 //5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 5 //6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 6 //7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 7 //8
+#endif
+
+    .align 3
+// Unit-stride loop: each pass reads 8 elements and keeps per-lane maxima in VM0 with their indices in VI0.
+.L10:
+    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    xvadd.d VI1, VI1, VINC8 // indices of the first four elements of this pass
+    xvld VX1, X, 4 * SIZE
+    xvadd.d VI2, VI1, VINC4 // indices of the second four elements
+    xvfcmp.clt.d VT0, VX0, VX1
+    addi.d I, I, -1
+    xvbitsel.v VM1, VX0, VX1, VT0 // lane-wise larger of the two halves
+    xvbitsel.v VI2, VI1, VI2, VT0 // and the index that goes with it
+    xvfcmp.clt.d VT0, VM0, VM1
+    addi.d X, X, 8 * SIZE
+    xvbitsel.v VM0, VM0, VM1, VT0 // replace only where strictly larger
+    xvbitsel.v VI0, VI0, VI2, VT0
+#else
+    xvadd.w VI1, VI1, VINC8 // indices of the eight elements of this pass
+    xvfcmp.clt.s VT0, VM0, VX0
+    addi.d I, I, -1
+    xvbitsel.v VM0, VM0, VX0, VT0 // replace only where strictly larger
+    xvbitsel.v VI0, VI0, VI1, VT0
+    addi.d X, X, 8 * SIZE
+#endif
+    blt $r0, I, .L10
+    .align 3
+// Reduce lanes 0-3 of VM0/VI0 after the unit-stride loop, then join the shared tie handling at .L26.
+.L15:
+#ifdef DOUBLE
+    xvpickve.d VI1, VI0, 0
+    xvpickve.d VI2, VI0, 1
+    xvpickve.d VI3, VI0, 2
+    xvpickve.d VI4, VI0, 3
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    xvfcmp.clt.d VT0, x1, x2
+#else
+    xvxor.v VX0, VX0, VX0
+    xvor.v VX0, VI0, VX0 // VX0 = copy of VI0, reloaded at .L252 for lanes 4-7
+    xvxor.v VX1, VX1, VX1
+    xvor.v VX1, VM0, VX1 // VX1 = copy of VM0, reloaded at .L252 for lanes 4-7
+    xvpickve.w VI1, VI0, 0
+    xvpickve.w VI2, VI0, 1
+    xvpickve.w VI3, VI0, 2
+    xvpickve.w VI4, VI0, 3
+    xvpickve.w x1, VM0, 0
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+    xvfcmp.clt.s VT0, x1, x2
+#endif
+    xvbitsel.v VM1, x1, x2, VT0
+    xvbitsel.v VINC4, VI1, VI2, VT0 // VINC4/VINC8 are reused as index scratch from here on
+    XVCMPLT VT0, x3, x4
+    xvbitsel.v VM0, x3, x4, VT0
+    xvbitsel.v VINC8, VI3, VI4, VT0
+    XVCMPLT VT0, VM0, VM1
+    xvbitsel.v VM0, VM0, VM1, VT0
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+    fcmp.ceq.d $fcc0, $f15, $f9 // does the reduced maximum equal lane 0?
+    bceqz $fcc0, .L26
+    XVCMPLT VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0 // on a tie keep the smaller index
+    b .L26
+    .align 3
+// Strided setup: gather the first elements into VM0 and build the same index vectors as the unit-stride path.
+.L20: // INCX!=1
+    move TEMP, X
+#ifdef DOUBLE
+    addi.d i0, i0, 1
+    ld.d t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.d VM0, t1, 0
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.d t2, TEMP, 0 * SIZE
+#else
+    addi.w i0, i0, 1
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.w VM0, t1, 0 // seed $f15 before the N < 8 exit so the scalar tail starts from X[0]
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.w VM0, t2, 1
+    xvinsgr2vr.w VM0, t3, 2
+    xvinsgr2vr.w VM0, t4, 3
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t2, TEMP, 0 * SIZE
+#endif
+    add.d TEMP, TEMP, INCX
+    ld.d t3, TEMP, 0 * SIZE // NOTE(review): 8-byte load also used on the float path; only the low word is inserted - confirm ld.w is not needed
+    add.d TEMP, TEMP, INCX
+    ld.d t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+#ifdef DOUBLE
+    xvinsgr2vr.d VM0, t1, 0
+    xvinsgr2vr.d VM0, t2, 1
+    xvinsgr2vr.d VM0, t3, 2
+    xvinsgr2vr.d VM0, t4, 3
+    slli.d i0, i0, 2 //4
+    xvreplgr2vr.d VINC4, i0
+    slli.d i0, i0, 1 //8
+    xvreplgr2vr.d VINC8, i0
+    addi.d i0, i0, -15
+    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 3
+    addi.d i0, i0, 5
+    xvinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 1 //2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 2 //3
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 3 //4
+#else
+    xvinsgr2vr.w VM0, t1, 4
+    xvinsgr2vr.w VM0, t2, 5
+    xvinsgr2vr.w VM0, t3, 6
+    xvinsgr2vr.w VM0, t4, 7
+    slli.w i0, i0, 3 //8
+    xvreplgr2vr.w VINC8, i0
+    addi.w i0, i0, -15
+    xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 3 //4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 4 //5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 5 //6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 6 //7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 7 //8
+#endif
+    .align 3
+// Strided loop: gather 8 elements through X/INCX, then update VM0/VI0 exactly as the unit-stride loop does.
+.L24:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    xvadd.d VI1, VI1, VINC8 // indices of the first four elements of this pass
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    // VI1 advances by VINC8 once per pass (done above), matching .L10; a second add would shift every index by 8
+    xvadd.d VI2, VI1, VINC4 // indices of the second four elements
+    xvfcmp.clt.d VT0, VX0, VX1
+    addi.d I, I, -1
+    xvbitsel.v VM1, VX0, VX1, VT0
+    xvbitsel.v VI2, VI1, VI2, VT0
+    xvfcmp.clt.d VT0, VM0, VM1
+    xvbitsel.v VM0, VM0, VM1, VT0
+    xvbitsel.v VI0, VI0, VI2, VT0
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    xvadd.w VI1, VI1, VINC8
+    xvfcmp.clt.s VT0, VM0, VX0
+    addi.d I, I, -1
+    xvbitsel.v VM0, VM0, VX0, VT0
+    xvbitsel.v VI0, VI0, VI1, VT0
+#endif
+    blt $r0, I, .L24
+    .align 3
+// Reduce lanes 0-3 of VM0/VI0 after the strided loop; falls through into the tie handling at .L26.
+.L25:
+#ifdef DOUBLE
+    xvpickve.d VI1, VI0, 0
+    xvpickve.d VI2, VI0, 1
+    xvpickve.d VI3, VI0, 2
+    xvpickve.d VI4, VI0, 3
+    xvpickve.d  x1, VM0, 0
+    xvpickve.d  x2, VM0, 1
+    xvpickve.d  x3, VM0, 2
+    xvpickve.d  x4, VM0, 3
+#else
+    xvxor.v VX0, VX0, VX0
+    xvor.v VX0, VI0, VX0 // VX0 = copy of VI0, reloaded at .L252 for lanes 4-7
+    xvxor.v VX1, VX1, VX1
+    xvor.v VX1, VM0, VX1 // VX1 = copy of VM0, reloaded at .L252 for lanes 4-7
+    xvpickve.w VI1, VI0, 0
+    xvpickve.w VI2, VI0, 1
+    xvpickve.w VI3, VI0, 2
+    xvpickve.w VI4, VI0, 3
+    xvpickve.w  x1, VM0, 0
+    xvpickve.w  x2, VM0, 1
+    xvpickve.w  x3, VM0, 2
+    xvpickve.w  x4, VM0, 3
+#endif
+    XVCMPLT VT0, x1, x2
+    xvbitsel.v VM1, x1, x2, VT0
+    xvbitsel.v VINC4, VI1, VI2, VT0
+    XVCMPLT VT0, x3, x4
+    xvbitsel.v VM0, x3, x4, VT0
+    xvbitsel.v VINC8, VI3, VI4, VT0
+    XVCMPLT VT0, VM0, VM1
+    xvbitsel.v VM0, VM0, VM1, VT0
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+    fcmp.ceq.d $fcc0, $f15, $f9 // does the reduced maximum equal lane 0?
+    bceqz $fcc0, .L26
+    XVCMPLT VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0 // on a tie keep the smaller index
+    .align 3
+// .L26-.L28: for each remaining lane value equal to the maximum, keep the smaller of its index and VI0.
+.L26:
+    fcmp.ceq.d $fcc0, $f15, $f10
+    bceqz $fcc0, .L27
+    XVCMPLT VT0, VI2, VI0
+    xvbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L27:
+    fcmp.ceq.d $fcc0, $f15, $f11
+    bceqz $fcc0, .L28
+    XVCMPLT VT0, VI3, VI0
+    xvbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28:
+    fcmp.ceq.d $fcc0, $f15, $f12
+    bceqz $fcc0, .L29
+    XVCMPLT VT0, VI4, VI0
+    xvbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+#ifdef DOUBLE
+    movfr2gr.d i0, $f20 // i0 = winning index from the vector part
+#else
+    fmov.s $f16, $f20 // park the lanes 0-3 index while lanes 4-7 are reduced
+#endif
+    .align 3
+
+#ifndef DOUBLE
+.L252:
+    xvxor.v VI0, VI0, VI0
+    xvor.v VI0, VI0, VX0 // restore the full index vector saved before the first reduction
+    fmov.s $f13, $f15 // park the lanes 0-3 maximum
+    xvxor.v VM0, VM0, VM0
+    xvor.v VM0, VM0, VX1 // restore the full maxima vector
+    xvpickve.w VI1, VI0, 4
+    xvpickve.w VI2, VI0, 5
+    xvpickve.w VI3, VI0, 6
+    xvpickve.w VI4, VI0, 7
+    xvpickve.w x1, VM0, 4
+    xvpickve.w x2, VM0, 5
+    xvpickve.w x3, VM0, 6
+    xvpickve.w x4, VM0, 7
+    xvfcmp.clt.s VT0, x1, x2
+    xvbitsel.v x1, x1, x2, VT0
+    xvbitsel.v VINC4, VI1, VI2, VT0
+    xvfcmp.clt.s VT0, x3, x4
+    xvbitsel.v VM0, x3, x4, VT0
+    xvbitsel.v VINC8, VI3, VI4, VT0
+    xvfcmp.clt.s VT0, VM0, x1
+    xvbitsel.v VM0, VM0, x1, VT0
+    xvbitsel.v VI0, VINC8, VINC4, VT0
+    li.d TEMP, 1 // when values tie, take the smallest index
+    movgr2fr.w $f17, TEMP
+    ffint.s.w $f17, $f17 // $f17 = 1.0f
+    xvfcmp.ceq.s VT0, VM0, x1
+    fcmp.ceq.s $fcc0, $f23, $f17 // NOTE(review): compares a lane mask (all-ones or zero) with 1.0f, so this looks never-equal - confirm
+    bceqz $fcc0, .L262
+    xvfcmp.clt.s VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    .align 3
+
+.L262:
+    xvfcmp.ceq.s VT0, VM0, x2
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L272
+    xvfcmp.clt.s VT0, VI2, VI0
+    xvbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L272:
+    xvfcmp.ceq.s VT0, VM0, x3
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L282
+    xvfcmp.clt.s VT0, VI3, VI0
+    xvbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L282:
+    xvfcmp.ceq.s VT0, VM0, x4
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L292
+    xvfcmp.clt.s VT0, VI4, VI0
+    xvbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L292:
+    fcmp.clt.s $fcc0, $f15, $f13 // lanes 4-7 maximum strictly below the lanes 0-3 maximum?
+    fsel $f15, $f15, $f13, $fcc0 // if so take the lanes 0-3 value
+    fsel $f20, $f20, $f16, $fcc0 // and its index
+    movfr2gr.s i0, $f20
+#endif
+// Scalar tail setup for the N % 8 leftover elements; $f15 holds the best value, $f20 its index.
+.L21: //N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+// Scalar tail loop: one element per pass, replacing the best value only when strictly larger.
+.L22:
+    LD $f9, X, 0 // precision-sized load (fld.s/fld.d) so the last float element is not over-read
+    addi.d I, I, -1
+    CMPLT $fcc0, $f15, $f9
+    add.d  X, X, INCX
+    fsel $f15, $f15, $f9, $fcc0
+    fsel $f20, $f20, $f21, $fcc0
+    addi.d i1, i1, 1
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22
+    MTG i0, $f20
+    .align 3
+// Return i0 in $r4.
+.L999:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/imax_lsx.S b/kernel/loongarch64/imax_lsx.S
new file mode 100644
index 000000000..92556d4e6
--- /dev/null
+++ b/kernel/loongarch64/imax_lsx.S
@@ -0,0 +1,428 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r13
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17
+#define i1 $r14
+#define TEMP $r19
+#define x1 $vr9
+#define x2 $vr10
+#define x3 $vr11
+#define x4 $vr12
+#define VX0 $vr13
+#define VX1 $vr14
+#define VM0 $vr15
+#define VM1 $vr16
+#define VI0 $vr20
+#define VI1 $vr21
+#define VI2 $vr22
+#define VI3 $vr8
+#define VI4 $vr19
+#define VT0 $vr23
+// LSX kernel: returns in $r4 the 1-based index of the largest element of X, or 0 when N <= 0 or INCX <= 0.
+    PROLOGUE
+    li.d  i0, 0 // default result, returned as-is by the early exits below
+    bge $r0, N, .L999
+    bge $r0, INCX, .L999
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT // TEMP = 1 << BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT // INCX scaled the same way so X can be advanced with add.d
+    bne INCX, TEMP, .L20 // any other stride uses the element-by-element gather path
+    vld VM0, X, 0 // running per-lane maxima start as the first vector of X
+#ifdef DOUBLE
+    addi.d  i0, i0, 1
+    srai.d   I,  N, 3 // I = number of full 8-element passes
+    bge $r0, I, .L21
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d $vr17, i0 // $vr17 = index step between consecutive vectors
+    slli.d i0, i0, 1 //4
+    vreplgr2vr.d $vr18, i0 // $vr18 = step added to VI1 at the top of each pass
+    addi.d i0, i0, -7
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 3
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+#else
+    addi.w  i0, i0, 1
+    srai.d   I,  N, 3 // I = number of full 8-element passes
+    bge $r0, I, .L21
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w $vr17, i0 // $vr17 = index step between the two vectors of a pass
+    slli.w i0, i0, 1 //8
+    vreplgr2vr.w $vr18, i0 // $vr18 = index step per pass
+    addi.w i0, i0, -15
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 5
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+#endif
+    .align 3
+// Unit-stride loop: each pass reads 8 elements and keeps per-lane maxima in VM0 with their indices in VI0.
+.L10:
+    vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    vadd.d VI1, VI1, $vr18
+    vld VX1, X, 2 * SIZE
+    vadd.d VI2, VI1, $vr17
+    VCMPLT VT0, VX0, VX1
+    vbitsel.v x1, VX0, VX1, VT0 // larger of elements 0-1 vs 2-3
+    vbitsel.v x2, VI1, VI2, VT0 // and the matching indices
+    vld VX0, X, 4 * SIZE
+    vadd.d VI1, VI2, $vr17
+    vld VX1, X, 6 * SIZE
+    vadd.d VI2, VI1, $vr17
+    VCMPLT VT0, VX0, VX1
+    addi.d I, I, -1
+    vbitsel.v x3, VX0, VX1, VT0 // larger of elements 4-5 vs 6-7
+    vbitsel.v x4, VI1, VI2, VT0
+    VCMPLT VT0, x1, x3
+    vbitsel.v x1, x1, x3, VT0
+    vbitsel.v x2, x2, x4, VT0
+    VCMPLT VT0, VM0, x1
+    addi.d X, X, 8 * SIZE
+    vbitsel.v VM0, VM0, x1, VT0 // replace only where strictly larger
+    vbitsel.v VI0, VI0, x2, VT0
+#else
+    vadd.w VI1, VI1, $vr18
+    vld VX1, X, 4 * SIZE
+    vadd.w VI2, VI1, $vr17
+    VCMPLT VT0, VX0, VX1
+    addi.d I, I, -1
+    vbitsel.v VM1, VX0, VX1, VT0 // lane-wise larger of the two vectors
+    vbitsel.v VI2, VI1, VI2, VT0 // and the matching indices
+    VCMPLT VT0, VM0, VM1
+    addi.d X, X, 8 * SIZE
+    vbitsel.v VM0, VM0, VM1, VT0 // replace only where strictly larger
+    vbitsel.v VI0, VI0, VI2, VT0
+#endif
+    blt $r0, I, .L10
+    .align 3
+// Reduce the lanes of VM0/VI0 after the unit-stride loop, then join the shared tie handling.
+.L15:
+#ifdef DOUBLE
+    vreplvei.d VI1, VI0, 0
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d  x1, VM0, 0
+    vreplvei.d  x2, VM0, 1
+    fcmp.ceq.d $fcc0, $f10, $f9 // are the two lane maxima equal?
+    bceqz $fcc0, .L26
+    VCMPLT VT0, VI1, VI2
+    vbitsel.v VI0, VI2, VI1, VT0 // on a tie keep the smaller index
+    b .L27
+#else
+    vreplvei.w VI1, VI0, 0
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w  x1, VM0, 0
+    vreplvei.w  x2, VM0, 1
+    vreplvei.w  x3, VM0, 2
+    vreplvei.w  x4, VM0, 3
+    VCMPLT VT0, x1, x2
+    vbitsel.v VM1, x1, x2, VT0
+    vbitsel.v $vr17, VI1, VI2, VT0 // $vr17/$vr18 are reused as index scratch from here on
+    VCMPLT VT0, x3, x4
+    vbitsel.v VM0, x3, x4, VT0
+    vbitsel.v $vr18, VI3, VI4, VT0
+    VCMPLT VT0, VM0, VM1
+    vbitsel.v VM0, VM0, VM1, VT0
+    vbitsel.v VI0, $vr18, $vr17, VT0
+    fcmp.ceq.d $fcc0, $f15, $f9 // does the reduced maximum equal lane 0?
+    bceqz $fcc0, .L26
+    VCMPLT VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0 // on a tie keep the smaller index
+    b .L26
+#endif
+    .align 3
+// Strided setup: gather the first elements into VM0 and build the same index vectors as the unit-stride path.
+.L20: // INCX!=1
+    move TEMP, X
+#ifdef DOUBLE
+    addi.d  i0,   i0, 1
+    ld.d    t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d  VM0, t1, 0 // lane 0 ($f15) is set before the N < 8 exit, for the scalar tail
+    srai.d         I,  N, 3
+    bge $r0, I, .L21
+    ld.d t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d VM0, t2, 1
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d $vr17, i0
+    slli.d i0, i0, 1 //4
+    vreplgr2vr.d $vr18, i0
+    addi.d i0, i0, -7
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 3
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+#else
+    addi.w  i0,   i0, 1
+    ld.w    t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w  VM0, t1, 0 // lane 0 ($f15) is set before the N < 8 exit, for the scalar tail
+    srai.d I, N, 3
+    bge $r0, I, .L21
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t2, 1
+    vinsgr2vr.w VM0, t3, 2
+    vinsgr2vr.w VM0, t4, 3
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w $vr17, i0
+    slli.w i0, i0, 1 //8
+    vreplgr2vr.w $vr18, i0
+    addi.w i0, i0, -15
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 5
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+#endif
+    .align 3
+// Strided loop: gather 8 elements through X/INCX, then update VM0/VI0 exactly as the unit-stride loop does.
+.L24:
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vadd.d VI1, VI1, $vr18
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    vadd.d VI2, VI1, $vr17
+    VCMPLT VT0, VX0, VX1
+    vbitsel.v x1, VX0, VX1, VT0
+    vbitsel.v x2, VI1, VI2, VT0
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vadd.d VI1, VI2, $vr17
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    vadd.d VI2, VI1, $vr17
+    VCMPLT VT0, VX0, VX1
+    vbitsel.v x3, VX0, VX1, VT0
+    vbitsel.v x4, VI1, VI2, VT0
+    VCMPLT VT0, x1, x3
+    vbitsel.v x1, x1, x3, VT0
+    vbitsel.v x2, x2, x4, VT0
+    VCMPLT VT0, VM0, x1
+    addi.d I, I, -1
+    vbitsel.v VM0, VM0, x1, VT0
+    vbitsel.v VI0, VI0, x2, VT0
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    vadd.w VI1, VI1, $vr18
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vadd.w VI2, VI1, $vr17
+    VCMPLT VT0, VX0, VX1
+    addi.d I, I, -1
+    vbitsel.v VM1, VX0, VX1, VT0
+    vbitsel.v VI2, VI1, VI2, VT0
+    VCMPLT VT0, VM0, VM1
+    vbitsel.v VM0, VM0, VM1, VT0
+    vbitsel.v VI0, VI0, VI2, VT0
+#endif
+    blt $r0, I, .L24
+    .align 3
+// Reduce the lanes of VM0/VI0 after the strided loop; the float path falls through into .L26.
+.L25:
+#ifdef DOUBLE
+    vreplvei.d VI1, VI0, 0
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d  x1, VM0, 0
+    vreplvei.d  x2, VM0, 1
+    fcmp.ceq.d $fcc0, $f10, $f9 // are the two lane maxima equal?
+    bceqz $fcc0, .L26
+    VCMPLT VT0, VI1, VI2
+    vbitsel.v VI0, VI2, VI1, VT0 // on a tie keep the smaller index
+    b .L27
+#else
+    vreplvei.w VI1, VI0, 0
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w  x1, VM0, 0
+    vreplvei.w  x2, VM0, 1
+    vreplvei.w  x3, VM0, 2
+    vreplvei.w  x4, VM0, 3
+    vfcmp.clt.s VT0,  x1, x2
+    vbitsel.v   VM1,  x1, x2, VT0
+    vbitsel.v $vr17, VI1, VI2, VT0
+    vfcmp.clt.s VT0,  x3, x4
+    vbitsel.v   VM0,  x3, x4, VT0
+    vbitsel.v $vr18, VI3, VI4, VT0
+    vfcmp.clt.s VT0, VM0, VM1
+    vbitsel.v VM0,   VM0, VM1, VT0
+    vbitsel.v VI0, $vr18, $vr17, VT0
+    fcmp.ceq.d $fcc0, $f15, $f9 // does the reduced maximum equal lane 0?
+    bceqz $fcc0, .L26
+    vfcmp.clt.s VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0 // on a tie keep the smaller index
+#endif
+    .align 3
+// .L26: DOUBLE picks the larger lane and its index; float continues the tie handling for lane 1.
+.L26:
+#ifdef DOUBLE
+    VCMPLT    VT0,  x1, x2
+    vbitsel.v VM0,  x1, x2, VT0
+    vbitsel.v VI0, VI1, VI2, VT0
+#else
+    fcmp.ceq.d $fcc0, $f15, $f10
+    bceqz $fcc0, .L27
+    VCMPLT VT0, VI2, VI0
+    vbitsel.v VI0, VI0, VI2, VT0
+#endif
+    .align 3
+
+.L27:
+#ifndef DOUBLE
+    fcmp.ceq.d $fcc0, $f15, $f11
+    bceqz $fcc0, .L28
+    VCMPLT VT0, VI3, VI0
+    vbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28:
+    fcmp.ceq.d $fcc0, $f15, $f12
+    bceqz $fcc0, .L29
+    VCMPLT VT0, VI4, VI0
+    vbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+#endif
+    MTG i0, $f20 // i0 = winning index from the vector part
+    .align 3
+// Scalar tail setup for the N % 8 leftover elements; $f15 holds the best value, $f20 its index.
+.L21: //N<8
+    andi I, N, 7
+    bge $r0, I, .L999
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1
+    movgr2fr.d $f20, i0
+    .align 3
+// Scalar tail loop: one element per pass, replacing the best value only when strictly larger.
+.L22:
+    LD $f9, X, 0 // precision-sized load (fld.s/fld.d) so the last float element is not over-read
+    addi.d I, I, -1
+    CMPLT $fcc0, $f15, $f9
+    add.d  X, X, INCX
+    fsel $f15, $f15, $f9, $fcc0
+    fsel $f20, $f20, $f21, $fcc0
+    addi.d i1, i1, 1
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22
+    MTG i0, $f20
+    .align 3
+// Return i0 in $r4.
+.L999:
+    move $r4, $r17
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/imin_lasx.S b/kernel/loongarch64/imin_lasx.S
new file mode 100644
index 000000000..5306828e2
--- /dev/null
+++ b/kernel/loongarch64/imin_lasx.S
@@ -0,0 +1,534 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r13
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17
+#define i1 $r14
+#define TEMP $r19
+#define x1 $xr9
+#define x2 $xr10
+#define x3 $xr11
+#define x4 $xr12
+#define VX0 $xr13
+#define VX1 $xr14
+#define VM0 $xr15
+#define VM1 $xr16
+#define VINC4 $xr17
+#define VINC8 $xr18
+#define VI0 $xr20
+#define VI1 $xr21
+#define VI2 $xr22
+#define VI3 $xr8
+#define VI4 $xr19
+#define VT0 $xr23
+
+    PROLOGUE
+    li.d  i0, 0 // i0 carries the result index; 0 is returned on the early exits below
+    bge $r0, N, .L999 // N <= 0: nothing to scan
+    bge $r0, INCX, .L999 // INCX <= 0: nothing to scan
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT // TEMP = 1 << BASE_SHIFT (presumably one element in bytes)
+    slli.d INCX, INCX, BASE_SHIFT // scale INCX the same way so it can be added to X directly
+    bne INCX, TEMP, .L20 // non-unit stride: use the gather path
+    xvld VM0, X, 0 // seed the running minimum from the start of X
+#ifdef DOUBLE
+    addi.d i0, i0, 1 // i0 = 1 (indices are 1-based)
+    srai.d I, N, 3 // I = N / 8 vector iterations
+    bge $r0, I, .L21 // fewer than 8 elements: scalar loop only
+    slli.d i0, i0, 2 //4
+    xvreplgr2vr.d VINC4, i0
+    slli.d i0, i0, 1 //8
+    xvreplgr2vr.d VINC8, i0
+    addi.d i0, i0, -15 // i0 = -7, so the first "+8" in the loop yields index 1
+    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 3
+    addi.d i0, i0, 5 // back to 1: VI0 holds the indices matching the seed in VM0
+    xvinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 1 //2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 2 //3
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 3 //4
+#else
+    addi.w i0, i0, 1 // i0 = 1 (indices are 1-based)
+    srai.d I, N, 3 // I = N / 8 vector iterations
+    bge $r0, I, .L21 // fewer than 8 elements: scalar loop only
+    slli.w i0, i0, 3 //8
+    xvreplgr2vr.w VINC8, i0
+    addi.w i0, i0, -15 // i0 = -7, so the first "+8" in the loop yields index 1
+    xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 7
+    addi.w i0, i0, 1 // i0 = 1: VI0 holds the indices matching the seed in VM0
+    xvinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 3 //4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 4 //5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 5 //6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 6 //7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 7 //8
+#endif
+    .align 3
+
+.L10: // unit-stride main loop: 8 elements per iteration
+    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    xvadd.d VI1, VI1, VINC8 // VI1 = indices of the elements in VX0
+    xvld VX1, X, 4 * SIZE
+    xvadd.d VI2, VI1, VINC4 // VI2 = indices of the elements in VX1
+    XVCMPLT VT0, VX1, VX0 // presumably a per-lane "VX1 < VX0" mask (macro defined outside this file)
+    addi.d I, I, -1
+    xvbitsel.v VM1, VX0, VX1, VT0 // VM1 = VX1 where the mask is set, else VX0
+    xvbitsel.v VI2, VI1, VI2, VT0 // matching indices
+    XVCMPLT VT0, VM1, VM0 // strict compare: on a tie the value already in VM0 is kept
+    addi.d X, X, 8 * SIZE
+    xvbitsel.v VM0, VM0, VM1, VT0 // update running per-lane minimum
+    xvbitsel.v VI0, VI0, VI2, VT0 // and its index
+#else
+    xvadd.w VI1, VI1, VINC8 // VI1 = indices of the elements in VX0
+    XVCMPLT VT0, VX0, VM0 // strict compare: on a tie the value already in VM0 is kept
+    addi.d I, I, -1
+    xvbitsel.v VM0, VM0, VX0, VT0 // update running per-lane minimum
+    xvbitsel.v VI0, VI0, VI1, VT0 // and its index
+    addi.d X, X, 8 * SIZE
+#endif
+    blt $r0, I, .L10 // loop while I > 0
+    .align 3
+
+.L15: // unit-stride path: reduce lanes 0..3 of VM0/VI0 to one candidate
+#ifdef DOUBLE
+    xvpickve.d VI1, VI0, 0
+    xvpickve.d VI2, VI0, 1
+    xvpickve.d VI3, VI0, 2
+    xvpickve.d VI4, VI0, 3
+    xvpickve.d  x1, VM0, 0
+    xvpickve.d  x2, VM0, 1
+    xvpickve.d  x3, VM0, 2
+    xvpickve.d  x4, VM0, 3
+#else
+    xvxor.v VX0, VX0, VX0
+    xvor.v VX0, VI0, VX0 // VX0 = copy of VI0, read back at .L252 for lanes 4..7
+    xvxor.v VX1, VX1, VX1
+    xvor.v VX1, VM0, VX1 // VX1 = copy of VM0, read back at .L252 for lanes 4..7
+    xvpickve.w VI1, VI0, 0
+    xvpickve.w VI2, VI0, 1
+    xvpickve.w VI3, VI0, 2
+    xvpickve.w VI4, VI0, 3
+    xvpickve.w  x1, VM0, 0
+    xvpickve.w  x2, VM0, 1
+    xvpickve.w  x3, VM0, 2
+    xvpickve.w  x4, VM0, 3
+#endif
+    XVCMPLT VT0, x2, x1
+    xvbitsel.v VM1, x1, x2, VT0 // VM1 = smaller of x1, x2
+    xvbitsel.v VINC4, VI1, VI2, VT0 // VINC4 is reused as scratch for its index
+    XVCMPLT VT0, x4, x3
+    xvbitsel.v VM0, x3, x4, VT0 // VM0 = smaller of x3, x4
+    xvbitsel.v VINC8, VI3, VI4, VT0 // VINC8 is reused as scratch for its index
+    XVCMPLT VT0, VM1, VM0
+    xvbitsel.v VM0, VM0, VM1, VT0 // VM0 = overall minimum of the four
+    xvbitsel.v VI0, VINC8, VINC4, VT0 // VI0 = its index
+    fcmp.ceq.d $fcc0, $f15, $f9 // $f15/$f9 are the numbered FP regs behind VM0 ($xr15) and x1 ($xr9)
+    bceqz $fcc0, .L26 // minimum != x1: skip the tie-break for VI1
+    XVCMPLT VT0, VI1, VI0 // indices are compared with the floating-point compare macro
+    xvbitsel.v VI0, VI0, VI1, VT0 // keep the smaller index
+    b .L26
+    .align 3
+
+.L20: // INCX!=1
+    // Strided setup: gather the first elements through TEMP (X itself is left
+    // untouched) to seed VM0, then build the index vectors exactly as the
+    // unit-stride path does. VM0 lane 0 must be valid before the N<8 branch,
+    // because the scalar loop at .L22 compares against $f15 (the low part of VM0).
+    move TEMP, X
+#ifdef DOUBLE
+    addi.d i0, i0, 1 // i0 = 1 (indices are 1-based)
+    ld.d t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.d VM0, t1, 0 // seed lane 0 before the short-vector branch
+    srai.d I, N, 3 // I = N / 8 vector iterations
+    bge $r0, I, .L21 // fewer than 8 elements: scalar loop only
+    ld.d t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.d t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.d t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.d VM0, t1, 0
+    xvinsgr2vr.d VM0, t2, 1
+    xvinsgr2vr.d VM0, t3, 2
+    xvinsgr2vr.d VM0, t4, 3
+    slli.d i0, i0, 2 //4
+    xvreplgr2vr.d VINC4, i0
+    slli.d i0, i0, 1 //8
+    xvreplgr2vr.d VINC8, i0
+    addi.d i0, i0, -15 // i0 = -7, so the first "+8" in the loop yields index 1
+    xvinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI1, i0, 3
+    addi.d i0, i0, 5 // back to 1: VI0 holds the indices matching the seed in VM0
+    xvinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 1 //2
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 2 //3
+    addi.d i0, i0, 1
+    xvinsgr2vr.d VI0, i0, 3 //4
+#else
+    addi.w i0, i0, 1 // i0 = 1 (indices are 1-based)
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.w VM0, t1, 0 // seed lane 0 before the short-vector branch (was missing for N < 8)
+    srai.d I, N, 3 // I = N / 8 vector iterations
+    bge $r0, I, .L21 // fewer than 8 elements: scalar loop only
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.w VM0, t2, 1
+    xvinsgr2vr.w VM0, t3, 2
+    xvinsgr2vr.w VM0, t4, 3
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    xvinsgr2vr.w VM0, t1, 4
+    xvinsgr2vr.w VM0, t2, 5
+    xvinsgr2vr.w VM0, t3, 6
+    xvinsgr2vr.w VM0, t4, 7
+    slli.w i0, i0, 3 //8
+    xvreplgr2vr.w VINC8, i0
+    addi.w i0, i0, -15 // i0 = -7, so the first "+8" in the loop yields index 1
+    xvinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI1, i0, 7
+    addi.w i0, i0, 1 // i0 = 1: VI0 holds the indices matching the seed in VM0
+    xvinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 3 //4
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 4 //5
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 5 //6
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 6 //7
+    addi.w i0, i0, 1
+    xvinsgr2vr.w VI0, i0, 7 //8
+#endif
+    .align 3
+
+.L24: // strided main loop: gather 8 elements per iteration, track per-lane minimum and index
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    // VI1 is advanced once per iteration (below); a second +8 here skewed every index by 8
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvadd.d VI1, VI1, VINC8 // VI1 = indices of the elements in VX0
+    xvadd.d VI2, VI1, VINC4 // VI2 = indices of the elements in VX1
+    XVCMPLT VT0, VX1, VX0
+    addi.d I, I, -1
+    xvbitsel.v VM1, VX0, VX1, VT0 // VM1 = VX1 where the mask is set, else VX0
+    xvbitsel.v VI2, VI1, VI2, VT0 // matching indices
+    XVCMPLT VT0, VM1, VM0 // strict compare: on a tie the value already in VM0 is kept
+    xvbitsel.v VM0, VM0, VM1, VT0 // update running per-lane minimum
+    xvbitsel.v VI0, VI0, VI2, VT0 // and its index
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    xvadd.w VI1, VI1, VINC8 // VI1 = indices of the elements in VX0
+    XVCMPLT VT0, VX0, VM0 // strict compare: on a tie the value already in VM0 is kept
+    addi.d I, I, -1
+    xvbitsel.v VM0, VM0, VX0, VT0 // update running per-lane minimum
+    xvbitsel.v VI0, VI0, VI1, VT0 // and its index
+#endif
+    blt $r0, I, .L24 // loop while I > 0
+    .align 3
+
+.L25: // strided path: reduce lanes 0..3 of VM0/VI0 to one candidate, then fall into .L26
+#ifdef DOUBLE
+    xvpickve.d VI1, VI0, 0
+    xvpickve.d VI2, VI0, 1
+    xvpickve.d VI3, VI0, 2
+    xvpickve.d VI4, VI0, 3
+    xvpickve.d x1,  VM0, 0
+    xvpickve.d x2,  VM0, 1
+    xvpickve.d x3,  VM0, 2
+    xvpickve.d x4,  VM0, 3
+#else
+    xvxor.v VX0, VX0, VX0
+    xvor.v  VX0, VI0, VX0 // VX0 = copy of VI0, read back at .L252 for lanes 4..7
+    xvxor.v VX1, VX1, VX1
+    xvor.v  VX1, VM0, VX1 // VX1 = copy of VM0, read back at .L252 for lanes 4..7
+    xvpickve.w VI1, VI0, 0
+    xvpickve.w VI2, VI0, 1
+    xvpickve.w VI3, VI0, 2
+    xvpickve.w VI4, VI0, 3
+    xvpickve.w  x1, VM0, 0
+    xvpickve.w  x2, VM0, 1
+    xvpickve.w  x3, VM0, 2
+    xvpickve.w  x4, VM0, 3
+#endif
+    XVCMPLT    VT0, x2, x1
+    xvbitsel.v VM1, x1, x2, VT0 // VM1 = smaller of x1, x2
+    xvbitsel.v VINC4, VI1, VI2, VT0 // VINC4 is reused as scratch for its index
+    XVCMPLT    VT0, x4, x3
+    xvbitsel.v VM0, x3, x4, VT0 // VM0 = smaller of x3, x4
+    xvbitsel.v VINC8, VI3, VI4, VT0 // VINC8 is reused as scratch for its index
+    XVCMPLT VT0, VM1, VM0
+    xvbitsel.v VM0, VM0, VM1, VT0 // VM0 = overall minimum of the four
+    xvbitsel.v VI0, VINC8, VINC4, VT0 // VI0 = its index
+    fcmp.ceq.d $fcc0, $f15, $f9 // $f15/$f9 are the numbered FP regs behind VM0 ($xr15) and x1 ($xr9)
+    bceqz $fcc0, .L26 // minimum != x1: skip the tie-break for VI1
+    XVCMPLT VT0, VI1, VI0 // indices are compared with the floating-point compare macro
+    xvbitsel.v VI0, VI0, VI1, VT0 // keep the smaller index
+    .align 3
+
+.L26: // tie-break: if the minimum equals x2, keep the smaller of VI0 and VI2
+    fcmp.ceq.d $fcc0, $f15, $f10 // $f10 is the numbered FP reg behind x2 ($xr10)
+    bceqz $fcc0, .L27
+    XVCMPLT VT0, VI2, VI0
+    xvbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L27: // same tie-break against x3 / VI3
+    fcmp.ceq.d $fcc0, $f15, $f11
+    bceqz $fcc0, .L28
+    XVCMPLT VT0, VI3, VI0
+    xvbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28: // same tie-break against x4 / VI4
+    fcmp.ceq.d $fcc0, $f15, $f12
+    bceqz $fcc0, .L29
+    XVCMPLT VT0, VI4, VI0
+    xvbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+#ifdef DOUBLE
+    MTG i0, $f20 // move the winning index ($f20 backs VI0) into i0
+#else
+    fmov.s $f16, $f20 // park the lanes-0..3 index in $f16; .L292 selects between it and the lanes-4..7 index
+#endif
+    .align 3
+
+#ifndef DOUBLE
+.L252: // single precision only: reduce lanes 4..7, then merge with the lanes-0..3 result
+    xvxor.v VI0, VI0, VI0
+    xvor.v VI0, VI0, VX0 // VI0 = per-lane indices saved in VX0 before the first reduction
+    fmov.s $f13, $f15 // keep the lanes-0..3 minimum in $f13
+    xvxor.v VM0, VM0, VM0
+    xvor.v VM0, VM0, VX1 // VM0 = per-lane minima saved in VX1 before the first reduction
+    xvpickve.w VI1, VI0, 4
+    xvpickve.w VI2, VI0, 5
+    xvpickve.w VI3, VI0, 6
+    xvpickve.w VI4, VI0, 7
+    xvpickve.w x1, VM0, 4
+    xvpickve.w x2, VM0, 5
+    xvpickve.w x3, VM0, 6
+    xvpickve.w x4, VM0, 7
+    XVCMPLT VT0, x2, x1
+    xvbitsel.v x1, x1, x2, VT0 // x1 is overwritten here (VM1 is $xr16, whose $f16 holds the parked index)
+    xvbitsel.v VINC4, VI1, VI2, VT0
+    XVCMPLT VT0, x4, x3
+    xvbitsel.v VM0, x3, x4, VT0
+    xvbitsel.v VINC8, VI3, VI4, VT0
+    XVCMPLT VT0, x1, VM0
+    xvbitsel.v VM0, VM0, x1, VT0 // VM0 = minimum of lanes 4..7
+    xvbitsel.v VI0, VINC8, VINC4, VT0 // VI0 = its index
+    li.d TEMP, 1 // when values tie, take the smallest index
+    movgr2fr.w $f17, TEMP
+    ffint.s.w $f17, $f17 // $f17 = 1.0f
+    xvfcmp.ceq.s VT0, VM0, x1
+    fcmp.ceq.s $fcc0, $f23, $f17 // NOTE(review): $f23 backs VT0, a compare mask; it looks like it can never equal 1.0f, making this guard (and the three below) always false - confirm
+    bceqz $fcc0, .L262
+    XVCMPLT VT0, VI1, VI0
+    xvbitsel.v VI0, VI0, VI1, VT0
+    .align 3
+
+.L262:
+    xvfcmp.ceq.s VT0, VM0, x2
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L272
+    XVCMPLT VT0, VI2, VI0
+    xvbitsel.v VI0, VI0, VI2, VT0
+    .align 3
+
+.L272:
+    xvfcmp.ceq.s VT0, VM0, x3
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L282
+    XVCMPLT VT0, VI3, VI0
+    xvbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L282:
+    xvfcmp.ceq.s VT0, VM0, x4
+    fcmp.ceq.s $fcc0, $f23, $f17
+    bceqz $fcc0, .L292
+    XVCMPLT VT0, VI4, VI0
+    xvbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L292:
+    CMPLT $fcc0, $f13, $f15 // is the lanes-0..3 minimum strictly smaller than the lanes-4..7 minimum?
+    fsel $f15, $f15, $f13, $fcc0 // if so take $f13, else keep $f15
+    fsel $f20, $f20, $f16, $fcc0 // and the matching index
+    MTG i0, $f20 // move the winning index into i0
+#endif
+
+.L21: //N<8
+    andi I, N, 7 // I = N % 8 elements left for the scalar loop
+    bge $r0, I, .L999 // none left: return i0
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1 // $f21 = index of the element about to be examined
+    movgr2fr.d $f20, i0 // $f20 = index of the best element so far
+    .align 3
+
+.L22: // scalar tail: $f15 = running minimum, $f20 = its index
+    LD $f9, X, 0 // element-width load (common.h macro); a fixed fld.d read 8 bytes even for 4-byte floats
+    addi.d I, I, -1
+    CMPLT $fcc0, $f9, $f15 // strict compare, so an earlier element wins ties
+    add.d  X, X, INCX
+    fsel $f15, $f15, $f9, $fcc0 // take the new element when it is smaller
+    fsel $f20, $f20, $f21, $fcc0 // and its index
+    addi.d i1, i1, 1
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22 // loop while I > 0
+    MTG i0, $f20 // move the winning index into i0
+    .align 3
+
+.L999: // common exit
+    move $r4, $r17 // return value = i0 ($r17)
+    jirl $r0, $r1, 0x0 // jump to the address in $r1 (return)
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/imin_lsx.S b/kernel/loongarch64/imin_lsx.S
new file mode 100644
index 000000000..a0c411e7a
--- /dev/null
+++ b/kernel/loongarch64/imin_lsx.S
@@ -0,0 +1,428 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N $r4
+#define X $r5
+#define INCX $r6
+#define I $r12
+#define t1 $r13
+#define t2 $r15
+#define t3 $r18
+#define t4 $r16
+#define i0 $r17
+#define i1 $r14
+#define TEMP $r19
+#define x1 $vr9
+#define x2 $vr10
+#define x3 $vr11
+#define x4 $vr12
+#define VX0 $vr13
+#define VX1 $vr14
+#define VM0 $vr15
+#define VM1 $vr16
+#define VI0 $vr20
+#define VI1 $vr21
+#define VI2 $vr22
+#define VI3 $vr8
+#define VI4 $vr19
+#define VT0 $vr23
+
+    PROLOGUE
+    li.d  i0, 0 // i0 carries the result index; 0 is returned on the early exits below
+    bge $r0, N, .L999 // N <= 0: nothing to scan
+    bge $r0, INCX, .L999 // INCX <= 0: nothing to scan
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT // TEMP = 1 << BASE_SHIFT (presumably one element in bytes)
+    slli.d INCX, INCX, BASE_SHIFT // scale INCX the same way so it can be added to X directly
+    bne INCX, TEMP, .L20 // non-unit stride: use the gather path
+    vld VM0, X, 0 // seed the running minimum from the start of X
+#ifdef DOUBLE
+    addi.d i0, i0, 1 // i0 = 1 (indices are 1-based)
+    srai.d  I, N, 3 // I = N / 8 vector iterations
+    bge   $r0, I, .L21 // fewer than 8 elements: scalar loop only
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d $vr17, i0 // $vr17 = step between consecutive 2-element vectors
+    slli.d i0, i0, 1 //4
+    vreplgr2vr.d $vr18, i0 // $vr18 = step applied at the top of each iteration
+    addi.d i0, i0, -7 // i0 = -3, so the first "+4" in the loop yields index 1
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 3 // back to 1: VI0 holds the indices matching the seed in VM0
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+#else
+    addi.w i0, i0, 1 // i0 = 1 (indices are 1-based)
+    srai.d  I, N, 3 // I = N / 8 vector iterations
+    bge   $r0, I, .L21 // fewer than 8 elements: scalar loop only
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w $vr17, i0 // $vr17 = step between the two 4-element vectors
+    slli.w i0, i0, 1 //8
+    vreplgr2vr.w $vr18, i0 // $vr18 = step per iteration
+    addi.w i0, i0, -15 // i0 = -7, so the first "+8" in the loop yields index 1
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 5 // back to 1: VI0 holds the indices matching the seed in VM0
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+#endif
+    .align 3
+
+.L10: // unit-stride main loop: 8 elements per iteration
+    vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    vadd.d VI1, VI1, $vr18 // VI1 = indices of elements 0..1 of this iteration
+    vld VX1, X, 2 * SIZE
+    vadd.d VI2, VI1, $vr17 // VI2 = indices of elements 2..3
+    VCMPLT VT0, VX1, VX0 // presumably a per-lane "VX1 < VX0" mask (macro defined outside this file)
+    vbitsel.v x1, VX0, VX1, VT0 // x1 = VX1 where the mask is set, else VX0
+    vbitsel.v x2, VI1, VI2, VT0 // x2 = matching indices
+    vld VX0, X, 4 * SIZE
+    vadd.d VI1, VI2, $vr17 // VI1 = indices of elements 4..5
+    vld VX1, X, 6 * SIZE
+    vadd.d VI2, VI1, $vr17 // VI2 = indices of elements 6..7
+    VCMPLT VT0, VX1, VX0
+    addi.d I, I, -1
+    vbitsel.v x3, VX0, VX1, VT0
+    vbitsel.v x4, VI1, VI2, VT0
+    VCMPLT VT0, x3, x1
+    addi.d X, X, 8 * SIZE
+    vbitsel.v x1, x1, x3, VT0 // x1 = per-lane minimum of this iteration
+    vbitsel.v x2, x2, x4, VT0 // x2 = its indices
+    VCMPLT VT0, x1, VM0 // strict compare: on a tie the value already in VM0 is kept
+    vbitsel.v VM0, VM0, x1, VT0 // update running per-lane minimum
+    vbitsel.v VI0, VI0, x2, VT0 // and its index
+#else
+    vadd.w VI1, VI1, $vr18 // VI1 = indices of the elements in VX0
+    vld VX1, X, 4 * SIZE
+    vadd.w VI2, VI1, $vr17 // VI2 = indices of the elements in VX1
+    VCMPLT VT0, VX1, VX0 // presumably a per-lane "VX1 < VX0" mask (macro defined outside this file)
+    addi.d I, I, -1
+    vbitsel.v VM1, VX0, VX1, VT0 // VM1 = VX1 where the mask is set, else VX0
+    vbitsel.v VI2, VI1, VI2, VT0 // matching indices
+    VCMPLT VT0, VM1, VM0 // strict compare: on a tie the value already in VM0 is kept
+    addi.d X, X, 8 * SIZE
+    vbitsel.v VM0, VM0, VM1, VT0 // update running per-lane minimum
+    vbitsel.v VI0, VI0, VI2, VT0 // and its index
+#endif
+    blt $r0, I, .L10 // loop while I > 0
+    .align 3
+
+.L15: // unit-stride path: start reducing the lanes of VM0/VI0 to a single result
+#ifdef DOUBLE
+    vreplvei.d VI1, VI0, 0
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    fcmp.ceq.d $fcc0, $f10, $f9 // $f10/$f9 are the numbered FP regs behind x2 ($vr10) and x1 ($vr9)
+    bceqz $fcc0, .L26 // values differ: pick the smaller one at .L26
+    VCMPLT VT0, VI1, VI2 // equal values: indices are compared with the floating-point compare macro
+    vbitsel.v VI0, VI2, VI1, VT0 // keep the smaller index
+    b .L27
+#else
+    vreplvei.w VI1, VI0, 0
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    VCMPLT VT0, x2, x1
+    vbitsel.v VM1, x1, x2, VT0 // VM1 = smaller of x1, x2
+    vbitsel.v $vr17, VI1, VI2, VT0 // $vr17 is reused as scratch for its index
+    VCMPLT VT0, x4, x3
+    vbitsel.v VM0, x3, x4, VT0 // VM0 = smaller of x3, x4
+    vbitsel.v $vr18, VI3, VI4, VT0 // $vr18 is reused as scratch for its index
+    VCMPLT VT0, VM1, VM0
+    vbitsel.v VM0, VM0, VM1, VT0 // VM0 = overall minimum of the four
+    vbitsel.v VI0, $vr18, $vr17, VT0 // VI0 = its index
+    fcmp.ceq.d $fcc0, $f15, $f9 // $f15/$f9 are the numbered FP regs behind VM0 ($vr15) and x1 ($vr9)
+    bceqz $fcc0, .L26 // minimum != x1: skip the tie-break for VI1
+    VCMPLT VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0 // keep the smaller index
+    b .L26
+#endif
+    .align 3
+
+.L20: // INCX!=1
+    move TEMP, X // gather the seed through TEMP so X still points at the first element
+#ifdef DOUBLE
+    addi.d i0, i0, 1 // i0 = 1 (indices are 1-based)
+    ld.d t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d VM0, t1, 0 // seed lane 0 before the short-vector branch; .L22 reads it as $f15
+    srai.d I, N, 3 // I = N / 8 vector iterations
+    bge $r0, I, .L21 // fewer than 8 elements: scalar loop only
+    ld.d t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.d VM0, t2, 1
+    slli.d i0, i0, 1 //2
+    vreplgr2vr.d $vr17, i0
+    slli.d i0, i0, 1 //4
+    vreplgr2vr.d $vr18, i0
+    addi.d i0, i0, -7 // i0 = -3, so the first "+4" in the loop yields index 1
+    vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI1, i0, 1
+    addi.d i0, i0, 3 // back to 1: VI0 holds the indices matching the seed in VM0
+    vinsgr2vr.d VI0, i0, 0 //1
+    addi.d i0, i0, 1
+    vinsgr2vr.d VI0, i0, 1 //2
+#else
+    addi.w i0, i0, 1 // i0 = 1 (indices are 1-based)
+    ld.w t1, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t1, 0 // seed lane 0 before the short-vector branch; .L22 reads it as $f15
+    srai.d I, N, 3 // I = N / 8 vector iterations
+    bge $r0, I, .L21 // fewer than 8 elements: scalar loop only
+    ld.w t2, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t3, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    ld.w t4, TEMP, 0 * SIZE
+    add.d TEMP, TEMP, INCX
+    vinsgr2vr.w VM0, t2, 1
+    vinsgr2vr.w VM0, t3, 2
+    vinsgr2vr.w VM0, t4, 3
+    slli.w i0, i0, 2 //4
+    vreplgr2vr.w $vr17, i0
+    slli.w i0, i0, 1 //8
+    vreplgr2vr.w $vr18, i0
+    addi.w i0, i0, -15 // i0 = -7, so the first "+8" in the loop yields index 1
+    vinsgr2vr.w VI1, i0, 0 //initialize the index value for vectorization
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI1, i0, 3
+    addi.w i0, i0, 5 // back to 1: VI0 holds the indices matching the seed in VM0
+    vinsgr2vr.w VI0, i0, 0 //1
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 1 //2
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 2 //3
+    addi.w i0, i0, 1
+    vinsgr2vr.w VI0, i0, 3 //4
+#endif
+    .align 3
+
+.L24: // strided main loop: gather 8 elements per iteration, track per-lane minimum and index
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vadd.d VI1, VI1, $vr18 // VI1 = indices of elements 0..1 of this iteration
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    vadd.d VI2, VI1, $vr17 // VI2 = indices of elements 2..3
+    VCMPLT VT0, VX1, VX0
+    vbitsel.v x1, VX0, VX1, VT0 // x1 = VX1 where the mask is set, else VX0
+    vbitsel.v x2, VI1, VI2, VT0 // x2 = matching indices
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vadd.d VI1, VI2, $vr17 // VI1 = indices of elements 4..5
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    vadd.d VI2, VI1, $vr17 // VI2 = indices of elements 6..7
+    VCMPLT VT0, VX1, VX0
+    vbitsel.v x3, VX0, VX1, VT0
+    vbitsel.v x4, VI1, VI2, VT0
+    VCMPLT VT0, x3, x1
+    vbitsel.v x1, x1, x3, VT0 // x1 = per-lane minimum of this iteration
+    vbitsel.v x2, x2, x4, VT0 // x2 = its indices
+    VCMPLT VT0, x1, VM0 // strict compare: on a tie the value already in VM0 is kept
+    addi.d I, I, -1
+    vbitsel.v VM0, VM0, x1, VT0 // update running per-lane minimum
+    vbitsel.v VI0, VI0, x2, VT0 // and its index
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    vadd.w VI1, VI1, $vr18 // VI1 = indices of the elements in VX0
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vadd.w VI2, VI1, $vr17 // VI2 = indices of the elements in VX1
+    VCMPLT VT0, VX1, VX0
+    addi.d I, I, -1
+    vbitsel.v VM1, VX0, VX1, VT0 // VM1 = VX1 where the mask is set, else VX0
+    vbitsel.v VI2, VI1, VI2, VT0 // matching indices
+    VCMPLT VT0, VM1, VM0 // strict compare: on a tie the value already in VM0 is kept
+    vbitsel.v VM0, VM0, VM1, VT0 // update running per-lane minimum
+    vbitsel.v VI0, VI0, VI2, VT0 // and its index
+#endif
+    blt $r0, I, .L24 // loop while I > 0
+    .align 3
+
+.L25: // strided path: start reducing the lanes of VM0/VI0, then fall into .L26
+#ifdef DOUBLE
+    vreplvei.d VI1, VI0, 0
+    vreplvei.d VI2, VI0, 1
+    vreplvei.d x1, VM0, 0
+    vreplvei.d x2, VM0, 1
+    fcmp.ceq.d $fcc0, $f10, $f9 // $f10/$f9 are the numbered FP regs behind x2 ($vr10) and x1 ($vr9)
+    bceqz $fcc0, .L26 // values differ: pick the smaller one at .L26
+    VCMPLT VT0, VI1, VI2 // equal values: indices are compared with the floating-point compare macro
+    vbitsel.v VI0, VI2, VI1, VT0 // keep the smaller index
+    b .L27
+#else
+    vreplvei.w VI1, VI0, 0
+    vreplvei.w VI2, VI0, 1
+    vreplvei.w VI3, VI0, 2
+    vreplvei.w VI4, VI0, 3
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    VCMPLT VT0, x2, x1
+    vbitsel.v VM1, x1, x2, VT0 // VM1 = smaller of x1, x2
+    vbitsel.v $vr17, VI1, VI2, VT0 // $vr17 is reused as scratch for its index
+    VCMPLT VT0, x4, x3
+    vbitsel.v VM0, x3, x4, VT0 // VM0 = smaller of x3, x4
+    vbitsel.v $vr18, VI3, VI4, VT0 // $vr18 is reused as scratch for its index
+    VCMPLT VT0, VM1, VM0
+    vbitsel.v VM0, VM0, VM1, VT0 // VM0 = overall minimum of the four
+    vbitsel.v VI0, $vr18, $vr17, VT0 // VI0 = its index
+    fcmp.ceq.d $fcc0, $f15, $f9 // $f15/$f9 are the numbered FP regs behind VM0 ($vr15) and x1 ($vr9)
+    bceqz $fcc0, .L26 // minimum != x1: skip the tie-break for VI1
+    VCMPLT VT0, VI1, VI0
+    vbitsel.v VI0, VI0, VI1, VT0 // keep the smaller index
+#endif
+    .align 3
+
+.L26:
+#ifdef DOUBLE
+    VCMPLT VT0, x2, x1 // the two lane values differ: pick the smaller
+    vbitsel.v VM0, x1, x2, VT0
+    vbitsel.v VI0, VI1, VI2, VT0 // and its index
+#else
+    fcmp.ceq.d $fcc0, $f15, $f10 // tie-break: does the minimum equal x2 ($vr10)?
+    bceqz $fcc0, .L27
+    VCMPLT VT0, VI2, VI0
+    vbitsel.v VI0, VI0, VI2, VT0 // keep the smaller index
+#endif
+    .align 3
+
+.L27:
+#ifndef DOUBLE
+    fcmp.ceq.d $fcc0, $f15, $f11 // same tie-break against x3 / VI3
+    bceqz $fcc0, .L28
+    VCMPLT VT0, VI3, VI0
+    vbitsel.v VI0, VI0, VI3, VT0
+    .align 3
+
+.L28:
+    fcmp.ceq.d $fcc0, $f15, $f12 // same tie-break against x4 / VI4
+    bceqz $fcc0, .L29
+    VCMPLT VT0, VI4, VI0
+    vbitsel.v VI0, VI0, VI4, VT0
+    .align 3
+
+.L29:
+#endif
+    MTG i0, $f20 // move the winning index ($f20 backs VI0) into i0
+    .align 3
+
+.L21: //N<8
+    andi I, N, 7 // I = N % 8 elements left for the scalar loop
+    bge $r0, I, .L999 // none left: return i0
+    srai.d i1, N, 3
+    slli.d i1, i1, 3
+    addi.d i1, i1, 1 //current index
+    movgr2fr.d $f21, i1 // $f21 = index of the element about to be examined
+    movgr2fr.d $f20, i0 // $f20 = index of the best element so far
+    .align 3
+
+.L22: // scalar tail: $f15 = running minimum, $f20 = its index
+    LD $f9, X, 0 // element-width load (common.h macro); a fixed fld.d read 8 bytes even for 4-byte floats
+    addi.d I, I, -1
+    CMPLT $fcc0, $f9, $f15 // strict compare, so an earlier element wins ties
+    add.d  X, X, INCX
+    fsel $f15, $f15, $f9, $fcc0 // take the new element when it is smaller
+    fsel $f20, $f20, $f21, $fcc0 // and its index
+    addi.d i1, i1, 1
+    movgr2fr.d $f21, i1
+    blt $r0, I, .L22 // loop while I > 0
+    MTG i0, $f20 // move the winning index into i0
+    .align 3
+
+.L999: // common exit
+    move $r4, $r17 // return value = i0 ($r17)
+    jirl $r0, $r1, 0x0 // jump to the address in $r1 (return)
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/loongarch64_asm.S b/kernel/loongarch64/loongarch64_asm.S
new file mode 100644
index 000000000..d097b3045
--- /dev/null
+++ b/kernel/loongarch64/loongarch64_asm.S
@@ -0,0 +1,673 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#if __loongarch_grlen == 64 // 64-bit general-purpose registers
+#define LA_REG    int64_t
+#define REG_SIZE  8 // bytes per general-purpose register
+#define REG_LOG   3 // log2(REG_SIZE), used as a shift count
+#define PTR_ADDI  addi.d
+#define PTR_ADD   add.d
+#define PTR_SUB   sub.d
+#define PTR_LD    ld.d
+#define PTR_ST    st.d
+#define PTR_SLLI  slli.d
+#define PTR_SRLI  srli.d
+#define PTR_SRAI  srai.d
+#define PTR_MUL   mul.d
+#define PTR_ALSL  alsl.d
+#elif __loongarch_grlen == 32 // 32-bit general-purpose registers
+#define LA_REG    int32_t
+#define REG_SIZE  4
+#define REG_LOG   2
+#define PTR_ADDI  addi.w
+#define PTR_ADD   add.w
+#define PTR_SUB   sub.w
+#define PTR_LD    ld.w
+#define PTR_ST    st.w
+#define PTR_SLLI  slli.w
+#define PTR_SRLI  srli.w
+#define PTR_SRAI  srai.w
+#define PTR_MUL   mul.w
+#define PTR_ALSL  alsl.w
+#else
+// Neither test matched (for example an early internal toolchain that does not
+// define __loongarch_grlen): fall back to the 64-bit definitions for compatibility.
+#define LA_REG    int64_t
+#define REG_SIZE  8
+#define REG_LOG   3
+#define PTR_ADDI  addi.d
+#define PTR_ADD   add.d
+#define PTR_SUB   sub.d
+#define PTR_LD    ld.d
+#define PTR_ST    st.d
+#define PTR_SLLI  slli.d
+#define PTR_SRLI  srli.d
+#define PTR_SRAI  srai.d
+#define PTR_MUL   mul.d
+#define PTR_ALSL  alsl.d
+#endif
+
+#if __loongarch_frlen == 64 // 64-bit floating-point registers
+#define FREG_SIZE 8 // bytes per floating-point register
+#define FREG_LOG  3 // log2(FREG_SIZE), used as a shift count
+#define PTR_FLD   fld.d
+#define PTR_FST   fst.d
+#elif __loongarch_frlen == 32 // 32-bit floating-point registers
+#define FREG_SIZE 4
+#define FREG_LOG  2
+#define PTR_FLD   fld.s
+#define PTR_FST   fst.s
+#else
+// Neither test matched (for example an early internal toolchain that does not
+// define __loongarch_frlen): fall back to the 64-bit definitions for compatibility.
+#define FREG_SIZE 8
+#define FREG_LOG  3
+#define PTR_FLD   fld.d
+#define PTR_FST   fst.d
+#endif
+
+// Maximum number of registers a kernel may use that do not need to be preserved
+// across calls; push_if_used/pop_if_used save $sN/$fsN only for counts above these.
+// Ref: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-CN.html
+#define MAX_INT_CALLER_SAVED 17
+#define MAX_FP_CALLER_SAVED  24
+
+.altmacro // Enable alternate macro mode (the %expr arguments used by push_regs and friends rely on it)
+
+.macro push_if_used regs, fregs // Save $sN / $fsN on the stack when regs / fregs exceed the caller-saved limits
+.if \regs > MAX_INT_CALLER_SAVED // more integer registers requested than are caller-saved
+    PTR_ADDI      $sp,    $sp,    -((\regs - MAX_INT_CALLER_SAVED) << REG_LOG) // reserve one REG_SIZE slot per extra register
+    push_regs 0, \regs - MAX_INT_CALLER_SAVED - 1
+.endif // integer part: $s0 .. $s(regs - MAX_INT_CALLER_SAVED - 1) are now stored
+.if \fregs > MAX_FP_CALLER_SAVED // same scheme for the floating-point registers
+    PTR_ADDI      $sp,    $sp,    -((\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG) // reserve one FREG_SIZE slot per extra register
+    push_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1
+.endif // floating-point part: $fs0 .. $fs(fregs - MAX_FP_CALLER_SAVED - 1) are now stored
+.endm // End push_if_used
+.macro pop_if_used regs, fregs // Restore what push_if_used saved; must be given the same regs / fregs values
+.if \fregs > MAX_FP_CALLER_SAVED // floating-point area was pushed last, so it is released first
+    pop_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1
+    PTR_ADDI      $sp,    $sp,    (\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG // release the floating-point save area
+.endif
+.if \regs > MAX_INT_CALLER_SAVED
+    pop_regs 0, \regs - MAX_INT_CALLER_SAVED - 1
+    PTR_ADDI      $sp,    $sp,    (\regs - MAX_INT_CALLER_SAVED) << REG_LOG // release the integer save area
+.endif
+.endm // End pop_if_used
+.macro push_regs from, to // Store $s<from> .. $s<to>, register k at $sp + (k << REG_LOG)
+    PTR_ST    $s\()\from,     $sp,    \from << REG_LOG
+.if \to - \from // non-zero while registers remain; recursion stops when from == to
+    push_regs %from + 1, \to
+.endif
+.endm // End push_regs
+.macro pop_regs from, to // Reload $s<from> .. $s<to>, register k from $sp + (k << REG_LOG)
+    PTR_LD    $s\()\from,     $sp,    \from << REG_LOG
+.if \to - \from // non-zero while registers remain; recursion stops when from == to
+    pop_regs %from + 1, \to
+.endif
+.endm // End pop_regs
+.macro push_fregs from, to // Store $fs<from> .. $fs<to>, register k at $sp + (k << FREG_LOG)
+    PTR_FST   $fs\()\from,    $sp,    \from << FREG_LOG
+.if \to - \from // non-zero while registers remain; recursion stops when from == to
+    push_fregs %from + 1, \to
+.endif
+.endm // End push_fregs
+.macro pop_fregs from, to // Reload $fs<from> .. $fs<to>, register k from $sp + (k << FREG_LOG)
+    PTR_FLD   $fs\()\from,    $sp,    \from << FREG_LOG
+.if \to - \from // non-zero while registers remain; recursion stops when from == to
+    pop_fregs %from + 1, \to
+.endif
+.endm // End pop_fregs
+
+//
+// Instruction Related Macros
+//
+// GLD: for each (out, src, offset) triple emit <pre_op>ld[.<suf_op>] out, src, offset.
+// suf_op=0 selects the form without a suffix; further triples are handled by recursion.
+.macro GLD pre_op:req, suf_op=0, out:req, src:req, offset:req/* imm */, more:vararg
+.ifeqs "\suf_op", "0"
+    \pre_op\()ld    \out,   \src,   \offset
+.else
+    \pre_op\()ld.\suf_op    \out,   \src,   \offset
+.endif
+.ifnb \more
+    GLD \pre_op, \suf_op, \more
+.endif
+.endm
+
+// GLD_INC: like GLD, but each load is followed by PTR_ADDI src, src, inc,
+// so src has already advanced by the immediate inc when the next
+// (out, src, offset) triple is processed.
+.macro GLD_INC pre_op:req, suf_op=0, inc:req, out:req, src:req, offset:req/* imm */, more:vararg
+.ifeqs "\suf_op", "0"
+    \pre_op\()ld    \out,   \src,   \offset
+.else
+    \pre_op\()ld.\suf_op    \out,   \src,   \offset
+.endif
+    PTR_ADDI  \src,   \src,   \inc
+.ifnb \more
+    GLD_INC \pre_op, \suf_op, \inc, \more
+.endif
+.endm
+//
+// GLDX: same as GLD, except that the offset operand is a register (ldx form)
+//
+.macro GLDX pre_op:req, suf_op=0, out:req, src:req, offset:req/* reg */, more:vararg
+.ifeqs "\suf_op", "0"
+    \pre_op\()ldx    \out,   \src,   \offset
+.else
+    \pre_op\()ldx.\suf_op    \out,   \src,   \offset
+.endif
+.ifnb \more
+    GLDX \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GLDREPL: emit <pre_op>ldrepl.<suf_op> out, src, offset for each (out, src, offset) triple
+//
+.macro GLDREPL pre_op:req, suf_op:req, out:req, src:req, offset:req/* imm */, more:vararg
+    \pre_op\()ldrepl.\suf_op    \out,   \src,   \offset
+.ifnb \more
+    GLDREPL  \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GST: emit <pre_op>st[.<suf_op>] src, dst, offset for each (src, dst, offset) triple; suf_op=0 drops the suffix
+//
+.macro GST pre_op:req, suf_op=0, src:req, dst:req, offset:req/* imm */, more:vararg
+.ifeqs "\suf_op", "0"
+    \pre_op\()st    \src,   \dst,   \offset
+.else
+    \pre_op\()st.\suf_op \src,  \dst, \offset
+.endif
+.ifnb \more
+    GST \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GMUL: emit <pre_op>mul.<suf_op> out, in0, in1 for each (out, in0, in1) triple; pre_op may be empty
+//
+.macro GMUL pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
+    \pre_op\()mul.\suf_op   \out,   \in0,   \in1
+.ifnb \more
+    GMUL \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GMADD: emit <pre_op>madd.<suf_op> out, in0, in1, in2 for each (out, in0, in1, in2) group; pre_op may be empty
+//
+.macro GMADD pre_op, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg
+    \pre_op\()madd.\suf_op \out, \in0, \in1, \in2
+.ifnb \more
+    GMADD \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GADD: emit <pre_op>add.<suf_op> out, in0, in1 for each (out, in0, in1) triple; pre_op may be empty
+//
+.macro GADD pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
+    \pre_op\()add.\suf_op \out, \in0, \in1
+.ifnb \more
+    GADD \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GADDI: emit <pre_op>addi.<suf_op> out, in0, in1 for each (out, in0, in1) triple; in1 is the third operand of addi
+//
+.macro GADDI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
+    \pre_op\()addi.\suf_op  \out,   \in0,   \in1
+.ifnb \more
+    GADDI \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GSUB: emit <pre_op>sub.<suf_op> out, in0, in1 for each (out, in0, in1) triple; pre_op may be empty
+//
+.macro GSUB pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
+    \pre_op\()sub.\suf_op \out, \in0, \in1
+.ifnb \more
+    GSUB \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GSLLI: emit <pre_op>slli.<suf_op> out, in0, in1 for each (out, in0, in1) triple; in1 is the third operand of slli
+//
+.macro GSLLI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
+    \pre_op\()slli.\suf_op  \out,   \in0,   \in1
+.ifnb \more
+    GSLLI \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GINSVE0: emit <pre_op>insve0.<suf_op> out, in0, in1 for each (out, in0, in1) triple
+//
+.macro GINSVE0 pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg
+    \pre_op\()insve0.\suf_op    \out,   \in0,   \in1
+.ifnb \more
+    GINSVE0 \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GXOR: emit <pre_op>xor.<suf_op> out, in0, in1 for each (out, in0, in1) triple
+//
+.macro GXOR pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg
+    \pre_op\()xor.\suf_op    \out,   \in0,   \in1
+.ifnb \more
+    GXOR \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GPERMI: emit <pre_op>permi.<suf_op> out, in0, in1 for each (out, in0, in1) triple; in1 is the third operand of permi
+//
+.macro GPERMI pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg
+    \pre_op\()permi.\suf_op     \out,   \in0,   \in1
+.ifnb \more
+    GPERMI \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GNMSUB: emit <pre_op>nmsub.<suf_op> out, in0, in1, in2 for each (out, in0, in1, in2) group
+//
+.macro GNMSUB pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg
+    \pre_op\()nmsub.\suf_op     \out,   \in0,   \in1,   \in2
+.ifnb \more
+    GNMSUB \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GPRELD: emit preld in0, in1, in2 for each (in0, in1, in2) triple
+//
+.macro GPRELD in0:req, in1:req, in2:req, more:vararg
+    preld \in0, \in1, \in2
+.ifnb \more
+    GPRELD \more
+.endif
+.endm
+
+//
+// Compound instructions
+//
+// GACC: Accumulate the values of vector registers; for each (out, in) pair the elements of in
+// are summed by repeated halving, leaving the total in the lowest element of out. in may be overwritten.
+.macro GACC pre_op:req, suf_op:req, out:req, in:req, more:vararg
+.ifeqs "\pre_op", "xvf"
+    xvpermi.q              \out,   \in,    0x01
+    \pre_op\()add.\suf_op  \in,    \out,   \in
+    xvpackod.d             \out,   \in,    \in
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.ifeqs "\suf_op", "s"
+    xvpackod.w             \in,    \out,   \out
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.endif
+.endif // pre_op = xvf (256-bit floating point)
+
+.ifeqs "\pre_op", "vf"
+    vpackod.d              \out,   \in,    \in
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.ifeqs "\suf_op", "s"
+    vpackod.w              \in,    \out,   \out
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.endif
+.endif // pre_op = vf (128-bit floating point)
+
+.ifeqs "\pre_op", "xv"
+    xvpermi.q              \out,   \in,    0x01
+    \pre_op\()add.\suf_op  \in,    \out,   \in
+    xvpackod.d             \out,   \in,    \in
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.ifnc "\suf_op", "d"
+    xvpackod.w             \in,    \out,   \out
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.ifnc "\suf_op", "w"
+    xvpackod.h             \in,    \out,   \out
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.ifnc "\suf_op", "h"
+    xvpackod.b             \in,    \out,   \out
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.endif
+.endif
+.endif
+.endif // pre_op = xv (256-bit integer; one extra folding step per narrower element size)
+
+.ifeqs "\pre_op", "v"
+    vpackod.d              \out,   \in,    \in
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.ifnc "\suf_op", "d"
+    vpackod.w              \in,    \out,   \out
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.ifnc "\suf_op", "w"
+    vpackod.h              \in,    \out,   \out
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.ifnc "\suf_op", "h"
+    vpackod.b              \in,    \out,   \out
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.endif
+.endif
+.endif
+.endif // pre_op = v (128-bit integer)
+
+.ifnb \more
+    GACC \pre_op, \suf_op, \more
+.endif
+.endm
+//
+// GMOV: copy in to out (emitted as <pre_op>or.v out, in, in) for each (out, in) pair, in argument order
+//
+.macro GMOV pre_op:req, out:req, in:req, more:vararg
+    \pre_op\()or.v  \out, \in, \in
+.ifnb \more
+    GMOV \pre_op, \more
+.endif
+.endm
+
+//
+// GCOMPLEXACC: add up the complex elements of in; the sum is left in the lowest complex element of out
+// pre_op: xvf (LASX) or vf (LSX)
+// suf_op: s or d, single precision or double precision complex numbers
+// Note: When "pre_op = xvf && suf_op = s", in will be modified.
+//
+.macro GCOMPLEXACC pre_op:req, suf_op:req, out:req, in:req, more:vararg
+.ifeqs "\pre_op", "xvf"
+    xvpermi.q              \out,   \in,    0x01
+.ifeqs "\suf_op", "s"
+    \pre_op\()add.\suf_op  \in,    \out,   \in
+    xvpackod.d             \out,   \in,    \in
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.else
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.endif
+.endif // pre_op = xvf
+
+.ifeqs "\pre_op", "vf"
+.ifeqs "\suf_op", "s"
+    vpackod.d              \out,   \in,    \in
+    \pre_op\()add.\suf_op  \out,   \out,   \in
+.else
+    vor.v \out, \in, \in
+.endif // for suf_op = d a 128-bit register holds a single complex value, so it is only copied
+.endif // pre_op = vf
+
+
+.ifnb \more
+    GCOMPLEXACC \pre_op, \suf_op, \more
+.endif
+.endm
+
+//
+// GCOMPLEXMUL: Complex multiplication, out = in0 * in1
+// xconj: default value 0.
+// if !(xconj)
+//     out_r = in0_r * in1_r - in0_i * in1_i;
+//     out_i = in0_r * in1_i + in0_i * in1_r;
+// else
+//     out_r = in0_r * in1_r + in0_i * in1_i;
+//     out_i = in0_r * in1_i - in0_i * in1_r;
+// pre_op: xvf or vf, differentiate between LSX or LASX instruction
+// suf_op: s or d, differentiate between single precision or double precision complex numbers
+// tmp0, tmp1, tmp2: scratch vector registers, overwritten for every (out, in0, in1, ...) group
+.macro GCOMPLEXMUL xconj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, tmp0:req, tmp1:req, tmp2:req, more:vararg
+.ifeqs "\pre_op", "xvf"
+    xvxor.v           \tmp1,     \tmp1,     \tmp1
+.ifeqs "\suf_op", "s"
+    xvpackev.w  \tmp0,     \in0,      \in0
+.else
+    xvpackev.d  \tmp0,     \in0,      \in0
+.endif
+.else
+    vxor.v           \tmp1,     \tmp1,     \tmp1
+.ifeqs "\suf_op", "s"
+    vpackev.w  \tmp0,     \in0,      \in0
+.else
+    vpackev.d  \tmp0,     \in0,      \in0
+.endif
+.endif // tmp1 = 0, tmp0 = even (real) elements of in0 duplicated into both slots of each pair
+
+    \pre_op\()sub.\suf_op    \tmp1,     \tmp1,     \in0 // tmp1 = -in0
+
+.ifeqs "\pre_op", "xvf"
+.ifeqs "\suf_op", "s"
+.ifeqs "\xconj", "0"
+    xvpackod.w  \tmp1,     \in0,      \tmp1
+.else
+    xvpackod.w  \tmp1,     \tmp1,     \in0
+.endif
+    xvshuf4i.w  \tmp2,     \in1,      0xb1
+.else
+.ifeqs "\xconj", "0"
+    xvpackod.d  \tmp1,     \in0,      \tmp1
+.else
+    xvpackod.d  \tmp1,     \tmp1,     \in0
+.endif
+    xvshuf4i.d  \tmp2,     \in1,      0x0b
+.endif
+.else
+.ifeqs "\suf_op", "s"
+.ifeqs "\xconj", "0"
+    vpackod.w  \tmp1,     \in0,      \tmp1
+.else
+    vpackod.w  \tmp1,     \tmp1,     \in0
+.endif
+    vshuf4i.w  \tmp2,     \in1,      0xb1
+.else
+.ifeqs "\xconj", "0"
+    vpackod.d  \tmp1,     \in0,      \tmp1
+.else
+    vpackod.d  \tmp1,     \tmp1,     \in0
+.endif
+    vshuf4i.d  \tmp2,     \in1,      0x0b
+.endif
+.endif // tmp1 = (-in0_i, in0_i) pairs, signs swapped when xconj != 0; tmp2 = in1 with real and imaginary parts swapped
+
+    \pre_op\()mul.\suf_op    \out,      \tmp0,     \in1 // out = in0_r * (in1_r, in1_i)
+    \pre_op\()madd.\suf_op   \out,      \tmp1,     \tmp2,     \out // out += signed in0_i * (in1_i, in1_r)
+
+.ifnb \more
+    GCOMPLEXMUL \xconj, \pre_op, \suf_op, \more
+.endif
+.endm
+
+//
+// GCOMPLEXMADD: Complex multiply-accumulate, out = in0 * in1 + in2
+// xconj: default value 0
+// conj: default value 0
+// if !(conj)
+//   if !(xconj)
+//     out_r = in0_r * in1_r - in0_i * in1_i + in2_r;
+//     out_i = in0_r * in1_i + in0_i * in1_r + in2_i;
+//   else
+//     out_r = in0_r * in1_r + in0_i * in1_i + in2_r;
+//     out_i = in0_r * in1_i - in0_i * in1_r + in2_i;
+// else
+//   if !(xconj)
+//     out_r = in0_r * in1_r + in0_i * in1_i + in2_r;
+//     out_i = in2_i - (in0_r * in1_i - in0_i * in1_r);
+//   else
+//     out_r = in0_r * in1_r   - in0_i * in1_i + in2_r;
+//     out_i = in2_i - (in0_r * in1_i + in0_i * in1_r);
+// pre_op: xvf or vf, differentiate between LSX or LASX instruction
+// suf_op: s or d, differentiate between single precision or double precision complex numbers
+// tmp0-tmp2: scratch, overwritten. out is written before the last read of in0 and in1, so it must not be either of them.
+.macro GCOMPLEXMADD xconj=0, conj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, tmp0:req, tmp1:req, tmp2:req, more:vararg
+.ifeqs "\pre_op", "xvf"
+    xvxor.v           \tmp1,     \tmp1,     \tmp1
+.ifeqs "\suf_op", "s"
+    xvpackev.w  \tmp0,     \in0,      \in0
+.else
+    xvpackev.d  \tmp0,     \in0,      \in0
+.endif
+.else
+    vxor.v           \tmp1,     \tmp1,     \tmp1
+.ifeqs "\suf_op", "s"
+    vpackev.w  \tmp0,     \in0,      \in0
+.else
+    vpackev.d  \tmp0,     \in0,      \in0
+.endif
+.endif // tmp1 = 0, tmp0 = even (real) elements of in0 duplicated into both slots of each pair
+
+    \pre_op\()madd.\suf_op   \tmp2,      \tmp0,     \in1,      \in2 // tmp2 = in0_r * in1 + in2
+.ifeqs "\conj", "1"
+    \pre_op\()nmsub.\suf_op  \tmp0,      \tmp0,     \in1,      \in2 // tmp0 = in2 - in0_r * in1
+.ifeqs "\pre_op", "xvf"
+.ifeqs "\suf_op", "s"
+    xvshuf4i.w  \tmp0,     \tmp0,      0xb1
+    xvpackev.w  \out,      \tmp0,      \tmp2
+.else
+    xvshuf4i.d  \tmp0,     \tmp0,      0x0b
+    xvpackev.d  \out,      \tmp0,      \tmp2
+.endif
+.else
+.ifeqs "\suf_op", "s"
+    vshuf4i.w  \tmp0,     \tmp0,      0xb1
+    vpackev.w  \out,      \tmp0,      \tmp2
+.else
+    vshuf4i.d  \tmp0,     \tmp0,      0x0b
+    vpackev.d  \out,      \tmp0,      \tmp2
+.endif
+.endif /* pre_op = xvf */
+.else
+    \pre_op\()add.\suf_op    \out,     \tmp2,     \tmp1 // tmp1 is still zero here, so out = tmp2
+.endif /* conj = 1 */
+
+    \pre_op\()sub.\suf_op    \tmp1,     \tmp1,     \in0 // tmp1 = -in0
+
+.ifeqs "\pre_op", "xvf"
+.ifeqs "\suf_op", "s"
+.ifeqs "\conj",  "0"
+.ifeqs "\xconj", "0"
+    xvpackod.w  \tmp1,     \in0,      \tmp1
+.else
+    xvpackod.w  \tmp1,     \tmp1,     \in0
+.endif
+.else
+.ifeqs "\xconj", "0"
+    xvpackod.w  \tmp1,     \in0,     \in0
+.else
+    xvpackod.w  \tmp1,     \tmp1,     \tmp1
+.endif
+.endif
+    xvshuf4i.w  \tmp2,     \in1,      0xb1
+.else
+.ifeqs "\conj",  "0"
+.ifeqs "\xconj", "0"
+    xvpackod.d  \tmp1,     \in0,      \tmp1
+.else
+    xvpackod.d  \tmp1,     \tmp1,     \in0
+.endif
+.else
+.ifeqs "\xconj", "0"
+    xvpackod.d  \tmp1,     \in0,     \in0
+.else
+    xvpackod.d  \tmp1,     \tmp1,    \tmp1
+.endif
+.endif
+    xvshuf4i.d  \tmp2,     \in1,      0x0b
+.endif
+.else
+.ifeqs "\suf_op", "s"
+.ifeqs "\conj",  "0"
+.ifeqs "\xconj", "0"
+    vpackod.w  \tmp1,     \in0,      \tmp1
+.else
+    vpackod.w  \tmp1,     \tmp1,     \in0
+.endif
+.else
+.ifeqs "\xconj", "0"
+    vpackod.w  \tmp1,     \in0,     \in0
+.else
+    vpackod.w  \tmp1,     \tmp1,    \tmp1
+.endif
+.endif
+    vshuf4i.w  \tmp2,     \in1,      0xb1
+.else
+.ifeqs "\conj",  "0"
+.ifeqs "\xconj", "0"
+    vpackod.d  \tmp1,     \in0,      \tmp1
+.else
+    vpackod.d  \tmp1,     \tmp1,     \in0
+.endif
+.else
+.ifeqs "\xconj", "0"
+    vpackod.d  \tmp1,     \in0,     \in0
+.else
+    vpackod.d  \tmp1,     \tmp1,    \tmp1
+.endif
+.endif
+    vshuf4i.d  \tmp2,     \in1,      0x0b
+.endif
+.endif // tmp1 = signed copies of in0_i chosen by conj / xconj; tmp2 = in1 with real and imaginary parts swapped
+
+    \pre_op\()madd.\suf_op   \out,      \tmp1,     \tmp2,     \out // add the in0_i terms
+
+.ifnb \more
+    GCOMPLEXMADD \xconj, \conj, \pre_op, \suf_op, \more
+.endif
+.endm
+
+//
+// Media Related Macros
+//
+.macro GSBUTTERFLY pre_op, suf_op, out0, out1, in0, in1 // interleave in0 with in1: low halves to out0, high halves to out1
+    \pre_op\()ilvl.\suf_op \out0, \in0, \in1
+    \pre_op\()ilvh.\suf_op \out1, \in0, \in1
+.endm // out0 is written before in0/in1 are read for out1
+.macro GINTERLACE pre_op, suf_op, out0, out1, in0, in1 // split in0/in1: even elements to out0, odd elements to out1
+    \pre_op\()pickev.\suf_op \out0, \in0, \in1
+    \pre_op\()pickod.\suf_op \out1, \in0, \in1
+.endm // out0 is written before in0/in1 are read for out1
+
+//
+// GTRANSPOSE4x4_D: Transpose 4x4 block with double-word elements in vectors,
+// has no pre_op param. 128-bit vector instructions are not supported.
+// in0-in3 are the source rows, out0-out3 receive the result; vt0 and vt1 are scratch registers.
+.macro GTRANSPOSE4x4_D in0, in1, in2, in3, out0, out1, out2, out3, \
+                       vt0, vt1
+    GSBUTTERFLY xv, d, \vt0, \out1, \in1, \in0
+    GSBUTTERFLY xv, d, \vt1, \out3, \in3, \in2
+    GMOV xv, \out0, \vt0, \out2, \vt1, \vt1, \out3
+    GPERMI xv, q, \out0, \out2, 0x02, \out2, \vt0, 0x31, \out3, \out1, 0x31, \out1, \vt1, 0x02
+.endm // the first two steps interleave within 128-bit lanes, GPERMI then recombines the lanes
+
+.macro GTRANSPOSE8x8_W out0, out1, out2, out3, out4, out5, out6, out7, \
+                       in0, in1, in2, in3, in4, in5, in6, in7, \
+                       tmp0, tmp1, tmp2, tmp3
+    GSBUTTERFLY xv, w, \tmp0, \tmp2, \in2, \in0
+    GSBUTTERFLY xv, w, \tmp1, \tmp3, \in3, \in1
+    GSBUTTERFLY xv, w, \out0, \out1, \tmp1, \tmp0
+    GSBUTTERFLY xv, w, \out2, \out3, \tmp3, \tmp2
+    // Same two interleave stages for in4-in7, producing out4-out7 (tmp0-tmp3 are reused).
+    GSBUTTERFLY xv, w, \tmp0, \tmp2, \in6, \in4
+    GSBUTTERFLY xv, w, \tmp1, \tmp3, \in7, \in5
+    GSBUTTERFLY xv, w, \out4, \out5, \tmp1, \tmp0
+    GSBUTTERFLY xv, w, \out6, \out7, \tmp3, \tmp2
+    // Keep copies of out0-out3: the permutes below overwrite them before out4-out7 are completed.
+    GMOV xv, \tmp0, \out0, \tmp1, \out1, \tmp2, \out2, \tmp3, \out3
+    // Recombine 128-bit lanes: out0-out3 take selector 0x02, out4-out7 take selector 0x31.
+    GPERMI xv, q, \out0, \out4, 0x02, \out1, \out5, 0x02, \
+                  \out2, \out6, 0x02, \out3, \out7, 0x02, \
+                  \out4, \tmp0, 0x31, \out5, \tmp1, 0x31, \
+                  \out6, \tmp2, 0x31, \out7, \tmp3, 0x31
+.endm // GTRANSPOSE8x8_W: 8x8 transpose of word elements using LASX (xv) instructions; tmp0-tmp3 are scratch
diff --git a/kernel/loongarch64/max_lasx.S b/kernel/loongarch64/max_lasx.S
new file mode 100644
index 000000000..3215ae394
--- /dev/null
+++ b/kernel/loongarch64/max_lasx.S
@@ -0,0 +1,229 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N     $r4 // element count
+#define X     $r5 // pointer to the current element
+#define INCX  $r6 // stride: in elements on entry, shifted left by BASE_SHIFT below
+
+#define I     $r12 // loop counter
+#define TEMP  $r13 // 1 << BASE_SHIFT, compared with INCX to detect unit stride
+
+#define VM0 $xr0 // running maximum
+#define VM1 $xr1
+#define VM2 $xr2
+#define VX0 $xr3 // VX0-VX3: freshly loaded data
+#define VX1 $xr4
+#define VX2 $xr5
+#define VX3 $xr6
+
+#define t1 $r14 // t1-t4: scalar staging registers for the strided gather
+#define t2 $r15
+#define t3 $r16
+#define t4 $r17
+
+    PROLOGUE // LASX max kernel: leaves the largest element of X (plain maximum, not by magnitude) in $f0
+
+#ifdef F_INTERFACE
+    LDINT   N,     0(N)
+    LDINT   INCX,  0(INCX)
+#endif
+
+    bge $r0, N, .L999 // N <= 0: return immediately ($f0 is not written on this path)
+    bge $r0, INCX, .L999 // INCX <= 0: likewise
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT // presumably converts the stride to bytes (BASE_SHIFT comes from common.h)
+#ifdef DOUBLE
+    xvldrepl.d VM0, X, 0 // seed every lane of the running maximum with X[0]
+#else
+    xvldrepl.w VM0, X, 0 // seed every lane of the running maximum with X[0]
+#endif
+    bne INCX, TEMP, .L20 // non-unit stride takes the gather path
+
+    srai.d I, N, 4 // unit stride: N / 16 full iterations
+    bge $r0, I, .L11
+    .align 3
+
+.L10: // unit stride, 16 elements per iteration
+#ifdef DOUBLE
+    xvld VX0, X, 0
+    xvld VX1, X, 32
+    xvld VX2, X, 64
+    xvld VX3, X, 96
+    addi.d I, I, -1
+    addi.d X, X, 128
+    XVFMAX    VM1, VX0, VX1
+    XVFMAX    VM2, VX2, VX3
+    XVFMAX    VM0, VM0, VM1
+    XVFMAX    VM0, VM0, VM2
+#else
+    xvld VX0, X, 0
+    xvld VX1, X, 32
+    addi.d I, I, -1
+    addi.d X, X, 64
+    XVFMAX    VM1, VX0, VX1
+    XVFMAX    VM0, VM0, VM1
+#endif
+    blt $r0, I, .L10
+
+#ifdef DOUBLE // horizontal reduction inside each 128-bit lane
+    xvrepl128vei.d VX0, VM0, 0
+    xvrepl128vei.d VX1, VM0, 1
+    XVFMAX    VM0, VX0, VX1
+#else
+    xvrepl128vei.w VX0, VM0, 0
+    xvrepl128vei.w VX1, VM0, 1
+    xvrepl128vei.w VX2, VM0, 2
+    xvrepl128vei.w VX3, VM0, 3
+    XVFMAX    VM1, VX0, VX1
+    XVFMAX    VM2, VX2, VX3
+    XVFMAX    VM0, VM1, VM2
+#endif
+    xvpermi.q VM1, VM0, 0x1 // bring the other 128-bit lane alongside
+    XVFMAX    VM0, VM0, VM1 // VM0 now carries the maximum seen so far
+    .align 3
+
+.L11: // unit stride: remaining N % 16 elements
+    andi  I, N, 0x0f
+    bge $r0, I, .L13
+    .align 3
+
+.L12: /* 0 < N < 16 */
+    LD    $f1, X, 0
+    addi.d  I, I, -1
+    addi.d  X, X, SIZE
+    FMAX  $f0, $f0, $f1 // scalar update; the code treats $f0 as the low element of VM0
+    bnez    I, .L12
+    .align 3
+
+.L13:
+    jirl $r0,  $r1, 0x0 // return
+    .align 3
+
+.L20: // INCX!=1
+    srai.d I, N, 3 // N / 8 full iterations
+    bge $r0, I, .L23
+    .align 3
+
+.L21: // strided: gather 8 elements per iteration through t1-t4
+#ifdef DOUBLE
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    XVFMAX    VM1, VX0, VX1 // plain maximum, the same operation as the unit-stride loop
+    XVFMAX    VM0, VM0, VM1
+#else
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VM1, t1, 0
+    xvinsgr2vr.w VM1, t2, 1
+    xvinsgr2vr.w VM1, t3, 2
+    xvinsgr2vr.w VM1, t4, 3
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VM1, t1, 4
+    xvinsgr2vr.w VM1, t2, 5
+    xvinsgr2vr.w VM1, t3, 6
+    xvinsgr2vr.w VM1, t4, 7
+    XVFMAX    VM0, VM0, VM1 // plain maximum, the same operation as the unit-stride loop
+#endif
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22: // horizontal reduction, same sequence as after .L10
+#ifdef DOUBLE
+    xvrepl128vei.d VX0, VM0, 0
+    xvrepl128vei.d VX1, VM0, 1
+    XVFMAX    VM0, VX0, VX1
+#else
+    xvrepl128vei.w VX0, VM0, 0
+    xvrepl128vei.w VX1, VM0, 1
+    xvrepl128vei.w VX2, VM0, 2
+    xvrepl128vei.w VX3, VM0, 3
+    XVFMAX    VM1, VX0, VX1
+    XVFMAX    VM2, VX2, VX3
+    XVFMAX    VM0, VM1, VM2
+#endif
+    xvpermi.q VM1, VM0, 1
+    XVFMAX    VM0, VM0, VM1
+    .align 3
+
+.L23: //INCX!=1 and N<8
+    andi I, N, 7 // remaining N % 8 elements
+    bge $r0, I, .L999
+    .align 3
+
+.L24: /* 0 < N < 8 */
+    LD   $f1, X, 0
+    addi.d  I, I, -1
+    add.d   X, X, INCX
+    FMAX  $f0, $f0, $f1
+    bnez    I, .L24
+    .align 3
+
+.L999:
+    jirl $r0, $r1, 0x0 // return
+
+    EPILOGUE
diff --git a/kernel/loongarch64/max_lsx.S b/kernel/loongarch64/max_lsx.S
new file mode 100644
index 000000000..a2100875d
--- /dev/null
+++ b/kernel/loongarch64/max_lsx.S
@@ -0,0 +1,228 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N     $r4 // element count
+#define X     $r5 // pointer to the current element
+#define INCX  $r6 // stride: in elements on entry, shifted left by BASE_SHIFT below
+
+#define I     $r12 // loop counter
+#define TEMP  $r13 // 1 << BASE_SHIFT, compared with INCX to detect unit stride
+
+#define VM0 $vr0 // running maximum
+#define VM1 $vr1
+#define VM2 $vr2
+#define VX0 $vr3 // VX0-VX3: freshly loaded data
+#define VX1 $vr4
+#define VX2 $vr5
+#define VX3 $vr6
+
+#define t1 $r14 // t1-t4: scalar staging registers for the strided gather
+#define t2 $r15
+#define t3 $r16
+#define t4 $r17
+
+    PROLOGUE // LSX max kernel: leaves the largest element of X (plain maximum, not by magnitude) in $f0
+
+#ifdef F_INTERFACE
+    LDINT   N,     0(N)
+    LDINT   INCX,  0(INCX)
+#endif
+
+    bge $r0, N, .L999 // N <= 0: return immediately ($f0 is not written on this path)
+    bge $r0, INCX, .L999 // INCX <= 0: likewise
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT
+    slli.d INCX, INCX, BASE_SHIFT // presumably converts the stride to bytes (BASE_SHIFT comes from common.h)
+#ifdef DOUBLE
+    vldrepl.d VM0, X, 0 // seed every lane of the running maximum with X[0]
+#else
+    vldrepl.w VM0, X, 0 // seed every lane of the running maximum with X[0]
+#endif
+    bne INCX, TEMP, .L20 // non-unit stride takes the gather path
+
+    srai.d I, N, 3 // unit stride: N / 8 full iterations
+    bge $r0, I, .L11
+    .align 3
+
+.L10: // unit stride, 8 elements per iteration
+#ifdef DOUBLE
+    vld VX0, X, 0
+    vld VX1, X, 16
+    vld VX2, X, 32
+    vld VX3, X, 48
+    addi.d I, I, -1
+    addi.d X, X, 64
+    VFMAX    VM1, VX0, VX1
+    VFMAX    VM2, VX2, VX3
+    VFMAX    VM0, VM0, VM1
+    VFMAX    VM0, VM0, VM2
+#else
+    vld VX0, X, 0
+    vld VX1, X, 16
+    addi.d I, I, -1
+    addi.d X, X, 32
+    VFMAX    VM1, VX0, VX1
+    VFMAX    VM0, VM0, VM1
+#endif
+    blt $r0, I, .L10
+
+#ifdef DOUBLE // horizontal reduction across the vector
+    vreplvei.d VX0, VM0, 0
+    vreplvei.d VX1, VM0, 1
+    VFMAX    VM0, VX0, VX1
+#else
+    vreplvei.w VX0, VM0, 0
+    vreplvei.w VX1, VM0, 1
+    vreplvei.w VX2, VM0, 2
+    vreplvei.w VX3, VM0, 3
+    VFMAX    VM1, VX0, VX1
+    VFMAX    VM2, VX2, VX3
+    VFMAX    VM0, VM1, VM2
+#endif
+    .align 3
+
+.L11: // unit stride: remaining N % 8 elements
+    andi I, N, 7
+    bge $r0, I, .L13
+    .align 3
+
+.L12:
+    LD   $f1, X, 0
+    addi.d  I, I, -1
+    addi.d  X, X, SIZE
+    FMAX  $f0, $f0, $f1 // scalar update; the code treats $f0 as the low element of VM0
+    bnez    I, .L12
+    .align 3
+
+.L13:
+    jirl $r0,  $r1, 0x0 // return
+    .align 3
+
+.L20: // INCX!=1
+    srai.d I, N, 3 // N / 8 full iterations
+    bge $r0, I, .L23
+    .align 3
+
+.L21: // strided: gather 8 elements per iteration through t1-t4
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    VFMAX    VM1, VX0, VX1 // plain maximum, the same operation as the unit-stride loop
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    VFMAX    VM2, VX0, VX1
+    VFMAX    VM1, VM1, VM2
+    VFMAX    VM0, VM0, VM1
+#else
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    VFMAX    VM1, VX0, VX1 // plain maximum, the same operation as the unit-stride loop
+    VFMAX    VM0, VM0, VM1
+#endif
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22: // horizontal reduction, same sequence as after .L10
+#ifdef DOUBLE
+    vreplvei.d VX0, VM0, 0
+    vreplvei.d VX1, VM0, 1
+    VFMAX    VM0, VX0, VX1
+#else
+    vreplvei.w VX0, VM0, 0
+    vreplvei.w VX1, VM0, 1
+    vreplvei.w VX2, VM0, 2
+    vreplvei.w VX3, VM0, 3
+    VFMAX    VM1, VX0, VX1
+    VFMAX    VM2, VX2, VX3
+    VFMAX    VM0, VM1, VM2
+#endif
+    .align 3
+
+.L23: //INCX!=1 and N<8
+    andi I, N, 7 // remaining N % 8 elements
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    LD   $f1, X, 0
+    addi.d  I, I, -1
+    add.d   X, X, INCX
+    FMAX  $f0, $f0, $f1
+    bnez    I, .L24
+    .align 3
+
+.L999:
+    jirl $r0, $r1, 0x0 // return
+
+    EPILOGUE
diff --git a/kernel/loongarch64/min_lasx.S b/kernel/loongarch64/min_lasx.S
new file mode 100644
index 000000000..890c8882d
--- /dev/null
+++ b/kernel/loongarch64/min_lasx.S
@@ -0,0 +1,229 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N     $r4 /* element count (first integer argument register) */
+#define X     $r5 /* pointer to the current element of the input vector */
+#define INCX  $r6 /* stride; converted to bytes in the prologue */
+
+#define I     $r12 /* loop counter */
+#define TEMP  $r13 /* byte size of one element, used for the unit-stride test */
+
+#define VM0 $xr0 /* running minimum; $f0 (the return value) is its lowest element */
+#define VM1 $xr1 /* partial minima / gather buffer */
+#define VM2 $xr2
+#define VX0 $xr3 /* input data registers */
+#define VX1 $xr4
+#define VX2 $xr5
+#define VX3 $xr6
+
+#define t1 $r14 /* integer scratch used to gather strided elements */
+#define t2 $r15
+#define t3 $r16
+#define t4 $r17
+
+    PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT   N,     0(N) // Fortran interface: N and INCX arrive by reference
+    LDINT   INCX,  0(INCX)
+#endif
+
+    bge $r0, N, .L999 // N <= 0: return without touching X
+    bge $r0, INCX, .L999 // INCX <= 0: return without touching X
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT // TEMP = 1 << BASE_SHIFT (presumably the element size in bytes)
+    slli.d INCX, INCX, BASE_SHIFT // INCX scaled by the same shift, i.e. a byte stride
+#ifdef DOUBLE
+    xvldrepl.d VM0, X, 0 // seed every lane of the running minimum with X[0]
+#else
+    xvldrepl.w VM0, X, 0 // seed every lane of the running minimum with X[0]
+#endif
+    bne INCX, TEMP, .L20 // non-unit stride: take the gather path
+
+    srai.d I, N, 4 // unit stride: I = N / 16 full vector iterations
+    bge $r0, I, .L11 // fewer than 16 elements: scalar loop only
+    .align 3
+
+.L10: // unit-stride main loop: 16 elements per iteration
+#ifdef DOUBLE
+    xvld VX0, X, 0 // four 32-byte loads = 16 doubles
+    xvld VX1, X, 32
+    xvld VX2, X, 64
+    xvld VX3, X, 96
+    addi.d I, I, -1
+    addi.d X, X, 128 // advance by the 128 bytes just loaded
+    XVFMIN    VM1, VX0, VX1
+    XVFMIN    VM2, VX2, VX3
+    XVFMIN    VM0, VM0, VM1 // fold both partial minima into the accumulator
+    XVFMIN    VM0, VM0, VM2
+#else
+    xvld VX0, X, 0 // two 32-byte loads = 16 floats
+    xvld VX1, X, 32
+    addi.d I, I, -1
+    addi.d X, X, 64 // advance by the 64 bytes just loaded
+    XVFMIN    VM1, VX0, VX1
+    XVFMIN    VM0, VM0, VM1 // fold into the accumulator
+#endif
+    blt $r0, I, .L10
+
+#ifdef DOUBLE
+    xvrepl128vei.d VX0, VM0, 0 // reduce inside each 128-bit half: broadcast element 0 ...
+    xvrepl128vei.d VX1, VM0, 1 // ... and element 1 of each half
+    XVFMIN    VM0, VX0, VX1
+#else
+    xvrepl128vei.w VX0, VM0, 0 // reduce inside each 128-bit half: broadcast elements 0..3
+    xvrepl128vei.w VX1, VM0, 1
+    xvrepl128vei.w VX2, VM0, 2
+    xvrepl128vei.w VX3, VM0, 3
+    XVFMIN    VM1, VX0, VX1
+    XVFMIN    VM2, VX2, VX3
+    XVFMIN    VM0, VM1, VM2
+#endif
+    xvpermi.q VM1, VM0, 0x1 // bring the upper 128-bit half of VM0 into the low half of VM1
+    XVFMIN    VM0, VM0, VM1 // combine both halves; the low element ($f0) is the minimum so far
+    .align 3
+
+.L11: // unit stride: handle the N & 15 elements left after the vector loop
+    andi  I, N, 0x0f // I = number of leftover elements
+    bge $r0, I, .L13 // none left: return
+    .align 3
+
+.L12: /* scalar loop over the remaining 1..15 contiguous elements */
+    LD    $f1, X, 0
+    addi.d  I, I, -1
+    addi.d  X, X, SIZE // step one element forward
+    FMIN  $f0, $f0, $f1 // $f0 is the low element of VM0, i.e. the running minimum
+    bnez    I, .L12
+    .align 3
+
+.L13: // return for the unit-stride path; result in $f0
+    jirl $r0,  $r1, 0x0
+    .align 3
+
+.L20: // INCX!=1: strided path, 8 elements per vector iteration
+    srai.d I, N, 3 // I = N / 8
+    bge $r0, I, .L23 // fewer than 8 elements: scalar loop only
+    .align 3
+
+.L21: // strided main loop: gather 8 elements and fold them into the running minimum VM0
+#ifdef DOUBLE
+    ld.d t1, X, 0 // load the raw 64-bit pattern of each strided element
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0 // pack elements 0..3 into VX0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0 // pack elements 4..7 into VX1
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    XVFMIN    VM1, VX0, VX1 // signed minimum, same operation as the unit-stride loop
+    XVFMIN    VM0, VM0, VM1 // fold into the accumulator
+#else
+    ld.w t1, X, 0 // load the raw 32-bit pattern of each strided element
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VM1, t1, 0 // pack elements 0..3 into the low half of VM1
+    xvinsgr2vr.w VM1, t2, 1
+    xvinsgr2vr.w VM1, t3, 2
+    xvinsgr2vr.w VM1, t4, 3
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VM1, t1, 4 // pack elements 4..7 into the high half of VM1
+    xvinsgr2vr.w VM1, t2, 5
+    xvinsgr2vr.w VM1, t3, 6
+    xvinsgr2vr.w VM1, t4, 7
+    XVFMIN    VM0, VM0, VM1 // signed minimum, same operation as the unit-stride loop
+#endif
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22: // horizontal reduction of VM0 after the strided loop
+#ifdef DOUBLE
+    xvrepl128vei.d VX0, VM0, 0 // within each 128-bit half: broadcast element 0 ...
+    xvrepl128vei.d VX1, VM0, 1 // ... and element 1
+    XVFMIN    VM0, VX0, VX1
+#else
+    xvrepl128vei.w VX0, VM0, 0 // within each 128-bit half: broadcast elements 0..3
+    xvrepl128vei.w VX1, VM0, 1
+    xvrepl128vei.w VX2, VM0, 2
+    xvrepl128vei.w VX3, VM0, 3
+    XVFMIN    VM1, VX0, VX1
+    XVFMIN    VM2, VX2, VX3
+    XVFMIN    VM0, VM1, VM2
+#endif
+    xvpermi.q VM1, VM0, 1 // bring the upper 128-bit half of VM0 into the low half of VM1
+    XVFMIN    VM0, VM0, VM1 // combine both halves; $f0 now holds the minimum so far
+    .align 3
+
+.L23: // INCX!=1: handle the N & 7 elements left over after the 8-wide loop
+    andi I, N, 7 // I = number of leftover elements
+    bge $r0, I, .L999 // none left: return
+    .align 3
+
+.L24: /* scalar loop over the remaining 1..7 strided elements */
+    LD   $f1, X, 0
+    addi.d  I, I, -1
+    add.d   X, X, INCX // INCX is already a byte stride
+    FMIN  $f0, $f0, $f1 // $f0 is the low element of VM0, i.e. the running minimum
+    bnez    I, .L24
+    .align 3
+
+.L999: // common return; also reached directly when N <= 0 or INCX <= 0
+    jirl $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/min_lsx.S b/kernel/loongarch64/min_lsx.S
new file mode 100644
index 000000000..1bc32a0ed
--- /dev/null
+++ b/kernel/loongarch64/min_lsx.S
@@ -0,0 +1,228 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N     $r4 /* element count (first integer argument register) */
+#define X     $r5 /* pointer to the current element of the input vector */
+#define INCX  $r6 /* stride; converted to bytes in the prologue */
+
+#define I     $r12 /* loop counter */
+#define TEMP  $r13 /* byte size of one element, used for the unit-stride test */
+
+#define VM0 $vr0 /* running minimum; $f0 (the return value) is its lowest element */
+#define VM1 $vr1 /* partial minima */
+#define VM2 $vr2
+#define VX0 $vr3 /* input data registers */
+#define VX1 $vr4
+#define VX2 $vr5
+#define VX3 $vr6
+
+#define t1 $r14 /* integer scratch used to gather strided elements */
+#define t2 $r15
+#define t3 $r16
+#define t4 $r17
+
+    PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT   N,     0(N) // Fortran interface: N and INCX arrive by reference
+    LDINT   INCX,  0(INCX)
+#endif
+
+    bge $r0, N, .L999 // N <= 0: return without touching X
+    bge $r0, INCX, .L999 // INCX <= 0: return without touching X
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, BASE_SHIFT // TEMP = 1 << BASE_SHIFT (presumably the element size in bytes)
+    slli.d INCX, INCX, BASE_SHIFT // INCX scaled by the same shift, i.e. a byte stride
+#ifdef DOUBLE
+    vldrepl.d VM0, X, 0 // seed every lane of the running minimum with X[0]
+#else
+    vldrepl.w VM0, X, 0 // seed every lane of the running minimum with X[0]
+#endif
+    bne INCX, TEMP, .L20 // non-unit stride: take the gather path
+
+    srai.d I, N, 3 // unit stride: I = N / 8 full vector iterations
+    bge $r0, I, .L11 // fewer than 8 elements: scalar loop only
+    .align 3
+
+.L10: // unit-stride main loop: 8 elements per iteration
+#ifdef DOUBLE
+    vld VX0, X, 0 // four 16-byte loads = 8 doubles
+    vld VX1, X, 16
+    vld VX2, X, 32
+    vld VX3, X, 48
+    addi.d I, I, -1
+    addi.d X, X, 64 // advance by the 64 bytes just loaded
+    VFMIN    VM1, VX0, VX1
+    VFMIN    VM2, VX2, VX3
+    VFMIN    VM0, VM0, VM1 // fold both partial minima into the accumulator
+    VFMIN    VM0, VM0, VM2
+#else
+    vld VX0, X, 0 // two 16-byte loads = 8 floats
+    vld VX1, X, 16
+    addi.d I, I, -1
+    addi.d X, X, 32 // advance by the 32 bytes just loaded
+    VFMIN    VM1, VX0, VX1
+    VFMIN    VM0, VM0, VM1 // fold into the accumulator
+#endif
+    blt $r0, I, .L10
+
+#ifdef DOUBLE
+    vreplvei.d VX0, VM0, 0 // horizontal reduction: broadcast element 0 ...
+    vreplvei.d VX1, VM0, 1 // ... and element 1
+    VFMIN    VM0, VX0, VX1 // every element (including $f0) holds the minimum so far
+#else
+    vreplvei.w VX0, VM0, 0 // horizontal reduction: broadcast elements 0..3
+    vreplvei.w VX1, VM0, 1
+    vreplvei.w VX2, VM0, 2
+    vreplvei.w VX3, VM0, 3
+    VFMIN    VM1, VX0, VX1
+    VFMIN    VM2, VX2, VX3
+    VFMIN    VM0, VM1, VM2 // every element (including $f0) holds the minimum so far
+#endif
+    .align 3
+
+.L11: // unit stride: handle the N & 7 elements left after the vector loop
+    andi I, N, 7 // I = number of leftover elements
+    bge $r0, I, .L13 // none left: return
+    .align 3
+
+.L12: // scalar loop over the remaining contiguous elements
+    LD   $f1, X, 0
+    addi.d  I, I, -1
+    addi.d  X, X, SIZE // step one element forward
+    FMIN  $f0, $f0, $f1 // $f0 is the low element of VM0, i.e. the running minimum
+    bnez    I, .L12
+    .align 3
+
+.L13: // return for the unit-stride path; result in $f0
+    jirl $r0,  $r1, 0x0
+    .align 3
+
+.L20: // INCX!=1: strided path, 8 elements per vector iteration
+    srai.d I, N, 3 // I = N / 8
+    bge $r0, I, .L23 // fewer than 8 elements: scalar loop only
+    .align 3
+
+.L21: // strided main loop: gather 8 elements and fold them into the running minimum VM0
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE // load the raw 64-bit pattern of each strided element
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0 // elements 0,1 -> VX0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0 // elements 2,3 -> VX1
+    vinsgr2vr.d VX1, t4, 1
+    VFMIN    VM1, VX0, VX1 // signed minimum of elements 0..3
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0 // elements 4,5 -> VX0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0 // elements 6,7 -> VX1
+    vinsgr2vr.d VX1, t4, 1
+    VFMIN    VM2, VX0, VX1 // signed minimum of elements 4..7
+    VFMIN    VM1, VM1, VM2 // combine both halves of the block
+    VFMIN    VM0, VM0, VM1 // fold into the accumulator
+#else
+    ld.w t1, X, 0 // load the raw 32-bit pattern of each strided element
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0 // elements 0..3 -> VX0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0 // elements 4..7 -> VX1
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    VFMIN    VM1, VX0, VX1 // signed minimum of the block
+    VFMIN    VM0, VM0, VM1 // fold into the accumulator
+#endif
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22: // horizontal reduction of VM0 after the strided loop
+#ifdef DOUBLE
+    vreplvei.d VX0, VM0, 0 // broadcast element 0 ...
+    vreplvei.d VX1, VM0, 1 // ... and element 1
+    VFMIN    VM0, VX0, VX1 // every element (including $f0) holds the minimum so far
+#else
+    vreplvei.w VX0, VM0, 0 // broadcast elements 0..3
+    vreplvei.w VX1, VM0, 1
+    vreplvei.w VX2, VM0, 2
+    vreplvei.w VX3, VM0, 3
+    VFMIN    VM1, VX0, VX1
+    VFMIN    VM2, VX2, VX3
+    VFMIN    VM0, VM1, VM2 // every element (including $f0) holds the minimum so far
+#endif
+    .align 3
+
+.L23: // INCX!=1: handle the N & 7 elements left over after the 8-wide loop
+    andi I, N, 7 // I = number of leftover elements
+    bge $r0, I, .L999 // none left: return
+    .align 3
+
+.L24: // scalar loop over the remaining strided elements
+    LD   $f1, X, 0
+    addi.d  I, I, -1
+    add.d   X, X, INCX // INCX is already a byte stride
+    FMIN  $f0, $f0, $f1 // $f0 is the low element of VM0, i.e. the running minimum
+    bnez    I, .L24
+    .align 3
+
+.L999: // common return; also reached directly when N <= 0 or INCX <= 0
+    jirl $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/rot_lasx.S b/kernel/loongarch64/rot_lasx.S
new file mode 100644
index 000000000..71378e0b2
--- /dev/null
+++ b/kernel/loongarch64/rot_lasx.S
@@ -0,0 +1,1602 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4 /* element count */
+#define X      $r5 /* read pointer into x */
+#define INCX   $r6 /* x stride; converted to bytes in the prologue */
+#define Y      $r7 /* read pointer into y */
+#define INCY   $r8 /* y stride; converted to bytes in the prologue */
+#define C      $f0 /* rotation coefficient c */
+#define S      $f1 /* rotation coefficient s */
+
+#define I      $r12 /* loop counter */
+#define TEMP   $r13 /* byte size of one element, used for the unit-stride tests */
+#define t1     $r14 /* integer scratch for gathers and broadcasts */
+#define t2     $r16 /* note: t2 and t3 are not in register-number order */
+#define t3     $r15
+#define t4     $r17
+#define XX     $r18 /* write pointer into x when x is strided */
+#define YY     $r19 /* write pointer into y when y is strided */
+#define a1     $f12 /* scalar 0.0, compared against C and S */
+#define VX0    $xr8 /* x data */
+#define VX1    $xr20
+#define VX2    $xr21 /* y data */
+#define VX3    $xr22
+#define VT0    $xr10 /* new x values */
+#define VT1    $xr18 /* new y values */
+#define VXC    $xr23 /* c broadcast to every lane */
+#define VXS    $xr9 /* s broadcast to every lane */
+#define VXZ    $xr19 /* 0.0 broadcast to every lane */
+
+    PROLOGUE
+
+    bge $r0, N, .L999 // N <= 0: nothing to rotate
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0 // a1 = integer 0 ...
+    FFINT a1, a1 // ... converted to floating-point 0.0
+    slli.d  TEMP, TEMP, BASE_SHIFT // TEMP = 1 << BASE_SHIFT (presumably the element size in bytes)
+    slli.d  INCX, INCX, BASE_SHIFT // strides scaled by the same shift, i.e. byte strides
+    slli.d  INCY, INCY, BASE_SHIFT
+#ifdef DOUBLE
+    movfr2gr.d t1, C // broadcast c, s and 0.0 to all lanes via integer registers
+    xvreplgr2vr.d VXC, t1
+    movfr2gr.d t2, S
+    xvreplgr2vr.d VXS, t2
+    movfr2gr.d t3, a1
+    xvreplgr2vr.d VXZ, t3
+#else
+    movfr2gr.s t1, C // broadcast c, s and 0.0 to all lanes via integer registers
+    xvreplgr2vr.w VXC, t1
+    movfr2gr.s t2, S
+    xvreplgr2vr.w VXS, t2
+    movfr2gr.s t3, a1
+    xvreplgr2vr.w VXZ, t3
+#endif
+    srai.d I, N, 3 // I = N / 8: every vector loop handles 8 elements per iteration
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11: // INCX==1 and INCY==1: pick the loop specialised for zero/non-zero C and S
+    bge $r0, I, .L997 // fewer than 8 elements: skip the vector loops
+    CMPEQ $fcc0, C, a1 // C == 0.0 ?
+    bcnez $fcc0, .L110
+    CMPEQ $fcc0, S, a1 // S == 0.0 ?
+    bcnez $fcc0, .L112 // C!=0 S==0
+    b .L111 // C!=0 S!=0
+    .align 3
+
+.L110: // C == 0: only S remains to be tested
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L114 // C==0 S==0
+    b .L113 // C==0 S!=0
+    .align 3
+
+.L111: // C!=0 S!=0, both vectors contiguous: x' = c*x + s*y, y' = c*y - s*x
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
+    xvld VX1, X, 4 * SIZE // doubles need a second 32-byte register for elements 4..7
+    xvld VX3, Y, 4 * SIZE
+#endif
+    XVMUL VT0, VX0, VXC // c*x
+    XVFMADD VT0, VX2, VXS, VT0 // y*s + c*x
+    XVMUL VT1, VX0, VXS // s*x
+    XVMSUB VT1, VX2, VXC, VT1 // y*c - s*x
+    xvst VT0, X, 0 * SIZE
+    xvst VT1, Y, 0 * SIZE
+#ifdef DOUBLE
+    XVMUL VT0, VX1, VXC // same computation for elements 4..7
+    XVFMADD VT0, VX3, VXS, VT0
+    XVMUL VT1, VX1, VXS
+    XVMSUB VT1, VX3, VXC, VT1
+    xvst VT0, X, 4 * SIZE
+    xvst VT1, Y, 4 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L111
+    b .L997 // continue at .L997 (outside this excerpt; presumably the scalar remainder)
+    .align 3
+
+.L112: // C!=0 S==0, both vectors contiguous: x' = c*x, y' = c*y
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
+    xvld VX1, X, 4 * SIZE // doubles need a second 32-byte register for elements 4..7
+    xvld VX3, Y, 4 * SIZE
+#endif
+    XVMUL VT0, VX0, VXC // c*x
+    XVMUL VT1, VX2, VXC // c*y
+    xvst VT0, X, 0 * SIZE
+    xvst VT1, Y, 0 * SIZE
+#ifdef DOUBLE
+    XVMUL VT0, VX1, VXC // same computation for elements 4..7
+    XVMUL VT1, VX3, VXC
+    xvst VT0, X, 4 * SIZE
+    xvst VT1, Y, 4 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L112
+    b .L997 // continue at .L997 (outside this excerpt; presumably the scalar remainder)
+    .align 3
+
+.L113: // C==0 S!=0, both vectors contiguous: x' = s*y, y' = -(s*x)
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
+    xvld VX1, X, 4 * SIZE // doubles need a second 32-byte register for elements 4..7
+    xvld VX3, Y, 4 * SIZE
+#endif
+    XVMUL VT0, VX2, VXS // s*y
+    XVMUL VT1, VX0, VXS // s*x
+    XVFSUB VT1, VXZ, VT1 // 0 - s*x
+    xvst VT0, X, 0 * SIZE
+    xvst VT1, Y, 0 * SIZE
+#ifdef DOUBLE
+    XVMUL VT0, VX3, VXS // same computation for elements 4..7
+    XVMUL VT1, VX1, VXS
+    xvfsub.d VT1, VXZ, VT1
+    xvst VT0, X, 4 * SIZE
+    xvst VT1, Y, 4 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L113
+    b .L997 // continue at .L997 (outside this excerpt; presumably the scalar remainder)
+    .align 3
+
+.L114: // C==0 S==0, both vectors contiguous: overwrite x and y with zeros
+    xvst VXZ, X, 0 * SIZE
+    xvst VXZ, Y, 0 * SIZE
+#ifdef DOUBLE
+    xvst VXZ, X, 4 * SIZE // doubles need a second 32-byte store for elements 4..7
+    xvst VXZ, Y, 4 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L114
+    b .L997 // continue at .L997 (outside this excerpt; presumably the scalar remainder)
+    .align 3
+
+.L12: // INCX==1 and INCY!=1
+    bge $r0, I, .L997 // fewer than 8 elements: skip the vector loops
+    move YY, Y // YY trails Y as the write pointer for strided y
+    move XX, X
+    CMPEQ $fcc0, C, a1 // C == 0.0 ?
+    bcnez $fcc0, .L120
+    CMPEQ $fcc0, S, a1 // S == 0.0 ?
+    bcnez $fcc0, .L122 // C!=0 S==0
+    b .L121 // C!=0 S!=0
+    .align 3
+
+.L120: // C == 0: only S remains to be tested
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L124 // C==0 S==0
+    b .L123 // C==0 S!=0
+    .align 3
+
+.L121: // C!=0 S!=0, x contiguous, y strided: x' = c*x + s*y, y' = c*y - s*x
+    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    ld.d  t1, Y, 0 * SIZE // gather y[0..3] through integer registers
+    add.d Y, Y, INCY
+    ld.d  t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d  t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d  t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX2, t1, 0
+    xvinsgr2vr.d VX2, t2, 1
+    xvinsgr2vr.d VX2, t3, 2
+    xvinsgr2vr.d VX2, t4, 3
+    add.d Y, Y, INCY
+#else
+    ld.w  t1, Y, 0 * SIZE // gather y[0..7] through integer registers
+    add.d Y, Y, INCY
+    ld.w  t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w  t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w  t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+#endif
+    XVMUL VT0, VX0, VXC // c*x
+    XVFMADD VT0, VX2, VXS, VT0 // y*s + c*x
+    XVMUL VT1, VX0, VXS // s*x
+    XVMSUB VT1, VX2, VXC, VT1 // y*c - s*x
+
+#ifdef DOUBLE
+    xvld VX1, X, 4 * SIZE // fetch x[4..7] before x[0..3] is overwritten
+    xvst VT0, X, 0 * SIZE
+    xvstelm.d VT1, YY, 0, 0 // scatter the new y[0..3] through the write pointer
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    ld.d t1, Y, 0 * SIZE // gather y[4..7]
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX3, t1, 0
+    xvinsgr2vr.d VX3, t2, 1
+    xvinsgr2vr.d VX3, t3, 2
+    xvinsgr2vr.d VX3, t4, 3
+    add.d Y, Y, INCY
+    XVMUL VT0, VX1, VXC // same rotation for elements 4..7
+    XVFMADD VT0, VX3, VXS, VT0
+    XVMUL VT1, VX1, VXS
+    XVMSUB VT1, VX3, VXC, VT1
+    xvst VT0, X, 4 * SIZE
+    xvstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 3
+#else
+    xvst VT0, X, 0 * SIZE
+    xvstelm.w VT1, YY, 0, 0 // scatter the new y[0..7] through the write pointer
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 7
+
+#endif
+    add.d YY, YY, INCY // step past the last element stored in either branch
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    b .L997 // continue at .L997 (outside this excerpt; presumably the scalar remainder)
+    .align 3
+
+.L122: // C!=0 S==0, x contiguous, y strided: x' = c*x, y' = c*y
+#ifdef DOUBLE
+    xvld VX0, X, 0 * SIZE
+    ld.d t1, Y, 0 * SIZE // gather y[0..3] through integer registers
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX2, t1, 0
+    xvinsgr2vr.d VX2, t2, 1
+    xvinsgr2vr.d VX2, t3, 2
+    xvinsgr2vr.d VX2, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VT0, VX0, VXC // c*x
+    xvfmul.d VT1, VX2, VXC // c*y
+    xvld VX1, X, 4 * SIZE // fetch x[4..7]
+    xvst VT0, X, 0 * SIZE
+    xvstelm.d VT1, YY, 0, 0 // scatter the new y[0..3] through the write pointer
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    ld.d t1, Y, 0 * SIZE // gather y[4..7]
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX3, t1, 0
+    xvinsgr2vr.d VX3, t2, 1
+    xvinsgr2vr.d VX3, t3, 2
+    xvinsgr2vr.d VX3, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VT0, VX1, VXC // same scaling for elements 4..7
+    xvfmul.d VT1, VX3, VXC
+    // I is decremented exactly once per 8-element block, after the #endif
+    xvst VT0, X, 4 * SIZE
+    xvstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 3
+#else
+    xvld VX0, X, 0 * SIZE
+    ld.w t1, Y, 0 * SIZE // gather y[0..7] through integer registers
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s VT0, VX0, VXC // c*x
+    xvfmul.s VT1, VX2, VXC // c*y
+    xvst VT0, X, 0 * SIZE
+    xvstelm.w VT1, YY, 0, 0 // scatter the new y[0..7] through the write pointer
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 7
+#endif
+    add.d YY, YY, INCY // step past the last element stored in either branch
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1 // single per-iteration decrement shared by both precisions
+    blt $r0, I, .L122
+    b .L997 // continue at .L997 (outside this excerpt; presumably the scalar remainder)
+    .align 3
+
+.L123: // C==0 S!=0, x contiguous, y strided: x' = s*y, y' = -(s*x)
+#ifdef DOUBLE
+    xvld VX0, X, 0 * SIZE
+    ld.d t1, Y, 0 * SIZE // gather y[0..3] through integer registers
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX2, t1, 0
+    xvinsgr2vr.d VX2, t2, 1
+    xvinsgr2vr.d VX2, t3, 2
+    xvinsgr2vr.d VX2, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VT0, VX2, VXS // s*y
+    xvfmul.d VT1, VX0, VXS // s*x
+    xvfsub.d VT1, VXZ, VT1 // 0 - s*x
+    xvst VT0, X, 0 * SIZE
+    xvstelm.d VT1, YY, 0, 0 // scatter the new y[0..3] through the write pointer
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    xvld VX1, X, 4 * SIZE // fetch x[4..7]
+    ld.d t1, Y, 0 * SIZE // gather y[4..7]
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX3, t1, 0
+    xvinsgr2vr.d VX3, t2, 1
+    xvinsgr2vr.d VX3, t3, 2
+    xvinsgr2vr.d VX3, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VT0, VX3, VXS // same computation for elements 4..7
+    xvfmul.d VT1, VX1, VXS
+    xvfsub.d VT1, VXZ, VT1
+    // I is decremented exactly once per 8-element block, after the #endif
+    xvst VT0, X, 4 * SIZE
+    xvstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 3
+#else
+    xvld VX0, X, 0 * SIZE
+    ld.w t1, Y, 0 * SIZE // gather y[0..7] through integer registers
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s VT0, VX2, VXS // s*y
+    xvfmul.s VT1, VX0, VXS // s*x
+    xvfsub.s VT1, VXZ, VT1 // 0 - s*x
+    xvst VT0, X, 0 * SIZE
+    xvstelm.w VT1, YY, 0, 0 // scatter the new y[0..7] through the write pointer
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 7
+#endif
+    add.d YY, YY, INCY // step past the last element stored in either branch
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1 // single per-iteration decrement shared by both precisions
+    blt $r0, I, .L123
+    b .L997 // continue at .L997 (outside this excerpt; presumably the scalar remainder)
+    .align 3
+
+.L124: // C==0 S==0, x contiguous, y strided: overwrite 8 elements of x and y with zeros
+    xvst VXZ, X, 0 * SIZE // NOTE(review): the same store is repeated in both branches below
+#ifdef DOUBLE
+    xvst VXZ, X, 0 * SIZE
+    xvst VXZ, X, 4 * SIZE
+    xvstelm.d VXZ, YY, 0, 0 // zero y[0..3] one strided element at a time
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 0 // zero y[4..7]
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 3
+#else
+    xvst VXZ, X, 0 * SIZE
+    xvstelm.w VXZ, YY, 0, 0 // zero y[0..7] one strided element at a time
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 7
+#endif
+    add.d YY, YY, INCY // step past the last element stored in either branch
+    addi.d  I, I, -1
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L124
+    move Y, YY // Y was never advanced in this loop; resynchronise it with the write pointer
+    b .L997 // continue at .L997 (outside this excerpt; presumably the scalar remainder)
+    .align 3
+
+.L21:// INCX!=1 and INCY==1
+    bge $r0, I, .L997 // fewer than 8 elements: skip the vector loops
+    move XX, X // XX trails X as the write pointer for strided x
+    CMPEQ $fcc0, C, a1 // C == 0.0 ?
+    bcnez $fcc0, .L210
+    CMPEQ $fcc0, S, a1 // S == 0.0 ?
+    bcnez $fcc0, .L212 // C!=0 S==0
+    b .L211 // C!=0 S!=0
+    .align 3
+
+.L210: // C == 0: only S remains to be tested
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L214 // C==0 S==0
+    b .L213 // C==0 S!=0
+    .align 3
+
+.L211: // C!=0 S!=0, x strided, y contiguous: x' = c*x + s*y, y' = c*y - s*x
+#ifdef DOUBLE
+    xvld VX2, Y, 0 * SIZE
+    ld.d t1, X, 0 * SIZE // gather x[0..3] through integer registers
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    add.d X, X, INCX
+    xvfmul.d VT0, VXC, VX0 // c*x
+    xvfmadd.d VT0, VX2, VXS, VT0 // y*s + c*x
+    xvfmul.d VT1, VXS, VX0 // s*x
+    xvfmsub.d VT1, VX2, VXC, VT1 // y*c - s*x
+    xvstelm.d VT0, XX, 0, 0 // scatter the new x[0..3] through the write pointer
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvst VT1, Y, 0 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    ld.d t1, X, 0 * SIZE // gather x[4..7]
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    add.d X, X, INCX
+    xvfmul.d VT0, VX1, VXC // same rotation for elements 4..7
+    xvfmadd.d VT0, VX3, VXS, VT0
+    xvfmul.d VT1, VX1, VXS
+    xvfmsub.d VT1, VX3, VXC, VT1
+    xvstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvst VT1, Y, 4 * SIZE
+#else
+    xvld VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE // gather x[0..7] through integer registers
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    add.d X, X, INCX
+    xvfmul.s VT0, VXC, VX0 // c*x
+    xvfmadd.s VT0, VX2, VXS, VT0 // y*s + c*x
+    xvfmul.s VT1, VX0, VXS // s*x
+    xvfmsub.s VT1, VX2, VXC, VT1 // y*c - s*x
+    xvstelm.w VT0, XX, 0, 0 // scatter the new x[0..7] through the write pointer
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 4
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 5
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 6
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 7
+    add.d XX, XX, INCX
+    xvst VT1, Y, 0 * SIZE
+#endif
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    b .L997 // continue at .L997 (outside this excerpt; presumably the scalar remainder)
+    .align 3
+
+.L212: // C!=0 S==0, x strided, y contiguous: x' = c*x, y' = c*y
+#ifdef DOUBLE
+    xvld VX2, Y, 0 * SIZE
+    ld.d t1, X, 0 * SIZE // gather x[0..3] through integer registers
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    add.d X, X, INCX
+    xvfmul.d VT0, VXC, VX0 // c*x
+    xvstelm.d VT0, XX, 0, 0 // scatter the new x[0..3] through the write pointer
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvfmul.d VT1, VX2, VXC // c*y
+    xvst VT1, Y, 0 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    ld.d t1, X, 0 * SIZE // gather x[4..7]
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    add.d X, X, INCX
+    xvfmul.d VT0, VX1, VXC // c*x for elements 4..7
+    xvstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvfmul.d VT1, VX3, VXC // c*y for elements 4..7 (scale by C, as for elements 0..3)
+    xvst VT1, Y, 4 * SIZE
+#else
+    xvld VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE // gather x[0..7] through integer registers
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    add.d X, X, INCX
+    xvfmul.s VT0, VXC, VX0 // c*x
+    xvfmul.s VT1, VX2, VXC // c*y
+    xvstelm.w VT0, XX, 0, 0 // scatter the new x[0..7] through the write pointer
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 4
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 5
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 6
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 7
+    add.d XX, XX, INCX
+    xvst VT1, Y, 0 * SIZE
+#endif
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L212
+    b .L997 // continue at .L997 (outside this excerpt; presumably the scalar remainder)
+    .align 3
+
+.L213: // C==0 S!=0: 8 elements per pass; X is strided (read via X, written via XX), Y is contiguous
+#ifdef DOUBLE
+    xvld VX2, Y, 0 * SIZE // VX2 = Y[0..3]
+    ld.d t1, X, 0 * SIZE // gather four strided X elements through t1..t4
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    add.d X, X, INCX
+    xvfmul.d VT0, VXS, VX2 // VT0 = VXS * Y values, stored to X below
+    xvfmul.d VT1, VXS, VX0
+    xvfsub.d VT1, VXZ, VT1 // VT1 = VXZ - VXS * X values (VXZ presumably all zeros -- set up outside this chunk)
+    xvstelm.d VT0, XX, 0, 0 // scatter VT0 to the strided X positions
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvst VT1, Y, 0 * SIZE
+    xvld VX3, Y, 4 * SIZE // second group of four: VX3 = Y[4..7]
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    add.d X, X, INCX
+    xvfmul.d VT0, VX3, VXS
+    xvfmul.d VT1, VX1, VXS
+    xvfsub.d VT1, VXZ, VT1
+    xvstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvst VT1, Y, 4 * SIZE
+#else // single precision: one 8-lane vector covers the whole pass
+    xvld VX2, Y, 0 * SIZE // VX2 = Y[0..7]
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    add.d X, X, INCX
+    xvfmul.s VT0, VXS, VX2
+    xvfmul.s VT1, VXS, VX0
+    xvfsub.s VT1, VXZ, VT1
+    xvstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 4
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 5
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 6
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 7
+    add.d XX, XX, INCX
+    xvst VT1, Y, 0 * SIZE
+#endif
+    addi.d Y, Y, 8 * SIZE // Y is contiguous: step past the 8 elements just written
+    addi.d  I, I, -1
+    blt $r0, I, .L213 // repeat while I > 0
+    b .L997 // then handle the leftover elements
+    .align 3
+
+.L214: // C==0 S==0: both outputs are zero, so store VXZ (expected all zeros) to 8 X and 8 Y elements per pass
+    xvst VXZ, Y, 0 * SIZE // Y is contiguous: 32 bytes = Y[0..3] (double) or Y[0..7] (single)
+#ifdef DOUBLE
+    xvstelm.d VXZ, XX, 0, 0 // X is strided: scatter one element at a time through XX
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, XX, 0, 3
+    add.d XX, XX, INCX
+    xvst VXZ, Y, 4 * SIZE // second half of the pass: Y[4..7]
+#else
+    xvstelm.w VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 4
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 5
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 6
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 7
+    add.d XX, XX, INCX
+#endif
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L214 // loop on this case (was .L211, the C!=0 S!=0 loop)
+    move X, XX // only XX advanced above; sync X so the tail at .L997 continues at the right element (assumes XX == X on entry, as in the sibling loops)
+    b .L997
+    .align 3
+
+.L22: // both X and Y strided: pick the loop specialised for which of C / S are zero
+    bge $r0, I, .L997 // I <= 0: no unrolled pass to run, go straight to the tail
+    move YY, Y // YY / XX are the store cursors; Y / X stay the load cursors
+    move XX, X
+    CMPEQ $fcc0, C, a1 // compare C with a1 (presumably 0.0, set up outside this chunk)
+    bcnez $fcc0, .L220
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L222 // C!=0 S==0
+    b .L221 // C!=0 S!=0
+    .align 3
+
+.L220: // reached when the C comparison above set $fcc0
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L224 // C==0 S==0
+    b .L223 // C==0 S!=0
+    .align 3
+
+.L221: // C!=0 S!=0: full rotation, 8 elements per pass; X and Y both strided (loads via X/Y, stores via XX/YY)
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE // gather X elements 0..3 into VX0
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, Y, 0 * SIZE // gather Y elements 0..3 into VX2
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX2, t1, 0
+    xvinsgr2vr.d VX2, t2, 1
+    xvinsgr2vr.d VX2, t3, 2
+    xvinsgr2vr.d VX2, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VT0, VX0, VXC
+    xvfmadd.d VT0, VX2, VXS, VT0 // VT0 = VX2 * VXS + VX0 * VXC, stored to X
+    xvfmul.d VT1, VX0, VXS
+    xvfmsub.d VT1, VX2, VXC, VT1 // VT1 = VX2 * VXC - VX0 * VXS, stored to Y
+    xvstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    ld.d t1, X, 0 * SIZE // second group: X elements 4..7 into VX1
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    ld.d t1, Y, 0 * SIZE // Y elements 4..7 into VX3
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX3, t1, 0
+    xvinsgr2vr.d VX3, t2, 1
+    xvinsgr2vr.d VX3, t3, 2
+    xvinsgr2vr.d VX3, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VT0, VX1, VXC
+    xvfmadd.d VT0, VX3, VXS, VT0
+    xvfmul.d VT1, VX1, VXS
+    xvfmsub.d VT1, VX3, VXC, VT1
+    xvstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 3
+    add.d YY, YY, INCY
+#else // single precision: all 8 elements fit in one vector
+    ld.w t1, X, 0 * SIZE // gather X elements 0..7 into VX0
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    ld.w t1, Y, 0 * SIZE // gather Y elements 0..7 into VX2
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s  VT0, VX0, VXC
+    xvfmadd.s VT0, VX2, VXS, VT0 // VT0 = VX2 * VXS + VX0 * VXC, stored to X
+    xvfmul.s  VT1, VX0, VXS
+    xvfmsub.s VT1, VX2, VXC, VT1 // VT1 = VX2 * VXC - VX0 * VXS, stored to Y
+    xvstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 4
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 5
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 6
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 7
+    add.d XX, XX, INCX
+    xvstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 7
+    add.d YY, YY, INCY
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L221 // repeat while I > 0
+    b .L997 // then handle the leftover elements
+    .align 3
+
+.L222: // C!=0 S==0: the S terms are dropped, so X and Y are each just multiplied by VXC; 8 strided elements per pass
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE // gather X elements 0..3 into VX0
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, Y, 0 * SIZE // gather Y elements 0..3 into VX2
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX2, t1, 0
+    xvinsgr2vr.d VX2, t2, 1
+    xvinsgr2vr.d VX2, t3, 2
+    xvinsgr2vr.d VX2, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VT0, VX0, VXC // VT0 = X values * VXC
+    xvfmul.d VT1, VX2, VXC // VT1 = Y values * VXC
+    xvstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    ld.d t1, X, 0 * SIZE // second group: X elements 4..7 into VX1
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    ld.d t1, Y, 0 * SIZE // Y elements 4..7 into VX3
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX3, t1, 0
+    xvinsgr2vr.d VX3, t2, 1
+    xvinsgr2vr.d VX3, t3, 2
+    xvinsgr2vr.d VX3, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VT0, VX1, VXC
+    xvfmul.d VT1, VX3, VXC
+    xvstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 3
+    add.d YY, YY, INCY
+#else // single precision: all 8 elements fit in one vector
+    ld.w t1, X, 0 * SIZE // gather X elements 0..7 into VX0
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    ld.w t1, Y, 0 * SIZE // gather Y elements 0..7 into VX2
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s VT0, VX0, VXC // VT0 = X values * VXC
+    xvfmul.s VT1, VX2, VXC // VT1 = Y values * VXC
+    xvstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 4
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 5
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 6
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 7
+    add.d XX, XX, INCX
+    xvstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 7
+    add.d YY, YY, INCY
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L222 // repeat while I > 0
+    b .L997 // then handle the leftover elements
+    .align 3
+
+.L223: // C==0 S!=0: new X = VXS * Y, new Y = VXZ - VXS * X; 8 strided elements per pass (loads via X/Y, stores via XX/YY)
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE // gather X elements 0..3 into VX0
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, Y, 0 * SIZE // gather Y elements 0..3 into VX2
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX2, t1, 0
+    xvinsgr2vr.d VX2, t2, 1
+    xvinsgr2vr.d VX2, t3, 2
+    xvinsgr2vr.d VX2, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VT0, VX2, VXS
+    xvfmul.d VT1, VX0, VXS
+    xvfsub.d VT1, VXZ, VT1 // VXZ is expected to hold zeros, so this negates VXS * X
+    xvstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    ld.d t1, X, 0 * SIZE // second group: X elements 4..7 into VX1
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    ld.d t1, Y, 0 * SIZE // Y elements 4..7 into VX3
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    xvinsgr2vr.d VX3, t1, 0
+    xvinsgr2vr.d VX3, t2, 1
+    xvinsgr2vr.d VX3, t3, 2
+    xvinsgr2vr.d VX3, t4, 3
+    add.d Y, Y, INCY
+    xvfmul.d VT0, VX3, VXS
+    xvfmul.d VT1, VX1, VXS // must use VX1 (X elements 4..7); VX0 still holds elements 0..3
+    xvfsub.d VT1, VXZ, VT1
+    xvstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VT1, YY, 0, 3
+    add.d YY, YY, INCY
+#else // single precision: all 8 elements fit in one vector
+    ld.w t1, X, 0 * SIZE // gather X elements 0..7 into VX0
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    ld.w t1, Y, 0 * SIZE // gather Y elements 0..7 into VX2
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s VT0, VX2, VXS
+    xvfmul.s VT1, VX0, VXS
+    xvfsub.s VT1, VXZ, VT1
+    xvstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 4
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 5
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 6
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 7
+    add.d XX, XX, INCX
+    xvstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VT1, YY, 0, 7
+    add.d YY, YY, INCY
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L223 // repeat while I > 0
+    b .L997 // then handle the leftover elements
+    .align 3
+
+.L224: // C==0 S==0: both outputs are zero, so scatter VXZ (expected all zeros) to 8 X and 8 Y elements per pass
+#ifdef DOUBLE
+    xvstelm.d VXZ, XX, 0, 0 // X elements 0..3
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, YY, 0, 0 // Y elements 0..3
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, XX, 0, 0 // X elements 4..7
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.d VXZ, YY, 0, 0 // Y elements 4..7
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.d VXZ, YY, 0, 3
+#else
+    xvstelm.w VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, XX, 0, 4
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 5
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 6
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, XX, 0, 7
+    add.d XX, XX, INCX
+    xvstelm.w VXZ, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 7
+#endif
+    add.d YY, YY, INCY // final YY step of the pass, shared by both precisions
+    addi.d  I, I, -1
+    blt $r0, I, .L224
+    // X and Y were never advanced in this loop (only XX/YY were), so sync
+    // them for both precisions before the scalar tail at .L997 uses them.
+    move X, XX
+    move Y, YY
+    b .L997
+    .align 3
+
+.L997: // tail setup: I = N & 7, the elements left over after the 8-wide passes
+    andi I, N, 7
+    bge $r0, I, .L999 // none left: return
+    .align 3
+
+.L998: // scalar tail loop: one X/Y pair per pass, using the general formula whatever C and S are
+    LD   $f12, X, 0 * SIZE // $f12 = current X element
+    LD   $f13, Y, 0 * SIZE // $f13 = current Y element
+    MUL  $f10, $f12, C
+    MADD $f10, $f13, S, $f10 // presumably $f10 = y*S + x*C (MADD macro is defined outside this chunk)
+    ST   $f10, X, 0 * SIZE
+    addi.d I, I, -1
+    MUL  $f20, $f12, S
+    MSUB $f20, $f13, C, $f20 // presumably $f20 = y*C - x*S (MSUB macro is defined outside this chunk)
+    ST   $f20, Y, 0 * SIZE
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L998 // repeat while I > 0
+    .align 3
+
+.L999: // common exit
+    move $r4, $r12 // copy $r12 into $r4, the first integer argument/return register
+    jirl $r0, $r1, 0x0 // return: jump to the address in $r1; the link value goes to $r0 and is discarded
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/rot_lsx.S b/kernel/loongarch64/rot_lsx.S
new file mode 100644
index 000000000..3bb77aaec
--- /dev/null
+++ b/kernel/loongarch64/rot_lsx.S
@@ -0,0 +1,1790 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+#define N      $r4 // element count (loop trip count is N >> 3, tail is the remainder)
+#define X      $r5 // pointer into the first vector
+#define INCX   $r6 // X stride; scaled to bytes by BASE_SHIFT in the prologue
+#define Y      $r7 // pointer into the second vector
+#define INCY   $r8 // Y stride; scaled to bytes by BASE_SHIFT in the prologue
+#define C      $f0 // scalar C, compared against zero and broadcast into VXC
+#define S      $f1 // scalar S, compared against zero and broadcast into VXS
+
+#define I      $r12 // loop counter
+#define TEMP   $r13 // holds 1 << BASE_SHIFT, used to detect unit stride
+#define t1     $r14 // t1..t4: integer scratch for moving elements between memory and vector lanes
+#define t2     $r16
+#define t3     $r15
+#define t4     $r17
+#define XX     $r18 // copy of X taken before the strided loops
+#define YY     $r19 // store cursor for strided Y
+#define a1     $f12 // zero built in the prologue; C and S are compared against it
+#define VX0    $vr8 // VX0/VX1: X operands
+#define VX1    $vr20
+#define VX2    $vr21 // VX2/VX3: Y operands
+#define VX3    $vr22
+#define VT0    $vr10 // result destined for X
+#define VT1    $vr18 // result destined for Y
+#define VXC    $vr23 // C replicated across all lanes
+#define VXS    $vr9 // S replicated across all lanes
+#define VXZ    $vr19 // a1 replicated across all lanes
+
+    PROLOGUE
+
+    bge $r0, N, .L999 // N <= 0: nothing to do
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0 // a1 = bit pattern of integer 0
+    FFINT a1, a1 // presumably int-to-float conversion, leaving a1 = 0.0 (FFINT macro is defined elsewhere)
+    slli.d  TEMP, TEMP, BASE_SHIFT // TEMP = size of one element in bytes
+    slli.d  INCX, INCX, BASE_SHIFT // strides from elements to bytes
+    slli.d  INCY, INCY, BASE_SHIFT
+#ifdef DOUBLE
+    movfr2gr.d t1, C // broadcast C, S and the zero in a1 into VXC, VXS, VXZ
+    vreplgr2vr.d VXC, t1
+    movfr2gr.d t2, S
+    vreplgr2vr.d VXS, t2
+    movfr2gr.d t3, a1
+    vreplgr2vr.d VXZ, t3
+#else
+    movfr2gr.s t1, C
+    vreplgr2vr.w VXC, t1
+    movfr2gr.s t2, S
+    vreplgr2vr.w VXS, t2
+    movfr2gr.s t3, a1
+    vreplgr2vr.w VXZ, t3
+#endif
+    srai.d I, N, 3 // I = number of 8-element passes
+    bne INCX, TEMP, .L20 // pick the loop family by which strides equal one element
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11: // INCX==1 and INCY==1: pick the contiguous loop specialised for which of C / S are zero
+    bge $r0, I, .L997 // no full 8-element pass: go straight to the tail
+    CMPEQ $fcc0, C, a1 // compare C with the zero in a1
+    bcnez $fcc0, .L110
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L112 // C!=0 S==0
+    b .L111 // C!=0 S!=0
+    .align 3
+
+.L110: // reached when the C comparison above set $fcc0
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L114 // C==0 S==0
+    b .L113 // C==0 S!=0
+    .align 3
+
+.L111: // C!=0 S!=0: full rotation on contiguous X and Y, 8 elements per pass
+    vld VX0, X, 0 * SIZE // 128-bit loads: 2 doubles or 4 floats each
+    vld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE
+    vld VX3, Y, 2 * SIZE
+#else
+    vld VX1, X, 4 * SIZE
+    vld VX3, Y, 4 * SIZE
+#endif
+    VMUL VT0, VX0, VXC
+    VFMADD VT0, VX2, VXS, VT0 // presumably VT0 = VX2*VXS + VX0*VXC (macro defined elsewhere); stored to X
+    VMUL VT1, VX0, VXS
+    VMSUB VT1, VX2, VXC, VT1 // presumably VT1 = VX2*VXC - VX0*VXS (macro defined elsewhere); stored to Y
+    vst VT0, X, 0 * SIZE
+    vst VT1, Y, 0 * SIZE
+    VMUL VT0, VX1, VXC
+    VFMADD VT0, VX3, VXS, VT0
+    VMUL VT1, VX1, VXS
+    VMSUB VT1, VX3, VXC, VT1
+#ifdef DOUBLE
+    vst VT0, X, 2 * SIZE
+    vst VT1, Y, 2 * SIZE
+    vld VX0, X, 4 * SIZE // doubles need two more 2-lane vectors to cover elements 4..7
+    vld VX2, Y, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vld VX3, Y, 6 * SIZE
+    VMUL VT0, VX0, VXC
+    VFMADD VT0, VX2, VXS, VT0
+    VMUL VT1, VX0, VXS
+    VMSUB VT1, VX2, VXC, VT1
+#endif
+    vst VT0, X, 4 * SIZE // shared store: second float vector, or third double vector
+    vst VT1, Y, 4 * SIZE
+#ifdef DOUBLE
+    VMUL VT0, VX1, VXC
+    VFMADD VT0, VX3, VXS, VT0
+    VMUL VT1, VX1, VXS
+    VMSUB VT1, VX3, VXC, VT1
+    vst VT0, X, 6 * SIZE
+    vst VT1, Y, 6 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L111 // repeat while I > 0
+    b .L997 // then handle the leftover elements
+    .align 3
+
+.L112: // C!=0 S==0: the S terms are dropped, so X and Y are each multiplied by VXC; contiguous, 8 elements per pass
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE
+    vld VX3, Y, 2 * SIZE
+#else
+    vld VX1, X, 4 * SIZE
+    vld VX3, Y, 4 * SIZE
+#endif
+    VMUL VT0, VX0, VXC // X values times VXC
+    VMUL VT1, VX2, VXC // Y values times VXC
+    vst VT0, X, 0 * SIZE
+    vst VT1, Y, 0 * SIZE
+    VMUL VT0, VX1, VXC
+    VMUL VT1, VX3, VXC
+#ifdef DOUBLE
+    vst VT0, X, 2 * SIZE
+    vst VT1, Y, 2 * SIZE
+    vld VX0, X, 4 * SIZE // doubles need two more 2-lane vectors to cover elements 4..7
+    vld VX2, Y, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vld VX3, Y, 6 * SIZE
+    VMUL VT0, VX0, VXC
+    VMUL VT1, VX2, VXC
+#endif
+    vst VT0, X, 4 * SIZE // shared store: second float vector, or third double vector
+    vst VT1, Y, 4 * SIZE
+#ifdef DOUBLE
+    VMUL VT0, VX1, VXC
+    VMUL VT1, VX3, VXC
+    vst VT0, X, 6 * SIZE
+    vst VT1, Y, 6 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L112 // repeat while I > 0
+    b .L997 // then handle the leftover elements
+    .align 3
+
+.L113: // C==0 S!=0: new X = Y * VXS, new Y = VXZ - X * VXS; contiguous, 8 elements per pass
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE
+    vld VX3, Y, 2 * SIZE
+#else
+    vld VX1, X, 4 * SIZE
+    vld VX3, Y, 4 * SIZE
+#endif
+    VMUL VT0, VX2, VXS
+    VMUL VT1, VX0, VXS
+    VFSUB VT1, VXZ, VT1 // VXZ holds zeros, so this negates X * VXS
+    vst VT0, X, 0 * SIZE
+    vst VT1, Y, 0 * SIZE
+    VMUL VT0, VX3, VXS
+    VMUL VT1, VX1, VXS
+    VFSUB VT1, VXZ, VT1
+#ifdef DOUBLE
+    vst VT0, X, 2 * SIZE
+    vst VT1, Y, 2 * SIZE
+    vld VX0, X, 4 * SIZE // doubles need two more 2-lane vectors to cover elements 4..7
+    vld VX2, Y, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vld VX3, Y, 6 * SIZE
+    VMUL VT0, VX2, VXS
+    VMUL VT1, VX0, VXS
+    VFSUB VT1, VXZ, VT1
+#endif
+    vst VT0, X, 4 * SIZE // shared store: second float vector, or third double vector
+    vst VT1, Y, 4 * SIZE
+#ifdef DOUBLE
+    VMUL VT0, VX3, VXS
+    VMUL VT1, VX1, VXS
+    VFSUB VT1, VXZ, VT1
+    vst VT0, X, 6 * SIZE
+    vst VT1, Y, 6 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L113 // repeat while I > 0
+    b .L997 // then handle the leftover elements
+    .align 3
+
+.L114: // C==0 S==0: write the zero vector VXZ over 8 contiguous X and Y elements per pass
+    vst VXZ, X, 0 * SIZE // each vst covers 16 bytes: 2 doubles or 4 floats
+    vst VXZ, Y, 0 * SIZE
+#ifdef DOUBLE
+    vst VXZ, X, 2 * SIZE
+    vst VXZ, Y, 2 * SIZE
+#endif
+    vst VXZ, X, 4 * SIZE
+    vst VXZ, Y, 4 * SIZE
+#ifdef DOUBLE
+    vst VXZ, X, 6 * SIZE
+    vst VXZ, Y, 6 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L114 // repeat while I > 0
+    b .L997 // then handle the leftover elements
+    .align 3
+
+.L12: // INCX==1 and INCY!=1: X contiguous, Y strided; pick the loop by which of C / S are zero
+    bge $r0, I, .L997 // no full 8-element pass: go straight to the tail
+    move YY, Y // YY is the Y store cursor; Y stays the load cursor
+    move XX, X
+    CMPEQ $fcc0, C, a1 // compare C with the zero in a1
+    bcnez $fcc0, .L120
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L122 // C!=0 S==0
+    b .L121 // C!=0 S!=0
+    .align 3
+
+.L120: // reached when the C comparison above set $fcc0
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L124 // C==0 S==0
+    b .L123 // C==0 S!=0
+    .align 3
+
+.L121: // C!=0 S!=0: full rotation, 8 elements per pass; X contiguous (vld/vst), Y strided (loads via Y, stores via YY)
+#ifdef DOUBLE
+    vld VX0, X, 0 * SIZE // X[0..1]
+    ld.d  t1, Y, 0 * SIZE // gather two strided Y elements into VX2
+    add.d Y, Y, INCY
+    ld.d  t2, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+#else
+    vld VX0, X, 0 * SIZE // X[0..3]
+    ld.w  t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w  t2, Y, 0 * SIZE
+#endif
+    add.d Y, Y, INCY // shared Y step after the second load
+#ifndef DOUBLE
+    ld.w t3, Y, 0 * SIZE // floats need four Y elements per vector
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+#endif
+    VMUL VT0, VX0, VXC
+    VFMADD VT0, VX2, VXS, VT0 // presumably VT0 = VX2*VXS + VX0*VXC (macro defined elsewhere); stored to X
+    VMUL VT1, VX0, VXS
+    VMSUB VT1, VX2, VXC, VT1 // presumably VT1 = VX2*VXC - VX0*VXS (macro defined elsewhere); stored to Y
+    vst VT0, X, 0 * SIZE
+#ifdef DOUBLE
+    vstelm.d VT1, YY, 0, 0 // scatter VT1 to the strided Y positions
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vld VX0, X, 2 * SIZE // X[2..3]
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t3, 0
+    vinsgr2vr.d VX2, t4, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX0, VXC
+    VFMADD VT0, VX2, VXS, VT0
+    VMUL VT1, VX0, VXS
+    VMSUB VT1, VX2, VXC, VT1
+    vst VT0, X, 2 * SIZE
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vld VX1, X, 4 * SIZE // X[4..5]
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t1, 0
+    vinsgr2vr.d VX3, t2, 1
+#else
+    vstelm.w VT1, YY, 0, 0 // scatter VT1 to the strided Y positions
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    vld VX1, X, 4 * SIZE // X[4..7]
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+#endif
+    add.d Y, Y, INCY // shared Y step after the last load of this group
+    VMUL VT0, VX1, VXC
+    VFMADD VT0, VX3, VXS, VT0
+    VMUL VT1, VX1, VXS
+    VMSUB VT1, VX3, VXC, VT1
+    vst VT0, X, 4 * SIZE
+#ifdef DOUBLE
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vld VX1, X, 6 * SIZE // X[6..7]
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX1, VXC
+    VFMADD VT0, VX3, VXS, VT0
+    VMUL VT1, VX1, VXS
+    VMSUB VT1, VX3, VXC, VT1
+    vst VT0, X, 6 * SIZE
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+#else
+    vstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+#endif
+    add.d YY, YY, INCY // shared YY step after the final store
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L121 // repeat while I > 0
+    b .L997 // then handle the leftover elements
+    .align 3
+
+.L122: // C!=0 S==0: X and Y are each multiplied by VXC; X contiguous, Y strided (loads via Y, stores via YY); 8 elements per pass
+#ifdef DOUBLE
+    vld VX0, X, 0 * SIZE // X[0..1]
+    ld.d  t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d  t2, Y, 0 * SIZE
+#else
+    vld VX0, X, 0 * SIZE // X[0..3]
+    ld.w  t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w  t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+#endif
+#ifdef DOUBLE
+    vinsgr2vr.d VX2, t1, 0 // pack the gathered Y elements into VX2
+    vinsgr2vr.d VX2, t2, 1
+#else
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+#endif
+    add.d Y, Y, INCY // shared Y step after the last load of this group
+    VMUL VT0, VX0, VXC // X values times VXC
+    VMUL VT1, VX2, VXC // Y values times VXC
+    vst VT0, X, 0 * SIZE
+#ifdef DOUBLE
+    vstelm.d VT1, YY, 0, 0 // scatter VT1 to the strided Y positions
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vld VX0, X, 2 * SIZE // X[2..3]
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t3, 0
+    vinsgr2vr.d VX2, t4, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX0, VXC
+    VMUL VT1, VX2, VXC
+    vst VT0, X, 2 * SIZE
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vld VX1, X, 4 * SIZE // X[4..5]
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t1, 0
+    vinsgr2vr.d VX3, t2, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX1, VXC
+    VMUL VT1, VX3, VXC
+    vst VT0, X, 4 * SIZE
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vld VX1, X, 6 * SIZE // X[6..7]
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX1, VXC
+    VMUL VT1, VX3, VXC
+    vst VT0, X, 6 * SIZE
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+#else
+    vstelm.w VT1, YY, 0, 0 // scatter VT1 to the strided Y positions
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    vld VX1, X, 4 * SIZE // X[4..7]
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    add.d Y, Y, INCY
+    VMUL VT0, VX1, VXC
+    VMUL VT1, VX3, VXC
+    vst VT0, X, 4 * SIZE
+    vstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+#endif
+    add.d YY, YY, INCY // shared YY step after the final store
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L122 // repeat while I > 0
+    b .L997 // then handle the leftover elements
+    .align 3
+
+.L123: // C==0 S!=0
+    // X is accessed with contiguous vector loads/stores (vld/vst) and is
+    // advanced by 8 * SIZE per iteration; Y is gathered through Y and
+    // scattered through YY, both stepping by INCY.
+    // Per element: new X = Y * VXS, new Y = VXZ - X * VXS.
+    // NOTE(review): VXS / VXZ are presumably the splat of S and an all-zero
+    // vector prepared before this label - confirm against the prologue.
+#ifdef DOUBLE
+    vld VX0, X, 0 * SIZE
+    ld.d  t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d  t2, Y, 0 * SIZE
+#else
+    vld VX0, X, 0 * SIZE
+    ld.w  t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w  t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+#endif
+    // Pack the gathered Y scalars into the lanes of VX2.
+#ifdef DOUBLE
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+#else
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+#endif
+    add.d Y, Y, INCY
+    VMUL VT0, VX2, VXS
+    VMUL VT1, VX0, VXS
+    VFSUB VT1, VXZ, VT1
+    vst VT0, X, 0 * SIZE
+#ifdef DOUBLE
+    // DOUBLE: a 128-bit vector holds 2 elements, so 4 groups cover 8 elements.
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vld VX0, X, 2 * SIZE
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t3, 0
+    vinsgr2vr.d VX2, t4, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX2, VXS
+    VMUL VT1, VX0, VXS
+    VFSUB VT1, VXZ, VT1
+    vst VT0, X, 2 * SIZE
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vld VX1, X, 4 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t1, 0
+    vinsgr2vr.d VX3, t2, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX3, VXS
+    VMUL VT1, VX1, VXS
+    VFSUB VT1, VXZ, VT1
+    vst VT0, X, 4 * SIZE
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vld VX1, X, 6 * SIZE
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX3, VXS
+    VMUL VT1, VX1, VXS
+    VFSUB VT1, VXZ, VT1
+    vst VT0, X, 6 * SIZE
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+#else
+    // Single precision: 4 elements per vector, so 2 groups cover 8 elements.
+    vstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    vld VX1, X, 4 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    add.d Y, Y, INCY
+    VMUL VT0, VX3, VXS
+    VMUL VT1, VX1, VXS
+    VFSUB VT1, VXZ, VT1
+    vst VT0, X, 4 * SIZE
+    vstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+#endif
+    // Shared final YY bump, then advance X by 8 elements and loop while I > 0.
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L123
+    b .L997
+    .align 3
+
+.L124: // C==0 S==0
+    // Both coefficients are zero: write VXZ over 8 elements of X (contiguous)
+    // and 8 elements of Y (stride INCY, through YY) per iteration.
+    // NOTE(review): VXZ is presumably an all-zero vector set up before this
+    // label - confirm against the prologue.
+    vst VXZ, X, 0 * SIZE
+    vst VXZ, X, 4 * SIZE
+#ifdef DOUBLE
+    // A 128-bit vst covers only 2 doubles, so offsets 2 and 6 are needed as
+    // well to cover all 8 elements of X.
+    vst VXZ, X, 2 * SIZE
+    vst VXZ, X, 6 * SIZE
+    vstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 1
+#else
+    vstelm.w VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 3
+#endif
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L124
+    // Only YY was advanced in the loop; the scalar tail at .L997/.L998
+    // indexes through Y, so resynchronise it for both precisions.
+    move Y, YY
+    b .L997
+    .align 3
+
+.L21:// INCX!=1 and INCY==1
+    // Dispatcher for the strided-X / contiguous-Y case.
+    // Skip straight to the tail at .L997 when the block counter I is <= 0.
+    bge $r0, I, .L997
+    // XX is the write cursor for X; X itself is used as the read cursor.
+    move XX, X
+    // NOTE(review): a1 is presumably a zero constant and CMPEQ a floating
+    // compare macro defined earlier in the file - confirm.
+    CMPEQ $fcc0, C, a1
+    bcnez $fcc0, .L210
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L212 // C!=0 S==0
+    b .L211 // C!=0 S!=0
+    .align 3
+
+.L210:
+    // Reached from .L21 when the C compare against a1 succeeded (C == 0 per
+    // the branch comments); now split on S.
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L214 // C==0 S==0
+    b .L213 // C==0 S!=0
+    .align 3
+
+.L211: // C!=0 S!=0
+    // Y is accessed with contiguous vector loads/stores and advanced by
+    // 8 * SIZE per iteration; X is gathered through X and scattered through
+    // XX, both stepping by INCX.
+    // Per element: VT0 (new X) is built from X*VXC and Y*VXS, VT1 (new Y)
+    // from Y*VXC and X*VXS.
+    // NOTE(review): VFMADD / VMSUB are macros defined outside this chunk;
+    // presumably VT0 = Y*S + X*C and VT1 = Y*C - X*S - confirm operand order.
+#ifdef DOUBLE
+    vld VX2, Y, 0 * SIZE
+    ld.d  t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d  t2, X, 0 * SIZE
+#else
+    vld VX2, Y, 0 * SIZE
+    ld.w  t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w  t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+#endif
+    // Pack the gathered X scalars into the lanes of VX0.
+#ifdef DOUBLE
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+#else
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+#endif
+    add.d X, X, INCX
+    VMUL VT0, VXC, VX0
+    VFMADD VT0, VX2, VXS, VT0
+    VMUL VT1, VXS, VX0
+    VMSUB VT1, VX2, VXC, VT1
+#ifdef DOUBLE
+    // DOUBLE: 2 elements per vector, 4 groups per iteration.
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 0 * SIZE
+    vld VX2, Y, 2 * SIZE
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    vinsgr2vr.d VX0, t3, 0
+    vinsgr2vr.d VX0, t4, 1
+    add.d X, X, INCX
+    VMUL VT0, VXC, VX0
+    VFMADD VT0, VX2, VXS, VT0
+    VMUL VT1, VXS, VX0
+    VMSUB VT1, VX2, VXC, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 2 * SIZE
+    vld VX3, Y, 4 * SIZE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    add.d X, X, INCX
+    VMUL VT0, VX1, VXC
+    VFMADD VT0, VX3, VXS, VT0
+    VMUL VT1, VX1, VXS
+    VMSUB VT1, VX3, VXC, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 4 * SIZE
+    vld VX3, Y, 6 * SIZE
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    VMUL VT0, VX1, VXC
+    VFMADD VT0, VX3, VXS, VT0
+    VMUL VT1, VX1, VXS
+    VMSUB VT1, VX3, VXC, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 6 * SIZE
+#else
+    // Single precision: 4 elements per vector, 2 groups per iteration.
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vst VT1, Y, 0 * SIZE
+    vld VX3, Y, 4 * SIZE
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    add.d X, X, INCX
+    VMUL VT0, VX1, VXC
+    VFMADD VT0, VX3, VXS, VT0
+    VMUL VT1, VX1, VXS
+    VMSUB VT1, VX3, VXC, VT1
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vst VT1, Y, 4 * SIZE
+#endif
+    // Advance the contiguous Y pointer by 8 elements and loop while I > 0.
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    b .L997
+    .align 3
+
+.L212: // C!=0 S==0
+    // S == 0, so the rotation reduces to a scale by C:
+    //   new X = X * VXC, new Y = Y * VXC.
+    // Y is accessed with contiguous vector loads/stores; X is gathered through
+    // X and scattered through XX, both stepping by INCX. 8 elements per
+    // iteration.
+    // NOTE(review): VXC is presumably the splat of C prepared before this
+    // label - confirm against the prologue.
+#ifdef DOUBLE
+    vld VX2, Y, 0 * SIZE
+    ld.d  t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d  t2, X, 0 * SIZE
+#else
+    vld VX2, Y, 0 * SIZE
+    ld.w  t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w  t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+#endif
+#ifdef DOUBLE
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+#else
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+#endif
+    add.d X, X, INCX
+    VMUL VT0, VXC, VX0
+    VMUL VT1, VX2, VXC
+
+#ifdef DOUBLE
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 0 * SIZE
+    vld VX2, Y, 2 * SIZE
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    vinsgr2vr.d VX0, t3, 0
+    vinsgr2vr.d VX0, t4, 1
+    add.d X, X, INCX
+    VMUL VT0, VXC, VX0
+    VMUL VT1, VX2, VXC
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 2 * SIZE
+    vld VX3, Y, 4 * SIZE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    add.d X, X, INCX
+    VMUL VT0, VX1, VXC
+    // Y is scaled by C, exactly as in the first two groups; multiplying by
+    // VXS here would zero these elements of Y.
+    VMUL VT1, VX3, VXC
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 4 * SIZE
+    vld VX3, Y, 6 * SIZE
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    VMUL VT0, VX1, VXC
+    VMUL VT1, VX3, VXC
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    // XX must step past the 8th element so the next iteration does not
+    // overwrite it.
+    add.d XX, XX, INCX
+    vst VT1, Y, 6 * SIZE
+#else
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vst VT1, Y, 0 * SIZE
+    vld VX3, Y, 4 * SIZE
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    add.d X, X, INCX
+    VMUL VT0, VX1, VXC
+    // Same scale-by-C as the first group (S == 0 on this path).
+    VMUL VT1, VX3, VXC
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vst VT1, Y, 4 * SIZE
+#endif
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L212
+    b .L997
+    .align 3
+
+.L213: // C==0 S!=0
+    // Y is accessed with contiguous vector loads/stores; X is gathered through
+    // X and scattered through XX, both stepping by INCX.
+    // Per element: new X = Y * VXS, new Y = VXZ - X * VXS.
+    // NOTE(review): VXS / VXZ are presumably the splat of S and an all-zero
+    // vector prepared before this label - confirm against the prologue.
+#ifdef DOUBLE
+    vld VX2, Y, 0 * SIZE
+    ld.d  t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d  t2, X, 0 * SIZE
+#else
+    vld VX2, Y, 0 * SIZE
+    ld.w  t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w  t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+#endif
+    // Pack the gathered X scalars into the lanes of VX0.
+#ifdef DOUBLE
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+#else
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+#endif
+    add.d X, X, INCX
+    VMUL VT0, VXS, VX2
+    VMUL VT1, VXS, VX0
+    VFSUB VT1, VXZ, VT1
+
+#ifdef DOUBLE
+    // DOUBLE: 2 elements per vector, 4 groups per iteration.
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 0 * SIZE
+    vld VX2, Y, 2 * SIZE
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    vinsgr2vr.d VX0, t3, 0
+    vinsgr2vr.d VX0, t4, 1
+    add.d X, X, INCX
+    VMUL VT0, VXS, VX2
+    VMUL VT1, VXS, VX0
+    VFSUB VT1, VXZ, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 2 * SIZE
+    vld VX3, Y, 4 * SIZE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    add.d X, X, INCX
+    VMUL VT0, VX3, VXS
+    VMUL VT1, VX1, VXS
+    VFSUB VT1, VXZ, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 4 * SIZE
+    vld VX3, Y, 6 * SIZE
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    VMUL VT0, VX3, VXS
+    VMUL VT1, VX1, VXS
+    VFSUB VT1, VXZ, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vst VT1, Y, 6 * SIZE
+#else
+    // Single precision: 4 elements per vector, 2 groups per iteration.
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vst VT1, Y, 0 * SIZE
+    vld VX3, Y, 4 * SIZE
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    add.d X, X, INCX
+    VMUL VT0, VX3, VXS
+    VMUL VT1, VX1, VXS
+    VFSUB VT1, VXZ, VT1
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vst VT1, Y, 4 * SIZE
+#endif
+    // Advance the contiguous Y pointer by 8 elements and loop while I > 0.
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L213
+    b .L997
+    .align 3
+
+.L214: // C==0 S==0
+    // Both coefficients are zero: write VXZ over 8 elements of Y (contiguous)
+    // and 8 elements of X (stride INCX, through XX) per iteration.
+    // NOTE(review): VXZ is presumably an all-zero vector set up before this
+    // label - confirm against the prologue. VT1 is never written on this
+    // path, so it must not be used as the store source.
+    vst VXZ, Y, 0 * SIZE
+    vst VXZ, Y, 4 * SIZE
+#ifdef DOUBLE
+    // A 128-bit vst covers only 2 doubles, so offsets 2 and 6 are needed as
+    // well to cover all 8 elements of Y.
+    vst VXZ, Y, 2 * SIZE
+    vst VXZ, Y, 6 * SIZE
+    vstelm.d VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 1
+#else
+    vstelm.w VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 3
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 3
+#endif
+    add.d XX, XX, INCX
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    // Loop back to this label (not .L211, which is the C!=0 S!=0 body).
+    blt $r0, I, .L214
+    // Only XX was advanced in the loop; the scalar tail at .L997/.L998
+    // indexes through X, so resynchronise it for both precisions.
+    move X, XX
+    b .L997
+    .align 3
+
+.L22:
+    // Dispatcher for the path where both X and Y are accessed with their
+    // strides (the .L22x bodies below step X by INCX and Y by INCY).
+    // Skip straight to the tail at .L997 when the block counter I is <= 0.
+    bge $r0, I, .L997
+    // YY / XX are the write cursors; Y / X are used as the read cursors.
+    move YY, Y
+    move XX, X
+    // NOTE(review): a1 is presumably a zero constant and CMPEQ a floating
+    // compare macro defined earlier in the file - confirm.
+    CMPEQ $fcc0, C, a1
+    bcnez $fcc0, .L220
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L222 // C!=0 S==0
+    b .L221 // C!=0 S!=0
+    .align 3
+
+.L220:
+    // Reached from .L22 when the C compare against a1 succeeded (C == 0 per
+    // the branch comments); now split on S.
+    CMPEQ $fcc0, S, a1
+    bcnez $fcc0, .L224 // C==0 S==0
+    b .L223 // C==0 S!=0
+    .align 3
+
+.L221: // C!=0 S!=0
+    // Both vectors are strided: X / Y are the gather (read) cursors and
+    // XX / YY the scatter (write) cursors, stepping by INCX / INCY.
+    // 8 elements are processed per iteration.
+    // Per element: VT0 (new X) is built from X*VXC and Y*VXS, VT1 (new Y)
+    // from Y*VXC and X*VXS.
+    // NOTE(review): VFMADD / VMSUB are macros defined outside this chunk;
+    // presumably VT0 = Y*S + X*C and VT1 = Y*C - X*S - confirm operand order.
+#ifdef DOUBLE
+    ld.d  t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d  t2, X, 0 * SIZE
+    add.d X, X, INCX
+#else
+    ld.w  t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w  t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+#endif
+#ifdef DOUBLE
+    // DOUBLE: 2 elements per vector, 4 gather/compute/scatter groups.
+    // t1..t4 are reused for the Y gather once their X values are packed.
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX0, VXC
+    VFMADD VT0, VX2, VXS, VT0
+    VMUL VT1, VX0, VXS
+    VMSUB VT1, VX2, VXC, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t3, 0
+    vinsgr2vr.d VX0, t4, 1
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t3, 0
+    vinsgr2vr.d VX2, t4, 1
+    add.d Y, Y, INCY
+    VMUL  VT0, VX0, VXC
+    VFMADD VT0, VX2, VXS, VT0
+    VMUL  VT1, VX0, VXS
+    VMSUB VT1, VX2, VXC, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t1, 0
+    vinsgr2vr.d VX3, t2, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX1, VXC
+    VFMADD VT0, VX3, VXS, VT0
+    VMUL VT1, VX1, VXS
+    VMSUB VT1, VX3, VXC, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX1, VXC
+    VFMADD VT0, VX3, VXS, VT0
+    VMUL VT1, VX1, VXS
+    VMSUB VT1, VX3, VXC, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+#else
+    // Single precision: 4 elements per vector, 2 gather/compute/scatter groups.
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    VMUL VT0, VX0, VXC
+    VFMADD VT0, VX2, VXS, VT0
+    VMUL VT1, VX0, VXS
+    VMSUB VT1, VX2, VXC, VT1
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    add.d X, X, INCX
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    add.d Y, Y, INCY
+    VMUL VT0, VX1, VXC
+    VFMADD VT0, VX3, VXS, VT0
+    VMUL VT1, VX1, VXS
+    VMSUB VT1, VX3, VXC, VT1
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+#endif
+    // All four cursors were already advanced above; just count down and loop.
+    addi.d  I, I, -1
+    blt $r0, I, .L221
+    b .L997
+    .align 3
+
+.L222: // C!=0 S==0
+    // S == 0, so the rotation reduces to a scale by C:
+    //   new X = X * VXC, new Y = Y * VXC.
+    // Both vectors are strided: X / Y are the gather cursors and XX / YY the
+    // scatter cursors, stepping by INCX / INCY. 8 elements per iteration.
+    // NOTE(review): VXC is presumably the splat of C prepared before this
+    // label - confirm against the prologue.
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+#else
+    // Single precision elements are 4 bytes: use ld.w so the gather never
+    // reads past the element (an 8-byte ld.d would over-read the last one).
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+#endif
+#ifdef DOUBLE
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX0, VXC
+    VMUL VT1, VX2, VXC
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t3, 0
+    vinsgr2vr.d VX0, t4, 1
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t3, 0
+    vinsgr2vr.d VX2, t4, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX0, VXC
+    VMUL VT1, VX2, VXC
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t1, 0
+    vinsgr2vr.d VX3, t2, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX1, VXC
+    VMUL VT1, VX3, VXC
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX1, VXC
+    VMUL VT1, VX3, VXC
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+#else
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    VMUL VT0, VX0, VXC
+    VMUL VT1, VX2, VXC
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    add.d Y, Y, INCY
+    VMUL VT0, VX1, VXC
+    VMUL VT1, VX3, VXC
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+#endif
+    // Shared final YY bump for both precisions, then count down and loop.
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    b .L997
+    .align 3
+
+.L223: // C==0 S!=0
+    // C == 0, so per element: new X = Y * VXS, new Y = VXZ - X * VXS.
+    // Both vectors are strided: X / Y are the gather cursors and XX / YY the
+    // scatter cursors, stepping by INCX / INCY. 8 elements per iteration.
+    // NOTE(review): VXS / VXZ are presumably the splat of S and an all-zero
+    // vector prepared before this label - confirm against the prologue.
+#ifdef DOUBLE
+    ld.d  t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d  t2, X, 0 * SIZE
+    add.d X, X, INCX
+#else
+    ld.w  t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w  t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+#endif
+#ifdef DOUBLE
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX2, VXS
+    VMUL VT1, VX0, VXS
+    VFSUB VT1, VXZ, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t3, 0
+    vinsgr2vr.d VX0, t4, 1
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX2, t3, 0
+    vinsgr2vr.d VX2, t4, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX2, VXS
+    VMUL VT1, VX0, VXS
+    VFSUB VT1, VXZ, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    ld.d t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t1, 0
+    vinsgr2vr.d VX3, t2, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX3, VXS
+    // This group's X values were packed into VX1 (VX0 still holds the
+    // previous group), so the Y result must be derived from VX1.
+    VMUL VT1, VX1, VXS
+    VFSUB VT1, VXZ, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    ld.d t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0 * SIZE
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    VMUL VT0, VX3, VXS
+    VMUL VT1, VX1, VXS
+    VFSUB VT1, VXZ, VT1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VT1, YY, 0, 1
+#else
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    VMUL VT0, VX2, VXS
+    VMUL VT1, VX0, VXS
+    VFSUB VT1, VXZ, VT1
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+    add.d YY, YY, INCY
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    add.d Y, Y, INCY
+    VMUL VT0, VX3, VXS
+    // Second group's X values live in VX1, not VX0.
+    VMUL VT1, VX1, VXS
+    VFSUB VT1, VXZ, VT1
+    vstelm.w VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vstelm.w VT1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VT1, YY, 0, 3
+#endif
+    // Shared final YY bump for both precisions, then count down and loop.
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L223
+    b .L997
+    .align 3
+
+.L224: // C==0 S==0
+    // Both coefficients are zero: scatter VXZ over 8 elements of X (through
+    // XX, stride INCX) and 8 elements of Y (through YY, stride INCY) per
+    // iteration. Nothing is read, so X and Y themselves do not move here.
+    // NOTE(review): VXZ is presumably an all-zero vector set up before this
+    // label - confirm against the prologue.
+#ifdef DOUBLE
+    vstelm.d VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.d VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.d VXZ, YY, 0, 1
+#else
+    vstelm.w VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 3
+    add.d XX, XX, INCX
+    vstelm.w VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    vstelm.w VXZ, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VXZ, XX, 0, 3
+    add.d XX, XX, INCX
+    vstelm.w VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 3
+#endif
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L224
+    // Only the write cursors moved in the loop; the scalar tail at
+    // .L997/.L998 indexes through X and Y, so resynchronise both for
+    // either precision.
+    move X, XX
+    move Y, YY
+    b .L997
+    .align 3
+
+.L997:
+    // Common exit of all vector loops: I = N & 7, the number of elements
+    // left over after the 8-element blocks. Skip the scalar loop if none.
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L998:
+    // Scalar tail loop: one element of X and Y per iteration, stepping by
+    // INCX / INCY, until I reaches 0.
+    // $f10 is built from X*C and Y*S and stored back to X; $f20 is built
+    // from X*S and Y*C and stored back to Y.
+    // NOTE(review): LD / ST / MUL / MADD / MSUB are precision-dependent
+    // macros defined outside this chunk; presumably $f10 = Y*S + X*C and
+    // $f20 = Y*C - X*S - confirm operand order.
+    LD   $f12, X, 0 * SIZE
+    LD   $f13, Y, 0 * SIZE
+    MUL  $f10, $f12, C
+    MADD $f10, $f13, S, $f10
+    ST   $f10, X, 0 * SIZE
+    addi.d I, I, -1
+    MUL  $f20, $f12, S
+    MSUB $f20, $f13, C, $f20
+    ST   $f20, Y, 0 * SIZE
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L998
+    .align 3
+
+.L999:
+    // Function exit: copy $r12 into $r4 and jump to the address in $r1
+    // without linking ($r0 as the link register discards it).
+    // NOTE(review): $r12 is presumably the loop counter I and $r4 the
+    // return-value register - confirm against the #defines at the top of
+    // this kernel.
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/scal_lasx.S b/kernel/loongarch64/scal_lasx.S
new file mode 100644
index 000000000..48e2c0718
--- /dev/null
+++ b/kernel/loongarch64/scal_lasx.S
@@ -0,0 +1,282 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N $r4 /* element count; the kernel returns immediately when N <= 0 */
+#define ALPHA $f0 /* scale factor */
+#define X  $r7 /* address of the vector; advanced as elements are consumed */
+#define INCX $r8 /* stride in elements on entry; shifted left by BASE_SHIFT in the prologue */
+#define I $r12 /* loop counter: N >> 3 in the blocked loops, N & 7 in the tail loops */
+#define TEMP $r13 /* 1 << BASE_SHIFT for the stride test, later the GPR copy of ALPHA */
+#define t1 $r14 /* t1-t4: GPRs that stage strided loads before insertion into a vector */
+#define t2 $r18
+#define t3 $r15
+#define t4 $r17
+#define XX $r16 /* store cursor of the strided loop (X is the load cursor there) */
+#define VX0 $xr12 /* VX0/VX1: input vectors */
+#define VX1 $xr13
+#define VT0 $xr14 /* VT0/VT1: products that are written back */
+#define VT1 $xr15
+#define VALPHA $xr19 /* ALPHA replicated into every lane */
+#define a1 $f8 /* compared against ALPHA for the ALPHA==0 test; reused as scratch in .L33 */
+#define a2 $f23 /* compared against ALPHA for the ALPHA==1 test */
+
+    PROLOGUE  // scal: x[i] = ALPHA * x[i] for N elements with stride INCX; dispatch on ALPHA and stride
+
+    bge $r0, N, .L999  // N <= 0: nothing to do
+    bge $r0, INCX, .L999  // INCX <= 0: return without touching x
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0  // a1 = integer 0 bit pattern
+    FFINT  a1, a1  // FFINT is defined in common.h; presumably int -> FP conversion, giving 0.0
+    movgr2fr.d a2, TEMP  // a2 = integer 1 bit pattern
+    FFINT  a2, a2  // presumably converted to 1.0 (see the ALPHA==1 test below)
+    slli.d TEMP, TEMP, BASE_SHIFT  // TEMP = 1 << BASE_SHIFT, presumably one element in bytes
+    slli.d INCX, INCX, BASE_SHIFT  // INCX is scaled by the same shift from here on
+    CMPEQ  $fcc0, ALPHA, a1
+    bcnez  $fcc0, .L20   //ALPHA==0
+    CMPEQ  $fcc0, ALPHA, a2
+    bcnez $fcc0, .L999  //ALPHA==1 return
+    srai.d I, N, 3  // I = N >> 3: number of 8-element blocks
+    beq INCX, TEMP, .L30    //ALPHA!=0|1 and INCX==1
+    MTG  TEMP, ALPHA  // MTG is defined in common.h; presumably copies the raw bits of ALPHA into TEMP
+#ifdef DOUBLE
+    xvreplgr2vr.d VALPHA, TEMP  // replicate into all 64-bit lanes
+#else
+    xvreplgr2vr.w VALPHA, TEMP  // replicate into all 32-bit lanes
+#endif
+    move XX, X  // XX is the store cursor; X keeps advancing with the loads in .L11
+    .align 3
+
+.L10:   //ALPHA!=0|1 and INCX!=1
+    bge $r0, I, .L32  // no full 8-element block: go straight to the scalar tail
+    .align 3
+.L11:  // per iteration: gather 8 strided elements via GPRs, multiply by VALPHA, scatter through XX
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0  // VX0 = elements 0-3 of this block
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    xvfmul.d VT0, VX0, VALPHA  // first product, issued between the loads of elements 4-7
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0  // VX1 = elements 4-7 of this block
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvstelm.d VT0, XX, 0, 0  // store one lane per element, stepping XX by the stride
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvfmul.d VT1, VX1, VALPHA
+    xvstelm.d VT1, XX, 0, 0
+    add.d XX, XX, INCX
+    xvstelm.d VT1, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.d VT1, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.d VT1, XX, 0, 3
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0  // all 8 single-precision elements fit in VX0 (lanes 0-7)
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    xvfmul.s VT0, VX0, VALPHA
+    xvstelm.w VT0, XX, 0, 0  // store one lane per element, stepping XX by the stride
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 4
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 5
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 6
+    add.d XX, XX, INCX
+    xvstelm.w VT0, XX, 0, 7
+#endif
+    add.d XX, XX, INCX  // shared by both branches: step past the last stored element
+    addi.d  I, I, -1
+    blt $r0, I, .L11  // loop while blocks remain
+    b .L32  // remaining N & 7 elements are handled by the scalar tail, starting at X
+    .align 3
+
+.L20:  // ALPHA == 0: x is overwritten with a1 without being read, so old NaN/Inf values are not propagated
+    srai.d I, N, 3  // I = N >> 3: number of 8-element blocks
+    beq INCX, TEMP, .L24  // TEMP still holds 1 << BASE_SHIFT here: contiguous case uses vector stores
+    bge $r0, I, .L22  // fewer than 8 elements: tail only
+    .align 3
+
+.L21:  // strided zero fill, unrolled 8 times
+    ST a1, X, 0
+    add.d X, X, INCX
+    ST a1, X, 0
+    add.d X, X, INCX
+    ST a1, X, 0
+    add.d X, X, INCX
+    ST a1, X, 0
+    add.d X, X, INCX
+    ST a1, X, 0
+    add.d X, X, INCX
+    ST a1, X, 0
+    add.d X, X, INCX
+    ST a1, X, 0
+    add.d X, X, INCX
+    ST a1, X, 0
+    add.d X, X, INCX
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    andi I, N, 7  // I = N & 7: leftover elements
+    bge $r0, I, .L999
+    .align 3
+
+.L23:  // strided zero fill, one element per iteration
+    ST  a1, X, 0 * SIZE
+    addi.d I, I, -1
+    add.d X, X, INCX
+    blt $r0, I, .L23
+    jirl $r0, $r1, 0  // return: jump to the address held in $r1
+    .align 3
+
+.L24:  // ALPHA == 0 with contiguous x: zero fill using 256-bit stores
+    bge $r0, I, .L26 /*N<8 INCX==1*/
+    .align 3
+.L25:
+    xvxor.v VX0, VX0, VX0  // VX0 = all zero bits; NOTE(review): loop invariant, recomputed every iteration
+    xvst VX0, X, 0 * SIZE  // one 256-bit store; covers all 8 elements in the single-precision build
+#ifdef DOUBLE
+    xvst VX0, X, 4 * SIZE  // second store for elements 4-7 in the double-precision build
+#endif
+    addi.d I, I, -1
+    addi.d X, X, 8 * SIZE  // advance by one 8-element block
+    blt $r0, I, .L25
+    .align 3
+
+.L26:
+    andi I, N, 7  // I = N & 7: leftover elements
+    bge $r0, I, .L999
+    .align 3
+
+.L27:  // contiguous zero fill, one element per iteration
+    ST  a1, X, 0 * SIZE
+    addi.d I, I, -1
+    addi.d X, X, SIZE
+    blt $r0, I, .L27
+    jirl $r0, $r1, 0  // return: jump to the address held in $r1
+    .align 3
+
+.L30:  // general ALPHA with contiguous x: scale 8 elements per iteration in place
+    bge $r0, I, .L32/*N<8 INCX==1*/
+    MTG TEMP, ALPHA  // MTG is defined in common.h; presumably copies the raw bits of ALPHA into TEMP
+#ifdef DOUBLE
+    xvreplgr2vr.d VALPHA , TEMP  // replicate into all 64-bit lanes
+#else
+    xvreplgr2vr.w VALPHA , TEMP  // replicate into all 32-bit lanes
+#endif
+    .align 3
+
+.L31:
+    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    xvld VX1, X, 4 * SIZE  // double precision needs two 256-bit vectors for 8 elements
+    xvfmul.d VT0, VX0, VALPHA
+    xvfmul.d VT1, VX1, VALPHA
+    xvst VT0, X, 0 * SIZE
+    xvst VT1, X, 4 * SIZE
+#else
+    xvfmul.s VT0, VX0, VALPHA  // single precision: one vector holds all 8 elements
+    xvst VT0, X, 0 * SIZE
+#endif
+    addi.d I, I, -1
+    addi.d X, X, 8 * SIZE  // advance by one 8-element block
+    blt $r0, I, .L31
+    .align 3
+
+.L32:  // scalar tail shared by the strided and contiguous scaling paths
+    andi I, N, 7  // I = N & 7: leftover elements
+    bge $r0, I, .L999
+    .align 3
+
+.L33:
+    LD    a1, X, 0 * SIZE  // a1 is reused as scratch here; its constant is no longer needed
+    addi.d I, I, -1
+    MUL   a1, ALPHA, a1
+    ST    a1, X, 0 * SIZE
+    add.d X, X, INCX  // INCX was scaled in the prologue, so this serves both stride cases
+    blt $r0, I, .L33
+    jirl $r0, $r1, 0  // return: jump to the address held in $r1
+    .align 3
+
+.L999:  // common early-exit return
+    jirl    $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/scal_lsx.S b/kernel/loongarch64/scal_lsx.S
new file mode 100644
index 000000000..1ffce7db2
--- /dev/null
+++ b/kernel/loongarch64/scal_lsx.S
@@ -0,0 +1,301 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N $r4 /* element count; the kernel returns immediately when N <= 0 */
+#define ALPHA $f0 /* scale factor */
+#define X  $r7 /* address of the vector; advanced as elements are consumed */
+#define INCX $r8 /* stride in elements on entry; shifted left by BASE_SHIFT in the prologue */
+#define I $r12 /* loop counter: N >> 3 in the blocked loops, N & 7 in the tail loops */
+#define TEMP $r13 /* 1 << BASE_SHIFT for the stride test, later the GPR copy of ALPHA */
+#define t1 $r14 /* t1-t4: GPRs that stage strided loads before insertion into a vector */
+#define t2 $r18
+#define t3 $r15
+#define t4 $r17
+#define XX $r16 /* store cursor of the strided loop (X is the load cursor there) */
+#define VX0 $vr12 /* VX0/VX1: 128-bit input vectors */
+#define VX1 $vr13
+#define VT0 $vr14 /* VT0/VT1: products that are written back */
+#define VT1 $vr15
+#define VALPHA $vr19 /* ALPHA replicated into every lane */
+#define a1 $f8 /* compared against ALPHA for the ALPHA==0 test; reused as scratch in .L33 */
+#define a2 $f23 /* compared against ALPHA for the ALPHA==1 test */
+
+    PROLOGUE  // scal: x[i] = ALPHA * x[i] for N elements with stride INCX; dispatch on ALPHA and stride
+
+    bge $r0, N, .L999  // N <= 0: nothing to do
+    bge $r0, INCX, .L999  // INCX <= 0: return without touching x
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0  // a1 = integer 0 bit pattern
+    FFINT  a1, a1  // FFINT is defined in common.h; presumably int -> FP conversion, giving 0.0
+    movgr2fr.d a2, TEMP  // a2 = integer 1 bit pattern
+    FFINT  a2, a2  // presumably converted to 1.0 (see the ALPHA==1 test below)
+    slli.d TEMP, TEMP, BASE_SHIFT  // TEMP = 1 << BASE_SHIFT, presumably one element in bytes
+    slli.d INCX, INCX, BASE_SHIFT  // INCX is scaled by the same shift from here on
+    CMPEQ  $fcc0, ALPHA, a1
+    bcnez  $fcc0, .L20   //ALPHA==0
+    CMPEQ  $fcc0, ALPHA, a2
+    bcnez $fcc0, .L999  //ALPHA==1 return
+    srai.d I, N, 3  // I = N >> 3: number of 8-element blocks
+    beq INCX, TEMP, .L30    //ALPHA!=0|1 and INCX==1
+    MTG  TEMP, ALPHA  // MTG is defined in common.h; presumably copies the raw bits of ALPHA into TEMP
+#ifdef DOUBLE
+    vreplgr2vr.d VALPHA, TEMP  // replicate into both 64-bit lanes
+#else
+    vreplgr2vr.w VALPHA, TEMP  // replicate into all four 32-bit lanes
+#endif
+    move XX, X  // XX is the store cursor; X keeps advancing with the loads in .L11
+    .align 3
+
+.L10:   //ALPHA!=0|1 and INCX!=1
+    bge $r0, I, .L32  // no full 8-element block: go straight to the scalar tail
+    .align 3
+
+.L11:  // per iteration: gather 8 strided elements via GPRs, multiply by VALPHA, scatter through XX
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0  // VX0 = elements 0-1
+    vinsgr2vr.d VX0, t2, 1
+    vfmul.d VT0, VX0, VALPHA
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0  // VX1 = elements 2-3
+    vinsgr2vr.d VX1, t4, 1
+    vstelm.d VT0, XX, 0, 0  // store one lane per element, stepping XX by the stride
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vfmul.d VT1, VX1, VALPHA
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0  // VX0 reused for elements 4-5
+    vinsgr2vr.d VX0, t2, 1
+    vstelm.d VT1, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT1, XX, 0, 1
+    add.d XX, XX, INCX
+    vfmul.d VT0, VX0, VALPHA
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0  // VX1 reused for elements 6-7
+    vinsgr2vr.d VX1, t4, 1
+    vstelm.d VT0, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vfmul.d VT1, VX1, VALPHA
+    vstelm.d VT1, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.d VT1, XX, 0, 1
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0  // VX0 = elements 0-3
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vfmul.s VT0, VX0, VALPHA  // first product, issued between the loads of elements 4-7
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0  // VX1 = elements 4-7
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vstelm.w VT0, XX, 0, 0  // store one lane per element, stepping XX by the stride
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT0, XX, 0, 3
+    add.d XX, XX, INCX
+    vfmul.s VT1, VX1, VALPHA
+    vstelm.w VT1, XX, 0, 0
+    add.d XX, XX, INCX
+    vstelm.w VT1, XX, 0, 1
+    add.d XX, XX, INCX
+    vstelm.w VT1, XX, 0, 2
+    add.d XX, XX, INCX
+    vstelm.w VT1, XX, 0, 3
+#endif
+    add.d XX, XX, INCX  // shared by both branches: step past the last stored element
+    addi.d  I, I, -1
+    blt $r0, I, .L11  // loop while blocks remain
+    b .L32  // remaining N & 7 elements are handled by the scalar tail, starting at X
+    .align 3
+
+.L20:  // ALPHA == 0: x is overwritten with a1 without being read, so old NaN/Inf values are not propagated
+    srai.d I, N, 3  // I = N >> 3: number of 8-element blocks
+    beq INCX, TEMP, .L24  // TEMP still holds 1 << BASE_SHIFT here: contiguous case uses vector stores
+    bge $r0, I, .L22  // fewer than 8 elements: tail only
+    .align 3
+
+.L21:  // strided zero fill, unrolled 8 times
+    ST a1, X, 0
+    add.d X, X, INCX
+    ST a1, X, 0
+    add.d X, X, INCX
+    ST a1, X, 0
+    add.d X, X, INCX
+    ST a1, X, 0
+    add.d X, X, INCX
+    ST a1, X, 0
+    add.d X, X, INCX
+    ST a1, X, 0
+    add.d X, X, INCX
+    ST a1, X, 0
+    add.d X, X, INCX
+    ST a1, X, 0
+    add.d X, X, INCX
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    andi I, N, 7  // I = N & 7: leftover elements
+    bge $r0, I, .L999
+    .align 3
+
+.L23:  // strided zero fill, one element per iteration
+    ST  a1, X, 0 * SIZE
+    addi.d I, I, -1
+    add.d X, X, INCX
+    blt $r0, I, .L23
+    jirl $r0, $r1, 0  // return: jump to the address held in $r1
+    .align 3
+
+.L24:  // ALPHA == 0 with contiguous x: zero fill using 128-bit stores
+    bge $r0, I, .L26 /*N<8 INCX==1*/
+    .align 3
+
+.L25:
+    vxor.v VX0, VX0, VX0  // VX0 = all zero bits; NOTE(review): loop invariant, recomputed every iteration
+    vst VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    vst VX0, X, 2 * SIZE  // double precision: four 128-bit stores cover 8 elements
+    vst VX0, X, 4 * SIZE
+    vst VX0, X, 6 * SIZE
+#else
+    vst VX0, X, 4 * SIZE  // single precision: two 128-bit stores cover 8 elements
+#endif
+    addi.d I, I, -1
+    addi.d X, X, 8 * SIZE  // advance by one 8-element block
+    blt $r0, I, .L25
+    .align 3
+
+.L26:
+    andi I, N, 7  // I = N & 7: leftover elements
+    bge $r0, I, .L999
+    .align 3
+
+.L27:  // contiguous zero fill, one element per iteration
+    ST  a1, X, 0 * SIZE
+    addi.d I, I, -1
+    addi.d X, X, SIZE
+    blt $r0, I, .L27
+    jirl $r0, $r1, 0  // return: jump to the address held in $r1
+    .align 3
+
+.L30:  // general ALPHA with contiguous x: scale 8 elements per iteration in place
+    bge $r0, I, .L32/*N<8 INCX==1*/
+    MTG  TEMP, ALPHA  // MTG is defined in common.h; presumably copies the raw bits of ALPHA into TEMP
+#ifdef DOUBLE
+    vreplgr2vr.d VALPHA , TEMP  // replicate into both 64-bit lanes
+#else
+    vreplgr2vr.w VALPHA , TEMP  // replicate into all four 32-bit lanes
+#endif
+    .align 3
+
+.L31:
+    vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE  // double precision: four 128-bit vectors per block, processed as two pairs
+    vfmul.d VT0, VX0, VALPHA
+    vfmul.d VT1, VX1, VALPHA
+    vld VX0, X, 4 * SIZE  // VX0 reloaded with elements 4-5 before the first pair is stored
+    vst VT0, X, 0 * SIZE
+    vst VT1, X, 2 * SIZE
+    vfmul.d VT0, VX0, VALPHA
+    vld VX1, X, 6 * SIZE
+    vst VT0, X, 4 * SIZE
+    vfmul.d VT1, VX1, VALPHA
+    vst VT1, X, 6 * SIZE
+    addi.d I, I, -1
+#else
+    vld VX1, X, 4 * SIZE  // single precision: two 128-bit vectors per block
+    vfmul.s VT0, VX0, VALPHA
+    vfmul.s VT1, VX1, VALPHA
+    addi.d I, I, -1
+    vst VT0, X, 0 * SIZE
+    vst VT1, X, 4 * SIZE
+#endif
+    addi.d X, X, 8 * SIZE  // advance by one 8-element block
+    blt $r0, I, .L31
+    .align 3
+
+.L32:  // scalar tail shared by the strided and contiguous scaling paths
+    andi I, N, 7  // I = N & 7: leftover elements
+    bge $r0, I, .L999
+    .align 3
+
+.L33:
+    LD    a1, X, 0 * SIZE  // a1 is reused as scratch here; its constant is no longer needed
+    addi.d I, I, -1
+    MUL   a1, ALPHA, a1
+    ST    a1, X, 0 * SIZE
+    add.d X, X, INCX  // INCX was scaled in the prologue, so this serves both stride cases
+    blt $r0, I, .L33
+    jirl $r0, $r1, 0  // return: jump to the address held in $r1
+    .align 3
+
+.L999:  // common early-exit return
+    jirl    $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/sgemm_kernel_16x8_lasx.S b/kernel/loongarch64/sgemm_kernel_16x8_lasx.S
new file mode 100644
index 000000000..bd609394e
--- /dev/null
+++ b/kernel/loongarch64/sgemm_kernel_16x8_lasx.S
@@ -0,0 +1,2348 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2023/08/23 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*
+*
+* 2023/08/23 guxiwei
+* Parameter:
+*       SGEMM_DEFAULT_UNROLL_N  8
+*       SGEMM_DEFAULT_UNROLL_M  16
+*       SGEMM_DEFAULT_P         256
+*       SGEMM_DEFAULT_Q         256
+*       SGEMM_DEFAULT_R         1024
+*       A_PRE                   1024
+*       B_PRE                   256    // Enabling prefetching for B results in a performance decrease, so it is temporarily disabled.
+*
+*
+* Performance at Loongson 3A5000 2.5GHz with 5000x5000x5000:
+*       1 thread:       71.7 GFLOPS
+*       2 threads:     142.6 GFLOPS
+*       3 threads:     211.5 GFLOPS
+*       4 threads:     265.0 GFLOPS
+*********************************************************************/
+
+/* Function parameters */
+#define M      $r4   // param 1: bm
+#define N      $r5   // param 2: bn
+#define K      $r6   // param 3: bk
+#define ALPHA  $f0   // param 4: alpha
+#define A      $r7   // param 5: ba
+#define B      $r8   // param 6: bb
+#define C      $r9   // param 7: bc
+#define LDC    $r10  // param 8: ldc
+
+#ifdef TRMMKERNEL
+#define OFFSET $r11  // param 9: offset
+#endif
+#define OFF    $r12
+
+/* Cycle control parameters */
+#define I      $r13
+#define J      $r14
+#define L      $r15
+#define TL     $r16
+/* Matrix address */
+#define A0     $r17
+#define B0     $r18
+#define C0     $r19
+#define C1     $r20
+#define C2     $r23
+#define C3     $r24
+#define C4     $r25
+#define C5     $r26
+#define C6     $r27
+#define C7     $r28
+#define T0     $r29
+#define T1     $r30
+#undef  ZERO
+#define ZERO   $r0
+
+/* LASX Vectors
+ * Store 16 sets of 32-bit data in A using U0 and U1, with each register holding 8 values.
+ * Use X0 through X7 to store 8 sets of 32-bit data in B, with each register holding a broadcast value of a single data.
+ * Use D0 to D15 to store intermediate values of the computation.
+ * Use VALPHA to store the broadcast value of alpha
+ */
+#define U0     $xr0
+#define U1     $xr1
+#define X0     $xr2
+#define X1     $xr3
+#define X2     $xr4
+#define X3     $xr5
+#define X4     $xr6
+#define X5     $xr7
+#define X6     $xr8
+#define X7     $xr9
+#define D0     $xr10
+#define D1     $xr11
+#define D2     $xr12
+#define D3     $xr13
+#define D4     $xr14
+#define D5     $xr15
+#define D6     $xr16
+#define D7     $xr17
+#define D8     $xr18
+#define D9     $xr19
+#define D10    $xr20
+#define D11    $xr21
+#define D12    $xr22
+#define D13    $xr23
+#define D14    $xr24
+#define D15    $xr25
+#define VALPHA $xr26
+
+/* Prefetch interval */
+#define A_PRE  0x400
+#define B_PRE  0x100
+
+// Loops outline:
+// .L_N8  <--------------------------------------------------------------------------------------------    /* if N >> 3 == 0, goto .L_N7; else, enter .L_N8. */
+// |    .L_M16 <---------------------                                                                  |   /* if M >> 4 == 0, goto .L_M8; Otherwise, enter .L_M16. */
+// |    |  .L_M16_TL1               |                                                                  |
+// |    |  .L_M16_L7                | The entire core loop of the function, KERNEL16x8                 |
+// |    |  .L_M16_L71               |                                                                  |
+// |    |  .L_M16_L0 ----------------                                                                  |
+// |    .L_M8                                                                                          |
+// |    |  .L_M8_TL1                |                                                                  |
+// |    |  .L_M8_L7                 | KERNEL8x8                                                        |
+// |    |  .L_M8_L71                |                                                                  |
+// |    |  .L_M8_L0                 |                                                                  |
+// |    .L_M4                                                                                          |
+// |    |  .L_M4_TL1                |                                                                  |
+// |    |  .L_M4_L7                 | KERNEL4x8                                                        |
+// |    |  .L_M4_L71                |                                                                  |
+// |    |  .L_M4_L0                 |                                                                  |
+// |    .L_M2                                                                                          |
+// |    |  .L_M2_TL1                |                                                                  |
+// |    |  .L_M2_L7                 | KERNEL2x8                                                        |
+// |    |  .L_M2_L71                |                                                                  |
+// |    |  .L_M2_L0                 |                                                                  |
+// |    .L_M1                                                                                          |
+// |    |  .L_M1_TL1                |                                                                  |
+// |    |  .L_M1_L7                 | KERNEL1x8                                                        |
+// |    |  .L_M1_L71                |                                                                  |
+// |    |  .L_M1_L0                 |                                                                  |
+// |    .L_M0------------------------------------------------------------------------------------------
+// .L_N7 /* if N & 7 == 0, goto .L_N0; else, enter .L_N4 */
+// .L_N4
+// |    .L_N4_M16 <---------------------
+// |    |  .L_N4_M16_TL1               |
+// |    |  .L_N4_M16_L7                |  KERNEL16x4
+// |    |  .L_N4_M16_L71               |
+// |    |  .L_N4_M16_L0 ----------------
+// |    .L_N4_M8
+// |    |  .L_N4_M8_TL1                |
+// |    |  .L_N4_M8_L7                 | KERNEL8x4
+// |    |  .L_N4_M8_L71                |
+// |    |  .L_N4_M8_L0                 |
+// |    .L_N4_M4
+// |    |  .L_N4_M4_TL1                |
+// |    |  .L_N4_M4_L7                 | KERNEL4x4
+// |    |  .L_N4_M4_L71                |
+// |    |  .L_N4_M4_L0                 |
+// |    .L_N4_M2
+// |    |  .L_N4_M2_TL1                |
+// |    |  .L_N4_M2_L7                 | KERNEL2x4
+// |    |  .L_N4_M2_L71                |
+// |    |  .L_N4_M2_L0                 |
+// |    .L_N4_M1
+// |    |  .L_N4_M1_TL1                |
+// |    |  .L_N4_M1_L7                 | KERNEL1x4
+// |    |  .L_N4_M1_L71                |
+// |    |  .L_N4_M1_L0                 |
+// |    .L_N4_M0
+// .L_N3     /* if N & 2 == 0, goto .L_N1; else enter .L_N2 */
+// .L_N2
+// |    .L_N2_M16 <---------------------
+// |    |  .L_N2_M16_TL1               |
+// |    |  .L_N2_M16_L7                | KERNEL16x2
+// |    |  .L_N2_M16_L71               |
+// |    |  .L_N2_M16_L0 ----------------
+// |    .L_N2_M8
+// |    |  .L_N2_M8_TL1                 |
+// |    |  .L_N2_M8_L7                  | KERNEL8x2
+// |    |  .L_N2_M8_L71                 |
+// |    |  .L_N2_M8_L0                  |
+// |    .L_N2_M4
+// |    |  .L_N2_M4_TL1                 |
+// |    |  .L_N2_M4_L7                  | KERNEL4x2
+// |    |  .L_N2_M4_L71                 |
+// |    |  .L_N2_M4_L0                  |
+// |    .L_N2_M2
+// |    |  .L_N2_M2_TL1                 |
+// |    |  .L_N2_M2_L7                  | KERNEL2x2
+// |    |  .L_N2_M2_L71                 |
+// |    |  .L_N2_M2_L0                  |
+// |    .L_N2_M1
+// |    |  .L_N2_M1_TL1                 |
+// |    |  .L_N2_M1_L7                  | KERNEL1x2
+// |    |  .L_N2_M1_L71                 |
+// |    |  .L_N2_M1_L0                  |
+// |    .L_N2_M0
+// .L_N1
+// |    .L_N1_M16 <---------------------
+// |    |  .L_N1_M16_TL1               |
+// |    |  .L_N1_M16_L7                | KERNEL16x1
+// |    |  .L_N1_M16_L71               |
+// |    |  .L_N1_M16_L0 ----------------
+// |    .L_N1_M8
+// |    |  .L_N1_M8_TL1                |
+// |    |  .L_N1_M8_L7                 | KERNEL8x1
+// |    |  .L_N1_M8_L71                |
+// |    |  .L_N1_M8_L0                 |
+// |    .L_N1_M4
+// |    |  .L_N1_M4_TL1                |
+// |    |  .L_N1_M4_L7                 | KERNEL4x1
+// |    |  .L_N1_M4_L71                |
+// |    |  .L_N1_M4_L0                 |
+// |    .L_N1_M2
+// |    |  .L_N1_M2_TL1                |
+// |    |  .L_N1_M2_L7                 | KERNEL2x1
+// |    |  .L_N1_M2_L71                |
+// |    |  .L_N1_M2_L0                 |
+// |    .L_N1_M1
+// |    |  .L_N1_M1_TL1                |
+// |    |  .L_N1_M1_L7                 | KERNEL1x1
+// |    |  .L_N1_M1_L71                |
+// |    |  .L_N1_M1_L0                 |
+// |    .L_N1_M0
+// .L_N0
+
+/*************** sgemm_kernel_macros ***************/
+.macro KERNEL1x16x8_START // first K step of the 16x8 tile: D0-D15 are written with GMUL (no accumulate), C rows are prefetched
+    GLD xv, , U0, A0, 0x00, U1, A0, 0x20
+
+    GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C
+    GMUL xvf, s, D0, U0, X0, D1, U1, X0
+    preld   0,  C0, 0x00  // prefetch hint for the C0 output row, interleaved with the multiplies
+    GMUL xvf, s, D2, U0, X1, D3, U1, X1
+    preld   0,  C1, 0x00
+    GMUL xvf, s, D4, U0, X2, D5, U1, X2
+    preld   0,  C2, 0x00
+    GMUL xvf, s, D6, U0, X3, D7, U1, X3
+    preld   0,  C3, 0x00
+    GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C
+    GMUL xvf, s, D8,  U0, X4, D9,  U1, X4
+    preld   0,  C4, 0x00
+    GMUL xvf, s, D10, U0, X5, D11, U1, X5
+    preld   0,  C5, 0x00
+    GMUL xvf, s, D12, U0, X6, D13, U1, X6
+    preld   0,  C6, 0x00
+    GMUL xvf, s, D14, U0, X7, D15, U1, X7
+    preld   0,  C7, 0x00
+    PTR_ADDI   A0,     A0,     0x40  // A0 += 64 bytes: the 16 floats just loaded into U0/U1
+    PTR_ADDI   B0,     B0,     0x20  // B0 += 32 bytes: the 8 floats just broadcast into X0-X7
+.endm
+
+.macro KERNEL1x16x8 // one accumulating K step of the 16x8 tile: D0-D15 are updated from U0/U1 (A) and X0-X7 (B)
+    GLD xv, , U0, A0, 0x00, U1, A0, 0x20
+
+    GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C
+    GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1, \
+                  D2, U0, X1, D2, D3, U1, X1, D3
+    preld      0,   A0, A_PRE  // prefetch hint A_PRE bytes ahead in A
+    GMADD xvf, s, D4, U0, X2, D4, D5, U1, X2, D5, \
+                  D6, U0, X3, D6, D7, U1, X3, D7
+    GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C
+    GMADD xvf, s, D8,  U0, X4, D8,  D9,  U1, X4, D9,  \
+                  D10, U0, X5, D10, D11, U1, X5, D11
+    //preld      0,   B0, B_PRE
+    GMADD xvf, s, D12, U0, X6, D12, D13, U1, X6, D13, \
+                  D14, U0, X7, D14, D15, U1, X7, D15
+    PTR_ADDI   A0,     A0,     0x40  // A0 += 64 bytes: the 16 floats just loaded into U0/U1
+    PTR_ADDI   B0,     B0,     0x20  // B0 += 32 bytes: the 8 floats just broadcast into X0-X7
+.endm
+
+.macro KERNEL8x16x8 // eight consecutive K steps of the 16x8 tile, fully unrolled
+.rept 8
+    KERNEL1x16x8
+.endr
+.endm
+
+.macro SAVE16x8 // write the 16x8 accumulator tile D0-D15 to rows C0-C7, then advance each row pointer by 0x40 bytes
+#if defined(TRMMKERNEL) // TRMM build: result is D * VALPHA only; C is not read
+    GMUL xvf, s, D0,  D0,  VALPHA, D1,  D1,  VALPHA, D2,  D2,  VALPHA, D3,  D3,  VALPHA, \
+                 D4,  D4,  VALPHA, D5,  D5,  VALPHA, D6,  D6,  VALPHA, D7,  D7,  VALPHA, \
+                 D8,  D8,  VALPHA, D9,  D9,  VALPHA, D10, D10, VALPHA, D11, D11, VALPHA, \
+                 D12, D12, VALPHA, D13, D13, VALPHA, D14, D14, VALPHA, D15, D15, VALPHA
+#else // otherwise: existing C is loaded into X0-X7 (free after the K loop) and combined via GMADD, presumably D * VALPHA + C
+    /* Load C0  */
+    GLD xv, , X0, C0, 0x00, X1, C0, 0x20
+    GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1
+    /* Load C1  */
+    GLD xv, , X2, C1, 0x00, X3, C1, 0x20
+    GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3
+    /* Load C2  */
+    GLD xv, , X4, C2, 0x00, X5, C2, 0x20
+    GMADD xvf, s, D4, D4, VALPHA, X4, D5, D5, VALPHA, X5
+    /* Load C3  */
+    GLD xv, , X6, C3, 0x00, X7, C3, 0x20
+    GMADD xvf, s, D6, D6, VALPHA, X6, D7, D7, VALPHA, X7
+    /* Load C4  */
+    GLD xv, , X0, C4, 0x00, X1, C4, 0x20
+    GMADD xvf, s, D8, D8, VALPHA, X0, D9, D9, VALPHA, X1
+    /* Load C5  */
+    GLD xv, , X2, C5, 0x00, X3, C5, 0x20
+    GMADD xvf, s, D10, D10, VALPHA, X2, D11, D11, VALPHA, X3
+    /* Load C6  */
+    GLD xv, , X4, C6, 0x00, X5, C6, 0x20
+    GMADD xvf, s, D12, D12, VALPHA, X4, D13, D13, VALPHA, X5
+    /* Load C7  */
+    GLD xv, , X6, C7, 0x00, X7, C7, 0x20
+    GMADD xvf, s, D14, D14, VALPHA, X6, D15, D15, VALPHA, X7
+#endif // #if defined(TRMMKERNEL)
+    GST xv, , D0,  C0, 0x00, D1,  C0, 0x20, \
+              D2,  C1, 0x00, D3,  C1, 0x20, \
+              D4,  C2, 0x00, D5,  C2, 0x20, \
+              D6,  C3, 0x00, D7,  C3, 0x20, \
+              D8,  C4, 0x00, D9,  C4, 0x20, \
+              D10, C5, 0x00, D11, C5, 0x20, \
+              D12, C6, 0x00, D13, C6, 0x20, \
+              D14, C7, 0x00, D15, C7, 0x20
+#if __loongarch_grlen == 64 // pointer bump uses the d suffix for 64-bit GPRs
+    GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \
+               C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40
+#elif __loongarch_grlen == 32 // w suffix for 32-bit GPRs
+    GADDI , w, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \
+               C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40
+#else // any other GRLEN value: same text as the 64-bit branch
+    GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \
+               C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40
+#endif
+.endm
+
+// m = 8, 4, 2, 1
+// stride = 0x20, 0x10, 0x08, 0x04
+.macro KERNEL1xMx8_START m, stride // first K step of an m x 8 tile (m = 8, 4, 2, 1): even D registers are written with GMUL
+.if \m == 8 // 8 floats: full 256-bit load into U0
+    GLD xv, , U0, A0, 0x00
+.elseif \m == 4 // 4 floats: 128-bit load into $vr0, the low half of U0 ($xr0)
+    GLD v, , $vr0, A0, 0x00
+.elseif \m ==2 // 2 floats: 64-bit load into $f0, the low lanes of U0
+    GLD f, d, $f0, A0, 0x00
+.elseif \m ==1 // 1 float: 32-bit load into $f0, lane 0 of U0
+    GLD f, s, $f0, A0, 0x00
+.endif
+    GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C
+    GMUL xvf, s, D0, U0, X0, D2, U0, X1, \
+                 D4, U0, X2, D6, U0, X3
+    GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C
+    GMUL xvf, s, D8,  U0, X4, D10, U0, X5, \
+                 D12, U0, X6, D14, U0, X7
+   PTR_ADDI   A0,     A0,     \stride
+   PTR_ADDI   B0,     B0,     0x20
+.endm
+
+.macro KERNEL1xMx8 m, stride // one accumulating K step of an m x 8 tile (m = 8, 4, 2, 1) into the even D registers
+.if \m == 8 // 8 floats: full 256-bit load into U0
+    GLD xv, , U0, A0, 0x00
+.elseif \m == 4 // 4 floats: 128-bit load into $vr0, the low half of U0 ($xr0)
+    GLD v, , $vr0, A0, 0x00
+.elseif \m ==2 // 2 floats: 64-bit load into $f0, the low lanes of U0
+    GLD f, d, $f0, A0, 0x00
+.elseif \m ==1 // 1 float: 32-bit load into $f0, lane 0 of U0
+    GLD f, s, $f0, A0, 0x00
+.endif
+
+    GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C
+    GMADD xvf, s, D0, U0, X0, D0, D2, U0, X1, D2, \
+                  D4, U0, X2, D4, D6, U0, X3, D6
+    GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C
+    GMADD xvf, s, D8,  U0, X4, D8,  D10, U0, X5, D10, \
+                  D12, U0, X6, D12, D14, U0, X7, D14
+   PTR_ADDI   A0,     A0,     \stride
+   PTR_ADDI   B0,     B0,     0x20
+.endm
+
+.macro KERNEL8xMx8 m, stride // eight consecutive K steps of the m x 8 tile, fully unrolled
+.rept 8
+    KERNEL1xMx8 \m, \stride
+.endr
+.endm
+
+.macro SAVEMx8 m, stride // write the m x 8 tile held in D0,D2,...,D14 to rows C0-C7, then advance each row pointer by stride bytes
+#if defined(TRMMKERNEL) // TRMM build: result is D * VALPHA only; C is not read
+    GMUL xvf, s, D0,  D0,  VALPHA, D2,  D2,  VALPHA, \
+                 D4,  D4,  VALPHA, D6,  D6,  VALPHA, \
+                 D8,  D8,  VALPHA, D10, D10, VALPHA, \
+                 D12, D12, VALPHA, D14, D14, VALPHA
+#else // otherwise: existing C is loaded at width m and combined via GMADD, presumably D * VALPHA + C
+    /* Load C0, C1, C2, C3, C4, C5, C6, C7 */
+ .if \m == 8
+    GLD xv, , X0, C0, 0x00, X2, C1, 0x00, X4, C2, 0x00, X6, C3, 0x00
+ .elseif \m == 4 // $vr2/$vr4/$vr6/$vr8 are the low 128 bits of X0/X2/X4/X6
+    GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00, $vr6, C2, 0x00, $vr8, C3, 0x00
+.elseif \m == 2 // $f2/$f4/$f6/$f8 are the low 64 bits of X0/X2/X4/X6
+    GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00
+.elseif \m == 1
+    GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00
+ .endif
+    GMADD xvf, s, D0, D0, VALPHA, X0, D2, D2, VALPHA, X2, \
+                  D4, D4, VALPHA, X4, D6, D6, VALPHA, X6
+.if \m == 8
+    GLD xv, , X0, C4, 0x00, X2, C5, 0x00, X4, C6, 0x00, X6, C7, 0x00
+.elseif \m == 4
+    GLD v, , $vr2, C4, 0x00, $vr4, C5, 0x00, $vr6, C6, 0x00, $vr8, C7, 0x00
+.elseif \m == 2
+    GLD f, d, $f2, C4, 0x00, $f4, C5, 0x00, $f6, C6, 0x00, $f8, C7, 0x00
+.elseif \m == 1
+    GLD f, s, $f2, C4, 0x00, $f4, C5, 0x00, $f6, C6, 0x00, $f8, C7, 0x00
+.endif
+    GMADD xvf, s, D8,  D8,  VALPHA, X0, D10, D10, VALPHA, X2, \
+                  D12, D12, VALPHA, X4, D14, D14, VALPHA, X6
+#endif // #if defined(TRMMKERNEL)
+.if \m == 8 // store width matches m; only the low m lanes of each D register are written to C
+    GST xv, , D0,  C0, 0x00, D2,  C1, 0x00, \
+              D4,  C2, 0x00, D6,  C3, 0x00, \
+              D8,  C4, 0x00, D10, C5, 0x00, \
+              D12, C6, 0x00, D14, C7, 0x00
+.elseif \m == 4 // $vr10..$vr24 (even) are the low 128 bits of D0..D14 (even)
+    GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00, \
+             $vr14, C2, 0x00, $vr16, C3, 0x00, \
+             $vr18, C4, 0x00, $vr20, C5, 0x00, \
+             $vr22, C6, 0x00, $vr24, C7, 0x00
+.elseif \m == 2 // $f10..$f24 (even) are the low 64 bits of D0..D14 (even)
+    GST f, d, $f10, C0, 0x00, $f12, C1, 0x00, \
+              $f14, C2, 0x00, $f16, C3, 0x00, \
+              $f18, C4, 0x00, $f20, C5, 0x00, \
+              $f22, C6, 0x00, $f24, C7, 0x00
+.elseif \m == 1
+    GST f, s, $f10, C0, 0x00, $f12, C1, 0x00, \
+              $f14, C2, 0x00, $f16, C3, 0x00, \
+              $f18, C4, 0x00, $f20, C5, 0x00, \
+              $f22, C6, 0x00, $f24, C7, 0x00
+.endif
+#if __loongarch_grlen == 64 // pointer bump uses the d suffix for 64-bit GPRs
+    GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \
+               C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride
+#elif __loongarch_grlen == 32 // w suffix for 32-bit GPRs
+    GADDI , w, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \
+               C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride
+#else // any other GRLEN value: same text as the 64-bit branch
+    GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \
+               C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride
+#endif
+.endm
+
+/*
+ * KERNEL1x16x4_START - first K step of a 16 x 4 tile.
+ * Loads 0x40 bytes of A into U0/U1 and four 32-bit B values into X0..X3,
+ * then writes the products into D0..D7 with GMUL. The accumulators are
+ * outputs only here, so no prior clearing is needed. A0 advances by
+ * 0x40 bytes and B0 by 0x10 bytes.
+ */
+.macro KERNEL1x16x4_START
+    GLD xv, , U0, A0, 0x00
+    GLD xv, , U1, A0, 0x20
+    GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04
+    GLDREPL xv, w, X2, B0, 0x08, X3, B0, 0x0C
+    /* One B value per line: low half of A, then high half */
+    GMUL xvf, s, D0, U0, X0, D1, U1, X0
+    GMUL xvf, s, D2, U0, X1, D3, U1, X1
+    GMUL xvf, s, D4, U0, X2, D5, U1, X2
+    GMUL xvf, s, D6, U0, X3, D7, U1, X3
+    PTR_ADDI   A0,     A0,     0x40
+    PTR_ADDI   B0,     B0,     0x10
+.endm
+
+/*
+ * KERNEL1x16x4 - one accumulating K step of a 16 x 4 tile.
+ * Loads 0x40 bytes of A into U0/U1 and four 32-bit B values into X0..X3,
+ * then updates D0..D7 through GMADD (presumably D = U * X + D).
+ * A0 advances by 0x40 bytes and B0 by 0x10 bytes.
+ */
+.macro KERNEL1x16x4
+    GLD xv, , U0, A0, 0x00, U1, A0, 0x20
+
+    GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C
+    /* Every operand is comma separated. The last group used to read
+     * "X3  D7" and only assembled because gas also splits macro
+     * arguments on whitespace. */
+    GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1, \
+                  D2, U0, X1, D2, D3, U1, X1, D3, \
+                  D4, U0, X2, D4, D5, U1, X2, D5, \
+                  D6, U0, X3, D6, D7, U1, X3, D7
+    PTR_ADDI   A0,     A0,     0x40
+    PTR_ADDI   B0,     B0,     0x10
+.endm
+
+/*
+ * KERNEL8x16x4 - emits KERNEL1x16x4 eight times in a row
+ * (eight K steps per expansion).
+ */
+.macro KERNEL8x16x4
+.rept 8
+    KERNEL1x16x4
+.endr
+.endm
+
+/*
+ * SAVE16x4 - write back the 16 x 4 tile held in D0..D7.
+ * With TRMMKERNEL defined the accumulators are only scaled by VALPHA.
+ * Otherwise each C column (two vectors at offsets 0x00 and 0x20) is
+ * loaded into a scratch pair and folded in through GMADD (presumably
+ * D = D * VALPHA + C). After the stores C0..C3 advance by 0x40 bytes.
+ */
+.macro SAVE16x4
+#if defined(TRMMKERNEL)
+    GMUL xvf, s, D0,  D0,  VALPHA, D1,  D1,  VALPHA
+    GMUL xvf, s, D2,  D2,  VALPHA, D3,  D3,  VALPHA
+    GMUL xvf, s, D4,  D4,  VALPHA, D5,  D5,  VALPHA
+    GMUL xvf, s, D6,  D6,  VALPHA, D7,  D7,  VALPHA
+#else
+    /* Column C0 */
+    GLD xv, , X0, C0, 0x00, X1, C0, 0x20
+    GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1
+    /* Column C1 */
+    GLD xv, , X2, C1, 0x00, X3, C1, 0x20
+    GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3
+    /* Column C2 */
+    GLD xv, , X4, C2, 0x00, X5, C2, 0x20
+    GMADD xvf, s, D4, D4, VALPHA, X4, D5, D5, VALPHA, X5
+    /* Column C3 */
+    GLD xv, , X6, C3, 0x00, X7, C3, 0x20
+    GMADD xvf, s, D6, D6, VALPHA, X6, D7, D7, VALPHA, X7
+#endif /* TRMMKERNEL */
+    GST xv, , D0,  C0, 0x00, D1,  C0, 0x20
+    GST xv, , D2,  C1, 0x00, D3,  C1, 0x20
+    GST xv, , D4,  C2, 0x00, D5,  C2, 0x20
+    GST xv, , D6,  C3, 0x00, D7,  C3, 0x20
+    /* Bump the column pointers; the w form is used only when GRLEN is 32 */
+#if __loongarch_grlen == 32
+    GADDI , w, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40
+#else
+    GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40
+#endif
+.endm
+
+/*
+ * KERNEL1xMx4_START m, stride - first K step of an M x 4 tile.
+ *   m      : 8, 4, 2 or 1 A values consumed per step
+ *   stride : 0x20, 0x10, 0x08 or 0x04 respectively (bytes added to A0)
+ * Four 32-bit B values go into X0..X3 and the products with U0 are
+ * written to D0, D2, D4, D6 with GMUL; B0 advances by 0x10 bytes.
+ * NOTE(review): the m = 4/2/1 loads name $vr0/$f0 while the arithmetic
+ * reads U0, so U0 is presumably $xr0 - confirm against the aliases.
+ */
+.macro KERNEL1xMx4_START m, stride
+.if \m == 8
+    GLD xv, , U0, A0, 0x00
+.elseif \m == 4
+    GLD v, , $vr0, A0, 0x00
+.elseif \m == 2
+    GLD f, d, $f0, A0, 0x00
+.elseif \m == 1
+    GLD f, s, $f0, A0, 0x00
+.endif
+    GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04
+    GLDREPL xv, w, X2, B0, 0x08, X3, B0, 0x0C
+    GMUL xvf, s, D0, U0, X0
+    GMUL xvf, s, D2, U0, X1
+    GMUL xvf, s, D4, U0, X2
+    GMUL xvf, s, D6, U0, X3
+    PTR_ADDI   A0,     A0,     \stride
+    PTR_ADDI   B0,     B0,     0x10
+.endm
+
+/*
+ * KERNEL1xMx4 m, stride - one accumulating K step of an M x 4 tile.
+ *   m      : number of A values consumed per step (8, 4, 2 or 1)
+ *   stride : byte increment applied to A0 (call sites pass 4 * m)
+ * Four 32-bit B values go into X0..X3 and are combined with U0 into
+ * D0, D2, D4, D6 through GMADD (presumably D = U0 * X + D); B0 advances
+ * by 0x10 bytes.
+ */
+.macro KERNEL1xMx4 m, stride
+.if \m == 8
+    GLD xv, , U0, A0, 0x00
+.elseif \m == 4
+    GLD v, , $vr0, A0, 0x00
+.elseif \m == 2
+    GLD f, d, $f0, A0, 0x00
+.elseif \m == 1
+    GLD f, s, $f0, A0, 0x00
+.endif
+    GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04
+    GLDREPL xv, w, X2, B0, 0x08, X3, B0, 0x0C
+    GMADD xvf, s, D0, U0, X0, D0
+    GMADD xvf, s, D2, U0, X1, D2
+    GMADD xvf, s, D4, U0, X2, D4
+    GMADD xvf, s, D6, U0, X3, D6
+    PTR_ADDI   A0,     A0,     \stride
+    PTR_ADDI   B0,     B0,     0x10
+.endm
+
+/*
+ * KERNEL8xMx4 m, stride - emits KERNEL1xMx4 eight times in a row,
+ * forwarding m and stride unchanged (eight K steps per expansion).
+ */
+.macro KERNEL8xMx4 m, stride
+.rept 8
+    KERNEL1xMx4 \m, \stride
+.endr
+.endm
+
+/*
+ * SAVEMx4 m, stride - write back the M x 4 tile held in D0, D2, D4, D6.
+ *   m      : values stored through each of C0..C3 (8, 4, 2 or 1)
+ *   stride : byte increment applied to every C pointer afterwards
+ * With TRMMKERNEL defined the accumulators are only scaled by VALPHA.
+ * Otherwise the current C values are loaded first and folded in through
+ * GMADD (presumably D = D * VALPHA + C).
+ * NOTE(review): the narrow loads/stores name $vr2.., $f2.. and $vr10..,
+ * $f10.. directly; presumably X0 is $xr2 and D0 is $xr10 - confirm.
+ */
+.macro SAVEMx4 m, stride
+#if defined(TRMMKERNEL)
+    GMUL xvf, s, D0,  D0,  VALPHA, D2,  D2,  VALPHA
+    GMUL xvf, s, D4,  D4,  VALPHA, D6,  D6,  VALPHA
+#else
+    /* Columns C0..C3 */
+.if \m == 8
+    GLD xv, , X0, C0, 0x00, X2, C1, 0x00, X4, C2, 0x00, X6, C3, 0x00
+.elseif \m == 4
+    GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00, $vr6, C2, 0x00, $vr8, C3, 0x00
+.elseif \m == 2
+    GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00
+.elseif \m == 1
+    GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00
+.endif
+    GMADD xvf, s, D0, D0, VALPHA, X0, D2, D2, VALPHA, X2
+    GMADD xvf, s, D4, D4, VALPHA, X4, D6, D6, VALPHA, X6
+#endif /* TRMMKERNEL */
+    /* Store with the width that matches m */
+.if \m == 8
+    GST xv, , D0,  C0, 0x00, D2,  C1, 0x00
+    GST xv, , D4,  C2, 0x00, D6,  C3, 0x00
+.elseif \m == 4
+    GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00
+    GST v, , $vr14, C2, 0x00, $vr16, C3, 0x00
+.elseif \m == 2
+    GST f, d, $f10, C0, 0x00, $f12, C1, 0x00
+    GST f, d, $f14, C2, 0x00, $f16, C3, 0x00
+.elseif \m == 1
+    GST f, s, $f10, C0, 0x00, $f12, C1, 0x00
+    GST f, s, $f14, C2, 0x00, $f16, C3, 0x00
+.endif
+    /* Bump the column pointers; the w form is used only when GRLEN is 32 */
+#if __loongarch_grlen == 32
+    GADDI , w, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride
+#else
+    GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride
+#endif
+.endm
+
+/*
+ * KERNEL1x16x2_START - first K step of a 16 x 2 tile.
+ * Loads 0x40 bytes of A into U0/U1 and two 32-bit B values into X0/X1,
+ * then writes the products into D0..D3 with GMUL. A0 advances by 0x40
+ * bytes and B0 by 0x08 bytes.
+ */
+.macro KERNEL1x16x2_START
+    GLD xv, , U0, A0, 0x00
+    GLD xv, , U1, A0, 0x20
+    GLDREPL xv, w, X0, B0, 0x00
+    GLDREPL xv, w, X1, B0, 0x04
+    GMUL xvf, s, D0, U0, X0, D1, U1, X0
+    GMUL xvf, s, D2, U0, X1, D3, U1, X1
+    PTR_ADDI   A0,     A0,     0x40
+    PTR_ADDI   B0,     B0,     0x08
+.endm
+
+/*
+ * KERNEL1x16x2 - one accumulating K step of a 16 x 2 tile.
+ * Loads 0x40 bytes of A into U0/U1 and two 32-bit B values into X0/X1,
+ * then updates D0..D3 through GMADD (presumably D = U * X + D).
+ * A0 advances by 0x40 bytes and B0 by 0x08 bytes.
+ */
+.macro KERNEL1x16x2
+    GLD xv, , U0, A0, 0x00
+    GLD xv, , U1, A0, 0x20
+    GLDREPL xv, w, X0, B0, 0x00
+    GLDREPL xv, w, X1, B0, 0x04
+    GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1
+    GMADD xvf, s, D2, U0, X1, D2, D3, U1, X1, D3
+    PTR_ADDI   A0,     A0,     0x40
+    PTR_ADDI   B0,     B0,     0x08
+.endm
+
+/*
+ * KERNEL8x16x2 - emits KERNEL1x16x2 eight times in a row
+ * (eight K steps per expansion).
+ */
+.macro KERNEL8x16x2
+.rept 8
+    KERNEL1x16x2
+.endr
+.endm
+
+/*
+ * SAVE16x2 - write back the 16 x 2 tile held in D0..D3.
+ * With TRMMKERNEL defined the accumulators are only scaled by VALPHA.
+ * Otherwise each C column (two vectors at offsets 0x00 and 0x20) is
+ * loaded and folded in through GMADD (presumably D = D * VALPHA + C).
+ * After the stores C0 and C1 advance by 0x40 bytes.
+ */
+.macro SAVE16x2
+#if defined(TRMMKERNEL)
+    GMUL xvf, s, D0,  D0,  VALPHA, D1,  D1,  VALPHA
+    GMUL xvf, s, D2,  D2,  VALPHA, D3,  D3,  VALPHA
+#else
+    /* Column C0 */
+    GLD xv, , X0, C0, 0x00, X1, C0, 0x20
+    GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1
+    /* Column C1 */
+    GLD xv, , X2, C1, 0x00, X3, C1, 0x20
+    GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3
+#endif /* TRMMKERNEL */
+    GST xv, , D0,  C0, 0x00, D1,  C0, 0x20
+    GST xv, , D2,  C1, 0x00, D3,  C1, 0x20
+    /* Bump the column pointers; the w form is used only when GRLEN is 32 */
+#if __loongarch_grlen == 32
+    GADDI , w, C0, C0, 0x40, C1, C1, 0x40
+#else
+    GADDI , d, C0, C0, 0x40, C1, C1, 0x40
+#endif
+.endm
+
+/*
+ * KERNEL1xMx2_START m, stride - first K step of an M x 2 tile.
+ *   m      : 8, 4, 2 or 1 A values consumed per step
+ *   stride : 0x20, 0x10, 0x08 or 0x04 respectively (bytes added to A0)
+ * Two 32-bit B values go into X0/X1 and the products with U0 are written
+ * to D0 and D2 with GMUL; B0 advances by 0x08 bytes.
+ * NOTE(review): the m = 4/2/1 loads name $vr0/$f0 while the arithmetic
+ * reads U0, so U0 is presumably $xr0 - confirm against the aliases.
+ */
+.macro KERNEL1xMx2_START m, stride
+.if \m == 8
+    GLD xv, , U0, A0, 0x00
+.elseif \m == 4
+    GLD v, , $vr0, A0, 0x00
+.elseif \m == 2
+    GLD f, d, $f0, A0, 0x00
+.elseif \m == 1
+    GLD f, s, $f0, A0, 0x00
+.endif
+    GLDREPL xv, w, X0, B0, 0x00
+    GLDREPL xv, w, X1, B0, 0x04
+    GMUL xvf, s, D0, U0, X0
+    GMUL xvf, s, D2, U0, X1
+    PTR_ADDI   A0,     A0,     \stride
+    PTR_ADDI   B0,     B0,     0x08
+.endm
+
+/*
+ * KERNEL1xMx2 m, stride - one accumulating K step of an M x 2 tile.
+ *   m      : number of A values consumed per step (8, 4, 2 or 1)
+ *   stride : byte increment applied to A0 (call sites pass 4 * m)
+ * Two 32-bit B values go into X0/X1 and are combined with U0 into D0 and
+ * D2 through GMADD (presumably D = U0 * X + D); B0 advances by 0x08 bytes.
+ */
+.macro KERNEL1xMx2 m, stride
+.if \m == 8
+    GLD xv, , U0, A0, 0x00
+.elseif \m == 4
+    GLD v, , $vr0, A0, 0x00
+.elseif \m == 2
+    GLD f, d, $f0, A0, 0x00
+.elseif \m == 1
+    GLD f, s, $f0, A0, 0x00
+.endif
+    GLDREPL xv, w, X0, B0, 0x00
+    GLDREPL xv, w, X1, B0, 0x04
+    GMADD xvf, s, D0, U0, X0, D0
+    GMADD xvf, s, D2, U0, X1, D2
+    PTR_ADDI   A0,     A0,     \stride
+    PTR_ADDI   B0,     B0,     0x08
+.endm
+
+/*
+ * KERNEL8xMx2 m, stride - emits KERNEL1xMx2 eight times in a row,
+ * forwarding m and stride unchanged (eight K steps per expansion).
+ */
+.macro KERNEL8xMx2 m, stride
+.rept 8
+    KERNEL1xMx2 \m, \stride
+.endr
+.endm
+
+/*
+ * SAVEMx2 m, stride - write back the M x 2 tile held in D0 and D2.
+ *   m      : values stored through each of C0/C1 (8, 4, 2 or 1)
+ *   stride : byte increment applied to both C pointers afterwards
+ * With TRMMKERNEL defined the accumulators are only scaled by VALPHA.
+ * Otherwise the current C values are loaded first and folded in through
+ * GMADD (presumably D = D * VALPHA + C).
+ * NOTE(review): the narrow loads/stores name $vr2/$vr4, $f2/$f4 and
+ * $vr10/$vr12, $f10/$f12 directly; presumably X0 is $xr2 and D0 is
+ * $xr10 - confirm against the aliases.
+ */
+.macro SAVEMx2 m, stride
+#if defined(TRMMKERNEL)
+    GMUL xvf, s, D0,  D0,  VALPHA
+    GMUL xvf, s, D2,  D2,  VALPHA
+#else
+    /* Columns C0 and C1 */
+.if \m == 8
+    GLD xv, , X0, C0, 0x00, X2, C1, 0x00
+.elseif \m == 4
+    GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00
+.elseif \m == 2
+    GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00
+.elseif \m == 1
+    GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00
+.endif
+    GMADD xvf, s, D0, D0, VALPHA, X0
+    GMADD xvf, s, D2, D2, VALPHA, X2
+#endif /* TRMMKERNEL */
+    /* Store with the width that matches m */
+.if \m == 8
+    GST xv, , D0,  C0, 0x00, D2,  C1, 0x00
+.elseif \m == 4
+    GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00
+.elseif \m == 2
+    GST f, d, $f10, C0, 0x00, $f12, C1, 0x00
+.elseif \m == 1
+    GST f, s, $f10, C0, 0x00, $f12, C1, 0x00
+.endif
+    /* Bump the column pointers; the w form is used only when GRLEN is 32 */
+#if __loongarch_grlen == 32
+    GADDI , w, C0, C0, \stride, C1, C1, \stride
+#else
+    GADDI , d, C0, C0, \stride, C1, C1, \stride
+#endif
+.endm
+
+/*
+ * KERNEL1x16x1_START - first K step of a 16 x 1 tile.
+ * Loads 0x40 bytes of A into U0/U1 and one 32-bit B value into X0, then
+ * writes the products into D0/D1 with GMUL. A0 advances by 0x40 bytes
+ * and B0 by 0x04 bytes.
+ */
+.macro KERNEL1x16x1_START
+    GLD xv, , U0, A0, 0x00
+    GLD xv, , U1, A0, 0x20
+    GLDREPL xv, w, X0, B0, 0x00
+    GMUL xvf, s, D0, U0, X0
+    GMUL xvf, s, D1, U1, X0
+    PTR_ADDI   A0,     A0,     0x40
+    PTR_ADDI   B0,     B0,     0x04
+.endm
+
+/*
+ * KERNEL1x16x1 - one accumulating K step of a 16 x 1 tile.
+ * Loads 0x40 bytes of A into U0/U1 and one 32-bit B value into X0, then
+ * updates D0/D1 through GMADD (presumably D = U * X0 + D). A0 advances
+ * by 0x40 bytes and B0 by 0x04 bytes.
+ */
+.macro KERNEL1x16x1
+    GLD xv, , U0, A0, 0x00
+    GLD xv, , U1, A0, 0x20
+    GLDREPL xv, w, X0, B0, 0x00
+    GMADD xvf, s, D0, U0, X0, D0
+    GMADD xvf, s, D1, U1, X0, D1
+    PTR_ADDI   A0,     A0,     0x40
+    PTR_ADDI   B0,     B0,     0x04
+.endm
+
+/*
+ * KERNEL8x16x1 - emits KERNEL1x16x1 eight times in a row
+ * (eight K steps per expansion).
+ */
+.macro KERNEL8x16x1
+.rept 8
+    KERNEL1x16x1
+.endr
+.endm
+
+/*
+ * SAVE16x1 - write back the 16 x 1 tile held in D0/D1.
+ * With TRMMKERNEL defined the accumulators are only scaled by VALPHA.
+ * Otherwise the two C vectors at C0 + 0x00 and C0 + 0x20 are loaded and
+ * folded in through GMADD (presumably D = D * VALPHA + C). After the
+ * stores C0 advances by 0x40 bytes.
+ */
+.macro SAVE16x1
+#if defined(TRMMKERNEL)
+    GMUL xvf, s, D0,  D0,  VALPHA
+    GMUL xvf, s, D1,  D1,  VALPHA
+#else
+    /* Column C0 */
+    GLD xv, , X0, C0, 0x00
+    GLD xv, , X1, C0, 0x20
+    GMADD xvf, s, D0, D0, VALPHA, X0
+    GMADD xvf, s, D1, D1, VALPHA, X1
+#endif /* TRMMKERNEL */
+    GST xv, , D0,  C0, 0x00
+    GST xv, , D1,  C0, 0x20
+    /* Bump the column pointer; the w form is used only when GRLEN is 32 */
+#if __loongarch_grlen == 32
+    GADDI , w, C0, C0, 0x40
+#else
+    GADDI , d, C0, C0, 0x40
+#endif
+.endm
+
+/*
+ * KERNEL1xMx1_START m, stride - first K step of an M x 1 tile.
+ *   m      : 8, 4, 2 or 1 A values consumed per step
+ *   stride : 0x20, 0x10, 0x08 or 0x04 respectively (bytes added to A0)
+ * One 32-bit B value goes into X0 and the product with U0 is written to
+ * D0 with GMUL; B0 advances by 0x04 bytes.
+ * NOTE(review): the m = 4/2/1 loads name $vr0/$f0 while the arithmetic
+ * reads U0, so U0 is presumably $xr0 - confirm against the aliases.
+ */
+.macro KERNEL1xMx1_START m, stride
+.if \m == 8
+    GLD xv, , U0, A0, 0x00
+.elseif \m == 4
+    GLD v, , $vr0, A0, 0x00
+.elseif \m == 2
+    GLD f, d, $f0, A0, 0x00
+.elseif \m == 1
+    GLD f, s, $f0, A0, 0x00
+.endif
+    /* Single B value for this step */
+    GLDREPL xv, w, X0, B0, 0x00
+    GMUL xvf, s, D0, U0, X0
+    /* Step both packed-panel pointers */
+    PTR_ADDI   A0,     A0,     \stride
+    PTR_ADDI   B0,     B0,     0x04
+.endm
+
+/*
+ * KERNEL1xMx1 m, stride - one accumulating K step of an M x 1 tile.
+ *   m      : number of A values consumed per step (8, 4, 2 or 1)
+ *   stride : byte increment applied to A0 (call sites pass 4 * m)
+ * One 32-bit B value goes into X0 and is combined with U0 into D0
+ * through GMADD (presumably D0 = U0 * X0 + D0); B0 advances by 0x04
+ * bytes.
+ */
+.macro KERNEL1xMx1 m, stride
+.if \m == 8
+    GLD xv, , U0, A0, 0x00
+.elseif \m == 4
+    GLD v, , $vr0, A0, 0x00
+.elseif \m == 2
+    GLD f, d, $f0, A0, 0x00
+.elseif \m == 1
+    GLD f, s, $f0, A0, 0x00
+.endif
+    /* Single B value for this step */
+    GLDREPL xv, w, X0, B0, 0x00
+    GMADD xvf, s, D0, U0, X0, D0
+    /* Step both packed-panel pointers */
+    PTR_ADDI   A0,     A0,     \stride
+    PTR_ADDI   B0,     B0,     0x04
+.endm
+
+/*
+ * KERNEL8xMx1 m, stride - emits KERNEL1xMx1 eight times in a row,
+ * forwarding m and stride unchanged (eight K steps per expansion).
+ */
+.macro KERNEL8xMx1 m, stride
+.rept 8
+    KERNEL1xMx1 \m, \stride
+.endr
+.endm
+
+/*
+ * SAVEMx1 m, stride - write back the M x 1 tile held in D0.
+ *   m      : values stored through C0 (8, 4, 2 or 1)
+ *   stride : byte increment applied to C0 afterwards
+ * With TRMMKERNEL defined D0 is only scaled by VALPHA. Otherwise the
+ * current C values are loaded first and folded in through GMADD
+ * (presumably D0 = D0 * VALPHA + C).
+ * NOTE(review): the narrow loads/stores name $vr2/$f2 and $vr10/$f10
+ * directly; presumably X0 is $xr2 and D0 is $xr10 - confirm against
+ * the aliases.
+ */
+.macro SAVEMx1 m, stride
+#if defined(TRMMKERNEL)
+    GMUL xvf, s, D0,  D0,  VALPHA
+#else
+    /* Column C0 (the only column of this tile) */
+.if \m == 8
+    GLD xv, , X0, C0, 0x00
+.elseif \m == 4
+    GLD v, , $vr2, C0, 0x00
+.elseif \m == 2
+    GLD f, d, $f2, C0, 0x00
+.elseif \m == 1
+    GLD f, s, $f2, C0, 0x00
+.endif
+    GMADD xvf, s, D0, D0, VALPHA, X0
+#endif /* TRMMKERNEL */
+    /* Store with the width that matches m */
+.if \m == 8
+    GST xv, , D0,  C0, 0x00
+.elseif \m == 4
+    GST v, , $vr10, C0, 0x00
+.elseif \m == 2
+    GST f, d, $f10, C0, 0x00
+.elseif \m == 1
+    GST f, s, $f10, C0, 0x00
+.endif
+    /* Bump the column pointer; the w form is used only when GRLEN is 32 */
+#if __loongarch_grlen == 32
+    GADDI , w, C0, C0, \stride
+#else
+    GADDI , d, C0, C0, \stride
+#endif
+.endm
+
+    PROLOGUE
+    push_if_used 26, 32
+    xvreplve0.w   VALPHA,     $xr0
+#if defined (TRMMKERNEL) && !defined(LEFT)
+    PTR_SUB   OFF,   ZERO,  OFFSET
+#else
+    xor     OFF,   OFF,   OFF
+#endif
+    /* if (!(N >> 3)) goto L_N7 */
+    PTR_SRAI   J,     N,     3     /* J = bn >> 3 */
+    andi     N,     N,     0x07
+    beq      ZERO,  J,     .L_N7
+.L_N8: /* J -- */
+    move     C0,    C
+    move     A0,    A
+    PTR_SLLI   T0,    LDC,   2
+    PTR_ADDI   J,     J,     -1   /* J-- */
+#if __loongarch_grlen == 64
+    GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \
+              C6, C5, T0, C7, C6, T0
+#elif __loongarch_grlen == 32
+    GADD , w, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \
+              C6, C5, T0, C7, C6, T0
+#else
+    GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \
+              C6, C5, T0, C7, C6, T0
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move     OFF,   OFFSET
+#endif
+    /* if (!(M >> 4)) goto L_M8 */
+    PTR_SRAI   I,     M,     4     /* I = bm >> 4 */
+    beq      ZERO,  I,     .L_M8
+.align 5
+.L_M16: /* I-- */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x06
+    PTR_ADD    A0,    A0,   T0 /* A0 += 16 * OFF */
+    PTR_SLLI   T0,    OFF,  0x05
+    PTR_ADD    B0,    B,    T0 /* B0 = B + 8 * OFF */
+#endif
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  16
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  8
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1x16x8_START
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_M16_L7 */
+    beq       ZERO,TL, .L_M16_L7
+.align 5
+.L_M16_TL1:
+    KERNEL8x16x8
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_M16_TL1
+.L_M16_L7:
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_M16_L0
+.align 5
+.L_M16_L71:
+    KERNEL1x16x8
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_M16_L71
+.L_M16_L0:
+    SAVE16x8
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    /* number of values  in A */
+    PTR_ADDI    L,   L,   -16
+#else
+    /* number of values in B */
+    PTR_ADDI    L,   L,   -8
+#endif
+    PTR_SLLI    T0,  L,  0x06
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x05
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    PTR_ADDI    OFF, OFF, 0x10 /* number of values in A */
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    PTR_ADDI    I,   I,   -1  /* I-- */
+    blt       ZERO,I,   .L_M16
+.L_M8:
+    /* All full 16-row blocks (M >> 4) are done; handle the M & 15 remainder in pieces of 8/4/2/1 */
+    andi      I,   M,   15
+    beq       ZERO,I,   .L_M0
+
+    andi      I,   M,   8
+    beq       ZERO,I,   .L_M4
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x05
+    PTR_ADD    A0,    A0,   T0 /* A0 += 8 * OFF */
+    PTR_SLLI   T0,    OFF,  0x05
+    PTR_ADD    B0,    B,    T0 /* B0 = B + 8 * OFF */
+#endif
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  8
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  8
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif  // #if defined(TRMMKERNEL)
+    KERNEL1xMx8_START 8, 0x20
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_M8_L7 */
+    beq       ZERO,TL, .L_M8_L7
+.align 5
+.L_M8_TL1:
+    KERNEL8xMx8 8, 0x20
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_M8_TL1
+.L_M8_L7:
+    /* if (!(L & 7)) goto L_M8_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_M8_L0
+.align 5
+.L_M8_L71:
+    KERNEL1xMx8 8, 0x20
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_M8_L71
+.L_M8_L0:
+    SAVEMx8 8, 0x20
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    /* number of values in A */
+    PTR_ADDI    L,   L,   -8
+#else
+    /* number of values in B */
+    PTR_ADDI    L,   L,   -8
+#endif
+    PTR_SLLI    T0,  L,  0x05
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x05
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    /* number of values in A */
+    PTR_ADDI    OFF,   OFF,  0x08
+#endif
+#endif   // #if defined(TRMMKERNEL)
+.L_M4:
+    andi      I,   M,   4
+    beq       ZERO,I,   .L_M2
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x04
+    PTR_ADD    A0,    A0,   T0 /* A0 += 4 * OFF */
+    PTR_SLLI   T0,    OFF,  0x05
+    PTR_ADD    B0,    B,    T0 /* B0 = B + 8 * OFF */
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  4
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  8
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1xMx8_START 4, 0x10
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_M4_L7 */
+    beq       ZERO,TL, .L_M4_L7
+.align 5
+.L_M4_TL1:
+    KERNEL8xMx8 4, 0x10
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_M4_TL1
+.L_M4_L7:
+    /* if (!(L & 7)) goto L_M4_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_M4_L0
+.L_M4_L71:
+    KERNEL1xMx8 4, 0x10
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_M4_L71
+.L_M4_L0:
+    SAVEMx8 4, 0x10
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    /* number of values in A */
+    PTR_ADDI    L,   L,   -4
+#else
+    /* number of values in B */
+    PTR_ADDI    L,   L,   -8
+#endif
+    PTR_SLLI    T0,  L,  0x04
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x05
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    /* number of values in A */
+    PTR_ADDI    OFF,   OFF,  0x04
+#endif
+#endif   // #if defined(TRMMKERNEL)
+.L_M2:
+    andi      I,   M,   2
+    beq       ZERO,I,   .L_M1
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x03
+    PTR_ADD    A0,    A0,   T0
+    PTR_SLLI   T0,    OFF,  0x05
+    PTR_ADD    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  2
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  8
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1xMx8_START 2, 0x08
+
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_M2_L7 */
+    beq       ZERO,TL, .L_M2_L7
+.align 5
+.L_M2_TL1:
+    KERNEL8xMx8 2, 0x08
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_M2_TL1
+.L_M2_L7:
+    /* if (!(L & 7)) goto L_M2_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_M2_L0
+.align 5
+.L_M2_L71:
+    KERNEL1xMx8 2, 0x08
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_M2_L71
+.L_M2_L0:
+    SAVEMx8 2, 0x08
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    /* number of values in A */
+    PTR_ADDI    L,   L,   -2
+#else
+    /* number of values in B */
+    PTR_ADDI    L,   L,   -8
+#endif
+    PTR_SLLI    T0,  L,  0x03
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x05
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    /* number of values in A */
+    PTR_ADDI    OFF,   OFF,  0x02
+#endif
+#endif   // #if defined(TRMMKERNEL)
+.L_M1:
+    andi      I,   M,   1
+    beq       ZERO,I,   .L_M0
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x02
+    PTR_ADD    A0,    A0,   T0
+    PTR_SLLI   T0,    OFF,  0x05
+    PTR_ADD    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  1
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  8
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1xMx8_START 1, 0x04
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_M1_L7 */
+    beq       ZERO,TL, .L_M1_L7
+.align 5
+.L_M1_TL1:
+    KERNEL8xMx8 1, 0x04
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_M1_TL1
+.L_M1_L7:
+    /* if (!(L & 7)) goto L_M1_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_M1_L0
+.align 5
+.L_M1_L71:
+    KERNEL1xMx8 1, 0x04
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_M1_L71
+.L_M1_L0:
+    SAVEMx8 1, 0x04
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    /* number of values in A */
+    PTR_ADDI    L,   L,   -1
+#else
+    /* number of values in B */
+    PTR_ADDI    L,   L,   -8
+#endif
+    PTR_SLLI    T0,  L,  0x02
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x05
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    /* number of values in A */
+    PTR_ADDI    OFF,   OFF,  0x01
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L_M0:
+    /* Add stride for B and C
+     * B += (K * 32)
+     * C += (LDC * 32)
+     */
+    PTR_SLLI    T0,   K,   5
+    PTR_SLLI    T1,   LDC, 5
+    PTR_ADD     B,    B,   T0
+    PTR_ADD     C,    C,   T1
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    PTR_ADDI    OFF,  OFF, 0x08 /* number of values in B */
+#endif
+    blt      ZERO,  J,   .L_N8
+
+.L_N7:
+    andi     J,    N,   4
+    beq      ZERO, J,   .L_N3
+.L_N4:
+    move     C0,    C
+    move     A0,    A
+    PTR_SLLI   T0,    LDC,   2
+#if __loongarch_grlen == 64
+    GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0
+#elif __loongarch_grlen == 32
+    GADD , w, C1, C0, T0, C2, C1, T0, C3, C2, T0
+#else
+    GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move     OFF,   OFFSET
+#endif
+
+    /* if (!(M >> 4)) goto L_N4_M8 */
+    PTR_SRAI   I,     M,     4     /* I = bm >> 4 */
+    beq      ZERO,  I,     .L_N4_M8
+.align 5
+.L_N4_M16:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x06
+    PTR_ADD    A0,    A0,   T0 /* A0 += 16 * OFF */
+    PTR_SLLI   T0,    OFF,  0x04
+    PTR_ADD    B0,    B,    T0 /* B0 += 4 * OFF */
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  16
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  4
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1x16x4_START
+
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N4_L7 */
+    beq       ZERO,TL, .L_N4_M16_L7
+.align 5
+.L_N4_M16_TL1: /* TL-- */
+    KERNEL8x16x4
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N4_M16_TL1
+.L_N4_M16_L7:
+    /* if (!(L & 7)) goto L_N4_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N4_M16_L0
+.align 5
+.L_N4_M16_L71:
+    KERNEL1x16x4
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_N4_M16_L71
+.L_N4_M16_L0:
+    SAVE16x4
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    PTR_ADDI    L,   L,   -16
+#else
+    PTR_ADDI    L,   L,   -4
+#endif
+    PTR_SLLI    T0,  L,  0x06
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x04
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    PTR_ADDI    OFF,   OFF,  0x10
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    PTR_ADDI    I,   I,   -1  /* I-- */
+    blt       ZERO,I,   .L_N4_M16
+.L_N4_M8:
+    /* All full 16-row blocks (M >> 4) are done; handle the M & 15 remainder in pieces of 8/4/2/1 */
+    andi      I,   M,   15
+    beq       ZERO,I,   .L_N4_M0
+
+    andi      I,   M,   8
+    beq       ZERO,I,   .L_N4_M4
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x05
+    PTR_ADD    A0,    A0,   T0
+    PTR_SLLI   T0,    OFF,  0x04
+    PTR_ADD    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  8
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  4
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1xMx4_START 8, 0x20
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N4_M8_L7 */
+    beq       ZERO,TL, .L_N4_M8_L7
+.align 5
+.L_N4_M8_TL1: /* TL-- */
+    KERNEL8xMx4 8, 0x20
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N4_M8_TL1
+.L_N4_M8_L7:
+    /* if (!(L & 7)) goto L_N4_M8_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N4_M8_L0
+.align 5
+.L_N4_M8_L71:
+    KERNEL1xMx4 8, 0x20
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_N4_M8_L71
+.L_N4_M8_L0:
+    SAVEMx4 8, 0x20
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    PTR_ADDI    L,   L,   -8
+#else
+    PTR_ADDI    L,   L,   -4
+#endif
+    PTR_SLLI    T0,  L,  0x05
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x04
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    PTR_ADDI    OFF,   OFF,  0x08
+#endif
+#endif   // #if defined(TRMMKERNEL)
+.L_N4_M4:
+    andi      I,   M,   4
+    beq       ZERO,I,   .L_N4_M2
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x04
+    PTR_ADD    A0,    A0,   T0
+    PTR_SLLI   T0,    OFF,  0x04
+    PTR_ADD    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  4
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  4
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1xMx4_START 4, 0x10
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N4_M4_L7 */
+    beq       ZERO,TL, .L_N4_M4_L7
+.align 5
+.L_N4_M4_TL1: /* TL-- */
+    KERNEL8xMx4 4, 0x10
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N4_M4_TL1
+.L_N4_M4_L7:
+    /* if (!(L & 7)) goto L_N4_M4_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N4_M4_L0
+.align 5
+.L_N4_M4_L71:
+    KERNEL1xMx4 4, 0x10
+
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_N4_M4_L71
+.L_N4_M4_L0:
+    SAVEMx4 4, 0x10
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    PTR_ADDI    L,   L,   -4
+#else
+    PTR_ADDI    L,   L,   -4
+#endif
+    PTR_SLLI    T0,  L,  0x04
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x04
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    PTR_ADDI    OFF,   OFF,  0x04
+#endif
+#endif   // #if defined(TRMMKERNEL)
+.L_N4_M2:
+    andi      I,   M,   2
+    beq       ZERO,I,   .L_N4_M1
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x03
+    PTR_ADD    A0,    A0,   T0
+    PTR_SLLI   T0,    OFF,  0x04
+    PTR_ADD    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  2
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  4
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1xMx4_START 2, 0x08
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N4_M2_L7 */
+    beq       ZERO,TL, .L_N4_M2_L7
+.align 5
+.L_N4_M2_TL1: /* TL-- */
+    KERNEL8xMx4 2, 0x08
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N4_M2_TL1
+.L_N4_M2_L7:
+    /* if (!(L & 7)) goto L_N4_M2_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N4_M2_L0
+.align 5
+.L_N4_M2_L71:
+    KERNEL1xMx4 2, 0x08
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_N4_M2_L71
+.L_N4_M2_L0:
+    SAVEMx4 2, 0x08
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    PTR_ADDI    L,   L,   -2
+#else
+    PTR_ADDI    L,   L,   -4
+#endif
+    PTR_SLLI    T0,  L,  0x03
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x04
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    PTR_ADDI    OFF,   OFF,  0x02
+#endif
+#endif   // #if defined(TRMMKERNEL)
+.L_N4_M1:
+    andi      I,   M,   1
+    beq       ZERO,I,   .L_N4_M0
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x02
+    PTR_ADD    A0,    A0,   T0
+    PTR_SLLI   T0,    OFF,  0x04
+    PTR_ADD    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  1
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  4
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1xMx4_START 1, 0x04
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N4_M1_L7 */
+    beq       ZERO,TL, .L_N4_M1_L7
+.align 5
+.L_N4_M1_TL1: /* TL-- */
+    KERNEL8xMx4 1, 0x04
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N4_M1_TL1
+.L_N4_M1_L7:
+    /* if (!(L & 7)) goto L_N4_M1_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N4_M1_L0
+.align 5
+.L_N4_M1_L71:
+    KERNEL1xMx4 1, 0x04
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_N4_M1_L71
+.L_N4_M1_L0:
+    SAVEMx4 1, 0x04
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    PTR_ADDI    L,   L,   -1
+#else
+    PTR_ADDI    L,   L,   -4
+#endif
+    PTR_SLLI    T0,  L,  0x02
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x04
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    PTR_ADDI    OFF,   OFF,  0x01
+#endif
+#endif   // #if defined(TRMMKERNEL)
+.L_N4_M0:
+    /* Add stride for B and C
+     * B += 4 * K
+     * C += 4 * LDC
+     */
+    PTR_SLLI    T0,   K,   4
+    PTR_SLLI    T1,   LDC, 4
+    PTR_ADD     B,    B,   T0
+    PTR_ADD     C,    C,   T1
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    PTR_ADDI    OFF,  OFF, 0x04
+#endif
+    /* We must reinit I */
+    PTR_SRAI   I,     M,   4     /* I = bm >> 4 */
+.L_N3:
+    andi     J,    N,   2
+    beq      ZERO, J,   .L_N1
+
+.L_N2:
+    move     C0,    C
+    move     A0,    A
+    PTR_SLLI   T0,    LDC,   2
+    PTR_ADD    C1,    C0,    T0
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move     OFF,   OFFSET
+#endif
+
+    /* if (!(M >> 4)) goto L_N2_M8 */
+    PTR_SRAI   I,     M,     4     /* I = bm >> 4 */
+    beq      ZERO,  I,     .L_N2_M8
+.align 5
+.L_N2_M16:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x06
+    PTR_ADD    A0,    A0,   T0
+    PTR_SLLI   T0,    OFF,  0x03
+    PTR_ADD    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  16
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  2
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1x16x2_START
+
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N2_M16_L7 */
+    beq       ZERO,TL, .L_N2_M16_L7
+.align 5
+.L_N2_M16_TL1: /* TL-- */
+    KERNEL8x16x2
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N2_M16_TL1
+.L_N2_M16_L7:
+    /* if (!(L & 7)) goto L_N2_M16_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N2_M16_L0
+.align 5
+.L_N2_M16_L71:
+    KERNEL1x16x2
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_N2_M16_L71
+.L_N2_M16_L0:
+    SAVE16x2
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    PTR_ADDI    L,   L,   -16
+#else
+    PTR_ADDI    L,   L,   -2
+#endif
+    PTR_SLLI    T0,  L,  0x06
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x03
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    PTR_ADDI    OFF,   OFF,  0x10
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    PTR_ADDI    I,   I,   -1  /* I-- */
+    blt       ZERO,I,   .L_N2_M16
+.L_N2_M8:
+    /* We have done M & 16, considering M=8/4/2/1 */
+    andi      I,   M,   15
+    beq       ZERO,I,   .L_N2_M0
+
+    andi      I,   M,   8
+    beq       ZERO,I,   .L_N2_M4
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x05
+    PTR_ADD    A0,    A0,   T0
+    PTR_SLLI   T0,    OFF,  0x03
+    PTR_ADD    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  8
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  2
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1xMx2_START 8, 0x20
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N2_M8_L7 */
+    beq       ZERO,TL, .L_N2_M8_L7
+.align 5
+.L_N2_M8_TL1: /* TL-- */
+    KERNEL8xMx2 8, 0x20
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N2_M8_TL1
+.L_N2_M8_L7:
+    /* if (!(L & 7)) goto L_N2_M8_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N2_M8_L0
+.align 5
+.L_N2_M8_L71:
+    KERNEL1xMx2 8, 0x20
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_N2_M8_L71
+.L_N2_M8_L0:
+    SAVEMx2 8, 0x20
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    PTR_ADDI    L,   L,   -8
+#else
+    PTR_ADDI    L,   L,   -2
+#endif
+    PTR_SLLI    T0,  L,  0x05
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x03
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    PTR_ADDI    OFF,   OFF,  0x08
+#endif
+#endif   // #if defined(TRMMKERNEL)
+.L_N2_M4:
+    andi      I,   M,   4
+    beq       ZERO,I,   .L_N2_M2
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x04
+    PTR_ADD    A0,    A0,   T0
+    PTR_SLLI   T0,    OFF,  0x03
+    PTR_ADD    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  4
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  2
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1xMx2_START 4, 0x10
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N2_M4_L7 */
+    beq       ZERO,TL, .L_N2_M4_L7
+.align 5
+.L_N2_M4_TL1: /* TL-- */
+    KERNEL8xMx2 4, 0x10
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N2_M4_TL1
+.L_N2_M4_L7:
+    /* if (!(L & 7)) goto L_N2_M4_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N2_M4_L0
+.align 5
+.L_N2_M4_L71:
+    KERNEL1xMx2 4, 0x10
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_N2_M4_L71
+.L_N2_M4_L0:
+    SAVEMx2 4, 0x10
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    PTR_ADDI    L,   L,   -4
+#else
+    PTR_ADDI    L,   L,   -2
+#endif
+    PTR_SLLI    T0,  L,  0x04
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x03
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    PTR_ADDI    OFF,   OFF,  0x04
+#endif
+#endif   // #if defined(TRMMKERNEL)
+.L_N2_M2:
+    andi      I,   M,   2
+    beq       ZERO,I,   .L_N2_M1
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x03
+    PTR_ADD    A0,    A0,   T0
+    PTR_ADD    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  2
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  2
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1xMx2_START 2, 0x08
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N2_M2_L7 */
+    beq       ZERO,TL, .L_N2_M2_L7
+.align 5
+.L_N2_M2_TL1: /* TL-- */
+    KERNEL8xMx2 2, 0x08
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N2_M2_TL1
+.L_N2_M2_L7:
+    /* if (!(L & 7)) goto L_N2_M2_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N2_M2_L0
+.align 5
+.L_N2_M2_L71:
+    KERNEL1xMx2 2, 0x08
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_N2_M2_L71
+.L_N2_M2_L0:
+    SAVEMx2 2, 0x08
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    PTR_ADDI    L,   L,   -2
+#else
+    PTR_ADDI    L,   L,   -2
+#endif
+    PTR_SLLI    T0,  L,  0x03
+    PTR_ADD     A0,  A0, T0
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    PTR_ADDI    OFF,   OFF,  0x02
+#endif
+#endif   // #if defined(TRMMKERNEL)
+.L_N2_M1:
+    andi      I,   M,   1
+    beq       ZERO,I,   .L_N2_M0
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x02
+    PTR_ADD    A0,    A0,   T0
+    PTR_SLLI   T0,    OFF,  0x03
+    PTR_ADD    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  1
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  2
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1xMx2_START 1, 0x04
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N2_M1_L7 */
+    beq       ZERO,TL, .L_N2_M1_L7
+.align 5
+.L_N2_M1_TL1: /* TL-- */
+    KERNEL8xMx2 1, 0x04
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N2_M1_TL1
+.L_N2_M1_L7:
+    /* if (!(L & 7)) goto L_N2_M1_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N2_M1_L0
+.align 5
+.L_N2_M1_L71:
+    KERNEL1xMx2 1, 0x04
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_N2_M1_L71
+.L_N2_M1_L0:
+    SAVEMx2 1, 0x04
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    PTR_ADDI    L,   L,   -1
+#else
+    PTR_ADDI    L,   L,   -2
+#endif
+    PTR_SLLI    T0,  L,  0x02
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x03
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    PTR_ADDI    OFF,   OFF,  0x01
+#endif
+#endif   // #if defined(TRMMKERNEL)
+.L_N2_M0:
+    /* Add stride for B and C
+     * B += 2 * K
+     * C += 2 * LDC
+     */
+    PTR_SLLI    T0,   K,   3
+    PTR_SLLI    T1,   LDC, 3
+    PTR_ADD     B,    B,   T0
+    PTR_ADD     C,    C,   T1
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    PTR_ADDI    OFF,  OFF, 0x02
+#endif
+    /* We must reinit I */
+    PTR_SRAI   I,     M,   4     /* I = bm >> 4 */
+.L_N1:
+    andi     J,    N,   1
+    beq      ZERO, J,   .L_N0
+    move     C0,    C
+    move     A0,    A
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move     OFF,   OFFSET
+#endif
+    /* if (!(M >> 4)) goto L_N1_M8 */
+    PTR_SRAI   I,     M,     4     /* I = bm >> 4 */
+    beq      ZERO,  I,     .L_N1_M8
+.L_N1_M16:
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x06
+    PTR_ADD    A0,    A0,   T0
+    PTR_SLLI   T0,    OFF,  0x02
+    PTR_ADD    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  16
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  1
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1x16x1_START
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N1_M16_L7 */
+    beq       ZERO,TL, .L_N1_M16_L7
+.align 5
+.L_N1_M16_TL1: /* TL-- */
+    KERNEL8x16x1
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N1_M16_TL1
+.L_N1_M16_L7:
+    /* if (!(L & 7)) goto L_N1_M16_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N1_M16_L0
+.align 5
+.L_N1_M16_L71:
+    KERNEL1x16x1
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_N1_M16_L71
+.L_N1_M16_L0:
+    SAVE16x1
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    PTR_ADDI    L,   L,   -16
+#else
+    PTR_ADDI    L,   L,   -1
+#endif
+    PTR_SLLI    T0,  L,  0x06
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x02
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    PTR_ADDI    OFF,   OFF,  0x10
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    PTR_ADDI    I,   I,   -1  /* I-- */
+    blt       ZERO,I,   .L_N1_M16
+.L_N1_M8:
+    /* We have done M & 16, considering M=8/4/2/1 */
+    andi      I,   M,   15
+    beq       ZERO,I,   .L_N1_M0
+
+    andi      I,   M,   8
+    beq       ZERO,I,   .L_N1_M4
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x05
+    PTR_ADD    A0,    A0,   T0
+    PTR_SLLI   T0,    OFF,  0x02
+    PTR_ADD    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  8
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  1
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1xMx1_START 8, 0x20
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N1_M8_L7 */
+    beq       ZERO,TL, .L_N1_M8_L7
+.align 5
+.L_N1_M8_TL1: /* TL-- */
+    KERNEL8xMx1 8, 0x20
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N1_M8_TL1
+.L_N1_M8_L7:
+    /* if (!(L & 7)) goto L_N1_M8_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N1_M8_L0
+.align 5
+.L_N1_M8_L71:
+    KERNEL1xMx1 8, 0x20
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_N1_M8_L71
+.L_N1_M8_L0:
+    SAVEMx1 8, 0x20
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    PTR_ADDI    L,   L,   -8
+#else
+    PTR_ADDI    L,   L,   -1
+#endif
+    PTR_SLLI    T0,  L,  0x05
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x02
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    PTR_ADDI    OFF,   OFF,  0x08
+#endif
+#endif   // #if defined(TRMMKERNEL)
+.L_N1_M4:
+    andi      I,   M,   4
+    beq       ZERO,I,   .L_N1_M2
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x04
+    PTR_ADD    A0,    A0,   T0
+    PTR_SLLI   T0,    OFF,  0x02
+    PTR_ADD    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  4
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  1
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1xMx1_START 4, 0x10
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N1_M4_L7 */
+    beq       ZERO,TL, .L_N1_M4_L7
+.align 5
+.L_N1_M4_TL1: /* TL-- */
+    KERNEL8xMx1 4, 0x10
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N1_M4_TL1
+.L_N1_M4_L7:
+    /* if (!(L & 7)) goto L_N1_M4_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N1_M4_L0
+.align 5
+.L_N1_M4_L71:
+    KERNEL1xMx1 4, 0x10
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_N1_M4_L71
+.L_N1_M4_L0:
+    SAVEMx1 4, 0x10
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    PTR_ADDI    L,   L,   -4
+#else
+    PTR_ADDI    L,   L,   -1
+#endif
+    PTR_SLLI    T0,  L,  0x04
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x02
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    PTR_ADDI    OFF,   OFF,  0x04
+#endif
+#endif   // #if defined(TRMMKERNEL)
+.L_N1_M2:
+    andi      I,   M,   2
+    beq       ZERO,I,   .L_N1_M1
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x03
+    PTR_ADD    A0,    A0,   T0
+    PTR_SLLI   T0,    OFF,  0x02
+    PTR_ADD    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  2
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  1
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1xMx1_START 2, 0x08
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N1_M2_L7 */
+    beq       ZERO,TL, .L_N1_M2_L7
+.align 5
+.L_N1_M2_TL1: /* TL-- */
+    KERNEL8xMx1 2, 0x08
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N1_M2_TL1
+.L_N1_M2_L7:
+    /* if (!(L & 7)) goto L_N1_M2_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N1_M2_L0
+.align 5
+.L_N1_M2_L71:
+    KERNEL1xMx1 2, 0x08
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_N1_M2_L71
+.L_N1_M2_L0:
+    SAVEMx1 2, 0x08
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    PTR_ADDI    L,   L,   -2
+#else
+    PTR_ADDI    L,   L,   -1
+#endif
+    PTR_SLLI    T0,  L,  0x03
+    PTR_ADD     A0,  A0, T0
+    PTR_SLLI    T0,  L,  0x02
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    PTR_ADDI    OFF,   OFF,  0x02
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L_N1_M1:
+    andi      I,   M,   1
+    beq       ZERO,I,   .L_N1_M0
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move     B0,    B
+#else
+    PTR_SLLI   T0,    OFF,  0x02
+    PTR_ADD    A0,    A0,   T0
+    PTR_ADD    B0,    B,    T0
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    PTR_SUB    L,     K,    OFF
+#elif defined(LEFT)
+    /* number of values in A */
+    PTR_ADDI   L,     OFF,  1
+#else
+    /* number of values in B */
+    PTR_ADDI   L,     OFF,  1
+#endif
+#else   // #if !defined(TRMMKERNEL)
+    move     B0,    B
+    move     L,     K /* L = bk */
+#endif
+    KERNEL1xMx1_START 1, 0x04
+    /* Reduce L */
+    PTR_ADDI    L,   L,  -1
+    PTR_SRAI    TL,  L,  3  /* TL = (L-1) >> 3 */
+    /* if (TL < 1) goto L_N1_M1_L7 */
+    beq       ZERO,TL, .L_N1_M1_L7
+.align 5
+.L_N1_M1_TL1: /* TL-- */
+    KERNEL8xMx1 1, 0x04
+
+    PTR_ADDI    TL,  TL, -1 /* TL-- */
+    blt       ZERO,TL, .L_N1_M1_TL1
+.L_N1_M1_L7:
+    /* if (!(L & 7)) goto L_N1_M1_L0 */
+    andi      TL,  L,   7
+    beq       TL,  ZERO,.L_N1_M1_L0
+.align 5
+.L_N1_M1_L71:
+    KERNEL1xMx1 1, 0x04
+    PTR_ADDI     TL,  TL, -1
+    blt        ZERO,TL, .L_N1_M1_L71
+.L_N1_M1_L0:
+    SAVEMx1 1, 0x04
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    PTR_SUB     L,   K,   OFF
+#ifdef LEFT
+    PTR_ADDI    L,   L,   -1
+#else
+    PTR_ADDI    L,   L,   -1
+#endif
+    PTR_SLLI    T0,  L,  0x02
+    PTR_ADD     A0,  A0, T0
+    PTR_ADD     B0,  B0, T0
+#endif
+
+#ifdef LEFT
+    PTR_ADDI    OFF,   OFF,  0x01
+#endif
+#endif   // #if defined(TRMMKERNEL)
+.L_N1_M0:
+.L_N0:
+    pop_if_used 26, 32
+    jirl    $r0, $r1, 0x0
+    EPILOGUE
diff --git a/kernel/loongarch64/sgemm_ncopy_16_lasx.S b/kernel/loongarch64/sgemm_ncopy_16_lasx.S
new file mode 100644
index 000000000..266c07c5c
--- /dev/null
+++ b/kernel/loongarch64/sgemm_ncopy_16_lasx.S
@@ -0,0 +1,463 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2023/08/23 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*********************************************************************/
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+/* Loop counters: I is derived from M (inner loops), J from N (outer passes) */
+#define I      $r9
+#define J      $r10
+/* S1..S16: one read pointer per source column handled in the current pass */
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define S8     $r19
+#define S9     $r20
+#define S10    $r23
+#define S11    $r24
+#define S12    $r25
+#define S13    $r26
+#define S14    $r27
+#define S15    $r28
+#define S16    $r29
+/* TD: running destination pointer; TS: first source column not yet consumed */
+#define TD     $r30
+#define TS     $r31
+/* NOTE: TL and T0 reuse the argument registers of LDA ($r7) and SRC ($r6).
+ * Once TL / T0 have been written, LDA and SRC no longer hold the incoming
+ * arguments, so SRC must be copied away before T0 is first assigned. */
+#define TL     $r7
+#define T0     $r6
+#undef  ZERO
+#define ZERO   $r0
+
+/* Scalar FP temporaries used by the fld.s / fst.s remainder loops */
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+/* LASX vectors */
+/* U0..U15 receive the xvld loads (U0..U3 are also passed as scratch to the
+ * second transpose); D0..D15 receive the transposed rows that are stored. */
+#define U0     $xr0
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define U8     $xr8
+#define U9     $xr9
+#define U10    $xr10
+#define U11    $xr11
+#define U12    $xr12
+#define U13    $xr13
+#define U14    $xr14
+#define U15    $xr15
+#define D0     $xr16
+#define D1     $xr17
+#define D2     $xr18
+#define D3     $xr19
+#define D4     $xr20
+#define D5     $xr21
+#define D6     $xr22
+#define D7     $xr23
+#define D8     $xr24
+#define D9     $xr25
+#define D10    $xr26
+#define D11    $xr27
+#define D12    $xr28
+#define D13    $xr29
+#define D14    $xr30
+#define D15    $xr31
+// Loops outline
+//.L_N16 <-------------------
+//|   .L_M8:                |
+//|   .L_M7:                | Main Loop
+//|   .L_M1:                |
+//|   .L_M0: ---------------
+//.L_N15:
+//.L_N8:
+//|   .L_N8_M8:
+//|   .L_N8_M7:
+//|   .L_N8_M1:
+//.L_N7:
+//.L_N4:
+//|   .L_N4_M4:
+//|   .L_N4_M3:
+//|   .L_N4_M1:
+//.L_N3:
+//.L_N2:
+//|   .L_N2_M2:
+//|   .L_N2_M1:
+//.L_N1:
+//|   .L_N1_M1:
+//.L_N0
+
+    /* Entry: set up the running pointers and the column strides, then
+     * dispatch to the 16-column main loop or straight to the remainders. */
+    PROLOGUE
+    push_if_used 26, 32
+
+    move       TD,   DST
+    /* SRC must be saved to TS before T0 is written: T0 shares SRC's register */
+    move       TS,   SRC
+    /* TL = LDA << 2: column stride in bytes (elements are 4-byte floats).
+     * TL shares LDA's register, so LDA is overwritten in place. */
+    PTR_SLLI   TL,   LDA,  0x02
+    /* T0 = 2 * TL: byte stride of two columns */
+    PTR_SLLI   T0,   TL,   0x01
+    /* J = N >> 4: number of full 16-column passes; skip the main loop if 0 */
+    PTR_SRAI   J,    N,    0x04
+    beq        J,    ZERO, .L_N15
+.align 5
+.L_N16:
+    /* One pass over 16 source columns.
+     * S(k) = TS + (k-1) * TL for k = 1..16 (built from TL and T0 = 2*TL),
+     * then TS is advanced by 16 * TL to the next group of columns.
+     * I = M >> 3 is the number of 8-row vector iterations; J is decremented
+     * here and tested at .L_M0. */
+    move         S1,   TS
+    PTR_ADD      S2,   TS,   TL
+    PTR_SRAI     I,    M,    0x03
+    PTR_ADD      S3,   S2,   TL
+    PTR_ADDI     J,    J,    -1
+    PTR_ADD      S4,   S3,   TL
+    PTR_ADD      S5,   S3,   T0
+    PTR_ADD      S6,   S4,   T0
+    PTR_ADD      S7,   S5,   T0
+    PTR_ADD      S8,   S6,   T0
+    PTR_ADD      S9,   S7,   T0
+    PTR_ADD      S10,  S8,   T0
+    PTR_ADD      S11,  S9,   T0
+    PTR_ADD      S12,  S10,  T0
+    PTR_ADD      S13,  S11,  T0
+    PTR_ADD      S14,  S12,  T0
+    PTR_ADD      S15,  S13,  T0
+    PTR_ADD      S16,  S14,  T0
+    PTR_ADD      TS,   S15,  T0
+    beq          I,    ZERO, .L_M7
+.align 5
+.L_M8:
+    /* Vector body: load 0x20 bytes (8 floats) from each of the 16 columns */
+    xvld       U0,   S1,   0x00
+    xvld       U1,   S2,   0x00
+    xvld       U2,   S3,   0x00
+    xvld       U3,   S4,   0x00
+    xvld       U4,   S5,   0x00
+    xvld       U5,   S6,   0x00
+    xvld       U6,   S7,   0x00
+    xvld       U7,   S8,   0x00
+    xvld       U8,   S9,   0x00
+    xvld       U9,   S10,  0x00
+    xvld       U10,  S11,  0x00
+    xvld       U11,  S12,  0x00
+    xvld       U12,  S13,  0x00
+    xvld       U13,  S14,  0x00
+    xvld       U14,  S15,  0x00
+    xvld       U15,  S16,  0x00
+
+    /* Two 8x8 transposes (macro from loongarch64_asm.S; presumably a 32-bit
+     * word transpose -- confirm there). Columns 1..8 land in the even D
+     * registers and columns 9..16 in the odd ones, so storing D0, D1, D2, ...
+     * in order interleaves the two halves in the output. */
+    GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \
+                    U0, U1, U2, U3, U4, U5, U6, U7, \
+                    D1, D3, D5, D7 // As tmp
+    GTRANSPOSE8x8_W D1, D3, D5, D7, D9, D11, D13, D15, \
+                    U8, U9, U10, U11, U12, U13, U14, U15, \
+                    U0, U1, U2, U3 // As tmp
+    /* Store 2 x 0x100 bytes; TD advances by 0x200 per iteration in total */
+    GST xv, , D0, TD, 0x00, D1, TD, 0x20, D2, TD, 0x40, D3, TD, 0x60, \
+              D4, TD, 0x80, D5, TD, 0xA0, D6, TD, 0xC0, D7, TD, 0xE0
+    PTR_ADDI  TD, TD, 0x100
+    GST xv, , D8,  TD, 0x00, D9,  TD, 0x20, D10, TD, 0x40, D11, TD, 0x60, \
+              D12, TD, 0x80, D13, TD, 0xA0, D14, TD, 0xC0, D15, TD, 0xE0
+    PTR_ADDI  TD, TD, 0x100
+    /* Advance every column pointer past the 0x20 bytes just consumed */
+    PTR_ADDI     S1,   S1,   0x20
+    PTR_ADDI     S2,   S2,   0x20
+    PTR_ADDI     S3,   S3,   0x20
+    PTR_ADDI     S4,   S4,   0x20
+    PTR_ADDI     S5,   S5,   0x20
+    PTR_ADDI     S6,   S6,   0x20
+    PTR_ADDI     S7,   S7,   0x20
+    PTR_ADDI     S8,   S8,   0x20
+    PTR_ADDI     S9,   S9,   0x20
+    PTR_ADDI     S10,  S10,  0x20
+    PTR_ADDI     S11,  S11,  0x20
+    PTR_ADDI     S12,  S12,  0x20
+    PTR_ADDI     S13,  S13,  0x20
+    PTR_ADDI     S14,  S14,  0x20
+    PTR_ADDI     S15,  S15,  0x20
+    PTR_ADDI     S16,  S16,  0x20
+
+    PTR_ADDI     I,    I,    -1
+    blt          ZERO, I,    .L_M8
+.L_M7:
+    /* Remainder along M: I = M & 7 leftover rows, handled one at a time */
+    andi      I,     M,    0x07
+    beq       I,     ZERO, .L_M0
+.align 5
+.L_M1:
+    /* One row: copy one float from each of columns 1..8 ... */
+    fld.s     F0,    S1,  0x00
+    fld.s     F1,    S2,  0x00
+    fld.s     F2,    S3,  0x00
+    fld.s     F3,    S4,  0x00
+    fld.s     F4,    S5,  0x00
+    fld.s     F5,    S6,  0x00
+    fld.s     F6,    S7,  0x00
+    fld.s     F7,    S8,  0x00
+
+    fst.s     F0,    TD,  0x00
+    fst.s     F1,    TD,  0x04
+    fst.s     F2,    TD,  0x08
+    fst.s     F3,    TD,  0x0C
+    fst.s     F4,    TD,  0x10
+    fst.s     F5,    TD,  0x14
+    fst.s     F6,    TD,  0x18
+    fst.s     F7,    TD,  0x1C
+
+    PTR_ADDI    S1,    S1,  0x04
+    PTR_ADDI    S2,    S2,  0x04
+    PTR_ADDI    S3,    S3,  0x04
+    PTR_ADDI    S4,    S4,  0x04
+    PTR_ADDI    S5,    S5,  0x04
+    PTR_ADDI    S6,    S6,  0x04
+    PTR_ADDI    S7,    S7,  0x04
+    PTR_ADDI    S8,    S8,  0x04
+    PTR_ADDI    TD,    TD,  0x20
+
+    /* ... then one float from each of columns 9..16 (F0..F7 are reused) */
+    fld.s     F0,    S9,  0x00
+    fld.s     F1,    S10, 0x00
+    fld.s     F2,    S11, 0x00
+    fld.s     F3,    S12, 0x00
+    fld.s     F4,    S13, 0x00
+    fld.s     F5,    S14, 0x00
+    fld.s     F6,    S15, 0x00
+    fld.s     F7,    S16, 0x00
+
+    fst.s     F0,    TD,  0x00
+    fst.s     F1,    TD,  0x04
+    fst.s     F2,    TD,  0x08
+    fst.s     F3,    TD,  0x0C
+    fst.s     F4,    TD,  0x10
+    fst.s     F5,    TD,  0x14
+    fst.s     F6,    TD,  0x18
+    fst.s     F7,    TD,  0x1C
+
+    PTR_ADDI    S9,    S9,  0x04
+    PTR_ADDI    S10,   S10, 0x04
+    PTR_ADDI    S11,   S11, 0x04
+    PTR_ADDI    S12,   S12, 0x04
+    PTR_ADDI    S13,   S13, 0x04
+    PTR_ADDI    S14,   S14, 0x04
+    PTR_ADDI    S15,   S15, 0x04
+    PTR_ADDI    S16,   S16, 0x04
+    PTR_ADDI    TD,    TD,  0x20
+
+    PTR_ADDI    I,     I,   -1
+    blt         ZERO,  I,   .L_M1
+.L_M0:
+    /* Repeat while 16-column passes remain (J was decremented at .L_N16) */
+    blt       ZERO,  J,   .L_N16
+.L_N15:
+    /* Remainder along N: nothing left if N & 15 == 0 */
+    andi      J,     N,   0x0f
+    beq       ZERO,  J,   .L_N0
+
+    /* Skip the 8-column pass unless bit 3 of N is set */
+    andi      J,     N,   0x08
+    beq       ZERO,  J,   .L_N7
+.L_N8:
+    /* One pass over 8 source columns: S(k) = TS + (k-1) * TL for k = 1..8,
+     * then TS += 8 * TL. I = M >> 3 vector iterations. */
+    move         S1,   TS
+    PTR_ADD      S2,   TS,   TL
+    PTR_SRAI     I,    M,    0x03
+    PTR_ADD      S3,   S2,   TL
+    PTR_ADD      S4,   S2,   T0
+    PTR_ADD      S5,   S3,   T0
+    PTR_ADD      S6,   S4,   T0
+    PTR_ADD      S7,   S5,   T0
+    PTR_ADD      S8,   S6,   T0
+    PTR_ADD      TS,   S7,   T0
+    beq          I,    ZERO, .L_N8_M7
+.align 5
+.L_N8_M8:
+    /* Vector body: 0x20 bytes (8 floats) from each of the 8 columns */
+    xvld       U0,   S1,   0x00
+    xvld       U1,   S2,   0x00
+    xvld       U2,   S3,   0x00
+    xvld       U3,   S4,   0x00
+    xvld       U4,   S5,   0x00
+    xvld       U5,   S6,   0x00
+    xvld       U6,   S7,   0x00
+    xvld       U7,   S8,   0x00
+
+    /* Single 8x8 transpose; results are in the even D registers only, so
+     * the store below uses D0, D2, ..., D14 (odd ones are scratch here) */
+    GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \
+                    U0, U1, U2, U3, U4, U5, U6, U7, \
+                    D1, D3, D5, D7 // As tmp
+    GST xv, , D0, TD, 0x00, D2,  TD, 0x20, D4,  TD, 0x40, D6,  TD, 0x60, \
+              D8, TD, 0x80, D10, TD, 0xA0, D12, TD, 0xC0, D14, TD, 0xE0
+    PTR_ADDI  TD, TD, 0x100
+    PTR_ADDI     S1,   S1,   0x20
+    PTR_ADDI     S2,   S2,   0x20
+    PTR_ADDI     S3,   S3,   0x20
+    PTR_ADDI     S4,   S4,   0x20
+    PTR_ADDI     S5,   S5,   0x20
+    PTR_ADDI     S6,   S6,   0x20
+    PTR_ADDI     S7,   S7,   0x20
+    PTR_ADDI     S8,   S8,   0x20
+
+    PTR_ADDI     I,    I,    -1
+    blt          ZERO, I,    .L_N8_M8
+.L_N8_M7:
+    /* Remainder along M: I = M & 7 leftover rows */
+    andi      I,     M,    0x07
+    beq       I,     ZERO, .L_N7
+.align 5
+.L_N8_M1:
+    /* One row: one float from each of the 8 columns, 0x20 bytes written */
+    fld.s     F0,    S1,  0x00
+    fld.s     F1,    S2,  0x00
+    fld.s     F2,    S3,  0x00
+    fld.s     F3,    S4,  0x00
+    fld.s     F4,    S5,  0x00
+    fld.s     F5,    S6,  0x00
+    fld.s     F6,    S7,  0x00
+    fld.s     F7,    S8,  0x00
+
+    fst.s       F0,    TD,  0x00
+    PTR_ADDI    S1,    S1,  0x04
+    fst.s       F1,    TD,  0x04
+    PTR_ADDI    S2,    S2,  0x04
+    fst.s       F2,    TD,  0x08
+    PTR_ADDI    S3,    S3,  0x04
+    fst.s       F3,    TD,  0x0C
+    PTR_ADDI    S4,    S4,  0x04
+    fst.s       F4,    TD,  0x10
+    PTR_ADDI    S5,    S5,  0x04
+    fst.s       F5,    TD,  0x14
+    PTR_ADDI    S6,    S6,  0x04
+    fst.s       F6,    TD,  0x18
+    PTR_ADDI    S7,    S7,  0x04
+    fst.s       F7,    TD,  0x1C
+    PTR_ADDI    S8,    S8,  0x04
+
+    PTR_ADDI    TD,    TD,  0x20
+    PTR_ADDI    I,     I,   -1
+    blt         ZERO,  I,   .L_N8_M1
+.L_N7:
+    /* Nothing left if N & 7 == 0 */
+    andi      J,     N,   0x07
+    beq       ZERO,  J,   .L_N0
+
+    /* Skip the 4-column pass unless bit 2 of N is set */
+    andi      J,     N,   0x04
+    beq       ZERO,  J,   .L_N3
+.L_N4:
+    /* One pass over 4 source columns: S(k) = TS + (k-1) * TL for k = 1..4,
+     * then TS += 4 * TL. I = M >> 2 vector iterations. */
+    move         S1,   TS
+    PTR_ADD      S2,   TS,   TL
+    PTR_SRAI     I,    M,    0x02
+    PTR_ADD      S3,   S2,   TL
+    PTR_ADD      S4,   S2,   T0
+    PTR_ADD      TS,   S3,   T0
+    beq          I,    ZERO, .L_N4_M3
+.align 5
+.L_N4_M4:
+    /* 4x4 block: load 0x10 bytes per column into $vr0..$vr3, rearrange with
+     * four GSBUTTERFLY steps (macro from loongarch64_asm.S; presumably a
+     * word interleave low/high pair -- confirm there), store 0x40 bytes. */
+    GLD v, , $vr0, S1, 0, $vr1, S2, 0, $vr2, S3, 0, $vr3, S4, 0
+    GSBUTTERFLY v, w, $vr4, $vr5, $vr2, $vr0
+    GSBUTTERFLY v, w, $vr6, $vr7, $vr3, $vr1
+    GSBUTTERFLY v, w, $vr0, $vr1, $vr6, $vr4
+    GSBUTTERFLY v, w, $vr2, $vr3, $vr7, $vr5
+    GST v, , $vr0, TD, 0x00, $vr1, TD, 0x10, $vr2, TD, 0x20, $vr3, TD, 0x30
+    PTR_ADDI     S1,   S1,   0x10
+    PTR_ADDI     S2,   S2,   0x10
+    PTR_ADDI     S3,   S3,   0x10
+    PTR_ADDI     S4,   S4,   0x10
+    PTR_ADDI     TD,   TD,   0x40
+    PTR_ADDI     I,    I,    -1
+    blt          ZERO, I,    .L_N4_M4
+.L_N4_M3:
+    /* Remainder along M: I = M & 3 leftover rows */
+    andi      I,     M,    0x03
+    beq       I,     ZERO, .L_N3
+.align 5
+.L_N4_M1:
+    /* One row: one float from each of the 4 columns, 0x10 bytes written */
+    fld.s     F0,    S1,  0x00
+    fld.s     F1,    S2,  0x00
+    fld.s     F2,    S3,  0x00
+    fld.s     F3,    S4,  0x00
+
+    fst.s       F0,    TD,  0x00
+    PTR_ADDI    S1,    S1,  0x04
+    fst.s       F1,    TD,  0x04
+    PTR_ADDI    S2,    S2,  0x04
+    fst.s       F2,    TD,  0x08
+    PTR_ADDI    S3,    S3,  0x04
+    fst.s       F3,    TD,  0x0C
+    PTR_ADDI    S4,    S4,  0x04
+
+    PTR_ADDI    TD,    TD,  0x10
+    PTR_ADDI    I,     I,   -1
+    blt         ZERO,  I,   .L_N4_M1
+.L_N3:
+    /* Nothing left if N & 3 == 0 */
+    andi      J,     N,   0x03
+    beq       ZERO,  J,   .L_N0
+
+    /* Skip the 2-column pass unless bit 1 of N is set */
+    andi      J,     N,   0x02
+    beq       ZERO,  J,   .L_N1
+.L_N2:
+    /* One pass over 2 source columns: S1 = TS, S2 = TS + TL, TS += 2 * TL.
+     * I = M >> 1 two-row iterations. */
+    move         S1,   TS
+    PTR_ADD      S2,   TS,   TL
+    PTR_SRAI     I,    M,    0x01
+    PTR_ADD      TS,   S2,   TL
+    beq          I,    ZERO, .L_N2_M1
+.align 5
+.L_N2_M2:
+    /* Load 8 bytes (2 floats) per column into F0/F1, interleave the words
+     * via $vr0/$vr1 (this relies on $f0/$f1 being the low parts of
+     * $vr0/$vr1 -- NOTE(review): architectural assumption, confirm), and
+     * store 0x10 bytes. */
+    GLD f, d, F0, S1, 0x00, F1, S2, 0x00
+    vilvl.w $vr0, $vr1, $vr0
+    GST v, , $vr0, TD, 0x00
+    PTR_ADDI     S1,   S1,   0x08
+    PTR_ADDI     S2,   S2,   0x08
+    PTR_ADDI     TD,   TD,   0x10
+
+    PTR_ADDI     I,    I,    -1
+    blt          ZERO, I,    .L_N2_M2
+.L_N2_M1:
+    /* Remainder along M: at most one leftover row (M & 1) */
+    andi      I,     M,    0x01
+    beq       I,     ZERO, .L_N1
+
+    fld.s     F0,    S1,  0x00
+    fld.s     F1,    S2,  0x00
+
+    fst.s       F0,    TD,  0x00
+    PTR_ADDI    S1,    S1,  0x04
+    fst.s       F1,    TD,  0x04
+    PTR_ADDI    S2,    S2,  0x04
+    PTR_ADDI    TD,    TD,  0x08
+    /* Execution continues at the label that follows this section */
+.align 5
+.L_N1:
+    /* Final single-column pass, followed by the common exit.
+     *
+     * This label is reached both by an explicit branch (bit 1 of N clear)
+     * and by fall-through / branch from the 2-column pass. In the latter
+     * case N may be even, so bit 0 of N must be tested here: without the
+     * test, a column that does not exist would be copied (M floats read
+     * past the source and written past the packed output).
+     */
+    andi      J,     N,   0x01
+    beq       ZERO,  J,   .L_N0
+    move      S1,    TS
+    beq       ZERO,  M,   .L_N0
+.L_N1_M1:
+    /* Copy the column one float at a time. M itself is used as the
+     * countdown; that is safe because nothing after this loop reads M. */
+    fld.s       F0,    S1,  0x00
+    PTR_ADDI    S1,    S1,  0x04
+    fst.s       F0,    TD,  0x00
+    PTR_ADDI    TD,    TD,  0x04
+    PTR_ADDI    M,     M,   -1
+    blt         ZERO,  M,   .L_N1_M1
+.L_N0:
+    /* Common exit: undo push_if_used from the prologue and return via $r1 */
+    pop_if_used 26, 32
+    jirl    $r0, $r1, 0x0
+    EPILOGUE
diff --git a/kernel/loongarch64/sgemm_ncopy_8_lasx.S b/kernel/loongarch64/sgemm_ncopy_8_lasx.S
new file mode 100644
index 000000000..5c173568b
--- /dev/null
+++ b/kernel/loongarch64/sgemm_ncopy_8_lasx.S
@@ -0,0 +1,298 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2023/08/23 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*********************************************************************/
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9    // inner counter derived from M (M >> k for blocks, M & mask for tails)
+#define J      $r10   // outer counter (N >> 3); also scratch for the N & mask tests
+#define S1     $r12   // S1..S8: read cursors, one per source column (TS + j * TL) of the current group
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define S8     $r19
+#define TD     $r20   // write cursor into the packed destination
+#define TS     $r11   // first column of the next group in the source
+#define TL     $r7    // same register as LDA: holds LDA * 4 (bytes between columns) after the prologue
+#define T0     $r6    // same register as SRC: holds 2 * TL, so SRC must be copied to TS before T0 is written
+#undef  ZERO
+#define ZERO   $r0    // $r0, used as the zero operand of the compare-and-branch instructions
+
+#define F0     $f0    // F0..F7: scalar FP registers used by the one-row-at-a-time tail loops
+#define F1     $f1    // (.L_N2_M2 loads F0/F1 and then operates on $vr0/$vr1, relying on the FP/vector register overlap)
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+/* LASX vectors */
+#define U0     $xr0   // U0..U7: the eight 256-bit inputs of GTRANSPOSE8x8_W (one per column)
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define D0     $xr8   // D0, D2, ..., D14: the eight outputs of GTRANSPOSE8x8_W that get stored
+#define D1     $xr9   // D1, D3, D5, D7: passed to GTRANSPOSE8x8_W as temporaries
+#define D2     $xr10
+#define D3     $xr11
+#define D4     $xr12
+#define D5     $xr13
+#define D6     $xr14
+#define D7     $xr15
+#define D8     $xr16  // note: from here on the D names skip odd numbers while the registers stay consecutive
+#define D10    $xr17
+#define D12    $xr18
+#define D14    $xr19
+
+// Loops outline
+//.L_N8: <----------------
+//|   .L_M8:              |
+//|   .L_M7:              | Main Loop
+//|   .L_M1:              |
+//|   .L_M0:--------------
+//.L_N7:
+//.L_N4:
+//|   .L_N4_M4:
+//|   .L_N4_M3:
+//|   .L_N4_M1:
+//.L_N3:
+//.L_N2:
+//|   .L_N2_M2:
+//|   .L_N2_M1:
+//.L_N1:
+//|   .L_N1_M1:
+//.L_N0
+
+    PROLOGUE                            // sgemm ncopy, 8-wide: pack N columns of M floats (stride LDA) from SRC into DST
+    push_if_used 17, 20                 // macro from loongarch64_asm.S; paired with pop_if_used 17, 20 at .L_N0
+
+    move       TD,   DST                // TD = write cursor into the packed buffer
+    move       TS,   SRC                // copy SRC now: T0 shares $r6 and is overwritten two lines below
+    PTR_SLLI   TL,   LDA,  0x02         // TL = LDA * 4 = bytes between adjacent columns (in place, TL is $r7 = LDA)
+    PTR_SLLI   T0,   TL,   0x01         // T0 = 2 * TL
+    PTR_SRAI   J,    N,    0x03         // J = number of full 8-column groups
+    beq        J,    ZERO, .L_N7        // fewer than 8 columns: only remainders to do
+.align 5
+.L_N8:                                  // ---- outer loop: one group of 8 columns per iteration ----
+    move         S1,   TS               // column 0 of the group
+    PTR_ADD      S2,   TS,   TL         // column 1
+    PTR_SRAI     I,    M,    0x03       // I = number of full 8-row blocks
+    PTR_ADD      S3,   S2,   TL         // column 2
+    PTR_ADDI     J,    J,    -1         // group consumed; J is tested at .L_M0
+    PTR_ADD      S4,   S2,   T0         // column 3
+    PTR_ADD      S5,   S3,   T0         // column 4
+    PTR_ADD      S6,   S4,   T0         // column 5
+    PTR_ADD      S7,   S5,   T0         // column 6
+    PTR_ADD      S8,   S6,   T0         // column 7
+    PTR_ADD      TS,   S7,   T0         // TS = column 8 = first column of the next group
+    beq          I,    ZERO, .L_M7
+.align 5
+.L_M8:                                  // 8 rows x 8 columns per iteration: load, GTRANSPOSE8x8_W, store 256 bytes
+    xvld       U0,   S1,   0x00         // 32 bytes = 8 consecutive floats of each column
+    xvld       U1,   S2,   0x00
+    xvld       U2,   S3,   0x00
+    xvld       U3,   S4,   0x00
+    xvld       U4,   S5,   0x00
+    xvld       U5,   S6,   0x00
+    xvld       U6,   S7,   0x00
+    xvld       U7,   S8,   0x00
+
+    GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \
+                    U0, U1, U2, U3, U4, U5, U6, U7, \
+                    D1, D3, D5, D7 // As tmp
+    GST xv, , D0, TD, 0x00, D2,  TD, 0x20, D4,  TD, 0x40, D6,  TD, 0x60, \
+              D8, TD, 0x80, D10, TD, 0xA0, D12, TD, 0xC0, D14, TD, 0xE0
+    PTR_ADDI  TD, TD, 0x100             // 8 vectors x 32 bytes were written
+    PTR_ADDI     S1,   S1,   0x20       // every column cursor moves down 8 rows
+    PTR_ADDI     S2,   S2,   0x20
+    PTR_ADDI     S3,   S3,   0x20
+    PTR_ADDI     S4,   S4,   0x20
+    PTR_ADDI     S5,   S5,   0x20
+    PTR_ADDI     S6,   S6,   0x20
+    PTR_ADDI     S7,   S7,   0x20
+    PTR_ADDI     S8,   S8,   0x20
+    PTR_ADDI     I,    I,    -1
+    blt          ZERO, I,    .L_M8
+.L_M7:                                  // leftover rows of this group
+    andi      I,     M,    0x07         // I = M % 8
+    beq       I,     ZERO, .L_M0
+.align 5
+.L_M1:                                  // one row per iteration: 8 floats, one from each column, stored contiguously
+    fld.s     F0,    S1,  0x00
+    fld.s     F1,    S2,  0x00
+    fld.s     F2,    S3,  0x00
+    fld.s     F3,    S4,  0x00
+    fld.s     F4,    S5,  0x00
+    fld.s     F5,    S6,  0x00
+    fld.s     F6,    S7,  0x00
+    fld.s     F7,    S8,  0x00
+
+    fst.s       F0,    TD,  0x00        // stores are interleaved with the cursor increments
+    PTR_ADDI    S1,    S1,  0x04
+    fst.s       F1,    TD,  0x04
+    PTR_ADDI    S2,    S2,  0x04
+    fst.s       F2,    TD,  0x08
+    PTR_ADDI    S3,    S3,  0x04
+    fst.s       F3,    TD,  0x0C
+    PTR_ADDI    S4,    S4,  0x04
+    fst.s       F4,    TD,  0x10
+    PTR_ADDI    S5,    S5,  0x04
+    fst.s       F5,    TD,  0x14
+    PTR_ADDI    S6,    S6,  0x04
+    fst.s       F6,    TD,  0x18
+    PTR_ADDI    S7,    S7,  0x04
+    fst.s       F7,    TD,  0x1C
+    PTR_ADDI    S8,    S8,  0x04
+
+    PTR_ADDI    TD,    TD,  0x20        // 8 floats written
+    PTR_ADDI    I,     I,   -1
+    blt         ZERO,  I,   .L_M1
+.L_M0:
+    blt       ZERO,  J,   .L_N8         // more 8-column groups left
+.L_N7:                                  // ---- remaining N & 7 columns ----
+    andi      J,     N,   0x07
+    beq       ZERO,  J,   .L_N0         // N is a multiple of 8: done
+
+    andi      J,     N,   0x04
+    beq       ZERO,  J,   .L_N3         // no group of 4 columns
+.L_N4:                                  // ---- one group of 4 columns ----
+    move         S1,   TS               // column 0
+    PTR_ADD      S2,   TS,   TL         // column 1
+    PTR_SRAI     I,    M,    0x02       // I = number of full 4-row blocks
+    PTR_ADD      S3,   S2,   TL         // column 2
+    PTR_ADD      S4,   S2,   T0         // column 3 (T0 = 2 * TL)
+    PTR_ADD      TS,   S3,   T0         // TS = column 4 = first column after this group
+    beq          I,    ZERO, .L_N4_M3
+.align 5
+.L_N4_M4:                               // 4 rows x 4 columns per iteration, 128-bit registers, 16-byte steps
+    GLD v, , $vr0, S1, 0, $vr1, S2, 0, $vr2, S3, 0, $vr3, S4, 0
+    GSBUTTERFLY v, w, $vr4, $vr5, $vr2, $vr0   // NOTE(review): the four GSBUTTERFLY steps presumably form a
+    GSBUTTERFLY v, w, $vr6, $vr7, $vr3, $vr1   // 4x4 transpose of 32-bit words; confirm against the macro
+    GSBUTTERFLY v, w, $vr0, $vr1, $vr6, $vr4   // definition in loongarch64_asm.S
+    GSBUTTERFLY v, w, $vr2, $vr3, $vr7, $vr5
+    GST v, , $vr0, TD, 0x00, $vr1, TD, 0x10, $vr2, TD, 0x20, $vr3, TD, 0x30
+    PTR_ADDI     S1,   S1,   0x10
+    PTR_ADDI     S2,   S2,   0x10
+    PTR_ADDI     S3,   S3,   0x10
+    PTR_ADDI     S4,   S4,   0x10
+    PTR_ADDI     TD,   TD,   0x40       // 4 vectors x 16 bytes were written
+    PTR_ADDI     I,    I,    -1
+    blt          ZERO, I,    .L_N4_M4
+.L_N4_M3:                               // leftover rows of the 4-column group
+    andi      I,     M,    0x03         // I = M % 4
+    beq       I,     ZERO, .L_N3
+.align 5
+.L_N4_M1:                               // one row per iteration: 4 floats, one from each column
+    fld.s     F0,    S1,  0x00
+    fld.s     F1,    S2,  0x00
+    fld.s     F2,    S3,  0x00
+    fld.s     F3,    S4,  0x00
+
+    fst.s       F0,    TD,  0x00
+    PTR_ADDI    S1,    S1,  0x04
+    fst.s       F1,    TD,  0x04
+    PTR_ADDI    S2,    S2,  0x04
+    fst.s       F2,    TD,  0x08
+    PTR_ADDI    S3,    S3,  0x04
+    fst.s       F3,    TD,  0x0C
+    PTR_ADDI    S4,    S4,  0x04
+
+    PTR_ADDI    TD,    TD,  0x10        // 4 floats written
+    PTR_ADDI    I,     I,   -1
+    blt         ZERO,  I,   .L_N4_M1
+.L_N3:                                  // ---- remaining N & 3 columns ----
+    andi      J,     N,   0x03
+    beq       ZERO,  J,   .L_N0         // nothing left
+
+    andi      J,     N,   0x02
+    beq       ZERO,  J,   .L_N1         // no column pair, only the single last column
+.L_N2:                                  // ---- one pair of columns ----
+    move         S1,   TS               // column 0 of the pair
+    PTR_ADD      S2,   TS,   TL         // column 1
+    PTR_SRAI     I,    M,    0x01       // I = number of 2-row blocks
+    PTR_ADD      TS,   S2,   TL         // TS = first column after the pair
+    beq          I,    ZERO, .L_N2_M1
+.align 5
+.L_N2_M2:                               // 2 rows x 2 columns per iteration (8 bytes from each column)
+    GLD f, d, F0, S1, 0x00, F1, S2, 0x00
+    vilvl.w $vr0, $vr1, $vr0            // interleave low words: c0[r], c1[r], c0[r+1], c1[r+1]
+    GST v, , $vr0, TD, 0x00
+    PTR_ADDI     S1,   S1,   0x08
+    PTR_ADDI     S2,   S2,   0x08
+    PTR_ADDI     TD,   TD,   0x10       // 4 floats written
+    PTR_ADDI     I,    I,    -1
+    blt          ZERO, I,    .L_N2_M2
+.L_N2_M1:                               // odd last row of the pair
+    andi      I,     M,    0x01
+    beq       I,     ZERO, .L_N1
+
+    fld.s     F0,    S1,  0x00
+    fld.s     F1,    S2,  0x00
+    fst.s       F0,    TD,  0x00
+    PTR_ADDI    S1,    S1,  0x04
+    fst.s       F1,    TD,  0x04
+    PTR_ADDI    S2,    S2,  0x04
+    PTR_ADDI    TD,    TD,  0x08        // 2 floats written
+.align 5
+.L_N1:                                  // ---- final single column, only when N is odd ----
+    andi      J,     N,   0x01          // .L_N1 is also reached by fall-through from the .L_N2 path, so N & 1 has
+    beq       ZERO,  J,   .L_N0         // to be tested here; without it N % 4 == 2 copied one column too many
+    move      S1,    TS
+    beq       ZERO,  M,   .L_N0         // no rows: nothing to copy
+.L_N1_M1:                               // one float per iteration; M itself is the counter (not needed afterwards)
+    fld.s       F0,    S1,  0x00
+    PTR_ADDI    S1,    S1,  0x04
+    fst.s       F0,    TD,  0x00
+    PTR_ADDI    TD,    TD,  0x04
+    PTR_ADDI    M,     M,   -1
+    blt         ZERO,  M,   .L_N1_M1
+.L_N0:
+    pop_if_used 17, 20                  // matches push_if_used 17, 20 in the prologue
+    jirl    $r0, $r1, 0x0               // return to the address in $r1
+    EPILOGUE
diff --git a/kernel/loongarch64/sgemm_tcopy_16_lasx.S b/kernel/loongarch64/sgemm_tcopy_16_lasx.S
new file mode 100644
index 000000000..d9789bdcd
--- /dev/null
+++ b/kernel/loongarch64/sgemm_tcopy_16_lasx.S
@@ -0,0 +1,526 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2023/08/23 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*********************************************************************/
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9    // inner counter (N >> 4) and scratch for the N & mask tests
+#define J      $r10   // outer counter (M >> 3) and scratch for the M & mask tests
+#define S0     $r11   // first source row (S0 + r * TL) of the next row block
+#define S1     $r12   // S1..S8: read cursors, one per source row of the current row block
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define S8     $r19
+#define P0     $r20   // start of the next row block inside the 16-wide panel area of DST
+#define P1     $r23   // write cursor for 16-wide chunks; steps by T1 from panel to panel
+#define P2     $r24   // write cursor of the 8-wide remainder area
+#define P3     $r25   // write cursor of the 4-wide remainder area
+#define P4     $r26   // write cursor of the 2-wide remainder area
+#define P5     $r27   // write cursor of the 1-wide remainder area
+#define T0     $r28   // scratch in the prologue, then 2 * TL
+#define T1     $r29   // scratch in the prologue, then M * 64 bytes (size of one 16-wide panel)
+#define TL     $r7    // same register as LDA: holds LDA * 4 (bytes between rows) after the prologue
+#define ZERO   $r0    // $r0, used as the zero operand of the compare-and-branch instructions
+
+/* LASX vectors */
+#define U0     $xr0   // U0..U7: 256-bit staging registers for the xvld/xvst copies
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+
+// Loops outline
+//.L_M8 <-------------------
+//|   .L_N16:               |
+//|   .L_N15:               |
+//|   .L_N8:                |
+//|   .L_N7:                | Main Loop
+//|   .L_N4:                |
+//|   .L_N3:                |
+//|   .L_N2:                |
+//|   .L_N1:                |
+//|   .L_N0: ---------------
+//.L_M7
+//.L_M4
+//|   .L_M4_N16:
+//|   .L_M4_N15:
+//|   .L_M4_N8:
+//|   .L_M4_N7:
+//|   .L_M4_N4:
+//|   .L_M4_N3:
+//|   .L_M4_N2:
+//|   .L_M4_N1:
+//.L_M3
+//.L_M2
+//|   .L_M2_N16:
+//|   .L_M2_N15:
+//|   .L_M2_N8:
+//|   .L_M2_N7:
+//|   .L_M2_N4:
+//|   .L_M2_N3:
+//|   .L_M2_N2:
+//|   .L_M2_N1:
+//.L_M1
+//|   .L_M1_N16:
+//|   .L_M1_N15:
+//|   .L_M1_N8:
+//|   .L_M1_N7:
+//|   .L_M1_N4:
+//|   .L_M1_N3:
+//|   .L_M1_N2:
+//|   .L_M1_N1:
+//.L_M0
+
+    PROLOGUE                             // sgemm tcopy, 16-wide: pack M rows of N floats (stride LDA) from SRC into DST
+    push_if_used 24, 8                   // macro from loongarch64_asm.S; paired with pop_if_used 24, 8 at .L_M0
+
+    move       S0,     SRC
+    move       P0,     DST               // the 16-wide panels start at the beginning of DST
+
+    PTR_SRAI     T0,     N,     0x04
+    PTR_SRAI     T1,     N,     0x03
+    PTR_SLLI     T0,     T0,    0x04     // T0 = N rounded down to a multiple of 16
+    PTR_SLLI     T1,     T1,    0x03     // T1 = N rounded down to a multiple of 8
+
+    PTR_MUL      P2,     M,     T0
+    PTR_MUL      P3,     M,     T1
+    PTR_SLLI     P2,     P2,    0x02     // element counts -> bytes (4-byte floats)
+    PTR_SLLI     P3,     P3,    0x02
+    PTR_ADD      P2,     DST,   P2       // P2 = DST + M * (N rounded to 16) floats: 8-wide remainder area
+    PTR_ADD      P3,     DST,   P3       // P3 = DST + M * (N rounded to 8) floats: 4-wide remainder area
+
+    PTR_SRAI     T0,     N,     0x02
+    PTR_SRAI     T1,     N,     0x01
+    PTR_SLLI     T0,     T0,    0x02     // T0 = N rounded down to a multiple of 4
+    PTR_SLLI     T1,     T1,    0x01     // T1 = N rounded down to a multiple of 2
+    PTR_MUL      P4,     M,     T0
+    PTR_MUL      P5,     M,     T1
+    PTR_SLLI     P4,     P4,    0x02
+    PTR_SLLI     P5,     P5,    0x02
+    PTR_ADD      P4,     DST,   P4       // P4 = DST + M * (N rounded to 4) floats: 2-wide remainder area
+    PTR_ADD      P5,     DST,   P5       // P5 = DST + M * (N rounded to 2) floats: 1-wide remainder area
+
+    PTR_SLLI     TL,     LDA,   0x02     // TL = LDA * 4 = bytes between rows (in place, TL is $r7 = LDA)
+    PTR_SRAI     J,      M,     0x03     // J = number of full 8-row blocks
+    PTR_SLLI     T0,     TL,    0x01     // T0 = 2 * TL
+    PTR_SLLI     T1,     M,     0x06     // T1 = M * 64 bytes = M rows x 16 floats = one whole 16-wide panel
+    beq          ZERO,   J,     .L_M7    // fewer than 8 rows: only the row remainders to do
+.align 5
+.L_M8:                                   // ---- outer loop: one block of 8 rows per iteration ----
+    move         S1,     S0              // row 0 of the block
+    PTR_ADD      S2,     S0,    TL       // row 1
+    PTR_ADD      S3,     S1,    T0       // row 2 (T0 = 2 * TL)
+    PTR_ADD      S4,     S2,    T0       // row 3
+    PTR_ADD      S5,     S3,    T0       // row 4
+    PTR_ADD      S6,     S4,    T0       // row 5
+    PTR_ADD      S7,     S5,    T0       // row 6
+    PTR_ADD      S8,     S6,    T0       // row 7
+    PTR_ADD      S0,     S7,    T0       // S0 = row 8 = first row of the next block
+
+    move         P1,     P0              // this block's slot inside the first 16-wide panel
+    PTR_ADDI     P0,     P0,    0x200    // next block's slot: 8 rows x 16 floats x 4 bytes further on
+
+    PTR_SRAI     I,      N,     0x04     // I = number of 16-column chunks
+    PTR_ADDI     J,      J,     -1       // block consumed; J is tested at .L_N0
+    beq          ZERO,   I,     .L_N15
+.L_N16:                                  // 8 rows x 16 columns per iteration: two 32-byte vectors per row
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S1,    0x20
+    xvld       U2,     S2,    0x00
+    xvld       U3,     S2,    0x20
+
+    xvst       U0,     P1,    0x00       // rows are stored back to back, 64 bytes each
+    xvst       U1,     P1,    0x20
+    xvst       U2,     P1,    0x40
+    xvst       U3,     P1,    0x60
+
+    xvld       U4,     S3,    0x00
+    xvld       U5,     S3,    0x20
+    xvld       U6,     S4,    0x00
+    xvld       U7,     S4,    0x20
+
+    xvst       U4,     P1,    0x80
+    xvst       U5,     P1,    0xA0
+    xvst       U6,     P1,    0xC0
+    xvst       U7,     P1,    0xE0
+
+    xvld       U0,     S5,    0x00       // U0..U7 are reused for rows 4..7
+    xvld       U1,     S5,    0x20
+    xvld       U2,     S6,    0x00
+    xvld       U3,     S6,    0x20
+
+    xvst       U0,     P1,    0x100
+    xvst       U1,     P1,    0x120
+    xvst       U2,     P1,    0x140
+    xvst       U3,     P1,    0x160
+
+    xvld       U4,     S7,    0x00
+    xvld       U5,     S7,    0x20
+    xvld       U6,     S8,    0x00
+    xvld       U7,     S8,    0x20
+
+    xvst       U4,     P1,    0x180
+    xvst       U5,     P1,    0x1A0
+    xvst       U6,     P1,    0x1C0
+    xvst       U7,     P1,    0x1E0
+
+    PTR_ADDI     S1,     S1,    0x40     // every row cursor moves 16 floats to the right
+    PTR_ADDI     S2,     S2,    0x40
+    PTR_ADDI     S3,     S3,    0x40
+    PTR_ADDI     S4,     S4,    0x40
+    PTR_ADDI     S5,     S5,    0x40
+    PTR_ADDI     S6,     S6,    0x40
+    PTR_ADDI     S7,     S7,    0x40
+    PTR_ADDI     S8,     S8,    0x40
+
+    PTR_ADDI     I,      I,     -1
+    PTR_ADD      P1,     P1,    T1       // same row block, next 16-wide panel (M * 64 bytes further on)
+    blt          ZERO,   I,     .L_N16
+.L_N15:
+    andi       I,      N,     0x08
+    beq        ZERO,   I,     .L_N7      // no 8-column remainder
+.L_N8:                                   // 8 rows x 8 columns -> 8-wide area (P2)
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S2,    0x00
+    xvld       U2,     S3,    0x00
+    xvld       U3,     S4,    0x00
+    xvld       U4,     S5,    0x00
+    xvld       U5,     S6,    0x00
+    xvld       U6,     S7,    0x00
+    xvld       U7,     S8,    0x00
+
+    GST xv, , U0, P2, 0x00, U1, P2, 0x20, U2, P2, 0x40, U3, P2, 0x60, \
+              U4, P2, 0x80, U5, P2, 0xA0, U6, P2, 0xC0, U7, P2, 0xE0
+
+    PTR_ADDI     S1,     S1,    0x20
+    PTR_ADDI     S2,     S2,    0x20
+    PTR_ADDI     S3,     S3,    0x20
+    PTR_ADDI     S4,     S4,    0x20
+    PTR_ADDI     S5,     S5,    0x20
+    PTR_ADDI     S6,     S6,    0x20
+    PTR_ADDI     S7,     S7,    0x20
+    PTR_ADDI     S8,     S8,    0x20
+    PTR_ADDI     P2,     P2,    0x100    // 8 rows x 32 bytes written
+.L_N7:
+    andi       I,      N,     0x04
+    beq        ZERO,   I,     .L_N3      // no 4-column remainder
+.L_N4:                                   // 8 rows x 4 columns -> 4-wide area (P3), 16 bytes per row
+    GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00, \
+             $vr4, S5, 0x00, $vr5, S6, 0x00, $vr6, S7, 0x00, $vr7, S8, 0x00
+    GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10, $vr2, P3, 0x20, $vr3, P3, 0x30, \
+             $vr4, P3, 0x40, $vr5, P3, 0x50, $vr6, P3, 0x60, $vr7, P3, 0x70
+    PTR_ADDI     S1,     S1,    0x10
+    PTR_ADDI     S2,     S2,    0x10
+    PTR_ADDI     S3,     S3,    0x10
+    PTR_ADDI     S4,     S4,    0x10
+    PTR_ADDI     S5,     S5,    0x10
+    PTR_ADDI     S6,     S6,    0x10
+    PTR_ADDI     S7,     S7,    0x10
+    PTR_ADDI     S8,     S8,    0x10
+    PTR_ADDI     P3,     P3,    0x80     // 8 rows x 16 bytes written
+.L_N3:
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_N1      // no 2-column remainder
+.L_N2:                                   // 8 rows x 2 columns -> 2-wide area (P4), 8 bytes per row
+    GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \
+              $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00
+    GST f, d, $f0, P4, 0x00, $f1, P4, 0x08, $f2, P4, 0x10, $f3, P4, 0x18, \
+              $f4, P4, 0x20, $f5, P4, 0x28, $f6, P4, 0x30, $f7, P4, 0x38
+    PTR_ADDI     S1,     S1,    0x08
+    PTR_ADDI     S2,     S2,    0x08
+    PTR_ADDI     S3,     S3,    0x08
+    PTR_ADDI     S4,     S4,    0x08
+    PTR_ADDI     S5,     S5,    0x08
+    PTR_ADDI     S6,     S6,    0x08
+    PTR_ADDI     S7,     S7,    0x08
+    PTR_ADDI     S8,     S8,    0x08
+    PTR_ADDI     P4,     P4,    0x40     // 8 rows x 8 bytes written
+.L_N1:
+    andi       I,      N,     0x01
+    beq        ZERO,   I,     .L_N0      // no single last column
+
+    GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \
+              $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00
+    GST f, s, $f0, P5, 0x00, $f1, P5, 0x04, $f2, P5, 0x08, $f3, P5, 0x0C, \
+              $f4, P5, 0x10, $f5, P5, 0x14, $f6, P5, 0x18, $f7, P5, 0x1C
+    PTR_ADDI     S1,     S1,    0x04
+    PTR_ADDI     S2,     S2,    0x04
+    PTR_ADDI     S3,     S3,    0x04
+    PTR_ADDI     S4,     S4,    0x04
+    PTR_ADDI     S5,     S5,    0x04
+    PTR_ADDI     S6,     S6,    0x04
+    PTR_ADDI     S7,     S7,    0x04
+    PTR_ADDI     S8,     S8,    0x04
+    PTR_ADDI     P5,     P5,    0x20     // 8 rows x 4 bytes written
+.L_N0:
+    blt        ZERO,   J,     .L_M8      // more 8-row blocks left
+.L_M7:                                   // ---- remaining rows: block of 4 if M & 4 ----
+    andi       J,      M,     0x04
+    beq        ZERO,   J,     .L_M3
+.L_M4:
+    move         S1,     S0              // row 0 of the block
+    PTR_ADD      S2,     S0,    TL       // row 1
+    PTR_ADD      S3,     S1,    T0       // row 2 (T0 = 2 * TL)
+    PTR_ADD      S4,     S2,    T0       // row 3
+    PTR_ADD      S0,     S3,    T0       // S0 = row 4 = first row after this block
+
+    move         P1,     P0
+    PTR_ADDI     P0,     P0,    0x100    // 4 rows x 16 floats x 4 bytes
+
+    PTR_SRAI     I,      N,     0x04     // I = number of 16-column chunks
+    beq          ZERO,   I,     .L_M4_N15
+.align 5
+.L_M4_N16:                               // 4 rows x 16 columns per iteration
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S1,    0x20
+    xvld       U2,     S2,    0x00
+    xvld       U3,     S2,    0x20
+
+    xvst       U0,     P1,    0x00
+    xvst       U1,     P1,    0x20
+    xvst       U2,     P1,    0x40
+    xvst       U3,     P1,    0x60
+
+    xvld       U4,     S3,    0x00
+    xvld       U5,     S3,    0x20
+    xvld       U6,     S4,    0x00
+    xvld       U7,     S4,    0x20
+
+    xvst       U4,     P1,    0x80
+    xvst       U5,     P1,    0xA0
+    xvst       U6,     P1,    0xC0
+    xvst       U7,     P1,    0xE0
+
+    PTR_ADDI     S1,     S1,    0x40
+    PTR_ADDI     S2,     S2,    0x40
+    PTR_ADDI     S3,     S3,    0x40
+    PTR_ADDI     S4,     S4,    0x40
+    PTR_ADDI     I,      I,     -1
+    PTR_ADD      P1,     P1,    T1       // next 16-wide panel (M * 64 bytes further on)
+    blt          ZERO,   I,     .L_M4_N16
+.L_M4_N15:
+    andi       I,      N,     0x08
+    beq        ZERO,   I,     .L_M4_N7
+.L_M4_N8:                                // 4 rows x 8 columns -> P2
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S2,    0x00
+    xvld       U2,     S3,    0x00
+    xvld       U3,     S4,    0x00
+
+    GST xv, , U0, P2, 0x00, U1, P2, 0x20, U2, P2, 0x40, U3, P2, 0x60
+
+    PTR_ADDI     S1,     S1,    0x20
+    PTR_ADDI     S2,     S2,    0x20
+    PTR_ADDI     S3,     S3,    0x20
+    PTR_ADDI     S4,     S4,    0x20
+    PTR_ADDI     P2,     P2,    0x80     // 4 rows x 32 bytes written
+.L_M4_N7:
+    andi       I,      N,     0x04
+    beq        ZERO,   I,     .L_M4_N3
+.L_M4_N4:                                // 4 rows x 4 columns -> P3
+    GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00
+    GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10, $vr2, P3, 0x20, $vr3, P3, 0x30
+    PTR_ADDI     S1,     S1,    0x10
+    PTR_ADDI     S2,     S2,    0x10
+    PTR_ADDI     S3,     S3,    0x10
+    PTR_ADDI     S4,     S4,    0x10
+    PTR_ADDI     P3,     P3,    0x40     // 4 rows x 16 bytes written
+.L_M4_N3:
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_M4_N1
+.L_M4_N2:                                // 4 rows x 2 columns -> P4
+    GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00
+    GST f, d, $f0, P4, 0x00, $f1, P4, 0x08, $f2, P4, 0x10, $f3, P4, 0x18
+    PTR_ADDI     S1,     S1,    0x08
+    PTR_ADDI     S2,     S2,    0x08
+    PTR_ADDI     S3,     S3,    0x08
+    PTR_ADDI     S4,     S4,    0x08
+    PTR_ADDI     P4,     P4,    0x20     // 4 rows x 8 bytes written
+.L_M4_N1:
+    andi        I,      N,     0x01
+    beq         ZERO,   I,     .L_M3
+
+    GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00
+    GST f, s, $f0, P5, 0x00, $f1, P5, 0x04, $f2, P5, 0x08, $f3, P5, 0x0C
+    PTR_ADDI     S1,     S1,    0x04
+    PTR_ADDI     S2,     S2,    0x04
+    PTR_ADDI     S3,     S3,    0x04
+    PTR_ADDI     S4,     S4,    0x04
+    PTR_ADDI     P5,     P5,    0x10     // 4 rows x 4 bytes written
+.L_M3:                                   // ---- remaining rows: pair of rows if M & 2 ----
+    andi       J,      M,     0x02
+    beq        ZERO,   J,     .L_M1
+.L_M2:
+    move         S1,     S0              // row 0 of the pair
+    PTR_ADD      S2,     S0,    TL       // row 1
+    PTR_ADD      S0,     S0,    T0       // S0 = first row after the pair (T0 = 2 * TL)
+
+    move         P1,     P0
+    PTR_ADDI     P0,     P0,    0x80     // 2 rows x 16 floats x 4 bytes
+
+    PTR_SRAI     I,      N,     0x04     // I = number of 16-column chunks
+    beq          ZERO,   I,     .L_M2_N15
+.align 5
+.L_M2_N16:                               // 2 rows x 16 columns per iteration
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S1,    0x20
+    xvld       U2,     S2,    0x00
+    xvld       U3,     S2,    0x20
+
+    xvst       U0,     P1,    0x00
+    xvst       U1,     P1,    0x20
+    xvst       U2,     P1,    0x40
+    xvst       U3,     P1,    0x60
+
+    PTR_ADDI     S1,     S1,    0x40
+    PTR_ADDI     S2,     S2,    0x40
+    PTR_ADDI     I,      I,     -1
+    PTR_ADD      P1,     P1,    T1       // next 16-wide panel (M * 64 bytes further on)
+    blt          ZERO,   I,     .L_M2_N16
+.L_M2_N15:
+    andi       I,      N,     0x08
+    beq        ZERO,   I,     .L_M2_N7
+.L_M2_N8:                                // 2 rows x 8 columns -> P2
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S2,    0x00
+
+    GST xv, , U0, P2, 0x00, U1, P2, 0x20
+
+    PTR_ADDI     S1,     S1,    0x20
+    PTR_ADDI     S2,     S2,    0x20
+    PTR_ADDI     P2,     P2,    0x40     // 2 rows x 32 bytes written
+.L_M2_N7:
+    andi       I,      N,     0x04
+    beq        ZERO,   I,     .L_M2_N3
+.L_M2_N4:                                // 2 rows x 4 columns -> P3
+    GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00
+    GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10
+    PTR_ADDI     S1,     S1,    0x10
+    PTR_ADDI     S2,     S2,    0x10
+    PTR_ADDI     P3,     P3,    0x20     // 2 rows x 16 bytes written
+.L_M2_N3:
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_M2_N1
+.L_M2_N2:                                // 2 rows x 2 columns -> P4
+    GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00
+    GST f, d, $f0, P4, 0x00, $f1, P4, 0x08
+    PTR_ADDI     S1,     S1,    0x08
+    PTR_ADDI     S2,     S2,    0x08
+    PTR_ADDI     P4,     P4,    0x10     // 2 rows x 8 bytes written
+.L_M2_N1:
+    andi       I,      N,     0x01
+    beq        ZERO,   I,     .L_M1
+
+    GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00
+    GST f, s, $f0, P5, 0x00, $f1, P5, 0x04
+    PTR_ADDI     S1,     S1,    0x04
+    PTR_ADDI     S2,     S2,    0x04
+    PTR_ADDI     P5,     P5,    0x08     // 2 rows x 4 bytes written
+.L_M1:                                   // ---- remaining rows: single last row if M & 1 ----
+    andi       J,      M,     0x01
+    beq        ZERO,   J,     .L_M0
+
+    move         S1,     S0              // the only row of this block
+    PTR_ADD      S2,     S0,    TL       // NOTE(review): S2 is never read on this single-row path
+
+    move         P1,     P0
+    PTR_ADDI     P0,     P0,    0x40     // 1 row x 16 floats x 4 bytes (P0 is not read again before the return)
+
+    PTR_SRAI     I,      N,     0x04     // I = number of 16-column chunks
+    beq          ZERO,   I,     .L_M1_N15
+.align 5
+.L_M1_N16:                               // 1 row x 16 columns per iteration
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S1,    0x20
+
+    xvst       U0,     P1,    0x00
+    xvst       U1,     P1,    0x20
+
+    PTR_ADDI     S1,     S1,    0x40
+    PTR_ADDI     I,      I,     -1
+    PTR_ADD      P1,     P1,    T1       // next 16-wide panel (M * 64 bytes further on)
+    blt          ZERO,   I,     .L_M1_N16
+.L_M1_N15:
+    andi       I,      N,     0x08
+    beq        ZERO,   I,     .L_M1_N7
+.L_M1_N8:                                // 8 columns -> P2
+    xvld       U0,     S1,    0x00
+
+    GST xv, , U0, P2, 0x00
+
+    PTR_ADDI     S1,     S1,    0x20
+    PTR_ADDI     P2,     P2,    0x20
+.L_M1_N7:
+    andi       I,      N,     0x04
+    beq        ZERO,   I,     .L_M1_N3
+.L_M1_N4:                                // 4 columns -> P3
+    GLD v, , $vr0, S1, 0x00
+    GST v, , $vr0, P3, 0x00
+    PTR_ADDI     S1,     S1,    0x10
+    PTR_ADDI     P3,     P3,    0x10
+.L_M1_N3:
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_M1_N1
+.L_M1_N2:                                // 2 columns -> P4
+    GLD f, d, $f0, S1, 0x00
+    GST f, d, $f0, P4, 0x00
+    PTR_ADDI     S1,     S1,    0x08
+    PTR_ADDI     P4,     P4,    0x08
+.L_M1_N1:
+    andi       I,      N,     0x01
+    beq        ZERO,   I,     .L_M0
+
+    GLD f, s, $f0, S1, 0x00              // last single element -> P5
+    GST f, s, $f0, P5, 0x00
+    PTR_ADDI     S1,     S1,    0x04
+    PTR_ADDI     P5,     P5,    0x04
+.L_M0:
+    pop_if_used 24, 8                    // matches push_if_used 24, 8 in the prologue
+    jirl       $r0,    $r1,   0x00       // return to the address in $r1
+    EPILOGUE
diff --git a/kernel/loongarch64/sgemm_tcopy_8_lasx.S b/kernel/loongarch64/sgemm_tcopy_8_lasx.S
new file mode 100644
index 000000000..725a47a60
--- /dev/null
+++ b/kernel/loongarch64/sgemm_tcopy_8_lasx.S
@@ -0,0 +1,406 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2023/08/23 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*********************************************************************/
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9    // inner counter / scratch for the N & 4, N & 2, N & 1 tests
+#define J      $r10   // outer counter over M / scratch for the M & 4, M & 2, M & 1 tests
+#define S0     $r11   // base of the next group of source rows
+#define S1     $r12   // S1..S8: per-row source read pointers
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define S8     $r19
+#define P0     $r20   // destination base for the 8-column panels of the current row group
+#define P1     $r23   // working destination pointer inside the 8-column loops
+#define P2     $r24   // destination cursor used by the N & 4 tail
+#define P3     $r25   // destination cursor used by the N & 2 tail
+#define P4     $r26   // destination cursor used by the N & 1 tail
+#define T0     $r27   // scratch; holds 2 * TL while the row loops run
+#define T1     $r28   // scratch; holds M << 5 while the row loops run
+#define TL     $r7    // same register as LDA; holds LDA << 2 after setup
+#undef  ZERO
+#define ZERO   $r0
+
+/* LASX vectors */
+#define U0     $xr0   // U0..U7: one 256-bit vector per source row in the 8-column loops
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+
+// Loops outline
+//.L_M8 <-------------------
+//|   .L_N8:                |
+//|   .L_N7:                | Main Loop
+//|   .L_N4:                |
+//|   .L_N3:                |
+//|   .L_N2:                |
+//|   .L_N1:                |
+//|   .L_N0: ---------------
+//.L_M7
+//.L_M4
+//|   .L_M4_N8:
+//|   .L_M4_N7:
+//|   .L_M4_N4:
+//|   .L_M4_N3:
+//|   .L_M4_N2:
+//|   .L_M4_N1:
+//.L_M3
+//.L_M2
+//|   .L_M2_N8:
+//|   .L_M2_N7:
+//|   .L_M2_N4:
+//|   .L_M2_N3:
+//|   .L_M2_N2:
+//|   .L_M2_N1:
+//.L_M1
+//|   .L_M1_N8:
+//|   .L_M1_N7:
+//|   .L_M1_N4:
+//|   .L_M1_N3:
+//|   .L_M1_N2:
+//|   .L_M1_N1:
+//.L_M0
+
+    PROLOGUE // Copies M rows of N 4-byte elements from SRC (row stride LDA elements) into DST, grouped in 8/4/2/1-column panels
+    push_if_used 23, 8
+
+    move       S0,     SRC // S0 = base of the next group of source rows
+    move       P0,     DST // P0 = destination base for the 8-column panels
+
+    PTR_SRAI     T0,     N,     0x04 // NOTE(review): this T0 value is overwritten by the next PTR_SRAI T0 below before it is read
+    PTR_SRAI     T1,     N,     0x03
+    PTR_SLLI     T0,     T0,    0x04
+    PTR_SLLI     T1,     T1,    0x03 // T1 = (N >> 3) << 3
+
+    PTR_MUL      P2,     M,     T1
+    PTR_SLLI     P2,     P2,    0x02
+    PTR_ADD      P2,     DST,   P2 // P2 = DST + M * ((N >> 3) << 3) * 4 bytes
+    PTR_SRAI     T0,     N,     0x02
+    PTR_SRAI     T1,     N,     0x01
+    PTR_SLLI     T0,     T0,    0x02 // T0 = (N >> 2) << 2
+    PTR_SLLI     T1,     T1,    0x01 // T1 = (N >> 1) << 1
+    PTR_MUL      P3,     M,     T0
+    PTR_MUL      P4,     M,     T1
+    PTR_SLLI     P3,     P3,    0x02
+    PTR_SLLI     P4,     P4,    0x02
+    PTR_ADD      P3,     DST,   P3 // P3 = DST + M * ((N >> 2) << 2) * 4 bytes
+    PTR_ADD      P4,     DST,   P4 // P4 = DST + M * ((N >> 1) << 1) * 4 bytes
+
+    PTR_SLLI     TL,     LDA,   0x02 // TL = LDA * 4 (row stride in bytes); overwrites LDA, same register
+    PTR_SRAI     J,      M,     0x03 // J = M >> 3: number of 8-row groups
+    PTR_SLLI     T0,     TL,    0x01 // T0 = 2 * TL (two rows, in bytes)
+    PTR_SLLI     T1,     M,     0x05 // T1 = M * 32 bytes: step added to P1 after each 8-column block
+    beq          ZERO,   J,     .L_M7
+.align 5
+.L_M8:
+    move         S1,     S0 // S1..S8 = eight consecutive source rows, TL bytes apart
+    PTR_ADD      S2,     S0,    TL
+    PTR_ADD      S3,     S1,    T0
+    PTR_ADD      S4,     S2,    T0
+    PTR_ADD      S5,     S3,    T0
+    PTR_ADD      S6,     S4,    T0
+    PTR_ADD      S7,     S5,    T0
+    PTR_ADD      S8,     S6,    T0
+    PTR_ADD      S0,     S7,    T0 // S0 = row after these eight, for the next group
+
+    move         P1,     P0
+    PTR_ADDI     P0,     P0,    0x100 // next row group starts 8 rows * 8 columns * 4 bytes further on
+
+    PTR_SRAI     I,      N,     0x03 // I = number of full 8-column blocks
+    PTR_ADDI     J,      J,     -1
+    beq          ZERO,   I,     .L_N7
+.L_N8:
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S2,    0x00
+    xvld       U2,     S3,    0x00
+    xvld       U3,     S4,    0x00
+    xvld       U4,     S5,    0x00
+    xvld       U5,     S6,    0x00
+    xvld       U6,     S7,    0x00
+    xvld       U7,     S8,    0x00
+
+    GST xv, , U0, P1, 0x00, U1, P1, 0x20, U2, P1, 0x40, U3, P1, 0x60, \
+              U4, P1, 0x80, U5, P1, 0xA0, U6, P1, 0xC0, U7, P1, 0xE0
+
+    PTR_ADDI     S1,     S1,    0x20 // advance each row pointer by 8 elements (32 bytes)
+    PTR_ADDI     S2,     S2,    0x20
+    PTR_ADDI     S3,     S3,    0x20
+    PTR_ADDI     S4,     S4,    0x20
+    PTR_ADDI     S5,     S5,    0x20
+    PTR_ADDI     S6,     S6,    0x20
+    PTR_ADDI     S7,     S7,    0x20
+    PTR_ADDI     S8,     S8,    0x20
+
+    PTR_ADDI     I,      I,     -1
+    PTR_ADD      P1,     P1,    T1 // jump to the same row group in the next 8-column panel
+    blt          ZERO,   I,     .L_N8
+.L_N7:
+    andi       I,      N,     0x04 // 4-column tail present?
+    beq        ZERO,   I,     .L_N3
+.L_N4:
+    GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00, \
+             $vr4, S5, 0x00, $vr5, S6, 0x00, $vr6, S7, 0x00, $vr7, S8, 0x00
+    GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10, $vr2, P2, 0x20, $vr3, P2, 0x30, \
+             $vr4, P2, 0x40, $vr5, P2, 0x50, $vr6, P2, 0x60, $vr7, P2, 0x70
+    PTR_ADDI     S1,     S1,    0x10
+    PTR_ADDI     S2,     S2,    0x10
+    PTR_ADDI     S3,     S3,    0x10
+    PTR_ADDI     S4,     S4,    0x10
+    PTR_ADDI     S5,     S5,    0x10
+    PTR_ADDI     S6,     S6,    0x10
+    PTR_ADDI     S7,     S7,    0x10
+    PTR_ADDI     S8,     S8,    0x10
+    PTR_ADDI     P2,     P2,    0x80 // 8 rows * 4 columns * 4 bytes
+.L_N3:
+    andi       I,      N,     0x02 // 2-column tail present?
+    beq        ZERO,   I,     .L_N1
+.L_N2:
+    GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \
+              $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00
+    GST f, d, $f0, P3, 0x00, $f1, P3, 0x08, $f2, P3, 0x10, $f3, P3, 0x18, \
+              $f4, P3, 0x20, $f5, P3, 0x28, $f6, P3, 0x30, $f7, P3, 0x38
+    PTR_ADDI     S1,     S1,    0x08
+    PTR_ADDI     S2,     S2,    0x08
+    PTR_ADDI     S3,     S3,    0x08
+    PTR_ADDI     S4,     S4,    0x08
+    PTR_ADDI     S5,     S5,    0x08
+    PTR_ADDI     S6,     S6,    0x08
+    PTR_ADDI     S7,     S7,    0x08
+    PTR_ADDI     S8,     S8,    0x08
+    PTR_ADDI     P3,     P3,    0x40 // 8 rows * 2 columns * 4 bytes
+.L_N1:
+    andi       I,      N,     0x01 // single trailing column present?
+    beq        ZERO,   I,     .L_N0
+
+    GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \
+              $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00
+    GST f, s, $f0, P4, 0x00, $f1, P4, 0x04, $f2, P4, 0x08, $f3, P4, 0x0C, \
+              $f4, P4, 0x10, $f5, P4, 0x14, $f6, P4, 0x18, $f7, P4, 0x1C
+    PTR_ADDI     S1,     S1,    0x04
+    PTR_ADDI     S2,     S2,    0x04
+    PTR_ADDI     S3,     S3,    0x04
+    PTR_ADDI     S4,     S4,    0x04
+    PTR_ADDI     S5,     S5,    0x04
+    PTR_ADDI     S6,     S6,    0x04
+    PTR_ADDI     S7,     S7,    0x04
+    PTR_ADDI     S8,     S8,    0x04
+    PTR_ADDI     P4,     P4,    0x20 // 8 rows * 1 column * 4 bytes
+.L_N0:
+    blt        ZERO,   J,     .L_M8 // repeat while 8-row groups remain
+
+.L_M7:
+    andi       J,      M,     0x04 // 4-row remainder present?
+    beq        ZERO,   J,     .L_M3
+.L_M4:
+    move         S1,     S0 // S1..S4 = four consecutive source rows
+    PTR_ADD      S2,     S0,    TL
+    PTR_ADD      S3,     S1,    T0
+    PTR_ADD      S4,     S2,    T0
+    PTR_ADD      S0,     S3,    T0 // S0 = row after these four
+
+    move         P1,     P0
+    PTR_ADDI     P0,     P0,    0x80 // 4 rows * 8 columns * 4 bytes
+
+    PTR_SRAI     I,      N,     0x03
+    beq          ZERO,   I,     .L_M4_N7
+.align 5
+.L_M4_N8:
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S2,    0x00
+    xvld       U2,     S3,    0x00
+    xvld       U3,     S4,    0x00
+
+    GST xv, , U0, P1, 0x00, U1, P1, 0x20, U2, P1, 0x40, U3, P1, 0x60
+
+    PTR_ADDI     S1,     S1,    0x20
+    PTR_ADDI     S2,     S2,    0x20
+    PTR_ADDI     S3,     S3,    0x20
+    PTR_ADDI     S4,     S4,    0x20
+
+    PTR_ADDI     I,      I,     -1
+    PTR_ADD      P1,     P1,    T1
+    blt          ZERO,   I,     .L_M4_N8
+.L_M4_N7:
+    andi       I,      N,     0x04
+    beq        ZERO,   I,     .L_M4_N3
+.L_M4_N4:
+    GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00
+    GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10, $vr2, P2, 0x20, $vr3, P2, 0x30
+    PTR_ADDI     S1,     S1,    0x10
+    PTR_ADDI     S2,     S2,    0x10
+    PTR_ADDI     S3,     S3,    0x10
+    PTR_ADDI     S4,     S4,    0x10
+    PTR_ADDI     P2,     P2,    0x40 // 4 rows * 4 columns * 4 bytes
+.L_M4_N3:
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_M4_N1
+.L_M4_N2:
+    GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00
+    GST f, d, $f0, P3, 0x00, $f1, P3, 0x08, $f2, P3, 0x10, $f3, P3, 0x18
+    PTR_ADDI     S1,     S1,    0x08
+    PTR_ADDI     S2,     S2,    0x08
+    PTR_ADDI     S3,     S3,    0x08
+    PTR_ADDI     S4,     S4,    0x08
+    PTR_ADDI     P3,     P3,    0x20 // 4 rows * 2 columns * 4 bytes
+.L_M4_N1:
+    andi        I,      N,     0x01
+    beq         ZERO,   I,     .L_M3
+
+    GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00
+    GST f, s, $f0, P4, 0x00, $f1, P4, 0x04, $f2, P4, 0x08, $f3, P4, 0x0C
+    PTR_ADDI     S1,     S1,    0x04
+    PTR_ADDI     S2,     S2,    0x04
+    PTR_ADDI     S3,     S3,    0x04
+    PTR_ADDI     S4,     S4,    0x04
+    PTR_ADDI     P4,     P4,    0x10 // 4 rows * 1 column * 4 bytes
+.L_M3:
+    andi       J,      M,     0x02 // 2-row remainder present?
+    beq        ZERO,   J,     .L_M1
+.L_M2:
+    move         S1,     S0 // S1, S2 = two consecutive source rows
+    PTR_ADD      S2,     S0,    TL
+    PTR_ADD      S0,     S0,    T0 // S0 = row after these two
+
+    move         P1,     P0
+    PTR_ADDI     P0,     P0,    0x40 // 2 rows * 8 columns * 4 bytes
+
+    PTR_SRAI     I,      N,     0x03
+    beq          ZERO,   I,     .L_M2_N7
+.align 5
+.L_M2_N8:
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S2,    0x00
+
+    GST xv, , U0, P1, 0x00, U1, P1, 0x20
+
+    PTR_ADDI     S1,     S1,    0x20
+    PTR_ADDI     S2,     S2,    0x20
+    PTR_ADDI     I,      I,     -1
+    PTR_ADD      P1,     P1,    T1
+    blt          ZERO,   I,     .L_M2_N8
+.L_M2_N7:
+    andi       I,      N,     0x04
+    beq        ZERO,   I,     .L_M2_N3
+.L_M2_N4:
+    GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00
+    GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10
+    PTR_ADDI     S1,     S1,    0x10
+    PTR_ADDI     S2,     S2,    0x10
+    PTR_ADDI     P2,     P2,    0x20 // 2 rows * 4 columns * 4 bytes
+.L_M2_N3:
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_M2_N1
+.L_M2_N2:
+    GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00
+    GST f, d, $f0, P3, 0x00, $f1, P3, 0x08
+    PTR_ADDI     S1,     S1,    0x08
+    PTR_ADDI     S2,     S2,    0x08
+    PTR_ADDI     P3,     P3,    0x10 // 2 rows * 2 columns * 4 bytes
+.L_M2_N1:
+    andi       I,      N,     0x01
+    beq        ZERO,   I,     .L_M1
+
+    GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00
+    GST f, s, $f0, P4, 0x00, $f1, P4, 0x04
+    PTR_ADDI     S1,     S1,    0x04
+    PTR_ADDI     S2,     S2,    0x04
+    PTR_ADDI     P4,     P4,    0x08 // 2 rows * 1 column * 4 bytes
+.L_M1:
+    andi       J,      M,     0x01 // final single row present?
+    beq        ZERO,   J,     .L_M0
+
+    move         S1,     S0
+    PTR_ADD      S2,     S0,    TL // NOTE(review): S2 is not read anywhere on the .L_M1 path
+
+    move         P1,     P0
+    PTR_ADDI     P0,     P0,    0x20 // NOTE(review): P0 is not read again before the routine returns
+
+    PTR_SRAI     I,      N,     0x03
+    beq          ZERO,   I,     .L_M1_N7
+.align 5
+.L_M1_N8:
+    xvld       U0,     S1,    0x00
+
+    GST xv, , U0, P1, 0x00
+
+    PTR_ADDI     S1,     S1,    0x20
+
+    PTR_ADDI     I,      I,     -1
+    PTR_ADD      P1,     P1,    T1
+    blt          ZERO,   I,     .L_M1_N8
+.L_M1_N7:
+    andi       I,      N,     0x04
+    beq        ZERO,   I,     .L_M1_N3
+.L_M1_N4:
+    GLD v, , $vr0, S1, 0x00
+    GST v, , $vr0, P2, 0x00
+    PTR_ADDI     S1,     S1,    0x10
+    PTR_ADDI     P2,     P2,    0x10
+.L_M1_N3:
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_M1_N1
+.L_M1_N2:
+    GLD f, d, $f0, S1, 0x00
+    GST f, d, $f0, P3, 0x00
+    PTR_ADDI     S1,     S1,    0x08
+    PTR_ADDI     P3,     P3,    0x08
+.L_M1_N1:
+    andi       I,      N,     0x01
+    beq        ZERO,   I,     .L_M0
+
+    GLD f, s, $f0, S1, 0x00
+    GST f, s, $f0, P4, 0x00
+    PTR_ADDI     S1,     S1,    0x04
+    PTR_ADDI     P4,     P4,    0x04
+.L_M0:
+    pop_if_used 23, 8 // mirrors the push_if_used 23, 8 at entry
+    jirl       $r0,    $r1,   0x00 // return to the address in $r1
+    EPILOGUE
diff --git a/kernel/loongarch64/sgemv_n_8_lasx.S b/kernel/loongarch64/sgemv_n_8_lasx.S
new file mode 100644
index 000000000..52ffc320e
--- /dev/null
+++ b/kernel/loongarch64/sgemv_n_8_lasx.S
@@ -0,0 +1,463 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2023/08/30 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*
+*
+*********************************************************************/
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha,
+ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+ */
+#define M       $r4  /* m */
+#define N       $r5  /* n */
+#define ALPHA   $f0  /* alpha; broadcast into VALPHA from $xr0 at entry */
+#define A       $r7  /* a */
+#define LDA     $r8  /* lda; shifted left by 2 at entry */
+#define X       $r9  /* x */
+#define INC_X   $r10 /* inc_x; shifted left by 2 at entry */
+#define Y       $r11 /* y */
+#define INC_Y   $r6  /* inc_y; loaded from 0($sp) at entry, then shifted left by 2 */
+
+#define J       $r12 /* counter over N (column groups) */
+#define I       $r13 /* counter over M (rows) */
+#define K       $r14 /* scratch / running row count */
+#define Y_ORG   $r15 /* copy of the incoming Y pointer, restored at the start of each column group */
+#define OFFSET  $r16
+#define K_LDA   $r17 /* byte step applied to the PA pointers after each column group */
+#define M4      $r18 /* M << 2 */
+#define T0      $r19 /* scratch pointer */
+#define PA0     $r20 /* PA0..PA7: pointers into eight consecutive columns of A, LDA apart */
+#define PA1     $r23
+#define PA2     $r24
+#define PA3     $r25
+#define PA4     $r26
+#define PA5     $r27
+#define PA6     $r28
+#define PA7     $r29
+
+#define VALPHA  $xr1 /* alpha replicated across all 32-bit lanes */
+#define X0      $xr2 /* X0..X7: one replicated, alpha-scaled x element each */
+#define X1      $xr3
+#define X2      $xr4
+#define X3      $xr5
+#define X4      $xr6
+#define X5      $xr7
+#define X6      $xr8
+#define X7      $xr9
+#define Y0      $xr10 /* y accumulator */
+#define A0      $xr11 /* A0..A7: data loaded through PA0..PA7 */
+#define A1      $xr12
+#define A2      $xr13
+#define A3      $xr14
+#define A4      $xr15
+#define A5      $xr16
+#define A6      $xr17
+#define A7      $xr18
+
+#define X0_F    $f2  /* *_F names: the $f register with the same number as the $xr alias above */
+#define X1_F    $f3
+#define X2_F    $f4
+#define X3_F    $f5
+#define X4_F    $f6
+#define X5_F    $f7
+#define X6_F    $f8
+#define X7_F    $f9
+#define Y0_F    $f10
+#define A0_F    $f11
+#define A1_F    $f12
+#define A2_F    $f13
+#define A3_F    $f14
+#define A4_F    $f15
+#define A5_F    $f16
+#define A6_F    $f17
+#define A7_F    $f18
+
+.macro SLOAD_X_8 /* Unit-stride x: fill X0..X7 from byte offsets 0x00..0x1C of X, then multiply each by VALPHA (GLDREPL/GMUL are helper macros; presumably replicate-load and multiply) */
+    GLDREPL xv, w, X0, X, 0x00, X1, X, 0x04, X2, X, 0x08, X3, X, 0x0C, \
+                   X4, X, 0x10, X5, X, 0x14, X6, X, 0x18, X7, X, 0x1C
+    GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \
+                 X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA
+.endm
+
+.macro SLOAD_X_8_GAP /* Strided x: replicate-load eight x elements INC_X bytes apart into X0..X7, then multiply by VALPHA; clobbers T0 */
+    xvldrepl.w  X0,     X,      0x00
+    PTR_ADD     T0,     X,      INC_X /* T0 walks the strided elements; X itself is left unchanged */
+    xvldrepl.w  X1,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.w  X2,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.w  X3,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.w  X4,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.w  X5,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.w  X6,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.w  X7,     T0,     0x00
+    GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \
+                 X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA
+.endm
+
+.macro SLOAD_X_4 /* Unit-stride x: fill X0..X3 from byte offsets 0x00..0x0C of X, then multiply each by VALPHA */
+    GLDREPL xv, w, X0, X, 0x00, X1, X, 0x04, X2, X, 0x08, X3, X, 0x0C
+    GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA
+.endm
+
+.macro SLOAD_X_4_GAP /* Strided x: replicate-load four x elements INC_X bytes apart into X0..X3, then multiply by VALPHA; clobbers T0 */
+    xvldrepl.w  X0,     X,      0x00
+    PTR_ADD     T0,     X,      INC_X /* T0 walks the strided elements; X itself is left unchanged */
+    xvldrepl.w  X1,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.w  X2,     T0,     0x00
+    PTR_ADD     T0,     T0,     INC_X
+    xvldrepl.w  X3,     T0,     0x00
+    GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA
+.endm
+
+.macro SLOAD_X_2 /* Unit-stride x: fill X0, X1 from byte offsets 0x00 and 0x04 of X, then multiply each by VALPHA */
+    GLDREPL xv, w, X0, X, 0x00, X1, X, 0x04
+    GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA
+.endm
+
+.macro SLOAD_X_2_GAP /* Strided x: replicate-load x[0] and the element INC_X bytes later into X0, X1, then multiply by VALPHA; clobbers T0 */
+    xvldrepl.w  X0,     X,      0x00
+    PTR_ADD     T0,     X,      INC_X
+    xvldrepl.w  X1,     T0,     0x00
+    GMUL xvf, s, X0, X0, VALPHA, X1, X1, VALPHA
+.endm
+
+.macro SLOAD_X_1 /* Single x element at offset 0 of X into X0, multiplied by VALPHA; used for both unit and non-unit inc_x */
+    GLDREPL xv, w, X0, X, 0x00
+    GMUL xvf, s, X0, X0, VALPHA
+.endm
+
+.macro SLOAD_Y_8 /* Unit-stride y: one vector access at offset 0 of Y into Y0 (GLD is a helper macro; presumably a load) */
+    GLD xv, , Y0, Y, 0
+.endm
+
+.macro SLOAD_Y_8_GAP /* Strided y: gather eight y elements INC_Y bytes apart into Y0; uses A0_F..A6_F and T0 as scratch */
+    fld.s   Y0_F,   Y,  0
+    fldx.s  A0_F,   Y,  INC_Y
+    PTR_ALSL  T0,   INC_Y,  Y,  1 /* T0 = Y + 2 * INC_Y */
+    fld.s   A1_F,   T0, 0
+    fldx.s  A2_F,   T0, INC_Y
+    PTR_ALSL  T0,   INC_Y,  Y,  2 /* T0 = Y + 4 * INC_Y */
+    fld.s   A3_F,   T0, 0
+    fldx.s  A4_F,   T0, INC_Y
+    PTR_ADD   T0,   T0, INC_Y
+    PTR_ADD   T0,   T0, INC_Y /* T0 = Y + 6 * INC_Y */
+    fld.s   A5_F,   T0, 0
+    fldx.s  A6_F,   T0, INC_Y
+    GINSVE0 xv, w, Y0, A0, 1, Y0, A1, 2, Y0, A2, 3, Y0, A3, 4, \
+                   Y0, A4, 5, Y0, A5, 6, Y0, A6, 7
+.endm
+
+.macro SLOAD_Y_1 /* Single y element at offset 0 of Y into Y0_F */
+    GLD f, s, Y0_F, Y, 0
+.endm
+
+.macro SGEMV_N_8x8 /* 8 rows x 8 columns step: A0..A7 via PA0..PA7 with increment 0x20, then Y0 accumulates A0*X0 .. A7*X7 (GLD_INC/GMADD are helper macros; presumably load-and-advance and fused multiply-add) */
+    GLD_INC xv, , 0x20,             \
+            A0, PA0, 0, A1, PA1, 0, \
+            A2, PA2, 0, A3, PA3, 0, \
+            A4, PA4, 0, A5, PA5, 0, \
+            A6, PA6, 0, A7, PA7, 0
+    GMADD xvf, s, Y0, A0, X0, Y0, Y0, A1, X1, Y0, \
+                  Y0, A2, X2, Y0, Y0, A3, X3, Y0, \
+                  Y0, A4, X4, Y0, Y0, A5, X5, Y0, \
+                  Y0, A6, X6, Y0, Y0, A7, X7, Y0
+.endm
+
+.macro SGEMV_N_1x8 /* 1 row x 8 columns step: scalar form of SGEMV_N_8x8 using the *_F registers and increment 0x04 */
+    GLD_INC f, s, 0x04,                 \
+            A0_F, PA0, 0, A1_F, PA1, 0, \
+            A2_F, PA2, 0, A3_F, PA3, 0, \
+            A4_F, PA4, 0, A5_F, PA5, 0, \
+            A6_F, PA6, 0, A7_F, PA7, 0
+    GMADD f, s, Y0_F, A0_F, X0_F, Y0_F, Y0_F, A1_F, X1_F, Y0_F, \
+                Y0_F, A2_F, X2_F, Y0_F, Y0_F, A3_F, X3_F, Y0_F, \
+                Y0_F, A4_F, X4_F, Y0_F, Y0_F, A5_F, X5_F, Y0_F, \
+                Y0_F, A6_F, X6_F, Y0_F, Y0_F, A7_F, X7_F, Y0_F
+.endm
+
+.macro SGEMV_N_8x4 /* 8 rows x 4 columns step: A0..A3 via PA0..PA3 with increment 0x20, then Y0 accumulates A0*X0 .. A3*X3 (helper-macro semantics presumed) */
+    GLD_INC xv, , 0x20,             \
+            A0, PA0, 0, A1, PA1, 0, \
+            A2, PA2, 0, A3, PA3, 0
+    GMADD xvf, s, Y0, A0, X0, Y0, Y0, A1, X1, Y0, \
+                  Y0, A2, X2, Y0, Y0, A3, X3, Y0
+.endm
+
+.macro SGEMV_N_1x4 /* 1 row x 4 columns step: scalar form of SGEMV_N_8x4 using the *_F registers and increment 0x04 */
+    GLD_INC f, s, 0x04,                 \
+            A0_F, PA0, 0, A1_F, PA1, 0, \
+            A2_F, PA2, 0, A3_F, PA3, 0
+    GMADD f, s, Y0_F, A0_F, X0_F, Y0_F, Y0_F, A1_F, X1_F, Y0_F, \
+                Y0_F, A2_F, X2_F, Y0_F, Y0_F, A3_F, X3_F, Y0_F
+.endm
+
+.macro SGEMV_N_8x2 /* 8 rows x 2 columns step: A0, A1 via PA0, PA1 with increment 0x20, then Y0 accumulates A0*X0 and A1*X1 (helper-macro semantics presumed) */
+    GLD_INC xv, , 0x20,             \
+            A0, PA0, 0, A1, PA1, 0
+    GMADD xvf, s, Y0, A0, X0, Y0, Y0, A1, X1, Y0
+.endm
+
+.macro SGEMV_N_1x2 /* 1 row x 2 columns step: scalar form of SGEMV_N_8x2 using the *_F registers and increment 0x04 */
+    GLD_INC f, s, 0x04,             \
+            A0_F, PA0, 0, A1_F, PA1, 0
+    GMADD f, s, Y0_F, A0_F, X0_F, Y0_F, Y0_F, A1_F, X1_F, Y0_F
+.endm
+
+.macro SGEMV_N_1x1 /* 1 row x 1 column step: A0_F via PA0 with increment 0x04, then Y0_F accumulates A0_F*X0_F (helper-macro semantics presumed) */
+    GLD_INC f, s, 0x04, A0_F, PA0, 0
+    GMADD f, s, Y0_F, A0_F, X0_F, Y0_F
+.endm
+
+.macro SSTORE_Y_8 /* Unit-stride y: one vector access at offset 0 of Y from Y0 (GST is a helper macro; presumably a store) */
+    GST xv, , Y0, Y, 0
+.endm
+
+.macro SSTORE_Y_8_GAP /* Strided y: store the eight 32-bit lanes of Y0 to addresses INC_Y bytes apart starting at Y; clobbers T0 */
+    xvstelm.w   Y0,     Y,      0,      0
+    PTR_ADD     T0,     Y,      INC_Y /* T0 walks the strided destinations; Y itself is left unchanged */
+    xvstelm.w   Y0,     T0,     0,      1
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.w   Y0,     T0,     0,      2
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.w   Y0,     T0,     0,      3
+
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.w   Y0,     T0,     0,      4
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.w   Y0,     T0,     0,      5
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.w   Y0,     T0,     0,      6
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.w   Y0,     T0,     0,      7
+.endm
+
+.macro SSTORE_Y_1
+    GST f, s, Y0_F, Y, 0
+.endm
+
+.macro SGEMV_N_LASX XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req /* Emits the whole column/row loop nest for one variant: XW makes the labels unique, X_* and Y_* are pasted after SLOAD_ / SSTORE_ to pick the helper macros. Y_4 is accepted but never referenced in this body. */
+    PTR_SRLI  J,      N,      3 /* J = N >> 3: number of 8-column groups */
+    beqz      J,      .L_\XW\()_N_7
+    PTR_SLLI  K_LDA,  LDA,    3 /* K_LDA = 8 * LDA */
+    PTR_SUB   K_LDA,  K_LDA,  M4 /* minus M4; presumably undoes the advance of the PA pointers over one pass of M rows */
+.L_\XW\()_N_L8:
+    SLOAD_\X_8
+    xor     K,      K,      K /* K = 0; K is only incremented in this macro, never read */
+    move    Y,      Y_ORG /* restart at the first y element for this column group */
+    PTR_SRLI  I,      M,       3 /* I = M >> 3: number of 8-row blocks */
+    beqz      I,      .L_\XW\()_M_7
+.align 5
+.L_\XW\()_M_L8:
+    SLOAD_\Y_8
+    SGEMV_N_8x8
+    SSTORE_\Y_8
+    PTR_ADDI    I,      I,      -1
+    PTR_ALSL    Y,      INC_Y,  Y,  3 /* Y += 8 * INC_Y */
+    PTR_ADDI    K,      K,      8
+    bnez        I,      .L_\XW\()_M_L8
+.L_\XW\()_M_7:
+    andi        I,      M,      7 /* remaining rows, handled one at a time */
+    beqz        I,      .L_\XW\()_M_END
+.align 5
+.L_\XW\()_M_L1:
+    SLOAD_\Y_1
+    SGEMV_N_1x8
+    SSTORE_\Y_1
+    PTR_ADDI    I,      I,      -1
+    PTR_ADD     Y,      Y,      INC_Y
+    PTR_ADDI    K,      K,      1
+    bnez        I,      .L_\XW\()_M_L1
+.L_\XW\()_M_END:
+    PTR_ADDI    J,      J,      -1
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#endif
+    PTR_ALSL    X,      INC_X,  X,  3 /* X += 8 * INC_X */
+    bnez        J,      .L_\XW\()_N_L8
+.L_\XW\()_N_7:
+    andi        J,      N,      4 /* 4-column remainder present? */
+    beqz        J,      .L_\XW\()_N_3
+    SLOAD_\X_4
+    xor         K,      K,      K
+    move        Y,      Y_ORG
+
+    PTR_SRLI  I,      M,       3
+    beqz      I,      .L_\XW\()_N_4_M_7
+.align 5
+.L_\XW\()_N_4_M_L8:
+    SLOAD_\Y_8
+    SGEMV_N_8x4
+    SSTORE_\Y_8
+    PTR_ADDI  I,      I,      -1
+    PTR_ADDI  K,      K,      8
+    PTR_ALSL  Y,      INC_Y,  Y,  3
+    bnez    I,      .L_\XW\()_N_4_M_L8
+.L_\XW\()_N_4_M_7:
+    andi        I,      M,      7
+    beqz        I,      .L_\XW\()_N_4_M_END
+.align 5
+.L_\XW\()_N_4_M_L1:
+    SLOAD_\Y_1
+    SGEMV_N_1x4
+    SSTORE_\Y_1
+    PTR_ADDI    I,      I,      -1
+    PTR_ADD     Y,      Y,      INC_Y
+    PTR_ADDI    K,      K,      1
+    bnez        I,      .L_\XW\()_N_4_M_L1
+.L_\XW\()_N_4_M_END:
+    PTR_SLLI    K_LDA,  LDA,    2 /* K_LDA = 4 * LDA - M4 for the 4-column step */
+    PTR_SUB     K_LDA,  K_LDA,  M4
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#endif
+    PTR_ALSL    X,      INC_X,  X,  2 /* X += 4 * INC_X */
+.L_\XW\()_N_3:
+    andi        J,      N,      2 /* 2-column remainder present? */
+    beqz        J,      .L_\XW\()_N_1
+    SLOAD_\X_2
+    xor         K,      K,      K
+    move        Y,      Y_ORG
+    PTR_SRLI    I,      M,       3
+    beqz    I,      .L_\XW\()_N_2_M_7
+.align 5
+.L_\XW\()_N_2_M_L8:
+    SLOAD_\Y_8
+    SGEMV_N_8x2
+    SSTORE_\Y_8
+    PTR_ADDI  I,      I,      -1
+    PTR_ADDI  K,      K,      8
+    PTR_ALSL  Y,      INC_Y,  Y,  3
+    bnez    I,      .L_\XW\()_N_2_M_L8
+.L_\XW\()_N_2_M_7:
+    andi    I,      M,      7
+    beqz    I,      .L_\XW\()_N_2_M_END
+.align 5
+.L_\XW\()_N_2_M_L1:
+    SLOAD_\Y_1
+    SGEMV_N_1x2
+    SSTORE_\Y_1
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   Y,      Y,      INC_Y
+    PTR_ADDI  K,      K,      1
+    bnez    I,      .L_\XW\()_N_2_M_L1
+.L_\XW\()_N_2_M_END:
+    PTR_SLLI    K_LDA,  LDA,    1 /* K_LDA = 2 * LDA - M4 for the 2-column step */
+    PTR_SUB     K_LDA,  K_LDA,  M4
+    PTR_ADD     PA0,    PA0,    K_LDA
+    PTR_ADD     PA1,    PA1,    K_LDA
+    PTR_ALSL    X,      INC_X,  X,  1 /* X += 2 * INC_X */
+.L_\XW\()_N_1:
+    andi    J,      N,      1 /* final single column present? */
+    beqz    J,      .L_END
+    SLOAD_\X_1
+    xor     K,      K,      K
+    move    Y,      Y_ORG
+    move    I,      M /* last column is processed one row at a time for all M rows */
+    beqz    I,      .L_END
+.align 5
+.L_\XW\()_N_1_M_L1:
+    SLOAD_\Y_1
+    SGEMV_N_1x1
+    SSTORE_\Y_1
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   Y,      Y,      INC_Y
+    PTR_ADDI  K,      K,      1
+    bnez    I,      .L_\XW\()_N_1_M_L1
+    b .L_END /* each expansion ends by branching to the shared exit label */
+.endm
+
+    PROLOGUE /* Entry: classifies inc_x / inc_y as unit or non-unit and jumps to one of four SGEMV_N_LASX expansions */
+    PTR_LD     INC_Y,  $sp,    0 /* INC_Y is read from 0($sp) before push_if_used runs */
+    push_if_used 17 + 7, 19
+    PTR_ADDI   K,      $r0,     0x01 /* K = 1 */
+    PTR_SUB    I,      INC_X,   K
+    PTR_SUB    J,      INC_Y,   K
+    maskeqz    I,      K,       I  /* if(inc_x == 1) I = 0; else I = 1; */
+    maskeqz    J,      K,       J  /* if(inc_y == 1) j = 0; else j = 1; */
+    PTR_ALSL   I,      I,       J,      1 /* I = 2 * I + J: index into .L_GAP_TABLE */
+    GSLLI , d, LDA, LDA, 2, INC_X, INC_X, 2, INC_Y, INC_Y, 2, M4, M, 2 /* presumably: LDA, INC_X, INC_Y shifted left by 2 in place, and M4 = M << 2 */
+    xvreplve0.w     VALPHA, $xr0 /* broadcast lane 0 of $xr0 (ALPHA is $f0) to every lane of VALPHA */
+    move     Y_ORG,  Y
+    move     PA0,    A
+#if __loongarch_grlen == 64
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#else
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#endif
+    la.local    T0,     .L_GAP_TABLE
+    PTR_ALSL    I,      I,      T0,     1 /* I = table base + 2 * index (entries are 16-bit) */
+    ld.h        K,      I,      0 /* K = signed 16-bit offset of the selected variant from the table base */
+    PTR_ADD     T0,     T0,     K
+    jirl        $r0,    T0,     0 /* jump to the selected variant without linking */
+.L_GAP_TABLE:
+    .hword  .L_GAP_0_0 - .L_GAP_TABLE /* index 0 */
+    .hword  .L_GAP_0_1 - .L_GAP_TABLE /* index 1 */
+    .hword  .L_GAP_1_0 - .L_GAP_TABLE /* index 2 */
+    .hword  .L_GAP_1_1 - .L_GAP_TABLE /* index 3 */
+.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */
+    SGEMV_N_LASX GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1
+.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */
+    SGEMV_N_LASX GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1
+.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */
+    SGEMV_N_LASX GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1
+.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
+    SGEMV_N_LASX GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1
+.L_END:
+    pop_if_used 17 + 7, 19 /* mirrors the push_if_used at entry */
+    jirl    $r0, $r1, 0x0 /* return to the address in $r1 */
+    EPILOGUE
diff --git a/kernel/loongarch64/sgemv_n_lsx.S b/kernel/loongarch64/sgemv_n_lsx.S
new file mode 100644
index 000000000..3b1b519fc
--- /dev/null
+++ b/kernel/loongarch64/sgemv_n_lsx.S
@@ -0,0 +1,227 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Param: register names used by the kernel below */
+#define M      $r4    /* trip count of the inner loops (elements of y touched per column) */
+#define N      $r5    /* trip count of the outer loop (columns of A / elements of x) */
+#define A      $r7
+#define LDA    $r8    /* scaled to bytes with BASE_SHIFT in the prologue */
+#define X      $r9
+#define INCX   $r10   /* scaled to bytes with BASE_SHIFT in the prologue */
+#define Y      $r11
+#define INCY   $r6    /* loaded from $sp+0 in the prologue, then scaled to bytes */
+#define BUFFER $r16   /* loaded from $sp+8 in the prologue; not referenced again in this file */
+#define ALPHA  $f0
+
+#define YORIG  $r18   /* not referenced in this file */
+#define T0     $r19
+#define T1     $r20
+#define XX     $r12   /* copy of X */
+#define YY     $r13   /* copy of Y */
+#define I      $r14
+#define J      $r15
+#define AO1    $r23   /* pointer to the current column of A */
+#define AO2    $r24   /* not referenced in this file (still saved/restored) */
+#define IX     $r25   /* byte offset into x */
+#define IY     $r26   /* byte offset into y */
+#define II     $r27   /* byte offset into the current column of A */
+#define T2     $r28
+#define T3     $r29
+#define T4     $r30
+
+/* LSX vectors; the kernel relies on $fN being lane 0 of $vrN (see a1..a4 below) */
+#define U0     $vr11
+#define U1     $vr12  /* the kernel also reads its lane 0 as the literal $f12 */
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7   /* not referenced in this file */
+#define U8     $vr8   /* not referenced in this file */
+#define U9     $vr9   /* not referenced in this file */
+#define VALPHA $vr10
+
+#define a1     $f3    /* same register number as U3 */
+#define a2     $f4    /* same register number as U4 */
+#define a3     $f5    /* same register number as U5 */
+#define a4     $f6    /* same register number as U6 */
+#define a5     $f7    /* not referenced in this file */
+#define a6     $f8    /* not referenced in this file */
+#define a7     $f9    /* not referenced in this file */
+#define a8     $f10   /* not referenced in this file; same register number as VALPHA */
+
+
+    PROLOGUE
+    /* Single-precision y += alpha * A * x: one column of A per outer iteration, scaled by alpha * x[j]. */
+    LDARG  INCY,    $sp,  0
+    LDARG  BUFFER,  $sp,  8
+
+    addi.d  $sp, $sp, -80
+    /* Spill the callee-saved GPRs this kernel uses, plus alpha so it can be broadcast from memory. */
+    SDARG  $r23,  $sp,    0
+    SDARG  $r24,  $sp,    8
+    SDARG  $r25,  $sp,    16
+    SDARG  $r26,  $sp,    32
+    SDARG  $r27,  $sp,    40
+    SDARG  $r28,  $sp,    48
+    SDARG  $r29,  $sp,    56
+    SDARG  $r30,  $sp,    64
+    ST     ALPHA, $sp,    72
+
+    vldrepl.w  VALPHA, $sp, 72      /* VALPHA = alpha in every lane */
+    /* Convert element strides to byte strides. */
+    slli.d     LDA,     LDA,  BASE_SHIFT
+    slli.d     INCX,    INCX, BASE_SHIFT
+    slli.d     INCY,    INCY, BASE_SHIFT
+
+    bge    $r0,    M,    .L999      /* nothing to do when M <= 0 */
+    bge    $r0,    N,    .L999      /* nothing to do when N <= 0 */
+
+    move   J,      $r0
+    move   IX,     $r0
+
+    move   AO1,    A  //a_ptr
+    move   XX,     X
+    move   YY,     Y
+
+    beq    J,      M,     .L999     /* J is 0 and M > 0 here, so this branch is never taken */
+
+.L01:                               /* outer loop over columns, J = 0 .. N-1 */
+    fldx.s      $f11,  XX,    IX    /* scalar load of x[j] into lane 0 of U0; a 16-byte vldx here would read past the end of x */
+    vpermi.w    U0,    U0,    0x00  /* broadcast lane 0 to all four lanes */
+
+    vfmul.s     U1,    VALPHA,  U0  //temp1
+
+    move        IY,    $r0
+    move        II,    $r0
+    move        I,     $r0
+
+    srai.d      T0,    M,     2  //m/4
+    beq         I,     T0,    .L03
+
+.L02:                               /* four rows per iteration */
+    vldx        U2,    AO1,    II   /* four consecutive floats of the current column */
+    /* Byte offsets of the four y elements (stride INCY). */
+    move        T1,    IY
+    add.d       T2,    T1,    INCY
+    add.d       T3,    T2,    INCY
+    add.d       T4,    T3,    INCY
+
+    fldx.s      a1,    YY,    T1
+    fldx.s      a2,    YY,    T2
+    fldx.s      a3,    YY,    T3
+    fldx.s      a4,    YY,    T4
+    /* Pack the four scalars (lane 0 of U3..U6) into lanes 0..3 of U3. */
+    vextrins.w  U3,    U4,    0x10
+    vextrins.w  U3,    U5,    0x20
+    vextrins.w  U3,    U6,    0x30
+
+    vfmadd.s    U3,    U1,    U2,    U3   /* y += (alpha * x[j]) * A[i..i+3][j] */
+    /* Unpack lanes 1..3 of U3 back into lane 0 of U4..U6 so they can be stored as scalars. */
+    vextrins.w  U4,    U3,    0x01
+    vextrins.w  U5,    U3,    0x02
+    vextrins.w  U6,    U3,    0x03
+
+    fstx.s     a1,     YY,    T1
+    fstx.s     a2,     YY,    T2
+    fstx.s     a3,     YY,    T3
+    fstx.s     a4,     YY,    T4
+
+    add.d      IY,     T4,    INCY
+    addi.d     II,     II,    16
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L02
+
+.L03:                               /* two-row remainder, taken when bit 1 of M is set */
+    andi       T0,     M,     2
+    beq        $r0,    T0,    .L04
+    /* II = (M - M % 4) * sizeof(float): byte offset of the first remainder row. */
+    addi.d     T1,     $r0,   4
+    mod.d      T1,     M,     T1
+    sub.d      II,     M,     T1
+    slli.d     II,     II,    BASE_SHIFT
+
+    move       T1,    IY
+    add.d      T2,    T1,    INCY
+
+    fldx.s     a1,     AO1,   II
+    addi.d     T0,     II,    4
+    fldx.s     a2,     AO1,   T0
+
+    fldx.s     a3,    YY,    T1
+    fldx.s     a4,    YY,    T2
+    /* $f12 is lane 0 of U1, i.e. alpha * x[j]. */
+    fmadd.s    a3,    $f12,  a1,    a3
+    fmadd.s    a4,    $f12,  a2,    a4
+
+    fstx.s     a3,     YY,    T1
+    fstx.s     a4,     YY,    T2
+
+    add.d      IY,     T2,    INCY
+
+.L04:                               /* one-row remainder, taken when M is odd */
+    andi       T0,     M,     1
+    beq        $r0,    T0,    .L05
+
+    addi.d     II,    M,     -1     /* last row index */
+    slli.d     II,    II,    BASE_SHIFT
+
+    fldx.s     a1,    AO1,   II
+    fldx.s     a3,    YY,    IY
+
+    fmadd.s    a3,    $f12,  a1,    a3
+
+    fstx.s     a3,    YY,    IY
+
+    add.d      IY,    IY,    INCY
+
+.L05:                               /* advance to the next column of A and the next element of x */
+    add.d      AO1,   AO1,   LDA
+    add.d      IX,    IX,    INCX
+
+    addi.d     J,     J,     1
+    blt        J,     N,    .L01
+
+.L999:                              /* restore saved registers and return */
+    LDARG  $r23,  $sp,    0
+    LDARG  $r24,  $sp,    8
+    LDARG  $r25,  $sp,    16
+    LDARG  $r26,  $sp,    32
+    LDARG  $r27,  $sp,    40
+    LDARG  $r28,  $sp,    48
+    LDARG  $r29,  $sp,    56
+    LDARG  $r30,  $sp,    64
+    LD     ALPHA, $sp,    72
+    addi.d  $sp, $sp, 80
+    jirl    $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/sgemv_t_8_lasx.S b/kernel/loongarch64/sgemv_t_8_lasx.S
new file mode 100644
index 000000000..f4bfffb42
--- /dev/null
+++ b/kernel/loongarch64/sgemv_t_8_lasx.S
@@ -0,0 +1,405 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2023/08/30 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*
+*
+*********************************************************************/
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha,
+ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+ */
+#define M       $r4
+#define N       $r5
+#define ALPHA   $f0
+#define A       $r7
+#define LDA     $r8
+#define X       $r9
+#define INC_X   $r10
+#define Y       $r11
+#define INC_Y   $r6     /* loaded from $sp+0 at kernel entry */
+
+#define J       $r12
+#define I       $r13
+#define K       $r14    /* shares $r14 with PY0 */
+#define PY0     $r14    /* shares $r14 with K */
+#define X_ORG   $r15    /* copy of X taken at entry; X is reset from it for every column group */
+#define PY1     $r16
+#define K_LDA   $r17    /* per-column-group pointer adjustment: (columns * LDA) - M4 */
+#define PY2     $r18
+#define T0      $r19
+#define PA0     $r20    /* PA0..PA7: pointers into up to eight consecutive columns of A */
+#define PA1     $r23
+#define PA2     $r24
+#define PA3     $r25
+#define PA4     $r26
+#define PA5     $r27
+#define PA6     $r28
+#define PA7     $r29
+#define M4      $r30    /* M shifted left by 2 at entry (M * 4 bytes) */
+
+#define VALPHA  $xr0    /* same register number as ALPHA ($f0) */
+#define X0      $xr1
+#define A0      $xr2    /* A0..A7 share registers with Y0..Y7 below */
+#define A1      $xr3
+#define A2      $xr4
+#define A3      $xr5
+#define A4      $xr6
+#define A5      $xr7
+#define A6      $xr8
+#define A7      $xr9
+#define TP0     $xr10   /* TP0..TP7: per-column vector accumulators, cleared by the ZERO_Y* macros */
+#define TP1     $xr11
+#define TP2     $xr12
+#define TP3     $xr13
+#define TP4     $xr14
+#define TP5     $xr15
+#define TP6     $xr16
+#define TP7     $xr17
+#define Y0      $xr2    /* Y0..Y7 reuse the A0..A7 registers; their lane 0 is read as $f2..$f9 */
+#define Y1      $xr3
+#define Y2      $xr4
+#define Y3      $xr5
+#define Y4      $xr6
+#define Y5      $xr7
+#define Y6      $xr8
+#define Y7      $xr9
+
+.macro ZERO_Y8 /* clear accumulators TP0..TP7; GXOR is defined outside this file, presumably xvxor.v per operand triple */
+    GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \
+                TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7
+.endm
+
+.macro ZERO_Y4 /* clear accumulators TP0..TP3 */
+    GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3
+.endm
+
+.macro ZERO_Y2 /* clear accumulators TP0..TP1 */
+    GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1
+.endm
+
+.macro ZERO_Y1 /* clear TP0 ($xr10); the single-column path then accumulates in $f10 */
+    GXOR xv, v, TP0, TP0, TP0
+.endm
+
+.macro SLOAD_X8 /* contiguous x: load eight floats at X into X0 (GLD is defined outside this file) */
+    GLD xv, , X0, X, 0x00
+.endm
+
+.macro SLOAD_X8_GAP /* strided x: gather eight floats spaced INC_X bytes apart into X0 ($xr1) */
+    fld.s       $f1,    X,    0x00      /* element 0 lands directly in lane 0 of X0 */
+    fldx.s      $f2,    X,    INC_X     /* elements 1..3 go to $f2..$f4, i.e. lane 0 of A0..A2 */
+    PTR_ALSL    T0,     INC_X,      X,      1   /* presumably T0 = X + 2 * INC_X */
+    fld.s       $f3,    T0,   0x00
+    fldx.s      $f4,    T0,   INC_X
+    GINSVE0 xv, w, X0, A0, 1, X0, A1, 2, X0, A2, 3   /* presumably inserts lane 0 of each source into X0 lanes 1..3 */
+    PTR_ALSL    T0,     INC_X,      X,      2   /* presumably T0 = X + 4 * INC_X */
+    fld.s       $f2,    T0,   0x00      /* elements 4..7 go to $f2..$f5, i.e. lane 0 of A0..A3 */
+    fldx.s      $f3,    T0,   INC_X
+    PTR_ALSL    T0,     INC_X,      T0,     1   /* presumably T0 = X + 6 * INC_X */
+    fld.s       $f4,    T0,   0x00
+    fldx.s      $f5,    T0,   INC_X
+    GINSVE0 xv, w, X0, A0, 4, X0, A1, 5, X0, A2, 6, X0, A3, 7   /* fill X0 lanes 4..7 */
+.endm
+
+.macro SGEMV_T_8x8 /* eight columns x eight rows: TPk += Ak * X0; GLD_INC presumably loads then advances each PAk by 0x20 bytes */
+    GLD_INC xv, , 0x20,             \
+            A0, PA0, 0, A1, PA1, 0, \
+            A2, PA2, 0, A3, PA3, 0, \
+            A4, PA4, 0, A5, PA5, 0, \
+            A6, PA6, 0, A7, PA7, 0
+    GMADD xvf, s, TP0, A0, X0, TP0, TP1, A1, X0, TP1, \
+                  TP2, A2, X0, TP2, TP3, A3, X0, TP3, \
+                  TP4, A4, X0, TP4, TP5, A5, X0, TP5, \
+                  TP6, A6, X0, TP6, TP7, A7, X0, TP7
+.endm
+
+.macro SGEMV_T_4x8 /* four columns x eight rows: same pattern using PA0..PA3 and TP0..TP3 */
+    GLD_INC xv, , 0x20,             \
+            A0, PA0, 0, A1, PA1, 0, \
+            A2, PA2, 0, A3, PA3, 0
+    GMADD xvf, s, TP0, A0, X0, TP0, TP1, A1, X0, TP1, \
+                  TP2, A2, X0, TP2, TP3, A3, X0, TP3
+.endm
+
+.macro SGEMV_T_2x8 /* two columns x eight rows: same pattern using PA0..PA1 and TP0..TP1 */
+    GLD_INC xv, , 0x20,             \
+            A0, PA0, 0, A1, PA1, 0
+    GMADD xvf, s, TP0, A0, X0, TP0, TP1, A1, X0, TP1
+.endm
+
+.macro SGEMV_T_LASX XW:req X8:req, X4:req /* XW: label infix; X8: suffix selecting the SLOAD_ macro; X4 is not referenced in the body */
+    PTR_SRLI  J,      N,      3         /* J = number of full 8-column groups */
+    beqz      J,      .L_\XW\()_N_7
+    PTR_SLLI  K_LDA,  LDA,    3         /* K_LDA = 8 * LDA - M4: rewinds the rows walked and steps 8 columns */
+    PTR_SUB   K_LDA,  K_LDA,  M4
+.L_\XW\()_N_L8:
+    ZERO_Y8
+    move      X,      X_ORG             /* restart x for this column group */
+    PTR_SRLI  I,      M,       3        /* I = number of full 8-row chunks */
+    beqz      I,      .L_\XW\()_M_7
+.align 5
+.L_\XW\()_M_L8:
+    SLOAD_\X8
+    SGEMV_T_8x8
+    PTR_ADDI    I,  I,  -1
+    PTR_ALSL    X,  INC_X,  X,  3       /* presumably X += 8 * INC_X */
+    bnez        I,  .L_\XW\()_M_L8
+.L_\XW\()_M_7:
+    // Accumulated
+    GACC xvf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3, Y4, TP4, \
+                 Y5, TP5, Y6, TP6, Y7, TP7
+    andi        I,      M,      7       /* scalar tail over the remaining M % 8 rows */
+    beqz        I,      .L_\XW\()_M_END
+.align 5
+.L_\XW\()_M_L1:
+    fld.s   $f1,    X,      0x00        /* one x element, then one element from each of the eight columns */
+    fld.s   $f10,   PA0,    0x00
+    fld.s   $f11,   PA1,    0x00
+    fld.s   $f12,   PA2,    0x00
+    fld.s   $f13,   PA3,    0x00
+    fld.s   $f14,   PA4,    0x00
+    fld.s   $f15,   PA5,    0x00
+    fld.s   $f16,   PA6,    0x00
+    fld.s   $f17,   PA7,    0x00
+#if __loongarch_grlen == 64
+    GADDI , d, PA0, PA0, 0x04, PA1, PA1, 0x04, PA2, PA2, 0x04, PA3, PA3, 0x04, \
+               PA4, PA4, 0x04, PA5, PA5, 0x04, PA6, PA6, 0x04, PA7, PA7, 0x04
+#elif __loongarch_grlen == 32
+    GADDI , w, PA0, PA0, 0x04, PA1, PA1, 0x04, PA2, PA2, 0x04, PA3, PA3, 0x04, \
+               PA4, PA4, 0x04, PA5, PA5, 0x04, PA6, PA6, 0x04, PA7, PA7, 0x04
+#else
+    GADDI , d, PA0, PA0, 0x04, PA1, PA1, 0x04, PA2, PA2, 0x04, PA3, PA3, 0x04, \
+               PA4, PA4, 0x04, PA5, PA5, 0x04, PA6, PA6, 0x04, PA7, PA7, 0x04
+#endif
+    GMADD f, s, $f2, $f10, $f1, $f2, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, \
+                $f6, $f14, $f1, $f6, $f7, $f15, $f1, $f7, $f8, $f16, $f1, $f8, $f9, $f17, $f1, $f9,
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    bnez      I,      .L_\XW\()_M_L1
+.L_\XW\()_M_END:
+    fld.s   $f10,   Y,  0x00            /* load the eight y elements; PY0 overwrites K, which shares $r14 */
+    fldx.s  $f11,   Y,  INC_Y
+    PTR_ALSL    PY0, INC_Y,  Y,  1
+    fld.s   $f12,   PY0,    0x00
+    fldx.s  $f13,   PY0,    INC_Y
+    PTR_ALSL    PY1, INC_Y,  Y,  2
+    fld.s   $f14,   PY1,    0x00
+    fldx.s  $f15,   PY1,    INC_Y
+    PTR_ALSL    PY2, INC_Y,  PY1, 1
+    fld.s   $f16,   PY2,    0x00
+    fldx.s  $f17,   PY2,    INC_Y
+    /* y[k] += alpha * (accumulated sum for column k), with the sums held in $f2..$f9 */
+    GMADD f, s, $f10, ALPHA, $f2, $f10, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, \
+                $f14, ALPHA, $f6, $f14, $f15, ALPHA, $f7, $f15, $f16, ALPHA, $f8, $f16, $f17, ALPHA, $f9, $f17
+    /* move every column pointer on to the next group of eight columns */
+    PTR_ADDI    J,      J,      -1
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
+              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
+#endif
+    fst.s   $f10,   Y,      0x00
+    fstx.s  $f11,   Y,      INC_Y
+    fst.s   $f12,   PY0,    0x00
+    fstx.s  $f13,   PY0,    INC_Y
+    fst.s   $f14,   PY1,    0x00
+    fstx.s  $f15,   PY1,    INC_Y
+    fst.s   $f16,   PY2,    0x00
+    fstx.s  $f17,   PY2,    INC_Y
+    /* presumably Y += 8 * INC_Y */
+    PTR_ALSL    Y,      INC_Y,  Y,  3
+    bnez        J,      .L_\XW\()_N_L8
+.L_\XW\()_N_7:
+    andi        J,      N,      4       /* a group of four columns when bit 2 of N is set */
+    beqz        J,      .L_\XW\()_N_3
+    ZERO_Y4
+    move        X,      X_ORG
+    PTR_SRLI    I,      M,       3
+    beqz        I,      .L_\XW\()_N_4_M_7
+.align 5
+.L_\XW\()_N_4_M_L8:
+    SLOAD_\X8
+    SGEMV_T_4x8
+    PTR_ADDI  I,      I,      -1
+    PTR_ALSL  X,      INC_X,  X,  3
+    bnez      I,      .L_\XW\()_N_4_M_L8
+.L_\XW\()_N_4_M_7:
+    // Accumulated
+    GACC xvf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3
+    andi        I,      M,      7
+    beqz        I,      .L_\XW\()_N_4_M_END
+.align 5
+.L_\XW\()_N_4_M_L1:
+    fld.s   $f1,    X,      0x00
+    GLD_INC f, s, 0x04, $f10, PA0, 0x00, $f11, PA1, 0x00, $f12, PA2, 0x00, $f13, PA3, 0x00
+    GMADD f, s, $f2, $f10, $f1, $f2, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    bnez      I,      .L_\XW\()_N_4_M_L1
+.L_\XW\()_N_4_M_END:
+    fld.s   $f10,   Y,  0x00
+    fldx.s  $f11,   Y,  INC_Y
+    PTR_ALSL    PY0, INC_Y,  Y,  1
+    fld.s   $f12,   PY0,    0x00
+    fldx.s  $f13,   PY0,    INC_Y
+    /* y[k] += alpha * sum[k] for the four columns */
+    GMADD f, s, $f10, ALPHA, $f2, $f10, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13
+    /* K_LDA = 4 * LDA - M4 for this four-column step */
+    PTR_SLLI    K_LDA,  LDA,    2
+    PTR_SUB     K_LDA,  K_LDA,  M4
+
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#endif
+    fst.s   $f10,   Y,      0x00
+    fstx.s  $f11,   Y,      INC_Y
+    fst.s   $f12,   PY0,    0x00
+    fstx.s  $f13,   PY0,    INC_Y
+    PTR_ALSL    Y,      INC_Y,  Y,  2
+.L_\XW\()_N_3:
+    andi        J,      N,      2       /* a group of two columns when bit 1 of N is set */
+    beqz        J,      .L_\XW\()_N_1
+    ZERO_Y2
+    move        X,      X_ORG
+    PTR_SRLI    I,      M,       3
+    beqz        I,      .L_\XW\()_N_2_M_7
+.align 5
+.L_\XW\()_N_2_M_L8:
+    SLOAD_\X8
+    SGEMV_T_2x8
+    PTR_ADDI  I,      I,      -1
+    PTR_ALSL  X,      INC_X,  X,  3
+    bnez      I,      .L_\XW\()_N_2_M_L8
+.L_\XW\()_N_2_M_7:
+    // Accumulated
+    GACC xvf, s, Y0, TP0, Y1, TP1
+    andi        I,      M,      7
+    beqz        I,      .L_\XW\()_N_2_M_END
+.align 5
+.L_\XW\()_N_2_M_L1:
+    fld.s   $f1,    X,      0x00
+    GLD_INC f, s, 0x04, $f10, PA0, 0x00, $f11, PA1, 0x00
+    GMADD f, s, $f2, $f10, $f1, $f2, $f3, $f11, $f1, $f3
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    bnez      I,      .L_\XW\()_N_2_M_L1
+.L_\XW\()_N_2_M_END:
+    fld.s   $f10,   Y,  0x00
+    fldx.s  $f11,   Y,  INC_Y
+    /* y[k] += alpha * sum[k] for the two columns */
+    GMADD f, s, $f10, ALPHA, $f2, $f10, $f11, ALPHA, $f3, $f11
+    /* K_LDA = 2 * LDA - M4 for this two-column step */
+    PTR_SLLI    K_LDA,  LDA,    1
+    PTR_SUB     K_LDA,  K_LDA,  M4
+
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA
+#endif
+    fst.s   $f10,   Y,      0x00
+    fstx.s  $f11,   Y,      INC_Y
+    PTR_ALSL        Y,      INC_Y,  Y,  1
+.L_\XW\()_N_1:
+    andi    J,      N,      1           /* final single column when N is odd */
+    beqz    J,      .L_END
+    ZERO_Y1
+    move    X,      X_ORG
+    move    I,      M
+    beqz    I,      .L_END
+.align 5
+.L_\XW\()_N_1_M_L1:
+    fld.s   $f2,    PA0,    0x00        /* fully scalar dot product, accumulated in $f10 */
+    fld.s   $f1,    X,      0x00
+    fmadd.s $f10,   $f2,    $f1,    $f10
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    PTR_ADDI  PA0,    PA0,    0x04
+    bnez      I,      .L_\XW\()_N_1_M_L1
+    /* y[0] += alpha * dot */
+    fld.s     $f2,    Y,      0x00
+    fmadd.s   $f2,    ALPHA,  $f10,  $f2
+    fst.s     $f2,    Y,      0x00
+    b .L_END
+.endm
+
+    PROLOGUE
+    PTR_LD     INC_Y,  $sp,    0        /* inc_y is read from the stack before the frame is adjusted */
+    push_if_used 17 + 8, 18             /* register save helper defined outside this file */
+    PTR_ADDI   K,      $r0,     0x01
+    PTR_SUB    I,      INC_X,   K
+    maskeqz    I,      K,       I  /* if(inc_x == 1) I = 0; else I = 1; */
+    GSLLI , d, LDA, LDA, 2, INC_X, INC_X, 2, INC_Y, INC_Y, 2, M4, M, 2   /* scale strides by 4 (float size); M4 = M * 4 */
+    xvreplve0.w     VALPHA, $xr0        /* broadcast lane 0 of $xr0 (alpha) across VALPHA, which is also $xr0 */
+    move     X_ORG,  X
+    move     PA0,    A
+#if __loongarch_grlen == 64
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#else
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
+              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
+#endif
+    la.local    T0,     .L_GAP_TABLE    /* dispatch through a table of 16-bit offsets relative to .L_GAP_TABLE */
+    PTR_ALSL    I,      I,      T0,     1   /* presumably I = T0 + 2 * I, the address of the selected entry */
+    ld.h        K,      I,      0
+    PTR_ADD     T0,     T0,     K
+    jirl        $r0,    T0,     0
+.L_GAP_TABLE:
+    .hword .L_GAP_0 - .L_GAP_TABLE
+    .hword .L_GAP_1 - .L_GAP_TABLE
+.L_GAP_0: /* if (incx == 1) */
+    SGEMV_T_LASX GAP_0, X8, X4
+.L_GAP_1: /* if (incx != 1) */
+    SGEMV_T_LASX GAP_1, X8_GAP, X4_GAP
+.L_END:
+    pop_if_used 17 + 8, 18              /* mirrors the push_if_used at entry */
+    jirl    $r0, $r1, 0x0
+    EPILOGUE
diff --git a/kernel/loongarch64/sgemv_t_lsx.S b/kernel/loongarch64/sgemv_t_lsx.S
new file mode 100644
index 000000000..b817f0bbf
--- /dev/null
+++ b/kernel/loongarch64/sgemv_t_lsx.S
@@ -0,0 +1,275 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Param: register names used by the kernel below */
+#define M      $r4    /* trip count of the inner loops (length of each dot product) */
+#define N      $r5    /* number of columns of A / elements of y produced */
+#define A      $r7
+#define LDA    $r8    /* scaled to bytes with BASE_SHIFT in the prologue */
+#define X      $r9
+#define INCX   $r10   /* scaled to bytes with BASE_SHIFT in the prologue */
+#define Y      $r11
+#define INCY   $r6    /* loaded from $sp+0 in the prologue, then scaled to bytes */
+#define BUFFER $r16   /* loaded from $sp+8 in the prologue; not referenced again in this file */
+#define ALPHA  $f0
+
+#define YORIG  $r18   /* not referenced in this file */
+#define T0     $r19
+#define T1     $r20
+#define AO3    $r12   /* AO1..AO4: pointers to four consecutive columns of A */
+#define AO4    $r13
+#define I      $r14
+#define J      $r15
+#define AO1    $r23
+#define AO2    $r24
+#define IX     $r25   /* byte offset into x */
+#define IY     $r26   /* byte offset into y */
+#define II     $r27   /* byte offset into the current columns of A */
+#define T2     $r28
+#define T3     $r29
+#define T4     $r30
+
+/* LSX vectors; the kernel relies on $fN being lane 0 of $vrN */
+#define U0     $vr11
+#define U1     $vr12
+#define U2     $vr2   /* its lane 0 is loaded in the kernel as the literal $f2 */
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7   /* not referenced in this file */
+#define U8     $vr8   /* not referenced in this file */
+#define U9     $vr9   /* not referenced in this file */
+#define VALPHA $vr10
+
+#define a1     $f3    /* same register number as U3 */
+#define a2     $f4    /* same register number as U4 */
+#define a3     $f5    /* same register number as U5 */
+#define a4     $f6    /* same register number as U6 */
+#define a5     $f7
+#define a6     $f8    /* not referenced in this file */
+#define a7     $f9    /* not referenced in this file */
+#define a8     $f10   /* not referenced in this file; same register number as VALPHA */
+
+
+    PROLOGUE
+    /* Single-precision y[j] += alpha * dot(column j of A, x), four columns at a time, then 2- and 1-column remainders. */
+    LDARG  INCY,    $sp,  0
+    LDARG  BUFFER,  $sp,  8
+
+    addi.d  $sp, $sp, -80
+    /* Spill the callee-saved GPRs this kernel uses, plus alpha so it can be broadcast from memory. */
+    SDARG  $r23,  $sp,    0
+    SDARG  $r24,  $sp,    8
+    SDARG  $r25,  $sp,    16
+    SDARG  $r26,  $sp,    32
+    SDARG  $r27,  $sp,    40
+    SDARG  $r28,  $sp,    48
+    SDARG  $r29,  $sp,    56
+    SDARG  $r30,  $sp,    64
+    ST     ALPHA, $sp,    72
+
+    vldrepl.w  VALPHA, $sp, 72      /* VALPHA = alpha in every lane */
+    /* Convert element strides to byte strides. */
+    slli.d     LDA,     LDA,  BASE_SHIFT
+    slli.d     INCX,    INCX, BASE_SHIFT
+    slli.d     INCY,    INCY, BASE_SHIFT
+
+    bge    $r0,    M,    .L999      /* nothing to do when M <= 0 */
+    bge    $r0,    N,    .L999      /* nothing to do when N <= 0 */
+
+    move   J,      $r0
+    move   IY,     $r0
+
+    move   AO1,    A  //a_ptr1
+
+    srai.d     T0,     N,     2  //n/4
+    beq        J,      T0,    .L04
+
+.L01: /* j<n/4 */
+    vxor.v    U0,     U0,   U0      /* clear the four per-column accumulators */
+    add.d    AO2,    AO1,  LDA
+    add.d    AO3,    AO2,  LDA
+    add.d    AO4,    AO3,  LDA
+
+    move      IX,     $r0
+    move      I,      $r0
+    move      II,     $r0
+
+    beq       $r0,    M,    .L03    /* M > 0 was checked above, so this branch is never taken */
+
+.L02:  /* i<m */
+    fldx.s      $f12,  X,    IX     /* scalar load of x[i] into lane 0 of U1; a 16-byte vldx here would read past the end of x */
+    fldx.s      $f2,   AO1,   II    /* A[i][j..j+3] into lane 0 of U2..U5 */
+    fldx.s      $f3,   AO2,   II
+    fldx.s      $f4,   AO3,   II
+    fldx.s      $f5,   AO4,   II
+
+    vpermi.w    U1,    U1,    0x00  /* broadcast lane 0 (x[i]) to all four lanes */
+    vextrins.w  U2,    U3,    0x10  /* pack the four column values into lanes 0..3 of U2 */
+    vextrins.w  U2,    U4,    0x20
+    vextrins.w  U2,    U5,    0x30
+
+    vfmadd.s    U0,    U2,    U1,    U0  //temp1,2,3,4
+
+    add.d       IX,    IX,    INCX
+
+    addi.d     II,     II,    4
+    addi.d     I,      I,     1
+    blt        I,      M,    .L02
+
+.L03:                               /* y[j..j+3] += alpha * accumulators */
+    move        T1,    IY
+    add.d       T2,    T1,    INCY
+    add.d       T3,    T2,    INCY
+    add.d       T4,    T3,    INCY
+
+    fldx.s      a1,    Y,    T1
+    fldx.s      a2,    Y,    T2
+    fldx.s      a3,    Y,    T3
+    fldx.s      a4,    Y,    T4
+    /* Pack the four y scalars (lane 0 of U3..U6) into lanes 0..3 of U3. */
+    vextrins.w  U3,    U4,    0x10
+    vextrins.w  U3,    U5,    0x20
+    vextrins.w  U3,    U6,    0x30
+
+    vfmadd.s    U3,    VALPHA,    U0,    U3
+    /* Unpack lanes 1..3 of U3 back into lane 0 of U4..U6 so they can be stored as scalars. */
+    vextrins.w  U4,    U3,    0x01
+    vextrins.w  U5,    U3,    0x02
+    vextrins.w  U6,    U3,    0x03
+
+    fstx.s     a1,     Y,    T1
+    fstx.s     a2,     Y,    T2
+    fstx.s     a3,     Y,    T3
+    fstx.s     a4,     Y,    T4
+
+    slli.d     T1,     LDA,     2   /* step AO1 forward by four columns */
+    add.d      AO1,    AO1,     T1
+    add.d      IY,     T4,      INCY
+
+    addi.d     J,      J,     1
+    blt        J,      T0,    .L01
+
+.L04: /* if(n&2) */
+    andi       T0,     N,     2
+    beq        $r0,    T0,    .L07
+
+    MTC       a1,     $r0           /* clear the two scalar accumulators */
+    MTC       a2,     $r0
+
+    add.d    AO2,    AO1,  LDA
+
+    move      IX,     $r0
+    move      I,      $r0
+    move      II,     $r0
+
+    beq       $r0,    M,    .L06    /* never taken: M > 0 here */
+
+.L05:  /* i<m */
+    fldx.s      a3,    X,     IX
+    fldx.s      a4,    AO1,   II
+    fldx.s      a5,    AO2,   II
+
+    fmadd.s     a1,    a4,    a3,    a1  //temp1
+    fmadd.s     a2,    a5,    a3,    a2  //temp2
+
+    add.d       IX,    IX,    INCX
+
+    addi.d     II,     II,    4
+    addi.d     I,      I,     1
+    blt        I,      M,    .L05
+
+.L06:                               /* y[j..j+1] += alpha * accumulators */
+    move        T1,    IY
+    add.d       T2,    T1,    INCY
+
+    fldx.s      a3,    Y,    T1
+    fldx.s      a4,    Y,    T2
+
+    fmadd.s    a3,    ALPHA,    a1,    a3
+    fmadd.s    a4,    ALPHA,    a2,    a4
+
+    fstx.s     a3,     Y,    T1
+    fstx.s     a4,     Y,    T2
+
+    slli.d     T0,     LDA,     1   /* step AO1 forward by two columns */
+    add.d      AO1,    AO1,     T0
+    add.d      IY,     T2,      INCY
+
+.L07: /* if(n&1) */
+    andi       T0,     N,     1
+    beq        $r0,    T0,    .L999
+
+    MTC       a1,     $r0           /* clear the scalar accumulator */
+
+    move      IX,     $r0
+    move      I,      $r0
+    move      II,      $r0
+
+    beq       $r0,    M,    .L09    /* never taken: M > 0 here */
+
+.L08:  /* i<m */
+    fldx.s      a3,    X,     IX
+    fldx.s      a4,    AO1,   II
+
+    fmadd.s     a1,    a4,    a3,    a1  //temp1
+
+    add.d       IX,    IX,    INCX
+
+    addi.d     II,     II,    4
+    addi.d     I,      I,     1
+    blt        I,      M,    .L08
+
+.L09:                               /* y[j] += alpha * accumulator */
+    fldx.s      a3,    Y,    IY
+
+    fmadd.s    a3,    ALPHA,    a1,    a3
+
+    fstx.s     a3,     Y,    IY
+
+    add.d      AO1,    AO1,     LDA
+    add.d      IY,     IY,      INCY
+
+.L999:                              /* restore saved registers and return */
+    LDARG  $r23,  $sp,    0
+    LDARG  $r24,  $sp,    8
+    LDARG  $r25,  $sp,    16
+    LDARG  $r26,  $sp,    32
+    LDARG  $r27,  $sp,    40
+    LDARG  $r28,  $sp,    48
+    LDARG  $r29,  $sp,    56
+    LDARG  $r30,  $sp,    64
+    LD     ALPHA, $sp,    72
+    addi.d  $sp, $sp, 80
+    jirl    $r0, $r1, 0x0
+
+    EPILOGUE
diff --git a/kernel/loongarch64/snrm2.S b/kernel/loongarch64/snrm2.S
index 57c21a017..8c5c91ade 100644
--- a/kernel/loongarch64/snrm2.S
+++ b/kernel/loongarch64/snrm2.S
@@ -61,7 +61,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    fmov.d s2, s1
    bge $r0,    N, .L999
    slli.d INCX, INCX, BASE_SHIFT
-   bge $r0,    INCX, .L999
+   beq $r0,    INCX, .L999
    srai.d I, N, 3
    bne INCX, TEMP, .L20
    bge $r0,    I, .L15
diff --git a/kernel/loongarch64/snrm2_lasx.S b/kernel/loongarch64/snrm2_lasx.S
new file mode 100644
index 000000000..3ae11e897
--- /dev/null
+++ b/kernel/loongarch64/snrm2_lasx.S
@@ -0,0 +1,153 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N      $r4
+#define X      $r5
+#define INCX   $r6    /* scaled to bytes with BASE_SHIFT in the kernel */
+#define I      $r17   /* loop counter */
+#define TEMP   $r18   /* holds SIZE, compared against the byte stride to detect contiguous x */
+#define t1     $r12   /* t1..t4: integer temporaries used to gather strided elements */
+#define t2     $r13
+#define t3     $r14
+#define t4     $r15
+
+/* Do not change the following FR numbers: the kernel also addresses lane 0 of these as $f15..$f19. */
+#define VX0    $xr15  /* the scalar tail loads into $f15 */
+#define VX1    $xr16  /* read as $f16 in the final reduction */
+#define VX2    $xr17  /* read as $f17 in the final reduction */
+#define VX3    $xr18  /* read as $f18 in the final reduction */
+#define VX4    $xr21  /* not referenced in this file */
+#define res1   $xr19  /* accumulator; its lane 0 ($f19) carries the final sum of squares */
+#define res2   $xr20  /* second accumulator, folded into res1 at .L996 */
+
+   PROLOGUE
+   /* Single-precision 2-norm: squares are accumulated in double precision, then sqrt and convert back. */
+#ifdef F_INTERFACE
+   LDINT   N,     0(N)
+   LDINT   INCX,  0(INCX)
+#endif
+
+   xvxor.v res1, res1, res1          /* clear both accumulators */
+   xvxor.v res2, res2, res2
+   bge $r0, N, .L999                 /* N <= 0: result is sqrt(0) */
+   beq $r0, INCX, .L999              /* INCX == 0: result is sqrt(0) */
+   li.d  TEMP, SIZE
+   slli.d INCX, INCX, BASE_SHIFT     /* element stride to byte stride */
+   srai.d I, N, 3                    /* I = number of full 8-element chunks */
+   bne INCX, TEMP, .L20              /* non-unit stride takes the gather path */
+   bge $r0,  I, .L997
+   .align 3
+
+.L10:                                /* contiguous x: eight floats per iteration */
+   xvld VX0, X, 0
+   xvfcvtl.d.s VX1, VX0              /* widen the eight floats to two vectors of four doubles */
+   xvfcvth.d.s VX2, VX0
+   xvfmadd.d res1, VX1, VX1, res1    /* res += v * v */
+   xvfmadd.d res2, VX2, VX2, res2
+   addi.d I, I, -1
+   addi.d X, X, 8 * SIZE
+   blt $r0, I, .L10
+   .align 3
+   b .L996
+
+.L20:                                /* strided x */
+   bge $r0, I, .L997
+   .align 3
+
+.L21:                                /* gather eight elements through integer registers into VX0 */
+   ld.w t1, X, 0
+   add.d X, X, INCX
+   ld.w t2, X, 0
+   add.d X, X, INCX
+   ld.w t3, X, 0
+   add.d X, X, INCX
+   ld.w t4, X, 0
+   add.d X, X, INCX
+   xvinsgr2vr.w VX0, t1, 0
+   xvinsgr2vr.w VX0, t2, 1
+   xvinsgr2vr.w VX0, t3, 2
+   xvinsgr2vr.w VX0, t4, 3
+   ld.w t1, X, 0
+   add.d X, X, INCX
+   ld.w t2, X, 0
+   add.d X, X, INCX
+   ld.w t3, X, 0
+   add.d X, X, INCX
+   ld.w t4, X, 0
+   xvinsgr2vr.w VX0, t1, 4
+   xvinsgr2vr.w VX0, t2, 5
+   xvinsgr2vr.w VX0, t3, 6
+   xvinsgr2vr.w VX0, t4, 7
+   add.d X, X, INCX
+   xvfcvtl.d.s VX1, VX0              /* same widen-and-accumulate step as .L10 */
+   xvfcvth.d.s VX2, VX0
+   xvfmadd.d res1, VX1, VX1, res1
+   xvfmadd.d res2, VX2, VX2, res2
+   addi.d  I, I, -1
+   blt $r0, I, .L21
+   b .L996
+
+.L996:                               /* horizontal reduction of both accumulators into $f19 */
+   xvfadd.d res1, res1, res2
+   xvpickve.d VX1, res1, 1           /* elements 1..3 of res1 go to lane 0 of VX1..VX3 ($f16..$f18) */
+   xvpickve.d VX2, res1, 2
+   xvpickve.d VX3, res1, 3
+   fadd.d   $f19, $f19, $f16
+   fadd.d   $f19, $f19, $f17
+   fadd.d   $f19, $f19, $f18
+   .align 3
+
+.L997:                               /* scalar tail over the remaining N % 8 elements */
+   andi I, N, 7
+   bge $r0, I, .L999
+   .align 3
+
+.L998:
+   fld.s $f15, X, 0
+   add.d    X, X, INCX
+   addi.d   I, I, -1
+   fcvt.d.s $f15, $f15
+   fmadd.d  $f19, $f15, $f15, $f19
+   blt $r0, I, .L998
+   .align 3
+
+.L999:
+   fsqrt.d $f19, $f19
+   move $r4, $r17                    /* NOTE(review): copies I into $r4; the norm itself is returned in $f0 - confirm this is needed */
+   fcvt.s.d $f0, $f19
+   jirl $r0, $r1, 0x0
+
+   EPILOGUE
diff --git a/kernel/loongarch64/snrm2_lsx.S b/kernel/loongarch64/snrm2_lsx.S
new file mode 100644
index 000000000..bb492dbf0
--- /dev/null
+++ b/kernel/loongarch64/snrm2_lsx.S
@@ -0,0 +1,159 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+/*
+ * snrm2 (LSX): square root of the sum of squares of a single-precision
+ * vector, with the squares accumulated in double precision.
+ *
+ * Register roles used below:
+ *   N, X, INCX - element count, vector pointer, stride
+ *   I          - loop counter;  TEMP - scratch for the unit-stride test
+ *   t1..t4     - integer scratch used to gather strided elements
+ *   VX0..VX5   - vector scratch
+ */
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r12
+#define t2     $r13
+#define t3     $r14
+#define t4     $r15
+#define VX0    $vr15
+#define VX1    $vr16
+#define VX2    $vr17
+#define VX3    $vr18
+#define VX4    $vr21
+#define VX5    $vr22
+/* Don't change following FR unless you know the effects.
+   res1 must stay $vr19: the scalar tail (.L998) and the final sqrt (.L999)
+   work on $f19, which is expected to be the low lane of $vr19. */
+#define res1   $vr19
+#define res2   $vr20
+
+   PROLOGUE
+
+#ifdef F_INTERFACE
+   /* N and INCX are loaded through the incoming pointers
+      (presumably the Fortran by-reference calling convention). */
+   LDINT   N,     0(N)
+   LDINT   INCX,  0(INCX)
+#endif
+   /* Clear both accumulators so the early exits below return sqrt(0). */
+   vxor.v res1, res1, res1
+   vxor.v res2, res2, res2
+   bge $r0,    N, .L999           // N <= 0: nothing to sum
+   beq $r0, INCX, .L999           // INCX == 0: nothing to sum
+   li.d  TEMP, SIZE
+   /* Scale INCX by BASE_SHIFT (presumably log2(SIZE), giving a byte stride). */
+   slli.d INCX, INCX, BASE_SHIFT
+   srai.d I, N, 3                 // I = N / 8 blocks of eight elements
+   bne INCX, TEMP, .L20           // non-unit stride: gather path
+   bge  $r0,    I, .L997          // fewer than 8 elements: scalar tail only
+   .align 3
+
+/* Unit-stride main loop: eight floats per iteration. Each group of four is
+   widened to two double vectors (low half / high half) and its squares are
+   accumulated into res1 and res2 respectively. */
+.L10:
+   vld  VX0, X, 0
+   vld  VX5, X, 4 * SIZE
+   addi.d I, I, -1
+   addi.d X, X, 8 * SIZE
+   vfcvtl.d.s VX1, VX0
+   vfcvth.d.s VX2, VX0
+   vfcvtl.d.s VX3, VX5
+   vfcvth.d.s VX4, VX5
+   vfmadd.d  res1, VX1, VX1, res1
+   vfmadd.d  res2, VX2, VX2, res2
+   vfmadd.d  res1, VX3, VX3, res1
+   vfmadd.d  res2, VX4, VX4, res2
+   blt $r0, I, .L10
+   b .L996                        // fold the two accumulators
+   .align 3
+
+/* Strided path: eight elements per iteration, handled as two groups of four.
+   Each group is gathered with integer word loads into VX0, widened to
+   double, squared and accumulated into res1 (low half) / res2 (high half). */
+.L20:
+   bge $r0, I, .L997              // fewer than 8 elements: scalar tail only
+   .align 3
+
+.L21:
+   /* First group of four. */
+   ld.w t1, X, 0
+   add.d X, X, INCX
+   ld.w t2, X, 0
+   add.d X, X, INCX
+   ld.w t3, X, 0
+   add.d X, X, INCX
+   ld.w t4, X, 0
+   add.d X, X, INCX
+   vinsgr2vr.w VX0, t1, 0
+   vinsgr2vr.w VX0, t2, 1
+   vinsgr2vr.w VX0, t3, 2
+   vinsgr2vr.w VX0, t4, 3
+   vfcvtl.d.s VX1, VX0
+   vfcvth.d.s VX2, VX0
+   vfmadd.d res1, VX1, VX1, res1
+   vfmadd.d res2, VX2, VX2, res2
+   /* Second group of four; VX0 is reused as the gather buffer. */
+   ld.w t1, X, 0
+   add.d X, X, INCX
+   ld.w t2, X, 0
+   add.d X, X, INCX
+   ld.w t3, X, 0
+   add.d X, X, INCX
+   ld.w t4, X, 0
+   add.d X, X, INCX
+   vinsgr2vr.w VX0, t1, 0
+   vinsgr2vr.w VX0, t2, 1
+   vinsgr2vr.w VX0, t3, 2
+   vinsgr2vr.w VX0, t4, 3
+   vfcvtl.d.s VX3, VX0
+   vfcvth.d.s VX4, VX0
+   vfmadd.d res1, VX3, VX3, res1
+   vfmadd.d res2, VX4, VX4, res2
+   addi.d  I, I, -1
+   blt $r0, I, .L21
+   b .L996
+   .align 3
+
+/* Horizontal reduction: res1 += res2, then add lane 1 onto lane 0 so the
+   running sum of squares sits in the low double of res1 ($f19). */
+.L996:
+   vfadd.d res1, res1, res2
+   vreplvei.d VX1, res1, 1
+   vfadd.d res1, VX1, res1
+   .align 3
+
+/* Remainder: the last N % 8 elements are handled one at a time. */
+.L997:
+   andi I, N, 7
+   bge $r0, I, .L999
+   .align 3
+
+.L998:
+   fld.s $f15, X, 0
+   addi.d I, I, -1
+   fcvt.d.s $f15, $f15            // widen before squaring
+   fmadd.d $f19, $f15, $f15, $f19 // $f19 += x * x
+   add.d X, X, INCX
+   blt $r0, I, .L998
+   .align 3
+
+/* Result: sqrt of the double accumulator, narrowed to single in $f0. */
+.L999:
+   fsqrt.d $f19, $f19
+   /* NOTE(review): copies the loop counter into $r4; the result itself is
+      returned in $f0, so this looks vestigial - confirm before removing. */
+   move $r4, $r17
+   fcvt.s.d $f0, $f19
+   jirl $r0, $r1, 0x0
+   .align 3
+
+   EPILOGUE
diff --git a/kernel/loongarch64/sum_lasx.S b/kernel/loongarch64/sum_lasx.S
new file mode 100644
index 000000000..fd6d5adb3
--- /dev/null
+++ b/kernel/loongarch64/sum_lasx.S
@@ -0,0 +1,225 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+/*
+ * sum (LASX): plain (non-absolute) sum of the elements of X.
+ * Built for single precision by default and for double when DOUBLE is
+ * defined.
+ *
+ * Register roles used below:
+ *   N, X, INCX - element count, vector pointer, stride
+ *   I          - loop counter;  TEMP - scratch for the unit-stride test
+ *   t1..t4     - integer scratch used to gather strided elements
+ *   VX0..VX3   - vector scratch ($f12 in the scalar tails is VX0's register)
+ *   res1       - running sum; the scalar tails and the return path read it
+ *                as $f16, so it must stay $xr16
+ *   res2       - second scratch accumulator
+ */
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r15
+#define t2     $r12
+#define t3     $r13
+#define t4     $r14
+#define VX0    $xr12
+#define VX1    $xr13
+#define VX2    $xr14
+#define VX3    $xr15
+#define res1   $xr16
+#define res2   $xr17
+    PROLOGUE
+    /* Clear the accumulators so the early exits below return 0. */
+    xvxor.v res1, res1, res1
+    xvxor.v res2, res2, res2
+    bge $r0, N, .L999              // N <= 0
+    bge $r0, INCX, .L999           // INCX <= 0
+    li.d  TEMP, SIZE
+    /* Scale INCX by BASE_SHIFT (presumably log2(SIZE), giving a byte stride). */
+    slli.d  INCX, INCX, BASE_SHIFT
+    srai.d I, N, 3                 // I = N / 8 blocks of eight elements
+    bne INCX, TEMP, .L20           // non-unit stride: gather path
+    bge $r0, I, .L13               // fewer than 8 elements: scalar tail only
+    .align 3
+
+/* Unit-stride main loop: add eight elements per iteration into res1.
+   Eight floats fit in one 256-bit load; eight doubles need two loads.
+   The add must match the element width: the DOUBLE build uses xvfadd.d
+   (using xvfadd.s there would add the 64-bit data as pairs of floats). */
+.L11:
+    xvld VX0, X, 0
+#ifdef DOUBLE
+    xvld VX1, X, 32
+    xvfadd.d res1, res1, VX0
+    xvfadd.d res1, res1, VX1
+#else
+    xvfadd.s res1, res1, VX0
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L11
+    .align 3
+
+/* Horizontal reduction of res1 into its lane 0 (read later as $f16).
+   DOUBLE: lanes 1..3 are added onto lane 0.
+   Single: res2 first receives a copy of res1 (res2 is still zero on this
+   path), lanes 1..3 are added from res1 and lanes 4..7 from the copy.
+   Every lane must be added exactly once, so the last add uses VX3 (lane 7). */
+.L12:
+#ifdef DOUBLE
+    xvpickve.d VX1, res1, 1
+    xvpickve.d VX2, res1, 2
+    xvpickve.d VX3, res1, 3
+    xvfadd.d res1, VX1, res1
+    xvfadd.d res1, VX2, res1
+    xvfadd.d res1, VX3, res1
+#else
+    xvfadd.s res2, res1, res2
+    xvpickve.w VX1, res1, 1
+    xvpickve.w VX2, res1, 2
+    xvpickve.w VX3, res1, 3
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+    xvpickve.w VX0, res2, 4
+    xvpickve.w VX1, res2, 5
+    xvpickve.w VX2, res2, 6
+    xvpickve.w VX3, res2, 7
+    xvfadd.s res1, VX0, res1
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+#endif
+    .align 3
+
+/* Unit-stride remainder: the last N % 8 elements are added one at a time
+   into $f16, the low lane of res1. */
+.L13:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L14:
+    LD  $f12, X, 0
+    ADD $f16, $f12, $f16
+    addi.d I, I, -1
+    addi.d  X, X, SIZE
+    blt $r0, I, .L14
+    b .L999
+    .align 3
+
+/* Strided path: eight elements per iteration are gathered with integer
+   loads, inserted lane by lane into vector registers and added to res1. */
+.L20:
+    bge $r0, I, .L23               // fewer than 8 elements: scalar tail only
+    .align 3
+
+.L21:
+#ifdef DOUBLE
+    /* Two vectors of four doubles each; res2 is used as scratch here. */
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvfadd.d res2, VX0, VX1
+    xvfadd.d res1, res1, res2
+#else
+    /* One vector of eight floats; res2 is left untouched (still zero). */
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    xvfadd.s res1, VX0, res1
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+/* Horizontal reduction of res1 into its lane 0 after the strided loop.
+   DOUBLE: lanes 1..3 are added onto lane 0.
+   Single: res2 first receives a copy of res1 (res2 is still zero on this
+   path), lanes 1..3 are added from res1 and lanes 4..7 from the copy.
+   Every lane must be added exactly once, so the last add uses VX3 (lane 7). */
+.L22:
+#ifdef DOUBLE
+    xvpickve.d VX1, res1, 1
+    xvpickve.d VX2, res1, 2
+    xvpickve.d VX3, res1, 3
+    xvfadd.d res1, VX1, res1
+    xvfadd.d res1, VX2, res1
+    xvfadd.d res1, VX3, res1
+#else
+    xvfadd.s res2, res1, res2
+    xvpickve.w VX1, res1, 1
+    xvpickve.w VX2, res1, 2
+    xvpickve.w VX3, res1, 3
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+    xvpickve.w VX0, res2, 4
+    xvpickve.w VX1, res2, 5
+    xvpickve.w VX2, res2, 6
+    xvpickve.w VX3, res2, 7
+    xvfadd.s res1, VX0, res1
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+#endif
+    .align 3
+
+/* Strided remainder: the last N % 8 elements are added one at a time into
+   $f16, the low lane of res1, stepping X by the byte stride. */
+.L23:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    LD  $f12, X, 0
+    ADD $f16, $f12, $f16
+    addi.d I, I, -1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+/* Return the accumulated sum ($f16, low lane of res1) in $f0.
+   The move must match the element width: fmov.s is only specified for
+   single-format operands, so the DOUBLE build uses fmov.d. */
+.L999:
+#ifdef DOUBLE
+    fmov.d $f0, $f16
+#else
+    fmov.s $f0, $f16
+#endif
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/sum_lsx.S b/kernel/loongarch64/sum_lsx.S
new file mode 100644
index 000000000..6b2027781
--- /dev/null
+++ b/kernel/loongarch64/sum_lsx.S
@@ -0,0 +1,204 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+/*
+ * sum (LSX): plain (non-absolute) sum of the elements of X.
+ * Built for single precision by default and for double when DOUBLE is
+ * defined.
+ *
+ * Register roles used below:
+ *   N, X, INCX - element count, vector pointer, stride
+ *   I          - loop counter;  TEMP - scratch for the unit-stride test
+ *   t1..t4     - integer scratch used to gather strided elements
+ *   VX0..VX3   - vector scratch ($f12 in the scalar tails is VX0's register)
+ *   res1       - running sum; the scalar tails and the return path read it
+ *                as $f16, so it must stay $vr16
+ *   res2       - scratch for pairwise adds
+ */
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r15
+#define t2     $r12
+#define t3     $r13
+#define t4     $r14
+#define VX0    $vr12
+#define VX1    $vr13
+#define VX2    $vr14
+#define VX3    $vr15
+#define res1   $vr16
+#define res2   $vr17
+    PROLOGUE
+    /* Clear the accumulators so the early exits below return 0. */
+    vxor.v res1, res1, res1
+    vxor.v res2, res2, res2
+    bge $r0, N, .L999              // N <= 0
+    bge $r0, INCX, .L999           // INCX <= 0
+    li.d  TEMP, SIZE
+    /* Scale INCX by BASE_SHIFT (presumably log2(SIZE), giving a byte stride). */
+    slli.d  INCX, INCX, BASE_SHIFT
+    srai.d I, N, 3                 // I = N / 8 blocks of eight elements
+    bne INCX, TEMP, .L20           // non-unit stride: gather path
+    bge $r0, I, .L13               // fewer than 8 elements: scalar tail only
+    .align 3
+
+/* Unit-stride main loop: eight elements per iteration. Two 128-bit loads
+   cover eight floats; the DOUBLE build needs two more for eight doubles.
+   VFADD comes from the common headers (presumably the precision-matched
+   vector add); res2 only holds the pairwise partial sum. */
+.L11:
+    vld VX0, X, 0
+    vld VX1, X, 16
+    VFADD res2, VX0, VX1
+    VFADD res1, res1, res2
+#ifdef DOUBLE
+    vld VX0, X, 32
+    vld VX1, X, 48
+    VFADD res2, VX0, VX1
+    VFADD res1, res1, res2
+#endif
+    addi.d  X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L11
+    .align 3
+
+/* Horizontal reduction: add every other lane of res1 onto lane 0, which the
+   scalar tail below continues to accumulate into as $f16. */
+.L12:
+#ifdef DOUBLE
+    vreplvei.d VX1, res1, 1
+    vfadd.d   res1, res1, VX1
+#else
+    vreplvei.w VX1, res1, 1
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1
+    vfadd.s res1, VX2, res1
+    vfadd.s res1, VX3, res1
+#endif
+    .align 3
+
+/* Unit-stride remainder: the last N % 8 elements are added one at a time
+   into $f16, the low lane of res1. */
+.L13:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L14:
+    LD   $f12, X, 0
+    ADD  $f16, $f12, $f16
+    addi.d  I, I, -1
+    addi.d  X, X, SIZE
+    blt $r0, I, .L14
+    b .L999
+    .align 3
+
+/* Strided path: eight elements per iteration are gathered with integer
+   loads, inserted lane by lane into VX0/VX1, added pairwise into res2 and
+   then accumulated into res1. */
+.L20:
+    bge $r0, I, .L23               // fewer than 8 elements: scalar tail only
+    .align 3
+
+.L21:
+#ifdef DOUBLE
+    /* Elements 0-3: two vectors of two doubles. */
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    /* Elements 4-7: same pattern using t3/t4. */
+    ld.d t3, X, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t3, 0
+    vinsgr2vr.d VX0, t4, 1
+    ld.d t3, X, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+#else
+    /* Elements 0-3 into VX0, elements 4-7 into VX1. */
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vfadd.s res2, VX0, VX1
+    vfadd.s res1, res1, res2
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+/* Horizontal reduction after the strided loop: add every other lane of res1
+   onto lane 0, which the scalar tail below keeps accumulating into as $f16. */
+.L22:
+#ifdef DOUBLE
+    vreplvei.d VX1, res1, 1
+    vfadd.d res1, VX1, res1
+#else
+    vreplvei.w VX1, res1, 1
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1
+    vfadd.s res1, VX2, res1
+    vfadd.s res1, VX3, res1
+#endif
+    .align 3
+
+/* Strided remainder: the last N % 8 elements, one at a time. */
+.L23:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24:
+    LD  $f12, X, 0
+    ADD $f16, $f12, $f16
+    addi.d I, I, -1
+    add.d  X, X, INCX
+    blt  $r0, I, .L24
+    .align 3
+
+/* Return the accumulated sum ($f16, low lane of res1) in $f0.
+   The move must match the element width: fmov.s is only specified for
+   single-format operands, so the DOUBLE build uses fmov.d. */
+.L999:
+#ifdef DOUBLE
+    fmov.d $f0, $f16
+#else
+    fmov.s $f0, $f16
+#endif
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/swap_lasx.S b/kernel/loongarch64/swap_lasx.S
new file mode 100644
index 000000000..4767fffe3
--- /dev/null
+++ b/kernel/loongarch64/swap_lasx.S
@@ -0,0 +1,401 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+/*
+ * swap (LASX): exchange the elements of X and Y.
+ *
+ * Register roles used below:
+ *   N, X, INCX, Y, INCY - element count, the two vector pointers and strides
+ *   I        - loop counter;  TEMP - byte size of one element
+ *   XX       - trailing write pointer into X for the fully strided path
+ *   YY       - defined but not referenced in this file
+ *   t1..t4   - integer scratch for values gathered from the strided vector
+ *   a1..a4, b1..b4 - scalar FP scratch for the fully strided path
+ *   VX0..VX3 - vector scratch; note $xr12..$xr15 are the same registers as
+ *              a1..a4, so a given path uses one set or the other
+ */
+#define N      $r4
+#define X      $r7
+#define INCX   $r8
+#define Y      $r9
+#define INCY   $r10
+
+#define I      $r17
+#define TEMP   $r18
+#define XX     $r5
+#define YY     $r6
+#define t1     $r14
+#define t2     $r15
+#define t3     $r16
+#define t4     $r19
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define b1     $f16
+#define b2     $f17
+#define b3     $f18
+#define b4     $f19
+#define VX0    $xr12
+#define VX1    $xr13
+#define VX2    $xr14
+#define VX3    $xr15
+
+
+    PROLOGUE
+    bge $r0, N, .L999              // N <= 0: nothing to do
+    /* TEMP = 1 << BASE_SHIFT; both strides are scaled the same way so they
+       can be compared against it (presumably the element size in bytes). */
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, BASE_SHIFT
+    slli.d  INCX, INCX, BASE_SHIFT
+    slli.d  INCY, INCY, BASE_SHIFT
+    srai.d I, N, 3                 // I = N / 8 blocks of eight elements
+    /* Four-way dispatch on whether each stride is the unit stride. */
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+/* INCX==1 and INCY==1 */
+/* Both vectors contiguous: swap eight elements per iteration with whole
+   256-bit loads and stores, then finish the N % 8 remainder one by one. */
+.L11:
+    bge $r0, I, .L112
+    .align 3
+
+.L111:
+    xvld VX0, X, 0
+    xvld VX2, Y, 0
+    addi.d  I, I, -1
+    xvst VX2, X, 0
+    xvst VX0, Y, 0
+#ifdef DOUBLE
+    /* Eight doubles span 64 bytes, so a second 32-byte exchange is needed. */
+    xvld VX0, X, 32
+    xvld VX2, Y, 32
+    xvst VX2, X, 32
+    xvst VX0, Y, 32
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L111
+    .align 3
+
+.L112:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L113:
+    LD  $f12, X, 0
+    LD  $f14, Y, 0
+    addi.d I, I, -1
+    ST  $f12, Y, 0
+    ST  $f14, X, 0
+    addi.d  X, X, SIZE
+    addi.d  Y, Y, SIZE
+    blt $r0, I, .L113
+    b .L999
+    .align 3
+
+/* INCX==1 and INCY!=1 */
+/* X is contiguous, Y is strided. X is loaded as a vector; for each Y slot
+   the old Y value is first read into an integer register and the matching
+   X lane is then stored in its place. The saved Y values are assembled into
+   a vector and written back over X. Eight elements per iteration. */
+.L12:
+    bge $r0, I, .L122
+    .align 3
+
+.L121:
+#ifdef DOUBLE
+    /* Elements 0-3. */
+    xvld VX0, X, 0
+    ld.d t1, Y, 0
+    xvstelm.d VX0, Y, 0, 0
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0
+    xvstelm.d VX0, Y, 0, 1
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0
+    xvstelm.d VX0, Y, 0, 2
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0
+    xvstelm.d VX0, Y, 0, 3
+    xvinsgr2vr.d VX2, t1, 0
+    xvinsgr2vr.d VX2, t2, 1
+    xvinsgr2vr.d VX2, t3, 2
+    xvinsgr2vr.d VX2, t4, 3
+    add.d Y, Y, INCY
+    xvst VX2, X, 0
+    /* Elements 4-7. */
+    xvld VX1, X, 4 * SIZE
+    ld.d t1, Y, 0
+    xvstelm.d VX1, Y, 0, 0
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0
+    xvstelm.d VX1, Y, 0, 1
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0
+    xvstelm.d VX1, Y, 0, 2
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0
+    xvstelm.d VX1, Y, 0, 3
+    xvinsgr2vr.d VX3, t1, 0
+    xvinsgr2vr.d VX3, t2, 1
+    xvinsgr2vr.d VX3, t3, 2
+    xvinsgr2vr.d VX3, t4, 3
+    add.d Y, Y, INCY
+    xvst VX3, X, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+#else
+    /* All eight floats of X fit in VX0; VX2 collects the old Y values. */
+    xvld VX0, X, 0
+    ld.w t1, Y, 0
+    xvstelm.w VX0, Y, 0, 0
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0
+    xvstelm.w VX0, Y, 0, 1
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0
+    xvstelm.w VX0, Y, 0, 2
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0
+    xvstelm.w VX0, Y, 0, 3
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    ld.w t1, Y, 0
+    xvstelm.w VX0, Y, 0, 4
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0
+    xvstelm.w VX0, Y, 0, 5
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0
+    xvstelm.w VX0, Y, 0, 6
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0
+    xvstelm.w VX0, Y, 0, 7
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvst VX2, X, 0
+    addi.d X, X, 8 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    .align 3
+
+/* Remainder: last N % 8 elements, one at a time. */
+.L122:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L123:
+    LD  $f12, X, 0
+    LD  $f14, Y, 0
+    addi.d I, I, -1
+    ST  $f12, Y, 0
+    ST  $f14, X, 0
+    addi.d  X, X, SIZE
+    add.d   Y, Y, INCY
+    blt   $r0, I, .L123
+    b .L999
+    .align 3
+
+/* INCX!=1 and INCY==1 */
+/* Mirror image of the INCX==1 / INCY!=1 path: Y is contiguous and loaded as
+   a vector, X is strided. For each X slot the old X value is read into an
+   integer register and the matching Y lane is stored in its place; the
+   saved X values are then written back over Y as a vector. */
+.L21:
+    bge $r0, I, .L212
+    .align 3
+
+.L211:
+#ifdef DOUBLE
+    /* Elements 0-3. */
+    xvld VX2, Y, 0
+    ld.d t1, X, 0
+    xvstelm.d VX2, X, 0, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    xvstelm.d VX2, X, 0, 1
+    add.d X, X, INCX
+    ld.d t3, X, 0
+    xvstelm.d VX2, X, 0, 2
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    xvstelm.d VX2, X, 0, 3
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    add.d X, X, INCX
+    xvst VX0, Y, 0
+    /* Elements 4-7. */
+    xvld VX3, Y, 4 * SIZE
+    ld.d t1, X, 0
+    xvstelm.d VX3, X, 0, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    xvstelm.d VX3, X, 0, 1
+    add.d X, X, INCX
+    ld.d t3, X, 0
+    xvstelm.d VX3, X, 0, 2
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    xvstelm.d VX3, X, 0, 3
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    add.d X, X, INCX
+    xvst VX1, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#else
+    /* All eight floats of Y fit in VX2; VX0 collects the old X values. */
+    xvld VX2, Y, 0
+    ld.w t1, X, 0
+    xvstelm.w VX2, X, 0, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    xvstelm.w VX2, X, 0, 1
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    xvstelm.w VX2, X, 0, 2
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    xvstelm.w VX2, X, 0, 3
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    ld.w t1, X, 0
+    xvstelm.w VX2, X, 0, 4
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    xvstelm.w VX2, X, 0, 5
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    xvstelm.w VX2, X, 0, 6
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    xvstelm.w VX2, X, 0, 7
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    add.d X, X, INCX
+    xvst VX0, Y, 0
+    addi.d Y, Y, 8 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    .align 3
+
+/* Remainder: last N % 8 elements, one at a time. */
+.L212:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L213:
+    LD  $f12, X, 0
+    LD  $f14, Y, 0
+    addi.d I, I, -1
+    ST  $f12, Y, 0
+    ST  $f14, X, 0
+    add.d  X, X, INCX
+    addi.d Y, Y, SIZE
+    blt  $r0, I, .L213
+    b .L999
+    .align 3
+
+/* INCX!=1 and INCY!=1 */
+/* Both vectors strided: scalar swap, eight elements per iteration.
+   X is read ahead through X while XX trails behind as the write pointer
+   into the same vector; a1..a4 carry X values and b1..b4 carry old Y values.
+   X, XX and Y must each advance by exactly eight strides per iteration. */
+.L22:
+    bge $r0, I, .L223
+    move XX, X                     // XX = first X slot still to be overwritten
+    .align 3
+
+.L222:
+    /* Elements 0-3: read X into a1..a4 ... */
+    LD    a1, X, 0
+    add.d X, X, INCX
+    LD    a2, X, 0
+    add.d X, X, INCX
+    LD    a3, X, 0
+    add.d X, X, INCX
+    LD    a4, X, 0
+    add.d X, X, INCX
+    /* ... save old Y in b1..b4 and store the X values into Y. */
+    LD    b1, Y, 0
+    ST    a1, Y, 0
+    add.d Y, Y, INCY
+    LD    b2, Y, 0
+    ST    a2, Y, 0
+    add.d Y, Y, INCY
+    LD    b3, Y, 0
+    ST    a3, Y, 0
+    add.d Y, Y, INCY
+    LD    b4, Y, 0
+    ST    a4, Y, 0
+    add.d Y, Y, INCY
+    /* Elements 4-7: for each one, read the next X value, flush the old Y
+       value saved four elements earlier to XX, then swap into Y. */
+    LD    a1, X, 0
+    add.d X, X, INCX
+    ST    b1, XX, 0
+    add.d XX, XX, INCX
+    LD    b1, Y, 0
+    ST    a1, Y, 0
+    add.d Y, Y, INCY
+    LD    a2, X, 0
+    add.d X, X, INCX
+    ST    b2, XX, 0
+    add.d XX, XX, INCX
+    LD    b2, Y, 0
+    ST    a2, Y, 0
+    add.d Y, Y, INCY
+    LD    a3, X, 0
+    add.d X, X, INCX
+    ST    b3, XX, 0
+    add.d XX, XX, INCX
+    LD    b3, Y, 0
+    ST    a3, Y, 0
+    add.d Y, Y, INCY               // step past element 6 before touching element 7
+    LD    a4, X, 0
+    add.d X, X, INCX
+    ST    b4, XX, 0
+    add.d XX, XX, INCX
+    LD    b4, Y, 0
+    ST    a4, Y, 0
+    add.d Y, Y, INCY
+    /* Flush the old Y values of elements 4-7 into X. */
+    ST    b1, XX, 0
+    add.d XX, XX, INCX
+    ST    b2, XX, 0
+    add.d XX, XX, INCX
+    ST    b3, XX, 0
+    add.d XX, XX, INCX
+    ST    b4, XX, 0
+    add.d XX, XX, INCX
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    .align 3
+
+/* Remainder of the fully strided path: last N % 8 elements, one at a time. */
+.L223:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L224:
+    LD  $f12, X, 0
+    LD  $f14, Y, 0
+    addi.d I, I, -1
+    ST  $f12, Y, 0
+    ST  $f14, X, 0
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt  $r0, I, .L224
+    .align 3
+
+/* Common exit. */
+.L999:
+    /* NOTE(review): $r12 is never written in this file, so the integer
+       return value is whatever the caller left there - confirm callers
+       ignore it. */
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/swap_lsx.S b/kernel/loongarch64/swap_lsx.S
new file mode 100644
index 000000000..736187f93
--- /dev/null
+++ b/kernel/loongarch64/swap_lsx.S
@@ -0,0 +1,431 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+/*
+ * swap (LSX): exchange the elements of X and Y.
+ *
+ * Register roles used below:
+ *   N, X, INCX, Y, INCY - element count, the two vector pointers and strides
+ *   I        - loop counter;  TEMP - byte size of one element
+ *   XX       - trailing write pointer into X for the fully strided path
+ *   YY       - defined but not referenced in this file
+ *   t1..t4   - integer scratch for values gathered from the strided vector
+ *   a1..a4, b1..b4 - scalar FP scratch for the fully strided path
+ *   VX0..VX3 - vector scratch; note $vr12..$vr15 are the same registers as
+ *              a1..a4, so a given path uses one set or the other
+ */
+#define N      $r4
+#define X      $r7
+#define INCX   $r8
+#define Y      $r9
+#define INCY   $r10
+
+#define I      $r17
+#define TEMP   $r18
+#define XX     $r5
+#define YY     $r6
+#define t1     $r14
+#define t2     $r15
+#define t3     $r16
+#define t4     $r19
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define b1     $f16
+#define b2     $f17
+#define b3     $f18
+#define b4     $f19
+#define VX0    $vr12
+#define VX1    $vr13
+#define VX2    $vr14
+#define VX3    $vr15
+
+
+    PROLOGUE
+    bge $r0, N, .L999              // N <= 0: nothing to do
+    /* TEMP = 1 << BASE_SHIFT; both strides are scaled the same way so they
+       can be compared against it (presumably the element size in bytes). */
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, BASE_SHIFT
+    slli.d  INCX, INCX, BASE_SHIFT
+    slli.d  INCY, INCY, BASE_SHIFT
+    srai.d I, N, 3                 // I = N / 8 blocks of eight elements
+    /* Four-way dispatch on whether each stride is the unit stride. */
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+/* INCX==1 and INCY==1 */
+/* Both vectors contiguous: swap eight elements per iteration with 128-bit
+   loads and stores, then finish the N % 8 remainder one by one. */
+.L11:
+    bge $r0, I, .L112
+    .align 3
+
+.L111:
+    vld VX0, X, 0
+    vld VX1, X, 16
+    vld VX2, Y, 0
+    vld VX3, Y, 16
+    addi.d  I, I, -1
+    vst VX2, X, 0
+    vst VX3, X, 16
+    vst VX0, Y, 0
+    vst VX1, Y, 16
+#ifdef DOUBLE
+    /* Eight doubles span 64 bytes, so exchange the second 32 bytes too. */
+    vld VX0, X, 32
+    vld VX1, X, 48
+    vld VX2, Y, 32
+    vld VX3, Y, 48
+    vst VX2, X, 32
+    vst VX3, X, 48
+    vst VX0, Y, 32
+    vst VX1, Y, 48
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L111
+    .align 3
+
+.L112:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L113:
+#ifdef DOUBLE
+    fld.d $f12, X, 0
+    fld.d $f14, Y, 0
+    addi.d I, I, -1
+    fst.d $f12, Y, 0
+    fst.d $f14, X, 0
+#else
+    fld.s $f12, X, 0
+    fld.s $f14, Y, 0
+    addi.d I, I, -1
+    fst.s $f12, Y, 0
+    fst.s $f14, X, 0
+#endif
+    addi.d  X, X, SIZE
+    addi.d  Y, Y, SIZE
+    blt $r0, I, .L113
+    b .L999
+    .align 3
+
+/* INCX==1 and INCY!=1 */
+/* X is contiguous, Y is strided. X is loaded one 128-bit vector at a time;
+   for each Y slot the old Y value is first read into an integer register
+   and the matching X lane is then stored in its place. The saved Y values
+   are assembled into a vector and written back over X. Eight elements per
+   iteration. */
+.L12:
+    bge $r0, I, .L122
+    .align 3
+
+.L121:
+#ifdef DOUBLE
+    /* Four groups of two doubles each. */
+    vld VX0, X, 0
+    ld.d t1, Y, 0
+    vstelm.d VX0, Y, 0, 0
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0
+    vstelm.d VX0, Y, 0, 1
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    add.d Y, Y, INCY
+    vst VX2, X, 0
+    vld VX1, X, 2 * SIZE
+    ld.d t3, Y, 0
+    vstelm.d VX1, Y, 0, 0
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0
+    vstelm.d VX1, Y, 0, 1
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    vst VX3, X, 2 * SIZE
+    vld VX0, X, 4 * SIZE
+    ld.d t1, Y, 0
+    vstelm.d VX0, Y, 0, 0
+    add.d Y, Y, INCY
+    ld.d t2, Y, 0
+    vstelm.d VX0, Y, 0, 1
+    vinsgr2vr.d VX2, t1, 0
+    vinsgr2vr.d VX2, t2, 1
+    add.d Y, Y, INCY
+    vst VX2, X, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    ld.d t3, Y, 0
+    vstelm.d VX1, Y, 0, 0
+    add.d Y, Y, INCY
+    ld.d t4, Y, 0
+    vstelm.d VX1, Y, 0, 1
+    vinsgr2vr.d VX3, t3, 0
+    vinsgr2vr.d VX3, t4, 1
+    add.d Y, Y, INCY
+    vst VX3, X, 6 * SIZE
+    addi.d X, X, 8 * SIZE
+#else
+    /* Two groups of four floats each. */
+    vld VX0, X, 0
+    ld.w t1, Y, 0
+    vstelm.w VX0, Y, 0, 0
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0
+    vstelm.w VX0, Y, 0, 1
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0
+    vstelm.w VX0, Y, 0, 2
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0
+    vstelm.w VX0, Y, 0, 3
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    vst VX2, X, 0
+
+    vld VX1, X, 4 * SIZE
+    ld.w t1, Y, 0
+    vstelm.w VX1, Y, 0, 0
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0
+    vstelm.w VX1, Y, 0, 1
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0
+    vstelm.w VX1, Y, 0, 2
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0
+    vstelm.w VX1, Y, 0, 3
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    add.d Y, Y, INCY
+    vst VX3, X, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    .align 3
+
+/* Remainder: last N % 8 elements, one at a time. */
+.L122:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L123:
+    LD  $f12, X, 0
+    LD  $f14, Y, 0
+    addi.d I, I, -1
+    ST  $f12, Y, 0
+    ST  $f14, X, 0
+    addi.d X, X, SIZE
+    add.d  Y, Y, INCY
+    blt  $r0, I, .L123
+    b .L999
+    .align 3
+
+/* INCX!=1 and INCY==1 */
+/* Mirror image of the INCX==1 / INCY!=1 path: Y is contiguous and loaded one
+   128-bit vector at a time, X is strided. For each X slot the old X value is
+   read into an integer register and the matching Y lane is stored in its
+   place; the saved X values are then written back over Y as a vector. */
+.L21:
+    bge $r0, I, .L212
+    .align 3
+
+.L211:
+#ifdef DOUBLE
+    /* Four groups of two doubles each. */
+    vld VX2, Y, 0
+    ld.d t1, X, 0
+    vstelm.d VX2, X, 0, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    vstelm.d VX2, X, 0, 1
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    add.d X, X, INCX
+    vst VX0, Y, 0
+    vld VX3, Y, 2 * SIZE
+    ld.d t3, X, 0
+    vstelm.d VX3, X, 0, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    vstelm.d VX3, X, 0, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vst VX1, Y, 2 * SIZE
+    vld VX2, Y, 4 * SIZE
+    ld.d t1, X, 0
+    vstelm.d VX2, X, 0, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    vstelm.d VX2, X, 0, 1
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    add.d X, X, INCX
+    vst VX0, Y, 4 * SIZE
+    vld VX3, Y, 6 * SIZE
+    ld.d t3, X, 0
+    vstelm.d VX3, X, 0, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    vstelm.d VX3, X, 0, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vst  VX1, Y, 6 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#else
+    /* Two groups of four floats each. */
+    vld VX2, Y, 0
+    ld.w t1, X, 0
+    vstelm.w VX2, X, 0, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    vstelm.w VX2, X, 0, 1
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    vstelm.w VX2, X, 0, 2
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    vstelm.w VX2, X, 0, 3
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    vst VX0, Y, 0
+    vld VX3, Y, 4 * SIZE
+    ld.w t1, X, 0
+    vstelm.w VX3, X, 0, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    vstelm.w VX3, X, 0, 1
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    vstelm.w VX3, X, 0, 2
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    vstelm.w VX3, X, 0, 3
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    add.d X, X, INCX
+    vst  VX1, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    .align 3
+
+/* Remainder: last N % 8 elements, one at a time. */
+.L212:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L213:
+    LD  $f12, X, 0 * SIZE
+    LD  $f14, Y, 0 * SIZE
+    addi.d I, I, -1
+    ST  $f12, Y, 0 * SIZE
+    ST  $f14, X, 0 * SIZE
+    add.d  X, X, INCX
+    addi.d Y, Y, SIZE
+    blt  $r0, I, .L213
+    b .L999
+    .align 3
+
+/* INCX!=1 and INCY!=1 */
+/* Both vectors strided: scalar swap, eight elements per iteration.
+   X is read ahead through X while XX trails behind as the write pointer
+   into the same vector; a1..a4 carry X values and b1..b4 carry old Y values.
+   X, XX and Y must each advance by exactly eight strides per iteration. */
+.L22:
+    bge $r0, I, .L223
+    move XX, X                     // XX = first X slot still to be overwritten
+    .align 3
+
+.L222:
+    /* Elements 0-3: read X into a1..a4 ... */
+    LD    a1, X, 0
+    add.d X, X, INCX
+    LD    a2, X, 0
+    add.d X, X, INCX
+    LD    a3, X, 0
+    add.d X, X, INCX
+    LD    a4, X, 0
+    add.d X, X, INCX
+    /* ... save old Y in b1..b4 and store the X values into Y. */
+    LD    b1, Y, 0
+    ST    a1, Y, 0
+    add.d Y, Y, INCY
+    LD    b2, Y, 0
+    ST    a2, Y, 0
+    add.d Y, Y, INCY
+    LD    b3, Y, 0
+    ST    a3, Y, 0
+    add.d Y, Y, INCY
+    LD    b4, Y, 0
+    ST    a4, Y, 0
+    add.d Y, Y, INCY
+    /* Elements 4-7: for each one, read the next X value, flush the old Y
+       value saved four elements earlier to XX, then swap into Y. */
+    LD    a1, X, 0
+    add.d X, X, INCX
+    ST    b1, XX, 0
+    add.d XX, XX, INCX
+    LD    b1, Y, 0
+    ST    a1, Y, 0
+    add.d Y, Y, INCY
+    LD    a2, X, 0
+    add.d X, X, INCX
+    ST    b2, XX, 0
+    add.d XX, XX, INCX
+    LD    b2, Y, 0
+    ST    a2, Y, 0
+    add.d Y, Y, INCY
+    LD    a3, X, 0
+    add.d X, X, INCX
+    ST    b3, XX, 0
+    add.d XX, XX, INCX
+    LD    b3, Y, 0
+    ST    a3, Y, 0
+    add.d Y, Y, INCY               // step past element 6 before touching element 7
+    LD    a4, X, 0
+    add.d X, X, INCX
+    ST    b4, XX, 0
+    add.d XX, XX, INCX
+    LD    b4, Y, 0
+    ST    a4, Y, 0
+    add.d Y, Y, INCY
+    /* Flush the old Y values of elements 4-7 into X. */
+    ST    b1, XX, 0
+    add.d XX, XX, INCX
+    ST    b2, XX, 0
+    add.d XX, XX, INCX
+    ST    b3, XX, 0
+    add.d XX, XX, INCX
+    ST    b4, XX, 0
+    add.d XX, XX, INCX
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    .align 3
+
+/* Remainder of the fully strided path: last N % 8 elements, one at a time. */
+.L223:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L224:
+    LD  $f12, X, 0
+    LD  $f14, Y, 0
+    addi.d I, I, -1
+    ST  $f12, Y, 0
+    ST  $f14, X, 0
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L224
+    .align 3
+
+/* Common exit. */
+.L999:
+    /* NOTE(review): $r12 is never written in this file, so the integer
+       return value is whatever the caller left there - confirm callers
+       ignore it. */
+    move $r4, $r12
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/zgemm_kernel_2x2.S b/kernel/loongarch64/zgemm_kernel_2x2.S
new file mode 100644
index 000000000..589d170c5
--- /dev/null
+++ b/kernel/loongarch64/zgemm_kernel_2x2.S
@@ -0,0 +1,848 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+
+/* Function parameters: the registers the kernel reads its arguments from */
+#define M      $r4   // param 1: bm, drives the row (i) loops
+#define N      $r5   // param 2: bn, drives the column (j) loops
+#define K      $r6   // param 3: bk, trip count of the inner (k) loops
+#define ALPHA_R $f0   // param 4: alphar, real part of alpha
+#define ALPHA_I $f1   // param 5: alphai, imaginary part of alpha
+#define A      $r7   // param 6: ba, base of the A panel
+#define B      $r8  // param 7: bb, base of the B panel (advanced after every column block)
+#define C      $r9  // param 8: bc, base of the C block (advanced after every column block)
+#define LDC    $r10  // param 9: ldc, scaled by 1 << BASE_SHIFT in the prologue
+
+#if defined (TRMMKERNEL)
+#define OFFSET $r11  // param 10: offset (TRMM builds only)
+#endif
+#define OFF    $r26  // running TRMM offset; set to 0 when the prologue takes its non-TRMM branch
+
+#define I      $r12  // row-tile loop counter (also a scratch register at the end of a column block)
+#define J      $r13  // column-block loop counter
+#define L      $r14  // k-loop counter (also a scratch register at the end of a column block)
+#define TL     $r15  // number of k steps for the current tile; also scratch
+#define A0     $r16  // running pointer into A
+#define B0     $r17  // running pointer into B
+#define C0     $r18  // running pointer into the first C column of the block
+#define C1     $r19  // running pointer into the second C column of the block
+#define C2     $r20  // used only as the k-loop bound of the 2x2 tile
+#define C3     $r23  // used only as a byte-offset scratch register in the TRMM paths
+#define T0     $r24  // loop bound for the i and j loops
+#define T1     $r25  // loop bound for the bm&1 loop of the single-column path
+
+#define a1     $f2   // a1..a8: values loaded from A; a5..a8 are reused as temporaries when C is updated
+#define a2     $f3
+#define a3     $f4
+#define a4     $f5
+#define a5     $f6
+#define a6     $f7
+#define a7     $f8
+#define a8     $f9
+#define b1     $f10  // b1..b8: values loaded from B; b5..b8 are reused as temporaries when C is updated
+#define b2     $f11
+#define b3     $f12
+#define b4     $f13
+#define b5     $f14
+#define b6     $f15
+#define b7     $f16
+#define b8     $f17
+#define c11    $f18  // cN1 / cN2: real / imaginary accumulator of result element N of the tile
+#define c12    $f19
+#define c21    $f20
+#define c22    $f21
+#define c31    $f22
+#define c32    $f23
+#define c41    $f24
+#define c42    $f25
+#define c51    $f26  // c51..c62 are not referenced anywhere in this file
+#define c52    $f27
+#define c61    $f28
+#define c62    $f29
+
+/* MADD1..MADD4 accumulate the four partial products of (ar + i*ai) * (br + i*bi): MADD1 ar*br and MADD3 ai*bi go into the real accumulator, MADD2 ai*br and MADD4 ar*bi into the imaginary one; the tables below only choose the signs */
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)  // signs of a * b
+#define    MADD1       MADD
+#define    MADD2       MADD
+#define    MADD3       NMSUB
+#define    MADD4       MADD
+#endif
+
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)  // signs of a * conj(b)
+#define    MADD1       MADD
+#define    MADD2       MADD
+#define    MADD3       MADD
+#define    MADD4       NMSUB
+#endif
+
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)  // signs of conj(a) * b
+#define    MADD1       MADD
+#define    MADD2       NMSUB
+#define    MADD3       MADD
+#define    MADD4       MADD
+#endif
+
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)  // signs of conj(a) * conj(b)
+#define    MADD1       MADD
+#define    MADD2       NMSUB
+#define    MADD3       NMSUB
+#define    MADD4       NMSUB
+#endif
+
+    PROLOGUE
+    // Frame: four integer registers plus seven FP registers, 8 bytes each = 88 bytes.
+    addi.d     $sp,    $sp,   -88   // NOTE(review): 88 is not a multiple of 16; this file makes no calls while the frame is live, so it looks harmless - confirm against the psABI stack alignment rule
+    SDARG      $r23,   $sp,   0     // C3
+    SDARG      $r24,   $sp,   8     // T0
+    SDARG      $r25,   $sp,   16    // T1
+    SDARG      $r26,   $sp,   24    // OFF
+    ST         $f23,   $sp,   32    // c32
+    ST         $f24,   $sp,   40    // c41
+    ST         $f25,   $sp,   48    // c42
+    ST         $f26,   $sp,   56    // c51..c62: saved and restored although this file never writes them
+    ST         $f27,   $sp,   64
+    ST         $f28,   $sp,   72
+    ST         $f29,   $sp,   80
+
+
+#if defined (TRMMKERNEL) && !defined(LEFT)
+    sub.d      OFF,    $r0,   OFFSET   // OFF = -offset
+#else
+    xor        OFF,    OFF,   OFF      // OFF = 0
+#endif
+
+    slli.d     LDC,    LDC,   BASE_SHIFT   // scale ldc by 1 << BASE_SHIFT (BASE_SHIFT is supplied by common.h)
+
+    move       J,      $r0
+    srai.d     T0,     N,     1        // T0 = bn / 2, the number of two-column blocks
+    beq        J,      T0,    .L19     // none: go to the single-column path
+
+.L10:  /* for(j=0; j<bn/2; j+=1): one block of two C columns per pass */
+    move       C0,     C               // C0 = first column of the block
+    slli.d     TL,     LDC,   1        // TL = 2 * scaled ldc, the distance between the two columns
+    add.d      C1,     C0,    TL       // C1 = second column of the block
+    move       A0,     A    //ptrba: restart at the top of the A panel
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET          // the offset restarts for every column block
+#endif
+
+    move       I,      $r0
+    srai.d     T0,     M,     1        // T0 = bm / 2, the number of two-row tiles
+    beq        I,      T0,    .L150    // none: go to the bm&1 row
+
+.L11:  /* for(i=0; i<bm/2; i+=1): one 2x2 tile of C (four complex results) per pass */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+    // TRMM: choose where the k loop starts (A0/B0) and how many steps it runs (TL).
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B     //ptrbb
+#else
+    slli.d     C3,     OFF,   0x05     // OFF * 0x20: both A and B advance 0x20 bytes per k step in this tile
+    add.d      A0,     A0,    C3
+    add.d      B0,     B,     C3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF   // TL = bk - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2     // TL = OFF + 2
+#else
+    addi.d     TL,     OFF,   2     // TL = OFF + 2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0          // load c11 from the zero register, then copy it into the other seven accumulators
+    MOV        c12,    c11
+    MOV        c21,    c11
+    MOV        c22,    c11
+    MOV        c31,    c11
+    MOV        c32,    c11
+    MOV        c41,    c11
+    MOV        c42,    c11
+
+    move       L,      $r0   // k-loop counter
+    srai.d     C2,     TL,     2    // C2 = TL / 4, trips of the 4x unrolled loop
+    beq        L,      C2,    .L130 // C2 == 0: skip the unrolled loop
+    blt        C2,     L,     .L130 // C2 < 0: skip as well
+
+.L12:  /* for(k=0; k<bk/4; k+=1): four k steps per pass, 0x20 bytes of A and of B per step */
+    LD         a1,     A0,    0x00        //load0
+    LD         b1,     B0,    0x00        //load1
+    MADD1      c11,    a1,    b1,     c11  //res0
+    LD         a2,     A0,    0x08        //load2
+    MADD2      c12,    a2,    b1,     c12  //res1
+    LD         b2,     B0,    0x08        //load3
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+    LD         a3,     A0,    0x10        //load4
+    MADD1      c21,    a3,    b1,     c21  //res2
+    LD         a4,     A0,    0x18        //load5
+    MADD2      c22,    a4,    b1,     c22  //res3
+    MADD3      c21,    a4,    b2,     c21
+    MADD4      c22,    a3,    b2,     c22
+    LD         b3,     B0,    0x10        //load6
+    MADD1      c31,    a1,    b3,     c31  //res4
+    MADD2      c32,    a2,    b3,     c32  //res5
+    LD         b4,     B0,    0x18        //load7
+    MADD3      c31,    a2,    b4,     c31
+    MADD4      c32,    a1,    b4,     c32
+    MADD1      c41,    a3,    b3,     c41  //res6
+    MADD2      c42,    a4,    b3,     c42  //res7
+    MADD3      c41,    a4,    b4,     c41
+    MADD4      c42,    a3,    b4,     c42
+    LD         a5,     A0,    0x20        //load8: second k step, using a5..a8 / b5..b8
+    LD         b5,     B0,    0x20        //load9
+    MADD1      c11,    a5,    b5,     c11
+    LD         a6,     A0,    0x28        //load10
+    MADD2      c12,    a6,    b5,     c12
+    LD         b6,     B0,    0x28        //load11
+    MADD3      c11,    a6,    b6,     c11
+    MADD4      c12,    a5,    b6,     c12
+    LD         a7,     A0,    0x30        //load12
+    MADD1      c21,    a7,    b5,     c21
+    LD         a8,     A0,    0x38        //load13
+    MADD2      c22,    a8,    b5,     c22
+    MADD3      c21,    a8,    b6,     c21
+    MADD4      c22,    a7,    b6,     c22
+    LD         b7,     B0,    0x30        //load14
+    MADD1      c31,    a5,    b7,     c31
+    MADD2      c32,    a6,    b7,     c32
+    LD         b8,     B0,    0x38        //load15
+    MADD3      c31,    a6,    b8,     c31
+    MADD4      c32,    a5,    b8,     c32
+    MADD1      c41,    a7,    b7,     c41
+    MADD2      c42,    a8,    b7,     c42
+    MADD3      c41,    a8,    b8,     c41
+    MADD4      c42,    a7,    b8,     c42
+    // third and fourth k step: the same two patterns, 0x40 bytes further into A and B
+    LD         a1,     A0,    0x40        //load0
+    LD         b1,     B0,    0x40        //load1
+    MADD1      c11,    a1,    b1,     c11  //res0
+    LD         a2,     A0,    0x48        //load2
+    MADD2      c12,    a2,    b1,     c12  //res1
+    LD         b2,     B0,    0x48        //load3
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+    LD         a3,     A0,    0x50        //load4
+    MADD1      c21,    a3,    b1,     c21  //res2
+    LD         a4,     A0,    0x58        //load5
+    MADD2      c22,    a4,    b1,     c22  //res3
+    MADD3      c21,    a4,    b2,     c21
+    MADD4      c22,    a3,    b2,     c22
+    LD         b3,     B0,    0x50        //load6
+    MADD1      c31,    a1,    b3,     c31  //res4
+    MADD2      c32,    a2,    b3,     c32  //res5
+    LD         b4,     B0,    0x58        //load7
+    MADD3      c31,    a2,    b4,     c31
+    MADD4      c32,    a1,    b4,     c32
+    MADD1      c41,    a3,    b3,     c41  //res6
+    MADD2      c42,    a4,    b3,     c42  //res7
+    MADD3      c41,    a4,    b4,     c41
+    MADD4      c42,    a3,    b4,     c42
+    LD         a5,     A0,    0x60        //load8
+    LD         b5,     B0,    0x60        //load9
+    MADD1      c11,    a5,    b5,     c11
+    LD         a6,     A0,    0x68        //load10
+    MADD2      c12,    a6,    b5,     c12
+    LD         b6,     B0,    0x68        //load11
+    MADD3      c11,    a6,    b6,     c11
+    MADD4      c12,    a5,    b6,     c12
+    LD         a7,     A0,    0x70        //load12
+    MADD1      c21,    a7,    b5,     c21
+    LD         a8,     A0,    0x78        //load13
+    MADD2      c22,    a8,    b5,     c22
+    MADD3      c21,    a8,    b6,     c21
+    MADD4      c22,    a7,    b6,     c22
+    LD         b7,     B0,    0x70        //load14
+    MADD1      c31,    a5,    b7,     c31
+    MADD2      c32,    a6,    b7,     c32
+    LD         b8,     B0,    0x78        //load15
+    MADD3      c31,    a6,    b8,     c31
+    MADD4      c32,    a5,    b8,     c32
+    MADD1      c41,    a7,    b7,     c41
+    MADD2      c42,    a8,    b7,     c42
+    MADD3      c41,    a8,    b8,     c41
+    MADD4      c42,    a7,    b8,     c42
+
+    addi.d     A0,     A0,    0x80        // four k steps consumed from A
+    addi.d     B0,     B0,    0x80        // four k steps consumed from B
+
+    addi.d     L,      L,     1
+    blt        L,      C2,    .L12
+
+.L130:  /* leftover k steps: C2 = TL & 3 */
+    move       L,      $r0
+    andi       C2,     TL,     3
+    beq        L,      C2,    .L14
+
+.L13:  /* for(k=0; k<(bk&3); k+=1): one k step per pass */
+    LD         a1,     A0,    0x00        //load0
+    LD         b1,     B0,    0x00        //load1
+    MADD1      c11,    a1,    b1,     c11  //res0
+    LD         a2,     A0,    0x08        //load2
+    MADD2      c12,    a2,    b1,     c12  //res1
+    LD         b2,     B0,    0x08        //load3
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+    LD         a3,     A0,    0x10        //load4
+    MADD1      c21,    a3,    b1,     c21  //res2
+    LD         a4,     A0,    0x18        //load5
+    MADD2      c22,    a4,    b1,     c22  //res3
+    MADD3      c21,    a4,    b2,     c21
+    MADD4      c22,    a3,    b2,     c22
+    LD         b3,     B0,    0x10        //load6
+    MADD1      c31,    a1,    b3,     c31  //res4
+    MADD2      c32,    a2,    b3,     c32  //res5
+    LD         b4,     B0,    0x18        //load7
+    MADD3      c31,    a2,    b4,     c31
+    MADD4      c32,    a1,    b4,     c32
+    MADD1      c41,    a3,    b3,     c41  //res6
+    MADD2      c42,    a4,    b3,     c42  //res7
+    MADD3      c41,    a4,    b4,     c41
+    MADD4      c42,    a3,    b4,     c42
+
+    addi.d     A0,     A0,    0x20
+    addi.d     B0,     B0,    0x20
+
+    addi.d     L,      L,     1
+    blt        L,      C2,    .L13
+
+.L14:  /* write the tile back: scale the accumulators by alpha */
+#if defined(TRMMKERNEL)
+    MUL        a5,     c11,   ALPHA_R     // TRMM: C is overwritten with alpha * acc and never read
+    MUL        a6,     c12,   ALPHA_I
+    SUB        a5,     a5,    a6          // real = c11*alpha_r - c12*alpha_i
+    ST         a5,     C0,    0x00
+
+    MUL        a5,     c12,   ALPHA_R
+    MUL        a6,     c11,   ALPHA_I
+    ADD        a6,     a5,    a6          // imag = c12*alpha_r + c11*alpha_i
+    ST         a6,     C0,    0x08
+
+    MUL        a7,     c21,   ALPHA_R
+    MUL        a8,     c22,   ALPHA_I
+    SUB        a7,     a7,    a8
+    ST         a7,     C0,    0x10
+
+    MUL        a7,     c22,   ALPHA_R
+    MUL        a8,     c21,   ALPHA_I
+    ADD        a8,     a7,    a8
+    ST         a8,     C0,    0x18
+
+    MUL        b5,     c31,   ALPHA_R
+    MUL        b6,     c32,   ALPHA_I
+    SUB        b5,     b5,    b6
+    ST         b5,     C1,    0x00
+
+    MUL        b5,     c32,   ALPHA_R
+    MUL        b6,     c31,   ALPHA_I
+    ADD        b6,     b5,    b6
+    ST         b6,     C1,    0x08
+
+    MUL        b7,     c41,   ALPHA_R
+    MUL        b8,     c42,   ALPHA_I
+    SUB        b7,     b7,    b8
+    ST         b7,     C1,    0x10
+
+    MUL        b7,     c42,   ALPHA_R
+    MUL        b8,     c41,   ALPHA_I
+    ADD        b8,     b7,    b8
+    ST         b8,     C1,    0x18
+#else
+    LD         a5,     C0,    0x00    //C0[0]  GEMM: C is read and alpha * acc is added to it
+    LD         a6,     C0,    0x08    //C0[1]
+    LD         a7,     C0,    0x10    //C0[2]
+    LD         a8,     C0,    0x18    //C0[3]
+    LD         b5,     C1,    0x00    //C1[0]
+    LD         b6,     C1,    0x08    //C1[1]
+    LD         b7,     C1,    0x10    //C1[2]
+    LD         b8,     C1,    0x18    //C1[3]
+
+    MADD       a5,     c11,   ALPHA_R, a5   // real += c11*alpha_r - c12*alpha_i
+    MADD       a6,     c12,   ALPHA_R, a6   // imag += c12*alpha_r + c11*alpha_i
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x08
+
+    MADD       a7,     c21,   ALPHA_R, a7
+    MADD       a8,     c22,   ALPHA_R, a8
+    NMSUB      a7,     c22,   ALPHA_I, a7
+    MADD       a8,     c21,   ALPHA_I, a8
+    ST         a7,     C0,    0x10
+    ST         a8,     C0,    0x18
+
+    MADD       b5,     c31,   ALPHA_R, b5
+    MADD       b6,     c32,   ALPHA_R, b6
+    NMSUB      b5,     c32,   ALPHA_I, b5
+    MADD       b6,     c31,   ALPHA_I, b6
+    ST         b5,     C1,    0x00
+    ST         b6,     C1,    0x08
+
+    MADD       b7,     c41,   ALPHA_R, b7
+    MADD       b8,     c42,   ALPHA_R, b8
+    NMSUB      b7,     c42,   ALPHA_I, b7
+    MADD       b8,     c41,   ALPHA_I, b8
+    ST         b7,     C1,    0x10
+    ST         b8,     C1,    0x18
+#endif
+
+#if defined(TRMMKERNEL)
+    // TRMM: step A0/B0 over the k range this tile did not visit, then advance OFF.
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     C3,     TL,   0x05     // (bk - OFF - 2) * 0x20 bytes
+    add.d      A0,     A0,   C3
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x20    // move both column pointers down by the 0x20 bytes just written
+    addi.d     C1,     C1,    0x20
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L11
+
+.L150:  /* odd row of a two-column block: T0 = bm & 1 */
+    move       I,      $r0
+    andi       T0,     M,     1
+    beq        I,      T0,    .L18
+
+.L15:  /* for(i=0; i<(bm&1); i+=1): one 1x2 tile of C (one complex result in each of the two columns) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     C3,     OFF,  0x04     // A advances 0x10 bytes per k step in this tile
+    add.d      A0,     A0,   C3
+    slli.d     C3,     OFF,  0x05     // B advances 0x20 bytes per k step in this tile
+    add.d      B0,     B,    C3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF     // TL = bk - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1       // TL = OFF + 1
+#else
+    addi.d     TL,     OFF,   2       // TL = OFF + 2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0            // clear the four accumulators from the zero register
+    MTC        c12,    $r0
+    MTC        c21,    $r0
+    MTC        c22,    $r0
+
+    move       L,      $r0   // k-loop counter
+    beq        L,      TL,    .L17    // TL == 0: no k steps
+    blt        TL,     L,     .L17    // TL < 0: no k steps
+
+.L16:  /* for (k=0; k<bk; k+=1): one k step per pass, not unrolled */
+    LD         a1,     A0,    0x00        //load0
+    LD         b1,     B0,    0x00        //load1
+    MADD1      c11,    a1,    b1,     c11  //res0
+    LD         a2,     A0,    0x08        //load2
+    MADD2      c12,    a2,    b1,     c12  //res1
+    LD         b2,     B0,    0x08        //load3
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+    LD         b3,     B0,    0x10        //load4
+    MADD1      c21,    a1,    b3,     c21  //res2
+    MADD2      c22,    a2,    b3,     c22  //res3
+    LD         b4,     B0,    0x18        //load5
+    MADD3      c21,    a2,    b4,     c21
+    MADD4      c22,    a1,    b4,     c22
+
+    addi.d     A0,     A0,    0x10
+    addi.d     B0,     B0,    0x20
+
+    addi.d     L,      L,     1
+    blt        L,      TL,     .L16
+
+.L17:  /* write the tile back: c11/c12 go to C0, c21/c22 go to C1 */
+#if defined(TRMMKERNEL)
+    MUL        a5,     c11,   ALPHA_R     // TRMM: C is overwritten with alpha * acc and never read
+    MUL        a6,     c12,   ALPHA_I
+    SUB        a5,     a5,    a6          // real = c11*alpha_r - c12*alpha_i
+    ST         a5,     C0,    0x00
+
+    MUL        a5,     c12,   ALPHA_R
+    MUL        a6,     c11,   ALPHA_I
+    ADD        a6,     a5,    a6          // imag = c12*alpha_r + c11*alpha_i
+    ST         a6,     C0,    0x08
+
+    MUL        b5,     c21,   ALPHA_R
+    MUL        b6,     c22,   ALPHA_I
+    SUB        b5,     b5,    b6
+    ST         b5,     C1,    0x00
+
+    MUL        b5,     c22,   ALPHA_R
+    MUL        b6,     c21,   ALPHA_I
+    ADD        b6,     b5,    b6
+    ST         b6,     C1,    0x08
+#else
+    LD         a5,     C0,    0x00    //C0[0]  GEMM: C is read and alpha * acc is added to it
+    LD         a6,     C0,    0x08    //C0[1]
+    LD         b5,     C1,    0x00    //C1[0]
+    LD         b6,     C1,    0x08    //C1[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x08
+
+    MADD       b5,     c21,   ALPHA_R, b5
+    MADD       b6,     c22,   ALPHA_R, b6
+    NMSUB      b5,     c22,   ALPHA_I, b5
+    MADD       b6,     c21,   ALPHA_I, b6
+    ST         b5,     C1,    0x00
+    ST         b6,     C1,    0x08
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF      // step A0/B0 over the k range this tile did not visit
+#ifdef LEFT
+    addi.d     TL,     TL,   -1
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     C3,     TL,   0x04     // 0x10 bytes of A per skipped k step
+    add.d      A0,     A0,   C3
+    slli.d     C3,     TL,   0x05     // 0x20 bytes of B per skipped k step
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x10    // move both column pointers down by the 0x10 bytes just written
+    addi.d     C1,     C1,    0x10
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L15
+
+.L18:  /* end of one two-column block: advance B and C, then loop on j */
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi.d     OFF,    OFF,   2
+#endif
+
+    slli.d     L,      K,     0x05    // L is free here: bk * 0x20 bytes, the B data consumed by a two-column block
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   0x02    // I is free here: 4 * scaled ldc, twice the C0-to-C1 distance set up in .L10
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     1
+    srai.d     T0,     N,     1       // recompute bn / 2: T0 was reused as the row-loop bound
+    blt        J,      T0,    .L10
+
+.L19:  /* last column when bn is odd: T0 = bn & 1 */
+    move       J,      $r0
+    andi       T0,     N,     1
+    beq        J,      T0,    .L30    // bn is even: nothing left, go to the epilogue
+
+.L20: /* for (j=0; j<(bn&1); j+=1): a single C column, so only C0 is used */
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET
+#endif
+
+    move       C0,     C
+    move       A0,     A    //ptrba: restart at the top of the A panel
+
+    move       I,      $r0
+    srai.d     T0,     M,     1       // T0 = bm / 2, the number of two-row tiles
+    beq        I,      T0,    .L24    // none: go to the bm&1 row
+
+.L21:  /* for (i=0; i<bm/2; i+=1): one 2x1 tile of C (two complex results in the single column) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     C3,     OFF,  0x05     // A advances 0x20 bytes per k step in this tile
+    add.d      A0,     A0,   C3
+    slli.d     C3,     OFF,  0x04     // B advances 0x10 bytes per k step in this tile
+    add.d      B0,     B,    C3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF     // TL = bk - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2       // TL = OFF + 2
+#else
+    addi.d     TL,     OFF,   1       // TL = OFF + 1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0            // clear the four accumulators from the zero register
+    MTC        c12,    $r0
+    MTC        c21,    $r0
+    MTC        c22,    $r0
+
+    move       L,      $r0   // k-loop counter
+    beq        L,      TL,    .L23    // TL == 0: no k steps
+    blt        TL,     L,     .L23    // TL < 0: no k steps
+
+.L22:  /* for (k=0; k<bk; k+=1): one k step per pass, not unrolled */
+    LD         a1,     A0,    0x00        //load0
+    LD         b1,     B0,    0x00        //load1
+    MADD1      c11,    a1,    b1,     c11  //res0
+    LD         a2,     A0,    0x08        //load2
+    MADD2      c12,    a2,    b1,     c12  //res1
+    LD         b2,     B0,    0x08        //load3
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+    LD         a3,     A0,    0x10        //load4
+    MADD1      c21,    a3,    b1,     c21  //res2
+    LD         a4,     A0,    0x18        //load5
+    MADD2      c22,    a4,    b1,     c22  //res3
+    MADD3      c21,    a4,    b2,     c21
+    MADD4      c22,    a3,    b2,     c22
+
+    addi.d     A0,     A0,    0x20
+    addi.d     B0,     B0,    0x10
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L22
+
+.L23:  /* write the tile back: all four values go to C0 */
+#if defined(TRMMKERNEL)
+    MUL        a5,     c11,   ALPHA_R     // TRMM: C is overwritten with alpha * acc and never read
+    MUL        a6,     c12,   ALPHA_I
+    SUB        a5,     a5,    a6          // real = c11*alpha_r - c12*alpha_i
+    ST         a5,     C0,    0x00
+
+    MUL        a5,     c12,   ALPHA_R
+    MUL        a6,     c11,   ALPHA_I
+    ADD        a6,     a5,    a6          // imag = c12*alpha_r + c11*alpha_i
+    ST         a6,     C0,    0x08
+
+    MUL        a7,     c21,   ALPHA_R
+    MUL        a8,     c22,   ALPHA_I
+    SUB        a7,     a7,    a8
+    ST         a7,     C0,    0x10
+
+    MUL        a7,     c22,   ALPHA_R
+    MUL        a8,     c21,   ALPHA_I
+    ADD        a8,     a7,    a8
+    ST         a8,     C0,    0x18
+#else
+    LD         a5,     C0,    0x00    //C0[0]  GEMM: C is read and alpha * acc is added to it
+    LD         a6,     C0,    0x08    //C0[1]
+    LD         a7,     C0,    0x10    //C0[2]
+    LD         a8,     C0,    0x18    //C0[3]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+    MADD       a7,     c21,   ALPHA_R, a7
+    MADD       a8,     c22,   ALPHA_R, a8
+    NMSUB      a7,     c22,   ALPHA_I, a7
+    MADD       a8,     c21,   ALPHA_I, a8
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x08
+    ST         a7,     C0,    0x10
+    ST         a8,     C0,    0x18
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF      // step A0/B0 over the k range this tile did not visit
+#ifdef LEFT
+    addi.d     TL,     TL,   -2
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     C3,     TL,   0x05     // 0x20 bytes of A per skipped k step
+    add.d      A0,     A0,   C3
+    slli.d     C3,     TL,   0x04     // 0x10 bytes of B per skipped k step
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x20    // move down by the 0x20 bytes just written
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L21
+
+.L24:  /* odd row of the single column: the bound is kept in T1 here, not T0 */
+    move       I,      $r0
+    andi       T1,     M,     1    //bm&1
+    beq        I,      T1,    .L28
+
+.L25:  /* for (i=0; i<(bm&1); i+=1): one 1x1 tile of C (a single complex result) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     C3,     OFF,   0x04    // both A and B advance 0x10 bytes per k step in this tile
+    add.d      A0,     A0,   C3
+    add.d      B0,     B,    C3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF     // TL = bk - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1       // TL = OFF + 1
+#else
+    addi.d     TL,     OFF,   1       // TL = OFF + 1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0            // clear the two accumulators from the zero register
+    MTC        c12,    $r0
+
+    move       L,      $r0   // k-loop counter
+    beq        L,      TL,    .L27    // TL == 0: no k steps
+    blt        TL,     L,     .L27    // TL < 0: no k steps
+
+.L26:  /* for (k=0; k<bk; k+=1): one complex multiply-accumulate per pass */
+    LD         a1,     A0,    0x00        //load0
+    LD         b1,     B0,    0x00        //load1
+    MADD1      c11,    a1,    b1,     c11  //res0
+    LD         a2,     A0,    0x08        //load2
+    MADD2      c12,    a2,    b1,     c12  //res1
+    LD         b2,     B0,    0x08        //load3
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+
+    addi.d     A0,     A0,    0x10
+    addi.d     B0,     B0,    0x10
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L26
+
+.L27:  /* write the single result back to C0 */
+#if defined(TRMMKERNEL)
+    MUL        a5,     c11,   ALPHA_R     // TRMM: C is overwritten with alpha * acc and never read
+    MUL        a6,     c12,   ALPHA_I
+    SUB        a5,     a5,    a6          // real = c11*alpha_r - c12*alpha_i
+    ST         a5,     C0,    0x00
+
+    MUL        a5,     c12,   ALPHA_R
+    MUL        a6,     c11,   ALPHA_I
+    ADD        a6,     a5,    a6          // imag = c12*alpha_r + c11*alpha_i
+    ST         a6,     C0,    0x08
+#else
+    LD         a5,     C0,    0x00    //C0[0]  GEMM: C is read and alpha * acc is added to it
+    LD         a6,     C0,    0x08    //C0[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x08
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF      // step A0/B0 over the k range this tile did not visit
+#ifdef LEFT
+    addi.d     TL,     TL,   -1
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     C3,     TL,   0x04     // 0x10 bytes of A and of B per skipped k step
+    add.d      A0,     A0,   C3
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x10    // move down by the 0x10 bytes just written
+
+    addi.d     I,      I,     1
+    blt        I,      T1,    .L25
+
+.L28:  /* end of the single-column block: advance B and C, then loop on j */
+    slli.d     L,      K,     4       // L is free here: bk * 0x10 bytes, the B data consumed by one column
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   1       // I is free here: 2 * scaled ldc, the same one-column distance used for C1 in .L10
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     1
+    andi       T0,     N,     1       // recompute bn & 1: T0 was reused as the row-loop bound
+    blt        J,      T0,    .L20
+
+.L30:  /* epilogue: reload every register the prologue saved, from the same slots */
+    LDARG      $r23,   $sp,   0
+    LDARG      $r24,   $sp,   8
+    LDARG      $r25,   $sp,   16
+    LDARG      $r26,   $sp,   24
+    LD         $f23,   $sp,   32
+    LD         $f24,   $sp,   40
+    LD         $f25,   $sp,   48
+    LD         $f26,   $sp,   56
+    LD         $f27,   $sp,   64
+    LD         $f28,   $sp,   72
+    LD         $f29,   $sp,   80
+
+    addi.d     $sp,    $sp,   88      // release the 88-byte frame
+    jirl       $r0,    $r1,   0x0     // jump to the address in $r1 without writing a link register (function return)
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/zgemm_kernel_2x2_lasx.S b/kernel/loongarch64/zgemm_kernel_2x2_lasx.S
new file mode 100644
index 000000000..152190d4f
--- /dev/null
+++ b/kernel/loongarch64/zgemm_kernel_2x2_lasx.S
@@ -0,0 +1,822 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+
+/* Function parameters */
+#define M      $r4   // param 1: bm (rows; handled as bm/2 pairs plus a bm&1 remainder)
+#define N      $r5   // param 2: bn (columns; handled as bn/2 pairs plus a bn&1 remainder)
+#define K      $r6   // param 3: bk (length of the k loops)
+#define ALPHA_R $f0   // param 4: alphar (real part of alpha)
+#define ALPHA_I $f1   // param 5: alphai (imaginary part of alpha)
+#define A      $r7   // param 6: ba (base of the A data; only copied into A0)
+#define B      $r8  // param 7: bb (base of the B data; advanced after each column block)
+#define C      $r9  // param 8: bc (base of the C data; advanced after each column block)
+#define LDC    $r10  // param 9: ldc (shifted left by BASE_SHIFT on entry)
+
+#if defined (TRMMKERNEL)
+#define OFFSET $r11  // param 10: offset
+#endif
+#define OFF    $r26  // running TRMM offset: -OFFSET for TRMMKERNEL && !LEFT, otherwise 0 at entry
+
+#define I      $r12  // row-block loop counter; also scratch when advancing C
+#define J      $r13  // column-block loop counter
+#define L      $r14  // k loop counter; also scratch when advancing B
+#define TL     $r15  // k trip count for the current tile; also scratch
+#define A0     $r16  // moving pointer into A
+#define B0     $r17  // moving pointer into B
+#define C0     $r18  // moving pointer into the first C column of the block
+#define C1     $r19  // moving pointer into the second C column of the block
+#define C2     $r20  // in this file: trip count of the vector k loops, not a C pointer
+#define C3     $r23  // in this file: scratch for TRMM pointer offsets
+#define T0     $r24  // loop bound for I and J
+#define T1     $r25  // loop bound (bm&1) in the single-column tail
+
+#define a1     $f2   // scalar tails: A double at offset 0x00
+#define a2     $f3   // scalar tails: A double at offset 0x08
+#define a3     $f4   // scalar tails: A double at offset 0x10
+#define a4     $f5   // scalar tails: A double at offset 0x18
+#define a5     $f6   // scalar tails: temporary for C0 write-back
+#define a6     $f7
+#define a7     $f8
+#define a8     $f9
+#define b1     $f10  // scalar tails: B double at offset 0x00
+#define b2     $f11  // scalar tails: B double at offset 0x08
+#define b3     $f12  // scalar tails: B double at offset 0x10
+#define b4     $f13  // scalar tails: B double at offset 0x18
+#define b5     $f14  // scalar tails: temporary for C1 write-back
+#define b6     $f15
+#define b7     $f16  // not referenced in this file
+#define b8     $f17  // not referenced in this file
+#define c11    $f18  // scalar accumulator res0
+#define c12    $f19  // scalar accumulator res1
+#define c21    $f20  // scalar accumulator res2
+#define c22    $f21  // scalar accumulator res3
+#define c31    $f22  // c31..c42 are not referenced in this file
+#define c32    $f23
+#define c41    $f24
+#define c42    $f25
+
+/* LASX vectors */
+#define U0     $xr30  // vector accumulator: res 0 2 4 6
+#define U1     $xr31  // vector accumulator: res 1 3 5 7
+#define U2     $xr2   // U2..U7 are not referenced in this file
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define U8     $xr8   // C0 data for load and store in the 2x2 write-back
+#define U9     $xr9   // C1 data for load and store in the 2x2 write-back
+#define U10    $xr10  // de-interleaved C elements 0 2 4 6
+#define U11    $xr11  // de-interleaved C elements 1 3 5 7
+#define U12    $xr12  // U12..U15 are not referenced in this file
+#define U13    $xr13
+#define U14    $xr14
+#define U15    $xr15
+#define D0     $xr16  // raw 32-byte load from A0
+#define D1     $xr17  // raw 32-byte load from B0
+#define D2     $xr18  // A permute: a0 a2 a0 a2
+#define D3     $xr19  // A permute: a1 a3 a1 a3
+#define D4     $xr20  // B permute: b0 b0 b2 b2
+#define D5     $xr21  // B permute: b1 b1 b3 b3
+#define D6     $xr22  // D6..D11 are not referenced in this file
+#define D7     $xr23
+#define D8     $xr24
+#define D9     $xr25
+#define D10    $xr26
+#define D11    $xr27
+#define D12    $xr28  // same register as VALPHAR
+#define D13    $xr29  // same register as VALPHAI
+#define VALPHAR $xr28  // alpha real part, filled by xvldrepl.d in the prologue; aliases D12
+#define VALPHAI $xr29  // alpha imaginary part, filled by xvldrepl.d in the prologue; aliases D13
+
+
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#define    XVMADD1       XVFMADD  // U0 (res 0 2 4 6) term: A even doubles * B even doubles
+#define    XVMADD2       XVFMADD  // U1 (res 1 3 5 7) term: A odd doubles * B even doubles
+#define    XVMADD3       XVNMSUB  // U0 term: A odd * B odd; NMSUB presumably subtracts the product (macro defined outside this file - confirm)
+#define    XVMADD4       XVFMADD  // U1 term: A even doubles * B odd doubles
+
+#define    MADD1       MADD     // scalar counterparts: same roles as XVMADD1..4 on c11/c12 and c21/c22
+#define    MADD2       MADD
+#define    MADD3       NMSUB
+#define    MADD4       MADD
+#endif
+
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+#define    XVMADD1       XVFMADD
+#define    XVMADD2       XVFMADD
+#define    XVMADD3       XVFMADD
+#define    XVMADD4       XVNMSUB  // only the (A even * B odd) term uses the NMSUB form here
+
+#define    MADD1       MADD
+#define    MADD2       MADD
+#define    MADD3       MADD
+#define    MADD4       NMSUB
+#endif
+
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+#define    XVMADD1       XVFMADD
+#define    XVMADD2       XVNMSUB  // only the (A odd * B even) term uses the NMSUB form here
+#define    XVMADD3       XVFMADD
+#define    XVMADD4       XVFMADD
+
+#define    MADD1       MADD
+#define    MADD2       NMSUB
+#define    MADD3       MADD
+#define    MADD4       MADD
+#endif
+
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+#define    XVMADD1       XVFMADD
+#define    XVMADD2       XVNMSUB  // every term except (A even * B even) uses the NMSUB form here
+#define    XVMADD3       XVNMSUB
+#define    XVMADD4       XVNMSUB
+
+#define    MADD1       MADD
+#define    MADD2       NMSUB
+#define    MADD3       NMSUB
+#define    MADD4       NMSUB
+#endif
+
+    PROLOGUE
+
+    addi.d     $sp,    $sp,   -128    // 128-byte frame: 5 GPR slots, 9 FPR slots, 2 alpha slots
+    SDARG      $r23,   $sp,   0
+    SDARG      $r24,   $sp,   8
+    SDARG      $r25,   $sp,   16
+    SDARG      $r26,   $sp,   24
+    SDARG      $r27,   $sp,   32      // saved and restored, but not otherwise referenced in this file
+    ST         $f23,   $sp,   40
+    ST         $f24,   $sp,   48
+    ST         $f25,   $sp,   56
+    ST         $f26,   $sp,   64
+    ST         $f27,   $sp,   72
+    ST         $f28,   $sp,   80      // must be saved before VALPHAR ($xr28) is written below
+    ST         $f29,   $sp,   88      // must be saved before VALPHAI ($xr29) is written below
+    ST         $f30,   $sp,   96
+    ST         $f31,   $sp,   104
+    ST         ALPHA_R,$sp,   112     // spill alpha so it can be reloaded as a replicated vector
+    ST         ALPHA_I,$sp,   120
+
+    xvldrepl.d  VALPHAR, $sp, 112     // VALPHAR = alpha_r in every 64-bit lane
+    xvldrepl.d  VALPHAI, $sp, 120     // VALPHAI = alpha_i in every 64-bit lane
+
+#if defined (TRMMKERNEL) && !defined(LEFT)
+    sub.d      OFF,    $r0,   OFFSET  // OFF = -OFFSET
+#else
+    xor        OFF,    OFF,   OFF     // OFF = 0
+#endif
+
+    slli.d     LDC,    LDC,   BASE_SHIFT  // scale ldc once; column strides below are multiples of this value
+
+    move       J,      $r0
+    srai.d     T0,     N,     1       // T0 = bn/2 column pairs
+    beq        J,      T0,    .L19    // no column pair: go straight to the bn&1 tail
+
+.L10:  /* for(j=0; j<bn/2; j+=1): two columns of C per iteration */
+    move       C0,     C              // C0 -> first column of the pair
+    slli.d     TL,     LDC,   1       // TL = distance from one C column to the next (2 * scaled LDC)
+    add.d      C1,     C0,    TL      // C1 -> second column of the pair
+    move       A0,     A    //ptrba: A restarts for every column pair
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET         // LEFT: the offset restarts for every column pair
+#endif
+
+    move       I,      $r0
+    srai.d     T0,     M,     1       // T0 = bm/2 row pairs
+    beq        I,      T0,    .L150   // no row pair: go to the bm&1 remainder
+
+.L11:  /* for(i=0; i<bm/2; i+=1): set up one 2x2 tile */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B     //ptrbb (repeats the move above)
+#else
+    slli.d     C3,     OFF,   0x05    // OFF * 0x20: the k loops advance A0 and B0 by 0x20 per k step
+    add.d      A0,     A0,    C3      // skip the first OFF k steps of A
+    add.d      B0,     B,     C3      // skip the first OFF k steps of B
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF   //temp: k steps left after the skipped ones
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2       // LEFT: k count = OFF + 2 (tile rows)
+#else
+    addi.d     TL,     OFF,   2       // !LEFT: k count = OFF + 2 (tile columns)
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    xvxor.v    U0,     U0,   U0       // clear accumulator res 0 2 4 6
+    move       L,      $r0   //cycle param k
+    srai.d     C2,     TL,     2      // C2 = TL/4 passes of the 4x unrolled loop
+    xvxor.v    U1,     U1,   U1       // clear accumulator res 1 3 5 7
+    beq        L,      C2,    .L130   // these two branches skip the unrolled loop when C2 <= 0
+    blt        C2,     L,     .L130
+
+.L12:  /* for(k=0; k<TL/4; k+=1): four k steps per pass (TL = bk outside TRMM) */
+    xvld       D0,     A0,    0x00  //a0 a1 a2 a3
+    xvld       D1,     B0,    0x00  //b0 b1 b2 b3
+
+    xvpermi.d  D4,     D1,    0xa0    //b0 b0 b2 b2
+    xvpermi.d  D5,     D1,    0xf5    //b1 b1 b3 b3
+    xvpermi.d  D2,     D0,    0x88    //a0 a2 a0 a2
+    xvpermi.d  D3,     D0,    0xdd    //a1 a3 a1 a3
+
+    XVMADD1      U0,     D2,    D4,     U0  //res0 2 4 6
+    XVMADD2      U1,     D3,    D4,     U1  //res1 3 5 7
+    XVMADD3      U0,     D3,    D5,     U0
+    XVMADD4      U1,     D2,    D5,     U1
+
+    xvld       D0,     A0,    0x20  //a0 a1 a2 a3 (second k step)
+    xvld       D1,     B0,    0x20  //b0 b1 b2 b3
+
+    xvpermi.d  D4,     D1,    0xa0    //b0 b0 b2 b2
+    xvpermi.d  D5,     D1,    0xf5    //b1 b1 b3 b3
+    xvpermi.d  D2,     D0,    0x88    //a0 a2 a0 a2
+    xvpermi.d  D3,     D0,    0xdd    //a1 a3 a1 a3
+
+    XVMADD1      U0,     D2,    D4,     U0  //res0 2 4 6
+    XVMADD2      U1,     D3,    D4,     U1  //res1 3 5 7
+    XVMADD3      U0,     D3,    D5,     U0
+    XVMADD4      U1,     D2,    D5,     U1
+
+    xvld       D0,     A0,    0x40  //a0 a1 a2 a3 (third k step)
+    xvld       D1,     B0,    0x40  //b0 b1 b2 b3
+
+    xvpermi.d  D4,     D1,    0xa0    //b0 b0 b2 b2
+    xvpermi.d  D5,     D1,    0xf5    //b1 b1 b3 b3
+    xvpermi.d  D2,     D0,    0x88    //a0 a2 a0 a2
+    xvpermi.d  D3,     D0,    0xdd    //a1 a3 a1 a3
+
+    XVMADD1      U0,     D2,    D4,     U0  //res0 2 4 6
+    XVMADD2      U1,     D3,    D4,     U1  //res1 3 5 7
+    XVMADD3      U0,     D3,    D5,     U0
+    XVMADD4      U1,     D2,    D5,     U1
+
+    xvld       D0,     A0,    0x60  //a0 a1 a2 a3 (fourth k step)
+    xvld       D1,     B0,    0x60  //b0 b1 b2 b3
+
+    xvpermi.d  D4,     D1,    0xa0    //b0 b0 b2 b2
+    xvpermi.d  D5,     D1,    0xf5    //b1 b1 b3 b3
+    xvpermi.d  D2,     D0,    0x88    //a0 a2 a0 a2
+    xvpermi.d  D3,     D0,    0xdd    //a1 a3 a1 a3
+
+    XVMADD1      U0,     D2,    D4,     U0  //res0 2 4 6
+    XVMADD2      U1,     D3,    D4,     U1  //res1 3 5 7
+    XVMADD3      U0,     D3,    D5,     U0
+    XVMADD4      U1,     D2,    D5,     U1
+
+    addi.d     A0,     A0,    0x80    // 4 k steps * 0x20
+    addi.d     B0,     B0,    0x80    // 4 k steps * 0x20
+
+    addi.d     L,      L,     1
+    blt        L,      C2,    .L12
+
+.L130:  // remainder of the 2x2 k loop: the TL&3 steps left over by the unrolled loop
+    move       L,      $r0
+    andi       C2,     TL,     3      // C2 = TL & 3
+    beq        L,      C2,    .L14    // nothing left: go to the write-back
+
+.L13:  /* for(k=0; k<(TL&3); k+=1): one k step per pass (TL = bk outside TRMM) */
+    xvld       D0,     A0,    0x00  //a0 a1 a2 a3
+    xvld       D1,     B0,    0x00  //b0 b1 b2 b3
+
+    xvpermi.d  D4,     D1,    0xa0    //b0 b0 b2 b2
+    xvpermi.d  D5,     D1,    0xf5    //b1 b1 b3 b3
+    xvpermi.d  D2,     D0,    0x88    //a0 a2 a0 a2
+    xvpermi.d  D3,     D0,    0xdd    //a1 a3 a1 a3
+
+    XVMADD1      U0,     D2,    D4,     U0  //res0 2 4 6
+    XVMADD2      U1,     D3,    D4,     U1  //res1 3 5 7
+    XVMADD3      U0,     D3,    D5,     U0
+    XVMADD4      U1,     D2,    D5,     U1
+
+    addi.d     A0,     A0,    0x20    // one k step of A
+    addi.d     B0,     B0,    0x20    // one k step of B
+
+    addi.d     L,      L,     1
+    blt        L,      C2,    .L13
+
+.L14:  // write-back of the 2x2 tile: scale U0/U1 by alpha, then store to C0 and C1
+#if defined(TRMMKERNEL)
+    xvld       U8,     C0,    0x00  //0 1 2 3
+    xvld       U9,     C1,    0x00  //4 5 6 7
+
+    xvpackev.d U10,    U9,    U8    // NOTE(review): U10/U11 built here are overwritten by xvfmul.d below,
+    xvpermi.d  U10,    U10,   0xd8  //0 2 4 6
+    xvpackod.d U11,    U9,    U8    // so in this TRMM path the two C loads look unused - confirm
+    xvpermi.d  U11,    U11,   0xd8  //1 3 5 7
+
+    xvfmul.d      U10,    U0,    VALPHAR   // U10 = res(0 2 4 6) * alpha_r
+    xvfmul.d      U11,    U1,    VALPHAR   // U11 = res(1 3 5 7) * alpha_r
+    XVNMSUB     U10,    U1,    VALPHAI, U10   // presumably U10 -= res(1 3 5 7) * alpha_i (macro defined outside this file)
+    XVFMADD      U11,    U0,    VALPHAI, U11   // presumably U11 += res(0 2 4 6) * alpha_i
+
+    xvand.v    U8,     U10,   U10  //0 2 4 6 (plain copy of U10)
+    xvpermi.q  U8,     U11,   0x02 //0 2 1 3
+    xvpermi.d  U8,     U8,    0xd8 //0 1 2 3
+
+    xvand.v    U9,     U11,   U11  //1 3 5 7 (plain copy of U11)
+    xvpermi.q  U9,     U10,   0x31 //4 6 5 7
+    xvpermi.d  U9,     U9,    0xd8 //4 5 6 7
+
+    xvst       U8,     C0,    0x00
+    xvst       U9,     C1,    0x00
+#else
+    xvld       U8,     C0,    0x00  //0 1 2 3
+    xvld       U9,     C1,    0x00  //4 5 6 7
+
+    xvpackev.d U10,    U9,    U8    // gather the even doubles of both C columns
+    xvpermi.d  U10,    U10,   0xd8  //0 2 4 6
+    xvpackod.d U11,    U9,    U8    // gather the odd doubles of both C columns
+    xvpermi.d  U11,    U11,   0xd8  //1 3 5 7
+
+    XVFMADD      U10,    U0,    VALPHAR, U10   // presumably C(0 2 4 6) += res(0 2 4 6) * alpha_r
+    XVFMADD      U11,    U1,    VALPHAR, U11   // presumably C(1 3 5 7) += res(1 3 5 7) * alpha_r
+    XVNMSUB     U10,    U1,    VALPHAI, U10   // presumably C(0 2 4 6) -= res(1 3 5 7) * alpha_i
+    XVFMADD      U11,    U0,    VALPHAI, U11   // presumably C(1 3 5 7) += res(0 2 4 6) * alpha_i
+
+    xvand.v    U8,     U10,   U10  //0 2 4 6 (plain copy of U10)
+    xvpermi.q  U8,     U11,   0x02 //0 2 1 3
+    xvpermi.d  U8,     U8,    0xd8 //0 1 2 3
+
+    xvand.v    U9,     U11,   U11  //1 3 5 7 (plain copy of U11)
+    xvpermi.q  U9,     U10,   0x31 //4 6 5 7
+    xvpermi.d  U9,     U9,    0xd8 //4 5 6 7
+
+    xvst       U8,     C0,    0x00
+    xvst       U9,     C1,    0x00
+#endif
+
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2       // TL = K - OFF - 2: k steps this tile did not consume
+#else
+    addi.d     TL,     TL,   -2       // same value as the LEFT branch for the 2x2 tile
+#endif
+    slli.d     C3,     TL,   0x05     // TL * 0x20
+    add.d      A0,     A0,   C3       // move A0 past the unconsumed k steps
+    add.d      B0,     B0,   C3       // B0 is reloaded from B at the top of .L11
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2       // LEFT: offset grows by the 2 rows just finished
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x20    // next row pair in the first column
+    addi.d     C1,     C1,    0x20    // next row pair in the second column
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L11
+
+.L150:  // bm&1 remainder of the two-column block: a 1x2 tile handled with scalar FP
+    move       I,      $r0
+    andi       T0,     M,     1       // T0 = bm & 1
+    beq        I,      T0,    .L18
+
+.L15:  /* for(i=0; i<(bm&1); i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb (repeats the move above)
+#else
+    slli.d     C3,     OFF,  0x04     // OFF * 0x10: .L16 advances A0 by 0x10 per k step
+    add.d      A0,     A0,   C3
+    slli.d     C3,     OFF,  0x05     // OFF * 0x20: .L16 advances B0 by 0x20 per k step
+    add.d      B0,     B,    C3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF     // k steps left after the skipped ones
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1       // LEFT: k count = OFF + 1 (tile rows)
+#else
+    addi.d     TL,     OFF,   2       // !LEFT: k count = OFF + 2 (tile columns)
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0            // clear res0
+    MTC        c12,    $r0            // clear res1
+    MTC        c21,    $r0            // clear res2
+    MTC        c22,    $r0            // clear res3
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L17    // these two branches skip the loop when TL <= 0
+    blt        TL,     L,     .L17
+
+.L16:  /* for (k=0; k<TL; k+=1): 1 row x 2 columns per k step (TL = bk outside TRMM) */
+    LD         a1,     A0,    0x00        //load0
+    LD         b1,     B0,    0x00        //load1
+    LD         a2,     A0,    0x08        //load2
+    LD         b2,     B0,    0x08        //load3
+    LD         b3,     B0,    0x10        //load4
+    LD         b4,     B0,    0x18        //load5
+
+    MADD1      c11,    a1,    b1,     c11  //res0
+    MADD2      c12,    a2,    b1,     c12  //res1
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+
+    MADD1      c21,    a1,    b3,     c21  //res2
+    MADD2      c22,    a2,    b3,     c22  //res3
+    MADD3      c21,    a2,    b4,     c21
+    MADD4      c22,    a1,    b4,     c22
+
+    addi.d     A0,     A0,    0x10    // one k step of A (one row)
+    addi.d     B0,     B0,    0x20    // one k step of B (two columns)
+
+    addi.d     L,      L,     1
+    blt        L,      TL,     .L16
+
+.L17:  // write-back of the 1x2 tile: (c11,c12) goes to C0 and (c21,c22) goes to C1
+#if defined(TRMMKERNEL)
+    MUL        a5,     c11,   ALPHA_R
+    MUL        a6,     c12,   ALPHA_I
+    SUB        a5,     a5,    a6      // c11*alpha_r - c12*alpha_i
+    ST         a5,     C0,    0x00
+
+    MUL        a5,     c12,   ALPHA_R
+    MUL        a6,     c11,   ALPHA_I
+    ADD        a6,     a5,    a6      // c12*alpha_r + c11*alpha_i
+    ST         a6,     C0,    0x08
+
+    MUL        b5,     c21,   ALPHA_R
+    MUL        b6,     c22,   ALPHA_I
+    SUB        b5,     b5,    b6      // c21*alpha_r - c22*alpha_i
+    ST         b5,     C1,    0x00
+
+    MUL        b5,     c22,   ALPHA_R
+    MUL        b6,     c21,   ALPHA_I
+    ADD        b6,     b5,    b6      // c22*alpha_r + c21*alpha_i
+    ST         b6,     C1,    0x08
+#else
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x08    //C0[1]
+    LD         b5,     C1,    0x00    //C1[0]
+    LD         b6,     C1,    0x08    //C1[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5   // same alpha products as the TRMM path, accumulated onto the loaded C values
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    MADD       b5,     c21,   ALPHA_R, b5
+    MADD       b6,     c22,   ALPHA_R, b6
+    NMSUB      b5,     c22,   ALPHA_I, b5
+    MADD       b6,     c21,   ALPHA_I, b6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x08
+
+    ST         b5,     C1,    0x00
+    ST         b6,     C1,    0x08
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1       // LEFT: TL = K - OFF - 1 (tile rows)
+#else
+    addi.d     TL,     TL,   -2       // !LEFT: TL = K - OFF - 2 (tile columns)
+#endif
+    slli.d     C3,     TL,   0x04     // TL * 0x10 for A
+    add.d      A0,     A0,   C3
+    slli.d     C3,     TL,   0x05     // TL * 0x20 for B
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1       // LEFT: offset grows by the 1 row just finished
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x10    // next row in the first column
+    addi.d     C1,     C1,    0x10    // next row in the second column
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L15
+
+.L18:  // end of one column pair: advance OFF, B and C, then loop over j
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi.d     OFF,    OFF,   2       // !LEFT: offset grows by the 2 columns just finished
+#endif
+
+    slli.d     L,      K,     0x05    // K * 0x20: amount of B used by one column pair
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   0x02    // 4 * scaled LDC = two C columns (one column is LDC<<1 in .L10)
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     1
+    srai.d     T0,     N,     1       // T0 was reused as the row bound, so recompute bn/2
+    blt        J,      T0,    .L10
+
+.L19:  // bn&1 tail: at most one remaining single column
+    move       J,      $r0
+    andi       T0,     N,     1       // T0 = bn & 1
+    beq        J,      T0,    .L30    // no odd column: go to the epilogue
+
+.L20: /* for (j=0; j<(bn&1); j+=1): single remaining column of C */
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET         // LEFT: the offset restarts for this column
+#endif
+
+    move       C0,     C
+    move       A0,     A    //ptrba
+
+    move       I,      $r0
+    srai.d     T0,     M,     1       // T0 = bm/2 row pairs
+    beq        I,      T0,    .L24    // no row pair: go to the bm&1 remainder
+
+.L21:  /* for (i=0; i<bm/2; i+=1): set up one 2x1 tile */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb (repeats the move above)
+#else
+    slli.d     C3,     OFF,  0x05     // OFF * 0x20: .L22 advances A0 by 0x20 per k step
+    add.d      A0,     A0,   C3
+    slli.d     C3,     OFF,  0x04     // OFF * 0x10: .L22 advances B0 by 0x10 per k step
+    add.d      B0,     B,    C3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF     // k steps left after the skipped ones
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2       // LEFT: k count = OFF + 2 (tile rows)
+#else
+    addi.d     TL,     OFF,   1       // !LEFT: k count = OFF + 1 (tile columns)
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0            // clear res0
+    MTC        c12,    $r0            // clear res1
+    MTC        c21,    $r0            // clear res2
+    MTC        c22,    $r0            // clear res3
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L23    // these two branches skip the loop when TL <= 0
+    blt        TL,     L,     .L23
+
+.L22:  /* for (k=0; k<TL; k+=1): 2 rows x 1 column per k step (TL = bk outside TRMM) */
+    LD         a1,     A0,    0x00        //load0
+    LD         b1,     B0,    0x00        //load1
+    LD         a2,     A0,    0x08        //load2
+    LD         b2,     B0,    0x08        //load3
+    LD         a3,     A0,    0x10        //load4
+    LD         a4,     A0,    0x18        //load5
+
+    MADD1      c11,    a1,    b1,     c11  //res0
+    MADD2      c12,    a2,    b1,     c12  //res1
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+
+    MADD1      c21,    a3,    b1,     c21  //res2
+    MADD2      c22,    a4,    b1,     c22  //res3
+    MADD3      c21,    a4,    b2,     c21
+    MADD4      c22,    a3,    b2,     c22
+
+    addi.d     A0,     A0,    0x20    // one k step of A (two rows)
+    addi.d     B0,     B0,    0x10    // one k step of B (one column)
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L22
+
+.L23:  // write-back of the 2x1 tile: (c11,c12) to C0+0x00 and (c21,c22) to C0+0x10
+#if defined(TRMMKERNEL)
+    MUL        a5,     c11,   ALPHA_R
+    MUL        a6,     c12,   ALPHA_I
+    SUB        a5,     a5,    a6      // c11*alpha_r - c12*alpha_i
+    ST         a5,     C0,    0x00
+
+    MUL        a5,     c12,   ALPHA_R
+    MUL        a6,     c11,   ALPHA_I
+    ADD        a6,     a5,    a6      // c12*alpha_r + c11*alpha_i
+    ST         a6,     C0,    0x08
+
+    MUL        a7,     c21,   ALPHA_R
+    MUL        a8,     c22,   ALPHA_I
+    SUB        a7,     a7,    a8      // c21*alpha_r - c22*alpha_i
+    ST         a7,     C0,    0x10
+
+    MUL        a7,     c22,   ALPHA_R
+    MUL        a8,     c21,   ALPHA_I
+    ADD        a8,     a7,    a8      // c22*alpha_r + c21*alpha_i
+    ST         a8,     C0,    0x18
+#else
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x08    //C0[1]
+    LD         a7,     C0,    0x10    //C0[2]
+    LD         a8,     C0,    0x18    //C0[3]
+
+    MADD       a5,     c11,   ALPHA_R, a5   // same alpha products as the TRMM path, accumulated onto the loaded C values
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+    MADD       a7,     c21,   ALPHA_R, a7
+    MADD       a8,     c22,   ALPHA_R, a8
+    NMSUB      a7,     c22,   ALPHA_I, a7
+    MADD       a8,     c21,   ALPHA_I, a8
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x08
+
+    ST         a7,     C0,    0x10
+    ST         a8,     C0,    0x18
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2       // LEFT: TL = K - OFF - 2 (tile rows)
+#else
+    addi.d     TL,     TL,   -1       // !LEFT: TL = K - OFF - 1 (tile columns)
+#endif
+    slli.d     C3,     TL,   0x05     // TL * 0x20 for A
+    add.d      A0,     A0,   C3
+    slli.d     C3,     TL,   0x04     // TL * 0x10 for B
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2       // LEFT: offset grows by the 2 rows just finished
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x20    // next row pair in this column
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L21
+
+.L24:  // bm&1 remainder of the single-column tail: a 1x1 tile
+    move       I,      $r0
+    andi       T1,     M,     1    //bm&1
+    beq        I,      T1,    .L28
+
+.L25:  /* for (i=0; i<(bm&1); i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb (repeats the move above)
+#else
+    slli.d     C3,     OFF,   0x04    // OFF * 0x10: .L26 advances A0 and B0 by 0x10 per k step
+    add.d      A0,     A0,   C3
+    add.d      B0,     B,    C3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF     // k steps left after the skipped ones
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1       // k count = OFF + 1
+#else
+    addi.d     TL,     OFF,   1       // same value as the LEFT branch for the 1x1 tile
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0            // clear res0
+    MTC        c12,    $r0            // clear res1
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L27    // these two branches skip the loop when TL <= 0
+    blt        TL,     L,     .L27
+
+.L26:  /* for (k=0; k<TL; k+=1): 1 row x 1 column per k step (TL = bk outside TRMM) */
+    LD         a1,     A0,    0x00        //load0
+    LD         b1,     B0,    0x00        //load1
+    LD         a2,     A0,    0x08        //load2
+    LD         b2,     B0,    0x08        //load3
+
+    MADD1      c11,    a1,    b1,     c11  //res0
+    MADD2      c12,    a2,    b1,     c12  //res1
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+
+    addi.d     A0,     A0,    0x10    // one k step of A
+    addi.d     B0,     B0,    0x10    // one k step of B
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L26
+
+.L27:  // write-back of the 1x1 tile: (c11,c12) to C0
+#if defined(TRMMKERNEL)
+    MUL        a5,     c11,   ALPHA_R
+    MUL        a6,     c12,   ALPHA_I
+    SUB        a5,     a5,    a6      // c11*alpha_r - c12*alpha_i
+    ST         a5,     C0,    0x00
+
+    MUL        a5,     c12,   ALPHA_R
+    MUL        a6,     c11,   ALPHA_I
+    ADD        a6,     a5,    a6      // c12*alpha_r + c11*alpha_i
+    ST         a6,     C0,    0x08
+#else
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x08    //C0[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5   // same alpha products as the TRMM path, accumulated onto the loaded C values
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x08
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1       // TL = K - OFF - 1: k steps this tile did not consume
+#else
+    addi.d     TL,     TL,   -1       // same value as the LEFT branch for the 1x1 tile
+#endif
+    slli.d     C3,     TL,   0x04     // TL * 0x10
+    add.d      A0,     A0,   C3
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1       // LEFT: offset grows by the 1 row just finished
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x10    // next row in this column
+
+    addi.d     I,      I,     1
+    blt        I,      T1,    .L25
+
+.L28:  // end of the single-column tail; OFF is not advanced here and the bound bn&1 is at most 1
+    slli.d     L,      K,     4       // K * 0x10: amount of B used by one column
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   1       // 2 * scaled LDC = one C column
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     1
+    andi       T0,     N,     1       // T0 was reused as the row bound, so recompute bn&1
+    blt        J,      T0,    .L20
+
+.L30:  // epilogue: restore every register saved in the prologue, in the same slots
+    LDARG      $r23,   $sp,   0
+    LDARG      $r24,   $sp,   8
+    LDARG      $r25,   $sp,   16
+    LDARG      $r26,   $sp,   24
+    LDARG      $r27,   $sp,   32
+    LD         $f23,   $sp,   40
+    LD         $f24,   $sp,   48
+    LD         $f25,   $sp,   56
+    LD         $f26,   $sp,   64
+    LD         $f27,   $sp,   72
+    LD         $f28,   $sp,   80
+    LD         $f29,   $sp,   88
+    LD         $f30,   $sp,   96
+    LD         $f31,   $sp,   104
+
+    addi.d     $sp,    $sp,   128     // release the frame; the alpha slots at 112/120 need no restore
+    jirl       $r0,    $r1,   0x0     // return through $r1
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/zgemm_kernel_4x4_lsx.S b/kernel/loongarch64/zgemm_kernel_4x4_lsx.S
new file mode 100644
index 000000000..6c4841b24
--- /dev/null
+++ b/kernel/loongarch64/zgemm_kernel_4x4_lsx.S
@@ -0,0 +1,2316 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+
+/* Function parameters */
+#define M      $r4   // param 1: bm
+#define N      $r5   // param 2: bn
+#define K      $r6   // param 3: bk
+#define ALPHA_R $f0   // param 4: alphar
+#define ALPHA_I $f1   // param 5: alphai
+#define A      $r7   // param 6: ba
+#define B      $r8  // param 7: bb
+#define C      $r9  // param 8: bc
+#define LDC    $r10  // param 9: ldc (shifted left by BASE_SHIFT in the prologue)
+
+#if defined (TRMMKERNEL)
+#define OFFSET $r11  // param 10: offset
+#endif
+#define OFF    $r26  // running TRMM offset, initialised in the prologue; same register as T2 below
+
+#define I      $r12
+#define J      $r13
+#define L      $r14
+#define TL     $r15
+#define A0     $r16
+#define B0     $r17
+#define C0     $r18  // C0..C3: the four C column pointers set up at .L10
+#define C1     $r19
+#define C2     $r20
+#define C3     $r23
+#define T0     $r24
+#define T1     $r25
+#define T2     $r26  // NOTE(review): aliases OFF ($r26) - confirm T2 is never written while OFF is live
+#define T3     $r27  // scratch for TRMM pointer offsets at .L11
+
+#define a1     $f2
+#define a2     $f3
+#define a3     $f4
+#define a4     $f5
+#define a5     $f6
+#define a6     $f7
+#define a7     $f8
+#define a8     $f9
+#define b1     $f10
+#define b2     $f11
+#define b3     $f12
+#define b4     $f13
+#define b5     $f14
+#define b6     $f15
+#define b7     $f16
+#define b8     $f17
+#define c11    $f18
+#define c12    $f19
+#define c21    $f20
+#define c22    $f21
+#define c31    $f22
+#define c32    $f23
+#define c41    $f24
+#define c42    $f25
+
+/* LSX vectors */
+#define U0     $vr30  // U0..U15 are all cleared as accumulators at .L11
+#define U1     $vr31
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define U8     $vr8
+#define U9     $vr9
+#define U10    $vr10
+#define U11    $vr11
+#define U12    $vr12
+#define U13    $vr13
+#define U14    $vr14
+#define U15    $vr15
+#define D0     $vr16
+#define D1     $vr17
+#define D2     $vr18
+#define D3     $vr19
+#define D4     $vr20
+#define D5     $vr21
+#define D6     $vr22
+#define D7     $vr23
+#define D8     $vr24
+#define D9     $vr25
+#define D10    $vr26
+#define D11    $vr27
+#define D12    $vr28  // same register as VALPHAR
+#define D13    $vr29  // same register as VALPHAI
+#define VALPHAR $vr28  // alpha real part, filled by vldrepl.d in the prologue; aliases D12
+#define VALPHAI $vr29  // alpha imaginary part, filled by vldrepl.d in the prologue; aliases D13
+
+
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#define    VMADD1       VFMADD  // real accumulators (e.g. U0): a_re * b_re term
+#define    VMADD2       VFMADD  // imaginary accumulators (e.g. U1): a_im * b_re term
+#define    VMADD3       VNMSUB  // real accumulators: a_im * b_im term; NMSUB presumably subtracts the product (macro defined outside this file - confirm)
+#define    VMADD4       VFMADD  // imaginary accumulators: a_re * b_im term
+
+#define    MADD1       MADD    // scalar counterparts of VMADD1..4
+#define    MADD2       MADD
+#define    MADD3       NMSUB
+#define    MADD4       MADD
+#endif
+
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+#define    VMADD1       VFMADD
+#define    VMADD2       VFMADD
+#define    VMADD3       VFMADD
+#define    VMADD4       VNMSUB  // only the a_re * b_im term uses the NMSUB form here
+
+#define    MADD1       MADD
+#define    MADD2       MADD
+#define    MADD3       MADD
+#define    MADD4       NMSUB
+#endif
+
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+#define    VMADD1       VFMADD
+#define    VMADD2       VNMSUB  // only the a_im * b_re term uses the NMSUB form here
+#define    VMADD3       VFMADD
+#define    VMADD4       VFMADD
+
+#define    MADD1       MADD
+#define    MADD2       NMSUB
+#define    MADD3       MADD
+#define    MADD4       MADD
+#endif
+
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+#define    VMADD1       VFMADD
+#define    VMADD2       VNMSUB  // every term except a_re * b_re uses the NMSUB form here
+#define    VMADD3       VNMSUB
+#define    VMADD4       VNMSUB
+
+#define    MADD1       MADD
+#define    MADD2       NMSUB
+#define    MADD3       NMSUB
+#define    MADD4       NMSUB
+#endif
+
+    PROLOGUE
+
+    addi.d     $sp,    $sp,   -128
+    SDARG      $r23,   $sp,   0
+    SDARG      $r24,   $sp,   8
+    SDARG      $r25,   $sp,   16
+    SDARG      $r26,   $sp,   24
+    SDARG      $r27,   $sp,   32
+    ST         $f23,   $sp,   40
+    ST         $f24,   $sp,   48
+    ST         $f25,   $sp,   56
+    ST         $f26,   $sp,   64
+    ST         $f27,   $sp,   72
+    ST         $f28,   $sp,   80
+    ST         $f29,   $sp,   88
+    ST         $f30,   $sp,   96
+    ST         $f31,   $sp,   104
+    ST         ALPHA_R,$sp,   112
+    ST         ALPHA_I,$sp,   120
+
+    vldrepl.d  VALPHAR, $sp, 112
+    vldrepl.d  VALPHAI, $sp, 120
+
+#if defined (TRMMKERNEL) && !defined(LEFT)
+    sub.d      OFF,    $r0,   OFFSET
+#else
+    xor        OFF,    OFF,   OFF
+#endif
+
+    slli.d     LDC,    LDC,   BASE_SHIFT
+
+    move       J,      $r0
+    srai.d     T0,     N,     2  //bn/4
+    beq        J,      T0,    .L19
+
+.L10:  /* for(j=0; j<bn/4; j+=1) */
+    move       C0,     C
+    slli.d     TL,     LDC,   1
+    add.d      C1,     C0,    TL
+    add.d      C2,     C1,    TL
+    add.d      C3,     C2,    TL
+    move       A0,     A    //ptrba
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET
+#endif
+
+    move       I,      $r0
+    srai.d     T0,     M,     2  //bm/4
+    beq        I,      T0,    .L18
+
+.L11:  /* for(i=0; i<bm/4; i+=1): set up one 4x4 tile (4 A elements x 4 B elements per k) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B     //ptrbb (repeats the move above)
+#else
+    slli.d     T3,     OFF,   0x06  // T3 = OFF * 0x40; .L12 consumes 0x40 bytes of A and of B per k step
+    add.d      A0,     A0,    T3  // skip the first OFF k-steps of A
+    slli.d     T3,     OFF,   0x06  // same value recomputed
+    add.d      B0,     B,     T3  // skip the first OFF k-steps of B
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF   //temp = K - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   4  // temp = OFF + 4
+#else
+    addi.d     TL,     OFF,   4  // temp = OFF + 4 (identical to the LEFT arm for this tile shape)
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    U0,     U0,   U0  // zero the sixteen accumulators U0..U15
+    vxor.v    U1,     U1,   U1
+    vxor.v    U2,     U2,   U2
+    vxor.v    U3,     U3,   U3
+    vxor.v    U4,     U4,   U4
+    vxor.v    U5,     U5,   U5
+    vxor.v    U6,     U6,   U6
+    vxor.v    U7,     U7,   U7
+    vxor.v    U8,     U8,   U8
+    vxor.v    U9,     U9,   U9
+    vxor.v    U10,    U10,  U10
+    vxor.v    U11,    U11,  U11
+    vxor.v    U12,    U12,  U12
+    vxor.v    U13,    U13,  U13
+    vxor.v    U14,    U14,  U14
+    vxor.v    U15,    U15,  U15
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L13  // TL == 0: no k iterations, go store
+    blt        TL,     L,     .L13  // TL < 0: likewise
+
+.L12:  /* for(k=0; k<temp; k+=1): accumulate one k step into the 4x4 tile */
+    vld       D1,     B0,    0x00  // b0ri
+    vld       D2,     B0,    0x10  // b1ri
+    vld       D3,     B0,    0x20  // b2ri
+    vld       D4,     B0,    0x30  // b3ri
+    vld       D0,     A0,    0x00  // a0ri
+
+    vand.v     D5,     D0,    D0  // D5 = D0 (x AND x is a plain copy)
+    vand.v     D6,     D0,    D0  // D6 = D0
+    vshuf4i.d  D5,     D0,    0x00  //a0rr (NOTE(review): vshuf4i.d also reads its destination, hence the copies above - confirm against the LSX manual)
+    vshuf4i.d  D6,     D0,    0x55  //a0ii
+
+    vpackev.d D7,     D2,    D1     //b0r b1r
+    vpackod.d D8,     D2,    D1     //b0i b1i
+
+    vpackev.d D9,     D4,    D3     //b2r b3r
+    vpackod.d D10,    D4,    D3     //b2i b3i
+
+    VMADD1    U0,     D5,    D7,     U0  //00r 10r (VMADD1..4 are macros defined outside this chunk)
+    VMADD2    U1,     D6,    D7,     U1  //00i 10i
+    VMADD3    U0,     D6,    D8,     U0
+    VMADD4    U1,     D5,    D8,     U1
+
+    VMADD1    U2,     D5,    D9,     U2  //20r 30r
+    VMADD2    U3,     D6,    D9,     U3  //20i 30i
+    VMADD3    U2,     D6,    D10,    U2
+    VMADD4    U3,     D5,    D10,    U3
+
+    vld       D0,     A0,    0x10  // a1ri
+
+    vand.v     D5,     D0,    D0
+    vand.v     D6,     D0,    D0
+    vshuf4i.d  D5,     D0,    0x00  //a1rr
+    vshuf4i.d  D6,     D0,    0x55  //a1ii
+
+    VMADD1    U4,     D5,    D7,     U4  //01r 11r
+    VMADD2    U5,     D6,    D7,     U5  //01i 11i
+    VMADD3    U4,     D6,    D8,     U4
+    VMADD4    U5,     D5,    D8,     U5
+
+    VMADD1    U6,     D5,    D9,     U6  //21r 31r
+    VMADD2    U7,     D6,    D9,     U7  //21i 31i
+    VMADD3    U6,     D6,    D10,    U6
+    VMADD4    U7,     D5,    D10,    U7
+
+    vld       D0,     A0,    0x20  // a2ri
+
+    vand.v     D5,     D0,    D0
+    vand.v     D6,     D0,    D0
+    vshuf4i.d  D5,     D0,    0x00  //a2rr
+    vshuf4i.d  D6,     D0,    0x55  //a2ii
+
+    VMADD1    U8,     D5,    D7,     U8  //02r 12r
+    VMADD2    U9,     D6,    D7,     U9  //02i 12i
+    VMADD3    U8,     D6,    D8,     U8
+    VMADD4    U9,     D5,    D8,     U9
+
+    VMADD1    U10,     D5,    D9,     U10  //22r 32r
+    VMADD2    U11,     D6,    D9,     U11  //22i 32i
+    VMADD3    U10,     D6,    D10,    U10
+    VMADD4    U11,     D5,    D10,    U11
+
+    vld       D0,     A0,    0x30  // a3ri
+
+    vand.v     D5,     D0,    D0
+    vand.v     D6,     D0,    D0
+    vshuf4i.d  D5,     D0,    0x00  //a3rr
+    vshuf4i.d  D6,     D0,    0x55  //a3ii
+
+    VMADD1    U12,     D5,    D7,     U12  //03r 13r
+    VMADD2    U13,     D6,    D7,     U13  //03i 13i
+    VMADD3    U12,     D6,    D8,     U12
+    VMADD4    U13,     D5,    D8,     U13
+
+    VMADD1    U14,     D5,    D9,     U14  //23r 33r
+    VMADD2    U15,     D6,    D9,     U15  //23i 33i
+    VMADD3    U14,     D6,    D10,    U14
+    VMADD4    U15,     D5,    D10,    U15
+
+    addi.d     A0,     A0,    0x40  // advance A past the four 16-byte elements just read
+    addi.d     B0,     B0,    0x40  // advance B likewise
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L12
+
+.L13:  // write the 4x4 tile held in U0..U15 back to C0..C3
+#if defined(TRMMKERNEL)
+    //res00 res10: U0/U1 (a0 with b0,b1) -> C0, C1
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0] (overwritten by vfmul.d below: this path does not add the old C)
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1] (likewise overwritten)
+
+    vfmul.d      D2,    U0,    VALPHAR  // D2 = U0 * VALPHAR
+    vfmul.d      D3,    U1,    VALPHAR  // D3 = U1 * VALPHAR
+    VNMSUB      D2,    U1,    VALPHAI, D2  // macro defined outside this chunk; presumably D2 -= U1 * VALPHAI - confirm
+    VFMADD      D3,    U0,    VALPHAI, D3  // macro defined outside this chunk; presumably D3 += U0 * VALPHAI - confirm
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10  // step both column pointers past the 16 bytes just stored
+    addi.d     C1,     C1,    0x10
+
+    //res20 res30: U2/U3 (a0 with b2,b3) -> C2, C3
+    vld       D0,     C2,    0x00 //c2: 0 1
+    vld       D1,     C3,    0x00 //c3: 0 1
+
+    vpackev.d D2,     D1,    D0   //c2[0] c3[0]
+    vpackod.d D3,     D1,    D0   //c2[1] c3[1]
+
+    vfmul.d      D2,    U2,    VALPHAR
+    vfmul.d      D3,    U3,    VALPHAR
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c2[0] c2[1]
+    vpackod.d D5,     D3,    D2   //c3[0] c3[1]
+
+    vst        D4,     C2,    0x00
+    vst        D5,     C3,    0x00
+
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+
+    //res01 res11: U4/U5 (a1 with b0,b1) -> C0, C1
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    vfmul.d      D2,    U4,    VALPHAR
+    vfmul.d      D3,    U5,    VALPHAR
+    VNMSUB      D2,    U5,    VALPHAI, D2
+    VFMADD      D3,    U4,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res21 res31: U6/U7 (a1 with b2,b3) -> C2, C3
+    vld       D0,     C2,    0x00 //c2: 0 1
+    vld       D1,     C3,    0x00 //c3: 0 1
+
+    vpackev.d D2,     D1,    D0   //c2[0] c3[0]
+    vpackod.d D3,     D1,    D0   //c2[1] c3[1]
+
+    vfmul.d      D2,    U6,    VALPHAR
+    vfmul.d      D3,    U7,    VALPHAR
+    VNMSUB      D2,    U7,    VALPHAI, D2
+    VFMADD      D3,    U6,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c2[0] c2[1]
+    vpackod.d D5,     D3,    D2   //c3[0] c3[1]
+
+    vst        D4,     C2,    0x00
+    vst        D5,     C3,    0x00
+
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+
+    //res02 res12: U8/U9 (a2 with b0,b1) -> C0, C1
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    vfmul.d      D2,    U8,    VALPHAR
+    vfmul.d      D3,    U9,    VALPHAR
+    VNMSUB      D2,    U9,    VALPHAI, D2
+    VFMADD      D3,    U8,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res22 res32: U10/U11 (a2 with b2,b3) -> C2, C3
+    vld       D0,     C2,    0x00 //c2: 0 1
+    vld       D1,     C3,    0x00 //c3: 0 1
+
+    vpackev.d D2,     D1,    D0   //c2[0] c3[0]
+    vpackod.d D3,     D1,    D0   //c2[1] c3[1]
+
+    vfmul.d      D2,    U10,    VALPHAR
+    vfmul.d      D3,    U11,    VALPHAR
+    VNMSUB      D2,    U11,    VALPHAI, D2
+    VFMADD      D3,    U10,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c2[0] c2[1]
+    vpackod.d D5,     D3,    D2   //c3[0] c3[1]
+
+    vst        D4,     C2,    0x00
+    vst        D5,     C3,    0x00
+
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+
+    //res03 res13: U12/U13 (a3 with b0,b1) -> C0, C1
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    vfmul.d      D2,    U12,    VALPHAR
+    vfmul.d      D3,    U13,    VALPHAR
+    VNMSUB      D2,    U13,    VALPHAI, D2
+    VFMADD      D3,    U12,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res23 res33: U14/U15 (a3 with b2,b3) -> C2, C3
+    vld       D0,     C2,    0x00 //c2: 0 1
+    vld       D1,     C3,    0x00 //c3: 0 1
+
+    vpackev.d D2,     D1,    D0   //c2[0] c3[0]
+    vpackod.d D3,     D1,    D0   //c2[1] c3[1]
+
+    vfmul.d      D2,    U14,    VALPHAR
+    vfmul.d      D3,    U15,    VALPHAR
+    VNMSUB      D2,    U15,    VALPHAI, D2
+    VFMADD      D3,    U14,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c2[0] c2[1]
+    vpackod.d D5,     D3,    D2   //c3[0] c3[1]
+
+    vst        D4,     C2,    0x00
+    vst        D5,     C3,    0x00
+
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+    //res00 res10: U0/U1 (a0 with b0,b1) are scaled by alpha and added to C0, C1
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    // Regroup the two loaded elements so D2 holds both low halves and D3 both high halves.
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    // VFMADD/VNMSUB are macros defined outside this chunk; presumably
+    // d = a*b + c and d = c - a*b respectively - confirm.
+    VFMADD      D2,    U0,    VALPHAR, D2
+    VFMADD      D3,    U1,    VALPHAR, D3
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    // Regroup back to one element per column before storing.
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res20 res30: U2/U3 (a0 with b2,b3) scaled by alpha and added to C2, C3
+    vld       D0,     C2,    0x00 //c2: 0 1
+    vld       D1,     C3,    0x00 //c3: 0 1
+
+    vpackev.d D2,     D1,    D0   //c2[0] c3[0]
+    vpackod.d D3,     D1,    D0   //c2[1] c3[1]
+
+    VFMADD      D2,    U2,    VALPHAR, D2  // VFMADD/VNMSUB are macros defined outside this chunk
+    VFMADD      D3,    U3,    VALPHAR, D3
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c2[0] c2[1]
+    vpackod.d D5,     D3,    D2   //c3[0] c3[1]
+
+    vst        D4,     C2,    0x00
+    vst        D5,     C3,    0x00
+
+    addi.d     C2,     C2,    0x10  // step both column pointers past the 16 bytes just stored
+    addi.d     C3,     C3,    0x10
+
+    //res01 res11: U4/U5 (a1 with b0,b1) -> C0, C1
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    VFMADD      D2,    U4,    VALPHAR, D2
+    VFMADD      D3,    U5,    VALPHAR, D3
+    VNMSUB      D2,    U5,    VALPHAI, D2
+    VFMADD      D3,    U4,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res21 res31: U6/U7 (a1 with b2,b3) -> C2, C3
+    vld       D0,     C2,    0x00 //c2: 0 1
+    vld       D1,     C3,    0x00 //c3: 0 1
+
+    vpackev.d D2,     D1,    D0   //c2[0] c3[0]
+    vpackod.d D3,     D1,    D0   //c2[1] c3[1]
+
+    VFMADD      D2,    U6,    VALPHAR, D2
+    VFMADD      D3,    U7,    VALPHAR, D3
+    VNMSUB      D2,    U7,    VALPHAI, D2
+    VFMADD      D3,    U6,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c2[0] c2[1]
+    vpackod.d D5,     D3,    D2   //c3[0] c3[1]
+
+    vst        D4,     C2,    0x00
+    vst        D5,     C3,    0x00
+
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+
+    //res02 res12: U8/U9 (a2 with b0,b1) -> C0, C1
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    VFMADD      D2,    U8,    VALPHAR, D2
+    VFMADD      D3,    U9,    VALPHAR, D3
+    VNMSUB      D2,    U9,    VALPHAI, D2
+    VFMADD      D3,    U8,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res22 res32: U10/U11 (a2 with b2,b3) -> C2, C3
+    vld       D0,     C2,    0x00 //c2: 0 1
+    vld       D1,     C3,    0x00 //c3: 0 1
+
+    vpackev.d D2,     D1,    D0   //c2[0] c3[0]
+    vpackod.d D3,     D1,    D0   //c2[1] c3[1]
+
+    VFMADD      D2,    U10,    VALPHAR, D2
+    VFMADD      D3,    U11,    VALPHAR, D3
+    VNMSUB      D2,    U11,    VALPHAI, D2
+    VFMADD      D3,    U10,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c2[0] c2[1]
+    vpackod.d D5,     D3,    D2   //c3[0] c3[1]
+
+    vst        D4,     C2,    0x00
+    vst        D5,     C3,    0x00
+
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+
+    //res03 res13: U12/U13 (a3 with b0,b1) -> C0, C1
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    VFMADD      D2,    U12,    VALPHAR, D2
+    VFMADD      D3,    U13,    VALPHAR, D3
+    VNMSUB      D2,    U13,    VALPHAI, D2
+    VFMADD      D3,    U12,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res23 res33: U14/U15 (a3 with b2,b3) -> C2, C3
+    vld       D0,     C2,    0x00 //c2: 0 1
+    vld       D1,     C3,    0x00 //c3: 0 1
+
+    vpackev.d D2,     D1,    D0   //c2[0] c3[0]
+    vpackod.d D3,     D1,    D0   //c2[1] c3[1]
+
+    VFMADD      D2,    U14,    VALPHAR, D2
+    VFMADD      D3,    U15,    VALPHAR, D3
+    VNMSUB      D2,    U15,    VALPHAI, D2
+    VFMADD      D3,    U14,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c2[0] c2[1]
+    vpackod.d D5,     D3,    D2   //c3[0] c3[1]
+
+    vst        D4,     C2,    0x00
+    vst        D5,     C3,    0x00
+
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF  // TL = K - OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -4  // minus the 4 extra k-steps .L12 ran in this configuration (temp = OFF + 4)
+#else
+    addi.d     TL,     TL,   -4  // same adjustment on the non-LEFT side
+#endif
+    slli.d     T3,     TL,   0x06  // T3 = remaining k-steps * 0x40 bytes
+    add.d      A0,     A0,   T3  // move A0 past the k-steps that were not executed
+    slli.d     T3,     TL,   0x06  // same value recomputed
+    add.d      B0,     B0,   T3  // move B0 likewise
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   4  // LEFT: OFF grows by the four rows just finished
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L11  // next 4-row tile while I < bm/4
+
+.L18:   /* if (bm & 2): set up a 2x4 tile (2 A elements x 4 B elements per k) */
+    move       I,      $r0
+    andi       T0,     M,     2
+    beq        I,      T0,    .L183  // bit 1 of M clear: nothing to do here
+
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb (repeats the move above)
+#else
+    slli.d     T3,     OFF,  0x05  // OFF * 0x20: .L181 consumes 0x20 bytes of A per k step
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x06  // OFF * 0x40: .L181 consumes 0x40 bytes of B per k step
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF  // temp = K - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2  // temp = OFF + 2
+#else
+    addi.d     TL,     OFF,   4  // temp = OFF + 4
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    U0,     U0,   U0  // zero the eight accumulators U0..U7
+    vxor.v    U1,     U1,   U1
+    vxor.v    U2,     U2,   U2
+    vxor.v    U3,     U3,   U3
+    vxor.v    U4,     U4,   U4
+    vxor.v    U5,     U5,   U5
+    vxor.v    U6,     U6,   U6
+    vxor.v    U7,     U7,   U7
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L182  // TL == 0: no k iterations
+    blt        TL,     L,     .L182  // TL < 0: likewise
+
+.L181:  /* for (k=0; k<temp; k++): accumulate one k step into the 2x4 tile */
+    vld       D1,     B0,    0x00  // b0ri
+    vld       D2,     B0,    0x10  // b1ri
+    vld       D3,     B0,    0x20  // b2ri
+    vld       D4,     B0,    0x30  // b3ri
+    vld       D0,     A0,    0x00  // a0ri
+
+    vand.v     D5,     D0,    D0  // D5 = D0 (plain copy)
+    vand.v     D6,     D0,    D0  // D6 = D0
+    vshuf4i.d  D5,     D0,    0x00  //a0rr
+    vshuf4i.d  D6,     D0,    0x55  //a0ii
+
+    vpackev.d D7,     D2,    D1     //b0r b1r
+    vpackod.d D8,     D2,    D1     //b0i b1i
+
+    vpackev.d D9,     D4,    D3     //b2r b3r
+    vpackod.d D10,    D4,    D3     //b2i b3i
+
+    VMADD1    U0,     D5,    D7,     U0  //00r 10r (VMADD1..4 are macros defined outside this chunk)
+    VMADD2    U1,     D6,    D7,     U1  //00i 10i
+    VMADD3    U0,     D6,    D8,     U0
+    VMADD4    U1,     D5,    D8,     U1
+
+    VMADD1    U2,     D5,    D9,     U2  //20r 30r
+    VMADD2    U3,     D6,    D9,     U3  //20i 30i
+    VMADD3    U2,     D6,    D10,    U2
+    VMADD4    U3,     D5,    D10,    U3
+
+    vld       D0,     A0,    0x10  // a1ri
+
+    vand.v     D5,     D0,    D0
+    vand.v     D6,     D0,    D0
+    vshuf4i.d  D5,     D0,    0x00  //a1rr
+    vshuf4i.d  D6,     D0,    0x55  //a1ii
+
+    VMADD1    U4,     D5,    D7,     U4  //01r 11r
+    VMADD2    U5,     D6,    D7,     U5  //01i 11i
+    VMADD3    U4,     D6,    D8,     U4
+    VMADD4    U5,     D5,    D8,     U5
+
+    VMADD1    U6,     D5,    D9,     U6  //21r 31r
+    VMADD2    U7,     D6,    D9,     U7  //21i 31i
+    VMADD3    U6,     D6,    D10,    U6
+    VMADD4    U7,     D5,    D10,    U7
+
+    addi.d     A0,     A0,    0x20  // two 16-byte A elements consumed
+    addi.d     B0,     B0,    0x40  // four 16-byte B elements consumed
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L181
+
+.L182:  // write the 2x4 tile held in U0..U7 back to C0..C3
+#if defined(TRMMKERNEL)
+    //res00 res10: U0/U1 (a0 with b0,b1) -> C0, C1
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0] (overwritten by vfmul.d below: this path does not add the old C)
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1] (likewise overwritten)
+
+    vfmul.d      D2,    U0,    VALPHAR
+    vfmul.d      D3,    U1,    VALPHAR
+    VNMSUB      D2,    U1,    VALPHAI, D2  // VNMSUB/VFMADD are macros defined outside this chunk
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res20 res30: U2/U3 (a0 with b2,b3) -> C2, C3
+    vld       D0,     C2,    0x00 //c2: 0 1
+    vld       D1,     C3,    0x00 //c3: 0 1
+
+    vpackev.d D2,     D1,    D0   //c2[0] c3[0]
+    vpackod.d D3,     D1,    D0   //c2[1] c3[1]
+
+    vfmul.d      D2,    U2,    VALPHAR
+    vfmul.d      D3,    U3,    VALPHAR
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c2[0] c2[1]
+    vpackod.d D5,     D3,    D2   //c3[0] c3[1]
+
+    vst        D4,     C2,    0x00
+    vst        D5,     C3,    0x00
+
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+
+    //res01 res11: U4/U5 (a1 with b0,b1) -> C0, C1
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    vfmul.d      D2,    U4,    VALPHAR
+    vfmul.d      D3,    U5,    VALPHAR
+    VNMSUB      D2,    U5,    VALPHAI, D2
+    VFMADD      D3,    U4,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res21 res31: U6/U7 (a1 with b2,b3) -> C2, C3
+    vld       D0,     C2,    0x00 //c2: 0 1
+    vld       D1,     C3,    0x00 //c3: 0 1
+
+    vpackev.d D2,     D1,    D0   //c2[0] c3[0]
+    vpackod.d D3,     D1,    D0   //c2[1] c3[1]
+
+    vfmul.d      D2,    U6,    VALPHAR
+    vfmul.d      D3,    U7,    VALPHAR
+    VNMSUB      D2,    U7,    VALPHAI, D2
+    VFMADD      D3,    U6,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c2[0] c2[1]
+    vpackod.d D5,     D3,    D2   //c3[0] c3[1]
+
+    vst        D4,     C2,    0x00
+    vst        D5,     C3,    0x00
+
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+#else
+    //res00 res10: U0/U1 scaled by alpha and added to C0, C1
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    VFMADD      D2,    U0,    VALPHAR, D2
+    VFMADD      D3,    U1,    VALPHAR, D3
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res20 res30: U2/U3 -> C2, C3
+    vld       D0,     C2,    0x00 //c2: 0 1
+    vld       D1,     C3,    0x00 //c3: 0 1
+
+    vpackev.d D2,     D1,    D0   //c2[0] c3[0]
+    vpackod.d D3,     D1,    D0   //c2[1] c3[1]
+
+    VFMADD      D2,    U2,    VALPHAR, D2
+    VFMADD      D3,    U3,    VALPHAR, D3
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c2[0] c2[1]
+    vpackod.d D5,     D3,    D2   //c3[0] c3[1]
+
+    vst        D4,     C2,    0x00
+    vst        D5,     C3,    0x00
+
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+
+    //res01 res11: U4/U5 -> C0, C1
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    VFMADD      D2,    U4,    VALPHAR, D2
+    VFMADD      D3,    U5,    VALPHAR, D3
+    VNMSUB      D2,    U5,    VALPHAI, D2
+    VFMADD      D3,    U4,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res21 res31: U6/U7 -> C2, C3
+    vld       D0,     C2,    0x00 //c2: 0 1
+    vld       D1,     C3,    0x00 //c3: 0 1
+
+    vpackev.d D2,     D1,    D0   //c2[0] c3[0]
+    vpackod.d D3,     D1,    D0   //c2[1] c3[1]
+
+    VFMADD      D2,    U6,    VALPHAR, D2
+    VFMADD      D3,    U7,    VALPHAR, D3
+    VNMSUB      D2,    U7,    VALPHAI, D2
+    VFMADD      D3,    U6,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c2[0] c2[1]
+    vpackod.d D5,     D3,    D2   //c3[0] c3[1]
+
+    vst        D4,     C2,    0x00
+    vst        D5,     C3,    0x00
+
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF  // TL = K - OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2  // minus the extra k-steps .L181 ran (temp = OFF + 2)
+#else
+    addi.d     TL,     TL,   -4  // minus the extra k-steps .L181 ran (temp = OFF + 4)
+#endif
+    slli.d     T3,     TL,   0x05  // remaining k-steps * 0x20 bytes of A
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x06  // remaining k-steps * 0x40 bytes of B
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2  // LEFT: OFF grows by the two rows just finished
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L183:   /* if (bm & 1): set up a 1x4 tile (1 A element x 4 B elements per k) */
+    move       I,      $r0
+    andi       T0,     M,     1
+    beq        I,      T0,    .L186  // M is even: this column panel is finished
+
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb (repeats the move above)
+#else
+    slli.d     T3,     OFF,  0x04  // OFF * 0x10: .L184 consumes 0x10 bytes of A per k step
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x06  // OFF * 0x40: .L184 consumes 0x40 bytes of B per k step
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF  // temp = K - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1  // temp = OFF + 1
+#else
+    addi.d     TL,     OFF,   4  // temp = OFF + 4
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    U0,     U0,   U0  // zero the four accumulators U0..U3
+    vxor.v    U1,     U1,   U1
+    vxor.v    U2,     U2,   U2
+    vxor.v    U3,     U3,   U3
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L185  // TL == 0: no k iterations
+    blt        TL,     L,     .L185  // TL < 0: likewise
+
+.L184:  /* for (k=0; k<temp; k++): accumulate one k step into the 1x4 tile */
+    vld       D1,     B0,    0x00  // b0ri
+    vld       D2,     B0,    0x10  // b1ri
+    vld       D3,     B0,    0x20  // b2ri
+    vld       D4,     B0,    0x30  // b3ri
+    vld       D0,     A0,    0x00  // a0ri
+
+    vand.v     D5,     D0,    D0  // D5 = D0 (plain copy)
+    vand.v     D6,     D0,    D0  // D6 = D0
+    vshuf4i.d  D5,     D0,    0x00  //a0rr
+    vshuf4i.d  D6,     D0,    0x55  //a0ii
+
+    vpackev.d D7,     D2,    D1     //b0r b1r
+    vpackod.d D8,     D2,    D1     //b0i b1i
+
+    vpackev.d D9,     D4,    D3     //b2r b3r
+    vpackod.d D10,    D4,    D3     //b2i b3i
+
+    VMADD1    U0,     D5,    D7,     U0  //00r 10r (VMADD1..4 are macros defined outside this chunk)
+    VMADD2    U1,     D6,    D7,     U1  //00i 10i
+    VMADD3    U0,     D6,    D8,     U0
+    VMADD4    U1,     D5,    D8,     U1
+
+    VMADD1    U2,     D5,    D9,     U2  //20r 30r
+    VMADD2    U3,     D6,    D9,     U3  //20i 30i
+    VMADD3    U2,     D6,    D10,    U2
+    VMADD4    U3,     D5,    D10,    U3
+
+    addi.d     A0,     A0,    0x10  // one 16-byte A element consumed
+    addi.d     B0,     B0,    0x40  // four 16-byte B elements consumed
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L184
+
+.L185:  // write the 1x4 tile held in U0..U3 back to C0..C3
+#if defined(TRMMKERNEL)
+    //res00 res10: U0/U1 (a0 with b0,b1) -> C0, C1
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0] (overwritten by vfmul.d below: this path does not add the old C)
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1] (likewise overwritten)
+
+    vfmul.d      D2,    U0,    VALPHAR
+    vfmul.d      D3,    U1,    VALPHAR
+    VNMSUB      D2,    U1,    VALPHAI, D2  // VNMSUB/VFMADD are macros defined outside this chunk
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res20 res30: U2/U3 (a0 with b2,b3) -> C2, C3
+    vld       D0,     C2,    0x00 //c2: 0 1
+    vld       D1,     C3,    0x00 //c3: 0 1
+
+    vpackev.d D2,     D1,    D0   //c2[0] c3[0]
+    vpackod.d D3,     D1,    D0   //c2[1] c3[1]
+
+    vfmul.d      D2,    U2,    VALPHAR
+    vfmul.d      D3,    U3,    VALPHAR
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c2[0] c2[1]
+    vpackod.d D5,     D3,    D2   //c3[0] c3[1]
+
+    vst        D4,     C2,    0x00
+    vst        D5,     C3,    0x00
+
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+#else
+    //res00 res10: U0/U1 scaled by alpha and added to C0, C1
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    VFMADD      D2,    U0,    VALPHAR, D2
+    VFMADD      D3,    U1,    VALPHAR, D3
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res20 res30: U2/U3 -> C2, C3
+    vld       D0,     C2,    0x00 //c2: 0 1
+    vld       D1,     C3,    0x00 //c3: 0 1
+
+    vpackev.d D2,     D1,    D0   //c2[0] c3[0]
+    vpackod.d D3,     D1,    D0   //c2[1] c3[1]
+
+    VFMADD      D2,    U2,    VALPHAR, D2
+    VFMADD      D3,    U3,    VALPHAR, D3
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c2[0] c2[1]
+    vpackod.d D5,     D3,    D2   //c3[0] c3[1]
+
+    vst        D4,     C2,    0x00
+    vst        D5,     C3,    0x00
+
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF  // TL = K - OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1  // minus the extra k-step .L184 ran (temp = OFF + 1)
+#else
+    addi.d     TL,     TL,   -4  // minus the extra k-steps .L184 ran (temp = OFF + 4)
+#endif
+    slli.d     T3,     TL,   0x04  // remaining k-steps * 0x10 bytes of A
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x06  // remaining k-steps * 0x40 bytes of B; scratch goes in T3, not the C3 column pointer
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1  // LEFT: OFF grows by the one row just finished
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+
+.L186:  // end of one 4-column panel: advance B and C, then loop
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi.d     OFF,    OFF,   4  // right-side TRMM: OFF grows by the four columns just finished
+#endif
+
+    slli.d     L,      K,     0x06  // K * 0x40 bytes: the kernels above read 0x40 bytes of B per k step
+    add.d      B,      B,     L  // B now points at the next panel
+
+    slli.d     I,      LDC,   0x03  // 8*LDC = four column strides of 2*LDC (see .L10)
+    add.d      C,      C,     I  // C now points at the next four columns
+
+    addi.d     J,      J,     1
+    srai.d     T0,     N,     2  // T0 = bn/4 again (T0 was reused by the row loops)
+    blt        J,      T0,    .L10
+
+.L19:  // N&2 remainder: handle a 2-column panel if bit 1 of N is set
+    move       J,      $r0
+    andi       T0,     N,     2
+    beq        J,      T0,    .L30  // bit clear: skip the 2-column code
+
+.L20: /* for (j=0; j<(bn&2); j+=2): the single 2-column panel */
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET  // LEFT TRMM: OFF restarts at OFFSET for this panel
+#endif
+
+    move       C0,     C  // C0, C1 = pointers to the two columns of this panel
+    slli.d     TL,     LDC,   1  // TL = 2*LDC, the distance between the two columns
+    add.d      C1,     C0,    TL
+    move       A0,     A    //ptrba: A is rewound for this panel
+
+    move       I,      $r0
+    srai.d     T0,     M,     2  //bm/4
+    beq        I,      T0,    .L280  // no full 4-row tile: skip to the row remainder code
+
+.L21:  /* for (i=0; i<bm/4; i+=1): set up a 4x2 tile (4 A elements x 2 B elements per k) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb (repeats the move above)
+#else
+    slli.d     T3,     OFF,  0x06  // OFF * 0x40: .L22 consumes 0x40 bytes of A per k step
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x05  // OFF * 0x20: .L22 consumes 0x20 bytes of B per k step
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF  // temp = K - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   4  // temp = OFF + 4
+#else
+    addi.d     TL,     OFF,   2  // temp = OFF + 2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    U0,     U0,   U0  // zero the eight accumulators U0..U7
+    vxor.v    U1,     U1,   U1
+    vxor.v    U2,     U2,   U2
+    vxor.v    U3,     U3,   U3
+    vxor.v    U4,     U4,   U4
+    vxor.v    U5,     U5,   U5
+    vxor.v    U6,     U6,   U6
+    vxor.v    U7,     U7,   U7
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L23  // TL == 0: no k iterations
+    blt        TL,     L,     .L23  // TL < 0: likewise
+
+.L22:  /* for (k=0; k<temp; k++): accumulate one k step into the 4x2 tile */
+    vld       D1,     B0,    0x00  // b0ri
+    vld       D2,     B0,    0x10  // b1ri
+    vld       D0,     A0,    0x00  // a0ri
+
+    vand.v     D5,     D0,    D0  // D5 = D0 (plain copy)
+    vand.v     D6,     D0,    D0  // D6 = D0
+    vshuf4i.d  D5,     D0,    0x00  //a0rr
+    vshuf4i.d  D6,     D0,    0x55  //a0ii
+
+    vpackev.d D7,     D2,    D1     //b0r b1r
+    vpackod.d D8,     D2,    D1     //b0i b1i
+
+    VMADD1    U0,     D5,    D7,     U0  //00r 10r (VMADD1..4 are macros defined outside this chunk)
+    VMADD2    U1,     D6,    D7,     U1  //00i 10i
+    VMADD3    U0,     D6,    D8,     U0
+    VMADD4    U1,     D5,    D8,     U1
+
+    vld       D0,     A0,    0x10  // a1ri
+
+    vand.v     D5,     D0,    D0
+    vand.v     D6,     D0,    D0
+    vshuf4i.d  D5,     D0,    0x00  //a1rr
+    vshuf4i.d  D6,     D0,    0x55  //a1ii
+
+    VMADD1    U2,     D5,    D7,     U2  //01r 11r
+    VMADD2    U3,     D6,    D7,     U3  //01i 11i
+    VMADD3    U2,     D6,    D8,     U2
+    VMADD4    U3,     D5,    D8,     U3
+
+    vld       D0,     A0,    0x20  // a2ri
+
+    vand.v     D5,     D0,    D0
+    vand.v     D6,     D0,    D0
+    vshuf4i.d  D5,     D0,    0x00  //a2rr
+    vshuf4i.d  D6,     D0,    0x55  //a2ii
+
+    VMADD1    U4,     D5,    D7,     U4  //02r 12r
+    VMADD2    U5,     D6,    D7,     U5  //02i 12i
+    VMADD3    U4,     D6,    D8,     U4
+    VMADD4    U5,     D5,    D8,     U5
+
+    vld       D0,     A0,    0x30  // a3ri
+
+    vand.v     D5,     D0,    D0
+    vand.v     D6,     D0,    D0
+    vshuf4i.d  D5,     D0,    0x00  //a3rr
+    vshuf4i.d  D6,     D0,    0x55  //a3ii
+
+    VMADD1    U6,     D5,    D7,     U6  //03r 13r
+    VMADD2    U7,     D6,    D7,     U7  //03i 13i
+    VMADD3    U6,     D6,    D8,     U6
+    VMADD4    U7,     D5,    D8,     U7
+
+    addi.d     A0,     A0,    0x40  // four 16-byte A elements consumed
+    addi.d     B0,     B0,    0x20  // two 16-byte B elements consumed
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L22
+
+.L23:  // write back the 4x2 tile: four steps, each updating 16 bytes in column C0 and 16 bytes in column C1
+#if defined(TRMMKERNEL)  // TRMM path: result is built from the accumulators only
+    //res00 res10
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    vfmul.d      D2,    U0,    VALPHAR  // overwrites the C values packed into D2 above: old C does not reach the result
+    vfmul.d      D3,    U1,    VALPHAR  // overwrites the C values packed into D3 above
+    VNMSUB      D2,    U1,    VALPHAI, D2  // VNMSUB/VFMADD are macros defined outside this view
+    VFMADD      D3,    U0,    VALPHAI, D3  // NOTE(review): presumably re = U0*ar - U1*ai, im = U1*ar + U0*ai - confirm
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res01 res11
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    vfmul.d      D2,    U2,    VALPHAR
+    vfmul.d      D3,    U3,    VALPHAR
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res02 res12
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    vfmul.d      D2,    U4,    VALPHAR
+    vfmul.d      D3,    U5,    VALPHAR
+    VNMSUB      D2,    U5,    VALPHAI, D2
+    VFMADD      D3,    U4,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res03 res13
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    vfmul.d      D2,    U6,    VALPHAR
+    vfmul.d      D3,    U7,    VALPHAR
+    VNMSUB      D2,    U7,    VALPHAI, D2
+    VFMADD      D3,    U6,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#else  // GEMM path: the loaded C values are used as the accumulate operand
+    //res00 res10
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    VFMADD      D2,    U0,    VALPHAR, D2  // VFMADD/VNMSUB are macros defined outside this view
+    VFMADD      D3,    U1,    VALPHAR, D3  // NOTE(review): presumably re += U0*ar - U1*ai, im += U1*ar + U0*ai - confirm
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res01 res11
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    VFMADD      D2,    U2,    VALPHAR, D2
+    VFMADD      D3,    U3,    VALPHAR, D3
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res02 res12
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    VFMADD      D2,    U4,    VALPHAR, D2
+    VFMADD      D3,    U5,    VALPHAR, D3
+    VNMSUB      D2,    U5,    VALPHAI, D2
+    VFMADD      D3,    U4,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res03 res13
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    VFMADD      D2,    U6,    VALPHAR, D2
+    VFMADD      D3,    U7,    VALPHAR, D3
+    VNMSUB      D2,    U7,    VALPHAI, D2
+    VFMADD      D3,    U6,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF  // with the adjustment below: k-steps the loop above did not run (it ran OFF+4 or OFF+2)
+#ifdef LEFT
+    addi.d     TL,     TL,   -4
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     T3,     TL,   0x06  // 64 bytes of A per skipped k-step
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x05  // 32 bytes of B per skipped k-step
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   4  // this tile covered 4 rows
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L21  // next 4-row tile; T0 is set before this view (presumably bm/4 - confirm)
+
+.L280:   /* if ( bm & 2 ): 2x2 tile for a leftover pair of rows */
+    move       I,      $r0
+    andi       T1,     M,     2    //bm&2
+    beq        I,      T1,    .L284  // I is 0 here, so this skips the tile when bit 1 of M is clear
+
+.L281:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x05  // OFF * 32 bytes; the k loop below advances both A0 and B0 by 0x20 per step
+    add.d      A0,     A0,    T3
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF  // trip count = K - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2  // trip count = OFF + 2
+#else
+    addi.d     TL,     OFF,   2  // trip count = OFF + 2 (same value as the LEFT branch for this tile)
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    U0,     U0,   U0  // zero the four accumulators used by this tile
+    vxor.v    U1,     U1,   U1
+    vxor.v    U2,     U2,   U2
+    vxor.v    U3,     U3,   U3
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L283  // these two branches skip the k loop when TL <= 0
+    blt        TL,     L,     .L283
+
+.L282:  /* for (k=0; k<temp; k++): one k-step of the 2x2 tile */
+    vld       D1,     B0,    0x00  // b0ri
+    vld       D2,     B0,    0x10  // b1ri
+    vld       D0,     A0,    0x00  // a0ri
+
+    vand.v     D5,     D0,    D0  // plain copies of a0: vshuf4i.d below also reads its destination register
+    vand.v     D6,     D0,    D0
+    vshuf4i.d  D5,     D0,    0x00  //a0rr
+    vshuf4i.d  D6,     D0,    0x55  //a0ii
+
+    vpackev.d D7,     D2,    D1     //b0r b1r
+    vpackod.d D8,     D2,    D1     //b0i b1i
+
+    VMADD1    U0,     D5,    D7,     U0  //00r 10r
+    VMADD2    U1,     D6,    D7,     U1  //00i 10i
+    VMADD3    U0,     D6,    D8,     U0  // VMADD1..4 are macros defined outside this view
+    VMADD4    U1,     D5,    D8,     U1  // NOTE(review): presumably add/sub forms chosen per conjugation mode - confirm
+
+    vld       D0,     A0,    0x10  // a1ri
+
+    vand.v     D5,     D0,    D0
+    vand.v     D6,     D0,    D0
+    vshuf4i.d  D5,     D0,    0x00  //a1rr
+    vshuf4i.d  D6,     D0,    0x55  //a1ii
+
+    VMADD1    U2,     D5,    D7,     U2  //01r 11r
+    VMADD2    U3,     D6,    D7,     U3  //01i 11i
+    VMADD3    U2,     D6,    D8,     U2
+    VMADD4    U3,     D5,    D8,     U3
+
+    addi.d     A0,     A0,    0x20  // 32 bytes of A consumed (offsets 0x00..0x10 above)
+    addi.d     B0,     B0,    0x20  // 32 bytes of B consumed (offsets 0x00..0x10 above)
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L282  // repeat while L < TL
+
+.L283:  // write back the 2x2 tile: two steps, each updating 16 bytes in column C0 and 16 bytes in column C1
+#if defined(TRMMKERNEL)  // TRMM path: result is built from the accumulators only
+    //res00 res10
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    vfmul.d      D2,    U0,    VALPHAR  // overwrites the C values packed into D2 above: old C does not reach the result
+    vfmul.d      D3,    U1,    VALPHAR  // overwrites the C values packed into D3 above
+    VNMSUB      D2,    U1,    VALPHAI, D2  // VNMSUB/VFMADD are macros defined outside this view
+    VFMADD      D3,    U0,    VALPHAI, D3  // NOTE(review): presumably re = U0*ar - U1*ai, im = U1*ar + U0*ai - confirm
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res01 res11
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    vfmul.d      D2,    U2,    VALPHAR
+    vfmul.d      D3,    U3,    VALPHAR
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#else  // GEMM path: the loaded C values are used as the accumulate operand
+    //res00 res10
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    VFMADD      D2,    U0,    VALPHAR, D2  // VFMADD/VNMSUB are macros defined outside this view
+    VFMADD      D3,    U1,    VALPHAR, D3  // NOTE(review): presumably re += U0*ar - U1*ai, im += U1*ar + U0*ai - confirm
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res01 res11
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    VFMADD      D2,    U2,    VALPHAR, D2
+    VFMADD      D3,    U3,    VALPHAR, D3
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF  // with the adjustment below: k-steps the loop above did not run (it ran OFF+2)
+#ifdef LEFT
+    addi.d     TL,     TL,   -2
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     T3,     TL,   0x05  // 32 bytes of A per skipped k-step
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x05  // 32 bytes of B per skipped k-step
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2  // this tile covered 2 rows
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L284:   /* if ( bm & 1 ): 1x2 tile for a final leftover row */
+    move       I,      $r0
+    andi       T1,     M,     1    //bm&1
+    beq        I,      T1,    .L288  // I is 0 here, so this skips the tile when bit 0 of M is clear
+
+.L285:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x04  // OFF * 16 bytes; the k loop below advances A0 by 0x10 per step
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x05  // OFF * 32 bytes; the k loop below advances B0 by 0x20 per step
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF  // trip count = K - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1  // trip count = OFF + 1
+#else
+    addi.d     TL,     OFF,   2  // trip count = OFF + 2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    U0,     U0,   U0  // zero the two accumulators used by this tile
+    vxor.v    U1,     U1,   U1
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L287  // these two branches skip the k loop when TL <= 0
+    blt        TL,     L,     .L287
+
+.L286:  /* for (k=0; k<temp; k++): one k-step of the 1x2 tile */
+    vld       D1,     B0,    0x00  // b0ri
+    vld       D2,     B0,    0x10  // b1ri
+    vld       D0,     A0,    0x00  // a0ri
+
+    vand.v     D5,     D0,    D0  // plain copies of a0: vshuf4i.d below also reads its destination register
+    vand.v     D6,     D0,    D0
+    vshuf4i.d  D5,     D0,    0x00  //a0rr
+    vshuf4i.d  D6,     D0,    0x55  //a0ii
+
+    vpackev.d D7,     D2,    D1     //b0r b1r
+    vpackod.d D8,     D2,    D1     //b0i b1i
+
+    VMADD1    U0,     D5,    D7,     U0  //00r 10r
+    VMADD2    U1,     D6,    D7,     U1  //00i 10i
+    VMADD3    U0,     D6,    D8,     U0  // VMADD1..4 are macros defined outside this view
+    VMADD4    U1,     D5,    D8,     U1  // NOTE(review): presumably add/sub forms chosen per conjugation mode - confirm
+
+    addi.d     A0,     A0,    0x10  // 16 bytes of A consumed
+    addi.d     B0,     B0,    0x20  // 32 bytes of B consumed
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L286  // repeat while L < TL
+
+.L287:  // write back the 1x2 tile: 16 bytes in column C0 and 16 bytes in column C1
+#if defined(TRMMKERNEL)  // TRMM path: result is built from the accumulators only
+    //res00 res10
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    vfmul.d      D2,    U0,    VALPHAR  // overwrites the C values packed into D2 above: old C does not reach the result
+    vfmul.d      D3,    U1,    VALPHAR  // overwrites the C values packed into D3 above
+    VNMSUB      D2,    U1,    VALPHAI, D2  // VNMSUB/VFMADD are macros defined outside this view
+    VFMADD      D3,    U0,    VALPHAI, D3  // NOTE(review): presumably re = U0*ar - U1*ai, im = U1*ar + U0*ai - confirm
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#else  // GEMM path: the loaded C values are used as the accumulate operand
+    //res00 res10
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C1,    0x00 //c1: 0 1
+
+    vpackev.d D2,     D1,    D0   //c0[0] c1[0]
+    vpackod.d D3,     D1,    D0   //c0[1] c1[1]
+
+    VFMADD      D2,    U0,    VALPHAR, D2  // VFMADD/VNMSUB are macros defined outside this view
+    VFMADD      D3,    U1,    VALPHAR, D3  // NOTE(review): presumably re += U0*ar - U1*ai, im += U1*ar + U0*ai - confirm
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0[0] c0[1]
+    vpackod.d D5,     D3,    D2   //c1[0] c1[1]
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C1,    0x00
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF  // with the adjustment below: k-steps the loop above did not run (it ran OFF+1 or OFF+2)
+#ifdef LEFT
+    addi.d     TL,     TL,   -1
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     T3,     TL,   0x04  // 16 bytes of A per skipped k-step
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x05  // 32 bytes of B per skipped k-step
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1  // this tile covered 1 row
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L288:  // end of the two-column pass: step OFF, B and C past the two columns just handled
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi.d     OFF,    OFF,   2
+#endif
+    slli.d     L,      K,     5  // K * 32 bytes, matching the 0x20 B0 step of the two-column k loops
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   2  // LDC * 4: twice the LDC * 2 step used after the single-column pass at .L316
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     2
+    andi       T0,     N,     2
+    blt        J,      T0,    .L20  // T0 is 0 or 2; NOTE(review): not taken if J was 0 on entry to this pass - confirm at .L20
+
+.L30:  // single-column pass, run only when bit 0 of N is set
+    move       J,      $r0
+    andi       T0,     N,     1
+    beq        J,      T0,    .L999  // J is 0 here: go straight to the epilogue when N is even
+
+.L300:  /* for (j=0; j<(bn&1); j+=1) */
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET  // restart the row offset for this column
+#endif
+
+    move       C0,     C  // C0 walks down the current column of C
+    move       A0,     A    //ptrba
+
+    move       I,      $r0
+    srai.d     T0,     M,     2  //bm/4
+    beq        I,      T0,    .L38  // no full 4-row tiles: go to the leftover-row handling
+
+.L31:  /* for (i=0; i<bm/4; i+=1): set up one 4x1 tile */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,  0x06  // OFF * 64 bytes; the k loop below advances A0 by 0x40 per step
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x04  // OFF * 16 bytes; the k loop below advances B0 by 0x10 per step
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF  // trip count = K - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   4  // trip count = OFF + 4
+#else
+    addi.d     TL,     OFF,   1  // trip count = OFF + 1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    U0,     U0,   U0  // zero the four accumulators used by this tile
+    vxor.v    U1,     U1,   U1
+    vxor.v    U2,     U2,   U2
+    vxor.v    U3,     U3,   U3
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L33  // these two branches skip the k loop when TL <= 0
+    blt        TL,     L,     .L33
+
+.L32:  /* for (k=0; k<temp; k++): one k-step of the 4x1 tile */
+    vld       D1,     B0,    0x00  // b0ri
+
+    vld       D0,     A0,    0x00  // a0ri
+    vld       D2,     A0,    0x10  // a1ri
+
+    vpackev.d  D5,     D2,    D0  //a0r a1r
+    vpackod.d  D6,     D2,    D0  //a0i a1i
+
+    vand.v     D7,     D1,    D1  // plain copies of b0: vshuf4i.d below also reads its destination register
+    vand.v     D8,     D1,    D1
+    vshuf4i.d  D7,     D1,    0x00  //b0rr
+    vshuf4i.d  D8,     D1,    0x55  //b0ii
+
+    VMADD1    U0,     D5,    D7,     U0  //00r 01r
+    VMADD2    U1,     D6,    D7,     U1  //00i 01i
+    VMADD3    U0,     D6,    D8,     U0  // VMADD1..4 are macros defined outside this view
+    VMADD4    U1,     D5,    D8,     U1  // NOTE(review): presumably add/sub forms chosen per conjugation mode - confirm
+
+    vld       D0,     A0,    0x20  // a2ri
+    vld       D2,     A0,    0x30  // a3ri
+
+    vpackev.d  D5,     D2,    D0  //a2r a3r
+    vpackod.d  D6,     D2,    D0  //a2i a3i
+
+    VMADD1    U2,     D5,    D7,     U2  //02r 03r
+    VMADD2    U3,     D6,    D7,     U3  //02i 03i
+    VMADD3    U2,     D6,    D8,     U2
+    VMADD4    U3,     D5,    D8,     U3
+
+    addi.d     A0,     A0,    0x40  // 64 bytes of A consumed (offsets 0x00..0x30 above)
+    addi.d     B0,     B0,    0x10  // 16 bytes of B consumed
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L32  // repeat while L < TL
+
+.L33:  // write back the 4x1 tile: two steps of 32 bytes each, all in column C0
+#if defined(TRMMKERNEL)  // TRMM path: result is built from the accumulators only
+    //res00 res01
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C0,    0x10 //c0: 2 3
+
+    vpackev.d D2,     D1,    D0   //c0: 0 2
+    vpackod.d D3,     D1,    D0   //c0: 1 3
+
+    vfmul.d      D2,    U0,    VALPHAR  // overwrites the C values packed into D2 above: old C does not reach the result
+    vfmul.d      D3,    U1,    VALPHAR  // overwrites the C values packed into D3 above
+    VNMSUB      D2,    U1,    VALPHAI, D2  // VNMSUB/VFMADD are macros defined outside this view
+    VFMADD      D3,    U0,    VALPHAI, D3  // NOTE(review): presumably re = U0*ar - U1*ai, im = U1*ar + U0*ai - confirm
+
+    vpackev.d D4,     D3,    D2   //c0: 0 1
+    vpackod.d D5,     D3,    D2   //c0: 2 3
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C0,    0x10
+
+    addi.d     C0,     C0,    0x20
+
+    //res02 res03
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C0,    0x10 //c0: 2 3
+
+    vpackev.d D2,     D1,    D0   //c0: 0 2
+    vpackod.d D3,     D1,    D0   //c0: 1 3
+
+    vfmul.d      D2,    U2,    VALPHAR
+    vfmul.d      D3,    U3,    VALPHAR
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0: 0 1
+    vpackod.d D5,     D3,    D2   //c0: 2 3
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C0,    0x10
+
+    addi.d     C0,     C0,    0x20
+#else  // GEMM path: the loaded C values are used as the accumulate operand
+    //res00 res01
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C0,    0x10 //c0: 2 3
+
+    vpackev.d D2,     D1,    D0   //c0: 0 2
+    vpackod.d D3,     D1,    D0   //c0: 1 3
+
+    VFMADD      D2,    U0,    VALPHAR, D2  // VFMADD/VNMSUB are macros defined outside this view
+    VFMADD      D3,    U1,    VALPHAR, D3  // NOTE(review): presumably re += U0*ar - U1*ai, im += U1*ar + U0*ai - confirm
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0: 0 1
+    vpackod.d D5,     D3,    D2   //c0: 2 3
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C0,    0x10
+
+    addi.d     C0,     C0,    0x20
+
+    //res02 res03
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C0,    0x10 //c0: 2 3
+
+    vpackev.d D2,     D1,    D0   //c0: 0 2
+    vpackod.d D3,     D1,    D0   //c0: 1 3
+
+    VFMADD      D2,    U2,    VALPHAR, D2
+    VFMADD      D3,    U3,    VALPHAR, D3
+    VNMSUB      D2,    U3,    VALPHAI, D2
+    VFMADD      D3,    U2,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0: 0 1
+    vpackod.d D5,     D3,    D2   //c0: 2 3
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C0,    0x10
+
+    addi.d     C0,     C0,    0x20
+#endif
+
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF  // with the adjustment below: k-steps the loop above did not run (it ran OFF+4 or OFF+1)
+#ifdef LEFT
+    addi.d     TL,     TL,   -4
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     T3,     TL,   0x06  // 64 bytes of A per skipped k-step
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x04  // 16 bytes of B per skipped k-step
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   4  // this tile covered 4 rows
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L31  // next 4-row tile while I < bm/4
+
+.L38:   /* if ( bm & 2 ): 2x1 tile for a leftover pair of rows */
+    move       I,      $r0
+    andi       T1,     M,     2    //bm&2
+    beq        I,      T1,    .L312  // I is 0 here, so this skips the tile when bit 1 of M is clear
+
+.L39:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x05  // OFF * 32 bytes; the k loop below advances A0 by 0x20 per step
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x04  // OFF * 16 bytes; the k loop below advances B0 by 0x10 per step
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF  // trip count = K - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2  // trip count = OFF + 2
+#else
+    addi.d     TL,     OFF,   1  // trip count = OFF + 1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    U0,     U0,   U0  // zero the two accumulators used by this tile
+    vxor.v    U1,     U1,   U1
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L311  // these two branches skip the k loop when TL <= 0
+    blt        TL,     L,     .L311
+
+.L310:  /* for (k=0; k<temp; k++): one k-step of the 2x1 tile */
+    vld       D1,     B0,    0x00  // b0ri
+
+    vld       D0,     A0,    0x00  // a0ri
+    vld       D2,     A0,    0x10  // a1ri
+
+    vpackev.d  D5,     D2,    D0  //a0r a1r
+    vpackod.d  D6,     D2,    D0  //a0i a1i
+
+    vand.v     D7,     D1,    D1  // plain copies of b0: vshuf4i.d below also reads its destination register
+    vand.v     D8,     D1,    D1
+    vshuf4i.d  D7,     D1,    0x00  //b0rr
+    vshuf4i.d  D8,     D1,    0x55  //b0ii
+
+    VMADD1    U0,     D5,    D7,     U0  //00r 01r
+    VMADD2    U1,     D6,    D7,     U1  //00i 01i
+    VMADD3    U0,     D6,    D8,     U0  // VMADD1..4 are macros defined outside this view
+    VMADD4    U1,     D5,    D8,     U1  // NOTE(review): presumably add/sub forms chosen per conjugation mode - confirm
+
+    addi.d     A0,     A0,    0x20  // 32 bytes of A consumed
+    addi.d     B0,     B0,    0x10  // 16 bytes of B consumed
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L310  // repeat while L < TL
+
+.L311:  // write back the 2x1 tile: 32 bytes in column C0
+#if defined(TRMMKERNEL)  // TRMM path: result is built from the accumulators only
+    //res00 res01
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C0,    0x10 //c0: 2 3
+
+    vpackev.d D2,     D1,    D0   //c0: 0 2
+    vpackod.d D3,     D1,    D0   //c0: 1 3
+
+    vfmul.d      D2,    U0,    VALPHAR  // overwrites the C values packed into D2 above: old C does not reach the result
+    vfmul.d      D3,    U1,    VALPHAR  // overwrites the C values packed into D3 above
+    VNMSUB      D2,    U1,    VALPHAI, D2  // VNMSUB/VFMADD are macros defined outside this view
+    VFMADD      D3,    U0,    VALPHAI, D3  // NOTE(review): presumably re = U0*ar - U1*ai, im = U1*ar + U0*ai - confirm
+
+    vpackev.d D4,     D3,    D2   //c0: 0 1
+    vpackod.d D5,     D3,    D2   //c0: 2 3
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C0,    0x10
+
+    addi.d     C0,     C0,    0x20
+#else  // GEMM path: the loaded C values are used as the accumulate operand
+    //res00 res01
+    vld       D0,     C0,    0x00 //c0: 0 1
+    vld       D1,     C0,    0x10 //c0: 2 3
+
+    vpackev.d D2,     D1,    D0   //c0: 0 2
+    vpackod.d D3,     D1,    D0   //c0: 1 3
+
+    VFMADD      D2,    U0,    VALPHAR, D2  // VFMADD/VNMSUB are macros defined outside this view
+    VFMADD      D3,    U1,    VALPHAR, D3  // NOTE(review): presumably re += U0*ar - U1*ai, im += U1*ar + U0*ai - confirm
+    VNMSUB      D2,    U1,    VALPHAI, D2
+    VFMADD      D3,    U0,    VALPHAI, D3
+
+    vpackev.d D4,     D3,    D2   //c0: 0 1
+    vpackod.d D5,     D3,    D2   //c0: 2 3
+
+    vst        D4,     C0,    0x00
+    vst        D5,     C0,    0x10
+
+    addi.d     C0,     C0,    0x20
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF  // with the adjustment below: k-steps the loop above did not run (it ran OFF+2 or OFF+1)
+#ifdef LEFT
+    addi.d     TL,     TL,   -2
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     T3,     TL,   0x05  // 32 bytes of A per skipped k-step
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x04  // 16 bytes of B per skipped k-step
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2  // this tile covered 2 rows
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L312:   /* if ( bm & 1 ): 1x1 tile, handled with scalar FP registers instead of vectors */
+    move       I,      $r0
+    andi       T1,     M,     1    //bm&1
+    beq        I,      T1,    .L316  // I is 0 here, so this skips the tile when bit 0 of M is clear
+
+.L313:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x04  // OFF * 16 bytes; the k loop below advances A0 by 0x10 per step
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x04  // OFF * 16 bytes; the k loop below advances B0 by 0x10 per step
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF  // trip count = K - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1  // trip count = OFF + 1
+#else
+    addi.d     TL,     OFF,   1  // trip count = OFF + 1 (same value as the LEFT branch for this tile)
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0  // zero the scalar accumulators (MTC is a macro defined outside this view)
+    MTC        c12,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L315  // these two branches skip the k loop when TL <= 0
+    blt        TL,     L,     .L315
+
+.L314:  /* for (k=0; k<temp; k++): one k-step of the scalar 1x1 tile */
+    LD         a1,     A0,    0x00  // first 8-byte value of the A element (presumably the real part - confirm)
+    LD         a2,     A0,    0x08  // second 8-byte value of the A element (presumably the imaginary part - confirm)
+
+    LD         b1,     B0,    0x00  // first 8-byte value of the B element
+    LD         b2,     B0,    0x08  // second 8-byte value of the B element
+
+    MADD1      c11,    a1,    b1,     c11  // c11 accumulates the a1*b1 and a2*b2 products
+    MADD2      c12,    a2,    b1,     c12  // c12 accumulates the a2*b1 and a1*b2 products
+    MADD3      c11,    a2,    b2,     c11  // MADD1..4 are macros defined outside this view
+    MADD4      c12,    a1,    b2,     c12  // NOTE(review): presumably add/sub forms chosen per conjugation mode - confirm
+
+    addi.d     A0,     A0,    0x10  // 16 bytes of A consumed
+    addi.d     B0,     B0,    0x10  // 16 bytes of B consumed
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L314  // repeat while L < TL
+
+.L315:  // write back the scalar 1x1 tile to C0[0] and C0[1]
+#if defined(TRMMKERNEL)  // TRMM path: C is not loaded, the result is built from c11/c12 only
+    MUL        a5,     c11,   ALPHA_R
+    MUL        a6,     c12,   ALPHA_I
+    SUB        a5,     a5,    a6  // a5 = c11*ALPHA_R - c12*ALPHA_I
+    ST         a5,     C0,    0x00
+
+    MUL        a5,     c12,   ALPHA_R
+    MUL        a6,     c11,   ALPHA_I
+    ADD        a6,     a5,    a6  // a6 = c12*ALPHA_R + c11*ALPHA_I
+    ST         a6,     C0,    0x08  // NOTE(review): unlike the #else branch, C0 is not advanced here; the visible code reloads C0 from C at .L300 before reusing it
+#else  // GEMM path: accumulate into the existing C values
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x08    //C0[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5  // MADD/NMSUB are macros defined outside this view
+    MADD       a6,     c12,   ALPHA_R, a6  // NOTE(review): presumably C0[0] += c11*ar - c12*ai, C0[1] += c12*ar + c11*ai - confirm
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x08
+
+    addi.d     C0,     C0,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF  // with the adjustment below: k-steps the loop above did not run (it ran OFF+1)
+#ifdef LEFT
+    addi.d     TL,     TL,   -1
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     T3,     TL,   0x04  // 16 bytes per skipped k-step, applied to both A0 and B0
+    add.d      A0,     A0,   T3
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1  // this tile covered 1 row
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L316:  // end of the single-column pass: step B and C past the column just handled
+    slli.d     L,      K,     4  // K * 16 bytes, matching the 0x10 B0 step of the single-column k loops
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   1  // LDC * 2: half the LDC * 4 step used after the two-column pass at .L288
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     1
+    andi       T0,     N,     1
+    blt        J,      T0,    .L300  // T0 is 0 or 1 and J was 0 at .L30, so this is not taken after the first pass
+
+.L999:  // epilogue: reload registers from the stack frame, release it, return
+    LDARG      $r23,   $sp,   0  // integer registers $r23..$r27 from offsets 0..32 (presumably stored by the prologue, not visible here)
+    LDARG      $r24,   $sp,   8
+    LDARG      $r25,   $sp,   16
+    LDARG      $r26,   $sp,   24
+    LDARG      $r27,   $sp,   32
+    LD         $f23,   $sp,   40  // FP registers $f23..$f31 from offsets 40..104
+    LD         $f24,   $sp,   48
+    LD         $f25,   $sp,   56
+    LD         $f26,   $sp,   64
+    LD         $f27,   $sp,   72
+    LD         $f28,   $sp,   80
+    LD         $f29,   $sp,   88
+    LD         $f30,   $sp,   96
+    LD         $f31,   $sp,   104
+
+    addi.d     $sp,    $sp,   128  // release the 128-byte frame
+    jirl       $r0,    $r1,   0x0  // jump to the address in $r1 without saving a link (return to caller)
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/zgemm_kernel_8x4_lasx.S b/kernel/loongarch64/zgemm_kernel_8x4_lasx.S
new file mode 100644
index 000000000..ca90b30f5
--- /dev/null
+++ b/kernel/loongarch64/zgemm_kernel_8x4_lasx.S
@@ -0,0 +1,3545 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+
+/* Function parameters (integer arguments in $r4-$r11, alpha in $f0/$f1) */
+#define M      $r4   // param 1: bm, row count (the .L11 loop consumes it 8 rows at a time)
+#define N      $r5   // param 2: bn, column count (the .L10 loop consumes it 4 columns at a time)
+#define K      $r6   // param 3: bk, inner (k) dimension
+#define ALPHA_R $f0   // param 4: alphar, real part of alpha
+#define ALPHA_I $f1   // param 5: alphai, imaginary part of alpha
+#define A      $r7   // param 6: ba, base of packed A (copied into A0)
+#define B      $r8  // param 7: bb, base of packed B (copied into B0)
+#define C      $r9  // param 8: bc, base of C (copied into C0 at the top of each j iteration)
+#define LDC    $r10  // param 9: ldc, shifted left by BASE_SHIFT in the prologue
+
+#if defined (TRMMKERNEL)
+#define OFFSET $r11  // param 10: offset (TRMM builds only)
+#endif
+#define OFF    $r26  // running TRMM offset; NOTE(review): same register as T2 below
+
+#define I      $r12  // row-panel loop counter
+#define J      $r13  // column-panel loop counter
+#define L      $r14  // k-loop counter
+#define TL     $r15  // k-loop trip count; also scratch for the C column stride at .L10
+#define A0     $r16  // moving pointer into A
+#define B0     $r17  // moving pointer into B
+#define C0     $r18  // C column j+0
+#define C1     $r19  // C column j+1
+#define C2     $r20  // C column j+2
+#define C3     $r23  // C column j+3
+#define T0     $r24  // scratch for loop bounds
+#define T1     $r25
+#define T2     $r26  // NOTE(review): aliases OFF ($r26); writing T2 would clobber the TRMM offset
+#define T3     $r27  // scratch for TRMM pointer adjustments
+
+#define a1     $f2   // a1..c42: scalar FP aliases for $f2-$f25; NOTE(review): if $fN overlays the low lane of $xrN these collide with U2..U15 and D0..D9 -- confirm
+#define a2     $f3
+#define a3     $f4
+#define a4     $f5
+#define a5     $f6
+#define a6     $f7
+#define a7     $f8
+#define a8     $f9
+#define b1     $f10
+#define b2     $f11
+#define b3     $f12
+#define b4     $f13
+#define b5     $f14
+#define b6     $f15
+#define b7     $f16
+#define b8     $f17
+#define c11    $f18
+#define c12    $f19
+#define c21    $f20
+#define c22    $f21
+#define c31    $f22
+#define c32    $f23
+#define c41    $f24
+#define c42    $f25
+
+/* LASX vectors (each xvld in this file fills one with four doubles = two complex values) */
+#define U0     $xr30  // U0..U15: accumulators, zeroed before each k-loop; even = real sums, odd = imaginary sums
+#define U1     $xr31
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define U8     $xr8
+#define U9     $xr9
+#define U10    $xr10
+#define U11    $xr11
+#define U12    $xr12
+#define U13    $xr13
+#define U14    $xr14
+#define U15    $xr15
+#define D0     $xr16  // D0..D11: load and shuffle temporaries
+#define D1     $xr17
+#define D2     $xr18
+#define D3     $xr19
+#define D4     $xr20
+#define D5     $xr21
+#define D6     $xr22
+#define D7     $xr23
+#define D8     $xr24
+#define D9     $xr25
+#define D10    $xr26
+#define D11    $xr27
+#define D12    $xr28  // NOTE(review): same register as VALPHAR; do not use as a temporary
+#define D13    $xr29  // NOTE(review): same register as VALPHAI; do not use as a temporary
+#define VALPHAR $xr28  // alpha real part, filled by xvldrepl.d in the prologue
+#define VALPHAI $xr29  // alpha imaginary part, filled by xvldrepl.d in the prologue
+
+/* Per conjugation mode, choose add or subtract for the four partial products used in the k-loops: 1: re, ar*br; 2: im, ai*br; 3: re, ai*bi; 4: im, ar*bi. Assumes the *NMSUB macros compute acc - x*y -- TODO confirm in common.h */
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)  // a*b: re = ar*br - ai*bi, im = ai*br + ar*bi
+#define    XVMADD1       XVFMADD  // XV* forms are the ones applied to the LASX accumulators
+#define    XVMADD2       XVFMADD
+#define    XVMADD3       XVNMSUB  // re -= ai*bi
+#define    XVMADD4       XVFMADD
+
+#define    VMADD1       VFMADD  // V* forms: presumably the 128-bit counterparts (no use visible in the reviewed part)
+#define    VMADD2       VFMADD
+#define    VMADD3       VNMSUB
+#define    VMADD4       VFMADD
+
+#define    MADD1       MADD  // plain forms: scalar counterparts
+#define    MADD2       MADD
+#define    MADD3       NMSUB
+#define    MADD4       MADD
+#endif
+
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)  // a*conj(b): re = ar*br + ai*bi, im = ai*br - ar*bi
+#define    XVMADD1       XVFMADD
+#define    XVMADD2       XVFMADD
+#define    XVMADD3       XVFMADD  // re += ai*bi
+#define    XVMADD4       XVNMSUB  // im -= ar*bi
+
+#define    VMADD1       VFMADD
+#define    VMADD2       VFMADD
+#define    VMADD3       VFMADD
+#define    VMADD4       VNMSUB
+
+#define    MADD1       MADD
+#define    MADD2       MADD
+#define    MADD3       MADD
+#define    MADD4       NMSUB
+#endif
+
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)  // conj(a)*b: re = ar*br + ai*bi, im = ar*bi - ai*br
+#define    XVMADD1       XVFMADD
+#define    XVMADD2       XVNMSUB  // im -= ai*br
+#define    XVMADD3       XVFMADD  // re += ai*bi
+#define    XVMADD4       XVFMADD
+
+#define    VMADD1       VFMADD
+#define    VMADD2       VNMSUB
+#define    VMADD3       VFMADD
+#define    VMADD4       VFMADD
+
+#define    MADD1       MADD
+#define    MADD2       NMSUB
+#define    MADD3       MADD
+#define    MADD4       MADD
+#endif
+
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)  // conj(a)*conj(b): re = ar*br - ai*bi, im = -(ai*br + ar*bi)
+#define    XVMADD1       XVFMADD
+#define    XVMADD2       XVNMSUB  // im -= ai*br
+#define    XVMADD3       XVNMSUB  // re -= ai*bi
+#define    XVMADD4       XVNMSUB  // im -= ar*bi
+
+#define    VMADD1       VFMADD
+#define    VMADD2       VNMSUB
+#define    VMADD3       VNMSUB
+#define    VMADD4       VNMSUB
+
+#define    MADD1       MADD
+#define    MADD2       NMSUB
+#define    MADD3       NMSUB
+#define    MADD4       NMSUB
+#endif
+
+    PROLOGUE  // entry macro; its definition is not in this file (comes in through common.h)
+
+    addi.d     $sp,    $sp,   -128  // 128-byte frame: 5 GPR slots, 9 FPR slots, 2 alpha slots
+    SDARG      $r23,   $sp,   0  // save $r23-$r27, used below as C3, T0, T1, OFF/T2, T3
+    SDARG      $r24,   $sp,   8
+    SDARG      $r25,   $sp,   16
+    SDARG      $r26,   $sp,   24
+    SDARG      $r27,   $sp,   32
+    ST         $f23,   $sp,   40  // save $f23-$f31 before $xr23-$xr31 (D7..D13, U0, U1) are written
+    ST         $f24,   $sp,   48
+    ST         $f25,   $sp,   56
+    ST         $f26,   $sp,   64
+    ST         $f27,   $sp,   72
+    ST         $f28,   $sp,   80
+    ST         $f29,   $sp,   88
+    ST         $f30,   $sp,   96
+    ST         $f31,   $sp,   104
+    ST         ALPHA_R,$sp,   112  // spill alpha so it can be reloaded as a replicated vector
+    ST         ALPHA_I,$sp,   120
+
+    xvldrepl.d  VALPHAR, $sp, 112  // VALPHAR = alpha real part in every 64-bit lane
+    xvldrepl.d  VALPHAI, $sp, 120  // VALPHAI = alpha imaginary part in every 64-bit lane
+
+#if defined (TRMMKERNEL) && !defined(LEFT)
+    sub.d      OFF,    $r0,   OFFSET  // OFF = -OFFSET
+#else
+    xor        OFF,    OFF,   OFF  // OFF = 0
+#endif
+
+    slli.d     LDC,    LDC,   BASE_SHIFT  // LDC <<= BASE_SHIFT (constant defined outside this file); doubled again at .L10 for the column stride
+
+    move       J,      $r0
+    srai.d     T0,     N,     2  //bn/4
+    beq        J,      T0,    .L19  // no full 4-column panel: go straight to .L19
+
+.L10:  /* for(j=0; j<bn/4; j+=1): set up the four C column pointers and restart A */
+    move       C0,     C
+    slli.d     TL,     LDC,   1  // TL = 2*LDC, the distance between consecutive C columns
+    add.d      C1,     C0,    TL
+    add.d      C2,     C1,    TL
+    add.d      C3,     C2,    TL
+    move       A0,     A    //ptrba
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET  // LEFT variant: reload the offset for every column panel
+#endif
+
+    move       I,      $r0
+    srai.d     T0,     M,     3  //bm/8
+    beq        I,      T0,    .L150  // no full 8-row panel: continue at .L150
+
+.L11:  /* for(i=0; i<bm/8; i+=1): pick the A/B start and trip count, then clear the accumulators */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk (replaced below for TRMM builds) */
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B     //ptrbb; NOTE(review): repeats the move done just above
+#else
+    slli.d     T3,     OFF,   0x07  // OFF * 128 bytes = OFF k-steps of 8 complex doubles
+    add.d      A0,     A0,    T3  // skip the first OFF k-steps of this A panel
+    slli.d     T3,     OFF,   0x06  // OFF * 64 bytes = OFF k-steps of 4 complex doubles
+    add.d      B0,     B,     T3  // skip the first OFF k-steps of B
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF   //temp = bk - off
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   8  // temp = off + 8 (rows in this tile)
+#else
+    addi.d     TL,     OFF,   4  // temp = off + 4 (columns in this tile)
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    xvxor.v    U0,     U0,   U0  // x ^ x = 0: clear all 16 accumulators
+    xvxor.v    U1,     U1,   U1
+    xvxor.v    U2,     U2,   U2
+    xvxor.v    U3,     U3,   U3
+    xvxor.v    U4,     U4,   U4
+    xvxor.v    U5,     U5,   U5
+    xvxor.v    U6,     U6,   U6
+    xvxor.v    U7,     U7,   U7
+    xvxor.v    U8,     U8,   U8
+    xvxor.v    U9,     U9,   U9
+    xvxor.v    U10,    U10,  U10
+    xvxor.v    U11,    U11,  U11
+    xvxor.v    U12,    U12,  U12
+    xvxor.v    U13,    U13,  U13
+    xvxor.v    U14,    U14,  U14
+    xvxor.v    U15,    U15,  U15
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L13  // with L = 0 this pair skips the k-loop when TL <= 0
+    blt        TL,     L,     .L13
+
+.L12:  /* for(k=0; k<temp; k+=1): one k-step of the 8x4 complex tile */
+    xvld       D0,     A0,    0x00  // a0ri a1ri
+    xvld       D2,     B0,    0x00  // b0ri b1ri
+    xvld       D3,     B0,    0x20  // b2ri b3ri
+
+    xvpermi.d  D4,     D0,    0x00  //a0r in all four lanes
+    xvpermi.d  D5,     D0,    0x55  //a0i in all four lanes
+
+    xvpackev.d D6,     D3,    D2  // even doubles of D2/D3 = real parts of b0..b3
+    xvpermi.d  D6,     D6,    0xd8  //b0r b1r b2r b3r
+
+    xvpackod.d D7,     D3,    D2  // odd doubles of D2/D3 = imaginary parts of b0..b3
+    xvpermi.d  D7,     D7,    0xd8  //b0i b1i b2i b3i
+
+    XVMADD1    U0,     D4,    D6,     U0  //00r 10r 20r 30r (ar*br term)
+    XVMADD2    U1,     D5,    D6,     U1  //00i 10i 20i 30i (ai*br term)
+    XVMADD3    U0,     D5,    D7,     U0  // ai*bi term into the real sums
+    XVMADD4    U1,     D4,    D7,     U1  // ar*bi term into the imaginary sums
+
+    xvpermi.d  D4,     D0,    0xaa  //a1r
+    xvpermi.d  D5,     D0,    0xff  //a1i
+
+    XVMADD1    U2,     D4,    D6,     U2  //01r 11r 21r 31r
+    XVMADD2    U3,     D5,    D6,     U3  //01i 11i 21i 31i
+    XVMADD3    U2,     D5,    D7,     U2
+    XVMADD4    U3,     D4,    D7,     U3
+
+    xvld       D0,     A0,    0x20  // a2ri a3ri
+
+    xvpermi.d  D4,     D0,    0x00  //a2r
+    xvpermi.d  D5,     D0,    0x55  //a2i
+
+    XVMADD1    U4,     D4,    D6,     U4  //02r 12r 22r 32r
+    XVMADD2    U5,     D5,    D6,     U5  //02i 12i 22i 32i
+    XVMADD3    U4,     D5,    D7,     U4
+    XVMADD4    U5,     D4,    D7,     U5
+
+    xvpermi.d  D4,     D0,    0xaa  //a3r
+    xvpermi.d  D5,     D0,    0xff  //a3i
+
+    XVMADD1    U6,     D4,    D6,     U6  //03r 13r 23r 33r
+    XVMADD2    U7,     D5,    D6,     U7  //03i 13i 23i 33i
+    XVMADD3    U6,     D5,    D7,     U6
+    XVMADD4    U7,     D4,    D7,     U7
+
+    xvld       D0,     A0,    0x40  // a4ri a5ri
+
+    xvpermi.d  D4,     D0,    0x00  //a4r
+    xvpermi.d  D5,     D0,    0x55  //a4i
+
+    XVMADD1    U8,     D4,    D6,     U8  //04r 14r 24r 34r
+    XVMADD2    U9,     D5,    D6,     U9  //04i 14i 24i 34i
+    XVMADD3    U8,     D5,    D7,     U8
+    XVMADD4    U9,     D4,    D7,     U9
+
+    xvpermi.d  D4,     D0,    0xaa  //a5r
+    xvpermi.d  D5,     D0,    0xff  //a5i
+
+    XVMADD1    U10,     D4,    D6,     U10  //05r 15r 25r 35r
+    XVMADD2    U11,     D5,    D6,     U11  //05i 15i 25i 35i
+    XVMADD3    U10,     D5,    D7,     U10
+    XVMADD4    U11,     D4,    D7,     U11
+
+    xvld       D0,     A0,    0x60  // a6ri a7ri
+
+    xvpermi.d  D4,     D0,    0x00  //a6r
+    xvpermi.d  D5,     D0,    0x55  //a6i
+
+    XVMADD1    U12,     D4,    D6,     U12  //06r 16r 26r 36r
+    XVMADD2    U13,     D5,    D6,     U13  //06i 16i 26i 36i
+    XVMADD3    U12,     D5,    D7,     U12
+    XVMADD4    U13,     D4,    D7,     U13
+
+    xvpermi.d  D4,     D0,    0xaa  //a7r
+    xvpermi.d  D5,     D0,    0xff  //a7i
+
+    XVMADD1    U14,     D4,    D6,     U14  //07r 17r 27r 37r
+    XVMADD2    U15,     D5,    D6,     U15  //07i 17i 27i 37i
+    XVMADD3    U14,     D5,    D7,     U14
+    XVMADD4    U15,     D4,    D7,     U15
+
+    addi.d     A0,     A0,    0x80  // next k-step of A: 8 complex doubles = 128 bytes
+    addi.d     B0,     B0,    0x40  // next k-step of B: 4 complex doubles = 64 bytes
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L12
+
+.L13:  /* write back the 8x4 tile two rows at a time; TRMM stores alpha*acc, the other branch adds alpha*acc to C */
+#if defined(TRMMKERNEL)
+    // rows 0-1 of the tile; row 0 first: res00 res10 res20 res30
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3; NOTE(review): in this TRMM branch the loaded C values never reach the stores (xvfmul.d overwrites D6/D7, D0/D1 are overwritten before xvst)
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    xvand.v    D4,     D0,    D0  // D4 = D0 (x & x is a register copy)
+    xvpermi.q  D4,     D1,    0x02 //c0:0 1, c1:0 1
+    xvpermi.d  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    xvand.v    D5,     D2,    D2  // D5 = D2
+    xvpermi.q  D5,     D3,    0x02 //c2:0 1, c3:0 1
+    xvpermi.d  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[0] c1[0] c2[0] c3[0]
+    xvpermi.q  D7,     D9,    0x02 //c0[1] c1[1] c2[1] c3[1]
+
+    xvfmul.d      D6,    U0,    VALPHAR  // D6 = re * alpha_r (previous D6 discarded)
+    xvfmul.d      D7,    U1,    VALPHAR  // D7 = im * alpha_r (previous D7 discarded)
+    XVNMSUB      D6,    U1,    VALPHAI, D6  // real part: D6 - im * alpha_i (assuming XVNMSUB is acc - x*y)
+    XVFMADD      D7,    U0,    VALPHAI, D7  // imaginary part: D7 + re * alpha_i
+
+    xvand.v    D10,     D6,    D6  // D10 = D6
+    xvpermi.q  D10,     D7,    0x02 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    xvand.v    D11,     D7,    D7  // D11 = D7
+    xvpermi.q  D11,     D6,    0x31 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    // row 1: res01 res11 res21 res31
+    xvand.v    D4,     D1,    D1
+    xvpermi.q  D4,     D0,    0x31 //c0:2 3, c1:2 3
+    xvpermi.d  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    xvand.v    D5,     D3,    D3
+    xvpermi.q  D5,     D2,    0x31 //c2:2 3, c3:2 3
+    xvpermi.d  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[2] c1[2] c2[2] c3[2]
+    xvpermi.q  D7,     D9,    0x02 //c0[3] c1[3] c2[3] c3[3]
+
+    xvfmul.d      D6,    U2,    VALPHAR  // D6 = re * alpha_r
+    xvfmul.d      D7,    U3,    VALPHAR  // D7 = im * alpha_r
+    XVNMSUB      D6,    U3,    VALPHAI, D6  // real part: D6 - im * alpha_i
+    XVFMADD      D7,    U2,    VALPHAI, D7  // imaginary part: D7 + re * alpha_i
+
+    xvand.v    D4,     D6,    D6
+    xvpermi.q  D4,     D7,    0x02 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+
+    xvand.v    D5,     D7,    D7
+    xvpermi.q  D5,     D6,    0x31 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+
+    xvand.v    D0,     D10,    D10  // D0 = row 0 of columns 0 and 1
+    xvand.v    D1,     D11,    D11  // D1 = row 0 of columns 2 and 3
+
+    xvpermi.q  D0,     D4,     0x02 //c0: 0 1 2 3
+    xvpermi.q  D4,     D10,    0x31 //c1: 0 1 2 3
+    xvpermi.q  D1,     D5,     0x02 //c2: 0 1 2 3
+    xvpermi.q  D5,     D11,    0x31 //c3: 0 1 2 3
+
+    xvst       D0,     C0,    0x00  // two complex results per column
+    xvst       D4,     C1,    0x00
+    xvst       D1,     C2,    0x00
+    xvst       D5,     C3,    0x00
+
+    addi.d     C0,     C0,    0x20  // advance each column pointer by 2 complex doubles (32 bytes)
+    addi.d     C1,     C1,    0x20
+    addi.d     C2,     C2,    0x20
+    addi.d     C3,     C3,    0x20
+
+    // rows 2-3 of the tile (TRMM branch); row 2 first: res02 res12 res22 res32
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3; NOTE(review): loaded C values never reach the stores in this branch (xvfmul.d overwrites D6/D7)
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    xvand.v    D4,     D0,    D0  // D4 = D0 (register copy)
+    xvpermi.q  D4,     D1,    0x02 //c0:0 1, c1:0 1
+    xvpermi.d  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    xvand.v    D5,     D2,    D2  // D5 = D2
+    xvpermi.q  D5,     D3,    0x02 //c2:0 1, c3:0 1
+    xvpermi.d  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[0] c1[0] c2[0] c3[0]
+    xvpermi.q  D7,     D9,    0x02 //c0[1] c1[1] c2[1] c3[1]
+
+    xvfmul.d      D6,    U4,    VALPHAR  // D6 = re * alpha_r (previous D6 discarded)
+    xvfmul.d      D7,    U5,    VALPHAR  // D7 = im * alpha_r (previous D7 discarded)
+    XVNMSUB      D6,    U5,    VALPHAI, D6  // real part: D6 - im * alpha_i (assuming XVNMSUB is acc - x*y)
+    XVFMADD      D7,    U4,    VALPHAI, D7  // imaginary part: D7 + re * alpha_i
+
+    xvand.v    D10,     D6,    D6  // D10 = D6
+    xvpermi.q  D10,     D7,    0x02 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    xvand.v    D11,     D7,    D7  // D11 = D7
+    xvpermi.q  D11,     D6,    0x31 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    // row 3: res03 res13 res23 res33
+    xvand.v    D4,     D1,    D1
+    xvpermi.q  D4,     D0,    0x31 //c0:2 3, c1:2 3
+    xvpermi.d  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    xvand.v    D5,     D3,    D3
+    xvpermi.q  D5,     D2,    0x31 //c2:2 3, c3:2 3
+    xvpermi.d  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[2] c1[2] c2[2] c3[2]
+    xvpermi.q  D7,     D9,    0x02 //c0[3] c1[3] c2[3] c3[3]
+
+    xvfmul.d      D6,    U6,    VALPHAR  // D6 = re * alpha_r
+    xvfmul.d      D7,    U7,    VALPHAR  // D7 = im * alpha_r
+    XVNMSUB      D6,    U7,    VALPHAI, D6  // real part: D6 - im * alpha_i
+    XVFMADD      D7,    U6,    VALPHAI, D7  // imaginary part: D7 + re * alpha_i
+
+    xvand.v    D4,     D6,    D6
+    xvpermi.q  D4,     D7,    0x02 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+
+    xvand.v    D5,     D7,    D7
+    xvpermi.q  D5,     D6,    0x31 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+
+    xvand.v    D0,     D10,    D10  // D0 = row 2 of columns 0 and 1
+    xvand.v    D1,     D11,    D11  // D1 = row 2 of columns 2 and 3
+
+    xvpermi.q  D0,     D4,     0x02 //c0: 0 1 2 3
+    xvpermi.q  D4,     D10,    0x31 //c1: 0 1 2 3
+    xvpermi.q  D1,     D5,     0x02 //c2: 0 1 2 3
+    xvpermi.q  D5,     D11,    0x31 //c3: 0 1 2 3
+
+    xvst       D0,     C0,    0x00  // two complex results per column
+    xvst       D4,     C1,    0x00
+    xvst       D1,     C2,    0x00
+    xvst       D5,     C3,    0x00
+
+    addi.d     C0,     C0,    0x20  // advance each column pointer by 2 complex doubles (32 bytes)
+    addi.d     C1,     C1,    0x20
+    addi.d     C2,     C2,    0x20
+    addi.d     C3,     C3,    0x20
+
+    // rows 4-5 of the tile (TRMM branch); row 4 first: res04 res14 res24 res34
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3; NOTE(review): loaded C values never reach the stores in this branch (xvfmul.d overwrites D6/D7)
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    xvand.v    D4,     D0,    D0  // D4 = D0 (register copy)
+    xvpermi.q  D4,     D1,    0x02 //c0:0 1, c1:0 1
+    xvpermi.d  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    xvand.v    D5,     D2,    D2  // D5 = D2
+    xvpermi.q  D5,     D3,    0x02 //c2:0 1, c3:0 1
+    xvpermi.d  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[0] c1[0] c2[0] c3[0]
+    xvpermi.q  D7,     D9,    0x02 //c0[1] c1[1] c2[1] c3[1]
+
+    xvfmul.d      D6,    U8,    VALPHAR  // D6 = re * alpha_r (previous D6 discarded)
+    xvfmul.d      D7,    U9,    VALPHAR  // D7 = im * alpha_r (previous D7 discarded)
+    XVNMSUB      D6,    U9,    VALPHAI, D6  // real part: D6 - im * alpha_i (assuming XVNMSUB is acc - x*y)
+    XVFMADD      D7,    U8,    VALPHAI, D7  // imaginary part: D7 + re * alpha_i
+
+    xvand.v    D10,     D6,    D6  // D10 = D6
+    xvpermi.q  D10,     D7,    0x02 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    xvand.v    D11,     D7,    D7  // D11 = D7
+    xvpermi.q  D11,     D6,    0x31 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    // row 5: res05 res15 res25 res35
+    xvand.v    D4,     D1,    D1
+    xvpermi.q  D4,     D0,    0x31 //c0:2 3, c1:2 3
+    xvpermi.d  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    xvand.v    D5,     D3,    D3
+    xvpermi.q  D5,     D2,    0x31 //c2:2 3, c3:2 3
+    xvpermi.d  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[2] c1[2] c2[2] c3[2]
+    xvpermi.q  D7,     D9,    0x02 //c0[3] c1[3] c2[3] c3[3]
+
+    xvfmul.d      D6,    U10,    VALPHAR  // D6 = re * alpha_r
+    xvfmul.d      D7,    U11,    VALPHAR  // D7 = im * alpha_r
+    XVNMSUB      D6,    U11,    VALPHAI, D6  // real part: D6 - im * alpha_i
+    XVFMADD      D7,    U10,    VALPHAI, D7  // imaginary part: D7 + re * alpha_i
+
+    xvand.v    D4,     D6,    D6
+    xvpermi.q  D4,     D7,    0x02 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+
+    xvand.v    D5,     D7,    D7
+    xvpermi.q  D5,     D6,    0x31 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+
+    xvand.v    D0,     D10,    D10  // D0 = row 4 of columns 0 and 1
+    xvand.v    D1,     D11,    D11  // D1 = row 4 of columns 2 and 3
+
+    xvpermi.q  D0,     D4,     0x02 //c0: 0 1 2 3
+    xvpermi.q  D4,     D10,    0x31 //c1: 0 1 2 3
+    xvpermi.q  D1,     D5,     0x02 //c2: 0 1 2 3
+    xvpermi.q  D5,     D11,    0x31 //c3: 0 1 2 3
+
+    xvst       D0,     C0,    0x00  // two complex results per column
+    xvst       D4,     C1,    0x00
+    xvst       D1,     C2,    0x00
+    xvst       D5,     C3,    0x00
+
+    addi.d     C0,     C0,    0x20  // advance each column pointer by 2 complex doubles (32 bytes)
+    addi.d     C1,     C1,    0x20
+    addi.d     C2,     C2,    0x20
+    addi.d     C3,     C3,    0x20
+
+    // rows 6-7 of the tile (TRMM branch); row 6 first: res06 res16 res26 res36
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3; NOTE(review): loaded C values never reach the stores in this branch (xvfmul.d overwrites D6/D7)
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    xvand.v    D4,     D0,    D0  // D4 = D0 (register copy)
+    xvpermi.q  D4,     D1,    0x02 //c0:0 1, c1:0 1
+    xvpermi.d  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    xvand.v    D5,     D2,    D2  // D5 = D2
+    xvpermi.q  D5,     D3,    0x02 //c2:0 1, c3:0 1
+    xvpermi.d  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[0] c1[0] c2[0] c3[0]
+    xvpermi.q  D7,     D9,    0x02 //c0[1] c1[1] c2[1] c3[1]
+
+    xvfmul.d      D6,    U12,    VALPHAR  // D6 = re * alpha_r (previous D6 discarded)
+    xvfmul.d      D7,    U13,    VALPHAR  // D7 = im * alpha_r (previous D7 discarded)
+    XVNMSUB      D6,    U13,    VALPHAI, D6  // real part: D6 - im * alpha_i (assuming XVNMSUB is acc - x*y)
+    XVFMADD      D7,    U12,    VALPHAI, D7  // imaginary part: D7 + re * alpha_i
+
+    xvand.v    D10,     D6,    D6  // D10 = D6
+    xvpermi.q  D10,     D7,    0x02 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    xvand.v    D11,     D7,    D7  // D11 = D7
+    xvpermi.q  D11,     D6,    0x31 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    // row 7: res07 res17 res27 res37
+    xvand.v    D4,     D1,    D1
+    xvpermi.q  D4,     D0,    0x31 //c0:2 3, c1:2 3
+    xvpermi.d  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    xvand.v    D5,     D3,    D3
+    xvpermi.q  D5,     D2,    0x31 //c2:2 3, c3:2 3
+    xvpermi.d  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[2] c1[2] c2[2] c3[2]
+    xvpermi.q  D7,     D9,    0x02 //c0[3] c1[3] c2[3] c3[3]
+
+    xvfmul.d      D6,    U14,    VALPHAR  // D6 = re * alpha_r
+    xvfmul.d      D7,    U15,    VALPHAR  // D7 = im * alpha_r
+    XVNMSUB      D6,    U15,    VALPHAI, D6  // real part: D6 - im * alpha_i
+    XVFMADD      D7,    U14,    VALPHAI, D7  // imaginary part: D7 + re * alpha_i
+
+    xvand.v    D4,     D6,    D6
+    xvpermi.q  D4,     D7,    0x02 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+
+    xvand.v    D5,     D7,    D7
+    xvpermi.q  D5,     D6,    0x31 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+
+    xvand.v    D0,     D10,    D10  // D0 = row 6 of columns 0 and 1
+    xvand.v    D1,     D11,    D11  // D1 = row 6 of columns 2 and 3
+
+    xvpermi.q  D0,     D4,     0x02 //c0: 0 1 2 3
+    xvpermi.q  D4,     D10,    0x31 //c1: 0 1 2 3
+    xvpermi.q  D1,     D5,     0x02 //c2: 0 1 2 3
+    xvpermi.q  D5,     D11,    0x31 //c3: 0 1 2 3
+
+    xvst       D0,     C0,    0x00  // two complex results per column
+    xvst       D4,     C1,    0x00
+    xvst       D1,     C2,    0x00
+    xvst       D5,     C3,    0x00
+
+    addi.d     C0,     C0,    0x20  // advance each column pointer by 2 complex doubles (32 bytes)
+    addi.d     C1,     C1,    0x20
+    addi.d     C2,     C2,    0x20
+    addi.d     C3,     C3,    0x20
+#else
+    //res00 res10 res20 res30
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    xvand.v    D4,     D0,    D0
+    xvpermi.q  D4,     D1,    0x02 //c0:0 1, c1:0 1
+    xvpermi.d  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    xvand.v    D5,     D2,    D2
+    xvpermi.q  D5,     D3,    0x02 //c2:0 1, c3:0 1
+    xvpermi.d  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[0] c1[0] c2[0] c3[0]
+    xvpermi.q  D7,     D9,    0x02 //c0[1] c1[1] c2[1] c3[1]
+
+    XVFMADD      D6,    U0,    VALPHAR, D6
+    XVFMADD      D7,    U1,    VALPHAR, D7
+    XVNMSUB      D6,    U1,    VALPHAI, D6
+    XVFMADD      D7,    U0,    VALPHAI, D7
+
+    xvand.v    D10,     D6,    D6
+    xvpermi.q  D10,     D7,    0x02 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    xvand.v    D11,     D7,    D7
+    xvpermi.q  D11,     D6,    0x31 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res01 res11 res21 res31
+    xvand.v    D4,     D1,    D1
+    xvpermi.q  D4,     D0,    0x31 //c0:2 3, c1:2 3
+    xvpermi.d  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    xvand.v    D5,     D3,    D3
+    xvpermi.q  D5,     D2,    0x31 //c2:2 3, c3:2 3
+    xvpermi.d  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[2] c1[2] c2[2] c3[2]
+    xvpermi.q  D7,     D9,    0x02 //c0[3] c1[3] c2[3] c3[3]
+
+    XVFMADD      D6,    U2,    VALPHAR, D6
+    XVFMADD      D7,    U3,    VALPHAR, D7
+    XVNMSUB      D6,    U3,    VALPHAI, D6
+    XVFMADD      D7,    U2,    VALPHAI, D7
+
+    xvand.v    D4,     D6,    D6
+    xvpermi.q  D4,     D7,    0x02 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+
+    xvand.v    D5,     D7,    D7
+    xvpermi.q  D5,     D6,    0x31 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+
+    xvand.v    D0,     D10,    D10
+    xvand.v    D1,     D11,    D11
+
+    xvpermi.q  D0,     D4,     0x02 //c0: 0 1 2 3
+    xvpermi.q  D4,     D10,    0x31 //c1: 0 1 2 3
+    xvpermi.q  D1,     D5,     0x02 //c2: 0 1 2 3
+    xvpermi.q  D5,     D11,    0x31 //c3: 0 1 2 3
+
+    xvst       D0,     C0,    0x00
+    xvst       D4,     C1,    0x00
+    xvst       D1,     C2,    0x00
+    xvst       D5,     C3,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+    addi.d     C2,     C2,    0x20
+    addi.d     C3,     C3,    0x20
+
+    //res02 res12 res22 res32
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    xvand.v    D4,     D0,    D0
+    xvpermi.q  D4,     D1,    0x02 //c0:0 1, c1:0 1
+    xvpermi.d  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    xvand.v    D5,     D2,    D2
+    xvpermi.q  D5,     D3,    0x02 //c2:0 1, c3:0 1
+    xvpermi.d  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[0] c1[0] c2[0] c3[0]
+    xvpermi.q  D7,     D9,    0x02 //c0[1] c1[1] c2[1] c3[1]
+
+    XVFMADD      D6,    U4,    VALPHAR, D6
+    XVFMADD      D7,    U5,    VALPHAR, D7
+    XVNMSUB      D6,    U5,    VALPHAI, D6
+    XVFMADD      D7,    U4,    VALPHAI, D7
+
+    xvand.v    D10,     D6,    D6
+    xvpermi.q  D10,     D7,    0x02 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    xvand.v    D11,     D7,    D7
+    xvpermi.q  D11,     D6,    0x31 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res03 res13 res23 res33
+    xvand.v    D4,     D1,    D1
+    xvpermi.q  D4,     D0,    0x31 //c0:2 3, c1:2 3
+    xvpermi.d  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    xvand.v    D5,     D3,    D3
+    xvpermi.q  D5,     D2,    0x31 //c2:2 3, c3:2 3
+    xvpermi.d  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[2] c1[2] c2[2] c3[2]
+    xvpermi.q  D7,     D9,    0x02 //c0[3] c1[3] c2[3] c3[3]
+
+    XVFMADD      D6,    U6,    VALPHAR, D6
+    XVFMADD      D7,    U7,    VALPHAR, D7
+    XVNMSUB      D6,    U7,    VALPHAI, D6
+    XVFMADD      D7,    U6,    VALPHAI, D7
+
+    xvand.v    D4,     D6,    D6
+    xvpermi.q  D4,     D7,    0x02 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+
+    xvand.v    D5,     D7,    D7
+    xvpermi.q  D5,     D6,    0x31 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+
+    xvand.v    D0,     D10,    D10
+    xvand.v    D1,     D11,    D11
+
+    xvpermi.q  D0,     D4,     0x02 //c0: 0 1 2 3
+    xvpermi.q  D4,     D10,    0x31 //c1: 0 1 2 3
+    xvpermi.q  D1,     D5,     0x02 //c2: 0 1 2 3
+    xvpermi.q  D5,     D11,    0x31 //c3: 0 1 2 3
+
+    xvst       D0,     C0,    0x00
+    xvst       D4,     C1,    0x00
+    xvst       D1,     C2,    0x00
+    xvst       D5,     C3,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+    addi.d     C2,     C2,    0x20
+    addi.d     C3,     C3,    0x20
+
+    //res04 res14 res24 res34
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    xvand.v    D4,     D0,    D0
+    xvpermi.q  D4,     D1,    0x02 //c0:0 1, c1:0 1
+    xvpermi.d  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    xvand.v    D5,     D2,    D2
+    xvpermi.q  D5,     D3,    0x02 //c2:0 1, c3:0 1
+    xvpermi.d  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[0] c1[0] c2[0] c3[0]
+    xvpermi.q  D7,     D9,    0x02 //c0[1] c1[1] c2[1] c3[1]
+
+    XVFMADD      D6,    U8,    VALPHAR, D6
+    XVFMADD      D7,    U9,    VALPHAR, D7
+    XVNMSUB      D6,    U9,    VALPHAI, D6
+    XVFMADD      D7,    U8,    VALPHAI, D7
+
+    xvand.v    D10,     D6,    D6
+    xvpermi.q  D10,     D7,    0x02 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    xvand.v    D11,     D7,    D7
+    xvpermi.q  D11,     D6,    0x31 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res05 res15 res25 res35
+    xvand.v    D4,     D1,    D1
+    xvpermi.q  D4,     D0,    0x31 //c0:2 3, c1:2 3
+    xvpermi.d  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    xvand.v    D5,     D3,    D3
+    xvpermi.q  D5,     D2,    0x31 //c2:2 3, c3:2 3
+    xvpermi.d  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[2] c1[2] c2[2] c3[2]
+    xvpermi.q  D7,     D9,    0x02 //c0[3] c1[3] c2[3] c3[3]
+
+    XVFMADD      D6,    U10,    VALPHAR, D6
+    XVFMADD      D7,    U11,    VALPHAR, D7
+    XVNMSUB      D6,    U11,    VALPHAI, D6
+    XVFMADD      D7,    U10,    VALPHAI, D7
+
+    xvand.v    D4,     D6,    D6
+    xvpermi.q  D4,     D7,    0x02 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+
+    xvand.v    D5,     D7,    D7
+    xvpermi.q  D5,     D6,    0x31 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+
+    xvand.v    D0,     D10,    D10
+    xvand.v    D1,     D11,    D11
+
+    xvpermi.q  D0,     D4,     0x02 //c0: 0 1 2 3
+    xvpermi.q  D4,     D10,    0x31 //c1: 0 1 2 3
+    xvpermi.q  D1,     D5,     0x02 //c2: 0 1 2 3
+    xvpermi.q  D5,     D11,    0x31 //c3: 0 1 2 3
+
+    xvst       D0,     C0,    0x00
+    xvst       D4,     C1,    0x00
+    xvst       D1,     C2,    0x00
+    xvst       D5,     C3,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+    addi.d     C2,     C2,    0x20
+    addi.d     C3,     C3,    0x20
+
+    //res06 res16 res26 res36
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    xvand.v    D4,     D0,    D0
+    xvpermi.q  D4,     D1,    0x02 //c0:0 1, c1:0 1
+    xvpermi.d  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    xvand.v    D5,     D2,    D2
+    xvpermi.q  D5,     D3,    0x02 //c2:0 1, c3:0 1
+    xvpermi.d  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[0] c1[0] c2[0] c3[0]
+    xvpermi.q  D7,     D9,    0x02 //c0[1] c1[1] c2[1] c3[1]
+
+    XVFMADD      D6,    U12,    VALPHAR, D6
+    XVFMADD      D7,    U13,    VALPHAR, D7
+    XVNMSUB      D6,    U13,    VALPHAI, D6
+    XVFMADD      D7,    U12,    VALPHAI, D7
+
+    xvand.v    D10,     D6,    D6
+    xvpermi.q  D10,     D7,    0x02 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    xvand.v    D11,     D7,    D7
+    xvpermi.q  D11,     D6,    0x31 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res07 res17 res27 res37
+    xvand.v    D4,     D1,    D1
+    xvpermi.q  D4,     D0,    0x31 //c0:2 3, c1:2 3
+    xvpermi.d  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    xvand.v    D5,     D3,    D3
+    xvpermi.q  D5,     D2,    0x31 //c2:2 3, c3:2 3
+    xvpermi.d  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[2] c1[2] c2[2] c3[2]
+    xvpermi.q  D7,     D9,    0x02 //c0[3] c1[3] c2[3] c3[3]
+
+    XVFMADD      D6,    U14,    VALPHAR, D6
+    XVFMADD      D7,    U15,    VALPHAR, D7
+    XVNMSUB      D6,    U15,    VALPHAI, D6
+    XVFMADD      D7,    U14,    VALPHAI, D7
+
+    xvand.v    D4,     D6,    D6
+    xvpermi.q  D4,     D7,    0x02 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+
+    xvand.v    D5,     D7,    D7
+    xvpermi.q  D5,     D6,    0x31 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+
+    xvand.v    D0,     D10,    D10
+    xvand.v    D1,     D11,    D11
+
+    xvpermi.q  D0,     D4,     0x02 //c0: 0 1 2 3
+    xvpermi.q  D4,     D10,    0x31 //c1: 0 1 2 3
+    xvpermi.q  D1,     D5,     0x02 //c2: 0 1 2 3
+    xvpermi.q  D5,     D11,    0x31 //c3: 0 1 2 3
+
+    xvst       D0,     C0,    0x00
+    xvst       D4,     C1,    0x00
+    xvst       D1,     C2,    0x00
+    xvst       D5,     C3,    0x00
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+    addi.d     C2,     C2,    0x20
+    addi.d     C3,     C3,    0x20
+#endif
+
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -8
+#else
+    addi.d     TL,     TL,   -4
+#endif
+    slli.d     T3,     TL,   0x07
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x06
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   8
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L11
+
+.L150:  /* test bm for a 4-row tile */
+    move       I,      $r0             // I = 0, zero comparand for the beq
+    andi       T0,     M,     4       // T0 = bm & 4
+    beq        I,      T0,    .L18    // bit clear: go to the (bm & 2) test
+
+.L15:  /* if (bm & 4): set up a 4-row x 4-column tile */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb (same value as the move above)
+#else
+    slli.d     T3,     OFF,  0x06     // T3 = OFF * 64
+    add.d      A0,     A0,   T3       // ptrba += OFF * 64 (A0 steps 0x40 per k)
+    slli.d     T3,     OFF,  0x06     // T3 = OFF * 64
+    add.d      B0,     B,    T3       // ptrbb = B + OFF * 64 (B0 steps 0x40 per k)
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF     // TL = bk - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   4       // TL = OFF + 4
+#else
+    addi.d     TL,     OFF,   4       // TL = OFF + 4 (same as the LEFT branch)
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    xvxor.v    U0,     U0,   U0       // zero the accumulators U0..U7
+    xvxor.v    U1,     U1,   U1
+    xvxor.v    U2,     U2,   U2
+    xvxor.v    U3,     U3,   U3
+    xvxor.v    U4,     U4,   U4
+    xvxor.v    U5,     U5,   U5
+    xvxor.v    U6,     U6,   U6
+    xvxor.v    U7,     U7,   U7
+
+    move       L,      $r0   //k-loop counter, starts at 0
+    beq        L,      TL,    .L17    // TL == 0: no k iterations
+    blt        TL,     L,     .L17    // TL < 0: no k iterations
+
+.L16:  /* for (k=0; k<temp; k++): 4 rows of A times 4 columns of B */
+    xvld       D0,     A0,    0x00  // a0ri a1ri
+    xvld       D2,     B0,    0x00  // b0ri b1ri
+    xvld       D3,     B0,    0x20  // b2ri b3ri
+
+    xvpermi.d  D4,     D0,    0x00  //a0r in all four lanes
+    xvpermi.d  D5,     D0,    0x55  //a0i in all four lanes
+
+    xvpackev.d D6,     D3,    D2    //b0r b2r b1r b3r
+    xvpermi.d  D6,     D6,    0xd8  //b0r b1r b2r b3r
+
+    xvpackod.d D7,     D3,    D2    //b0i b2i b1i b3i
+    xvpermi.d  D7,     D7,    0xd8  //b0i b1i b2i b3i
+
+    XVMADD1    U0,     D4,    D6,     U0  //00r 10r 20r 30r
+    XVMADD2    U1,     D5,    D6,     U1  //00i 10i 20i 30i
+    XVMADD3    U0,     D5,    D7,     U0  //a0i * b?i term (XVMADD1..4 are macros defined outside this view)
+    XVMADD4    U1,     D4,    D7,     U1  //a0r * b?i term
+
+    xvpermi.d  D4,     D0,    0xaa  //a1r in all four lanes
+    xvpermi.d  D5,     D0,    0xff  //a1i in all four lanes
+
+    XVMADD1    U2,     D4,    D6,     U2  //01r 11r 21r 31r
+    XVMADD2    U3,     D5,    D6,     U3  //01i 11i 21i 31i
+    XVMADD3    U2,     D5,    D7,     U2
+    XVMADD4    U3,     D4,    D7,     U3
+
+    xvld       D0,     A0,    0x20  // a2ri a3ri
+
+    xvpermi.d  D4,     D0,    0x00  //a2r in all four lanes
+    xvpermi.d  D5,     D0,    0x55  //a2i in all four lanes
+
+    XVMADD1    U4,     D4,    D6,     U4  //02r 12r 22r 32r
+    XVMADD2    U5,     D5,    D6,     U5  //02i 12i 22i 32i
+    XVMADD3    U4,     D5,    D7,     U4
+    XVMADD4    U5,     D4,    D7,     U5
+
+    xvpermi.d  D4,     D0,    0xaa  //a3r in all four lanes
+    xvpermi.d  D5,     D0,    0xff  //a3i in all four lanes
+
+    XVMADD1    U6,     D4,    D6,     U6  //03r 13r 23r 33r
+    XVMADD2    U7,     D5,    D6,     U7  //03i 13i 23i 33i
+    XVMADD3    U6,     D5,    D7,     U6
+    XVMADD4    U7,     D4,    D7,     U7
+
+    addi.d     A0,     A0,    0x40        // ptrba += 64 bytes (the a0..a3 pairs read above)
+    addi.d     B0,     B0,    0x40        // ptrbb += 64 bytes (the b0..b3 pairs read above)
+
+    addi.d     L,      L,     1           // ++k
+    blt        L,      TL,     .L16       // loop while k < TL
+
+.L17:  /* write back the 4-row x 4-column tile accumulated in U0..U7 */
+#if defined(TRMMKERNEL)
+    //res00 res10 res20 res30 (tile row 0: U0 = re sums, U1 = im sums)
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3 (NOTE(review): TRMM path -- the C data loaded here is overwritten by xvfmul.d before use)
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    xvand.v    D4,     D0,    D0
+    xvpermi.q  D4,     D1,    0x02 //c0:0 1, c1:0 1
+    xvpermi.d  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    xvand.v    D5,     D2,    D2
+    xvpermi.q  D5,     D3,    0x02 //c2:0 1, c3:0 1
+    xvpermi.d  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[0] c1[0] c2[0] c3[0]
+    xvpermi.q  D7,     D9,    0x02 //c0[1] c1[1] c2[1] c3[1]
+
+    xvfmul.d      D6,    U0,    VALPHAR        // D6 = U0 * VALPHAR (replaces the shuffled C values)
+    xvfmul.d      D7,    U1,    VALPHAR        // D7 = U1 * VALPHAR (replaces the shuffled C values)
+    XVNMSUB      D6,    U1,    VALPHAI, D6     // presumably D6 -= U1 * VALPHAI (macro defined outside this view)
+    XVFMADD      D7,    U0,    VALPHAI, D7     // presumably D7 += U0 * VALPHAI (macro defined outside this view)
+
+    xvand.v    D10,     D6,    D6
+    xvpermi.q  D10,     D7,    0x02 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    xvand.v    D11,     D7,    D7
+    xvpermi.q  D11,     D6,    0x31 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res01 res11 res21 res31 (tile row 1: U2 = re sums, U3 = im sums)
+    xvand.v    D4,     D1,    D1
+    xvpermi.q  D4,     D0,    0x31 //c0:2 3, c1:2 3
+    xvpermi.d  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    xvand.v    D5,     D3,    D3
+    xvpermi.q  D5,     D2,    0x31 //c2:2 3, c3:2 3
+    xvpermi.d  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[2] c1[2] c2[2] c3[2]
+    xvpermi.q  D7,     D9,    0x02 //c0[3] c1[3] c2[3] c3[3]
+
+    xvfmul.d      D6,    U2,    VALPHAR        // D6 = U2 * VALPHAR (replaces the shuffled C values)
+    xvfmul.d      D7,    U3,    VALPHAR        // D7 = U3 * VALPHAR (replaces the shuffled C values)
+    XVNMSUB      D6,    U3,    VALPHAI, D6
+    XVFMADD      D7,    U2,    VALPHAI, D7
+
+    xvand.v    D4,     D6,    D6
+    xvpermi.q  D4,     D7,    0x02 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+
+    xvand.v    D5,     D7,    D7
+    xvpermi.q  D5,     D6,    0x31 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+
+    xvand.v    D0,     D10,    D10
+    xvand.v    D1,     D11,    D11
+
+    xvpermi.q  D0,     D4,     0x02 //c0: 0 1 2 3
+    xvpermi.q  D4,     D10,    0x31 //c1: 0 1 2 3
+    xvpermi.q  D1,     D5,     0x02 //c2: 0 1 2 3
+    xvpermi.q  D5,     D11,    0x31 //c3: 0 1 2 3
+
+    xvst       D0,     C0,    0x00             // tile rows 0-1 of column C0
+    xvst       D4,     C1,    0x00             // tile rows 0-1 of column C1
+    xvst       D1,     C2,    0x00             // tile rows 0-1 of column C2
+    xvst       D5,     C3,    0x00             // tile rows 0-1 of column C3
+
+    addi.d     C0,     C0,    0x20             // step each column pointer past the 32 bytes stored
+    addi.d     C1,     C1,    0x20
+    addi.d     C2,     C2,    0x20
+    addi.d     C3,     C3,    0x20
+
+    //res02 res12 res22 res32 (tile row 2: U4 = re sums, U5 = im sums)
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3 (NOTE(review): TRMM path -- overwritten by xvfmul.d before use)
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    xvand.v    D4,     D0,    D0
+    xvpermi.q  D4,     D1,    0x02 //c0:0 1, c1:0 1
+    xvpermi.d  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    xvand.v    D5,     D2,    D2
+    xvpermi.q  D5,     D3,    0x02 //c2:0 1, c3:0 1
+    xvpermi.d  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[0] c1[0] c2[0] c3[0]
+    xvpermi.q  D7,     D9,    0x02 //c0[1] c1[1] c2[1] c3[1]
+
+    xvfmul.d      D6,    U4,    VALPHAR
+    xvfmul.d      D7,    U5,    VALPHAR
+    XVNMSUB      D6,    U5,    VALPHAI, D6
+    XVFMADD      D7,    U4,    VALPHAI, D7
+
+    xvand.v    D10,     D6,    D6
+    xvpermi.q  D10,     D7,    0x02 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    xvand.v    D11,     D7,    D7
+    xvpermi.q  D11,     D6,    0x31 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res03 res13 res23 res33 (tile row 3: U6 = re sums, U7 = im sums)
+    xvand.v    D4,     D1,    D1
+    xvpermi.q  D4,     D0,    0x31 //c0:2 3, c1:2 3
+    xvpermi.d  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    xvand.v    D5,     D3,    D3
+    xvpermi.q  D5,     D2,    0x31 //c2:2 3, c3:2 3
+    xvpermi.d  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[2] c1[2] c2[2] c3[2]
+    xvpermi.q  D7,     D9,    0x02 //c0[3] c1[3] c2[3] c3[3]
+
+    xvfmul.d      D6,    U6,    VALPHAR
+    xvfmul.d      D7,    U7,    VALPHAR
+    XVNMSUB      D6,    U7,    VALPHAI, D6
+    XVFMADD      D7,    U6,    VALPHAI, D7
+
+    xvand.v    D4,     D6,    D6
+    xvpermi.q  D4,     D7,    0x02 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+
+    xvand.v    D5,     D7,    D7
+    xvpermi.q  D5,     D6,    0x31 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+
+    xvand.v    D0,     D10,    D10
+    xvand.v    D1,     D11,    D11
+
+    xvpermi.q  D0,     D4,     0x02 //c0: 0 1 2 3
+    xvpermi.q  D4,     D10,    0x31 //c1: 0 1 2 3
+    xvpermi.q  D1,     D5,     0x02 //c2: 0 1 2 3
+    xvpermi.q  D5,     D11,    0x31 //c3: 0 1 2 3
+
+    xvst       D0,     C0,    0x00             // tile rows 2-3 of column C0
+    xvst       D4,     C1,    0x00             // tile rows 2-3 of column C1
+    xvst       D1,     C2,    0x00             // tile rows 2-3 of column C2
+    xvst       D5,     C3,    0x00             // tile rows 2-3 of column C3
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+    addi.d     C2,     C2,    0x20
+    addi.d     C3,     C3,    0x20
+#else
+    //res00 res10 res20 res30 (tile row 0: U0 = re sums, U1 = im sums)
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3 (doubles 0,1 = re,im of one row; 2,3 = the next row)
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    xvand.v    D4,     D0,    D0
+    xvpermi.q  D4,     D1,    0x02 //c0:0 1, c1:0 1
+    xvpermi.d  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    xvand.v    D5,     D2,    D2
+    xvpermi.q  D5,     D3,    0x02 //c2:0 1, c3:0 1
+    xvpermi.d  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[0] c1[0] c2[0] c3[0]
+    xvpermi.q  D7,     D9,    0x02 //c0[1] c1[1] c2[1] c3[1]
+
+    XVFMADD      D6,    U0,    VALPHAR, D6     // presumably D6 += U0 * VALPHAR (macro defined outside this view)
+    XVFMADD      D7,    U1,    VALPHAR, D7     // presumably D7 += U1 * VALPHAR
+    XVNMSUB      D6,    U1,    VALPHAI, D6     // presumably D6 -= U1 * VALPHAI
+    XVFMADD      D7,    U0,    VALPHAI, D7     // presumably D7 += U0 * VALPHAI
+
+    xvand.v    D10,     D6,    D6
+    xvpermi.q  D10,     D7,    0x02 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    xvand.v    D11,     D7,    D7
+    xvpermi.q  D11,     D6,    0x31 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res01 res11 res21 res31 (tile row 1: U2 = re sums, U3 = im sums)
+    xvand.v    D4,     D1,    D1
+    xvpermi.q  D4,     D0,    0x31 //c0:2 3, c1:2 3
+    xvpermi.d  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    xvand.v    D5,     D3,    D3
+    xvpermi.q  D5,     D2,    0x31 //c2:2 3, c3:2 3
+    xvpermi.d  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[2] c1[2] c2[2] c3[2]
+    xvpermi.q  D7,     D9,    0x02 //c0[3] c1[3] c2[3] c3[3]
+
+    XVFMADD      D6,    U2,    VALPHAR, D6
+    XVFMADD      D7,    U3,    VALPHAR, D7
+    XVNMSUB      D6,    U3,    VALPHAI, D6
+    XVFMADD      D7,    U2,    VALPHAI, D7
+
+    xvand.v    D4,     D6,    D6
+    xvpermi.q  D4,     D7,    0x02 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+
+    xvand.v    D5,     D7,    D7
+    xvpermi.q  D5,     D6,    0x31 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+
+    xvand.v    D0,     D10,    D10
+    xvand.v    D1,     D11,    D11
+
+    xvpermi.q  D0,     D4,     0x02 //c0: 0 1 2 3
+    xvpermi.q  D4,     D10,    0x31 //c1: 0 1 2 3
+    xvpermi.q  D1,     D5,     0x02 //c2: 0 1 2 3
+    xvpermi.q  D5,     D11,    0x31 //c3: 0 1 2 3
+
+    xvst       D0,     C0,    0x00             // tile rows 0-1 of column C0
+    xvst       D4,     C1,    0x00             // tile rows 0-1 of column C1
+    xvst       D1,     C2,    0x00             // tile rows 0-1 of column C2
+    xvst       D5,     C3,    0x00             // tile rows 0-1 of column C3
+
+    addi.d     C0,     C0,    0x20             // step each column pointer past the 32 bytes stored
+    addi.d     C1,     C1,    0x20
+    addi.d     C2,     C2,    0x20
+    addi.d     C3,     C3,    0x20
+
+    //res02 res12 res22 res32 (tile row 2: U4 = re sums, U5 = im sums)
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    xvand.v    D4,     D0,    D0
+    xvpermi.q  D4,     D1,    0x02 //c0:0 1, c1:0 1
+    xvpermi.d  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    xvand.v    D5,     D2,    D2
+    xvpermi.q  D5,     D3,    0x02 //c2:0 1, c3:0 1
+    xvpermi.d  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[0] c1[0] c2[0] c3[0]
+    xvpermi.q  D7,     D9,    0x02 //c0[1] c1[1] c2[1] c3[1]
+
+    XVFMADD      D6,    U4,    VALPHAR, D6
+    XVFMADD      D7,    U5,    VALPHAR, D7
+    XVNMSUB      D6,    U5,    VALPHAI, D6
+    XVFMADD      D7,    U4,    VALPHAI, D7
+
+    xvand.v    D10,     D6,    D6
+    xvpermi.q  D10,     D7,    0x02 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    xvand.v    D11,     D7,    D7
+    xvpermi.q  D11,     D6,    0x31 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res03 res13 res23 res33 (tile row 3: U6 = re sums, U7 = im sums)
+    xvand.v    D4,     D1,    D1
+    xvpermi.q  D4,     D0,    0x31 //c0:2 3, c1:2 3
+    xvpermi.d  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    xvand.v    D5,     D3,    D3
+    xvpermi.q  D5,     D2,    0x31 //c2:2 3, c3:2 3
+    xvpermi.d  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[2] c1[2] c2[2] c3[2]
+    xvpermi.q  D7,     D9,    0x02 //c0[3] c1[3] c2[3] c3[3]
+
+    XVFMADD      D6,    U6,    VALPHAR, D6
+    XVFMADD      D7,    U7,    VALPHAR, D7
+    XVNMSUB      D6,    U7,    VALPHAI, D6
+    XVFMADD      D7,    U6,    VALPHAI, D7
+
+    xvand.v    D4,     D6,    D6
+    xvpermi.q  D4,     D7,    0x02 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+
+    xvand.v    D5,     D7,    D7
+    xvpermi.q  D5,     D6,    0x31 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+
+    xvand.v    D0,     D10,    D10
+    xvand.v    D1,     D11,    D11
+
+    xvpermi.q  D0,     D4,     0x02 //c0: 0 1 2 3
+    xvpermi.q  D4,     D10,    0x31 //c1: 0 1 2 3
+    xvpermi.q  D1,     D5,     0x02 //c2: 0 1 2 3
+    xvpermi.q  D5,     D11,    0x31 //c3: 0 1 2 3
+
+    xvst       D0,     C0,    0x00             // tile rows 2-3 of column C0
+    xvst       D4,     C1,    0x00             // tile rows 2-3 of column C1
+    xvst       D1,     C2,    0x00             // tile rows 2-3 of column C2
+    xvst       D5,     C3,    0x00             // tile rows 2-3 of column C3
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+    addi.d     C2,     C2,    0x20
+    addi.d     C3,     C3,    0x20
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF               // TL = bk - OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -4                // TL -= 4
+#else
+    addi.d     TL,     TL,   -4                // TL -= 4 (same as the LEFT branch)
+#endif
+    slli.d     T3,     TL,   0x06              // T3 = TL * 64
+    add.d      A0,     A0,   T3                // ptrba += TL * 64
+    add.d      B0,     B0,   T3                // ptrbb += TL * 64
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   4                // OFF += 4
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L18:   /* if (bm & 2): set up a 2-row x 4-column tile */
+    move       I,      $r0            // I = 0, zero comparand for the beq
+    andi       T0,     M,     2       // T0 = bm & 2
+    beq        I,      T0,    .L183   // bit clear: go to the (bm & 1) test
+
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb (same value as the move above)
+#else
+    slli.d     T3,     OFF,  0x05     // T3 = OFF * 32
+    add.d      A0,     A0,   T3       // ptrba += OFF * 32 (A0 steps 0x20 per k)
+    slli.d     T3,     OFF,  0x06     // T3 = OFF * 64
+    add.d      B0,     B,    T3       // ptrbb = B + OFF * 64 (B0 steps 0x40 per k)
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF     // TL = bk - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2       // TL = OFF + 2
+#else
+    addi.d     TL,     OFF,   4       // TL = OFF + 4
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    xvxor.v    U0,     U0,   U0       // zero the accumulators U0..U3
+    xvxor.v    U1,     U1,   U1
+    xvxor.v    U2,     U2,   U2
+    xvxor.v    U3,     U3,   U3
+
+    move       L,      $r0   //k-loop counter, starts at 0
+    beq        L,      TL,    .L182   // TL == 0: no k iterations
+    blt        TL,     L,     .L182   // TL < 0: no k iterations
+
+.L181:  /* for (k=0; k<temp; k++): 2 rows of A times 4 columns of B */
+    xvld       D0,     A0,    0x00  // a0ri a1ri
+    xvld       D2,     B0,    0x00  // b0ri b1ri
+    xvld       D3,     B0,    0x20  // b2ri b3ri
+
+    xvpermi.d  D4,     D0,    0x00  //a0r in all four lanes
+    xvpermi.d  D5,     D0,    0x55  //a0i in all four lanes
+
+    xvpackev.d D6,     D3,    D2    //b0r b2r b1r b3r
+    xvpermi.d  D6,     D6,    0xd8  //b0r b1r b2r b3r
+
+    xvpackod.d D7,     D3,    D2    //b0i b2i b1i b3i
+    xvpermi.d  D7,     D7,    0xd8  //b0i b1i b2i b3i
+
+    XVMADD1    U0,     D4,    D6,     U0  //00r 10r 20r 30r
+    XVMADD2    U1,     D5,    D6,     U1  //00i 10i 20i 30i
+    XVMADD3    U0,     D5,    D7,     U0  //a0i * b?i term (XVMADD1..4 are macros defined outside this view)
+    XVMADD4    U1,     D4,    D7,     U1  //a0r * b?i term
+
+    xvpermi.d  D4,     D0,    0xaa  //a1r in all four lanes
+    xvpermi.d  D5,     D0,    0xff  //a1i in all four lanes
+
+    XVMADD1    U2,     D4,    D6,     U2  //01r 11r 21r 31r
+    XVMADD2    U3,     D5,    D6,     U3  //01i 11i 21i 31i
+    XVMADD3    U2,     D5,    D7,     U2
+    XVMADD4    U3,     D4,    D7,     U3
+
+    addi.d     A0,     A0,    0x20        // ptrba += 32 bytes (the a0, a1 pairs read above)
+    addi.d     B0,     B0,    0x40        // ptrbb += 64 bytes (the b0..b3 pairs read above)
+
+    addi.d     L,      L,     1           // ++k
+    blt        L,      TL,    .L181       // loop while k < TL
+
+.L182:  /* write back the 2-row x 4-column tile accumulated in U0..U3 */
+#if defined(TRMMKERNEL)
+    //res00 res10 res20 res30 (tile row 0: U0 = re sums, U1 = im sums)
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3 (NOTE(review): TRMM path -- the C data loaded here is overwritten by xvfmul.d before use)
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    xvand.v    D4,     D0,    D0
+    xvpermi.q  D4,     D1,    0x02 //c0:0 1, c1:0 1
+    xvpermi.d  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    xvand.v    D5,     D2,    D2
+    xvpermi.q  D5,     D3,    0x02 //c2:0 1, c3:0 1
+    xvpermi.d  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[0] c1[0] c2[0] c3[0]
+    xvpermi.q  D7,     D9,    0x02 //c0[1] c1[1] c2[1] c3[1]
+
+    xvfmul.d      D6,    U0,    VALPHAR        // D6 = U0 * VALPHAR (replaces the shuffled C values)
+    xvfmul.d      D7,    U1,    VALPHAR        // D7 = U1 * VALPHAR (replaces the shuffled C values)
+    XVNMSUB      D6,    U1,    VALPHAI, D6     // presumably D6 -= U1 * VALPHAI (macro defined outside this view)
+    XVFMADD      D7,    U0,    VALPHAI, D7     // presumably D7 += U0 * VALPHAI (macro defined outside this view)
+
+    xvand.v    D10,     D6,    D6
+    xvpermi.q  D10,     D7,    0x02 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    xvand.v    D11,     D7,    D7
+    xvpermi.q  D11,     D6,    0x31 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res01 res11 res21 res31 (tile row 1: U2 = re sums, U3 = im sums)
+    xvand.v    D4,     D1,    D1
+    xvpermi.q  D4,     D0,    0x31 //c0:2 3, c1:2 3
+    xvpermi.d  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    xvand.v    D5,     D3,    D3
+    xvpermi.q  D5,     D2,    0x31 //c2:2 3, c3:2 3
+    xvpermi.d  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[2] c1[2] c2[2] c3[2]
+    xvpermi.q  D7,     D9,    0x02 //c0[3] c1[3] c2[3] c3[3]
+
+    xvfmul.d      D6,    U2,    VALPHAR        // D6 = U2 * VALPHAR (replaces the shuffled C values)
+    xvfmul.d      D7,    U3,    VALPHAR        // D7 = U3 * VALPHAR (replaces the shuffled C values)
+    XVNMSUB      D6,    U3,    VALPHAI, D6
+    XVFMADD      D7,    U2,    VALPHAI, D7
+
+    xvand.v    D4,     D6,    D6
+    xvpermi.q  D4,     D7,    0x02 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+
+    xvand.v    D5,     D7,    D7
+    xvpermi.q  D5,     D6,    0x31 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+
+    xvand.v    D0,     D10,    D10
+    xvand.v    D1,     D11,    D11
+
+    xvpermi.q  D0,     D4,     0x02 //c0: 0 1 2 3
+    xvpermi.q  D4,     D10,    0x31 //c1: 0 1 2 3
+    xvpermi.q  D1,     D5,     0x02 //c2: 0 1 2 3
+    xvpermi.q  D5,     D11,    0x31 //c3: 0 1 2 3
+
+    xvst       D0,     C0,    0x00             // both tile rows of column C0
+    xvst       D4,     C1,    0x00             // both tile rows of column C1
+    xvst       D1,     C2,    0x00             // both tile rows of column C2
+    xvst       D5,     C3,    0x00             // both tile rows of column C3
+
+    addi.d     C0,     C0,    0x20             // step each column pointer past the 32 bytes stored
+    addi.d     C1,     C1,    0x20
+    addi.d     C2,     C2,    0x20
+    addi.d     C3,     C3,    0x20
+#else
+    //res00 res10 res20 res30 (tile row 0: U0 = re sums, U1 = im sums)
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3 (doubles 0,1 = re,im of one row; 2,3 = the next row)
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3
+    xvld       D2,     C2,    0x00 //c2: 0 1 2 3
+    xvld       D3,     C3,    0x00 //c3: 0 1 2 3
+
+    xvand.v    D4,     D0,    D0
+    xvpermi.q  D4,     D1,    0x02 //c0:0 1, c1:0 1
+    xvpermi.d  D6,     D4,    0xd8 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D7,     D4,    0x8d //c0[1] c1[1] c0[0] c1[0]
+
+    xvand.v    D5,     D2,    D2
+    xvpermi.q  D5,     D3,    0x02 //c2:0 1, c3:0 1
+    xvpermi.d  D8,     D5,    0xd8 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D9,     D5,    0x8d //c2[1] c3[1] c2[0] c3[0]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[0] c1[0] c2[0] c3[0]
+    xvpermi.q  D7,     D9,    0x02 //c0[1] c1[1] c2[1] c3[1]
+
+    XVFMADD      D6,    U0,    VALPHAR, D6     // presumably D6 += U0 * VALPHAR (macro defined outside this view)
+    XVFMADD      D7,    U1,    VALPHAR, D7     // presumably D7 += U1 * VALPHAR
+    XVNMSUB      D6,    U1,    VALPHAI, D6     // presumably D6 -= U1 * VALPHAI
+    XVFMADD      D7,    U0,    VALPHAI, D7     // presumably D7 += U0 * VALPHAI
+
+    xvand.v    D10,     D6,    D6
+    xvpermi.q  D10,     D7,    0x02 //c0[0] c1[0] c0[1] c1[1]
+    xvpermi.d  D10,     D10,   0xd8 //c0[0] c0[1] c1[0] c1[1]
+
+    xvand.v    D11,     D7,    D7
+    xvpermi.q  D11,     D6,    0x31 //c2[0] c3[0] c2[1] c3[1]
+    xvpermi.d  D11,     D11,   0xd8 //c2[0] c2[1] c3[0] c3[1]
+
+    //res01 res11 res21 res31 (tile row 1: U2 = re sums, U3 = im sums)
+    xvand.v    D4,     D1,    D1
+    xvpermi.q  D4,     D0,    0x31 //c0:2 3, c1:2 3
+    xvpermi.d  D6,     D4,    0xd8 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D7,     D4,    0x8d //c0[3] c1[3] c0[2] c1[2]
+
+    xvand.v    D5,     D3,    D3
+    xvpermi.q  D5,     D2,    0x31 //c2:2 3, c3:2 3
+    xvpermi.d  D8,     D5,    0xd8 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D9,     D5,    0x8d //c2[3] c3[3] c2[2] c3[2]
+
+    xvpermi.q  D6,     D8,    0x02 //c0[2] c1[2] c2[2] c3[2]
+    xvpermi.q  D7,     D9,    0x02 //c0[3] c1[3] c2[3] c3[3]
+
+    XVFMADD      D6,    U2,    VALPHAR, D6
+    XVFMADD      D7,    U3,    VALPHAR, D7
+    XVNMSUB      D6,    U3,    VALPHAI, D6
+    XVFMADD      D7,    U2,    VALPHAI, D7
+
+    xvand.v    D4,     D6,    D6
+    xvpermi.q  D4,     D7,    0x02 //c0[2] c1[2] c0[3] c1[3]
+    xvpermi.d  D4,     D4,    0xd8 //c0[2] c0[3] c1[2] c1[3]
+
+    xvand.v    D5,     D7,    D7
+    xvpermi.q  D5,     D6,    0x31 //c2[2] c3[2] c2[3] c3[3]
+    xvpermi.d  D5,     D5,    0xd8 //c2[2] c2[3] c3[2] c3[3]
+
+    xvand.v    D0,     D10,    D10
+    xvand.v    D1,     D11,    D11
+
+    xvpermi.q  D0,     D4,     0x02 //c0: 0 1 2 3
+    xvpermi.q  D4,     D10,    0x31 //c1: 0 1 2 3
+    xvpermi.q  D1,     D5,     0x02 //c2: 0 1 2 3
+    xvpermi.q  D5,     D11,    0x31 //c3: 0 1 2 3
+
+    xvst       D0,     C0,    0x00             // both tile rows of column C0
+    xvst       D4,     C1,    0x00             // both tile rows of column C1
+    xvst       D1,     C2,    0x00             // both tile rows of column C2
+    xvst       D5,     C3,    0x00             // both tile rows of column C3
+
+    addi.d     C0,     C0,    0x20             // step each column pointer past the 32 bytes stored
+    addi.d     C1,     C1,    0x20
+    addi.d     C2,     C2,    0x20
+    addi.d     C3,     C3,    0x20
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF               // TL = bk - OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2                // TL -= 2
+#else
+    addi.d     TL,     TL,   -4                // TL -= 4
+#endif
+    slli.d     T3,     TL,   0x05              // T3 = TL * 32
+    add.d      A0,     A0,   T3                // ptrba += TL * 32
+    slli.d     T3,     TL,   0x06              // T3 = TL * 64
+    add.d      B0,     B0,   T3                // ptrbb += TL * 64
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2                // OFF += 2
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L183:   /* if (bm & 1): set up a 1-row x 4-column tile */
+    move       I,      $r0            // I = 0, zero comparand for the beq
+    andi       T0,     M,     1       // T0 = bm & 1
+    beq        I,      T0,    .L186   // bit clear: this 4-column panel is done
+
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb (same value as the move above)
+#else
+    slli.d     T3,     OFF,  0x04     // T3 = OFF * 16
+    add.d      A0,     A0,   T3       // ptrba += OFF * 16 (A0 steps 0x10 per k)
+    slli.d     T3,     OFF,  0x06     // T3 = OFF * 64
+    add.d      B0,     B,    T3       // ptrbb = B + OFF * 64 (B0 steps 0x40 per k)
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF     // TL = bk - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1       // TL = OFF + 1
+#else
+    addi.d     TL,     OFF,   4       // TL = OFF + 4
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    xvxor.v    U0,     U0,   U0       // zero the accumulators U0, U1
+    xvxor.v    U1,     U1,   U1
+
+    move       L,      $r0   //k-loop counter, starts at 0
+    beq        L,      TL,    .L185   // TL == 0: no k iterations
+    blt        TL,     L,     .L185   // TL < 0: no k iterations
+
+.L184:  /* for (k=0; k<temp; k++): 1 row of A times 4 columns of B */
+    // A holds one complex double (16 bytes) per k: broadcast its two halves
+    // with 8-byte loads so nothing beyond the current element is read.
+    xvldrepl.d D4,     A0,    0x00  //a0r in all four lanes
+    xvldrepl.d D5,     A0,    0x08  //a0i in all four lanes
+    xvld       D2,     B0,    0x00  // b0ri b1ri
+    xvld       D3,     B0,    0x20  // b2ri b3ri
+
+    xvpackev.d D6,     D3,    D2    //b0r b2r b1r b3r
+    xvpermi.d  D6,     D6,    0xd8  //b0r b1r b2r b3r
+
+    xvpackod.d D7,     D3,    D2    //b0i b2i b1i b3i
+    xvpermi.d  D7,     D7,    0xd8  //b0i b1i b2i b3i
+
+    XVMADD1    U0,     D4,    D6,     U0  //00r 10r 20r 30r
+    XVMADD2    U1,     D5,    D6,     U1  //00i 10i 20i 30i
+    XVMADD3    U0,     D5,    D7,     U0  //a0i * b?i term (XVMADD1..4 are macros defined outside this view)
+    XVMADD4    U1,     D4,    D7,     U1  //a0r * b?i term
+
+    addi.d     A0,     A0,    0x10        // ptrba += 16 bytes (one complex double)
+    addi.d     B0,     B0,    0x40        // ptrbb += 64 bytes (the b0..b3 pairs read above)
+
+    addi.d     L,      L,     1           // ++k
+    blt        L,      TL,    .L184       // loop while k < TL
+
+.L185:  /* write back the 1-row x 4-column tile: re sums in U0, im sums in U1 */
+    // Each of the columns C0..C3 receives exactly one complex double
+    // (16 bytes), stored from the low 128 bits of a vector with vst.
+    // C is only touched 16 bytes at a time here: this row can be the last
+    // row of C, where a 32-byte xvld through C0..C3 would run past the end
+    // of the column (and, for the last column, past the end of C).
+    // Lane comments: c<j>r / c<j>i = real / imaginary part for column C<j>.
+    // NOTE(review): $vr24..$vr27 are taken to be the low halves of D8..D11.
+#if defined(TRMMKERNEL)
+    //res00 res10 res20 res30
+    // TRMM stores alpha * sum without accumulating into C, so C is not
+    // read at all: D6/D7 come straight from the products below.
+    // (XVNMSUB / XVFMADD are macros defined outside this view.)
+    xvfmul.d      D6,    U0,    VALPHAR        // D6 = re sums * VALPHAR
+    xvfmul.d      D7,    U1,    VALPHAR        // D7 = im sums * VALPHAR
+    XVNMSUB      D6,    U1,    VALPHAI, D6     // presumably D6 -= U1 * VALPHAI
+    XVFMADD      D7,    U0,    VALPHAI, D7     // presumably D7 += U0 * VALPHAI
+
+    // Re-interleave the re vector (D6) and im vector (D7) into (re, im) pairs.
+    xvand.v    D10,     D6,    D6
+    xvpermi.q  D10,     D7,    0x02 //c0r c1r c0i c1i
+    xvpermi.d  D10,     D10,   0xd8 //c0r c0i c1r c1i
+
+    xvand.v    D11,     D7,    D7
+    xvpermi.q  D11,     D6,    0x31 //c2r c3r c2i c3i
+    xvpermi.d  D11,     D11,   0xd8 //c2r c2i c3r c3i
+
+    // Swap 128-bit halves so columns 1 and 3 also sit in a low half.
+    xvpermi.d  D8,     D10,   0x4e //c1r c1i c0r c0i
+    xvpermi.d  D9,     D11,   0x4e //c3r c3i c2r c2i
+
+    // 16-byte stores of the low halves, in the order D10, D8, D11, D9.
+    vst       $vr26,     C0,    0x00
+    vst       $vr24,     C1,    0x00
+    vst       $vr27,     C2,    0x00
+    vst       $vr25,     C3,    0x00
+
+    addi.d     C0,     C0,    0x10             // step past the 16 bytes stored
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+#else
+    //res00 res10 res20 res30
+    // Gather the four real parts with 8-byte broadcast loads instead of
+    // 32-byte xvld, so nothing beyond the 16-byte element is read.
+    xvldrepl.d D0,     C0,    0x00 //c0r in all four lanes
+    xvldrepl.d D1,     C1,    0x00 //c1r in all four lanes
+    xvldrepl.d D2,     C2,    0x00 //c2r in all four lanes
+    xvldrepl.d D3,     C3,    0x00 //c3r in all four lanes
+    xvpackev.d D6,     D1,    D0   //c0r c1r c0r c1r
+    xvpackev.d D8,     D3,    D2   //c2r c3r c2r c3r
+    xvpermi.q  D6,     D8,    0x02 //c0r c1r c2r c3r
+
+    // Same gather for the imaginary parts, 8 bytes further on.
+    xvldrepl.d D0,     C0,    0x08 //c0i in all four lanes
+    xvldrepl.d D1,     C1,    0x08 //c1i in all four lanes
+    xvldrepl.d D2,     C2,    0x08 //c2i in all four lanes
+    xvldrepl.d D3,     C3,    0x08 //c3i in all four lanes
+    xvpackev.d D7,     D1,    D0   //c0i c1i c0i c1i
+    xvpackev.d D9,     D3,    D2   //c2i c3i c2i c3i
+    xvpermi.q  D7,     D9,    0x02 //c0i c1i c2i c3i
+
+    XVFMADD      D6,    U0,    VALPHAR, D6     // presumably D6 += U0 * VALPHAR
+    XVFMADD      D7,    U1,    VALPHAR, D7     // presumably D7 += U1 * VALPHAR
+    XVNMSUB      D6,    U1,    VALPHAI, D6     // presumably D6 -= U1 * VALPHAI
+    XVFMADD      D7,    U0,    VALPHAI, D7     // presumably D7 += U0 * VALPHAI
+
+    // Re-interleave the re vector (D6) and im vector (D7) into (re, im) pairs.
+    xvand.v    D10,     D6,    D6
+    xvpermi.q  D10,     D7,    0x02 //c0r c1r c0i c1i
+    xvpermi.d  D10,     D10,   0xd8 //c0r c0i c1r c1i
+
+    xvand.v    D11,     D7,    D7
+    xvpermi.q  D11,     D6,    0x31 //c2r c3r c2i c3i
+    xvpermi.d  D11,     D11,   0xd8 //c2r c2i c3r c3i
+
+    // Swap 128-bit halves so columns 1 and 3 also sit in a low half.
+    xvpermi.d  D8,     D10,   0x4e //c1r c1i c0r c0i
+    xvpermi.d  D9,     D11,   0x4e //c3r c3i c2r c2i
+
+    // 16-byte stores of the low halves, in the order D10, D8, D11, D9.
+    vst       $vr26,     C0,    0x00
+    vst       $vr24,     C1,    0x00
+    vst       $vr27,     C2,    0x00
+    vst       $vr25,     C3,    0x00
+
+    addi.d     C0,     C0,    0x10             // step past the 16 bytes stored
+    addi.d     C1,     C1,    0x10
+    addi.d     C2,     C2,    0x10
+    addi.d     C3,     C3,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+    // TL = bk - OFF - (1 if LEFT, else 4); ptrba += TL * 16, ptrbb += TL * 64.
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1
+#else
+    addi.d     TL,     TL,   -4
+#endif
+    slli.d     T3,     TL,   0x04
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x06
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1                // OFF += 1
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+
+.L186:  // tail of one 4-column panel: step OFF, B and C, then loop back to .L10
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi.d     OFF,    OFF,   4  // non-LEFT TRMM: offset grows by 4 per 4-column panel
+#endif
+
+    slli.d     L,      K,     0x06  // L = K * 64
+    add.d      B,      B,     L     // B += K * 64 bytes (the panel of B just consumed)
+
+    slli.d     I,      LDC,   0x03  // I = LDC * 8, i.e. four column strides (.L20 uses LDC * 2 per column)
+    add.d      C,      C,     I     // C moves on by four columns
+
+    addi.d     J,      J,     1
+    srai.d     T0,     N,     2     // T0 = N / 4
+    blt        J,      T0,    .L10  // next 4-column panel while J < N / 4
+
+.L19:  // 4-column panels are done; test for a 2-column remainder
+    move       J,      $r0
+    andi       T0,     N,     2  // T0 = N & 2
+    beq        J,      T0,    .L30  // J is 0 here, so this branches when (N & 2) == 0
+
+.L20: /* for (j=0; j<(bn&2); j+=2) */
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET  // LEFT TRMM: restart the running offset for this panel
+#endif
+
+    move       C0,     C  // C0 = first column of the 2-column panel
+    slli.d     TL,     LDC,   1
+    add.d      C1,     C0,    TL  // C1 = C0 + LDC * 2 = second column
+    move       A0,     A    //ptrba
+
+    move       I,      $r0
+    srai.d     T0,     M,     3  //bm/8
+    beq        I,      T0,    .L24  // no full 8-row tile: go straight to the bm & 4 remainder
+
+.L21:  /* for (i=0; i<bm/8; i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,  0x07  // T3 = OFF * 0x80: A bytes per k-step in .L22
+    add.d      A0,     A0,   T3    // skip the first OFF k-steps of A
+    slli.d     T3,     OFF,  0x05  // T3 = OFF * 0x20: B bytes per k-step in .L22
+    add.d      B0,     B,    T3    // skip the first OFF k-steps of B
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF  // trip count = K - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   8  // trip count = OFF + 8 (rows in this tile)
+#else
+    addi.d     TL,     OFF,   2  // trip count = OFF + 2 (columns in this panel)
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    $vr30,     $vr30,   $vr30  // zero the 16 accumulators used by .L22 (8 real/imag pairs)
+    vxor.v    $vr31,     $vr31,   $vr31
+    vxor.v    $vr2,      $vr2,    $vr2
+    vxor.v    $vr3,      $vr3,    $vr3
+    vxor.v    $vr4,      $vr4,    $vr4
+    vxor.v    $vr5,      $vr5,    $vr5
+    vxor.v    $vr6,      $vr6,    $vr6
+    vxor.v    $vr7,      $vr7,    $vr7
+    vxor.v    $vr8,      $vr8,    $vr8
+    vxor.v    $vr9,      $vr9,    $vr9
+    vxor.v    $vr10,      $vr10,    $vr10
+    vxor.v    $vr11,      $vr11,    $vr11
+    vxor.v    $vr12,      $vr12,    $vr12
+    vxor.v    $vr13,      $vr13,    $vr13
+    vxor.v    $vr14,      $vr14,    $vr14
+    vxor.v    $vr15,      $vr15,    $vr15
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L23  // TL == 0: no k-steps, write back the zero accumulators
+    blt        TL,     L,     .L23  // TL < 0: likewise skip the loop
+
+.L22:  /* for (k=0; k<temp; k++) */
+    vld       $vr16,     A0,    0x00  // a0ri
+
+    vld       $vr18,     B0,    0x00  // b0ri
+    vld       $vr19,     B0,    0x10  // b1ri
+
+    vshuf4i.d  $vr21,     $vr16,    0x0a  //a0rr
+    vshuf4i.d  $vr22,     $vr16,    0x0f  //a0ii
+
+    vand.v    $vr23,     $vr18,    $vr18  // vr23 = copy of b0ri, so vr18 survives for the second shuffle
+    vshuf4i.d $vr23,     $vr19,    0x08 //b0r b1r
+    vshuf4i.d $vr18,     $vr19,    0x0d //b0i b1i
+
+    // NOTE(review): VMADD1..VMADD4 are macros defined outside this view; presumably they pick add/sub per conjugation variant - confirm
+    VMADD1    $vr30,     $vr21,    $vr23,     $vr30  //00r 10r
+    VMADD2    $vr31,     $vr22,    $vr23,     $vr31  //00i 10i
+    VMADD3    $vr30,     $vr22,    $vr18,     $vr30
+    VMADD4    $vr31,     $vr21,    $vr18,     $vr31
+
+    vld       $vr16,     A0,    0x10  // a1ri
+
+    vshuf4i.d  $vr21,     $vr16,    0x0a  //a1rr
+    vshuf4i.d  $vr22,     $vr16,    0x0f  //a1ii
+
+    VMADD1    $vr2,     $vr21,    $vr23,     $vr2  //01r 11r
+    VMADD2    $vr3,     $vr22,    $vr23,     $vr3  //01i 11i
+    VMADD3    $vr2,     $vr22,    $vr18,     $vr2
+    VMADD4    $vr3,     $vr21,    $vr18,     $vr3
+
+    vld       $vr16,     A0,    0x20  // a2ri
+
+    vshuf4i.d  $vr21,     $vr16,    0x0a  //a2rr
+    vshuf4i.d  $vr22,     $vr16,    0x0f  //a2ii
+
+    VMADD1    $vr4,     $vr21,    $vr23,     $vr4  //02r 12r
+    VMADD2    $vr5,     $vr22,    $vr23,     $vr5  //02i 12i
+    VMADD3    $vr4,     $vr22,    $vr18,     $vr4
+    VMADD4    $vr5,     $vr21,    $vr18,     $vr5
+
+    vld       $vr16,     A0,    0x30  // a3ri
+
+    vshuf4i.d  $vr21,     $vr16,    0x0a  //a3rr
+    vshuf4i.d  $vr22,     $vr16,    0x0f  //a3ii
+
+    VMADD1    $vr6,     $vr21,    $vr23,     $vr6  //03r 13r
+    VMADD2    $vr7,     $vr22,    $vr23,     $vr7  //03i 13i
+    VMADD3    $vr6,     $vr22,    $vr18,     $vr6
+    VMADD4    $vr7,     $vr21,    $vr18,     $vr7
+
+    vld       $vr16,     A0,    0x40  // a4ri
+
+    vshuf4i.d  $vr21,     $vr16,    0x0a  //a4rr
+    vshuf4i.d  $vr22,     $vr16,    0x0f  //a4ii
+
+    VMADD1    $vr8,     $vr21,    $vr23,     $vr8  //04r 14r
+    VMADD2    $vr9,     $vr22,    $vr23,     $vr9  //04i 14i
+    VMADD3    $vr8,     $vr22,    $vr18,     $vr8
+    VMADD4    $vr9,     $vr21,    $vr18,     $vr9
+
+    vld       $vr16,     A0,    0x50  // a5ri
+
+    vshuf4i.d  $vr21,     $vr16,    0x0a  //a5rr
+    vshuf4i.d  $vr22,     $vr16,    0x0f  //a5ii
+
+    VMADD1    $vr10,     $vr21,    $vr23,     $vr10  //05r 15r
+    VMADD2    $vr11,     $vr22,    $vr23,     $vr11  //05i 15i
+    VMADD3    $vr10,     $vr22,    $vr18,     $vr10
+    VMADD4    $vr11,     $vr21,    $vr18,     $vr11
+
+    vld       $vr16,     A0,    0x60  // a6ri
+
+    vshuf4i.d  $vr21,     $vr16,    0x0a  //a6rr
+    vshuf4i.d  $vr22,     $vr16,    0x0f  //a6ii
+
+    VMADD1    $vr12,     $vr21,    $vr23,     $vr12  //06r 16r
+    VMADD2    $vr13,     $vr22,    $vr23,     $vr13  //06i 16i
+    VMADD3    $vr12,     $vr22,    $vr18,     $vr12
+    VMADD4    $vr13,     $vr21,    $vr18,     $vr13
+
+    vld       $vr16,     A0,    0x70  // a7ri
+
+    vshuf4i.d  $vr21,     $vr16,    0x0a  //a7rr
+    vshuf4i.d  $vr22,     $vr16,    0x0f  //a7ii
+
+    VMADD1    $vr14,     $vr21,    $vr23,     $vr14  //07r 17r
+    VMADD2    $vr15,     $vr22,    $vr23,     $vr15  //07i 17i
+    VMADD3    $vr14,     $vr22,    $vr18,     $vr14
+    VMADD4    $vr15,     $vr21,    $vr18,     $vr15
+
+    addi.d     A0,     A0,    0x80  // 8 A elements of 0x10 bytes consumed per k-step
+    addi.d     B0,     B0,    0x20  // 2 B elements of 0x10 bytes consumed per k-step
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L22
+
+.L23:  // write back the 8x2 tile, one row (a C0 entry and a C1 entry) per sub-block
+#if defined(TRMMKERNEL)
+    //res00 res10
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16  // [c0 re, c1 re]
+    vpackod.d $vr19,     $vr17,    $vr16  // [c0 im, c1 im]
+
+    // NOTE(review): $vr28/$vr29 sit where VALPHAR/VALPHAI sit in the LASX paths; presumably alpha real/imag - confirm
+    vfmul.d      $vr18,    $vr30,    $vr28  // overwrites vr18: the C values loaded above are not used in the TRMM path
+    vfmul.d      $vr19,    $vr31,    $vr28  // likewise overwrites vr19
+    VNMSUB      $vr18,    $vr31,    $vr29, $vr18
+    VFMADD      $vr19,    $vr30,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18  // re-interleave to [re, im] for c0
+    vpackod.d $vr17,     $vr19,    $vr18  // re-interleave to [re, im] for c1
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res01 res11
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    vfmul.d      $vr18,    $vr2,    $vr28
+    vfmul.d      $vr19,    $vr3,    $vr28
+    VNMSUB      $vr18,    $vr3,    $vr29, $vr18
+    VFMADD      $vr19,    $vr2,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res02 res12
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    vfmul.d      $vr18,    $vr4,    $vr28
+    vfmul.d      $vr19,    $vr5,    $vr28
+    VNMSUB      $vr18,    $vr5,    $vr29, $vr18
+    VFMADD      $vr19,    $vr4,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res03 res13
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    vfmul.d      $vr18,    $vr6,    $vr28
+    vfmul.d      $vr19,    $vr7,    $vr28
+    VNMSUB      $vr18,    $vr7,    $vr29, $vr18
+    VFMADD      $vr19,    $vr6,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res04 res14
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    vfmul.d      $vr18,    $vr8,    $vr28
+    vfmul.d      $vr19,    $vr9,    $vr28
+    VNMSUB      $vr18,    $vr9,    $vr29, $vr18
+    VFMADD      $vr19,    $vr8,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res05 res15
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    vfmul.d      $vr18,    $vr10,    $vr28
+    vfmul.d      $vr19,    $vr11,    $vr28
+    VNMSUB      $vr18,    $vr11,    $vr29, $vr18
+    VFMADD      $vr19,    $vr10,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res06 res16
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    vfmul.d      $vr18,    $vr12,    $vr28
+    vfmul.d      $vr19,    $vr13,    $vr28
+    VNMSUB      $vr18,    $vr13,    $vr29, $vr18
+    VFMADD      $vr19,    $vr12,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res07 res17
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    vfmul.d      $vr18,    $vr14,    $vr28
+    vfmul.d      $vr19,    $vr15,    $vr28
+    VNMSUB      $vr18,    $vr15,    $vr29, $vr18
+    VFMADD      $vr19,    $vr14,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#else
+    //res00 res10
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16  // [c0 re, c1 re]
+    vpackod.d $vr19,     $vr17,    $vr16  // [c0 im, c1 im]
+
+    // GEMM path: the packed C values are passed as the addend, so the result accumulates into C
+    VFMADD      $vr18,    $vr30,    $vr28, $vr18
+    VFMADD      $vr19,    $vr31,    $vr28, $vr19
+    VNMSUB      $vr18,    $vr31,    $vr29, $vr18
+    VFMADD      $vr19,    $vr30,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18  // re-interleave to [re, im] for c0
+    vpackod.d $vr17,     $vr19,    $vr18  // re-interleave to [re, im] for c1
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res01 res11
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    VFMADD      $vr18,    $vr2,    $vr28, $vr18
+    VFMADD      $vr19,    $vr3,    $vr28, $vr19
+    VNMSUB      $vr18,    $vr3,    $vr29, $vr18
+    VFMADD      $vr19,    $vr2,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res02 res12
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    VFMADD      $vr18,    $vr4,    $vr28, $vr18
+    VFMADD      $vr19,    $vr5,    $vr28, $vr19
+    VNMSUB      $vr18,    $vr5,    $vr29, $vr18
+    VFMADD      $vr19,    $vr4,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res03 res13
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    VFMADD      $vr18,    $vr6,    $vr28, $vr18
+    VFMADD      $vr19,    $vr7,    $vr28, $vr19
+    VNMSUB      $vr18,    $vr7,    $vr29, $vr18
+    VFMADD      $vr19,    $vr6,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res04 res14
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    VFMADD      $vr18,    $vr8,    $vr28, $vr18
+    VFMADD      $vr19,    $vr9,    $vr28, $vr19
+    VNMSUB      $vr18,    $vr9,    $vr29, $vr18
+    VFMADD      $vr19,    $vr8,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res05 res15
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    VFMADD      $vr18,    $vr10,    $vr28, $vr18
+    VFMADD      $vr19,    $vr11,    $vr28, $vr19
+    VNMSUB      $vr18,    $vr11,    $vr29, $vr18
+    VFMADD      $vr19,    $vr10,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res06 res16
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    VFMADD      $vr18,    $vr12,    $vr28, $vr18
+    VFMADD      $vr19,    $vr13,    $vr28, $vr19
+    VNMSUB      $vr18,    $vr13,    $vr29, $vr18
+    VFMADD      $vr19,    $vr12,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res07 res17
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    VFMADD      $vr18,    $vr14,    $vr28, $vr18
+    VFMADD      $vr19,    $vr15,    $vr28, $vr19
+    VNMSUB      $vr18,    $vr15,    $vr29, $vr18
+    VFMADD      $vr19,    $vr14,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF  // in this configuration the loop ran OFF + 8 (LEFT) or OFF + 2 steps
+#ifdef LEFT
+    addi.d     TL,     TL,   -8  // TL = K - OFF - 8: k-steps the loop did not consume
+#else
+    addi.d     TL,     TL,   -2  // TL = K - OFF - 2: k-steps the loop did not consume
+#endif
+    slli.d     T3,     TL,   0x07  // 0x80 bytes of A per k-step
+    add.d      A0,     A0,   T3    // move A0 past the unconsumed k-steps
+    slli.d     T3,     TL,   0x05  // 0x20 bytes of B per k-step
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   8  // LEFT TRMM: offset grows by the 8 rows just done
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L21  // T0 still holds M / 8 from .L20
+
+.L24:   /* if ( bm & 4 ) */
+    move       I,      $r0
+    andi       T1,     M,     4    //bm&4
+    beq        I,      T1,    .L280  // I is 0, so this skips the 4-row tile when (M & 4) == 0
+
+.L25:  // 4x2 tile setup
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x06  // T3 = OFF * 0x40: A bytes per k-step in .L26
+    add.d      A0,     A0,    T3    // skip the first OFF k-steps of A
+    slli.d     T3,     OFF,   0x05  // T3 = OFF * 0x20: B bytes per k-step in .L26
+    add.d      B0,     B,     T3    // skip the first OFF k-steps of B
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF  // trip count = K - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   4  // trip count = OFF + 4 (rows in this tile)
+#else
+    addi.d     TL,     OFF,   2  // trip count = OFF + 2 (columns in this panel)
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    $vr30,     $vr30,   $vr30  // zero the 8 accumulators used by .L26 (4 real/imag pairs)
+    vxor.v    $vr31,     $vr31,   $vr31
+    vxor.v    $vr2,      $vr2,    $vr2
+    vxor.v    $vr3,      $vr3,    $vr3
+    vxor.v    $vr4,      $vr4,    $vr4
+    vxor.v    $vr5,      $vr5,    $vr5
+    vxor.v    $vr6,      $vr6,    $vr6
+    vxor.v    $vr7,      $vr7,    $vr7
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L27  // TL == 0: skip the k-loop
+    blt        TL,     L,     .L27  // TL < 0: skip the k-loop
+
+.L26:  /* for (k=0; k<temp; k++) */
+    vld       $vr16,     A0,    0x00  // a0ri
+
+    vld       $vr18,     B0,    0x00  // b0ri
+    vld       $vr19,     B0,    0x10  // b1ri
+
+    vshuf4i.d  $vr21,     $vr16,    0x0a  //a0rr
+    vshuf4i.d  $vr22,     $vr16,    0x0f  //a0ii
+
+    vand.v    $vr23,     $vr18,    $vr18  // vr23 = copy of b0ri, so vr18 survives for the second shuffle
+    vshuf4i.d $vr23,     $vr19,    0x08 //b0r b1r
+    vshuf4i.d $vr18,     $vr19,    0x0d //b0i b1i
+
+    // NOTE(review): VMADD1..VMADD4 are macros defined outside this view; presumably they pick add/sub per conjugation variant - confirm
+    VMADD1    $vr30,     $vr21,    $vr23,     $vr30  //00r 10r
+    VMADD2    $vr31,     $vr22,    $vr23,     $vr31  //00i 10i
+    VMADD3    $vr30,     $vr22,    $vr18,     $vr30
+    VMADD4    $vr31,     $vr21,    $vr18,     $vr31
+
+    vld       $vr16,     A0,    0x10  // a1ri
+
+    vshuf4i.d  $vr21,     $vr16,    0x0a  //a1rr
+    vshuf4i.d  $vr22,     $vr16,    0x0f  //a1ii
+
+    VMADD1    $vr2,     $vr21,    $vr23,     $vr2  //01r 11r
+    VMADD2    $vr3,     $vr22,    $vr23,     $vr3  //01i 11i
+    VMADD3    $vr2,     $vr22,    $vr18,     $vr2
+    VMADD4    $vr3,     $vr21,    $vr18,     $vr3
+
+    vld       $vr16,     A0,    0x20  // a2ri
+
+    vshuf4i.d  $vr21,     $vr16,    0x0a  //a2rr
+    vshuf4i.d  $vr22,     $vr16,    0x0f  //a2ii
+
+    VMADD1    $vr4,     $vr21,    $vr23,     $vr4  //02r 12r
+    VMADD2    $vr5,     $vr22,    $vr23,     $vr5  //02i 12i
+    VMADD3    $vr4,     $vr22,    $vr18,     $vr4
+    VMADD4    $vr5,     $vr21,    $vr18,     $vr5
+
+    vld       $vr16,     A0,    0x30  // a3ri
+
+    vshuf4i.d  $vr21,     $vr16,    0x0a  //a3rr
+    vshuf4i.d  $vr22,     $vr16,    0x0f  //a3ii
+
+    VMADD1    $vr6,     $vr21,    $vr23,     $vr6  //03r 13r
+    VMADD2    $vr7,     $vr22,    $vr23,     $vr7  //03i 13i
+    VMADD3    $vr6,     $vr22,    $vr18,     $vr6
+    VMADD4    $vr7,     $vr21,    $vr18,     $vr7
+
+    addi.d     A0,     A0,    0x40  // 4 A elements of 0x10 bytes consumed per k-step
+    addi.d     B0,     B0,    0x20  // 2 B elements of 0x10 bytes consumed per k-step
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L26
+
+.L27:  // write back the 4x2 tile, one row (a C0 entry and a C1 entry) per sub-block
+#if defined(TRMMKERNEL)
+    //res00 res10
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16  // [c0 re, c1 re]
+    vpackod.d $vr19,     $vr17,    $vr16  // [c0 im, c1 im]
+
+    // NOTE(review): $vr28/$vr29 sit where VALPHAR/VALPHAI sit in the LASX paths; presumably alpha real/imag - confirm
+    vfmul.d      $vr18,    $vr30,    $vr28  // overwrites vr18: the C values loaded above are not used in the TRMM path
+    vfmul.d      $vr19,    $vr31,    $vr28  // likewise overwrites vr19
+    VNMSUB      $vr18,    $vr31,    $vr29, $vr18
+    VFMADD      $vr19,    $vr30,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18  // re-interleave to [re, im] for c0
+    vpackod.d $vr17,     $vr19,    $vr18  // re-interleave to [re, im] for c1
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res01 res11
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    vfmul.d      $vr18,    $vr2,    $vr28
+    vfmul.d      $vr19,    $vr3,    $vr28
+    VNMSUB      $vr18,    $vr3,    $vr29, $vr18
+    VFMADD      $vr19,    $vr2,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res02 res12
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    vfmul.d      $vr18,    $vr4,    $vr28
+    vfmul.d      $vr19,    $vr5,    $vr28
+    VNMSUB      $vr18,    $vr5,    $vr29, $vr18
+    VFMADD      $vr19,    $vr4,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res03 res13
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    vfmul.d      $vr18,    $vr6,    $vr28
+    vfmul.d      $vr19,    $vr7,    $vr28
+    VNMSUB      $vr18,    $vr7,    $vr29, $vr18
+    VFMADD      $vr19,    $vr6,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#else
+    //res00 res10
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16  // [c0 re, c1 re]
+    vpackod.d $vr19,     $vr17,    $vr16  // [c0 im, c1 im]
+
+    // GEMM path: the packed C values are passed as the addend, so the result accumulates into C
+    VFMADD      $vr18,    $vr30,    $vr28, $vr18
+    VFMADD      $vr19,    $vr31,    $vr28, $vr19
+    VNMSUB      $vr18,    $vr31,    $vr29, $vr18
+    VFMADD      $vr19,    $vr30,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18  // re-interleave to [re, im] for c0
+    vpackod.d $vr17,     $vr19,    $vr18  // re-interleave to [re, im] for c1
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res01 res11
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    VFMADD      $vr18,    $vr2,    $vr28, $vr18
+    VFMADD      $vr19,    $vr3,    $vr28, $vr19
+    VNMSUB      $vr18,    $vr3,    $vr29, $vr18
+    VFMADD      $vr19,    $vr2,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res02 res12
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    VFMADD      $vr18,    $vr4,    $vr28, $vr18
+    VFMADD      $vr19,    $vr5,    $vr28, $vr19
+    VNMSUB      $vr18,    $vr5,    $vr29, $vr18
+    VFMADD      $vr19,    $vr4,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    //res03 res13
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vpackev.d $vr18,     $vr17,    $vr16
+    vpackod.d $vr19,     $vr17,    $vr16
+
+    VFMADD      $vr18,    $vr6,    $vr28, $vr18
+    VFMADD      $vr19,    $vr7,    $vr28, $vr19
+    VNMSUB      $vr18,    $vr7,    $vr29, $vr18
+    VFMADD      $vr19,    $vr6,    $vr29, $vr19
+
+    vpackev.d $vr16,     $vr19,    $vr18
+    vpackod.d $vr17,     $vr19,    $vr18
+
+    vst       $vr16,     C0,    0x00 //c0: 0 1
+    vst       $vr17,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF  // in this configuration the loop ran OFF + 4 (LEFT) or OFF + 2 steps
+#ifdef LEFT
+    addi.d     TL,     TL,   -4  // TL = K - OFF - 4: k-steps the loop did not consume
+#else
+    addi.d     TL,     TL,   -2  // TL = K - OFF - 2: k-steps the loop did not consume
+#endif
+    slli.d     T3,     TL,   0x06  // 0x40 bytes of A per k-step
+    add.d      A0,     A0,   T3    // move A0 past the unconsumed k-steps
+    slli.d     T3,     TL,   0x05  // 0x20 bytes of B per k-step
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   4  // LEFT TRMM: offset grows by the 4 rows just done
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L280:   /* if ( bm & 2 )*/
+    move       I,      $r0
+    andi       T1,     M,     2    //bm&2
+    beq        I,      T1,    .L284  // I is 0, so this skips the 2-row tile when (M & 2) == 0
+
+.L281:  // 2x2 tile setup (this tile uses the 256-bit xv* instructions)
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x05  // T3 = OFF * 0x20: A and B both use 0x20 bytes per k-step in .L282
+    add.d      A0,     A0,    T3    // skip the first OFF k-steps of A
+    add.d      B0,     B,     T3    // same byte offset reused for B
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF  // trip count = K - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2  // trip count = OFF + 2 (rows in this tile)
+#else
+    addi.d     TL,     OFF,   2  // trip count = OFF + 2 (columns in this panel)
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    xvxor.v    U0,     U0,   U0  // zero the real-part accumulator
+    xvxor.v    U1,     U1,   U1  // zero the imaginary-part accumulator
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L283  // TL == 0: skip the k-loop
+    blt        TL,     L,     .L283  // TL < 0: skip the k-loop
+
+.L282:  /* for (k=0; k<temp; k++) */
+    xvld       D0,     A0,    0x00  // a0ri a1ri
+    xvld       D2,     B0,    0x00  // b0ri b1ri
+
+    xvpermi.d  D1,     D0,    0xf5  //a0ii a1ii
+    xvpermi.d  D0,     D0,    0xa0  //a0rr a1rr
+
+    xvpermi.d  D3,     D2,    0xdd  //b0i b1i b0i b1i
+    xvpermi.d  D2,     D2,    0x88  //b0r b1r b0r b1r
+
+    // In the lane labels below the first digit follows the B element (column), the second the A element (row)
+    XVMADD1    U0,     D0,    D2,     U0  //00r 10r 01r 11r
+    XVMADD2    U1,     D1,    D2,     U1  //00i 10i 01i 11i
+    XVMADD3    U0,     D1,    D3,     U0
+    XVMADD4    U1,     D0,    D3,     U1
+
+    addi.d     A0,     A0,    0x20  // 2 A elements of 0x10 bytes consumed per k-step
+    addi.d     B0,     B0,    0x20  // 2 B elements of 0x10 bytes consumed per k-step
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L282
+
+.L283:  // write back the 2x2 tile
+#if defined(TRMMKERNEL)
+    //res00 res10 res01 res11
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3
+
+    xvpackev.d D2,     D1,    D0  //0 4 2 6
+    xvpackod.d D3,     D1,    D0  //1 5 3 7
+
+    xvfmul.d      D2,    U0,    VALPHAR  // overwrites D2: the C values loaded above are not used in the TRMM path
+    xvfmul.d      D3,    U1,    VALPHAR  // likewise overwrites D3
+    XVNMSUB      D2,    U1,    VALPHAI, D2
+    XVFMADD      D3,    U0,    VALPHAI, D3
+
+    xvpackev.d D4,     D3,    D2  //0 1 2 3
+    xvpackod.d D5,     D3,    D2  //4 5 6 7
+
+    xvst       D4,     C0,    0x00 //c0: 0 1 2 3
+    xvst       D5,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+#else
+    //res00 res10 res01 res11
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3
+    xvld       D1,     C1,    0x00 //c1: 0 1 2 3
+
+    xvpackev.d D2,     D1,    D0  //0 4 2 6
+    xvpackod.d D3,     D1,    D0  //1 5 3 7
+
+    // GEMM path: the packed C values are passed as the addend, so the result accumulates into C
+    XVFMADD      D2,    U0,    VALPHAR, D2
+    XVFMADD      D3,    U1,    VALPHAR, D3
+    XVNMSUB      D2,    U1,    VALPHAI, D2
+    XVFMADD      D3,    U0,    VALPHAI, D3
+
+    xvpackev.d D4,     D3,    D2  //0 1 2 3
+    xvpackod.d D5,     D3,    D2  //4 5 6 7
+
+    xvst       D4,     C0,    0x00 //c0: 0 1 2 3
+    xvst       D5,     C1,    0x00 //c1: 0 1 2 3
+
+    addi.d     C0,     C0,    0x20
+    addi.d     C1,     C1,    0x20
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF  // in this configuration the loop ran OFF + 2 steps
+#ifdef LEFT
+    addi.d     TL,     TL,   -2  // TL = K - OFF - 2: k-steps the loop did not consume
+#else
+    addi.d     TL,     TL,   -2  // same adjustment: this tile is 2 rows by 2 columns
+#endif
+    slli.d     T3,     TL,   0x05  // 0x20 bytes of A per k-step
+    add.d      A0,     A0,   T3    // move A0 past the unconsumed k-steps
+    slli.d     T3,     TL,   0x05  // 0x20 bytes of B per k-step
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2  // LEFT TRMM: offset grows by the 2 rows just done
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L284:   /* if ( bm & 1 )*/
+    move       I,      $r0
+    andi       T1,     M,     1    //bm&1
+    beq        I,      T1,    .L288  // I is 0, so this skips the 1-row tile when (M & 1) == 0
+
+.L285:  // 1x2 tile setup
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x04  // T3 = OFF * 0x10: A bytes per k-step in .L286
+    add.d      A0,     A0,    T3    // skip the first OFF k-steps of A
+    slli.d     T3,     OFF,   0x05  // T3 = OFF * 0x20: B bytes per k-step in .L286
+    add.d      B0,     B,     T3    // skip the first OFF k-steps of B
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF  // trip count = K - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1  // trip count = OFF + 1 (rows in this tile)
+#else
+    addi.d     TL,     OFF,   2  // trip count = OFF + 2 (columns in this panel)
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    $vr30,     $vr30,   $vr30  // zero the real-part accumulator
+    vxor.v    $vr31,     $vr31,   $vr31  // zero the imaginary-part accumulator
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L287  // TL == 0: skip the k-loop
+    blt        TL,     L,     .L287  // TL < 0: skip the k-loop
+
+.L286:  /* for (k=0; k<temp; k++) */
+    vld       $vr16,     A0,    0x00  // a0ri
+
+    vld       $vr18,     B0,    0x00  // b0ri
+    vld       $vr19,     B0,    0x10  // b1ri
+
+    vshuf4i.d  $vr21,     $vr16,    0x0a  //a0rr
+    vshuf4i.d  $vr22,     $vr16,    0x0f  //a0ii
+
+    vand.v    $vr23,     $vr18,    $vr18  // vr23 = copy of b0ri, so vr18 survives for the second shuffle
+    vshuf4i.d $vr23,     $vr19,    0x08 //b0r b1r
+    vshuf4i.d $vr18,     $vr19,    0x0d //b0i b1i
+
+    // NOTE(review): VMADD1..VMADD4 are macros defined outside this view; presumably they pick add/sub per conjugation variant - confirm
+    VMADD1    $vr30,     $vr21,    $vr23,     $vr30  //00r 10r
+    VMADD2    $vr31,     $vr22,    $vr23,     $vr31  //00i 10i
+    VMADD3    $vr30,     $vr22,    $vr18,     $vr30
+    VMADD4    $vr31,     $vr21,    $vr18,     $vr31
+
+    addi.d     A0,     A0,    0x10  // 1 A element of 0x10 bytes consumed per k-step
+    addi.d     B0,     B0,    0x20  // 2 B elements of 0x10 bytes consumed per k-step
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L286
+
+.L287:  // write back the 1x2 tile
+#if defined(TRMMKERNEL)
+    //res00 res10
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vand.v    $vr18,     $vr16,    $vr16  // vr18 = copy of c0, so vr16 survives for the second shuffle
+    vshuf4i.d $vr18,     $vr17,    0x08 //c0[0] c1[0]
+    vshuf4i.d $vr16,     $vr17,    0x0d //c0[1] c1[1]
+
+    // NOTE(review): $vr28/$vr29 sit where VALPHAR/VALPHAI sit in the LASX paths; presumably alpha real/imag - confirm
+    vfmul.d      $vr18,    $vr30,    $vr28  // overwrites vr18: the C values loaded above are not used in the TRMM path
+    vfmul.d      $vr16,    $vr31,    $vr28  // likewise overwrites vr16
+    VNMSUB      $vr18,    $vr31,    $vr29, $vr18
+    VFMADD      $vr16,    $vr30,    $vr29, $vr16
+
+    vand.v    $vr19,     $vr18,    $vr18  // vr19 = copy of the real lanes before re-interleaving
+    vshuf4i.d $vr19,     $vr16,    0x08 //c0[0] c0[1]
+    vshuf4i.d $vr18,     $vr16,    0x0d //c1[0] c1[1]
+
+    vst       $vr19,     C0,    0x00 //c0: 0 1
+    vst       $vr18,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#else
+    //res00 res10
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C1,    0x00 //c1: 0 1
+
+    vand.v    $vr18,     $vr16,    $vr16  // vr18 = copy of c0, so vr16 survives for the second shuffle
+    vshuf4i.d $vr18,     $vr17,    0x08 //c0[0] c1[0]
+    vshuf4i.d $vr16,     $vr17,    0x0d //c0[1] c1[1]
+
+    // GEMM path: the packed C values are passed as the addend, so the result accumulates into C
+    VFMADD      $vr18,    $vr30,    $vr28, $vr18
+    VFMADD      $vr16,    $vr31,    $vr28, $vr16
+    VNMSUB      $vr18,    $vr31,    $vr29, $vr18
+    VFMADD      $vr16,    $vr30,    $vr29, $vr16
+
+    vand.v    $vr19,     $vr18,    $vr18  // vr19 = copy of the real lanes before re-interleaving
+    vshuf4i.d $vr19,     $vr16,    0x08 //c0[0] c0[1]
+    vshuf4i.d $vr18,     $vr16,    0x0d //c1[0] c1[1]
+
+    vst       $vr19,     C0,    0x00 //c0: 0 1
+    vst       $vr18,     C1,    0x00 //c1: 0 1
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF  // in this configuration the loop ran OFF + 1 (LEFT) or OFF + 2 steps
+#ifdef LEFT
+    addi.d     TL,     TL,   -1  // TL = K - OFF - 1: k-steps the loop did not consume
+#else
+    addi.d     TL,     TL,   -2  // TL = K - OFF - 2: k-steps the loop did not consume
+#endif
+    slli.d     T3,     TL,   0x04  // 0x10 bytes of A per k-step
+    add.d      A0,     A0,   T3    // move A0 past the unconsumed k-steps
+    slli.d     T3,     TL,   0x05  // 0x20 bytes of B per k-step
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1  // LEFT TRMM: offset grows by the 1 row just done
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L288:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi.d     OFF,    OFF,   2
+#endif
+    slli.d     L,      K,     5
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   2
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     2
+    andi       T0,     N,     2
+    blt        J,      T0,    .L20
+
+.L30:
+    move       J,      $r0
+    andi       T0,     N,     1
+    beq        J,      T0,    .L999
+
+.L300:  /* for (j=0; j<(bn&1); j+=1) */
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET
+#endif
+
+    move       C0,     C
+    move       A0,     A    //ptrba
+
+    move       I,      $r0
+    srai.d     T0,     M,     3  //bm/8
+    beq        I,      T0,    .L34
+
+.L31:  /* for (i=0; i<bm/8; i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,  0x07
+    add.d      A0,     A0,   T3
+    slli.d     T3,     OFF,  0x04
+    add.d      B0,     B,    T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   8
+#else
+    addi.d     TL,     OFF,   1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    xvxor.v    U0,     U0,   U0
+    xvxor.v    U1,     U1,   U1
+    xvxor.v    U2,     U2,   U2
+    xvxor.v    U3,     U3,   U3
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L33
+    blt        TL,     L,     .L33
+
+.L32:  /* for (k=0; k<temp; k++) */
+    xvld       D0,     A0,    0x00  // a0ri a1ri
+    xvld       D1,     A0,    0x20  // a2ri a3ri
+
+    xvldrepl.d D2,     B0,    0x00 //b0r
+    xvldrepl.d D3,     B0,    0x08 //b0i
+
+    xvpackev.d D4,     D1,    D0
+    xvpermi.d  D4,     D4,    0xd8  //a0r a1r a2r a3r
+
+    xvpackod.d D5,     D1,    D0
+    xvpermi.d  D5,     D5,    0xd8  //a0i a1i a2i a3i
+
+    XVMADD1    U0,     D4,    D2,     U0  //00r 01r 02r 03r
+    XVMADD2    U1,     D5,    D2,     U1  //00i 01i 02i 03i
+    XVMADD3    U0,     D5,    D3,     U0
+    XVMADD4    U1,     D4,    D3,     U1
+
+    xvld       D0,     A0,    0x40  // a4ri a5ri
+    xvld       D1,     A0,    0x60  // a6ri a7ri
+
+    xvpackev.d D4,     D1,    D0
+    xvpermi.d  D4,     D4,    0xd8  //a4r a5r a6r a7r
+
+    xvpackod.d D5,     D1,    D0
+    xvpermi.d  D5,     D5,    0xd8  //a4i a5i a6i a7i
+
+    XVMADD1    U2,     D4,    D2,     U2  //04r 05r 06r 07r
+    XVMADD2    U3,     D5,    D2,     U3  //04i 05i 06i 07i
+    XVMADD3    U2,     D5,    D3,     U2
+    XVMADD4    U3,     D4,    D3,     U3
+
+    addi.d     A0,     A0,    0x80
+    addi.d     B0,     B0,    0x10
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L32
+
+.L33:
+#if defined(TRMMKERNEL)
+    //res00 res01 res02 res03
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3
+    xvld       D1,     C0,    0x20 //c0: 4 5 6 7
+
+    xvpackev.d D2,     D1,    D0
+    xvpermi.d  D2,     D2,    0xd8  //0 2 4 6
+    xvpackod.d D3,     D1,    D0
+    xvpermi.d  D3,     D3,    0xd8  //1 3 5 7
+
+    xvfmul.d      D2,    U0,    VALPHAR
+    xvfmul.d      D3,    U1,    VALPHAR
+    XVNMSUB      D2,    U1,    VALPHAI, D2
+    XVFMADD      D3,    U0,    VALPHAI, D3
+
+    xvand.v    D4,     D2,   D2  //0 2 4 6
+    xvpermi.q  D4,     D3,   0x02 //0 2 1 3
+    xvpermi.d  D4,     D4,   0xd8 //0 1 2 3
+
+    xvand.v    D5,     D3,   D3  //1 3 5 7
+    xvpermi.q  D5,     D2,   0x31 //4 6 5 7
+    xvpermi.d  D5,     D5,   0xd8 //4 5 6 7
+
+    xvst       D4,     C0,    0x00
+    xvst       D5,     C0,    0x20
+
+    //res04 res05 res06 res07
+    xvld       D0,     C0,    0x40 //c0: 8 9 10 11
+    xvld       D1,     C0,    0x60 //c0: 12 13 14 15
+
+    xvpackev.d D2,     D1,    D0
+    xvpermi.d  D2,     D2,    0xd8  //8 10 12 14
+    xvpackod.d D3,     D1,    D0
+    xvpermi.d  D3,     D3,    0xd8  //9 11 13 15
+
+    xvfmul.d      D2,    U2,    VALPHAR
+    xvfmul.d      D3,    U3,    VALPHAR
+    XVNMSUB      D2,    U3,    VALPHAI, D2
+    XVFMADD      D3,    U2,    VALPHAI, D3
+
+    xvand.v    D4,     D2,   D2  //8 10 12 14
+    xvpermi.q  D4,     D3,   0x02 //8 10 9 11
+    xvpermi.d  D4,     D4,   0xd8 //8 9 10 11
+
+    xvand.v    D5,     D3,   D3  //9 11 13 15
+    xvpermi.q  D5,     D2,   0x31 //12 14 13 15
+    xvpermi.d  D5,     D5,   0xd8 //12 13 14 15
+
+    xvst       D4,     C0,    0x40
+    xvst       D5,     C0,    0x60
+
+    addi.d     C0,     C0,    0x80
+#else
+    //res00 res01 res02 res03
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3
+    xvld       D1,     C0,    0x20 //c0: 4 5 6 7
+
+    xvpackev.d D2,     D1,    D0
+    xvpermi.d  D2,     D2,    0xd8  //0 2 4 6
+    xvpackod.d D3,     D1,    D0
+    xvpermi.d  D3,     D3,    0xd8  //1 3 5 7
+
+    XVFMADD      D2,    U0,    VALPHAR, D2
+    XVFMADD      D3,    U1,    VALPHAR, D3
+    XVNMSUB      D2,    U1,    VALPHAI, D2
+    XVFMADD      D3,    U0,    VALPHAI, D3
+
+    xvand.v    D4,     D2,   D2  //0 2 4 6
+    xvpermi.q  D4,     D3,   0x02 //0 2 1 3
+    xvpermi.d  D4,     D4,   0xd8 //0 1 2 3
+
+    xvand.v    D5,     D3,   D3  //1 3 5 7
+    xvpermi.q  D5,     D2,   0x31 //4 6 5 7
+    xvpermi.d  D5,     D5,   0xd8 //4 5 6 7
+
+    xvst       D4,     C0,    0x00
+    xvst       D5,     C0,    0x20
+
+    //res04 res05 res06 res07
+    xvld       D0,     C0,    0x40 //c0: 8 9 10 11
+    xvld       D1,     C0,    0x60 //c0: 12 13 14 15
+
+    xvpackev.d D2,     D1,    D0
+    xvpermi.d  D2,     D2,    0xd8  //8 10 12 14
+    xvpackod.d D3,     D1,    D0
+    xvpermi.d  D3,     D3,    0xd8  //9 11 13 15
+
+    XVFMADD      D2,    U2,    VALPHAR, D2
+    XVFMADD      D3,    U3,    VALPHAR, D3
+    XVNMSUB      D2,    U3,    VALPHAI, D2
+    XVFMADD      D3,    U2,    VALPHAI, D3
+
+    xvand.v    D4,     D2,   D2  //8 10 12 14
+    xvpermi.q  D4,     D3,   0x02 //8 10 9 11
+    xvpermi.d  D4,     D4,   0xd8 //8 9 10 11
+
+    xvand.v    D5,     D3,   D3  //9 11 13 15
+    xvpermi.q  D5,     D2,   0x31 //12 14 13 15
+    xvpermi.d  D5,     D5,   0xd8 //12 13 14 15
+
+    xvst       D4,     C0,    0x40
+    xvst       D5,     C0,    0x60
+
+    addi.d     C0,     C0,    0x80
+#endif
+
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -8
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     T3,     TL,   0x07
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x04
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   8
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L31
+
+.L34:   /* if ( bm & 4 ) */
+    move       I,      $r0
+    andi       T1,     M,     4    //bm&4
+    beq        I,      T1,    .L38
+
+.L35:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x06
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x04
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   4
+#else
+    addi.d     TL,     OFF,   1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    xvxor.v    U0,     U0,   U0
+    xvxor.v    U1,     U1,   U1
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L37
+    blt        TL,     L,     .L37
+
+.L36:  /* for (k=0; k<temp; k++) */
+    xvld       D0,     A0,    0x00  // a0ri a1ri
+    xvld       D1,     A0,    0x20  // a2ri a3ri
+
+    xvldrepl.d D2,     B0,    0x00 //b0r
+    xvldrepl.d D3,     B0,    0x08 //b0i
+
+    xvpackev.d D4,     D1,    D0
+    xvpermi.d  D4,     D4,    0xd8  //a0r a1r a2r a3r
+
+    xvpackod.d D5,     D1,    D0
+    xvpermi.d  D5,     D5,    0xd8  //a0i a1i a2i a3i
+
+    XVMADD1    U0,     D4,    D2,     U0  //00r 01r 02r 03r
+    XVMADD2    U1,     D5,    D2,     U1  //00i 01i 02i 03i
+    XVMADD3    U0,     D5,    D3,     U0
+    XVMADD4    U1,     D4,    D3,     U1
+
+    addi.d     A0,     A0,    0x40
+    addi.d     B0,     B0,    0x10
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L36
+
+.L37:
+#if defined(TRMMKERNEL)
+    //res00 res01 res02 res03
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3
+    xvld       D1,     C0,    0x20 //c0: 4 5 6 7
+
+    xvpackev.d D2,     D1,    D0
+    xvpermi.d  D2,     D2,    0xd8  //0 2 4 6
+    xvpackod.d D3,     D1,    D0
+    xvpermi.d  D3,     D3,    0xd8  //1 3 5 7
+
+    xvfmul.d      D2,    U0,    VALPHAR
+    xvfmul.d      D3,    U1,    VALPHAR
+    XVNMSUB      D2,    U1,    VALPHAI, D2
+    XVFMADD      D3,    U0,    VALPHAI, D3
+
+    xvand.v    D4,     D2,   D2  //0 2 4 6
+    xvpermi.q  D4,     D3,   0x02 //0 2 1 3
+    xvpermi.d  D4,     D4,   0xd8 //0 1 2 3
+
+    xvand.v    D5,     D3,   D3  //1 3 5 7
+    xvpermi.q  D5,     D2,   0x31 //4 6 5 7
+    xvpermi.d  D5,     D5,   0xd8 //4 5 6 7
+
+    xvst       D4,     C0,    0x00
+    xvst       D5,     C0,    0x20
+
+    addi.d     C0,     C0,    0x40
+#else
+    //res00 res01 res02 res03
+    xvld       D0,     C0,    0x00 //c0: 0 1 2 3
+    xvld       D1,     C0,    0x20 //c0: 4 5 6 7
+
+    xvpackev.d D2,     D1,    D0
+    xvpermi.d  D2,     D2,    0xd8  //0 2 4 6
+    xvpackod.d D3,     D1,    D0
+    xvpermi.d  D3,     D3,    0xd8  //1 3 5 7
+
+    XVFMADD      D2,    U0,    VALPHAR, D2
+    XVFMADD      D3,    U1,    VALPHAR, D3
+    XVNMSUB      D2,    U1,    VALPHAI, D2
+    XVFMADD      D3,    U0,    VALPHAI, D3
+
+    xvand.v    D4,     D2,   D2  //0 2 4 6
+    xvpermi.q  D4,     D3,   0x02 //0 2 1 3
+    xvpermi.d  D4,     D4,   0xd8 //0 1 2 3
+
+    xvand.v    D5,     D3,   D3  //1 3 5 7
+    xvpermi.q  D5,     D2,   0x31 //4 6 5 7
+    xvpermi.d  D5,     D5,   0xd8 //4 5 6 7
+
+    xvst       D4,     C0,    0x00
+    xvst       D5,     C0,    0x20
+
+    addi.d     C0,     C0,    0x40
+#endif
+
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -4
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     T3,     TL,   0x06
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x04
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   4
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+.L38:   /* if ( bm & 2 ) */
+    move       I,      $r0
+    andi       T1,     M,     2    //bm&2
+    beq        I,      T1,    .L312
+
+.L39:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x05
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x04
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2
+#else
+    addi.d     TL,     OFF,   1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    vxor.v    $vr30,     $vr30,   $vr30
+    vxor.v    $vr31,     $vr31,   $vr31
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L311
+    blt        TL,     L,     .L311
+
+.L310:  /* for (k=0; k<temp; k++) */
+    vld       $vr16,     A0,    0x00  // a0ri
+    vld       $vr17,     A0,    0x10  // a1ri
+
+    //vldrepl.d $vr18,     B0,    0x00 //b0rr
+    //vldrepl.d $vr19,     B0,    0x08 //b0ii
+    vld       $vr18,     B0,    0x00
+    vld       $vr19,     B0,    0x00
+    vshuf4i.d  $vr18,     $vr18,    0x00 //b0rr
+    vshuf4i.d  $vr19,     $vr19,    0x05 //b0ii
+
+    vand.v     $vr20,     $vr16,    $vr16
+    vshuf4i.d  $vr20,     $vr17,    0x08  //a0r a1r
+    vshuf4i.d  $vr16,     $vr17,    0x0d  //a0i a1i
+
+    VMADD1    $vr30,     $vr20,    $vr18,     $vr30  //00r 01r
+    VMADD2    $vr31,     $vr16,    $vr18,     $vr31  //00i 01i
+    VMADD3    $vr30,     $vr16,    $vr19,     $vr30
+    VMADD4    $vr31,     $vr20,    $vr19,     $vr31
+
+    addi.d     A0,     A0,    0x20
+    addi.d     B0,     B0,    0x10
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L310
+
+.L311:  /* write two complex results ($vr30 = real parts, $vr31 = imag parts) to C0 */
+#if defined(TRMMKERNEL)
+    //res00 res01 (TRMM: the loaded C values are overwritten by vfmul.d below)
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C0,    0x10 //c0: 2 3
+
+    vand.v    $vr18,     $vr16,    $vr16
+    vshuf4i.d $vr18,     $vr17,    0x08 //c0[0] c0[2]
+    vshuf4i.d $vr16,     $vr17,    0x0d //c0[1] c0[3]
+
+    vfmul.d      $vr18,    $vr30,    $vr28  // $vr28/$vr29: presumably alpha real/imag, set up outside this view
+    vfmul.d      $vr16,    $vr31,    $vr28
+    VNMSUB      $vr18,    $vr31,    $vr29, $vr18
+    VFMADD      $vr16,    $vr30,    $vr29, $vr16
+
+    vand.v    $vr19,     $vr18,    $vr18
+    vshuf4i.d $vr19,     $vr16,    0x08 //c0[0] c0[1]
+    vshuf4i.d $vr18,     $vr16,    0x0d //c0[2] c0[3]
+
+    vst       $vr19,     C0,    0x00 //c0: 0 1
+    vst       $vr18,     C0,    0x10 //c0: 2 3
+
+    addi.d     C0,     C0,    0x20   // advance C0 past the 32 bytes just stored
+#else
+    //res00 res01 (GEMM: results are combined with the values loaded from C0)
+    vld       $vr16,     C0,    0x00 //c0: 0 1
+    vld       $vr17,     C0,    0x10 //c0: 2 3
+
+    vand.v    $vr18,     $vr16,    $vr16
+    vshuf4i.d $vr18,     $vr17,    0x08 //c0[0] c0[2]
+    vshuf4i.d $vr16,     $vr17,    0x0d //c0[1] c0[3]
+
+    VFMADD      $vr18,    $vr30,    $vr28, $vr18
+    VFMADD      $vr16,    $vr31,    $vr28, $vr16
+    VNMSUB      $vr18,    $vr31,    $vr29, $vr18
+    VFMADD      $vr16,    $vr30,    $vr29, $vr16
+
+    vand.v    $vr19,     $vr18,    $vr18
+    vshuf4i.d $vr19,     $vr16,    0x08 //c0[0] c0[1]
+    vshuf4i.d $vr18,     $vr16,    0x0d //c0[2] c0[3]
+
+    vst       $vr19,     C0,    0x00 //c0: 0 1
+    vst       $vr18,     C0,    0x10 //c0: 2 3
+
+    addi.d     C0,     C0,    0x20   // advance C0 past the 32 bytes just stored
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     T3,     TL,   0x05
+    add.d      A0,     A0,   T3
+    slli.d     T3,     TL,   0x04
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L312:   /* if ( bm & 1 )*/
+    move       I,      $r0
+    andi       T1,     M,     1    //bm&1
+    beq        I,      T1,    .L316
+
+.L313:
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     T3,     OFF,   0x04
+    add.d      A0,     A0,    T3
+    slli.d     T3,     OFF,   0x04
+    add.d      B0,     B,     T3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1
+#else
+    addi.d     TL,     OFF,   1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0
+    MTC        c12,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L315
+    blt        TL,     L,     .L315
+
+.L314:  /* for (k=0; k<temp; k++) */
+    LD         a1,     A0,    0x00
+    LD         a2,     A0,    0x08
+
+    LD         b1,     B0,    0x00
+    LD         b2,     B0,    0x08
+
+    MADD1      c11,    a1,    b1,     c11
+    MADD2      c12,    a2,    b1,     c12
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+
+    addi.d     A0,     A0,    0x10
+    addi.d     B0,     B0,    0x10
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L314
+
+.L315:
+#if defined(TRMMKERNEL)
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x08    //C0[1]
+
+    MUL       a5,     c11,   ALPHA_R
+    MUL       a6,     c12,   ALPHA_R
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x08
+
+    addi.d     C0,     C0,    0x10
+#else
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x08    //C0[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x08
+
+    addi.d     C0,     C0,    0x10
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     T3,     TL,   0x04
+    add.d      A0,     A0,   T3
+    add.d      B0,     B0,   T3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+.L316:
+    slli.d     L,      K,     4
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   1
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     1
+    andi       T0,     N,     1
+    blt        J,      T0,    .L300
+
+.L999:  /* common exit: reload registers from the frame, pop it, return */
+    LDARG      $r23,   $sp,   0     // $r23-$r27 from offsets 0..32 (matching stores presumably in the prologue, not visible here)
+    LDARG      $r24,   $sp,   8
+    LDARG      $r25,   $sp,   16
+    LDARG      $r26,   $sp,   24
+    LDARG      $r27,   $sp,   32
+    LD         $f23,   $sp,   40    // $f23-$f31 from offsets 40..104
+    LD         $f24,   $sp,   48
+    LD         $f25,   $sp,   56
+    LD         $f26,   $sp,   64
+    LD         $f27,   $sp,   72
+    LD         $f28,   $sp,   80
+    LD         $f29,   $sp,   88
+    LD         $f30,   $sp,   96
+    LD         $f31,   $sp,   104
+
+    addi.d     $sp,    $sp,   128   // release the 128-byte frame
+    jirl       $r0,    $r1,   0x0   // jump to the address in $r1; link value discarded into $r0
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/zgemm_ncopy_2_lasx.S b/kernel/loongarch64/zgemm_ncopy_2_lasx.S
new file mode 100644
index 000000000..4bc1052a9
--- /dev/null
+++ b/kernel/loongarch64/zgemm_ncopy_2_lasx.S
@@ -0,0 +1,196 @@
+/*******************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters (param 1..5 are read from $r4..$r8) */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9    // outer counter: n >> 1 column pairs, then reused for n & 1
+#define J      $r10   // inner counter: m >> 2 groups, then m & 3 leftovers
+#define S1     $r12   // read cursor, first column of the current pair
+#define S2     $r13   // read cursor, second column (S1 + TL)
+#define S3     $r14   // S3-S8 are not referenced in this file
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define S8     $r19
+#define TD     $r20   // write cursor into DST
+#define TS     $r11   // read cursor: start of the next column group in SRC
+#define TL     $r7    // column stride in bytes (lda * 16); same register as LDA
+#define T0     $r23   // 2 * TL; $r23 is spilled to the stack around the routine
+#define ZERO   $r0    // zero operand for the compare-and-branch instructions
+
+#define F0     $f0    // F0-F7 are not referenced in this file
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LASX vectors: U0-U3 receive loads, D0-D3 receive the permuted copies (U4-U7, D4-D8 unused here) */
+#define U0     $xr0
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define D0     $xr8
+#define D1     $xr9
+#define D2     $xr10
+#define D3     $xr11
+#define D4     $xr12
+#define D5     $xr13
+#define D6     $xr14
+#define D7     $xr15
+#define D8     $xr16
+
+    PROLOGUE
+
+    addi.d     $sp,  $sp,  -8     // 8-byte slot for $r23; NOTE(review): $sp is no longer 16-byte aligned, confirm this is acceptable for a routine that makes no calls
+    SDARG      $r23, $sp,  0      // spill $r23 (used as T0) through the SDARG macro
+
+    move       TD,   DST   //boffset
+    move       TS,   SRC   //aoffset
+
+    slli.d     TL,   LDA,  0x03  //lda * 8 (TL overwrites LDA: both are $r7)
+    slli.d     TL,   TL,   0x01  // TL = lda * 16: byte stride between columns
+    slli.d     T0,   TL,   0x01  // T0 = 2 * TL: byte stride between column pairs
+
+    srai.d     I,    N,    0x01  // I = n >> 1 column pairs
+    beq        I,    ZERO,  .L_N0  // no full pair: go straight to the n & 1 tail
+
+.L_J1: /* per column pair (I counts down): interleave two source columns into TD */
+    move       S1,   TS         //a_offset1
+    add.d      S2,   TS,   TL   //a_offset2
+    srai.d     J,    M,    0x02 // J = m >> 2 groups of four 16-byte elements
+    add.d      TS,   TS,   T0   // TS now points at the next column pair
+
+    beq        J,    ZERO,  .L_I3
+
+.L_I1: /* J-- loop: four elements from each column per pass */
+    xvld       U0,   S1,   0x00  // s1[0] s1[1]  (16-byte elements)
+    xvld       U1,   S1,   0x20  // s1[2] s1[3]
+    xvld       U2,   S2,   0x00  // s2[0] s2[1]
+    xvld       U3,   S2,   0x20  // s2[2] s2[3]
+
+    xvand.v    D0,   U0,   U0    // x & x: plain register copies, the U* originals stay intact
+    xvand.v    D1,   U1,   U1
+    xvand.v    D2,   U2,   U2
+    xvand.v    D3,   U3,   U3
+
+    xvpermi.q  D0,   U2,   0x02  // D0 = s1[0] s2[0]  (0x02: own low half, then source low half)
+    xvpermi.q  D2,   U0,   0x31  // D2 = s1[1] s2[1]  (0x31: source high half, then own high half)
+    xvpermi.q  D1,   U3,   0x02  // D1 = s1[2] s2[2]
+    xvpermi.q  D3,   U1,   0x31  // D3 = s1[3] s2[3]
+
+    xvst       D0,   TD,   0x00
+    xvst       D2,   TD,   0x20
+    xvst       D1,   TD,   0x40
+    xvst       D3,   TD,   0x60
+
+    addi.d     S1,   S1,   0x40   // a_offset1 += 4 elements
+    addi.d     S2,   S2,   0x40   // a_offset2 += 4 elements
+    addi.d     TD,   TD,   0x80  // b_offset += 8 elements
+
+    addi.d     J,    J,    -1
+    blt        ZERO, J,    .L_I1
+
+.L_I3:  /* leftover rows of this column pair */
+    andi       J,    M,    0x03
+    beq        J,    ZERO, .L_II20
+
+.L_II1:  /* j = (m & 3): one element from each column per pass */
+    vld        $vr0, S1,   0x00
+    vld        $vr1, S2,   0x00
+
+    vst        $vr0, TD,   0x00
+    vst        $vr1, TD,   0x10
+
+    addi.d     S1,   S1,   0x10
+    addi.d     S2,   S2,   0x10
+    addi.d     TD,   TD,   0x20
+
+    addi.d     J,    J,    -1
+    blt        ZERO, J,    .L_II1
+
+.L_II20:  /* next column pair */
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_J1
+
+.L_N0:  /* if (n & 1): straight copy of the last column, starting at TS */
+    andi       I,     N,   0x01
+    beq        ZERO,  I,   .L_N00
+
+.L_N1:
+    srai.d     J,     M,   0x02
+    beq        ZERO,  J,   .L_N10
+
+.L_N11: /* j = (m >> 2): copy four 16-byte elements (64 bytes) per pass */
+    xvld       U0,    TS,   0x00
+    xvld       U1,    TS,   0x20
+
+    xvst       U0,    TD,   0x00
+    xvst       U1,    TD,   0x20
+
+    addi.d     TS,    TS,   0x40   // a_offset
+    addi.d     TD,    TD,   0x40   // b_offset
+
+    addi.d     J,     J,   -1
+    blt        ZERO,  J,   .L_N11
+
+.L_N10:  /* leftover rows of the last column */
+    andi       J,     M,    0x03
+    beq        J,     ZERO, .L_N00
+
+.L_N12:  /* j = (m & 3): copy one 16-byte element per pass */
+    vld        $vr0,  TS,   0x00
+    vst        $vr0,  TD,   0x00
+
+
+    addi.d     TS,    TS,   0x10   // a_offset
+    addi.d     TD,    TD,   0x10   // b_offset
+
+    addi.d     J,     J,   -1
+    blt        ZERO,  J,   .L_N12
+
+.L_N00:  /* exit: reload $r23, pop the 8-byte slot, return */
+    LDARG     $r23,  $sp, 0
+    addi.d    $sp,   $sp, 8
+    jirl      $r0,   $r1, 0x00   // jump to the address in $r1; link value discarded into $r0
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/zgemm_ncopy_4_lasx.S b/kernel/loongarch64/zgemm_ncopy_4_lasx.S
new file mode 100644
index 000000000..5d874bcc1
--- /dev/null
+++ b/kernel/loongarch64/zgemm_ncopy_4_lasx.S
@@ -0,0 +1,320 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters (param 1..5 are read from $r4..$r8) */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9    // inner counter / scratch for the m-based tests
+#define J      $r10   // outer counter: n >> 2 groups of four columns
+#define S1     $r12   // read cursor, column 1 of the current group
+#define S2     $r13   // read cursor, column 2 (S1 + TL)
+#define S3     $r14   // read cursor, column 3 (S2 + TL)
+#define S4     $r15   // read cursor, column 4 (S3 + TL)
+#define S5     $r16   // S5-S7 are not referenced in this file
+#define S6     $r17
+#define S7     $r18
+#define TD     $r20   // write cursor into DST
+#define TS     $r11   // read cursor: start of the next column group in SRC
+#define TL     $r19   // column stride in bytes (lda * 16)
+#define T0     $r23   // scratch multiple of TL; $r23 is spilled to the stack around the routine
+#define ZERO   $r0    // zero operand for the compare-and-branch instructions
+
+#define F0     $f0    // F0-F7 are not referenced in this file
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LASX vectors: U0-U7 receive loads, D0-D7 are permute destinations (D8 is not referenced) */
+#define U0     $xr0
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define D0     $xr8
+#define D1     $xr9
+#define D2     $xr10
+#define D3     $xr11
+#define D4     $xr12
+#define D5     $xr13
+#define D6     $xr14
+#define D7     $xr15
+#define D8     $xr16
+
+    PROLOGUE
+
+    addi.d     $sp,  $sp,  -8     // 8-byte slot for $r23; NOTE(review): $sp is no longer 16-byte aligned, confirm this is acceptable for a routine that makes no calls
+    SDARG      $r23, $sp,  0      // spill $r23 (used as T0) through the SDARG macro
+
+    move       TD,   DST   //boffset
+    move       TS,   SRC   //aoffset
+
+    slli.d     TL,   LDA,  0x03   // lda * 8
+    slli.d     TL,   TL,   0x01   // TL = lda * 16: byte stride between columns
+
+    srai.d     J,    N,    0x02   // J = n >> 2 groups of four columns
+    beq        J,    ZERO,  .L_N0 // no full group: go to the n & 2 / n & 1 remainders
+
+.L_J1: /* J-- : interleave one group of four source columns into TD */
+    move       S1,   TS
+    add.d      S2,   S1,   TL
+    add.d      S3,   S2,   TL
+    add.d      S4,   S3,   TL
+
+    slli.d     T0,   TL,   0x02   // T0 = 4 * TL
+    add.d      TS,   TS,   T0     // TS now points at the next group of four columns
+
+    srai.d     I,    M,    0x02   // I = m >> 2 groups of four 16-byte elements
+    beq        I,    ZERO,  .L_I3
+
+.L_I1: /* I-- : four elements from each of the four columns per pass */
+    xvld       U0,   S1,   0x00   // s1[0] s1[1]  (16-byte elements)
+    xvld       U1,   S1,   0x20   // s1[2] s1[3]
+    xvld       U2,   S2,   0x00   // s2[0] s2[1]
+    xvld       U3,   S2,   0x20   // s2[2] s2[3]
+    xvld       U4,   S3,   0x00   // s3[0] s3[1]
+    xvld       U5,   S3,   0x20   // s3[2] s3[3]
+    xvld       U6,   S4,   0x00   // s4[0] s4[1]
+    xvld       U7,   S4,   0x20   // s4[2] s4[3]
+
+    xvand.v    D0,   U0,   U0     // x & x: plain register copies, the U* originals stay intact
+    xvand.v    D1,   U1,   U1
+    xvand.v    D2,   U2,   U2
+    xvand.v    D3,   U3,   U3
+    xvand.v    D4,   U4,   U4
+    xvand.v    D5,   U5,   U5
+    xvand.v    D6,   U6,   U6
+    xvand.v    D7,   U7,   U7
+
+    xvpermi.q  D0,   U2,   0x02   // D0 = s1[0] s2[0]  (0x02: own low half, then source low half)
+    xvpermi.q  D4,   U6,   0x02   // D4 = s3[0] s4[0]
+    xvpermi.q  D2,   U0,   0x31   // D2 = s1[1] s2[1]  (0x31: source high half, then own high half)
+    xvpermi.q  D6,   U4,   0x31   // D6 = s3[1] s4[1]
+    xvpermi.q  D1,   U3,   0x02   // D1 = s1[2] s2[2]
+    xvpermi.q  D5,   U7,   0x02   // D5 = s3[2] s4[2]
+    xvpermi.q  D3,   U1,   0x31   // D3 = s1[3] s2[3]
+    xvpermi.q  D7,   U5,   0x31   // D7 = s3[3] s4[3]
+
+    xvst       D0,   TD,   0x00   // row 0: columns 1-2
+    xvst       D4,   TD,   0x20   // row 0: columns 3-4
+    xvst       D2,   TD,   0x40   // row 1
+    xvst       D6,   TD,   0x60
+    xvst       D1,   TD,   0x80   // row 2
+    xvst       D5,   TD,   0xa0
+    xvst       D3,   TD,   0xc0   // row 3
+    xvst       D7,   TD,   0xe0
+
+    addi.d     S1,   S1,   0x40   // a_offset
+    addi.d     S2,   S2,   0x40
+    addi.d     S3,   S3,   0x40
+    addi.d     S4,   S4,   0x40
+    addi.d     TD,   TD,   0x100  // b_offset
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_I1
+
+.L_I3:  /* leftover rows of this column group */
+    andi       I,    M,    0x02
+    beq        I,    ZERO, .L_II20
+
+.L_II1:  /* if(m&2): two elements from each of the four columns */
+    xvld       U0,   S1,   0x00   // s1[0] s1[1]
+    xvld       U1,   S2,   0x00   // s2[0] s2[1]
+    xvld       U2,   S3,   0x00   // s3[0] s3[1]
+    xvld       U3,   S4,   0x00   // s4[0] s4[1]
+
+    xvand.v    D0,   U0,   U0
+    xvand.v    D1,   U1,   U1
+    xvand.v    D2,   U2,   U2
+    xvand.v    D3,   U3,   U3
+
+    xvpermi.q  D0,   U1,   0x02   // D0 = s1[0] s2[0]
+    xvpermi.q  D2,   U3,   0x02   // D2 = s3[0] s4[0]
+    xvpermi.q  D1,   U0,   0x31   // D1 = s1[1] s2[1]
+    xvpermi.q  D3,   U2,   0x31   // D3 = s3[1] s4[1]
+
+    xvst       D0,   TD,   0x00
+    xvst       D2,   TD,   0x20
+    xvst       D1,   TD,   0x40
+    xvst       D3,   TD,   0x60
+
+    addi.d     S1,   S1,   0x20
+    addi.d     S2,   S2,   0x20
+    addi.d     S3,   S3,   0x20
+    addi.d     S4,   S4,   0x20
+    addi.d     TD,   TD,   0x80
+
+.L_II20:
+    andi       I,    M,    0x01
+    beq        I,    ZERO, .L_J0
+
+.L_II2:  /* if(m&1): one element from each of the four columns */
+    vld        $vr0, S1,   0x00
+    vld        $vr1, S2,   0x00
+    vld        $vr2, S3,   0x00
+    vld        $vr3, S4,   0x00
+
+    vst        $vr0, TD,   0x00
+    vst        $vr1, TD,   0x10
+    vst        $vr2, TD,   0x20
+    vst        $vr3, TD,   0x30
+
+    addi.d     TD,   TD,   0x40   // S1-S4 are not advanced; .L_J1 reloads them from TS
+
+.L_J0:  /* next group of four columns */
+    addi.d    J,     J,    -1
+    blt       ZERO,  J,    .L_J1
+
+.L_N0:  /* if(n&2): interleave one remaining pair of columns */
+    andi       I,     N,   0x02
+    beq        ZERO,  I,   .L_N20
+
+    move       S1,    TS
+    add.d      S2,    S1,   TL
+
+    slli.d     T0,    TL,   0x01   // T0 = 2 * TL
+    add.d      TS,    TS,   T0     // TS now points past this column pair
+
+    srai.d     I,     M,    0x02   // I = m >> 2 groups of four 16-byte elements
+    beq        ZERO,  I,    .L_N10
+
+.L_N11: /* I-- : four elements from each of the two columns per pass */
+    xvld       U0,    S1,   0x00   // s1[0] s1[1]
+    xvld       U1,    S1,   0x20   // s1[2] s1[3]
+    xvld       U2,    S2,   0x00   // s2[0] s2[1]
+    xvld       U3,    S2,   0x20   // s2[2] s2[3]
+
+    xvand.v    D0,    U0,   U0     // x & x: plain register copies
+    xvand.v    D1,    U1,   U1
+    xvand.v    D2,    U2,   U2
+    xvand.v    D3,    U3,   U3
+
+    xvpermi.q  D0,    U2,   0x02   // D0 = s1[0] s2[0]
+    xvpermi.q  D2,    U0,   0x31   // D2 = s1[1] s2[1]
+    xvpermi.q  D1,    U3,   0x02   // D1 = s1[2] s2[2]
+    xvpermi.q  D3,    U1,   0x31   // D3 = s1[3] s2[3]
+
+    xvst       D0,    TD,   0x00
+    xvst       D2,    TD,   0x20
+    xvst       D1,    TD,   0x40
+    xvst       D3,    TD,   0x60
+
+    addi.d     S1,    S1,   0x40   // a_offset
+    addi.d     S2,    S2,   0x40
+    addi.d     TD,    TD,   0x80   // b_offset
+
+    addi.d     I,     I,   -1
+    blt        ZERO,  I,   .L_N11
+
+.L_N10:  /* if(m&2): two elements from each of the two columns */
+    andi       I,     M,    0x02
+    beq        I,     ZERO, .L_N130
+
+    xvld       U0,    S1,   0x00   // s1[0] s1[1]
+    xvld       U1,    S2,   0x00   // s2[0] s2[1]
+    xvand.v    D0,    U0,   U0     // copy of U0; U0 itself is still read below
+
+    xvpermi.q  D0,    U1,   0x02   // D0 = s1[0] s2[0]
+    xvpermi.q  U1,    U0,   0x31   // U1 = s1[1] s2[1] (permuted in place)
+
+    xvst       D0,    TD,   0x00
+    xvst       U1,    TD,   0x20
+
+    addi.d     S1,    S1,   0x20   // a_offset
+    addi.d     S2,    S2,   0x20
+    addi.d     TD,    TD,   0x40   // b_offset
+
+.L_N130:  /* if(m&1): one element from each of the two columns */
+    andi       I,     M,    0x01
+    beq        I,     ZERO, .L_N20
+
+    vld       $vr0,    S1,   0x00
+    vld       $vr1,    S2,   0x00
+
+    vst       $vr0,    TD,   0x00
+    vst       $vr1,    TD,   0x10
+
+    addi.d     TD,    TD,   0x20
+
+.L_N20:   /* if(n&1): straight copy of the last column */
+    andi       I,     N,    0x01
+    beq        I,     ZERO, .L_N00
+
+    move       S1,   TS
+    srai.d     I,    M,    0x02   // I = m >> 2 groups of four 16-byte elements
+
+    beq        I,    ZERO, .L_N30
+
+.L_N21:  /* I-- : copy four elements (64 bytes) per pass */
+    xvld       U0,   S1,   0x00
+    xvld       U1,   S1,   0x20
+
+    xvst       U0,   TD,   0x00
+    xvst       U1,   TD,   0x20
+
+    addi.d     S1,   S1,   0x40   // aoffset1
+    addi.d     TD,   TD,   0x40   // b_offset
+
+    addi.d     I,     I,   -1
+    blt        ZERO,  I,   .L_N21
+
+.L_N30:  /* if(m&2): copy two elements (32 bytes) */
+    andi       I,     M,    0x02
+    beq        I,     ZERO, .L_N330
+
+    xvld       U0,   S1,   0x00
+    xvst       U0,   TD,   0x00
+
+    addi.d     S1,   S1,   0x20   // aoffset1
+    addi.d     TD,   TD,   0x20   // b_offset
+
+.L_N330:  /* if(m&1): copy the final element (16 bytes) */
+    andi       I,     M,    0x01
+    beq        I,     ZERO, .L_N00
+
+    vld       $vr0,   S1,   0x00
+    vst       $vr0,   TD,   0x00
+
+.L_N00:  /* exit: reload $r23, pop the 8-byte slot, return */
+    LDARG     $r23,  $sp, 0
+    addi.d    $sp,   $sp, 8
+    jirl      $r0,   $r1, 0x00   // jump to the address in $r1; link value discarded into $r0
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/zgemm_ncopy_4_lsx.S b/kernel/loongarch64/zgemm_ncopy_4_lsx.S
new file mode 100644
index 000000000..203471cbd
--- /dev/null
+++ b/kernel/loongarch64/zgemm_ncopy_4_lsx.S
@@ -0,0 +1,332 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4    // param 1: m, count of 16-byte elements taken from each source vector
+#define N      $r5    // param 2: n, count of LDA-strided source vectors
+#define SRC    $r6    // param 3: src, start of the source data
+#define LDA    $r7    // param 4: lda, stride between source vectors (scaled by 16 bytes in the code)
+#define DST    $r8    // param 5: dst, start of the packed output
+
+#define I      $r9    // inner loop counter, also scratch for the andi bit tests
+#define J      $r10   // outer loop counter (N / 4 panels)
+#define S1     $r12   // S1..S4: read pointers for the vectors of the current panel
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16   // S5..S7 are not referenced by this kernel
+#define S6     $r17
+#define S7     $r18
+#define TD     $r20   // running output pointer
+#define TS     $r11   // start of the next source panel
+#define TL     $r19   // byte stride between source vectors (LDA * 16)
+#define T0     $r23   // scratch; $r23 is saved on entry and restored before return
+#define ZERO   $r0
+
+#define F0     $f0    // F0..F7 are not referenced by this kernel
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LSX vectors */
+#define U0     $vr0   // U0..U15: 16-byte staging registers for the vld/vst copies
+#define U1     $vr1
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define U8     $vr8
+#define U9     $vr9
+#define U10    $vr10
+#define U11    $vr11
+#define U12    $vr12
+#define U13    $vr13
+#define U14    $vr14
+#define U15    $vr15
+
+    PROLOGUE
+    // Packs N LDA-strided source vectors of M 16-byte elements into DST, in panels of 4, then 2, then 1 vector.
+    addi.d     $sp,  $sp,  -8
+    SDARG      $r23, $sp,  0      // spill $r23 (used as T0); reloaded at .L_N00
+
+    move       TD,   DST   // TD: running output pointer (boffset)
+    move       TS,   SRC   // TS: start of the next source panel (aoffset)
+
+    slli.d     TL,   LDA,  0x03
+    slli.d     TL,   TL,   0x01   // TL = LDA * 16, byte stride between source vectors
+
+    srai.d     J,    N,    0x02   // J = N / 4 full four-vector panels
+    beq        J,    ZERO,  .L_N0
+
+.L_J1: /* one four-vector panel per pass, J counts down */
+    move       S1,   TS
+    add.d      S2,   S1,   TL
+    add.d      S3,   S2,   TL
+    add.d      S4,   S3,   TL     // S1..S4 point at the four vectors of this panel
+
+    slli.d     T0,   TL,   0x02
+    add.d      TS,   TS,   T0     // TS += 4 * TL, the start of the next panel
+
+    srai.d     I,    M,    0x02   // I = M / 4 unrolled passes
+    beq        I,    ZERO,  .L_I3
+
+.L_I1: /* four elements from each of the four vectors, I counts down */
+    vld       U0,   S1,   0x00
+    vld       U1,   S1,   0x10
+    vld       U2,   S1,   0x20
+    vld       U3,   S1,   0x30
+
+    vld       U4,   S2,   0x00
+    vld       U5,   S2,   0x10
+    vld       U6,   S2,   0x20
+    vld       U7,   S2,   0x30
+
+    vld       U8,   S3,   0x00
+    vld       U9,   S3,   0x10
+    vld       U10,  S3,   0x20
+    vld       U11,  S3,   0x30
+
+    vld       U12,  S4,   0x00
+    vld       U13,  S4,   0x10
+    vld       U14,  S4,   0x20
+    vld       U15,  S4,   0x30
+
+    vst       U0,   TD,   0x00    // element 0 of S1, S2, S3, S4
+    vst       U4,   TD,   0x10
+    vst       U8,   TD,   0x20
+    vst       U12,  TD,   0x30
+
+    vst       U1,   TD,   0x40    // element 1 of S1, S2, S3, S4
+    vst       U5,   TD,   0x50
+    vst       U9,   TD,   0x60
+    vst       U13,  TD,   0x70
+
+    vst       U2,   TD,   0x80    // element 2 of S1, S2, S3, S4
+    vst       U6,   TD,   0x90
+    vst       U10,  TD,   0xa0
+    vst       U14,  TD,   0xb0
+
+    vst       U3,   TD,   0xc0    // element 3 of S1, S2, S3, S4
+    vst       U7,   TD,   0xd0
+    vst       U11,  TD,   0xe0
+    vst       U15,  TD,   0xf0
+
+    addi.d     S1,   S1,   0x40   // a_offset: each source pointer moves on by 4 elements
+    addi.d     S2,   S2,   0x40
+    addi.d     S3,   S3,   0x40
+    addi.d     S4,   S4,   0x40
+    addi.d     TD,   TD,   0x100  // b_offset: 16 elements were written
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_I1
+
+.L_I3:  /* if(m&2): two leftover elements per vector */
+    andi       I,    M,    0x02
+    beq        I,    ZERO, .L_II20
+
+    vld       U0,   S1,   0x00
+    vld       U1,   S1,   0x10
+
+    vld       U2,   S2,   0x00
+    vld       U3,   S2,   0x10
+
+    vld       U4,   S3,   0x00
+    vld       U5,   S3,   0x10
+
+    vld       U6,   S4,   0x00
+    vld       U7,   S4,   0x10
+
+    vst       U0,   TD,   0x00    // element 0 of S1, S2, S3, S4
+    vst       U2,   TD,   0x10
+    vst       U4,   TD,   0x20
+    vst       U6,   TD,   0x30
+
+    vst       U1,   TD,   0x40    // element 1 of S1, S2, S3, S4
+    vst       U3,   TD,   0x50
+    vst       U5,   TD,   0x60
+    vst       U7,   TD,   0x70
+
+    addi.d     S1,   S1,   0x20
+    addi.d     S2,   S2,   0x20
+    addi.d     S3,   S3,   0x20
+    addi.d     S4,   S4,   0x20
+    addi.d     TD,   TD,   0x80
+
+.L_II20:  /* if(m&1): one leftover element per vector */
+    andi       I,    M,    0x01
+    beq        I,    ZERO, .L_J0
+
+    vld        U0,   S1,   0x00
+    vld        U1,   S2,   0x00
+    vld        U2,   S3,   0x00
+    vld        U3,   S4,   0x00
+
+    vst        U0,   TD,   0x00
+    vst        U1,   TD,   0x10
+    vst        U2,   TD,   0x20
+    vst        U3,   TD,   0x30
+
+    addi.d     TD,   TD,   0x40   // source pointers are not advanced: S1..S4 are reloaded from TS for the next panel
+
+.L_J0:
+    addi.d    J,     J,    -1
+    blt       ZERO,  J,    .L_J1
+
+.L_N0:  /* if(n&2): one two-vector panel */
+    andi       I,     N,   0x02
+    beq        ZERO,  I,   .L_N20
+
+    move       S1,    TS
+    add.d      S2,    S1,   TL   // S1, S2: the two vectors of this panel
+
+    slli.d     T0,    TL,   0x01
+    add.d      TS,    TS,   T0   // TS += 2 * TL, past this panel
+
+    srai.d     I,     M,    0x02 // I = M / 4 unrolled passes
+    beq        ZERO,  I,    .L_N10
+
+.L_N11: /* four elements from each of the two vectors, I counts down */
+    vld       U0,    S1,   0x00
+    vld       U1,    S1,   0x10
+    vld       U2,    S1,   0x20
+    vld       U3,    S1,   0x30
+
+    vld       U4,    S2,   0x00
+    vld       U5,    S2,   0x10
+    vld       U6,    S2,   0x20
+    vld       U7,    S2,   0x30
+
+    vst       U0,    TD,   0x00  // output alternates S1, S2: elements 0 and 1
+    vst       U4,    TD,   0x10
+    vst       U1,    TD,   0x20
+    vst       U5,    TD,   0x30
+
+    vst       U2,    TD,   0x40  // elements 2 and 3, again S1 then S2
+    vst       U6,    TD,   0x50
+    vst       U3,    TD,   0x60
+    vst       U7,    TD,   0x70
+
+    addi.d     S1,    S1,   0x40   // a_offset
+    addi.d     S2,    S2,   0x40
+    addi.d     TD,    TD,   0x80   // b_offset
+
+    addi.d     I,     I,   -1
+    blt        ZERO,  I,   .L_N11
+
+.L_N10:  /* if(m&2): two leftover elements per vector */
+    andi       I,     M,    0x02
+    beq        I,     ZERO, .L_N130
+
+    vld       U0,    S1,   0x00
+    vld       U1,    S1,   0x10
+
+    vld       U2,    S2,   0x00
+    vld       U3,    S2,   0x10
+
+    vst       U0,    TD,   0x00  // element 0 of S1, S2, then element 1 of S1, S2
+    vst       U2,    TD,   0x10
+    vst       U1,    TD,   0x20
+    vst       U3,    TD,   0x30
+
+    addi.d     S1,    S1,   0x20   // a_offset
+    addi.d     S2,    S2,   0x20
+    addi.d     TD,    TD,   0x40   // b_offset
+
+.L_N130:  /* if(m&1): one leftover element per vector */
+    andi       I,     M,    0x01
+    beq        I,     ZERO, .L_N20
+
+    vld       U0,     S1,   0x00
+    vld       U1,     S2,   0x00
+
+    vst       U0,     TD,   0x00
+    vst       U1,     TD,   0x10
+
+    addi.d     TD,    TD,   0x20
+
+.L_N20:   /* if(n&1): last single vector, copied straight through */
+    andi       I,     N,    0x01
+    beq        I,     ZERO, .L_N00
+
+    move       S1,   TS
+    srai.d     I,    M,    0x02   // I = M / 4 unrolled passes
+
+    beq        I,    ZERO, .L_N30
+
+.L_N21:  /* four elements per pass, I counts down */
+    vld       U0,   S1,   0x00
+    vld       U1,   S1,   0x10
+    vld       U2,   S1,   0x20
+    vld       U3,   S1,   0x30
+
+    vst       U0,   TD,   0x00
+    vst       U1,   TD,   0x10
+    vst       U2,   TD,   0x20
+    vst       U3,   TD,   0x30
+
+    addi.d     S1,   S1,   0x40   // aoffset1
+    addi.d     TD,   TD,   0x40   // b_offset
+
+    addi.d     I,     I,   -1
+    blt        ZERO,  I,   .L_N21
+
+.L_N30:  /* if(m&2): two leftover elements */
+    andi       I,     M,    0x02
+    beq        I,     ZERO, .L_N330
+
+    vld       U0,   S1,   0x00
+    vld       U1,   S1,   0x10
+
+    vst       U0,   TD,   0x00
+    vst       U1,   TD,   0x10
+
+    addi.d     S1,   S1,   0x20   // aoffset1
+    addi.d     TD,   TD,   0x20   // b_offset
+
+.L_N330:  /* if(m&1): final leftover element */
+    andi       I,     M,    0x01
+    beq        I,     ZERO, .L_N00
+
+    vld       U0,   S1,   0x00
+
+    vst       U0,   TD,   0x00
+
+.L_N00:   /* common exit: undo the prologue and return */
+    LDARG     $r23,  $sp, 0       // reload the $r23 value spilled in the prologue
+    addi.d    $sp,   $sp, 8
+    jirl      $r0,   $r1, 0x00    // jump to the address in $r1, discarding the link value
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/zgemm_ncopy_8_lasx.S b/kernel/loongarch64/zgemm_ncopy_8_lasx.S
new file mode 100644
index 000000000..7cd8f605b
--- /dev/null
+++ b/kernel/loongarch64/zgemm_ncopy_8_lasx.S
@@ -0,0 +1,263 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4    // param 1: m, count of 16-byte elements taken from each source vector
+#define N      $r5    // param 2: n, count of LDA-strided source vectors
+#define SRC    $r6    // param 3: src, start of the source data
+#define LDA    $r7    // param 4: lda; overwritten in place by the byte stride (see TL)
+#define DST    $r8    // param 5: dst, start of the packed output
+
+#define I      $r9    // inner loop counter, also scratch for the andi bit tests
+#define J      $r10   // outer loop counter (N / 8 panels)
+#define S1     $r12   // S1..S8: read pointers for the vectors of the current panel
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define S8     $r19
+#define TD     $r20   // running output pointer
+#define TS     $r11   // start of the next source panel
+#define TL     $r7    // same register as LDA: holds LDA * 16 once the prologue has run
+#define T0     $r23   // 8 * TL, the byte size of one eight-vector panel step; $r23 is saved/restored
+#define ZERO   $r0
+
+#define F0     $f0    // F0..F7: 8-byte staging registers for the fld.d/fst.d copies
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LASX vectors: none of the U or D aliases below is referenced by this kernel */
+#define U0     $xr0
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define D0     $xr8
+#define D1     $xr9
+#define D2     $xr10
+#define D3     $xr11
+#define D4     $xr12
+#define D5     $xr13
+#define D6     $xr14
+#define D7     $xr15
+#define D8     $xr16
+
+    PROLOGUE
+    // Packs N LDA-strided source vectors of M 16-byte elements into DST, in panels of 8, 4, 2, then 1 vector, one element per vector per pass.
+    addi.d     $sp,  $sp,  -8
+    SDARG      $r23, $sp,  0     // spill $r23 (used as T0); reloaded at .L_N0
+
+    move       TD,   DST  // TD: running output pointer (boffset)
+    move       TS,   SRC  // TS: start of the next source panel (aoffset)
+
+    slli.d     TL,   LDA,  0x03  // TL and LDA are the same register, so lda is replaced by
+    slli.d     TL,   TL,   0x01  // TL = lda * 16, the byte stride between source vectors
+
+    slli.d     T0,   TL,   0x03  // T0 = 8 * TL, computed once and reused by every panel
+    srai.d     J,    N,    0x03  // J = N / 8 full eight-vector panels
+
+    beq        J,    ZERO, .L_N1
+
+.L_J1:  /* one eight-vector panel per pass, J counts down */
+    move       S1,   TS
+    add.d      S2,   TS,   TL
+    move       I,    M           // I = M: the inner loop is not unrolled over elements
+    add.d      S3,   S2,   TL
+    add.d      S4,   S3,   TL
+    add.d      S5,   S4,   TL
+    add.d      S6,   S5,   TL
+    add.d      S7,   S6,   TL
+    add.d      S8,   S7,   TL
+    add.d      TS,   TS,   T0    // TS moves to the start of the next panel
+
+    beq        I,    ZERO, .L_J11
+
+.L_I1:  /* one 16-byte element from each of the eight vectors, I counts down */
+    fld.d      F0,   S1,   0x00  // each element is moved as two 8-byte halves
+    fld.d      F1,   S1,   0x08
+    fld.d      F2,   S2,   0x00
+    fld.d      F3,   S2,   0x08
+    fld.d      F4,   S3,   0x00
+    fld.d      F5,   S3,   0x08
+    fld.d      F6,   S4,   0x00
+    fld.d      F7,   S4,   0x08
+
+    fst.d      F0,   TD,   0x00  // output order: S1, S2, S3, S4
+    fst.d      F1,   TD,   0x08
+    fst.d      F2,   TD,   0x10
+    fst.d      F3,   TD,   0x18
+    fst.d      F4,   TD,   0x20
+    fst.d      F5,   TD,   0x28
+    fst.d      F6,   TD,   0x30
+    fst.d      F7,   TD,   0x38
+
+    fld.d      F0,   S5,   0x00  // F0..F7 are reused for the second half of the panel
+    fld.d      F1,   S5,   0x08
+    fld.d      F2,   S6,   0x00
+    fld.d      F3,   S6,   0x08
+    fld.d      F4,   S7,   0x00
+    fld.d      F5,   S7,   0x08
+    fld.d      F6,   S8,   0x00
+    fld.d      F7,   S8,   0x08
+
+    fst.d      F0,   TD,   0x40  // output order continues: S5, S6, S7, S8
+    fst.d      F1,   TD,   0x48
+    fst.d      F2,   TD,   0x50
+    fst.d      F3,   TD,   0x58
+    fst.d      F4,   TD,   0x60
+    fst.d      F5,   TD,   0x68
+    fst.d      F6,   TD,   0x70
+    fst.d      F7,   TD,   0x78
+
+    addi.d     S1,   S1,   0x10  // every source pointer moves on by one element
+    addi.d     S2,   S2,   0x10
+    addi.d     S3,   S3,   0x10
+    addi.d     S4,   S4,   0x10
+    addi.d     S5,   S5,   0x10
+    addi.d     S6,   S6,   0x10
+    addi.d     S7,   S7,   0x10
+    addi.d     S8,   S8,   0x10
+    addi.d     TD,   TD,   0x80  // eight elements were written
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_I1
+
+.L_J11: /* j--*/
+    addi.d     J,    J,    -1
+    blt        ZERO, J,    .L_J1
+
+.L_N1:  /* if(n&4): one four-vector panel */
+    andi       I,     N,    0x04
+    beq        I,     ZERO, .L_N2
+
+    move       S1,   TS
+    add.d      S2,   TS,   TL
+    move       I,    M           // I = M, one element per pass
+    add.d      S3,   S2,   TL
+    add.d      S4,   S3,   TL
+    add.d      TS,   S4,   TL    // TS = S4 + TL, past this panel
+
+    beq        I,     ZERO, .L_N2   // M == 0: nothing to copy for this panel
+
+.L_N11:  /* one 16-byte element from each of the four vectors, I counts down */
+    fld.d      F0,   S1,   0x00
+    fld.d      F1,   S1,   0x08
+    fld.d      F2,   S2,   0x00
+    fld.d      F3,   S2,   0x08
+    fld.d      F4,   S3,   0x00
+    fld.d      F5,   S3,   0x08
+    fld.d      F6,   S4,   0x00
+    fld.d      F7,   S4,   0x08
+
+    fst.d      F0,   TD,   0x00  // output order: S1, S2, S3, S4
+    fst.d      F1,   TD,   0x08
+    fst.d      F2,   TD,   0x10
+    fst.d      F3,   TD,   0x18
+    fst.d      F4,   TD,   0x20
+    fst.d      F5,   TD,   0x28
+    fst.d      F6,   TD,   0x30
+    fst.d      F7,   TD,   0x38
+
+    addi.d     S1,   S1,   0x10
+    addi.d     S2,   S2,   0x10
+    addi.d     S3,   S3,   0x10
+    addi.d     S4,   S4,   0x10
+    addi.d     TD,   TD,   0x40  // four elements were written
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_N11
+
+.L_N2:  /* if(n&2): one two-vector panel */
+    andi       I,     N,    0x02
+    beq        I,     ZERO, .L_N3
+
+    move       S1,   TS
+    add.d      S2,   TS,   TL
+    move       I,    M           // I = M, one element per pass
+    add.d      TS,   S2,   TL    // TS = S2 + TL, past this panel
+
+    beq        I,    ZERO, .L_N3    // M == 0: nothing to copy for this panel
+
+.L_N21:  /* one 16-byte element from each of the two vectors, I counts down */
+    fld.d      F0,   S1,   0x00
+    fld.d      F1,   S1,   0x08
+    fld.d      F2,   S2,   0x00
+    fld.d      F3,   S2,   0x08
+
+    fst.d      F0,   TD,   0x00  // output order: S1 then S2
+    fst.d      F1,   TD,   0x08
+    fst.d      F2,   TD,   0x10
+    fst.d      F3,   TD,   0x18
+
+    addi.d     S1,   S1,   0x10
+    addi.d     S2,   S2,   0x10
+    addi.d     TD,   TD,   0x20  // two elements were written
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_N21
+
+.L_N3:  /* if(n&1): last single vector */
+    andi       I,    N,    0x01
+    beq        I,    ZERO, .L_N0
+
+    move       S1,   TS
+    move       I,    M           // I = M, one element per pass
+
+    beq        I,    ZERO, .L_N0    // M == 0: nothing to copy
+
+.L_N31:  /* one 16-byte element per pass, I counts down */
+    fld.d      F0,   S1,   0x00
+    fld.d      F1,   S1,   0x08
+
+    fst.d      F0,   TD,   0x00
+    fst.d      F1,   TD,   0x08
+
+    addi.d     S1,   S1,   0x10
+    addi.d     TD,   TD,   0x10
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_N31
+
+.L_N0:  /* common exit: undo the prologue and return */
+    LDARG     $r23,  $sp, 0      // reload the $r23 value spilled in the prologue
+    addi.d    $sp,   $sp, 8
+    jirl      $r0,   $r1, 0x00   // jump to the address in $r1, discarding the link value
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/zgemm_tcopy_2_lasx.S b/kernel/loongarch64/zgemm_tcopy_2_lasx.S
new file mode 100644
index 000000000..3fe17beef
--- /dev/null
+++ b/kernel/loongarch64/zgemm_tcopy_2_lasx.S
@@ -0,0 +1,212 @@
+/*******************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4    // param 1: m, count of LDA-strided source vectors
+#define N      $r5    // param 2: n, count of 16-byte elements taken from each source vector
+#define SRC    $r6    // param 3: src, start of the source data
+#define LDA    $r7    // param 4: lda; overwritten in place by the byte stride (see TL)
+#define DST    $r8    // param 5: dst, start of the packed output
+
+#define I      $r9    // inner loop counter, also scratch for the andi bit tests
+#define J      $r10   // outer loop counter (M / 2 vector pairs)
+#define S1     $r12   // S1, S2: read pointers for the current pair of source vectors
+#define S2     $r13
+#define S3     $r14   // S3 and S4 are not referenced by this kernel
+#define S4     $r15
+#define TD     $r16   // output position for the current vector pair
+#define TS     $r17   // start of the next source vector pair
+#define TL     $r7    // same register as LDA: holds LDA * 16 once the prologue has run
+#define T0     $r18   // scratch for strides and the tail-offset computation
+#define S8     $r19   // write pointer that steps through the two-element-wide output blocks
+#define S9     $r20   // write pointer for the odd trailing element of each vector
+#define S10    $r23   // not referenced by this kernel ($r23 is still saved and restored)
+#define ZERO   $r0
+
+#define F0     $f0    // F0..F7 are not referenced by this kernel
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LASX vectors */
+#define U0     $xr0   // U0..U3: 32-byte staging registers for the xvld/xvst copies
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4   // U4..U7 and D0..D7 are not referenced by this kernel
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define D0     $xr8
+#define D1     $xr9
+#define D2     $xr10
+#define D3     $xr11
+#define D4     $xr12
+#define D5     $xr13
+#define D6     $xr14
+#define D7     $xr15
+
+
+    PROLOGUE
+    // Packs M LDA-strided source vectors of N 16-byte elements into DST as two-element-wide blocks; an odd trailing element goes to a tail area (S9).
+    addi.d     $sp,  $sp,  -8
+    SDARG      $r23, $sp,  0      // spill $r23; reloaded at .L_M0
+
+    move       TS,     SRC          // TS: start of the next source vector pair (aoffset)
+    move       TD,     DST          // TD: output position for the next vector pair (boffset)
+    slli.d     TL,     LDA,   0x03  // TL and LDA are the same register, so lda is replaced by
+    slli.d     TL,     TL,    0x01  // TL = lda * 16, the byte stride between source vectors
+
+    ori        T0,     ZERO,  0x01
+    andn       T0,     N,     T0    // T0 = N & ~1
+    mul.d      T0,     M,     T0
+    slli.d     T0,     T0,    0x01
+    slli.d     T0,     T0,    0x03  // T0 = M * (N & ~1) * 16 bytes
+    add.d      S9,     DST,   T0  // boffset2: where the odd trailing elements are written
+
+    srai.d     J,      M,     0x01  // J = M / 2 vector pairs
+
+    beq        J,      ZERO,  .L_M1
+
+.L_J1:  /* one pair of source vectors per pass, J counts down */
+    move       S1,     TS     //aoffset1
+    slli.d     T0,     TL,    0x01
+    add.d      S2,     S1,    TL   //aoffset2
+    add.d      TS,     TS,    T0   // TS += 2 * TL, the next vector pair
+
+    move       S8,     TD     //boffset1
+    addi.d     TD,     TD,    0x40 // the next pair starts 4 elements (64 bytes) further on
+
+    srai.d     I,      N,     0x02 // I = N / 4 unrolled passes
+    beq        ZERO,   I,     .L_JN1
+
+.L_JI1:  /* four elements from each of the two vectors, I counts down */
+    xvld       U0,   S1,   0x00   // each xvld/xvst moves 32 bytes = two elements
+    xvld       U1,   S1,   0x20
+    xvld       U2,   S2,   0x00
+    xvld       U3,   S2,   0x20
+
+    xvst       U0,   S8,   0x00   // elements 0-1 of S1, then of S2
+    xvst       U2,   S8,   0x20
+
+    slli.d     T0,   M,    0x05   // T0 = M * 32 bytes, the distance between output blocks
+    add.d      S8,   S8,   T0
+
+    xvst       U1,   S8,   0x00   // elements 2-3 of S1, then of S2, in the next block
+    xvst       U3,   S8,   0x20
+
+    add.d      S8,   S8,   T0
+    addi.d     S1,   S1,   0x40
+    addi.d     S2,   S2,   0x40
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_JI1
+
+.L_JN1:  /* if(n&2): two leftover elements per vector */
+    andi       I,    N,    0x02
+    beq        ZERO, I,    .L_JN2
+
+    xvld       U0,   S1,   0x00
+    xvld       U1,   S2,   0x00
+
+    xvst       U0,   S8,   0x00   // S8 is not advanced afterwards; it is reset from TD for the next pair
+    xvst       U1,   S8,   0x20
+
+    addi.d     S1,   S1,   0x20
+    addi.d     S2,   S2,   0x20
+
+.L_JN2:  /* if(n&1): the odd trailing element of each vector goes to the tail area */
+    andi       I,    N,    0x01
+    beq        ZERO, I,    .L_J0
+
+    vld        $vr0, S1,   0x00   // 16-byte loads: one element each
+    vld        $vr1, S2,   0x00
+
+    vst        $vr0, S9,   0x00
+    vst        $vr1, S9,   0x10
+
+    addi.d     S9,   S9,   0x20
+
+.L_J0:
+    addi.d     J,    J,    -1
+    blt        ZERO, J,   .L_J1
+
+.L_M1:  /* if(m&1): last single source vector, read through TS and written through TD */
+    andi       I,      M,    0x01
+    beq        ZERO,   I,    .L_M0
+
+    srai.d     I,      N,     0x02   // I = N / 4 unrolled passes
+    beq        ZERO,   I,     .L_M1N1
+
+.L_M1I1:  /* four elements per pass, I counts down */
+    xvld       U0,     TS,    0x00   // each xvld/xvst moves 32 bytes = two elements
+    xvld       U1,     TS,    0x20
+
+    xvst       U0,     TD,    0x00
+
+    slli.d     T0,     M,     0x05   // T0 = M * 32 bytes, the distance between output blocks
+    add.d      TD,     TD,    T0
+
+    xvst       U1,     TD,    0x00
+
+    add.d      TD,     TD,    T0
+    addi.d     TS,     TS,    0x40
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_M1I1
+
+.L_M1N1:  /* if(n&2): two leftover elements */
+    andi       I,      N,    0x02
+    beq        ZERO,   I,    .L_M1N2
+
+    xvld       U0,     TS,    0x00
+
+    xvst       U0,     TD,    0x00
+
+    addi.d     TS,     TS,    0x20
+
+.L_M1N2:  /* if(n&1): the odd trailing element goes to the tail area at S9 */
+    andi       I,      N,    0x01
+    beq        ZERO,   I,    .L_M0
+
+    vld        $vr0, TS,   0x00
+
+    vst        $vr0, S9,   0x00
+
+.L_M0:  /* common exit: undo the prologue and return */
+    LDARG     $r23,  $sp, 0        // reload the $r23 value spilled in the prologue
+    addi.d    $sp,   $sp, 8
+    jirl      $r0,   $r1, 0x00     // jump to the address in $r1, discarding the link value
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/zgemm_tcopy_4_lasx.S b/kernel/loongarch64/zgemm_tcopy_4_lasx.S
new file mode 100644
index 000000000..1adee11c5
--- /dev/null
+++ b/kernel/loongarch64/zgemm_tcopy_4_lasx.S
@@ -0,0 +1,302 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4    // param 1: m, count of LDA-strided source vectors
+#define N      $r5    // param 2: n, count of 16-byte elements taken from each source vector
+#define SRC    $r6    // param 3: src, start of the source data
+#define LDA    $r7    // param 4: lda, stride between source vectors (scaled by 16 bytes in the code)
+#define DST    $r8    // param 5: dst, start of the packed output
+
+#define I      $r9    // inner loop counter, also scratch for the andi bit tests
+#define J      $r10   // outer loop counter (M / 4 vector groups)
+#define S1     $r12   // S1..S4: read pointers for the current group of source vectors
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define TD     $r16   // output position for the current vector group
+#define TS     $r17   // start of the next source vector group
+#define TL     $r18   // byte stride between source vectors (LDA * 16)
+#define T0     $r19   // scratch for strides and the tail-offset computations
+#define S8     $r20   // write pointer that steps through the four-element-wide output blocks
+#define S9     $r23   // write pointer for the n&2 leftovers; $r23 is saved on entry and restored before return
+#define S10    $r11   // write pointer for the n&1 leftovers
+#define ZERO   $r0
+
+#define F0     $f0    // F0..F7 are not referenced by this kernel
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LASX vectors */
+#define U0     $xr0   // U0..U7: 32-byte staging registers for the xvld/xvst copies
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define D0     $xr8   // D0..D7 are not referenced by this kernel
+#define D1     $xr9
+#define D2     $xr10
+#define D3     $xr11
+#define D4     $xr12
+#define D5     $xr13
+#define D6     $xr14
+#define D7     $xr15
+
+
+    PROLOGUE
+    // Packs M LDA-strided source vectors of N 16-byte elements into DST as four-element-wide blocks; n&2 and n&1 leftovers go to tail areas (S9, S10).
+    addi.d     $sp,  $sp,  -8
+    SDARG      $r23, $sp,  0      // spill $r23 (used as S9); reloaded at .L_M0
+
+    move       TS,     SRC          // TS: start of the next source vector group (aoffset)
+    move       TD,     DST          // TD: output position for the next vector group (boffset)
+    slli.d     TL,     LDA,   0x03  //lda
+    slli.d     TL,     TL,    0x01  // TL = lda * 16, the byte stride between source vectors
+
+    ori        T0,     ZERO,  0x03
+    andn       T0,     N,     T0    // T0 = N & ~3
+    mul.d      T0,     M,     T0
+    slli.d     T0,     T0,    0x01
+    slli.d     T0,     T0,    0x03  // T0 = M * (N & ~3) * 16 bytes
+    add.d      S9,     DST,   T0  // boffset2: where the n&2 leftovers are written
+
+    ori        T0,     ZERO,  0x01
+    andn       T0,     N,     T0    // T0 = N & ~1
+    mul.d      T0,     M,     T0
+    slli.d     T0,     T0,    0x01
+    slli.d     T0,     T0,    0x03  // T0 = M * (N & ~1) * 16 bytes
+    add.d      S10,    DST,   T0  // boffset3: where the n&1 leftovers are written
+
+    srai.d     J,      M,     0x02  // J = M / 4 vector groups
+
+    beq        J,      ZERO,  .L_M1
+
+.L_J1:  /* one group of four source vectors per pass, J counts down */
+    move       S1,     TS     //aoffset1
+    add.d      S2,     S1,    TL
+    add.d      S3,     S2,    TL
+    add.d      S4,     S3,    TL
+
+    slli.d     T0,     TL,    0x02
+    add.d      TS,     TS,    T0   // TS += 4 * TL, the next vector group
+
+    move       S8,     TD     //boffset1
+    addi.d     TD,     TD,    0x100 // the next group starts 16 elements (256 bytes) further on
+
+    srai.d     I,      N,     0x02 // I = N / 4 unrolled passes
+
+    beq        ZERO,   I,     .L_JN1
+
+.L_JI1:  /* four elements from each of the four vectors, I counts down */
+    xvld       U0,   S1,   0x00   // each xvld/xvst moves 32 bytes = two elements
+    xvld       U1,   S1,   0x20
+    xvld       U2,   S2,   0x00
+    xvld       U3,   S2,   0x20
+    xvld       U4,   S3,   0x00
+    xvld       U5,   S3,   0x20
+    xvld       U6,   S4,   0x00
+    xvld       U7,   S4,   0x20
+
+    xvst       U0,   S8,   0x00   // block layout: 4 elements of S1, then S2, S3, S4
+    xvst       U1,   S8,   0x20
+    xvst       U2,   S8,   0x40
+    xvst       U3,   S8,   0x60
+    xvst       U4,   S8,   0x80
+    xvst       U5,   S8,   0xa0
+    xvst       U6,   S8,   0xc0
+    xvst       U7,   S8,   0xe0
+
+    addi.d     S1,   S1,   0x40
+    addi.d     S2,   S2,   0x40
+    addi.d     S3,   S3,   0x40
+    addi.d     S4,   S4,   0x40
+    slli.d     T0,   M,    0x06   // T0 = M * 64 bytes, the distance between output blocks
+    add.d      S8,   S8,   T0
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_JI1
+
+.L_JN1:  /* if(n&2): two leftover elements per vector go to the S9 tail area */
+    andi       I,    N,    0x02
+    beq        ZERO, I,    .L_JN2
+
+    xvld       U0,   S1,   0x00
+    xvld       U1,   S2,   0x00
+    xvld       U2,   S3,   0x00
+    xvld       U3,   S4,   0x00
+
+    xvst       U0,   S9,   0x00
+    xvst       U1,   S9,   0x20
+    xvst       U2,   S9,   0x40
+    xvst       U3,   S9,   0x60
+
+    addi.d     S1,   S1,   0x20
+    addi.d     S2,   S2,   0x20
+    addi.d     S3,   S3,   0x20
+    addi.d     S4,   S4,   0x20
+    addi.d     S9,   S9,   0x80
+
+.L_JN2:  /* if(n&1): the odd trailing element of each vector goes to the S10 tail area */
+    andi       I,    N,    0x01
+    beq        ZERO, I,    .L_J0
+
+    vld        $vr0, S1,   0x00   // 16-byte loads: one element each
+    vld        $vr1, S2,   0x00
+    vld        $vr2, S3,   0x00
+    vld        $vr3, S4,   0x00
+
+    vst        $vr0, S10,   0x00
+    vst        $vr1, S10,   0x10
+    vst        $vr2, S10,   0x20
+    vst        $vr3, S10,   0x30
+
+    addi.d     S10,  S10,  0x40
+
+.L_J0:
+    addi.d     J,      J,     -1
+    blt        ZERO, J,   .L_J1
+
+.L_M1:  /* if(m&2): one group of two source vectors */
+    andi       I,      M,    0x02
+    beq        ZERO,   I,    .L_M2
+
+    move       S1,     TS     //aoffset1
+    add.d      S2,     S1,    TL
+
+    slli.d     T0,     TL,    0x01
+    add.d      TS,     TS,    T0   // TS += 2 * TL, past this group
+
+    move       S8,     TD     //boffset1
+    addi.d     TD,     TD,    0x80 // this group fills 8 elements (128 bytes) of each block
+
+    srai.d     I,      N,     0x02 // I = N / 4 unrolled passes
+    beq        ZERO,   I,     .L_M1N1
+
+.L_M1I1:  /* four elements from each of the two vectors, I counts down */
+    xvld       U0,     S1,    0x00   // each xvld/xvst moves 32 bytes = two elements
+    xvld       U1,     S1,    0x20
+    xvld       U2,     S2,    0x00
+    xvld       U3,     S2,    0x20
+
+    xvst       U0,     S8,   0x00    // 4 elements of S1, then 4 elements of S2
+    xvst       U1,     S8,   0x20
+    xvst       U2,     S8,   0x40
+    xvst       U3,     S8,   0x60
+
+    addi.d     S1,     S1,    0x40
+    addi.d     S2,     S2,    0x40
+    slli.d     T0,     M,     0x06   // T0 = M * 64 bytes, the distance between output blocks
+    add.d      S8,     S8,    T0
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_M1I1
+
+.L_M1N1:  /* if(n&2): two leftover elements per vector go to the S9 tail area */
+    andi       I,      N,    0x02
+    beq        ZERO,   I,    .L_M1N2
+
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S2,    0x00
+
+    xvst       U0,     S9,    0x00
+    xvst       U1,     S9,    0x20
+
+    addi.d     S1,     S1,    0x20
+    addi.d     S2,     S2,    0x20
+    addi.d     S9,     S9,    0x40
+
+.L_M1N2:  /* if(n&1): the odd trailing element of each vector goes to the S10 tail area */
+    andi       I,      N,    0x01
+    beq        ZERO,   I,    .L_M2
+
+    vld       $vr0,     S1,    0x00
+    vld       $vr1,     S2,    0x00
+
+    vst       $vr0,     S10,    0x00
+    vst       $vr1,     S10,    0x10
+
+    addi.d     S10,    S10,   0x20
+
+.L_M2:  /* if(m&1): last single source vector */
+    andi       I,      M,    0x01
+    beq        ZERO,   I,    .L_M0
+
+    move       S1,     TS     //aoffset1
+    move       S8,     TD     //boffset1
+
+    srai.d     I,      N,     0x02   // I = N / 4 unrolled passes
+    beq        ZERO,   I,     .L_M2N1
+
+.L_M2I1:  /* four elements per pass, I counts down */
+    xvld       U0,     S1,    0x00   // each xvld/xvst moves 32 bytes = two elements
+    xvld       U1,     S1,    0x20
+
+    xvst       U0,     S8,    0x00
+    xvst       U1,     S8,    0x20
+
+    addi.d     S1,     S1,    0x40
+    slli.d     T0,     M,     0x06   // T0 = M * 64 bytes, the distance between output blocks
+    add.d      S8,     S8,    T0
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_M2I1
+
+.L_M2N1:  /* if(n&2): two leftover elements go to the S9 tail area */
+    andi       I,      N,    0x02
+    beq        ZERO,   I,    .L_M2N2
+
+    xvld       U0,     S1,    0x00
+
+    xvst       U0,     S9,    0x00
+
+    addi.d     S1,     S1,    0x20
+
+.L_M2N2:  /* if(n&1): the odd trailing element goes to the S10 tail area */
+    andi       I,      N,    0x01
+    beq        ZERO,   I,    .L_M0
+
+    vld       $vr0,     S1,    0x00
+
+    vst       $vr0,     S10,    0x00
+
+.L_M0:  /* common exit: undo the prologue and return */
+    LDARG     $r23,  $sp, 0        // reload the $r23 value spilled in the prologue
+    addi.d    $sp,   $sp, 8
+    jirl      $r0,   $r1, 0x00     // jump to the address in $r1, discarding the link value
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/zgemm_tcopy_4_lsx.S b/kernel/loongarch64/zgemm_tcopy_4_lsx.S
new file mode 100644
index 000000000..954753eaf
--- /dev/null
+++ b/kernel/loongarch64/zgemm_tcopy_4_lsx.S
@@ -0,0 +1,355 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9
+#define J      $r10
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define TD     $r16
+#define TS     $r17
+#define TL     $r18
+#define T0     $r19
+#define S8     $r20
+#define S9     $r23
+#define S10    $r11
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LSX vectors */
+#define U0     $vr0
+#define U1     $vr1
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define U8     $vr8
+#define U9     $vr9
+#define U10    $vr10
+#define U11    $vr11
+#define U12    $vr12
+#define U13    $vr13
+#define U14    $vr14
+#define U15    $vr15
+
+
+    PROLOGUE                        // entry: M=$r4, N=$r5, SRC=$r6, LDA=$r7, DST=$r8 (see the #defines above)
+
+    addi.d     $sp,  $sp,  -8       // reserve one 8-byte stack slot
+    SDARG      $r23, $sp,  0        // spill $r23, which this file aliases as S9; reloaded at .L_M0
+
+    move       TS,     SRC          // TS = aoffset: source cursor, advanced once per group of source lines
+    move       TD,     DST          // TD = boffset: destination cursor for the 4-element-wide blocks
+    slli.d     TL,     LDA,   0x03  // TL = lda * 8
+    slli.d     TL,     TL,    0x01  // TL = lda * 16: byte distance between consecutive source lines
+
+    ori        T0,     ZERO,  0x03  // T0 = 3
+    andn       T0,     N,     T0    // T0 = n & ~3
+    mul.d      T0,     M,     T0    // T0 = m * (n & ~3)
+    slli.d     T0,     T0,    0x01  // * 2
+    slli.d     T0,     T0,    0x03  // * 8, so the product is scaled by 16 in total
+    add.d      S9,     DST,   T0  // boffset2 = dst + 16 * m * (n & ~3): written by the (n & 2) tails
+
+    ori        T0,     ZERO,  0x01  // T0 = 1
+    andn       T0,     N,     T0    // T0 = n & ~1
+    mul.d      T0,     M,     T0    // T0 = m * (n & ~1)
+    slli.d     T0,     T0,    0x01  // * 2
+    slli.d     T0,     T0,    0x03  // * 8, so the product is scaled by 16 in total
+    add.d      S10,    DST,   T0  // boffset3 = dst + 16 * m * (n & ~1): written by the (n & 1) tails
+
+    srai.d     J,      M,     0x02  // j = m >> 2: number of full groups of 4 source lines
+
+    beq        J,      ZERO,  .L_M1 // no full group: continue with the (m & 2) remainder
+
+.L_J1:  /* outer loop: one pass per group of 4 source lines, j passes in total */
+    move       S1,     TS     // aoffset1: first line of this group
+    add.d      S2,     S1,    TL     // aoffset2 = aoffset1 + line stride
+    add.d      S3,     S2,    TL     // aoffset3
+    add.d      S4,     S3,    TL     // aoffset4
+
+    slli.d     T0,     TL,    0x02   // T0 = 4 * line stride
+    add.d      TS,     TS,    T0     // move the source cursor past these 4 lines
+
+    move       S8,     TD     // boffset1: where this group writes its 4-element-wide blocks
+    addi.d     TD,     TD,    0x100  // the next group starts 0x100 bytes later (4 lines * 0x40 bytes each)
+
+    srai.d     I,      N,     0x02   // i = n >> 2: full 4-element chunks per line
+
+    beq        ZERO,   I,     .L_JN1 // none: go straight to the (n & 2) tail
+
+.L_JI1:  /* inner loop: copy a 4-line x 4-element tile (0x100 bytes) per iteration */
+    vld       U0,   S1,   0x00   // line 1: four 16-byte loads -> U0..U3
+    vld       U1,   S1,   0x10
+    vld       U2,   S1,   0x20
+    vld       U3,   S1,   0x30
+
+    vld       U4,   S2,   0x00   // line 2 -> U4..U7
+    vld       U5,   S2,   0x10
+    vld       U6,   S2,   0x20
+    vld       U7,   S2,   0x30
+
+    vld       U8,   S3,   0x00   // line 3 -> U8..U11
+    vld       U9,   S3,   0x10
+    vld       U10,  S3,   0x20
+    vld       U11,  S3,   0x30
+
+    vld       U12,  S4,   0x00   // line 4 -> U12..U15
+    vld       U13,  S4,   0x10
+    vld       U14,  S4,   0x20
+    vld       U15,  S4,   0x30
+
+    vst       U0,   S8,   0x00   // store the tile contiguously at S8, line after line
+    vst       U1,   S8,   0x10
+    vst       U2,   S8,   0x20
+    vst       U3,   S8,   0x30
+    vst       U4,   S8,   0x40
+    vst       U5,   S8,   0x50
+    vst       U6,   S8,   0x60
+    vst       U7,   S8,   0x70
+
+    vst       U8,   S8,   0x80
+    vst       U9,   S8,   0x90
+    vst       U10,  S8,   0xa0
+    vst       U11,  S8,   0xb0
+    vst       U12,  S8,   0xc0
+    vst       U13,  S8,   0xd0
+    vst       U14,  S8,   0xe0
+    vst       U15,  S8,   0xf0
+
+    addi.d     S1,   S1,   0x40  // each source line advances by the 0x40 bytes just read
+    addi.d     S2,   S2,   0x40
+    addi.d     S3,   S3,   0x40
+    addi.d     S4,   S4,   0x40
+    slli.d     T0,   M,    0x06  // T0 = m * 64
+    add.d      S8,   S8,   T0    // the next chunk of this group lands m * 64 bytes further on
+
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_JI1  // repeat while i > 0
+
+.L_JN1:  /* if (n & 2): copy 2 more elements from each of the 4 lines into the boffset2 area */
+    andi       I,    N,    0x02
+    beq        ZERO, I,    .L_JN2  // bit 1 of n clear: skip
+
+    vld       U0,   S1,   0x00   // line 1, 0x20 bytes
+    vld       U1,   S1,   0x10
+
+    vld       U2,   S2,   0x00   // line 2
+    vld       U3,   S2,   0x10
+
+    vld       U4,   S3,   0x00   // line 3
+    vld       U5,   S3,   0x10
+
+    vld       U6,   S4,   0x00   // line 4
+    vld       U7,   S4,   0x10
+
+    vst       U0,   S9,   0x00   // store all 0x80 bytes contiguously at S9 (boffset2)
+    vst       U1,   S9,   0x10
+    vst       U2,   S9,   0x20
+    vst       U3,   S9,   0x30
+
+    vst       U4,   S9,   0x40
+    vst       U5,   S9,   0x50
+    vst       U6,   S9,   0x60
+    vst       U7,   S9,   0x70
+
+    addi.d     S1,   S1,   0x20  // consume the 0x20 bytes read from each line
+    addi.d     S2,   S2,   0x20
+    addi.d     S3,   S3,   0x20
+    addi.d     S4,   S4,   0x20
+    addi.d     S9,   S9,   0x80  // boffset2 advances by the 0x80 bytes written
+
+.L_JN2:  /* if (n & 1): copy the last element of each of the 4 lines into the boffset3 area */
+    andi       I,    N,    0x01
+    beq        ZERO, I,    .L_J0   // n even: nothing left for this group
+
+    vld        U0,   S1,   0x00   // one 16-byte element per line
+    vld        U1,   S2,   0x00
+    vld        U2,   S3,   0x00
+    vld        U3,   S4,   0x00
+
+    vst        U0,   S10,   0x00  // stored back to back at S10 (boffset3)
+    vst        U1,   S10,   0x10
+    vst        U2,   S10,   0x20
+    vst        U3,   S10,   0x30
+
+    addi.d     S10,  S10,  0x40   // boffset3 advances by the 0x40 bytes written
+
+.L_J0:  /* end of one 4-line group */
+    addi.d     J,      J,     -1
+    blt        ZERO, J,   .L_J1   // repeat while j > 0
+
+.L_M1:  /* if (m & 2): handle a remaining pair of source lines */
+    andi       I,      M,    0x02
+    beq        ZERO,   I,    .L_M2   // bit 1 of m clear: go to the (m & 1) case
+
+    move       S1,     TS     // aoffset1: first line of the pair
+    add.d      S2,     S1,    TL     // aoffset2 = aoffset1 + line stride
+
+    slli.d     T0,     TL,    0x01   // T0 = 2 * line stride
+    add.d      TS,     TS,    T0     // move the source cursor past these 2 lines
+
+    move       S8,     TD     // boffset1 for this pair
+    addi.d     TD,     TD,    0x80   // the pair occupies 0x80 bytes per chunk (2 lines * 0x40)
+
+    srai.d     I,      N,     0x02   // i = n >> 2: full 4-element chunks per line
+    beq        ZERO,   I,     .L_M1N1
+
+.L_M1I1:  /* loop: copy a 2-line x 4-element tile (0x80 bytes) per iteration */
+    vld       U0,     S1,    0x00   // line 1 -> U0..U3
+    vld       U1,     S1,    0x10
+    vld       U2,     S1,    0x20
+    vld       U3,     S1,    0x30
+
+    vld       U4,     S2,    0x00   // line 2 -> U4..U7
+    vld       U5,     S2,    0x10
+    vld       U6,     S2,    0x20
+    vld       U7,     S2,    0x30
+
+    vst       U0,     S8,    0x00   // store the tile contiguously at S8
+    vst       U1,     S8,    0x10
+    vst       U2,     S8,    0x20
+    vst       U3,     S8,    0x30
+
+    vst       U4,     S8,    0x40
+    vst       U5,     S8,    0x50
+    vst       U6,     S8,    0x60
+    vst       U7,     S8,    0x70
+
+    addi.d     S1,     S1,    0x40   // consume the 0x40 bytes read from each line
+    addi.d     S2,     S2,    0x40
+    slli.d     T0,     M,     0x06   // T0 = m * 64
+    add.d      S8,     S8,    T0     // same m * 64 byte step between chunks as in the 4-line loop
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_M1I1  // repeat while i > 0
+
+.L_M1N1:  /* if (n & 2): copy 2 more elements from each of the 2 lines into the boffset2 area */
+    andi       I,      N,    0x02
+    beq        ZERO,   I,    .L_M1N2
+
+    vld       U0,     S1,    0x00   // line 1, 0x20 bytes
+    vld       U1,     S1,    0x10
+
+    vld       U2,     S2,    0x00   // line 2, 0x20 bytes
+    vld       U3,     S2,    0x10
+
+    vst       U0,     S9,    0x00   // stored contiguously at S9 (boffset2)
+    vst       U1,     S9,    0x10
+    vst       U2,     S9,    0x20
+    vst       U3,     S9,    0x30
+
+    addi.d     S1,     S1,    0x20
+    addi.d     S2,     S2,    0x20
+    addi.d     S9,     S9,    0x40   // boffset2 advances by the 0x40 bytes written
+
+.L_M1N2:  /* if (n & 1): copy the last element of each of the 2 lines into the boffset3 area */
+    andi       I,      N,    0x01
+    beq        ZERO,   I,    .L_M2
+
+    vld       U0,      S1,    0x00   // one 16-byte element per line
+    vld       U1,      S2,    0x00
+
+    vst       U0,      S10,   0x00   // stored back to back at S10 (boffset3)
+    vst       U1,      S10,   0x10
+
+    addi.d     S10,    S10,   0x20   // boffset3 advances by the 0x20 bytes written
+
+.L_M2:  /* if (m & 1): handle the single remaining source line */
+    andi       I,      M,    0x01
+    beq        ZERO,   I,    .L_M0   // m even: done
+
+    move       S1,     TS     // aoffset1: the last line
+    move       S8,     TD     // boffset1 for this line
+
+    srai.d     I,      N,     0x02   // i = n >> 2: full 4-element chunks
+    beq        ZERO,   I,     .L_M2N1
+
+.L_M2I1:  /* loop: copy 4 elements (0x40 bytes) of the line per iteration */
+    vld       U0,     S1,    0x00
+    vld       U1,     S1,    0x10
+    vld       U2,     S1,    0x20
+    vld       U3,     S1,    0x30
+
+    vst       U0,     S8,    0x00
+    vst       U1,     S8,    0x10
+    vst       U2,     S8,    0x20
+    vst       U3,     S8,    0x30
+
+    addi.d     S1,     S1,    0x40   // consume the 0x40 bytes read
+    slli.d     T0,     M,     0x06   // T0 = m * 64
+    add.d      S8,     S8,    T0     // same m * 64 byte step between chunks as in the loops above
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_M2I1  // repeat while i > 0
+
+.L_M2N1:  /* if (n & 2): copy 2 more elements into the boffset2 area */
+    andi       I,      N,    0x02
+    beq        ZERO,   I,    .L_M2N2
+
+    vld       U0,     S1,    0x00
+    vld       U1,     S1,    0x10
+
+    vst       U0,     S9,    0x00   // S9 is not advanced afterwards: no later code in this kernel writes through it
+    vst       U1,     S9,    0x10
+
+    addi.d     S1,     S1,    0x20   // step past the 2 elements so the (n & 1) case reads the next one
+
+.L_M2N2:  /* if (n & 1): copy the final element into the boffset3 area */
+    andi       I,      N,    0x01
+    beq        ZERO,   I,    .L_M0
+
+    vld       U0,     S1,    0x00
+
+    vst       U0,     S10,    0x00  // last store of the kernel; S10 is not advanced
+
+.L_M0:  /* common exit */
+    LDARG     $r23,  $sp, 0     // reload the $r23 value spilled at entry
+    addi.d    $sp,   $sp, 8     // release the 8-byte stack slot
+    jirl      $r0,   $r1, 0x00  // jump to the address in $r1 without linking (return to caller)
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/zgemm_tcopy_8_lasx.S b/kernel/loongarch64/zgemm_tcopy_8_lasx.S
new file mode 100644
index 000000000..f7440dc24
--- /dev/null
+++ b/kernel/loongarch64/zgemm_tcopy_8_lasx.S
@@ -0,0 +1,268 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+#define I      $r9
+#define J      $r10
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define S8     $r19
+#define TD     $r20
+#define TS     $r11
+#define TL     $r7
+#define T0     $r23
+#define ZERO   $r0
+
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LASX vectors */
+#define U0     $xr0
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define D0     $xr8
+#define D1     $xr9
+#define D2     $xr10
+#define D3     $xr11
+#define D4     $xr12
+#define D5     $xr13
+#define D6     $xr14
+#define D7     $xr15
+
+
+    PROLOGUE                        // entry: M=$r4, N=$r5, SRC=$r6, LDA=$r7, DST=$r8 (see the #defines above)
+
+    addi.d     $sp,  $sp,  -8       // reserve one 8-byte stack slot
+    SDARG      $r23, $sp,  0        // spill $r23, which this file aliases as T0; reloaded at .L_N0
+
+    move       TS,     SRC          // TS = aoffset: source cursor, advanced along the line per n-chunk
+    move       TD,     DST          // TD = boffset: destination cursor, only ever incremented
+    slli.d     TL,     LDA,   0x03  // TL and LDA are both $r7, so lda is scaled in place: lda * 8
+    slli.d     TL,     TL,    0x01  // TL = lda * 16: byte distance between consecutive source lines
+
+    srai.d     J,      N,     0x03  // j = n >> 3: number of 8-element-wide chunks
+
+    beq        J,      ZERO,  .L_N1 // none: continue with the (n & 4) remainder
+
+.L_J1:  /* outer loop: one pass per 8-element-wide chunk, j passes in total */
+    move       S1,     TS     // aoffset1: even-numbered source lines of this chunk
+    slli.d     T0,     TL,    0x01  // T0 = 2 * line stride: step used by both S1 and S2
+    add.d      S2,     TS,    TL    // aoffset2 = aoffset1 + line stride: odd-numbered lines
+    addi.d     TS,     TS,    0x80  // next chunk starts 0x80 bytes (8 * 16) further along the line
+
+    srai.d     I,      M,     0x01  // i = m >> 1: number of line pairs
+    beq        ZERO,   I,     .L_J1M1
+
+.L_J1I1:  /* inner loop: copy 0x80 bytes from each of two lines (0x100 bytes total) per iteration */
+    xvld       U0,     S1,    0x00   // first line of the pair: four 32-byte loads
+    xvld       U1,     S1,    0x20
+    xvld       U2,     S1,    0x40
+    xvld       U3,     S1,    0x60
+    xvld       U4,     S2,    0x00   // second line of the pair
+    xvld       U5,     S2,    0x20
+    xvld       U6,     S2,    0x40
+    xvld       U7,     S2,    0x60
+
+    xvst       U0,     TD,    0x00   // store both lines back to back at TD
+    xvst       U1,     TD,    0x20
+    xvst       U2,     TD,    0x40
+    xvst       U3,     TD,    0x60
+    xvst       U4,     TD,    0x80
+    xvst       U5,     TD,    0xa0
+    xvst       U6,     TD,    0xc0
+    xvst       U7,     TD,    0xe0
+
+    add.d      S1,     S1,    T0     // both cursors skip two lines (T0 = 2 * line stride)
+    add.d      S2,     S2,    T0
+    addi.d     TD,     TD,    0x100  // destination advances by the 0x100 bytes written
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_J1I1  // repeat while i > 0
+
+.L_J1M1:  /* if (m & 1): copy 0x80 bytes from the one remaining line of this chunk */
+    andi       I,      M,     0x01
+    beq        ZERO,   I,     .L_J0   // m even: no leftover line
+
+    xvld       U0,     S1,    0x00   // S1 points at the leftover line after the pair loop
+    xvld       U1,     S1,    0x20
+    xvld       U2,     S1,    0x40
+    xvld       U3,     S1,    0x60
+
+    xvst       U0,     TD,    0x00
+    xvst       U1,     TD,    0x20
+    xvst       U2,     TD,    0x40
+    xvst       U3,     TD,    0x60
+
+    addi.d     TD,     TD,    0x80   // destination advances by the 0x80 bytes written
+
+.L_J0:  /* end of one 8-element-wide chunk */
+    addi.d     J,      J,     -1
+    blt        ZERO,   J,     .L_J1  // repeat while j > 0
+
+.L_N1:  /* if (n & 4): one 4-element-wide chunk (0x40 bytes per line) */
+    andi       I,      N,     0x04
+    beq        ZERO,   I,     .L_N2
+
+    move       S1,     TS     // aoffset1: even-numbered source lines
+    slli.d     T0,     TL,    0x01  // T0 = 2 * line stride
+    add.d      S2,     TS,    TL    // aoffset2: odd-numbered source lines
+    addi.d     TS,     TS,    0x40  // step the source cursor past this 0x40-byte chunk
+
+    srai.d     I,      M,     0x01  // i = m >> 1: number of line pairs
+    beq        ZERO,   I,     .L_N1M1
+
+.L_N1I1:   /* loop: copy 0x40 bytes from each of two lines per iteration */
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S1,    0x20
+    xvld       U2,     S2,    0x00
+    xvld       U3,     S2,    0x20
+
+    xvst       U0,     TD,    0x00   // both lines stored back to back at TD
+    xvst       U1,     TD,    0x20
+    xvst       U2,     TD,    0x40
+    xvst       U3,     TD,    0x60
+
+    add.d      S1,     S1,    T0     // skip two lines
+    add.d      S2,     S2,    T0
+    addi.d     TD,     TD,    0x80   // destination advances by the 0x80 bytes written
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_N1I1  // repeat while i > 0
+
+.L_N1M1:  /* if (m & 1): copy 0x40 bytes from the one remaining line */
+    andi       I,      M,     0x01
+    beq        ZERO,   I,     .L_N2
+
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S1,    0x20
+
+    xvst       U0,     TD,    0x00
+    xvst       U1,     TD,    0x20
+
+    addi.d     TD,     TD,    0x40   // destination advances by the 0x40 bytes written
+
+.L_N2:  /* if (n & 2): one 2-element-wide chunk (0x20 bytes per line) */
+    andi       I,      N,     0x02
+    beq        ZERO,   I,     .L_N3
+
+    move       S1,     TS     // aoffset1: even-numbered source lines
+    slli.d     T0,     TL,    0x01  // T0 = 2 * line stride
+    add.d      S2,     TS,    TL    // aoffset2: odd-numbered source lines
+    addi.d     TS,     TS,    0x20  // step the source cursor past this 0x20-byte chunk
+
+    srai.d     I,      M,     0x01  // i = m >> 1: number of line pairs
+    beq        ZERO,   I,     .L_N2M1
+
+.L_N2I1:  /* loop: copy 0x20 bytes from each of two lines per iteration */
+    xvld       U0,     S1,    0x00
+    xvld       U1,     S2,    0x00
+
+    xvst       U0,     TD,    0x00
+    xvst       U1,     TD,    0x20
+
+    add.d      S1,     S1,    T0    // skip two lines
+    add.d      S2,     S2,    T0
+
+    addi.d     TD,     TD,    0x40  // destination advances by the 0x40 bytes written
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_N2I1  // repeat while i > 0
+
+.L_N2M1:   /* if (m & 1): copy 0x20 bytes from the one remaining line */
+    andi       I,      M,     0x01
+    beq        ZERO,   I,     .L_N3
+
+    xvld       U0,     S1,    0x00
+
+    xvst       U0,     TD,    0x00
+
+    addi.d     TD,     TD,    0x20  // destination advances by the 0x20 bytes written
+
+.L_N3:   /* if (n & 1): the final single element (0x10 bytes) of every line */
+    andi       I,      N,     0x01
+    beq        ZERO,   I,     .L_N0
+
+    move       S1,     TS     // aoffset1: even-numbered source lines
+    slli.d     T0,     TL,    0x01  // T0 = 2 * line stride
+    add.d      S2,     TS,    TL    // aoffset2: odd-numbered source lines (TS is not advanced: this is the last chunk)
+
+    srai.d     I,      M,     0x01  // i = m >> 1: number of line pairs
+    beq        ZERO,   I,     .L_N3M1
+
+.L_N3I1:  /* loop: copy one 16-byte element from each of two lines per iteration */
+    vld       $vr0,     S1,    0x00   // 128-bit loads/stores here, not the 256-bit xvld/xvst used above
+    vld       $vr1,     S2,    0x00
+
+    vst       $vr0,     TD,    0x00
+    vst       $vr1,     TD,    0x10
+
+    add.d      S1,     S1,    T0     // skip two lines
+    add.d      S2,     S2,    T0
+    addi.d     TD,     TD,    0x20   // destination advances by the 0x20 bytes written
+
+    addi.d     I,      I,     -1
+    blt        ZERO,   I,     .L_N3I1  // repeat while i > 0
+
+.L_N3M1:  /* if (m & 1): copy the one element of the remaining line */
+    andi       I,      M,     0x01
+    beq        ZERO,   I,     .L_N0
+
+    vld       $vr0,     S1,    0x00
+
+    vst       $vr0,     TD,    0x00   // last store of the kernel; TD is not advanced
+
+.L_N0:  /* common exit */
+    LDARG      $r23,   $sp,   0      // reload the $r23 value spilled at entry
+    addi.d     $sp,    $sp,   8      // release the 8-byte stack slot
+    jirl       $r0,    $r1,   0x00   // jump to the address in $r1 without linking (return to caller)
+
+    EPILOGUE
\ No newline at end of file
diff --git a/kernel/loongarch64/zgemv_n_2_lsx.S b/kernel/loongarch64/zgemv_n_2_lsx.S
new file mode 100644
index 000000000..efb376118
--- /dev/null
+++ b/kernel/loongarch64/zgemv_n_2_lsx.S
@@ -0,0 +1,296 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
+ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+ */
+#define M       $r4
+#define N       $r5
+#define ALPHA_R $f0
+#define ALPHA_I $f1
+#define A       $r7
+#define LDA     $r8
+#define X       $r9
+#define INC_X   $r10
+#define Y       $r11
+#define INC_Y   $r6
+
+#define J       $r12
+#define I       $r13
+#define K       $r14
+#define Y_ORG   $r15
+#define OFFSET  $r16
+#define K_LDA   $r17
+#define M16     $r18
+#define T0      $r19
+#define PA0     $r20
+#define PA1     $r23
+#define PA2     $r24
+#define PA3     $r25
+#define PA4     $r26
+#define PA5     $r27
+#define PA6     $r28
+#define PA7     $r29
+
+#define VALPHA  $vr1
+#define X0      $vr2
+#define X1      $vr3
+#define X2      $vr4
+#define X3      $vr5
+#define X4      $vr6
+#define X5      $vr7
+#define X6      $vr8
+#define X7      $vr9
+#define Y0      $vr10
+#define Y1      $vr11
+#define A0      $vr12
+#define A1      $vr13
+#define A2      $vr14
+#define A3      $vr15
+#define A4      $vr16
+#define A5      $vr17
+#define A6      $vr18
+#define A7      $vr19
+#define A8      $vr20
+#define A9      $vr21
+#define A10     $vr22
+#define A11     $vr23
+#define A12     $vr24
+#define A13     $vr25
+#define A14     $vr26
+#define A15     $vr27
+#define TMP0    $vr28
+#define TMP1    $vr29
+#define TMP2    $vr30
+
+#if !defined(CONJ)   // GCONJ mirrors CONJ and GXCONJ mirrors XCONJ: 1 when defined, 0 otherwise
+#if !defined(XCONJ)  // neither CONJ nor XCONJ
+#define GXCONJ 0
+#define GCONJ  0
+#else                // XCONJ only
+#define GXCONJ 1
+#define GCONJ  0
+#endif
+#else                // CONJ defined
+#if !defined(XCONJ)  // CONJ only
+#define GXCONJ 0
+#define GCONJ  1
+#else                // both CONJ and XCONJ
+#define GXCONJ 1
+#define GCONJ  1
+#endif
+#endif               // both flags are passed to GCOMPLEXMUL / GCOMPLEXMADD in the macros below
+
+.macro ZLOAD_X_2  // load X0, X1 from X+0x00 / X+0x10, then run GCOMPLEXMUL on each with VALPHA
+    GLD v, , X0, X, 0x00, X1, X, 0x10  // NOTE(review): GLD comes from loongarch64_asm.S; presumably expands to one vld per (reg, base, offset) triple
+    GCOMPLEXMUL GXCONJ, \
+    vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
+    X1, VALPHA, X1, TMP0, TMP1, TMP2
+.endm
+
+.macro ZLOAD_X_2_GAP  // same as ZLOAD_X_2, but X1 is read from X + INC_X instead of X + 0x10
+    vld         X0,     X,      0
+    PTR_ADD     T0,     X,      INC_X  // T0 = address of the second x element; T0 is clobbered
+    vld         X1,     T0,     0
+
+    GCOMPLEXMUL GXCONJ, \
+    vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
+    X1, VALPHA, X1, TMP0, TMP1, TMP2
+.endm
+
+.macro ZLOAD_X_1  // single-element variant: load X0 from X+0x00 and run GCOMPLEXMUL on it with VALPHA
+    GLD v, , X0, X, 0x00
+    GCOMPLEXMUL GXCONJ, \
+    vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2
+.endm
+
+.macro ZLOAD_Y_2  // load Y0, Y1 from Y+0 / Y+0x10
+    GLD v, , Y0, Y, 0, Y1, Y, 0x10
+.endm
+
+.macro ZLOAD_Y_2_GAP  // load two y elements INC_Y bytes apart
+    vld  $vr10, Y, 0       // $vr10 is Y0
+    vldx $vr11, Y, INC_Y   // $vr11 is Y1, read from Y + INC_Y
+.endm
+
+.macro ZLOAD_Y_1  // load one y element
+    vld $vr10, Y, 0        // $vr10 is Y0
+.endm
+
+.macro ZGEMV_N_2x2  // load A0, A1 via PA0 and A2, A3 via PA1, then accumulate into Y0, Y1 using X0 (with A0/A1) and X1 (with A2/A3)
+    GLD_INC v, , 0x10,       \
+    A0,  PA0, 0, A1,  PA0, 0, \
+    A2,  PA1, 0, A3,  PA1, 0
+    GCOMPLEXMADD GXCONJ, GCONJ, \
+    vf, d, Y0, X0, A0,  Y0, TMP0, TMP1, TMP2, Y1, X0, A1,  Y1, TMP0, TMP1, TMP2, \
+    Y0, X1, A2,  Y0, TMP0, TMP1, TMP2, Y1, X1, A3,  Y1, TMP0, TMP1, TMP2
+.endm
+
+.macro ZGEMV_N_1x2  // one y element, two A lines: accumulate into Y0 using X0 with A0 and X1 with A2
+    GLD_INC v, , 0x10, $vr12, PA0, 0, $vr14, PA1, 0  // $vr12 is A0, $vr14 is A2; NOTE(review): GLD_INC presumably bumps each base by 0x10 after its load, confirm in loongarch64_asm.S
+    GCOMPLEXMADD GXCONJ, GCONJ, \
+    vf, d, Y0, X0,  A0, Y0, TMP0, TMP1, TMP2, \
+    Y0, X1, A2,  Y0, TMP0, TMP1, TMP2
+.endm
+
+.macro ZGEMV_N_1x1  // one y element, one A line: accumulate into Y0 using X0 with A0
+    GLD_INC v, , 0x10, $vr12, PA0, 0  // $vr12 is A0
+    GCOMPLEXMADD GXCONJ, GCONJ, \
+    vf, d, Y0, X0,  A0, Y0, TMP0, TMP1, TMP2
+.endm
+
+.macro ZSTORE_Y_2  // store Y0, Y1 to Y+0 / Y+0x10 (counterpart of ZLOAD_Y_2)
+    GST v, , Y0, Y, 0, Y1, Y, 0x10
+.endm
+
+.macro ZSTORE_Y_2_GAP  // store two y elements INC_Y bytes apart (counterpart of ZLOAD_Y_2_GAP)
+    vst  Y0, Y, 0
+    vstx Y1, Y, INC_Y
+.endm
+
+.macro ZSTORE_Y_1  // store one y element
+    vst $vr10, Y, 0  // $vr10 is Y0
+.endm
+
+.macro ZGEMV_N_LSX XW:req, X_2:req, X_1:req, Y_2:req, Y_1:req  // XW: infix that makes labels unique per expansion; X_*/Y_*: suffixes choosing the ZLOAD_*/ZSTORE_* variants
+    PTR_SRLI  J,      N,      1       // J = n >> 1: number of A-line pairs (NOTE(review): PTR_*/G* helpers live in loongarch64_asm.S; comments assume they map to the named instruction)
+    beqz      J,      .L_\XW\()_N_1   // n < 2: only the single-line tail can apply
+    PTR_SLLI  K_LDA,  LDA,    1       // K_LDA = 2 * lda
+    PTR_SUB   K_LDA,  K_LDA,  M16     // ... minus M16, the value added to PA0/PA1 after each pass over y
+.L_\XW\()_N_L2:                       // outer loop: two A lines and two x elements per pass
+    ZLOAD_\X_2
+    xor     K,      K,      K         // K = 0
+    move    Y,      Y_ORG             // rewind y to its start
+    PTR_SRLI  I,      M,       1      // I = m >> 1: pairs of y elements
+    beqz      I,      .L_\XW\()_M_1
+.align 5
+.L_\XW\()_M_L2:                       // inner loop: update two y elements per iteration
+    ZLOAD_\Y_2
+    ZGEMV_N_2x2
+    ZSTORE_\Y_2
+    PTR_ADDI    I,      I,      -1
+    PTR_ALSL    Y,      INC_Y,  Y,  1 // Y += 2 * INC_Y
+    PTR_ADDI    K,      K,      4     // NOTE(review): K is incremented here but does not appear to be read in this file
+    bnez        I,      .L_\XW\()_M_L2
+.L_\XW\()_M_1:
+    andi        I,      M,      1     // I = m & 1: one leftover y element at most
+    beqz        I,      .L_\XW\()_M_END
+.align 5
+.L_\XW\()_M_L1:                       // runs once: I is 1 on entry and is decremented to 0
+    ZLOAD_\Y_1
+    ZGEMV_N_1x2
+    ZSTORE_\Y_1
+    PTR_ADDI    I,      I,      -1
+    PTR_ADD     Y,      Y,      INC_Y
+    PTR_ADDI    K,      K,      1
+    bnez        I,      .L_\XW\()_M_L1
+.L_\XW\()_M_END:
+    PTR_ADDI    J,      J,      -1
+#if __loongarch_grlen == 64           // all three branches add K_LDA to PA0 and PA1; only the d/w width suffix differs
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA
+#endif
+    PTR_ALSL    X,      INC_X,  X,  1 // X += 2 * INC_X
+    bnez        J,      .L_\XW\()_N_L2
+.L_\XW\()_N_1:
+    andi        J,      N,      1     // J = n & 1: one leftover A line at most
+    beqz        J,      .L_END
+.L_\XW\()_N_L1:                       // single-line tail: one x element against every y element
+    ZLOAD_\X_1
+    xor     K,      K,      K         // K = 0
+    move    Y,      Y_ORG             // rewind y to its start
+    move    I,      M
+    beqz    I,      .L_END            // m == 0: nothing to update
+.align 5
+.L_\XW\()_N_1_M_L1:                   // one y element per iteration, m iterations
+    ZLOAD_\Y_1
+    ZGEMV_N_1x1
+    ZSTORE_\Y_1
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   Y,      Y,      INC_Y
+    PTR_ADDI  K,      K,      1
+    bnez    I,      .L_\XW\()_N_1_M_L1
+.L_\XW\()_N_1_M_END:
+    PTR_ADDI    J,      J,      -1    // J was 1 on entry, so it is 0 here and the branch below falls through
+    PTR_SUB     K_LDA,  LDA,    M16   // K_LDA = lda - M16
+    PTR_ADD     PA0,    PA0,    K_LDA
+    PTR_ADD     X,      X,      INC_X
+    bnez        J,      .L_\XW\()_N_L1
+
+    b .L_END                          // every expansion ends here, so it never falls into the next variant
+.endm
+
+    PROLOGUE
+    PTR_LD     INC_Y,  $sp,    0        // inc_y is read from the stack at $sp+0, before push_if_used runs
+    push_if_used 17 + 7, 31             // NOTE(review): helper defined outside this file; presumably saves the callee-saved registers used below, confirm
+    PTR_ADDI   K,      $r0,     0x01    // K = 1
+    PTR_SUB    I,      INC_X,   K       // I = inc_x - 1
+    PTR_SUB    J,      INC_Y,   K       // J = inc_y - 1
+    maskeqz    I,      K,       I  /* I = (inc_x == 1) ? 0 : 1 */
+    maskeqz    J,      K,       J  /* J = (inc_y == 1) ? 0 : 1 */
+    PTR_ALSL   I,      I,       J,      1  // I = 2 * I + J: index into .L_GAP_TABLE
+    GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4  // NOTE(review): presumably LDA, INC_X, INC_Y <<= 4 and M16 = M << 4; confirm GSLLI in loongarch64_asm.S
+    // Init VALPHA: combine the low doubleword of $vr0 (ALPHA_R is $f0) and of $vr1 (ALPHA_I is $f1)
+    vpackev.d      VALPHA,   $vr1,   $vr0
+    move     Y_ORG,  Y                  // remember the start of y; the loops rewind Y to it
+    move     PA0,    A                  // PA0 = first A line
+#if __loongarch_grlen == 64             // all three branches compute PA1 = PA0 + LDA; only the d/w width suffix differs
+    GADD , d, PA1, PA0, LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA1, PA0, LDA
+#else
+    GADD , d, PA1, PA0, LDA
+#endif
+    la.local    T0,     .L_GAP_TABLE    // T0 = address of the offset table
+    PTR_ALSL    I,      I,      T0,     1  // I = T0 + 2 * index (entries are .hword, 2 bytes each)
+    ld.h        K,      I,      0 // K = table entry: offset of the chosen variant from .L_GAP_TABLE
+    PTR_ADD     T0,     T0,     K       // T0 = address of the chosen variant
+    jirl        $r0,    T0,     0       // jump there without linking
+.L_GAP_TABLE:                           // entry order matches the index 2 * (inc_x != 1) + (inc_y != 1)
+    .hword  .L_GAP_0_0 - .L_GAP_TABLE
+    .hword  .L_GAP_0_1 - .L_GAP_TABLE
+    .hword  .L_GAP_1_0 - .L_GAP_TABLE
+    .hword  .L_GAP_1_1 - .L_GAP_TABLE
+.L_GAP_0_0: /* if (inc_x == 1) && (inc_y == 1) */
+    ZGEMV_N_LSX GAP_0_0, X_2, X_1, Y_2, Y_1
+.L_GAP_0_1: /* if (inc_x == 1) && (inc_y != 1) */
+    ZGEMV_N_LSX GAP_0_1, X_2, X_1, Y_2_GAP, Y_1
+.L_GAP_1_0: /* if (inc_x != 1) && (inc_y == 1) */
+    ZGEMV_N_LSX GAP_1_0, X_2_GAP, X_1, Y_2, Y_1
+.L_GAP_1_1: /* if (inc_x != 1) && (inc_y != 1) */
+    ZGEMV_N_LSX GAP_1_1, X_2_GAP, X_1, Y_2_GAP, Y_1
+.L_END:                                 // common exit reached from every ZGEMV_N_LSX expansion
+    pop_if_used 17 + 7, 31              // counterpart of the push_if_used at entry (same arguments)
+    jirl    $r0, $r1, 0x0               // jump to the address in $r1 without linking (return to caller)
+    EPILOGUE
diff --git a/kernel/loongarch64/zgemv_n_4_lasx.S b/kernel/loongarch64/zgemv_n_4_lasx.S
new file mode 100644
index 000000000..26edf1ed7
--- /dev/null
+++ b/kernel/loongarch64/zgemv_n_4_lasx.S
@@ -0,0 +1,343 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2024/02/20 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*
+*
+*********************************************************************/
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
+ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+ */
+/* Arguments, following the prototype above. */
+#define M       $r4
+#define N       $r5
+#define ALPHA_R $f0
+#define ALPHA_I $f1
+#define A       $r7
+#define LDA     $r8
+#define X       $r9
+#define INC_X   $r10
+#define Y       $r11
+/* inc_y is loaded from the stack into this register by the prologue. */
+#define INC_Y   $r6
+
+/* Integer scratch registers. OFFSET and PA4..PA7 are not referenced in this file. */
+#define J       $r12
+#define I       $r13
+#define K       $r14
+#define Y_ORG   $r15
+#define OFFSET  $r16
+#define K_LDA   $r17
+#define M16     $r18
+#define T0      $r19
+#define PA0     $r20
+#define PA1     $r23
+#define PA2     $r24
+#define PA3     $r25
+#define PA4     $r26
+#define PA5     $r27
+#define PA6     $r28
+#define PA7     $r29
+
+/* 256-bit vector registers. The raw $vrN names used by some macros below are
+ * the low 128 bits of the same-numbered $xrN (for example $vr10 is the low
+ * half of Y0). X4..X7 and A8..A15 are not referenced in this file. */
+#define VALPHA  $xr1
+#define X0      $xr2
+#define X1      $xr3
+#define X2      $xr4
+#define X3      $xr5
+#define X4      $xr6
+#define X5      $xr7
+#define X6      $xr8
+#define X7      $xr9
+#define Y0      $xr10
+#define Y1      $xr11
+#define A0      $xr12
+#define A1      $xr13
+#define A2      $xr14
+#define A3      $xr15
+#define A4      $xr16
+#define A5      $xr17
+#define A6      $xr18
+#define A7      $xr19
+#define A8      $xr20
+#define A9      $xr21
+#define A10     $xr22
+#define A11     $xr23
+#define A12     $xr24
+#define A13     $xr25
+#define A14     $xr26
+#define A15     $xr27
+#define TMP0    $xr28
+#define TMP1    $xr29
+#define TMP2    $xr30
+
+/* GXCONJ follows XCONJ and GCONJ follows CONJ: 1 when the build flag is
+ * defined, 0 otherwise. The two flags are independent of each other. */
+#ifdef XCONJ
+#define GXCONJ 1
+#else
+#define GXCONJ 0
+#endif
+
+#ifdef CONJ
+#define GCONJ  1
+#else
+#define GCONJ  0
+#endif
+
+// Load four consecutive x elements (16 bytes apart), copy each one into both
+// 128-bit halves of its register (X0..X3) and run it through GCOMPLEXMUL with
+// VALPHA (presumably x * alpha - confirm in loongarch64_asm.S).
+// The loads are 128-bit: the xvpermi.q with immediate 0 only consumes the low
+// half of its source, and a 256-bit load at offset 0x30 would read 16 bytes
+// past the fourth element.
+.macro ZLOAD_X_4
+    // $vr2..$vr5 are the low halves of X0..X3.
+    GLD v, , $vr2, X, 0x00, $vr3, X, 0x10, $vr4, X, 0x20, $vr5, X, 0x30
+    GPERMI xv, q, X0, X0, 0, X1, X1, 0, X2, X2, 0, X3, X3, 0
+    GCOMPLEXMUL GXCONJ, \
+    xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
+    X1, VALPHA, X1, TMP0, TMP1, TMP2, \
+    X2, VALPHA, X2, TMP0, TMP1, TMP2, \
+    X3, VALPHA, X3, TMP0, TMP1, TMP2
+.endm
+
+// Strided variant of ZLOAD_X_4: the four x elements are INC_X bytes apart.
+// Each element is loaded with a 128-bit vld into the low half of its register
+// and then copied to both halves. A 256-bit load would read 16 bytes beyond
+// the element, which for the last element of x lies outside the vector.
+// Clobbers T0.
+.macro ZLOAD_X_4_GAP
+    vld         $vr2,   X,      0       // low half of X0
+    xvpermi.q   X0,     X0,     0
+
+    PTR_ADD     T0,     X,      INC_X
+    vld         $vr3,   T0,     0       // low half of X1
+    xvpermi.q   X1,     X1,     0
+
+    PTR_ADD     T0,     T0,     INC_X
+    vld         $vr4,   T0,     0       // low half of X2
+    xvpermi.q   X2,     X2,     0
+
+    PTR_ADD     T0,     T0,     INC_X
+    vld         $vr5,   T0,     0       // low half of X3
+    xvpermi.q   X3,     X3,     0
+
+    GCOMPLEXMUL GXCONJ, \
+    xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
+    X1, VALPHA, X1, TMP0, TMP1, TMP2, \
+    X2, VALPHA, X2, TMP0, TMP1, TMP2, \
+    X3, VALPHA, X3, TMP0, TMP1, TMP2
+.endm
+
+// Load 64 contiguous bytes of y into Y0 (offset 0) and Y1 (offset 0x20).
+.macro ZLOAD_Y_4
+    GLD xv, , Y0, Y, 0, Y1, Y, 0x20
+.endm
+
+// Strided load of four y elements, INC_Y bytes apart, into Y0 and Y1.
+// $vr10/$vr11 are the low halves of Y0/Y1; $vr13/$vr14 are the low halves of
+// A1/A2, used here as scratch. Clobbers T0.
+.macro ZLOAD_Y_4_GAP
+    vld  $vr10, Y, 0                // element 0 -> low half of Y0
+    vldx $vr13, Y, INC_Y            // element 1 -> low half of A1
+    PTR_ALSL  T0,   INC_Y,  Y,  1   // T0 = Y + 2 * INC_Y
+    vld  $vr11, T0, 0               // element 2 -> low half of Y1
+    vldx $vr14, T0, INC_Y           // element 3 -> low half of A2
+    // Immediate 0x02: keep the low half of Y0/Y1, take the high half from the
+    // low half of A1/A2.
+    GPERMI xv, q, Y0, A1, 0x02, Y1, A2, 0x02
+.endm
+
+// Update four y elements (Y0, Y1) from a tile of four rows by four columns.
+// Each column pointer PA0..PA3 feeds two 256-bit loads; every A register is
+// combined with the x register of its column (X0..X3) and accumulated into Y0
+// (first load) or Y1 (second load) through GCOMPLEXMADD.
+// NOTE(review): GLD_INC presumably advances the pointer by 0x20 after each
+// load, which is why both loads of a column use offset 0 - confirm in
+// loongarch64_asm.S.
+.macro ZGEMV_N_4x4
+    GLD_INC xv, , 0x20,       \
+    A0,  PA0, 0, A1,  PA0, 0, \
+    A2,  PA1, 0, A3,  PA1, 0, \
+    A4,  PA2, 0, A5,  PA2, 0, \
+    A6,  PA3, 0, A7,  PA3, 0
+    GCOMPLEXMADD GXCONJ, GCONJ, \
+    xvf, d, Y0, X0, A0,  Y0, TMP0, TMP1, TMP2, Y1, X0, A1,  Y1, TMP0, TMP1, TMP2, \
+    Y0, X1, A2,  Y0, TMP0, TMP1, TMP2, Y1, X1, A3,  Y1, TMP0, TMP1, TMP2, \
+    Y0, X2, A4,  Y0, TMP0, TMP1, TMP2, Y1, X2, A5,  Y1, TMP0, TMP1, TMP2, \
+    Y0, X3, A6,  Y0, TMP0, TMP1, TMP2, Y1, X3, A7,  Y1, TMP0, TMP1, TMP2
+.endm
+
+// Store Y0 and Y1 to 64 contiguous bytes of y (offsets 0 and 0x20).
+.macro ZSTORE_Y_4
+    GST xv, , Y0, Y, 0, Y1, Y, 0x20
+.endm
+
+// Strided store of the four y elements held in Y0 and Y1, INC_Y bytes apart.
+// Each element is written as two 64-bit lanes (offsets 0 and 8), so exactly
+// 16 bytes are written per element. Clobbers T0.
+.macro ZSTORE_Y_4_GAP
+    xvstelm.d   Y0,     Y,      0,      0       // element 0: lanes 0 and 1 of Y0
+    xvstelm.d   Y0,     Y,      0x08,   1
+    PTR_ADD     T0,     Y,      INC_Y
+    xvstelm.d   Y0,     T0,     0,      2       // element 1: lanes 2 and 3 of Y0
+    xvstelm.d   Y0,     T0,     0x08,   3
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y1,     T0,     0,      0       // element 2: lanes 0 and 1 of Y1
+    xvstelm.d   Y1,     T0,     0x08,   1
+    PTR_ADD     T0,     T0,     INC_Y
+    xvstelm.d   Y1,     T0,     0,      2       // element 3: lanes 2 and 3 of Y1
+    xvstelm.d   Y1,     T0,     0x08,   3
+.endm
+
+// Load one y element (16 bytes) into $vr10, the low half of Y0.
+.macro ZLOAD_Y_1
+    vld $vr10, Y, 0
+.endm
+
+// Update one y element from one row of four columns.
+// The A values are loaded 128 bits at a time into $vr12/$vr14/$vr16/$vr18,
+// the low halves of A0/A2/A4/A6. The arithmetic is 256 bits wide, but only the
+// low half of Y0 is written back by ZSTORE_Y_1.
+// NOTE(review): GLD_INC presumably advances each pointer by 0x10 - confirm.
+.macro ZGEMV_N_1x4
+    GLD_INC v, , 0x10, $vr12, PA0, 0, $vr14, PA1, 0, $vr16, PA2, 0, $vr18, PA3, 0
+    GCOMPLEXMADD GXCONJ, GCONJ, \
+    xvf, d, Y0, X0,  A0, Y0, TMP0, TMP1, TMP2, \
+    Y0, X1, A2,  Y0, TMP0, TMP1, TMP2, \
+    Y0, X2, A4,  Y0, TMP0, TMP1, TMP2, \
+    Y0, X3, A6,  Y0, TMP0, TMP1, TMP2
+.endm
+
+// Store one y element (16 bytes) from $vr10, the low half of Y0.
+.macro ZSTORE_Y_1
+    vst $vr10, Y, 0
+.endm
+
+// Load one x element, copy it into both 128-bit halves of X0 and run it
+// through GCOMPLEXMUL with VALPHA (presumably x * alpha - confirm in
+// loongarch64_asm.S).
+// The load is 128-bit: the xvpermi.q with immediate 0 only consumes the low
+// half, and a 256-bit load would read 16 bytes past the element, which for
+// the last element of x lies outside the vector.
+.macro ZLOAD_X_1
+    GLD v, , $vr2, X, 0x00      // $vr2 is the low half of X0
+    GPERMI xv, q, X0, X0, 0
+    GCOMPLEXMUL GXCONJ, \
+    xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2
+.endm
+
+// Update one y element from a single A value: a 128-bit load from PA0 into
+// $vr12 (the low half of A0), then GCOMPLEXMADD with X0 into Y0.
+// NOTE(review): GLD_INC presumably advances PA0 by 0x10 - confirm.
+.macro ZGEMV_N_1x1
+    GLD_INC v, , 0x10, $vr12, PA0, 0
+    GCOMPLEXMADD GXCONJ, GCONJ, \
+    xvf, d, Y0, X0,  A0, Y0, TMP0, TMP1, TMP2
+.endm
+
+// Kernel body for one (inc_x, inc_y) combination.
+//   XW  - tag spliced into every local label so each expansion is unique
+//   X_4 - suffix of the ZLOAD_ macro that loads four x elements
+//   X_1 - suffix of the ZLOAD_ macro that loads one x element
+//   Y_4 - suffix of the ZLOAD_/ZSTORE_ macros for four y elements
+//   Y_1 - suffix of the ZLOAD_/ZSTORE_ macros for one y element
+// Columns are processed four at a time and then one at a time; within each
+// pass the rows are processed four at a time and then one at a time, and y is
+// read, updated and written back for every column pass.
+// K is cleared and incremented alongside the row loops but is not read
+// anywhere in this macro.
+.macro ZGEMV_N_LASX XW:req, X_4:req, X_1:req, Y_4:req, Y_1:req
+    PTR_SRLI  J,      N,      2                 // J = N / 4 column groups
+    beqz      J,      .L_\XW\()_N_3
+    // K_LDA = 4 * LDA - M16: added to the column pointers after a pass so they
+    // move four columns on, net of M16 (assumes the row loops advance each
+    // pointer by M16 bytes in total).
+    PTR_SLLI  K_LDA,  LDA,    2
+    PTR_SUB   K_LDA,  K_LDA,  M16
+.L_\XW\()_N_L4:
+    ZLOAD_\X_4
+    xor     K,      K,      K
+    move    Y,      Y_ORG                       // restart at the first y element
+    PTR_SRLI  I,      M,       2                // I = M / 4 row groups
+    beqz      I,      .L_\XW\()_M_3
+.align 5
+.L_\XW\()_M_L4:
+    ZLOAD_\Y_4
+    ZGEMV_N_4x4
+    ZSTORE_\Y_4
+    PTR_ADDI    I,      I,      -1
+    PTR_ALSL    Y,      INC_Y,  Y,  2           // Y += 4 * INC_Y
+    PTR_ADDI    K,      K,      4
+    bnez        I,      .L_\XW\()_M_L4
+.L_\XW\()_M_3:
+    andi        I,      M,      3               // remaining rows (M mod 4)
+    beqz        I,      .L_\XW\()_M_END
+.align 5
+.L_\XW\()_M_L1:
+    ZLOAD_\Y_1
+    ZGEMV_N_1x4
+    ZSTORE_\Y_1
+    PTR_ADDI    I,      I,      -1
+    PTR_ADD     Y,      Y,      INC_Y
+    PTR_ADDI    K,      K,      1
+    bnez        I,      .L_\XW\()_M_L1
+.L_\XW\()_M_END:
+    PTR_ADDI    J,      J,      -1
+    // Move all four column pointers to the next group of columns.
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#endif
+    PTR_ALSL    X,      INC_X,  X,  2           // X += 4 * INC_X
+    bnez        J,      .L_\XW\()_N_L4
+.L_\XW\()_N_3:
+    andi        J,      N,      3               // remaining columns (N mod 4)
+    beqz        J,      .L_END
+.L_\XW\()_N_L1:
+    // Single-column pass: only PA0 and X0 are used.
+    ZLOAD_\X_1
+    xor     K,      K,      K
+    move    Y,      Y_ORG
+    move    I,      M
+    beqz    I,      .L_END                      // nothing to do when M is zero
+.align 5
+.L_\XW\()_N_1_M_L1:
+    ZLOAD_\Y_1
+    ZGEMV_N_1x1
+    ZSTORE_\Y_1
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   Y,      Y,      INC_Y
+    PTR_ADDI  K,      K,      1
+    bnez    I,      .L_\XW\()_N_1_M_L1
+.L_\XW\()_N_1_M_END:
+    PTR_ADDI    J,      J,      -1
+    PTR_SUB     K_LDA,  LDA,    M16             // one column forward, net of M16
+    PTR_ADD     PA0,    PA0,    K_LDA
+    PTR_ADD     X,      X,      INC_X
+    bnez        J,      .L_\XW\()_N_L1
+
+    b .L_END
+.endm
+
+    // Entry point: set up strides, alpha and the four column pointers, then
+    // jump to the ZGEMV_N_LASX expansion matching the (inc_x, inc_y) combination.
+    PROLOGUE
+    PTR_LD     INC_Y,  $sp,    0    // inc_y is read from the stack, before push_if_used runs
+    push_if_used 17 + 7, 31
+    PTR_ADDI   K,      $r0,     0x01    // K = 1
+    PTR_SUB    I,      INC_X,   K       // I = inc_x - 1
+    PTR_SUB    J,      INC_Y,   K       // J = inc_y - 1
+    maskeqz    I,      K,       I  /* if(inc_x == 1) I = 0; else I = 1; */
+    maskeqz    J,      K,       J  /* if(inc_y == 1) j = 0; else j = 1; */
+    PTR_ALSL   I,      I,       J,      1    // I = 2 * I + J, the index into .L_GAP_TABLE
+    // Shift lda, inc_x and inc_y left by 4 in place; M16 receives m shifted by 4.
+    // Presumably this converts element counts to byte counts - confirm in GSLLI.
+    GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4
+    // Init VALPHA
+    xvpackev.d      $xr0,   $xr1,   $xr0    // low 128 bits of $xr0 = (alpha_r, alpha_i)
+    xvreplve0.q     VALPHA, $xr0            // copy that pair into both halves of VALPHA
+    move     Y_ORG,  Y      // keep the start of y; the kernel macro moves Y
+    move     PA0,    A
+    // PA1..PA3 = the next three columns, each LDA further on. The #else branch
+    // repeats the 64-bit form.
+#if __loongarch_grlen == 64
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA
+#else
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA
+#endif
+    // Indirect jump through a table of 16-bit offsets relative to .L_GAP_TABLE.
+    la.local    T0,     .L_GAP_TABLE
+    PTR_ALSL    I,      I,      T0,     1    // address of entry I (2 bytes per entry)
+    ld.h        K,      I,      0 // Obtain the offset address
+    PTR_ADD     T0,     T0,     K
+    jirl        $r0,    T0,     0            // jump without linking
+.L_GAP_TABLE:
+    .hword  .L_GAP_0_0 - .L_GAP_TABLE
+    .hword  .L_GAP_0_1 - .L_GAP_TABLE
+    .hword  .L_GAP_1_0 - .L_GAP_TABLE
+    .hword  .L_GAP_1_1 - .L_GAP_TABLE
+.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */
+    ZGEMV_N_LASX GAP_0_0, X_4, X_1, Y_4, Y_1
+.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */
+    ZGEMV_N_LASX GAP_0_1, X_4, X_1, Y_4_GAP, Y_1
+.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */
+    ZGEMV_N_LASX GAP_1_0, X_4_GAP, X_1, Y_4, Y_1
+.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
+    ZGEMV_N_LASX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1
+.L_END:
+    // Common exit: every expansion branches here when it is done.
+    pop_if_used 17 + 7, 31
+    jirl    $r0, $r1, 0x0                    // return through $r1
+    EPILOGUE
+
diff --git a/kernel/loongarch64/zgemv_t_2_lsx.S b/kernel/loongarch64/zgemv_t_2_lsx.S
new file mode 100644
index 000000000..2a0fc172e
--- /dev/null
+++ b/kernel/loongarch64/zgemv_t_2_lsx.S
@@ -0,0 +1,268 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
+ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+ */
+/* Arguments, following the prototype above. */
+#define M       $r4
+#define N       $r5
+#define ALPHA_R $f0
+#define ALPHA_I $f1
+#define A       $r7
+#define LDA     $r8
+#define X       $r9
+#define INC_X   $r10
+#define Y       $r11
+/* inc_y is loaded from the stack into this register by the prologue. */
+#define INC_Y   $r6
+
+/* Integer scratch registers. K and PY0 name the same register ($r14). */
+#define J       $r12
+#define I       $r13
+#define K       $r14
+#define PY0     $r14
+#define X_ORG   $r15
+#define PY1     $r16
+#define K_LDA   $r17
+#define PY2     $r18
+#define T0      $r19
+#define PA0     $r20
+#define PA1     $r23
+#define PA2     $r24
+#define PA3     $r25
+#define PA4     $r26
+#define PA5     $r27
+#define PA6     $r28
+#define PA7     $r29
+#define M16     $r30
+
+/* 128-bit vector registers. */
+#define VALPHA  $vr0
+#define X0      $vr1
+#define X1      $vr2
+#define A0      $vr3
+#define A1      $vr4
+#define A2      $vr5
+#define A3      $vr6
+#define A4      $vr7
+#define A5      $vr8
+#define A6      $vr9
+#define A7      $vr10
+#define A8      $vr11
+#define A9      $vr12
+#define A10     $vr13
+#define A11     $vr14
+#define A12     $vr15
+#define A13     $vr16
+#define A14     $vr17
+#define A15     $vr18
+#define TP0     $vr19
+#define TP1     $vr20
+#define TP2     $vr21
+#define TP3     $vr22
+#define TP4     $vr23
+#define TP5     $vr24
+#define TP6     $vr25
+#define TP7     $vr26
+#define TMP0    $vr27
+#define TMP1    $vr28
+#define TMP2    $vr29
+/* Y0..Y7 are second names for the registers of A0..A7 ($vr3..$vr10). */
+#define Y0      $vr3
+#define Y1      $vr4
+#define Y2      $vr5
+#define Y3      $vr6
+#define Y4      $vr7
+#define Y5      $vr8
+#define Y6      $vr9
+#define Y7      $vr10
+
+/* First flag pair, passed to GCOMPLEXMADD in the accumulation loops:
+ * GXCONJ1 is 1 exactly when one of CONJ/XCONJ is defined without the other;
+ * GCONJ1 is always 0. */
+#define GCONJ1  0
+#if defined(CONJ) != defined(XCONJ)
+#define GXCONJ1 1
+#else
+#define GXCONJ1 0
+#endif
+
+/* Second flag pair, passed to GCOMPLEXMADD for the y update:
+ * GXCONJ2 is always 0; GCONJ2 follows XCONJ. */
+#define GXCONJ2 0
+#ifdef XCONJ
+#define GCONJ2  1
+#else
+#define GCONJ2  0
+#endif
+
+// Clear the two accumulators TP0 and TP1 (each register XORed with itself).
+.macro ZERO_Y2
+    GXOR v, v, TP0, TP0, TP0, TP1, TP1, TP1
+.endm
+
+// Clear the single accumulator TP0.
+.macro ZERO_Y1
+    GXOR v, v, TP0, TP0, TP0
+.endm
+
+// Load two contiguous x elements (offsets 0x00 and 0x10) into X0 and X1.
+.macro ZLOAD_X2
+    GLD v, , X0, X, 0x00, X1, X, 0x10
+.endm
+
+// Strided variant of ZLOAD_X2: X1 is loaded from X + INC_X.
+.macro ZLOAD_X2_GAP
+    vld        X0,     X,      0
+    vldx       X1,     X,      INC_X
+.endm
+
+// Accumulate a tile of two rows by two columns: the two values loaded from
+// PA0 are combined with X0 and X1 into TP0, and the two from PA1 into TP1.
+// NOTE(review): GLD_INC presumably advances the pointer by 0x10 after each
+// load, which is why both loads of a column use offset 0 - confirm in
+// loongarch64_asm.S.
+.macro ZGEMV_T_2x2
+    GLD_INC v, , 0x10,        \
+    A0,  PA0, 0, A1,  PA0, 0, \
+    A2,  PA1, 0, A3,  PA1, 0
+    GCOMPLEXMADD GXCONJ1, GCONJ1, \
+    vf, d, TP0, A0,  X0, TP0, TMP0, TMP1, TMP2, TP0, A1,  X1, TP0, TMP0, TMP1, TMP2, \
+    TP1, A2,  X0, TP1, TMP0, TMP1, TMP2, TP1, A3,  X1, TP1, TMP0, TMP1, TMP2
+.endm
+
+// Transposed kernel body for one inc_x case.
+//   XW - tag spliced into every local label so each expansion is unique
+//   X2 - suffix of the ZLOAD_ macro that loads two x elements
+// Columns are processed two at a time and then one at a time. For each column
+// the products with x are summed over the rows, and the sum is combined with
+// VALPHA and added to the matching y element.
+.macro ZGEMV_T_LSX XW:req, X2:req
+    PTR_SRLI  J,      N,      1                 // J = N / 2 column pairs
+    beqz      J,      .L_\XW\()_N_1
+    // K_LDA = 2 * LDA - M16: added to the column pointers after a pass so they
+    // move two columns on, net of M16 (assumes the row loops advance each
+    // pointer by M16 bytes in total).
+    PTR_SLLI  K_LDA,  LDA,    1
+    PTR_SUB   K_LDA,  K_LDA,  M16
+.L_\XW\()_N_L2:
+    ZERO_Y2
+    move      X,      X_ORG                     // restart at the first x element
+    PTR_SRLI  I,      M,       1                // I = M / 2 row pairs
+    beqz      I,      .L_\XW\()_M_1
+.align 5
+.L_\XW\()_M_L2:
+    ZLOAD_\X2
+    ZGEMV_T_2x2
+    PTR_ADDI    I,  I,  -1
+    PTR_ALSL    X,  INC_X,  X,  1               // X += 2 * INC_X
+    bnez        I,  .L_\XW\()_M_L2
+.L_\XW\()_M_1:
+    // Accumulated
+    // Y0/Y1 share registers with A0/A1, which the remainder loop below keeps
+    // accumulating into.
+    GCOMPLEXACC vf, d, Y0, TP0, Y1, TP1
+    andi        I,      M,      1               // one leftover row when M is odd
+    beqz        I,      .L_\XW\()_M_END
+.align 5
+.L_\XW\()_M_L1:
+    GLD v, , X0, X, 0x00, A8, PA0, 0x00, A9, PA1, 0x00
+#if __loongarch_grlen == 64
+    GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10
+#elif __loongarch_grlen == 32
+    GADDI , w, PA0, PA0, 0x10, PA1, PA1, 0x10
+#else
+    GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10
+#endif
+
+    GCOMPLEXMADD GXCONJ1, GCONJ1, \
+    vf, d, A0, A8,  X0, A0, TMP0, TMP1, TMP2, A1, A9,  X0, A1, TMP0, TMP1, TMP2
+
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    bnez      I,      .L_\XW\()_M_L1
+.L_\XW\()_M_END:
+    // Load the two y elements, combine the column sums with VALPHA into them
+    // and store them back further down.
+    vld  A8, Y, 0x00
+    vldx A9, Y, INC_Y
+
+    GCOMPLEXMADD GXCONJ2, GCONJ2, \
+    vf, d, A8,  VALPHA, A0, A8,  TMP0, TMP1, TMP2, A9,  VALPHA, A1, A9,  TMP0, TMP1, TMP2
+
+    PTR_ADDI    J,      J,      -1
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA
+#endif
+    vst  $vr11, Y,   0x00                       // $vr11 is A8
+    vstx $vr12, Y,   INC_Y                      // $vr12 is A9
+    PTR_ALSL    Y,      INC_Y,  Y,  1           // Y += 2 * INC_Y
+    bnez        J,      .L_\XW\()_N_L2
+.L_\XW\()_N_1:
+    andi        J,      N,      1               // one leftover column when N is odd
+    beqz        J,      .L_END
+    PTR_SUB     K_LDA,  LDA,    M16             // one column forward, net of M16
+.L_\XW\()_N_L1:
+    ZERO_Y1
+    move    X,      X_ORG
+    move    I,      M
+    beqz    I,      .L_END                      // nothing to do when M is zero
+.align 5
+.L_\XW\()_N_1_M_L1:
+    GLD v, , A0, PA0, 0x00, X0, X, 0x00
+    GCOMPLEXMADD GXCONJ1, GCONJ1, \
+    vf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    PTR_ADDI  PA0,    PA0,    0x10
+    bnez      I,      .L_\XW\()_N_1_M_L1
+.L_\XW\()_N_1_M_END:
+    PTR_ADDI  J,      J,      -1
+    vld       A0,     Y,      0x00
+    GCOMPLEXMADD GXCONJ2, GCONJ2, \
+    vf, d, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2
+    vst       $vr3,   Y,      0x00              // $vr3 is A0
+    PTR_ADD   PA0,    PA0,    K_LDA
+    PTR_ADD   Y,      Y,      INC_Y
+    bnez      J,      .L_\XW\()_N_L1
+
+    b .L_END
+.endm
+
+    // Entry point: set up strides, alpha and the two column pointers, then
+    // jump to the ZGEMV_T_LSX expansion matching inc_x.
+    PROLOGUE
+    PTR_LD     INC_Y,  $sp,    0    // inc_y is read from the stack, before push_if_used runs
+    push_if_used 17 + 8, 30
+    PTR_ADDI   K,      $r0,     0x01    // K = 1
+    PTR_SUB    I,      INC_X,   K       // I = inc_x - 1
+    maskeqz    I,      K,       I  /* if(inc_x == 1) I = 0; else I = 1; */
+    // Shift lda, inc_x and inc_y left by 4 in place; M16 receives m shifted by 4.
+    // Presumably this converts element counts to byte counts - confirm in GSLLI.
+    GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4
+    // Init VALPHA
+    vpackev.d      VALPHA,   $vr1,   $vr0    // VALPHA = (alpha_r, alpha_i)
+    move     X_ORG,  X      // keep the start of x; the kernel macro moves X
+    move     PA0,    A
+    // PA1 = PA0 + LDA. The #else branch repeats the 64-bit form.
+#if __loongarch_grlen == 64
+    GADD , d, PA1, PA0, LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA1, PA0, LDA
+#else
+    GADD , d, PA1, PA0, LDA
+#endif
+    // Indirect jump through a table of 16-bit offsets relative to .L_GAP_TABLE.
+    la.local    T0,     .L_GAP_TABLE
+    PTR_ALSL    I,      I,      T0,     1    // address of entry I (2 bytes per entry)
+    ld.h        K,      I,      0
+    PTR_ADD     T0,     T0,     K
+    jirl        $r0,    T0,     0            // jump without linking
+.L_GAP_TABLE:
+    .hword .L_GAP_0 - .L_GAP_TABLE
+    .hword .L_GAP_1 - .L_GAP_TABLE
+.L_GAP_0: /* if (incx == 1) */
+    ZGEMV_T_LSX GAP_0, X2
+.L_GAP_1: /* if (incx != 1) */
+    ZGEMV_T_LSX GAP_1, X2_GAP
+.L_END:
+    // Common exit: both expansions branch here when they are done.
+    pop_if_used 17 + 8, 30
+    jirl    $r0, $r1, 0x0                    // return through $r1
+    EPILOGUE
diff --git a/kernel/loongarch64/zgemv_t_4_lasx.S b/kernel/loongarch64/zgemv_t_4_lasx.S
new file mode 100644
index 000000000..4d33b8f96
--- /dev/null
+++ b/kernel/loongarch64/zgemv_t_4_lasx.S
@@ -0,0 +1,299 @@
+/*******************************************************************************
+Copyright (c) 2024, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2024/02/20 guxiwei
+*        UTEST                  : OK
+*        CTEST                  : OK
+*        TEST                   : OK
+*
+*
+*********************************************************************/
+
+/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
+ * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+ */
+/* Arguments, following the prototype above. */
+#define M       $r4
+#define N       $r5
+#define ALPHA_R $f0
+#define ALPHA_I $f1
+#define A       $r7
+#define LDA     $r8
+#define X       $r9
+#define INC_X   $r10
+#define Y       $r11
+/* inc_y is loaded from the stack into this register by the prologue. */
+#define INC_Y   $r6
+
+/* Integer scratch registers. K and PY0 name the same register ($r14). */
+#define J       $r12
+#define I       $r13
+#define K       $r14
+#define PY0     $r14
+#define X_ORG   $r15
+#define PY1     $r16
+#define K_LDA   $r17
+#define PY2     $r18
+#define T0      $r19
+#define PA0     $r20
+#define PA1     $r23
+#define PA2     $r24
+#define PA3     $r25
+#define PA4     $r26
+#define PA5     $r27
+#define PA6     $r28
+#define PA7     $r29
+#define M16     $r30
+
+/* 256-bit vector registers. The raw $vrN names used by the kernel macro are
+ * the low 128 bits of the same-numbered $xrN (for example $vr11 is the low
+ * half of A8). */
+#define VALPHA  $xr0
+#define X0      $xr1
+#define X1      $xr2
+#define A0      $xr3
+#define A1      $xr4
+#define A2      $xr5
+#define A3      $xr6
+#define A4      $xr7
+#define A5      $xr8
+#define A6      $xr9
+#define A7      $xr10
+#define A8      $xr11
+#define A9      $xr12
+#define A10     $xr13
+#define A11     $xr14
+#define A12     $xr15
+#define A13     $xr16
+#define A14     $xr17
+#define A15     $xr18
+#define TP0     $xr19
+#define TP1     $xr20
+#define TP2     $xr21
+#define TP3     $xr22
+#define TP4     $xr23
+#define TP5     $xr24
+#define TP6     $xr25
+#define TP7     $xr26
+#define TMP0    $xr27
+#define TMP1    $xr28
+#define TMP2    $xr29
+/* Y0..Y7 are second names for the registers of A0..A7 ($xr3..$xr10). */
+#define Y0      $xr3
+#define Y1      $xr4
+#define Y2      $xr5
+#define Y3      $xr6
+#define Y4      $xr7
+#define Y5      $xr8
+#define Y6      $xr9
+#define Y7      $xr10
+
+/* First flag pair, passed to GCOMPLEXMADD in the accumulation loops:
+ * GXCONJ1 is 1 exactly when one of CONJ/XCONJ is defined without the other;
+ * GCONJ1 is always 0. */
+#define GCONJ1  0
+#if defined(CONJ) != defined(XCONJ)
+#define GXCONJ1 1
+#else
+#define GXCONJ1 0
+#endif
+
+/* Second flag pair, passed to GCOMPLEXMADD for the y update:
+ * GXCONJ2 is always 0; GCONJ2 follows XCONJ. */
+#define GXCONJ2 0
+#ifdef XCONJ
+#define GCONJ2  1
+#else
+#define GCONJ2  0
+#endif
+
+// Clear the four accumulators TP0..TP3 (each register XORed with itself).
+.macro ZERO_Y4
+    GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3
+.endm
+
+// Clear the single accumulator TP0.
+.macro ZERO_Y1
+    GXOR xv, v, TP0, TP0, TP0
+.endm
+
+// Load 64 contiguous bytes of x into X0 (offset 0x00) and X1 (offset 0x20).
+.macro ZLOAD_X4
+    GLD xv, , X0, X, 0x00, X1, X, 0x20
+.endm
+
+// Strided variant of ZLOAD_X4: gather four x elements that are INC_X bytes
+// apart into X0 (elements 0 and 1) and X1 (elements 2 and 3).
+// Every element is fetched with a 128-bit vld into the low half of a
+// register; xvpermi.q with immediate 0x02 then keeps the low half of the
+// destination and fills its high half from the low half of A0. A 256-bit load
+// would read 16 bytes beyond the element, which for the last element of x
+// lies outside the vector. Clobbers T0 and A0.
+.macro ZLOAD_X4_GAP
+    vld         $vr1,   X,      0       // element 0 -> low half of X0
+
+    PTR_ADD     T0,     X,      INC_X
+    vld         $vr3,   T0,     0       // element 1 -> low half of A0
+    xvpermi.q   X0,     A0,     0x02
+
+    PTR_ADD     T0,     T0,     INC_X
+    vld         $vr2,   T0,     0       // element 2 -> low half of X1
+
+    PTR_ADD     T0,     T0,     INC_X
+    vld         $vr3,   T0,     0       // element 3 -> low half of A0
+    xvpermi.q   X1,     A0,     0x02
+.endm
+
+// Accumulate a tile of four rows by four columns: the two 256-bit values
+// loaded from each column pointer PA0..PA3 are combined with X0 and X1 into
+// that column accumulator (TP0..TP3).
+// NOTE(review): GLD_INC presumably advances the pointer by 0x20 after each
+// load, which is why both loads of a column use offset 0 - confirm in
+// loongarch64_asm.S.
+.macro ZGEMV_T_4x4
+    GLD_INC xv, , 0x20,       \
+    A0,  PA0, 0, A1,  PA0, 0, \
+    A2,  PA1, 0, A3,  PA1, 0, \
+    A4,  PA2, 0, A5,  PA2, 0, \
+    A6,  PA3, 0, A7,  PA3, 0
+    GCOMPLEXMADD GXCONJ1, GCONJ1, \
+    xvf, d, TP0, A0,  X0, TP0, TMP0, TMP1, TMP2, TP0, A1,  X1, TP0, TMP0, TMP1, TMP2, \
+    TP1, A2,  X0, TP1, TMP0, TMP1, TMP2, TP1, A3,  X1, TP1, TMP0, TMP1, TMP2, \
+    TP2, A4,  X0, TP2, TMP0, TMP1, TMP2, TP2, A5,  X1, TP2, TMP0, TMP1, TMP2, \
+    TP3, A6,  X0, TP3, TMP0, TMP1, TMP2, TP3, A7,  X1, TP3, TMP0, TMP1, TMP2
+.endm
+
+// Transposed kernel body for one inc_x case.
+//   XW - tag spliced into every local label so each expansion is unique
+//   X4 - suffix of the ZLOAD_ macro that loads four x elements
+// Columns are processed four at a time and then one at a time. For each column
+// the products with x are summed over the rows, and the sum is combined with
+// VALPHA and added to the matching y element.
+// Every load of a single x, A or y element is a 128-bit vld/vldx into the low
+// half of its register. Only the low halves reach the vst/vstx stores, and a
+// 256-bit load would read 16 bytes beyond the element, which runs past the end
+// of x, A or y on the last row or column. The arithmetic stays 256 bits wide;
+// whatever it produces in the high halves is never stored.
+.macro ZGEMV_T_LASX XW:req, X4:req
+    PTR_SRLI  J,      N,      2                 // J = N / 4 column groups
+    beqz      J,      .L_\XW\()_N_3
+    // K_LDA = 4 * LDA - M16: added to the column pointers after a pass so they
+    // move four columns on, net of M16 (assumes the row loops advance each
+    // pointer by M16 bytes in total).
+    PTR_SLLI  K_LDA,  LDA,    2
+    PTR_SUB   K_LDA,  K_LDA,  M16
+.L_\XW\()_N_L4:
+    ZERO_Y4
+    move      X,      X_ORG                     // restart at the first x element
+    PTR_SRLI  I,      M,       2                // I = M / 4 row groups
+    beqz      I,      .L_\XW\()_M_3
+.align 5
+.L_\XW\()_M_L4:
+    ZLOAD_\X4
+    ZGEMV_T_4x4
+    PTR_ADDI    I,  I,  -1
+    PTR_ALSL    X,  INC_X,  X,  2               // X += 4 * INC_X
+    bnez        I,  .L_\XW\()_M_L4
+.L_\XW\()_M_3:
+    // Accumulated
+    // Y0..Y3 share registers with A0..A3, which the remainder loop below keeps
+    // accumulating into.
+    GCOMPLEXACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3
+    andi        I,      M,      3               // remaining rows (M mod 4)
+    beqz        I,      .L_\XW\()_M_END
+.align 5
+.L_\XW\()_M_L1:
+    // One row: $vr1 is the low half of X0, $vr11..$vr14 those of A8..A11.
+    GLD v, , $vr1, X, 0x00, $vr11, PA0, 0x00, $vr12, PA1, 0x00, $vr13, PA2, 0x00, $vr14, PA3, 0x00
+#if __loongarch_grlen == 64
+    GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10, PA2, PA2, 0x10, PA3, PA3, 0x10
+#elif __loongarch_grlen == 32
+    GADDI , w, PA0, PA0, 0x10, PA1, PA1, 0x10, PA2, PA2, 0x10, PA3, PA3, 0x10
+#else
+    GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10, PA2, PA2, 0x10, PA3, PA3, 0x10
+#endif
+
+    GCOMPLEXMADD GXCONJ1, GCONJ1, \
+    xvf, d, A0, A8,  X0, A0, TMP0, TMP1, TMP2, A1, A9,  X0, A1, TMP0, TMP1, TMP2, \
+    A2, A10, X0, A2, TMP0, TMP1, TMP2, A3, A11, X0, A3, TMP0, TMP1, TMP2
+
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    bnez      I,      .L_\XW\()_M_L1
+.L_\XW\()_M_END:
+    // Load the four y elements into the low halves of A8..A11, combine the
+    // column sums with VALPHA into them and store them back further down.
+    vld   $vr11, Y, 0x00
+    vldx  $vr12, Y, INC_Y
+    PTR_ALSL    PY0, INC_Y,  Y,  1              // PY0 = Y + 2 * INC_Y
+    vld   $vr13, PY0, 0x00
+    vldx  $vr14, PY0, INC_Y
+
+    GCOMPLEXMADD GXCONJ2, GCONJ2, \
+    xvf, d, A8,  VALPHA, A0, A8,  TMP0, TMP1, TMP2, A9,  VALPHA, A1, A9,  TMP0, TMP1, TMP2,\
+    A10, VALPHA, A2, A10, TMP0, TMP1, TMP2, A11, VALPHA, A3, A11, TMP0, TMP1, TMP2
+
+    PTR_ADDI    J,      J,      -1
+#if __loongarch_grlen == 64
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#else
+    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
+#endif
+    vst  $vr11, Y,   0x00
+    vstx $vr12, Y,   INC_Y
+    vst  $vr13, PY0, 0x00
+    vstx $vr14, PY0, INC_Y
+    PTR_ALSL    Y,      INC_Y,  Y,  2           // Y += 4 * INC_Y
+    bnez        J,      .L_\XW\()_N_L4
+.L_\XW\()_N_3:
+    andi        J,      N,      3               // remaining columns (N mod 4)
+    beqz        J,      .L_END
+    PTR_SUB     K_LDA,  LDA,    M16             // one column forward, net of M16
+.L_\XW\()_N_1:
+    ZERO_Y1
+    move    X,      X_ORG
+    move    I,      M
+    beqz    I,      .L_END                      // nothing to do when M is zero
+.align 5
+.L_\XW\()_N_1_M_L1:
+    // One A value and one x element: $vr3 and $vr1 are the low halves of A0
+    // and X0.
+    GLD v, , $vr3, PA0, 0x00, $vr1, X, 0x00
+    GCOMPLEXMADD GXCONJ1, GCONJ1, \
+    xvf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2
+    PTR_ADDI  I,      I,      -1
+    PTR_ADD   X,      X,      INC_X
+    PTR_ADDI  PA0,    PA0,    0x10
+    bnez      I,      .L_\XW\()_N_1_M_L1
+.L_\XW\()_N_1_M_END:
+    PTR_ADDI  J,      J,      -1
+    vld       $vr3,   Y,      0x00              // y element -> low half of A0
+    GCOMPLEXMADD GXCONJ2, GCONJ2, \
+    xvf, d, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2
+    vst       $vr3,   Y,      0x00
+    PTR_ADD   PA0,    PA0,    K_LDA
+    PTR_ADD   Y,      Y,      INC_Y
+    bnez      J,      .L_\XW\()_N_1
+
+    b .L_END
+.endm
+
+    // Entry point: set up strides, alpha and the column pointers, then jump to
+    // the ZGEMV_T_LASX expansion matching inc_x.
+    PROLOGUE
+    PTR_LD     INC_Y,  $sp,    0    // inc_y is read from the stack, before push_if_used runs
+    push_if_used 17 + 8, 30
+    PTR_ADDI   K,      $r0,     0x01    // K = 1
+    PTR_SUB    I,      INC_X,   K       // I = inc_x - 1
+    maskeqz    I,      K,       I  /* if(inc_x == 1) I = 0; else I = 1; */
+    // Shift lda, inc_x and inc_y left by 4 in place; M16 receives m shifted by 4.
+    // Presumably this converts element counts to byte counts - confirm in GSLLI.
+    GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4
+    // Init VALPHA
+    xvpackev.d      $xr0,   $xr1,   $xr0    // low 128 bits of $xr0 = (alpha_r, alpha_i)
+    xvreplve0.q     VALPHA, $xr0            // copy that pair into both halves of VALPHA
+    move     X_ORG,  X      // keep the start of x; the kernel macro moves X
+    move     PA0,    A
+    // PA1..PA4 = the following columns, each LDA further on. PA4 is computed
+    // here but not referenced by ZGEMV_T_LASX. The #else branch repeats the
+    // 64-bit form.
+#if __loongarch_grlen == 64
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
+#elif __loongarch_grlen == 32
+    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
+#else
+    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
+#endif
+    // Indirect jump through a table of 16-bit offsets relative to .L_GAP_TABLE.
+    la.local    T0,     .L_GAP_TABLE
+    PTR_ALSL    I,      I,      T0,     1    // address of entry I (2 bytes per entry)
+    ld.h        K,      I,      0
+    PTR_ADD     T0,     T0,     K
+    jirl        $r0,    T0,     0            // jump without linking
+.L_GAP_TABLE:
+    .hword .L_GAP_0 - .L_GAP_TABLE
+    .hword .L_GAP_1 - .L_GAP_TABLE
+.L_GAP_0: /* if (incx == 1) */
+    ZGEMV_T_LASX GAP_0, X4
+.L_GAP_1: /* if (incx != 1) */
+    ZGEMV_T_LASX GAP_1, X4_GAP
+.L_END:
+    // Common exit: both expansions branch here when they are done.
+    pop_if_used 17 + 8, 30
+    jirl    $r0, $r1, 0x0                    // return through $r1
+    EPILOGUE
diff --git a/kernel/loongarch64/znrm2.S b/kernel/loongarch64/znrm2.S
index 49f640268..8e2165ab7 100644
--- a/kernel/loongarch64/znrm2.S
+++ b/kernel/loongarch64/znrm2.S
@@ -64,7 +64,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
    MTC  s1, $r0
    bge $r0,    N, .L999
    slli.d INCX, INCX, ZBASE_SHIFT
-   bge $r0,    INCX, .L999
+   beq $r0,    INCX, .L999
    move    XX, X
    MOV s2, s1
    srai.d  I, N, 2
diff --git a/kernel/loongarch64/znrm2_lasx.S b/kernel/loongarch64/znrm2_lasx.S
new file mode 100644
index 000000000..53f8a6e05
--- /dev/null
+++ b/kernel/loongarch64/znrm2_lasx.S
@@ -0,0 +1,252 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4  // element count
+#define X      $r5  // pointer walked by the first (max-finding) pass
+#define INCX   $r6  // stride; shifted left by ZBASE_SHIFT before use
+#define XX     $r19  // copy of the initial X, walked by the second (sum) pass
+#define I      $r17  // loop counter
+#define TEMP   $r18  // integer scratch
+#define t1     $r12  // t1..t4: integer scratch for gathering strided elements
+#define t2     $r13
+#define t3     $r14
+#define t4     $r15
+#define INF    $f23  // holds the bit pattern 0x7FF << 52 (double +infinity)
+#define a1     $f22  // holds 0.0 for the max == 0 test
+#define max    $f17  // scalar view of register 17, which is also VM0: the code reads lane 0 of VM0 through this name
+#define ALPHA  $f12  // scale factor 1 / max
+#define a3     $f15  // scalar load of the component at offset 0 (same register number as VX0)
+#define a2     $f16  // scalar load of the component at offset 1 (same register number as VX1)
+#define VX0    $xr15  // vector input
+#define VX1    $xr16  // vector input
+#define VM0    $xr17  // running maximum by magnitude
+#define VM1    $xr18  // VM1..VM3: vector temporaries
+#define VM2    $xr13
+#define VM3    $xr14
+#define res1   $xr19  // sum-of-squares accumulator; its lane 0 is addressed as $f19 in the scalar tail
+#define res2   $xr20  // second sum-of-squares accumulator
+#define VALPHA $xr21  // ALPHA broadcast to every lane
+
+
+    PROLOGUE  // znrm2, LASX variant: pass 1 finds the largest component magnitude, pass 2 sums squares of components scaled by its reciprocal
+
+#ifdef F_INTERFACE
+    LDINT   N,     0(N)  // with F_INTERFACE the arguments are pointers; load the values
+    LDINT   INCX,  0(INCX)
+#endif
+
+    xvxor.v   res1, res1, res1  // res1 = 0
+    xvxor.v   res2, res2, res2  // res2 = 0
+    xvxor.v VM0, VM0, VM0  // VM0 = 0, so max starts at 0
+    bge $r0, N, .L999  // N <= 0: exit; .L999 then computes 0 * sqrt(0)
+    beq $r0, INCX, .L999  // INCX == 0: same exit; a negative INCX continues
+    move  XX, X  // save the start of X for pass 2
+    // Init INF
+    addi.d  TEMP, $r0,  0x7FF  // exponent field all ones
+    slli.d  TEMP, TEMP, 52  // TEMP = 0x7FF0000000000000
+    MTC  INF, TEMP  // INF = +infinity
+    li.d  TEMP, 1
+    slli.d TEMP, TEMP, ZBASE_SHIFT  // TEMP = scaled stride that corresponds to INCX == 1
+    slli.d INCX, INCX, ZBASE_SHIFT  // scale INCX the same way
+    srai.d I, N, 2  // I = N / 4: each vector iteration covers 4 elements
+    bne INCX, TEMP, .L20  // non-unit stride: use the gather loop
+    bge $r0,  I, .L97  // fewer than 4 elements: scalar tail only
+    .align 3
+
+.L10:  // pass 1, unit stride: 8 values (4 elements of 2 components) per iteration
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvfmaxa.d VM1, VX1, VX0  // lane-wise pick of the operand with larger magnitude
+    xvfmaxa.d VM0, VM0, VM1  // fold into the running maximum
+    addi.d I, I, -1
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L10
+    b .L96
+    .align 3
+
+.L20: // INCX!=1
+    bge $r0, I, .L97  // fewer than 4 elements: scalar tail only
+    .align 3
+
+.L21:  // pass 1, strided: gather 4 elements through integer registers
+    ld.d t1, X, 0 * SIZE  // element 0, both components
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE  // element 1
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0  // VX0 = elements 0 and 1
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0 * SIZE  // element 2
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE  // element 3
+    ld.d t4, X, 1 * SIZE
+    xvinsgr2vr.d VX1, t1, 0  // VX1 = elements 2 and 3
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    add.d X, X, INCX
+    xvfmaxa.d VM1, VX0, VX1
+    xvfmaxa.d VM0, VM0, VM1  // fold into the running maximum
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    b .L96
+    .align 3
+
+.L96:  // reduce the four lanes of VM0 into lane 0
+    xvpickve.d VX0, VM0, 1  // lane 1 -> lane 0 of VX0
+    xvpickve.d VX1, VM0, 2  // lane 2
+    xvpickve.d VM3, VM0, 3  // lane 3
+    xvfmaxa.d VM1, VX0, VX1
+    xvfmaxa.d VM2, VM3, VM0
+    xvfmaxa.d VM0, VM1, VM2  // lane 0 of VM0 (read as max) now covers all four lanes
+    .align 3
+
+.L97:
+    andi I, N, 3  // I = N % 4 leftover elements
+    bge $r0, I, .L99
+    .align 3
+
+.L98:  // pass 1 scalar tail: one element per iteration
+    fld.d a3, X, 0 * SIZE
+    fld.d a2, X, 1 * SIZE
+    fmaxa.d a3, a2, a3  // larger-magnitude component of this element
+    fmaxa.d max, a3, max
+    addi.d I, I, -1
+    add.d X, X, INCX
+    blt $r0, I, .L98
+    .align 3
+
+.L99:  // derive the scale factor ALPHA = 1 / max
+    fabs.d  max, max  // fmaxa returns the winning operand with its sign; take the magnitude
+    lu12i.w  TEMP, 0x3f800 // TEMP = 0x3f800000, the single-precision bit pattern of 1.0
+    movgr2fr.d  a1, $r0  // a1 = 0.0
+    movgr2fr.w  ALPHA, TEMP
+    CMPEQ  $fcc0, max, a1  // test max == 0 (CMPEQ comes from common.h)
+    fcvt.d.s  ALPHA, ALPHA  // widen 1.0f to double
+    bcnez  $fcc0, .L999  // max == 0: skip pass 2, result is 0
+    fdiv.d  ALPHA, ALPHA, max  // ALPHA = 1 / max
+    CMPEQ  $fcc0, INF, ALPHA  // test whether 1 / max overflowed to +infinity
+    bcnez  $fcc0, .L999  // if so skip pass 2; the sum is still 0, so the result is 0
+    movfr2gr.d TEMP, ALPHA
+    xvreplgr2vr.d VALPHA, TEMP  // broadcast ALPHA to every lane
+
+.L100:  // pass 2 setup: same stride dispatch as pass 1, walking XX
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, ZBASE_SHIFT
+    srai.d  I, N, 2  // I = N / 4
+    bne INCX, TEMP, .L120
+    bge $r0, I, .L997
+    .align 3
+
+.L110:  // pass 2, unit stride: accumulate (x * ALPHA)^2 for 8 values per iteration
+    xvld VX0, XX, 0 * SIZE
+    xvld VX1, XX, 4 * SIZE
+    xvfmul.d VM2, VX0, VALPHA  // scale before squaring
+    xvfmul.d VM3, VX1, VALPHA
+    xvfmadd.d res1, VM2, VM2, res1  // res1 += VM2 * VM2
+    xvfmadd.d res2, VM3, VM3, res2  // res2 += VM3 * VM3
+    addi.d XX, XX, 8 * SIZE
+    addi.d I, I, -1
+    blt $r0, I, .L110
+    b .L996
+    .align 3
+
+.L120:
+    bge $r0, I, .L997  // fewer than 4 elements: scalar tail only
+    .align 3
+
+.L121:  // pass 2, strided: gather 4 elements, then scale and accumulate
+    ld.d t1, XX, 0 * SIZE  // element 0
+    ld.d t2, XX, 1 * SIZE
+    add.d XX, XX, INCX
+    ld.d t3, XX, 0 * SIZE  // element 1
+    ld.d t4, XX, 1 * SIZE
+    add.d XX, XX, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, XX, 0 * SIZE  // element 2
+    ld.d t2, XX, 1 * SIZE
+    add.d XX, XX, INCX
+    ld.d t3, XX, 0 * SIZE  // element 3
+    ld.d t4, XX, 1 * SIZE
+    add.d XX, XX, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvfmul.d VM2, VX0, VALPHA
+    xvfmul.d VM3, VX1, VALPHA
+    xvfmadd.d res1, VM2, VM2, res1
+    xvfmadd.d res2, VM3, VM3, res2
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    b .L996
+    .align 3
+
+.L996:  // reduce both accumulators into lane 0 of res1
+    xvfadd.d res1, res1, res2
+    xvpickve.d VX0, res1, 1  // lane 1 -> lane 0 of VX0
+    xvpickve.d VX1, res1, 2  // lane 2
+    xvpickve.d VM2, res1, 3  // lane 3
+    xvfadd.d res1, VX0, res1
+    xvfadd.d VX1, VX1, VM2
+    xvfadd.d res1, VX1, res1  // lane 0 of res1 = sum of all four lanes
+    .align 3
+
+.L997:
+    andi I, N, 3  // I = N % 4 leftover elements
+    bge $r0, I, .L999
+    .align 3
+
+.L998:  // pass 2 scalar tail; $f19 is register 19, the same register number as res1
+    fld.d a3, XX, 0 * SIZE
+    fld.d a2, XX, 1 * SIZE
+    addi.d I, I, -1
+    fmul.d a3, a3, ALPHA
+    fmadd.d $f19, a3, a3, $f19  // sum += (a3 * ALPHA)^2
+    fmul.d a2, a2, ALPHA
+    fmadd.d $f19, a2, a2, $f19  // sum += (a2 * ALPHA)^2
+    add.d XX, XX , INCX
+    blt $r0, I, .L998
+    .align 3
+
+.L999:  // result = max * sqrt(sum of scaled squares)
+    fsqrt.d $f19, $f19
+    fmul.d $f0, max, $f19  // result is left in $f0
+    jirl $r0, $r1, 0x0  // return through $r1
+    .align 3
+
+    EPILOGUE
diff --git a/kernel/loongarch64/znrm2_lsx.S b/kernel/loongarch64/znrm2_lsx.S
new file mode 100644
index 000000000..14c59d504
--- /dev/null
+++ b/kernel/loongarch64/znrm2_lsx.S
@@ -0,0 +1,260 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4  // element count
+#define X      $r5  // pointer walked by the first (max-finding) pass
+#define INCX   $r6  // stride; shifted left by ZBASE_SHIFT before use
+#define XX     $r19  // copy of the initial X, walked by the second (sum) pass
+#define I      $r17  // loop counter
+#define TEMP   $r18  // integer scratch
+#define t1     $r12  // t1..t4: integer scratch for gathering strided elements
+#define t2     $r13
+#define t3     $r14
+#define t4     $r15
+#define INF    $f23  // holds the bit pattern 0x7FF << 52 (double +infinity)
+#define a1     $f22  // holds 0.0 for the max == 0 test
+#define max    $f17  // scalar view of register 17, which is also VM0: the code reads lane 0 of VM0 through this name
+#define ALPHA  $f12  // scale factor 1 / max
+#define a3     $f15  // scalar load of the component at offset 0 (same register number as VX0)
+#define a2     $f16  // scalar load of the component at offset 1 (same register number as VX1)
+#define VX0    $vr15  // vector input
+#define VX1    $vr16  // vector input
+#define VM0    $vr17  // running maximum by magnitude
+#define VM1    $vr18  // VM1..VM3: vector temporaries
+#define VM2    $vr13
+#define VM3    $vr14
+#define res1   $vr19  // sum-of-squares accumulator; its lane 0 is addressed as $f19 in the scalar tail
+#define res2   $vr20  // second sum-of-squares accumulator
+#define VALPHA $vr21  // ALPHA broadcast to both lanes
+
+
+    PROLOGUE  // znrm2, LSX variant: pass 1 finds the largest component magnitude, pass 2 sums squares of components scaled by its reciprocal
+
+#ifdef F_INTERFACE
+    LDINT   N,     0(N)  // with F_INTERFACE the arguments are pointers; load the values
+    LDINT   INCX,  0(INCX)
+#endif
+
+    vxor.v   res1, res1, res1  // res1 = 0
+    vxor.v   res2, res2, res2  // res2 = 0
+    vxor.v VM0, VM0, VM0  // VM0 = 0, so max starts at 0
+    bge $r0, N, .L999  // N <= 0: exit; .L999 then computes 0 * sqrt(0)
+    beq $r0, INCX, .L999  // INCX == 0: same exit; a negative INCX continues
+    move  XX, X  // save the start of X for pass 2
+    // Init INF
+    addi.d  TEMP, $r0,  0x7FF  // exponent field all ones
+    slli.d  TEMP, TEMP, 52  // TEMP = 0x7FF0000000000000
+    MTC  INF, TEMP  // INF = +infinity
+    li.d  TEMP, 1
+    slli.d TEMP, TEMP, ZBASE_SHIFT  // TEMP = scaled stride that corresponds to INCX == 1
+    slli.d INCX, INCX, ZBASE_SHIFT  // scale INCX the same way
+    srai.d I, N, 2  // I = N / 4: each vector iteration covers 4 elements
+    bne INCX, TEMP, .L20  // non-unit stride: use the gather loop
+    bge $r0,  I, .L97  // fewer than 4 elements: scalar tail only
+    .align 3
+
+.L10:  // pass 1, unit stride: four 2-lane loads (4 elements of 2 components) per iteration
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vfmaxa.d VM1, VX1, VX0  // lane-wise pick of the operand with larger magnitude
+    vld VX0, X, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vfmaxa.d VM2, VX1, VX0
+    vfmaxa.d VM3, VM1, VM2
+    vfmaxa.d VM0, VM0, VM3  // fold into the running maximum
+    addi.d I, I, -1
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L10
+    b .L96
+    .align 3
+
+.L20: // INCX!=1
+    bge $r0, I, .L97  // fewer than 4 elements: scalar tail only
+    .align 3
+
+.L21:  // pass 1, strided: gather 4 elements through integer registers
+    ld.d t1, X, 0 * SIZE  // element 0, both components
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE  // element 1
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d VX0, t1, 0  // VX0 = element 0
+    vinsgr2vr.d VX0, t2, 1
+    vinsgr2vr.d VX1, t3, 0  // VX1 = element 1
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vfmaxa.d VM1, VX0, VX1
+    ld.d t1, X, 0 * SIZE  // element 2
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE  // element 3
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vfmaxa.d VM2, VX0, VX1
+    vfmaxa.d VM3, VM1, VM2
+    vfmaxa.d VM0, VM0, VM3  // fold into the running maximum
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    b .L96
+    .align 3
+
+.L96:  // reduce the two lanes of VM0
+    vreplvei.d VX0, VM0, 0  // broadcast lane 0
+    vreplvei.d VX1, VM0, 1  // broadcast lane 1
+    vfmaxa.d VM0, VX0, VX1  // lane 0 of VM0 (read as max) now covers both lanes
+    .align 3
+
+.L97:
+    andi I, N, 3  // I = N % 4 leftover elements
+    bge $r0, I, .L99
+    .align 3
+
+.L98:  // pass 1 scalar tail: one element per iteration
+    fld.d a3, X, 0 * SIZE
+    fld.d a2, X, 1 * SIZE
+    fmaxa.d a3, a2, a3  // larger-magnitude component of this element
+    fmaxa.d max, a3, max
+    addi.d I, I, -1
+    add.d X, X, INCX
+    blt $r0, I, .L98
+    .align 3
+
+.L99:  // derive the scale factor ALPHA = 1 / max
+    fabs.d  max, max  // fmaxa returns the winning operand with its sign; take the magnitude
+    lu12i.w  TEMP, 0x3f800 // TEMP = 0x3f800000, the single-precision bit pattern of 1.0
+    movgr2fr.d  a1, $r0  // a1 = 0.0
+    movgr2fr.w  ALPHA, TEMP
+    CMPEQ  $fcc0, max, a1  // test max == 0 (CMPEQ comes from common.h)
+    fcvt.d.s  ALPHA, ALPHA  // widen 1.0f to double
+    bcnez  $fcc0, .L999  // max == 0: skip pass 2, result is 0
+    fdiv.d  ALPHA, ALPHA, max  // ALPHA = 1 / max
+    CMPEQ  $fcc0, INF, ALPHA  // test whether 1 / max overflowed to +infinity
+    bcnez  $fcc0, .L999  // if so skip pass 2; the sum is still 0, so the result is 0
+    movfr2gr.d TEMP, ALPHA
+    vreplgr2vr.d VALPHA, TEMP  // broadcast ALPHA to both lanes
+
+.L100:  // pass 2 setup: same stride dispatch as pass 1, walking XX
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, ZBASE_SHIFT
+    srai.d  I, N, 2  // I = N / 4
+    bne INCX, TEMP, .L120
+    bge $r0, I, .L997
+    .align 3
+
+.L110:  // pass 2, unit stride: accumulate (x * ALPHA)^2 for 8 values per iteration
+    vld VX0, XX, 0 * SIZE
+    vld VX1, XX, 2 * SIZE
+    vfmul.d VM2, VX0, VALPHA  // scale before squaring
+    vfmul.d VM3, VX1, VALPHA
+    vfmadd.d res1, VM2, VM2, res1  // res1 += VM2 * VM2
+    vfmadd.d res2, VM3, VM3, res2  // res2 += VM3 * VM3
+    vld VX0, XX, 4 * SIZE
+    vld VX1, XX, 6 * SIZE
+    vfmul.d VM2, VX0, VALPHA
+    vfmul.d VM3, VX1, VALPHA
+    vfmadd.d res1, VM2, VM2, res1
+    vfmadd.d res2, VM3, VM3, res2
+    addi.d XX, XX, 8 * SIZE
+    addi.d I, I, -1
+    blt $r0, I, .L110
+    b .L996
+    .align 3
+
+.L120:
+    bge $r0, I, .L997  // fewer than 4 elements: scalar tail only
+    .align 3
+
+.L121:  // pass 2, strided: gather 4 elements; loads for the second pair are interleaved with arithmetic on the first
+    ld.d t1, XX, 0 * SIZE  // element 0
+    ld.d t2, XX, 1 * SIZE
+    add.d XX, XX, INCX
+    ld.d t3, XX, 0 * SIZE  // element 1
+    ld.d t4, XX, 1 * SIZE
+    add.d XX, XX, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vfmul.d VM2, VX0, VALPHA
+    ld.d t1, XX, 0 * SIZE  // element 2
+    vfmul.d VM3, VX1, VALPHA
+    ld.d t2, XX, 1 * SIZE
+    add.d XX, XX, INCX
+    vfmadd.d res1, VM2, VM2, res1
+    vfmadd.d res2, VM3, VM3, res2
+    ld.d t3, XX, 0 * SIZE  // element 3
+    ld.d t4, XX, 1 * SIZE
+    add.d XX, XX, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vfmul.d VM2, VX0, VALPHA
+    vfmul.d VM3, VX1, VALPHA
+    vfmadd.d res1, VM2, VM2, res1
+    vfmadd.d res2, VM3, VM3, res2
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    b .L996
+    .align 3
+
+.L996:  // reduce both accumulators into lane 0 of res1
+    vfadd.d res1, res1, res2
+    vreplvei.d VX1, res1, 1  // broadcast lane 1
+    vfadd.d res1, VX1, res1  // lane 0 of res1 = lane 0 + lane 1
+    .align 3
+
+.L997:
+    andi I, N, 3  // I = N % 4 leftover elements
+    bge $r0, I, .L999
+    .align 3
+
+.L998:  // pass 2 scalar tail; $f19 is register 19, the same register number as res1
+    fld.d a3, XX, 0 * SIZE
+    fld.d a2, XX, 1 * SIZE
+    addi.d I, I, -1
+    fmul.d a3, a3, ALPHA
+    fmadd.d $f19, a3, a3, $f19  // sum += (a3 * ALPHA)^2
+    fmul.d a2, a2, ALPHA
+    fmadd.d $f19, a2, a2, $f19  // sum += (a2 * ALPHA)^2
+    add.d XX, XX , INCX
+    blt $r0, I, .L998
+    .align 3
+
+.L999:  // result = max * sqrt(sum of scaled squares)
+    fsqrt.d $f19, $f19
+    fmul.d $f0, max, $f19  // result is left in $f0
+    jirl $r0, $r1, 0x0  // return through $r1
+
+    EPILOGUE
diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600
index 9a6e06d67..c37b88adb 100644
--- a/kernel/mips/KERNEL.P5600
+++ b/kernel/mips/KERNEL.P5600
@@ -35,7 +35,7 @@ DSUMKERNEL  = ../mips/sum.c
 CSUMKERNEL  = ../mips/zsum.c
 ZSUMKERNEL  = ../mips/zsum.c
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SASUMKERNEL  = ../mips/sasum_msa.c
 DASUMKERNEL  = ../mips/dasum_msa.c
 CASUMKERNEL  = ../mips/casum_msa.c
@@ -47,7 +47,7 @@ CASUMKERNEL  = ../mips/zasum.c
 ZASUMKERNEL  = ../mips/zasum.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SAXPYKERNEL  = ../mips/saxpy_msa.c
 DAXPYKERNEL  = ../mips/daxpy_msa.c
 CAXPYKERNEL  = ../mips/caxpy_msa.c
@@ -59,7 +59,7 @@ CAXPYKERNEL  = ../mips/zaxpy.c
 ZAXPYKERNEL  = ../mips/zaxpy.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SCOPYKERNEL  = ../mips/scopy_msa.c
 DCOPYKERNEL  = ../mips/dcopy_msa.c
 CCOPYKERNEL  = ../mips/ccopy_msa.c
@@ -71,7 +71,7 @@ CCOPYKERNEL  = ../mips/zcopy.c
 ZCOPYKERNEL  = ../mips/zcopy.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SDOTKERNEL   = ../mips/sdot_msa.c
 DDOTKERNEL   = ../mips/ddot_msa.c
 CDOTKERNEL   = ../mips/cdot_msa.c
@@ -88,7 +88,7 @@ DNRM2KERNEL  = ../mips/nrm2.c
 CNRM2KERNEL  = ../mips/znrm2.c
 ZNRM2KERNEL  = ../mips/znrm2.c
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SROTKERNEL   = ../mips/srot_msa.c
 DROTKERNEL   = ../mips/drot_msa.c
 CROTKERNEL   = ../mips/crot_msa.c
@@ -100,11 +100,13 @@ CROTKERNEL   = ../mips/zrot.c
 ZROTKERNEL   = ../mips/zrot.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SSCALKERNEL  = ../mips/sscal_msa.c
 DSCALKERNEL  = ../mips/dscal_msa.c
-CSCALKERNEL  = ../mips/cscal_msa.c
-ZSCALKERNEL  = ../mips/zscal_msa.c
+#CSCALKERNEL  = ../mips/cscal_msa.c
+#ZSCALKERNEL  = ../mips/zscal_msa.c
+CSCALKERNEL = ../mips/zscal.c
+ZSCALKERNEL = ../mips/zscal.c
 else
 SSCALKERNEL  = ../mips/scal.c
 DSCALKERNEL  = ../mips/scal.c
@@ -112,7 +114,7 @@ CSCALKERNEL  = ../mips/zscal.c
 ZSCALKERNEL  = ../mips/zscal.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SSWAPKERNEL  = ../mips/sswap_msa.c
 DSWAPKERNEL  = ../mips/dswap_msa.c
 CSWAPKERNEL  = ../mips/cswap_msa.c
@@ -124,7 +126,7 @@ CSWAPKERNEL  = ../mips/zswap.c
 ZSWAPKERNEL  = ../mips/zswap.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SGEMVNKERNEL = ../mips/sgemv_n_msa.c
 DGEMVNKERNEL = ../mips/dgemv_n_msa.c
 CGEMVNKERNEL = ../mips/cgemv_n_msa.c
@@ -136,7 +138,7 @@ CGEMVNKERNEL = ../mips/zgemv_n.c
 ZGEMVNKERNEL = ../mips/zgemv_n.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SGEMVTKERNEL = ../mips/sgemv_t_msa.c
 DGEMVTKERNEL = ../mips/dgemv_t_msa.c
 CGEMVTKERNEL = ../mips/cgemv_t_msa.c
@@ -148,7 +150,7 @@ CGEMVTKERNEL = ../mips/zgemv_t.c
 ZGEMVTKERNEL = ../mips/zgemv_t.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SGEMMKERNEL    = ../mips/sgemm_kernel_8x8_msa.c
 SGEMMONCOPY    = ../mips/sgemm_ncopy_8_msa.c
 SGEMMOTCOPY    = ../mips/sgemm_tcopy_8_msa.c
@@ -162,7 +164,7 @@ SGEMMONCOPYOBJ = sgemm_oncopy.o
 SGEMMOTCOPYOBJ = sgemm_otcopy.o
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 DGEMMKERNEL    = ../mips/dgemm_kernel_8x4_msa.c
 DGEMMINCOPY    = ../mips/dgemm_ncopy_8_msa.c
 DGEMMITCOPY    = ../mips/dgemm_tcopy_8_msa.c
@@ -180,7 +182,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy.o
 DGEMMOTCOPYOBJ = dgemm_otcopy.o
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 CGEMMKERNEL    = ../mips/cgemm_kernel_8x4_msa.c
 CGEMMINCOPY    = ../mips/cgemm_ncopy_8_msa.c
 CGEMMITCOPY    = ../mips/cgemm_tcopy_8_msa.c
@@ -198,7 +200,7 @@ CGEMMONCOPYOBJ = cgemm_oncopy.o
 CGEMMOTCOPYOBJ = cgemm_otcopy.o
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 ZGEMMKERNEL    = ../mips/zgemm_kernel_4x4_msa.c
 ZGEMMONCOPY    = ../mips/zgemm_ncopy_4_msa.c
 ZGEMMOTCOPY    = ../mips/zgemm_tcopy_4_msa.c
@@ -212,7 +214,7 @@ ZGEMMONCOPYOBJ = zgemm_oncopy.o
 ZGEMMOTCOPYOBJ = zgemm_otcopy.o
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c
 STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c
 STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c
@@ -224,7 +226,7 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
 STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c
 DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c
 DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c
@@ -236,7 +238,7 @@ DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
 DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
 CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
 CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
@@ -248,7 +250,7 @@ CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
 CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
 ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
 ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
diff --git a/kernel/mips/nrm2.c b/kernel/mips/nrm2.c
index fcff09337..8cc189fe3 100644
--- a/kernel/mips/nrm2.c
+++ b/kernel/mips/nrm2.c
@@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	FLOAT absxi = 0.0;
 
 
-	if (n <= 0 || inc_x <= 0) return(0.0);
+	if (n <= 0 || inc_x == 0) return(0.0);
 	if ( n == 1 ) return( ABS(x[0]) );
 
 	n *= inc_x;
diff --git a/kernel/mips/znrm2.c b/kernel/mips/znrm2.c
index 85be39cd1..d11a6bd4a 100644
--- a/kernel/mips/znrm2.c
+++ b/kernel/mips/znrm2.c
@@ -48,7 +48,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	BLASLONG inc_x2;
 	FLOAT temp;
 
-	if (n <= 0 || inc_x <= 0) return(0.0);
+	if (n <= 0 || inc_x == 0) return(0.0);
 
 	inc_x2 = 2 * inc_x;
 
diff --git a/kernel/mips/zscal.c b/kernel/mips/zscal.c
index bca1155c1..7bb261941 100644
--- a/kernel/mips/zscal.c
+++ b/kernel/mips/zscal.c
@@ -47,6 +47,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
 			else
 			{
 				temp = - da_i * x[ip+1] ;
+				if (isnan(x[ip]) || isinf(x[ip])) temp = NAN;
 				x[ip+1] = da_i * x[ip]  ;
 			}
 		}
@@ -63,8 +64,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
 				x[ip+1] = da_r * x[ip+1] + da_i * x[ip]   ;
 			}
 		}
-		x[ip]   = temp;
-
+		if ( da_r != da_r ) 
+			x[ip] = da_r;
+		else
+			x[ip]   = temp;
+		
 		ip += inc_x2;
 	}
 
diff --git a/kernel/mips64/KERNEL.LOONGSON3R4 b/kernel/mips64/KERNEL.LOONGSON3R4
index b81e5441d..1149d97f1 100644
--- a/kernel/mips64/KERNEL.LOONGSON3R4
+++ b/kernel/mips64/KERNEL.LOONGSON3R4
@@ -1,4 +1,4 @@
-ifdef HAVE_MSA
+ifndef NO_MSA
 SAXPYKERNEL = ../mips/saxpy_msa.c
 DAXPYKERNEL = ../mips/daxpy_msa.c
 CAXPYKERNEL = ../mips/caxpy_msa.c
@@ -8,14 +8,14 @@ SAXPYKERNEL = axpy_loongson3a.S
 DAXPYKERNEL = daxpy_loongson3a_simd.S
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SCOPYKERNEL  = ../mips/scopy_msa.c
 DCOPYKERNEL  = ../mips/dcopy_msa.c
 CCOPYKERNEL  = ../mips/ccopy_msa.c
 ZCOPYKERNEL  = ../mips/zcopy_msa.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SDOTKERNEL   = ../mips/sdot_msa.c
 DDOTKERNEL   = ../mips/ddot_msa.c
 CDOTKERNEL   = ../mips/cdot_msa.c
@@ -23,21 +23,21 @@ ZDOTKERNEL   = ../mips/zdot_msa.c
 endif
 DSDOTKERNEL  = ../mips/dot.c
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SROTKERNEL   = ../mips/srot_msa.c
 DROTKERNEL   = ../mips/drot_msa.c
 CROTKERNEL   = ../mips/crot_msa.c
 ZROTKERNEL   = ../mips/zrot_msa.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SSCALKERNEL  = ../mips/sscal_msa.c
 DSCALKERNEL  = ../mips/dscal_msa.c
 CSCALKERNEL  = ../mips/cscal_msa.c
 ZSCALKERNEL  = ../mips/zscal_msa.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SGEMVNKERNEL = ../mips/sgemv_n_msa.c
 DGEMVNKERNEL = ../mips/dgemv_n_msa.c
 SGEMVTKERNEL = ../mips/sgemv_t_msa.c
@@ -57,21 +57,21 @@ ZGEMVNKERNEL = zgemv_n_loongson3a.c
 ZGEMVTKERNEL = zgemv_t_loongson3a.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SASUMKERNEL  = ../mips/sasum_msa.c
 DASUMKERNEL  = ../mips/dasum_msa.c
 CASUMKERNEL  = ../mips/casum_msa.c
 ZASUMKERNEL  = ../mips/zasum_msa.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SSWAPKERNEL  = ../mips/sswap_msa.c
 DSWAPKERNEL  = ../mips/dswap_msa.c
 CSWAPKERNEL  = ../mips/cswap_msa.c
 ZSWAPKERNEL  = ../mips/zswap_msa.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 SGEMMKERNEL    = ../mips/sgemm_kernel_8x8_msa.c
 SGEMMONCOPY    = ../mips/sgemm_ncopy_8_msa.c
 SGEMMOTCOPY    = ../mips/sgemm_tcopy_8_msa.c
@@ -89,7 +89,7 @@ SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 DGEMMKERNEL    = ../mips/dgemm_kernel_8x4_msa.c
 DGEMMINCOPY    = ../mips/dgemm_ncopy_8_msa.c
 DGEMMITCOPY    = ../mips/dgemm_tcopy_8_msa.c
@@ -107,7 +107,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
 DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 CGEMMKERNEL    = ../mips/cgemm_kernel_8x4_msa.c
 CGEMMINCOPY    = ../mips/cgemm_ncopy_8_msa.c
 CGEMMITCOPY    = ../mips/cgemm_tcopy_8_msa.c
@@ -129,7 +129,7 @@ CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
 CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 ZGEMMKERNEL    = ../mips/zgemm_kernel_4x4_msa.c
 ZGEMMONCOPY    = ../mips/zgemm_ncopy_4_msa.c
 ZGEMMOTCOPY    = ../mips/zgemm_tcopy_4_msa.c
@@ -143,7 +143,7 @@ ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
 ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c
 STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c
 STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c
@@ -155,7 +155,7 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
 STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c
 DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c
 DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c
@@ -167,7 +167,7 @@ DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
 DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
 CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
 CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
@@ -179,7 +179,7 @@ CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
 CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
 endif
 
-ifdef HAVE_MSA
+ifndef NO_MSA
 ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
 ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
 ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
diff --git a/kernel/mips64/cnrm2.S b/kernel/mips64/cnrm2.S
index 76fa9c295..159f9bea9 100644
--- a/kernel/mips64/cnrm2.S
+++ b/kernel/mips64/cnrm2.S
@@ -77,7 +77,7 @@
 	blez	N, .L999
 	mov.d	s2, s1
 
-	blez	INCX, .L999
+	beqz	INCX, .L999
 	dsll	INCX, INCX, ZBASE_SHIFT
 
 	dsra	I, N, 2
diff --git a/kernel/mips64/dnrm2.S b/kernel/mips64/dnrm2.S
index cd40414a2..1b55d9fc3 100644
--- a/kernel/mips64/dnrm2.S
+++ b/kernel/mips64/dnrm2.S
@@ -81,7 +81,7 @@
 	blez	N, .L999
 	MTC	$0,  s1
 
-	blez	INCX, .L999
+	beqz	INCX, .L999
 	dsll	INCX, INCX, BASE_SHIFT
 
 	move	XX, X
diff --git a/kernel/mips64/snrm2.S b/kernel/mips64/snrm2.S
index 1ba061a7d..f18151b5c 100644
--- a/kernel/mips64/snrm2.S
+++ b/kernel/mips64/snrm2.S
@@ -77,7 +77,7 @@
 	blez	N, .L999
 	mov.d	s2, s1
 
-	blez	INCX, .L999
+	beqz	INCX, .L999
 	dsll	INCX, INCX, BASE_SHIFT
 
 	bne	INCX, TEMP, .L20
diff --git a/kernel/mips64/znrm2.S b/kernel/mips64/znrm2.S
index 1c247bca9..d33284a47 100644
--- a/kernel/mips64/znrm2.S
+++ b/kernel/mips64/znrm2.S
@@ -80,7 +80,7 @@
 	blez	N, .L999
 	MTC	$0,  s1
 
-	blez	INCX, .L999
+	beqz	INCX, .L999
 	dsll	INCX, INCX, ZBASE_SHIFT
 
 	move	XX, X
diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10
index 79d889fe0..a1cdc8639 100644
--- a/kernel/power/KERNEL.POWER10
+++ b/kernel/power/KERNEL.POWER10
@@ -1,6 +1,3 @@
-ifeq ($(HAVE_GAS), 1)
-include $(KERNELDIR)/KERNEL.POWER8
-else
 #SGEMM_BETA = ../generic/gemm_beta.c
 #DGEMM_BETA = ../generic/gemm_beta.c
 #CGEMM_BETA = ../generic/zgemm_beta.c
@@ -19,11 +16,11 @@ SBGEMMOTCOPYOBJ =  sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
 STRMMKERNEL	= sgemm_kernel_power10.c
 DTRMMKERNEL	= dgemm_kernel_power10.c
-CTRMMKERNEL	= cgemm_kernel_power10.S
-ZTRMMKERNEL	= zgemm_kernel_power10.S
+CTRMMKERNEL	= cgemm_kernel_power10.c
+ZTRMMKERNEL	= zgemm_kernel_power10.c
 
 SGEMMKERNEL    =  sgemm_kernel_power10.c
-SGEMMINCOPY    = ../generic/gemm_ncopy_16.c
+SGEMMINCOPY    = sgemm_ncopy_16_power.c
 SGEMMITCOPY    = sgemm_tcopy_16_power8.S
 SGEMMONCOPY    =  ../generic/gemm_ncopy_8.c
 SGEMMOTCOPY    = sgemm_tcopy_8_power8.S
@@ -62,10 +59,14 @@ DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_tt_power10.c
 DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_power10.c
 DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_power10.c
 
-CGEMMKERNEL    = cgemm_kernel_power10.S
+CGEMMKERNEL    = cgemm_kernel_power10.c
 #CGEMMKERNEL     = cgemm_kernel_8x4_power8.S
 CGEMMINCOPY    = ../generic/zgemm_ncopy_8.c
+ifeq ($(OSNAME), AIX)
+CGEMMITCOPY    = cgemm_tcopy_8_power8.S
+else
 CGEMMITCOPY    = ../generic/zgemm_tcopy_8.c
+endif
 CGEMMONCOPY    = ../generic/zgemm_ncopy_4.c
 CGEMMOTCOPY    = ../generic/zgemm_tcopy_4.c
 CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
@@ -73,7 +74,7 @@ CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
 CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX)
 CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX)
 
-ZGEMMKERNEL    = zgemm_kernel_power10.S
+ZGEMMKERNEL    = zgemm_kernel_power10.c
 ZGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
 ZGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
 ZGEMMINCOPY    = ../generic/zgemm_ncopy_8.c
@@ -124,6 +125,7 @@ ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
 #SMINKERNEL   = ../arm/min.c
 #DMINKERNEL   = ../arm/min.c
 #
+ifeq ($(C_COMPILER), GCC)
 ifneq ($(GCCVERSIONGTEQ9),1)
 ISAMAXKERNEL = isamax_power9.S
 else
@@ -148,6 +150,15 @@ ICAMINKERNEL = icamin_power9.S
 else
 ICAMINKERNEL = icamin.c
 endif
+else
+ISAMAXKERNEL = isamax.c
+IDAMAXKERNEL = idamax.c
+ICAMAXKERNEL = icamax.c
+IZAMAXKERNEL = izamax.c
+ISAMINKERNEL = isamin.c
+IDAMINKERNEL = idamin.c
+ICAMINKERNEL = icamin.c
+endif
 IZAMINKERNEL = izamin.c
 #
 #ISMAXKERNEL  = ../arm/imax.c
@@ -238,4 +249,3 @@ QCABS_KERNEL	= ../generic/cabs.c
 #Dump kernel
 CGEMM3MKERNEL    = ../generic/zgemm3mkernel_dump.c
 ZGEMM3MKERNEL    = ../generic/zgemm3mkernel_dump.c
-endif
diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8
index 2b8e65948..700a68e44 100644
--- a/kernel/power/KERNEL.POWER8
+++ b/kernel/power/KERNEL.POWER8
@@ -50,7 +50,7 @@ CTRMMKERNEL	= ctrmm_kernel_8x4_power8.S
 ZTRMMKERNEL	= ztrmm_kernel_8x2_power8.S
 
 SGEMMKERNEL    =  sgemm_kernel_16x8_power8.S
-SGEMMINCOPY    = ../generic/gemm_ncopy_16.c
+SGEMMINCOPY    = sgemm_ncopy_16_power.c
 SGEMMITCOPY    = sgemm_tcopy_16_power8.S
 SGEMMONCOPY    =  ../generic/gemm_ncopy_8.c
 SGEMMOTCOPY    = sgemm_tcopy_8_power8.S
diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9
index b6b102b3e..7d007d1a2 100644
--- a/kernel/power/KERNEL.POWER9
+++ b/kernel/power/KERNEL.POWER9
@@ -13,7 +13,7 @@ CTRMMKERNEL	= cgemm_kernel_power9.S
 ZTRMMKERNEL	= zgemm_kernel_power9.S
 
 SGEMMKERNEL    =  sgemm_kernel_power9.S
-SGEMMINCOPY    = ../generic/gemm_ncopy_16.c
+SGEMMINCOPY    = sgemm_ncopy_16_power.c
 SGEMMITCOPY    = sgemm_tcopy_16_power8.S
 SGEMMONCOPY    =  ../generic/gemm_ncopy_8.c
 SGEMMOTCOPY    = sgemm_tcopy_8_power8.S
diff --git a/kernel/power/KERNEL.PPC970 b/kernel/power/KERNEL.PPC970
index a99fb7d96..fee5fa529 100644
--- a/kernel/power/KERNEL.PPC970
+++ b/kernel/power/KERNEL.PPC970
@@ -1,11 +1,11 @@
 ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__)
 SGEMMKERNEL    =  gemm_kernel.S
-SGEMMINCOPY    =  
-SGEMMITCOPY    =  
+SGEMMINCOPY    =
+SGEMMITCOPY    =
 SGEMMONCOPY    =  ../generic/gemm_ncopy_4.c
 SGEMMOTCOPY    =  ../generic/gemm_tcopy_4.c
-SGEMMINCOPYOBJ =  
-SGEMMITCOPYOBJ =  
+SGEMMINCOPYOBJ =
+SGEMMITCOPYOBJ =
 SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
 else
diff --git a/kernel/power/KERNEL.PPCG4 b/kernel/power/KERNEL.PPCG4
index 1bdd3119e..c73601cee 100644
--- a/kernel/power/KERNEL.PPCG4
+++ b/kernel/power/KERNEL.PPCG4
@@ -96,9 +96,9 @@ CGEMMINCOPY    =
 CGEMMONCOPY    =
 CGEMMONCOPY    =  ../generic/zgemm_ncopy_2.c
 CGEMMOTCOPY    =  ../generic/zgemm_tcopy_2.c
-CGEMMINCOPYOBJ =  
+CGEMMINCOPYOBJ =
 #cgemm_incopy$(TSUFFIX).$(SUFFIX)
-CGEMMITCOPYOBJ =  
+CGEMMITCOPYOBJ =
 #cgemm_itcopy$(TSUFFIX).$(SUFFIX)
 CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
 CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
diff --git a/kernel/power/cgemm_kernel_power10.c b/kernel/power/cgemm_kernel_power10.c
new file mode 100644
index 000000000..233768cef
--- /dev/null
+++ b/kernel/power/cgemm_kernel_power10.c
@@ -0,0 +1,1115 @@
+/*********************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+#include "common.h"
+#include <altivec.h>
+
+typedef __vector unsigned char  vec_t;  /* vector type used below to load A/B operands for __builtin_mma_xvf32gerpp */
+typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));  /* 16-byte vector of FLOAT; element type of the result[] buffer accumulators are disassembled into */
+typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));  /* 8-byte vector of FLOAT; not referenced in the part of the file reviewed here */
+
+#define SET_ACC_ZERO()  /* zero all eight accumulators acc0..acc7, which must be in scope where this expands */ \
+      __builtin_mma_xxsetaccz (&acc0); \
+      __builtin_mma_xxsetaccz (&acc1); \
+      __builtin_mma_xxsetaccz (&acc2); \
+      __builtin_mma_xxsetaccz (&acc3); \
+      __builtin_mma_xxsetaccz (&acc4); \
+      __builtin_mma_xxsetaccz (&acc5); \
+      __builtin_mma_xxsetaccz (&acc6); \
+      __builtin_mma_xxsetaccz (&acc7);
+
+#if (defined(NN) || defined(NT) || defined(TN) || defined(TT))  /* COMP_MUL assigns, COMP_MAC accumulates (_real, _imag) from four partial products; here: real = arbr - aibi, imag = arbi + aibr */
+#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real  = _arbr - _aibi; _imag  =  _arbi + _aibr; }
+#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag +=  _arbi + _aibr; }
+#endif  /* parameter names suggest _arbr = Re(a)*Re(b), _aibi = Im(a)*Im(b), etc. -- TODO confirm against the packing code */
+
+#if (defined(NR) || defined(NC) || defined(TR) || defined(TC))  /* real = arbr + aibi, imag = -arbi + aibr */
+#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real  = _arbr + _aibi; _imag  = -_arbi + _aibr; }
+#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += -_arbi + _aibr; }
+#endif
+
+#if (defined(RN) || defined(RT) || defined(CN) || defined(CT))  /* real = arbr + aibi, imag = arbi - aibr */
+#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real  = _arbr + _aibi; _imag  =  _arbi - _aibr; }
+#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag +=  _arbi - _aibr; }
+#endif
+
+#if (defined(RR) || defined(RC) || defined(CR) || defined(CC)) /* real = arbr - aibi, imag = -arbi - aibr */
+#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real  = _arbr - _aibi; _imag  = -_arbi - _aibr; }
+#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += -_arbi - _aibr; }
+#endif  /* NOTE(review): if none of the NN..CC macros is defined, COMP_MUL/COMP_MAC stay undefined */
+
+#if defined (TRMMKERNEL)  /* A_OP is the operator the SAVE_ACC_COMPLEX_* macros use to write CO[] */
+#define A_OP =  /* TRMM build: the computed value overwrites CO[] */
+#else
+#define A_OP +=  /* otherwise: the computed value is added to what CO[] already holds */
+#endif
+
+#define BUILTIN_MMA_DISASSEMBLE_ACC_8  /* copy acc0..acc7, in that order, into result[]; each accumulator takes four consecutive result[] entries */ \
+      __builtin_mma_disassemble_acc ((void *)result,      &acc0);    \
+      __builtin_mma_disassemble_acc ((void *)&result[ 4], &acc1);    \
+      __builtin_mma_disassemble_acc ((void *)&result[ 8], &acc2);    \
+      __builtin_mma_disassemble_acc ((void *)&result[12], &acc3);    \
+      __builtin_mma_disassemble_acc ((void *)&result[16], &acc4);    \
+      __builtin_mma_disassemble_acc ((void *)&result[20], &acc5);    \
+      __builtin_mma_disassemble_acc ((void *)&result[24], &acc6);    \
+      __builtin_mma_disassemble_acc ((void *)&result[28], &acc7);
+
+#define COMP_MUL_1  /* set tr[0]/ti[0] from the four partial products res[0], res[1], res[4], res[5] */ \
+      COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4])   
+
+#define COMP_MAC_1(_offset) {  /* add into tr[0]/ti[0] the same four-element pattern read _offset FLOATs further into res[] */ \
+      FLOAT *_ro = &res[_offset];                                    \
+      COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4])     \
+}
+
+#define COMP_MUL_2A  /* two results: tr/ti[0] from res[0,1,4,5] and tr/ti[1] from res[2,3,6,7] */ \
+      COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4])     \
+      COMP_MUL(tr[1], res[ 2], res[ 7], ti[1], res[ 3], res[ 6])
+
+#define COMP_MAC_2A(_offset) {  /* accumulate the same two patterns, read _offset FLOATs further into res[] */ \
+      FLOAT *_ro = &res[_offset];                                    \
+      COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4])     \
+      COMP_MAC(tr[1], _ro[ 2], _ro[ 7], ti[1], _ro[ 3], _ro[ 6])     \
+}
+
+#define COMP_MUL_2B  /* two results: tr/ti[0] from res[0,1,4,5] and tr/ti[1] from res[8,9,12,13] */ \
+      COMP_MUL(tr[0], res[ 0], res[ 5], ti[0], res[ 1], res[ 4])     \
+      COMP_MUL(tr[1], res[ 8], res[13], ti[1], res[ 9], res[12])
+
+#define COMP_MAC_2B(_offset) {  /* accumulate the same two patterns, read _offset FLOATs further into res[] */ \
+      FLOAT *_ro = &res[_offset];                                    \
+      COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4])     \
+      COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12])     \
+}
+
+#define COMP_MUL_4A(_offset) {  /* set tr/ti[0..3] from four patterns starting at res[_offset + 0, 8, 16, 24] */ \
+      FLOAT *_ro = &res[_offset];                                    \
+      COMP_MUL(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4])     \
+      COMP_MUL(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12])     \
+      COMP_MUL(tr[2], _ro[16], _ro[21], ti[2], _ro[17], _ro[20])     \
+      COMP_MUL(tr[3], _ro[24], _ro[29], ti[3], _ro[25], _ro[28])     \
+}
+
+#define COMP_MAC_4A(_offset) {  /* accumulate into tr/ti[0..3] the same four patterns at res[_offset + 0, 8, 16, 24] */ \
+      FLOAT *_ro = &res[_offset];                                    \
+      COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4])     \
+      COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12])     \
+      COMP_MAC(tr[2], _ro[16], _ro[21], ti[2], _ro[17], _ro[20])     \
+      COMP_MAC(tr[3], _ro[24], _ro[29], ti[3], _ro[25], _ro[28])     \
+}
+
+#define COMP_MUL_4B(_offset) {  /* set tr/ti[0..3] from four patterns starting at res[_offset + 0, 8, 2, 10]: all within one 16-FLOAT span */ \
+      FLOAT *_ro = &res[_offset];                                    \
+      COMP_MUL(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4])     \
+      COMP_MUL(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12])     \
+      COMP_MUL(tr[2], _ro[ 2], _ro[ 7], ti[2], _ro[ 3], _ro[ 6])     \
+      COMP_MUL(tr[3], _ro[10], _ro[15], ti[3], _ro[11], _ro[14])     \
+}
+
+#define COMP_MAC_4B(_offset) {  /* accumulate into tr/ti[0..3] the same four patterns at res[_offset + 0, 8, 2, 10] */ \
+      FLOAT *_ro = &res[_offset];                                    \
+      COMP_MAC(tr[0], _ro[ 0], _ro[ 5], ti[0], _ro[ 1], _ro[ 4])     \
+      COMP_MAC(tr[1], _ro[ 8], _ro[13], ti[1], _ro[ 9], _ro[12])     \
+      COMP_MAC(tr[2], _ro[ 2], _ro[ 7], ti[2], _ro[ 3], _ro[ 6])     \
+      COMP_MAC(tr[3], _ro[10], _ro[15], ti[3], _ro[11], _ro[14])     \
+}
+
+
+#define SAVE_ACC_COMPLEX_11  /* sum one complex value from each of the 8 accumulators, multiply by (alpha_r, alpha_i), write CO[0..1] via A_OP */ \
+      BUILTIN_MMA_DISASSEMBLE_ACC_8                                  \
+      COMP_MUL_1                                                     \
+      COMP_MAC_1(16)                                                 \
+      COMP_MAC_1(32)                                                 \
+      COMP_MAC_1(48)                                                 \
+      COMP_MAC_1(64)                                                 \
+      COMP_MAC_1(80)                                                 \
+      COMP_MAC_1(96)                                                 \
+      COMP_MAC_1(112)                                                \
+      CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;                  \
+      CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;
+
+#define SAVE_ACC_COMPLEX_12  /* sum two complex values across the 8 accumulators, scale by alpha; first goes to CO[0..1], second to CO[2*ldc..2*ldc+1] */ \
+      BUILTIN_MMA_DISASSEMBLE_ACC_8                                  \
+      COMP_MUL_2A                                                    \
+      COMP_MAC_2A(16)                                                \
+      COMP_MAC_2A(32)                                                \
+      COMP_MAC_2A(48)                                                \
+      COMP_MAC_2A(64)                                                \
+      COMP_MAC_2A(80)                                                \
+      COMP_MAC_2A(96)                                                \
+      COMP_MAC_2A(112)                                               \
+      CO[0]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;            \
+      CO[1]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;            \
+      CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i;            \
+      CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
+
+#define SAVE_ACC_COMPLEX_21_1  /* sum two complex values across the 8 accumulators, scale by alpha, write them contiguously to CO[0..3] */ \
+      BUILTIN_MMA_DISASSEMBLE_ACC_8                                  \
+      COMP_MUL_2B                                                    \
+      COMP_MAC_2B(16)                                                \
+      COMP_MAC_2B(32)                                                \
+      COMP_MAC_2B(48)                                                \
+      COMP_MAC_2B(64)                                                \
+      COMP_MAC_2B(80)                                                \
+      COMP_MAC_2B(96)                                                \
+      COMP_MAC_2B(112)                                               \
+      CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;                  \
+      CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;                  \
+      CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;                  \
+      CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
+
+#define SAVE_ACC_COMPLEX_21_2  /* four complex values: 4A pattern at res[0] plus the partial sums at res[32], res[64], res[96]; scaled by alpha into CO[0..7] */ \
+      BUILTIN_MMA_DISASSEMBLE_ACC_8                                  \
+      COMP_MUL_4A(0)                                                 \
+      COMP_MAC_4A(32)                                                \
+      COMP_MAC_4A(64)                                                \
+      COMP_MAC_4A(96)                                                \
+      CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;                  \
+      CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;                  \
+      CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;                  \
+      CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;                  \
+      CO[4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;                  \
+      CO[5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;                  \
+      CO[6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;                  \
+      CO[7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
+
+#define SAVE_ACC_COMPLEX_21_4  /* eight complex values in two passes: res[0]+res[64] -> CO[0..7], then res[32]+res[96] -> CO[8..15]; tr/ti[0..3] are reused between passes */ \
+      BUILTIN_MMA_DISASSEMBLE_ACC_8                                  \
+      COMP_MUL_4A(0)                                                 \
+      COMP_MAC_4A(64)                                                \
+      CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;                 \
+      CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;                 \
+      CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;                 \
+      CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;                 \
+      CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;                 \
+      CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;                 \
+      CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;                 \
+      CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;                 \
+      COMP_MUL_4A(32)                                                \
+      COMP_MAC_4A(96)                                                \
+      CO[ 8] A_OP tr[0] * alpha_r - ti[0] * alpha_i;                 \
+      CO[ 9] A_OP ti[0] * alpha_r + tr[0] * alpha_i;                 \
+      CO[10] A_OP tr[1] * alpha_r - ti[1] * alpha_i;                 \
+      CO[11] A_OP ti[1] * alpha_r + tr[1] * alpha_i;                 \
+      CO[12] A_OP tr[2] * alpha_r - ti[2] * alpha_i;                 \
+      CO[13] A_OP ti[2] * alpha_r + tr[2] * alpha_i;                 \
+      CO[14] A_OP tr[3] * alpha_r - ti[3] * alpha_i;                 \
+      CO[15] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
+
+#define SAVE_ACC_COMPLEX_22_4  /* four passes over res[0], res[16], res[32], res[48]: each yields 2 values for CO[...] and 2 for CO[2*ldc+...] */ \
+      BUILTIN_MMA_DISASSEMBLE_ACC_8                              \
+      COMP_MUL_4B(0)                                             \
+      CO[ 0]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;       \
+      CO[ 1]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;       \
+      CO[ 2]       A_OP tr[1] * alpha_r - ti[1] * alpha_i;       \
+      CO[ 3]       A_OP ti[1] * alpha_r + tr[1] * alpha_i;       \
+      CO[2*ldc+ 0] A_OP tr[2] * alpha_r - ti[2] * alpha_i;       \
+      CO[2*ldc+ 1] A_OP ti[2] * alpha_r + tr[2] * alpha_i;       \
+      CO[2*ldc+ 2] A_OP tr[3] * alpha_r - ti[3] * alpha_i;       \
+      CO[2*ldc+ 3] A_OP ti[3] * alpha_r + tr[3] * alpha_i;       \
+      COMP_MUL_4B(16)                                            \
+      CO[ 4]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;       \
+      CO[ 5]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;       \
+      CO[ 6]       A_OP tr[1] * alpha_r - ti[1] * alpha_i;       \
+      CO[ 7]       A_OP ti[1] * alpha_r + tr[1] * alpha_i;       \
+      CO[2*ldc+ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;       \
+      CO[2*ldc+ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;       \
+      CO[2*ldc+ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;       \
+      CO[2*ldc+ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;       \
+      COMP_MUL_4B(32)                                            \
+      CO[ 8]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;       \
+      CO[ 9]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;       \
+      CO[10]       A_OP tr[1] * alpha_r - ti[1] * alpha_i;       \
+      CO[11]       A_OP ti[1] * alpha_r + tr[1] * alpha_i;       \
+      CO[2*ldc+ 8] A_OP tr[2] * alpha_r - ti[2] * alpha_i;       \
+      CO[2*ldc+ 9] A_OP ti[2] * alpha_r + tr[2] * alpha_i;       \
+      CO[2*ldc+10] A_OP tr[3] * alpha_r - ti[3] * alpha_i;       \
+      CO[2*ldc+11] A_OP ti[3] * alpha_r + tr[3] * alpha_i;       \
+      COMP_MUL_4B(48)                                            \
+      CO[12]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;       \
+      CO[13]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;       \
+      CO[14]       A_OP tr[1] * alpha_r - ti[1] * alpha_i;       \
+      CO[15]       A_OP ti[1] * alpha_r + tr[1] * alpha_i;       \
+      CO[2*ldc+12] A_OP tr[2] * alpha_r - ti[2] * alpha_i;       \
+      CO[2*ldc+13] A_OP ti[2] * alpha_r + tr[2] * alpha_i;       \
+      CO[2*ldc+14] A_OP tr[3] * alpha_r - ti[3] * alpha_i;       \
+      CO[2*ldc+15] A_OP ti[3] * alpha_r + tr[3] * alpha_i;         
+
+#define SAVE_ACC_COMPLEX_22_2  /* two passes over res[0] and res[16]; all 8 accumulators are disassembled but only res[0..31] is read afterwards */ \
+      BUILTIN_MMA_DISASSEMBLE_ACC_8                                \
+      COMP_MUL_4B(0)                                               \
+      CO[0]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;          \
+      CO[1]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;          \
+      CO[2]       A_OP tr[1] * alpha_r - ti[1] * alpha_i;          \
+      CO[3]       A_OP ti[1] * alpha_r + tr[1] * alpha_i;          \
+      CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i;          \
+      CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i;          \
+      CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i;          \
+      CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i;          \
+      COMP_MUL_4B(16)                                              \
+      CO[4]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;          \
+      CO[5]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;          \
+      CO[6]       A_OP tr[1] * alpha_r - ti[1] * alpha_i;          \
+      CO[7]       A_OP ti[1] * alpha_r + tr[1] * alpha_i;          \
+      CO[2*ldc+4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;          \
+      CO[2*ldc+5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;          \
+      CO[2*ldc+6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;          \
+      CO[2*ldc+7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;         
+
+#define SAVE_ACC_COMPLEX_22_1  /* single pass over res[0..15]: 2 values to CO[0..3], 2 to CO[2*ldc..2*ldc+3]; all 8 accumulators are still disassembled */ \
+      BUILTIN_MMA_DISASSEMBLE_ACC_8                                \
+      COMP_MUL_4B(0)                                               \
+      CO[0]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;          \
+      CO[1]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;          \
+      CO[2]       A_OP tr[1] * alpha_r - ti[1] * alpha_i;          \
+      CO[3]       A_OP ti[1] * alpha_r + tr[1] * alpha_i;          \
+      CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i;          \
+      CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i;          \
+      CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i;          \
+      CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i;         
+
+#define SAVE_ACC_COMPLEX_24_ALL  /* disassemble in the order acc0,acc4,acc1,acc5,acc2,acc6,acc3,acc7, form 32 complex values tr/ti[0..31], scale by alpha, store to CO[0..15] on the rows at 0, 2*ldc, 4*ldc, 6*ldc */ \
+      __builtin_mma_disassemble_acc ((void *)result, &acc0);            \
+      __builtin_mma_disassemble_acc ((void *)(&result[4]), &acc4);      \
+      __builtin_mma_disassemble_acc ((void *)(&result[8]), &acc1);      \
+      __builtin_mma_disassemble_acc ((void *)(&result[12]), &acc5);     \
+      __builtin_mma_disassemble_acc ((void *)(&result[16]), &acc2);     \
+      __builtin_mma_disassemble_acc ((void *)(&result[20]), &acc6);     \
+      __builtin_mma_disassemble_acc ((void *)(&result[24]), &acc3);     \
+      __builtin_mma_disassemble_acc ((void *)(&result[28]), &acc7);     \
+      COMP_MUL(tr[ 0], res[  0], res[  5], ti[ 0], res[  1], res[  4])  \
+      COMP_MUL(tr[ 1], res[  8], res[ 13], ti[ 1], res[  9], res[ 12])  \
+      COMP_MUL(tr[ 2], res[  2], res[  7], ti[ 2], res[  3], res[  6])  \
+      COMP_MUL(tr[ 3], res[ 10], res[ 15], ti[ 3], res[ 11], res[ 14])  \
+      COMP_MUL(tr[ 4], res[ 16], res[ 21], ti[ 4], res[ 17], res[ 20])  \
+      COMP_MUL(tr[ 5], res[ 24], res[ 29], ti[ 5], res[ 25], res[ 28])  \
+      COMP_MUL(tr[ 6], res[ 18], res[ 23], ti[ 6], res[ 19], res[ 22])  \
+      COMP_MUL(tr[ 7], res[ 26], res[ 31], ti[ 7], res[ 27], res[ 30])  \
+      COMP_MUL(tr[ 8], res[ 32], res[ 37], ti[ 8], res[ 33], res[ 36])  \
+      COMP_MUL(tr[ 9], res[ 40], res[ 45], ti[ 9], res[ 41], res[ 44])  \
+      COMP_MUL(tr[10], res[ 34], res[ 39], ti[10], res[ 35], res[ 38])  \
+      COMP_MUL(tr[11], res[ 42], res[ 47], ti[11], res[ 43], res[ 46])  \
+      COMP_MUL(tr[12], res[ 48], res[ 53], ti[12], res[ 49], res[ 52])  \
+      COMP_MUL(tr[13], res[ 56], res[ 61], ti[13], res[ 57], res[ 60])  \
+      COMP_MUL(tr[14], res[ 50], res[ 55], ti[14], res[ 51], res[ 54])  \
+      COMP_MUL(tr[15], res[ 58], res[ 63], ti[15], res[ 59], res[ 62])  \
+      COMP_MUL(tr[16], res[ 64], res[ 69], ti[16], res[ 65], res[ 68])  \
+      COMP_MUL(tr[17], res[ 72], res[ 77], ti[17], res[ 73], res[ 76])  \
+      COMP_MUL(tr[18], res[ 66], res[ 71], ti[18], res[ 67], res[ 70])  \
+      COMP_MUL(tr[19], res[ 74], res[ 79], ti[19], res[ 75], res[ 78])  \
+      COMP_MUL(tr[20], res[ 80], res[ 85], ti[20], res[ 81], res[ 84])  \
+      COMP_MUL(tr[21], res[ 88], res[ 93], ti[21], res[ 89], res[ 92])  \
+      COMP_MUL(tr[22], res[ 82], res[ 87], ti[22], res[ 83], res[ 86])  \
+      COMP_MUL(tr[23], res[ 90], res[ 95], ti[23], res[ 91], res[ 94])  \
+      COMP_MUL(tr[24], res[ 96], res[101], ti[24], res[ 97], res[100])  \
+      COMP_MUL(tr[25], res[104], res[109], ti[25], res[105], res[108])  \
+      COMP_MUL(tr[26], res[ 98], res[103], ti[26], res[ 99], res[102])  \
+      COMP_MUL(tr[27], res[106], res[111], ti[27], res[107], res[110])  \
+      COMP_MUL(tr[28], res[112], res[117], ti[28], res[113], res[116])  \
+      COMP_MUL(tr[29], res[120], res[125], ti[29], res[121], res[124])  \
+      COMP_MUL(tr[30], res[114], res[119], ti[30], res[115], res[118])  \
+      COMP_MUL(tr[31], res[122], res[127], ti[31], res[123], res[126])  \
+      CO[       0] A_OP tr[ 0] * alpha_r - ti[ 0] * alpha_i;            \
+      CO[       1] A_OP ti[ 0] * alpha_r + tr[ 0] * alpha_i;            \
+      CO[       2] A_OP tr[ 1] * alpha_r - ti[ 1] * alpha_i;            \
+      CO[       3] A_OP ti[ 1] * alpha_r + tr[ 1] * alpha_i;            \
+      CO[2*ldc+ 0] A_OP tr[ 2] * alpha_r - ti[ 2] * alpha_i;            \
+      CO[2*ldc+ 1] A_OP ti[ 2] * alpha_r + tr[ 2] * alpha_i;            \
+      CO[2*ldc+ 2] A_OP tr[ 3] * alpha_r - ti[ 3] * alpha_i;            \
+      CO[2*ldc+ 3] A_OP ti[ 3] * alpha_r + tr[ 3] * alpha_i;            \
+      CO[4*ldc+ 0] A_OP tr[ 4] * alpha_r - ti[ 4] * alpha_i;            \
+      CO[4*ldc+ 1] A_OP ti[ 4] * alpha_r + tr[ 4] * alpha_i;            \
+      CO[4*ldc+ 2] A_OP tr[ 5] * alpha_r - ti[ 5] * alpha_i;            \
+      CO[4*ldc+ 3] A_OP ti[ 5] * alpha_r + tr[ 5] * alpha_i;            \
+      CO[6*ldc+ 0] A_OP tr[ 6] * alpha_r - ti[ 6] * alpha_i;            \
+      CO[6*ldc+ 1] A_OP ti[ 6] * alpha_r + tr[ 6] * alpha_i;            \
+      CO[6*ldc+ 2] A_OP tr[ 7] * alpha_r - ti[ 7] * alpha_i;            \
+      CO[6*ldc+ 3] A_OP ti[ 7] * alpha_r + tr[ 7] * alpha_i;            \
+      CO[       4] A_OP tr[ 8] * alpha_r - ti[ 8] * alpha_i;            \
+      CO[       5] A_OP ti[ 8] * alpha_r + tr[ 8] * alpha_i;            \
+      CO[       6] A_OP tr[ 9] * alpha_r - ti[ 9] * alpha_i;            \
+      CO[       7] A_OP ti[ 9] * alpha_r + tr[ 9] * alpha_i;            \
+      CO[2*ldc+ 4] A_OP tr[10] * alpha_r - ti[10] * alpha_i;            \
+      CO[2*ldc+ 5] A_OP ti[10] * alpha_r + tr[10] * alpha_i;            \
+      CO[2*ldc+ 6] A_OP tr[11] * alpha_r - ti[11] * alpha_i;            \
+      CO[2*ldc+ 7] A_OP ti[11] * alpha_r + tr[11] * alpha_i;            \
+      CO[4*ldc+ 4] A_OP tr[12] * alpha_r - ti[12] * alpha_i;            \
+      CO[4*ldc+ 5] A_OP ti[12] * alpha_r + tr[12] * alpha_i;            \
+      CO[4*ldc+ 6] A_OP tr[13] * alpha_r - ti[13] * alpha_i;            \
+      CO[4*ldc+ 7] A_OP ti[13] * alpha_r + tr[13] * alpha_i;            \
+      CO[6*ldc+ 4] A_OP tr[14] * alpha_r - ti[14] * alpha_i;            \
+      CO[6*ldc+ 5] A_OP ti[14] * alpha_r + tr[14] * alpha_i;            \
+      CO[6*ldc+ 6] A_OP tr[15] * alpha_r - ti[15] * alpha_i;            \
+      CO[6*ldc+ 7] A_OP ti[15] * alpha_r + tr[15] * alpha_i;            \
+      CO[       8] A_OP tr[16] * alpha_r - ti[16] * alpha_i;            \
+      CO[       9] A_OP ti[16] * alpha_r + tr[16] * alpha_i;            \
+      CO[      10] A_OP tr[17] * alpha_r - ti[17] * alpha_i;            \
+      CO[      11] A_OP ti[17] * alpha_r + tr[17] * alpha_i;            \
+      CO[2*ldc+ 8] A_OP tr[18] * alpha_r - ti[18] * alpha_i;            \
+      CO[2*ldc+ 9] A_OP ti[18] * alpha_r + tr[18] * alpha_i;            \
+      CO[2*ldc+10] A_OP tr[19] * alpha_r - ti[19] * alpha_i;            \
+      CO[2*ldc+11] A_OP ti[19] * alpha_r + tr[19] * alpha_i;            \
+      CO[4*ldc+ 8] A_OP tr[20] * alpha_r - ti[20] * alpha_i;            \
+      CO[4*ldc+ 9] A_OP ti[20] * alpha_r + tr[20] * alpha_i;            \
+      CO[4*ldc+10] A_OP tr[21] * alpha_r - ti[21] * alpha_i;            \
+      CO[4*ldc+11] A_OP ti[21] * alpha_r + tr[21] * alpha_i;            \
+      CO[6*ldc+ 8] A_OP tr[22] * alpha_r - ti[22] * alpha_i;            \
+      CO[6*ldc+ 9] A_OP ti[22] * alpha_r + tr[22] * alpha_i;            \
+      CO[6*ldc+10] A_OP tr[23] * alpha_r - ti[23] * alpha_i;            \
+      CO[6*ldc+11] A_OP ti[23] * alpha_r + tr[23] * alpha_i;            \
+      CO[      12] A_OP tr[24] * alpha_r - ti[24] * alpha_i;            \
+      CO[      13] A_OP ti[24] * alpha_r + tr[24] * alpha_i;            \
+      CO[      14] A_OP tr[25] * alpha_r - ti[25] * alpha_i;            \
+      CO[      15] A_OP ti[25] * alpha_r + tr[25] * alpha_i;            \
+      CO[2*ldc+12] A_OP tr[26] * alpha_r - ti[26] * alpha_i;            \
+      CO[2*ldc+13] A_OP ti[26] * alpha_r + tr[26] * alpha_i;            \
+      CO[2*ldc+14] A_OP tr[27] * alpha_r - ti[27] * alpha_i;            \
+      CO[2*ldc+15] A_OP ti[27] * alpha_r + tr[27] * alpha_i;            \
+      CO[4*ldc+12] A_OP tr[28] * alpha_r - ti[28] * alpha_i;            \
+      CO[4*ldc+13] A_OP ti[28] * alpha_r + tr[28] * alpha_i;            \
+      CO[4*ldc+14] A_OP tr[29] * alpha_r - ti[29] * alpha_i;            \
+      CO[4*ldc+15] A_OP ti[29] * alpha_r + tr[29] * alpha_i;            \
+      CO[6*ldc+12] A_OP tr[30] * alpha_r - ti[30] * alpha_i;            \
+      CO[6*ldc+13] A_OP ti[30] * alpha_r + tr[30] * alpha_i;            \
+      CO[6*ldc+14] A_OP tr[31] * alpha_r - ti[31] * alpha_i;            \
+      CO[6*ldc+15] A_OP ti[31] * alpha_r + tr[31] * alpha_i; 
+
+#define SAVE_ACC_COMPLEX_24(ACC1, ACC2, CI)  /* ACC1/ACC2: pointers to two accumulators, unpacked to res[0..15] and res[16..31]; CI: offset added to every CO[] index */ \
+      __builtin_mma_disassemble_acc ((void *)result, ACC1);        \
+      __builtin_mma_disassemble_acc ((void *)(&result[4]), ACC2);  \
+      COMP_MUL(tr[0], res[0], res[5], ti[0], res[1], res[4])       \
+      COMP_MUL(tr[1], res[8], res[13], ti[1], res[9], res[12])     \
+      COMP_MUL(tr[2], res[2], res[7], ti[2], res[3], res[6])       \
+      COMP_MUL(tr[3], res[10], res[15], ti[3], res[11], res[14])   \
+      COMP_MUL(tr[4], res[16], res[21], ti[4], res[17], res[20])   \
+      COMP_MUL(tr[5], res[24], res[29], ti[5], res[25], res[28])   \
+      COMP_MUL(tr[6], res[18], res[23], ti[6], res[19], res[22])   \
+      COMP_MUL(tr[7], res[26], res[31], ti[7], res[27], res[30])   \
+      CO[CI+0]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;       \
+      CO[CI+1]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;       \
+      CO[CI+2]       A_OP tr[1] * alpha_r - ti[1] * alpha_i;       \
+      CO[CI+3]       A_OP ti[1] * alpha_r + tr[1] * alpha_i;       \
+      CO[CI+2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i;       \
+      CO[CI+2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i;       \
+      CO[CI+2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i;       \
+      CO[CI+2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i;       \
+      CO[CI+4*ldc+0] A_OP tr[4] * alpha_r - ti[4] * alpha_i;       \
+      CO[CI+4*ldc+1] A_OP ti[4] * alpha_r + tr[4] * alpha_i;       \
+      CO[CI+4*ldc+2] A_OP tr[5] * alpha_r - ti[5] * alpha_i;       \
+      CO[CI+4*ldc+3] A_OP ti[5] * alpha_r + tr[5] * alpha_i;       \
+      CO[CI+6*ldc+0] A_OP tr[6] * alpha_r - ti[6] * alpha_i;       \
+      CO[CI+6*ldc+1] A_OP ti[6] * alpha_r + tr[6] * alpha_i;       \
+      CO[CI+6*ldc+2] A_OP tr[7] * alpha_r - ti[7] * alpha_i;       \
+      CO[CI+6*ldc+3] A_OP ti[7] * alpha_r + tr[7] * alpha_i;         
+
+#define SAVE_ACC_COMPLEX_14  /* four complex values, each summed from four places in res[] (stride 32), scaled by alpha, stored at CO[0..1] on the rows at 0, 2*ldc, 4*ldc, 6*ldc */ \
+      BUILTIN_MMA_DISASSEMBLE_ACC_8                                  \
+      COMP_MUL(tr[0], res[  0], res[  5], ti[0], res[  1], res[  4]) \
+      COMP_MUL(tr[1], res[  2], res[  7], ti[1], res[  3], res[  6]) \
+      COMP_MUL(tr[2], res[ 16], res[ 21], ti[2], res[ 17], res[ 20]) \
+      COMP_MUL(tr[3], res[ 18], res[ 23], ti[3], res[ 19], res[ 22]) \
+      COMP_MAC(tr[0], res[ 32], res[ 37], ti[0], res[ 33], res[ 36]) \
+      COMP_MAC(tr[1], res[ 34], res[ 39], ti[1], res[ 35], res[ 38]) \
+      COMP_MAC(tr[2], res[ 48], res[ 53], ti[2], res[ 49], res[ 52]) \
+      COMP_MAC(tr[3], res[ 50], res[ 55], ti[3], res[ 51], res[ 54]) \
+      COMP_MAC(tr[0], res[ 64], res[ 69], ti[0], res[ 65], res[ 68]) \
+      COMP_MAC(tr[1], res[ 66], res[ 71], ti[1], res[ 67], res[ 70]) \
+      COMP_MAC(tr[2], res[ 80], res[ 85], ti[2], res[ 81], res[ 84]) \
+      COMP_MAC(tr[3], res[ 82], res[ 87], ti[3], res[ 83], res[ 86]) \
+      COMP_MAC(tr[0], res[ 96], res[101], ti[0], res[ 97], res[100]) \
+      COMP_MAC(tr[1], res[ 98], res[103], ti[1], res[ 99], res[102]) \
+      COMP_MAC(tr[2], res[112], res[117], ti[2], res[113], res[116]) \
+      COMP_MAC(tr[3], res[114], res[119], ti[3], res[115], res[118]) \
+      CO[0]       A_OP tr[0] * alpha_r - ti[0] * alpha_i;            \
+      CO[1]       A_OP ti[0] * alpha_r + tr[0] * alpha_i;            \
+      CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i;            \
+      CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i;            \
+      CO[4*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i;            \
+      CO[4*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i;            \
+      CO[6*ldc+0] A_OP tr[3] * alpha_r - ti[3] * alpha_i;            \
+      CO[6*ldc+1] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
+
+#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "b" (x), "r" (y) : "memory");  /* data-cache touch hint for address x + y; dcbt reads RA = r0 as literal 0, so x (RA) takes the "b" constraint, which excludes r0 */
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))  /* REFRESH_TEMP_BK sets temp, which the kernel then uses as its k-loop bound */
+#define REFRESH_TEMP_BK(x, y) /* this variant ignores x and y: run from off to k */ \
+      temp = k - off;
+#elif defined(LEFT)
+#define REFRESH_TEMP_BK(x, y) /* x is parenthesised so an expression argument expands as one operand */ \
+      temp = off + (x);
+#else
+#define REFRESH_TEMP_BK(x, y) /* y is parenthesised so an expression argument expands as one operand */ \
+      temp = off + (y);
+#endif
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))  /* REFRESH_POINTERS positions BO (and, in the #else variant, AO) and sets temp before a tile is computed */
+#define REFRESH_POINTERS(x, y) /* this variant leaves AO untouched and restarts BO at B */ \
+      BO = B; \
+      REFRESH_TEMP_BK(x, y)
+#else
+#define REFRESH_POINTERS(x, y) /* advance both panels by off steps of 2*x and 2*y FLOATs; x and y are parenthesised so expressions expand safely */ \
+      AO += off * (2*(x)); \
+      BO = B + off * (2*(y)); \
+      REFRESH_TEMP_BK(x, y)
+#endif
+
+#ifdef LEFT  /* REFRESH_OFF moves the running offset `off` forward by x; only LEFT builds do so */
+#define REFRESH_OFF(x) \
+      off += x;
+#else
+#define REFRESH_OFF(x)  /* expands to nothing when LEFT is not defined */
+#endif
+
+#ifdef LEFT  /* UPDATE_TEMP reduces temp by x in LEFT builds and by y otherwise */
+#define UPDATE_TEMP(x, y) \
+      temp -= x;
+#else
+#define UPDATE_TEMP(x, y) \
+      temp -= y;
+#endif
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+#define REFRESH_TMP_AFTER_SAVE(x, y) /* after a tile is stored: recompute temp, then advance AO/BO by temp steps of 2*x and 2*y FLOATs; x and y are parenthesised so expressions expand safely */ \
+      temp = k - off;                \
+      UPDATE_TEMP(x, y)              \
+      AO += temp * (2*(x));          \
+      BO += temp * (2*(y));
+#else
+#define REFRESH_TMP_AFTER_SAVE(x, y)  /* expands to nothing in the other LEFT/TRANSA combinations */
+#endif
+
+#define REFRESH_AFTER_SAVE(x,y)  /* post-store bookkeeping used by the TRMM build: pointer/temp fix-up, then offset advance */ \
+      REFRESH_TMP_AFTER_SAVE(x, y)   \
+      REFRESH_OFF(x)
+/*************************************************************************************
+* GEMM Kernel (TRMM variant when TRMMKERNEL is defined): walks C in 4/2/1-column panels and 8/4/2/1-row tiles, always returns 0
+*************************************************************************************/
+int
+#ifdef TRMMKERNEL
+CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, 
+       FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc, BLASLONG offset) /* TRMM build: extra offset argument */
+#else
+CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, 
+       FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc) /* plain GEMM build */
+#endif
+{
+   BLASLONG i1, i, l, temp;   /* temp: k-loop trip count of the current tile (k itself in the GEMM build) */
+   FLOAT *AO, *BO, *CO;       /* cursors into A, B and C for the current tile */
+#if defined(TRMMKERNEL)
+   BLASLONG off;
+#endif
+#if defined(TRMMKERNEL) && !defined(LEFT)
+   off = -offset;
+#endif
+
+   __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; /* MMA accumulators fed by xvf32gerpp below */
+
+   v4sf_t result[32];
+   FLOAT *res, tr[64], ti[64]; /* not referenced directly below; presumably scratch for the SAVE_ACC_* macros */
+   res = (FLOAT *) result;
+
+   for (i1 = 0; i1 < (n >> 2); i1++) { /* panels of 4 columns */
+#if defined(TRMMKERNEL) && defined(LEFT)
+      off = offset;
+#endif
+      AO = A;
+      CO = C;
+      C += ldc << 3; /* next panel: 8*ldc FLOATs further on */
+
+      for (i = 0; i < (m >> 3); i++) { /* tiles of 8 rows */
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (8, 4); /* defined outside this view; presumably sets BO and temp - TODO confirm */
+#else
+         BO = B;
+         temp = k;
+#endif
+         SET_ACC_ZERO()
+         for (l = 0; l < temp; ++l) { /* one k step per iteration: 16 FLOATs of A, 8 FLOATs of B */
+            vec_t rowA1 = *(vec_t *) & AO[l<<4];
+            vec_t rowB1 = *(vec_t *) & BO[l<<3];
+            vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
+            vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
+            vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
+            __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1);
+            __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1);
+            __builtin_mma_xvf32gerpp(&acc4, rowA1, rowB2);
+            __builtin_mma_xvf32gerpp(&acc5, rowA2, rowB2);
+            __builtin_mma_xvf32gerpp(&acc6, rowA3, rowB2);
+            __builtin_mma_xvf32gerpp(&acc7, rowA4, rowB2);
+         }
+      SAVE_ACC_COMPLEX_24_ALL /* defined outside this view; presumably scales by alpha and writes acc0..acc7 to CO */
+         CO += 16;
+         AO += temp << 4;
+         BO += temp << 3;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (8, 4)
+#endif
+      }
+      if (m & 4) { /* leftover tile of 4 rows */
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (4, 4);
+#else
+         BO = B;
+         temp = k;
+#endif
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~1)); l+=2) { /* two k steps per iteration, both summed into acc0..acc3 */
+            vec_t rowA1 = *(vec_t *) & AO[l<<3];
+            vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
+            vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8];
+            vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12];
+            vec_t rowB1 = *(vec_t *) & BO[l<<3];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
+            vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8];
+            vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
+            __builtin_mma_xvf32gerpp(&acc2, rowA1, rowB2);
+            __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB2);
+            __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB3);
+            __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB3);
+            __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB4);
+            __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4);
+         }
+         for (l = (temp & (~1)); l < temp; ++l) { /* remaining k step when temp is odd */
+            vec_t rowA1 = *(vec_t *) & AO[l<<3];
+            vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
+            vec_t rowB1 = *(vec_t *) & BO[l<<3];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
+            __builtin_mma_xvf32gerpp(&acc2, rowA1, rowB2);
+            __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB2);
+         }
+         SAVE_ACC_COMPLEX_24(&acc0, &acc2, 0)
+         SAVE_ACC_COMPLEX_24(&acc1, &acc3, 4)
+         CO += 8;
+         AO += temp << 3;
+         BO += temp << 3;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (4, 4)
+#endif
+      }
+      if (m & 2) { /* leftover tile of 2 rows */
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (2, 4);
+#else
+         BO = B;
+         temp = k;
+#endif
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~3)); l+=4) { /* four k steps per iteration, all summed into acc0/acc1 */
+            vec_t rowA1 = *(vec_t *) & AO[l<<2];
+            vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4];
+            vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8];
+            vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12];
+            vec_t rowB1 = *(vec_t *) & BO[l<<3];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
+            vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8];
+            vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12];
+            vec_t rowB5 = *(vec_t *) & BO[(l<<3)+16];
+            vec_t rowB6 = *(vec_t *) & BO[(l<<3)+20];
+            vec_t rowB7 = *(vec_t *) & BO[(l<<3)+24];
+            vec_t rowB8 = *(vec_t *) & BO[(l<<3)+28];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2);
+            __builtin_mma_xvf32gerpp(&acc0, rowA2, rowB3);
+            __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB4);
+            __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB5);
+            __builtin_mma_xvf32gerpp(&acc1, rowA3, rowB6);
+            __builtin_mma_xvf32gerpp(&acc0, rowA4, rowB7);
+            __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB8);
+         }
+         for (l = (temp & (~3)); l < temp; ++l) { /* remaining 0-3 k steps */
+            vec_t rowA1 = *(vec_t *) & AO[l<<2];
+            vec_t rowB1 = *(vec_t *) & BO[l<<3];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2);
+         }
+         SAVE_ACC_COMPLEX_24(&acc0, &acc1, 0)
+         CO += 4;
+         AO += temp << 2;
+         BO += temp << 3;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (2, 4)
+#endif
+      }
+      if (m & 1) { /* leftover single row: A advances only 2 FLOATs per k step */
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (1, 4)
+#else
+         BO = B;
+         temp = k;
+#endif
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~3)); l+=4) { /* four k steps per iteration, spread over acc0..acc7 */
+            vec_t rowA1 = *(vec_t *) & AO[l<<1];
+            vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2];
+            vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4];
+            vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6];
+            vec_t rowB1 = *(vec_t *) & BO[l<<3];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
+            vec_t rowB3 = *(vec_t *) & BO[(l<<3)+8];
+            vec_t rowB4 = *(vec_t *) & BO[(l<<3)+12];
+            vec_t rowB5 = *(vec_t *) & BO[(l<<3)+16];
+            vec_t rowB6 = *(vec_t *) & BO[(l<<3)+20];
+            vec_t rowB7 = *(vec_t *) & BO[(l<<3)+24];
+            vec_t rowB8 = *(vec_t *) & BO[(l<<3)+28];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2);
+            __builtin_mma_xvf32gerpp(&acc2, rowA2, rowB3);
+            __builtin_mma_xvf32gerpp(&acc3, rowA2, rowB4);
+            __builtin_mma_xvf32gerpp(&acc4, rowA3, rowB5);
+            __builtin_mma_xvf32gerpp(&acc5, rowA3, rowB6);
+            __builtin_mma_xvf32gerpp(&acc6, rowA4, rowB7);
+            __builtin_mma_xvf32gerpp(&acc7, rowA4, rowB8);
+         }
+         for (l = (temp & (~3)); l < temp; ++l) { /* remaining 0-3 k steps go to acc0/acc1 only */
+            vec_t rowA1 = *(vec_t *) & AO[l<<1];
+            vec_t rowB1 = *(vec_t *) & BO[l<<3];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<3)+4];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA1, rowB2);
+         }
+         SAVE_ACC_COMPLEX_14 /* defined outside this view; presumably combines the partial accumulators - TODO confirm */
+         CO += 2;
+         AO += temp << 1;
+         BO += temp << 3;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (1, 4)
+#endif
+      }
+#if defined(TRMMKERNEL) && !defined(LEFT)
+      off += 4;                        // advance by this panel's column count (4)
+#endif
+
+      B += k << 3; /* packed B: 8 FLOATs per k step for a 4-column panel */
+   }
+
+   if (n & 2) { /* leftover panel of 2 columns */
+#if defined(TRMMKERNEL) && defined(LEFT)
+      off = offset;
+#endif
+      AO = A;
+      CO = C;
+      C += ldc << 2;
+
+      for (i = 0; i < (m >> 3); i++) { /* tiles of 8 rows */
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (8, 2)
+#else
+         BO = B;
+         temp = k;
+#endif
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~1)); l+=2) { /* two k steps per iteration, both summed into acc0..acc3 */
+            vec_t rowA1 = *(vec_t *) & AO[l<<4];
+            vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
+            vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
+            vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12];
+            vec_t rowA5 = *(vec_t *) & AO[(l<<4)+16];
+            vec_t rowA6 = *(vec_t *) & AO[(l<<4)+20];
+            vec_t rowA7 = *(vec_t *) & AO[(l<<4)+24];
+            vec_t rowA8 = *(vec_t *) & AO[(l<<4)+28];
+            vec_t rowB1 = *(vec_t *) & BO[l<<2];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
+            __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1);
+            __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1);
+            __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB2);
+            __builtin_mma_xvf32gerpp(&acc1, rowA6, rowB2);
+            __builtin_mma_xvf32gerpp(&acc2, rowA7, rowB2);
+            __builtin_mma_xvf32gerpp(&acc3, rowA8, rowB2);
+         }
+         for (l = (temp & (~1)); l < temp; ++l) { /* remaining k step when temp is odd */
+            vec_t rowA1 = *(vec_t *) & AO[l<<4];
+            vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
+            vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
+            vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12];
+            vec_t rowB1 = *(vec_t *) & BO[l<<2];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
+            __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1);
+            __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1);
+         }
+         SAVE_ACC_COMPLEX_22_4
+         AO += temp << 4;
+         BO += temp << 2;
+         CO += 16;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (8, 2)
+#endif
+      }
+      if (m & 4) { /* leftover tile of 4 rows */
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (4, 2)
+#else
+         BO = B;
+         temp = k;
+#endif
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~3)); l+=4) { /* four k steps per iteration, all summed into acc0/acc1 */
+            vec_t rowA1 = *(vec_t *) & AO[l<<3];
+            vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
+            vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8];
+            vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12];
+            vec_t rowA5 = *(vec_t *) & AO[(l<<3)+16];
+            vec_t rowA6 = *(vec_t *) & AO[(l<<3)+20];
+            vec_t rowA7 = *(vec_t *) & AO[(l<<3)+24];
+            vec_t rowA8 = *(vec_t *) & AO[(l<<3)+28];
+            vec_t rowB1 = *(vec_t *) & BO[l<<2];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4];
+            vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8];
+            vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
+            __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB2);
+            __builtin_mma_xvf32gerpp(&acc1, rowA4, rowB2);
+            __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB3);
+            __builtin_mma_xvf32gerpp(&acc1, rowA6, rowB3);
+            __builtin_mma_xvf32gerpp(&acc0, rowA7, rowB4);
+            __builtin_mma_xvf32gerpp(&acc1, rowA8, rowB4);
+         }
+         for (l = (temp & (~3)); l < temp; ++l) { /* remaining 0-3 k steps */
+            vec_t rowA1 = *(vec_t *) & AO[l<<3];
+            vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
+            vec_t rowB1 = *(vec_t *) & BO[l<<2];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
+         }
+         SAVE_ACC_COMPLEX_22_2
+         AO += temp << 3;
+         BO += temp << 2;
+         CO += 8;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (4, 2)
+#endif
+      } /* end of 4-row tile */
+      if (m & 2) { /* leftover tile of 2 rows */
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (2, 2)
+#else
+         BO = B;
+         temp = k;
+#endif
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~7)); l+=8) { /* eight k steps per iteration, all summed into acc0 */
+            vec_t rowA1 = *(vec_t *) & AO[l<<2];
+            vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4];
+            vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8];
+            vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12];
+            vec_t rowA5 = *(vec_t *) & AO[(l<<2)+16];
+            vec_t rowA6 = *(vec_t *) & AO[(l<<2)+20];
+            vec_t rowA7 = *(vec_t *) & AO[(l<<2)+24];
+            vec_t rowA8 = *(vec_t *) & AO[(l<<2)+28];
+            vec_t rowB1 = *(vec_t *) & BO[l<<2];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4];
+            vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8];
+            vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12];
+            vec_t rowB5 = *(vec_t *) & BO[(l<<2)+16];
+            vec_t rowB6 = *(vec_t *) & BO[(l<<2)+20];
+            vec_t rowB7 = *(vec_t *) & BO[(l<<2)+24];
+            vec_t rowB8 = *(vec_t *) & BO[(l<<2)+28];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc0, rowA2, rowB2);
+            __builtin_mma_xvf32gerpp(&acc0, rowA3, rowB3);
+            __builtin_mma_xvf32gerpp(&acc0, rowA4, rowB4);
+            __builtin_mma_xvf32gerpp(&acc0, rowA5, rowB5);
+            __builtin_mma_xvf32gerpp(&acc0, rowA6, rowB6);
+            __builtin_mma_xvf32gerpp(&acc0, rowA7, rowB7);
+            __builtin_mma_xvf32gerpp(&acc0, rowA8, rowB8);
+         }
+         for (l = (temp & (~7)); l < temp; ++l) { /* remaining 0-7 k steps */
+            vec_t rowA1 = *(vec_t *) & AO[l<<2];
+            vec_t rowB1 = *(vec_t *) & BO[l<<2];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+         }
+         SAVE_ACC_COMPLEX_22_1
+         AO += temp << 2;
+         BO += temp << 2;
+         CO += 4;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (2, 2)
+#endif
+      }
+      if (m & 1) { /* leftover single row */
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (1, 2)
+#else
+         BO = B;
+         temp = k;
+#endif
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~7)); l+=8) { /* eight k steps per iteration, one accumulator per step */
+            vec_t rowA1 = *(vec_t *) & AO[l<<1];
+            vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2];
+            vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4];
+            vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6];
+            vec_t rowA5 = *(vec_t *) & AO[(l<<1)+8];
+            vec_t rowA6 = *(vec_t *) & AO[(l<<1)+10];
+            vec_t rowA7 = *(vec_t *) & AO[(l<<1)+12];
+            vec_t rowA8 = *(vec_t *) & AO[(l<<1)+14];
+            vec_t rowB1 = *(vec_t *) & BO[l<<2];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<2)+4];
+            vec_t rowB3 = *(vec_t *) & BO[(l<<2)+8];
+            vec_t rowB4 = *(vec_t *) & BO[(l<<2)+12];
+            vec_t rowB5 = *(vec_t *) & BO[(l<<2)+16];
+            vec_t rowB6 = *(vec_t *) & BO[(l<<2)+20];
+            vec_t rowB7 = *(vec_t *) & BO[(l<<2)+24];
+            vec_t rowB8 = *(vec_t *) & BO[(l<<2)+28];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2);
+            __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3);
+            __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4);
+            __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5);
+            __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6);
+            __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7);
+            __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8);
+         }
+         for (l = (temp & (~7)); l < temp; ++l) { /* remaining 0-7 k steps go to acc0 only */
+            vec_t rowA1 = *(vec_t *) & AO[l<<1];
+            vec_t rowB1 = *(vec_t *) & BO[l<<2];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+         }
+         SAVE_ACC_COMPLEX_12
+         AO += temp<<1;
+         BO += temp<<2;
+         CO += 2;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (1, 2)
+#endif
+      }
+#if defined(TRMMKERNEL) && !defined(LEFT)
+      off += 2;                        // advance by this panel's column count (2)
+#endif
+      B += k << 2; /* packed B: 4 FLOATs per k step for a 2-column panel */
+   }
+
+   if (n & 1) { /* leftover panel of 1 column */
+#if defined(TRMMKERNEL) && defined(LEFT)
+      off = offset;
+#endif
+      AO = A;
+      CO = C;
+      C += ldc << 1;
+
+      for (i = 0; i < (m >> 3); i++) { /* tiles of 8 rows */
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (8, 1)
+#else
+         BO = B;
+         temp = k;
+#endif
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~1)); l+=2) { /* two k steps per iteration: acc0..3 take the first, acc4..7 the second */
+            vec_t rowA1 = *(vec_t *) & AO[l<<4];
+            vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
+            vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
+            vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12];
+            vec_t rowA5 = *(vec_t *) & AO[(l<<4)+16];
+            vec_t rowA6 = *(vec_t *) & AO[(l<<4)+20];
+            vec_t rowA7 = *(vec_t *) & AO[(l<<4)+24];
+            vec_t rowA8 = *(vec_t *) & AO[(l<<4)+28];
+            vec_t rowB1 = *(vec_t *) & BO[l<<1];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
+            __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1);
+            __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1);
+            __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB2);
+            __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB2);
+            __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB2);
+            __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB2);
+         }
+         for (l = (temp & (~1)); l < temp; ++l) { /* remaining k step when temp is odd */
+            vec_t rowA1 = *(vec_t *) & AO[l<<4];
+            vec_t rowA2 = *(vec_t *) & AO[(l<<4)+4];
+            vec_t rowA3 = *(vec_t *) & AO[(l<<4)+8];
+            vec_t rowA4 = *(vec_t *) & AO[(l<<4)+12];
+            vec_t rowB1 = *(vec_t *) & BO[l<<1];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
+            __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB1);
+            __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB1);
+         }
+         SAVE_ACC_COMPLEX_21_4
+         AO += temp << 4;
+         BO += temp << 1;
+         CO += 16;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (8, 1)
+#endif
+      }
+      if (m & 4) { /* leftover tile of 4 rows */
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (4, 1)
+#else
+         BO = B;
+         temp = k;
+#endif
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~3)); l+=4) { /* four k steps per iteration, two accumulators per step */
+            vec_t rowA1 = *(vec_t *) & AO[l<<3];
+            vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
+            vec_t rowA3 = *(vec_t *) & AO[(l<<3)+8];
+            vec_t rowA4 = *(vec_t *) & AO[(l<<3)+12];
+            vec_t rowA5 = *(vec_t *) & AO[(l<<3)+16];
+            vec_t rowA6 = *(vec_t *) & AO[(l<<3)+20];
+            vec_t rowA7 = *(vec_t *) & AO[(l<<3)+24];
+            vec_t rowA8 = *(vec_t *) & AO[(l<<3)+28];
+            vec_t rowB1 = *(vec_t *) & BO[l<<1];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
+            vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
+            vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
+            __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB2);
+            __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB2);
+            __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB3);
+            __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB3);
+            __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB4);
+            __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB4);
+         }
+         for (l = (temp & (~3)); l < temp; ++l) { /* remaining 0-3 k steps go to acc0/acc1 only */
+            vec_t rowA1 = *(vec_t *) & AO[l<<3];
+            vec_t rowA2 = *(vec_t *) & AO[(l<<3)+4];
+            vec_t rowB1 = *(vec_t *) & BO[l<<1];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB1);
+         }
+         SAVE_ACC_COMPLEX_21_2
+         AO += temp << 3;
+         BO += temp << 1;
+         CO += 8;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (4, 1)
+#endif
+      }
+      if (m & 2) { /* leftover tile of 2 rows */
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (2, 1)
+#else
+         BO = B;
+         temp = k;
+#endif
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~7)); l+=8) { /* eight k steps per iteration, one accumulator per step */
+            vec_t rowA1 = *(vec_t *) & AO[l<<2];
+            vec_t rowA2 = *(vec_t *) & AO[(l<<2)+4];
+            vec_t rowA3 = *(vec_t *) & AO[(l<<2)+8];
+            vec_t rowA4 = *(vec_t *) & AO[(l<<2)+12];
+            vec_t rowA5 = *(vec_t *) & AO[(l<<2)+16];
+            vec_t rowA6 = *(vec_t *) & AO[(l<<2)+20];
+            vec_t rowA7 = *(vec_t *) & AO[(l<<2)+24];
+            vec_t rowA8 = *(vec_t *) & AO[(l<<2)+28];
+            vec_t rowB1 = *(vec_t *) & BO[l<<1];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
+            vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
+            vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
+            vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8];
+            vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10];
+            vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12];
+            vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2);
+            __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3);
+            __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4);
+            __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5);
+            __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6);
+            __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7);
+            __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8);
+         }
+         for (l = (temp & (~7)); l < temp; ++l) { /* remaining 0-7 k steps go to acc0 only */
+            vec_t rowA1 = *(vec_t *) & AO[l<<2];
+            vec_t rowB1 = *(vec_t *) & BO[l<<1];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+         }
+         SAVE_ACC_COMPLEX_21_1
+         AO += temp << 2;
+         BO += temp << 1;
+         CO += 4;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (2, 1)
+#endif
+      }
+      if (m & 1) { /* leftover single row */
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (1, 1)
+#else
+         BO = B;
+         temp = k;
+#endif
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~7)); l+=8) { /* eight k steps per iteration, one accumulator per step */
+            vec_t rowA1 = *(vec_t *) & AO[l<<1];
+            vec_t rowA2 = *(vec_t *) & AO[(l<<1)+2];
+            vec_t rowA3 = *(vec_t *) & AO[(l<<1)+4];
+            vec_t rowA4 = *(vec_t *) & AO[(l<<1)+6];
+            vec_t rowA5 = *(vec_t *) & AO[(l<<1)+8];
+            vec_t rowA6 = *(vec_t *) & AO[(l<<1)+10];
+            vec_t rowA7 = *(vec_t *) & AO[(l<<1)+12];
+            vec_t rowA8 = *(vec_t *) & AO[(l<<1)+14];
+            vec_t rowB1 = *(vec_t *) & BO[l<<1];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
+            vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
+            vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
+            vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8];
+            vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10];
+            vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12];
+            vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf32gerpp(&acc1, rowA2, rowB2);
+            __builtin_mma_xvf32gerpp(&acc2, rowA3, rowB3);
+            __builtin_mma_xvf32gerpp(&acc3, rowA4, rowB4);
+            __builtin_mma_xvf32gerpp(&acc4, rowA5, rowB5);
+            __builtin_mma_xvf32gerpp(&acc5, rowA6, rowB6);
+            __builtin_mma_xvf32gerpp(&acc6, rowA7, rowB7);
+            __builtin_mma_xvf32gerpp(&acc7, rowA8, rowB8);
+         }
+         for (l = (temp & (~7)); l < temp; ++l) { /* remaining 0-7 k steps go to acc0 only */
+            vec_t rowA1 = *(vec_t *) & AO[l<<1];
+            vec_t rowB1 = *(vec_t *) & BO[l<<1];
+            __builtin_mma_xvf32gerpp(&acc0, rowA1, rowB1);
+         }
+         SAVE_ACC_COMPLEX_11
+         AO += temp<<1;
+         BO += temp<<1;
+         CO += 2;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (1, 1)
+#endif
+      }
+#if defined(TRMMKERNEL) && !defined(LEFT)
+      off += 1;                        // advance by this panel's column count (1)
+#endif
+      B += k << 1; /* packed B: 2 FLOATs per k step for a 1-column panel */
+   }
+   return 0;
+}
diff --git a/kernel/power/cnrm2.S b/kernel/power/cnrm2.S
index c115650fd..74117a831 100644
--- a/kernel/power/cnrm2.S
+++ b/kernel/power/cnrm2.S
@@ -99,7 +99,7 @@
 	cmpwi	cr0, N, 0
 	ble-	LL(9999)
 	cmpwi	cr0, INCX, 0
-	ble-	LL(9999)
+	beq-	LL(9999)
 
 	fmr	f0,  f1
 	fmr	f2,  f1
diff --git a/kernel/power/cnrm2_hummer.S b/kernel/power/cnrm2_hummer.S
index 46c29c654..0d036b32f 100644
--- a/kernel/power/cnrm2_hummer.S
+++ b/kernel/power/cnrm2_hummer.S
@@ -119,7 +119,7 @@
 	cmpwi	cr0, N, 0
 	ble	LL(99)
 	cmpwi	cr0, INCX, 0
-	ble	LL(99)
+	beq	LL(99)
 
 	andi.	r0, X, 2 * SIZE - 1
 	bne	LL(100)
diff --git a/kernel/power/cnrm2_ppc440.S b/kernel/power/cnrm2_ppc440.S
index c71c34b7c..8e3abf9f9 100644
--- a/kernel/power/cnrm2_ppc440.S
+++ b/kernel/power/cnrm2_ppc440.S
@@ -104,7 +104,7 @@
 	cmpwi	cr0, N, 0
 	ble-	LL(999)
 	cmpwi	cr0, INCX, 0
-	ble-	LL(999)
+	beq-	LL(999)
 
 	fmr	f0,  f1
 	sub	X, X, INCX
diff --git a/kernel/power/dgemm_small_kernel_nn_power10.c b/kernel/power/dgemm_small_kernel_nn_power10.c
index ecdc3e5c6..73f6d5b99 100644
--- a/kernel/power/dgemm_small_kernel_nn_power10.c
+++ b/kernel/power/dgemm_small_kernel_nn_power10.c
@@ -167,7 +167,7 @@ typedef __vector unsigned char vec_t;
 
 #define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0);
 
-#if (defined(__GNUC__) && (__GNUC__ == 10))
+#if (defined(__GNUC__) && (__GNUC__ == 10 || (__GNUC__ == 11 && __GNUC_MINOR__ <= 2)))
 #if defined(_AIX)
 #define LOAD_PAIR(pair, v0, v1)                             \
   __builtin_vsx_assemble_pair(&pair, (vec_t)v0, (vec_t)v1);
diff --git a/kernel/power/dgemm_small_kernel_tn_power10.c b/kernel/power/dgemm_small_kernel_tn_power10.c
index 93a942b02..426948185 100644
--- a/kernel/power/dgemm_small_kernel_tn_power10.c
+++ b/kernel/power/dgemm_small_kernel_tn_power10.c
@@ -167,7 +167,7 @@ typedef __vector unsigned char vec_t;
 
 #define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0);
 
-#if (defined(__GNUC__) && (__GNUC__ == 10))
+#if (defined(__GNUC__) && (__GNUC__ == 10 || (__GNUC__ == 11 && __GNUC_MINOR__ <= 2)))
 #if defined(_AIX)
 #define LOAD_PAIR(pair, v0, v1)                             \
   __builtin_vsx_assemble_pair(&pair, (vec_t)v0, (vec_t)v1);
diff --git a/kernel/power/dgemm_tcopy_16_power8.S b/kernel/power/dgemm_tcopy_16_power8.S
index fc52e0202..7a3c04e8e 100644
--- a/kernel/power/dgemm_tcopy_16_power8.S
+++ b/kernel/power/dgemm_tcopy_16_power8.S
@@ -107,6 +107,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define o0	0
 
+#ifdef POWER10
+#include "dgemm_tcopy_macros_16_power10.S"
+#endif
 #include "dgemm_tcopy_macros_16_power8.S"
 
 #define STACKSIZE 144
diff --git a/kernel/power/dgemm_tcopy_macros_16_power10.S b/kernel/power/dgemm_tcopy_macros_16_power10.S
new file mode 100644
index 000000000..3cb416e09
--- /dev/null
+++ b/kernel/power/dgemm_tcopy_macros_16_power10.S
@@ -0,0 +1,655 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/21 Werner Saar (wernsaar@googlemail.com)
+* 	 BLASTEST 		: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*	 LAPACK-TEST		: OK
+**************************************************************************************/
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=16
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_4x16', `
+#else
+.macro COPY_4x16
+#endif
+/* Copies 128 bytes from each of A0..A3 into the buffer at BO. A0..A3 are advanced; BO is only read. */
+    lxvpx       vs32,   o0,     A0
+    lxvpx       vs34,   o32,    A0
+	addi		A0,	A0,	64
+/* each lxvpx fills a register pair; the stores below write the pair as two 16-byte halves */
+    lxvpx       vs40,   o0,     A1
+    lxvpx       vs42,   o32,    A1
+	addi		A1,	A1,	64
+
+    lxvpx       vs48,   o0,     A2
+    lxvpx       vs50,   o32,    A2
+	addi		A2,	A2,	64
+
+    lxvpx       vs4,    o0,     A3
+    lxvpx       vs6,    o32,    A3
+	addi		A3,	A3,	64
+/* second 64 bytes of each source pointer */
+    lxvpx       vs36,   o0,     A0
+    lxvpx       vs38,   o32,    A0
+	addi		A0,	A0,	64
+
+    lxvpx       vs44,   o0,     A1
+    lxvpx       vs46,   o32,    A1
+	addi		A1,	A1,	64
+
+    lxvpx       vs12,   o0,     A2
+    lxvpx       vs2,    o32,    A2
+	addi		A2,	A2,	64
+
+    lxvpx       vs8,    o0,     A3
+    lxvpx       vs10,   o32,    A3
+	addi		A3,	A3,	64
+
+	mr		T1,	BO
+/* T1 is the running store pointer. Output order: all of A0, then A1, A2, A3, in 64-byte groups. */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) /* big endian: even register of each pair is stored first */
+	stxvd2x		vs32,	o0,	T1
+	stxvd2x		vs33,	o16,	T1
+	stxvd2x		vs34,	o32,	T1
+	stxvd2x		vs35,	o48,	T1
+#else /* otherwise the odd register of each pair is stored first */
+    stxvd2x     vs33,   o0, T1
+    stxvd2x     vs32,   o16,    T1
+    stxvd2x     vs35,   o32,    T1
+    stxvd2x     vs34,   o48,    T1
+#endif
+	addi		T1,	T1,	64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs36,	o0,	T1
+	stxvd2x		vs37,	o16,	T1
+	stxvd2x		vs38,	o32,	T1
+	stxvd2x		vs39,	o48,	T1
+#else
+    stxvd2x     vs37,   o0, T1
+    stxvd2x     vs36,   o16,    T1
+    stxvd2x     vs39,   o32,    T1
+    stxvd2x     vs38,   o48,    T1
+#endif
+
+	addi		T1,	T1,	64
+/* data loaded from A1 */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs40,	o0,	T1
+	stxvd2x		vs41,	o16,	T1
+	stxvd2x		vs42,	o32,	T1
+	stxvd2x		vs43,	o48,	T1
+#else
+    stxvd2x     vs41,   o0, T1
+    stxvd2x     vs40,   o16,    T1
+    stxvd2x     vs43,   o32,    T1
+    stxvd2x     vs42,   o48,    T1
+#endif
+	addi		T1,	T1,	64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs44,	o0,	T1
+	stxvd2x		vs45,	o16,	T1
+	stxvd2x		vs46,	o32,	T1
+	stxvd2x		vs47,	o48,	T1
+#else
+    stxvd2x     vs45,   o0, T1
+    stxvd2x     vs44,   o16,    T1
+    stxvd2x     vs47,   o32,    T1
+    stxvd2x     vs46,   o48,    T1
+#endif
+
+	addi		T1,	T1,	64
+/* data loaded from A2 */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs48,	o0,	T1
+	stxvd2x		vs49,	o16,	T1
+	stxvd2x		vs50,	o32,	T1
+	stxvd2x		vs51,	o48,	T1
+#else
+    stxvd2x     vs49,   o0, T1
+    stxvd2x     vs48,   o16,    T1
+    stxvd2x     vs51,   o32,    T1
+    stxvd2x     vs50,   o48,    T1
+#endif
+	addi		T1,	T1,	64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs12,	o0,	T1
+	stxvd2x		vs13,	o16,	T1
+	stxvd2x		vs2,	o32,	T1
+	stxvd2x		vs3,	o48,	T1
+#else
+    stxvd2x     vs13,   o0, T1
+    stxvd2x     vs12,   o16,    T1
+    stxvd2x     vs3,    o32,    T1
+    stxvd2x     vs2,    o48,    T1
+#endif
+
+	addi		T1,	T1,	64
+/* data loaded from A3 */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs4,	o0,	T1
+	stxvd2x		vs5,	o16,	T1
+	stxvd2x		vs6,	o32,	T1
+	stxvd2x		vs7,	o48,	T1
+#else
+    stxvd2x     vs5,    o0, T1
+    stxvd2x     vs4,    o16,    T1
+    stxvd2x     vs7,    o32,    T1
+    stxvd2x     vs6,    o48,    T1
+#endif
+	addi		T1,	T1,	64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs8,	o0,	T1
+	stxvd2x		vs9,	o16,	T1
+	stxvd2x		vs10,	o32,	T1
+	stxvd2x		vs11,	o48,	T1
+#else
+    stxvd2x     vs9,    o0, T1
+    stxvd2x     vs8,    o16,    T1
+    stxvd2x     vs11,   o32,    T1
+    stxvd2x     vs10,   o48,    T1
+#endif
+/* T1 is not advanced after the final group */
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=8
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_4x8', `
+#else
+.macro COPY_4x8
+#endif
+/* Copies 64 bytes from each of A0..A3 into the buffer at BO. A0..A3 are advanced; BO is only read. */
+    lxvpx       vs32,   o0,     A0
+    lxvpx       vs34,   o32,    A0
+	addi		A0,	A0,	64
+
+
+    lxvpx       vs36,   o0,     A1
+    lxvpx       vs38,   o32,    A1
+	addi		A1,	A1,	64
+
+
+    lxvpx       vs40,   o0,     A2
+    lxvpx       vs42,   o32,    A2
+	addi		A2,	A2,	64
+
+
+    lxvpx       vs44,   o0,     A3
+    lxvpx       vs46,   o32,    A3
+	addi		A3,	A3,	64
+
+
+	mr		T1,	BO
+/* T1 is the running store pointer. Output order: A0, A1, A2, A3, one 64-byte group each. */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) /* big endian: even register of each pair is stored first */
+	stxvd2x		vs32,	o0,	T1
+	stxvd2x		vs33,	o16,	T1
+	stxvd2x		vs34,	o32,	T1
+	stxvd2x		vs35,	o48,	T1
+#else /* otherwise the odd register of each pair is stored first */
+    stxvd2x     vs33,   o0, T1
+    stxvd2x     vs32,   o16,    T1
+    stxvd2x     vs35,   o32,    T1
+    stxvd2x     vs34,   o48,    T1
+#endif
+
+	addi		T1,	T1,	64
+/* data loaded from A1 */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs36,	o0,	T1
+	stxvd2x		vs37,	o16,	T1
+	stxvd2x		vs38,	o32,	T1
+	stxvd2x		vs39,	o48,	T1
+#else
+    stxvd2x     vs37,   o0, T1
+    stxvd2x     vs36,   o16,    T1
+    stxvd2x     vs39,   o32,    T1
+    stxvd2x     vs38,   o48,    T1
+#endif
+
+	addi		T1,	T1,	64
+/* data loaded from A2 */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs40,	o0,	T1
+	stxvd2x		vs41,	o16,	T1
+	stxvd2x		vs42,	o32,	T1
+	stxvd2x		vs43,	o48,	T1
+#else
+    stxvd2x     vs41,   o0, T1
+    stxvd2x     vs40,   o16,    T1
+    stxvd2x     vs43,   o32,    T1
+    stxvd2x     vs42,   o48,    T1
+#endif
+
+	addi		T1,	T1,	64
+/* data loaded from A3 */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs44,	o0,	T1
+	stxvd2x		vs45,	o16,	T1
+	stxvd2x		vs46,	o32,	T1
+	stxvd2x		vs47,	o48,	T1
+#else
+    stxvd2x     vs45,   o0, T1
+    stxvd2x     vs44,   o16,    T1
+    stxvd2x     vs47,   o32,    T1
+    stxvd2x     vs46,   o48,    T1
+#endif
+/* T1 is not advanced after the final group */
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=4
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_4x4', `
+#else
+.macro COPY_4x4
+#endif
+/* Load 32 bytes (one lxvpx register pair) from each of the four row pointers A0-A3 and advance each pointer by 32. */
+    lxvpx       vs32,   o0,     A0
+	addi		A0,	A0,	32
+
+
+    lxvpx       vs34,   o0,     A1
+	addi		A1,	A1,	32
+
+
+    lxvpx       vs36,   o0,     A2
+	addi		A2,	A2,	32
+
+
+    lxvpx       vs38,   o0,     A3
+	addi		A3,	A3,	32
+
+/* T1 is the store cursor, seeded from BO; BO itself is not modified by this macro. */
+	mr		T1,	BO
+/* Each 64-byte store group holds two rows (A0+A1, then A2+A3). The little-endian branch stores the odd register of each pair first. */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs32,	o0,	T1
+	stxvd2x		vs33,	o16,	T1
+
+	stxvd2x		vs34,	o32,	T1
+	stxvd2x		vs35,	o48,	T1
+#else
+    stxvd2x     vs33,   o0, T1
+    stxvd2x     vs32,   o16,    T1
+
+    stxvd2x     vs35,   o32,    T1
+    stxvd2x     vs34,   o48,    T1
+#endif
+
+	addi		T1,	T1,	64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs36,	o0,	T1
+	stxvd2x		vs37,	o16,	T1
+
+	stxvd2x		vs38,	o32,	T1
+	stxvd2x		vs39,	o48,	T1
+#else
+    stxvd2x     vs37,   o0, T1
+    stxvd2x     vs36,   o16,    T1
+
+    stxvd2x     vs39,   o32,    T1
+    stxvd2x     vs38,   o48,    T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=16
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_2x16', `
+#else
+.macro COPY_2x16
+#endif
+/* Load 128 bytes from each of the two row pointers A0 and A1 (four lxvpx register pairs per row, in two 64-byte steps) and advance each pointer by 128. */
+    lxvpx       vs32,   o0,     A0
+    lxvpx       vs34,   o32,    A0
+	addi		A0,	A0,	64
+
+    lxvpx       vs36,   o0,     A0
+    lxvpx       vs38,   o32,    A0
+	addi		A0,	A0,	64
+
+
+    lxvpx       vs40,   o0,     A1
+    lxvpx       vs42,   o32,    A1
+	addi		A1,	A1,	64
+
+    lxvpx       vs44,   o0,     A1
+    lxvpx       vs46,   o32,    A1
+	addi		A1,	A1,	64
+
+/* T1 is the store cursor, seeded from BO; BO itself is not modified by this macro. */
+	mr		T1,	BO
+/* Four 64-byte store groups: two for row A0, then two for row A1. The little-endian branch stores the odd register of each pair first. */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs32,	o0,	T1
+	stxvd2x		vs33,	o16,	T1
+	stxvd2x		vs34,	o32,	T1
+	stxvd2x		vs35,	o48,	T1
+#else
+    stxvd2x     vs33,   o0, T1
+    stxvd2x     vs32,   o16,    T1
+    stxvd2x     vs35,   o32,    T1
+    stxvd2x     vs34,   o48,    T1
+#endif
+	addi		T1,	T1,	64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs36,	o0,	T1
+	stxvd2x		vs37,	o16,	T1
+	stxvd2x		vs38,	o32,	T1
+	stxvd2x		vs39,	o48,	T1
+#else
+    stxvd2x     vs37,   o0, T1
+    stxvd2x     vs36,   o16,    T1
+    stxvd2x     vs39,   o32,    T1
+    stxvd2x     vs38,   o48,    T1
+#endif
+
+	addi		T1,	T1,	64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs40,	o0,	T1
+	stxvd2x		vs41,	o16,	T1
+	stxvd2x		vs42,	o32,	T1
+	stxvd2x		vs43,	o48,	T1
+#else
+    stxvd2x     vs41,   o0, T1
+    stxvd2x     vs40,   o16,    T1
+    stxvd2x     vs43,   o32,    T1
+    stxvd2x     vs42,   o48,    T1
+#endif
+	addi		T1,	T1,	64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs44,	o0,	T1
+	stxvd2x		vs45,	o16,	T1
+	stxvd2x		vs46,	o32,	T1
+	stxvd2x		vs47,	o48,	T1
+#else
+    stxvd2x     vs45,   o0, T1
+    stxvd2x     vs44,   o16,    T1
+    stxvd2x     vs47,   o32,    T1
+    stxvd2x     vs46,   o48,    T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_2x8', `
+#else
+.macro COPY_2x8
+#endif
+/* Load 64 bytes from each of the two row pointers A0 and A1 (two 32-byte lxvpx register pairs per row) and advance each pointer by 64. */
+    lxvpx       vs32,   o0,     A0
+    lxvpx       vs34,   o32,    A0
+	addi		A0,	A0,	64
+
+/* The second pair of row A1 must come from offset 32, mirroring row A0; loading it from offset 0 would store the first 32 bytes twice. */
+    lxvpx       vs36,   o0,     A1
+    lxvpx       vs38,   o32,    A1
+	addi		A1,	A1,	64
+
+/* T1 is the store cursor, seeded from BO; BO itself is not modified by this macro. */
+	mr		T1,	BO
+/* One 64-byte store group per row. The little-endian branch stores the odd register of each pair first. */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs32,	o0,	T1
+	stxvd2x		vs33,	o16,	T1
+	stxvd2x		vs34,	o32,	T1
+	stxvd2x		vs35,	o48,	T1
+#else
+    stxvd2x     vs33,   o0, T1
+    stxvd2x     vs32,   o16,    T1
+    stxvd2x     vs35,   o32,    T1
+    stxvd2x     vs34,   o48,    T1
+#endif
+
+	addi		T1,	T1,	64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs36,	o0,	T1
+	stxvd2x		vs37,	o16,	T1
+	stxvd2x		vs38,	o32,	T1
+	stxvd2x		vs39,	o48,	T1
+#else
+    stxvd2x     vs37,   o0, T1
+    stxvd2x     vs36,   o16,    T1
+    stxvd2x     vs39,   o32,    T1
+    stxvd2x     vs38,   o48,    T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=4
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_2x4', `
+#else
+.macro COPY_2x4
+#endif
+/* Load 32 bytes (one lxvpx register pair) from each of the two row pointers A0 and A1 and advance each pointer by 32. */
+    lxvpx       vs32,   o0,     A0
+	addi		A0,	A0,	32
+
+
+    lxvpx       vs34,   o0,     A1
+	addi		A1,	A1,	32
+
+/* T1 is the store cursor, seeded from BO; BO itself is not modified by this macro. */
+	mr		T1,	BO
+/* A single 64-byte store group: row A0 then row A1. The little-endian branch stores the odd register of each pair first. */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs32,	o0,	T1
+	stxvd2x		vs33,	o16,	T1
+
+	stxvd2x		vs34,	o32,	T1
+	stxvd2x		vs35,	o48,	T1
+#else
+    stxvd2x     vs33,   o0, T1
+    stxvd2x     vs32,   o16,    T1
+
+    stxvd2x     vs35,   o32,    T1
+    stxvd2x     vs34,   o48,    T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=16
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_1x16', `
+#else
+.macro COPY_1x16
+#endif
+/* Load 128 bytes from the single row pointer A0 (four lxvpx register pairs, in two 64-byte steps) and advance A0 by 128. */
+    lxvpx       vs32,   o0,     A0
+    lxvpx       vs34,   o32,    A0
+	addi		A0,	A0,	64
+/* The last pair must come from offset 32 of the second step; loading it from offset 0 would duplicate bytes 64-95 and drop bytes 96-127. */
+    lxvpx       vs36,   o0,     A0
+    lxvpx       vs38,   o32,    A0
+	addi		A0,	A0,	64
+
+/* T1 is the store cursor, seeded from BO; BO itself is not modified by this macro. */
+	mr		T1,	BO
+/* Two 64-byte store groups. The little-endian branch stores the odd register of each pair first. */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs32,	o0,	T1
+	stxvd2x		vs33,	o16,	T1
+	stxvd2x		vs34,	o32,	T1
+	stxvd2x		vs35,	o48,	T1
+#else
+    stxvd2x     vs33,   o0, T1
+    stxvd2x     vs32,   o16,    T1
+    stxvd2x     vs35,   o32,    T1
+    stxvd2x     vs34,   o48,    T1
+#endif
+	addi		T1,	T1,	64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs36,	o0,	T1
+	stxvd2x		vs37,	o16,	T1
+	stxvd2x		vs38,	o32,	T1
+	stxvd2x		vs39,	o48,	T1
+#else
+    stxvd2x     vs37,   o0, T1
+    stxvd2x     vs36,   o16,    T1
+    stxvd2x     vs39,   o32,    T1
+    stxvd2x     vs38,   o48,    T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=8
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_1x8', `
+#else
+.macro COPY_1x8
+#endif
+/* Load 64 bytes from the single row pointer A0 (two 32-byte lxvpx register pairs) and advance A0 by 64. */
+    lxvpx       vs32,   o0,     A0
+    lxvpx       vs34,   o32,    A0
+	addi		A0,	A0,	64
+
+/* T1 is the store cursor, seeded from BO; BO itself is not modified by this macro. */
+	mr		T1,	BO
+/* One 64-byte store group. The little-endian branch stores the odd register of each pair first. */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs32,	o0,	T1
+	stxvd2x		vs33,	o16,	T1
+	stxvd2x		vs34,	o32,	T1
+	stxvd2x		vs35,	o48,	T1
+#else
+    stxvd2x     vs33,   o0, T1
+    stxvd2x     vs32,   o16,    T1
+    stxvd2x     vs35,   o32,    T1
+    stxvd2x     vs34,   o48,    T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+
+/**********************************************************************************************
+* Macros for N=1 and M=4
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_1x4', `
+#else
+.macro COPY_1x4
+#endif
+/* Load 32 bytes (one lxvpx register pair) from the single row pointer A0 and advance A0 by 32. */
+    lxvpx       vs32,   o0,     A0
+	addi		A0,	A0,	32
+
+/* T1 is the store cursor, seeded from BO; BO itself is not modified by this macro. */
+	mr		T1,	BO
+/* One 32-byte store. The little-endian branch stores the odd register of the pair first. */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+	stxvd2x		vs32,	o0,	T1
+	stxvd2x		vs33,	o16,	T1
+#else
+    stxvd2x     vs33,   o0, T1
+    stxvd2x     vs32,   o16,    T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
diff --git a/kernel/power/dgemm_tcopy_macros_16_power8.S b/kernel/power/dgemm_tcopy_macros_16_power8.S
index 6c5b8ed62..93b488fd0 100644
--- a/kernel/power/dgemm_tcopy_macros_16_power8.S
+++ b/kernel/power/dgemm_tcopy_macros_16_power8.S
@@ -38,6 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * Macros for N=4 and M=16
 **********************************************************************************************/
 
+#ifndef POWER10
 #if defined(_AIX)
 define(`COPY_4x16', `
 #else
@@ -275,6 +276,7 @@ define(`COPY_4x4', `
 #else
 .endm
 #endif
+#endif
 
 
 /**********************************************************************************************
@@ -369,6 +371,7 @@ define(`COPY_4x1', `
 * Macros for N=2 and M=16
 **********************************************************************************************/
 
+#ifndef POWER10
 #if defined(_AIX)
 define(`COPY_2x16', `
 #else
@@ -512,6 +515,7 @@ define(`COPY_2x4', `
 #else
 .endm
 #endif
+#endif
 
 
 /**********************************************************************************************
@@ -580,6 +584,7 @@ define(`COPY_2x1', `
 * Macros for N=1 and M=16
 **********************************************************************************************/
 
+#ifndef POWER10
 #if defined(_AIX)
 define(`COPY_1x16', `
 #else
@@ -675,6 +680,7 @@ define(`COPY_1x4', `
 #else
 .endm
 #endif
+#endif
 
 
 /**********************************************************************************************
diff --git a/kernel/power/dnrm2_hummer.S b/kernel/power/dnrm2_hummer.S
index 4931f5ab1..8638ca424 100644
--- a/kernel/power/dnrm2_hummer.S
+++ b/kernel/power/dnrm2_hummer.S
@@ -134,7 +134,7 @@
 	cmpwi	cr0, N, 0
 	ble	LL(99)
 	cmpwi	cr0, INCX, 0
-	ble	LL(99)
+	beq	LL(99)
 
 	mr	XX, X
 
diff --git a/kernel/power/dnrm2_ppc440.S b/kernel/power/dnrm2_ppc440.S
index 849ca1f35..529f6adf0 100644
--- a/kernel/power/dnrm2_ppc440.S
+++ b/kernel/power/dnrm2_ppc440.S
@@ -111,7 +111,7 @@
 	cmpwi	cr0, N, 0
 	ble-	LL(999)
 	cmpwi	cr0, INCX, 0
-	ble-	LL(999)
+	beq-	LL(999)
 
 	mr	NN, N
 	mr	XX, X
diff --git a/kernel/power/lock.c b/kernel/power/lock.c
index 1c1b006b0..de28680d8 100644
--- a/kernel/power/lock.c
+++ b/kernel/power/lock.c
@@ -36,7 +36,7 @@
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/
 
-static void __inline blas_lock(volatile BLASULONG *address){
+static __inline void blas_lock(volatile BLASULONG *address){
 
 #ifdef __GNUC__
 
diff --git a/kernel/power/nrm2.S b/kernel/power/nrm2.S
index d9e1f4e9a..880b5d1b4 100644
--- a/kernel/power/nrm2.S
+++ b/kernel/power/nrm2.S
@@ -113,7 +113,7 @@
 	cmpwi	cr0, N, 0
 	ble-	LL(9999)
 	cmpwi	cr0, INCX, 0
-	ble-	LL(9999)
+	beq-	LL(9999)
 
 	mr	NN, N
 	mr	XX, X
diff --git a/kernel/power/sbgemm_kernel_power10.c b/kernel/power/sbgemm_kernel_power10.c
index 134929ec1..c3fa67cf6 100644
--- a/kernel/power/sbgemm_kernel_power10.c
+++ b/kernel/power/sbgemm_kernel_power10.c
@@ -336,7 +336,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A,
       if (m & 1)
 	{
 	  IFLOAT *BO = B;
-	  v2sf_t *rowC;
 	  v4sf_t result[4], res[4];
 	  __vector_quad acc0, acc1;
 	  __builtin_mma_xxsetaccz (&acc0);
@@ -492,7 +491,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A,
 	      if (k > 1)
 		l = (k / 2) << 3;
 	      vec_t *rowA = (vec_t *) & (AO[l << 1]);
-	      vec_t *rowB = (vec_t *) & (BO[l]);
 	      vector short rowB_mrg =
 		{ BO[l], 0, BO[l + 1], 0, BO[l + 2], 0, BO[l + 3], 0 };
 	      MMA (&acc0, (vec_t)rowB_mrg, MERGE_HIGH (rowA[0], vzero));
@@ -570,7 +568,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A,
       if (m & 1)
 	{
 	  IFLOAT *BO = B;
-	  v2sf_t *rowC;
 	  v4sf_t result[4], res[4];
 	  __vector_quad acc0;
 	  BLASLONG l = 0;
diff --git a/kernel/power/sgemm_ncopy_16_power.c b/kernel/power/sgemm_ncopy_16_power.c
new file mode 100755
index 000000000..babe1376e
--- /dev/null
+++ b/kernel/power/sgemm_ncopy_16_power.c
@@ -0,0 +1,482 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#include <stdio.h>
+#include <altivec.h>
+#include "common.h"
+
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
+  BLASLONG i, j;
+  /* Packs n lda-strided vectors of a ("columns", m elements each) into b as panels of 16, 8, 4, 2 and 1 columns; always returns 0. */
+  IFLOAT *aoffset;
+  IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
+  IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
+  IFLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12;
+  IFLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16;
+
+  IFLOAT *boffset;
+  IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
+  IFLOAT ctemp05, ctemp06, ctemp07, ctemp08;
+  IFLOAT ctemp09, ctemp10, ctemp11, ctemp12;
+  IFLOAT ctemp13, ctemp14, ctemp15, ctemp16;
+  IFLOAT ctemp17,  ctemp19 ;
+  IFLOAT ctemp21,  ctemp23 ;
+  IFLOAT ctemp25,  ctemp27 ;
+  IFLOAT ctemp29,  ctemp31 ;
+  /* NOTE(review): ctemp06, 08, 10, 12, 14 and 16 are declared but never referenced below. */
+  aoffset = a;
+  boffset = b;
+  j = (n >> 4);
+  if (j > 0){
+    do{
+      aoffset1  = aoffset;
+      aoffset2  = aoffset1  + lda;
+      aoffset3  = aoffset2  + lda;
+      aoffset4  = aoffset3  + lda;
+      aoffset5  = aoffset4  + lda;
+      aoffset6  = aoffset5  + lda;
+      aoffset7  = aoffset6  + lda;
+      aoffset8  = aoffset7  + lda;
+      aoffset9  = aoffset8  + lda;
+      aoffset10 = aoffset9  + lda;
+      aoffset11 = aoffset10 + lda;
+      aoffset12 = aoffset11 + lda;
+      aoffset13 = aoffset12 + lda;
+      aoffset14 = aoffset13 + lda;
+      aoffset15 = aoffset14 + lda;
+      aoffset16 = aoffset15 + lda;
+      aoffset += 16 * lda;
+      i = (m >> 2);
+      if (i > 0){
+	vector float c1, c2, c3, c4, c5, c6, c7, c8;
+	vector float c9, c10, c11, c12, c13, c14, c15, c16;
+	vector float t1, t2, t3, t4, t5, t6, t7, t8;
+	vector float t9, t10, t11, t12;
+	do{
+	   c1 = vec_xl(0, aoffset1);
+	   c2 = vec_xl(0, aoffset2);
+	   c3 = vec_xl(0, aoffset3);
+	   c4 = vec_xl(0, aoffset4);
+	   c5 = vec_xl(0, aoffset5);
+	   c6 = vec_xl(0, aoffset6);
+	   c7 = vec_xl(0, aoffset7);
+	   c8 = vec_xl(0, aoffset8);
+	   c9 = vec_xl(0, aoffset9);
+	   c10 = vec_xl(0, aoffset10);
+	   c11 = vec_xl(0, aoffset11);
+	   c12 = vec_xl(0, aoffset12);
+	   c13 = vec_xl(0, aoffset13);
+	   c14 = vec_xl(0, aoffset14);
+	   c15 = vec_xl(0, aoffset15);
+	   c16 = vec_xl(0, aoffset16);
+	   /* Transpose the 4x16 tile: mergeh interleaves elements 0-1 of neighbouring columns, mergel (further down) elements 2-3. */
+           t1  = vec_mergeh(c1, c2);
+           t2  = vec_mergeh(c3, c4);
+           t3  = vec_mergeh(c5, c6);
+           t4  = vec_mergeh(c7, c8);
+           t9  = vec_mergeh(c9, c10);
+           t10  = vec_mergeh(c11, c12);
+           t11  = vec_mergeh(c13, c14);
+           t12  = vec_mergeh(c15, c16);
+	   /* xxpermdi 0b00 / 0b11 pair up matching 64-bit halves; presumably this yields rows 0 and 1, matching the scalar tail layout below. */
+	   t5 = vec_xxpermdi(t1, t2, 0b00);
+           t6 = vec_xxpermdi(t3, t4, 0b00);
+	   t7 = vec_xxpermdi(t9, t10, 0b00);
+	   t8 = vec_xxpermdi(t11, t12, 0b00);
+
+	   vec_xst(t5, 0, boffset);
+	   vec_xst(t6, 0, boffset+4);
+	   vec_xst(t7, 0, boffset+8);
+	   vec_xst(t8, 0, boffset+12);
+	   t5 = vec_xxpermdi(t1, t2, 0b11);
+	   t6 = vec_xxpermdi(t3, t4, 0b11);
+	   t7 = vec_xxpermdi(t9, t10, 0b11);
+	   t8 = vec_xxpermdi(t11, t12, 0b11);
+	   vec_xst(t5, 0, boffset+16);
+	   vec_xst(t6, 0, boffset+20);
+	   vec_xst(t7, 0, boffset+24);
+	   vec_xst(t8, 0, boffset+28);
+	   /* Same steps on elements 2-3 of every column, stored at boffset+32 and boffset+48. */
+           t1  = vec_mergel(c1, c2);
+           t2  = vec_mergel(c3, c4);
+           t3  = vec_mergel(c5, c6);
+           t4  = vec_mergel(c7, c8);
+           t9  = vec_mergel(c9, c10);
+           t10  = vec_mergel(c11, c12);
+           t11  = vec_mergel(c13, c14);
+           t12  = vec_mergel(c15, c16);
+  	   t5 = vec_xxpermdi(t1, t2, 0b00);
+	   t6 = vec_xxpermdi(t3, t4, 0b00);
+	   t7 = vec_xxpermdi(t9, t10, 0b00);
+	   t8 = vec_xxpermdi(t11, t12, 0b00);
+	   vec_xst(t5, 0, boffset+32);
+	   vec_xst(t6, 0, boffset+36);
+	   vec_xst(t7, 0, boffset+40);
+	   vec_xst(t8, 0, boffset+44);
+
+	   t5 = vec_xxpermdi(t1, t2, 0b11);
+	   t6 = vec_xxpermdi(t3, t4, 0b11);
+	   t7 = vec_xxpermdi(t9, t10, 0b11);
+	   t8 = vec_xxpermdi(t11, t12, 0b11);
+	   vec_xst(t5, 0, boffset+48);
+	   vec_xst(t6, 0, boffset+52);
+	   vec_xst(t7, 0, boffset+56);
+	   vec_xst(t8, 0, boffset+60);
+
+	  aoffset1 +=  4;
+	  aoffset2 +=  4;
+	  aoffset3 +=  4;
+	  aoffset4 +=  4;
+	  aoffset5 +=  4;
+	  aoffset6 +=  4;
+	  aoffset7 +=  4;
+	  aoffset8 +=  4;
+
+	  aoffset9  +=  4;
+	  aoffset10 +=  4;
+	  aoffset11 +=  4;
+	  aoffset12 +=  4;
+	  aoffset13 +=  4;
+	  aoffset14 +=  4;
+	  aoffset15 +=  4;
+	  aoffset16 +=  4;
+	  boffset   += 64;
+
+	  i --;
+	}while(i > 0);
+      }
+      i = (m & 3); /* leftover rows: one element from each of the 16 columns per pass */
+      if (i > 0){
+        do{
+	ctemp01 = *(aoffset1 +  0);
+	ctemp03 = *(aoffset2 +  0);
+	ctemp05 = *(aoffset3 +  0);
+	ctemp07 = *(aoffset4 +  0);
+	ctemp09 = *(aoffset5 +  0);
+	ctemp11 = *(aoffset6 +  0);
+	ctemp13 = *(aoffset7 +  0);
+	ctemp15 = *(aoffset8 +  0);
+
+	ctemp17 = *(aoffset9 +  0);
+	ctemp19 = *(aoffset10 +  0);
+	ctemp21 = *(aoffset11 +  0);
+	ctemp23 = *(aoffset12 +  0);
+	ctemp25 = *(aoffset13 +  0);
+	ctemp27 = *(aoffset14 +  0);
+	ctemp29 = *(aoffset15 +  0);
+	ctemp31 = *(aoffset16 +  0);
+	*(boffset +  0) = ctemp01;
+	*(boffset +  1) = ctemp03;
+	*(boffset +  2) = ctemp05;
+	*(boffset +  3) = ctemp07;
+	*(boffset +  4) = ctemp09;
+	*(boffset +  5) = ctemp11;
+	*(boffset +  6) = ctemp13;
+	*(boffset +  7) = ctemp15;
+
+	*(boffset +  8) = ctemp17;
+	*(boffset +  9) = ctemp19;
+	*(boffset + 10) = ctemp21;
+	*(boffset + 11) = ctemp23;
+	*(boffset + 12) = ctemp25;
+	*(boffset + 13) = ctemp27;
+	*(boffset + 14) = ctemp29;
+	*(boffset + 15) = ctemp31;
+	  aoffset1+=1;
+	  aoffset2+=1;
+	  aoffset3+=1;
+	  aoffset4+=1;
+	  aoffset5+=1;
+	  aoffset6+=1;
+	  aoffset7+=1;
+	  aoffset8+=1;
+	  aoffset9+=1;
+	  aoffset10+=1;
+	  aoffset11+=1;
+	  aoffset12+=1;
+	  aoffset13+=1;
+	  aoffset14+=1;
+	  aoffset15+=1;
+	  aoffset16+=1;
+	boffset  += 16;
+        i --;
+        }while(i > 0);
+      }
+      j--;
+    }while(j > 0);
+  } /* end of if(j > 0) */
+  /* Remaining columns: the same scheme for one panel of 8, then 4, then scalar code for 2 and 1. */
+  if (n & 8){
+    aoffset1  = aoffset;
+    aoffset2  = aoffset1  + lda;
+    aoffset3  = aoffset2  + lda;
+    aoffset4  = aoffset3  + lda;
+    aoffset5  = aoffset4  + lda;
+    aoffset6  = aoffset5  + lda;
+    aoffset7  = aoffset6  + lda;
+    aoffset8  = aoffset7  + lda;
+    aoffset += 8 * lda;
+
+    i = (m >> 2);
+    if (i > 0){
+      vector float c1, c2, c3, c4, c5, c6, c7, c8;
+      vector float t1, t2, t3, t4, t5, t6, t7, t8;
+      do{
+        c1 = vec_xl(0, aoffset1);
+        c2 = vec_xl(0, aoffset2);
+        c3 = vec_xl(0, aoffset3);
+        c4 = vec_xl(0, aoffset4);
+        c5 = vec_xl(0, aoffset5);
+        c6 = vec_xl(0, aoffset6);
+        c7 = vec_xl(0, aoffset7);
+        c8 = vec_xl(0, aoffset8);
+
+        t1  = vec_mergeh(c1, c2);
+        t2  = vec_mergeh(c3, c4);
+        t3  = vec_mergeh(c5, c6);
+        t4  = vec_mergeh(c7, c8);
+
+        t5 = vec_xxpermdi(t1, t2, 0b00);
+        t6 = vec_xxpermdi(t3, t4, 0b00);
+        t7 = vec_xxpermdi(t1, t2, 0b11);
+        t8 = vec_xxpermdi(t3, t4, 0b11);
+
+        vec_xst(t5, 0, boffset);
+        vec_xst(t6, 0, boffset+4);
+        vec_xst(t7, 0, boffset+8);
+        vec_xst(t8, 0, boffset+12);
+
+        t1  = vec_mergel(c1, c2);
+        t2  = vec_mergel(c3, c4);
+        t3  = vec_mergel(c5, c6);
+        t4  = vec_mergel(c7, c8);
+
+        t5 = vec_xxpermdi(t1, t2, 0b00);
+        t6 = vec_xxpermdi(t3, t4, 0b00);
+        t7 = vec_xxpermdi(t1, t2, 0b11);
+        t8 = vec_xxpermdi(t3, t4, 0b11);
+
+        vec_xst(t5, 0, boffset+16);
+        vec_xst(t6, 0, boffset+20);
+        vec_xst(t7, 0, boffset+24);
+        vec_xst(t8, 0, boffset+28);
+
+        aoffset1 +=  4;
+        aoffset2 +=  4;
+        aoffset3 +=  4;
+        aoffset4 +=  4;
+        aoffset5 +=  4;
+        aoffset6 +=  4;
+        aoffset7 +=  4;
+        aoffset8 +=  4;
+
+        boffset   += 32;
+        i--;
+      }while(i > 0);
+    }
+
+    i = (m & 3);
+    if (i > 0) {
+      do {
+        ctemp01 = *(aoffset1 +  0);
+        ctemp03 = *(aoffset2 +  0);
+        ctemp05 = *(aoffset3 +  0);
+        ctemp07 = *(aoffset4 +  0);
+        ctemp09 = *(aoffset5 +  0);
+        ctemp11 = *(aoffset6 +  0);
+        ctemp13 = *(aoffset7 +  0);
+        ctemp15 = *(aoffset8 +  0);
+
+        *(boffset +  0) = ctemp01;
+        *(boffset +  1) = ctemp03;
+        *(boffset +  2) = ctemp05;
+        *(boffset +  3) = ctemp07;
+        *(boffset +  4) = ctemp09;
+        *(boffset +  5) = ctemp11;
+        *(boffset +  6) = ctemp13;
+        *(boffset +  7) = ctemp15;
+
+        aoffset1+=1;
+        aoffset2+=1;
+        aoffset3+=1;
+        aoffset4+=1;
+        aoffset5+=1;
+        aoffset6+=1;
+        aoffset7+=1;
+        aoffset8+=1;
+
+        boffset   += 8;
+        i--;
+      } while (i > 0);
+    }
+  }
+
+  if (n & 4){
+    aoffset1  = aoffset;
+    aoffset2  = aoffset1  + lda;
+    aoffset3  = aoffset2  + lda;
+    aoffset4  = aoffset3  + lda;
+    aoffset += 4 * lda;
+
+    i = (m >> 2);
+    if (i > 0){
+      vector float c1, c2, c3, c4;
+      vector float t1, t2, t3, t4;
+      do{
+        c1 = vec_xl(0, aoffset1);
+        c2 = vec_xl(0, aoffset2);
+        c3 = vec_xl(0, aoffset3);
+        c4 = vec_xl(0, aoffset4);
+
+        t1  = vec_mergeh(c1, c2);
+        t2  = vec_mergeh(c3, c4);
+
+        t3 = vec_xxpermdi(t1, t2, 0b00);
+        t4 = vec_xxpermdi(t1, t2, 0b11);
+
+        vec_xst(t3, 0, boffset);
+        vec_xst(t4, 0, boffset+4);
+
+        t1  = vec_mergel(c1, c2);
+        t2  = vec_mergel(c3, c4);
+
+        t3 = vec_xxpermdi(t1, t2, 0b00);
+        t4 = vec_xxpermdi(t1, t2, 0b11);
+
+        vec_xst(t3, 0, boffset+8);
+        vec_xst(t4, 0, boffset+12);
+
+        aoffset1 +=  4;
+        aoffset2 +=  4;
+        aoffset3 +=  4;
+        aoffset4 +=  4;
+
+        boffset   += 16;
+        i--;
+      }while(i > 0);
+    }
+
+    i = (m & 3);
+    if (i > 0) {
+      do {
+        ctemp01 = *(aoffset1 +  0);
+        ctemp03 = *(aoffset2 +  0);
+        ctemp05 = *(aoffset3 +  0);
+        ctemp07 = *(aoffset4 +  0);
+
+        *(boffset +  0) = ctemp01;
+        *(boffset +  1) = ctemp03;
+        *(boffset +  2) = ctemp05;
+        *(boffset +  3) = ctemp07;
+
+        aoffset1+=1;
+        aoffset2+=1;
+        aoffset3+=1;
+        aoffset4+=1;
+
+        boffset   += 4;
+        i--;
+      } while (i > 0);
+    }
+  }
+  /* Two-column panel: scalar copy, two rows per pass, then one odd row if m is odd. */
+  if (n & 2){
+    aoffset1  = aoffset;
+    aoffset2  = aoffset1  + lda;
+    aoffset += 2 * lda;
+
+    i = (m >> 1);
+    if (i > 0){
+      do{
+	ctemp01 = *(aoffset1 +  0);
+	ctemp02 = *(aoffset1 +  1);
+	ctemp03 = *(aoffset2 +  0);
+	ctemp04 = *(aoffset2 +  1);
+
+	*(boffset +  0) = ctemp01;
+	*(boffset +  1) = ctemp03;
+	*(boffset +  2) = ctemp02;
+	*(boffset +  3) = ctemp04;
+
+	aoffset1 +=  2;
+	aoffset2 +=  2;
+	boffset   += 4;
+
+	i --;
+      }while(i > 0);
+    }
+
+    if (m & 1){
+      ctemp01 = *(aoffset1 +  0);
+      ctemp03 = *(aoffset2 +  0);
+
+      *(boffset +  0) = ctemp01;
+      *(boffset +  1) = ctemp03;
+      boffset   += 2;
+    }
+  }
+  /* Last single column: straight copy of its m elements. */
+  if (n & 1){
+    aoffset1  = aoffset;
+
+    i = (m >> 1);
+    if (i > 0){
+      do{
+	ctemp01 = *(aoffset1 +  0);
+	ctemp02 = *(aoffset1 +  1);
+
+	*(boffset +  0) = ctemp01;
+	*(boffset +  1) = ctemp02;
+
+	aoffset1 +=  2;
+	boffset   += 2;
+
+	i --;
+      }while(i > 0);
+    }
+
+    if (m & 1){
+      ctemp01 = *(aoffset1 +  0);
+
+      *(boffset +  0) = ctemp01;
+      // boffset   += 1;
+    }
+  }
+
+  return 0;
+}
diff --git a/kernel/power/sgemm_tcopy_16_power8.S b/kernel/power/sgemm_tcopy_16_power8.S
index b9f6d63fb..6d2c6a555 100644
--- a/kernel/power/sgemm_tcopy_16_power8.S
+++ b/kernel/power/sgemm_tcopy_16_power8.S
@@ -108,6 +108,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define o0	0
 
+#ifdef POWER10
+#include "sgemm_tcopy_macros_16_power10.S"
+#endif
 #include "sgemm_tcopy_macros_16_power8.S"
 
 #define STACKSIZE 144
diff --git a/kernel/power/sgemm_tcopy_macros_16_power10.S b/kernel/power/sgemm_tcopy_macros_16_power10.S
new file mode 100644
index 000000000..dca37e48a
--- /dev/null
+++ b/kernel/power/sgemm_tcopy_macros_16_power10.S
@@ -0,0 +1,323 @@
+/***************************************************************************
+Copyright (c) 2013-2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2016/04/21 Werner Saar (wernsaar@googlemail.com)
+* 	 BLASTEST 		: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*	 LAPACK-TEST		: OK
+**************************************************************************************/
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=16
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_4x16', `
+#else
+.macro COPY_4x16
+#endif
+    /* Load 64 bytes from each of A0..A3 as two 32-byte register pairs (o32 presumably holds 32). */
+    lxvpx       vs32,   o0,     A0
+    lxvpx       vs34,   o32,    A0
+
+    lxvpx       vs36,   o0,     A1
+    lxvpx       vs38,   o32,    A1
+
+    lxvpx       vs40,   o0,     A2
+    lxvpx       vs42,   o32,    A2
+
+    lxvpx       vs44,   o0,     A3
+    lxvpx       vs46,   o32,    A3
+    /* Each row is stored to its own 64-byte slot; T1 walks the output and BO is not modified here. */
+	mr		T1,	BO
+    /* LE stores the odd register of each pair first: lxvpx puts the lower 16 bytes there (Power ISA 3.1 -- confirm). */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx       vs32,   o0,     T1
+    stxvx       vs33,   o16,    T1
+    stxvx       vs34,   o32,    T1
+    stxvx       vs35,   o48,    T1
+#else
+    stxvx       vs33,   o0,     T1
+    stxvx       vs32,   o16,    T1
+    stxvx       vs35,   o32,    T1
+    stxvx       vs34,   o48,    T1
+#endif
+
+	addi		T1,	T1,	64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx       vs36,   o0,     T1
+    stxvx       vs37,   o16,    T1
+    stxvx       vs38,   o32,    T1
+    stxvx       vs39,   o48,    T1
+#else
+    stxvx       vs37,   o0,     T1
+    stxvx       vs36,   o16,    T1
+    stxvx       vs39,   o32,    T1
+    stxvx       vs38,   o48,    T1
+#endif
+
+	addi		T1,	T1,	64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx       vs40,   o0,     T1
+    stxvx       vs41,   o16,    T1
+    stxvx       vs42,   o32,    T1
+    stxvx       vs43,   o48,    T1
+#else
+    stxvx       vs41,   o0,     T1
+    stxvx       vs40,   o16,    T1
+    stxvx       vs43,   o32,    T1
+    stxvx       vs42,   o48,    T1
+#endif
+
+	addi		T1,	T1,	64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx       vs44,   o0,     T1
+    stxvx       vs45,   o16,    T1
+    stxvx       vs46,   o32,    T1
+    stxvx       vs47,   o48,    T1
+#else
+    stxvx       vs45,   o0,     T1
+    stxvx       vs44,   o16,    T1
+    stxvx       vs47,   o32,    T1
+    stxvx       vs46,   o48,    T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+/**********************************************************************************************
+* Macros for N=4 and M=8
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_4x8', `
+#else
+.macro COPY_4x8
+#endif
+    /* Load 32 bytes from each of A0..A3, one VSX register pair per row. */
+    lxvpx       vs32,   o0,     A0
+
+    lxvpx       vs34,   o0,     A1
+
+    lxvpx       vs36,   o0,     A2
+
+    lxvpx       vs38,   o0,     A3
+    /* Rows are stored back to back: A0,A1 into the first 64-byte slot at BO, A2,A3 into the next one. */
+    mr      T1, BO
+    /* LE stores the odd register of each pair first: lxvpx puts the lower 16 bytes there (Power ISA 3.1 -- confirm). */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx       vs32,   o0,     T1
+    stxvx       vs33,   o16,    T1
+
+    stxvx       vs34,   o32,    T1
+    stxvx       vs35,   o48,    T1
+#else
+    stxvx       vs33,   o0,     T1
+    stxvx       vs32,   o16,    T1
+
+    stxvx       vs35,   o32,    T1
+    stxvx       vs34,   o48,    T1
+#endif
+
+    addi        T1, T1, 64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx       vs36,   o0,     T1
+    stxvx       vs37,   o16,    T1
+
+    stxvx       vs38,   o32,    T1
+    stxvx       vs39,   o48,    T1
+#else
+    stxvx       vs37,   o0,     T1
+    stxvx       vs36,   o16,    T1
+
+    stxvx       vs39,   o32,    T1
+    stxvx       vs38,   o48,    T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+/**********************************************************************************************
+* Macros for N=2 and M=16
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_2x16', `
+#else
+.macro COPY_2x16
+#endif
+    /* Load 64 bytes from each of A0 and A1 as two 32-byte register pairs (o32 presumably holds 32). */
+    lxvpx       vs32,   o0,     A0
+    lxvpx       vs34,   o32,    A0
+
+    lxvpx       vs36,   o0,     A1
+    lxvpx       vs38,   o32,    A1
+    /* Each row is stored to its own 64-byte slot; T1 walks the output and BO is not modified here. */
+	mr		T1,	BO
+    /* LE stores the odd register of each pair first: lxvpx puts the lower 16 bytes there (Power ISA 3.1 -- confirm). */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx       vs32,   o0,     T1
+    stxvx       vs33,   o16,    T1
+    stxvx       vs34,   o32,    T1
+    stxvx       vs35,   o48,    T1
+#else
+    stxvx       vs33,   o0,     T1
+    stxvx       vs32,   o16,    T1
+    stxvx       vs35,   o32,    T1
+    stxvx       vs34,   o48,    T1
+#endif
+
+	addi		T1,	T1,	64
+
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx       vs36,   o0, T1
+    stxvx       vs37,   o16,    T1
+    stxvx       vs38,   o32,    T1
+    stxvx       vs39,   o48,    T1
+#else
+    stxvx       vs37,   o0, T1
+    stxvx       vs36,   o16,    T1
+    stxvx       vs39,   o32,    T1
+    stxvx       vs38,   o48,    T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_2x8', `
+#else
+.macro COPY_2x8
+#endif
+    /* Load 32 bytes from each of A0 and A1, one VSX register pair per row. */
+    lxvpx       vs32,   o0,     A0
+
+    lxvpx       vs34,   o0,     A1
+    /* Both rows are stored back to back into one 64-byte slot at BO; BO is not modified here. */
+    mr      T1, BO
+    /* LE stores the odd register of each pair first: lxvpx puts the lower 16 bytes there (Power ISA 3.1 -- confirm). */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx       vs32,   o0,     T1
+    stxvx       vs33,   o16,    T1
+
+    stxvx       vs34,   o32,    T1
+    stxvx       vs35,   o48,    T1
+#else
+    stxvx       vs33,   o0,     T1
+    stxvx       vs32,   o16,    T1
+
+    stxvx       vs35,   o32,    T1
+    stxvx       vs34,   o48,    T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+/**********************************************************************************************
+* Macros for N=1 and M=16
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_1x16', `
+#else
+.macro COPY_1x16
+#endif
+    /* Load 64 bytes from A0 as two 32-byte register pairs (o32 presumably holds 32). */
+    lxvpx       vs32,   o0,     A0
+    lxvpx       vs34,   o32,    A0
+    /* The row is stored to one 64-byte slot at BO; T1 is a scratch copy and BO is not modified here. */
+	mr		T1,	BO
+    /* LE stores the odd register of each pair first: lxvpx puts the lower 16 bytes there (Power ISA 3.1 -- confirm). */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx       vs32,   o0, T1
+    stxvx       vs33,   o16,    T1
+    stxvx       vs34,   o32,    T1
+    stxvx       vs35,   o48,    T1
+#else
+    stxvx       vs33,   o0, T1
+    stxvx       vs32,   o16,    T1
+    stxvx       vs35,   o32,    T1
+    stxvx       vs34,   o48,    T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
+/**********************************************************************************************
+* Macros for N=1 and M=8
+**********************************************************************************************/
+
+#if defined(_AIX)
+define(`COPY_1x8', `
+#else
+.macro COPY_1x8
+#endif
+    /* Load 32 bytes from A0 into the register pair vs32/vs33. */
+    lxvpx       vs32,   o0,     A0
+    /* The 32 bytes are stored at BO; T1 is a scratch copy and BO is not modified here. */
+    mr      T1, BO
+    /* LE stores the odd register of the pair first: lxvpx puts the lower 16 bytes there (Power ISA 3.1 -- confirm). */
+#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+    stxvx       vs32,   o0, T1
+    stxvx       vs33,   o16,    T1
+#else
+    stxvx       vs33,   o0, T1
+    stxvx       vs32,   o16,    T1
+#endif
+
+#if defined(_AIX)
+')
+#else
+.endm
+#endif
+
diff --git a/kernel/power/sgemm_tcopy_macros_16_power8.S b/kernel/power/sgemm_tcopy_macros_16_power8.S
index ed592a604..af237d5ee 100644
--- a/kernel/power/sgemm_tcopy_macros_16_power8.S
+++ b/kernel/power/sgemm_tcopy_macros_16_power8.S
@@ -38,6 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * Macros for N=4 and M=16
 **********************************************************************************************/
 
+#ifndef POWER10
 #if defined(_AIX)
 define(`COPY_4x16', `
 #else
@@ -141,6 +142,7 @@ define(`COPY_4x8', `
 #else
 .endm
 #endif
+#endif
 
 /**********************************************************************************************
 * Macros for N=4 and M=4
@@ -264,6 +266,7 @@ define(`COPY_4x1', `
 * Macros for N=2 and M=16
 **********************************************************************************************/
 
+#ifndef POWER10
 #if defined(_AIX)
 define(`COPY_2x16', `
 #else
@@ -329,6 +332,7 @@ define(`COPY_2x8', `
 #else
 .endm
 #endif
+#endif
 
 /**********************************************************************************************
 * Macros for N=2 and M=4
@@ -418,6 +422,7 @@ define(`COPY_2x1', `
 * Macros for N=1 and M=16
 **********************************************************************************************/
 
+#ifndef POWER10
 #if defined(_AIX)
 define(`COPY_1x16', `
 #else
@@ -465,6 +470,7 @@ define(`COPY_1x8', `
 #else
 .endm
 #endif
+#endif
 
 /**********************************************************************************************
 * Macros for N=1 and M=4
diff --git a/kernel/power/snrm2.S b/kernel/power/snrm2.S
index be974cc48..696d404bb 100644
--- a/kernel/power/snrm2.S
+++ b/kernel/power/snrm2.S
@@ -97,7 +97,7 @@
 	cmpwi	cr0, N, 0
 	ble-	LL(9999)
 	cmpwi	cr0, INCX, 0
-	ble-	LL(9999)
+	beq-	LL(9999)
 
 	fmr	f0,  f1
 	fmr	f2,  f1
diff --git a/kernel/power/snrm2_hummer.S b/kernel/power/snrm2_hummer.S
index a0ff3d1b2..a4292af78 100644
--- a/kernel/power/snrm2_hummer.S
+++ b/kernel/power/snrm2_hummer.S
@@ -119,7 +119,7 @@
 	cmpwi	cr0, N, 0
 	ble	LL(99)
 	cmpwi	cr0, INCX, 0
-	ble	LL(99)
+	beq	LL(99)
 
 	cmpwi	cr0, INCX, SIZE
 	bne	LL(100)
diff --git a/kernel/power/snrm2_ppc440.S b/kernel/power/snrm2_ppc440.S
index 0a80d1224..3547d7f47 100644
--- a/kernel/power/snrm2_ppc440.S
+++ b/kernel/power/snrm2_ppc440.S
@@ -105,7 +105,7 @@
 	cmpwi	cr0, N, 0
 	ble-	LL(999)
 	cmpwi	cr0, INCX, 0
-	ble-	LL(999)
+	beq-	LL(999)
 
 	fmr	f0,  f1
 	fmr	f2,  f1
diff --git a/kernel/power/zgemm_kernel_power10.c b/kernel/power/zgemm_kernel_power10.c
new file mode 100644
index 000000000..370d12af3
--- /dev/null
+++ b/kernel/power/zgemm_kernel_power10.c
@@ -0,0 +1,736 @@
+/*********************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+#include "common.h"
+#include <altivec.h>
+/* 16-byte vector types: vec_t is the raw operand form handed to the MMA builtins; v4sf_t holds 16/sizeof(FLOAT) FLOATs. */
+typedef __vector unsigned char  vec_t;
+typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
+/* Zero the eight MMA accumulators acc0..acc7, which must be __vector_quad variables in the expanding scope. */
+#define SET_ACC_ZERO()                 \
+      __builtin_mma_xxsetaccz (&acc0); \
+      __builtin_mma_xxsetaccz (&acc1); \
+      __builtin_mma_xxsetaccz (&acc2); \
+      __builtin_mma_xxsetaccz (&acc3); \
+      __builtin_mma_xxsetaccz (&acc4); \
+      __builtin_mma_xxsetaccz (&acc5); \
+      __builtin_mma_xxsetaccz (&acc6); \
+      __builtin_mma_xxsetaccz (&acc7);
+/* COMP_MUL assigns, COMP_MAC accumulates (_real, _imag) from four partial products; variant notes read _arbr as Re(a)*Re(b) etc. -- an assumption from the names. */
+#if (defined(NN) || defined(NT) || defined(TN) || defined(TT)) /* a * b */
+#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real  = _arbr - _aibi; _imag  =  _arbi + _aibr; }
+#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag +=  _arbi + _aibr; }
+#endif
+
+#if (defined(NR) || defined(NC) || defined(TR) || defined(TC)) /* a * conj(b) */
+#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real  = _arbr + _aibi; _imag  = -_arbi + _aibr; }
+#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag += -_arbi + _aibr; }
+#endif
+
+#if (defined(RN) || defined(RT) || defined(CN) || defined(CT)) /* conj(a) * b */
+#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real  = _arbr + _aibi; _imag  =  _arbi - _aibr; }
+#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr + _aibi; _imag +=  _arbi - _aibr; }
+#endif
+
+#if (defined(RR) || defined(RC) || defined(CR) || defined(CC)) /* conj(a) * conj(b) */
+#define COMP_MUL(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real  = _arbr - _aibi; _imag  = -_arbi - _aibr; }
+#define COMP_MAC(_real, _arbr, _aibi, _imag, _arbi, _aibr) { _real += _arbr - _aibi; _imag += -_arbi - _aibr; }
+#endif
+/* A_OP is the operator used when writing to C: plain assignment for TRMM builds, accumulation (+=) otherwise. */
+#if defined(TRMMKERNEL) 
+#define A_OP =
+#else
+#define A_OP +=
+#endif
+/* Spill acc0..acc7 into result[]: accumulator k lands in result[4k..4k+3], i.e. res[8k..8k+7] when FLOAT is 8 bytes. */
+#define BUILTIN_MMA_DISASSEMBLE_ACC_8                             \
+      __builtin_mma_disassemble_acc ((void *)result, &acc0);      \
+      __builtin_mma_disassemble_acc ((void *)&result[4], &acc1);  \
+      __builtin_mma_disassemble_acc ((void *)&result[8], &acc2);  \
+      __builtin_mma_disassemble_acc ((void *)&result[12], &acc3); \
+      __builtin_mma_disassemble_acc ((void *)&result[16], &acc4); \
+      __builtin_mma_disassemble_acc ((void *)&result[20], &acc5); \
+      __builtin_mma_disassemble_acc ((void *)&result[24], &acc6); \
+      __builtin_mma_disassemble_acc ((void *)&result[28], &acc7); 
+/* Fold res[8k+0..3] of every accumulator k = 0..7 into one complex (tr[0], ti[0]), scale by alpha, write CO[0..1] via A_OP. */
+#define SAVE_ACC_COMPLEX_11                                       \
+      BUILTIN_MMA_DISASSEMBLE_ACC_8                               \
+      COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2])  \
+      COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10])  \
+      COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18])  \
+      COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26])  \
+      COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34])  \
+      COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42])  \
+      COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50])  \
+      COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58])  \
+      CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;               \
+      CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;
+/* Using res[8k+0..3] of each accumulator: even k feed (tr[0], ti[0]) -> CO[0..1], odd k feed (tr[1], ti[1]) -> CO[2*ldc+0..1]. */
+#define SAVE_ACC_COMPLEX_12                                       \
+      BUILTIN_MMA_DISASSEMBLE_ACC_8                               \
+      COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2])  \
+      COMP_MUL(tr[1], res[ 8], res[11], ti[1], res[ 9], res[10])  \
+      COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18])  \
+      COMP_MAC(tr[1], res[24], res[27], ti[1], res[25], res[26])  \
+      COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34])  \
+      COMP_MAC(tr[1], res[40], res[43], ti[1], res[41], res[42])  \
+      COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50])  \
+      COMP_MAC(tr[1], res[56], res[59], ti[1], res[57], res[58])  \
+      CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;               \
+      CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;               \
+      CO[2*ldc+0] A_OP tr[1] * alpha_r - ti[1] * alpha_i;         \
+      CO[2*ldc+1] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
+/* Sum all eight accumulators: res[8k+0..3] into (tr[0], ti[0]) and res[8k+4..7] into (tr[1], ti[1]); results go to CO[0..3]. */
+#define SAVE_ACC_COMPLEX_21_1                                     \
+      BUILTIN_MMA_DISASSEMBLE_ACC_8                               \
+      COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2])  \
+      COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6])  \
+      COMP_MAC(tr[0], res[ 8], res[11], ti[0], res[ 9], res[10])  \
+      COMP_MAC(tr[1], res[12], res[15], ti[1], res[13], res[14])  \
+      COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18])  \
+      COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22])  \
+      COMP_MAC(tr[0], res[24], res[27], ti[0], res[25], res[26])  \
+      COMP_MAC(tr[1], res[28], res[31], ti[1], res[29], res[30])  \
+      COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34])  \
+      COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38])  \
+      COMP_MAC(tr[0], res[40], res[43], ti[0], res[41], res[42])  \
+      COMP_MAC(tr[1], res[44], res[47], ti[1], res[45], res[46])  \
+      COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50])  \
+      COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54])  \
+      COMP_MAC(tr[0], res[56], res[59], ti[0], res[57], res[58])  \
+      COMP_MAC(tr[1], res[60], res[63], ti[1], res[61], res[62])  \
+      CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;               \
+      CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;               \
+      CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;               \
+      CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
+/* Even accumulators (0,2,4,6) are summed into tr/ti[0..1], odd ones (1,3,5,7) into tr/ti[2..3]; results go to CO[0..7]. */
+#define SAVE_ACC_COMPLEX_21_2                                     \
+      BUILTIN_MMA_DISASSEMBLE_ACC_8                               \
+      COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2])  \
+      COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6])  \
+      COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10])  \
+      COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14])  \
+      COMP_MAC(tr[0], res[16], res[19], ti[0], res[17], res[18])  \
+      COMP_MAC(tr[1], res[20], res[23], ti[1], res[21], res[22])  \
+      COMP_MAC(tr[2], res[24], res[27], ti[2], res[25], res[26])  \
+      COMP_MAC(tr[3], res[28], res[31], ti[3], res[29], res[30])  \
+      COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34])  \
+      COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38])  \
+      COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42])  \
+      COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46])  \
+      COMP_MAC(tr[0], res[48], res[51], ti[0], res[49], res[50])  \
+      COMP_MAC(tr[1], res[52], res[55], ti[1], res[53], res[54])  \
+      COMP_MAC(tr[2], res[56], res[59], ti[2], res[57], res[58])  \
+      COMP_MAC(tr[3], res[60], res[63], ti[3], res[61], res[62])  \
+      CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;               \
+      CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;               \
+      CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;               \
+      CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;               \
+      CO[4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;               \
+      CO[5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;               \
+      CO[6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;               \
+      CO[7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
+/* Accumulators k and k+4 (k = 0..3) are summed pairwise into tr/ti[2k..2k+1]; the eight results go to CO[0..15]. */
+#define SAVE_ACC_COMPLEX_21_4                                     \
+      BUILTIN_MMA_DISASSEMBLE_ACC_8                               \
+      COMP_MUL(tr[0], res[ 0], res[ 3], ti[0], res[ 1], res[ 2])  \
+      COMP_MUL(tr[1], res[ 4], res[ 7], ti[1], res[ 5], res[ 6])  \
+      COMP_MUL(tr[2], res[ 8], res[11], ti[2], res[ 9], res[10])  \
+      COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14])  \
+      COMP_MUL(tr[4], res[16], res[19], ti[4], res[17], res[18])  \
+      COMP_MUL(tr[5], res[20], res[23], ti[5], res[21], res[22])  \
+      COMP_MUL(tr[6], res[24], res[27], ti[6], res[25], res[26])  \
+      COMP_MUL(tr[7], res[28], res[31], ti[7], res[29], res[30])  \
+      COMP_MAC(tr[0], res[32], res[35], ti[0], res[33], res[34])  \
+      COMP_MAC(tr[1], res[36], res[39], ti[1], res[37], res[38])  \
+      COMP_MAC(tr[2], res[40], res[43], ti[2], res[41], res[42])  \
+      COMP_MAC(tr[3], res[44], res[47], ti[3], res[45], res[46])  \
+      COMP_MAC(tr[4], res[48], res[51], ti[4], res[49], res[50])  \
+      COMP_MAC(tr[5], res[52], res[55], ti[5], res[53], res[54])  \
+      COMP_MAC(tr[6], res[56], res[59], ti[6], res[57], res[58])  \
+      COMP_MAC(tr[7], res[60], res[63], ti[7], res[61], res[62])  \
+      CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;              \
+      CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;              \
+      CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;              \
+      CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;              \
+      CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;              \
+      CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;              \
+      CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;              \
+      CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;              \
+      CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i;              \
+      CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i;              \
+      CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i;              \
+      CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i;              \
+      CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i;              \
+      CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i;              \
+      CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i;              \
+      CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i;
+/* Only acc0 and acc1 are read: acc0 yields tr/ti[0..1] -> CO[0..3], acc1 yields tr/ti[2..3] -> CO[2*ldc+0..3]. */
+#define SAVE_ACC_COMPLEX_22_1                                      \
+      __builtin_mma_disassemble_acc ((void *)result, &acc0);       \
+      __builtin_mma_disassemble_acc ((void *)(&result[4]), &acc1); \
+      COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2])       \
+      COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6])       \
+      COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10])     \
+      COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14] )  \
+      CO[0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;                \
+      CO[1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;                \
+      CO[2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;                \
+      CO[3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;                \
+      CO[2*ldc+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i;          \
+      CO[2*ldc+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i;          \
+      CO[2*ldc+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i;          \
+      CO[2*ldc+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
+/* Same store pattern as SAVE_ACC_COMPLEX_22_1, but for the accumulator pointers ACC1/ACC2 and with every C index offset by CI. */
+#define SAVE_ACC_COMPLEX_22_2(ACC1, ACC2, CI)                     \
+      __builtin_mma_disassemble_acc ((void *)result, ACC1);       \
+      __builtin_mma_disassemble_acc ((void *)(&result[4]), ACC2); \
+      COMP_MUL(tr[0], res[0], res[3], ti[0], res[1], res[2])      \
+      COMP_MUL(tr[1], res[4], res[7], ti[1], res[5], res[6])      \
+      COMP_MUL(tr[2], res[8], res[11], ti[2], res[9], res[10])    \
+      COMP_MUL(tr[3], res[12], res[15], ti[3], res[13], res[14])  \
+      CO[CI+0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;            \
+      CO[CI+1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;            \
+      CO[CI+2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;            \
+      CO[CI+3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;            \
+      CO[2*ldc+CI+0] A_OP tr[2] * alpha_r - ti[2] * alpha_i;      \
+      CO[2*ldc+CI+1] A_OP ti[2] * alpha_r + tr[2] * alpha_i;      \
+      CO[2*ldc+CI+2] A_OP tr[3] * alpha_r - ti[3] * alpha_i;      \
+      CO[2*ldc+CI+3] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
+/* Emit a dcbt (data cache block touch) hint with x and y as its two register operands; the expansion already ends in ';'. */
+#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
+/* TRMM helpers: REFRESH_TEMP_BK sets the k-loop bound temp from k, off and tile size (x, y); REFRESH_POINTERS also positions AO/BO. */
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+#define REFRESH_TEMP_BK(x, y) \
+      temp = k - off;
+#elif defined(LEFT)
+#define REFRESH_TEMP_BK(x, y) \
+      temp = off + x;
+#else
+#define REFRESH_TEMP_BK(x, y) \
+      temp = off + y;
+#endif
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+#define REFRESH_POINTERS(x, y)  \
+      BO = B;                   \
+      REFRESH_TEMP_BK(x, y)
+#else
+#define REFRESH_POINTERS(x, y)  \
+      AO += off * (2*x);        \
+      BO = B + off * (2*y);     \
+      REFRESH_TEMP_BK(x, y)
+#endif
+/* With LEFT defined, advance off by the tile extent x; otherwise the macro expands to nothing. */
+#ifdef LEFT
+#define REFRESH_OFF(x) \
+      off += x;
+#else
+#define REFRESH_OFF(x)
+#endif
+/* Subtract the tile extent from temp: x when LEFT is defined, y otherwise. */
+#ifdef LEFT
+#define UPDATE_TEMP(x, y) \
+      temp -= x;
+#else
+#define UPDATE_TEMP(x, y) \
+      temp -= y;
+#endif
+/* For (LEFT && TRANSA) or (!LEFT && !TRANSA): advance AO/BO past the remaining k - off - tile steps; otherwise expands to nothing. */
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+#define REFRESH_TMP_AFTER_SAVE(x, y) \
+      temp = k - off;                \
+      UPDATE_TEMP(x, y)              \
+      AO += temp * (2*x);            \
+      BO += temp * (2*y);
+#else
+#define REFRESH_TMP_AFTER_SAVE(x, y)
+#endif
+/* TRMM bookkeeping after a tile is stored: REFRESH_TMP_AFTER_SAVE(x, y) followed by REFRESH_OFF(x). */
+#define REFRESH_AFTER_SAVE(x,y)    \
+      REFRESH_TMP_AFTER_SAVE(x, y) \
+      REFRESH_OFF(x)
+/*************************************************************************************
+* GEMM Kernel
+*************************************************************************************/
+int
+#ifdef TRMMKERNEL
+CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, 
+       FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc, BLASLONG offset)
+#else
+CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i, 
+       FLOAT * A, FLOAT * B, FLOAT * C, BLASLONG ldc)
+#endif
+{
+   BLASLONG i1, i, l, temp;
+   FLOAT *AO, *BO, *CO;
+#if defined(TRMMKERNEL)
+   BLASLONG off;
+#endif
+#if defined(TRMMKERNEL) && !defined(LEFT)
+   off = -offset;
+#endif
+   __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
+
+   v4sf_t result[32];
+   FLOAT *res, tr[16], ti[16];
+   res = (FLOAT *) result;
+
+   for (i1 = 0; i1 < (n >> 1); i1++) {
+#if defined(TRMMKERNEL) && defined(LEFT)
+      off = offset;
+#endif
+      AO = A;
+      CO = C;
+      C += ldc<<2;
+      for (i = 0; i < (m >> 3); i++) {
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (8, 2)
+#else
+         BO = B;
+         temp = k;
+#endif
+         SET_ACC_ZERO()
+         for (l = 0; l < temp; ++l) {
+            __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
+            __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
+            __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
+            __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12]));
+            vec_t rowB1 = *(vec_t *) & BO[l<<2];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
+            __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
+            __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1);
+            __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1);
+            __builtin_mma_xvf64gerpp(&acc4, rowA1, rowB2);
+            __builtin_mma_xvf64gerpp(&acc5, rowA2, rowB2);
+            __builtin_mma_xvf64gerpp(&acc6, rowA3, rowB2);
+            __builtin_mma_xvf64gerpp(&acc7, rowA4, rowB2);
+         }
+         __builtin_mma_disassemble_acc ((void *)result,        &acc0);
+         __builtin_mma_disassemble_acc ((void *)(&result[ 4]), &acc1);
+         __builtin_mma_disassemble_acc ((void *)(&result[ 8]), &acc2);
+         __builtin_mma_disassemble_acc ((void *)(&result[12]), &acc3);
+         __builtin_mma_disassemble_acc ((void *)(&result[16]), &acc4);
+         __builtin_mma_disassemble_acc ((void *)(&result[20]), &acc5);
+         __builtin_mma_disassemble_acc ((void *)(&result[24]), &acc6);
+         __builtin_mma_disassemble_acc ((void *)(&result[28]), &acc7);
+         COMP_MUL(tr[ 0], res[ 0], res[ 3], ti[ 0], res[ 1], res[ 2])
+         COMP_MUL(tr[ 1], res[ 4], res[ 7], ti[ 1], res[ 5], res[ 6])
+         COMP_MUL(tr[ 2], res[ 8], res[11], ti[ 2], res[ 9], res[10])
+         COMP_MUL(tr[ 3], res[12], res[15], ti[ 3], res[13], res[14])
+         COMP_MUL(tr[ 4], res[16], res[19], ti[ 4], res[17], res[18])
+         COMP_MUL(tr[ 5], res[20], res[23], ti[ 5], res[21], res[22])
+         COMP_MUL(tr[ 6], res[24], res[27], ti[ 6], res[25], res[26])
+         COMP_MUL(tr[ 7], res[28], res[31], ti[ 7], res[29], res[30])
+         COMP_MUL(tr[ 8], res[32], res[35], ti[ 8], res[33], res[34])
+         COMP_MUL(tr[ 9], res[36], res[39], ti[ 9], res[37], res[38])
+         COMP_MUL(tr[10], res[40], res[43], ti[10], res[41], res[42])
+         COMP_MUL(tr[11], res[44], res[47], ti[11], res[45], res[46])
+         COMP_MUL(tr[12], res[48], res[51], ti[12], res[49], res[50])
+         COMP_MUL(tr[13], res[52], res[55], ti[13], res[53], res[54])
+         COMP_MUL(tr[14], res[56], res[59], ti[14], res[57], res[58])
+         COMP_MUL(tr[15], res[60], res[63], ti[15], res[61], res[62])
+         CO[ 0] A_OP tr[0] * alpha_r - ti[0] * alpha_i;
+         CO[ 1] A_OP ti[0] * alpha_r + tr[0] * alpha_i;
+         CO[ 2] A_OP tr[1] * alpha_r - ti[1] * alpha_i;
+         CO[ 3] A_OP ti[1] * alpha_r + tr[1] * alpha_i;
+         CO[ 4] A_OP tr[2] * alpha_r - ti[2] * alpha_i;
+         CO[ 5] A_OP ti[2] * alpha_r + tr[2] * alpha_i;
+         CO[ 6] A_OP tr[3] * alpha_r - ti[3] * alpha_i;
+         CO[ 7] A_OP ti[3] * alpha_r + tr[3] * alpha_i;
+         CO[ 8] A_OP tr[4] * alpha_r - ti[4] * alpha_i;
+         CO[ 9] A_OP ti[4] * alpha_r + tr[4] * alpha_i;
+         CO[10] A_OP tr[5] * alpha_r - ti[5] * alpha_i;
+         CO[11] A_OP ti[5] * alpha_r + tr[5] * alpha_i;
+         CO[12] A_OP tr[6] * alpha_r - ti[6] * alpha_i;
+         CO[13] A_OP ti[6] * alpha_r + tr[6] * alpha_i;
+         CO[14] A_OP tr[7] * alpha_r - ti[7] * alpha_i;
+         CO[15] A_OP ti[7] * alpha_r + tr[7] * alpha_i;
+         CO[2*ldc+ 0] A_OP tr[ 8] * alpha_r - ti[ 8] * alpha_i;
+         CO[2*ldc+ 1] A_OP ti[ 8] * alpha_r + tr[ 8] * alpha_i;
+         CO[2*ldc+ 2] A_OP tr[ 9] * alpha_r - ti[ 9] * alpha_i;
+         CO[2*ldc+ 3] A_OP ti[ 9] * alpha_r + tr[ 9] * alpha_i;
+         CO[2*ldc+ 4] A_OP tr[10] * alpha_r - ti[10] * alpha_i;
+         CO[2*ldc+ 5] A_OP ti[10] * alpha_r + tr[10] * alpha_i;
+         CO[2*ldc+ 6] A_OP tr[11] * alpha_r - ti[11] * alpha_i;
+         CO[2*ldc+ 7] A_OP ti[11] * alpha_r + tr[11] * alpha_i;
+         CO[2*ldc+ 8] A_OP tr[12] * alpha_r - ti[12] * alpha_i;
+         CO[2*ldc+ 9] A_OP ti[12] * alpha_r + tr[12] * alpha_i;
+         CO[2*ldc+10] A_OP tr[13] * alpha_r - ti[13] * alpha_i;
+         CO[2*ldc+11] A_OP ti[13] * alpha_r + tr[13] * alpha_i;
+         CO[2*ldc+12] A_OP tr[14] * alpha_r - ti[14] * alpha_i;
+         CO[2*ldc+13] A_OP ti[14] * alpha_r + tr[14] * alpha_i;
+         CO[2*ldc+14] A_OP tr[15] * alpha_r - ti[15] * alpha_i;
+         CO[2*ldc+15] A_OP ti[15] * alpha_r + tr[15] * alpha_i;
+
+         AO += temp << 4;
+         BO += temp << 2;
+         CO += 16;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (8, 2)
+#endif
+      }
+      if (m & 4) {
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (4, 2)
+#else
+         BO = B;
+         temp = k;
+#endif
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~1)); l+=2) {
+            __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
+            __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
+            __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8]));
+            __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12]));
+            vec_t rowB1 = *(vec_t *) & BO[l<<2];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
+            vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4];
+            vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6];
+            __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
+            __builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2);
+            __builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2);
+            __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB3);
+            __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB3);
+            __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB4);
+            __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
+         }
+         for (l = (temp & (~1)); l < temp; ++l) {
+            __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
+            __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
+            vec_t rowB1 = *(vec_t *) & BO[l<<2];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
+            __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
+            __builtin_mma_xvf64gerpp(&acc2, rowA1, rowB2);
+            __builtin_mma_xvf64gerpp(&acc3, rowA2, rowB2);
+         }
+         SAVE_ACC_COMPLEX_22_2(&acc0, &acc2, 0)
+         SAVE_ACC_COMPLEX_22_2(&acc1, &acc3, 4)
+         AO += temp << 3;
+         BO += temp << 2;
+         CO += 8;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (4, 2)
+#endif
+      }
+      if (m & 2) {
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (2, 2)
+#else
+         BO = B;
+         temp = k;
+#endif
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~3)); l+=4) {
+            __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
+            __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4]));
+            __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8]));
+            __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12]));
+            vec_t rowB1 = *(vec_t *) & BO[l<<2];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
+            vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4];
+            vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6];
+            vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8];
+            vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10];
+            vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12];
+            vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14];
+            __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
+            __builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3);
+            __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4);
+            __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5);
+            __builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6);
+            __builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7);
+            __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8);
+         }
+         for (l = (temp & (~3)); l < temp; ++l) {
+            __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
+            vec_t rowB1 = *(vec_t *) & BO[l<<2];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
+            __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
+         }
+         SAVE_ACC_COMPLEX_22_1
+         AO += temp << 2;
+         BO += temp << 2;
+         CO += 4;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (2, 2)
+#endif
+      }
+      if (m & 1) {
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (1, 2)
+#else
+         BO = B;
+         temp = k;
+#endif
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~3)); l+=4) {
+            __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
+            __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2]));
+            __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4]));
+            __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6]));
+            vec_t rowB1 = *(vec_t *) & BO[l<<2];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
+            vec_t rowB3 = *(vec_t *) & BO[(l<<2)+4];
+            vec_t rowB4 = *(vec_t *) & BO[(l<<2)+6];
+            vec_t rowB5 = *(vec_t *) & BO[(l<<2)+8];
+            vec_t rowB6 = *(vec_t *) & BO[(l<<2)+10];
+            vec_t rowB7 = *(vec_t *) & BO[(l<<2)+12];
+            vec_t rowB8 = *(vec_t *) & BO[(l<<2)+14];
+            __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
+            __builtin_mma_xvf64gerpp(&acc0, rowA2, rowB3);
+            __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB4);
+            __builtin_mma_xvf64gerpp(&acc0, rowA3, rowB5);
+            __builtin_mma_xvf64gerpp(&acc1, rowA3, rowB6);
+            __builtin_mma_xvf64gerpp(&acc0, rowA4, rowB7);
+            __builtin_mma_xvf64gerpp(&acc1, rowA4, rowB8);
+         }
+         for (l = (temp & (~3)); l < temp; ++l) {
+            __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
+            vec_t rowB1 = *(vec_t *) & BO[l<<2];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<2)+2];
+            __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf64gerpp(&acc1, rowA1, rowB2);
+         }
+         SAVE_ACC_COMPLEX_12                
+         AO += temp << 1;
+         BO += temp << 2;
+         CO += 2;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (1, 2)
+#endif
+      }
+#if defined(TRMMKERNEL) && !defined(LEFT)
+      off += 2;                 // number of values in A
+#endif
+      B += k << 2;
+   }
+   if (n & 1) {
+#if defined(TRMMKERNEL) && defined(LEFT)
+      off = offset;
+#endif
+      AO = A;
+      CO = C;
+      C += ldc<<1;
+      for (i = 0; i < (m >> 3); i++) {
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (8, 1)
+#else
+         BO = B;
+         temp = k;
+#endif
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~1)); l+=2) {
+            __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
+            __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
+            __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
+            __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12]));
+            __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<4)+16]));
+            __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<4)+20]));
+            __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<4)+24]));
+            __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<4)+28]));
+            vec_t rowB1 = *(vec_t *) & BO[l<<1];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
+            __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
+            __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1);
+            __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1);
+            __builtin_mma_xvf64gerpp(&acc0, rowA5, rowB2);
+            __builtin_mma_xvf64gerpp(&acc1, rowA6, rowB2);
+            __builtin_mma_xvf64gerpp(&acc2, rowA7, rowB2);
+            __builtin_mma_xvf64gerpp(&acc3, rowA8, rowB2);
+         }
+         for (l = (temp & (~1)); l < temp; ++l) {
+            __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<4]));
+            __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<4)+4]));
+            __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<4)+8]));
+            __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<4)+12]));
+            vec_t rowB1 = *(vec_t *) & BO[l<<1];
+            __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
+            __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB1);
+            __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB1);
+         }
+         SAVE_ACC_COMPLEX_21_4
+
+         AO += temp << 4;
+         BO += temp << 1;
+         CO += 16;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (8, 1)
+#endif
+      }
+      if (m & 4) {
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (4, 1)
+#else
+         BO = B;
+         temp = k;
+#endif
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~3)); l+=4) {
+            __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
+            __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
+            __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<3)+8]));
+            __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<3)+12]));
+            __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<3)+16]));
+            __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<3)+20]));
+            __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<3)+24]));
+            __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<3)+28]));
+            vec_t rowB1 = *(vec_t *) & BO[l<<1];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
+            vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
+            vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
+            __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
+            __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB2);
+            __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB2);
+            __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB3);
+            __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB3);
+            __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB4);
+            __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB4);
+         }
+         for (l = (temp & (~3)); l < temp; ++l) {
+            __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<3]));
+            __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<3)+4]));
+            vec_t rowB1 = *(vec_t *) & BO[l<<1];
+            __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB1);
+         }
+         SAVE_ACC_COMPLEX_21_2
+         AO += temp << 3;
+         BO += temp << 1;
+         CO += 8;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (4, 1)
+#endif
+      } 
+      if (m & 2) {
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (2, 1)
+#else
+         BO = B;
+         temp = k;
+#endif
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~7)); l+=8) {
+            __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
+            __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<2)+4]));
+            __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<2)+8]));
+            __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<2)+12]));
+            __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<2)+16]));
+            __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<2)+20]));
+            __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<2)+24]));
+            __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<2)+28]));
+            vec_t rowB1 = *(vec_t *) & BO[l<<1];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
+            vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
+            vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
+            vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8];
+            vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10];
+            vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12];
+            vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14];
+            __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2);
+            __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3);
+            __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
+            __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5);
+            __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6);
+            __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7);
+            __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8);
+         }
+         for (l = (temp & (~7)); l < temp; ++l) {
+            __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<2]));
+            vec_t rowB1 = *(vec_t *) & BO[l<<1];
+            __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+         }
+         SAVE_ACC_COMPLEX_21_1
+         AO += temp << 2;
+         BO += temp << 1;
+         CO += 4;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (2, 1)
+#endif
+      }
+      if (m & 1) {
+#if defined(TRMMKERNEL)
+         REFRESH_POINTERS (1, 1)
+#else
+         BO = B;
+         temp = k;
+#endif
+         SET_ACC_ZERO()
+         for (l = 0; l < (temp & (~7)); l+=8) {
+            __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
+            __vector_pair rowA2 = *((__vector_pair *)((void *)&AO[(l<<1)+2]));
+            __vector_pair rowA3 = *((__vector_pair *)((void *)&AO[(l<<1)+4]));
+            __vector_pair rowA4 = *((__vector_pair *)((void *)&AO[(l<<1)+6]));
+            __vector_pair rowA5 = *((__vector_pair *)((void *)&AO[(l<<1)+8]));
+            __vector_pair rowA6 = *((__vector_pair *)((void *)&AO[(l<<1)+10]));
+            __vector_pair rowA7 = *((__vector_pair *)((void *)&AO[(l<<1)+12]));
+            __vector_pair rowA8 = *((__vector_pair *)((void *)&AO[(l<<1)+14]));
+            vec_t rowB1 = *(vec_t *) & BO[l<<1];
+            vec_t rowB2 = *(vec_t *) & BO[(l<<1)+2];
+            vec_t rowB3 = *(vec_t *) & BO[(l<<1)+4];
+            vec_t rowB4 = *(vec_t *) & BO[(l<<1)+6];
+            vec_t rowB5 = *(vec_t *) & BO[(l<<1)+8];
+            vec_t rowB6 = *(vec_t *) & BO[(l<<1)+10];
+            vec_t rowB7 = *(vec_t *) & BO[(l<<1)+12];
+            vec_t rowB8 = *(vec_t *) & BO[(l<<1)+14];
+            __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+            __builtin_mma_xvf64gerpp(&acc1, rowA2, rowB2);
+            __builtin_mma_xvf64gerpp(&acc2, rowA3, rowB3);
+            __builtin_mma_xvf64gerpp(&acc3, rowA4, rowB4);
+            __builtin_mma_xvf64gerpp(&acc4, rowA5, rowB5);
+            __builtin_mma_xvf64gerpp(&acc5, rowA6, rowB6);
+            __builtin_mma_xvf64gerpp(&acc6, rowA7, rowB7);
+            __builtin_mma_xvf64gerpp(&acc7, rowA8, rowB8);
+         }
+         for (l = (temp & (~7)); l < temp; ++l) {
+            __vector_pair rowA1 = *((__vector_pair *)((void *)&AO[l<<1]));
+            vec_t rowB1 = *(vec_t *) & BO[l<<1];
+            __builtin_mma_xvf64gerpp(&acc0, rowA1, rowB1);
+         }
+         SAVE_ACC_COMPLEX_11       
+         AO += temp << 1;
+         BO += temp << 1;
+         CO += 2;
+#if defined(TRMMKERNEL)
+         REFRESH_AFTER_SAVE (1, 1)
+#endif
+      }
+#if defined(TRMMKERNEL) && !defined(LEFT)
+      off += 1;                 // number of values in A
+#endif
+      B += k << 1;
+   }
+   return 0;
+}
diff --git a/kernel/power/znrm2.S b/kernel/power/znrm2.S
index 60f379d25..3048e3480 100644
--- a/kernel/power/znrm2.S
+++ b/kernel/power/znrm2.S
@@ -105,7 +105,7 @@
 	cmpwi	cr0, N, 0
 	ble-	LL(9999)
 	cmpwi	cr0, INCX, 0
-	ble-	LL(9999)
+	beq-	LL(9999)
 
 	mr	NN, N
 	mr	XX, X
diff --git a/kernel/power/znrm2_hummer.S b/kernel/power/znrm2_hummer.S
index 1d0c598f8..4ef2212df 100644
--- a/kernel/power/znrm2_hummer.S
+++ b/kernel/power/znrm2_hummer.S
@@ -134,7 +134,7 @@
 	cmpwi	cr0, N, 0
 	ble	LL(99)
 	cmpwi	cr0, INCX, 0
-	ble	LL(99)
+	beq	LL(99)
 
 	mr	XX, X
 
diff --git a/kernel/power/znrm2_ppc440.S b/kernel/power/znrm2_ppc440.S
index 778b805de..f775c3e62 100644
--- a/kernel/power/znrm2_ppc440.S
+++ b/kernel/power/znrm2_ppc440.S
@@ -112,7 +112,7 @@
 	cmpwi	cr0, N, 0
 	ble-	LL(999)
 	cmpwi	cr0, INCX, 0
-	ble-	LL(999)
+	beq-	LL(999)
 
 	mr	NN, N
 	mr	XX, X
diff --git a/kernel/riscv64/KERNEL.C910V b/kernel/riscv64/KERNEL.C910V
index 0da66fa35..2798a870e 100644
--- a/kernel/riscv64/KERNEL.C910V
+++ b/kernel/riscv64/KERNEL.C910V
@@ -59,6 +59,7 @@ SDOTKERNEL   = dot_vector.c
 DDOTKERNEL   = dot_vector.c
 CDOTKERNEL   = zdot_vector.c
 ZDOTKERNEL   = zdot_vector.c
+DSDOTKERNEL  = dsdot_vector.c
 
 SNRM2KERNEL  = nrm2_vector.c
 DNRM2KERNEL  = nrm2_vector.c
diff --git a/kernel/riscv64/KERNEL.RISCV64_GENERIC b/kernel/riscv64/KERNEL.RISCV64_GENERIC
index 61a8a2b91..15bcd2289 100644
--- a/kernel/riscv64/KERNEL.RISCV64_GENERIC
+++ b/kernel/riscv64/KERNEL.RISCV64_GENERIC
@@ -45,6 +45,11 @@ DAXPYKERNEL  = ../riscv64/axpy.c
 CAXPYKERNEL  = ../riscv64/zaxpy.c
 ZAXPYKERNEL  = ../riscv64/zaxpy.c
 
+SAXPBYKERNEL  = ../riscv64/axpby.c
+DAXPBYKERNEL  = ../riscv64/axpby.c
+CAXPBYKERNEL  = ../riscv64/zaxpby.c
+ZAXPBYKERNEL  = ../riscv64/zaxpby.c
+
 SCOPYKERNEL  = ../riscv64/copy.c
 DCOPYKERNEL  = ../riscv64/copy.c
 CCOPYKERNEL  = ../riscv64/zcopy.c
diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL128B b/kernel/riscv64/KERNEL.RISCV64_ZVL128B
new file mode 100644
index 000000000..fec69ee09
--- /dev/null
+++ b/kernel/riscv64/KERNEL.RISCV64_ZVL128B
@@ -0,0 +1,253 @@
+# Kernel-to-source-file assignments for this target (KERNEL.RISCV64_ZVL128B).
+# Bare file names such as amax_rvv.c carry no directory part; paths starting
+# with ../generic/ refer to the generic kernel directory.
+
+SAMAXKERNEL  = amax_rvv.c
+DAMAXKERNEL  = amax_rvv.c
+CAMAXKERNEL  = zamax_rvv.c
+ZAMAXKERNEL  = zamax_rvv.c
+
+SAMINKERNEL  = amin_rvv.c
+DAMINKERNEL  = amin_rvv.c
+CAMINKERNEL  = zamin_rvv.c
+ZAMINKERNEL  = zamin_rvv.c
+
+SMAXKERNEL   = max_rvv.c
+DMAXKERNEL   = max_rvv.c
+
+SMINKERNEL   = min_rvv.c
+DMINKERNEL   = min_rvv.c
+
+ISAMAXKERNEL = iamax_rvv.c
+IDAMAXKERNEL = iamax_rvv.c
+ICAMAXKERNEL = izamax_rvv.c
+IZAMAXKERNEL = izamax_rvv.c
+
+ISAMINKERNEL = iamin_rvv.c
+IDAMINKERNEL = iamin_rvv.c
+ICAMINKERNEL = izamin_rvv.c
+IZAMINKERNEL = izamin_rvv.c
+
+ISMAXKERNEL  = imax_rvv.c
+IDMAXKERNEL  = imax_rvv.c
+
+ISMINKERNEL  = imin_rvv.c
+IDMINKERNEL  = imin_rvv.c
+
+SASUMKERNEL  = asum_rvv.c
+DASUMKERNEL  = asum_rvv.c
+CASUMKERNEL  = zasum_rvv.c
+ZASUMKERNEL  = zasum_rvv.c
+
+SSUMKERNEL  = sum_rvv.c
+DSUMKERNEL  = sum_rvv.c
+CSUMKERNEL  = zsum_rvv.c
+ZSUMKERNEL  = zsum_rvv.c
+
+SAXPYKERNEL  = axpy_rvv.c
+DAXPYKERNEL  = axpy_rvv.c
+CAXPYKERNEL  = zaxpy_rvv.c
+ZAXPYKERNEL  = zaxpy_rvv.c
+
+SAXPBYKERNEL  = axpby_rvv.c
+DAXPBYKERNEL  = axpby_rvv.c
+CAXPBYKERNEL  = zaxpby_rvv.c
+ZAXPBYKERNEL  = zaxpby_rvv.c
+
+SCOPYKERNEL  = copy_rvv.c
+DCOPYKERNEL  = copy_rvv.c
+CCOPYKERNEL  = zcopy_rvv.c
+ZCOPYKERNEL  = zcopy_rvv.c
+
+SDOTKERNEL   = dot_rvv.c
+DDOTKERNEL   = dot_rvv.c
+CDOTKERNEL   = zdot_rvv.c
+ZDOTKERNEL   = zdot_rvv.c
+DSDOTKERNEL  = dot_rvv.c
+
+SNRM2KERNEL  = nrm2_rvv.c
+DNRM2KERNEL  = nrm2_rvv.c
+CNRM2KERNEL  = znrm2_rvv.c
+ZNRM2KERNEL  = znrm2_rvv.c
+
+SROTKERNEL   = rot_rvv.c
+DROTKERNEL   = rot_rvv.c
+CROTKERNEL   = zrot_rvv.c
+ZROTKERNEL   = zrot_rvv.c
+
+SSCALKERNEL  = scal_rvv.c
+DSCALKERNEL  = scal_rvv.c
+CSCALKERNEL  = zscal_rvv.c
+ZSCALKERNEL  = zscal_rvv.c
+
+SSWAPKERNEL  = swap_rvv.c
+DSWAPKERNEL  = swap_rvv.c
+CSWAPKERNEL  = zswap_rvv.c
+ZSWAPKERNEL  = zswap_rvv.c
+
+SGEMVNKERNEL = gemv_n_rvv.c
+DGEMVNKERNEL = gemv_n_rvv.c
+CGEMVNKERNEL = zgemv_n_rvv.c
+ZGEMVNKERNEL = zgemv_n_rvv.c
+
+SGEMVTKERNEL = gemv_t_rvv.c
+DGEMVTKERNEL = gemv_t_rvv.c
+CGEMVTKERNEL = zgemv_t_rvv.c
+ZGEMVTKERNEL = zgemv_t_rvv.c
+
+# GEMM kernel file names are built from the xGEMM_UNROLL_M / xGEMM_UNROLL_N
+# variables. The inner copy routines (INCOPY/ITCOPY) are only set when the
+# M and N unroll factors differ.
+SGEMMKERNEL    =  sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c
+SGEMMONCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
+SGEMMOTCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
+SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
+SGEMMINCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
+SGEMMITCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
+SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+
+DGEMMKERNEL    =  dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c
+DGEMMONCOPY    =  ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
+DGEMMOTCOPY    =  ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
+DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
+DGEMMINCOPY    =  ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
+DGEMMITCOPY    =  ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
+DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX)
+DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+
+CGEMMKERNEL    =  cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c
+CGEMMONCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
+CGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
+CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
+CGEMMINCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
+CGEMMITCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
+CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX)
+CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+
+ZGEMMKERNEL    =  zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c
+ZGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
+ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
+ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
+ZGEMMINCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
+ZGEMMITCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
+ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX)
+ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+
+# TRMM kernels use the same unroll-based naming; their copy routines come from ../generic/.
+STRMMKERNEL    =  strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl128b.c
+STRMMUNCOPY_M  =  ../generic/trmm_uncopy_$(SGEMM_UNROLL_M).c
+STRMMLNCOPY_M  =  ../generic/trmm_lncopy_$(SGEMM_UNROLL_M).c
+STRMMUTCOPY_M  =  ../generic/trmm_utcopy_$(SGEMM_UNROLL_M).c
+STRMMLTCOPY_M  =  ../generic/trmm_ltcopy_$(SGEMM_UNROLL_M).c
+
+DTRMMKERNEL    =  dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl128b.c
+DTRMMUNCOPY_M  =  ../generic/trmm_uncopy_$(DGEMM_UNROLL_M).c
+DTRMMLNCOPY_M  =  ../generic/trmm_lncopy_$(DGEMM_UNROLL_M).c
+DTRMMUTCOPY_M  =  ../generic/trmm_utcopy_$(DGEMM_UNROLL_M).c
+DTRMMLTCOPY_M  =  ../generic/trmm_ltcopy_$(DGEMM_UNROLL_M).c
+
+CTRMMKERNEL    =  ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl128b.c
+CTRMMUNCOPY_M  =  ../generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c
+CTRMMLNCOPY_M  =  ../generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c
+CTRMMUTCOPY_M  =  ../generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c
+CTRMMLTCOPY_M  =  ../generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c
+
+ZTRMMKERNEL    =  ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl128b.c
+ZTRMMUNCOPY_M  =  ../generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c
+ZTRMMLNCOPY_M  =  ../generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c
+ZTRMMUTCOPY_M  =  ../generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c
+ZTRMMLTCOPY_M  =  ../generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c
+
+# TRSM kernels: every S/D/C/Z variant points at the same four generic sources.
+STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+SSYMV_U_KERNEL =  symv_U_rvv.c
+SSYMV_L_KERNEL =  symv_L_rvv.c
+DSYMV_U_KERNEL =  symv_U_rvv.c
+DSYMV_L_KERNEL =  symv_L_rvv.c
+CSYMV_U_KERNEL =  zsymv_U_rvv.c
+CSYMV_L_KERNEL =  zsymv_L_rvv.c
+ZSYMV_U_KERNEL =  zsymv_U_rvv.c
+ZSYMV_L_KERNEL =  zsymv_L_rvv.c
+
+CHEMV_L_KERNEL =  zhemv_LM_rvv.c
+CHEMV_M_KERNEL =  zhemv_LM_rvv.c
+CHEMV_U_KERNEL =  zhemv_UV_rvv.c
+CHEMV_V_KERNEL =  zhemv_UV_rvv.c
+ZHEMV_L_KERNEL =  zhemv_LM_rvv.c
+ZHEMV_M_KERNEL =  zhemv_LM_rvv.c
+ZHEMV_U_KERNEL =  zhemv_UV_rvv.c
+ZHEMV_V_KERNEL =  zhemv_UV_rvv.c
+
+SSYMMUCOPY_M   =  ../generic/symm_ucopy_$(SGEMM_UNROLL_M).c
+SSYMMLCOPY_M   =  ../generic/symm_lcopy_$(SGEMM_UNROLL_M).c
+
+DSYMMUCOPY_M   =  ../generic/symm_ucopy_$(DGEMM_UNROLL_M).c
+DSYMMLCOPY_M   =  ../generic/symm_lcopy_$(DGEMM_UNROLL_M).c
+
+CSYMMUCOPY_M   =  ../generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c
+CSYMMLCOPY_M   =  ../generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c
+
+ZSYMMUCOPY_M   =  ../generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c
+ZSYMMLCOPY_M   =  ../generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c
+
+CHEMMLTCOPY_M  =  ../generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c
+CHEMMUTCOPY_M  =  ../generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c
+
+ZHEMMLTCOPY_M  =  ../generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c
+ZHEMMUTCOPY_M  =  ../generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c
+
+LSAME_KERNEL = ../generic/lsame.c
+
+SCABS_KERNEL = ../generic/cabs.c
+DCABS_KERNEL = ../generic/cabs.c
+QCABS_KERNEL = ../generic/cabs.c
+
+# GEMM_BETA sources are defaults only: each one is skipped if the variable is already defined.
+ifndef SGEMM_BETA
+SGEMM_BETA = gemm_beta_rvv.c
+endif
+ifndef DGEMM_BETA
+DGEMM_BETA = gemm_beta_rvv.c
+endif
+ifndef CGEMM_BETA
+CGEMM_BETA = zgemm_beta_rvv.c
+endif
+ifndef ZGEMM_BETA
+ZGEMM_BETA = zgemm_beta_rvv.c
+endif
diff --git a/kernel/riscv64/KERNEL.RISCV64_ZVL256B b/kernel/riscv64/KERNEL.RISCV64_ZVL256B
new file mode 100644
index 000000000..d8690682f
--- /dev/null
+++ b/kernel/riscv64/KERNEL.RISCV64_ZVL256B
@@ -0,0 +1,207 @@
+# Kernel-to-source-file assignments for this target (KERNEL.RISCV64_ZVL256B).
+# Bare file names such as amax_vector.c carry no directory part; paths starting
+# with ../generic/ refer to the generic kernel directory.
+
+SAMAXKERNEL  = amax_vector.c
+DAMAXKERNEL  = amax_vector.c
+CAMAXKERNEL  = zamax_vector.c
+ZAMAXKERNEL  = zamax_vector.c
+
+SAMINKERNEL  = amin_vector.c
+DAMINKERNEL  = amin_vector.c
+CAMINKERNEL  = zamin_vector.c
+ZAMINKERNEL  = zamin_vector.c
+
+SMAXKERNEL   = max_vector.c
+DMAXKERNEL   = max_vector.c
+
+SMINKERNEL   = min_vector.c
+DMINKERNEL   = min_vector.c
+
+ISAMAXKERNEL = iamax_vector.c
+IDAMAXKERNEL = iamax_vector.c
+ICAMAXKERNEL = izamax_vector.c
+IZAMAXKERNEL = izamax_vector.c
+
+ISAMINKERNEL = iamin_vector.c
+IDAMINKERNEL = iamin_vector.c
+ICAMINKERNEL = izamin_vector.c
+IZAMINKERNEL = izamin_vector.c
+
+ISMAXKERNEL  = imax_vector.c
+IDMAXKERNEL  = imax_vector.c
+
+ISMINKERNEL  = imin_vector.c
+IDMINKERNEL  = imin_vector.c
+
+SASUMKERNEL  = asum_vector.c
+DASUMKERNEL  = asum_vector.c
+CASUMKERNEL  = zasum_vector.c
+ZASUMKERNEL  = zasum_vector.c
+
+SSUMKERNEL  = sum_vector.c
+DSUMKERNEL  = sum_vector.c
+CSUMKERNEL  = zsum_vector.c
+ZSUMKERNEL  = zsum_vector.c
+
+SAXPYKERNEL  = axpy_vector.c
+DAXPYKERNEL  = axpy_vector.c
+CAXPYKERNEL  = zaxpy_vector.c
+ZAXPYKERNEL  = zaxpy_vector.c
+
+SCOPYKERNEL  = copy_vector.c
+DCOPYKERNEL  = copy_vector.c
+CCOPYKERNEL  = zcopy_vector.c
+ZCOPYKERNEL  = zcopy_vector.c
+
+SDOTKERNEL   = dot_vector.c
+DDOTKERNEL   = dot_vector.c
+CDOTKERNEL   = zdot_vector.c
+ZDOTKERNEL   = zdot_vector.c
+DSDOTKERNEL  = ../generic/dot.c
+
+SNRM2KERNEL  = nrm2_vector.c
+DNRM2KERNEL  = nrm2_vector.c
+CNRM2KERNEL  = znrm2_vector.c
+ZNRM2KERNEL  = znrm2_vector.c
+
+SROTKERNEL   = rot_vector.c
+DROTKERNEL   = rot_vector.c
+CROTKERNEL   = zrot_vector.c
+ZROTKERNEL   = zrot_vector.c
+
+SSCALKERNEL  = scal_vector.c
+DSCALKERNEL  = scal_vector.c
+CSCALKERNEL  = zscal_vector.c
+ZSCALKERNEL  = zscal_vector.c
+
+SSWAPKERNEL  = swap_vector.c
+DSWAPKERNEL  = swap_vector.c
+CSWAPKERNEL  = zswap_vector.c
+ZSWAPKERNEL  = zswap_vector.c
+
+SGEMVNKERNEL = gemv_n_vector.c
+DGEMVNKERNEL = gemv_n_vector.c
+CGEMVNKERNEL = zgemv_n_vector.c
+ZGEMVNKERNEL = zgemv_n_vector.c
+
+SGEMVTKERNEL = gemv_t_vector.c
+DGEMVTKERNEL = gemv_t_vector.c
+CGEMVTKERNEL = zgemv_t_vector.c
+ZGEMVTKERNEL = zgemv_t_vector.c
+
+# TRMM and GEMM kernel file names are built from the xGEMM_UNROLL_M / xGEMM_UNROLL_N variables.
+STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c
+DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c
+CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c
+ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c
+
+# Inner copy routines (INCOPY/ITCOPY) are only set when the M and N unroll factors differ.
+SGEMMKERNEL    =  sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_zvl256b.c
+SGEMMONCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
+SGEMMOTCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
+SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
+ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
+SGEMMINCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
+SGEMMITCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
+SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+
+DGEMMKERNEL    =  dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_zvl256b.c
+DGEMMONCOPY    =  ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
+DGEMMOTCOPY    =  ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
+DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
+DGEMMINCOPY    =  ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
+DGEMMITCOPY    =  ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
+DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX)
+DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+
+CGEMMKERNEL    =  cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_zvl256b.c
+CGEMMONCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
+CGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
+CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
+CGEMMINCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
+CGEMMITCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
+CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX)
+CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+
+ZGEMMKERNEL    =  zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_zvl256b.c
+ZGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
+ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
+ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
+ZGEMMINCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
+ZGEMMITCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
+ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX)
+ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
+
+# TRSM kernels: every S/D/C/Z variant points at the same four generic sources.
+STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
+
+SSYMV_U_KERNEL =  symv_U_vector.c
+SSYMV_L_KERNEL =  symv_L_vector.c
+DSYMV_U_KERNEL =  symv_U_vector.c
+DSYMV_L_KERNEL =  symv_L_vector.c
+CSYMV_U_KERNEL =  ../generic/zsymv_k.c
+CSYMV_L_KERNEL =  ../generic/zsymv_k.c
+ZSYMV_U_KERNEL =  ../generic/zsymv_k.c
+ZSYMV_L_KERNEL =  ../generic/zsymv_k.c
+
+CHEMV_L_KERNEL =  zhemv_LM_vector.c
+CHEMV_M_KERNEL =  zhemv_LM_vector.c
+CHEMV_U_KERNEL =  zhemv_UV_vector.c
+CHEMV_V_KERNEL =  zhemv_UV_vector.c
+ZHEMV_L_KERNEL =  zhemv_LM_vector.c
+ZHEMV_M_KERNEL =  zhemv_LM_vector.c
+ZHEMV_U_KERNEL =  zhemv_UV_vector.c
+ZHEMV_V_KERNEL =  zhemv_UV_vector.c
+
+LSAME_KERNEL = ../generic/lsame.c
+
+SCABS_KERNEL = ../generic/cabs.c
+DCABS_KERNEL = ../generic/cabs.c
+QCABS_KERNEL = ../generic/cabs.c
+
+# GEMM_BETA sources are defaults only: each one is skipped if the variable is already defined.
+ifndef SGEMM_BETA
+SGEMM_BETA = ../generic/gemm_beta.c
+endif
+ifndef DGEMM_BETA
+DGEMM_BETA = ../generic/gemm_beta.c
+endif
+ifndef CGEMM_BETA
+CGEMM_BETA = ../generic/zgemm_beta.c
+endif
+ifndef ZGEMM_BETA
+ZGEMM_BETA = ../generic/zgemm_beta.c
+endif
diff --git a/kernel/riscv64/KERNEL.x280 b/kernel/riscv64/KERNEL.x280
new file mode 100644
index 000000000..86708fe01
--- /dev/null
+++ b/kernel/riscv64/KERNEL.x280
@@ -0,0 +1,281 @@
+# **********************************************************************************
+# Copyright (c) 2022, The OpenBLAS Project
+# All rights reserved.
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+# 3. Neither the name of the OpenBLAS project nor the names of
+# its contributors may be used to endorse or promote products
+# derived from this software without specific prior written permission.
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# **********************************************************************************
+# AMAX/AMIN: S and D share the real source, C and Z share the z-prefixed one.
+SAMAXKERNEL = amax_rvv.c
+DAMAXKERNEL = amax_rvv.c
+CAMAXKERNEL = zamax_rvv.c
+ZAMAXKERNEL = zamax_rvv.c
+
+SAMINKERNEL = amin_rvv.c
+DAMINKERNEL = amin_rvv.c
+CAMINKERNEL = zamin_rvv.c
+ZAMINKERNEL = zamin_rvv.c
+# MAX/MIN are assigned for S and D only.
+SMAXKERNEL = max_rvv.c
+DMAXKERNEL = max_rvv.c
+
+SMINKERNEL = min_rvv.c
+DMINKERNEL = min_rvv.c
+# The I-prefixed variables follow the same S/D versus C/Z split.
+ISAMAXKERNEL = iamax_rvv.c
+IDAMAXKERNEL = iamax_rvv.c
+ICAMAXKERNEL = izamax_rvv.c
+IZAMAXKERNEL = izamax_rvv.c
+
+ISAMINKERNEL = iamin_rvv.c
+IDAMINKERNEL = iamin_rvv.c
+ICAMINKERNEL = izamin_rvv.c
+IZAMINKERNEL = izamin_rvv.c
+
+ISMAXKERNEL = imax_rvv.c
+IDMAXKERNEL = imax_rvv.c
+
+ISMINKERNEL = imin_rvv.c
+IDMINKERNEL = imin_rvv.c
+# ASUM through SWAP: S and D share <op>_rvv.c, C and Z share z<op>_rvv.c.
+SASUMKERNEL = asum_rvv.c
+DASUMKERNEL = asum_rvv.c
+CASUMKERNEL = zasum_rvv.c
+ZASUMKERNEL = zasum_rvv.c
+
+SSUMKERNEL = sum_rvv.c
+DSUMKERNEL = sum_rvv.c
+CSUMKERNEL = zsum_rvv.c
+ZSUMKERNEL = zsum_rvv.c
+
+SAXPYKERNEL = axpy_rvv.c
+DAXPYKERNEL = axpy_rvv.c
+CAXPYKERNEL = zaxpy_rvv.c
+ZAXPYKERNEL = zaxpy_rvv.c
+
+SAXPBYKERNEL = axpby_rvv.c
+DAXPBYKERNEL = axpby_rvv.c
+CAXPBYKERNEL = zaxpby_rvv.c
+ZAXPBYKERNEL = zaxpby_rvv.c
+
+SCOPYKERNEL = copy_rvv.c
+DCOPYKERNEL = copy_rvv.c
+CCOPYKERNEL = zcopy_rvv.c
+ZCOPYKERNEL = zcopy_rvv.c
+# DSDOT reuses the real dot source.
+SDOTKERNEL = dot_rvv.c
+DDOTKERNEL = dot_rvv.c
+CDOTKERNEL = zdot_rvv.c
+ZDOTKERNEL = zdot_rvv.c
+DSDOTKERNEL = dot_rvv.c
+
+SNRM2KERNEL = nrm2_rvv.c
+DNRM2KERNEL = nrm2_rvv.c
+CNRM2KERNEL = znrm2_rvv.c
+ZNRM2KERNEL = znrm2_rvv.c
+
+SROTKERNEL = rot_rvv.c
+DROTKERNEL = rot_rvv.c
+CROTKERNEL = zrot_rvv.c
+ZROTKERNEL = zrot_rvv.c
+
+SSCALKERNEL = scal_rvv.c
+DSCALKERNEL = scal_rvv.c
+CSCALKERNEL = zscal_rvv.c
+ZSCALKERNEL = zscal_rvv.c
+
+SSWAPKERNEL = swap_rvv.c
+DSWAPKERNEL = swap_rvv.c
+CSWAPKERNEL = zswap_rvv.c
+ZSWAPKERNEL = zswap_rvv.c
+# GEMV N and T kernels: S/D use gemv_*_rvv.c, C/Z use zgemv_*_rvv.c.
+SGEMVNKERNEL = gemv_n_rvv.c
+DGEMVNKERNEL = gemv_n_rvv.c
+CGEMVNKERNEL = zgemv_n_rvv.c
+ZGEMVNKERNEL = zgemv_n_rvv.c
+
+SGEMVTKERNEL = gemv_t_rvv.c
+DGEMVTKERNEL = gemv_t_rvv.c
+CGEMVTKERNEL = zgemv_t_rvv.c
+ZGEMVTKERNEL = zgemv_t_rvv.c
+# C/Z TRMM kernel; the S/D TRMM kernels are set in the UNROLL_N conditionals.
+CTRMMKERNEL = ztrmmkernel_rvv_v1x4.c
+ZTRMMKERNEL = ztrmmkernel_rvv_v1x4.c
+
+# SGEMM_UNROLL_N set in params.h
+ifeq ($(SGEMM_UNROLL_N), 8)
+# UNROLL_M is VLMAX
+SGEMMKERNEL    = gemmkernel_rvv_v1x8.c
+SGEMMINCOPY    = gemm_ncopy_rvv_v1.c
+SGEMMITCOPY    = gemm_tcopy_rvv_v1.c
+SGEMMONCOPY    = gemm_ncopy_$(SGEMM_UNROLL_N)_rvv.c
+SGEMMOTCOPY    = gemm_tcopy_$(SGEMM_UNROLL_N)_rvv.c
+SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
+# Values end at their last character: make keeps trailing blanks in a variable.
+STRMMKERNEL    = trmmkernel_rvv_v1x8.c
+
+STRMMUNCOPY_M  = trmm_uncopy_rvv_v1.c
+STRMMLNCOPY_M  = trmm_lncopy_rvv_v1.c
+STRMMUTCOPY_M  = trmm_utcopy_rvv_v1.c
+STRMMLTCOPY_M  = trmm_ltcopy_rvv_v1.c
+
+SSYMMUCOPY_M   = symm_ucopy_rvv_v1.c
+SSYMMLCOPY_M   = symm_lcopy_rvv_v1.c
+endif
+
+# DGEMM_UNROLL_N set in params.h
+ifeq ($(DGEMM_UNROLL_N), 8)
+# UNROLL_M is VLMAX
+DGEMMKERNEL    = gemmkernel_rvv_v1x8.c
+DGEMMINCOPY    = gemm_ncopy_rvv_v1.c
+DGEMMITCOPY    = gemm_tcopy_rvv_v1.c
+DGEMMONCOPY    = gemm_ncopy_$(DGEMM_UNROLL_N)_rvv.c
+DGEMMOTCOPY    = gemm_tcopy_$(DGEMM_UNROLL_N)_rvv.c
+DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
+DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
+DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+# TRMM kernel and copy sources selected together with the GEMM kernel above.
+DTRMMKERNEL    = trmmkernel_rvv_v1x8.c
+DTRMMUNCOPY_M  = trmm_uncopy_rvv_v1.c
+DTRMMLNCOPY_M  = trmm_lncopy_rvv_v1.c
+DTRMMUTCOPY_M  = trmm_utcopy_rvv_v1.c
+DTRMMLTCOPY_M  = trmm_ltcopy_rvv_v1.c
+
+DSYMMUCOPY_M   = symm_ucopy_rvv_v1.c
+DSYMMLCOPY_M   = symm_lcopy_rvv_v1.c
+endif
+# CGEMM and ZGEMM share the same z-prefixed kernel and copy sources.
+CGEMMKERNEL = zgemmkernel_rvv_v1x4.c
+CGEMMINCOPY = zgemm_ncopy_rvv_v1.c
+CGEMMITCOPY = zgemm_tcopy_rvv_v1.c
+CGEMMONCOPY = zgemm_ncopy_4_rvv.c
+CGEMMOTCOPY = zgemm_tcopy_4_rvv.c
+
+CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
+CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
+CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+ZGEMMKERNEL = zgemmkernel_rvv_v1x4.c
+
+ZGEMMINCOPY = zgemm_ncopy_rvv_v1.c
+ZGEMMITCOPY = zgemm_tcopy_rvv_v1.c
+ZGEMMONCOPY = zgemm_ncopy_4_rvv.c
+ZGEMMOTCOPY = zgemm_tcopy_4_rvv.c
+
+ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
+ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
+ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
+# TRSM kernels: one rvv_v1 source per LN/LT/RN/RT, shared by S, D, C and Z.
+STRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c
+STRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c
+STRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c
+STRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c
+
+DTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c
+DTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c
+DTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c
+DTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c
+
+CTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c
+CTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c
+CTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c
+CTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c
+
+ZTRSMKERNEL_LN = trsm_kernel_LN_rvv_v1.c
+ZTRSMKERNEL_LT = trsm_kernel_LT_rvv_v1.c
+ZTRSMKERNEL_RN = trsm_kernel_RN_rvv_v1.c
+ZTRSMKERNEL_RT = trsm_kernel_RT_rvv_v1.c
+# TRSM copy sources; the ZTRSMCOPY* set uses the z-prefixed files.
+TRSMCOPYLN_M = trsm_lncopy_rvv_v1.c
+TRSMCOPYLT_M = trsm_ltcopy_rvv_v1.c
+TRSMCOPYUN_M = trsm_uncopy_rvv_v1.c
+TRSMCOPYUT_M = trsm_utcopy_rvv_v1.c
+
+ZTRSMCOPYLN_M = ztrsm_lncopy_rvv_v1.c
+ZTRSMCOPYLT_M = ztrsm_ltcopy_rvv_v1.c
+ZTRSMCOPYUN_M = ztrsm_uncopy_rvv_v1.c
+ZTRSMCOPYUT_M = ztrsm_utcopy_rvv_v1.c
+# SYMV and HEMV kernels; no trailing blanks after values (make would keep them).
+SSYMV_U_KERNEL = symv_U_rvv.c
+SSYMV_L_KERNEL = symv_L_rvv.c
+DSYMV_U_KERNEL = symv_U_rvv.c
+DSYMV_L_KERNEL = symv_L_rvv.c
+CSYMV_U_KERNEL = zsymv_U_rvv.c
+CSYMV_L_KERNEL = zsymv_L_rvv.c
+ZSYMV_U_KERNEL = zsymv_U_rvv.c
+ZSYMV_L_KERNEL = zsymv_L_rvv.c
+
+CHEMV_L_KERNEL = zhemv_LM_rvv.c
+CHEMV_M_KERNEL = zhemv_LM_rvv.c
+CHEMV_U_KERNEL = zhemv_UV_rvv.c
+CHEMV_V_KERNEL = zhemv_UV_rvv.c
+ZHEMV_L_KERNEL = zhemv_LM_rvv.c
+ZHEMV_M_KERNEL = zhemv_LM_rvv.c
+ZHEMV_U_KERNEL = zhemv_UV_rvv.c
+ZHEMV_V_KERNEL = zhemv_UV_rvv.c
+# HEMM/SYMM/TRMM copy sources: each C* variable reuses the Z* source file.
+ZHEMMLTCOPY_M = zhemm_ltcopy_rvv_v1.c
+ZHEMMUTCOPY_M = zhemm_utcopy_rvv_v1.c
+
+CHEMMLTCOPY_M = zhemm_ltcopy_rvv_v1.c
+CHEMMUTCOPY_M = zhemm_utcopy_rvv_v1.c
+
+ZSYMMUCOPY_M = zsymm_ucopy_rvv_v1.c
+ZSYMMLCOPY_M = zsymm_lcopy_rvv_v1.c
+
+CSYMMUCOPY_M = zsymm_ucopy_rvv_v1.c
+CSYMMLCOPY_M = zsymm_lcopy_rvv_v1.c
+
+ZTRMMUNCOPY_M = ztrmm_uncopy_rvv_v1.c
+ZTRMMLNCOPY_M = ztrmm_lncopy_rvv_v1.c
+ZTRMMUTCOPY_M = ztrmm_utcopy_rvv_v1.c
+ZTRMMLTCOPY_M = ztrmm_ltcopy_rvv_v1.c
+
+CTRMMUNCOPY_M = ztrmm_uncopy_rvv_v1.c
+CTRMMLNCOPY_M = ztrmm_lncopy_rvv_v1.c
+CTRMMUTCOPY_M = ztrmm_utcopy_rvv_v1.c
+CTRMMLTCOPY_M = ztrmm_ltcopy_rvv_v1.c
+# LSAME and CABS are taken from ../generic.
+LSAME_KERNEL = ../generic/lsame.c
+
+SCABS_KERNEL = ../generic/cabs.c
+DCABS_KERNEL = ../generic/cabs.c
+QCABS_KERNEL = ../generic/cabs.c
+# GEMM_BETA: use the rvv sources unless a value is already defined.
+ifndef SGEMM_BETA
+SGEMM_BETA = gemm_beta_rvv.c
+endif
+ifndef DGEMM_BETA
+DGEMM_BETA = gemm_beta_rvv.c
+endif
+ifndef CGEMM_BETA
+CGEMM_BETA = zgemm_beta_rvv.c
+endif
+ifndef ZGEMM_BETA
+ZGEMM_BETA = zgemm_beta_rvv.c
+endif
diff --git a/kernel/riscv64/amax_rvv.c b/kernel/riscv64/amax_rvv.c
new file mode 100644
index 000000000..451fbc834
--- /dev/null
+++ b/kernel/riscv64/amax_rvv.c
@@ -0,0 +1,102 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <float.h>
+
+/* Precision-specific aliases for the RVV intrinsics used below: e32 forms for single, e64 for DOUBLE. */
+#if !defined(DOUBLE)
+#define VSETVL(n) __riscv_vsetvl_e32m8(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e32m8()
+#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m8_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
+#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f32m8_f32m1
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
+#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f32m8_tu
+#define VFABSV_FLOAT __riscv_vfabs_v_f32m8
+#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32
+#else
+#define VSETVL(n) __riscv_vsetvl_e64m8(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e64m8()
+#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m8_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
+#define VFREDMAXVS_FLOAT __riscv_vfredmax_vs_f64m8_f64m1
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
+#define VFMAXVV_FLOAT_TU __riscv_vfmax_vv_f64m8_tu
+#define VFABSV_FLOAT __riscv_vfabs_v_f64m8
+#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64
+#endif
+
+/*
+ * Largest absolute value among the n elements of x taken with stride inc_x.
+ * Returns 0 when n <= 0 or inc_x <= 0.
+ */
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    if (n <= 0 || inc_x <= 0) return 0.0;
+
+    /* Per-lane running maxima; |x[i]| >= 0, so 0 is a neutral seed. */
+    const size_t lanes = VSETVL_MAX;
+    FLOAT_V_T acc = VFMVVF_FLOAT(0.0, lanes);
+    FLOAT_V_T_M1 red = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
+
+    if (inc_x == 1) {
+        /* Contiguous input: unit-stride loads. */
+        while (n > 0) {
+            size_t vl = VSETVL(n);
+            FLOAT_V_T mag = VFABSV_FLOAT(VLEV_FLOAT(x, vl), vl);
+
+            /* Tail-undisturbed: lanes past vl keep their earlier maxima. */
+            acc = VFMAXVV_FLOAT_TU(acc, acc, mag, vl);
+            n -= vl;
+            x += vl;
+        }
+    } else {
+        /* Strided input: the load takes its stride in bytes. */
+        const BLASLONG byte_stride = inc_x * sizeof(FLOAT);
+        while (n > 0) {
+            size_t vl = VSETVL(n);
+            FLOAT_V_T mag = VFABSV_FLOAT(VLSEV_FLOAT(x, byte_stride, vl), vl);
+
+            acc = VFMAXVV_FLOAT_TU(acc, acc, mag, vl);
+            n -= vl;
+            x += vl*inc_x;
+        }
+    }
+
+    /* Fold every lane into element 0 of red, then read it back as a scalar. */
+    red = VFREDMAXVS_FLOAT(acc, red, lanes);
+    return VFMVFS_FLOAT_M1(red);
+}
diff --git a/kernel/riscv64/amax_vector.c b/kernel/riscv64/amax_vector.c
index b778d3e55..b66d4871e 100644
--- a/kernel/riscv64/amax_vector.c
+++ b/kernel/riscv64/amax_vector.c
@@ -28,36 +28,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #include <math.h>
 
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
-#define MASK_T vbool4_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
-#define VFMAXVV_FLOAT vfmax_vv_f32m8
+#ifdef RISCV64_ZVL256B
+#       define LMUL m2
+#       if defined(DOUBLE)
+#               define ELEN 64
+#       else
+#               define ELEN 32
+#       endif
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
-#define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
-#define VFMAXVV_FLOAT vfmax_vv_f64m8
+#       define LMUL m8
+#       if defined(DOUBLE)
+#               define ELEN 64
+#       else
+#               define ELEN 32
+#       endif
 #endif
 
+#define _ /* expands to nothing; fills unused trailing JOIN() slots */
+#define JOIN2_X(x, y) x ## y /* raw token paste */
+#define JOIN2(x, y) JOIN2_X(x, y) /* extra level so x and y are macro-expanded before pasting */
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) /* pastes five pieces left to right */
+
+#define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _)
+#define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
+#define FLOAT_V_T_M1    JOIN(vfloat,            ELEN,   m1,     _t,     _)
+#define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDMAXVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmax_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))(v_res, va, vb, gvl)
+#else
+#define VFREDMAXVS_FLOAT JOIN(RISCV_RVV(vfredmax_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
+#endif
+#define VFABS_FLOAT     JOIN(RISCV_RVV(vfabs),      _v_f,  ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT    JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   m1,     _)
+
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG i=0, j=0;
@@ -65,103 +70,28 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	FLOAT maxf=0.0;
 	if (n <= 0 || inc_x <= 0) return(maxf);
         unsigned int gvl = 0;
-        FLOAT_V_T v0, v1, v_max;
-        FLOAT_V_T_M1 v_res, v_zero;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_zero = VFMVVF_FLOAT_M1(0, gvl);
+        FLOAT_V_T v0, v1;
+        FLOAT_V_T_M1 v_res;
+        v_res = VFMVVF_FLOAT_M1(0, 1);
 
-        MASK_T mask0, mask1;
-        FLOAT zero = 0.0;
         if(inc_x == 1){
                 gvl = VSETVL(n);
                 if(gvl <= n/2){
-                        v_max = VFMVVF_FLOAT(0, gvl);
                         for(i=0,j=0; i<n/(gvl*2); i++){
                                 v0 = VLEV_FLOAT(&x[j], gvl);
                                 v1 = VLEV_FLOAT(&x[j+gvl], gvl);
-                                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                                //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
-#if defined(DOUBLE)
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v0)
-        :"vd"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v0)
-        :"vd"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-
-                                v_max = VFMAXVV_FLOAT(v_max, v0, gvl);
-
-                                v1 = VLEV_FLOAT(&x[j+gvl], gvl);
-                                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                                //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl);
-#if defined(DOUBLE)
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v1)
-        :"vd"(mask1), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v1)
-        :"vd"(mask1), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-
-                                v_max = VFMAXVV_FLOAT(v_max, v1, gvl);
+                                v0 = VFABS_FLOAT(v0, gvl);
+                                v1 = VFABS_FLOAT(v1, gvl);
+                                v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl);
+                                v_res = VFREDMAXVS_FLOAT(v1, v_res, gvl);
                                 j += gvl*2;
                         }
-                        v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_zero, gvl);
-                        maxf = *((FLOAT*)&v_res);
-                        //maxf = v_res[0];
                 }
                 for(;j<n;){
                         gvl = VSETVL(n-j);
                         v0 = VLEV_FLOAT(&x[j], gvl);
-                        mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                        //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
-#if defined(DOUBLE)
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v0)
-        :"vd"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v0)
-        :"vd"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-
-                        v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl);
-                        if(*((FLOAT*)&v_res) > maxf)
-                                maxf = *((FLOAT*)&v_res);
+                        v0 = VFABS_FLOAT(v0, gvl);
+                        v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl);
                         j += gvl;
                 }
         }else{
@@ -169,94 +99,27 @@ asm volatile(
                 BLASLONG stride_x = inc_x * sizeof(FLOAT);
                 if(gvl <= n/2){
                         BLASLONG inc_xv = inc_x * gvl;
-                        v_max = VFMVVF_FLOAT(0, gvl);
                         for(i=0,j=0; i<n/(gvl*2); i++){
                                 v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-                                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                                //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
-#if defined(DOUBLE)
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v0)
-        :"vd"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v0)
-        :"vd"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-
-                                v_max = VFMAXVV_FLOAT(v_max, v0, gvl);
-
                                 v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl);
-                                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                                //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl);
-#if defined(DOUBLE)
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v1)
-        :"vd"(mask1), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v1)
-        :"vd"(mask1), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-
-                                v_max = VFMAXVV_FLOAT(v_max, v1, gvl);
+                                v0 = VFABS_FLOAT(v0, gvl);
+                                v1 = VFABS_FLOAT(v1, gvl);
+                                v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl);
+                                v_res = VFREDMAXVS_FLOAT(v1, v_res, gvl);
                                 j += gvl*2;
                                 ix += inc_xv*2;
                         }
-                        v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_zero, gvl);
-                        maxf = *((FLOAT*)&v_res);
                 }
                 for(;j<n;){
                         gvl = VSETVL(n-j);
                         v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
-                        mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                        //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
-#if defined(DOUBLE)
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v0)
-        :"vd"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v0)
-        :"vd"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-
-                        v_res = VFREDMAXVS_FLOAT(v_res, v0, v_zero, gvl);
-                        if(*((FLOAT*)&v_res) > maxf)
-                                maxf = *((FLOAT*)&v_res);
+                        v0 = VFABS_FLOAT(v0, gvl);
+                        v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl);
                         j += gvl;
                 }
         }
+
+        maxf = EXTRACT_FLOAT(v_res);
 	return(maxf);
 }
 
diff --git a/kernel/riscv64/amin_rvv.c b/kernel/riscv64/amin_rvv.c
new file mode 100644
index 000000000..5186d7b12
--- /dev/null
+++ b/kernel/riscv64/amin_rvv.c
@@ -0,0 +1,102 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <float.h>
+
+/* Precision-specific RVV aliases; AMIN_SEED_VALUE is the largest finite value of FLOAT. */
+#if !defined(DOUBLE)
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m8()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e32m1()
+#define FLOAT_V_T               vfloat32m8_t
+#define FLOAT_V_T_M1            vfloat32m1_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
+#define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f32m8_f32m1
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFMINVV_FLOAT_TU        __riscv_vfmin_vv_f32m8_tu
+#define VFABSV_FLOAT            __riscv_vfabs_v_f32m8
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
+#define AMIN_SEED_VALUE         FLT_MAX
+#else
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m8()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e64m1()
+#define FLOAT_V_T               vfloat64m8_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f64m8_f64m1
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFMINVV_FLOAT_TU        __riscv_vfmin_vv_f64m8_tu
+#define VFABSV_FLOAT            __riscv_vfabs_v_f64m8
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
+#define AMIN_SEED_VALUE         DBL_MAX
+#endif
+
+/*
+ * Smallest absolute value among the n elements of x taken with stride inc_x.
+ * Returns 0 when n <= 0 or inc_x <= 0.
+ */
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    if (n <= 0 || inc_x <= 0) return 0.0;
+
+    FLOAT_V_T vx, vmin;
+    FLOAT_V_T_M1 v_res;
+
+    /* Seed with the precision's own maximum: FLT_MAX is too small for DOUBLE inputs. */
+    v_res = VFMVVF_FLOAT_M1(AMIN_SEED_VALUE, VSETVL_MAX_M1);
+    size_t vlmax = VSETVL_MAX;
+    vmin = VFMVVF_FLOAT(AMIN_SEED_VALUE, vlmax);
+
+    if(inc_x == 1) {
+        for (size_t vl; n > 0; n -= vl, x += vl) {
+            vl = VSETVL(n);
+            vx = VLEV_FLOAT(x, vl);
+            vx = VFABSV_FLOAT(vx, vl);
+            /* Tail-undisturbed: lanes past vl keep their earlier minima. */
+            vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl);
+        }
+    } else {
+        BLASLONG stride_x = inc_x * sizeof(FLOAT); /* stride in bytes */
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
+            vl = VSETVL(n);
+            vx = VLSEV_FLOAT(x, stride_x, vl);
+            vx = VFABSV_FLOAT(vx, vl);
+            vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl);
+        }
+    }
+
+    /* Fold all lanes into element 0 of v_res and read it back as a scalar. */
+    v_res = VFREDMINVS_FLOAT(vmin, v_res, vlmax);
+    return VFMVFS_FLOAT_M1(v_res);
+}
diff --git a/kernel/riscv64/amin_vector.c b/kernel/riscv64/amin_vector.c
index fd2f83dc9..c4578eabf 100644
--- a/kernel/riscv64/amin_vector.c
+++ b/kernel/riscv64/amin_vector.c
@@ -26,232 +26,108 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 #include "common.h"
-#include <math.h>
-#include <float.h>
 
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
-#define MASK_T vbool4_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
-#define VFMINVV_FLOAT vfmin_vv_f32m8
+#ifdef RISCV64_ZVL256B                  /* this build flag selects register group m2 instead of m8 */
+#       define LMUL m2                  /* register-group suffix pasted into intrinsic/type names */
+#       if defined(DOUBLE)
+#               define ELEN 64          /* element width in bits, pasted into intrinsic/type names */
+#               define ABS fabs         /* scalar absolute value used on x[0] */
+#       else
+#               define ELEN 32
+#               define ABS fabsf
+#       endif
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
-#define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
-#define VFMINVV_FLOAT vfmin_vv_f64m8
+#       define LMUL m8                  /* default register group */
+#       if defined(DOUBLE)              /* ELEN/ABS choice is the same as in the branch above */
+#               define ELEN 64
+#               define ABS fabs
+#       else
+#               define ELEN 32
+#               define ABS fabsf
+#       endif
 #endif
 
+#define _                               /* expands to nothing: fills unused JOIN slots */
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)       /* extra level so x and y are macro-expanded before pasting */
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+/* Intrinsic and type names pasted together from ELEN and LMUL; JOIN always takes exactly five pieces. */
+#define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _)
+#define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
+#define FLOAT_V_T_M1    JOIN(vfloat,            ELEN,   m1,     _t,     _)
+#define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
+#ifdef RISCV_0p10_INTRINSICS            /* this variant passes the caller's local v_res as an extra leading operand */
+#define VFREDMINVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmin_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))(v_res, va, vb, gvl)
+#else
+#define VFREDMINVS_FLOAT JOIN(RISCV_RVV(vfredmin_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
+#endif
+#define VFABS_FLOAT     JOIN(RISCV_RVV(vfabs),      _v_f,  ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT    JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   m1,     _)
+
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
-	BLASLONG i=0, j=0;
-	if (n <= 0 || inc_x <= 0) return(0.0);
-	FLOAT minf=FLT_MAX;
+        BLASLONG i=0, j=0;      /* i: unrolled-loop trip count, j: elements consumed so far */
+        BLASLONG ix=0;          /* element offset into x for the strided unrolled loop */
+        FLOAT minf=0.0;
+        if (n <= 0 || inc_x <= 0) return(minf);   /* nothing to scan: result is 0.0 */
+        /* Result is min |x[k*inc_x]| for k in [0, n): |x[0]| seeds it, the vector loops fold in the rest. */
+        minf = ABS(*x);
+        x += inc_x;
+        --n;
+        if (n == 0) return(minf);                 /* single element: nothing left to reduce */
+        /* v_res is filled with vl = 1 and threaded through every reduction below as the running minimum. */
         unsigned int gvl = 0;
-        FLOAT_V_T v0, v1, v_min;
-        FLOAT_V_T_M1 v_res, v_max;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);
+        FLOAT_V_T v0, v1;
+        FLOAT_V_T_M1 v_res;
+        v_res = VFMVVF_FLOAT_M1(minf, 1);
 
-        MASK_T mask0, mask1;
-	    FLOAT zero = 0.0;
         if(inc_x == 1){
                 gvl = VSETVL(n);
                 if(gvl <= n/2){
-                        v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
                         for(i=0,j=0; i<n/(gvl*2); i++){
                                 v0 = VLEV_FLOAT(&x[j], gvl);
-                                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                                //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
-#if defined(DOUBLE)
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v0)
-        :"vd"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v0)
-        :"vd"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-                                v_min = VFMINVV_FLOAT(v_min, v0, gvl);
-
                                 v1 = VLEV_FLOAT(&x[j+gvl], gvl);
-                                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                                //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl);
-#if defined(DOUBLE)
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v1)
-        :"vd"(mask1), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v1)
-        :"vd"(mask1), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-
-                                v_min = VFMINVV_FLOAT(v_min, v1, gvl);
+                                v0 = VFABS_FLOAT(v0, gvl);                      /* |x| of the first gvl elements */
+                                v1 = VFABS_FLOAT(v1, gvl);                      /* |x| of the next gvl elements */
+                                v_res = VFREDMINVS_FLOAT(v0, v_res, gvl);       /* fold v0 into the running minimum */
+                                v_res = VFREDMINVS_FLOAT(v1, v_res, gvl);       /* fold v1 into the running minimum */
                                 j += gvl*2;
                         }
-                        v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
-                        minf = *((FLOAT*)&v_res);
                 }
                 for(;j<n;){
                         gvl = VSETVL(n-j);
                         v0 = VLEV_FLOAT(&x[j], gvl);
-                        mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                        //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
-#if defined(DOUBLE)
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v0)
-        :"vd"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v0)
-        :"vd"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-                        v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl);
-                        if(*((FLOAT*)&v_res) < minf)
-                                minf = *((FLOAT*)&v_res);
+                        v0 = VFABS_FLOAT(v0, gvl);                      /* remainder pass, gvl recomputed from n-j */
+                        v_res = VFREDMINVS_FLOAT(v0, v_res, gvl);
                         j += gvl;
                 }
         }else{
                 gvl = VSETVL(n);
                 BLASLONG stride_x = inc_x * sizeof(FLOAT);
                 if(gvl <= n/2){
-                        BLASLONG idx = 0, inc_xv = inc_x * gvl;
-                        v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
+                        BLASLONG inc_xv = inc_x * gvl;                  /* elements of x spanned by one gvl-wide load */
                         for(i=0,j=0; i<n/(gvl*2); i++){
-                                v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
-                                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                                //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
-#if defined(DOUBLE)
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v0)
-        :"vd"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v0)
-        :"vd"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-                                v_min = VFMINVV_FLOAT(v_min, v0, gvl);
-
-                                v1 = VLSEV_FLOAT(&x[idx+inc_xv], stride_x, gvl);
-                                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                                //v1 = VFRSUBVF_MASK_FLOAT(v1, 0, mask1, gvl);
-#if defined(DOUBLE)
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v1)
-        :"vd"(mask1), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v1)
-        :"vd"(mask1), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-
-                                v_min = VFMINVV_FLOAT(v_min, v1, gvl);
+                                v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);        /* strided load, stride given in bytes */
+                                v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl); /* the following gvl strided elements */
+                                v0 = VFABS_FLOAT(v0, gvl);
+                                v1 = VFABS_FLOAT(v1, gvl);
+                                v_res = VFREDMINVS_FLOAT(v0, v_res, gvl);
+                                v_res = VFREDMINVS_FLOAT(v1, v_res, gvl);
                                 j += gvl*2;
-                                idx += inc_xv*2;
+                                ix += inc_xv*2;                                 /* two loads consumed per trip */
                         }
-                        v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
-                        minf = *((FLOAT*)&v_res);
                 }
                 for(;j<n;){
                         gvl = VSETVL(n-j);
                         v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
-                        mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                        //v0 = VFRSUBVF_MASK_FLOAT(v0, 0, mask0, gvl);
-#if defined(DOUBLE)
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v0)
-        :"vd"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vsetvli    zero, zero, e8, m1\n\t"
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+vd"(v0)
-        :"vd"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-                        v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl);
-                        if(*((FLOAT*)&v_res) < minf)
-                                minf = *((FLOAT*)&v_res);
+                        v0 = VFABS_FLOAT(v0, gvl);                      /* remainder pass, gvl recomputed from n-j */
+                        v_res = VFREDMINVS_FLOAT(v0, v_res, gvl);
                         j += gvl;
                 }
         }
-	return(minf);
-}
-
 
+        minf = EXTRACT_FLOAT(v_res);    /* EXTRACT_FLOAT is defined outside this file; presumably reads element 0 - TODO confirm */
+        return(minf);
+}
diff --git a/kernel/riscv64/asum_rvv.c b/kernel/riscv64/asum_rvv.c
new file mode 100644
index 000000000..0ea610cbb
--- /dev/null
+++ b/kernel/riscv64/asum_rvv.c
@@ -0,0 +1,99 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)                            /* single precision: e32 / f32 intrinsics */
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m8()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e32m1()
+#define FLOAT_V_T               vfloat32m8_t
+#define FLOAT_V_T_M1            vfloat32m1_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
+#define VFADDVV_FLOAT_TU        __riscv_vfadd_vv_f32m8_tu              /* _tu policy variant of vfadd */
+#define VFABSV_FLOAT            __riscv_vfabs_v_f32m8
+#define VFREDSUMVS_FLOAT        __riscv_vfredusum_vs_f32m8_f32m1       /* unordered sum reduction: m8 source, m1 result */
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32             /* m1 vector -> scalar float */
+#else                                           /* double precision: same aliases on e64 / f64 */
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m8()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e64m1()
+#define FLOAT_V_T               vfloat64m8_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
+#define VFADDVV_FLOAT_TU        __riscv_vfadd_vv_f64m8_tu
+#define VFABSV_FLOAT            __riscv_vfabs_v_f64m8
+#define VFREDSUMVS_FLOAT        __riscv_vfredusum_vs_f64m8_f64m1
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    /*
+     * Sum of |x[i*inc_x]| for i in [0, n).
+     * Yields 0.0 when n <= 0 or inc_x <= 0.
+     * The loops keep one partial sum per lane; one reduction at the end combines them.
+     */
+    if (n <= 0 || inc_x <= 0) return(0.0);
+
+    FLOAT_V_T_M1 total = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
+    const size_t full_vl = VSETVL_MAX;
+    FLOAT_V_T lanes = VFMVVF_FLOAT(0.0, full_vl);   /* per-lane partial sums */
+    FLOAT_V_T chunk;
+    size_t vl;
+
+    if (inc_x != 1) {
+        /* strided walk: the load takes the element distance in bytes */
+        const BLASLONG byte_step = inc_x * sizeof(FLOAT);
+        while (n > 0) {
+            vl = VSETVL(n);
+            chunk = VFABSV_FLOAT(VLSEV_FLOAT(x, byte_step, vl), vl);
+            lanes = VFADDVV_FLOAT_TU(lanes, lanes, chunk, vl);
+            n -= vl;
+            x += vl*inc_x;
+        }
+    } else {
+        /* contiguous walk */
+        while (n > 0) {
+            vl = VSETVL(n);
+            chunk = VFABSV_FLOAT(VLEV_FLOAT(x, vl), vl);
+            lanes = VFADDVV_FLOAT_TU(lanes, lanes, chunk, vl);
+            n -= vl;
+            x += vl;
+        }
+    }
+
+    /* fold the lane sums (all full_vl lanes) into one scalar */
+    total = VFREDSUMVS_FLOAT(lanes, total, full_vl);
+    return(VFMVFS_FLOAT_M1(total));
+}
diff --git a/kernel/riscv64/asum_vector.c b/kernel/riscv64/asum_vector.c
index a82275153..a652eafdd 100644
--- a/kernel/riscv64/asum_vector.c
+++ b/kernel/riscv64/asum_vector.c
@@ -28,111 +28,101 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #include <math.h>
 
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
-#define VFREDSUMVS_FLOAT vfredosum_vs_f32m8_f32m1
-#define MASK_T vbool4_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
-#define VFADDVV_FLOAT vfadd_vv_f32m8
+#ifdef RISCV64_ZVL256B                  /* this build flag selects register group m2 instead of m8 */
+#       define LMUL m2                  /* register-group suffix pasted into intrinsic/type names */
+#       if defined(DOUBLE)
+#               define ELEN 64          /* element width in bits, pasted into intrinsic/type names */
+#       else
+#               define ELEN 32
+#       endif
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
-#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1
-#define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
-#define VFADDVV_FLOAT vfadd_vv_f64m8
+#       define LMUL m8                  /* default register group */
+#       if defined(DOUBLE)
+#               define ELEN 64
+#       else
+#               define ELEN 32
+#       endif
 #endif
+/* Token-pasting helpers: the names below are assembled from ELEN and LMUL. */
+#define _                               /* expands to nothing: fills unused JOIN slots */
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)       /* extra level so x and y are macro-expanded before pasting */
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+
+#define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _)
+#define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
+#define FLOAT_V_T_M1    JOIN(vfloat,            ELEN,   m1,     _t,     _)
+#define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
+#ifdef RISCV_0p10_INTRINSICS            /* this variant passes the caller's local v_res as an extra leading operand */
+#define VFREDSUMVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredusum_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))(v_res, va, vb, gvl)
+#else
+#define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
+#endif
+#define VFABS_FLOAT     JOIN(RISCV_RVV(vfabs),     _v_f,   ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT    JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   m1,     _)
+#define VFADDVV_FLOAT   JOIN(RISCV_RVV(vfadd),     _vv_f,  ELEN,   LMUL,   _)
+
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG i=0, j=0;
-	BLASLONG ix=0;
 	FLOAT asumf=0.0;
 	if (n <= 0 || inc_x <= 0) return(asumf);
         unsigned int gvl = 0;
-        FLOAT_V_T v0, v1, v_zero,v_sum;
-        FLOAT_V_T_M1 v_res, v_z0;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_z0 = VFMVVF_FLOAT_M1(0, gvl);
+        FLOAT_V_T v0, v1, v_sum;        /* computes sum of |x[k*inc_x]| for k in [0, n); 0.0 if n <= 0 or inc_x <= 0 */
+        FLOAT_V_T_M1 v_res;             /* accumulator threaded through every reduction below */
+        v_res = VFMVVF_FLOAT_M1(0, 1);  /* vl = 1: the accumulator starts at 0 */
 
-        MASK_T mask0, mask1;
         if(inc_x == 1){
                 gvl = VSETVL(n);
-                v_zero = VFMVVF_FLOAT(0, gvl);
                 if(gvl <= n/2){
                         v_sum = VFMVVF_FLOAT(0, gvl);
                         for(i=0,j=0; i<n/(gvl*2); i++){
                                 v0 = VLEV_FLOAT(&x[j], gvl);
-                                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                                v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
+                                v0 = VFABS_FLOAT(v0, gvl);
                                 v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
 
                                 v1 = VLEV_FLOAT(&x[j+gvl], gvl);
-                                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                                v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
+                                v1 = VFABS_FLOAT(v1, gvl);
                                 v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
                                 j += gvl * 2;
                         }
-                        v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl);
-                        asumf += *((FLOAT*)&v_res);
+                        v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);    /* fold the lane sums into the accumulator */
                 }
                 for(;j<n;){
                         gvl = VSETVL(n-j);
                         v0 = VLEV_FLOAT(&x[j], gvl);
-                        mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                        v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
-                        v_res = VFREDSUMVS_FLOAT(v_res, v0, v_z0, gvl);
-                        asumf += *((FLOAT*)&v_res);
+                        v0 = VFABS_FLOAT(v0, gvl);                      /* remainder pass, gvl recomputed from n-j */
+                        v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl);
                         j += gvl;
                 }
         }else{
                 gvl = VSETVL(n);
-                unsigned int stride_x = inc_x * sizeof(FLOAT);
-                v_zero = VFMVVF_FLOAT(0, gvl);
+                BLASLONG stride_x = inc_x * sizeof(FLOAT);      /* byte stride; BLASLONG so a large inc_x is not truncated to 32 bits */
                 if(gvl <= n/2){
                         v_sum = VFMVVF_FLOAT(0, gvl);
-                        BLASLONG inc_xv = inc_x * gvl;
                         for(i=0,j=0; i<n/(gvl*2); i++){
-                                v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-                                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                                v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
+                                v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
+                                v0 = VFABS_FLOAT(v0, gvl);
                                 v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
 
-                                v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl);
-                                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                                v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
+                                v1 = VLSEV_FLOAT(&x[(j+gvl)*inc_x], stride_x, gvl);
+                                v1 = VFABS_FLOAT(v1, gvl);
                                 v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
                                 j += gvl * 2;
-                                inc_xv += inc_xv * 2;
                         }
-                        v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl);
-                        asumf += *((FLOAT*)&v_res);
+                        v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);    /* fold the lane sums into the accumulator */
                 }
                 for(;j<n;){
                         gvl = VSETVL(n-j);
                         v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
-                        mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                        v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
-                        v_res = VFREDSUMVS_FLOAT(v_res, v0, v_z0, gvl);
-                        asumf += *((FLOAT*)&v_res);
+                        v0 = VFABS_FLOAT(v0, gvl);                      /* remainder pass, gvl recomputed from n-j */
+                        v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl);
                         j += gvl;
                 }
         }
+        asumf = EXTRACT_FLOAT(v_res);   /* EXTRACT_FLOAT is defined outside this file; presumably reads element 0 - TODO confirm */
 	return(asumf);
 }
 
diff --git a/kernel/riscv64/axpby.c b/kernel/riscv64/axpby.c
index 278747f75..04f9518d3 100644
--- a/kernel/riscv64/axpby.c
+++ b/kernel/riscv64/axpby.c
@@ -33,7 +33,7 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *
 	BLASLONG i=0;
 	BLASLONG ix,iy;
 
-	if ( n < 0     )  return(0);
+	if ( n <= 0     )  return(0);
 
 	ix = 0;
 	iy = 0;
diff --git a/kernel/riscv64/axpby_rvv.c b/kernel/riscv64/axpby_rvv.c
new file mode 100644
index 000000000..d7fb86eab
--- /dev/null
+++ b/kernel/riscv64/axpby_rvv.c
@@ -0,0 +1,173 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)                            /* single precision: e32 / f32 intrinsics */
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define FLOAT_V_T               vfloat32m8_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8          /* unit-stride load */
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8         /* strided load */
+#define VSEV_FLOAT              __riscv_vse32_v_f32m8          /* unit-stride store */
+#define VSSEV_FLOAT             __riscv_vsse32_v_f32m8         /* strided store */
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m8        /* used below as vy = vy + alpha * vx */
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m8         /* vector times scalar */
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8         /* scalar copied into a vector */
+#else                                           /* double precision: same aliases on e64 / f64 */
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define FLOAT_V_T               vfloat64m8_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VSEV_FLOAT              __riscv_vse64_v_f64m8
+#define VSSEV_FLOAT             __riscv_vsse64_v_f64m8
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m8
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
+#endif
+
+int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y)
+{
+    /*
+     * y[i*inc_y] = alpha * x[i*inc_x] + beta * y[i*inc_y] for i in [0, n).
+     * With beta == 0.0 the old contents of y are never loaded, and with
+     * alpha == 0.0 x is never loaded.  Always returns 0.
+     * Pointer steps are formed as signed BLASLONG products: vl is a size_t,
+     * so a bare vl*inc would be evaluated unsigned and a negative increment
+     * would only step backwards through wraparound.
+     */
+    FLOAT_V_T vx, vy;
+    const BLASLONG stride_x = inc_x * sizeof(FLOAT);   /* byte strides for the */
+    const BLASLONG stride_y = inc_y * sizeof(FLOAT);   /* strided loads/stores */
+
+    if ( n <= 0     )  return(0);
+
+    if ( beta == 0.0 ) {
+        if ( alpha == 0.0 ) {
+            /* y = 0 */
+            if (1 == inc_y) {
+                memset(&y[0], 0, n * sizeof(FLOAT));
+            } else {
+                size_t vl = VSETVL(n);
+                vy = VFMVVF_FLOAT(0.0, vl);
+                for ( ; n > 0; n -= vl, y += (BLASLONG)vl * inc_y) {
+                    vl = VSETVL(n);
+                    VSSEV_FLOAT(y, stride_y, vy, vl);
+                }
+            }
+        } else {
+            /* y = alpha * x */
+            if ((1 == inc_x) && (1 == inc_y)) {
+                for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
+                    vl = VSETVL(n);
+                    vx = VLEV_FLOAT(x, vl);
+                    vy = VFMULVF_FLOAT(vx, alpha, vl);
+                    VSEV_FLOAT (y, vy, vl);
+                }
+            } else if (1 == inc_x) {
+                for (size_t vl; n > 0; n -= vl, x += vl, y += (BLASLONG)vl * inc_y) {
+                    vl = VSETVL(n);
+                    vx = VLEV_FLOAT(x, vl);
+                    vy = VFMULVF_FLOAT(vx, alpha, vl);
+                    VSSEV_FLOAT (y, stride_y, vy, vl);
+                }
+            } else if (1 == inc_y) {
+                for (size_t vl; n > 0; n -= vl, x += (BLASLONG)vl * inc_x, y += vl) {
+                    vl = VSETVL(n);
+                    vx = VLSEV_FLOAT(x, stride_x, vl);
+                    vy = VFMULVF_FLOAT(vx, alpha, vl);
+                    VSEV_FLOAT (y, vy, vl);
+                }
+            } else {
+                for (size_t vl; n > 0; n -= vl, x += (BLASLONG)vl * inc_x, y += (BLASLONG)vl * inc_y) {
+                    vl = VSETVL(n);
+                    vx = VLSEV_FLOAT(x, stride_x, vl);
+                    vy = VFMULVF_FLOAT(vx, alpha, vl);
+                    VSSEV_FLOAT (y, stride_y, vy, vl);
+                }
+            }
+        }
+    } else {
+        if ( alpha == 0.0 ) {
+            /* y = beta * y */
+            if (1 == inc_y) {
+                for (size_t vl; n > 0; n -= vl, y += vl) {
+                    vl = VSETVL(n);
+                    vy = VLEV_FLOAT(y, vl);
+                    vy = VFMULVF_FLOAT(vy, beta, vl);
+                    VSEV_FLOAT (y, vy, vl);
+                }
+            } else {
+                for (size_t vl; n > 0; n -= vl, y += (BLASLONG)vl * inc_y) {
+                    vl = VSETVL(n);
+                    vy = VLSEV_FLOAT(y, stride_y, vl);
+                    vy = VFMULVF_FLOAT(vy, beta, vl);
+                    VSSEV_FLOAT (y, stride_y, vy, vl);
+                }
+            }
+        } else {
+            /* y = beta * y + alpha * x */
+            if ((1 == inc_x) && (1 == inc_y)) {
+                for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
+                    vl = VSETVL(n);
+                    vx = VLEV_FLOAT(x, vl);
+                    vy = VLEV_FLOAT(y, vl);
+                    vy = VFMULVF_FLOAT(vy, beta, vl);
+                    vy = VFMACCVF_FLOAT(vy, alpha, vx, vl);
+                    VSEV_FLOAT (y, vy, vl);
+                }
+            } else if (1 == inc_x) {
+                for (size_t vl; n > 0; n -= vl, x += vl, y += (BLASLONG)vl * inc_y) {
+                    vl = VSETVL(n);
+                    vx = VLEV_FLOAT(x, vl);
+                    vy = VLSEV_FLOAT(y, stride_y, vl);
+                    vy = VFMULVF_FLOAT(vy, beta, vl);
+                    vy = VFMACCVF_FLOAT(vy, alpha, vx, vl);
+                    VSSEV_FLOAT (y, stride_y, vy, vl);
+                }
+            } else if (1 == inc_y) {
+                for (size_t vl; n > 0; n -= vl, x += (BLASLONG)vl * inc_x, y += vl) {
+                    vl = VSETVL(n);
+                    vx = VLSEV_FLOAT(x, stride_x, vl);
+                    vy = VLEV_FLOAT(y, vl);
+                    vy = VFMULVF_FLOAT(vy, beta, vl);
+                    vy = VFMACCVF_FLOAT(vy, alpha, vx, vl);
+                    VSEV_FLOAT (y, vy, vl);
+                }
+            } else {
+                for (size_t vl; n > 0; n -= vl, x += (BLASLONG)vl * inc_x, y += (BLASLONG)vl * inc_y) {
+                    vl = VSETVL(n);
+                    vx = VLSEV_FLOAT(x, stride_x, vl);
+                    vy = VLSEV_FLOAT(y, stride_y, vl);
+                    vy = VFMULVF_FLOAT(vy, beta, vl);
+                    vy = VFMACCVF_FLOAT(vy, alpha, vx, vl);
+                    VSSEV_FLOAT (y, stride_y, vy, vl);
+                }
+            }
+        }
+    }
+    return(0);
+}
diff --git a/kernel/riscv64/axpby_vector.c b/kernel/riscv64/axpby_vector.c
index 988c57ec2..721aad2b0 100644
--- a/kernel/riscv64/axpby_vector.c
+++ b/kernel/riscv64/axpby_vector.c
@@ -27,31 +27,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSEV_FLOAT vse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMULVF_FLOAT vfmul_vf_f32m4
+#ifdef RISCV64_ZVL256B
+#       define LMUL m2
+#       if defined(DOUBLE)
+#               define ELEN 64
+#       else
+#               define ELEN 32
+#       endif
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSEV_FLOAT vse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMULVF_FLOAT vfmul_vf_f64m4
+#       define LMUL m4
+#       if defined(DOUBLE)
+#               define ELEN 64
+#       else
+#               define ELEN 32
+#       endif
 #endif
 
+#define _ /* expands to nothing; passed to JOIN for slots that need no token */
+#define JOIN2_X(x, y) x ## y /* pastes its two arguments exactly as received */
+#define JOIN2(x, y) JOIN2_X(x, y) /* extra level so ELEN, LMUL and RISCV_RVV(...) are expanded before pasting */
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) /* pastes five pieces, left to right */
+/* Names below are assembled from ELEN and LMUL; RISCV_RVV() is defined outside this file (presumably supplies the intrinsic prefix - confirm). */
+#define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _) /* ...vsetvl_e<ELEN><LMUL> */
+#define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _) /* vfloat<ELEN><LMUL>_t */
+#define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL) /* ...vle<ELEN>_v_f<ELEN><LMUL> */
+#define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL) /* ...vlse<ELEN>_v_f<ELEN><LMUL> */
+#define VSEV_FLOAT      JOIN(RISCV_RVV(vse),       ELEN,   _v_f,   ELEN,   LMUL) /* ...vse<ELEN>_v_f<ELEN><LMUL> */
+#define VSSEV_FLOAT     JOIN(RISCV_RVV(vsse),      ELEN,   _v_f,   ELEN,   LMUL) /* ...vsse<ELEN>_v_f<ELEN><LMUL> */
+#define VFMACCVF_FLOAT  JOIN(RISCV_RVV(vfmacc),    _vf_f,  ELEN,   LMUL,   _) /* ...vfmacc_vf_f<ELEN><LMUL> */
+#define VFMVVF_FLOAT    JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   LMUL,   _) /* ...vfmv_v_f_f<ELEN><LMUL> */
+#define VFMULVF_FLOAT   JOIN(RISCV_RVV(vfmul),     _vf_f,  ELEN,   LMUL,   _) /* ...vfmul_vf_f<ELEN><LMUL> */
+
 int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y)
 {
-	if (n < 0)  return(0);
+	if (n <= 0)  return(0);
 
 	BLASLONG i=0, j=0;
 	unsigned int gvl = 0;
@@ -60,6 +69,63 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *
 
 	BLASLONG stride_x, stride_y, ix = 0, iy = 0;
 
+	if (inc_x == 0 || inc_y == 0) { /* use trivial non-vectorized loop if either increment is zero */
+
+	if ( beta == 0.0 )
+	{
+
+		if ( alpha == 0.0 )
+		{
+			while(i < n)
+			{
+				y[iy] = 0.0 ;
+				iy += inc_y ;
+				i++ ;
+			}
+		}
+		else
+		{
+			while(i < n)
+			{
+				y[iy] = alpha * x[ix] ;
+				ix += inc_x ;
+				iy += inc_y ;
+				i++ ;
+			}
+
+
+		}
+
+	}
+	else
+	{
+
+		if ( alpha == 0.0 )
+		{
+			while(i < n)
+			{
+				y[iy] =  beta * y[iy] ;
+				iy += inc_y ;
+				i++ ;
+			}
+		}
+		else
+		{
+			while(i < n)
+			{
+				y[iy] = alpha * x[ix] + beta * y[iy] ;
+				ix += inc_x ;
+				iy += inc_y ;
+				i++ ;
+			}
+		}
+
+	}
+
+	return(0);
+
+	} else { /* vectorized approach for non-zero increments */
+
         if(beta == 0.0){
                 if(alpha == 0.0){//alpha == 0 && beta == 0
                         if(inc_y == 1){
@@ -372,5 +438,6 @@ int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *
                 }
         }
 	return(0);
+	}
 }
 
diff --git a/kernel/riscv64/axpy.c b/kernel/riscv64/axpy.c
index fb1094dd9..19d12ad3f 100644
--- a/kernel/riscv64/axpy.c
+++ b/kernel/riscv64/axpy.c
@@ -42,7 +42,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
 	BLASLONG i=0;
 	BLASLONG ix,iy;
 
-	if ( n < 0     )  return(0);
+	if ( n <= 0     )  return(0);
 	if ( da == 0.0 ) return(0);
 
 	ix = 0;
diff --git a/kernel/riscv64/axpy_rvv.c b/kernel/riscv64/axpy_rvv.c
new file mode 100644
index 000000000..2d5293f76
--- /dev/null
+++ b/kernel/riscv64/axpy_rvv.c
@@ -0,0 +1,131 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if defined(DOUBLE) /* double precision: 64-bit elements, m8 register groups, m1 for the reduction result */
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define FLOAT_V_T               vfloat64m8_t
+#define FLOAT_V_M1_T            vfloat64m1_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VSEV_FLOAT              __riscv_vse64_v_f64m8
+#define VSSEV_FLOAT             __riscv_vsse64_v_f64m8
+#define VSEV_FLOAT_M1           __riscv_vse64_v_f64m1
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m8
+#define VFREDSUMVS_FLOAT        __riscv_vfredusum_vs_f64m8_f64m1
+#else /* single precision: 32-bit elements, same register grouping */
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define FLOAT_V_T               vfloat32m8_t
+#define FLOAT_V_M1_T            vfloat32m1_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
+#define VSEV_FLOAT              __riscv_vse32_v_f32m8
+#define VSSEV_FLOAT             __riscv_vsse32_v_f32m8
+#define VSEV_FLOAT_M1           __riscv_vse32_v_f32m1
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m8
+#define VFREDSUMVS_FLOAT        __riscv_vfredusum_vs_f32m8_f32m1
+#endif /* DOUBLE */
+
+/*
+ * AXPY kernel built on RVV intrinsics: y[i*inc_y] += da * x[i*inc_x] for i in [0, n).
+ * Always returns 0; n <= 0 or da == 0 returns before touching x or y.
+ * dummy0, dummy1, dummy and dummy2 are not referenced by this kernel.
+ * With inc_y == 0 every update lands on y[0], so the result is
+ * y[0] + da * (sum of the n selected x elements).
+ */
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+    if ( n <= 0    ) return(0);
+    if ( da == 0.0 ) return(0);
+
+    FLOAT_V_T vx, vy;
+
+    if(inc_x == 1 && inc_y == 1) {
+        /* x and y both contiguous: unit-stride loads and store */
+        for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
+            vl = VSETVL(n);
+            vx = VLEV_FLOAT(x, vl);
+            vy = VLEV_FLOAT(y, vl);
+            vy = VFMACCVF_FLOAT(vy, da, vx, vl);
+            VSEV_FLOAT (y, vy, vl);
+        }
+    } else if (1 == inc_y) {
+        /* strided x, contiguous y */
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) {
+            vl = VSETVL(n);
+            vx = VLSEV_FLOAT(x, stride_x, vl);
+            vy = VLEV_FLOAT(y, vl);
+            vy = VFMACCVF_FLOAT(vy, da, vx, vl);
+            VSEV_FLOAT(y, vy, vl);
+        }
+    } else if (1 == inc_x && 0 != inc_y) {
+        /* contiguous x, strided y */
+        BLASLONG stride_y = inc_y * sizeof(FLOAT);
+        for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) {
+            vl = VSETVL(n);
+            vx = VLEV_FLOAT(x, vl);
+            vy = VLSEV_FLOAT(y, stride_y, vl);
+            vy = VFMACCVF_FLOAT(vy, da, vx, vl);
+            VSSEV_FLOAT(y, stride_y, vy, vl);
+        }
+    } else if( 0 == inc_y ) {
+        /* Every update targets y[0]: seed the m1 accumulator with y[0] exactly
+         * once (broadcasting it into a wide accumulator would add it once per
+         * lane) and fold each chunk over its own vl, so lanes past vl in a
+         * short final chunk never reach the sum. */
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+        FLOAT_V_M1_T vres = VFMVVF_FLOAT_M1( y[0], 1 );
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
+            vl = VSETVL(n);
+            vx = VLSEV_FLOAT(x, stride_x, vl);
+            /* vy = da * x for this chunk (fused multiply-add onto zeros) */
+            vy = VFMACCVF_FLOAT(VFMVVF_FLOAT( 0.0f, vl ), da, vx, vl);
+            vres = VFREDSUMVS_FLOAT( vy, vres, vl );
+        }
+        VSEV_FLOAT_M1(y, vres, 1);
+    } else {
+        /* general case: x and y both strided */
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+        BLASLONG stride_y = inc_y * sizeof(FLOAT);
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) {
+            vl = VSETVL(n);
+            vx = VLSEV_FLOAT(x, stride_x, vl);
+            vy = VLSEV_FLOAT(y, stride_y, vl);
+            vy = VFMACCVF_FLOAT(vy, da, vx, vl);
+            VSSEV_FLOAT(y, stride_y, vy, vl);
+        }
+    }
+
+    return(0);
+}
diff --git a/kernel/riscv64/axpy_vector.c b/kernel/riscv64/axpy_vector.c
index 98b9f6814..c77a18afa 100644
--- a/kernel/riscv64/axpy_vector.c
+++ b/kernel/riscv64/axpy_vector.c
@@ -25,24 +25,45 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
+
 #include "common.h"
 
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSEV_FLOAT vse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
+#ifdef RISCV64_ZVL256B
+#       define LMUL m2
+#       if defined(DOUBLE)
+#               define ELEN 64
+#       else
+#               define ELEN 32
+#       endif
+#else
+#       define LMUL m4
+#       if defined(DOUBLE)
+#               define ELEN 64
+#       else
+#               define ELEN 32
+#       endif
+#endif
+
+#define _ /* expands to nothing; passed to JOIN for slots that need no token */
+#define JOIN2_X(x, y) x ## y /* pastes its two arguments exactly as received */
+#define JOIN2(x, y) JOIN2_X(x, y) /* extra level so ELEN, LMUL and RISCV_RVV(...) are expanded before pasting */
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) /* pastes five pieces, left to right */
+/* Names below are assembled from ELEN and LMUL; the *_M1 variants hard-code m1. RISCV_RVV() is defined outside this file (presumably supplies the intrinsic prefix - confirm). */
+#define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _) /* ...vsetvl_e<ELEN><LMUL> */
+#define FLOAT_V_T       JOIN(vfloat,    	ELEN,   LMUL,   _t,     _) /* vfloat<ELEN><LMUL>_t */
+#define FLOAT_V_M1_T    JOIN(vfloat,    	ELEN,   m1,   _t,     _) /* vfloat<ELEN>m1_t */
+#define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL) /* ...vle<ELEN>_v_f<ELEN><LMUL> */
+#define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL) /* ...vlse<ELEN>_v_f<ELEN><LMUL> */
+#define VSEV_FLOAT      JOIN(RISCV_RVV(vse),       ELEN,   _v_f,   ELEN,   LMUL) /* ...vse<ELEN>_v_f<ELEN><LMUL> */
+#define VSSEV_FLOAT     JOIN(RISCV_RVV(vsse),      ELEN,   _v_f,   ELEN,   LMUL) /* ...vsse<ELEN>_v_f<ELEN><LMUL> */
+#define VFMACCVF_FLOAT  JOIN(RISCV_RVV(vfmacc),    _vf_f, 	ELEN,   LMUL,   _) /* ...vfmacc_vf_f<ELEN><LMUL> */
+#define VFMVVF_FLOAT    JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   LMUL,   _) /* ...vfmv_v_f_f<ELEN><LMUL> */
+#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   m1,     _) /* ...vfmv_v_f_f<ELEN>m1 */
+
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUMVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredusum_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))(v_res, va, vb, gvl)
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSEV_FLOAT vse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
+#define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
 #endif
 
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
@@ -53,7 +74,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
 	FLOAT_V_T vy0, vy1;
 	BLASLONG stride_x, stride_y;
 
-	if (n < 0)  return(0);
+	if (n <= 0)  return(0);
 	if (da == 0.0) return(0);
 
 	if (inc_x == 1 && inc_y == 1) {
@@ -111,7 +132,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
 			VSEV_FLOAT(&y[j], vy0, gvl);
 			j += gvl;
 		}
-        }else if(inc_x == 1){
+	} else if (1 == inc_x && 0 != inc_y) {
 		stride_y = inc_y * sizeof(FLOAT);
                 gvl = VSETVL(n);
                 if(gvl <= n/2){
@@ -139,6 +160,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
 			VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl);
 			j += gvl;
 		}
+	} else if( 0 == inc_y ) {
+	        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+	        size_t in_vl = VSETVL(n);
+	        vy0 = VFMVVF_FLOAT( y[0], in_vl );
+
+	        for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
+	            vl = VSETVL(n);
+	            vx0 = VLSEV_FLOAT(x, stride_x, vl);
+	            vy0 = VFMACCVF_FLOAT(vy0, da, vx0, vl);
+	        }
+	        FLOAT_V_M1_T v_res = VFMVVF_FLOAT_M1( 0.0f, 1 );
+	        v_res = VFREDSUMVS_FLOAT( vy0, v_res, in_vl );
+	        y[0] = EXTRACT_FLOAT(v_res);
 	}else{
 		stride_x = inc_x * sizeof(FLOAT);
 		stride_y = inc_y * sizeof(FLOAT);
diff --git a/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c b/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c
new file mode 100644
index 000000000..bd615389c
--- /dev/null
+++ b/kernel/riscv64/cgemm_kernel_8x4_zvl128b.c
@@ -0,0 +1,996 @@
+/*
+
+AUTOGENERATED KERNEL
+Script: ./kernel/riscv64/generate_kernel.py
+Settings:
+ LMUL=2
+ M=8
+ M_tail_scalar_from=2
+ N=4
+ __riscv_='__riscv_'
+ complex=True
+ conjugate=False
+ cpu='zvl128b'
+ force_acc_double=False
+ index_type='BLASLONG'
+ op='gemm'
+ param_precision='float'
+ reg_width_bits=128
+ tail_policy=''
+ trace=False
+
+Derived:
+ ELEN_ACC=32
+ ELEN_PARAM=32
+ LMUL_ACC=2
+ VFMACC='__riscv_vfmacc_vf_f32m2'
+ VFMUL='__riscv_vfmul_vf_f32m2'
+ VLEV='__riscv_vle32_v_f32m2'
+ VLSEV='__riscv_vlse32_v_f32m2'
+ VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2'
+ VMUL_TO_ACC='__riscv_vfmul_vf_f32m2'
+ VSETVL='__riscv_vsetvl_e32m2'
+ VSEV='__riscv_vse32_v_f32m2'
+ VSSEV='__riscv_vsse32_v_f32m2'
+ acc_vector_t='vfloat32m2_t'
+ output='cgemm_kernel_8x4_zvl128b.c'
+ param_scalar_t='float'
+ param_vector_t='vfloat32m2_t'
+
+*/
+
+#include "common.h"
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT) /* real = Ar*Br - Ai*Bi, imag = Ai*Br + Ar*Bi */
+#define S0 1 /* scalar tails: real += S0*Ar*Br + S1*Ai*Bi */
+#define S1 -1
+#define S2 1 /* scalar tails: imag += S2*Ai*Br + S3*Ar*Bi */
+#define S3 1
+#define VFMACC_RR __riscv_vfmsac /* vector path, real part: Br*Ar - (Ai*Bi), accumulator preloaded with Ai*Bi */
+#define VFMACC_RI __riscv_vfmacc /* vector path, imag part: (Ar*Bi) + Br*Ai, accumulator preloaded with Ar*Bi */
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) /* real = Ar*Br + Ai*Bi, imag = Ai*Br - Ar*Bi */
+#define S0 1
+#define S1 1
+#define S2 1
+#define S3 -1
+#define VFMACC_RR __riscv_vfmacc /* real part: (Ai*Bi) + Br*Ar */
+#define VFMACC_RI __riscv_vfmsac /* imag part: Br*Ai - (Ar*Bi) */
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT) /* real = Ar*Br + Ai*Bi, imag = Ar*Bi - Ai*Br */
+#define S0 1
+#define S1 1
+#define S2 -1
+#define S3 1
+#define VFMACC_RR __riscv_vfmacc /* real part: (Ai*Bi) + Br*Ar */
+#define VFMACC_RI __riscv_vfnmsac /* imag part: (Ar*Bi) - Br*Ai */
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC) /* real = Ar*Br - Ai*Bi, imag = -(Ai*Br + Ar*Bi) */
+#define S0 1
+#define S1 -1
+#define S2 -1
+#define S3 -1
+#define VFMACC_RR __riscv_vfmsac /* real part: Br*Ar - (Ai*Bi) */
+#define VFMACC_RI __riscv_vfnmacc /* imag part: -(Br*Ai) - (Ar*Bi) */
+#endif
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc)
+
+{
+    BLASLONG gvl = 0;
+    BLASLONG m_top = 0;
+    BLASLONG n_top = 0;
+
+    // -- MAIN PASS
+
+    for (BLASLONG j = 0; j < N / 4; j += 1) {
+        m_top = 0;
+        BLASLONG gvl = __riscv_vsetvl_e32m2(8);
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            float B0r = B[bi + 0 * 2 + 0];
+            float B0i = B[bi + 0 * 2 + 1];
+            float B1r = B[bi + 1 * 2 + 0];
+            float B1i = B[bi + 1 * 2 + 1];
+            float B2r = B[bi + 2 * 2 + 0];
+            float B2i = B[bi + 2 * 2 + 1];
+            float B3r = B[bi + 3 * 2 + 0];
+            float B3i = B[bi + 3 * 2 + 1];
+            bi += 4 * 2;
+
+            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ai += 8 * 2;
+
+            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 6 vector registers for temporaries
+            // performing 2 operations between reuses of temporaries
+            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+            vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
+            vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+            vfloat32m2_t ACC0r = tmp0r;
+            vfloat32m2_t ACC0i = tmp0i;
+            vfloat32m2_t ACC1r = tmp1r;
+            vfloat32m2_t ACC1i = tmp1i;
+            tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl);
+            tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl);
+            tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl);
+            tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
+            vfloat32m2_t ACC2r = tmp0r;
+            vfloat32m2_t ACC2i = tmp0i;
+            vfloat32m2_t ACC3r = tmp1r;
+            vfloat32m2_t ACC3i = tmp1i;
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                B1r = B[bi + 1 * 2 + 0];
+                B1i = B[bi + 1 * 2 + 1];
+                B2r = B[bi + 2 * 2 + 0];
+                B2i = B[bi + 2 * 2 + 1];
+                B3r = B[bi + 3 * 2 + 0];
+                B3i = B[bi + 3 * 2 + 1];
+                bi += 4 * 2;
+
+                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 8 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
+                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
+                ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl);
+                ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl);
+                ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl);
+                ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+
+            C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl);
+            C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl);
+            C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl);
+            C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl);
+            C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl);
+            C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl);
+            C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl);
+            C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
+            C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
+            C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);
+            C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl);
+            C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl);
+            C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl);
+            C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl);
+
+            m_top += 8;
+        }
+
+        // -- tails for main pass
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e32m2(4);
+
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            float B0r = B[bi + 0 * 2 + 0];
+            float B0i = B[bi + 0 * 2 + 1];
+            float B1r = B[bi + 1 * 2 + 0];
+            float B1i = B[bi + 1 * 2 + 1];
+            float B2r = B[bi + 2 * 2 + 0];
+            float B2i = B[bi + 2 * 2 + 1];
+            float B3r = B[bi + 3 * 2 + 0];
+            float B3i = B[bi + 3 * 2 + 1];
+            bi += 4 * 2;
+
+            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ai += 4 * 2;
+
+            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 6 vector registers for temporaries
+            // performing 2 operations between reuses of temporaries
+            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+            vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
+            vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+            vfloat32m2_t ACC0r = tmp0r;
+            vfloat32m2_t ACC0i = tmp0i;
+            vfloat32m2_t ACC1r = tmp1r;
+            vfloat32m2_t ACC1i = tmp1i;
+            tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl);
+            tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl);
+            tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl);
+            tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
+            vfloat32m2_t ACC2r = tmp0r;
+            vfloat32m2_t ACC2i = tmp0i;
+            vfloat32m2_t ACC3r = tmp1r;
+            vfloat32m2_t ACC3i = tmp1i;
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                B1r = B[bi + 1 * 2 + 0];
+                B1i = B[bi + 1 * 2 + 1];
+                B2r = B[bi + 2 * 2 + 0];
+                B2i = B[bi + 2 * 2 + 1];
+                B3r = B[bi + 3 * 2 + 0];
+                B3i = B[bi + 3 * 2 + 1];
+                bi += 4 * 2;
+
+                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 4 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
+                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
+                ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl);
+                ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl);
+                ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl);
+                ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t C2r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t C2i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t C3r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t C3i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+
+            C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl);
+            C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl);
+            C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl);
+            C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl);
+            C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl);
+            C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl);
+            C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl);
+            C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
+            C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
+            C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);
+            C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl);
+            C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl);
+            C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl);
+            C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl);
+
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            float result8 = 0;
+            float result9 = 0;
+            float result10 = 0;
+            float result11 = 0;
+            float result12 = 0;
+            float result13 = 0;
+            float result14 = 0;
+            float result15 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1];
+                result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1];
+                result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
+                result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
+                result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1];
+                result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1];
+                result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1];
+                result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1];
+                result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1];
+                result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1];
+                result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1];
+                result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1];
+                result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1];
+                result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1];
+                ai += 2 * 2;
+                bi += 4 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            float Cr, Ci;
+            Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
+            Cr += result0 * alphar;
+            Ci += result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 0 * ldc + 1) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 1) * 2 + 1];
+            Cr += result2 * alphar;
+            Ci += result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 0 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 1) * 2 + 1] = Ci;
+            Cr = C[(ci + 1 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 1 * ldc + 0) * 2 + 1];
+            Cr += result4 * alphar;
+            Ci += result5 * alphar;
+            Cr -= result5 * alphai;
+            Ci += result4 * alphai;
+            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 1 * ldc + 1) * 2 + 0];
+            Ci = C[(ci + 1 * ldc + 1) * 2 + 1];
+            Cr += result6 * alphar;
+            Ci += result7 * alphar;
+            Cr -= result7 * alphai;
+            Ci += result6 * alphai;
+            C[(ci + 1 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 1) * 2 + 1] = Ci;
+            Cr = C[(ci + 2 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 2 * ldc + 0) * 2 + 1];
+            Cr += result8 * alphar;
+            Ci += result9 * alphar;
+            Cr -= result9 * alphai;
+            Ci += result8 * alphai;
+            C[(ci + 2 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 2 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 2 * ldc + 1) * 2 + 0];
+            Ci = C[(ci + 2 * ldc + 1) * 2 + 1];
+            Cr += result10 * alphar;
+            Ci += result11 * alphar;
+            Cr -= result11 * alphai;
+            Ci += result10 * alphai;
+            C[(ci + 2 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 2 * ldc + 1) * 2 + 1] = Ci;
+            Cr = C[(ci + 3 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 3 * ldc + 0) * 2 + 1];
+            Cr += result12 * alphar;
+            Ci += result13 * alphar;
+            Cr -= result13 * alphai;
+            Ci += result12 * alphai;
+            C[(ci + 3 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 3 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 3 * ldc + 1) * 2 + 0];
+            Ci = C[(ci + 3 * ldc + 1) * 2 + 1];
+            Cr += result14 * alphar;
+            Ci += result15 * alphar;
+            Cr -= result15 * alphai;
+            Ci += result14 * alphai;
+            C[(ci + 3 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 3 * ldc + 1) * 2 + 1] = Ci;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
+                result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
+                result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1];
+                result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1];
+                result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1];
+                result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1];
+                ai += 1 * 2;
+                bi += 4 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            float Cr, Ci;
+            Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
+            Cr += result0 * alphar;
+            Ci += result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 1 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 1 * ldc + 0) * 2 + 1];
+            Cr += result2 * alphar;
+            Ci += result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 2 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 2 * ldc + 0) * 2 + 1];
+            Cr += result4 * alphar;
+            Ci += result5 * alphar;
+            Cr -= result5 * alphai;
+            Ci += result4 * alphai;
+            C[(ci + 2 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 2 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 3 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 3 * ldc + 0) * 2 + 1];
+            Cr += result6 * alphar;
+            Ci += result7 * alphar;
+            Cr -= result7 * alphai;
+            Ci += result6 * alphai;
+            C[(ci + 3 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 3 * ldc + 0) * 2 + 1] = Ci;
+            m_top += 1;
+        }
+
+        n_top += 4;
+    }
+
+    // -- tails for N=2
+
+    if (N & 2) {
+        gvl = __riscv_vsetvl_e32m2(8);
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            float B0r = B[bi + 0 * 2 + 0];
+            float B0i = B[bi + 0 * 2 + 1];
+            float B1r = B[bi + 1 * 2 + 0];
+            float B1i = B[bi + 1 * 2 + 1];
+            bi += 2 * 2;
+
+            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ai += 8 * 2;
+
+            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 10 vector registers for temporaries
+            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+            vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
+            vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+            vfloat32m2_t ACC0r = tmp0r;
+            vfloat32m2_t ACC0i = tmp0i;
+            vfloat32m2_t ACC1r = tmp1r;
+            vfloat32m2_t ACC1i = tmp1i;
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                B1r = B[bi + 1 * 2 + 0];
+                B1i = B[bi + 1 * 2 + 1];
+                bi += 2 * 2;
+
+                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 8 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+
+            C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl);
+            C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl);
+            C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl);
+            C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
+            C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
+            C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);
+
+            m_top += 8;
+        }
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e32m2(4);
+
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            float B0r = B[bi + 0 * 2 + 0];
+            float B0i = B[bi + 0 * 2 + 1];
+            float B1r = B[bi + 1 * 2 + 0];
+            float B1i = B[bi + 1 * 2 + 1];
+            bi += 2 * 2;
+
+            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ai += 4 * 2;
+
+            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 10 vector registers for temporaries
+            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+            vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
+            vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+            vfloat32m2_t ACC0r = tmp0r;
+            vfloat32m2_t ACC0i = tmp0i;
+            vfloat32m2_t ACC1r = tmp1r;
+            vfloat32m2_t ACC1i = tmp1i;
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                B1r = B[bi + 1 * 2 + 0];
+                B1i = B[bi + 1 * 2 + 1];
+                bi += 2 * 2;
+
+                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 4 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t C1r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t C1i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+
+            C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl);
+            C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl);
+            C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl);
+            C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
+            C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
+            C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);
+
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1];
+                result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1];
+                result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
+                result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
+                result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1];
+                result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1];
+                ai += 2 * 2;
+                bi += 2 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            float Cr, Ci;
+            Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
+            Cr += result0 * alphar;
+            Ci += result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 0 * ldc + 1) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 1) * 2 + 1];
+            Cr += result2 * alphar;
+            Ci += result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 0 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 1) * 2 + 1] = Ci;
+            Cr = C[(ci + 1 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 1 * ldc + 0) * 2 + 1];
+            Cr += result4 * alphar;
+            Ci += result5 * alphar;
+            Cr -= result5 * alphai;
+            Ci += result4 * alphai;
+            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 1 * ldc + 1) * 2 + 0];
+            Ci = C[(ci + 1 * ldc + 1) * 2 + 1];
+            Cr += result6 * alphar;
+            Ci += result7 * alphar;
+            Cr -= result7 * alphai;
+            Ci += result6 * alphai;
+            C[(ci + 1 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 1) * 2 + 1] = Ci;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
+                result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
+                ai += 1 * 2;
+                bi += 2 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            float Cr, Ci;
+            Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
+            Cr += result0 * alphar;
+            Ci += result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 1 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 1 * ldc + 0) * 2 + 1];
+            Cr += result2 * alphar;
+            Ci += result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
+            m_top += 1;
+        }
+
+        n_top += 2;
+    }
+
+    // -- tails for N=1
+
+    if (N & 1) {
+        gvl = __riscv_vsetvl_e32m2(8);
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            float B0r = B[bi + 0 * 2 + 0];
+            float B0i = B[bi + 0 * 2 + 1];
+            bi += 1 * 2;
+
+            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ai += 8 * 2;
+
+            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
+            // leaving 12 vector registers for temporaries
+            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            vfloat32m2_t ACC0r = tmp0r;
+            vfloat32m2_t ACC0i = tmp0i;
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                bi += 1 * 2;
+
+                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 8 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+
+            C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl);
+            C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+
+            m_top += 8;
+        }
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e32m2(4);
+
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            float B0r = B[bi + 0 * 2 + 0];
+            float B0i = B[bi + 0 * 2 + 1];
+            bi += 1 * 2;
+
+            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ai += 4 * 2;
+
+            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
+            // leaving 12 vector registers for temporaries
+            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            vfloat32m2_t ACC0r = tmp0r;
+            vfloat32m2_t ACC0i = tmp0i;
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                bi += 1 * 2;
+
+                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 4 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t C0r = __riscv_vlse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t C0i = __riscv_vlse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+
+            C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl);
+            C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1];
+                result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1];
+                ai += 2 * 2;
+                bi += 1 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            float Cr, Ci;
+            Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
+            Cr += result0 * alphar;
+            Ci += result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 0 * ldc + 1) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 1) * 2 + 1];
+            Cr += result2 * alphar;
+            Ci += result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 0 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 1) * 2 + 1] = Ci;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            float result0 = 0;
+            float result1 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                ai += 1 * 2;
+                bi += 1 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            float Cr, Ci;
+            Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
+            Cr += result0 * alphar;
+            Ci += result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            m_top += 1;
+        }
+
+        n_top += 1;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/cgemm_kernel_8x8_zvl256b.c b/kernel/riscv64/cgemm_kernel_8x8_zvl256b.c
new file mode 100644
index 000000000..7980c029a
--- /dev/null
+++ b/kernel/riscv64/cgemm_kernel_8x8_zvl256b.c
@@ -0,0 +1,1931 @@
+/*
+
+AUTOGENERATED KERNEL
+Settings:
+ LMUL=1
+ M=8
+ M_tail_scalar_from=1
+ N=8
+ __riscv_='__riscv_'
+ complex=True
+ conjugate=False
+ cpu='zvl256b'
+ force_acc_double=False
+ index_type='BLASLONG'
+ op='gemm'
+ param_precision='float'
+ reg_width_bits=256
+ tail_policy=''
+ trace=False
+
+Derived:
+ ELEN_ACC=32
+ ELEN_PARAM=32
+ LMUL_ACC=1
+ VFMACC='__riscv_vfmacc_vf_f32m1'
+ VFMUL='__riscv_vfmul_vf_f32m1'
+ VLEV='__riscv_vle32_v_f32m1'
+ VLSEV='__riscv_vlse32_v_f32m1'
+ VMACC_TO_ACC='__riscv_vfmacc_vf_f32m1'
+ VMUL_TO_ACC='__riscv_vfmul_vf_f32m1'
+ VSETVL='__riscv_vsetvl_e32m1'
+ VSEV='__riscv_vse32_v_f32m1'
+ VSSEV='__riscv_vsse32_v_f32m1'
+ acc_vector_t='vfloat32m1_t'
+ output='cgemm_kernel_8x8_zvl256b.c'
+ param_scalar_t='float'
+ param_vector_t='vfloat32m1_t'
+
+*/
+
+#include "common.h"
+
+
+
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)  // variant computing a*b (no conjugation in the signs below)
+    #define S0  1  // S0..S3: scalar sign factors matching the VFMACC_* choice; not used in the vector code nearby (presumably for scalar tails - confirm)
+    #define S1 -1
+    #define S2  1
+    #define S3  1
+    #define VFMACC_RR __riscv_vfmsac  // applied to tmp = Ai*Bi: real = Ar*Br - Ai*Bi
+    #define VFMACC_RI __riscv_vfmacc  // applied to tmp = Ar*Bi: imag = Ai*Br + Ar*Bi
+#endif
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)  // variant computing a*conj(b)
+    #define S0  1
+    #define S1  1
+    #define S2  1
+    #define S3 -1
+    #define VFMACC_RR __riscv_vfmacc  // applied to tmp = Ai*Bi: real = Ar*Br + Ai*Bi
+    #define VFMACC_RI __riscv_vfmsac  // applied to tmp = Ar*Bi: imag = Ai*Br - Ar*Bi
+#endif
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)  // variant computing conj(a)*b
+    #define S0  1
+    #define S1  1
+    #define S2 -1
+    #define S3  1
+    #define VFMACC_RR __riscv_vfmacc  // applied to tmp = Ai*Bi: real = Ar*Br + Ai*Bi
+    #define VFMACC_RI __riscv_vfnmsac  // applied to tmp = Ar*Bi: imag = -(Ai*Br) + Ar*Bi
+#endif
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)  // variant computing conj(a)*conj(b)
+    #define S0  1
+    #define S1 -1
+    #define S2 -1
+    #define S3 -1
+    #define VFMACC_RR __riscv_vfmsac  // applied to tmp = Ai*Bi: real = Ar*Br - Ai*Bi
+    #define VFMACC_RI __riscv_vfnmacc  // applied to tmp = Ar*Bi: imag = -(Ai*Br) - Ar*Bi
+#endif
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc)
+
+{
+    BLASLONG gvl = 0;
+    BLASLONG m_top = 0;
+    BLASLONG n_top = 0;
+
+
+    // -- MAIN PASS
+
+    for (BLASLONG j=0; j<N/8; j+=1) {
+        m_top = 0;
+        BLASLONG gvl = __riscv_vsetvl_e32m1(8);
+
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            float B2r = B[bi+2*2+0];
+            float B2i = B[bi+2*2+1];
+            float B3r = B[bi+3*2+0];
+            float B3i = B[bi+3*2+1];
+            float B4r = B[bi+4*2+0];
+            float B4i = B[bi+4*2+1];
+            float B5r = B[bi+5*2+0];
+            float B5i = B[bi+5*2+1];
+            float B6r = B[bi+6*2+0];
+            float B6i = B[bi+6*2+1];
+            float B7r = B[bi+7*2+0];
+            float B7i = B[bi+7*2+1];
+            bi += 8*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 8*2;
+
+            // 2 vector regs to hold A array contents, 16 regs to hold values accumulated over k
+            // leaving 14 vector registers for temporaries
+            // performing 4 operations between reuses of temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+            vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+            vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+            vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+            vfloat32m1_t ACC2r = tmp2r;
+            vfloat32m1_t ACC2i = tmp2i;
+            vfloat32m1_t ACC3r = tmp3r;
+            vfloat32m1_t ACC3i = tmp3i;
+            tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
+            tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
+            tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
+            tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
+            tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
+            tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
+            tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
+            tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
+            vfloat32m1_t ACC4r = tmp0r;
+            vfloat32m1_t ACC4i = tmp0i;
+            vfloat32m1_t ACC5r = tmp1r;
+            vfloat32m1_t ACC5i = tmp1i;
+            vfloat32m1_t ACC6r = tmp2r;
+            vfloat32m1_t ACC6i = tmp2i;
+            vfloat32m1_t ACC7r = tmp3r;
+            vfloat32m1_t ACC7i = tmp3i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                B4r = B[bi+4*2+0];
+                B4i = B[bi+4*2+1];
+                B5r = B[bi+5*2+0];
+                B5i = B[bi+5*2+1];
+                B6r = B[bi+6*2+0];
+                B6i = B[bi+6*2+1];
+                B7r = B[bi+7*2+0];
+                B7i = B[bi+7*2+1];
+                bi += 8*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 8*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
+                ACC4r = __riscv_vfadd( ACC4r, tmp0r, gvl);
+                ACC4i = __riscv_vfadd( ACC4i, tmp0i, gvl);
+                ACC5r = __riscv_vfadd( ACC5r, tmp1r, gvl);
+                ACC5i = __riscv_vfadd( ACC5i, tmp1i, gvl);
+                ACC6r = __riscv_vfadd( ACC6r, tmp2r, gvl);
+                ACC6i = __riscv_vfadd( ACC6i, tmp2i, gvl);
+                ACC7r = __riscv_vfadd( ACC7r, tmp3r, gvl);
+                ACC7i = __riscv_vfadd( ACC7i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C0i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C1r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C1i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C2r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C2i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C3r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C3i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C4r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C4i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C5r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C5i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C6r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C6i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C7r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C7i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
+            C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
+            C2r = __riscv_vfmacc( C2r, alphar, ACC2r, gvl );
+            C2i = __riscv_vfmacc( C2i, alphar, ACC2i, gvl );
+            C3r = __riscv_vfmacc( C3r, alphar, ACC3r, gvl );
+            C3i = __riscv_vfmacc( C3i, alphar, ACC3i, gvl );
+            C4r = __riscv_vfmacc( C4r, alphar, ACC4r, gvl );
+            C4i = __riscv_vfmacc( C4i, alphar, ACC4i, gvl );
+            C5r = __riscv_vfmacc( C5r, alphar, ACC5r, gvl );
+            C5i = __riscv_vfmacc( C5i, alphar, ACC5i, gvl );
+            C6r = __riscv_vfmacc( C6r, alphar, ACC6r, gvl );
+            C6i = __riscv_vfmacc( C6i, alphar, ACC6i, gvl );
+            C7r = __riscv_vfmacc( C7r, alphar, ACC7r, gvl );
+            C7i = __riscv_vfmacc( C7i, alphar, ACC7i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+            C4r = __riscv_vfnmsac( C4r, alphai, ACC4i, gvl );
+            C4i = __riscv_vfmacc ( C4i, alphai, ACC4r, gvl );
+            C5r = __riscv_vfnmsac( C5r, alphai, ACC5i, gvl );
+            C5i = __riscv_vfmacc ( C5i, alphai, ACC5r, gvl );
+            C6r = __riscv_vfnmsac( C6r, alphai, ACC6i, gvl );
+            C6i = __riscv_vfmacc ( C6i, alphai, ACC6r, gvl );
+            C7r = __riscv_vfnmsac( C7r, alphai, ACC7i, gvl );
+            C7i = __riscv_vfmacc ( C7i, alphai, ACC7r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C4r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C4i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C5r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C5i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C6r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C6i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C7r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C7i, gvl);
+            
+            m_top += 8;
+        }
+
+
+
+        // -- tails for main pass
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e32m1(4);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            float B2r = B[bi+2*2+0];
+            float B2i = B[bi+2*2+1];
+            float B3r = B[bi+3*2+0];
+            float B3i = B[bi+3*2+1];
+            float B4r = B[bi+4*2+0];
+            float B4i = B[bi+4*2+1];
+            float B5r = B[bi+5*2+0];
+            float B5i = B[bi+5*2+1];
+            float B6r = B[bi+6*2+0];
+            float B6i = B[bi+6*2+1];
+            float B7r = B[bi+7*2+0];
+            float B7i = B[bi+7*2+1];
+            bi += 8*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 4*2;
+
+            // 2 vector regs to hold A array contents, 16 regs to hold values accumulated over k
+            // leaving 14 vector registers for temporaries
+            // performing 4 operations between reuses of temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+            vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+            vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+            vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+            vfloat32m1_t ACC2r = tmp2r;
+            vfloat32m1_t ACC2i = tmp2i;
+            vfloat32m1_t ACC3r = tmp3r;
+            vfloat32m1_t ACC3i = tmp3i;
+            tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
+            tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
+            tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
+            tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
+            tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
+            tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
+            tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
+            tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
+            vfloat32m1_t ACC4r = tmp0r;
+            vfloat32m1_t ACC4i = tmp0i;
+            vfloat32m1_t ACC5r = tmp1r;
+            vfloat32m1_t ACC5i = tmp1i;
+            vfloat32m1_t ACC6r = tmp2r;
+            vfloat32m1_t ACC6i = tmp2i;
+            vfloat32m1_t ACC7r = tmp3r;
+            vfloat32m1_t ACC7i = tmp3i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                B4r = B[bi+4*2+0];
+                B4i = B[bi+4*2+1];
+                B5r = B[bi+5*2+0];
+                B5i = B[bi+5*2+1];
+                B6r = B[bi+6*2+0];
+                B6i = B[bi+6*2+1];
+                B7r = B[bi+7*2+0];
+                B7i = B[bi+7*2+1];
+                bi += 8*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 4*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
+                ACC4r = __riscv_vfadd( ACC4r, tmp0r, gvl);
+                ACC4i = __riscv_vfadd( ACC4i, tmp0i, gvl);
+                ACC5r = __riscv_vfadd( ACC5r, tmp1r, gvl);
+                ACC5i = __riscv_vfadd( ACC5i, tmp1i, gvl);
+                ACC6r = __riscv_vfadd( ACC6r, tmp2r, gvl);
+                ACC6i = __riscv_vfadd( ACC6i, tmp2i, gvl);
+                ACC7r = __riscv_vfadd( ACC7r, tmp3r, gvl);
+                ACC7i = __riscv_vfadd( ACC7i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C0i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C1r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C1i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C2r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C2i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C3r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C3i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C4r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C4i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C5r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C5i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C6r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C6i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C7r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C7i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
+            C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
+            C2r = __riscv_vfmacc( C2r, alphar, ACC2r, gvl );
+            C2i = __riscv_vfmacc( C2i, alphar, ACC2i, gvl );
+            C3r = __riscv_vfmacc( C3r, alphar, ACC3r, gvl );
+            C3i = __riscv_vfmacc( C3i, alphar, ACC3i, gvl );
+            C4r = __riscv_vfmacc( C4r, alphar, ACC4r, gvl );
+            C4i = __riscv_vfmacc( C4i, alphar, ACC4i, gvl );
+            C5r = __riscv_vfmacc( C5r, alphar, ACC5r, gvl );
+            C5i = __riscv_vfmacc( C5i, alphar, ACC5i, gvl );
+            C6r = __riscv_vfmacc( C6r, alphar, ACC6r, gvl );
+            C6i = __riscv_vfmacc( C6i, alphar, ACC6i, gvl );
+            C7r = __riscv_vfmacc( C7r, alphar, ACC7r, gvl );
+            C7i = __riscv_vfmacc( C7i, alphar, ACC7i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+            C4r = __riscv_vfnmsac( C4r, alphai, ACC4i, gvl );
+            C4i = __riscv_vfmacc ( C4i, alphai, ACC4r, gvl );
+            C5r = __riscv_vfnmsac( C5r, alphai, ACC5i, gvl );
+            C5i = __riscv_vfmacc ( C5i, alphai, ACC5r, gvl );
+            C6r = __riscv_vfnmsac( C6r, alphai, ACC6i, gvl );
+            C6i = __riscv_vfmacc ( C6i, alphai, ACC6r, gvl );
+            C7r = __riscv_vfnmsac( C7r, alphai, ACC7i, gvl );
+            C7i = __riscv_vfmacc ( C7i, alphai, ACC7r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C4r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C4i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C5r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C5i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C6r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C6i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C7r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C7i, gvl);
+            
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            gvl = __riscv_vsetvl_e32m1(2);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            float B2r = B[bi+2*2+0];
+            float B2i = B[bi+2*2+1];
+            float B3r = B[bi+3*2+0];
+            float B3i = B[bi+3*2+1];
+            float B4r = B[bi+4*2+0];
+            float B4i = B[bi+4*2+1];
+            float B5r = B[bi+5*2+0];
+            float B5i = B[bi+5*2+1];
+            float B6r = B[bi+6*2+0];
+            float B6i = B[bi+6*2+1];
+            float B7r = B[bi+7*2+0];
+            float B7i = B[bi+7*2+1];
+            bi += 8*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 2*2;
+
+            // 2 vector regs to hold A array contents, 16 regs to hold values accumulated over k
+            // leaving 14 vector registers for temporaries
+            // performing 4 operations between reuses of temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+            vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+            vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+            vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+            vfloat32m1_t ACC2r = tmp2r;
+            vfloat32m1_t ACC2i = tmp2i;
+            vfloat32m1_t ACC3r = tmp3r;
+            vfloat32m1_t ACC3i = tmp3i;
+            tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
+            tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
+            tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
+            tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
+            tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
+            tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
+            tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
+            tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
+            vfloat32m1_t ACC4r = tmp0r;
+            vfloat32m1_t ACC4i = tmp0i;
+            vfloat32m1_t ACC5r = tmp1r;
+            vfloat32m1_t ACC5i = tmp1i;
+            vfloat32m1_t ACC6r = tmp2r;
+            vfloat32m1_t ACC6i = tmp2i;
+            vfloat32m1_t ACC7r = tmp3r;
+            vfloat32m1_t ACC7i = tmp3i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                B4r = B[bi+4*2+0];
+                B4i = B[bi+4*2+1];
+                B5r = B[bi+5*2+0];
+                B5i = B[bi+5*2+1];
+                B6r = B[bi+6*2+0];
+                B6i = B[bi+6*2+1];
+                B7r = B[bi+7*2+0];
+                B7i = B[bi+7*2+1];
+                bi += 8*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 2*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
+                ACC4r = __riscv_vfadd( ACC4r, tmp0r, gvl);
+                ACC4i = __riscv_vfadd( ACC4i, tmp0i, gvl);
+                ACC5r = __riscv_vfadd( ACC5r, tmp1r, gvl);
+                ACC5i = __riscv_vfadd( ACC5i, tmp1i, gvl);
+                ACC6r = __riscv_vfadd( ACC6r, tmp2r, gvl);
+                ACC6i = __riscv_vfadd( ACC6i, tmp2i, gvl);
+                ACC7r = __riscv_vfadd( ACC7r, tmp3r, gvl);
+                ACC7i = __riscv_vfadd( ACC7i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C0i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C1r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C1i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C2r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C2i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C3r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C3i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C4r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C4i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C5r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C5i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C6r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C6i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C7r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C7i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
+            C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
+            C2r = __riscv_vfmacc( C2r, alphar, ACC2r, gvl );
+            C2i = __riscv_vfmacc( C2i, alphar, ACC2i, gvl );
+            C3r = __riscv_vfmacc( C3r, alphar, ACC3r, gvl );
+            C3i = __riscv_vfmacc( C3i, alphar, ACC3i, gvl );
+            C4r = __riscv_vfmacc( C4r, alphar, ACC4r, gvl );
+            C4i = __riscv_vfmacc( C4i, alphar, ACC4i, gvl );
+            C5r = __riscv_vfmacc( C5r, alphar, ACC5r, gvl );
+            C5i = __riscv_vfmacc( C5i, alphar, ACC5i, gvl );
+            C6r = __riscv_vfmacc( C6r, alphar, ACC6r, gvl );
+            C6i = __riscv_vfmacc( C6i, alphar, ACC6i, gvl );
+            C7r = __riscv_vfmacc( C7r, alphar, ACC7r, gvl );
+            C7i = __riscv_vfmacc( C7i, alphar, ACC7i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+            C4r = __riscv_vfnmsac( C4r, alphai, ACC4i, gvl );
+            C4i = __riscv_vfmacc ( C4i, alphai, ACC4r, gvl );
+            C5r = __riscv_vfnmsac( C5r, alphai, ACC5i, gvl );
+            C5i = __riscv_vfmacc ( C5i, alphai, ACC5r, gvl );
+            C6r = __riscv_vfnmsac( C6r, alphai, ACC6i, gvl );
+            C6i = __riscv_vfmacc ( C6i, alphai, ACC6r, gvl );
+            C7r = __riscv_vfnmsac( C7r, alphai, ACC7i, gvl );
+            C7i = __riscv_vfmacc ( C7i, alphai, ACC7r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C4r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C4i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C5r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C5i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C6r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C6i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C7r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C7i, gvl);
+            
+            m_top += 2;
+        }
+
+
+        if( M & 1 ) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            float result8 = 0;
+            float result9 = 0;
+            float result10 = 0;
+            float result11 = 0;
+            float result12 = 0;
+            float result13 = 0;
+            float result14 = 0;
+            float result15 = 0;
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
+                result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
+                result2+=S0*A[ai+0+0]*B[bi+2+0] + S1*A[ai+0+1]*B[bi+2+1];
+                result3+=S2*A[ai+0+1]*B[bi+2+0] + S3*A[ai+0+0]*B[bi+2+1];
+                result4+=S0*A[ai+0+0]*B[bi+4+0] + S1*A[ai+0+1]*B[bi+4+1];
+                result5+=S2*A[ai+0+1]*B[bi+4+0] + S3*A[ai+0+0]*B[bi+4+1];
+                result6+=S0*A[ai+0+0]*B[bi+6+0] + S1*A[ai+0+1]*B[bi+6+1];
+                result7+=S2*A[ai+0+1]*B[bi+6+0] + S3*A[ai+0+0]*B[bi+6+1];
+                result8+=S0*A[ai+0+0]*B[bi+8+0] + S1*A[ai+0+1]*B[bi+8+1];
+                result9+=S2*A[ai+0+1]*B[bi+8+0] + S3*A[ai+0+0]*B[bi+8+1];
+                result10+=S0*A[ai+0+0]*B[bi+10+0] + S1*A[ai+0+1]*B[bi+10+1];
+                result11+=S2*A[ai+0+1]*B[bi+10+0] + S3*A[ai+0+0]*B[bi+10+1];
+                result12+=S0*A[ai+0+0]*B[bi+12+0] + S1*A[ai+0+1]*B[bi+12+1];
+                result13+=S2*A[ai+0+1]*B[bi+12+0] + S3*A[ai+0+0]*B[bi+12+1];
+                result14+=S0*A[ai+0+0]*B[bi+14+0] + S1*A[ai+0+1]*B[bi+14+1];
+                result15+=S2*A[ai+0+1]*B[bi+14+0] + S3*A[ai+0+0]*B[bi+14+1];
+                ai+=1*2;
+                bi+=8*2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            float Cr, Ci;
+            Cr = C[(ci+0*ldc+0)*2+0];
+            Ci = C[(ci+0*ldc+0)*2+1];
+            Cr += result0*alphar;
+            Ci += result1*alphar;
+            Cr -= result1*alphai;
+            Ci += result0*alphai;
+            C[(ci+0*ldc+0)*2+0] = Cr;
+            C[(ci+0*ldc+0)*2+1] = Ci;
+            Cr = C[(ci+1*ldc+0)*2+0];
+            Ci = C[(ci+1*ldc+0)*2+1];
+            Cr += result2*alphar;
+            Ci += result3*alphar;
+            Cr -= result3*alphai;
+            Ci += result2*alphai;
+            C[(ci+1*ldc+0)*2+0] = Cr;
+            C[(ci+1*ldc+0)*2+1] = Ci;
+            Cr = C[(ci+2*ldc+0)*2+0];
+            Ci = C[(ci+2*ldc+0)*2+1];
+            Cr += result4*alphar;
+            Ci += result5*alphar;
+            Cr -= result5*alphai;
+            Ci += result4*alphai;
+            C[(ci+2*ldc+0)*2+0] = Cr;
+            C[(ci+2*ldc+0)*2+1] = Ci;
+            Cr = C[(ci+3*ldc+0)*2+0];
+            Ci = C[(ci+3*ldc+0)*2+1];
+            Cr += result6*alphar;
+            Ci += result7*alphar;
+            Cr -= result7*alphai;
+            Ci += result6*alphai;
+            C[(ci+3*ldc+0)*2+0] = Cr;
+            C[(ci+3*ldc+0)*2+1] = Ci;
+            Cr = C[(ci+4*ldc+0)*2+0];
+            Ci = C[(ci+4*ldc+0)*2+1];
+            Cr += result8*alphar;
+            Ci += result9*alphar;
+            Cr -= result9*alphai;
+            Ci += result8*alphai;
+            C[(ci+4*ldc+0)*2+0] = Cr;
+            C[(ci+4*ldc+0)*2+1] = Ci;
+            Cr = C[(ci+5*ldc+0)*2+0];
+            Ci = C[(ci+5*ldc+0)*2+1];
+            Cr += result10*alphar;
+            Ci += result11*alphar;
+            Cr -= result11*alphai;
+            Ci += result10*alphai;
+            C[(ci+5*ldc+0)*2+0] = Cr;
+            C[(ci+5*ldc+0)*2+1] = Ci;
+            Cr = C[(ci+6*ldc+0)*2+0];
+            Ci = C[(ci+6*ldc+0)*2+1];
+            Cr += result12*alphar;
+            Ci += result13*alphar;
+            Cr -= result13*alphai;
+            Ci += result12*alphai;
+            C[(ci+6*ldc+0)*2+0] = Cr;
+            C[(ci+6*ldc+0)*2+1] = Ci;
+            Cr = C[(ci+7*ldc+0)*2+0];
+            Ci = C[(ci+7*ldc+0)*2+1];
+            Cr += result14*alphar;
+            Ci += result15*alphar;
+            Cr -= result15*alphai;
+            Ci += result14*alphai;
+            C[(ci+7*ldc+0)*2+0] = Cr;
+            C[(ci+7*ldc+0)*2+1] = Ci;
+            m_top+=1;
+        }
+
+        n_top += 8;
+    }
+
+
+
+    // -- tails for N=4
+
+    if( N & 4 ) {
+        gvl = __riscv_vsetvl_e32m1(8);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            float B2r = B[bi+2*2+0];
+            float B2i = B[bi+2*2+1];
+            float B3r = B[bi+3*2+0];
+            float B3i = B[bi+3*2+1];
+            bi += 4*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 8*2;
+
+            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 22 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+            vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+            vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+            vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+            vfloat32m1_t ACC2r = tmp2r;
+            vfloat32m1_t ACC2i = tmp2i;
+            vfloat32m1_t ACC3r = tmp3r;
+            vfloat32m1_t ACC3i = tmp3i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                bi += 4*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 8*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C0i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C1r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C1i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C2r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C2i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C3r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C3i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
+            C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
+            C2r = __riscv_vfmacc( C2r, alphar, ACC2r, gvl );
+            C2i = __riscv_vfmacc( C2i, alphar, ACC2i, gvl );
+            C3r = __riscv_vfmacc( C3r, alphar, ACC3r, gvl );
+            C3i = __riscv_vfmacc( C3i, alphar, ACC3i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e32m1(4);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            float B2r = B[bi+2*2+0];
+            float B2i = B[bi+2*2+1];
+            float B3r = B[bi+3*2+0];
+            float B3i = B[bi+3*2+1];
+            bi += 4*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 4*2;
+
+            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 22 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+            vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+            vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+            vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+            vfloat32m1_t ACC2r = tmp2r;
+            vfloat32m1_t ACC2i = tmp2i;
+            vfloat32m1_t ACC3r = tmp3r;
+            vfloat32m1_t ACC3i = tmp3i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                bi += 4*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 4*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C0i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C1r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C1i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C2r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C2i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C3r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C3i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
+            C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
+            C2r = __riscv_vfmacc( C2r, alphar, ACC2r, gvl );
+            C2i = __riscv_vfmacc( C2i, alphar, ACC2i, gvl );
+            C3r = __riscv_vfmacc( C3r, alphar, ACC3r, gvl );
+            C3i = __riscv_vfmacc( C3i, alphar, ACC3i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            gvl = __riscv_vsetvl_e32m1(2);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            float B2r = B[bi+2*2+0];
+            float B2i = B[bi+2*2+1];
+            float B3r = B[bi+3*2+0];
+            float B3i = B[bi+3*2+1];
+            bi += 4*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 2*2;
+
+            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 22 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+            vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+            vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+            vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+            vfloat32m1_t ACC2r = tmp2r;
+            vfloat32m1_t ACC2i = tmp2i;
+            vfloat32m1_t ACC3r = tmp3r;
+            vfloat32m1_t ACC3i = tmp3i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                bi += 4*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 2*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C0i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C1r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C1i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C2r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C2i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C3r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C3i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
+            C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
+            C2r = __riscv_vfmacc( C2r, alphar, ACC2r, gvl );
+            C2i = __riscv_vfmacc( C2i, alphar, ACC2i, gvl );
+            C3r = __riscv_vfmacc( C3r, alphar, ACC3r, gvl );
+            C3i = __riscv_vfmacc( C3i, alphar, ACC3i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            
+            m_top += 2;
+        }
+
+
+        if( M & 1 ) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
+                result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
+                result2+=S0*A[ai+0+0]*B[bi+2+0] + S1*A[ai+0+1]*B[bi+2+1];
+                result3+=S2*A[ai+0+1]*B[bi+2+0] + S3*A[ai+0+0]*B[bi+2+1];
+                result4+=S0*A[ai+0+0]*B[bi+4+0] + S1*A[ai+0+1]*B[bi+4+1];
+                result5+=S2*A[ai+0+1]*B[bi+4+0] + S3*A[ai+0+0]*B[bi+4+1];
+                result6+=S0*A[ai+0+0]*B[bi+6+0] + S1*A[ai+0+1]*B[bi+6+1];
+                result7+=S2*A[ai+0+1]*B[bi+6+0] + S3*A[ai+0+0]*B[bi+6+1];
+                ai+=1*2;
+                bi+=4*2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            float Cr, Ci;
+            Cr = C[(ci+0*ldc+0)*2+0];
+            Ci = C[(ci+0*ldc+0)*2+1];
+            Cr += result0*alphar;
+            Ci += result1*alphar;
+            Cr -= result1*alphai;
+            Ci += result0*alphai;
+            C[(ci+0*ldc+0)*2+0] = Cr;
+            C[(ci+0*ldc+0)*2+1] = Ci;
+            Cr = C[(ci+1*ldc+0)*2+0];
+            Ci = C[(ci+1*ldc+0)*2+1];
+            Cr += result2*alphar;
+            Ci += result3*alphar;
+            Cr -= result3*alphai;
+            Ci += result2*alphai;
+            C[(ci+1*ldc+0)*2+0] = Cr;
+            C[(ci+1*ldc+0)*2+1] = Ci;
+            Cr = C[(ci+2*ldc+0)*2+0];
+            Ci = C[(ci+2*ldc+0)*2+1];
+            Cr += result4*alphar;
+            Ci += result5*alphar;
+            Cr -= result5*alphai;
+            Ci += result4*alphai;
+            C[(ci+2*ldc+0)*2+0] = Cr;
+            C[(ci+2*ldc+0)*2+1] = Ci;
+            Cr = C[(ci+3*ldc+0)*2+0];
+            Ci = C[(ci+3*ldc+0)*2+1];
+            Cr += result6*alphar;
+            Ci += result7*alphar;
+            Cr -= result7*alphai;
+            Ci += result6*alphai;
+            C[(ci+3*ldc+0)*2+0] = Cr;
+            C[(ci+3*ldc+0)*2+1] = Ci;
+            m_top+=1;
+        }
+
+        n_top += 4;
+    }
+
+
+
+    // -- tails for N=2
+
+    if( N & 2 ) {
+        gvl = __riscv_vsetvl_e32m1(8);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            bi += 2*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 8*2;
+
+            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 26 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                bi += 2*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 8*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C0i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C1r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C1i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
+            C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e32m1(4);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            bi += 2*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 4*2;
+
+            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 26 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                bi += 2*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 4*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C0i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C1r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C1i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
+            C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            gvl = __riscv_vsetvl_e32m1(2);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            bi += 2*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 2*2;
+
+            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 26 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                bi += 2*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 2*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C0i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat32m1_t C1r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C1i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
+            C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            
+            m_top += 2;
+        }
+
+
+        if( M & 1 ) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
+                result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
+                result2+=S0*A[ai+0+0]*B[bi+2+0] + S1*A[ai+0+1]*B[bi+2+1];
+                result3+=S2*A[ai+0+1]*B[bi+2+0] + S3*A[ai+0+0]*B[bi+2+1];
+                ai+=1*2;
+                bi+=2*2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            float Cr, Ci;
+            Cr = C[(ci+0*ldc+0)*2+0];
+            Ci = C[(ci+0*ldc+0)*2+1];
+            Cr += result0*alphar;
+            Ci += result1*alphar;
+            Cr -= result1*alphai;
+            Ci += result0*alphai;
+            C[(ci+0*ldc+0)*2+0] = Cr;
+            C[(ci+0*ldc+0)*2+1] = Ci;
+            Cr = C[(ci+1*ldc+0)*2+0];
+            Ci = C[(ci+1*ldc+0)*2+1];
+            Cr += result2*alphar;
+            Ci += result3*alphar;
+            Cr -= result3*alphai;
+            Ci += result2*alphai;
+            C[(ci+1*ldc+0)*2+0] = Cr;
+            C[(ci+1*ldc+0)*2+1] = Ci;
+            m_top+=1;
+        }
+
+        n_top += 2;
+    }
+
+
+
+    // -- tails for N=1
+
+    if( N & 1 ) {
+        gvl = __riscv_vsetvl_e32m1(8);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            bi += 1*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 8*2;
+
+            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
+            // leaving 28 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                bi += 1*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 8*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C0i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e32m1(4);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            bi += 1*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 4*2;
+
+            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
+            // leaving 28 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                bi += 1*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 4*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C0i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            gvl = __riscv_vsetvl_e32m1(2);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            bi += 1*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 2*2;
+
+            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
+            // leaving 28 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                bi += 1*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 2*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vlse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t C0i = __riscv_vlse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            
+            m_top += 2;
+        }
+
+
+        if( M & 1 ) {
+            float result0 = 0;
+            float result1 = 0;
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
+                result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
+                ai+=1*2;
+                bi+=1*2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            float Cr, Ci;
+            Cr = C[(ci+0*ldc+0)*2+0];
+            Ci = C[(ci+0*ldc+0)*2+1];
+            Cr += result0*alphar;
+            Ci += result1*alphar;
+            Cr -= result1*alphai;
+            Ci += result0*alphai;
+            C[(ci+0*ldc+0)*2+0] = Cr;
+            C[(ci+0*ldc+0)*2+1] = Ci;
+            m_top+=1;
+        }
+
+        n_top += 1;
+    }
+
+    return 0;
+}
+
diff --git a/kernel/riscv64/copy.c b/kernel/riscv64/copy.c
index 7b4f04f30..e79ca59af 100644
--- a/kernel/riscv64/copy.c
+++ b/kernel/riscv64/copy.c
@@ -41,7 +41,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 	BLASLONG i=0;
 	BLASLONG ix=0,iy=0;
 
-	if ( n < 0     )  return(0);
+	if ( n <= 0     )  return(0);
 
 	while(i < n)
 	{
diff --git a/kernel/riscv64/copy_rvv.c b/kernel/riscv64/copy_rvv.c
new file mode 100644
index 000000000..9d4b84095
--- /dev/null
+++ b/kernel/riscv64/copy_rvv.c
@@ -0,0 +1,94 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE) /* DOUBLE not defined: map the generic names to the e32 / f32m8 intrinsics */
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define FLOAT_V_T               vfloat32m8_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
+#define VSEV_FLOAT              __riscv_vse32_v_f32m8
+#define VSSEV_FLOAT             __riscv_vsse32_v_f32m8
+#else /* DOUBLE defined: map the same generic names to the e64 / f64m8 intrinsics */
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define FLOAT_V_T               vfloat64m8_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VSEV_FLOAT              __riscv_vse64_v_f64m8
+#define VSSEV_FLOAT             __riscv_vsse64_v_f64m8
+#endif /* !defined(DOUBLE) */
+
+/* Copy n elements from x to y; inc_x / inc_y are the element strides. Always returns 0. */
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+    if(n <= 0)  return(0);
+
+    FLOAT_V_T v0;
+    /* Every loop: VSETVL(n) picks the chunk length vl, vl elements are moved, then n/x/y advance.
+       vl and sizeof are cast to BLASLONG so a negative increment is never converted to unsigned. */
+    if(inc_x == 1 && inc_y == 1) {
+        for(size_t vl; n > 0; n -= (BLASLONG)vl, x += vl, y += vl) {
+            vl = VSETVL(n);
+            v0 = VLEV_FLOAT(x, vl);
+            VSEV_FLOAT(y, v0, vl);
+        }
+
+    } else if (inc_y == 1) {
+        /* strided load from x, unit-stride store to y; stride_x is the byte stride */
+        BLASLONG stride_x = inc_x * (BLASLONG)sizeof(FLOAT);
+
+        for(size_t vl; n > 0; n -= (BLASLONG)vl, x += (BLASLONG)vl*inc_x, y += vl) {
+            vl = VSETVL(n);
+            v0 = VLSEV_FLOAT(x, stride_x, vl);
+            VSEV_FLOAT(y, v0, vl);
+        }
+
+    } else if(inc_x == 1) {
+        /* unit-stride load from x, strided store to y; stride_y is the byte stride */
+        BLASLONG stride_y = inc_y * (BLASLONG)sizeof(FLOAT);
+
+        for(size_t vl; n > 0; n -= (BLASLONG)vl, x += vl, y += (BLASLONG)vl*inc_y) {
+            vl = VSETVL(n);
+            v0 = VLEV_FLOAT(x, vl);
+            VSSEV_FLOAT(y, stride_y, v0, vl);
+        }
+
+    } else {
+        /* both operands strided: strided load and strided store */
+        BLASLONG stride_x = inc_x * (BLASLONG)sizeof(FLOAT);
+        BLASLONG stride_y = inc_y * (BLASLONG)sizeof(FLOAT);
+
+        for(size_t vl; n > 0; n -= (BLASLONG)vl, x += (BLASLONG)vl*inc_x, y += (BLASLONG)vl*inc_y) {
+            vl = VSETVL(n);
+            v0 = VLSEV_FLOAT(x, stride_x, vl);
+            VSSEV_FLOAT(y, stride_y, v0, vl);
+        }
+    }
+
+    return(0);
+}
diff --git a/kernel/riscv64/copy_vector.c b/kernel/riscv64/copy_vector.c
index a46136d6c..ccbd6e482 100644
--- a/kernel/riscv64/copy_vector.c
+++ b/kernel/riscv64/copy_vector.c
@@ -25,22 +25,35 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 #include "common.h"
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define FLOAT_V_T vfloat32m8_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
-#define VSEV_FLOAT vse_v_f32m8
-#define VSSEV_FLOAT vsse_v_f32m8
+
+#ifdef RISCV64_ZVL256B
+#       define LMUL m2
+#       if defined(DOUBLE)
+#               define ELEN 64
+#       else
+#               define ELEN 32
+#       endif
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define FLOAT_V_T vfloat64m8_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
-#define VSEV_FLOAT vse_v_f64m8
-#define VSSEV_FLOAT vsse_v_f64m8
+#       define LMUL m8
+#       if defined(DOUBLE)
+#               define ELEN 64
+#       else
+#               define ELEN 32
+#       endif
 #endif
 
+#define _
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+
+#define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _)
+#define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
+#define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
+#define VSEV_FLOAT      JOIN(RISCV_RVV(vse),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VSSEV_FLOAT     JOIN(RISCV_RVV(vsse),      ELEN,   _v_f,   ELEN,   LMUL)
+
 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 {
 	BLASLONG i=0, j=0;
@@ -58,7 +71,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
                 stride_x = inc_x * sizeof(FLOAT);
                 if(gvl <= n/4){
                         BLASLONG inc_xv = inc_x * gvl;
-                        BLASLONG gvl3 = gvl * 3;
+                        unsigned int gvl3 = gvl * 3;
                         BLASLONG inc_xv3 = inc_xv * 3;
                         for(i=0,j=0; i<n/(4*gvl); i++){
                                 v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
@@ -86,7 +99,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
                 if(gvl <= n/4){
                         BLASLONG inc_yv = inc_y * gvl;
                         BLASLONG inc_yv3 = inc_yv * 3;
-                        BLASLONG gvl3 = gvl * 3;
+                        unsigned int gvl3 = gvl * 3;
                         for(i=0,j=0; i<n/(4*gvl); i++){
                                 v0 = VLEV_FLOAT(&x[j], gvl);
                                 VSSEV_FLOAT(&y[iy], stride_y, v0, gvl);
diff --git a/kernel/riscv64/ctrmm_kernel_8x4_zvl128b.c b/kernel/riscv64/ctrmm_kernel_8x4_zvl128b.c
new file mode 100644
index 000000000..3268cb810
--- /dev/null
+++ b/kernel/riscv64/ctrmm_kernel_8x4_zvl128b.c
@@ -0,0 +1,1102 @@
+/*
+
+AUTOGENERATED KERNEL
+Script: ./kernel/riscv64/generate_kernel.py
+Settings:
+ LMUL=2
+ M=8
+ M_tail_scalar_from=2
+ N=4
+ __riscv_='__riscv_'
+ complex=True
+ conjugate=False
+ cpu='zvl128b'
+ force_acc_double=False
+ index_type='BLASLONG'
+ op='trmm'
+ param_precision='float'
+ reg_width_bits=128
+ tail_policy=''
+ trace=False
+
+Derived:
+ ELEN_ACC=32
+ ELEN_PARAM=32
+ LMUL_ACC=2
+ VFMACC='__riscv_vfmacc_vf_f32m2'
+ VFMUL='__riscv_vfmul_vf_f32m2'
+ VLEV='__riscv_vle32_v_f32m2'
+ VLSEV='__riscv_vlse32_v_f32m2'
+ VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2'
+ VMUL_TO_ACC='__riscv_vfmul_vf_f32m2'
+ VSETVL='__riscv_vsetvl_e32m2'
+ VSEV='__riscv_vse32_v_f32m2'
+ VSSEV='__riscv_vsse32_v_f32m2'
+ acc_vector_t='vfloat32m2_t'
+ output='ctrmm_kernel_8x4_zvl128b.c'
+ param_scalar_t='float'
+ param_vector_t='vfloat32m2_t'
+
+*/
+
+#include "common.h"
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT) /* real = Ar*Br - Ai*Bi, imag = Ai*Br + Ar*Bi */
+#define S0 1 /* scalar tails: sign of Ar*Br in the real part */
+#define S1 -1 /* scalar tails: sign of Ai*Bi in the real part */
+#define S2 1 /* scalar tails: sign of Ai*Br in the imaginary part */
+#define S3 1 /* scalar tails: sign of Ar*Bi in the imaginary part */
+#define VFMACC_RR __riscv_vfmsac /* vector path: Br*Ar - acc, where acc was preloaded with Ai*Bi */
+#define VFMACC_RI __riscv_vfmacc /* vector path: Br*Ai + acc, where acc was preloaded with Ar*Bi */
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) /* real = Ar*Br + Ai*Bi, imag = Ai*Br - Ar*Bi */
+#define S0 1
+#define S1 1
+#define S2 1
+#define S3 -1
+#define VFMACC_RR __riscv_vfmacc /* Br*Ar + acc */
+#define VFMACC_RI __riscv_vfmsac /* Br*Ai - acc */
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT) /* real = Ar*Br + Ai*Bi, imag = -Ai*Br + Ar*Bi */
+#define S0 1
+#define S1 1
+#define S2 -1
+#define S3 1
+#define VFMACC_RR __riscv_vfmacc /* Br*Ar + acc */
+#define VFMACC_RI __riscv_vfnmsac /* -(Br*Ai) + acc */
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC) /* real = Ar*Br - Ai*Bi, imag = -Ai*Br - Ar*Bi */
+#define S0 1
+#define S1 -1
+#define S2 -1
+#define S3 -1
+#define VFMACC_RR __riscv_vfmsac /* Br*Ar - acc */
+#define VFMACC_RI __riscv_vfnmacc /* -(Br*Ai) - acc */
+#endif
+
+#if defined(LEFT) != defined(TRANSA) /* true when exactly one of LEFT and TRANSA is defined */
+#define BACKWARDS /* kernel then advances ai/bi by off and runs K - off steps instead of off + tile size */
+#endif
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset)
+
+{
+    BLASLONG gvl = 0;
+    BLASLONG m_top = 0;
+    BLASLONG n_top = 0;
+
+    // -- MAIN PASS
+
+    for (BLASLONG j = 0; j < N / 4; j += 1) {
+        m_top = 0;
+        BLASLONG gvl = __riscv_vsetvl_e32m2(8);
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 8 * 2;
+            bi += off * 4 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 8;
+#else
+            pass_K = off + 4;
+#endif
+#endif
+            float B0r = B[bi + 0 * 2 + 0];
+            float B0i = B[bi + 0 * 2 + 1];
+            float B1r = B[bi + 1 * 2 + 0];
+            float B1i = B[bi + 1 * 2 + 1];
+            float B2r = B[bi + 2 * 2 + 0];
+            float B2i = B[bi + 2 * 2 + 1];
+            float B3r = B[bi + 3 * 2 + 0];
+            float B3i = B[bi + 3 * 2 + 1];
+            bi += 4 * 2;
+
+            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ai += 8 * 2;
+
+            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 6 vector registers for temporaries
+            // performing 2 operations between reuses of temporaries
+            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+            vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
+            vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+            vfloat32m2_t ACC0r = tmp0r;
+            vfloat32m2_t ACC0i = tmp0i;
+            vfloat32m2_t ACC1r = tmp1r;
+            vfloat32m2_t ACC1i = tmp1i;
+            tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl);
+            tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl);
+            tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl);
+            tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
+            vfloat32m2_t ACC2r = tmp0r;
+            vfloat32m2_t ACC2i = tmp0i;
+            vfloat32m2_t ACC3r = tmp1r;
+            vfloat32m2_t ACC3i = tmp1i;
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                B1r = B[bi + 1 * 2 + 0];
+                B1i = B[bi + 1 * 2 + 1];
+                B2r = B[bi + 2 * 2 + 0];
+                B2i = B[bi + 2 * 2 + 1];
+                B3r = B[bi + 3 * 2 + 0];
+                B3i = B[bi + 3 * 2 + 1];
+                bi += 4 * 2;
+
+                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 8 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
+                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
+                ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl);
+                ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl);
+                ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl);
+                ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl);
+            vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl);
+            vfloat32m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl);
+            vfloat32m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl);
+            vfloat32m2_t C2r = __riscv_vfmul(ACC2r, alphar, gvl);
+            vfloat32m2_t C2i = __riscv_vfmul(ACC2i, alphar, gvl);
+            vfloat32m2_t C3r = __riscv_vfmul(ACC3r, alphar, gvl);
+            vfloat32m2_t C3i = __riscv_vfmul(ACC3i, alphar, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
+            C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
+            C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);
+            C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl);
+            C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl);
+            C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl);
+            C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl);
+
+            m_top += 8;
+        }
+
+        // -- tails for main pass
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e32m2(4);
+
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 4 * 2;
+            bi += off * 4 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 4;
+#else
+            pass_K = off + 4;
+#endif
+#endif
+            float B0r = B[bi + 0 * 2 + 0];
+            float B0i = B[bi + 0 * 2 + 1];
+            float B1r = B[bi + 1 * 2 + 0];
+            float B1i = B[bi + 1 * 2 + 1];
+            float B2r = B[bi + 2 * 2 + 0];
+            float B2i = B[bi + 2 * 2 + 1];
+            float B3r = B[bi + 3 * 2 + 0];
+            float B3i = B[bi + 3 * 2 + 1];
+            bi += 4 * 2;
+
+            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ai += 4 * 2;
+
+            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 6 vector registers for temporaries
+            // performing 2 operations between reuses of temporaries
+            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+            vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
+            vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+            vfloat32m2_t ACC0r = tmp0r;
+            vfloat32m2_t ACC0i = tmp0i;
+            vfloat32m2_t ACC1r = tmp1r;
+            vfloat32m2_t ACC1i = tmp1i;
+            tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl);
+            tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl);
+            tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl);
+            tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
+            vfloat32m2_t ACC2r = tmp0r;
+            vfloat32m2_t ACC2i = tmp0i;
+            vfloat32m2_t ACC3r = tmp1r;
+            vfloat32m2_t ACC3i = tmp1i;
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                B1r = B[bi + 1 * 2 + 0];
+                B1i = B[bi + 1 * 2 + 1];
+                B2r = B[bi + 2 * 2 + 0];
+                B2i = B[bi + 2 * 2 + 1];
+                B3r = B[bi + 3 * 2 + 0];
+                B3i = B[bi + 3 * 2 + 1];
+                bi += 4 * 2;
+
+                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 4 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
+                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B2i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B2i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B3i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B3i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
+                ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl);
+                ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl);
+                ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl);
+                ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl);
+            vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl);
+            vfloat32m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl);
+            vfloat32m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl);
+            vfloat32m2_t C2r = __riscv_vfmul(ACC2r, alphar, gvl);
+            vfloat32m2_t C2i = __riscv_vfmul(ACC2i, alphar, gvl);
+            vfloat32m2_t C3r = __riscv_vfmul(ACC3r, alphar, gvl);
+            vfloat32m2_t C3i = __riscv_vfmul(ACC3i, alphar, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
+            C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
+            C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);
+            C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl);
+            C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl);
+            C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl);
+            C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl);
+
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            float result8 = 0;
+            float result9 = 0;
+            float result10 = 0;
+            float result11 = 0;
+            float result12 = 0;
+            float result13 = 0;
+            float result14 = 0;
+            float result15 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 2 * 2;
+            bi += off * 4 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 2;
+#else
+            pass_K = off + 4;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1];
+                result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1];
+                result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
+                result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
+                result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1];
+                result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1];
+                result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1];
+                result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1];
+                result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1];
+                result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1];
+                result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1];
+                result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1];
+                result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1];
+                result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1];
+                ai += 2 * 2;
+                bi += 4 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            float Cr, Ci;
+            Cr = result0 * alphar;
+            Ci = result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result2 * alphar;
+            Ci = result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 0 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 1) * 2 + 1] = Ci;
+            Cr = result4 * alphar;
+            Ci = result5 * alphar;
+            Cr -= result5 * alphai;
+            Ci += result4 * alphai;
+            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result6 * alphar;
+            Ci = result7 * alphar;
+            Cr -= result7 * alphai;
+            Ci += result6 * alphai;
+            C[(ci + 1 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 1) * 2 + 1] = Ci;
+            Cr = result8 * alphar;
+            Ci = result9 * alphar;
+            Cr -= result9 * alphai;
+            Ci += result8 * alphai;
+            C[(ci + 2 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 2 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result10 * alphar;
+            Ci = result11 * alphar;
+            Cr -= result11 * alphai;
+            Ci += result10 * alphai;
+            C[(ci + 2 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 2 * ldc + 1) * 2 + 1] = Ci;
+            Cr = result12 * alphar;
+            Ci = result13 * alphar;
+            Cr -= result13 * alphai;
+            Ci += result12 * alphai;
+            C[(ci + 3 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 3 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result14 * alphar;
+            Ci = result15 * alphar;
+            Cr -= result15 * alphai;
+            Ci += result14 * alphai;
+            C[(ci + 3 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 3 * ldc + 1) * 2 + 1] = Ci;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 1 * 2;
+            bi += off * 4 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 1;
+#else
+            pass_K = off + 4;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
+                result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
+                result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1];
+                result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1];
+                result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1];
+                result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1];
+                ai += 1 * 2;
+                bi += 4 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            float Cr, Ci;
+            Cr = result0 * alphar;
+            Ci = result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result2 * alphar;
+            Ci = result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result4 * alphar;
+            Ci = result5 * alphar;
+            Cr -= result5 * alphai;
+            Ci += result4 * alphai;
+            C[(ci + 2 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 2 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result6 * alphar;
+            Ci = result7 * alphar;
+            Cr -= result7 * alphai;
+            Ci += result6 * alphai;
+            C[(ci + 3 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 3 * ldc + 0) * 2 + 1] = Ci;
+            m_top += 1;
+        }
+
+        n_top += 4;
+    }
+
+    // -- tails for N=2
+
+    if (N & 2) {
+        gvl = __riscv_vsetvl_e32m2(8);
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 8 * 2;
+            bi += off * 2 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 8;
+#else
+            pass_K = off + 2;
+#endif
+#endif
+            float B0r = B[bi + 0 * 2 + 0];
+            float B0i = B[bi + 0 * 2 + 1];
+            float B1r = B[bi + 1 * 2 + 0];
+            float B1i = B[bi + 1 * 2 + 1];
+            bi += 2 * 2;
+
+            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ai += 8 * 2;
+
+            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 10 vector registers for temporaries
+            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+            vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
+            vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+            vfloat32m2_t ACC0r = tmp0r;
+            vfloat32m2_t ACC0i = tmp0i;
+            vfloat32m2_t ACC1r = tmp1r;
+            vfloat32m2_t ACC1i = tmp1i;
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                B1r = B[bi + 1 * 2 + 0];
+                B1i = B[bi + 1 * 2 + 1];
+                bi += 2 * 2;
+
+                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 8 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl);
+            vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl);
+            vfloat32m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl);
+            vfloat32m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
+            C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
+            C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);
+
+            m_top += 8;
+        }
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e32m2(4);
+
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 4 * 2;
+            bi += off * 2 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 4;
+#else
+            pass_K = off + 2;
+#endif
+#endif
+            float B0r = B[bi + 0 * 2 + 0];
+            float B0i = B[bi + 0 * 2 + 1];
+            float B1r = B[bi + 1 * 2 + 0];
+            float B1i = B[bi + 1 * 2 + 1];
+            bi += 2 * 2;
+
+            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ai += 4 * 2;
+
+            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 10 vector registers for temporaries
+            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+            vfloat32m2_t tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
+            vfloat32m2_t tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+            vfloat32m2_t ACC0r = tmp0r;
+            vfloat32m2_t ACC0i = tmp0i;
+            vfloat32m2_t ACC1r = tmp1r;
+            vfloat32m2_t ACC1i = tmp1i;
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                B1r = B[bi + 1 * 2 + 0];
+                B1i = B[bi + 1 * 2 + 1];
+                bi += 2 * 2;
+
+                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 4 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m2(A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m2(A0r, B1i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl);
+            vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl);
+            vfloat32m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl);
+            vfloat32m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
+            C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
+            C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);
+
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 2 * 2;
+            bi += off * 2 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 2;
+#else
+            pass_K = off + 2;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1];
+                result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1];
+                result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
+                result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
+                result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1];
+                result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1];
+                ai += 2 * 2;
+                bi += 2 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            float Cr, Ci;
+            Cr = result0 * alphar;
+            Ci = result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result2 * alphar;
+            Ci = result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 0 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 1) * 2 + 1] = Ci;
+            Cr = result4 * alphar;
+            Ci = result5 * alphar;
+            Cr -= result5 * alphai;
+            Ci += result4 * alphai;
+            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result6 * alphar;
+            Ci = result7 * alphar;
+            Cr -= result7 * alphai;
+            Ci += result6 * alphai;
+            C[(ci + 1 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 1) * 2 + 1] = Ci;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 1 * 2;
+            bi += off * 2 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 1;
+#else
+            pass_K = off + 2;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
+                result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
+                ai += 1 * 2;
+                bi += 2 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            float Cr, Ci;
+            Cr = result0 * alphar;
+            Ci = result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result2 * alphar;
+            Ci = result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
+            m_top += 1;
+        }
+
+        n_top += 2;
+    }
+
+    // -- tails for N=1
+
+    if (N & 1) {
+        gvl = __riscv_vsetvl_e32m2(8);
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 8 * 2;
+            bi += off * 1 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 8;
+#else
+            pass_K = off + 1;
+#endif
+#endif
+            float B0r = B[bi + 0 * 2 + 0];
+            float B0i = B[bi + 0 * 2 + 1];
+            bi += 1 * 2;
+
+            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ai += 8 * 2;
+
+            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
+            // leaving 12 vector registers for temporaries
+            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            vfloat32m2_t ACC0r = tmp0r;
+            vfloat32m2_t ACC0i = tmp0i;
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                bi += 1 * 2;
+
+                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 8 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl);
+            vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+
+            m_top += 8;
+        }
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e32m2(4);
+
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 4 * 2;
+            bi += off * 1 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 4;
+#else
+            pass_K = off + 1;
+#endif
+#endif
+            float B0r = B[bi + 0 * 2 + 0];
+            float B0i = B[bi + 0 * 2 + 1];
+            bi += 1 * 2;
+
+            vfloat32m2_t A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+            vfloat32m2_t A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ai += 4 * 2;
+
+            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
+            // leaving 12 vector registers for temporaries
+            vfloat32m2_t tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+            vfloat32m2_t tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            vfloat32m2_t ACC0r = tmp0r;
+            vfloat32m2_t ACC0i = tmp0i;
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                bi += 1 * 2;
+
+                A0r = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse32_v_f32m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 4 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f32m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m2(A0r, B0i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl);
+            vfloat32m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
+            __riscv_vsse32_v_f32m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 2 * 2;
+            bi += off * 1 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 2;
+#else
+            pass_K = off + 1;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1];
+                result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1];
+                ai += 2 * 2;
+                bi += 1 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            float Cr, Ci;
+            Cr = result0 * alphar;
+            Ci = result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result2 * alphar;
+            Ci = result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 0 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 1) * 2 + 1] = Ci;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            float result0 = 0;
+            float result1 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 1 * 2;
+            bi += off * 1 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 1;
+#else
+            pass_K = off + 1;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                ai += 1 * 2;
+                bi += 1 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            float Cr, Ci;
+            Cr = result0 * alphar;
+            Ci = result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            m_top += 1;
+        }
+
+        n_top += 1;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/ctrmm_kernel_8x8_zvl256b.c b/kernel/riscv64/ctrmm_kernel_8x8_zvl256b.c
new file mode 100644
index 000000000..b4f111a69
--- /dev/null
+++ b/kernel/riscv64/ctrmm_kernel_8x8_zvl256b.c
@@ -0,0 +1,2007 @@
+/*
+
+AUTOGENERATED KERNEL
+Settings:
+ LMUL=1
+ M=8
+ M_tail_scalar_from=1
+ N=8
+ __riscv_='__riscv_'
+ complex=True
+ conjugate=False
+ cpu='zvl256b'
+ force_acc_double=False
+ index_type='BLASLONG'
+ op='trmm'
+ param_precision='float'
+ reg_width_bits=256
+ tail_policy=''
+ trace=False
+
+Derived:
+ ELEN_ACC=32
+ ELEN_PARAM=32
+ LMUL_ACC=1
+ VFMACC='__riscv_vfmacc_vf_f32m1'
+ VFMUL='__riscv_vfmul_vf_f32m1'
+ VLEV='__riscv_vle32_v_f32m1'
+ VLSEV='__riscv_vlse32_v_f32m1'
+ VMACC_TO_ACC='__riscv_vfmacc_vf_f32m1'
+ VMUL_TO_ACC='__riscv_vfmul_vf_f32m1'
+ VSETVL='__riscv_vsetvl_e32m1'
+ VSEV='__riscv_vse32_v_f32m1'
+ VSSEV='__riscv_vsse32_v_f32m1'
+ acc_vector_t='vfloat32m1_t'
+ output='ctrmm_kernel_8x8_zvl256b.c'
+ param_scalar_t='float'
+ param_vector_t='vfloat32m1_t'
+
+*/
+
+#include "common.h"
+
+
+
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)  // group 1: real = Ar*Br - Ai*Bi, imag = Ai*Br + Ar*Bi
+    #define S0  1  // S0..S3: scalar sign factors; presumably applied to the Ar*Br, Ai*Bi, Ai*Br, Ar*Bi terms in the scalar tail loops - TODO confirm
+    #define S1 -1
+    #define S2  1
+    #define S3  1
+    #define VFMACC_RR __riscv_vfmsac  // called as VFMACC_RR(Ai*Bi, Br, Ar, vl): (Br*Ar) - (Ai*Bi)
+    #define VFMACC_RI __riscv_vfmacc  // called as VFMACC_RI(Ar*Bi, Br, Ai, vl): (Br*Ai) + (Ar*Bi)
+#endif
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)  // group 2: real = Ar*Br + Ai*Bi, imag = Ai*Br - Ar*Bi
+    #define S0  1
+    #define S1  1
+    #define S2  1
+    #define S3 -1
+    #define VFMACC_RR __riscv_vfmacc  // (Br*Ar) + (Ai*Bi)
+    #define VFMACC_RI __riscv_vfmsac  // (Br*Ai) - (Ar*Bi)
+#endif
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)  // group 3: real = Ar*Br + Ai*Bi, imag = Ar*Bi - Ai*Br
+    #define S0  1
+    #define S1  1
+    #define S2 -1
+    #define S3  1
+    #define VFMACC_RR __riscv_vfmacc  // (Br*Ar) + (Ai*Bi)
+    #define VFMACC_RI __riscv_vfnmsac  // -(Br*Ai) + (Ar*Bi)
+#endif
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)  // group 4: real = Ar*Br - Ai*Bi, imag = -(Ai*Br) - Ar*Bi
+    #define S0  1
+    #define S1 -1
+    #define S2 -1
+    #define S3 -1
+    #define VFMACC_RR __riscv_vfmsac  // (Br*Ar) - (Ai*Bi)
+    #define VFMACC_RI __riscv_vfnmacc  // -(Br*Ai) - (Ar*Bi)
+#endif  // NOTE(review): the four groups are independent #if blocks, not an #elif chain; presumably the build defines exactly one variant flag - confirm
+
+
+#if defined(LEFT) != defined(TRANSA)  // true when exactly one of LEFT / TRANSA is defined
+    #define BACKWARDS  // when set, each tile advances ai/bi by `off` k-steps and runs K - off iterations; otherwise pass_K is off + tile size
+#endif
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc, BLASLONG offset)
+
+{
+    BLASLONG gvl = 0;
+    BLASLONG m_top = 0;
+    BLASLONG n_top = 0;
+
+
+    // -- MAIN PASS
+
+    for (BLASLONG j=0; j<N/8; j+=1) {
+        m_top = 0;
+        BLASLONG gvl = __riscv_vsetvl_e32m1(8);
+
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*8*2;
+                bi += off*8*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 8;
+                #else
+                    pass_K = off + 8;
+                #endif
+            #endif
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            float B2r = B[bi+2*2+0];
+            float B2i = B[bi+2*2+1];
+            float B3r = B[bi+3*2+0];
+            float B3i = B[bi+3*2+1];
+            float B4r = B[bi+4*2+0];
+            float B4i = B[bi+4*2+1];
+            float B5r = B[bi+5*2+0];
+            float B5i = B[bi+5*2+1];
+            float B6r = B[bi+6*2+0];
+            float B6i = B[bi+6*2+1];
+            float B7r = B[bi+7*2+0];
+            float B7i = B[bi+7*2+1];
+            bi += 8*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 8*2;
+
+            // 2 vector regs to hold A array contents, 16 regs to hold values accumulated over k
+            // leaving 14 vector registers for temporaries
+            // performing 4 operations between reuses of temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+            vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+            vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+            vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+            vfloat32m1_t ACC2r = tmp2r;
+            vfloat32m1_t ACC2i = tmp2i;
+            vfloat32m1_t ACC3r = tmp3r;
+            vfloat32m1_t ACC3i = tmp3i;
+            tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
+            tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
+            tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
+            tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
+            tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
+            tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
+            tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
+            tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
+            vfloat32m1_t ACC4r = tmp0r;
+            vfloat32m1_t ACC4i = tmp0i;
+            vfloat32m1_t ACC5r = tmp1r;
+            vfloat32m1_t ACC5i = tmp1i;
+            vfloat32m1_t ACC6r = tmp2r;
+            vfloat32m1_t ACC6i = tmp2i;
+            vfloat32m1_t ACC7r = tmp3r;
+            vfloat32m1_t ACC7i = tmp3i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                B4r = B[bi+4*2+0];
+                B4i = B[bi+4*2+1];
+                B5r = B[bi+5*2+0];
+                B5i = B[bi+5*2+1];
+                B6r = B[bi+6*2+0];
+                B6i = B[bi+6*2+1];
+                B7r = B[bi+7*2+0];
+                B7i = B[bi+7*2+1];
+                bi += 8*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 8*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
+                ACC4r = __riscv_vfadd( ACC4r, tmp0r, gvl);
+                ACC4i = __riscv_vfadd( ACC4i, tmp0i, gvl);
+                ACC5r = __riscv_vfadd( ACC5r, tmp1r, gvl);
+                ACC5i = __riscv_vfadd( ACC5i, tmp1i, gvl);
+                ACC6r = __riscv_vfadd( ACC6r, tmp2r, gvl);
+                ACC6i = __riscv_vfadd( ACC6i, tmp2i, gvl);
+                ACC7r = __riscv_vfadd( ACC7r, tmp3r, gvl);
+                ACC7i = __riscv_vfadd( ACC7i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
+            vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
+            vfloat32m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
+            vfloat32m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
+            vfloat32m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
+            vfloat32m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
+            vfloat32m1_t C4r = __riscv_vfmul( ACC4r, alphar, gvl );
+            vfloat32m1_t C4i = __riscv_vfmul( ACC4i, alphar, gvl );
+            vfloat32m1_t C5r = __riscv_vfmul( ACC5r, alphar, gvl );
+            vfloat32m1_t C5i = __riscv_vfmul( ACC5i, alphar, gvl );
+            vfloat32m1_t C6r = __riscv_vfmul( ACC6r, alphar, gvl );
+            vfloat32m1_t C6i = __riscv_vfmul( ACC6i, alphar, gvl );
+            vfloat32m1_t C7r = __riscv_vfmul( ACC7r, alphar, gvl );
+            vfloat32m1_t C7i = __riscv_vfmul( ACC7i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+            C4r = __riscv_vfnmsac( C4r, alphai, ACC4i, gvl );
+            C4i = __riscv_vfmacc ( C4i, alphai, ACC4r, gvl );
+            C5r = __riscv_vfnmsac( C5r, alphai, ACC5i, gvl );
+            C5i = __riscv_vfmacc ( C5i, alphai, ACC5r, gvl );
+            C6r = __riscv_vfnmsac( C6r, alphai, ACC6i, gvl );
+            C6i = __riscv_vfmacc ( C6i, alphai, ACC6r, gvl );
+            C7r = __riscv_vfnmsac( C7r, alphai, ACC7i, gvl );
+            C7i = __riscv_vfmacc ( C7i, alphai, ACC7r, gvl );
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C4r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C4i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C5r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C5i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C6r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C6i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C7r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C7i, gvl);
+            
+            m_top += 8;
+        }
+
+
+
+        // -- tails for main pass
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e32m1(4);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*4*2;
+                bi += off*8*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 4;
+                #else
+                    pass_K = off + 8;
+                #endif
+            #endif
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            float B2r = B[bi+2*2+0];
+            float B2i = B[bi+2*2+1];
+            float B3r = B[bi+3*2+0];
+            float B3i = B[bi+3*2+1];
+            float B4r = B[bi+4*2+0];
+            float B4i = B[bi+4*2+1];
+            float B5r = B[bi+5*2+0];
+            float B5i = B[bi+5*2+1];
+            float B6r = B[bi+6*2+0];
+            float B6i = B[bi+6*2+1];
+            float B7r = B[bi+7*2+0];
+            float B7i = B[bi+7*2+1];
+            bi += 8*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 4*2;
+
+            // 2 vector regs to hold A array contents, 16 regs to hold values accumulated over k
+            // leaving 14 vector registers for temporaries
+            // performing 4 operations between reuses of temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+            vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+            vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+            vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+            vfloat32m1_t ACC2r = tmp2r;
+            vfloat32m1_t ACC2i = tmp2i;
+            vfloat32m1_t ACC3r = tmp3r;
+            vfloat32m1_t ACC3i = tmp3i;
+            tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
+            tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
+            tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
+            tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
+            tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
+            tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
+            tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
+            tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
+            vfloat32m1_t ACC4r = tmp0r;
+            vfloat32m1_t ACC4i = tmp0i;
+            vfloat32m1_t ACC5r = tmp1r;
+            vfloat32m1_t ACC5i = tmp1i;
+            vfloat32m1_t ACC6r = tmp2r;
+            vfloat32m1_t ACC6i = tmp2i;
+            vfloat32m1_t ACC7r = tmp3r;
+            vfloat32m1_t ACC7i = tmp3i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                B4r = B[bi+4*2+0];
+                B4i = B[bi+4*2+1];
+                B5r = B[bi+5*2+0];
+                B5i = B[bi+5*2+1];
+                B6r = B[bi+6*2+0];
+                B6i = B[bi+6*2+1];
+                B7r = B[bi+7*2+0];
+                B7i = B[bi+7*2+1];
+                bi += 8*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 4*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
+                ACC4r = __riscv_vfadd( ACC4r, tmp0r, gvl);
+                ACC4i = __riscv_vfadd( ACC4i, tmp0i, gvl);
+                ACC5r = __riscv_vfadd( ACC5r, tmp1r, gvl);
+                ACC5i = __riscv_vfadd( ACC5i, tmp1i, gvl);
+                ACC6r = __riscv_vfadd( ACC6r, tmp2r, gvl);
+                ACC6i = __riscv_vfadd( ACC6i, tmp2i, gvl);
+                ACC7r = __riscv_vfadd( ACC7r, tmp3r, gvl);
+                ACC7i = __riscv_vfadd( ACC7i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
+            vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
+            vfloat32m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
+            vfloat32m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
+            vfloat32m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
+            vfloat32m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
+            vfloat32m1_t C4r = __riscv_vfmul( ACC4r, alphar, gvl );
+            vfloat32m1_t C4i = __riscv_vfmul( ACC4i, alphar, gvl );
+            vfloat32m1_t C5r = __riscv_vfmul( ACC5r, alphar, gvl );
+            vfloat32m1_t C5i = __riscv_vfmul( ACC5i, alphar, gvl );
+            vfloat32m1_t C6r = __riscv_vfmul( ACC6r, alphar, gvl );
+            vfloat32m1_t C6i = __riscv_vfmul( ACC6i, alphar, gvl );
+            vfloat32m1_t C7r = __riscv_vfmul( ACC7r, alphar, gvl );
+            vfloat32m1_t C7i = __riscv_vfmul( ACC7i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+            C4r = __riscv_vfnmsac( C4r, alphai, ACC4i, gvl );
+            C4i = __riscv_vfmacc ( C4i, alphai, ACC4r, gvl );
+            C5r = __riscv_vfnmsac( C5r, alphai, ACC5i, gvl );
+            C5i = __riscv_vfmacc ( C5i, alphai, ACC5r, gvl );
+            C6r = __riscv_vfnmsac( C6r, alphai, ACC6i, gvl );
+            C6i = __riscv_vfmacc ( C6i, alphai, ACC6r, gvl );
+            C7r = __riscv_vfnmsac( C7r, alphai, ACC7i, gvl );
+            C7i = __riscv_vfmacc ( C7i, alphai, ACC7r, gvl );
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C4r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C4i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C5r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C5i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C6r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C6i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C7r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C7i, gvl);
+            
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            gvl = __riscv_vsetvl_e32m1(2);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*2*2;
+                bi += off*8*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 2;
+                #else
+                    pass_K = off + 8;
+                #endif
+            #endif
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            float B2r = B[bi+2*2+0];
+            float B2i = B[bi+2*2+1];
+            float B3r = B[bi+3*2+0];
+            float B3i = B[bi+3*2+1];
+            float B4r = B[bi+4*2+0];
+            float B4i = B[bi+4*2+1];
+            float B5r = B[bi+5*2+0];
+            float B5i = B[bi+5*2+1];
+            float B6r = B[bi+6*2+0];
+            float B6i = B[bi+6*2+1];
+            float B7r = B[bi+7*2+0];
+            float B7i = B[bi+7*2+1];
+            bi += 8*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 2*2;
+
+            // 2 vector regs to hold A array contents, 16 regs to hold values accumulated over k
+            // leaving 14 vector registers for temporaries
+            // performing 4 operations between reuses of temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+            vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+            vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+            vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+            vfloat32m1_t ACC2r = tmp2r;
+            vfloat32m1_t ACC2i = tmp2i;
+            vfloat32m1_t ACC3r = tmp3r;
+            vfloat32m1_t ACC3i = tmp3i;
+            tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
+            tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
+            tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
+            tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
+            tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
+            tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
+            tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
+            tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
+            vfloat32m1_t ACC4r = tmp0r;
+            vfloat32m1_t ACC4i = tmp0i;
+            vfloat32m1_t ACC5r = tmp1r;
+            vfloat32m1_t ACC5i = tmp1i;
+            vfloat32m1_t ACC6r = tmp2r;
+            vfloat32m1_t ACC6i = tmp2i;
+            vfloat32m1_t ACC7r = tmp3r;
+            vfloat32m1_t ACC7i = tmp3i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                B4r = B[bi+4*2+0];
+                B4i = B[bi+4*2+1];
+                B5r = B[bi+5*2+0];
+                B5i = B[bi+5*2+1];
+                B6r = B[bi+6*2+0];
+                B6i = B[bi+6*2+1];
+                B7r = B[bi+7*2+0];
+                B7i = B[bi+7*2+1];
+                bi += 8*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 2*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B4i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B4i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B5i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B5i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B6i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B6i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B7i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B7i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B4r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B4r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B5r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B5r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B6r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B6r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B7r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B7r, A0i, gvl);
+                ACC4r = __riscv_vfadd( ACC4r, tmp0r, gvl);
+                ACC4i = __riscv_vfadd( ACC4i, tmp0i, gvl);
+                ACC5r = __riscv_vfadd( ACC5r, tmp1r, gvl);
+                ACC5i = __riscv_vfadd( ACC5i, tmp1i, gvl);
+                ACC6r = __riscv_vfadd( ACC6r, tmp2r, gvl);
+                ACC6i = __riscv_vfadd( ACC6i, tmp2i, gvl);
+                ACC7r = __riscv_vfadd( ACC7r, tmp3r, gvl);
+                ACC7i = __riscv_vfadd( ACC7i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
+            vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
+            vfloat32m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
+            vfloat32m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
+            vfloat32m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
+            vfloat32m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
+            vfloat32m1_t C4r = __riscv_vfmul( ACC4r, alphar, gvl );
+            vfloat32m1_t C4i = __riscv_vfmul( ACC4i, alphar, gvl );
+            vfloat32m1_t C5r = __riscv_vfmul( ACC5r, alphar, gvl );
+            vfloat32m1_t C5i = __riscv_vfmul( ACC5i, alphar, gvl );
+            vfloat32m1_t C6r = __riscv_vfmul( ACC6r, alphar, gvl );
+            vfloat32m1_t C6i = __riscv_vfmul( ACC6i, alphar, gvl );
+            vfloat32m1_t C7r = __riscv_vfmul( ACC7r, alphar, gvl );
+            vfloat32m1_t C7i = __riscv_vfmul( ACC7i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+            C4r = __riscv_vfnmsac( C4r, alphai, ACC4i, gvl );
+            C4i = __riscv_vfmacc ( C4i, alphai, ACC4r, gvl );
+            C5r = __riscv_vfnmsac( C5r, alphai, ACC5i, gvl );
+            C5i = __riscv_vfmacc ( C5i, alphai, ACC5r, gvl );
+            C6r = __riscv_vfnmsac( C6r, alphai, ACC6i, gvl );
+            C6i = __riscv_vfmacc ( C6i, alphai, ACC6r, gvl );
+            C7r = __riscv_vfnmsac( C7r, alphai, ACC7i, gvl );
+            C7i = __riscv_vfmacc ( C7i, alphai, ACC7r, gvl );
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C4r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C4i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C5r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C5i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C6r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C6i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C7r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C7i, gvl);
+            
+            m_top += 2;
+        }
+
+
+        if( M & 1 ) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            float result8 = 0;
+            float result9 = 0;
+            float result10 = 0;
+            float result11 = 0;
+            float result12 = 0;
+            float result13 = 0;
+            float result14 = 0;
+            float result15 = 0;
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*1*2;
+                bi += off*8*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 1;
+                #else
+                    pass_K = off + 8;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
+                result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
+                result2+=S0*A[ai+0+0]*B[bi+2+0] + S1*A[ai+0+1]*B[bi+2+1];
+                result3+=S2*A[ai+0+1]*B[bi+2+0] + S3*A[ai+0+0]*B[bi+2+1];
+                result4+=S0*A[ai+0+0]*B[bi+4+0] + S1*A[ai+0+1]*B[bi+4+1];
+                result5+=S2*A[ai+0+1]*B[bi+4+0] + S3*A[ai+0+0]*B[bi+4+1];
+                result6+=S0*A[ai+0+0]*B[bi+6+0] + S1*A[ai+0+1]*B[bi+6+1];
+                result7+=S2*A[ai+0+1]*B[bi+6+0] + S3*A[ai+0+0]*B[bi+6+1];
+                result8+=S0*A[ai+0+0]*B[bi+8+0] + S1*A[ai+0+1]*B[bi+8+1];
+                result9+=S2*A[ai+0+1]*B[bi+8+0] + S3*A[ai+0+0]*B[bi+8+1];
+                result10+=S0*A[ai+0+0]*B[bi+10+0] + S1*A[ai+0+1]*B[bi+10+1];
+                result11+=S2*A[ai+0+1]*B[bi+10+0] + S3*A[ai+0+0]*B[bi+10+1];
+                result12+=S0*A[ai+0+0]*B[bi+12+0] + S1*A[ai+0+1]*B[bi+12+1];
+                result13+=S2*A[ai+0+1]*B[bi+12+0] + S3*A[ai+0+0]*B[bi+12+1];
+                result14+=S0*A[ai+0+0]*B[bi+14+0] + S1*A[ai+0+1]*B[bi+14+1];
+                result15+=S2*A[ai+0+1]*B[bi+14+0] + S3*A[ai+0+0]*B[bi+14+1];
+                ai+=1*2;
+                bi+=8*2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            float Cr, Ci;
+            Cr = result0*alphar;
+            Ci = result1*alphar;
+            Cr -= result1*alphai;
+            Ci += result0*alphai;
+            C[(ci+0*ldc+0)*2+0] = Cr;
+            C[(ci+0*ldc+0)*2+1] = Ci;
+            Cr = result2*alphar;
+            Ci = result3*alphar;
+            Cr -= result3*alphai;
+            Ci += result2*alphai;
+            C[(ci+1*ldc+0)*2+0] = Cr;
+            C[(ci+1*ldc+0)*2+1] = Ci;
+            Cr = result4*alphar;
+            Ci = result5*alphar;
+            Cr -= result5*alphai;
+            Ci += result4*alphai;
+            C[(ci+2*ldc+0)*2+0] = Cr;
+            C[(ci+2*ldc+0)*2+1] = Ci;
+            Cr = result6*alphar;
+            Ci = result7*alphar;
+            Cr -= result7*alphai;
+            Ci += result6*alphai;
+            C[(ci+3*ldc+0)*2+0] = Cr;
+            C[(ci+3*ldc+0)*2+1] = Ci;
+            Cr = result8*alphar;
+            Ci = result9*alphar;
+            Cr -= result9*alphai;
+            Ci += result8*alphai;
+            C[(ci+4*ldc+0)*2+0] = Cr;
+            C[(ci+4*ldc+0)*2+1] = Ci;
+            Cr = result10*alphar;
+            Ci = result11*alphar;
+            Cr -= result11*alphai;
+            Ci += result10*alphai;
+            C[(ci+5*ldc+0)*2+0] = Cr;
+            C[(ci+5*ldc+0)*2+1] = Ci;
+            Cr = result12*alphar;
+            Ci = result13*alphar;
+            Cr -= result13*alphai;
+            Ci += result12*alphai;
+            C[(ci+6*ldc+0)*2+0] = Cr;
+            C[(ci+6*ldc+0)*2+1] = Ci;
+            Cr = result14*alphar;
+            Ci = result15*alphar;
+            Cr -= result15*alphai;
+            Ci += result14*alphai;
+            C[(ci+7*ldc+0)*2+0] = Cr;
+            C[(ci+7*ldc+0)*2+1] = Ci;
+            m_top+=1;
+        }
+
+        n_top += 8;
+    }
+
+
+
+    // -- tails for N=4
+
+    if( N & 4 ) {
+        gvl = __riscv_vsetvl_e32m1(8);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*8*2;
+                bi += off*4*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 8;
+                #else
+                    pass_K = off + 4;
+                #endif
+            #endif
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            float B2r = B[bi+2*2+0];
+            float B2i = B[bi+2*2+1];
+            float B3r = B[bi+3*2+0];
+            float B3i = B[bi+3*2+1];
+            bi += 4*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 8*2;
+
+            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 22 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+            vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+            vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+            vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+            vfloat32m1_t ACC2r = tmp2r;
+            vfloat32m1_t ACC2i = tmp2i;
+            vfloat32m1_t ACC3r = tmp3r;
+            vfloat32m1_t ACC3i = tmp3i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                bi += 4*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 8*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
+            vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
+            vfloat32m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
+            vfloat32m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
+            vfloat32m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
+            vfloat32m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e32m1(4);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*4*2;
+                bi += off*4*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 4;
+                #else
+                    pass_K = off + 4;
+                #endif
+            #endif
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            float B2r = B[bi+2*2+0];
+            float B2i = B[bi+2*2+1];
+            float B3r = B[bi+3*2+0];
+            float B3i = B[bi+3*2+1];
+            bi += 4*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 4*2;
+
+            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 22 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+            vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+            vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+            vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+            vfloat32m1_t ACC2r = tmp2r;
+            vfloat32m1_t ACC2i = tmp2i;
+            vfloat32m1_t ACC3r = tmp3r;
+            vfloat32m1_t ACC3i = tmp3i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                bi += 4*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 4*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
+            vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
+            vfloat32m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
+            vfloat32m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
+            vfloat32m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
+            vfloat32m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            gvl = __riscv_vsetvl_e32m1(2);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*2*2;
+                bi += off*4*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 2;
+                #else
+                    pass_K = off + 4;
+                #endif
+            #endif
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            float B2r = B[bi+2*2+0];
+            float B2i = B[bi+2*2+1];
+            float B3r = B[bi+3*2+0];
+            float B3i = B[bi+3*2+1];
+            bi += 4*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 2*2;
+
+            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 22 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            vfloat32m1_t tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+            vfloat32m1_t tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+            vfloat32m1_t tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+            vfloat32m1_t tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+            vfloat32m1_t ACC2r = tmp2r;
+            vfloat32m1_t ACC2i = tmp2i;
+            vfloat32m1_t ACC3r = tmp3r;
+            vfloat32m1_t ACC3i = tmp3i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                bi += 4*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 2*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp2r = __riscv_vfmul_vf_f32m1( A0i, B2i, gvl);
+                tmp2i = __riscv_vfmul_vf_f32m1( A0r, B2i, gvl);
+                tmp3r = __riscv_vfmul_vf_f32m1( A0i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f32m1( A0r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
+            vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
+            vfloat32m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
+            vfloat32m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
+            vfloat32m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
+            vfloat32m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            
+            m_top += 2;
+        }
+
+
+        if( M & 1 ) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*1*2;
+                bi += off*4*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 1;
+                #else
+                    pass_K = off + 4;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
+                result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
+                result2+=S0*A[ai+0+0]*B[bi+2+0] + S1*A[ai+0+1]*B[bi+2+1];
+                result3+=S2*A[ai+0+1]*B[bi+2+0] + S3*A[ai+0+0]*B[bi+2+1];
+                result4+=S0*A[ai+0+0]*B[bi+4+0] + S1*A[ai+0+1]*B[bi+4+1];
+                result5+=S2*A[ai+0+1]*B[bi+4+0] + S3*A[ai+0+0]*B[bi+4+1];
+                result6+=S0*A[ai+0+0]*B[bi+6+0] + S1*A[ai+0+1]*B[bi+6+1];
+                result7+=S2*A[ai+0+1]*B[bi+6+0] + S3*A[ai+0+0]*B[bi+6+1];
+                ai+=1*2;
+                bi+=4*2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            float Cr, Ci;
+            Cr = result0*alphar;
+            Ci = result1*alphar;
+            Cr -= result1*alphai;
+            Ci += result0*alphai;
+            C[(ci+0*ldc+0)*2+0] = Cr;
+            C[(ci+0*ldc+0)*2+1] = Ci;
+            Cr = result2*alphar;
+            Ci = result3*alphar;
+            Cr -= result3*alphai;
+            Ci += result2*alphai;
+            C[(ci+1*ldc+0)*2+0] = Cr;
+            C[(ci+1*ldc+0)*2+1] = Ci;
+            Cr = result4*alphar;
+            Ci = result5*alphar;
+            Cr -= result5*alphai;
+            Ci += result4*alphai;
+            C[(ci+2*ldc+0)*2+0] = Cr;
+            C[(ci+2*ldc+0)*2+1] = Ci;
+            Cr = result6*alphar;
+            Ci = result7*alphar;
+            Cr -= result7*alphai;
+            Ci += result6*alphai;
+            C[(ci+3*ldc+0)*2+0] = Cr;
+            C[(ci+3*ldc+0)*2+1] = Ci;
+            m_top+=1;
+        }
+
+        n_top += 4;
+    }
+
+
+
+    // -- tails for N=2
+
+    if( N & 2 ) {
+        gvl = __riscv_vsetvl_e32m1(8);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*8*2;
+                bi += off*2*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 8;
+                #else
+                    pass_K = off + 2;
+                #endif
+            #endif
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            bi += 2*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 8*2;
+
+            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 26 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                bi += 2*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 8*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
+            vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e32m1(4);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*4*2;
+                bi += off*2*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 4;
+                #else
+                    pass_K = off + 2;
+                #endif
+            #endif
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            bi += 2*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 4*2;
+
+            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 26 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                bi += 2*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 4*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
+            vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            gvl = __riscv_vsetvl_e32m1(2);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*2*2;
+                bi += off*2*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 2;
+                #else
+                    pass_K = off + 2;
+                #endif
+            #endif
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            float B1r = B[bi+1*2+0];
+            float B1i = B[bi+1*2+1];
+            bi += 2*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 2*2;
+
+            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 26 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            vfloat32m1_t tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+            vfloat32m1_t tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+            vfloat32m1_t ACC1r = tmp1r;
+            vfloat32m1_t ACC1i = tmp1i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                bi += 2*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 2*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f32m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f32m1( A0r, B1i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            vfloat32m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
+            vfloat32m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            
+            m_top += 2;
+        }
+
+
+        if( M & 1 ) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*1*2;
+                bi += off*2*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 1;
+                #else
+                    pass_K = off + 2;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
+                result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
+                result2+=S0*A[ai+0+0]*B[bi+2+0] + S1*A[ai+0+1]*B[bi+2+1];
+                result3+=S2*A[ai+0+1]*B[bi+2+0] + S3*A[ai+0+0]*B[bi+2+1];
+                ai+=1*2;
+                bi+=2*2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            float Cr, Ci;
+            Cr = result0*alphar;
+            Ci = result1*alphar;
+            Cr -= result1*alphai;
+            Ci += result0*alphai;
+            C[(ci+0*ldc+0)*2+0] = Cr;
+            C[(ci+0*ldc+0)*2+1] = Ci;
+            Cr = result2*alphar;
+            Ci = result3*alphar;
+            Cr -= result3*alphai;
+            Ci += result2*alphai;
+            C[(ci+1*ldc+0)*2+0] = Cr;
+            C[(ci+1*ldc+0)*2+1] = Ci;
+            m_top+=1;
+        }
+
+        n_top += 2;
+    }
+
+
+
+    // -- tails for N=1
+
+    if( N & 1 ) {
+        gvl = __riscv_vsetvl_e32m1(8);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*8*2;
+                bi += off*1*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 8;
+                #else
+                    pass_K = off + 1;
+                #endif
+            #endif
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            bi += 1*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 8*2;
+
+            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
+            // leaving 28 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                bi += 1*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 8*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e32m1(4);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*4*2;
+                bi += off*1*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 4;
+                #else
+                    pass_K = off + 1;
+                #endif
+            #endif
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            bi += 1*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 4*2;
+
+            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
+            // leaving 28 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                bi += 1*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 4*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            gvl = __riscv_vsetvl_e32m1(2);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*2*2;
+                bi += off*1*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 2;
+                #else
+                    pass_K = off + 1;
+                #endif
+            #endif
+            float B0r = B[bi+0*2+0];
+            float B0i = B[bi+0*2+1];
+            bi += 1*2;
+
+            vfloat32m1_t A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat32m1_t A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 2*2;
+
+            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
+            // leaving 28 vector registers for temporaries
+            vfloat32m1_t tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+            vfloat32m1_t tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            vfloat32m1_t ACC0r = tmp0r;
+            vfloat32m1_t ACC0i = tmp0i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                bi += 1*2;
+
+                A0r = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse32_v_f32m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 2*2;
+
+                tmp0r = __riscv_vfmul_vf_f32m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f32m1( A0r, B0i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat32m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            __riscv_vsse32_v_f32m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse32_v_f32m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            
+            m_top += 2;
+        }
+
+
+        if( M & 1 ) {
+            float result0 = 0;
+            float result1 = 0;
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*1*2;
+                bi += off*1*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 1;
+                #else
+                    pass_K = off + 1;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
+                result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
+                ai+=1*2;
+                bi+=1*2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            float Cr, Ci;
+            Cr = result0*alphar;
+            Ci = result1*alphar;
+            Cr -= result1*alphai;
+            Ci += result0*alphai;
+            C[(ci+0*ldc+0)*2+0] = Cr;
+            C[(ci+0*ldc+0)*2+1] = Ci;
+            m_top+=1;
+        }
+
+        n_top += 1;
+    }
+
+    return 0;
+}
+
diff --git a/kernel/riscv64/dgemm_kernel_8x4_c910v.c b/kernel/riscv64/dgemm_kernel_8x4_c910v.c
index 79a7a98d9..b9bccbd49 100644
--- a/kernel/riscv64/dgemm_kernel_8x4_c910v.c
+++ b/kernel/riscv64/dgemm_kernel_8x4_c910v.c
@@ -196,7 +196,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
 		   
 		   asm volatile(
 				"vsetvli    zero, zero, e64,m1 \n\t"
-				"fmv.w.x    ft11, zero         \n\t"
+				"fmv.d.x    ft11, zero         \n\t"
 				"mv         t0,   %[BK]        \n\t"
 				
 				"vfmv.v.f   v16,  ft11         \n\t"
diff --git a/kernel/riscv64/dgemm_kernel_8x4_zvl128b.c b/kernel/riscv64/dgemm_kernel_8x4_zvl128b.c
new file mode 100644
index 000000000..a613f0bce
--- /dev/null
+++ b/kernel/riscv64/dgemm_kernel_8x4_zvl128b.c
@@ -0,0 +1,492 @@
+/*
+
+AUTOGENERATED KERNEL
+Script: ./kernel/riscv64/generate_kernel.py
+Settings:
+ LMUL=4
+ M=8
+ M_tail_scalar_from=2
+ N=4
+ __riscv_='__riscv_'
+ complex=False
+ conjugate=False
+ cpu='zvl128b'
+ force_acc_double=False
+ index_type='BLASLONG'
+ op='gemm'
+ param_precision='double'
+ reg_width_bits=128
+ tail_policy=''
+ trace=False
+
+Derived:
+ ELEN_ACC=64
+ ELEN_PARAM=64
+ LMUL_ACC=4
+ VFMACC='__riscv_vfmacc_vf_f64m4'
+ VFMUL='__riscv_vfmul_vf_f64m4'
+ VLEV='__riscv_vle64_v_f64m4'
+ VLSEV='__riscv_vlse64_v_f64m4'
+ VMACC_TO_ACC='__riscv_vfmacc_vf_f64m4'
+ VMUL_TO_ACC='__riscv_vfmul_vf_f64m4'
+ VSETVL='__riscv_vsetvl_e64m4'
+ VSEV='__riscv_vse64_v_f64m4'
+ VSSEV='__riscv_vsse64_v_f64m4'
+ acc_vector_t='vfloat64m4_t'
+ output='dgemm_kernel_8x4_zvl128b.c'
+ param_scalar_t='double'
+ param_vector_t='vfloat64m4_t'
+
+*/
+
+#include "common.h"
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc)
+// GEMM micro-kernel: C += alpha * A * B in 8x4 tiles, with 4/2/1-row and 2/1-column tails. Always returns 0.
+{
+    BLASLONG gvl = 0; // vector length returned by vsetvl (used by the N tails; shadowed in the main pass)
+    BLASLONG m_top = 0; // first row of C covered by the current tile
+    BLASLONG n_top = 0; // first column of C covered by the current tile
+    // A is read from m_top * K in groups of <tile rows> per k step; B from n_top * K in groups of <tile columns>.
+    // -- MAIN PASS (full groups of 4 columns)
+
+    for (BLASLONG j = 0; j < N / 4; j += 1) {
+        m_top = 0;
+        BLASLONG gvl = __riscv_vsetvl_e64m4(8); // shadows the function-level gvl inside this loop
+        // NOTE(review): ai advances by a fixed 8 below, so vsetvl is presumably expected to return 8 - confirm.
+        for (BLASLONG i = 0; i < M / 8; i += 1) { // full 8-row tiles
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            double B0 = B[bi + 0];
+            double B1 = B[bi + 1];
+            double B2 = B[bi + 2];
+            double B3 = B[bi + 3];
+            bi += 4;
+            // NOTE(review): the k = 0 step is done unconditionally (B above, A below), so K >= 1 is assumed - confirm.
+            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+            ai += 8;
+            // k = 0 is peeled: a plain multiply seeds one accumulator per column.
+            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
+            vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);
+            vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl);
+            vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl);
+            // Remaining k steps: fused multiply-accumulate into the same accumulators.
+            for (BLASLONG k = 1; k < K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                B2 = B[bi + 2];
+                B3 = B[bi + 3];
+                bi += 4;
+
+                A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            // Read the four columns of the C tile; "ldc - gvl * 0" is simply ldc (next column).
+            vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat64m4_t c2 = __riscv_vle64_v_f64m4(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat64m4_t c3 = __riscv_vle64_v_f64m4(&C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl);
+            c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl);
+            c2 = __riscv_vfmacc_vf_f64m4(c2, alpha, result2, gvl);
+            c3 = __riscv_vfmacc_vf_f64m4(c3, alpha, result3, gvl);
+            // Rewind to the tile origin and write the updated columns back.
+            ci = n_top * ldc + m_top;
+
+            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse64_v_f64m4(&C[ci], c1, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse64_v_f64m4(&C[ci], c2, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse64_v_f64m4(&C[ci], c3, gvl);
+            m_top += 8;
+        }
+
+        // -- tails for main pass
+
+        if (M & 4) { // 4 leftover rows: same scheme as above with a requested vector length of 4
+            gvl = __riscv_vsetvl_e64m4(4);
+
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            double B0 = B[bi + 0];
+            double B1 = B[bi + 1];
+            double B2 = B[bi + 2];
+            double B3 = B[bi + 3];
+            bi += 4;
+
+            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+            ai += 4;
+
+            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
+            vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);
+            vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl);
+            vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl);
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                B2 = B[bi + 2];
+                B3 = B[bi + 3];
+                bi += 4;
+
+                A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat64m4_t c2 = __riscv_vle64_v_f64m4(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat64m4_t c3 = __riscv_vle64_v_f64m4(&C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl);
+            c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl);
+            c2 = __riscv_vfmacc_vf_f64m4(c2, alpha, result2, gvl);
+            c3 = __riscv_vfmacc_vf_f64m4(c3, alpha, result3, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse64_v_f64m4(&C[ci], c1, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse64_v_f64m4(&C[ci], c2, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse64_v_f64m4(&C[ci], c3, gvl);
+            m_top += 4;
+        }
+
+        if (M & 2) { // 2 leftover rows, scalar code: result<2*n + m> accumulates row m, column n
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            double result4 = 0;
+            double result5 = 0;
+            double result6 = 0;
+            double result7 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 1] * B[bi + 0];
+                result2 += A[ai + 0] * B[bi + 1];
+                result3 += A[ai + 1] * B[bi + 1];
+                result4 += A[ai + 0] * B[bi + 2];
+                result5 += A[ai + 1] * B[bi + 2];
+                result6 += A[ai + 0] * B[bi + 3];
+                result7 += A[ai + 1] * B[bi + 3];
+                ai += 2;
+                bi += 4;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] += alpha * result0;
+            C[ci + 0 * ldc + 1] += alpha * result1;
+            C[ci + 1 * ldc + 0] += alpha * result2;
+            C[ci + 1 * ldc + 1] += alpha * result3;
+            C[ci + 2 * ldc + 0] += alpha * result4;
+            C[ci + 2 * ldc + 1] += alpha * result5;
+            C[ci + 3 * ldc + 0] += alpha * result6;
+            C[ci + 3 * ldc + 1] += alpha * result7;
+            m_top += 2;
+        }
+
+        if (M & 1) { // last leftover row, scalar code: result<n> accumulates column n
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 0] * B[bi + 1];
+                result2 += A[ai + 0] * B[bi + 2];
+                result3 += A[ai + 0] * B[bi + 3];
+                ai += 1;
+                bi += 4;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] += alpha * result0;
+            C[ci + 1 * ldc + 0] += alpha * result1;
+            C[ci + 2 * ldc + 0] += alpha * result2;
+            C[ci + 3 * ldc + 0] += alpha * result3;
+            m_top += 1;
+        }
+
+        n_top += 4;
+    }
+
+    // -- tails for N=2
+
+    if (N & 2) {
+        gvl = __riscv_vsetvl_e64m4(8); // assigns the function-level gvl (the loop-local one is out of scope)
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            double B0 = B[bi + 0];
+            double B1 = B[bi + 1];
+            bi += 2;
+
+            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+            ai += 8;
+
+            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
+            vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                bi += 2;
+
+                A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl);
+            c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse64_v_f64m4(&C[ci], c1, gvl);
+            m_top += 8;
+        }
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e64m4(4);
+
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            double B0 = B[bi + 0];
+            double B1 = B[bi + 1];
+            bi += 2;
+
+            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+            ai += 4;
+
+            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
+            vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                bi += 2;
+
+                A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat64m4_t c1 = __riscv_vle64_v_f64m4(&C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl);
+            c1 = __riscv_vfmacc_vf_f64m4(c1, alpha, result1, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse64_v_f64m4(&C[ci], c1, gvl);
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 1] * B[bi + 0];
+                result2 += A[ai + 0] * B[bi + 1];
+                result3 += A[ai + 1] * B[bi + 1];
+                ai += 2;
+                bi += 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] += alpha * result0;
+            C[ci + 0 * ldc + 1] += alpha * result1;
+            C[ci + 1 * ldc + 0] += alpha * result2;
+            C[ci + 1 * ldc + 1] += alpha * result3;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            double result0 = 0;
+            double result1 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 0] * B[bi + 1];
+                ai += 1;
+                bi += 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] += alpha * result0;
+            C[ci + 1 * ldc + 0] += alpha * result1;
+            m_top += 1;
+        }
+
+        n_top += 2;
+    }
+
+    // -- tails for N=1
+
+    if (N & 1) {
+        gvl = __riscv_vsetvl_e64m4(8); // back to the 8-row request after any 4-row tail above
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            double B0 = B[bi + 0];
+            bi += 1;
+
+            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+            ai += 8;
+
+            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0 = B[bi + 0];
+                bi += 1;
+
+                A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
+            m_top += 8;
+        }
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e64m4(4);
+
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            double B0 = B[bi + 0];
+            bi += 1;
+
+            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+            ai += 4;
+
+            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0 = B[bi + 0];
+                bi += 1;
+
+                A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat64m4_t c0 = __riscv_vle64_v_f64m4(&C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f64m4(c0, alpha, result0, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            double result0 = 0;
+            double result1 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 1] * B[bi + 0];
+                ai += 2;
+                bi += 1;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] += alpha * result0;
+            C[ci + 0 * ldc + 1] += alpha * result1;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            double result0 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                ai += 1;
+                bi += 1;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] += alpha * result0;
+            m_top += 1;
+        }
+
+        n_top += 1;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/dgemm_kernel_8x8_zvl256b.c b/kernel/riscv64/dgemm_kernel_8x8_zvl256b.c
new file mode 100644
index 000000000..760bfc893
--- /dev/null
+++ b/kernel/riscv64/dgemm_kernel_8x8_zvl256b.c
@@ -0,0 +1,860 @@
+/*
+
+AUTOGENERATED KERNEL
+Settings:
+ LMUL=1
+ M=8
+ M_tail_scalar_from=2
+ N=8
+ __riscv_='__riscv_'
+ complex=False
+ conjugate=False
+ cpu='zvl256b'
+ force_acc_double=False
+ index_type='BLASLONG'
+ op='gemm'
+ param_precision='double'
+ reg_width_bits=256
+ tail_policy=''
+ trace=False
+
+Derived:
+ ELEN_ACC=64
+ ELEN_PARAM=64
+ LMUL_ACC=1
+ VFMACC='__riscv_vfmacc_vf_f64m1'
+ VFMUL='__riscv_vfmul_vf_f64m1'
+ VLEV='__riscv_vle64_v_f64m1'
+ VLSEV='__riscv_vlse64_v_f64m1'
+ VMACC_TO_ACC='__riscv_vfmacc_vf_f64m1'
+ VMUL_TO_ACC='__riscv_vfmul_vf_f64m1'
+ VSETVL='__riscv_vsetvl_e64m1'
+ VSEV='__riscv_vse64_v_f64m1'
+ VSSEV='__riscv_vsse64_v_f64m1'
+ acc_vector_t='vfloat64m1_t'
+ output='dgemm_kernel_8x8_zvl256b.c'
+ param_scalar_t='double'
+ param_vector_t='vfloat64m1_t'
+
+*/
+
+#include "common.h"
+
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc)
+
+{
+    BLASLONG gvl = 0;
+    BLASLONG m_top = 0;
+    BLASLONG n_top = 0;
+
+
+    // -- MAIN PASS
+
+    for (BLASLONG j=0; j<N/8; j+=1) {
+        m_top = 0;
+        BLASLONG gvl = __riscv_vsetvl_e64m1(4);
+
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            double B0 = B[bi+0];
+            double B1 = B[bi+1];
+            double B2 = B[bi+2];
+            double B3 = B[bi+3];
+            double B4 = B[bi+4];
+            double B5 = B[bi+5];
+            double B6 = B[bi+6];
+            double B7 = B[bi+7];
+            bi += 8;
+
+            vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+            vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+            ai += 8;
+
+            vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
+            vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl);
+            vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
+            vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A1, B1, gvl);
+            vfloat64m1_t result4 = __riscv_vfmul_vf_f64m1( A0, B2, gvl);
+            vfloat64m1_t result5 = __riscv_vfmul_vf_f64m1( A1, B2, gvl);
+            vfloat64m1_t result6 = __riscv_vfmul_vf_f64m1( A0, B3, gvl);
+            vfloat64m1_t result7 = __riscv_vfmul_vf_f64m1( A1, B3, gvl);
+            vfloat64m1_t result8 = __riscv_vfmul_vf_f64m1( A0, B4, gvl);
+            vfloat64m1_t result9 = __riscv_vfmul_vf_f64m1( A1, B4, gvl);
+            vfloat64m1_t result10 = __riscv_vfmul_vf_f64m1( A0, B5, gvl);
+            vfloat64m1_t result11 = __riscv_vfmul_vf_f64m1( A1, B5, gvl);
+            vfloat64m1_t result12 = __riscv_vfmul_vf_f64m1( A0, B6, gvl);
+            vfloat64m1_t result13 = __riscv_vfmul_vf_f64m1( A1, B6, gvl);
+            vfloat64m1_t result14 = __riscv_vfmul_vf_f64m1( A0, B7, gvl);
+            vfloat64m1_t result15 = __riscv_vfmul_vf_f64m1( A1, B7, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                B4 = B[bi+4];
+                B5 = B[bi+5];
+                B6 = B[bi+6];
+                B7 = B[bi+7];
+                bi += 8;
+
+                A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+                A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl);
+                result2 = __riscv_vfmacc_vf_f64m1( result2, B1, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f64m1( result3, B1, A1, gvl);
+                result4 = __riscv_vfmacc_vf_f64m1( result4, B2, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f64m1( result5, B2, A1, gvl);
+                result6 = __riscv_vfmacc_vf_f64m1( result6, B3, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f64m1( result7, B3, A1, gvl);
+                result8 = __riscv_vfmacc_vf_f64m1( result8, B4, A0, gvl);
+                result9 = __riscv_vfmacc_vf_f64m1( result9, B4, A1, gvl);
+                result10 = __riscv_vfmacc_vf_f64m1( result10, B5, A0, gvl);
+                result11 = __riscv_vfmacc_vf_f64m1( result11, B5, A1, gvl);
+                result12 = __riscv_vfmacc_vf_f64m1( result12, B6, A0, gvl);
+                result13 = __riscv_vfmacc_vf_f64m1( result13, B6, A1, gvl);
+                result14 = __riscv_vfmacc_vf_f64m1( result14, B7, A0, gvl);
+                result15 = __riscv_vfmacc_vf_f64m1( result15, B7, A1, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat64m1_t c4 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c5 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat64m1_t c6 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c7 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat64m1_t c8 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c9 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat64m1_t c10 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c11 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat64m1_t c12 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c13 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat64m1_t c14 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c15 = __riscv_vle64_v_f64m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl );
+            c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl );
+            c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl );
+            c4 = __riscv_vfmacc_vf_f64m1( c4, alpha, result4, gvl );
+            c5 = __riscv_vfmacc_vf_f64m1( c5, alpha, result5, gvl );
+            c6 = __riscv_vfmacc_vf_f64m1( c6, alpha, result6, gvl );
+            c7 = __riscv_vfmacc_vf_f64m1( c7, alpha, result7, gvl );
+            c8 = __riscv_vfmacc_vf_f64m1( c8, alpha, result8, gvl );
+            c9 = __riscv_vfmacc_vf_f64m1( c9, alpha, result9, gvl );
+            c10 = __riscv_vfmacc_vf_f64m1( c10, alpha, result10, gvl );
+            c11 = __riscv_vfmacc_vf_f64m1( c11, alpha, result11, gvl );
+            c12 = __riscv_vfmacc_vf_f64m1( c12, alpha, result12, gvl );
+            c13 = __riscv_vfmacc_vf_f64m1( c13, alpha, result13, gvl );
+            c14 = __riscv_vfmacc_vf_f64m1( c14, alpha, result14, gvl );
+            c15 = __riscv_vfmacc_vf_f64m1( c15, alpha, result15, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c4, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c6, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c7, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c8, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c9, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c10, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c11, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c12, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c13, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c14, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c15, gvl);
+            m_top += 8;
+        }
+
+
+
+        // -- tails for main pass
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e64m1(4);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            double B0 = B[bi+0];
+            double B1 = B[bi+1];
+            double B2 = B[bi+2];
+            double B3 = B[bi+3];
+            double B4 = B[bi+4];
+            double B5 = B[bi+5];
+            double B6 = B[bi+6];
+            double B7 = B[bi+7];
+            bi += 8;
+
+            vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+            ai += 4;
+
+            vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
+            vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
+            vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B2, gvl);
+            vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A0, B3, gvl);
+            vfloat64m1_t result4 = __riscv_vfmul_vf_f64m1( A0, B4, gvl);
+            vfloat64m1_t result5 = __riscv_vfmul_vf_f64m1( A0, B5, gvl);
+            vfloat64m1_t result6 = __riscv_vfmul_vf_f64m1( A0, B6, gvl);
+            vfloat64m1_t result7 = __riscv_vfmul_vf_f64m1( A0, B7, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                B4 = B[bi+4];
+                B5 = B[bi+5];
+                B6 = B[bi+6];
+                B7 = B[bi+7];
+                bi += 8;
+
+                A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m1( result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f64m1( result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f64m1( result3, B3, A0, gvl);
+                result4 = __riscv_vfmacc_vf_f64m1( result4, B4, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f64m1( result5, B5, A0, gvl);
+                result6 = __riscv_vfmacc_vf_f64m1( result6, B6, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f64m1( result7, B7, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat64m1_t c4 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat64m1_t c5 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat64m1_t c6 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat64m1_t c7 = __riscv_vle64_v_f64m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl );
+            c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl );
+            c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl );
+            c4 = __riscv_vfmacc_vf_f64m1( c4, alpha, result4, gvl );
+            c5 = __riscv_vfmacc_vf_f64m1( c5, alpha, result5, gvl );
+            c6 = __riscv_vfmacc_vf_f64m1( c6, alpha, result6, gvl );
+            c7 = __riscv_vfmacc_vf_f64m1( c7, alpha, result7, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c4, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c6, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c7, gvl);
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            double result4 = 0;
+            double result5 = 0;
+            double result6 = 0;
+            double result7 = 0;
+            double result8 = 0;
+            double result9 = 0;
+            double result10 = 0;
+            double result11 = 0;
+            double result12 = 0;
+            double result13 = 0;
+            double result14 = 0;
+            double result15 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+1]*B[bi+0];
+                result2+=A[ai+0]*B[bi+1];
+                result3+=A[ai+1]*B[bi+1];
+                result4+=A[ai+0]*B[bi+2];
+                result5+=A[ai+1]*B[bi+2];
+                result6+=A[ai+0]*B[bi+3];
+                result7+=A[ai+1]*B[bi+3];
+                result8+=A[ai+0]*B[bi+4];
+                result9+=A[ai+1]*B[bi+4];
+                result10+=A[ai+0]*B[bi+5];
+                result11+=A[ai+1]*B[bi+5];
+                result12+=A[ai+0]*B[bi+6];
+                result13+=A[ai+1]*B[bi+6];
+                result14+=A[ai+0]*B[bi+7];
+                result15+=A[ai+1]*B[bi+7];
+                ai+=2;
+                bi+=8;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] += alpha * result0;
+            C[ci+0*ldc+1] += alpha * result1;
+            C[ci+1*ldc+0] += alpha * result2;
+            C[ci+1*ldc+1] += alpha * result3;
+            C[ci+2*ldc+0] += alpha * result4;
+            C[ci+2*ldc+1] += alpha * result5;
+            C[ci+3*ldc+0] += alpha * result6;
+            C[ci+3*ldc+1] += alpha * result7;
+            C[ci+4*ldc+0] += alpha * result8;
+            C[ci+4*ldc+1] += alpha * result9;
+            C[ci+5*ldc+0] += alpha * result10;
+            C[ci+5*ldc+1] += alpha * result11;
+            C[ci+6*ldc+0] += alpha * result12;
+            C[ci+6*ldc+1] += alpha * result13;
+            C[ci+7*ldc+0] += alpha * result14;
+            C[ci+7*ldc+1] += alpha * result15;
+            m_top+=2;
+        }
+
+
+        if( M & 1 ) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            double result4 = 0;
+            double result5 = 0;
+            double result6 = 0;
+            double result7 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+0]*B[bi+1];
+                result2+=A[ai+0]*B[bi+2];
+                result3+=A[ai+0]*B[bi+3];
+                result4+=A[ai+0]*B[bi+4];
+                result5+=A[ai+0]*B[bi+5];
+                result6+=A[ai+0]*B[bi+6];
+                result7+=A[ai+0]*B[bi+7];
+                ai+=1;
+                bi+=8;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] += alpha * result0;
+            C[ci+1*ldc+0] += alpha * result1;
+            C[ci+2*ldc+0] += alpha * result2;
+            C[ci+3*ldc+0] += alpha * result3;
+            C[ci+4*ldc+0] += alpha * result4;
+            C[ci+5*ldc+0] += alpha * result5;
+            C[ci+6*ldc+0] += alpha * result6;
+            C[ci+7*ldc+0] += alpha * result7;
+            m_top+=1;
+        }
+
+        n_top += 8;
+    }
+
+
+
+    // -- tails for N=4
+
+    if( N & 4 ) {
+        gvl = __riscv_vsetvl_e64m1(4);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            double B0 = B[bi+0];
+            double B1 = B[bi+1];
+            double B2 = B[bi+2];
+            double B3 = B[bi+3];
+            bi += 4;
+
+            vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+            vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+            ai += 8;
+
+            vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
+            vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl);
+            vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
+            vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A1, B1, gvl);
+            vfloat64m1_t result4 = __riscv_vfmul_vf_f64m1( A0, B2, gvl);
+            vfloat64m1_t result5 = __riscv_vfmul_vf_f64m1( A1, B2, gvl);
+            vfloat64m1_t result6 = __riscv_vfmul_vf_f64m1( A0, B3, gvl);
+            vfloat64m1_t result7 = __riscv_vfmul_vf_f64m1( A1, B3, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                bi += 4;
+
+                A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+                A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl);
+                result2 = __riscv_vfmacc_vf_f64m1( result2, B1, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f64m1( result3, B1, A1, gvl);
+                result4 = __riscv_vfmacc_vf_f64m1( result4, B2, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f64m1( result5, B2, A1, gvl);
+                result6 = __riscv_vfmacc_vf_f64m1( result6, B3, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f64m1( result7, B3, A1, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat64m1_t c4 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c5 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat64m1_t c6 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c7 = __riscv_vle64_v_f64m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl );
+            c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl );
+            c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl );
+            c4 = __riscv_vfmacc_vf_f64m1( c4, alpha, result4, gvl );
+            c5 = __riscv_vfmacc_vf_f64m1( c5, alpha, result5, gvl );
+            c6 = __riscv_vfmacc_vf_f64m1( c6, alpha, result6, gvl );
+            c7 = __riscv_vfmacc_vf_f64m1( c7, alpha, result7, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c4, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c6, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c7, gvl);
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e64m1(4);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            double B0 = B[bi+0];
+            double B1 = B[bi+1];
+            double B2 = B[bi+2];
+            double B3 = B[bi+3];
+            bi += 4;
+
+            vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+            ai += 4;
+
+            vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
+            vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
+            vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B2, gvl);
+            vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A0, B3, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                bi += 4;
+
+                A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m1( result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f64m1( result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f64m1( result3, B3, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl );
+            c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl );
+            c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c3, gvl);
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            double result4 = 0;
+            double result5 = 0;
+            double result6 = 0;
+            double result7 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+1]*B[bi+0];
+                result2+=A[ai+0]*B[bi+1];
+                result3+=A[ai+1]*B[bi+1];
+                result4+=A[ai+0]*B[bi+2];
+                result5+=A[ai+1]*B[bi+2];
+                result6+=A[ai+0]*B[bi+3];
+                result7+=A[ai+1]*B[bi+3];
+                ai+=2;
+                bi+=4;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] += alpha * result0;
+            C[ci+0*ldc+1] += alpha * result1;
+            C[ci+1*ldc+0] += alpha * result2;
+            C[ci+1*ldc+1] += alpha * result3;
+            C[ci+2*ldc+0] += alpha * result4;
+            C[ci+2*ldc+1] += alpha * result5;
+            C[ci+3*ldc+0] += alpha * result6;
+            C[ci+3*ldc+1] += alpha * result7;
+            m_top+=2;
+        }
+
+
+        if( M & 1 ) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+0]*B[bi+1];
+                result2+=A[ai+0]*B[bi+2];
+                result3+=A[ai+0]*B[bi+3];
+                ai+=1;
+                bi+=4;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] += alpha * result0;
+            C[ci+1*ldc+0] += alpha * result1;
+            C[ci+2*ldc+0] += alpha * result2;
+            C[ci+3*ldc+0] += alpha * result3;
+            m_top+=1;
+        }
+
+        n_top += 4;
+    }
+
+
+
+    // -- tails for N=2
+
+    if( N & 2 ) {
+        gvl = __riscv_vsetvl_e64m1(4);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            double B0 = B[bi+0];
+            double B1 = B[bi+1];
+            bi += 2;
+
+            vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+            vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+            ai += 8;
+
+            vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
+            vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl);
+            vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
+            vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A1, B1, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                bi += 2;
+
+                A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+                A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl);
+                result2 = __riscv_vfmacc_vf_f64m1( result2, B1, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f64m1( result3, B1, A1, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat64m1_t c2 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c3 = __riscv_vle64_v_f64m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl );
+            c2 = __riscv_vfmacc_vf_f64m1( c2, alpha, result2, gvl );
+            c3 = __riscv_vfmacc_vf_f64m1( c3, alpha, result3, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c3, gvl);
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e64m1(4);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            double B0 = B[bi+0];
+            double B1 = B[bi+1];
+            bi += 2;
+
+            vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+            ai += 4;
+
+            vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
+            vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                bi += 2;
+
+                A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m1( result1, B1, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c1, gvl);
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+1]*B[bi+0];
+                result2+=A[ai+0]*B[bi+1];
+                result3+=A[ai+1]*B[bi+1];
+                ai+=2;
+                bi+=2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] += alpha * result0;
+            C[ci+0*ldc+1] += alpha * result1;
+            C[ci+1*ldc+0] += alpha * result2;
+            C[ci+1*ldc+1] += alpha * result3;
+            m_top+=2;
+        }
+
+
+        if( M & 1 ) {
+            double result0 = 0;
+            double result1 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+0]*B[bi+1];
+                ai+=1;
+                bi+=2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] += alpha * result0;
+            C[ci+1*ldc+0] += alpha * result1;
+            m_top+=1;
+        }
+
+        n_top += 2;
+    }
+
+
+
+    // -- tails for N=1
+
+    if( N & 1 ) {
+        gvl = __riscv_vsetvl_e64m1(4);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            double B0 = B[bi+0];
+            bi += 1;
+
+            vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+            vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+            ai += 8;
+
+            vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
+            vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                bi += 1;
+
+                A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+                A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl); ci += gvl;
+            vfloat64m1_t c1 = __riscv_vle64_v_f64m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f64m1( c1, alpha, result1, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c1, gvl);
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e64m1(4);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            double B0 = B[bi+0];
+            bi += 1;
+
+            vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+            ai += 4;
+
+            vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                bi += 1;
+
+                A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t c0 = __riscv_vle64_v_f64m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f64m1( c0, alpha, result0, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse64_v_f64m1( &C[ci], c0, gvl);
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            double result0 = 0;
+            double result1 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+1]*B[bi+0];
+                ai+=2;
+                bi+=1;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] += alpha * result0;
+            C[ci+0*ldc+1] += alpha * result1;
+            m_top+=2;
+        }
+
+
+        if( M & 1 ) {
+            double result0 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                ai+=1;
+                bi+=1;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] += alpha * result0;
+            m_top+=1;
+        }
+
+        n_top += 1;
+    }
+
+    return 0;
+}
+
diff --git a/kernel/riscv64/dot.c b/kernel/riscv64/dot.c
index bf55998ca..8ad493a2b 100644
--- a/kernel/riscv64/dot.c
+++ b/kernel/riscv64/dot.c
@@ -44,14 +44,24 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 {
 	BLASLONG i=0;
 	BLASLONG ix=0,iy=0;
+
+#if defined(DSDOT)
 	double dot = 0.0 ;
+#else
+	FLOAT  dot = 0.0 ;
+#endif
 
 	if ( n < 1 )  return(dot);
 
 	while(i < n)
 	{
 
+#if defined(DSDOT)
+		dot += (double) y[iy] * (double) x[ix] ;
+#else
 		dot += y[iy] * x[ix] ;
+#endif
+
 		ix  += inc_x ;
 		iy  += inc_y ;
 		i++ ;
diff --git a/kernel/riscv64/dot_rvv.c b/kernel/riscv64/dot_rvv.c
new file mode 100644
index 000000000..837badf41
--- /dev/null
+++ b/kernel/riscv64/dot_rvv.c
@@ -0,0 +1,126 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if defined(DSDOT) /* DSDOT build: the double accumulator is returned as-is */
+double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#else /* otherwise the double accumulator is converted to FLOAT on return */
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+#endif
+{ /* Dot product: sum of x[i*inc_x] * y[i*inc_y] for i in [0, n), computed with RVV intrinsics. */
+    double dot = 0.0;
+    /* The scalar result is always held in double; a non-positive n returns 0.0 without touching x or y. */
+    if ( n <= 0 ) return(dot);
+    /* vr holds lane-wise partial sums; all vlmax lanes start at 0 so untouched lanes add nothing later. */
+    size_t vlmax = __riscv_vsetvlmax_e64m8();
+    vfloat64m8_t vr = __riscv_vfmv_v_f_f64m8(0, vlmax);
+    /* Four loops follow, chosen by which increments equal 1 (unit-stride vs. strided loads). */
+    if(inc_x == 1 && inc_y == 1) {
+        /* Both operands contiguous: n shrinks and x, y advance by the granted vl on every pass. */
+        for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
+            vl = __riscv_vsetvl_e64m8(n);
+            /* vl is requested as e64m8 even for 32-bit data: e32/m4 and e64/m8 have the same VLMAX. */
+#if !defined(DOUBLE)
+            vfloat32m4_t vx = __riscv_vle32_v_f32m4(x, vl);
+            vfloat32m4_t vy = __riscv_vle32_v_f32m4(y, vl);
+            /* Widening multiply-accumulate (f32 * f32 into f64); _tu leaves lanes >= vl of vr unchanged. */
+            vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl);
+#else
+            vfloat64m8_t vx = __riscv_vle64_v_f64m8(x, vl);
+            vfloat64m8_t vy = __riscv_vle64_v_f64m8(y, vl);
+            /* Same-width multiply-accumulate; _tu leaves lanes >= vl of vr unchanged. */
+            vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl);
+#endif
+        }
+
+    } else if (1 == inc_x) {
+        /* Only y is strided; the strided-load intrinsics take the stride in bytes. */
+        BLASLONG stride_y = inc_y * sizeof(FLOAT);
+        /* NOTE(review): vl*inc_y mixes size_t with signed BLASLONG; a negative inc_y relies on wraparound - confirm. */
+        for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) {
+            vl = __riscv_vsetvl_e64m8(n);
+
+#if !defined(DOUBLE)
+            vfloat32m4_t vx = __riscv_vle32_v_f32m4(x, vl);
+            vfloat32m4_t vy = __riscv_vlse32_v_f32m4(y, stride_y, vl);
+
+            vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl);
+#else
+            vfloat64m8_t vx = __riscv_vle64_v_f64m8(x, vl);
+            vfloat64m8_t vy = __riscv_vlse64_v_f64m8(y, stride_y, vl);
+
+            vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl);
+#endif
+        }
+    } else if (1 == inc_y) {
+        /* Only x is strided; mirror image of the previous branch. */
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) {
+            vl = __riscv_vsetvl_e64m8(n);
+
+#if !defined(DOUBLE)
+            vfloat32m4_t vx = __riscv_vlse32_v_f32m4(x, stride_x, vl);
+            vfloat32m4_t vy = __riscv_vle32_v_f32m4(y, vl);
+
+            vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl);
+#else
+            vfloat64m8_t vx = __riscv_vlse64_v_f64m8(x, stride_x, vl);
+            vfloat64m8_t vy = __riscv_vle64_v_f64m8(y, vl);
+
+            vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl);
+#endif
+        }
+    } else {
+        /* General case: both operands are read with byte-strided loads. */
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+        BLASLONG stride_y = inc_y * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) {
+            vl = __riscv_vsetvl_e64m8(n);
+
+#if !defined(DOUBLE)
+            vfloat32m4_t vx = __riscv_vlse32_v_f32m4(x, stride_x, vl);
+            vfloat32m4_t vy = __riscv_vlse32_v_f32m4(y, stride_y, vl);
+
+            vr = __riscv_vfwmacc_vv_f64m8_tu(vr, vx, vy, vl);
+#else
+            vfloat64m8_t vx = __riscv_vlse64_v_f64m8(x, stride_x, vl);
+            vfloat64m8_t vy = __riscv_vlse64_v_f64m8(y, stride_y, vl);
+
+            vr = __riscv_vfmacc_vv_f64m8_tu(vr, vx, vy, vl);
+#endif
+        }
+    }
+    /* Collapse the partial sums: unordered sum reduction over all vlmax lanes, seeded with 0 from vec_zero. */
+    vfloat64m1_t vec_zero = __riscv_vfmv_v_f_f64m1(0, vlmax);
+    vfloat64m1_t vec_sum = __riscv_vfredusum_vs_f64m8_f64m1(vr, vec_zero, vlmax);
+    dot = __riscv_vfmv_f_s_f64m1_f64(vec_sum);
+    /* The double result is converted to the declared return type here (FLOAT unless DSDOT is defined). */
+    return(dot);
+}
diff --git a/kernel/riscv64/dot_vector.c b/kernel/riscv64/dot_vector.c
index 64efc6c40..38ccc4778 100644
--- a/kernel/riscv64/dot_vector.c
+++ b/kernel/riscv64/dot_vector.c
@@ -27,31 +27,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)()
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT vfmacc_vv_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFDOTVV_FLOAT vfdot_vv_f32m4
+#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m4_f32m1(v_res, va, vb, gvl)
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
+#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1)
+#endif
+#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
+#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f32m4)
+#else
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)()
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT vfmacc_vv_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFDOTVV_FLOAT vfdot_vv_f64m4
+#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m4_f64m1(v_res, va, vb, gvl)
+#else
+#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1)
+#endif
+#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
+#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f64m4)
 #endif
 
 #if defined(DSDOT)
@@ -63,7 +69,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 	BLASLONG i=0, j=0;
 	double dot = 0.0 ;
 
-	if ( n < 0 )  return(dot);
+	if ( n < 1 )  return(dot);
 
         FLOAT_V_T vr, vx, vy;
         unsigned int gvl = 0;
@@ -82,8 +88,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
                         j += gvl;
                 }
                 if(j > 0){
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                        dot += (double)VFMVFS_FLOAT(v_res);
+                        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                        dot += (double)EXTRACT_FLOAT(v_res);
                 }
                 //tail
                 if(j < n){
@@ -93,13 +99,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
                         FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl);
                         //vr = VFDOTVV_FLOAT(vx, vy, gvl);
                         vr = VFMACCVV_FLOAT(vz, vx, vy, gvl);
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                        dot += (double)VFMVFS_FLOAT(v_res);
+                        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                        dot += (double)EXTRACT_FLOAT(v_res);
                 }
         }else if(inc_y == 1){
                 gvl = VSETVL(n);
                 vr = VFMVVF_FLOAT(0, gvl);
-                 int stride_x = inc_x * sizeof(FLOAT);
+                BLASLONG stride_x = inc_x * sizeof(FLOAT);
                 for(i=0,j=0; i<n/gvl; i++){
                         vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
                         vy = VLEV_FLOAT(&y[j], gvl);
@@ -107,9 +113,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
                         j += gvl;
                 }
                 if(j > 0){
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                        dot += (double)VFMVFS_FLOAT(v_res);
-
+                        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                        dot += (double)EXTRACT_FLOAT(v_res);
                 }
                 //tail
                 if(j < n){
@@ -119,14 +124,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
                         FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl);
                         //vr = VFDOTVV_FLOAT(vx, vy, gvl);
                         vr = VFMACCVV_FLOAT(vz, vx, vy, gvl);
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                        dot += (double)VFMVFS_FLOAT(v_res);
-
+                        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                        dot += (double)EXTRACT_FLOAT(v_res);
                 }
         }else if(inc_x == 1){
                 gvl = VSETVL(n);
                 vr = VFMVVF_FLOAT(0, gvl);
-                 int stride_y = inc_y * sizeof(FLOAT);
+                BLASLONG stride_y = inc_y * sizeof(FLOAT);
                 for(i=0,j=0; i<n/gvl; i++){
                         vx = VLEV_FLOAT(&x[j], gvl);
                         vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl);
@@ -134,9 +138,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
                         j += gvl;
                 }
                 if(j > 0){
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                        dot += (double)VFMVFS_FLOAT(v_res);
-
+                        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                        dot += (double)EXTRACT_FLOAT(v_res);
                 }
                 //tail
                 if(j < n){
@@ -146,15 +149,14 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
                         FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl);
                         //vr = VFDOTVV_FLOAT(vx, vy, gvl);
                         vr = VFMACCVV_FLOAT(vz, vx, vy, gvl);
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                        dot += (double)VFMVFS_FLOAT(v_res);
-
+                        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                        dot += (double)EXTRACT_FLOAT(v_res);
                 }
         }else{
                 gvl = VSETVL(n);
                 vr = VFMVVF_FLOAT(0, gvl);
-                 int stride_x = inc_x * sizeof(FLOAT);
-                 int stride_y = inc_y * sizeof(FLOAT);
+                BLASLONG stride_x = inc_x * sizeof(FLOAT);
+                BLASLONG stride_y = inc_y * sizeof(FLOAT);
                 for(i=0,j=0; i<n/gvl; i++){
                         vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
                         vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl);
@@ -162,9 +164,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
                         j += gvl;
                 }
                 if(j > 0){
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                        dot += (double)VFMVFS_FLOAT(v_res);
-
+                        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                        dot += (double)EXTRACT_FLOAT(v_res);
                 }
                 //tail
                 if(j < n){
@@ -174,9 +175,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
                         FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl);
                         //vr = VFDOTVV_FLOAT(vx, vy, gvl);
                         vr = VFMACCVV_FLOAT(vz, vx, vy, gvl);
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                        dot += (double)VFMVFS_FLOAT(v_res);
-
+                        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                        dot += (double)EXTRACT_FLOAT(v_res);
                 }
         }
 	return(dot);
diff --git a/kernel/riscv64/dsdot_vector.c b/kernel/riscv64/dsdot_vector.c
new file mode 100644
index 000000000..e972828b5
--- /dev/null
+++ b/kernel/riscv64/dsdot_vector.c
@@ -0,0 +1,152 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+/*
+ * Dot product of x and y (both read as 32-bit floats), accumulated in double
+ * precision via RVV widening multiply-accumulate. Returns 0.0 when n < 1.
+ * inc_x / inc_y are element strides. Every stride case follows one pattern:
+ * n/gvl full-width passes accumulate into vr, vr is reduced once into dot,
+ * and the remaining n-j elements are handled with a shortened vector length.
+ * Byte strides are BLASLONG so that large increments are not truncated.
+ */
+double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+        BLASLONG i=0, j=0;
+        double dot = 0.0 ;
+
+        if ( n < 1 )  return(dot);
+        vfloat64m4_t vr;
+        vfloat32m2_t vx, vy;
+        unsigned int gvl = 0;
+        vfloat64m1_t v_res, v_z0;
+        // v_z0 stays all-zero and is the scalar operand of every reduction
+        gvl = vsetvlmax_e64m1();
+        v_res = vfmv_v_f_f64m1(0, gvl);
+        v_z0 = vfmv_v_f_f64m1(0, gvl);
+
+        if(inc_x == 1 && inc_y == 1){
+                // both vectors contiguous: unit-stride loads only
+                gvl = vsetvl_e64m4(n);
+                vr = vfmv_v_f_f64m4(0, gvl);
+                for(i=0,j=0; i<n/gvl; i++){
+                        vx = vle32_v_f32m2(&x[j], gvl);
+                        vy = vle32_v_f32m2(&y[j], gvl);
+                        vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl);
+                        j += gvl;
+                }
+                if(j > 0){
+                        v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
+                        dot += (double)vfmv_f_s_f64m1_f64(v_res);
+                }
+                // tail: fewer than one full vector of elements left
+                if(j < n){
+                        gvl = vsetvl_e64m4(n-j);
+                        vx = vle32_v_f32m2(&x[j], gvl);
+                        vy = vle32_v_f32m2(&y[j], gvl);
+                        vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl); // fresh zero accumulator: vr was already added to dot
+                        vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl);
+                        v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
+                        dot += (double)vfmv_f_s_f64m1_f64(v_res);
+                }
+        }else if(inc_y == 1){
+                gvl = vsetvl_e64m4(n);
+                vr = vfmv_v_f_f64m4(0, gvl);
+                BLASLONG stride_x = inc_x * sizeof(FLOAT); // byte stride for the strided load of x
+                for(i=0,j=0; i<n/gvl; i++){
+                        vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
+                        vy = vle32_v_f32m2(&y[j], gvl);
+                        vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl);
+                        j += gvl;
+                }
+                if(j > 0){
+                        v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
+                        dot += (double)vfmv_f_s_f64m1_f64(v_res);
+                }
+                // tail: fewer than one full vector of elements left
+                if(j < n){
+                        gvl = vsetvl_e64m4(n-j);
+                        vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
+                        vy = vle32_v_f32m2(&y[j], gvl);
+                        vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl);
+                        vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl);
+                        v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
+                        dot += (double)vfmv_f_s_f64m1_f64(v_res);
+                }
+        }else if(inc_x == 1){
+                gvl = vsetvl_e64m4(n);
+                vr = vfmv_v_f_f64m4(0, gvl);
+                BLASLONG stride_y = inc_y * sizeof(FLOAT); // byte stride for the strided load of y
+                for(i=0,j=0; i<n/gvl; i++){
+                        vx = vle32_v_f32m2(&x[j], gvl);
+                        vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
+                        vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl);
+                        j += gvl;
+                }
+                if(j > 0){
+                        v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
+                        dot += (double)vfmv_f_s_f64m1_f64(v_res);
+                }
+                // tail: fewer than one full vector of elements left
+                if(j < n){
+                        gvl = vsetvl_e64m4(n-j);
+                        vx = vle32_v_f32m2(&x[j], gvl);
+                        vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
+                        vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl);
+                        vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl);
+                        v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
+                        dot += (double)vfmv_f_s_f64m1_f64(v_res);
+                }
+        }else{
+                gvl = vsetvl_e64m4(n);
+                vr = vfmv_v_f_f64m4(0, gvl);
+                BLASLONG stride_x = inc_x * sizeof(FLOAT);
+                BLASLONG stride_y = inc_y * sizeof(FLOAT);
+                for(i=0,j=0; i<n/gvl; i++){
+                        vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
+                        vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
+                        vr = vfwmacc_vv_f64m4(vr, vx, vy, gvl);
+                        j += gvl;
+                }
+                if(j > 0){
+                        v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
+                        dot += (double)vfmv_f_s_f64m1_f64(v_res);
+                }
+                // tail: fewer than one full vector of elements left
+                if(j < n){
+                        gvl = vsetvl_e64m4(n-j);
+                        vx = vlse32_v_f32m2(&x[j*inc_x], stride_x, gvl);
+                        vy = vlse32_v_f32m2(&y[j*inc_y], stride_y, gvl);
+                        vfloat64m4_t vz = vfmv_v_f_f64m4(0, gvl);
+                        vr = vfwmacc_vv_f64m4(vz, vx, vy, gvl);
+                        v_res = vfredusum_vs_f64m4_f64m1(v_res, vr, v_z0, gvl);
+                        dot += (double)vfmv_f_s_f64m1_f64(v_res);
+                }
+        }
+        return(dot);
+}
diff --git a/kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c b/kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c
new file mode 100644
index 000000000..c1e0da86e
--- /dev/null
+++ b/kernel/riscv64/dtrmm_kernel_8x4_zvl128b.c
@@ -0,0 +1,660 @@
+/*
+
+AUTOGENERATED KERNEL
+Script: ./kernel/riscv64/generate_kernel.py
+Settings:
+ LMUL=4
+ M=8
+ M_tail_scalar_from=2
+ N=4
+ __riscv_='__riscv_'
+ complex=False
+ conjugate=False
+ cpu='zvl128b'
+ force_acc_double=False
+ index_type='BLASLONG'
+ op='trmm'
+ param_precision='double'
+ reg_width_bits=128
+ tail_policy=''
+ trace=False
+
+Derived:
+ ELEN_ACC=64
+ ELEN_PARAM=64
+ LMUL_ACC=4
+ VFMACC='__riscv_vfmacc_vf_f64m4'
+ VFMUL='__riscv_vfmul_vf_f64m4'
+ VLEV='__riscv_vle64_v_f64m4'
+ VLSEV='__riscv_vlse64_v_f64m4'
+ VMACC_TO_ACC='__riscv_vfmacc_vf_f64m4'
+ VMUL_TO_ACC='__riscv_vfmul_vf_f64m4'
+ VSETVL='__riscv_vsetvl_e64m4'
+ VSEV='__riscv_vse64_v_f64m4'
+ VSSEV='__riscv_vsse64_v_f64m4'
+ acc_vector_t='vfloat64m4_t'
+ output='dtrmm_kernel_8x4_zvl128b.c'
+ param_scalar_t='double'
+ param_vector_t='vfloat64m4_t'
+
+*/
+
+#include "common.h"
+
+#if defined(LEFT) != defined(TRANSA)
+#define BACKWARDS
+#endif
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset)
+/* TRMM micro-kernel: overwrites C (column stride ldc) tile by tile with alpha * sum_k A*B over the k range selected by offset/LEFT/BACKWARDS; always returns 0. */
+{
+    BLASLONG gvl = 0;   // vector length returned by vsetvl for the current tile height
+    BLASLONG m_top = 0; // first row of the tile being computed
+    BLASLONG n_top = 0; // first column of the tile being computed
+
+    // -- MAIN PASS: blocks of 4 columns
+
+    for (BLASLONG j = 0; j < N / 4; j += 1) {
+        m_top = 0;
+        BLASLONG gvl = __riscv_vsetvl_e64m4(8); // NOTE: this declaration shadows the function-scope gvl inside the loop body
+        // 8x4 tiles: A is read 8 values per k step (ai += 8), B 4 values per k step (bi += 4)
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K; // index of the first A value used by this tile
+            BLASLONG bi = n_top * K; // index of the first B value used by this tile
+            BLASLONG pass_K = K;     // number of k steps to multiply for this tile
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 8; // skip the first off k steps of A ...
+            bi += off * 4; // ... and of B
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 8; // only the first off + tile-height k steps
+#else
+            pass_K = off + 4; // only the first off + tile-width k steps
+#endif
+#endif
+            double B0 = B[bi + 0]; // k = 0 is peeled so the accumulators start from a product
+            double B1 = B[bi + 1];
+            double B2 = B[bi + 2];
+            double B3 = B[bi + 3];
+            bi += 4;
+
+            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+            ai += 8;
+
+            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
+            vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);
+            vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl);
+            vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl);
+            // remaining k steps accumulate into result0..result3 (one accumulator per column)
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                B2 = B[bi + 2];
+                B3 = B[bi + 3];
+                bi += 4;
+
+                A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl);
+            }
+            // scale by alpha and store; C is written without being read
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl);
+            vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl);
+            vfloat64m4_t c2 = __riscv_vfmul_vf_f64m4(result2, alpha, gvl);
+            vfloat64m4_t c3 = __riscv_vfmul_vf_f64m4(result3, alpha, gvl);
+            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0; // next column of C ("gvl * 0" is always 0)
+            __riscv_vse64_v_f64m4(&C[ci], c1, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse64_v_f64m4(&C[ci], c2, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse64_v_f64m4(&C[ci], c3, gvl);
+            m_top += 8;
+        }
+
+        // -- tails for main pass: leftover rows, handled as 4 (vector), 2 and 1 (scalar)
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e64m4(4);
+            // 4x4 tile: same scheme as the 8x4 tile, with a vector length request of 4
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 4;
+            bi += off * 4;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 4;
+#else
+            pass_K = off + 4;
+#endif
+#endif
+            double B0 = B[bi + 0];
+            double B1 = B[bi + 1];
+            double B2 = B[bi + 2];
+            double B3 = B[bi + 3];
+            bi += 4;
+
+            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+            ai += 4;
+
+            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
+            vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);
+            vfloat64m4_t result2 = __riscv_vfmul_vf_f64m4(A0, B2, gvl);
+            vfloat64m4_t result3 = __riscv_vfmul_vf_f64m4(A0, B3, gvl);
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                B2 = B[bi + 2];
+                B3 = B[bi + 3];
+                bi += 4;
+
+                A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f64m4(result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f64m4(result3, B3, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl);
+            vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl);
+            vfloat64m4_t c2 = __riscv_vfmul_vf_f64m4(result2, alpha, gvl);
+            vfloat64m4_t c3 = __riscv_vfmul_vf_f64m4(result3, alpha, gvl);
+            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse64_v_f64m4(&C[ci], c1, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse64_v_f64m4(&C[ci], c2, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse64_v_f64m4(&C[ci], c3, gvl);
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            double result4 = 0;
+            double result5 = 0;
+            double result6 = 0;
+            double result7 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 2;
+            bi += off * 4;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 2;
+#else
+            pass_K = off + 4;
+#endif
+#endif
+            // scalar 2x4 tile: resultN holds row N % 2, column N / 2
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 1] * B[bi + 0];
+                result2 += A[ai + 0] * B[bi + 1];
+                result3 += A[ai + 1] * B[bi + 1];
+                result4 += A[ai + 0] * B[bi + 2];
+                result5 += A[ai + 1] * B[bi + 2];
+                result6 += A[ai + 0] * B[bi + 3];
+                result7 += A[ai + 1] * B[bi + 3];
+                ai += 2;
+                bi += 4;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] = alpha * result0;
+            C[ci + 0 * ldc + 1] = alpha * result1;
+            C[ci + 1 * ldc + 0] = alpha * result2;
+            C[ci + 1 * ldc + 1] = alpha * result3;
+            C[ci + 2 * ldc + 0] = alpha * result4;
+            C[ci + 2 * ldc + 1] = alpha * result5;
+            C[ci + 3 * ldc + 0] = alpha * result6;
+            C[ci + 3 * ldc + 1] = alpha * result7;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 1;
+            bi += off * 4;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 1;
+#else
+            pass_K = off + 4;
+#endif
+#endif
+            // scalar 1x4 tile: resultN holds column N
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 0] * B[bi + 1];
+                result2 += A[ai + 0] * B[bi + 2];
+                result3 += A[ai + 0] * B[bi + 3];
+                ai += 1;
+                bi += 4;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] = alpha * result0;
+            C[ci + 1 * ldc + 0] = alpha * result1;
+            C[ci + 2 * ldc + 0] = alpha * result2;
+            C[ci + 3 * ldc + 0] = alpha * result3;
+            m_top += 1;
+        }
+
+        n_top += 4;
+    }
+
+    // -- tails for N=2: one block of 2 columns, rows tiled 8/4/2/1 as in the main pass
+
+    if (N & 2) {
+        gvl = __riscv_vsetvl_e64m4(8);
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 8;
+            bi += off * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 8;
+#else
+            pass_K = off + 2;
+#endif
+#endif
+            double B0 = B[bi + 0];
+            double B1 = B[bi + 1];
+            bi += 2;
+
+            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+            ai += 8;
+
+            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
+            vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                bi += 2;
+
+                A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl);
+            vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl);
+            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse64_v_f64m4(&C[ci], c1, gvl);
+            m_top += 8;
+        }
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e64m4(4);
+
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 4;
+            bi += off * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 4;
+#else
+            pass_K = off + 2;
+#endif
+#endif
+            double B0 = B[bi + 0];
+            double B1 = B[bi + 1];
+            bi += 2;
+
+            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+            ai += 4;
+
+            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
+            vfloat64m4_t result1 = __riscv_vfmul_vf_f64m4(A0, B1, gvl);
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                bi += 2;
+
+                A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m4(result1, B1, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl);
+            vfloat64m4_t c1 = __riscv_vfmul_vf_f64m4(result1, alpha, gvl);
+            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse64_v_f64m4(&C[ci], c1, gvl);
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 2;
+            bi += off * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 2;
+#else
+            pass_K = off + 2;
+#endif
+#endif
+            // scalar 2x2 tile: resultN holds row N % 2, column N / 2
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 1] * B[bi + 0];
+                result2 += A[ai + 0] * B[bi + 1];
+                result3 += A[ai + 1] * B[bi + 1];
+                ai += 2;
+                bi += 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] = alpha * result0;
+            C[ci + 0 * ldc + 1] = alpha * result1;
+            C[ci + 1 * ldc + 0] = alpha * result2;
+            C[ci + 1 * ldc + 1] = alpha * result3;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            double result0 = 0;
+            double result1 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 1;
+            bi += off * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 1;
+#else
+            pass_K = off + 2;
+#endif
+#endif
+            // scalar 1x2 tile: resultN holds column N
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 0] * B[bi + 1];
+                ai += 1;
+                bi += 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] = alpha * result0;
+            C[ci + 1 * ldc + 0] = alpha * result1;
+            m_top += 1;
+        }
+
+        n_top += 2;
+    }
+
+    // -- tails for N=1: last single column, rows tiled 8/4/2/1 as in the main pass
+
+    if (N & 1) {
+        gvl = __riscv_vsetvl_e64m4(8);
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 8;
+            bi += off * 1;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 8;
+#else
+            pass_K = off + 1;
+#endif
+#endif
+            double B0 = B[bi + 0];
+            bi += 1;
+
+            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+            ai += 8;
+
+            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0 = B[bi + 0];
+                bi += 1;
+
+                A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl);
+            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
+            m_top += 8;
+        }
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e64m4(4);
+
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 4;
+            bi += off * 1;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 4;
+#else
+            pass_K = off + 1;
+#endif
+#endif
+            double B0 = B[bi + 0];
+            bi += 1;
+
+            vfloat64m4_t A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+            ai += 4;
+
+            vfloat64m4_t result0 = __riscv_vfmul_vf_f64m4(A0, B0, gvl);
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0 = B[bi + 0];
+                bi += 1;
+
+                A0 = __riscv_vle64_v_f64m4(&A[ai + 0 * gvl], gvl);
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f64m4(result0, B0, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat64m4_t c0 = __riscv_vfmul_vf_f64m4(result0, alpha, gvl);
+            __riscv_vse64_v_f64m4(&C[ci], c0, gvl);
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            double result0 = 0;
+            double result1 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 2;
+            bi += off * 1;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 2;
+#else
+            pass_K = off + 1;
+#endif
+#endif
+            // scalar 2x1 tile: resultN holds row N
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 1] * B[bi + 0];
+                ai += 2;
+                bi += 1;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] = alpha * result0;
+            C[ci + 0 * ldc + 1] = alpha * result1;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            double result0 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 1;
+            bi += off * 1;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 1;
+#else
+            pass_K = off + 1;
+#endif
+#endif
+            // scalar 1x1 tile: a single accumulated product
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                ai += 1;
+                bi += 1;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] = alpha * result0;
+            m_top += 1;
+        }
+
+        n_top += 1;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/dtrmm_kernel_8x8_zvl256b.c b/kernel/riscv64/dtrmm_kernel_8x8_zvl256b.c
new file mode 100644
index 000000000..b1739f248
--- /dev/null
+++ b/kernel/riscv64/dtrmm_kernel_8x8_zvl256b.c
@@ -0,0 +1,1068 @@
+/*
+
+AUTOGENERATED KERNEL
+Settings:
+ LMUL=1
+ M=8
+ M_tail_scalar_from=2
+ N=8
+ __riscv_='__riscv_'
+ complex=False
+ conjugate=False
+ cpu='zvl256b'
+ force_acc_double=False
+ index_type='BLASLONG'
+ op='trmm'
+ param_precision='double'
+ reg_width_bits=256
+ tail_policy=''
+ trace=False
+
+Derived:
+ ELEN_ACC=64
+ ELEN_PARAM=64
+ LMUL_ACC=1
+ VFMACC='__riscv_vfmacc_vf_f64m1'
+ VFMUL='__riscv_vfmul_vf_f64m1'
+ VLEV='__riscv_vle64_v_f64m1'
+ VLSEV='__riscv_vlse64_v_f64m1'
+ VMACC_TO_ACC='__riscv_vfmacc_vf_f64m1'
+ VMUL_TO_ACC='__riscv_vfmul_vf_f64m1'
+ VSETVL='__riscv_vsetvl_e64m1'
+ VSEV='__riscv_vse64_v_f64m1'
+ VSSEV='__riscv_vsse64_v_f64m1'
+ acc_vector_t='vfloat64m1_t'
+ output='dtrmm_kernel_8x8_zvl256b.c'
+ param_scalar_t='double'
+ param_vector_t='vfloat64m1_t'
+
+*/
+
+#include "common.h"
+
+
+
+#if defined(LEFT) != defined(TRANSA) /* true when exactly one of LEFT / TRANSA is defined */
+    #define BACKWARDS /* tiles then advance ai/bi past `off` k-steps and use pass_K = K - off; otherwise pass_K = off + tile size */
+#endif
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc, BLASLONG offset)
+
+{
+    BLASLONG gvl = 0;
+    BLASLONG m_top = 0;
+    BLASLONG n_top = 0;
+
+
+    // -- MAIN PASS
+
+    for (BLASLONG j=0; j<N/8; j+=1) {
+        m_top = 0;
+        BLASLONG gvl = __riscv_vsetvl_e64m1(4);
+
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*8;
+                bi += off*8;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 8;
+                #else
+                    pass_K = off + 8;
+                #endif
+            #endif
+            double B0 = B[bi+0];
+            double B1 = B[bi+1];
+            double B2 = B[bi+2];
+            double B3 = B[bi+3];
+            double B4 = B[bi+4];
+            double B5 = B[bi+5];
+            double B6 = B[bi+6];
+            double B7 = B[bi+7];
+            bi += 8;
+
+            vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+            vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+            ai += 8;
+
+            vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
+            vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl);
+            vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
+            vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A1, B1, gvl);
+            vfloat64m1_t result4 = __riscv_vfmul_vf_f64m1( A0, B2, gvl);
+            vfloat64m1_t result5 = __riscv_vfmul_vf_f64m1( A1, B2, gvl);
+            vfloat64m1_t result6 = __riscv_vfmul_vf_f64m1( A0, B3, gvl);
+            vfloat64m1_t result7 = __riscv_vfmul_vf_f64m1( A1, B3, gvl);
+            vfloat64m1_t result8 = __riscv_vfmul_vf_f64m1( A0, B4, gvl);
+            vfloat64m1_t result9 = __riscv_vfmul_vf_f64m1( A1, B4, gvl);
+            vfloat64m1_t result10 = __riscv_vfmul_vf_f64m1( A0, B5, gvl);
+            vfloat64m1_t result11 = __riscv_vfmul_vf_f64m1( A1, B5, gvl);
+            vfloat64m1_t result12 = __riscv_vfmul_vf_f64m1( A0, B6, gvl);
+            vfloat64m1_t result13 = __riscv_vfmul_vf_f64m1( A1, B6, gvl);
+            vfloat64m1_t result14 = __riscv_vfmul_vf_f64m1( A0, B7, gvl);
+            vfloat64m1_t result15 = __riscv_vfmul_vf_f64m1( A1, B7, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                B4 = B[bi+4];
+                B5 = B[bi+5];
+                B6 = B[bi+6];
+                B7 = B[bi+7];
+                bi += 8;
+
+                A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+                A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl);
+                result2 = __riscv_vfmacc_vf_f64m1( result2, B1, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f64m1( result3, B1, A1, gvl);
+                result4 = __riscv_vfmacc_vf_f64m1( result4, B2, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f64m1( result5, B2, A1, gvl);
+                result6 = __riscv_vfmacc_vf_f64m1( result6, B3, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f64m1( result7, B3, A1, gvl);
+                result8 = __riscv_vfmacc_vf_f64m1( result8, B4, A0, gvl);
+                result9 = __riscv_vfmacc_vf_f64m1( result9, B4, A1, gvl);
+                result10 = __riscv_vfmacc_vf_f64m1( result10, B5, A0, gvl);
+                result11 = __riscv_vfmacc_vf_f64m1( result11, B5, A1, gvl);
+                result12 = __riscv_vfmacc_vf_f64m1( result12, B6, A0, gvl);
+                result13 = __riscv_vfmacc_vf_f64m1( result13, B6, A1, gvl);
+                result14 = __riscv_vfmacc_vf_f64m1( result14, B7, A0, gvl);
+                result15 = __riscv_vfmacc_vf_f64m1( result15, B7, A1, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t c0 = __riscv_vfmul_vf_f64m1( result0, alpha, gvl );
+            vfloat64m1_t c1 = __riscv_vfmul_vf_f64m1( result1, alpha, gvl );
+            vfloat64m1_t c2 = __riscv_vfmul_vf_f64m1( result2, alpha, gvl );
+            vfloat64m1_t c3 = __riscv_vfmul_vf_f64m1( result3, alpha, gvl );
+            vfloat64m1_t c4 = __riscv_vfmul_vf_f64m1( result4, alpha, gvl );
+            vfloat64m1_t c5 = __riscv_vfmul_vf_f64m1( result5, alpha, gvl );
+            vfloat64m1_t c6 = __riscv_vfmul_vf_f64m1( result6, alpha, gvl );
+            vfloat64m1_t c7 = __riscv_vfmul_vf_f64m1( result7, alpha, gvl );
+            vfloat64m1_t c8 = __riscv_vfmul_vf_f64m1( result8, alpha, gvl );
+            vfloat64m1_t c9 = __riscv_vfmul_vf_f64m1( result9, alpha, gvl );
+            vfloat64m1_t c10 = __riscv_vfmul_vf_f64m1( result10, alpha, gvl );
+            vfloat64m1_t c11 = __riscv_vfmul_vf_f64m1( result11, alpha, gvl );
+            vfloat64m1_t c12 = __riscv_vfmul_vf_f64m1( result12, alpha, gvl );
+            vfloat64m1_t c13 = __riscv_vfmul_vf_f64m1( result13, alpha, gvl );
+            vfloat64m1_t c14 = __riscv_vfmul_vf_f64m1( result14, alpha, gvl );
+            vfloat64m1_t c15 = __riscv_vfmul_vf_f64m1( result15, alpha, gvl );
+            __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c4, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c6, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c7, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c8, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c9, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c10, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c11, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c12, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c13, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c14, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c15, gvl);
+            m_top += 8;
+        }
+
+
+
+        // -- tails for main pass
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e64m1(4);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*4;
+                bi += off*8;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 4;
+                #else
+                    pass_K = off + 8;
+                #endif
+            #endif
+            double B0 = B[bi+0];
+            double B1 = B[bi+1];
+            double B2 = B[bi+2];
+            double B3 = B[bi+3];
+            double B4 = B[bi+4];
+            double B5 = B[bi+5];
+            double B6 = B[bi+6];
+            double B7 = B[bi+7];
+            bi += 8;
+
+            vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+            ai += 4;
+
+            vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
+            vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
+            vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B2, gvl);
+            vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A0, B3, gvl);
+            vfloat64m1_t result4 = __riscv_vfmul_vf_f64m1( A0, B4, gvl);
+            vfloat64m1_t result5 = __riscv_vfmul_vf_f64m1( A0, B5, gvl);
+            vfloat64m1_t result6 = __riscv_vfmul_vf_f64m1( A0, B6, gvl);
+            vfloat64m1_t result7 = __riscv_vfmul_vf_f64m1( A0, B7, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                B4 = B[bi+4];
+                B5 = B[bi+5];
+                B6 = B[bi+6];
+                B7 = B[bi+7];
+                bi += 8;
+
+                A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m1( result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f64m1( result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f64m1( result3, B3, A0, gvl);
+                result4 = __riscv_vfmacc_vf_f64m1( result4, B4, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f64m1( result5, B5, A0, gvl);
+                result6 = __riscv_vfmacc_vf_f64m1( result6, B6, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f64m1( result7, B7, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t c0 = __riscv_vfmul_vf_f64m1( result0, alpha, gvl );
+            vfloat64m1_t c1 = __riscv_vfmul_vf_f64m1( result1, alpha, gvl );
+            vfloat64m1_t c2 = __riscv_vfmul_vf_f64m1( result2, alpha, gvl );
+            vfloat64m1_t c3 = __riscv_vfmul_vf_f64m1( result3, alpha, gvl );
+            vfloat64m1_t c4 = __riscv_vfmul_vf_f64m1( result4, alpha, gvl );
+            vfloat64m1_t c5 = __riscv_vfmul_vf_f64m1( result5, alpha, gvl );
+            vfloat64m1_t c6 = __riscv_vfmul_vf_f64m1( result6, alpha, gvl );
+            vfloat64m1_t c7 = __riscv_vfmul_vf_f64m1( result7, alpha, gvl );
+            __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c4, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c6, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c7, gvl);
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            double result4 = 0;
+            double result5 = 0;
+            double result6 = 0;
+            double result7 = 0;
+            double result8 = 0;
+            double result9 = 0;
+            double result10 = 0;
+            double result11 = 0;
+            double result12 = 0;
+            double result13 = 0;
+            double result14 = 0;
+            double result15 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*2;
+                bi += off*8;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 2;
+                #else
+                    pass_K = off + 8;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+1]*B[bi+0];
+                result2+=A[ai+0]*B[bi+1];
+                result3+=A[ai+1]*B[bi+1];
+                result4+=A[ai+0]*B[bi+2];
+                result5+=A[ai+1]*B[bi+2];
+                result6+=A[ai+0]*B[bi+3];
+                result7+=A[ai+1]*B[bi+3];
+                result8+=A[ai+0]*B[bi+4];
+                result9+=A[ai+1]*B[bi+4];
+                result10+=A[ai+0]*B[bi+5];
+                result11+=A[ai+1]*B[bi+5];
+                result12+=A[ai+0]*B[bi+6];
+                result13+=A[ai+1]*B[bi+6];
+                result14+=A[ai+0]*B[bi+7];
+                result15+=A[ai+1]*B[bi+7];
+                ai+=2;
+                bi+=8;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] = alpha * result0;
+            C[ci+0*ldc+1] = alpha * result1;
+            C[ci+1*ldc+0] = alpha * result2;
+            C[ci+1*ldc+1] = alpha * result3;
+            C[ci+2*ldc+0] = alpha * result4;
+            C[ci+2*ldc+1] = alpha * result5;
+            C[ci+3*ldc+0] = alpha * result6;
+            C[ci+3*ldc+1] = alpha * result7;
+            C[ci+4*ldc+0] = alpha * result8;
+            C[ci+4*ldc+1] = alpha * result9;
+            C[ci+5*ldc+0] = alpha * result10;
+            C[ci+5*ldc+1] = alpha * result11;
+            C[ci+6*ldc+0] = alpha * result12;
+            C[ci+6*ldc+1] = alpha * result13;
+            C[ci+7*ldc+0] = alpha * result14;
+            C[ci+7*ldc+1] = alpha * result15;
+            m_top+=2;
+        }
+
+
+        if( M & 1 ) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            double result4 = 0;
+            double result5 = 0;
+            double result6 = 0;
+            double result7 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*1;
+                bi += off*8;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 1;
+                #else
+                    pass_K = off + 8;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+0]*B[bi+1];
+                result2+=A[ai+0]*B[bi+2];
+                result3+=A[ai+0]*B[bi+3];
+                result4+=A[ai+0]*B[bi+4];
+                result5+=A[ai+0]*B[bi+5];
+                result6+=A[ai+0]*B[bi+6];
+                result7+=A[ai+0]*B[bi+7];
+                ai+=1;
+                bi+=8;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] = alpha * result0;
+            C[ci+1*ldc+0] = alpha * result1;
+            C[ci+2*ldc+0] = alpha * result2;
+            C[ci+3*ldc+0] = alpha * result3;
+            C[ci+4*ldc+0] = alpha * result4;
+            C[ci+5*ldc+0] = alpha * result5;
+            C[ci+6*ldc+0] = alpha * result6;
+            C[ci+7*ldc+0] = alpha * result7;
+            m_top+=1;
+        }
+
+        n_top += 8;
+    }
+
+
+
+    // -- tails for N=4
+
+    if( N & 4 ) {
+        gvl = __riscv_vsetvl_e64m1(4);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*8;
+                bi += off*4;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 8;
+                #else
+                    pass_K = off + 4;
+                #endif
+            #endif
+            double B0 = B[bi+0];
+            double B1 = B[bi+1];
+            double B2 = B[bi+2];
+            double B3 = B[bi+3];
+            bi += 4;
+
+            vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+            vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+            ai += 8;
+
+            vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
+            vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl);
+            vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
+            vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A1, B1, gvl);
+            vfloat64m1_t result4 = __riscv_vfmul_vf_f64m1( A0, B2, gvl);
+            vfloat64m1_t result5 = __riscv_vfmul_vf_f64m1( A1, B2, gvl);
+            vfloat64m1_t result6 = __riscv_vfmul_vf_f64m1( A0, B3, gvl);
+            vfloat64m1_t result7 = __riscv_vfmul_vf_f64m1( A1, B3, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                bi += 4;
+
+                A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+                A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl);
+                result2 = __riscv_vfmacc_vf_f64m1( result2, B1, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f64m1( result3, B1, A1, gvl);
+                result4 = __riscv_vfmacc_vf_f64m1( result4, B2, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f64m1( result5, B2, A1, gvl);
+                result6 = __riscv_vfmacc_vf_f64m1( result6, B3, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f64m1( result7, B3, A1, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t c0 = __riscv_vfmul_vf_f64m1( result0, alpha, gvl );
+            vfloat64m1_t c1 = __riscv_vfmul_vf_f64m1( result1, alpha, gvl );
+            vfloat64m1_t c2 = __riscv_vfmul_vf_f64m1( result2, alpha, gvl );
+            vfloat64m1_t c3 = __riscv_vfmul_vf_f64m1( result3, alpha, gvl );
+            vfloat64m1_t c4 = __riscv_vfmul_vf_f64m1( result4, alpha, gvl );
+            vfloat64m1_t c5 = __riscv_vfmul_vf_f64m1( result5, alpha, gvl );
+            vfloat64m1_t c6 = __riscv_vfmul_vf_f64m1( result6, alpha, gvl );
+            vfloat64m1_t c7 = __riscv_vfmul_vf_f64m1( result7, alpha, gvl );
+            __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c3, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c4, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c5, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c6, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c7, gvl);
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e64m1(4);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*4;
+                bi += off*4;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 4;
+                #else
+                    pass_K = off + 4;
+                #endif
+            #endif
+            double B0 = B[bi+0];
+            double B1 = B[bi+1];
+            double B2 = B[bi+2];
+            double B3 = B[bi+3];
+            bi += 4;
+
+            vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+            ai += 4;
+
+            vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
+            vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
+            vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B2, gvl);
+            vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A0, B3, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                bi += 4;
+
+                A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m1( result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f64m1( result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f64m1( result3, B3, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t c0 = __riscv_vfmul_vf_f64m1( result0, alpha, gvl );
+            vfloat64m1_t c1 = __riscv_vfmul_vf_f64m1( result1, alpha, gvl );
+            vfloat64m1_t c2 = __riscv_vfmul_vf_f64m1( result2, alpha, gvl );
+            vfloat64m1_t c3 = __riscv_vfmul_vf_f64m1( result3, alpha, gvl );
+            __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c3, gvl);
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            double result4 = 0;
+            double result5 = 0;
+            double result6 = 0;
+            double result7 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*2;
+                bi += off*4;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 2;
+                #else
+                    pass_K = off + 4;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+1]*B[bi+0];
+                result2+=A[ai+0]*B[bi+1];
+                result3+=A[ai+1]*B[bi+1];
+                result4+=A[ai+0]*B[bi+2];
+                result5+=A[ai+1]*B[bi+2];
+                result6+=A[ai+0]*B[bi+3];
+                result7+=A[ai+1]*B[bi+3];
+                ai+=2;
+                bi+=4;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] = alpha * result0;
+            C[ci+0*ldc+1] = alpha * result1;
+            C[ci+1*ldc+0] = alpha * result2;
+            C[ci+1*ldc+1] = alpha * result3;
+            C[ci+2*ldc+0] = alpha * result4;
+            C[ci+2*ldc+1] = alpha * result5;
+            C[ci+3*ldc+0] = alpha * result6;
+            C[ci+3*ldc+1] = alpha * result7;
+            m_top+=2;
+        }
+
+
+        if( M & 1 ) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*1;
+                bi += off*4;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 1;
+                #else
+                    pass_K = off + 4;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+0]*B[bi+1];
+                result2+=A[ai+0]*B[bi+2];
+                result3+=A[ai+0]*B[bi+3];
+                ai+=1;
+                bi+=4;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] = alpha * result0;
+            C[ci+1*ldc+0] = alpha * result1;
+            C[ci+2*ldc+0] = alpha * result2;
+            C[ci+3*ldc+0] = alpha * result3;
+            m_top+=1;
+        }
+
+        n_top += 4;
+    }
+
+
+
+    // -- tails for N=2
+
+    if( N & 2 ) {
+        gvl = __riscv_vsetvl_e64m1(4);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*8;
+                bi += off*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 8;
+                #else
+                    pass_K = off + 2;
+                #endif
+            #endif
+            double B0 = B[bi+0];
+            double B1 = B[bi+1];
+            bi += 2;
+
+            vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+            vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+            ai += 8;
+
+            vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
+            vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl);
+            vfloat64m1_t result2 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
+            vfloat64m1_t result3 = __riscv_vfmul_vf_f64m1( A1, B1, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                bi += 2;
+
+                A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+                A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl);
+                result2 = __riscv_vfmacc_vf_f64m1( result2, B1, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f64m1( result3, B1, A1, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t c0 = __riscv_vfmul_vf_f64m1( result0, alpha, gvl );
+            vfloat64m1_t c1 = __riscv_vfmul_vf_f64m1( result1, alpha, gvl );
+            vfloat64m1_t c2 = __riscv_vfmul_vf_f64m1( result2, alpha, gvl );
+            vfloat64m1_t c3 = __riscv_vfmul_vf_f64m1( result3, alpha, gvl );
+            __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
+            __riscv_vse64_v_f64m1( &C[ci], c2, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c3, gvl);
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e64m1(4);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*4;
+                bi += off*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 4;
+                #else
+                    pass_K = off + 2;
+                #endif
+            #endif
+            double B0 = B[bi+0];
+            double B1 = B[bi+1];
+            bi += 2;
+
+            vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+            ai += 4;
+
+            vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
+            vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A0, B1, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                bi += 2;
+
+                A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m1( result1, B1, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t c0 = __riscv_vfmul_vf_f64m1( result0, alpha, gvl );
+            vfloat64m1_t c1 = __riscv_vfmul_vf_f64m1( result1, alpha, gvl );
+            __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse64_v_f64m1( &C[ci], c1, gvl);
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*2;
+                bi += off*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 2;
+                #else
+                    pass_K = off + 2;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+1]*B[bi+0];
+                result2+=A[ai+0]*B[bi+1];
+                result3+=A[ai+1]*B[bi+1];
+                ai+=2;
+                bi+=2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] = alpha * result0;
+            C[ci+0*ldc+1] = alpha * result1;
+            C[ci+1*ldc+0] = alpha * result2;
+            C[ci+1*ldc+1] = alpha * result3;
+            m_top+=2;
+        }
+
+
+        if( M & 1 ) {
+            double result0 = 0;
+            double result1 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*1;
+                bi += off*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 1;
+                #else
+                    pass_K = off + 2;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+0]*B[bi+1];
+                ai+=1;
+                bi+=2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] = alpha * result0;
+            C[ci+1*ldc+0] = alpha * result1;
+            m_top+=1;
+        }
+
+        n_top += 2;
+    }
+
+
+
+    // -- tails for N=1
+
+    if( N & 1 ) {
+        gvl = __riscv_vsetvl_e64m1(4);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*8;
+                bi += off*1;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 8;
+                #else
+                    pass_K = off + 1;
+                #endif
+            #endif
+            double B0 = B[bi+0];
+            bi += 1;
+
+            vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+            vfloat64m1_t A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+            ai += 8;
+
+            vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
+            vfloat64m1_t result1 = __riscv_vfmul_vf_f64m1( A1, B0, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                bi += 1;
+
+                A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+                A1 = __riscv_vle64_v_f64m1( &A[ai+1*gvl], gvl );
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f64m1( result1, B0, A1, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t c0 = __riscv_vfmul_vf_f64m1( result0, alpha, gvl );
+            vfloat64m1_t c1 = __riscv_vfmul_vf_f64m1( result1, alpha, gvl );
+            __riscv_vse64_v_f64m1( &C[ci], c0, gvl); ci += gvl;
+            __riscv_vse64_v_f64m1( &C[ci], c1, gvl);
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e64m1(4);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*4;
+                bi += off*1;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 4;
+                #else
+                    pass_K = off + 1;
+                #endif
+            #endif
+            double B0 = B[bi+0];
+            bi += 1;
+
+            vfloat64m1_t A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+            ai += 4;
+
+            vfloat64m1_t result0 = __riscv_vfmul_vf_f64m1( A0, B0, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                bi += 1;
+
+                A0 = __riscv_vle64_v_f64m1( &A[ai+0*gvl], gvl );
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f64m1( result0, B0, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t c0 = __riscv_vfmul_vf_f64m1( result0, alpha, gvl );
+            __riscv_vse64_v_f64m1( &C[ci], c0, gvl);
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            double result0 = 0;
+            double result1 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*2;
+                bi += off*1;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 2;
+                #else
+                    pass_K = off + 1;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+1]*B[bi+0];
+                ai+=2;
+                bi+=1;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] = alpha * result0;
+            C[ci+0*ldc+1] = alpha * result1;
+            m_top+=2;
+        }
+
+
+        if( M & 1 ) {
+            double result0 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*1;
+                bi += off*1;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 1;
+                #else
+                    pass_K = off + 1;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                ai+=1;
+                bi+=1;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] = alpha * result0;
+            m_top+=1;
+        }
+
+        n_top += 1;
+    }
+
+    return 0;
+}
+
diff --git a/kernel/riscv64/gemm_beta_rvv.c b/kernel/riscv64/gemm_beta_rvv.c
new file mode 100644
index 000000000..f3cf6491d
--- /dev/null
+++ b/kernel/riscv64/gemm_beta_rvv.c
@@ -0,0 +1,89 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)            /* DOUBLE undefined: 32-bit element intrinsics, LMUL=8 */
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)     /* vector length granted for n elements */
+#define FLOAT_V_T               vfloat32m8_t                /* vector register-group type */
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8       /* unit-stride load */
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8      /* broadcast a scalar into a vector */
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m8      /* vector times scalar */
+#define VSEV_FLOAT              __riscv_vse32_v_f32m8       /* unit-stride store */
+#else                           /* DOUBLE defined: the same operations on 64-bit elements */
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define FLOAT_V_T               vfloat64m8_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m8
+#define VSEV_FLOAT              __riscv_vse64_v_f64m8
+#endif                          /* !defined(DOUBLE) */
+
+// Optimizes the implementation in ../generic/gemm_beta.c
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
+          IFLOAT *dummy2, BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5,
+          FLOAT *c, BLASLONG ldc)
+{
+    // Scales n runs of m contiguous elements (consecutive runs ldc elements
+    // apart) by beta, in place.  dummy1..dummy5 are never read.  Returns 0.
+    BLASLONG col, left;
+    FLOAT *dst;
+    size_t vl;
+    FLOAT_V_T vec;
+
+    if (beta != ZERO) {
+        // General case: load each strip, multiply it by beta, store it back.
+        for (col = n; col > 0; col--) {
+            dst = c;
+            left = m;
+            while (left > 0) {
+                vl = VSETVL(left);
+                vec = VFMULVF_FLOAT(VLEV_FLOAT(dst, vl), beta, vl);
+                VSEV_FLOAT(dst, vec, vl);
+                left -= vl;
+                dst += vl;
+            }
+            c += ldc;
+        }
+    } else {
+        // beta == ZERO: store zeros without reading c; 0.0 is broadcast once.
+        vl = VSETVL(m);
+        vec = VFMVVF_FLOAT(0.0, vl);
+
+        for (col = n; col > 0; col--) {
+            dst = c;
+            for (left = m; left > 0; left -= vl, dst += vl) {
+                vl = VSETVL(left);
+                VSEV_FLOAT(dst, vec, vl);
+            }
+            c += ldc;
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/gemm_ncopy_8_rvv.c b/kernel/riscv64/gemm_ncopy_8_rvv.c
new file mode 100644
index 000000000..c652ab0c0
--- /dev/null
+++ b/kernel/riscv64/gemm_ncopy_8_rvv.c
@@ -0,0 +1,197 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)            /* DOUBLE undefined: 32-bit element intrinsics, LMUL=1 */
+#define VSETVL(n)               __riscv_vsetvl_e32m1(n)      /* vector length granted for n elements */
+#define FLOAT_V_T               vfloat32m1_t                 /* single vector register */
+#define FLOAT_VX2_T             vfloat32m1x2_t               /* tuples of 2, 4 and 8 registers */
+#define FLOAT_VX4_T             vfloat32m1x4_t
+#define FLOAT_VX8_T             vfloat32m1x8_t
+#define VSET_VX2                __riscv_vset_v_f32m1_f32m1x2 /* set one field of a tuple */
+#define VSET_VX4                __riscv_vset_v_f32m1_f32m1x4
+#define VSET_VX8                __riscv_vset_v_f32m1_f32m1x8
+#define VLEV_FLOAT              __riscv_vle32_v_f32m1        /* unit-stride load */
+#define VSEV_FLOAT              __riscv_vse32_v_f32m1        /* unit-stride store */
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m1x2  /* segment stores: interleave the tuple fields */
+#define VSSEG4_FLOAT            __riscv_vsseg4e32_v_f32m1x4
+#define VSSEG8_FLOAT            __riscv_vsseg8e32_v_f32m1x8
+#else                           /* DOUBLE defined: the same operations on 64-bit elements */
+#define VSETVL(n)               __riscv_vsetvl_e64m1(n)
+#define FLOAT_V_T               vfloat64m1_t
+#define FLOAT_VX2_T             vfloat64m1x2_t
+#define FLOAT_VX4_T             vfloat64m1x4_t
+#define FLOAT_VX8_T             vfloat64m1x8_t
+#define VSET_VX2                __riscv_vset_v_f64m1_f64m1x2
+#define VSET_VX4                __riscv_vset_v_f64m1_f64m1x4
+#define VSET_VX8                __riscv_vset_v_f64m1_f64m1x8
+#define VLEV_FLOAT              __riscv_vle64_v_f64m1
+#define VSEV_FLOAT              __riscv_vse64_v_f64m1
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m1x2
+#define VSSEG4_FLOAT            __riscv_vsseg4e64_v_f64m1x4
+#define VSSEG8_FLOAT            __riscv_vsseg8e64_v_f64m1x8
+#endif                          /* !defined(DOUBLE) */
+
+// Optimizes the implementation in ../generic/gemm_ncopy_8.c
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b)
+{
+    // Packs a into b in groups of 8, then 4, 2 and 1 columns; columns of a
+    // are lda elements apart.  Segment stores interleave the columns of a
+    // group element by element, and b is written strictly sequentially.
+    BLASLONG rows, blk;
+    FLOAT *src = a;
+    FLOAT *dst = b;
+    FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7;
+
+    FLOAT_V_T r0, r1, r2, r3, r4, r5, r6, r7;
+    // The tuples start out uninitialized; every field is set before a store.
+    FLOAT_VX2_T pair;
+    FLOAT_VX4_T quad;
+    FLOAT_VX8_T oct;
+    size_t vl;
+
+    // Full groups of eight columns.
+    for (blk = n >> 3; blk > 0; blk--) {
+        c0 = src;
+        c1 = c0 + lda;
+        c2 = c1 + lda;
+        c3 = c2 + lda;
+        c4 = c3 + lda;
+        c5 = c4 + lda;
+        c6 = c5 + lda;
+        c7 = c6 + lda;
+        src += 8 * lda;
+
+        rows = m;
+        while (rows > 0) {
+            vl = VSETVL(rows);
+
+            r0 = VLEV_FLOAT(c0, vl);
+            r1 = VLEV_FLOAT(c1, vl);
+            r2 = VLEV_FLOAT(c2, vl);
+            r3 = VLEV_FLOAT(c3, vl);
+            r4 = VLEV_FLOAT(c4, vl);
+            r5 = VLEV_FLOAT(c5, vl);
+            r6 = VLEV_FLOAT(c6, vl);
+            r7 = VLEV_FLOAT(c7, vl);
+            oct = VSET_VX8(oct, 0, r0);
+            oct = VSET_VX8(oct, 1, r1);
+            oct = VSET_VX8(oct, 2, r2);
+            oct = VSET_VX8(oct, 3, r3);
+            oct = VSET_VX8(oct, 4, r4);
+            oct = VSET_VX8(oct, 5, r5);
+            oct = VSET_VX8(oct, 6, r6);
+            oct = VSET_VX8(oct, 7, r7);
+            VSSEG8_FLOAT(dst, oct, vl);
+
+            c0 += vl;
+            c1 += vl;
+            c2 += vl;
+            c3 += vl;
+            c4 += vl;
+            c5 += vl;
+            c6 += vl;
+            c7 += vl;
+            dst += vl * 8;
+            rows -= vl;
+        }
+    }
+
+    // One group of four columns, if n has that bit set.
+    if (n & 4) {
+        c0 = src;
+        c1 = c0 + lda;
+        c2 = c1 + lda;
+        c3 = c2 + lda;
+        src += 4 * lda;
+
+        rows = m;
+        while (rows > 0) {
+            vl = VSETVL(rows);
+
+            r0 = VLEV_FLOAT(c0, vl);
+            r1 = VLEV_FLOAT(c1, vl);
+            r2 = VLEV_FLOAT(c2, vl);
+            r3 = VLEV_FLOAT(c3, vl);
+            quad = VSET_VX4(quad, 0, r0);
+            quad = VSET_VX4(quad, 1, r1);
+            quad = VSET_VX4(quad, 2, r2);
+            quad = VSET_VX4(quad, 3, r3);
+            VSSEG4_FLOAT(dst, quad, vl);
+
+            c0 += vl;
+            c1 += vl;
+            c2 += vl;
+            c3 += vl;
+            dst += vl * 4;
+            rows -= vl;
+        }
+    }
+
+    // One group of two columns.
+    if (n & 2) {
+        c0 = src;
+        c1 = c0 + lda;
+        src += 2 * lda;
+
+        rows = m;
+        while (rows > 0) {
+            vl = VSETVL(rows);
+
+            r0 = VLEV_FLOAT(c0, vl);
+            r1 = VLEV_FLOAT(c1, vl);
+            pair = VSET_VX2(pair, 0, r0);
+            pair = VSET_VX2(pair, 1, r1);
+            VSSEG2_FLOAT(dst, pair, vl);
+
+            c0 += vl;
+            c1 += vl;
+            dst += vl * 2;
+            rows -= vl;
+        }
+    }
+
+    // Last single column: a plain contiguous copy.
+    if (n & 1) {
+        c0 = src;
+
+        rows = m;
+        while (rows > 0) {
+            vl = VSETVL(rows);
+            r0 = VLEV_FLOAT(c0, vl);
+            VSEV_FLOAT(dst, r0, vl);
+
+            c0 += vl;
+            dst += vl;
+            rows -= vl;
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/gemm_ncopy_rvv_v1.c b/kernel/riscv64/gemm_ncopy_rvv_v1.c
new file mode 100644
index 000000000..2d6db15e5
--- /dev/null
+++ b/kernel/riscv64/gemm_ncopy_rvv_v1.c
@@ -0,0 +1,76 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)            /* DOUBLE undefined: 32-bit element intrinsics, LMUL=2 */
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)     /* vector length granted for n elements */
+#define FLOAT_V_T               vfloat32m2_t                /* vector register-group type */
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2       /* unit-stride load */
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m2      /* strided load (stride given in bytes) */
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2       /* unit-stride store */
+#else                           /* DOUBLE defined: the same operations on 64-bit elements */
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#endif                          /* !defined(DOUBLE) */
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b)
+{
+    // Packs a into b in panels of vl columns, where vl is whatever VSETVL
+    // grants for the columns still left.  Columns of a are lda elements apart;
+    // each panel is written row by row, vl contiguous values per row.
+    BLASLONG row;
+    BLASLONG cols_left = n;
+    FLOAT *panel = a;
+    FLOAT *src;
+    FLOAT *dst = b;
+    size_t vl;
+
+    while (cols_left > 0) {
+        vl = VSETVL(cols_left);
+
+        src = panel;
+        panel += vl * lda;
+
+        // Gather one row of the panel with a strided load (stride = one
+        // column, in bytes) and store it contiguously.
+        for (row = 0; row < m; row++) {
+            FLOAT_V_T v = VLSEV_FLOAT(src, lda * sizeof(FLOAT), vl);
+            VSEV_FLOAT(dst, v, vl);
+
+            src++;
+            dst += vl;
+        }
+
+        cols_left -= vl;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/gemm_tcopy_8_rvv.c b/kernel/riscv64/gemm_tcopy_8_rvv.c
new file mode 100644
index 000000000..4742ae6a7
--- /dev/null
+++ b/kernel/riscv64/gemm_tcopy_8_rvv.c
@@ -0,0 +1,273 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)            /* DOUBLE undefined: 32-bit element intrinsics, LMUL=1 */
+#define VSETVL(n)               __riscv_vsetvl_e32m1(n)      /* vector length granted for n elements */
+#define FLOAT_V_T               vfloat32m1_t                 /* single vector register */
+#define FLOAT_VX2_T             vfloat32m1x2_t               /* tuples of 2, 4 and 8 registers */
+#define FLOAT_VX4_T             vfloat32m1x4_t
+#define FLOAT_VX8_T             vfloat32m1x8_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m1        /* unit-stride load */
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m1       /* strided load (stride given in bytes) */
+#define VSEV_FLOAT              __riscv_vse32_v_f32m1        /* unit-stride store */
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m1x2 /* strided segment load into a tuple */
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m1x2  /* unit-stride segment store of a tuple */
+#define VLSSEG4_FLOAT           __riscv_vlsseg4e32_v_f32m1x4
+#define VSSEG4_FLOAT            __riscv_vsseg4e32_v_f32m1x4
+#define VLSSEG8_FLOAT           __riscv_vlsseg8e32_v_f32m1x8
+#define VSSEG8_FLOAT            __riscv_vsseg8e32_v_f32m1x8
+#else                           /* DOUBLE defined: the same operations on 64-bit elements */
+#define VSETVL(n)               __riscv_vsetvl_e64m1(n)
+#define FLOAT_V_T               vfloat64m1_t
+#define FLOAT_VX2_T             vfloat64m1x2_t
+#define FLOAT_VX4_T             vfloat64m1x4_t
+#define FLOAT_VX8_T             vfloat64m1x8_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m1
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m1
+#define VSEV_FLOAT              __riscv_vse64_v_f64m1
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m1x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m1x2
+#define VLSSEG4_FLOAT           __riscv_vlsseg4e64_v_f64m1x4
+#define VSSEG4_FLOAT            __riscv_vsseg4e64_v_f64m1x4
+#define VLSSEG8_FLOAT           __riscv_vlsseg8e64_v_f64m1x8
+#define VSSEG8_FLOAT            __riscv_vsseg8e64_v_f64m1x8
+#endif                          /* !defined(DOUBLE) */
+
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
+{
+    BLASLONG i, j;
+    // Packs a (m runs of n elements, runs lda elements apart) into b in blocks of 8, 4, 2 and 1 runs.
+    IFLOAT *aoffset;
+    IFLOAT *aoffset1;
+    // boffset/boffset1 fill the 8-wide panels; boffset2/3/4 fill the 4-, 2- and 1-wide remainders.
+    IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4;
+
+    FLOAT_V_T v0;
+    FLOAT_VX2_T vx2;
+    FLOAT_VX4_T vx4;
+    FLOAT_VX8_T vx8;
+
+    // fprintf(stderr, "gemm_tcopy_8 m=%ld n=%ld lda=%ld\n", m, n, lda);
+    // Each remainder region of b starts after m * (columns already covered) elements.
+    aoffset   = a;
+    boffset   = b;
+    boffset2  = b + m  * (n & ~7);
+    boffset3  = b + m  * (n & ~3);
+    boffset4  = b + m  * (n & ~1);
+    // Blocks of eight runs.
+    for(j = (m >> 3); j > 0; j--) {
+        // NOTE(review): vl is hard-coded to 8 at LMUL=1; for DOUBLE that presumably needs VLEN >= 512 -- confirm.
+        aoffset1  = aoffset;
+        aoffset += 8 * lda;
+
+        boffset1  = boffset;
+        boffset  += 64;
+        // 8x8 tiles: strided segment load of 8 runs, contiguous segment store; next panel is m * 8 further on.
+        for(i = (n >> 3); i > 0; i--) {
+            size_t vl = 8;
+
+            vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
+            VSSEG8_FLOAT(boffset1, vx8, vl);
+
+            aoffset1 += 8;
+            boffset1 += m * 8;
+        }
+
+        if (n & 4) {  // 8 runs x 4 columns -> 32 elements in the 4-wide region
+            size_t vl = 8;
+
+            vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
+            VSSEG4_FLOAT(boffset2, vx4, vl);
+
+            aoffset1 += 4;
+            boffset2 += 32;
+        }
+
+        if (n & 2) {  // 8 runs x 2 columns -> 16 elements in the 2-wide region
+            size_t vl = 8;
+
+            vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
+            VSSEG2_FLOAT(boffset3, vx2, vl);
+
+            aoffset1 += 2;
+            boffset3 += 16;
+        }
+
+        if (n & 1) {  // 8 runs x 1 column: strided load, contiguous store
+            size_t vl = 8;
+
+            v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
+            VSEV_FLOAT(boffset4, v0, vl);
+
+            aoffset1 += 1;
+            boffset4 += 8;
+        }
+
+    }
+    // One block of four runs: same steps as above with vl = 4 and half-sized advances.
+    if (m & 4) {
+
+        aoffset1  = aoffset;
+        aoffset += 4 * lda;
+
+        boffset1  = boffset;
+        boffset  += 32;
+
+        for(i = (n >> 3); i > 0; i--) {
+            size_t vl = 4;
+
+            vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
+            VSSEG8_FLOAT(boffset1, vx8, vl);
+
+            aoffset1 += 8;
+            boffset1 += m * 8;
+        }
+
+        if (n & 4) {
+            size_t vl = 4;
+
+            vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
+            VSSEG4_FLOAT(boffset2, vx4, vl);
+
+            aoffset1 += 4;
+            boffset2 += 16;
+        }
+
+        if (n & 2) {
+            size_t vl = 4;
+
+            vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
+            VSSEG2_FLOAT(boffset3, vx2, vl);
+
+            aoffset1 += 2;
+            boffset3 += 8;
+	}
+
+        if (n & 1) {
+            size_t vl = 4;
+
+            v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
+            VSEV_FLOAT(boffset4, v0, vl);
+
+            aoffset1 += 1;
+            boffset4 += 4;
+        }
+    }
+    // One block of two runs: vl = 2.
+    if (m & 2) {
+        aoffset1  = aoffset;
+        aoffset += 2 * lda;
+
+        boffset1  = boffset;
+        boffset  += 16;
+
+        for(i = (n >> 3); i > 0; i--) {
+            size_t vl = 2;
+
+            vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
+            VSSEG8_FLOAT(boffset1, vx8, vl);
+
+            aoffset1 += 8;
+            boffset1 += m * 8;
+        }
+
+        if (n & 4) {
+            size_t vl = 2;
+
+            vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
+            VSSEG4_FLOAT(boffset2, vx4, vl);
+
+            aoffset1 += 4;
+            boffset2 += 8;
+        }
+
+        if (n & 2) {
+            size_t vl = 2;
+
+            vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
+            VSSEG2_FLOAT(boffset3, vx2, vl);
+
+            aoffset1 += 2;
+            boffset3 += 4;
+	}
+  
+        if (n & 1) {
+           size_t vl = 2;
+
+            v0 = VLSEV_FLOAT(aoffset1, lda * sizeof(FLOAT), vl);
+            VSEV_FLOAT(boffset4, v0, vl);
+
+            aoffset1 += 1;
+            boffset4 += 2;
+        }
+    }
+    // Last single run: plain unit-stride copies; the remainder cursors are not advanced afterwards.
+    if (m & 1) {
+        aoffset1  = aoffset;
+        boffset1  = boffset;
+
+        for(i = (n >> 3); i > 0; i--) {
+            size_t vl = 8;
+
+            v0 = VLEV_FLOAT(aoffset1, vl);
+            VSEV_FLOAT(boffset1, v0, vl);
+
+            aoffset1 += 8;
+            boffset1 += 8 * m;
+        }
+
+        if (n & 4) {
+            size_t vl = 4;
+
+            v0 = VLEV_FLOAT(aoffset1, vl);
+            VSEV_FLOAT(boffset2, v0, vl);
+
+            aoffset1 += 4;
+            //boffset2 += 4;
+        }
+
+        if (n & 2) {
+            size_t vl = 2;
+
+            v0 = VLEV_FLOAT(aoffset1, vl);
+            VSEV_FLOAT(boffset3, v0, vl);
+
+            aoffset1 += 2;
+           // boffset3 += 2;
+        }
+
+        if (n & 1) {
+           *(boffset4) = *(aoffset1);
+           // aoffset1 ++;
+           // boffset4 ++;
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/gemm_tcopy_rvv_v1.c b/kernel/riscv64/gemm_tcopy_rvv_v1.c
new file mode 100644
index 000000000..c5fb6479f
--- /dev/null
+++ b/kernel/riscv64/gemm_tcopy_rvv_v1.c
@@ -0,0 +1,74 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)            /* DOUBLE undefined: 32-bit element intrinsics, LMUL=2 */
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)     /* vector length granted for n elements */
+#define FLOAT_V_T               vfloat32m2_t                /* vector register-group type */
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2       /* unit-stride load */
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2       /* unit-stride store */
+#else                           /* DOUBLE defined: the same operations on 64-bit elements */
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#endif                          /* !defined(DOUBLE) */
+
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
+{
+    // Packs a into b in strips of vl contiguous elements, where vl is
+    // whatever VSETVL grants for the n elements still left in each run.
+    // Runs of a are lda elements apart; b is written sequentially, strip
+    // by strip, with the m pieces of one strip stored back to back.
+    BLASLONG run;
+    BLASLONG remaining = n;
+    IFLOAT *strip = a;
+    IFLOAT *src;
+    IFLOAT *dst = b;
+    size_t vl;
+
+    while (remaining > 0) {
+        vl = VSETVL(remaining);
+
+        src = strip;
+        strip += vl;
+
+        // Copy vl contiguous elements from each of the m runs.
+        for (run = 0; run < m; run++) {
+            FLOAT_V_T v = VLEV_FLOAT(src, vl);
+            VSEV_FLOAT(dst, v, vl);
+
+            src += lda;
+            dst += vl;
+        }
+
+        remaining -= vl;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/gemmkernel_rvv_v1x8.c b/kernel/riscv64/gemmkernel_rvv_v1x8.c
new file mode 100644
index 000000000..471b3158f
--- /dev/null
+++ b/kernel/riscv64/gemmkernel_rvv_v1x8.c
@@ -0,0 +1,601 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)            /* single precision: 32-bit elements, LMUL=2 register groups */
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)   /* vector length to use for n remaining elements */
+#define FLOAT_V_T               vfloat32m2_t              /* vector value type used by the kernel */
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2     /* unit-stride vector load */
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2     /* unit-stride vector store */
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m2    /* broadcast a scalar into a vector */
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m2   /* acc += scalar * vector */
+#else                           /* double precision: 64-bit elements, LMUL=2 register groups */
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)   /* vector length to use for n remaining elements */
+#define FLOAT_V_T               vfloat64m2_t              /* vector value type used by the kernel */
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2     /* unit-stride vector load */
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2     /* unit-stride vector store */
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m2    /* broadcast a scalar into a vector */
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m2   /* acc += scalar * vector */
+#endif                          /* !defined(DOUBLE) */
+
+int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc
+#ifdef TRMMKERNEL
+		,BLASLONG offset
+#endif
+		)
+{
+    BLASLONG i,j,k;
+    FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7;
+    IFLOAT *ptrba,*ptrbb;
+    // Accumulates C += alpha * (ba x bb): ba is read as bk vectors of vl elements per strip of bm, bb as bk groups of 8/4/2/1 scalars; always returns 0.
+    //fprintf(stderr, "%s, bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", __FUNCTION__, bm, bn, bk, alpha, ldc); // Debug
+    // NOTE(review): under TRMMKERNEL the extra 'offset' argument is accepted but never read in this body - confirm that is intended.
+    FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7;
+    FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7;
+    size_t vl;
+
+    // N:8 - handle the bn dimension in groups of 8 output vectors (C0..C7, ldc elements apart)
+    for (j = bn/8; j > 0; j--) {
+        C0 = C;
+        C1 = C0 + ldc;
+        C2 = C1 + ldc;
+        C3 = C2 + ldc;
+        C4 = C3 + ldc;
+        C5 = C4 + ldc;
+        C6 = C5 + ldc;
+        C7 = C6 + ldc;
+        ptrba = ba;
+        // Walk bm in strips of vl elements; ptrba is not reset between strips, so it keeps advancing through ba.
+        for (i = bm; i > 0; i -= vl) {
+            vl = VSETVL(i);
+
+            ptrbb = bb;
+            // One accumulator per output vector, cleared at the start of every strip.
+            vres0 = VFMVVF_FLOAT(0.0, vl);
+            vres1 = VFMVVF_FLOAT(0.0, vl);
+            vres2 = VFMVVF_FLOAT(0.0, vl);
+            vres3 = VFMVVF_FLOAT(0.0, vl);
+            vres4 = VFMVVF_FLOAT(0.0, vl);
+            vres5 = VFMVVF_FLOAT(0.0, vl);
+            vres6 = VFMVVF_FLOAT(0.0, vl);
+            vres7 = VFMVVF_FLOAT(0.0, vl);
+#if 0  // non-unrolled K loop, compiled out
+            for (k = bk; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl); 
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl);
+
+                ptrba += vl;
+                ptrbb += 8;
+            }
+#else
+            // Unroll K by 8; the load of the next ba vector is issued before the FMAs that consume the previous one
+            for (k = bk/8; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+                va1 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+ 
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl);
+                ptrbb += 8;
+  
+                va2 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va1, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va1, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va1, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va1, vl);
+                ptrbb += 8;
+
+                va3 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va2, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va2, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va2, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va2, vl);
+                ptrbb += 8;
+
+                va4 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va3, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va3, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va3, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va3, vl);
+                ptrbb += 8;
+
+                va5 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va4, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va4, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va4, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va4, vl);
+                ptrbb += 8;
+
+                va6 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va5, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va5, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va5, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va5, vl);
+                ptrbb += 8;
+
+                va7 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va6, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va6, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va6, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va6, vl);
+                ptrbb += 8;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va7, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va7, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va7, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va7, vl);
+                ptrbb += 8;
+            }
+
+            // K remainder: the (bk & 7) steps not covered by the unrolled loop, one ba vector per step
+            for (k = bk&7; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl);
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl);
+
+                ptrbb += 8;
+                ptrba += vl;
+            }
+#endif
+            va0 = VLEV_FLOAT(C0, vl);  // write-back: Cn += alpha * vresn (va0..va7 are reused as temporaries holding C)
+            va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl);
+            VSEV_FLOAT(C0, va0, vl);
+
+            va1 = VLEV_FLOAT(C1, vl);
+            va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl);
+            VSEV_FLOAT(C1, va1, vl);
+
+            va2 = VLEV_FLOAT(C2, vl);
+            va2 = VFMACCVF_FLOAT(va2, alpha, vres2, vl);
+            VSEV_FLOAT(C2, va2, vl);
+
+            va3 = VLEV_FLOAT(C3, vl);
+            va3 = VFMACCVF_FLOAT(va3, alpha, vres3, vl);
+            VSEV_FLOAT(C3, va3, vl);
+
+            va4 = VLEV_FLOAT(C4, vl);
+            va4 = VFMACCVF_FLOAT(va4, alpha, vres4, vl);
+            VSEV_FLOAT(C4, va4, vl);
+
+            va5 = VLEV_FLOAT(C5, vl);
+            va5 = VFMACCVF_FLOAT(va5, alpha, vres5, vl);
+            VSEV_FLOAT(C5, va5, vl);
+
+            va6 = VLEV_FLOAT(C6, vl);
+            va6 = VFMACCVF_FLOAT(va6, alpha, vres6, vl);
+            VSEV_FLOAT(C6, va6, vl);
+
+            va7 = VLEV_FLOAT(C7, vl);
+            va7 = VFMACCVF_FLOAT(va7, alpha, vres7, vl);
+            VSEV_FLOAT(C7, va7, vl);
+            // Step every output pointer to the next strip of bm.
+            C0 += vl;
+            C1 += vl;
+            C2 += vl;
+            C3 += vl;
+            C4 += vl;
+            C5 += vl;
+            C6 += vl;
+            C7 += vl;
+        }
+        // Next group: skip the 8*bk values of bb just consumed and 8 output vectors of C.
+        bb += (bk<<3);
+        C += (ldc<<3);
+    }
+
+    // N:4 - same scheme for a leftover group of 4 output vectors (taken when bit 2 of bn is set)
+    if (bn & 4) {
+        C0 = C;
+        C1 = C0 + ldc;
+        C2 = C1 + ldc;
+        C3 = C2 + ldc;
+        ptrba = ba;
+
+        for (i = bm; i > 0; i -= vl) {
+            vl = VSETVL(i);
+
+            ptrbb = bb;
+
+            vres0 = VFMVVF_FLOAT(0.0, vl);
+            vres1 = VFMVVF_FLOAT(0.0, vl);
+            vres2 = VFMVVF_FLOAT(0.0, vl);
+            vres3 = VFMVVF_FLOAT(0.0, vl);
+
+#if 0
+            for (k = bk; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl); 
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
+
+                ptrba += vl;
+                ptrbb += 4;
+            }
+#else
+            // Unroll K
+            for (k = bk/8; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+                va1 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
+                ptrbb += 4;
+                va2 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+  
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl);
+                ptrbb += 4;
+                va3 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl);
+                ptrbb += 4;
+                va4 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl);
+                ptrbb += 4;
+                va5 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl);
+                ptrbb += 4;
+                va6 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl);
+                ptrbb += 4;
+                va7 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl);
+                ptrbb += 4;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl);
+                ptrbb += 4;
+            }
+
+            // K remainder
+            for (k = bk&7; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl);
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
+
+                ptrbb += 4;
+                ptrba += vl;
+            }
+#endif
+            va0 = VLEV_FLOAT(C0, vl);
+            va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl);
+            VSEV_FLOAT(C0, va0, vl);
+
+            va1 = VLEV_FLOAT(C1, vl);
+            va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl);
+            VSEV_FLOAT(C1, va1, vl);
+
+            va2 = VLEV_FLOAT(C2, vl);
+            va2 = VFMACCVF_FLOAT(va2, alpha, vres2, vl);
+            VSEV_FLOAT(C2, va2, vl);
+
+            va3 = VLEV_FLOAT(C3, vl);
+            va3 = VFMACCVF_FLOAT(va3, alpha, vres3, vl);
+            VSEV_FLOAT(C3, va3, vl);
+
+            C0 += vl;
+            C1 += vl;
+            C2 += vl;
+            C3 += vl;
+        }
+
+        bb += (bk<<2);
+        C += (ldc<<2);
+    }
+
+    // N:2 - leftover group of 2 output vectors (taken when bit 1 of bn is set)
+    if (bn & 2) {
+        C0 = C;
+        C1 = C0 + ldc;
+        ptrba = ba;
+
+        for (i = bm; i > 0; i -= vl) {
+            vl = VSETVL(i);
+
+            ptrbb = bb;
+
+            vres0 = VFMVVF_FLOAT(0.0, vl);
+            vres1 = VFMVVF_FLOAT(0.0, vl);
+#if 0
+            for (k = bk; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl); 
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
+
+                ptrba += vl;
+                ptrbb += 2;
+            }
+#else
+            // Unroll K
+            for (k = bk/8; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+                va1 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
+                ptrbb += 2;
+                va2 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl);
+                ptrbb += 2;
+                va3 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl);
+                ptrbb += 2;
+                va4 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl);
+                ptrbb += 2;
+                va5 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl);
+                ptrbb += 2;
+                va6 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl);
+                ptrbb += 2;
+                va7 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl; 
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl);
+                ptrbb += 2;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl);
+                ptrbb += 2;
+            }
+
+            // K remainder
+            for (k = bk&7; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl);
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
+
+                ptrbb += 2;
+                ptrba += vl;
+            }
+#endif
+            va0 = VLEV_FLOAT(C0, vl);
+            va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl);
+            VSEV_FLOAT(C0, va0, vl);
+
+            va1 = VLEV_FLOAT(C1, vl);
+            va1 = VFMACCVF_FLOAT(va1, alpha, vres1, vl);
+            VSEV_FLOAT(C1, va1, vl);
+
+            C0 += vl;
+            C1 += vl;
+        }
+
+        bb += (bk<<1);
+        C += (ldc<<1);
+    }
+
+    // N:1 - final single output vector (taken when bit 0 of bn is set)
+    if (bn & 1) {
+        C0 = C;
+        ptrba = ba;
+
+        for (i = bm; i > 0; i -= vl) {
+            vl = VSETVL(i);
+
+            ptrbb = bb;
+
+            vres0 = VFMVVF_FLOAT(0.0, vl);
+#if 0
+            for (k = bk; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl); 
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+ 
+                ptrba += vl;
+                ptrbb += 1;
+            }
+#else
+            // Unroll K
+            for (k = bk/8; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+                va1 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+                ptrbb += 1;
+                va2 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
+                ptrbb += 1;
+                va3 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
+                ptrbb += 1;
+                va4 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
+                ptrbb += 1;
+                va5 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
+                ptrbb += 1;
+                va6 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
+                ptrbb += 1;
+                va7 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
+                ptrbb += 1;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
+                ptrbb += 1;
+            }
+
+            // K remainder
+            for (k = bk&7; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl);
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+
+                ptrbb += 1;
+                ptrba += vl;
+            }
+#endif
+            va0 = VLEV_FLOAT(C0, vl);
+            va0 = VFMACCVF_FLOAT(va0, alpha, vres0, vl);
+            VSEV_FLOAT(C0, va0, vl);
+  
+            C0 += vl;
+        }
+
+        bb += (bk);
+        C += (ldc);
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/gemv_n_rvv.c b/kernel/riscv64/gemv_n_rvv.c
new file mode 100644
index 000000000..1366eb5ad
--- /dev/null
+++ b/kernel/riscv64/gemv_n_rvv.c
@@ -0,0 +1,94 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)            /* single precision: 32-bit elements, LMUL=8 register groups */
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)   /* vector length to use for n remaining elements */
+#define FLOAT_V_T               vfloat32m8_t              /* vector value type used by the kernel */
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8     /* unit-stride vector load */
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8    /* strided vector load (stride given in bytes) */
+#define VSEV_FLOAT              __riscv_vse32_v_f32m8     /* unit-stride vector store */
+#define VSSEV_FLOAT             __riscv_vsse32_v_f32m8    /* strided vector store (stride given in bytes) */
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m8   /* acc += scalar * vector */
+#else                           /* double precision: 64-bit elements, LMUL=8 register groups */
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)   /* vector length to use for n remaining elements */
+#define FLOAT_V_T               vfloat64m8_t              /* vector value type used by the kernel */
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8     /* unit-stride vector load */
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8    /* strided vector load (stride given in bytes) */
+#define VSEV_FLOAT              __riscv_vse64_v_f64m8     /* unit-stride vector store */
+#define VSSEV_FLOAT             __riscv_vsse64_v_f64m8    /* strided vector store (stride given in bytes) */
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m8   /* acc += scalar * vector */
+#endif                          /* !defined(DOUBLE) */
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+    /*
+     * Adds alpha * x[j] * (j-th lda-strided vector of a) to y for j = 0..n-1,
+     * handling y in strips of vl elements. y is read and written with
+     * unit-stride accesses when inc_y == 1 and with strided accesses otherwise.
+     */
+    FLOAT *col_ptr, *x_cur;
+    FLOAT_V_T col_vec, acc;
+    BLASLONG col;
+    size_t vl;
+
+    if (n < 0) return 0;
+
+    if (inc_y != 1) {
+        /* Byte distance between consecutive y elements for the strided load/store. */
+        BLASLONG stride_y = inc_y * sizeof(FLOAT);
+
+        while (m > 0) {
+            vl = VSETVL(m);
+            acc = VLSEV_FLOAT(y, stride_y, vl);
+            for (col = 0, col_ptr = a, x_cur = x; col < n; col++, col_ptr += lda, x_cur += inc_x) {
+                col_vec = VLEV_FLOAT(col_ptr, vl);
+                acc = VFMACCVF_FLOAT(acc, alpha * x_cur[0], col_vec, vl);
+            }
+            VSSEV_FLOAT(y, stride_y, acc, vl);
+            m -= vl;
+            y += vl*inc_y;
+            a += vl;
+        }
+        return 0;
+    }
+
+    while (m > 0) {
+        vl = VSETVL(m);
+        acc = VLEV_FLOAT(y, vl);
+        for (col = 0, col_ptr = a, x_cur = x; col < n; col++, col_ptr += lda, x_cur += inc_x) {
+            col_vec = VLEV_FLOAT(col_ptr, vl);
+            acc = VFMACCVF_FLOAT(acc, alpha * x_cur[0], col_vec, vl);
+        }
+        VSEV_FLOAT(y, acc, vl);
+        m -= vl;
+        y += vl;
+        a += vl;
+    }
+    return 0;
+}
diff --git a/kernel/riscv64/gemv_n_vector.c b/kernel/riscv64/gemv_n_vector.c
index 32ca8618b..aa13fc87d 100644
--- a/kernel/riscv64/gemv_n_vector.c
+++ b/kernel/riscv64/gemv_n_vector.c
@@ -27,21 +27,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSEV_FLOAT vse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
+#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
+#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4)
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSEV_FLOAT vse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
+#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
+#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
 #endif
 
 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
diff --git a/kernel/riscv64/gemv_t_rvv.c b/kernel/riscv64/gemv_t_rvv.c
new file mode 100644
index 000000000..9c859aa50
--- /dev/null
+++ b/kernel/riscv64/gemv_t_rvv.c
@@ -0,0 +1,118 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)    // single precision: 32-bit elements
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)             // VL granted for n requested elements at LMUL=8
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m8()           // largest VL at LMUL=8
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e32m1()           // largest VL at LMUL=1
+#define FLOAT_V_T               vfloat32m8_t                        // wide (LMUL=8) vector type
+#define FLOAT_V_T_M1            vfloat32m1_t                        // LMUL=1 vector type (reduction operand/result)
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8               // unit-stride load
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8              // strided load, stride given in bytes
+#define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f32m8_f32m1    // unordered sum reduction, m8 source -> m1 result
+#define VFMACCVV_FLOAT_TU       __riscv_vfmacc_vv_f32m8_tu          // vector*vector multiply-accumulate, tail-undisturbed
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8              // broadcast a scalar into an m8 vector
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1              // broadcast a scalar into an m1 vector
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32          // read element 0 of an m1 vector as a scalar
+#else                   // DOUBLE defined: the same aliases for 64-bit elements
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m8()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e64m1()
+#define FLOAT_V_T               vfloat64m8_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f64m8_f64m1
+#define VFMACCVV_FLOAT_TU       __riscv_vfmacc_vv_f64m8_tu
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
+#endif                  // !defined(DOUBLE)
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{   // For i in [0, n): *y += alpha * dot(m contiguous elements at a, m elements of x spaced inc_x); then y += inc_y, a += lda.
+    BLASLONG i, j;
+    FLOAT *a_ptr, *x_ptr;
+    // dummy1 and buffer are part of the signature but are not referenced here; the function always returns 0.
+    FLOAT_V_T va, vx, vr;
+    FLOAT_V_T_M1 v_res, v_z0;
+    size_t vlmax = VSETVL_MAX_M1;
+    v_z0 = VFMVVF_FLOAT_M1(0, vlmax);    // all-zero m1 vector: the starting value fed to every sum reduction
+    vlmax = VSETVL_MAX;                  // variable reused: from here on it is the maximum VL of the wide accumulator
+    // Unit-stride x takes the plain-load path; every other inc_x takes the strided-load path.
+    if(inc_x == 1) {
+
+        for(i = 0; i < n; i++) {
+            j = m;
+            a_ptr = a;
+            x_ptr = x;
+            vr = VFMVVF_FLOAT(0, vlmax);    // clear every accumulator lane
+            // Consume the m elements in chunks of vl; j counts what is still left.
+            for (size_t vl; j > 0; j -= vl, a_ptr += vl, x_ptr += vl) {
+                vl = VSETVL(j);
+                // The _TU form leaves lanes >= vl untouched, so a short final chunk keeps earlier partial sums.
+                va = VLEV_FLOAT(a_ptr, vl);
+                vx = VLEV_FLOAT(x_ptr, vl);
+                vr = VFMACCVV_FLOAT_TU(vr, va, vx, vl);
+            }
+            // Fold all vlmax lanes into one scalar (lanes never written still hold the initial 0).
+            v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax);
+            *y += alpha * VFMVFS_FLOAT_M1(v_res);
+            y += inc_y;
+            a += lda;
+        }
+
+    } else {
+
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);    // byte stride handed to the strided load
+
+        for(i = 0; i < n; i++) {
+            j = m;
+            a_ptr = a;
+            x_ptr = x;
+            vr = VFMVVF_FLOAT(0, vlmax);
+            // vl is size_t: convert it before scaling so a negative inc_x is multiplied as a signed value.
+            for (size_t vl; j > 0; j -= vl, a_ptr += vl, x_ptr += (BLASLONG)vl*inc_x) {
+                vl = VSETVL(j);
+
+                va = VLEV_FLOAT(a_ptr, vl);
+                vx = VLSEV_FLOAT(x_ptr, stride_x, vl);
+                vr = VFMACCVV_FLOAT_TU(vr, va, vx, vl);
+            }
+
+            v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax);
+            *y += alpha * VFMVFS_FLOAT_M1(v_res);
+            y += inc_y;
+            a += lda;
+        }
+
+    }
+
+    return(0);
+}
diff --git a/kernel/riscv64/gemv_t_vector.c b/kernel/riscv64/gemv_t_vector.c
index 7683641fa..62b85164c 100644
--- a/kernel/riscv64/gemv_t_vector.c
+++ b/kernel/riscv64/gemv_t_vector.c
@@ -27,107 +27,110 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m4_t
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m2)(n)
+#define FLOAT_V_T vfloat32m2_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VFREDSUM_FLOAT vfredosum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT vfmacc_vv_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFDOTVV_FLOAT vfdot_vv_f32m4
-#define VFMULVV_FLOAT vfmul_vv_f32m4
+#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m2)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m2)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m2_f32m1(v_res, va, vb, gvl)
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m4_t
+#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m2_f32m1)
+#endif
+#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m2)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m2)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
+#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m2)
+#define xint_t int
+#else
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m2)(n)
+#define FLOAT_V_T vfloat64m2_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT vfmacc_vv_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFDOTVV_FLOAT vfdot_vv_f64m4
-#define VFMULVV_FLOAT vfmul_vv_f64m4
+#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m2)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m2)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m2_f64m1(v_res, va, vb, gvl)
+#else
+#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m2_f64m1)
+#endif
+#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m2)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m2)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
+#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m2)
+#define xint_t long long
 #endif
 
 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
 {
-	    BLASLONG i = 0, j = 0, k = 0;
-	    BLASLONG ix = 0, iy = 0;
-	    FLOAT *a_ptr = a;
+	BLASLONG i = 0, j = 0, k = 0;
+	BLASLONG ix = 0, iy = 0;
+	FLOAT *a_ptr = a;
         FLOAT temp;
 
         FLOAT_V_T va, vr, vx;
         unsigned int gvl = 0;
-        FLOAT_V_T_M1 v_res, v_z0;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_z0 = VFMVVF_FLOAT_M1(0, gvl);
+        FLOAT_V_T_M1 v_res;
+
 
         if(inc_x == 1){
                 for(i = 0; i < n; i++){
+                        v_res = VFMVVF_FLOAT_M1(0, 1);
                         gvl = VSETVL(m);
                         j = 0;
                         vr = VFMVVF_FLOAT(0, gvl);
                         for(k = 0; k < m/gvl; k++){
                                 va = VLEV_FLOAT(&a_ptr[j], gvl);
                                 vx = VLEV_FLOAT(&x[j], gvl);
-                                vr = VFMACCVV_FLOAT(vr, va, vx, gvl);
+                                vr = VFMULVV_FLOAT(va, vx, gvl);                // could vfmacc here and reduce outside loop
+                                v_res = VFREDSUM_FLOAT(vr, v_res, gvl);         // but that reordering diverges far enough from scalar path to make tests fail
                                 j += gvl;
                         }
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                        temp = (FLOAT)VFMVFS_FLOAT(v_res);
                         if(j < m){
                                 gvl = VSETVL(m-j);
                                 va = VLEV_FLOAT(&a_ptr[j], gvl);
                                 vx = VLEV_FLOAT(&x[j], gvl);
                                 vr = VFMULVV_FLOAT(va, vx, gvl);
-
-                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                temp += (FLOAT)VFMVFS_FLOAT(v_res);
+                                v_res = VFREDSUM_FLOAT(vr, v_res, gvl);
                         }
+                        temp = (FLOAT)EXTRACT_FLOAT(v_res);
                         y[iy] += alpha * temp;
+
+
                         iy += inc_y;
                         a_ptr += lda;
                 }
         }else{
                 BLASLONG stride_x = inc_x * sizeof(FLOAT);
-                
                 for(i = 0; i < n; i++){
+                        v_res = VFMVVF_FLOAT_M1(0, 1);
                         gvl = VSETVL(m);
-						BLASLONG inc_xv = inc_x * gvl;
                         j = 0;
                         ix = 0;
                         vr = VFMVVF_FLOAT(0, gvl);
                         for(k = 0; k < m/gvl; k++){
                                 va = VLEV_FLOAT(&a_ptr[j], gvl);
                                 vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-                                vr = VFMACCVV_FLOAT(vr, va, vx, gvl);
+                                vr = VFMULVV_FLOAT(va, vx, gvl);
+                                v_res = VFREDSUM_FLOAT(vr, v_res, gvl);
                                 j += gvl;
-                                ix += inc_xv;
+                                ix += inc_x * gvl;
                         }
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                        temp = (FLOAT)VFMVFS_FLOAT(v_res);
                         if(j < m){
                                 gvl = VSETVL(m-j);
                                 va = VLEV_FLOAT(&a_ptr[j], gvl);
                                 vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                                 vr = VFMULVV_FLOAT(va, vx, gvl);
-
-                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                temp += (FLOAT)VFMVFS_FLOAT(v_res);
+                                v_res = VFREDSUM_FLOAT(vr, v_res, gvl);
                         }
+                        temp = (FLOAT)EXTRACT_FLOAT(v_res);
                         y[iy] += alpha * temp;
+
+
                         iy += inc_y;
                         a_ptr += lda;
                 }
         }
+
+
 	return(0);
 }
-
diff --git a/kernel/riscv64/generate_kernel.py b/kernel/riscv64/generate_kernel.py
new file mode 100755
index 000000000..8be7c9f9c
--- /dev/null
+++ b/kernel/riscv64/generate_kernel.py
@@ -0,0 +1,673 @@
+#!/usr/bin/python3
+
+import sys, os
+import contextlib
+
+#-----------------------------------------------------------------------
+def ERROR(*args, **kwargs):  # report a fatal problem: print the arguments to stderr, then stop the script
+    print(*args, file=sys.stderr, **kwargs)  # extra keyword arguments are forwarded to print()
+    sys.exit(-1)  # raises SystemExit(-1); nothing after a call to ERROR() runs unless that is caught
+
+class Target(object):  # line writer that tracks an indentation level and a dict of named str.format substitutions
+    def __init__( self, out, mappings, initial_level=0, tab_width=4 ):
+        self._level = initial_level    # current indentation depth, in levels
+        self._tab_width = tab_width    # spaces per level (see _indent_str)
+        self._out = out                # callable that receives each finished line
+        self._mappings = mappings      # keyword values handed to str.format by write()
+    # map(): overlay extra substitutions for the duration of a with-body; yields the merged dict.
+    @contextlib.contextmanager
+    def map( self, **items ):
+        old_mappings = self._mappings
+        self._mappings = dict(old_mappings, **items)    # builds a new dict; the previous one is not modified
+        yield self._mappings
+        self._mappings = old_mappings    # NOTE(review): not in a finally, so skipped if the with-body raises
+    # block(): optionally write `start`, indent the with-body by one level, then optionally write `end`.
+    @contextlib.contextmanager
+    def block( self, start=None, end=None, **args ):
+        with self.map(**args):    # extra keyword arguments become substitutions for start, body and end
+            if start is not None:
+                self.write();    # empty line before the opening line
+                self.write(start)
+            self._level += 1
+            yield self._level
+            self._level -= 1    # NOTE(review): like map(), not restored if the with-body raises
+            if end is not None:
+                self.write(end)
+                self.write()    # empty line after the closing line
+    # write(): with no fmt, emit an empty line; otherwise indent fmt and expand it with str.format.
+    def write( self, fmt=None, *args, **kwargs ):
+        if fmt is not None:
+            mappings = dict(self._mappings, **kwargs) if kwargs else self._mappings    # per-call keywords override current mappings
+            self._out(self._indent_str() + fmt.format(*args, **mappings))    # literal braces in fmt must be doubled
+        else:
+            self._out("")
+    # _indent_str(): the run of spaces for the current level.
+    def _indent_str( self ):
+        return ' ' * (self._level * self._tab_width)
+
+#-----------------------------------------------------------------------
+def generate_trmm_block( dest ):  # emit the C lines that declare pass_K/off and adjust ai, bi, pass_K for TRMM
+    dest.write("{index_type} pass_K = K;")  # {index_type}, {M}, {N}, {elt_size} are filled from dest's current mappings
+    dest.write("#ifdef LEFT")
+    with dest.block():  # no start/end text: only raises the indentation of the nested lines
+        dest.write("{index_type} off = offset + m_top;")
+    dest.write("#else")
+    with dest.block():
+        dest.write("{index_type} off = -offset + n_top;")
+    dest.write("#endif")
+    # BACKWARDS: advance ai/bi by off k-steps and subtract off from pass_K; otherwise pass_K = off + tile size.
+    dest.write("#ifdef BACKWARDS")
+    with dest.block():
+        dest.write("ai += off*{M}{elt_size};")
+        dest.write("bi += off*{N}{elt_size};")
+        dest.write("pass_K -= off;")
+    dest.write("#else")
+    with dest.block():
+        dest.write("#ifdef LEFT")
+        with dest.block():
+            dest.write("pass_K = off + {M};")  # LEFT uses the M tile size
+        dest.write("#else")
+        with dest.block():
+            dest.write("pass_K = off + {N};")  # otherwise the N tile size
+        dest.write("#endif")
+    dest.write("#endif")
+
+#-----------------------------------------------------------------------
+def generate_gemm_kernel_inner_real( settings, dest, M, N, vlen, a_regs ):  # emit the C body for one M x N real tile; vlen is not used here
+    TRMM           = (settings['op'].value == 'trmm')
+    narrow_result  = (settings['param_precision'].value != 'double') and settings['force_acc_double'].value
+    # M and N are bound as substitutions for every line emitted below.
+    with dest.map( 
+        M=M, 
+        N=N, 
+    ):
+        dest.write("{index_type} ai=m_top*K{elt_size};")
+        dest.write("{index_type} bi=n_top*K{elt_size};")
+        if TRMM:
+            generate_trmm_block( dest )
+        # k = 0 step of the emitted code: read N scalars from B, then load a_regs vectors from A.
+        for i in range(N):
+            dest.write("{param_scalar_t} B{i} = B[bi+{i}];", i=i)
+        dest.write("bi += {N};")
+        dest.write()
+
+        for i in range(a_regs):
+            dest.write("{param_vector_t} A{i} = {VLEV}( &A[ai+{i}*gvl], gvl );", i=i)
+        dest.write("ai += {M};")
+        dest.write()
+        # Declare the N*a_regs accumulators from the {VMUL_TO_ACC} product; accumulator index = j*a_regs+i.
+        for j in range(N):
+            for i in range(a_regs):
+                dest.write("{acc_vector_t} result{dest} = {VMUL_TO_ACC}( A{i}, B{j}, gvl);", dest=j*a_regs+i, i=i, j=j)
+        # Remaining k steps (the emitted loop starts at k=1): reload B and A, then {VMACC_TO_ACC} into the accumulators.
+        with dest.block("for({index_type} k=1; k<{Kend}; k++) {{", "}}", Kend=('pass_K' if TRMM else 'K')):
+            for i in range(N):
+                dest.write("B{i} = B[bi+{i}];", i=i )
+            dest.write("bi += {N};")
+            dest.write()
+
+            for i in range(a_regs):
+                dest.write("A{i} = {VLEV}( &A[ai+{i}*gvl], gvl );", i=i)
+
+            dest.write("ai += {M};")
+            dest.write()
+
+
+            for j in range(N):
+                for i in range(a_regs):
+                    dest.write("result{dest} = {VMACC_TO_ACC}( result{dest}, B{j}, A{i}, gvl);", dest= j*a_regs+i, j=j, i=i )
+
+        dest.write()
+        dest.write("{index_type} ci=n_top*ldc+m_top;")
+        dest.write()
+        # force_acc_double with a non-double parameter type: convert each accumulator with {VFNCVT} first.
+        if narrow_result:
+            for j in range(N):
+                for i in range(a_regs):
+                    dest.write("{param_vector_t} narrowed{idx} = {VFNCVT}( result{idx}, gvl );", idx=j*a_regs+i)
+        # Non-TRMM: load the existing C tile; ci steps by gvl per register and by ldc to the next j.
+        if not TRMM:
+            for j in range(N):
+                for i in range(a_regs):
+                    idx = j*a_regs+i
+                    increment = ' ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;'
+                    if idx == N*a_regs-1:
+                        increment = ''  # nothing follows the last register, so no index update is emitted
+                    dest.write("{param_vector_t} c{idx} = {VLEV}( &C[ci], gvl);{increment}", idx=idx, increment=increment)
+        # TRMM declares c{idx} from {VFMUL}(acc, alpha); non-TRMM updates the loaded c{idx} with {VFMACC}.
+        if narrow_result:
+            for j in range(N):
+                for i in range(a_regs):
+                    idx = j*a_regs+i
+                    if TRMM:
+                        dest.write("{param_vector_t} c{idx} = {VFMUL}( narrowed{idx}, alpha, gvl );", idx=idx)
+                    else:
+                        dest.write("c{idx} = {VFMACC}( c{idx}, alpha, narrowed{idx}, gvl );", idx=idx)
+        else:
+            for j in range(N):
+                for i in range(a_regs):
+                    idx = j*a_regs+i
+                    if TRMM:
+                        dest.write("{param_vector_t} c{idx} = {VFMUL}( result{idx}, alpha, gvl );", idx=idx)
+                    else:
+                        dest.write("c{idx} = {VFMACC}( c{idx}, alpha, result{idx}, gvl );", idx=idx)
+
+        # Non-TRMM advanced ci while loading, so it is reset to the tile origin before the stores.
+        if not TRMM:
+            dest.write()
+            dest.write("ci=n_top*ldc+m_top;")
+            dest.write()
+        # Store every c register with {VSEV}, stepping ci exactly as in the load loop.
+        for j in range(N):
+            for i in range(a_regs):
+                idx = j*a_regs+i
+                increment = ' ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;'
+                if idx == N*a_regs-1:
+                    increment = ''
+                dest.write("{VSEV}( &C[ci], c{idx}, gvl);{increment}", idx=idx, increment=increment)
+
+
+#-----------------------------------------------------------------------
+def generate_gemm_kernel_inner_complex( settings, dest, M, N, vlen, a_regs ):  # emit the C body for one M x N complex tile; vlen is not used here
+    TRMM           = (settings['op'].value == 'trmm')
+    narrow_result  = (settings['param_precision'].value != 'double') and settings['force_acc_double'].value
+
+    if narrow_result:
+        raise RuntimeError("wide accumulator not supported for generated complex kernels")
+        # we could, but we run out of registers really really fast
+    # M and N are bound as substitutions for every line emitted below; indices are doubled for (re, im) pairs.
+    with dest.map( 
+        M=M, 
+        N=N, 
+    ):
+        dest.write("{index_type} ai=m_top*K*2;")
+        dest.write("{index_type} bi=n_top*K*2;")
+        if TRMM:
+            generate_trmm_block( dest )
+        # k = 0 step: read N (re, im) scalar pairs from B and a_regs (re, im) vector pairs from A.
+        for i in range(N):
+            dest.write("{param_scalar_t} B{i}r = B[bi+{i}*2+0];", i=i)
+            dest.write("{param_scalar_t} B{i}i = B[bi+{i}*2+1];", i=i)
+        dest.write("bi += {N}*2;")
+        dest.write()
+
+        for i in range(a_regs):
+            dest.write("{param_vector_t} A{i}r = {VLSEV}( &A[ai+{i}*gvl*2], sizeof(FLOAT)*2, gvl );", i=i)
+            dest.write("{param_vector_t} A{i}i = {VLSEV}( &A[ai+{i}*gvl*2+1], sizeof(FLOAT)*2, gvl );", i=i)
+        dest.write("ai += {M}*2;")
+        dest.write()
+
+        # for each vector register loaded from matrix A, we require N registers to hold vector-scalar multiply-accumulate results
+        accumulation_regs = a_regs * N
+        dest.write("// {a_regs} vector regs to hold A array contents, {accumulation_regs} regs to hold values accumulated over k",
+                a_regs=a_regs*2, accumulation_regs=accumulation_regs*2
+            )
+        pass_regs = (accumulation_regs + a_regs)*2  # doubled: every A and accumulator value has a real and an imaginary register
+        tmp_regs = (32 // settings['LMUL_ACC'].value) - pass_regs  # what is left of the 32 // LMUL_ACC budget
+        if tmp_regs < 2:
+            raise RuntimeError("Complex kernel would use too many registers!")
+
+        dest.write("// leaving {tmp_regs} vector registers for temporaries", tmp_regs=tmp_regs)
+        # Choose how many (i, j) products share one batch of tmp variables; tmp_unroll_j is halved until it fits.
+        tmp_unroll_i = min(tmp_regs, a_regs)
+        tmp_unroll_j = N
+        while tmp_unroll_j > 1 and (tmp_regs/(tmp_unroll_i*2)) < tmp_unroll_j:
+            tmp_unroll_j = int(tmp_unroll_j / 2)
+
+        if tmp_unroll_i < a_regs or tmp_unroll_j < N:
+            dest.write("// performing {ops} operations between reuses of temporaries", ops=tmp_unroll_j*tmp_unroll_i)
+        # NOTE(review): the inner ranges below are not clamped to N / a_regs; this looks correct only when both are multiples of the unroll factors - confirm.
+        for tj in range(0, N, tmp_unroll_j):
+            for ti in range(0, a_regs, tmp_unroll_i):
+                for j in range(tj, tj+tmp_unroll_j):
+                    for i in range(ti, ti+tmp_unroll_i):
+                        with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j):
+                            if ti == 0 and tj==0:  # first batch declares the tmp variables; later batches reuse them
+                                dest.write("{acc_vector_t} tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);")
+                                dest.write("{acc_vector_t} tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);")
+                            else:
+                                dest.write("tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);")
+                                dest.write("tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);")
+                for j in range(tj, tj+tmp_unroll_j):
+                    for i in range(ti, ti+tmp_unroll_i):
+                        with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j):
+                            dest.write("tmp{tmp}r = VFMACC_RR( tmp{tmp}r, B{j}r, A{i}r, gvl);")  # VFMACC_RR / VFMACC_RI are C macros, not substitutions
+                            dest.write("tmp{tmp}i = VFMACC_RI( tmp{tmp}i, B{j}r, A{i}i, gvl);")
+                # At k = 0 the accumulators are declared directly from the finished temporaries.
+                for j in range(tj, tj+tmp_unroll_j):
+                    for i in range(ti, ti+tmp_unroll_i):
+                        with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j):
+                            dest.write("{acc_vector_t} ACC{dest}r = tmp{tmp}r;")
+                            dest.write("{acc_vector_t} ACC{dest}i = tmp{tmp}i;")
+        # Remaining k steps (the emitted loop starts at k=1): reload B and A, rebuild the temporaries, add them into ACC.
+        with dest.block("for({index_type} k=1; k<{Kend}; k++) {{", "}}", Kend=('pass_K' if TRMM else 'K')):
+            for i in range(N):
+                dest.write("B{i}r = B[bi+{i}*2+0];", i=i)
+                dest.write("B{i}i = B[bi+{i}*2+1];", i=i)
+            dest.write("bi += {N}*2;")
+            dest.write()
+
+            for i in range(a_regs):
+                dest.write("A{i}r = {VLSEV}( &A[ai+{i}*gvl*2], sizeof(FLOAT)*2, gvl );", i=i)
+                dest.write("A{i}i = {VLSEV}( &A[ai+{i}*gvl*2+1], sizeof(FLOAT)*2, gvl );", i=i)
+
+            dest.write("ai += {M}*2;")
+            dest.write()
+
+
+            for tj in range(0, N, tmp_unroll_j):
+                for ti in range(0, a_regs, tmp_unroll_i):
+                    # note the values in tmp{tmp}* are frequently of similar magnitude and opposite sign
+                    # so accumulating them directly to ACC would lose precision when ACC is larger
+
+                    for j in range(tj, tj+tmp_unroll_j):
+                        for i in range(ti, ti+tmp_unroll_i):
+                            with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j):
+                                dest.write("tmp{tmp}r = {VMUL_TO_ACC}( A{i}i, B{j}i, gvl);")
+                                dest.write("tmp{tmp}i = {VMUL_TO_ACC}( A{i}r, B{j}i, gvl);")
+                    for j in range(tj, tj+tmp_unroll_j):
+                        for i in range(ti, ti+tmp_unroll_i):
+                            with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j):
+                                dest.write("tmp{tmp}r = VFMACC_RR( tmp{tmp}r, B{j}r, A{i}r, gvl);")
+                                dest.write("tmp{tmp}i = VFMACC_RI( tmp{tmp}i, B{j}r, A{i}i, gvl);")
+                    for j in range(tj, tj+tmp_unroll_j):
+                        for i in range(ti, ti+tmp_unroll_i):
+                            with dest.map(dest=j*a_regs+i, tmp=(i-ti)+tmp_unroll_i*(j-tj), i=i, j=j):
+                                dest.write("ACC{dest}r = {__riscv_}vfadd( ACC{dest}r, tmp{tmp}r, gvl);")
+                                dest.write("ACC{dest}i = {__riscv_}vfadd( ACC{dest}i, tmp{tmp}i, gvl);")
+
+        dest.write()
+        dest.write("{index_type} ci=n_top*ldc+m_top;")
+        dest.write()
+        # TRMM declares C as ACC * alphar; non-TRMM loads the existing interleaved (re, im) C values with strided loads.
+        for j in range(N):
+            if TRMM:
+                for i in range(a_regs):
+                    with dest.map(idx=j*a_regs+i):
+                        dest.write("{param_vector_t} C{idx}r = {__riscv_}vfmul( ACC{idx}r, alphar, gvl );")
+                        dest.write("{param_vector_t} C{idx}i = {__riscv_}vfmul( ACC{idx}i, alphar, gvl );")
+            else:
+                for i in range(a_regs):
+                    idx = j*a_regs+i
+                    increment = 'ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;'
+                    if idx == N*a_regs-1:
+                        increment = ''  # nothing follows the last register; an empty line is written instead
+                    with dest.map(idx=j*a_regs+i, increment=increment):
+                        dest.write("{param_vector_t} C{idx}r = {VLSEV}( &C[ci*2+0], sizeof(FLOAT)*2, gvl );")
+                        dest.write("{param_vector_t} C{idx}i = {VLSEV}( &C[ci*2+1], sizeof(FLOAT)*2, gvl );")
+                        dest.write("{increment}")
+        # Non-TRMM: fold in the alphar terms with vfmacc on the loaded C values.
+        if not TRMM:
+            for j in range(N):
+                for i in range(a_regs):
+                    with dest.map(idx=j*a_regs+i):
+                        dest.write("C{idx}r = {__riscv_}vfmacc( C{idx}r, alphar, ACC{idx}r, gvl );")
+                        dest.write("C{idx}i = {__riscv_}vfmacc( C{idx}i, alphar, ACC{idx}i, gvl );")
+        # Both variants then apply the alphai cross terms: vfnmsac on the real part, vfmacc on the imaginary part.
+        for j in range(N):
+            for i in range(a_regs):
+                with dest.map(idx=j*a_regs+i):
+                    dest.write("C{idx}r = {__riscv_}vfnmsac( C{idx}r, alphai, ACC{idx}i, gvl );")
+                    dest.write("C{idx}i = {__riscv_}vfmacc ( C{idx}i, alphai, ACC{idx}r, gvl );")
+        # Non-TRMM advanced ci while loading, so it is reset to the tile origin before the stores.
+        if not TRMM:
+            dest.write()
+            dest.write("ci=n_top*ldc+m_top;")
+            dest.write()
+        # Strided stores of the real and imaginary parts, stepping ci exactly as in the load loop.
+        for j in range(N):
+            for i in range(a_regs):
+                idx = j*a_regs+i
+                increment = 'ci += ldc-gvl*{};'.format(a_regs-1) if (i == a_regs-1) else ' ci += gvl;'
+                if idx == N*a_regs-1:
+                    increment = ''
+                with dest.map(idx=j*a_regs+i, increment=increment):
+                    dest.write("{VSSEV}( &C[ci*2+0], sizeof(FLOAT)*2, C{idx}r, gvl);")
+                    dest.write("{VSSEV}( &C[ci*2+1], sizeof(FLOAT)*2, C{idx}i, gvl);")
+                    dest.write("{increment}")
+
+#-----------------------------------------------------------------------
+def generate_gemm_kernel( settings, OUTPUT ):  # emit a complete C GEMM/TRMM kernel function, one line at a time, through OUTPUT
+    if settings['conjugate'].value:
+        ERROR('conjugate gemm not yet supported')
+    # Pick the real or complex tile generator; every setting value becomes a string substitution for the writer.
+    is_complex = settings['complex'].value
+    generate_gemm_kernel_inner = generate_gemm_kernel_inner_complex if is_complex else generate_gemm_kernel_inner_real
+    dest = Target(OUTPUT, { k:str(settings[k].value) for k in settings })
+    # vlenmax = reg_width_bits * LMUL_ACC / ELEN_PARAM; presumably elements per accumulator register group - confirm against the settings definitions.
+    M = settings['M'].value
+    N = settings['N'].value
+    vlenmax = int(settings['reg_width_bits'].value * settings['LMUL_ACC'].value /
+                  settings['ELEN_PARAM'].value)
+    a_regs = max(int(M/vlenmax), 1)  # vector loads of A per tile row block, at least one
+
+    # for each vector register loaded from matrix A, we require N registers to hold vector-scalar multiply-accumulate results
+    accumulation_regs = a_regs * N
+    required_regs = accumulation_regs + a_regs
+    if is_complex:
+        required_regs = required_regs * 2 + 2  # real + imaginary registers, plus two more
+        dest.write('''
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+    #define S0  1
+    #define S1 -1
+    #define S2  1
+    #define S3  1
+    #define VFMACC_RR __riscv_vfmsac{tail_policy}
+    #define VFMACC_RI __riscv_vfmacc{tail_policy}
+#endif
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+    #define S0  1
+    #define S1  1
+    #define S2  1
+    #define S3 -1
+    #define VFMACC_RR __riscv_vfmacc{tail_policy}
+    #define VFMACC_RI __riscv_vfmsac{tail_policy}
+#endif
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+    #define S0  1
+    #define S1  1
+    #define S2 -1
+    #define S3  1
+    #define VFMACC_RR __riscv_vfmacc{tail_policy}
+    #define VFMACC_RI __riscv_vfnmsac{tail_policy}
+#endif
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+    #define S0  1
+    #define S1 -1
+    #define S2 -1
+    #define S3 -1
+    #define VFMACC_RR __riscv_vfmsac{tail_policy}
+    #define VFMACC_RI __riscv_vfnmacc{tail_policy}
+#endif
+'''.format(tail_policy=settings['tail_policy'].value))
+
+    # Refuse unroll factors that need more than the 32 // LMUL_ACC register groups available.
+    if required_regs > (32 // settings['LMUL_ACC'].value):
+        raise Exception("{} vector registers needed during accumulation for unrolling {} x {}{} but only {} are available".format(
+            required_regs, N, M, (" with wide accumulator" if settings['LMUL_ACC'].value > 1 else ''), 32 // settings['LMUL_ACC'].value
+            ))
+    # TRMM kernels define BACKWARDS when exactly one of LEFT / TRANSA is defined.
+    TRMM = (settings['op'].value == 'trmm')
+    if TRMM:
+        with dest.block("#if defined(LEFT) != defined(TRANSA)", "#endif"):
+            dest.write("#define BACKWARDS")
+    # Emitted signature: complex kernels take alphar/alphai, TRMM kernels take a trailing offset argument.
+    dest.write("int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, {alpha}, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc{trmm})",
+            alpha = ('FLOAT alphar, FLOAT alphai' if is_complex else 'FLOAT alpha'),
+            trmm = (', BLASLONG offset' if TRMM else '')
+        )
+    # Function body; {elt_size} is '*2' for complex kernels and empty for real ones.
+    with dest.block("{{", "}}", elt_size='*2' if is_complex else ''):
+        if settings['trace'].value:
+            dest.write("printf(\"\\n\\nENTRY: %s(%d) M %d N %d K %d ldc %d\\n\", __FILE__, __LINE__, M, N, K, ldc);")
+        dest.write("{index_type} gvl = 0;")
+        dest.write("{index_type} m_top = 0;")
+        dest.write("{index_type} n_top = 0;")
+
+        dest.write()
+        dest.write()
+        dest.write("// -- MAIN PASS")
+        # Main pass: full N-wide column groups, each swept in full M-tall tiles and then M tails.
+        with dest.block("for ({index_type} j=0; j<N/{N}; j+=1) {{", "}}"):
+            dest.write("m_top = 0;")
+            dest.write("{index_type} gvl = {VSETVL}({vlenmax});", vlenmax=min(vlenmax,max(int(M/a_regs),1)))  # NOTE(review): second gvl declaration in the emitted C; the function-scope one is written above
+            dest.write()
+            with dest.block("for ({index_type} i=0; i<M/{M}; i+=1) {{", "}}"):
+                generate_gemm_kernel_inner( settings, dest, M, N, vlenmax, a_regs )
+                dest.write( "m_top += {M};" )
+
+            dest.write()
+            dest.write()
+            dest.write("// -- tails for main pass")
+            generate_M_tails( dest, settings, M, N )
+
+            dest.write( "n_top += {N};" )
+
+        # N tails: halve N_tail down to 1, emitting one `if( N & N_tail )` section per width.
+        N_tail = int(N/2)
+        while( N_tail > 0 ):
+            with dest.map(N=N_tail):  # {N} resolves to the tail width inside this section
+                dest.write()
+                dest.write()
+                dest.write("// -- tails for N={N}")
+                with dest.block("if( N & {N} ) {{", "}}" ):
+                    if settings['trace'].value:
+                        dest.write("printf(\"N tail entry: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);")
+                    dest.write("gvl = {VSETVL}({vlenmax});", vlenmax=min(vlenmax,max(int(M/a_regs),1)))
+                    dest.write("m_top = 0;")
+                    with dest.block("for ({index_type} i=0; i<M/{M}; i+=1) {{", "}}"):
+                        generate_gemm_kernel_inner( settings, dest, M, N_tail, vlenmax, a_regs )
+                        dest.write("m_top += {M};")
+
+                    generate_M_tails( dest, settings, M, N_tail )
+                    dest.write("n_top += {N};")
+            N_tail = int(N_tail/2)
+        # Last statement of the emitted function body.
+        dest.write("return 0;");
+
+
+#-----------------------------------------------------------------------
+def generate_M_tails( dest, settings, M, N ):  # emit the "if( M & tail )" blocks that handle rows left over after the full-M loop
+    M_tail = int(M/2)  # tail sizes are tried in halving order: M/2, M/4, ..., 1
+    M_tail_min = settings['M_tail_scalar_from'].value  # tails of this size or smaller take the scalar path further down
+    vlenmax = int(settings['reg_width_bits'].value * settings['LMUL_ACC'].value
+                  / settings['ELEN_PARAM'].value )  # cap used below when splitting a tail into pieces
+    TRMM           = (settings['op'].value == 'trmm')
+    is_complex = settings['complex'].value
+    generate_gemm_kernel_inner = generate_gemm_kernel_inner_complex if is_complex else generate_gemm_kernel_inner_real
+    # Vector tails: every halved size still above M_tail_min reuses the inner-kernel generator.
+    while( M_tail > M_tail_min ):
+        with dest.block("if( M & {M_tail} ) {{", "}}", M_tail=M_tail ):
+            if settings['trace'].value:
+                dest.write("printf(\"tail: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);")
+            a_regs = max( 1, int(M_tail/vlenmax) )  # number of vlenmax-sized pieces the tail splits into (at least 1)
+            vlen = int(M_tail/a_regs)  # elements per piece; this is the value handed to {VSETVL} in the generated code
+            dest.write("gvl = {VSETVL}({vlen});\n", vlen=vlen)
+
+            generate_gemm_kernel_inner( settings, dest, M_tail, N, vlen, a_regs )
+            dest.write( "m_top += {M_tail};" )
+
+        M_tail = int( M_tail / 2 )
+    # Scalar tails: the remaining sizes are emitted as unrolled scalar C with one accumulator per output value.
+    while( M_tail > 0 ):
+        with dest.block("if( M & {M_tail} ) {{", "}}", 
+                M_tail=M_tail, 
+                N=N, 
+                result_t = ('double' if settings['force_acc_double'].value else settings['param_scalar_t'].value)  # C type of the accumulators
+        ):
+            if settings['trace'].value:
+                dest.write("printf(\"tail: %s(%d) M %d N %d K %d m_top %d n_top %d\\n\", __FILE__, __LINE__, M, N, K, m_top, n_top);")
+            for r in range(M_tail * N * (2 if is_complex else 1)):  # complex uses two accumulators per element (slots *2+0 and *2+1)
+                dest.write("{result_t} result{r} = 0;",
+                    r=r
+                )
+
+            dest.write("{index_type} ai=m_top*K{elt_size};")  # {index_type}/{elt_size} are not passed here; presumably supplied by dest's mapping - confirm
+            dest.write("{index_type} bi=n_top*K{elt_size};")
+
+            if TRMM:
+                with dest.map(M=M_tail, N=N):
+                    generate_trmm_block( dest )
+
+            with dest.block("for({index_type} k=0; k<{Kend}; k++) {{", "}}", Kend = ('pass_K' if TRMM else 'K') ):  # TRMM loops to pass_K instead of K
+                for ki in range( N ):
+                    for kj in range( M_tail ):
+                        if is_complex:
+                            dest.write("result{dest}+=S0*A[ai+{kj}+0]*B[bi+{ki}+0] + S1*A[ai+{kj}+1]*B[bi+{ki}+1];".format(
+                                        dest=(ki*M_tail+kj)*2, kj=kj*2, ki=ki*2
+                                    ))                            
+                            dest.write("result{dest}+=S2*A[ai+{kj}+1]*B[bi+{ki}+0] + S3*A[ai+{kj}+0]*B[bi+{ki}+1];".format(
+                                        dest=(ki*M_tail+kj)*2+1, kj=kj*2, ki=ki*2
+                                    ))                            
+                        else:
+                            dest.write("result{dest}+=A[ai+{kj}]*B[bi+{ki}];".format(
+                                    dest=ki*M_tail+kj, kj=kj, ki=ki
+                                ))
+                dest.write("ai+={M_tail}{elt_size};")  # advance both panels by one k step
+                dest.write("bi+={N}{elt_size};")
+            # Write-back: scale the accumulators by alpha and store (TRMM) or add (GEMM) into C.
+            dest.write("{index_type} ci=n_top*ldc+m_top;")
+            if is_complex:
+                dest.write("{result_t} Cr, Ci;")
+            for ki in range( N ):
+                for kj in range( M_tail ):
+                    if is_complex:
+                        if TRMM:  # TRMM starts from the scaled result; GEMM starts from the current C value
+                            dest.write('Cr = result{dest}*alphar;', dest=(ki*M_tail+kj)*2+0)
+                            dest.write('Ci = result{dest}*alphar;', dest=(ki*M_tail+kj)*2+1)
+                        else:
+                            dest.write('Cr = C[(ci+{ki}*ldc+{kj})*2+0];', ki=ki, kj=kj)
+                            dest.write('Ci = C[(ci+{ki}*ldc+{kj})*2+1];', ki=ki, kj=kj)
+                            dest.write('Cr += result{dest}*alphar;', dest=(ki*M_tail+kj)*2+0)
+                            dest.write('Ci += result{dest}*alphar;', dest=(ki*M_tail+kj)*2+1)
+                        dest.write('Cr -= result{dest}*alphai;', dest=(ki*M_tail+kj)*2+1)  # the alphai cross terms are emitted for both ops
+                        dest.write('Ci += result{dest}*alphai;', dest=(ki*M_tail+kj)*2+0)
+                        dest.write("C[(ci+{ki}*ldc+{kj})*2+0] = Cr;", ki=ki, kj=kj )
+                        dest.write("C[(ci+{ki}*ldc+{kj})*2+1] = Ci;", ki=ki, kj=kj )
+                    else:
+                        op = '' if TRMM else '+'  # TRMM emits "=", GEMM emits "+="
+                        dest.write("C[ci+{ki}*ldc+{kj}] {op}= alpha * result{dest};",
+                                ki=ki, kj=kj, op=op, dest=ki*M_tail+kj
+                            )
+            dest.write("m_top+={M_tail};")
+
+        M_tail = int(M_tail/2)
+
+
+#-----------------------------------------------------------------------
+class Setting(object):  # one generator option: its current value plus an optional string-to-value converter
+    def __init__( self, value, convert = None ):  # convert=None makes the setting report configurable == False
+        self._value = value
+        self._convert = convert
+
+    @classmethod
+    def ENUM( cls, *values ):  # builds a converter: case-insensitive lookup that returns the spelling given here
+        def closure( values ):
+            return lambda value: values[value.lower()]  # KeyError when the value is not one of the choices
+        return closure( { v.lower():v for v in values } )
+
+    @classmethod
+    def BOOL( cls, value ):  # converter: True for '1' or any string starting with 't'/'T', otherwise False
+        return value.lower().startswith('t') or value == '1'
+
+    @property
+    def value( self ):
+        return self._value
+
+    @property
+    def configurable( self ):  # True only when a converter was supplied
+        return self._convert is not None
+
+    @value.setter
+    def value( self, value ):  # stores the converted value; calling this with _convert None raises TypeError
+        self._value = self._convert( value )
+
+    def __str__( self ):  # lets Setting objects be passed straight to str.format
+        return str(self._value)
+
+#-----------------------------------------------------------------------
+def main():  # parse name=value arguments, derive the intrinsic names, then write the generated kernel source
+    settings = {  # defaults; the second Setting argument converts the command-line string
+        'op':               Setting( 'gemm', Setting.ENUM( 'gemm', 'trmm' ) ),
+        'M':                Setting( 16, int ),
+        'N':                Setting( 4, int ),
+        'reg_width_bits':   Setting( 256, int ),
+        'LMUL':             Setting( 1, int ),
+        'M_tail_scalar_from':Setting( 2, int ),
+        'cpu':              Setting( 'zvl256b', str ),
+        'param_precision':  Setting( 'float', Setting.ENUM( 'float', 'double' ) ),
+        'force_acc_double': Setting( False, Setting.BOOL ),
+        'complex':          Setting( False, Setting.BOOL ),
+        'conjugate':        Setting( False, Setting.BOOL ),
+        'index_type':       Setting( 'BLASLONG', str ),
+        'trace':            Setting( False, Setting.BOOL ),
+        'output':           Setting( None, str ),
+        'tail_policy':      Setting( '', str ), # _ta, if toolchain supports it
+        '__riscv_':         Setting( '__riscv_', str),
+    }
+
+    for item in sys.argv[1:]:  # every argument must have the form name=value
+        try:
+            name, value = tuple(item.split( '=', 1 ))
+        except:  # NOTE(review): only the unpack can fail here (no '=' present); a bare except is broader than needed
+            ERROR("couldn't parse {}, expected arguments of the form name=value".format(item))  # ERROR is defined outside this view; presumably it aborts - confirm
+
+        if name not in settings:
+            ERROR("couldn't parse {}, {} it is not a known option\n".format( item, name )
+                  +"options (and current defaults) are\n{}".format(
+                   " ".join([ '{}={}'.format(k, settings[k].value) for k in settings.keys()]))
+                )
+
+        try:
+            settings[name].value = value  # the property setter runs the option's converter
+        except:  # NOTE(review): bare except; converter failures are the expected case
+            import traceback
+            traceback.print_exc()
+            ERROR("couldn't parse {}".format(item))
+    # Default output file name: <s|d|c|z><op>_kernel_<M>x<N>_<cpu>.c
+    if settings['output'].value is None:
+        if settings['complex'].value:
+            prefix = 'z' if settings['param_precision'].value == 'double' else 'c'
+        else:
+            prefix = 'd' if settings['param_precision'].value == 'double' else 's'
+        settings['output'] = Setting('{}{}_kernel_{}x{}_{}.c'.format(  # Setting.__str__ supplies the values
+                prefix,
+                settings['op'],
+                settings['M'],
+                settings['N'],
+                settings['cpu']
+            ))
+    # Derived settings (created without a converter): scalar type and element width follow param_precision.
+    if settings['param_precision'].value == 'double':
+        settings['param_scalar_t'] = Setting( 'double' )
+        settings['ELEN_PARAM'] = Setting(64)
+    else:
+        settings['param_scalar_t'] = Setting( 'float' )
+        settings['ELEN_PARAM'] = Setting(32)
+
+    settings['VFMUL'] = Setting( '{}vfmul_vf_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy']) )
+    settings['VFMACC'] = Setting( '{}vfmacc_vf_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy']) )
+
+    settings['ELEN_ACC'] = settings['ELEN_PARAM']  # accumulators default to the parameter width and LMUL
+    settings['LMUL_ACC'] = Setting(settings['LMUL'].value)
+    widen = ''
+    # force_acc_double with float inputs: 64-bit accumulators, doubled LMUL, 'w' (widening) intrinsic names.
+    if settings['force_acc_double'].value and (settings['param_precision'].value == 'float'):
+        settings['ELEN_ACC'] = Setting(64)
+        settings['LMUL_ACC'] = Setting(settings['LMUL'].value*2)
+        settings['VFNCVT']   = Setting('{}vfncvt_f_f_w_f{}m{}{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL'], settings['tail_policy']))
+        widen = 'w'
+
+    settings['VMUL_TO_ACC'] = Setting( '{}vf{}mul_vf_f{}m{}{}'.format(settings['__riscv_'], widen, settings['ELEN_ACC'], settings['LMUL_ACC'], settings['tail_policy']) )
+    settings['VMACC_TO_ACC'] = Setting( '{}vf{}macc_vf_f{}m{}{}'.format(settings['__riscv_'], widen, settings['ELEN_ACC'], settings['LMUL_ACC'], settings['tail_policy']) )
+
+    settings['param_vector_t']=Setting('vfloat{}m{}_t'.format(settings['ELEN_PARAM'], settings['LMUL']))
+    settings['acc_vector_t']  =Setting('vfloat{}m{}_t'.format(settings['ELEN_ACC'], settings['LMUL_ACC']))
+    settings['VLEV']          =Setting('{}vle{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL']))
+    settings['VSEV']          =Setting('{}vse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL']))
+    settings['VLSEV']         =Setting('{}vlse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL']))
+    settings['VSSEV']         =Setting('{}vsse{}_v_f{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['ELEN_PARAM'], settings['LMUL']))
+    settings['VSETVL']        =Setting('{}vsetvl_e{}m{}'.format(settings['__riscv_'], settings['ELEN_PARAM'], settings['LMUL']))
+
+
+    to_stdout = (settings['output'].value == '-')  # output=- writes the kernel to stdout
+    if not to_stdout:
+        print("Writing {}".format(settings['output'].value), file=sys.stderr)
+    # NOTE(review): in the stdout case open() wraps fd 1 and closes it when the with-block exits.
+    with open(sys.stdout.fileno() if to_stdout else settings['output'].value, 'w') as destination_file:
+        def OUTPUT(*args, **kwargs):  # print() bound to the destination file; handed to the generator
+            print(*args, file=destination_file, **kwargs)
+        # Header comment of the generated file: configurable settings first, then the derived ones.
+        OUTPUT("/*\n\nAUTOGENERATED KERNEL\nSettings:\n {}".format(" ".join([ "{}={}\n".format(k, repr(settings[k].value)) for k in sorted(settings.keys()) if settings[k].configurable])))
+        OUTPUT("Derived:\n {}\n*/\n".format(" ".join([ "{}={}\n".format(k, repr(settings[k].value)) for k in sorted(settings.keys()) if not settings[k].configurable])))
+
+        OUTPUT('#include "common.h"')
+        OUTPUT("\n")
+
+        if settings['op'].value in ('gemm', 'trmm'):  # the ENUM converter already limits 'op' to these two values
+            generate_gemm_kernel(settings, OUTPUT)
+        else:
+            ERROR("unsupported kernel type {}".format(settings['op']))
+
+if __name__ == "__main__":
+    main()
diff --git a/kernel/riscv64/iamax_rvv.c b/kernel/riscv64/iamax_rvv.c
new file mode 100644
index 000000000..8362d7cef
--- /dev/null
+++ b/kernel/riscv64/iamax_rvv.c
@@ -0,0 +1,149 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if defined(DOUBLE) /* double precision: the e64/f64/u64 m8 intrinsic variants */
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m8()
+#define FLOAT_V_T               vfloat64m8_t
+#define FLOAT_V_T_M1            vfloat64m1_t /* m1 type holding the reduction result */
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f64m8_f64m1
+#define MASK_T                  vbool8_t
+#define VMFLTVF_FLOAT           __riscv_vmflt_vf_f64m8_b8 /* not referenced in this file */
+#define VMFLTVV_FLOAT           __riscv_vmflt_vv_f64m8_b8
+#define VMFGEVF_FLOAT           __riscv_vmfge_vf_f64m8_b8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFABSV_FLOAT            __riscv_vfabs_v_f64m8
+#define VFMAXVV_FLOAT_TU        __riscv_vfmax_vv_f64m8_tu
+#define VFIRSTM                 __riscv_vfirst_m_b8
+#define UINT_V_T                vuint64m8_t /* index vector type */
+#define VIDV_MASK_UINT_TU       __riscv_vid_v_u64m8_tumu
+#define VIDV_UINT               __riscv_vid_v_u64m8 /* not referenced in this file */
+#define VADDVX_MASK_UINT_TU     __riscv_vadd_vx_u64m8_tumu
+#define VADDVX_UINT             __riscv_vadd_vx_u64m8 /* not referenced in this file */
+#define VMVVX_UINT              __riscv_vmv_v_x_u64m8
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
+#define VSLIDEDOWN_UINT         __riscv_vslidedown_vx_u64m8
+#define VMVVXS_UINT             __riscv_vmv_x_s_u64m8_u64
+#else /* single precision: the e32/f32/u32 m8 intrinsic variants */
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m8()
+#define FLOAT_V_T               vfloat32m8_t
+#define FLOAT_V_T_M1            vfloat32m1_t /* m1 type holding the reduction result */
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
+#define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f32m8_f32m1
+#define MASK_T                  vbool4_t
+#define VMFLTVF_FLOAT           __riscv_vmflt_vf_f32m8_b4 /* not referenced in this file */
+#define VMFLTVV_FLOAT           __riscv_vmflt_vv_f32m8_b4
+#define VMFGEVF_FLOAT           __riscv_vmfge_vf_f32m8_b4
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFABSV_FLOAT            __riscv_vfabs_v_f32m8
+#define VFMAXVV_FLOAT_TU        __riscv_vfmax_vv_f32m8_tu
+#define VFIRSTM                 __riscv_vfirst_m_b4
+#define UINT_V_T                vuint32m8_t /* index vector type */
+#define VIDV_MASK_UINT_TU       __riscv_vid_v_u32m8_tumu
+#define VIDV_UINT               __riscv_vid_v_u32m8 /* not referenced in this file */
+#define VADDVX_MASK_UINT_TU     __riscv_vadd_vx_u32m8_tumu
+#define VADDVX_UINT             __riscv_vadd_vx_u32m8 /* not referenced in this file */
+#define VMVVX_UINT              __riscv_vmv_v_x_u32m8
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
+#define VSLIDEDOWN_UINT         __riscv_vslidedown_vx_u32m8
+#define VMVVXS_UINT             __riscv_vmv_x_s_u32m8_u32
+#endif /* DOUBLE */
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) /* 1-based index of an element of x with the largest absolute value; 0 if n <= 0 or inc_x <= 0 */
+{
+    unsigned int max_index = 0;
+    if (n <= 0 || inc_x <= 0) return(max_index);
+
+    FLOAT_V_T vx, v_max;
+    UINT_V_T v_max_index;
+    MASK_T mask;
+    /* v_max: per-lane running maximum of |x|; v_max_index: element index at which each lane saw it */
+    size_t vlmax = VSETVL_MAX;
+    v_max_index = VMVVX_UINT(0, vlmax);
+    v_max = VFMVVF_FLOAT(-1, vlmax); /* -1 is below every absolute value, so any loaded element replaces it */
+    BLASLONG j=0; /* index of the first element of the current chunk */
+    FLOAT maxf=0.0;
+    
+    if(inc_x == 1) {
+        /* unit stride: contiguous loads */
+        for (size_t vl; n > 0; n -= vl, x += vl, j += vl) {
+            vl = VSETVL(n);
+
+            vx = VLEV_FLOAT(x, vl);
+            vx = VFABSV_FLOAT(vx, vl);
+
+            //index where element greater than v_max
+            mask = VMFLTVV_FLOAT(v_max, vx, vl);
+            v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); /* lanes with a new maximum receive their lane id ... */
+            v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl); /* ... plus the chunk offset j; other lanes keep their index */
+
+            //update v_max
+            v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl); /* _tu: lanes beyond vl keep their earlier value on a short chunk */
+        }
+
+    } else {
+        /* strided: stride_x is the distance between consecutive elements in bytes */
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) {
+            vl = VSETVL(n);
+
+            vx = VLSEV_FLOAT(x, stride_x, vl);
+            vx = VFABSV_FLOAT(vx, vl);
+
+            //index where element greater than v_max
+            mask = VMFLTVV_FLOAT(v_max, vx, vl);
+            v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl); /* same index bookkeeping as the unit-stride loop */
+            v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl);
+
+            //update v_max
+            v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl);
+        }
+  
+    }
+    /* Final selection: reduce to the overall maximum, find the first lane holding it, read that lane's index. */
+    FLOAT_V_T_M1 v_res;
+    v_res = VFMVVF_FLOAT_M1(0, vlmax);
+
+    v_res = VFREDMAXVS_FLOAT(v_max, v_res, vlmax);
+    maxf = VFMVFS_FLOAT_M1(v_res);
+    mask = VMFGEVF_FLOAT(v_max, maxf, vlmax);
+    max_index = VFIRSTM(mask, vlmax); /* lane number here; overwritten with the element index below */
+    /* NOTE(review): the first matching lane need not hold the smallest index when the maximum occurs more than once - confirm intended */
+    v_max_index = VSLIDEDOWN_UINT(v_max_index, max_index, vlmax); /* move the chosen lane to element 0 */
+    max_index = VMVVXS_UINT(v_max_index);
+
+    return(max_index+1); /* convert the 0-based index to 1-based */
+}
diff --git a/kernel/riscv64/iamax_vector.c b/kernel/riscv64/iamax_vector.c
index ecb4cd7a9..800312400 100644
--- a/kernel/riscv64/iamax_vector.c
+++ b/kernel/riscv64/iamax_vector.c
@@ -27,118 +27,123 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #include <math.h>
+#include <float.h>
 
 #if defined(DOUBLE)
 
-#define ABS fabs
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m8_t
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
+#define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
-#define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
-#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
-#define VFMAXVV_FLOAT vfmax_vv_f64m8
-#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8
-#define VMFIRSTM vmfirst_m_b8
-#define UINT_V_T vuint64m8_t
-#define VIDV_MASK_UINT vid_v_u64m8_m
-#define VIDV_UINT vid_v_u64m8
-#define VADDVX_MASK_UINT vadd_vx_u64m8_m
-#define VADDVX_UINT vadd_vx_u64m8
-#define VMVVX_UINT vmv_v_x_u64m8
+#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDMAXVS_FLOAT(va, vb, gvl) vfredmax_vs_f64m4_f64m1(v_res, va, vb, gvl)
+#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m4_m)
+#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m4)(vm, compressed, va, gvl)
+#else
+#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f64m4_f64m1)
+#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m4_mu)
+#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m4)
+#endif
+#define MASK_T vbool16_t
+#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f64m4_b16)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
+#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f64m4)
+#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f64m4_b16)
+#define VMFIRSTM RISCV_RVV(vfirst_m_b16)
+#define UINT_V_T vuint64m4_t
+#define VIDV_UINT RISCV_RVV(vid_v_u64m4)
+#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m4)
+#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m4)
+#define VFABS_FLOAT RISCV_RVV(vfabs_v_f64m4)
+#define VMV_X RISCV_RVV(vmv_x_s_u64m4_u64)
 #else
 
-#define ABS fabsf
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
+#define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
-#define MASK_T vbool4_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
-#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
-#define VFMAXVV_FLOAT vfmax_vv_f32m8
-#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4
-#define VMFIRSTM vmfirst_m_b4
-#define UINT_V_T vuint32m8_t
-#define VIDV_MASK_UINT vid_v_u32m8_m
-#define VIDV_UINT vid_v_u32m8
-#define VADDVX_MASK_UINT vadd_vx_u32m8_m
-#define VADDVX_UINT vadd_vx_u32m8
-#define VMVVX_UINT vmv_v_x_u32m8
+#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDMAXVS_FLOAT(va, vb, gvl) vfredmax_vs_f32m4_f32m1(v_res, va, vb, gvl)
+#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m4_m)
+#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m4)(vm, compressed, va, gvl)
+#else
+#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f32m4_f32m1)
+#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m4_mu)
+#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m4)
+#endif
+#define MASK_T vbool8_t
+#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f32m4_b8)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
+#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f32m4)
+#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f32m4_b8)
+#define VMFIRSTM RISCV_RVV(vfirst_m_b8)
+#define UINT_V_T vuint32m4_t
+#define VIDV_UINT RISCV_RVV(vid_v_u32m4)
+#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m4)
+#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m4)
+#define VFABS_FLOAT RISCV_RVV(vfabs_v_f32m4)
+#define VMV_X RISCV_RVV(vmv_x_s_u32m4_u32)
 #endif
 
 
 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
-	BLASLONG i=0, j=0;
-	FLOAT maxf=0.0;
+        BLASLONG i=0, j=0;
         unsigned int max_index = 0;
-	if (n <= 0 || inc_x <= 0) return(max_index);
+        if (n <= 0 || inc_x <= 0) return(max_index);
+        FLOAT maxf=-FLT_MAX;
 
         FLOAT_V_T vx, v_max;
         UINT_V_T v_max_index;
         MASK_T mask;
         unsigned int gvl = 0;
-        FLOAT_V_T_M1 v_res, v_z0;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_z0 = VFMVVF_FLOAT_M1(0, gvl);
+        FLOAT_V_T_M1 v_res;
+        v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1);
+
+        gvl = VSETVL(n);
+        UINT_V_T vid = VIDV_UINT(gvl);
 
         if(inc_x == 1){
-                gvl = VSETVL(n);
                 v_max_index = VMVVX_UINT(0, gvl);
-                v_max = VFMVVF_FLOAT(-1, gvl);
+                v_max = VFMVVF_FLOAT(-FLT_MAX, gvl);
                 for(i=0,j=0; i < n/gvl; i++){
                         vx = VLEV_FLOAT(&x[j], gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(vx, 0, gvl);
-                        vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
+                        vx = VFABS_FLOAT(vx, gvl);
 
                         //index where element greater than v_max
                         mask = VMFLTVV_FLOAT(v_max, vx, gvl);
-                        v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl);
-                        v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl);
+                        v_max_index = VADDVX_MASK_UINT(mask, v_max_index, vid, j, gvl);
 
                         //update v_max and start_index j
                         v_max = VFMAXVV_FLOAT(v_max, vx, gvl);
                         j += gvl;
                 }
-                v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
-                maxf = *((FLOAT*)&v_res);
+                v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
+                maxf = EXTRACT_FLOAT(v_res);
                 mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
-                max_index = VMFIRSTM(mask,gvl);
-                max_index = *((unsigned int*)&v_max_index+max_index);
+                UINT_V_T compressed;
+                compressed = VCOMPRESS(v_max_index, mask, gvl);
+                max_index = VMV_X(compressed);
 
                 if(j < n){
                         gvl = VSETVL(n-j);
-                        vx = VLEV_FLOAT(&x[j], gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(vx, 0, gvl);
-                        v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
+                        v_max = VLEV_FLOAT(&x[j], gvl);
+                        v_max = VFABS_FLOAT(v_max, gvl);
 
-                        v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
-                        FLOAT cur_maxf = *((FLOAT*)&v_res);
+                        v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
+                        FLOAT cur_maxf = EXTRACT_FLOAT(v_res);
                         if(cur_maxf > maxf){
                                 //tail index
-                                v_max_index = VIDV_UINT(gvl);
-                                v_max_index = VADDVX_UINT(v_max_index, j, gvl);
+                                v_max_index = VADDVX_UINT(vid, j, gvl);
 
                                 mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
-                                max_index = VMFIRSTM(mask,gvl);
-                                max_index = *((unsigned int*)&v_max_index+max_index);
+                                UINT_V_T compressed;
+                                compressed = VCOMPRESS(v_max_index, mask, gvl);
+                                max_index = VMV_X(compressed);
                         }
                 }
         }else{
@@ -146,51 +151,48 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                 unsigned int stride_x = inc_x * sizeof(FLOAT);
                 unsigned int idx = 0, inc_v = gvl * inc_x;
 
+                v_max = VFMVVF_FLOAT(-FLT_MAX, gvl);
                 v_max_index = VMVVX_UINT(0, gvl);
-                v_max = VFMVVF_FLOAT(-1, gvl);
                 for(i=0,j=0; i < n/gvl; i++){
                         vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(vx, 0, gvl);
-                        vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
+                        vx = VFABS_FLOAT(vx, gvl);
 
                         //index where element greater than v_max
                         mask = VMFLTVV_FLOAT(v_max, vx, gvl);
-                        v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl);
-                        v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl);
+                        v_max_index = VADDVX_MASK_UINT(mask, v_max_index, vid, j, gvl);
 
                         //update v_max and start_index j
                         v_max = VFMAXVV_FLOAT(v_max, vx, gvl);
                         j += gvl;
                         idx += inc_v;
                 }
-                v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
-                maxf = *((FLOAT*)&v_res);
+
+                v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
+                maxf = EXTRACT_FLOAT(v_res);
                 mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
-                max_index = VMFIRSTM(mask,gvl);
-                max_index = *((unsigned int*)&v_max_index+max_index);
+                UINT_V_T compressed;
+                compressed = VCOMPRESS(v_max_index, mask, gvl);
+                max_index = VMV_X(compressed);
 
                 if(j < n){
                         gvl = VSETVL(n-j);
-                        vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(vx, 0, gvl);
-                        v_max = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
+                        v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+                        v_max = VFABS_FLOAT(v_max, gvl);
+
+                        v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
+                        FLOAT cur_maxf = EXTRACT_FLOAT(v_res);
 
-                        v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
-                        FLOAT cur_maxf = *((FLOAT*)&v_res);
                         if(cur_maxf > maxf){
                                 //tail index
-                                v_max_index = VIDV_UINT(gvl);
-                                v_max_index = VADDVX_UINT(v_max_index, j, gvl);
+                                v_max_index = VADDVX_UINT(vid, j, gvl);
 
                                 mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
-                                max_index = VMFIRSTM(mask,gvl);
-                                max_index = *((unsigned int*)&v_max_index+max_index);
+
+                                UINT_V_T compressed;
+                                compressed = VCOMPRESS(v_max_index, mask, gvl);
+                                max_index = VMV_X(compressed);
                         }
                 }
         }
-	return(max_index+1);
+        return(max_index+1);
 }
-
-
diff --git a/kernel/riscv64/iamin_rvv.c b/kernel/riscv64/iamin_rvv.c
new file mode 100644
index 000000000..f90dbb545
--- /dev/null
+++ b/kernel/riscv64/iamin_rvv.c
@@ -0,0 +1,150 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <float.h>
+
+#if defined(DOUBLE)  // double precision: 64-bit elements, LMUL=8 register groups
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m8()
+#define FLOAT_V_T               vfloat64m8_t
+#define FLOAT_V_T_M1            vfloat64m1_t  // LMUL=1 vector, holds the reduction result
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f64m8_f64m1
+#define MASK_T                  vbool8_t  // mask type for SEW/LMUL = 64/8
+#define VMFLTVF_FLOAT           __riscv_vmflt_vf_f64m8_b8
+#define VMFLTVV_FLOAT           __riscv_vmflt_vv_f64m8_b8
+#define VMFLEVF_FLOAT           __riscv_vmfle_vf_f64m8_b8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFABSV_FLOAT            __riscv_vfabs_v_f64m8
+#define VFMINVV_FLOAT_TU        __riscv_vfmin_vv_f64m8_tu  // _tu: tail-undisturbed policy
+#define VFIRSTM                 __riscv_vfirst_m_b8
+#define UINT_V_T                vuint64m8_t
+#define VIDV_MASK_UINT_TU       __riscv_vid_v_u64m8_tumu  // _tumu: tail- and mask-undisturbed
+#define VIDV_UINT               __riscv_vid_v_u64m8
+#define VADDVX_MASK_UINT_TU     __riscv_vadd_vx_u64m8_tumu  // _tumu: tail- and mask-undisturbed
+#define VADDVX_UINT             __riscv_vadd_vx_u64m8
+#define VMVVX_UINT              __riscv_vmv_v_x_u64m8
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
+#define VSLIDEDOWN_UINT         __riscv_vslidedown_vx_u64m8
+#define VMVVXS_UINT             __riscv_vmv_x_s_u64m8_u64
+#else  // single precision: 32-bit elements, LMUL=8 register groups
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m8()
+#define FLOAT_V_T               vfloat32m8_t
+#define FLOAT_V_T_M1            vfloat32m1_t  // LMUL=1 vector, holds the reduction result
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
+#define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f32m8_f32m1
+#define MASK_T                  vbool4_t  // mask type for SEW/LMUL = 32/8
+#define VMFLTVF_FLOAT           __riscv_vmflt_vf_f32m8_b4
+#define VMFLTVV_FLOAT           __riscv_vmflt_vv_f32m8_b4
+#define VMFLEVF_FLOAT           __riscv_vmfle_vf_f32m8_b4
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFABSV_FLOAT            __riscv_vfabs_v_f32m8
+#define VFMINVV_FLOAT_TU        __riscv_vfmin_vv_f32m8_tu  // _tu: tail-undisturbed policy
+#define VFIRSTM                 __riscv_vfirst_m_b4
+#define UINT_V_T                vuint32m8_t
+#define VIDV_MASK_UINT_TU       __riscv_vid_v_u32m8_tumu  // _tumu: tail- and mask-undisturbed
+#define VIDV_UINT               __riscv_vid_v_u32m8
+#define VADDVX_MASK_UINT_TU     __riscv_vadd_vx_u32m8_tumu  // _tumu: tail- and mask-undisturbed
+#define VADDVX_UINT             __riscv_vadd_vx_u32m8
+#define VMVVX_UINT              __riscv_vmv_v_x_u32m8
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
+#define VSLIDEDOWN_UINT         __riscv_vslidedown_vx_u32m8
+#define VMVVXS_UINT             __riscv_vmv_x_s_u32m8_u32
+#endif  // DOUBLE
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{   // Returns the 1-based position of an element of x with minimum |value|; 0 if n <= 0 or inc_x <= 0.
+    unsigned int min_index = 0;
+    if (n <= 0 || inc_x <= 0) return(min_index);
+    // Approach: track a per-lane running minimum of |x| and the index that set it, then reduce across lanes.
+    FLOAT_V_T vx, v_min;
+    UINT_V_T v_min_index;
+    MASK_T mask;
+    // State is initialised at the maximum vector length; the loops below use tail-undisturbed updates.
+    size_t vlmax = VSETVL_MAX;
+    v_min_index = VMVVX_UINT(0, vlmax);     // every lane starts with index 0
+    v_min = VFMVVF_FLOAT(FLT_MAX, vlmax);   // sentinel; NOTE(review): FLT_MAX is used for DOUBLE too - confirm
+    BLASLONG j=0;                           // 0-based index of the first element of the current chunk
+    FLOAT minf=0.0;
+    // Both branches run the same update; they differ only in how x is loaded and advanced.
+    if(inc_x == 1) {
+        // contiguous input: unit-stride loads, x advances by vl elements per chunk
+        for (size_t vl; n > 0; n -= vl, x += vl, j += vl) {
+            vl = VSETVL(n);
+
+            vx = VLEV_FLOAT(x, vl);
+            vx = VFABSV_FLOAT(vx, vl);      // compare magnitudes, not signed values
+
+            // lanes whose |x| is strictly below that lane's running minimum
+            mask = VMFLTVV_FLOAT(vx, v_min, vl);
+            v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl);   // masked lanes := lane number
+            v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl);   // + j => global index
+
+            // update the per-lane running minimum (j itself advances in the loop header)
+            v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl);
+        }
+
+    } else {
+        // strided input: x advances by vl*inc_x elements per chunk
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);   // distance between consecutive elements, in bytes
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) {
+            vl = VSETVL(n);
+
+            vx = VLSEV_FLOAT(x, stride_x, vl);
+            vx = VFABSV_FLOAT(vx, vl);      // compare magnitudes, not signed values
+
+            // lanes whose |x| is strictly below that lane's running minimum
+            mask = VMFLTVV_FLOAT(vx, v_min, vl);
+            v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl);   // masked lanes := lane number
+            v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl);   // + j => global index
+
+            // update the per-lane running minimum (j itself advances in the loop header)
+            v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl);
+        }
+  
+    }
+    // Cross-lane step: reduce the per-lane minima to one scalar, then locate a lane that holds it.
+    FLOAT_V_T_M1 v_res;
+    v_res = VFMVVF_FLOAT_M1(FLT_MAX, vlmax);   // starting value for the reduction
+
+    v_res = VFREDMINVS_FLOAT(v_min, v_res, vlmax);
+    minf = VFMVFS_FLOAT_M1(v_res);
+    mask = VMFLEVF_FLOAT(v_min, minf, vlmax);  // lanes whose running minimum equals the global minimum
+    min_index = VFIRSTM(mask, vlmax);          // lowest-numbered such lane
+    // Slide that lane's stored index down to element 0 and read it back as a scalar.
+    v_min_index = VSLIDEDOWN_UINT(v_min_index, min_index, vlmax);
+    min_index = VMVVXS_UINT(v_min_index);      // NOTE(review): narrows to unsigned int in the DOUBLE build
+    // NOTE(review): on ties the lowest lane wins, which may not be the smallest index - confirm intended.
+    return(min_index+1);                       // convert the 0-based index to 1-based
+}
diff --git a/kernel/riscv64/iamin_vector.c b/kernel/riscv64/iamin_vector.c
index c72bb94cc..0e591e697 100644
--- a/kernel/riscv64/iamin_vector.c
+++ b/kernel/riscv64/iamin_vector.c
@@ -31,85 +31,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if defined(DOUBLE)
 
-#define ABS fabs
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n)
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
-#define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
-#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
-#define VFMINVV_FLOAT vfmin_vv_f64m8
-#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8
-#define VMFIRSTM vmfirst_m_b8
-#define UINT_V_T vuint64m8_t
+#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDMINVS_FLOAT(va, vb, gvl) vfredmin_vs_f64m8_f64m1(v_res, va, vb, gvl)
 #define VIDV_MASK_UINT vid_v_u64m8_m
-#define VIDV_UINT vid_v_u64m8
 #define VADDVX_MASK_UINT vadd_vx_u64m8_m
-#define VADDVX_UINT vadd_vx_u64m8
-#define VMVVX_UINT vmv_v_x_u64m8
+#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m8)(vm, compressed, va, gvl)
+#else
+#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f64m8_f64m1
+#define VIDV_MASK_UINT __riscv_vid_v_u64m8_mu
+#define VADDVX_MASK_UINT __riscv_vadd_vx_u64m8_mu
+#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m8)
+#endif
+#define MASK_T vbool8_t
+#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f64m8_b8)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
+#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f64m8)
+#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f64m8_b8)
+#define VMFIRSTM RISCV_RVV(vfirst_m_b8)
+#define UINT_V_T vuint64m8_t
+#define VIDV_UINT RISCV_RVV(vid_v_u64m8)
+#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m8)
+#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m8)
+#define VFABS_FLOAT RISCV_RVV(vfabs_v_f64m8)
+#define VMV_X RISCV_RVV(vmv_x_s_u64m8_u64)
 #else
 
-#define ABS fabsf
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n)
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
-#define MASK_T vbool4_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
-#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
-#define VFMINVV_FLOAT vfmin_vv_f32m8
-#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4
-#define VMFIRSTM vmfirst_m_b4
-#define UINT_V_T vuint32m8_t
+#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDMINVS_FLOAT(va, vb, gvl) vfredmin_vs_f32m8_f32m1(v_res, va, vb, gvl)
 #define VIDV_MASK_UINT vid_v_u32m8_m
-#define VIDV_UINT vid_v_u32m8
 #define VADDVX_MASK_UINT vadd_vx_u32m8_m
-#define VADDVX_UINT vadd_vx_u32m8
-#define VMVVX_UINT vmv_v_x_u32m8
+#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m8)(vm, compressed, va, gvl)
+#else
+#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1
+#define VIDV_MASK_UINT __riscv_vid_v_u32m8_mu
+#define VADDVX_MASK_UINT __riscv_vadd_vx_u32m8_mu
+#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m8)
+#endif
+#define MASK_T vbool4_t
+#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f32m8_b4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
+#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f32m8)
+#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f32m8_b4)
+#define VMFIRSTM RISCV_RVV(vfirst_m_b4)
+#define UINT_V_T vuint32m8_t
+#define VIDV_UINT RISCV_RVV(vid_v_u32m8)
+#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m8)
+#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m8)
+#define VFABS_FLOAT RISCV_RVV(vfabs_v_f32m8)
+#define VMV_X RISCV_RVV(vmv_x_s_u32m8_u32)
 #endif
 
 
 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
-	BLASLONG i=0, j=0;
-	FLOAT minf=FLT_MAX;
+        BLASLONG i=0, j=0;
         unsigned int min_index = 0;
-	if (n <= 0 || inc_x <= 0) return(min_index);
+        if (n <= 0 || inc_x <= 0) return(min_index);
+        FLOAT minf=FLT_MAX;
 
         FLOAT_V_T vx, v_min;
         UINT_V_T v_min_index;
         MASK_T mask;
         unsigned int gvl = 0;
-        FLOAT_V_T_M1 v_res, v_max;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);
+        FLOAT_V_T_M1 v_res;
+        v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1);
 
         if(inc_x == 1){
                 gvl = VSETVL(n);
-                v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
                 v_min_index = VMVVX_UINT(0, gvl);
+                v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
                 for(i=0,j=0; i < n/gvl; i++){
                         vx = VLEV_FLOAT(&x[j], gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(vx, 0, gvl);
-                        vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
+                        vx = VFABS_FLOAT(vx, gvl);
 
-                        //index where element less than v_min
-                        mask = VMFLTVV_FLOAT(vx, v_min, gvl);
+                        //index where element less than v_min (mask set where v_min > element)
+                        mask = VMFGTVV_FLOAT(v_min, vx, gvl);
                         v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl);
                         v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl);
 
@@ -117,29 +125,29 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                         v_min = VFMINVV_FLOAT(v_min, vx, gvl);
                         j += gvl;
                 }
-                v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
-                minf = *((FLOAT*)&v_res);
+                v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
+                minf = EXTRACT_FLOAT(v_res);
                 mask = VMFLEVF_FLOAT(v_min, minf, gvl);
-                min_index = VMFIRSTM(mask,gvl);
-                min_index = *((unsigned int*)&v_min_index+min_index);
+                UINT_V_T compressed;
+                compressed = VCOMPRESS(v_min_index, mask, gvl);
+                min_index = VMV_X(compressed);
 
                 if(j < n){
                         gvl = VSETVL(n-j);
-                        vx = VLEV_FLOAT(&x[j], gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(vx, 0, gvl);
-                        v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
+                        v_min = VLEV_FLOAT(&x[j], gvl);
+                        v_min = VFABS_FLOAT(v_min, gvl);
 
-                        v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
-                        FLOAT cur_minf = *((FLOAT*)&v_res);
+                        v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
+                        FLOAT cur_minf = EXTRACT_FLOAT(v_res);
                         if(cur_minf < minf){
                                 //tail index
                                 v_min_index = VIDV_UINT(gvl);
                                 v_min_index = VADDVX_UINT(v_min_index, j, gvl);
 
                                 mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
-                                min_index = VMFIRSTM(mask,gvl);
-                                min_index = *((unsigned int*)&v_min_index+min_index);
+                                UINT_V_T compressed;
+                                compressed = VCOMPRESS(v_min_index, mask, gvl);
+                                min_index = VMV_X(compressed);
                         }
                 }
         }else{
@@ -151,12 +159,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                 v_min_index = VMVVX_UINT(0, gvl);
                 for(i=0,j=0; i < n/gvl; i++){
                         vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(vx, 0, gvl);
-                        vx = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
+                        vx = VFABS_FLOAT(vx, gvl);
 
-                        //index where element less than v_min
-                        mask = VMFLTVV_FLOAT(vx, v_min, gvl);
+                        //index where element less than v_min (mask set where v_min > element)
+                        mask = VMFGTVV_FLOAT(v_min, vx, gvl);
                         v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl);
                         v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl);
 
@@ -165,33 +171,31 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                         j += gvl;
                         idx += inc_v;
                 }
-                v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
-                minf = *((FLOAT*)&v_res);
+                v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
+                minf = EXTRACT_FLOAT(v_res);
                 mask = VMFLEVF_FLOAT(v_min, minf, gvl);
-                min_index = VMFIRSTM(mask,gvl);
-                min_index = *((unsigned int*)&v_min_index+min_index);
+                UINT_V_T compressed;
+                compressed = VCOMPRESS(v_min_index, mask, gvl);
+                min_index = VMV_X(compressed);
 
                 if(j < n){
                         gvl = VSETVL(n-j);
-                        vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(vx, 0, gvl);
-                        v_min = VFRSUBVF_MASK_FLOAT(mask, vx, vx, 0, gvl);
+                        v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+                        v_min = VFABS_FLOAT(v_min, gvl);
 
-                        v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
-                        FLOAT cur_minf = *((FLOAT*)&v_res);
+                        v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
+                        FLOAT cur_minf = EXTRACT_FLOAT(v_res);
                         if(cur_minf < minf){
                                 //tail index
                                 v_min_index = VIDV_UINT(gvl);
                                 v_min_index = VADDVX_UINT(v_min_index, j, gvl);
 
                                 mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
-                                min_index = VMFIRSTM(mask,gvl);
-                                min_index = *((unsigned int*)&v_min_index+min_index);
+                                UINT_V_T compressed;
+                                compressed = VCOMPRESS(v_min_index, mask, gvl);
+                                min_index = VMV_X(compressed);
                         }
                 }
         }
-	return(min_index+1);
+        return(min_index+1);
 }
-
-
diff --git a/kernel/riscv64/imax_rvv.c b/kernel/riscv64/imax_rvv.c
new file mode 100644
index 000000000..b1a77b178
--- /dev/null
+++ b/kernel/riscv64/imax_rvv.c
@@ -0,0 +1,146 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <float.h>
+
+#if defined(DOUBLE)  // double precision: 64-bit elements, LMUL=8 register groups
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m8()
+#define FLOAT_V_T               vfloat64m8_t
+#define FLOAT_V_T_M1            vfloat64m1_t  // LMUL=1 vector, holds the reduction result
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f64m8_f64m1
+#define MASK_T                  vbool8_t  // mask type for SEW/LMUL = 64/8
+#define VMFLTVF_FLOAT           __riscv_vmflt_vf_f64m8_b8
+#define VMFLTVV_FLOAT           __riscv_vmflt_vv_f64m8_b8
+#define VMFGEVF_FLOAT           __riscv_vmfge_vf_f64m8_b8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFMAXVV_FLOAT_TU        __riscv_vfmax_vv_f64m8_tu  // _tu: tail-undisturbed policy
+#define VFIRSTM                 __riscv_vfirst_m_b8
+#define UINT_V_T                vuint64m8_t
+#define VIDV_MASK_UINT_TU       __riscv_vid_v_u64m8_tumu  // _tumu: tail- and mask-undisturbed
+#define VIDV_UINT               __riscv_vid_v_u64m8
+#define VADDVX_MASK_UINT_TU     __riscv_vadd_vx_u64m8_tumu  // _tumu: tail- and mask-undisturbed
+#define VADDVX_UINT             __riscv_vadd_vx_u64m8
+#define VMVVX_UINT              __riscv_vmv_v_x_u64m8
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
+#define VSLIDEDOWN_UINT         __riscv_vslidedown_vx_u64m8
+#define VMVVXS_UINT             __riscv_vmv_x_s_u64m8_u64
+#else  // single precision: 32-bit elements, LMUL=8 register groups
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m8()
+#define FLOAT_V_T               vfloat32m8_t
+#define FLOAT_V_T_M1            vfloat32m1_t  // LMUL=1 vector, holds the reduction result
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
+#define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f32m8_f32m1
+#define MASK_T                  vbool4_t  // mask type for SEW/LMUL = 32/8
+#define VMFLTVF_FLOAT           __riscv_vmflt_vf_f32m8_b4
+#define VMFLTVV_FLOAT           __riscv_vmflt_vv_f32m8_b4
+#define VMFGEVF_FLOAT           __riscv_vmfge_vf_f32m8_b4
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFMAXVV_FLOAT_TU        __riscv_vfmax_vv_f32m8_tu  // _tu: tail-undisturbed policy
+#define VFIRSTM                 __riscv_vfirst_m_b4
+#define UINT_V_T                vuint32m8_t
+#define VIDV_MASK_UINT_TU       __riscv_vid_v_u32m8_tumu  // _tumu: tail- and mask-undisturbed
+#define VIDV_UINT               __riscv_vid_v_u32m8
+#define VADDVX_MASK_UINT_TU     __riscv_vadd_vx_u32m8_tumu  // _tumu: tail- and mask-undisturbed
+#define VADDVX_UINT             __riscv_vadd_vx_u32m8
+#define VMVVX_UINT              __riscv_vmv_v_x_u32m8
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
+#define VSLIDEDOWN_UINT         __riscv_vslidedown_vx_u32m8
+#define VMVVXS_UINT             __riscv_vmv_x_s_u32m8_u32
+#endif  // DOUBLE
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{   // Returns the 1-based position of an element of x with maximum (signed) value; 0 if n <= 0 or inc_x <= 0.
+    unsigned int max_index = 0;
+    if (n <= 0 || inc_x <= 0) return(max_index);
+    // Approach: track a per-lane running maximum and the index that set it, then reduce across lanes.
+    FLOAT_V_T vx, v_max;
+    UINT_V_T v_max_index;
+    MASK_T mask;
+    // State is initialised at the maximum vector length; the loops below use tail-undisturbed updates.
+    size_t vlmax = VSETVL_MAX;
+    v_max_index = VMVVX_UINT(0, vlmax);     // every lane starts with index 0
+    v_max = VFMVVF_FLOAT(-FLT_MAX, vlmax);  // sentinel; NOTE(review): -FLT_MAX is used for DOUBLE too - confirm
+    BLASLONG j=0;                           // 0-based index of the first element of the current chunk
+    FLOAT maxf=0.0;
+    // Both branches run the same update; they differ only in how x is loaded and advanced.
+    if(inc_x == 1) {
+        // contiguous input: unit-stride loads, x advances by vl elements per chunk
+        for (size_t vl; n > 0; n -= vl, x += vl, j += vl) {
+            vl = VSETVL(n);
+
+            vx = VLEV_FLOAT(x, vl);
+
+            // lanes whose element is strictly above that lane's running maximum
+            mask = VMFLTVV_FLOAT(v_max, vx, vl);
+            v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl);   // masked lanes := lane number
+            v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl);   // + j => global index
+
+            // update the per-lane running maximum (j itself advances in the loop header)
+            v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl);
+        }
+
+    } else {
+        // strided input: x advances by vl*inc_x elements per chunk
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);   // distance between consecutive elements, in bytes
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) {
+            vl = VSETVL(n);
+
+            vx = VLSEV_FLOAT(x, stride_x, vl);
+
+            // lanes whose element is strictly above that lane's running maximum
+            mask = VMFLTVV_FLOAT(v_max, vx, vl);
+            v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl);   // masked lanes := lane number
+            v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl);   // + j => global index
+
+            // update the per-lane running maximum (j itself advances in the loop header)
+            v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx, vl);
+        }
+  
+    }
+    // Cross-lane step: reduce the per-lane maxima to one scalar, then locate a lane that holds it.
+    FLOAT_V_T_M1 v_res;
+    v_res = VFMVVF_FLOAT_M1(-FLT_MAX, vlmax);  // starting value for the reduction
+
+    v_res = VFREDMAXVS_FLOAT(v_max, v_res, vlmax);
+    maxf = VFMVFS_FLOAT_M1(v_res);
+    mask = VMFGEVF_FLOAT(v_max, maxf, vlmax);  // lanes whose running maximum equals the global maximum
+    max_index = VFIRSTM(mask, vlmax);          // lowest-numbered such lane
+    // Slide that lane's stored index down to element 0 and read it back as a scalar.
+    v_max_index = VSLIDEDOWN_UINT(v_max_index, max_index, vlmax);
+    max_index = VMVVXS_UINT(v_max_index);      // NOTE(review): narrows to unsigned int in the DOUBLE build
+    // NOTE(review): on ties the lowest lane wins, which may not be the smallest index - confirm intended.
+    return(max_index+1);                       // convert the 0-based index to 1-based
+}
diff --git a/kernel/riscv64/imax_vector.c b/kernel/riscv64/imax_vector.c
index c2d787ab8..308fa15a4 100644
--- a/kernel/riscv64/imax_vector.c
+++ b/kernel/riscv64/imax_vector.c
@@ -31,68 +31,80 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if defined(DOUBLE)
 
-#define ABS fabs
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n)
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
+#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDMAXVS_FLOAT(va, vb, gvl) vfredmax_vs_f64m8_f64m1(v_res, va, vb, gvl)
+#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_m)
+#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_m)
+#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m8)(vm, compressed, va, gvl)
+#else
+#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f64m8_f64m1)
+#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_mu)
+#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_mu)
+#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m8)
+#endif
 #define MASK_T vbool8_t
-#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMAXVV_FLOAT vfmax_vv_f64m8
-#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8
-#define VMFIRSTM vmfirst_m_b8
+#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f64m8_b8)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
+#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f64m8)
+#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f64m8_b8)
+#define VMFIRSTM RISCV_RVV(vfirst_m_b8)
 #define UINT_V_T vuint64m8_t
-#define VIDV_MASK_UINT vid_v_u64m8_m
-#define VIDV_UINT vid_v_u64m8
-#define VADDVX_MASK_UINT vadd_vx_u64m8_m
-#define VADDVX_UINT vadd_vx_u64m8
-#define VMVVX_UINT vmv_v_x_u64m8
+#define VIDV_UINT RISCV_RVV(vid_v_u64m8)
+#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m8)
+#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m8)
+#define VMV_X RISCV_RVV(vmv_x_s_u64m8_u64)
 #else
 
-#define ABS fabsf
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n)
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
+#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDMAXVS_FLOAT(va, vb, gvl) vfredmax_vs_f32m8_f32m1(v_res, va, vb, gvl)
+#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_m)
+#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_m)
+#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m8)(vm, compressed, va, gvl)
+#else
+#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f32m8_f32m1)
+#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_mu)
+#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_mu)
+#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m8)
+#endif
 #define MASK_T vbool4_t
-#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMAXVV_FLOAT vfmax_vv_f32m8
-#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4
-#define VMFIRSTM vmfirst_m_b4
+#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f32m8_b4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
+#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f32m8)
+#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f32m8_b4)
+#define VMFIRSTM RISCV_RVV(vfirst_m_b4)
 #define UINT_V_T vuint32m8_t
-#define VIDV_MASK_UINT vid_v_u32m8_m
-#define VIDV_UINT vid_v_u32m8
-#define VADDVX_MASK_UINT vadd_vx_u32m8_m
-#define VADDVX_UINT vadd_vx_u32m8
-#define VMVVX_UINT vmv_v_x_u32m8
+#define VIDV_UINT RISCV_RVV(vid_v_u32m8)
+#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m8)
+#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m8)
+#define VMV_X RISCV_RVV(vmv_x_s_u32m8_u32)
 #endif
 
 
 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
-	BLASLONG i=0, j=0;
+        BLASLONG i=0, j=0;
         unsigned int max_index = 0;
-	if (n <= 0 || inc_x <= 0) return(max_index);
-	FLOAT maxf=-FLT_MAX;
+        if (n <= 0 || inc_x <= 0) return(max_index);
+        FLOAT maxf=-FLT_MAX;
 
         FLOAT_V_T vx, v_max;
         UINT_V_T v_max_index;
         MASK_T mask;
         unsigned int gvl = 0;
-        FLOAT_V_T_M1 v_res, v_min;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl);
+        FLOAT_V_T_M1 v_res;
+        v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1);
 
         if(inc_x == 1){
                 gvl = VSETVL(n);
@@ -104,32 +116,34 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                         //index where element greater than v_max
                         mask = VMFLTVV_FLOAT(v_max, vx, gvl);
                         v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl);
-                        v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl);
+                        v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl);
 
                         //update v_max and start_index j
                         v_max = VFMAXVV_FLOAT(v_max, vx, gvl);
                         j += gvl;
                 }
-                v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
-                maxf = *((FLOAT*)&v_res);
+                v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
+                maxf = EXTRACT_FLOAT(v_res);
                 mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
-                max_index = VMFIRSTM(mask,gvl);
-                max_index = *((unsigned int*)&v_max_index+max_index);
+                UINT_V_T compressed;
+                compressed = VCOMPRESS(v_max_index, mask, gvl);
+                max_index = VMV_X(compressed);
 
                 if(j < n){
                         gvl = VSETVL(n-j);
                         v_max = VLEV_FLOAT(&x[j], gvl);
 
-                        v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
-                        FLOAT cur_maxf = *((FLOAT*)&v_res);
+                        v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
+                        FLOAT cur_maxf = EXTRACT_FLOAT(v_res);
                         if(cur_maxf > maxf){
                                 //tail index
                                 v_max_index = VIDV_UINT(gvl);
                                 v_max_index = VADDVX_UINT(v_max_index, j, gvl);
 
                                 mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
-                                max_index = VMFIRSTM(mask,gvl);
-                                max_index = *((unsigned int*)&v_max_index+max_index);
+                                UINT_V_T compressed;
+                                compressed = VCOMPRESS(v_max_index, mask, gvl);
+                                max_index = VMV_X(compressed);
                         }
                 }
         }else{
@@ -145,37 +159,37 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                         //index where element greater than v_max
                         mask = VMFLTVV_FLOAT(v_max, vx, gvl);
                         v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl);
-                        v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j,gvl);
+                        v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl);
 
                         //update v_max and start_index j
                         v_max = VFMAXVV_FLOAT(v_max, vx, gvl);
                         j += gvl;
                         idx += inc_v;
                 }
-                v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
-                maxf = *((FLOAT*)&v_res);
+                v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
+                maxf = EXTRACT_FLOAT(v_res);
                 mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
-                max_index = VMFIRSTM(mask,gvl);
-                max_index = *((unsigned int*)&v_max_index+max_index);
+                UINT_V_T compressed;
+                compressed = VCOMPRESS(v_max_index, mask, gvl);
+                max_index = VMV_X(compressed);
 
                 if(j < n){
                         gvl = VSETVL(n-j);
                         v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl);
 
-                        v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
-                        FLOAT cur_maxf = *((FLOAT*)&v_res);
+                        v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
+                        FLOAT cur_maxf = EXTRACT_FLOAT(v_res);
                         if(cur_maxf > maxf){
                                 //tail index
                                 v_max_index = VIDV_UINT(gvl);
                                 v_max_index = VADDVX_UINT(v_max_index, j, gvl);
 
                                 mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
-                                max_index = VMFIRSTM(mask,gvl);
-                                max_index = *((unsigned int*)&v_max_index+max_index);
+                                UINT_V_T compressed;
+                                compressed = VCOMPRESS(v_max_index, mask, gvl);
+                                max_index = VMV_X(compressed);
                         }
                 }
         }
-	return(max_index+1);
+        return(max_index+1);
 }
-
-
diff --git a/kernel/riscv64/imin_rvv.c b/kernel/riscv64/imin_rvv.c
new file mode 100644
index 000000000..1de7f3233
--- /dev/null
+++ b/kernel/riscv64/imin_rvv.c
@@ -0,0 +1,146 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <float.h>
+
+#if defined(DOUBLE)             /* 64-bit (e64) intrinsic variants, LMUL=8 groups, vbool8_t masks */
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m8()
+#define FLOAT_V_T               vfloat64m8_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f64m8_f64m1
+#define MASK_T                  vbool8_t
+#define VMFLTVF_FLOAT           __riscv_vmflt_vf_f64m8_b8
+#define VMFLTVV_FLOAT           __riscv_vmflt_vv_f64m8_b8
+#define VMFLEVF_FLOAT           __riscv_vmfle_vf_f64m8_b8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFMINVV_FLOAT_TU        __riscv_vfmin_vv_f64m8_tu
+#define VFIRSTM                 __riscv_vfirst_m_b8
+#define UINT_V_T                vuint64m8_t
+#define VIDV_MASK_UINT_TU       __riscv_vid_v_u64m8_tumu
+#define VIDV_UINT               __riscv_vid_v_u64m8
+#define VADDVX_MASK_UINT_TU     __riscv_vadd_vx_u64m8_tumu
+#define VADDVX_UINT             __riscv_vadd_vx_u64m8
+#define VMVVX_UINT              __riscv_vmv_v_x_u64m8
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
+#define VSLIDEDOWN_UINT         __riscv_vslidedown_vx_u64m8
+#define VMVVXS_UINT             __riscv_vmv_x_s_u64m8_u64
+#else                           /* 32-bit (e32) intrinsic variants, LMUL=8 groups, vbool4_t masks */
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m8()
+#define FLOAT_V_T               vfloat32m8_t
+#define FLOAT_V_T_M1            vfloat32m1_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
+#define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f32m8_f32m1
+#define MASK_T                  vbool4_t
+#define VMFLTVF_FLOAT           __riscv_vmflt_vf_f32m8_b4
+#define VMFLTVV_FLOAT           __riscv_vmflt_vv_f32m8_b4
+#define VMFLEVF_FLOAT           __riscv_vmfle_vf_f32m8_b4
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFMINVV_FLOAT_TU        __riscv_vfmin_vv_f32m8_tu
+#define VFIRSTM                 __riscv_vfirst_m_b4
+#define UINT_V_T                vuint32m8_t
+#define VIDV_MASK_UINT_TU       __riscv_vid_v_u32m8_tumu
+#define VIDV_UINT               __riscv_vid_v_u32m8
+#define VADDVX_MASK_UINT_TU     __riscv_vadd_vx_u32m8_tumu
+#define VADDVX_UINT             __riscv_vadd_vx_u32m8
+#define VMVVX_UINT              __riscv_vmv_v_x_u32m8
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
+#define VSLIDEDOWN_UINT         __riscv_vslidedown_vx_u32m8
+#define VMVVXS_UINT             __riscv_vmv_x_s_u32m8_u32
+#endif                          /* DOUBLE */
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    // Returns the 1-based position of the smallest element of x (0 if n <= 0 or inc_x <= 0).
+    unsigned int min_index = 0;
+    if (n <= 0 || inc_x <= 0) return(min_index);
+    FLOAT_V_T vx, v_min;
+    UINT_V_T v_min_index;
+    MASK_T mask;
+    // Seed with the largest finite value of the active precision: FLT_MAX would hide doubles above it.
+    const FLOAT init_min = (sizeof(FLOAT) == sizeof(double)) ? DBL_MAX : FLT_MAX;
+    size_t vlmax = VSETVL_MAX;
+    v_min_index = VMVVX_UINT(0, vlmax);
+    v_min = VFMVVF_FLOAT(init_min, vlmax);
+    BLASLONG j=0;
+    FLOAT minf=0.0;
+    if(inc_x == 1) {
+
+        for (size_t vl; n > 0; n -= vl, x += vl, j += vl) {
+            vl = VSETVL(n);
+
+            vx = VLEV_FLOAT(x, vl);
+
+            // index where element less than v_min
+            mask = VMFLTVV_FLOAT(vx, v_min, vl);
+            v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl);
+            v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl);
+
+            //update v_min and start_index j
+            v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl);
+        }
+
+    } else {
+
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x, j += vl) {
+            vl = VSETVL(n);
+
+            vx = VLSEV_FLOAT(x, stride_x, vl);
+
+            // index where element less than v_min
+            mask = VMFLTVV_FLOAT(vx, v_min, vl);
+            v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl);
+            v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl);
+
+            //update v_min and start_index j
+            v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx, vl);
+        }
+
+    }
+    // Reduce the per-lane minima to a scalar, then locate the first lane that holds it.
+    FLOAT_V_T_M1 v_res;
+    v_res = VFMVVF_FLOAT_M1(init_min, vlmax);
+
+    v_res = VFREDMINVS_FLOAT(v_min, v_res, vlmax);
+    minf = VFMVFS_FLOAT_M1(v_res);
+    mask = VMFLEVF_FLOAT(v_min, minf, vlmax);
+    min_index = VFIRSTM(mask, vlmax);
+    // Move that lane's recorded element index down to lane 0 and read it out.
+    v_min_index = VSLIDEDOWN_UINT(v_min_index, min_index, vlmax);
+    min_index = VMVVXS_UINT(v_min_index);
+
+    return(min_index+1);
+}
diff --git a/kernel/riscv64/imin_vector.c b/kernel/riscv64/imin_vector.c
index dfe9a3310..ec36b8eb9 100644
--- a/kernel/riscv64/imin_vector.c
+++ b/kernel/riscv64/imin_vector.c
@@ -31,122 +31,119 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if defined(DOUBLE)
 
-#define ABS fabs
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n)
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
+#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDMINVS_FLOAT(va, vb, gvl) vfredmin_vs_f64m8_f64m1(v_res, va, vb, gvl)
+#define VIDV_MASK_UINT(mask, gvl) RISCV_RVV(vid_v_u64m8_m)(mask, v_min_index, gvl)
+#define VADDVX_MASK_UINT(mask, a, b, gvl) RISCV_RVV(vadd_vx_u64m8_m)(mask, a, a, b, gvl)
+#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m8)(vm, compressed, va, gvl)
+#else /* newer API: plain _m is mask-agnostic, so use _mu and merge inactive lanes from the old index */
+#define VFREDMINVS_FLOAT RISCV_RVV(vfredmin_vs_f64m8_f64m1)
+#define VIDV_MASK_UINT(mask, gvl) RISCV_RVV(vid_v_u64m8_mu)(mask, v_min_index, gvl)
+#define VADDVX_MASK_UINT(mask, a, b, gvl) RISCV_RVV(vadd_vx_u64m8_mu)(mask, a, a, b, gvl)
+#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m8)
+#endif
 #define MASK_T vbool8_t
-#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMINVV_FLOAT vfmin_vv_f64m8
-#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8
-#define VMFIRSTM vmfirst_m_b8
+#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f64m8_b8)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
+#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f64m8)
+#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f64m8_b8)
+#define VMFIRSTM RISCV_RVV(vfirst_m_b8)
 #define UINT_V_T vuint64m8_t
-#define VIDV_MASK_UINT vid_v_u64m8_m
-#define VIDV_UINT vid_v_u64m8
-#define VADDVX_MASK_UINT vadd_vx_u64m8_m
-#define VADDVX_UINT vadd_vx_u64m8
-#define VMVVX_UINT vmv_v_x_u64m8
+#define VIDV_UINT RISCV_RVV(vid_v_u64m8)
+#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m8)
+#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m8)
+#define VMV_X RISCV_RVV(vmv_x_s_u64m8_u64)
 #else
 
-#define ABS fabsf
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n)
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
+#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDMINVS_FLOAT(va, vb, gvl) vfredmin_vs_f32m8_f32m1(v_res, va, vb, gvl)
+#define VIDV_MASK_UINT(mask, gvl) RISCV_RVV(vid_v_u32m8_m)(mask, v_min_index, gvl)
+#define VADDVX_MASK_UINT(mask, a, b, gvl) RISCV_RVV(vadd_vx_u32m8_m)(mask, a, a, b, gvl)
+#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m8)(vm, compressed, va, gvl)
+#else /* newer API: plain _m is mask-agnostic, so use _mu and merge inactive lanes from the old index */
+#define VFREDMINVS_FLOAT __riscv_vfredmin_vs_f32m8_f32m1
+#define VIDV_MASK_UINT(mask, gvl) __riscv_vid_v_u32m8_mu(mask, v_min_index, gvl)
+#define VADDVX_MASK_UINT(mask, a, b, gvl) __riscv_vadd_vx_u32m8_mu(mask, a, a, b, gvl)
+#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m8)
+#endif
 #define MASK_T vbool4_t
-#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMINVV_FLOAT vfmin_vv_f32m8
-#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4
-#define VMFIRSTM vmfirst_m_b4
+#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f32m8_b4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
+#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f32m8)
+#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f32m8_b4)
+#define VMFIRSTM RISCV_RVV(vfirst_m_b4)
 #define UINT_V_T vuint32m8_t
-#define VIDV_MASK_UINT vid_v_u32m8_m
-#define VIDV_UINT vid_v_u32m8
-#define VADDVX_MASK_UINT vadd_vx_u32m8_m
-#define VADDVX_UINT vadd_vx_u32m8
-#define VMVVX_UINT vmv_v_x_u32m8
+#define VIDV_UINT RISCV_RVV(vid_v_u32m8)
+#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m8)
+#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m8)
+#define VMV_X RISCV_RVV(vmv_x_s_u32m8_u32)
 #endif
 
 
 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
-	BLASLONG i=0, j=0;
-	FLOAT minf=FLT_MAX;
+        BLASLONG i=0, j=0;
         unsigned int min_index = 0;
-	if (n <= 0 || inc_x <= 0) return(min_index);
+        if (n <= 0 || inc_x <= 0) return(min_index);
+        FLOAT minf=FLT_MAX;
 
         FLOAT_V_T vx, v_min;
         UINT_V_T v_min_index;
         MASK_T mask;
         unsigned int gvl = 0;
-        FLOAT_V_T_M1 v_res, v_max;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);
+        FLOAT_V_T_M1 v_res;
+        v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1);
 
         if(inc_x == 1){
                 gvl = VSETVL(n);
-                v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
                 v_min_index = VMVVX_UINT(0, gvl);
+                v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
                 for(i=0,j=0; i < n/gvl; i++){
                         vx = VLEV_FLOAT(&x[j], gvl);
-                        //index where element less than v_min
-                        mask = VMFLTVV_FLOAT(vx, v_min, gvl);
-                        v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl);
-/*
-#if defined(DOUBLE)
-asm volatile(
-        "vor.vv v0, %1, %1 \n\t"
-        "vsetvli x0, %2, e64,m8 \n\t"
-        "vid.v %0, v0.t \n\t"
-        :"+v"(v_min_index)
-        :"v"(mask), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vor.vv v0, %1, %1 \n\t"
-        "vsetvli x0, %2, e32,m8 \n\t"
-        "vid.v %0, v0.t \n\t"
-        :"+v"(v_min_index)
-        :"v"(mask), "r"(gvl)
-        :"v0");
-#endif
-*/
-                        v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl);
+
+                        //index where element less than v_min
+                        mask = VMFGTVV_FLOAT(v_min, vx, gvl);
+                        v_min_index = VIDV_MASK_UINT(mask, gvl);
+                        v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, gvl);
 
                         //update v_min and start_index j
                         v_min = VFMINVV_FLOAT(v_min, vx, gvl);
                         j += gvl;
                 }
-                v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
-                minf = *((FLOAT*)&v_res);
+                v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
+                minf = EXTRACT_FLOAT(v_res);
                 mask = VMFLEVF_FLOAT(v_min, minf, gvl);
-                min_index = VMFIRSTM(mask,gvl);
-                min_index = *((unsigned int*)&v_min_index+min_index);
+                UINT_V_T compressed;
+                compressed = VCOMPRESS(v_min_index, mask, gvl);
+                min_index = VMV_X(compressed);
 
                 if(j < n){
                         gvl = VSETVL(n-j);
                         v_min = VLEV_FLOAT(&x[j], gvl);
 
-                        v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
-                        FLOAT cur_minf = *((FLOAT*)&v_res);
-                        if(cur_minf < minf){
+                        v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
+                        FLOAT cur_minf = EXTRACT_FLOAT(v_res);
+                        if(cur_minf < minf){ //v_res already carries minf, so only a strictly smaller tail value wins
                                 //tail index
                                 v_min_index = VIDV_UINT(gvl);
                                 v_min_index = VADDVX_UINT(v_min_index, j, gvl);
+
                                 mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
-                                min_index = VMFIRSTM(mask,gvl);
-                                min_index = *((unsigned int*)&v_min_index+min_index);
+                                UINT_V_T compressed;
+                                compressed = VCOMPRESS(v_min_index, mask, gvl);
+                                min_index = VMV_X(compressed);
                         }
                 }
         }else{
@@ -159,59 +156,39 @@ asm volatile(
                 for(i=0,j=0; i < n/gvl; i++){
                         vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
 
-                        //index where element less than v_min
-                        mask = VMFLTVV_FLOAT(vx, v_min, gvl);
-                        v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl);
-/*
-#if defined(DOUBLE)
-asm volatile(
-        "vor.vv v0, %1, %1 \n\t"
-        "vsetvli x0, %2, e64,m8 \n\t"
-        "vid.v %0, v0.t \n\t"
-        :"+v"(v_min_index)
-        :"v"(mask), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vor.vv v0, %1, %1 \n\t"
-        "vsetvli x0, %2, e32,m8 \n\t"
-        "vid.v %0, v0.t \n\t"
-        :"+v"(v_min_index)
-        :"v"(mask), "r"(gvl)
-        :"v0");
-#endif
-*/
-
-                        v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j,gvl);
+                        //index where element less than v_min
+                        mask = VMFGTVV_FLOAT(v_min, vx, gvl);
+                        v_min_index = VIDV_MASK_UINT(mask, gvl);
+                        v_min_index = VADDVX_MASK_UINT(mask, v_min_index, j, gvl);
 
                         //update v_min and start_index j
                         v_min = VFMINVV_FLOAT(v_min, vx, gvl);
                         j += gvl;
                         idx += inc_v;
                 }
-                v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
-                minf = *((FLOAT*)&v_res);
+                v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
+                minf = EXTRACT_FLOAT(v_res);
                 mask = VMFLEVF_FLOAT(v_min, minf, gvl);
-                min_index = VMFIRSTM(mask,gvl);
-                min_index = *((unsigned int*)&v_min_index+min_index);
+                UINT_V_T compressed;
+                compressed = VCOMPRESS(v_min_index, mask, gvl);
+                min_index = VMV_X(compressed);
 
                 if(j < n){
                         gvl = VSETVL(n-j);
                         v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl);
-
-                        v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
-                        FLOAT cur_minf = *((FLOAT*)&v_res);
-                        if(cur_minf < minf){
+                        v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
+                        FLOAT cur_minf = EXTRACT_FLOAT(v_res);
+                        if(cur_minf < minf){ //v_res already carries minf, so only a strictly smaller tail value wins
                                 //tail index
                                 v_min_index = VIDV_UINT(gvl);
                                 v_min_index = VADDVX_UINT(v_min_index, j, gvl);
+
                                 mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
-                                min_index = VMFIRSTM(mask,gvl);
-                                min_index = *((unsigned int*)&v_min_index+min_index);
+                                UINT_V_T compressed;
+                                compressed = VCOMPRESS(v_min_index, mask, gvl);
+                                min_index = VMV_X(compressed);
                         }
                 }
         }
-	return(min_index+1);
+        return(min_index+1);
 }
-
-
diff --git a/kernel/riscv64/izamax_rvv.c b/kernel/riscv64/izamax_rvv.c
new file mode 100644
index 000000000..32f66a7a7
--- /dev/null
+++ b/kernel/riscv64/izamax_rvv.c
@@ -0,0 +1,172 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if defined(DOUBLE)             /* 64-bit (e64) intrinsic variants, LMUL=4 groups, vbool16_t masks */
+#define VSETVL(n)               __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m4()
+#define FLOAT_V_T               vfloat64m4_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define FLOAT_VX2_T             vfloat64m4x2_t
+#define VGET_VX2                __riscv_vget_v_f64m4x2_f64m4
+#define VLEV_FLOAT              __riscv_vle64_v_f64m4
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m4
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4x2
+#define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f64m4_f64m1
+#define MASK_T                  vbool16_t
+#define VMFLTVF_FLOAT           __riscv_vmflt_vf_f64m4_b16
+#define VMFLTVV_FLOAT           __riscv_vmflt_vv_f64m4_b16
+#define VMFGEVF_FLOAT           __riscv_vmfge_vf_f64m4_b16
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFABSV_FLOAT            __riscv_vfabs_v_f64m4
+#define VFMAXVV_FLOAT_TU        __riscv_vfmax_vv_f64m4_tu
+#define VFADDVV_FLOAT           __riscv_vfadd_vv_f64m4
+#define VFIRSTM                 __riscv_vfirst_m_b16
+#define UINT_V_T                vuint64m4_t
+#define VIDV_MASK_UINT_TU       __riscv_vid_v_u64m4_tumu
+#define VIDV_UINT               __riscv_vid_v_u64m4
+#define VADDVX_MASK_UINT_TU     __riscv_vadd_vx_u64m4_tumu
+#define VADDVX_UINT             __riscv_vadd_vx_u64m4
+#define VMVVX_UINT              __riscv_vmv_v_x_u64m4
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
+#define VSLIDEDOWN_UINT         __riscv_vslidedown_vx_u64m4
+#define VMVVXS_UINT             __riscv_vmv_x_s_u64m4_u64
+#else                           /* 32-bit (e32) intrinsic variants, LMUL=4 groups, vbool8_t masks */
+#define VSETVL(n)               __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m4()
+#define FLOAT_V_T               vfloat32m4_t
+#define FLOAT_V_T_M1            vfloat32m1_t
+#define FLOAT_VX2_T             vfloat32m4x2_t
+#define VGET_VX2                __riscv_vget_v_f32m4x2_f32m4
+#define VLEV_FLOAT              __riscv_vle32_v_f32m4
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m4
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4x2
+#define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f32m4_f32m1
+#define MASK_T                  vbool8_t
+#define VMFLTVF_FLOAT           __riscv_vmflt_vf_f32m4_b8
+#define VMFLTVV_FLOAT           __riscv_vmflt_vv_f32m4_b8
+#define VMFGEVF_FLOAT           __riscv_vmfge_vf_f32m4_b8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFABSV_FLOAT            __riscv_vfabs_v_f32m4
+#define VFMAXVV_FLOAT_TU        __riscv_vfmax_vv_f32m4_tu
+#define VFADDVV_FLOAT           __riscv_vfadd_vv_f32m4
+#define VFIRSTM                 __riscv_vfirst_m_b8
+#define UINT_V_T                vuint32m4_t
+#define VIDV_MASK_UINT_TU       __riscv_vid_v_u32m4_tumu
+#define VIDV_UINT               __riscv_vid_v_u32m4
+#define VADDVX_MASK_UINT_TU     __riscv_vadd_vx_u32m4_tumu
+#define VADDVX_UINT             __riscv_vadd_vx_u32m4
+#define VMVVX_UINT              __riscv_vmv_v_x_u32m4
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
+#define VSLIDEDOWN_UINT         __riscv_vslidedown_vx_u32m4
+#define VMVVXS_UINT             __riscv_vmv_x_s_u32m4_u32
+#endif                          /* DOUBLE */
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    // Ranks each two-field element of x by |field0| + |field1|; returns the 1-based position of the largest (0 if n <= 0 or inc_x <= 0).
+    unsigned int max_index = 0;
+    if (n <= 0 || inc_x <= 0) return(max_index);
+    FLOAT_V_T vx0, vx1, v_max;
+    FLOAT_VX2_T vxx2;
+    UINT_V_T v_max_index;
+    MASK_T mask;
+    // v_max starts at -1 so any sum of absolute values (>= 0) replaces it; every lane's index starts at 0.
+    size_t vlmax = VSETVL_MAX;
+    v_max_index = VMVVX_UINT(0, vlmax);
+    v_max = VFMVVF_FLOAT(-1, vlmax);
+    BLASLONG j=0;
+    FLOAT maxf=0.0;
+
+    if(inc_x == 1) {
+
+        for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) {
+            vl = VSETVL(n);
+            // two-field segment load: field 0 of each element lands in vx0, field 1 in vx1
+            vxx2 = VLSEG_FLOAT(x, vl);
+
+            vx0 = VGET_VX2(vxx2, 0);
+            vx1 = VGET_VX2(vxx2, 1);
+            // magnitude used for ranking: |vx0| + |vx1|
+            vx0 = VFABSV_FLOAT(vx0, vl);
+            vx1 = VFABSV_FLOAT(vx1, vl);
+
+            vx0 = VFADDVV_FLOAT(vx0, vx1, vl);
+
+            //index where element greater than v_max
+            mask = VMFLTVV_FLOAT(v_max, vx0, vl);
+            v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl);
+            v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl);
+
+            //update v_max and start_index j
+            v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx0, vl);
+        }
+    }
+    else {
+        // byte distance between consecutive elements: inc_x elements of two FLOATs each
+        BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) {
+            vl = VSETVL(n);
+        
+            vxx2 = VLSSEG_FLOAT(x, stride_x, vl);
+
+            vx0 = VGET_VX2(vxx2, 0);
+            vx1 = VGET_VX2(vxx2, 1);
+        
+            vx0 = VFABSV_FLOAT(vx0, vl);
+            vx1 = VFABSV_FLOAT(vx1, vl);
+        
+            vx0 = VFADDVV_FLOAT(vx0, vx1, vl);
+        
+            //index where element greater than v_max
+            mask = VMFLTVV_FLOAT(v_max, vx0, vl);
+            v_max_index = VIDV_MASK_UINT_TU(mask, v_max_index, vl);
+            v_max_index = VADDVX_MASK_UINT_TU(mask, v_max_index, v_max_index, j, vl);
+
+            //update v_max and start_index j
+            v_max = VFMAXVV_FLOAT_TU(v_max, v_max, vx0, vl);
+        }
+
+    }
+    FLOAT_V_T_M1 v_res;
+    v_res = VFMVVF_FLOAT_M1(0, vlmax);
+    // reduce the per-lane maxima to a scalar, then find the first lane that reaches it
+    v_res = VFREDMAXVS_FLOAT(v_max, v_res, vlmax);
+    maxf = VFMVFS_FLOAT_M1(v_res);
+    mask = VMFGEVF_FLOAT(v_max, maxf, vlmax);
+    max_index = VFIRSTM(mask, vlmax);
+    // bring that lane's recorded element index down to lane 0 and read it out
+    v_max_index = VSLIDEDOWN_UINT(v_max_index, max_index, vlmax);
+    max_index = VMVVXS_UINT(v_max_index);
+
+    return(max_index+1);
+}
diff --git a/kernel/riscv64/izamax_vector.c b/kernel/riscv64/izamax_vector.c
index fdbdc3ae8..d33e89c00 100644
--- a/kernel/riscv64/izamax_vector.c
+++ b/kernel/riscv64/izamax_vector.c
@@ -27,241 +27,146 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #include <math.h>
+#include <float.h>
 
 #if defined(DOUBLE)
 
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n)
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT vlse_v_f64m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
+#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8)
+#ifdef RISCV_0p10_INTRINSICS /* 0.10 spelling: the function-like wrappers splice the caller's locals `v_res` / `compressed` in as an extra argument (presumably the destination operand -- confirm) */
+#define VFREDMAXVS_FLOAT(va, vb, gvl) RISCV_RVV(vfredmax_vs_f64m8_f64m1)(v_res, va, vb, gvl)
+#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_m)
+#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_m)
+#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m8)(vm, compressed, va, gvl)
+#else /* otherwise: plain argument lists, masked operations use the _mu spelling */
+#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f64m8_f64m1)
+#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_mu)
+#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_mu)
+#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m8)
+#endif /* RISCV_0p10_INTRINSICS */
 #define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
-#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
-#define VFMAXVV_FLOAT vfmax_vv_f64m8
-#define VMFGEVF_FLOAT vmfge_vf_f64m8_b8
-#define VMFIRSTM vmfirst_m_b8
+#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f64m8_b8)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
+#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f64m8)
+#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f64m8_b8)
+#define VMFIRSTM RISCV_RVV(vfirst_m_b8)
 #define UINT_V_T vuint64m8_t
-#define VSEVU_UINT vse64_v_u64m8
+#define VSEVU_UINT RISCV_RVV(vse64_v_u64m8)
 #define UINT_T long unsigned int
-#define VIDV_MASK_UINT vid_v_u64m8_m
-#define VIDV_UINT vid_v_u64m8
-#define VADDVX_MASK_UINT vadd_vx_u64m8_m
-#define VADDVX_UINT vadd_vx_u64m8
-#define VFADDVV_FLOAT vfadd_vv_f64m8
-#define VMVVX_UINT vmv_v_x_u64m8
+#define VIDV_UINT RISCV_RVV(vid_v_u64m8)
+#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m8)
+#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m8)
+#define VFABS_FLOAT RISCV_RVV(vfabs_v_f64m8)
+#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f64m8)
+#define VMV_X RISCV_RVV(vmv_x_s_u64m8_u64)
 #else
 
-#define ABS fabsf
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n)
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT vlse_v_f32m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
+#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8)
+#ifdef RISCV_0p10_INTRINSICS /* 0.10 spelling: the function-like wrappers splice the caller's locals `v_res` / `compressed` in as an extra argument (presumably the destination operand -- confirm) */
+#define VFREDMAXVS_FLOAT(va, vb, gvl) RISCV_RVV(vfredmax_vs_f32m8_f32m1)(v_res, va, vb, gvl)
+#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_m)
+#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_m)
+#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m8)(vm, compressed, va, gvl)
+#else /* otherwise: plain argument lists, masked operations use the _mu spelling */
+#define VFREDMAXVS_FLOAT RISCV_RVV(vfredmax_vs_f32m8_f32m1)
+#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_mu)
+#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_mu)
+#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m8)
+#endif /* RISCV_0p10_INTRINSICS */
 #define MASK_T vbool4_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
-#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
-#define VFMAXVV_FLOAT vfmax_vv_f32m8
-#define VMFGEVF_FLOAT vmfge_vf_f32m8_b4
-#define VMFIRSTM vmfirst_m_b4
+#define VMFLTVV_FLOAT RISCV_RVV(vmflt_vv_f32m8_b4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
+#define VFMAXVV_FLOAT RISCV_RVV(vfmax_vv_f32m8)
+#define VMFGEVF_FLOAT RISCV_RVV(vmfge_vf_f32m8_b4)
+#define VMFIRSTM RISCV_RVV(vfirst_m_b4)
 #define UINT_V_T vuint32m8_t
 #define UINT_T unsigned int
-#define VSEVU_UINT vse32_v_u32m8
-#define VIDV_MASK_UINT vid_v_u32m8_m
-#define VIDV_UINT vid_v_u32m8
-#define VADDVX_MASK_UINT vadd_vx_u32m8_m
-#define VADDVX_UINT vadd_vx_u32m8
-#define VFADDVV_FLOAT vfadd_vv_f32m8
-#define VMVVX_UINT vmv_v_x_u32m8
+#define VSEVU_UINT RISCV_RVV(vse32_v_u32m8)
+#define VIDV_UINT RISCV_RVV(vid_v_u32m8)
+#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m8)
+#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m8)
+#define VFABS_FLOAT RISCV_RVV(vfabs_v_f32m8)
+#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f32m8)
+#define VMV_X RISCV_RVV(vmv_x_s_u32m8_u32)
 #endif
 
-#define RVV_M RVV_M8
 
 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
-	BLASLONG i=0, j=0;
-	FLOAT maxf=0.0;
+        BLASLONG i=0, j=0;  /* izamax kernel: returns the 1-based position of a complex element of x with the largest |re|+|im| (0 when n or inc_x is not positive); j = logical index of the first element of the current chunk */
         unsigned int max_index = 0;
-	if (n <= 0 || inc_x <= 0) return(max_index);
+        if (n <= 0 || inc_x <= 0) return(max_index);
+        FLOAT maxf=-FLT_MAX;
 
-        FLOAT_V_T vx0, vx1, v_max;
+        FLOAT_V_T vx, vx2, v_max;
         UINT_V_T v_max_index;
-        MASK_T mask0, mask1;
+        MASK_T mask;
         unsigned int gvl = 0;
-        FLOAT_V_T_M1 v_res, v_z0;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_z0 = VFMVVF_FLOAT_M1(0, gvl);
+        FLOAT_V_T_M1 v_res;
+        v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1);
 
         gvl = VSETVL(n);
-                UINT_T temp_uint[gvl];
+        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
+        BLASLONG idx = 0, inc_v = gvl * inc_x * 2;
+        /* stride_x: byte stride handed to the strided loads; idx/inc_v: offsets into x in FLOAT units. Kept as BLASLONG so a large n*inc_x cannot wrap at 32 bits. */
+        v_max = VFMVVF_FLOAT(-FLT_MAX, gvl);
         v_max_index = VMVVX_UINT(0, gvl);
-        v_max = VFMVVF_FLOAT(-1, gvl);
-        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
-        BLASLONG inc_xv = gvl * inc_x * 2;
-        BLASLONG ix = 0;
         for(i=0,j=0; i < n/gvl; i++){
-                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-                //fabs(vector)
-                mask0 = VMFLTVF_FLOAT(vx0, 0, gvl);
-                vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl);
-/*
-#if defined(DOUBLE)
-asm volatile(
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+v"(vx0)
-        :"v"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+v"(vx0)
-        :"v"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-*/
-                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
-                //fabs(vector)
-                mask1 = VMFLTVF_FLOAT(vx1, 0, gvl);
-                vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl);
-/*
-#if defined(DOUBLE)
-asm volatile(
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+v"(vx1)
-        :"v"(mask1), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+v"(vx1)
-        :"v"(mask1), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-*/
-                vx0 = VFADDVV_FLOAT(vx0, vx1, gvl);
+                vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+                vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
+                vx = VFABS_FLOAT(vx, gvl);
+                vx2 = VFABS_FLOAT(vx2, gvl);
+                vx = VFADDVV_FLOAT(vx, vx2, gvl);
+                /* vx = |re| + |im| for each element of this chunk (components read from x[idx] and x[idx+1]) */
 
                 //index where element greater than v_max
-                mask0 = VMFLTVV_FLOAT(v_max, vx0, gvl);
-                v_max_index = VIDV_MASK_UINT(mask0, v_max_index, gvl);
-/*
-#if defined(DOUBLE)
-asm volatile(
-        "vor.vv v0, %1, %1 \n\t"
-        "vsetvli x0, %2, e64,m8 \n\t"
-        "vid.v %0, v0.t \n\t"
-        :"+v"(v_max_index)
-        :"v"(mask0), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vor.vv v0, %1, %1 \n\t"
-        "vsetvli x0, %2, e32,m8 \n\t"
-        "vid.v %0, v0.t \n\t"
-        :"+v"(v_max_index)
-        :"v"(mask0), "r"(gvl)
-        :"v0");
-#endif
-*/
-                v_max_index = VADDVX_MASK_UINT(mask0, v_max_index, v_max_index, j, gvl);
+                mask = VMFLTVV_FLOAT(v_max, vx, gvl);
+                v_max_index = VIDV_MASK_UINT(mask, v_max_index, gvl);
+                v_max_index = VADDVX_MASK_UINT(mask, v_max_index, v_max_index, j, gvl);
 
                 //update v_max and start_index j
-                v_max = VFMAXVV_FLOAT(v_max, vx0, gvl);
+                v_max = VFMAXVV_FLOAT(v_max, vx, gvl);
                 j += gvl;
-                ix += inc_xv;
+                idx += inc_v;
         }
-        vx0 = VFMVVF_FLOAT(0, gvl);
-        v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
-        maxf = VFMVFS_FLOAT(v_res);
-        mask0 = VMFGEVF_FLOAT(v_max, maxf, gvl);
-        max_index = VMFIRSTM(mask0,gvl);
-        VSEVU_UINT(temp_uint,v_max_index,gvl);
-        max_index = temp_uint[max_index];
-
+        v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
+        maxf = EXTRACT_FLOAT(v_res);
+        mask = VMFGEVF_FLOAT(v_max, maxf, gvl);
+        UINT_V_T compressed;
+        compressed = VCOMPRESS(v_max_index, mask, gvl);
+        max_index = VMV_X(compressed);
 
         if(j < n){
                 gvl = VSETVL(n-j);
-                v_max_index = VMVVX_UINT(0, gvl);
-                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-                //fabs(vector)
-                mask0 = VMFLTVF_FLOAT(vx0, 0, gvl);
-                vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl);
-/*
-#if defined(DOUBLE)
-asm volatile(
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+v"(vx0)
-        :"v"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+v"(vx0)
-        :"v"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-*/
-                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
-                //fabs(vector)
-                mask1 = VMFLTVF_FLOAT(vx1, 0, gvl);
-                vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl);
-/*
-#if defined(DOUBLE)
-asm volatile(
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+v"(vx1)
-        :"v"(mask1), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+v"(vx1)
-        :"v"(mask1), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-*/
-                v_max = VFADDVV_FLOAT(vx0, vx1, gvl);
-                v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
-                FLOAT cur_maxf = VFMVFS_FLOAT(v_res);
+                v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+                vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
+                v_max = VFABS_FLOAT(v_max, gvl);
+                vx2 = VFABS_FLOAT(vx2, gvl);
+                v_max = VFADDVV_FLOAT(v_max, vx2, gvl);
+                /* tail chunk: v_max is reused to hold |re| + |im| of the remaining n-j elements */
+                v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
+                FLOAT cur_maxf = EXTRACT_FLOAT(v_res);
+                /* v_res was passed in as the reduction's scalar operand, so cur_maxf exceeds maxf only when the tail beats the earlier maximum */
                 if(cur_maxf > maxf){
                         //tail index
                         v_max_index = VIDV_UINT(gvl);
                         v_max_index = VADDVX_UINT(v_max_index, j, gvl);
 
-                        mask0 = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
-                        max_index = VMFIRSTM(mask0,gvl);
-                        VSEVU_UINT(temp_uint,v_max_index,gvl);
-                                         max_index = temp_uint[max_index];
-
+                        mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl);
+                        UINT_V_T compressed;
+                        compressed = VCOMPRESS(v_max_index, mask, gvl);
+                        max_index = VMV_X(compressed);
                 }
         }
-	return(max_index+1);
-}
-
 
+        return(max_index+1);
+}
diff --git a/kernel/riscv64/izamin_rvv.c b/kernel/riscv64/izamin_rvv.c
new file mode 100644
index 000000000..d34b220fa
--- /dev/null
+++ b/kernel/riscv64/izamin_rvv.c
@@ -0,0 +1,171 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <float.h>
+
+#if defined(DOUBLE)  /* double precision: e64 elements, m4 register groups */
+#define VSETVL(n)               __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m4()
+#define FLOAT_V_T               vfloat64m4_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define FLOAT_VX2_T             vfloat64m4x2_t
+#define VGET_VX2                __riscv_vget_v_f64m4x2_f64m4
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4x2
+#define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f64m4_f64m1
+#define MASK_T                  vbool16_t
+#define VMFLTVF_FLOAT           __riscv_vmflt_vf_f64m4_b16
+#define VMFLTVV_FLOAT           __riscv_vmflt_vv_f64m4_b16
+#define VMFLEVF_FLOAT           __riscv_vmfle_vf_f64m4_b16
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFABSV_FLOAT            __riscv_vfabs_v_f64m4
+#define VFMINVV_FLOAT_TU        __riscv_vfmin_vv_f64m4_tu  /* _tu: lanes past vl keep the value of the first (destination) operand */
+#define VFADDVV_FLOAT           __riscv_vfadd_vv_f64m4
+#define VFIRSTM                 __riscv_vfirst_m_b16
+#define UINT_V_T                vuint64m4_t
+#define VIDV_MASK_UINT_TU       __riscv_vid_v_u64m4_tumu  /* _tumu: both masked-off lanes and lanes past vl keep their old value */
+#define VIDV_UINT               __riscv_vid_v_u64m4
+#define VADDVX_MASK_UINT_TU     __riscv_vadd_vx_u64m4_tumu
+#define VADDVX_UINT             __riscv_vadd_vx_u64m4
+#define VMVVX_UINT              __riscv_vmv_v_x_u64m4
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
+#define VSLIDEDOWN_UINT         __riscv_vslidedown_vx_u64m4
+#define VMVVXS_UINT             __riscv_vmv_x_s_u64m4_u64
+#else  /* single precision: the same macro set on e32 elements, m4 register groups */
+#define VSETVL(n)               __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m4()
+#define FLOAT_V_T               vfloat32m4_t
+#define FLOAT_V_T_M1            vfloat32m1_t
+#define FLOAT_VX2_T             vfloat32m4x2_t
+#define VGET_VX2                __riscv_vget_v_f32m4x2_f32m4
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4x2
+#define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f32m4_f32m1
+#define MASK_T                  vbool8_t
+#define VMFLTVF_FLOAT           __riscv_vmflt_vf_f32m4_b8
+#define VMFLTVV_FLOAT           __riscv_vmflt_vv_f32m4_b8
+#define VMFLEVF_FLOAT           __riscv_vmfle_vf_f32m4_b8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFABSV_FLOAT            __riscv_vfabs_v_f32m4
+#define VFMINVV_FLOAT_TU        __riscv_vfmin_vv_f32m4_tu  /* _tu: lanes past vl keep the value of the first (destination) operand */
+#define VFADDVV_FLOAT           __riscv_vfadd_vv_f32m4
+#define VFIRSTM                 __riscv_vfirst_m_b8
+#define UINT_V_T                vuint32m4_t
+#define VIDV_MASK_UINT_TU       __riscv_vid_v_u32m4_tumu  /* _tumu: both masked-off lanes and lanes past vl keep their old value */
+#define VIDV_UINT               __riscv_vid_v_u32m4
+#define VADDVX_MASK_UINT_TU     __riscv_vadd_vx_u32m4_tumu
+#define VADDVX_UINT             __riscv_vadd_vx_u32m4
+#define VMVVX_UINT              __riscv_vmv_v_x_u32m4
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
+#define VSLIDEDOWN_UINT         __riscv_vslidedown_vx_u32m4
+#define VMVVXS_UINT             __riscv_vmv_x_s_u32m4_u32
+#endif /* DOUBLE */
+
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    unsigned int min_index = 0;
+    if (n <= 0 || inc_x <= 0) return(min_index);
+    /* izamin kernel: find a complex element of x with the smallest |re|+|im| and return its 1-based position (0 was returned above for non-positive n or inc_x) */
+    FLOAT_V_T vx0, vx1, v_min;
+    FLOAT_VX2_T vxx2;
+    UINT_V_T v_min_index;
+    MASK_T mask;
+    /* v_min / v_min_index: per-lane running minimum and the logical index at which that lane last improved */
+    size_t vlmax = VSETVL_MAX;
+    v_min_index = VMVVX_UINT(0, vlmax);
+    v_min = VFMVVF_FLOAT(FLT_MAX, vlmax);
+    BLASLONG j=0;
+    FLOAT minf=0.0;
+    /* NOTE(review): the seed is FLT_MAX even when FLOAT is double; an element whose |re|+|im| is >= FLT_MAX never passes the strict '<' test below -- confirm this is intended */
+    if(inc_x == 1) {
+        /* contiguous case: a 2-field segment load splits each pair of FLOATs into two vectors (presumably real and imaginary parts) */
+        for (size_t vl; n > 0; n -= vl, x += vl*2, j += vl) {
+            vl = VSETVL(n);
+
+            vxx2 = VLSEG_FLOAT(x, vl);
+
+            vx0 = VGET_VX2(vxx2, 0);
+            vx1 = VGET_VX2(vxx2, 1);
+
+            vx0 = VFABSV_FLOAT(vx0, vl);
+            vx1 = VFABSV_FLOAT(vx1, vl);
+
+            vx0 = VFADDVV_FLOAT(vx0, vx1, vl);
+
+            // index where element less than v_min
+            mask = VMFLTVV_FLOAT(vx0, v_min, vl);
+            v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl);
+            v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl);
+            /* the _TU forms leave lanes >= vl untouched, so a short final iteration keeps the earlier per-lane results */
+            //update v_min and start_index j
+            v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx0, vl);
+        }
+
+    } else {
+        /* strided case: stride_x is the byte distance between consecutive complex elements */
+        BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, j += vl) {
+            vl = VSETVL(n);
+
+            vxx2 = VLSSEG_FLOAT(x, stride_x, vl);
+
+            vx0 = VGET_VX2(vxx2, 0);
+            vx1 = VGET_VX2(vxx2, 1);
+
+            vx0 = VFABSV_FLOAT(vx0, vl);
+            vx1 = VFABSV_FLOAT(vx1, vl);
+
+            vx0 = VFADDVV_FLOAT(vx0, vx1, vl);
+
+            // index where element less than v_min
+            mask = VMFLTVV_FLOAT(vx0, v_min, vl);
+            v_min_index = VIDV_MASK_UINT_TU(mask, v_min_index, vl);
+            v_min_index = VADDVX_MASK_UINT_TU(mask, v_min_index, v_min_index, j, vl);
+
+            //update v_min and start_index j
+            v_min = VFMINVV_FLOAT_TU(v_min, v_min, vx0, vl);
+        }
+
+    }
+    /* horizontal step over all vlmax lanes: reduce v_min to the scalar minimum, then locate the first lane that holds it */
+    FLOAT_V_T_M1 v_res;
+    v_res = VFMVVF_FLOAT_M1(FLT_MAX, vlmax);
+
+    v_res = VFREDMINVS_FLOAT(v_min, v_res, vlmax);
+    minf = VFMVFS_FLOAT_M1(v_res);
+    mask = VMFLEVF_FLOAT(v_min, minf, vlmax);
+    min_index = VFIRSTM(mask, vlmax);
+    /* slide that lane down to element 0 so its recorded index can be read with a scalar move */
+    v_min_index = VSLIDEDOWN_UINT(v_min_index, min_index, vlmax);
+    min_index = VMVVXS_UINT(v_min_index);
+
+    return(min_index+1);
+}
diff --git a/kernel/riscv64/izamin_vector.c b/kernel/riscv64/izamin_vector.c
index 59c720310..c76a38099 100644
--- a/kernel/riscv64/izamin_vector.c
+++ b/kernel/riscv64/izamin_vector.c
@@ -31,235 +31,142 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #if defined(DOUBLE)
 
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n)
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT vlse_v_f64m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
+#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8)
+#ifdef RISCV_0p10_INTRINSICS /* 0.10 spelling: the function-like wrappers splice the caller's locals `v_res` / `compressed` in as an extra argument (presumably the destination operand -- confirm) */
+#define VFREDMINVS_FLOAT(va, vb, gvl) RISCV_RVV(vfredmin_vs_f64m8_f64m1)(v_res, va, vb, gvl)
+#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_m)
+#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_m)
+#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u64m8)(vm, compressed, va, gvl)
+#else /* otherwise: plain argument lists, masked operations use the _mu spelling */
+#define VFREDMINVS_FLOAT RISCV_RVV(vfredmin_vs_f64m8_f64m1)
+#define VIDV_MASK_UINT RISCV_RVV(vid_v_u64m8_mu)
+#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u64m8_mu)
+#define VCOMPRESS RISCV_RVV(vcompress_vm_u64m8)
+#endif /* RISCV_0p10_INTRINSICS */
 #define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
-#define VMFLTVV_FLOAT vmflt_vv_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
-#define VFMINVV_FLOAT vfmin_vv_f64m8
-#define VMFLEVF_FLOAT vmfle_vf_f64m8_b8
-#define VMFIRSTM vmfirst_m_b8
+#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f64m8_b8)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
+#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f64m8)
+#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f64m8_b8)
+#define VMFIRSTM RISCV_RVV(vfirst_m_b8)
 #define UINT_V_T vuint64m8_t
-#define VSEVU_UINT vse64_v_u64m8
+#define VSEVU_UINT RISCV_RVV(vse64_v_u64m8)
 #define UINT_T long unsigned int
-#define VIDV_MASK_UINT vid_v_u64m8_m
-#define VIDV_UINT vid_v_u64m8
-#define VADDVX_MASK_UINT vadd_vx_u64m8_m
-#define VADDVX_UINT vadd_vx_u64m8
-#define VFADDVV_FLOAT vfadd_vv_f64m8
-#define VMVVX_UINT vmv_v_x_u64m8
+#define VIDV_UINT RISCV_RVV(vid_v_u64m8)
+#define VADDVX_UINT RISCV_RVV(vadd_vx_u64m8)
+#define VMVVX_UINT RISCV_RVV(vmv_v_x_u64m8)
+#define VFABS_FLOAT RISCV_RVV(vfabs_v_f64m8)
+#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f64m8)
+#define VMV_X RISCV_RVV(vmv_x_s_u64m8_u64)
 #else
 
-#define ABS fabsf
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n)
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT vlse_v_f32m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
+#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8)
+#ifdef RISCV_0p10_INTRINSICS /* 0.10 spelling: the function-like wrappers splice the caller's locals `v_res` / `compressed` in as an extra argument (presumably the destination operand -- confirm) */
+#define VFREDMINVS_FLOAT(va, vb, gvl) RISCV_RVV(vfredmin_vs_f32m8_f32m1)(v_res, va, vb, gvl)
+#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_m)
+#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_m)
+#define VCOMPRESS(va, vm, gvl) RISCV_RVV(vcompress_vm_u32m8)(vm, compressed, va, gvl)
+#else /* otherwise: plain argument lists, masked operations use the _mu spelling */
+#define VFREDMINVS_FLOAT RISCV_RVV(vfredmin_vs_f32m8_f32m1)
+#define VIDV_MASK_UINT RISCV_RVV(vid_v_u32m8_mu)
+#define VADDVX_MASK_UINT RISCV_RVV(vadd_vx_u32m8_mu)
+#define VCOMPRESS RISCV_RVV(vcompress_vm_u32m8)
+#endif /* RISCV_0p10_INTRINSICS */
 #define MASK_T vbool4_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
-#define VMFLTVV_FLOAT vmflt_vv_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
-#define VFMINVV_FLOAT vfmin_vv_f32m8
-#define VMFLEVF_FLOAT vmfle_vf_f32m8_b4
-#define VMFIRSTM vmfirst_m_b4
+#define VMFGTVV_FLOAT RISCV_RVV(vmfgt_vv_f32m8_b4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
+#define VFMINVV_FLOAT RISCV_RVV(vfmin_vv_f32m8)
+#define VMFLEVF_FLOAT RISCV_RVV(vmfle_vf_f32m8_b4)
+#define VMFIRSTM RISCV_RVV(vfirst_m_b4)
 #define UINT_V_T vuint32m8_t
 #define UINT_T unsigned int
-#define VSEVU_UINT vse32_v_u32m8
-#define VIDV_MASK_UINT vid_v_u32m8_m
-#define VIDV_UINT vid_v_u32m8
-#define VADDVX_MASK_UINT vadd_vx_u32m8_m
-#define VADDVX_UINT vadd_vx_u32m8
-#define VFADDVV_FLOAT vfadd_vv_f32m8
-#define VMVVX_UINT vmv_v_x_u32m8
+#define VSEVU_UINT RISCV_RVV(vse32_v_u32m8)
+#define VIDV_UINT RISCV_RVV(vid_v_u32m8)
+#define VADDVX_UINT RISCV_RVV(vadd_vx_u32m8)
+#define VMVVX_UINT RISCV_RVV(vmv_v_x_u32m8)
+#define VFABS_FLOAT RISCV_RVV(vfabs_v_f32m8)
+#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f32m8)
+#define VMV_X RISCV_RVV(vmv_x_s_u32m8_u32)
 #endif
 
 
 BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
-	BLASLONG i=0, j=0;
-	FLOAT minf=FLT_MAX;
+        BLASLONG i=0, j=0;  /* izamin kernel: returns the 1-based position of a complex element of x with the smallest |re|+|im| (0 when n or inc_x is not positive); j = logical index of the first element of the current chunk */
         unsigned int min_index = 0;
-	if (n <= 0 || inc_x <= 0) return(min_index);
+        if (n <= 0 || inc_x <= 0) return(min_index);
+        FLOAT minf=FLT_MAX;
 
-        FLOAT_V_T vx0, vx1, v_min;
+        FLOAT_V_T vx, vx2, v_min;
         UINT_V_T v_min_index;
-        MASK_T mask0, mask1;
+        MASK_T mask;
         unsigned int gvl = 0;
-        FLOAT_V_T_M1 v_res, v_max;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);
+        FLOAT_V_T_M1 v_res;
+        v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1);
 
         gvl = VSETVL(n);
-		UINT_T temp_uint[gvl];
-        v_min_index = VMVVX_UINT(0, gvl);
+        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
+        BLASLONG idx = 0, inc_v = gvl * inc_x * 2;
+        /* stride_x: byte stride handed to the strided loads; idx/inc_v: offsets into x in FLOAT units. Kept as BLASLONG so a large n*inc_x cannot wrap at 32 bits. */
         v_min = VFMVVF_FLOAT(FLT_MAX, gvl);
-        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
-        BLASLONG inc_xv = gvl * inc_x * 2;
-        BLASLONG ix = 0;
+        v_min_index = VMVVX_UINT(0, gvl);
         for(i=0,j=0; i < n/gvl; i++){
-                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-                //fabs(vector)
-                mask0 = VMFLTVF_FLOAT(vx0, 0, gvl);
-                vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl);
-/*
-#if defined(DOUBLE)
-asm volatile(
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+v"(vx0)
-        :"v"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+v"(vx0)
-        :"v"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-*/
-                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
-                //fabs(vector)
-                mask1 = VMFLTVF_FLOAT(vx1, 0, gvl);
-                vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl);
-/*
-#if defined(DOUBLE)
-asm volatile(
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+v"(vx1)
-        :"v"(mask1), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+v"(vx1)
-        :"v"(mask1), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-*/
-                vx0 = VFADDVV_FLOAT(vx0, vx1, gvl);
+                vx = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+                vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
+                vx = VFABS_FLOAT(vx, gvl);
+                vx2 = VFABS_FLOAT(vx2, gvl);
+                vx = VFADDVV_FLOAT(vx, vx2, gvl);
 
-                //index where element less than v_min
-                mask0 = VMFLTVV_FLOAT(vx0, v_min, gvl);
-                v_min_index = VIDV_MASK_UINT(mask0, v_min_index, gvl);
-/*
-#if defined(DOUBLE)
-asm volatile(
-        "vor.vv v0, %1, %1 \n\t"
-        "vsetvli x0, %2, e64,m8 \n\t"
-        "vid.v %0, v0.t \n\t"
-        :"+v"(v_min_index)
-        :"v"(mask0), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vor.vv v0, %1, %1 \n\t"
-        "vsetvli x0, %2, e32,m8 \n\t"
-        "vid.v %0, v0.t \n\t"
-        :"+v"(v_min_index)
-        :"v"(mask0), "r"(gvl)
-        :"v0");
-#endif
-*/
-                v_min_index = VADDVX_MASK_UINT(mask0, v_min_index, v_min_index, j, gvl);
+                /* vx = |re| + |im| for each element of this chunk (components read from x[idx] and x[idx+1]) */
+                //index where element less than v_min (mask lane set when v_min > vx)
+                mask = VMFGTVV_FLOAT(v_min, vx, gvl);
+                v_min_index = VIDV_MASK_UINT(mask, v_min_index, gvl);
+                v_min_index = VADDVX_MASK_UINT(mask, v_min_index, v_min_index, j, gvl);
 
                 //update v_min and start_index j
-                v_min = VFMINVV_FLOAT(v_min, vx0, gvl);
+                v_min = VFMINVV_FLOAT(v_min, vx, gvl);
                 j += gvl;
-                ix += inc_xv;
+                idx += inc_v;
         }
-        v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
-        minf = VFMVFS_FLOAT(v_res);
-        mask0 = VMFLEVF_FLOAT(v_min, minf, gvl);
-        min_index = VMFIRSTM(mask0,gvl);
-                 VSEVU_UINT(temp_uint,v_min_index,gvl);
-        min_index = temp_uint[min_index];
+        /* reduce the per-lane minima to a scalar, then take the recorded index of the first lane that holds it */
+        v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
+        minf = EXTRACT_FLOAT(v_res);
+        mask = VMFLEVF_FLOAT(v_min, minf, gvl);
+        UINT_V_T compressed;
+        compressed = VCOMPRESS(v_min_index, mask, gvl);
+        min_index = VMV_X(compressed);
 
         if(j < n){
                 gvl = VSETVL(n-j);
-                v_min_index = VMVVX_UINT(0, gvl);
-                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-                //fabs(vector)
-                mask0 = VMFLTVF_FLOAT(vx0, 0, gvl);
-                vx0 = VFRSUBVF_MASK_FLOAT(mask0, vx0, vx0, 0, gvl);
-/*
-#if defined(DOUBLE)
-asm volatile(
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+v"(vx0)
-        :"v"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+v"(vx0)
-        :"v"(mask0), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-*/
-                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
-                //fabs(vector)
-                mask1 = VMFLTVF_FLOAT(vx1, 0, gvl);
-                vx1 = VFRSUBVF_MASK_FLOAT(mask1, vx1, vx1, 0, gvl);
-/*
-#if defined(DOUBLE)
-asm volatile(
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e64,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+v"(vx1)
-        :"v"(mask1), "f"(zero), "r"(gvl)
-        :"v0");
-#else
-asm volatile(
-        "vor.vv     v0, %1, %1\n\t"
-        "vsetvli    x0, %3, e32,m8 \n\t"
-        "vfrsub.vf  %0, %0, %2, v0.t \n\t"
-        :"+v"(vx1)
-        :"v"(mask1), "f"(zero), "r"(gvl)
-        :"v0");
-#endif
-*/
-                v_min = VFADDVV_FLOAT(vx0, vx1, gvl);
-                v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
-                FLOAT cur_minf = VFMVFS_FLOAT(v_res);
+                v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+                vx2 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
+                v_min = VFABS_FLOAT(v_min, gvl);
+                vx2 = VFABS_FLOAT(vx2, gvl);
+                v_min = VFADDVV_FLOAT(v_min, vx2, gvl);
+                /* tail chunk: v_min is reused to hold |re| + |im| of the remaining n-j elements; v_res carries the earlier minimum into the reduction */
+                v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
+                FLOAT cur_minf = EXTRACT_FLOAT(v_res);
                 if(cur_minf < minf){
                         //tail index
                         v_min_index = VIDV_UINT(gvl);
                         v_min_index = VADDVX_UINT(v_min_index, j, gvl);
 
-                        mask0 = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
-                        min_index = VMFIRSTM(mask0,gvl);
-                              VSEVU_UINT(temp_uint,v_min_index,gvl);
-                                       min_index = temp_uint[min_index];
-
+                        mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl);
+                        UINT_V_T compressed;
+                        compressed = VCOMPRESS(v_min_index, mask, gvl);
+                        min_index = VMV_X(compressed);
                 }
         }
-	return(min_index+1);
-}
-
 
+        return(min_index+1);
+}
diff --git a/kernel/riscv64/max_rvv.c b/kernel/riscv64/max_rvv.c
new file mode 100644
index 000000000..745c27bf4
--- /dev/null
+++ b/kernel/riscv64/max_rvv.c
@@ -0,0 +1,98 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <float.h>
+
+#if !defined(DOUBLE) /* single precision: 32-bit elements */
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m8() /* max vl for the m8 working vectors */
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e32m1() /* max vl for the m1 reduction vector */
+#define FLOAT_V_T               vfloat32m8_t
+#define FLOAT_V_T_M1            vfloat32m1_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8 /* unit-stride load */
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8 /* strided load */
+#define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f32m8_f32m1
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFMAXVV_FLOAT_TU        __riscv_vfmax_vv_f32m8_tu /* tail-undisturbed (_tu) variant */
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
+#else /* double precision: 64-bit elements */
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m8() /* max vl for the m8 working vectors */
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e64m1() /* max vl for the m1 reduction vector */
+#define FLOAT_V_T               vfloat64m8_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8 /* unit-stride load */
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8 /* strided load */
+#define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f64m8_f64m1
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFMAXVV_FLOAT_TU        __riscv_vfmax_vv_f64m8_tu /* tail-undisturbed (_tu) variant */
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
+#endif /* !defined(DOUBLE) */
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    FLOAT maxf = 0.0;
+    /* Largest of x[0], x[inc_x], ..., x[(n-1)*inc_x]; returns 0.0 when n <= 0 or inc_x <= 0. */
+    if (n <= 0 || inc_x <= 0) return(maxf);
+
+    FLOAT_V_T vx, vmax;
+    FLOAT_V_T_M1 v_res;
+    /* Seed with x[0], not -FLT_MAX: valid for float and double, and right for inputs below -FLT_MAX (e.g. -inf). */
+    v_res = VFMVVF_FLOAT_M1(x[0], VSETVL_MAX_M1);
+    size_t vlmax = VSETVL_MAX;
+    vmax = VFMVVF_FLOAT(x[0], vlmax);
+
+    if(inc_x == 1) {
+        /* contiguous input: unit-stride loads */
+        for (size_t vl; n > 0; n -= vl, x += vl) {
+            vl = VSETVL(n);
+            /* the _tu max leaves lanes >= vl as they were, so a short final pass keeps earlier per-lane maxima */
+            vx = VLEV_FLOAT(x, vl);
+            vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl);
+        }
+
+    } else {
+        /* strided input: the load takes its stride in bytes */
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
+            vl = VSETVL(n);
+
+            vx = VLSEV_FLOAT(x, stride_x, vl);
+            vmax = VFMAXVV_FLOAT_TU(vmax, vmax, vx, vl);
+        }
+
+    }
+    /* reduce all vlmax lanes of vmax together with the seed held in v_res, then read back element 0 */
+    v_res = VFREDMAXVS_FLOAT(vmax, v_res, vlmax);
+    maxf = VFMVFS_FLOAT_M1(v_res);
+
+    return(maxf);
+}
diff --git a/kernel/riscv64/max_vector.c b/kernel/riscv64/max_vector.c
index b988513c9..ee9920cd2 100644
--- a/kernel/riscv64/max_vector.c
+++ b/kernel/riscv64/max_vector.c
@@ -28,29 +28,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #include <math.h>
 #include <float.h>
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMAXVV_FLOAT vfmax_vv_f32m8
+
+#ifdef RISCV64_ZVL256B
+#       define LMUL m2
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN 32
+#       else
+#               define ELEN 32
+#               define MLEN 16
+#       endif
+#else
+#       define LMUL m8
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN 8
+#       else
+#               define ELEN 32
+#               define MLEN 4
+#       endif
+#endif
+
+#define _
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+
+#define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _)
+#define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
+#define FLOAT_V_T_M1    JOIN(vfloat,            ELEN,   m1,     _t,     _)
+#define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDMAXVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmax_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))(v_res, va, vb, gvl)
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMAXVV_FLOAT vfmax_vv_f64m8
+#define VFREDMAXVS_FLOAT JOIN(RISCV_RVV(vfredmax_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
 #endif
+#define MASK_T          JOIN(vbool,     MLEN,   _t,     _,      _)
+#define VMFLTVF_FLOAT   JOIN(RISCV_RVV(vmflt_vf_f), ELEN,  LMUL,   _b,     MLEN)
+#define VFMVVF_FLOAT    JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   m1,     _)
+#define VFMAXVV_FLOAT   JOIN(RISCV_RVV(vfmax),     _vv_f,  ELEN,   LMUL,   _)
 
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
@@ -59,10 +77,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	FLOAT maxf=-FLT_MAX;
         unsigned int gvl = 0;
         FLOAT_V_T v0, v1, v_max;
-        FLOAT_V_T_M1 v_res, v_min;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_min = VFMVVF_FLOAT_M1(-FLT_MAX, gvl);
+        FLOAT_V_T_M1 v_res;
+        v_res = VFMVVF_FLOAT_M1(-FLT_MAX, 1);
 
         if(inc_x == 1){
                 gvl = VSETVL(n);
@@ -76,15 +92,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                                 v_max = VFMAXVV_FLOAT(v_max, v1, gvl);
                                 j += gvl * 2;
                         }
-                        v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
-                        maxf = *((FLOAT*)&v_res);
+                        v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
                 }
                 for(;j<n;){
                         gvl = VSETVL(n-j);
                         v0 = VLEV_FLOAT(&x[j], gvl);
-                        v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl);
-                        if(*((FLOAT*)&v_res) > maxf)
-                                maxf = *((FLOAT*)&v_res);
+                        v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl);
                         j += gvl;
                 }
         }else{
@@ -102,18 +115,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                                 j += gvl * 2;
                                 idx += inc_xv * 2;
                         }
-                        v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_min, gvl);
-                        maxf = *((FLOAT*)&v_res);
+                        v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
                 }
                 for(;j<n;){
                         gvl = VSETVL(n-j);
                         v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
-                        v_res = VFREDMAXVS_FLOAT(v_res, v0, v_min, gvl);
-                        if(*((FLOAT*)&v_res) > maxf)
-                                maxf = *((FLOAT*)&v_res);
+                        v_res = VFREDMAXVS_FLOAT(v0, v_res, gvl);
                         j += gvl;
                 }
         }
+        maxf = EXTRACT_FLOAT(v_res);
 	return(maxf);
 }
 
diff --git a/kernel/riscv64/min_rvv.c b/kernel/riscv64/min_rvv.c
new file mode 100644
index 000000000..78528fef9
--- /dev/null
+++ b/kernel/riscv64/min_rvv.c
@@ -0,0 +1,98 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <float.h>
+
+#if !defined(DOUBLE) /* single precision: 32-bit elements */
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m8() /* max vl for the m8 working vectors */
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e32m1() /* max vl for the m1 reduction vector */
+#define FLOAT_V_T               vfloat32m8_t
+#define FLOAT_V_T_M1            vfloat32m1_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8 /* unit-stride load */
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8 /* strided load */
+#define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f32m8_f32m1
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFMINVV_FLOAT_TU        __riscv_vfmin_vv_f32m8_tu /* tail-undisturbed (_tu) variant */
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
+#else /* double precision: 64-bit elements */
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m8() /* max vl for the m8 working vectors */
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e64m1() /* max vl for the m1 reduction vector */
+#define FLOAT_V_T               vfloat64m8_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8 /* unit-stride load */
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8 /* strided load */
+#define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f64m8_f64m1
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFMINVV_FLOAT_TU        __riscv_vfmin_vv_f64m8_tu /* tail-undisturbed (_tu) variant */
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
+#endif /* !defined(DOUBLE) */
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    FLOAT minf = 0.0;
+    /* Smallest of x[0], x[inc_x], ..., x[(n-1)*inc_x]; returns 0.0 when n <= 0 or inc_x <= 0. */
+    if (n <= 0 || inc_x <= 0) return(minf);
+
+    FLOAT_V_T vx, vmin;
+    FLOAT_V_T_M1 v_res;
+    /* Seed with x[0], not FLT_MAX: valid for float and double, and right for inputs above FLT_MAX (e.g. +inf). */
+    v_res = VFMVVF_FLOAT_M1(x[0], VSETVL_MAX_M1);
+    size_t vlmax = VSETVL_MAX;
+    vmin = VFMVVF_FLOAT(x[0], vlmax);
+
+    if(inc_x == 1) {
+        /* contiguous input: unit-stride loads */
+        for (size_t vl; n > 0; n -= vl, x += vl) {
+            vl = VSETVL(n);
+            /* the _tu min leaves lanes >= vl as they were, so a short final pass keeps earlier per-lane minima */
+            vx = VLEV_FLOAT(x, vl);
+            vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl);
+        }
+
+    } else {
+        /* strided input: the load takes its stride in bytes */
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
+            vl = VSETVL(n);
+
+            vx = VLSEV_FLOAT(x, stride_x, vl);
+            vmin = VFMINVV_FLOAT_TU(vmin, vmin, vx, vl);
+        }
+
+    }
+    /* reduce all vlmax lanes of vmin together with the seed held in v_res, then read back element 0 */
+    v_res = VFREDMINVS_FLOAT(vmin, v_res, vlmax);
+    minf = VFMVFS_FLOAT_M1(v_res);
+
+    return(minf);
+}
diff --git a/kernel/riscv64/min_vector.c b/kernel/riscv64/min_vector.c
index be0803df6..2001840bb 100644
--- a/kernel/riscv64/min_vector.c
+++ b/kernel/riscv64/min_vector.c
@@ -28,29 +28,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #include <math.h>
 #include <float.h>
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMINVV_FLOAT vfmin_vv_f32m8
+
+#ifdef RISCV64_ZVL256B
+#       define LMUL m2
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN 32
+#       else
+#               define ELEN 32
+#               define MLEN 16
+#       endif
+#else
+#       define LMUL m8
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN 8
+#       else
+#               define ELEN 32
+#               define MLEN 4
+#       endif
+#endif
+
+#define _
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+
+#define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _)
+#define FLOAT_V_T       JOIN(vfloat,    ELEN,   LMUL,   _t,     _)
+#define FLOAT_V_T_M1    JOIN(vfloat,    ELEN,   m1,     _t,     _)
+#define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDMINVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredmin_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))(v_res, va, vb, gvl)
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMINVV_FLOAT vfmin_vv_f64m8
+#define VFREDMINVS_FLOAT JOIN(RISCV_RVV(vfredmin_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
 #endif
+#define MASK_T          JOIN(vbool,     MLEN,   _t,     _,      _)
+#define VMFLTVF_FLOAT   JOIN(RISCV_RVV(vmflt_vf_f), ELEN,  LMUL,   _b,     MLEN)
+#define VFMVVF_FLOAT    JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   m1,     _)
+#define VFMINVV_FLOAT   JOIN(RISCV_RVV(vfmin),     _vv_f,  ELEN,   LMUL,   _)
 
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
@@ -59,10 +77,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	FLOAT minf=FLT_MAX;
         unsigned int gvl = 0;
         FLOAT_V_T v0, v1, v_min;
-        FLOAT_V_T_M1 v_res, v_max;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);
+        FLOAT_V_T_M1 v_res;
+        v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1);
 
         if(inc_x == 1){
                 gvl = VSETVL(n);
@@ -76,15 +92,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                                 v_min = VFMINVV_FLOAT(v_min, v1, gvl);
                                 j += gvl * 2;
                         }
-                        v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
-                        minf = *((FLOAT*)&v_res);
+                        v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
                 }
                 for(;j<n;){
                         gvl = VSETVL(n-j);
                         v0 = VLEV_FLOAT(&x[j], gvl);
-                        v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl);
-                        if(*((FLOAT*)&v_res) < minf)
-                                minf = *((FLOAT*)&v_res);
+                        v_res = VFREDMINVS_FLOAT(v0, v_res, gvl);
                         j += gvl;
                 }
         }else{
@@ -102,18 +115,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                                 j += gvl * 2;
                                 idx += inc_xv * 2;
                         }
-                        v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
-                        minf = *((FLOAT*)&v_res);
+                        v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
                 }
                 for(;j<n;){
                         gvl = VSETVL(n-j);
                         v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
-                        v_res = VFREDMINVS_FLOAT(v_res, v0, v_max, gvl);
-                        if(*((FLOAT*)&v_res) < minf)
-                                minf = *((FLOAT*)&v_res);
+                        v_res = VFREDMINVS_FLOAT(v0, v_res, gvl);
                         j += gvl;
                 }
         }
+        minf = EXTRACT_FLOAT(v_res);
 	return(minf);
 }
 
diff --git a/kernel/riscv64/nrm2.c b/kernel/riscv64/nrm2.c
index fcff09337..8cc189fe3 100644
--- a/kernel/riscv64/nrm2.c
+++ b/kernel/riscv64/nrm2.c
@@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	FLOAT absxi = 0.0;
 
 
-	if (n <= 0 || inc_x <= 0) return(0.0);
+	if (n <= 0 || inc_x == 0) return(0.0);
 	if ( n == 1 ) return( ABS(x[0]) );
 
 	n *= inc_x;
diff --git a/kernel/riscv64/nrm2_rvv.c b/kernel/riscv64/nrm2_rvv.c
new file mode 100644
index 000000000..3eb423849
--- /dev/null
+++ b/kernel/riscv64/nrm2_rvv.c
@@ -0,0 +1,212 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if defined(DOUBLE) /* double precision: e64 / LMUL=4, mask type vbool16_t */
+#define VSETVL             __riscv_vsetvl_e64m4
+#define FLOAT_V_T           vfloat64m4_t
+#define FLOAT_V_T_M1        vfloat64m1_t
+#define VLEV_FLOAT          __riscv_vle64_v_f64m4
+#define VLSEV_FLOAT         __riscv_vlse64_v_f64m4
+#define VFMVVF_FLOAT        __riscv_vfmv_v_f_f64m4
+#define VFMVSF_FLOAT        __riscv_vfmv_s_f_f64m4
+#define VFMVVF_FLOAT_M1     __riscv_vfmv_v_f_f64m1
+#define MASK_T              vbool16_t
+#define VFABS               __riscv_vfabs_v_f64m4
+#define VMFNE               __riscv_vmfne_vf_f64m4_b16
+#define VMFGT               __riscv_vmfgt_vv_f64m4_b16
+#define VMFEQ               __riscv_vmfeq_vf_f64m4_b16
+#define VCPOP               __riscv_vcpop_m_b16
+#define VFREDMAX            __riscv_vfredmax_vs_f64m4_f64m1
+#define VFREDMIN            __riscv_vfredmin_vs_f64m4_f64m1
+#define VFIRST              __riscv_vfirst_m_b16
+#define VRGATHER            __riscv_vrgather_vx_f64m4
+#define VFDIV               __riscv_vfdiv_vv_f64m4
+#define VFDIV_M             __riscv_vfdiv_vv_f64m4_mu
+#define VFMUL               __riscv_vfmul_vv_f64m4
+#define VFMUL_M             __riscv_vfmul_vv_f64m4_mu
+#define VFMACC              __riscv_vfmacc_vv_f64m4
+#define VFMACC_M            __riscv_vfmacc_vv_f64m4_mu
+#define VMSBF               __riscv_vmsbf_m_b16
+#define VMSOF               __riscv_vmsof_m_b16
+#define VMAND               __riscv_vmand_mm_b16
+#define VMANDN              __riscv_vmandn_mm_b16 /* and-not; previously aliased to plain vmand */
+#define VFREDSUM            __riscv_vfredusum_vs_f64m4_f64m1
+#define VMERGE              __riscv_vmerge_vvm_f64m4
+#define VSEV_FLOAT          __riscv_vse64_v_f64m4
+#define EXTRACT_FLOAT0_V(v) __riscv_vfmv_f_s_f64m4_f64(v)
+#define ABS fabs
+#else /* single precision: e32 / LMUL=4, mask type vbool8_t */
+#define VSETVL              __riscv_vsetvl_e32m4
+#define FLOAT_V_T           vfloat32m4_t
+#define FLOAT_V_T_M1        vfloat32m1_t
+#define VLEV_FLOAT          __riscv_vle32_v_f32m4
+#define VLSEV_FLOAT         __riscv_vlse32_v_f32m4
+#define VFMVVF_FLOAT        __riscv_vfmv_v_f_f32m4
+#define VFMVSF_FLOAT        __riscv_vfmv_s_f_f32m4
+#define VFMVVF_FLOAT_M1     __riscv_vfmv_v_f_f32m1
+#define MASK_T              vbool8_t
+#define VFABS               __riscv_vfabs_v_f32m4
+#define VMFNE               __riscv_vmfne_vf_f32m4_b8
+#define VMFGT               __riscv_vmfgt_vv_f32m4_b8
+#define VMFEQ               __riscv_vmfeq_vf_f32m4_b8
+#define VCPOP               __riscv_vcpop_m_b8
+#define VFREDMAX            __riscv_vfredmax_vs_f32m4_f32m1
+#define VFREDMIN            __riscv_vfredmin_vs_f32m4_f32m1
+#define VFIRST              __riscv_vfirst_m_b8
+#define VRGATHER            __riscv_vrgather_vx_f32m4
+#define VFDIV               __riscv_vfdiv_vv_f32m4
+#define VFDIV_M             __riscv_vfdiv_vv_f32m4_mu
+#define VFMUL               __riscv_vfmul_vv_f32m4
+#define VFMUL_M             __riscv_vfmul_vv_f32m4_mu
+#define VFMACC              __riscv_vfmacc_vv_f32m4
+#define VFMACC_M            __riscv_vfmacc_vv_f32m4_mu
+#define VMSBF               __riscv_vmsbf_m_b8
+#define VMSOF               __riscv_vmsof_m_b8
+#define VMAND               __riscv_vmand_mm_b8
+#define VMANDN              __riscv_vmandn_mm_b8 /* and-not; previously aliased to plain vmand */
+#define VFREDSUM            __riscv_vfredusum_vs_f32m4_f32m1
+#define VMERGE              __riscv_vmerge_vvm_f32m4
+#define VSEV_FLOAT          __riscv_vse32_v_f32m4
+#define EXTRACT_FLOAT0_V(v) __riscv_vfmv_f_s_f32m4_f32(v)
+#define ABS fabsf
+#endif /* defined(DOUBLE) */
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+        BLASLONG i=0;
+        // Euclidean norm of x[0], x[inc_x], ... kept as a (scale, ssq) pair; the result is scale * sqrt(ssq)
+        if (n <= 0 || inc_x <= 0) return(0.0);
+        if(n == 1) return (ABS(x[0]));
+
+        unsigned int gvl = 0;
+
+        MASK_T nonzero_mask;
+        MASK_T scale_mask;
+
+        gvl = VSETVL(n);
+        FLOAT_V_T v0;
+        FLOAT_V_T v_ssq = VFMVVF_FLOAT(0, gvl);
+        FLOAT_V_T v_scale = VFMVVF_FLOAT(0, gvl);
+
+        FLOAT scale = 0;
+        FLOAT ssq = 0;
+        BLASLONG stride_x = inc_x * sizeof(FLOAT); // byte stride; BLASLONG so a large inc_x is not truncated
+        BLASLONG idx = 0; // element offset into x; BLASLONG so it is not limited to int range
+
+        if( n >= gvl ) // don't pay overheads if we're not doing useful work
+        {
+                for(i=0; i<n/gvl; i++){
+                        v0 = VLSEV_FLOAT( &x[idx], stride_x, gvl );
+                        nonzero_mask = VMFNE( v0, 0, gvl );
+                        v0 = VFABS( v0, gvl );
+                        scale_mask = VMFGT( v0, v_scale, gvl );
+                        // scale_mask: lanes where |x| exceeds the lane's running scale; there ssq *= (scale/|x|)^2, then scale = |x|
+                        // assume scale changes are relatively infrequent
+
+                        // unclear if the vcpop+branch is actually a win
+                        // since the operations being skipped are predicated anyway
+                        // need profiling to confirm
+                        if( VCPOP(scale_mask, gvl) )
+                        {
+                                v_scale = VFDIV_M( scale_mask, v_scale, v_scale, v0, gvl );
+                                v_scale = VFMUL_M( scale_mask, v_scale, v_scale, v_scale, gvl );
+                                v_ssq = VFMUL_M( scale_mask, v_ssq, v_ssq, v_scale, gvl );
+                                v_scale = VMERGE( v_scale, v0, scale_mask, gvl );
+                        }
+                        v0 = VFDIV_M( nonzero_mask, v0, v0, v_scale, gvl );
+                        v_ssq = VFMACC_M( nonzero_mask, v_ssq, v0, v0, gvl );
+                        idx += inc_x * gvl;
+                }
+
+                // we have gvl elements which we accumulated independently, with independent scales
+                // we need to combine these
+                // naive sort so we process small values first to avoid losing information
+                // could use vector sort extensions where available, but we're dealing with gvl elts at most
+
+                FLOAT * out_ssq = alloca(gvl*sizeof(FLOAT));
+                FLOAT * out_scale = alloca(gvl*sizeof(FLOAT));
+                VSEV_FLOAT( out_ssq, v_ssq, gvl );
+                VSEV_FLOAT( out_scale, v_scale, gvl );
+                for( int a = 0; a < (gvl-1); ++a )
+                {
+                        int smallest = a;
+                        for( size_t b = a+1; b < gvl; ++b )
+                                if( out_scale[b] < out_scale[smallest] )
+                                        smallest = b;
+                        if( smallest != a )
+                        {
+                                FLOAT tmp1 = out_ssq[a];
+                                FLOAT tmp2 = out_scale[a];
+                                out_ssq[a] = out_ssq[smallest];
+                                out_scale[a] = out_scale[smallest];
+                                out_ssq[smallest] = tmp1;
+                                out_scale[smallest] = tmp2;
+                        }
+                }
+                // skip lanes whose scale is still 0 (they contribute nothing and would divide by zero below)
+                int a = 0;
+                while( a<gvl && out_scale[a] == 0 )
+                        ++a;
+                // fold the remaining lanes in ascending-scale order, rescaling ssq to each new scale
+                if( a < gvl )
+                {
+                        ssq = out_ssq[a];
+                        scale = out_scale[a];
+                        ++a;
+                        for( ; a < gvl; ++a )
+                        {
+                                ssq = ssq * ( scale / out_scale[a] ) * ( scale / out_scale[a] ) + out_ssq[a];
+                                scale = out_scale[a];
+                        }
+                }
+        }
+
+        //finish any tail using scalar ops
+        i*=gvl*inc_x;
+        n*=inc_x;
+        while(i < n){
+                if ( x[i] != 0.0 ){
+                        FLOAT absxi = ABS( x[i] );
+                        if ( scale < absxi ){
+                                ssq = 1 + ssq * ( scale / absxi ) * ( scale / absxi );
+                                scale = absxi ;
+                        }
+                        else{
+                                ssq += ( absxi/scale ) * ( absxi/scale );
+                        }
+
+                }
+
+                i += inc_x;
+        }
+
+        return(scale * sqrt(ssq));
+}
+
+
diff --git a/kernel/riscv64/nrm2_vector.c b/kernel/riscv64/nrm2_vector.c
index 2a83e2a52..5c03fbec7 100644
--- a/kernel/riscv64/nrm2_vector.c
+++ b/kernel/riscv64/nrm2_vector.c
@@ -26,207 +26,189 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 #include "common.h"
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m4_t
-#define VFMVFS_FLOATM4 vfmv_f_s_f32m4_f32
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT vfmacc_vv_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFDOTVV_FLOAT vfdot_vv_f32m4
-#define ABS fabsf
-#define MASK_T vbool8_t
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m4_m
-#define VMFGTVF_FLOAT vmfgt_vf_f32m4_b8
-#define VMFIRSTM vmfirst_m_b8
-#define VFDIVVF_FLOAT vfdiv_vf_f32m4
-#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1
+
+#ifdef RISCV64_ZVL256B /* build-time selection of the register grouping used by this kernel */
+#       define LMUL m1
+#       if defined(DOUBLE)
+#               define ELEN 64 /* element width in bits */
+#               define MLEN 64 /* suffix of the matching vboolN_t mask type (here ELEN / LMUL) */
+#       else
+#               define ELEN 32
+#               define MLEN 32
+#       endif
+#else
+#       define LMUL m4
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN 16 /* 64 / 4 */
+#       else
+#               define ELEN 32
+#               define MLEN 8 /* 32 / 4 */
+#       endif
+#endif
+/* Token-pasting helpers that assemble intrinsic and type names from ELEN, LMUL and MLEN. */
+#define _ /* expands to nothing; used as a padding argument when JOIN needs fewer than five parts */
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y) /* extra level so macro arguments are expanded before pasting */
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+
+#define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _)
+#define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
+#define FLOAT_V_T_M1    JOIN(vfloat,            ELEN,   m1,     _t,     _)
+#define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
+#define VFMVVF_FLOAT    JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   LMUL,   _)
+#define VFMVSF_FLOAT    JOIN(RISCV_RVV(vfmv),      _s_f_f, ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   m1,     _)
+#define MASK_T          JOIN(vbool,             MLEN,   _t,     _,      _)
+#define VFABS           JOIN(RISCV_RVV(vfabs),     _v_f,   ELEN,   LMUL,   _)
+#define VMFNE           JOIN(RISCV_RVV(vmfne_vf_f),ELEN,   LMUL,   _b,     MLEN)
+#define VMFGT           JOIN(RISCV_RVV(vmfgt_vv_f),ELEN,   LMUL,   _b,     MLEN)
+#define VMFEQ           JOIN(RISCV_RVV(vmfeq_vf_f),ELEN,   LMUL,   _b,     MLEN)
+#define VCPOP           JOIN(RISCV_RVV(vcpop),     _m_b,   MLEN,   _,      _)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFDIV_M         JOIN(RISCV_RVV(vfdiv),     _vv_f,  ELEN,   LMUL,   _m)
+#define VFMUL_M         JOIN(RISCV_RVV(vfmul),     _vv_f,  ELEN,   LMUL,   _m)
+#define VFMACC_M        JOIN(RISCV_RVV(vfmacc),    _vv_f,  ELEN,   LMUL,   _m)
+#define VMERGE(a, b, mask, gvl)       JOIN(RISCV_RVV(vmerge),    _vvm_f, ELEN,   LMUL,   _)(mask, a, b, gvl)
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m4_t
-#define VFMVFS_FLOATM4 vfmv_f_s_f64m4_f64
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT vfmacc_vv_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFDOTVV_FLOAT vfdot_vv_f64m4
+#define VFDIV_M         JOIN(RISCV_RVV(vfdiv),     _vv_f,  ELEN,   LMUL,   _mu)
+#define VFMUL_M         JOIN(RISCV_RVV(vfmul),     _vv_f,  ELEN,   LMUL,   _mu)
+#define VFMACC_M        JOIN(RISCV_RVV(vfmacc),    _vv_f,  ELEN,   LMUL,   _mu)
+#define VMERGE          JOIN(RISCV_RVV(vmerge),    _vvm_f, ELEN,   LMUL,   _)
+#endif
+#define VFIRST          JOIN(RISCV_RVV(vfirst),    _m_b,   MLEN,   _,      _)
+#define VRGATHER        JOIN(RISCV_RVV(vrgather),  _vx_f,  ELEN,   LMUL,   _)
+#define VFDIV           JOIN(RISCV_RVV(vfdiv),     _vv_f,  ELEN,   LMUL,   _)
+#define VFMUL           JOIN(RISCV_RVV(vfmul),     _vv_f,  ELEN,   LMUL,   _)
+#define VFMACC          JOIN(RISCV_RVV(vfmacc),    _vv_f,  ELEN,   LMUL,   _)
+#define VMSBF           JOIN(RISCV_RVV(vmsbf),     _m_b,   MLEN,   _,      _)
+#define VMSOF           JOIN(RISCV_RVV(vmsof),     _m_b,   MLEN,   _,      _)
+#define VMAND           JOIN(RISCV_RVV(vmand),     _mm_b,  MLEN,   _,      _)
+#define VMANDN          JOIN(RISCV_RVV(vmandn),    _mm_b,  MLEN,   _,      _)
+
+#define VSEV_FLOAT      JOIN(RISCV_RVV(vse),       ELEN,   _v_f,   ELEN,   LMUL)
+
+#if defined(DOUBLE)
 #define ABS fabs
-#define MASK_T vbool16_t
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m4_m
-#define VMFGTVF_FLOAT vmfgt_vf_f64m4_b16
-#define VMFIRSTM vmfirst_m_b16
-#define VFDIVVF_FLOAT vfdiv_vf_f64m4
-#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16
-#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1
+#else
+#define ABS fabsf
 #endif
 
+#define EXTRACT_FLOAT0_V(v) JOIN(RISCV_RVV(vfmv_f_s_f), ELEN, LMUL, _f, ELEN)(v)
+
+#define DUMP( label, v0, gvl ) /* debug dump disabled by default; swap with the next line to print vector lanes (its scratch buffer holds 16 elements only) */
+//#define DUMP( label, v0, gvl ) do{ FLOAT x[16]; VSEV_FLOAT( x, v0, gvl ); printf ("%s(%d): %s [ ", __FILE__, __LINE__, label); for( int xxx = 0; xxx < gvl; ++xxx ) { printf("%f, ", x[xxx]); } printf(" ]\n"); } while(0)
+
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
-	BLASLONG i=0, j=0;
+	BLASLONG i=0;
 
-	if ( n < 0 )  return(0.0);
+	if (n <= 0 || inc_x <= 0) return(0.0);
         if(n == 1) return (ABS(x[0]));
 
-        FLOAT_V_T vr, v0, v_zero;
         unsigned int gvl = 0;
-        FLOAT_V_T_M1 v_res, v_z0;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_z0 = VFMVVF_FLOAT_M1(0, gvl);
-
-        FLOAT scale = 0.0, ssq = 0.0;
-        MASK_T mask;
-        BLASLONG index = 0;
-        if(inc_x == 1){
-                gvl = VSETVL(n);
-                vr = VFMVVF_FLOAT(0, gvl);
-                v_zero = VFMVVF_FLOAT(0, gvl);
-                for(i=0,j=0; i<n/gvl; i++){
-                        v0 = VLEV_FLOAT(&x[j], gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(v0, 0, gvl);
-                        v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
-                        //if scale change
-                        mask = VMFGTVF_FLOAT(v0, scale, gvl);
-                        index = VMFIRSTM(mask, gvl);
-                        if(index == -1){//no elements greater than scale
-                                if(scale != 0.0){
-                                        v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                                        vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
-                                }
-                        }else{//found greater element
-                                //ssq in vector vr: vr[0]
-                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                //total ssq before current vector
-                                ssq += VFMVFS_FLOAT(v_res);
-                                //find max
-                                v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
-                                //update ssq before max_index
-                                ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
-                                //update scale
-                                scale = VFMVFS_FLOAT(v_res);
-                                //ssq in vector vr
-                                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                                vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
+        // Scaled sum of squares: every vector lane keeps its own (scale, ssq) pair; the pairs are merged after the loop and the result is scale*sqrt(ssq)
+        MASK_T nonzero_mask;
+        MASK_T scale_mask;
+
+        gvl = VSETVL(n);
+        FLOAT_V_T v0;
+        FLOAT_V_T v_ssq = VFMVVF_FLOAT(0, gvl);
+        FLOAT_V_T v_scale = VFMVVF_FLOAT(0, gvl);
+
+        FLOAT scale = 0;
+        FLOAT ssq = 0;
+        BLASLONG stride_x = inc_x * sizeof(FLOAT); // byte stride; kept in BLASLONG so a large inc_x is not truncated
+        BLASLONG idx = 0; // element offset of the current vector; BLASLONG so n*inc_x beyond the int range cannot overflow
+
+        if( n >= gvl ) // don't pay overheads if we're not doing useful work
+        {
+                for(i=0; i<n/gvl; i++){
+                        v0 = VLSEV_FLOAT( &x[idx], stride_x, gvl );
+                        nonzero_mask = VMFNE( v0, 0, gvl );
+                        v0 = VFABS( v0, gvl );
+                        scale_mask = VMFGT( v0, v_scale, gvl ); // lanes whose |x| exceeds that lane's current scale
+
+                        // assume scale changes are relatively infrequent
+
+                        // unclear if the vcpop+branch is actually a win
+                        // since the operations being skipped are predicated anyway
+                        // need profiling to confirm
+                        if( VCPOP(scale_mask, gvl) ) 
+                        {
+                                v_scale = VFDIV_M( scale_mask, v_scale, v_scale, v0, gvl ); // old_scale / |x|
+                                v_scale = VFMUL_M( scale_mask, v_scale, v_scale, v_scale, gvl ); // (old_scale / |x|)^2
+                                v_ssq = VFMUL_M( scale_mask, v_ssq, v_ssq, v_scale, gvl ); // rescale the running ssq to the new scale
+                                v_scale = VMERGE( v_scale, v0, scale_mask, gvl ); // adopt |x| as the new scale in those lanes
                         }
-                        j += gvl;
+                        v0 = VFDIV_M( nonzero_mask, v0, v0, v_scale, gvl ); // masked so lanes holding 0 never compute 0/0
+                        v_ssq = VFMACC_M( nonzero_mask, v_ssq, v0, v0, gvl );
+                        idx += inc_x * gvl;
                 }
-                //ssq in vector vr: vr[0]
-                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                //total ssq now
-                ssq += VFMVFS_FLOAT(v_res);
-
-                //tail
-                if(j < n){
-                        gvl = VSETVL(n-j);
-                        v0 = VLEV_FLOAT(&x[j], gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(v0, 0, gvl);
-                        v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
-                        //if scale change
-                        mask = VMFGTVF_FLOAT(v0, scale, gvl);
-                        index = VMFIRSTM(mask, gvl);
-                        if(index == -1){//no elements greater than scale
-                                if(scale != 0.0)
-                                        v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                        }else{//found greater element
-                                //find max
-                                v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
-                                //update ssq before max_index
-                                ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
-                                //update scale
-                                scale = VFMVFS_FLOAT(v_res);
-                                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+
+                // we have gvl elements which we accumulated independently, with independent scales
+                // we need to combine these
+                // naive sort so we process small values first to avoid losing information
+                // could use vector sort extensions where available, but we're dealing with gvl elts at most
+
+                FLOAT * out_ssq = alloca(gvl*sizeof(FLOAT));
+                FLOAT * out_scale = alloca(gvl*sizeof(FLOAT));
+                VSEV_FLOAT( out_ssq, v_ssq, gvl );
+                VSEV_FLOAT( out_scale, v_scale, gvl );
+                for( int a = 0; a < (gvl-1); ++a ) // selection sort of the (scale, ssq) pairs by ascending scale
+                {
+                        int smallest = a;
+                        for( size_t b = a+1; b < gvl; ++b )
+                                if( out_scale[b] < out_scale[smallest] )
+                                        smallest = b;
+                        if( smallest != a )
+                        {
+                                FLOAT tmp1 = out_ssq[a];
+                                FLOAT tmp2 = out_scale[a];
+                                out_ssq[a] = out_ssq[smallest];
+                                out_scale[a] = out_scale[smallest];
+                                out_ssq[smallest] = tmp1;
+                                out_scale[smallest] = tmp2;
                         }
-                        vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
-                        //ssq in vector vr: vr[0]
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                        //total ssq now
-                        ssq += VFMVFS_FLOAT(v_res);
                 }
-        }else{
-                gvl = VSETVL(n);
-                vr = VFMVVF_FLOAT(0, gvl);
-                v_zero = VFMVVF_FLOAT(0, gvl);
-                unsigned int stride_x = inc_x * sizeof(FLOAT);
-                int idx = 0, inc_v = inc_x * gvl;
-                for(i=0,j=0; i<n/gvl; i++){
-                        v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(v0, 0, gvl);
-                        v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
-                        //if scale change
-                        mask = VMFGTVF_FLOAT(v0, scale, gvl);
-                        index = VMFIRSTM(mask, gvl);
-                        if(index == -1){//no elements greater than scale
-                                if(scale != 0.0){
-                                        v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                                        vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
-                                }
-                        }else{//found greater element
-                                //ssq in vector vr: vr[0]
-                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                //total ssq before current vector
-                                ssq += VFMVFS_FLOAT(v_res);
-                                //find max
-                                v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
-                                //update ssq before max_index
-                                ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
-                                //update scale
-                                scale = VFMVFS_FLOAT(v_res);
-                                //ssq in vector vr
-                                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                                vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
+                // lanes whose scale is still 0 contributed nothing; skip them, then fold the rest in ascending-scale order
+                int a = 0;
+                while( a<gvl && out_scale[a] == 0 )
+                        ++a;
+
+                if( a < gvl ) 
+                {
+                        ssq = out_ssq[a];
+                        scale = out_scale[a];
+                        ++a;
+                        for( ; a < gvl; ++a ) 
+                        {
+                                ssq = ssq * ( scale / out_scale[a] ) * ( scale / out_scale[a] ) + out_ssq[a];
+                                scale = out_scale[a];
                         }
-                        j += gvl;
-                        idx += inc_v;
                 }
-                //ssq in vector vr: vr[0]
-                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                //total ssq now
-                ssq += VFMVFS_FLOAT(v_res);
-
-                //tail
-                if(j < n){
-                        gvl = VSETVL(n-j);
-                        v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(v0, 0, gvl);
-                        v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
-                        //if scale change
-                        mask = VMFGTVF_FLOAT(v0, scale, gvl);
-                        index = VMFIRSTM(mask, gvl);
-                        if(index == -1){//no elements greater than scale
-                                if(scale != 0.0)
-                                        v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                        }else{//found greater element
-                                //find max
-                                v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
-                                //update ssq before max_index
-                                ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
-                                //update scale
-                                scale = VFMVFS_FLOATM4(vr);
-                                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+        }
+
+        //finish any tail using scalar ops
+        i*=gvl*inc_x; // i counted whole vectors; convert it to the element offset where the vector loop stopped
+        n*=inc_x; // one-past-the-end element offset
+        while(i < n){
+                if ( x[i] != 0.0 ){
+                        FLOAT absxi = ABS( x[i] );
+                        if ( scale < absxi ){
+                                ssq = 1 + ssq * ( scale / absxi ) * ( scale / absxi );
+                                scale = absxi ;
+                        }
+                        else{
+                                ssq += ( absxi/scale ) * ( absxi/scale );
                         }
-                        vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
-                        //ssq in vector vr: vr[0]
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                        //total ssq now
-                        ssq += VFMVFS_FLOAT(v_res);
+
                 }
+
+                i += inc_x;
         }
+
 	return(scale * sqrt(ssq));
 }
 
diff --git a/kernel/riscv64/nrm2_vector_dot.c b/kernel/riscv64/nrm2_vector_dot.c
index a8b2b7c66..dfa13a6f5 100644
--- a/kernel/riscv64/nrm2_vector_dot.c
+++ b/kernel/riscv64/nrm2_vector_dot.c
@@ -31,9 +31,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e32m1()
 #define FLOAT_V_T vfloat32m8_t
 #define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT vle32_v_f32m8
+#define VLSEV_FLOAT vlse32_v_f32m8
 #define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
 #define VFREDSUM_FLOAT vfredusum_vs_f32m8_f32m1
 #define VFMACCVV_FLOAT vfmacc_vv_f32m8
 #define VFMVVF_FLOAT vfmv_v_f_f32m8
@@ -45,9 +45,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define VSETVL_MAX vsetvlmax_e64m1()
 #define FLOAT_V_T vfloat64m8_t
 #define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT vle64_v_f64m8
+#define VLSEV_FLOAT vlse64_v_f64m8
 #define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
 #define VFREDSUM_FLOAT vfredusum_vs_f64m8_f64m1
 #define VFMACCVV_FLOAT vfmacc_vv_f64m8
 #define VFMVVF_FLOAT vfmv_v_f_f64m8
@@ -61,7 +61,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	BLASLONG i=0, j=0;
 	double len = 0.0 ;
 
-	if ( n < 0 )  return(0.0);
+	if ( n <= 0 )  return(0.0);
         if(n == 1) return (ABS(x[0]));
 
         FLOAT_V_T vr, v0, v1;
diff --git a/kernel/riscv64/rot_rvv.c b/kernel/riscv64/rot_rvv.c
new file mode 100644
index 000000000..90f81d5e2
--- /dev/null
+++ b/kernel/riscv64/rot_rvv.c
@@ -0,0 +1,149 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE) /* single precision: 32-bit elements, register group m8 */
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define FLOAT_V_T               vfloat32m8_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8 /* unit-stride load */
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8 /* strided load */
+#define VSEV_FLOAT              __riscv_vse32_v_f32m8 /* unit-stride store */
+#define VSSEV_FLOAT             __riscv_vsse32_v_f32m8 /* strided store */
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m8
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m8
+#define VFMSACVF_FLOAT          __riscv_vfmsac_vf_f32m8
+#else /* double precision: 64-bit elements, register group m8 */
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define FLOAT_V_T               vfloat64m8_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VSEV_FLOAT              __riscv_vse64_v_f64m8
+#define VSSEV_FLOAT             __riscv_vsse64_v_f64m8
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m8
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m8
+#define VFMSACVF_FLOAT          __riscv_vfmsac_vf_f64m8
+#endif /* !defined(DOUBLE) */
+
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
+{
+    /* Plane rotation applied in place: x[i] <- c*x[i] + s*y[i], y[i] <- c*y[i] - s*x[i] for n elements; always returns 0. */
+    if(n <= 0) return(0);
+
+    FLOAT_V_T v0, v1, vx, vy;
+    /* A zero stride is handled element by element (presumably because every iteration then touches the same element). */
+    if (inc_x == 0 || inc_y == 0) {
+        BLASLONG i=0;
+        BLASLONG ix=0,iy=0;
+        FLOAT temp;
+        while(i < n)
+        {
+            temp   = c*x[ix] + s*y[iy] ;
+            y[iy]  = c*y[iy] - s*x[ix] ;
+            x[ix]  = temp ;
+
+            ix += inc_x ;
+            iy += inc_y ;
+            i++ ;
+        }
+    }
+    else if(inc_x == 1 && inc_y == 1) {
+
+        for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
+            vl = VSETVL(n);
+
+            vx = VLEV_FLOAT(x, vl);
+            vy = VLEV_FLOAT(y, vl);
+
+            v0 = VFMULVF_FLOAT(vx, c, vl);
+            v0 = VFMACCVF_FLOAT(v0, s, vy, vl); /* c*x + s*y */
+            VSEV_FLOAT(x, v0, vl);
+
+            v1 = VFMULVF_FLOAT(vx, s, vl);
+            v1 = VFMSACVF_FLOAT(v1, c, vy, vl); /* c*y - s*x */
+            VSEV_FLOAT(y, v1, vl);
+        }
+
+    } else if(inc_y == 1) {
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+        /* the pointer step is computed in signed BLASLONG so a negative inc_x is not converted to a huge size_t */
+        for (size_t vl; n > 0; n -= vl, x += (BLASLONG)vl*inc_x, y += vl) {
+            vl = VSETVL(n);
+
+            vx = VLSEV_FLOAT(x, stride_x, vl);
+            vy = VLEV_FLOAT(y, vl);
+
+            v0 = VFMULVF_FLOAT(vx, c, vl);
+            v0 = VFMACCVF_FLOAT(v0, s, vy, vl);
+            VSSEV_FLOAT(x, stride_x, v0, vl);
+
+            v1 = VFMULVF_FLOAT(vx, s, vl);
+            v1 = VFMSACVF_FLOAT(v1, c, vy, vl);
+            VSEV_FLOAT(y, v1, vl);
+        }
+ 
+    } else if(inc_x == 1) {
+        BLASLONG stride_y = inc_y * sizeof(FLOAT);
+        /* signed pointer step, see the inc_y == 1 branch */
+        for (size_t vl; n > 0; n -= vl, x += vl, y += (BLASLONG)vl*inc_y) {
+            vl = VSETVL(n);
+
+            vx = VLEV_FLOAT(x, vl);
+            vy = VLSEV_FLOAT(y, stride_y, vl);
+
+            v0 = VFMULVF_FLOAT(vx, c, vl);
+            v0 = VFMACCVF_FLOAT(v0, s, vy, vl);
+            VSEV_FLOAT(x, v0, vl);
+
+            v1 = VFMULVF_FLOAT(vx, s, vl);
+            v1 = VFMSACVF_FLOAT(v1, c, vy, vl);
+            VSSEV_FLOAT(y, stride_y, v1, vl);
+        }
+
+    } else {
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+        BLASLONG stride_y = inc_y * sizeof(FLOAT);
+        /* both vectors strided; pointer steps are signed so negative increments walk backwards */
+        for (size_t vl; n > 0; n -= vl, x += (BLASLONG)vl*inc_x, y += (BLASLONG)vl*inc_y) {
+            vl = VSETVL(n);
+ 
+            vx = VLSEV_FLOAT(x, stride_x, vl);
+            vy = VLSEV_FLOAT(y, stride_y, vl);
+
+            v0 = VFMULVF_FLOAT(vx, c, vl);
+            v0 = VFMACCVF_FLOAT(v0, s, vy, vl);
+            VSSEV_FLOAT(x, stride_x, v0, vl);
+
+            v1 = VFMULVF_FLOAT(vx, s, vl);
+            v1 = VFMSACVF_FLOAT(v1, c, vy, vl);
+            VSSEV_FLOAT(y, stride_y, v1, vl);
+        }
+
+    }
+    
+    return(0);
+}
diff --git a/kernel/riscv64/rot_vector.c b/kernel/riscv64/rot_vector.c
index 9b48d1c69..649d9bb94 100644
--- a/kernel/riscv64/rot_vector.c
+++ b/kernel/riscv64/rot_vector.c
@@ -28,27 +28,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)()
 #define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSEV_FLOAT vse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
-#define VFMULVF_FLOAT vfmul_vf_f32m4
-#define VFMSACVF_FLOAT vfmsac_vf_f32m4
+#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
+#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4)
+#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4)
+#define VFMSACVF_FLOAT RISCV_RVV(vfmsac_vf_f32m4)
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)()
 #define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSEV_FLOAT vse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
-#define VFMULVF_FLOAT vfmul_vf_f64m4
-#define VFMSACVF_FLOAT vfmsac_vf_f64m4
+#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
+#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
+#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4)
+#define VFMSACVF_FLOAT RISCV_RVV(vfmsac_vf_f64m4)
 #endif
 
 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
@@ -57,11 +57,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
 	BLASLONG ix=0,iy=0;
 
 	if(n <= 0)  return(0);
-        unsigned int gvl = 0;
+        unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1);
         FLOAT_V_T v0, v1, vx, vy;
 
         if(inc_x == 1 && inc_y == 1){
-                gvl = VSETVL(n);
                 for(i=0,j=0; i<n/gvl; i++){
                         vx = VLEV_FLOAT(&x[j], gvl);
                         vy = VLEV_FLOAT(&y[j], gvl);
@@ -90,7 +89,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
                         VSEV_FLOAT(&y[j], v1, gvl);
                 }
         }else if(inc_y == 1){
-                gvl = VSETVL(n);
                 BLASLONG stride_x = inc_x * sizeof(FLOAT);
                 BLASLONG inc_xv = inc_x * gvl;
                 for(i=0,j=0; i<n/gvl; i++){
@@ -122,7 +120,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
                         VSEV_FLOAT(&y[j], v1, gvl);
                 }
         }else if(inc_x == 1){
-                gvl = VSETVL(n);
                 BLASLONG stride_y = inc_y * sizeof(FLOAT);
                 BLASLONG inc_yv = inc_y * gvl;
                 for(i=0,j=0; i<n/gvl; i++){
@@ -154,7 +151,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
                         VSSEV_FLOAT(&y[j*inc_y], stride_y, v1, gvl);
                 }
         }else{
-                gvl = VSETVL(n);
                 BLASLONG stride_x = inc_x * sizeof(FLOAT);
                 BLASLONG stride_y = inc_y * sizeof(FLOAT);
                 BLASLONG inc_xv = inc_x * gvl;
diff --git a/kernel/riscv64/scal_rvv.c b/kernel/riscv64/scal_rvv.c
new file mode 100644
index 000000000..2c273fb63
--- /dev/null
+++ b/kernel/riscv64/scal_rvv.c
@@ -0,0 +1,97 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE) /* single precision: 32-bit elements, register group m8 */
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m8()
+#define FLOAT_V_T               vfloat32m8_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8 /* unit-stride load */
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8 /* strided load */
+#define VSEV_FLOAT              __riscv_vse32_v_f32m8 /* unit-stride store */
+#define VSSEV_FLOAT             __riscv_vsse32_v_f32m8 /* strided store */
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
+#else /* double precision: 64-bit elements, register group m8 */
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m8()
+#define FLOAT_V_T               vfloat64m8_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VSEV_FLOAT              __riscv_vse64_v_f64m8
+#define VSSEV_FLOAT             __riscv_vsse64_v_f64m8
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
+#endif /* !defined(DOUBLE) */
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+    if ( (n <= 0) || (inc_x <= 0)) return(0); /* empty input or non-positive stride: x is left untouched */
+
+    FLOAT_V_T v0;
+    /* Scales x in place (x[i] = da * x[i]) and returns 0; y, inc_y and the dummy arguments are not referenced in this body. */
+    if(inc_x == 1) {
+        if(da == 0.0) { /* zero fill: x is overwritten without being read */
+            int gvl = VSETVL_MAX;
+            v0 = VFMVVF_FLOAT(0.0, gvl); /* all-zero vector built once, stored repeatedly below */
+            for (size_t vl; n > 0; n -= vl, x += vl) {
+                vl = VSETVL(n);
+                VSEV_FLOAT(x, v0, vl);
+            }
+        }
+        else {
+            for (size_t vl; n > 0; n -= vl, x += vl) {
+                vl = VSETVL(n); /* elements handled this pass, at most n */
+                v0 = VLEV_FLOAT(x, vl);
+                v0 = VFMULVF_FLOAT(v0, da, vl);
+                VSEV_FLOAT(x, v0, vl);
+            }
+        }
+    } else {
+        BLASLONG stride_x = inc_x * sizeof(FLOAT); /* stride in bytes for the strided load/store */
+
+        if(da == 0.0) { /* strided zero fill, again without reading x */
+            int gvl = VSETVL_MAX;
+            v0 = VFMVVF_FLOAT(0.0, gvl);
+            for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
+                vl = VSETVL(n);
+                VSSEV_FLOAT(x, stride_x, v0, vl);
+            }
+        }
+        else {
+            for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
+                vl = VSETVL(n);
+                v0 = VLSEV_FLOAT(x, stride_x, vl);
+                v0 = VFMULVF_FLOAT(v0, da, vl);
+                VSSEV_FLOAT(x, stride_x, v0, vl);
+            }
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/scal_vector.c b/kernel/riscv64/scal_vector.c
index 7a3153b7c..8fa9315f6 100644
--- a/kernel/riscv64/scal_vector.c
+++ b/kernel/riscv64/scal_vector.c
@@ -26,28 +26,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 #include "common.h"
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
-#define VSEV_FLOAT vse_v_f32m8
-#define VSSEV_FLOAT vsse_v_f32m8
-#define VFMULVF_FLOAT vfmul_vf_f32m8
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
+
+#ifdef RISCV64_ZVL256B   /* presumably set for targets guaranteeing VLEN >= 256 bits -- confirm against the build config */
+#       define LMUL m2   /* register-group suffix pasted into the intrinsic/type names below */
+#       if defined(DOUBLE)
+#               define ELEN 64   /* element width in bits */
+#               define MLEN 32   /* ELEN / LMUL = 64 / 2 */
+#       else
+#               define ELEN 32
+#               define MLEN 16   /* ELEN / LMUL = 32 / 2 */
+#       endif
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m8_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
-#define VSEV_FLOAT vse_v_f64m8
-#define VSSEV_FLOAT vsse_v_f64m8
-#define VFMULVF_FLOAT vfmul_vf_f64m8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
+#       define LMUL m8   /* default configuration: group of eight vector registers */
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN 8    /* ELEN / LMUL = 64 / 8 */
+#       else
+#               define ELEN 32
+#               define MLEN 4    /* ELEN / LMUL = 32 / 8 */
+#       endif
 #endif
 
+#define _ /* expands to nothing: an empty placeholder for unused JOIN slots */
+#define JOIN2_X(x, y) x ## y /* raw token paste */
+#define JOIN2(x, y) JOIN2_X(x, y) /* extra level so x and y are macro-expanded before being pasted */
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z) /* paste five pieces, left to right */
+/* Names assembled from ELEN/LMUL, e.g. FLOAT_V_T -> vfloat32m8_t for ELEN=32, LMUL=m8. RISCV_RVV() is defined elsewhere (presumably adds the intrinsic prefix -- confirm). */
+#define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _)
+#define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
+#define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
+#define VSEV_FLOAT      JOIN(RISCV_RVV(vse),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VSSEV_FLOAT     JOIN(RISCV_RVV(vsse),      ELEN,   _v_f,   ELEN,   LMUL)
+#define VFMVVF_FLOAT    JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   LMUL,   _)
+#define VFMULVF_FLOAT   JOIN(RISCV_RVV(vfmul),     _vf_f,  ELEN,   LMUL,   _)
+
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
 {
 	BLASLONG i=0,j=0;
@@ -84,25 +97,25 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
                 }
         }else{
                 if(da == 0.0){
+                        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+                        BLASLONG ix = 0;
                         gvl = VSETVL(n);
-						BLASLONG stride_x = inc_x * sizeof(FLOAT);
-						BLASLONG ix = 0;
-                        if(gvl <= n / 2){
-							    long int inc_xv = gvl * inc_x;
-                                v0 = VFMVVF_FLOAT(0, gvl);
-                                for(i = 0, j = 0; i < n/(2*gvl); i++, j+=2*gvl){
-									VSSEV_FLOAT(&x[ix], stride_x, v0, gvl);
-									VSSEV_FLOAT(&x[ix + inc_xv], stride_x, v0, gvl);
-									ix += inc_xv * 2;
-                                }
+                        v0 = VFMVVF_FLOAT(0, gvl);
+
+                        for(i = 0; i < n/(gvl*2); ++i ){
+                                VSSEV_FLOAT(&x[ix], stride_x, v0, gvl);
+                                ix += inc_x * gvl;
+                                VSSEV_FLOAT(&x[ix], stride_x, v0, gvl);
+                                ix += inc_x * gvl;
                         }
-                        //tail
-                        for(; j <n; ){
-                                gvl = VSETVL(n-j);
+
+                        i *= gvl*2;
+                        while( i < n ){
+                                gvl = VSETVL(n-i);
                                 v0 = VFMVVF_FLOAT(0, gvl);
-								VSSEV_FLOAT(&x[ix], stride_x, v0, gvl);
-                                j += gvl;
-								ix += inc_x * gvl;
+                                VSSEV_FLOAT(&x[ix], stride_x, v0, gvl);
+                                i += gvl;
+                                ix += inc_x * gvl;
                         }
                 }else{
                         gvl = VSETVL(n);
diff --git a/kernel/riscv64/sgemm_kernel_16x8_zvl256b.c b/kernel/riscv64/sgemm_kernel_16x8_zvl256b.c
new file mode 100644
index 000000000..e22df34f9
--- /dev/null
+++ b/kernel/riscv64/sgemm_kernel_16x8_zvl256b.c
@@ -0,0 +1,1081 @@
+/*
+
+AUTOGENERATED KERNEL
+Settings:
+ LMUL=1
+ M=16
+ M_tail_scalar_from=2
+ N=8
+ __riscv_='__riscv_'
+ complex=False
+ conjugate=False
+ cpu='zvl256b'
+ force_acc_double=False
+ index_type='BLASLONG'
+ op='gemm'
+ param_precision='float'
+ reg_width_bits=256
+ tail_policy=''
+ trace=False
+
+Derived:
+ ELEN_ACC=32
+ ELEN_PARAM=32
+ LMUL_ACC=1
+ VFMACC='__riscv_vfmacc_vf_f32m1'
+ VFMUL='__riscv_vfmul_vf_f32m1'
+ VLEV='__riscv_vle32_v_f32m1'
+ VLSEV='__riscv_vlse32_v_f32m1'
+ VMACC_TO_ACC='__riscv_vfmacc_vf_f32m1'
+ VMUL_TO_ACC='__riscv_vfmul_vf_f32m1'
+ VSETVL='__riscv_vsetvl_e32m1'
+ VSEV='__riscv_vse32_v_f32m1'
+ VSSEV='__riscv_vsse32_v_f32m1'
+ acc_vector_t='vfloat32m1_t'
+ output='sgemm_kernel_16x8_zvl256b.c'
+ param_scalar_t='float'
+ param_vector_t='vfloat32m1_t'
+
+*/
+
+#include "common.h"
+
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc)
+
+{
+    BLASLONG gvl = 0;
+    BLASLONG m_top = 0;
+    BLASLONG n_top = 0;
+
+
+    // -- MAIN PASS
+
+    for (BLASLONG j=0; j<N/8; j+=1) {
+        m_top = 0;
+        BLASLONG gvl = __riscv_vsetvl_e32m1(8);
+
+
+        for (BLASLONG i=0; i<M/16; i+=1) {
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            float B2 = B[bi+2];
+            float B3 = B[bi+3];
+            float B4 = B[bi+4];
+            float B5 = B[bi+5];
+            float B6 = B[bi+6];
+            float B7 = B[bi+7];
+            bi += 8;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            vfloat32m1_t A1 = __riscv_vle32_v_f32m1( &A[ai+1*gvl], gvl );
+            ai += 16;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A1, B0, gvl);
+            vfloat32m1_t result2 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+            vfloat32m1_t result3 = __riscv_vfmul_vf_f32m1( A1, B1, gvl);
+            vfloat32m1_t result4 = __riscv_vfmul_vf_f32m1( A0, B2, gvl);
+            vfloat32m1_t result5 = __riscv_vfmul_vf_f32m1( A1, B2, gvl);
+            vfloat32m1_t result6 = __riscv_vfmul_vf_f32m1( A0, B3, gvl);
+            vfloat32m1_t result7 = __riscv_vfmul_vf_f32m1( A1, B3, gvl);
+            vfloat32m1_t result8 = __riscv_vfmul_vf_f32m1( A0, B4, gvl);
+            vfloat32m1_t result9 = __riscv_vfmul_vf_f32m1( A1, B4, gvl);
+            vfloat32m1_t result10 = __riscv_vfmul_vf_f32m1( A0, B5, gvl);
+            vfloat32m1_t result11 = __riscv_vfmul_vf_f32m1( A1, B5, gvl);
+            vfloat32m1_t result12 = __riscv_vfmul_vf_f32m1( A0, B6, gvl);
+            vfloat32m1_t result13 = __riscv_vfmul_vf_f32m1( A1, B6, gvl);
+            vfloat32m1_t result14 = __riscv_vfmul_vf_f32m1( A0, B7, gvl);
+            vfloat32m1_t result15 = __riscv_vfmul_vf_f32m1( A1, B7, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                B4 = B[bi+4];
+                B5 = B[bi+5];
+                B6 = B[bi+6];
+                B7 = B[bi+7];
+                bi += 8;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                A1 = __riscv_vle32_v_f32m1( &A[ai+1*gvl], gvl );
+                ai += 16;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B0, A1, gvl);
+                result2 = __riscv_vfmacc_vf_f32m1( result2, B1, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m1( result3, B1, A1, gvl);
+                result4 = __riscv_vfmacc_vf_f32m1( result4, B2, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f32m1( result5, B2, A1, gvl);
+                result6 = __riscv_vfmacc_vf_f32m1( result6, B3, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f32m1( result7, B3, A1, gvl);
+                result8 = __riscv_vfmacc_vf_f32m1( result8, B4, A0, gvl);
+                result9 = __riscv_vfmacc_vf_f32m1( result9, B4, A1, gvl);
+                result10 = __riscv_vfmacc_vf_f32m1( result10, B5, A0, gvl);
+                result11 = __riscv_vfmacc_vf_f32m1( result11, B5, A1, gvl);
+                result12 = __riscv_vfmacc_vf_f32m1( result12, B6, A0, gvl);
+                result13 = __riscv_vfmacc_vf_f32m1( result13, B6, A1, gvl);
+                result14 = __riscv_vfmacc_vf_f32m1( result14, B7, A0, gvl);
+                result15 = __riscv_vfmacc_vf_f32m1( result15, B7, A1, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += gvl;
+            vfloat32m1_t c1 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat32m1_t c2 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += gvl;
+            vfloat32m1_t c3 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat32m1_t c4 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += gvl;
+            vfloat32m1_t c5 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat32m1_t c6 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += gvl;
+            vfloat32m1_t c7 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat32m1_t c8 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += gvl;
+            vfloat32m1_t c9 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat32m1_t c10 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += gvl;
+            vfloat32m1_t c11 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat32m1_t c12 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += gvl;
+            vfloat32m1_t c13 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat32m1_t c14 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += gvl;
+            vfloat32m1_t c15 = __riscv_vle32_v_f32m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f32m1( c1, alpha, result1, gvl );
+            c2 = __riscv_vfmacc_vf_f32m1( c2, alpha, result2, gvl );
+            c3 = __riscv_vfmacc_vf_f32m1( c3, alpha, result3, gvl );
+            c4 = __riscv_vfmacc_vf_f32m1( c4, alpha, result4, gvl );
+            c5 = __riscv_vfmacc_vf_f32m1( c5, alpha, result5, gvl );
+            c6 = __riscv_vfmacc_vf_f32m1( c6, alpha, result6, gvl );
+            c7 = __riscv_vfmacc_vf_f32m1( c7, alpha, result7, gvl );
+            c8 = __riscv_vfmacc_vf_f32m1( c8, alpha, result8, gvl );
+            c9 = __riscv_vfmacc_vf_f32m1( c9, alpha, result9, gvl );
+            c10 = __riscv_vfmacc_vf_f32m1( c10, alpha, result10, gvl );
+            c11 = __riscv_vfmacc_vf_f32m1( c11, alpha, result11, gvl );
+            c12 = __riscv_vfmacc_vf_f32m1( c12, alpha, result12, gvl );
+            c13 = __riscv_vfmacc_vf_f32m1( c13, alpha, result13, gvl );
+            c14 = __riscv_vfmacc_vf_f32m1( c14, alpha, result14, gvl );
+            c15 = __riscv_vfmacc_vf_f32m1( c15, alpha, result15, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c2, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c3, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c4, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c5, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c6, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c7, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c8, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c9, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c10, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c11, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c12, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c13, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c14, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c15, gvl);
+            m_top += 16;
+        }
+
+
+
+        // -- tails for main pass
+
+        if( M & 8 ) {
+            gvl = __riscv_vsetvl_e32m1(8);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            float B2 = B[bi+2];
+            float B3 = B[bi+3];
+            float B4 = B[bi+4];
+            float B5 = B[bi+5];
+            float B6 = B[bi+6];
+            float B7 = B[bi+7];
+            bi += 8;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            ai += 8;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+            vfloat32m1_t result2 = __riscv_vfmul_vf_f32m1( A0, B2, gvl);
+            vfloat32m1_t result3 = __riscv_vfmul_vf_f32m1( A0, B3, gvl);
+            vfloat32m1_t result4 = __riscv_vfmul_vf_f32m1( A0, B4, gvl);
+            vfloat32m1_t result5 = __riscv_vfmul_vf_f32m1( A0, B5, gvl);
+            vfloat32m1_t result6 = __riscv_vfmul_vf_f32m1( A0, B6, gvl);
+            vfloat32m1_t result7 = __riscv_vfmul_vf_f32m1( A0, B7, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                B4 = B[bi+4];
+                B5 = B[bi+5];
+                B6 = B[bi+6];
+                B7 = B[bi+7];
+                bi += 8;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f32m1( result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m1( result3, B3, A0, gvl);
+                result4 = __riscv_vfmacc_vf_f32m1( result4, B4, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f32m1( result5, B5, A0, gvl);
+                result6 = __riscv_vfmacc_vf_f32m1( result6, B6, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f32m1( result7, B7, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c1 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c2 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c3 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c4 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c5 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c6 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c7 = __riscv_vle32_v_f32m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f32m1( c1, alpha, result1, gvl );
+            c2 = __riscv_vfmacc_vf_f32m1( c2, alpha, result2, gvl );
+            c3 = __riscv_vfmacc_vf_f32m1( c3, alpha, result3, gvl );
+            c4 = __riscv_vfmacc_vf_f32m1( c4, alpha, result4, gvl );
+            c5 = __riscv_vfmacc_vf_f32m1( c5, alpha, result5, gvl );
+            c6 = __riscv_vfmacc_vf_f32m1( c6, alpha, result6, gvl );
+            c7 = __riscv_vfmacc_vf_f32m1( c7, alpha, result7, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c3, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c4, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c5, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c6, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c7, gvl);
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e32m1(4);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            float B2 = B[bi+2];
+            float B3 = B[bi+3];
+            float B4 = B[bi+4];
+            float B5 = B[bi+5];
+            float B6 = B[bi+6];
+            float B7 = B[bi+7];
+            bi += 8;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            ai += 4;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+            vfloat32m1_t result2 = __riscv_vfmul_vf_f32m1( A0, B2, gvl);
+            vfloat32m1_t result3 = __riscv_vfmul_vf_f32m1( A0, B3, gvl);
+            vfloat32m1_t result4 = __riscv_vfmul_vf_f32m1( A0, B4, gvl);
+            vfloat32m1_t result5 = __riscv_vfmul_vf_f32m1( A0, B5, gvl);
+            vfloat32m1_t result6 = __riscv_vfmul_vf_f32m1( A0, B6, gvl);
+            vfloat32m1_t result7 = __riscv_vfmul_vf_f32m1( A0, B7, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                B4 = B[bi+4];
+                B5 = B[bi+5];
+                B6 = B[bi+6];
+                B7 = B[bi+7];
+                bi += 8;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f32m1( result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m1( result3, B3, A0, gvl);
+                result4 = __riscv_vfmacc_vf_f32m1( result4, B4, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f32m1( result5, B5, A0, gvl);
+                result6 = __riscv_vfmacc_vf_f32m1( result6, B6, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f32m1( result7, B7, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c1 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c2 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c3 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c4 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c5 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c6 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c7 = __riscv_vle32_v_f32m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f32m1( c1, alpha, result1, gvl );
+            c2 = __riscv_vfmacc_vf_f32m1( c2, alpha, result2, gvl );
+            c3 = __riscv_vfmacc_vf_f32m1( c3, alpha, result3, gvl );
+            c4 = __riscv_vfmacc_vf_f32m1( c4, alpha, result4, gvl );
+            c5 = __riscv_vfmacc_vf_f32m1( c5, alpha, result5, gvl );
+            c6 = __riscv_vfmacc_vf_f32m1( c6, alpha, result6, gvl );
+            c7 = __riscv_vfmacc_vf_f32m1( c7, alpha, result7, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c3, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c4, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c5, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c6, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c7, gvl);
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            float result8 = 0;
+            float result9 = 0;
+            float result10 = 0;
+            float result11 = 0;
+            float result12 = 0;
+            float result13 = 0;
+            float result14 = 0;
+            float result15 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+1]*B[bi+0];
+                result2+=A[ai+0]*B[bi+1];
+                result3+=A[ai+1]*B[bi+1];
+                result4+=A[ai+0]*B[bi+2];
+                result5+=A[ai+1]*B[bi+2];
+                result6+=A[ai+0]*B[bi+3];
+                result7+=A[ai+1]*B[bi+3];
+                result8+=A[ai+0]*B[bi+4];
+                result9+=A[ai+1]*B[bi+4];
+                result10+=A[ai+0]*B[bi+5];
+                result11+=A[ai+1]*B[bi+5];
+                result12+=A[ai+0]*B[bi+6];
+                result13+=A[ai+1]*B[bi+6];
+                result14+=A[ai+0]*B[bi+7];
+                result15+=A[ai+1]*B[bi+7];
+                ai+=2;
+                bi+=8;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] += alpha * result0;
+            C[ci+0*ldc+1] += alpha * result1;
+            C[ci+1*ldc+0] += alpha * result2;
+            C[ci+1*ldc+1] += alpha * result3;
+            C[ci+2*ldc+0] += alpha * result4;
+            C[ci+2*ldc+1] += alpha * result5;
+            C[ci+3*ldc+0] += alpha * result6;
+            C[ci+3*ldc+1] += alpha * result7;
+            C[ci+4*ldc+0] += alpha * result8;
+            C[ci+4*ldc+1] += alpha * result9;
+            C[ci+5*ldc+0] += alpha * result10;
+            C[ci+5*ldc+1] += alpha * result11;
+            C[ci+6*ldc+0] += alpha * result12;
+            C[ci+6*ldc+1] += alpha * result13;
+            C[ci+7*ldc+0] += alpha * result14;
+            C[ci+7*ldc+1] += alpha * result15;
+            m_top+=2;
+        }
+
+
+        if( M & 1 ) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+0]*B[bi+1];
+                result2+=A[ai+0]*B[bi+2];
+                result3+=A[ai+0]*B[bi+3];
+                result4+=A[ai+0]*B[bi+4];
+                result5+=A[ai+0]*B[bi+5];
+                result6+=A[ai+0]*B[bi+6];
+                result7+=A[ai+0]*B[bi+7];
+                ai+=1;
+                bi+=8;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] += alpha * result0;
+            C[ci+1*ldc+0] += alpha * result1;
+            C[ci+2*ldc+0] += alpha * result2;
+            C[ci+3*ldc+0] += alpha * result3;
+            C[ci+4*ldc+0] += alpha * result4;
+            C[ci+5*ldc+0] += alpha * result5;
+            C[ci+6*ldc+0] += alpha * result6;
+            C[ci+7*ldc+0] += alpha * result7;
+            m_top+=1;
+        }
+
+        n_top += 8;
+    }
+
+
+
+    // -- tails for N=4
+
+    if( N & 4 ) {
+        gvl = __riscv_vsetvl_e32m1(8);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/16; i+=1) {
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            float B2 = B[bi+2];
+            float B3 = B[bi+3];
+            bi += 4;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            vfloat32m1_t A1 = __riscv_vle32_v_f32m1( &A[ai+1*gvl], gvl );
+            ai += 16;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A1, B0, gvl);
+            vfloat32m1_t result2 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+            vfloat32m1_t result3 = __riscv_vfmul_vf_f32m1( A1, B1, gvl);
+            vfloat32m1_t result4 = __riscv_vfmul_vf_f32m1( A0, B2, gvl);
+            vfloat32m1_t result5 = __riscv_vfmul_vf_f32m1( A1, B2, gvl);
+            vfloat32m1_t result6 = __riscv_vfmul_vf_f32m1( A0, B3, gvl);
+            vfloat32m1_t result7 = __riscv_vfmul_vf_f32m1( A1, B3, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                bi += 4;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                A1 = __riscv_vle32_v_f32m1( &A[ai+1*gvl], gvl );
+                ai += 16;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B0, A1, gvl);
+                result2 = __riscv_vfmacc_vf_f32m1( result2, B1, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m1( result3, B1, A1, gvl);
+                result4 = __riscv_vfmacc_vf_f32m1( result4, B2, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f32m1( result5, B2, A1, gvl);
+                result6 = __riscv_vfmacc_vf_f32m1( result6, B3, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f32m1( result7, B3, A1, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += gvl;
+            vfloat32m1_t c1 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat32m1_t c2 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += gvl;
+            vfloat32m1_t c3 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat32m1_t c4 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += gvl;
+            vfloat32m1_t c5 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat32m1_t c6 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += gvl;
+            vfloat32m1_t c7 = __riscv_vle32_v_f32m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f32m1( c1, alpha, result1, gvl );
+            c2 = __riscv_vfmacc_vf_f32m1( c2, alpha, result2, gvl );
+            c3 = __riscv_vfmacc_vf_f32m1( c3, alpha, result3, gvl );
+            c4 = __riscv_vfmacc_vf_f32m1( c4, alpha, result4, gvl );
+            c5 = __riscv_vfmacc_vf_f32m1( c5, alpha, result5, gvl );
+            c6 = __riscv_vfmacc_vf_f32m1( c6, alpha, result6, gvl );
+            c7 = __riscv_vfmacc_vf_f32m1( c7, alpha, result7, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c2, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c3, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c4, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c5, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c6, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c7, gvl);
+            m_top += 16;
+        }
+
+
+        if( M & 8 ) {
+            gvl = __riscv_vsetvl_e32m1(8);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            float B2 = B[bi+2];
+            float B3 = B[bi+3];
+            bi += 4;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            ai += 8;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+            vfloat32m1_t result2 = __riscv_vfmul_vf_f32m1( A0, B2, gvl);
+            vfloat32m1_t result3 = __riscv_vfmul_vf_f32m1( A0, B3, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                bi += 4;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f32m1( result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m1( result3, B3, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c1 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c2 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c3 = __riscv_vle32_v_f32m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f32m1( c1, alpha, result1, gvl );
+            c2 = __riscv_vfmacc_vf_f32m1( c2, alpha, result2, gvl );
+            c3 = __riscv_vfmacc_vf_f32m1( c3, alpha, result3, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c3, gvl);
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e32m1(4);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            float B2 = B[bi+2];
+            float B3 = B[bi+3];
+            bi += 4;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            ai += 4;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+            vfloat32m1_t result2 = __riscv_vfmul_vf_f32m1( A0, B2, gvl);
+            vfloat32m1_t result3 = __riscv_vfmul_vf_f32m1( A0, B3, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                bi += 4;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f32m1( result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m1( result3, B3, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c1 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c2 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c3 = __riscv_vle32_v_f32m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f32m1( c1, alpha, result1, gvl );
+            c2 = __riscv_vfmacc_vf_f32m1( c2, alpha, result2, gvl );
+            c3 = __riscv_vfmacc_vf_f32m1( c3, alpha, result3, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c3, gvl);
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+1]*B[bi+0];
+                result2+=A[ai+0]*B[bi+1];
+                result3+=A[ai+1]*B[bi+1];
+                result4+=A[ai+0]*B[bi+2];
+                result5+=A[ai+1]*B[bi+2];
+                result6+=A[ai+0]*B[bi+3];
+                result7+=A[ai+1]*B[bi+3];
+                ai+=2;
+                bi+=4;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] += alpha * result0;
+            C[ci+0*ldc+1] += alpha * result1;
+            C[ci+1*ldc+0] += alpha * result2;
+            C[ci+1*ldc+1] += alpha * result3;
+            C[ci+2*ldc+0] += alpha * result4;
+            C[ci+2*ldc+1] += alpha * result5;
+            C[ci+3*ldc+0] += alpha * result6;
+            C[ci+3*ldc+1] += alpha * result7;
+            m_top+=2;
+        }
+
+
+        if( M & 1 ) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+0]*B[bi+1];
+                result2+=A[ai+0]*B[bi+2];
+                result3+=A[ai+0]*B[bi+3];
+                ai+=1;
+                bi+=4;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] += alpha * result0;
+            C[ci+1*ldc+0] += alpha * result1;
+            C[ci+2*ldc+0] += alpha * result2;
+            C[ci+3*ldc+0] += alpha * result3;
+            m_top+=1;
+        }
+
+        n_top += 4;
+    }
+
+
+
+    // -- N tail: remaining block of 2 along N, taken when bit 1 of N is set (N & 2)
+
+    if( N & 2 ) {
+        gvl = __riscv_vsetvl_e32m1(8);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/16; i+=1) {
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            bi += 2;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            vfloat32m1_t A1 = __riscv_vle32_v_f32m1( &A[ai+1*gvl], gvl );
+            ai += 16;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A1, B0, gvl);
+            vfloat32m1_t result2 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+            vfloat32m1_t result3 = __riscv_vfmul_vf_f32m1( A1, B1, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                bi += 2;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                A1 = __riscv_vle32_v_f32m1( &A[ai+1*gvl], gvl );
+                ai += 16;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B0, A1, gvl);
+                result2 = __riscv_vfmacc_vf_f32m1( result2, B1, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m1( result3, B1, A1, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += gvl;
+            vfloat32m1_t c1 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*1;
+            vfloat32m1_t c2 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += gvl;
+            vfloat32m1_t c3 = __riscv_vle32_v_f32m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f32m1( c1, alpha, result1, gvl );
+            c2 = __riscv_vfmacc_vf_f32m1( c2, alpha, result2, gvl );
+            c3 = __riscv_vfmacc_vf_f32m1( c3, alpha, result3, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c2, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c3, gvl);
+            m_top += 16;
+        }
+
+
+        if( M & 8 ) {
+            gvl = __riscv_vsetvl_e32m1(8);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            bi += 2;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            ai += 8;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                bi += 2;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B1, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c1 = __riscv_vle32_v_f32m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f32m1( c1, alpha, result1, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl);
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e32m1(4);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            bi += 2;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            ai += 4;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                bi += 2;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B1, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += ldc-gvl*0;
+            vfloat32m1_t c1 = __riscv_vle32_v_f32m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f32m1( c1, alpha, result1, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl);
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+1]*B[bi+0];
+                result2+=A[ai+0]*B[bi+1];
+                result3+=A[ai+1]*B[bi+1];
+                ai+=2;
+                bi+=2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] += alpha * result0;
+            C[ci+0*ldc+1] += alpha * result1;
+            C[ci+1*ldc+0] += alpha * result2;
+            C[ci+1*ldc+1] += alpha * result3;
+            m_top+=2;
+        }
+
+
+        if( M & 1 ) {
+            float result0 = 0;
+            float result1 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+0]*B[bi+1];
+                ai+=1;
+                bi+=2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] += alpha * result0;
+            C[ci+1*ldc+0] += alpha * result1;
+            m_top+=1;
+        }
+
+        n_top += 2;
+    }
+
+
+
+    // -- N tail: final single index along N, taken when N is odd (N & 1)
+
+    if( N & 1 ) {
+        gvl = __riscv_vsetvl_e32m1(8);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/16; i+=1) {
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            float B0 = B[bi+0];
+            bi += 1;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            vfloat32m1_t A1 = __riscv_vle32_v_f32m1( &A[ai+1*gvl], gvl );
+            ai += 16;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A1, B0, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                bi += 1;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                A1 = __riscv_vle32_v_f32m1( &A[ai+1*gvl], gvl );
+                ai += 16;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B0, A1, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vle32_v_f32m1( &C[ci], gvl); ci += gvl;
+            vfloat32m1_t c1 = __riscv_vle32_v_f32m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m1( c0, alpha, result0, gvl );
+            c1 = __riscv_vfmacc_vf_f32m1( c1, alpha, result1, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl);
+            m_top += 16;
+        }
+
+
+        if( M & 8 ) {
+            gvl = __riscv_vsetvl_e32m1(8);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            float B0 = B[bi+0];
+            bi += 1;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            ai += 8;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                bi += 1;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vle32_v_f32m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m1( c0, alpha, result0, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl);
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e32m1(4);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            float B0 = B[bi+0];
+            bi += 1;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            ai += 4;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0 = B[bi+0];
+                bi += 1;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vle32_v_f32m1( &C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m1( c0, alpha, result0, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl);
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            float result0 = 0;
+            float result1 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+1]*B[bi+0];
+                ai+=2;
+                bi+=1;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] += alpha * result0;
+            C[ci+0*ldc+1] += alpha * result1;
+            m_top+=2;
+        }
+
+
+        if( M & 1 ) {
+            float result0 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                ai+=1;
+                bi+=1;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] += alpha * result0;
+            m_top+=1;
+        }
+
+        n_top += 1;
+    }
+
+    return 0;
+}
+
diff --git a/kernel/riscv64/sgemm_kernel_8x8_zvl128b.c b/kernel/riscv64/sgemm_kernel_8x8_zvl128b.c
new file mode 100644
index 000000000..ad720e694
--- /dev/null
+++ b/kernel/riscv64/sgemm_kernel_8x8_zvl128b.c
@@ -0,0 +1,791 @@
+/*
+
+AUTOGENERATED KERNEL
+Script: ./kernel/riscv64/generate_kernel.py
+Settings:
+ LMUL=2
+ M=8
+ M_tail_scalar_from=2
+ N=8
+ __riscv_='__riscv_'
+ complex=False
+ conjugate=False
+ cpu='zvl128b'
+ force_acc_double=False
+ index_type='BLASLONG'
+ op='gemm'
+ param_precision='float'
+ reg_width_bits=128
+ tail_policy=''
+ trace=False
+
+Derived:
+ ELEN_ACC=32
+ ELEN_PARAM=32
+ LMUL_ACC=2
+ VFMACC='__riscv_vfmacc_vf_f32m2'
+ VFMUL='__riscv_vfmul_vf_f32m2'
+ VLEV='__riscv_vle32_v_f32m2'
+ VLSEV='__riscv_vlse32_v_f32m2'
+ VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2'
+ VMUL_TO_ACC='__riscv_vfmul_vf_f32m2'
+ VSETVL='__riscv_vsetvl_e32m2'
+ VSEV='__riscv_vse32_v_f32m2'
+ VSSEV='__riscv_vsse32_v_f32m2'
+ acc_vector_t='vfloat32m2_t'
+ output='sgemm_kernel_8x8_zvl128b.c'
+ param_scalar_t='float'
+ param_vector_t='vfloat32m2_t'
+
+*/
+
+#include "common.h"
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc)
+
+{
+    BLASLONG gvl = 0;
+    BLASLONG m_top = 0;
+    BLASLONG n_top = 0;
+
+    // -- MAIN PASS
+
+    for (BLASLONG j = 0; j < N / 8; j += 1) {
+        m_top = 0;
+        BLASLONG gvl = __riscv_vsetvl_e32m2(8);
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            float B0 = B[bi + 0];
+            float B1 = B[bi + 1];
+            float B2 = B[bi + 2];
+            float B3 = B[bi + 3];
+            float B4 = B[bi + 4];
+            float B5 = B[bi + 5];
+            float B6 = B[bi + 6];
+            float B7 = B[bi + 7];
+            bi += 8;
+
+            vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+            ai += 8;
+
+            vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
+            vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
+            vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl);
+            vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl);
+            vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl);
+            vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl);
+            vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl);
+            vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl);
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                B2 = B[bi + 2];
+                B3 = B[bi + 3];
+                B4 = B[bi + 4];
+                B5 = B[bi + 5];
+                B6 = B[bi + 6];
+                B7 = B[bi + 7];
+                bi += 8;
+
+                A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl);
+                result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl);
+                result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c4 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c5 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c6 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c7 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
+            c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
+            c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl);
+            c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl);
+            c4 = __riscv_vfmacc_vf_f32m2(c4, alpha, result4, gvl);
+            c5 = __riscv_vfmacc_vf_f32m2(c5, alpha, result5, gvl);
+            c6 = __riscv_vfmacc_vf_f32m2(c6, alpha, result6, gvl);
+            c7 = __riscv_vfmacc_vf_f32m2(c7, alpha, result7, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vse32_v_f32m2(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c1, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c2, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c3, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c4, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c5, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c6, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c7, gvl);
+            m_top += 8;
+        }
+
+        // -- M tails of the main pass: leftover 4, 2 and 1 along M for the current 8-wide N block
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e32m2(4);
+
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            float B0 = B[bi + 0];
+            float B1 = B[bi + 1];
+            float B2 = B[bi + 2];
+            float B3 = B[bi + 3];
+            float B4 = B[bi + 4];
+            float B5 = B[bi + 5];
+            float B6 = B[bi + 6];
+            float B7 = B[bi + 7];
+            bi += 8;
+
+            vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+            ai += 4;
+
+            vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
+            vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
+            vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl);
+            vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl);
+            vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl);
+            vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl);
+            vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl);
+            vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl);
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                B2 = B[bi + 2];
+                B3 = B[bi + 3];
+                B4 = B[bi + 4];
+                B5 = B[bi + 5];
+                B6 = B[bi + 6];
+                B7 = B[bi + 7];
+                bi += 8;
+
+                A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl);
+                result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl);
+                result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c4 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c5 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c6 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c7 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
+            c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
+            c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl);
+            c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl);
+            c4 = __riscv_vfmacc_vf_f32m2(c4, alpha, result4, gvl);
+            c5 = __riscv_vfmacc_vf_f32m2(c5, alpha, result5, gvl);
+            c6 = __riscv_vfmacc_vf_f32m2(c6, alpha, result6, gvl);
+            c7 = __riscv_vfmacc_vf_f32m2(c7, alpha, result7, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vse32_v_f32m2(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c1, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c2, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c3, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c4, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c5, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c6, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c7, gvl);
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            float result8 = 0;
+            float result9 = 0;
+            float result10 = 0;
+            float result11 = 0;
+            float result12 = 0;
+            float result13 = 0;
+            float result14 = 0;
+            float result15 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 1] * B[bi + 0];
+                result2 += A[ai + 0] * B[bi + 1];
+                result3 += A[ai + 1] * B[bi + 1];
+                result4 += A[ai + 0] * B[bi + 2];
+                result5 += A[ai + 1] * B[bi + 2];
+                result6 += A[ai + 0] * B[bi + 3];
+                result7 += A[ai + 1] * B[bi + 3];
+                result8 += A[ai + 0] * B[bi + 4];
+                result9 += A[ai + 1] * B[bi + 4];
+                result10 += A[ai + 0] * B[bi + 5];
+                result11 += A[ai + 1] * B[bi + 5];
+                result12 += A[ai + 0] * B[bi + 6];
+                result13 += A[ai + 1] * B[bi + 6];
+                result14 += A[ai + 0] * B[bi + 7];
+                result15 += A[ai + 1] * B[bi + 7];
+                ai += 2;
+                bi += 8;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] += alpha * result0;
+            C[ci + 0 * ldc + 1] += alpha * result1;
+            C[ci + 1 * ldc + 0] += alpha * result2;
+            C[ci + 1 * ldc + 1] += alpha * result3;
+            C[ci + 2 * ldc + 0] += alpha * result4;
+            C[ci + 2 * ldc + 1] += alpha * result5;
+            C[ci + 3 * ldc + 0] += alpha * result6;
+            C[ci + 3 * ldc + 1] += alpha * result7;
+            C[ci + 4 * ldc + 0] += alpha * result8;
+            C[ci + 4 * ldc + 1] += alpha * result9;
+            C[ci + 5 * ldc + 0] += alpha * result10;
+            C[ci + 5 * ldc + 1] += alpha * result11;
+            C[ci + 6 * ldc + 0] += alpha * result12;
+            C[ci + 6 * ldc + 1] += alpha * result13;
+            C[ci + 7 * ldc + 0] += alpha * result14;
+            C[ci + 7 * ldc + 1] += alpha * result15;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 0] * B[bi + 1];
+                result2 += A[ai + 0] * B[bi + 2];
+                result3 += A[ai + 0] * B[bi + 3];
+                result4 += A[ai + 0] * B[bi + 4];
+                result5 += A[ai + 0] * B[bi + 5];
+                result6 += A[ai + 0] * B[bi + 6];
+                result7 += A[ai + 0] * B[bi + 7];
+                ai += 1;
+                bi += 8;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] += alpha * result0;
+            C[ci + 1 * ldc + 0] += alpha * result1;
+            C[ci + 2 * ldc + 0] += alpha * result2;
+            C[ci + 3 * ldc + 0] += alpha * result3;
+            C[ci + 4 * ldc + 0] += alpha * result4;
+            C[ci + 5 * ldc + 0] += alpha * result5;
+            C[ci + 6 * ldc + 0] += alpha * result6;
+            C[ci + 7 * ldc + 0] += alpha * result7;
+            m_top += 1;
+        }
+
+        n_top += 8;
+    }
+
+    // -- N tail: remaining block of 4 along N, taken when bit 2 of N is set (N & 4)
+
+    if (N & 4) {
+        gvl = __riscv_vsetvl_e32m2(8);
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            float B0 = B[bi + 0];
+            float B1 = B[bi + 1];
+            float B2 = B[bi + 2];
+            float B3 = B[bi + 3];
+            bi += 4;
+
+            vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+            ai += 8;
+
+            vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
+            vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
+            vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl);
+            vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl);
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                B2 = B[bi + 2];
+                B3 = B[bi + 3];
+                bi += 4;
+
+                A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
+            c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
+            c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl);
+            c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vse32_v_f32m2(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c1, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c2, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c3, gvl);
+            m_top += 8;
+        }
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e32m2(4);
+
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            float B0 = B[bi + 0];
+            float B1 = B[bi + 1];
+            float B2 = B[bi + 2];
+            float B3 = B[bi + 3];
+            bi += 4;
+
+            vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+            ai += 4;
+
+            vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
+            vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
+            vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl);
+            vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl);
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                B2 = B[bi + 2];
+                B3 = B[bi + 3];
+                bi += 4;
+
+                A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c2 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c3 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
+            c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
+            c2 = __riscv_vfmacc_vf_f32m2(c2, alpha, result2, gvl);
+            c3 = __riscv_vfmacc_vf_f32m2(c3, alpha, result3, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vse32_v_f32m2(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c1, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c2, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c3, gvl);
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 1] * B[bi + 0];
+                result2 += A[ai + 0] * B[bi + 1];
+                result3 += A[ai + 1] * B[bi + 1];
+                result4 += A[ai + 0] * B[bi + 2];
+                result5 += A[ai + 1] * B[bi + 2];
+                result6 += A[ai + 0] * B[bi + 3];
+                result7 += A[ai + 1] * B[bi + 3];
+                ai += 2;
+                bi += 4;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] += alpha * result0;
+            C[ci + 0 * ldc + 1] += alpha * result1;
+            C[ci + 1 * ldc + 0] += alpha * result2;
+            C[ci + 1 * ldc + 1] += alpha * result3;
+            C[ci + 2 * ldc + 0] += alpha * result4;
+            C[ci + 2 * ldc + 1] += alpha * result5;
+            C[ci + 3 * ldc + 0] += alpha * result6;
+            C[ci + 3 * ldc + 1] += alpha * result7;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 0] * B[bi + 1];
+                result2 += A[ai + 0] * B[bi + 2];
+                result3 += A[ai + 0] * B[bi + 3];
+                ai += 1;
+                bi += 4;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] += alpha * result0;
+            C[ci + 1 * ldc + 0] += alpha * result1;
+            C[ci + 2 * ldc + 0] += alpha * result2;
+            C[ci + 3 * ldc + 0] += alpha * result3;
+            m_top += 1;
+        }
+
+        n_top += 4;
+    }
+
+    // -- tails for N=2
+
+    if (N & 2) {
+        gvl = __riscv_vsetvl_e32m2(8);
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            float B0 = B[bi + 0];
+            float B1 = B[bi + 1];
+            bi += 2;
+
+            vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+            ai += 8;
+
+            vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
+            vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                bi += 2;
+
+                A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
+            c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vse32_v_f32m2(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c1, gvl);
+            m_top += 8;
+        }
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e32m2(4);
+
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            float B0 = B[bi + 0];
+            float B1 = B[bi + 1];
+            bi += 2;
+
+            vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+            ai += 4;
+
+            vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
+            vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                bi += 2;
+
+                A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            ci += ldc - gvl * 0;
+            vfloat32m2_t c1 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
+            c1 = __riscv_vfmacc_vf_f32m2(c1, alpha, result1, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vse32_v_f32m2(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c1, gvl);
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 1] * B[bi + 0];
+                result2 += A[ai + 0] * B[bi + 1];
+                result3 += A[ai + 1] * B[bi + 1];
+                ai += 2;
+                bi += 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] += alpha * result0;
+            C[ci + 0 * ldc + 1] += alpha * result1;
+            C[ci + 1 * ldc + 0] += alpha * result2;
+            C[ci + 1 * ldc + 1] += alpha * result3;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            float result0 = 0;
+            float result1 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 0] * B[bi + 1];
+                ai += 1;
+                bi += 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] += alpha * result0;
+            C[ci + 1 * ldc + 0] += alpha * result1;
+            m_top += 1;
+        }
+
+        n_top += 2;
+    }
+
+    // -- tails for N=1
+
+    if (N & 1) {
+        gvl = __riscv_vsetvl_e32m2(8);
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            float B0 = B[bi + 0];
+            bi += 1;
+
+            vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+            ai += 8;
+
+            vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0 = B[bi + 0];
+                bi += 1;
+
+                A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vse32_v_f32m2(&C[ci], c0, gvl);
+            m_top += 8;
+        }
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e32m2(4);
+
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            float B0 = B[bi + 0];
+            bi += 1;
+
+            vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+            ai += 4;
+
+            vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0 = B[bi + 0];
+                bi += 1;
+
+                A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t c0 = __riscv_vle32_v_f32m2(&C[ci], gvl);
+            c0 = __riscv_vfmacc_vf_f32m2(c0, alpha, result0, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vse32_v_f32m2(&C[ci], c0, gvl);
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            float result0 = 0;
+            float result1 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 1] * B[bi + 0];
+                ai += 2;
+                bi += 1;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] += alpha * result0;
+            C[ci + 0 * ldc + 1] += alpha * result1;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            float result0 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                ai += 1;
+                bi += 1;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] += alpha * result0;
+            m_top += 1;
+        }
+
+        n_top += 1;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/strmm_kernel_16x8_zvl256b.c b/kernel/riscv64/strmm_kernel_16x8_zvl256b.c
new file mode 100644
index 000000000..f70157d61
--- /dev/null
+++ b/kernel/riscv64/strmm_kernel_16x8_zvl256b.c
@@ -0,0 +1,1330 @@
+/*
+
+AUTOGENERATED KERNEL
+Settings:
+ LMUL=1
+ M=16
+ M_tail_scalar_from=2
+ N=8
+ __riscv_='__riscv_'
+ complex=False
+ conjugate=False
+ cpu='zvl256b'
+ force_acc_double=False
+ index_type='BLASLONG'
+ op='trmm'
+ param_precision='float'
+ reg_width_bits=256
+ tail_policy=''
+ trace=False
+
+Derived:
+ ELEN_ACC=32
+ ELEN_PARAM=32
+ LMUL_ACC=1
+ VFMACC='__riscv_vfmacc_vf_f32m1'
+ VFMUL='__riscv_vfmul_vf_f32m1'
+ VLEV='__riscv_vle32_v_f32m1'
+ VLSEV='__riscv_vlse32_v_f32m1'
+ VMACC_TO_ACC='__riscv_vfmacc_vf_f32m1'
+ VMUL_TO_ACC='__riscv_vfmul_vf_f32m1'
+ VSETVL='__riscv_vsetvl_e32m1'
+ VSEV='__riscv_vse32_v_f32m1'
+ VSSEV='__riscv_vsse32_v_f32m1'
+ acc_vector_t='vfloat32m1_t'
+ output='strmm_kernel_16x8_zvl256b.c'
+ param_scalar_t='float'
+ param_vector_t='vfloat32m1_t'
+
+*/
+
+#include "common.h"
+
+
+
+#if defined(LEFT) != defined(TRANSA) /* true when exactly one of LEFT / TRANSA is defined */
+    #define BACKWARDS /* when set, each tile advances ai/bi by off * tile size and subtracts off from pass_K */
+#endif
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc, BLASLONG offset)
+
+{
+    BLASLONG gvl = 0;
+    BLASLONG m_top = 0;
+    BLASLONG n_top = 0;
+
+
+    // -- MAIN PASS
+
+    for (BLASLONG j=0; j<N/8; j+=1) {
+        m_top = 0;
+        BLASLONG gvl = __riscv_vsetvl_e32m1(8);
+
+
+        for (BLASLONG i=0; i<M/16; i+=1) {
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*16;
+                bi += off*8;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 16;
+                #else
+                    pass_K = off + 8;
+                #endif
+            #endif
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            float B2 = B[bi+2];
+            float B3 = B[bi+3];
+            float B4 = B[bi+4];
+            float B5 = B[bi+5];
+            float B6 = B[bi+6];
+            float B7 = B[bi+7];
+            bi += 8;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            vfloat32m1_t A1 = __riscv_vle32_v_f32m1( &A[ai+1*gvl], gvl );
+            ai += 16;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A1, B0, gvl);
+            vfloat32m1_t result2 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+            vfloat32m1_t result3 = __riscv_vfmul_vf_f32m1( A1, B1, gvl);
+            vfloat32m1_t result4 = __riscv_vfmul_vf_f32m1( A0, B2, gvl);
+            vfloat32m1_t result5 = __riscv_vfmul_vf_f32m1( A1, B2, gvl);
+            vfloat32m1_t result6 = __riscv_vfmul_vf_f32m1( A0, B3, gvl);
+            vfloat32m1_t result7 = __riscv_vfmul_vf_f32m1( A1, B3, gvl);
+            vfloat32m1_t result8 = __riscv_vfmul_vf_f32m1( A0, B4, gvl);
+            vfloat32m1_t result9 = __riscv_vfmul_vf_f32m1( A1, B4, gvl);
+            vfloat32m1_t result10 = __riscv_vfmul_vf_f32m1( A0, B5, gvl);
+            vfloat32m1_t result11 = __riscv_vfmul_vf_f32m1( A1, B5, gvl);
+            vfloat32m1_t result12 = __riscv_vfmul_vf_f32m1( A0, B6, gvl);
+            vfloat32m1_t result13 = __riscv_vfmul_vf_f32m1( A1, B6, gvl);
+            vfloat32m1_t result14 = __riscv_vfmul_vf_f32m1( A0, B7, gvl);
+            vfloat32m1_t result15 = __riscv_vfmul_vf_f32m1( A1, B7, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                B4 = B[bi+4];
+                B5 = B[bi+5];
+                B6 = B[bi+6];
+                B7 = B[bi+7];
+                bi += 8;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                A1 = __riscv_vle32_v_f32m1( &A[ai+1*gvl], gvl );
+                ai += 16;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B0, A1, gvl);
+                result2 = __riscv_vfmacc_vf_f32m1( result2, B1, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m1( result3, B1, A1, gvl);
+                result4 = __riscv_vfmacc_vf_f32m1( result4, B2, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f32m1( result5, B2, A1, gvl);
+                result6 = __riscv_vfmacc_vf_f32m1( result6, B3, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f32m1( result7, B3, A1, gvl);
+                result8 = __riscv_vfmacc_vf_f32m1( result8, B4, A0, gvl);
+                result9 = __riscv_vfmacc_vf_f32m1( result9, B4, A1, gvl);
+                result10 = __riscv_vfmacc_vf_f32m1( result10, B5, A0, gvl);
+                result11 = __riscv_vfmacc_vf_f32m1( result11, B5, A1, gvl);
+                result12 = __riscv_vfmacc_vf_f32m1( result12, B6, A0, gvl);
+                result13 = __riscv_vfmacc_vf_f32m1( result13, B6, A1, gvl);
+                result14 = __riscv_vfmacc_vf_f32m1( result14, B7, A0, gvl);
+                result15 = __riscv_vfmacc_vf_f32m1( result15, B7, A1, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vfmul_vf_f32m1( result0, alpha, gvl );
+            vfloat32m1_t c1 = __riscv_vfmul_vf_f32m1( result1, alpha, gvl );
+            vfloat32m1_t c2 = __riscv_vfmul_vf_f32m1( result2, alpha, gvl );
+            vfloat32m1_t c3 = __riscv_vfmul_vf_f32m1( result3, alpha, gvl );
+            vfloat32m1_t c4 = __riscv_vfmul_vf_f32m1( result4, alpha, gvl );
+            vfloat32m1_t c5 = __riscv_vfmul_vf_f32m1( result5, alpha, gvl );
+            vfloat32m1_t c6 = __riscv_vfmul_vf_f32m1( result6, alpha, gvl );
+            vfloat32m1_t c7 = __riscv_vfmul_vf_f32m1( result7, alpha, gvl );
+            vfloat32m1_t c8 = __riscv_vfmul_vf_f32m1( result8, alpha, gvl );
+            vfloat32m1_t c9 = __riscv_vfmul_vf_f32m1( result9, alpha, gvl );
+            vfloat32m1_t c10 = __riscv_vfmul_vf_f32m1( result10, alpha, gvl );
+            vfloat32m1_t c11 = __riscv_vfmul_vf_f32m1( result11, alpha, gvl );
+            vfloat32m1_t c12 = __riscv_vfmul_vf_f32m1( result12, alpha, gvl );
+            vfloat32m1_t c13 = __riscv_vfmul_vf_f32m1( result13, alpha, gvl );
+            vfloat32m1_t c14 = __riscv_vfmul_vf_f32m1( result14, alpha, gvl );
+            vfloat32m1_t c15 = __riscv_vfmul_vf_f32m1( result15, alpha, gvl );
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c2, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c3, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c4, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c5, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c6, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c7, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c8, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c9, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c10, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c11, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c12, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c13, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c14, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c15, gvl);
+            m_top += 16;
+        }
+
+
+
+        // -- tails for main pass
+
+        if( M & 8 ) {
+            gvl = __riscv_vsetvl_e32m1(8);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*8;
+                bi += off*8;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 8;
+                #else
+                    pass_K = off + 8;
+                #endif
+            #endif
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            float B2 = B[bi+2];
+            float B3 = B[bi+3];
+            float B4 = B[bi+4];
+            float B5 = B[bi+5];
+            float B6 = B[bi+6];
+            float B7 = B[bi+7];
+            bi += 8;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            ai += 8;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+            vfloat32m1_t result2 = __riscv_vfmul_vf_f32m1( A0, B2, gvl);
+            vfloat32m1_t result3 = __riscv_vfmul_vf_f32m1( A0, B3, gvl);
+            vfloat32m1_t result4 = __riscv_vfmul_vf_f32m1( A0, B4, gvl);
+            vfloat32m1_t result5 = __riscv_vfmul_vf_f32m1( A0, B5, gvl);
+            vfloat32m1_t result6 = __riscv_vfmul_vf_f32m1( A0, B6, gvl);
+            vfloat32m1_t result7 = __riscv_vfmul_vf_f32m1( A0, B7, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                B4 = B[bi+4];
+                B5 = B[bi+5];
+                B6 = B[bi+6];
+                B7 = B[bi+7];
+                bi += 8;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f32m1( result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m1( result3, B3, A0, gvl);
+                result4 = __riscv_vfmacc_vf_f32m1( result4, B4, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f32m1( result5, B5, A0, gvl);
+                result6 = __riscv_vfmacc_vf_f32m1( result6, B6, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f32m1( result7, B7, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vfmul_vf_f32m1( result0, alpha, gvl );
+            vfloat32m1_t c1 = __riscv_vfmul_vf_f32m1( result1, alpha, gvl );
+            vfloat32m1_t c2 = __riscv_vfmul_vf_f32m1( result2, alpha, gvl );
+            vfloat32m1_t c3 = __riscv_vfmul_vf_f32m1( result3, alpha, gvl );
+            vfloat32m1_t c4 = __riscv_vfmul_vf_f32m1( result4, alpha, gvl );
+            vfloat32m1_t c5 = __riscv_vfmul_vf_f32m1( result5, alpha, gvl );
+            vfloat32m1_t c6 = __riscv_vfmul_vf_f32m1( result6, alpha, gvl );
+            vfloat32m1_t c7 = __riscv_vfmul_vf_f32m1( result7, alpha, gvl );
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c3, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c4, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c5, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c6, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c7, gvl);
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e32m1(4);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*4;
+                bi += off*8;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 4;
+                #else
+                    pass_K = off + 8;
+                #endif
+            #endif
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            float B2 = B[bi+2];
+            float B3 = B[bi+3];
+            float B4 = B[bi+4];
+            float B5 = B[bi+5];
+            float B6 = B[bi+6];
+            float B7 = B[bi+7];
+            bi += 8;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            ai += 4;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+            vfloat32m1_t result2 = __riscv_vfmul_vf_f32m1( A0, B2, gvl);
+            vfloat32m1_t result3 = __riscv_vfmul_vf_f32m1( A0, B3, gvl);
+            vfloat32m1_t result4 = __riscv_vfmul_vf_f32m1( A0, B4, gvl);
+            vfloat32m1_t result5 = __riscv_vfmul_vf_f32m1( A0, B5, gvl);
+            vfloat32m1_t result6 = __riscv_vfmul_vf_f32m1( A0, B6, gvl);
+            vfloat32m1_t result7 = __riscv_vfmul_vf_f32m1( A0, B7, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                B4 = B[bi+4];
+                B5 = B[bi+5];
+                B6 = B[bi+6];
+                B7 = B[bi+7];
+                bi += 8;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f32m1( result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m1( result3, B3, A0, gvl);
+                result4 = __riscv_vfmacc_vf_f32m1( result4, B4, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f32m1( result5, B5, A0, gvl);
+                result6 = __riscv_vfmacc_vf_f32m1( result6, B6, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f32m1( result7, B7, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vfmul_vf_f32m1( result0, alpha, gvl );
+            vfloat32m1_t c1 = __riscv_vfmul_vf_f32m1( result1, alpha, gvl );
+            vfloat32m1_t c2 = __riscv_vfmul_vf_f32m1( result2, alpha, gvl );
+            vfloat32m1_t c3 = __riscv_vfmul_vf_f32m1( result3, alpha, gvl );
+            vfloat32m1_t c4 = __riscv_vfmul_vf_f32m1( result4, alpha, gvl );
+            vfloat32m1_t c5 = __riscv_vfmul_vf_f32m1( result5, alpha, gvl );
+            vfloat32m1_t c6 = __riscv_vfmul_vf_f32m1( result6, alpha, gvl );
+            vfloat32m1_t c7 = __riscv_vfmul_vf_f32m1( result7, alpha, gvl );
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c3, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c4, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c5, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c6, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c7, gvl);
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            float result8 = 0;
+            float result9 = 0;
+            float result10 = 0;
+            float result11 = 0;
+            float result12 = 0;
+            float result13 = 0;
+            float result14 = 0;
+            float result15 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*2;
+                bi += off*8;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 2;
+                #else
+                    pass_K = off + 8;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+1]*B[bi+0];
+                result2+=A[ai+0]*B[bi+1];
+                result3+=A[ai+1]*B[bi+1];
+                result4+=A[ai+0]*B[bi+2];
+                result5+=A[ai+1]*B[bi+2];
+                result6+=A[ai+0]*B[bi+3];
+                result7+=A[ai+1]*B[bi+3];
+                result8+=A[ai+0]*B[bi+4];
+                result9+=A[ai+1]*B[bi+4];
+                result10+=A[ai+0]*B[bi+5];
+                result11+=A[ai+1]*B[bi+5];
+                result12+=A[ai+0]*B[bi+6];
+                result13+=A[ai+1]*B[bi+6];
+                result14+=A[ai+0]*B[bi+7];
+                result15+=A[ai+1]*B[bi+7];
+                ai+=2;
+                bi+=8;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] = alpha * result0;
+            C[ci+0*ldc+1] = alpha * result1;
+            C[ci+1*ldc+0] = alpha * result2;
+            C[ci+1*ldc+1] = alpha * result3;
+            C[ci+2*ldc+0] = alpha * result4;
+            C[ci+2*ldc+1] = alpha * result5;
+            C[ci+3*ldc+0] = alpha * result6;
+            C[ci+3*ldc+1] = alpha * result7;
+            C[ci+4*ldc+0] = alpha * result8;
+            C[ci+4*ldc+1] = alpha * result9;
+            C[ci+5*ldc+0] = alpha * result10;
+            C[ci+5*ldc+1] = alpha * result11;
+            C[ci+6*ldc+0] = alpha * result12;
+            C[ci+6*ldc+1] = alpha * result13;
+            C[ci+7*ldc+0] = alpha * result14;
+            C[ci+7*ldc+1] = alpha * result15;
+            m_top+=2;
+        }
+
+
+        if( M & 1 ) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*1;
+                bi += off*8;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 1;
+                #else
+                    pass_K = off + 8;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+0]*B[bi+1];
+                result2+=A[ai+0]*B[bi+2];
+                result3+=A[ai+0]*B[bi+3];
+                result4+=A[ai+0]*B[bi+4];
+                result5+=A[ai+0]*B[bi+5];
+                result6+=A[ai+0]*B[bi+6];
+                result7+=A[ai+0]*B[bi+7];
+                ai+=1;
+                bi+=8;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] = alpha * result0;
+            C[ci+1*ldc+0] = alpha * result1;
+            C[ci+2*ldc+0] = alpha * result2;
+            C[ci+3*ldc+0] = alpha * result3;
+            C[ci+4*ldc+0] = alpha * result4;
+            C[ci+5*ldc+0] = alpha * result5;
+            C[ci+6*ldc+0] = alpha * result6;
+            C[ci+7*ldc+0] = alpha * result7;
+            m_top+=1;
+        }
+
+        n_top += 8;
+    }
+
+
+
+    // -- tails for N=4
+
+    if( N & 4 ) {
+        gvl = __riscv_vsetvl_e32m1(8);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/16; i+=1) {
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*16;
+                bi += off*4;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 16;
+                #else
+                    pass_K = off + 4;
+                #endif
+            #endif
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            float B2 = B[bi+2];
+            float B3 = B[bi+3];
+            bi += 4;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            vfloat32m1_t A1 = __riscv_vle32_v_f32m1( &A[ai+1*gvl], gvl );
+            ai += 16;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A1, B0, gvl);
+            vfloat32m1_t result2 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+            vfloat32m1_t result3 = __riscv_vfmul_vf_f32m1( A1, B1, gvl);
+            vfloat32m1_t result4 = __riscv_vfmul_vf_f32m1( A0, B2, gvl);
+            vfloat32m1_t result5 = __riscv_vfmul_vf_f32m1( A1, B2, gvl);
+            vfloat32m1_t result6 = __riscv_vfmul_vf_f32m1( A0, B3, gvl);
+            vfloat32m1_t result7 = __riscv_vfmul_vf_f32m1( A1, B3, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                bi += 4;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                A1 = __riscv_vle32_v_f32m1( &A[ai+1*gvl], gvl );
+                ai += 16;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B0, A1, gvl);
+                result2 = __riscv_vfmacc_vf_f32m1( result2, B1, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m1( result3, B1, A1, gvl);
+                result4 = __riscv_vfmacc_vf_f32m1( result4, B2, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f32m1( result5, B2, A1, gvl);
+                result6 = __riscv_vfmacc_vf_f32m1( result6, B3, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f32m1( result7, B3, A1, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vfmul_vf_f32m1( result0, alpha, gvl );
+            vfloat32m1_t c1 = __riscv_vfmul_vf_f32m1( result1, alpha, gvl );
+            vfloat32m1_t c2 = __riscv_vfmul_vf_f32m1( result2, alpha, gvl );
+            vfloat32m1_t c3 = __riscv_vfmul_vf_f32m1( result3, alpha, gvl );
+            vfloat32m1_t c4 = __riscv_vfmul_vf_f32m1( result4, alpha, gvl );
+            vfloat32m1_t c5 = __riscv_vfmul_vf_f32m1( result5, alpha, gvl );
+            vfloat32m1_t c6 = __riscv_vfmul_vf_f32m1( result6, alpha, gvl );
+            vfloat32m1_t c7 = __riscv_vfmul_vf_f32m1( result7, alpha, gvl );
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c2, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c3, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c4, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c5, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c6, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c7, gvl);
+            m_top += 16;
+        }
+
+
+        if( M & 8 ) {
+            gvl = __riscv_vsetvl_e32m1(8);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*8;
+                bi += off*4;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 8;
+                #else
+                    pass_K = off + 4;
+                #endif
+            #endif
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            float B2 = B[bi+2];
+            float B3 = B[bi+3];
+            bi += 4;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            ai += 8;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+            vfloat32m1_t result2 = __riscv_vfmul_vf_f32m1( A0, B2, gvl);
+            vfloat32m1_t result3 = __riscv_vfmul_vf_f32m1( A0, B3, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                bi += 4;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f32m1( result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m1( result3, B3, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vfmul_vf_f32m1( result0, alpha, gvl );
+            vfloat32m1_t c1 = __riscv_vfmul_vf_f32m1( result1, alpha, gvl );
+            vfloat32m1_t c2 = __riscv_vfmul_vf_f32m1( result2, alpha, gvl );
+            vfloat32m1_t c3 = __riscv_vfmul_vf_f32m1( result3, alpha, gvl );
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c3, gvl);
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e32m1(4);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*4;
+                bi += off*4;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 4;
+                #else
+                    pass_K = off + 4;
+                #endif
+            #endif
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            float B2 = B[bi+2];
+            float B3 = B[bi+3];
+            bi += 4;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            ai += 4;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+            vfloat32m1_t result2 = __riscv_vfmul_vf_f32m1( A0, B2, gvl);
+            vfloat32m1_t result3 = __riscv_vfmul_vf_f32m1( A0, B3, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                B2 = B[bi+2];
+                B3 = B[bi+3];
+                bi += 4;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f32m1( result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m1( result3, B3, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vfmul_vf_f32m1( result0, alpha, gvl );
+            vfloat32m1_t c1 = __riscv_vfmul_vf_f32m1( result1, alpha, gvl );
+            vfloat32m1_t c2 = __riscv_vfmul_vf_f32m1( result2, alpha, gvl );
+            vfloat32m1_t c3 = __riscv_vfmul_vf_f32m1( result3, alpha, gvl );
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c3, gvl);
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*2;
+                bi += off*4;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 2;
+                #else
+                    pass_K = off + 4;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+1]*B[bi+0];
+                result2+=A[ai+0]*B[bi+1];
+                result3+=A[ai+1]*B[bi+1];
+                result4+=A[ai+0]*B[bi+2];
+                result5+=A[ai+1]*B[bi+2];
+                result6+=A[ai+0]*B[bi+3];
+                result7+=A[ai+1]*B[bi+3];
+                ai+=2;
+                bi+=4;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] = alpha * result0;
+            C[ci+0*ldc+1] = alpha * result1;
+            C[ci+1*ldc+0] = alpha * result2;
+            C[ci+1*ldc+1] = alpha * result3;
+            C[ci+2*ldc+0] = alpha * result4;
+            C[ci+2*ldc+1] = alpha * result5;
+            C[ci+3*ldc+0] = alpha * result6;
+            C[ci+3*ldc+1] = alpha * result7;
+            m_top+=2;
+        }
+
+
+        if( M & 1 ) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*1;
+                bi += off*4;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 1;
+                #else
+                    pass_K = off + 4;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+0]*B[bi+1];
+                result2+=A[ai+0]*B[bi+2];
+                result3+=A[ai+0]*B[bi+3];
+                ai+=1;
+                bi+=4;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] = alpha * result0;
+            C[ci+1*ldc+0] = alpha * result1;
+            C[ci+2*ldc+0] = alpha * result2;
+            C[ci+3*ldc+0] = alpha * result3;
+            m_top+=1;
+        }
+
+        n_top += 4;
+    }
+
+
+
+    // -- tails for N=2
+
+    if( N & 2 ) {
+        gvl = __riscv_vsetvl_e32m1(8);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/16; i+=1) {
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*16;
+                bi += off*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 16;
+                #else
+                    pass_K = off + 2;
+                #endif
+            #endif
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            bi += 2;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            vfloat32m1_t A1 = __riscv_vle32_v_f32m1( &A[ai+1*gvl], gvl );
+            ai += 16;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A1, B0, gvl);
+            vfloat32m1_t result2 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+            vfloat32m1_t result3 = __riscv_vfmul_vf_f32m1( A1, B1, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                bi += 2;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                A1 = __riscv_vle32_v_f32m1( &A[ai+1*gvl], gvl );
+                ai += 16;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B0, A1, gvl);
+                result2 = __riscv_vfmacc_vf_f32m1( result2, B1, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m1( result3, B1, A1, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vfmul_vf_f32m1( result0, alpha, gvl );
+            vfloat32m1_t c1 = __riscv_vfmul_vf_f32m1( result1, alpha, gvl );
+            vfloat32m1_t c2 = __riscv_vfmul_vf_f32m1( result2, alpha, gvl );
+            vfloat32m1_t c3 = __riscv_vfmul_vf_f32m1( result3, alpha, gvl );
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl); ci += ldc-gvl*1;
+            __riscv_vse32_v_f32m1( &C[ci], c2, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c3, gvl);
+            m_top += 16;
+        }
+
+
+        if( M & 8 ) {
+            gvl = __riscv_vsetvl_e32m1(8);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*8;
+                bi += off*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 8;
+                #else
+                    pass_K = off + 2;
+                #endif
+            #endif
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            bi += 2;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            ai += 8;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                bi += 2;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B1, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vfmul_vf_f32m1( result0, alpha, gvl );
+            vfloat32m1_t c1 = __riscv_vfmul_vf_f32m1( result1, alpha, gvl );
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl);
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e32m1(4);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*4;
+                bi += off*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 4;
+                #else
+                    pass_K = off + 2;
+                #endif
+            #endif
+            float B0 = B[bi+0];
+            float B1 = B[bi+1];
+            bi += 2;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            ai += 4;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A0, B1, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                B1 = B[bi+1];
+                bi += 2;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B1, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vfmul_vf_f32m1( result0, alpha, gvl );
+            vfloat32m1_t c1 = __riscv_vfmul_vf_f32m1( result1, alpha, gvl );
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl);
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*2;
+                bi += off*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 2;
+                #else
+                    pass_K = off + 2;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+1]*B[bi+0];
+                result2+=A[ai+0]*B[bi+1];
+                result3+=A[ai+1]*B[bi+1];
+                ai+=2;
+                bi+=2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] = alpha * result0;
+            C[ci+0*ldc+1] = alpha * result1;
+            C[ci+1*ldc+0] = alpha * result2;
+            C[ci+1*ldc+1] = alpha * result3;
+            m_top+=2;
+        }
+
+
+        if( M & 1 ) {
+            float result0 = 0;
+            float result1 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*1;
+                bi += off*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 1;
+                #else
+                    pass_K = off + 2;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+0]*B[bi+1];
+                ai+=1;
+                bi+=2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] = alpha * result0;
+            C[ci+1*ldc+0] = alpha * result1;
+            m_top+=1;
+        }
+
+        n_top += 2;
+    }
+
+
+
+    // -- tails for N=1
+
+    if( N & 1 ) {
+        gvl = __riscv_vsetvl_e32m1(8);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/16; i+=1) {
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*16;
+                bi += off*1;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 16;
+                #else
+                    pass_K = off + 1;
+                #endif
+            #endif
+            float B0 = B[bi+0];
+            bi += 1;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            vfloat32m1_t A1 = __riscv_vle32_v_f32m1( &A[ai+1*gvl], gvl );
+            ai += 16;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+            vfloat32m1_t result1 = __riscv_vfmul_vf_f32m1( A1, B0, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                bi += 1;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                A1 = __riscv_vle32_v_f32m1( &A[ai+1*gvl], gvl );
+                ai += 16;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m1( result1, B0, A1, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vfmul_vf_f32m1( result0, alpha, gvl );
+            vfloat32m1_t c1 = __riscv_vfmul_vf_f32m1( result1, alpha, gvl );
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl); ci += gvl;
+            __riscv_vse32_v_f32m1( &C[ci], c1, gvl);
+            m_top += 16;
+        }
+
+
+        if( M & 8 ) {
+            gvl = __riscv_vsetvl_e32m1(8);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*8;
+                bi += off*1;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 8;
+                #else
+                    pass_K = off + 1;
+                #endif
+            #endif
+            float B0 = B[bi+0];
+            bi += 1;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            ai += 8;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                bi += 1;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vfmul_vf_f32m1( result0, alpha, gvl );
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl);
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e32m1(4);
+
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*4;
+                bi += off*1;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 4;
+                #else
+                    pass_K = off + 1;
+                #endif
+            #endif
+            float B0 = B[bi+0];
+            bi += 1;
+
+            vfloat32m1_t A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+            ai += 4;
+
+            vfloat32m1_t result0 = __riscv_vfmul_vf_f32m1( A0, B0, gvl);
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0 = B[bi+0];
+                bi += 1;
+
+                A0 = __riscv_vle32_v_f32m1( &A[ai+0*gvl], gvl );
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f32m1( result0, B0, A0, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat32m1_t c0 = __riscv_vfmul_vf_f32m1( result0, alpha, gvl );
+            __riscv_vse32_v_f32m1( &C[ci], c0, gvl);
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            float result0 = 0;
+            float result1 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*2;
+                bi += off*1;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 2;
+                #else
+                    pass_K = off + 1;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                result1+=A[ai+1]*B[bi+0];
+                ai+=2;
+                bi+=1;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] = alpha * result0;
+            C[ci+0*ldc+1] = alpha * result1;
+            m_top+=2;
+        }
+
+
+        if( M & 1 ) {
+            float result0 = 0;
+            BLASLONG ai=m_top*K;
+            BLASLONG bi=n_top*K;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*1;
+                bi += off*1;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 1;
+                #else
+                    pass_K = off + 1;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=A[ai+0]*B[bi+0];
+                ai+=1;
+                bi+=1;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            C[ci+0*ldc+0] = alpha * result0;
+            m_top+=1;
+        }
+
+        n_top += 1;
+    }
+
+    return 0;
+}
+
diff --git a/kernel/riscv64/strmm_kernel_8x8_zvl128b.c b/kernel/riscv64/strmm_kernel_8x8_zvl128b.c
new file mode 100644
index 000000000..ef18f036c
--- /dev/null
+++ b/kernel/riscv64/strmm_kernel_8x8_zvl128b.c
@@ -0,0 +1,991 @@
+/*
+
+AUTOGENERATED KERNEL
+Script: ./kernel/riscv64/generate_kernel.py
+Settings:
+ LMUL=2
+ M=8
+ M_tail_scalar_from=2
+ N=8
+ __riscv_='__riscv_'
+ complex=False
+ conjugate=False
+ cpu='zvl128b'
+ force_acc_double=False
+ index_type='BLASLONG'
+ op='trmm'
+ param_precision='float'
+ reg_width_bits=128
+ tail_policy=''
+ trace=False
+
+Derived:
+ ELEN_ACC=32
+ ELEN_PARAM=32
+ LMUL_ACC=2
+ VFMACC='__riscv_vfmacc_vf_f32m2'
+ VFMUL='__riscv_vfmul_vf_f32m2'
+ VLEV='__riscv_vle32_v_f32m2'
+ VLSEV='__riscv_vlse32_v_f32m2'
+ VMACC_TO_ACC='__riscv_vfmacc_vf_f32m2'
+ VMUL_TO_ACC='__riscv_vfmul_vf_f32m2'
+ VSETVL='__riscv_vsetvl_e32m2'
+ VSEV='__riscv_vse32_v_f32m2'
+ VSSEV='__riscv_vsse32_v_f32m2'
+ acc_vector_t='vfloat32m2_t'
+ output='strmm_kernel_8x8_zvl128b.c'
+ param_scalar_t='float'
+ param_vector_t='vfloat32m2_t'
+
+*/
+
+#include "common.h"
+
+#if defined(LEFT) != defined(TRANSA) /* true when exactly one of LEFT / TRANSA is defined */
+#define BACKWARDS /* CNAME then skips the first `off` k-steps of a tile instead of stopping early */
+#endif
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset)
+
+{
+    BLASLONG gvl = 0;
+    BLASLONG m_top = 0;
+    BLASLONG n_top = 0;
+
+    // -- MAIN PASS
+
+    for (BLASLONG j = 0; j < N / 8; j += 1) {
+        m_top = 0;
+        BLASLONG gvl = __riscv_vsetvl_e32m2(8);
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 8;
+            bi += off * 8;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 8;
+#else
+            pass_K = off + 8;
+#endif
+#endif
+            float B0 = B[bi + 0];
+            float B1 = B[bi + 1];
+            float B2 = B[bi + 2];
+            float B3 = B[bi + 3];
+            float B4 = B[bi + 4];
+            float B5 = B[bi + 5];
+            float B6 = B[bi + 6];
+            float B7 = B[bi + 7];
+            bi += 8;
+
+            vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+            ai += 8;
+
+            vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
+            vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
+            vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl);
+            vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl);
+            vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl);
+            vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl);
+            vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl);
+            vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl);
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                B2 = B[bi + 2];
+                B3 = B[bi + 3];
+                B4 = B[bi + 4];
+                B5 = B[bi + 5];
+                B6 = B[bi + 6];
+                B7 = B[bi + 7];
+                bi += 8;
+
+                A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl);
+                result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl);
+                result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl);
+            vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl);
+            vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl);
+            vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl);
+            vfloat32m2_t c4 = __riscv_vfmul_vf_f32m2(result4, alpha, gvl);
+            vfloat32m2_t c5 = __riscv_vfmul_vf_f32m2(result5, alpha, gvl);
+            vfloat32m2_t c6 = __riscv_vfmul_vf_f32m2(result6, alpha, gvl);
+            vfloat32m2_t c7 = __riscv_vfmul_vf_f32m2(result7, alpha, gvl);
+            __riscv_vse32_v_f32m2(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c1, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c2, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c3, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c4, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c5, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c6, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c7, gvl);
+            m_top += 8;
+        }
+
+        // -- tails for main pass
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e32m2(4);
+
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 4;
+            bi += off * 8;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 4;
+#else
+            pass_K = off + 8;
+#endif
+#endif
+            float B0 = B[bi + 0];
+            float B1 = B[bi + 1];
+            float B2 = B[bi + 2];
+            float B3 = B[bi + 3];
+            float B4 = B[bi + 4];
+            float B5 = B[bi + 5];
+            float B6 = B[bi + 6];
+            float B7 = B[bi + 7];
+            bi += 8;
+
+            vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+            ai += 4;
+
+            vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
+            vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
+            vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl);
+            vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl);
+            vfloat32m2_t result4 = __riscv_vfmul_vf_f32m2(A0, B4, gvl);
+            vfloat32m2_t result5 = __riscv_vfmul_vf_f32m2(A0, B5, gvl);
+            vfloat32m2_t result6 = __riscv_vfmul_vf_f32m2(A0, B6, gvl);
+            vfloat32m2_t result7 = __riscv_vfmul_vf_f32m2(A0, B7, gvl);
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                B2 = B[bi + 2];
+                B3 = B[bi + 3];
+                B4 = B[bi + 4];
+                B5 = B[bi + 5];
+                B6 = B[bi + 6];
+                B7 = B[bi + 7];
+                bi += 8;
+
+                A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl);
+                result4 = __riscv_vfmacc_vf_f32m2(result4, B4, A0, gvl);
+                result5 = __riscv_vfmacc_vf_f32m2(result5, B5, A0, gvl);
+                result6 = __riscv_vfmacc_vf_f32m2(result6, B6, A0, gvl);
+                result7 = __riscv_vfmacc_vf_f32m2(result7, B7, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl);
+            vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl);
+            vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl);
+            vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl);
+            vfloat32m2_t c4 = __riscv_vfmul_vf_f32m2(result4, alpha, gvl);
+            vfloat32m2_t c5 = __riscv_vfmul_vf_f32m2(result5, alpha, gvl);
+            vfloat32m2_t c6 = __riscv_vfmul_vf_f32m2(result6, alpha, gvl);
+            vfloat32m2_t c7 = __riscv_vfmul_vf_f32m2(result7, alpha, gvl);
+            __riscv_vse32_v_f32m2(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c1, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c2, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c3, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c4, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c5, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c6, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c7, gvl);
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            float result8 = 0;
+            float result9 = 0;
+            float result10 = 0;
+            float result11 = 0;
+            float result12 = 0;
+            float result13 = 0;
+            float result14 = 0;
+            float result15 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 2;
+            bi += off * 8;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 2;
+#else
+            pass_K = off + 8;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 1] * B[bi + 0];
+                result2 += A[ai + 0] * B[bi + 1];
+                result3 += A[ai + 1] * B[bi + 1];
+                result4 += A[ai + 0] * B[bi + 2];
+                result5 += A[ai + 1] * B[bi + 2];
+                result6 += A[ai + 0] * B[bi + 3];
+                result7 += A[ai + 1] * B[bi + 3];
+                result8 += A[ai + 0] * B[bi + 4];
+                result9 += A[ai + 1] * B[bi + 4];
+                result10 += A[ai + 0] * B[bi + 5];
+                result11 += A[ai + 1] * B[bi + 5];
+                result12 += A[ai + 0] * B[bi + 6];
+                result13 += A[ai + 1] * B[bi + 6];
+                result14 += A[ai + 0] * B[bi + 7];
+                result15 += A[ai + 1] * B[bi + 7];
+                ai += 2;
+                bi += 8;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] = alpha * result0;
+            C[ci + 0 * ldc + 1] = alpha * result1;
+            C[ci + 1 * ldc + 0] = alpha * result2;
+            C[ci + 1 * ldc + 1] = alpha * result3;
+            C[ci + 2 * ldc + 0] = alpha * result4;
+            C[ci + 2 * ldc + 1] = alpha * result5;
+            C[ci + 3 * ldc + 0] = alpha * result6;
+            C[ci + 3 * ldc + 1] = alpha * result7;
+            C[ci + 4 * ldc + 0] = alpha * result8;
+            C[ci + 4 * ldc + 1] = alpha * result9;
+            C[ci + 5 * ldc + 0] = alpha * result10;
+            C[ci + 5 * ldc + 1] = alpha * result11;
+            C[ci + 6 * ldc + 0] = alpha * result12;
+            C[ci + 6 * ldc + 1] = alpha * result13;
+            C[ci + 7 * ldc + 0] = alpha * result14;
+            C[ci + 7 * ldc + 1] = alpha * result15;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 1;
+            bi += off * 8;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 1;
+#else
+            pass_K = off + 8;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 0] * B[bi + 1];
+                result2 += A[ai + 0] * B[bi + 2];
+                result3 += A[ai + 0] * B[bi + 3];
+                result4 += A[ai + 0] * B[bi + 4];
+                result5 += A[ai + 0] * B[bi + 5];
+                result6 += A[ai + 0] * B[bi + 6];
+                result7 += A[ai + 0] * B[bi + 7];
+                ai += 1;
+                bi += 8;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] = alpha * result0;
+            C[ci + 1 * ldc + 0] = alpha * result1;
+            C[ci + 2 * ldc + 0] = alpha * result2;
+            C[ci + 3 * ldc + 0] = alpha * result3;
+            C[ci + 4 * ldc + 0] = alpha * result4;
+            C[ci + 5 * ldc + 0] = alpha * result5;
+            C[ci + 6 * ldc + 0] = alpha * result6;
+            C[ci + 7 * ldc + 0] = alpha * result7;
+            m_top += 1;
+        }
+
+        n_top += 8;
+    }
+
+    // -- tails for N=4
+
+    if (N & 4) {
+        gvl = __riscv_vsetvl_e32m2(8);
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 8;
+            bi += off * 4;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 8;
+#else
+            pass_K = off + 4;
+#endif
+#endif
+            float B0 = B[bi + 0];
+            float B1 = B[bi + 1];
+            float B2 = B[bi + 2];
+            float B3 = B[bi + 3];
+            bi += 4;
+
+            vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+            ai += 8;
+
+            vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
+            vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
+            vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl);
+            vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl);
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                B2 = B[bi + 2];
+                B3 = B[bi + 3];
+                bi += 4;
+
+                A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl);
+            vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl);
+            vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl);
+            vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl);
+            __riscv_vse32_v_f32m2(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c1, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c2, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c3, gvl);
+            m_top += 8;
+        }
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e32m2(4);
+
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 4;
+            bi += off * 4;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 4;
+#else
+            pass_K = off + 4;
+#endif
+#endif
+            float B0 = B[bi + 0];
+            float B1 = B[bi + 1];
+            float B2 = B[bi + 2];
+            float B3 = B[bi + 3];
+            bi += 4;
+
+            vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+            ai += 4;
+
+            vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
+            vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
+            vfloat32m2_t result2 = __riscv_vfmul_vf_f32m2(A0, B2, gvl);
+            vfloat32m2_t result3 = __riscv_vfmul_vf_f32m2(A0, B3, gvl);
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                B2 = B[bi + 2];
+                B3 = B[bi + 3];
+                bi += 4;
+
+                A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
+                result2 = __riscv_vfmacc_vf_f32m2(result2, B2, A0, gvl);
+                result3 = __riscv_vfmacc_vf_f32m2(result3, B3, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl);
+            vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl);
+            vfloat32m2_t c2 = __riscv_vfmul_vf_f32m2(result2, alpha, gvl);
+            vfloat32m2_t c3 = __riscv_vfmul_vf_f32m2(result3, alpha, gvl);
+            __riscv_vse32_v_f32m2(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c1, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c2, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c3, gvl);
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            float result4 = 0;
+            float result5 = 0;
+            float result6 = 0;
+            float result7 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 2;
+            bi += off * 4;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 2;
+#else
+            pass_K = off + 4;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 1] * B[bi + 0];
+                result2 += A[ai + 0] * B[bi + 1];
+                result3 += A[ai + 1] * B[bi + 1];
+                result4 += A[ai + 0] * B[bi + 2];
+                result5 += A[ai + 1] * B[bi + 2];
+                result6 += A[ai + 0] * B[bi + 3];
+                result7 += A[ai + 1] * B[bi + 3];
+                ai += 2;
+                bi += 4;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] = alpha * result0;
+            C[ci + 0 * ldc + 1] = alpha * result1;
+            C[ci + 1 * ldc + 0] = alpha * result2;
+            C[ci + 1 * ldc + 1] = alpha * result3;
+            C[ci + 2 * ldc + 0] = alpha * result4;
+            C[ci + 2 * ldc + 1] = alpha * result5;
+            C[ci + 3 * ldc + 0] = alpha * result6;
+            C[ci + 3 * ldc + 1] = alpha * result7;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 1;
+            bi += off * 4;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 1;
+#else
+            pass_K = off + 4;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 0] * B[bi + 1];
+                result2 += A[ai + 0] * B[bi + 2];
+                result3 += A[ai + 0] * B[bi + 3];
+                ai += 1;
+                bi += 4;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] = alpha * result0;
+            C[ci + 1 * ldc + 0] = alpha * result1;
+            C[ci + 2 * ldc + 0] = alpha * result2;
+            C[ci + 3 * ldc + 0] = alpha * result3;
+            m_top += 1;
+        }
+
+        n_top += 4;
+    }
+
+    // -- tails for N=2
+
+    if (N & 2) {
+        gvl = __riscv_vsetvl_e32m2(8);
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 8;
+            bi += off * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 8;
+#else
+            pass_K = off + 2;
+#endif
+#endif
+            float B0 = B[bi + 0];
+            float B1 = B[bi + 1];
+            bi += 2;
+
+            vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+            ai += 8;
+
+            vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
+            vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                bi += 2;
+
+                A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl);
+            vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl);
+            __riscv_vse32_v_f32m2(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c1, gvl);
+            m_top += 8;
+        }
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e32m2(4);
+
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 4;
+            bi += off * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 4;
+#else
+            pass_K = off + 2;
+#endif
+#endif
+            float B0 = B[bi + 0];
+            float B1 = B[bi + 1];
+            bi += 2;
+
+            vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+            ai += 4;
+
+            vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
+            vfloat32m2_t result1 = __riscv_vfmul_vf_f32m2(A0, B1, gvl);
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0 = B[bi + 0];
+                B1 = B[bi + 1];
+                bi += 2;
+
+                A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
+                result1 = __riscv_vfmacc_vf_f32m2(result1, B1, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl);
+            vfloat32m2_t c1 = __riscv_vfmul_vf_f32m2(result1, alpha, gvl);
+            __riscv_vse32_v_f32m2(&C[ci], c0, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vse32_v_f32m2(&C[ci], c1, gvl);
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            float result0 = 0;
+            float result1 = 0;
+            float result2 = 0;
+            float result3 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 2;
+            bi += off * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 2;
+#else
+            pass_K = off + 2;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 1] * B[bi + 0];
+                result2 += A[ai + 0] * B[bi + 1];
+                result3 += A[ai + 1] * B[bi + 1];
+                ai += 2;
+                bi += 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] = alpha * result0;
+            C[ci + 0 * ldc + 1] = alpha * result1;
+            C[ci + 1 * ldc + 0] = alpha * result2;
+            C[ci + 1 * ldc + 1] = alpha * result3;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            float result0 = 0;
+            float result1 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 1;
+            bi += off * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 1;
+#else
+            pass_K = off + 2;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 0] * B[bi + 1];
+                ai += 1;
+                bi += 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] = alpha * result0;
+            C[ci + 1 * ldc + 0] = alpha * result1;
+            m_top += 1;
+        }
+
+        n_top += 2;
+    }
+
+    // -- tails for N=1
+
+    if (N & 1) {
+        gvl = __riscv_vsetvl_e32m2(8);
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 8; i += 1) {
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 8;
+            bi += off * 1;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 8;
+#else
+            pass_K = off + 1;
+#endif
+#endif
+            float B0 = B[bi + 0];
+            bi += 1;
+
+            vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+            ai += 8;
+
+            vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0 = B[bi + 0];
+                bi += 1;
+
+                A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+                ai += 8;
+
+                result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl);
+            __riscv_vse32_v_f32m2(&C[ci], c0, gvl);
+            m_top += 8;
+        }
+
+        if (M & 4) {
+            gvl = __riscv_vsetvl_e32m2(4);
+
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 4;
+            bi += off * 1;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 4;
+#else
+            pass_K = off + 1;
+#endif
+#endif
+            float B0 = B[bi + 0];
+            bi += 1;
+
+            vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+            ai += 4;
+
+            vfloat32m2_t result0 = __riscv_vfmul_vf_f32m2(A0, B0, gvl);
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0 = B[bi + 0];
+                bi += 1;
+
+                A0 = __riscv_vle32_v_f32m2(&A[ai + 0 * gvl], gvl);
+                ai += 4;
+
+                result0 = __riscv_vfmacc_vf_f32m2(result0, B0, A0, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat32m2_t c0 = __riscv_vfmul_vf_f32m2(result0, alpha, gvl);
+            __riscv_vse32_v_f32m2(&C[ci], c0, gvl);
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            float result0 = 0;
+            float result1 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 2;
+            bi += off * 1;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 2;
+#else
+            pass_K = off + 1;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                result1 += A[ai + 1] * B[bi + 0];
+                ai += 2;
+                bi += 1;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] = alpha * result0;
+            C[ci + 0 * ldc + 1] = alpha * result1;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            float result0 = 0;
+            BLASLONG ai = m_top * K;
+            BLASLONG bi = n_top * K;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 1;
+            bi += off * 1;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 1;
+#else
+            pass_K = off + 1;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += A[ai + 0] * B[bi + 0];
+                ai += 1;
+                bi += 1;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            C[ci + 0 * ldc + 0] = alpha * result0;
+            m_top += 1;
+        }
+
+        n_top += 1;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/sum_rvv.c b/kernel/riscv64/sum_rvv.c
new file mode 100644
index 000000000..c5629197f
--- /dev/null
+++ b/kernel/riscv64/sum_rvv.c
@@ -0,0 +1,95 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)   /* single precision: e32 intrinsics, m8 working vectors */
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m8()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e32m1()
+#define FLOAT_V_T               vfloat32m8_t
+#define FLOAT_V_T_M1            vfloat32m1_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
+#define VFADDVV_FLOAT_TU        __riscv_vfadd_vv_f32m8_tu   /* tail-undisturbed add */
+#define VFREDSUMVS_FLOAT        __riscv_vfredusum_vs_f32m8_f32m1    /* unordered sum, m8 -> m1 */
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
+#else                  /* double precision: e64 intrinsics, m8 working vectors */
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m8()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e64m1()
+#define FLOAT_V_T               vfloat64m8_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
+#define VFADDVV_FLOAT_TU        __riscv_vfadd_vv_f64m8_tu   /* tail-undisturbed add */
+#define VFREDSUMVS_FLOAT        __riscv_vfredusum_vs_f64m8_f64m1    /* unordered sum, m8 -> m1 */
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
+#endif                 /* !defined(DOUBLE) */
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{   /* Returns the sum of n elements of x taken inc_x apart; 0 if n <= 0 or inc_x <= 0. */
+    FLOAT sumf = 0.0;
+    if (n <= 0 || inc_x <= 0) return(sumf);
+    /* vsum holds per-lane partial sums; v_res is the m1 register used by the final reduction. */
+    FLOAT_V_T vx, vsum;
+    FLOAT_V_T_M1 v_res;
+    /* Zero both accumulators across their maximum vector length. */
+    v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
+    size_t vlmax = VSETVL_MAX;
+    vsum = VFMVVF_FLOAT(0.0, vlmax);
+
+    if(inc_x == 1) {
+        /* Contiguous input: unit-stride loads, vl elements per pass. */
+        for (size_t vl; n > 0; n -= vl, x += vl) {
+            vl = VSETVL(n);
+
+            vx = VLEV_FLOAT(x, vl);
+            vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl);    /* _tu: lanes >= vl keep their partial sums */
+        }
+
+    } else {
+        /* Strided input: the strided load takes its stride in bytes. */
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
+            vl = VSETVL(n);
+
+            vx = VLSEV_FLOAT(x, stride_x, vl);
+            vsum = VFADDVV_FLOAT_TU(vsum, vsum, vx, vl);    /* _tu: lanes >= vl keep their partial sums */
+        }
+
+    }
+    /* Reduce over vlmax lanes, not the last vl, so every lane's partial sum is counted. */
+    v_res = VFREDSUMVS_FLOAT(vsum, v_res, vlmax);
+    sumf = VFMVFS_FLOAT_M1(v_res);
+    return(sumf);
+}
diff --git a/kernel/riscv64/sum_vector.c b/kernel/riscv64/sum_vector.c
new file mode 100644
index 000000000..cf734faab
--- /dev/null
+++ b/kernel/riscv64/sum_vector.c
@@ -0,0 +1,114 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if !defined(DOUBLE)   /* single precision: e32 intrinsics, m8 working vectors */
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m8)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)()   /* note: maximum for m1, not m8 */
+#define FLOAT_V_T vfloat32m8_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m8)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m8)
+#define VFREDSUMVS_FLOAT RISCV_RVV(vfredusum_vs_f32m8_f32m1)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m8)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
+#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f32m8)
+#else                  /* double precision: e64 intrinsics, m8 working vectors */
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m8)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)()   /* note: maximum for m1, not m8 */
+#define FLOAT_V_T vfloat64m8_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m8)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m8)
+#define VFREDSUMVS_FLOAT RISCV_RVV(vfredusum_vs_f64m8_f64m1)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m8)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
+#define VFADDVV_FLOAT RISCV_RVV(vfadd_vv_f64m8)
+#endif                 /* !defined(DOUBLE) */
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{       /* Returns the sum of n elements of x spaced inc_x apart; 0 if n <= 0 or inc_x <= 0. */
+        BLASLONG i=0, j=0;              /* j: number of elements consumed so far */
+        BLASLONG ix=0;                  /* strided path: index in x of the next unread element */
+        FLOAT asumf=0.0;
+        if (n <= 0 || inc_x <= 0) return(asumf);
+        unsigned int gvl = 0;
+        FLOAT_V_T v0, v1, v_sum;
+        FLOAT_V_T_M1 v_res;             /* element 0 carries the running scalar sum */
+        gvl = VSETVL_MAX;
+        v_res = VFMVVF_FLOAT_M1(0, gvl);
+
+        if(inc_x == 1){
+                gvl = VSETVL(n);
+                if(gvl <= n/2){         /* at least two full vectors: run the 2x-unrolled loop */
+                        v_sum = VFMVVF_FLOAT(0, gvl);
+                        for(i=0,j=0; i<n/(gvl*2); i++){
+                                v0 = VLEV_FLOAT(&x[j], gvl);
+                                v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
+
+                                v1 = VLEV_FLOAT(&x[j+gvl], gvl);
+                                v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
+                                j += gvl * 2;
+                        }
+                        v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
+                }
+                for(;j<n;){             /* remainder: reduce each chunk straight into v_res */
+                        gvl = VSETVL(n-j);
+                        v0 = VLEV_FLOAT(&x[j], gvl);
+                        v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl);
+                        j += gvl;
+                }
+        }else{
+                gvl = VSETVL(n);
+                BLASLONG stride_x = inc_x * sizeof(FLOAT);     /* byte stride; BLASLONG avoids truncating large strides */
+                if(gvl <= n/2){         /* at least two full vectors: run the 2x-unrolled loop */
+                        v_sum = VFMVVF_FLOAT(0, gvl);
+                        BLASLONG inc_xv = inc_x * gvl;          /* elements of x spanned by one vector; loop-invariant */
+                        for(i=0,j=0; i<n/(gvl*2); i++){
+                                v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
+
+                                v1 = VLSEV_FLOAT(&x[ix+inc_xv], stride_x, gvl);
+                                v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
+                                j += gvl * 2;
+                                ix += inc_xv * 2;               /* advance past both vectors just read */
+                        }
+                        v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
+                }
+                for(;j<n;){             /* remainder: reduce each chunk straight into v_res */
+                        gvl = VSETVL(n-j);
+                        v0 = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl);
+                        v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl);
+                        j += gvl;
+                }
+        }
+        asumf = EXTRACT_FLOAT(v_res);
+        return(asumf);
+}
+
+
diff --git a/kernel/riscv64/swap.c b/kernel/riscv64/swap.c
index eac621fb2..33bbeeb6a 100644
--- a/kernel/riscv64/swap.c
+++ b/kernel/riscv64/swap.c
@@ -41,7 +41,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
 	BLASLONG ix=0,iy=0;
 	FLOAT temp;
 
-	if ( n < 0     )  return(0);
+	if ( n <= 0     )  return(0);
 
 	while(i < n)
 	{
diff --git a/kernel/riscv64/swap_rvv.c b/kernel/riscv64/swap_rvv.c
new file mode 100644
index 000000000..893d70554
--- /dev/null
+++ b/kernel/riscv64/swap_rvv.c
@@ -0,0 +1,138 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)   /* single precision: e32 intrinsics, m8 vectors */
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define FLOAT_V_T               vfloat32m8_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8      /* unit-stride load */
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8     /* strided load */
+#define VSEV_FLOAT              __riscv_vse32_v_f32m8      /* unit-stride store */
+#define VSSEV_FLOAT             __riscv_vsse32_v_f32m8     /* strided store */
+#else                  /* double precision: e64 intrinsics, m8 vectors */
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define FLOAT_V_T               vfloat64m8_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8      /* unit-stride load */
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8     /* strided load */
+#define VSEV_FLOAT              __riscv_vse64_v_f64m8      /* unit-stride store */
+#define VSSEV_FLOAT             __riscv_vsse64_v_f64m8     /* strided store */
+#endif                 /* !defined(DOUBLE) */
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{   /* Swaps n elements of x (spaced inc_x) with n elements of y (spaced inc_y); always returns 0. */
+    BLASLONG stride_x, stride_y;    /* byte strides handed to the strided load/store intrinsics */
+    FLOAT_V_T vx, vy;
+
+    if (n <= 0) return(0);
+
+    if (inc_x == 0 && inc_y == 0) {
+        if (n & 1) {    /* n swaps of the same pair: only an odd count leaves them exchanged */
+            FLOAT temp = x[0];
+            x[0] = y[0];
+            y[0] = temp;
+        }
+        else {
+            return 0;
+        }
+    }
+    else if(inc_x == 0) {   /* x[0] is swapped with each y element in turn: y shifts up one slot */
+        FLOAT temp = x[0];
+        x[0] = y[(n - 1) * inc_y];
+        FLOAT* ptr = y + (n - 1) * inc_y;   // start from the last one
+        stride_y = (0 - inc_y) * sizeof(FLOAT); // reverse
+        BLASLONG m = n - 1;
+        for (size_t vl; m > 0; m -= vl, ptr -= vl*inc_y) {
+            vl = VSETVL(m);
+            vy = VLSEV_FLOAT(ptr - inc_y, stride_y, vl);    // preceding element is inc_y back, not 1
+            VSSEV_FLOAT(ptr, stride_y, vy, vl);
+        }
+        y[0] = temp;
+    }
+    else if(inc_y == 0) {   /* mirror case: y[0] is swapped with each x element, x shifts up one slot */
+        FLOAT temp = y[0];
+        y[0] = x[(n - 1) * inc_x];
+        FLOAT* ptr = x + (n - 1) * inc_x;   // start from the last one
+        stride_x = (0 - inc_x) * sizeof(FLOAT); // reverse
+        BLASLONG m = n - 1;
+        for (size_t vl; m > 0; m -= vl, ptr -= vl*inc_x) {
+            vl = VSETVL(m);
+            vx = VLSEV_FLOAT(ptr - inc_x, stride_x, vl);    // preceding element is inc_x back, not 1
+            VSSEV_FLOAT(ptr, stride_x, vx, vl);
+        }
+        x[0] = temp;
+    }
+    else if(inc_x == 1 && inc_y == 1) {
+        for (size_t vl; n > 0; n -= vl, x += vl, y += vl) {
+            vl = VSETVL(n);
+            /* Load both chunks before storing either one. */
+            vx = VLEV_FLOAT(x, vl);
+            vy = VLEV_FLOAT(y, vl);
+            VSEV_FLOAT(y, vx, vl);
+            VSEV_FLOAT(x, vy, vl);
+        }
+
+    } else if (inc_y == 1) {
+        stride_x = inc_x * sizeof(FLOAT);
+        /* x strided, y contiguous. */
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl) {
+            vl = VSETVL(n);
+
+            vx = VLSEV_FLOAT(x, stride_x, vl);
+            vy = VLEV_FLOAT(y, vl);
+            VSEV_FLOAT(y, vx, vl);
+            VSSEV_FLOAT(x, stride_x, vy, vl);
+        }
+
+    } else if(inc_x == 1) {
+        stride_y = inc_y * sizeof(FLOAT);
+        /* x contiguous, y strided. */
+        for (size_t vl; n > 0; n -= vl, x += vl, y += vl*inc_y) {
+            vl = VSETVL(n);
+
+            vx = VLEV_FLOAT(x, vl);
+            vy = VLSEV_FLOAT(y, stride_y, vl);
+            VSSEV_FLOAT(y, stride_y, vx, vl);
+            VSEV_FLOAT(x, vy, vl);
+        }
+
+    } else {
+        stride_x = inc_x * sizeof(FLOAT);
+        stride_y = inc_y * sizeof(FLOAT);
+        /* Both vectors strided. */
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x, y += vl*inc_y) {
+            vl = VSETVL(n);
+
+            vx = VLSEV_FLOAT(x, stride_x, vl);
+            vy = VLSEV_FLOAT(y, stride_y, vl);
+            VSSEV_FLOAT(y, stride_y, vx, vl);
+            VSSEV_FLOAT(x, stride_x, vy, vl);
+        }
+    }
+
+    return(0);
+}
diff --git a/kernel/riscv64/swap_vector.c b/kernel/riscv64/swap_vector.c
index d9421e2f1..f583f5392 100644
--- a/kernel/riscv64/swap_vector.c
+++ b/kernel/riscv64/swap_vector.c
@@ -27,35 +27,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #include <stdio.h>
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
-#define VSEV_FLOAT vse_v_f32m8
-#define VSSEV_FLOAT vsse_v_f32m8
+
+#ifdef RISCV64_ZVL256B
+#       define LMUL m2
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN 32
+#       else
+#               define ELEN 32
+#               define MLEN 16
+#       endif
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m8_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
-#define VSEV_FLOAT vse_v_f64m8
-#define VSSEV_FLOAT vsse_v_f64m8
+#       define LMUL m8
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN 8
+#       else
+#               define ELEN 32
+#               define MLEN 4
+#       endif
 #endif
 
+#define _
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+
+#define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _)
+#define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
+#define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
+#define VSEV_FLOAT      JOIN(RISCV_RVV(vse),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VSSEV_FLOAT     JOIN(RISCV_RVV(vsse),      ELEN,   _v_f,   ELEN,   LMUL)
+
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
 {
 	BLASLONG i = 0, j = 0;
 	BLASLONG ix = 0,iy = 0;
         BLASLONG stride_x, stride_y;
         FLOAT_V_T vx0, vx1, vy0, vy1;
-        unsigned int gvl = 0;
 
-	if (n < 0)  return(0);
+	if (n <= 0)  return(0);
+
+        unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1);
+        if( inc_x == 0 && inc_y == 0 ) { n = n & 1; }
+
         if(inc_x == 1 && inc_y == 1){
-                gvl = VSETVL(n);
                 if(gvl <= n/2){
                         for(i=0,j=0; i<n/(2*gvl); i++){
                                 vx0 = VLEV_FLOAT(&x[j], gvl);
@@ -79,7 +96,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
                         j+=gvl;
                 }
         }else if (inc_y == 1){
-                gvl = VSETVL(n);
                 stride_x = inc_x * sizeof(FLOAT);
                 if(gvl <= n/2){
                         BLASLONG inc_xv = inc_x * gvl;
@@ -107,7 +123,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
                         ix += inc_x * gvl;
                 }
         }else if(inc_x == 1){
-                gvl = VSETVL(n);
                 stride_y = inc_y * sizeof(FLOAT);
                 if(gvl <= n/2){
                         BLASLONG inc_yv = inc_y * gvl;
@@ -135,7 +150,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
                         iy += inc_y * gvl;
                 }
         }else{
-                gvl = VSETVL(n);
                 stride_x = inc_x * sizeof(FLOAT);
                 stride_y = inc_y * sizeof(FLOAT);
                 if(gvl <= n/2){
diff --git a/kernel/riscv64/symm_lcopy_rvv_v1.c b/kernel/riscv64/symm_lcopy_rvv_v1.c
new file mode 100644
index 000000000..a615db44d
--- /dev/null
+++ b/kernel/riscv64/symm_lcopy_rvv_v1.c
@@ -0,0 +1,101 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)   /* single precision: e32 intrinsics, m2 vectors */
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m2()
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m2
+#define INT_V_T                 vint32m2_t
+#define VID_V_INT               __riscv_vid_v_i32m2
+#define VADD_VX_INT             __riscv_vadd_vx_i32m2
+#define VMSGT_VX_INT            __riscv_vmsgt_vx_i32m2_b16
+#define VBOOL_T                 vbool16_t   /* mask type paired with e32/m2 */
+#define VMERGE_VVM_FLOAT        __riscv_vmerge_vvm_f32m2
+#else                  /* double precision: e64 intrinsics, m2 vectors */
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m2()
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
+#define INT_V_T                 vint64m2_t
+#define VID_V_INT               __riscv_vid_v_i64m2
+#define VADD_VX_INT             __riscv_vadd_vx_i64m2
+#define VMSGT_VX_INT            __riscv_vmsgt_vx_i64m2_b32
+#define VBOOL_T                 vbool32_t   /* mask type paired with e64/m2 */
+#define VMERGE_VVM_FLOAT        __riscv_vmerge_vvm_f64m2
+#endif                 /* !defined(DOUBLE) */
+
+// Optimizes the implementation in ../generic/symm_lcopy_4.c
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b)
+{   /* For each group of vl columns out of n, writes m vl-wide vectors to b back to back; returns 0. */
+    BLASLONG i, js, offset;
+
+    FLOAT *ao1, *ao2;
+    /* Byte distance between elements lda apart, used by the strided load. */
+    BLASLONG stride_lda = sizeof(FLOAT)*lda;
+
+    FLOAT_V_T vb, va1, va2;
+    VBOOL_T vbool;
+    INT_V_T vindex_max, vindex;
+    /* vindex_max = {0, 1, 2, ...}: lane numbers, built once at the maximum vector length. */
+    size_t vl = VSETVL_MAX;
+    vindex_max   = VID_V_INT(vl);
+
+    for (js = n; js > 0; js -= vl, posX += vl) {
+        vl = VSETVL(js);
+        offset = posX - posY;
+        /* ao1 is read contiguously and stepped by lda; ao2 is read with stride lda and stepped by 1. */
+        ao1 = a + posX + posY * lda;
+        ao2 = a + posY + (posX) * lda;
+
+        for (i = m; i > 0; i--, offset--) {
+            va2 = VLSEV_FLOAT(ao2, stride_lda, vl);
+            va1 = VLEV_FLOAT(ao1, vl);
+
+            // offset > (0 - vindex)   --->   (offset + vindex) > 0
+            vindex = VADD_VX_INT(vindex_max, offset, vl);
+            vbool  = VMSGT_VX_INT(vindex, 0, vl);
+            /* Lanes with the mask set take va1 (contiguous); the other lanes take va2 (strided). */
+            vb =  VMERGE_VVM_FLOAT(va2, va1, vbool, vl);
+            VSEV_FLOAT(b, vb, vl);
+
+            b += vl;
+            ao1 += lda;
+            ao2++;
+        }
+    }
+
+    return 0;
+}
+
diff --git a/kernel/riscv64/symm_ucopy_rvv_v1.c b/kernel/riscv64/symm_ucopy_rvv_v1.c
new file mode 100644
index 000000000..464f97b3a
--- /dev/null
+++ b/kernel/riscv64/symm_ucopy_rvv_v1.c
@@ -0,0 +1,100 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)   /* single precision: e32 intrinsics, m2 vectors */
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m2()
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m2
+#define INT_V_T                 vint32m2_t
+#define VID_V_INT               __riscv_vid_v_i32m2
+#define VADD_VX_INT             __riscv_vadd_vx_i32m2
+#define VMSGT_VX_INT            __riscv_vmsgt_vx_i32m2_b16
+#define VBOOL_T                 vbool16_t   /* mask type paired with e32/m2 */
+#define VMERGE_VVM_FLOAT        __riscv_vmerge_vvm_f32m2
+#else                  /* double precision: e64 intrinsics, m2 vectors */
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m2()
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
+#define INT_V_T                 vint64m2_t
+#define VID_V_INT               __riscv_vid_v_i64m2
+#define VADD_VX_INT             __riscv_vadd_vx_i64m2
+#define VMSGT_VX_INT            __riscv_vmsgt_vx_i64m2_b32
+#define VBOOL_T                 vbool32_t   /* mask type paired with e64/m2 */
+#define VMERGE_VVM_FLOAT        __riscv_vmerge_vvm_f64m2
+#endif                 /* !defined(DOUBLE) */
+
+// Optimizes the implementation in ../generic/symm_ucopy_4.c
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b)
+{   /* For each group of vl columns out of n, writes m vl-wide vectors to b back to back; returns 0. */
+    BLASLONG i, js, offset;
+
+    FLOAT *ao1, *ao2;
+    /* Byte distance between elements lda apart, used by the strided load. */
+    BLASLONG stride_lda = sizeof(FLOAT)*lda;
+    /* vb is the merged vector written to b; va1 and va2 are the two candidate sources. */
+    FLOAT_V_T vb, va1, va2;
+    VBOOL_T vbool;
+    INT_V_T vindex_max, vindex;
+    /* vindex_max = {0, 1, 2, ...}: lane numbers, built once at the maximum vector length. */
+    size_t vl = VSETVL_MAX;
+    vindex_max   = VID_V_INT(vl);
+
+    for (js = n; js > 0; js -= vl, posX += vl) {
+        vl = VSETVL(js);
+        offset = posX - posY;
+        /* ao1 is read with stride lda and stepped by 1; ao2 is read contiguously and stepped by lda. */
+        ao1 = a + posY + (posX + 0) * lda;
+        ao2 = a + posX + 0 + posY * lda;
+
+        for (i = m; i > 0; i--, offset--) {
+            va1 = VLSEV_FLOAT(ao1, stride_lda, vl);
+            va2 = VLEV_FLOAT(ao2, vl);
+
+            // offset > (0 - vindex)   --->   (offset + vindex) > 0
+            vindex = VADD_VX_INT(vindex_max, offset, vl);
+            vbool  = VMSGT_VX_INT(vindex, 0, vl);
+            /* Lanes with the mask set take va1 (strided); the other lanes take va2 (contiguous). */
+            vb =  VMERGE_VVM_FLOAT(va2, va1, vbool, vl);
+            VSEV_FLOAT(b, vb, vl);
+
+            b += vl;
+            ao1++;
+            ao2 += lda;
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/symv_L_rvv.c b/kernel/riscv64/symv_L_rvv.c
new file mode 100644
index 000000000..888d628a5
--- /dev/null
+++ b/kernel/riscv64/symv_L_rvv.c
@@ -0,0 +1,219 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)  // map the generic macro names onto the e32 / f32 intrinsics
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e32m1()
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m8()
+#define FLOAT_V_T_M1            vfloat32m1_t  // LMUL=1 type, used for the reduction seed and result
+#define FLOAT_V_T               vfloat32m8_t  // LMUL=8 working vector type
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8
+#define VSEV_FLOAT              __riscv_vse32_v_f32m8
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
+#define VSSEV_FLOAT             __riscv_vsse32_v_f32m8
+#define VFMACCVV_FLOAT_TU       __riscv_vfmacc_vv_f32m8_tu  // tail-undisturbed multiply-accumulate
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m8
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m8  // not referenced in this file
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m8  // not referenced in this file
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
+#define VFMSACVF_FLOAT          __riscv_vfmsac_vf_f32m8  // not referenced in this file
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f32m8_f32m1  // unordered sum reduction, m8 -> m1
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32  // extract element 0 as a scalar
+#else  // DOUBLE defined: same macro names mapped onto the e64 / f64 intrinsics
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e64m1()
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m8()
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define FLOAT_V_T               vfloat64m8_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VSEV_FLOAT              __riscv_vse64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VSSEV_FLOAT             __riscv_vsse64_v_f64m8
+#define VFMACCVV_FLOAT_TU       __riscv_vfmacc_vv_f64m8_tu
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m8
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m8  // not referenced in this file
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m8  // not referenced in this file
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
+#define VFMSACVF_FLOAT          __riscv_vfmsac_vf_f64m8  // not referenced in this file
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f64m8_f64m1
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
+#endif
+// For each j in [0, offset), with A = a + j*lda and t = alpha*x[j]: y[j] += t*A[j]; y[i] += t*A[i] for j < i < m; then y[j] += alpha * sum(A[i]*x[i], j < i < m).
+int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+        BLASLONG i, j, k;
+        BLASLONG ix,iy;
+        BLASLONG jx,jy;
+        FLOAT temp1;
+        FLOAT *a_ptr = a;
+        // v_z0: all-zero LMUL=1 vector passed as the scalar seed of every reduction below.
+        FLOAT_V_T_M1 v_res, v_z0;
+        size_t vlmax = VSETVL_MAX_M1, vl;
+        v_z0 = VFMVVF_FLOAT_M1(0, vlmax);
+        vlmax = VSETVL_MAX;  // from here on vlmax is the maximum vl of the LMUL=8 working vectors
+
+        FLOAT_V_T va, vx, vy, vr;
+        BLASLONG stride_x, stride_y, inc_xv, inc_yv;
+        // x and y are indexed with element strides inc_x / inc_y; the buffer argument is not used.
+        if(inc_x == 1 && inc_y == 1)  // unit stride for both x and y
+        {
+                for (j=0; j<offset; j++)
+                {
+                        temp1 = alpha * x[j];
+                        y[j] += temp1 * a_ptr[j];
+                        i = j + 1;
+                        vr = VFMVVF_FLOAT(0, vlmax);  // zero the product accumulator in all vlmax lanes
+                        for (k = (m-i); k > 0; k -= vl, i += vl)
+                        {
+                                vl = VSETVL(k);
+                                va = VLEV_FLOAT(&a_ptr[i], vl);
+                                vy = VLEV_FLOAT(&y[i], vl);
+                                vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
+                                VSEV_FLOAT(&y[i], vy, vl);
+                                // Accumulate x[i]*a_ptr[i] lane-wise; the _tu form leaves lanes beyond vl unchanged.
+                                vx = VLEV_FLOAT(&x[i], vl);
+                                vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl);
+
+                        }
+                        v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax);  // sum all vlmax lanes of vr, seeded with 0
+
+                        y[j] += alpha * VFMVFS_FLOAT_M1(v_res);
+                        a_ptr += lda;  // advance a_ptr by lda elements for the next j
+                }
+        }
+        else if(inc_x == 1)  // unit-stride x, strided y
+        {
+                jy = 0;
+                stride_y = inc_y * sizeof(FLOAT);  // byte stride for the strided loads/stores of y
+                for (j=0; j<offset; j++)
+                {
+                        temp1 = alpha * x[j];
+                        y[jy] += temp1 * a_ptr[j];
+                        iy = jy + inc_y;
+                        i = j + 1;
+                        vr = VFMVVF_FLOAT(0, vlmax);
+                        for (k = (m-i); k > 0; k -= vl, i += vl)
+                        {
+                                vl = VSETVL(k);
+                                inc_yv = inc_y * vl;  // y index advance for the vl elements handled this pass
+                                va = VLEV_FLOAT(&a_ptr[i], vl);
+                                vy = VLSEV_FLOAT(&y[iy], stride_y, vl);
+                                vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
+                                VSSEV_FLOAT(&y[iy], stride_y, vy, vl);
+
+                                vx = VLEV_FLOAT(&x[i], vl);
+                                vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl);
+
+                                iy += inc_yv;
+                        }
+                        v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax);
+
+                        y[jy] += alpha * VFMVFS_FLOAT_M1(v_res);
+                        jy    += inc_y;
+                        a_ptr += lda;
+                }
+        }
+        else if(inc_y == 1)  // strided x, unit-stride y
+        {
+                jx = 0;
+                stride_x = inc_x * sizeof(FLOAT);  // byte stride for the strided loads of x
+                for (j=0; j<offset; j++)
+                {
+                        temp1 = alpha * x[jx];
+                        y[j] += temp1 * a_ptr[j];
+                        ix = jx + inc_x;
+                        i = j + 1;
+                        vr = VFMVVF_FLOAT(0, vlmax);
+                        for (k = (m-i); k > 0; k -= vl, i += vl)
+                        {
+                                vl = VSETVL(k);
+                                inc_xv = inc_x * vl;  // x index advance for the vl elements handled this pass
+
+                                va = VLEV_FLOAT(&a_ptr[i], vl);
+                                vy = VLEV_FLOAT(&y[i], vl);
+                                vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
+                                VSEV_FLOAT(&y[i], vy, vl);
+
+                                vx = VLSEV_FLOAT(&x[ix], stride_x, vl);
+                                vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl);
+
+                                ix += inc_xv;
+                        }
+
+                        v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax);
+
+                        y[j] += alpha * VFMVFS_FLOAT_M1(v_res);
+                        jx    += inc_x;
+                        a_ptr += lda;
+                }
+        }
+        else  // both x and y strided
+        {
+                stride_x = inc_x * sizeof(FLOAT);
+                stride_y = inc_y * sizeof(FLOAT);
+                jx = 0;
+                jy = 0;
+                for (j=0; j<offset; j++)
+                {
+                        temp1 = alpha * x[jx];
+                        y[jy] += temp1 * a_ptr[j];
+                        ix = jx + inc_x;
+                        iy = jy + inc_y;
+                        i = j + 1;
+                        vr = VFMVVF_FLOAT(0, vlmax);
+                        for (k = (m-i); k > 0; k -= vl, i += vl)
+                        {
+                                vl = VSETVL(k);
+                                inc_xv = inc_x * vl;
+                                inc_yv = inc_y * vl;
+                                // a_ptr is always read with unit stride; only x and y use strided accesses.
+                                va = VLEV_FLOAT(&a_ptr[i], vl);
+                                vy = VLSEV_FLOAT(&y[iy], stride_y, vl);
+                                vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
+                                VSSEV_FLOAT(&y[iy], stride_y, vy, vl);
+
+                                vx = VLSEV_FLOAT(&x[ix], stride_x, vl);
+                                vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl);
+
+                                ix += inc_xv;
+                                iy += inc_yv;
+                        }
+                        v_res = VFREDSUM_FLOAT(vr, v_z0, vlmax);
+
+                        y[jy] += alpha * VFMVFS_FLOAT_M1(v_res);
+                        jx    += inc_x;
+                        jy    += inc_y;
+                        a_ptr += lda;
+                }
+        }
+        return(0);
+}
+
diff --git a/kernel/riscv64/symv_L_vector.c b/kernel/riscv64/symv_L_vector.c
index 6588f4dda..cd89c63ec 100644
--- a/kernel/riscv64/symv_L_vector.c
+++ b/kernel/riscv64/symv_L_vector.c
@@ -27,37 +27,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)()
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSEV_FLOAT vse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
-#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT vfmacc_vv_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMULVV_FLOAT vfmul_vv_f32m4
+#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
+#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m4_f32m1(v_res, va, vb, gvl)
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
+#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1)
+#endif
+#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
+#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4)
+#else
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)()
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSEV_FLOAT vse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
-#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT vfmacc_vv_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMULVV_FLOAT vfmul_vv_f64m4
+#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
+#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m4_f64m1(v_res, va, vb, gvl)
+#else
+#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1)
+#endif
+#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
+#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4)
 #endif
 
 int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
@@ -99,8 +105,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
 
                                         i += gvl;
                                 }
-                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                temp2 = VFMVFS_FLOAT(v_res);
+                                v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                                temp2 = EXTRACT_FLOAT(v_res);
                                 if(i < m){
                                         gvl = VSETVL(m-i);
                                         vy = VLEV_FLOAT(&y[i], gvl);
@@ -110,8 +116,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
 
                                         vx = VLEV_FLOAT(&x[i], gvl);
                                         vr = VFMULVV_FLOAT(vx, va, gvl);
-                                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                        temp2 += VFMVFS_FLOAT(v_res);
+                                        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                                        temp2 += EXTRACT_FLOAT(v_res);
                                 }
 			}
                         y[j] += alpha * temp2;
@@ -144,8 +150,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                         i += gvl;
                                         iy += inc_yv;
                                 }
-                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                temp2 = VFMVFS_FLOAT(v_res);
+                                v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                                temp2 = EXTRACT_FLOAT(v_res);
                                 if(i < m){
                                         gvl = VSETVL(m-i);
                                         vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
@@ -155,8 +161,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
 
                                         vx = VLEV_FLOAT(&x[i], gvl);
                                         vr = VFMULVV_FLOAT(vx, va, gvl);
-                                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                        temp2 += VFMVFS_FLOAT(v_res);
+                                        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                                        temp2 += EXTRACT_FLOAT(v_res);
                                 }
 			}
                         y[jy] += alpha * temp2;
@@ -190,8 +196,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                         i += gvl;
                                         ix += inc_xv;
                                 }
-                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                temp2 = VFMVFS_FLOAT(v_res);
+                                v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                                temp2 = EXTRACT_FLOAT(v_res);
                                 if(i < m){
                                         gvl = VSETVL(m-i);
                                         vy = VLEV_FLOAT(&y[i], gvl);
@@ -201,8 +207,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
 
                                         vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                                         vr = VFMULVV_FLOAT(vx, va, gvl);
-                                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                        temp2 += VFMVFS_FLOAT(v_res);
+                                        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                                        temp2 += EXTRACT_FLOAT(v_res);
                                 }
 			}
                         y[j] += alpha * temp2;
@@ -241,8 +247,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                         ix += inc_xv;
                                         iy += inc_yv;
                                 }
-                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                temp2 = VFMVFS_FLOAT(v_res);
+                                v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                                temp2 = EXTRACT_FLOAT(v_res);
                                 if(i < m){
                                         gvl = VSETVL(m-i);
                                         vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
@@ -252,8 +258,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
 
                                         vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                                         vr = VFMULVV_FLOAT(vx, va, gvl);
-                                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                        temp2 += VFMVFS_FLOAT(v_res);
+                                        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                                        temp2 += EXTRACT_FLOAT(v_res);
                                 }
 			}
                         y[jy] += alpha * temp2;
diff --git a/kernel/riscv64/symv_U_rvv.c b/kernel/riscv64/symv_U_rvv.c
new file mode 100644
index 000000000..bcd2f6981
--- /dev/null
+++ b/kernel/riscv64/symv_U_rvv.c
@@ -0,0 +1,216 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include "common.h"
+
+#if !defined(DOUBLE)  // map the generic macro names onto the e32 / f32 intrinsics
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e32m1()
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m8()
+#define FLOAT_V_T_M1            vfloat32m1_t  // LMUL=1 type, used for the reduction seed and result
+#define FLOAT_V_T               vfloat32m8_t  // LMUL=8 working vector type
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8
+#define VSEV_FLOAT              __riscv_vse32_v_f32m8
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
+#define VSSEV_FLOAT             __riscv_vsse32_v_f32m8
+#define VFMACCVV_FLOAT_TU       __riscv_vfmacc_vv_f32m8_tu  // tail-undisturbed multiply-accumulate
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m8
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m8  // not referenced in this file
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m8  // not referenced in this file
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
+#define VFMSACVF_FLOAT          __riscv_vfmsac_vf_f32m8  // not referenced in this file
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f32m8_f32m1  // unordered sum reduction, m8 -> m1
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32  // extract element 0 as a scalar
+#else  // DOUBLE defined: same macro names mapped onto the e64 / f64 intrinsics
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e64m1()
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m8()
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define FLOAT_V_T               vfloat64m8_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VSEV_FLOAT              __riscv_vse64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VSSEV_FLOAT             __riscv_vsse64_v_f64m8
+#define VFMACCVV_FLOAT_TU       __riscv_vfmacc_vv_f64m8_tu
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m8
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m8  // not referenced in this file
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m8  // not referenced in this file
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
+#define VFMSACVF_FLOAT          __riscv_vfmsac_vf_f64m8  // not referenced in this file
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f64m8_f64m1
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
+#endif
+// For each j in [m - offset, m), with A = a + j*lda and t = alpha*x[j]: y[i] += t*A[i] for 0 <= i < j; then y[j] += t*A[j] + alpha * sum(A[i]*x[i], 0 <= i < j).
+int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+        BLASLONG i, j, k;
+        BLASLONG ix,iy;
+        BLASLONG jx,jy;
+        FLOAT temp1;
+        FLOAT *a_ptr = a;
+        FLOAT_V_T_M1 v_res, v_z0;  // v_z0: all-zero LMUL=1 vector used as the seed of every reduction below
+        size_t vl_max = VSETVL_MAX_M1, vl;
+        v_z0 = VFMVVF_FLOAT_M1(0, vl_max);
+        vl_max = VSETVL_MAX;  // from here on vl_max is the maximum vl of the LMUL=8 working vectors
+
+        FLOAT_V_T va, vx, vy, vr;
+        BLASLONG stride_x, stride_y, inc_xv, inc_yv;
+        // m1 is the first j processed; x and y use element strides inc_x / inc_y; buffer is not used.
+        BLASLONG m1 = m - offset;
+        if(inc_x == 1 && inc_y == 1)  // unit stride for both x and y
+        {
+                a_ptr += m1 * lda;  // skip ahead m1*lda elements so a_ptr matches j = m1
+                for (j=m1; j<m; j++)
+                {
+                        temp1 = alpha * x[j];
+                        i = 0;
+                        vr = VFMVVF_FLOAT(0, vl_max);  // zero the product accumulator in all vl_max lanes
+                        for (k = j; k > 0; k -= vl, i += vl)
+                        {
+                                vl = VSETVL(k);
+                                vy = VLEV_FLOAT(&y[i], vl);
+                                va = VLEV_FLOAT(&a_ptr[i], vl);
+                                vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
+                                VSEV_FLOAT(&y[i], vy, vl);
+                                // Accumulate x[i]*a_ptr[i] lane-wise; the _tu form leaves lanes beyond vl unchanged.
+                                vx = VLEV_FLOAT(&x[i], vl);
+                                vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl);
+                        }
+                        v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max);  // sum all vl_max lanes of vr, seeded with 0
+
+                        y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res);
+                        a_ptr += lda;
+                }
+        }
+        else if(inc_x == 1)  // unit-stride x, strided y
+        {
+                jy = m1 * inc_y;
+                a_ptr += m1 * lda;
+                stride_y = inc_y * sizeof(FLOAT);  // byte stride for the strided loads/stores of y
+                for (j=m1; j<m; j++)
+                {
+                        temp1 = alpha * x[j];
+                        iy = 0;
+                        i = 0;
+                        vr = VFMVVF_FLOAT(0, vl_max);
+                        for (k = j; k > 0; k -= vl, i += vl)
+                        {
+                                vl = VSETVL(k);
+                                inc_yv = inc_y * vl;  // y index advance for the vl elements handled this pass
+                                vy = VLSEV_FLOAT(&y[iy], stride_y, vl);
+                                va = VLEV_FLOAT(&a_ptr[i], vl);
+                                vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
+                                VSSEV_FLOAT(&y[iy], stride_y, vy, vl);
+
+                                vx = VLEV_FLOAT(&x[i], vl);
+                                vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl);
+
+                                iy += inc_yv;
+                        }
+                        v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max);
+
+                        y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res);
+                        a_ptr += lda;
+                        jy    += inc_y;
+                }
+        }
+        else if(inc_y == 1)  // strided x, unit-stride y
+        {
+                jx = m1 * inc_x;
+                a_ptr += m1 * lda;
+                stride_x = inc_x * sizeof(FLOAT);  // byte stride for the strided loads of x
+                for (j=m1; j<m; j++)
+                {
+                        temp1 = alpha * x[jx];
+                        ix = 0;
+                        i = 0;
+                        vr = VFMVVF_FLOAT(0, vl_max);
+                        for (k = j; k > 0; k -= vl, i += vl)
+                        {
+                                vl = VSETVL(k);
+                                inc_xv = inc_x * vl;  // x index advance for the vl elements handled this pass
+
+                                vy = VLEV_FLOAT(&y[i], vl);
+                                va = VLEV_FLOAT(&a_ptr[i], vl);
+                                vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
+                                VSEV_FLOAT(&y[i], vy, vl);
+
+                                vx = VLSEV_FLOAT(&x[ix], stride_x, vl);
+                                vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl);
+
+                                ix += inc_xv;
+                        }
+                        v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max);
+
+                        y[j] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res);
+                        a_ptr += lda;
+                        jx    += inc_x;
+                }
+        }
+        else  // both x and y strided
+        {
+                jx = m1 * inc_x;
+                jy = m1 * inc_y;
+                a_ptr += m1 * lda;
+                stride_x = inc_x * sizeof(FLOAT);
+                stride_y = inc_y * sizeof(FLOAT);
+                for (j=m1; j<m; j++)
+                {
+                        temp1 = alpha * x[jx];
+
+                        ix = 0;
+                        iy = 0;
+                        i = 0;
+                        vr = VFMVVF_FLOAT(0, vl_max);
+                        for (k = j; k > 0; k -= vl, i += vl)
+                        {
+                                vl = VSETVL(k);
+                                inc_xv = inc_x * vl;
+                                inc_yv = inc_y * vl;
+                                vy = VLSEV_FLOAT(&y[iy], stride_y, vl);
+                                va = VLEV_FLOAT(&a_ptr[i], vl);  // a_ptr is always read with unit stride
+                                vy = VFMACCVF_FLOAT(vy, temp1, va, vl);
+                                VSSEV_FLOAT(&y[iy], stride_y, vy, vl);
+
+                                vx = VLSEV_FLOAT(&x[ix], stride_x, vl);
+                                vr = VFMACCVV_FLOAT_TU(vr, vx, va, vl);
+                                ix += inc_xv;
+                                iy += inc_yv;
+                        }
+                        v_res = VFREDSUM_FLOAT(vr, v_z0, vl_max);
+
+                        y[jy] += temp1 * a_ptr[j] + alpha * VFMVFS_FLOAT_M1(v_res);
+                        a_ptr += lda;
+                        jx    += inc_x;
+                        jy    += inc_y;
+                }
+        }
+        return(0);
+}
diff --git a/kernel/riscv64/symv_U_vector.c b/kernel/riscv64/symv_U_vector.c
index 31104eae6..894c6a643 100644
--- a/kernel/riscv64/symv_U_vector.c
+++ b/kernel/riscv64/symv_U_vector.c
@@ -27,39 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)()
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSEV_FLOAT vse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
-#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT vfmacc_vv_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFDOTVV_FLOAT vfdot_vv_f32m4
-#define VFMULVV_FLOAT vfmul_vv_f32m4
+#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
+#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f32m4_f32m1(v_res, va, vb, gvl)
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
+#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1)
+#endif
+#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
+#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f32m4)
+#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4)
+#else
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)()
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSEV_FLOAT vse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
-#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT vfmacc_vv_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFDOTVV_FLOAT vfdot_vv_f64m4
-#define VFMULVV_FLOAT vfmul_vv_f64m4
+#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
+#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUM_FLOAT(va, vb, gvl) vfredusum_vs_f64m4_f64m1(v_res, va, vb, gvl)
+#else
+#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1)
+#endif
+#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
+#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f64m4)
+#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4)
 #endif
 
 int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
@@ -101,8 +107,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
 
                                         i += gvl;
                                 }
-                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                temp2 = VFMVFS_FLOAT(v_res);
+                                v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                                temp2 = EXTRACT_FLOAT(v_res);
                                 if(i < j){
                                         gvl = VSETVL(j-i);
                                         vy = VLEV_FLOAT(&y[i], gvl);
@@ -112,8 +118,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
 
                                         vx = VLEV_FLOAT(&x[i], gvl);
                                         vr = VFMULVV_FLOAT(vx, va, gvl);
-                                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                        temp2 += VFMVFS_FLOAT(v_res);
+                                        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                                        temp2 += EXTRACT_FLOAT(v_res);
                                 }
                         }
                         y[j] += temp1 * a_ptr[j] + alpha * temp2;
@@ -145,8 +151,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                         i += gvl;
                                         iy += inc_yv;
                                 }
-                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                temp2 = VFMVFS_FLOAT(v_res);
+                                v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                                temp2 = EXTRACT_FLOAT(v_res);
                                 if(i < j){
                                         gvl = VSETVL(j-i);
                                         vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
@@ -156,8 +162,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
 
                                         vx = VLEV_FLOAT(&x[i], gvl);
                                         vr = VFMULVV_FLOAT(vx, va, gvl);
-                                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                        temp2 += VFMVFS_FLOAT(v_res);
+                                        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                                        temp2 += EXTRACT_FLOAT(v_res);
                                 }
                         }
                         y[jy] += temp1 * a_ptr[j] + alpha * temp2;
@@ -190,8 +196,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                         i += gvl;
                                         ix += inc_xv;
                                 }
-                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                temp2 = VFMVFS_FLOAT(v_res);
+                                v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                                temp2 = EXTRACT_FLOAT(v_res);
                                 if(i < j){
                                         gvl = VSETVL(j-i);
                                         vy = VLEV_FLOAT(&y[i], gvl);
@@ -201,8 +207,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
 
                                         vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                                         vr = VFMULVV_FLOAT(vx, va, gvl);
-                                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                        temp2 += VFMVFS_FLOAT(v_res);
+                                        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                                        temp2 += EXTRACT_FLOAT(v_res);
                                 }
                         }
                         y[j] += temp1 * a_ptr[j] + alpha * temp2;
@@ -240,8 +246,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
                                         ix += inc_xv;
                                         iy += inc_yv;
                                 }
-                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                temp2 = VFMVFS_FLOAT(v_res);
+                                v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                                temp2 = EXTRACT_FLOAT(v_res);
                                 if(i < j){
                                         gvl = VSETVL(j-i);
                                         vy = VLSEV_FLOAT(&y[iy], stride_y, gvl);
@@ -251,8 +257,8 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOA
 
                                         vx = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                                         vr = VFMULVV_FLOAT(vx, va, gvl);
-                                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                        temp2 += VFMVFS_FLOAT(v_res);
+                                        v_res = VFREDSUM_FLOAT(vr, v_z0, gvl);
+                                        temp2 += EXTRACT_FLOAT(v_res);
                                 }
                         }
                         y[jy] += temp1 * a_ptr[j] + alpha * temp2;
diff --git a/kernel/riscv64/trmm_lncopy_rvv_v1.c b/kernel/riscv64/trmm_lncopy_rvv_v1.c
new file mode 100644
index 000000000..4135a9b62
--- /dev/null
+++ b/kernel/riscv64/trmm_lncopy_rvv_v1.c
@@ -0,0 +1,138 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE) // DOUBLE undefined: 32-bit float intrinsics at LMUL=2 (e32m2 / f32m2), b16 masks
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m2
+#define VBOOL_T                 vbool16_t
+#define UINT_V_T                vuint32m2_t
+#define VID_V_UINT              __riscv_vid_v_u32m2
+#define VMSGTU_VX_UINT          __riscv_vmsgtu_vx_u32m2_b16
+#define VMSEQ_VX_UINT           __riscv_vmseq_vx_u32m2_b16
+#define VFMERGE_VFM_FLOAT       __riscv_vfmerge_vfm_f32m2
+#else // DOUBLE defined: 64-bit float intrinsics at LMUL=2 (e64m2 / f64m2), b32 masks
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
+#define VBOOL_T                 vbool32_t
+#define UINT_V_T                vuint64m2_t
+#define VID_V_UINT              __riscv_vid_v_u64m2
+#define VMSGTU_VX_UINT          __riscv_vmsgtu_vx_u64m2_b32
+#define VMSEQ_VX_UINT           __riscv_vmseq_vx_u64m2_b32
+#define VFMERGE_VFM_FLOAT       __riscv_vfmerge_vfm_f64m2
+#endif // !defined(DOUBLE)
+
+// Optimizes the implementation in ../arm64/trmm_lncopy_sve_v1.c
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+    // Packs data from a (leading dimension lda) into b as consecutive vl-element vectors; always returns 0.
+    BLASLONG i, js, X;
+    // NOTE(review): per the file name this is presumably the lower / non-transposed TRMM copy - confirm.
+    FLOAT *ao;
+
+    BLASLONG stride_lda = sizeof(FLOAT)*lda; // byte stride handed to the strided loads
+    
+    FLOAT_V_T vb, va1;
+
+    size_t vl; // lane count returned by VSETVL for the current pass
+#ifdef UNIT
+    VBOOL_T vbool_eq;
+#endif
+
+    VBOOL_T vbool_cmp;
+    UINT_V_T vindex;
+
+    for (js = n; js > 0; js -= vl) // consume n in chunks of vl
+    {
+        vl = VSETVL(js);
+        X = posX;
+        // Choose the start address in a depending on how posX compares to posY.
+        if (posX <= posY) 
+        {
+            ao = a + posY + posX * lda;
+        } 
+        else 
+        {
+            ao = a + posX + posY * lda;
+        }
+
+        i = 0;
+        do // do/while: the body runs at least once, even when m <= 0
+        {
+            if (X > posY) 
+            {
+                va1 = VLSEV_FLOAT(ao, stride_lda, vl); // gather vl elements spaced lda apart
+                VSEV_FLOAT(b, va1, vl); // store them contiguously into b
+
+                ao ++;
+                b += vl;
+                X ++;
+                i ++;
+            } 
+            else if (X < posY) 
+            {
+                ao += lda; // nothing is stored here, but b still advances by vl
+                b += vl;
+                X ++;
+                i ++;
+            } 
+            else 
+            {
+                vindex  = VID_V_UINT(vl); // lane indices 0 .. vl-1
+                for (unsigned int j = 0; j < vl; j++) 
+                {
+                    va1 = VLSEV_FLOAT(ao, stride_lda, vl);
+                    vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); // lanes whose index > j
+                    vb = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); // those lanes become ZERO
+#ifdef UNIT
+                    vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); // lane whose index == j
+                    vb =  VFMERGE_VFM_FLOAT(vb, ONE, vbool_eq, vl); // that lane becomes ONE
+#endif
+                    VSEV_FLOAT(b, vb, vl);
+                    ao++;
+                    b += vl;
+                }
+                // The loop above emitted vl vectors, so X and i advance by vl.
+                X += vl;
+                i += vl;
+            }
+        } while (i < m);
+
+        posY += vl; // next pass starts vl further along posY
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/trmm_ltcopy_rvv_v1.c b/kernel/riscv64/trmm_ltcopy_rvv_v1.c
new file mode 100644
index 000000000..580714fde
--- /dev/null
+++ b/kernel/riscv64/trmm_ltcopy_rvv_v1.c
@@ -0,0 +1,134 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE) // DOUBLE undefined: 32-bit float intrinsics at LMUL=2 (e32m2 / f32m2), b16 masks
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VBOOL_T                 vbool16_t
+#define UINT_V_T                vuint32m2_t
+#define VID_V_UINT              __riscv_vid_v_u32m2
+#define VMSLTU_VX_UINT          __riscv_vmsltu_vx_u32m2_b16
+#define VMSEQ_VX_UINT           __riscv_vmseq_vx_u32m2_b16
+#define VFMERGE_VFM_FLOAT       __riscv_vfmerge_vfm_f32m2
+#else // DOUBLE defined: 64-bit float intrinsics at LMUL=2 (e64m2 / f64m2), b32 masks
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VBOOL_T                 vbool32_t
+#define UINT_V_T                vuint64m2_t
+#define VID_V_UINT              __riscv_vid_v_u64m2
+#define VMSLTU_VX_UINT          __riscv_vmsltu_vx_u64m2_b32
+#define VMSEQ_VX_UINT           __riscv_vmseq_vx_u64m2_b32
+#define VFMERGE_VFM_FLOAT       __riscv_vfmerge_vfm_f64m2
+#endif // !defined(DOUBLE)
+
+// Optimizes the implementation in ../arm64/trmm_ltcopy_sve_v1.c
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+    // Packs data from a (leading dimension lda) into b as consecutive vl-element vectors; always returns 0.
+    BLASLONG i, js, X;
+    // NOTE(review): per the file name this is presumably the lower / transposed TRMM copy - confirm.
+    FLOAT *ao;
+    
+    FLOAT_V_T vb, va1;
+    size_t vl; // lane count returned by VSETVL for the current pass
+#ifdef UNIT
+    VBOOL_T vbool_eq;
+#endif
+
+    VBOOL_T vbool_cmp;
+    UINT_V_T vindex;
+
+    for (js = n; js > 0; js -= vl) // consume n in chunks of vl
+    {
+        vl = VSETVL(js);
+        X = posX;
+        // Choose the start address in a depending on how posX compares to posY.
+        if (posX <= posY) 
+        {
+            ao = a + posY + posX * lda;
+        } 
+        else 
+        {
+            ao = a + posX + posY * lda;
+        }
+
+        i = 0;
+        do // do/while: the body runs at least once, even when m <= 0
+        {
+            if (X > posY) 
+            {
+                ao ++; // nothing is stored here, but b still advances by vl
+                b += vl;
+                X ++;
+                i ++;
+            } 
+            else if (X < posY) 
+            {
+                va1 = VLEV_FLOAT(ao, vl); // unit-stride load of vl contiguous elements
+                VSEV_FLOAT(b, va1, vl); // copied unchanged into b
+
+                ao += lda;
+                b += vl;
+                X ++;
+                i ++;
+            }
+            else
+            {
+                vindex  = VID_V_UINT(vl); // lane indices 0 .. vl-1
+                for (unsigned int j = 0; j < vl; j++) 
+                {
+                    va1 = VLEV_FLOAT(ao, vl);
+                    vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); // lanes whose index < j
+                    vb = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); // those lanes become ZERO
+#ifdef UNIT
+                    vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); // lane whose index == j
+                    vb =  VFMERGE_VFM_FLOAT(vb, ONE, vbool_eq, vl); // that lane becomes ONE
+#endif
+                    VSEV_FLOAT(b, vb, vl);
+                    ao += lda;
+                    b += vl;
+                }
+                X += vl;
+                i += vl;
+                // The loop above emitted vl vectors, hence the vl-sized step of X and i.
+            }
+        } while (i < m);
+
+        posY += vl; // next pass starts vl further along posY
+    }
+
+    return 0;
+}
+
diff --git a/kernel/riscv64/trmm_uncopy_rvv_v1.c b/kernel/riscv64/trmm_uncopy_rvv_v1.c
new file mode 100644
index 000000000..852ab7f11
--- /dev/null
+++ b/kernel/riscv64/trmm_uncopy_rvv_v1.c
@@ -0,0 +1,136 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE) // DOUBLE undefined: 32-bit float intrinsics at LMUL=2 (e32m2 / f32m2), b16 masks
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VBOOL_T                 vbool16_t
+#define UINT_V_T                vuint32m2_t
+#define VID_V_UINT              __riscv_vid_v_u32m2
+#define VMSLTU_VX_UINT          __riscv_vmsltu_vx_u32m2_b16
+#define VMSEQ_VX_UINT           __riscv_vmseq_vx_u32m2_b16
+#define VFMERGE_VFM_FLOAT       __riscv_vfmerge_vfm_f32m2
+#else // DOUBLE defined: 64-bit float intrinsics at LMUL=2 (e64m2 / f64m2), b32 masks
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VBOOL_T                 vbool32_t
+#define UINT_V_T                vuint64m2_t
+#define VID_V_UINT              __riscv_vid_v_u64m2
+#define VMSLTU_VX_UINT          __riscv_vmsltu_vx_u64m2_b32
+#define VMSEQ_VX_UINT           __riscv_vmseq_vx_u64m2_b32
+#define VFMERGE_VFM_FLOAT       __riscv_vfmerge_vfm_f64m2
+#endif // !defined(DOUBLE)
+
+// Optimizes the implementation in ../arm64/trmm_uncopy_sve_v1.c
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+    // Packs data from a (leading dimension lda) into b as consecutive vl-element vectors; always returns 0.
+    BLASLONG i, js, X;
+    BLASLONG stride_lda = sizeof(FLOAT) * lda; // byte stride handed to the strided loads
+    FLOAT *ao;
+    // NOTE(review): per the file name this is presumably the upper / non-transposed TRMM copy - confirm.
+    FLOAT_V_T vb, va1;
+    size_t vl; // lane count returned by VSETVL for the current pass
+
+#ifdef UNIT
+    VBOOL_T vbool_eq;
+#endif
+
+    VBOOL_T vbool_cmp;
+    UINT_V_T vindex;
+
+    for (js = n; js > 0; js -= vl) // consume n in chunks of vl
+    {
+        vl = VSETVL(js);
+        X = posX;
+        // Choose the start address in a depending on how posX compares to posY.
+        if (posX <= posY) 
+        {
+            ao = a + posX + posY * lda;
+        } 
+        else 
+        {
+            ao = a + posY + posX * lda;
+        }
+
+        i = 0;
+        do // do/while: the body runs at least once, even when m <= 0
+        {
+            if (X < posY) 
+            {
+                va1 = VLSEV_FLOAT(ao, stride_lda, vl); // gather vl elements spaced lda apart
+                VSEV_FLOAT(b, va1, vl); // store them contiguously into b
+
+                ao ++;
+                b += vl;
+                X ++;
+                i ++;
+            } 
+            else if (X > posY) 
+            {
+                ao += lda; // nothing is stored here, but b still advances by vl
+                b += vl;
+                X ++;
+                i ++;
+            } 
+            else 
+            {
+                vindex  = VID_V_UINT(vl); // lane indices 0 .. vl-1
+                for (unsigned int j = 0; j < vl; j++) 
+                {
+                    va1 = VLSEV_FLOAT(ao, stride_lda, vl);
+                    vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl); // lanes whose index < j
+                    vb = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); // those lanes become ZERO
+#ifdef UNIT
+                    vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); // lane whose index == j
+                    vb =  VFMERGE_VFM_FLOAT(vb, ONE, vbool_eq, vl); // that lane becomes ONE
+#endif
+                    VSEV_FLOAT(b, vb, vl);
+                    ao++;
+                    b += vl;
+                }
+                // The loop above emitted vl vectors, so X and i advance by vl.
+                X += vl;
+                i += vl;
+            }
+        }while (i < m);
+
+        posY += vl; // next pass starts vl further along posY
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/trmm_utcopy_rvv_v1.c b/kernel/riscv64/trmm_utcopy_rvv_v1.c
new file mode 100644
index 000000000..e0b6d362d
--- /dev/null
+++ b/kernel/riscv64/trmm_utcopy_rvv_v1.c
@@ -0,0 +1,133 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE) // DOUBLE undefined: 32-bit float intrinsics at LMUL=2 (e32m2 / f32m2), b16 masks
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VBOOL_T                 vbool16_t
+#define UINT_V_T                vuint32m2_t
+#define VID_V_UINT              __riscv_vid_v_u32m2
+#define VMSGTU_VX_UINT          __riscv_vmsgtu_vx_u32m2_b16
+#define VMSEQ_VX_UINT           __riscv_vmseq_vx_u32m2_b16
+#define VFMERGE_VFM_FLOAT       __riscv_vfmerge_vfm_f32m2
+#else // DOUBLE defined: 64-bit float intrinsics at LMUL=2 (e64m2 / f64m2), b32 masks
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VBOOL_T                 vbool32_t
+#define UINT_V_T                vuint64m2_t
+#define VID_V_UINT              __riscv_vid_v_u64m2
+#define VMSGTU_VX_UINT          __riscv_vmsgtu_vx_u64m2_b32
+#define VMSEQ_VX_UINT           __riscv_vmseq_vx_u64m2_b32
+#define VFMERGE_VFM_FLOAT       __riscv_vfmerge_vfm_f64m2
+#endif // !defined(DOUBLE)
+
+// Optimizes the implementation in ../arm64/trmm_utcopy_sve_v1.c
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+    // Packs data from a (leading dimension lda) into b as consecutive vl-element vectors; always returns 0.
+    BLASLONG i, j, js, X;
+    // NOTE(review): per the file name this is presumably the upper / transposed TRMM copy - confirm.
+    FLOAT *ao;
+    FLOAT_V_T vb, va1;
+#ifdef UNIT
+    VBOOL_T vbool_eq;
+#endif
+
+    VBOOL_T vbool_cmp;
+    UINT_V_T vindex;
+
+    size_t vl; // lane count returned by VSETVL for the current pass
+
+    for (js = n; js > 0; js -= vl) // consume n in chunks of vl
+    {
+        vl = VSETVL(js);
+
+        X = posX;
+        // Choose the start address in a depending on how posX compares to posY.
+        if (posX <= posY) 
+        {
+            ao = a + posX + posY * lda;
+        } 
+        else 
+        {
+            ao = a + posY + posX * lda;
+        }
+
+        i = 0;
+        do // do/while: the body runs at least once, even when m <= 0
+        {
+            if (X < posY) 
+            {
+                ao ++; // nothing is stored here, but b still advances by vl
+                b += vl;
+                X ++;
+                i++;
+            }
+            else if (X > posY)
+            {
+                va1 = VLEV_FLOAT(ao, vl); // unit-stride load of vl contiguous elements
+                VSEV_FLOAT(b, va1, vl); // copied unchanged into b
+                ao += lda;
+                b += vl;
+                X++;
+                i++;
+            }
+            else
+            {
+                vindex  = VID_V_UINT(vl); // lane indices 0 .. vl-1
+                for (j = 0; j < vl; j++) 
+                {
+                    va1 = VLEV_FLOAT(ao, vl);
+                    vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl); // lanes whose index > j
+                    vb = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl); // those lanes become ZERO
+#ifdef UNIT
+                    vbool_eq = VMSEQ_VX_UINT(vindex, j, vl); // lane whose index == j
+                    vb =  VFMERGE_VFM_FLOAT(vb, ONE, vbool_eq, vl); // that lane becomes ONE
+#endif
+                    VSEV_FLOAT(b, vb, vl);
+                    ao += lda;
+                    b += vl;
+                }
+                X += vl; // the loop above emitted vl vectors
+                i += vl;
+            }
+        }while (i < m);
+        posY += vl; // next pass starts vl further along posY
+    }
+    return 0;
+}
+
diff --git a/kernel/riscv64/trmmkernel_rvv_v1x8.c b/kernel/riscv64/trmmkernel_rvv_v1x8.c
new file mode 100644
index 000000000..393b24bce
--- /dev/null
+++ b/kernel/riscv64/trmmkernel_rvv_v1x8.c
@@ -0,0 +1,685 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE) // single precision: e32 elements, m2 register groups
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m2
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m2
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m2
+#else // double precision: e64 elements, m2 register groups
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m2
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m2
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m2
+#endif
+
+
+// Optimizes the implementation in ../generic/trmmkernel_8x8.c
+
+
+int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
+{
+    //fprintf(stderr, "%s, %s, bm=%4ld bn=%4ld bk=%4ld alpha=%f ldc=%ld\n", __FILE__, __FUNCTION__, bm, bn, bk, alpha, ldc);
+    // Computes alpha * (packed panel ba) * (packed panel bb) for a bm x bn tile and stores it into C (leading dimension ldc); always returns 0.
+    BLASLONG i,j,k;
+    FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7,*ptrba,*ptrbb;
+    // va*: vectors of vl elements loaded from ba; vres*: one accumulator per column of the current column group.
+    FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7;
+    FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7;
+    size_t vl;
+    // off: offset into the panels, counted in k-steps; temp: k-step count for the current strip (later reused as a skip count).
+    BLASLONG off, temp;
+
+#if !defined(LEFT)
+    off = -offset;
+#else
+    off = 0;
+#endif
+    for (j = bn/8; j > 0; j--) // full groups of 8 columns of C
+    {
+        C0 = C;
+        C1 = C0+ldc;
+        C2 = C1+ldc;
+        C3 = C2+ldc;
+        C4 = C3+ldc;
+        C5 = C4+ldc;
+        C6 = C5+ldc;
+        C7 = C6+ldc;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        off = offset;
+#endif
+
+        ptrba = ba;
+
+        for (i = bm; i > 0; i -= vl) // row strips of vl elements
+        {
+            vl = VSETVL(i);
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*vl; // skip the first off k-steps of both panels
+            ptrbb = bb + off*8;
+#endif
+
+            vres0 = VFMVVF_FLOAT(0.0, vl);
+            vres1 = VFMVVF_FLOAT(0.0, vl);
+            vres2 = VFMVVF_FLOAT(0.0, vl);
+            vres3 = VFMVVF_FLOAT(0.0, vl);
+            vres4 = VFMVVF_FLOAT(0.0, vl);
+            vres5 = VFMVVF_FLOAT(0.0, vl);
+            vres6 = VFMVVF_FLOAT(0.0, vl);
+            vres7 = VFMVVF_FLOAT(0.0, vl);
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+vl;  // number of values in A
+#else
+            temp = off+8;   // number of values in B
+#endif
+            // Main k loop, unrolled 8x: each vector from ba is loaded one step before it is consumed.
+            for (k = temp/8; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+                va1 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl);
+                ptrbb += 8;
+                va2 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va1, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va1, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va1, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va1, vl);
+                ptrbb += 8;
+                va3 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va2, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va2, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va2, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va2, vl);
+                ptrbb += 8;
+                va4 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va3, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va3, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va3, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va3, vl);
+                ptrbb += 8;
+                va5 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va4, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va4, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va4, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va4, vl);
+                ptrbb += 8;
+                va6 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va5, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va5, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va5, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va5, vl);
+                ptrbb += 8;
+                va7 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va6, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va6, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va6, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va6, vl);
+                ptrbb += 8;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va7, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va7, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va7, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va7, vl);
+                ptrbb += 8;
+            }
+            // K remainder: the temp%8 steps left over by the unrolled loop
+            for (k = temp&7; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl); // M:8 (should be vlen);
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
+                vres4 = VFMACCVF_FLOAT(vres4, *(ptrbb + 4), va0, vl);
+                vres5 = VFMACCVF_FLOAT(vres5, *(ptrbb + 5), va0, vl);
+                vres6 = VFMACCVF_FLOAT(vres6, *(ptrbb + 6), va0, vl);
+                vres7 = VFMACCVF_FLOAT(vres7, *(ptrbb + 7), va0, vl);
+
+                ptrbb += 8;
+                ptrba += vl;
+            }
+            // Scale by alpha and store: C is overwritten, its previous contents are never loaded.
+            va0 = VFMULVF_FLOAT(vres0, alpha, vl);
+            VSEV_FLOAT(C0, va0, vl);
+
+            va1 = VFMULVF_FLOAT(vres1, alpha, vl);
+            VSEV_FLOAT(C1, va1, vl);
+
+            va2 = VFMULVF_FLOAT(vres2, alpha, vl);
+            VSEV_FLOAT(C2, va2, vl);
+
+            va3 = VFMULVF_FLOAT(vres3, alpha, vl);
+            VSEV_FLOAT(C3, va3, vl);
+
+            va4 = VFMULVF_FLOAT(vres4, alpha, vl);
+            VSEV_FLOAT(C4, va4, vl);
+
+            va5 = VFMULVF_FLOAT(vres5, alpha, vl);
+            VSEV_FLOAT(C5, va5, vl);
+
+            va6 = VFMULVF_FLOAT(vres6, alpha, vl);
+            VSEV_FLOAT(C6, va6, vl);
+
+            va7 = VFMULVF_FLOAT(vres7, alpha, vl);
+            VSEV_FLOAT(C7, va7, vl);
+
+            // In these variants the loops above stop before bk steps, so both pointers are advanced past the rest.
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= vl; // number of values in A
+#else
+            temp -= 8; // number of values in B
+#endif
+            ptrba += temp*vl;
+            ptrbb += temp*8;
+#endif
+
+#ifdef LEFT
+            off += vl; // number of values in A
+#endif
+
+            C0 += vl;
+            C1 += vl;
+            C2 += vl;
+            C3 += vl;
+            C4 += vl;
+            C5 += vl;
+            C6 += vl;
+            C7 += vl;
+        }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 8;
+#endif
+
+        bb += (bk<<3); // next group: 8 values per k-step times bk steps
+        C += (ldc<<3);
+    }
+
+    if (bn & 4) // remaining group of 4 columns; same structure as the 8-column loop
+    {
+        C0 = C;
+        C1 = C0+ldc;
+        C2 = C1+ldc;
+        C3 = C2+ldc;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        off = offset;
+#endif
+        ptrba = ba;
+
+        for (i = bm; i > 0; i -= vl)
+        {
+            vl = VSETVL(i);
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*vl;
+            ptrbb = bb + off*4;
+#endif
+
+            vres0 = VFMVVF_FLOAT(0.0, vl);
+            vres1 = VFMVVF_FLOAT(0.0, vl);
+            vres2 = VFMVVF_FLOAT(0.0, vl);
+            vres3 = VFMVVF_FLOAT(0.0, vl);
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+vl;  // number of values in A
+#else
+            temp = off+4;   // number of values in B
+#endif
+
+            for (k = temp/8; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+                va1 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
+                ptrbb += 4;
+                va2 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va1, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va1, vl);
+                ptrbb += 4;
+                va3 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va2, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va2, vl);
+                ptrbb += 4;
+                va4 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va3, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va3, vl);
+                ptrbb += 4;
+                va5 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va4, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va4, vl);
+                ptrbb += 4;
+                va6 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va5, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va5, vl);
+                ptrbb += 4;
+                va7 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va6, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va6, vl);
+                ptrbb += 4;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va7, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va7, vl);
+                ptrbb += 4;
+            }
+
+            // K remainder
+            for (k = temp&7; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl);
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
+                vres2 = VFMACCVF_FLOAT(vres2, *(ptrbb + 2), va0, vl);
+                vres3 = VFMACCVF_FLOAT(vres3, *(ptrbb + 3), va0, vl);
+
+                ptrbb += 4;
+                ptrba += vl;
+            }
+
+            va0 = VFMULVF_FLOAT(vres0, alpha, vl);
+            VSEV_FLOAT(C0, va0, vl);
+
+            va1 = VFMULVF_FLOAT(vres1, alpha, vl);
+            VSEV_FLOAT(C1, va1, vl);
+
+            va2 = VFMULVF_FLOAT(vres2, alpha, vl);
+            VSEV_FLOAT(C2, va2, vl);
+
+            va3 = VFMULVF_FLOAT(vres3, alpha, vl);
+            VSEV_FLOAT(C3, va3, vl);
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= vl; // number of values in A
+#else
+            temp -= 4; // number of values in B
+#endif
+            ptrba += temp*vl;
+            ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+            off += vl; // number of values in A
+#endif
+
+            C0 += vl;
+            C1 += vl;
+            C2 += vl;
+            C3 += vl;
+        }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 4;
+#endif
+
+        bb += (bk<<2);
+        C += (ldc<<2);
+    }
+
+    if (bn & 2) // remaining group of 2 columns
+    {
+        C0 = C;
+        C1 = C0+ldc;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        off = offset;
+#endif
+
+        ptrba = ba;
+
+        for (i = bm; i > 0; i -= vl) 
+        {
+            vl = VSETVL(i);
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*vl;
+            ptrbb = bb + off*2;
+#endif
+
+            vres0 = VFMVVF_FLOAT(0.0, vl);
+            vres1 = VFMVVF_FLOAT(0.0, vl);
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+vl;  // number of values in A
+#else
+            temp = off+2;   // number of values in B
+#endif
+
+            for (k = temp/8; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+                va1 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
+                ptrbb += 2;
+                va2 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va1, vl);
+                ptrbb += 2;
+                va3 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va2, vl);
+                ptrbb += 2;
+                va4 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va3, vl);
+                ptrbb += 2;
+                va5 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va4, vl);
+                ptrbb += 2;
+                va6 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va5, vl);
+                ptrbb += 2;
+                va7 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va6, vl);
+                ptrbb += 2;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va7, vl);
+                ptrbb += 2;
+            }
+
+            // K remainder
+            for (k = temp&7; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl);
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+                vres1 = VFMACCVF_FLOAT(vres1, *(ptrbb + 1), va0, vl);
+
+                ptrbb += 2;
+                ptrba += vl;
+            }
+            va0 = VFMULVF_FLOAT(vres0, alpha, vl);
+            VSEV_FLOAT(C0, va0, vl);
+
+            va1 = VFMULVF_FLOAT(vres1, alpha, vl);
+            VSEV_FLOAT(C1, va1, vl);
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= vl; // number of values in A
+#else
+            temp -= 2; // number of values in B
+#endif
+            ptrba += temp*vl;
+            ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+            off += vl; // number of values in A
+#endif
+
+            C0 += vl;
+            C1 += vl;
+        }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 2;
+#endif
+
+        bb += (bk<<1);
+        C += (ldc<<1);
+    }
+
+    if (bn & 1) // last single column
+    {
+        C0 = C;
+
+#if defined(TRMMKERNEL) &&  defined(LEFT)
+        off = offset;
+#endif
+
+        ptrba = ba;
+
+        for (i = bm; i > 0; i -= vl)
+        {
+            vl = VSETVL(i);
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*vl;
+            ptrbb = bb + off*1;
+#endif
+
+            vres0 = VFMVVF_FLOAT(0.0, vl);
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+vl;  // number of values in A
+#else
+            temp = off+1;   // number of values in B
+#endif
+
+            for (k = temp/8; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+                va1 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+                ptrbb += 1;
+                va2 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va1, vl);
+                ptrbb += 1;
+                va3 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va2, vl);
+                ptrbb += 1;
+                va4 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va3, vl);
+                ptrbb += 1;
+                va5 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va4, vl);
+                ptrbb += 1;
+                va6 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va5, vl);
+                ptrbb += 1;
+                va7 = VLEV_FLOAT(ptrba, vl);
+                ptrba += vl;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va6, vl);
+                ptrbb += 1;
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va7, vl);
+                ptrbb += 1;
+            }
+
+            // K remainder
+            for (k = temp&7; k > 0; k--) {
+                va0 = VLEV_FLOAT(ptrba, vl);
+
+                vres0 = VFMACCVF_FLOAT(vres0, *(ptrbb + 0), va0, vl);
+
+                ptrbb += 1;
+                ptrba += vl;
+            }
+            va0 = VFMULVF_FLOAT(vres0, alpha, vl);
+            VSEV_FLOAT(C0, va0, vl);
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= vl; // number of values in A
+#else
+            temp -= 1; // number of values in B
+#endif
+            ptrba += temp*vl;
+            ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+            off += vl; // number of values in A
+#endif
+
+            C0 += vl;
+        }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 1;
+#endif
+
+        bb += (bk);
+        C += (ldc);
+    }
+    return 0;
+}
+
diff --git a/kernel/riscv64/trsm_kernel_LN_rvv_v1.c b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c
new file mode 100644
index 000000000..869561fb3
--- /dev/null
+++ b/kernel/riscv64/trsm_kernel_LN_rvv_v1.c
@@ -0,0 +1,364 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE) // single precision: e32 elements, m2 register groups
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m2()
+#define FLOAT_V_T               vfloat32m2_t
+#define FLOAT_VX2_T             vfloat32m2x2_t
+#define VGET_VX2                __riscv_vget_v_f32m2x2_f32m2
+#define VSET_VX2                __riscv_vset_v_f32m2_f32m2x2
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m2
+#define VSSEV_FLOAT             __riscv_vsse32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2x2
+#define VSSSEG2_FLOAT           __riscv_vssseg2e32_v_f32m2x2
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m2
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m2
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m2
+#else // double precision: e64 elements, m2 register groups
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m2()
+#define FLOAT_V_T               vfloat64m2_t
+#define FLOAT_VX2_T             vfloat64m2x2_t
+#define VGET_VX2                __riscv_vget_v_f64m2x2_f64m2
+#define VSET_VX2                __riscv_vset_v_f64m2_f64m2x2
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
+#define VSSEV_FLOAT             __riscv_vsse64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2x2
+#define VSSSEG2_FLOAT           __riscv_vssseg2e64_v_f64m2x2
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m2 // NOTE(review): only this branch defines VFMVVF_FLOAT; the single-precision branch has no counterpart
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m2
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m2
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m2
+#endif
+
+
+static FLOAT dm1 = -1.; // passed as the scalar factor to GEMM_KERNEL
+
+#ifdef CONJ
+#define GEMM_KERNEL   GEMM_KERNEL_L
+#else
+#define GEMM_KERNEL   GEMM_KERNEL_N
+#endif
+// GEMM_UNROLL_N_SHIFT is log2(GEMM_DEFAULT_UNROLL_N); it stays undefined for any value other than 1, 2, 4, 8 or 16.
+#if GEMM_DEFAULT_UNROLL_N == 1
+#define GEMM_UNROLL_N_SHIFT 0
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 2
+#define GEMM_UNROLL_N_SHIFT 1
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 4
+#define GEMM_UNROLL_N_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 8
+#define GEMM_UNROLL_N_SHIFT 3
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 16
+#define GEMM_UNROLL_N_SHIFT 4
+#endif
+
+// Optimizes the implementation in ../arm64/trsm_kernel_LN_sve.c
+
+#ifndef COMPLEX
+
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+    FLOAT aa;
+    FLOAT* pc;
+    // Real-valued solve: walks rows i = m-1 .. 0, scaling row i of c and eliminating it from rows 0 .. i-1.
+    int i, j, k;
+    // Byte stride between consecutive columns of c, used to gather/scatter one row across columns.
+    BLASLONG stride_ldc = sizeof(FLOAT) * ldc;
+
+    FLOAT_V_T vb, vc;
+
+    size_t vl;
+    // Start at the last m-element slice of a and the last n-element row of b.
+    a += (m - 1) * m;
+    b += (m - 1) * n;
+
+    for (i = m - 1; i >= 0; i--) {
+        // presumably a[i] holds the already-inverted diagonal entry, since it is multiplied rather than divided - TODO confirm
+        aa = *(a + i);
+        pc  = c;
+        for (j = n; j > 0; j -= vl) {
+            vl = VSETVL(j);
+            vb = VLSEV_FLOAT(pc + i, stride_ldc, vl); // row i of c across vl columns
+            vb = VFMULVF_FLOAT(vb, aa, vl);
+            VSEV_FLOAT(b, vb, vl); // the scaled row goes contiguously into b ...
+            VSSEV_FLOAT(pc + i, stride_ldc, vb, vl); // ... and back into c
+            b   += vl;
+            // Rows above i: c[k] -= a[k] * (scaled row i)
+            for (k = 0; k < i; k ++) {
+                vc = VLSEV_FLOAT(pc + k, stride_ldc, vl);
+                vc = VFNMSACVF_FLOAT(vc, *(a + k), vb, vl);
+                VSSEV_FLOAT(pc + k, stride_ldc, vc, vl);
+            }
+            pc  += vl * ldc;
+        }
+        a -= m;
+        b -= 2 * n; // b advanced by n in the loop above; step back to the start of the previous row
+    }
+
+}
+#else
+
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+    // Complex-valued solve: same row walk as the real variant, on interleaved (real, imag) pairs.
+    FLOAT aa1, aa2;
+    FLOAT *pc;
+    int i, j, k;
+    // Byte stride between consecutive columns of c; the factor 2 accounts for the two FLOATs per complex element.
+    BLASLONG stride_ldc = sizeof(FLOAT) * ldc * 2;
+
+    FLOAT_VX2_T vbx2, vsx2, vcx2;
+    FLOAT_V_T vb1, vb2, vc1, vc2, vs1, vs2;
+    size_t vl;
+    a += (m - 1) * m * 2;
+    b += (m - 1) * n * 2;
+
+    for (i = m - 1; i >= 0; i--) {
+        // presumably (aa1, aa2) is the already-inverted diagonal entry, since it is multiplied rather than divided - TODO confirm
+        aa1 = *(a + i * 2 + 0);
+        aa2 = *(a + i * 2 + 1);
+        pc  = c;
+
+        for (j = n; j > 0; j -= vl) {
+            vl = VSETVL(j);
+            vbx2 = VLSSEG2_FLOAT(pc + i * 2, stride_ldc, vl); // row i of c: field 0 = real parts, field 1 = imaginary parts
+            vb1 = VGET_VX2(vbx2, 0);
+            vb2 = VGET_VX2(vbx2, 1);
+#ifndef CONJ
+            vs1 =   VFMULVF_FLOAT(vb1, aa1, vl); // vs = vb * (aa1 + i*aa2)
+            vs1 = VFNMSACVF_FLOAT(vs1, aa2, vb2, vl);
+            vs2 =   VFMULVF_FLOAT(vb2, aa1, vl);
+            vs2 =  VFMACCVF_FLOAT(vs2, aa2, vb1, vl);
+#else
+            vs1 =   VFMULVF_FLOAT(vb1, aa1, vl); // vs = vb * (aa1 - i*aa2)
+            vs1 =  VFMACCVF_FLOAT(vs1, aa2, vb2, vl);
+            vs2 =   VFMULVF_FLOAT(vb2, aa1, vl);
+            vs2 = VFNMSACVF_FLOAT(vs2, aa2, vb1, vl);
+#endif
+            vsx2 = VSET_VX2(vsx2, 0, vs1); // NOTE(review): vsx2 is read here before any assignment; both fields are then overwritten
+            vsx2 = VSET_VX2(vsx2, 1, vs2);
+            VSSEG2_FLOAT(b, vsx2, vl); // the scaled row goes contiguously (interleaved) into b ...
+            VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vsx2, vl); // ... and back into c
+            b   += vl * 2;
+            // Rows above i: c[k] -= a[k] * vs (conj(a[k]) when CONJ is defined)
+            for (k = 0; k < i; k ++) {
+                vcx2 = VLSSEG2_FLOAT(pc + k * 2, stride_ldc, vl);
+                vc1 = VGET_VX2(vcx2, 0);
+                vc2 = VGET_VX2(vcx2, 1);
+#ifndef CONJ
+                vc1 =  VFMACCVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl);
+#else // CONJ: the sign of the imaginary part of a[k] is flipped
+                vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl);
+                vc2 =  VFMACCVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl);
+#endif
+                vcx2 = VSET_VX2(vcx2, 0, vc1);
+                vcx2 = VSET_VX2(vcx2, 1, vc2);
+                VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vcx2, vl);
+            }
+            pc  += vl * ldc * 2;
+        }
+        a -= m * 2;
+        b -= 4 * n; // b advanced by 2*n FLOATs in the loop above; step back to the start of the previous row
+    }
+}
+
+
+#endif
+
+
+/*
+ * Blocked triangular-solve driver that, for every panel of columns, sweeps
+ * the m dimension backwards (trailing rows first).
+ *
+ *   m, n, k   - sizes used for the tile/pointer arithmetic below
+ *   dummy1/2  - not referenced in this function
+ *   a, b      - packed operand buffers; each m-tile of a spans k * COMPSIZE
+ *               FLOATs per row, each n-panel of b spans k * COMPSIZE FLOATs
+ *               per column (see the pointer increments)
+ *   c, ldc    - output matrix; ldc is counted in elements of COMPSIZE FLOATs
+ *   offset    - added to m to form the initial value of kk
+ *
+ * Always returns 0.
+ */
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k,  FLOAT dummy1,
+#ifdef COMPLEX
+    FLOAT dummy2,
+#endif
+    FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){
+
+  BLASLONG i, j;
+  FLOAT *aa, *cc;
+  BLASLONG  kk;
+  
+  // Row-tile height: the maximum vector length for the configured element type.
+  size_t vl = VSETVL_MAX;
+
+    //fprintf(stderr, "%s , %s, m = %4ld  n = %4ld  k = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, k, offset); // Debug
+
+  // Number of full GEMM_UNROLL_N-wide column panels.
+  j = (n >> GEMM_UNROLL_N_SHIFT);
+
+  while (j > 0) {
+
+    // kk is decremented by the tile height after every solved tile.
+    kk = m + offset;
+
+    // Partial tile (m % vl rows) at the bottom is handled first.
+    i = m % vl;
+    if (i) {
+      aa = a + (m - i) * k * COMPSIZE;
+      cc = c + (m - i)     * COMPSIZE;
+
+      // GEMM update over the k - kk entries past kk, with alpha = dm1,
+      // applied to this tile of c before it is solved.
+      if (k - kk > 0) {
+        GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1,
+#ifdef COMPLEX
+            ZERO,
+#endif
+            aa + i             * kk * COMPSIZE,
+            b  + GEMM_UNROLL_N * kk * COMPSIZE,
+            cc,
+            ldc);
+      }
+
+      // Solve the i x GEMM_UNROLL_N tile using the block that ends at kk.
+      solve(i, GEMM_UNROLL_N,
+          aa + (kk - i) * i             * COMPSIZE,
+          b  + (kk - i) * GEMM_UNROLL_N * COMPSIZE,
+          cc, ldc);
+
+      kk -= i;
+
+    }
+
+    // Remember the partial-tile height, then walk the full vl-row tiles
+    // from the bottom-most one upwards (aa/cc are decremented each step).
+    int mod = i;
+    i = vl;
+    if (i <= m) {
+      aa = a + (m - mod - vl) * k * COMPSIZE;
+      cc = c + (m - mod - vl)     * COMPSIZE;
+
+      do {
+        if (k - kk > 0) {
+          GEMM_KERNEL(vl, GEMM_UNROLL_N, k - kk, dm1,
+#ifdef COMPLEX
+              ZERO,
+#endif
+              aa + vl * kk * COMPSIZE,
+              b +  GEMM_UNROLL_N * kk * COMPSIZE,
+              cc,
+              ldc);
+        }
+
+        solve(vl, GEMM_UNROLL_N,
+            aa + (kk - vl) * vl * COMPSIZE,
+            b  + (kk - vl) * GEMM_UNROLL_N * COMPSIZE,
+            cc, ldc);
+
+        aa -= vl * k * COMPSIZE;
+        cc -= vl     * COMPSIZE;
+        kk -= vl;
+
+        // i only counts rows consumed so the loop runs floor(m / vl) times.
+        i += vl;
+      } while (i <= m);
+    }
+
+
+    // Advance to the next column panel.
+    b += GEMM_UNROLL_N * k * COMPSIZE;
+    c += GEMM_UNROLL_N * ldc * COMPSIZE;
+    j --;
+  }
+
+  // Leftover columns: one narrower panel per set bit of n below
+  // GEMM_UNROLL_N, widest first. The row sweep mirrors the loop above with
+  // j as the panel width.
+  if (n & (GEMM_UNROLL_N - 1)) {
+
+    j = (GEMM_UNROLL_N >> 1);
+    while (j > 0) {
+      if (n & j) {
+
+        kk = m + offset;
+
+        // Partial bottom tile first.
+        i = m % vl;
+        if (i) {
+          aa = a + (m - i) * k * COMPSIZE;
+          cc = c + (m - i)     * COMPSIZE;
+
+          if (k - kk > 0) {
+            GEMM_KERNEL(i, j, k - kk, dm1,
+#ifdef COMPLEX
+                ZERO,
+#endif
+                aa + i * kk * COMPSIZE,
+                b  + j * kk * COMPSIZE,
+                cc, ldc);
+          }
+
+          solve(i, j,
+              aa + (kk - i) * i * COMPSIZE,
+              b  + (kk - i) * j * COMPSIZE,
+              cc, ldc);
+
+          kk -= i;
+
+        }
+
+        // Then the full vl-row tiles, bottom to top.
+        int mod = i;
+        i = vl;
+        if (i <= m) {
+          aa = a + (m - mod - vl) * k * COMPSIZE;
+          cc = c + (m - mod - vl)     * COMPSIZE;
+
+          do {
+            if (k - kk > 0) {
+              GEMM_KERNEL(vl, j, k - kk, dm1,
+#ifdef COMPLEX
+                  ZERO,
+#endif
+                  aa + vl * kk * COMPSIZE,
+                  b +  j             * kk * COMPSIZE,
+                  cc,
+                  ldc);
+            }
+
+            solve(vl, j,
+                aa + (kk - vl) * vl * COMPSIZE,
+                b  + (kk - vl) * j             * COMPSIZE,
+                cc, ldc);
+
+            aa -= vl * k * COMPSIZE;
+            cc -= vl     * COMPSIZE;
+            kk -= vl;
+
+            i += vl;
+          } while (i <= m);
+        }
+
+        b += j * k   * COMPSIZE;
+        c += j * ldc * COMPSIZE;
+      }
+      j >>= 1;
+    }
+  }
+
+  return 0;
+}
diff --git a/kernel/riscv64/trsm_kernel_LT_rvv_v1.c b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c
new file mode 100644
index 000000000..da443cfba
--- /dev/null
+++ b/kernel/riscv64/trsm_kernel_LT_rvv_v1.c
@@ -0,0 +1,341 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+// Precision-dependent aliases for the RVV intrinsics used by this file.
+// Single precision maps to 32-bit elements with LMUL=2 (e32m2 / f32m2),
+// double precision to 64-bit elements with LMUL=2 (e64m2 / f64m2).
+#if !defined(DOUBLE)
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m2()
+#define FLOAT_V_T               vfloat32m2_t
+#define FLOAT_VX2_T             vfloat32m2x2_t
+#define VGET_VX2                __riscv_vget_v_f32m2x2_f32m2
+#define VSET_VX2                __riscv_vset_v_f32m2_f32m2x2
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m2
+#define VSSEV_FLOAT             __riscv_vsse32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2x2
+#define VSSSEG2_FLOAT           __riscv_vssseg2e32_v_f32m2x2
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m2
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m2
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m2
+#else
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m2()
+#define FLOAT_V_T               vfloat64m2_t
+#define FLOAT_VX2_T             vfloat64m2x2_t
+#define VGET_VX2                __riscv_vget_v_f64m2x2_f64m2
+#define VSET_VX2                __riscv_vset_v_f64m2_f64m2x2
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
+#define VSSEV_FLOAT             __riscv_vsse64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2x2
+#define VSSSEG2_FLOAT           __riscv_vssseg2e64_v_f64m2x2
+// NOTE(review): VFMVVF_FLOAT is defined only in this branch and is not
+// referenced anywhere in this file.
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m2
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m2
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m2
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m2
+#endif
+
+
+// Alpha passed to GEMM_KERNEL by the driver below.
+static FLOAT dm1 = -1.;
+
+// GEMM kernel flavour used for the update step; CONJ builds pick the _L
+// variant, all others the _N variant.
+#ifdef CONJ
+#define GEMM_KERNEL   GEMM_KERNEL_L
+#else
+#define GEMM_KERNEL   GEMM_KERNEL_N
+#endif
+
+// GEMM_UNROLL_N_SHIFT = log2(GEMM_DEFAULT_UNROLL_N) for the supported panel
+// widths 1, 2, 4, 8 and 16. For any other width the macro stays undefined
+// and the driver below will not compile.
+#if GEMM_DEFAULT_UNROLL_N == 1
+#define GEMM_UNROLL_N_SHIFT 0
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 2
+#define GEMM_UNROLL_N_SHIFT 1
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 4
+#define GEMM_UNROLL_N_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 8
+#define GEMM_UNROLL_N_SHIFT 3
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 16
+#define GEMM_UNROLL_N_SHIFT 4
+#endif
+
+// Optimizes the implementation in ../arm64/trsm_kernel_LT_sve.c
+
+#ifndef COMPLEX
+
+/*
+ * Real-valued forward substitution on an m x n tile of c.
+ *
+ * For each row (top to bottom) the row of c is scaled by a[row] of the
+ * current row of the m x m block a, written both to the packed buffer b and
+ * back into c, and then eliminated from every later row of the tile.
+ * Rows of c are accessed across columns, hence the strided loads/stores with
+ * a byte stride of one column (ldc FLOATs).
+ * NOTE(review): the scale factor is presumably the pre-inverted diagonal
+ * produced by the packing routine - confirm against the copy kernels.
+ */
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+    const BLASLONG col_stride = sizeof(FLOAT) * ldc;
+    int row, cols_left, later;
+    size_t vl;
+
+    for (row = 0; row < m; row++, a += m) {
+
+        const FLOAT scale = a[row];
+        FLOAT *c_chunk = c;
+
+        /* Walk the n columns in chunks of at most VLMAX elements. */
+        for (cols_left = n; cols_left > 0; cols_left -= vl, c_chunk += vl * ldc) {
+            FLOAT *c_row = c_chunk + row;
+            FLOAT_V_T solved, upd;
+
+            vl = VSETVL(cols_left);
+
+            solved = VLSEV_FLOAT(c_row, col_stride, vl);
+            solved = VFMULVF_FLOAT(solved, scale, vl);
+            VSEV_FLOAT(b, solved, vl);
+            VSSEV_FLOAT(c_row, col_stride, solved, vl);
+            b += vl;
+
+            /* c[later] -= a[later] * solved for all rows after this one. */
+            for (later = row + 1; later < m; later++) {
+                upd = VLSEV_FLOAT(c_chunk + later, col_stride, vl);
+                upd = VFNMSACVF_FLOAT(upd, a[later], solved, vl);
+                VSSEV_FLOAT(c_chunk + later, col_stride, upd, vl);
+            }
+        }
+    }
+}
+
+#else
+
+/*
+ * Complex forward substitution on an m x n tile of c (interleaved re/im).
+ *
+ * For each row (top to bottom) the row of c is multiplied by the complex
+ * value a[row] (by its conjugate-style variant when CONJ is defined), stored
+ * to the packed buffer b and back into c, and then eliminated from every
+ * later row of the tile.
+ *
+ *   m, n  - tile height and width in complex elements
+ *   a     - m x m block, 2 FLOATs per element, advanced one row per step
+ *   b     - packed output, receives n complex values per row
+ *   c,ldc - output tile; ldc is in complex elements and is NOT rescaled
+ *           here, every use below multiplies by 2 explicitly
+ */
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+    FLOAT aa1, aa2;
+    FLOAT *pc;
+    int i, j, k;
+
+    /* Byte distance between the same row in two consecutive columns of c. */
+    BLASLONG stride_ldc = sizeof(FLOAT) * ldc * 2;
+
+    FLOAT_VX2_T vbx2, vsx2, vcx2;
+    FLOAT_V_T vb1, vb2, vc1, vc2, vs1, vs2;
+    size_t vl;
+
+    /*
+     * ldc must stay in complex elements: an earlier "ldc *= 2" here made the
+     * column-chunk advance below (vl * ldc * 2) step over twice as many
+     * columns as were processed whenever n exceeded one vector length.
+     */
+
+    for (i = 0; i < m; i++) {
+        aa1 = *(a + i * 2 + 0);
+        aa2 = *(a + i * 2 + 1);
+        pc  = c;
+
+        for (j = n; j > 0; j -= vl) {
+            vl = VSETVL(j);
+            /* Gather row i across vl columns, split into re (vb1) / im (vb2). */
+            vbx2 = VLSSEG2_FLOAT(pc + i * 2, stride_ldc, vl);
+            vb1 = VGET_VX2(vbx2, 0);
+            vb2 = VGET_VX2(vbx2, 1);
+#ifndef CONJ
+            /* vs = vb * aa */
+            vs1 =   VFMULVF_FLOAT(vb1, aa1, vl);
+            vs1 = VFNMSACVF_FLOAT(vs1, aa2, vb2, vl);
+            vs2 =   VFMULVF_FLOAT(vb2, aa1, vl);
+            vs2 =  VFMACCVF_FLOAT(vs2, aa2, vb1, vl);
+#else
+            /* vs = vb * conj(aa) */
+            vs1 =   VFMULVF_FLOAT(vb1, aa1, vl);
+            vs1 =  VFMACCVF_FLOAT(vs1, aa2, vb2, vl);
+            vs2 =   VFMULVF_FLOAT(vb2, aa1, vl);
+            vs2 = VFNMSACVF_FLOAT(vs2, aa2, vb1, vl);
+#endif
+            vsx2 = VSET_VX2(vsx2, 0, vs1);
+            vsx2 = VSET_VX2(vsx2, 1, vs2);
+            VSSEG2_FLOAT(b, vsx2, vl);
+            VSSSEG2_FLOAT(pc + i * 2, stride_ldc, vsx2, vl);
+            b   += vl * 2;
+
+            /* Eliminate the solved row from every later row of the tile. */
+            for (k = i + 1; k < m; k++) {
+                vcx2 = VLSSEG2_FLOAT(pc + k * 2, stride_ldc, vl);
+                vc1 = VGET_VX2(vcx2, 0);
+                vc2 = VGET_VX2(vcx2, 1);
+#ifndef CONJ
+                vc1 =  VFMACCVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl);
+#else
+                vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 1), vs2, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, *(a + k * 2 + 0), vs1, vl);
+                vc2 =  VFMACCVF_FLOAT(vc2, *(a + k * 2 + 1), vs1, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, *(a + k * 2 + 0), vs2, vl);
+#endif
+                vcx2 = VSET_VX2(vcx2, 0, vc1);
+                vcx2 = VSET_VX2(vcx2, 1, vc2);
+                VSSSEG2_FLOAT(pc + k * 2, stride_ldc, vcx2, vl);
+            }
+            /* Step past the vl columns just processed (2 FLOATs per element). */
+            pc  += vl * ldc * 2;
+        }
+
+        a += m * 2;
+    }
+}
+
+#endif
+
+
+/*
+ * Blocked triangular-solve driver that sweeps the m dimension forwards
+ * (first rows first) for every panel of columns.
+ *
+ * For each row tile the already-solved part (depth entries) is applied with
+ * GEMM_KERNEL using alpha = dm1, then the tile itself is handed to solve().
+ * Full GEMM_UNROLL_N-wide panels are processed first, followed by one
+ * narrower panel per set low bit of n. dummy1/dummy2 are not referenced.
+ * Always returns 0.
+ */
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
+#ifdef COMPLEX
+          FLOAT dummy2,
+#endif
+          FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset) {
+
+  const size_t vl = VSETVL_MAX;   /* height of a full row tile */
+  FLOAT *a_tile, *c_tile;
+  BLASLONG depth;                 /* entries already solved above the tile */
+  BLASLONG rows, tail, panels, width;
+
+  /* ---- full panels of GEMM_UNROLL_N columns ---- */
+  for (panels = (n >> GEMM_UNROLL_N_SHIFT); panels > 0; panels--) {
+
+    depth  = offset;
+    a_tile = a;
+    c_tile = c;
+
+    /* full vl-row tiles */
+    for (rows = vl; rows <= m; rows += vl) {
+
+      if (depth > 0) {
+        GEMM_KERNEL(vl, GEMM_UNROLL_N, depth, dm1,
+#ifdef COMPLEX
+            ZERO,
+#endif
+            a_tile, b, c_tile, ldc);
+      }
+
+      solve(vl, GEMM_UNROLL_N,
+          a_tile + depth * vl * COMPSIZE,
+          b      + depth * GEMM_UNROLL_N * COMPSIZE,
+          c_tile, ldc);
+
+      a_tile += vl * k * COMPSIZE;
+      c_tile += vl     * COMPSIZE;
+      depth  += vl;
+    }
+
+    /* trailing partial tile */
+    tail = m % vl;
+    if (tail) {
+      if (depth > 0) {
+        GEMM_KERNEL(tail, GEMM_UNROLL_N, depth, dm1,
+#ifdef COMPLEX
+            ZERO,
+#endif
+            a_tile, b, c_tile, ldc);
+      }
+
+      solve(tail, GEMM_UNROLL_N,
+          a_tile + depth * tail          * COMPSIZE,
+          b      + depth * GEMM_UNROLL_N * COMPSIZE,
+          c_tile, ldc);
+
+      a_tile += tail * k * COMPSIZE;
+      c_tile += tail     * COMPSIZE;
+      depth  += tail;
+    }
+
+    b += GEMM_UNROLL_N * k   * COMPSIZE;
+    c += GEMM_UNROLL_N * ldc * COMPSIZE;
+  }
+
+  /* ---- leftover columns: one panel per set bit below GEMM_UNROLL_N ---- */
+  if (n & (GEMM_UNROLL_N - 1)) {
+
+    for (width = (GEMM_UNROLL_N >> 1); width > 0; width >>= 1) {
+
+      if (!(n & width)) continue;
+
+      depth  = offset;
+      a_tile = a;
+      c_tile = c;
+
+      for (rows = vl; rows <= m; rows += vl) {
+
+        if (depth > 0) {
+          GEMM_KERNEL(vl, width, depth, dm1,
+#ifdef COMPLEX
+              ZERO,
+#endif
+              a_tile,
+              b,
+              c_tile,
+              ldc);
+        }
+
+        solve(vl, width,
+            a_tile + depth * vl * COMPSIZE,
+            b      + depth * width * COMPSIZE, c_tile, ldc);
+
+        a_tile += vl * k * COMPSIZE;
+        c_tile += vl     * COMPSIZE;
+        depth  += vl;
+      }
+
+      tail = m % vl;
+      if (tail) {
+        if (depth > 0) {
+          GEMM_KERNEL(tail, width, depth, dm1,
+#ifdef COMPLEX
+              ZERO,
+#endif
+              a_tile,
+              b,
+              c_tile,
+              ldc);
+        }
+
+        solve(tail, width,
+            a_tile + depth * tail  * COMPSIZE,
+            b      + depth * width * COMPSIZE, c_tile, ldc);
+
+        a_tile += tail * k * COMPSIZE;
+        c_tile += tail     * COMPSIZE;
+        depth  += tail;
+      }
+
+      b += width * k   * COMPSIZE;
+      c += width * ldc * COMPSIZE;
+    }
+  }
+
+  return 0;
+}
diff --git a/kernel/riscv64/trsm_kernel_RN_rvv_v1.c b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c
new file mode 100644
index 000000000..32e481036
--- /dev/null
+++ b/kernel/riscv64/trsm_kernel_RN_rvv_v1.c
@@ -0,0 +1,337 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+// Precision-dependent aliases for the RVV intrinsics used by this file.
+// Single precision maps to 32-bit elements with LMUL=2 (e32m2 / f32m2),
+// double precision to 64-bit elements with LMUL=2 (e64m2 / f64m2).
+// NOTE(review): VSSEV_FLOAT (both branches) and VFMVVF_FLOAT (double only)
+// are defined here but not referenced anywhere in this file.
+#if !defined(DOUBLE)
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m2()
+#define FLOAT_V_T               vfloat32m2_t
+#define FLOAT_VX2_T             vfloat32m2x2_t
+#define VGET_VX2                __riscv_vget_v_f32m2x2_f32m2
+#define VSET_VX2                __riscv_vset_v_f32m2_f32m2x2
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VSSEV_FLOAT             __riscv_vsse32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m2
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m2
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m2
+#else
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m2()
+#define FLOAT_V_T               vfloat64m2_t
+#define FLOAT_VX2_T             vfloat64m2x2_t
+#define VGET_VX2                __riscv_vget_v_f64m2x2_f64m2
+#define VSET_VX2                __riscv_vset_v_f64m2_f64m2x2
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSSEV_FLOAT             __riscv_vsse64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m2
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m2
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m2
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m2
+#endif
+
+// Alpha passed to GEMM_KERNEL by the driver below.
+static FLOAT dm1 = -1.;
+
+// GEMM kernel flavour used for the update step; CONJ builds pick the _R
+// variant, all others the _N variant.
+#ifdef CONJ
+#define GEMM_KERNEL   GEMM_KERNEL_R
+#else
+#define GEMM_KERNEL   GEMM_KERNEL_N
+#endif
+
+// GEMM_UNROLL_N_SHIFT = log2(GEMM_DEFAULT_UNROLL_N) for the supported panel
+// widths 1, 2, 4, 8 and 16. For any other width the macro stays undefined
+// and the driver below will not compile.
+#if GEMM_DEFAULT_UNROLL_N == 1
+#define GEMM_UNROLL_N_SHIFT 0
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 2
+#define GEMM_UNROLL_N_SHIFT 1
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 4
+#define GEMM_UNROLL_N_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 8
+#define GEMM_UNROLL_N_SHIFT 3
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 16
+#define GEMM_UNROLL_N_SHIFT 4
+#endif
+
+// Optimizes the implementation in ../arm64/trsm_kernel_RN_sve.c
+
+#ifndef COMPLEX
+
+/*
+ * Real-valued forward substitution over the n columns of an m x n tile of c.
+ *
+ * For each column (first to last) the column is scaled by b[col] of the
+ * current row of the n x n block b, written both to the packed buffer a and
+ * back into c, and then eliminated from every later column of the tile.
+ * Columns of c are contiguous, so unit-stride vector loads/stores are used.
+ * NOTE(review): the scale factor is presumably the pre-inverted diagonal
+ * produced by the packing routine - confirm against the copy kernels.
+ */
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+    int col, rows_left, later;
+    size_t vl;
+
+    for (col = 0; col < n; col++, b += n) {
+
+        const FLOAT scale = b[col];
+        FLOAT *c_blk = c;   /* start of the current row chunk in column 0 */
+
+        /* Walk the m rows in chunks of at most VLMAX elements. */
+        for (rows_left = m; rows_left > 0; rows_left -= vl, c_blk += vl) {
+            FLOAT *c_col = c_blk + col * ldc;
+            FLOAT_V_T solved, upd;
+
+            vl = VSETVL(rows_left);
+
+            solved = VLEV_FLOAT(c_col, vl);
+            solved = VFMULVF_FLOAT(solved, scale, vl);
+            VSEV_FLOAT(a, solved, vl);
+            VSEV_FLOAT(c_col, solved, vl);
+            a += vl;
+
+            /* c[:, later] -= b[later] * solved for all later columns. */
+            for (later = col + 1; later < n; later++) {
+                FLOAT *c_tgt = c_blk + later * ldc;
+
+                upd = VLEV_FLOAT(c_tgt, vl);
+                upd = VFNMSACVF_FLOAT(upd, b[later], solved, vl);
+                VSEV_FLOAT(c_tgt, upd, vl);
+            }
+        }
+    }
+}
+
+#else
+
+/*
+ * Complex forward substitution over the n columns of an m x n tile of c
+ * (interleaved re/im, 2 FLOATs per element).
+ *
+ * For each column i (first to last) the column is multiplied by the complex
+ * value b[i] of the current row of the n x n block b, stored to the packed
+ * buffer a and back into c, and then eliminated from every later column.
+ * ldc is in complex elements; every use multiplies it by 2.
+ */
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+    FLOAT bb1, bb2;
+
+    FLOAT *pci, *pcj;
+
+    int i, j, k;
+
+    FLOAT_VX2_T vax2, vsx2, vcx2;
+    FLOAT_V_T va1, va2, vs1, vs2, vc1, vc2;
+
+    size_t vl;
+
+    for (i = 0; i < n; i++) {
+
+        // Real and imaginary part of the multiplier for column i.
+        bb1 = *(b + i * 2 + 0);
+        bb2 = *(b + i * 2 + 1);
+
+        // pci walks down column i, pcj tracks the same row offset in column 0.
+        pci = c + i * ldc * 2;
+        pcj = c;
+
+        for (j = m; j > 0; j -= vl) {
+            vl = VSETVL(j);
+            // De-interleave vl complex values into re (va1) and im (va2).
+            vax2 = VLSEG2_FLOAT(pci, vl);
+            va1 = VGET_VX2(vax2, 0);
+            va2 = VGET_VX2(vax2, 1);
+#ifndef CONJ
+            // vs = va * bb
+            vs1 =   VFMULVF_FLOAT(va1, bb1, vl);
+            vs1 = VFNMSACVF_FLOAT(vs1, bb2, va2, vl);
+            vs2 =   VFMULVF_FLOAT(va1, bb2, vl);
+            vs2 =  VFMACCVF_FLOAT(vs2, bb1, va2, vl);
+#else
+            // vs = va * conj(bb)
+            vs1 =   VFMULVF_FLOAT(va1, bb1, vl);
+            vs1 =  VFMACCVF_FLOAT(vs1, bb2, va2, vl);
+            vs2 =   VFMULVF_FLOAT(va2, bb1, vl);
+            vs2 = VFNMSACVF_FLOAT(vs2, bb2, va1, vl);
+#endif
+            // Store the solved chunk to the packed buffer and back into c.
+            vsx2 = VSET_VX2(vsx2, 0, vs1);
+            vsx2 = VSET_VX2(vsx2, 1, vs2);
+            VSSEG2_FLOAT(a, vsx2, vl);
+            VSSEG2_FLOAT(pci, vsx2, vl);
+            a += vl * 2;
+            pci += vl * 2;
+
+            // Eliminate from later columns: c[:, k] -= vs * b[k]
+            // (vs * conj(b[k]) when CONJ is defined).
+            for (k = i + 1; k < n; k ++){
+                vcx2 = VLSEG2_FLOAT(pcj + k * ldc * 2, vl);
+                vc1 = VGET_VX2(vcx2, 0);
+                vc2 = VGET_VX2(vcx2, 1);
+#ifndef CONJ
+                vc1 =  VFMACCVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl);
+#else
+                vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl);
+                vc2 =  VFMACCVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl);
+#endif
+                vcx2 = VSET_VX2(vcx2, 0, vc1);
+                vcx2 = VSET_VX2(vcx2, 1, vc2);
+                VSSEG2_FLOAT(pcj + k * ldc * 2, vcx2, vl);
+            }
+            pcj += vl * 2;
+        }
+        // Next row of the n x n block.
+        b += n * 2;
+    }
+}
+
+#endif
+
+
+/*
+ * Blocked triangular-solve driver that sweeps the n dimension forwards
+ * (first column panels first).
+ *
+ * depth starts at -offset and grows by the panel width after every panel.
+ * Within a panel each row tile first receives a GEMM_KERNEL update over the
+ * leading depth entries (alpha = dm1) and is then handed to solve().
+ * Full GEMM_UNROLL_N-wide panels come first, followed by one narrower panel
+ * per set low bit of n. dummy1/dummy2 are not referenced.
+ * Always returns 0.
+ */
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1,
+#ifdef COMPLEX
+          FLOAT dummy2,
+#endif
+          FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset) {
+
+  const size_t vl = VSETVL_MAX;   /* height of a full row tile */
+  FLOAT *a_tile, *c_tile;
+  BLASLONG depth = -offset;
+  BLASLONG rows, tail, panels, width;
+
+  /* ---- full panels of GEMM_UNROLL_N columns ---- */
+  for (panels = (n >> GEMM_UNROLL_N_SHIFT); panels > 0; panels--) {
+
+    a_tile = a;
+    c_tile = c;
+
+    /* full vl-row tiles */
+    for (rows = vl; rows <= m; rows += vl) {
+
+      if (depth > 0) {
+        GEMM_KERNEL(vl, GEMM_UNROLL_N, depth, dm1,
+#ifdef COMPLEX
+            ZERO,
+#endif
+            a_tile, b, c_tile, ldc);
+      }
+
+      solve(vl, GEMM_UNROLL_N,
+          a_tile + depth * vl * COMPSIZE,
+          b      + depth * GEMM_UNROLL_N * COMPSIZE,
+          c_tile, ldc);
+
+      a_tile += vl * k * COMPSIZE;
+      c_tile += vl     * COMPSIZE;
+    }
+
+    /* trailing partial tile */
+    tail = m % vl;
+    if (tail) {
+      if (depth > 0) {
+        GEMM_KERNEL(tail, GEMM_UNROLL_N, depth, dm1,
+#ifdef COMPLEX
+            ZERO,
+#endif
+            a_tile, b, c_tile, ldc);
+      }
+
+      solve(tail, GEMM_UNROLL_N,
+          a_tile + depth * tail          * COMPSIZE,
+          b      + depth * GEMM_UNROLL_N * COMPSIZE,
+          c_tile, ldc);
+
+      a_tile += tail * k * COMPSIZE;
+      c_tile += tail     * COMPSIZE;
+    }
+
+    depth += GEMM_UNROLL_N;
+    b += GEMM_UNROLL_N * k   * COMPSIZE;
+    c += GEMM_UNROLL_N * ldc * COMPSIZE;
+  }
+
+  /* ---- leftover columns: one panel per set bit below GEMM_UNROLL_N ---- */
+  if (n & (GEMM_UNROLL_N - 1)) {
+
+    for (width = (GEMM_UNROLL_N >> 1); width > 0; width >>= 1) {
+
+      if (!(n & width)) continue;
+
+      a_tile = a;
+      c_tile = c;
+
+      for (rows = vl; rows <= m; rows += vl) {
+
+        if (depth > 0) {
+          GEMM_KERNEL(vl, width, depth, dm1,
+#ifdef COMPLEX
+              ZERO,
+#endif
+              a_tile,
+              b,
+              c_tile,
+              ldc);
+        }
+
+        solve(vl, width,
+            a_tile + depth * vl * COMPSIZE,
+            b      + depth * width * COMPSIZE, c_tile, ldc);
+
+        a_tile += vl * k * COMPSIZE;
+        c_tile += vl     * COMPSIZE;
+      }
+
+      tail = m % vl;
+      if (tail) {
+        if (depth > 0) {
+          GEMM_KERNEL(tail, width, depth, dm1,
+#ifdef COMPLEX
+              ZERO,
+#endif
+              a_tile,
+              b,
+              c_tile,
+              ldc);
+        }
+
+        solve(tail, width,
+            a_tile + depth * tail  * COMPSIZE,
+            b      + depth * width * COMPSIZE, c_tile, ldc);
+
+        a_tile += tail * k * COMPSIZE;
+        c_tile += tail     * COMPSIZE;
+      }
+
+      b += width * k   * COMPSIZE;
+      c += width * ldc * COMPSIZE;
+      depth += width;
+    }
+  }
+
+  return 0;
+}
diff --git a/kernel/riscv64/trsm_kernel_RT_rvv_v1.c b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c
new file mode 100644
index 000000000..81cc41818
--- /dev/null
+++ b/kernel/riscv64/trsm_kernel_RT_rvv_v1.c
@@ -0,0 +1,356 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+// Precision-dependent aliases for the RVV intrinsics used by this file.
+// Single precision maps to 32-bit elements with LMUL=2 (e32m2 / f32m2),
+// double precision to 64-bit elements with LMUL=2 (e64m2 / f64m2).
+// NOTE(review): VFMVVF_FLOAT is defined only in the double branch and is
+// not referenced anywhere in this file.
+#if !defined(DOUBLE)
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m2()
+#define FLOAT_V_T               vfloat32m2_t
+#define FLOAT_VX2_T             vfloat32m2x2_t
+#define VGET_VX2                __riscv_vget_v_f32m2x2_f32m2
+#define VSET_VX2                __riscv_vset_v_f32m2_f32m2x2
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m2
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m2
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m2
+#else
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m2()
+#define FLOAT_V_T               vfloat64m2_t
+#define FLOAT_VX2_T             vfloat64m2x2_t
+#define VGET_VX2                __riscv_vget_v_f64m2x2_f64m2
+#define VSET_VX2                __riscv_vset_v_f64m2_f64m2x2
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m2
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m2
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m2
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m2
+#endif
+
+
+// Alpha passed to GEMM_KERNEL by the driver below.
+static FLOAT dm1 = -1.;
+
+// GEMM kernel flavour used for the update step; CONJ builds pick the _R
+// variant, all others the _N variant.
+#ifdef CONJ
+#define GEMM_KERNEL   GEMM_KERNEL_R
+#else
+#define GEMM_KERNEL   GEMM_KERNEL_N
+#endif
+
+// GEMM_UNROLL_N_SHIFT = log2(GEMM_DEFAULT_UNROLL_N) for the supported panel
+// widths 1, 2, 4, 8 and 16. For any other width the macro stays undefined
+// and the driver below will not compile.
+#if GEMM_DEFAULT_UNROLL_N == 1
+#define GEMM_UNROLL_N_SHIFT 0
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 2
+#define GEMM_UNROLL_N_SHIFT 1
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 4
+#define GEMM_UNROLL_N_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 8
+#define GEMM_UNROLL_N_SHIFT 3
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 16
+#define GEMM_UNROLL_N_SHIFT 4
+#endif
+
+// Optimizes the implementation in ../arm64/trsm_kernel_RT_sve.c
+
+#ifndef COMPLEX
+
+/*
+ * Real-valued backward substitution over the n columns of an m x n tile of c.
+ *
+ * Columns are visited from last to first. Column col is scaled by b[col] of
+ * row col of the n x n block b, written to row col of the packed buffer a
+ * (m values per row) and back into c, and then eliminated from every earlier
+ * column of the tile.
+ * NOTE(review): the scale factor is presumably the pre-inverted diagonal
+ * produced by the packing routine - confirm against the copy kernels.
+ */
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+    int col, rows_left, earlier;
+    size_t vl;
+
+    for (col = n - 1; col >= 0; col--) {
+
+        const FLOAT *b_row = b + col * n;   /* row col of the n x n block */
+        FLOAT *a_out = a + col * m;         /* packed output for this column */
+        const FLOAT scale = b_row[col];
+        FLOAT *c_blk = c;                   /* current row chunk in column 0 */
+
+        /* Walk the m rows in chunks of at most VLMAX elements. */
+        for (rows_left = m; rows_left > 0; rows_left -= vl, c_blk += vl) {
+            FLOAT *c_col = c_blk + col * ldc;
+            FLOAT_V_T solved, upd;
+
+            vl = VSETVL(rows_left);
+
+            solved = VLEV_FLOAT(c_col, vl);
+            solved = VFMULVF_FLOAT(solved, scale, vl);
+            VSEV_FLOAT(a_out, solved, vl);
+            VSEV_FLOAT(c_col, solved, vl);
+            a_out += vl;
+
+            /* c[:, earlier] -= b_row[earlier] * solved for earlier columns. */
+            for (earlier = 0; earlier < col; earlier++) {
+                FLOAT *c_tgt = c_blk + earlier * ldc;
+
+                upd = VLEV_FLOAT(c_tgt, vl);
+                upd = VFNMSACVF_FLOAT(upd, b_row[earlier], solved, vl);
+                VSEV_FLOAT(c_tgt, upd, vl);
+            }
+        }
+    }
+}
+
+#else
+
+/*
+ * Complex backward substitution over the n columns of an m x n tile of c
+ * (interleaved re/im, 2 FLOATs per element).
+ *
+ * Columns are visited from last to first. Column i is multiplied by the
+ * complex value b[i] of row i of the n x n block b, stored to the packed
+ * buffer a and back into c, and then eliminated from every earlier column.
+ * ldc is in complex elements; every use multiplies it by 2.
+ */
+static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+
+    FLOAT bb1, bb2;
+
+    FLOAT *pci, *pcj;
+
+    int i, j, k;
+
+    FLOAT_VX2_T vax2, vsx2, vcx2;
+    FLOAT_V_T va1, va2, vs1, vs2, vc1, vc2;
+
+    size_t vl;
+
+    // Start at the last row of both the packed output and the n x n block.
+    a += (n - 1) * m * 2;
+    b += (n - 1) * n * 2;
+
+    for (i = n - 1; i >= 0; i--) {
+
+        // Real and imaginary part of the multiplier for column i.
+        bb1 = *(b + i * 2 + 0);
+        bb2 = *(b + i * 2 + 1);
+
+        // pci walks down column i, pcj tracks the same row offset in column 0.
+        pci = c + i * ldc * 2;
+        pcj = c;
+        for (j = m; j > 0; j -= vl) {
+            vl = VSETVL(j);
+            // De-interleave vl complex values into re (va1) and im (va2).
+            vax2 = VLSEG2_FLOAT(pci, vl);
+            va1 = VGET_VX2(vax2, 0);
+            va2 = VGET_VX2(vax2, 1);
+#ifndef CONJ
+            // vs = va * bb
+            vs1 =   VFMULVF_FLOAT(va1, bb1, vl);
+            vs1 = VFNMSACVF_FLOAT(vs1, bb2, va2, vl);
+            vs2 =   VFMULVF_FLOAT(va1, bb2, vl);
+            vs2 =  VFMACCVF_FLOAT(vs2, bb1, va2, vl);
+#else
+            // vs = va * conj(bb)
+            vs1 =   VFMULVF_FLOAT(va1, bb1, vl);
+            vs1 =  VFMACCVF_FLOAT(vs1, bb2, va2, vl);
+            vs2 =   VFMULVF_FLOAT(va2, bb1, vl);
+            vs2 = VFNMSACVF_FLOAT(vs2, bb2, va1, vl);
+#endif
+            // Store the solved chunk to the packed buffer and back into c.
+            vsx2 = VSET_VX2(vsx2, 0, vs1);
+            vsx2 = VSET_VX2(vsx2, 1, vs2);
+            VSSEG2_FLOAT(a, vsx2, vl);
+            VSSEG2_FLOAT(pci, vsx2, vl);
+            a += vl * 2;
+            pci += vl * 2;
+
+            // Eliminate from earlier columns: c[:, k] -= vs * b[k]
+            // (vs * conj(b[k]) when CONJ is defined).
+            for (k = 0; k < i; k ++){
+                vcx2 = VLSEG2_FLOAT(pcj + k * ldc * 2, vl);
+                vc1 = VGET_VX2(vcx2, 0);
+                vc2 = VGET_VX2(vcx2, 1);
+#ifndef CONJ
+                vc1 =  VFMACCVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl);
+#else
+                vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 0), vs1, vl);
+                vc1 = VFNMSACVF_FLOAT(vc1, *(b + k * 2 + 1), vs2, vl);
+                vc2 =  VFMACCVF_FLOAT(vc2, *(b + k * 2 + 1), vs1, vl);
+                vc2 = VFNMSACVF_FLOAT(vc2, *(b + k * 2 + 0), vs2, vl);
+#endif
+                vcx2 = VSET_VX2(vcx2, 0, vc1);
+                vcx2 = VSET_VX2(vcx2, 1, vc2);
+                VSSEG2_FLOAT(pcj + k * ldc * 2, vcx2, vl);
+            }
+            pcj += vl * 2;
+        }
+        // Step back one row of b; a advanced by 2*m FLOATs above, so moving
+        // back 4*m lands on the previous packed row.
+        b -= n * 2;
+        a -= 4 * m;
+    }
+}
+
+#endif
+
+/*
+ * Blocked triangular-solve driver that sweeps the n dimension backwards
+ * (last column panels first).
+ *
+ * b and c are first moved past the last column; every panel then steps them
+ * back by its own width before use. depth starts at n - offset and shrinks
+ * by the panel width after each panel. Within a panel each row tile first
+ * receives a GEMM_KERNEL update over the k - depth entries past depth
+ * (alpha = dm1) and is then handed to solve().
+ * The narrow panels (one per set low bit of n, narrowest first) are handled
+ * before the full GEMM_UNROLL_N-wide panels. dummy1/dummy2 are not
+ * referenced. Always returns 0.
+ */
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k,  FLOAT dummy1,
+#ifdef COMPLEX
+    FLOAT dummy2,
+#endif
+    FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset) {
+
+  const size_t vl = VSETVL_MAX;   /* height of a full row tile */
+  FLOAT *a_tile, *c_tile;
+  BLASLONG depth = n - offset;
+  BLASLONG rows, tail, panels, width;
+
+  c += n * ldc * COMPSIZE;
+  b += n * k   * COMPSIZE;
+
+  /* ---- leftover columns: one panel per set bit below GEMM_UNROLL_N ---- */
+  if (n & (GEMM_UNROLL_N - 1)) {
+
+    for (width = 1; width < GEMM_UNROLL_N; width <<= 1) {
+
+      if (!(n & width)) continue;
+
+      a_tile = a;
+      b -= width * k   * COMPSIZE;
+      c -= width * ldc * COMPSIZE;
+      c_tile = c;
+
+      /* full vl-row tiles */
+      for (rows = vl; rows <= m; rows += vl) {
+
+        if (k - depth > 0) {
+          GEMM_KERNEL(vl, width, k - depth, dm1,
+#ifdef COMPLEX
+              ZERO,
+#endif
+              a_tile + vl    * depth * COMPSIZE,
+              b      + width * depth * COMPSIZE,
+              c_tile,
+              ldc);
+        }
+
+        solve(vl, width,
+            a_tile + (depth - width) * vl    * COMPSIZE,
+            b      + (depth - width) * width * COMPSIZE,
+            c_tile, ldc);
+
+        a_tile += vl * k * COMPSIZE;
+        c_tile += vl     * COMPSIZE;
+      }
+
+      /* trailing partial tile */
+      tail = m % vl;
+      if (tail) {
+        if (k - depth > 0) {
+          GEMM_KERNEL(tail, width, k - depth, dm1,
+#ifdef COMPLEX
+              ZERO,
+#endif
+              a_tile + tail  * depth * COMPSIZE,
+              b      + width * depth * COMPSIZE,
+              c_tile, ldc);
+        }
+
+        solve(tail, width,
+            a_tile + (depth - width) * tail  * COMPSIZE,
+            b      + (depth - width) * width * COMPSIZE,
+            c_tile, ldc);
+
+        a_tile += tail * k * COMPSIZE;
+        c_tile += tail     * COMPSIZE;
+      }
+
+      depth -= width;
+    }
+  }
+
+  /* ---- full panels of GEMM_UNROLL_N columns ---- */
+  for (panels = (n >> GEMM_UNROLL_N_SHIFT); panels > 0; panels--) {
+
+    a_tile = a;
+    b -= GEMM_UNROLL_N * k   * COMPSIZE;
+    c -= GEMM_UNROLL_N * ldc * COMPSIZE;
+    c_tile = c;
+
+    for (rows = vl; rows <= m; rows += vl) {
+
+      if (k - depth > 0) {
+        GEMM_KERNEL(vl, GEMM_UNROLL_N, k - depth, dm1,
+#ifdef COMPLEX
+            ZERO,
+#endif
+            a_tile + vl * depth * COMPSIZE,
+            b      + GEMM_UNROLL_N * depth * COMPSIZE,
+            c_tile,
+            ldc);
+      }
+
+      solve(vl, GEMM_UNROLL_N,
+          a_tile + (depth - GEMM_UNROLL_N) * vl * COMPSIZE,
+          b      + (depth - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE,
+          c_tile, ldc);
+
+      a_tile += vl * k * COMPSIZE;
+      c_tile += vl     * COMPSIZE;
+    }
+
+    tail = m % vl;
+    if (tail) {
+      if (k - depth > 0) {
+        GEMM_KERNEL(tail, GEMM_UNROLL_N, k - depth, dm1,
+#ifdef COMPLEX
+            ZERO,
+#endif
+            a_tile + tail          * depth * COMPSIZE,
+            b      + GEMM_UNROLL_N * depth * COMPSIZE,
+            c_tile,
+            ldc);
+      }
+
+      solve(tail, GEMM_UNROLL_N,
+          a_tile + (depth - GEMM_UNROLL_N) * tail          * COMPSIZE,
+          b      + (depth - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE,
+          c_tile, ldc);
+
+      a_tile += tail * k * COMPSIZE;
+      c_tile += tail     * COMPSIZE;
+    }
+
+    depth -= GEMM_UNROLL_N;
+  }
+
+  return 0;
+}
+
+
diff --git a/kernel/riscv64/trsm_lncopy_rvv_v1.c b/kernel/riscv64/trsm_lncopy_rvv_v1.c
new file mode 100644
index 000000000..41c84be25
--- /dev/null
+++ b/kernel/riscv64/trsm_lncopy_rvv_v1.c
@@ -0,0 +1,122 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE)  // single precision: e32 / m2 variants of the intrinsics
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2  // NOTE(review): not referenced by CNAME in this file
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VSEV_FLOAT_M            __riscv_vse32_v_f32m2_m  // masked store
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m2  // strided load
+#define VBOOL_T                 vbool16_t
+#define UINT_V_T                vuint32m2_t
+#define VID_V_UINT              __riscv_vid_v_u32m2  // vector of lane indices
+#define VMSLTU_VX_UINT          __riscv_vmsltu_vx_u32m2_b16  // mask of lanes that are (unsigned) below a scalar
+#else  // DOUBLE: the same aliases bound to the e64 / m2 variants
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VSEV_FLOAT_M            __riscv_vse64_v_f64m2_m
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
+#define VBOOL_T                 vbool32_t
+#define UINT_V_T                vuint64m2_t
+#define VID_V_UINT              __riscv_vid_v_u64m2
+#define VMSLTU_VX_UINT          __riscv_vmsltu_vx_u64m2_b32
+
+#endif
+
+#ifndef UNIT  // INV(a) is the reciprocal of a ...
+#define INV(a) (ONE / (a))
+#else  // ... unless UNIT is defined, in which case it ignores a and yields ONE
+#define INV(a) (ONE)
+#endif
+
+// Vectorized (RVV) variant of ../arm64/trsm_lncopy_sve.c: packs a into b, consuming n in chunks of vl lanes.
+// Diagonal entries (reached when ii == jj) are stored through INV(); positions with ii < jj are left unwritten.
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+    // m bounds the inner walk and n the outer one; a is read with element stride lda; the function always returns 0.
+    BLASLONG i, ii, jj, js;
+    // ao: read cursor inside the current chunk of a.
+    FLOAT *ao;
+    // jj: value of ii at which the current chunk reaches its diagonal block; offset seeds it for the first chunk.
+    jj = offset;
+    // The strided-load intrinsic takes its stride in bytes, hence the sizeof(FLOAT) factor.
+    BLASLONG stride_lda = sizeof(FLOAT)*lda;
+
+    FLOAT_V_T va1;
+    VBOOL_T vbool_cmp;
+    UINT_V_T vindex;
+    size_t vl;
+    // Outer loop: consume n in chunks of vl lanes, vl being whatever VSETVL grants for the remainder.
+    for (js = n; js > 0; js -= vl)
+    {
+        vl = VSETVL(js);
+        ao = a;
+        // ii restarts for every chunk and advances in lockstep with i.
+        ii = 0;
+        for (i = 0; i < m;)
+        {
+            if (ii == jj)  // diagonal block: handles vl positions in one go
+            {
+                vindex  = VID_V_UINT(vl);  // lane indices 0 .. vl-1
+                for (unsigned int j = 0; j < vl; j++)  // j: position inside the diagonal block
+                {
+                    va1 = VLSEV_FLOAT(ao, stride_lda, vl);
+                    vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl);  // selects lanes with index < j
+                    VSEV_FLOAT_M(vbool_cmp, b, va1, vl);  // only those lanes are copied to b
+                    // Lane j of the same strided vector is the diagonal entry; it is stored through INV().
+                    *(b + j) = INV(*(ao + j * lda));
+                    ao++;
+                    b += vl;
+                }
+                i += vl;  // NOTE(review): no check against m here - presumably callers guarantee the block fits
+                ii += vl;
+            }
+            else
+            {
+                if (ii > jj)  // past the diagonal: copy all vl lanes
+                {
+                    va1 = VLSEV_FLOAT(ao, stride_lda, vl);
+                    VSEV_FLOAT(b, va1, vl);
+                }
+                ao++;  // both cursors advance even when nothing was copied (ii < jj)
+                b += vl;
+                i++;
+                ii++;
+            }
+        }
+        // Next chunk: move a forward by vl strides of lda; its diagonal block starts vl steps later.
+        a += vl * lda;
+        jj += vl;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/trsm_ltcopy_rvv_v1.c b/kernel/riscv64/trsm_ltcopy_rvv_v1.c
new file mode 100644
index 000000000..003bd3465
--- /dev/null
+++ b/kernel/riscv64/trsm_ltcopy_rvv_v1.c
@@ -0,0 +1,122 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE)  // single precision: e32 / m2 variants of the intrinsics
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2  // unit-stride load
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VSEV_FLOAT_M            __riscv_vse32_v_f32m2_m  // masked store
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m2  // NOTE(review): not referenced by CNAME in this file
+#define VBOOL_T                 vbool16_t
+#define UINT_V_T                vuint32m2_t
+#define VID_V_UINT              __riscv_vid_v_u32m2  // vector of lane indices
+#define VMSGTU_VX_UINT          __riscv_vmsgtu_vx_u32m2_b16  // mask of lanes that are (unsigned) above a scalar
+#else  // DOUBLE: the same aliases bound to the e64 / m2 variants
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VSEV_FLOAT_M            __riscv_vse64_v_f64m2_m
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
+#define VBOOL_T                 vbool32_t
+#define UINT_V_T                vuint64m2_t
+#define VID_V_UINT              __riscv_vid_v_u64m2
+#define VMSGTU_VX_UINT          __riscv_vmsgtu_vx_u64m2_b32
+#endif
+
+#ifndef UNIT  // INV(a) is the reciprocal of a ...
+#define INV(a) (ONE / (a))
+#else  // ... unless UNIT is defined, in which case it ignores a and yields ONE
+#define INV(a) (ONE)
+#endif
+
+// Vectorized (RVV) variant of ../arm64/trsm_ltcopy_sve.c: packs a into b, consuming n in chunks of vl lanes.
+// Diagonal entries (reached when ii == jj) are stored through INV(); positions with ii > jj are left unwritten.
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+    // m bounds the inner walk and n the outer one; each step reads vl contiguous elements of a; always returns 0.
+    BLASLONG i, ii, jj, js;
+    // ao: read cursor inside the current chunk of a; it moves forward by lda per step.
+    FLOAT *ao;
+    // jj: value of ii at which the current chunk reaches its diagonal block; offset seeds it for the first chunk.
+    jj = offset;
+
+    FLOAT_V_T va1;
+    VBOOL_T vbool_cmp;
+    UINT_V_T vindex;
+
+    size_t vl;
+    // Outer loop: consume n in chunks of vl lanes, vl being whatever VSETVL grants for the remainder.
+    for (js = n; js > 0; js -= vl)
+    {
+        vl = VSETVL(js);
+        ao = a;
+        // ii restarts for every chunk and advances in lockstep with i.
+        ii = 0;
+        for (i = 0; i < m;)
+        {
+            // Three cases: on the diagonal block, before it (copy), after it (skip).
+            if (ii == jj)  // diagonal block: handles vl positions in one go
+            {
+                vindex  = VID_V_UINT(vl);  // lane indices 0 .. vl-1
+                for (unsigned int j = 0; j < vl; j++)  // j: position inside the diagonal block
+                {
+                    *(b + j) = INV(*(ao + j));  // lane j is the diagonal entry; stored through INV()
+                    // The masked store below covers lanes > j only, so the value just written to lane j survives.
+                    va1 = VLEV_FLOAT(ao, vl);
+                    vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl);  // selects lanes with index > j
+                    VSEV_FLOAT_M(vbool_cmp, b, va1, vl);
+
+                    b += vl;
+                    ao += lda;
+                }
+                i += vl;  // NOTE(review): no check against m here - presumably callers guarantee the block fits
+                ii += vl;
+            }
+            else  // not on the diagonal block
+            {
+                if (ii < jj)  // before the diagonal: copy all vl lanes
+                {
+                    va1 = VLEV_FLOAT(ao, vl);
+                    VSEV_FLOAT(b, va1, vl);
+                }
+                ao += lda;  // both cursors advance even when nothing was copied (ii > jj)
+                b += vl;
+                i ++;
+                ii ++;
+            }
+        }
+        // Next chunk: move a forward by the vl lanes just consumed; its diagonal block starts vl steps later.
+        a += vl;
+        jj += vl;
+    }
+    return 0;
+}
+
diff --git a/kernel/riscv64/trsm_uncopy_rvv_v1.c b/kernel/riscv64/trsm_uncopy_rvv_v1.c
new file mode 100644
index 000000000..6cca5d49c
--- /dev/null
+++ b/kernel/riscv64/trsm_uncopy_rvv_v1.c
@@ -0,0 +1,121 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE)  // single precision: e32 / m2 variants of the intrinsics
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2  // NOTE(review): not referenced by CNAME in this file
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VSEV_FLOAT_M            __riscv_vse32_v_f32m2_m  // masked store
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m2  // strided load
+#define VBOOL_T                 vbool16_t
+#define UINT_V_T                vuint32m2_t
+#define VID_V_UINT              __riscv_vid_v_u32m2  // vector of lane indices
+#define VMSGTU_VX_UINT          __riscv_vmsgtu_vx_u32m2_b16  // mask of lanes that are (unsigned) above a scalar
+#else  // DOUBLE: the same aliases bound to the e64 / m2 variants
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VSEV_FLOAT_M            __riscv_vse64_v_f64m2_m
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
+#define VBOOL_T                 vbool32_t
+#define UINT_V_T                vuint64m2_t
+#define VID_V_UINT              __riscv_vid_v_u64m2
+#define VMSGTU_VX_UINT          __riscv_vmsgtu_vx_u64m2_b32
+#endif
+
+
+#ifndef UNIT  // INV(a) is the reciprocal of a ...
+#define INV(a) (ONE / (a))
+#else  // ... unless UNIT is defined, in which case it ignores a and yields ONE
+#define INV(a) (ONE)
+#endif
+
+// Vectorized (RVV) variant of ../arm64/trsm_uncopy_sve.c: packs a into b, consuming n in chunks of vl lanes.
+// Diagonal entries (reached when ii == jj) are stored through INV(); positions with ii > jj are left unwritten.
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+    // m bounds the inner walk and n the outer one; a is read with element stride lda; the function always returns 0.
+    BLASLONG i, ii, jj, js;
+    BLASLONG stride_lda = sizeof(FLOAT)*lda;  // the strided-load intrinsic takes its stride in bytes
+
+    FLOAT *ao;  // read cursor inside the current chunk of a
+    jj = offset;  // value of ii at which the current chunk reaches its diagonal block
+
+    FLOAT_V_T va1;
+    VBOOL_T vbool_cmp;
+    UINT_V_T vindex;
+
+    size_t vl;
+    // Outer loop: consume n in chunks of vl lanes, vl being whatever VSETVL grants for the remainder.
+    for (js = n; js > 0; js -= vl)
+    {
+        vl = VSETVL(js);
+        ao = a;
+        // ii restarts for every chunk and advances in lockstep with i.
+        i = 0;  // NOTE(review): redundant - the for statement two lines down resets i as well
+        ii = 0;
+        for (i = 0; i < m;)
+        {
+            if (ii == jj)  // diagonal block: handles vl positions in one go
+            {
+                vindex  = VID_V_UINT(vl);  // lane indices 0 .. vl-1
+                for (unsigned int j = 0; j < vl; j++)  // j: position inside the diagonal block
+                {
+                    *(b + j) = INV(*(ao + j * lda));  // lane j is the diagonal entry; stored through INV()
+                    va1 = VLSEV_FLOAT(ao, stride_lda, vl);
+                    vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl);  // selects lanes with index > j
+                    VSEV_FLOAT_M(vbool_cmp, b, va1, vl);  // lane j is excluded, so the INV() value survives
+                    ao++;
+                    b += vl;
+                }
+                i += vl;  // NOTE(review): no check against m here - presumably callers guarantee the block fits
+                ii += vl;
+            } 
+            else
+            {
+                if (ii < jj)  // before the diagonal: copy all vl lanes
+                {
+                    va1 = VLSEV_FLOAT(ao, stride_lda, vl);
+                    VSEV_FLOAT(b, va1, vl);
+                }
+                ao++;  // both cursors advance even when nothing was copied (ii > jj)
+                b += vl;
+                i++;
+                ii++;
+            }
+        } 
+        // Next chunk: move a forward by vl strides of lda; its diagonal block starts vl steps later.
+        a += vl * lda;
+        jj += vl;
+    }
+    return 0;
+}
diff --git a/kernel/riscv64/trsm_utcopy_rvv_v1.c b/kernel/riscv64/trsm_utcopy_rvv_v1.c
new file mode 100644
index 000000000..bc058525f
--- /dev/null
+++ b/kernel/riscv64/trsm_utcopy_rvv_v1.c
@@ -0,0 +1,123 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE)  // single precision: e32 / m2 variants of the intrinsics
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_V_T               vfloat32m2_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2  // unit-stride load
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VSEV_FLOAT_M            __riscv_vse32_v_f32m2_m  // masked store
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m2  // NOTE(review): not referenced by CNAME in this file
+#define VBOOL_T                 vbool16_t
+#define UINT_V_T                vuint32m2_t
+#define VID_V_UINT              __riscv_vid_v_u32m2  // vector of lane indices
+#define VMSLTU_VX_UINT          __riscv_vmsltu_vx_u32m2_b16  // mask of lanes that are (unsigned) below a scalar
+#else  // DOUBLE: the same aliases bound to the e64 / m2 variants
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VSEV_FLOAT_M            __riscv_vse64_v_f64m2_m
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
+#define VBOOL_T                 vbool32_t
+#define UINT_V_T                vuint64m2_t
+#define VID_V_UINT              __riscv_vid_v_u64m2
+#define VMSLTU_VX_UINT          __riscv_vmsltu_vx_u64m2_b32
+#endif
+
+
+#ifndef UNIT  // INV(a) is the reciprocal of a ...
+#define INV(a) (ONE / (a))
+#else  // ... unless UNIT is defined, in which case it ignores a and yields ONE
+#define INV(a) (ONE)
+#endif
+
+// Vectorized (RVV) variant of ../arm64/trsm_utcopy_sve.c: packs a into b, consuming n in chunks of vl lanes.
+// Diagonal entries (reached when ii == jj) are stored through INV(); positions with ii < jj are left unwritten.
+// m bounds the inner walk and n the outer one; each step reads vl contiguous elements of a; always returns 0.
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+
+    BLASLONG i, ii, jj, js;
+    // ao: read cursor inside the current chunk of a; it moves forward by lda per step.
+    FLOAT *ao;
+    // jj: value of ii at which the current chunk reaches its diagonal block; offset seeds it for the first chunk.
+    jj = offset;
+    FLOAT_V_T va1;
+
+    VBOOL_T vbool_cmp;
+    UINT_V_T vindex;
+
+    size_t vl;
+    // Outer loop: consume n in chunks of vl lanes, vl being whatever VSETVL grants for the remainder.
+    for (js = n; js > 0; js -= vl)
+    {
+        vl = VSETVL(js);
+        ao = a;
+        // ii restarts for every chunk and advances in lockstep with i.
+        ii = 0;
+        for (i = 0; i < m;)
+        {
+            // Three cases: on the diagonal block, after it (copy), before it (skip).
+            if (ii == jj)  // diagonal block: handles vl positions in one go
+            {
+                vindex  = VID_V_UINT(vl);  // lane indices 0 .. vl-1
+                for (unsigned int j = 0; j < vl; j++)  // j: position inside the diagonal block
+                {
+                    va1 = VLEV_FLOAT(ao, vl);
+                    vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl);  // selects lanes with index < j
+                    VSEV_FLOAT_M(vbool_cmp, b, va1, vl);  // only those lanes are copied to b
+                    *(b + j) = INV(*(ao + j));  // lane j is the diagonal entry; stored through INV()
+                    // Lanes above j are not written for this step.
+                    ao += lda;
+                    b += vl;
+                }
+                i += vl;  // NOTE(review): no check against m here - presumably callers guarantee the block fits
+                ii += vl;
+            } 
+            else  // not on the diagonal block
+            {
+                if (ii > jj)  // past the diagonal: copy all vl lanes
+                {
+                    va1 = VLEV_FLOAT(ao, vl);
+                    VSEV_FLOAT(b, va1, vl);
+                }
+                ao += lda;  // both cursors advance even when nothing was copied (ii < jj)
+                b += vl;
+                i ++;
+                ii ++;
+            }
+        }
+        // Next chunk: move a forward by the vl lanes just consumed; its diagonal block starts vl steps later.
+        a += vl;
+        jj += vl;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/zamax_rvv.c b/kernel/riscv64/zamax_rvv.c
new file mode 100644
index 000000000..180cf059a
--- /dev/null
+++ b/kernel/riscv64/zamax_rvv.c
@@ -0,0 +1,124 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <float.h>
+
+#if !defined(DOUBLE)  // single precision: e32 intrinsics, m4 for the working vectors and m1 for the reduction result
+#define VSETVL(n)               __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m4()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e32m1()
+#define FLOAT_V_T               vfloat32m4_t
+#define FLOAT_V_T_M1            vfloat32m1_t
+#define FLOAT_VX2_T             vfloat32m4x2_t  // tuple of two m4 vectors, as produced by the segment loads
+#define VGET_VX2                __riscv_vget_v_f32m4x2_f32m4  // extracts one vector from the tuple
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4x2  // unit-stride two-field segment load
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4x2  // strided two-field segment load
+#define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f32m4_f32m1  // max reduction of an m4 vector into an m1 result
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4  // broadcast a scalar
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFMAXVV_FLOAT_TU        __riscv_vfmax_vv_f32m4_tu  // element-wise max, tail-undisturbed variant
+#define VFADDVV_FLOAT           __riscv_vfadd_vv_f32m4
+#define VFABSV_FLOAT            __riscv_vfabs_v_f32m4
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32  // reads element 0 as a scalar
+#else  // DOUBLE: the same aliases bound to the e64 variants
+#define VSETVL(n)               __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m4()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e64m1()
+#define FLOAT_V_T               vfloat64m4_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define FLOAT_VX2_T             vfloat64m4x2_t
+#define VGET_VX2                __riscv_vget_v_f64m4x2_f64m4
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4x2
+#define VFREDMAXVS_FLOAT        __riscv_vfredmax_vs_f64m4_f64m1
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFMAXVV_FLOAT_TU        __riscv_vfmax_vv_f64m4_tu
+#define VFADDVV_FLOAT           __riscv_vfadd_vv_f64m4
+#define VFABSV_FLOAT            __riscv_vfabs_v_f64m4
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)  // largest |p| + |q| over the n visited pairs (p, q) of x
+{
+    FLOAT maxf=0.0;
+    // Degenerate arguments (n <= 0 or inc_x <= 0): nothing is scanned and 0.0 is returned.
+    if (n <= 0 || inc_x <= 0) return(maxf);
+
+    FLOAT_V_T v0, v1, vmax;
+    FLOAT_V_T_M1 v_res;
+    FLOAT_VX2_T vx2;
+    // Both accumulators start at 0: every candidate is a sum of absolute values, so 0 never exceeds a real result.
+    v_res = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
+    size_t vlmax = VSETVL_MAX;
+    vmax = VFMVVF_FLOAT(0.0, vlmax);
+
+    if(inc_x == 1) {
+        // Contiguous case: pairs sit back to back, so a unit-stride two-field segment load separates them.
+        for (size_t vl; n > 0; n -= vl, x += vl*2) {
+            vl = VSETVL(n);
+
+            vx2 = VLSEG_FLOAT(x, vl);
+            // vx2 now holds two vectors: the first and the second member of each pair.
+            v0 = VGET_VX2(vx2, 0);
+            v1 = VGET_VX2(vx2, 1);
+
+            v0 = VFABSV_FLOAT(v0, vl);
+            v1 = VFABSV_FLOAT(v1, vl);
+
+            v0 = VFADDVV_FLOAT(v0, v1, vl);
+            vmax = VFMAXVV_FLOAT_TU(vmax, vmax, v0, vl);
+            // The _tu variant leaves lanes at or beyond vl as they were, so a short final pass keeps earlier maxima.
+        }
+
+    } else {
+        // Strided case: consecutive pairs are inc_x pairs apart; the segment load takes that distance in bytes.
+        BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
+            vl = VSETVL(n);
+
+            vx2 = VLSSEG_FLOAT(x, stride_x, vl);
+
+            v0 = VGET_VX2(vx2, 0);
+            v1 = VGET_VX2(vx2, 1);
+
+            v0 = VFABSV_FLOAT(v0, vl);
+            v1 = VFABSV_FLOAT(v1, vl);
+
+            v0 = VFADDVV_FLOAT(v0, v1, vl);
+            vmax = VFMAXVV_FLOAT_TU(vmax, vmax, v0, vl);  // same tail-preserving accumulation as the contiguous loop
+        }
+
+    }
+    // Collapse all vlmax lanes of the running maximum into element 0 of v_res, then read it out as a scalar.
+    v_res = VFREDMAXVS_FLOAT(vmax, v_res, vlmax);
+    maxf = VFMVFS_FLOAT_M1(v_res);
+
+    return(maxf);
+}
diff --git a/kernel/riscv64/zamax_vector.c b/kernel/riscv64/zamax_vector.c
index 9dbeba90f..ec4a5a1e9 100644
--- a/kernel/riscv64/zamax_vector.c
+++ b/kernel/riscv64/zamax_vector.c
@@ -28,39 +28,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #include <math.h>
 
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT vlse_v_f32m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f32m8_f32m1
-#define MASK_T vbool4_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
-#define VFMAXVV_FLOAT vfmax_vv_f32m8
-#define VFADDVV_FLOAT vfadd_vv_f32m8
-
+#ifdef RISCV64_ZVL256B
+#       define LMUL m2
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN 32
+#       else
+#               define ELEN 32
+#               define MLEN 16
+#       endif
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT vlse_v_f64m8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f64m8_f64m1
-#define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
-#define VFMAXVV_FLOAT vfmax_vv_f64m8
-#define VFADDVV_FLOAT vfadd_vv_f64m8
+#       define LMUL m8
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN 8
+#       else
+#               define ELEN 32
+#               define MLEN 4
+#       endif
+#endif
+
+#define _  // expands to nothing: filler for JOIN arguments that are not needed
+#define JOIN2_X(x, y) x ## y  // raw token paste
+#define JOIN2(x, y) JOIN2_X(x, y)  // extra call level so x and y are macro-expanded before being pasted
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)  // pastes five pieces, left to right, into one identifier
 
+#define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _)
+#define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
+#define FLOAT_V_T_M1    JOIN(vfloat,            ELEN,   m1,     _t,     _)
+#define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDMAXVS_FLOAT(va,vb,gvl) JOIN(RISCV_RVV(vfredmax_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1)) (v_res, va, vb, gvl)
+#else
+#define VFREDMAXVS_FLOAT JOIN(RISCV_RVV(vfredmax_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
 #endif
+#define MASK_T          JOIN(vbool,             MLEN,   _t,     _,      _)
+#define VFMVVF_FLOAT    JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   m1,     _)
+#define VFMAXVV_FLOAT   JOIN(RISCV_RVV(vfmax),     _vv_f,  ELEN,   LMUL,   _)
+#define VFADDVV_FLOAT   JOIN(RISCV_RVV(vfadd),     _vv_f,  ELEN,   LMUL,   _)
+#define VFABSV_FLOAT   JOIN(RISCV_RVV(vfabs),     _v_f,  ELEN,   LMUL,   _)
 
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
@@ -70,10 +78,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	if (n <= 0 || inc_x <= 0) return(maxf);
         unsigned int gvl = 0;
         FLOAT_V_T v0, v1, v_max;
-        FLOAT_V_T_M1 v_res, v_z0;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_z0 = VFMVVF_FLOAT_M1(0, gvl);
+        FLOAT_V_T_M1 v_res;
+        v_res = VFMVVF_FLOAT_M1(0, 1);
 
         MASK_T mask0, mask1;
         BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
@@ -83,10 +89,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         for(; i<n/gvl; i++){
                 v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                 v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
-                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
-                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
+
+                v0 = VFABSV_FLOAT(v0, gvl);
+                v1 = VFABSV_FLOAT(v1, gvl);
 
                 v0 = VFADDVV_FLOAT(v0, v1, gvl);
                 v_max = VFMAXVV_FLOAT(v_max, v0, gvl);
@@ -94,22 +99,17 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                 j += gvl;
                 ix += inc_xv;
         }
-        v_res = VFREDMAXVS_FLOAT(v_res, v_max, v_z0, gvl);
-        maxf = VFMVFS_FLOAT(v_res);
+        v_res = VFREDMAXVS_FLOAT(v_max, v_res, gvl);
 
         if(j<n){
                 gvl = VSETVL(n-j);
                 v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                 v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
-                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
-                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
+                v0 = VFABSV_FLOAT(v0, gvl);
+                v1 = VFABSV_FLOAT(v1, gvl);
                 v1 = VFADDVV_FLOAT(v0, v1, gvl);
-                v_res = VFREDMAXVS_FLOAT(v_res, v1, v_z0, gvl);
-                             
-                if(VFMVFS_FLOAT(v_res)> maxf)
-                        maxf = VFMVFS_FLOAT(v_res);
+                v_res = VFREDMAXVS_FLOAT(v1, v_res, gvl);
         }
+        maxf = EXTRACT_FLOAT(v_res);
         return(maxf);
 }
diff --git a/kernel/riscv64/zamin_rvv.c b/kernel/riscv64/zamin_rvv.c
new file mode 100644
index 000000000..56a467502
--- /dev/null
+++ b/kernel/riscv64/zamin_rvv.c
@@ -0,0 +1,123 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <float.h>
+
+#if !defined(DOUBLE)  /* single precision: 32-bit elements, LMUL=4 working vectors */
+#define VSETVL(n)               __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m4()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e32m1()
+#define FLOAT_V_T               vfloat32m4_t
+#define FLOAT_V_T_M1            vfloat32m1_t                    /* LMUL=1 vector carrying the reduction result */
+#define FLOAT_VX2_T             vfloat32m4x2_t                  /* two-field tuple returned by the segment loads */
+#define VGET_VX2                __riscv_vget_v_f32m4x2_f32m4
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4x2     /* unit-stride two-field segment load */
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4x2    /* strided two-field segment load */
+#define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f32m4_f32m1
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFMINVV_FLOAT_TU        __riscv_vfmin_vv_f32m4_tu       /* _tu = tail-undisturbed variant */
+#define VFADDVV_FLOAT           __riscv_vfadd_vv_f32m4
+#define VFABSV_FLOAT            __riscv_vfabs_v_f32m4
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
+#else  /* DOUBLE: same macro set mapped to the 64-bit element intrinsics */
+#define VSETVL(n)               __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m4()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e64m1()
+#define FLOAT_V_T               vfloat64m4_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define FLOAT_VX2_T             vfloat64m4x2_t
+#define VGET_VX2                __riscv_vget_v_f64m4x2_f64m4
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4x2
+#define VFREDMINVS_FLOAT        __riscv_vfredmin_vs_f64m4_f64m1
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFMINVV_FLOAT_TU        __riscv_vfmin_vv_f64m4_tu
+#define VFADDVV_FLOAT           __riscv_vfadd_vv_f64m4
+#define VFABSV_FLOAT            __riscv_vfabs_v_f64m4
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
+#endif /* !defined(DOUBLE) */
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    FLOAT minf=0.0;
+
+    if (n <= 0 || inc_x <= 0) return(minf);
+
+    FLOAT_V_T v0, v1, vmin;
+    FLOAT_V_T_M1 v_res;
+    FLOAT_VX2_T vx2;
+
+    v_res = VFMVVF_FLOAT_M1(FLT_MAX, VSETVL_MAX_M1);
+    size_t vlmax = VSETVL_MAX;
+    vmin = VFMVVF_FLOAT(FLT_MAX, vlmax);
+
+    if(inc_x == 1) {
+
+        for (size_t vl; n > 0; n -= vl, x += vl*2) {
+            vl = VSETVL(n);
+
+            vx2 = VLSEG_FLOAT(x, vl);
+            
+            v0 = VGET_VX2(vx2, 0);
+            v1 = VGET_VX2(vx2, 1);
+
+            v0 = VFABSV_FLOAT(v0, vl);
+            v1 = VFABSV_FLOAT(v1, vl);
+
+            v0 = VFADDVV_FLOAT(v0, v1, vl);
+            vmin = VFMINVV_FLOAT_TU(vmin, vmin, v0, vl);
+        }
+
+    } else {
+
+        BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
+            vl = VSETVL(n);
+
+            vx2 = VLSSEG_FLOAT(x, stride_x, vl);
+
+            v0 = VGET_VX2(vx2, 0);
+            v1 = VGET_VX2(vx2, 1);
+
+            v0 = VFABSV_FLOAT(v0, vl);
+            v1 = VFABSV_FLOAT(v1, vl);
+
+            v0 = VFADDVV_FLOAT(v0, v1, vl);
+            vmin = VFMINVV_FLOAT_TU(vmin, vmin, v0, vl);
+        }
+
+    }
+
+    v_res = VFREDMINVS_FLOAT(vmin, v_res, vlmax);
+    minf = VFMVFS_FLOAT_M1(v_res);
+
+    return(minf);
+}
diff --git a/kernel/riscv64/zamin_vector.c b/kernel/riscv64/zamin_vector.c
index dc58075ac..45b3e0b9d 100644
--- a/kernel/riscv64/zamin_vector.c
+++ b/kernel/riscv64/zamin_vector.c
@@ -29,38 +29,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <math.h>
 #include <float.h>
 
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT vlse_v_f32m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f32m8_f32m1
-#define MASK_T vbool4_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
-#define VFMINVV_FLOAT vfmin_vv_f32m8
-#define VFADDVV_FLOAT vfadd_vv_f32m8
+
+#ifdef RISCV64_ZVL256B
+#       define LMUL m2
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN 32
+#       else
+#               define ELEN 32
+#               define MLEN 16
+#       endif
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT vlse_v_f64m8
-#define VFREDMINVS_FLOAT vfredmin_vs_f64m8_f64m1
-#define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
-#define VFMINVV_FLOAT vfmin_vv_f64m8
-#define VFADDVV_FLOAT vfadd_vv_f64m8
+#       define LMUL m8
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN 8
+#       else
+#               define ELEN 32
+#               define MLEN 4
+#       endif
 #endif
 
+#define _  /* expands to nothing: fills unused trailing slots of JOIN() */
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)  /* extra level so x and y are macro-expanded before pasting */
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)  /* pastes five pieces left to right */
+/* Type and intrinsic names below are assembled from the ELEN / LMUL / MLEN values chosen above. */
+#define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _)
+#define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
+#define FLOAT_V_T_M1    JOIN(vfloat,            ELEN,   m1,     _t,     _)
+#define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
+#ifdef RISCV_0p10_INTRINSICS  /* this variant passes one extra leading argument: the caller's local v_res */
+#define VFREDMINVS_FLOAT(va,vb,gvl) JOIN(RISCV_RVV(vfredmin_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1)) (v_res, va, vb, gvl)
+#else
+#define VFREDMINVS_FLOAT JOIN(RISCV_RVV(vfredmin_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
+#endif
+#define MASK_T          JOIN(vbool,             MLEN,   _t,     _,      _)  /* NOTE(review): the visible hunks drop every use of mask0/mask1 - confirm MASK_T is still needed */
+#define VFMVVF_FLOAT    JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   m1,     _)
+#define VFMINVV_FLOAT   JOIN(RISCV_RVV(vfmin),     _vv_f,  ELEN,   LMUL,   _)
+#define VFADDVV_FLOAT   JOIN(RISCV_RVV(vfadd),     _vv_f,  ELEN,   LMUL,   _)
+#define VFABSV_FLOAT   JOIN(RISCV_RVV(vfabs),     _v_f,  ELEN,   LMUL,   _)
+
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG i=0, j=0;
@@ -69,10 +80,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	FLOAT minf=FLT_MAX;
         unsigned int gvl = 0;
         FLOAT_V_T v0, v1, v_min;
-        FLOAT_V_T_M1 v_res, v_max;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_max = VFMVVF_FLOAT_M1(FLT_MAX, gvl);
+        FLOAT_V_T_M1 v_res;
+        v_res = VFMVVF_FLOAT_M1(FLT_MAX, 1);
 
         MASK_T mask0, mask1;
         BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
@@ -82,10 +91,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         for(; i<n/gvl; i++){
                 v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                 v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
-                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
-                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
+
+                v0 = VFABSV_FLOAT(v0, gvl);
+                v1 = VFABSV_FLOAT(v1, gvl);
 
                 v0 = VFADDVV_FLOAT(v0, v1, gvl);
                 v_min = VFMINVV_FLOAT(v_min, v0, gvl);
@@ -93,21 +101,18 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                 j += gvl;
                 ix += inc_xv;
         }
-        v_res = VFREDMINVS_FLOAT(v_res, v_min, v_max, gvl);
-        minf = VFMVFS_FLOAT(v_res);
+        v_res = VFREDMINVS_FLOAT(v_min, v_res, gvl);
 
         if(j<n){
                 gvl = VSETVL(n-j);
                 v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                 v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
-                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
-                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
+                v0 = VFABSV_FLOAT(v0, gvl);
+                v1 = VFABSV_FLOAT(v1, gvl);
                 v1 = VFADDVV_FLOAT(v0, v1, gvl);
-                v_res = VFREDMINVS_FLOAT(v_res, v1, v_max, gvl);
-                if(VFMVFS_FLOAT(v_res) < minf)
-                        minf = VFMVFS_FLOAT(v_res);
+                v_res = VFREDMINVS_FLOAT(v1, v_res, gvl);
         }
+
+        minf = EXTRACT_FLOAT(v_res);
         return(minf);
 }
diff --git a/kernel/riscv64/zasum_rvv.c b/kernel/riscv64/zasum_rvv.c
new file mode 100644
index 000000000..ebec1b19c
--- /dev/null
+++ b/kernel/riscv64/zasum_rvv.c
@@ -0,0 +1,107 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)  /* single precision: 32-bit elements, LMUL=8 working vectors */
+#define VSETVL(n)               __riscv_vsetvl_e32m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m8()
+#define FLOAT_V_T               vfloat32m8_t
+#define FLOAT_V_T_M1            vfloat32m1_t                      /* LMUL=1 vector carrying the reduction result */
+#define VLEV_FLOAT              __riscv_vle32_v_f32m8             /* unit-stride load */
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m8            /* strided load, stride given in bytes */
+#define VFREDSUMVS_FLOAT        __riscv_vfredusum_vs_f32m8_f32m1  /* unordered sum reduction */
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
+#define VFADDVV_FLOAT_TU        __riscv_vfadd_vv_f32m8_tu         /* _tu = tail-undisturbed variant */
+#define VFABSV_FLOAT            __riscv_vfabs_v_f32m8
+#else  /* DOUBLE: same macro set mapped to the 64-bit element intrinsics */
+#define VSETVL(n)               __riscv_vsetvl_e64m8(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m8()
+#define FLOAT_V_T               vfloat64m8_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m8
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
+#define VFREDSUMVS_FLOAT        __riscv_vfredusum_vs_f64m8_f64m1
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
+#define VFADDVV_FLOAT_TU        __riscv_vfadd_vv_f64m8_tu
+#define VFABSV_FLOAT            __riscv_vfabs_v_f64m8
+#endif /* !defined(DOUBLE) */
+/* Sum of |Re(x[i])| + |Im(x[i])| over n complex elements of x; returns 0 when n <= 0 or inc_x <= 0. */
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    FLOAT asumf = 0.0;
+    if (n <= 0 || inc_x <= 0) return(asumf);
+
+    FLOAT_V_T v0, v1;
+    size_t vlmax = VSETVL_MAX;
+    FLOAT_V_T v_sum = VFMVVF_FLOAT(0, vlmax);
+
+    if(inc_x == 1) {
+        /* Contiguous input: each pass consumes 2*vl reals as two back-to-back runs of vl values. */
+        for (size_t vl; n > 0; n -= vl, x += vl*2) {
+            vl = VSETVL(n);
+
+            v0 = VLEV_FLOAT(x, vl);
+            v1 = VLEV_FLOAT(x+vl, vl);
+
+            v0 = VFABSV_FLOAT(v0, vl);
+            v1 = VFABSV_FLOAT(v1, vl);
+            /* Tail-undisturbed adds: lanes past vl keep the partial sums from earlier passes. */
+            v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl);
+            v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl);
+        }
+
+    }
+    else {
+        /* Byte stride between complex elements; BLASLONG so a large inc_x is not truncated to int. */
+        BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
+            vl = VSETVL(n);
+
+            v0 = VLSEV_FLOAT(x, stride_x, vl);
+            v1 = VLSEV_FLOAT(x+1, stride_x, vl);
+
+            v0 = VFABSV_FLOAT(v0, vl);
+            v1 = VFABSV_FLOAT(v1, vl);
+
+            v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v0, vl);
+            v_sum = VFADDVV_FLOAT_TU(v_sum, v_sum, v1, vl);
+        }
+
+    }
+    /* Fold all vlmax per-lane partial sums into a single scalar. */
+    FLOAT_V_T_M1 v_res = VFMVVF_FLOAT_M1(0, vlmax);
+    v_res = VFREDSUMVS_FLOAT(v_sum, v_res, vlmax);
+    asumf += VFMVFS_FLOAT_M1(v_res);
+
+    return(asumf);
+}
diff --git a/kernel/riscv64/zasum_vector.c b/kernel/riscv64/zasum_vector.c
index 8386ab62e..fca904d6a 100644
--- a/kernel/riscv64/zasum_vector.c
+++ b/kernel/riscv64/zasum_vector.c
@@ -28,37 +28,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #include <math.h>
 
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VFFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
-#define VFREDSUMVS_FLOAT vfredusum_vs_f32m8_f32m1
-#define MASK_T vbool4_t
-#define VMFLTVF_FLOAT vmflt_vf_f32m8_b4
-#define VFMVVF_FLOAT vfmv_v_f_f32m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m8_m
-#define VFADDVV_FLOAT vfadd_vv_f32m8
+#ifdef RISCV64_ZVL256B
+#       define LMUL m2
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN _b32
+#       else
+#               define ELEN 32
+#               define MLEN _b16
+#       endif
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m8_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VFFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
-#define VFREDSUMVS_FLOAT vfredusum_vs_f64m8_f64m1
-#define MASK_T vbool8_t
-#define VMFLTVF_FLOAT vmflt_vf_f64m8_b8
-#define VFMVVF_FLOAT vfmv_v_f_f64m8
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m8_m
-#define VFADDVV_FLOAT vfadd_vv_f64m8
+#       define LMUL m8
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN _b8
+#       else
+#               define ELEN 32
+#               define MLEN _b4
+#       endif
 #endif
+
+#define _  /* expands to nothing: fills unused trailing slots of JOIN() */
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)  /* extra level so x and y are macro-expanded before pasting */
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)  /* pastes five pieces left to right */
+/* Type and intrinsic names below are assembled from the ELEN / LMUL / MLEN values chosen above. */
+#define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _)
+#define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
+#define FLOAT_V_T_M1    JOIN(vfloat,            ELEN,   m1,     _t,     _)
+#define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
+#ifdef RISCV_0p10_INTRINSICS  /* this variant passes one extra leading argument: the caller's local v_res */
+#define VFREDSUMVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredusum_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))(v_res, va, vb, gvl)
+#else
+#define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
+#endif
+#define VFABS_FLOAT     JOIN(RISCV_RVV(vfabs),     _v_f,   ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT    JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   m1,     _)
+#define VFADDVV_FLOAT   JOIN(RISCV_RVV(vfadd),     _vv_f,  ELEN,   LMUL,   _)
+#define VMFLTVF_FLOAT   JOIN(RISCV_RVV(vmflt),     _vf_f,  ELEN,   LMUL,   MLEN)  /* NOTE(review): the visible hunks remove every VMFLTVF_FLOAT call - confirm this (and MLEN) is still needed */
+
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG i=0, j=0;
@@ -67,12 +77,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	if (n <= 0 || inc_x <= 0) return(asumf);
         unsigned int gvl = 0;
         FLOAT_V_T v0, v1, v_zero,v_sum;
-        FLOAT_V_T_M1 v_res, v_z0;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_z0 = VFMVVF_FLOAT_M1(0, gvl);
+        FLOAT_V_T_M1 v_res;
+        v_res = VFMVVF_FLOAT_M1(0, 1);
 
-        MASK_T mask0, mask1;
         if(inc_x == 1){
                 BLASLONG n2 = n * 2;
                 gvl = VSETVL(n2);
@@ -81,26 +88,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                         v_sum = VFMVVF_FLOAT(0, gvl);
                         for(i=0,j=0; i<n2/(gvl*2); i++){
                                 v0 = VLEV_FLOAT(&x[j], gvl);
-                                mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                                v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
+                                v0 = VFABS_FLOAT(v0, gvl);
                                 v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
 
                                 v1 = VLEV_FLOAT(&x[j+gvl], gvl);
-                                mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                                v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
+                                v1 = VFABS_FLOAT(v1, gvl);
                                 v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
                                 j += gvl * 2;
                         }
-                        v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl);
-                        asumf += VFFMVFS_FLOAT(v_res);
+                        v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
                 }
                 for(;j<n2;){
                         gvl = VSETVL(n2-j);
                         v0 = VLEV_FLOAT(&x[j], gvl);
-                        mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                        v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
-                        v_res = VFREDSUMVS_FLOAT(v_res, v0, v_z0, gvl);
-                        asumf += VFFMVFS_FLOAT(v_res);
+                        v0 = VFABS_FLOAT(v0, gvl);
+                        v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl);
                         j += gvl;
                 }
         }else{
@@ -112,34 +114,29 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
                 v_sum = VFMVVF_FLOAT(0, gvl);
                 for(i=0,j=0; i<n/gvl; i++){
                         v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-                        mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                        v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
+                        v0 = VFABS_FLOAT(v0, gvl);
                         v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
 
                         v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
-                        mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                        v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
+                        v1 = VFABS_FLOAT(v1, gvl);
                         v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
 
                         j += gvl;
                         ix += inc_xv;
                 }
-                v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl);
-                asumf += VFFMVFS_FLOAT(v_res);
+                v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
                 if(j<n){
                         gvl = VSETVL(n-j);
                         v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-                        mask0 = VMFLTVF_FLOAT(v0, 0, gvl);
-                        v0 = VFRSUBVF_MASK_FLOAT(mask0, v0, v0, 0, gvl);
-                        v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+                        v0 = VFABS_FLOAT(v0, gvl);
 
-                        mask1 = VMFLTVF_FLOAT(v1, 0, gvl);
-                        v1 = VFRSUBVF_MASK_FLOAT(mask1, v1, v1, 0, gvl);
+                        v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+                        v1 = VFABS_FLOAT(v1, gvl);
                         v_sum = VFADDVV_FLOAT(v0, v1, gvl);
-                        v_res = VFREDSUMVS_FLOAT(v_res, v_sum, v_z0, gvl);
-                        asumf += VFFMVFS_FLOAT(v_res);
+                        v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
                 }
         }
+        asumf = EXTRACT_FLOAT(v_res);
 	return(asumf);
 }
 
diff --git a/kernel/riscv64/zaxpby_rvv.c b/kernel/riscv64/zaxpby_rvv.c
new file mode 100644
index 000000000..66e38c1e4
--- /dev/null
+++ b/kernel/riscv64/zaxpby_rvv.c
@@ -0,0 +1,180 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/***************************************************************************
+* 2014/06/07 Saar
+*
+***************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)  /* single precision: 32-bit elements, LMUL=4 working vectors */
+#define VSETVL(n)               __riscv_vsetvl_e32m4(n)
+#define FLOAT_V_T               vfloat32m4_t
+#define FLOAT_VX2_T             vfloat32m4x2_t                  /* two-field tuple used by the segment loads/stores */
+#define VGET_VX2                __riscv_vget_v_f32m4x2_f32m4    /* read one field of a tuple */
+#define VSET_VX2                __riscv_vset_v_f32m4_f32m4x2    /* write one field of a tuple */
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m4          /* not referenced by the function in this file */
+#define VSSEV_FLOAT             __riscv_vsse32_v_f32m4          /* not referenced by the function in this file */
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m4
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m4
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m4
+#define VFMSACVF_FLOAT          __riscv_vfmsac_vf_f32m4
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4x2     /* not referenced by the function in this file */
+#define VSSEG_FLOAT             __riscv_vsseg2e32_v_f32m4x2     /* not referenced by the function in this file */
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4x2    /* strided two-field segment load */
+#define VSSSEG_FLOAT            __riscv_vssseg2e32_v_f32m4x2    /* strided two-field segment store */
+#else  /* DOUBLE: same macro set mapped to the 64-bit element intrinsics */
+#define VSETVL(n)               __riscv_vsetvl_e64m4(n)
+#define FLOAT_V_T               vfloat64m4_t
+#define FLOAT_VX2_T             vfloat64m4x2_t
+#define VGET_VX2                __riscv_vget_v_f64m4x2_f64m4
+#define VSET_VX2                __riscv_vset_v_f64m4_f64m4x2
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m4
+#define VSSEV_FLOAT             __riscv_vsse64_v_f64m4
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m4
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m4
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m4
+#define VFMSACVF_FLOAT          __riscv_vfmsac_vf_f64m4
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4x2
+#define VSSEG_FLOAT             __riscv_vsseg2e64_v_f64m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4x2
+#define VSSSEG_FLOAT            __riscv_vssseg2e64_v_f64m4x2
+#endif /* !defined(DOUBLE) */
+/* Complex y := alpha*x + beta*y over n elements (re/im pairs, increments inc_x / inc_y); always returns 0. */
+int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i,FLOAT *y, BLASLONG inc_y)
+{
+    BLASLONG inc_x2, inc_y2;
+
+    if ( n <= 0     )  return(0);
+    /* NOTE(review): inc_x == 0 / inc_y == 0 get no special handling here - confirm callers never pass them. */
+    inc_x2 = 2 * inc_x;
+    inc_y2 = 2 * inc_y;
+    /* Element-to-element distances in bytes, as taken by the strided segment loads/stores. */
+    BLASLONG stride_x = inc_x2 * sizeof(FLOAT);
+    BLASLONG stride_y = inc_y2 * sizeof(FLOAT);
+    FLOAT_V_T vx0, vx1, vy0, vy1;
+    FLOAT_VX2_T vxx2, vyx2;
+    /* A zero alpha or zero beta gets its own branch in which the vector it would scale is never loaded. */
+    if ( beta_r == 0.0 && beta_i == 0.0)
+    {
+        if ( alpha_r == 0.0 && alpha_i == 0.0 )
+        {
+            size_t vl = VSETVL(n);
+            FLOAT_V_T temp = VFMVVF_FLOAT(0.0, vl);
+            vxx2 = VSET_VX2(vxx2, 0, temp);  /* NOTE(review): vxx2 is read before its first assignment; both fields are overwritten by these two calls */
+            vxx2 = VSET_VX2(vxx2, 1, temp);
+            for ( ; n > 0; n -= vl, y += vl*inc_y2)
+            {
+                vl = VSETVL(n);
+                VSSSEG_FLOAT(y, stride_y, vxx2, vl);
+            }
+        }
+        else
+        {
+            for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2) 
+            {
+                vl = VSETVL(n);
+
+                vxx2 = VLSSEG_FLOAT(x, stride_x, vl);
+                vx0 = VGET_VX2(vxx2, 0);
+                vx1 = VGET_VX2(vxx2, 1);
+                /* y_re = alpha_r*x_re - alpha_i*x_im ; y_im = alpha_r*x_im + alpha_i*x_re */
+                vy0 = VFMULVF_FLOAT(vx1, alpha_i, vl);
+                vy0 = VFMSACVF_FLOAT(vy0, alpha_r, vx0, vl);
+
+                vy1 = VFMULVF_FLOAT(vx1, alpha_r, vl);
+                vy1 = VFMACCVF_FLOAT(vy1, alpha_i, vx0, vl);
+
+                vyx2 = VSET_VX2(vyx2, 0, vy0);
+                vyx2 = VSET_VX2(vyx2, 1, vy1);
+                VSSSEG_FLOAT(y, stride_y, vyx2, vl);
+            }
+        }
+    }
+    else
+    {
+        FLOAT_V_T v0, v1;
+        FLOAT_VX2_T v_x2;
+
+        if ( alpha_r == 0.0 && alpha_i == 0.0 )
+        {
+            for (size_t vl; n > 0; n -= vl, y += vl*inc_y2) 
+            {
+                vl = VSETVL(n);
+
+                vyx2 = VLSSEG_FLOAT(y, stride_y, vl);
+                vy0 = VGET_VX2(vyx2, 0);
+                vy1 = VGET_VX2(vyx2, 1);
+                /* y_re = beta_r*y_re - beta_i*y_im ; y_im = beta_r*y_im + beta_i*y_re */
+                v0 = VFMULVF_FLOAT(vy1, beta_i, vl);
+                v0 = VFMSACVF_FLOAT(v0, beta_r, vy0, vl);
+
+                v1 = VFMULVF_FLOAT(vy1, beta_r, vl);
+                v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl);
+
+                v_x2 = VSET_VX2(v_x2, 0, v0);
+                v_x2 = VSET_VX2(v_x2, 1, v1);
+                VSSSEG_FLOAT(y, stride_y, v_x2, vl);
+            }
+        }
+        else
+        {
+            for (size_t vl; n > 0; n -= vl, x += vl*inc_x2, y += vl*inc_y2) 
+            {
+                vl = VSETVL(n);
+
+                vxx2 = VLSSEG_FLOAT(x, stride_x, vl);
+                vyx2 = VLSSEG_FLOAT(y, stride_y, vl);
+
+                vx0 = VGET_VX2(vxx2, 0);
+                vx1 = VGET_VX2(vxx2, 1);
+                vy0 = VGET_VX2(vyx2, 0);
+                vy1 = VGET_VX2(vyx2, 1);
+                /* y_re = alpha_r*x_re - alpha_i*x_im + beta_r*y_re - beta_i*y_im */
+                v0 = VFMULVF_FLOAT(vx0, alpha_r, vl);
+                v0 = VFNMSACVF_FLOAT(v0, alpha_i, vx1, vl);
+                v0 = VFMACCVF_FLOAT(v0, beta_r, vy0, vl);
+                v0 = VFNMSACVF_FLOAT(v0, beta_i, vy1, vl);
+                /* y_im = alpha_r*x_im + alpha_i*x_re + beta_r*y_im + beta_i*y_re */
+                v1 = VFMULVF_FLOAT(vx1, alpha_r, vl);
+                v1 = VFMACCVF_FLOAT(v1, alpha_i, vx0, vl);
+                v1 = VFMACCVF_FLOAT(v1, beta_r, vy1, vl);
+                v1 = VFMACCVF_FLOAT(v1, beta_i, vy0, vl);
+
+                v_x2 = VSET_VX2(v_x2, 0, v0);
+                v_x2 = VSET_VX2(v_x2, 1, v1);
+
+                VSSSEG_FLOAT(y, stride_y, v_x2, vl);
+            }
+        }
+    }
+    return(0);
+
+}
diff --git a/kernel/riscv64/zaxpby_vector.c b/kernel/riscv64/zaxpby_vector.c
index 3eca20415..bbf2bbe7d 100644
--- a/kernel/riscv64/zaxpby_vector.c
+++ b/kernel/riscv64/zaxpby_vector.c
@@ -28,25 +28,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMULVF_FLOAT vfmul_vf_f32m4
-#define VFMSACVF_FLOAT vfmsac_vf_f32m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4)
+#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4)
+#define VFMSACVF_FLOAT RISCV_RVV(vfmsac_vf_f32m4)
+#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4)
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMULVF_FLOAT vfmul_vf_f64m4
-#define VFMSACVF_FLOAT vfmsac_vf_f64m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4)
+#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4)
+#define VFMSACVF_FLOAT RISCV_RVV(vfmsac_vf_f64m4)
+#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4)
 #endif
 
 int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i, FLOAT *y, BLASLONG inc_y)
@@ -62,6 +62,82 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL
         stride_x = inc_x * 2 * sizeof(FLOAT);
         stride_y = inc_y * 2 * sizeof(FLOAT);
 
+	if (inc_x == 0 || inc_y == 0) {
+
+	FLOAT temp;
+	BLASLONG inc_x2, inc_y2;
+
+	inc_x2 = 2 * inc_x;
+	inc_y2 = 2 * inc_y;
+
+	if ( beta_r == 0.0 && beta_i == 0.0)
+	{
+		if ( alpha_r == 0.0 && alpha_i == 0.0 )
+		{
+
+			while(i < n)
+			{
+				y[iy]   = 0.0 ;
+				y[iy+1] = 0.0 ;
+				iy += inc_y2 ;
+				i++ ;
+			}
+
+		}
+		else
+		{
+
+			while(i < n)
+			{
+				y[iy]   = ( alpha_r * x[ix]   - alpha_i * x[ix+1] ) ;
+				y[iy+1] = ( alpha_r * x[ix+1] + alpha_i * x[ix]   ) ;
+				ix += inc_x2 ;
+				iy += inc_y2 ;
+				i++ ;
+			}
+
+
+		}
+
+	}
+	else
+	{
+		if ( alpha_r == 0.0 && alpha_i == 0.0 )
+		{
+
+			while(i < n)
+			{
+				temp    = ( beta_r * y[iy]   - beta_i * y[iy+1] ) ;
+				y[iy+1] = ( beta_r * y[iy+1] + beta_i * y[iy]   ) ;
+				y[iy]   = temp;
+				iy += inc_y2 ;
+				i++ ;
+			}
+
+		}
+		else
+		{
+
+			while(i < n)
+			{
+				temp    = ( alpha_r * x[ix]   - alpha_i * x[ix+1] ) + ( beta_r * y[iy]   - beta_i * y[iy+1] ) ;
+				y[iy+1] = ( alpha_r * x[ix+1] + alpha_i * x[ix]   ) + ( beta_r * y[iy+1] + beta_i * y[iy]   ) ;
+				y[iy]   = temp;
+				ix += inc_x2 ;
+				iy += inc_y2 ;
+				i++ ;
+			}
+
+
+		}
+
+
+
+	}
+	return(0);
+
+	} else {
+
         if(beta_r == 0.0 && beta_i == 0.0){
                 if(alpha_r == 0.0 && alpha_i == 0.0){
                         if(inc_y == 1){
@@ -191,5 +267,6 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL
                 }
         }
 	return(0);
+	}
 }
 
diff --git a/kernel/riscv64/zaxpy.c b/kernel/riscv64/zaxpy.c
index 1dcaeac27..18b6315cb 100644
--- a/kernel/riscv64/zaxpy.c
+++ b/kernel/riscv64/zaxpy.c
@@ -44,7 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 	BLASLONG inc_x2;
 	BLASLONG inc_y2;
 
-	if ( n < 0     )  return(0);
+	if ( n <= 0     )  return(0);
 	if ( da_r == 0.0 && da_i == 0.0 ) return(0);
 
 	ix = 0;
diff --git a/kernel/riscv64/zaxpy_rvv.c b/kernel/riscv64/zaxpy_rvv.c
new file mode 100644
index 000000000..0db32df10
--- /dev/null
+++ b/kernel/riscv64/zaxpy_rvv.c
@@ -0,0 +1,190 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n)               __riscv_vsetvl_e32m4(n)
+#define FLOAT_V_T               vfloat32m4_t
+#define FLOAT_VX2_T             vfloat32m4x2_t
+#define VGET_VX2                __riscv_vget_v_f32m4x2_f32m4
+#define VSET_VX2                __riscv_vset_v_f32m4_f32m4x2
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4x2
+#define VSSEG_FLOAT             __riscv_vsseg2e32_v_f32m4x2
+#define VSSSEG_FLOAT            __riscv_vssseg2e32_v_f32m4x2
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m4
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m4
+#else
+#define VSETVL(n)               __riscv_vsetvl_e64m4(n)
+#define FLOAT_V_T               vfloat64m4_t
+#define FLOAT_VX2_T             vfloat64m4x2_t
+#define VGET_VX2                __riscv_vget_v_f64m4x2_f64m4
+#define VSET_VX2                __riscv_vset_v_f64m4_f64m4x2
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4x2
+#define VSSEG_FLOAT             __riscv_vsseg2e64_v_f64m4x2
+#define VSSSEG_FLOAT            __riscv_vssseg2e64_v_f64m4x2
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m4
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m4
+#endif
+
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+    if(n <= 0) return(0);                     /* empty length: leave y untouched */
+    if(da_r == 0.0 && da_i == 0.0) return(0); /* a zero scale factor adds nothing to y */
+    /* Complex AXPY: y += da * x, or y += da * conj(x) when CONJ is defined. dummy0/1/2 and dummy are not read. */
+    FLOAT_V_T vx0, vx1, vy0, vy1;
+    FLOAT_VX2_T vxx2, vyx2;
+    /* The same update is written four times, specialised on which operands can use unit-stride segment accesses. */
+    if(inc_x == 1 && inc_y == 1) {
+        /* x and y both contiguous: each pass consumes vl complex elements, i.e. 2*vl FLOATs per pointer. */
+        for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) {
+            vl = VSETVL(n);
+            /* Two-field segment loads de-interleave the pairs: field 0 is used as the real part, field 1 as the imaginary part. */
+            vxx2 = VLSEG_FLOAT(x, vl);
+            vyx2 = VLSEG_FLOAT(y, vl);
+
+            vx0 = VGET_VX2(vxx2, 0);
+            vx1 = VGET_VX2(vxx2, 1);
+            vy0 = VGET_VX2(vyx2, 0);
+            vy1 = VGET_VX2(vyx2, 1);
+            /* vy0 accumulates the real part and vy1 the imaginary part; CONJ flips the sign of every x-imaginary term. */
+        #if !defined(CONJ)
+            vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
+            vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl);
+            vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl);
+            vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
+        #else
+            vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
+            vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl);
+            vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl);
+            vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
+        #endif
+            vyx2 = VSET_VX2(vyx2, 0, vy0);
+            vyx2 = VSET_VX2(vyx2, 1, vy1);
+            VSSEG_FLOAT(y, vyx2, vl);
+        }
+
+    } else if (inc_x == 1) {
+        /* x contiguous, y strided: stride_y is the byte distance between consecutive complex elements of y. */
+        BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) {
+            vl = VSETVL(n);
+
+            vxx2 = VLSEG_FLOAT(x, vl);
+            vyx2 = VLSSEG_FLOAT(y, stride_y, vl);
+
+            vx0 = VGET_VX2(vxx2, 0);
+            vx1 = VGET_VX2(vxx2, 1);
+            vy0 = VGET_VX2(vyx2, 0);
+            vy1 = VGET_VX2(vyx2, 1);
+
+        #if !defined(CONJ)
+            vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
+            vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl);
+            vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl);
+            vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
+        #else
+            vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
+            vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl);
+            vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl);
+            vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
+        #endif
+            vyx2 = VSET_VX2(vyx2, 0, vy0);
+            vyx2 = VSET_VX2(vyx2, 1, vy1);
+            VSSSEG_FLOAT(y, stride_y, vyx2, vl);
+        }
+
+    } else if (inc_y == 1) {
+        /* x strided, y contiguous: stride_x is the byte distance between consecutive complex elements of x. */
+        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) {
+            vl = VSETVL(n);
+
+            vxx2 = VLSSEG_FLOAT(x, stride_x, vl);
+            vyx2 = VLSEG_FLOAT(y, vl);
+
+            vx0 = VGET_VX2(vxx2, 0);
+            vx1 = VGET_VX2(vxx2, 1);
+            vy0 = VGET_VX2(vyx2, 0);
+            vy1 = VGET_VX2(vyx2, 1);
+
+        #if !defined(CONJ)
+            vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
+            vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl);
+            vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl);
+            vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
+        #else
+            vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
+            vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl);
+            vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl);
+            vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
+        #endif
+            vyx2 = VSET_VX2(vyx2, 0, vy0);
+            vyx2 = VSET_VX2(vyx2, 1, vy1);
+            VSSEG_FLOAT(y, vyx2, vl);
+        }
+
+    } else {
+        /* General case: both operands use strided segment accesses. */
+        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
+        BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) {
+            vl = VSETVL(n);
+
+            vxx2 = VLSSEG_FLOAT(x, stride_x, vl);
+            vyx2 = VLSSEG_FLOAT(y, stride_y, vl);
+
+            vx0 = VGET_VX2(vxx2, 0);
+            vx1 = VGET_VX2(vxx2, 1);
+            vy0 = VGET_VX2(vyx2, 0);
+            vy1 = VGET_VX2(vyx2, 1);
+
+        #if !defined(CONJ)
+            vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
+            vy0 = VFNMSACVF_FLOAT(vy0, da_i, vx1, vl);
+            vy1 = VFMACCVF_FLOAT(vy1, da_r, vx1, vl);
+            vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
+        #else
+            vy0 = VFMACCVF_FLOAT(vy0, da_r, vx0, vl);
+            vy0 = VFMACCVF_FLOAT(vy0, da_i, vx1, vl);
+            vy1 = VFNMSACVF_FLOAT(vy1, da_r, vx1, vl);
+            vy1 = VFMACCVF_FLOAT(vy1, da_i, vx0, vl);
+        #endif
+            vyx2 = VSET_VX2(vyx2, 0, vy0);
+            vyx2 = VSET_VX2(vyx2, 1, vy1);
+            VSSSEG_FLOAT(y, stride_y, vyx2, vl);
+        }
+
+    }
+
+    return(0);
+}
diff --git a/kernel/riscv64/zaxpy_vector.c b/kernel/riscv64/zaxpy_vector.c
index 303d3541e..1e766c5f4 100644
--- a/kernel/riscv64/zaxpy_vector.c
+++ b/kernel/riscv64/zaxpy_vector.c
@@ -28,27 +28,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4)
+#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4)
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
+#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4)
 #endif
 
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
 {
-	BLASLONG i = 0, j = 0;
-	BLASLONG ix = 0,iy = 0;
-	if(n < 0) return(0);
-	if(da_r == 0.0 && da_i == 0.0) return(0);
+        BLASLONG i = 0, j = 0;
+        BLASLONG ix = 0,iy = 0;
+        if(n <= 0) return(0);
+        if(da_r == 0.0 && da_i == 0.0) return(0);
         unsigned int gvl = 0;
         BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
         BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
diff --git a/kernel/riscv64/zcopy.c b/kernel/riscv64/zcopy.c
index 07fe584c5..b0f19efd5 100644
--- a/kernel/riscv64/zcopy.c
+++ b/kernel/riscv64/zcopy.c
@@ -43,7 +43,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
 	BLASLONG inc_x2;
 	BLASLONG inc_y2;
 
-	if ( n < 0     )  return(0);
+	if ( n <= 0     )  return(0);
 
 	inc_x2 = 2 * inc_x;
 	inc_y2 = 2 * inc_y;
diff --git a/kernel/riscv64/zcopy_rvv.c b/kernel/riscv64/zcopy_rvv.c
new file mode 100644
index 000000000..13879f03b
--- /dev/null
+++ b/kernel/riscv64/zcopy_rvv.c
@@ -0,0 +1,105 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL_M8(n)            __riscv_vsetvl_e32m8(n)
+#define FLOAT_V_T_M8            vfloat32m8_t
+#define VLEV_FLOAT_M8           __riscv_vle32_v_f32m8
+#define VSEV_FLOAT_M8           __riscv_vse32_v_f32m8
+
+#define VSETVL_M4(n)            __riscv_vsetvl_e32m4(n)
+#define FLOAT_VX2_T_M4          vfloat32m4x2_t
+#define VLSEG_FLOAT_M4          __riscv_vlseg2e32_v_f32m4x2
+#define VSSEG_FLOAT_M4          __riscv_vsseg2e32_v_f32m4x2
+#define VLSSEG_FLOAT_M4         __riscv_vlsseg2e32_v_f32m4x2
+#define VSSSEG_FLOAT_M4         __riscv_vssseg2e32_v_f32m4x2
+#else
+#define VSETVL_M8(n)            __riscv_vsetvl_e64m8(n)
+#define FLOAT_V_T_M8            vfloat64m8_t
+#define VLEV_FLOAT_M8           __riscv_vle64_v_f64m8
+#define VSEV_FLOAT_M8           __riscv_vse64_v_f64m8
+
+#define VSETVL_M4(n)            __riscv_vsetvl_e64m4(n)
+#define FLOAT_VX2_T_M4          vfloat64m4x2_t
+#define VLSEG_FLOAT_M4          __riscv_vlseg2e64_v_f64m4x2
+#define VSSEG_FLOAT_M4          __riscv_vsseg2e64_v_f64m4x2
+#define VLSSEG_FLOAT_M4         __riscv_vlsseg2e64_v_f64m4x2
+#define VSSSEG_FLOAT_M4         __riscv_vssseg2e64_v_f64m4x2
+#endif
+
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+    if(n <= 0) return(0);
+    /* Complex copy y <- x of n elements; each element is a pair of FLOATs, increments are in complex elements. Always returns 0. */
+    if(inc_x == 1 && inc_y == 1) {
+        /* Both contiguous: the pairs need no de-interleaving, so copy the buffer as a flat FLOAT array with LMUL=8 loads/stores. */
+        FLOAT_V_T_M8 vx;
+        n *= 2; // two FLOATs per complex element: n now counts scalars
+
+        for(size_t vl; n > 0; n -= vl, x += vl, y += vl) {
+            vl = VSETVL_M8(n);
+            vx = VLEV_FLOAT_M8(x, vl);
+            VSEV_FLOAT_M8(y, vx, vl);
+        }
+
+    }else if (1 == inc_x) {
+        /* x contiguous, y strided: unit-stride segment load, strided segment store (stride_y in bytes per complex element). */
+        FLOAT_VX2_T_M4 vx2;
+        BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
+
+        for(size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) {
+            vl = VSETVL_M4(n);
+            vx2 = VLSEG_FLOAT_M4(x, vl);
+            VSSSEG_FLOAT_M4(y, stride_y, vx2, vl);
+        }
+    } else if (1 == inc_y) {
+        /* x strided, y contiguous: strided segment load (stride_x in bytes per complex element), unit-stride segment store. */
+        FLOAT_VX2_T_M4 vx2;
+        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
+
+        for(size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) {
+            vl = VSETVL_M4(n);
+            vx2 = VLSSEG_FLOAT_M4(x, stride_x, vl);
+            VSSEG_FLOAT_M4(y, vx2, vl);
+        }
+    } else {
+        /* General case: strided segment load and strided segment store. */
+        FLOAT_VX2_T_M4 vx2;
+        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
+        BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
+
+        for(size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) {
+            vl = VSETVL_M4(n);
+            vx2 = VLSSEG_FLOAT_M4(x, stride_x, vl);
+            VSSSEG_FLOAT_M4(y, stride_y, vx2, vl);
+        }
+    }
+
+    return(0);
+}
diff --git a/kernel/riscv64/zcopy_vector.c b/kernel/riscv64/zcopy_vector.c
index 600f02bba..9e4a67b71 100644
--- a/kernel/riscv64/zcopy_vector.c
+++ b/kernel/riscv64/zcopy_vector.c
@@ -27,15 +27,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
 #endif
 
 
diff --git a/kernel/riscv64/zdot_rvv.c b/kernel/riscv64/zdot_rvv.c
new file mode 100644
index 000000000..13bc2ee39
--- /dev/null
+++ b/kernel/riscv64/zdot_rvv.c
@@ -0,0 +1,194 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n)               __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m4()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e32m1()
+#define FLOAT_V_T               vfloat32m4_t
+#define FLOAT_V_T_M1            vfloat32m1_t
+#define FLOAT_VX2_T             vfloat32m4x2_t
+#define VGET_VX2                __riscv_vget_v_f32m4x2_f32m4
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4x2
+#define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f32m4_f32m1
+#define VFMACCVV_FLOAT_TU       __riscv_vfmacc_vv_f32m4_tu
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFMULVV_FLOAT           __riscv_vfmul_vv_f32m4
+#define VFMSACVV_FLOAT          __riscv_vfmsac_vv_f32m4
+#define VFNMSACVV_FLOAT_TU      __riscv_vfnmsac_vv_f32m4_tu
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
+#else
+#define VSETVL(n)               __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m4()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e64m1()
+#define FLOAT_V_T               vfloat64m4_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define FLOAT_VX2_T             vfloat64m4x2_t
+#define VGET_VX2                __riscv_vget_v_f64m4x2_f64m4
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4x2
+#define VFREDSUM_FLOAT          __riscv_vfredusum_vs_f64m4_f64m1
+#define VFMACCVV_FLOAT_TU       __riscv_vfmacc_vv_f64m4_tu
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFMULVV_FLOAT           __riscv_vfmul_vv_f64m4
+#define VFMSACVV_FLOAT          __riscv_vfmsac_vv_f64m4
+#define VFNMSACVV_FLOAT_TU      __riscv_vfnmsac_vv_f64m4_tu
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
+#endif
+
+OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+    OPENBLAS_COMPLEX_FLOAT result;
+    CREAL(result) = 0.0;
+    CIMAG(result) = 0.0;
+    /* Complex dot product: sum of x[i]*y[i], or conj(x[i])*y[i] when CONJ is defined. Returns 0+0i for n <= 0. */
+    if ( n <= 0 ) return(result);
+    /* vr0 / vr1 are per-lane partial sums of the real / imaginary part; they are reduced to scalars only after the loops. */
+    FLOAT_V_T vr0, vr1, vx0, vx1, vy0, vy1;
+    FLOAT_V_T_M1 v_res, v_z0;
+    FLOAT_VX2_T vxx2, vyx2;
+    size_t vlmax_m1 = VSETVL_MAX_M1;
+    v_z0 = VFMVVF_FLOAT_M1(0, vlmax_m1);
+    /* Zero every lane up to vlmax, because the final reduction below runs over vlmax lanes rather than the last vl. */
+    size_t vlmax = VSETVL_MAX;
+    vr0 = VFMVVF_FLOAT(0, vlmax);
+    vr1 = VFMVVF_FLOAT(0, vlmax);
+    /* The same accumulation is written four times, specialised on which operands are unit-stride. */
+    if(inc_x == 1 && inc_y == 1) {
+        /* Both contiguous: unit-stride segment loads split each operand into field 0 (used as real) and field 1 (used as imaginary). */
+        for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) {
+            vl = VSETVL(n);
+
+            vxx2 = VLSEG_FLOAT(x, vl);
+            vyx2 = VLSEG_FLOAT(y, vl);
+
+            vx0 = VGET_VX2(vxx2, 0);
+            vx1 = VGET_VX2(vxx2, 1);
+            vy0 = VGET_VX2(vyx2, 0);
+            vy1 = VGET_VX2(vyx2, 1);
+            /* The _TU macros map to the `_tu` intrinsics (presumably tail-undisturbed), so a short final pass keeps lanes >= vl intact. */
+            vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl);
+            vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl);
+        #if !defined(CONJ)
+            vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, vy1, vl);
+            vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, vy0, vl);
+        #else
+            vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, vy1, vl);
+            vr1 = VFNMSACVV_FLOAT_TU(vr1, vx1, vy0, vl);
+        #endif
+        }
+
+    }  else if (inc_x == 1){
+        /* x contiguous, y strided: stride_y is the byte distance between consecutive complex elements of y. */
+        BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) {
+            vl = VSETVL(n);
+
+            vxx2 = VLSEG_FLOAT(x, vl);
+            vyx2 = VLSSEG_FLOAT(y, stride_y, vl);
+
+            vx0 = VGET_VX2(vxx2, 0);
+            vx1 = VGET_VX2(vxx2, 1);
+            vy0 = VGET_VX2(vyx2, 0);
+            vy1 = VGET_VX2(vyx2, 1);
+
+            vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl);
+            vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl);
+        #if !defined(CONJ)
+            vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, vy1, vl);
+            vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, vy0, vl);
+        #else
+            vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, vy1, vl);
+            vr1 = VFNMSACVV_FLOAT_TU(vr1, vx1, vy0, vl);
+        #endif
+        }
+    } else if (inc_y == 1){
+        /* x strided, y contiguous: stride_x is the byte distance between consecutive complex elements of x. */
+        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) {
+            vl = VSETVL(n);
+
+            vxx2 = VLSSEG_FLOAT(x, stride_x, vl);
+            vyx2 = VLSEG_FLOAT(y, vl);
+
+            vx0 = VGET_VX2(vxx2, 0);
+            vx1 = VGET_VX2(vxx2, 1);
+            vy0 = VGET_VX2(vyx2, 0);
+            vy1 = VGET_VX2(vyx2, 1);
+
+            vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl);
+            vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl);
+        #if !defined(CONJ)
+            vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, vy1, vl);
+            vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, vy0, vl);
+        #else
+            vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, vy1, vl);
+            vr1 = VFNMSACVV_FLOAT_TU(vr1, vx1, vy0, vl);
+        #endif
+        }
+    }else {
+        /* General case: both operands are read with strided segment loads. */
+        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
+        BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) {
+            vl = VSETVL(n);
+
+            vxx2 = VLSSEG_FLOAT(x, stride_x, vl);
+            vyx2 = VLSSEG_FLOAT(y, stride_y, vl);
+
+            vx0 = VGET_VX2(vxx2, 0);
+            vx1 = VGET_VX2(vxx2, 1);
+            vy0 = VGET_VX2(vyx2, 0);
+            vy1 = VGET_VX2(vyx2, 1);
+
+            vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, vy0, vl);
+            vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, vy1, vl);
+        #if !defined(CONJ)
+            vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, vy1, vl);
+            vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, vy0, vl);
+        #else
+            vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, vy1, vl);
+            vr1 = VFNMSACVV_FLOAT_TU(vr1, vx1, vy0, vl);
+        #endif
+        }
+    }
+    /* Collapse the per-lane partial sums: reduce all vlmax lanes onto the zero seed v_z0, then read the scalar result. */
+    v_res = VFREDSUM_FLOAT(vr0, v_z0, vlmax);
+    CREAL(result) = VFMVFS_FLOAT_M1(v_res);
+    v_res = VFREDSUM_FLOAT(vr1, v_z0, vlmax);
+    CIMAG(result) = VFMVFS_FLOAT_M1(v_res);
+ 
+   return(result);
+}
diff --git a/kernel/riscv64/zdot_vector.c b/kernel/riscv64/zdot_vector.c
index ec38ed9d2..13b8fe378 100644
--- a/kernel/riscv64/zdot_vector.c
+++ b/kernel/riscv64/zdot_vector.c
@@ -27,37 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)()
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT vfmacc_vv_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFDOTVV_FLOAT vfdot_vv_f32m4
-#define VFMULVV_FLOAT vfmul_vv_f32m4
-#define VFMSACVV_FLOAT vfmsac_vv_f32m4
-#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4
+#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f32m1_f32)
+#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m4_f32m1)(v_res, va, vb, gvl)
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
+#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1)
+#endif
+#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
+#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f32m4)
+#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4)
+#define VFMSACVV_FLOAT RISCV_RVV(vfmsac_vv_f32m4)
+#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f32m4)
+#else
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)()
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT vfmacc_vv_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFDOTVV_FLOAT vfdot_vv_f64m4
-#define VFMULVV_FLOAT vfmul_vv_f64m4
-#define VFMSACVV_FLOAT vfmsac_vv_f64m4
-#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4
+#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f64m1_f64)
+#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m4_f64m1)(v_res, va, vb, gvl)
+#else
+#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1)
+#endif
+#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
+#define VFDOTVV_FLOAT RISCV_RVV(vfdot_vv_f64m4)
+#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4)
+#define VFMSACVV_FLOAT RISCV_RVV(vfmsac_vv_f64m4)
+#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m4)
 #endif
 
 OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
@@ -109,9 +117,9 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
                 ix += inc_xv;
                 iy += inc_yv;
         }
-        v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl);
+        v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
         dot[0] += VFMVFS_FLOAT(v_res);
-        v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl);
+        v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
         dot[1] += VFMVFS_FLOAT(v_res);
         //tail
         if(j < n){
@@ -132,9 +140,9 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
                 vr1 = VFMULVV_FLOAT(vx1, vy0, gvl);
                 vr1 = VFMSACVV_FLOAT(vr1, vx0, vy1, gvl);
 #endif
-                v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl);
+                v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
                 dot[0] += VFMVFS_FLOAT(v_res);
-                v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl);
+                v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
                 dot[1] += VFMVFS_FLOAT(v_res);
         }
         CREAL(result) = dot[0];
diff --git a/kernel/riscv64/zgemm_beta_rvv.c b/kernel/riscv64/zgemm_beta_rvv.c
new file mode 100644
index 000000000..ee334801b
--- /dev/null
+++ b/kernel/riscv64/zgemm_beta_rvv.c
@@ -0,0 +1,130 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE) /* single precision: 32-bit elements, LMUL=4 intrinsics */
+#define VSETVL(n)               __riscv_vsetvl_e32m4(n) /* vector length to use for n remaining elements */
+#define FLOAT_V_T               vfloat32m4_t /* one vector of real parts or of imaginary parts */
+#define FLOAT_VX2_T             vfloat32m4x2_t /* 2-field tuple; the code below uses field 0 = real, field 1 = imaginary */
+#define VGET_VX2                __riscv_vget_v_f32m4x2_f32m4 /* extract one field of the tuple */
+#define VSET_VX2                __riscv_vset_v_f32m4_f32m4x2 /* replace one field of the tuple */
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4x2 /* 2-field segment load: de-interleaves (re, im) pairs */
+#define VSSEG_FLOAT             __riscv_vsseg2e32_v_f32m4x2 /* 2-field segment store: re-interleaves (re, im) pairs */
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4 /* broadcast a scalar into a vector */
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m4 /* vector * scalar */
+#define VFADDVV_FLOAT           __riscv_vfadd_vv_f32m4 /* vector + vector */
+#define VFSUBVV_FLOAT           __riscv_vfsub_vv_f32m4 /* vector - vector */
+#else /* DOUBLE: the same macro set mapped onto the 64-bit element intrinsics */
+#define VSETVL(n)               __riscv_vsetvl_e64m4(n) /* vector length to use for n remaining elements */
+#define FLOAT_V_T               vfloat64m4_t /* one vector of real parts or of imaginary parts */
+#define FLOAT_VX2_T             vfloat64m4x2_t /* 2-field tuple; the code below uses field 0 = real, field 1 = imaginary */
+#define VGET_VX2                __riscv_vget_v_f64m4x2_f64m4 /* extract one field of the tuple */
+#define VSET_VX2                __riscv_vset_v_f64m4_f64m4x2 /* replace one field of the tuple */
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4x2 /* 2-field segment load: de-interleaves (re, im) pairs */
+#define VSSEG_FLOAT             __riscv_vsseg2e64_v_f64m4x2 /* 2-field segment store: re-interleaves (re, im) pairs */
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4 /* broadcast a scalar into a vector */
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m4 /* vector * scalar */
+#define VFADDVV_FLOAT           __riscv_vfadd_vv_f64m4 /* vector + vector */
+#define VFSUBVV_FLOAT           __riscv_vfsub_vv_f64m4 /* vector - vector */
+#endif /* !defined(DOUBLE) */
+
+/* Scale the complex matrix c in place: c := (beta_r + i*beta_i) * c.     */
+/* Each of the n columns holds m interleaved (re, im) pairs; consecutive  */
+/* columns start ldc pairs apart. The dummy arguments are never read.     */
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1,
+          FLOAT beta_r, FLOAT beta_i,
+          FLOAT *dummy2, BLASLONG dummy3,
+          FLOAT *dummy4, BLASLONG dummy5,
+          FLOAT *c, BLASLONG ldc)
+{
+    FLOAT_VX2_T pair;   /* (real, imaginary) tuple moved to/from memory */
+    FLOAT_V_T re, im, rr, ii, ir, ri;
+    FLOAT *col, *cur;
+    BLASLONG left;
+    size_t vl;
+
+    ldc *= 2;           /* column stride, now counted in FLOATs */
+
+    if (beta_r == 0.0 && beta_i == 0.0) {
+        /* beta == 0: overwrite with zeros; c is never loaded here */
+        vl = VSETVL(m);
+        re = VFMVVF_FLOAT(0.0, vl);
+        im = VFMVVF_FLOAT(0.0, vl);
+        pair = VSET_VX2(pair, 0, re);
+        pair = VSET_VX2(pair, 1, im);
+
+        for (col = c; n > 0; n--, col += ldc) {
+            left = m;
+            cur = col;
+            while (left > 0) {
+                vl = VSETVL(left);
+                VSSEG_FLOAT(cur, pair, vl);
+                left -= vl;
+                cur += vl * 2;
+            }
+        }
+        return 0;
+    }
+
+    /* general beta: complex multiply, one strip of vl pairs at a time */
+    for (col = c; n > 0; n--, col += ldc) {
+        left = m;
+        cur = col;
+        while (left > 0) {
+            vl = VSETVL(left);
+
+            pair = VLSEG_FLOAT(cur, vl);
+            re = VGET_VX2(pair, 0);
+            im = VGET_VX2(pair, 1);
+
+            rr = VFMULVF_FLOAT(re, beta_r, vl);     /* re * beta_r */
+            ii = VFMULVF_FLOAT(im, beta_i, vl);     /* im * beta_i */
+            ir = VFMULVF_FLOAT(im, beta_r, vl);     /* im * beta_r */
+            ri = VFMULVF_FLOAT(re, beta_i, vl);     /* re * beta_i */
+
+            pair = VSET_VX2(pair, 0, VFSUBVV_FLOAT(rr, ii, vl));   /* new real part */
+            pair = VSET_VX2(pair, 1, VFADDVV_FLOAT(ir, ri, vl));   /* new imaginary part */
+            VSSEG_FLOAT(cur, pair, vl);
+            left -= vl;
+            cur += vl * 2;
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/zgemm_kernel_4x4_zvl128b.c b/kernel/riscv64/zgemm_kernel_4x4_zvl128b.c
new file mode 100644
index 000000000..0776f03fd
--- /dev/null
+++ b/kernel/riscv64/zgemm_kernel_4x4_zvl128b.c
@@ -0,0 +1,720 @@
+/*
+
+AUTOGENERATED KERNEL
+Script: ./kernel/riscv64/generate_kernel.py
+Settings:
+ LMUL=2
+ M=4
+ M_tail_scalar_from=2
+ N=4
+ __riscv_='__riscv_'
+ complex=True
+ conjugate=False
+ cpu='zvl128b'
+ force_acc_double=False
+ index_type='BLASLONG'
+ op='gemm'
+ param_precision='double'
+ reg_width_bits=128
+ tail_policy=''
+ trace=False
+
+Derived:
+ ELEN_ACC=64
+ ELEN_PARAM=64
+ LMUL_ACC=2
+ VFMACC='__riscv_vfmacc_vf_f64m2'
+ VFMUL='__riscv_vfmul_vf_f64m2'
+ VLEV='__riscv_vle64_v_f64m2'
+ VLSEV='__riscv_vlse64_v_f64m2'
+ VMACC_TO_ACC='__riscv_vfmacc_vf_f64m2'
+ VMUL_TO_ACC='__riscv_vfmul_vf_f64m2'
+ VSETVL='__riscv_vsetvl_e64m2'
+ VSEV='__riscv_vse64_v_f64m2'
+ VSSEV='__riscv_vsse64_v_f64m2'
+ acc_vector_t='vfloat64m2_t'
+ output='zgemm_kernel_4x4_zvl128b.c'
+ param_scalar_t='double'
+ param_vector_t='vfloat64m2_t'
+
+*/
+
+#include "common.h"
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT) /* a*b, nothing conjugated */
+#define S0 1 /* S0..S3: term signs used by the scalar tail loops; S0 scales Ar*Br (real part) */
+#define S1 -1 /* scales Ai*Bi (real part) */
+#define S2 1 /* scales Ai*Br (imaginary part) */
+#define S3 1 /* scales Ar*Bi (imaginary part) */
+#define VFMACC_RR __riscv_vfmsac /* vector path, real: Br*Ar - Ai*Bi */
+#define VFMACC_RI __riscv_vfmacc /* vector path, imag: Br*Ai + Ar*Bi */
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) /* a*conj(b) */
+#define S0 1 /* + Ar*Br (real part) */
+#define S1 1 /* + Ai*Bi (real part) */
+#define S2 1 /* + Ai*Br (imaginary part) */
+#define S3 -1 /* - Ar*Bi (imaginary part) */
+#define VFMACC_RR __riscv_vfmacc /* vector path, real: Br*Ar + Ai*Bi */
+#define VFMACC_RI __riscv_vfmsac /* vector path, imag: Br*Ai - Ar*Bi */
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT) /* conj(a)*b */
+#define S0 1 /* + Ar*Br (real part) */
+#define S1 1 /* + Ai*Bi (real part) */
+#define S2 -1 /* - Ai*Br (imaginary part) */
+#define S3 1 /* + Ar*Bi (imaginary part) */
+#define VFMACC_RR __riscv_vfmacc /* vector path, real: Br*Ar + Ai*Bi */
+#define VFMACC_RI __riscv_vfnmsac /* vector path, imag: -(Br*Ai) + Ar*Bi */
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC) /* conj(a)*conj(b) */
+#define S0 1 /* + Ar*Br (real part) */
+#define S1 -1 /* - Ai*Bi (real part) */
+#define S2 -1 /* - Ai*Br (imaginary part) */
+#define S3 -1 /* - Ar*Bi (imaginary part) */
+#define VFMACC_RR __riscv_vfmsac /* vector path, real: Br*Ar - Ai*Bi */
+#define VFMACC_RI __riscv_vfnmacc /* vector path, imag: -(Br*Ai) - Ar*Bi */
+#endif
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc)
+
+{
+    BLASLONG gvl = 0;
+    BLASLONG m_top = 0;
+    BLASLONG n_top = 0;
+
+    // -- MAIN PASS
+
+    for (BLASLONG j = 0; j < N / 4; j += 1) {
+        m_top = 0;
+        BLASLONG gvl = __riscv_vsetvl_e64m2(4);
+
+        for (BLASLONG i = 0; i < M / 4; i += 1) {
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            double B0r = B[bi + 0 * 2 + 0];
+            double B0i = B[bi + 0 * 2 + 1];
+            double B1r = B[bi + 1 * 2 + 0];
+            double B1i = B[bi + 1 * 2 + 1];
+            double B2r = B[bi + 2 * 2 + 0];
+            double B2i = B[bi + 2 * 2 + 1];
+            double B3r = B[bi + 3 * 2 + 0];
+            double B3i = B[bi + 3 * 2 + 1];
+            bi += 4 * 2;
+
+            vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+            vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ai += 4 * 2;
+
+            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 6 vector registers for temporaries
+            // performing 2 operations between reuses of temporaries
+            vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl);
+            vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl);
+            vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl);
+            vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+            vfloat64m2_t ACC0r = tmp0r;
+            vfloat64m2_t ACC0i = tmp0i;
+            vfloat64m2_t ACC1r = tmp1r;
+            vfloat64m2_t ACC1i = tmp1i;
+            tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl);
+            tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl);
+            tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl);
+            tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
+            vfloat64m2_t ACC2r = tmp0r;
+            vfloat64m2_t ACC2i = tmp0i;
+            vfloat64m2_t ACC3r = tmp1r;
+            vfloat64m2_t ACC3i = tmp1i;
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                B1r = B[bi + 1 * 2 + 0];
+                B1i = B[bi + 1 * 2 + 1];
+                B2r = B[bi + 2 * 2 + 0];
+                B2i = B[bi + 2 * 2 + 1];
+                B3r = B[bi + 3 * 2 + 0];
+                B3i = B[bi + 3 * 2 + 1];
+                bi += 4 * 2;
+
+                A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 4 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
+                tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
+                ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl);
+                ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl);
+                ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl);
+                ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat64m2_t C0r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat64m2_t C0i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ci += ldc - gvl * 0;
+            vfloat64m2_t C1r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat64m2_t C1i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ci += ldc - gvl * 0;
+            vfloat64m2_t C2r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat64m2_t C2i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ci += ldc - gvl * 0;
+            vfloat64m2_t C3r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat64m2_t C3i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+
+            C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl);
+            C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl);
+            C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl);
+            C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl);
+            C2r = __riscv_vfmacc(C2r, alphar, ACC2r, gvl);
+            C2i = __riscv_vfmacc(C2i, alphar, ACC2i, gvl);
+            C3r = __riscv_vfmacc(C3r, alphar, ACC3r, gvl);
+            C3i = __riscv_vfmacc(C3i, alphar, ACC3i, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
+            C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
+            C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);
+            C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl);
+            C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl);
+            C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl);
+            C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl);
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl);
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl);
+
+            m_top += 4;
+        }
+
+        // -- tails for main pass
+
+        if (M & 2) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            double result4 = 0;
+            double result5 = 0;
+            double result6 = 0;
+            double result7 = 0;
+            double result8 = 0;
+            double result9 = 0;
+            double result10 = 0;
+            double result11 = 0;
+            double result12 = 0;
+            double result13 = 0;
+            double result14 = 0;
+            double result15 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1];
+                result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1];
+                result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
+                result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
+                result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1];
+                result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1];
+                result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1];
+                result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1];
+                result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1];
+                result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1];
+                result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1];
+                result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1];
+                result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1];
+                result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1];
+                ai += 2 * 2;
+                bi += 4 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            double Cr, Ci;
+            Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
+            Cr += result0 * alphar;
+            Ci += result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 0 * ldc + 1) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 1) * 2 + 1];
+            Cr += result2 * alphar;
+            Ci += result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 0 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 1) * 2 + 1] = Ci;
+            Cr = C[(ci + 1 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 1 * ldc + 0) * 2 + 1];
+            Cr += result4 * alphar;
+            Ci += result5 * alphar;
+            Cr -= result5 * alphai;
+            Ci += result4 * alphai;
+            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 1 * ldc + 1) * 2 + 0];
+            Ci = C[(ci + 1 * ldc + 1) * 2 + 1];
+            Cr += result6 * alphar;
+            Ci += result7 * alphar;
+            Cr -= result7 * alphai;
+            Ci += result6 * alphai;
+            C[(ci + 1 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 1) * 2 + 1] = Ci;
+            Cr = C[(ci + 2 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 2 * ldc + 0) * 2 + 1];
+            Cr += result8 * alphar;
+            Ci += result9 * alphar;
+            Cr -= result9 * alphai;
+            Ci += result8 * alphai;
+            C[(ci + 2 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 2 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 2 * ldc + 1) * 2 + 0];
+            Ci = C[(ci + 2 * ldc + 1) * 2 + 1];
+            Cr += result10 * alphar;
+            Ci += result11 * alphar;
+            Cr -= result11 * alphai;
+            Ci += result10 * alphai;
+            C[(ci + 2 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 2 * ldc + 1) * 2 + 1] = Ci;
+            Cr = C[(ci + 3 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 3 * ldc + 0) * 2 + 1];
+            Cr += result12 * alphar;
+            Ci += result13 * alphar;
+            Cr -= result13 * alphai;
+            Ci += result12 * alphai;
+            C[(ci + 3 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 3 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 3 * ldc + 1) * 2 + 0];
+            Ci = C[(ci + 3 * ldc + 1) * 2 + 1];
+            Cr += result14 * alphar;
+            Ci += result15 * alphar;
+            Cr -= result15 * alphai;
+            Ci += result14 * alphai;
+            C[(ci + 3 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 3 * ldc + 1) * 2 + 1] = Ci;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            double result4 = 0;
+            double result5 = 0;
+            double result6 = 0;
+            double result7 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
+                result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
+                result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1];
+                result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1];
+                result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1];
+                result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1];
+                ai += 1 * 2;
+                bi += 4 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            double Cr, Ci;
+            Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
+            Cr += result0 * alphar;
+            Ci += result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 1 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 1 * ldc + 0) * 2 + 1];
+            Cr += result2 * alphar;
+            Ci += result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 2 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 2 * ldc + 0) * 2 + 1];
+            Cr += result4 * alphar;
+            Ci += result5 * alphar;
+            Cr -= result5 * alphai;
+            Ci += result4 * alphai;
+            C[(ci + 2 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 2 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 3 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 3 * ldc + 0) * 2 + 1];
+            Cr += result6 * alphar;
+            Ci += result7 * alphar;
+            Cr -= result7 * alphai;
+            Ci += result6 * alphai;
+            C[(ci + 3 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 3 * ldc + 0) * 2 + 1] = Ci;
+            m_top += 1;
+        }
+
+        n_top += 4;
+    }
+
+    // -- tails for N=2
+
+    if (N & 2) {
+        gvl = __riscv_vsetvl_e64m2(4);
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 4; i += 1) {
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            double B0r = B[bi + 0 * 2 + 0];
+            double B0i = B[bi + 0 * 2 + 1];
+            double B1r = B[bi + 1 * 2 + 0];
+            double B1i = B[bi + 1 * 2 + 1];
+            bi += 2 * 2;
+
+            vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+            vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ai += 4 * 2;
+
+            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 10 vector registers for temporaries
+            vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl);
+            vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl);
+            vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl);
+            vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+            vfloat64m2_t ACC0r = tmp0r;
+            vfloat64m2_t ACC0i = tmp0i;
+            vfloat64m2_t ACC1r = tmp1r;
+            vfloat64m2_t ACC1i = tmp1i;
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                B1r = B[bi + 1 * 2 + 0];
+                B1i = B[bi + 1 * 2 + 1];
+                bi += 2 * 2;
+
+                A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 4 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat64m2_t C0r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat64m2_t C0i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ci += ldc - gvl * 0;
+            vfloat64m2_t C1r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat64m2_t C1i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+
+            C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl);
+            C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl);
+            C1r = __riscv_vfmacc(C1r, alphar, ACC1r, gvl);
+            C1i = __riscv_vfmacc(C1i, alphar, ACC1i, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
+            C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
+            C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);
+
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            double result4 = 0;
+            double result5 = 0;
+            double result6 = 0;
+            double result7 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1];
+                result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1];
+                result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
+                result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
+                result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1];
+                result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1];
+                ai += 2 * 2;
+                bi += 2 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            double Cr, Ci;
+            Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
+            Cr += result0 * alphar;
+            Ci += result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 0 * ldc + 1) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 1) * 2 + 1];
+            Cr += result2 * alphar;
+            Ci += result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 0 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 1) * 2 + 1] = Ci;
+            Cr = C[(ci + 1 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 1 * ldc + 0) * 2 + 1];
+            Cr += result4 * alphar;
+            Ci += result5 * alphar;
+            Cr -= result5 * alphai;
+            Ci += result4 * alphai;
+            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 1 * ldc + 1) * 2 + 0];
+            Ci = C[(ci + 1 * ldc + 1) * 2 + 1];
+            Cr += result6 * alphar;
+            Ci += result7 * alphar;
+            Cr -= result7 * alphai;
+            Ci += result6 * alphai;
+            C[(ci + 1 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 1) * 2 + 1] = Ci;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
+                result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
+                ai += 1 * 2;
+                bi += 2 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            double Cr, Ci;
+            Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
+            Cr += result0 * alphar;
+            Ci += result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 1 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 1 * ldc + 0) * 2 + 1];
+            Cr += result2 * alphar;
+            Ci += result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
+            m_top += 1;
+        }
+
+        n_top += 2;
+    }
+
+    // -- tails for N=1
+
+    if (N & 1) {
+        gvl = __riscv_vsetvl_e64m2(4);
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 4; i += 1) {
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            double B0r = B[bi + 0 * 2 + 0];
+            double B0i = B[bi + 0 * 2 + 1];
+            bi += 1 * 2;
+
+            vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+            vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ai += 4 * 2;
+
+            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
+            // leaving 12 vector registers for temporaries
+            vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl);
+            vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            vfloat64m2_t ACC0r = tmp0r;
+            vfloat64m2_t ACC0i = tmp0i;
+
+            for (BLASLONG k = 1; k < K; k++) {
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                bi += 1 * 2;
+
+                A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 4 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat64m2_t C0r = __riscv_vlse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, gvl);
+            vfloat64m2_t C0i = __riscv_vlse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, gvl);
+
+            C0r = __riscv_vfmacc(C0r, alphar, ACC0r, gvl);
+            C0i = __riscv_vfmacc(C0i, alphar, ACC0i, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
+
+            ci = n_top * ldc + m_top;
+
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+
+            m_top += 4;
+        }
+
+        if (M & 2) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1];
+                result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1];
+                ai += 2 * 2;
+                bi += 1 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            double Cr, Ci;
+            Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
+            Cr += result0 * alphar;
+            Ci += result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = C[(ci + 0 * ldc + 1) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 1) * 2 + 1];
+            Cr += result2 * alphar;
+            Ci += result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 0 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 1) * 2 + 1] = Ci;
+            m_top += 2;
+        }
+
+        if (M & 1) {
+            double result0 = 0;
+            double result1 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+
+            for (BLASLONG k = 0; k < K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                ai += 1 * 2;
+                bi += 1 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            double Cr, Ci;
+            Cr = C[(ci + 0 * ldc + 0) * 2 + 0];
+            Ci = C[(ci + 0 * ldc + 0) * 2 + 1];
+            Cr += result0 * alphar;
+            Ci += result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            m_top += 1;
+        }
+
+        n_top += 1;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/zgemm_kernel_8x4_zvl256b.c b/kernel/riscv64/zgemm_kernel_8x4_zvl256b.c
new file mode 100644
index 000000000..ca33368f0
--- /dev/null
+++ b/kernel/riscv64/zgemm_kernel_8x4_zvl256b.c
@@ -0,0 +1,1253 @@
+/*
+
+AUTOGENERATED KERNEL
+Settings:
+ LMUL=1
+ M=8
+ M_tail_scalar_from=1
+ N=4
+ __riscv_='__riscv_'
+ complex=True
+ conjugate=False
+ cpu='zvl256b'
+ force_acc_double=False
+ index_type='BLASLONG'
+ op='gemm'
+ param_precision='double'
+ reg_width_bits=256
+ tail_policy=''
+ trace=False
+
+Derived:
+ ELEN_ACC=64
+ ELEN_PARAM=64
+ LMUL_ACC=1
+ VFMACC='__riscv_vfmacc_vf_f64m1'
+ VFMUL='__riscv_vfmul_vf_f64m1'
+ VLEV='__riscv_vle64_v_f64m1'
+ VLSEV='__riscv_vlse64_v_f64m1'
+ VMACC_TO_ACC='__riscv_vfmacc_vf_f64m1'
+ VMUL_TO_ACC='__riscv_vfmul_vf_f64m1'
+ VSETVL='__riscv_vsetvl_e64m1'
+ VSEV='__riscv_vse64_v_f64m1'
+ VSSEV='__riscv_vsse64_v_f64m1'
+ acc_vector_t='vfloat64m1_t'
+ output='zgemm_kernel_8x4_zvl256b.c'
+ param_scalar_t='double'
+ param_vector_t='vfloat64m1_t'
+
+*/
+
+#include "common.h"
+
+// Per-variant signs: S0..S3 scale the scalar-tail terms; VFMACC_RR/VFMACC_RI finish the vector
+// real/imag products, called as OP(vd = A{i,r}*Bi, Br, A{r,i}). Ar/Ai, Br/Bi = real/imag of A, B.
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT) // real = Ar*Br - Ai*Bi, imag = Ai*Br + Ar*Bi (A*B)
+    #define S0  1 // scalar tails: sign of the Ar*Br term of the real part
+    #define S1 -1 // scalar tails: sign of the Ai*Bi term of the real part
+    #define S2  1 // presumably sign of Ai*Br in the imaginary part (use is outside this view; confirm)
+    #define S3  1 // presumably sign of Ar*Bi in the imaginary part (use is outside this view; confirm)
+    #define VFMACC_RR __riscv_vfmsac // vd = Br*Ar - vd, with vd = Ai*Bi
+    #define VFMACC_RI __riscv_vfmacc // vd = Br*Ai + vd, with vd = Ar*Bi
+#endif
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC) // real = Ar*Br + Ai*Bi, imag = Ai*Br - Ar*Bi (A*conj(B))
+    #define S0  1
+    #define S1  1
+    #define S2  1
+    #define S3 -1
+    #define VFMACC_RR __riscv_vfmacc // vd = Br*Ar + vd, with vd = Ai*Bi
+    #define VFMACC_RI __riscv_vfmsac // vd = Br*Ai - vd, with vd = Ar*Bi
+#endif
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT) // real = Ar*Br + Ai*Bi, imag = Ar*Bi - Ai*Br (conj(A)*B)
+    #define S0  1
+    #define S1  1
+    #define S2 -1
+    #define S3  1
+    #define VFMACC_RR __riscv_vfmacc // vd = Br*Ar + vd, with vd = Ai*Bi
+    #define VFMACC_RI __riscv_vfnmsac // vd = -(Br*Ai) + vd, with vd = Ar*Bi
+#endif
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC) // real = Ar*Br - Ai*Bi, imag = -Ai*Br - Ar*Bi (conj(A)*conj(B))
+    #define S0  1
+    #define S1 -1
+    #define S2 -1
+    #define S3 -1
+    #define VFMACC_RR __riscv_vfmsac // vd = Br*Ar - vd, with vd = Ai*Bi
+    #define VFMACC_RI __riscv_vfnmacc // vd = -(Br*Ai) - vd, with vd = Ar*Bi
+#endif
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc)
+
+{
+    BLASLONG gvl = 0;
+    BLASLONG m_top = 0;
+    BLASLONG n_top = 0;
+
+
+    // -- MAIN PASS
+
+    for (BLASLONG j=0; j<N/4; j+=1) {
+        m_top = 0;
+        BLASLONG gvl = __riscv_vsetvl_e64m1(4);
+
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            double B1r = B[bi+1*2+0];
+            double B1i = B[bi+1*2+1];
+            double B2r = B[bi+2*2+0];
+            double B2i = B[bi+2*2+1];
+            double B3r = B[bi+3*2+0];
+            double B3i = B[bi+3*2+1];
+            bi += 4*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 8*2;
+
+            // 4 vector regs to hold A array contents, 16 regs to hold values accumulated over k
+            // leaving 12 vector registers for temporaries
+            // performing 4 operations between reuses of temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
+            vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
+            vfloat64m1_t tmp2r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+            vfloat64m1_t tmp2i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+            vfloat64m1_t tmp3r = __riscv_vfmul_vf_f64m1( A1i, B1i, gvl);
+            vfloat64m1_t tmp3i = __riscv_vfmul_vf_f64m1( A1r, B1i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B1r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B1r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B1r, A1r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B1r, A1i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+            vfloat64m1_t ACC1r = tmp1r;
+            vfloat64m1_t ACC1i = tmp1i;
+            vfloat64m1_t ACC2r = tmp2r;
+            vfloat64m1_t ACC2i = tmp2i;
+            vfloat64m1_t ACC3r = tmp3r;
+            vfloat64m1_t ACC3i = tmp3i;
+            tmp0r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
+            tmp0i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
+            tmp1r = __riscv_vfmul_vf_f64m1( A1i, B2i, gvl);
+            tmp1i = __riscv_vfmul_vf_f64m1( A1r, B2i, gvl);
+            tmp2r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
+            tmp2i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
+            tmp3r = __riscv_vfmul_vf_f64m1( A1i, B3i, gvl);
+            tmp3i = __riscv_vfmul_vf_f64m1( A1r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B2r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B2r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B2r, A1r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B2r, A1i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B3r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B3r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A1r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A1i, gvl);
+            vfloat64m1_t ACC4r = tmp0r;
+            vfloat64m1_t ACC4i = tmp0i;
+            vfloat64m1_t ACC5r = tmp1r;
+            vfloat64m1_t ACC5i = tmp1i;
+            vfloat64m1_t ACC6r = tmp2r;
+            vfloat64m1_t ACC6i = tmp2i;
+            vfloat64m1_t ACC7r = tmp3r;
+            vfloat64m1_t ACC7i = tmp3i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                bi += 4*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
+                A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 8*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
+                tmp2r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+                tmp2i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+                tmp3r = __riscv_vfmul_vf_f64m1( A1i, B1i, gvl);
+                tmp3i = __riscv_vfmul_vf_f64m1( A1r, B1i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B1r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B1r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B1r, A1r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B1r, A1i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m1( A1i, B2i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m1( A1r, B2i, gvl);
+                tmp2r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
+                tmp2i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
+                tmp3r = __riscv_vfmul_vf_f64m1( A1i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f64m1( A1r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B2r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B2r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B2r, A1r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B2r, A1i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B3r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B3r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A1r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A1i, gvl);
+                ACC4r = __riscv_vfadd( ACC4r, tmp0r, gvl);
+                ACC4i = __riscv_vfadd( ACC4i, tmp0i, gvl);
+                ACC5r = __riscv_vfadd( ACC5r, tmp1r, gvl);
+                ACC5i = __riscv_vfadd( ACC5i, tmp1i, gvl);
+                ACC6r = __riscv_vfadd( ACC6r, tmp2r, gvl);
+                ACC6i = __riscv_vfadd( ACC6i, tmp2i, gvl);
+                ACC7r = __riscv_vfadd( ACC7r, tmp3r, gvl);
+                ACC7i = __riscv_vfadd( ACC7i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C0i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+             ci += gvl;
+            vfloat64m1_t C1r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C1i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*1;
+            vfloat64m1_t C2r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C2i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+             ci += gvl;
+            vfloat64m1_t C3r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C3i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*1;
+            vfloat64m1_t C4r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C4i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+             ci += gvl;
+            vfloat64m1_t C5r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C5i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*1;
+            vfloat64m1_t C6r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C6i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+             ci += gvl;
+            vfloat64m1_t C7r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C7i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
+            C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
+            C2r = __riscv_vfmacc( C2r, alphar, ACC2r, gvl );
+            C2i = __riscv_vfmacc( C2i, alphar, ACC2i, gvl );
+            C3r = __riscv_vfmacc( C3r, alphar, ACC3r, gvl );
+            C3i = __riscv_vfmacc( C3i, alphar, ACC3i, gvl );
+            C4r = __riscv_vfmacc( C4r, alphar, ACC4r, gvl );
+            C4i = __riscv_vfmacc( C4i, alphar, ACC4i, gvl );
+            C5r = __riscv_vfmacc( C5r, alphar, ACC5r, gvl );
+            C5i = __riscv_vfmacc( C5i, alphar, ACC5i, gvl );
+            C6r = __riscv_vfmacc( C6r, alphar, ACC6r, gvl );
+            C6i = __riscv_vfmacc( C6i, alphar, ACC6i, gvl );
+            C7r = __riscv_vfmacc( C7r, alphar, ACC7r, gvl );
+            C7i = __riscv_vfmacc( C7i, alphar, ACC7i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+            C4r = __riscv_vfnmsac( C4r, alphai, ACC4i, gvl );
+            C4i = __riscv_vfmacc ( C4i, alphai, ACC4r, gvl );
+            C5r = __riscv_vfnmsac( C5r, alphai, ACC5i, gvl );
+            C5i = __riscv_vfmacc ( C5i, alphai, ACC5r, gvl );
+            C6r = __riscv_vfnmsac( C6r, alphai, ACC6i, gvl );
+            C6i = __riscv_vfmacc ( C6i, alphai, ACC6r, gvl );
+            C7r = __riscv_vfnmsac( C7r, alphai, ACC7i, gvl );
+            C7i = __riscv_vfmacc ( C7i, alphai, ACC7r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+             ci += gvl;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*1;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+             ci += gvl;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            ci += ldc-gvl*1;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C4r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C4i, gvl);
+             ci += gvl;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C5r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C5i, gvl);
+            ci += ldc-gvl*1;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C6r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C6i, gvl);
+             ci += gvl;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C7r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C7i, gvl);
+            
+            m_top += 8;
+        }
+
+
+
+        // -- tails for main pass
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e64m1(4);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            double B1r = B[bi+1*2+0];
+            double B1i = B[bi+1*2+1];
+            double B2r = B[bi+2*2+0];
+            double B2i = B[bi+2*2+1];
+            double B3r = B[bi+3*2+0];
+            double B3i = B[bi+3*2+1];
+            bi += 4*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 4*2;
+
+            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 22 vector registers for temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+            vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+            vfloat64m1_t tmp2r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
+            vfloat64m1_t tmp2i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
+            vfloat64m1_t tmp3r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
+            vfloat64m1_t tmp3i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+            vfloat64m1_t ACC1r = tmp1r;
+            vfloat64m1_t ACC1i = tmp1i;
+            vfloat64m1_t ACC2r = tmp2r;
+            vfloat64m1_t ACC2i = tmp2i;
+            vfloat64m1_t ACC3r = tmp3r;
+            vfloat64m1_t ACC3i = tmp3i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                bi += 4*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 4*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+                tmp2r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
+                tmp2i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
+                tmp3r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C0i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat64m1_t C1r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C1i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat64m1_t C2r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C2i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat64m1_t C3r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C3i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
+            C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
+            C2r = __riscv_vfmacc( C2r, alphar, ACC2r, gvl );
+            C2i = __riscv_vfmacc( C2i, alphar, ACC2i, gvl );
+            C3r = __riscv_vfmacc( C3r, alphar, ACC3r, gvl );
+            C3i = __riscv_vfmacc( C3i, alphar, ACC3i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            gvl = __riscv_vsetvl_e64m1(2);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            double B1r = B[bi+1*2+0];
+            double B1i = B[bi+1*2+1];
+            double B2r = B[bi+2*2+0];
+            double B2i = B[bi+2*2+1];
+            double B3r = B[bi+3*2+0];
+            double B3i = B[bi+3*2+1];
+            bi += 4*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 2*2;
+
+            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 22 vector registers for temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+            vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+            vfloat64m1_t tmp2r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
+            vfloat64m1_t tmp2i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
+            vfloat64m1_t tmp3r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
+            vfloat64m1_t tmp3i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+            vfloat64m1_t ACC1r = tmp1r;
+            vfloat64m1_t ACC1i = tmp1i;
+            vfloat64m1_t ACC2r = tmp2r;
+            vfloat64m1_t ACC2i = tmp2i;
+            vfloat64m1_t ACC3r = tmp3r;
+            vfloat64m1_t ACC3i = tmp3i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                bi += 4*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 2*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+                tmp2r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
+                tmp2i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
+                tmp3r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C0i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat64m1_t C1r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C1i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat64m1_t C2r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C2i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat64m1_t C3r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C3i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
+            C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
+            C2r = __riscv_vfmacc( C2r, alphar, ACC2r, gvl );
+            C2i = __riscv_vfmacc( C2i, alphar, ACC2i, gvl );
+            C3r = __riscv_vfmacc( C3r, alphar, ACC3r, gvl );
+            C3i = __riscv_vfmacc( C3i, alphar, ACC3i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            
+            m_top += 2;
+        }
+
+
+        if( M & 1 ) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            double result4 = 0;
+            double result5 = 0;
+            double result6 = 0;
+            double result7 = 0;
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
+                result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
+                result2+=S0*A[ai+0+0]*B[bi+2+0] + S1*A[ai+0+1]*B[bi+2+1];
+                result3+=S2*A[ai+0+1]*B[bi+2+0] + S3*A[ai+0+0]*B[bi+2+1];
+                result4+=S0*A[ai+0+0]*B[bi+4+0] + S1*A[ai+0+1]*B[bi+4+1];
+                result5+=S2*A[ai+0+1]*B[bi+4+0] + S3*A[ai+0+0]*B[bi+4+1];
+                result6+=S0*A[ai+0+0]*B[bi+6+0] + S1*A[ai+0+1]*B[bi+6+1];
+                result7+=S2*A[ai+0+1]*B[bi+6+0] + S3*A[ai+0+0]*B[bi+6+1];
+                ai+=1*2;
+                bi+=4*2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            double Cr, Ci;
+            Cr = C[(ci+0*ldc+0)*2+0];
+            Ci = C[(ci+0*ldc+0)*2+1];
+            Cr += result0*alphar;
+            Ci += result1*alphar;
+            Cr -= result1*alphai;
+            Ci += result0*alphai;
+            C[(ci+0*ldc+0)*2+0] = Cr;
+            C[(ci+0*ldc+0)*2+1] = Ci;
+            Cr = C[(ci+1*ldc+0)*2+0];
+            Ci = C[(ci+1*ldc+0)*2+1];
+            Cr += result2*alphar;
+            Ci += result3*alphar;
+            Cr -= result3*alphai;
+            Ci += result2*alphai;
+            C[(ci+1*ldc+0)*2+0] = Cr;
+            C[(ci+1*ldc+0)*2+1] = Ci;
+            Cr = C[(ci+2*ldc+0)*2+0];
+            Ci = C[(ci+2*ldc+0)*2+1];
+            Cr += result4*alphar;
+            Ci += result5*alphar;
+            Cr -= result5*alphai;
+            Ci += result4*alphai;
+            C[(ci+2*ldc+0)*2+0] = Cr;
+            C[(ci+2*ldc+0)*2+1] = Ci;
+            Cr = C[(ci+3*ldc+0)*2+0];
+            Ci = C[(ci+3*ldc+0)*2+1];
+            Cr += result6*alphar;
+            Ci += result7*alphar;
+            Cr -= result7*alphai;
+            Ci += result6*alphai;
+            C[(ci+3*ldc+0)*2+0] = Cr;
+            C[(ci+3*ldc+0)*2+1] = Ci;
+            m_top+=1;
+        }
+
+        n_top += 4;
+    }
+
+
+
+    // -- tails for N=2
+
+    if( N & 2 ) {
+        gvl = __riscv_vsetvl_e64m1(4);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            double B1r = B[bi+1*2+0];
+            double B1i = B[bi+1*2+1];
+            bi += 2*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 8*2;
+
+            // 4 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 20 vector registers for temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
+            vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
+            vfloat64m1_t tmp2r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+            vfloat64m1_t tmp2i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+            vfloat64m1_t tmp3r = __riscv_vfmul_vf_f64m1( A1i, B1i, gvl);
+            vfloat64m1_t tmp3i = __riscv_vfmul_vf_f64m1( A1r, B1i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B1r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B1r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B1r, A1r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B1r, A1i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+            vfloat64m1_t ACC1r = tmp1r;
+            vfloat64m1_t ACC1i = tmp1i;
+            vfloat64m1_t ACC2r = tmp2r;
+            vfloat64m1_t ACC2i = tmp2i;
+            vfloat64m1_t ACC3r = tmp3r;
+            vfloat64m1_t ACC3i = tmp3i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                bi += 2*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
+                A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 8*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
+                tmp2r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+                tmp2i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+                tmp3r = __riscv_vfmul_vf_f64m1( A1i, B1i, gvl);
+                tmp3i = __riscv_vfmul_vf_f64m1( A1r, B1i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B1r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B1r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B1r, A1r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B1r, A1i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C0i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+             ci += gvl;
+            vfloat64m1_t C1r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C1i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*1;
+            vfloat64m1_t C2r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C2i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+             ci += gvl;
+            vfloat64m1_t C3r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C3i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
+            C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
+            C2r = __riscv_vfmacc( C2r, alphar, ACC2r, gvl );
+            C2i = __riscv_vfmacc( C2i, alphar, ACC2i, gvl );
+            C3r = __riscv_vfmacc( C3r, alphar, ACC3r, gvl );
+            C3i = __riscv_vfmacc( C3i, alphar, ACC3i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+             ci += gvl;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*1;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+             ci += gvl;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e64m1(4);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            double B1r = B[bi+1*2+0];
+            double B1i = B[bi+1*2+1];
+            bi += 2*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 4*2;
+
+            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 26 vector registers for temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+            vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+            vfloat64m1_t ACC1r = tmp1r;
+            vfloat64m1_t ACC1i = tmp1i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                bi += 2*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 4*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C0i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat64m1_t C1r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C1i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
+            C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            gvl = __riscv_vsetvl_e64m1(2);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            double B1r = B[bi+1*2+0];
+            double B1i = B[bi+1*2+1];
+            bi += 2*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 2*2;
+
+            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 26 vector registers for temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+            vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+            vfloat64m1_t ACC1r = tmp1r;
+            vfloat64m1_t ACC1i = tmp1i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                bi += 2*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 2*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C0i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            ci += ldc-gvl*0;
+            vfloat64m1_t C1r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C1i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
+            C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            
+            m_top += 2;
+        }
+
+
+        if( M & 1 ) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
+                result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
+                result2+=S0*A[ai+0+0]*B[bi+2+0] + S1*A[ai+0+1]*B[bi+2+1];
+                result3+=S2*A[ai+0+1]*B[bi+2+0] + S3*A[ai+0+0]*B[bi+2+1];
+                ai+=1*2;
+                bi+=2*2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            double Cr, Ci;
+            Cr = C[(ci+0*ldc+0)*2+0];
+            Ci = C[(ci+0*ldc+0)*2+1];
+            Cr += result0*alphar;
+            Ci += result1*alphar;
+            Cr -= result1*alphai;
+            Ci += result0*alphai;
+            C[(ci+0*ldc+0)*2+0] = Cr;
+            C[(ci+0*ldc+0)*2+1] = Ci;
+            Cr = C[(ci+1*ldc+0)*2+0];
+            Ci = C[(ci+1*ldc+0)*2+1];
+            Cr += result2*alphar;
+            Ci += result3*alphar;
+            Cr -= result3*alphai;
+            Ci += result2*alphai;
+            C[(ci+1*ldc+0)*2+0] = Cr;
+            C[(ci+1*ldc+0)*2+1] = Ci;
+            m_top+=1;
+        }
+
+        n_top += 2;
+    }
+
+
+
+    // -- tails for N=1
+
+    if( N & 1 ) {
+        gvl = __riscv_vsetvl_e64m1(4);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            bi += 1*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 8*2;
+
+            // 4 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 24 vector registers for temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
+            vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+            vfloat64m1_t ACC1r = tmp1r;
+            vfloat64m1_t ACC1i = tmp1i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                bi += 1*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
+                A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 8*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C0i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+             ci += gvl;
+            vfloat64m1_t C1r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C1i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C1r = __riscv_vfmacc( C1r, alphar, ACC1r, gvl );
+            C1i = __riscv_vfmacc( C1i, alphar, ACC1i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+             ci += gvl;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e64m1(4);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            bi += 1*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 4*2;
+
+            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
+            // leaving 28 vector registers for temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                bi += 1*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 4*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C0i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            gvl = __riscv_vsetvl_e64m1(2);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            bi += 1*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 2*2;
+
+            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
+            // leaving 28 vector registers for temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+
+            for(BLASLONG k=1; k<K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                bi += 1*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 2*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vlse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t C0i = __riscv_vlse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, gvl );
+            
+            C0r = __riscv_vfmacc( C0r, alphar, ACC0r, gvl );
+            C0i = __riscv_vfmacc( C0i, alphar, ACC0i, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+
+            ci=n_top*ldc+m_top;
+
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            
+            m_top += 2;
+        }
+
+
+        if( M & 1 ) {
+            double result0 = 0;
+            double result1 = 0;
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+
+            for(BLASLONG k=0; k<K; k++) {
+                result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
+                result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
+                ai+=1*2;
+                bi+=1*2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            double Cr, Ci;
+            Cr = C[(ci+0*ldc+0)*2+0];
+            Ci = C[(ci+0*ldc+0)*2+1];
+            Cr += result0*alphar;
+            Ci += result1*alphar;
+            Cr -= result1*alphai;
+            Ci += result0*alphai;
+            C[(ci+0*ldc+0)*2+0] = Cr;
+            C[(ci+0*ldc+0)*2+1] = Ci;
+            m_top+=1;
+        }
+
+        n_top += 1;
+    }
+
+    return 0;
+}
+
diff --git a/kernel/riscv64/zgemm_kernel_generic.c b/kernel/riscv64/zgemm_kernel_generic.c
new file mode 100644
index 000000000..a46dc9171
--- /dev/null
+++ b/kernel/riscv64/zgemm_kernel_generic.c
@@ -0,0 +1,140 @@
+#include "common.h"
+
+
+/* for debugging/unit tests
+ * this is a drop-in replacement for zgemm/cgemm/ztrmm/ctrmm kernels that supports arbitrary combinations of unroll values
+ */
+
+#ifdef TRMMKERNEL
+    #if defined(LEFT) != defined(TRANSA)
+    #define BACKWARDS   /* set when exactly one of LEFT / TRANSA is defined */
+    #endif
+#endif
+/* Tile dimensions: the configured default unroll factors of the active precision. */
+#ifdef DOUBLE
+
+#define UNROLL_M ZGEMM_DEFAULT_UNROLL_M
+#define UNROLL_N ZGEMM_DEFAULT_UNROLL_N
+
+#else
+
+#define UNROLL_M CGEMM_DEFAULT_UNROLL_M
+#define UNROLL_N CGEMM_DEFAULT_UNROLL_N
+
+#endif
+
+int CNAME(BLASLONG M,BLASLONG N,BLASLONG K,FLOAT alphar,FLOAT alphai,FLOAT* A,FLOAT* B,FLOAT* C,BLASLONG ldc
+#ifdef TRMMKERNEL
+    ,BLASLONG offset   /* TRMM builds only: shifts/trims the k-range used for each tile */
+#endif    
+    )
+{   /* Scalar reference kernel: C += alpha*(A*B) for GEMM, C = alpha*(A*B) for TRMM; always returns 0. */
+    FLOAT res[UNROLL_M*UNROLL_N*2];   /* per-tile accumulators, (re, im) pairs, row stride UNROLL_M */
+    /* sign[]: re += s0*Ar*Br + s1*Ai*Bi ; im += s2*Ai*Br + s3*Ar*Bi (exactly one table is compiled in). */
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+    FLOAT sign[4] = { 1, -1,  1,  1};   /* plain product A*B */
+#endif
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+    FLOAT sign[4] = { 1,  1,  1, -1};   /* A * conj(B) */
+#endif
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+    FLOAT sign[4] = { 1,  1, -1,  1};   /* conj(A) * B */
+#endif
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+    FLOAT sign[4] = { 1, -1, -1, -1};   /* conj(A) * conj(B) */
+#endif
+    /* n_packing is never reset: once halved for a tail it stays at the smaller width. */
+    BLASLONG n_packing = UNROLL_N;
+    BLASLONG n_top = 0;
+
+    while(n_top < N)
+    {
+        while( n_top+n_packing > N )   /* halve the tile width until it fits the remaining columns */
+            n_packing >>= 1;
+        /* m_packing restarts at UNROLL_M for every column panel. */
+        BLASLONG m_packing = UNROLL_M;
+        BLASLONG m_top = 0;
+        while (m_top < M)
+        {
+            while( m_top+m_packing > M )   /* halve the tile height until it fits the remaining rows */
+                m_packing >>= 1;
+            /* Offsets (in FLOATs) of this tile's panels inside A and B: K complex values per row/column. */
+            BLASLONG ai = K*m_top*2;
+            BLASLONG bi = K*n_top*2;
+
+            BLASLONG pass_K = K;   /* number of k-steps accumulated for this tile */
+
+            /* TRMM: restrict the k-range according to the tile position and offset. */
+            #ifdef TRMMKERNEL
+                #ifdef LEFT
+                    BLASLONG off = offset + m_top;
+                #else
+                    BLASLONG off = -offset + n_top;
+                #endif
+                #ifdef BACKWARDS
+                    ai += off * m_packing*2;   /* skip the first off k-steps of the A panel */
+                    bi += off * n_packing*2;   /* ... and of the B panel */
+                    pass_K -= off; /* only K - off steps remain */
+                #else
+                    #ifdef LEFT
+                        pass_K = off + m_packing;   /* run from k = 0 for off + tile-height steps */
+                    #else
+                        pass_K = off + n_packing;   /* run from k = 0 for off + tile-width steps */
+                    #endif
+                #endif
+            #endif
+            /* Clear the whole accumulator array, not just the part used by this tile. */
+            memset( res, 0, UNROLL_M*UNROLL_N*2*sizeof(FLOAT) );
+            /* Each k-step consumes m_packing complex values of A and n_packing complex values of B. */
+            for (BLASLONG k=0; k<pass_K; k+=1)
+            {
+                for( BLASLONG ki = 0; ki < n_packing; ++ki )
+                {
+                    FLOAT B0 = B[bi+ki*2+0];   /* real part */
+                    FLOAT B1 = B[bi+ki*2+1];   /* imaginary part */
+
+                    for( BLASLONG kj = 0; kj < m_packing; ++kj )
+                    {
+                        FLOAT A0 = A[ai+kj*2+0];   /* real part */
+                        FLOAT A1 = A[ai+kj*2+1];   /* imaginary part */
+                        /* Complex multiply-accumulate with the conjugation signs selected above. */
+                        res[(ki*UNROLL_M+kj)*2+0] += sign[0]*A0*B0 +sign[1]*A1*B1;
+                        res[(ki*UNROLL_M+kj)*2+1] += sign[2]*A1*B0 +sign[3]*A0*B1;
+                    }
+                }
+                /* Advance both panels to the next k-step. */
+                ai += m_packing*2;
+                bi += n_packing*2;
+            }
+            /* Index (in complex elements) of the tile's first entry in C; columns are ldc apart. */
+            BLASLONG cofs = ldc * n_top + m_top;
+            for( BLASLONG ki = 0; ki < n_packing; ++ki )
+            {
+                for( BLASLONG kj = 0; kj < m_packing; ++kj )
+                {
+                    #ifdef TRMMKERNEL
+                    FLOAT Cr = 0;   /* TRMM overwrites C */
+                    FLOAT Ci = 0;
+                    #else
+                    FLOAT Cr = C[(cofs+ki*ldc+kj)*2+0];   /* GEMM accumulates into C */
+                    FLOAT Ci = C[(cofs+ki*ldc+kj)*2+1];
+                    #endif
+                    /* (Cr + i*Ci) += (alphar + i*alphai) * (res_re + i*res_im) */
+                    Cr += res[(ki*UNROLL_M+kj)*2+0]*alphar;
+                    Cr += -res[(ki*UNROLL_M+kj)*2+1]*alphai;
+                    Ci += res[(ki*UNROLL_M+kj)*2+1]*alphar;
+                    Ci += res[(ki*UNROLL_M+kj)*2+0]*alphai;
+
+                    C[(cofs+ki*ldc+kj)*2+0] = Cr; 
+                    C[(cofs+ki*ldc+kj)*2+1] = Ci;
+                }
+            }
+
+            m_top += m_packing;
+        }
+
+        n_top += n_packing;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/zgemm_ncopy_4_rvv.c b/kernel/riscv64/zgemm_ncopy_4_rvv.c
new file mode 100644
index 000000000..dce98752e
--- /dev/null
+++ b/kernel/riscv64/zgemm_ncopy_4_rvv.c
@@ -0,0 +1,149 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)   /* single precision: 32-bit elements, LMUL=1 */
+#define VSETVL(n)               __riscv_vsetvl_e32m1(n)
+#define FLOAT_VX2_T             vfloat32m1x2_t
+#define FLOAT_VX4_T             vfloat32m1x4_t
+#define FLOAT_VX8_T             vfloat32m1x8_t
+#define VGET_VX2                __riscv_vget_v_f32m1x2_f32m1   /* extract one field of a 2-tuple */
+#define VSET_VX2                __riscv_vset_v_f32m1_f32m1x2   /* not referenced by the code in this file */
+#define VSET_VX4                __riscv_vset_v_f32m1_f32m1x4   /* insert one field into a 4-tuple */
+#define VSET_VX8                __riscv_vset_v_f32m1_f32m1x8   /* insert one field into an 8-tuple */
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m1x2    /* unit-stride 2-field segment load */
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m1x2    /* unit-stride 2-field segment store */
+#define VSSEG4_FLOAT            __riscv_vsseg4e32_v_f32m1x4    /* unit-stride 4-field segment store */
+#define VSSEG8_FLOAT            __riscv_vsseg8e32_v_f32m1x8    /* unit-stride 8-field segment store */
+#else                  /* double precision: same table with 64-bit elements */
+#define VSETVL(n)               __riscv_vsetvl_e64m1(n)
+#define FLOAT_VX2_T             vfloat64m1x2_t
+#define FLOAT_VX4_T             vfloat64m1x4_t
+#define FLOAT_VX8_T             vfloat64m1x8_t
+#define VGET_VX2                __riscv_vget_v_f64m1x2_f64m1
+#define VSET_VX2                __riscv_vset_v_f64m1_f64m1x2   /* not referenced by the code in this file */
+#define VSET_VX4                __riscv_vset_v_f64m1_f64m1x4
+#define VSET_VX8                __riscv_vset_v_f64m1_f64m1x8
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m1x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m1x2
+#define VSSEG4_FLOAT            __riscv_vsseg4e64_v_f64m1x4
+#define VSSEG8_FLOAT            __riscv_vsseg8e64_v_f64m1x8
+#endif
+
+// Optimizes the implementation in ../generic/zgemm_ncopy_4.c
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
+    BLASLONG i, j;
+    /* Packs m complex rows of n columns from a into b: 4 columns at a time, then a 2- and a 1-column tail. */
+    FLOAT *aoffset;
+    FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
+    /* b is written strictly sequentially through boffset. */
+    FLOAT *boffset;
+
+    FLOAT_VX2_T v1x2, v2x2, v3x2, v4x2;
+    FLOAT_VX4_T vxx4;
+    FLOAT_VX8_T vxx8;
+    size_t vl;
+
+    aoffset = a;
+    boffset = b;
+    lda *= 2;   /* from here on lda counts FLOATs (two per complex element) */
+
+    for (j = (n >> 2); j > 0; j--) {
+        aoffset1  = aoffset;          /* four adjacent columns, lda FLOATs apart */
+        aoffset2  = aoffset1 + lda;
+        aoffset3  = aoffset2 + lda;
+        aoffset4  = aoffset3 + lda;
+        aoffset  += 4 * lda;
+
+        for (i = m; i > 0; i -= vl) {
+            vl = VSETVL(i);   /* rows handled in this pass */
+            v1x2 = VLSEG2_FLOAT(aoffset1, vl);   /* each load splits real and imaginary parts */
+            v2x2 = VLSEG2_FLOAT(aoffset2, vl);
+            v3x2 = VLSEG2_FLOAT(aoffset3, vl);
+            v4x2 = VLSEG2_FLOAT(aoffset4, vl);
+            /* Field order re1, im1, re2, im2, ...: the 8-field store then emits, per row, the four columns' values back to back. */
+            vxx8 = VSET_VX8(vxx8, 0, VGET_VX2(v1x2, 0));
+            vxx8 = VSET_VX8(vxx8, 1, VGET_VX2(v1x2, 1));
+            vxx8 = VSET_VX8(vxx8, 2, VGET_VX2(v2x2, 0));
+            vxx8 = VSET_VX8(vxx8, 3, VGET_VX2(v2x2, 1));
+            vxx8 = VSET_VX8(vxx8, 4, VGET_VX2(v3x2, 0));
+            vxx8 = VSET_VX8(vxx8, 5, VGET_VX2(v3x2, 1));
+            vxx8 = VSET_VX8(vxx8, 6, VGET_VX2(v4x2, 0));
+            vxx8 = VSET_VX8(vxx8, 7, VGET_VX2(v4x2, 1));
+
+            VSSEG8_FLOAT(boffset, vxx8, vl);
+
+            aoffset1 += vl * 2;
+            aoffset2 += vl * 2;
+            aoffset3 += vl * 2;
+            aoffset4 += vl * 2;
+            boffset  += vl * 8;   /* 8 FLOATs written per row */
+        }
+    }
+
+    if (n & 2) {   /* two leftover columns */
+        aoffset1  = aoffset;
+        aoffset2  = aoffset1 + lda;
+        aoffset  += 2 * lda;
+        /* Same scheme as above with a 4-field tuple. */
+        for (i = m; i > 0; i -= vl) {
+            vl = VSETVL(i);
+            v1x2 = VLSEG2_FLOAT(aoffset1, vl);
+            v2x2 = VLSEG2_FLOAT(aoffset2, vl);
+
+            vxx4 = VSET_VX4(vxx4, 0, VGET_VX2(v1x2, 0));
+            vxx4 = VSET_VX4(vxx4, 1, VGET_VX2(v1x2, 1));
+            vxx4 = VSET_VX4(vxx4, 2, VGET_VX2(v2x2, 0));
+            vxx4 = VSET_VX4(vxx4, 3, VGET_VX2(v2x2, 1));
+            /* Per row: re1, im1, re2, im2. */
+            VSSEG4_FLOAT(boffset, vxx4, vl);
+        
+            aoffset1 += vl * 2;
+            aoffset2 += vl * 2;
+            boffset  += vl * 4;
+        }
+    }
+
+    if (n & 1) {   /* one leftover column: straight copy of m complex values */
+        aoffset1  = aoffset;
+        aoffset  += lda;
+
+        for (i = m; i > 0; i -= vl) {
+            vl = VSETVL(i);
+            v1x2 = VLSEG2_FLOAT(aoffset1, vl);
+
+            VSSEG2_FLOAT(boffset, v1x2, vl);
+
+            aoffset1 += vl * 2;
+            boffset  += vl * 2;
+        }
+    }
+
+     return 0;
+}
diff --git a/kernel/riscv64/zgemm_ncopy_rvv_v1.c b/kernel/riscv64/zgemm_ncopy_rvv_v1.c
new file mode 100644
index 000000000..275daa5f2
--- /dev/null
+++ b/kernel/riscv64/zgemm_ncopy_rvv_v1.c
@@ -0,0 +1,74 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE)   /* single precision: 32-bit elements, LMUL=2 */
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_VX2_T             vfloat32m2x2_t
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2x2   /* strided 2-field segment load */
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2    /* unit-stride 2-field segment store */
+#else                  /* double precision: same table with 64-bit elements */
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_VX2_T             vfloat64m2x2_t
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
+    /*
+     * Packs an m x n complex block of a (columns lda complex elements
+     * apart) into b.  Columns are taken in panels of vl; within a panel
+     * each row contributes vl consecutive (re, im) pairs to b.
+     */
+    FLOAT *panel_base = a;   /* row 0 of the current column panel */
+    FLOAT *dst = b;          /* next free slot in the packed buffer */
+    BLASLONG cols_left = n;
+
+    while (cols_left > 0) {
+        /* Number of columns handled by this panel. */
+        const size_t vl = VSETVL(cols_left);
+        FLOAT *src = panel_base;
+
+        for (BLASLONG row = 0; row < m; row++) {
+            /* Gather one element from each of the vl columns (stride is
+             * one column), splitting real and imaginary parts. */
+            FLOAT_VX2_T pair = VLSSEG2_FLOAT(src, lda * sizeof(FLOAT) * 2, vl);
+            /* Write them back interleaved and contiguous. */
+            VSSEG2_FLOAT(dst, pair, vl);
+
+            src += 2;
+            dst += vl * 2;
+        }
+
+        panel_base += vl * lda * 2;
+        cols_left  -= vl;
+    }
+    return 0;
+}
+
diff --git a/kernel/riscv64/zgemm_tcopy_4_rvv.c b/kernel/riscv64/zgemm_tcopy_4_rvv.c
new file mode 100644
index 000000000..cfafbf0dc
--- /dev/null
+++ b/kernel/riscv64/zgemm_tcopy_4_rvv.c
@@ -0,0 +1,191 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)   /* single precision: 32-bit elements, LMUL=1 */
+#define VSETVL(n)               __riscv_vsetvl_e32m1(n)        /* not referenced by the code in this file */
+#define FLOAT_V_T               vfloat32m1_t
+#define FLOAT_VX2_T             vfloat32m1x2_t
+#define FLOAT_VX4_T             vfloat32m1x4_t
+#define FLOAT_VX8_T             vfloat32m1x8_t
+#define VLEV_FLOAT              __riscv_vle32_v_f32m1          /* unit-stride load */
+#define VSEV_FLOAT              __riscv_vse32_v_f32m1          /* unit-stride store */
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m1x2   /* strided segment loads, 2/4/8 fields */
+#define VLSSEG4_FLOAT           __riscv_vlsseg4e32_v_f32m1x4
+#define VLSSEG8_FLOAT           __riscv_vlsseg8e32_v_f32m1x8
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m1x2    /* unit-stride segment stores, 2/4/8 fields */
+#define VSSEG4_FLOAT            __riscv_vsseg4e32_v_f32m1x4
+#define VSSEG8_FLOAT            __riscv_vsseg8e32_v_f32m1x8
+#else                  /* double precision: same table with 64-bit elements */
+#define VSETVL(n)               __riscv_vsetvl_e64m1(n)        /* not referenced by the code in this file */
+#define FLOAT_V_T               vfloat64m1_t
+#define FLOAT_VX2_T             vfloat64m1x2_t
+#define FLOAT_VX4_T             vfloat64m1x4_t
+#define FLOAT_VX8_T             vfloat64m1x8_t
+#define VLEV_FLOAT              __riscv_vle64_v_f64m1
+#define VSEV_FLOAT              __riscv_vse64_v_f64m1
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m1x2
+#define VLSSEG4_FLOAT           __riscv_vlsseg4e64_v_f64m1x4
+#define VLSSEG8_FLOAT           __riscv_vlsseg8e64_v_f64m1x8
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m1x2
+#define VSSEG4_FLOAT            __riscv_vsseg4e64_v_f64m1x4
+#define VSSEG8_FLOAT            __riscv_vsseg8e64_v_f64m1x8
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
+    /* Packs m rows (lda complex elements apart) of n complex values from a into b in 4/2/1-wide panels. */
+    BLASLONG i, j;
+
+    IFLOAT *aoffset;
+    IFLOAT *aoffset1;
+    /* boffset: 4-wide panels; boffset2 / boffset3: output regions for the 2-wide and 1-wide tails. */
+    IFLOAT *boffset, *boffset1, *boffset2, *boffset3;
+
+    FLOAT_V_T v0;
+    FLOAT_VX2_T vx2;
+    FLOAT_VX4_T vx4;
+    FLOAT_VX8_T vx8;
+    /* NOTE(review): vl is hard-coded below (never obtained from VSETVL); presumably the target's VLMAX covers 4 (and 8 in the m & 1 path) -- confirm. */
+    size_t vl;
+
+    //fprintf(stderr, "%s m=%ld n=%ld lda=%ld\n", __FUNCTION__, m, n, lda);
+
+    aoffset   = a;
+    boffset   = b;
+    boffset2  = b + 2 * m  * (n & ~3);   /* starts after all 4-wide panels */
+    boffset3  = b + 2 * m  * (n & ~1);   /* starts after the 4-wide and 2-wide panels */
+
+    for(j = (m >> 2); j > 0; j--) {
+        /* Four rows per iteration. */
+        aoffset1  = aoffset;
+        aoffset += 8 * lda;   /* 4 rows * 2 FLOATs * lda */
+
+        boffset1  = boffset;
+        boffset  += 32;       /* 4 rows * 4 complex values * 2 FLOATs */
+
+        for(i = (n >> 2); i > 0; i--) {
+            vl = 4;
+            /* Segment i holds 8 consecutive FLOATs of row i; the store lays the four rows out back to back. */
+            vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl);
+            VSSEG8_FLOAT(boffset1, vx8, vl);
+
+            aoffset1 += 8;
+            boffset1 += m * 8;   /* same row group in the next 4-wide panel */
+        }
+
+        if (n & 2) {
+            vl = 4;
+            /* Two complex values from each of the four rows. */
+            vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl);
+            VSSEG4_FLOAT(boffset2, vx4, vl);
+
+            aoffset1 += 4;
+            boffset2 += 16;
+        }
+
+        if (n & 1) {
+            vl = 4;
+            /* One complex value from each of the four rows. */
+            vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl);
+            VSSEG2_FLOAT(boffset3, vx2, vl);
+
+            aoffset1 += 2;
+            boffset3 += 8;
+        }
+    }
+
+    if (m & 2) {   /* two leftover rows: same scheme with vl = 2 */
+        aoffset1  = aoffset;
+        aoffset += 4 * lda;
+
+        boffset1  = boffset;
+        boffset  += 16;
+
+        for(i = (n >> 2); i > 0; i--) {
+            vl = 2;
+
+            vx8 = VLSSEG8_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl);
+            VSSEG8_FLOAT(boffset1, vx8, vl);
+
+            aoffset1 += 8;
+            boffset1 += m * 8;
+        }
+
+        if (n & 2) {
+            vl = 2;
+
+            vx4 = VLSSEG4_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl);
+            VSSEG4_FLOAT(boffset2, vx4, vl);
+
+            aoffset1 += 4;
+            boffset2 += 8;
+        }
+
+        if (n & 1) {
+            vl = 2;
+
+            vx2 = VLSSEG2_FLOAT(aoffset1, lda * sizeof(FLOAT) * 2, vl);
+            VSSEG2_FLOAT(boffset3, vx2, vl);
+
+            //aoffset1 += 2;
+            boffset3 += 4;
+        }
+    }
+
+    if (m & 1) {   /* last single row: plain contiguous copies */
+        aoffset1  = aoffset;
+        boffset1  = boffset;
+
+        for(i = (n >> 2); i > 0; i--) {
+            vl = 8;   /* 4 complex values = 8 FLOATs */
+
+            v0 = VLEV_FLOAT(aoffset1, vl);
+            VSEV_FLOAT(boffset1, v0, vl);
+
+            aoffset1 += 8;
+            boffset1 += 8 * m;
+        }
+
+        if (n & 2) {
+            vl = 4;   /* 2 complex values = 4 FLOATs */
+
+            v0 = VLEV_FLOAT(aoffset1, vl);
+            VSEV_FLOAT(boffset2, v0, vl);
+
+            aoffset1 += 4;
+            //boffset2 += 4;
+        }
+
+        if (n & 1) {   /* final complex value copied with scalar moves */
+           *(boffset3) = *(aoffset1);
+           *(boffset3 + 1) = *(aoffset1 + 1);
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/zgemm_tcopy_rvv_v1.c b/kernel/riscv64/zgemm_tcopy_rvv_v1.c
new file mode 100644
index 000000000..96e986502
--- /dev/null
+++ b/kernel/riscv64/zgemm_tcopy_rvv_v1.c
@@ -0,0 +1,74 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)   /* single precision: 32-bit elements, LMUL=2 */
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_VX2_T             vfloat32m2x2_t
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2x2   /* unit-stride 2-field segment load */
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2   /* unit-stride 2-field segment store */
+#else                  /* double precision: same table with 64-bit elements */
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_VX2_T             vfloat64m2x2_t
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
+{
+    /*
+     * Copies an m-row, n-wide complex block of a into b.  The width is
+     * split into panels of vl complex elements; each panel is emitted
+     * row by row, so b receives m runs of vl (re, im) pairs per panel.
+     * Consecutive rows of a are lda complex elements apart.
+     */
+    IFLOAT *panel_base = a;   /* row 0 of the current panel */
+    IFLOAT *dst = b;          /* next free slot in the packed buffer */
+    BLASLONG width_left = n;
+
+    while (width_left > 0) {
+        /* Panel width in complex elements. */
+        const size_t vl = VSETVL(width_left);
+        IFLOAT *src = panel_base;
+
+        for (BLASLONG row = 0; row < m; row++) {
+            /* Contiguous load of vl complex values, split into real and
+             * imaginary vectors, then stored back interleaved. */
+            FLOAT_VX2_T pair = VLSEG2_FLOAT(src, vl);
+            VSSEG2_FLOAT(dst, pair, vl);
+
+            src += lda * 2;
+            dst += vl * 2;
+        }
+
+        panel_base += vl * 2;
+        width_left -= vl;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/zgemmkernel_rvv_v1x4.c b/kernel/riscv64/zgemmkernel_rvv_v1x4.c
new file mode 100644
index 000000000..77e012ff5
--- /dev/null
+++ b/kernel/riscv64/zgemmkernel_rvv_v1x4.c
@@ -0,0 +1,553 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)   /* single precision: 32-bit elements, LMUL=2 */
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define FLOAT_V_T               vfloat32m2_t
+#define FLOAT_VX2_T             vfloat32m2x2_t
+#define VGET_VX2                __riscv_vget_v_f32m2x2_f32m2   /* extract one field of a 2-tuple */
+#define VSET_VX2                __riscv_vset_v_f32m2_f32m2x2   /* insert one field into a 2-tuple */
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2x2    /* unit-stride 2-field segment load */
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2    /* unit-stride 2-field segment store */
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m2         /* broadcast a scalar */
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m2        /* acc += scalar * vector */
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m2       /* acc -= scalar * vector */
+#else                  /* double precision: same table with 64-bit elements */
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define FLOAT_VX2_T             vfloat64m2x2_t
+#define VGET_VX2                __riscv_vget_v_f64m2x2_f64m2
+#define VSET_VX2                __riscv_vset_v_f64m2_f64m2x2
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m2
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m2
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m2
+#endif
+/* OP_xy: x = part of the B scalar, y = part of the A vector (r = real, i = imaginary); add vs. subtract depends on the conjugation variant. */
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#define OP_rr       VFMACCVF_FLOAT
+#define OP_ir       VFMACCVF_FLOAT
+#define OP_ii       VFNMSACVF_FLOAT
+#define OP_ri       VFMACCVF_FLOAT
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
+#define OP_rr       VFMACCVF_FLOAT
+#define OP_ir       VFMACCVF_FLOAT
+#define OP_ii       VFMACCVF_FLOAT
+#define OP_ri       VFNMSACVF_FLOAT
+#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
+#define OP_rr       VFMACCVF_FLOAT
+#define OP_ir       VFNMSACVF_FLOAT
+#define OP_ii       VFMACCVF_FLOAT
+#define OP_ri       VFMACCVF_FLOAT
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
+#define OP_rr       VFMACCVF_FLOAT
+#define OP_ir       VFNMSACVF_FLOAT
+#define OP_ii       VFNMSACVF_FLOAT
+#define OP_ri       VFNMSACVF_FLOAT
+#endif
+
+int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
+#ifdef	TRMMKERNEL
+		, BLASLONG offset
+#endif
+		)
+{
+    BLASLONG i,j,k;
+    FLOAT *C0, *C1, *C2, *C3, *ptrba,*ptrbb;
+
+    FLOAT_VX2_T vax2;
+    FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7;
+    FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7;
+
+    //fprintf(stderr, "%s, bn=%ld bm=%ld bk=%ld alphar=%f alphai=%f ldc=%ld\n", __FUNCTION__, bn, bm, bk, alphar, alphai, ldc); // Debug
+
+    size_t vl;
+    for (j = bn/4; j > 0; j--)
+    {
+        C0 = C;
+        C1 = C0 + 2 * ldc;
+        C2 = C1 + 2 * ldc;
+        C3 = C2 + 2 * ldc;
+        ptrba = ba;
+        for (i = bm; i > 0; i -= vl)
+        {
+            vl = VSETVL(i);
+            ptrbb = bb;
+
+            vres0 = VFMVVF_FLOAT(0.0, vl);
+            vres1 = VFMVVF_FLOAT(0.0, vl);
+            vres2 = VFMVVF_FLOAT(0.0, vl);
+            vres3 = VFMVVF_FLOAT(0.0, vl);
+            vres4 = VFMVVF_FLOAT(0.0, vl);
+            vres5 = VFMVVF_FLOAT(0.0, vl);
+            vres6 = VFMVVF_FLOAT(0.0, vl);
+            vres7 = VFMVVF_FLOAT(0.0, vl);
+
+            for (k = bk/4; k > 0; k--)
+            {
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va0 = VGET_VX2(vax2, 0);
+                va1 = VGET_VX2(vax2, 1);
+                ptrba += vl*2;
+
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va2 = VGET_VX2(vax2, 0);
+                va3 = VGET_VX2(vax2, 1);
+                ptrba += vl*2;
+
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va1, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va1, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va0, vl);
+
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va0, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va1, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va1, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va0, vl);
+
+                vres4 =  OP_rr(vres4, *(ptrbb + 4), va0, vl);
+                vres5 =  OP_ir(vres5, *(ptrbb + 4), va1, vl);
+                vres4 =  OP_ii(vres4, *(ptrbb + 5), va1, vl);
+                vres5 =  OP_ri(vres5, *(ptrbb + 5), va0, vl);
+
+                vres6 =  OP_rr(vres6, *(ptrbb + 6), va0, vl);
+                vres7 =  OP_ir(vres7, *(ptrbb + 6), va1, vl);
+                vres6 =  OP_ii(vres6, *(ptrbb + 7), va1, vl);
+                vres7 =  OP_ri(vres7, *(ptrbb + 7), va0, vl);
+
+                ptrbb += 8;
+
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va4 = VGET_VX2(vax2, 0);
+                va5 = VGET_VX2(vax2, 1);
+                ptrba += vl*2;
+
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va2, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va3, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va3, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va2, vl);
+                
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va2, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va3, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va3, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va2, vl);
+                
+                vres4 =  OP_rr(vres4, *(ptrbb + 4), va2, vl);
+                vres5 =  OP_ir(vres5, *(ptrbb + 4), va3, vl);
+                vres4 =  OP_ii(vres4, *(ptrbb + 5), va3, vl);
+                vres5 =  OP_ri(vres5, *(ptrbb + 5), va2, vl);
+                
+                vres6 =  OP_rr(vres6, *(ptrbb + 6), va2, vl);
+                vres7 =  OP_ir(vres7, *(ptrbb + 6), va3, vl);
+                vres6 =  OP_ii(vres6, *(ptrbb + 7), va3, vl);
+                vres7 =  OP_ri(vres7, *(ptrbb + 7), va2, vl);
+                
+                ptrbb += 8;
+
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va6 = VGET_VX2(vax2, 0);
+                va7 = VGET_VX2(vax2, 1);
+                ptrba += vl*2;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va4, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va5, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va5, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va4, vl);
+                
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va4, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va5, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va5, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va4, vl);
+                
+                vres4 =  OP_rr(vres4, *(ptrbb + 4), va4, vl);
+                vres5 =  OP_ir(vres5, *(ptrbb + 4), va5, vl);
+                vres4 =  OP_ii(vres4, *(ptrbb + 5), va5, vl);
+                vres5 =  OP_ri(vres5, *(ptrbb + 5), va4, vl);
+                
+                vres6 =  OP_rr(vres6, *(ptrbb + 6), va4, vl);
+                vres7 =  OP_ir(vres7, *(ptrbb + 6), va5, vl);
+                vres6 =  OP_ii(vres6, *(ptrbb + 7), va5, vl);
+                vres7 =  OP_ri(vres7, *(ptrbb + 7), va4, vl);
+                ptrbb += 8;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va6, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va7, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va7, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va6, vl);
+                
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va6, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va7, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va7, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va6, vl);
+                
+                vres4 =  OP_rr(vres4, *(ptrbb + 4), va6, vl);
+                vres5 =  OP_ir(vres5, *(ptrbb + 4), va7, vl);
+                vres4 =  OP_ii(vres4, *(ptrbb + 5), va7, vl);
+                vres5 =  OP_ri(vres5, *(ptrbb + 5), va6, vl);
+                
+                vres6 =  OP_rr(vres6, *(ptrbb + 6), va6, vl);
+                vres7 =  OP_ir(vres7, *(ptrbb + 6), va7, vl);
+                vres6 =  OP_ii(vres6, *(ptrbb + 7), va7, vl);
+                vres7 =  OP_ri(vres7, *(ptrbb + 7), va6, vl);
+
+                ptrbb += 8;
+            }
+
+            for (k = (bk & 3); k > 0; k--)
+            {
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va0 = VGET_VX2(vax2, 0);
+                va1 = VGET_VX2(vax2, 1);
+                ptrba += vl*2;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va1, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va1, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va0, vl);
+                
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va0, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va1, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va1, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va0, vl);
+                
+                vres4 =  OP_rr(vres4, *(ptrbb + 4), va0, vl);
+                vres5 =  OP_ir(vres5, *(ptrbb + 4), va1, vl);
+                vres4 =  OP_ii(vres4, *(ptrbb + 5), va1, vl);
+                vres5 =  OP_ri(vres5, *(ptrbb + 5), va0, vl);
+                
+                vres6 =  OP_rr(vres6, *(ptrbb + 6), va0, vl);
+                vres7 =  OP_ir(vres7, *(ptrbb + 6), va1, vl);
+                vres6 =  OP_ii(vres6, *(ptrbb + 7), va1, vl);
+                vres7 =  OP_ri(vres7, *(ptrbb + 7), va0, vl);
+
+                ptrbb += 8;
+            }
+
+            vax2 = VLSEG2_FLOAT(C0, vl);
+            va0 = VGET_VX2(vax2, 0);
+            va1 = VGET_VX2(vax2, 1);
+
+            vax2 = VLSEG2_FLOAT(C1, vl);
+            va2 = VGET_VX2(vax2, 0);
+            va3 = VGET_VX2(vax2, 1);
+
+            va0 =  VFMACCVF_FLOAT(va0, alphar, vres0, vl);
+            va1 =  VFMACCVF_FLOAT(va1, alphar, vres1, vl);
+            va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl);
+            va1 =  VFMACCVF_FLOAT(va1, alphai, vres0, vl);
+
+            vax2 = VSET_VX2(vax2, 0, va0);
+            vax2 = VSET_VX2(vax2, 1, va1);
+            VSSEG2_FLOAT(C0, vax2, vl);
+
+            va2 =  VFMACCVF_FLOAT(va2, alphar, vres2, vl);
+            va3 =  VFMACCVF_FLOAT(va3, alphar, vres3, vl);
+            va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl);
+            va3 =  VFMACCVF_FLOAT(va3, alphai, vres2, vl);
+
+            vax2 = VSET_VX2(vax2, 0, va2);
+            vax2 = VSET_VX2(vax2, 1, va3);
+            VSSEG2_FLOAT(C1, vax2, vl);
+
+            vax2 = VLSEG2_FLOAT(C2, vl);
+            va0 = VGET_VX2(vax2, 0);
+            va1 = VGET_VX2(vax2, 1);
+
+            vax2 = VLSEG2_FLOAT(C3, vl);
+            va2 = VGET_VX2(vax2, 0);
+            va3 = VGET_VX2(vax2, 1);
+
+            va0 =  VFMACCVF_FLOAT(va0, alphar, vres4, vl);
+            va1 =  VFMACCVF_FLOAT(va1, alphar, vres5, vl);
+            va0 = VFNMSACVF_FLOAT(va0, alphai, vres5, vl);
+            va1 =  VFMACCVF_FLOAT(va1, alphai, vres4, vl);
+
+            vax2 = VSET_VX2(vax2, 0, va0);
+            vax2 = VSET_VX2(vax2, 1, va1);
+            VSSEG2_FLOAT(C2, vax2, vl);
+
+            va2 =  VFMACCVF_FLOAT(va2, alphar, vres6, vl);
+            va3 =  VFMACCVF_FLOAT(va3, alphar, vres7, vl);
+            va2 = VFNMSACVF_FLOAT(va2, alphai, vres7, vl);
+            va3 =  VFMACCVF_FLOAT(va3, alphai, vres6, vl);
+
+            vax2 = VSET_VX2(vax2, 0, va2);
+            vax2 = VSET_VX2(vax2, 1, va3);
+            VSSEG2_FLOAT(C3, vax2, vl);
+
+            C0 += vl * 2;
+            C1 += vl * 2;
+            C2 += vl * 2;
+            C3 += vl * 2;
+        }
+
+        bb += (bk << 3);
+        C  += (ldc << 3);
+    }
+
+    if (bn & 2)
+    {
+        C0 = C;
+        C1 = C0 + 2 * ldc;
+        ptrba = ba;
+        for (i = bm; i > 0; i -= vl)
+        {
+            vl = VSETVL(i);
+            ptrbb = bb;
+
+            vres0 = VFMVVF_FLOAT(0.0, vl);
+            vres1 = VFMVVF_FLOAT(0.0, vl);
+            vres2 = VFMVVF_FLOAT(0.0, vl);
+            vres3 = VFMVVF_FLOAT(0.0, vl);
+
+            for (k = bk/4; k > 0; k--)
+            {
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va0 = VGET_VX2(vax2, 0);
+                va1 = VGET_VX2(vax2, 1);
+                ptrba += vl*2;
+
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va2 = VGET_VX2(vax2, 0);
+                va3 = VGET_VX2(vax2, 1);
+                ptrba += vl*2;
+
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va1, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va1, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va0, vl);
+
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va0, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va1, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va1, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va0, vl);
+
+                ptrbb += 4;
+
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va4 = VGET_VX2(vax2, 0);
+                va5 = VGET_VX2(vax2, 1);
+                ptrba += vl*2;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va2, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va3, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va3, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va2, vl);
+                
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va2, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va3, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va3, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va2, vl);
+                
+                ptrbb += 4;
+
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va6 = VGET_VX2(vax2, 0);
+                va7 = VGET_VX2(vax2, 1);
+                ptrba += vl*2;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va4, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va5, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va5, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va4, vl);
+                
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va4, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va5, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va5, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va4, vl);
+                
+                ptrbb += 4;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va6, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va7, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va7, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va6, vl);
+                
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va6, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va7, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va7, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va6, vl);
+                
+                ptrbb += 4;
+            }
+
+            for (k = (bk & 3); k > 0; k--)
+            {
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va0 = VGET_VX2(vax2, 0);
+                va1 = VGET_VX2(vax2, 1);
+                ptrba += vl*2;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va1, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va1, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va0, vl);
+                
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va0, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va1, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va1, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va0, vl);
+                
+                ptrbb += 4;
+            }
+
+            vax2 = VLSEG2_FLOAT(C0, vl);
+            va0 = VGET_VX2(vax2, 0);
+            va1 = VGET_VX2(vax2, 1);
+
+            vax2 = VLSEG2_FLOAT(C1, vl);
+            va2 = VGET_VX2(vax2, 0);
+            va3 = VGET_VX2(vax2, 1);
+
+            va0 =  VFMACCVF_FLOAT(va0, alphar, vres0, vl);
+            va1 =  VFMACCVF_FLOAT(va1, alphar, vres1, vl);
+            va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl);
+            va1 =  VFMACCVF_FLOAT(va1, alphai, vres0, vl);
+
+            vax2 = VSET_VX2(vax2, 0, va0);
+            vax2 = VSET_VX2(vax2, 1, va1);
+            VSSEG2_FLOAT(C0, vax2, vl);
+
+            va2 =  VFMACCVF_FLOAT(va2, alphar, vres2, vl);
+            va3 =  VFMACCVF_FLOAT(va3, alphar, vres3, vl);
+            va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl);
+            va3 =  VFMACCVF_FLOAT(va3, alphai, vres2, vl);
+
+            vax2 = VSET_VX2(vax2, 0, va2);
+            vax2 = VSET_VX2(vax2, 1, va3);
+            VSSEG2_FLOAT(C1, vax2, vl);
+
+            C0 += vl * 2;
+            C1 += vl * 2;
+        }
+
+        bb += (bk << 2);
+        C  += (ldc << 2);
+    }
+
+    if (bn & 1)
+    {
+        C0 = C;
+        ptrba = ba;
+        for (i = bm; i > 0; i -= vl)
+        {
+            vl = VSETVL(i);
+            ptrbb = bb;
+
+            vres0 = VFMVVF_FLOAT(0.0, vl);
+            vres1 = VFMVVF_FLOAT(0.0, vl);
+
+            for (k = bk/4; k > 0; k--)
+            {
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va0 = VGET_VX2(vax2, 0);
+                va1 = VGET_VX2(vax2, 1);
+                ptrba += vl*2;
+
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va2 = VGET_VX2(vax2, 0);
+                va3 = VGET_VX2(vax2, 1);
+                ptrba += vl*2;
+
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va1, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va1, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va0, vl);
+                ptrbb += 2;
+
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va4 = VGET_VX2(vax2, 0);
+                va5 = VGET_VX2(vax2, 1);
+                ptrba += vl*2;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va2, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va3, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va3, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va2, vl);
+
+                ptrbb += 2;
+
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va6 = VGET_VX2(vax2, 0);
+                va7 = VGET_VX2(vax2, 1);
+                ptrba += vl*2;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va4, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va5, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va5, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va4, vl);
+                ptrbb += 2;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va6, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va7, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va7, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va6, vl);
+                ptrbb += 2;
+            }
+
+            for (k = (bk & 3); k > 0; k--)
+            {
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va0 = VGET_VX2(vax2, 0);
+                va1 = VGET_VX2(vax2, 1);
+                ptrba += vl*2;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va1, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va1, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va0, vl);
+                ptrbb += 2;
+            }
+            
+            vax2 = VLSEG2_FLOAT(C0, vl);
+            va0 = VGET_VX2(vax2, 0);
+            va1 = VGET_VX2(vax2, 1);
+
+            va0 =  VFMACCVF_FLOAT(va0, alphar, vres0, vl);
+            va1 =  VFMACCVF_FLOAT(va1, alphar, vres1, vl);
+            va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl);
+            va1 =  VFMACCVF_FLOAT(va1, alphai, vres0, vl);
+
+            vax2 = VSET_VX2(vax2, 0, va0);
+            vax2 = VSET_VX2(vax2, 1, va1);
+            VSSEG2_FLOAT(C0, vax2, vl);
+            C0 += vl * 2;
+        }
+
+        bb += bk << 1;
+        C  += ldc << 1;
+   }
+   return 0;
+}
+
diff --git a/kernel/riscv64/zgemv_n_rvv.c b/kernel/riscv64/zgemv_n_rvv.c
new file mode 100644
index 000000000..f14ef5ba8
--- /dev/null
+++ b/kernel/riscv64/zgemv_n_rvv.c
@@ -0,0 +1,192 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)  /* single precision: e32 elements, m4 register groups */
+#define VSETVL(n)               __riscv_vsetvl_e32m4(n)  /* vl granted for n remaining elements */
+#define FLOAT_V_T               vfloat32m4_t
+#define FLOAT_VX2_T             vfloat32m4x2_t  /* tuple of two vectors; holds (re, im) here */
+#define VGET_VX2                __riscv_vget_v_f32m4x2_f32m4  /* extract one vector of a tuple */
+#define VSET_VX2                __riscv_vset_v_f32m4_f32m4x2  /* replace one vector of a tuple */
+#define VLEV_FLOAT              __riscv_vle32_v_f32m4  /* not referenced in this file */
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m4  /* not referenced in this file */
+#define VSEV_FLOAT              __riscv_vse32_v_f32m4  /* not referenced in this file */
+#define VSSEV_FLOAT             __riscv_vsse32_v_f32m4  /* not referenced in this file */
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4x2  /* unit-stride 2-field segment load */
+#define VSSEG_FLOAT             __riscv_vsseg2e32_v_f32m4x2  /* unit-stride 2-field segment store */
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4x2  /* strided 2-field segment load */
+#define VSSSEG_FLOAT            __riscv_vssseg2e32_v_f32m4x2  /* strided 2-field segment store */
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m4  /* vd += scalar * vs */
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m4  /* vd -= scalar * vs */
+#else  /* double precision: same table with e64 elements */
+#define VSETVL(n)               __riscv_vsetvl_e64m4(n)  /* vl granted for n remaining elements */
+#define FLOAT_V_T               vfloat64m4_t
+#define FLOAT_VX2_T             vfloat64m4x2_t  /* tuple of two vectors; holds (re, im) here */
+#define VGET_VX2                __riscv_vget_v_f64m4x2_f64m4  /* extract one vector of a tuple */
+#define VSET_VX2                __riscv_vset_v_f64m4_f64m4x2  /* replace one vector of a tuple */
+#define VLEV_FLOAT              __riscv_vle64_v_f64m4  /* not referenced in this file */
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m4  /* not referenced in this file */
+#define VSEV_FLOAT              __riscv_vse64_v_f64m4  /* not referenced in this file */
+#define VSSEV_FLOAT             __riscv_vsse64_v_f64m4  /* not referenced in this file */
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4x2  /* unit-stride 2-field segment load */
+#define VSSEG_FLOAT             __riscv_vsseg2e64_v_f64m4x2  /* unit-stride 2-field segment store */
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4x2  /* strided 2-field segment load */
+#define VSSSEG_FLOAT            __riscv_vssseg2e64_v_f64m4x2  /* strided 2-field segment store */
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m4  /* vd += scalar * vs */
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m4  /* vd -= scalar * vs */
+#endif  /* !defined(DOUBLE) */
+
+/* Complex GEMV kernel: accumulates alpha * A * x into y for an m-row, n-column
+ * A; the CONJ and XCONJ build flags select which factors enter conjugated.
+ * a, x and y hold interleaved (re, im) pairs; lda, inc_x and inc_y count
+ * complex elements.  Rows are processed in strips of vl elements, and every
+ * strip walks all n columns.  dummy1 and buffer are unused; returns 0. */
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+    BLASLONG i, ix;
+    FLOAT *a_ptr;
+    FLOAT temp_r, temp_i;
+    FLOAT_V_T va0, va1, vy0, vy1;
+    FLOAT_VX2_T vax2, vyx2;
+    /* Signed byte distance between consecutive complex elements of y. */
+    BLASLONG stride_y = inc_y * (BLASLONG)(sizeof(FLOAT) * 2);
+    BLASLONG inc_x2 = inc_x * 2;
+    BLASLONG lda2 = lda * 2;
+    if (inc_y == 1)
+    {
+        for (size_t vl; m > 0; m -= vl, a += vl*2, y += vl*2) {
+            vl = VSETVL(m);
+            a_ptr = a;
+            ix = 0;
+            /* Contiguous y: the segment load splits it into re and im vectors. */
+            vyx2 = VLSEG_FLOAT(y, vl);
+            vy0 = VGET_VX2(vyx2, 0);
+            vy1 = VGET_VX2(vyx2, 1);
+
+            for(i = 0; i < n; i++){
+#if !defined(XCONJ)  /* temp = alpha * (i-th x element) */
+                temp_r = alpha_r * x[ix]   - alpha_i * x[ix+1];
+                temp_i = alpha_r * x[ix+1] + alpha_i * x[ix];
+#else  /* temp = conj(alpha) * (i-th x element) */
+                temp_r = alpha_r * x[ix]   + alpha_i * x[ix+1];
+                temp_i = alpha_r * x[ix+1] - alpha_i * x[ix];
+#endif
+                vax2 = VLSEG_FLOAT(a_ptr, vl);
+                va0 = VGET_VX2(vax2, 0);
+                va1 = VGET_VX2(vax2, 1);
+#if !defined(CONJ)
+#if !defined(XCONJ)  /* y += temp * a */
+                vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl);
+                vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl);
+                vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl);
+                vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl);
+#else  /* y += conj(temp) * a */
+                vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl);
+                vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl);
+                vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl);
+                vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl);
+#endif
+#else
+#if !defined(XCONJ)  /* y += temp * conj(a) */
+                vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl);
+                vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl);
+                vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl);
+                vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl);
+#else  /* y += conj(temp) * conj(a) */
+                vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl);
+                vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl);
+                vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl);
+                vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl);
+#endif
+#endif
+                a_ptr += lda2;
+                ix += inc_x2;
+            }
+
+            vyx2 = VSET_VX2(vyx2, 0, vy0);
+            vyx2 = VSET_VX2(vyx2, 1, vy1);
+            VSSEG_FLOAT(y, vyx2, vl);
+        }
+    }
+    else
+    {
+        for (size_t vl; m > 0; m -= vl, a += vl*2, y += (BLASLONG)vl*inc_y*2) {
+            vl = VSETVL(m);
+            a_ptr = a;
+            ix = 0;
+            /* Strided y; the pointer step above is signed so inc_y < 0 works. */
+            vyx2 = VLSSEG_FLOAT(y, stride_y, vl);
+            vy0 = VGET_VX2(vyx2, 0);
+            vy1 = VGET_VX2(vyx2, 1);
+
+            for(i = 0; i < n; i++){
+#if !defined(XCONJ)  /* temp = alpha * (i-th x element) */
+                temp_r = alpha_r * x[ix]   - alpha_i * x[ix+1];
+                temp_i = alpha_r * x[ix+1] + alpha_i * x[ix];
+#else  /* temp = conj(alpha) * (i-th x element) */
+                temp_r = alpha_r * x[ix]   + alpha_i * x[ix+1];
+                temp_i = alpha_r * x[ix+1] - alpha_i * x[ix];
+#endif
+                vax2 = VLSEG_FLOAT(a_ptr, vl);
+                va0 = VGET_VX2(vax2, 0);
+                va1 = VGET_VX2(vax2, 1);
+#if !defined(CONJ)
+#if !defined(XCONJ)  /* y += temp * a */
+                vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl);
+                vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl);
+                vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl);
+                vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl);
+#else  /* y += conj(temp) * a */
+                vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl);
+                vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl);
+                vy1 = VFMACCVF_FLOAT(vy1, temp_r, va1, vl);
+                vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl);
+#endif
+#else
+#if !defined(XCONJ)  /* y += temp * conj(a) */
+                vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl);
+                vy0 = VFMACCVF_FLOAT(vy0, temp_i, va1, vl);
+                vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl);
+                vy1 = VFMACCVF_FLOAT(vy1, temp_i, va0, vl);
+#else  /* y += conj(temp) * conj(a) */
+                vy0 = VFMACCVF_FLOAT(vy0, temp_r, va0, vl);
+                vy0 = VFNMSACVF_FLOAT(vy0, temp_i, va1, vl);
+                vy1 = VFNMSACVF_FLOAT(vy1, temp_r, va1, vl);
+                vy1 = VFNMSACVF_FLOAT(vy1, temp_i, va0, vl);
+#endif
+#endif
+                a_ptr += lda2;
+                ix += inc_x2;
+            }
+            vyx2 = VSET_VX2(vyx2, 0, vy0);
+            vyx2 = VSET_VX2(vyx2, 1, vy1);
+            VSSSEG_FLOAT(y, stride_y, vyx2, vl);
+        }
+    }
+    return(0);
+}
diff --git a/kernel/riscv64/zgemv_n_vector.c b/kernel/riscv64/zgemv_n_vector.c
index b5ee1f054..104d3865d 100644
--- a/kernel/riscv64/zgemv_n_vector.c
+++ b/kernel/riscv64/zgemv_n_vector.c
@@ -27,23 +27,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
 #define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSEV_FLOAT vse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
+#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
+#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4)
+#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4)
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
 #define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSEV_FLOAT vse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
+#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
+#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
+#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4)
 #endif
 
 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
diff --git a/kernel/riscv64/zgemv_t_rvv.c b/kernel/riscv64/zgemv_t_rvv.c
new file mode 100644
index 000000000..1c89a9f72
--- /dev/null
+++ b/kernel/riscv64/zgemv_t_rvv.c
@@ -0,0 +1,187 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)  /* single precision: e32 elements, m4 register groups */
+#define VSETVL(n)               __riscv_vsetvl_e32m4(n)  /* vl granted for n remaining elements */
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e32m1()  /* maximum vl of a single (m1) register */
+#define FLOAT_V_T               vfloat32m4_t
+#define FLOAT_V_T_M1            vfloat32m1_t  /* single-register type used for reduction results */
+#define FLOAT_VX2_T             vfloat32m4x2_t  /* tuple of two vectors; holds (re, im) here */
+#define VGET_VX2                __riscv_vget_v_f32m4x2_f32m4  /* extract one vector of a tuple */
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4x2  /* unit-stride 2-field segment load */
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4x2  /* strided 2-field segment load */
+#define VFREDSUM_FLOAT_TU       __riscv_vfredusum_vs_f32m4_f32m1_tu  /* unordered sum reduction, tail undisturbed */
+#define VFMACCVV_FLOAT_TU       __riscv_vfmacc_vv_f32m4_tu  /* vd += vs1 * vs2, tail undisturbed */
+#define VFNMSACVV_FLOAT_TU      __riscv_vfnmsac_vv_f32m4_tu  /* vd -= vs1 * vs2, tail undisturbed */
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4  /* broadcast a scalar */
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1  /* broadcast a scalar (m1) */
+#define VFMULVV_FLOAT           __riscv_vfmul_vv_f32m4  /* not referenced in this file */
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32  /* read element 0 as a scalar */
+#else  /* double precision: same table with e64 elements */
+#define VSETVL(n)               __riscv_vsetvl_e64m4(n)  /* vl granted for n remaining elements */
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e64m1()  /* maximum vl of a single (m1) register */
+#define FLOAT_V_T               vfloat64m4_t
+#define FLOAT_V_T_M1            vfloat64m1_t  /* single-register type used for reduction results */
+#define FLOAT_VX2_T             vfloat64m4x2_t  /* tuple of two vectors; holds (re, im) here */
+#define VGET_VX2                __riscv_vget_v_f64m4x2_f64m4  /* extract one vector of a tuple */
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4x2  /* unit-stride 2-field segment load */
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4x2  /* strided 2-field segment load */
+#define VFREDSUM_FLOAT_TU       __riscv_vfredusum_vs_f64m4_f64m1_tu  /* unordered sum reduction, tail undisturbed */
+#define VFMACCVV_FLOAT_TU       __riscv_vfmacc_vv_f64m4_tu  /* vd += vs1 * vs2, tail undisturbed */
+#define VFNMSACVV_FLOAT_TU      __riscv_vfnmsac_vv_f64m4_tu  /* vd -= vs1 * vs2, tail undisturbed */
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4  /* broadcast a scalar */
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1  /* broadcast a scalar (m1) */
+#define VFMULVV_FLOAT           __riscv_vfmul_vv_f64m4  /* not referenced in this file */
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64  /* read element 0 as a scalar */
+#endif  /* !defined(DOUBLE) */
+
+/* Transposed complex GEMV kernel: for each of the n columns of A, forms the
+ * m-term dot product of that column with x and adds it, scaled by alpha, to
+ * the matching y element.  The CONJ and XCONJ build flags choose whether the
+ * column and/or the dot product enter conjugated.  a, x and y hold
+ * interleaved (re, im) pairs; lda, inc_x and inc_y count complex elements.
+ * dummy1 and buffer are unused; returns 0. */
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
+{
+    BLASLONG i = 0, j = 0, ix = 0, iy = 0;
+    FLOAT *a_ptr = a;
+    FLOAT temp_r, temp_i;
+    FLOAT_V_T va0, va1, vx0, vx1, vr, vi;
+    FLOAT_V_T_M1 v_res, v_z0;
+    FLOAT_VX2_T vxx2, vax2;
+    /* Signed byte distance between consecutive complex elements of x. */
+    BLASLONG stride_x = inc_x * (BLASLONG)(sizeof(FLOAT) * 2);
+    BLASLONG inc_y2 = inc_y * 2;
+    BLASLONG lda2 = lda * 2;
+    /* v_z0 stays zero and seeds every reduction; v_res receives the sums. */
+    size_t vlmax = VSETVL_MAX_M1;
+    v_res = VFMVVF_FLOAT_M1(0, vlmax);
+    v_z0 = VFMVVF_FLOAT_M1(0, vlmax);
+    /* From here on vlmax is the strip length granted for m elements. */
+    vlmax = VSETVL(m);
+
+    if (inc_x == 1)
+    {
+        for(i = 0; i < n; i++) {
+            /* vr/vi: per-lane partial sums of the dot product for column i. */
+            j = 0;
+            ix = 0;
+            vr = VFMVVF_FLOAT(0, vlmax);
+            vi = VFMVVF_FLOAT(0, vlmax);
+            for(size_t vl, k = m; k > 0; k -= vl) {
+                vl = VSETVL(k);
+                vax2 = VLSEG_FLOAT(&a_ptr[j], vl);
+                vxx2 = VLSEG_FLOAT(&x[ix], vl);
+                va0 = VGET_VX2(vax2, 0);
+                va1 = VGET_VX2(vax2, 1);
+                vx0 = VGET_VX2(vxx2, 0);
+                vx1 = VGET_VX2(vxx2, 1);
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )  /* sum += a * x */
+                vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl);
+                vr = VFNMSACVV_FLOAT_TU(vr, va1, vx1, vl);
+                vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl);
+                vi = VFMACCVV_FLOAT_TU(vi, va1, vx0, vl);
+#else  /* sum += conj(a) * x */
+                vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl);
+                vr = VFMACCVV_FLOAT_TU(vr, va1, vx1, vl);
+                vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl);
+                vi = VFNMSACVV_FLOAT_TU(vi, va1, vx0, vl);
+#endif
+                j += (BLASLONG)vl * 2;
+                ix += (BLASLONG)vl * 2;
+            }
+            /* Sum the first vlmax lanes of each accumulator into a scalar. */
+            v_res = VFREDSUM_FLOAT_TU(v_res, vr, v_z0, vlmax);
+            temp_r = VFMVFS_FLOAT_M1(v_res);
+            v_res = VFREDSUM_FLOAT_TU(v_res, vi, v_z0, vlmax);
+            temp_i = VFMVFS_FLOAT_M1(v_res);
+
+#if !defined(XCONJ)  /* y += alpha * temp */
+            y[iy]   += alpha_r * temp_r - alpha_i * temp_i;
+            y[iy+1] += alpha_r * temp_i + alpha_i * temp_r;
+#else  /* y += alpha * conj(temp) */
+            y[iy]   += alpha_r * temp_r + alpha_i * temp_i;
+            y[iy+1] -= alpha_r * temp_i - alpha_i * temp_r;
+#endif
+            iy += inc_y2;
+            a_ptr += lda2;
+        }
+    }
+    else
+    {
+        for(i = 0; i < n; i++) {
+            /* vr/vi: per-lane partial sums of the dot product for column i. */
+            j = 0;
+            ix = 0;
+            vr = VFMVVF_FLOAT(0, vlmax);
+            vi = VFMVVF_FLOAT(0, vlmax);
+            for(size_t vl, k = m; k > 0; k -= vl) {
+                vl = VSETVL(k);
+                vax2 = VLSEG_FLOAT(&a_ptr[j], vl);
+                vxx2 = VLSSEG_FLOAT(&x[ix], stride_x, vl);
+                va0 = VGET_VX2(vax2, 0);
+                va1 = VGET_VX2(vax2, 1);
+                vx0 = VGET_VX2(vxx2, 0);
+                vx1 = VGET_VX2(vxx2, 1);
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )  /* sum += a * x */
+                vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl);
+                vr = VFNMSACVV_FLOAT_TU(vr, va1, vx1, vl);
+                vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl);
+                vi = VFMACCVV_FLOAT_TU(vi, va1, vx0, vl);
+#else  /* sum += conj(a) * x */
+                vr = VFMACCVV_FLOAT_TU(vr, va0, vx0, vl);
+                vr = VFMACCVV_FLOAT_TU(vr, va1, vx1, vl);
+                vi = VFMACCVV_FLOAT_TU(vi, va0, vx1, vl);
+                vi = VFNMSACVV_FLOAT_TU(vi, va1, vx0, vl);
+#endif
+                j += (BLASLONG)vl * 2;
+                ix += (BLASLONG)vl * inc_x * 2;
+            }
+            /* Sum the first vlmax lanes of each accumulator into a scalar. */
+            v_res = VFREDSUM_FLOAT_TU(v_res, vr, v_z0, vlmax);
+            temp_r = VFMVFS_FLOAT_M1(v_res);
+            v_res = VFREDSUM_FLOAT_TU(v_res, vi, v_z0, vlmax);
+            temp_i = VFMVFS_FLOAT_M1(v_res);
+
+#if !defined(XCONJ)  /* y += alpha * temp */
+            y[iy]   += alpha_r * temp_r - alpha_i * temp_i;
+            y[iy+1] += alpha_r * temp_i + alpha_i * temp_r;
+#else  /* y += alpha * conj(temp) */
+            y[iy]   += alpha_r * temp_r + alpha_i * temp_i;
+            y[iy+1] -= alpha_r * temp_i - alpha_i * temp_r;
+#endif
+            iy += inc_y2;
+            a_ptr += lda2;
+        }
+    }
+
+    return(0);
+}
diff --git a/kernel/riscv64/zgemv_t_vector.c b/kernel/riscv64/zgemv_t_vector.c
index e930dc2a2..5d85ab3a4 100644
--- a/kernel/riscv64/zgemv_t_vector.c
+++ b/kernel/riscv64/zgemv_t_vector.c
@@ -27,31 +27,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m4_t
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m2)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)()
+#define FLOAT_V_T vfloat32m2_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT vfmacc_vv_f32m4
-#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMULVV_FLOAT vfmul_vv_f32m4
+#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f32m1_f32)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m2)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUM_FLOAT(vr, va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m2_f32m1)(vr, va, vb, gvl)
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m4_t
+#define VFREDSUM_FLOAT(vr, va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m2_f32m1)(va, vb, gvl)
+#endif
+#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m2)
+#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f32m2)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m2)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
+#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m2)
+#else
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m2)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)()
+#define FLOAT_V_T vfloat64m2_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT vfmacc_vv_f64m4
-#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMULVV_FLOAT vfmul_vv_f64m4
+#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f64m1_f64)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m2)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUM_FLOAT(vr, va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m2_f64m1)(vr, va, vb, gvl)
+#else
+#define VFREDSUM_FLOAT(vr, va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m2_f64m1)(va, vb, gvl)
+#endif
+#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m2)
+#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m2)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m2)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
+#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m2)
 #endif
 
 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
@@ -62,49 +70,43 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
         FLOAT temp_r, temp_i;
 
         FLOAT_V_T va0, va1, vx0, vx1, vr, vi;
-        unsigned int gvl = 0;
-        FLOAT_V_T_M1 v_res, v_z0;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_z0 = VFMVVF_FLOAT_M1(0, gvl);
-
+        unsigned int gvl = VSETVL(m);
+        FLOAT_V_T_M1 v_res_r, v_res_i;
         BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
         BLASLONG stride_a = sizeof(FLOAT) * 2;
-        gvl = VSETVL(m);
         BLASLONG inc_xv = inc_x * gvl * 2;
         BLASLONG inc_av = gvl * 2;
         BLASLONG inc_y2 = inc_y * 2;
         BLASLONG lda2 = lda * 2;
+
         for(i = 0; i < n; i++){
+                v_res_r = VFMVVF_FLOAT_M1(0, 1);
+                v_res_i = VFMVVF_FLOAT_M1(0, 1);
                 gvl = VSETVL(m);
                 j = 0;
                 ix = 0;
-                vr = VFMVVF_FLOAT(0, gvl);
-                vi = VFMVVF_FLOAT(0, gvl);
                 for(k = 0; k < m/gvl; k++){
                         va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl);
                         va1 = VLSEV_FLOAT(&a_ptr[j+1], stride_a, gvl);
                         vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
                         vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
-                        vr = VFMACCVV_FLOAT(vr, va0, vx0, gvl);
+                        vr = VFMULVV_FLOAT(va0, vx0, gvl);
+                        vi = VFMULVV_FLOAT(va0, vx1, gvl);
                         vr = VFNMSACVV_FLOAT(vr, va1, vx1, gvl);
-                        vi = VFMACCVV_FLOAT(vi, va0, vx1, gvl);
                         vi = VFMACCVV_FLOAT(vi, va1, vx0, gvl);
 #else
-                        vr = VFMACCVV_FLOAT(vr, va0, vx0, gvl);
+                        vr = VFMULVV_FLOAT(va0, vx0, gvl);
+                        vi = VFMULVV_FLOAT(va0, vx1, gvl);
                         vr = VFMACCVV_FLOAT(vr, va1, vx1, gvl);
-                        vi = VFMACCVV_FLOAT(vi, va0, vx1, gvl);
                         vi = VFNMSACVV_FLOAT(vi, va1, vx0, gvl);
-
 #endif
+                        v_res_r = VFREDSUM_FLOAT(v_res_r, vr, v_res_r, gvl);
+                        v_res_i = VFREDSUM_FLOAT(v_res_i, vi, v_res_i, gvl);
+
                         j += inc_av;
                         ix += inc_xv;
                 }
-                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                temp_r = VFMVFS_FLOAT(v_res);
-                v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, gvl);
-                temp_i = VFMVFS_FLOAT(v_res);
                 if(j/2 < m){
                         gvl = VSETVL(m-j/2);
                         va0 = VLSEV_FLOAT(&a_ptr[j], stride_a, gvl);
@@ -113,21 +115,23 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
                         vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
                         vr = VFMULVV_FLOAT(va0, vx0, gvl);
-                        vr = VFNMSACVV_FLOAT(vr, va1, vx1, gvl);
                         vi = VFMULVV_FLOAT(va0, vx1, gvl);
+                        vr = VFNMSACVV_FLOAT(vr, va1, vx1, gvl);
                         vi = VFMACCVV_FLOAT(vi, va1, vx0, gvl);
 #else
                         vr = VFMULVV_FLOAT(va0, vx0, gvl);
-                        vr = VFMACCVV_FLOAT(vr, va1, vx1, gvl);
                         vi = VFMULVV_FLOAT(va0, vx1, gvl);
+                        vr = VFMACCVV_FLOAT(vr, va1, vx1, gvl);
                         vi = VFNMSACVV_FLOAT(vi, va1, vx0, gvl);
 
 #endif
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                        temp_r += VFMVFS_FLOAT(v_res);
-                        v_res = VFREDSUM_FLOAT(v_res, vi, v_z0, gvl);
-                        temp_i += VFMVFS_FLOAT(v_res);
+                        v_res_r = VFREDSUM_FLOAT(v_res_r, vr, v_res_r, gvl);
+                        v_res_i = VFREDSUM_FLOAT(v_res_i, vi, v_res_i, gvl);
                 }
+
+                temp_r = VFMVFS_FLOAT(v_res_r);
+                temp_i = VFMVFS_FLOAT(v_res_i);
+
 #if !defined(XCONJ)
                 y[iy]   += alpha_r * temp_r - alpha_i * temp_i;
                 y[iy+1] += alpha_r * temp_i + alpha_i * temp_r;
diff --git a/kernel/riscv64/zhemm_ltcopy_rvv_v1.c b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c
new file mode 100644
index 000000000..97013895a
--- /dev/null
+++ b/kernel/riscv64/zhemm_ltcopy_rvv_v1.c
@@ -0,0 +1,139 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+/* Generic names mapped onto RVV LMUL=2 intrinsics: 32-bit elements unless DOUBLE is defined. */
+#if !defined(DOUBLE)
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m2()
+#define FLOAT_V_T               vfloat32m2_t
+#define FLOAT_VX2_T             vfloat32m2x2_t                 /* two-field tuple used by the segment ops */
+#define VGET_VX2                __riscv_vget_v_f32m2x2_f32m2
+#define VSET_VX2                __riscv_vset_v_f32m2_f32m2x2
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2          /* not referenced in this file */
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2          /* not referenced in this file */
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m2         /* not referenced in this file */
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2x2    /* unit-stride 2-field segment load */
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2x2   /* strided 2-field segment load */
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2    /* unit-stride 2-field segment store */
+#define INT_V_T                 vint32m2_t
+#define VID_V_INT               __riscv_vid_v_i32m2
+#define VADD_VX_INT             __riscv_vadd_vx_i32m2
+#define VFRSUB_VF_FLOAT         __riscv_vfrsub_vf_f32m2        /* scalar - vector */
+#define VMSGT_VX_INT            __riscv_vmsgt_vx_i32m2_b16
+#define VMSLT_VX_INT            __riscv_vmslt_vx_i32m2_b16
+#define VMSEQ_VX_INT            __riscv_vmseq_vx_i32m2_b16
+#define VBOOL_T                 vbool16_t                      /* mask type for e32 at LMUL=2 (32/2) */
+#define VMERGE_VVM_FLOAT        __riscv_vmerge_vvm_f32m2
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m2
+#else /* DOUBLE: same bindings for 64-bit elements */
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m2()
+#define FLOAT_V_T               vfloat64m2_t
+#define FLOAT_VX2_T             vfloat64m2x2_t                 /* two-field tuple used by the segment ops */
+#define VGET_VX2                __riscv_vget_v_f64m2x2_f64m2
+#define VSET_VX2                __riscv_vset_v_f64m2_f64m2x2
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2          /* not referenced in this file */
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2          /* not referenced in this file */
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m2         /* not referenced in this file */
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2x2    /* unit-stride 2-field segment load */
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2x2   /* strided 2-field segment load */
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2    /* unit-stride 2-field segment store */
+#define INT_V_T                 vint64m2_t
+#define VID_V_INT               __riscv_vid_v_i64m2
+#define VADD_VX_INT             __riscv_vadd_vx_i64m2
+#define VFRSUB_VF_FLOAT         __riscv_vfrsub_vf_f64m2        /* scalar - vector */
+#define VMSGT_VX_INT            __riscv_vmsgt_vx_i64m2_b32
+#define VMSLT_VX_INT            __riscv_vmslt_vx_i64m2_b32
+#define VMSEQ_VX_INT            __riscv_vmseq_vx_i64m2_b32
+#define VBOOL_T                 vbool32_t                      /* mask type for e64 at LMUL=2 (64/2) */
+#define VMERGE_VVM_FLOAT        __riscv_vmerge_vvm_f64m2
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m2
+#endif
+
+/* Writes m rows to b; each row holds interleaved (re, im) pairs for one strip of vl
+ * columns (vl = VSETVL(columns left)), and posX advances by vl after every strip.
+ * "Lane index" below is (posX + lane - posY) minus the rows already written. */
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b)
+{
+    BLASLONG row_bytes = sizeof(FLOAT) * lda * 2;   /* byte stride of the strided loads */
+    BLASLONG rows_left, lane0_index, cols_left = n;
+    FLOAT *unit_src, *strided_src;
+    FLOAT_V_T re, im, im_neg, unit_re, unit_im, strided_re, strided_im, zeros;
+    FLOAT_VX2_T unit_pair, strided_pair, packed;
+    VBOOL_T is_pos, is_neg, is_zero;
+    INT_V_T lane_id, lane_index;
+    size_t vl = VSETVL_MAX;
+
+    zeros   = VFMVVF_FLOAT(ZERO, vl);
+    lane_id = VID_V_INT(vl);    /* 0, 1, 2, ... */
+
+    while (cols_left > 0) {
+        vl = VSETVL(cols_left);
+        lane0_index = posX - posY;
+        strided_src = a + posY * 2 + posX * lda * 2;
+        unit_src    = a + posX * 2 + posY * lda * 2;
+
+        for (rows_left = m; rows_left > 0; rows_left--, lane0_index--) {
+            /* Classify every lane by the sign of its lane index. */
+            lane_index = VADD_VX_INT(lane_id, lane0_index, vl);
+            is_pos  = VMSGT_VX_INT(lane_index, 0, vl);
+            is_zero = VMSEQ_VX_INT(lane_index, 0, vl);
+            is_neg  = VMSLT_VX_INT(lane_index, 0, vl);
+
+            unit_pair    = VLSEG2_FLOAT(unit_src, vl);
+            strided_pair = VLSSEG2_FLOAT(strided_src, row_bytes, vl);
+            unit_re    = VGET_VX2(unit_pair, 0);
+            unit_im    = VGET_VX2(unit_pair, 1);
+            strided_re = VGET_VX2(strided_pair, 0);
+            strided_im = VGET_VX2(strided_pair, 1);
+
+            /* Positive lanes take the unit-stride element, all others the strided one. */
+            re = VMERGE_VVM_FLOAT(strided_re, unit_re, is_pos, vl);
+            im = VMERGE_VVM_FLOAT(strided_im, unit_im, is_pos, vl);
+
+            /* Negative lanes get field 1 negated; zero lanes get it cleared. */
+            im_neg = VFRSUB_VF_FLOAT(im, ZERO, vl);
+            im = VMERGE_VVM_FLOAT(im, im_neg, is_neg, vl);
+            im = VMERGE_VVM_FLOAT(im, zeros, is_zero, vl);
+
+            packed = VSET_VX2(packed, 0, re);
+            packed = VSET_VX2(packed, 1, im);
+            VSSEG2_FLOAT(b, packed, vl);
+
+            b           += vl * 2;
+            unit_src    += lda * 2;
+            strided_src += 2;
+        }
+        cols_left -= vl;
+        posX      += vl;
+    }
+
+    return 0;
+}
+
diff --git a/kernel/riscv64/zhemm_utcopy_rvv_v1.c b/kernel/riscv64/zhemm_utcopy_rvv_v1.c
new file mode 100644
index 000000000..59029e9e5
--- /dev/null
+++ b/kernel/riscv64/zhemm_utcopy_rvv_v1.c
@@ -0,0 +1,135 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+/* Generic names mapped onto RVV LMUL=2 intrinsics: 32-bit elements unless DOUBLE is defined. */
+#if !defined(DOUBLE)
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m2()
+#define FLOAT_V_T               vfloat32m2_t
+#define FLOAT_VX2_T             vfloat32m2x2_t                 /* two-field tuple used by the segment ops */
+#define VGET_VX2                __riscv_vget_v_f32m2x2_f32m2
+#define VSET_VX2                __riscv_vset_v_f32m2_f32m2x2
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2          /* not referenced in this file */
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2          /* not referenced in this file */
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m2         /* not referenced in this file */
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2x2    /* unit-stride 2-field segment load */
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2x2   /* strided 2-field segment load */
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2    /* unit-stride 2-field segment store */
+#define INT_V_T                 vint32m2_t
+#define VID_V_INT               __riscv_vid_v_i32m2
+#define VADD_VX_INT             __riscv_vadd_vx_i32m2
+#define VFRSUB_VF_FLOAT         __riscv_vfrsub_vf_f32m2        /* scalar - vector */
+#define VMSGT_VX_INT            __riscv_vmsgt_vx_i32m2_b16
+#define VMSLT_VX_INT            __riscv_vmslt_vx_i32m2_b16     /* not referenced in this file */
+#define VMSEQ_VX_INT            __riscv_vmseq_vx_i32m2_b16
+#define VBOOL_T                 vbool16_t                      /* mask type for e32 at LMUL=2 (32/2) */
+#define VMERGE_VVM_FLOAT        __riscv_vmerge_vvm_f32m2
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m2
+#else /* DOUBLE: same bindings for 64-bit elements */
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m2()
+#define FLOAT_V_T               vfloat64m2_t
+#define FLOAT_VX2_T             vfloat64m2x2_t                 /* two-field tuple used by the segment ops */
+#define VGET_VX2                __riscv_vget_v_f64m2x2_f64m2
+#define VSET_VX2                __riscv_vset_v_f64m2_f64m2x2
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2          /* not referenced in this file */
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2          /* not referenced in this file */
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m2         /* not referenced in this file */
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2x2    /* unit-stride 2-field segment load */
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2x2   /* strided 2-field segment load */
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2    /* unit-stride 2-field segment store */
+#define INT_V_T                 vint64m2_t
+#define VID_V_INT               __riscv_vid_v_i64m2
+#define VADD_VX_INT             __riscv_vadd_vx_i64m2
+#define VFRSUB_VF_FLOAT         __riscv_vfrsub_vf_f64m2        /* scalar - vector */
+#define VMSGT_VX_INT            __riscv_vmsgt_vx_i64m2_b32
+#define VMSLT_VX_INT            __riscv_vmslt_vx_i64m2_b32     /* not referenced in this file */
+#define VMSEQ_VX_INT            __riscv_vmseq_vx_i64m2_b32
+#define VBOOL_T                 vbool32_t                      /* mask type for e64 at LMUL=2 (64/2) */
+#define VMERGE_VVM_FLOAT        __riscv_vmerge_vvm_f64m2
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m2
+#endif
+
+/* Writes m rows to b; each row holds interleaved (re, im) pairs for one strip of vl
+ * columns (vl = VSETVL(columns left)), and posX advances by vl after every strip.
+ * "Lane index" below is (posX + lane - posY) minus the rows already written. */
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b)
+{
+    BLASLONG row_bytes = sizeof(FLOAT) * lda * 2;   /* byte stride of the strided loads */
+    BLASLONG rows_left, lane0_index, cols_left = n;
+    FLOAT *unit_src, *strided_src;
+    FLOAT_V_T re, im, im_neg, unit_re, unit_im, strided_re, strided_im, zeros;
+    FLOAT_VX2_T unit_pair, strided_pair, packed;
+    VBOOL_T is_pos, is_zero;
+    INT_V_T lane_id, lane_index;
+    size_t vl = VSETVL_MAX;
+
+    zeros   = VFMVVF_FLOAT(ZERO, vl);
+    lane_id = VID_V_INT(vl);    /* 0, 1, 2, ... */
+
+    while (cols_left > 0) {
+        vl = VSETVL(cols_left);
+        lane0_index = posX - posY;
+        unit_src    = a + posX * 2 + posY * lda * 2;
+        strided_src = a + posY * 2 + posX * lda * 2;
+
+        for (rows_left = m; rows_left > 0; rows_left--, lane0_index--) {
+            /* Classify every lane by the sign of its lane index. */
+            lane_index = VADD_VX_INT(lane_id, lane0_index, vl);
+            is_pos  = VMSGT_VX_INT(lane_index, 0, vl);
+            is_zero = VMSEQ_VX_INT(lane_index, 0, vl);
+            unit_pair    = VLSEG2_FLOAT(unit_src, vl);
+            strided_pair = VLSSEG2_FLOAT(strided_src, row_bytes, vl);
+            unit_re    = VGET_VX2(unit_pair, 0);
+            unit_im    = VGET_VX2(unit_pair, 1);
+            strided_re = VGET_VX2(strided_pair, 0);
+            strided_im = VGET_VX2(strided_pair, 1);
+
+            /* Positive lanes take the strided element, all others the unit-stride one. */
+            re = VMERGE_VVM_FLOAT(unit_re, strided_re, is_pos, vl);
+            im = VMERGE_VVM_FLOAT(unit_im, strided_im, is_pos, vl);
+
+            /* Positive lanes get field 1 negated; zero lanes get it cleared. */
+            im_neg = VFRSUB_VF_FLOAT(im, ZERO, vl);
+            im = VMERGE_VVM_FLOAT(im, im_neg, is_pos, vl);
+            im = VMERGE_VVM_FLOAT(im, zeros, is_zero, vl);
+
+            packed = VSET_VX2(packed, 0, re);
+            packed = VSET_VX2(packed, 1, im);
+            VSSEG2_FLOAT(b, packed, vl);
+            b           += vl * 2;
+            strided_src += 2;
+            unit_src    += lda * 2;
+        }
+        cols_left -= vl;
+        posX      += vl;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/zhemv_LM_rvv.c b/kernel/riscv64/zhemv_LM_rvv.c
new file mode 100644
index 000000000..95c6a377c
--- /dev/null
+++ b/kernel/riscv64/zhemv_LM_rvv.c
@@ -0,0 +1,198 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#if !defined(DOUBLE) /* 32-bit elements: LMUL=4 work vectors, LMUL=1 reduction vectors */
+#define VSETVL(n) __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e32m1() /* used only when initialising the m1 vectors */
+#define FLOAT_V_T vfloat32m4_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32 /* element 0 -> scalar */
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 /* strided load */
+#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 /* strided store */
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 /* unordered sum reduction, m4 -> m1 */
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
+#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu /* tail-undisturbed variant */
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 /* not referenced in this file */
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
+#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4
+#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu /* tail-undisturbed variant */
+#else /* DOUBLE: same bindings for 64-bit elements */
+#define VSETVL(n) __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e64m1() /* used only when initialising the m1 vectors */
+#define FLOAT_V_T vfloat64m4_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64 /* element 0 -> scalar */
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 /* strided load */
+#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 /* strided store */
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 /* unordered sum reduction, m4 -> m1 */
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
+#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu /* tail-undisturbed variant */
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 /* not referenced in this file */
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
+#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4
+#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu /* tail-undisturbed variant */
+#endif
+
+/* For each column j in [0, offset), with temp1 = alpha * x[j]: adds temp1 * Re(diagonal entry) to
+ * y[j], adds temp1 * (rows j+1..m-1 of column j) to y, and adds alpha * (those rows dotted with x)
+ * to y[j]. HEMVREV selects which of the two products uses conjugated entries. buffer is unused. */
+int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){
+        BLASLONG i, j, k;
+        BLASLONG ix, iy, ia;
+        BLASLONG jx, jy, ja;
+        FLOAT temp_r1, temp_i1; /* temp1 = alpha * x[j] */
+        FLOAT temp_r2, temp_i2; /* temp2 = reduced dot product of the column part with x */
+        FLOAT *a_ptr = a;
+        unsigned int gvl = 0;
+        FLOAT_V_T_M1 v_res, v_z0;
+        gvl = VSETVL_MAX;
+        v_res = VFMVVF_FLOAT_M1(0, gvl);
+        v_z0 = VFMVVF_FLOAT_M1(0, gvl); /* zero seed passed to the reductions */
+        FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1;
+        BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, len, lda2;
+        BLASLONG inc_x2 = incx * 2; /* two FLOATs (re, im) per element */
+        BLASLONG inc_y2 = incy * 2;
+        stride_x = inc_x2 * sizeof(FLOAT); /* byte strides for the strided loads/stores */
+        stride_y = inc_y2 * sizeof(FLOAT);
+        stride_a = 2 * sizeof(FLOAT);
+        lda2 = lda * 2;
+        /* jx/jy index entry j of x and y; a_ptr + ja addresses entry (j, j) of a. */
+        jx = 0;
+        jy = 0;
+        ja = 0;
+        for(j = 0; j < offset; j++){
+                temp_r1 = alpha_r * x[jx]   - alpha_i * x[jx+1];;
+                temp_i1 = alpha_r * x[jx+1] + alpha_i * x[jx];
+                temp_r2 = 0;
+                temp_i2 = 0;
+                y[jy] += temp_r1 * a_ptr[ja]; /* only a_ptr[ja] is read; a_ptr[ja+1] is ignored */
+                y[jy+1] += temp_i1 * a_ptr[ja];
+                ix = jx + inc_x2;
+                iy = jy + inc_y2;
+                ia = ja + 2;
+                i = j + 1;
+                len = m - i; /* entries below the diagonal in this column */
+                if(len > 0){
+                        gvl = VSETVL(len);
+                        inc_xv = incx * gvl * 2;
+                        inc_yv = incy * gvl * 2;
+                        inc_av = gvl * 2;
+                        vr0 = VFMVVF_FLOAT(0, gvl); /* per-lane partial sums: real part */
+                        vr1 = VFMVVF_FLOAT(0, gvl); /* per-lane partial sums: imaginary part */
+                        for(k = 0; k < len / gvl; k++){
+                                va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl);
+                                va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl);
+                                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                                vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
+#ifndef HEMVREV /* y += temp1 * a */
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
+                                vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
+#else /* y += temp1 * conj(a) */
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl);
+                                vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
+#endif
+                                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
+                                VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl);
+                                /* Accumulate the column/x products; they are reduced after the loops. */
+                                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+#ifndef HEMVREV /* vr += x * conj(a) */
+                                vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl);
+                                vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl);
+                                vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl);
+                                vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl);
+#else /* vr += x * a */
+                                vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl);
+                                vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl);
+                                vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl);
+                                vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl);
+#endif
+                                i += gvl;
+                                ix += inc_xv;
+                                iy += inc_yv;
+                                ia += inc_av;
+                        }
+                        /* Tail: the rows left over after the full-gvl iterations. */
+                        if(i < m){
+				unsigned int gvl_rem = VSETVL(m-i);
+                                va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl_rem);
+                                va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl_rem);
+                                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl_rem);
+                                vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl_rem);
+#ifndef HEMVREV /* y += temp1 * a */
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem);
+                                vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl_rem);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl_rem);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem);
+#else /* y += temp1 * conj(a) */
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem);
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl_rem);
+                                vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl_rem);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem);
+#endif
+                                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl_rem);
+                                VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl_rem);
+                                /* _TU keeps lanes >= gvl_rem of vr0/vr1 so the gvl-wide reduction below still sees them. */
+                                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl_rem);
+                                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl_rem);
+#ifndef HEMVREV /* vr += x * conj(a) */
+                                vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem);
+                                vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, va1, gvl_rem);
+                                vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem);
+                                vr1 = VFNMSACVV_FLOAT_TU(vr1, vx0, va1, gvl_rem);
+#else /* vr += x * a */
+                                vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem);
+                                vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, va1, gvl_rem);
+                                vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem);
+                                vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, va1, gvl_rem);
+#endif
+                        }
+                        v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); /* sum the first gvl lanes of vr0 */
+                        temp_r2 = VFMVFS_FLOAT(v_res);
+                        v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl); /* sum the first gvl lanes of vr1 */
+                        temp_i2 = VFMVFS_FLOAT(v_res);
+                }
+		y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; /* y[j] += alpha * temp2 */
+		y[jy+1] += alpha_r * temp_i2 + alpha_i * temp_r2;
+		jx    += inc_x2;
+		jy    += inc_y2;
+		ja    += 2;
+		a_ptr += lda2; /* next column */
+        }
+	return(0);
+}
diff --git a/kernel/riscv64/zhemv_LM_vector.c b/kernel/riscv64/zhemv_LM_vector.c
index 275ee9131..117db7d84 100644
--- a/kernel/riscv64/zhemv_LM_vector.c
+++ b/kernel/riscv64/zhemv_LM_vector.c
@@ -27,37 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)()
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
-#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT vfmacc_vv_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMULVV_FLOAT vfmul_vv_f32m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
-#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4
+#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f32m1_f32)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m4_f32m1)(v_res, va, vb, gvl)
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
+#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1)
+#endif
+#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
+#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4)
+#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4)
+#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f32m4)
+#else
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)()
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
-#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT vfmacc_vv_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMULVV_FLOAT vfmul_vv_f64m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
-#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4
+#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f64m1_f64)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m4_f64m1)(v_res, va, vb, gvl)
+#else
+#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1)
+#endif
+#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
+#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4)
+#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4)
+#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m4)
 #endif
 
 int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){
@@ -143,9 +151,9 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B
                                 iy += inc_yv;
                                 ia += inc_av;
                         }
-                        v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl);
+                        v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
                         temp_r2 = VFMVFS_FLOAT(v_res);
-                        v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl);
+                        v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
                         temp_i2 = VFMVFS_FLOAT(v_res);
                         if(i < m){
 				                gvl = VSETVL(m-i);
@@ -181,9 +189,9 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B
                                 vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl);
 #endif
 
-                                v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl);
+                                v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
                                 temp_r2 += VFMVFS_FLOAT(v_res);
-                                v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl);
+                                v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
                                 temp_i2 += VFMVFS_FLOAT(v_res);
                         }
                 }
diff --git a/kernel/riscv64/zhemv_UV_rvv.c b/kernel/riscv64/zhemv_UV_rvv.c
new file mode 100644
index 000000000..ec06622fc
--- /dev/null
+++ b/kernel/riscv64/zhemv_UV_rvv.c
@@ -0,0 +1,199 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#if !defined(DOUBLE) /* single precision: aliases bind to the e32 / f32 intrinsics */
+#define VSETVL(n) __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e32m1() /* m1 length; CNAME uses it to size the m1 reduction vectors */
+#define FLOAT_V_T vfloat32m4_t /* working vectors (LMUL=4) */
+#define FLOAT_V_T_M1 vfloat32m1_t /* reduction result / seed vectors (LMUL=1) */
+#define VFMVFS_FLOAT __riscv_vfmv_f_s_f32m1_f32
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
+#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
+#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu /* _tu variant; CNAME uses it only in the remainder chunk */
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4 /* not referenced by CNAME in this file */
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
+#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4
+#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu /* _tu variant; CNAME uses it only in the remainder chunk */
+#else /* DOUBLE: the same alias table bound to the e64 / f64 intrinsics */
+#define VSETVL(n) __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e64m1() /* m1 length; CNAME uses it to size the m1 reduction vectors */
+#define FLOAT_V_T vfloat64m4_t /* working vectors (LMUL=4) */
+#define FLOAT_V_T_M1 vfloat64m1_t /* reduction result / seed vectors (LMUL=1) */
+#define VFMVFS_FLOAT __riscv_vfmv_f_s_f64m1_f64
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
+#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
+#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu /* _tu variant; CNAME uses it only in the remainder chunk */
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4 /* not referenced by CNAME in this file */
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
+#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4
+#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu /* _tu variant; CNAME uses it only in the remainder chunk */
+#endif
+/* Complex HEMV kernel: for each j in [m-offset, m) applies strip j of a (a_ptr = a + j*lda*2) to y[0..j]; data are interleaved (re, im) FLOAT pairs; buffer is not referenced; always returns 0. */
+int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){
+        BLASLONG i, j, k;
+        BLASLONG ix, iy, ia; /* FLOAT offsets of element i in x, y and a_ptr */
+        BLASLONG jx, jy, ja; /* FLOAT offsets of element j in x, y and a_ptr */
+        FLOAT temp_r1, temp_i1; /* temp1 = alpha * x[j] */
+        FLOAT temp_r2, temp_i2; /* temp2 = reduced dot-product term for strip j */
+        FLOAT *a_ptr = a;
+        unsigned int gvl = 0;
+        FLOAT_V_T_M1 v_res, v_z0;
+        gvl = VSETVL_MAX;
+        v_res = VFMVVF_FLOAT_M1(0, gvl);
+        v_z0 = VFMVVF_FLOAT_M1(0, gvl); /* zero seed passed to every sum reduction below */
+
+        FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1;
+        BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, lda2;
+        /* Elements are (re, im) pairs, so increments are doubled; the stride_* values are in bytes. */
+        BLASLONG inc_x2 = incx * 2;
+        BLASLONG inc_y2 = incy * 2;
+        stride_x = inc_x2 * sizeof(FLOAT);
+        stride_y = inc_y2 * sizeof(FLOAT);
+        stride_a = 2 * sizeof(FLOAT);
+        lda2 = lda * 2;
+        /* Processing starts at strip m - offset; everything before it is left untouched by this call. */
+        BLASLONG m1 = m - offset;
+        a_ptr = a + m1 * lda2;
+        jx = m1 * inc_x2;
+        jy = m1 * inc_y2;
+        ja = m1 * 2;
+        for(j = m1; j < m; j++){
+                temp_r1 = alpha_r * x[jx]   - alpha_i * x[jx+1];; /* temp1 = alpha * x[j] (complex product) */
+                temp_i1 = alpha_r * x[jx+1] + alpha_i * x[jx];
+                temp_r2 = 0;
+                temp_i2 = 0;
+                ix = 0;
+                iy = 0;
+                ia = 0;
+                i = 0;
+                if(j > 0){ /* elements 0..j-1 of the strip; skipped entirely for j == 0 */
+                        gvl = VSETVL(j);
+                        inc_xv = incx * gvl * 2;
+                        inc_yv = incy * gvl * 2;
+                        inc_av = gvl * 2;
+                        vr0 = VFMVVF_FLOAT(0, gvl);
+                        vr1 = VFMVVF_FLOAT(0, gvl);
+                        for(k = 0; k < j / gvl; k++){ /* chunks of exactly gvl elements */
+                                va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); /* real parts of a */
+                                va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); /* imaginary parts of a */
+                                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl);
+                                vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl);
+#ifndef HEMVREV /* y[i] += temp1 * a[i] */
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
+                                vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
+#else /* HEMVREV: y[i] += temp1 * conj(a[i]) */
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl);
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl);
+                                vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl);
+#endif
+                                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
+                                VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl);
+
+                                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+#ifndef HEMVREV /* per-lane accumulate: vr += conj(a[i]) * x[i] */
+                                vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl);
+                                vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl);
+                                vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl);
+                                vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl);
+#else /* HEMVREV: per-lane accumulate vr += a[i] * x[i] */
+                                vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl);
+                                vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl);
+                                vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl);
+                                vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl);
+
+#endif
+                                i += gvl;
+                                ix += inc_xv;
+                                iy += inc_yv;
+                                ia += inc_av;
+                        }
+
+                        if(i < j){ /* remainder: j is not a multiple of gvl */
+				unsigned int gvl_rem = VSETVL(j-i);
+                                va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl_rem);
+                                va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl_rem);
+                                vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl_rem);
+                                vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl_rem);
+#ifndef HEMVREV /* same y update as the main loop, on gvl_rem lanes */
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem);
+                                vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl_rem);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl_rem);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem);
+#else
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl_rem);
+                                vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl_rem);
+                                vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl_rem);
+                                vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl_rem);
+#endif
+                                VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl_rem);
+                                VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl_rem);
+
+                                vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl_rem);
+                                vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl_rem);
+#ifndef HEMVREV /* _tu forms: lanes >= gvl_rem of vr0/vr1 keep the sums from the full chunks */
+                                vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem);
+                                vr0 = VFMACCVV_FLOAT_TU(vr0, vx1, va1, gvl_rem);
+                                vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem);
+                                vr1 = VFNMSACVV_FLOAT_TU(vr1, vx0, va1, gvl_rem);
+#else
+                                vr0 = VFMACCVV_FLOAT_TU(vr0, vx0, va0, gvl_rem);
+                                vr0 = VFNMSACVV_FLOAT_TU(vr0, vx1, va1, gvl_rem);
+                                vr1 = VFMACCVV_FLOAT_TU(vr1, vx1, va0, gvl_rem);
+                                vr1 = VFMACCVV_FLOAT_TU(vr1, vx0, va1, gvl_rem);
+#endif
+                        }
+                        v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl); /* reduce with the full-chunk gvl (not gvl_rem) so every accumulated lane is summed */
+                        temp_r2 = VFMVFS_FLOAT(v_res);
+                        v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
+                        temp_i2 = VFMVFS_FLOAT(v_res);
+                }
+                y[jy] += temp_r1 * a_ptr[ja]; /* element j of the strip: only its real part a_ptr[ja] is read */
+                y[jy+1] += temp_i1 * a_ptr[ja];
+		y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; /* y[j] += alpha * temp2 (complex product) */
+		y[jy+1] += alpha_r * temp_i2 + alpha_i * temp_r2;
+		jx    += inc_x2;
+		jy    += inc_y2;
+		ja    += 2;
+		a_ptr += lda2; /* advance to the next strip of a */
+        }
+	return(0);
+}
diff --git a/kernel/riscv64/zhemv_UV_vector.c b/kernel/riscv64/zhemv_UV_vector.c
index 2f46977d4..7c6b63bf3 100644
--- a/kernel/riscv64/zhemv_UV_vector.c
+++ b/kernel/riscv64/zhemv_UV_vector.c
@@ -27,37 +27,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)()
 #define FLOAT_V_T vfloat32m4_t
 #define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
-#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT vfmacc_vv_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFMULVV_FLOAT vfmul_vv_f32m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
-#define VFNMSACVV_FLOAT vfnmsac_vv_f32m4
+#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f32m1_f32)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
+#ifdef RISCV_0p10_INTRINSICS /* this branch's reduction takes an extra leading operand */
+#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f32m4_f32m1)(v_res, va, vb, gvl) /* expands to a use of v_res, which must be in scope at every call site */
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
+#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f32m4_f32m1) /* three-operand form; call sites pass (vector, seed, vl) */
+#endif
+#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f32m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f32m1)
+#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f32m4)
+#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4)
+#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f32m4)
+#else /* DOUBLE: the same alias table bound to the e64 / f64 intrinsics */
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)()
 #define FLOAT_V_T vfloat64m4_t
 #define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
-#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT vfmacc_vv_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFMULVV_FLOAT vfmul_vv_f64m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
-#define VFNMSACVV_FLOAT vfnmsac_vv_f64m4
+#define VFMVFS_FLOAT RISCV_RVV(vfmv_f_s_f64m1_f64)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
+#ifdef RISCV_0p10_INTRINSICS /* this branch's reduction takes an extra leading operand */
+#define VFREDSUM_FLOAT(va, vb, gvl) RISCV_RVV(vfredusum_vs_f64m4_f64m1)(v_res, va, vb, gvl) /* expands to a use of v_res, which must be in scope at every call site */
+#else
+#define VFREDSUM_FLOAT RISCV_RVV(vfredusum_vs_f64m4_f64m1) /* three-operand form; call sites pass (vector, seed, vl) */
+#endif
+#define VFMACCVV_FLOAT RISCV_RVV(vfmacc_vv_f64m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4)
+#define VFMVVF_FLOAT_M1 RISCV_RVV(vfmv_v_f_f64m1)
+#define VFMULVV_FLOAT RISCV_RVV(vfmul_vv_f64m4)
+#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4)
+#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m4)
 #endif
 
 int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){
@@ -142,9 +150,9 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B
                                 iy += inc_yv;
                                 ia += inc_av;
                         }
-                        v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl);
+                        v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
                         temp_r2 = VFMVFS_FLOAT(v_res);
-                        v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl);
+                        v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
                         temp_i2 = VFMVFS_FLOAT(v_res);
                         if(i < j){
 				                gvl = VSETVL(j-i);
@@ -180,9 +188,9 @@ int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, B
                                 vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl);
 #endif
 
-                                v_res = VFREDSUM_FLOAT(v_res, vr0, v_z0, gvl);
+                                v_res = VFREDSUM_FLOAT(vr0, v_z0, gvl);
                                 temp_r2 += VFMVFS_FLOAT(v_res);
-                                v_res = VFREDSUM_FLOAT(v_res, vr1, v_z0, gvl);
+                                v_res = VFREDSUM_FLOAT(vr1, v_z0, gvl);
                                 temp_i2 += VFMVFS_FLOAT(v_res);
                         }
                 }
diff --git a/kernel/riscv64/znrm2.c b/kernel/riscv64/znrm2.c
index fc1c8b54a..28bb0eda5 100644
--- a/kernel/riscv64/znrm2.c
+++ b/kernel/riscv64/znrm2.c
@@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	BLASLONG inc_x2;
 	FLOAT temp;
 
-	if (n <= 0 || inc_x <= 0) return(0.0);
+	if (n <= 0 || inc_x == 0) return(0.0);
 
 	inc_x2 = 2 * inc_x;
 
diff --git a/kernel/riscv64/znrm2_rvv.c b/kernel/riscv64/znrm2_rvv.c
new file mode 100644
index 000000000..32f67758a
--- /dev/null
+++ b/kernel/riscv64/znrm2_rvv.c
@@ -0,0 +1,275 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE) /* single precision: aliases bind to the e32 / f32 intrinsics */
+#define VSETVL(n)           __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX          __riscv_vsetvlmax_e32m4() /* NOTE(review): m4 length, but CNAME only uses it to size m1 initialisations - confirm m1 was not intended */
+#define FLOAT_V_T           vfloat32m4_t /* working vectors (LMUL=4) */
+#define FLOAT_V_T_M1        vfloat32m1_t /* reduction result / seed vectors (LMUL=1) */
+#define MASK_T              vbool8_t /* mask type produced by VMFGTVF_FLOAT (…_b8) */
+#define VLEV_FLOAT          __riscv_vle32_v_f32m4
+#define VLSEV_FLOAT         __riscv_vlse32_v_f32m4
+#define VFREDSUM_FLOAT      __riscv_vfredusum_vs_f32m4_f32m1_tu /* _tu form: takes the destination vector as first operand */
+#define VFMACCVV_FLOAT_TU   __riscv_vfmacc_vv_f32m4_tu
+#define VFMVVF_FLOAT        __riscv_vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1     __riscv_vfmv_v_f_f32m1
+#define VMFIRSTM            __riscv_vfirst_m_b8 /* CNAME compares the result against -1 for "no mask bit set" */
+#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f32m4_f32m1_tu
+#define VFMVFS_FLOAT        __riscv_vfmv_f_s_f32m1_f32
+#define VMFGTVF_FLOAT       __riscv_vmfgt_vf_f32m4_b8
+#define VFDIVVF_FLOAT       __riscv_vfdiv_vf_f32m4
+#define VFABSV_FLOAT        __riscv_vfabs_v_f32m4
+#else /* DOUBLE: the same alias table bound to the e64 / f64 intrinsics */
+#define VSETVL(n)           __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX          __riscv_vsetvlmax_e64m4() /* NOTE(review): m4 length, but CNAME only uses it to size m1 initialisations - confirm m1 was not intended */
+#define FLOAT_V_T           vfloat64m4_t /* working vectors (LMUL=4) */
+#define FLOAT_V_T_M1        vfloat64m1_t /* reduction result / seed vectors (LMUL=1) */
+#define MASK_T              vbool16_t /* mask type produced by VMFGTVF_FLOAT (…_b16) */
+#define VLEV_FLOAT          __riscv_vle64_v_f64m4
+#define VLSEV_FLOAT         __riscv_vlse64_v_f64m4
+#define VFREDSUM_FLOAT      __riscv_vfredusum_vs_f64m4_f64m1_tu /* _tu form: takes the destination vector as first operand */
+#define VFMACCVV_FLOAT_TU   __riscv_vfmacc_vv_f64m4_tu
+#define VFMVVF_FLOAT        __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1     __riscv_vfmv_v_f_f64m1
+#define VMFIRSTM            __riscv_vfirst_m_b16 /* CNAME compares the result against -1 for "no mask bit set" */
+#define VFREDMAXVS_FLOAT_TU __riscv_vfredmax_vs_f64m4_f64m1_tu
+#define VFMVFS_FLOAT        __riscv_vfmv_f_s_f64m1_f64
+#define VMFGTVF_FLOAT       __riscv_vmfgt_vf_f64m4_b16
+#define VFDIVVF_FLOAT       __riscv_vfdiv_vf_f64m4
+#define VFABSV_FLOAT        __riscv_vfabs_v_f64m4
+#endif
+/* Scaled 2-norm of a complex vector: keeps scale = largest |re| or |im| seen and ssq = sum of (|part|/scale)^2 over all 2*n parts, and returns scale*sqrt(ssq); returns 0 when n <= 0 or inc_x <= 0. */
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0, j=0;
+
+	if (n <= 0 || inc_x <= 0) return(0.0); // NOTE(review): znrm2.c in this patch relaxes its guard to inc_x == 0 - confirm whether this kernel should follow
+
+    FLOAT_V_T vr, v0, v_zero;
+    unsigned int gvl = 0;
+    FLOAT_V_T_M1 v_res, v_z0;
+    gvl = VSETVL_MAX;
+    v_res = VFMVVF_FLOAT_M1(0, gvl);
+    v_z0 = VFMVVF_FLOAT_M1(0, gvl); // zero seed for every reduction below
+
+    FLOAT scale = 0.0, ssq = 0.0;
+    MASK_T mask;
+    BLASLONG index = 0;
+    if (inc_x == 1) {
+        // unit stride: re/im parts are contiguous, so treat x as 2*n plain FLOATs
+        BLASLONG n2 = n * 2;
+        gvl = VSETVL(n2);
+        vr = VFMVVF_FLOAT(0, gvl);
+        v_zero = VFMVVF_FLOAT(0, gvl);
+        for (i=0,j=0; i<n2/gvl; i++) {
+            v0 = VLEV_FLOAT(&x[j], gvl);
+            //fabs(vector)
+            v0 = VFABSV_FLOAT(v0, gvl);
+            //does any element exceed the current scale?
+            mask = VMFGTVF_FLOAT(v0, scale, gvl);
+            index = VMFIRSTM(mask, gvl);
+            if (index == -1) { //no elements greater than scale
+                if (scale != 0.0) {
+                    v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                    vr = VFMACCVV_FLOAT_TU(vr, v0, v0, gvl);
+                }
+            } else { // found greater element
+                //reduce the per-lane sums in vr into v_res[0]
+                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+                //total ssq before current vector
+                ssq += VFMVFS_FLOAT(v_res);
+                //find max
+                v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl);
+                //rescale the ssq gathered so far from the old scale to the new maximum
+                ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
+                //update scale
+                scale = VFMVFS_FLOAT(v_res);
+                //restart vr from zero with the current chunk under the new scale
+                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl);
+            }
+            j += gvl;
+        }
+        //reduce the per-lane sums in vr into v_res[0]
+        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+        //total ssq now
+        ssq += VFMVFS_FLOAT(v_res);
+
+        //tail
+        if(j < n2){
+            gvl = VSETVL(n2-j);
+            v0 = VLEV_FLOAT(&x[j], gvl);
+            // fabs(vector)
+            v0 = VFABSV_FLOAT(v0, gvl);
+            // does any element exceed the current scale?
+            mask = VMFGTVF_FLOAT(v0, scale, gvl);
+            index = VMFIRSTM(mask, gvl);
+            if (index == -1) {//no elements greater than scale
+                if(scale != 0.0)
+                    v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+            } else { //found greater element
+                //find max
+                v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl);
+                //rescale the ssq gathered so far from the old scale to the new maximum
+                ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
+                //update scale
+                scale = VFMVFS_FLOAT(v_res);
+                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+            }
+            vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl);
+            //reduce the per-lane sums in vr into v_res[0]
+            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+            //total ssq now
+            ssq += VFMVFS_FLOAT(v_res);
+        }
+    } else {
+        gvl = VSETVL(n);
+        vr = VFMVVF_FLOAT(0, gvl);
+        v_zero = VFMVVF_FLOAT(0, gvl);
+        BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; // byte stride between consecutive complex elements
+        BLASLONG idx = 0, inc_v = inc_x * gvl * 2; // BLASLONG, not int: idx indexes x[] in FLOAT units and can exceed INT_MAX for long strided vectors
+        for (i=0,j=0; i<n/gvl; i++) {
+            //real parts of this chunk
+            v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+            // fabs(vector)
+            v0 = VFABSV_FLOAT(v0, gvl);
+            //does any element exceed the current scale?
+            mask = VMFGTVF_FLOAT(v0, scale, gvl);
+            index = VMFIRSTM(mask, gvl);
+            if (index == -1) { // no elements greater than scale
+                if(scale != 0.0){
+                    v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                    vr = VFMACCVV_FLOAT_TU(vr, v0, v0, gvl);
+                }
+            } else {//found greater element
+                //reduce the per-lane sums in vr into v_res[0]
+                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+                //total ssq before current vector
+                ssq += VFMVFS_FLOAT(v_res);
+                //find max
+                v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl);
+                //rescale the ssq gathered so far from the old scale to the new maximum
+                ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
+                //update scale
+                scale = VFMVFS_FLOAT(v_res);
+                //restart vr from zero with the current chunk under the new scale
+                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl);
+            }
+            //imaginary parts of the same chunk
+            v0 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
+            //fabs(vector)
+            v0 = VFABSV_FLOAT(v0, gvl);
+            //does any element exceed the current scale?
+            mask = VMFGTVF_FLOAT(v0, scale, gvl);
+            index = VMFIRSTM(mask, gvl);
+            if (index == -1) { // no elements greater than scale
+                if(scale != 0.0) {
+                    v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                    vr = VFMACCVV_FLOAT_TU(vr, v0, v0, gvl);
+                }
+            } else { // found greater element
+                //reduce the per-lane sums in vr into v_res[0]
+                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+                //total ssq before current vector
+                ssq += VFMVFS_FLOAT(v_res);
+                //find max
+                v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl);
+                //rescale the ssq gathered so far from the old scale to the new maximum
+                ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
+                //update scale
+                scale = VFMVFS_FLOAT(v_res);
+                //restart vr from zero with the current chunk under the new scale
+                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl);
+            }
+            j += gvl;
+            idx += inc_v;
+        }
+        //reduce the per-lane sums in vr into v_res[0]
+        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+        //total ssq now
+        ssq += VFMVFS_FLOAT(v_res);
+
+        //tail
+        if (j < n) {
+            gvl = VSETVL(n-j);
+            v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
+            //fabs(vector)
+            v0 = VFABSV_FLOAT(v0, gvl);
+            //does any element exceed the current scale?
+            mask = VMFGTVF_FLOAT(v0, scale, gvl);
+            index = VMFIRSTM(mask, gvl);
+            if(index == -1) { // no elements greater than scale
+                if(scale != 0.0) {
+                    v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                    vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl); // starts from v_zero: the main-loop vr is already folded into ssq
+                }
+            } else { // found greater element
+                //find max
+                v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl);
+                //rescale the ssq gathered so far from the old scale to the new maximum
+                ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
+                //update scale
+                scale = VFMVFS_FLOAT(v_res);
+                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl);
+            }
+            //imaginary parts of the tail
+            v0 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
+            //fabs(vector)
+            v0 = VFABSV_FLOAT(v0, gvl);
+            //does any element exceed the current scale?
+            mask = VMFGTVF_FLOAT(v0, scale, gvl);
+            index = VMFIRSTM(mask, gvl);
+            if (index == -1) {//no elements greater than scale
+                if(scale != 0.0) {
+                    v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                    vr = VFMACCVV_FLOAT_TU(vr, v0, v0, gvl);
+                }
+            } else { // found greater element
+                //reduce the per-lane sums in vr into v_res[0]
+                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+                //total ssq before current vector
+                ssq += VFMVFS_FLOAT(v_res);
+                //find max
+                v_res = VFREDMAXVS_FLOAT_TU(v_res, v0, v_z0, gvl);
+                //rescale the ssq gathered so far from the old scale to the new maximum
+                ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
+                //update scale
+                scale = VFMVFS_FLOAT(v_res);
+                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
+                vr = VFMACCVV_FLOAT_TU(v_zero, v0, v0, gvl);
+            }
+            //reduce the per-lane sums in vr into v_res[0]
+            v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
+            //total ssq now
+            ssq += VFMVFS_FLOAT(v_res);
+        }
+    }
+	return(scale * sqrt(ssq));
+}
diff --git a/kernel/riscv64/znrm2_vector.c b/kernel/riscv64/znrm2_vector.c
index 59d0e219d..6ee3be79e 100644
--- a/kernel/riscv64/znrm2_vector.c
+++ b/kernel/riscv64/znrm2_vector.c
@@ -26,264 +26,158 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 #include "common.h"
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m4_t
-#define FLOAT_V_T_M1 vfloat32m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f32m1_f32
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VFREDSUM_FLOAT vfredusum_vs_f32m4_f32m1
-#define VFMACCVV_FLOAT vfmacc_vv_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
-#define VFDOTVV_FLOAT vfdot_vv_f32m4
-#define ABS fabsf
-#define MASK_T vbool8_t
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f32m4_m
-#define VMFGTVF_FLOAT vmfgt_vf_f32m4_b8
-#define VMFIRSTM vmfirst_m_b8
-#define VFDIVVF_FLOAT vfdiv_vf_f32m4
-#define VMFLTVF_FLOAT vmflt_vf_f32m4_b8
-#define VFREDMAXVS_FLOAT vfredmax_vs_f32m4_f32m1
+
+#ifdef RISCV64_ZVL256B
+#       define LMUL m1
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN 64
+#       else
+#               define ELEN 32
+#               define MLEN 32
+#       endif
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m4_t
-#define FLOAT_V_T_M1 vfloat64m1_t
-#define VFMVFS_FLOAT vfmv_f_s_f64m1_f64
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VFREDSUM_FLOAT vfredusum_vs_f64m4_f64m1
-#define VFMACCVV_FLOAT vfmacc_vv_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
-#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
-#define VFDOTVV_FLOAT vfdot_vv_f64m4
+#       define LMUL m4
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN 16
+#       else
+#               define ELEN 32
+#               define MLEN 8
+#       endif
+#endif
+
+#define _
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+
+#define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _)
+#define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
+#define FLOAT_V_T_M1    JOIN(vfloat,            ELEN,   m1,     _t,     _)
+#define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
+#define VFMVVF_FLOAT    JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   m1,     _)
+#define MASK_T          JOIN(vbool,             MLEN,   _t,     _,      _)
+#define VFABS           JOIN(RISCV_RVV(vfabs),     _v_f,   ELEN,   LMUL,   _)
+#define VMFNE           JOIN(RISCV_RVV(vmfne_vf_f),ELEN,   LMUL,   _b,     MLEN)
+#define VMFGT           JOIN(RISCV_RVV(vmfgt_vv_f),ELEN,   LMUL,   _b,     MLEN)
+#define VMFEQ           JOIN(RISCV_RVV(vmfeq_vv_f),ELEN,   LMUL,   _b,     MLEN)
+#define VCPOP           JOIN(RISCV_RVV(vcpop),     _m_b,   MLEN,   _,      _)
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDMAX(va, vb, gvl) JOIN(RISCV_RVV(vfredmax_vs_f),ELEN,LMUL,   JOIN2(_f, ELEN), m1)(v_res, va, vb, gvl)
+#define VFREDUSUM(va, vb, gvl) JOIN(RISCV_RVV(vfredusum_vs_f),ELEN,LMUL,  JOIN2(_f, ELEN), m1)(v_res, va, vb, gvl)
+#define VFDIV_M         JOIN(RISCV_RVV(vfdiv),     _vv_f,  ELEN,   LMUL,   _m)
+#define VFMACC_M        JOIN(RISCV_RVV(vfmacc),    _vv_f,  ELEN,   LMUL,   _m)
+#else
+#define VFREDMAX        JOIN(RISCV_RVV(vfredmax_vs_f),ELEN,LMUL,   JOIN2(_f, ELEN), m1)
+#define VFREDUSUM       JOIN(RISCV_RVV(vfredusum_vs_f),ELEN,LMUL,  JOIN2(_f, ELEN), m1)
+#define VFDIV_M         JOIN(RISCV_RVV(vfdiv),     _vv_f,  ELEN,   LMUL,   _mu)
+#define VFMACC_M        JOIN(RISCV_RVV(vfmacc),    _vv_f,  ELEN,   LMUL,   _mu)
+#endif
+#define VFIRST          JOIN(RISCV_RVV(vfirst),    _m_b,   MLEN,   _,      _)
+#define VRGATHER        JOIN(RISCV_RVV(vrgather),  _vx_f,  ELEN,   LMUL,   _)
+#define VFDIV           JOIN(RISCV_RVV(vfdiv),     _vf_f,  ELEN,   LMUL,   _)
+#define VFMUL           JOIN(RISCV_RVV(vfmul),     _vv_f,  ELEN,   LMUL,   _)
+#define VFMACC          JOIN(RISCV_RVV(vfmacc),    _vv_f,  ELEN,   LMUL,   _)
+#define VMSOF           JOIN(RISCV_RVV(vmsof),     _m_b,   MLEN,   _,      _)
+#define VMANDN          JOIN(RISCV_RVV(vmandn),    _mm_b,  MLEN,   _,      _)
+#if defined(DOUBLE)
 #define ABS fabs
-#define MASK_T vbool16_t
-#define VFRSUBVF_MASK_FLOAT vfrsub_vf_f64m4_m
-#define VMFGTVF_FLOAT vmfgt_vf_f64m4_b16
-#define VMFIRSTM vmfirst_m_b16
-#define VFDIVVF_FLOAT vfdiv_vf_f64m4
-#define VMFLTVF_FLOAT vmflt_vf_f64m4_b16
-#define VFREDMAXVS_FLOAT vfredmax_vs_f64m4_f64m1
+#else
+#define ABS fabsf
 #endif
 
+#define EXTRACT_FLOAT0_V(v) JOIN(RISCV_RVV(vfmv_f_s_f), ELEN, LMUL, _f, ELEN)(v)
+
+/* Complex 2-norm of n elements of x (stride inc_x): returns scale*sqrt(ssq), rescaling ssq whenever a larger |component| is seen; 0 if n <= 0 or inc_x <= 0. */
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
-	BLASLONG i=0, j=0;
+        BLASLONG i=0;
 
-	if ( n < 0 )  return(0.0);
-//        if(n == 1) return (ABS(x[0]));
+	if (n <= 0 || inc_x <= 0) return(0.0);
 
-        FLOAT_V_T vr, v0, v_zero;
+        FLOAT_V_T v_ssq, v_scale, v0, v1, v_zero;
         unsigned int gvl = 0;
         FLOAT_V_T_M1 v_res, v_z0;
-        gvl = VSETVL_MAX;
-        v_res = VFMVVF_FLOAT_M1(0, gvl);
-        v_z0 = VFMVVF_FLOAT_M1(0, gvl);
 
-        FLOAT scale = 0.0, ssq = 0.0;
-        MASK_T mask;
-        BLASLONG index = 0;
-        if(inc_x == 1){
-                BLASLONG n2 = n * 2;
-                gvl = VSETVL(n2);
-                vr = VFMVVF_FLOAT(0, gvl);
-                v_zero = VFMVVF_FLOAT(0, gvl);
-                for(i=0,j=0; i<n2/gvl; i++){
-                        v0 = VLEV_FLOAT(&x[j], gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(v0, 0, gvl);
-                        v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
-                        //if scale change
-                        mask = VMFGTVF_FLOAT(v0, scale, gvl);
-                        index = VMFIRSTM(mask, gvl);
-                        if(index == -1){//no elements greater than scale
-                                if(scale != 0.0){
-                                        v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                                        vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
-                                }
-                        }else{//found greater element
-                                //ssq in vector vr: vr[0]
-                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                //total ssq before current vector
-                                ssq += VFMVFS_FLOAT(v_res);
-                                //find max
-                                v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
-                                //update ssq before max_index
-                                ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
-                                //update scale
-                                scale = VFMVFS_FLOAT(v_res);
-                                //ssq in vector vr
-                                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                                vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
-                        }
-                        j += gvl;
-                }
-                //ssq in vector vr: vr[0]
-                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                //total ssq now
-                ssq += VFMVFS_FLOAT(v_res);
+        v_res = VFMVVF_FLOAT_M1(0, 1);
+        v_z0 = VFMVVF_FLOAT_M1(0, 1);
 
-                //tail
-                if(j < n2){
-                        gvl = VSETVL(n2-j);
-                        v0 = VLEV_FLOAT(&x[j], gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(v0, 0, gvl);
-                        v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
-                        //if scale change
-                        mask = VMFGTVF_FLOAT(v0, scale, gvl);
-                        index = VMFIRSTM(mask, gvl);
-                        if(index == -1){//no elements greater than scale
-                                if(scale != 0.0)
-                                        v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                        }else{//found greater element
-                                //find max
-                                v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
-                                //update ssq before max_index
-                                ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
-                                //update scale
-                                scale = VFMVFS_FLOAT(v_res);
-                                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                        }
-                        vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
-                        //ssq in vector vr: vr[0]
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                        //total ssq now
-                        ssq += VFMVFS_FLOAT(v_res);
-                }
-        }else{
-                gvl = VSETVL(n);
-                vr = VFMVVF_FLOAT(0, gvl);
-                v_zero = VFMVVF_FLOAT(0, gvl);
-                unsigned int stride_x = inc_x * sizeof(FLOAT) * 2;
-                int idx = 0, inc_v = inc_x * gvl * 2;
-                for(i=0,j=0; i<n/gvl; i++){
-                        v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(v0, 0, gvl);
-                        v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
-                        //if scale change
-                        mask = VMFGTVF_FLOAT(v0, scale, gvl);
-                        index = VMFIRSTM(mask, gvl);
-                        if(index == -1){//no elements greater than scale
-                                if(scale != 0.0){
-                                        v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                                        vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
-                                }
-                        }else{//found greater element
-                                //ssq in vector vr: vr[0]
-                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                //total ssq before current vector
-                                ssq += VFMVFS_FLOAT(v_res);
-                                //find max
-                                v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
-                                //update ssq before max_index
-                                ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
-                                //update scale
-                                scale = VFMVFS_FLOAT(v_res);
-                                //ssq in vector vr
-                                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                                vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
-                        }
+        gvl = VSETVL(n);
+        v_ssq = VFMVVF_FLOAT(0, gvl);
+        v_scale = VFMVVF_FLOAT(0, gvl);
+        v_zero = VFMVVF_FLOAT(0, gvl);
 
-                        v0 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(v0, 0, gvl);
-                        v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
-                        //if scale change
-                        mask = VMFGTVF_FLOAT(v0, scale, gvl);
-                        index = VMFIRSTM(mask, gvl);
-                        if(index == -1){//no elements greater than scale
-                                if(scale != 0.0){
-                                        v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                                        vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
-                                }
-                        }else{//found greater element
-                                //ssq in vector vr: vr[0]
-                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                //total ssq before current vector
-                                ssq += VFMVFS_FLOAT(v_res);
-                                //find max
-                                v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
-                                //update ssq before max_index
-                                ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
-                                //update scale
-                                scale = VFMVFS_FLOAT(v_res);
-                                //ssq in vector vr
-                                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                                vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
-                        }
-                        j += gvl;
-                        idx += inc_v;
+        BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;  // byte distance between consecutive complex elements
+        BLASLONG idx = 0;                               // FLOAT offset into x; BLASLONG so n*inc_x*2 cannot truncate
+        // Full vectors only: each lane keeps its own partial ssq, all lanes of v_scale hold the same scale.
+        for(i=0; i<n/gvl; i++){
+                v0 = VLSEV_FLOAT( &x[idx], stride_x, gvl );
+                v1 = VLSEV_FLOAT( &x[idx+1], stride_x, gvl );
+                v0 = VFABS( v0, gvl );
+                v1 = VFABS( v1, gvl );
+
+                MASK_T scale_mask0 = VMFGT( v0, v_scale, gvl );
+                MASK_T scale_mask1 = VMFGT( v1, v_scale, gvl );
+                if( VCPOP( scale_mask0, gvl ) + VCPOP( scale_mask1, gvl ) > 0 ){ // scale change?
+                        // find largest element in v0 and v1
+                        v_res = VFREDMAX( v0, v_z0, gvl );
+                        v_res = VFREDMAX( v1, v_res, gvl );
+                        FLOAT const largest_elt = EXTRACT_FLOAT( v_res );
+
+                        v_scale = VFDIV( v_scale, largest_elt, gvl );   // scale/largest_elt
+                        v_scale = VFMUL( v_scale, v_scale, gvl );       // (scale/largest_elt)*(scale/largest_elt)
+                        v_ssq = VFMUL( v_scale, v_ssq, gvl );           // ssq*(scale/largest_elt)*(scale/largest_elt)
+
+                        v_scale = VFMVVF_FLOAT( largest_elt, gvl );     // splatted largest_elt becomes new scale
                 }
-                //ssq in vector vr: vr[0]
-                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                //total ssq now
-                ssq += VFMVFS_FLOAT(v_res);
 
-                //tail
-                if(j < n){
-                        gvl = VSETVL(n-j);
-                        v0 = VLSEV_FLOAT(&x[idx], stride_x, gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(v0, 0, gvl);
-                        v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
-                        //if scale change
-                        mask = VMFGTVF_FLOAT(v0, scale, gvl);
-                        index = VMFIRSTM(mask, gvl);
-                        if(index == -1){//no elements greater than scale
-                                if(scale != 0.0){
-                                        v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                                        vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
+                MASK_T nonzero_mask0 = VMFNE( v0, 0, gvl );     // exact zeros are masked out so 0/0 never occurs while scale is still 0
+                MASK_T nonzero_mask1 = VMFNE( v1, 0, gvl );
+                v0 = VFDIV_M( nonzero_mask0, v_zero, v0, v_scale, gvl );
+                v1 = VFDIV_M( nonzero_mask1, v_zero, v1, v_scale, gvl );
+                v_ssq = VFMACC_M( nonzero_mask0, v_ssq, v0, v0, gvl );
+                v_ssq = VFMACC_M( nonzero_mask1, v_ssq, v1, v1, gvl );
+
+                idx += inc_x * gvl * 2;
+        }
+        // Collapse the per-lane partial sums; lane 0 of v_scale carries the common scale.
+        v_res = VFREDUSUM(v_ssq, v_z0, gvl);
+        FLOAT ssq = EXTRACT_FLOAT(v_res);
+        FLOAT scale = EXTRACT_FLOAT0_V(v_scale);
+
+        //finish any tail using scalar ops
+        i*=gvl;                 // complex elements already consumed by the vector loop
+        if(i<n){
+                i *= inc_x*2;   // from here on i and n are FLOAT offsets into x
+                n *= inc_x*2;
+                FLOAT temp;
+                do{
+                        if ( x[i] != 0.0 ){
+                                temp = ABS( x[i] );
+                                if ( scale < temp ){
+                                        ssq = 1 + ssq * ( scale / temp ) * ( scale / temp );
+                                        scale = temp ;
+                                }else{
+                                        ssq += ( temp / scale ) * ( temp / scale );
                                 }
-                        }else{//found greater element
-                                //find max
-                                v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
-                                //update ssq before max_index
-                                ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
-                                //update scale
-                                scale = VFMVFS_FLOAT(v_res);
-                                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                                vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
                         }
 
-                        v0 = VLSEV_FLOAT(&x[idx+1], stride_x, gvl);
-                        //fabs(vector)
-                        mask = VMFLTVF_FLOAT(v0, 0, gvl);
-                        v0 = VFRSUBVF_MASK_FLOAT(mask, v0, v0, 0, gvl);
-                        //if scale change
-                        mask = VMFGTVF_FLOAT(v0, scale, gvl);
-                        index = VMFIRSTM(mask, gvl);
-                        if(index == -1){//no elements greater than scale
-                                if(scale != 0.0){
-                                        v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                                        vr = VFMACCVV_FLOAT(vr, v0, v0, gvl);
+                        if ( x[i+1] != 0.0 ){
+                                temp = ABS( x[i+1] );
+                                if ( scale < temp ){
+                                        ssq = 1 + ssq * ( scale / temp ) * ( scale / temp );
+                                        scale = temp ;
+                                }else{
+                                        ssq += ( temp / scale ) * ( temp / scale );
                                 }
-                        }else{//found greater element
-                                //ssq in vector vr: vr[0]
-                                v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                                //total ssq before current vector
-                                ssq += VFMVFS_FLOAT(v_res);
-                                //find max
-                                v_res = VFREDMAXVS_FLOAT(v_res, v0, v_z0, gvl);
-                                //update ssq before max_index
-                                ssq = ssq * (scale/VFMVFS_FLOAT(v_res))*(scale/VFMVFS_FLOAT(v_res));
-                                //update scale
-                                scale = VFMVFS_FLOAT(v_res);
-                                v0 = VFDIVVF_FLOAT(v0, scale, gvl);
-                                vr = VFMACCVV_FLOAT(v_zero, v0, v0, gvl);
                         }
-                        //ssq in vector vr: vr[0]
-                        v_res = VFREDSUM_FLOAT(v_res, vr, v_z0, gvl);
-                        //total ssq now
-                        ssq += VFMVFS_FLOAT(v_res);
-                }
-        }
-	return(scale * sqrt(ssq));
-}
 
+                        i += inc_x*2;
+                }while(i<n);
+        }
 
+        return(scale * sqrt(ssq));
+}
diff --git a/kernel/riscv64/zrot_rvv.c b/kernel/riscv64/zrot_rvv.c
new file mode 100644
index 000000000..1d5390684
--- /dev/null
+++ b/kernel/riscv64/zrot_rvv.c
@@ -0,0 +1,228 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)
+#define VSETVL(n)               __riscv_vsetvl_e32m4(n)
+#define FLOAT_V_T               vfloat32m4_t
+#define FLOAT_VX2_T             vfloat32m4x2_t
+#define VGET_VX2                __riscv_vget_v_f32m4x2_f32m4
+#define VSET_VX2                __riscv_vset_v_f32m4_f32m4x2
+#define VLEV_FLOAT              __riscv_vle32_v_f32m4
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m4
+#define VSEV_FLOAT              __riscv_vse32_v_f32m4
+#define VSSEV_FLOAT             __riscv_vsse32_v_f32m4
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4x2
+#define VSSEG_FLOAT             __riscv_vsseg2e32_v_f32m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4x2
+#define VSSSEG_FLOAT            __riscv_vssseg2e32_v_f32m4x2
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m4
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m4
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m4
+#else
+#define VSETVL(n)               __riscv_vsetvl_e64m4(n)
+#define FLOAT_V_T               vfloat64m4_t
+#define FLOAT_VX2_T             vfloat64m4x2_t
+#define VGET_VX2                __riscv_vget_v_f64m4x2_f64m4
+#define VSET_VX2                __riscv_vset_v_f64m4_f64m4x2
+#define VLEV_FLOAT              __riscv_vle64_v_f64m4
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m4
+#define VSEV_FLOAT              __riscv_vse64_v_f64m4
+#define VSSEV_FLOAT             __riscv_vsse64_v_f64m4
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4x2
+#define VSSEG_FLOAT             __riscv_vsseg2e64_v_f64m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4x2
+#define VSSSEG_FLOAT            __riscv_vssseg2e64_v_f64m4x2
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m4
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m4
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m4
+#endif
+
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
+{
+    /* Rotates n complex pairs in place: x <- c*x + s*y, y <- c*y - s*x (c and s real). Always returns 0. */
+    if (n <= 0) return(0);
+
+    FLOAT_V_T vt0, vt1, vx0, vx1, vy0, vy1;
+    FLOAT_VX2_T vxx2, vyx2, vtx2;
+    /* A zero stride maps every iteration onto one element, so each update depends on the previous one: keep scalar order. */
+    if (inc_x == 0 || inc_y == 0) {
+        BLASLONG i=0;
+        BLASLONG ix=0,iy=0;
+        FLOAT temp[2];
+        BLASLONG inc_x2;
+        BLASLONG inc_y2;
+
+        inc_x2 = 2 * inc_x ;
+        inc_y2 = 2 * inc_y ;
+
+        while(i < n)
+        {
+            temp[0]   = c*x[ix]   + s*y[iy] ;
+            temp[1]   = c*x[ix+1] + s*y[iy+1] ;
+            y[iy]     = c*y[iy]   - s*x[ix] ;
+            y[iy+1]   = c*y[iy+1] - s*x[ix+1] ;
+            x[ix]     = temp[0] ;
+            x[ix+1]   = temp[1] ;
+
+            ix += inc_x2 ;
+            iy += inc_y2 ;
+            i++ ;
+        }
+    }
+    else if(inc_x == 1 && inc_y == 1) {
+        /* Both contiguous: two-field segment loads split each element into field 0 (vx0/vy0) and field 1 (vx1/vy1). */
+        for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) {
+            vl = VSETVL(n);
+
+            vxx2 = VLSEG_FLOAT(x, vl);
+            vyx2 = VLSEG_FLOAT(y, vl);
+
+            vx0 = VGET_VX2(vxx2, 0);
+            vx1 = VGET_VX2(vxx2, 1);
+            vy0 = VGET_VX2(vyx2, 0);
+            vy1 = VGET_VX2(vyx2, 1);
+
+            vt0 = VFMULVF_FLOAT(vx0, c, vl);
+            vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl);
+            vt1 = VFMULVF_FLOAT(vx1, c, vl);
+            vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl);
+            vy0 = VFMULVF_FLOAT(vy0, c, vl);
+            vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl);
+            vy1 = VFMULVF_FLOAT(vy1, c, vl);
+            vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl);
+
+            vtx2 = VSET_VX2(vtx2, 0, vt0);
+            vtx2 = VSET_VX2(vtx2, 1, vt1);
+            vyx2 = VSET_VX2(vyx2, 0, vy0);
+            vyx2 = VSET_VX2(vyx2, 1, vy1);
+
+            VSSEG_FLOAT(x, vtx2, vl);
+            VSSEG_FLOAT(y, vyx2, vl);
+        }
+
+    } else if (inc_x == 1){
+        BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
+        /* x contiguous, y strided; stride_y is the byte distance between consecutive complex elements of y. */
+        for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) {
+            vl = VSETVL(n);
+
+            vxx2 = VLSEG_FLOAT(x, vl);
+            vyx2 = VLSSEG_FLOAT(y, stride_y, vl);
+
+            vx0 = VGET_VX2(vxx2, 0);
+            vx1 = VGET_VX2(vxx2, 1);
+            vy0 = VGET_VX2(vyx2, 0);
+            vy1 = VGET_VX2(vyx2, 1);
+
+            vt0 = VFMULVF_FLOAT(vx0, c, vl);
+            vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl);
+            vt1 = VFMULVF_FLOAT(vx1, c, vl);
+            vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl);
+            vy0 = VFMULVF_FLOAT(vy0, c, vl);
+            vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl);
+            vy1 = VFMULVF_FLOAT(vy1, c, vl);
+            vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl);
+
+            vtx2 = VSET_VX2(vtx2, 0, vt0);
+            vtx2 = VSET_VX2(vtx2, 1, vt1);
+            vyx2 = VSET_VX2(vyx2, 0, vy0);
+            vyx2 = VSET_VX2(vyx2, 1, vy1);
+
+            VSSEG_FLOAT(x, vtx2, vl);
+            VSSSEG_FLOAT(y, stride_y, vyx2, vl);
+        }
+
+    } else if (inc_y == 1){
+        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
+        /* x strided, y contiguous; stride_x is the byte distance between consecutive complex elements of x. */
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) {
+            vl = VSETVL(n);
+
+            vxx2 = VLSSEG_FLOAT(x, stride_x, vl);
+            vyx2 = VLSEG_FLOAT(y, vl);
+
+            vx0 = VGET_VX2(vxx2, 0);
+            vx1 = VGET_VX2(vxx2, 1);
+            vy0 = VGET_VX2(vyx2, 0);
+            vy1 = VGET_VX2(vyx2, 1);
+
+            vt0 = VFMULVF_FLOAT(vx0, c, vl);
+            vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl);
+            vt1 = VFMULVF_FLOAT(vx1, c, vl);
+            vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl);
+            vy0 = VFMULVF_FLOAT(vy0, c, vl);
+            vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl);
+            vy1 = VFMULVF_FLOAT(vy1, c, vl);
+            vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl);
+
+            vtx2 = VSET_VX2(vtx2, 0, vt0);
+            vtx2 = VSET_VX2(vtx2, 1, vt1);
+            vyx2 = VSET_VX2(vyx2, 0, vy0);
+            vyx2 = VSET_VX2(vyx2, 1, vy1);
+
+            VSSSEG_FLOAT(x, stride_x, vtx2, vl);
+            VSSEG_FLOAT(y, vyx2, vl);
+        }
+
+    } else {
+        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
+        BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
+        /* Both strided (neither increment is 0 or 1 here): strided segment loads and stores on both vectors. */
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) {
+            vl = VSETVL(n);
+
+            vxx2 = VLSSEG_FLOAT(x, stride_x, vl);
+            vyx2 = VLSSEG_FLOAT(y, stride_y, vl);
+
+            vx0 = VGET_VX2(vxx2, 0);
+            vx1 = VGET_VX2(vxx2, 1);
+            vy0 = VGET_VX2(vyx2, 0);
+            vy1 = VGET_VX2(vyx2, 1);
+
+            vt0 = VFMULVF_FLOAT(vx0, c, vl);
+            vt0 = VFMACCVF_FLOAT(vt0, s, vy0, vl);
+            vt1 = VFMULVF_FLOAT(vx1, c, vl);
+            vt1 = VFMACCVF_FLOAT(vt1, s, vy1, vl);
+            vy0 = VFMULVF_FLOAT(vy0, c, vl);
+            vy0 = VFNMSACVF_FLOAT(vy0, s, vx0, vl);
+            vy1 = VFMULVF_FLOAT(vy1, c, vl);
+            vy1 = VFNMSACVF_FLOAT(vy1, s, vx1, vl);
+
+            vtx2 = VSET_VX2(vtx2, 0, vt0);
+            vtx2 = VSET_VX2(vtx2, 1, vt1);
+            vyx2 = VSET_VX2(vyx2, 0, vy0);
+            vyx2 = VSET_VX2(vyx2, 1, vy1);
+
+            VSSSEG_FLOAT(x, stride_x, vtx2, vl);
+            VSSSEG_FLOAT(y, stride_y, vyx2, vl);
+        }
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/zrot_vector.c b/kernel/riscv64/zrot_vector.c
index 2fdd8135a..50751b343 100644
--- a/kernel/riscv64/zrot_vector.c
+++ b/kernel/riscv64/zrot_vector.c
@@ -27,27 +27,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)()
 #define FLOAT_V_T vfloat32m4_t
-#define VLEV_FLOAT vle_v_f32m4
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSEV_FLOAT vse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
-#define VFMULVF_FLOAT vfmul_vf_f32m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
+#define VLEV_FLOAT RISCV_RVV(vle32_v_f32m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
+#define VSEV_FLOAT RISCV_RVV(vse32_v_f32m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4)
+#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4)
+#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4)
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)()
 #define FLOAT_V_T vfloat64m4_t
-#define VLEV_FLOAT vle_v_f64m4
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSEV_FLOAT vse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
-#define VFMULVF_FLOAT vfmul_vf_f64m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
+#define VLEV_FLOAT RISCV_RVV(vle64_v_f64m4)
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
+#define VSEV_FLOAT RISCV_RVV(vse64_v_f64m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
+#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4)
+#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4)
 #endif
 
 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
@@ -59,7 +59,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
         unsigned int gvl = 0;
 
         FLOAT_V_T vt0, vt1, vx0, vx1, vy0, vy1;
-        gvl = VSETVL(n);
+        gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1);
         BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
         BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
         BLASLONG inc_xv = inc_x * 2 * gvl;
diff --git a/kernel/riscv64/zscal.c b/kernel/riscv64/zscal.c
index 0521aaa0b..b2d537d04 100644
--- a/kernel/riscv64/zscal.c
+++ b/kernel/riscv64/zscal.c
@@ -60,6 +60,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
 			else
 			{
 				temp = - da_i * x[ip+1] ;
+				if (isnan(x[ip]) || isinf(x[ip])) temp = NAN;
 				x[ip+1] = da_i * x[ip]  ;
 			}
 		}
diff --git a/kernel/riscv64/zscal_rvv.c b/kernel/riscv64/zscal_rvv.c
new file mode 100644
index 000000000..ae79d9f9d
--- /dev/null
+++ b/kernel/riscv64/zscal_rvv.c
@@ -0,0 +1,112 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)            /* DOUBLE not defined: use the e32 / f32m4 intrinsic variants */
+#define VSETVL(n)               __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m4()
+#define FLOAT_V_T               vfloat32m4_t
+#define FLOAT_VX2_T             vfloat32m4x2_t                  /* tuple of two FLOAT_V_T */
+#define VGET_VX2                __riscv_vget_v_f32m4x2_f32m4
+#define VSET_VX2                __riscv_vset_v_f32m4_f32m4x2
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4x2     /* two-field segment load, unit stride */
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4x2    /* two-field segment load, byte stride */
+#define VSSEG_FLOAT             __riscv_vsseg2e32_v_f32m4x2     /* two-field segment store, unit stride */
+#define VSSSEG_FLOAT            __riscv_vssseg2e32_v_f32m4x2    /* two-field segment store, byte stride */
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m4
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m4
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m4
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4
+#else                           /* DOUBLE defined: same names mapped to the e64 / f64m4 variants */
+#define VSETVL(n)               __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m4()
+#define FLOAT_V_T               vfloat64m4_t
+#define FLOAT_VX2_T             vfloat64m4x2_t
+#define VGET_VX2                __riscv_vget_v_f64m4x2_f64m4
+#define VSET_VX2                __riscv_vset_v_f64m4_f64m4x2
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4x2
+#define VSSEG_FLOAT             __riscv_vsseg2e64_v_f64m4x2
+#define VSSSEG_FLOAT            __riscv_vssseg2e64_v_f64m4x2
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m4
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m4
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m4
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4
+#endif /* !defined(DOUBLE) */
+
+/* Scales n complex values in place: x[k] *= (da_r + i*da_i). x stores interleaved (re, im)
+ * pairs and consecutive values are inc_x complex elements apart. y, inc_y and the dummy
+ * arguments are not read. Returns 0; nothing is touched when n <= 0 or inc_x <= 0. */
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+    if((n <= 0) || (inc_x <= 0)) return(0);
+
+    FLOAT_V_T vt, vr, vi;
+    BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);  /* byte distance between complex values */
+    FLOAT_VX2_T vx2;
+
+    if(inc_x == 1) {
+        /* contiguous data: unit-stride segment load/store splits and re-joins re/im */
+        for (size_t vl; n > 0; n -= vl, x += vl*2) {
+            vl = VSETVL(n);
+
+            vx2 = VLSEG_FLOAT(x, vl);
+            vr = VGET_VX2(vx2, 0);
+            vi = VGET_VX2(vx2, 1);
+
+            vt = VFMULVF_FLOAT(vr, da_r, vl);           /* re*da_r                  */
+            vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl);     /* re*da_r - da_i*im        */
+            vi = VFMULVF_FLOAT(vi, da_r, vl);           /* im*da_r                  */
+            vi = VFMACCVF_FLOAT(vi, da_i, vr, vl);      /* im*da_r + da_i*re        */
+
+            vx2 = VSET_VX2(vx2, 0, vt);
+            vx2 = VSET_VX2(vx2, 1, vi);
+            VSSEG_FLOAT(x, vx2, vl);
+        }
+
+    } else {
+        /* strided data: same arithmetic, using the strided segment load/store */
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
+            vl = VSETVL(n);
+
+            vx2 = VLSSEG_FLOAT(x, stride_x, vl);
+            vr = VGET_VX2(vx2, 0);
+            vi = VGET_VX2(vx2, 1);
+
+            vt = VFMULVF_FLOAT(vr, da_r, vl);
+            vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl);
+            vi = VFMULVF_FLOAT(vi, da_r, vl);
+            vi = VFMACCVF_FLOAT(vi, da_i, vr, vl);
+
+            vx2 = VSET_VX2(vx2, 0, vt);
+            vx2 = VSET_VX2(vx2, 1, vi);
+            VSSSEG_FLOAT(x, stride_x, vx2, vl);
+        }
+    }
+    return(0);
+}
diff --git a/kernel/riscv64/zscal_vector.c b/kernel/riscv64/zscal_vector.c
index 64323aa3a..536bbdf73 100644
--- a/kernel/riscv64/zscal_vector.c
+++ b/kernel/riscv64/zscal_vector.c
@@ -27,25 +27,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m4(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e32m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e32m1)()
 #define FLOAT_V_T vfloat32m4_t
-#define VLSEV_FLOAT vlse_v_f32m4
-#define VSSEV_FLOAT vsse_v_f32m4
-#define VFMACCVF_FLOAT vfmacc_vf_f32m4
-#define VFMULVF_FLOAT vfmul_vf_f32m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f32m4
-#define VFMVVF_FLOAT vfmv_v_f_f32m4
+#define VLSEV_FLOAT RISCV_RVV(vlse32_v_f32m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse32_v_f32m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f32m4)
+#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f32m4)
+#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f32m4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f32m4)
 #else
-#define VSETVL(n) vsetvl_e64m4(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
+#define VSETVL(n) RISCV_RVV(vsetvl_e64m4)(n)
+#define VSETVL_MAX RISCV_RVV(vsetvlmax_e64m1)()
 #define FLOAT_V_T vfloat64m4_t
-#define VLSEV_FLOAT vlse_v_f64m4
-#define VSSEV_FLOAT vsse_v_f64m4
-#define VFMACCVF_FLOAT vfmacc_vf_f64m4
-#define VFMULVF_FLOAT vfmul_vf_f64m4
-#define VFNMSACVF_FLOAT vfnmsac_vf_f64m4
-#define VFMVVF_FLOAT vfmv_v_f_f64m4
+#define VLSEV_FLOAT RISCV_RVV(vlse64_v_f64m4)
+#define VSSEV_FLOAT RISCV_RVV(vsse64_v_f64m4)
+#define VFMACCVF_FLOAT RISCV_RVV(vfmacc_vf_f64m4)
+#define VFMULVF_FLOAT RISCV_RVV(vfmul_vf_f64m4)
+#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4)
+#define VFMVVF_FLOAT RISCV_RVV(vfmv_v_f_f64m4)
 #endif
 
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
@@ -59,84 +59,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
 
         unsigned int gvl = 0;
         FLOAT_V_T vt, v0, v1;
-        if(da_r == 0.0 && da_i == 0.0){
-                gvl = VSETVL(n);
-                BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
-                BLASLONG inc_xv = inc_x * 2 * gvl;
-                vt = VFMVVF_FLOAT(0.0, gvl);
-                for(i=0,j=0; i < n/(gvl*2); i++){
-                        VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
-                        VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl);
-                        VSSEV_FLOAT(&x[ix+inc_xv], stride_x, vt, gvl);
-                        VSSEV_FLOAT(&x[ix+inc_xv+1], stride_x, vt, gvl);
-
-                        j += gvl*2;
-                        ix += inc_xv*2;
-                }
-                for(; j < n; ){
-                        gvl = VSETVL(n-j);
-                        VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
-                        VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl);
-                        j += gvl;
-                        ix += inc_x * 2 * gvl;
-                }
-        }else if(da_r == 0.0){
-                gvl = VSETVL(n);
-                BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
-                BLASLONG inc_xv = inc_x * 2 * gvl;
-                for(i=0,j=0; i < n/gvl; i++){
-                        v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-                        v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
-
-                        vt = VFMULVF_FLOAT(v1, -da_i, gvl);
-                        v1 = VFMULVF_FLOAT(v0, da_i, gvl);
-
-                        VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
-                        VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl);
-
-                        j += gvl;
-                        ix += inc_xv;
-                }
-                if(j < n){
-                        gvl = VSETVL(n-j);
-                        v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-                        v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
-
-                        vt = VFMULVF_FLOAT(v1, -da_i, gvl);
-                        v1 = VFMULVF_FLOAT(v0, da_i, gvl);
-
-                        VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
-                        VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl);
-                }
-        }else if(da_i == 0.0){
-                gvl = VSETVL(n);
-                BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
-                BLASLONG inc_xv = inc_x * 2 * gvl;
-                for(i=0,j=0; i < n/gvl; i++){
-                        v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-                        v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
-
-                        vt = VFMULVF_FLOAT(v0, da_r, gvl);
-                        v1 = VFMULVF_FLOAT(v1, da_r, gvl);
-
-                        VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
-                        VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl);
-
-                        j += gvl;
-                        ix += inc_xv;
-                }
-                if(j < n){
-                        gvl = VSETVL(n-j);
-                        v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
-                        v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
-
-                        vt = VFMULVF_FLOAT(v0, da_r, gvl);
-                        v1 = VFMULVF_FLOAT(v1, da_r, gvl);
-
-                        VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
-                        VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl);
-                }
-        }else{
+        {
                 gvl = VSETVL(n);
                 BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
                 BLASLONG inc_xv = inc_x * 2 * gvl;
diff --git a/kernel/riscv64/zsum_rvv.c b/kernel/riscv64/zsum_rvv.c
new file mode 100644
index 000000000..489188bd5
--- /dev/null
+++ b/kernel/riscv64/zsum_rvv.c
@@ -0,0 +1,107 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)            /* DOUBLE not defined: use the e32 / f32 intrinsic variants */
+#define VSETVL(n)               __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m4()
+#define FLOAT_V_T               vfloat32m4_t
+#define FLOAT_V_T_M1            vfloat32m1_t                    /* m1 register type used for the reduction result */
+#define FLOAT_VX2_T             vfloat32m4x2_t                  /* tuple of two FLOAT_V_T */
+#define VGET_VX2                __riscv_vget_v_f32m4x2_f32m4
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4x2     /* two-field segment load, unit stride */
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4x2    /* two-field segment load, byte stride */
+#define VFREDSUMVS_FLOAT        __riscv_vfredusum_vs_f32m4_f32m1
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
+#define VFADDVV_FLOAT_TU        __riscv_vfadd_vv_f32m4_tu       /* "_tu" intrinsic variant of vfadd.vv */
+#else                           /* DOUBLE defined: same names mapped to the e64 / f64 variants */
+#define VSETVL(n)               __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m4()
+#define FLOAT_V_T               vfloat64m4_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define FLOAT_VX2_T             vfloat64m4x2_t
+#define VGET_VX2                __riscv_vget_v_f64m4x2_f64m4
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4x2
+#define VFREDSUMVS_FLOAT        __riscv_vfredusum_vs_f64m4_f64m1
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
+#define VFADDVV_FLOAT_TU        __riscv_vfadd_vv_f64m4_tu
+#endif /* !defined(DOUBLE) */
+
+/* Returns the sum of every real and imaginary component of the n complex values in x;
+ * consecutive values are inc_x complex elements apart. Yields 0.0 if n <= 0 or inc_x <= 0. */
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    FLOAT total = 0.0;
+    if (n <= 0 || inc_x <= 0) return(total);
+
+    const size_t vl_full = VSETVL_MAX;
+    FLOAT_V_T v_acc = VFMVVF_FLOAT(0, vl_full);    /* lane-wise partial sums, all starting at zero */
+    FLOAT_VX2_T v_pair;
+    FLOAT_V_T v_re, v_im;
+
+    if(inc_x != 1) {
+        /* strided input: byte distance between consecutive complex values */
+        const BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;
+
+        while (n > 0) {
+            size_t vl = VSETVL(n);
+            v_pair = VLSSEG_FLOAT(x, stride_x, vl);
+            v_re = VGET_VX2(v_pair, 0);
+            v_im = VGET_VX2(v_pair, 1);
+
+            /* _tu adds: lanes at index >= vl keep their previous partial sums */
+            v_acc = VFADDVV_FLOAT_TU(v_acc, v_acc, v_re, vl);
+            v_acc = VFADDVV_FLOAT_TU(v_acc, v_acc, v_im, vl);
+            n -= vl;
+            x += vl*inc_x*2;
+        }
+    } else {
+        /* contiguous input: unit-stride segment loads */
+        while (n > 0) {
+            size_t vl = VSETVL(n);
+            v_pair = VLSEG_FLOAT(x, vl);
+            v_re = VGET_VX2(v_pair, 0);
+            v_im = VGET_VX2(v_pair, 1);
+            v_acc = VFADDVV_FLOAT_TU(v_acc, v_acc, v_re, vl);
+            v_acc = VFADDVV_FLOAT_TU(v_acc, v_acc, v_im, vl);
+            n -= vl;
+            x += vl*2;
+        }
+    }
+
+    /* fold the lane-wise partial sums into a single scalar */
+    FLOAT_V_T_M1 v_red = VFMVVF_FLOAT_M1(0, vl_full);
+    v_red = VFREDSUMVS_FLOAT(v_acc, v_red, vl_full);
+    total += VFMVFS_FLOAT_M1(v_red);
+    return(total);
+}
diff --git a/kernel/riscv64/zsum_vector.c b/kernel/riscv64/zsum_vector.c
new file mode 100644
index 000000000..ca0b02b5c
--- /dev/null
+++ b/kernel/riscv64/zsum_vector.c
@@ -0,0 +1,131 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#ifdef RISCV64_ZVL256B          /* m2 register groups when RISCV64_ZVL256B is defined */
+#       define LMUL m2
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN _b32
+#       else
+#               define ELEN 32
+#               define MLEN _b16
+#       endif
+#else                           /* otherwise m8 register groups */
+#       define LMUL m8
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN _b8
+#       else
+#               define ELEN 32
+#               define MLEN _b4
+#       endif
+#endif /* RISCV64_ZVL256B */
+/* "_" expands to nothing; it pads JOIN() calls that need fewer than five name fragments. */
+#define _
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)   /* extra level so arguments are macro-expanded before pasting */
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+/* Type and intrinsic names assembled from the ELEN (element bits) and LMUL (suffix) fragments. */
+#define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _)
+#define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
+#define FLOAT_V_T_M1    JOIN(vfloat,            ELEN,   m1,     _t,     _)
+#define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
+#define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
+#define VFMVVF_FLOAT    JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   m1,     _)
+#define VFADDVV_FLOAT   JOIN(RISCV_RVV(vfadd),     _vv_f,  ELEN,   LMUL,   _)
+#define VMFLTVF_FLOAT   JOIN(RISCV_RVV(vmflt),     _vf_f,  ELEN,   LMUL,   MLEN)
+
+/* Returns the sum of the real and imaginary parts of n complex values read from x, where
+ * consecutive values are inc_x complex elements apart. Returns 0.0 if n <= 0 or inc_x <= 0. */
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+	BLASLONG i=0, j=0;
+	BLASLONG ix=0;
+	FLOAT asumf=0.0;
+	if (n <= 0 || inc_x <= 0) return(asumf);
+        unsigned int gvl = 0;
+        FLOAT_V_T v0, v1, v_sum;
+        FLOAT_V_T_M1 v_res;
+        v_res = VFMVVF_FLOAT_M1(0, 1);  /* running scalar total lives in element 0 */
+
+        if(inc_x == 1){
+                BLASLONG n2 = n * 2;    /* contiguous case: process the data as 2*n plain FLOATs */
+                gvl = VSETVL(n2);
+                if(gvl <= n2/2){
+                        v_sum = VFMVVF_FLOAT(0, gvl);
+                        for(i=0,j=0; i<n2/(gvl*2); i++){
+                                v0 = VLEV_FLOAT(&x[j], gvl);
+                                v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
+
+                                v1 = VLEV_FLOAT(&x[j+gvl], gvl);
+                                v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
+                                j += gvl * 2;
+                        }
+                        v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
+                }
+                for(;j<n2;){
+                        gvl = VSETVL(n2-j);
+                        v0 = VLEV_FLOAT(&x[j], gvl);
+                        v_res = VFREDSUMVS_FLOAT(v0, v_res, gvl);
+                        j += gvl;
+                }
+        }else{
+                gvl = VSETVL(n);
+                BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2;  /* bytes; BLASLONG so a large inc_x is not truncated */
+
+                BLASLONG inc_xv = inc_x * 2 * gvl;
+                v_sum = VFMVVF_FLOAT(0, gvl);
+                for(i=0,j=0; i<n/gvl; i++){
+                        v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+                        v_sum = VFADDVV_FLOAT(v_sum, v0, gvl);
+
+                        v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+                        v_sum = VFADDVV_FLOAT(v_sum, v1, gvl);
+
+                        j += gvl;
+                        ix += inc_xv;
+                }
+                v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
+                if(j<n){
+                        gvl = VSETVL(n-j);
+                        v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
+
+                        v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);
+                        v_sum = VFADDVV_FLOAT(v0, v1, gvl);
+                        v_res = VFREDSUMVS_FLOAT(v_sum, v_res, gvl);
+                }
+        }
+        asumf = EXTRACT_FLOAT(v_res);
+	return(asumf);
+}
+
+
diff --git a/kernel/riscv64/zswap.c b/kernel/riscv64/zswap.c
index ae4760ae0..df1402b94 100644
--- a/kernel/riscv64/zswap.c
+++ b/kernel/riscv64/zswap.c
@@ -45,7 +45,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm
 	BLASLONG inc_x2;
 	BLASLONG inc_y2;
 
-	if ( n < 0     )  return(0);
+	if ( n <= 0     )  return(0);
 
 	inc_x2 = 2 * inc_x;
 	inc_y2 = 2 * inc_y;
diff --git a/kernel/riscv64/zswap_rvv.c b/kernel/riscv64/zswap_rvv.c
new file mode 100644
index 000000000..c2adf5e05
--- /dev/null
+++ b/kernel/riscv64/zswap_rvv.c
@@ -0,0 +1,156 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)            /* DOUBLE not defined: use the e32 / f32m4x2 intrinsic variants */
+#define VSETVL(n)               __riscv_vsetvl_e32m4(n)
+#define FLOAT_VX2_T             vfloat32m4x2_t
+#define VLSEG_FLOAT             __riscv_vlseg2e32_v_f32m4x2     /* two-field segment load, unit stride */
+#define VLSSEG_FLOAT            __riscv_vlsseg2e32_v_f32m4x2    /* two-field segment load, byte stride */
+#define VSSEG_FLOAT             __riscv_vsseg2e32_v_f32m4x2     /* two-field segment store, unit stride */
+#define VSSSEG_FLOAT            __riscv_vssseg2e32_v_f32m4x2    /* two-field segment store, byte stride */
+#else                           /* DOUBLE defined: same names mapped to the e64 / f64m4x2 variants */
+#define VSETVL(n)               __riscv_vsetvl_e64m4(n)
+#define FLOAT_VX2_T             vfloat64m4x2_t
+#define VLSEG_FLOAT             __riscv_vlseg2e64_v_f64m4x2
+#define VLSSEG_FLOAT            __riscv_vlsseg2e64_v_f64m4x2
+#define VSSEG_FLOAT             __riscv_vsseg2e64_v_f64m4x2
+#define VSSSEG_FLOAT            __riscv_vssseg2e64_v_f64m4x2
+#endif /* !defined(DOUBLE) */
+
+/* Exchanges n complex values between x and y (strides inc_x / inc_y, in complex elements).
+ * With a zero stride every step hits the same element, so those cases build the result of the
+ * element-by-element swap explicitly. The dummy arguments are not read. Returns 0. */
+int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
+{
+    if (n <= 0) return(0);
+
+    FLOAT_VX2_T vxx2, vyx2;
+
+    if (inc_x == 0 && inc_y == 0) {
+        /* the same pair is swapped n times: only an odd n leaves it exchanged */
+        if (n & 1) {
+            FLOAT temp[2];
+            temp[0] = x[0];
+            temp[1] = x[1];
+            x[0] = y[0];
+            x[1] = y[1];
+            y[0] = temp[0];
+            y[1] = temp[1];
+        }
+        else {
+            return 0;
+        }
+    }
+    else if(inc_x == 0) {
+        /* x[0] receives the last y value, y moves up by one element, y[0] receives the old x[0] */
+        FLOAT temp[2];
+        temp[0] = x[0];
+        temp[1] = x[1];
+        x[0] = y[(n - 1) * inc_y * 2];
+        x[1] = y[(n - 1) * inc_y * 2 + 1];      /* imaginary part goes to x[1], not x[0] */
+        FLOAT* ptr = y + (n - 1) * inc_y * 2;   // start from the last one
+        BLASLONG stride_y = (0 - inc_y) * sizeof(FLOAT) * 2; // reverse
+        BLASLONG m = n - 1;                     /* elements still to be shifted */
+        for (size_t vl; m > 0; m -= vl, ptr -= vl*inc_y * 2) {
+            vl = VSETVL(m);
+            vyx2 = VLSSEG_FLOAT(ptr - inc_y * 2, stride_y, vl);  /* predecessors, one stride below ptr */
+            VSSSEG_FLOAT(ptr, stride_y, vyx2, vl);
+        }
+        y[0] = temp[0];
+        y[1] = temp[1];
+    }
+    else if(inc_y == 0) {
+        /* mirror image of the branch above with the roles of x and y exchanged */
+        FLOAT temp[2];
+        temp[0] = y[0];
+        temp[1] = y[1];
+        y[0] = x[(n - 1) * inc_x * 2];
+        y[1] = x[(n - 1) * inc_x * 2 + 1];      /* imaginary part goes to y[1], not y[0] */
+        FLOAT* ptr = x + (n - 1) * inc_x * 2;   // start from the last one
+        BLASLONG stride_x = (0 - inc_x) * sizeof(FLOAT) * 2; // reverse
+        BLASLONG m = n - 1;                     /* elements still to be shifted */
+        for (size_t vl; m > 0; m -= vl, ptr -= vl*inc_x * 2) {
+            vl = VSETVL(m);
+            vxx2 = VLSSEG_FLOAT(ptr - inc_x * 2, stride_x, vl);  /* predecessors, one stride below ptr */
+            VSSSEG_FLOAT(ptr, stride_x, vxx2, vl);
+        }
+        x[0] = temp[0];
+        x[1] = temp[1];
+    }
+    else if(inc_x == 1 && inc_y == 1) {
+        for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*2) {
+            vl = VSETVL(n);
+
+            vxx2 = VLSEG_FLOAT(x, vl);
+            vyx2 = VLSEG_FLOAT(y, vl);
+
+            VSSEG_FLOAT(y, vxx2, vl);
+            VSSEG_FLOAT(x, vyx2, vl);
+        }
+    } else if (inc_x == 1){
+        BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*2, y += vl*inc_y*2) {
+            vl = VSETVL(n);
+
+            vxx2 = VLSEG_FLOAT(x, vl);
+            vyx2 = VLSSEG_FLOAT(y, stride_y, vl);
+
+            VSSSEG_FLOAT(y, stride_y, vxx2, vl);
+            VSSEG_FLOAT(x, vyx2, vl);
+        }
+    } else if (inc_y == 1){
+        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*2) {
+            vl = VSETVL(n);
+
+            vxx2 = VLSSEG_FLOAT(x, stride_x, vl);
+            vyx2 = VLSEG_FLOAT(y, vl);
+
+            VSSEG_FLOAT(y, vxx2, vl);
+            VSSSEG_FLOAT(x, stride_x, vyx2, vl);
+        }
+    } else {
+        BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
+        BLASLONG stride_y = inc_y * 2 * sizeof(FLOAT);
+
+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2, y += vl*inc_y*2) {
+            vl = VSETVL(n);
+
+            vxx2 = VLSSEG_FLOAT(x, stride_x, vl);
+            vyx2 = VLSSEG_FLOAT(y, stride_y, vl);
+
+            VSSSEG_FLOAT(y, stride_y, vxx2, vl);
+            VSSSEG_FLOAT(x, stride_x, vyx2, vl);
+        }
+    }
+
+    return(0);
+}
diff --git a/kernel/riscv64/zswap_vector.c b/kernel/riscv64/zswap_vector.c
index 7550294b5..f2734c4a9 100644
--- a/kernel/riscv64/zswap_vector.c
+++ b/kernel/riscv64/zswap_vector.c
@@ -27,35 +27,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 #include <stdio.h>
-#if !defined(DOUBLE)
-#define VSETVL(n) vsetvl_e32m8(n)
-#define VSETVL_MAX vsetvlmax_e32m1()
-#define FLOAT_V_T vfloat32m8_t
-#define VLEV_FLOAT vle_v_f32m8
-#define VLSEV_FLOAT vlse_v_f32m8
-#define VSEV_FLOAT vse_v_f32m8
-#define VSSEV_FLOAT vsse_v_f32m8
+
+#ifdef RISCV64_ZVL256B
+#       define LMUL m2
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN 64
+#       else
+#               define ELEN 32
+#               define MLEN 32
+#       endif
 #else
-#define VSETVL(n) vsetvl_e64m8(n)
-#define VSETVL_MAX vsetvlmax_e64m1()
-#define FLOAT_V_T vfloat64m8_t
-#define VLEV_FLOAT vle_v_f64m8
-#define VLSEV_FLOAT vlse_v_f64m8
-#define VSEV_FLOAT vse_v_f64m8
-#define VSSEV_FLOAT vsse_v_f64m8
+#       define LMUL m8
+#       if defined(DOUBLE)
+#               define ELEN 64
+#               define MLEN 16
+#       else
+#               define ELEN 32
+#               define MLEN 8
+#       endif
 #endif
 
+#define _
+#define JOIN2_X(x, y) x ## y
+#define JOIN2(x, y) JOIN2_X(x, y)
+#define JOIN(v, w, x, y, z) JOIN2( JOIN2( JOIN2( JOIN2( v, w ), x), y), z)
+
+#define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _)
+#define FLOAT_V_T       JOIN(vfloat,            ELEN,   LMUL,   _t,     _)
+#define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
+#define VSEV_FLOAT      JOIN(RISCV_RVV(vse),       ELEN,   _v_f,   ELEN,   LMUL)
+#define VSSEV_FLOAT     JOIN(RISCV_RVV(vsse),      ELEN,   _v_f,   ELEN,   LMUL)
+
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
 {
 	BLASLONG i = 0, j = 0;
 	BLASLONG ix = 0,iy = 0;
         BLASLONG stride_x, stride_y;
         FLOAT_V_T vx0, vx1, vy0, vy1;
-        unsigned int gvl = 0;
+        unsigned int gvl = VSETVL((inc_x != 0 && inc_y != 0) ? n : 1);
+        if( inc_x == 0 && inc_y == 0 ) { n = n & 1; }
 
-	if (n < 0)  return(0);
+	if (n <= 0)  return(0);
         if(inc_x == 1 && inc_y == 1){
-                gvl = VSETVL(n);
                 BLASLONG n2 = n * 2;
                 if(gvl <= n2/2){
                         for(i=0,j=0; i<n2/(2*gvl); i++){
@@ -80,7 +95,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dumm
                         j += gvl;
                 }
         }else{
-                gvl = VSETVL(n);
                 stride_x = inc_x * 2 * sizeof(FLOAT);
                 stride_y = inc_y * 2 * sizeof(FLOAT);
                 BLASLONG inc_xv = inc_x * gvl * 2;
diff --git a/kernel/riscv64/zsymm_lcopy_rvv_v1.c b/kernel/riscv64/zsymm_lcopy_rvv_v1.c
new file mode 100644
index 000000000..f4d806190
--- /dev/null
+++ b/kernel/riscv64/zsymm_lcopy_rvv_v1.c
@@ -0,0 +1,121 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)                                        /* DOUBLE undefined: 32-bit elements, LMUL=2 */
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m2()
+#define FLOAT_V_T               vfloat32m2_t
+#define FLOAT_VX2_T             vfloat32m2x2_t              /* tuple of two FLOAT_V_T fields (0 and 1) */
+#define VGET_VX2                __riscv_vget_v_f32m2x2_f32m2 /* extract one field of the tuple */
+#define VSET_VX2                __riscv_vset_v_f32m2_f32m2x2 /* replace one field of the tuple */
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2       /* not referenced by CNAME in this file */
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2       /* not referenced by CNAME in this file */
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m2      /* not referenced by CNAME in this file */
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2x2 /* unit-stride two-field segment load */
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2x2 /* byte-strided two-field segment load */
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2 /* unit-stride two-field segment store */
+#define INT_V_T                 vint32m2_t
+#define VID_V_INT               __riscv_vid_v_i32m2         /* lane indices 0, 1, 2, ... */
+#define VADD_VX_INT             __riscv_vadd_vx_i32m2
+#define VMSGT_VX_INT            __riscv_vmsgt_vx_i32m2_b16  /* signed "greater than scalar" compare */
+#define VBOOL_T                 vbool16_t                   /* mask type matching e32/m2 */
+#define VMERGE_VVM_FLOAT        __riscv_vmerge_vvm_f32m2    /* per lane: mask ? 2nd operand : 1st operand */
+#else                                                       /* DOUBLE defined: 64-bit elements, LMUL=2 */
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m2()
+#define FLOAT_V_T               vfloat64m2_t
+#define FLOAT_VX2_T             vfloat64m2x2_t              /* tuple of two FLOAT_V_T fields (0 and 1) */
+#define VGET_VX2                __riscv_vget_v_f64m2x2_f64m2 /* extract one field of the tuple */
+#define VSET_VX2                __riscv_vset_v_f64m2_f64m2x2 /* replace one field of the tuple */
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2       /* not referenced by CNAME in this file */
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2       /* not referenced by CNAME in this file */
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m2      /* not referenced by CNAME in this file */
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2x2 /* unit-stride two-field segment load */
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2x2 /* byte-strided two-field segment load */
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2 /* unit-stride two-field segment store */
+#define INT_V_T                 vint64m2_t
+#define VID_V_INT               __riscv_vid_v_i64m2         /* lane indices 0, 1, 2, ... */
+#define VADD_VX_INT             __riscv_vadd_vx_i64m2
+#define VMSGT_VX_INT            __riscv_vmsgt_vx_i64m2_b32  /* signed "greater than scalar" compare */
+#define VBOOL_T                 vbool32_t                   /* mask type matching e64/m2 */
+#define VMERGE_VVM_FLOAT        __riscv_vmerge_vvm_f64m2    /* per lane: mask ? 2nd operand : 1st operand */
+#endif                                                      /* !defined(DOUBLE) */
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b)
+{
+    /* Writes m rows per strip of up to vl lanes to b; every lane is one (re, im) pair read from a.
+     * Row r of a strip takes a lane from the unit-stride read when lane + posX - posY - r > 0,
+     * and from the lda-strided read otherwise (posX grows by vl per strip). Always returns 0. */
+    const BLASLONG col_stride_bytes = sizeof(FLOAT) * lda * 2;
+    BLASLONG rows_left, cols_left, diag_dist;
+    FLOAT *unit_src, *strided_src;
+    FLOAT_V_T unit_re, unit_im, strided_re, strided_im, out_re, out_im;
+    FLOAT_VX2_T unit_pair, strided_pair, out_pair;
+    VBOOL_T take_unit;
+    INT_V_T lane_ids, lane_dist;
+    size_t vl = VSETVL_MAX;
+    /* lane_ids = {0, 1, 2, ...}, generated once at the maximum vector length. */
+    lane_ids = VID_V_INT(vl);
+
+    cols_left = n;
+    while (cols_left > 0) {
+        vl = VSETVL(cols_left);
+        diag_dist   = posX - posY;
+        unit_src    = a + posX * 2 + posY * lda * 2;
+        strided_src = a + posY * 2 + (posX) * lda * 2;
+        for (rows_left = m; rows_left > 0; rows_left--) {
+            /* Read the candidate values through both access patterns. */
+            unit_pair    = VLSEG2_FLOAT(unit_src, vl);
+            strided_pair = VLSSEG2_FLOAT(strided_src, col_stride_bytes, vl);
+            unit_re    = VGET_VX2(unit_pair, 0);
+            unit_im    = VGET_VX2(unit_pair, 1);
+            strided_re = VGET_VX2(strided_pair, 0);
+            strided_im = VGET_VX2(strided_pair, 1);
+
+            /* Mask is set in lanes where lane index + diag_dist is positive. */
+            lane_dist = VADD_VX_INT(lane_ids, diag_dist, vl);
+            take_unit = VMSGT_VX_INT(lane_dist, 0, vl);
+            out_re = VMERGE_VVM_FLOAT(strided_re, unit_re, take_unit, vl);
+            out_im = VMERGE_VVM_FLOAT(strided_im, unit_im, take_unit, vl);
+
+            out_pair = VSET_VX2(out_pair, 0, out_re);
+            out_pair = VSET_VX2(out_pair, 1, out_im);
+            VSSEG2_FLOAT(b, out_pair, vl);
+            b           += vl * 2;
+            unit_src    += lda * 2;
+            strided_src += 2;
+            diag_dist--;
+        }
+        cols_left -= vl;
+        posX      += vl;
+    }
+
+    return 0;
+}
+
diff --git a/kernel/riscv64/zsymm_ucopy_rvv_v1.c b/kernel/riscv64/zsymm_ucopy_rvv_v1.c
new file mode 100644
index 000000000..069551bb0
--- /dev/null
+++ b/kernel/riscv64/zsymm_ucopy_rvv_v1.c
@@ -0,0 +1,121 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)                                        /* DOUBLE undefined: 32-bit elements, LMUL=2 */
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m2()
+#define FLOAT_V_T               vfloat32m2_t
+#define FLOAT_VX2_T             vfloat32m2x2_t              /* tuple of two FLOAT_V_T fields (0 and 1) */
+#define VGET_VX2                __riscv_vget_v_f32m2x2_f32m2 /* extract one field of the tuple */
+#define VSET_VX2                __riscv_vset_v_f32m2_f32m2x2 /* replace one field of the tuple */
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2       /* not referenced by CNAME in this file */
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2       /* not referenced by CNAME in this file */
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m2      /* not referenced by CNAME in this file */
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2x2 /* unit-stride two-field segment load */
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2x2 /* byte-strided two-field segment load */
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2 /* unit-stride two-field segment store */
+#define INT_V_T                 vint32m2_t
+#define VID_V_INT               __riscv_vid_v_i32m2         /* lane indices 0, 1, 2, ... */
+#define VADD_VX_INT             __riscv_vadd_vx_i32m2
+#define VMSGT_VX_INT            __riscv_vmsgt_vx_i32m2_b16  /* signed "greater than scalar" compare */
+#define VBOOL_T                 vbool16_t                   /* mask type matching e32/m2 */
+#define VMERGE_VVM_FLOAT        __riscv_vmerge_vvm_f32m2    /* per lane: mask ? 2nd operand : 1st operand */
+#else                                                       /* DOUBLE defined: 64-bit elements, LMUL=2 */
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m2()
+#define FLOAT_V_T               vfloat64m2_t
+#define FLOAT_VX2_T             vfloat64m2x2_t              /* tuple of two FLOAT_V_T fields (0 and 1) */
+#define VGET_VX2                __riscv_vget_v_f64m2x2_f64m2 /* extract one field of the tuple */
+#define VSET_VX2                __riscv_vset_v_f64m2_f64m2x2 /* replace one field of the tuple */
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2       /* not referenced by CNAME in this file */
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2       /* not referenced by CNAME in this file */
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m2      /* not referenced by CNAME in this file */
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2x2 /* unit-stride two-field segment load */
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2x2 /* byte-strided two-field segment load */
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2 /* unit-stride two-field segment store */
+#define INT_V_T                 vint64m2_t
+#define VID_V_INT               __riscv_vid_v_i64m2         /* lane indices 0, 1, 2, ... */
+#define VADD_VX_INT             __riscv_vadd_vx_i64m2
+#define VMSGT_VX_INT            __riscv_vmsgt_vx_i64m2_b32  /* signed "greater than scalar" compare */
+#define VBOOL_T                 vbool32_t                   /* mask type matching e64/m2 */
+#define VMERGE_VVM_FLOAT        __riscv_vmerge_vvm_f64m2    /* per lane: mask ? 2nd operand : 1st operand */
+#endif                                                      /* !defined(DOUBLE) */
+
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b)
+{
+    /* Emits m rows per strip of up to vl lanes into b, each lane being one (re, im) pair from a.
+     * For row r of a strip, a lane comes from the lda-strided read when lane + posX - posY - r > 0
+     * and from the unit-stride read otherwise (posX grows by vl per strip). Always returns 0. */
+    const BLASLONG byte_stride = sizeof(FLOAT) * lda * 2;
+    BLASLONG row, remaining, shift;
+    FLOAT *gather_ptr, *linear_ptr;
+    FLOAT_V_T gather_re, gather_im, linear_re, linear_im, pick_re, pick_im;
+    FLOAT_VX2_T gather_seg, linear_seg, pick_seg;
+    VBOOL_T use_gather;
+    INT_V_T iota, shifted;
+    size_t vl = VSETVL_MAX;
+    /* iota = {0, 1, 2, ...}, built once at the maximum vector length. */
+    iota = VID_V_INT(vl);
+
+    remaining = n;
+    while (remaining > 0) {
+        vl = VSETVL(remaining);
+        shift      = posX - posY;
+        gather_ptr = a + posY * 2 + posX * lda * 2;
+        linear_ptr = a + posX * 2 + posY * lda * 2;
+        for (row = m; row > 0; row--) {
+            /* Fetch both candidates for every lane. */
+            gather_seg = VLSSEG2_FLOAT(gather_ptr, byte_stride, vl);
+            linear_seg = VLSEG2_FLOAT(linear_ptr, vl);
+            linear_re = VGET_VX2(linear_seg, 0);
+            linear_im = VGET_VX2(linear_seg, 1);
+            gather_re = VGET_VX2(gather_seg, 0);
+            gather_im = VGET_VX2(gather_seg, 1);
+
+            /* Mask is set in lanes where lane index + shift is positive. */
+            shifted    = VADD_VX_INT(iota, shift, vl);
+            use_gather = VMSGT_VX_INT(shifted, 0, vl);
+            pick_re = VMERGE_VVM_FLOAT(linear_re, gather_re, use_gather, vl);
+            pick_im = VMERGE_VVM_FLOAT(linear_im, gather_im, use_gather, vl);
+
+            pick_seg = VSET_VX2(pick_seg, 0, pick_re);
+            pick_seg = VSET_VX2(pick_seg, 1, pick_im);
+            VSSEG2_FLOAT(b, pick_seg, vl);
+            b          += vl * 2;
+            gather_ptr += 2;
+            linear_ptr += lda * 2;
+            shift--;
+        }
+        remaining -= vl;
+        posX      += vl;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/zsymv_L_rvv.c b/kernel/riscv64/zsymv_L_rvv.c
new file mode 100644
index 000000000..cefdea7f6
--- /dev/null
+++ b/kernel/riscv64/zsymv_L_rvv.c
@@ -0,0 +1,179 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#if !defined(DOUBLE)
+#define VSETVL(n) __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e32m1()
+#define FLOAT_V_T vfloat32m4_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT __riscv_vle32_v_f32m4
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m4
+#define VSEV_FLOAT __riscv_vse32_v_f32m4
+#define VSSEV_FLOAT __riscv_vsse32_v_f32m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4
+#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4
+#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu
+#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4
+#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32
+#define VFNEGV_FLOAT __riscv_vfneg_v_f32mf4
+#else
+#define VSETVL(n) __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e64m1()
+#define FLOAT_V_T vfloat64m4_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT __riscv_vle64_v_f64m4
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m4
+#define VSEV_FLOAT __riscv_vse64_v_f64m4
+#define VSSEV_FLOAT __riscv_vsse64_v_f64m4
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4
+#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4
+#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu
+#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4
+#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64
+#define VFNEGV_FLOAT __riscv_vfneg_v_f64mf4
+#endif
+/* For each column j in [0, offset): adds alpha*x[j]*a(i,j) to y[i] for rows i = j..m-1, and adds alpha * sum over i > j of a(i,j)*x[i] to y[j]. All values are complex {re, im}. Returns 0. */
+int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i,
+	  FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) /* buffer is not used in this function */
+{
+        BLASLONG i, j, k;
+        BLASLONG ix,iy;                 /* complex-element indices into x / y for the rows below the diagonal */
+        BLASLONG jx,jy;                 /* complex-element indices into x / y for column j */
+        FLOAT temp1[2];                 /* alpha * x[j] as {re, im} */
+        FLOAT temp2[2];                 /* sum over i > j of a(i,j) * x[i] as {re, im} */
+        FLOAT *a_ptr = a;               /* start of column j; advanced by 2 * lda FLOATs per column */
+        BLASLONG gvl = VSETVL_MAX;
+        FLOAT_V_T_M1 v_res, v_z0;
+        v_res = VFMVVF_FLOAT_M1(0, gvl);
+        v_z0 = VFMVVF_FLOAT_M1(0, gvl); /* zero seed passed to both reductions */
+        /* Working vectors: real / imaginary parts of a, x, y and of the running dot product (vr). */
+        FLOAT_V_T va_r, va_i, vx_r, vx_i, vy_r, vy_i, vr_r, vr_i;
+        BLASLONG stride_x, stride_y, inc_xv, inc_yv, len;
+        /* Byte distance between consecutive complex elements of x and of y. */
+        stride_x = 2 * inc_x * sizeof(FLOAT);
+        stride_y = 2 * inc_y * sizeof(FLOAT);
+        jx = 0;
+        jy = 0;
+        for (j=0; j<offset; j++)
+        {
+                temp1[0] = alpha_r * x[2 * jx] - alpha_i * x[2 * jx + 1];
+                temp1[1] = alpha_r * x[2 * jx + 1] + alpha_i * x[2 * jx];
+                temp2[0] = 0;
+                temp2[1] = 0;
+                /* Diagonal entry of column j: y[j] += temp1 * a(j,j). */
+		y[2 * jy] += temp1[0] * a_ptr[j * 2] - temp1[1] * a_ptr[j * 2 + 1];
+		y[2 * jy + 1] += temp1[1] * a_ptr[j * 2] + temp1[0] * a_ptr[j * 2 + 1];
+                /* Rows j+1 .. m-1 of column j are processed gvl elements at a time. */
+                ix = jx + inc_x;
+                iy = jy + inc_y;
+                i = j + 1;
+                len = m - i;
+                if(len > 0){
+                        gvl = VSETVL(len);
+                        inc_xv = inc_x * gvl;
+                        inc_yv = inc_y * gvl;
+                        vr_r = VFMVVF_FLOAT(0, gvl);
+                        vr_i = VFMVVF_FLOAT(0, gvl);
+                        for(k = 0; k < len / gvl; k++){ /* full chunks of gvl rows */
+                                va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl);
+                                va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl);
+
+                                vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl);
+                                vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl);
+                                /* y[i] += temp1 * a(i,j), written out as a complex multiply-accumulate. */
+                                vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl);
+                                vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl);
+                                vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl);
+                                vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl);
+
+                                VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl);
+                                VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl);
+                                /* vr += x[i] * a(i,j): per-lane partial sums that become temp2. */
+                                vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl);
+                                vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl);
+                                vr_r = VFMACCVV_FLOAT(vr_r, vx_r, va_r, gvl);
+                                vr_r = VFNMSACVV_FLOAT(vr_r, vx_i, va_i, gvl);
+                                vr_i = VFMACCVV_FLOAT(vr_i, vx_r, va_i, gvl);
+                                vr_i = VFMACCVV_FLOAT(vr_i, vx_i, va_r, gvl);
+
+                                i += gvl;
+                                ix += inc_xv;
+                                iy += inc_yv;
+                        }
+                        /* Rows left over after the full chunks (m - i of them). */
+                        if(i < m){
+                                unsigned int gvl_rem = VSETVL(m-i);
+                                vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl_rem);
+                                vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl_rem);
+                                va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl_rem);
+                                va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl_rem);
+
+                                vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl_rem);
+                                vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl_rem);
+                                vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl_rem);
+                                vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl_rem);
+
+                                VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl_rem);
+                                VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl_rem);
+                                /* The _TU forms leave accumulator lanes at and beyond gvl_rem untouched, so the gvl-wide reduction below stays valid. */
+                                vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl_rem);
+                                vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl_rem);
+                                vr_r = VFMACCVV_FLOAT_TU(vr_r, vx_r, va_r, gvl_rem);
+                                vr_r = VFNMSACVV_FLOAT_TU(vr_r, vx_i, va_i, gvl_rem);
+                                vr_i = VFMACCVV_FLOAT_TU(vr_i, vx_r, va_i, gvl_rem);
+                                vr_i = VFMACCVV_FLOAT_TU(vr_i, vx_i, va_r, gvl_rem);
+
+                        }
+                        v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); /* sum the first gvl lanes of vr_r */
+                        temp2[0] = VFMVFS_FLOAT_M1(v_res);
+                        v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); /* sum the first gvl lanes of vr_i */
+                        temp2[1] = VFMVFS_FLOAT_M1(v_res);
+                }
+                y[2 * jy] += alpha_r * temp2[0] - alpha_i * temp2[1]; /* y[j] += alpha * temp2 */
+                y[2 * jy + 1] += alpha_r * temp2[1] + alpha_i * temp2[0];
+
+                jx    += inc_x;
+                jy    += inc_y;
+                a_ptr += 2 * lda;
+        }
+
+        return(0);
+}
+
diff --git a/kernel/riscv64/zsymv_U_rvv.c b/kernel/riscv64/zsymv_U_rvv.c
new file mode 100644
index 000000000..67b5a649c
--- /dev/null
+++ b/kernel/riscv64/zsymv_U_rvv.c
@@ -0,0 +1,177 @@
+/***************************************************************************
+Copyright (c) 2020, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#if !defined(DOUBLE) /* DOUBLE undefined: 32-bit elements; LMUL=4 working vectors, LMUL=1 reduction scalars */
+#define VSETVL(n) __riscv_vsetvl_e32m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e32m1() /* m1 length: used only to initialise the FLOAT_V_T_M1 values */
+#define FLOAT_V_T vfloat32m4_t
+#define FLOAT_V_T_M1 vfloat32m1_t
+#define VLEV_FLOAT __riscv_vle32_v_f32m4
+#define VLSEV_FLOAT __riscv_vlse32_v_f32m4 /* byte-strided load */
+#define VSEV_FLOAT __riscv_vse32_v_f32m4
+#define VSSEV_FLOAT __riscv_vsse32_v_f32m4 /* byte-strided store */
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f32m4_f32m1 /* unordered sum of an m4 vector into element 0 of an m1 */
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m4 /* vd += vs1 * vs2 */
+#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m4 /* vd -= vs1 * vs2 */
+#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f32m4_tu /* tail-undisturbed form */
+#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f32m4_tu /* tail-undisturbed form */
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m4 /* vd += scalar * vs2 */
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f32m4 /* vd -= scalar * vs2 */
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m4 /* broadcast a scalar */
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f32m4
+#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32 /* read element 0 as a scalar */
+#else /* DOUBLE defined: 64-bit elements; LMUL=4 working vectors, LMUL=1 reduction scalars */
+#define VSETVL(n) __riscv_vsetvl_e64m4(n)
+#define VSETVL_MAX __riscv_vsetvlmax_e64m1() /* m1 length: used only to initialise the FLOAT_V_T_M1 values */
+#define FLOAT_V_T vfloat64m4_t
+#define FLOAT_V_T_M1 vfloat64m1_t
+#define VLEV_FLOAT __riscv_vle64_v_f64m4
+#define VLSEV_FLOAT __riscv_vlse64_v_f64m4 /* byte-strided load */
+#define VSEV_FLOAT __riscv_vse64_v_f64m4
+#define VSSEV_FLOAT __riscv_vsse64_v_f64m4 /* byte-strided store */
+#define VFREDSUM_FLOAT __riscv_vfredusum_vs_f64m4_f64m1 /* unordered sum of an m4 vector into element 0 of an m1 */
+#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m4 /* vd += vs1 * vs2 */
+#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m4 /* vd -= vs1 * vs2 */
+#define VFMACCVV_FLOAT_TU __riscv_vfmacc_vv_f64m4_tu /* tail-undisturbed form */
+#define VFNMSACVV_FLOAT_TU __riscv_vfnmsac_vv_f64m4_tu /* tail-undisturbed form */
+#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m4 /* vd += scalar * vs2 */
+#define VFNMSACVF_FLOAT __riscv_vfnmsac_vf_f64m4 /* vd -= scalar * vs2 */
+#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m4 /* broadcast a scalar */
+#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
+#define VFMULVV_FLOAT __riscv_vfmul_vv_f64m4
+#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64 /* read element 0 as a scalar */
+#endif /* !defined(DOUBLE) */
+/* For each column j in [m - offset, m): adds alpha*x[j]*a(i,j) to y[i] for rows i = 0..j, and adds alpha * sum over i < j of a(i,j)*x[i] to y[j]. All values are complex {re, im}. Returns 0. */
+int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i,
+	  FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) /* buffer is not used in this function */
+{
+        BLASLONG i, j, k;
+        BLASLONG ix,iy;                 /* complex-element indices into x / y for the rows above the diagonal */
+        BLASLONG jx,jy;                 /* complex-element indices into x / y for column j */
+        FLOAT temp1[2];                 /* alpha * x[j] as {re, im} */
+        FLOAT temp2[2];                 /* sum over i < j of a(i,j) * x[i] as {re, im} */
+        FLOAT *a_ptr = a;               /* start of column j; advanced by 2 * lda FLOATs per column */
+        BLASLONG gvl = VSETVL_MAX;
+        FLOAT_V_T_M1 v_res, v_z0;
+        v_res = VFMVVF_FLOAT_M1(0, gvl);
+        v_z0 = VFMVVF_FLOAT_M1(0, gvl); /* zero seed passed to both reductions */
+        /* Working vectors: real / imaginary parts of a, x, y and of the running dot product (vr). */
+
+        FLOAT_V_T va_r, va_i, vx_r, vx_i, vy_r, vy_i, vr_r, vr_i;
+        BLASLONG stride_x, stride_y, inc_xv, inc_yv;
+        /* Start at column m1 = m - offset; x, y and a are all positioned on that column. */
+        BLASLONG m1 = m - offset;
+        jx = m1 * inc_x;
+        jy = m1 * inc_y;
+        a_ptr += 2 * m1 * lda;          /* a column is 2 * lda FLOATs (re, im), same step as a_ptr += 2 * lda below */
+        stride_x = 2 * inc_x * sizeof(FLOAT); /* byte distance between consecutive complex elements of x */
+        stride_y = 2 * inc_y * sizeof(FLOAT); /* byte distance between consecutive complex elements of y */
+        for (j=m1; j<m; j++)
+        {
+                temp1[0] = alpha_r * x[2 * jx] - alpha_i * x[2 * jx + 1];
+                temp1[1] = alpha_r * x[2 * jx + 1] + alpha_i * x[2 * jx];
+                temp2[0] = 0;
+                temp2[1] = 0;
+                if(j > 0){              /* rows 0 .. j-1 of column j, gvl elements at a time */
+                         ix = 0;
+                         iy = 0;
+                         i = 0;
+                         gvl = VSETVL(j);
+                         inc_xv = inc_x * gvl;
+                         inc_yv = inc_y * gvl;
+                         vr_r = VFMVVF_FLOAT(0, gvl);
+                         vr_i = VFMVVF_FLOAT(0, gvl);
+                         for(k = 0; k < j / gvl; k++){ /* full chunks of gvl rows */
+                                 va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl);
+                                 va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl);
+
+                                 vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl);
+                                 vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl);
+                                 /* y[i] += temp1 * a(i,j), written out as a complex multiply-accumulate. */
+                                 vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl);
+                                 vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl);
+                                 vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl);
+                                 vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl);
+
+                                 VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl);
+                                 VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl);
+                                 /* vr += x[i] * a(i,j): per-lane partial sums that become temp2. */
+                                 vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl);
+                                 vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl);
+                                 vr_r = VFMACCVV_FLOAT(vr_r, vx_r, va_r, gvl);
+                                 vr_r = VFNMSACVV_FLOAT(vr_r, vx_i, va_i, gvl);
+                                 vr_i = VFMACCVV_FLOAT(vr_i, vx_r, va_i, gvl);
+                                 vr_i = VFMACCVV_FLOAT(vr_i, vx_i, va_r, gvl);
+
+                                 i += gvl;
+                                 ix += inc_xv;
+                                 iy += inc_yv;
+                         }
+                         /* Rows left over after the full chunks (j - i of them). */
+                         if(i < j){
+                                 unsigned int gvl_rem = VSETVL(j-i);
+                                 vy_r = VLSEV_FLOAT(&y[2 * iy], stride_y, gvl_rem);
+                                 vy_i = VLSEV_FLOAT(&y[2 * iy + 1], stride_y, gvl_rem);
+
+                                 va_r = VLSEV_FLOAT(&a_ptr[2 * i], 2 * sizeof(FLOAT), gvl_rem);
+                                 va_i = VLSEV_FLOAT(&a_ptr[2 * i + 1], 2 * sizeof(FLOAT), gvl_rem);
+
+                                 vy_r = VFMACCVF_FLOAT(vy_r, temp1[0], va_r, gvl_rem);
+                                 vy_r = VFNMSACVF_FLOAT(vy_r, temp1[1], va_i, gvl_rem);
+                                 vy_i = VFMACCVF_FLOAT(vy_i, temp1[0], va_i, gvl_rem);
+                                 vy_i = VFMACCVF_FLOAT(vy_i, temp1[1], va_r, gvl_rem);
+
+                                 VSSEV_FLOAT(&y[2 * iy], stride_y, vy_r, gvl_rem);
+                                 VSSEV_FLOAT(&y[2 * iy + 1], stride_y, vy_i, gvl_rem);
+                                 /* The _TU forms leave accumulator lanes at and beyond gvl_rem untouched, so the gvl-wide reduction below stays valid. */
+                                 vx_r = VLSEV_FLOAT(&x[2 * ix], stride_x, gvl_rem);
+                                 vx_i = VLSEV_FLOAT(&x[2 * ix + 1], stride_x, gvl_rem);
+                                 vr_r = VFMACCVV_FLOAT_TU(vr_r, vx_r, va_r, gvl_rem);
+                                 vr_r = VFNMSACVV_FLOAT_TU(vr_r, vx_i, va_i, gvl_rem);
+                                 vr_i = VFMACCVV_FLOAT_TU(vr_i, vx_r, va_i, gvl_rem);
+                                 vr_i = VFMACCVV_FLOAT_TU(vr_i, vx_i, va_r, gvl_rem);
+
+                         }
+                         v_res = VFREDSUM_FLOAT(vr_r, v_z0, gvl); /* sum the first gvl lanes of vr_r */
+                         temp2[0] = VFMVFS_FLOAT_M1(v_res);
+                         v_res = VFREDSUM_FLOAT(vr_i, v_z0, gvl); /* sum the first gvl lanes of vr_i */
+                         temp2[1] = VFMVFS_FLOAT_M1(v_res);
+                }
+                /* Diagonal entry plus the reduced dot product: y[j] += temp1 * a(j,j) + alpha * temp2. */
+		 y[2 * jy] += temp1[0] * a_ptr[j * 2] - temp1[1] * a_ptr[j * 2 + 1] + alpha_r * temp2[0] - alpha_i * temp2[1];
+		 y[2 * jy + 1] += temp1[1] * a_ptr[j * 2] + temp1[0] * a_ptr[j * 2 + 1] + alpha_r * temp2[1] + alpha_i * temp2[0];
+
+                a_ptr += 2 * lda;
+                jx    += inc_x;
+                jy    += inc_y;
+        }
+
+        return(0);
+}
+
diff --git a/kernel/riscv64/ztrmm_kernel_4x4_zvl128b.c b/kernel/riscv64/ztrmm_kernel_4x4_zvl128b.c
new file mode 100644
index 000000000..d7d5e5fea
--- /dev/null
+++ b/kernel/riscv64/ztrmm_kernel_4x4_zvl128b.c
@@ -0,0 +1,805 @@
+/*
+
+AUTOGENERATED KERNEL
+Script: ./kernel/riscv64/generate_kernel.py
+Settings:
+ LMUL=2
+ M=4
+ M_tail_scalar_from=2
+ N=4
+ __riscv_='__riscv_'
+ complex=True
+ conjugate=False
+ cpu='zvl128b'
+ force_acc_double=False
+ index_type='BLASLONG'
+ op='trmm'
+ param_precision='double'
+ reg_width_bits=128
+ tail_policy=''
+ trace=False
+
+Derived:
+ ELEN_ACC=64
+ ELEN_PARAM=64
+ LMUL_ACC=2
+ VFMACC='__riscv_vfmacc_vf_f64m2'
+ VFMUL='__riscv_vfmul_vf_f64m2'
+ VLEV='__riscv_vle64_v_f64m2'
+ VLSEV='__riscv_vlse64_v_f64m2'
+ VMACC_TO_ACC='__riscv_vfmacc_vf_f64m2'
+ VMUL_TO_ACC='__riscv_vfmul_vf_f64m2'
+ VSETVL='__riscv_vsetvl_e64m2'
+ VSEV='__riscv_vse64_v_f64m2'
+ VSSEV='__riscv_vsse64_v_f64m2'
+ acc_vector_t='vfloat64m2_t'
+ output='ztrmm_kernel_4x4_zvl128b.c'
+ param_scalar_t='double'
+ param_vector_t='vfloat64m2_t'
+
+*/
+
+#include "common.h"
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT) // sign set for the plain product A * B
+#define S0 1 // sign of Ar*Br in the real part (used by the scalar tails)
+#define S1 -1 // sign of Ai*Bi in the real part
+#define S2 1 // sign of Ai*Br in the imaginary part
+#define S3 1 // sign of Ar*Bi in the imaginary part
+#define VFMACC_RR __riscv_vfmsac // vector real part: Br*Ar - acc, with acc pre-loaded with Ai*Bi
+#define VFMACC_RI __riscv_vfmacc // vector imaginary part: Br*Ai + acc, with acc pre-loaded with Ar*Bi
+#endif
+#if defined(NR) || defined(NC) || defined(TR) || defined(TC) // sign set for A * conj(B)
+#define S0 1
+#define S1 1
+#define S2 1
+#define S3 -1
+#define VFMACC_RR __riscv_vfmacc // Br*Ar + acc
+#define VFMACC_RI __riscv_vfmsac // Br*Ai - acc
+#endif
+#if defined(RN) || defined(RT) || defined(CN) || defined(CT) // sign set for conj(A) * B
+#define S0 1
+#define S1 1
+#define S2 -1
+#define S3 1
+#define VFMACC_RR __riscv_vfmacc // Br*Ar + acc
+#define VFMACC_RI __riscv_vfnmsac // -(Br*Ai) + acc
+#endif
+#if defined(RR) || defined(RC) || defined(CR) || defined(CC) // sign set for conj(A) * conj(B)
+#define S0 1
+#define S1 -1
+#define S2 -1
+#define S3 -1
+#define VFMACC_RR __riscv_vfmsac // Br*Ar - acc
+#define VFMACC_RI __riscv_vfnmacc // -(Br*Ai) - acc
+#endif
+
+#if defined(LEFT) != defined(TRANSA) // true when exactly one of LEFT / TRANSA is defined
+#define BACKWARDS // tiles then skip the first `off` k-steps instead of stopping after off + tile size
+#endif
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT *A, FLOAT *B, FLOAT *C, BLASLONG ldc, BLASLONG offset)
+// Complex TRMM micro-kernel: each of the M x N entries of C is overwritten (never read) with alpha * sum_k(A*B) over an offset-selected k-range; always returns 0.
+{
+    BLASLONG gvl = 0; // vector length in 64-bit elements, set by vsetvl before each vector section
+    BLASLONG m_top = 0; // first row of the C tile being computed
+    BLASLONG n_top = 0; // first column of the C tile being computed
+    // A, B and C hold interleaved (re, im) pairs; columns are taken 4, then 2, then 1 at a time, rows 4 (vector), then 2 and 1 (scalar).
+    // -- MAIN PASS
+
+    for (BLASLONG j = 0; j < N / 4; j += 1) {
+        m_top = 0;
+        BLASLONG gvl = __riscv_vsetvl_e64m2(4); // shadows the outer gvl; the tile code steps by 4 rows, so it relies on this returning 4
+
+        for (BLASLONG i = 0; i < M / 4; i += 1) {
+            BLASLONG ai = m_top * K * 2; // FLOAT index of this 4-row panel of A (2 FLOATs per complex value)
+            BLASLONG bi = n_top * K * 2; // FLOAT index of this 4-column panel of B
+            BLASLONG pass_K = K; // number of k-steps to accumulate, narrowed below
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS // skip the first `off` k-steps of both panels and run the remaining K - off
+            ai += off * 4 * 2;
+            bi += off * 4 * 2;
+            pass_K -= off;
+#else // otherwise run only the first off + 4 k-steps (both branches below are identical for a 4x4 tile)
+#ifdef LEFT
+            pass_K = off + 4;
+#else
+            pass_K = off + 4;
+#endif
+#endif
+            double B0r = B[bi + 0 * 2 + 0]; // k = 0 is peeled: load the 4 complex B values of the first k-step
+            double B0i = B[bi + 0 * 2 + 1];
+            double B1r = B[bi + 1 * 2 + 0];
+            double B1i = B[bi + 1 * 2 + 1];
+            double B2r = B[bi + 2 * 2 + 0];
+            double B2i = B[bi + 2 * 2 + 1];
+            double B3r = B[bi + 3 * 2 + 0];
+            double B3i = B[bi + 3 * 2 + 1];
+            bi += 4 * 2;
+
+            vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl); // stride of 2 FLOATs gathers the real parts of gvl consecutive complex values
+            vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl); // same, starting one FLOAT later: the imaginary parts
+            ai += 4 * 2;
+
+            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 6 vector registers for temporaries
+            // performing 2 operations between reuses of temporaries
+            vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl); // seed Ai*Bi; VFMACC_RR below combines it with Br*Ar
+            vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl); // seed Ar*Bi; VFMACC_RI below combines it with Br*Ai
+            vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl);
+            vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+            vfloat64m2_t ACC0r = tmp0r; // accumulators start from the k = 0 products, so no zero-initialisation is needed
+            vfloat64m2_t ACC0i = tmp0i;
+            vfloat64m2_t ACC1r = tmp1r;
+            vfloat64m2_t ACC1i = tmp1i;
+            tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl); // temporaries are reused for columns 2 and 3
+            tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl);
+            tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl);
+            tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
+            vfloat64m2_t ACC2r = tmp0r;
+            vfloat64m2_t ACC2i = tmp0i;
+            vfloat64m2_t ACC3r = tmp1r;
+            vfloat64m2_t ACC3i = tmp1i;
+
+            for (BLASLONG k = 1; k < pass_K; k++) { // k = 0 was peeled above, so A and B are read once even if pass_K were < 1
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                B1r = B[bi + 1 * 2 + 0];
+                B1i = B[bi + 1 * 2 + 1];
+                B2r = B[bi + 2 * 2 + 0];
+                B2i = B[bi + 2 * 2 + 1];
+                B3r = B[bi + 3 * 2 + 0];
+                B3i = B[bi + 3 * 2 + 1];
+                bi += 4 * 2;
+
+                A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 4 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl); // add this k-step's complex product to the running sum
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
+                tmp0r = __riscv_vfmul_vf_f64m2(A0i, B2i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m2(A0r, B2i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m2(A0i, B3i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m2(A0r, B3i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B2r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B2r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B3r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B3r, A0i, gvl);
+                ACC2r = __riscv_vfadd(ACC2r, tmp0r, gvl);
+                ACC2i = __riscv_vfadd(ACC2i, tmp0i, gvl);
+                ACC3r = __riscv_vfadd(ACC3r, tmp1r, gvl);
+                ACC3i = __riscv_vfadd(ACC3i, tmp1i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top; // index, in complex elements, of the tile's first entry in C
+
+            vfloat64m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); // C = alpha * ACC, first half: alphar * ACC
+            vfloat64m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl);
+            vfloat64m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl);
+            vfloat64m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl);
+            vfloat64m2_t C2r = __riscv_vfmul(ACC2r, alphar, gvl);
+            vfloat64m2_t C2i = __riscv_vfmul(ACC2i, alphar, gvl);
+            vfloat64m2_t C3r = __riscv_vfmul(ACC3r, alphar, gvl);
+            vfloat64m2_t C3i = __riscv_vfmul(ACC3i, alphar, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl); // real part: subtract alphai * ACCi
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl); // imaginary part: add alphai * ACCr
+            C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
+            C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);
+            C2r = __riscv_vfnmsac(C2r, alphai, ACC2i, gvl);
+            C2i = __riscv_vfmacc(C2i, alphai, ACC2r, gvl);
+            C3r = __riscv_vfnmsac(C3r, alphai, ACC3i, gvl);
+            C3i = __riscv_vfmacc(C3i, alphai, ACC3r, gvl);
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl); // strided stores re-interleave (re, im) in C
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+            ci += ldc - gvl * 0; // next column of C; the "gvl * 0" term is always zero
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C2r, gvl);
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C2i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C3r, gvl);
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C3i, gvl);
+
+            m_top += 4;
+        }
+
+        // -- tails for main pass
+
+        if (M & 2) { // two leftover rows: scalar 2x4 tile
+            double result0 = 0; // each (even, odd) result pair is the (re, im) sum of one C entry; rows vary fastest, then columns
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            double result4 = 0;
+            double result5 = 0;
+            double result6 = 0;
+            double result7 = 0;
+            double result8 = 0;
+            double result9 = 0;
+            double result10 = 0;
+            double result11 = 0;
+            double result12 = 0;
+            double result13 = 0;
+            double result14 = 0;
+            double result15 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS // skip the first `off` k-steps (A advances 2 rows per step here, B 4 columns)
+            ai += off * 2 * 2;
+            bi += off * 4 * 2;
+            pass_K -= off;
+#else // otherwise stop after off + tile rows (LEFT) or off + tile columns k-steps
+#ifdef LEFT
+            pass_K = off + 2;
+#else
+            pass_K = off + 4;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; // real part: S0*Ar*Br + S1*Ai*Bi
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; // imaginary part: S2*Ai*Br + S3*Ar*Bi
+                result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1];
+                result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1];
+                result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
+                result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
+                result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1];
+                result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1];
+                result8 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1];
+                result9 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1];
+                result10 += S0 * A[ai + 2 + 0] * B[bi + 4 + 0] + S1 * A[ai + 2 + 1] * B[bi + 4 + 1];
+                result11 += S2 * A[ai + 2 + 1] * B[bi + 4 + 0] + S3 * A[ai + 2 + 0] * B[bi + 4 + 1];
+                result12 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1];
+                result13 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1];
+                result14 += S0 * A[ai + 2 + 0] * B[bi + 6 + 0] + S1 * A[ai + 2 + 1] * B[bi + 6 + 1];
+                result15 += S2 * A[ai + 2 + 1] * B[bi + 6 + 0] + S3 * A[ai + 2 + 0] * B[bi + 6 + 1];
+                ai += 2 * 2;
+                bi += 4 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            double Cr, Ci; // scratch for alpha * result before it is stored
+            Cr = result0 * alphar; // complex multiply by alpha: Cr = re*alphar - im*alphai, Ci = im*alphar + re*alphai
+            Ci = result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr; // C index is (ci + column * ldc + row) * 2 + {0: re, 1: im}
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result2 * alphar;
+            Ci = result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 0 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 1) * 2 + 1] = Ci;
+            Cr = result4 * alphar;
+            Ci = result5 * alphar;
+            Cr -= result5 * alphai;
+            Ci += result4 * alphai;
+            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result6 * alphar;
+            Ci = result7 * alphar;
+            Cr -= result7 * alphai;
+            Ci += result6 * alphai;
+            C[(ci + 1 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 1) * 2 + 1] = Ci;
+            Cr = result8 * alphar;
+            Ci = result9 * alphar;
+            Cr -= result9 * alphai;
+            Ci += result8 * alphai;
+            C[(ci + 2 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 2 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result10 * alphar;
+            Ci = result11 * alphar;
+            Cr -= result11 * alphai;
+            Ci += result10 * alphai;
+            C[(ci + 2 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 2 * ldc + 1) * 2 + 1] = Ci;
+            Cr = result12 * alphar;
+            Ci = result13 * alphar;
+            Cr -= result13 * alphai;
+            Ci += result12 * alphai;
+            C[(ci + 3 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 3 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result14 * alphar;
+            Ci = result15 * alphar;
+            Cr -= result15 * alphai;
+            Ci += result14 * alphai;
+            C[(ci + 3 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 3 * ldc + 1) * 2 + 1] = Ci;
+            m_top += 2;
+        }
+
+        if (M & 1) { // one leftover row: scalar 1x4 tile, one (re, im) result pair per column
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            double result4 = 0;
+            double result5 = 0;
+            double result6 = 0;
+            double result7 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS // skip the first `off` k-steps (A advances 1 row per step here, B 4 columns)
+            ai += off * 1 * 2;
+            bi += off * 4 * 2;
+            pass_K -= off;
+#else // otherwise stop after off + tile rows (LEFT) or off + tile columns k-steps
+#ifdef LEFT
+            pass_K = off + 1;
+#else
+            pass_K = off + 4;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; // real part: S0*Ar*Br + S1*Ai*Bi
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; // imaginary part: S2*Ai*Br + S3*Ar*Bi
+                result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
+                result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
+                result4 += S0 * A[ai + 0 + 0] * B[bi + 4 + 0] + S1 * A[ai + 0 + 1] * B[bi + 4 + 1];
+                result5 += S2 * A[ai + 0 + 1] * B[bi + 4 + 0] + S3 * A[ai + 0 + 0] * B[bi + 4 + 1];
+                result6 += S0 * A[ai + 0 + 0] * B[bi + 6 + 0] + S1 * A[ai + 0 + 1] * B[bi + 6 + 1];
+                result7 += S2 * A[ai + 0 + 1] * B[bi + 6 + 0] + S3 * A[ai + 0 + 0] * B[bi + 6 + 1];
+                ai += 1 * 2;
+                bi += 4 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            double Cr, Ci;
+            Cr = result0 * alphar; // complex multiply by alpha, then store; repeated for each of the 4 columns
+            Ci = result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result2 * alphar;
+            Ci = result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result4 * alphar;
+            Ci = result5 * alphar;
+            Cr -= result5 * alphai;
+            Ci += result4 * alphai;
+            C[(ci + 2 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 2 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result6 * alphar;
+            Ci = result7 * alphar;
+            Cr -= result7 * alphai;
+            Ci += result6 * alphai;
+            C[(ci + 3 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 3 * ldc + 0) * 2 + 1] = Ci;
+            m_top += 1;
+        }
+
+        n_top += 4;
+    }
+
+    // -- tails for N=2
+
+    if (N & 2) { // two leftover columns: same scheme as the main pass with 2 B values per k-step
+        gvl = __riscv_vsetvl_e64m2(4);
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 4; i += 1) {
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS // skip the first `off` k-steps (A advances 4 rows per step here, B 2 columns)
+            ai += off * 4 * 2;
+            bi += off * 2 * 2;
+            pass_K -= off;
+#else // otherwise stop after off + tile rows (LEFT) or off + tile columns k-steps
+#ifdef LEFT
+            pass_K = off + 4;
+#else
+            pass_K = off + 2;
+#endif
+#endif
+            double B0r = B[bi + 0 * 2 + 0]; // k = 0 is peeled, as in the main pass
+            double B0i = B[bi + 0 * 2 + 1];
+            double B1r = B[bi + 1 * 2 + 0];
+            double B1i = B[bi + 1 * 2 + 1];
+            bi += 2 * 2;
+
+            vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+            vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ai += 4 * 2;
+
+            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 10 vector registers for temporaries
+            vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl);
+            vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl);
+            vfloat64m2_t tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl);
+            vfloat64m2_t tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+            vfloat64m2_t ACC0r = tmp0r;
+            vfloat64m2_t ACC0i = tmp0i;
+            vfloat64m2_t ACC1r = tmp1r;
+            vfloat64m2_t ACC1i = tmp1i;
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                B1r = B[bi + 1 * 2 + 0];
+                B1i = B[bi + 1 * 2 + 1];
+                bi += 2 * 2;
+
+                A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 4 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m2(A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m2(A0r, B1i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR(tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI(tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd(ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd(ACC1i, tmp1i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat64m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); // C = alpha * ACC, then strided stores, as in the main pass
+            vfloat64m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl);
+            vfloat64m2_t C1r = __riscv_vfmul(ACC1r, alphar, gvl);
+            vfloat64m2_t C1i = __riscv_vfmul(ACC1i, alphar, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
+            C1r = __riscv_vfnmsac(C1r, alphai, ACC1i, gvl);
+            C1i = __riscv_vfmacc(C1i, alphai, ACC1r, gvl);
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+            ci += ldc - gvl * 0;
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C1r, gvl);
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C1i, gvl);
+
+            m_top += 4;
+        }
+
+        if (M & 2) { // scalar 2x2 tile
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            double result4 = 0;
+            double result5 = 0;
+            double result6 = 0;
+            double result7 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 2 * 2;
+            bi += off * 2 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 2;
+#else
+            pass_K = off + 2;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1]; // real part: S0*Ar*Br + S1*Ai*Bi
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1]; // imaginary part: S2*Ai*Br + S3*Ar*Bi
+                result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1];
+                result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1];
+                result4 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
+                result5 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
+                result6 += S0 * A[ai + 2 + 0] * B[bi + 2 + 0] + S1 * A[ai + 2 + 1] * B[bi + 2 + 1];
+                result7 += S2 * A[ai + 2 + 1] * B[bi + 2 + 0] + S3 * A[ai + 2 + 0] * B[bi + 2 + 1];
+                ai += 2 * 2;
+                bi += 2 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            double Cr, Ci;
+            Cr = result0 * alphar;
+            Ci = result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result2 * alphar;
+            Ci = result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 0 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 1) * 2 + 1] = Ci;
+            Cr = result4 * alphar;
+            Ci = result5 * alphar;
+            Cr -= result5 * alphai;
+            Ci += result4 * alphai;
+            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result6 * alphar;
+            Ci = result7 * alphar;
+            Cr -= result7 * alphai;
+            Ci += result6 * alphai;
+            C[(ci + 1 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 1) * 2 + 1] = Ci;
+            m_top += 2;
+        }
+
+        if (M & 1) { // scalar 1x2 tile
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 1 * 2;
+            bi += off * 2 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 1;
+#else
+            pass_K = off + 2;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 0 + 0] * B[bi + 2 + 0] + S1 * A[ai + 0 + 1] * B[bi + 2 + 1];
+                result3 += S2 * A[ai + 0 + 1] * B[bi + 2 + 0] + S3 * A[ai + 0 + 0] * B[bi + 2 + 1];
+                ai += 1 * 2;
+                bi += 2 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            double Cr, Ci;
+            Cr = result0 * alphar;
+            Ci = result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result2 * alphar;
+            Ci = result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 1 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 1 * ldc + 0) * 2 + 1] = Ci;
+            m_top += 1;
+        }
+
+        n_top += 2;
+    }
+
+    // -- tails for N=1
+
+    if (N & 1) { // one leftover column: same scheme with a single B value per k-step
+        gvl = __riscv_vsetvl_e64m2(4);
+        m_top = 0;
+
+        for (BLASLONG i = 0; i < M / 4; i += 1) {
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS // skip the first `off` k-steps (A advances 4 rows per step here, B 1 column)
+            ai += off * 4 * 2;
+            bi += off * 1 * 2;
+            pass_K -= off;
+#else // otherwise stop after off + tile rows (LEFT) or off + tile columns k-steps
+#ifdef LEFT
+            pass_K = off + 4;
+#else
+            pass_K = off + 1;
+#endif
+#endif
+            double B0r = B[bi + 0 * 2 + 0]; // k = 0 is peeled, as in the main pass
+            double B0i = B[bi + 0 * 2 + 1];
+            bi += 1 * 2;
+
+            vfloat64m2_t A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+            vfloat64m2_t A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+            ai += 4 * 2;
+
+            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
+            // leaving 12 vector registers for temporaries
+            vfloat64m2_t tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl);
+            vfloat64m2_t tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl);
+            tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+            vfloat64m2_t ACC0r = tmp0r;
+            vfloat64m2_t ACC0i = tmp0i;
+
+            for (BLASLONG k = 1; k < pass_K; k++) {
+                B0r = B[bi + 0 * 2 + 0];
+                B0i = B[bi + 0 * 2 + 1];
+                bi += 1 * 2;
+
+                A0r = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2], sizeof(FLOAT) * 2, gvl);
+                A0i = __riscv_vlse64_v_f64m2(&A[ai + 0 * gvl * 2 + 1], sizeof(FLOAT) * 2, gvl);
+                ai += 4 * 2;
+
+                tmp0r = __riscv_vfmul_vf_f64m2(A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m2(A0r, B0i, gvl);
+                tmp0r = VFMACC_RR(tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI(tmp0i, B0r, A0i, gvl);
+                ACC0r = __riscv_vfadd(ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd(ACC0i, tmp0i, gvl);
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+
+            vfloat64m2_t C0r = __riscv_vfmul(ACC0r, alphar, gvl); // C = alpha * ACC, then strided stores, as in the main pass
+            vfloat64m2_t C0i = __riscv_vfmul(ACC0i, alphar, gvl);
+            C0r = __riscv_vfnmsac(C0r, alphai, ACC0i, gvl);
+            C0i = __riscv_vfmacc(C0i, alphai, ACC0r, gvl);
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 0], sizeof(FLOAT) * 2, C0r, gvl);
+            __riscv_vsse64_v_f64m2(&C[ci * 2 + 1], sizeof(FLOAT) * 2, C0i, gvl);
+
+            m_top += 4;
+        }
+
+        if (M & 2) { // scalar 2x1 tile
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 2 * 2;
+            bi += off * 1 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 2;
+#else
+            pass_K = off + 1;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                result2 += S0 * A[ai + 2 + 0] * B[bi + 0 + 0] + S1 * A[ai + 2 + 1] * B[bi + 0 + 1];
+                result3 += S2 * A[ai + 2 + 1] * B[bi + 0 + 0] + S3 * A[ai + 2 + 0] * B[bi + 0 + 1];
+                ai += 2 * 2;
+                bi += 1 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            double Cr, Ci;
+            Cr = result0 * alphar;
+            Ci = result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            Cr = result2 * alphar;
+            Ci = result3 * alphar;
+            Cr -= result3 * alphai;
+            Ci += result2 * alphai;
+            C[(ci + 0 * ldc + 1) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 1) * 2 + 1] = Ci;
+            m_top += 2;
+        }
+
+        if (M & 1) { // scalar 1x1 tile
+            double result0 = 0;
+            double result1 = 0;
+            BLASLONG ai = m_top * K * 2;
+            BLASLONG bi = n_top * K * 2;
+            BLASLONG pass_K = K;
+#ifdef LEFT
+            BLASLONG off = offset + m_top;
+#else
+            BLASLONG off = -offset + n_top;
+#endif
+#ifdef BACKWARDS
+            ai += off * 1 * 2;
+            bi += off * 1 * 2;
+            pass_K -= off;
+#else
+#ifdef LEFT
+            pass_K = off + 1;
+#else
+            pass_K = off + 1;
+#endif
+#endif
+
+            for (BLASLONG k = 0; k < pass_K; k++) {
+                result0 += S0 * A[ai + 0 + 0] * B[bi + 0 + 0] + S1 * A[ai + 0 + 1] * B[bi + 0 + 1];
+                result1 += S2 * A[ai + 0 + 1] * B[bi + 0 + 0] + S3 * A[ai + 0 + 0] * B[bi + 0 + 1];
+                ai += 1 * 2;
+                bi += 1 * 2;
+            }
+
+            BLASLONG ci = n_top * ldc + m_top;
+            double Cr, Ci;
+            Cr = result0 * alphar;
+            Ci = result1 * alphar;
+            Cr -= result1 * alphai;
+            Ci += result0 * alphai;
+            C[(ci + 0 * ldc + 0) * 2 + 0] = Cr;
+            C[(ci + 0 * ldc + 0) * 2 + 1] = Ci;
+            m_top += 1;
+        }
+
+        n_top += 1;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/ztrmm_kernel_8x4_zvl256b.c b/kernel/riscv64/ztrmm_kernel_8x4_zvl256b.c
new file mode 100644
index 000000000..de7622b89
--- /dev/null
+++ b/kernel/riscv64/ztrmm_kernel_8x4_zvl256b.c
@@ -0,0 +1,1337 @@
+/*
+
+AUTOGENERATED KERNEL
+Settings:
+ LMUL=1
+ M=8
+ M_tail_scalar_from=1
+ N=4
+ __riscv_='__riscv_'
+ complex=True
+ conjugate=False
+ cpu='zvl256b'
+ force_acc_double=False
+ index_type='BLASLONG'
+ op='trmm'
+ param_precision='double'
+ reg_width_bits=256
+ tail_policy=''
+ trace=False
+
+Derived:
+ ELEN_ACC=64
+ ELEN_PARAM=64
+ LMUL_ACC=1
+ VFMACC='__riscv_vfmacc_vf_f64m1'
+ VFMUL='__riscv_vfmul_vf_f64m1'
+ VLEV='__riscv_vle64_v_f64m1'
+ VLSEV='__riscv_vlse64_v_f64m1'
+ VMACC_TO_ACC='__riscv_vfmacc_vf_f64m1'
+ VMUL_TO_ACC='__riscv_vfmul_vf_f64m1'
+ VSETVL='__riscv_vsetvl_e64m1'
+ VSEV='__riscv_vse64_v_f64m1'
+ VSSEV='__riscv_vsse64_v_f64m1'
+ acc_vector_t='vfloat64m1_t'
+ output='ztrmm_kernel_8x4_zvl256b.c'
+ param_scalar_t='double'
+ param_vector_t='vfloat64m1_t'
+
+*/
+
+#include "common.h"
+
+
+
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT) // real = Ar*Br - Ai*Bi, imag = Ai*Br + Ar*Bi, i.e. A*B
+    #define S0  1 // scalar tails: sign applied to the Ar*Br term (real part)
+    #define S1 -1 // scalar tails: sign applied to the Ai*Bi term (real part)
+    #define S2  1 // scalar tails: sign applied to the Ai*Br term (imag part)
+    #define S3  1 // scalar tails: sign applied to the Ar*Bi term (imag part)
+    #define VFMACC_RR __riscv_vfmsac // vector real part:  Br*Ar - (Ai*Bi), accumulator pre-loaded with Ai*Bi
+    #define VFMACC_RI __riscv_vfmacc // vector imag part:  Br*Ai + (Ar*Bi), accumulator pre-loaded with Ar*Bi
+#endif
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC) // real = Ar*Br + Ai*Bi, imag = Ai*Br - Ar*Bi, i.e. A*conj(B)
+    #define S0  1 // sign of Ar*Br
+    #define S1  1 // sign of Ai*Bi
+    #define S2  1 // sign of Ai*Br
+    #define S3 -1 // sign of Ar*Bi
+    #define VFMACC_RR __riscv_vfmacc // vector real part:  Br*Ar + (Ai*Bi)
+    #define VFMACC_RI __riscv_vfmsac // vector imag part:  Br*Ai - (Ar*Bi)
+#endif
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT) // real = Ar*Br + Ai*Bi, imag = Ar*Bi - Ai*Br, i.e. conj(A)*B
+    #define S0  1 // sign of Ar*Br
+    #define S1  1 // sign of Ai*Bi
+    #define S2 -1 // sign of Ai*Br
+    #define S3  1 // sign of Ar*Bi
+    #define VFMACC_RR __riscv_vfmacc // vector real part:  Br*Ar + (Ai*Bi)
+    #define VFMACC_RI __riscv_vfnmsac // vector imag part: -(Br*Ai) + (Ar*Bi)
+#endif
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC) // real = Ar*Br - Ai*Bi, imag = -(Ai*Br + Ar*Bi), i.e. conj(A)*conj(B)
+    #define S0  1 // sign of Ar*Br
+    #define S1 -1 // sign of Ai*Bi
+    #define S2 -1 // sign of Ai*Br
+    #define S3 -1 // sign of Ar*Bi
+    #define VFMACC_RR __riscv_vfmsac // vector real part:  Br*Ar - (Ai*Bi)
+    #define VFMACC_RI __riscv_vfnmacc // vector imag part: -(Br*Ai) - (Ar*Bi)
+#endif
+
+
+#if defined(LEFT) != defined(TRANSA) // holds when exactly one of LEFT / TRANSA is defined
+    #define BACKWARDS // kernel then advances A/B by `off` k-steps and runs K - off steps, instead of stopping after off + tile-size steps
+#endif
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alphar, FLOAT alphai, FLOAT* A, FLOAT* B, FLOAT* C, BLASLONG ldc, BLASLONG offset)
+
+{
+    BLASLONG gvl = 0;
+    BLASLONG m_top = 0;
+    BLASLONG n_top = 0;
+
+
+    // -- MAIN PASS
+
+    for (BLASLONG j=0; j<N/4; j+=1) {
+        m_top = 0;
+        BLASLONG gvl = __riscv_vsetvl_e64m1(4);
+
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*8*2;
+                bi += off*4*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 8;
+                #else
+                    pass_K = off + 4;
+                #endif
+            #endif
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            double B1r = B[bi+1*2+0];
+            double B1i = B[bi+1*2+1];
+            double B2r = B[bi+2*2+0];
+            double B2i = B[bi+2*2+1];
+            double B3r = B[bi+3*2+0];
+            double B3i = B[bi+3*2+1];
+            bi += 4*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 8*2;
+
+            // 4 vector regs to hold A array contents, 16 regs to hold values accumulated over k
+            // leaving 12 vector registers for temporaries
+            // performing 4 operations between reuses of temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
+            vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
+            vfloat64m1_t tmp2r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+            vfloat64m1_t tmp2i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+            vfloat64m1_t tmp3r = __riscv_vfmul_vf_f64m1( A1i, B1i, gvl);
+            vfloat64m1_t tmp3i = __riscv_vfmul_vf_f64m1( A1r, B1i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B1r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B1r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B1r, A1r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B1r, A1i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+            vfloat64m1_t ACC1r = tmp1r;
+            vfloat64m1_t ACC1i = tmp1i;
+            vfloat64m1_t ACC2r = tmp2r;
+            vfloat64m1_t ACC2i = tmp2i;
+            vfloat64m1_t ACC3r = tmp3r;
+            vfloat64m1_t ACC3i = tmp3i;
+            tmp0r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
+            tmp0i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
+            tmp1r = __riscv_vfmul_vf_f64m1( A1i, B2i, gvl);
+            tmp1i = __riscv_vfmul_vf_f64m1( A1r, B2i, gvl);
+            tmp2r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
+            tmp2i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
+            tmp3r = __riscv_vfmul_vf_f64m1( A1i, B3i, gvl);
+            tmp3i = __riscv_vfmul_vf_f64m1( A1r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B2r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B2r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B2r, A1r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B2r, A1i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B3r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B3r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A1r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A1i, gvl);
+            vfloat64m1_t ACC4r = tmp0r;
+            vfloat64m1_t ACC4i = tmp0i;
+            vfloat64m1_t ACC5r = tmp1r;
+            vfloat64m1_t ACC5i = tmp1i;
+            vfloat64m1_t ACC6r = tmp2r;
+            vfloat64m1_t ACC6i = tmp2i;
+            vfloat64m1_t ACC7r = tmp3r;
+            vfloat64m1_t ACC7i = tmp3i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                bi += 4*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
+                A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 8*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
+                tmp2r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+                tmp2i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+                tmp3r = __riscv_vfmul_vf_f64m1( A1i, B1i, gvl);
+                tmp3i = __riscv_vfmul_vf_f64m1( A1r, B1i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B1r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B1r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B1r, A1r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B1r, A1i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m1( A1i, B2i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m1( A1r, B2i, gvl);
+                tmp2r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
+                tmp2i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
+                tmp3r = __riscv_vfmul_vf_f64m1( A1i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f64m1( A1r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B2r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B2r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B2r, A1r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B2r, A1i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B3r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B3r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A1r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A1i, gvl);
+                ACC4r = __riscv_vfadd( ACC4r, tmp0r, gvl);
+                ACC4i = __riscv_vfadd( ACC4i, tmp0i, gvl);
+                ACC5r = __riscv_vfadd( ACC5r, tmp1r, gvl);
+                ACC5i = __riscv_vfadd( ACC5i, tmp1i, gvl);
+                ACC6r = __riscv_vfadd( ACC6r, tmp2r, gvl);
+                ACC6i = __riscv_vfadd( ACC6i, tmp2i, gvl);
+                ACC7r = __riscv_vfadd( ACC7r, tmp3r, gvl);
+                ACC7i = __riscv_vfadd( ACC7i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat64m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            vfloat64m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
+            vfloat64m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
+            vfloat64m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
+            vfloat64m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
+            vfloat64m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
+            vfloat64m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
+            vfloat64m1_t C4r = __riscv_vfmul( ACC4r, alphar, gvl );
+            vfloat64m1_t C4i = __riscv_vfmul( ACC4i, alphar, gvl );
+            vfloat64m1_t C5r = __riscv_vfmul( ACC5r, alphar, gvl );
+            vfloat64m1_t C5i = __riscv_vfmul( ACC5i, alphar, gvl );
+            vfloat64m1_t C6r = __riscv_vfmul( ACC6r, alphar, gvl );
+            vfloat64m1_t C6i = __riscv_vfmul( ACC6i, alphar, gvl );
+            vfloat64m1_t C7r = __riscv_vfmul( ACC7r, alphar, gvl );
+            vfloat64m1_t C7i = __riscv_vfmul( ACC7i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+            C4r = __riscv_vfnmsac( C4r, alphai, ACC4i, gvl );
+            C4i = __riscv_vfmacc ( C4i, alphai, ACC4r, gvl );
+            C5r = __riscv_vfnmsac( C5r, alphai, ACC5i, gvl );
+            C5i = __riscv_vfmacc ( C5i, alphai, ACC5r, gvl );
+            C6r = __riscv_vfnmsac( C6r, alphai, ACC6i, gvl );
+            C6i = __riscv_vfmacc ( C6i, alphai, ACC6r, gvl );
+            C7r = __riscv_vfnmsac( C7r, alphai, ACC7i, gvl );
+            C7i = __riscv_vfmacc ( C7i, alphai, ACC7r, gvl );
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+             ci += gvl;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*1;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+             ci += gvl;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            ci += ldc-gvl*1;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C4r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C4i, gvl);
+             ci += gvl;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C5r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C5i, gvl);
+            ci += ldc-gvl*1;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C6r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C6i, gvl);
+             ci += gvl;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C7r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C7i, gvl);
+            
+            m_top += 8;
+        }
+
+
+
+        // -- tails for main pass
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e64m1(4);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*4*2;
+                bi += off*4*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 4;
+                #else
+                    pass_K = off + 4;
+                #endif
+            #endif
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            double B1r = B[bi+1*2+0];
+            double B1i = B[bi+1*2+1];
+            double B2r = B[bi+2*2+0];
+            double B2i = B[bi+2*2+1];
+            double B3r = B[bi+3*2+0];
+            double B3i = B[bi+3*2+1];
+            bi += 4*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 4*2;
+
+            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 22 vector registers for temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+            vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+            vfloat64m1_t tmp2r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
+            vfloat64m1_t tmp2i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
+            vfloat64m1_t tmp3r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
+            vfloat64m1_t tmp3i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+            vfloat64m1_t ACC1r = tmp1r;
+            vfloat64m1_t ACC1i = tmp1i;
+            vfloat64m1_t ACC2r = tmp2r;
+            vfloat64m1_t ACC2i = tmp2i;
+            vfloat64m1_t ACC3r = tmp3r;
+            vfloat64m1_t ACC3i = tmp3i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                bi += 4*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 4*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+                tmp2r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
+                tmp2i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
+                tmp3r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat64m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            vfloat64m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
+            vfloat64m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
+            vfloat64m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
+            vfloat64m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
+            vfloat64m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
+            vfloat64m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            gvl = __riscv_vsetvl_e64m1(2);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*2*2;
+                bi += off*4*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 2;
+                #else
+                    pass_K = off + 4;
+                #endif
+            #endif
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            double B1r = B[bi+1*2+0];
+            double B1i = B[bi+1*2+1];
+            double B2r = B[bi+2*2+0];
+            double B2i = B[bi+2*2+1];
+            double B3r = B[bi+3*2+0];
+            double B3i = B[bi+3*2+1];
+            bi += 4*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 2*2;
+
+            // 2 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 22 vector registers for temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+            vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+            vfloat64m1_t tmp2r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
+            vfloat64m1_t tmp2i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
+            vfloat64m1_t tmp3r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
+            vfloat64m1_t tmp3i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+            vfloat64m1_t ACC1r = tmp1r;
+            vfloat64m1_t ACC1i = tmp1i;
+            vfloat64m1_t ACC2r = tmp2r;
+            vfloat64m1_t ACC2i = tmp2i;
+            vfloat64m1_t ACC3r = tmp3r;
+            vfloat64m1_t ACC3i = tmp3i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                B2r = B[bi+2*2+0];
+                B2i = B[bi+2*2+1];
+                B3r = B[bi+3*2+0];
+                B3i = B[bi+3*2+1];
+                bi += 4*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 2*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+                tmp2r = __riscv_vfmul_vf_f64m1( A0i, B2i, gvl);
+                tmp2i = __riscv_vfmul_vf_f64m1( A0r, B2i, gvl);
+                tmp3r = __riscv_vfmul_vf_f64m1( A0i, B3i, gvl);
+                tmp3i = __riscv_vfmul_vf_f64m1( A0r, B3i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B2r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B2r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B3r, A0r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B3r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat64m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            vfloat64m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
+            vfloat64m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
+            vfloat64m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
+            vfloat64m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
+            vfloat64m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
+            vfloat64m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            
+            m_top += 2;
+        }
+
+
+        if( M & 1 ) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            double result4 = 0;
+            double result5 = 0;
+            double result6 = 0;
+            double result7 = 0;
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*1*2;
+                bi += off*4*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 1;
+                #else
+                    pass_K = off + 4;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
+                result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
+                result2+=S0*A[ai+0+0]*B[bi+2+0] + S1*A[ai+0+1]*B[bi+2+1];
+                result3+=S2*A[ai+0+1]*B[bi+2+0] + S3*A[ai+0+0]*B[bi+2+1];
+                result4+=S0*A[ai+0+0]*B[bi+4+0] + S1*A[ai+0+1]*B[bi+4+1];
+                result5+=S2*A[ai+0+1]*B[bi+4+0] + S3*A[ai+0+0]*B[bi+4+1];
+                result6+=S0*A[ai+0+0]*B[bi+6+0] + S1*A[ai+0+1]*B[bi+6+1];
+                result7+=S2*A[ai+0+1]*B[bi+6+0] + S3*A[ai+0+0]*B[bi+6+1];
+                ai+=1*2;
+                bi+=4*2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            double Cr, Ci;
+            Cr = result0*alphar;
+            Ci = result1*alphar;
+            Cr -= result1*alphai;
+            Ci += result0*alphai;
+            C[(ci+0*ldc+0)*2+0] = Cr;
+            C[(ci+0*ldc+0)*2+1] = Ci;
+            Cr = result2*alphar;
+            Ci = result3*alphar;
+            Cr -= result3*alphai;
+            Ci += result2*alphai;
+            C[(ci+1*ldc+0)*2+0] = Cr;
+            C[(ci+1*ldc+0)*2+1] = Ci;
+            Cr = result4*alphar;
+            Ci = result5*alphar;
+            Cr -= result5*alphai;
+            Ci += result4*alphai;
+            C[(ci+2*ldc+0)*2+0] = Cr;
+            C[(ci+2*ldc+0)*2+1] = Ci;
+            Cr = result6*alphar;
+            Ci = result7*alphar;
+            Cr -= result7*alphai;
+            Ci += result6*alphai;
+            C[(ci+3*ldc+0)*2+0] = Cr;
+            C[(ci+3*ldc+0)*2+1] = Ci;
+            m_top+=1;
+        }
+
+        n_top += 4;
+    }
+
+
+
+    // -- tails for N=2
+
+    if( N & 2 ) {
+        gvl = __riscv_vsetvl_e64m1(4);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*8*2;
+                bi += off*2*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 8;
+                #else
+                    pass_K = off + 2;
+                #endif
+            #endif
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            double B1r = B[bi+1*2+0];
+            double B1i = B[bi+1*2+1];
+            bi += 2*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 8*2;
+
+            // 4 vector regs to hold A array contents, 8 regs to hold values accumulated over k
+            // leaving 20 vector registers for temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
+            vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
+            vfloat64m1_t tmp2r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+            vfloat64m1_t tmp2i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+            vfloat64m1_t tmp3r = __riscv_vfmul_vf_f64m1( A1i, B1i, gvl);
+            vfloat64m1_t tmp3i = __riscv_vfmul_vf_f64m1( A1r, B1i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
+            tmp2r = VFMACC_RR( tmp2r, B1r, A0r, gvl);
+            tmp2i = VFMACC_RI( tmp2i, B1r, A0i, gvl);
+            tmp3r = VFMACC_RR( tmp3r, B1r, A1r, gvl);
+            tmp3i = VFMACC_RI( tmp3i, B1r, A1i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+            vfloat64m1_t ACC1r = tmp1r;
+            vfloat64m1_t ACC1i = tmp1i;
+            vfloat64m1_t ACC2r = tmp2r;
+            vfloat64m1_t ACC2i = tmp2i;
+            vfloat64m1_t ACC3r = tmp3r;
+            vfloat64m1_t ACC3i = tmp3i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                bi += 2*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
+                A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 8*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
+                tmp2r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+                tmp2i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+                tmp3r = __riscv_vfmul_vf_f64m1( A1i, B1i, gvl);
+                tmp3i = __riscv_vfmul_vf_f64m1( A1r, B1i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
+                tmp2r = VFMACC_RR( tmp2r, B1r, A0r, gvl);
+                tmp2i = VFMACC_RI( tmp2i, B1r, A0i, gvl);
+                tmp3r = VFMACC_RR( tmp3r, B1r, A1r, gvl);
+                tmp3i = VFMACC_RI( tmp3i, B1r, A1i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+                ACC2r = __riscv_vfadd( ACC2r, tmp2r, gvl);
+                ACC2i = __riscv_vfadd( ACC2i, tmp2i, gvl);
+                ACC3r = __riscv_vfadd( ACC3r, tmp3r, gvl);
+                ACC3i = __riscv_vfadd( ACC3i, tmp3i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat64m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            vfloat64m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
+            vfloat64m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
+            vfloat64m1_t C2r = __riscv_vfmul( ACC2r, alphar, gvl );
+            vfloat64m1_t C2i = __riscv_vfmul( ACC2i, alphar, gvl );
+            vfloat64m1_t C3r = __riscv_vfmul( ACC3r, alphar, gvl );
+            vfloat64m1_t C3i = __riscv_vfmul( ACC3i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            C2r = __riscv_vfnmsac( C2r, alphai, ACC2i, gvl );
+            C2i = __riscv_vfmacc ( C2i, alphai, ACC2r, gvl );
+            C3r = __riscv_vfnmsac( C3r, alphai, ACC3i, gvl );
+            C3i = __riscv_vfmacc ( C3i, alphai, ACC3r, gvl );
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+             ci += gvl;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            ci += ldc-gvl*1;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C2r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C2i, gvl);
+             ci += gvl;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C3r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C3i, gvl);
+            
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e64m1(4);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*4*2;
+                bi += off*2*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 4;
+                #else
+                    pass_K = off + 2;
+                #endif
+            #endif
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            double B1r = B[bi+1*2+0];
+            double B1i = B[bi+1*2+1];
+            bi += 2*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 4*2;
+
+            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 26 vector registers for temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+            vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+            vfloat64m1_t ACC1r = tmp1r;
+            vfloat64m1_t ACC1i = tmp1i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                bi += 2*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 4*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat64m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            vfloat64m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
+            vfloat64m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            gvl = __riscv_vsetvl_e64m1(2);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*2*2;
+                bi += off*2*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 2;
+                #else
+                    pass_K = off + 2;
+                #endif
+            #endif
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            double B1r = B[bi+1*2+0];
+            double B1i = B[bi+1*2+1];
+            bi += 2*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 2*2;
+
+            // 2 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 26 vector registers for temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+            vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+            vfloat64m1_t ACC1r = tmp1r;
+            vfloat64m1_t ACC1i = tmp1i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                B1r = B[bi+1*2+0];
+                B1i = B[bi+1*2+1];
+                bi += 2*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 2*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m1( A0i, B1i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m1( A0r, B1i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B1r, A0r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B1r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat64m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            vfloat64m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
+            vfloat64m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            ci += ldc-gvl*0;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            
+            m_top += 2;
+        }
+
+
+        if( M & 1 ) {
+            double result0 = 0;
+            double result1 = 0;
+            double result2 = 0;
+            double result3 = 0;
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*1*2;
+                bi += off*2*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 1;
+                #else
+                    pass_K = off + 2;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
+                result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
+                result2+=S0*A[ai+0+0]*B[bi+2+0] + S1*A[ai+0+1]*B[bi+2+1];
+                result3+=S2*A[ai+0+1]*B[bi+2+0] + S3*A[ai+0+0]*B[bi+2+1];
+                ai+=1*2;
+                bi+=2*2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            double Cr, Ci;
+            Cr = result0*alphar;
+            Ci = result1*alphar;
+            Cr -= result1*alphai;
+            Ci += result0*alphai;
+            C[(ci+0*ldc+0)*2+0] = Cr;
+            C[(ci+0*ldc+0)*2+1] = Ci;
+            Cr = result2*alphar;
+            Ci = result3*alphar;
+            Cr -= result3*alphai;
+            Ci += result2*alphai;
+            C[(ci+1*ldc+0)*2+0] = Cr;
+            C[(ci+1*ldc+0)*2+1] = Ci;
+            m_top+=1;
+        }
+
+        n_top += 2;
+    }
+
+
+
+    // -- tails for N=1
+
+    if( N & 1 ) {
+        gvl = __riscv_vsetvl_e64m1(4);
+        m_top = 0;
+
+        for (BLASLONG i=0; i<M/8; i+=1) {
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*8*2;
+                bi += off*1*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 8;
+                #else
+                    pass_K = off + 1;
+                #endif
+            #endif
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            bi += 1*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 8*2;
+
+            // 4 vector regs to hold A array contents, 4 regs to hold values accumulated over k
+            // leaving 24 vector registers for temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            vfloat64m1_t tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
+            vfloat64m1_t tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
+            tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+            vfloat64m1_t ACC1r = tmp1r;
+            vfloat64m1_t ACC1i = tmp1i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                bi += 1*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                A1r = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2], sizeof(FLOAT)*2, gvl );
+                A1i = __riscv_vlse64_v_f64m1( &A[ai+1*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 8*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp1r = __riscv_vfmul_vf_f64m1( A1i, B0i, gvl);
+                tmp1i = __riscv_vfmul_vf_f64m1( A1r, B0i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                tmp1r = VFMACC_RR( tmp1r, B0r, A1r, gvl);
+                tmp1i = VFMACC_RI( tmp1i, B0r, A1i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+                ACC1r = __riscv_vfadd( ACC1r, tmp1r, gvl);
+                ACC1i = __riscv_vfadd( ACC1i, tmp1i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat64m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            vfloat64m1_t C1r = __riscv_vfmul( ACC1r, alphar, gvl );
+            vfloat64m1_t C1i = __riscv_vfmul( ACC1i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            C1r = __riscv_vfnmsac( C1r, alphai, ACC1i, gvl );
+            C1i = __riscv_vfmacc ( C1i, alphai, ACC1r, gvl );
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+             ci += gvl;
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C1r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C1i, gvl);
+            
+            m_top += 8;
+        }
+
+
+        if( M & 4 ) {
+            gvl = __riscv_vsetvl_e64m1(4);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*4*2;
+                bi += off*1*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 4;
+                #else
+                    pass_K = off + 1;
+                #endif
+            #endif
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            bi += 1*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 4*2;
+
+            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
+            // leaving 28 vector registers for temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                bi += 1*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 4*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat64m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            
+            m_top += 4;
+        }
+
+
+        if( M & 2 ) {
+            gvl = __riscv_vsetvl_e64m1(2);
+
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*2*2;
+                bi += off*1*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 2;
+                #else
+                    pass_K = off + 1;
+                #endif
+            #endif
+            double B0r = B[bi+0*2+0];
+            double B0i = B[bi+0*2+1];
+            bi += 1*2;
+
+            vfloat64m1_t A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+            vfloat64m1_t A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+            ai += 2*2;
+
+            // 2 vector regs to hold A array contents, 2 regs to hold values accumulated over k
+            // leaving 28 vector registers for temporaries
+            vfloat64m1_t tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+            vfloat64m1_t tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+            tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+            tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+            vfloat64m1_t ACC0r = tmp0r;
+            vfloat64m1_t ACC0i = tmp0i;
+
+            for(BLASLONG k=1; k<pass_K; k++) {
+                B0r = B[bi+0*2+0];
+                B0i = B[bi+0*2+1];
+                bi += 1*2;
+
+                A0r = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2], sizeof(FLOAT)*2, gvl );
+                A0i = __riscv_vlse64_v_f64m1( &A[ai+0*gvl*2+1], sizeof(FLOAT)*2, gvl );
+                ai += 2*2;
+
+                tmp0r = __riscv_vfmul_vf_f64m1( A0i, B0i, gvl);
+                tmp0i = __riscv_vfmul_vf_f64m1( A0r, B0i, gvl);
+                tmp0r = VFMACC_RR( tmp0r, B0r, A0r, gvl);
+                tmp0i = VFMACC_RI( tmp0i, B0r, A0i, gvl);
+                ACC0r = __riscv_vfadd( ACC0r, tmp0r, gvl);
+                ACC0i = __riscv_vfadd( ACC0i, tmp0i, gvl);
+            }
+
+
+            BLASLONG ci=n_top*ldc+m_top;
+
+            vfloat64m1_t C0r = __riscv_vfmul( ACC0r, alphar, gvl );
+            vfloat64m1_t C0i = __riscv_vfmul( ACC0i, alphar, gvl );
+            C0r = __riscv_vfnmsac( C0r, alphai, ACC0i, gvl );
+            C0i = __riscv_vfmacc ( C0i, alphai, ACC0r, gvl );
+            __riscv_vsse64_v_f64m1( &C[ci*2+0], sizeof(FLOAT)*2, C0r, gvl);
+            __riscv_vsse64_v_f64m1( &C[ci*2+1], sizeof(FLOAT)*2, C0i, gvl);
+            
+            m_top += 2;
+        }
+
+
+        if( M & 1 ) {
+            double result0 = 0;
+            double result1 = 0;
+            BLASLONG ai=m_top*K*2;
+            BLASLONG bi=n_top*K*2;
+            BLASLONG pass_K = K;
+            #ifdef LEFT
+                BLASLONG off = offset + m_top;
+            #else
+                BLASLONG off = -offset + n_top;
+            #endif
+            #ifdef BACKWARDS
+                ai += off*1*2;
+                bi += off*1*2;
+                pass_K -= off;
+            #else
+                #ifdef LEFT
+                    pass_K = off + 1;
+                #else
+                    pass_K = off + 1;
+                #endif
+            #endif
+
+            for(BLASLONG k=0; k<pass_K; k++) {
+                result0+=S0*A[ai+0+0]*B[bi+0+0] + S1*A[ai+0+1]*B[bi+0+1];
+                result1+=S2*A[ai+0+1]*B[bi+0+0] + S3*A[ai+0+0]*B[bi+0+1];
+                ai+=1*2;
+                bi+=1*2;
+            }
+
+            BLASLONG ci=n_top*ldc+m_top;
+            double Cr, Ci;
+            Cr = result0*alphar;
+            Ci = result1*alphar;
+            Cr -= result1*alphai;
+            Ci += result0*alphai;
+            C[(ci+0*ldc+0)*2+0] = Cr;
+            C[(ci+0*ldc+0)*2+1] = Ci;
+            m_top+=1;
+        }
+
+        n_top += 1;
+    }
+
+    return 0;
+}
+
diff --git a/kernel/riscv64/ztrmm_lncopy_rvv_v1.c b/kernel/riscv64/ztrmm_lncopy_rvv_v1.c
new file mode 100644
index 000000000..ae664561b
--- /dev/null
+++ b/kernel/riscv64/ztrmm_lncopy_rvv_v1.c
@@ -0,0 +1,155 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE)            /* single-precision build: 32-bit elements, m2 register groups */
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)        /* vector length granted for a request of n elements */
+#define FLOAT_V_T               vfloat32m2_t                   /* one vector of FLOAT */
+#define FLOAT_VX2_T             vfloat32m2x2_t                 /* tuple of two FLOAT vectors (segment load/store operand) */
+#define VGET_VX2                __riscv_vget_v_f32m2x2_f32m2   /* extract one field of the tuple */
+#define VSET_VX2                __riscv_vset_v_f32m2_f32m2x2   /* replace one field of the tuple */
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2          /* unit-stride load (not referenced by CNAME in this file) */
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2          /* unit-stride store (not referenced by CNAME in this file) */
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m2         /* strided load (not referenced by CNAME in this file) */
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2x2   /* strided two-field segment load */
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2    /* unit-stride two-field segment store */
+#define VBOOL_T                 vbool16_t                      /* mask type paired with e32/m2 vectors */
+#define UINT_V_T                vint32m2_t                     /* NOTE(review): signed type behind a UINT name; the DOUBLE branch is unsigned - confirm */
+#define VID_V_UINT              __riscv_vid_v_i32m2            /* lane indices 0,1,2,...; NOTE(review): i32 variant here vs u64 in the DOUBLE branch - confirm the toolchain provides it */
+#define VMSGTU_VX_UINT          __riscv_vmsgt_vx_i32m2_b16     /* mask of lanes greater than a scalar (signed compare in this branch) */
+#define VMSEQ_VX_UINT           __riscv_vmseq_vx_i32m2_b16     /* mask of lanes equal to a scalar */
+#define VFMERGE_VFM_FLOAT       __riscv_vfmerge_vfm_f32m2      /* scalar where the mask is set, original lane elsewhere */
+#else                           /* DOUBLE build: 64-bit elements, m2 register groups */
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)        /* vector length granted for a request of n elements */
+#define FLOAT_V_T               vfloat64m2_t                   /* one vector of FLOAT */
+#define FLOAT_VX2_T             vfloat64m2x2_t                 /* tuple of two FLOAT vectors (segment load/store operand) */
+#define VGET_VX2                __riscv_vget_v_f64m2x2_f64m2   /* extract one field of the tuple */
+#define VSET_VX2                __riscv_vset_v_f64m2_f64m2x2   /* replace one field of the tuple */
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2          /* unit-stride load (not referenced by CNAME in this file) */
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2          /* unit-stride store (not referenced by CNAME in this file) */
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m2         /* strided load (not referenced by CNAME in this file) */
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2x2   /* strided two-field segment load */
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2    /* unit-stride two-field segment store */
+#define VBOOL_T                 vbool32_t                      /* mask type paired with e64/m2 vectors */
+#define UINT_V_T                vuint64m2_t                    /* vector of unsigned lane indices */
+#define VID_V_UINT              __riscv_vid_v_u64m2            /* lane indices 0,1,2,... */
+#define VMSGTU_VX_UINT          __riscv_vmsgtu_vx_u64m2_b32    /* mask of lanes greater than a scalar (unsigned compare) */
+#define VMSEQ_VX_UINT           __riscv_vmseq_vx_u64m2_b32     /* mask of lanes equal to a scalar */
+#define VFMERGE_VFM_FLOAT       __riscv_vfmerge_vfm_f64m2      /* scalar where the mask is set, original lane elsewhere */
+#endif                          /* !defined(DOUBLE) */
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+    /* Packs a into b in strips of up to vl elements along n, emitting packed rows of vl two-FLOAT elements until i reaches m; always returns 0. */
+    BLASLONG i, js, X;          /* i: rows handled in the current strip; js: count still to do out of n; X: running position compared with posY */
+    /* Every offset into a and b is scaled by 2: each element occupies two FLOATs (presumably real and imaginary parts - confirm). */
+    FLOAT *ao;                  /* read cursor into a */
+    /* Byte stride handed to the segment loads: the distance between elements lda entries apart in a. */
+    BLASLONG stride_lda = sizeof(FLOAT)*lda*2;
+    /* vax2 is the two-field tuple as loaded/stored; va0 and va1 are its fields 0 and 1 unpacked for masking. */
+    FLOAT_VX2_T vax2;
+    FLOAT_V_T va0, va1;
+    /* vl: number of elements in the current strip, as returned by VSETVL. */
+    size_t vl;
+#ifdef UNIT
+    VBOOL_T vbool_eq;           /* lanes whose index equals the current row j (UNIT builds only) */
+#endif
+    /* vbool_cmp: lanes whose index is greater than j; vindex: lane indices generated by VID_V_UINT. */
+    VBOOL_T vbool_cmp;
+    UINT_V_T vindex;
+    /* Walk n in strips: each pass handles vl of them and then moves posY forward by vl. */
+    for (js = n; js > 0; js -= vl)
+    {
+        vl = VSETVL(js);
+        X = posX;
+        /* Choose the starting element; the two branches swap which of posX/posY is scaled by lda. */
+        if (posX <= posY) 
+        {
+            ao = a + posY * 2 + posX * lda * 2;
+        } 
+        else 
+        {
+            ao = a + posX * 2 + posY * lda * 2;
+        }
+        /* do/while: the body runs at least once and stops once i has reached m. */
+        i = 0;
+        do 
+        {
+            if (X > posY)       /* past posY: copy vl elements unmasked */
+            {
+                vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl);   /* gather vl elements spaced stride_lda bytes apart */
+                VSSEG2_FLOAT(b, vax2, vl);                  /* write them contiguously at b */
+                /* Advance one element (2 FLOATs) in a and one packed row (vl elements) in b. */
+                ao  += 2;
+                b   += vl * 2;
+                /* One row done. */
+                X ++;
+                i ++;
+            } 
+            else if (X < posY)  /* before posY: nothing is stored, only the cursors move */
+            {
+                ao  += lda * 2;      /* a advances by lda elements here, not by one as in the other branches */
+                b   += vl * 2;       /* the vl*2 FLOATs skipped in b are left unwritten by this routine */
+                X ++;
+                i ++;
+            } 
+            else                /* X == posY: emit a block of vl rows with per-row masking */
+            {
+                vindex  = VID_V_UINT(vl);
+                for (unsigned int j = 0; j < vl; j++) 
+                {
+                    vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl);
+                    va0 = VGET_VX2(vax2, 0);
+                    va1 = VGET_VX2(vax2, 1);
+                    /* Row j keeps lanes 0..j; lanes with index greater than j become ZERO in both fields. */
+                    vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl);
+                    va0 = VFMERGE_VFM_FLOAT(va0, ZERO, vbool_cmp, vl);
+                    va1 = VFMERGE_VFM_FLOAT(va1, ZERO, vbool_cmp, vl);
+#ifdef UNIT
+                    vbool_eq = VMSEQ_VX_UINT(vindex, j, vl);                /* selects lane j of row j */
+                    va0 =  VFMERGE_VFM_FLOAT(va0, ONE, vbool_eq, vl);       /* field 0 of that lane forced to ONE */
+                    va1 =  VFMERGE_VFM_FLOAT(va1, ZERO, vbool_eq, vl);      /* field 1 of that lane forced to ZERO */
+#endif
+                    vax2 = VSET_VX2(vax2, 0, va0);
+                    vax2 = VSET_VX2(vax2, 1, va1);
+                    VSSEG2_FLOAT(b, vax2, vl);
+                    ao  += 2;
+                    b   += vl * 2;
+                }
+                /* NOTE(review): this branch always advances by a full vl rows, even if fewer than vl remain before m - confirm callers guarantee that. */
+                X += vl;
+                i += vl;
+            }
+        } while (i < m);
+        /* The next strip starts vl further along posY. */
+        posY += vl;
+    }
+    /* No failure path: the result is always 0. */
+    return 0;
+}
diff --git a/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c b/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c
new file mode 100644
index 000000000..ab8d34337
--- /dev/null
+++ b/kernel/riscv64/ztrmm_ltcopy_rvv_v1.c
@@ -0,0 +1,152 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include <stdio.h>
+#include "common.h"
+
+#ifndef DOUBLE                  /* single precision: 32-bit lanes, LMUL = 2 */
+#define FLOAT_V_T               vfloat32m2_t
+#define FLOAT_VX2_T             vfloat32m2x2_t
+#define UINT_V_T                vuint32m2_t
+#define VBOOL_T                 vbool16_t
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2
+#define VGET_VX2                __riscv_vget_v_f32m2x2_f32m2
+#define VSET_VX2                __riscv_vset_v_f32m2_f32m2x2
+#define VID_V_UINT              __riscv_vid_v_u32m2
+#define VMSLTU_VX_UINT          __riscv_vmsltu_vx_u32m2_b16
+#define VMSEQ_VX_UINT           __riscv_vmseq_vx_u32m2_b16
+#define VFMERGE_VFM_FLOAT       __riscv_vfmerge_vfm_f32m2
+#else                           /* double precision: 64-bit lanes, LMUL = 2 */
+#define FLOAT_V_T               vfloat64m2_t
+#define FLOAT_VX2_T             vfloat64m2x2_t
+#define UINT_V_T                vuint64m2_t
+#define VBOOL_T                 vbool32_t
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2
+#define VGET_VX2                __riscv_vget_v_f64m2x2_f64m2
+#define VSET_VX2                __riscv_vset_v_f64m2_f64m2x2
+#define VID_V_UINT              __riscv_vid_v_u64m2
+#define VMSLTU_VX_UINT          __riscv_vmsltu_vx_u64m2_b32
+#define VMSEQ_VX_UINT           __riscv_vmseq_vx_u64m2_b32
+#define VFMERGE_VFM_FLOAT       __riscv_vfmerge_vfm_f64m2
+#endif                          /* !DOUBLE */
+
+/*
+ * TRMM packing routine for complex data: RVV build of the "ltcopy" variant.
+ *
+ * The n extent is consumed in strips of vl = VSETVL(remaining n) lanes.
+ * For every strip the routine emits packed rows of vl interleaved (re, im)
+ * pairs into b, choosing one of three actions from the running index `row`
+ * (started at posX) and the strip origin posY:
+ *   row <  posY : load vl contiguous complex values at src and store them;
+ *                 src then moves on by lda complex elements.
+ *   row >  posY : nothing is stored; src moves on by one complex element
+ *                 and b still skips one packed row.
+ *   row == posY : vl packed rows are produced in one go. In step d the lanes
+ *                 with index below d are replaced by ZERO and, when UNIT is
+ *                 defined, lane d itself becomes ONE + ZERO*i.
+ *
+ *   m, n       - m bounds the inner do/while, n is split into the strips
+ *   a, lda     - source matrix (two FLOATs per element), leading dimension
+ *   posX, posY - start coordinates inside a; posY grows by vl per strip
+ *   b          - output buffer, advanced by 2 * vl FLOATs per packed row
+ *
+ * Always returns 0.
+ */
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+    BLASLONG cols_left, done, row;
+    FLOAT *src;
+    size_t vl;
+
+    for (cols_left = n; cols_left > 0; cols_left -= vl)
+    {
+        vl  = VSETVL(cols_left);
+        row = posX;
+        /* Start element: posX and posY swap roles once posX exceeds posY. */
+        src = (posX <= posY) ? a + posY * 2 + posX * lda * 2
+                             : a + posX * 2 + posY * lda * 2;
+
+        done = 0;
+        do
+        {
+            if (row == posY)
+            {
+                /* Lane numbers 0 .. vl-1, compared against the step counter. */
+                UINT_V_T lane = VID_V_UINT(vl);
+                for (unsigned int d = 0; d < vl; d++)
+                {
+                    FLOAT_VX2_T seg  = VLSEG2_FLOAT(src, vl);
+                    VBOOL_T     low  = VMSLTU_VX_UINT(lane, d, vl);
+                    FLOAT_V_T   re   = VFMERGE_VFM_FLOAT(VGET_VX2(seg, 0), ZERO, low, vl);
+                    FLOAT_V_T   im   = VFMERGE_VFM_FLOAT(VGET_VX2(seg, 1), ZERO, low, vl);
+#ifdef UNIT
+                    VBOOL_T     diag = VMSEQ_VX_UINT(lane, d, vl);
+                    re = VFMERGE_VFM_FLOAT(re, ONE, diag, vl);
+                    im = VFMERGE_VFM_FLOAT(im, ZERO, diag, vl);
+#endif
+                    seg = VSET_VX2(seg, 0, re);
+                    seg = VSET_VX2(seg, 1, im);
+                    VSSEG2_FLOAT(b, seg, vl);
+                    src += lda * 2;
+                    b   += vl * 2;
+                }
+
+                row  += vl;
+                done += vl;
+            }
+            else
+            {
+                if (row < posY)
+                {
+                    VSSEG2_FLOAT(b, VLSEG2_FLOAT(src, vl), vl);
+                    src += lda * 2;
+                }
+                else
+                {
+                    src += 2;
+                }
+                /* A packed row is consumed whether or not it was written. */
+                b += vl * 2;
+                row++;
+                done++;
+            }
+        } while (done < m);
+
+        posY += vl;
+    }
+
+    return 0;
+}
+
diff --git a/kernel/riscv64/ztrmm_uncopy_rvv_v1.c b/kernel/riscv64/ztrmm_uncopy_rvv_v1.c
new file mode 100644
index 000000000..ba6e63b96
--- /dev/null
+++ b/kernel/riscv64/ztrmm_uncopy_rvv_v1.c
@@ -0,0 +1,154 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include <stdio.h>
+#include "common.h"
+
+#ifndef DOUBLE                  /* single precision: 32-bit lanes, LMUL = 2 */
+#define FLOAT_V_T               vfloat32m2_t
+#define FLOAT_VX2_T             vfloat32m2x2_t
+#define UINT_V_T                vuint32m2_t
+#define VBOOL_T                 vbool16_t
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VLSEV_FLOAT             __riscv_vlse32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2
+#define VGET_VX2                __riscv_vget_v_f32m2x2_f32m2
+#define VSET_VX2                __riscv_vset_v_f32m2_f32m2x2
+#define VID_V_UINT              __riscv_vid_v_u32m2
+#define VMSLTU_VX_UINT          __riscv_vmsltu_vx_u32m2_b16
+#define VMSEQ_VX_UINT           __riscv_vmseq_vx_u32m2_b16
+#define VFMERGE_VFM_FLOAT       __riscv_vfmerge_vfm_f32m2
+#else                           /* double precision: 64-bit lanes, LMUL = 2 */
+#define FLOAT_V_T               vfloat64m2_t
+#define FLOAT_VX2_T             vfloat64m2x2_t
+#define UINT_V_T                vuint64m2_t
+#define VBOOL_T                 vbool32_t
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VLSEV_FLOAT             __riscv_vlse64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2
+#define VGET_VX2                __riscv_vget_v_f64m2x2_f64m2
+#define VSET_VX2                __riscv_vset_v_f64m2_f64m2x2
+#define VID_V_UINT              __riscv_vid_v_u64m2
+#define VMSLTU_VX_UINT          __riscv_vmsltu_vx_u64m2_b32
+#define VMSEQ_VX_UINT           __riscv_vmseq_vx_u64m2_b32
+#define VFMERGE_VFM_FLOAT       __riscv_vfmerge_vfm_f64m2
+#endif                          /* !DOUBLE */
+
+/*
+ * TRMM packing routine for complex data: RVV build of the "uncopy" variant.
+ *
+ * The n extent is consumed in strips of vl = VSETVL(remaining n) lanes.
+ * Every packed row holds vl interleaved (re, im) pairs gathered with a
+ * strided segment load; consecutive lanes are lda complex elements apart
+ * (byte stride sizeof(FLOAT) * lda * 2). The running index `row` (started
+ * at posX) is compared with the strip origin posY to pick the action:
+ *   row <  posY : gather vl complex values at src and store them; src then
+ *                 moves on by one complex element.
+ *   row >  posY : nothing is stored; src moves on by lda complex elements
+ *                 and b still skips one packed row.
+ *   row == posY : vl packed rows are produced in one go. In step d the lanes
+ *                 with index below d are replaced by ZERO and, when UNIT is
+ *                 defined, lane d itself becomes ONE + ZERO*i.
+ *
+ *   m, n       - m bounds the inner do/while, n is split into the strips
+ *   a, lda     - source matrix (two FLOATs per element), leading dimension
+ *   posX, posY - start coordinates inside a; posY grows by vl per strip
+ *   b          - output buffer, advanced by 2 * vl FLOATs per packed row
+ *
+ * Always returns 0.
+ */
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+    const BLASLONG lane_stride = sizeof(FLOAT) * lda * 2;   /* in bytes */
+    BLASLONG cols_left, done, row;
+    FLOAT *src;
+    size_t vl;
+
+    for (cols_left = n; cols_left > 0; cols_left -= vl)
+    {
+        vl  = VSETVL(cols_left);
+        row = posX;
+        /* Start element: posX and posY swap roles once posX exceeds posY. */
+        src = (posX <= posY) ? a + posX * 2 + posY * lda * 2
+                             : a + posY * 2 + posX * lda * 2;
+
+        done = 0;
+        do
+        {
+            if (row == posY)
+            {
+                /* Lane numbers 0 .. vl-1, compared against the step counter. */
+                UINT_V_T lane = VID_V_UINT(vl);
+                for (unsigned int d = 0; d < vl; d++)
+                {
+                    FLOAT_VX2_T seg  = VLSSEG2_FLOAT(src, lane_stride, vl);
+                    VBOOL_T     low  = VMSLTU_VX_UINT(lane, d, vl);
+                    FLOAT_V_T   re   = VFMERGE_VFM_FLOAT(VGET_VX2(seg, 0), ZERO, low, vl);
+                    FLOAT_V_T   im   = VFMERGE_VFM_FLOAT(VGET_VX2(seg, 1), ZERO, low, vl);
+#ifdef UNIT
+                    VBOOL_T     diag = VMSEQ_VX_UINT(lane, d, vl);
+                    re = VFMERGE_VFM_FLOAT(re, ONE, diag, vl);
+                    im = VFMERGE_VFM_FLOAT(im, ZERO, diag, vl);
+#endif
+                    seg = VSET_VX2(seg, 0, re);
+                    seg = VSET_VX2(seg, 1, im);
+                    VSSEG2_FLOAT(b, seg, vl);
+                    src += 2;
+                    b   += vl * 2;
+                }
+                row  += vl;
+                done += vl;
+            }
+            else
+            {
+                if (row < posY)
+                {
+                    VSSEG2_FLOAT(b, VLSSEG2_FLOAT(src, lane_stride, vl), vl);
+                    src += 2;
+                }
+                else
+                {
+                    src += lda * 2;
+                }
+                /* A packed row is consumed whether or not it was written. */
+                b += vl * 2;
+                row++;
+                done++;
+            }
+        } while (done < m);
+
+        posY += vl;
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/ztrmm_utcopy_rvv_v1.c b/kernel/riscv64/ztrmm_utcopy_rvv_v1.c
new file mode 100644
index 000000000..a624fff54
--- /dev/null
+++ b/kernel/riscv64/ztrmm_utcopy_rvv_v1.c
@@ -0,0 +1,151 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include <stdio.h>
+#include "common.h"
+
+#ifndef DOUBLE                  /* single precision: 32-bit lanes, LMUL = 2 */
+#define FLOAT_V_T               vfloat32m2_t
+#define FLOAT_VX2_T             vfloat32m2x2_t
+#define UINT_V_T                vuint32m2_t
+#define VBOOL_T                 vbool16_t
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2
+#define VGET_VX2                __riscv_vget_v_f32m2x2_f32m2
+#define VSET_VX2                __riscv_vset_v_f32m2_f32m2x2
+#define VID_V_UINT              __riscv_vid_v_u32m2
+#define VMSGTU_VX_UINT          __riscv_vmsgtu_vx_u32m2_b16
+#define VMSEQ_VX_UINT           __riscv_vmseq_vx_u32m2_b16
+#define VFMERGE_VFM_FLOAT       __riscv_vfmerge_vfm_f32m2
+#else                           /* double precision: 64-bit lanes, LMUL = 2 */
+#define FLOAT_V_T               vfloat64m2_t
+#define FLOAT_VX2_T             vfloat64m2x2_t
+#define UINT_V_T                vuint64m2_t
+#define VBOOL_T                 vbool32_t
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2
+#define VGET_VX2                __riscv_vget_v_f64m2x2_f64m2
+#define VSET_VX2                __riscv_vset_v_f64m2_f64m2x2
+#define VID_V_UINT              __riscv_vid_v_u64m2
+#define VMSGTU_VX_UINT          __riscv_vmsgtu_vx_u64m2_b32
+#define VMSEQ_VX_UINT           __riscv_vmseq_vx_u64m2_b32
+#define VFMERGE_VFM_FLOAT       __riscv_vfmerge_vfm_f64m2
+#endif                          /* !DOUBLE */
+
+/*
+ * TRMM packing routine for complex data: RVV build of the "utcopy" variant.
+ *
+ * The n extent is consumed in strips of vl = VSETVL(remaining n) lanes.
+ * For every strip the routine emits packed rows of vl interleaved (re, im)
+ * pairs into b, choosing one of three actions from the running index `row`
+ * (started at posX) and the strip origin posY:
+ *   row >  posY : load vl contiguous complex values at src and store them;
+ *                 src then moves on by lda complex elements.
+ *   row <  posY : nothing is stored; src moves on by one complex element
+ *                 and b still skips one packed row.
+ *   row == posY : vl packed rows are produced in one go. In step d the lanes
+ *                 with index above d are replaced by ZERO and, when UNIT is
+ *                 defined, lane d itself becomes ONE + ZERO*i.
+ *
+ *   m, n       - m bounds the inner do/while, n is split into the strips
+ *   a, lda     - source matrix (two FLOATs per element), leading dimension
+ *   posX, posY - start coordinates inside a; posY grows by vl per strip
+ *   b          - output buffer, advanced by 2 * vl FLOATs per packed row
+ *
+ * Always returns 0.
+ */
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
+    BLASLONG cols_left, done, row, d;
+    FLOAT *src;
+    size_t vl;
+
+    for (cols_left = n; cols_left > 0; cols_left -= vl)
+    {
+        vl  = VSETVL(cols_left);
+        row = posX;
+        /* Start element: posX and posY swap roles once posX exceeds posY. */
+        src = (posX <= posY) ? a + posX * 2 + posY * lda * 2
+                             : a + posY * 2 + posX * lda * 2;
+        done = 0;
+        do
+        {
+            if (row == posY)
+            {
+                /* Lane numbers 0 .. vl-1, compared against the step counter. */
+                UINT_V_T lane = VID_V_UINT(vl);
+                for (d = 0; d < vl; d++)
+                {
+                    FLOAT_VX2_T seg  = VLSEG2_FLOAT(src, vl);
+                    VBOOL_T     high = VMSGTU_VX_UINT(lane, d, vl);
+                    FLOAT_V_T   re   = VFMERGE_VFM_FLOAT(VGET_VX2(seg, 0), ZERO, high, vl);
+                    FLOAT_V_T   im   = VFMERGE_VFM_FLOAT(VGET_VX2(seg, 1), ZERO, high, vl);
+#ifdef UNIT
+                    VBOOL_T     diag = VMSEQ_VX_UINT(lane, d, vl);
+                    re = VFMERGE_VFM_FLOAT(re, ONE, diag, vl);
+                    im = VFMERGE_VFM_FLOAT(im, ZERO, diag, vl);
+#endif
+                    seg = VSET_VX2(seg, 0, re);
+                    seg = VSET_VX2(seg, 1, im);
+                    VSSEG2_FLOAT(b, seg, vl);
+                    src += lda * 2;
+                    b   += vl * 2;
+                }
+                row  += vl;
+                done += vl;
+            }
+            else
+            {
+                if (row > posY)
+                {
+                    VSSEG2_FLOAT(b, VLSEG2_FLOAT(src, vl), vl);
+                    src += lda * 2;
+                }
+                else
+                {
+                    src += 2;
+                }
+                /* A packed row is consumed whether or not it was written. */
+                b += vl * 2;
+                row++;
+                done++;
+            }
+        } while (done < m);
+        posY += vl;
+    }
+
+    return 0;
+}
+
diff --git a/kernel/riscv64/ztrmmkernel_2x2_rvv.c b/kernel/riscv64/ztrmmkernel_2x2_rvv.c
new file mode 100644
index 000000000..399124d2e
--- /dev/null
+++ b/kernel/riscv64/ztrmmkernel_2x2_rvv.c
@@ -0,0 +1,596 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#ifndef DOUBLE                  /* single precision: 32-bit lanes */
+#define FLOAT_V_T               vfloat32m2_t
+#define FLOAT_V_T_M1            vfloat32m1_t
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e32m2()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e32m1()
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2   /* NOTE(review): non-tuple name, called with output pointers - confirm intrinsic API version */
+#define VLSEG4_FLOAT            __riscv_vlseg4e32_v_f32m2   /* NOTE(review): same as VLSEG2_FLOAT */
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m2
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f32m1_f32
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m2
+#define VFMACCVV_FLOAT          __riscv_vfmacc_vv_f32m2
+#define VFNMSACVV_FLOAT         __riscv_vfnmsac_vv_f32m2
+#define VFREDSUMVS_FLOAT        __riscv_vfredusum_vs_f32m2_f32m1
+#else                           /* double precision: 64-bit lanes */
+#define FLOAT_V_T               vfloat64m2_t
+#define FLOAT_V_T_M1            vfloat64m1_t
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define VSETVL_MAX              __riscv_vsetvlmax_e64m2()
+#define VSETVL_MAX_M1           __riscv_vsetvlmax_e64m1()
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2   /* NOTE(review): non-tuple name, called with output pointers - confirm intrinsic API version */
+#define VLSEG4_FLOAT            __riscv_vlseg4e64_v_f64m2   /* NOTE(review): same as VLSEG2_FLOAT */
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m2
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
+#define VFMVFS_FLOAT_M1         __riscv_vfmv_f_s_f64m1_f64
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m2
+#define VFMACCVV_FLOAT          __riscv_vfmacc_vv_f64m2
+#define VFNMSACVV_FLOAT         __riscv_vfnmsac_vv_f64m2
+#define VFREDSUMVS_FLOAT        __riscv_vfredusum_vs_f64m2_f64m1
+#endif                          /* !DOUBLE */
+
+// Optimizes the implementation in ../generic/ztrmmkernel_2x2.c
+
+
+/********************************
+  ADD1 a*c
+  ADD2 b*c
+  ADD3 a*d
+  ADD4 b*d
+ *********************************/
+int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,
+        FLOAT* C,BLASLONG ldc, BLASLONG offset)
+{
+    BLASLONG i,j,k;
+    FLOAT *C0,*C1,*ptrba,*ptrbb;
+    FLOAT res0,res1;
+    BLASLONG off, temp;
+    
+    FLOAT_V_T va0, va1, va2, va3, vb0, vb1, vb2, vb3;
+    FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7;
+    FLOAT_V_T_M1 v_m1_res0, v_m1_res1;
+    FLOAT_V_T_M1 v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
+
+    size_t vl;
+    size_t vlmax = VSETVL_MAX;
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    off = -offset;
+#else
+    off = 0;
+#endif
+
+    for (j = bn/2; j > 0; j--)
+    {
+#if defined(TRMMKERNEL) &&  defined(LEFT)
+        off = offset;
+#endif
+        C0 = C;
+        C1 = C0+2*ldc;
+        ptrba = ba;
+
+        for (i = bm/2; i > 0; i--)
+        {
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*2*2;
+            ptrbb = bb+off*2*2;
+#endif
+
+            vres0 = VFMVVF_FLOAT(0.0, vlmax);
+            vres1 = VFMVVF_FLOAT(0.0, vlmax);
+            vres2 = VFMVVF_FLOAT(0.0, vlmax);
+            vres3 = VFMVVF_FLOAT(0.0, vlmax);
+            vres4 = VFMVVF_FLOAT(0.0, vlmax);
+            vres5 = VFMVVF_FLOAT(0.0, vlmax);
+            vres6 = VFMVVF_FLOAT(0.0, vlmax);
+            vres7 = VFMVVF_FLOAT(0.0, vlmax);
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk - off;
+#elif defined(LEFT)
+            temp = off + 2;
+#else
+            temp = off + 2;
+#endif
+
+            for (k = temp; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl);
+                VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl);
+
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
+                vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
+
+                vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
+                vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl);
+                vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl);
+                vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl);
+
+                vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl);
+                vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl);
+                vres4 = VFNMSACVV_FLOAT(vres4, va1, vb3, vl);
+                vres5 = VFMACCVV_FLOAT(vres5, va0, vb3, vl);
+
+                vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl);
+                vres7 = VFMACCVV_FLOAT(vres7, va3, vb2, vl);
+                vres6 = VFNMSACVV_FLOAT(vres6, va3, vb3, vl);
+                vres7 = VFMACCVV_FLOAT(vres7, va2, vb3, vl);
+#endif
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
+                vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
+                vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
+
+                vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
+                vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl);
+                vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl);
+                vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl);
+
+                vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl);
+                vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl);
+                vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl);
+                vres5 = VFNMSACVV_FLOAT(vres5, va0, vb3, vl);
+
+                vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl);
+                vres7 = VFMACCVV_FLOAT(vres7, va3, vb2, vl);
+                vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl);
+                vres7 = VFNMSACVV_FLOAT(vres7, va2, vb3, vl);
+#endif
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
+                vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
+
+                vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
+                vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl);
+                vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl);
+                vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl);
+
+                vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl);
+                vres5 = VFNMSACVV_FLOAT(vres5, va1, vb2, vl);
+                vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl);
+                vres5 = VFMACCVV_FLOAT(vres5, va0, vb3, vl);
+
+                vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl);
+                vres7 = VFNMSACVV_FLOAT(vres7, va3, vb2, vl);
+                vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl);
+                vres7 = VFMACCVV_FLOAT(vres7, va2, vb3, vl);
+#endif
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
+                vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
+                vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
+
+                vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
+                vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl);
+                vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl);
+                vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl);
+
+                vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl);
+                vres5 = VFNMSACVV_FLOAT(vres5, va1, vb2, vl);
+                vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl);
+                vres5 = VFNMSACVV_FLOAT(vres5, va0, vb3, vl);
+
+                vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl);
+                vres7 = VFNMSACVV_FLOAT(vres7, va3, vb2, vl);
+                vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl);
+                vres7 = VFNMSACVV_FLOAT(vres7, va2, vb3, vl);
+
+#endif
+                ptrba += vl * 4;
+                ptrbb += vl * 4;
+            }
+            
+            v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax);
+            v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax);
+            res0 = VFMVFS_FLOAT_M1(v_m1_res0);
+            res1 = VFMVFS_FLOAT_M1(v_m1_res1);
+            C0[0] = res0 * alphar - res1 * alphai;
+            C0[1] = res1 * alphar + res0 * alphai;
+
+            v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax);
+            v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax);
+            res0 = VFMVFS_FLOAT_M1(v_m1_res0);
+            res1 = VFMVFS_FLOAT_M1(v_m1_res1);
+            C0[2] = res0 * alphar - res1 * alphai;
+            C0[3] = res1 * alphar + res0 * alphai;
+
+            v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres4, v_z0, vlmax);
+            v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres5, v_z0, vlmax);
+            res0 = VFMVFS_FLOAT_M1(v_m1_res0);
+            res1 = VFMVFS_FLOAT_M1(v_m1_res1);
+            C1[0] = res0 * alphar - res1 * alphai;
+            C1[1] = res1 * alphar + res0 * alphai;
+
+            v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres6, v_z0, vlmax);
+            v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres7, v_z0, vlmax);
+            res0 = VFMVFS_FLOAT_M1(v_m1_res0);
+            res1 = VFMVFS_FLOAT_M1(v_m1_res1);
+            C1[2] = res0 * alphar - res1 * alphai;
+            C1[3] = res1 * alphar + res0 * alphai;
+#if ( defined(LEFT) &&  defined(TRANSA)) || \
+            (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 2;
+#else
+            temp -= 2;
+#endif
+
+            ptrba += temp*2*2;
+            ptrbb += temp*2*2;
+
+#endif
+
+#ifdef LEFT
+            off += 2;
+#endif
+
+            C0 = C0+4;
+            C1 = C1+4;
+        }
+
+        if (bm & 1)
+        {
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*2;
+            ptrbb = bb + off*2*2;
+#endif
+            vres0 = VFMVVF_FLOAT(0.0, vlmax);
+            vres1 = VFMVVF_FLOAT(0.0, vlmax);
+            vres2 = VFMVVF_FLOAT(0.0, vlmax);
+            vres3 = VFMVVF_FLOAT(0.0, vlmax);
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk - off;
+#elif defined(LEFT)
+            temp = off+1;
+#else
+            temp = off+2;
+#endif
+            for (k = temp; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
+                VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl);
+
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
+                vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
+
+                vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl);
+                vres3 = VFMACCVV_FLOAT(vres3, va1, vb2, vl);
+                vres2 = VFNMSACVV_FLOAT(vres2, va1, vb3, vl);
+                vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl);
+#endif
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
+                vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
+                vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
+
+                vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl);
+                vres3 = VFMACCVV_FLOAT(vres3, va1, vb2, vl);
+                vres2 = VFMACCVV_FLOAT(vres2, va1, vb3, vl);
+                vres3 = VFNMSACVV_FLOAT(vres3, va0, vb3, vl);
+
+#endif
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
+                vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
+
+                vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl);
+                vres3 = VFNMSACVV_FLOAT(vres3, va1, vb2, vl);
+                vres2 = VFMACCVV_FLOAT(vres2, va1, vb3, vl);
+                vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl);
+
+#endif
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
+                vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
+                vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
+
+                vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl);
+                vres3 = VFNMSACVV_FLOAT(vres3, va1, vb2, vl);
+                vres2 = VFNMSACVV_FLOAT(vres2, va1, vb3, vl);
+                vres3 = VFNMSACVV_FLOAT(vres3, va0, vb3, vl);
+
+#endif
+                ptrba += vl * 2;
+                ptrbb += vl * 4;
+            }
+            v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax);
+            v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax);
+            res0 = VFMVFS_FLOAT_M1(v_m1_res0);
+            res1 = VFMVFS_FLOAT_M1(v_m1_res1);
+            C0[0] = res0 * alphar - res1 * alphai;
+            C0[1] = res1 * alphar + res0 * alphai;
+
+            v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax);
+            v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax);
+            res0 = VFMVFS_FLOAT_M1(v_m1_res0);
+            res1 = VFMVFS_FLOAT_M1(v_m1_res1);
+            C1[0] = res0 * alphar - res1 * alphai;
+            C1[1] = res1 * alphar + res0 * alphai;
+
+#if ( defined(LEFT) &&  defined(TRANSA)) || \
+            (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 1;
+#else
+            temp -= 2;
+#endif
+            ptrba += temp*2;
+            ptrbb += temp*2*2;
+#endif
+#ifdef LEFT
+            off += 1;
+#endif
+            C0 = C0+2;
+            C1 = C1+2;
+        }
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 2;
+#endif
+        k = (bk<<2);
+        bb = bb+k;
+        i = (ldc<<2);
+        C = C+i;
+    }
+
+    if (bn & 1)
+    {
+        C0 = C;
+#if defined(TRMMKERNEL) &&  defined(LEFT)
+        off = offset;
+#endif
+        ptrba = ba;
+
+        for (i = bm/2; i > 0; i--)
+        {
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*2*2;
+            ptrbb = bb+off*2;
+#endif
+            vres0 = VFMVVF_FLOAT(0.0, vlmax);
+            vres1 = VFMVVF_FLOAT(0.0, vlmax);
+            vres2 = VFMVVF_FLOAT(0.0, vlmax);
+            vres3 = VFMVVF_FLOAT(0.0, vlmax);
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk - off;
+#elif defined(LEFT)
+            temp = off + 2;
+#else
+            temp = off + 1;
+#endif
+
+            for (k = temp; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl);
+                VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl);
+
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
+                vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
+
+                vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
+                vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl);
+                vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl);
+                vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl);
+#endif
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
+                vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
+                vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
+
+                vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
+                vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl);
+                vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl);
+                vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl);
+#endif
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
+                vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
+
+                vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
+                vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl);
+                vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl);
+                vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl);
+
+#endif
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
+                vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
+                vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
+
+                vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
+                vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl);
+                vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl);
+                vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl);
+
+#endif
+                ptrba += vl * 4;
+                ptrbb += vl * 2;
+            }
+            v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax);
+            v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax);
+            res0 = VFMVFS_FLOAT_M1(v_m1_res0);
+            res1 = VFMVFS_FLOAT_M1(v_m1_res1);
+            C0[0] = res0 * alphar - res1 * alphai;
+            C0[1] = res1 * alphar + res0 * alphai;
+
+            v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax);
+            v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax);
+            res0 = VFMVFS_FLOAT_M1(v_m1_res0);
+            res1 = VFMVFS_FLOAT_M1(v_m1_res1);
+            C0[2] = res0 * alphar - res1 * alphai;
+            C0[3] = res1 * alphar + res0 * alphai;
+
+#if ( defined(LEFT) &&  defined(TRANSA)) || \
+            (!defined(LEFT) && !defined(TRANSA))
+            temp = bk-off;
+#ifdef LEFT
+            temp -= 2;
+#else
+            temp -= 1;
+#endif
+            ptrba += temp*2*2;
+            ptrbb += temp*2;
+#endif
+#ifdef LEFT
+            off += 2;
+#endif
+            C0 = C0+4;
+        }
+
+        if (bm & 1)
+        {
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*2;
+            ptrbb = bb + off*2;
+#endif
+            vres0 = VFMVVF_FLOAT(0.0, vlmax);
+            vres1 = VFMVVF_FLOAT(0.0, vlmax);
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off + 1;
+#else
+            temp = off + 1;
+#endif
+
+            for (k = temp; k > 0; k -= vl)
+            {
+                vl = VSETVL(k);
+                VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
+                VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl);
+
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
+                vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
+
+#endif
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
+                vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
+                vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
+
+#endif
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
+                vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
+                vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
+
+#endif
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+                vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
+                vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
+                vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
+                vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
+
+#endif
+                ptrba += vl * 2;
+                ptrbb += vl * 2;
+                
+            }
+            
+            v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax);
+            v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax);
+            res0 = VFMVFS_FLOAT_M1(v_m1_res0);
+            res1 = VFMVFS_FLOAT_M1(v_m1_res1);
+            
+            C0[0] = res0 * alphar - res1 * alphai;
+            C0[1] = res1 * alphar + res0 * alphai;
+
+#if ( defined(LEFT) &&  defined(TRANSA)) || \
+            (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= 1;
+#else
+            temp -= 1;
+#endif
+            ptrba += temp*2;
+            ptrbb += temp*2;
+            
+#endif
+#ifdef LEFT
+            off += 1;
+#endif
+            C0 = C0+2;
+        }
+        k = (bk<<1);
+        bb = bb+k;
+        i = (ldc<<1);
+        C = C+i;
+    }
+    return 0;
+}
diff --git a/kernel/riscv64/ztrmmkernel_rvv_v1x4.c b/kernel/riscv64/ztrmmkernel_rvv_v1x4.c
new file mode 100644
index 000000000..db5f06af8
--- /dev/null
+++ b/kernel/riscv64/ztrmmkernel_rvv_v1x4.c
@@ -0,0 +1,632 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+#if !defined(DOUBLE)                                             /* single precision: 32-bit elements, LMUL=2 register groups */
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)          /* vector length to use for n remaining elements */
+#define FLOAT_V_T               vfloat32m2_t                     /* one vector register group of FLOATs */
+#define FLOAT_VX2_T             vfloat32m2x2_t                   /* tuple of two groups, operand of the segment load/store */
+#define VGET_VX2                __riscv_vget_v_f32m2x2_f32m2     /* read one field of a tuple */
+#define VSET_VX2                __riscv_vset_v_f32m2_f32m2x2     /* write one field of a tuple */
+#define VLEV_FLOAT              __riscv_vle32_v_f32m2            /* unit-stride load */
+#define VSEV_FLOAT              __riscv_vse32_v_f32m2            /* unit-stride store */
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2x2      /* 2-field segment load: splits interleaved pairs into two vectors */
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2      /* 2-field segment store: interleaves two vectors back into pairs */
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m2           /* broadcast a scalar into a vector */
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m2          /* acc += scalar * vector */
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f32m2         /* acc -= scalar * vector */
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f32m2           /* vector * scalar */
+#else                                                             /* double precision: the same aliases with 64-bit elements */
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_V_T               vfloat64m2_t
+#define FLOAT_VX2_T             vfloat64m2x2_t
+#define VGET_VX2                __riscv_vget_v_f64m2x2_f64m2
+#define VSET_VX2                __riscv_vset_v_f64m2_f64m2x2
+#define VLEV_FLOAT              __riscv_vle64_v_f64m2
+#define VSEV_FLOAT              __riscv_vse64_v_f64m2
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m2
+#define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m2
+#define VFNMSACVF_FLOAT         __riscv_vfnmsac_vf_f64m2
+#define VFMULVF_FLOAT           __riscv_vfmul_vf_f64m2
+#endif                                                            /* !defined(DOUBLE) */
+
+#if defined(NN) || defined(NT) || defined(TN) || defined(TT)     /* acc += a * b; OP_<x><y> pairs component x of a (from ptrba) with component y of b (from ptrbb) */
+#define OP_rr       VFMACCVF_FLOAT      /* re += a_re * b_re */
+#define OP_ir       VFMACCVF_FLOAT      /* im += a_im * b_re */
+#define OP_ii       VFNMSACVF_FLOAT     /* re -= a_im * b_im */
+#define OP_ri       VFMACCVF_FLOAT      /* im += a_re * b_im */
+#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)   /* acc += a * conj(b) */
+#define OP_rr       VFMACCVF_FLOAT      /* re += a_re * b_re */
+#define OP_ir       VFMACCVF_FLOAT      /* im += a_im * b_re */
+#define OP_ii       VFMACCVF_FLOAT      /* re += a_im * b_im */
+#define OP_ri       VFNMSACVF_FLOAT     /* im -= a_re * b_im */
+#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)   /* acc += conj(a) * b */
+#define OP_rr       VFMACCVF_FLOAT      /* re += a_re * b_re */
+#define OP_ir       VFNMSACVF_FLOAT     /* im -= a_im * b_re */
+#define OP_ii       VFMACCVF_FLOAT      /* re += a_im * b_im */
+#define OP_ri       VFMACCVF_FLOAT      /* im += a_re * b_im */
+#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)   /* acc += conj(a) * conj(b) */
+#define OP_rr       VFMACCVF_FLOAT      /* re += a_re * b_re */
+#define OP_ir       VFNMSACVF_FLOAT     /* im -= a_im * b_re */
+#define OP_ii       VFNMSACVF_FLOAT     /* re -= a_im * b_im */
+#define OP_ri       VFNMSACVF_FLOAT     /* im -= a_re * b_im */
+#endif                                  /* no fallback: one of the sixteen variant macros must be defined */
+
+int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,FLOAT* C, BLASLONG ldc, BLASLONG offset)
+{   // Complex TRMM micro-kernel: overwrites a bm x bn tile of C with (alphar + i*alphai) * sum over depth of a*b (conjugation per OP_*); always returns 0.
+    BLASLONG i,j,k;                             // i: rows left in the panel, j: 4-column panels left, k: depth steps left
+    FLOAT *C0, *C1, *C2, *C3, *ptrba,*ptrbb;    // C0..C3: output columns of the current panel; ptrba/ptrbb: read cursors into ba/bb
+	BLASLONG off, temp;                         // off: running offset derived from `offset`; temp: depth steps the current tile accumulates
+    // bm, bn, bk and ldc count complex elements; ba, bb and C hold interleaved (re, im) FLOAT pairs.
+#if defined(TRMMKERNEL) && !defined(LEFT)
+	off = -offset;                              // this variant advances off by the panel width after every column panel
+#else
+	off = 0;
+#endif
+
+    FLOAT_VX2_T vax2;                           // tuple used for segment loads from ba and segment stores to C
+    FLOAT_V_T va0, va1, va2, va3, va4, va5, va6, va7;                   // A values: even index = real parts, odd = imaginary parts; va0..va3 are reused for scaled results
+    FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7;   // accumulators, one (real, imag) pair per column: even = real, odd = imag
+
+    //fprintf(stderr, "%s, bn=%ld bm=%ld bk=%ld alphar=%f alphai=%f ldc=%ld, offset=%ld\n", __FUNCTION__, bn, bm, bk, alphar, alphai, ldc, offset); // Debug
+
+    size_t vl;                                  // rows handled per vector operation, chosen by VSETVL for each strip
+    for (j = bn/4; j > 0; j--)                  // full panels of 4 columns
+    {
+        C0 = C;
+        C1 = C0 + 2 * ldc;                      // consecutive columns are ldc complex elements (2*ldc FLOATs) apart
+        C2 = C1 + 2 * ldc;
+        C3 = C2 + 2 * ldc;
+#if defined(TRMMKERNEL) && defined(LEFT)
+        off = offset;                           // this variant restarts off for every column panel and advances it per row strip
+#endif
+        ptrba = ba;
+        for (i = bm; i > 0; i -= vl)            // strips of vl rows
+        {
+            vl = VSETVL(i);
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*vl*2;                  // skip the first `off` depth steps of A (vl complex values each)
+            ptrbb = bb + off*4*2;               // ... and of B (4 complex values each)
+#endif
+
+            vres0 = VFMVVF_FLOAT(0.0, vl);      // clear all four (real, imag) accumulator pairs
+            vres1 = VFMVVF_FLOAT(0.0, vl);
+            vres2 = VFMVVF_FLOAT(0.0, vl);
+            vres3 = VFMVVF_FLOAT(0.0, vl);
+            vres4 = VFMVVF_FLOAT(0.0, vl);
+            vres5 = VFMVVF_FLOAT(0.0, vl);
+            vres6 = VFMVVF_FLOAT(0.0, vl);
+            vres7 = VFMVVF_FLOAT(0.0, vl);
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;                      // every depth step after the skipped ones
+#elif defined(LEFT)
+            temp = off+vl;  // number of values in A
+#else
+            temp = off+4;   // number of values in B
+#endif
+
+            for (k = temp/4; k > 0; k--)        // depth loop unrolled 4x; each A load is issued before the FMAs of the previous step
+            {
+                vax2 = VLSEG2_FLOAT(ptrba, vl); // de-interleave vl complex values of A for step 1
+                va0 = VGET_VX2(vax2, 0);        // real parts
+                va1 = VGET_VX2(vax2, 1);        // imaginary parts
+                ptrba += vl*2;
+
+                vax2 = VLSEG2_FLOAT(ptrba, vl); // A for step 2
+                va2 = VGET_VX2(vax2, 0);
+                va3 = VGET_VX2(vax2, 1);
+                ptrba += vl*2;
+
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);   // step 1, column 0: b = ptrbb[0] + i*ptrbb[1]
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va1, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va1, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va0, vl);
+
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va0, vl);   // step 1, column 1 (b at ptrbb[2..3])
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va1, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va1, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va0, vl);
+
+                vres4 =  OP_rr(vres4, *(ptrbb + 4), va0, vl);   // step 1, column 2 (b at ptrbb[4..5])
+                vres5 =  OP_ir(vres5, *(ptrbb + 4), va1, vl);
+                vres4 =  OP_ii(vres4, *(ptrbb + 5), va1, vl);
+                vres5 =  OP_ri(vres5, *(ptrbb + 5), va0, vl);
+
+                vres6 =  OP_rr(vres6, *(ptrbb + 6), va0, vl);   // step 1, column 3 (b at ptrbb[6..7])
+                vres7 =  OP_ir(vres7, *(ptrbb + 6), va1, vl);
+                vres6 =  OP_ii(vres6, *(ptrbb + 7), va1, vl);
+                vres7 =  OP_ri(vres7, *(ptrbb + 7), va0, vl);
+
+                ptrbb += 8;                     // next depth step of B: 4 complex values
+
+                vax2 = VLSEG2_FLOAT(ptrba, vl); // A for step 3
+                va4 = VGET_VX2(vax2, 0);
+                va5 = VGET_VX2(vax2, 1);
+                ptrba += vl*2;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va2, vl);   // step 2 (A in va2/va3), columns 0..3 in the same order
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va3, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va3, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va2, vl);
+                
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va2, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va3, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va3, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va2, vl);
+                
+                vres4 =  OP_rr(vres4, *(ptrbb + 4), va2, vl);
+                vres5 =  OP_ir(vres5, *(ptrbb + 4), va3, vl);
+                vres4 =  OP_ii(vres4, *(ptrbb + 5), va3, vl);
+                vres5 =  OP_ri(vres5, *(ptrbb + 5), va2, vl);
+                
+                vres6 =  OP_rr(vres6, *(ptrbb + 6), va2, vl);
+                vres7 =  OP_ir(vres7, *(ptrbb + 6), va3, vl);
+                vres6 =  OP_ii(vres6, *(ptrbb + 7), va3, vl);
+                vres7 =  OP_ri(vres7, *(ptrbb + 7), va2, vl);
+                
+                ptrbb += 8;
+
+                vax2 = VLSEG2_FLOAT(ptrba, vl); // A for step 4
+                va6 = VGET_VX2(vax2, 0);
+                va7 = VGET_VX2(vax2, 1);
+                ptrba += vl*2;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va4, vl);   // step 3 (A in va4/va5)
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va5, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va5, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va4, vl);
+                
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va4, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va5, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va5, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va4, vl);
+                
+                vres4 =  OP_rr(vres4, *(ptrbb + 4), va4, vl);
+                vres5 =  OP_ir(vres5, *(ptrbb + 4), va5, vl);
+                vres4 =  OP_ii(vres4, *(ptrbb + 5), va5, vl);
+                vres5 =  OP_ri(vres5, *(ptrbb + 5), va4, vl);
+                
+                vres6 =  OP_rr(vres6, *(ptrbb + 6), va4, vl);
+                vres7 =  OP_ir(vres7, *(ptrbb + 6), va5, vl);
+                vres6 =  OP_ii(vres6, *(ptrbb + 7), va5, vl);
+                vres7 =  OP_ri(vres7, *(ptrbb + 7), va4, vl);
+                
+                ptrbb += 8;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va6, vl);   // step 4 (A in va6/va7)
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va7, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va7, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va6, vl);
+                
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va6, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va7, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va7, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va6, vl);
+                
+                vres4 =  OP_rr(vres4, *(ptrbb + 4), va6, vl);
+                vres5 =  OP_ir(vres5, *(ptrbb + 4), va7, vl);
+                vres4 =  OP_ii(vres4, *(ptrbb + 5), va7, vl);
+                vres5 =  OP_ri(vres5, *(ptrbb + 5), va6, vl);
+                
+                vres6 =  OP_rr(vres6, *(ptrbb + 6), va6, vl);
+                vres7 =  OP_ir(vres7, *(ptrbb + 6), va7, vl);
+                vres6 =  OP_ii(vres6, *(ptrbb + 7), va7, vl);
+                vres7 =  OP_ri(vres7, *(ptrbb + 7), va6, vl);
+                
+                ptrbb += 8;
+            }
+
+            for (k = temp & 3; k > 0; k--)      // remaining (temp & 3) depth steps, one at a time
+            {
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va0 = VGET_VX2(vax2, 0);
+                va1 = VGET_VX2(vax2, 1);
+                ptrba += vl*2;
+
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va1, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va1, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va0, vl);
+
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va0, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va1, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va1, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va0, vl);
+
+                vres4 =  OP_rr(vres4, *(ptrbb + 4), va0, vl);
+                vres5 =  OP_ir(vres5, *(ptrbb + 4), va1, vl);
+                vres4 =  OP_ii(vres4, *(ptrbb + 5), va1, vl);
+                vres5 =  OP_ri(vres5, *(ptrbb + 5), va0, vl);
+
+                vres6 =  OP_rr(vres6, *(ptrbb + 6), va0, vl);
+                vres7 =  OP_ir(vres7, *(ptrbb + 6), va1, vl);
+                vres6 =  OP_ii(vres6, *(ptrbb + 7), va1, vl);
+                vres7 =  OP_ri(vres7, *(ptrbb + 7), va0, vl);
+
+                ptrbb += 8;
+            }
+            va0 =  VFMULVF_FLOAT(vres0, alphar, vl);        // column 0 times alpha: re  = acc_re * alphar
+            va1 =  VFMULVF_FLOAT(vres1, alphar, vl);        //                       im  = acc_im * alphar
+            va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl);  //                       re -= alphai * acc_im
+            va1 =  VFMACCVF_FLOAT(va1, alphai, vres0, vl);  //                       im += alphai * acc_re
+            
+            vax2 = VSET_VX2(vax2, 0, va0);      // pack (re, im) into the tuple
+            vax2 = VSET_VX2(vax2, 1, va1);
+            VSSEG2_FLOAT(C0, vax2, vl);         // interleaved store; C is written without being read first
+
+            va2 =  VFMULVF_FLOAT(vres2, alphar, vl);        // same alpha scaling and store for column 1
+            va3 =  VFMULVF_FLOAT(vres3, alphar, vl);
+            va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl);
+            va3 =  VFMACCVF_FLOAT(va3, alphai, vres2, vl);
+
+            vax2 = VSET_VX2(vax2, 0, va2);
+            vax2 = VSET_VX2(vax2, 1, va3);
+            VSSEG2_FLOAT(C1, vax2, vl);
+
+            va0 =  VFMULVF_FLOAT(vres4, alphar, vl);        // column 2
+            va1 =  VFMULVF_FLOAT(vres5, alphar, vl);
+            va0 = VFNMSACVF_FLOAT(va0, alphai, vres5, vl);
+            va1 =  VFMACCVF_FLOAT(va1, alphai, vres4, vl);
+
+            vax2 = VSET_VX2(vax2, 0, va0);
+            vax2 = VSET_VX2(vax2, 1, va1);
+            VSSEG2_FLOAT(C2, vax2, vl);
+
+            va2 =  VFMULVF_FLOAT(vres6, alphar, vl);        // column 3
+            va3 =  VFMULVF_FLOAT(vres7, alphar, vl);
+            va2 = VFNMSACVF_FLOAT(va2, alphai, vres7, vl);
+            va3 =  VFMACCVF_FLOAT(va3, alphai, vres6, vl);
+
+            vax2 = VSET_VX2(vax2, 0, va2);
+            vax2 = VSET_VX2(vax2, 1, va3);
+            VSSEG2_FLOAT(C3, vax2, vl);
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;                    // depth steps this tile did not consume ...
+#ifdef LEFT
+            temp -= vl; // number of values in A
+#else
+            temp -= 4; // number of values in B
+#endif
+            ptrba += temp*vl*2;                 // ... are skipped, so ptrba ends bk steps past the start of this strip
+            ptrbb += temp*4*2;
+#endif
+
+#ifdef LEFT
+            off += vl; // number of values in A
+#endif
+
+            C0 += vl * 2;                       // move all four column cursors down to the next strip of rows
+            C1 += vl * 2;
+            C2 += vl * 2;
+            C3 += vl * 2;
+        }
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 4;
+#endif
+
+        bb += (bk << 3);                        // next B panel: bk steps * 4 columns * 2 FLOATs
+        C  += (ldc << 3);                       // next C panel: 4 columns * 2*ldc FLOATs
+    }
+
+    if (bn & 2)                                 // one panel of 2 columns: same scheme with two accumulator pairs
+    {
+        C0 = C;
+        C1 = C0 + 2 * ldc;
+#if defined(TRMMKERNEL) && defined(LEFT)
+        off = offset;
+#endif
+        ptrba = ba;
+        for (i = bm; i > 0; i -= vl)
+        {
+            vl = VSETVL(i);
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*vl*2;
+            ptrbb = bb + off*2*2;               // B holds 2 complex values per depth step in this panel
+#endif
+
+            vres0 = VFMVVF_FLOAT(0.0, vl);
+            vres1 = VFMVVF_FLOAT(0.0, vl);
+            vres2 = VFMVVF_FLOAT(0.0, vl);
+            vres3 = VFMVVF_FLOAT(0.0, vl);
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+vl;  // number of values in A
+#else
+            temp = off+2;   // number of values in B
+#endif
+            for (k = temp/4; k > 0; k--)        // depth loop unrolled 4x, A loads issued one step ahead
+            {
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va0 = VGET_VX2(vax2, 0);
+                va1 = VGET_VX2(vax2, 1);
+                ptrba += vl*2;
+
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va2 = VGET_VX2(vax2, 0);
+                va3 = VGET_VX2(vax2, 1);
+                ptrba += vl*2;
+
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);   // step 1, column 0
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va1, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va1, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va0, vl);
+
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va0, vl);   // step 1, column 1
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va1, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va1, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va0, vl);
+
+                ptrbb += 4;
+
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va4 = VGET_VX2(vax2, 0);
+                va5 = VGET_VX2(vax2, 1);
+                ptrba += vl*2;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va2, vl);   // step 2
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va3, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va3, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va2, vl);
+                
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va2, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va3, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va3, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va2, vl);
+
+                ptrbb += 4;
+
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va6 = VGET_VX2(vax2, 0);
+                va7 = VGET_VX2(vax2, 1);
+                ptrba += vl*2;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va4, vl);   // step 3
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va5, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va5, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va4, vl);
+                
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va4, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va5, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va5, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va4, vl);
+                
+                ptrbb += 4;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va6, vl);   // step 4
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va7, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va7, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va6, vl);
+                
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va6, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va7, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va7, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va6, vl);
+                
+                ptrbb += 4;
+            }
+
+            for (k = temp & 3; k > 0; k--)      // remaining (temp & 3) depth steps
+            {
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va0 = VGET_VX2(vax2, 0);
+                va1 = VGET_VX2(vax2, 1);
+                ptrba += vl*2;
+
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va1, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va1, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va0, vl);
+
+                vres2 =  OP_rr(vres2, *(ptrbb + 2), va0, vl);
+                vres3 =  OP_ir(vres3, *(ptrbb + 2), va1, vl);
+                vres2 =  OP_ii(vres2, *(ptrbb + 3), va1, vl);
+                vres3 =  OP_ri(vres3, *(ptrbb + 3), va0, vl);
+
+                ptrbb += 4;
+            }
+
+            va0 =  VFMULVF_FLOAT(vres0, alphar, vl);        // scale column 0 by alpha and store it
+            va1 =  VFMULVF_FLOAT(vres1, alphar, vl);
+            va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl);
+            va1 =  VFMACCVF_FLOAT(va1, alphai, vres0, vl);
+            
+            vax2 = VSET_VX2(vax2, 0, va0);
+            vax2 = VSET_VX2(vax2, 1, va1);
+            VSSEG2_FLOAT(C0, vax2, vl);
+
+            va2 =  VFMULVF_FLOAT(vres2, alphar, vl);        // scale column 1 by alpha and store it
+            va3 =  VFMULVF_FLOAT(vres3, alphar, vl);
+            va2 = VFNMSACVF_FLOAT(va2, alphai, vres3, vl);
+            va3 =  VFMACCVF_FLOAT(va3, alphai, vres2, vl);
+
+            vax2 = VSET_VX2(vax2, 0, va2);
+            vax2 = VSET_VX2(vax2, 1, va3);
+            VSSEG2_FLOAT(C1, vax2, vl);
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= vl; // number of values in A
+#else
+            temp -= 2; // number of values in B
+#endif
+            ptrba += temp*vl*2;                 // skip the depth steps this tile did not consume
+            ptrbb += temp*2*2;
+#endif
+
+#ifdef LEFT
+            off += vl; // number of values in A
+#endif
+            C0 += vl * 2;
+            C1 += vl * 2;
+        }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 2;
+#endif
+        bb += (bk << 2);                        // bk steps * 2 columns * 2 FLOATs
+        C  += (ldc << 2);                       // 2 columns * 2*ldc FLOATs
+    }
+
+    if (bn & 1)                                 // final single column: one accumulator pair
+    {
+        C0 = C;
+#if defined(TRMMKERNEL) &&  defined(LEFT)
+        off = offset;
+#endif
+        ptrba = ba;
+        for (i = bm; i > 0; i -= vl)
+        {
+            vl = VSETVL(i);
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            ptrbb = bb;
+#else
+            ptrba += off*vl*2;
+            ptrbb = bb + off*2;                 // B holds 1 complex value per depth step in this panel
+#endif
+
+            vres0 = VFMVVF_FLOAT(0.0, vl);
+            vres1 = VFMVVF_FLOAT(0.0, vl);
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+            temp = bk-off;
+#elif defined(LEFT)
+            temp = off+vl;  // number of values in A
+#else
+            temp = off+1;   // number of values in B
+#endif
+            for (k = temp/4; k > 0; k--)        // depth loop unrolled 4x, A loads issued one step ahead
+            {
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va0 = VGET_VX2(vax2, 0);
+                va1 = VGET_VX2(vax2, 1);
+                ptrba += vl*2;
+
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va2 = VGET_VX2(vax2, 0);
+                va3 = VGET_VX2(vax2, 1);
+                ptrba += vl*2;
+
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);   // step 1
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va1, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va1, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va0, vl);
+
+                ptrbb += 2;
+
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va4 = VGET_VX2(vax2, 0);
+                va5 = VGET_VX2(vax2, 1);
+                ptrba += vl*2;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va2, vl);   // step 2
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va3, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va3, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va2, vl);
+
+                ptrbb += 2;
+
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va6 = VGET_VX2(vax2, 0);
+                va7 = VGET_VX2(vax2, 1);
+                ptrba += vl*2;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va4, vl);   // step 3
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va5, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va5, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va4, vl);
+                
+                ptrbb += 2;
+                
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va6, vl);   // step 4
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va7, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va7, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va6, vl);
+                
+                ptrbb += 2;
+            }
+
+            for (k = temp & 3; k > 0; k--)      // remaining (temp & 3) depth steps
+            {
+                vax2 = VLSEG2_FLOAT(ptrba, vl);
+                va0 = VGET_VX2(vax2, 0);
+                va1 = VGET_VX2(vax2, 1);
+                ptrba += vl*2;
+
+                vres0 =  OP_rr(vres0, *(ptrbb + 0), va0, vl);
+                vres1 =  OP_ir(vres1, *(ptrbb + 0), va1, vl);
+                vres0 =  OP_ii(vres0, *(ptrbb + 1), va1, vl);
+                vres1 =  OP_ri(vres1, *(ptrbb + 1), va0, vl);
+
+                ptrbb += 2;
+            }
+
+            va0 =  VFMULVF_FLOAT(vres0, alphar, vl);        // scale the column by alpha and store it
+            va1 =  VFMULVF_FLOAT(vres1, alphar, vl);
+            va0 = VFNMSACVF_FLOAT(va0, alphai, vres1, vl);
+            va1 =  VFMACCVF_FLOAT(va1, alphai, vres0, vl);
+            
+            vax2 = VSET_VX2(vax2, 0, va0);
+            vax2 = VSET_VX2(vax2, 1, va1);
+            VSSEG2_FLOAT(C0, vax2, vl);
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+            temp = bk - off;
+#ifdef LEFT
+            temp -= vl; // number of values in A
+#else
+            temp -= 1; // number of values in B
+#endif
+            ptrba += temp*vl*2;                 // skip the depth steps this tile did not consume
+            ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+            off += vl; // number of values in A
+#endif
+            C0 += vl * 2;
+        }
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        off += 1;
+#endif
+        bb += bk << 1;                          // bk steps * 1 column * 2 FLOATs
+        C  += ldc << 1;                         // 1 column * 2*ldc FLOATs
+   }
+   return 0;
+}
diff --git a/kernel/riscv64/ztrsm_lncopy_rvv_v1.c b/kernel/riscv64/ztrsm_lncopy_rvv_v1.c
new file mode 100644
index 000000000..36cec711d
--- /dev/null
+++ b/kernel/riscv64/ztrsm_lncopy_rvv_v1.c
@@ -0,0 +1,115 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE)  /* DOUBLE undefined: bind the macros to the e32 / f32 / u32 "m2" intrinsics and types */
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)  /* used to pick vl from the remaining count */
+#define FLOAT_VX2_T             vfloat32m2x2_t  /* type of the two-field value moved by the segment load/store */
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2x2  /* strided segment load (called with a byte stride) */
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2  /* segment store of all vl lanes */
+#define VSSEG2_FLOAT_M          __riscv_vsseg2e32_v_f32m2x2_m  /* segment store under a mask */
+#define VBOOL_T                 vbool16_t  /* mask type produced by the compare below */
+#define UINT_V_T                vuint32m2_t  /* type of the lane-index vector */
+#define VID_V_UINT              __riscv_vid_v_u32m2  /* produces the lane-index vector */
+#define VMSLTU_VX_UINT          __riscv_vmsltu_vx_u32m2_b16  /* mask of lanes whose index is below a scalar */
+#else  /* DOUBLE defined: same macro roles, bound to the e64 / f64 / u64 variants */
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_VX2_T             vfloat64m2x2_t
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT_M          __riscv_vsseg2e64_v_f64m2x2_m
+#define VBOOL_T                 vbool32_t
+#define UINT_V_T                vuint64m2_t
+#define VID_V_UINT              __riscv_vid_v_u64m2
+#define VMSLTU_VX_UINT          __riscv_vmsltu_vx_u64m2_b32
+
+#endif  /* !defined(DOUBLE) */
+
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+    // Packs complex matrix a (interleaved re/im FLOAT pairs) into b in panels of vl lanes, keeping only one side of the diagonal.
+    //fprintf(stderr, "%s , %s, m = %4ld  n = %4ld  lda = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, lda, offset); // Debug
+    // m is walked with unit stride in a, n with stride lda (both in complex elements); the function always returns 0.
+    BLASLONG i, ii, jj, js;
+
+    FLOAT *ao;
+
+    jj = offset;  // position along m at which the current panel's diagonal block begins
+
+    BLASLONG stride_lda = sizeof(FLOAT)*lda*2;  // byte distance between lanes of the strided loads (lda complex elements)
+
+    FLOAT_VX2_T vax2;
+    VBOOL_T vbool_cmp;
+    UINT_V_T vindex;
+    size_t vl;
+
+    for (js = n; js > 0; js -= vl)  // one panel of vl lanes along n per iteration
+    {
+        vl = VSETVL(js);  // lane count for this panel, derived from the remaining count js
+        ao = a;
+
+        ii = 0;  // ii is advanced in lockstep with i
+        for (i = 0; i < m;)
+        {
+            if (ii == jj)  // diagonal block: vl consecutive positions along m are handled in one go
+            {
+                vindex  = VID_V_UINT(vl);  // lane indices 0 .. vl-1
+                for (unsigned int j = 0; j < vl; j++) 
+                {
+                    vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl);
+                    vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl);  // select lanes with index < j
+                    VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl);  // only the selected lanes are stored to b
+                    // Slot j of b is handed to compinv() with the re/im parts of lane j; compinv is defined outside this view (presumably stores a complex reciprocal - confirm).
+                    compinv((b + j * 2), *(ao + j * lda * 2), *(ao + j * lda * 2 + 1));
+                    ao  += 2;
+                    b   += vl * 2;
+                }
+                i += vl;
+                ii += vl;
+            }
+            else
+            {
+                if (ii > jj)  // past the diagonal block: copy all vl lanes
+                {
+                    vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl);
+                    VSSEG2_FLOAT(b, vax2, vl);
+                }
+                ao  += 2;  // both pointers advance even when nothing was stored (ii < jj)
+                b   += vl * 2;
+                i++;
+                ii++;
+            }
+        }
+
+        a += vl * lda * 2;  // next panel: vl steps of lda complex elements
+        jj += vl;  // the diagonal block of the next panel starts vl positions further along m
+    }
+
+    return 0;
+}
diff --git a/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c b/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c
new file mode 100644
index 000000000..3a7bdb522
--- /dev/null
+++ b/kernel/riscv64/ztrsm_ltcopy_rvv_v1.c
@@ -0,0 +1,114 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE)  /* DOUBLE undefined: bind the macros to the e32 / f32 / u32 "m2" intrinsics and types */
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)  /* used to pick vl from the remaining count */
+#define FLOAT_VX2_T             vfloat32m2x2_t  /* type of the two-field value moved by the segment load/store */
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2x2  /* segment load without a stride argument */
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2  /* segment store of all vl lanes */
+#define VSSEG2_FLOAT_M          __riscv_vsseg2e32_v_f32m2x2_m  /* segment store under a mask */
+#define VBOOL_T                 vbool16_t  /* mask type produced by the compare below */
+#define UINT_V_T                vuint32m2_t  /* type of the lane-index vector */
+#define VID_V_UINT              __riscv_vid_v_u32m2  /* produces the lane-index vector */
+#define VMSGTU_VX_UINT          __riscv_vmsgtu_vx_u32m2_b16  /* mask of lanes whose index is above a scalar */
+#else  /* DOUBLE defined: same macro roles, bound to the e64 / f64 / u64 variants */
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_VX2_T             vfloat64m2x2_t
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT_M          __riscv_vsseg2e64_v_f64m2x2_m
+#define VBOOL_T                 vbool32_t
+#define UINT_V_T                vuint64m2_t
+#define VID_V_UINT              __riscv_vid_v_u64m2
+#define VMSGTU_VX_UINT          __riscv_vmsgtu_vx_u64m2_b32
+#endif  /* !defined(DOUBLE) */
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+    // Packs complex matrix a (interleaved re/im FLOAT pairs) into b in panels of vl lanes, keeping only one side of the diagonal.
+    //fprintf(stderr, "%s , %s, m = %4ld  n = %4ld  lda = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, lda, offset); // Debug
+    // m is walked with stride lda in a, n with unit stride (both in complex elements); the function always returns 0.
+    BLASLONG i, ii, jj, js;
+
+    FLOAT *ao;
+
+    jj = offset;  // position along m at which the current panel's diagonal block begins
+
+    FLOAT_VX2_T vax2;
+    VBOOL_T vbool_cmp;
+    UINT_V_T vindex;
+
+    size_t vl;
+
+    for (js = n; js > 0; js -= vl)  // one panel of vl lanes along n per iteration
+    {
+        vl = VSETVL(js);  // lane count for this panel, derived from the remaining count js
+        ao = a;
+
+        ii = 0;  // ii is advanced in lockstep with i
+        for (i = 0; i < m;)
+        {
+            // Three cases: at the diagonal block (ii == jj), before it (ii < jj, copied), after it (skipped).
+            if (ii == jj) 
+            {
+                vindex  = VID_V_UINT(vl);  // lane indices 0 .. vl-1
+                for (unsigned int j = 0; j < vl; j++) 
+                {
+                    compinv((b + j * 2), *(ao + j * 2), *(ao + j * 2 + 1));
+                    // compinv above gets slot j of b and the re/im parts of lane j; it is defined outside this view (presumably stores a complex reciprocal - confirm).
+                    vax2 = VLSEG2_FLOAT(ao, vl);
+                    vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl);  // select lanes with index > j
+                    VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl);  // only the selected lanes are stored to b
+
+                    b   += vl * 2;
+                    ao  += lda * 2;
+                }
+                i += vl;
+                ii += vl;
+            }
+            else 
+            {
+                if (ii < jj)  // before the diagonal block: copy all vl lanes
+                {
+                    vax2 = VLSEG2_FLOAT(ao, vl);
+                    VSSEG2_FLOAT(b, vax2, vl);
+                }
+                ao  += lda * 2;  // both pointers advance even when nothing was stored (ii > jj)
+                b   += vl * 2;
+                i ++;
+                ii ++;
+            }
+        }
+
+        a += vl * 2;  // next panel: vl complex elements further along the unit-stride direction
+        jj += vl;  // the diagonal block of the next panel starts vl positions further along m
+    }
+    return 0;
+}
+
diff --git a/kernel/riscv64/ztrsm_uncopy_rvv_v1.c b/kernel/riscv64/ztrsm_uncopy_rvv_v1.c
new file mode 100644
index 000000000..2a158d4de
--- /dev/null
+++ b/kernel/riscv64/ztrsm_uncopy_rvv_v1.c
@@ -0,0 +1,113 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE)  /* DOUBLE undefined: bind the macros to the e32 / f32 / u32 "m2" intrinsics and types */
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)  /* used to pick vl from the remaining count */
+#define FLOAT_VX2_T             vfloat32m2x2_t  /* type of the two-field value moved by the segment load/store */
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e32_v_f32m2x2  /* strided segment load (called with a byte stride) */
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2  /* segment store of all vl lanes */
+#define VSSEG2_FLOAT_M          __riscv_vsseg2e32_v_f32m2x2_m  /* segment store under a mask */
+#define VBOOL_T                 vbool16_t  /* mask type produced by the compare below */
+#define UINT_V_T                vuint32m2_t  /* type of the lane-index vector */
+#define VID_V_UINT              __riscv_vid_v_u32m2  /* produces the lane-index vector */
+#define VMSGTU_VX_UINT          __riscv_vmsgtu_vx_u32m2_b16  /* mask of lanes whose index is above a scalar */
+#else  /* DOUBLE defined: same macro roles, bound to the e64 / f64 / u64 variants */
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_VX2_T             vfloat64m2x2_t
+#define VLSSEG2_FLOAT           __riscv_vlsseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT_M          __riscv_vsseg2e64_v_f64m2x2_m
+#define VBOOL_T                 vbool32_t
+#define UINT_V_T                vuint64m2_t
+#define VID_V_UINT              __riscv_vid_v_u64m2
+#define VMSGTU_VX_UINT          __riscv_vmsgtu_vx_u64m2_b32
+#endif  /* !defined(DOUBLE) */
+
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+    // Packs complex matrix a (interleaved re/im FLOAT pairs) into b in panels of vl lanes, keeping only one side of the diagonal.
+    //fprintf(stderr, "%s , %s, m = %4ld  n = %4ld  lda = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, lda, offset); // Debug
+    // m is walked with unit stride in a, n with stride lda (both in complex elements); the function always returns 0.
+    BLASLONG i, ii, jj, js;
+    BLASLONG stride_lda = sizeof(FLOAT)*lda*2;  // byte distance between lanes of the strided loads (lda complex elements)
+
+    FLOAT *ao;
+    jj = offset;  // position along m at which the current panel's diagonal block begins
+
+    FLOAT_VX2_T vax2;
+    VBOOL_T vbool_cmp;
+    UINT_V_T vindex;
+
+    size_t vl;
+
+    for (js = n; js > 0; js -= vl)  // one panel of vl lanes along n per iteration
+    {
+        vl = VSETVL(js);  // lane count for this panel, derived from the remaining count js
+        ao = a;
+
+        i = 0;  // redundant: the for-initializer two lines below resets i as well
+        ii = 0;  // ii is advanced in lockstep with i
+        for (i = 0; i < m;)
+        {
+            if (ii == jj)  // diagonal block: vl consecutive positions along m are handled in one go
+            {
+                vindex  = VID_V_UINT(vl);  // lane indices 0 .. vl-1
+                for (unsigned int j = 0; j < vl; j++) 
+                {
+                    compinv((b + j * 2), *(ao + j * lda * 2), *(ao + j * lda * 2 + 1));  // slot j of b + re/im of lane j; compinv is defined outside this view (presumably a complex reciprocal - confirm)
+                    vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl);
+                    vbool_cmp = VMSGTU_VX_UINT(vindex, j, vl);  // select lanes with index > j
+                    VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl);  // only the selected lanes are stored to b
+                    ao  += 2;
+                    b   += vl * 2;
+                }
+                i += vl;
+                ii += vl;
+            } 
+            else
+            {
+                if (ii < jj)  // before the diagonal block: copy all vl lanes
+                {
+                    vax2 = VLSSEG2_FLOAT(ao, stride_lda, vl);
+                    VSSEG2_FLOAT(b, vax2, vl);
+                }
+                ao  += 2;  // both pointers advance even when nothing was stored (ii > jj)
+                b   += vl * 2;
+                i++;
+                ii++;
+            }
+        } 
+
+        a += vl * lda * 2;  // next panel: vl steps of lda complex elements
+        jj += vl;  // the diagonal block of the next panel starts vl positions further along m
+    }
+    return 0;
+}
diff --git a/kernel/riscv64/ztrsm_utcopy_rvv_v1.c b/kernel/riscv64/ztrsm_utcopy_rvv_v1.c
new file mode 100644
index 000000000..4b3319588
--- /dev/null
+++ b/kernel/riscv64/ztrsm_utcopy_rvv_v1.c
@@ -0,0 +1,115 @@
+/***************************************************************************
+Copyright (c) 2022, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#if !defined(DOUBLE)  /* DOUBLE undefined: bind the macros to the e32 / f32 / u32 "m2" intrinsics and types */
+#define VSETVL(n)               __riscv_vsetvl_e32m2(n)  /* used to pick vl from the remaining count */
+#define FLOAT_VX2_T             vfloat32m2x2_t  /* type of the two-field value moved by the segment load/store */
+#define VLSEG2_FLOAT            __riscv_vlseg2e32_v_f32m2x2  /* segment load without a stride argument */
+#define VSSEG2_FLOAT            __riscv_vsseg2e32_v_f32m2x2  /* segment store of all vl lanes */
+#define VSSEG2_FLOAT_M          __riscv_vsseg2e32_v_f32m2x2_m  /* segment store under a mask */
+#define VBOOL_T                 vbool16_t  /* mask type produced by the compare below */
+#define UINT_V_T                vuint32m2_t  /* type of the lane-index vector */
+#define VID_V_UINT              __riscv_vid_v_u32m2  /* produces the lane-index vector */
+#define VMSLTU_VX_UINT          __riscv_vmsltu_vx_u32m2_b16  /* mask of lanes whose index is below a scalar */
+#else  /* DOUBLE defined: same macro roles, bound to the e64 / f64 / u64 variants */
+#define VSETVL(n)               __riscv_vsetvl_e64m2(n)
+#define FLOAT_VX2_T             vfloat64m2x2_t
+#define VLSEG2_FLOAT            __riscv_vlseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT            __riscv_vsseg2e64_v_f64m2x2
+#define VSSEG2_FLOAT_M          __riscv_vsseg2e64_v_f64m2x2_m
+#define VBOOL_T                 vbool32_t
+#define UINT_V_T                vuint64m2_t
+#define VID_V_UINT              __riscv_vid_v_u64m2
+#define VMSLTU_VX_UINT          __riscv_vmsltu_vx_u64m2_b32
+#endif  /* !defined(DOUBLE) */
+
+
+int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
+    // Packs complex matrix a (interleaved re/im FLOAT pairs) into b in panels of vl lanes, keeping only one side of the diagonal.
+    //fprintf(stderr, "%s , %s, m = %4ld  n = %4ld  lda = %4ld offset = %4ld\n", __FILE__, __FUNCTION__, m, n, lda, offset); // Debug
+    // m is walked with stride lda in a, n with unit stride (both in complex elements); the function always returns 0.
+    BLASLONG i, ii, jj, js;
+
+    FLOAT *ao;
+
+    jj = offset;  // position along m at which the current panel's diagonal block begins
+    FLOAT_VX2_T vax2;
+
+    VBOOL_T vbool_cmp;
+    UINT_V_T vindex;
+
+    size_t vl;
+  
+    for (js = n; js > 0; js -= vl)  // one panel of vl lanes along n per iteration
+    {
+        vl = VSETVL(js);  // lane count for this panel, derived from the remaining count js
+        ao = a;
+
+        ii = 0;  // ii is advanced in lockstep with i
+        for (i = 0; i < m;)
+        {
+            // Three cases: at the diagonal block (ii == jj), after it (ii > jj, copied), before it (skipped).
+            if (ii == jj) 
+            {
+                vindex  = VID_V_UINT(vl);  // lane indices 0 .. vl-1
+                for (unsigned int j = 0; j < vl; j++) 
+                {
+                    vax2 = VLSEG2_FLOAT(ao, vl);
+                    vbool_cmp = VMSLTU_VX_UINT(vindex, j, vl);  // select lanes with index < j
+                    VSSEG2_FLOAT_M(vbool_cmp, b, vax2, vl);  // only the selected lanes are stored to b
+                    // compinv below gets slot j of b and the re/im parts of lane j; it is defined outside this view (presumably stores a complex reciprocal - confirm).
+                    compinv((b + j * 2), *(ao + j * 2), *(ao + j * 2 + 1));
+
+                    ao  += lda * 2;
+                    b   += vl * 2;
+                }
+                i += vl;
+                ii += vl;
+            } 
+            else 
+            {
+                if (ii > jj)  // past the diagonal block: copy all vl lanes
+                {
+                    vax2 = VLSEG2_FLOAT(ao, vl);
+                    VSSEG2_FLOAT(b, vax2, vl);
+                }
+                ao  += lda * 2;  // both pointers advance even when nothing was stored (ii < jj)
+                b   += vl * 2;
+                i ++;
+                ii ++;
+            }
+        }
+
+        a += vl * 2;  // next panel: vl complex elements further along the unit-stride direction
+        jj += vl;  // the diagonal block of the next panel starts vl positions further along m
+    }
+
+    return 0;
+}
diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c
index 14a339e75..4c361f155 100644
--- a/kernel/setparam-ref.c
+++ b/kernel/setparam-ref.c
@@ -1,5 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* Copyright 2023 The OpenBLAS Project.                              */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@@ -49,7 +50,9 @@
 static void init_parameter(void);
 
 gotoblas_t TABLE_NAME = {
-  DTB_DEFAULT_ENTRIES ,
+  DTB_DEFAULT_ENTRIES,
+
+  SWITCH_RATIO,
 
   GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN,
 
@@ -63,6 +66,7 @@ gotoblas_t TABLE_NAME = {
 #endif
 
   SBGEMM_ALIGN_K,
+  0, // need_amxtile_permission
 
   sbstobf16_kTS, sbdtobf16_kTS, sbf16tos_kTS, dbf16tod_kTS,
 
@@ -1806,6 +1810,12 @@ static void init_parameter(void) {
 
 #endif
 
+#ifdef SAPPHIRERAPIDS
+#if (BUILD_BFLOAT16 == 1)
+  TABLE_NAME.need_amxtile_permission = 1;
+#endif
+#endif
+
 #if BUILD_COMPLEX==1
 #ifdef CGEMM3M_DEFAULT_P
   TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P;
diff --git a/kernel/sparc/cnrm2.S b/kernel/sparc/cnrm2.S
index 8dc4b56b6..0840c8848 100644
--- a/kernel/sparc/cnrm2.S
+++ b/kernel/sparc/cnrm2.S
@@ -76,7 +76,7 @@
 	FMOV	c1, t4
 
 	cmp	INCX, 0
-	ble	.LL20
+	beq	.LL20
 	sll	INCX, ZBASE_SHIFT, INCX
 
 	cmp	N, 0
diff --git a/kernel/sparc/dnrm2.S b/kernel/sparc/dnrm2.S
index cf7522953..41e993440 100644
--- a/kernel/sparc/dnrm2.S
+++ b/kernel/sparc/dnrm2.S
@@ -107,7 +107,7 @@
 	FMOV	fzero, c1
 
 	cmp	INCX, 0
-	ble	.LL99
+	beq	.LL99
 	sll	INCX, BASE_SHIFT, INCX
 
 	add	%sp, -8, %sp
diff --git a/kernel/sparc/snrm2.S b/kernel/sparc/snrm2.S
index a80247259..a7405b6e1 100644
--- a/kernel/sparc/snrm2.S
+++ b/kernel/sparc/snrm2.S
@@ -76,7 +76,7 @@
 	FMOV	c1, t4
 
 	cmp	INCX, 0
-	ble	.LL20
+	beq	.LL20
 	sll	INCX, BASE_SHIFT, INCX
 
 	cmp	N, 0
diff --git a/kernel/sparc/znrm2.S b/kernel/sparc/znrm2.S
index 065d22784..dae53ffe7 100644
--- a/kernel/sparc/znrm2.S
+++ b/kernel/sparc/znrm2.S
@@ -107,7 +107,7 @@
 	FMOV	fzero, c1
 
 	cmp	INCX, 0
-	ble	.LL99
+	beq	.LL99
 	sll	INCX, ZBASE_SHIFT, INCX
 
 	add	%sp, -8, %sp
diff --git a/kernel/x86/nrm2.S b/kernel/x86/nrm2.S
index 7a14da862..3a6417462 100644
--- a/kernel/x86/nrm2.S
+++ b/kernel/x86/nrm2.S
@@ -78,7 +78,7 @@
 	testl	M, M
 	jle	.L999
 	testl	INCX, INCX
-	jle	.L999
+	je	.L999
 
 	sall	$BASE_SHIFT, INCX
 	fldz
diff --git a/kernel/x86/nrm2_sse.S b/kernel/x86/nrm2_sse.S
index 0f174c408..129b41a03 100644
--- a/kernel/x86/nrm2_sse.S
+++ b/kernel/x86/nrm2_sse.S
@@ -69,7 +69,7 @@
 	jle	.L999
 	pxor	%xmm1, %xmm1
 	testl	INCX, INCX
-	jle	.L999
+	je	.L999
 
 	leal	(, INCX, SIZE), INCX
 	cmpl	$SIZE, INCX
diff --git a/kernel/x86/znrm2.S b/kernel/x86/znrm2.S
index 263612e9a..7a65df77a 100644
--- a/kernel/x86/znrm2.S
+++ b/kernel/x86/znrm2.S
@@ -78,7 +78,7 @@
 	testl	M, M
 	jle	.L999
 	testl	INCX, INCX
-	jle	.L999
+	je	.L999
 
 	sall	$ZBASE_SHIFT, INCX
 	fldz
diff --git a/kernel/x86/znrm2_sse.S b/kernel/x86/znrm2_sse.S
index bbc3677ae..4ad326120 100644
--- a/kernel/x86/znrm2_sse.S
+++ b/kernel/x86/znrm2_sse.S
@@ -69,7 +69,7 @@
 	jle	.L999
 	pxor	%xmm1, %xmm1
 	testl	INCX, INCX
-	jle	.L999
+	je	.L999
 
 	sall	$ZBASE_SHIFT, INCX
 
diff --git a/kernel/x86/zscal.S b/kernel/x86/zscal.S
index 1eb518563..adb56edb9 100644
--- a/kernel/x86/zscal.S
+++ b/kernel/x86/zscal.S
@@ -98,7 +98,7 @@
 	fcomip	%st(1), %st
 	ffreep	%st(0)
 	jne	.L30
-
+jp .L30
 	EMMS
 
 	pxor	%mm0, %mm0
diff --git a/kernel/x86/zscal_sse.S b/kernel/x86/zscal_sse.S
index e011c98f5..89e36251e 100644
--- a/kernel/x86/zscal_sse.S
+++ b/kernel/x86/zscal_sse.S
@@ -87,6 +87,7 @@
 	xorps	%xmm7, %xmm7
 	comiss	%xmm0, %xmm7
 	jne	.L100		# Alpha_r != ZERO
+	jp	.L100		# Alpha_r NaN
 
 	comiss	%xmm1, %xmm7
 	jne	.L100		# Alpha_i != ZERO
diff --git a/kernel/x86/zscal_sse2.S b/kernel/x86/zscal_sse2.S
index cc7ab6686..0bc61b209 100644
--- a/kernel/x86/zscal_sse2.S
+++ b/kernel/x86/zscal_sse2.S
@@ -98,6 +98,7 @@
 	xorps	%xmm7, %xmm7
 	comisd	%xmm0, %xmm7
 	jne	.L100
+	jp	.L100
 
 	comisd	%xmm1, %xmm7
 	jne	.L100
diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL
index bea7036c2..ec4290e82 100644
--- a/kernel/x86_64/KERNEL
+++ b/kernel/x86_64/KERNEL
@@ -405,7 +405,7 @@ DGEMVNKERNEL = dgemv_n.S
 endif
 
 ifndef DGEMVTKERNEL
-DGEMVTKERNEL = dgemv_t.S
+DGEMVTKERNEL = dgemv_t_4.c
 endif
 
 ifndef CGEMVNKERNEL
@@ -489,5 +489,7 @@ XGEMM3MKERNEL    =  xgemm3m_kernel_2x2.S
 
 SSUMKERNEL = ../arm/sum.c
 DSUMKERNEL = ../arm/sum.c
+CSUMKERNEL = zsum_sse.S
+ZSUMKERNEL = zsum_sse2.S
 
 SOMATCOPY_RT = omatcopy_rt.c
diff --git a/kernel/x86_64/KERNEL.COOPERLAKE b/kernel/x86_64/KERNEL.COOPERLAKE
index dba94aea8..22b042029 100644
--- a/kernel/x86_64/KERNEL.COOPERLAKE
+++ b/kernel/x86_64/KERNEL.COOPERLAKE
@@ -1,5 +1,5 @@
 include $(KERNELDIR)/KERNEL.SKYLAKEX
-
+ifneq ($(NO_AVX512BF16), 1)
 SBGEMM_SMALL_M_PERMIT = sbgemm_small_kernel_permit_cooperlake.c
 SBGEMM_SMALL_K_NN = sbgemm_small_kernel_nn_cooperlake.c
 SBGEMM_SMALL_K_B0_NN = sbgemm_small_kernel_nn_cooperlake.c
@@ -20,3 +20,4 @@ SBGEMMINCOPYOBJ =  sbgemm_incopy$(TSUFFIX).$(SUFFIX)
 SBGEMMITCOPYOBJ =  sbgemm_itcopy$(TSUFFIX).$(SUFFIX)
 SBGEMMONCOPYOBJ =  sbgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SBGEMMOTCOPYOBJ =  sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
+endif
diff --git a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS
index 88f574668..0ab2b4ddc 100644
--- a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS
+++ b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS
@@ -1,6 +1,15 @@
 include $(KERNELDIR)/KERNEL.COOPERLAKE
 
-SBGEMM_SMALL_M_PERMIT = sbgemm_small_kernel_permit_spr.c
+ifneq ($(NO_AVX512BF16), 1)
+SBGEMM_SMALL_M_PERMIT =
+SBGEMM_SMALL_K_NN     =
+SBGEMM_SMALL_K_B0_NN  =
+SBGEMM_SMALL_K_NT     =
+SBGEMM_SMALL_K_B0_NT  =
+SBGEMM_SMALL_K_TN     =
+SBGEMM_SMALL_K_B0_TN  =
+SBGEMM_SMALL_K_TT     =
+SBGEMM_SMALL_K_B0_TT  =
 
 SBGEMM_BETA     = sgemm_beta_skylakex.c
 SBGEMMKERNEL    = sbgemm_kernel_16x16_spr.c
@@ -12,3 +21,4 @@ SBGEMMINCOPYOBJ =  sbgemm_incopy$(TSUFFIX).$(SUFFIX)
 SBGEMMITCOPYOBJ =  sbgemm_itcopy$(TSUFFIX).$(SUFFIX)
 SBGEMMONCOPYOBJ =  sbgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SBGEMMOTCOPYOBJ =  sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
+endif
diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX
index 548e5dcfc..7e946ef2e 100644
--- a/kernel/x86_64/KERNEL.SKYLAKEX
+++ b/kernel/x86_64/KERNEL.SKYLAKEX
@@ -46,3 +46,5 @@ ZGEMMKERNEL    =  zgemm_kernel_4x2_skylakex.c
 
 CASUMKERNEL = casum.c
 ZASUMKERNEL = zasum.c
+CSUMKERNEL = csum.c
+ZSUMKERNEL = zsum.c
diff --git a/kernel/x86_64/casum.c b/kernel/x86_64/casum.c
index 60feec0ce..28d78d279 100644
--- a/kernel/x86_64/casum.c
+++ b/kernel/x86_64/casum.c
@@ -4,17 +4,17 @@
 #define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
 #endif
 
-#if defined(SKYLAKEX)
+#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
 #include "casum_microk_skylakex-2.c"
 #endif
 
 #ifndef HAVE_CASUM_KERNEL
-static FLOAT casum_kernel(BLASLONG n, FLOAT *x1)
+static FLOAT casum_kernel(BLASLONG n, FLOAT *x)
 {
 
     BLASLONG i=0;
     BLASLONG n_8 = n & -8;
-    FLOAT *x = x1;
+    FLOAT *x1 = x;
     FLOAT temp0, temp1, temp2, temp3;
     FLOAT temp4, temp5, temp6, temp7;
     FLOAT sum0 = 0.0;
@@ -24,14 +24,14 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x1)
     FLOAT sum4 = 0.0;
     
     while (i < n_8) {
-        temp0 = ABS_K(x[0]);
-        temp1 = ABS_K(x[1]);
-        temp2 = ABS_K(x[2]);
-        temp3 = ABS_K(x[3]);
-        temp4 = ABS_K(x[4]);
-        temp5 = ABS_K(x[5]);
-        temp6 = ABS_K(x[6]);
-        temp7 = ABS_K(x[7]);
+        temp0 = ABS_K(x1[0]);
+        temp1 = ABS_K(x1[1]);
+        temp2 = ABS_K(x1[2]);
+        temp3 = ABS_K(x1[3]);
+        temp4 = ABS_K(x1[4]);
+        temp5 = ABS_K(x1[5]);
+        temp6 = ABS_K(x1[6]);
+        temp7 = ABS_K(x1[7]);
         
         sum0 += temp0;
         sum1 += temp1;
@@ -43,12 +43,12 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x1)
         sum2 += temp6;
         sum3 += temp7;
         
-        x+=8;
+        x1+=8;
         i+=4;
     }
 
      while (i < n) {
-        sum4 += (ABS_K(x1[0]) + ABS_K(x1[1]));
+        sum4 += ABS_K(x1[0]) + ABS_K(x1[1]);
         x1 += 2;
         i++;
      }
diff --git a/kernel/x86_64/casum_microk_skylakex-2.c b/kernel/x86_64/casum_microk_skylakex-2.c
index b398aa6e1..10b70ff20 100644
--- a/kernel/x86_64/casum_microk_skylakex-2.c
+++ b/kernel/x86_64/casum_microk_skylakex-2.c
@@ -1,5 +1,10 @@
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if ((( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203))
+
+#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203))
 
 #define HAVE_CASUM_KERNEL 1
 
@@ -15,15 +20,14 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x)
     
     if (n2 < 64) {
         __m128 accum_10, accum_11, accum_12, accum_13;
-        __m128 abs_mask1 = abs_mask1;
+        __m128 abs_mask1;
 
         accum_10 = _mm_setzero_ps();
         accum_11 = _mm_setzero_ps();
         accum_12 = _mm_setzero_ps();
         accum_13 = _mm_setzero_ps();
         
-        abs_mask1 = (__m128)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1);
-        abs_mask1 = (__m128)_mm_srli_epi32((__m128i) abs_mask1, 1);
+        abs_mask1 = (__m128)_mm_set1_epi32(0x7fffffff);
                 
         _mm_prefetch(&x1[0], _MM_HINT_T0);
         
@@ -347,3 +351,4 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x)
     return sumf;
 }
 #endif
+#endif
diff --git a/kernel/x86_64/cgemv_n_4.c b/kernel/x86_64/cgemv_n_4.c
index 3ca173c20..94e6d2c77 100644
--- a/kernel/x86_64/cgemv_n_4.c
+++ b/kernel/x86_64/cgemv_n_4.c
@@ -24,9 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
+#if (defined(__GNUC__) && __GNUC__ > 11) 
+#pragma GCC optimize("no-tree-vectorize")
+#endif
 
-#include <stdlib.h>
-#include <stdio.h>
 #include "common.h"
 
 #if defined(HASWELL) || defined(ZEN) || defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS)
diff --git a/kernel/x86_64/cgemv_t_4.c b/kernel/x86_64/cgemv_t_4.c
index 3187e196c..f123e81d1 100644
--- a/kernel/x86_64/cgemv_t_4.c
+++ b/kernel/x86_64/cgemv_t_4.c
@@ -24,7 +24,9 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
-
+#if (defined(__GNUC__) && __GNUC__ > 11) 
+#pragma GCC optimize("no-tree-vectorize")
+#endif
 
 #include "common.h"
 
diff --git a/kernel/x86_64/cscal_microk_skylakex-2.c b/kernel/x86_64/cscal_microk_skylakex-2.c
index 8a622427b..a6c012a4c 100644
--- a/kernel/x86_64/cscal_microk_skylakex-2.c
+++ b/kernel/x86_64/cscal_microk_skylakex-2.c
@@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #include <immintrin.h>
 
diff --git a/kernel/x86_64/csum.c b/kernel/x86_64/csum.c
new file mode 100644
index 000000000..e85b5cae1
--- /dev/null
+++ b/kernel/x86_64/csum.c
@@ -0,0 +1,131 @@
+#include "common.h"
+
+#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
+#include "csum_microk_skylakex-2.c"
+#endif
+
+#ifndef HAVE_CSUM_KERNEL /* NOTE(review): the visible start of csum_microk_skylakex-2.c defines HAVE_CASUM_KERNEL / casum_kernel rather than the CSUM names - confirm whether this fallback is always the one compiled */
+static FLOAT csum_kernel(BLASLONG n, FLOAT *x)
+{
+    /* Portable fallback: returns the plain sum of the real and imaginary parts of n contiguous complex values in x (2*n FLOATs). */
+    BLASLONG i=0;
+    BLASLONG n_8 = n & -8;      /* n rounded down to a multiple of 8 */
+    FLOAT *x1 = x;
+    /* sum0..sum3 are independent partial sums for the unrolled loop; */
+    /* sum4 collects the elements left over for the tail loop. */
+    FLOAT sum0 = 0.0;
+    FLOAT sum1 = 0.0;
+    FLOAT sum2 = 0.0;
+    FLOAT sum3 = 0.0;
+    FLOAT sum4 = 0.0;
+    
+    while (i < n_8) {           /* 4 complex values (8 FLOATs) per iteration */
+        sum0 += x1[0];
+        sum1 += x1[1];
+        sum2 += x1[2];
+        sum3 += x1[3];
+        
+        sum0 += x1[4];
+        sum1 += x1[5];
+        sum2 += x1[6];
+        sum3 += x1[7];
+        
+        x1+=8;
+        i+=4;
+    }
+
+     while (i < n) {            /* remaining values, one complex element at a time */
+        sum4 += (x1[0] + x1[1]);
+        x1 += 2;
+        i++;
+     }
+
+    return sum0+sum1+sum2+sum3+sum4;
+}
+
+#endif
+
+static FLOAT sum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i = 0;       /* complex elements consumed so far (strided path) */
+    BLASLONG ip = 0;      /* FLOAT index of the current element's real part */
+    BLASLONG inc_x2;
+    FLOAT sumf = 0.0;
+    /* Sums real + imaginary parts of n complex values spaced inc_x complex elements apart; returns 0.0 when n <= 0 or inc_x <= 0. */
+    if (n <= 0 || inc_x <= 0) return(sumf);
+    if (inc_x == 1) {
+        sumf = csum_kernel(n, x);   /* contiguous data: delegate to the kernel */
+    }
+    else {
+        inc_x2 = 2 * inc_x;         /* stride expressed in FLOATs */
+
+        while (i < n) {
+            sumf += x[ip] + x[ip + 1];
+            ip += inc_x2;
+            i++;
+        }
+    }
+
+    return(sumf);
+}
+
+#if defined(SMP)  /* helper and prototype below are only compiled for SMP builds */
+static int sum_thread_function(BLASLONG n, 
+        BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2,
+        FLOAT *x, BLASLONG inc_x,
+        FLOAT * dummy3, BLASLONG dummy4,
+        FLOAT * result, BLASLONG dummy5)
+{   /* Per-thread worker: the dummy* parameters are not used; the partial sum is written through result and 0 is returned. */
+    *(FLOAT *) result = sum_compute(n, x, inc_x);
+    return 0;
+}
+/* Declared here, defined elsewhere; CNAME below passes sum_thread_function to it as `function`. */
+extern int blas_level1_thread_with_return_value(int mode, 
+        BLASLONG m, BLASLONG n, BLASLONG k, void * alpha,
+        void *a, BLASLONG lda, 
+        void *b, BLASLONG ldb,
+        void *c, BLASLONG ldc,
+        int (*function)(),
+        int nthread);
+#endif  /* defined(SMP) */
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{   /* Entry point: returns the sum of real and imaginary parts of the strided complex vector x, using several threads in SMP builds when n is large. */
+#if defined(SMP)
+    int nthreads;
+    FLOAT dummy_alpha[2];   /* passed as the alpha argument below; never initialized in this function */
+#endif
+    FLOAT sumf = 0.0;
+
+#if defined(SMP)
+    int num_cpu = num_cpu_avail(1);
+    if (n <= 10000 || inc_x <= 0)   /* small or non-positive-stride inputs stay single-threaded */
+        nthreads = 1;
+    else
+        nthreads = num_cpu < n/10000 ? num_cpu : n/10000;   /* min(num_cpu, n/10000) */
+    
+    if (nthreads == 1) {
+        sumf = sum_compute(n, x, inc_x);
+    }
+    else {
+        int mode, i;
+        char result[MAX_CPU_NUMBER * sizeof(double) *2];   /* one slot of sizeof(double)*2 bytes per possible thread */
+        FLOAT *ptr;
+#if !defined(DOUBLE)
+        mode = BLAS_SINGLE | BLAS_COMPLEX;
+#else
+        mode = BLAS_DOUBLE | BLAS_COMPLEX;
+#endif
+        blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x, 
+                NULL, 0, result, 0, (int (*)(void))sum_thread_function, nthreads);
+        ptr = (FLOAT *)result;
+        for (i = 0; i < nthreads; i++) {   /* add the first FLOAT of each of the first nthreads slots */
+            sumf += (*ptr);
+            ptr = (FLOAT *)(((char *)ptr) + sizeof(double) *2);   /* step to the next slot */
+        }
+    }
+#else
+    sumf = sum_compute(n, x, inc_x);   /* non-SMP build: always single-threaded */
+#endif
+    return(sumf);
+}
diff --git a/kernel/x86_64/csum_microk_skylakex-2.c b/kernel/x86_64/csum_microk_skylakex-2.c
new file mode 100644
index 000000000..ec882efa1
--- /dev/null
+++ b/kernel/x86_64/csum_microk_skylakex-2.c
@@ -0,0 +1,289 @@
+/* need a new enough GCC for avx512 support */
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if ((( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203))
+/* Even when the GCC/clang clause matched, skip the kernel if an NVIDIA compiler with NVCOMPVERS < 2203 is in use. */
+#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203))
+/* NOTE(review): this is the csum micro-kernel file, yet the macro and function below are named CASUM/casum - confirm csum.c tests and calls these exact names. */
+#define HAVE_CASUM_KERNEL 1
+
+#include <immintrin.h>
+
+#include <stdint.h>
+
+static FLOAT casum_kernel(BLASLONG n, FLOAT *x)
+{
+    FLOAT *x1 = x;
+    FLOAT sumf=0.0;
+    BLASLONG n2 = n + n;
+    /* n2 = 2*n contiguous FLOAT values are summed (no absolute value is taken); fewer than 64 use the 128-bit path */
+    if (n2 < 64) {
+        __m128 accum_10, accum_11, accum_12, accum_13;
+
+        accum_10 = _mm_setzero_ps();
+        accum_11 = _mm_setzero_ps();
+        accum_12 = _mm_setzero_ps();
+        accum_13 = _mm_setzero_ps();
+        
+        _mm_prefetch(&x1[0], _MM_HINT_T0);
+        /* peel off chunks of 32, 16, 8 and 4 values; each chunk size is handled at most once */
+        if (n2 >= 32){
+            __m128 x00 = _mm_loadu_ps(&x1[ 0]);
+            __m128 x01 = _mm_loadu_ps(&x1[ 4]);
+            __m128 x02 = _mm_loadu_ps(&x1[ 8]);
+            __m128 x03 = _mm_loadu_ps(&x1[12]);
+            
+            _mm_prefetch(&x1[16], _MM_HINT_T0);
+            __m128 x04 = _mm_loadu_ps(&x1[16]);
+            __m128 x05 = _mm_loadu_ps(&x1[20]);
+            __m128 x06 = _mm_loadu_ps(&x1[24]);
+            __m128 x07 = _mm_loadu_ps(&x1[28]);
+
+            accum_10 = _mm_add_ps(accum_10, x00);
+            accum_11 = _mm_add_ps(accum_11, x01);
+            accum_12 = _mm_add_ps(accum_12, x02);
+            accum_13 = _mm_add_ps(accum_13, x03);
+
+            accum_10 = _mm_add_ps(accum_10, x04);
+            accum_11 = _mm_add_ps(accum_11, x05);
+            accum_12 = _mm_add_ps(accum_12, x06);
+            accum_13 = _mm_add_ps(accum_13, x07);
+
+            n2 -= 32;
+            x1 += 32;
+        }
+
+        if (n2 >= 16) {
+            __m128 x00 = _mm_loadu_ps(&x1[ 0]);
+            __m128 x01 = _mm_loadu_ps(&x1[ 4]);
+            __m128 x02 = _mm_loadu_ps(&x1[ 8]);
+            __m128 x03 = _mm_loadu_ps(&x1[12]);
+
+            accum_10 = _mm_add_ps(accum_10, x00);
+            accum_11 = _mm_add_ps(accum_11, x01);
+            accum_12 = _mm_add_ps(accum_12, x02);
+            accum_13 = _mm_add_ps(accum_13, x03);
+            
+            n2 -= 16;
+            x1 += 16;
+        }
+
+        if (n2 >= 8) {
+            __m128 x00 = _mm_loadu_ps(&x1[ 0]);
+            __m128 x01 = _mm_loadu_ps(&x1[ 4]);
+            accum_10 = _mm_add_ps(accum_10, x00);
+            accum_11 = _mm_add_ps(accum_11, x01);
+
+            n2 -= 8;
+            x1 += 8;
+        }
+        
+        if (n2 >= 4) {
+            __m128 x00 = _mm_loadu_ps(&x1[ 0]);
+            accum_10 = _mm_add_ps(accum_10, x00);
+
+            n2 -= 4;
+            x1 += 4;
+        }
+        /* n2 started even (n + n) and only multiples of 4 were removed, so a non-zero remainder is exactly two values */
+        if (n2) {
+            sumf += (x1[0] + x1[1]);
+        }
+        /* fold the four accumulators into accum_10 */
+        accum_10 = _mm_add_ps(accum_10, accum_11);
+        accum_12 = _mm_add_ps(accum_12, accum_13);
+        accum_10 = _mm_add_ps(accum_10, accum_12);
+        /* two horizontal-add passes leave the four-lane total in every lane */
+        accum_10 = _mm_hadd_ps(accum_10, accum_10);
+        accum_10 = _mm_hadd_ps(accum_10, accum_10);
+        /* lane 0 is read with the compiler's vector-subscript syntax */
+        sumf += accum_10[0];
+    }
+    else {
+        __m512 accum_0, accum_1, accum_2, accum_3;
+        __m512 x00, x01, x02, x03, x04, x05, x06, x07;
+        
+        accum_0 = _mm512_setzero_ps();
+        accum_1 = _mm512_setzero_ps();
+        accum_2 = _mm512_setzero_ps();
+        accum_3 = _mm512_setzero_ps();
+
+        // alignment has side-effect when the size of input array is not large enough
+        if (n2 < 256) {
+            if (n2 >= 128) {
+                x00 = _mm512_loadu_ps(&x1[  0]);
+                x01 = _mm512_loadu_ps(&x1[ 16]);
+                x02 = _mm512_loadu_ps(&x1[ 32]);
+                x03 = _mm512_loadu_ps(&x1[ 48]);
+                x04 = _mm512_loadu_ps(&x1[ 64]);
+                x05 = _mm512_loadu_ps(&x1[ 80]);
+                x06 = _mm512_loadu_ps(&x1[ 96]);
+                x07 = _mm512_loadu_ps(&x1[112]);
+
+                accum_0 = _mm512_add_ps(accum_0, x00);
+                accum_1 = _mm512_add_ps(accum_1, x01);
+                accum_2 = _mm512_add_ps(accum_2, x02);
+                accum_3 = _mm512_add_ps(accum_3, x03);
+                
+                accum_0 = _mm512_add_ps(accum_0, x04);
+                accum_1 = _mm512_add_ps(accum_1, x05);
+                accum_2 = _mm512_add_ps(accum_2, x06);
+                accum_3 = _mm512_add_ps(accum_3, x07);
+                
+                n2 -= 128;
+                x1 += 128;
+            }
+
+            if (n2 >= 64) {
+                x00 = _mm512_loadu_ps(&x1[ 0]);
+                x01 = _mm512_loadu_ps(&x1[16]);
+                x02 = _mm512_loadu_ps(&x1[32]);
+                x03 = _mm512_loadu_ps(&x1[48]);
+                accum_0 = _mm512_add_ps(accum_0, x00);
+                accum_1 = _mm512_add_ps(accum_1, x01);
+                accum_2 = _mm512_add_ps(accum_2, x02);
+                accum_3 = _mm512_add_ps(accum_3, x03);
+
+                n2 -= 64;
+                x1 += 64;
+            }
+
+            if (n2 >= 32) {
+                x00 = _mm512_loadu_ps(&x1[ 0]);
+                x01 = _mm512_loadu_ps(&x1[16]);
+                accum_0 = _mm512_add_ps(accum_0, x00);
+                accum_1 = _mm512_add_ps(accum_1, x01);
+
+                n2 -= 32;
+                x1 += 32;
+            }
+
+            if (n2 >= 16) {
+                x00 = _mm512_loadu_ps(&x1[ 0]);
+                accum_0 = _mm512_add_ps(accum_0, x00);
+
+                n2 -= 16;
+                x1 += 16;
+            }
+            /* 1..15 values remain: mask with the low n2 bits set, masked-off lanes load as zero */
+            if (n2) {
+                uint16_t tail_mask16 = (((uint16_t) 0xffff) >> (16 - n2));
+                x00 = _mm512_maskz_loadu_ps(*((__mmask16*) &tail_mask16), &x1[ 0]);
+                accum_0 = _mm512_add_ps(accum_0, x00);
+            }
+            accum_0 = _mm512_add_ps(accum_0, accum_1);
+            accum_2 = _mm512_add_ps(accum_2, accum_3);
+            accum_0 = _mm512_add_ps(accum_0, accum_2);
+
+            sumf =  _mm512_reduce_add_ps(accum_0);
+        }
+        // n2 >= 256, doing alignment
+        else {
+            /* count of 4-byte values (0..15) between x1 and the next 64-byte boundary */
+            int align_header = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 2) & 0xf;
+
+            if (0 != align_header) {
+                uint16_t align_mask16 = (((uint16_t)0xffff) >> (16 - align_header));
+                x00 = _mm512_maskz_loadu_ps(*((__mmask16*) &align_mask16), &x1[0]);
+                accum_0 = _mm512_add_ps(accum_0, x00);
+
+                n2 -= align_header;
+                x1 += align_header;
+            }
+            /* aligned loads from here on; assumes x was at least 4-byte aligned so x1 now sits on a 64-byte boundary */
+            x00 = _mm512_load_ps(&x1[  0]);
+            x01 = _mm512_load_ps(&x1[ 16]);
+            x02 = _mm512_load_ps(&x1[ 32]);
+            x03 = _mm512_load_ps(&x1[ 48]);
+            x04 = _mm512_load_ps(&x1[ 64]);
+            x05 = _mm512_load_ps(&x1[ 80]);
+            x06 = _mm512_load_ps(&x1[ 96]);
+            x07 = _mm512_load_ps(&x1[112]);
+            
+            n2 -= 128;
+            x1 += 128;
+            /* each iteration adds the block loaded previously and loads the next 128 values */
+            while (n2 >= 128) {
+                
+                accum_0 = _mm512_add_ps(accum_0, x00);
+                x00 = _mm512_load_ps(&x1[  0]);
+                accum_1 = _mm512_add_ps(accum_1, x01);
+                x01 = _mm512_load_ps(&x1[ 16]);
+                accum_2 = _mm512_add_ps(accum_2, x02);
+                x02 = _mm512_load_ps(&x1[ 32]);
+                accum_3 = _mm512_add_ps(accum_3, x03);
+                x03 = _mm512_load_ps(&x1[ 48]);
+                
+                accum_0 = _mm512_add_ps(accum_0, x04);
+                x04 = _mm512_load_ps(&x1[ 64]);
+                accum_1 = _mm512_add_ps(accum_1, x05);
+                x05 = _mm512_load_ps(&x1[ 80]);
+                accum_2 = _mm512_add_ps(accum_2, x06);
+                x06 = _mm512_load_ps(&x1[ 96]);
+                accum_3 = _mm512_add_ps(accum_3, x07);
+                x07 = _mm512_load_ps(&x1[112]);
+
+                n2 -= 128;
+                x1 += 128;
+            }
+            /* add the last block that was loaded but not yet accumulated */
+            accum_0 = _mm512_add_ps(accum_0, x00);
+            accum_1 = _mm512_add_ps(accum_1, x01);
+            accum_2 = _mm512_add_ps(accum_2, x02);
+            accum_3 = _mm512_add_ps(accum_3, x03);
+            
+            accum_0 = _mm512_add_ps(accum_0, x04);
+            accum_1 = _mm512_add_ps(accum_1, x05);
+            accum_2 = _mm512_add_ps(accum_2, x06);
+            accum_3 = _mm512_add_ps(accum_3, x07);
+
+            if (n2 >= 64) {
+                x00 = _mm512_load_ps(&x1[ 0]);
+                x01 = _mm512_load_ps(&x1[16]);
+                x02 = _mm512_load_ps(&x1[32]);
+                x03 = _mm512_load_ps(&x1[48]);
+                accum_0 = _mm512_add_ps(accum_0, x00);
+                accum_1 = _mm512_add_ps(accum_1, x01);
+                accum_2 = _mm512_add_ps(accum_2, x02);
+                accum_3 = _mm512_add_ps(accum_3, x03);
+
+                n2 -= 64;
+                x1 += 64;
+            }
+
+            if (n2 >= 32) {
+                x00 = _mm512_load_ps(&x1[ 0]);
+                x01 = _mm512_load_ps(&x1[16]);
+                accum_0 = _mm512_add_ps(accum_0, x00);
+                accum_1 = _mm512_add_ps(accum_1, x01);
+
+                n2 -= 32;
+                x1 += 32;
+            }
+
+            if (n2 >= 16) {
+                x00 = _mm512_load_ps(&x1[ 0]);
+                accum_0 = _mm512_add_ps(accum_0, x00);
+
+                n2 -= 16;
+                x1 += 16;
+            }
+            /* 1..15 values remain: masked load with the low n2 bits set */
+            if (n2) {
+                uint16_t tail_mask16 = (((uint16_t) 0xffff) >> (16 - n2));
+                x00 = _mm512_maskz_load_ps(*((__mmask16*) &tail_mask16), &x1[ 0]);
+                accum_0 = _mm512_add_ps(accum_0, x00);
+            }
+
+            accum_0 = _mm512_add_ps(accum_0, accum_1);
+            accum_2 = _mm512_add_ps(accum_2, accum_3);
+            accum_0 = _mm512_add_ps(accum_0, accum_2);
+            sumf = _mm512_reduce_add_ps(accum_0);
+        }
+    }
+
+    return sumf;
+}
+#endif
+#endif
diff --git a/kernel/x86_64/dasum.c b/kernel/x86_64/dasum.c
index a9c40f38f..0147c6978 100644
--- a/kernel/x86_64/dasum.c
+++ b/kernel/x86_64/dasum.c
@@ -4,7 +4,7 @@
 #define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
 #endif
 
-#if defined(SKYLAKEX)
+#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
 #include "dasum_microk_skylakex-2.c"
 #elif defined(HASWELL) || defined(ZEN)
 #include "dasum_microk_haswell-2.c"
diff --git a/kernel/x86_64/dasum_microk_haswell-2.c b/kernel/x86_64/dasum_microk_haswell-2.c
index fd9da7ebe..bc27c7647 100644
--- a/kernel/x86_64/dasum_microk_haswell-2.c
+++ b/kernel/x86_64/dasum_microk_haswell-2.c
@@ -1,4 +1,7 @@
-#if (( defined(__GNUC__)  && __GNUC__   > 6 ) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__)
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_DASUM_KERNEL
 
diff --git a/kernel/x86_64/dasum_microk_skylakex-2.c b/kernel/x86_64/dasum_microk_skylakex-2.c
index 83bc078b3..76b9fbef0 100644
--- a/kernel/x86_64/dasum_microk_skylakex-2.c
+++ b/kernel/x86_64/dasum_microk_skylakex-2.c
@@ -1,5 +1,8 @@
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_DASUM_KERNEL 1
 
diff --git a/kernel/x86_64/daxpy_microk_skylakex-2.c b/kernel/x86_64/daxpy_microk_skylakex-2.c
index e785a39f1..5b9147d10 100644
--- a/kernel/x86_64/daxpy_microk_skylakex-2.c
+++ b/kernel/x86_64/daxpy_microk_skylakex-2.c
@@ -27,7 +27,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #include <immintrin.h>
 
diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c
index f3b9ee701..569ed2416 100644
--- a/kernel/x86_64/ddot.c
+++ b/kernel/x86_64/ddot.c
@@ -159,7 +159,7 @@ static int dot_thread_function(BLASLONG n, BLASLONG dummy0,
 
 extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
         BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
-        void *c, BLASLONG ldc, int (*function)(), int nthreads);
+        void *c, BLASLONG ldc, int (*function)(void), int nthreads);
 #endif
 
 FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
diff --git a/kernel/x86_64/ddot_microk_skylakex-2.c b/kernel/x86_64/ddot_microk_skylakex-2.c
index 8eabf225a..f076862f7 100644
--- a/kernel/x86_64/ddot_microk_skylakex-2.c
+++ b/kernel/x86_64/ddot_microk_skylakex-2.c
@@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_KERNEL_8 1
 
diff --git a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c
index a98772b94..da57a18a7 100644
--- a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c
+++ b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c
@@ -24,7 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #include <immintrin.h>
 #include "common.h"
diff --git a/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c
index 37d1ca497..69ad6d94e 100644
--- a/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c
+++ b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c
@@ -24,7 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #include <immintrin.h>
 #include "common.h"
diff --git a/kernel/x86_64/dgemv_n_microk_skylakex-4.c b/kernel/x86_64/dgemv_n_microk_skylakex-4.c
index 4030399ab..4e8739864 100644
--- a/kernel/x86_64/dgemv_n_microk_skylakex-4.c
+++ b/kernel/x86_64/dgemv_n_microk_skylakex-4.c
@@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_KERNEL_4x4 1
 
diff --git a/kernel/x86_64/drot.c b/kernel/x86_64/drot.c
index 40c9cf19d..6fdf4ae56 100644
--- a/kernel/x86_64/drot.c
+++ b/kernel/x86_64/drot.c
@@ -169,7 +169,7 @@ static int rot_thread_function(blas_arg_t *args)
     return 0;
 }
 
-extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads);
+extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(void), int nthreads);
 #endif
 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
 {
diff --git a/kernel/x86_64/drot_microk_skylakex-2.c b/kernel/x86_64/drot_microk_skylakex-2.c
index 4e862e663..bf9c044d4 100644
--- a/kernel/x86_64/drot_microk_skylakex-2.c
+++ b/kernel/x86_64/drot_microk_skylakex-2.c
@@ -1,5 +1,8 @@
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_DROT_KERNEL 1
 
diff --git a/kernel/x86_64/dscal_microk_skylakex-2.c b/kernel/x86_64/dscal_microk_skylakex-2.c
index e0598272e..381136414 100644
--- a/kernel/x86_64/dscal_microk_skylakex-2.c
+++ b/kernel/x86_64/dscal_microk_skylakex-2.c
@@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #include <immintrin.h>
 
diff --git a/kernel/x86_64/dsymv_L_microk_skylakex-2.c b/kernel/x86_64/dsymv_L_microk_skylakex-2.c
index f0df5aaa8..ca4773a4b 100644
--- a/kernel/x86_64/dsymv_L_microk_skylakex-2.c
+++ b/kernel/x86_64/dsymv_L_microk_skylakex-2.c
@@ -27,7 +27,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #include <immintrin.h>
 
diff --git a/kernel/x86_64/dtobf16_microk_cooperlake.c b/kernel/x86_64/dtobf16_microk_cooperlake.c
index 9b8ac4714..b713b39be 100644
--- a/kernel/x86_64/dtobf16_microk_cooperlake.c
+++ b/kernel/x86_64/dtobf16_microk_cooperlake.c
@@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_TOBF16_ACCL_KERNEL 1
 #include "common.h"
diff --git a/kernel/x86_64/nrm2.S b/kernel/x86_64/nrm2.S
index b79ac2adb..61cf8c452 100644
--- a/kernel/x86_64/nrm2.S
+++ b/kernel/x86_64/nrm2.S
@@ -58,7 +58,7 @@
 	testq	M, M
 	jle	.L999
 	testq	INCX, INCX
-	jle	.L999
+	je	.L999
 
 	salq	$BASE_SHIFT, INCX
 
diff --git a/kernel/x86_64/nrm2_sse.S b/kernel/x86_64/nrm2_sse.S
index 33b1ee496..c1f3a45fc 100644
--- a/kernel/x86_64/nrm2_sse.S
+++ b/kernel/x86_64/nrm2_sse.S
@@ -57,7 +57,7 @@
 	jle	.L999
 	pxor	%xmm1, %xmm1
 	testq	INCX, INCX
-	jle	.L999
+	je	.L999
 
 	pxor	%xmm2, %xmm2
 	leaq	(, INCX, SIZE), INCX
diff --git a/kernel/x86_64/sasum.c b/kernel/x86_64/sasum.c
index 37a92468f..3f22cb97a 100644
--- a/kernel/x86_64/sasum.c
+++ b/kernel/x86_64/sasum.c
@@ -9,7 +9,7 @@
 
 #endif
 
-#if defined(SKYLAKEX)
+#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
 #include "sasum_microk_skylakex-2.c"
 #elif defined(HASWELL) || defined(ZEN)
 #include "sasum_microk_haswell-2.c"
diff --git a/kernel/x86_64/sasum_microk_haswell-2.c b/kernel/x86_64/sasum_microk_haswell-2.c
index 2eb5b9538..3b4d65cfc 100644
--- a/kernel/x86_64/sasum_microk_haswell-2.c
+++ b/kernel/x86_64/sasum_microk_haswell-2.c
@@ -1,4 +1,7 @@
-#if (( defined(__GNUC__)  && __GNUC__   > 6 ) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__)
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_SASUM_KERNEL 1
 
diff --git a/kernel/x86_64/sasum_microk_skylakex-2.c b/kernel/x86_64/sasum_microk_skylakex-2.c
index fbc91b558..f193053ee 100644
--- a/kernel/x86_64/sasum_microk_skylakex-2.c
+++ b/kernel/x86_64/sasum_microk_skylakex-2.c
@@ -1,5 +1,8 @@
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_SASUM_KERNEL 1
 
diff --git a/kernel/x86_64/saxpy_microk_skylakex-2.c b/kernel/x86_64/saxpy_microk_skylakex-2.c
index 950f10ba2..bbe4d2bc5 100644
--- a/kernel/x86_64/saxpy_microk_skylakex-2.c
+++ b/kernel/x86_64/saxpy_microk_skylakex-2.c
@@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_KERNEL_16 1
 
diff --git a/kernel/x86_64/sbdot_microk_cooperlake.c b/kernel/x86_64/sbdot_microk_cooperlake.c
index 2aefe46ff..ccec98e34 100644
--- a/kernel/x86_64/sbdot_microk_cooperlake.c
+++ b/kernel/x86_64/sbdot_microk_cooperlake.c
@@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_SBDOT_ACCL_KERNEL 1
 #include "common.h"
diff --git a/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c b/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c
index 90e0a32c7..5ee3c8532 100644
--- a/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c
+++ b/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c
@@ -97,33 +97,32 @@ typedef struct {
 #define T_C10	6
 #define T_C11	7
 
-// FIXME: gcc11 seem have problem in tile load/store address calc,
-// need to multiply with element size (2 or 4) here.
+
 #define LOAD_A(M, N) _tile_loadd(T_A##M, ptr_a##M, lda * 2)
 #define LOAD_A_TAIL(M, N) {\
 	__m256i ymm = _mm256_loadu_epi16(ptr_a##M); \
 	__m512i zmm = _mm512_cvtepu16_epi32(ymm); \
 	_mm512_storeu_epi16(tail_a + 16 * M, zmm); \
-	_tile_loadd(T_A##M, tail_a + 16 * 2 * M, 2 * 2); \
+	_tile_loadd(T_A##M, tail_a + 16 * M, 2 * 2); \
 }
 #define MASK_LOAD_A_TAIL(M, N) {\
 	__m256i ymm = _mm256_maskz_loadu_epi16(amask, ptr_a##M); \
 	__m512i zmm = _mm512_cvtepu16_epi32(ymm); \
 	_mm512_storeu_epi16(tail_a + 16 * M, zmm); \
-	_tile_loadd(T_A##M, tail_a + 16 * 2 * M, 2 * 2); \
+	_tile_loadd(T_A##M, tail_a + 16 * M, 2 * 2); \
 }
 #define LOAD_B(M, N) _tile_loadd(T_B##N, ptr_b##N, ldb * 2)
 #define LOAD_B_TAIL(M, N) {\
 	__m256i ymm = _mm256_loadu_epi16(ptr_b##N); \
 	__m512i zmm = _mm512_cvtepu16_epi32(ymm); \
 	_mm512_storeu_epi16(tail_b + 16 * N, zmm); \
-	_tile_loadd(T_B##N, tail_b + 16 * 2 * N, 2 * 2); \
+	_tile_loadd(T_B##N, tail_b + 16 * N, 2 * 2); \
 }
 #define MASK_LOAD_B_TAIL(M, N) {\
 	__m256i ymm = _mm256_maskz_loadu_epi16(bmask, ptr_b##N); \
 	__m512i zmm = _mm512_cvtepu16_epi32(ymm); \
 	_mm512_storeu_epi16(tail_b + 16 * N, zmm); \
-	_tile_loadd(T_B##N, tail_b + 16 * 2 * N, 2 * 2); \
+	_tile_loadd(T_B##N, tail_b + 16 * N, 2 * 2); \
 }
 
 #define MATMUL(M, N) _tile_dpbf16ps(T_C##M##N, T_A##M, T_B##N)
diff --git a/kernel/x86_64/sbgemv_n_microk_cooperlake.c b/kernel/x86_64/sbgemv_n_microk_cooperlake.c
index d875e0d96..c87f9fa5b 100644
--- a/kernel/x86_64/sbgemv_n_microk_cooperlake.c
+++ b/kernel/x86_64/sbgemv_n_microk_cooperlake.c
@@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_SBGEMV_N_ACCL_KERNEL 1
 #include "common.h"
diff --git a/kernel/x86_64/sbgemv_t_microk_cooperlake.c b/kernel/x86_64/sbgemv_t_microk_cooperlake.c
index 23da2e809..5b7a2e147 100644
--- a/kernel/x86_64/sbgemv_t_microk_cooperlake.c
+++ b/kernel/x86_64/sbgemv_t_microk_cooperlake.c
@@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_SBGEMV_T_ACCL_KERNEL 1
 
diff --git a/kernel/x86_64/sdot_microk_skylakex-2.c b/kernel/x86_64/sdot_microk_skylakex-2.c
index 1fcb7f27c..f14632f94 100644
--- a/kernel/x86_64/sdot_microk_skylakex-2.c
+++ b/kernel/x86_64/sdot_microk_skylakex-2.c
@@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_KERNEL_16 1
 
diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c
index 2366fe3aa..6f4309c30 100644
--- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c
+++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c
@@ -24,7 +24,11 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
+
 
 #include <immintrin.h>
 #include "common.h"
diff --git a/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c
index 308f5e35e..987b090ba 100644
--- a/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c
+++ b/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c
@@ -24,7 +24,11 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
+
 
 #include <immintrin.h>
 #include "common.h"
diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c
index c9681fa8b..296eded5a 100644
--- a/kernel/x86_64/sgemv_n_4.c
+++ b/kernel/x86_64/sgemv_n_4.c
@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
-#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) 
+#if (defined(__GNUC__) && __GNUC__ > 11) 
 #pragma GCC optimize("no-tree-vectorize")
 #endif
 
diff --git a/kernel/x86_64/sgemv_n_microk_skylakex-8.c b/kernel/x86_64/sgemv_n_microk_skylakex-8.c
index fba9cedcd..199621712 100644
--- a/kernel/x86_64/sgemv_n_microk_skylakex-8.c
+++ b/kernel/x86_64/sgemv_n_microk_skylakex-8.c
@@ -26,7 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   >= 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 6))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) /* major*100 + minor, compared against 2203 below */
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
+
 
 #define HAVE_SGEMV_N_SKYLAKE_KERNEL 1
 #include "common.h"
@@ -255,4 +259,4 @@ static int sgemv_kernel_n_64(BLASLONG m, BLASLONG n, float alpha, float *a, BLAS
 }
 
 
-#endif
\ No newline at end of file
+#endif
diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c
index 07aa51503..ea89a2aaf 100644
--- a/kernel/x86_64/sgemv_t_4.c
+++ b/kernel/x86_64/sgemv_t_4.c
@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
-#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) 
+#if (defined(__GNUC__) && __GNUC__ > 11) 
 #pragma GCC optimize("no-tree-vectorize")
 #endif
 
diff --git a/kernel/x86_64/sgemv_t_microk_skylakex.c b/kernel/x86_64/sgemv_t_microk_skylakex.c
index dca12acfc..d4f675a1e 100644
--- a/kernel/x86_64/sgemv_t_microk_skylakex.c
+++ b/kernel/x86_64/sgemv_t_microk_skylakex.c
@@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   >= 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 6))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   >= 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_SGEMV_T_SKYLAKE_KERNEL 1
 #include "common.h"
diff --git a/kernel/x86_64/srot.c b/kernel/x86_64/srot.c
index a49544616..05724b427 100644
--- a/kernel/x86_64/srot.c
+++ b/kernel/x86_64/srot.c
@@ -171,7 +171,7 @@ static int rot_thread_function(blas_arg_t *args)
     return 0;
 }
 
-extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads);
+extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(void), int nthreads);
 #endif
 int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
 {
diff --git a/kernel/x86_64/srot_microk_skylakex-2.c b/kernel/x86_64/srot_microk_skylakex-2.c
index a21d1cf64..aec25ac56 100644
--- a/kernel/x86_64/srot_microk_skylakex-2.c
+++ b/kernel/x86_64/srot_microk_skylakex-2.c
@@ -1,5 +1,8 @@
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
 
 #define HAVE_SROT_KERNEL 1
 
diff --git a/kernel/x86_64/sscal_microk_skylakex-2.c b/kernel/x86_64/sscal_microk_skylakex-2.c
index c4fa160f0..5c13cba55 100644
--- a/kernel/x86_64/sscal_microk_skylakex-2.c
+++ b/kernel/x86_64/sscal_microk_skylakex-2.c
@@ -26,7 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
+
 
 #include <immintrin.h>
 
diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c
index 45914daf5..4826b00c6 100644
--- a/kernel/x86_64/ssymv_L.c
+++ b/kernel/x86_64/ssymv_L.c
@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
-#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) 
+#if (defined(__GNUC__) && __GNUC__ > 11) 
 #pragma GCC optimize("no-tree-vectorize")
 #endif
 
diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c
index 26e5ca7e9..06db14ebe 100644
--- a/kernel/x86_64/ssymv_U.c
+++ b/kernel/x86_64/ssymv_U.c
@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
-#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) 
+#if (defined(__GNUC__) && __GNUC__ > 11) 
 #pragma GCC optimize("no-tree-vectorize")
 #endif
 
diff --git a/kernel/x86_64/stobf16_microk_cooperlake.c b/kernel/x86_64/stobf16_microk_cooperlake.c
index 2756a6934..e7d20ddfa 100644
--- a/kernel/x86_64/stobf16_microk_cooperlake.c
+++ b/kernel/x86_64/stobf16_microk_cooperlake.c
@@ -26,7 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
+
 
 #define HAVE_TOBF16_ACCL_KERNEL 1
 #include "common.h"
diff --git a/kernel/x86_64/zasum.c b/kernel/x86_64/zasum.c
index 80e95a2c8..3f17ab1cf 100644
--- a/kernel/x86_64/zasum.c
+++ b/kernel/x86_64/zasum.c
@@ -4,7 +4,7 @@
 #define ABS_K(a) ((a) > 0 ? (a) : (-(a)))
 #endif
 
-#if defined(SKYLAKEX)
+#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
 #include "zasum_microk_skylakex-2.c"
 #endif
 
diff --git a/kernel/x86_64/zasum_microk_skylakex-2.c b/kernel/x86_64/zasum_microk_skylakex-2.c
index e257a5456..f6bc8e37b 100644
--- a/kernel/x86_64/zasum_microk_skylakex-2.c
+++ b/kernel/x86_64/zasum_microk_skylakex-2.c
@@ -1,5 +1,10 @@
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if ((( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203))
+
+#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203))
 
 #define HAVE_ZASUM_KERNEL 1
 
@@ -16,16 +21,14 @@ static FLOAT zasum_kernel(BLASLONG n, FLOAT *x)
 
     if (n2 < 32) {
         __m128d accum_10, accum_11, accum_12, accum_13;
-        __m128d abs_mask1 = abs_mask1;
+        __m128d abs_mask1;
 
         accum_10 = _mm_setzero_pd();
         accum_11 = _mm_setzero_pd();
         accum_12 = _mm_setzero_pd();
         accum_13 = _mm_setzero_pd();
         
-        // abs_mask1 = (__m128d)_mm_set1_epi64x(0x7fffffffffffffff);
-        abs_mask1 = (__m128d)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1);
-        abs_mask1 = (__m128d)_mm_srli_epi64((__m128i) abs_mask1, 1);
+        abs_mask1 = (__m128d)_mm_set1_epi64x(0x7fffffffffffffff);
                 
         _mm_prefetch(&x1[0], _MM_HINT_T0);
         if (n2 >= 16){
@@ -338,3 +341,4 @@ static FLOAT zasum_kernel(BLASLONG n, FLOAT *x)
     return sumf;
 }
 #endif
+#endif
diff --git a/kernel/x86_64/zaxpy_sse2.S b/kernel/x86_64/zaxpy_sse2.S
index a7dd054fb..3776c8910 100644
--- a/kernel/x86_64/zaxpy_sse2.S
+++ b/kernel/x86_64/zaxpy_sse2.S
@@ -1418,10 +1418,10 @@
 	movq	M,  %rax
-//If incx==0 || incy==0, avoid unloop and jump to end.
+//If incx==0 && incy==0, avoid unloop and jump to end.
 	cmpq	$0, INCX
-	je  .L58
+	jne  	.L59
 	cmpq	$0, INCY
 	je	.L58
-
+.L59:
 	sarq	$3, %rax
 	jle	.L55
 
diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c
index 27397ccfa..51efa2dfe 100644
--- a/kernel/x86_64/zdot.c
+++ b/kernel/x86_64/zdot.c
@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
-#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) 
+#if (defined(__GNUC__) && __GNUC__ > 11) 
 #pragma GCC optimize("no-tree-vectorize")
 #endif
 
@@ -92,7 +92,7 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
 #if defined(SMP)
 extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
         BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
-        void *c, BLASLONG ldc, int (*function)(), int nthreads);
+        void *c, BLASLONG ldc, int (*function)(void), int nthreads);
 #endif
                 
                 
diff --git a/kernel/x86_64/zgemv_n_4.c b/kernel/x86_64/zgemv_n_4.c
index 8fc960610..678cea957 100644
--- a/kernel/x86_64/zgemv_n_4.c
+++ b/kernel/x86_64/zgemv_n_4.c
@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
-#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) 
+#if (defined(__GNUC__) && __GNUC__ > 11) 
 #pragma GCC optimize("no-tree-vectorize")
 #endif
 
diff --git a/kernel/x86_64/zgemv_t_4.c b/kernel/x86_64/zgemv_t_4.c
index 63c8b11a4..44d545df7 100644
--- a/kernel/x86_64/zgemv_t_4.c
+++ b/kernel/x86_64/zgemv_t_4.c
@@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #include "common.h"
 
-#if (defined(OS_DARWIN) || defined(OS_WINDOWS)) && (defined(__GNUC__) && __GNUC__ > 11) 
+#if (defined(__GNUC__) && __GNUC__ > 11) 
 #pragma GCC optimize("no-tree-vectorize")
 #endif
 
diff --git a/kernel/x86_64/znrm2.S b/kernel/x86_64/znrm2.S
index 0d2aa3480..748fde310 100644
--- a/kernel/x86_64/znrm2.S
+++ b/kernel/x86_64/znrm2.S
@@ -58,7 +58,7 @@
 	testq	M, M
 	jle	.L999
 	testq	INCX, INCX
-	jle	.L999
+	je	.L999
 
 	salq	$ZBASE_SHIFT, INCX
 
diff --git a/kernel/x86_64/znrm2_sse.S b/kernel/x86_64/znrm2_sse.S
index f78b83f7e..2274f2e98 100644
--- a/kernel/x86_64/znrm2_sse.S
+++ b/kernel/x86_64/znrm2_sse.S
@@ -58,7 +58,7 @@
 	jle	.L999
 	pxor	%xmm1, %xmm1
 	testq	INCX, INCX
-	jle	.L999
+	je	.L999
 
 	xorq	FLAG, FLAG
 
diff --git a/kernel/x86_64/zscal.c b/kernel/x86_64/zscal.c
index 45e3531b8..bc79c0caf 100644
--- a/kernel/x86_64/zscal.c
+++ b/kernel/x86_64/zscal.c
@@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 
 #include "common.h"
-
+#include <float.h>
 
 #if defined (SKYLAKEX) || defined (COOPERLAKE) || defined (SAPPHIRERAPIDS)
 #include "zscal_microk_skylakex-2.c"
@@ -69,16 +69,16 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha , FLOAT *x )
 
 	for( i=0; i<n; i+=4 )
 	{
-		t0 = da_r *x[0] - da_i *x[1];	
-		t1 = da_r *x[2] - da_i *x[3];	
-		t2 = da_r *x[4] - da_i *x[5];	
-		t3 = da_r *x[6] - da_i *x[7];	
+		t0 = da_r *x[0] - da_i *x[1];
+		t1 = da_r *x[2] - da_i *x[3];
+		t2 = da_r *x[4] - da_i *x[5];
+		t3 = da_r *x[6] - da_i *x[7];
 
 		x[1] = da_r * x[1] + da_i * x[0];
 		x[3] = da_r * x[3] + da_i * x[2];
 		x[5] = da_r * x[5] + da_i * x[4];
 		x[7] = da_r * x[7] + da_i * x[6];
-		
+
 		x[0] = t0;
 		x[2] = t1;
 		x[4] = t2;
@@ -99,16 +99,16 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha , FLOAT *x )
 
 	for( i=0; i<n; i+=4 )
 	{
-		t0 =  - da_i *x[1];	
-		t1 =  - da_i *x[3];	
-		t2 =  - da_i *x[5];	
-		t3 =  - da_i *x[7];	
+		t0 =  - da_i *x[1];
+		t1 =  - da_i *x[3];
+		t2 =  - da_i *x[5];
+		t3 =  - da_i *x[7];
 
 		x[1] =  da_i * x[0];
 		x[3] =  da_i * x[2];
 		x[5] =  da_i * x[4];
 		x[7] =  da_i * x[6];
-		
+
 		x[0] = t0;
 		x[2] = t1;
 		x[4] = t2;
@@ -129,16 +129,16 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha , FLOAT *x )
 
 	for( i=0; i<n; i+=4 )
 	{
-		t0 = da_r *x[0];	
-		t1 = da_r *x[2];	
-		t2 = da_r *x[4];	
-		t3 = da_r *x[6];	
+		t0 = da_r *x[0];
+		t1 = da_r *x[2];
+		t2 = da_r *x[4];
+		t3 = da_r *x[6];
 
 		x[1] = da_r * x[1];
 		x[3] = da_r * x[3];
 		x[5] = da_r * x[5];
 		x[7] = da_r * x[7];
-		
+
 		x[0] = t0;
 		x[2] = t1;
 		x[4] = t2;
@@ -157,14 +157,14 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha , FLOAT *x )
 	BLASLONG i;
 	for( i=0; i<n; i+=4 )
 	{
-		x[0] = 0.0;	
-		x[1] = 0.0;	
-		x[2] = 0.0;	
-		x[3] = 0.0;	
-		x[4] = 0.0;	
-		x[5] = 0.0;	
-		x[6] = 0.0;	
-		x[7] = 0.0;	
+		x[0] = 0.0;
+		x[1] = 0.0;
+		x[2] = 0.0;
+		x[3] = 0.0;
+		x[4] = 0.0;
+		x[5] = 0.0;
+		x[6] = 0.0;
+		x[7] = 0.0;
 		x+=8;
 	}
 
@@ -186,10 +186,10 @@ static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_
 
 	for ( i=0; i<n; i+=4 )
 	{
-		t0 = da_r * x[0]         - da_i *x[1];	
-		t1 = da_r * x[inc_x]     - da_i *x[inc_x  + 1];	
-		t2 = da_r * x[inc_x2]    - da_i *x[inc_x2 + 1];	
-		t3 = da_r * x[inc_x3]    - da_i *x[inc_x3 + 1];	
+		t0 = da_r * x[0]         - da_i *x[1];
+		t1 = da_r * x[inc_x]     - da_i *x[inc_x  + 1];
+		t2 = da_r * x[inc_x2]    - da_i *x[inc_x2 + 1];
+		t3 = da_r * x[inc_x3]    - da_i *x[inc_x3 + 1];
 
 		x[1]               = da_i * x[0]       + da_r * x[1];
 		x[inc_x  +1]       = da_i * x[inc_x]   + da_r * x[inc_x  +1];
@@ -222,15 +222,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 
 		if ( da_r == 0.0 )
 		{
-
 			BLASLONG n1 = n & -2;
 
 			if ( da_i == 0.0 )
 			{
-
 				while(j < n1)
 				{
-			
+
 					x[i]=0.0;
 					x[i+1]=0.0;
 					x[i+inc_x]=0.0;
@@ -242,7 +240,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 
 				while(j < n)
 				{
-			
+
 					x[i]=0.0;
 					x[i+1]=0.0;
 					i += inc_x ;
@@ -253,14 +251,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 			}
 			else
 			{
-
 				while(j < n1)
 				{
-			
-					temp0        = -da_i * x[i+1];
+
+					if (isnan(x[i]) || isinf(x[i]))
+						temp0	= NAN;
+					else
+						temp0   = -da_i * x[i+1];
 					x[i+1]       =  da_i * x[i];
 					x[i]         =  temp0;
-					temp1        = -da_i * x[i+1+inc_x];
+					if (isnan(x[i+inc_x]) || isinf(x[i+inc_x]))
+						temp1	= NAN;
+					else
+						temp1   = -da_i * x[i+1+inc_x];
 					x[i+1+inc_x] =  da_i * x[i+inc_x];
 					x[i+inc_x]   =  temp1;
 					i += 2*inc_x ;
@@ -270,8 +273,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 
 				while(j < n)
 				{
-			
-					temp0        = -da_i * x[i+1];
+
+					if (isnan(x[i]) || isinf(x[i]))
+						temp0	= NAN;
+					else
+						temp0   = -da_i * x[i+1];
 					x[i+1]       =  da_i * x[i];
 					x[i]         =  temp0;
 					i += inc_x ;
@@ -294,7 +300,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 
 				while(j < n1)
 				{
-			
+
 					temp0        =  da_r * x[i];
 					x[i+1]       =  da_r * x[i+1];
 					x[i]         =  temp0;
@@ -308,7 +314,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 
 				while(j < n)
 				{
-			
+
 					temp0        =  da_r * x[i];
 					x[i+1]       =  da_r * x[i+1];
 					x[i]         =  temp0;
@@ -356,49 +362,59 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 
 		alpha[0] = da_r;
 		alpha[1] = da_i;
-	
+
 		if ( da_r == 0.0 )
 			if ( da_i == 0 )
 				zscal_kernel_8_zero(n1 , alpha , x);
 			else
-				zscal_kernel_8_zero_r(n1 , alpha , x);
+//				zscal_kernel_8_zero_r(n1 , alpha , x);
+				zscal_kernel_8(n1 , alpha , x);
 		else
-			if ( da_i == 0 )
+			if ( da_i == 0 && da_r == da_r)
 				zscal_kernel_8_zero_i(n1 , alpha , x);
 			else
 				zscal_kernel_8(n1 , alpha , x);
-
+		}
 		i = n1 << 1;
 		j = n1;
-	}
-
 
-	if ( da_r == 0.0 )
+	if ( da_r == 0.0 || da_r != da_r )
 	{
-
 		if ( da_i == 0.0 )
 		{
-
+		FLOAT res=0.0;
+		if (da_r != da_r) res= da_r;
 			while(j < n)
 			{
-		
-					x[i]=0.0;
-					x[i+1]=0.0;
+					x[i]=res;
+					x[i+1]=res;
 					i += 2 ;
 					j++;
 
 			}
 
 		}
-		else
+		else if (da_r < -FLT_MAX || da_r > FLT_MAX) {
+			while(j < n)
+			{
+					x[i]= NAN;
+					x[i+1] = da_r;
+					i += 2 ;
+					j++;
+
+			}
+
+		} else
 		{
 
 			while(j < n)
 			{
-			
 				temp0        = -da_i * x[i+1];
+				if (x[i] < -FLT_MAX || x[i] > FLT_MAX)
+					temp0 = NAN;
 				x[i+1]       =  da_i * x[i];
-				x[i]         =  temp0;
+				if ( x[i] == x[i]) //preserve NaN
+				  x[i]         =  temp0;
 				i += 2 ;
 				j++;
 
@@ -409,28 +425,24 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 	}
 	else
 	{
-
-		if ( da_i == 0.0 )
+		if (da_i == 0.0)
 		{
+				while(j < n)
+				{
 
-			while(j < n)
-			{
-			
 					temp0        =  da_r * x[i];
 					x[i+1]       =  da_r * x[i+1];
 					x[i]         =  temp0;
 					i += 2 ;
 					j++;
 
-			}
-
+				}
 		}
 		else
 		{
 
 			while(j < n)
 			{
-
 				temp0        =  da_r * x[i]   - da_i * x[i+1];
 				x[i+1]       =  da_r * x[i+1] + da_i * x[i];
 				x[i]         =  temp0;
@@ -439,11 +451,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 
 			}
 
-		}		
+		}
 
 	}
 
 	return(0);
 }
-
-
diff --git a/kernel/x86_64/zscal_microk_skylakex-2.c b/kernel/x86_64/zscal_microk_skylakex-2.c
index f9e05e333..29dc4f6df 100644
--- a/kernel/x86_64/zscal_microk_skylakex-2.c
+++ b/kernel/x86_64/zscal_microk_skylakex-2.c
@@ -26,7 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 
 /* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 )
+
 
 #include <immintrin.h>
 
diff --git a/kernel/x86_64/zscal_sse2.S b/kernel/x86_64/zscal_sse2.S
index 223b1e439..d6a49136d 100644
--- a/kernel/x86_64/zscal_sse2.S
+++ b/kernel/x86_64/zscal_sse2.S
@@ -82,6 +82,7 @@
 	pxor	%xmm15, %xmm15
 	comisd	%xmm0, %xmm15
 	jne	.L100
+	jp	.L100
 
 	comisd	%xmm1, %xmm15
 	jne	.L100
diff --git a/kernel/x86_64/zsum.c b/kernel/x86_64/zsum.c
new file mode 100644
index 000000000..5973c1253
--- /dev/null
+++ b/kernel/x86_64/zsum.c
@@ -0,0 +1,131 @@
+#include "common.h"
+
+#if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
+#include "zsum_microk_skylakex-2.c"
+#endif
+
+#ifndef HAVE_ZASUM_KERNEL
+static FLOAT zasum_kernel(BLASLONG n, FLOAT *x)
+{
+
+    BLASLONG i=0;
+    BLASLONG n_8 = n & -8;
+    FLOAT *x1 = x;
+    FLOAT temp0, temp1, temp2, temp3;
+    FLOAT temp4, temp5, temp6, temp7;
+    FLOAT sum0 = 0.0;
+    FLOAT sum1 = 0.0;
+    FLOAT sum2 = 0.0;
+    FLOAT sum3 = 0.0;
+    FLOAT sum4 = 0.0;
+    
+    while (i < n_8) {
+        sum0 += x1[0];
+        sum1 += x1[1];
+        sum2 += x1[2];
+        sum3 += x1[3];
+        
+        sum0 += x1[4];
+        sum1 += x1[5];
+        sum2 += x1[6];
+        sum3 += x1[7];
+        
+        x1+=8;
+        i+=4;
+    }
+
+     while (i < n) {
+        sum4 += x1[0] + x1[1];
+        x1 += 2;
+        i++;
+     }
+
+    return sum0+sum1+sum2+sum3+sum4;
+}
+
+#endif
+
+/* Sum the real and imaginary parts of n complex elements of x, stepping by
+ * inc_x complex elements; returns 0 when n or inc_x is not positive. */
+static FLOAT sum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    FLOAT total = 0.0;
+    BLASLONG done, offset, step;
+
+    if (!(n > 0 && inc_x > 0))
+        return total;
+
+    /* Unit stride: hand the contiguous vector to the kernel. */
+    if (inc_x == 1)
+        return zsum_kernel(n, x);
+
+    /* Strided access: each complex element occupies two FLOATs. */
+    step = 2 * inc_x;
+    offset = 0;
+    for (done = 0; done < n; done++) {
+        total += x[offset] + x[offset + 1];
+        offset += step;
+    }
+    return total;
+}
+
+#if defined(SMP)
+/* Thread worker: stores sum_compute(n, x, inc_x) into *result and returns 0. */
+static int sum_thread_function(BLASLONG n,
+        BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2,
+        FLOAT *x, BLASLONG inc_x,
+        FLOAT * dummy3, BLASLONG dummy4,
+        FLOAT * result, BLASLONG dummy5)
+{
+    *(FLOAT *) result = sum_compute(n, x, inc_x);
+    return 0;
+}
+extern int blas_level1_thread_with_return_value(int mode,
+        BLASLONG m, BLASLONG n, BLASLONG k, void * alpha,
+        void *a, BLASLONG lda,
+        void *b, BLASLONG ldb,
+        void *c, BLASLONG ldc,
+        int (*function)(void), /* prototyped form, matching the cast used at the call site */
+        int nthread);
+#endif
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+#if defined(SMP)
+    int nthreads;
+    FLOAT dummy_alpha[2];  /* passed as the alpha argument below; never initialised here */
+#endif
+    FLOAT sumf = 0.0;
+    /* SMP builds may split the sum across threads; other builds call sum_compute() directly. */
+#if defined(SMP)
+    int num_cpu = num_cpu_avail(1);
+    if (n <= 10000 || inc_x <= 0)  /* small inputs and non-positive strides use one thread */
+        nthreads = 1;
+    else
+        nthreads = num_cpu < n/10000 ? num_cpu : n/10000;  /* min(num_cpu, n/10000) */
+    /* One thread: compute directly; otherwise dispatch and combine per-thread results. */
+    if (nthreads == 1) {
+        sumf = sum_compute(n, x, inc_x);
+    }
+    else {
+        int mode, i;
+        char result[MAX_CPU_NUMBER * sizeof(double) *2];  /* one slot of 2*sizeof(double) bytes per possible thread */
+        FLOAT *ptr;
+#if !defined(DOUBLE)
+        mode = BLAS_SINGLE | BLAS_COMPLEX;
+#else
+        mode = BLAS_DOUBLE | BLAS_COMPLEX;
+#endif
+        blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x, 
+                NULL, 0, result, 0, (int (*)(void))sum_thread_function, nthreads);
+        ptr = (FLOAT *)result;
+        for (i = 0; i < nthreads; i++) {  /* add the FLOAT at the start of each of the first nthreads slots */
+            sumf += (*ptr);
+            ptr = (FLOAT *)(((char *)ptr) + sizeof(double) *2);  /* advance by one slot */
+        }
+    }
+#else
+    sumf = sum_compute(n, x, inc_x);
+#endif
+    return(sumf);
+}
diff --git a/kernel/x86_64/zsum_microk_skylakex-2.c b/kernel/x86_64/zsum_microk_skylakex-2.c
new file mode 100644
index 000000000..0bca7ce6d
--- /dev/null
+++ b/kernel/x86_64/zsum_microk_skylakex-2.c
@@ -0,0 +1,280 @@
+/* need a new enough GCC for avx512 support */
+#ifdef __NVCOMPILER
+#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
+#endif
+#if ((( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203))
+
+#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2203))
+
+#define HAVE_ZSUM_KERNEL 1
+
+#include <immintrin.h>
+
+#include <stdint.h>
+
+static FLOAT zsum_kernel(BLASLONG n, FLOAT *x)
+{
+    FLOAT *x1 = x;
+    FLOAT sumf=0.0;
+    BLASLONG n2 = n + n;  /* number of scalar values: two per complex element */
+    /* Returns the plain sum of all n2 values (real and imaginary parts) of x. */
+    /* Fewer than 32 values use 128-bit vectors, otherwise 512-bit vectors. */
+    if (n2 < 32) {
+        __m128d accum_10, accum_11, accum_12, accum_13;
+
+        accum_10 = _mm_setzero_pd();
+        accum_11 = _mm_setzero_pd();
+        accum_12 = _mm_setzero_pd();
+        accum_13 = _mm_setzero_pd();
+
+        _mm_prefetch(&x1[0], _MM_HINT_T0);
+        if (n2 >= 16){
+            __m128d x00 = _mm_loadu_pd(&x1[ 0]);
+            __m128d x01 = _mm_loadu_pd(&x1[ 2]);
+            __m128d x02 = _mm_loadu_pd(&x1[ 4]);
+            __m128d x03 = _mm_loadu_pd(&x1[ 6]);
+
+            _mm_prefetch(&x1[8], _MM_HINT_T0);
+            __m128d x04 = _mm_loadu_pd(&x1[ 8]);
+            __m128d x05 = _mm_loadu_pd(&x1[10]);
+            __m128d x06 = _mm_loadu_pd(&x1[12]);
+            __m128d x07 = _mm_loadu_pd(&x1[14]);
+
+            accum_10 = _mm_add_pd(accum_10, x00);
+            accum_11 = _mm_add_pd(accum_11, x01);
+            accum_12 = _mm_add_pd(accum_12, x02);
+            accum_13 = _mm_add_pd(accum_13, x03);
+
+            accum_10 = _mm_add_pd(accum_10, x04);
+            accum_11 = _mm_add_pd(accum_11, x05);
+            accum_12 = _mm_add_pd(accum_12, x06);
+            accum_13 = _mm_add_pd(accum_13, x07);
+
+            x1 += 16;
+            n2 -= 16;
+        }
+
+        if (n2 >= 8) {
+            __m128d x00 = _mm_loadu_pd(&x1[ 0]);
+            __m128d x01 = _mm_loadu_pd(&x1[ 2]);
+            __m128d x02 = _mm_loadu_pd(&x1[ 4]);
+            __m128d x03 = _mm_loadu_pd(&x1[ 6]);
+
+            accum_10 = _mm_add_pd(accum_10, x00);
+            accum_11 = _mm_add_pd(accum_11, x01);
+            accum_12 = _mm_add_pd(accum_12, x02);
+            accum_13 = _mm_add_pd(accum_13, x03);
+
+            n2 -= 8;
+            x1 += 8;
+        }
+
+        if (n2 >= 4) {
+            __m128d x00 = _mm_loadu_pd(&x1[ 0]);
+            __m128d x01 = _mm_loadu_pd(&x1[ 2]);
+            accum_10 = _mm_add_pd(accum_10, x00);
+            accum_11 = _mm_add_pd(accum_11, x01);
+
+            n2 -= 4;
+            x1 += 4;
+        }
+        /* n2 started even and only even amounts were subtracted: at most one pair is left. */
+        if (n2) {
+            __m128d x00 = _mm_loadu_pd(&x1[ 0]);
+            accum_10 = _mm_add_pd(accum_10, x00);
+        }
+
+        accum_10 = _mm_add_pd(accum_10, accum_11);
+        accum_12 = _mm_add_pd(accum_12, accum_13);
+        accum_10 = _mm_add_pd(accum_10, accum_12);
+
+        accum_10 = _mm_hadd_pd(accum_10, accum_10);
+
+        sumf = accum_10[0];
+    }
+    else {
+        __m512d accum_0, accum_1, accum_2, accum_3;
+        __m512d x00, x01, x02, x03, x04, x05, x06, x07;
+        /* Plain sum: no absolute value is taken, so no sign mask is needed. */
+
+        accum_0 = _mm512_setzero_pd();
+        accum_1 = _mm512_setzero_pd();
+        accum_2 = _mm512_setzero_pd();
+        accum_3 = _mm512_setzero_pd();
+
+        // alignment has side-effect when the size of input array is not large enough
+        if (n2 < 128) {
+            if (n2 >= 64) {
+                x00 = _mm512_loadu_pd(&x1[ 0]);
+                x01 = _mm512_loadu_pd(&x1[ 8]);
+                x02 = _mm512_loadu_pd(&x1[16]);
+                x03 = _mm512_loadu_pd(&x1[24]);
+                x04 = _mm512_loadu_pd(&x1[32]);
+                x05 = _mm512_loadu_pd(&x1[40]);
+                x06 = _mm512_loadu_pd(&x1[48]);
+                x07 = _mm512_loadu_pd(&x1[56]);
+
+                accum_0 = _mm512_add_pd(accum_0, x00);
+                accum_1 = _mm512_add_pd(accum_1, x01);
+                accum_2 = _mm512_add_pd(accum_2, x02);
+                accum_3 = _mm512_add_pd(accum_3, x03);
+
+                accum_0 = _mm512_add_pd(accum_0, x04);
+                accum_1 = _mm512_add_pd(accum_1, x05);
+                accum_2 = _mm512_add_pd(accum_2, x06);
+                accum_3 = _mm512_add_pd(accum_3, x07);
+
+                n2 -= 64;
+                x1 += 64;
+            }
+
+            if (n2 >= 32) {
+                x00 = _mm512_loadu_pd(&x1[ 0]);
+                x01 = _mm512_loadu_pd(&x1[ 8]);
+                x02 = _mm512_loadu_pd(&x1[16]);
+                x03 = _mm512_loadu_pd(&x1[24]);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+                accum_1 = _mm512_add_pd(accum_1, x01);
+                accum_2 = _mm512_add_pd(accum_2, x02);
+                accum_3 = _mm512_add_pd(accum_3, x03);
+
+                n2 -= 32;
+                x1 += 32;
+            }
+
+            if (n2 >= 16) {
+                x00 = _mm512_loadu_pd(&x1[ 0]);
+                x01 = _mm512_loadu_pd(&x1[ 8]);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+                accum_1 = _mm512_add_pd(accum_1, x01);
+
+                n2 -= 16;
+                x1 += 16;
+            }
+
+            if (n2 >= 8) {
+                x00 = _mm512_loadu_pd(&x1[ 0]);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+
+                n2 -= 8;
+                x1 += 8;
+            }
+
+            if (n2) {
+                unsigned char tail_mask8 = (((unsigned char) 0xff) >> (8 - n2));  /* low n2 bits set */
+                x00 = _mm512_maskz_loadu_pd(*((__mmask8*) &tail_mask8), &x1[ 0]);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+            }
+            accum_0 = _mm512_add_pd(accum_0, accum_1);
+            accum_2 = _mm512_add_pd(accum_2, accum_3);
+            accum_0 = _mm512_add_pd(accum_0, accum_2);
+            sumf =  _mm512_reduce_add_pd(accum_0);
+        }
+        // n2 >= 128, doing alignment
+        else {
+            /* Number of 8-byte values before the next 64-byte boundary (0..7). */
+            int align_header = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 3) & 0x7;
+
+            if (0 != align_header) {
+                unsigned char align_mask8 = (((unsigned char)0xff) >> (8 - align_header));
+                x00 = _mm512_maskz_loadu_pd(*((__mmask8*) &align_mask8), &x1[0]);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+
+                n2 -= align_header;
+                x1 += align_header;
+            }
+
+            x00 = _mm512_load_pd(&x1[ 0]);
+            x01 = _mm512_load_pd(&x1[ 8]);
+            x02 = _mm512_load_pd(&x1[16]);
+            x03 = _mm512_load_pd(&x1[24]);
+            x04 = _mm512_load_pd(&x1[32]);
+            x05 = _mm512_load_pd(&x1[40]);
+            x06 = _mm512_load_pd(&x1[48]);
+            x07 = _mm512_load_pd(&x1[56]);
+            /* From here the loads run one 64-value block ahead of the additions. */
+            n2 -= 64;
+            x1 += 64;
+
+            while (n2 >= 64) {
+                accum_0 = _mm512_add_pd(accum_0, x00);
+                x00 = _mm512_load_pd(&x1[ 0]);
+                accum_1 = _mm512_add_pd(accum_1, x01);
+                x01 = _mm512_load_pd(&x1[ 8]);
+                accum_2 = _mm512_add_pd(accum_2, x02);
+                x02 = _mm512_load_pd(&x1[16]);
+                accum_3 = _mm512_add_pd(accum_3, x03);
+                x03 = _mm512_load_pd(&x1[24]);
+
+                accum_0 = _mm512_add_pd(accum_0, x04);
+                x04 = _mm512_load_pd(&x1[32]);
+                accum_1 = _mm512_add_pd(accum_1, x05);
+                x05 = _mm512_load_pd(&x1[40]);
+                accum_2 = _mm512_add_pd(accum_2, x06);
+                x06 = _mm512_load_pd(&x1[48]);
+                accum_3 = _mm512_add_pd(accum_3, x07);
+                x07 = _mm512_load_pd(&x1[56]);
+
+                n2 -= 64;
+                x1 += 64;
+            }
+            /* Add the last block that was loaded but not yet accumulated. */
+            accum_0 = _mm512_add_pd(accum_0, x00);
+            accum_1 = _mm512_add_pd(accum_1, x01);
+            accum_2 = _mm512_add_pd(accum_2, x02);
+            accum_3 = _mm512_add_pd(accum_3, x03);
+
+            accum_0 = _mm512_add_pd(accum_0, x04);
+            accum_1 = _mm512_add_pd(accum_1, x05);
+            accum_2 = _mm512_add_pd(accum_2, x06);
+            accum_3 = _mm512_add_pd(accum_3, x07);
+
+            if (n2 >= 32) {
+                x00 = _mm512_load_pd(&x1[ 0]);
+                x01 = _mm512_load_pd(&x1[ 8]);
+                x02 = _mm512_load_pd(&x1[16]);
+                x03 = _mm512_load_pd(&x1[24]);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+                accum_1 = _mm512_add_pd(accum_1, x01);
+                accum_2 = _mm512_add_pd(accum_2, x02);
+                accum_3 = _mm512_add_pd(accum_3, x03);
+
+                n2 -= 32;
+                x1 += 32;
+            }
+
+            if (n2 >= 16) {
+                x00 = _mm512_load_pd(&x1[ 0]);
+                x01 = _mm512_load_pd(&x1[ 8]);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+                accum_1 = _mm512_add_pd(accum_1, x01);
+
+                n2 -= 16;
+                x1 += 16;
+            }
+
+            if (n2 >= 8) {
+                x00 = _mm512_load_pd(&x1[ 0]);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+
+                n2 -= 8;
+                x1 += 8;
+            }
+
+            if (n2) {
+                unsigned char tail_mask8 = (((unsigned char) 0xff) >> (8 - n2));  /* low n2 bits set */
+                x00 = _mm512_maskz_load_pd(*((__mmask8*) &tail_mask8), &x1[ 0]);
+                accum_0 = _mm512_add_pd(accum_0, x00);
+            }
+
+            accum_0 = _mm512_add_pd(accum_0, accum_1);
+            accum_2 = _mm512_add_pd(accum_2, accum_3);
+            accum_0 = _mm512_add_pd(accum_0, accum_2);
+            sumf = _mm512_reduce_add_pd(accum_0);
+        }
+    }
+
+    return sumf;
+}
+#endif
+#endif
diff --git a/kernel/x86_64/zsum_sse.S b/kernel/x86_64/zsum_sse.S
new file mode 100644
index 000000000..b679b42b0
--- /dev/null
+++ b/kernel/x86_64/zsum_sse.S
@@ -0,0 +1,299 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define M	ARG1	/* rdi */
+#define X	ARG2	/* rsi */
+#define INCX	ARG3	/* rdx */
+
+#define I	%rax	/* trip counter for the unrolled loops */
+
+#include "l1param.h"
+
+	PROLOGUE	/* Plain (no absolute value) sum of all single-precision scalars in M two-scalar entries at X; result is left in the low lane of %xmm0 */
+	PROFCODE
+
+	SAVEREGISTERS
+
+	pxor	%xmm0, %xmm0	/* accumulator 0 = 0; also the value returned on the early exits below */
+	testq	M, M
+	jle	.L999	/* M <= 0: return 0 */
+	testq	INCX, INCX
+	jle	.L999	/* INCX <= 0: return 0 */
+
+	pxor	%xmm1, %xmm1	/* clear accumulators 1..3 */
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
+
+	salq	$ZBASE_SHIFT, INCX	/* presumably converts the entry stride to bytes - confirm ZBASE_SHIFT in common.h */
+
+	cmpq	$2 * SIZE, INCX
+	jne	.L100	/* stride is not 2*SIZE bytes: use the strided path */
+
+	subq	$-32 * SIZE, X	/* bias X by +32*SIZE so the loads below can address from -32*SIZE(X) */
+	addq	M, M	/* M *= 2: from here on M counts scalars, not entries */
+
+	cmpq	$3, M
+	jle	.L18	/* 3 scalars or fewer: skip the alignment peel, go straight to the 2/1 scalar tail */
+
+	testq	$4, X	/* address bit 2 set: peel one scalar */
+	je	.L05
+	movss	-32 * SIZE(X), %xmm0
+	addq	$SIZE, X
+	decq	M
+	jle	.L998
+	ALIGN_3
+
+.L05:
+	testq	$8, X	/* address bit 3 set: peel two more scalars so the movaps loads below are 16-byte aligned */
+	je	.L10
+
+#ifdef movsd
+	xorps	%xmm1, %xmm1	/* only when movsd is a macro; presumably its replacement leaves the upper lanes untouched, so clear first - confirm */
+#endif
+	movsd	-32 * SIZE(X), %xmm1
+	addq	$2 * SIZE, X
+	subq	$2, M
+	jle	.L998
+	ALIGN_3
+
+.L10:
+	movq	M,  I
+	sarq	$5, I	/* I = number of 32-scalar blocks */
+	jle	.L14
+
+	movaps	-32 * SIZE(X), %xmm4	/* preload the first block (8 registers x 4 scalars) */
+	movaps	-28 * SIZE(X), %xmm5
+	movaps	-24 * SIZE(X), %xmm6
+	movaps	-20 * SIZE(X), %xmm7
+
+	movaps	-16 * SIZE(X), %xmm8
+	movaps	-12 * SIZE(X), %xmm9
+	movaps	 -8 * SIZE(X), %xmm10
+	movaps	 -4 * SIZE(X), %xmm11
+	decq	I
+	jle	.L12	/* only one block: nothing to overlap, just drain it */
+	ALIGN_3
+
+.L11:	/* main loop: accumulate the previously loaded block while loading the next one */
+#ifdef PREFETCH
+	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
+#endif
+
+	addps	%xmm4,  %xmm0
+	movaps	  0 * SIZE(X), %xmm4
+
+	addps	%xmm5,  %xmm1
+	movaps	  4 * SIZE(X), %xmm5
+
+	addps	%xmm6,  %xmm2
+	movaps	  8 * SIZE(X), %xmm6
+
+	addps	%xmm7,  %xmm3
+	movaps	 12 * SIZE(X), %xmm7
+
+#if defined(PREFETCH) && !defined(FETCH128)
+	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
+#endif
+
+	addps	%xmm8,  %xmm0
+	movaps	 16 * SIZE(X), %xmm8
+
+	addps	%xmm9,  %xmm1
+	movaps	 20 * SIZE(X), %xmm9
+
+	addps	%xmm10, %xmm2
+	movaps	 24 * SIZE(X), %xmm10
+
+	addps	%xmm11, %xmm3
+	movaps	 28 * SIZE(X), %xmm11
+
+	subq	$-32 * SIZE, X	/* advance X by one block */
+	decq	I
+	jg	.L11
+	ALIGN_3
+
+.L12:	/* drain: accumulate the last block that was loaded but not yet added */
+	addps	%xmm4,  %xmm0
+	addps	%xmm5,  %xmm1
+
+	addps	%xmm6,  %xmm2
+	addps	%xmm7,  %xmm3
+
+	addps	%xmm8,  %xmm0
+	addps	%xmm9,  %xmm1
+
+	addps	%xmm10, %xmm2
+	addps	%xmm11, %xmm3
+
+	addq	$32 * SIZE, X	/* step past the drained block */
+	ALIGN_3
+
+.L14:
+	testq	$31,  M	/* any scalars left beyond the full 32-scalar blocks? */
+	jle	.L998	/* test clears OF and SF is 0 for this mask, so jle behaves as je here */
+
+.L15:
+	testq	$16, M	/* remainder bit 16: add 16 scalars */
+	je	.L16
+
+	movaps	-32 * SIZE(X), %xmm4
+	addps	%xmm4, %xmm0
+
+	movaps	-28 * SIZE(X), %xmm5
+	addps	%xmm5, %xmm1
+
+	movaps	-24 * SIZE(X), %xmm4
+	addps	%xmm4, %xmm0
+
+	movaps	-20 * SIZE(X), %xmm5
+	addps	%xmm5, %xmm1
+
+	addq	$16 * SIZE, X
+	ALIGN_3
+
+.L16:
+	testq	$8, M	/* remainder bit 8: add 8 scalars */
+	je	.L17
+
+	movaps	-32 * SIZE(X), %xmm4
+	addps	%xmm4, %xmm0
+
+	movaps	-28 * SIZE(X), %xmm5
+	addps	%xmm5, %xmm1
+
+	addq	$8 * SIZE, X
+	ALIGN_3
+
+.L17:
+	testq	$4, M	/* remainder bit 4: add 4 scalars */
+	je	.L18
+
+	movaps	-32 * SIZE(X), %xmm6
+	addps	%xmm6, %xmm2
+	addq	$4 * SIZE, X
+	ALIGN_3
+
+.L18:
+	testq	$2, M	/* remainder bit 2: add 2 scalars (also the entry point for very short inputs) */
+	je	.L19
+
+#ifdef movsd
+	xorps	%xmm7, %xmm7	/* see the note at .L05: clear before a macro-replaced movsd */
+#endif
+	movsd	-32 * SIZE(X), %xmm7
+	addps	%xmm7, %xmm3
+	addq	$2 * SIZE, X
+	ALIGN_3
+
+.L19:
+	testq	$1, M	/* remainder bit 1: add the final scalar */
+	je	.L998
+
+	movss	-32 * SIZE(X), %xmm6
+	addps	%xmm6, %xmm2
+	jmp	.L998
+	ALIGN_4
+
+.L100:	/* strided path: M still counts entries here, X is not biased */
+	movq	M,  I
+	sarq	$2, I	/* I = number of groups of 4 entries */
+	jle	.L105
+	ALIGN_4
+
+.L101:
+	movsd	(X), %xmm4	/* low half  <- one entry (2 scalars) */
+	addq	INCX, X
+	movhps	(X), %xmm4	/* high half <- next entry */
+	addq	INCX, X
+
+	addps	%xmm4,  %xmm0
+
+	movsd	(X), %xmm5
+	addq	INCX, X
+	movhps	(X), %xmm5
+	addq	INCX, X
+
+	addps	%xmm5,  %xmm1
+
+	decq	I
+	jg	.L101
+	ALIGN_4
+
+.L105:
+#ifdef movsd
+	xorps	%xmm4, %xmm4	/* keep the upper half of xmm4 zero for the one-entry loads in .L106 */
+#endif
+	andq	$3,  M	/* leftover entries (0..3) */
+	jle	.L998	/* and clears OF and the result is non-negative, so jle behaves as je here */
+	ALIGN_4
+
+.L106:
+	movsd	(X), %xmm4	/* one entry per iteration */
+	addps	%xmm4, %xmm0
+	addq	INCX, X
+	decq	M
+	jg	.L106
+	ALIGN_4
+
+.L998:	/* fold the four vector accumulators into xmm0, then reduce its four lanes to lane 0 */
+	addps	%xmm1, %xmm0
+	addps	%xmm3, %xmm2
+	addps	%xmm2, %xmm0
+
+#ifndef HAVE_SSE3
+	movhlps	%xmm0, %xmm1	/* xmm1 low half <- xmm0 high half */
+	addps	%xmm1, %xmm0	/* lanes 0,1 now hold the two partial sums */
+
+	movaps	%xmm0, %xmm1
+	shufps  $1, %xmm0, %xmm0	/* move lane 1 into lane 0 */
+	addss	 %xmm1, %xmm0	/* lane 0 = lane 0 + lane 1 */
+#else
+	haddps	%xmm0, %xmm0	/* two horizontal adds leave the full sum in every lane */
+	haddps	%xmm0, %xmm0
+#endif
+	ALIGN_4
+
+.L999:
+	RESTOREREGISTERS
+
+	ret
+
+	EPILOGUE
diff --git a/kernel/x86_64/zsum_sse2.S b/kernel/x86_64/zsum_sse2.S
new file mode 100644
index 000000000..6f667164d
--- /dev/null
+++ b/kernel/x86_64/zsum_sse2.S
@@ -0,0 +1,283 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define M	ARG1	/* rdi */
+#define X	ARG2	/* rsi */
+#define INCX	ARG3	/* rdx */
+
+#define I	%rax	/* trip counter for the unrolled loops */
+
+#include "l1param.h"
+
+	PROLOGUE	/* Plain (no absolute value) sum of all double-precision scalars in M two-scalar entries at X; result is left in the low lane of %xmm0 */
+	PROFCODE
+
+	SAVEREGISTERS
+
+	xorps	%xmm0, %xmm0	/* accumulator 0 = 0; also the value returned on the early exits below */
+	testq	M, M
+	jle	.L999	/* M <= 0: return 0 */
+	testq	INCX, INCX
+	jle	.L999	/* INCX <= 0: return 0 */
+
+	xorps	%xmm1, %xmm1	/* clear accumulators 1..3 */
+	xorps	%xmm2, %xmm2
+	xorps	%xmm3, %xmm3
+
+	salq	$ZBASE_SHIFT, INCX	/* presumably converts the entry stride to bytes - confirm ZBASE_SHIFT in common.h */
+
+	cmpq	$2 * SIZE, INCX
+	jne	.L40	/* stride is not 2*SIZE bytes: use the strided path */
+
+	subq	$-16 * SIZE, X	/* bias X by +16*SIZE so the loads below can address from -16*SIZE(X) */
+	addq	M, M	/* M *= 2: from here on M counts scalars, not entries */
+
+	testq	$SIZE, X	/* address bit SIZE set: peel one scalar, presumably so the movaps loads below are 16-byte aligned */
+	je	.L05
+
+#ifdef movsd
+	xorps	%xmm0, %xmm0	/* only when movsd is a macro; presumably its replacement leaves the upper lane untouched, so clear first - confirm */
+#endif
+	movsd	-16 * SIZE(X), %xmm0
+	addq	$SIZE, X
+
+	subq	$1, M
+	jle	.L999	/* NOTE(review): skips the .L998 reduction, but M >= 2 after the doubling above, so this branch looks unreachable */
+	ALIGN_3
+
+.L05:
+	movq	M,  I
+	sarq	$4, I	/* I = number of 16-scalar blocks */
+	jle	.L20
+
+	movaps	-16 * SIZE(X), %xmm4	/* preload the first block (8 registers x 2 scalars) */
+	movaps	-14 * SIZE(X), %xmm5
+	movaps	-12 * SIZE(X), %xmm6
+	movaps	-10 * SIZE(X), %xmm7
+
+	movaps	 -8 * SIZE(X), %xmm8
+	movaps	 -6 * SIZE(X), %xmm9
+	movaps	 -4 * SIZE(X), %xmm10
+	movaps	 -2 * SIZE(X), %xmm11
+
+	decq	I
+	jle	.L11	/* only one block: nothing to overlap, just drain it */
+	ALIGN_4
+
+.L10:	/* main loop: accumulate the previously loaded block while loading the next one */
+#ifdef PREFETCH
+	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(X)
+#endif
+
+	addpd	%xmm4,  %xmm0
+	movaps	  0 * SIZE(X), %xmm4
+
+	addpd	%xmm5,  %xmm1
+	movaps	  2 * SIZE(X), %xmm5
+
+	addpd	%xmm6,  %xmm2
+	movaps	  4 * SIZE(X), %xmm6
+
+	addpd	%xmm7,  %xmm3
+	movaps	  6 * SIZE(X), %xmm7
+
+#if defined(PREFETCH) && !defined(FETCH128)
+	PREFETCH (PREFETCHSIZE +  64) - PREOFFSET(X)
+#endif
+
+	addpd	%xmm8,  %xmm0
+	movaps	  8 * SIZE(X), %xmm8
+
+	addpd	%xmm9,  %xmm1
+	movaps	 10 * SIZE(X), %xmm9
+
+	addpd	%xmm10, %xmm2
+	movaps	 12 * SIZE(X), %xmm10
+
+	addpd	%xmm11, %xmm3
+	movaps	 14 * SIZE(X), %xmm11
+
+	subq	$-16 * SIZE, X	/* advance X by one block */
+	decq	I
+	jg	.L10
+	ALIGN_4
+
+.L11:	/* drain: accumulate the last block that was loaded but not yet added */
+
+	addpd	%xmm4,  %xmm0
+	addpd	%xmm5,  %xmm1
+	addpd	%xmm6,  %xmm2
+	addpd	%xmm7,  %xmm3
+
+	addpd	%xmm8,  %xmm0
+	addpd	%xmm9,  %xmm1
+	addpd	%xmm10, %xmm2
+	addpd	%xmm11, %xmm3
+
+	subq	$-16 * SIZE, X	/* step past the drained block */
+	ALIGN_3
+
+.L20:
+	andq	$15,  M	/* scalars left beyond the full 16-scalar blocks */
+	jle	.L998	/* and clears OF and the result is non-negative, so jle behaves as je here */
+
+	testq	$8, M	/* remainder bit 8: add 8 scalars */
+	je	.L21
+
+	movaps	-16 * SIZE(X), %xmm4
+	movaps	-14 * SIZE(X), %xmm5
+	movaps	-12 * SIZE(X), %xmm6
+	movaps	-10 * SIZE(X), %xmm7
+
+	addpd	%xmm4, %xmm0
+	addpd	%xmm5, %xmm1
+	addpd	%xmm6, %xmm2
+	addpd	%xmm7, %xmm3
+	addq	$8 * SIZE, X
+	ALIGN_3
+
+.L21:
+	testq	$4, M	/* remainder bit 4: add 4 scalars */
+	je	.L22
+
+	movaps	-16 * SIZE(X), %xmm4
+	movaps	-14 * SIZE(X), %xmm5
+
+	addpd	%xmm4, %xmm0
+	addpd	%xmm5, %xmm1
+
+	addq	$4 * SIZE, X
+	ALIGN_3
+
+.L22:
+	testq	$2, M	/* remainder bit 2: add 2 scalars */
+	je	.L23
+
+	movaps	-16 * SIZE(X), %xmm6
+	addpd	%xmm6, %xmm3
+	addq	$2 * SIZE, X
+
+.L23:
+	testq	$1, M	/* remainder bit 1: add the final scalar */
+	je	.L998
+
+#ifdef movsd
+	xorps	%xmm4, %xmm4	/* clear before a macro-replaced movsd, as at the alignment peel above */
+#endif
+	movsd	-16 * SIZE(X), %xmm4
+	addsd	%xmm4, %xmm0
+	jmp	.L998
+	ALIGN_3
+
+
+.L40:	/* strided path: M still counts entries here, X is not biased */
+	movq	M,  I
+	sarq	$2, I	/* I = number of groups of 4 entries */
+	jle	.L60
+	ALIGN_4
+
+.L50:
+#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
+	prefetcht0	PREFETCHSIZE * SIZE(X)
+#endif
+
+#ifdef PENTIUM4
+	prefetchnta	PREFETCHSIZE * SIZE(X)
+#endif
+
+	movsd	0 * SIZE(X), %xmm4	/* low lane  <- first scalar of the entry */
+	movhpd	1 * SIZE(X), %xmm4	/* high lane <- second scalar of the entry */
+	addq	INCX, X
+	addpd	%xmm4, %xmm0
+
+	movsd	0 * SIZE(X), %xmm5
+	movhpd	1 * SIZE(X), %xmm5
+	addq	INCX, X
+	addpd	%xmm5, %xmm1
+
+	movsd	0 * SIZE(X), %xmm6
+	movhpd	1 * SIZE(X), %xmm6
+	addq	INCX, X
+	addpd	%xmm6, %xmm2
+
+	movsd	0 * SIZE(X), %xmm7
+	movhpd	1 * SIZE(X), %xmm7
+	addq	INCX, X
+	addpd	%xmm7, %xmm3
+
+	decq	I
+	jg	.L50
+	ALIGN_4
+
+.L60:
+	andq	$3,  M	/* leftover entries (0..3) */
+	jle	.L998	/* and clears OF and the result is non-negative, so jle behaves as je here */
+	ALIGN_4
+
+
+.L61:
+	movsd	0 * SIZE(X), %xmm4	/* one entry per iteration */
+	movhpd	1 * SIZE(X), %xmm4
+	addpd	%xmm4, %xmm0
+	addq	INCX, X
+	decq	M
+	jg	.L61
+	ALIGN_4
+
+.L998:	/* fold the four vector accumulators into xmm0, then reduce its two lanes to lane 0 */
+	addpd	%xmm1, %xmm0
+	addpd	%xmm3, %xmm2
+	addpd	%xmm2, %xmm0
+
+#ifndef HAVE_SSE3
+	movhlps	%xmm0, %xmm1	/* xmm1 low lane <- xmm0 high lane */
+	addsd	%xmm1, %xmm0	/* lane 0 = lane 0 + lane 1 */
+#else
+	haddpd	%xmm0, %xmm0	/* horizontal add leaves the full sum in both lanes */
+#endif
+	ALIGN_4
+
+.L999:
+	RESTOREREGISTERS
+
+	ret
+
+	EPILOGUE
diff --git a/kernel/zarch/KERNEL.Z13 b/kernel/zarch/KERNEL.Z13
index 3bcc32197..fe82d81e6 100644
--- a/kernel/zarch/KERNEL.Z13
+++ b/kernel/zarch/KERNEL.Z13
@@ -35,10 +35,10 @@ DASUMKERNEL  = dasum.c
 CASUMKERNEL  = ../arm/zasum.c
 ZASUMKERNEL  = zasum.c
 
-SSUMKERNEL  = ../arm/asum.c
-DSUMKERNEL  = dasum.c
-CSUMKERNEL  = ../arm/zasum.c
-ZSUMKERNEL  = zasum.c
+SSUMKERNEL  = ../arm/sum.c
+DSUMKERNEL  = dsum.c
+CSUMKERNEL  = ../arm/zsum.c
+ZSUMKERNEL  = zsum.c
 
 SAXPYKERNEL  = ../arm/axpy.c
 DAXPYKERNEL  = daxpy.c
diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c
index d39b8447e..4160a1a76 100644
--- a/kernel/zarch/zscal.c
+++ b/kernel/zarch/zscal.c
@@ -233,9 +233,15 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 
         while (j < n1) {
 
-          temp0 = -da_i * x[i + 1];
+	  if (isnan(x[i]) || isinf(x[i]))
+		temp0 = NAN;
+	  else
+		temp0 = -da_i * x[i + 1];
           x[i + 1] = da_i * x[i];
           x[i] = temp0;
+	  if (isnan(x[i + inc_x]) || isinf(x[i + inc_x]))
+		temp1 = NAN;
+	  else
           temp1 = -da_i * x[i + 1 + inc_x];
           x[i + 1 + inc_x] = da_i * x[i + inc_x];
           x[i + inc_x] = temp1;
@@ -246,7 +252,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 
         while (j < n) {
 
-          temp0 = -da_i * x[i + 1];
+	  if (isnan(x[i]) || isinf(x[i]))
+		temp0 = NAN;
+	  else
+          	temp0 = -da_i * x[i + 1];
           x[i + 1] = da_i * x[i];
           x[i] = temp0;
           i += inc_x;
@@ -320,7 +329,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
       if (da_i == 0)
         zscal_kernel_8_zero(n1, x);
       else
-        zscal_kernel_8_zero_r(n1, alpha, x);
+        zscal_kernel_8(n1, da_r, da_i, x);
     else if (da_i == 0)
       zscal_kernel_8_zero_i(n1, alpha, x);
     else
@@ -347,7 +356,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
 
       while (j < n) {
 
-        temp0 = -da_i * x[i + 1];
+	if (isnan(x[i]) || isinf(x[i]))
+		temp0 = NAN;
+	  else
+        	temp0 = -da_i * x[i + 1];
         x[i + 1] = da_i * x[i];
         x[i] = temp0;
         i += 2;
diff --git a/lapack-netlib/CMAKE/CheckFortranTypeSizes.cmake b/lapack-netlib/CMAKE/CheckFortranTypeSizes.cmake
index 585ca26e7..17c0df80e 100644
--- a/lapack-netlib/CMAKE/CheckFortranTypeSizes.cmake
+++ b/lapack-netlib/CMAKE/CheckFortranTypeSizes.cmake
@@ -1,4 +1,4 @@
-# This module perdorms several try-compiles to determine the default integer
+# This module performs several try-compiles to determine the default integer
 # size being used by the fortran compiler
 #
 # After execution, the following variables are set.  If they are un set then
diff --git a/lapack-netlib/CMAKE/Findcodecov.cmake b/lapack-netlib/CMAKE/Findcodecov.cmake
index 384064007..93db45130 100644
--- a/lapack-netlib/CMAKE/Findcodecov.cmake
+++ b/lapack-netlib/CMAKE/Findcodecov.cmake
@@ -36,7 +36,7 @@ function(add_coverage TNAME)
 endfunction()
 
 
-# Find the reuired flags foreach language.
+# Find the required flags foreach language.
 set(CMAKE_REQUIRED_QUIET_SAVE ${CMAKE_REQUIRED_QUIET})
 set(CMAKE_REQUIRED_QUIET ${codecov_FIND_QUIETLY})
 
@@ -118,7 +118,7 @@ function (codecov_path_of_source FILE RETURN_VAR)
 
   # If expression was found, SOURCEFILE is a generator-expression for an
   # object library. Currently we found no way to call this function automatic
-  # for the referenced target, so it must be called in the directoryso of the
+  # for the referenced target, so it must be called in the directory of the
   # object library definition.
   if(NOT "${_source}" STREQUAL "")
     set(${RETURN_VAR} "" PARENT_SCOPE)
diff --git a/lapack-netlib/CMakeLists.txt b/lapack-netlib/CMakeLists.txt
index b704e72c5..f1f47ae24 100644
--- a/lapack-netlib/CMakeLists.txt
+++ b/lapack-netlib/CMakeLists.txt
@@ -1,16 +1,20 @@
-cmake_minimum_required(VERSION 2.8.12)
+cmake_minimum_required(VERSION 3.6)
 
-project(LAPACK Fortran C)
+project(LAPACK)
 
 set(LAPACK_MAJOR_VERSION 3)
-set(LAPACK_MINOR_VERSION 9)
+set(LAPACK_MINOR_VERSION 12)
 set(LAPACK_PATCH_VERSION 0)
 set(
   LAPACK_VERSION
   ${LAPACK_MAJOR_VERSION}.${LAPACK_MINOR_VERSION}.${LAPACK_PATCH_VERSION}
   )
 
-# Add the CMake directory for custon CMake modules
+# Allow setting a prefix for the library names
+set(CMAKE_STATIC_LIBRARY_PREFIX "lib${LIBRARY_PREFIX}")
+set(CMAKE_SHARED_LIBRARY_PREFIX "lib${LIBRARY_PREFIX}")
+
+# Add the CMake directory for custom CMake modules
 set(CMAKE_MODULE_PATH "${LAPACK_SOURCE_DIR}/CMAKE" ${CMAKE_MODULE_PATH})
 
 # Export all symbols on Windows when building shared libraries
@@ -41,6 +45,48 @@ if(_is_coverage_build)
   find_package(codecov)
 endif()
 
+# Use valgrind if it is found
+option( LAPACK_TESTING_USE_PYTHON "Use Python for testing. Disable it on memory checks." ON )
+find_program( MEMORYCHECK_COMMAND valgrind )
+if( MEMORYCHECK_COMMAND )
+  message( STATUS "Found valgrind: ${MEMORYCHECK_COMMAND}" )
+  set( MEMORYCHECK_COMMAND_OPTIONS "--leak-check=full --show-leak-kinds=all --track-origins=yes" )
+endif()
+
+# Optionally test Fortran compiler complex abs, division, multiplication and min/max (OFF by default)
+option(TEST_FORTRAN_COMPILER "Test Fortran compiler complex abs and complex division" OFF)
+if( TEST_FORTRAN_COMPILER )
+
+  add_executable( test_zcomplexabs ${LAPACK_SOURCE_DIR}/INSTALL/test_zcomplexabs.f )
+  add_custom_target( run_test_zcomplexabs
+    COMMAND test_zcomplexabs 2> test_zcomplexabs.err
+    WORKING_DIRECTORY ${LAPACK_BINARY_DIR}/INSTALL
+    COMMENT "Running test_zcomplexabs in ${LAPACK_BINARY_DIR}/INSTALL with stderr: test_zcomplexabs.err"
+    SOURCES ${LAPACK_SOURCE_DIR}/INSTALL/test_zcomplexabs.f )
+
+  add_executable( test_zcomplexdiv ${LAPACK_SOURCE_DIR}/INSTALL/test_zcomplexdiv.f )
+  add_custom_target( run_test_zcomplexdiv
+    COMMAND test_zcomplexdiv 2> test_zcomplexdiv.err
+    WORKING_DIRECTORY ${LAPACK_BINARY_DIR}/INSTALL
+    COMMENT "Running test_zcomplexdiv in ${LAPACK_BINARY_DIR}/INSTALL with stderr: test_zcomplexdiv.err"
+    SOURCES ${LAPACK_SOURCE_DIR}/INSTALL/test_zcomplexdiv.f )
+
+  add_executable( test_zcomplexmult ${LAPACK_SOURCE_DIR}/INSTALL/test_zcomplexmult.f )
+  add_custom_target( run_test_zcomplexmult
+    COMMAND test_zcomplexmult 2> test_zcomplexmult.err
+    WORKING_DIRECTORY ${LAPACK_BINARY_DIR}/INSTALL
+    COMMENT "Running test_zcomplexmult in ${LAPACK_BINARY_DIR}/INSTALL with stderr: test_zcomplexmult.err"
+    SOURCES ${LAPACK_SOURCE_DIR}/INSTALL/test_zcomplexmult.f )
+
+  add_executable( test_zminMax ${LAPACK_SOURCE_DIR}/INSTALL/test_zminMax.f )
+  add_custom_target( run_test_zminMax
+    COMMAND test_zminMax 2> test_zminMax.err
+    WORKING_DIRECTORY ${LAPACK_BINARY_DIR}/INSTALL
+    COMMENT "Running test_zminMax in ${LAPACK_BINARY_DIR}/INSTALL with stderr: test_zminMax.err"
+    SOURCES ${LAPACK_SOURCE_DIR}/INSTALL/test_zminMax.f )
+
+endif()
+
 # By default static library
 option(BUILD_SHARED_LIBS "Build shared libraries" OFF)
 
@@ -61,6 +107,8 @@ else()
   set(LAPACKELIB "lapacke")
   set(TMGLIB "tmglib")
 endif()
+# By default build standard API and extended _64 API
+option(BUILD_INDEX64_EXT_API "Build Index-64 API as extended API with _64 suffix" ON)
 
 include(GNUInstallDirs)
 
@@ -89,46 +137,23 @@ configure_file(
 include(PreventInSourceBuilds)
 include(PreventInBuildInstalls)
 
-if(UNIX)
-  if(CMAKE_Fortran_COMPILER_ID STREQUAL Intel)
-    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fp-model strict")
-  endif()
-  if(CMAKE_Fortran_COMPILER_ID STREQUAL XL)
-    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qnosave -qstrict=none")
-  endif()
-# Delete libmtsk in linking sequence for Sun/Oracle Fortran Compiler.
-# This library is not present in the Sun package SolarisStudio12.3-linux-x86-bin
-  string(REPLACE \;mtsk\; \; CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES "${CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES}")
-endif()
-
-if(CMAKE_Fortran_COMPILER_ID STREQUAL Compaq)
-  if(WIN32)
-    if(CMAKE_GENERATOR STREQUAL "NMake Makefiles")
-      get_filename_component(CMAKE_Fortran_COMPILER_CMDNAM ${CMAKE_Fortran_COMPILER} NAME_WE)
-      message(STATUS "Using Compaq Fortran compiler with command name ${CMAKE_Fortran_COMPILER_CMDNAM}")
-      set(cmd ${CMAKE_Fortran_COMPILER_CMDNAM})
-      string(TOLOWER "${cmd}" cmdlc)
-      if(cmdlc STREQUAL "df")
-        message(STATUS "Assume the Compaq Visual Fortran Compiler is being used")
-        set(CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1)
-        set(CMAKE_Fortran_USE_RESPONSE_FILE_FOR_INCLUDES 1)
-        #This is a workaround that is needed to avoid forward-slashes in the
-        #filenames listed in response files from incorrectly being interpreted as
-        #introducing compiler command options
-        if(${BUILD_SHARED_LIBS})
-          message(FATAL_ERROR "Making of shared libraries with CVF has not been tested.")
-        endif()
-        set(str "NMake version 9 or later should be used. NMake version 6.0 which is\n")
-        set(str "${str}   included with the CVF distribution fails to build Lapack because\n")
-        set(str "${str}   the number of source files exceeds the limit for NMake v6.0\n")
-        message(STATUS ${str})
-        set(CMAKE_Fortran_LINK_EXECUTABLE "LINK /out:<TARGET> <LINK_FLAGS> <LINK_LIBRARIES> <OBJECTS>")
-      endif()
+# Add option to enable flat namespace for symbol resolution on macOS
+if(APPLE)
+  option(USE_FLAT_NAMESPACE "Use flat namespaces for symbol resolution during build and runtime." OFF)
+
+  if(USE_FLAT_NAMESPACE)
+    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-flat_namespace")
+    set(CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS} -Wl,-flat_namespace")
+    set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-flat_namespace")
+  else()
+    if(BUILD_SHARED_LIBS AND BUILD_TESTING)
+      message(WARNING
+        "LAPACK test suite might fail with shared libraries and the default two-level namespace. "
+        "Disable shared libraries or enable flat namespace for symbol resolution via -DUSE_FLAT_NAMESPACE=ON.")
     endif()
   endif()
 endif()
 
-
 # --------------------------------------------------
 set(LAPACK_INSTALL_EXPORT_NAME ${LAPACKLIB}-targets)
 
@@ -149,13 +174,14 @@ option(BUILD_TESTING "Build tests" ${_is_coverage_build})
 include(CTest)
 message(STATUS "Build tests: ${BUILD_TESTING}")
 
-# lapack_testing.py uses features from python 2.7 and greater
 if(BUILD_TESTING)
-  set(_msg "Looking for Python >= 2.7 needed for summary tests")
+  set(_msg "Looking for Python3 needed for summary tests")
   message(STATUS "${_msg}")
-  find_package(PythonInterp 2.7 QUIET)
-  if(PYTHONINTERP_FOUND)
-    message(STATUS "${_msg} - found (${PYTHON_VERSION_STRING})")
+  # find_package(PythonInterp 3) cannot be used because /usr/bin/python may be
+  # a Python2 interpreter.
+  find_program(PYTHON_EXECUTABLE python3)
+  if(PYTHON_EXECUTABLE)
+    message(STATUS "${_msg} - found")
   else()
     message(STATUS "${_msg} - not found (skipping summary tests)")
   endif()
@@ -168,26 +194,6 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${LAPACK_BINARY_DIR}/bin)
 set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${LAPACK_BINARY_DIR}/lib)
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${LAPACK_BINARY_DIR}/lib)
 
-# --------------------------------------------------
-# Check for any necessary platform specific compiler flags
-include(CheckLAPACKCompilerFlags)
-CheckLAPACKCompilerFlags()
-
-# --------------------------------------------------
-# Check second function
-
-include(CheckTimeFunction)
-set(TIME_FUNC NONE ${TIME_FUNC})
-CHECK_TIME_FUNCTION(NONE TIME_FUNC)
-CHECK_TIME_FUNCTION(INT_CPU_TIME TIME_FUNC)
-CHECK_TIME_FUNCTION(EXT_ETIME TIME_FUNC)
-CHECK_TIME_FUNCTION(EXT_ETIME_ TIME_FUNC)
-CHECK_TIME_FUNCTION(INT_ETIME TIME_FUNC)
-message(STATUS "--> Will use second_${TIME_FUNC}.f and dsecnd_${TIME_FUNC}.f as timing function.")
-
-set(SECOND_SRC ${LAPACK_SOURCE_DIR}/INSTALL/second_${TIME_FUNC}.f)
-set(DSECOND_SRC ${LAPACK_SOURCE_DIR}/INSTALL/dsecnd_${TIME_FUNC}.f)
-
 # deprecated LAPACK and LAPACKE routines
 option(BUILD_DEPRECATED "Build deprecated routines" OFF)
 message(STATUS "Build deprecated routines: ${BUILD_DEPRECATED}")
@@ -210,6 +216,7 @@ if(NOT (BUILD_SINGLE OR BUILD_DOUBLE OR BUILD_COMPLEX OR BUILD_COMPLEX16))
   BUILD_SINGLE, BUILD_DOUBLE, BUILD_COMPLEX, BUILD_COMPLEX16.")
 endif()
 
+
 # --------------------------------------------------
 # Subdirectories that need to be processed
 option(USE_OPTIMIZED_BLAS "Whether or not to use an optimized BLAS library instead of included netlib BLAS" OFF)
@@ -279,18 +286,27 @@ endif()
 
 # Check the usage of the user provided or automatically found LAPACK libraries
 if(LAPACK_LIBRARIES)
-  include(CheckFortranFunctionExists)
-  set(CMAKE_REQUIRED_LIBRARIES ${LAPACK_LIBRARIES})
-  # Check if new routine of 3.4.0 is in LAPACK_LIBRARIES
-  CHECK_FORTRAN_FUNCTION_EXISTS("dgeqrt" LATESTLAPACK_FOUND)
-  unset(CMAKE_REQUIRED_LIBRARIES)
-  if(LATESTLAPACK_FOUND)
-    message(STATUS "--> LAPACK supplied by user is WORKING, will use ${LAPACK_LIBRARIES}.")
+  include(CheckLanguage)
+  check_language(Fortran)
+  if(CMAKE_Fortran_COMPILER)
+    enable_language(Fortran)
+    include(CheckFortranFunctionExists)
+    set(CMAKE_REQUIRED_LIBRARIES ${LAPACK_LIBRARIES})
+    # Check if new routine of 3.4.0 is in LAPACK_LIBRARIES
+    CHECK_FORTRAN_FUNCTION_EXISTS("dgeqrt" LATESTLAPACK_FOUND)
+    unset(CMAKE_REQUIRED_LIBRARIES)
+    if(LATESTLAPACK_FOUND)
+      message(STATUS "--> LAPACK supplied by user is WORKING, will use ${LAPACK_LIBRARIES}.")
+    else()
+      message(WARNING "--> LAPACK supplied by user is not WORKING or is older than LAPACK 3.4.0, CANNOT USE ${LAPACK_LIBRARIES}.\n"
+        "-->     Will use REFERENCE LAPACK (by default)\n"
+        "-->     Or Correct your LAPACK_LIBRARIES entry \n"
+        "-->     Or Consider checking USE_OPTIMIZED_LAPACK"
+      ) # one WARNING: "ERROR" is not a message() mode keyword and was printed as plain text; configuration continues with the reference LAPACK
+    endif()
   else()
-    message(ERROR "--> LAPACK supplied by user is not WORKING or is older than LAPACK 3.4.0, CANNOT USE ${LAPACK_LIBRARIES}.")
-    message(ERROR "-->     Will use REFERENCE LAPACK (by default)")
-    message(ERROR "-->     Or Correct your LAPACK_LIBRARIES entry ")
-    message(ERROR "-->     Or Consider checking USE_OPTIMIZED_LAPACK")
+    message(STATUS "--> LAPACK supplied by user is ${LAPACK_LIBRARIES}.")
+    message(STATUS "--> CMake couldn't find a Fortran compiler, so it cannot check if the provided LAPACK library works.")
+    set(LATESTLAPACK_FOUND TRUE)
   endif()
 endif()
 
@@ -298,6 +314,27 @@ endif()
 if(NOT LATESTLAPACK_FOUND)
   message(STATUS "Using supplied NETLIB LAPACK implementation")
   set(LAPACK_LIBRARIES ${LAPACKLIB})
+
+  enable_language(Fortran)
+
+  # Check for any necessary platform specific compiler flags
+  include(CheckLAPACKCompilerFlags)
+  CheckLAPACKCompilerFlags()
+
+  # Check second function
+  include(CheckTimeFunction)
+  set(TIME_FUNC NONE)
+  CHECK_TIME_FUNCTION(NONE TIME_FUNC)
+  CHECK_TIME_FUNCTION(INT_CPU_TIME TIME_FUNC)
+  CHECK_TIME_FUNCTION(EXT_ETIME TIME_FUNC)
+  CHECK_TIME_FUNCTION(EXT_ETIME_ TIME_FUNC)
+  CHECK_TIME_FUNCTION(INT_ETIME TIME_FUNC)
+
+  # Set second function
+  message(STATUS "--> Will use second_${TIME_FUNC}.f and dsecnd_${TIME_FUNC}.f as timing function.")
+  set(SECOND_SRC ${LAPACK_SOURCE_DIR}/INSTALL/second_${TIME_FUNC}.f)
+  set(DSECOND_SRC ${LAPACK_SOURCE_DIR}/INSTALL/dsecnd_${TIME_FUNC}.f)
+
   add_subdirectory(SRC)
 else()
   set(CMAKE_EXE_LINKER_FLAGS
@@ -325,35 +362,88 @@ option(LAPACKE_WITH_TMG "Build LAPACKE with tmglib routines" OFF)
 if(LAPACKE_WITH_TMG)
   set(LAPACKE ON)
 endif()
-if(BUILD_TESTING OR LAPACKE_WITH_TMG) #already included, avoid double inclusion
+
+# TMGLIB
+# Cache export target
+set(LAPACK_INSTALL_EXPORT_NAME_CACHE ${LAPACK_INSTALL_EXPORT_NAME})
+if(BUILD_TESTING OR LAPACKE_WITH_TMG)
+  enable_language(Fortran)
+  if(LATESTLAPACK_FOUND AND LAPACKE_WITH_TMG)
+    set(CMAKE_REQUIRED_LIBRARIES ${LAPACK_LIBRARIES})
+    # Check if dlatms (part of tmg) is found
+    include(CheckFortranFunctionExists)
+    CHECK_FORTRAN_FUNCTION_EXISTS("dlatms" LAPACK_WITH_TMGLIB_FOUND)
+    unset(CMAKE_REQUIRED_LIBRARIES)
+    if(NOT LAPACK_WITH_TMGLIB_FOUND)
+      # Build and install TMG as part of LAPACKE targets (as opposed to LAPACK
+      # targets)
+      set(LAPACK_INSTALL_EXPORT_NAME ${LAPACKELIB}-targets)
+    endif()
+  endif()
   add_subdirectory(TESTING/MATGEN)
 endif()
+# Reset export target
+set(LAPACK_INSTALL_EXPORT_NAME ${LAPACK_INSTALL_EXPORT_NAME_CACHE})
+unset(LAPACK_INSTALL_EXPORT_NAME_CACHE)
+
+
+#-------------------------------------
+# LAPACKE
+# Include lapack.h and lapacke_mangling.h even if LAPACKE is not built
+add_subdirectory(LAPACKE/include)
 
 if(LAPACKE)
   add_subdirectory(LAPACKE)
 endif()
 
+
 #-------------------------------------
 # BLAS++ / LAPACK++
 option(BLAS++ "Build BLAS++" OFF)
 option(LAPACK++ "Build LAPACK++" OFF)
- 
- 
+
+
 function(_display_cpp_implementation_msg name)
   string(TOLOWER ${name} name_lc)
   message(STATUS "${name}++ enable")
   message(STATUS "----------------")
   message(STATUS "Thank you for your interest in ${name}++, a newly developed C++ API for ${name} library")
   message(STATUS "The objective of ${name}++ is to provide a convenient, performance oriented API for development in the C++ language, that, for the most part, preserves established conventions, while, at the same time, takes advantages of modern C++ features, such as: namespaces, templates, exceptions, etc.")
-  message(STATUS "We are still working on integrating ${name}++ in our library. For the moment, you can download directly ${name_lc}++ from https://bitbucket.org/icl/${name_lc}pp")
   message(STATUS "For support ${name}++ related question, please email: slate-user@icl.utk.edu")
   message(STATUS "----------------")
 endfunction()
-if(BLAS++)
+if (BLAS++)
   _display_cpp_implementation_msg("BLAS")
+  include(ExternalProject)
+  ExternalProject_Add(blaspp
+	  URL https://bitbucket.org/icl/blaspp/downloads/blaspp-2020.10.02.tar.gz
+	  CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${CMAKE_BINARY_DIR}/lib LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR} -DCMAKE_INSTALL_LIBDIR=lib -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} ${PROJECT_BINARY_DIR}/blaspp-prefix/src/blaspp
+	  BUILD_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" ${CMAKE_COMMAND} --build .
+	  INSTALL_COMMAND ${CMAKE_COMMAND} -E env PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" ${CMAKE_COMMAND} --install .
+	  )
+  ExternalProject_Add_StepDependencies(blaspp build ${BLAS_LIBRARIES})
 endif()
-if(LAPACK++)
+if (LAPACK++)
+	message (STATUS "linking lapack++ against ${LAPACK_LIBRARIES}")
   _display_cpp_implementation_msg("LAPACK")
+  include(ExternalProject)
+  if (BUILD_SHARED_LIBS)
+  ExternalProject_Add(lapackpp
+	  URL https://bitbucket.org/icl/lapackpp/downloads/lapackpp-2020.10.02.tar.gz
+	  CONFIGURE_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${CMAKE_BINARY_DIR}/lib LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR} -DCMAKE_INSTALL_LIBDIR=lib -DLAPACK_LIBRARIES=${LAPACK_LIBRARIES} -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}  ${PROJECT_BINARY_DIR}/lapackpp-prefix/src/lapackpp
+	  BUILD_COMMAND ${CMAKE_COMMAND} -E env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" ${CMAKE_COMMAND} --build .
+	  INSTALL_COMMAND ${CMAKE_COMMAND} -E env PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" ${CMAKE_COMMAND} --install .
+	  )
+  else ()
+# FIXME this does not really work as the libraries list gets converted to a semicolon-separated list somewhere in the lapack++ build files
+  ExternalProject_Add(lapackpp
+	  URL https://bitbucket.org/icl/lapackpp/downloads/lapackpp-2020.10.02.tar.gz
+	  CONFIGURE_COMMAND env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${CMAKE_BINARY_DIR}/lib LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_BINARY_DIR} -DCMAKE_INSTALL_LIBDIR=lib -DLAPACK_LIBRARIES="${PROJECT_BINARY_DIR}/lib/liblapack.a -lgfortran"  -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS}  ${PROJECT_BINARY_DIR}/lapackpp-prefix/src/lapackpp
+	  BUILD_COMMAND env LIBRARY_PATH=$ENV{LIBRARY_PATH}:${PROJECT_BINARY_DIR}/lib LIB_SUFFIX="" ${CMAKE_COMMAND} --build .
+	  INSTALL_COMMAND ${CMAKE_COMMAND} -E env PREFIX=${PROJECT_BINARY_DIR} LIB_SUFFIX="" ${CMAKE_COMMAND} --install .
+  )
+  endif()
+  ExternalProject_Add_StepDependencies(lapackpp build blaspp ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES})
 endif()
 
 # --------------------------------------------------
@@ -370,7 +460,7 @@ set(CPACK_MONOLITHIC_INSTALL ON)
 set(CPACK_PACKAGE_INSTALL_DIRECTORY "LAPACK")
 if(WIN32 AND NOT UNIX)
   # There is a bug in NSI that does not handle full unix paths properly. Make
-  # sure there is at least one set of four (4) backlasshes.
+  # sure there is at least one set of four (4) backslashes.
   set(CPACK_NSIS_HELP_LINK "http:\\\\\\\\http://icl.cs.utk.edu/lapack-forum")
   set(CPACK_NSIS_URL_INFO_ABOUT "http:\\\\\\\\www.netlib.org/lapack")
   set(CPACK_NSIS_CONTACT "lapack@eecs.utk.edu")
@@ -396,10 +486,6 @@ if(NOT LATESTLAPACK_FOUND)
   set(ALL_TARGETS ${ALL_TARGETS} ${LAPACKLIB})
 endif()
 
-if(BUILD_TESTING OR LAPACKE_WITH_TMG)
-  set(ALL_TARGETS ${ALL_TARGETS} ${TMGLIB})
-endif()
-
 # Export lapack targets, not including lapacke, from the
 # install tree, if any.
 set(_lapack_config_install_guard_target "")
@@ -424,6 +510,10 @@ if(LAPACKE)
   set(ALL_TARGETS ${ALL_TARGETS} ${LAPACKELIB})
 endif()
 
+if(NOT LAPACK_WITH_TMGLIB_FOUND AND LAPACKE_WITH_TMG)
+  set(ALL_TARGETS ${ALL_TARGETS} ${TMGLIB})
+endif()
+
 # Export lapack and lapacke targets from the build tree, if any.
 set(_lapack_config_build_guard_target "")
 if(ALL_TARGETS)
@@ -461,4 +551,142 @@ install(FILES
   DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/${LAPACKLIB}-${LAPACK_VERSION}
   COMPONENT Development
   )
-  
+if (LAPACK++)
+  install(  # install liblapackpp from the build tree's lib/ directory
+  DIRECTORY "${LAPACK_BINARY_DIR}/lib/"
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  FILES_MATCHING REGEX "liblapackpp\\.(a|so)$"  # dot escaped: match a literal ".a"/".so" suffix only
+  )
+  install(
+    DIRECTORY "${PROJECT_BINARY_DIR}/lapackpp-prefix/src/lapackpp/include/"
+    DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
+    FILES_MATCHING REGEX "\\.(h|hh)$"
+  )
+  write_basic_package_version_file(
+    "lapackppConfigVersion.cmake"
+    VERSION 2020.10.02
+    COMPATIBILITY AnyNewerVersion
+  )
+  install(
+    FILES "${CMAKE_CURRENT_BINARY_DIR}/lib/lapackpp/lapackppConfig.cmake"
+          "${CMAKE_CURRENT_BINARY_DIR}/lib/lapackpp/lapackppConfigVersion.cmake"
+    DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/"
+  )
+
+endif()
+if (BLAS++)
+  write_basic_package_version_file(
+    "blasppConfigVersion.cmake"
+    VERSION 2020.10.02
+    COMPATIBILITY AnyNewerVersion
+  )
+  install(
+    FILES "${CMAKE_CURRENT_BINARY_DIR}/lib/blaspp/blasppConfig.cmake"
+          "${CMAKE_CURRENT_BINARY_DIR}/lib/blaspp/blasppConfigVersion.cmake"
+    DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/"
+  )
+  install(  # install libblaspp from the build tree's lib/ directory
+  DIRECTORY "${LAPACK_BINARY_DIR}/lib/"
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  FILES_MATCHING REGEX "libblaspp\\.(a|so)$"  # dot escaped: match a literal ".a"/".so" suffix only
+  )
+  install(
+    DIRECTORY "${PROJECT_BINARY_DIR}/blaspp-prefix/src/blaspp/include/"
+    DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
+    FILES_MATCHING REGEX "\\.(h|hh)$"
+  )
+endif()
+
+# --------------------------------------------------
+# Generate MAN and/or HTML Documentation
+option(BUILD_HTML_DOCUMENTATION "Create and install the HTML based API
+documentation (requires Doxygen) - command: make html" OFF)
+option(BUILD_MAN_DOCUMENTATION "Create and install the MAN based documentation (requires Doxygen) - command: make man" OFF)
+message(STATUS "Build html documentation: ${BUILD_HTML_DOCUMENTATION}")
+message(STATUS "Build man documentation: ${BUILD_MAN_DOCUMENTATION}")
+
+if(BUILD_HTML_DOCUMENTATION OR BUILD_MAN_DOCUMENTATION)
+    find_package(Doxygen)
+    if(NOT DOXYGEN_FOUND)
+         message(WARNING "Doxygen is needed to build the documentation.")
+
+    else()
+
+    set(DOXYGEN_PROJECT_BRIEF "LAPACK: Linear Algebra PACKage")
+    set(DOXYGEN_PROJECT_NUMBER ${LAPACK_VERSION})
+    set(DOXYGEN_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/DOCS)
+    set(DOXYGEN_PROJECT_LOGO ${CMAKE_CURRENT_SOURCE_DIR}/DOCS/lapack.png)
+    set(DOXYGEN_OPTIMIZE_FOR_FORTRAN YES)
+    set(DOXYGEN_SOURCE_BROWSER YES)
+    set(DOXYGEN_CREATE_SUBDIRS YES)
+    set(DOXYGEN_SEPARATE_MEMBER_PAGES YES)
+    set(DOXYGEN_EXTRACT_ALL YES)
+    set(DOXYGEN_FILE_PATTERNS *.f *.f90 *.c *.h )
+    set(DOXYGEN_RECURSIVE YES)
+    set(DOXYGEN_GENERATE_TREEVIEW YES)
+    set(DOXYGEN_DOT_IMAGE_FORMAT svg)
+    set(DOXYGEN_INTERACTIVE_SVG YES)
+    set(DOXYGEN_QUIET NO)
+    set(DOXYGEN_WARNINGS YES)
+    set(DOXYGEN_WARN_NO_PARAMDOC YES)
+    set(DOXYGEN_WARN_LOGFILE doxygen_error)
+    set(DOXYGEN_GENERATE_HTML NO)
+    set(DOXYGEN_GENERATE_MAN NO)
+    set(DOXYGEN_LAYOUT_FILE "DOCS/DoxygenLayout.xml")
+
+    # Exclude functions that are duplicated, creating conflicts.
+    set(DOXYGEN_EXCLUDE .git
+                        .github
+                        SRC/VARIANTS
+                        BLAS/SRC/lsame.f
+                        BLAS/SRC/xerbla.f
+                        BLAS/SRC/xerbla_array.f
+                        INSTALL/slamchf77.f
+                        INSTALL/dlamchf77.f )
+
+    if (BUILD_HTML_DOCUMENTATION)
+        set(DOXYGEN_GENERATE_HTML YES)
+        set(DOXYGEN_HTML_OUTPUT explore-html)
+        set(DOXYGEN_INLINE_SOURCES YES)
+        set(DOXYGEN_CALL_GRAPH YES)
+        set(DOXYGEN_CALLER_GRAPH YES)
+
+        doxygen_add_docs(
+            html
+
+            # Doxygen INPUT =
+            ${PROJECT_SOURCE_DIR}/README.md
+            ${PROJECT_SOURCE_DIR}/BLAS
+            ${PROJECT_SOURCE_DIR}/CBLAS
+            ${PROJECT_SOURCE_DIR}/SRC
+            ${PROJECT_SOURCE_DIR}/INSTALL
+            ${PROJECT_SOURCE_DIR}/TESTING
+            ${PROJECT_SOURCE_DIR}/DOCS/groups-usr.dox
+
+            COMMENT "Generating html LAPACK documentation (it will take some time... time to grab a coffee)"
+        )
+    endif()
+    if (BUILD_MAN_DOCUMENTATION)
+        set(DOXYGEN_GENERATE_MAN YES)
+        set(DOXYGEN_MAN_LINKS YES)
+        set(DOXYGEN_INLINE_SOURCES NO)
+        set(DOXYGEN_CALL_GRAPH NO)
+        set(DOXYGEN_CALLER_GRAPH NO)
+
+        doxygen_add_docs(
+            man
+
+            # Doxygen INPUT =
+            ${PROJECT_SOURCE_DIR}/BLAS
+            ${PROJECT_SOURCE_DIR}/CBLAS
+            ${PROJECT_SOURCE_DIR}/SRC
+            ${PROJECT_SOURCE_DIR}/INSTALL
+            ${PROJECT_SOURCE_DIR}/TESTING
+            ${PROJECT_SOURCE_DIR}/DOCS/groups-usr.dox
+
+            COMMENT "Generating man LAPACK documentation"
+        )
+    endif()
+
+    endif()
+endif()
diff --git a/lapack-netlib/DOCS/Doxyfile b/lapack-netlib/DOCS/Doxyfile
index 43cea43b5..577675772 100644
--- a/lapack-netlib/DOCS/Doxyfile
+++ b/lapack-netlib/DOCS/Doxyfile
@@ -1,4 +1,4 @@
-# Doxyfile 1.8.10
+# Doxyfile 1.9.1
 
 # This file describes the settings to be used by the documentation system
 # doxygen (www.doxygen.org) for a project.
@@ -17,11 +17,11 @@
 # Project related configuration options
 #---------------------------------------------------------------------------
 
-# This tag specifies the encoding used for all characters in the config file
-# that follow. The default is UTF-8 which is also the encoding used for all text
-# before the first occurrence of this tag. Doxygen uses libiconv (or the iconv
-# built into libc) for the transcoding. See http://www.gnu.org/software/libiconv
-# for the list of possible encodings.
+# This tag specifies the encoding used for all characters in the configuration
+# file that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# https://www.gnu.org/software/libiconv/ for the list of possible encodings.
 # The default value is: UTF-8.
 
 DOXYFILE_ENCODING      = UTF-8
@@ -38,7 +38,7 @@ PROJECT_NAME           = LAPACK
 # could be handy for archiving the generated documentation or if some version
 # control system is used.
 
-PROJECT_NUMBER         = 3.9.0
+PROJECT_NUMBER         = 3.12.0
 
 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
@@ -93,6 +93,14 @@ ALLOW_UNICODE_NAMES    = NO
 
 OUTPUT_LANGUAGE        = English
 
+# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all generated output in the proper direction.
+# Possible values are: None, LTR, RTL and Context.
+# The default value is: None.
+
+OUTPUT_TEXT_DIRECTION  = None
+
 # If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member
 # descriptions after the members that are listed in the file and class
 # documentation (similar to Javadoc). Set to NO to disable this.
@@ -179,6 +187,16 @@ SHORT_NAMES            = NO
 
 JAVADOC_AUTOBRIEF      = NO
 
+# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line
+# such as
+# /***************
+# as being the beginning of a Javadoc-style comment "banner". If set to NO, the
+# Javadoc-style will behave just like regular comments and it will not be
+# interpreted by doxygen.
+# The default value is: NO.
+
+JAVADOC_BANNER         = NO
+
 # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first
 # line (until the first dot) of a Qt-style comment as the brief description. If
 # set to NO, the Qt-style will behave just like regular Qt-style comments (thus
@@ -199,6 +217,14 @@ QT_AUTOBRIEF           = NO
 
 MULTILINE_CPP_IS_BRIEF = NO
 
+# By default Python docstrings are displayed as preformatted text and doxygen's
+# special commands cannot be used. By setting PYTHON_DOCSTRING to NO the
+# doxygen's special commands can be used and the contents of the docstring
+# documentation blocks is shown as doxygen documentation.
+# The default value is: YES.
+
+PYTHON_DOCSTRING       = YES
+
 # If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the
 # documentation from any documented member that it re-implements.
 # The default value is: YES.
@@ -226,16 +252,15 @@ TAB_SIZE               = 8
 # will allow you to put the command \sideeffect (or @sideeffect) in the
 # documentation, which will result in a user-defined paragraph with heading
 # "Side Effects:". You can put \n's in the value part of an alias to insert
-# newlines.
+# newlines (in the resulting output). You can put ^^ in the value part of an
+# alias to insert a newline as if a physical newline was in the original file.
+# When you need a literal { or } or , in the value part of an alias you have to
+# escape them by means of a backslash (\), this can lead to conflicts with the
+# commands \{ and \} for these it is advised to use the version @{ and @} or use
+# a double escape (\\{ and \\})
 
 ALIASES                =
 
-# This tag can be used to specify a number of word-keyword mappings (TCL only).
-# A mapping has the form "name=value". For example adding "class=itcl::class"
-# will allow you to use the command class in the itcl::class meaning.
-
-TCL_SUBST              =
-
 # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
 # only. Doxygen will then generate output that is more tailored for C. For
 # instance, some of the names that are used will be different. The list of all
@@ -264,28 +289,40 @@ OPTIMIZE_FOR_FORTRAN   = YES
 
 OPTIMIZE_OUTPUT_VHDL   = NO
 
+# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice
+# sources only. Doxygen will then generate output that is more tailored for that
+# language. For instance, namespaces will be presented as modules, types will be
+# separated into more groups, etc.
+# The default value is: NO.
+
+OPTIMIZE_OUTPUT_SLICE  = NO
+
 # Doxygen selects the parser to use depending on the extension of the files it
 # parses. With this tag you can assign which parser to use for a given
 # extension. Doxygen has a built-in mapping, but you can override or extend it
 # using this tag. The format is ext=language, where ext is a file extension, and
-# language is one of the parsers supported by doxygen: IDL, Java, Javascript,
-# C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran:
-# FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran:
-# Fortran. In the later case the parser tries to guess whether the code is fixed
-# or free formatted code, this is the default for Fortran type files), VHDL. For
-# instance to make doxygen treat .inc files as Fortran files (default is PHP),
-# and .f files as C (default is Fortran), use: inc=Fortran f=C.
+# language is one of the parsers supported by doxygen: IDL, Java, JavaScript,
+# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice, VHDL,
+# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran:
+# FortranFree, unknown formatted Fortran: Fortran. In the latter case the parser
+# tries to guess whether the code is fixed or free formatted code, this is the
+# default for Fortran type files). For instance to make doxygen treat .inc files
+# as Fortran files (default is PHP), and .f files as C (default is Fortran),
+# use: inc=Fortran f=C.
 #
 # Note: For files without extension you can use no_extension as a placeholder.
 #
 # Note that for custom extensions you also need to set FILE_PATTERNS otherwise
-# the files are not read by doxygen.
+# the files are not read by doxygen. When specifying no_extension you should add
+# * to the FILE_PATTERNS.
+#
+# Note see also the list of default file extension mappings.
 
 EXTENSION_MAPPING      =
 
 # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments
 # according to the Markdown format, which allows for more readable
-# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# documentation. See https://daringfireball.net/projects/markdown/ for details.
 # The output of markdown processing is further processed by doxygen, so you can
 # mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in
 # case of backward compatibilities issues.
@@ -293,6 +330,15 @@ EXTENSION_MAPPING      =
 
 MARKDOWN_SUPPORT       = YES
 
+# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up
+# to that level are automatically included in the table of contents, even if
+# they do not have an id attribute.
+# Note: This feature currently applies only to Markdown headings.
+# Minimum value: 0, maximum value: 99, default value: 5.
+# This tag requires that the tag MARKDOWN_SUPPORT is set to YES.
+
+TOC_INCLUDE_HEADINGS   = 5
+
 # When enabled doxygen tries to link words that correspond to documented
 # classes, or namespaces to their corresponding documentation. Such a link can
 # be prevented in individual cases by putting a % sign in front of the word or
@@ -318,7 +364,7 @@ BUILTIN_STL_SUPPORT    = NO
 CPP_CLI_SUPPORT        = NO
 
 # Set the SIP_SUPPORT tag to YES if your project consists of sip (see:
-# http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen
+# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen
 # will parse them like normal C++ but will assume all classes use public instead
 # of private inheritance when no explicit protection keyword is present.
 # The default value is: NO.
@@ -341,7 +387,7 @@ IDL_PROPERTY_SUPPORT   = YES
 # all members of a group must be documented explicitly.
 # The default value is: NO.
 
-DISTRIBUTE_GROUP_DOC   = YES
+DISTRIBUTE_GROUP_DOC   = NO
 
 # If one adds a struct or class to a group and this option is enabled, then also
 # any nested class or struct is added to the same group. By default this option
@@ -404,6 +450,19 @@ TYPEDEF_HIDES_STRUCT   = NO
 
 LOOKUP_CACHE_SIZE      = 0
 
+# The NUM_PROC_THREADS specifies the number of threads doxygen is allowed to use
+# during processing. When set to 0 doxygen will base this on the number of
+# cores available in the system. You can set it explicitly to a value larger
+# than 0 to get more control over the balance between CPU load and processing
+# speed. At this moment only the input processing can be done using multiple
+# threads. Since this is still an experimental feature the default is set to 1,
+# which effectively disables parallel processing. Please report any issues you
+# encounter. Generating dot graphs in parallel is controlled by the
+# DOT_NUM_THREADS setting.
+# Minimum value: 0, maximum value: 32, default value: 1.
+
+NUM_PROC_THREADS       = 1
+
 #---------------------------------------------------------------------------
 # Build related configuration options
 #---------------------------------------------------------------------------
@@ -424,6 +483,12 @@ EXTRACT_ALL            = YES
 
 EXTRACT_PRIVATE        = NO
 
+# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual
+# methods of a class will be included in the documentation.
+# The default value is: NO.
+
+EXTRACT_PRIV_VIRTUAL   = NO
+
 # If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal
 # scope will be included in the documentation.
 # The default value is: NO.
@@ -461,6 +526,13 @@ EXTRACT_LOCAL_METHODS  = NO
 
 EXTRACT_ANON_NSPACES   = NO
 
+# If this flag is set to YES, the name of an unnamed parameter in a declaration
+# will be determined by the corresponding definition. By default unnamed
+# parameters remain unnamed in the output.
+# The default value is: YES.
+
+RESOLVE_UNNAMED_PARAMS = YES
+
 # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all
 # undocumented members inside documented classes or files. If set to NO these
 # members will be included in the various overviews, but no documentation
@@ -478,8 +550,8 @@ HIDE_UNDOC_MEMBERS     = NO
 HIDE_UNDOC_CLASSES     = NO
 
 # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend
-# (class|struct|union) declarations. If set to NO, these declarations will be
-# included in the documentation.
+# declarations. If set to NO, these declarations will be included in the
+# documentation.
 # The default value is: NO.
 
 HIDE_FRIEND_COMPOUNDS  = NO
@@ -498,11 +570,18 @@ HIDE_IN_BODY_DOCS      = NO
 
 INTERNAL_DOCS          = NO
 
-# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file
-# names in lower-case letters. If set to YES, upper-case letters are also
-# allowed. This is useful if you have classes or files whose names only differ
-# in case and if your file system supports case sensitive file names. Windows
-# and Mac users are advised to set this option to NO.
+# With the correct setting of option CASE_SENSE_NAMES doxygen will better be
+# able to match the capabilities of the underlying filesystem. In case the
+# filesystem is case sensitive (i.e. it supports files in the same directory
+# whose names only differ in casing), the option must be set to YES to properly
+# deal with such files in case they appear in the input. For filesystems that
+# are not case sensitive the option should be set to NO to properly deal with
+# output files written for symbols that only differ in casing, such as for two
+# classes, one named CLASS and the other named Class, and to also support
+# references to files without having to specify the exact matching casing. On
+# Windows (including Cygwin) and MacOS, users should typically set this option
+# to NO, whereas on Linux or other Unix flavors it should typically be set to
+# YES.
 # The default value is: system dependent.
 
 CASE_SENSE_NAMES       = NO
@@ -684,12 +763,12 @@ FILE_VERSION_FILTER    =
 # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE
 # tag is left empty.
 
-LAYOUT_FILE            =
+LAYOUT_FILE            = DOCS/DoxygenLayout.xml
 
 # The CITE_BIB_FILES tag can be used to specify one or more bib files containing
 # the reference definitions. This must be a list of .bib files. The .bib
 # extension is automatically appended if omitted. This requires the bibtex tool
-# to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info.
+# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info.
 # For LaTeX the style of the bibliography can be controlled using
 # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the
 # search path. See also \cite for info how to create references.
@@ -705,7 +784,7 @@ CITE_BIB_FILES         =
 # messages are off.
 # The default value is: NO.
 
-QUIET                  = YES
+QUIET                  = NO
 
 # The WARNINGS tag can be used to turn on/off the warning messages that are
 # generated to standard error (stderr) by doxygen. If WARNINGS is set to YES
@@ -734,10 +813,20 @@ WARN_IF_DOC_ERROR      = YES
 # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that
 # are documented, but have no documentation for their parameters or return
 # value. If set to NO, doxygen will only warn about wrong or incomplete
-# parameter documentation, but not about the absence of documentation.
+# parameter documentation, but not about the absence of documentation. If
+# EXTRACT_ALL is set to YES then this flag will automatically be disabled.
 # The default value is: NO.
 
-WARN_NO_PARAMDOC       = NO
+WARN_NO_PARAMDOC       = YES
+
+# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when
+# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS
+# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but
+# at the end of the doxygen process doxygen will return with a non-zero status.
+# Possible values are: NO, YES and FAIL_ON_WARNINGS.
+# The default value is: NO.
+
+WARN_AS_ERROR          = NO
 
 # The WARN_FORMAT tag determines the format of the warning messages that doxygen
 # can produce. The string should contain the $file, $line, and $text tags, which
@@ -753,7 +842,7 @@ WARN_FORMAT            = "$file:$line: $text"
 # messages should be written. If left blank the output is written to standard
 # error (stderr).
 
-WARN_LOGFILE           = output_err
+WARN_LOGFILE           = doxygen_error
 
 #---------------------------------------------------------------------------
 # Configuration options related to the input files
@@ -762,17 +851,18 @@ WARN_LOGFILE           = output_err
 # The INPUT tag is used to specify the files and/or directories that contain
 # documented source files. You may enter file names like myfile.cpp or
 # directories like /usr/src/myproject. Separate the files or directories with
-# spaces.
+# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING
 # Note: If this tag is empty the current directory is searched.
 
-INPUT                  = . \
-                         DOCS/groups-usr.dox
+INPUT                  = BLAS CBLAS SRC INSTALL TESTING \
+                         DOCS/groups-usr.dox \
+                         README.md
 
 # This tag can be used to specify the character encoding of the source files
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses
 # libiconv (or the iconv built into libc) for the transcoding. See the libiconv
-# documentation (see: http://www.gnu.org/software/libiconv) for the list of
-# possible encodings.
+# documentation (see:
+# https://www.gnu.org/software/libiconv/) for the list of possible encodings.
 # The default value is: UTF-8.
 
 INPUT_ENCODING         = UTF-8
@@ -785,14 +875,19 @@ INPUT_ENCODING         = UTF-8
 # need to set EXTENSION_MAPPING for the extension otherwise the files are not
 # read by doxygen.
 #
+# Note the list of default checked file patterns might differ from the list of
+# default file extension mappings.
+#
 # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp,
 # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h,
 # *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc,
-# *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd,
-# *.vhdl, *.ucf, *.qsf, *.as and *.js.
+# *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C comment),
+# *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, *.vhdl,
+# *.ucf, *.qsf and *.ice.
 
 FILE_PATTERNS          = *.c \
                          *.f \
+                         *.f90 \
                          *.h
 
 # The RECURSIVE tag can be used to specify whether or not subdirectories should
@@ -808,34 +903,15 @@ RECURSIVE              = YES
 # Note that relative paths are relative to the directory from which doxygen is
 # run.
 
-EXCLUDE                = CMAKE \
-                         DOCS \
-                         .svn \
-                         CBLAS/.svn \
-                         CBLAS/src/.svn \
-                         CBLAS/testing/.svn \
-                         CBLAS/example/.svn \
-                         CBLAS/include/.svn \
-                         BLAS/.svn \
-                         BLAS/SRC/.svn \
-                         BLAS/TESTING/.svn \
-                         SRC/.svn \
-                         SRC/VARIANTS/.svn \
-                         SRC/VARIANTS/LIB/.svn \
-                         SRC/VARIANTS/cholesky/.svn \
-                         SRC/VARIANTS/cholesky/RL/.svn \
-                         SRC/VARIANTS/cholesky/TOP/.svn \
-                         SRC/VARIANTS/lu/.svn \
-                         SRC/VARIANTS/lu/CR/.svn \
-                         SRC/VARIANTS/lu/LL/.svn \
-                         SRC/VARIANTS/lu/REC/.svn \
-                         SRC/VARIANTS/qr/.svn \
-                         SRC/VARIANTS/qr/LL/.svn \
-                         INSTALL/.svn \
-                         TESTING/.svn \
-                         TESTING/EIG/.svn \
-                         TESTING/MATGEN/.svn \
-                         TESTING/LIN/.svn
+# Exclude functions that are duplicated, creating conflicts.
+EXCLUDE                = .git \
+                         .github \
+                         SRC/VARIANTS \
+                         BLAS/SRC/lsame.f \
+                         BLAS/SRC/xerbla.f \
+                         BLAS/SRC/xerbla_array.f \
+                         INSTALL/slamchf77.f \
+                         INSTALL/dlamchf77.f \
 
 # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
 # directories that are symbolic links (a Unix file system feature) are excluded
@@ -908,6 +984,10 @@ IMAGE_PATH             =
 # Note that the filter must not add or remove lines; it is applied before the
 # code is scanned, but not when the output code is generated. If lines are added
 # or removed, the anchors will not be placed correctly.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
 
 INPUT_FILTER           =
 
@@ -917,6 +997,10 @@ INPUT_FILTER           =
 # (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how
 # filters are used. If the FILTER_PATTERNS tag is empty or if none of the
 # patterns match the file name, INPUT_FILTER is applied.
+#
+# Note that for custom extensions or not directly supported extensions you also
+# need to set EXTENSION_MAPPING for the extension otherwise the files are not
+# properly processed by doxygen.
 
 FILTER_PATTERNS        =
 
@@ -969,7 +1053,7 @@ INLINE_SOURCES         = YES
 STRIP_CODE_COMMENTS    = YES
 
 # If the REFERENCED_BY_RELATION tag is set to YES then for each documented
-# function all documented functions referencing it will be listed.
+# entity all documented functions referencing it will be listed.
 # The default value is: NO.
 
 REFERENCED_BY_RELATION = NO
@@ -1001,12 +1085,12 @@ SOURCE_TOOLTIPS        = YES
 # If the USE_HTAGS tag is set to YES then the references to source code will
 # point to the HTML generated by the htags(1) tool instead of doxygen built-in
 # source browser. The htags tool is part of GNU's global source tagging system
-# (see http://www.gnu.org/software/global/global.html). You will need version
+# (see https://www.gnu.org/software/global/global.html). You will need version
 # 4.8.6 or higher.
 #
 # To use it do the following:
 # - Install the latest version of global
-# - Enable SOURCE_BROWSER and USE_HTAGS in the config file
+# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file
 # - Make sure the INPUT points to the root of the source tree
 # - Run doxygen as normal
 #
@@ -1028,25 +1112,6 @@ USE_HTAGS              = NO
 
 VERBATIM_HEADERS       = YES
 
-# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the
-# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the
-# cost of reduced performance. This can be particularly helpful with template
-# rich C++ code for which doxygen's built-in parser lacks the necessary type
-# information.
-# Note: The availability of this option depends on whether or not doxygen was
-# compiled with the --with-libclang option.
-# The default value is: NO.
-
-CLANG_ASSISTED_PARSING = NO
-
-# If clang assisted parsing is enabled you can provide the compiler with command
-# line options that you would normally use when invoking the compiler. Note that
-# the include paths will already be set by doxygen for the files and directories
-# specified with INPUT and INCLUDE_PATH.
-# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
-
-CLANG_OPTIONS          =
-
 #---------------------------------------------------------------------------
 # Configuration options related to the alphabetical class index
 #---------------------------------------------------------------------------
@@ -1058,13 +1123,6 @@ CLANG_OPTIONS          =
 
 ALPHABETICAL_INDEX     = YES
 
-# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in
-# which the alphabetical index list will be split.
-# Minimum value: 1, maximum value: 20, default value: 5.
-# This tag requires that the tag ALPHABETICAL_INDEX is set to YES.
-
-COLS_IN_ALPHA_INDEX    = 5
-
 # In case all classes in a project start with a common prefix, all classes will
 # be put under the same header in the alphabetical index. The IGNORE_PREFIX tag
 # can be used to specify a prefix (or a list of prefixes) that should be ignored
@@ -1165,7 +1223,7 @@ HTML_EXTRA_FILES       =
 # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen
 # will adjust the colors in the style sheet and background images according to
 # this color. Hue is specified as an angle on a colorwheel, see
-# http://en.wikipedia.org/wiki/Hue for more information. For instance the value
+# https://en.wikipedia.org/wiki/Hue for more information. For instance the value
 # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300
 # purple, and 360 is red again.
 # Minimum value: 0, maximum value: 359, default value: 220.
@@ -1201,6 +1259,17 @@ HTML_COLORSTYLE_GAMMA  = 80
 
 HTML_TIMESTAMP         = YES
 
+# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML
+# documentation will contain a main index with vertical navigation menus that
+# are dynamically created via JavaScript. If disabled, the navigation index will
+# consist of multiple levels of tabs that are statically embedded in every HTML
+# page. Disable this option to support browsers that do not have JavaScript,
+# like the Qt help browser.
+# The default value is: YES.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_DYNAMIC_MENUS     = YES
+
 # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
 # documentation will contain sections that can be hidden and shown after the
 # page has loaded.
@@ -1224,13 +1293,14 @@ HTML_INDEX_NUM_ENTRIES = 100
 
 # If the GENERATE_DOCSET tag is set to YES, additional index files will be
 # generated that can be used as input for Apple's Xcode 3 integrated development
-# environment (see: http://developer.apple.com/tools/xcode/), introduced with
-# OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a
-# Makefile in the HTML output directory. Running make will produce the docset in
-# that directory and running make install will install the docset in
+# environment (see:
+# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To
+# create a documentation set, doxygen will generate a Makefile in the HTML
+# output directory. Running make will produce the docset in that directory and
+# running make install will install the docset in
 # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at
-# startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
-# for more information.
+# startup. See https://developer.apple.com/library/archive/featuredarticles/DoxygenXcode/_index.html
+# for more information.
 # The default value is: NO.
 # This tag requires that the tag GENERATE_HTML is set to YES.
 
@@ -1269,8 +1339,8 @@ DOCSET_PUBLISHER_NAME  = Publisher
 # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three
 # additional HTML index files: index.hhp, index.hhc, and index.hhk. The
 # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop
-# (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on
-# Windows.
+# (see:
+# https://www.microsoft.com/en-us/download/details.aspx?id=21138) on Windows.
 #
 # The HTML Help Workshop contains a compiler that can convert all HTML output
 # generated by doxygen into a single compiled HTML file (.chm). Compiled HTML
@@ -1300,7 +1370,7 @@ CHM_FILE               =
 HHC_LOCATION           =
 
 # The GENERATE_CHI flag controls if a separate .chi index file is generated
-# (YES) or that it should be included in the master .chm file (NO).
+# (YES) or that it should be included in the main .chm file (NO).
 # The default value is: NO.
 # This tag requires that the tag GENERATE_HTMLHELP is set to YES.
 
@@ -1345,7 +1415,8 @@ QCH_FILE               =
 
 # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help
 # Project output. For more information please see Qt Help Project / Namespace
-# (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace).
+# (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace).
 # The default value is: org.doxygen.Project.
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
@@ -1353,8 +1424,8 @@ QHP_NAMESPACE          = org.doxygen.Project
 
 # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt
 # Help Project output. For more information please see Qt Help Project / Virtual
-# Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual-
-# folders).
+# Folders (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders).
 # The default value is: doc.
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
@@ -1362,30 +1433,30 @@ QHP_VIRTUAL_FOLDER     = doc
 
 # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom
 # filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
-# filters).
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHP_CUST_FILTER_NAME   =
 
 # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
 # custom filter to add. For more information please see Qt Help Project / Custom
-# Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom-
-# filters).
+# Filters (see:
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHP_CUST_FILTER_ATTRS  =
 
 # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
 # project's filter section matches. Qt Help Project / Filter Attributes (see:
-# http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes).
+# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes).
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHP_SECT_FILTER_ATTRS  =
 
-# The QHG_LOCATION tag can be used to specify the location of Qt's
-# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the
-# generated .qhp file.
+# The QHG_LOCATION tag can be used to specify the location (absolute path
+# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to
+# run qhelpgenerator on the generated .qhp file.
 # This tag requires that the tag GENERATE_QHP is set to YES.
 
 QHG_LOCATION           =
@@ -1462,6 +1533,17 @@ TREEVIEW_WIDTH         = 250
 
 EXT_LINKS_IN_WINDOW    = NO
 
+# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg
+# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see
+# https://inkscape.org) to generate formulas as SVG images instead of PNGs for
+# the HTML output. These images will generally look nicer at scaled resolutions.
+# Possible values are: png (the default) and svg (looks nicer but requires the
+# pdf2svg or inkscape tool).
+# The default value is: png.
+# This tag requires that the tag GENERATE_HTML is set to YES.
+
+HTML_FORMULA_FORMAT    = png
+
 # Use this tag to change the font size of LaTeX formulas included as images in
 # the HTML documentation. When you change the font size after a successful
 # doxygen run you need to manually remove any form_*.png images from the HTML
@@ -1471,7 +1553,7 @@ EXT_LINKS_IN_WINDOW    = NO
 
 FORMULA_FONTSIZE       = 10
 
-# Use the FORMULA_TRANPARENT tag to determine whether or not the images
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
 # generated for formulas are transparent PNGs. Transparent PNGs are not
 # supported properly for IE 6.0, but are supported on all modern browsers.
 #
@@ -1482,8 +1564,14 @@ FORMULA_FONTSIZE       = 10
 
 FORMULA_TRANSPARENT    = YES
 
+# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands
+# to create new LaTeX commands to be used in formulas as building blocks. See
+# the section "Including formulas" for details.
+
+FORMULA_MACROFILE      =
+
 # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see
-# http://www.mathjax.org) which uses client side Javascript for the rendering
+# https://www.mathjax.org) which uses client side JavaScript for the rendering
 # instead of using pre-rendered bitmaps. Use this if you do not have LaTeX
 # installed or if you want to formulas look prettier in the HTML output. When
 # enabled you may also need to install MathJax separately and configure the path
@@ -1495,7 +1583,7 @@ USE_MATHJAX            = NO
 
 # When MathJax is enabled you can set the default output format to be used for
 # the MathJax output. See the MathJax site (see:
-# http://docs.mathjax.org/en/latest/output.html) for more details.
+# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details.
 # Possible values are: HTML-CSS (which is slower, but has the best
 # compatibility), NativeMML (i.e. MathML) and SVG.
 # The default value is: HTML-CSS.
@@ -1510,8 +1598,8 @@ MATHJAX_FORMAT         = HTML-CSS
 # MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax
 # Content Delivery Network so you can quickly see the result without installing
 # MathJax. However, it is strongly recommended to install a local copy of
-# MathJax from http://www.mathjax.org before deployment.
-# The default value is: http://cdn.mathjax.org/mathjax/latest.
+# MathJax from https://www.mathjax.org before deployment.
+# The default value is: https://cdn.jsdelivr.net/npm/mathjax@2.
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
 MATHJAX_RELPATH        = http://www.mathjax.org/mathjax
@@ -1525,7 +1613,8 @@ MATHJAX_EXTENSIONS     =
 
 # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces
 # of code that will be used on startup of the MathJax code. See the MathJax site
-# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an
+# (see:
+# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an
 # example see the documentation.
 # This tag requires that the tag USE_MATHJAX is set to YES.
 
@@ -1553,7 +1642,7 @@ MATHJAX_CODEFILE       =
 SEARCHENGINE           = YES
 
 # When the SERVER_BASED_SEARCH tag is enabled the search engine will be
-# implemented using a web server instead of a web client using Javascript. There
+# implemented using a web server instead of a web client using JavaScript. There
 # are two flavors of web server based searching depending on the EXTERNAL_SEARCH
 # setting. When disabled, doxygen will generate a PHP script for searching and
 # an index file used by the script. When EXTERNAL_SEARCH is enabled the indexing
@@ -1572,7 +1661,8 @@ SERVER_BASED_SEARCH    = NO
 #
 # Doxygen ships with an example indexer (doxyindexer) and search engine
 # (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: http://xapian.org/).
+# Xapian (see:
+# https://xapian.org/).
 #
 # See the section "External Indexing and Searching" for details.
 # The default value is: NO.
@@ -1585,8 +1675,9 @@ EXTERNAL_SEARCH        = NO
 #
 # Doxygen ships with an example indexer (doxyindexer) and search engine
 # (doxysearch.cgi) which are based on the open source search engine library
-# Xapian (see: http://xapian.org/). See the section "External Indexing and
-# Searching" for details.
+# Xapian (see:
+# https://xapian.org/). See the section "External Indexing and Searching" for
+# details.
 # This tag requires that the tag SEARCHENGINE is set to YES.
 
 SEARCHENGINE_URL       =
@@ -1637,21 +1728,35 @@ LATEX_OUTPUT           = latex
 # The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
 # invoked.
 #
-# Note that when enabling USE_PDFLATEX this option is only used for generating
-# bitmaps for formulas in the HTML output, but not in the Makefile that is
-# written to the output directory.
-# The default file is: latex.
+# Note that when USE_PDFLATEX is not enabled the default is latex; when
+# USE_PDFLATEX is enabled the default is pdflatex, and if latex is chosen in
+# the latter case it is overwritten by pdflatex. For specific output languages
+# the default may have been set differently; this depends on the implementation
+# of the output language.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
 LATEX_CMD_NAME         = latex
 
 # The MAKEINDEX_CMD_NAME tag can be used to specify the command name to generate
 # index for LaTeX.
+# Note: This tag is used in the Makefile / make.bat.
+# See also: LATEX_MAKEINDEX_CMD for the part in the generated output file
+# (.tex).
 # The default file is: makeindex.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
 MAKEINDEX_CMD_NAME     = makeindex
 
+# The LATEX_MAKEINDEX_CMD tag can be used to specify the command name to
+# generate index for LaTeX. In case there is no backslash (\) as first character
+# it will be automatically added in the LaTeX code.
+# Note: This tag is used in the generated output file (.tex).
+# See also: MAKEINDEX_CMD_NAME for the part in the Makefile / make.bat.
+# The default value is: makeindex.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_MAKEINDEX_CMD    = makeindex
+
 # If the COMPACT_LATEX tag is set to YES, doxygen generates more compact LaTeX
 # documents. This may be useful for small projects and may help to save some
 # trees in general.
@@ -1736,9 +1841,11 @@ LATEX_EXTRA_FILES      =
 
 PDF_HYPERLINKS         = YES
 
-# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate
-# the PDF file directly from the LaTeX files. Set this option to YES, to get a
-# higher quality PDF documentation.
+# If the USE_PDFLATEX tag is set to YES, doxygen will use the engine as
+# specified with LATEX_CMD_NAME to generate the PDF file directly from the LaTeX
+# files. Set this option to YES, to get a higher quality PDF documentation.
+#
+# See also section LATEX_CMD_NAME for selecting the engine.
 # The default value is: YES.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
@@ -1772,12 +1879,28 @@ LATEX_SOURCE_CODE      = NO
 
 # The LATEX_BIB_STYLE tag can be used to specify the style to use for the
 # bibliography, e.g. plainnat, or ieeetr. See
-# http://en.wikipedia.org/wiki/BibTeX and \cite for more info.
+# https://en.wikipedia.org/wiki/BibTeX and \cite for more info.
 # The default value is: plain.
 # This tag requires that the tag GENERATE_LATEX is set to YES.
 
 LATEX_BIB_STYLE        = plain
 
+# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated
+# page will contain the date and time when the page was generated. Setting this
+# to NO can help when comparing the output of multiple runs.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_TIMESTAMP        = NO
+
+# The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute)
+# path from which the emoji images will be read. If a relative path is entered,
+# it will be relative to the LATEX_OUTPUT directory. If left blank the
+# LATEX_OUTPUT directory will be used.
+# This tag requires that the tag GENERATE_LATEX is set to YES.
+
+LATEX_EMOJI_DIRECTORY  =
+
 #---------------------------------------------------------------------------
 # Configuration options related to the RTF output
 #---------------------------------------------------------------------------
@@ -1817,9 +1940,9 @@ COMPACT_RTF            = NO
 
 RTF_HYPERLINKS         = YES
 
-# Load stylesheet definitions from file. Syntax is similar to doxygen's config
-# file, i.e. a series of assignments. You only have to provide replacements,
-# missing definitions are set to their default value.
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# configuration file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
 #
 # See also section "Doxygen usage" for information on how to generate the
 # default style sheet that doxygen normally uses.
@@ -1828,8 +1951,8 @@ RTF_HYPERLINKS         = YES
 RTF_STYLESHEET_FILE    =
 
 # Set optional variables used in the generation of an RTF document. Syntax is
-# similar to doxygen's config file. A template extensions file can be generated
-# using doxygen -e rtf extensionFile.
+# similar to doxygen's configuration file. A template extensions file can be
+# generated using doxygen -e rtf extensionFile.
 # This tag requires that the tag GENERATE_RTF is set to YES.
 
 RTF_EXTENSIONS_FILE    =
@@ -1915,6 +2038,13 @@ XML_OUTPUT             = xml
 
 XML_PROGRAMLISTING     = YES
 
+# If the XML_NS_MEMB_FILE_SCOPE tag is set to YES, doxygen will include
+# namespace members in file scope as well, matching the HTML output.
+# The default value is: NO.
+# This tag requires that the tag GENERATE_XML is set to YES.
+
+XML_NS_MEMB_FILE_SCOPE = NO
+
 #---------------------------------------------------------------------------
 # Configuration options related to the DOCBOOK output
 #---------------------------------------------------------------------------
@@ -1947,9 +2077,9 @@ DOCBOOK_PROGRAMLISTING = NO
 #---------------------------------------------------------------------------
 
 # If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an
-# AutoGen Definitions (see http://autogen.sf.net) file that captures the
-# structure of the code including all documentation. Note that this feature is
-# still experimental and incomplete at the moment.
+# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures
+# the structure of the code including all documentation. Note that this feature
+# is still experimental and incomplete at the moment.
 # The default value is: NO.
 
 GENERATE_AUTOGEN_DEF   = NO
@@ -2116,12 +2246,6 @@ EXTERNAL_GROUPS        = YES
 
 EXTERNAL_PAGES         = YES
 
-# The PERL_PATH should be the absolute path and name of the perl script
-# interpreter (i.e. the result of 'which perl').
-# The default file (with absolute path) is: /usr/bin/perl.
-
-PERL_PATH              = /sw/bin/perl
-
 #---------------------------------------------------------------------------
 # Configuration options related to the dot tool
 #---------------------------------------------------------------------------
@@ -2135,15 +2259,6 @@ PERL_PATH              = /sw/bin/perl
 
 CLASS_DIAGRAMS         = YES
 
-# You can define message sequence charts within doxygen comments using the \msc
-# command. Doxygen will then run the mscgen tool (see:
-# http://www.mcternan.me.uk/mscgen/)) to produce the chart and insert it in the
-# documentation. The MSCGEN_PATH tag allows you to specify the directory where
-# the mscgen tool resides. If left empty the tool is assumed to be found in the
-# default search path.
-
-MSCGEN_PATH            =
-
 # You can include diagrams made with dia in doxygen documentation. Doxygen will
 # then run dia to produce the diagram and insert it in the documentation. The
 # DIA_PATH tag allows you to specify the directory where the dia binary resides.
@@ -2241,10 +2356,32 @@ UML_LOOK               = NO
 # but if the number exceeds 15, the total amount of fields shown is limited to
 # 10.
 # Minimum value: 0, maximum value: 100, default value: 10.
-# This tag requires that the tag HAVE_DOT is set to YES.
+# This tag requires that the tag UML_LOOK is set to YES.
 
 UML_LIMIT_NUM_FIELDS   = 10
 
+# If the DOT_UML_DETAILS tag is set to NO, doxygen will show attributes and
+# methods without types and arguments in the UML graphs. If the DOT_UML_DETAILS
+# tag is set to YES, doxygen will add type and arguments for attributes and
+# methods in the UML graphs. If the DOT_UML_DETAILS tag is set to NONE, doxygen
+# will not generate fields with class member information in the UML graphs. The
+# class diagrams will look similar to the default class diagrams but using UML
+# notation for the relationships.
+# Possible values are: NO, YES and NONE.
+# The default value is: NO.
+# This tag requires that the tag UML_LOOK is set to YES.
+
+DOT_UML_DETAILS        = NO
+
+# The DOT_WRAP_THRESHOLD tag can be used to set the maximum number of characters
+# to display on a single line. If the actual line length exceeds this threshold
+# significantly it will be wrapped across multiple lines. Some heuristics are
+# applied to avoid ugly line breaks.
+# Minimum value: 0, maximum value: 1000, default value: 17.
+# This tag requires that the tag HAVE_DOT is set to YES.
+
+DOT_WRAP_THRESHOLD     = 17
+
 # If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and
 # collaboration graphs will show the relations between templates and their
 # instances.
@@ -2371,6 +2508,11 @@ DIAFILE_DIRS           =
 
 PLANTUML_JAR_PATH      =
 
+# When using plantuml, the PLANTUML_CFG_FILE tag can be used to specify a
+# configuration file for plantuml.
+
+PLANTUML_CFG_FILE      =
+
 # When using plantuml, the specified paths are searched for files specified by
 # the !include statement in a plantuml block.
 
@@ -2429,9 +2571,11 @@ DOT_MULTI_TARGETS      = NO
 
 GENERATE_LEGEND        = YES
 
-# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot
+# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate
 # files that are used to generate the various graphs.
+#
+# Note: This setting is not only used for dot files but also for msc and
+# plantuml temporary files.
 # The default value is: YES.
-# This tag requires that the tag HAVE_DOT is set to YES.
 
 DOT_CLEANUP            = YES
diff --git a/lapack-netlib/DOCS/DoxygenLayout.xml b/lapack-netlib/DOCS/DoxygenLayout.xml
new file mode 100644
index 000000000..aeb346d8d
--- /dev/null
+++ b/lapack-netlib/DOCS/DoxygenLayout.xml
@@ -0,0 +1,197 @@
+<doxygenlayout version="1.0">
+  <!-- Generated by doxygen 1.8.13 -->
+  <!-- Navigation index tabs for HTML output -->
+  <navindex>
+    <tab type="mainpage" visible="yes" title=""/>
+    <tab type="pages" visible="yes" title="" intro=""/>
+
+    <!-- LAPACK: change "Modules" to "Routines" -->
+    <tab type="modules" visible="yes" title="Routines" intro=""/>
+
+    <tab type="namespaces" visible="yes" title="">
+      <tab type="namespacelist" visible="yes" title="" intro=""/>
+      <tab type="namespacemembers" visible="yes" title="" intro=""/>
+    </tab>
+    <tab type="classes" visible="yes" title="">
+      <tab type="classlist" visible="yes" title="" intro=""/>
+      <tab type="classindex" visible="$ALPHABETICAL_INDEX" title=""/>
+      <tab type="hierarchy" visible="yes" title="" intro=""/>
+      <tab type="classmembers" visible="yes" title="" intro=""/>
+    </tab>
+    <tab type="files" visible="yes" title="">
+      <tab type="filelist" visible="yes" title="" intro=""/>
+      <tab type="globals" visible="yes" title="" intro=""/>
+    </tab>
+    <tab type="examples" visible="yes" title="" intro=""/>
+  </navindex>
+
+  <!-- Layout definition for a class page -->
+  <class>
+    <briefdescription visible="yes"/>
+    <includes visible="$SHOW_INCLUDE_FILES"/>
+    <inheritancegraph visible="$CLASS_GRAPH"/>
+    <collaborationgraph visible="$COLLABORATION_GRAPH"/>
+    <memberdecl>
+      <nestedclasses visible="yes" title=""/>
+      <publictypes title=""/>
+      <services title=""/>
+      <interfaces title=""/>
+      <publicslots title=""/>
+      <signals title=""/>
+      <publicmethods title=""/>
+      <publicstaticmethods title=""/>
+      <publicattributes title=""/>
+      <publicstaticattributes title=""/>
+      <protectedtypes title=""/>
+      <protectedslots title=""/>
+      <protectedmethods title=""/>
+      <protectedstaticmethods title=""/>
+      <protectedattributes title=""/>
+      <protectedstaticattributes title=""/>
+      <packagetypes title=""/>
+      <packagemethods title=""/>
+      <packagestaticmethods title=""/>
+      <packageattributes title=""/>
+      <packagestaticattributes title=""/>
+      <properties title=""/>
+      <events title=""/>
+      <privatetypes title=""/>
+      <privateslots title=""/>
+      <privatemethods title=""/>
+      <privatestaticmethods title=""/>
+      <privateattributes title=""/>
+      <privatestaticattributes title=""/>
+      <friends title=""/>
+      <related title="" subtitle=""/>
+      <membergroups visible="yes"/>
+    </memberdecl>
+    <detaileddescription title=""/>
+    <memberdef>
+      <inlineclasses title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <services title=""/>
+      <interfaces title=""/>
+      <constructors title=""/>
+      <functions title=""/>
+      <related title=""/>
+      <variables title=""/>
+      <properties title=""/>
+      <events title=""/>
+    </memberdef>
+    <allmemberslink visible="yes"/>
+    <usedfiles visible="$SHOW_USED_FILES"/>
+    <authorsection visible="yes"/>
+  </class>
+
+  <!-- Layout definition for a namespace page -->
+  <namespace>
+    <briefdescription visible="yes"/>
+    <memberdecl>
+      <nestednamespaces visible="yes" title=""/>
+      <constantgroups visible="yes" title=""/>
+      <classes visible="yes" title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <functions title=""/>
+      <variables title=""/>
+      <membergroups visible="yes"/>
+    </memberdecl>
+    <detaileddescription title=""/>
+    <memberdef>
+      <inlineclasses title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <functions title=""/>
+      <variables title=""/>
+    </memberdef>
+    <authorsection visible="yes"/>
+  </namespace>
+
+  <!-- Layout definition for a file page -->
+  <file>
+    <briefdescription visible="yes"/>
+    <includes visible="$SHOW_INCLUDE_FILES"/>
+    <includegraph visible="$INCLUDE_GRAPH"/>
+    <includedbygraph visible="$INCLUDED_BY_GRAPH"/>
+    <sourcelink visible="yes"/>
+    <memberdecl>
+      <classes visible="yes" title=""/>
+      <namespaces visible="yes" title=""/>
+      <constantgroups visible="yes" title=""/>
+      <defines title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <functions title=""/>
+      <variables title=""/>
+      <membergroups visible="yes"/>
+    </memberdecl>
+    <detaileddescription title=""/>
+    <memberdef>
+      <inlineclasses title=""/>
+      <defines title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <functions title=""/>
+      <variables title=""/>
+    </memberdef>
+    <authorsection/>
+  </file>
+
+  <!-- Layout definition for a group page -->
+  <group>
+    <briefdescription visible="yes"/>
+    <groupgraph visible="$GROUP_GRAPHS"/>
+    <memberdecl>
+      <nestedgroups visible="yes" title=""/>
+      <dirs visible="yes" title=""/>
+      <files visible="yes" title=""/>
+      <namespaces visible="yes" title=""/>
+      <classes visible="yes" title=""/>
+      <defines title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <enumvalues title=""/>
+      <functions title=""/>
+      <variables title=""/>
+      <signals title=""/>
+      <publicslots title=""/>
+      <protectedslots title=""/>
+      <privateslots title=""/>
+      <events title=""/>
+      <properties title=""/>
+      <friends title=""/>
+      <membergroups visible="yes"/>
+    </memberdecl>
+    <detaileddescription title=""/>
+    <memberdef>
+      <pagedocs/>
+      <inlineclasses title=""/>
+      <defines title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <enumvalues title=""/>
+      <functions title=""/>
+      <variables title=""/>
+      <signals title=""/>
+      <publicslots title=""/>
+      <protectedslots title=""/>
+      <privateslots title=""/>
+      <events title=""/>
+      <properties title=""/>
+      <friends title=""/>
+    </memberdef>
+    <authorsection visible="yes"/>
+  </group>
+
+  <!-- Layout definition for a directory page -->
+  <directory>
+    <briefdescription visible="yes"/>
+    <directorygraph visible="yes"/>
+    <memberdecl>
+      <dirs visible="yes"/>
+      <files visible="yes"/>
+    </memberdecl>
+    <detaileddescription title=""/>
+  </directory>
+</doxygenlayout>
diff --git a/lapack-netlib/DOCS/groups-usr.dox b/lapack-netlib/DOCS/groups-usr.dox
index 6c31e4cf5..cbd747165 100644
--- a/lapack-netlib/DOCS/groups-usr.dox
+++ b/lapack-netlib/DOCS/groups-usr.dox
@@ -1,605 +1,974 @@
 /**
- *
- ****
- *
- * @defgroup lapack LAPACK
- *   This is the group of LAPACK routines.
- *
- * @defgroup GE  General Matrices
- *   @ingroup lapack
- *   This is the group of General Matrices routines
- * @defgroup GB  General Band Matrix
- *   @ingroup lapack
- *   This is the group of General Band routines
- * @defgroup SY  Symmetric Matrix
- *   @ingroup lapack
- *   This is the group of Symmetric Matrix routines
- * @defgroup HE  Hermitian Matrix
- *   @ingroup lapack
- *   This is the group of Hermitian Matrix routines
- * @defgroup PO  Positive Definite Matrix
- *   @ingroup lapack
- *   This is the group of Positive Definite routines
- * @defgroup GT  General tridiagonal Matrix
- *   @ingroup lapack
- *   This is the group of General tridiagonal routines
- * @defgroup PT  Positive Definite tridiagonal Matrix
- *   @ingroup lapack
- *   This is the group of Positive Definite tridiagonal routines
- * @defgroup OTHEReigen  Eigenvalue
- *   @ingroup lapack
- *   This is the group of Other Eigenvalue routines
- * @defgroup OTHERauxiliary  Other Auxiliary Routines
- *   @ingroup lapack
- *   This is the group of Other Auxiliary routines
- * @defgroup OTHERcomputational  Other Computational Routines
- *   @ingroup lapack
- *   This is the group of Other Computational routines
- * @defgroup OTHERsolve  Other Solve Routines
- *   @ingroup lapack
- *   This is the group of Other Solve routines
- *
- ****
- *
- * @defgroup solveGE  Linear Solve
- *   @ingroup GE
- *   This is the group of Linear Solve Driver routines
- * @defgroup solveGB  Linear Solve
- *   @ingroup GB
- *   This is the group of Linear Solve Driver routines
- * @defgroup solveSY  Linear Solve
- *   @ingroup SY
- *   This is the group of Linear Solve Driver routines
- * @defgroup solveHE  Linear Solve
- *   @ingroup HE
- *   This is the group of Linear Solve Driver routines
- * @defgroup solvePO  Linear Solve
- *   @ingroup PO
- *   This is the group of Linear Solve Driver routines
- * @defgroup solveGT  Linear Solve
- *   @ingroup GT
- *   This is the group of Linear Solve Driver routines
- * @defgroup solvePT  Linear Solve
- *   @ingroup PT
- *   This is the group of Linear Solve Driver routines
- * @defgroup eigenGE  Eigenvalue
- *   @ingroup GE
- *   This is the group of Eigenvalue Driver routines
- * @defgroup eigenSY  Eigenvalue
- *   @ingroup SY
- *   This is the group of Eigenvalue Driver routines
- * @defgroup eigenHE  Eigenvalue
- *   @ingroup HE
- *   This is the group of Eigenvalue Driver routines
- * @defgroup singGE  Singular Value
- *   @ingroup GE
- *   This is the group of Singular Value Driver routines
- * @defgroup computationalGE  Computational routines
- *   @ingroup GE
- *   This is the group of Computational routines
- * @defgroup variantsGEcomputational  Variants Computational routines
- *   @ingroup GE
- *   This is the group of Variants Computational routines
- * @defgroup computationalGB  Computational routines
- *   @ingroup GB
- *   This is the group of Computational routines
- * @defgroup computationalSY  Computational routines
- *   @ingroup SY
- *   This is the group of Computational routines
- * @defgroup computationalHE  Computational routines
- *   @ingroup HE
- *   This is the group of Computational routines
- * @defgroup computationalPO  Computational routines
- *   @ingroup PO
- *   This is the group of Computational routines
- * @defgroup variantsPOcomputational  Variants Computational routines
- *   @ingroup PO
- *   This is the group of Variants Computational routines
- * @defgroup computationalGT  Computational routines
- *   @ingroup GT
- *   This is the group of Computational routines
- * @defgroup computationalPT  Computational routines
- *   @ingroup PT
- *   This is the group of Computational routines
- * @defgroup variantsOTHERcomputational  Variants Computational routines
- *   @ingroup OTHERcomputational
- *   This is the group of Variants Computational routines
- * @defgroup auxiliaryGE  Auxiliary routines
- *   @ingroup GE
- *   This is the group of Auxiliary routines
- * @defgroup auxiliaryGB  Auxiliary routines
- *   @ingroup GB
- *   This is the group of Auxiliary routines
- * @defgroup auxiliarySY  Auxiliary routines
- *   @ingroup SY
- *   This is the group of Auxiliary routines
- * @defgroup auxiliaryHE  Auxiliary routines
- *   @ingroup HE
- *   This is the group of Auxiliary routines
- * @defgroup auxiliaryPO  Auxiliary routines
- *   @ingroup PO
- *   This is the group of Auxiliary routines
- * @defgroup auxiliaryGT  Auxiliary routines
- *   @ingroup GT
- *   This is the group of Auxiliary routines
- * @defgroup auxiliaryPT  Auxiliary routines
- *   @ingroup PT
- *   This is the group of Auxiliary routines
- *
- ****
- *
- * @defgroup doubleGEsolve double
- *   @ingroup solveGE
- *   This is the group of double solve driver functions for GE matrices
- * @defgroup doublePOsolve double
- *   @ingroup solvePO
- *   This is the group of double solve driver functions for PO matrices
- * @defgroup doubleSYsolve double
- *   @ingroup solveSY
- *   This is the group of double solve driver functions for SY matrices
- * @defgroup doubleGBsolve double
- *   @ingroup solveGB
- *   This is the group of double solve driver functions for GB matrices
- * @defgroup doubleGTsolve double
- *   @ingroup solveGT
- *   This is the group of double solve driver functions for GT matrices
- * @defgroup doublePTsolve double
- *   @ingroup solvePT
- *   This is the group of double solve driver functions for PT matrices
- * @defgroup doubleGEeigen double
- *   @ingroup eigenGE
- *   This is the group of double eigenvalue driver functions for GE matrices
- * @defgroup doubleSYeigen double
- *   @ingroup eigenSY
- *   This is the group of double eigenvalue driver functions for SY matrices
- * @defgroup doubleGEsing double
- *   @ingroup singGE
- *   This is the group of double singular value driver functions for GE matrices
- * @defgroup doubleGEcomputational double
- *   @ingroup computationalGE
- *   This is the group of double computational functions for GE matrices
- * @defgroup doublePOcomputational double
- *   @ingroup computationalPO
- *   This is the group of double computational functions for PO matrices
- * @defgroup doubleSYcomputational double
- *   @ingroup computationalSY
- *   This is the group of double computational functions for SY matrices
- * @defgroup doubleGBcomputational double
- *   @ingroup computationalGB
- *   This is the group of double computational functions for GB matrices
- * @defgroup doubleGTcomputational double
- *   @ingroup computationalGT
- *   This is the group of double computational functions for GT matrices
- * @defgroup doublePTcomputational double
- *   @ingroup computationalPT
- *   This is the group of double computational functions for PT matrices
- * @defgroup doubleGEauxiliary double
- *   @ingroup auxiliaryGE
- *   This is the group of double auxiliary functions for GE matrices
- * @defgroup doublePOauxiliary double
- *   @ingroup auxiliaryPO
- *   This is the group of double auxiliary functions for PO matrices
- * @defgroup doubleSYauxiliary double
- *   @ingroup auxiliarySY
- *   This is the group of double auxiliary functions for SY matrices
- * @defgroup doubleGBauxiliary double
- *   @ingroup auxiliaryGB
- *   This is the group of double auxiliary functions for GB matrices
- * @defgroup doublePTauxiliary double
- *   @ingroup auxiliaryPT
- *   This is the group of double auxiliary functions for PT matrices
- * @defgroup doubleGTauxiliary double
- *   @ingroup auxiliaryGT
- *   This is the group of double auxiliary functions for GT matrices
- * @defgroup doubleOTHERauxiliary  double
- *   @ingroup OTHERauxiliary
- *   This is the group of double other auxiliary routines
- * @defgroup doubleOTHERcomputational  double
- *   @ingroup OTHERcomputational
- *   This is the group of double other Computational routines
- * @defgroup doubleOTHERsolve  double Other Solve Routines
- *   @ingroup OTHERsolve
- *   This is the group of double Other Solve routines
- * @defgroup doubleOTHEReigen  double
- *   @ingroup OTHEReigen
- *   This is the group of double Other Eigenvalue routines
- *
- ****
- *
- * @defgroup realGEsolve real
- *   @ingroup solveGE
- *   This is the group of real solve driver functions for GE matrices
- * @defgroup realPOsolve real
- *   @ingroup solvePO
- *   This is the group of real solve driver functions for PO matrices
- * @defgroup realSYsolve real
- *   @ingroup solveSY
- *   This is the group of real solve driver functions for SY matrices
- * @defgroup realGBsolve real
- *   @ingroup solveGB
- *   This is the group of real solve driver functions for GB matrices
- * @defgroup realGTsolve real
- *   @ingroup solveGT
- *   This is the group of real solve driver functions for GT matrices
- * @defgroup realPTsolve real
- *   @ingroup solvePT
- *   This is the group of real solve driver functions for PT matrices
- * @defgroup realGEeigen real
- *   @ingroup eigenGE
- *   This is the group of real eigenvalue driver functions for GE matrices
- * @defgroup realSYeigen real
- *   @ingroup eigenSY
- *   This is the group of real eigenvalue driver functions for SY matrices
- * @defgroup realGEsing real
- *   @ingroup singGE
- *   This is the group of real singular value driver functions for GE matrices
- * @defgroup realGEcomputational real
- *   @ingroup computationalGE
- *   This is the group of real computational functions for GE matrices
- * @defgroup realPOcomputational real
- *   @ingroup computationalPO
- *   This is the group of real computational functions for PO matrices
- * @defgroup realSYcomputational real
- *   @ingroup computationalSY
- *   This is the group of real computational functions for SY matrices
- * @defgroup realGBcomputational real
- *   @ingroup computationalGB
- *   This is the group of real computational functions for GB matrices
- * @defgroup realPTcomputational real
- *   @ingroup computationalPT
- *   This is the group of real computational functions for PT matrices
- * @defgroup realGTcomputational real
- *   @ingroup computationalGT
- *   This is the group of real computational functions for GT matrices
- * @defgroup realGEauxiliary real
- *   @ingroup auxiliaryGE
- *   This is the group of real auxiliary functions for GE matrices
- * @defgroup realPOauxiliary real
- *   @ingroup auxiliaryPO
- *   This is the group of real auxiliary functions for PO matrices
- * @defgroup realSYauxiliary real
- *   @ingroup auxiliarySY
- *   This is the group of real auxiliary functions for SY matrices
- * @defgroup realGBauxiliary real
- *   @ingroup auxiliaryGB
- *   This is the group of real auxiliary functions for GB matrices
- * @defgroup realGTauxiliary real
- *   @ingroup auxiliaryGT
- *   This is the group of real auxiliary functions for GT matrices
- * @defgroup realPTauxiliary real
- *   @ingroup auxiliaryPT
- *   This is the group of real auxiliary functions for PT matrices
- * @defgroup realOTHERauxiliary  real
- *   @ingroup OTHERauxiliary
- *   This is the group of real other auxiliary routines
- * @defgroup realOTHERcomputational  real
- *   @ingroup OTHERcomputational
- *   This is the group of real other Computational routines
- * @defgroup realOTHERsolve  real Other Solve Routines
- *   @ingroup OTHERsolve
- *   This is the group of real Other Solve routines
- * @defgroup realOTHEReigen  real
- *   @ingroup OTHEReigen
- *   This is the group of real Other Eigenvalue routines
- *
- ****
- *
- * @defgroup complexGEsolve complex
- *   @ingroup solveGE
- *   This is the group of complex solve driver functions for GE matrices
- * @defgroup complexPOsolve complex
- *   @ingroup solvePO
- *   This is the group of complex solve driver functions for PO matrices
- * @defgroup complexSYsolve complex
- *   @ingroup solveSY
- *   This is the group of complex solve driver functions for SY matrices
- * @defgroup complexHEsolve complex
- *   @ingroup solveHE
- *   This is the group of complex solve driver functions for HE matrices
- * @defgroup complexGBsolve complex
- *   @ingroup solveGB
- *   This is the group of complex solve driver functions for GB matrices
- * @defgroup complexGTsolve complex
- *   @ingroup solveGT
- *   This is the group of complex solve driver functions for GT matrices
- * @defgroup complexPTsolve complex
- *   @ingroup solvePT
- *   This is the group of complex solve driver functions for PT matrices
- * @defgroup complexGEeigen complex
- *   @ingroup eigenGE
- *   This is the group of complex eigenvalue driver functions for GE matrices
- * @defgroup complexSYeigen complex
- *   @ingroup eigenSY
- *   This is the group of complex eigenvalue driver functions for SY matrices
- * @defgroup complexHEeigen complex
- *   @ingroup eigenHE
- *   This is the group of complex eigenvalue driver functions for HE matrices
- * @defgroup complexGEsing complex
- *   @ingroup singGE
- *   This is the group of complex singular value driver functions for GE matrices
- * @defgroup complexGEcomputational complex
- *   @ingroup computationalGE
- *   This is the group of complex computational functions for GE matrices
- * @defgroup complexPOcomputational complex
- *   @ingroup computationalPO
- *   This is the group of complex computational functions for PO matrices
- * @defgroup complexSYcomputational complex
- *   @ingroup computationalSY
- *   This is the group of complex computational functions for SY matrices
- * @defgroup complexHEcomputational complex
- *   @ingroup computationalHE
- *   This is the group of complex computational functions for HE matrices
- * @defgroup complexGBcomputational complex
- *   @ingroup computationalGB
- *   This is the group of complex computational functions for GB matrices
- * @defgroup complexGTcomputational complex
- *   @ingroup computationalGT
- *   This is the group of complex computational functions for GT matrices
- * @defgroup complexPTcomputational complex
- *   @ingroup computationalPT
- *   This is the group of complex computational functions for PT matrices
- * @defgroup complexGEauxiliary complex
- *   @ingroup auxiliaryGE
- *   This is the group of complex auxiliary functions for GE matrices
- * @defgroup complexPOauxiliary complex
- *   @ingroup auxiliaryPO
- *   This is the group of complex auxiliary functions for PO matrices
- * @defgroup complexSYauxiliary complex
- *   @ingroup auxiliarySY
- *   This is the group of complex auxiliary functions for SY matrices
- * @defgroup complexHEauxiliary complex
- *   @ingroup auxiliaryHE
- *   This is the group of complex auxiliary functions for HE matrices
- * @defgroup complexGBauxiliary complex
- *   @ingroup auxiliaryGB
- *   This is the group of complex auxiliary functions for GB matrices
- * @defgroup complexOTHERauxiliary  complex
- *   @ingroup OTHERauxiliary
- *   This is the group of complex other auxiliary routines
- * @defgroup complexOTHERcomputational  complex
- *   @ingroup OTHERcomputational
- *   This is the group of complex other Computational routines
- * @defgroup complexOTHERsolve  complex Other Solve Routines
- *   @ingroup OTHERsolve
- *   This is the group of complex Other Solve routines
- * @defgroup complexOTHEReigen  complex Other Eigenvalue routines
- *   @ingroup OTHEReigen
- *   This is the group of complex Other Eigenvalue routines
- *
- ****
- *
- * @defgroup complex16GEsolve complex16
- *   @ingroup solveGE
- *   This is the group of complex16 solve driver functions for GE matrices
- * @defgroup complex16POsolve complex16
- *   @ingroup solvePO
- *   This is the group of complex16 solve driver functions for PO matrices
- * @defgroup complex16SYsolve complex16
- *   @ingroup solveSY
- *   This is the group of complex16 solve driver functions for SY matrices
- * @defgroup complex16HEsolve complex16
- *   @ingroup solveHE
- *   This is the group of complex16 solve driver functions for HE matrices
- * @defgroup complex16GBsolve complex16
- *   @ingroup solveGB
- *   This is the group of complex16 solve driver functions for GB matrices
- * @defgroup complex16GTsolve complex16
- *   @ingroup solveGT
- *   This is the group of complex16 solve driver functions for GT matrices
- * @defgroup complex16PTsolve complex16
- *   @ingroup solvePT
- *   This is the group of complex16 solve driver functions for PT matrices
- * @defgroup complex16GEeigen complex16
- *   @ingroup eigenGE
- *   This is the group of complex16 eigenvalue driver functions for GE matrices
- * @defgroup complex16SYeigen complex16
- *   @ingroup eigenSY
- *   This is the group of complex16 eigenvalue driver functions for SY matrices
- * @defgroup complex16HEeigen complex16
- *   @ingroup eigenHE
- *   This is the group of complex16 eigenvalue driver functions for HE matrices
- * @defgroup complex16GEsing complex16
- *   @ingroup singGE
- *   This is the group of complex16 singular value driver functions for GE matrices
- * @defgroup complex16GEcomputational complex16
- *   @ingroup computationalGE
- *   This is the group of complex16 computational functions for GE matrices
- * @defgroup complex16POcomputational complex16
- *   @ingroup computationalPO
- *   This is the group of complex16 computational functions for PO matrices
- * @defgroup complex16SYcomputational complex16
- *   @ingroup computationalSY
- *   This is the group of complex16 computational functions for SY matrices
- * @defgroup complex16HEcomputational complex16
- *   @ingroup computationalHE
- *   This is the group of complex16 computational functions for HE matrices
- * @defgroup complex16GBcomputational complex16
- *   @ingroup computationalGB
- *   This is the group of complex16 computational functions for GB matrices
- * @defgroup complex16GTcomputational complex16
- *   @ingroup computationalGT
- *   This is the group of complex16 computational functions for GT matrices
- * @defgroup complex16PTcomputational complex16
- *   @ingroup computationalPT
- *   This is the group of complex16 computational functions for PT matrices
- * @defgroup complex16GEauxiliary complex16
- *   @ingroup auxiliaryGE
- *   This is the group of complex16 auxiliary functions for GE matrices
- * @defgroup complex16POauxiliary complex16
- *   @ingroup auxiliaryPO
- *   This is the group of complex16 auxiliary functions for PO matrices
- * @defgroup complex16SYauxiliary complex16
- *   @ingroup auxiliarySY
- *   This is the group of complex16 auxiliary functions for SY matrices
- * @defgroup complex16HEauxiliary complex16
- *   @ingroup auxiliaryHE
- *   This is the group of complex16 auxiliary functions for HE matrices
- * @defgroup complex16GBauxiliary complex16
- *   @ingroup auxiliaryGB
- *   This is the group of complex16 auxiliary functions for GB matrices
- * @defgroup complex16OTHERcomputational  complex16
- *   @ingroup OTHERcomputational
- *   This is the group of complex16 other Computational routines
- * @defgroup complex16OTHERauxiliary  complex16
- *   @ingroup OTHERauxiliary
- *   This is the group of complex16 other auxiliary routines
- * @defgroup auxOTHERcomputational  auxiliary Computational routines
- *   @ingroup OTHERcomputational
- *   This is the group of auxiliary Computational routines
- * @defgroup complex16OTHERsolve  complex16 Other Solve Routines
- *   @ingroup OTHERsolve
- *   This is the group of complex16 Other Solve routines
- * @defgroup complex16OTHEReigen  complex16 Other Eigenvalue routines
- *   @ingroup OTHEReigen
- *   This is the group of complex16 Other Eigenvalue routines
- *
- ****
- *
- * @defgroup testing LAPACK Testing
- *   This is the group of LAPACK TESTING routines.
- *
- * @defgroup matgen Matrix Generation
- *   @ingroup testing
- *   This is the group of LAPACK TESTING MATGEN routines.
- *
- * @defgroup lin Linear Solve
- *   @ingroup testing
- *   This is the group of LAPACK TESTING LIN routines.
- *
- * @defgroup eig Eigenvalue and Singular value
- *   @ingroup testing
- *   This is the group of LAPACK TESTING EIG routines.
- *
- * @defgroup real_matgen real
- *   @ingroup matgen
- *   This is the group of real LAPACK TESTING MATGEN routines.
- *
- * @defgroup double_matgen double
- *   @ingroup matgen
- *   This is the group of double LAPACK TESTING MATGEN routines.
- *
- * @defgroup complex_matgen complex
- *   @ingroup matgen
- *   This is the group of complex LAPACK TESTING MATGEN routines.
- *
- * @defgroup complex16_matgen complex16
- *   @ingroup matgen
- *   This is the group of complex16 LAPACK TESTING MATGEN routines.
- *
- * @defgroup aux_matgen aux
- *   @ingroup matgen
- *   This is the group of auxiliary LAPACK TESTING MATGEN routines.
- *
- * @defgroup single_lin real
- *   @ingroup lin
- *   This is the group of real LAPACK TESTING LIN routines.
- *
- * @defgroup double_lin double
- *   @ingroup lin
- *   This is the group of double LAPACK TESTING LIN routines.
- *
- * @defgroup complex_lin complex
- *   @ingroup lin
- *   This is the group of complex LAPACK TESTING LIN routines.
- *
- * @defgroup complex16_lin complex16
- *   @ingroup lin
- *   This is the group of complex16 LAPACK TESTING LIN routines.
- *
- * @defgroup aux_lin aux
- *   @ingroup lin
- *   This is the group of auxiliary LAPACK TESTING LIN routines.
- *
- * @defgroup single_eig real
- *   @ingroup eig
- *   This is the group of real LAPACK TESTING EIG routines.
- *
- * @defgroup double_eig double
- *   @ingroup eig
- *   This is the group of double LAPACK TESTING EIG routines.
- *
- * @defgroup complex_eig complex
- *   @ingroup eig
- *   This is the group of complex LAPACK TESTING EIG routines.
- *
- * @defgroup complex16_eig complex16
- *   @ingroup eig
- *   This is the group of complex16 LAPACK TESTING EIG routines.
- *
- * @defgroup aux_eig aux
- *   @ingroup eig
- *   This is the group of auxiliary LAPACK TESTING EIG routines.
- *
- ****
- * @defgroup blas Reference BLAS
- *   This is the group of BLAS routines.
- *
- * @defgroup level1 Level1
- *   @ingroup blas
- *   This is the group of LEVEL 1 BLAS routines.
- * @defgroup level2 Level2
- *   @ingroup blas
- *   This is the group of LEVEL 2 BLAS routines.
- * @defgroup level3 Level3
- *   @ingroup blas
- *   This is the group of LEVEL 3 BLAS routines.
-  * @defgroup aux_blas Auxiliary BLAS
- *   @ingroup blas
- *   This is the group of Auxiliary 3 BLAS routines.
-* @defgroup blastesting Testing
- *   @ingroup blas
- *   This is the group of BLAS TESTING routines.
- *
- * @defgroup single_blas_level1 real
- *   @ingroup level1
- *   This is the group of real LEVEL 1 BLAS routines.
- * @defgroup double_blas_level1 double
- *   @ingroup level1
- *   This is the group of double LEVEL 1 BLAS routines.
- * @defgroup complex_blas_level1 complex
- *   @ingroup level1
- *   This is the group of complex LEVEL 1 BLAS routines.
- * @defgroup complex16_blas_level1 complex16
- *   @ingroup level1
- *   This is the group of complex16 LEVEL 1 BLAS routines.
- *
- * @defgroup single_blas_level2 real
- *   @ingroup level2
- *   This is the group of real LEVEL 2 BLAS routines.
- * @defgroup double_blas_level2 double
- *   @ingroup level2
- *   This is the group of double LEVEL 2 BLAS routines.
- * @defgroup complex_blas_level2 complex
- *   @ingroup level2
- *   This is the group of complex LEVEL 2 BLAS routines.
- * @defgroup complex16_blas_level2 complex16
- *   @ingroup level2
- *   This is the group of complex16 LEVEL 2 BLAS routines.
- *
- * @defgroup single_blas_level3 real
- *   @ingroup level3
- *   This is the group of real LEVEL 3 BLAS routines.
- * @defgroup double_blas_level3 double
- *   @ingroup level3
- *   This is the group of double LEVEL 3 BLAS routines.
- * @defgroup complex_blas_level3 complex
- *   @ingroup level3
- *   This is the group of complex LEVEL 3 BLAS routines.
- * @defgroup complex16_blas_level3 complex16
- *   @ingroup level3
- *   This is the group of complex16 LEVEL 3 BLAS routines.
- *
- * @defgroup single_blas_testing real
- *   @ingroup blastesting
- *   This is the group of real BLAS TESTING routines.
- * @defgroup double_blas_testing double
- *   @ingroup blastesting
- *   This is the group of double BLAS TESTING routines.
- * @defgroup complex_blas_testing complex
- *   @ingroup blastesting
- *   This is the group of complex BLAS TESTING routines.
- * @defgroup complex16_blas_testing complex16
- *   @ingroup blastesting
- *   This is the group of complex16 BLAS TESTING routines.
- *
+
+@defgroup lapack_top                LAPACK
+@{
+    @defgroup solve_top             Linear solve, AX = B
+    @{
+        @defgroup gesv_driver_grp   LU: General matrix, driver
+        @{
+            @defgroup gesv_driver   --- full ---
+            @defgroup gesv          gesv:           factor and solve
+            @defgroup gesvx         gesvx:          factor and solve, expert
+            @defgroup gesvxx        gesvxx:         factor and solve, extra precise
+            @defgroup gesv_mixed    gesv:           factor and solve, mixed precision
+
+            @defgroup gbsv_driver   --- banded ---
+            @defgroup gbsv          gbsv:           factor and solve
+            @defgroup gbsvx         gbsvx:          factor and solve, expert
+            @defgroup gbsvxx        gbsvxx:         factor and solve, extra precise
+
+            @defgroup gtsv_driver   --- tridiagonal ---
+            @defgroup gtsv          gtsv:           factor and solve
+            @defgroup gtsvx         gtsvx:          factor and solve, expert
+        @}
+
+        @defgroup gesv_comp_grp     LU: computational routines (factor, cond, etc.)
+        @{
+            @defgroup gesv_comp     --- full ---
+            @defgroup gecon         gecon:          condition number estimate
+            @defgroup getrf         getrf:          triangular factor
+            @defgroup getrf2        getrf2:         triangular factor panel, recursive
+            @defgroup getf2         getf2:          triangular factor panel, level 2
+            @defgroup getrs         getrs:          triangular solve using factor
+            @defgroup getri         getri:          triangular inverse
+            @defgroup gerfs         gerfs:          iterative refinement
+            @defgroup gerfsx        gerfsx:         iterative refinement, expert
+            @defgroup geequ         geequ:          equilibration
+            @defgroup geequb        geequb:         equilibration, power of 2
+            @defgroup laqge         laqge:          row/col scale matrix
+            @defgroup laswp         laswp:          swap permutation
+
+            @defgroup getc2         getc2:          triangular factor, with complete pivoting
+            @defgroup gesc2         gesc2:          triangular solve using factor, with complete pivoting
+            @defgroup latdf         latdf:          Dif-estimate with complete pivoting LU, step in tgsen
+
+            @defgroup la_gercond    la_gercond: Skeel condition number estimate
+            @defgroup la_gerpvgrw   la_gerpvgrw: reciprocal pivot growth
+            @defgroup la_gerfsx_extended  la_gerfsx_extended:  step in gerfsx
+
+            @defgroup gbsv_comp     --- banded ---
+            @defgroup gbcon         gbcon:          condition number estimate
+            @defgroup gbtrf         gbtrf:          triangular factor
+            @defgroup gbtf2         gbtf2:          triangular factor, level 2
+            @defgroup gbtrs         gbtrs:          triangular solve using factor
+            @defgroup gbrfs         gbrfs:          iterative refinement
+            @defgroup gbrfsx        gbrfsx:         iterative refinement, expert
+            @defgroup gbequ         gbequ:          equilibration
+            @defgroup gbequb        gbequb:         equilibration, power of 2
+            @defgroup laqgb         laqgb:          row/col scale matrix
+            @defgroup la_gbrcond    la_gbrcond:     Skeel condition number estimate
+            @defgroup la_gbrpvgrw   la_gbrpvgrw:    reciprocal pivot growth
+            @defgroup la_gbrfsx_extended  la_gbrfsx_extended:  step in gbrfsx
+
+            @defgroup gtsv_comp     --- tridiagonal ---
+            @defgroup gtcon         gtcon:          condition number estimate
+            @defgroup gttrf         gttrf:          triangular factor
+            @defgroup gttrs         gttrs:          triangular solve using factor
+            @defgroup gtts2         gtts2:          triangular solve using factor, unblocked
+            @defgroup gtrfs         gtrfs:          iterative refinement
+        @}
+
+        @defgroup posv_driver_grp   Cholesky: Hermitian/symmetric positive definite matrix, driver
+        @{
+            @defgroup posv_driver   --- full ---
+            @defgroup posv          posv:           factor and solve
+            @defgroup posvx         posvx:          factor and solve, expert
+            @defgroup posvxx        posvxx:         factor and solve, extra precise
+            @defgroup posv_mixed    posv:           factor and solve, mixed precision
+
+            @defgroup ppsv_driver   --- packed ---
+            @defgroup ppsv          ppsv:           factor and solve
+            @defgroup ppsvx         ppsvx:          factor and solve, expert
+
+            @defgroup pfsv_driver   --- rectangular full packed (RFP) ---
+            @defgroup pfsv          pfsv:           factor and solve [not available]
+
+            @defgroup pbsv_driver   --- banded ---
+            @defgroup pbsv          pbsv:           factor and solve
+            @defgroup pbsvx         pbsvx:          factor and solve, expert
+
+            @defgroup ptsv_driver   --- tridiagonal ---
+            @defgroup ptsv          ptsv:           factor and solve
+            @defgroup ptsvx         ptsvx:          factor and solve, expert
+        @}
+
+        @defgroup posv_comp_grp     Cholesky: computational routines (factor, cond, etc.)
+        @{
+            @defgroup posv_comp     --- full ---
+            @defgroup pocon         pocon:          condition number estimate
+            @defgroup potrf         potrf:          triangular factor
+            @defgroup potrf2        potrf2:         triangular factor panel, recursive
+            @defgroup potf2         potf2:          triangular factor panel, level 2
+            @defgroup pstrf         pstrf:          triangular factor, with pivoting
+            @defgroup pstf2         pstf2:          triangular factor, with pivoting panel, level 2
+            @defgroup potrs         potrs:          triangular solve using factor
+            @defgroup potri         potri:          triangular inverse
+            @defgroup porfs         porfs:          iterative refinement
+            @defgroup porfsx        porfsx:         iterative refinement, expert
+            @defgroup poequ         poequ:          equilibration
+            @defgroup poequb        poequb:         equilibration, power of 2
+            @defgroup laqhe         laqhe:          row/col scale matrix
+            @defgroup la_porcond    la_porcond:     Skeel condition number estimate
+            @defgroup la_porpvgrw   la_porpvgrw:    reciprocal pivot growth
+            @defgroup la_porfsx_extended  la_porfsx_extended:  step in porfsx
+
+            @defgroup ppsv_comp     --- packed ---
+            @defgroup ppcon         ppcon:          condition number estimate
+            @defgroup pptrf         pptrf:          triangular factor
+            @defgroup pptrs         pptrs:          triangular solve using factor
+            @defgroup pptri         pptri:          triangular inverse
+            @defgroup pprfs         pprfs:          iterative refinement
+            @defgroup ppequ         ppequ:          equilibration
+            @defgroup laqhp         laqhp:          row/col scale matrix
+
+            @defgroup pfsv_comp     --- rectangular full packed (RFP) ---
+            @defgroup pftrf         pftrf:          triangular factor
+            @defgroup pftrs         pftrs:          triangular solve using factor
+            @defgroup pftri         pftri:          triangular inverse
+
+            @defgroup pbsv_comp     --- banded ---
+            @defgroup pbcon         pbcon:          condition number estimate
+            @defgroup pbtrf         pbtrf:          triangular factor
+            @defgroup pbtf2         pbtf2:          triangular factor panel, level 2
+            @defgroup pbtrs         pbtrs:          triangular solve using factor
+            @defgroup pbrfs         pbrfs:          iterative refinement
+            @defgroup pbequ         pbequ:          equilibration
+            @defgroup laqhb         laqhb:          row/col scale matrix
+
+            @defgroup ptsv_comp     --- tridiagonal ---
+            @defgroup ptcon         ptcon:          condition number estimate
+            @defgroup pttrf         pttrf:          triangular factor
+            @defgroup pttrs         pttrs:          triangular solve using factor
+            @defgroup ptts2         ptts2:          triangular solve using factor, unblocked
+            @defgroup ptrfs         ptrfs:          iterative refinement
+        @}
+
+        @defgroup hesv_driver_grp   LDL: Hermitian/symmetric indefinite matrix, driver
+        @{
+            @defgroup hesv_driver   --- full, rook pivoting ---
+            @defgroup hesv          {he,sy}sv:      rook (v1)
+            @defgroup hesv_rook     {he,sy}sv_rook: rook (v2)
+            @defgroup hesv_rk       {he,sy}sv_rk:   rook (v3)
+            @defgroup hesvx         {he,sy}svx:     rook (v1, expert)
+            @defgroup hesvxx        {he,sy}svxx:    rook (v1, expert, extra precise)
+
+            @defgroup hpsv_driver   --- packed, rook pivoting ---
+            @defgroup hpsv          {hp,sp}sv:      factor and solve
+            @defgroup hpsvx         {hp,sp}svx:     factor and solve, expert
+
+            @defgroup hesv_aa_driver  --- full, Aasen ---
+            @defgroup hesv_aa       {he,sy}sv_aa:   Aasen
+            @defgroup hesv_aa_2stage  {he,sy}sv_aa_2stage:  Aasen, blocked 2-stage
+        @}
+
+        @defgroup hesv_comp_grp     LDL: computational routines (factor, cond, etc.)
+        @{
+            @defgroup hesv_comp_v1  --- full, rook v1 ---
+            @defgroup hecon         {he,sy}con:     condition number estimate
+            @defgroup hetrf         {he,sy}trf:     triangular factor
+            @defgroup lahef         la{he,sy}f:     step in hetrf
+            @defgroup hetf2         {he,sy}tf2:     triangular factor, level 2
+            @defgroup hetrs         {he,sy}trs:     triangular solve using factor
+            @defgroup hetri         {he,sy}tri:     triangular inverse
+            @defgroup herfs         {he,sy}rfs:     iterative refinement
+            @defgroup herfsx        {he,sy}rfsx:    iterative refinement, expert
+            @defgroup heequb        {he,sy}equb:    equilibration, power of 2
+            @defgroup syconv        syconv:         convert to/from L and D from hetrf
+
+            @defgroup hecon_3       {he,sy}con_3:   condition number estimate
+            @defgroup hetri2        {he,sy}tri2:    inverse
+            @defgroup hetri2x       {he,sy}tri2x:   inverse
+            @defgroup hetri_3       {he,sy}tri_3:   inverse
+            @defgroup hetri_3x      {he,sy}tri_3x:  inverse
+            @defgroup hetrs2        {he,sy}trs2:    solve using factor
+            @defgroup hetrs_3       {he,sy}trs_3:   solve using factor
+
+            @defgroup heswapr             {he,sy}swapr:         apply 2-sided permutation
+            @defgroup la_hercond          la_hercond:           Skeel condition number estimate
+            @defgroup la_herfsx_extended  la_herfsx_extended:   step in herfsx
+            @defgroup la_herpvgrw         la_herpvgrw:          reciprocal pivot growth
+
+            @defgroup hpsv_comp     --- packed, rook v1 ---
+            @defgroup hpcon         {hp,sp}con:     condition number estimate
+            @defgroup hptrf         {hp,sp}trf:     triangular factor
+            @defgroup hptrs         {hp,sp}trs:     triangular solve using factor
+            @defgroup hptri         {hp,sp}tri:     triangular inverse
+            @defgroup hprfs         {hp,sp}rfs:     iterative refinement
+
+            @defgroup hesv_comp_v2  --- full, rook v2 ---
+            @defgroup hecon_rook    {he,sy}con_rook:  condition number estimate
+            @defgroup hetrf_rook    {he,sy}trf_rook:  triangular factor
+            @defgroup lahef_rook    la{he,sy}f_rook:  triangular factor step
+            @defgroup hetf2_rook    {he,sy}tf2_rook:  triangular factor, level 2
+            @defgroup hetrs_rook    {he,sy}trs_rook:  triangular solve using factor
+            @defgroup hetri_rook    {he,sy}tri_rook:  triangular inverse
+
+            @defgroup hesv_comp_v3  --- full, rook v3 ---
+            @defgroup hetrf_rk      {he,sy}trf_rk:  triangular factor
+            @defgroup lahef_rk      la{he,sy}f_rk:  triangular factor step
+            @defgroup hetf2_rk      {he,sy}tf2_rk:  triangular factor, level 2
+            @defgroup syconvf       syconvf:        convert to/from hetrf      to hetrf_rk format
+            @defgroup syconvf_rook  syconvf_rook:   convert to/from hetrf_rook to hetrf_rk format
+
+            @defgroup hesv_comp_aasen  --- full, Aasen ---
+            @defgroup hetrf_aa      {he,sy}trf_aa:  triangular factor
+            @defgroup lahef_aa      la{he,sy}f_aa:  triangular factor partial factor
+            @defgroup hetrs_aa      {he,sy}trs_aa:  triangular solve using factor
+
+            @defgroup hesv_comp_aasen2 --- full, Aasen, blocked 2-stage ---
+            @defgroup hetrf_aa_2stage  {he,sy}trf_aa_2stage:  triangular factor
+            @defgroup hetrs_aa_2stage  {he,sy}trs_aa_2stage:  triangular solve using factor
+        @}
+
+        @defgroup trsv_comp_grp     Triangular computational routines (solve, cond, etc.)
+        @{
+            @defgroup trsv_comp     --- full ---
+            @defgroup trcon         trcon:          condition number estimate
+            @defgroup trtrs         trtrs:          triangular solve
+            @defgroup latrs         latrs:          triangular solve with robust scaling
+            @defgroup latrs3        latrs3:         triangular solve with robust scaling, level 3
+            @defgroup trtri         trtri:          triangular inverse
+            @defgroup trti2         trti2:          triangular inverse, level 2
+            @defgroup trrfs         trrfs:          triangular iterative refinement
+            @defgroup lauum         lauum:          triangular multiply: U^H U
+            @defgroup lauu2         lauu2:          triangular multiply: U^H U, level 2
+
+            @defgroup tpsv_comp     --- packed ---
+            @defgroup tpcon         tpcon:          condition number estimate
+            @defgroup tptrs         tptrs:          triangular solve
+            @defgroup latps         latps:          triangular solve with robust scaling
+            @defgroup tptri         tptri:          triangular inverse
+            @defgroup tprfs         tprfs:          triangular iterative refinement
+
+            @defgroup tfsv_comp     --- rectangular full packed (RFP) ---
+            @defgroup tftri         tftri:          triangular inverse, RFP
+
+            @defgroup tbsv_comp     --- banded ---
+            @defgroup tbcon         tbcon:          condition number estimate
+            @defgroup tbtrs         tbtrs:          triangular solve
+            @defgroup latbs         latbs:          triangular solve with scaling
+            @defgroup tbrfs         tbrfs:          triangular iterative refinement
+        @}
+
+        @defgroup solve_aux_grp     Auxiliary routines
+        @{
+            @defgroup lacn2         lacn2:          1-norm estimate, e.g., || A^{-1} ||_1 in gecon
+            @defgroup lacon         lacon:          1-norm estimate, e.g., || A^{-1} ||_1 in gecon, old
+            @defgroup la_lin_berr   la_lin_berr:    backward error
+        @}
+    @}
+
+    @defgroup gels_top Least squares
+    @{
+        @defgroup gels_driver_grp   Standard least squares, min || Ax - b ||_2
+        @{
+            @defgroup gels          gels:           least squares using QR/LQ
+            @defgroup gelst         gelst:          least squares using QR/LQ with T matrix
+            @defgroup gelss         gelss:          least squares using SVD, QR iteration
+            @defgroup gelsd         gelsd:          least squares using SVD, divide and conquer
+            @defgroup gelsy         gelsy:          least squares using complete orthogonal factor
+            @defgroup getsls        getsls:         least squares using tall-skinny QR/LQ
+        @}
+
+        @defgroup ggls_driver_grp   Constrained least squares
+        @{
+            @defgroup gglse         gglse:          equality-constrained least squares
+            @defgroup ggglm         ggglm:          Gauss-Markov linear model
+        @}
+
+        @defgroup gels_aux_grp      Auxiliary routines
+        @{
+            @defgroup laic1         laic1:          condition estimate, step in gelsy
+            @defgroup lals0         lals0:          back multiplying factors, step in gelsd
+            @defgroup lalsa         lalsa:          SVD of coefficient matrix, step in gelsd
+            @defgroup lalsd         lalsd:          uses SVD for least squares, step in gelsd
+        @}
+    @}
+
+    @defgroup unitary_top Orthogonal/unitary factors (QR, CS, etc.)
+    @{
+        @defgroup geqr_comp_grp     QR
+        @{
+            @defgroup geqr_comp1    --- flexible ---
+            @defgroup geqr          geqr:           QR factor, flexible
+            @defgroup gemqr         gemqr:          multiply by Q from geqr
+
+            @defgroup geqr_comp2    --- classic ---
+            @defgroup geqrf         geqrf:          QR factor
+            @defgroup geqr2         geqr2:          QR factor, level 2
+            @defgroup ungqr         {un,or}gqr:     generate explicit Q from geqrf
+            @defgroup ung2r         {un,or}g2r:     generate explicit Q from geqrf, level 2
+            @defgroup unmqr         {un,or}mqr:     multiply by Q from geqrf
+            @defgroup unm2r         {un,or}m2r:     multiply by Q from geqrf, level 2
+
+            @defgroup geqr_comp3    --- with T ---
+            @defgroup geqrt         geqrt:          QR factor, with T
+            @defgroup geqrt2        geqrt2:         QR factor, with T, level 2
+            @defgroup geqrt3        geqrt3:         QR factor, with T, recursive panel
+            @defgroup gemqrt        gemqrt:         multiply by Q from geqrt
+
+            @defgroup geqr_comp4    --- positive ---
+            @defgroup geqrfp        geqrfp:         QR factor, diag( R ) ≥ 0
+            @defgroup geqr2p        geqr2p:         QR factor, diag( R ) ≥ 0, level 2
+        @}
+
+        @defgroup geqpf_comp_grp    QR with pivoting
+        @{
+            @defgroup geqp3         geqp3:          QR factor with pivoting, level 3
+            @defgroup laqp2         laqp2:          step of geqp3
+            @defgroup laqps         laqps:          step of geqp3
+        @}
+
+        @defgroup getsqr_comp_grp   QR, tall-skinny
+        @{
+            @defgroup latsqr        latsqr:         tall-skinny QR factor
+            @defgroup ungtsqr       {un,or}gtsqr:   generate Q from latsqr
+            @defgroup ungtsqr_row   {un,or}gtsqr_row:   generate Q from latsqr
+            @defgroup larfb_gett    larfb_gett:     step in ungtsqr_row
+            @defgroup lamtsqr       lamtsqr:        multiply by Q from latsqr
+            @defgroup getsqrhrt     getsqrhrt:      tall-skinny QR factor, with Householder reconstruction
+            @defgroup unhr_col      {un,or}hr_col:  Householder reconstruction
+            @defgroup launhr_col_getrfnp   la{un,or}hr_col_getrfnp:   LU factor without pivoting
+            @defgroup launhr_col_getrfnp2  la{un,or}hr_col_getrfnp2:  LU factor without pivoting, level 2
+        @}
+
+        @defgroup tpqr_comp_grp     QR, triangular-pentagonal
+        @{
+            @defgroup tpqrt         tpqrt:          QR factor
+            @defgroup tpqrt2        tpqrt2:         QR factor, level 2
+            @defgroup tpmqrt        tpmqrt:         applies Q
+            @defgroup tprfb         tprfb:          applies Q (like larfb)
+        @}
+
+        @defgroup ggqr_comp_grp     Generalized QR
+        @{
+            @defgroup ggqrf         ggqrf:          Generalized QR factor
+        @}
+
+        @defgroup gelq_comp_grp     LQ
+        @{
+            @defgroup gelq_comp1    --- flexible ---
+            @defgroup gelq          gelq:           LQ factor, flexible
+            @defgroup gemlq         gemlq:          multiply by Q from gelq
+
+            @defgroup gelq_comp2    --- classic ---
+            @defgroup gelqf         gelqf:          LQ factor
+            @defgroup gelq2         gelq2:          LQ factor, level 2
+            @defgroup unglq         {un,or}glq:     generate explicit Q from gelqf
+            @defgroup ungl2         {un,or}gl2:     generate explicit Q, level 2, step in unglq
+            @defgroup unmlq         {un,or}mlq:     multiply by Q from gelqf
+            @defgroup unml2         {un,or}ml2:     multiply by Q, level 2, step in unmlq
+
+            @defgroup gelq_comp3    --- with T ---
+            @defgroup gelqt         gelqt:          LQ factor, with T
+            @defgroup gelqt3        gelqt3:         LQ factor, with T, recursive
+            @defgroup gemlqt        gemlqt:         multiply by Q from gelqt
+        @}
+
+        @defgroup geswlq_comp_grp   LQ, short-wide
+        @{
+            @defgroup laswlq        laswlq:         short-wide LQ factor
+            @defgroup lamswlq       lamswlq:        multiply by Q from laswlq
+        @}
+
+        @defgroup tplq_comp_grp     LQ, triangular-pentagonal
+        @{
+            @defgroup tplqt         tplqt:          LQ factor
+            @defgroup tplqt2        tplqt2:         LQ factor, level 2
+            @defgroup tpmlqt        tpmlqt:         applies Q
+        @}
+
+        @defgroup geql_comp_grp     QL
+        @{
+            @defgroup geqlf         geqlf:          QL factor
+            @defgroup geql2         geql2:          QL factor, level 2
+            @defgroup ungql         {un,or}gql:     generate explicit Q from geqlf
+            @defgroup unmql         {un,or}mql:     multiply by Q from geqlf
+            @defgroup ung2l         {un,or}g2l:     step in ungql
+            @defgroup unm2l         {un,or}m2l:     step in unmql
+        @}
+
+        @defgroup gerq_comp_grp     RQ
+        @{
+            @defgroup gerqf         gerqf:          RQ factor
+            @defgroup gerq2         gerq2:          RQ factor, level 2
+            @defgroup ungrq         {un,or}grq:     generate explicit Q from gerqf
+            @defgroup unmrq         {un,or}mrq:     multiply by Q from gerqf
+            @defgroup unmr2         {un,or}mr2:     step in unmrq
+            @defgroup ungr2         {un,or}gr2:     step in ungrq
+        @}
+
+        @defgroup ggrq_comp_grp     Generalized RQ
+        @{
+            @defgroup ggrqf         ggrqf:          Generalized RQ factor
+        @}
+
+        @defgroup gerz_comp_grp     RZ
+        @{
+            @defgroup tzrzf         tzrzf:          RZ factor
+            @defgroup latrz         latrz:          RZ factor step
+            @defgroup unmrz         {un,or}mrz:     multiply by Z from tzrzf
+            @defgroup unmr3         {un,or}mr3:     step in unmrz
+            @defgroup larz          larz:           apply reflector
+            @defgroup larzb         larzb:          apply block reflector
+            @defgroup larzt         larzt:          generate T matrix
+        @}
+
+        @defgroup gecs_comp_grp     Cosine-Sine (CS) decomposition
+        @{
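+            @defgroup bbcsd         bbcsd:          CS decomposition of unitary matrix in bidiagonal-block form
+            @defgroup uncsd         {un,or}csd:     CS decomposition of partitioned unitary matrix
+            @defgroup uncsd2by1     {un,or}csd2by1: CS decomposition of 2-by-1 partitioned matrix with orthonormal columns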
+            @defgroup unbdb         {un,or}bdb:     bidiagonalize partitioned unitary matrix, step in uncsd
+            @defgroup unbdb1        {un,or}bdb1:    step in uncsd2by1
+            @defgroup unbdb2        {un,or}bdb2:    step in uncsd2by1
+            @defgroup unbdb3        {un,or}bdb3:    step in uncsd2by1
+            @defgroup unbdb4        {un,or}bdb4:    step in uncsd2by1
+            @defgroup unbdb5        {un,or}bdb5:    step in uncsd2by1
+            @defgroup unbdb6        {un,or}bdb6:    step in uncsd2by1
+
+            @defgroup lapmr         lapmr:          permute rows
+            @defgroup lapmt         lapmt:          permute cols
+        @}
+
+        @defgroup reflector_aux_grp Householder reflectors
+        @{
+            @defgroup larf          larf:           apply Householder reflector
+            @defgroup larfx         larfx:          apply Householder reflector, unrolled
+            @defgroup larfy         larfy:          apply Householder reflector symmetrically (2-sided)
+            @defgroup larfb         larfb:          apply block Householder reflector
+            @defgroup larfg         larfg:          generate Householder reflector
+            @defgroup larfgp        larfgp:         generate Householder reflector, beta ≥ 0
+            @defgroup larft         larft:          generate T matrix
+        @}
+
+        @defgroup rot_aux_grp       Givens/Jacobi plane rotations
+        @{
+            @defgroup lartg         lartg:          generate plane rotation, more accurate than BLAS rot
+            @defgroup lartgp        lartgp:         generate plane rotation, more accurate than BLAS rot, r ≥ 0
+            @defgroup lasr          lasr:           apply series of plane rotations
+            @defgroup largv         largv:          generate vector of plane rotations
+            @defgroup lartv         lartv:          apply vector of plane rotations to vectors
+            @defgroup lar2v         lar2v:          apply vector of plane rotations to 2x2 matrices
+            @defgroup lacrt         lacrt:          apply plane rotation (unused?)
+        @}
+    @}
+
+    @defgroup geev_top              Non-symmetric eigenvalues
+    @{
+        @defgroup geev_driver_grp   Standard eig driver, AV = VΛ
+        @{
+            @defgroup geev          geev:           eig
+            @defgroup geevx         geevx:          eig, expert
+
+            @defgroup gees          gees:           Schur form
+            @defgroup geesx         geesx:          Schur form, expert
+        @}
+
+        @defgroup ggev_driver_grp   Generalized eig driver
+        @{
+            @defgroup ggev3         ggev3:          eig
+            @defgroup ggev          ggev:           eig, unblocked
+            @defgroup ggevx         ggevx:          eig, expert
+
+            @defgroup gges3         gges3:          Schur form
+            @defgroup gges          gges:           Schur form, unblocked
+            @defgroup ggesx         ggesx:          Schur form, expert
+        @}
+
+        @defgroup gedmd             DMD driver, Dynamic Mode Decomposition
+
+        @defgroup geev_comp_grp     Eig computational routines
+        @{
+            @defgroup gebal         gebal:          balance matrix
+            @defgroup gehrd         gehrd:          reduction to Hessenberg
+            @defgroup gehd2         gehd2:          reduction to Hessenberg, level 2
+            @defgroup lahr2         lahr2:          step in gehrd
+            @defgroup unghr         {un,or}ghr:     generate Q from gehrd
+            @defgroup unmhr         {un,or}mhr:     multiply by Q from gehrd
+            @defgroup gebak         gebak:          back-transform eigvec
+            @defgroup hseqr         hseqr:          Hessenberg eig, QR iteration
+            @defgroup hsein         hsein:          Hessenberg inverse iteration for eigvec
+            @defgroup trevc         trevc:          eigenvectors of triangular Schur form, old
+            @defgroup trevc3        trevc3:         eigenvectors of triangular Schur form, blocked
+            @defgroup laln2         laln2:          1x1 or 2x2 solve, step in trevc
+
+            @defgroup trsyl         trsyl:          Sylvester equation
+            @defgroup trsyl3        trsyl3:         Sylvester equation, level 3
+            @defgroup lasy2         lasy2:          Sylvester equation
+
+            @defgroup trsna         trsna:          eig condition numbers
+            @defgroup laqtr         laqtr:          quasi-triangular solve
+
+            @defgroup trexc         trexc:          reorder Schur form
+            @defgroup trsen         trsen:          reorder Schur form
+            @defgroup laexc         laexc:          reorder Schur form
+
+            @defgroup lanv2         lanv2:          2x2 Schur factor
+
+            @defgroup laqr_group    --- hseqr auxiliary ---
+            @defgroup laein         laein:          eigvec by Hessenberg inverse iteration
+            @defgroup lahqr         lahqr:          eig of Hessenberg, step in hseqr
+            @defgroup laqr0         laqr0:          eig of Hessenberg, step in hseqr
+            @defgroup laqr1         laqr1:          step in hseqr
+            @defgroup laqr2         laqr2:          step in hseqr
+            @defgroup laqr3         laqr3:          step in hseqr
+            @defgroup laqr4         laqr4:          eig of Hessenberg, step in hseqr
+            @defgroup laqr5         laqr5:          step in hseqr
+
+            @defgroup iparmq        iparmq:         set parameters for hseqr
+
+            @defgroup laqz_group    --- ggev3, gges3 auxiliary ---
+            @defgroup laqz0         laqz0:          step in ggev3, gges3
+            @defgroup laqz1         laqz1:          step in ggev3, gges3
+            @defgroup laqz2         laqz2:          step in ggev3, gges3
+            @defgroup laqz3         laqz3:          step in ggev3, gges3
+            @defgroup laqz4         laqz4:          step in ggev3, gges3
+        @}
+
+        @defgroup ggev_comp_grp     Generalized eig computational routines
+        @{
+            @defgroup ggbal         ggbal:          balance matrix
+            @defgroup gghrd         gghrd:          reduction to Hessenberg
+            @defgroup gghd3         gghd3:          reduction to Hessenberg, level 3
+            @defgroup hgeqz         hgeqz:          generalized Hessenberg eig
+            @defgroup ggbak         ggbak:          back-transform eigvec
+            @defgroup tgsen         tgsen:          reorder generalized Schur form
+            @defgroup tgsna         tgsna:          reciprocal cond est
+            @defgroup tgsyl         tgsyl:          Sylvester equation
+            @defgroup tgsy2         tgsy2:          Sylvester equation, unblocked (level 2), step in tgsyl
+            @defgroup unm22         {un,or}m22:     multiply by banded Q, step in gghd3
+            @defgroup lagv2         lagv2:          2x2 generalized Schur factor
+            @defgroup tgevc         tgevc:          eigvec of pair of matrices
+            @defgroup tgexc         tgexc:          reorder generalized Schur form
+            @defgroup tgex2         tgex2:          reorder generalized Schur form
+        @}
+    @}
+
+    @defgroup heev_top              Hermitian/symmetric eigenvalues
+    @{
+        @defgroup heev_driver_grp   Standard eig driver, AV = VΛ
+        @{
+            @defgroup heev_driver   --- full ---
+            @defgroup heev          {he,sy}ev:      eig, QR iteration
+            @defgroup heevd         {he,sy}evd:     eig, divide and conquer
+            @defgroup heevr         {he,sy}evr:     eig, MRRR
+            @defgroup heevx         {he,sy}evx:     eig, bisection
+
+            @defgroup heev_driver2  --- full, 2-stage ---
+            @defgroup heev_2stage   {he,sy}ev_2stage:   eig, QR iteration
+            @defgroup heevd_2stage  {he,sy}evd_2stage:  eig, divide and conquer
+            @defgroup heevr_2stage  {he,sy}evr_2stage:  eig, MRRR
+            @defgroup heevx_2stage  {he,sy}evx_2stage:  eig, bisection
+
+            @defgroup hpev_driver   --- packed ---
+            @defgroup hpev          {hp,sp}ev:      eig, QR iteration
+            @defgroup hpevd         {hp,sp}evd:     eig, divide and conquer
+            @defgroup hpevx         {hp,sp}evx:     eig, bisection
+
+            @defgroup hbev_driver   --- banded ---
+            @defgroup hbev          {hb,sb}ev:      eig, QR iteration
+            @defgroup hbevd         {hb,sb}evd:     eig, divide and conquer
+            @defgroup hbevx         {hb,sb}evx:     eig, bisection
+
+            @defgroup hbev_driver2  --- banded, 2-stage ---
+            @defgroup hbev_2stage   {hb,sb}ev_2stage:   eig, QR iteration
+            @defgroup hbevd_2stage  {hb,sb}evd_2stage:  eig, divide and conquer
+            @defgroup hbevx_2stage  {hb,sb}evx_2stage:  eig, bisection
+
+            @defgroup stev_driver   --- tridiagonal ---
+            @defgroup stev          stev:           eig, QR iteration
+            @defgroup stevd         stevd:          eig, divide and conquer
+            @defgroup stevr         stevr:          eig, MRRR
+            @defgroup stevx         stevx:          eig, bisection
+            @defgroup pteqr         pteqr:          eig, positive definite tridiagonal
+
+            @defgroup stebz         stebz:          eig, Kahan
+            @defgroup sterf         sterf:          eig, QR iteration
+            @defgroup stedc         stedc:          eig, divide and conquer
+            @defgroup stegr         stegr:          eig, bisection, see stemr
+            @defgroup stein         stein:          eig, inverse iteration
+            @defgroup stemr         stemr:          eig, relatively robust representation (RRR)
+            @defgroup steqr         steqr:          eig, QR iteration
+        @}
+
+        @defgroup hegv_driver_grp   Generalized eig driver, AV = BVΛ, etc.
+        @{
+            @defgroup hegv_driver   --- full ---
+            @defgroup hegv          {he,sy}gv:      eig, QR iteration
+            @defgroup hegv_2stage   {he,sy}gv_2stage: eig, QR iteration, 2-stage
+            @defgroup hegvd         {he,sy}gvd:     eig, divide and conquer
+            @defgroup hegvx         {he,sy}gvx:     eig, bisection
+
+            @defgroup hpgv_driver   --- packed ---
+            @defgroup hpgv          {hp,sp}gv:      eig, QR iteration
+            @defgroup hpgvd         {hp,sp}gvd:     eig, divide and conquer
+            @defgroup hpgvx         {hp,sp}gvx:     eig, bisection
+
+            @defgroup hbgv_driver   --- banded ---
+            @defgroup hbgv          {hb,sb}gv:      eig, QR iteration
+            @defgroup hbgvd         {hb,sb}gvd:     eig, divide and conquer
+            @defgroup hbgvx         {hb,sb}gvx:     eig, bisection
+        @}
+
+        @defgroup heev_comp_grp     Eig computational routines
+        @{
+            @defgroup heev_comp     --- full ---
+            @defgroup disna         disna:          eig condition numbers
+            @defgroup hetrd         {he,sy}trd:     reduction to tridiagonal
+            @defgroup hetd2         {he,sy}td2:     reduction to tridiagonal, level 2
+            @defgroup latrd         latrd:          step in hetrd
+            @defgroup ungtr         {un,or}gtr:     generate Q from hetrd
+            @defgroup unmtr         {un,or}mtr:     multiply by Q from hetrd
+
+            @defgroup hetrd_2stage  {he,sy}trd_2stage: reduction to tridiagonal, 2-stage
+            @defgroup hetrd_he2hb   {he,sy}trd_he2hb:   full to band (1st stage)
+            @defgroup hetrd_hb2st   {he,sy}trd_hb2st:   band to tridiagonal (2nd stage)
+            @defgroup hb2st_kernels {hb,sb}2st_kernels: band to tridiagonal (2nd stage)
+
+            @defgroup lae2          lae2:           2x2 eig, step in steqr, stemr
+            @defgroup laesy         laesy:          2x2 eig
+            @defgroup laev2         laev2:          2x2 eig
+            @defgroup lagtf         lagtf:          LU factor of (T - λI)
+            @defgroup lagts         lagts:          LU solve  of (T - λI) x = y
+
+            @defgroup hpev_comp     --- packed ---
+            @defgroup hptrd         {hp,sp}trd:     reduction to tridiagonal
+            @defgroup upgtr         {up,op}gtr:     generate Q from hptrd
+            @defgroup upmtr         {up,op}mtr:     multiply by Q from hptrd
+
+            @defgroup hbev_comp     --- banded ---
+            @defgroup hbtrd         {hb,sb}trd:     reduction to tridiagonal
+        @}
+
+        @defgroup hegv_comp_grp     Generalized eig computational routines
+        @{
+            @defgroup hegst         {he,sy}gst:     reduction to standard form
+            @defgroup hegs2         {he,sy}gs2:     reduction to standard form, level 2
+            @defgroup hpgst         {hp,sp}gst:     reduction to standard form, packed
+            @defgroup hbgst         {hb,sb}gst:     reduction to standard form, banded
+            @defgroup pbstf         pbstf:          split Cholesky factor, use with hbgst
+            @defgroup lag2          lag2:           2x2 eig
+        @}
+
+        @defgroup stev_comp_grp     tridiag bisection routines
+        @{
+            @defgroup laebz         laebz:          counts eigvals <= value
+            @defgroup laneg         laneg:          Sturm count
+        @}
+
+        @defgroup laed_comp_grp     tridiag divide and conquer (D&C) routines
+        @{
+            @defgroup laed0         laed0:          D&C step: top level solver
+            @defgroup laed1         laed1:          D&C step: merge subproblems
+            @defgroup laed2         laed2:          D&C step: deflation
+            @defgroup laed3         laed3:          D&C step: secular equation
+            @defgroup laed4         laed4:          D&C step: secular equation nonlinear solver
+            @defgroup laed5         laed5:          D&C step: secular equation, 2x2
+            @defgroup laed6         laed6:          D&C step: secular equation Newton step
+            @defgroup lamrg         lamrg:          permutation to merge 2 sorted lists
+
+            @defgroup laed_comp2    --- eig value only or update Q ---
+            @defgroup laed7         laed7:          D&C step: merge subproblems
+            @defgroup laed8         laed8:          D&C step: deflation
+            @defgroup laed9         laed9:          D&C step: secular equation
+            @defgroup laeda         laeda:          D&C step: z vector
+        @}
+
+        @defgroup larr_comp_grp     tridiag RRR routines
+        @{
+            @defgroup larra         larra:          step in stemr
+            @defgroup larrb         larrb:          step in stemr
+            @defgroup larrc         larrc:          step in stemr
+            @defgroup larrd         larrd:          step in stemr, tridiag eig
+            @defgroup larre         larre:          step in stemr
+            @defgroup larrf         larrf:          step in stemr, find relative robust representation (RRR)
+            @defgroup larrj         larrj:          step in stemr, refine eigval estimates
+            @defgroup larrk         larrk:          step in stemr, compute one eigval
+            @defgroup larrr         larrr:          step in stemr, test to do expensive tridiag eig algorithm
+            @defgroup larrv         larrv:          eig tridiagonal, step in stemr & stegr
+            @defgroup lar1v         lar1v:          step in larrv, hence stemr & stegr
+        @}
+    @}
+
+    @defgroup svd_top               Singular Value Decomposition (SVD)
+    @{
+        @defgroup svd_driver_grp    Standard SVD driver, A = UΣV^H
+        @{
+            @defgroup gesvd_driver  --- full ---
+            @defgroup gesvd         gesvd:          SVD, QR iteration
+            @defgroup gesvdq        gesvdq:         SVD, QR with pivoting
+            @defgroup gesdd         gesdd:          SVD, divide and conquer
+            @defgroup gesvdx        gesvdx:         SVD, bisection
+            @defgroup gejsv         gejsv:          SVD, Jacobi, high-level
+            @defgroup gesvj         gesvj:          SVD, Jacobi, low-level
+
+            @defgroup bdsvd_driver  --- bidiagonal ---
+            @defgroup bdsqr         bdsqr:          bidiagonal SVD, QR iteration (dqds)
+            @defgroup bdsdc         bdsdc:          bidiagonal SVD, divide and conquer
+            @defgroup bdsvdx        bdsvdx:         bidiagonal SVD, bisection
+        @}
+
+        @defgroup ggsvd_driver_grp  Generalized SVD driver
+        @{
+            @defgroup ggsvd3        ggsvd3:         SVD, QR iteration
+        @}
+
+        @defgroup gesvd_comp_grp    SVD computational routines
+        @{
+            @defgroup gebrd         gebrd:          reduction to bidiagonal
+            @defgroup gebd2         gebd2:          reduction to bidiagonal, level 2
+            @defgroup labrd         labrd:          step in gebrd
+            @defgroup gbbrd         gbbrd:          band to bidiagonal
+            @defgroup ungbr         {un,or}gbr:     generate Q, P from gebrd
+            @defgroup unmbr         {un,or}mbr:     multiply by Q, P from gebrd
+
+            @defgroup gesvd_aux     --- auxiliary routines ---
+            @defgroup gsvj0         gsvj0:          step in gesvj
+            @defgroup gsvj1         gsvj1:          step in gesvj
+            @defgroup las2          las2:           2x2 triangular SVD
+            @defgroup lasv2         lasv2:          2x2 triangular SVD
+            @defgroup lartgs        lartgs:         generate plane rotation for bidiag SVD
+        @}
+
+        @defgroup ggsvd_comp_grp    Generalized SVD computational routines
+        @{
+            @defgroup ggsvp3        ggsvp3:         step in ggsvd3
+            @defgroup tgsja         tgsja:          generalized SVD of trapezoidal matrices, step in ggsvd3
+            @defgroup lags2         lags2:          2x2 orthogonal factor, step in tgsja
+            @defgroup lapll         lapll:          linear dependence of 2 vectors
+        @}
+
+        @defgroup lasq_comp_grp     bidiag QR iteration routines
+        @{
+            @defgroup lasq1         lasq1:          dqds step
+            @defgroup lasq2         lasq2:          dqds step
+            @defgroup lasq3         lasq3:          dqds step
+            @defgroup lasq4         lasq4:          dqds step
+            @defgroup lasq5         lasq5:          dqds step
+            @defgroup lasq6         lasq6:          dqds step
+        @}
+
+        @defgroup lasd_comp_grp     bidiag D&C routines
+        @{
+            @defgroup lasd0         lasd0:          D&C step: top level solver
+            @defgroup lasdt         lasdt:          D&C step: tree
+            @defgroup lasd1         lasd1:          D&C step: merge subproblems
+            @defgroup lasd2         lasd2:          D&C step: deflation
+            @defgroup lasd3         lasd3:          D&C step: secular equation
+            @defgroup lasd4         lasd4:          D&C step: secular equation nonlinear solver
+            @defgroup lasd5         lasd5:          D&C step: secular equation, 2x2
+            @defgroup lasdq         lasdq:          D&C step: leaf using bdsqr
+
+            @defgroup lasd_comp2    --- singular values only or factored form ---
+            @defgroup lasda         lasda:          D&C step: top level solver
+            @defgroup lasd6         lasd6:          D&C step: merge subproblems
+            @defgroup lasd7         lasd7:          D&C step: deflation
+            @defgroup lasd8         lasd8:          D&C step: secular equation
+        @}
+    @}
+
+    @defgroup blas_like_top         BLAS-like
+    @{
+        @defgroup set_grp           Initialize, copy, convert
+        @{
+            @defgroup laset         laset:          set matrix
+            @defgroup larnv         larnv:          random vector
+            @defgroup laruv         laruv:          random uniform vector
+
+            @defgroup lacpy         lacpy:          copy matrix
+
+            @defgroup lacp2         lacp2:          general matrix, convert real to complex
+            @defgroup _lag2_        _lag2_:         general matrix, convert double <=> single
+            @defgroup _lat2_        _lat2_:         triangular matrix, convert double <=> single
+
+            @defgroup tfttp         tfttp:          triangular matrix, RFP    (tf) to packed (tp)
+            @defgroup tfttr         tfttr:          triangular matrix, RFP    (tf) to full   (tr)
+            @defgroup tpttf         tpttf:          triangular matrix, packed (tp) to RFP    (tf)
+            @defgroup tpttr         tpttr:          triangular matrix, packed (tp) to full   (tr)
+            @defgroup trttf         trttf:          triangular matrix, full   (tr) to RFP    (tf)
+            @defgroup trttp         trttp:          triangular matrix, full   (tr) to packed (tp)
+        @}
+
+        @defgroup norm_grp          Matrix norm
+        @{
+            @defgroup lange         lange:          general matrix
+            @defgroup langb         langb:          general matrix, banded
+            @defgroup langt         langt:          general matrix, tridiagonal
+
+            @defgroup lanhs         lanhs:          Hessenberg
+
+            @defgroup lanhe         lan{he,sy}:     Hermitian/symmetric matrix
+            @defgroup lanhf         lan{hf,sf}:     Hermitian/symmetric matrix, RFP
+            @defgroup lanhp         lan{hp,sp}:     Hermitian/symmetric matrix, packed
+            @defgroup lanhb         lan{hb,sb}:     Hermitian/symmetric matrix, banded
+            @defgroup lanht         lan{ht,st}:     Hermitian/symmetric matrix, tridiagonal
+
+            @defgroup lantr         lantr:          triangular matrix
+            @defgroup lantp         lantp:          triangular matrix, packed
+            @defgroup lantb         lantb:          triangular matrix, banded
+        @}
+
+        @defgroup blas0_like_grp    Scalar operations
+        @{
+            @defgroup isnan         isnan:          test for NaN
+            @defgroup laisnan       laisnan:        test for NaN, unoptimized
+            @defgroup ladiv         ladiv:          complex divide
+            @defgroup lapy2         lapy2:          robust sqrt( x^2 + y^2 )
+            @defgroup lapy3         lapy3:          robust sqrt( x^2 + y^2 + z^2 )
+            @defgroup larmm         larmm:          scale factor to avoid overflow, step in latrs
+        @}
+
+        @defgroup blas1_like_grp    Level 1 BLAS-like vector ops
+        @{
+            @defgroup lacgv         lacgv:          conjugate vector
+            @defgroup lasrt         lasrt:          sort vector
+            @defgroup lassq         lassq:          sum-of-squares, avoiding over/underflow
+            @defgroup rscl          rscl:           scale vector by reciprocal
+        @}
+
+        @defgroup blas2_like_grp    Level 2 BLAS-like matrix-vector ops
+        @{
+            @defgroup ilalc         ilalc:          find non-zero col
+            @defgroup ilalr         ilalr:          find non-zero row
+            @defgroup lascl         lascl:          scale matrix
+            @defgroup la_geamv      la_geamv:       matrix-vector multiply |A| * |x|, general
+            @defgroup la_gbamv      la_gbamv:       matrix-vector multiply |A| * |x|, general banded
+            @defgroup la_heamv      la_heamv:       matrix-vector multiply |A| * |x|, Hermitian/symmetric
+            @defgroup lascl2        lascl2:         diagonal scale matrix, A = D A
+            @defgroup larscl2       larscl2:        reciprocal diagonal scale matrix, A = D^{-1} A
+            @defgroup la_wwaddw     la_wwaddw:      add to double-double or single-single vector
+        @}
+
+        @defgroup blas3_like_grp    Level 3 BLAS-like matrix-matrix ops
+        @{
+            @defgroup lagtm         lagtm:          tridiagonal matrix-matrix multiply
+            @defgroup lacrm         lacrm:          complex * real matrix-matrix multiply
+            @defgroup larcm         larcm:          real * complex matrix-matrix multiply
+            @defgroup hfrk          hfrk:           Hermitian rank-k update, RFP format
+            @defgroup tfsm          tfsm:           triangular-matrix solve, RFP format
+        @}
+    @}
+
+    @defgroup aux_top               Auxiliary routines
+    @{
+        @defgroup aux_grp           Other auxiliary routines
+        @{
+            @defgroup lsame         lsame:          string comparison
+            @defgroup lsamen        lsamen:         string comparison
+            @defgroup roundup_lwork roundup_lwork:  fix rounding integer to float
+            @defgroup second        second:         wall clock timer
+        @}
+
+        @defgroup params_grp        Parameters
+        @{
+            @defgroup lamch         lamch:          machine parameters
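+            @defgroup lamc1         lamc1:          step in lamch, determine beta, t, rnd, ieee1
+            @defgroup lamc2         lamc2:          step in lamch, determine eps, emin, rmin, emax, rmax
+            @defgroup lamc3         lamc3:          a + b, forcing operands to be stored (defeats optimizer register use)
+            @defgroup lamc4         lamc4:          step in lamc2, compute emin
+            @defgroup lamc5         lamc5:          step in lamc2, compute rmax without overflow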
+            @defgroup labad         labad:          over/underflow on obsolete pre-IEEE machines
+            @defgroup ilaver        ilaver:         LAPACK version
+            @defgroup ilaenv        ilaenv:         tuning parameters
+            @defgroup ilaenv2stage  ilaenv2stage:   tuning parameters for 2-stage eig
+            @defgroup iparam2stage  iparam2stage:   sets parameters for 2-stage eig
+            @defgroup ieeeck        ieeeck:         verify inf and NaN are safe
+            @defgroup la_constants  la_constants:   Fortran 95 module of constants
+
+            @defgroup blast_aux     --- BLAST constants ---
+            @defgroup iladiag       iladiag:        diag      string to BLAST const
+            @defgroup ilaprec       ilaprec:        precision string to BLAST const
+            @defgroup ilatrans      ilatrans:       trans     string to BLAST const
+            @defgroup ilauplo       ilauplo:        uplo      string to BLAST const
+            @defgroup la_transtype  la_transtype:   BLAST const to string
+        @}
+
+        @defgroup xerbla_grp        Error reporting
+        @{
+            @defgroup xerbla        xerbla:         error reporting
+            @defgroup xerbla_array  xerbla_array:   error reporting, callable from C
+        @}
+    @}
+@}
+
+@defgroup blas_top BLAS
+BLAS are defined by three papers:
+Basic linear algebra subprograms for Fortran usage, Lawson et al., 1979.
+An extended set of Fortran basic linear algebra subprograms, Dongarra et al., 1988.
+A set of level 3 basic linear algebra subprograms, Dongarra et al., 1990.
+Some BLAS-like routines (e.g., csymv, crot, csum1, icmax1) exist in
+LAPACK rather than the classic BLAS.
+The BLAS were later extended by the Extra Precision BLAS (XBLAS, not documented here);
+see https://www.netlib.org/xblas/
+
+@{
+        @defgroup scalar_grp        Scalar operations
+        @{
+            @defgroup abs1          abs1:           | real( x ) | + | imag( x ) |
+        @}
+
+        @defgroup blas1_grp         Level 1 BLAS: vector ops
+        @{
+            @defgroup asum          asum:           sum | real( x_i ) | + | imag( x_i ) |
+            @defgroup sum1          sum1:           sum | x_i | (in LAPACK)
+            @defgroup axpy          axpy:           y = ax + y
+            @defgroup copy          copy:           y = x
+            @defgroup dot           dot:            x^H x and x^T x
+            @defgroup iamax         iamax:          argmax_i | real( x_i ) | + | imag( x_i ) |
+            @defgroup imax1         imax1:          argmax_i | x_i | (in LAPACK)
+            @defgroup nrm2          nrm2:           || x ||_2
+            @defgroup scal          scal:           x = alpha x
+            @defgroup swap          swap:           x <=> y
+
+            @defgroup rot_comp      --- Givens/Jacobi plane rotations ---
+            @defgroup rot           rot:            apply plane rotation ([cz]rot in LAPACK)
+            @defgroup rotg          rotg:           generate plane rotation (cf. lartg)
+            @defgroup rotm          rotm:           apply modified (fast) plane rotation
+            @defgroup rotmg         rotmg:          generate modified (fast) plane rotation
+        @}
+
+        @defgroup blas2_grp         Level 2 BLAS: matrix-vector ops
+        @{
+            @defgroup blas2_full    --- full ---
+            @defgroup gemv          gemv:           general matrix-vector multiply
+            @defgroup ger           ger:            general matrix rank-1 update
+
+            @defgroup hemv          {he,sy}mv:      Hermitian/symmetric matrix-vector multiply ([cz]symv in LAPACK)
+            @defgroup her           {he,sy}r:       Hermitian/symmetric rank-1 update
+            @defgroup her2          {he,sy}r2:      Hermitian/symmetric rank-2 update
+
+            @defgroup trmv          trmv:           triangular matrix-vector multiply
+            @defgroup trsv          trsv:           triangular matrix-vector solve
+
+            @defgroup blas2_packed  --- packed ---
+            @defgroup hpmv          {hp,sp}mv:      Hermitian/symmetric matrix-vector multiply
+            @defgroup hpr           {hp,sp}r:       Hermitian/symmetric rank-1 update
+            @defgroup hpr2          {hp,sp}r2:      Hermitian/symmetric rank-2 update
+
+            @defgroup tpmv          tpmv:           triangular matrix-vector multiply
+            @defgroup tpsv          tpsv:           triangular matrix-vector solve
+
+            @defgroup blas2_banded  --- banded ---
+            @defgroup gbmv          gbmv:           general matrix-vector multiply
+
+            @defgroup hbmv          {hb,sb}mv:      Hermitian/symmetric matrix-vector multiply
+
+            @defgroup tbmv          tbmv:           triangular matrix-vector multiply
+            @defgroup tbsv          tbsv:           triangular matrix-vector solve
+        @}
+
+        @defgroup blas3_grp         Level 3 BLAS: matrix-matrix ops
+        @{
+            @defgroup gemm          gemm:           general matrix-matrix multiply
+
+            @defgroup hemm          {he,sy}mm:      Hermitian/symmetric matrix-matrix multiply
+            @defgroup herk          {he,sy}rk:      Hermitian/symmetric rank-k update
+            @defgroup her2k         {he,sy}r2k:     Hermitian/symmetric rank-2k update
+
+            @defgroup trmm          trmm:           triangular matrix-matrix multiply
+            @defgroup trsm          trsm:           triangular matrix-matrix solve
+        @}
+@}
+
 **/
diff --git a/lapack-netlib/DOCS/lawn81.tex b/lapack-netlib/DOCS/lawn81.tex
index 794c2a7aa..668ca8d68 100644
--- a/lapack-netlib/DOCS/lawn81.tex
+++ b/lapack-netlib/DOCS/lawn81.tex
@@ -575,7 +575,7 @@ There are six machine-dependent functions in the test and timing
 package, at least three of which must be installed.  They are
 
 \begin{tabbing}
-MONOMO  \=  DOUBLE PRECYSION  \=  \kill
+MONOMO  \=  DOUBLE PRECISION  \=  \kill
 LSAME   \>  LOGICAL      \> Test if two characters are the same regardless of case \\
 SLAMCH  \>  REAL  \> Determine machine-dependent parameters \\
 DLAMCH  \>  DOUBLE PRECISION \> Determine machine-dependent parameters \\
diff --git a/lapack-netlib/INSTALL/dlamch.c b/lapack-netlib/INSTALL/dlamch.c
index 744130a87..ce6b76a32 100644
--- a/lapack-netlib/INSTALL/dlamch.c
+++ b/lapack-netlib/INSTALL/dlamch.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,24 +260,7 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
 static double dpow_ui(double x, integer n) {
 	double pow=1.0; unsigned long int u;
 	if(n != 0) {
@@ -291,217 +273,7 @@ static double dpow_ui(double x, integer n) {
 	}
 	return pow;
 }
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
+
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/INSTALL/droundup_lwork.c b/lapack-netlib/INSTALL/droundup_lwork.c
index d05923a46..6d1f39a12 100644
--- a/lapack-netlib/INSTALL/droundup_lwork.c
+++ b/lapack-netlib/INSTALL/droundup_lwork.c
@@ -223,7 +223,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -237,145 +236,5 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
 #endif
diff --git a/lapack-netlib/INSTALL/dsecnd_INT_ETIME.c b/lapack-netlib/INSTALL/dsecnd_INT_ETIME.c
index 36853ef4b..e4230a6e4 100644
--- a/lapack-netlib/INSTALL/dsecnd_INT_ETIME.c
+++ b/lapack-netlib/INSTALL/dsecnd_INT_ETIME.c
@@ -223,7 +223,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -237,149 +236,10 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
 #endif
-/*  -- translated by f2c (version 20000121).
+
+/*
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
diff --git a/lapack-netlib/INSTALL/ilaver.c b/lapack-netlib/INSTALL/ilaver.c
index b274af292..d64c841a2 100644
--- a/lapack-netlib/INSTALL/ilaver.c
+++ b/lapack-netlib/INSTALL/ilaver.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,248 +260,8 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
+/*
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
@@ -556,7 +315,6 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ
 /* > \author Univ. of Colorado Denver */
 /* > \author NAG Ltd. */
 
-/* > \date November 2019 */
 
 /* > \ingroup auxOTHERauxiliary */
 
@@ -573,7 +331,7 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ
 
 /*  ===================================================================== */
     *vers_major__ = 3;
-    *vers_minor__ = 11;
+    *vers_minor__ = 12;
     *vers_patch__ = 0;
 /*  ===================================================================== */
 
diff --git a/lapack-netlib/INSTALL/ilaver.f b/lapack-netlib/INSTALL/ilaver.f
index a246c37cb..1827d5cd2 100644
--- a/lapack-netlib/INSTALL/ilaver.f
+++ b/lapack-netlib/INSTALL/ilaver.f
@@ -44,7 +44,6 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date November 2019
 *
 *> \ingroup auxOTHERauxiliary
 *
@@ -60,7 +59,7 @@
       INTEGER VERS_MAJOR, VERS_MINOR, VERS_PATCH
 *  =====================================================================
       VERS_MAJOR = 3
-      VERS_MINOR = 11
+      VERS_MINOR = 12
       VERS_PATCH = 0
 *  =====================================================================
 *
diff --git a/lapack-netlib/INSTALL/lsametst.c b/lapack-netlib/INSTALL/lsametst.c
index 4b46115fc..631733841 100644
--- a/lapack-netlib/INSTALL/lsametst.c
+++ b/lapack-netlib/INSTALL/lsametst.c
@@ -426,7 +426,7 @@ static integer c__3 = 3;
 /*     December 2016 */
 
 /*  ===================================================================== */
-/* Main program */ main(void)
+/* Main program */ int main(void)
 {
     /* Format strings */
     static char fmt_9999[] = "(\002 *** Error:  LSAME( \002,a1,\002, \002,"
diff --git a/lapack-netlib/INSTALL/second_INT_ETIME.c b/lapack-netlib/INSTALL/second_INT_ETIME.c
index 51b7ddb54..f2ad529e3 100644
--- a/lapack-netlib/INSTALL/second_INT_ETIME.c
+++ b/lapack-netlib/INSTALL/second_INT_ETIME.c
@@ -223,7 +223,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -237,149 +236,10 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
 #endif
-/*  -- translated by f2c (version 20000121).
+
+/*
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
diff --git a/lapack-netlib/INSTALL/secondtst.c b/lapack-netlib/INSTALL/secondtst.c
index 694679bb5..03e7814e9 100644
--- a/lapack-netlib/INSTALL/secondtst.c
+++ b/lapack-netlib/INSTALL/secondtst.c
@@ -422,7 +422,7 @@ static integer c__1000 = 1000;
 
 /* ===================================================================== */
 
-/* Main program */ main(void)
+/* Main program */ int main(void)
 {
     /* Format strings */
     static char fmt_9999[] = "(\002 Time for \002,g10.3,\002 SAXPY ops = "
diff --git a/lapack-netlib/INSTALL/slamch.c b/lapack-netlib/INSTALL/slamch.c
index a000ed664..2def9337a 100644
--- a/lapack-netlib/INSTALL/slamch.c
+++ b/lapack-netlib/INSTALL/slamch.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,11 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
 static float spow_ui(float x, integer n) {
 	float pow=1.0; unsigned long int u;
@@ -279,229 +273,6 @@ static float spow_ui(float x, integer n) {
 	}
 	return pow;
 }
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/INSTALL/sroundup_lwork.c b/lapack-netlib/INSTALL/sroundup_lwork.c
index d05923a46..6d1f39a12 100644
--- a/lapack-netlib/INSTALL/sroundup_lwork.c
+++ b/lapack-netlib/INSTALL/sroundup_lwork.c
@@ -223,7 +223,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -237,145 +236,5 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
 #endif
diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h
index 3e7f9de5b..28f8ad655 100644
--- a/lapack-netlib/LAPACKE/include/lapack.h
+++ b/lapack-netlib/LAPACKE/include/lapack.h
@@ -3323,6 +3323,216 @@ void LAPACK_zgesdd_base(
     #define LAPACK_zgesdd(...) LAPACK_zgesdd_base(__VA_ARGS__)
 #endif
 
+#define LAPACK_cgedmd_base LAPACK_GLOBAL(cgedmd,CGEDMD)
+void LAPACK_cgedmd_base(
+    char const* jobs, char const* jobz, char const* jobr, char const* jobf,
+    lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n,
+    lapack_complex_float* x, lapack_int const* ldx,
+    lapack_complex_float* y, lapack_int const* ldy, lapack_int const* nrnk,
+    const float* tol, lapack_int* k, lapack_complex_float* eigs,
+    lapack_complex_float* z, lapack_int const* ldz, float* res,
+    lapack_complex_float* b, lapack_int const* ldb,
+    lapack_complex_float* w, lapack_int const* ldw,
+    lapack_complex_float* s, lapack_int const* lds,
+    lapack_complex_float* zwork, lapack_int const* lzwork,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info
+#ifdef LAPACK_FORTRAN_STRLEN_END
+    , size_t, size_t, size_t, size_t
+#endif
+);
+#ifdef LAPACK_FORTRAN_STRLEN_END
+    #define LAPACK_cgedmd(...) LAPACK_cgedmd_base(__VA_ARGS__, 1, 1, 1, 1)
+#else
+    #define LAPACK_cgedmd(...) LAPACK_cgedmd_base(__VA_ARGS__)
+#endif
+
+
+#define LAPACK_dgedmd_base LAPACK_GLOBAL(dgedmd,DGEDMD)
+void LAPACK_dgedmd_base(
+    char const* jobs, char const* jobz, char const* jobr, char const* jobf,
+    lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n,
+    double* x, lapack_int const* ldx,
+    double* y, lapack_int const* ldy, lapack_int const* nrnk,
+    const double* tol, lapack_int* k, double* reig, double* imeig,
+    double* z, lapack_int const* ldz, double* res,
+    double* b, lapack_int const* ldb,
+    double* w, lapack_int const* ldw,
+    double* s, lapack_int const* lds,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info
+#ifdef LAPACK_FORTRAN_STRLEN_END
+    , size_t, size_t, size_t, size_t
+#endif
+);
+#ifdef LAPACK_FORTRAN_STRLEN_END
+    #define LAPACK_dgedmd(...) LAPACK_dgedmd_base(__VA_ARGS__, 1, 1, 1, 1)
+#else
+    #define LAPACK_dgedmd(...) LAPACK_dgedmd_base(__VA_ARGS__)
+#endif
+
+#define LAPACK_sgedmd_base LAPACK_GLOBAL(sgedmd,SGEDMD)
+void LAPACK_sgedmd_base(
+    char const* jobs, char const* jobz, char const* jobr, char const* jobf,
+    lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n,
+    float* x, lapack_int const* ldx,
+    float* y, lapack_int const* ldy, lapack_int const* nrnk,
+    const float* tol, lapack_int* k, float* reig, float *imeig,
+    float* z, lapack_int const* ldz, float* res,
+    float* b, lapack_int const* ldb,
+    float* w, lapack_int const* ldw,
+    float* s, lapack_int const* lds,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info
+#ifdef LAPACK_FORTRAN_STRLEN_END
+    , size_t, size_t, size_t, size_t
+#endif
+);
+#ifdef LAPACK_FORTRAN_STRLEN_END
+    #define LAPACK_sgedmd(...) LAPACK_sgedmd_base(__VA_ARGS__, 1, 1, 1, 1)
+#else
+    #define LAPACK_sgedmd(...) LAPACK_sgedmd_base(__VA_ARGS__)
+#endif
+
+#define LAPACK_zgedmd_base LAPACK_GLOBAL(zgedmd,ZGEDMD)
+void LAPACK_zgedmd_base(
+    char const* jobs, char const* jobz, char const* jobr, char const* jobf,
+    lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n,
+    lapack_complex_double* x, lapack_int const* ldx,
+    lapack_complex_double* y, lapack_int const* ldy, lapack_int const* nrnk,
+    const double* tol, lapack_int *k, lapack_complex_double* eigs,
+    lapack_complex_double* z, lapack_int const* ldz, double* res,
+    lapack_complex_double* b, lapack_int const* ldb,
+    lapack_complex_double* w, lapack_int const* ldw,
+    lapack_complex_double* s, lapack_int const* lds,
+    lapack_complex_double* zwork, lapack_int const* lzwork,
+    double* rwork, lapack_int const* lrwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info
+#ifdef LAPACK_FORTRAN_STRLEN_END
+    , size_t, size_t, size_t, size_t
+#endif
+);
+#ifdef LAPACK_FORTRAN_STRLEN_END
+    #define LAPACK_zgedmd(...) LAPACK_zgedmd_base(__VA_ARGS__, 1, 1, 1, 1)
+#else
+    #define LAPACK_zgedmd(...) LAPACK_zgedmd_base(__VA_ARGS__)
+#endif
+
+#define LAPACK_cgedmdq_base LAPACK_GLOBAL(cgedmdq,CGEDMDQ)
+void LAPACK_cgedmdq_base(
+    char const* jobs, char const* jobz, char const* jobr, char const* jobq,
+    char const* jobt, char const* jobf, lapack_int const* whtsvd,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float* f, lapack_int const* ldf,
+    lapack_complex_float* x, lapack_int const* ldx,
+    lapack_complex_float* y, lapack_int const* ldy, lapack_int const* nrnk,
+    float const* tol, lapack_int* k, /* output: writable, as in LAPACK_dgedmdq_base */
+    lapack_complex_float* eigs,
+    lapack_complex_float* z, lapack_int const* ldz, float* res,
+    lapack_complex_float* b, lapack_int const* ldb,
+    lapack_complex_float* v, lapack_int const* ldv,
+    lapack_complex_float* s, lapack_int const* lds,
+    lapack_complex_float* zwork, lapack_int const* lzwork,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info
+#ifdef LAPACK_FORTRAN_STRLEN_END
+    , size_t, size_t, size_t, size_t, size_t, size_t /* one per char const* flag */
+#endif
+);
+#ifdef LAPACK_FORTRAN_STRLEN_END /* wrapper macro appends the six lengths, 1 each */
+    #define LAPACK_cgedmdq(...) LAPACK_cgedmdq_base(__VA_ARGS__, 1, 1, 1, 1, 1, 1)
+#else
+    #define LAPACK_cgedmdq(...) LAPACK_cgedmdq_base(__VA_ARGS__)
+#endif
+
+#define LAPACK_dgedmdq_base LAPACK_GLOBAL(dgedmdq,DGEDMDQ)
+void LAPACK_dgedmdq_base(
+    char const* jobs, char const* jobz, char const* jobr, char const* jobq,
+    char const* jobt, char const* jobf, lapack_int const* whtsvd,
+    lapack_int const* m, lapack_int const* n,
+    double* f, lapack_int const* ldf,
+    double* x, lapack_int const* ldx,
+    double* y, lapack_int const* ldy, lapack_int const* nrnk,
+    double const* tol, lapack_int* k,
+    double* reig, double* imeig,
+    double* z, lapack_int const* ldz, double* res,
+    double* b, lapack_int const* ldb,
+    double* v, lapack_int const* ldv,
+    double* s, lapack_int const* lds,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info
+#ifdef LAPACK_FORTRAN_STRLEN_END
+    , size_t, size_t, size_t, size_t, size_t, size_t /* one per char const* flag */
+#endif
+);
+#ifdef LAPACK_FORTRAN_STRLEN_END /* wrapper macro appends the six lengths, 1 each */
+    #define LAPACK_dgedmdq(...) LAPACK_dgedmdq_base(__VA_ARGS__, 1, 1, 1, 1, 1, 1)
+#else
+    #define LAPACK_dgedmdq(...) LAPACK_dgedmdq_base(__VA_ARGS__)
+#endif
+
+#define LAPACK_sgedmdq_base LAPACK_GLOBAL(sgedmdq,SGEDMDQ)
+void LAPACK_sgedmdq_base(
+    char const* jobs, char const* jobz, char const* jobr, char const* jobq,
+    char const* jobt, char const* jobf, lapack_int const* whtsvd,
+    lapack_int const* m, lapack_int const* n,
+    float* f, lapack_int const* ldf,
+    float* x, lapack_int const* ldx,
+    float* y, lapack_int const* ldy, lapack_int const* nrnk,
+    float const* tol, lapack_int* k, /* output: writable, as in LAPACK_dgedmdq_base */
+    float* reig, float* imeig,
+    float* z, lapack_int const* ldz, float* res,
+    float* b, lapack_int const* ldb,
+    float* v, lapack_int const* ldv,
+    float* s, lapack_int const* lds,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info
+#ifdef LAPACK_FORTRAN_STRLEN_END
+    , size_t, size_t, size_t, size_t, size_t, size_t /* one per char const* flag */
+#endif
+);
+#ifdef LAPACK_FORTRAN_STRLEN_END /* wrapper macro appends the six lengths, 1 each */
+    #define LAPACK_sgedmdq(...) LAPACK_sgedmdq_base(__VA_ARGS__, 1, 1, 1, 1, 1, 1)
+#else
+    #define LAPACK_sgedmdq(...) LAPACK_sgedmdq_base(__VA_ARGS__)
+#endif
+
+#define LAPACK_zgedmdq_base LAPACK_GLOBAL(zgedmdq,ZGEDMDQ)
+void LAPACK_zgedmdq_base(
+    char const* jobs, char const* jobz, char const* jobr, char const* jobq,
+    char const* jobt, char const* jobf, lapack_int const* whtsvd,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double* f, lapack_int const* ldf,
+    lapack_complex_double* x, lapack_int const* ldx,
+    lapack_complex_double* y, lapack_int const* ldy, lapack_int const* nrnk,
+    double const* tol, lapack_int* k, /* output: writable, as in LAPACK_dgedmdq_base */
+    lapack_complex_double* eigs,
+    lapack_complex_double* z, lapack_int const* ldz, double* res,
+    lapack_complex_double* b, lapack_int const* ldb,
+    lapack_complex_double* v, lapack_int const* ldv,
+    lapack_complex_double* s, lapack_int const* lds,
+    lapack_complex_double* zwork, lapack_int const* lzwork,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info
+    /* trailing size_t arguments: one per char const* flag declared above */
+#ifdef LAPACK_FORTRAN_STRLEN_END
+    , size_t, size_t, size_t, size_t, size_t, size_t
+#endif
+);
+#ifdef LAPACK_FORTRAN_STRLEN_END /* wrapper macro appends the six lengths, 1 each */
+    #define LAPACK_zgedmdq(...) LAPACK_zgedmdq_base(__VA_ARGS__, 1, 1, 1, 1, 1, 1)
+#else
+    #define LAPACK_zgedmdq(...) LAPACK_zgedmdq_base(__VA_ARGS__)
+#endif
+
 #define LAPACK_cgesv LAPACK_GLOBAL(cgesv,CGESV)
 lapack_int LAPACK_cgesv(
     lapack_int const* n, lapack_int const* nrhs,
@@ -11517,6 +11727,22 @@ void LAPACK_sorgtsqr_row(
     float* work, lapack_int const* lwork,
     lapack_int* info );
 
+#define LAPACK_dorhr_col LAPACK_GLOBAL(dorhr_col,DORHR_COL)
+void LAPACK_dorhr_col(
+    lapack_int const* m, lapack_int const* n, lapack_int const* nb,
+    double* A, lapack_int const* lda,
+    double* T, lapack_int const* ldt,
+    double* D,
+    lapack_int* info );
+
+#define LAPACK_sorhr_col LAPACK_GLOBAL(sorhr_col,SORHR_COL)
+void LAPACK_sorhr_col(
+    lapack_int const* m, lapack_int const* n, lapack_int const* nb,
+    float* A, lapack_int const* lda,
+    float* T, lapack_int const* ldt,
+    float* D,
+    lapack_int* info );
+
 #define LAPACK_dormbr_base LAPACK_GLOBAL(dormbr,DORMBR)
 void LAPACK_dormbr_base(
     char const* vect, char const* side, char const* trans,
@@ -21501,7 +21727,7 @@ void LAPACK_ztrevc_base(
 #endif
 );
 #ifdef LAPACK_FORTRAN_STRLEN_END
-    #define LAPACK_ztrevc(...) LAPACK_ztrevc_base(__VA_ARGS__, 1, 1)
+    #define LAPACK_ztrevc(...) LAPACK_ztrevc_base(__VA_ARGS__, (size_t)1, 1)
 #else
     #define LAPACK_ztrevc(...) LAPACK_ztrevc_base(__VA_ARGS__)
 #endif
@@ -22708,6 +22934,22 @@ void LAPACK_zungtsqr_row(
     lapack_complex_double* work, lapack_int const* lwork,
     lapack_int* info );
 
+#define LAPACK_cunhr_col LAPACK_GLOBAL(cunhr_col,CUNHR_COL)
+void LAPACK_cunhr_col(
+    lapack_int const* m, lapack_int const* n, lapack_int const* nb,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* T, lapack_int const* ldt,
+    lapack_complex_float* D,
+    lapack_int* info );
+
+#define LAPACK_zunhr_col LAPACK_GLOBAL(zunhr_col,ZUNHR_COL)
+void LAPACK_zunhr_col(
+    lapack_int const* m, lapack_int const* n, lapack_int const* nb,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* T, lapack_int const* ldt,
+    lapack_complex_double* D,
+    lapack_int* info );
+
 #define LAPACK_cunmbr_base LAPACK_GLOBAL(cunmbr,CUNMBR)
 void LAPACK_cunmbr_base(
     char const* vect, char const* side, char const* trans,
diff --git a/lapack-netlib/LAPACKE/include/lapacke.h b/lapack-netlib/LAPACKE/include/lapacke.h
index 9998b1504..377e2a6bb 100644
--- a/lapack-netlib/LAPACKE/include/lapacke.h
+++ b/lapack-netlib/LAPACKE/include/lapacke.h
@@ -956,7 +956,7 @@ lapack_int LAPACKE_zgesvdq( int matrix_layout, char joba, char jobp, char jobr,
                            lapack_int lda, double* s, lapack_complex_double* u,
                            lapack_int ldu, lapack_complex_double* v,
                            lapack_int ldv, lapack_int* numrank );
-                           
+
 lapack_int LAPACKE_sgesvj( int matrix_layout, char joba, char jobu, char jobv,
                            lapack_int m, lapack_int n, float* a, lapack_int lda,
                            float* sva, lapack_int mv, float* v, lapack_int ldv,
@@ -5712,6 +5712,122 @@ lapack_int LAPACKE_zgesdd_work( int matrix_layout, char jobz, lapack_int m,
                                 lapack_complex_double* work, lapack_int lwork,
                                 double* rwork, lapack_int* iwork );
 
+lapack_int LAPACKE_sgedmd_work( int matrix_layout, char jobs, char jobz,
+                                char jobr, char jobf, lapack_int whtsvd,
+                                lapack_int m, lapack_int n,
+                                float* x, lapack_int ldx, float* y, lapack_int ldy,
+                                lapack_int nrnk, float* tol, lapack_int k,
+                                float* reig, float* imeig,
+                                float* z, lapack_int ldz, float* res,
+                                float* b, lapack_int ldb, float* w, lapack_int ldw,
+                                float* s, lapack_int lds,
+                                float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+
+lapack_int LAPACKE_dgedmd_work( int matrix_layout, char jobs, char jobz,
+                                char jobr, char jobf, lapack_int whtsvd,
+                                lapack_int m, lapack_int n,
+                                double* x, lapack_int ldx, double* y, lapack_int ldy,
+                                lapack_int nrnk, double* tol, lapack_int k,
+                                double* reig, double* imeig,
+                                double* z, lapack_int ldz, double* res,
+                                double* b, lapack_int ldb, double* w, lapack_int ldw,
+                                double* s, lapack_int lds,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+
+lapack_int LAPACKE_cgedmd_work( int matrix_layout, char jobs, char jobz,
+                                char jobr, char jobf, lapack_int whtsvd,
+                                lapack_int m, lapack_int n,
+                                lapack_complex_float* x, lapack_int ldx,
+                                lapack_complex_float* y, lapack_int ldy,
+                                lapack_int nrnk, float* tol, lapack_int k,
+                                lapack_complex_float* eigs,
+                                lapack_complex_float* z, lapack_int ldz,
+                                float* res,
+                                lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* w, lapack_int ldw,
+                                lapack_complex_float* s, lapack_int lds,
+                                lapack_complex_float* zwork, lapack_int lzwork,
+                                float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+
+lapack_int LAPACKE_zgedmd_work( int matrix_layout, char jobs, char jobz,
+                                char jobr, char jobf, lapack_int whtsvd,
+                                lapack_int m, lapack_int n,
+                                lapack_complex_double* x, lapack_int ldx,
+                                lapack_complex_double* y, lapack_int ldy,
+                                lapack_int nrnk, double* tol, lapack_int k,
+                                lapack_complex_double* eigs,
+                                lapack_complex_double* z, lapack_int ldz,
+                                double* res,
+                                lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* w, lapack_int ldw,
+                                lapack_complex_double* s, lapack_int lds,
+                                lapack_complex_double* zwork, lapack_int lzwork,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork );
+
+lapack_int LAPACKE_sgedmdq_work( int matrix_layout, char jobs, char jobz,
+                                 char jobr, char jobq, char jobt, char jobf,
+                                 lapack_int whtsvd, lapack_int m, lapack_int n,
+                                 float* f, lapack_int ldf, float* x, lapack_int ldx,
+                                 float* y, lapack_int ldy,
+                                 lapack_int nrnk, float* tol, lapack_int k,
+                                 float* reig, float* imeig,
+                                 float* z, lapack_int ldz, float* res,
+                                 float* b, lapack_int ldb, float* v, lapack_int ldv,
+                                 float* s, lapack_int lds,
+                                 float* work, lapack_int lwork,
+                                 lapack_int* iwork, lapack_int liwork );
+
+lapack_int LAPACKE_dgedmdq_work( int matrix_layout, char jobs, char jobz,
+                                 char jobr, char jobq, char jobt, char jobf,
+                                 lapack_int whtsvd, lapack_int m, lapack_int n,
+                                 double* f, lapack_int ldf, double* x, lapack_int ldx,
+                                 double* y, lapack_int ldy,
+                                 lapack_int nrnk, double* tol, lapack_int k,
+                                 double* reig, double* imeig,
+                                 double* z, lapack_int ldz, double* res,
+                                 double* b, lapack_int ldb, double* v, lapack_int ldv,
+                                 double* s, lapack_int lds,
+                                 double* work, lapack_int lwork,
+                                 lapack_int* iwork, lapack_int liwork );
+
+lapack_int LAPACKE_cgedmdq_work( int matrix_layout, char jobs, char jobz,
+                                 char jobr, char jobq, char jobt, char jobf,
+                                 lapack_int whtsvd, lapack_int m, lapack_int n,
+                                 lapack_complex_float* f, lapack_int ldf,
+                                 lapack_complex_float* x, lapack_int ldx,
+                                 lapack_complex_float* y, lapack_int ldy,
+                                 lapack_int nrnk, float* tol, lapack_int k,
+                                 lapack_complex_float* eigs,
+                                 lapack_complex_float* z, lapack_int ldz,
+                                 float* res,
+                                 lapack_complex_float* b, lapack_int ldb,
+                                 lapack_complex_float* v, lapack_int ldv,
+                                 lapack_complex_float* s, lapack_int lds,
+                                 lapack_complex_float* zwork, lapack_int lzwork,
+                                 float* work, lapack_int lwork,
+                                 lapack_int* iwork, lapack_int liwork );
+
+lapack_int LAPACKE_zgedmdq_work( int matrix_layout, char jobs, char jobz,
+                                 char jobr, char jobq, char jobt, char jobf,
+                                 lapack_int whtsvd, lapack_int m, lapack_int n,
+                                 lapack_complex_double* f, lapack_int ldf,
+                                 lapack_complex_double* x, lapack_int ldx,
+                                 lapack_complex_double* y, lapack_int ldy,
+                                 lapack_int nrnk, double* tol, lapack_int k,
+                                 lapack_complex_double* eigs,
+                                 lapack_complex_double* z, lapack_int ldz,
+                                 double* res,
+                                 lapack_complex_double* b, lapack_int ldb,
+                                 lapack_complex_double* v, lapack_int ldv,
+                                 lapack_complex_double* s, lapack_int lds,
+                                 lapack_complex_double* zwork, lapack_int lzwork,
+                                 double* work, lapack_int lwork,
+                                 lapack_int* iwork, lapack_int liwork );
+
 lapack_int LAPACKE_sgesv_work( int matrix_layout, lapack_int n, lapack_int nrhs,
                                float* a, lapack_int lda, lapack_int* ipiv,
                                float* b, lapack_int ldb );
@@ -5833,7 +5949,7 @@ lapack_int LAPACKE_zgesvdq_work( int matrix_layout, char joba, char jobp,
                                 lapack_int* iwork, lapack_int liwork,
                                 lapack_complex_double* cwork, lapack_int lcwork,
                                 double* rwork, lapack_int lrwork);
-                                
+
 lapack_int LAPACKE_sgesvj_work( int matrix_layout, char joba, char jobu,
                                 char jobv, lapack_int m, lapack_int n, float* a,
                                 lapack_int lda, float* sva, lapack_int mv,
@@ -12550,7 +12666,7 @@ lapack_int LAPACKE_zhegv_2stage_work( int matrix_layout, lapack_int itype, char
 //LAPACK 3.8.0
 lapack_int LAPACKE_ssysv_aa_2stage( int matrix_layout, char uplo, lapack_int n,
                           lapack_int nrhs, float* a, lapack_int lda,
-                          float* tb, lapack_int ltb, lapack_int* ipiv, 
+                          float* tb, lapack_int ltb, lapack_int* ipiv,
                           lapack_int* ipiv2, float* b, lapack_int ldb );
 lapack_int LAPACKE_ssysv_aa_2stage_work( int matrix_layout, char uplo, lapack_int n,
                                lapack_int nrhs, float* a, lapack_int lda,
@@ -12560,7 +12676,7 @@ lapack_int LAPACKE_ssysv_aa_2stage_work( int matrix_layout, char uplo, lapack_in
 lapack_int LAPACKE_dsysv_aa_2stage( int matrix_layout, char uplo, lapack_int n,
                           lapack_int nrhs, double* a, lapack_int lda,
                           double* tb, lapack_int ltb,
-                          lapack_int* ipiv, lapack_int* ipiv2, 
+                          lapack_int* ipiv, lapack_int* ipiv2,
                           double* b, lapack_int ldb );
 lapack_int LAPACKE_dsysv_aa_2stage_work( int matrix_layout, char uplo, lapack_int n,
                                lapack_int nrhs, double* a, lapack_int lda,
@@ -12612,10 +12728,10 @@ lapack_int LAPACKE_zhesv_aa_2stage_work( int matrix_layout, char uplo, lapack_in
                                lapack_int ltb, lapack_int* ipiv, lapack_int* ipiv2,
                                lapack_complex_double* b, lapack_int ldb,
                                lapack_complex_double* work, lapack_int lwork );
-                               
+
 lapack_int LAPACKE_ssytrf_aa_2stage( int matrix_layout, char uplo, lapack_int n,
                           float* a, lapack_int lda,
-                          float* tb, lapack_int ltb, lapack_int* ipiv, 
+                          float* tb, lapack_int ltb, lapack_int* ipiv,
                           lapack_int* ipiv2 );
 lapack_int LAPACKE_ssytrf_aa_2stage_work( int matrix_layout, char uplo, lapack_int n,
                                float* a, lapack_int lda,
@@ -12671,7 +12787,7 @@ lapack_int LAPACKE_zhetrf_aa_2stage_work( int matrix_layout, char uplo, lapack_i
 
 lapack_int LAPACKE_ssytrs_aa_2stage( int matrix_layout, char uplo, lapack_int n,
                           lapack_int nrhs, float* a, lapack_int lda,
-                          float* tb, lapack_int ltb, lapack_int* ipiv, 
+                          float* tb, lapack_int ltb, lapack_int* ipiv,
                           lapack_int* ipiv2, float* b, lapack_int ldb );
 lapack_int LAPACKE_ssytrs_aa_2stage_work( int matrix_layout, char uplo, lapack_int n,
                                lapack_int nrhs, float* a, lapack_int lda,
@@ -12680,7 +12796,7 @@ lapack_int LAPACKE_ssytrs_aa_2stage_work( int matrix_layout, char uplo, lapack_i
 lapack_int LAPACKE_dsytrs_aa_2stage( int matrix_layout, char uplo, lapack_int n,
                           lapack_int nrhs, double* a, lapack_int lda,
                           double* tb, lapack_int ltb,
-                          lapack_int* ipiv, lapack_int* ipiv2, 
+                          lapack_int* ipiv, lapack_int* ipiv2,
                           double* b, lapack_int ldb );
 lapack_int LAPACKE_dsytrs_aa_2stage_work( int matrix_layout, char uplo, lapack_int n,
                                lapack_int nrhs, double* a, lapack_int lda,
@@ -12727,7 +12843,40 @@ lapack_int LAPACKE_zhetrs_aa_2stage_work( int matrix_layout, char uplo, lapack_i
                                lapack_int lda, lapack_complex_double* tb,
                                lapack_int ltb, lapack_int* ipiv, lapack_int* ipiv2,
                                lapack_complex_double* b, lapack_int ldb );
-                   
+//LAPACK 3.10.0 - ?orhr_col / ?unhr_col: one high-level and one _work prototype per precision
+lapack_int LAPACKE_sorhr_col( int matrix_layout, lapack_int m, lapack_int n,
+                              lapack_int nb, float* a,
+                              lapack_int lda, float* t,
+                              lapack_int ldt, float* d );
+lapack_int LAPACKE_sorhr_col_work( int matrix_layout, lapack_int m, lapack_int n,
+                                   lapack_int nb, float* a,
+                                   lapack_int lda, float* t,
+                                   lapack_int ldt, float* d );
+lapack_int LAPACKE_dorhr_col( int matrix_layout, lapack_int m, lapack_int n,
+                              lapack_int nb, double* a,
+                              lapack_int lda, double* t,
+                              lapack_int ldt, double* d );
+lapack_int LAPACKE_dorhr_col_work( int matrix_layout, lapack_int m, lapack_int n,
+                                   lapack_int nb, double* a,
+                                   lapack_int lda, double* t,
+                                   lapack_int ldt, double* d );
+lapack_int LAPACKE_cunhr_col( int matrix_layout, lapack_int m, lapack_int n,
+                              lapack_int nb, lapack_complex_float* a,
+                              lapack_int lda, lapack_complex_float* t,
+                              lapack_int ldt, lapack_complex_float* d );
+lapack_int LAPACKE_cunhr_col_work( int matrix_layout, lapack_int m, lapack_int n,
+                                   lapack_int nb, lapack_complex_float* a,
+                                   lapack_int lda, lapack_complex_float* t,
+                                   lapack_int ldt, lapack_complex_float* d );
+lapack_int LAPACKE_zunhr_col( int matrix_layout, lapack_int m, lapack_int n,
+                              lapack_int nb, lapack_complex_double* a,
+                              lapack_int lda, lapack_complex_double* t,
+                              lapack_int ldt, lapack_complex_double* d );
+lapack_int LAPACKE_zunhr_col_work( int matrix_layout, lapack_int m, lapack_int n,
+                                   lapack_int nb, lapack_complex_double* a,
+                                   lapack_int lda, lapack_complex_double* t,
+                                   lapack_int ldt, lapack_complex_double* d );
+
 /* APIs for set/get nancheck flags */
 void LAPACKE_set_nancheck( int flag );
 int LAPACKE_get_nancheck( void );
diff --git a/lapack-netlib/LAPACKE/include/lapacke_config.h b/lapack-netlib/LAPACKE/include/lapacke_config.h
index c64fc4416..798a5eb2e 100644
--- a/lapack-netlib/LAPACKE/include/lapacke_config.h
+++ b/lapack-netlib/LAPACKE/include/lapacke_config.h
@@ -68,6 +68,17 @@ extern "C" {
 #endif
 
 #ifndef LAPACK_COMPLEX_CUSTOM
+#if defined(_MSC_VER) /* MSVC: use the CRT struct types _Fcomplex/_Dcomplex */
+    #define _CRT_USE_C_COMPLEX_H
+    #include <complex.h>
+    #define LAPACK_COMPLEX_CUSTOM
+    #define lapack_complex_float _Fcomplex
+    #define lapack_complex_double _Dcomplex
+    #define lapack_complex_float_real(z)       (crealf(z)) /* _Fcomplex needs the f variants: */
+    #define lapack_complex_float_imag(z)       (cimagf(z)) /* it does not convert to _Dcomplex */
+    #define lapack_complex_double_real(z)       (creal(z))
+    #define lapack_complex_double_imag(z)       (cimag(z))
+#else
 
 #if defined(LAPACK_COMPLEX_STRUCTURE)
 
@@ -109,6 +120,7 @@ typedef struct { double real, imag; } _lapack_complex_double;
 #define lapack_complex_double_real(z)       (creal(z))
 #define lapack_complex_double_imag(z)       (cimag(z))
 
+#endif
 #endif
 
 lapack_complex_float lapack_make_complex_float( float re, float im );
diff --git a/lapack-netlib/LAPACKE/src/CMakeLists.txt b/lapack-netlib/LAPACKE/src/CMakeLists.txt
index 4171a3bd4..89890f923 100644
--- a/lapack-netlib/LAPACKE/src/CMakeLists.txt
+++ b/lapack-netlib/LAPACKE/src/CMakeLists.txt
@@ -589,6 +589,8 @@ lapacke_cungtr.c
 lapacke_cungtr_work.c
 lapacke_cungtsqr_row.c
 lapacke_cungtsqr_row_work.c
+lapacke_cunhr_col.c
+lapacke_cunhr_col_work.c
 lapacke_cunmbr.c
 lapacke_cunmbr_work.c
 lapacke_cunmhr.c
@@ -857,6 +859,8 @@ lapacke_dorgtr.c
 lapacke_dorgtr_work.c
 lapacke_dorgtsqr_row.c
 lapacke_dorgtsqr_row_work.c
+lapacke_dorhr_col.c
+lapacke_dorhr_col_work.c
 lapacke_dormbr.c
 lapacke_dormbr_work.c
 lapacke_dormhr.c
@@ -1432,6 +1436,8 @@ lapacke_sorgtr.c
 lapacke_sorgtr_work.c
 lapacke_sorgtsqr_row.c
 lapacke_sorgtsqr_row_work.c
+lapacke_sorhr_col.c
+lapacke_sorhr_col_work.c
 lapacke_sormbr.c
 lapacke_sormbr_work.c
 lapacke_sormhr.c
@@ -2346,6 +2352,8 @@ lapacke_zungtr.c
 lapacke_zungtr_work.c
 lapacke_zungtsqr_row.c
 lapacke_zungtsqr_row_work.c
+lapacke_zunhr_col.c
+lapacke_zunhr_col_work.c
 lapacke_zunmbr.c
 lapacke_zunmbr_work.c
 lapacke_zunmhr.c
diff --git a/lapack-netlib/LAPACKE/src/Makefile b/lapack-netlib/LAPACKE/src/Makefile
index 9c02c1445..969288f42 100644
--- a/lapack-netlib/LAPACKE/src/Makefile
+++ b/lapack-netlib/LAPACKE/src/Makefile
@@ -28,7 +28,6 @@
 ##############################################################################
 # Contents: Native C interface to LAPACK
 # Author: Intel Corporation
-# September, 2010
 ##############################################################################
 # makefile for LAPACKE, used to build lapacke binary.
 #
@@ -138,6 +137,10 @@ lapacke_cgerqf.o \
 lapacke_cgerqf_work.o \
 lapacke_cgesdd.o \
 lapacke_cgesdd_work.o \
+lapacke_cgedmd.o \
+lapacke_cgedmd_work.o \
+lapacke_cgedmdq.o \
+lapacke_cgedmdq_work.o \
 lapacke_cgesv.o \
 lapacke_cgesv_work.o \
 lapacke_cgesvd.o \
@@ -608,6 +611,8 @@ lapacke_ctrsna.o \
 lapacke_ctrsna_work.o \
 lapacke_ctrsyl.o \
 lapacke_ctrsyl_work.o \
+lapacke_ctrsyl3.o \
+lapacke_ctrsyl3_work.o \
 lapacke_ctrtri.o \
 lapacke_ctrtri_work.o \
 lapacke_ctrtrs.o \
@@ -640,6 +645,8 @@ lapacke_cungtr.o \
 lapacke_cungtr_work.o \
 lapacke_cungtsqr_row.o \
 lapacke_cungtsqr_row_work.o \
+lapacke_cunhr_col.o \
+lapacke_cunhr_col_work.o \
 lapacke_cunmbr.o \
 lapacke_cunmbr_work.o \
 lapacke_cunmhr.o \
@@ -760,6 +767,10 @@ lapacke_dgerqf.o \
 lapacke_dgerqf_work.o \
 lapacke_dgesdd.o \
 lapacke_dgesdd_work.o \
+lapacke_dgedmd.o \
+lapacke_dgedmd_work.o \
+lapacke_dgedmdq.o \
+lapacke_dgedmdq_work.o \
 lapacke_dgesv.o \
 lapacke_dgesv_work.o \
 lapacke_dgesvd.o \
@@ -912,6 +923,8 @@ lapacke_dorgtr.o \
 lapacke_dorgtr_work.o \
 lapacke_dorgtsqr_row.o \
 lapacke_dorgtsqr_row_work.o \
+lapacke_dorhr_col.o \
+lapacke_dorhr_col_work.o \
 lapacke_dormbr.o \
 lapacke_dormbr_work.o \
 lapacke_dormhr.o \
@@ -1224,6 +1237,8 @@ lapacke_dtrsna.o \
 lapacke_dtrsna_work.o \
 lapacke_dtrsyl.o \
 lapacke_dtrsyl_work.o \
+lapacke_dtrsyl3.o \
+lapacke_dtrsyl3_work.o \
 lapacke_dtrtri.o \
 lapacke_dtrtri_work.o \
 lapacke_dtrtrs.o \
@@ -1336,6 +1351,10 @@ lapacke_sgerqf.o \
 lapacke_sgerqf_work.o \
 lapacke_sgesdd.o \
 lapacke_sgesdd_work.o \
+lapacke_sgedmd.o \
+lapacke_sgedmd_work.o \
+lapacke_sgedmdq.o \
+lapacke_sgedmdq_work.o \
 lapacke_sgesv.o \
 lapacke_sgesv_work.o \
 lapacke_sgesvd.o \
@@ -1486,6 +1505,8 @@ lapacke_sorgtr.o \
 lapacke_sorgtr_work.o \
 lapacke_sorgtsqr_row.o \
 lapacke_sorgtsqr_row_work.o \
+lapacke_sorhr_col.o \
+lapacke_sorhr_col_work.o \
 lapacke_sormbr.o \
 lapacke_sormbr_work.o \
 lapacke_sormhr.o \
@@ -1794,6 +1815,8 @@ lapacke_strsna.o \
 lapacke_strsna_work.o \
 lapacke_strsyl.o \
 lapacke_strsyl_work.o \
+lapacke_strsyl3.o \
+lapacke_strsyl3_work.o \
 lapacke_strtri.o \
 lapacke_strtri_work.o \
 lapacke_strtrs.o \
@@ -1902,6 +1925,10 @@ lapacke_zgerqf.o \
 lapacke_zgerqf_work.o \
 lapacke_zgesdd.o \
 lapacke_zgesdd_work.o \
+lapacke_zgedmd.o \
+lapacke_zgedmd_work.o \
+lapacke_zgedmdq.o \
+lapacke_zgedmdq_work.o \
 lapacke_zgesv.o \
 lapacke_zgesv_work.o \
 lapacke_zgesvd.o \
@@ -2372,6 +2399,8 @@ lapacke_ztrsna.o \
 lapacke_ztrsna_work.o \
 lapacke_ztrsyl.o \
 lapacke_ztrsyl_work.o \
+lapacke_ztrsyl3.o \
+lapacke_ztrsyl3_work.o \
 lapacke_ztrtri.o \
 lapacke_ztrtri_work.o \
 lapacke_ztrtrs.o \
@@ -2404,6 +2433,8 @@ lapacke_zungtr.o \
 lapacke_zungtr_work.o \
 lapacke_zungtsqr_row.o \
 lapacke_zungtsqr_row_work.o \
+lapacke_zunhr_col.o \
+lapacke_zunhr_col_work.o \
 lapacke_zunmbr.o \
 lapacke_zunmbr_work.o \
 lapacke_zunmhr.o \
@@ -2544,7 +2575,7 @@ $(LAPACKELIB): $(OBJ) $(OBJ_S) $(OBJ_C) $(OBJ_D) $(OBJ_Z) $(DEPRECATED) $(EXTEND
 ifdef BUILD_DEPRECATED
 	$(AR) $(ARFLAGS) $@ $(DEPRECATED)
 endif
-ifdef (USEXBLAS)
+ifdef USEXBLAS
 	$(AR) $(ARFLAGS) $@ $(EXTENDED)
 endif
 ifdef LAPACKE_WITH_TMG
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgedmd.c b/lapack-netlib/LAPACKE/src/lapacke_cgedmd.c
new file mode 100644
index 000000000..6c77e199e
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgedmd.c
@@ -0,0 +1,127 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native high-level C interface to LAPACK function cgedmd
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_cgedmd( int matrix_layout, char jobs, char jobz, char jobr,
+                           char jobf, lapack_int whtsvd, lapack_int m,
+                           lapack_int n, lapack_complex_float* x,
+                           lapack_int ldx, lapack_complex_float* y,
+                           lapack_int ldy, lapack_int nrnk, float* tol,
+                           lapack_int k, lapack_complex_float* eigs,
+                           lapack_complex_float* z, lapack_int ldz,
+                           float* res, lapack_complex_float* b,
+                           lapack_int ldb, lapack_complex_float* w,
+                           lapack_int ldw, lapack_complex_float* s, lapack_int lds)
+{
+    /* High-level driver: checks the layout, optionally NaN-checks the input
+     * matrices, obtains the work-array sizes through a workspace query (all
+     * three lengths passed as -1), allocates zwork/work/iwork, runs
+     * LAPACKE_cgedmd_work and releases the buffers. Returns the info value
+     * of the work routine, a negative code for a rejected argument, or
+     * LAPACK_WORK_MEMORY_ERROR when an allocation fails.
+     * NOTE(review): k arrives by value, so a value written to k by the
+     * underlying routine cannot reach the caller - confirm intended API. */
+    lapack_int info = 0;
+    lapack_int lwork = -1;
+    lapack_int liwork = -1;
+    lapack_int lzwork = -1;
+    lapack_complex_float* zwork = NULL;
+    float* work = NULL;
+    lapack_int* iwork = NULL;
+    lapack_complex_float zwork_query;
+    float work_query;
+    lapack_int iwork_query;
+    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
+        LAPACKE_xerbla( "LAPACKE_cgedmd", -1 );
+        return -1;
+    }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+    if( LAPACKE_get_nancheck() ) {
+        /* Only x and y carry input data. z, b, w and s are output/workspace
+         * arguments of xGEDMD and may be uninitialised on entry, so scanning
+         * them for NaNs could reject a valid call; they are not inspected.
+         * NOTE(review): -8/-10 do not count matrix_layout as argument 1. */
+        if( LAPACKE_cge_nancheck( matrix_layout, m, n, x, ldx ) ) {
+            return -8;
+        }
+        if( LAPACKE_cge_nancheck( matrix_layout, m, n, y, ldy ) ) {
+            return -10;
+        }
+    }
+#endif
+    /* Query optimal working array(s) size */
+    info = LAPACKE_cgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd,
+                                m, n, x, ldx, y, ldy, nrnk, tol, k, eigs, z, ldz,
+                                res, b, ldb, w, ldw, s, lds, &zwork_query,
+                                lzwork, &work_query, lwork, &iwork_query, liwork );
+
+    if( info != 0 ) {
+        goto exit_level_0;
+    }
+    /* Convert the values returned by the query into array lengths */
+    lzwork  = LAPACK_C2INT( zwork_query );
+    lwork  = LAPACK_C2INT( work_query );
+    liwork = iwork_query;
+    /* Allocate memory for work arrays */
+    zwork  = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lzwork );
+    if( zwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    work  = (float*)LAPACKE_malloc( sizeof(float) * lwork );
+    if( work == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_1;
+    }
+    iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
+    if( iwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_2;
+    }
+    /* Call middle-level interface */
+    info = LAPACKE_cgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd,
+                                m, n, x, ldx, y, ldy, nrnk, tol, k, eigs, z, ldz,
+                                res, b, ldb, w, ldw, s, lds, zwork, lzwork,
+                                work, lwork, iwork, liwork );
+    /* Release memory and exit */
+    LAPACKE_free( iwork );
+exit_level_2:
+    LAPACKE_free( work );
+exit_level_1:
+    LAPACKE_free( zwork );
+exit_level_0:
+    if( info == LAPACK_WORK_MEMORY_ERROR ) {
+        LAPACKE_xerbla( "LAPACKE_cgedmd", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgedmd_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgedmd_work.c
new file mode 100644
index 000000000..08d8b91f5
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgedmd_work.c
@@ -0,0 +1,184 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native middle-level C interface to LAPACK function cgedmd
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+/* Middle-level C interface to LAPACK cgedmd; zwork, work and iwork come from
+ * the caller. Column-major arrays are passed straight to LAPACK; row-major
+ * arrays are staged through column-major temporaries. Returns LAPACK's info
+ * (negative values decremented by one) or a LAPACKE error code. */
+lapack_int LAPACKE_cgedmd_work( int matrix_layout, char jobs, char jobz,
+                                char jobr, char jobf, lapack_int whtsvd, lapack_int m,
+                                lapack_int n, lapack_complex_float* x, lapack_int ldx,
+                                lapack_complex_float* y, lapack_int ldy, lapack_int nrnk,
+                                float* tol, lapack_int k, lapack_complex_float* eigs,
+                                lapack_complex_float* z, lapack_int ldz,
+                                float* res, lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* w,
+                                lapack_int ldw, lapack_complex_float* s, lapack_int lds,
+                                lapack_complex_float* zwork, lapack_int lzwork,
+                                float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork )
+{
+    lapack_int info = 0;
+    if( matrix_layout == LAPACK_COL_MAJOR ) {
+        /* Call LAPACK directly; a negative info is shifted by one for matrix_layout */
+        LAPACK_cgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, &ldx, y,
+                       &ldy, &nrnk, tol, &k, eigs, z, &ldz, res, b, &ldb, w, &ldw,
+                       s, &lds, zwork, &lzwork, work, &lwork, iwork, &liwork,
+                       &info );
+        info = (info < 0) ? (info - 1) : info;
+    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
+        lapack_int ldx_t = MAX(1,m);
+        lapack_int ldy_t = MAX(1,m);
+        lapack_int ldz_t = MAX(1,m);
+        lapack_int ldb_t = MAX(1,m);
+        lapack_int ldw_t = MAX(1,m);
+        lapack_int lds_t = MAX(1,m);
+        lapack_complex_float* x_t = NULL;
+        lapack_complex_float* y_t = NULL;
+        lapack_complex_float* z_t = NULL;
+        lapack_complex_float* b_t = NULL;
+        lapack_complex_float* w_t = NULL;
+        lapack_complex_float* s_t = NULL;
+        /* Check leading dimension(s). NOTE(review): codes lag argument positions (ldx is 10th) - confirm */
+        if( ldx < n ) {
+            info = -9;
+            LAPACKE_xerbla( "LAPACKE_cgedmd_work", info );
+            return info;
+        }
+        if( ldy < n ) {
+            info = -11;
+            LAPACKE_xerbla( "LAPACKE_cgedmd_work", info );
+            return info;
+        }
+        if( ldz < n ) {
+            info = -16;
+            LAPACKE_xerbla( "LAPACKE_cgedmd_work", info );
+            return info;
+        }
+        if( ldb < n ) {
+            info = -19;
+            LAPACKE_xerbla( "LAPACKE_cgedmd_work", info );
+            return info;
+        }
+        if( ldw < n ) {
+            info = -21;
+            LAPACKE_xerbla( "LAPACKE_cgedmd_work", info );
+            return info;
+        }
+        if( lds < n ) {
+            info = -23;
+            LAPACKE_xerbla( "LAPACKE_cgedmd_work", info );
+            return info;
+        }
+        /* Workspace query: pass the column-major leading dimensions used by the real call */
+        if( lzwork == -1 || lwork == -1 || liwork == -1 ) {
+            LAPACK_cgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x,
+                           &ldx_t, y, &ldy_t, &nrnk, tol, &k, eigs, z, &ldz_t,
+                           res, b, &ldb_t, w, &ldw_t, s, &lds_t, zwork, &lzwork,
+                           work, &lwork, iwork, &liwork, &info );
+            return (info < 0) ? (info - 1) : info;
+        }
+        /* Allocate memory for temporary array(s) */
+        x_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldx_t * MAX(1,n) );
+        if( x_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        y_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldy_t * MAX(1,n) );
+        if( y_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        z_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldz_t * MAX(1,n) );
+        if( z_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_2;
+        }
+        b_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldb_t * MAX(1,n) );
+        if( b_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_3;
+        }
+        w_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldw_t * MAX(1,n) );
+        if( w_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_4;
+        }
+        s_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lds_t * MAX(1,n) );
+        if( s_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_5;
+        }
+        /* Transpose input matrices into the column-major temporaries */
+        LAPACKE_cge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t );
+        LAPACKE_cge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t );
+        LAPACKE_cge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t );
+        LAPACKE_cge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t );
+        LAPACKE_cge_trans( matrix_layout, m, n, w, ldw, w_t, ldw_t );
+        LAPACKE_cge_trans( matrix_layout, m, n, s, lds, s_t, lds_t );
+        /* Call LAPACK on the temporaries; a negative info is shifted by one */
+        LAPACK_cgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x_t,
+                       &ldx_t, y_t, &ldy_t, &nrnk, tol, &k, eigs, z_t, &ldz_t,
+                       res, b_t, &ldb_t, w_t, &ldw_t, s_t, &lds_t, zwork,
+                       &lzwork, work, &lwork, iwork, &liwork, &info );
+        info = (info < 0) ? (info - 1) : info;
+        /* Transpose output matrices back into the caller's row-major arrays */
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx );
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy );
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz );
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb );
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, w_t, ldw_t, w, ldw );
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds );
+        /* Release memory and exit; each label frees what was allocated before it */
+        LAPACKE_free( s_t );
+exit_level_5:
+        LAPACKE_free( w_t );
+exit_level_4:
+        LAPACKE_free( b_t );
+exit_level_3:
+        LAPACKE_free( z_t );
+exit_level_2:
+        LAPACKE_free( y_t );
+exit_level_1:
+        LAPACKE_free( x_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_cgedmd_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_cgedmd_work", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgedmdq.c b/lapack-netlib/LAPACKE/src/lapacke_cgedmdq.c
new file mode 100644
index 000000000..b0b258f97
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgedmdq.c
@@ -0,0 +1,133 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native high-level C interface to LAPACK function cgedmdq
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+/* High-level C interface to LAPACK cgedmdq: optionally screens the matrix
+ * arguments for NaNs, queries the workspace sizes, allocates zwork/work/iwork,
+ * calls LAPACKE_cgedmdq_work and frees the workspace before returning info. */
+lapack_int LAPACKE_cgedmdq( int matrix_layout, char jobs, char jobz, char jobr,
+                            char jobq, char jobt, char jobf, lapack_int whtsvd,
+                            lapack_int m, lapack_int n, lapack_complex_float* f,
+                            lapack_int ldf, lapack_complex_float* x,
+                            lapack_int ldx, lapack_complex_float* y,
+                            lapack_int ldy, lapack_int nrnk, float* tol,
+                            lapack_int k, lapack_complex_float* eigs,
+                            lapack_complex_float* z, lapack_int ldz,
+                            float* res, lapack_complex_float* b,
+                            lapack_int ldb, lapack_complex_float* v,
+                            lapack_int ldv, lapack_complex_float* s, lapack_int lds)
+{
+    lapack_int info = 0;
+    lapack_int lzwork = -1, lwork = -1, liwork = -1; /* -1 requests a size query */
+    lapack_complex_float* zwork = NULL;
+    float* work = NULL;
+    lapack_int* iwork = NULL;
+    lapack_complex_float zwork_query;
+    float work_query;
+    lapack_int iwork_query;
+    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
+        LAPACKE_xerbla( "LAPACKE_cgedmdq", -1 );
+        return -1;
+    }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+    if( LAPACKE_get_nancheck() ) {
+        /* Optionally check matrices for NaNs. NOTE(review): z, b, v, s codes look off by one - confirm */
+        if( LAPACKE_cge_nancheck( matrix_layout, m, n, f, ldf ) ) {
+            return -11;
+        }
+        if( LAPACKE_cge_nancheck( matrix_layout, m, n, x, ldx ) ) {
+            return -13;
+        }
+        if( LAPACKE_cge_nancheck( matrix_layout, m, n, y, ldy ) ) {
+            return -15;
+        }
+        if( LAPACKE_cge_nancheck( matrix_layout, m, n, z, ldz ) ) {
+            return -22;
+        }
+        if( LAPACKE_cge_nancheck( matrix_layout, m, n, b, ldb ) ) {
+            return -25;
+        }
+        if( LAPACKE_cge_nancheck( matrix_layout, m, n, v, ldv ) ) {
+            return -27;
+        }
+        if( LAPACKE_cge_nancheck( matrix_layout, m, n, s, lds ) ) {
+            return -29;
+        }
+    }
+#endif
+    /* Query optimal working array(s) size; the sizes come back in the *_query scalars */
+    info = LAPACKE_cgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt,
+                                 jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy,
+                                 nrnk, tol, k, eigs, z, ldz, res,
+                                 b, ldb, v, ldv, s, lds, &zwork_query, lzwork,
+                                 &work_query, lwork, &iwork_query, liwork );
+    if( info != 0 ) {
+        goto exit_level_0;
+    }
+    lzwork  = LAPACK_C2INT( zwork_query );
+    lwork  = LAPACK_C2INT( work_query );
+    liwork = iwork_query;
+    /* Allocate memory for work arrays; work holds lwork floats, not complex values */
+    zwork  = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lzwork );
+    if( zwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    work  = (float*)LAPACKE_malloc( sizeof(float) * lwork );
+    if( work == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_1;
+    }
+    iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
+    if( iwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_2;
+    }
+    /* Call middle-level interface with the allocated workspace */
+    info = LAPACKE_cgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt,
+                                 jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy,
+                                 nrnk, tol, k, eigs, z, ldz, res,
+                                 b, ldb, v, ldv, s, lds, zwork, lzwork,
+                                 work, lwork, iwork, liwork );
+    /* Release memory and exit; each label frees what was allocated before it */
+    LAPACKE_free( iwork );
+exit_level_2:
+    LAPACKE_free( work );
+exit_level_1:
+    LAPACKE_free( zwork );
+exit_level_0:
+    if( info == LAPACK_WORK_MEMORY_ERROR ) {
+        LAPACKE_xerbla( "LAPACKE_cgedmdq", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgedmdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgedmdq_work.c
new file mode 100644
index 000000000..05287c1bc
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgedmdq_work.c
@@ -0,0 +1,205 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native middle-level C interface to LAPACK function cgedmdq
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+/* Middle-level C interface to LAPACK cgedmdq; zwork, work and iwork come from
+ * the caller. Column-major arrays are passed straight to LAPACK; row-major
+ * arrays are staged through column-major temporaries. Returns LAPACK's info
+ * (negative values decremented by one) or a LAPACKE error code. */
+lapack_int LAPACKE_cgedmdq_work( int matrix_layout, char jobs, char jobz,
+                                 char jobr, char jobq, char jobt, char jobf,
+                                 lapack_int whtsvd, lapack_int m, lapack_int n,
+                                 lapack_complex_float* f, lapack_int ldf,
+                                 lapack_complex_float* x, lapack_int ldx,
+                                 lapack_complex_float* y, lapack_int ldy,
+                                 lapack_int nrnk, float* tol, lapack_int k,
+                                 lapack_complex_float* eigs,
+                                 lapack_complex_float* z,
+                                 lapack_int ldz, float* res,
+                                 lapack_complex_float* b,
+                                 lapack_int ldb, lapack_complex_float* v,
+                                 lapack_int ldv, lapack_complex_float* s,
+                                 lapack_int lds, lapack_complex_float *zwork,
+                                 lapack_int lzwork, float* work,
+                                 lapack_int lwork, lapack_int* iwork,
+                                 lapack_int liwork )
+{
+    lapack_int info = 0;
+    if( matrix_layout == LAPACK_COL_MAJOR ) {
+        /* Call LAPACK directly; a negative info is shifted by one for matrix_layout */
+        LAPACK_cgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
+                        &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, eigs,
+                        z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
+                        zwork, &lzwork, work, &lwork, iwork, &liwork, &info );
+        info = (info < 0) ? (info - 1) : info;
+    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
+        lapack_int ldf_t = MAX(1,m);
+        lapack_int ldx_t = MAX(1,m);
+        lapack_int ldy_t = MAX(1,m);
+        lapack_int ldz_t = MAX(1,m);
+        lapack_int ldb_t = MAX(1,m);
+        lapack_int ldv_t = MAX(1,m);
+        lapack_int lds_t = MAX(1,m);
+        lapack_complex_float* f_t = NULL;
+        lapack_complex_float* x_t = NULL;
+        lapack_complex_float* y_t = NULL;
+        lapack_complex_float* z_t = NULL;
+        lapack_complex_float* b_t = NULL;
+        lapack_complex_float* v_t = NULL;
+        lapack_complex_float* s_t = NULL;
+        /* Check leading dimension(s). NOTE(review): ldz..lds codes are one past their argument positions - confirm */
+        if( ldf < n ) {
+            info = -12;
+            LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info );
+            return info;
+        }
+        if( ldx < n ) {
+            info = -14;
+            LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info );
+            return info;
+        }
+        if( ldy < n ) {
+            info = -16;
+            LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info );
+            return info;
+        }
+        if( ldz < n ) {
+            info = -23;
+            LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info );
+            return info;
+        }
+        if( ldb < n ) {
+            info = -26;
+            LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info );
+            return info;
+        }
+        if( ldv < n ) {
+            info = -28;
+            LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info );
+            return info;
+        }
+        if( lds < n ) {
+            info = -30;
+            LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info );
+            return info;
+        }
+        /* Workspace query: pass the column-major leading dimensions used by the real call */
+        if( lzwork == -1 || lwork == -1 || liwork == -1 ) {
+            LAPACK_cgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
+                            &n, f, &ldf_t, x, &ldx_t, y, &ldy_t, &nrnk, tol, &k, eigs,
+                            z, &ldz_t, res, b, &ldb_t, v, &ldv_t, s, &lds_t,
+                            zwork, &lzwork, work, &lwork, iwork, &liwork, &info );
+            return (info < 0) ? (info - 1) : info;
+        }
+        /* Allocate memory for temporary array(s) */
+        f_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldf_t * MAX(1,n) );
+        if( f_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        x_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldx_t * MAX(1,n) );
+        if( x_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        y_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldy_t * MAX(1,n) );
+        if( y_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_2;
+        }
+        z_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldz_t * MAX(1,n) );
+        if( z_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_3;
+        }
+        b_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldb_t * MAX(1,n) );
+        if( b_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_4;
+        }
+        v_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * ldv_t * MAX(1,n) );
+        if( v_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_5;
+        }
+        s_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lds_t * MAX(1,n) );
+        if( s_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_6;
+        }
+        /* Transpose input matrices into the column-major temporaries */
+        LAPACKE_cge_trans( matrix_layout, m, n, f, ldf, f_t, ldf_t );
+        LAPACKE_cge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t );
+        LAPACKE_cge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t );
+        LAPACKE_cge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t );
+        LAPACKE_cge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t );
+        LAPACKE_cge_trans( matrix_layout, m, n, v, ldv, v_t, ldv_t );
+        LAPACKE_cge_trans( matrix_layout, m, n, s, lds, s_t, lds_t );
+        /* Call LAPACK on the temporaries (not the row-major originals) and adjust info */
+        LAPACK_cgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
+                        &n, f_t, &ldf_t, x_t, &ldx_t, y_t, &ldy_t, &nrnk, tol, &k, eigs,
+                        z_t, &ldz_t, res, b_t, &ldb_t, v_t, &ldv_t, s_t, &lds_t,
+                        zwork, &lzwork, work, &lwork, iwork, &liwork, &info );
+        info = (info < 0) ? (info - 1) : info;
+        /* Transpose output matrices back into the caller's row-major arrays */
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, f_t, ldf_t, f, ldf );
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx );
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy );
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz );
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb );
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, v_t, ldv_t, v, ldv );
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds );
+        /* Release memory and exit; each label frees what was allocated before it */
+        LAPACKE_free( s_t );
+exit_level_6:
+        LAPACKE_free( v_t );
+exit_level_5:
+        LAPACKE_free( b_t );
+exit_level_4:
+        LAPACKE_free( z_t );
+exit_level_3:
+        LAPACKE_free( y_t );
+exit_level_2:
+        LAPACKE_free( x_t );
+exit_level_1:
+        LAPACKE_free( f_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_cgedmdq_work", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgeqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgeqrt_work.c
index 962624d21..3f8f0cf17 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_cgeqrt_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgeqrt_work.c
@@ -83,7 +83,7 @@ lapack_int LAPACKE_cgeqrt_work( int matrix_layout, lapack_int m, lapack_int n,
         }
         /* Transpose output matrices */
         LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
-        LAPACKE_cge_trans( LAPACK_COL_MAJOR, ldt, MIN(m,n), t_t, ldt_t, t,
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, nb, MIN(m,n), t_t, ldt_t, t,
                            ldt );
         /* Release memory and exit */
         LAPACKE_free( t_t );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_clarfb.c b/lapack-netlib/LAPACKE/src/lapacke_clarfb.c
index ed12b476e..aac7b551d 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_clarfb.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_clarfb.c
@@ -58,7 +58,7 @@ lapack_int LAPACKE_clarfb( int matrix_layout, char side, char trans, char direct
 
         nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) );
         ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) );
-        uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u';
+        uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u';
 
         if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) {
             LAPACKE_xerbla( "LAPACKE_clarfb", -8 );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c
index 545769b83..67bbbd34f 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_clarfb_work.c
@@ -60,7 +60,7 @@ lapack_int LAPACKE_clarfb_work( int matrix_layout, char side, char trans,
 
         nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) );
         ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) );
-        uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u';
+        uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u';
 
         ldc_t = MAX(1,m);
         ldt_t = MAX(1,k);
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cunhr_col.c b/lapack-netlib/LAPACKE/src/lapacke_cunhr_col.c
new file mode 100644
index 000000000..7ed1ad4c4
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_cunhr_col.c
@@ -0,0 +1,24 @@
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_cunhr_col( int matrix_layout, lapack_int m, lapack_int n,
+                              lapack_int nb, lapack_complex_float* a,
+                              lapack_int lda, lapack_complex_float* t,
+                              lapack_int ldt, lapack_complex_float* d)
+{
+  /* High-level C interface to LAPACK cunhr_col: validate, then delegate. */
+  if( !( matrix_layout == LAPACK_COL_MAJOR ||
+         matrix_layout == LAPACK_ROW_MAJOR ) ) {
+    /* Unknown layout: report argument 1 and stop */
+    LAPACKE_xerbla( "LAPACKE_cunhr_col", -1 );
+    return -1;
+  }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+  /* With NaN checking switched on, a NaN found in A is reported as -5 */
+  if( LAPACKE_get_nancheck() &&
+      LAPACKE_cge_nancheck( matrix_layout, m, n, a, lda ) ) {
+    return -5;
+  }
+#endif
+  /* Hand over to the middle-level interface and return its status as is */
+  return LAPACKE_cunhr_col_work( matrix_layout, m, n, nb, a, lda, t, ldt, d );
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cunhr_col_work.c b/lapack-netlib/LAPACKE/src/lapacke_cunhr_col_work.c
new file mode 100644
index 000000000..76b8366f0
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_cunhr_col_work.c
@@ -0,0 +1,69 @@
+#include "lapacke_utils.h"
+
+/* Middle-level C interface to LAPACK cunhr_col. Column-major arrays are used
+ * in place; row-major A and T are staged through column-major scratch copies.
+ * Returns LAPACK's info (negative values decremented) or a LAPACKE error. */
+lapack_int LAPACKE_cunhr_col_work( int matrix_layout, lapack_int m, lapack_int n,
+                                   lapack_int nb, lapack_complex_float* a,
+                                   lapack_int lda, lapack_complex_float* t,
+                                   lapack_int ldt, lapack_complex_float* d )
+{
+    /* Column-major scratch copies of A and T, used only for row-major input */
+    lapack_int lda_col = MAX(1,m);
+    lapack_int ldt_col = MAX(1,MIN(nb,n));
+    lapack_complex_float* a_col = NULL;
+    lapack_complex_float* t_col = NULL;
+    lapack_int info = 0;
+
+    if( matrix_layout == LAPACK_COL_MAJOR ) {
+        /* Native layout: hand the caller's arrays straight to LAPACK */
+        LAPACK_cunhr_col( &m, &n, &nb, a, &lda, t, &ldt, d, &info );
+        return ( info < 0 ) ? ( info - 1 ) : info;
+    }
+    if( matrix_layout != LAPACK_ROW_MAJOR ) {
+        LAPACKE_xerbla( "LAPACKE_cunhr_col_work", -1 );
+        return -1;
+    }
+
+    /* Row-major from here on: validate the row-major leading dimensions */
+    if( lda < n ) {
+        LAPACKE_xerbla( "LAPACKE_cunhr_col_work", -6 );
+        return -6;
+    }
+    if( ldt < n ) {
+        LAPACKE_xerbla( "LAPACKE_cunhr_col_work", -8 );
+        return -8;
+    }
+    /* Obtain the scratch buffers; a failed allocation unwinds via the labels */
+    a_col = (lapack_complex_float*)
+        LAPACKE_malloc( sizeof(lapack_complex_float) * lda_col * MAX(1,n) );
+    if( !a_col ) {
+        info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    t_col = (lapack_complex_float*)
+        LAPACKE_malloc( sizeof(lapack_complex_float) * ldt_col * MAX(1,n) );
+    if( !t_col ) {
+        info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+        goto exit_level_1;
+    }
+    /* Copy A into column-major order (T is not copied in before the call) */
+    LAPACKE_cge_trans( matrix_layout, m, n, a, lda, a_col, lda_col );
+    /* Run LAPACK on the scratch copies, decrementing a negative info */
+    LAPACK_cunhr_col( &m, &n, &nb, a_col, &lda_col, t_col, &ldt_col, d, &info );
+    if( info < 0 ) {
+        info--;
+    }
+    /* Copy A and T back into the caller's row-major arrays */
+    LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, a_col, lda_col, a, lda );
+    LAPACKE_cge_trans( LAPACK_COL_MAJOR, ldt, n, t_col, ldt_col, t, ldt );
+    /* Free in reverse order of allocation */
+    LAPACKE_free( t_col );
+exit_level_1:
+    LAPACKE_free( a_col );
+exit_level_0:
+    if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+        LAPACKE_xerbla( "LAPACKE_cunhr_col_work", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgedmd.c b/lapack-netlib/LAPACKE/src/lapacke_dgedmd.c
new file mode 100644
index 000000000..6802378da
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_dgedmd.c
@@ -0,0 +1,114 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native high-level C interface to LAPACK function dgedmd
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_dgedmd( int matrix_layout, char jobs, char jobz, char jobr,
+                           char jobf, lapack_int whtsvd, lapack_int m,
+                           lapack_int n, double* x, lapack_int ldx, double* y,
+                           lapack_int ldy, lapack_int nrnk, double* tol,
+                           lapack_int k, double* reig, double* imeig,
+                           double* z, lapack_int ldz,
+                           double* res, double* b, lapack_int ldb,
+                           double* w, lapack_int ldw, double* s, lapack_int lds)
+{   /* High-level driver: sizes and allocates work/iwork, then calls LAPACKE_dgedmd_work */
+    lapack_int info = 0;
+    lapack_int lwork = -1;   /* -1 turns the first _work call into a size query */
+    lapack_int liwork = -1;  /* same for the integer workspace */
+    double* work = NULL;
+    lapack_int* iwork = NULL;
+    double work_query;       /* receives the required lwork from the query call */
+    lapack_int iwork_query;  /* receives the required liwork from the query call */
+    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
+        LAPACKE_xerbla( "LAPACKE_dgedmd", -1 );
+        return -1;
+    }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+    if( LAPACKE_get_nancheck() ) {
+        /* Optional NaN scan; each code is -(argument position in this signature) */
+        if( LAPACKE_dge_nancheck( matrix_layout, m, n, x, ldx ) ) {
+            return -9;   /* x is argument 9 */
+        }
+        if( LAPACKE_dge_nancheck( matrix_layout, m, n, y, ldy ) ) {
+            return -11;  /* y is argument 11 */
+        }
+        if( LAPACKE_dge_nancheck( matrix_layout, m, n, z, ldz ) ) {
+            return -18;  /* z is argument 18 */
+        }
+        if( LAPACKE_dge_nancheck( matrix_layout, m, n, b, ldb ) ) {
+            return -21;  /* b is argument 21 */
+        }
+        if( LAPACKE_dge_nancheck( matrix_layout, m, n, s, lds ) ) {
+            return -25;  /* s is argument 25 */
+        }
+        if( LAPACKE_dge_nancheck( matrix_layout, m, n, w, ldw ) ) {
+            return -23;  /* w is argument 23 */
+        }   /* NOTE(review): confirm z, b, s, w are true inputs, not output-only buffers */
+    }
+#endif
+    /* Query optimal working array(s) size */
+    info = LAPACKE_dgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd,
+                                m, n, x, ldx, y, ldy, nrnk, tol, k, reig, imeig, z, ldz,
+                                res, b, ldb, w, ldw, s, lds, &work_query,
+                                lwork, &iwork_query, liwork );
+
+    if( info != 0 ) {
+        goto exit_level_0;   /* nothing allocated yet, so skip both frees */
+    }
+    lwork  = (lapack_int) work_query;
+    liwork = iwork_query;
+    /* Allocate memory for work arrays */
+    work  = (double*)LAPACKE_malloc( sizeof(double) * lwork );
+    if( work == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
+    if( iwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_1;   /* work is allocated and must still be released */
+    }
+    /* Call middle-level interface (k is by value: writes through &k stay local) */
+    info = LAPACKE_dgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd,
+                                m, n, x, ldx, y, ldy, nrnk, tol, k, reig, imeig, z, ldz,
+                                res, b, ldb, w, ldw, s, lds, work, lwork,
+                                iwork, liwork );
+    /* Release memory and exit */
+    LAPACKE_free( iwork );
+exit_level_1:
+    LAPACKE_free( work );
+exit_level_0:
+    if( info == LAPACK_WORK_MEMORY_ERROR ) {
+        LAPACKE_xerbla( "LAPACKE_dgedmd", info );   /* only allocation failures are reported here */
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgedmd_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgedmd_work.c
new file mode 100644
index 000000000..987709a1b
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_dgedmd_work.c
@@ -0,0 +1,181 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native middle-level C interface to LAPACK function dgedmd
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_dgedmd_work( int matrix_layout, char jobs, char jobz,
+                                char jobr, char jobf, lapack_int whtsvd,
+                                lapack_int m, lapack_int n, double* x,
+                                lapack_int ldx, double* y, lapack_int ldy,
+                                lapack_int nrnk, double* tol, lapack_int k,
+                                double* reig, double* imeig,
+                                double* z, lapack_int ldz, double* res,
+                                double* b, lapack_int ldb, double* w,
+                                lapack_int ldw, double* s, lapack_int lds,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork )
+{   /* Middle-level wrapper: forwards to LAPACK_dgedmd, transposing for row-major input */
+    lapack_int info = 0;
+    if( matrix_layout == LAPACK_COL_MAJOR ) {
+        /* Call LAPACK function and adjust info */
+        LAPACK_dgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, &ldx, y,
+                       &ldy, &nrnk, tol, &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw,
+                       s, &lds, work, &lwork, iwork, &liwork, &info );
+        if( info < 0 ) {
+            info = info - 1;   /* shift by one to account for matrix_layout */
+        }
+    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
+        lapack_int ldx_t = MAX(1,m);   /* leading dimensions of the column-major copies */
+        lapack_int ldy_t = MAX(1,m);
+        lapack_int ldz_t = MAX(1,m);
+        lapack_int ldb_t = MAX(1,m);
+        lapack_int ldw_t = MAX(1,m);
+        lapack_int lds_t = MAX(1,m);
+        double* x_t = NULL;
+        double* y_t = NULL;
+        double* z_t = NULL;
+        double* b_t = NULL;
+        double* w_t = NULL;
+        double* s_t = NULL;
+        /* Check leading dimension(s); codes are -(argument position in this signature) */
+        if( ldx < n ) {
+            info = -10;   /* ldx is argument 10 */
+            LAPACKE_xerbla( "LAPACKE_dgedmd_work", info );
+            return info;
+        }
+        if( ldy < n ) {
+            info = -12;   /* ldy is argument 12 */
+            LAPACKE_xerbla( "LAPACKE_dgedmd_work", info );
+            return info;
+        }
+        if( ldz < n ) {
+            info = -19;   /* ldz is argument 19 */
+            LAPACKE_xerbla( "LAPACKE_dgedmd_work", info );
+            return info;
+        }
+        if( ldb < n ) {
+            info = -22;   /* ldb is argument 22 */
+            LAPACKE_xerbla( "LAPACKE_dgedmd_work", info );
+            return info;
+        }
+        if( ldw < n ) {
+            info = -24;   /* ldw is argument 24 */
+            LAPACKE_xerbla( "LAPACKE_dgedmd_work", info );
+            return info;
+        }
+        if( lds < n ) {
+            info = -26;   /* lds is argument 26 */
+            LAPACKE_xerbla( "LAPACKE_dgedmd_work", info );
+            return info;
+        }
+        /* Query optimal working array(s) size if requested via lwork or liwork */
+        if( lwork == -1 || liwork == -1 ) {
+            LAPACK_dgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, &ldx,
+                           y, &ldy, &nrnk, tol, &k, reig, imeig, z, &ldz, res, b, &ldb, w,
+                           &ldw, s, &lds, work, &lwork, iwork, &liwork, &info );
+            return (info < 0) ? (info - 1) : info;   /* no temporaries exist yet */
+        }
+        /* Allocate memory for temporary array(s) */
+        x_t = (double*)LAPACKE_malloc( sizeof(double) * ldx_t * MAX(1,n) );
+        if( x_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        y_t = (double*)LAPACKE_malloc( sizeof(double) * ldy_t * MAX(1,n) );
+        if( y_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        z_t = (double*)LAPACKE_malloc( sizeof(double) * ldz_t * MAX(1,n) );
+        if( z_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_2;
+        }
+        b_t = (double*)LAPACKE_malloc( sizeof(double) * ldb_t * MAX(1,n) );
+        if( b_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_3;
+        }
+        w_t = (double*)LAPACKE_malloc( sizeof(double) * ldw_t * MAX(1,n) );
+        if( w_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_4;
+        }
+        s_t = (double*)LAPACKE_malloc( sizeof(double) * lds_t * MAX(1,n) );
+        if( s_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_5;
+        }
+        /* Transpose input matrices (row-major caller data into column-major copies) */
+        LAPACKE_dge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t );
+        LAPACKE_dge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t );
+        LAPACKE_dge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t );
+        LAPACKE_dge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t );
+        LAPACKE_dge_trans( matrix_layout, m, n, w, ldw, w_t, ldw_t );
+        LAPACKE_dge_trans( matrix_layout, m, n, s, lds, s_t, lds_t );
+        /* Call LAPACK function on the copies and adjust info */
+        LAPACK_dgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x_t, &ldx_t,
+                       y_t, &ldy_t, &nrnk, tol, &k, reig, imeig, z_t, &ldz_t, res, b_t,
+                       &ldb_t, w_t, &ldw_t, s_t, &lds_t, work, &lwork,
+                       iwork, &liwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices back into the caller's row-major arrays */
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx );
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy );
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz );
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb );
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, w_t, ldw_t, w, ldw );
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds );
+        /* Release memory and exit; each label frees what was allocated before the failure */
+        LAPACKE_free( s_t );
+exit_level_5:
+        LAPACKE_free( w_t );
+exit_level_4:
+        LAPACKE_free( b_t );
+exit_level_3:
+        LAPACKE_free( z_t );
+exit_level_2:
+        LAPACKE_free( y_t );
+exit_level_1:
+        LAPACKE_free( x_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_dgedmd_work", info );
+        }
+    } else {
+        info = -1;   /* matrix_layout (argument 1) is neither row- nor column-major */
+        LAPACKE_xerbla( "LAPACKE_dgedmd_work", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgedmdq.c b/lapack-netlib/LAPACKE/src/lapacke_dgedmdq.c
new file mode 100644
index 000000000..5c3c39308
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_dgedmdq.c
@@ -0,0 +1,119 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native high-level C interface to LAPACK function dgedmdq
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_dgedmdq( int matrix_layout, char jobs, char jobz, char jobr,
+                            char jobq, char jobt, char jobf, lapack_int whtsvd,
+                            lapack_int m, lapack_int n, double* f, lapack_int ldf,
+                            double* x, lapack_int ldx, double* y, lapack_int ldy,
+                            lapack_int nrnk, double* tol, lapack_int k,
+                            double* reig, double* imeig, double* z,
+                            lapack_int ldz, double* res, double* b, lapack_int ldb,
+                            double* v, lapack_int ldv, double* s, lapack_int lds)
+{   /* High-level driver: sizes and allocates work/iwork, then calls LAPACKE_dgedmdq_work */
+    lapack_int info = 0;
+    lapack_int lwork = -1;   /* -1 turns the first _work call into a size query */
+    lapack_int liwork = -1;  /* same for the integer workspace */
+    double* work = NULL;
+    lapack_int* iwork = NULL;
+    double work_query;       /* receives the required lwork from the query call */
+    lapack_int iwork_query;  /* receives the required liwork from the query call */
+    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
+        LAPACKE_xerbla( "LAPACKE_dgedmdq", -1 );
+        return -1;
+    }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+    if( LAPACKE_get_nancheck() ) {
+        /* Optional NaN scan; each code is -(argument position in this signature) */
+        if( LAPACKE_dge_nancheck( matrix_layout, m, n, f, ldf ) ) {
+            return -11;  /* f is argument 11; this path does not call LAPACKE_xerbla */
+        }
+        if( LAPACKE_dge_nancheck( matrix_layout, m, n, x, ldx ) ) {
+            return -13;  /* x is argument 13 */
+        }
+        if( LAPACKE_dge_nancheck( matrix_layout, m, n, y, ldy ) ) {
+            return -15;  /* y is argument 15 */
+        }
+        if( LAPACKE_dge_nancheck( matrix_layout, m, n, z, ldz ) ) {
+            return -22;  /* z is argument 22 */
+        }
+        if( LAPACKE_dge_nancheck( matrix_layout, m, n, b, ldb ) ) {
+            return -25;  /* b is argument 25 */
+        }
+        if( LAPACKE_dge_nancheck( matrix_layout, m, n, v, ldv ) ) {
+            return -27;  /* v is argument 27 */
+        }
+        if( LAPACKE_dge_nancheck( matrix_layout, m, n, s, lds ) ) {
+            return -29;  /* s is argument 29 */
+        }   /* NOTE(review): confirm z, b, v, s are true inputs, not output-only buffers */
+    }
+#endif
+    /* Query optimal working array(s) size */
+    info = LAPACKE_dgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt,
+                                 jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy,
+                                 nrnk, tol, k, reig, imeig, z, ldz, res,
+                                 b, ldb, v, ldv, s, lds, &work_query, lwork,
+                                 &iwork_query, liwork );
+
+    if( info != 0 ) {
+        goto exit_level_0;   /* nothing allocated yet, so skip both frees */
+    }
+    lwork  = (lapack_int) work_query;
+    liwork = iwork_query;
+    /* Allocate memory for work arrays */
+    work  = (double*)LAPACKE_malloc( sizeof(double) * lwork );
+    if( work == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
+    if( iwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_1;   /* work is allocated and must still be released */
+    }
+    /* Call middle-level interface (k is by value: writes through &k stay local) */
+    info = LAPACKE_dgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt,
+                                 jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy,
+                                 nrnk, tol, k, reig, imeig, z, ldz, res,
+                                 b, ldb, v, ldv, s, lds, work, lwork, iwork,
+                                 liwork );
+    /* Release memory and exit */
+    LAPACKE_free( iwork );
+exit_level_1:
+    LAPACKE_free( work );
+exit_level_0:
+    if( info == LAPACK_WORK_MEMORY_ERROR ) {
+        LAPACKE_xerbla( "LAPACKE_dgedmdq", info );   /* only allocation failures are reported here */
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgedmdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgedmdq_work.c
new file mode 100644
index 000000000..149e6d24f
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_dgedmdq_work.c
@@ -0,0 +1,200 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native middle-level C interface to LAPACK function dgedmdq
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_dgedmdq_work( int matrix_layout, char jobs, char jobz,
+                                 char jobr, char jobq, char jobt, char jobf,
+                                 lapack_int whtsvd, lapack_int m, lapack_int n,
+                                 double* f, lapack_int ldf, double* x,
+                                 lapack_int ldx, double* y, lapack_int ldy,
+                                 lapack_int nrnk, double* tol, lapack_int k,
+                                 double* reig, double* imeig, double* z,
+                                 lapack_int ldz, double* res, double* b,
+                                 lapack_int ldb, double* v, lapack_int ldv,
+                                 double* s, lapack_int lds, double* work,
+                                 lapack_int lwork, lapack_int* iwork,
+                                 lapack_int liwork )
+{   /* Middle-level wrapper: forwards to LAPACK_dgedmdq, transposing for row-major input */
+    lapack_int info = 0;
+    if( matrix_layout == LAPACK_COL_MAJOR ) {
+        /* Call LAPACK function and adjust info */
+        LAPACK_dgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
+                        &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig,
+                        z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
+                        work, &lwork, iwork, &liwork, &info );
+        if( info < 0 ) {
+            info = info - 1;   /* shift by one to account for matrix_layout */
+        }
+    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
+        lapack_int ldf_t = MAX(1,m);   /* leading dimensions of the column-major copies */
+        lapack_int ldx_t = MAX(1,m);
+        lapack_int ldy_t = MAX(1,m);
+        lapack_int ldz_t = MAX(1,m);
+        lapack_int ldb_t = MAX(1,m);
+        lapack_int ldv_t = MAX(1,m);
+        lapack_int lds_t = MAX(1,m);
+        double* f_t = NULL;
+        double* x_t = NULL;
+        double* y_t = NULL;
+        double* z_t = NULL;
+        double* b_t = NULL;
+        double* v_t = NULL;
+        double* s_t = NULL;
+        /* Check leading dimension(s); codes are -(argument position in this signature) */
+        if( ldf < n ) {
+            info = -12;
+            LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info );
+            return info;
+        }
+        if( ldx < n ) {
+            info = -14;
+            LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info );
+            return info;
+        }
+        if( ldy < n ) {
+            info = -16;
+            LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info );
+            return info;
+        }
+        if( ldz < n ) {
+            info = -23;
+            LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info );
+            return info;
+        }
+        if( ldb < n ) {
+            info = -26;
+            LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info );
+            return info;
+        }
+        if( ldv < n ) {
+            info = -28;
+            LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info );
+            return info;
+        }
+        if( lds < n ) {
+            info = -30;
+            LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info );
+            return info;
+        }
+        /* Query optimal working array(s) size if requested via lwork or liwork */
+        if( lwork == -1 || liwork == -1 ) {
+            LAPACK_dgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
+                            &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig,
+                            z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
+                            work, &lwork, iwork, &liwork, &info );
+            return (info < 0) ? (info - 1) : info;   /* no temporaries exist yet */
+        }
+        /* Allocate memory for temporary array(s) */
+        f_t = (double*)LAPACKE_malloc( sizeof(double) * ldf_t * MAX(1,n) );
+        if( f_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        x_t = (double*)LAPACKE_malloc( sizeof(double) * ldx_t * MAX(1,n) );
+        if( x_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        y_t = (double*)LAPACKE_malloc( sizeof(double) * ldy_t * MAX(1,n) );
+        if( y_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_2;
+        }
+        z_t = (double*)LAPACKE_malloc( sizeof(double) * ldz_t * MAX(1,n) );
+        if( z_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_3;
+        }
+        b_t = (double*)LAPACKE_malloc( sizeof(double) * ldb_t * MAX(1,n) );
+        if( b_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_4;
+        }
+        v_t = (double*)LAPACKE_malloc( sizeof(double) * ldv_t * MAX(1,n) );
+        if( v_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_5;
+        }
+        s_t = (double*)LAPACKE_malloc( sizeof(double) * lds_t * MAX(1,n) );
+        if( s_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_6;
+        }
+        /* Transpose input matrices (row-major caller data into column-major copies) */
+        LAPACKE_dge_trans( matrix_layout, m, n, f, ldf, f_t, ldf_t );
+        LAPACKE_dge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t );
+        LAPACKE_dge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t );
+        LAPACKE_dge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t );
+        LAPACKE_dge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t );
+        LAPACKE_dge_trans( matrix_layout, m, n, v, ldv, v_t, ldv_t );
+        LAPACKE_dge_trans( matrix_layout, m, n, s, lds, s_t, lds_t );
+        /* Call LAPACK on the column-major copies (not the caller's arrays) and adjust info */
+        LAPACK_dgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
+                        &n, f_t, &ldf_t, x_t, &ldx_t, y_t, &ldy_t, &nrnk, tol, &k,
+                        reig, imeig, z_t, &ldz_t, res, b_t, &ldb_t, v_t, &ldv_t,
+                        s_t, &lds_t, work, &lwork, iwork, &liwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices back into the caller's row-major arrays */
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, f_t, ldf_t, f, ldf );
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx );
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy );
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz );
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb );
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, v_t, ldv_t, v, ldv );
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds );
+        /* Release memory and exit; each label frees what was allocated before the failure */
+        LAPACKE_free( s_t );
+exit_level_6:
+        LAPACKE_free( v_t );
+exit_level_5:
+        LAPACKE_free( b_t );
+exit_level_4:
+        LAPACKE_free( z_t );
+exit_level_3:
+        LAPACKE_free( y_t );
+exit_level_2:
+        LAPACKE_free( x_t );
+exit_level_1:
+        LAPACKE_free( f_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info );
+        }
+    } else {
+        info = -1;   /* matrix_layout (argument 1) is neither row- nor column-major */
+        LAPACKE_xerbla( "LAPACKE_dgedmdq_work", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgeesx.c b/lapack-netlib/LAPACKE/src/lapacke_dgeesx.c
index 7796edffc..0460b6406 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dgeesx.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dgeesx.c
@@ -80,10 +80,13 @@ lapack_int LAPACKE_dgeesx( int matrix_layout, char jobvs, char sort,
     /* Allocate memory for work arrays */
     if( LAPACKE_lsame( sense, 'b' ) || LAPACKE_lsame( sense, 'v' ) ) {
         iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
-        if( iwork == NULL ) {
-            info = LAPACK_WORK_MEMORY_ERROR;
-            goto exit_level_1;
-        }
+    }
+    else {
+        iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * 1 );
+    }
+    if( iwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_1;
     }
     work = (double*)LAPACKE_malloc( sizeof(double) * lwork );
     if( work == NULL ) {
@@ -97,9 +100,7 @@ lapack_int LAPACKE_dgeesx( int matrix_layout, char jobvs, char sort,
     /* Release memory and exit */
     LAPACKE_free( work );
 exit_level_2:
-    if( LAPACKE_lsame( sense, 'b' ) || LAPACKE_lsame( sense, 'v' ) ) {
-        LAPACKE_free( iwork );
-    }
+    LAPACKE_free( iwork );
 exit_level_1:
     if( LAPACKE_lsame( sort, 's' ) ) {
         LAPACKE_free( bwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgeqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgeqrt_work.c
index 8a4c7cead..000c94e0b 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dgeqrt_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dgeqrt_work.c
@@ -80,7 +80,7 @@ lapack_int LAPACKE_dgeqrt_work( int matrix_layout, lapack_int m, lapack_int n,
         }
         /* Transpose output matrices */
         LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
-        LAPACKE_dge_trans( LAPACK_COL_MAJOR, ldt, MIN(m,n), t_t, ldt_t, t,
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, nb, MIN(m,n), t_t, ldt_t, t,
                            ldt );
         /* Release memory and exit */
         LAPACKE_free( t_t );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c b/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c
index f4ddc62a5..aeebd8dec 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dlarfb.c
@@ -57,7 +57,7 @@ lapack_int LAPACKE_dlarfb( int matrix_layout, char side, char trans, char direct
 
         nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) );
         ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) );
-        uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u';
+        uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u';
 
         if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) {
             LAPACKE_xerbla( "LAPACKE_dlarfb", -8 );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c
index de444c146..de2f41e66 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dlarfb_work.c
@@ -59,7 +59,7 @@ lapack_int LAPACKE_dlarfb_work( int matrix_layout, char side, char trans,
 
         nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) );
         ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) );
-        uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u';
+        uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u';
 
         ldc_t = MAX(1,m);
         ldt_t = MAX(1,k);
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dorhr_col.c b/lapack-netlib/LAPACKE/src/lapacke_dorhr_col.c
new file mode 100644
index 000000000..1f37725e9
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_dorhr_col.c
@@ -0,0 +1,24 @@
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_dorhr_col( int matrix_layout, lapack_int m, lapack_int n,
+                              lapack_int nb, double* a,
+                              lapack_int lda, double* t,
+                              lapack_int ldt, double* d)
+{ /* High-level wrapper: validates layout, optionally NaN-checks a, then calls the _work routine */
+  lapack_int info = 0;
+  if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
+    LAPACKE_xerbla( "LAPACKE_dorhr_col", -1 );  /* argument 1 (matrix_layout) is invalid */
+    return -1;
+  }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+  if( LAPACKE_get_nancheck() ) {
+    /* Optionally check input matrices for NaNs */
+    if( LAPACKE_dge_nancheck( matrix_layout, m, n, a, lda ) ) {
+      return -5;  /* a is argument 5; this path does not call LAPACKE_xerbla */
+    }
+  }
+#endif
+  /* Call middle-level interface; no workspace is allocated at this level */
+  info = LAPACKE_dorhr_col_work( matrix_layout, m, n, nb, a, lda, t, ldt, d );
+  return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dorhr_col_work.c b/lapack-netlib/LAPACKE/src/lapacke_dorhr_col_work.c
new file mode 100644
index 000000000..28b80cc02
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_dorhr_col_work.c
@@ -0,0 +1,69 @@
+#include "lapacke_utils.h"
+
+/* Middle-level dorhr_col wrapper. Column-major input is passed straight to
+ * LAPACK; row-major input is transposed into temporaries and the results are
+ * transposed back. Negative LAPACK info is decremented (matrix_layout is #1). */
+lapack_int LAPACKE_dorhr_col_work( int matrix_layout, lapack_int m, lapack_int n,
+                                   lapack_int nb, double* a, lapack_int lda,
+                                   double* t, lapack_int ldt, double* d )
+{
+    lapack_int info = 0;
+    if( matrix_layout == LAPACK_COL_MAJOR ) {
+        /* Call LAPACK function and adjust info */
+        LAPACK_dorhr_col( &m, &n, &nb, a, &lda, t, &ldt, d, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
+        lapack_int lda_t = MAX(1,m);
+        lapack_int ldt_t = MAX(1,MIN(nb,n));
+        double* a_t = NULL;
+        double* t_t = NULL;
+        /* Check leading dimension(s) */
+        if( lda < n ) {
+            info = -6;
+            LAPACKE_xerbla( "LAPACKE_dorhr_col_work", info );
+            return info;
+        }
+        if( ldt < n ) {
+            info = -8;
+            LAPACKE_xerbla( "LAPACKE_dorhr_col_work", info );
+            return info;
+        }
+        /* Allocate memory for temporary array(s) */
+        a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,n) );
+        if( a_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        t_t = (double*)LAPACKE_malloc( sizeof(double) * ldt_t * MAX(1,n) );
+        if( t_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        /* Transpose input matrices */
+        LAPACKE_dge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
+        /* Call LAPACK function and adjust info */
+        LAPACK_dorhr_col( &m, &n, &nb, a_t, &lda_t, t_t, &ldt_t, d, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
+        /* t_t holds MIN(nb,n) rows (leading dimension ldt_t); ldt is the
+         * row-major stride of t, not a row count */
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, MIN(nb,n), n, t_t, ldt_t, t, ldt );
+        /* Release memory and exit */
+        LAPACKE_free( t_t );
+exit_level_1:
+        LAPACKE_free( a_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_dorhr_col_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_dorhr_col_work", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_nancheck.c b/lapack-netlib/LAPACKE/src/lapacke_nancheck.c
index c7d5c33f1..bb894f351 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_nancheck.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_nancheck.c
@@ -39,7 +39,7 @@ void LAPACKE_set_nancheck( int flag )
     nancheck_flag = ( flag ) ? 1 : 0;
 }
 
-int LAPACKE_get_nancheck( )
+int LAPACKE_get_nancheck( void )
 {
     char* env;
     if ( nancheck_flag != -1 ) {
diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgedmd.c b/lapack-netlib/LAPACKE/src/lapacke_sgedmd.c
new file mode 100644
index 000000000..6865fcf65
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_sgedmd.c
@@ -0,0 +1,114 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native high-level C interface to LAPACK function sgedmd
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+/* High-level sgedmd wrapper: validates the layout, optionally NaN-checks the
+ * matrix arguments, sizes work/iwork through a workspace query, then calls
+ * LAPACKE_sgedmd_work. Error codes count matrix_layout as argument 1. */
+lapack_int LAPACKE_sgedmd( int matrix_layout, char jobs, char jobz, char jobr,
+                           char jobf, lapack_int whtsvd, lapack_int m,
+                           lapack_int n, float* x, lapack_int ldx, float* y,
+                           lapack_int ldy, lapack_int nrnk, float* tol,
+                           lapack_int k, float* reig, float* imeig, float* z,
+                           lapack_int ldz, float* res, float* b, lapack_int ldb,
+                           float* w, lapack_int ldw, float* s, lapack_int lds)
+{
+    lapack_int info = 0;
+    lapack_int lwork = -1, liwork = -1;
+    float* work = NULL;
+    lapack_int* iwork = NULL;
+    float work_query;
+    lapack_int iwork_query;
+    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
+        LAPACKE_xerbla( "LAPACKE_sgedmd", -1 );
+        return -1;
+    }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+    if( LAPACKE_get_nancheck() ) {
+        /* Optionally check matrices for NaNs; codes are argument positions */
+        if( LAPACKE_sge_nancheck( matrix_layout, m, n, x, ldx ) ) {
+            return -9;
+        }
+        if( LAPACKE_sge_nancheck( matrix_layout, m, n, y, ldy ) ) {
+            return -11;
+        }
+        if( LAPACKE_sge_nancheck( matrix_layout, m, n, z, ldz ) ) {
+            return -18;
+        }
+        if( LAPACKE_sge_nancheck( matrix_layout, m, n, b, ldb ) ) {
+            return -21;
+        }
+        if( LAPACKE_sge_nancheck( matrix_layout, m, n, s, lds ) ) {
+            return -25;
+        }
+        if( LAPACKE_sge_nancheck( matrix_layout, m, n, w, ldw ) ) {
+            return -23;
+        }
+    }
+#endif
+    /* Query optimal working array(s) size (lwork = liwork = -1) */
+    info = LAPACKE_sgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd,
+                                m, n, x, ldx, y, ldy, nrnk, tol, k, reig, imeig,
+                                z, ldz, res, b, ldb, w, ldw, s, lds,
+                                &work_query, lwork, &iwork_query, liwork );
+    if( info != 0 ) {
+        goto exit_level_0;
+    }
+    lwork  = (lapack_int) work_query;
+    liwork = iwork_query;
+    /* Allocate memory for work arrays */
+    work  = (float*)LAPACKE_malloc( sizeof(float) * lwork );
+    if( work == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
+    if( iwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_1;
+    }
+    /* Call middle-level interface */
+    info = LAPACKE_sgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd,
+                                m, n, x, ldx, y, ldy, nrnk, tol, k, reig, imeig,
+                                z, ldz, res, b, ldb, w, ldw, s, lds,
+                                work, lwork, iwork, liwork );
+    /* Release memory and exit */
+    LAPACKE_free( iwork );
+exit_level_1:
+    LAPACKE_free( work );
+exit_level_0:
+    if( info == LAPACK_WORK_MEMORY_ERROR ) {
+        LAPACKE_xerbla( "LAPACKE_sgedmd", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgedmd_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgedmd_work.c
new file mode 100644
index 000000000..5b24152da
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_sgedmd_work.c
@@ -0,0 +1,182 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native middle-level C interface to LAPACK function sgedmd
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+/* Middle-level sgedmd wrapper. Column-major data is handed straight to LAPACK;
+ * row-major matrices are transposed into temporaries and copied back after
+ * the call. Negative info uses LAPACKE numbering (matrix_layout is #1). */
+lapack_int LAPACKE_sgedmd_work( int matrix_layout, char jobs, char jobz,
+                                char jobr, char jobf, lapack_int whtsvd,
+                                lapack_int m, lapack_int n, float* x,
+                                lapack_int ldx, float* y, lapack_int ldy,
+                                lapack_int nrnk, float* tol, lapack_int k,
+                                float* reig, float* imeig,
+                                float* z, lapack_int ldz, float* res,
+                                float* b, lapack_int ldb, float* w,
+                                lapack_int ldw, float* s, lapack_int lds,
+                                float* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork )
+{
+    lapack_int info = 0;
+    if( matrix_layout == LAPACK_COL_MAJOR ) {
+        /* Call LAPACK function and adjust info */
+        LAPACK_sgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, &ldx, y,
+                       &ldy, &nrnk, tol, &k, reig, imeig, z, &ldz, res, b, &ldb,
+                       w, &ldw, s, &lds, work, &lwork, iwork, &liwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
+        lapack_int ldx_t = MAX(1,m);
+        lapack_int ldy_t = MAX(1,m);
+        lapack_int ldz_t = MAX(1,m);
+        lapack_int ldb_t = MAX(1,m);
+        lapack_int ldw_t = MAX(1,m);
+        lapack_int lds_t = MAX(1,m);
+        float *x_t = NULL, *y_t = NULL, *z_t = NULL;
+        float *b_t = NULL, *w_t = NULL, *s_t = NULL;
+        /* Check leading dimension(s); codes are the ld argument positions */
+        if( ldx < n ) {
+            info = -10;
+            LAPACKE_xerbla( "LAPACKE_sgedmd_work", info );
+            return info;
+        }
+        if( ldy < n ) {
+            info = -12;
+            LAPACKE_xerbla( "LAPACKE_sgedmd_work", info );
+            return info;
+        }
+        if( ldz < n ) {
+            info = -19;
+            LAPACKE_xerbla( "LAPACKE_sgedmd_work", info );
+            return info;
+        }
+        if( ldb < n ) {
+            info = -22;
+            LAPACKE_xerbla( "LAPACKE_sgedmd_work", info );
+            return info;
+        }
+        if( ldw < n ) {
+            info = -24;
+            LAPACKE_xerbla( "LAPACKE_sgedmd_work", info );
+            return info;
+        }
+        if( lds < n ) {
+            info = -26;
+            LAPACKE_xerbla( "LAPACKE_sgedmd_work", info );
+            return info;
+        }
+        /* Query optimal working array(s) size if requested; LAPACK must see
+         * the column-major leading dimensions, hence the *_t values */
+        if( lwork == -1 || liwork == -1 ) {
+            LAPACK_sgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x,
+                           &ldx_t, y, &ldy_t, &nrnk, tol, &k, reig, imeig, z,
+                           &ldz_t, res, b, &ldb_t, w, &ldw_t, s, &lds_t, work,
+                           &lwork, iwork, &liwork, &info );
+            return (info < 0) ? (info - 1) : info;
+        }
+        /* Allocate memory for temporary array(s) */
+        x_t = (float*)LAPACKE_malloc( sizeof(float) * ldx_t * MAX(1,n) );
+        if( x_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        y_t = (float*)LAPACKE_malloc( sizeof(float) * ldy_t * MAX(1,n) );
+        if( y_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        z_t = (float*)LAPACKE_malloc( sizeof(float) * ldz_t * MAX(1,n) );
+        if( z_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_2;
+        }
+        b_t = (float*)LAPACKE_malloc( sizeof(float) * ldb_t * MAX(1,n) );
+        if( b_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_3;
+        }
+        w_t = (float*)LAPACKE_malloc( sizeof(float) * ldw_t * MAX(1,n) );
+        if( w_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_4;
+        }
+        s_t = (float*)LAPACKE_malloc( sizeof(float) * lds_t * MAX(1,n) );
+        if( s_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_5;
+        }
+        /* Transpose input matrices */
+        LAPACKE_sge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t );
+        LAPACKE_sge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t );
+        LAPACKE_sge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t );
+        LAPACKE_sge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t );
+        LAPACKE_sge_trans( matrix_layout, m, n, w, ldw, w_t, ldw_t );
+        LAPACKE_sge_trans( matrix_layout, m, n, s, lds, s_t, lds_t );
+        /* Call LAPACK function and adjust info */
+        LAPACK_sgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x_t,
+                       &ldx_t, y_t, &ldy_t, &nrnk, tol, &k, reig, imeig, z_t,
+                       &ldz_t, res, b_t, &ldb_t, w_t, &ldw_t, s_t, &lds_t, work,
+                       &lwork, iwork, &liwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx );
+        LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy );
+        LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz );
+        LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb );
+        LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, w_t, ldw_t, w, ldw );
+        LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds );
+        /* Release memory and exit */
+        LAPACKE_free( s_t );
+exit_level_5:
+        LAPACKE_free( w_t );
+exit_level_4:
+        LAPACKE_free( b_t );
+exit_level_3:
+        LAPACKE_free( z_t );
+exit_level_2:
+        LAPACKE_free( y_t );
+exit_level_1:
+        LAPACKE_free( x_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_sgedmd_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_sgedmd_work", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgedmdq.c b/lapack-netlib/LAPACKE/src/lapacke_sgedmdq.c
new file mode 100644
index 000000000..e65c2094f
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_sgedmdq.c
@@ -0,0 +1,119 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native high-level C interface to LAPACK function sgedmdq
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+/* High-level sgedmdq wrapper. Rejects an unknown layout, optionally screens
+ * the matrix arguments for NaNs, queries the middle-level routine for the
+ * work/iwork sizes, allocates both buffers and then performs the real call.
+ * Returns the middle-level info, or a negative code raised here. */
+lapack_int LAPACKE_sgedmdq( int matrix_layout, char jobs, char jobz, char jobr,
+                            char jobq, char jobt, char jobf, lapack_int whtsvd,
+                            lapack_int m, lapack_int n, float* f, lapack_int ldf,
+                            float* x, lapack_int ldx, float* y, lapack_int ldy,
+                            lapack_int nrnk, float* tol, lapack_int k,
+                            float* reig, float* imeig, float* z,
+                            lapack_int ldz, float* res, float* b, lapack_int ldb,
+                            float* v, lapack_int ldv, float* s, lapack_int lds)
+{
+    lapack_int info = 0;
+    /* Both sizes start at -1 so that the first call is a workspace query */
+    lapack_int lwork = -1, liwork = -1;
+    float* work = NULL;
+    lapack_int* iwork = NULL;
+    float opt_lwork;
+    lapack_int opt_liwork;
+    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
+        LAPACKE_xerbla( "LAPACKE_sgedmdq", -1 );
+        return -1;
+    }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+    if( LAPACKE_get_nancheck() ) {
+        /* Optionally check matrices for NaNs; codes are argument positions */
+        if( LAPACKE_sge_nancheck( matrix_layout, m, n, f, ldf ) )
+            return -11;
+        if( LAPACKE_sge_nancheck( matrix_layout, m, n, x, ldx ) )
+            return -13;
+        if( LAPACKE_sge_nancheck( matrix_layout, m, n, y, ldy ) )
+            return -15;
+        if( LAPACKE_sge_nancheck( matrix_layout, m, n, z, ldz ) )
+            return -22;
+        if( LAPACKE_sge_nancheck( matrix_layout, m, n, b, ldb ) )
+            return -25;
+        if( LAPACKE_sge_nancheck( matrix_layout, m, n, v, ldv ) )
+            return -27;
+        if( LAPACKE_sge_nancheck( matrix_layout, m, n, s, lds ) )
+            return -29;
+    }
+#endif
+    /* First pass: workspace query, the sizes come back in opt_lwork and
+     * opt_liwork */
+    info = LAPACKE_sgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt,
+                                 jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy,
+                                 nrnk, tol, k, reig, imeig, z, ldz, res,
+                                 b, ldb, v, ldv, s, lds, &opt_lwork, lwork,
+                                 &opt_liwork, liwork );
+    if( info != 0 ) {
+        goto exit_level_0;
+    }
+    /* The float work size is reported as a float and converted here */
+    lwork  = (lapack_int) opt_lwork;
+    liwork = opt_liwork;
+    /* Allocate the two work arrays with the reported sizes */
+    work  = (float*)LAPACKE_malloc( sizeof(float) * lwork );
+    if( work == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
+    if( iwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_1;
+    }
+    /* Second pass: the actual computation through the middle-level interface */
+    info = LAPACKE_sgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt,
+                                 jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy,
+                                 nrnk, tol, k, reig, imeig, z, ldz, res,
+                                 b, ldb, v, ldv, s, lds, work, lwork, iwork,
+                                 liwork );
+    /* Release buffers in reverse allocation order; the labels let a failed
+     * allocation skip the frees of buffers that were never obtained */
+    LAPACKE_free( iwork );
+exit_level_1:
+    LAPACKE_free( work );
+exit_level_0:
+    /* Only allocation failures are reported from this level */
+    if( info == LAPACK_WORK_MEMORY_ERROR ) {
+        LAPACKE_xerbla( "LAPACKE_sgedmdq", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgedmdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgedmdq_work.c
new file mode 100644
index 000000000..e1c1f5c98
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_sgedmdq_work.c
@@ -0,0 +1,200 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native middle-level C interface to LAPACK function sgedmdq
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+/* Middle-level sgedmdq wrapper. Column-major data is handed straight to
+ * LAPACK; row-major matrices are transposed into temporaries, LAPACK works
+ * on those, and the results are transposed back. Negative info is shifted by 1. */
+lapack_int LAPACKE_sgedmdq_work( int matrix_layout, char jobs, char jobz,
+                                 char jobr, char jobq, char jobt, char jobf,
+                                 lapack_int whtsvd, lapack_int m, lapack_int n,
+                                 float* f, lapack_int ldf, float* x,
+                                 lapack_int ldx, float* y, lapack_int ldy,
+                                 lapack_int nrnk, float* tol, lapack_int k,
+                                 float* reig, float* imeig, float* z,
+                                 lapack_int ldz, float* res, float* b,
+                                 lapack_int ldb, float* v, lapack_int ldv,
+                                 float* s, lapack_int lds, float* work,
+                                 lapack_int lwork, lapack_int* iwork,
+                                 lapack_int liwork )
+{
+    lapack_int info = 0;
+    if( matrix_layout == LAPACK_COL_MAJOR ) {
+        /* Call LAPACK function and adjust info */
+        LAPACK_sgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
+                        &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig,
+                        z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
+                        work, &lwork, iwork, &liwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
+        lapack_int ldf_t = MAX(1,m);
+        lapack_int ldx_t = MAX(1,m);
+        lapack_int ldy_t = MAX(1,m);
+        lapack_int ldz_t = MAX(1,m);
+        lapack_int ldb_t = MAX(1,m);
+        lapack_int ldv_t = MAX(1,m);
+        lapack_int lds_t = MAX(1,m);
+        float *f_t = NULL, *x_t = NULL, *y_t = NULL, *z_t = NULL;
+        float *b_t = NULL, *v_t = NULL, *s_t = NULL;
+        /* Check leading dimension(s) */
+        if( ldf < n ) {
+            info = -12;
+            LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info );
+            return info;
+        }
+        if( ldx < n ) {
+            info = -14;
+            LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info );
+            return info;
+        }
+        if( ldy < n ) {
+            info = -16;
+            LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info );
+            return info;
+        }
+        if( ldz < n ) {
+            info = -23;
+            LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info );
+            return info;
+        }
+        if( ldb < n ) {
+            info = -26;
+            LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info );
+            return info;
+        }
+        if( ldv < n ) {
+            info = -28;
+            LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info );
+            return info;
+        }
+        if( lds < n ) {
+            info = -30;
+            LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info );
+            return info;
+        }
+        /* Query optimal working array(s) size if requested; LAPACK must see
+         * the column-major leading dimensions, hence the *_t values */
+        if( lwork == -1 || liwork == -1 ) {
+            LAPACK_sgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
+                            &n, f, &ldf_t, x, &ldx_t, y, &ldy_t, &nrnk, tol, &k,
+                            reig, imeig, z, &ldz_t, res, b, &ldb_t, v, &ldv_t,
+                            s, &lds_t, work, &lwork, iwork, &liwork, &info );
+            return (info < 0) ? (info - 1) : info;
+        }
+        /* Allocate memory for temporary array(s) */
+        f_t = (float*)LAPACKE_malloc( sizeof(float) * ldf_t * MAX(1,n) );
+        if( f_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        x_t = (float*)LAPACKE_malloc( sizeof(float) * ldx_t * MAX(1,n) );
+        if( x_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        y_t = (float*)LAPACKE_malloc( sizeof(float) * ldy_t * MAX(1,n) );
+        if( y_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_2;
+        }
+        z_t = (float*)LAPACKE_malloc( sizeof(float) * ldz_t * MAX(1,n) );
+        if( z_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_3;
+        }
+        b_t = (float*)LAPACKE_malloc( sizeof(float) * ldb_t * MAX(1,n) );
+        if( b_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_4;
+        }
+        v_t = (float*)LAPACKE_malloc( sizeof(float) * ldv_t * MAX(1,n) );
+        if( v_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_5;
+        }
+        s_t = (float*)LAPACKE_malloc( sizeof(float) * lds_t * MAX(1,n) );
+        if( s_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_6;
+        }
+        /* Transpose input matrices */
+        LAPACKE_sge_trans( matrix_layout, m, n, f, ldf, f_t, ldf_t );
+        LAPACKE_sge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t );
+        LAPACKE_sge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t );
+        LAPACKE_sge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t );
+        LAPACKE_sge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t );
+        LAPACKE_sge_trans( matrix_layout, m, n, v, ldv, v_t, ldv_t );
+        LAPACKE_sge_trans( matrix_layout, m, n, s, lds, s_t, lds_t );
+        /* Call LAPACK on the column-major temporaries (not the caller's
+         * row-major arrays) and adjust info */
+        LAPACK_sgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
+                        &n, f_t, &ldf_t, x_t, &ldx_t, y_t, &ldy_t, &nrnk, tol, &k,
+                        reig, imeig, z_t, &ldz_t, res, b_t, &ldb_t, v_t, &ldv_t,
+                        s_t, &lds_t, work, &lwork, iwork, &liwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, f_t, ldf_t, f, ldf );
+        LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx );
+        LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy );
+        LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz );
+        LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb );
+        LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, v_t, ldv_t, v, ldv );
+        LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds );
+        /* Release memory and exit */
+        LAPACKE_free( s_t );
+exit_level_6:
+        LAPACKE_free( v_t );
+exit_level_5:
+        LAPACKE_free( b_t );
+exit_level_4:
+        LAPACKE_free( z_t );
+exit_level_3:
+        LAPACKE_free( y_t );
+exit_level_2:
+        LAPACKE_free( x_t );
+exit_level_1:
+        LAPACKE_free( f_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_sgedmdq_work", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgeesx.c b/lapack-netlib/LAPACKE/src/lapacke_sgeesx.c
index f7be44297..d2555ecc8 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_sgeesx.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_sgeesx.c
@@ -80,10 +80,13 @@ lapack_int LAPACKE_sgeesx( int matrix_layout, char jobvs, char sort,
     /* Allocate memory for work arrays */
     if( LAPACKE_lsame( sense, 'b' ) || LAPACKE_lsame( sense, 'v' ) ) {
         iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
-        if( iwork == NULL ) {
-            info = LAPACK_WORK_MEMORY_ERROR;
-            goto exit_level_1;
-        }
+    }
+    else {
+        iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * 1 );
+    }
+    if( iwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_1;
     }
     work = (float*)LAPACKE_malloc( sizeof(float) * lwork );
     if( work == NULL ) {
@@ -97,9 +100,7 @@ lapack_int LAPACKE_sgeesx( int matrix_layout, char jobvs, char sort,
     /* Release memory and exit */
     LAPACKE_free( work );
 exit_level_2:
-    if( LAPACKE_lsame( sense, 'b' ) || LAPACKE_lsame( sense, 'v' ) ) {
-        LAPACKE_free( iwork );
-    }
+    LAPACKE_free( iwork );
 exit_level_1:
     if( LAPACKE_lsame( sort, 's' ) ) {
         LAPACKE_free( bwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgeqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgeqrt_work.c
index 00ca6e501..d357845ae 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_sgeqrt_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_sgeqrt_work.c
@@ -79,7 +79,7 @@ lapack_int LAPACKE_sgeqrt_work( int matrix_layout, lapack_int m, lapack_int n,
         }
         /* Transpose output matrices */
         LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
-        LAPACKE_sge_trans( LAPACK_COL_MAJOR, ldt, MIN(m,n), t_t, ldt_t, t,
+        LAPACKE_sge_trans( LAPACK_COL_MAJOR, nb, MIN(m,n), t_t, ldt_t, t,
                            ldt );
         /* Release memory and exit */
         LAPACKE_free( t_t );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_slarfb.c b/lapack-netlib/LAPACKE/src/lapacke_slarfb.c
index d36958f93..3d6c29f88 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_slarfb.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_slarfb.c
@@ -57,7 +57,7 @@ lapack_int LAPACKE_slarfb( int matrix_layout, char side, char trans, char direct
 
         nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) );
         ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) );
-        uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u';
+        uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u';
 
         if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) {
             LAPACKE_xerbla( "LAPACKE_slarfb", -8 );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c
index 8b6127633..72a392a77 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_slarfb_work.c
@@ -59,7 +59,7 @@ lapack_int LAPACKE_slarfb_work( int matrix_layout, char side, char trans,
 
         nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) );
         ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) );
-        uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u';
+        uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u';
 
         ldc_t = MAX(1,m);
         ldt_t = MAX(1,k);
diff --git a/lapack-netlib/LAPACKE/src/lapacke_sorhr_col.c b/lapack-netlib/LAPACKE/src/lapacke_sorhr_col.c
new file mode 100644
index 000000000..60e6e7951
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_sorhr_col.c
@@ -0,0 +1,24 @@
+#include "lapacke_utils.h"
+
+/* High-level LAPACKE wrapper for sorhr_col: validates the layout, optionally
+ * NaN-checks a, then forwards every argument to LAPACKE_sorhr_col_work. */
+lapack_int LAPACKE_sorhr_col( int matrix_layout, lapack_int m, lapack_int n,
+                              lapack_int nb, float* a, lapack_int lda,
+                              float* t, lapack_int ldt, float* d )
+{
+    const int layout_ok = ( matrix_layout == LAPACK_COL_MAJOR ||
+                            matrix_layout == LAPACK_ROW_MAJOR );
+    if( !layout_ok ) {
+        /* Unknown layout: report argument 1 and bail out. */
+        LAPACKE_xerbla( "LAPACKE_sorhr_col", -1 );
+        return -1;
+    }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+    /* When NaN checking is on, a NaN in a is reported as -5 (no xerbla). */
+    if( LAPACKE_get_nancheck() &&
+        LAPACKE_sge_nancheck( matrix_layout, m, n, a, lda ) ) {
+        return -5;
+    }
+#endif
+    return LAPACKE_sorhr_col_work( matrix_layout, m, n, nb, a, lda, t, ldt, d );
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_sorhr_col_work.c b/lapack-netlib/LAPACKE/src/lapacke_sorhr_col_work.c
new file mode 100644
index 000000000..56d6a965e
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_sorhr_col_work.c
@@ -0,0 +1,69 @@
+#include "lapacke_utils.h"
+
+/* Middle-level LAPACKE wrapper for sorhr_col. Column-major data is passed
+ * straight to LAPACK_sorhr_col; row-major a (in/out) and t (out) go through
+ * column-major temporaries. Negative LAPACK info values are shifted by one. */
+lapack_int LAPACKE_sorhr_col_work( int matrix_layout, lapack_int m, lapack_int n,
+                                   lapack_int nb, float* a, lapack_int lda,
+                                   float* t, lapack_int ldt, float* d )
+{
+    lapack_int info = 0;
+    if( matrix_layout == LAPACK_COL_MAJOR ) {
+        /* Call LAPACK function and adjust info */
+        LAPACK_sorhr_col( &m, &n, &nb, a, &lda, t, &ldt, d, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
+        lapack_int lda_t = MAX(1,m);
+        lapack_int ldt_t = MAX(1,MIN(nb,n));
+        float* a_t = NULL;
+        float* t_t = NULL;
+        /* Check leading dimension(s): row-major strides must cover n columns */
+        if( lda < n ) {
+            info = -6;
+            LAPACKE_xerbla( "LAPACKE_sorhr_col_work", info );
+            return info;
+        }
+        if( ldt < n ) {
+            info = -8;
+            LAPACKE_xerbla( "LAPACKE_sorhr_col_work", info );
+            return info;
+        }
+        /* Allocate memory for temporary array(s) */
+        a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,n) );
+        if( a_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        t_t = (float*)LAPACKE_malloc( sizeof(float) * ldt_t * MAX(1,n) );
+        if( t_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        /* Transpose input matrices (t is not copied in, only copied out) */
+        LAPACKE_sge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
+        /* Call LAPACK function and adjust info */
+        LAPACK_sorhr_col( &m, &n, &nb, a_t, &lda_t, t_t, &ldt_t, d, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
+        /* t_t was allocated with ldt_t rows, so ldt_t (not the row-major
+         * stride ldt) is the number of rows to copy back into t. */
+        LAPACKE_sge_trans( LAPACK_COL_MAJOR, ldt_t, n, t_t, ldt_t, t, ldt );
+        /* Release memory and exit */
+        LAPACKE_free( t_t );
+exit_level_1:
+        LAPACKE_free( a_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_sorhr_col_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_sorhr_col_work", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgedmd.c b/lapack-netlib/LAPACKE/src/lapacke_zgedmd.c
new file mode 100644
index 000000000..e4ea4fe10
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_zgedmd.c
@@ -0,0 +1,128 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native high-level C interface to LAPACK function zgedmd
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+/* High-level LAPACKE driver for zgedmd.
+ * Validates matrix_layout, optionally runs the NaN checks, asks
+ * LAPACKE_zgedmd_work for the workspace sizes, allocates zwork/work/iwork,
+ * runs the computation and frees the workspaces again.
+ * Returns -1 for a bad layout, -8/-10/-15/-18/-20/-22 when the NaN check
+ * trips on x/y/z/b/s/w, LAPACK_WORK_MEMORY_ERROR if an allocation fails,
+ * otherwise the info value of LAPACKE_zgedmd_work.
+ * NOTE(review): zw and lzw are accepted but never referenced here. */
+lapack_int LAPACKE_zgedmd( int matrix_layout, char jobs, char jobz, char jobr,
+                           char jobf, lapack_int whtsvd, lapack_int m,
+                           lapack_int n, lapack_complex_double* x,
+                           lapack_int ldx, lapack_complex_double* y,
+                           lapack_int ldy, lapack_int nrnk, double *tol, lapack_int k,
+                           lapack_complex_double* eigs, lapack_complex_double* z,
+                           lapack_int ldz, double* res,
+                           lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* zw, lapack_int lzw,
+                           lapack_complex_double* w, lapack_int ldw,
+                           lapack_complex_double* s, lapack_int lds)
+{
+    /* A length of -1 makes the first work call a workspace-size query. */
+    lapack_int lzwork = -1, lwork = -1, liwork = -1;
+    lapack_int info = 0;
+    lapack_complex_double* zwork = NULL;
+    double* work = NULL;
+    lapack_int* iwork = NULL;
+    lapack_complex_double zwork_opt;
+    double work_opt;
+    lapack_int iwork_opt;
+    if( !( matrix_layout == LAPACK_COL_MAJOR ||
+           matrix_layout == LAPACK_ROW_MAJOR ) ) {
+        LAPACKE_xerbla( "LAPACKE_zgedmd", -1 );
+        return -1;
+    }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+    if( LAPACKE_get_nancheck() ) {
+        /* Each array is scanned as an m-by-n matrix; first hit wins.
+         * NOTE(review): z, b, s, w are scanned like inputs and with the
+         * same m-by-n shape as x and y - confirm that is intended. */
+        if( LAPACKE_zge_nancheck( matrix_layout, m, n, x, ldx ) ) return -8;
+        if( LAPACKE_zge_nancheck( matrix_layout, m, n, y, ldy ) ) return -10;
+        if( LAPACKE_zge_nancheck( matrix_layout, m, n, z, ldz ) ) return -15;
+        if( LAPACKE_zge_nancheck( matrix_layout, m, n, b, ldb ) ) return -18;
+        if( LAPACKE_zge_nancheck( matrix_layout, m, n, s, lds ) ) return -20;
+        if( LAPACKE_zge_nancheck( matrix_layout, m, n, w, ldw ) ) return -22;
+    }
+#endif
+    /* First pass: workspace-size query (all three lengths are -1). */
+    info = LAPACKE_zgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd,
+                                m, n, x, ldx, y, ldy, nrnk, tol, k, eigs, z, ldz,
+                                res, b, ldb, w, ldw, s, lds, &zwork_opt, lzwork,
+                                &work_opt, lwork, &iwork_opt, liwork );
+
+    if( info != 0 ) {
+        goto exit_level_0;
+    }
+    /* Turn the values written by the query into element counts. */
+    lzwork = LAPACK_Z2INT( zwork_opt );
+    lwork  = LAPACK_Z2INT( work_opt );
+    liwork = iwork_opt;
+    /* Allocate the three workspaces; a failure unwinds what exists so far. */
+    zwork = (lapack_complex_double*)
+        LAPACKE_malloc( sizeof(lapack_complex_double) * lzwork );
+    if( zwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    work = (double*)LAPACKE_malloc( sizeof(double) * lwork );
+    if( work == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_1;
+    }
+    iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
+    if( iwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_2;
+    }
+    /* Second pass: the actual computation with the allocated workspaces. */
+    info = LAPACKE_zgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd,
+                                m, n, x, ldx, y, ldy, nrnk, tol, k, eigs, z, ldz,
+                                res, b, ldb, w, ldw, s, lds, zwork, lzwork,
+                                work, lwork, iwork, liwork );
+    /* Free in reverse order of allocation; the labels are the unwind points. */
+    LAPACKE_free( iwork );
+exit_level_2:
+    LAPACKE_free( work );
+exit_level_1:
+    LAPACKE_free( zwork );
+exit_level_0:
+    if( info == LAPACK_WORK_MEMORY_ERROR ) {
+        LAPACKE_xerbla( "LAPACKE_zgedmd", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgedmd_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgedmd_work.c
new file mode 100644
index 000000000..ebacfaa94
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_zgedmd_work.c
@@ -0,0 +1,186 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native middle-level C interface to LAPACK function zgedmd
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+/* Middle-level LAPACKE interface to zgedmd.
+ * LAPACK_COL_MAJOR: arguments are handed to LAPACK_zgedmd unchanged.
+ * LAPACK_ROW_MAJOR: x, y, z, b, w and s are each copied into an m-by-n
+ * column-major temporary, LAPACK_zgedmd runs on the copies, and all six are
+ * copied back; lwork == -1 short-circuits into a workspace query instead.
+ * Returns the LAPACK info (negative values decremented by one), -1 for an
+ * unknown layout, -9/-11/-16/-19/-21/-23 when ldx/ldy/ldz/ldb/ldw/lds is
+ * smaller than n, or LAPACK_TRANSPOSE_MEMORY_ERROR if a temporary cannot be
+ * allocated. */
+lapack_int LAPACKE_zgedmd_work( int matrix_layout, char jobs, char jobz,
+                                char jobr, char jobf, lapack_int whtsvd,
+                                lapack_int m, lapack_int n,
+                                lapack_complex_double* x,
+                                lapack_int ldx, lapack_complex_double* y,
+                                lapack_int ldy, lapack_int nrnk, double *tol, lapack_int k,
+                                lapack_complex_double* eigs, lapack_complex_double* z,
+                                lapack_int ldz, double* res,
+                                lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* w, lapack_int ldw,
+                                lapack_complex_double* s, lapack_int lds,
+                                lapack_complex_double* zwork, lapack_int lzwork,
+                                double* work, lapack_int lwork,
+                                lapack_int* iwork, lapack_int liwork )
+{
+    lapack_int info = 0;
+    /* Leading dimensions and buffers of the column-major temporaries. */
+    lapack_int ldx_t = MAX(1,m), ldy_t = MAX(1,m), ldz_t = MAX(1,m);
+    lapack_int ldb_t = MAX(1,m), ldw_t = MAX(1,m), lds_t = MAX(1,m);
+    lapack_complex_double *x_t = NULL, *y_t = NULL, *z_t = NULL;
+    lapack_complex_double *b_t = NULL, *w_t = NULL, *s_t = NULL;
+
+    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_zgedmd_work", info );
+        return info;
+    }
+    if( matrix_layout == LAPACK_COL_MAJOR ) {
+        /* Native layout: call LAPACK directly and shift argument errors. */
+        LAPACK_zgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, &ldx,
+                       y, &ldy, &nrnk, tol, &k, eigs, z, &ldz, res, b, &ldb, w,
+                       &ldw, s, &lds, zwork, &lzwork, work, &lwork, iwork,
+                       &liwork, &info );
+        return ( info < 0 ) ? ( info - 1 ) : info;
+    }
+
+    /* Row-major from here on. The row-major strides must cover n columns;
+     * the first violated check, in argument order, decides the error. */
+    if( ldx < n ) {
+        info = -9;
+    } else if( ldy < n ) {
+        info = -11;
+    } else if( ldz < n ) {
+        info = -16;
+    } else if( ldb < n ) {
+        info = -19;
+    } else if( ldw < n ) {
+        info = -21;
+    } else if( lds < n ) {
+        info = -23;
+    }
+    if( info != 0 ) {
+        LAPACKE_xerbla( "LAPACKE_zgedmd_work", info );
+        return info;
+    }
+    /* Workspace query: only lwork == -1 is tested, and the caller's
+     * row-major leading dimensions are passed through as they are.
+     * NOTE(review): the real call below uses the *_t leading dimensions
+     * and lzwork/liwork are not tested here - confirm this is intended. */
+    if( lwork == -1 ) {
+        LAPACK_zgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x,
+                       &ldx, y, &ldy, &nrnk, tol, &k, eigs, z, &ldz, res, b,
+                       &ldb, w, &ldw, s, &lds, zwork, &lzwork, work,
+                       &lwork, iwork, &liwork, &info );
+        return (info < 0) ? (info - 1) : info;
+    }
+    /* One m-by-n column-major temporary per matrix argument. */
+    x_t = (lapack_complex_double*)
+        LAPACKE_malloc( sizeof(lapack_complex_double) * ldx_t * MAX(1,n) );
+    if( x_t == NULL ) {
+        info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    y_t = (lapack_complex_double*)
+        LAPACKE_malloc( sizeof(lapack_complex_double) * ldy_t * MAX(1,n) );
+    if( y_t == NULL ) {
+        info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+        goto exit_level_1;
+    }
+    z_t = (lapack_complex_double*)
+        LAPACKE_malloc( sizeof(lapack_complex_double) * ldz_t * MAX(1,n) );
+    if( z_t == NULL ) {
+        info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+        goto exit_level_2;
+    }
+    b_t = (lapack_complex_double*)
+        LAPACKE_malloc( sizeof(lapack_complex_double) * ldb_t * MAX(1,n) );
+    if( b_t == NULL ) {
+        info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+        goto exit_level_3;
+    }
+    w_t = (lapack_complex_double*)
+        LAPACKE_malloc( sizeof(lapack_complex_double) * ldw_t * MAX(1,n) );
+    if( w_t == NULL ) {
+        info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+        goto exit_level_4;
+    }
+    s_t = (lapack_complex_double*)
+        LAPACKE_malloc( sizeof(lapack_complex_double) * lds_t * MAX(1,n) );
+    if( s_t == NULL ) {
+        info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+        goto exit_level_5;
+    }
+    /* Row-major -> column-major copies of all six matrices. */
+    LAPACKE_zge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t );
+    LAPACKE_zge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t );
+    LAPACKE_zge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t );
+    LAPACKE_zge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t );
+    LAPACKE_zge_trans( matrix_layout, m, n, w, ldw, w_t, ldw_t );
+    LAPACKE_zge_trans( matrix_layout, m, n, s, lds, s_t, lds_t );
+    /* Run LAPACK on the temporaries and shift argument errors by one. */
+    LAPACK_zgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x_t,
+                   &ldx_t, y_t, &ldy_t, &nrnk, tol, &k, eigs, z_t, &ldz_t,
+                   res, b_t, &ldb_t, w_t, &ldw_t, s_t, &lds_t, zwork,
+                   &lzwork, work, &lwork, iwork, &liwork, &info );
+    if( info < 0 ) {
+        info = info - 1;
+    }
+    /* Column-major -> row-major copies back into the caller's arrays. */
+    LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx );
+    LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy );
+    LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz );
+    LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb );
+    LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, w_t, ldw_t, w, ldw );
+    LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds );
+    /* Unwind: each label frees the buffers allocated before the jump. */
+    LAPACKE_free( s_t );
+exit_level_5:
+    LAPACKE_free( w_t );
+exit_level_4:
+    LAPACKE_free( b_t );
+exit_level_3:
+    LAPACKE_free( z_t );
+exit_level_2:
+    LAPACKE_free( y_t );
+exit_level_1:
+    LAPACKE_free( x_t );
+exit_level_0:
+    if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+        LAPACKE_xerbla( "LAPACKE_zgedmd_work", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgedmdq.c b/lapack-netlib/LAPACKE/src/lapacke_zgedmdq.c
new file mode 100644
index 000000000..368d48e20
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_zgedmdq.c
@@ -0,0 +1,133 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native high-level C interface to LAPACK function zgedmdq
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+/* High-level LAPACKE driver for zgedmdq.
+ * Validates matrix_layout, optionally NaN-checks the matrix arguments,
+ * queries LAPACKE_zgedmdq_work for the workspace sizes, allocates
+ * zwork/work/iwork, runs the computation and releases the workspaces.
+ * The NaN check returns directly, without calling LAPACKE_xerbla.
+ * Returns -1 for a bad layout, -11/-13/-15/-22/-25/-27/-29 when the NaN
+ * check trips on f/x/y/z/b/v/s, LAPACK_WORK_MEMORY_ERROR when a workspace
+ * cannot be allocated, otherwise the info of LAPACKE_zgedmdq_work. */
+lapack_int LAPACKE_zgedmdq( int matrix_layout, char jobs, char jobz, char jobr,
+                            char jobq, char jobt, char jobf, lapack_int whtsvd,
+                            lapack_int m, lapack_int n, lapack_complex_double* f,
+                            lapack_int ldf, lapack_complex_double* x,
+                            lapack_int ldx, lapack_complex_double* y,
+                            lapack_int ldy, lapack_int nrnk, double* tol,
+                            lapack_int k, lapack_complex_double* eigs,
+                            lapack_complex_double* z, lapack_int ldz,
+                            double* res, lapack_complex_double* b,
+                            lapack_int ldb, lapack_complex_double* v,
+                            lapack_int ldv, lapack_complex_double* s, lapack_int lds)
+{
+    lapack_int info = 0;
+    lapack_int lwork = -1;
+    lapack_int liwork = -1;
+    lapack_int lzwork = -1;
+    lapack_complex_double* zwork = NULL;
+    double* work = NULL;
+    lapack_int* iwork = NULL;
+    double work_query;
+    lapack_complex_double zwork_query;
+    lapack_int iwork_query;
+    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
+        LAPACKE_xerbla( "LAPACKE_zgedmdq", -1 );
+        return -1;
+    }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+    if( LAPACKE_get_nancheck() ) {
+        /* Optionally check matrices for NaNs (each scanned as m-by-n) */
+        if( LAPACKE_zge_nancheck( matrix_layout, m, n, f, ldf ) ) return -11;
+        if( LAPACKE_zge_nancheck( matrix_layout, m, n, x, ldx ) ) return -13;
+        if( LAPACKE_zge_nancheck( matrix_layout, m, n, y, ldy ) ) return -15;
+        if( LAPACKE_zge_nancheck( matrix_layout, m, n, z, ldz ) ) return -22;
+        if( LAPACKE_zge_nancheck( matrix_layout, m, n, b, ldb ) ) return -25;
+        if( LAPACKE_zge_nancheck( matrix_layout, m, n, v, ldv ) ) return -27;
+        if( LAPACKE_zge_nancheck( matrix_layout, m, n, s, lds ) ) return -29;
+    }
+#endif
+    /* Query optimal working array(s) size: all three lengths are still -1
+     * here, and the sizes come back in the *_query scalars. */
+    info = LAPACKE_zgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt,
+                                 jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy,
+                                 nrnk, tol, k, eigs, z, ldz, res,
+                                 b, ldb, v, ldv, s, lds, &zwork_query, lzwork,
+                                 &work_query, lwork, &iwork_query, liwork );
+
+    /* Nothing has been allocated yet, so a failed query just returns. */
+    if( info != 0 ) {
+        goto exit_level_0;
+    }
+    lwork  = LAPACK_Z2INT( work_query );
+    lzwork  = LAPACK_Z2INT( zwork_query );
+    liwork = iwork_query;
+    /* Allocate memory for work arrays */
+    zwork  = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lzwork );
+    if( zwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    /* work is an array of double, so it is sized with sizeof(double);
+     * sizing it with sizeof(lapack_complex_double) allocated twice the
+     * memory that lwork elements need. */
+    work  = (double*)LAPACKE_malloc( sizeof(double) * lwork );
+    if( work == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_1;
+    }
+    iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
+    if( iwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_2;
+    }
+    /* Call middle-level interface */
+    info = LAPACKE_zgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt,
+                                 jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy,
+                                 nrnk, tol, k, eigs, z, ldz, res,
+                                 b, ldb, v, ldv, s, lds, zwork, lzwork,
+                                 work, lwork, iwork, liwork );
+    /* Release memory and exit: buffers are freed in reverse order of
+     * allocation, and each label is the unwind point of one failure. */
+    LAPACKE_free( iwork );
+exit_level_2:
+    LAPACKE_free( work );
+exit_level_1:
+    LAPACKE_free( zwork );
+exit_level_0:
+    if( info == LAPACK_WORK_MEMORY_ERROR ) {
+        LAPACKE_xerbla( "LAPACKE_zgedmdq", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgedmdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgedmdq_work.c
new file mode 100644
index 000000000..131e4f9ad
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_zgedmdq_work.c
@@ -0,0 +1,205 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native middle-level C interface to LAPACK function zgedmdq
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_zgedmdq_work( int matrix_layout, char jobs, char jobz,
+                                 char jobr, char jobq, char jobt, char jobf,
+                                 lapack_int whtsvd, lapack_int m, lapack_int n,
+                                 lapack_complex_double* f, lapack_int ldf,
+                                 lapack_complex_double* x, lapack_int ldx,
+                                 lapack_complex_double* y, lapack_int ldy,
+                                 lapack_int nrnk, double* tol, lapack_int k,
+                                 lapack_complex_double* eigs,
+                                 lapack_complex_double* z,
+                                 lapack_int ldz, double* res,
+                                 lapack_complex_double* b,
+                                 lapack_int ldb, lapack_complex_double* v,
+                                 lapack_int ldv, lapack_complex_double* s,
+                                 lapack_int lds, lapack_complex_double* zwork,
+				 lapack_int lzwork, double* work,
+                                 lapack_int lwork, lapack_int* iwork,
+                                 lapack_int liwork )
+{
+    lapack_int info = 0;
+    if( matrix_layout == LAPACK_COL_MAJOR ) {
+        /* Call LAPACK function and adjust info */
+        LAPACK_zgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
+                        &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, eigs,
+                        z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
+                        zwork, &lzwork, work, &lwork, iwork, &liwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
+        lapack_int ldf_t = MAX(1,m);
+        lapack_int ldx_t = MAX(1,m);
+        lapack_int ldy_t = MAX(1,m);
+        lapack_int ldz_t = MAX(1,m);
+        lapack_int ldb_t = MAX(1,m);
+        lapack_int ldv_t = MAX(1,m);
+        lapack_int lds_t = MAX(1,m);
+        lapack_complex_double* f_t = NULL;
+        lapack_complex_double* x_t = NULL;
+        lapack_complex_double* y_t = NULL;
+        lapack_complex_double* z_t = NULL;
+        lapack_complex_double* b_t = NULL;
+        lapack_complex_double* v_t = NULL;
+        lapack_complex_double* s_t = NULL;
+        /* Check leading dimension(s) */
+        if( ldf < n ) {
+            info = -12;
+            LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info );
+            return info;
+        }
+        if( ldx < n ) {
+            info = -14;
+            LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info );
+            return info;
+        }
+        if( ldy < n ) {
+            info = -16;
+            LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info );
+            return info;
+        }
+        if( ldz < n ) {
+            info = -23;
+            LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info );
+            return info;
+        }
+        if( ldb < n ) {
+            info = -26;
+            LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info );
+            return info;
+        }
+        if( ldv < n ) {
+            info = -28;
+            LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info );
+            return info;
+        }
+        if( lds < n ) {
+            info = -30;
+            LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info );
+            return info;
+        }
+        /* Query optimal working array(s) size if requested */
+        if( lwork == -1 || liwork == -1 ) {
+            LAPACK_zgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
+                            &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, eigs,
+                            z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
+                            zwork, &lzwork, work, &lwork, iwork, &liwork, &info );
+            return (info < 0) ? (info - 1) : info;
+        }
+        /* Allocate memory for temporary array(s) */
+        f_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldf_t * MAX(1,n) );
+        if( f_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        x_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldx_t * MAX(1,n) );
+        if( x_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        y_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldy_t * MAX(1,n) );
+        if( y_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_2;
+        }
+        z_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldz_t * MAX(1,n) );
+        if( z_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_3;
+        }
+        b_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldb_t * MAX(1,n) );
+        if( b_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_4;
+        }
+        v_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * ldv_t * MAX(1,n) );
+        if( v_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_5;
+        }
+        s_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lds_t * MAX(1,n) );
+        if( s_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_6;
+        }
+        /* Transpose input matrices */
+        LAPACKE_zge_trans( matrix_layout, m, n, f, ldf, f_t, ldf_t );
+        LAPACKE_zge_trans( matrix_layout, m, n, x, ldx, x_t, ldx_t );
+        LAPACKE_zge_trans( matrix_layout, m, n, y, ldy, y_t, ldy_t );
+        LAPACKE_zge_trans( matrix_layout, m, n, z, ldz, z_t, ldz_t );
+        LAPACKE_zge_trans( matrix_layout, m, n, b, ldb, b_t, ldb_t );
+        LAPACKE_zge_trans( matrix_layout, m, n, v, ldv, v_t, ldv_t );
+        LAPACKE_zge_trans( matrix_layout, m, n, s, lds, s_t, lds_t );
+        /* Call LAPACK function and adjust info */
+        LAPACK_zgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m,
+                        &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, eigs,
+                        z, &ldz, res, b, &ldb, v, &ldv, s, &lds,
+                        zwork, &lzwork, work, &lwork, iwork, &liwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, f_t, ldf_t, f, ldf );
+        LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, x_t, ldx_t, x, ldx );
+        LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, y_t, ldy_t, y, ldy );
+        LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, z_t, ldz_t, z, ldz );
+        LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, b_t, ldb_t, b, ldb );
+        LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, v_t, ldv_t, v, ldv );
+        LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, s_t, lds_t, s, lds );
+        /* Release memory and exit */
+        LAPACKE_free( s_t );
+exit_level_6:
+        LAPACKE_free( v_t );
+exit_level_5:
+        LAPACKE_free( b_t );
+exit_level_4:
+        LAPACKE_free( z_t );
+exit_level_3:
+        LAPACKE_free( y_t );
+exit_level_2:
+        LAPACKE_free( x_t );
+exit_level_1:
+        LAPACKE_free( f_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_zgedmdq_work", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgeqrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgeqrt_work.c
index 8f6f1ed0b..633111533 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zgeqrt_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zgeqrt_work.c
@@ -83,7 +83,7 @@ lapack_int LAPACKE_zgeqrt_work( int matrix_layout, lapack_int m, lapack_int n,
         }
         /* Transpose output matrices */
         LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
-        LAPACKE_zge_trans( LAPACK_COL_MAJOR, ldt, MIN(m,n), t_t, ldt_t, t,
+        LAPACKE_zge_trans( LAPACK_COL_MAJOR, nb, MIN(m,n), t_t, ldt_t, t,
                            ldt );
         /* Release memory and exit */
         LAPACKE_free( t_t );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c b/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c
index 85355b202..c5edbbc0e 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zlarfb.c
@@ -58,7 +58,7 @@ lapack_int LAPACKE_zlarfb( int matrix_layout, char side, char trans, char direct
 
         nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) );
         ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) );
-        uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u';
+        uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u';
 
         if( ( col && k > nrows_v ) || ( !col && k > ncols_v ) ) {
             LAPACKE_xerbla( "LAPACKE_zlarfb", -8 );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c b/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c
index 72d85ec82..232c8ef58 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zlarfb_work.c
@@ -60,7 +60,7 @@ lapack_int LAPACKE_zlarfb_work( int matrix_layout, char side, char trans,
 
         nrows_v = ( col && left ) ? m : ( ( col && !left ) ? n : ( !col ? k : 1) );
         ncols_v = ( !col && left ) ? m : ( ( !col && !left ) ? n : ( col ? k : 1 ) );
-        uplo = ( ( left && col ) || !( left || col ) ) ? 'l' : 'u';
+        uplo = ( ( forward && col ) || !( forward || col ) ) ? 'l' : 'u';
 
         ldc_t = MAX(1,m);
         ldt_t = MAX(1,k);
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zunhr_col.c b/lapack-netlib/LAPACKE/src/lapacke_zunhr_col.c
new file mode 100644
index 000000000..7e2507daf
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_zunhr_col.c
@@ -0,0 +1,24 @@
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_zunhr_col( int matrix_layout, lapack_int m, lapack_int n,
+                              lapack_int nb, lapack_complex_double* a,
+                              lapack_int lda, lapack_complex_double* t,
+                              lapack_int ldt, lapack_complex_double* d)
+{
+  /* High-level entry point: validate, then call LAPACKE_zunhr_col_work. */
+  /* Any layout other than LAPACK_COL_MAJOR / LAPACK_ROW_MAJOR yields -1. */
+  if( !( matrix_layout == LAPACK_COL_MAJOR ||
+         matrix_layout == LAPACK_ROW_MAJOR ) ) {
+    LAPACKE_xerbla( "LAPACKE_zunhr_col", -1 );
+    return -1;
+  }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+  /* With NaN checking switched on, a NaN found in a makes the call return -5 */
+  if( LAPACKE_get_nancheck() &&
+      LAPACKE_zge_nancheck( matrix_layout, m, n, a, lda ) ) {
+    return -5;
+  }
+#endif
+  /* Hand off to the middle-level routine and return its info unchanged. */
+  return LAPACKE_zunhr_col_work( matrix_layout, m, n, nb, a, lda, t, ldt, d );
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zunhr_col_work.c b/lapack-netlib/LAPACKE/src/lapacke_zunhr_col_work.c
new file mode 100644
index 000000000..b5e640177
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_zunhr_col_work.c
@@ -0,0 +1,69 @@
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_zunhr_col_work( int matrix_layout, lapack_int m, lapack_int n,
+                                   lapack_int nb, lapack_complex_double* a,
+                                   lapack_int lda, lapack_complex_double* t,
+                                   lapack_int ldt, lapack_complex_double* d )
+{
+    /* Middle-level wrapper around LAPACK_zunhr_col for either layout. */
+    lapack_int info = 0;
+    if( matrix_layout == LAPACK_COL_MAJOR ) {
+        /* Call LAPACK; bump negative info past the matrix_layout argument */
+        LAPACK_zunhr_col( &m, &n, &nb, a, &lda, t, &ldt, d, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
+        lapack_int lda_t = MAX(1,m);
+        lapack_int ldt_t = MAX(1,MIN(nb,n));
+        lapack_complex_double* a_t = NULL;
+        lapack_complex_double* t_t = NULL;
+        /* Check leading dimension(s): a and t both need n entries per row */
+        if( lda < n ) {
+            info = -6;
+            LAPACKE_xerbla( "LAPACKE_zunhr_col_work", info );
+            return info;
+        }
+        if( ldt < n ) {
+            info = -8;
+            LAPACKE_xerbla( "LAPACKE_zunhr_col_work", info );
+            return info;
+        }
+        /* Allocate column-major temporaries: lda_t and ldt_t by MAX(1,n) */
+        a_t = (lapack_complex_double*)
+            LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,n) );
+        if( a_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        t_t = (lapack_complex_double*)
+            LAPACKE_malloc( sizeof(lapack_complex_double) * ldt_t * MAX(1,n) );
+        if( t_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        /* Transpose input matrices (only a is copied in before the call) */
+        LAPACKE_zge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
+        /* Call LAPACK function and adjust info */
+        LAPACK_zunhr_col( &m, &n, &nb, a_t, &lda_t, t_t, &ldt_t, d, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices; t_t holds MIN(nb,n) rows, not ldt */
+        LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
+        LAPACKE_zge_trans( LAPACK_COL_MAJOR, MIN(nb,n), n, t_t, ldt_t, t,
+                           ldt );
+        /* Release memory and exit */
+        LAPACKE_free( t_t );
+exit_level_1:
+        LAPACKE_free( a_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_zunhr_col_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_zunhr_col_work", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/utils/lapacke_cgb_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_cgb_trans.c
index 2adf71493..b32fc2f9e 100644
--- a/lapack-netlib/LAPACKE/utils/lapacke_cgb_trans.c
+++ b/lapack-netlib/LAPACKE/utils/lapacke_cgb_trans.c
@@ -54,7 +54,7 @@ void LAPACKE_cgb_trans( int matrix_layout, lapack_int m, lapack_int n,
         }
     } else if ( matrix_layout == LAPACK_ROW_MAJOR ) {
         /* TODO: interchange loops for performance.
-         * This is just reference impemeltation.
+         * This is just reference implementation.
          */
         for( j = 0; j < MIN( n, ldin ); j++ ) {
             for( i = MAX( ku-j, 0 ); i < MIN3( ldout, m+ku-j, kl+ku+1 );
diff --git a/lapack-netlib/LAPACKE/utils/lapacke_dgb_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_dgb_trans.c
index 6624936a6..89e421eae 100644
--- a/lapack-netlib/LAPACKE/utils/lapacke_dgb_trans.c
+++ b/lapack-netlib/LAPACKE/utils/lapacke_dgb_trans.c
@@ -54,7 +54,7 @@ void LAPACKE_dgb_trans( int matrix_layout, lapack_int m, lapack_int n,
         }
     } else if ( matrix_layout == LAPACK_ROW_MAJOR ) {
         /* TODO: interchange loops for performance.
-         * This is just reference impemeltation.
+         * This is just reference implementation.
          */
         for( j = 0; j < MIN( n, ldin ); j++ ) {
             for( i = MAX( ku-j, 0 ); i < MIN3( ldout, m+ku-j, kl+ku+1 );
diff --git a/lapack-netlib/LAPACKE/utils/lapacke_sgb_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_sgb_trans.c
index 046fdb9ca..a90c9617a 100644
--- a/lapack-netlib/LAPACKE/utils/lapacke_sgb_trans.c
+++ b/lapack-netlib/LAPACKE/utils/lapacke_sgb_trans.c
@@ -54,7 +54,7 @@ void LAPACKE_sgb_trans( int matrix_layout, lapack_int m, lapack_int n,
         }
     } else if ( matrix_layout == LAPACK_ROW_MAJOR ) {
         /* TODO: interchange loops for performance.
-         * This is just reference impemeltation.
+         * This is just reference implementation.
          */
         for( j = 0; j < MIN( n, ldin ); j++ ) {
             for( i = MAX( ku-j, 0 ); i < MIN3( ldout, m+ku-j, kl+ku+1 );
diff --git a/lapack-netlib/LAPACKE/utils/lapacke_zgb_trans.c b/lapack-netlib/LAPACKE/utils/lapacke_zgb_trans.c
index 32a9de379..63323b1c3 100644
--- a/lapack-netlib/LAPACKE/utils/lapacke_zgb_trans.c
+++ b/lapack-netlib/LAPACKE/utils/lapacke_zgb_trans.c
@@ -54,7 +54,7 @@ void LAPACKE_zgb_trans( int matrix_layout, lapack_int m, lapack_int n,
         }
     } else if ( matrix_layout == LAPACK_ROW_MAJOR ) {
         /* TODO: interchange loops for performance.
-         * This is just reference impemeltation
+         * This is just reference implementation
          */
         for( j = 0; j < MIN( n, ldin ); j++ ) {
             for( i = MAX( ku-j, 0 ); i < MIN3( ldout, m+ku-j, kl+ku+1 );
diff --git a/lapack-netlib/LICENSE b/lapack-netlib/LICENSE
index 94cdb0f85..96b04c988 100644
--- a/lapack-netlib/LICENSE
+++ b/lapack-netlib/LICENSE
@@ -1,9 +1,9 @@
-Copyright (c) 1992-2017 The University of Tennessee and The University
+Copyright (c) 1992-2023 The University of Tennessee and The University
                         of Tennessee Research Foundation.  All rights
                         reserved.
-Copyright (c) 2000-2017 The University of California Berkeley. All
+Copyright (c) 2000-2023 The University of California Berkeley. All
                         rights reserved.
-Copyright (c) 2006-2017 The University of Colorado Denver.  All rights
+Copyright (c) 2006-2023 The University of Colorado Denver.  All rights
                         reserved.
 
 $COPYRIGHT$
diff --git a/lapack-netlib/README.md b/lapack-netlib/README.md
index 142aa7b72..a00d4c51d 100644
--- a/lapack-netlib/README.md
+++ b/lapack-netlib/README.md
@@ -6,7 +6,7 @@
 [![Appveyor](https://ci.appveyor.com/api/projects/status/bh38iin398msrbtr?svg=true)](https://ci.appveyor.com/project/langou/lapack/)
 [![codecov](https://codecov.io/gh/Reference-LAPACK/lapack/branch/master/graph/badge.svg)](https://codecov.io/gh/Reference-LAPACK/lapack)
 [![Packaging status](https://repology.org/badge/tiny-repos/lapack.svg)](https://repology.org/metapackage/lapack/versions)
-
+[![OpenSSF Scorecard](https://api.securityscorecards.dev/projects/github.com/Reference-LAPACK/lapack/badge)](https://securityscorecards.dev/viewer/?uri=github.com/Reference-LAPACK/lapack)
 
 * VERSION 1.0   :  February 29, 1992
 * VERSION 1.0a  :  June 30, 1992
@@ -37,6 +37,7 @@
 * VERSION 3.10.0 : June 2021
 * VERSION 3.10.1 : April 2022
 * VERSION 3.11.0 : November 2022
+* VERSION 3.12.0 : November 2023
 
 LAPACK is a library of Fortran subroutines for solving the most commonly
 occurring problems in numerical linear algebra.
diff --git a/lapack-netlib/SRC/DEPRECATED/cgegs.c b/lapack-netlib/SRC/DEPRECATED/cgegs.c
index 35b59e683..270a05ebd 100644
--- a/lapack-netlib/SRC/DEPRECATED/cgegs.c
+++ b/lapack-netlib/SRC/DEPRECATED/cgegs.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,7 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/cgegs.f b/lapack-netlib/SRC/DEPRECATED/cgegs.f
index 1f0791a20..b6adf9111 100644
--- a/lapack-netlib/SRC/DEPRECATED/cgegs.f
+++ b/lapack-netlib/SRC/DEPRECATED/cgegs.f
@@ -1,4 +1,4 @@
-*> \brief <b> CGEEVX computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices</b>
+*> \brief <b> CGEGS computes the eigenvalues, Schur form, and, optionally, the left and/or right Schur vectors of a complex matrix pair (A,B)</b>
 *
 *  =========== DOCUMENTATION ===========
 *
diff --git a/lapack-netlib/SRC/DEPRECATED/cgegv.c b/lapack-netlib/SRC/DEPRECATED/cgegv.c
index 5a5ea919f..d485b05b6 100644
--- a/lapack-netlib/SRC/DEPRECATED/cgegv.c
+++ b/lapack-netlib/SRC/DEPRECATED/cgegv.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/cgegv.f b/lapack-netlib/SRC/DEPRECATED/cgegv.f
index ba810ddef..d2b254255 100644
--- a/lapack-netlib/SRC/DEPRECATED/cgegv.f
+++ b/lapack-netlib/SRC/DEPRECATED/cgegv.f
@@ -1,4 +1,4 @@
-*> \brief <b> CGEEVX computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices</b>
+*> \brief <b> CGEGV computes the eigenvalues and, optionally, the left and/or right eigenvectors of a complex matrix pair (A,B).</b>
 *
 *  =========== DOCUMENTATION ===========
 *
diff --git a/lapack-netlib/SRC/DEPRECATED/cgelqs.c b/lapack-netlib/SRC/DEPRECATED/cgelqs.c
new file mode 100644
index 000000000..ee6d56119
--- /dev/null
+++ b/lapack-netlib/SRC/DEPRECATED/cgelqs.c
@@ -0,0 +1,479 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static complex c_b1 = {0.f,0.f};
+static complex c_b2 = {1.f,0.f};
+
+/* > \brief \b CGELQS */
+
+/*  =========== DOCUMENTATION =========== */
+
+/* Online html documentation available at */
+/*            http://www.netlib.org/lapack/explore-html/ */
+
+/*  Definition: */
+/*  =========== */
+
+/*       SUBROUTINE CGELQS( M, N, NRHS, A, LDA, TAU, B, LDB, WORK, LWORK, */
+/*                          INFO ) */
+
+/*       INTEGER            INFO, LDA, LDB, LWORK, M, N, NRHS */
+/*       COMPLEX            A( LDA, * ), B( LDB, * ), TAU( * ), */
+/*      $                   WORK( LWORK ) */
+
+
+/* > \par Purpose: */
+/*  ============= */
+/* > */
+/* > \verbatim */
+/* > */
+/* > Compute a minimum-norm solution */
+/* >     min || A*X - B || */
+/* > using the LQ factorization */
+/* >     A = L*Q */
+/* > computed by CGELQF. */
+/* > \endverbatim */
+
+/*  Arguments: */
+/*  ========== */
+
+/* > \param[in] M */
+/* > \verbatim */
+/* >          M is INTEGER */
+/* >          The number of rows of the matrix A.  M >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] N */
+/* > \verbatim */
+/* >          N is INTEGER */
+/* >          The number of columns of the matrix A.  N >= M >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] NRHS */
+/* > \verbatim */
+/* >          NRHS is INTEGER */
+/* >          The number of columns of B.  NRHS >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] A */
+/* > \verbatim */
+/* >          A is COMPLEX array, dimension (LDA,N) */
+/* >          Details of the LQ factorization of the original matrix A as */
+/* >          returned by CGELQF. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LDA */
+/* > \verbatim */
+/* >          LDA is INTEGER */
+/* >          The leading dimension of the array A.  LDA >= M. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] TAU */
+/* > \verbatim */
+/* >          TAU is COMPLEX array, dimension (M) */
+/* >          Details of the orthogonal matrix Q. */
+/* > \endverbatim */
+/* > */
+/* > \param[in,out] B */
+/* > \verbatim */
+/* >          B is COMPLEX array, dimension (LDB,NRHS) */
+/* >          On entry, the m-by-nrhs right hand side matrix B. */
+/* >          On exit, the n-by-nrhs solution matrix X. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LDB */
+/* > \verbatim */
+/* >          LDB is INTEGER */
+/* >          The leading dimension of the array B. LDB >= N. */
+/* > \endverbatim */
+/* > */
+/* > \param[out] WORK */
+/* > \verbatim */
+/* >          WORK is COMPLEX array, dimension (LWORK) */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LWORK */
+/* > \verbatim */
+/* >          LWORK is INTEGER */
+/* >          The length of the array WORK.  LWORK must be at least NRHS, */
+/* >          and should be at least NRHS*NB, where NB is the block size */
+/* >          for this environment. */
+/* > \endverbatim */
+/* > */
+/* > \param[out] INFO */
+/* > \verbatim */
+/* >          INFO is INTEGER */
+/* >          = 0: successful exit */
+/* >          < 0: if INFO = -i, the i-th argument had an illegal value */
+/* > \endverbatim */
+
+/*  Authors: */
+/*  ======== */
+
+/* > \author Univ. of Tennessee */
+/* > \author Univ. of California Berkeley */
+/* > \author Univ. of Colorado Denver */
+/* > \author NAG Ltd. */
+
+/* > \ingroup complex_lin */
+
+/*  ===================================================================== */
+/* Subroutine */ int cgelqs_(integer *m, integer *n, integer *nrhs, complex *
+	a, integer *lda, complex *tau, complex *b, integer *ldb, complex *
+	work, integer *lwork, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, i__1;
+
+    /* External BLAS/LAPACK routines called below (prototypes only, no local data) */
+    extern /* Subroutine */ int ctrsm_(char *, char *, char *, char *, 
+	    integer *, integer *, complex *, complex *, integer *, complex *, 
+	    integer *), claset_(char *, 
+	    integer *, integer *, complex *, complex *, complex *, integer *), xerbla_(char *, integer *), cunmlq_(char *, char 
+	    *, integer *, integer *, integer *, complex *, integer *, complex 
+	    *, complex *, integer *, complex *, integer *, integer *);
+
+/*     Flow: check arguments, solve with L (ctrsm_), zero rows m+1..n of B (claset_), apply Q**H (cunmlq_). */
+/*  -- LAPACK test routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+
+/*  ===================================================================== */
+
+
+/*     Test the input parameters. */
+
+    /* Parameter adjustments: rebase the pointers so the 1-based Fortran-style indices used below are valid */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --tau;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    --work;
+
+    /* Function Body: a failed check stores minus the 1-based position of the offending argument in *info */
+    *info = 0;
+    if (*m < 0) {
+	*info = -1;
+    } else if (*n < 0 || *m > *n) {
+	*info = -2;
+    } else if (*nrhs < 0) {
+	*info = -3;
+    } else if (*lda < f2cmax(1,*m)) {
+	*info = -5;
+    } else if (*ldb < f2cmax(1,*n)) {
+	*info = -8;
+    } else if (*lwork < 1 || *lwork < *nrhs && *m > 0 && *n > 0) {
+	*info = -10;
+    }
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("CGELQS", &i__1);
+	return 0;
+    }
+
+/*     Quick return if possible: nothing to solve when any of n, nrhs, m is zero */
+
+    if (*n == 0 || *nrhs == 0 || *m == 0) {
+	return 0;
+    }
+
+/*     Solve L*X = B(1:m,:), using the lower triangle of A with a non-unit diagonal */
+
+    ctrsm_("Left", "Lower", "No transpose", "Non-unit", m, nrhs, &c_b2, &a[
+	    a_offset], lda, &b[b_offset], ldb);
+
+/*     Set B(m+1:n,:) to zero (only when m < n; &b[*m + 1 + b_dim1] is B(m+1,1)) */
+
+    if (*m < *n) {
+	i__1 = *n - *m;
+	claset_("Full", &i__1, nrhs, &c_b1, &c_b1, &b[*m + 1 + b_dim1], ldb);
+    }
+
+/*     B := Q**H * B  (conjugate transpose of Q applied from the left) */
+
+    cunmlq_("Left", "Conjugate transpose", n, nrhs, m, &a[a_offset], lda, &
+	    tau[1], &b[b_offset], ldb, &work[1], lwork, info);
+/*     info is handed to cunmlq_ (presumably for its own status); the C return value is always 0 */
+    return 0;
+
+/*     End of CGELQS */
+
+} /* cgelqs_ */
+
diff --git a/lapack-netlib/TESTING/LIN/cgelqs.f b/lapack-netlib/SRC/DEPRECATED/cgelqs.f
similarity index 100%
rename from lapack-netlib/TESTING/LIN/cgelqs.f
rename to lapack-netlib/SRC/DEPRECATED/cgelqs.f
diff --git a/lapack-netlib/SRC/DEPRECATED/cgelsx.c b/lapack-netlib/SRC/DEPRECATED/cgelsx.c
index bdc395acf..cb3c33323 100644
--- a/lapack-netlib/SRC/DEPRECATED/cgelsx.c
+++ b/lapack-netlib/SRC/DEPRECATED/cgelsx.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/cgeqpf.c b/lapack-netlib/SRC/DEPRECATED/cgeqpf.c
index 6c810fc79..12f8e5c76 100644
--- a/lapack-netlib/SRC/DEPRECATED/cgeqpf.c
+++ b/lapack-netlib/SRC/DEPRECATED/cgeqpf.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,7 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/cgeqrs.c b/lapack-netlib/SRC/DEPRECATED/cgeqrs.c
new file mode 100644
index 000000000..c71b8af67
--- /dev/null
+++ b/lapack-netlib/SRC/DEPRECATED/cgeqrs.c
@@ -0,0 +1,471 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static complex c_b1 = {1.f,0.f};
+
+/* > \brief \b CGEQRS */
+
+/*  =========== DOCUMENTATION =========== */
+
+/* Online html documentation available at */
+/*            http://www.netlib.org/lapack/explore-html/ */
+
+/*  Definition: */
+/*  =========== */
+
+/*       SUBROUTINE CGEQRS( M, N, NRHS, A, LDA, TAU, B, LDB, WORK, LWORK, */
+/*                          INFO ) */
+
+/*       INTEGER            INFO, LDA, LDB, LWORK, M, N, NRHS */
+/*       COMPLEX            A( LDA, * ), B( LDB, * ), TAU( * ), */
+/*      $                   WORK( LWORK ) */
+
+
+/* > \par Purpose: */
+/*  ============= */
+/* > */
+/* > \verbatim */
+/* > */
+/* > Solve the least squares problem */
+/* >     min || A*X - B || */
+/* > using the QR factorization */
+/* >     A = Q*R */
+/* > computed by CGEQRF. */
+/* > \endverbatim */
+
+/*  Arguments: */
+/*  ========== */
+
+/* > \param[in] M */
+/* > \verbatim */
+/* >          M is INTEGER */
+/* >          The number of rows of the matrix A.  M >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] N */
+/* > \verbatim */
+/* >          N is INTEGER */
+/* >          The number of columns of the matrix A.  M >= N >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] NRHS */
+/* > \verbatim */
+/* >          NRHS is INTEGER */
+/* >          The number of columns of B.  NRHS >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] A */
+/* > \verbatim */
+/* >          A is COMPLEX array, dimension (LDA,N) */
+/* >          Details of the QR factorization of the original matrix A as */
+/* >          returned by CGEQRF. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LDA */
+/* > \verbatim */
+/* >          LDA is INTEGER */
+/* >          The leading dimension of the array A.  LDA >= M. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] TAU */
+/* > \verbatim */
+/* >          TAU is COMPLEX array, dimension (N) */
+/* >          Details of the unitary matrix Q. */
+/* > \endverbatim */
+/* > */
+/* > \param[in,out] B */
+/* > \verbatim */
+/* >          B is COMPLEX array, dimension (LDB,NRHS) */
+/* >          On entry, the m-by-nrhs right hand side matrix B. */
+/* >          On exit, the n-by-nrhs solution matrix X. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LDB */
+/* > \verbatim */
+/* >          LDB is INTEGER */
+/* >          The leading dimension of the array B. LDB >= M. */
+/* > \endverbatim */
+/* > */
+/* > \param[out] WORK */
+/* > \verbatim */
+/* >          WORK is COMPLEX array, dimension (LWORK) */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LWORK */
+/* > \verbatim */
+/* >          LWORK is INTEGER */
+/* >          The length of the array WORK.  LWORK must be at least NRHS, */
+/* >          and should be at least NRHS*NB, where NB is the block size */
+/* >          for this environment. */
+/* > \endverbatim */
+/* > */
+/* > \param[out] INFO */
+/* > \verbatim */
+/* >          INFO is INTEGER */
+/* >          = 0: successful exit */
+/* >          < 0: if INFO = -i, the i-th argument had an illegal value */
+/* > \endverbatim */
+
+/*  Authors: */
+/*  ======== */
+
+/* > \author Univ. of Tennessee */
+/* > \author Univ. of California Berkeley */
+/* > \author Univ. of Colorado Denver */
+/* > \author NAG Ltd. */
+
+/* > \ingroup complex_lin */
+
+/*  ===================================================================== */
+/* Subroutine */ int cgeqrs_(integer *m, integer *n, integer *nrhs, complex *
+	a, integer *lda, complex *tau, complex *b, integer *ldb, complex *
+	work, integer *lwork, integer *info)
+{
+    /* System generated locals: array strides/offsets and a scratch integer */
+    integer a_dim1, a_offset, b_dim1, b_offset, i__1;
+
+    /* External routines called below (declared here, defined elsewhere) */
+    extern /* Subroutine */ int ctrsm_(char *, char *, char *, char *, 
+	    integer *, integer *, complex *, complex *, integer *, complex *, 
+	    integer *), xerbla_(char *, 
+	    integer *), cunmqr_(char *, char *, integer *, integer *, 
+	    integer *, complex *, integer *, complex *, complex *, integer *, 
+	    complex *, integer *, integer *);
+/*  Applies Q**H to B with cunmqr_, then solves R*X = B(1:n,:) with ctrsm_. */
+/*  Every exit path returns 0; the status is reported through *info. */
+/*  -- LAPACK test routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- */
+
+
+/*  ===================================================================== */
+
+
+/*     Test the input arguments. */
+
+    /* Parameter adjustments: rebase the pointers for 1-based indexing, */
+    /* so &a[a_offset], &tau[1], &b[b_offset], &work[1] are the originals */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --tau;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    --work;
+
+    /* Function Body: *info = -k flags the k-th argument as illegal */
+    *info = 0;
+    if (*m < 0) {
+	*info = -1;
+    } else if (*n < 0 || *n > *m) {
+	*info = -2;
+    } else if (*nrhs < 0) {
+	*info = -3;
+    } else if (*lda < f2cmax(1,*m)) {
+	*info = -5;
+    } else if (*ldb < f2cmax(1,*m)) {
+	*info = -8;
+    } else if (*lwork < 1 || *lwork < *nrhs && *m > 0 && *n > 0) { /* && binds tighter than || */
+	*info = -10;
+    }
+    if (*info != 0) {
+	i__1 = -(*info); /* xerbla_ receives the positive argument index */
+	xerbla_("CGEQRS", &i__1);
+	return 0;
+    }
+
+/*     Quick return if any of n, nrhs or m is zero */
+
+    if (*n == 0 || *nrhs == 0 || *m == 0) {
+	return 0;
+    }
+
+/*     B := Q**H * B  (Q applied from the left, conjugate-transposed) */
+
+    cunmqr_("Left", "Conjugate transpose", m, nrhs, n, &a[a_offset], lda, &
+	    tau[1], &b[b_offset], ldb, &work[1], lwork, info);
+/*     Note: *info as set by cunmqr_ is not inspected before the solve below. */
+/*     Solve R*X = B(1:n,:); ctrsm_ is told to use the upper triangle of A */
+
+    ctrsm_("Left", "Upper", "No transpose", "Non-unit", n, nrhs, &c_b1, &a[
+	    a_offset], lda, &b[b_offset], ldb);
+
+    return 0;
+
+/*     End of CGEQRS */
+
+} /* cgeqrs_ */
+
diff --git a/lapack-netlib/TESTING/LIN/cgeqrs.f b/lapack-netlib/SRC/DEPRECATED/cgeqrs.f
similarity index 100%
rename from lapack-netlib/TESTING/LIN/cgeqrs.f
rename to lapack-netlib/SRC/DEPRECATED/cgeqrs.f
diff --git a/lapack-netlib/SRC/DEPRECATED/cggsvd.c b/lapack-netlib/SRC/DEPRECATED/cggsvd.c
index d6f7b5903..1a8a827c1 100644
--- a/lapack-netlib/SRC/DEPRECATED/cggsvd.c
+++ b/lapack-netlib/SRC/DEPRECATED/cggsvd.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,248 +260,8 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
+/*
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
diff --git a/lapack-netlib/SRC/DEPRECATED/cggsvd.f b/lapack-netlib/SRC/DEPRECATED/cggsvd.f
index e6fdb47e5..515ac8d49 100644
--- a/lapack-netlib/SRC/DEPRECATED/cggsvd.f
+++ b/lapack-netlib/SRC/DEPRECATED/cggsvd.f
@@ -107,7 +107,7 @@
 *> In particular, if B is an N-by-N nonsingular matrix, then the GSVD of
 *> A and B implicitly gives the SVD of A*inv(B):
 *>                      A*inv(B) = U*(D1*inv(D2))*V**H.
-*> If ( A**H,B**H)**H has orthnormal columns, then the GSVD of A and B is also
+*> If ( A**H,B**H)**H has orthonormal columns, then the GSVD of A and B is also
 *> equal to the CS decomposition of A and B. Furthermore, the GSVD can
 *> be used to derive the solution of the eigenvalue problem:
 *>                      A**H*A x = lambda* B**H*B x.
diff --git a/lapack-netlib/SRC/DEPRECATED/cggsvp.c b/lapack-netlib/SRC/DEPRECATED/cggsvp.c
index d29338f73..3bf587172 100644
--- a/lapack-netlib/SRC/DEPRECATED/cggsvp.c
+++ b/lapack-netlib/SRC/DEPRECATED/cggsvp.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/clahrd.c b/lapack-netlib/SRC/DEPRECATED/clahrd.c
index ea62964f8..517e4786b 100644
--- a/lapack-netlib/SRC/DEPRECATED/clahrd.c
+++ b/lapack-netlib/SRC/DEPRECATED/clahrd.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/SRC/DEPRECATED/clatzm.c b/lapack-netlib/SRC/DEPRECATED/clatzm.c
index 30393dc26..747f5bc11 100644
--- a/lapack-netlib/SRC/DEPRECATED/clatzm.c
+++ b/lapack-netlib/SRC/DEPRECATED/clatzm.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/SRC/DEPRECATED/ctzrqf.c b/lapack-netlib/SRC/DEPRECATED/ctzrqf.c
index 537e77dc6..3986f7854 100644
--- a/lapack-netlib/SRC/DEPRECATED/ctzrqf.c
+++ b/lapack-netlib/SRC/DEPRECATED/ctzrqf.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,252 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
 
 
 
diff --git a/lapack-netlib/SRC/DEPRECATED/dgegs.c b/lapack-netlib/SRC/DEPRECATED/dgegs.c
index 70fb1aa3b..f034821a4 100644
--- a/lapack-netlib/SRC/DEPRECATED/dgegs.c
+++ b/lapack-netlib/SRC/DEPRECATED/dgegs.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,8 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
+
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/dgegs.f b/lapack-netlib/SRC/DEPRECATED/dgegs.f
index 0ac0112c2..02e9fdcb2 100644
--- a/lapack-netlib/SRC/DEPRECATED/dgegs.f
+++ b/lapack-netlib/SRC/DEPRECATED/dgegs.f
@@ -1,4 +1,4 @@
-*> \brief <b> DGEEVX computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices</b>
+*> \brief <b> DGEGS computes the eigenvalues, real Schur form, and, optionally, the left and/or right Schur vectors of a real matrix pair (A,B)</b>
 *
 *  =========== DOCUMENTATION ===========
 *
diff --git a/lapack-netlib/SRC/DEPRECATED/dgegv.c b/lapack-netlib/SRC/DEPRECATED/dgegv.c
index c52b4b7d2..38df37116 100644
--- a/lapack-netlib/SRC/DEPRECATED/dgegv.c
+++ b/lapack-netlib/SRC/DEPRECATED/dgegv.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/dgegv.f b/lapack-netlib/SRC/DEPRECATED/dgegv.f
index 7e81c85a9..0b5c48922 100644
--- a/lapack-netlib/SRC/DEPRECATED/dgegv.f
+++ b/lapack-netlib/SRC/DEPRECATED/dgegv.f
@@ -1,4 +1,4 @@
-*> \brief <b> DGEEVX computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices</b>
+*> \brief <b> DGEGV computes the eigenvalues and, optionally, the left and/or right eigenvectors of a real matrix pair (A,B).</b>
 *
 *  =========== DOCUMENTATION ===========
 *
diff --git a/lapack-netlib/SRC/DEPRECATED/dgelqs.c b/lapack-netlib/SRC/DEPRECATED/dgelqs.c
new file mode 100644
index 000000000..e3cf1e029
--- /dev/null
+++ b/lapack-netlib/SRC/DEPRECATED/dgelqs.c
@@ -0,0 +1,480 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static doublereal c_b7 = 1.;
+static doublereal c_b9 = 0.;
+
+/* > \brief \b DGELQS */
+
+/*  =========== DOCUMENTATION =========== */
+
+/* Online html documentation available at */
+/*            http://www.netlib.org/lapack/explore-html/ */
+
+/*  Definition: */
+/*  =========== */
+
+/*       SUBROUTINE DGELQS( M, N, NRHS, A, LDA, TAU, B, LDB, WORK, LWORK, */
+/*                          INFO ) */
+
+/*       INTEGER            INFO, LDA, LDB, LWORK, M, N, NRHS */
+/*       DOUBLE PRECISION   A( LDA, * ), B( LDB, * ), TAU( * ), */
+/*      $                   WORK( LWORK ) */
+
+
+/* > \par Purpose: */
+/*  ============= */
+/* > */
+/* > \verbatim */
+/* > */
+/* > Compute a minimum-norm solution */
+/* >     min || A*X - B || */
+/* > using the LQ factorization */
+/* >     A = L*Q */
+/* > computed by DGELQF. */
+/* > \endverbatim */
+
+/*  Arguments: */
+/*  ========== */
+
+/* > \param[in] M */
+/* > \verbatim */
+/* >          M is INTEGER */
+/* >          The number of rows of the matrix A.  M >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] N */
+/* > \verbatim */
+/* >          N is INTEGER */
+/* >          The number of columns of the matrix A.  N >= M >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] NRHS */
+/* > \verbatim */
+/* >          NRHS is INTEGER */
+/* >          The number of columns of B.  NRHS >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] A */
+/* > \verbatim */
+/* >          A is DOUBLE PRECISION array, dimension (LDA,N) */
+/* >          Details of the LQ factorization of the original matrix A as */
+/* >          returned by DGELQF. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LDA */
+/* > \verbatim */
+/* >          LDA is INTEGER */
+/* >          The leading dimension of the array A.  LDA >= M. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] TAU */
+/* > \verbatim */
+/* >          TAU is DOUBLE PRECISION array, dimension (M) */
+/* >          Details of the orthogonal matrix Q. */
+/* > \endverbatim */
+/* > */
+/* > \param[in,out] B */
+/* > \verbatim */
+/* >          B is DOUBLE PRECISION array, dimension (LDB,NRHS) */
+/* >          On entry, the m-by-nrhs right hand side matrix B. */
+/* >          On exit, the n-by-nrhs solution matrix X. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LDB */
+/* > \verbatim */
+/* >          LDB is INTEGER */
+/* >          The leading dimension of the array B. LDB >= N. */
+/* > \endverbatim */
+/* > */
+/* > \param[out] WORK */
+/* > \verbatim */
+/* >          WORK is DOUBLE PRECISION array, dimension (LWORK) */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LWORK */
+/* > \verbatim */
+/* >          LWORK is INTEGER */
+/* >          The length of the array WORK.  LWORK must be at least NRHS, */
+/* >          and should be at least NRHS*NB, where NB is the block size */
+/* >          for this environment. */
+/* > \endverbatim */
+/* > */
+/* > \param[out] INFO */
+/* > \verbatim */
+/* >          INFO is INTEGER */
+/* >          = 0: successful exit */
+/* >          < 0: if INFO = -i, the i-th argument had an illegal value */
+/* > \endverbatim */
+
+/*  Authors: */
+/*  ======== */
+
+/* > \author Univ. of Tennessee */
+/* > \author Univ. of California Berkeley */
+/* > \author Univ. of Colorado Denver */
+/* > \author NAG Ltd. */
+
+/* > \ingroup double_lin */
+
+/*  ===================================================================== */
+/* Subroutine */ int dgelqs_(integer *m, integer *n, integer *nrhs, 
+	doublereal *a, integer *lda, doublereal *tau, doublereal *b, integer *
+	ldb, doublereal *work, integer *lwork, integer *info)
+{
+    /* System generated locals: leading dimensions, index offsets, scratch integer */
+    integer a_dim1, a_offset, b_dim1, b_offset, i__1;
+
+    /* External routines called below (all arguments are passed by pointer) */
+    extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *, 
+	    integer *, integer *, doublereal *, doublereal *, integer *, 
+	    doublereal *, integer *), dlaset_(
+	    char *, integer *, integer *, doublereal *, doublereal *, 
+	    doublereal *, integer *), xerbla_(char *, integer *), dormlq_(char *, char *, integer *, integer *, integer *, 
+	    doublereal *, integer *, doublereal *, doublereal *, integer *, 
+	    doublereal *, integer *, integer *);
+/*  Outline: validate arguments, solve L*X = B(1:m,:) with dtrsm_, zero */
+/*  B(m+1:n,:) with dlaset_, then form B := Q' * B with dormlq_. Always returns 0. */
+/*  -- LAPACK test routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- */
+
+
+/*  ===================================================================== */
+
+
+/*     Test the input parameters. */
+
+    /* Parameter adjustments: bias the pointers so Fortran-style 1-based indices apply */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --tau;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    --work;
+
+    /* Function Body: *info = -k reports that the k-th argument is illegal */
+    *info = 0;
+    if (*m < 0) {
+	*info = -1;
+    } else if (*n < 0 || *m > *n) {
+	*info = -2;
+    } else if (*nrhs < 0) {
+	*info = -3;
+    } else if (*lda < f2cmax(1,*m)) {
+	*info = -5;
+    } else if (*ldb < f2cmax(1,*n)) {
+	*info = -8;
+    } else if (*lwork < 1 || *lwork < *nrhs && *m > 0 && *n > 0) {
+	*info = -10;
+    }
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("DGELQS", &i__1);
+	return 0;
+    }
+
+/*     Quick return if any dimension is zero (nothing to solve) */
+
+    if (*n == 0 || *nrhs == 0 || *m == 0) {
+	return 0;
+    }
+
+/*     Solve L*X = B(1:m,:); &a[a_offset] and &b[b_offset] are the (1,1) elements */
+
+    dtrsm_("Left", "Lower", "No transpose", "Non-unit", m, nrhs, &c_b7, &a[
+	    a_offset], lda, &b[b_offset], ldb);
+
+/*     Set B(m+1:n,:) to zero */
+/*     (&b[*m + 1 + b_dim1] is element B(m+1,1) after the pointer bias above) */
+    if (*m < *n) {
+	i__1 = *n - *m;
+	dlaset_("Full", &i__1, nrhs, &c_b9, &c_b9, &b[*m + 1 + b_dim1], ldb);
+    }
+
+/*     B := Q' * B  (info is handed to dormlq_, which may overwrite it) */
+
+    dormlq_("Left", "Transpose", n, nrhs, m, &a[a_offset], lda, &tau[1], &b[
+	    b_offset], ldb, &work[1], lwork, info);
+
+    return 0;
+
+/*     End of DGELQS */
+
+} /* dgelqs_ */
+
diff --git a/lapack-netlib/TESTING/LIN/dgelqs.f b/lapack-netlib/SRC/DEPRECATED/dgelqs.f
similarity index 100%
rename from lapack-netlib/TESTING/LIN/dgelqs.f
rename to lapack-netlib/SRC/DEPRECATED/dgelqs.f
diff --git a/lapack-netlib/SRC/DEPRECATED/dgelsx.c b/lapack-netlib/SRC/DEPRECATED/dgelsx.c
index 86977c94e..5abeebe84 100644
--- a/lapack-netlib/SRC/DEPRECATED/dgelsx.c
+++ b/lapack-netlib/SRC/DEPRECATED/dgelsx.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/dgeqpf.c b/lapack-netlib/SRC/DEPRECATED/dgeqpf.c
index 94a431523..21c6a5c7d 100644
--- a/lapack-netlib/SRC/DEPRECATED/dgeqpf.c
+++ b/lapack-netlib/SRC/DEPRECATED/dgeqpf.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,248 +260,8 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
+/*
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
diff --git a/lapack-netlib/SRC/DEPRECATED/dgeqrs.c b/lapack-netlib/SRC/DEPRECATED/dgeqrs.c
new file mode 100644
index 000000000..70236738a
--- /dev/null
+++ b/lapack-netlib/SRC/DEPRECATED/dgeqrs.c
@@ -0,0 +1,471 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER /* MSVC branch: complex values are the _Fcomplex/_Dcomplex structs */
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} /* copy *z into a native float complex, by value */
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} /* copy *z into a native double complex, by value */
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} /* reinterpret the f2c struct pointer as a native pointer (no copy) */
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} /* double-precision counterpart of _pCf */
+#else /* other compilers: use the C99 _Complex types */
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} /* build r + i*I from the f2c struct, by value */
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} /* double-precision counterpart of Cf */
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} /* reinterpret the f2c struct pointer as a native pointer (no copy) */
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} /* double-precision counterpart of _pCf */
+#endif
+#define pCf(z) (*_pCf(z)) /* lvalue view of *z as a native float complex; used as an assignment target by the macros below */
+#define pCd(z) (*_pCd(z)) /* lvalue view of *z as a native double complex */
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER /* no _Complex operators here: divide on the struct fields with Smith's algorithm; operands are read into locals first so c may alias a or b */
+#define c_div(c, a, b) { real __ar=(a)->r, __ai=(a)->i, __br=(b)->r, __bi=(b)->i, __t, __d; if (fabs(__br) >= fabs(__bi)) { __t=__bi/__br; __d=__br+__bi*__t; (c)->r=(__ar+__ai*__t)/__d; (c)->i=(__ai-__ar*__t)/__d; } else { __t=__br/__bi; __d=__bi+__br*__t; (c)->r=(__ar*__t+__ai)/__d; (c)->i=(__ai*__t-__ar)/__d; } }
+#define z_div(c, a, b) { doublereal __ar=(a)->r, __ai=(a)->i, __br=(b)->r, __bi=(b)->i, __t, __d; if (fabs(__br) >= fabs(__bi)) { __t=__bi/__br; __d=__br+__bi*__t; (c)->r=(__ar+__ai*__t)/__d; (c)->i=(__ai-__ar*__t)/__d; } else { __t=__br/__bi; __d=__bi+__br*__t; (c)->r=(__ar*__t+__ai)/__d; (c)->i=(__ai*__t-__ar)/__d; } }
+#else /* C99 compilers: native _Complex division, stored through the pCf/pCd lvalue view of *c */
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static doublereal c_b9 = 1.;
+
+/* > \brief \b DGEQRS */
+
+/*  =========== DOCUMENTATION =========== */
+
+/* Online html documentation available at */
+/*            http://www.netlib.org/lapack/explore-html/ */
+
+/*  Definition: */
+/*  =========== */
+
+/*       SUBROUTINE DGEQRS( M, N, NRHS, A, LDA, TAU, B, LDB, WORK, LWORK, */
+/*                          INFO ) */
+
+/*       INTEGER            INFO, LDA, LDB, LWORK, M, N, NRHS */
+/*       DOUBLE PRECISION   A( LDA, * ), B( LDB, * ), TAU( * ), */
+/*      $                   WORK( LWORK ) */
+
+
+/* > \par Purpose: */
+/*  ============= */
+/* > */
+/* > \verbatim */
+/* > */
+/* > Solve the least squares problem */
+/* >     min || A*X - B || */
+/* > using the QR factorization */
+/* >     A = Q*R */
+/* > computed by DGEQRF. */
+/* > \endverbatim */
+
+/*  Arguments: */
+/*  ========== */
+
+/* > \param[in] M */
+/* > \verbatim */
+/* >          M is INTEGER */
+/* >          The number of rows of the matrix A.  M >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] N */
+/* > \verbatim */
+/* >          N is INTEGER */
+/* >          The number of columns of the matrix A.  M >= N >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] NRHS */
+/* > \verbatim */
+/* >          NRHS is INTEGER */
+/* >          The number of columns of B.  NRHS >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] A */
+/* > \verbatim */
+/* >          A is DOUBLE PRECISION array, dimension (LDA,N) */
+/* >          Details of the QR factorization of the original matrix A as */
+/* >          returned by DGEQRF. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LDA */
+/* > \verbatim */
+/* >          LDA is INTEGER */
+/* >          The leading dimension of the array A.  LDA >= M. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] TAU */
+/* > \verbatim */
+/* >          TAU is DOUBLE PRECISION array, dimension (N) */
+/* >          Details of the orthogonal matrix Q. */
+/* > \endverbatim */
+/* > */
+/* > \param[in,out] B */
+/* > \verbatim */
+/* >          B is DOUBLE PRECISION array, dimension (LDB,NRHS) */
+/* >          On entry, the m-by-nrhs right hand side matrix B. */
+/* >          On exit, the n-by-nrhs solution matrix X. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LDB */
+/* > \verbatim */
+/* >          LDB is INTEGER */
+/* >          The leading dimension of the array B. LDB >= M. */
+/* > \endverbatim */
+/* > */
+/* > \param[out] WORK */
+/* > \verbatim */
+/* >          WORK is DOUBLE PRECISION array, dimension (LWORK) */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LWORK */
+/* > \verbatim */
+/* >          LWORK is INTEGER */
+/* >          The length of the array WORK.  LWORK must be at least NRHS, */
+/* >          and should be at least NRHS*NB, where NB is the block size */
+/* >          for this environment. */
+/* > \endverbatim */
+/* > */
+/* > \param[out] INFO */
+/* > \verbatim */
+/* >          INFO is INTEGER */
+/* >          = 0: successful exit */
+/* >          < 0: if INFO = -i, the i-th argument had an illegal value */
+/* > \endverbatim */
+
+/*  Authors: */
+/*  ======== */
+
+/* > \author Univ. of Tennessee */
+/* > \author Univ. of California Berkeley */
+/* > \author Univ. of Colorado Denver */
+/* > \author NAG Ltd. */
+
+/* > \ingroup double_lin */
+
+/*  ===================================================================== */
+/* Subroutine */ int dgeqrs_(integer *m, integer *n, integer *nrhs, 
+	doublereal *a, integer *lda, doublereal *tau, doublereal *b, integer *
+	ldb, doublereal *work, integer *lwork, integer *info)
+{
+    /* f2c locals: leading dimensions, index offsets, and a temp for xerbla_ */
+    integer a_dim1, a_offset, b_dim1, b_offset, i__1;
+
+    /* External routines called below (declared here, defined elsewhere) */
+    extern /* Subroutine */ int dtrsm_(char *, char *, char *, char *, 
+	    integer *, integer *, doublereal *, doublereal *, integer *, 
+	    doublereal *, integer *), xerbla_(
+	    char *, integer *), dormqr_(char *, char *, integer *, 
+	    integer *, integer *, doublereal *, integer *, doublereal *, 
+	    doublereal *, integer *, doublereal *, integer *, integer *);
+
+
+/*  -- LAPACK test routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+/*  Solves min || A*X - B || given the QR factors of A: first B := Q' * B, */
+/*  then the triangular system R*X = B(1:n,:). Always returns 0; status is in *info. */
+/*  ===================================================================== */
+
+
+/*     Test the input arguments. */
+
+    /* Parameter adjustments: rebase pointers so 1-based Fortran indices apply */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --tau;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    --work;
+
+    /* Function Body: *info = -k reports that the k-th argument is invalid */
+    *info = 0;
+    if (*m < 0) {
+	*info = -1;
+    } else if (*n < 0 || *n > *m) {
+	*info = -2;
+    } else if (*nrhs < 0) {
+	*info = -3;
+    } else if (*lda < f2cmax(1,*m)) {
+	*info = -5;
+    } else if (*ldb < f2cmax(1,*m)) {
+	*info = -8;
+    } else if (*lwork < 1 || *lwork < *nrhs && *m > 0 && *n > 0) { /* && binds tighter than ||: LWORK < NRHS is an error only when M > 0 and N > 0 */
+	*info = -10;
+    }
+    if (*info != 0) {
+	i__1 = -(*info); /* xerbla_ receives the positive argument index */
+	xerbla_("DGEQRS", &i__1);
+	return 0;
+    }
+
+/*     Quick return if possible */
+
+    if (*n == 0 || *nrhs == 0 || *m == 0) {
+	return 0;
+    }
+
+/*     B := Q' * B */
+/*     (dormqr_ writes its own status into *info; it is not re-checked here) */
+    dormqr_("Left", "Transpose", m, nrhs, n, &a[a_offset], lda, &tau[1], &b[
+	    b_offset], ldb, &work[1], lwork, info);
+
+/*     Solve R*X = B(1:n,:) */
+/*     (c_b9 is the constant 1.0 passed as dtrsm_'s scalar argument) */
+    dtrsm_("Left", "Upper", "No transpose", "Non-unit", n, nrhs, &c_b9, &a[
+	    a_offset], lda, &b[b_offset], ldb);
+
+    return 0;
+
+/*     End of DGEQRS */
+
+} /* dgeqrs_ */
+
diff --git a/lapack-netlib/TESTING/LIN/dgeqrs.f b/lapack-netlib/SRC/DEPRECATED/dgeqrs.f
similarity index 100%
rename from lapack-netlib/TESTING/LIN/dgeqrs.f
rename to lapack-netlib/SRC/DEPRECATED/dgeqrs.f
diff --git a/lapack-netlib/SRC/DEPRECATED/dggsvd.c b/lapack-netlib/SRC/DEPRECATED/dggsvd.c
index 71a4010d4..e5993b833 100644
--- a/lapack-netlib/SRC/DEPRECATED/dggsvd.c
+++ b/lapack-netlib/SRC/DEPRECATED/dggsvd.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,7 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/dggsvp.c b/lapack-netlib/SRC/DEPRECATED/dggsvp.c
index 593d7e492..b3a53fad1 100644
--- a/lapack-netlib/SRC/DEPRECATED/dggsvp.c
+++ b/lapack-netlib/SRC/DEPRECATED/dggsvp.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/dlahrd.c b/lapack-netlib/SRC/DEPRECATED/dlahrd.c
index 8b74436c4..30f7a0f7a 100644
--- a/lapack-netlib/SRC/DEPRECATED/dlahrd.c
+++ b/lapack-netlib/SRC/DEPRECATED/dlahrd.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,251 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
 
 
 
diff --git a/lapack-netlib/SRC/DEPRECATED/dlatzm.c b/lapack-netlib/SRC/DEPRECATED/dlatzm.c
index 39a00abf5..698b09e85 100644
--- a/lapack-netlib/SRC/DEPRECATED/dlatzm.c
+++ b/lapack-netlib/SRC/DEPRECATED/dlatzm.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/SRC/DEPRECATED/dtzrqf.c b/lapack-netlib/SRC/DEPRECATED/dtzrqf.c
index a5cf3dcfe..ff47d4177 100644
--- a/lapack-netlib/SRC/DEPRECATED/dtzrqf.c
+++ b/lapack-netlib/SRC/DEPRECATED/dtzrqf.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,252 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
 
 
 
diff --git a/lapack-netlib/SRC/DEPRECATED/sgegs.c b/lapack-netlib/SRC/DEPRECATED/sgegs.c
index 48cce1ede..6ab9050e0 100644
--- a/lapack-netlib/SRC/DEPRECATED/sgegs.c
+++ b/lapack-netlib/SRC/DEPRECATED/sgegs.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,7 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/sgegs.f b/lapack-netlib/SRC/DEPRECATED/sgegs.f
index 2ed9ad942..11ecc67ac 100644
--- a/lapack-netlib/SRC/DEPRECATED/sgegs.f
+++ b/lapack-netlib/SRC/DEPRECATED/sgegs.f
@@ -1,4 +1,4 @@
-*> \brief <b> SGEGS computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices</b>
+*> \brief <b> SGEGS computes the eigenvalues, real Schur form, and, optionally, the left and/or right Schur vectors of a real matrix pair (A,B)</b>
 *
 *  =========== DOCUMENTATION ===========
 *
diff --git a/lapack-netlib/SRC/DEPRECATED/sgegv.c b/lapack-netlib/SRC/DEPRECATED/sgegv.c
index eb129ab65..39b2d6fb6 100644
--- a/lapack-netlib/SRC/DEPRECATED/sgegv.c
+++ b/lapack-netlib/SRC/DEPRECATED/sgegv.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/sgegv.f b/lapack-netlib/SRC/DEPRECATED/sgegv.f
index 7a179a499..97556e371 100644
--- a/lapack-netlib/SRC/DEPRECATED/sgegv.f
+++ b/lapack-netlib/SRC/DEPRECATED/sgegv.f
@@ -1,4 +1,4 @@
-*> \brief <b> SGEEVX computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices</b>
+*> \brief <b> SGEGV computes the eigenvalues and, optionally, the left and/or right eigenvectors of a real matrix pair (A,B).</b>
 *
 *  =========== DOCUMENTATION ===========
 *
diff --git a/lapack-netlib/SRC/DEPRECATED/sgelqs.c b/lapack-netlib/SRC/DEPRECATED/sgelqs.c
new file mode 100644
index 000000000..03034b0dc
--- /dev/null
+++ b/lapack-netlib/SRC/DEPRECATED/sgelqs.c
@@ -0,0 +1,472 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER /* MSVC branch: complex values are carried in the _Fcomplex/_Dcomplex types */
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else /* other compilers: build C99 _Complex values from the f2c {r,i} structs */
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif /* _MSC_VER */
+#define pCf(z) (*_pCf(z)) /* dereferences the cast pointer from _pCf, so the f2c struct can be assigned as a native complex */
+#define pCd(z) (*_pCd(z)) /* double-precision counterpart of pCf */
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+/* Table of constant values */
+
+static real c_b7 = 1.f;
+static real c_b9 = 0.f;
+
+/* > \brief \b SGELQS */
+
+/*  =========== DOCUMENTATION =========== */
+
+/* Online html documentation available at */
+/*            http://www.netlib.org/lapack/explore-html/ */
+
+/*  Definition: */
+/*  =========== */
+
+/*       SUBROUTINE SGELQS( M, N, NRHS, A, LDA, TAU, B, LDB, WORK, LWORK, */
+/*                          INFO ) */
+
+/*       INTEGER            INFO, LDA, LDB, LWORK, M, N, NRHS */
+/*       REAL               A( LDA, * ), B( LDB, * ), TAU( * ), */
+/*      $                   WORK( LWORK ) */
+
+
+/* > \par Purpose: */
+/*  ============= */
+/* > */
+/* > \verbatim */
+/* > */
+/* > Compute a minimum-norm solution */
+/* >     min || A*X - B || */
+/* > using the LQ factorization */
+/* >     A = L*Q */
+/* > computed by SGELQF. */
+/* > \endverbatim */
+
+/*  Arguments: */
+/*  ========== */
+
+/* > \param[in] M */
+/* > \verbatim */
+/* >          M is INTEGER */
+/* >          The number of rows of the matrix A.  M >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] N */
+/* > \verbatim */
+/* >          N is INTEGER */
+/* >          The number of columns of the matrix A.  N >= M >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] NRHS */
+/* > \verbatim */
+/* >          NRHS is INTEGER */
+/* >          The number of columns of B.  NRHS >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] A */
+/* > \verbatim */
+/* >          A is REAL array, dimension (LDA,N) */
+/* >          Details of the LQ factorization of the original matrix A as */
+/* >          returned by SGELQF. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LDA */
+/* > \verbatim */
+/* >          LDA is INTEGER */
+/* >          The leading dimension of the array A.  LDA >= M. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] TAU */
+/* > \verbatim */
+/* >          TAU is REAL array, dimension (M) */
+/* >          Details of the orthogonal matrix Q. */
+/* > \endverbatim */
+/* > */
+/* > \param[in,out] B */
+/* > \verbatim */
+/* >          B is REAL array, dimension (LDB,NRHS) */
+/* >          On entry, the m-by-nrhs right hand side matrix B. */
+/* >          On exit, the n-by-nrhs solution matrix X. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LDB */
+/* > \verbatim */
+/* >          LDB is INTEGER */
+/* >          The leading dimension of the array B. LDB >= N. */
+/* > \endverbatim */
+/* > */
+/* > \param[out] WORK */
+/* > \verbatim */
+/* >          WORK is REAL array, dimension (LWORK) */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LWORK */
+/* > \verbatim */
+/* >          LWORK is INTEGER */
+/* >          The length of the array WORK.  LWORK must be at least NRHS, */
+/* >          and should be at least NRHS*NB, where NB is the block size */
+/* >          for this environment. */
+/* > \endverbatim */
+/* > */
+/* > \param[out] INFO */
+/* > \verbatim */
+/* >          INFO is INTEGER */
+/* >          = 0: successful exit */
+/* >          < 0: if INFO = -i, the i-th argument had an illegal value */
+/* > \endverbatim */
+
+/*  Authors: */
+/*  ======== */
+
+/* > \author Univ. of Tennessee */
+/* > \author Univ. of California Berkeley */
+/* > \author Univ. of Colorado Denver */
+/* > \author NAG Ltd. */
+
+/* > \ingroup single_lin */
+
+/*  ===================================================================== */
+/* Subroutine */ int sgelqs_(integer *m, integer *n, integer *nrhs, real *a, 
+	integer *lda, real *tau, real *b, integer *ldb, real *work, integer *
+	lwork, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, i__1;
+
+    /* Local variables */
+    extern /* Subroutine */ int strsm_(char *, char *, char *, char *, 
+	    integer *, integer *, real *, real *, integer *, real *, integer *
+	    ), xerbla_(char *, integer *), slaset_(char *, integer *, integer *, real *, real *, 
+	    real *, integer *), sormlq_(char *, char *, integer *, 
+	    integer *, integer *, real *, integer *, real *, real *, integer *
+	    , real *, integer *, integer *);
+
+
+/*  -- LAPACK test routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. -- */
+/*  Outline: validate the arguments, solve L*X = B(1:m,:) with strsm_, zero */
+/*  rows m+1..n of B with slaset_, then apply Q' to B with sormlq_. */
+/*  ===================================================================== */
+/*  The function itself always returns 0; status is reported through *info */
+/*  (which is also handed to sormlq_, so that call may overwrite it). */
+/*     Test the input parameters. */
+
+    /* Parameter adjustments: shift the pointers so the 1-based indices used below (a[i + j*a_dim1], tau[i], work[i]) address the caller's arrays */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --tau;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    --work;
+
+    /* Function Body: *info = -k marks the k-th argument as invalid; only the first failing check is recorded */
+    *info = 0;
+    if (*m < 0) {
+	*info = -1;
+    } else if (*n < 0 || *m > *n) {
+	*info = -2;
+    } else if (*nrhs < 0) {
+	*info = -3;
+    } else if (*lda < f2cmax(1,*m)) {
+	*info = -5;
+    } else if (*ldb < f2cmax(1,*n)) {
+	*info = -8;
+    } else if (*lwork < 1 || *lwork < *nrhs && *m > 0 && *n > 0) { /* && binds tighter than ||: lwork < nrhs only matters when m > 0 and n > 0 */
+	*info = -10;
+    }
+    if (*info != 0) {
+	i__1 = -(*info); /* xerbla_ receives the positive argument position */
+	xerbla_("SGELQS", &i__1);
+	return 0;
+    }
+
+/*     Quick return if possible */
+
+    if (*n == 0 || *nrhs == 0 || *m == 0) {
+	return 0;
+    }
+
+/*     Solve L*X = B(1:m,:) */
+
+    strsm_("Left", "Lower", "No transpose", "Non-unit", m, nrhs, &c_b7, &a[
+	    a_offset], lda, &b[b_offset], ldb);
+
+/*     Set B(m+1:n,:) to zero */
+
+    if (*m < *n) {
+	i__1 = *n - *m; /* number of rows below row m to clear */
+	slaset_("Full", &i__1, nrhs, &c_b9, &c_b9, &b[*m + 1 + b_dim1], ldb);
+    }
+
+/*     B := Q' * B */
+
+    sormlq_("Left", "Transpose", n, nrhs, m, &a[a_offset], lda, &tau[1], &b[
+	    b_offset], ldb, &work[1], lwork, info);
+
+    return 0;
+
+/*     End of SGELQS */
+
+} /* sgelqs_ */
+
diff --git a/lapack-netlib/TESTING/LIN/sgelqs.f b/lapack-netlib/SRC/DEPRECATED/sgelqs.f
similarity index 100%
rename from lapack-netlib/TESTING/LIN/sgelqs.f
rename to lapack-netlib/SRC/DEPRECATED/sgelqs.f
diff --git a/lapack-netlib/SRC/DEPRECATED/sgelsx.c b/lapack-netlib/SRC/DEPRECATED/sgelsx.c
index a887910c9..b2c480481 100644
--- a/lapack-netlib/SRC/DEPRECATED/sgelsx.c
+++ b/lapack-netlib/SRC/DEPRECATED/sgelsx.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/sgeqpf.c b/lapack-netlib/SRC/DEPRECATED/sgeqpf.c
index ac672f804..67121341e 100644
--- a/lapack-netlib/SRC/DEPRECATED/sgeqpf.c
+++ b/lapack-netlib/SRC/DEPRECATED/sgeqpf.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,248 +260,8 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
+/*
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
diff --git a/lapack-netlib/SRC/DEPRECATED/sgeqrs.c b/lapack-netlib/SRC/DEPRECATED/sgeqrs.c
new file mode 100644
index 000000000..b593d0dc9
--- /dev/null
+++ b/lapack-netlib/SRC/DEPRECATED/sgeqrs.c
@@ -0,0 +1,470 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64) /* _WIN64 builds use long long for BLASLONG; all others use long */
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+/* blasint is BLASLONG when LAPACK_ILP64 is defined and plain int otherwise; blasabs matches its width. */
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x) /* NOTE: abs is itself #defined as a macro further down in this file */
+#endif
+/* The f2c-style scalar type names used by the translated routine follow. */
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex; /* two reals: r then i */
+typedef struct { doublereal r, i; } doublecomplex; /* two doublereals: r then i */
+#ifdef _MSC_VER /* this branch builds _Fcomplex/_Dcomplex values from the r/i structs */
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} /* returns a copy of *z */
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} /* returns a copy of *z */
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} /* pointer cast only, no copy */
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} /* pointer cast only, no copy */
+#else /* this branch uses the _Complex float/double types instead */
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} /* returns a copy of *z */
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} /* returns a copy of *z */
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} /* pointer cast only, no copy */
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} /* pointer cast only, no copy */
+#endif
+#define pCf(z) (*_pCf(z)) /* dereferenced cast: usable as an assignment target */
+#define pCd(z) (*_pCd(z)) /* dereferenced cast: usable as an assignment target */
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+/* Integer truth values; presumably what the logical types above hold (TODO confirm). */
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E; it falls back to plain extern unless already defined */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff: descriptor types only. sgeqrs_ in this file performs no I/O and uses none of them. */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/* cilist: external read, write */
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/* icilist: internal read, write */
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/* olist: open */
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/* cllist: close */
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/* alist: rewind, backspace, endfile */
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inlist: inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+/* One member per scalar type declared earlier in this file, all sharing the same storage. */
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; -- left disabled: this file declares no longint type */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+/* Presumably vars points at nvars Vardesc entries -- TODO confirm against the f2c runtime. */
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x)) /* function-like macro named abs; its argument is evaluated more than once */
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) /* arguments are evaluated more than once */
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b)) /* arguments are evaluated more than once */
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+/* Function-like macros; the c_/d_/r_/i_/pow_ forms take pointer arguments and dereference them. */
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER /* NOTE(review): this branch divides real and imaginary parts separately and assigns to the by-value result of Cf()/Cd(); confirm before relying on it */
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E)) /* NOTE: spow_ui, dpow_ui, zpow_ui and cpow_ui are not defined in this file, so the pow_?i macros only work while unused here */
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) /* compares at most min(c,d) characters */
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); } /* both arguments are ignored; exits with status 1 */
+#define s_stop(s, n) {exit(0);} /* both arguments are ignored; exits with status 0 */
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)} /* NOTE(review): expands to brace-wrapped text, not a plain expression */
+#define myhuge_(w) {HUGE_VAL} /* NOTE(review): expands to brace-wrapped text, not a plain expression */
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) /* NOTE: dmaxloc_ is not defined in this file */
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static real c_b9 = 1.f;
+
+/* > \brief \b SGEQRS */
+
+/*  =========== DOCUMENTATION =========== */
+
+/* Online html documentation available at */
+/*            http://www.netlib.org/lapack/explore-html/ */
+
+/*  Definition: */
+/*  =========== */
+
+/*       SUBROUTINE SGEQRS( M, N, NRHS, A, LDA, TAU, B, LDB, WORK, LWORK, */
+/*                          INFO ) */
+
+/*       INTEGER            INFO, LDA, LDB, LWORK, M, N, NRHS */
+/*       REAL               A( LDA, * ), B( LDB, * ), TAU( * ), */
+/*      $                   WORK( LWORK ) */
+
+
+/* > \par Purpose: */
+/*  ============= */
+/* > */
+/* > \verbatim */
+/* > */
+/* > Solve the least squares problem */
+/* >     min || A*X - B || */
+/* > using the QR factorization */
+/* >     A = Q*R */
+/* > computed by SGEQRF. */
+/* > \endverbatim */
+
+/*  Arguments: */
+/*  ========== */
+
+/* > \param[in] M */
+/* > \verbatim */
+/* >          M is INTEGER */
+/* >          The number of rows of the matrix A.  M >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] N */
+/* > \verbatim */
+/* >          N is INTEGER */
+/* >          The number of columns of the matrix A.  M >= N >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] NRHS */
+/* > \verbatim */
+/* >          NRHS is INTEGER */
+/* >          The number of columns of B.  NRHS >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] A */
+/* > \verbatim */
+/* >          A is REAL array, dimension (LDA,N) */
+/* >          Details of the QR factorization of the original matrix A as */
+/* >          returned by SGEQRF. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LDA */
+/* > \verbatim */
+/* >          LDA is INTEGER */
+/* >          The leading dimension of the array A.  LDA >= M. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] TAU */
+/* > \verbatim */
+/* >          TAU is REAL array, dimension (N) */
+/* >          Details of the orthogonal matrix Q. */
+/* > \endverbatim */
+/* > */
+/* > \param[in,out] B */
+/* > \verbatim */
+/* >          B is REAL array, dimension (LDB,NRHS) */
+/* >          On entry, the m-by-nrhs right hand side matrix B. */
+/* >          On exit, the n-by-nrhs solution matrix X. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LDB */
+/* > \verbatim */
+/* >          LDB is INTEGER */
+/* >          The leading dimension of the array B. LDB >= M. */
+/* > \endverbatim */
+/* > */
+/* > \param[out] WORK */
+/* > \verbatim */
+/* >          WORK is REAL array, dimension (LWORK) */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LWORK */
+/* > \verbatim */
+/* >          LWORK is INTEGER */
+/* >          The length of the array WORK.  LWORK must be at least NRHS, */
+/* >          and should be at least NRHS*NB, where NB is the block size */
+/* >          for this environment. */
+/* > \endverbatim */
+/* > */
+/* > \param[out] INFO */
+/* > \verbatim */
+/* >          INFO is INTEGER */
+/* >          = 0: successful exit */
+/* >          < 0: if INFO = -i, the i-th argument had an illegal value */
+/* > \endverbatim */
+
+/*  Authors: */
+/*  ======== */
+
+/* > \author Univ. of Tennessee */
+/* > \author Univ. of California Berkeley */
+/* > \author Univ. of Colorado Denver */
+/* > \author NAG Ltd. */
+
+/* > \ingroup single_lin */
+
+/*  ===================================================================== */
+/* Subroutine */ int sgeqrs_(integer *m, integer *n, integer *nrhs, real *a, 
+	integer *lda, real *tau, real *b, integer *ldb, real *work, integer *
+	lwork, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, b_dim1, b_offset, i__1;
+
+    /* Local variables */
+    extern /* Subroutine */ int strsm_(char *, char *, char *, char *, 
+	    integer *, integer *, real *, real *, integer *, real *, integer *
+	    ), xerbla_(char *, integer *), sormqr_(char *, char *, integer *, integer *, integer *, 
+	    real *, integer *, real *, real *, integer *, real *, integer *, 
+	    integer *);
+
+/*  The C return value is always 0; status is reported to the caller through *info. */
+/*  -- LAPACK test routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+
+/*  ===================================================================== */
+
+
+/*     Test the input arguments. */
+
+    /* Parameter adjustments: rebase the pointers so that &a[a_offset], &tau[1], &b[b_offset] and &work[1] address the caller's first elements */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --tau;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    --work;
+
+    /* Function Body */
+    *info = 0; /* set to -k below when the k-th argument of this routine is rejected */
+    if (*m < 0) {
+	*info = -1;
+    } else if (*n < 0 || *n > *m) { /* n must satisfy 0 <= n <= m */
+	*info = -2;
+    } else if (*nrhs < 0) {
+	*info = -3;
+    } else if (*lda < f2cmax(1,*m)) { /* no check produces -4 (a) */
+	*info = -5;
+    } else if (*ldb < f2cmax(1,*m)) { /* no check produces -6 or -7 (tau, b) */
+	*info = -8;
+    } else if (*lwork < 1 || *lwork < *nrhs && *m > 0 && *n > 0) { /* && binds tighter than ||: lwork >= nrhs is enforced only when m > 0 and n > 0 */
+	*info = -10;
+    }
+    if (*info != 0) {
+	i__1 = -(*info); /* xerbla_ receives the positive argument position */
+	xerbla_("SGEQRS", &i__1);
+	return 0;
+    }
+
+/*     Quick return if possible */
+
+    if (*n == 0 || *nrhs == 0 || *m == 0) {
+	return 0;
+    }
+
+/*     B := Q' * B */
+
+    sormqr_("Left", "Transpose", m, nrhs, n, &a[a_offset], lda, &tau[1], &b[
+	    b_offset], ldb, &work[1], lwork, info);
+/*     NOTE(review): the *info value written by sormqr_ is not examined before the solve below. */
+/*     Solve R*X = B(1:n,:) */
+
+    strsm_("Left", "Upper", "No transpose", "Non-unit", n, nrhs, &c_b9, &a[
+	    a_offset], lda, &b[b_offset], ldb);
+
+    return 0;
+
+/*     End of SGEQRS */
+
+} /* sgeqrs_ */
+
diff --git a/lapack-netlib/TESTING/LIN/sgeqrs.f b/lapack-netlib/SRC/DEPRECATED/sgeqrs.f
similarity index 100%
rename from lapack-netlib/TESTING/LIN/sgeqrs.f
rename to lapack-netlib/SRC/DEPRECATED/sgeqrs.f
diff --git a/lapack-netlib/SRC/DEPRECATED/sggsvd.c b/lapack-netlib/SRC/DEPRECATED/sggsvd.c
index 0af712876..a10edf060 100644
--- a/lapack-netlib/SRC/DEPRECATED/sggsvd.c
+++ b/lapack-netlib/SRC/DEPRECATED/sggsvd.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,7 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/sggsvp.c b/lapack-netlib/SRC/DEPRECATED/sggsvp.c
index bf69f4239..ca47bc629 100644
--- a/lapack-netlib/SRC/DEPRECATED/sggsvp.c
+++ b/lapack-netlib/SRC/DEPRECATED/sggsvp.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/slahrd.c b/lapack-netlib/SRC/DEPRECATED/slahrd.c
index 0cf290be2..920288686 100644
--- a/lapack-netlib/SRC/DEPRECATED/slahrd.c
+++ b/lapack-netlib/SRC/DEPRECATED/slahrd.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/slatzm.c b/lapack-netlib/SRC/DEPRECATED/slatzm.c
index 1826db1a4..6e53dceca 100644
--- a/lapack-netlib/SRC/DEPRECATED/slatzm.c
+++ b/lapack-netlib/SRC/DEPRECATED/slatzm.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,252 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
 
 
 
diff --git a/lapack-netlib/SRC/DEPRECATED/stzrqf.c b/lapack-netlib/SRC/DEPRECATED/stzrqf.c
index 16cf7e33a..72c0083bb 100644
--- a/lapack-netlib/SRC/DEPRECATED/stzrqf.c
+++ b/lapack-netlib/SRC/DEPRECATED/stzrqf.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/SRC/DEPRECATED/zgegs.c b/lapack-netlib/SRC/DEPRECATED/zgegs.c
index 7dab0efe9..c053ec432 100644
--- a/lapack-netlib/SRC/DEPRECATED/zgegs.c
+++ b/lapack-netlib/SRC/DEPRECATED/zgegs.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,7 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/zgegs.f b/lapack-netlib/SRC/DEPRECATED/zgegs.f
index c5cdd26e5..23f8d43d1 100644
--- a/lapack-netlib/SRC/DEPRECATED/zgegs.f
+++ b/lapack-netlib/SRC/DEPRECATED/zgegs.f
@@ -1,4 +1,4 @@
-*> \brief <b> ZGEEVX computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices</b>
+*> \brief <b> ZGEGS computes the eigenvalues, Schur form, and, optionally, the left and/or right Schur vectors of a complex matrix pair (A,B)</b>
 *
 *  =========== DOCUMENTATION ===========
 *
diff --git a/lapack-netlib/SRC/DEPRECATED/zgegv.c b/lapack-netlib/SRC/DEPRECATED/zgegv.c
index 991a6ff11..8272bd918 100644
--- a/lapack-netlib/SRC/DEPRECATED/zgegv.c
+++ b/lapack-netlib/SRC/DEPRECATED/zgegv.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/zgegv.f b/lapack-netlib/SRC/DEPRECATED/zgegv.f
index aa4ab3f71..542d3f4ff 100644
--- a/lapack-netlib/SRC/DEPRECATED/zgegv.f
+++ b/lapack-netlib/SRC/DEPRECATED/zgegv.f
@@ -1,4 +1,4 @@
-*> \brief <b> ZGEEVX computes the eigenvalues and, optionally, the left and/or right eigenvectors for GE matrices</b>
+*> \brief <b> ZGEGV computes the eigenvalues and, optionally, the left and/or right eigenvectors of a complex matrix pair (A,B).</b>
 *
 *  =========== DOCUMENTATION ===========
 *
diff --git a/lapack-netlib/SRC/DEPRECATED/zgelqs.c b/lapack-netlib/SRC/DEPRECATED/zgelqs.c
new file mode 100644
index 000000000..b77ba906a
--- /dev/null
+++ b/lapack-netlib/SRC/DEPRECATED/zgelqs.c
@@ -0,0 +1,481 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static doublecomplex c_b1 = {0.,0.};
+static doublecomplex c_b2 = {1.,0.};
+
+/* > \brief \b ZGELQS */
+
+/*  =========== DOCUMENTATION =========== */
+
+/* Online html documentation available at */
+/*            http://www.netlib.org/lapack/explore-html/ */
+
+/*  Definition: */
+/*  =========== */
+
+/*       SUBROUTINE ZGELQS( M, N, NRHS, A, LDA, TAU, B, LDB, WORK, LWORK, */
+/*                          INFO ) */
+
+/*       INTEGER            INFO, LDA, LDB, LWORK, M, N, NRHS */
+/*       COMPLEX*16         A( LDA, * ), B( LDB, * ), TAU( * ), */
+/*      $                   WORK( LWORK ) */
+
+
+/* > \par Purpose: */
+/*  ============= */
+/* > */
+/* > \verbatim */
+/* > */
+/* > Compute a minimum-norm solution */
+/* >     min || A*X - B || */
+/* > using the LQ factorization */
+/* >     A = L*Q */
+/* > computed by ZGELQF. */
+/* > \endverbatim */
+
+/*  Arguments: */
+/*  ========== */
+
+/* > \param[in] M */
+/* > \verbatim */
+/* >          M is INTEGER */
+/* >          The number of rows of the matrix A.  M >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] N */
+/* > \verbatim */
+/* >          N is INTEGER */
+/* >          The number of columns of the matrix A.  N >= M >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] NRHS */
+/* > \verbatim */
+/* >          NRHS is INTEGER */
+/* >          The number of columns of B.  NRHS >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] A */
+/* > \verbatim */
+/* >          A is COMPLEX*16 array, dimension (LDA,N) */
+/* >          Details of the LQ factorization of the original matrix A as */
+/* >          returned by ZGELQF. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LDA */
+/* > \verbatim */
+/* >          LDA is INTEGER */
+/* >          The leading dimension of the array A.  LDA >= M. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] TAU */
+/* > \verbatim */
+/* >          TAU is COMPLEX*16 array, dimension (M) */
+/* >          Details of the orthogonal matrix Q. */
+/* > \endverbatim */
+/* > */
+/* > \param[in,out] B */
+/* > \verbatim */
+/* >          B is COMPLEX*16 array, dimension (LDB,NRHS) */
+/* >          On entry, the m-by-nrhs right hand side matrix B. */
+/* >          On exit, the n-by-nrhs solution matrix X. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LDB */
+/* > \verbatim */
+/* >          LDB is INTEGER */
+/* >          The leading dimension of the array B. LDB >= N. */
+/* > \endverbatim */
+/* > */
+/* > \param[out] WORK */
+/* > \verbatim */
+/* >          WORK is COMPLEX*16 array, dimension (LWORK) */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LWORK */
+/* > \verbatim */
+/* >          LWORK is INTEGER */
+/* >          The length of the array WORK.  LWORK must be at least NRHS, */
+/* >          and should be at least NRHS*NB, where NB is the block size */
+/* >          for this environment. */
+/* > \endverbatim */
+/* > */
+/* > \param[out] INFO */
+/* > \verbatim */
+/* >          INFO is INTEGER */
+/* >          = 0: successful exit */
+/* >          < 0: if INFO = -i, the i-th argument had an illegal value */
+/* > \endverbatim */
+
+/*  Authors: */
+/*  ======== */
+
+/* > \author Univ. of Tennessee */
+/* > \author Univ. of California Berkeley */
+/* > \author Univ. of Colorado Denver */
+/* > \author NAG Ltd. */
+
+/* > \ingroup complex16_lin */
+
+/*  ===================================================================== */
+/* Subroutine */ int zgelqs_(integer *m, integer *n, integer *nrhs, 
+	doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *b, 
+	integer *ldb, doublecomplex *work, integer *lwork, integer *info)
+{
+    /* System generated locals: leading dimensions, index offsets, scratch */
+    integer a_dim1, a_offset, b_dim1, b_offset, i__1;
+
+    /* Local variables: prototypes of the external routines called below */
+    extern /* Subroutine */ int ztrsm_(char *, char *, char *, char *, 
+	    integer *, integer *, doublecomplex *, doublecomplex *, integer *,
+	     doublecomplex *, integer *), 
+	    xerbla_(char *, integer *), zlaset_(char *, integer *, 
+	    integer *, doublecomplex *, doublecomplex *, doublecomplex *, 
+	    integer *), zunmlq_(char *, char *, integer *, integer *, 
+	    integer *, doublecomplex *, integer *, doublecomplex *, 
+	    doublecomplex *, integer *, doublecomplex *, integer *, integer *);
+
+
+/*  -- LAPACK test routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+/*  Flow: validate the arguments, solve L*X = B(1:m,:) with ztrsm_, zero */
+/*  B(m+1:n,:) with zlaset_, then apply Q' to B from the left (zunmlq_). */
+/*  ===================================================================== */
+
+
+/*     Test the input parameters. */
+
+    /* Parameter adjustments: rebase pointers for 1-based (Fortran) indexing */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --tau;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    --work;
+
+    /* Function Body: *info = -k marks the k-th argument as illegal */
+    *info = 0;
+    if (*m < 0) {
+	*info = -1;
+    } else if (*n < 0 || *m > *n) {
+	*info = -2;
+    } else if (*nrhs < 0) {
+	*info = -3;
+    } else if (*lda < f2cmax(1,*m)) {
+	*info = -5;
+    } else if (*ldb < f2cmax(1,*n)) {
+	*info = -8;
+    } else if (*lwork < 1 || *lwork < *nrhs && *m > 0 && *n > 0) { /* && binds tighter: a || (b && c && d) */
+	*info = -10;
+    }
+    if (*info != 0) {
+	i__1 = -(*info); /* xerbla_ is given the positive argument index */
+	xerbla_("ZGELQS", &i__1);
+	return 0;
+    }
+
+/*     Quick return if possible (any of n, nrhs, m is zero) */
+
+    if (*n == 0 || *nrhs == 0 || *m == 0) {
+	return 0;
+    }
+
+/*     Solve L*X = B(1:m,:); c_b2 is the file-level constant (1,0) */
+
+    ztrsm_("Left", "Lower", "No transpose", "Non-unit", m, nrhs, &c_b2, &a[
+	    a_offset], lda, &b[b_offset], ldb);
+
+/*     Set B(m+1:n,:) to zero; c_b1 is (0,0), b[*m + 1 + b_dim1] is B(m+1,1) */
+
+    if (*m < *n) {
+	i__1 = *n - *m;
+	zlaset_("Full", &i__1, nrhs, &c_b1, &c_b1, &b[*m + 1 + b_dim1], ldb);
+    }
+
+/*     B := Q' * B  (conjugate transpose; info is handed on to zunmlq_) */
+
+    zunmlq_("Left", "Conjugate transpose", n, nrhs, m, &a[a_offset], lda, &
+	    tau[1], &b[b_offset], ldb, &work[1], lwork, info);
+
+    return 0;
+
+/*     End of ZGELQS */
+
+} /* zgelqs_ */
+
diff --git a/lapack-netlib/TESTING/LIN/zgelqs.f b/lapack-netlib/SRC/DEPRECATED/zgelqs.f
similarity index 100%
rename from lapack-netlib/TESTING/LIN/zgelqs.f
rename to lapack-netlib/SRC/DEPRECATED/zgelqs.f
diff --git a/lapack-netlib/SRC/DEPRECATED/zgelsx.c b/lapack-netlib/SRC/DEPRECATED/zgelsx.c
index f1f39eb9c..82d195550 100644
--- a/lapack-netlib/SRC/DEPRECATED/zgelsx.c
+++ b/lapack-netlib/SRC/DEPRECATED/zgelsx.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,247 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
 /*  -- translated by f2c (version 20000121).
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
diff --git a/lapack-netlib/SRC/DEPRECATED/zgeqpf.c b/lapack-netlib/SRC/DEPRECATED/zgeqpf.c
index 12ecc9940..0a2f8c7aa 100644
--- a/lapack-netlib/SRC/DEPRECATED/zgeqpf.c
+++ b/lapack-netlib/SRC/DEPRECATED/zgeqpf.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,248 +260,8 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
+/*
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
diff --git a/lapack-netlib/SRC/DEPRECATED/zgeqrs.c b/lapack-netlib/SRC/DEPRECATED/zgeqrs.c
new file mode 100644
index 000000000..3e8f3cce7
--- /dev/null
+++ b/lapack-netlib/SRC/DEPRECATED/zgeqrs.c
@@ -0,0 +1,472 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER /* MSVC build: complex values are _Fcomplex/_Dcomplex */
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} /* copy *z into an _Fcomplex value */
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} /* copy *z into a _Dcomplex value */
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} /* reinterpret the pointer, no copy */
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} /* reinterpret the pointer, no copy */
+#else /* compilers with native _Complex types */
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} /* build r + i*I from the struct fields */
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} /* build r + i*I from the struct fields */
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} /* reinterpret the pointer, no copy */
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} /* reinterpret the pointer, no copy */
+#endif /* _MSC_VER */
+#define pCf(z) (*_pCf(z)) /* assignable view of *z as the native single-precision complex type */
+#define pCd(z) (*_pCd(z)) /* assignable view of *z as the native double-precision complex type */
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static doublecomplex c_b1 = {1.,0.};
+
+/* > \brief \b ZGEQRS */
+
+/*  =========== DOCUMENTATION =========== */
+
+/* Online html documentation available at */
+/*            http://www.netlib.org/lapack/explore-html/ */
+
+/*  Definition: */
+/*  =========== */
+
+/*       SUBROUTINE ZGEQRS( M, N, NRHS, A, LDA, TAU, B, LDB, WORK, LWORK, */
+/*                          INFO ) */
+
+/*       INTEGER            INFO, LDA, LDB, LWORK, M, N, NRHS */
+/*       COMPLEX*16         A( LDA, * ), B( LDB, * ), TAU( * ), */
+/*      $                   WORK( LWORK ) */
+
+
+/* > \par Purpose: */
+/*  ============= */
+/* > */
+/* > \verbatim */
+/* > */
+/* > Solve the least squares problem */
+/* >     min || A*X - B || */
+/* > using the QR factorization */
+/* >     A = Q*R */
+/* > computed by ZGEQRF. */
+/* > \endverbatim */
+
+/*  Arguments: */
+/*  ========== */
+
+/* > \param[in] M */
+/* > \verbatim */
+/* >          M is INTEGER */
+/* >          The number of rows of the matrix A.  M >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] N */
+/* > \verbatim */
+/* >          N is INTEGER */
+/* >          The number of columns of the matrix A.  M >= N >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] NRHS */
+/* > \verbatim */
+/* >          NRHS is INTEGER */
+/* >          The number of columns of B.  NRHS >= 0. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] A */
+/* > \verbatim */
+/* >          A is COMPLEX*16 array, dimension (LDA,N) */
+/* >          Details of the QR factorization of the original matrix A as */
+/* >          returned by ZGEQRF. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LDA */
+/* > \verbatim */
+/* >          LDA is INTEGER */
+/* >          The leading dimension of the array A.  LDA >= M. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] TAU */
+/* > \verbatim */
+/* >          TAU is COMPLEX*16 array, dimension (N) */
+/* >          Details of the orthogonal matrix Q. */
+/* > \endverbatim */
+/* > */
+/* > \param[in,out] B */
+/* > \verbatim */
+/* >          B is COMPLEX*16 array, dimension (LDB,NRHS) */
+/* >          On entry, the m-by-nrhs right hand side matrix B. */
+/* >          On exit, the n-by-nrhs solution matrix X. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LDB */
+/* > \verbatim */
+/* >          LDB is INTEGER */
+/* >          The leading dimension of the array B. LDB >= M. */
+/* > \endverbatim */
+/* > */
+/* > \param[out] WORK */
+/* > \verbatim */
+/* >          WORK is COMPLEX*16 array, dimension (LWORK) */
+/* > \endverbatim */
+/* > */
+/* > \param[in] LWORK */
+/* > \verbatim */
+/* >          LWORK is INTEGER */
+/* >          The length of the array WORK.  LWORK must be at least NRHS, */
+/* >          and should be at least NRHS*NB, where NB is the block size */
+/* >          for this environment. */
+/* > \endverbatim */
+/* > */
+/* > \param[out] INFO */
+/* > \verbatim */
+/* >          INFO is INTEGER */
+/* >          = 0: successful exit */
+/* >          < 0: if INFO = -i, the i-th argument had an illegal value */
+/* > \endverbatim */
+
+/*  Authors: */
+/*  ======== */
+
+/* > \author Univ. of Tennessee */
+/* > \author Univ. of California Berkeley */
+/* > \author Univ. of Colorado Denver */
+/* > \author NAG Ltd. */
+
+/* > \ingroup complex16_lin */
+
+/*  ===================================================================== */
+/* Subroutine */ int zgeqrs_(integer *m, integer *n, integer *nrhs, 
+	doublecomplex *a, integer *lda, doublecomplex *tau, doublecomplex *b, 
+	integer *ldb, doublecomplex *work, integer *lwork, integer *info)
+{
+    /* System generated locals: leading dimensions, index offsets, scratch */
+    integer a_dim1, a_offset, b_dim1, b_offset, i__1;
+
+    /* Local variables: prototypes of the external routines called below */
+    extern /* Subroutine */ int ztrsm_(char *, char *, char *, char *, 
+	    integer *, integer *, doublecomplex *, doublecomplex *, integer *,
+	     doublecomplex *, integer *), 
+	    xerbla_(char *, integer *), zunmqr_(char *, char *, 
+	    integer *, integer *, integer *, doublecomplex *, integer *, 
+	    doublecomplex *, doublecomplex *, integer *, doublecomplex *, 
+	    integer *, integer *);
+
+
+/*  -- LAPACK test routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+/*  Flow: validate the arguments, apply Q' to B from the left (zunmqr_), */
+/*  then solve R*X = B(1:n,:) with ztrsm_; both calls operate on B in place. */
+/*  ===================================================================== */
+
+
+/*     Test the input arguments. */
+
+    /* Parameter adjustments: rebase pointers for 1-based (Fortran) indexing */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --tau;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    --work;
+
+    /* Function Body: *info = -k marks the k-th argument as illegal */
+    *info = 0;
+    if (*m < 0) {
+	*info = -1;
+    } else if (*n < 0 || *n > *m) {
+	*info = -2;
+    } else if (*nrhs < 0) {
+	*info = -3;
+    } else if (*lda < f2cmax(1,*m)) {
+	*info = -5;
+    } else if (*ldb < f2cmax(1,*m)) {
+	*info = -8;
+    } else if (*lwork < 1 || *lwork < *nrhs && *m > 0 && *n > 0) { /* && binds tighter: a || (b && c && d) */
+	*info = -10;
+    }
+    if (*info != 0) {
+	i__1 = -(*info); /* xerbla_ is given the positive argument index */
+	xerbla_("ZGEQRS", &i__1);
+	return 0;
+    }
+
+/*     Quick return if possible (any of n, nrhs, m is zero) */
+
+    if (*n == 0 || *nrhs == 0 || *m == 0) {
+	return 0;
+    }
+
+/*     B := Q' * B  (conjugate transpose; info is handed on to zunmqr_) */
+
+    zunmqr_("Left", "Conjugate transpose", m, nrhs, n, &a[a_offset], lda, &
+	    tau[1], &b[b_offset], ldb, &work[1], lwork, info);
+
+/*     Solve R*X = B(1:n,:); c_b1 is the file-level constant (1,0) */
+
+    ztrsm_("Left", "Upper", "No transpose", "Non-unit", n, nrhs, &c_b1, &a[
+	    a_offset], lda, &b[b_offset], ldb);
+
+    return 0;
+
+/*     End of ZGEQRS */
+
+} /* zgeqrs_ */
+
diff --git a/lapack-netlib/TESTING/LIN/zgeqrs.f b/lapack-netlib/SRC/DEPRECATED/zgeqrs.f
similarity index 100%
rename from lapack-netlib/TESTING/LIN/zgeqrs.f
rename to lapack-netlib/SRC/DEPRECATED/zgeqrs.f
diff --git a/lapack-netlib/SRC/DEPRECATED/zggsvd.c b/lapack-netlib/SRC/DEPRECATED/zggsvd.c
index 8f1c7e46c..e9d723404 100644
--- a/lapack-netlib/SRC/DEPRECATED/zggsvd.c
+++ b/lapack-netlib/SRC/DEPRECATED/zggsvd.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,248 +260,8 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
+/*
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
diff --git a/lapack-netlib/SRC/DEPRECATED/zggsvd.f b/lapack-netlib/SRC/DEPRECATED/zggsvd.f
index c0b9247a6..8a41e36c6 100644
--- a/lapack-netlib/SRC/DEPRECATED/zggsvd.f
+++ b/lapack-netlib/SRC/DEPRECATED/zggsvd.f
@@ -106,7 +106,7 @@
 *> In particular, if B is an N-by-N nonsingular matrix, then the GSVD of
 *> A and B implicitly gives the SVD of A*inv(B):
 *>                      A*inv(B) = U*(D1*inv(D2))*V**H.
-*> If ( A**H,B**H)**H has orthnormal columns, then the GSVD of A and B is also
+*> If ( A**H,B**H)**H has orthonormal columns, then the GSVD of A and B is also
 *> equal to the CS decomposition of A and B. Furthermore, the GSVD can
 *> be used to derive the solution of the eigenvalue problem:
 *>                      A**H*A x = lambda* B**H*B x.
diff --git a/lapack-netlib/SRC/DEPRECATED/zggsvp.c b/lapack-netlib/SRC/DEPRECATED/zggsvp.c
index cbe9a9083..0409e97b1 100644
--- a/lapack-netlib/SRC/DEPRECATED/zggsvp.c
+++ b/lapack-netlib/SRC/DEPRECATED/zggsvp.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,248 +260,7 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
+/*
    You must link the resulting object file with the libraries:
 	-lf2c -lm   (in that order)
 */
diff --git a/lapack-netlib/SRC/DEPRECATED/zlahrd.c b/lapack-netlib/SRC/DEPRECATED/zlahrd.c
index 3c86494cc..41c26d98e 100644
--- a/lapack-netlib/SRC/DEPRECATED/zlahrd.c
+++ b/lapack-netlib/SRC/DEPRECATED/zlahrd.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/SRC/DEPRECATED/zlatzm.c b/lapack-netlib/SRC/DEPRECATED/zlatzm.c
index b3c31d994..25c476524 100644
--- a/lapack-netlib/SRC/DEPRECATED/zlatzm.c
+++ b/lapack-netlib/SRC/DEPRECATED/zlatzm.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/SRC/DEPRECATED/ztzrqf.c b/lapack-netlib/SRC/DEPRECATED/ztzrqf.c
index 222cd33c7..3a94fa79a 100644
--- a/lapack-netlib/SRC/DEPRECATED/ztzrqf.c
+++ b/lapack-netlib/SRC/DEPRECATED/ztzrqf.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile
index 8cac42330..205a32d31 100644
--- a/lapack-netlib/SRC/Makefile
+++ b/lapack-netlib/SRC/Makefile
@@ -85,7 +85,7 @@ ALLAUX_O = ilaenv.o ilaenv2stage.o ieeeck.o lsamen.o xerbla.o xerbla_array.o \
    ../INSTALL/ilaver.o ../INSTALL/lsame.o ../INSTALL/slamch.o
 
 ifneq "$(or $(BUILD_SINGLE),$(BUILD_COMPLEX))" ""
-SCLAUX = \
+SCLAUX = la_constants.o \
    sbdsvdx.o sstevx.o sstein.o \
    sbdsdc.o \
    sbdsqr.o sdisna.o slabad.o slacpy.o sladiv.o slae2.o  slaebz.o \
@@ -101,12 +101,14 @@ SCLAUX = \
    slaset.o slasq1.o slasq2.o slasq3.o slasq4.o slasq5.o slasq6.o \
    slasr.o  slasrt.o slassq.o slasv2.o spttrf.o sstebz.o sstedc.o \
    ssteqr.o ssterf.o slaisnan.o sisnan.o \
-   slartgp.o slartgs.o scombssq.o ../INSTALL/sroundup_lwork.o \
-   ../INSTALL/second_$(TIMER).o
+   slartgp.o slartgs.o scombssq.o ../INSTALL/sroundup_lwork.o
+ifneq ($(F_COMPILER), IBM)
+SCLAUX += ../INSTALL/second_$(TIMER).o
+endif
 endif
 
 ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" ""
-DZLAUX = \
+DZLAUX = la_constants.o\
    dcombssq.o \
    dbdsvdx.o dstevx.o dstein.o \
    dbdsdc.o \
@@ -124,7 +126,10 @@ DZLAUX = \
    dlasr.o  dlasrt.o dlassq.o dlasv2.o dpttrf.o dstebz.o dstedc.o \
    dsteqr.o dsterf.o dlaisnan.o disnan.o \
    dlartgp.o dlartgs.o ../INSTALL/droundup_lwork.o \
-   ../INSTALL/dlamch.o ../INSTALL/dsecnd_$(TIMER).o
+   ../INSTALL/dlamch.o
+ifneq ($(F_COMPILER), IBM)
+DZLAUX +=  ../INSTALL/dsecnd_$(TIMER).o
+endif
 endif
 
 #ifeq ($(BUILD_SINGLE),1)
@@ -136,7 +141,7 @@ SLASRC_O = \
    sgebrd.o sgecon.o sgeequ.o sgees.o  sgeesx.o sgeev.o  sgeevx.o \
    sgehd2.o sgehrd.o sgelq2.o sgelqf.o \
    sgels.o  sgelsd.o sgelss.o sgelsy.o sgeql2.o sgeqlf.o \
-   sgeqp3.o sgeqr2.o sgeqr2p.o sgeqrf.o sgeqrfp.o sgerfs.o \
+   sgeqp3.o sgeqp3rk.o sgeqr2.o sgeqr2p.o sgeqrf.o sgeqrfp.o sgerfs.o \
    sgerq2.o sgerqf.o sgesc2.o sgesdd.o sgesv.o  sgesvd.o sgesvdx.o sgesvx.o \
    sgetc2.o sgetf2.o sgetri.o \
    sggbak.o sggbal.o sgges.o  sgges3.o sggesx.o \
@@ -151,7 +156,7 @@ SLASRC_O = \
    slangb.o slange.o slangt.o slanhs.o slansb.o slansp.o \
    slansy.o slantb.o slantp.o slantr.o slanv2.o \
    slapll.o slapmt.o \
-   slaqgb.o slaqge.o slaqp2.o slaqps.o slaqsb.o slaqsp.o slaqsy.o \
+   slaqgb.o slaqge.o slaqp2.o slaqp2rk.o slaqp3rk.o slaqps.o slaqsb.o slaqsp.o slaqsy.o \
    slaqr0.o slaqr1.o slaqr2.o slaqr3.o slaqr4.o slaqr5.o \
    slaqtr.o slar1v.o slar2v.o ilaslr.o ilaslc.o \
    slarf.o  slarfb.o slarfb_gett.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o slargv.o \
@@ -207,7 +212,7 @@ SLASRC_O = \
    ssytrd_2stage.o ssytrd_sy2sb.o ssytrd_sb2st.o ssb2st_kernels.o \
    ssyevd_2stage.o ssyev_2stage.o ssyevx_2stage.o ssyevr_2stage.o \
    ssbev_2stage.o ssbevx_2stage.o ssbevd_2stage.o ssygv_2stage.o \
-   sgesvdq.o slatrs3.o strsyl3.o sgelst.o
+   sgesvdq.o slatrs3.o strsyl3.o sgelst.o sgedmd.o sgedmdq.o
    
 endif
 
@@ -232,7 +237,7 @@ CLASRC_O = \
    cgbtf2.o cgbtrf.o cgbtrs.o cgebak.o cgebal.o cgebd2.o cgebrd.o \
    cgecon.o cgeequ.o cgees.o  cgeesx.o cgeev.o  cgeevx.o \
    cgehd2.o cgehrd.o cgelq2.o cgelqf.o \
-   cgels.o  cgelsd.o cgelss.o cgelsy.o cgeql2.o cgeqlf.o cgeqp3.o \
+   cgels.o  cgelsd.o cgelss.o cgelsy.o cgeql2.o cgeqlf.o cgeqp3.o cgeqp3rk.o \
    cgeqr2.o cgeqr2p.o cgeqrf.o cgeqrfp.o cgerfs.o \
    cgerq2.o cgerqf.o cgesc2.o cgesdd.o cgesv.o  cgesvd.o  cgesvdx.o \
    cgesvj.o cgejsv.o cgsvj0.o cgsvj1.o \
@@ -266,7 +271,7 @@ CLASRC_O = \
    clanhb.o clanhe.o \
    clanhp.o clanhs.o clanht.o clansb.o clansp.o clansy.o clantb.o \
    clantp.o clantr.o clapll.o clapmt.o clarcm.o claqgb.o claqge.o \
-   claqhb.o claqhe.o claqhp.o claqp2.o claqps.o claqsb.o \
+   claqhb.o claqhe.o claqhp.o claqp2.o claqp2rk.o claqp3rk.o claqps.o claqsb.o \
    claqr0.o claqr1.o claqr2.o claqr3.o claqr4.o claqr5.o \
    claqsp.o claqsy.o clar1v.o clar2v.o ilaclr.o ilaclc.o \
    claqz0.o claqz1.o claqz2.o claqz3.o \
@@ -280,7 +285,7 @@ CLASRC_O = \
    cposv.o  cposvx.o cpotf2.o cpotri.o cpstrf.o cpstf2.o \
    cppcon.o cppequ.o cpprfs.o cppsv.o  cppsvx.o cpptrf.o cpptri.o cpptrs.o \
    cptcon.o cpteqr.o cptrfs.o cptsv.o  cptsvx.o cpttrf.o cpttrs.o cptts2.o \
-   crot.o   cspcon.o cspmv.o  cspr.o   csprfs.o cspsv.o \
+   crot.o crscl.o cspcon.o cspmv.o  cspr.o   csprfs.o cspsv.o \
    cspsvx.o csptrf.o csptri.o csptrs.o csrscl.o cstedc.o \
    cstegr.o cstein.o csteqr.o \
    csycon.o csymv.o \
@@ -316,7 +321,7 @@ CLASRC_O = \
    chetrd_2stage.o chetrd_he2hb.o chetrd_hb2st.o chb2st_kernels.o \
    cheevd_2stage.o cheev_2stage.o cheevx_2stage.o cheevr_2stage.o \
    chbev_2stage.o chbevx_2stage.o chbevd_2stage.o chegv_2stage.o \
-   cgesvdq.o clatrs3.o ctrsyl3.o cgelst.o
+   cgesvdq.o clatrs3.o ctrsyl3.o cgelst.o cgedmd.o cgedmdq.o
 endif
 
 ifdef USEXBLAS
@@ -345,7 +350,7 @@ DLASRC_O = \
    dgebrd.o dgecon.o dgeequ.o dgees.o  dgeesx.o dgeev.o  dgeevx.o \
    dgehd2.o dgehrd.o dgelq2.o dgelqf.o \
    dgels.o  dgelsd.o dgelss.o dgelsy.o dgeql2.o dgeqlf.o \
-   dgeqp3.o dgeqr2.o dgeqr2p.o dgeqrf.o dgeqrfp.o dgerfs.o \
+   dgeqp3.o dgeqp3rk.o dgeqr2.o dgeqr2p.o dgeqrf.o dgeqrfp.o dgerfs.o \
    dgerq2.o dgerqf.o dgesc2.o dgesdd.o dgesv.o  dgesvd.o dgesvdx.o dgesvx.o \
    dgetc2.o dgetf2.o dgetrf.o dgetri.o \
    dgetrs.o dggbak.o dggbal.o dgges.o  dgges3.o dggesx.o \
@@ -360,7 +365,7 @@ DLASRC_O = \
    dlangb.o dlange.o dlangt.o dlanhs.o dlansb.o dlansp.o \
    dlansy.o dlantb.o dlantp.o dlantr.o dlanv2.o \
    dlapll.o dlapmt.o \
-   dlaqgb.o dlaqge.o dlaqp2.o dlaqps.o dlaqsb.o dlaqsp.o dlaqsy.o \
+   dlaqgb.o dlaqge.o dlaqp2.o dlaqp2rk.o dlaqp3rk.o dlaqps.o dlaqsb.o dlaqsp.o dlaqsy.o \
    dlaqr0.o dlaqr1.o dlaqr2.o dlaqr3.o dlaqr4.o dlaqr5.o \
    dlaqtr.o dlar1v.o dlar2v.o iladlr.o iladlc.o \
    dlarf.o  dlarfb.o dlarfb_gett.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o \
@@ -417,7 +422,7 @@ DLASRC_O = \
    dsytrd_2stage.o dsytrd_sy2sb.o dsytrd_sb2st.o dsb2st_kernels.o \
    dsyevd_2stage.o dsyev_2stage.o dsyevx_2stage.o dsyevr_2stage.o \
    dsbev_2stage.o dsbevx_2stage.o dsbevd_2stage.o dsygv_2stage.o \
-   dgesvdq.o dlatrs3.o dtrsyl3.o dgelst.o
+   dgesvdq.o dlatrs3.o dtrsyl3.o dgelst.o dgedmd.o dgedmdq.o
 endif
 
 ifdef USEXBLAS
@@ -437,7 +442,7 @@ ZLASRC_O = \
    zgbtf2.o zgbtrf.o zgbtrs.o zgebak.o zgebal.o zgebd2.o zgebrd.o \
    zgecon.o zgeequ.o zgees.o  zgeesx.o zgeev.o  zgeevx.o \
    zgehd2.o zgehrd.o zgelq2.o zgelqf.o \
-   zgels.o  zgelsd.o zgelss.o zgelsy.o zgeql2.o zgeqlf.o zgeqp3.o \
+   zgels.o  zgelsd.o zgelss.o zgelsy.o zgeql2.o zgeqlf.o zgeqp3.o zgeqp3rk.o \
    zgeqr2.o zgeqr2p.o zgeqrf.o zgeqrfp.o zgerfs.o zgerq2.o zgerqf.o \
    zgesc2.o zgesdd.o zgesv.o  zgesvd.o zgesvdx.o \
    zgesvj.o zgejsv.o zgsvj0.o zgsvj1.o \
@@ -473,7 +478,7 @@ ZLASRC_O = \
    zlanhe.o \
    zlanhp.o zlanhs.o zlanht.o zlansb.o zlansp.o zlansy.o zlantb.o \
    zlantp.o zlantr.o zlapll.o zlapmt.o zlaqgb.o zlaqge.o \
-   zlaqhb.o zlaqhe.o zlaqhp.o zlaqp2.o zlaqps.o zlaqsb.o \
+   zlaqhb.o zlaqhe.o zlaqhp.o zlaqp2.o zlaqp2rk.o zlaqp3rk.o zlaqps.o zlaqsb.o \
    zlaqr0.o zlaqr1.o zlaqr2.o zlaqr3.o zlaqr4.o zlaqr5.o \
    zlaqsp.o zlaqsy.o zlar1v.o zlar2v.o ilazlr.o ilazlc.o \
    zlaqz0.o zlaqz1.o zlaqz2.o zlaqz3.o \
@@ -488,7 +493,7 @@ ZLASRC_O = \
    zposv.o  zposvx.o zpotf2.o zpotrf.o zpotri.o zpotrs.o zpstrf.o zpstf2.o \
    zppcon.o zppequ.o zpprfs.o zppsv.o  zppsvx.o zpptrf.o zpptri.o zpptrs.o \
    zptcon.o zpteqr.o zptrfs.o zptsv.o  zptsvx.o zpttrf.o zpttrs.o zptts2.o \
-   zrot.o   zspcon.o zspmv.o  zspr.o   zsprfs.o zspsv.o \
+   zrot.o zrscl.o  zspcon.o zspmv.o  zspr.o   zsprfs.o zspsv.o \
    zspsvx.o zsptrf.o zsptri.o zsptrs.o zdrscl.o zstedc.o \
    zstegr.o zstein.o zsteqr.o \
    zsycon.o zsymv.o \
@@ -526,7 +531,7 @@ ZLASRC_O = \
    zhetrd_2stage.o zhetrd_he2hb.o zhetrd_hb2st.o zhb2st_kernels.o \
    zheevd_2stage.o zheev_2stage.o zheevx_2stage.o zheevr_2stage.o \
    zhbev_2stage.o zhbevx_2stage.o zhbevd_2stage.o zhegv_2stage.o \
-   zgesvdq.o zlatrs3.o ztrsyl3.o zgelst.o
+   zgesvdq.o zlatrs3.o ztrsyl3.o zgelst.o zgedmd.o zgedmdq.o
 endif
 
 ifdef USEXBLAS
@@ -544,26 +549,30 @@ endif
 ifeq ($(BUILD_COMPLEX),1)
 CDEPRECSRC = DEPRECATED/cgegs.o DEPRECATED/cgegv.o DEPRECATED/cgelsx.o \
    DEPRECATED/cgeqpf.o DEPRECATED/cggsvd.o DEPRECATED/cggsvp.o \
-   DEPRECATED/clahrd.o DEPRECATED/clatzm.o DEPRECATED/ctzrqf.o 
+   DEPRECATED/clahrd.o DEPRECATED/clatzm.o DEPRECATED/ctzrqf.o \
+   DEPRECATED/cgelqs.o DEPRECATED/cgeqrs.o
 endif
 
 ifeq ($(BUILD_DOUBLE),1)
 DDEPRECSRC = \
    DEPRECATED/dgegs.o  DEPRECATED/dgegv.o  DEPRECATED/dgelsx.o \
    DEPRECATED/dgeqpf.o DEPRECATED/dggsvd.o DEPRECATED/dggsvp.o \
-   DEPRECATED/dlahrd.o DEPRECATED/dlatzm.o DEPRECATED/dtzrqf.o 
+   DEPRECATED/dlahrd.o DEPRECATED/dlatzm.o DEPRECATED/dtzrqf.o \
+   DEPRECATED/dgelqs.o DEPRECATED/dgeqrs.o
 endif
 ifeq ($(BUILD_SINGLE),1)
 SDEPRECSRC = \
    DEPRECATED/sgegs.o  DEPRECATED/sgegv.o  DEPRECATED/sgelsx.o \
    DEPRECATED/sgeqpf.o DEPRECATED/sggsvd.o DEPRECATED/sggsvp.o \
-   DEPRECATED/slahrd.o DEPRECATED/slatzm.o DEPRECATED/stzrqf.o
+   DEPRECATED/slahrd.o DEPRECATED/slatzm.o DEPRECATED/stzrqf.o \
+   DEPRECATED/sgelqs.o DEPRECATED/sgeqrs.o
 endif
 ifeq ($(BUILD_COMPLEX16),1)
 ZDEPRECSRC = \
    DEPRECATED/zgegs.o  DEPRECATED/zgegv.o  DEPRECATED/zgelsx.o \
    DEPRECATED/zgeqpf.o DEPRECATED/zggsvd.o DEPRECATED/zggsvp.o \
-   DEPRECATED/zlahrd.o DEPRECATED/zlatzm.o DEPRECATED/ztzrqf.o
+   DEPRECATED/zlahrd.o DEPRECATED/zlatzm.o DEPRECATED/ztzrqf.o \
+   DEPRECATED/zgelqs.o DEPRECATED/zgeqrs.o
 endif
 
 # filter out optimized codes from OpenBLAS
diff --git a/lapack-netlib/SRC/VARIANTS/Makefile b/lapack-netlib/SRC/VARIANTS/Makefile
index 25d8ee175..35e50cbc2 100644
--- a/lapack-netlib/SRC/VARIANTS/Makefile
+++ b/lapack-netlib/SRC/VARIANTS/Makefile
@@ -28,7 +28,7 @@ LULL = lu/LL/cgetrf.o lu/LL/dgetrf.o lu/LL/sgetrf.o lu/LL/zgetrf.o
 
 LUREC = lu/REC/cgetrf.o lu/REC/dgetrf.o lu/REC/sgetrf.o lu/REC/zgetrf.o
 
-QRLL = qr/LL/cgeqrf.o qr/LL/dgeqrf.o qr/LL/sgeqrf.o qr/LL/zgeqrf.o qr/LL/sceil.o
+QRLL = qr/LL/cgeqrf.o qr/LL/dgeqrf.o qr/LL/sgeqrf.o qr/LL/zgeqrf.o
 
 
 .PHONY: all
diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/RL/cpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/RL/cpotrf.f
index 16d250c3f..1b8e53cc2 100644
--- a/lapack-netlib/SRC/VARIANTS/cholesky/RL/cpotrf.f
+++ b/lapack-netlib/SRC/VARIANTS/cholesky/RL/cpotrf.f
@@ -24,7 +24,7 @@ C> \brief \b CPOTRF VARIANT: right looking block version of the algorithm, calli
 C>\details \b Purpose:
 C>\verbatim
 C>
-C> CPOTRF computes the Cholesky factorization of a real Hermitian
+C> CPOTRF computes the Cholesky factorization of a complex Hermitian
 C> positive definite matrix A.
 C>
 C> The factorization has the form
@@ -79,8 +79,8 @@ C> \verbatim
 C>          INFO is INTEGER
 C>          = 0:  successful exit
 C>          < 0:  if INFO = -i, the i-th argument had an illegal value
-C>          > 0:  if INFO = i, the leading minor of order i is not
-C>                positive definite, and the factorization could not be
+C>          > 0:  if INFO = i, the leading principal minor of order i
+C>                is not positive, and the factorization could not be
 C>                completed.
 C> \endverbatim
 C>
diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/RL/dpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/RL/dpotrf.f
index 1575bd95c..e0a621b2e 100644
--- a/lapack-netlib/SRC/VARIANTS/cholesky/RL/dpotrf.f
+++ b/lapack-netlib/SRC/VARIANTS/cholesky/RL/dpotrf.f
@@ -79,8 +79,8 @@ C> \verbatim
 C>          INFO is INTEGER
 C>          = 0:  successful exit
 C>          < 0:  if INFO = -i, the i-th argument had an illegal value
-C>          > 0:  if INFO = i, the leading minor of order i is not
-C>                positive definite, and the factorization could not be
+C>          > 0:  if INFO = i, the leading principal minor of order i
+C>                is not positive, and the factorization could not be
 C>                completed.
 C> \endverbatim
 C>
diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/RL/spotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/RL/spotrf.f
index 67ebae335..f3b66a9e3 100644
--- a/lapack-netlib/SRC/VARIANTS/cholesky/RL/spotrf.f
+++ b/lapack-netlib/SRC/VARIANTS/cholesky/RL/spotrf.f
@@ -79,8 +79,8 @@ C> \verbatim
 C>          INFO is INTEGER
 C>          = 0:  successful exit
 C>          < 0:  if INFO = -i, the i-th argument had an illegal value
-C>          > 0:  if INFO = i, the leading minor of order i is not
-C>                positive definite, and the factorization could not be
+C>          > 0:  if INFO = i, the leading principal minor of order i
+C>                is not positive, and the factorization could not be
 C>                completed.
 C> \endverbatim
 C>
diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/RL/zpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/RL/zpotrf.f
index 76d8bab61..dda42faf9 100644
--- a/lapack-netlib/SRC/VARIANTS/cholesky/RL/zpotrf.f
+++ b/lapack-netlib/SRC/VARIANTS/cholesky/RL/zpotrf.f
@@ -24,7 +24,7 @@ C> \brief \b ZPOTRF VARIANT: right looking block version of the algorithm, calli
 C>\details \b Purpose:
 C>\verbatim
 C>
-C> ZPOTRF computes the Cholesky factorization of a real Hermitian
+C> ZPOTRF computes the Cholesky factorization of a complex Hermitian
 C> positive definite matrix A.
 C>
 C> The factorization has the form
@@ -79,8 +79,8 @@ C> \verbatim
 C>          INFO is INTEGER
 C>          = 0:  successful exit
 C>          < 0:  if INFO = -i, the i-th argument had an illegal value
-C>          > 0:  if INFO = i, the leading minor of order i is not
-C>                positive definite, and the factorization could not be
+C>          > 0:  if INFO = i, the leading principal minor of order i
+C>                is not positive, and the factorization could not be
 C>                completed.
 C> \endverbatim
 C>
diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f
index d6149c08f..b9dffa4ec 100644
--- a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f
+++ b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/cpotrf.f
@@ -24,7 +24,7 @@ C> \brief \b CPOTRF VARIANT: top-looking block version of the algorithm, calling
 C>\details \b Purpose:
 C>\verbatim
 C>
-C> CPOTRF computes the Cholesky factorization of a real symmetric
+C> CPOTRF computes the Cholesky factorization of a complex Hermitian
 C> positive definite matrix A.
 C>
 C> The factorization has the form
@@ -55,7 +55,7 @@ C>
 C> \param[in,out] A
 C> \verbatim
 C>          A is COMPLEX array, dimension (LDA,N)
-C>          On entry, the symmetric matrix A.  If UPLO = 'U', the leading
+C>          On entry, the Hermitian matrix A.  If UPLO = 'U', the leading
 C>          N-by-N upper triangular part of A contains the upper
 C>          triangular part of the matrix A, and the strictly lower
 C>          triangular part of A is not referenced.  If UPLO = 'L', the
@@ -79,8 +79,8 @@ C> \verbatim
 C>          INFO is INTEGER
 C>          = 0:  successful exit
 C>          < 0:  if INFO = -i, the i-th argument had an illegal value
-C>          > 0:  if INFO = i, the leading minor of order i is not
-C>                positive definite, and the factorization could not be
+C>          > 0:  if INFO = i, the leading principal minor of order i
+C>                is not positive, and the factorization could not be
 C>                completed.
 C> \endverbatim
 C>
diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/dpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/dpotrf.f
index e49200ea5..e68a559ba 100644
--- a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/dpotrf.f
+++ b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/dpotrf.f
@@ -79,8 +79,8 @@ C> \verbatim
 C>          INFO is INTEGER
 C>          = 0:  successful exit
 C>          < 0:  if INFO = -i, the i-th argument had an illegal value
-C>          > 0:  if INFO = i, the leading minor of order i is not
-C>                positive definite, and the factorization could not be
+C>          > 0:  if INFO = i, the leading principal minor of order i
+C>                is not positive, and the factorization could not be
 C>                completed.
 C> \endverbatim
 C>
diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/spotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/spotrf.f
index 65895502b..7401f8844 100644
--- a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/spotrf.f
+++ b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/spotrf.f
@@ -79,8 +79,8 @@ C> \verbatim
 C>          INFO is INTEGER
 C>          = 0:  successful exit
 C>          < 0:  if INFO = -i, the i-th argument had an illegal value
-C>          > 0:  if INFO = i, the leading minor of order i is not
-C>                positive definite, and the factorization could not be
+C>          > 0:  if INFO = i, the leading principal minor of order i
+C>                is not positive, and the factorization could not be
 C>                completed.
 C> \endverbatim
 C>
diff --git a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/zpotrf.f b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/zpotrf.f
index f8b9e253c..80b7c7f43 100644
--- a/lapack-netlib/SRC/VARIANTS/cholesky/TOP/zpotrf.f
+++ b/lapack-netlib/SRC/VARIANTS/cholesky/TOP/zpotrf.f
@@ -24,7 +24,7 @@ C> \brief \b ZPOTRF VARIANT: top-looking block version of the algorithm, calling
 C>\details \b Purpose:
 C>\verbatim
 C>
-C> ZPOTRF computes the Cholesky factorization of a real symmetric
+C> ZPOTRF computes the Cholesky factorization of a complex Hermitian
 C> positive definite matrix A.
 C>
 C> The factorization has the form
@@ -55,7 +55,7 @@ C>
 C> \param[in,out] A
 C> \verbatim
 C>          A is COMPLEX*16 array, dimension (LDA,N)
-C>          On entry, the symmetric matrix A.  If UPLO = 'U', the leading
+C>          On entry, the Hermitian matrix A.  If UPLO = 'U', the leading
 C>          N-by-N upper triangular part of A contains the upper
 C>          triangular part of the matrix A, and the strictly lower
 C>          triangular part of A is not referenced.  If UPLO = 'L', the
@@ -79,8 +79,8 @@ C> \verbatim
 C>          INFO is INTEGER
 C>          = 0:  successful exit
 C>          < 0:  if INFO = -i, the i-th argument had an illegal value
-C>          > 0:  if INFO = i, the leading minor of order i is not
-C>                positive definite, and the factorization could not be
+C>          > 0:  if INFO = i, the leading principal minor of order i
+C>                is not positive, and the factorization could not be
 C>                completed.
 C> \endverbatim
 C>
diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f
index 46eaf33b9..d3f78b8be 100644
--- a/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f
+++ b/lapack-netlib/SRC/VARIANTS/qr/LL/cgeqrf.f
@@ -23,7 +23,7 @@ C> \brief \b CGEQRF VARIANT: left-looking Level 3 BLAS version of the algorithm.
 C>\details \b Purpose:
 C>\verbatim
 C>
-C> CGEQRF computes a QR factorization of a real M-by-N matrix A:
+C> CGEQRF computes a QR factorization of a complex M-by-N matrix A:
 C> A = Q * R.
 C>
 C> This is the left-looking Level 3 BLAS version of the algorithm.
@@ -172,12 +172,12 @@ C>
       EXTERNAL           CGEQR2, CLARFB, CLARFT, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          MAX, MIN
+      INTRINSIC          CEILING, MAX, MIN, REAL
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      REAL               SCEIL
-      EXTERNAL           ILAENV, SCEIL
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 
@@ -205,13 +205,13 @@ C>
 *
 *     So here 4 x 4 is the last T stored in the workspace
 *
-      NT = K-SCEIL(REAL(K-NX)/REAL(NB))*NB
+      NT = K-CEILING(REAL(K-NX)/REAL(NB))*NB
 
 *
 *     optimal workspace = space for dlarfb + space for normal T's + space for the last T
 *
       LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB))
-      LLWORK = SCEIL(REAL(LLWORK)/REAL(NB))
+      LLWORK = CEILING(REAL(LLWORK)/REAL(NB))
 
       IF( K.EQ.0 ) THEN
 
@@ -226,13 +226,13 @@ C>
 *         Optimal workspace for dlarfb = MAX(1,N)*NT
 *
           LWKOPT = (LBWORK+LLWORK)*NB
-          WORK( 1 ) = (LWKOPT+NT*NT)
+          WORK( 1 ) = SROUNDUP_LWORK(LWKOPT+NT*NT)
 
       ELSE
 
-          LBWORK = SCEIL(REAL(K)/REAL(NB))*NB
+          LBWORK = CEILING(REAL(K)/REAL(NB))*NB
           LWKOPT = (LBWORK+LLWORK-NB)*NB
-          WORK( 1 ) = LWKOPT
+          WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 
       END IF
 
@@ -414,7 +414,7 @@ C>
 
       END IF
 
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of CGEQRF
diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f
index 55cab8b23..dd8baab4d 100644
--- a/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f
+++ b/lapack-netlib/SRC/VARIANTS/qr/LL/dgeqrf.f
@@ -172,12 +172,12 @@ C>
       EXTERNAL           DGEQR2, DLARFB, DLARFT, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          MAX, MIN
+      INTRINSIC          CEILING, MAX, MIN, REAL
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      REAL               SCEIL
-      EXTERNAL           ILAENV, SCEIL
+      DOUBLE PRECISION   DROUNDUP_LWORK
+      EXTERNAL           ILAENV, DROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 
@@ -205,13 +205,13 @@ C>
 *
 *     So here 4 x 4 is the last T stored in the workspace
 *
-      NT = K-SCEIL(REAL(K-NX)/REAL(NB))*NB
+      NT = K-CEILING(REAL(K-NX)/REAL(NB))*NB
 
 *
 *     optimal workspace = space for dlarfb + space for normal T's + space for the last T
 *
       LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB))
-      LLWORK = SCEIL(REAL(LLWORK)/REAL(NB))
+      LLWORK = CEILING(REAL(LLWORK)/REAL(NB))
 
       IF( K.EQ.0 ) THEN
 
@@ -226,13 +226,13 @@ C>
 *         Optimal workspace for dlarfb = MAX(1,N)*NT
 *
           LWKOPT = (LBWORK+LLWORK)*NB
-          WORK( 1 ) = (LWKOPT+NT*NT)
+          WORK( 1 ) = DROUNDUP_LWORK(LWKOPT+NT*NT)
 
       ELSE
 
-          LBWORK = SCEIL(REAL(K)/REAL(NB))*NB
+          LBWORK = CEILING(REAL(K)/REAL(NB))*NB
           LWKOPT = (LBWORK+LLWORK-NB)*NB
-          WORK( 1 ) = LWKOPT
+          WORK( 1 ) = DROUNDUP_LWORK(LWKOPT)
 
       END IF
 
@@ -414,7 +414,7 @@ C>
 
       END IF
 
-      WORK( 1 ) = IWS
+      WORK( 1 ) = DROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of DGEQRF
diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/sceil.f b/lapack-netlib/SRC/VARIANTS/qr/LL/sceil.f
deleted file mode 100644
index a007360ba..000000000
--- a/lapack-netlib/SRC/VARIANTS/qr/LL/sceil.f
+++ /dev/null
@@ -1,86 +0,0 @@
-C> \brief \b SCEIL
-*
-*  =========== DOCUMENTATION ===========
-*
-* Online html documentation available at
-*            http://www.netlib.org/lapack/explore-html/
-*
-*  Definition:
-*  ===========
-*
-*       REAL FUNCTION SCEIL( A )
-*
-*       .. Scalar Arguments ..
-*       REAL A
-*       ..
-*
-*    =====================================================================
-*
-*       .. Intrinsic Functions ..
-* 	      INTRINSIC          INT
-*       ..
-*       .. Executable Statements ..*
-*
-*       IF (A-INT(A).EQ.0) THEN
-*           SCEIL = A
-*       ELSE IF (A.GT.0) THEN
-*           SCEIL = INT(A)+1;
-*       ELSE
-*           SCEIL = INT(A)
-*       END IF
-*
-*       RETURN
-*
-*       END
-*  Purpose
-*  =======
-*
-C>\details \b Purpose:
-C>\verbatim
-C>\endverbatim
-*
-*  Arguments:
-*  ==========
-*
-*
-*  Authors:
-*  ========
-*
-C> \author Univ. of Tennessee
-C> \author Univ. of California Berkeley
-C> \author Univ. of Colorado Denver
-C> \author NAG Ltd.
-*
-C> \date December 2016
-*
-C> \ingroup variantsOTHERcomputational
-*
-*  =====================================================================
-      REAL FUNCTION SCEIL( A )
-*
-*  -- LAPACK computational routine --
-*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
-*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*
-*     .. Scalar Arguments ..*
-      REAL A
-*     ..
-*
-*  =====================================================================
-*
-*     .. Intrinsic Functions ..
-	      INTRINSIC          INT
-*     ..
-*     .. Executable Statements ..*
-*
-      IF (A-INT(A).EQ.0) THEN
-          SCEIL = A
-      ELSE IF (A.GT.0) THEN
-          SCEIL = INT(A)+1;
-      ELSE
-          SCEIL = INT(A)
-      END IF
-
-      RETURN
-*
-      END
diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f
index d2ad13ced..93dc48fa5 100644
--- a/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f
+++ b/lapack-netlib/SRC/VARIANTS/qr/LL/sgeqrf.f
@@ -172,12 +172,12 @@ C>
       EXTERNAL           SGEQR2, SLARFB, SLARFT, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          MAX, MIN
+      INTRINSIC          CEILING, MAX, MIN, REAL
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      REAL               SCEIL
-      EXTERNAL           ILAENV, SCEIL
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 
@@ -205,13 +205,13 @@ C>
 *
 *     So here 4 x 4 is the last T stored in the workspace
 *
-      NT = K-SCEIL(REAL(K-NX)/REAL(NB))*NB
+      NT = K-CEILING(REAL(K-NX)/REAL(NB))*NB
 
 *
 *     optimal workspace = space for dlarfb + space for normal T's + space for the last T
 *
       LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB))
-      LLWORK = SCEIL(REAL(LLWORK)/REAL(NB))
+      LLWORK = CEILING(REAL(LLWORK)/REAL(NB))
 
       IF( K.EQ.0 ) THEN
 
@@ -226,13 +226,13 @@ C>
 *         Optimal workspace for dlarfb = MAX(1,N)*NT
 *
           LWKOPT = (LBWORK+LLWORK)*NB
-          WORK( 1 ) = (LWKOPT+NT*NT)
+          WORK( 1 ) = SROUNDUP_LWORK(LWKOPT+NT*NT)
 
       ELSE
 
-          LBWORK = SCEIL(REAL(K)/REAL(NB))*NB
+          LBWORK = CEILING(REAL(K)/REAL(NB))*NB
           LWKOPT = (LBWORK+LLWORK-NB)*NB
-          WORK( 1 ) = LWKOPT
+          WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 
       END IF
 
@@ -414,7 +414,7 @@ C>
 
       END IF
 
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of SGEQRF
diff --git a/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f b/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f
index 623b88a8a..3ef07bfc7 100644
--- a/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f
+++ b/lapack-netlib/SRC/VARIANTS/qr/LL/zgeqrf.f
@@ -23,7 +23,7 @@ C> \brief \b ZGEQRF VARIANT: left-looking Level 3 BLAS of the algorithm.
 C>\details \b Purpose:
 C>\verbatim
 C>
-C> ZGEQRF computes a QR factorization of a real M-by-N matrix A:
+C> ZGEQRF computes a QR factorization of a complex M-by-N matrix A:
 C> A = Q * R.
 C>
 C> This is the left-looking Level 3 BLAS version of the algorithm.
@@ -172,12 +172,12 @@ C>
       EXTERNAL           ZGEQR2, ZLARFB, ZLARFT, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          MAX, MIN
+      INTRINSIC          CEILING, MAX, MIN, REAL
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      REAL               SCEIL
-      EXTERNAL           ILAENV, SCEIL
+      DOUBLE PRECISION   DROUNDUP_LWORK
+      EXTERNAL           ILAENV, DROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 
@@ -205,13 +205,13 @@ C>
 *
 *     So here 4 x 4 is the last T stored in the workspace
 *
-      NT = K-SCEIL(REAL(K-NX)/REAL(NB))*NB
+      NT = K-CEILING(REAL(K-NX)/REAL(NB))*NB
 
 *
 *     optimal workspace = space for dlarfb + space for normal T's + space for the last T
 *
       LLWORK = MAX (MAX((N-M)*K, (N-M)*NB), MAX(K*NB, NB*NB))
-      LLWORK = SCEIL(REAL(LLWORK)/REAL(NB))
+      LLWORK = CEILING(REAL(LLWORK)/REAL(NB))
 
       IF( K.EQ.0 ) THEN
 
@@ -226,13 +226,13 @@ C>
 *         Optimal workspace for dlarfb = MAX(1,N)*NT
 *
           LWKOPT = (LBWORK+LLWORK)*NB
-          WORK( 1 ) = (LWKOPT+NT*NT)
+          WORK( 1 ) = DROUNDUP_LWORK(LWKOPT+NT*NT)
 
       ELSE
 
-          LBWORK = SCEIL(REAL(K)/REAL(NB))*NB
+          LBWORK = CEILING(REAL(K)/REAL(NB))*NB
           LWKOPT = (LBWORK+LLWORK-NB)*NB
-          WORK( 1 ) = LWKOPT
+          WORK( 1 ) = DROUNDUP_LWORK(LWKOPT)
 
       END IF
 
@@ -414,7 +414,7 @@ C>
 
       END IF
 
-      WORK( 1 ) = IWS
+      WORK( 1 ) = DROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of ZGEQRF
diff --git a/lapack-netlib/SRC/cbbcsd.f b/lapack-netlib/SRC/cbbcsd.f
index 1cd1ffbf1..4d0c45efe 100644
--- a/lapack-netlib/SRC/cbbcsd.f
+++ b/lapack-netlib/SRC/cbbcsd.f
@@ -805,7 +805,7 @@
                CALL SLARTGP( B22BULGE, B22E(I-1), RWORK(IU2SN+I-1),
      $                       RWORK(IU2CS+I-1), R )
             ELSE IF( NU .LT. MU ) THEN
-               CALL SLARTGS( B21E(I), B21E(I+1), NU, RWORK(IU2CS+I-1),
+               CALL SLARTGS( B21E(I), B21D(I+1), NU, RWORK(IU2CS+I-1),
      $                       RWORK(IU2SN+I-1) )
             ELSE
                CALL SLARTGS( B22D(I), B22E(I), MU, RWORK(IU2CS+I-1),
diff --git a/lapack-netlib/SRC/cbdsqr.f b/lapack-netlib/SRC/cbdsqr.f
index 1d7c4e09d..cf1459ad2 100644
--- a/lapack-netlib/SRC/cbdsqr.f
+++ b/lapack-netlib/SRC/cbdsqr.f
@@ -204,6 +204,17 @@
 *>          algorithm through its inner loop. The algorithms stops
 *>          (and so fails to converge) if the number of passes
 *>          through the inner loop exceeds MAXITR*N**2.
+*>
+*> \endverbatim
+*
+*> \par Note:
+*  ===========
+*>
+*> \verbatim
+*>  Bug report from Cezary Dendek.
+*>  On November 3rd 2023, the INTEGER variable MAXIT = MAXITR*N**2 was
+*>  removed since it can easily overflow (for N greater than or equal
+*>  to 18,919). We instead use MAXITDIVN = MAXITR*N.
 *> \endverbatim
 *
 *  Authors:
@@ -214,7 +225,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup bdsqr
 *
 *  =====================================================================
       SUBROUTINE CBDSQR( UPLO, N, NCVT, NRU, NCC, D, E, VT, LDVT, U,
@@ -255,11 +266,11 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LOWER, ROTATE
-      INTEGER            I, IDIR, ISUB, ITER, J, LL, LLL, M, MAXIT, NM1,
-     $                   NM12, NM13, OLDLL, OLDM
+      INTEGER            I, IDIR, ISUB, ITER, ITERDIVN, J, LL, LLL, M,
+     $                   MAXITDIVN, NM1, NM12, NM13, OLDLL, OLDM
       REAL               ABSE, ABSS, COSL, COSR, CS, EPS, F, G, H, MU,
      $                   OLDCS, OLDSN, R, SHIFT, SIGMN, SIGMX, SINL,
-     $                   SINR, SLL, SMAX, SMIN, SMINL, SMINOA,
+     $                   SINR, SLL, SMAX, SMIN, SMINOA,
      $                   SN, THRESH, TOL, TOLMUL, UNFL
 *     ..
 *     .. External Functions ..
@@ -372,7 +383,7 @@
       DO 30 I = 1, N - 1
          SMAX = MAX( SMAX, ABS( E( I ) ) )
    30 CONTINUE
-      SMINL = ZERO
+      SMIN = ZERO
       IF( TOL.GE.ZERO ) THEN
 *
 *        Relative accuracy desired
@@ -389,20 +400,21 @@
    40    CONTINUE
    50    CONTINUE
          SMINOA = SMINOA / SQRT( REAL( N ) )
-         THRESH = MAX( TOL*SMINOA, MAXITR*N*N*UNFL )
+         THRESH = MAX( TOL*SMINOA, MAXITR*(N*(N*UNFL)) )
       ELSE
 *
 *        Absolute accuracy desired
 *
-         THRESH = MAX( ABS( TOL )*SMAX, MAXITR*N*N*UNFL )
+         THRESH = MAX( ABS( TOL )*SMAX, MAXITR*(N*(N*UNFL)) )
       END IF
 *
 *     Prepare for main iteration loop for the singular values
 *     (MAXIT is the maximum number of passes through the inner
 *     loop permitted before nonconvergence signalled.)
 *
-      MAXIT = MAXITR*N*N
-      ITER = 0
+      MAXITDIVN = MAXITR*N
+      ITERDIVN = 0
+      ITER = -1
       OLDLL = -1
       OLDM = -1
 *
@@ -418,15 +430,18 @@
 *
       IF( M.LE.1 )
      $   GO TO 160
-      IF( ITER.GT.MAXIT )
-     $   GO TO 200
+      IF( ITER.GE.N ) THEN
+         ITER = ITER - N
+         ITERDIVN = ITERDIVN + 1
+         IF( ITERDIVN.GE.MAXITDIVN )
+     $      GO TO 200
+      END IF
 *
 *     Find diagonal block of matrix to work on
 *
       IF( TOL.LT.ZERO .AND. ABS( D( M ) ).LE.THRESH )
      $   D( M ) = ZERO
       SMAX = ABS( D( M ) )
-      SMIN = SMAX
       DO 70 LLL = 1, M - 1
          LL = M - LLL
          ABSS = ABS( D( LL ) )
@@ -435,7 +450,6 @@
      $      D( LL ) = ZERO
          IF( ABSE.LE.THRESH )
      $      GO TO 80
-         SMIN = MIN( SMIN, ABSS )
          SMAX = MAX( SMAX, ABSS, ABSE )
    70 CONTINUE
       LL = 0
@@ -517,14 +531,14 @@
 *           apply convergence criterion forward
 *
             MU = ABS( D( LL ) )
-            SMINL = MU
+            SMIN = MU
             DO 100 LLL = LL, M - 1
                IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN
                   E( LLL ) = ZERO
                   GO TO 60
                END IF
                MU = ABS( D( LLL+1 ) )*( MU / ( MU+ABS( E( LLL ) ) ) )
-               SMINL = MIN( SMINL, MU )
+               SMIN = MIN( SMIN, MU )
   100       CONTINUE
          END IF
 *
@@ -545,14 +559,14 @@
 *           apply convergence criterion backward
 *
             MU = ABS( D( M ) )
-            SMINL = MU
+            SMIN = MU
             DO 110 LLL = M - 1, LL, -1
                IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN
                   E( LLL ) = ZERO
                   GO TO 60
                END IF
                MU = ABS( D( LLL ) )*( MU / ( MU+ABS( E( LLL ) ) ) )
-               SMINL = MIN( SMINL, MU )
+               SMIN = MIN( SMIN, MU )
   110       CONTINUE
          END IF
       END IF
@@ -562,7 +576,7 @@
 *     Compute shift.  First, test if shifting would ruin relative
 *     accuracy, and if so set the shift to zero.
 *
-      IF( TOL.GE.ZERO .AND. N*TOL*( SMINL / SMAX ).LE.
+      IF( TOL.GE.ZERO .AND. N*TOL*( SMIN / SMAX ).LE.
      $    MAX( EPS, HNDRTH*TOL ) ) THEN
 *
 *        Use a zero shift to avoid loss of relative accuracy
diff --git a/lapack-netlib/SRC/cgebal.f b/lapack-netlib/SRC/cgebal.f
index 5d1ebb026..3f54d3937 100644
--- a/lapack-netlib/SRC/cgebal.f
+++ b/lapack-netlib/SRC/cgebal.f
@@ -85,6 +85,7 @@
 *> \verbatim
 *>          ILO is INTEGER
 *> \endverbatim
+*>
 *> \param[out] IHI
 *> \verbatim
 *>          IHI is INTEGER
@@ -154,6 +155,9 @@
 *>
 *>  Modified by Tzu-Yi Chen, Computer Science Division, University of
 *>    California at Berkeley, USA
+*>
+*>  Refactored by Evert Provoost, Department of Computer Science,
+*>    KU Leuven, Belgium
 *> \endverbatim
 *>
 *  =====================================================================
@@ -183,8 +187,8 @@
       PARAMETER          ( FACTOR = 0.95E+0 )
 *     ..
 *     .. Local Scalars ..
-      LOGICAL            NOCONV
-      INTEGER            I, ICA, IEXC, IRA, J, K, L, M
+      LOGICAL            NOCONV, CANSWAP
+      INTEGER            I, ICA, IRA, J, K, L
       REAL               C, CA, F, G, R, RA, S, SFMAX1, SFMAX2, SFMIN1,
      $                   SFMIN2
 *     ..
@@ -195,10 +199,10 @@
       EXTERNAL           SISNAN, LSAME, ICAMAX, SLAMCH, SCNRM2
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CSSCAL, CSWAP, XERBLA
+      EXTERNAL           XERBLA, CSSCAL, CSWAP
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          ABS, AIMAG, MAX, MIN, REAL
+      INTRINSIC          ABS, REAL, AIMAG, MAX, MIN
 *
 *     Test the input parameters
 *
@@ -216,176 +220,194 @@
          RETURN
       END IF
 *
-      K = 1
-      L = N
+*     Quick returns.
 *
-      IF( N.EQ.0 )
-     $   GO TO 210
+      IF( N.EQ.0 ) THEN
+         ILO = 1
+         IHI = 0
+         RETURN
+      END IF
 *
       IF( LSAME( JOB, 'N' ) ) THEN
-         DO 10 I = 1, N
+         DO I = 1, N
             SCALE( I ) = ONE
-   10    CONTINUE
-         GO TO 210
+         END DO
+         ILO = 1
+         IHI = N
+         RETURN
       END IF
 *
-      IF( LSAME( JOB, 'S' ) )
-     $   GO TO 120
-*
-*     Permutation to isolate eigenvalues if possible
-*
-      GO TO 50
-*
-*     Row and column exchange.
-*
-   20 CONTINUE
-      SCALE( M ) = J
-      IF( J.EQ.M )
-     $   GO TO 30
-*
-      CALL CSWAP( L, A( 1, J ), 1, A( 1, M ), 1 )
-      CALL CSWAP( N-K+1, A( J, K ), LDA, A( M, K ), LDA )
-*
-   30 CONTINUE
-      GO TO ( 40, 80 )IEXC
-*
-*     Search for rows isolating an eigenvalue and push them down.
-*
-   40 CONTINUE
-      IF( L.EQ.1 )
-     $   GO TO 210
-      L = L - 1
-*
-   50 CONTINUE
-      DO 70 J = L, 1, -1
+*     Permutation to isolate eigenvalues if possible.
 *
-         DO 60 I = 1, L
-            IF( I.EQ.J )
-     $         GO TO 60
-            IF( REAL( A( J, I ) ).NE.ZERO .OR. AIMAG( A( J, I ) ).NE.
-     $          ZERO )GO TO 70
-   60    CONTINUE
-*
-         M = L
-         IEXC = 1
-         GO TO 20
-   70 CONTINUE
-*
-      GO TO 90
+      K = 1
+      L = N
 *
-*     Search for columns isolating an eigenvalue and push them left.
+      IF( .NOT.LSAME( JOB, 'S' ) ) THEN
 *
-   80 CONTINUE
-      K = K + 1
+*        Row and column exchange.
 *
-   90 CONTINUE
-      DO 110 J = K, L
+         NOCONV = .TRUE.
+         DO WHILE( NOCONV )
+*
+*           Search for rows isolating an eigenvalue and push them down.
+*
+            NOCONV = .FALSE.
+            DO I = L, 1, -1
+               CANSWAP = .TRUE.
+               DO J = 1, L
+                  IF( I.NE.J .AND. ( REAL( A( I, J ) ).NE.ZERO .OR.
+     $                AIMAG( A( I, J ) ).NE.ZERO ) ) THEN
+                     CANSWAP = .FALSE.
+                     EXIT
+                  END IF
+               END DO
+*
+               IF( CANSWAP ) THEN
+                  SCALE( L ) = I
+                  IF( I.NE.L ) THEN
+                     CALL CSWAP( L, A( 1, I ), 1, A( 1, L ), 1 )
+                     CALL CSWAP( N-K+1, A( I, K ), LDA, A( L, K ), LDA )
+                  END IF
+                  NOCONV = .TRUE.
+*
+                  IF( L.EQ.1 ) THEN
+                     ILO = 1
+                     IHI = 1
+                     RETURN
+                  END IF
+*
+                  L = L - 1
+               END IF
+            END DO
+*
+         END DO
+
+         NOCONV = .TRUE.
+         DO WHILE( NOCONV )
+*
+*           Search for columns isolating an eigenvalue and push them left.
+*
+            NOCONV = .FALSE.
+            DO J = K, L
+               CANSWAP = .TRUE.
+               DO I = K, L
+                  IF( I.NE.J .AND. ( REAL( A( I, J ) ).NE.ZERO .OR.
+     $                AIMAG( A( I, J ) ).NE.ZERO ) ) THEN
+                     CANSWAP = .FALSE.
+                     EXIT
+                  END IF
+               END DO
+*
+               IF( CANSWAP ) THEN
+                  SCALE( K ) = J
+                  IF( J.NE.K ) THEN
+                     CALL CSWAP( L, A( 1, J ), 1, A( 1, K ), 1 )
+                     CALL CSWAP( N-K+1, A( J, K ), LDA, A( K, K ), LDA )
+                  END IF
+                  NOCONV = .TRUE.
+*
+                  K = K + 1
+               END IF
+            END DO
+*
+         END DO
 *
-         DO 100 I = K, L
-            IF( I.EQ.J )
-     $         GO TO 100
-            IF( REAL( A( I, J ) ).NE.ZERO .OR. AIMAG( A( I, J ) ).NE.
-     $          ZERO )GO TO 110
-  100    CONTINUE
+      END IF
 *
-         M = K
-         IEXC = 2
-         GO TO 20
-  110 CONTINUE
+*     Initialize SCALE for non-permuted submatrix.
 *
-  120 CONTINUE
-      DO 130 I = K, L
+      DO I = K, L
          SCALE( I ) = ONE
-  130 CONTINUE
+      END DO
 *
-      IF( LSAME( JOB, 'P' ) )
-     $   GO TO 210
+*     If we only had to permute, we are done.
+*
+      IF( LSAME( JOB, 'P' ) ) THEN
+         ILO = K
+         IHI = L
+         RETURN
+      END IF
 *
 *     Balance the submatrix in rows K to L.
 *
-*     Iterative loop for norm reduction
+*     Iterative loop for norm reduction.
 *
       SFMIN1 = SLAMCH( 'S' ) / SLAMCH( 'P' )
       SFMAX1 = ONE / SFMIN1
       SFMIN2 = SFMIN1*SCLFAC
       SFMAX2 = ONE / SFMIN2
-  140 CONTINUE
-      NOCONV = .FALSE.
-*
-      DO 200 I = K, L
-*
-         C = SCNRM2( L-K+1, A( K, I ), 1 )
-         R = SCNRM2( L-K+1, A( I , K ), LDA )
-         ICA = ICAMAX( L, A( 1, I ), 1 )
-         CA = ABS( A( ICA, I ) )
-         IRA = ICAMAX( N-K+1, A( I, K ), LDA )
-         RA = ABS( A( I, IRA+K-1 ) )
-*
-*        Guard against zero C or R due to underflow.
-*
-         IF( C.EQ.ZERO .OR. R.EQ.ZERO )
-     $      GO TO 200
-         G = R / SCLFAC
-         F = ONE
-         S = C + R
-  160    CONTINUE
-         IF( C.GE.G .OR. MAX( F, C, CA ).GE.SFMAX2 .OR.
-     $       MIN( R, G, RA ).LE.SFMIN2 )GO TO 170
-            IF( SISNAN( C+F+CA+R+G+RA ) ) THEN
 *
-*           Exit if NaN to avoid infinite loop
+      NOCONV = .TRUE.
+      DO WHILE( NOCONV )
+         NOCONV = .FALSE.
 *
-            INFO = -3
-            CALL XERBLA( 'CGEBAL', -INFO )
-            RETURN
-         END IF
-         F = F*SCLFAC
-         C = C*SCLFAC
-         CA = CA*SCLFAC
-         R = R / SCLFAC
-         G = G / SCLFAC
-         RA = RA / SCLFAC
-         GO TO 160
-*
-  170    CONTINUE
-         G = C / SCLFAC
-  180    CONTINUE
-         IF( G.LT.R .OR. MAX( R, RA ).GE.SFMAX2 .OR.
-     $       MIN( F, C, G, CA ).LE.SFMIN2 )GO TO 190
-         F = F / SCLFAC
-         C = C / SCLFAC
-         G = G / SCLFAC
-         CA = CA / SCLFAC
-         R = R*SCLFAC
-         RA = RA*SCLFAC
-         GO TO 180
-*
-*        Now balance.
-*
-  190    CONTINUE
-         IF( ( C+R ).GE.FACTOR*S )
-     $      GO TO 200
-         IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN
-            IF( F*SCALE( I ).LE.SFMIN1 )
-     $         GO TO 200
-         END IF
-         IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN
-            IF( SCALE( I ).GE.SFMAX1 / F )
-     $         GO TO 200
-         END IF
-         G = ONE / F
-         SCALE( I ) = SCALE( I )*F
-         NOCONV = .TRUE.
+         DO I = K, L
 *
-         CALL CSSCAL( N-K+1, G, A( I, K ), LDA )
-         CALL CSSCAL( L, F, A( 1, I ), 1 )
+            C = SCNRM2( L-K+1, A( K, I ), 1 )
+            R = SCNRM2( L-K+1, A( I, K ), LDA )
+            ICA = ICAMAX( L, A( 1, I ), 1 )
+            CA = ABS( A( ICA, I ) )
+            IRA = ICAMAX( N-K+1, A( I, K ), LDA )
+            RA = ABS( A( I, IRA+K-1 ) )
 *
-  200 CONTINUE
+*           Guard against zero C or R due to underflow.
+*
+            IF( C.EQ.ZERO .OR. R.EQ.ZERO ) CYCLE
+*
+*           Exit if NaN to avoid infinite loop
 *
-      IF( NOCONV )
-     $   GO TO 140
+            IF( SISNAN( C+CA+R+RA ) ) THEN
+               INFO = -3
+               CALL XERBLA( 'CGEBAL', -INFO )
+               RETURN
+            END IF
+*
+            G = R / SCLFAC
+            F = ONE
+            S = C + R
+*
+            DO WHILE( C.LT.G .AND. MAX( F, C, CA ).LT.SFMAX2 .AND.
+     $                MIN( R, G, RA ).GT.SFMIN2 )
+               F = F*SCLFAC
+               C = C*SCLFAC
+               CA = CA*SCLFAC
+               R = R / SCLFAC
+               G = G / SCLFAC
+               RA = RA / SCLFAC
+            END DO
+*
+            G = C / SCLFAC
+*
+            DO WHILE( G.GE.R .AND. MAX( R, RA ).LT.SFMAX2 .AND.
+     $                MIN( F, C, G, CA ).GT.SFMIN2 )
+               F = F / SCLFAC
+               C = C / SCLFAC
+               G = G / SCLFAC
+               CA = CA / SCLFAC
+               R = R*SCLFAC
+               RA = RA*SCLFAC
+            END DO
+*
+*           Now balance.
+*
+            IF( ( C+R ).GE.FACTOR*S ) CYCLE
+            IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN
+               IF( F*SCALE( I ).LE.SFMIN1 ) CYCLE
+            END IF
+            IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN
+               IF( SCALE( I ).GE.SFMAX1 / F ) CYCLE
+            END IF
+            G = ONE / F
+            SCALE( I ) = SCALE( I )*F
+            NOCONV = .TRUE.
+*
+            CALL CSSCAL( N-K+1, G, A( I, K ), LDA )
+            CALL CSSCAL( L, F, A( 1, I ), 1 )
+*
+         END DO
+*
+      END DO
 *
-  210 CONTINUE
       ILO = K
       IHI = L
 *
diff --git a/lapack-netlib/SRC/cgebrd.f b/lapack-netlib/SRC/cgebrd.f
index 5687161a5..5920b1cf5 100644
--- a/lapack-netlib/SRC/cgebrd.f
+++ b/lapack-netlib/SRC/cgebrd.f
@@ -123,7 +123,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of the array WORK.  LWORK >= max(1,M,N).
+*>          The length of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(M,N), otherwise.
 *>          For optimum performance LWORK >= (M+N)*NB, where NB
 *>          is the optimal blocksize.
 *>
@@ -148,7 +149,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEcomputational
+*> \ingroup gebrd
 *
 *> \par Further Details:
 *  =====================
@@ -225,8 +226,8 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IINFO, J, LDWRKX, LDWRKY, LWKOPT, MINMN, NB,
-     $                   NBMIN, NX, WS
+      INTEGER            I, IINFO, J, LDWRKX, LDWRKY, LWKMIN, LWKOPT,
+     $                   MINMN, NB, NBMIN, NX, WS
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CGEBD2, CGEMM, CLABRD, XERBLA
@@ -236,16 +237,24 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
 *     Test the input parameters
 *
       INFO = 0
-      NB = MAX( 1, ILAENV( 1, 'CGEBRD', ' ', M, N, -1, -1 ) )
-      LWKOPT = ( M+N )*NB
-      WORK( 1 ) = REAL( LWKOPT )
+      MINMN = MIN( M, N )
+      IF( MINMN.EQ.0 ) THEN
+         LWKMIN = 1
+         LWKOPT = 1
+      ELSE
+         LWKMIN = MAX( M, N )
+         NB = MAX( 1, ILAENV( 1, 'CGEBRD', ' ', M, N, -1, -1 ) )
+         LWKOPT = ( M+N )*NB
+      END IF
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -253,7 +262,7 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, M, N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -10
       END IF
       IF( INFO.LT.0 ) THEN
@@ -265,7 +274,6 @@
 *
 *     Quick return if possible
 *
-      MINMN = MIN( M, N )
       IF( MINMN.EQ.0 ) THEN
          WORK( 1 ) = 1
          RETURN
@@ -284,7 +292,7 @@
 *        Determine when to switch from blocked to unblocked code.
 *
          IF( NX.LT.MINMN ) THEN
-            WS = ( M+N )*NB
+            WS = LWKOPT
             IF( LWORK.LT.WS ) THEN
 *
 *              Not enough work space for the optimal NB, consider using
@@ -343,7 +351,7 @@
 *
       CALL CGEBD2( M-I+1, N-I+1, A( I, I ), LDA, D( I ), E( I ),
      $             TAUQ( I ), TAUP( I ), WORK, IINFO )
-      WORK( 1 ) = WS
+      WORK( 1 ) = SROUNDUP_LWORK( WS )
       RETURN
 *
 *     End of CGEBRD
diff --git a/lapack-netlib/SRC/cgecon.f b/lapack-netlib/SRC/cgecon.f
index 6f426c2ab..e018b18bb 100644
--- a/lapack-netlib/SRC/cgecon.f
+++ b/lapack-netlib/SRC/cgecon.f
@@ -105,8 +105,15 @@
 *> \verbatim
 *>          INFO is INTEGER
 *>          = 0:  successful exit
-*>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          =-5:  if ANORM is NAN or negative.
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value.
+*>                NaNs are illegal values for ANORM, and they propagate to
+*>                the output parameter RCOND.
+*>                Infinity is illegal for ANORM, and it propagates to the output
+*>                parameter RCOND as 0.
+*>          = 1:  if RCOND = NaN, or
+*>                   RCOND = Inf, or
+*>                   the computed norm of the inverse of A is 0.
+*>                In the latter, RCOND = 0 is returned.
 *> \endverbatim
 *
 *  Authors:
@@ -117,7 +124,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEcomputational
+*> \ingroup gecon
 *
 *  =====================================================================
       SUBROUTINE CGECON( NORM, N, A, LDA, ANORM, RCOND, WORK, RWORK,
@@ -147,7 +154,7 @@
       LOGICAL            ONENRM
       CHARACTER          NORMIN
       INTEGER            IX, KASE, KASE1
-      REAL               AINVNM, SCALE, SL, SMLNUM, SU
+      REAL               AINVNM, SCALE, SL, SMLNUM, SU, HUGEVAL
       COMPLEX            ZDUM
 *     ..
 *     .. Local Arrays ..
@@ -172,6 +179,8 @@
       CABS1( ZDUM ) = ABS( REAL( ZDUM ) ) + ABS( AIMAG( ZDUM ) )
 *     ..
 *     .. Executable Statements ..
+*
+      HUGEVAL = SLAMCH( 'Overflow' )
 *
 *     Test the input parameters.
 *
@@ -183,7 +192,7 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF( ANORM.LT.ZERO .OR. SISNAN( ANORM ) ) THEN
+      ELSE IF( ANORM.LT.ZERO ) THEN
          INFO = -5
       END IF
       IF( INFO.NE.0 ) THEN
@@ -199,6 +208,13 @@
          RETURN
       ELSE IF( ANORM.EQ.ZERO ) THEN
          RETURN
+      ELSE IF( SISNAN( ANORM ) ) THEN
+         RCOND = ANORM
+         INFO = -5
+         RETURN
+      ELSE IF( ANORM.GT.HUGEVAL ) THEN
+         INFO = -5
+         RETURN
       END IF
 *
       SMLNUM = SLAMCH( 'Safe minimum' )
@@ -256,8 +272,17 @@
 *
 *     Compute the estimate of the reciprocal condition number.
 *
-      IF( AINVNM.NE.ZERO )
-     $   RCOND = ( ONE / AINVNM ) / ANORM
+      IF( AINVNM.NE.ZERO ) THEN
+         RCOND = ( ONE / AINVNM ) / ANORM
+      ELSE
+         INFO = 1
+         RETURN
+      END IF
+*
+*     Check for NaNs and Infs
+*
+      IF( SISNAN( RCOND ) .OR. RCOND.GT.HUGEVAL )
+     $   INFO = 1
 *
    20 CONTINUE
       RETURN
diff --git a/lapack-netlib/SRC/cgedmd.c b/lapack-netlib/SRC/cgedmd.c
new file mode 100644
index 000000000..570395c7b
--- /dev/null
+++ b/lapack-netlib/SRC/cgedmd.c
@@ -0,0 +1,1670 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))  /* compares only the first min(c,d) characters */
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }  /* copies at most min(C,D) chars, stops at a NUL in B, writes no padding */
+#define sig_die(s, kill) { exit(1); }  /* both arguments are ignored */
+#define s_stop(s, n) {exit(0);}  /* both arguments are ignored */
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}  /* statement macro: the result is stored through R */
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}  /* statement macro: the result is stored through R */
+#define myexit_() break;  /* expands to a bare break statement (semicolon included) */
+#define mycycle_() continue;  /* expands to a bare continue statement (semicolon included) */
+#define myceiling_(w) {ceil(w)}  /* note: the expansion is wrapped in braces */
+#define myhuge_(w) {HUGE_VAL}  /* note: the expansion is wrapped in braces; w is ignored */
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)  /* always forwards to the double-precision dmaxloc_ */
+
+/* procedure parameter types for -A and -C++: L_fp points to a function returning logical */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);  /* C++ spelling: variadic parameter list */
+#else
+typedef logical (*L_fp)();  /* C spelling: parameter list left unspecified */
+#endif
+
+static float spow_ui(float x, integer n) {
+	/* x raised to the integer power n by repeated squaring; n < 0 works on 1/x. */
+	float acc = 1.0; unsigned long int bits;
+	if (n == 0) return acc;
+	if (n < 0) { n = -n; x = 1/x; }
+	for (bits = n; ; x *= x) {
+		if (bits & 01) acc *= x;	/* low bit set: fold the current power in */
+		bits >>= 1;
+		if (bits == 0) break;	/* exponent exhausted: skip the final squaring */
+	}
+	return acc;
+}
+static double dpow_ui(double x, integer n) {
+	/* x raised to the integer power n by repeated squaring; n < 0 works on 1/x. */
+	double acc = 1.0; unsigned long int bits;
+	if (n == 0) return acc;
+	if (n < 0) { n = -n; x = 1/x; }
+	for (bits = n; ; x *= x) {
+		if (bits & 01) acc *= x;	/* low bit set: fold the current power in */
+		bits >>= 1;
+		if (bits == 0) break;	/* exponent exhausted: skip the final squaring */
+	}
+	return acc;
+}
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) {	/* x**n for single-precision complex x and integer n */
+	float re = x.r, im = x.i, pre = 1.0, pim = 0.0, t; unsigned long int u;
+	if(n != 0) {	/* no complex operators in this branch, so each product is expanded with its cross terms */
+		if(n < 0) { n = -n; t = re*re + im*im; re = re/t; im = -im/t; }	/* 1/x = conjg(x)/|x|^2 */
+		for(u = n; ; ) {
+			if(u & 01) { t = pre*re - pim*im; pim = pre*im + pim*re; pre = t; }	/* pow *= x */
+			if(u >>= 1) { t = re*re - im*im; im = 2*re*im; re = t; }	/* x *= x */
+			else break;
+		}
+	}
+	_Fcomplex p={pre, pim};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) {
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {	/* x**n for double-precision complex x and integer n */
+	double re = x._Val[0], im = x._Val[1], pre = 1.0, pim = 0.0, t; unsigned long int u;
+	if(n != 0) {	/* no complex operators in this branch, so each product is expanded with its cross terms */
+		if(n < 0) { n = -n; t = re*re + im*im; re = re/t; im = -im/t; }	/* 1/x = conjg(x)/|x|^2 */
+		for(u = n; ; ) {
+			if(u & 01) { t = pre*re - pim*im; pim = pre*im + pim*re; pre = t; }	/* pow *= x */
+			if(u >>= 1) { t = re*re - im*im; im = 2*re*im; re = t; }	/* x *= x */
+			else break;
+		}
+	}
+	_Dcomplex p = {pre, pim};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) {
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) {	/* integer power x**n computed entirely in integer arithmetic */
+	integer pow; unsigned long int u;
+	if (n <= 0) {
+		if (n == 0 || x == 1) pow = 1;
+		else if (x != -1) pow = x == 0 ? 1/x : 0;	/* n < 0: result is 0 unless x == 0, which evaluates 1/0 (NOTE(review): presumably an intentional trap -- confirm) */
+		else n = -n;	/* x == -1 with n < 0: continue into the loop below with |n| */
+	}
+	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {	/* second clause is equivalent to: n != 0 && x == -1 */
+		u = n;
+		for(pow = 1; ; ) {	/* square-and-multiply over the bits of u, lowest bit first */
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{	/* 1-based position, counted from s, of the first largest entry among w[s-1..e-1]; n is not used */
+	integer pos, best = s; double top = w[s-1];
+	for (pos = s+1; pos <= e; pos++)
+		if (w[pos-1] > top) { best = pos; top = w[pos-1]; }
+	return best-s+1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{	/* 1-based position, counted from s, of the first largest entry among w[s-1..e-1]; n is not used */
+	integer pos, best = s; float top = w[s-1];
+	for (pos = s+1; pos <= e; pos++)
+		if (w[pos-1] > top) { best = pos; top = w[pos-1]; }
+	return best-s+1;
+}
+/* Conjugated dot product: *z = sum over i < n of conjg(x[i*incx]) * y[i*incy]. */
+/* NOTE(review): a negative increment indexes before the array base here; */
+/* reference BLAS walks from the far end instead -- confirm callers. */
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	/* No complex operators are used in this branch, so the product is */
+	/* expanded by hand: re = ar*br - ai*bi, im = ar*bi + ai*br.  One */
+	/* strided loop also covers unit stride, because i*1 == i. */
+	_Fcomplex zdotc = {0.0, 0.0};
+	for (i=0;i<n;i++) { /* zdotc = zdotc + conjg(x(i))* y(i) */
+		_Fcomplex xv = conjf(Cf(&x[i*incx])), yv = Cf(&y[i*incy]);
+		zdotc._Val[0] += xv._Val[0] * yv._Val[0] - xv._Val[1] * yv._Val[1];
+		zdotc._Val[1] += xv._Val[0] * yv._Val[1] + xv._Val[1] * yv._Val[0];
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + conjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + conjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+/* Conjugated dot product: *z = sum over i < n of conjg(x[i*incx]) * y[i*incy]. */
+/* NOTE(review): a negative increment indexes before the array base here; */
+/* reference BLAS walks from the far end instead -- confirm callers. */
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	/* No complex operators are used in this branch, so the product is */
+	/* expanded by hand: re = ar*br - ai*bi, im = ar*bi + ai*br.  One */
+	/* strided loop also covers unit stride, because i*1 == i. */
+	_Dcomplex zdotc = {0.0, 0.0};
+	for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+		_Dcomplex xv = conj(Cd(&x[i*incx])), yv = Cd(&y[i*incy]);
+		zdotc._Val[0] += xv._Val[0] * yv._Val[0] - xv._Val[1] * yv._Val[1];
+		zdotc._Val[1] += xv._Val[0] * yv._Val[1] + xv._Val[1] * yv._Val[0];
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/* Unconjugated dot product: *z = sum over i < n of x[i*incx] * y[i*incy]. */
+/* NOTE(review): a negative increment indexes before the array base here; */
+/* reference BLAS walks from the far end instead -- confirm callers. */
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	/* No complex operators are used in this branch, so the product is */
+	/* expanded by hand: re = ar*br - ai*bi, im = ar*bi + ai*br.  One */
+	/* strided loop also covers unit stride, because i*1 == i. */
+	_Fcomplex zdotc = {0.0, 0.0};
+	for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i), no conjugation */
+		_Fcomplex xv = Cf(&x[i*incx]), yv = Cf(&y[i*incy]);
+		zdotc._Val[0] += xv._Val[0] * yv._Val[0] - xv._Val[1] * yv._Val[1];
+		zdotc._Val[1] += xv._Val[0] * yv._Val[1] + xv._Val[1] * yv._Val[0];
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i), no conjugation */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i), no conjugation */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+/* Unconjugated dot product: *z = sum over i < n of x[i*incx] * y[i*incy]. */
+/* NOTE(review): a negative increment indexes before the array base here; */
+/* reference BLAS walks from the far end instead -- confirm callers. */
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	/* No complex operators are used in this branch, so the product is */
+	/* expanded by hand: re = ar*br - ai*bi, im = ar*bi + ai*br.  One */
+	/* strided loop also covers unit stride, because i*1 == i. */
+	_Dcomplex zdotc = {0.0, 0.0};
+	for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i), no conjugation */
+		_Dcomplex xv = Cd(&x[i*incx]), yv = Cd(&y[i*incy]);
+		zdotc._Val[0] += xv._Val[0] * yv._Val[0] - xv._Val[1] * yv._Val[1];
+		zdotc._Val[1] += xv._Val[0] * yv._Val[1] + xv._Val[1] * yv._Val[0];
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i), no conjugation */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i), no conjugation */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values: integer constants held in variables so their addresses can be passed (e.g. &c_n1 in the cgesvd_ workspace query) */
+
+static integer c_n1 = -1;
+static integer c__1 = 1;
+static integer c__0 = 0;
+
+/* Subroutine */ int cgedmd_(char *jobs, char *jobz, char *jobr, char *jobf, 
+	integer *whtsvd, integer *m, integer *n, complex *x, integer *ldx, 
+	complex *y, integer *ldy, integer *nrnk, real *tol, integer *k, 
+	complex *eigs, complex *z__, integer *ldz, real *res, complex *b, 
+	integer *ldb, complex *w, integer *ldw, complex *s, integer *lds, 
+	complex *zwork, integer *lzwork, real *rwork, integer *lrwork, 
+	integer *iwork, integer *liwork, integer *info)
+{
+    /* System generated locals */
+    integer x_dim1, x_offset, y_dim1, y_offset, z_dim1, z_offset, b_dim1, 
+	    b_offset, w_dim1, w_offset, s_dim1, s_offset, i__1, i__2, i__3, 
+	    i__4, i__5;
+    real r__1, r__2;
+    complex q__1, q__2;
+
+    /* Local variables */
+    complex zone;
+    real zero, ssum;
+    integer info1, info2;
+    real xscl1, xscl2;
+    integer i__, j;
+    real scale;
+    extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, 
+	    integer *, complex *, complex *, integer *, complex *, integer *, 
+	    complex *, complex *, integer *), cgeev_(char *, 
+	    char *, integer *, complex *, integer *, complex *, complex *, 
+	    integer *, complex *, integer *, complex *, integer *, real *, 
+	    integer *);
+    extern logical lsame_(char *, char *);
+    logical badxy;
+    real small;
+    char jobzl[1];
+    extern /* Subroutine */ int caxpy_(integer *, complex *, complex *, 
+	    integer *, complex *, integer *);
+    logical wntex;
+    complex zzero;
+    extern real scnrm2_(integer *, complex *, integer *);
+    extern /* Subroutine */ int cgesdd_(char *, integer *, integer *, complex 
+	    *, integer *, real *, complex *, integer *, complex *, integer *, 
+	    complex *, integer *, real *, integer *, integer *), 
+	    clascl_(char *, integer *, integer *, real *, real *, integer *, 
+	    integer *, complex *, integer *, integer *);
+    extern integer icamax_(integer *, complex *, integer *);
+    extern real slamch_(char *);
+    extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer 
+	    *), cgesvd_(char *, char *, integer *, integer *, complex *, 
+	    integer *, real *, complex *, integer *, complex *, integer *, 
+	    complex *, integer *, real *, integer *), clacpy_(
+	    char *, integer *, integer *, complex *, integer *, complex *, 
+	    integer *), xerbla_(char *, integer *);
+    char t_or_n__[1];
+    extern /* Subroutine */ int cgejsv_(char *, char *, char *, char *, char *
+	    , char *, integer *, integer *, complex *, integer *, real *, 
+	    complex *, integer *, complex *, integer *, complex *, integer *, 
+	    real *, integer *, integer *, integer *), classq_(integer *, complex *, integer *, 
+	    real *, real *);
+    logical sccolx, sccoly;
+    extern logical sisnan_(real *);
+    integer lwrsdd, mwrsdd, iminwr;
+    logical wntref, wntvec;
+    real rootsc;
+    integer lwrkev, mlwork, mwrkev, numrnk, olwork, lwrsvd, mwrsvd, mlrwrk;
+    logical lquery, wntres;
+    char jsvopt[1];
+    integer lwrsvj, mwrsvj;
+    real rdummy[2];
+    extern /* Subroutine */ int mecago_();
+    integer lwrsvq, mwrsvq;
+    real ofl, one;
+    extern /* Subroutine */ int cgesvdq_(char *, char *, char *, char *, char 
+	    *, integer *, integer *, complex *, integer *, real *, complex *, 
+	    integer *, complex *, integer *, integer *, integer *, integer *, 
+	    complex *, integer *, real *, integer *, integer *);
+
+/*   March 2023 */
+/* ..... */
+/*      USE                   iso_fortran_env */
+/*      INTEGER, PARAMETER :: WP = real32 */
+/* ..... */
+/*     Scalar arguments */
+/*     Array arguments */
+/* ............................................................ */
+/*     Purpose */
+/*     ======= */
+/*     CGEDMD computes the Dynamic Mode Decomposition (DMD) for */
+/*     a pair of data snapshot matrices. For the input matrices */
+/*     X and Y such that Y = A*X with an inaccessible matrix */
+/*     A, CGEDMD computes a certain number of Ritz pairs of A using */
+/*     the standard Rayleigh-Ritz extraction from a subspace of */
+/*     range(X) that is determined using the leading left singular */
+/*     vectors of X. Optionally, CGEDMD returns the residuals */
+/*     of the computed Ritz pairs, the information needed for */
+/*     a refinement of the Ritz vectors, or the eigenvectors of */
+/*     the Exact DMD. */
+/*     For further details see the references listed */
+/*     below. For more details of the implementation see [3]. */
+
+/*     References */
+/*     ========== */
+/*     [1] P. Schmid: Dynamic mode decomposition of numerical */
+/*         and experimental data, */
+/*         Journal of Fluid Mechanics 656, 5-28, 2010. */
+/*     [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal */
+/*         decompositions: analysis and enhancements, */
+/*         SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. */
+/*     [3] Z. Drmac: A LAPACK implementation of the Dynamic */
+/*         Mode Decomposition I. Technical report. AIMDyn Inc. */
+/*         and LAPACK Working Note 298. */
+/*     [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. */
+/*         Brunton, N. Kutz: On Dynamic Mode Decomposition: */
+/*         Theory and Applications, Journal of Computational */
+/*         Dynamics 1(2), 391 -421, 2014. */
+
+/* ...................................................................... */
+/*     Developed and supported by: */
+/*     =========================== */
+/*     Developed and coded by Zlatko Drmac, Faculty of Science, */
+/*     University of Zagreb;  drmac@math.hr */
+/*     In cooperation with */
+/*     AIMdyn Inc., Santa Barbara, CA. */
+/*     and supported by */
+/*     - DARPA SBIR project "Koopman Operator-Based Forecasting */
+/*     for Nonstationary Processes from Near-Term, Limited */
+/*     Observational Data" Contract No: W31P4Q-21-C-0007 */
+/*     - DARPA PAI project "Physics-Informed Machine Learning */
+/*     Methodologies" Contract No: HR0011-18-9-0033 */
+/*     - DARPA MoDyL project "A Data-Driven, Operator-Theoretic */
+/*     Framework for Space-Time Analysis of Process Dynamics" */
+/*     Contract No: HR0011-16-C-0116 */
+/*     Any opinions, findings and conclusions or recommendations */
+/*     expressed in this material are those of the author and */
+/*     do not necessarily reflect the views of the DARPA SBIR */
+/*     Program Office */
+/* ============================================================ */
+/*     Distribution Statement A: */
+/*     Approved for Public Release, Distribution Unlimited. */
+/*     Cleared by DARPA on September 29, 2022 */
+/* ============================================================ */
+/* ...................................................................... */
+/*     Arguments */
+/*     ========= */
+/*     JOBS (input) CHARACTER*1 */
+/*     Determines whether the initial data snapshots are scaled */
+/*     by a diagonal matrix. */
+/*     'S' :: The data snapshots matrices X and Y are multiplied */
+/*            with a diagonal matrix D so that X*D has unit */
+/*            nonzero columns (in the Euclidean 2-norm) */
+/*     'C' :: The snapshots are scaled as with the 'S' option. */
+/*            If it is found that an i-th column of X is zero */
+/*            vector and the corresponding i-th column of Y is */
+/*            non-zero, then the i-th column of Y is set to */
+/*            zero and a warning flag is raised. */
+/*     'Y' :: The data snapshots matrices X and Y are multiplied */
+/*            by a diagonal matrix D so that Y*D has unit */
+/*            nonzero columns (in the Euclidean 2-norm) */
+/*     'N' :: No data scaling. */
+/* ..... */
+/*     JOBZ (input) CHARACTER*1 */
+/*     Determines whether the eigenvectors (Koopman modes) will */
+/*     be computed. */
+/*     'V' :: The eigenvectors (Koopman modes) will be computed */
+/*            and returned in the matrix Z. */
+/*            See the description of Z. */
+/*     'F' :: The eigenvectors (Koopman modes) will be returned */
+/*            in factored form as the product X(:,1:K)*W, where X */
+/*            contains a POD basis (leading left singular vectors */
+/*            of the data matrix X) and W contains the eigenvectors */
+/*            of the corresponding Rayleigh quotient. */
+/*            See the descriptions of K, X, W, Z. */
+/*     'N' :: The eigenvectors are not computed. */
+/* ..... */
+/*     JOBR (input) CHARACTER*1 */
+/*     Determines whether to compute the residuals. */
+/*     'R' :: The residuals for the computed eigenpairs will be */
+/*            computed and stored in the array RES. */
+/*            See the description of RES. */
+/*            For this option to be legal, JOBZ must be 'V'. */
+/*     'N' :: The residuals are not computed. */
+/* ..... */
+/*     JOBF (input) CHARACTER*1 */
+/*     Specifies whether to store information needed for post- */
+/*     processing (e.g. computing refined Ritz vectors) */
+/*     'R' :: The matrix needed for the refinement of the Ritz */
+/*            vectors is computed and stored in the array B. */
+/*            See the description of B. */
+/*     'E' :: The unscaled eigenvectors of the Exact DMD are */
+/*            computed and returned in the array B. See the */
+/*            description of B. */
+/*     'N' :: No eigenvector refinement data is computed. */
+/* ..... */
+/*     WHTSVD (input) INTEGER, WHTSVD in { 1, 2, 3, 4 } */
+/*     Allows for a selection of the SVD algorithm from the */
+/*     LAPACK library. */
+/*     1 :: CGESVD (the QR SVD algorithm) */
+/*     2 :: CGESDD (the Divide and Conquer algorithm; if enough */
+/*          workspace available, this is the fastest option) */
+/*     3 :: CGESVDQ (the preconditioned QR SVD  ; this and 4 */
+/*          are the most accurate options) */
+/*     4 :: CGEJSV (the preconditioned Jacobi SVD; this and 3 */
+/*          are the most accurate options) */
+/*     For the four methods above, a significant difference in */
+/*     the accuracy of small singular values is possible if */
+/*     the snapshots vary in norm so that X is severely */
+/*     ill-conditioned. If small (smaller than EPS*||X||) */
+/*     singular values are of interest and JOBS=='N',  then */
+/*     the options (3, 4) give the most accurate results, where */
+/*     the option 4 is slightly better and with stronger */
+/*     theoretical background. */
+/*     If JOBS=='S', i.e. the columns of X will be normalized, */
+/*     then all methods give nearly equally accurate results. */
+/* ..... */
+/*     M (input) INTEGER, M>= 0 */
+/*     The state space dimension (the row dimension of X, Y). */
+/* ..... */
+/*     N (input) INTEGER, 0 <= N <= M */
+/*     The number of data snapshot pairs */
+/*     (the number of columns of X and Y). */
+/* ..... */
+/*     X (input/output) COMPLEX(KIND=WP) M-by-N array */
+/*   > On entry, X contains the data snapshot matrix X. It is */
+/*     assumed that the column norms of X are in the range of */
+/*     the normalized floating point numbers. */
+/*   < On exit, the leading K columns of X contain a POD basis, */
+/*     i.e. the leading K left singular vectors of the input */
+/*     data matrix X, U(:,1:K). All N columns of X contain all */
+/*     left singular vectors of the input matrix X. */
+/*     See the descriptions of K, Z and W. */
+/* ..... */
+/*     LDX (input) INTEGER, LDX >= M */
+/*     The leading dimension of the array X. */
+/* ..... */
+/*     Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array */
+/*   > On entry, Y contains the data snapshot matrix Y */
+/*   < On exit, */
+/*     If JOBR == 'R', the leading K columns of Y  contain */
+/*     the residual vectors for the computed Ritz pairs. */
+/*     See the description of RES. */
+/*     If JOBR == 'N', Y contains the original input data, */
+/*                     scaled according to the value of JOBS. */
+/* ..... */
+/*     LDY (input) INTEGER , LDY >= M */
+/*     The leading dimension of the array Y. */
+/* ..... */
+/*     NRNK (input) INTEGER */
+/*     Determines the mode how to compute the numerical rank, */
+/*     i.e. how to truncate small singular values of the input */
+/*     matrix X. On input, if */
+/*     NRNK = -1 :: i-th singular value sigma(i) is truncated */
+/*                  if sigma(i) <= TOL*sigma(1) */
+/*                  This option is recommended. */
+/*     NRNK = -2 :: i-th singular value sigma(i) is truncated */
+/*                  if sigma(i) <= TOL*sigma(i-1) */
+/*                  This option is included for R&D purposes. */
+/*                  It requires highly accurate SVD, which */
+/*                  may not be feasible. */
+/*     The numerical rank can be enforced by using positive */
+/*     value of NRNK as follows: */
+/*     0 < NRNK <= N :: at most NRNK largest singular values */
+/*     will be used. If the number of the computed nonzero */
+/*     singular values is less than NRNK, then only those */
+/*     nonzero values will be used and the actually used */
+/*     dimension is less than NRNK. The actual number of */
+/*     the nonzero singular values is returned in the variable */
+/*     K. See the descriptions of TOL and  K. */
+/* ..... */
+/*     TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */
+/*     The tolerance for truncating small singular values. */
+/*     See the description of NRNK. */
+/* ..... */
+/*     K (output) INTEGER,  0 <= K <= N */
+/*     The dimension of the POD basis for the data snapshot */
+/*     matrix X and the number of the computed Ritz pairs. */
+/*     The value of K is determined according to the rule set */
+/*     by the parameters NRNK and TOL. */
+/*     See the descriptions of NRNK and TOL. */
+/* ..... */
+/*     EIGS (output) COMPLEX(KIND=WP) N-by-1 array */
+/*     The leading K (K<=N) entries of EIGS contain */
+/*     the computed eigenvalues (Ritz values). */
+/*     See the descriptions of K, and Z. */
+/* ..... */
+/*     Z (workspace/output) COMPLEX(KIND=WP)  M-by-N array */
+/*     If JOBZ =='V' then Z contains the  Ritz vectors.  Z(:,i) */
+/*     is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. */
+/*     If JOBZ == 'F', then the Z(:,i)'s are given implicitly as */
+/*     the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i) */
+/*     is an eigenvector corresponding to EIGS(i). The columns */
+/*     of W(1:k,1:K) are the computed eigenvectors of the */
+/*     K-by-K Rayleigh quotient. */
+/*     See the descriptions of EIGS, X and W. */
+/* ..... */
+/*     LDZ (input) INTEGER , LDZ >= M */
+/*     The leading dimension of the array Z. */
+/* ..... */
+/*     RES (output) REAL(KIND=WP) N-by-1 array */
+/*     RES(1:K) contains the residuals for the K computed */
+/*     Ritz pairs, */
+/*     RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2. */
+/*     See the description of EIGS and Z. */
+/* ..... */
+/*     B (output) COMPLEX(KIND=WP)  M-by-N array. */
+/*     IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can */
+/*     be used for computing the refined vectors; see further */
+/*     details in the provided references. */
+/*     If JOBF == 'E', B(1:M,1:K) contains */
+/*     A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */
+/*     Exact DMD, up to scaling by the inverse eigenvalues. */
+/*     If JOBF =='N', then B is not referenced. */
+/*     See the descriptions of X, W, K. */
+/* ..... */
+/*     LDB (input) INTEGER, LDB >= M */
+/*     The leading dimension of the array B. */
+/* ..... */
+/*     W (workspace/output) COMPLEX(KIND=WP) N-by-N array */
+/*     On exit, W(1:K,1:K) contains the K computed */
+/*     eigenvectors of the matrix Rayleigh quotient. */
+/*     The Ritz vectors (returned in Z) are the */
+/*     product of X (containing a POD basis for the input */
+/*     matrix X) and W. See the descriptions of K, S, X and Z. */
+/*     W is also used as a workspace to temporarily store the */
+/*     right singular vectors of X. */
+/* ..... */
+/*     LDW (input) INTEGER, LDW >= N */
+/*     The leading dimension of the array W. */
+/* ..... */
+/*     S (workspace/output) COMPLEX(KIND=WP) N-by-N array */
+/*     The array S(1:K,1:K) is used for the matrix Rayleigh */
+/*     quotient. This content is overwritten during */
+/*     the eigenvalue decomposition by CGEEV. */
+/*     See the description of K. */
+/* ..... */
+/*     LDS (input) INTEGER, LDS >= N */
+/*     The leading dimension of the array S. */
+/* ..... */
+/*     ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array */
+/*     ZWORK is used as complex workspace in the complex SVD, as */
+/*     specified by WHTSVD (1,2, 3 or 4) and for CGEEV for computing */
+/*     the eigenvalues of a Rayleigh quotient. */
+/*     If the call to CGEDMD is only workspace query, then */
+/*     ZWORK(1) contains the minimal complex workspace length and */
+/*     ZWORK(2) is the optimal complex workspace length. */
+/*     Hence, the length of work is at least 2. */
+/*     See the description of LZWORK. */
+/* ..... */
+/*     LZWORK (input) INTEGER */
+/*     The minimal length of the workspace vector ZWORK. */
+/*     LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_CGEEV), */
+/*     where LZWORK_CGEEV = MAX( 1, 2*N )  and the minimal */
+/*     LZWORK_SVD is calculated as follows */
+/*     If WHTSVD == 1 :: CGESVD :: */
+/*        LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N)) */
+/*     If WHTSVD == 2 :: CGESDD :: */
+/*        LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) */
+/*     If WHTSVD == 3 :: CGESVDQ :: */
+/*        LZWORK_SVD = obtainable by a query */
+/*     If WHTSVD == 4 :: CGEJSV :: */
+/*        LZWORK_SVD = obtainable by a query */
+/*     If on entry LZWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     and the optimal workspace lengths and returns them in */
+/*     ZWORK(1) and ZWORK(2), respectively. */
+/* ..... */
+/*     RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array */
+/*     On exit, RWORK(1:N) contains the singular values of */
+/*     X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). */
+/*     If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain */
+/*     scaling factor RWORK(N+2)/RWORK(N+1) used to scale X */
+/*     and Y to avoid overflow in the SVD of X. */
+/*     This may be of interest if the scaling option is off */
+/*     and as many as possible smallest eigenvalues are */
+/*     desired to the highest feasible accuracy. */
+/*     If the call to CGEDMD is only workspace query, then */
+/*     RWORK(1) contains the minimal workspace length. */
+/*     See the description of LRWORK. */
+/* ..... */
+/*     LRWORK (input) INTEGER */
+/*     The minimal length of the workspace vector RWORK. */
+/*     LRWORK is calculated as follows: */
+/*     LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_CGEEV), where */
+/*     LRWORK_CGEEV = MAX(1,2*N) and LRWORK_SVD is the real workspace */
+/*     for the SVD subroutine determined by the input parameter */
+/*     WHTSVD. */
+/*     If WHTSVD == 1 :: CGESVD :: */
+/*        LRWORK_SVD = 5*MIN(M,N) */
+/*     If WHTSVD == 2 :: CGESDD :: */
+/*        LRWORK_SVD =  MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), */
+/*        2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) ) */
+/*     If WHTSVD == 3 :: CGESVDQ :: */
+/*        LRWORK_SVD = obtainable by a query */
+/*     If WHTSVD == 4 :: CGEJSV :: */
+/*        LRWORK_SVD = obtainable by a query */
+/*     If on entry LRWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     real workspace length and returns it in RWORK(1). */
+/* ..... */
+/*     IWORK (workspace/output) INTEGER LIWORK-by-1 array */
+/*     Workspace that is required only if WHTSVD equals */
+/*     2 , 3 or 4. (See the description of WHTSVD). */
+/*     If on entry LZWORK = -1 or LIWORK = -1, then the */
+/*     minimal length of IWORK is computed and returned in */
+/*     IWORK(1). See the description of LIWORK. */
+/* ..... */
+/*     LIWORK (input) INTEGER */
+/*     The minimal length of the workspace vector IWORK. */
+/*     If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */
+/*     If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) */
+/*     If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) */
+/*     If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) */
+/*     If on entry LIWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     and the optimal workspace lengths for  ZWORK, RWORK and */
+/*     IWORK. See the descriptions of ZWORK, RWORK and IWORK. */
+/* ..... */
+/*     INFO (output) INTEGER */
+/*     -i < 0 :: On entry, the i-th argument had an */
+/*               illegal value */
+/*        = 0 :: Successful return. */
+/*        = 1 :: Void input. Quick exit (M=0 or N=0). */
+/*        = 2 :: The SVD computation of X did not converge. */
+/*               Suggestion: Check the input data and/or */
+/*               repeat with different WHTSVD. */
+/*        = 3 :: The computation of the eigenvalues did not */
+/*               converge. */
+/*        = 4 :: If data scaling was requested on input and */
+/*               the procedure found inconsistency in the data */
+/*               such that for some column index i, */
+/*               X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */
+/*               to zero if JOBS=='C'. The computation proceeds */
+/*               with original or modified data and warning */
+/*               flag is set with INFO=4. */
+/* ............................................................. */
+/* ............................................................. */
+/*     Parameters */
+/*     ~~~~~~~~~~ */
+/*     Local scalars */
+/*     ~~~~~~~~~~~~~ */
+
+/*     Local arrays */
+/*     ~~~~~~~~~~~~ */
+/*     External functions (BLAS and LAPACK) */
+/*     ~~~~~~~~~~~~~~~~~ */
+/*     External subroutines (BLAS and LAPACK) */
+/*     ~~~~~~~~~~~~~~~~~~~~ */
+/*     Intrinsic functions */
+/*     ~~~~~~~~~~~~~~~~~~~ */
+/* ............................................................ */
+    /* Parameter adjustments */
+    x_dim1 = *ldx;
+    x_offset = 1 + x_dim1 * 1;
+    x -= x_offset;
+    y_dim1 = *ldy;
+    y_offset = 1 + y_dim1 * 1;
+    y -= y_offset;
+    --eigs;
+    z_dim1 = *ldz;
+    z_offset = 1 + z_dim1 * 1;
+    z__ -= z_offset;
+    --res;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    w_dim1 = *ldw;
+    w_offset = 1 + w_dim1 * 1;
+    w -= w_offset;
+    s_dim1 = *lds;
+    s_offset = 1 + s_dim1 * 1;
+    s -= s_offset;
+    --zwork;
+    --rwork;
+    --iwork;
+
+    /* Function Body */
+    zero = 0.f;
+    one = 1.f;
+    zzero.r = 0.f, zzero.i = 0.f;
+    zone.r = 1.f, zone.i = 0.f;
+
+/*    Test the input arguments */
+
+    wntres = lsame_(jobr, "R");
+    sccolx = lsame_(jobs, "S") || lsame_(jobs, "C");
+    sccoly = lsame_(jobs, "Y");
+    wntvec = lsame_(jobz, "V");
+    wntref = lsame_(jobf, "R");
+    wntex = lsame_(jobf, "E");
+    *info = 0;
+    lquery = *lzwork == -1 || *liwork == -1 || *lrwork == -1;
+
+    if (! (sccolx || sccoly || lsame_(jobs, "N"))) {
+	*info = -1;
+    } else if (! (wntvec || lsame_(jobz, "N") || lsame_(
+	    jobz, "F"))) {
+	*info = -2;
+    } else if (! (wntres || lsame_(jobr, "N")) || 
+	    wntres && ! wntvec) {
+	*info = -3;
+    } else if (! (wntref || wntex || lsame_(jobf, "N")))
+	     {
+	*info = -4;
+    } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == 
+	    4)) {
+	*info = -5;
+    } else if (*m < 0) {
+	*info = -6;
+    } else if (*n < 0 || *n > *m) {
+	*info = -7;
+    } else if (*ldx < *m) {
+	*info = -9;
+    } else if (*ldy < *m) {
+	*info = -11;
+    } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) {
+	*info = -12;
+    } else if (*tol < zero || *tol >= one) {
+	*info = -13;
+    } else if (*ldz < *m) {
+	*info = -17;
+    } else if ((wntref || wntex) && *ldb < *m) {
+	*info = -20;
+    } else if (*ldw < *n) {
+	*info = -22;
+    } else if (*lds < *n) {
+	*info = -24;
+    }
+
+    if (*info == 0) {
+/* Compute the minimal and the optimal workspace */
+/* requirements. Simulate running the code and */
+/* determine minimal and optimal sizes of the */
+/* workspace at any moment of the run. */
+	if (*n == 0) {
+/* Quick return. All output except K is void. */
+/* INFO=1 signals the void input. */
+/* In case of a workspace query, the default */
+/* minimal workspace lengths are returned. */
+	    if (lquery) {
+		iwork[1] = 1;
+		rwork[1] = 1.f;
+		zwork[1].r = 2.f, zwork[1].i = 0.f;
+		zwork[2].r = 2.f, zwork[2].i = 0.f;
+	    } else {
+		*k = 0;
+	    }
+	    *info = 1;
+	    return 0;
+	}
+	iminwr = 1;
+	mlrwrk = f2cmax(1,*n);
+	mlwork = 2;
+	olwork = 2;
+/*         SELECT CASE ( WHTSVD ) */
+	if (*whtsvd == 1) {
+/* The following is specified as the minimal */
+/* length of WORK in the definition of CGESVD: */
+/* MWRSVD = MAX(1,2*MIN(M,N)+MAX(M,N)) */
+/* Computing MAX */
+	    i__1 = 1, i__2 = (f2cmin(*m,*n) << 1) + f2cmax(*m,*n);
+	    mwrsvd = f2cmax(i__1,i__2);
+	    mlwork = f2cmax(mlwork,mwrsvd);
+/* Computing MAX */
+	    i__1 = mlrwrk, i__2 = *n + f2cmin(*m,*n) * 5;
+	    mlrwrk = f2cmax(i__1,i__2);
+	    if (lquery) {
+		cgesvd_("O", "S", m, n, &x[x_offset], ldx, &rwork[1], &b[
+			b_offset], ldb, &w[w_offset], ldw, &zwork[1], &c_n1, 
+			rdummy, &info1);
+		lwrsvd = (integer) zwork[1].r;
+		olwork = f2cmax(olwork,lwrsvd);
+	    }
+	} else if (*whtsvd == 2) {
+/* The following is specified as the minimal */
+/* length of WORK in the definition of CGESDD: */
+/* MWRSDD = 2*f2cmin(M,N)*f2cmin(M,N)+2*f2cmin(M,N)+f2cmax(M,N). */
+/* RWORK length: 5*MIN(M,N)*MIN(M,N)+7*MIN(M,N) */
+/* In LAPACK 3.10.1 RWORK is defined differently. */
+/* Below we take f2cmax over the two versions. */
+/* IMINWR = 8*MIN(M,N) */
+	    mwrsdd = (f2cmin(*m,*n) << 1) * f2cmin(*m,*n) + (f2cmin(*m,*n) << 1) + f2cmax(
+		    *m,*n);
+	    mlwork = f2cmax(mlwork,mwrsdd);
+	    iminwr = f2cmin(*m,*n) << 3;
+/* Computing MAX */
+/* Computing MAX */
+	    i__3 = f2cmin(*m,*n) * 5 * f2cmin(*m,*n) + f2cmin(*m,*n) * 7, i__4 = f2cmin(*
+		    m,*n) * 5 * f2cmin(*m,*n) + f2cmin(*m,*n) * 5, i__3 = f2cmax(i__3,
+		    i__4), i__4 = (f2cmax(*m,*n) << 1) * f2cmin(*m,*n) + (f2cmin(*m,*n)
+		     << 1) * f2cmin(*m,*n) + f2cmin(*m,*n);
+	    i__1 = mlrwrk, i__2 = *n + f2cmax(i__3,i__4);
+	    mlrwrk = f2cmax(i__1,i__2);
+	    if (lquery) {
+		cgesdd_("O", m, n, &x[x_offset], ldx, &rwork[1], &b[b_offset],
+			 ldb, &w[w_offset], ldw, &zwork[1], &c_n1, rdummy, &
+			iwork[1], &info1);
+/* Computing MAX */
+		i__1 = mwrsdd, i__2 = (integer) zwork[1].r;
+		lwrsdd = f2cmax(i__1,i__2);
+		olwork = f2cmax(olwork,lwrsdd);
+	    }
+	} else if (*whtsvd == 3) {
+	    cgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &rwork[
+		    1], &z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, &
+		    iwork[1], &c_n1, &zwork[1], &c_n1, rdummy, &c_n1, &info1);
+	    iminwr = iwork[1];
+	    mwrsvq = (integer) zwork[2].r;
+	    mlwork = f2cmax(mlwork,mwrsvq);
+/* Computing MAX */
+	    i__1 = mlrwrk, i__2 = *n + (integer) rdummy[0];
+	    mlrwrk = f2cmax(i__1,i__2);
+	    if (lquery) {
+		lwrsvq = (integer) zwork[1].r;
+		olwork = f2cmax(olwork,lwrsvq);
+	    }
+	} else if (*whtsvd == 4) {
+	    *(unsigned char *)jsvopt = 'J';
+	    cgejsv_("F", "U", jsvopt, "N", "N", "P", m, n, &x[x_offset], ldx, 
+		    &rwork[1], &z__[z_offset], ldz, &w[w_offset], ldw, &zwork[
+		    1], &c_n1, rdummy, &c_n1, &iwork[1], &info1);
+	    iminwr = iwork[1];
+	    mwrsvj = (integer) zwork[2].r;
+	    mlwork = f2cmax(mlwork,mwrsvj);
+/* Computing MAX */
+/* Computing MAX */
+	    i__3 = 7, i__4 = (integer) rdummy[0];
+	    i__1 = mlrwrk, i__2 = *n + f2cmax(i__3,i__4);
+	    mlrwrk = f2cmax(i__1,i__2);
+	    if (lquery) {
+		lwrsvj = (integer) zwork[1].r;
+		olwork = f2cmax(olwork,lwrsvj);
+	    }
+/*         END SELECT */
+	}
+	if (wntvec || wntex || lsame_(jobz, "F")) {
+	    *(unsigned char *)jobzl = 'V';
+	} else {
+	    *(unsigned char *)jobzl = 'N';
+	}
+/* Workspace calculation to the CGEEV call */
+/* Computing MAX */
+	i__1 = 1, i__2 = *n << 1;
+	mwrkev = f2cmax(i__1,i__2);
+	mlwork = f2cmax(mlwork,mwrkev);
+/* Computing MAX */
+	i__1 = mlrwrk, i__2 = *n + (*n << 1);
+	mlrwrk = f2cmax(i__1,i__2);
+	if (lquery) {
+	    cgeev_("N", jobzl, n, &s[s_offset], lds, &eigs[1], &w[w_offset], 
+		    ldw, &w[w_offset], ldw, &zwork[1], &c_n1, &rwork[1], &
+		    info1);
+/* LAPACK CALL */
+	    lwrkev = (integer) zwork[1].r;
+	    olwork = f2cmax(olwork,lwrkev);
+	    olwork = f2cmax(2,olwork);
+	}
+
+	if (*liwork < iminwr && ! lquery) {
+	    *info = -30;
+	}
+	if (*lrwork < mlrwrk && ! lquery) {
+	    *info = -28;
+	}
+	if (*lzwork < mlwork && ! lquery) {
+	    *info = -26;
+	}
+    }
+
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("CGEDMD", &i__1);
+	return 0;
+    } else if (lquery) {
+/*     Return minimal and optimal workspace sizes */
+	iwork[1] = iminwr;
+	rwork[1] = (real) mlrwrk;
+	zwork[1].r = (real) mlwork, zwork[1].i = 0.f;
+	zwork[2].r = (real) olwork, zwork[2].i = 0.f;
+	return 0;
+    }
+/* ............................................................ */
+
+    ofl = slamch_("O") * slamch_("P");
+    small = slamch_("S");
+    badxy = FALSE_;
+
+/*     <1> Optional scaling of the snapshots (columns of X, Y) */
+/*     ========================================================== */
+    if (sccolx) {
+/* The columns of X will be normalized. */
+/* To prevent overflows, the column norms of X are */
+/* carefully computed using CLASSQ. */
+	*k = 0;
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+/* WORK(i) = SCNRM2( M, X(1,i), 1 ) */
+	    scale = zero;
+	    classq_(m, &x[i__ * x_dim1 + 1], &c__1, &scale, &ssum);
+	    if (sisnan_(&scale) || sisnan_(&ssum)) {
+		*k = 0;
+		*info = -8;
+		i__2 = -(*info);
+		xerbla_("CGEDMD", &i__2);
+	    }
+	    if (scale != zero && ssum != zero) {
+		rootsc = sqrt(ssum);
+		if (scale >= ofl / rootsc) {
+/*                 Norm of X(:,i) overflows. First, X(:,i) */
+/*                 is scaled by */
+/*                 ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2. */
+/*                 Next, the norm of X(:,i) is stored without */
+/*                 overflow as WORK(i) = - SCALE * (ROOTSC/M), */
+/*                 the minus sign indicating the 1/M factor. */
+/*                 Scaling is performed without overflow, and */
+/*                 underflow may occur in the smallest entries */
+/*                 of X(:,i). The relative backward and forward */
+/*                 errors are small in the ell_2 norm. */
+		    r__1 = one / rootsc;
+		    clascl_("G", &c__0, &c__0, &scale, &r__1, m, &c__1, &x[
+			    i__ * x_dim1 + 1], ldx, &info2);
+		    rwork[i__] = -scale * (rootsc / (real) (*m));
+		} else {
+/*                 X(:,i) will be scaled to unit 2-norm */
+		    rwork[i__] = scale * rootsc;
+		    clascl_("G", &c__0, &c__0, &rwork[i__], &one, m, &c__1, &
+			    x[i__ * x_dim1 + 1], ldx, &info2);
+/*                 X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i)          ! INTRINSIC */
+/* LAPAC */
+		}
+	    } else {
+		rwork[i__] = zero;
+		++(*k);
+	    }
+	}
+	if (*k == *n) {
+/* All columns of X are zero. Return error code -8. */
+/* (the 8th input variable had an illegal value) */
+	    *k = 0;
+	    *info = -8;
+	    i__1 = -(*info);
+	    xerbla_("CGEDMD", &i__1);
+	    return 0;
+	}
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+/*           Now, apply the same scaling to the columns of Y. */
+	    if (rwork[i__] > zero) {
+		r__1 = one / rwork[i__];
+		csscal_(m, &r__1, &y[i__ * y_dim1 + 1], &c__1);
+/*               Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i)      ! INTRINSIC */
+/* BLAS CALL */
+	    } else if (rwork[i__] < zero) {
+		r__1 = -rwork[i__];
+		r__2 = one / (real) (*m);
+		clascl_("G", &c__0, &c__0, &r__1, &r__2, m, &c__1, &y[i__ * 
+			y_dim1 + 1], ldy, &info2);
+/* LAPACK C */
+	    } else if (c_abs(&y[icamax_(m, &y[i__ * y_dim1 + 1], &c__1) + i__ 
+		    * y_dim1]) != zero) {
+/*               X(:,i) is zero vector. For consistency, */
+/*               Y(:,i) should also be zero. If Y(:,i) is not */
+/*               zero, then the data might be inconsistent or */
+/*               corrupted. If JOBS == 'C', Y(:,i) is set to */
+/*               zero and a warning flag is raised. */
+/*               The computation continues but the */
+/*               situation will be reported in the output. */
+		badxy = TRUE_;
+		if (lsame_(jobs, "C")) {
+		    csscal_(m, &zero, &y[i__ * y_dim1 + 1], &c__1);
+		}
+/* BLAS CALL */
+	    }
+	}
+    }
+
+    if (sccoly) {
+/* The columns of Y will be normalized. */
+/* To prevent overflows, the column norms of Y are */
+/* carefully computed using CLASSQ. */
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+/* RWORK(i) = SCNRM2( M, Y(1,i), 1 ) */
+	    scale = zero;
+	    classq_(m, &y[i__ * y_dim1 + 1], &c__1, &scale, &ssum);
+	    if (sisnan_(&scale) || sisnan_(&ssum)) {
+		*k = 0;
+		*info = -10;
+		i__2 = -(*info);
+		xerbla_("CGEDMD", &i__2);
+	    }
+	    if (scale != zero && ssum != zero) {
+		rootsc = sqrt(ssum);
+		if (scale >= ofl / rootsc) {
+/*                 Norm of Y(:,i) overflows. First, Y(:,i) */
+/*                 is scaled by */
+/*                 ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2. */
+/*                 Next, the norm of Y(:,i) is stored without */
+/*                 overflow as RWORK(i) = - SCALE * (ROOTSC/M), */
+/*                 the minus sign indicating the 1/M factor. */
+/*                 Scaling is performed without overflow, and */
+/*                 underflow may occur in the smallest entries */
+/*                 of Y(:,i). The relative backward and forward */
+/*                 errors are small in the ell_2 norm. */
+		    r__1 = one / rootsc;
+		    clascl_("G", &c__0, &c__0, &scale, &r__1, m, &c__1, &y[
+			    i__ * y_dim1 + 1], ldy, &info2);
+		    rwork[i__] = -scale * (rootsc / (real) (*m));
+		} else {
+/*                 Y(:,i) will be scaled to unit 2-norm */
+		    rwork[i__] = scale * rootsc;
+		    clascl_("G", &c__0, &c__0, &rwork[i__], &one, m, &c__1, &
+			    y[i__ * y_dim1 + 1], ldy, &info2);
+/*                 Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i)          ! INTRINSIC */
+/* LAPA */
+		}
+	    } else {
+		rwork[i__] = zero;
+	    }
+	}
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+/*           Now, apply the same scaling to the columns of X. */
+	    if (rwork[i__] > zero) {
+		r__1 = one / rwork[i__];
+		csscal_(m, &r__1, &x[i__ * x_dim1 + 1], &c__1);
+/*               X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i)      ! INTRINSIC */
+/* BLAS CALL */
+	    } else if (rwork[i__] < zero) {
+		r__1 = -rwork[i__];
+		r__2 = one / (real) (*m);
+		clascl_("G", &c__0, &c__0, &r__1, &r__2, m, &c__1, &x[i__ * 
+			x_dim1 + 1], ldx, &info2);
+/* LAPACK */
+	    } else if (c_abs(&x[icamax_(m, &x[i__ * x_dim1 + 1], &c__1) + i__ 
+		    * x_dim1]) != zero) {
+/*               Y(:,i) is zero vector.  If X(:,i) is not */
+/*               zero, then a warning flag is raised. */
+/*               The computation continues but the */
+/*               situation will be reported in the output. */
+		badxy = TRUE_;
+	    }
+	}
+    }
+
+/*     <2> SVD of the data snapshot matrix X. */
+/*     ===================================== */
+/*     The left singular vectors are stored in the array X. */
+/*     The right singular vectors are in the array W. */
+/*     The array W will later on contain the eigenvectors */
+/*     of a Rayleigh quotient. */
+    numrnk = *n;
+/*      SELECT CASE ( WHTSVD ) */
+    if (*whtsvd == 1) {
+	cgesvd_("O", "S", m, n, &x[x_offset], ldx, &rwork[1], &b[b_offset], 
+		ldb, &w[w_offset], ldw, &zwork[1], lzwork, &rwork[*n + 1], &
+		info1);
+/* LA */
+	*(unsigned char *)t_or_n__ = 'C';
+    } else if (*whtsvd == 2) {
+	cgesdd_("O", m, n, &x[x_offset], ldx, &rwork[1], &b[b_offset], ldb, &
+		w[w_offset], ldw, &zwork[1], lzwork, &rwork[*n + 1], &iwork[1]
+		, &info1);
+/* LAP */
+	*(unsigned char *)t_or_n__ = 'C';
+    } else if (*whtsvd == 3) {
+	i__1 = *lrwork - *n;
+	cgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &rwork[1], 
+		&z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, &iwork[1], 
+		liwork, &zwork[1], lzwork, &rwork[*n + 1], &i__1, &info1);
+/* LAPACK CA */
+	clacpy_("A", m, &numrnk, &z__[z_offset], ldz, &x[x_offset], ldx);
+/* LAPACK C */
+	*(unsigned char *)t_or_n__ = 'C';
+    } else if (*whtsvd == 4) {
+	i__1 = *lrwork - *n;
+	cgejsv_("F", "U", jsvopt, "N", "N", "P", m, n, &x[x_offset], ldx, &
+		rwork[1], &z__[z_offset], ldz, &w[w_offset], ldw, &zwork[1], 
+		lzwork, &rwork[*n + 1], &i__1, &iwork[1], &info1);
+	clacpy_("A", m, n, &z__[z_offset], ldz, &x[x_offset], ldx);
+/* LAPACK CALL */
+	*(unsigned char *)t_or_n__ = 'N';
+	xscl1 = rwork[*n + 1];
+	xscl2 = rwork[*n + 2];
+	if (xscl1 != xscl2) {
+/* This is an exceptional situation. If the */
+/* data matrices are not scaled and the */
+/* largest singular value of X overflows. */
+/* In that case CGEJSV can return the SVD */
+/* in scaled form. The scaling factor can be used */
+/* to rescale the data (X and Y). */
+	    clascl_("G", &c__0, &c__0, &xscl1, &xscl2, m, n, &y[y_offset], 
+		    ldy, &info2);
+	}
+/*      END SELECT */
+    }
+
+    if (info1 > 0) {
+/* The SVD selected subroutine did not converge. */
+/* Return with an error code. */
+	*info = 2;
+	return 0;
+    }
+
+    if (rwork[1] == zero) {
+/* The largest computed singular value of (scaled) */
+/* X is zero. Return error code -8 */
+/* (the 8th input variable had an illegal value). */
+	*k = 0;
+	*info = -8;
+	i__1 = -(*info);
+	xerbla_("CGEDMD", &i__1);
+	return 0;
+    }
+
+/* <3> Determine the numerical rank of the data */
+/*    snapshots matrix X. This depends on the */
+/*    parameters NRNK and TOL. */
+/*      SELECT CASE ( NRNK ) */
+    if (*nrnk == -1) {
+	*k = 1;
+	i__1 = numrnk;
+	for (i__ = 2; i__ <= i__1; ++i__) {
+	    if (rwork[i__] <= rwork[1] * *tol || rwork[i__] <= small) {
+		myexit_();
+	    }
+	    ++(*k);
+	}
+    } else if (*nrnk == -2) {
+	*k = 1;
+	i__1 = numrnk - 1;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    if (rwork[i__ + 1] <= rwork[i__] * *tol || rwork[i__] <= small) {
+		myexit_();
+	    }
+	    ++(*k);
+	}
+    } else {
+	*k = 1;
+	i__1 = *nrnk;
+	for (i__ = 2; i__ <= i__1; ++i__) {
+	    if (rwork[i__] <= small) {
+		myexit_();
+	    }
+	    ++(*k);
+	}
+/*          END SELECT */
+    }
+/*   Now, U = X(1:M,1:K) is the SVD/POD basis for the */
+/*   snapshot data in the input matrix X. */
+/* <4> Compute the Rayleigh quotient S = U^H * A * U. */
+/*    Depending on the requested outputs, the computation */
+/*    is organized to compute additional auxiliary */
+/*    matrices (for the residuals and refinements). */
+
+/*    In all formulas below, we need V_k*Sigma_k^(-1) */
+/*    where either V_k is in W(1:N,1:K), or V_k^H is in */
+/*    W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)). */
+    if (lsame_(t_or_n__, "N")) {
+	i__1 = *k;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    r__1 = one / rwork[i__];
+	    csscal_(n, &r__1, &w[i__ * w_dim1 + 1], &c__1);
+/* W(1:N,i) = (ONE/RWORK(i)) * W(1:N,i)      ! INTRINSIC */
+/* BLAS CALL */
+	}
+    } else {
+/* This non-unit stride access is due to the fact */
+/* that CGESVD, CGESVDQ and CGESDD return the */
+/* adjoint matrix of the right singular vectors. */
+/* DO i = 1, K */
+/* CALL DSCAL( N, ONE/RWORK(i), W(i,1), LDW )  ! BLAS CALL */
+/* ! W(i,1:N) = (ONE/RWORK(i)) * W(i,1:N)      ! INTRINSIC */
+/* END DO */
+	i__1 = *k;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    rwork[*n + i__] = one / rwork[i__];
+	}
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = *k;
+	    for (i__ = 1; i__ <= i__2; ++i__) {
+		i__3 = i__ + j * w_dim1;
+		i__4 = *n + i__;
+		q__2.r = rwork[i__4], q__2.i = zero;
+		i__5 = i__ + j * w_dim1;
+		q__1.r = q__2.r * w[i__5].r - q__2.i * w[i__5].i, q__1.i = 
+			q__2.r * w[i__5].i + q__2.i * w[i__5].r;
+		w[i__3].r = q__1.r, w[i__3].i = q__1.i;
+	    }
+	}
+    }
+
+    if (wntref) {
+
+/* Need A*U(:,1:K)=Y*V_k*inv(diag(WORK(1:K))) */
+/* for computing the refined Ritz vectors */
+/* (optionally, outside CGEDMD). */
+	cgemm_("N", t_or_n__, m, k, n, &zone, &y[y_offset], ldy, &w[w_offset],
+		 ldw, &zzero, &z__[z_offset], ldz);
+/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(W(1:K,1:N)))  ! INTRI */
+/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K))             ! INTRI */
+
+/* At this point Z contains */
+/* A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and */
+/* this is needed for computing the residuals. */
+/* This matrix is  returned in the array B and */
+/* it can be used to compute refined Ritz vectors. */
+/* BLAS */
+	clacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb);
+/* B(1:M,1:K) = Z(1:M,1:K)                  ! INTRINSIC */
+/* BLAS CALL */
+	cgemm_("C", "N", k, k, m, &zone, &x[x_offset], ldx, &z__[z_offset], 
+		ldz, &zzero, &s[s_offset], lds);
+/* S(1:K,1:K) = MATMUL(TANSPOSE(X(1:M,1:K)),Z(1:M,1:K)) ! INTRI */
+/* At this point S = U^H * A * U is the Rayleigh quotient. */
+/* BLAS */
+    } else {
+/* A * U(:,1:K) is not explicitly needed and the */
+/* computation is organized differently. The Rayleigh */
+/* quotient is computed more efficiently. */
+	cgemm_("C", "N", k, n, m, &zone, &x[x_offset], ldx, &y[y_offset], ldy,
+		 &zzero, &z__[z_offset], ldz);
+/* Z(1:K,1:N) = MATMUL( TRANSPOSE(X(1:M,1:K)), Y(1:M,1:N) )  ! IN */
+
+/* B */
+	cgemm_("N", t_or_n__, k, k, n, &zone, &z__[z_offset], ldz, &w[
+		w_offset], ldw, &zzero, &s[s_offset], lds);
+/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRIN */
+/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K)))          ! INTRIN */
+/* At this point S = U^H * A * U is the Rayleigh quotient. */
+/* If the residuals are requested, save scaled V_k into Z. */
+/* Recall that V_k or V_k^H is stored in W. */
+/* BLAS */
+	if (wntres || wntex) {
+	    if (lsame_(t_or_n__, "N")) {
+		clacpy_("A", n, k, &w[w_offset], ldw, &z__[z_offset], ldz);
+	    } else {
+		clacpy_("A", k, n, &w[w_offset], ldw, &z__[z_offset], ldz);
+	    }
+	}
+    }
+
+/* <5> Compute the Ritz values and (if requested) the */
+/*   right eigenvectors of the Rayleigh quotient. */
+
+    cgeev_("N", jobzl, k, &s[s_offset], lds, &eigs[1], &w[w_offset], ldw, &w[
+	    w_offset], ldw, &zwork[1], lzwork, &rwork[*n + 1], &info1);
+
+/* W(1:K,1:K) contains the eigenvectors of the Rayleigh */
+/* quotient.  See the description of Z. */
+/* Also, see the description of CGEEV. */
+/* LAPACK CA */
+    if (info1 > 0) {
+/* CGEEV failed to compute the eigenvalues and */
+/* eigenvectors of the Rayleigh quotient. */
+	*info = 3;
+	return 0;
+    }
+
+/* <6> Compute the eigenvectors (if requested) and, */
+/* the residuals (if requested). */
+
+    if (wntvec || wntex) {
+	if (wntres) {
+	    if (wntref) {
+/* Here, if the refinement is requested, we have */
+/* A*U(:,1:K) already computed and stored in Z. */
+/* For the residuals, need Y = A * U(:,1;K) * W. */
+		cgemm_("N", "N", m, k, k, &zone, &z__[z_offset], ldz, &w[
+			w_offset], ldw, &zzero, &y[y_offset], ldy);
+/* Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K)       ! INTRINSIC */
+/* This frees Z; Y contains A * U(:,1:K) * W. */
+/* BLAS CALL */
+	    } else {
+/* Compute S = V_k * Sigma_k^(-1) * W, where */
+/* V_k * Sigma_k^(-1) (or its adjoint) is stored in Z */
+		cgemm_(t_or_n__, "N", n, k, k, &zone, &z__[z_offset], ldz, &w[
+			w_offset], ldw, &zzero, &s[s_offset], lds);
+/* Then, compute Z = Y * S = */
+/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */
+/* = A * U(:,1:K) * W(1:K,1:K) */
+		cgemm_("N", "N", m, k, n, &zone, &y[y_offset], ldy, &s[
+			s_offset], lds, &zzero, &z__[z_offset], ldz);
+/* Save a copy of Z into Y and free Z for holding */
+/* the Ritz vectors. */
+		clacpy_("A", m, k, &z__[z_offset], ldz, &y[y_offset], ldy);
+		if (wntex) {
+		    clacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb);
+		}
+	    }
+	} else if (wntex) {
+/* Compute S = V_k * Sigma_k^(-1) * W, where */
+/* V_k * Sigma_k^(-1) is stored in Z */
+	    cgemm_(t_or_n__, "N", n, k, k, &zone, &z__[z_offset], ldz, &w[
+		    w_offset], ldw, &zzero, &s[s_offset], lds);
+/* Then, compute Z = Y * S = */
+/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */
+/* = A * U(:,1:K) * W(1:K,1:K) */
+	    cgemm_("N", "N", m, k, n, &zone, &y[y_offset], ldy, &s[s_offset], 
+		    lds, &zzero, &b[b_offset], ldb);
+/* The above call replaces the following two calls */
+/* that were used in the developing-testing phase. */
+/* CALL CGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, & */
+/*           LDS, ZZERO, Z, LDZ) */
+/* Save a copy of Z into Y and free Z for holding */
+/* the Ritz vectors. */
+/* CALL CLACPY( 'A', M, K, Z, LDZ, B, LDB ) */
+	}
+
+/* Compute the Ritz vectors */
+	if (wntvec) {
+	    cgemm_("N", "N", m, k, k, &zone, &x[x_offset], ldx, &w[w_offset], 
+		    ldw, &zzero, &z__[z_offset], ldz);
+	}
+/* Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K))         ! INTRIN */
+
+/* BLAS CALL */
+	if (wntres) {
+	    i__1 = *k;
+	    for (i__ = 1; i__ <= i__1; ++i__) {
+		i__2 = i__;
+		q__1.r = -eigs[i__2].r, q__1.i = -eigs[i__2].i;
+		caxpy_(m, &q__1, &z__[i__ * z_dim1 + 1], &c__1, &y[i__ * 
+			y_dim1 + 1], &c__1);
+/* Y(1:M,i) = Y(1:M,i) - EIGS(i) * Z(1:M,i)            ! */
+
+		res[i__] = scnrm2_(m, &y[i__ * y_dim1 + 1], &c__1);
+
+	    }
+	}
+    }
+
+    if (*whtsvd == 4) {
+	rwork[*n + 1] = xscl1;
+	rwork[*n + 2] = xscl2;
+    }
+
+/*     Successful exit. */
+    if (! badxy) {
+	*info = 0;
+    } else {
+/* A warning on possible data inconsistency. */
+/* This should be a rare event. */
+	*info = 4;
+    }
+/* ............................................................ */
+    return 0;
+/*     ...... */
+} /* cgedmd_ */
+
diff --git a/lapack-netlib/SRC/cgedmd.f90 b/lapack-netlib/SRC/cgedmd.f90
new file mode 100644
index 000000000..1413130ec
--- /dev/null
+++ b/lapack-netlib/SRC/cgedmd.f90
@@ -0,0 +1,1151 @@
+!> \brief \b CGEDMD computes the Dynamic Mode Decomposition (DMD) for a pair of data snapshot matrices.
+!
+!  =========== DOCUMENTATION ===========
+!
+!  Definition:
+!  ===========
+!
+!     SUBROUTINE CGEDMD( JOBS, JOBZ, JOBR, JOBF,  WHTSVD,   &
+!                        M, N, X, LDX, Y, LDY, NRNK, TOL,   &
+!                        K, EIGS, Z, LDZ, RES, B,    LDB,   &
+!                        W, LDW,  S, LDS, ZWORK,  LZWORK,   &
+!                        RWORK, LRWORK, IWORK, LIWORK, INFO )
+!.....
+!     USE                   iso_fortran_env
+!     IMPLICIT NONE
+!     INTEGER, PARAMETER :: WP = real32
+!
+!.....
+!     Scalar arguments
+!     CHARACTER, INTENT(IN)   :: JOBS,   JOBZ,  JOBR,  JOBF
+!     INTEGER,   INTENT(IN)   :: WHTSVD, M, N,   LDX,  LDY, &
+!                                NRNK, LDZ, LDB, LDW,  LDS, &
+!                                LIWORK, LRWORK, LZWORK
+!     INTEGER,       INTENT(OUT)  :: K, INFO
+!     REAL(KIND=WP), INTENT(IN)   ::    TOL
+!     Array arguments
+!     COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*)
+!     COMPLEX(KIND=WP), INTENT(OUT)   :: Z(LDZ,*), B(LDB,*), &
+!                                        W(LDW,*), S(LDS,*)
+!     COMPLEX(KIND=WP), INTENT(OUT)   :: EIGS(*)
+!     COMPLEX(KIND=WP), INTENT(OUT)   :: ZWORK(*)
+!     REAL(KIND=WP),    INTENT(OUT)   :: RES(*)
+!     REAL(KIND=WP),    INTENT(OUT)   :: RWORK(*)
+!     INTEGER,          INTENT(OUT)   :: IWORK(*)
+!
+!............................................................
+!>    \par Purpose:
+!     =============
+!>    \verbatim
+!>    CGEDMD computes the Dynamic Mode Decomposition (DMD) for
+!>    a pair of data snapshot matrices. For the input matrices
+!>    X and Y such that Y = A*X with an inaccessible matrix
+!>    A, CGEDMD computes a certain number of Ritz pairs of A using
+!>    the standard Rayleigh-Ritz extraction from a subspace of
+!>    range(X) that is determined using the leading left singular
+!>    vectors of X. Optionally, CGEDMD returns the residuals
+!>    of the computed Ritz pairs, the information needed for
+!>    a refinement of the Ritz vectors, or the eigenvectors of
+!>    the Exact DMD.
+!>    For further details see the references listed
+!>    below. For more details of the implementation see [3].
+!>    \endverbatim
+!............................................................
+!>    \par References:
+!     ================
+!>    \verbatim
+!>    [1] P. Schmid: Dynamic mode decomposition of numerical
+!>        and experimental data,
+!>        Journal of Fluid Mechanics 656, 5-28, 2010.
+!>    [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal
+!>        decompositions: analysis and enhancements,
+!>        SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018.
+!>    [3] Z. Drmac: A LAPACK implementation of the Dynamic
+!>        Mode Decomposition I. Technical report. AIMDyn Inc.
+!>        and LAPACK Working Note 298.
+!>    [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L.
+!>        Brunton, N. Kutz: On Dynamic Mode Decomposition:
+!>        Theory and Applications, Journal of Computational
+!>        Dynamics 1(2), 391-421, 2014.
+!>    \endverbatim
+!......................................................................
+!>    \par Developed and supported by:
+!     ================================
+!>    \verbatim
+!>    Developed and coded by Zlatko Drmac, Faculty of Science,
+!>    University of Zagreb;  drmac@math.hr
+!>    In cooperation with
+!>    AIMdyn Inc., Santa Barbara, CA.
+!>    and supported by
+!>    - DARPA SBIR project "Koopman Operator-Based Forecasting
+!>    for Nonstationary Processes from Near-Term, Limited
+!>    Observational Data" Contract No: W31P4Q-21-C-0007
+!>    - DARPA PAI project "Physics-Informed Machine Learning
+!>    Methodologies" Contract No: HR0011-18-9-0033
+!>    - DARPA MoDyL project "A Data-Driven, Operator-Theoretic
+!>    Framework for Space-Time Analysis of Process Dynamics"
+!>    Contract No: HR0011-16-C-0116
+!>    Any opinions, findings and conclusions or recommendations
+!>    expressed in this material are those of the author and
+!>    do not necessarily reflect the views of the DARPA SBIR
+!>    Program Office.
+!>    \endverbatim
+!......................................................................
+!>    \par Distribution Statement A:
+!     ==============================
+!>    \verbatim
+!>    Approved for Public Release, Distribution Unlimited.
+!>    Cleared by DARPA on September 29, 2022
+!>    \endverbatim
+!......................................................................
+!     Arguments
+!     =========
+!
+!>    \param[in] JOBS
+!>    \verbatim
+!>    JOBS (input) CHARACTER*1
+!>    Determines whether the initial data snapshots are scaled
+!>    by a diagonal matrix.
+!>    'S' :: The data snapshots matrices X and Y are multiplied
+!>           with a diagonal matrix D so that X*D has unit
+!>           nonzero columns (in the Euclidean 2-norm)
+!>    'C' :: The snapshots are scaled as with the 'S' option.
+!>           If it is found that an i-th column of X is a zero
+!>           vector and the corresponding i-th column of Y is
+!>           non-zero, then the i-th column of Y is set to
+!>           zero and a warning flag is raised.
+!>    'Y' :: The data snapshots matrices X and Y are multiplied
+!>           by a diagonal matrix D so that Y*D has unit
+!>           nonzero columns (in the Euclidean 2-norm)
+!>    'N' :: No data scaling.
+!>    \endverbatim
+!.....
+!>    \param[in] JOBZ
+!>    \verbatim
+!>    JOBZ (input) CHARACTER*1
+!>    Determines whether the eigenvectors (Koopman modes) will
+!>    be computed.
+!>    'V' :: The eigenvectors (Koopman modes) will be computed
+!>           and returned in the matrix Z.
+!>           See the description of Z.
+!>    'F' :: The eigenvectors (Koopman modes) will be returned
+!>           in factored form as the product X(:,1:K)*W, where X
+!>           contains a POD basis (leading left singular vectors
+!>           of the data matrix X) and W contains the eigenvectors
+!>           of the corresponding Rayleigh quotient.
+!>           See the descriptions of K, X, W, Z.
+!>    'N' :: The eigenvectors are not computed.
+!>    \endverbatim
+!.....
+!>    \param[in] JOBR
+!>    \verbatim
+!>    JOBR (input) CHARACTER*1
+!>    Determines whether to compute the residuals.
+!>    'R' :: The residuals for the computed eigenpairs will be
+!>           computed and stored in the array RES.
+!>           See the description of RES.
+!>           For this option to be legal, JOBZ must be 'V'.
+!>    'N' :: The residuals are not computed.
+!>    \endverbatim
+!.....
+!>    \param[in] JOBF
+!>    \verbatim
+!>    JOBF (input) CHARACTER*1
+!>    Specifies whether to store information needed for post-
+!>    processing (e.g. computing refined Ritz vectors)
+!>    'R' :: The matrix needed for the refinement of the Ritz
+!>           vectors is computed and stored in the array B.
+!>           See the description of B.
+!>    'E' :: The unscaled eigenvectors of the Exact DMD are
+!>           computed and returned in the array B. See the
+!>           description of B.
+!>    'N' :: No eigenvector refinement data is computed.
+!>    \endverbatim
+!.....
+!>    \param[in] WHTSVD
+!>    \verbatim
+!>    WHTSVD (input) INTEGER, WHTSVD in { 1, 2, 3, 4 }
+!>    Allows for a selection of the SVD algorithm from the
+!>    LAPACK library.
+!>    1 :: CGESVD (the QR SVD algorithm)
+!>    2 :: CGESDD (the Divide and Conquer algorithm; if enough
+!>         workspace available, this is the fastest option)
+!>    3 :: CGESVDQ (the preconditioned QR SVD; this and 4
+!>         are the most accurate options)
+!>    4 :: CGEJSV (the preconditioned Jacobi SVD; this and 3
+!>         are the most accurate options)
+!>    For the four methods above, a significant difference in
+!>    the accuracy of small singular values is possible if
+!>    the snapshots vary in norm so that X is severely
+!>    ill-conditioned. If small (smaller than EPS*||X||)
+!>    singular values are of interest and JOBS=='N',  then
+!>    the options (3, 4) give the most accurate results, where
+!>    the option 4 is slightly better and with stronger
+!>    theoretical background.
+!>    If JOBS=='S', i.e. the columns of X will be normalized,
+!>    then all methods give nearly equally accurate results.
+!>    \endverbatim
+!.....
+!>    \param[in] M
+!>    \verbatim
+!>    M (input) INTEGER, M>= 0
+!>    The state space dimension (the row dimension of X, Y).
+!>    \endverbatim
+!.....
+!>    \param[in] N
+!>    \verbatim
+!>    N (input) INTEGER, 0 <= N <= M
+!>    The number of data snapshot pairs
+!>    (the number of columns of X and Y).
+!>    \endverbatim
+!.....
+!>    \param[in,out] X
+!>    \verbatim
+!>    X (input/output) COMPLEX(KIND=WP) M-by-N array
+!>    > On entry, X contains the data snapshot matrix X. It is
+!>    assumed that the column norms of X are in the range of
+!>    the normalized floating point numbers.
+!>    < On exit, the leading K columns of X contain a POD basis,
+!>    i.e. the leading K left singular vectors of the input
+!>    data matrix X, U(:,1:K). All N columns of X contain all
+!>    left singular vectors of the input matrix X.
+!>    See the descriptions of K, Z and W.
+!>    \endverbatim
+!.....
+!>    \param[in] LDX
+!>    \verbatim
+!>    LDX (input) INTEGER, LDX >= M
+!>    The leading dimension of the array X.
+!>    \endverbatim
+!.....
+!>    \param[in,out] Y
+!>    \verbatim
+!>    Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array
+!>    > On entry, Y contains the data snapshot matrix Y
+!>    < On exit,
+!>    If JOBR == 'R', the leading K columns of Y  contain
+!>    the residual vectors for the computed Ritz pairs.
+!>    See the description of RES.
+!>    If JOBR == 'N', Y contains the original input data,
+!>                    scaled according to the value of JOBS.
+!>    \endverbatim
+!.....
+!>    \param[in] LDY
+!>    \verbatim
+!>    LDY (input) INTEGER , LDY >= M
+!>    The leading dimension of the array Y.
+!>    \endverbatim
+!.....
+!>    \param[in] NRNK
+!>    \verbatim
+!>    NRNK (input) INTEGER
+!>    Determines the mode how to compute the numerical rank,
+!>    i.e. how to truncate small singular values of the input
+!>    matrix X. On input, if
+!>    NRNK = -1 :: i-th singular value sigma(i) is truncated
+!>                 if sigma(i) <= TOL*sigma(1)
+!>                 This option is recommended.
+!>    NRNK = -2 :: i-th singular value sigma(i) is truncated
+!>                 if sigma(i) <= TOL*sigma(i-1)
+!>                 This option is included for R&D purposes.
+!>                 It requires highly accurate SVD, which
+!>                 may not be feasible.
+!>    The numerical rank can be enforced by using positive
+!>    value of NRNK as follows:
+!>    0 < NRNK <= N :: at most NRNK largest singular values
+!>    will be used. If the number of the computed nonzero
+!>    singular values is less than NRNK, then only those
+!>    nonzero values will be used and the actually used
+!>    dimension is less than NRNK. The actual number of
+!>    the nonzero singular values is returned in the variable
+!>    K. See the descriptions of TOL and  K.
+!>    \endverbatim
+!.....
+!>    \param[in] TOL
+!>    \verbatim
+!>    TOL (input) REAL(KIND=WP), 0 <= TOL < 1
+!>    The tolerance for truncating small singular values.
+!>    See the description of NRNK.
+!>    \endverbatim
+!.....
+!>    \param[out] K
+!>    \verbatim
+!>    K (output) INTEGER,  0 <= K <= N
+!>    The dimension of the POD basis for the data snapshot
+!>    matrix X and the number of the computed Ritz pairs.
+!>    The value of K is determined according to the rule set
+!>    by the parameters NRNK and TOL.
+!>    See the descriptions of NRNK and TOL.
+!>    \endverbatim
+!.....
+!>    \param[out] EIGS
+!>    \verbatim
+!>    EIGS (output) COMPLEX(KIND=WP) N-by-1 array
+!>    The leading K (K<=N) entries of EIGS contain
+!>    the computed eigenvalues (Ritz values).
+!>    See the descriptions of K, and Z.
+!>    \endverbatim
+!.....
+!>    \param[out] Z
+!>    \verbatim
+!>    Z (workspace/output) COMPLEX(KIND=WP)  M-by-N array
+!>    If JOBZ =='V' then Z contains the  Ritz vectors.  Z(:,i)
+!>    is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1.
+!>    If JOBZ == 'F', then the Z(:,i)'s are given implicitly as
+!>    the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i)
+!>    is an eigenvector corresponding to EIGS(i). The columns
+!>    of W(1:k,1:K) are the computed eigenvectors of the
+!>    K-by-K Rayleigh quotient.
+!>    See the descriptions of EIGS, X and W.
+!>    \endverbatim
+!.....
+!>    \param[in] LDZ
+!>    \verbatim
+!>    LDZ (input) INTEGER , LDZ >= M
+!>    The leading dimension of the array Z.
+!>    \endverbatim
+!.....
+!>    \param[out] RES
+!>    \verbatim
+!>    RES (output) REAL(KIND=WP) N-by-1 array
+!>    RES(1:K) contains the residuals for the K computed
+!>    Ritz pairs,
+!>    RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i) ||_2.
+!>    See the description of EIGS and Z.
+!>    \endverbatim
+!.....
+!>    \param[out] B
+!>    \verbatim
+!>    B (output) COMPLEX(KIND=WP)  M-by-N array.
+!>    IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can
+!>    be used for computing the refined vectors; see further
+!>    details in the provided references.
+!>    If JOBF == 'E', B(1:M,1:K) contains
+!>    A*U(:,1:K)*W(1:K,1:K), which are the vectors from the
+!>    Exact DMD, up to scaling by the inverse eigenvalues.
+!>    If JOBF =='N', then B is not referenced.
+!>    See the descriptions of X, W, K.
+!>    \endverbatim
+!.....
+!>    \param[in] LDB
+!>    \verbatim
+!>    LDB (input) INTEGER, LDB >= M
+!>    The leading dimension of the array B.
+!>    \endverbatim
+!.....
+!>    \param[out] W
+!>    \verbatim
+!>    W (workspace/output) COMPLEX(KIND=WP) N-by-N array
+!>    On exit, W(1:K,1:K) contains the K computed
+!>    eigenvectors of the matrix Rayleigh quotient.
+!>    The Ritz vectors (returned in Z) are the
+!>    product of X (containing a POD basis for the input
+!>    matrix X) and W. See the descriptions of K, S, X and Z.
+!>    W is also used as a workspace to temporarily store the
+!>    right singular vectors of X.
+!>    \endverbatim
+!.....
+!>    \param[in] LDW
+!>    \verbatim
+!>    LDW (input) INTEGER, LDW >= N
+!>    The leading dimension of the array W.
+!>    \endverbatim
+!.....
+!>    \param[out] S
+!>    \verbatim
+!>    S (workspace/output) COMPLEX(KIND=WP) N-by-N array
+!>    The array S(1:K,1:K) is used for the matrix Rayleigh
+!>    quotient. This content is overwritten during
+!>    the eigenvalue decomposition by CGEEV.
+!>    See the description of K.
+!>    \endverbatim
+!.....
+!>    \param[in] LDS
+!>    \verbatim
+!>    LDS (input) INTEGER, LDS >= N
+!>    The leading dimension of the array S.
+!>    \endverbatim
+!.....
+!>    \param[out] ZWORK
+!>    \verbatim
+!>    ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array
+!>    ZWORK is used as complex workspace in the complex SVD, as
+!>    specified by WHTSVD (1,2, 3 or 4) and for CGEEV for computing
+!>    the eigenvalues of a Rayleigh quotient.
+!>    If the call to CGEDMD is only workspace query, then
+!>    ZWORK(1) contains the minimal complex workspace length and
+!>    ZWORK(2) is the optimal complex workspace length.
+!>    Hence, the length of ZWORK is at least 2.
+!>    See the description of LZWORK.
+!>    \endverbatim
+!.....
+!>    \param[in] LZWORK
+!>    \verbatim
+!>    LZWORK (input) INTEGER
+!>    The minimal length of the workspace vector ZWORK.
+!>    LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_CGEEV),
+!>    where LZWORK_CGEEV = MAX( 1, 2*N )  and the minimal
+!>    LZWORK_SVD is calculated as follows
+!>    If WHTSVD == 1 :: CGESVD ::
+!>       LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N))
+!>    If WHTSVD == 2 :: CGESDD ::
+!>       LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N)
+!>    If WHTSVD == 3 :: CGESVDQ ::
+!>       LZWORK_SVD = obtainable by a query
+!>    If WHTSVD == 4 :: CGEJSV ::
+!>       LZWORK_SVD = obtainable by a query
+!>    If on entry LZWORK = -1, then a workspace query is
+!>    assumed and the procedure only computes the minimal
+!>    and the optimal workspace lengths and returns them in
+!>    ZWORK(1) and ZWORK(2), respectively.
+!>    \endverbatim
+!.....
+!>    \param[out] RWORK
+!>    \verbatim
+!>    RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array
+!>    On exit, RWORK(1:N) contains the singular values of
+!>    X (for JOBS=='N') or column scaled X (JOBS=='S', 'C').
+!>    If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain
+!>    scaling factor RWORK(N+2)/RWORK(N+1) used to scale X
+!>    and Y to avoid overflow in the SVD of X.
+!>    This may be of interest if the scaling option is off
+!>    and as many as possible smallest eigenvalues are
+!>    desired to the highest feasible accuracy.
+!>    If the call to CGEDMD is only workspace query, then
+!>    RWORK(1) contains the minimal workspace length.
+!>    See the description of LRWORK.
+!>    \endverbatim
+!.....
+!>    \param[in] LRWORK
+!>    \verbatim
+!>    LRWORK (input) INTEGER
+!>    The minimal length of the workspace vector RWORK.
+!>    LRWORK is calculated as follows:
+!>    LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_CGEEV), where
+!>    LRWORK_CGEEV = MAX(1,2*N) and LRWORK_SVD is the real workspace
+!>    for the SVD subroutine determined by the input parameter
+!>    WHTSVD.
+!>    If WHTSVD == 1 :: CGESVD ::
+!>       LRWORK_SVD = 5*MIN(M,N)
+!>    If WHTSVD == 2 :: CGESDD ::
+!>       LRWORK_SVD =  MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N),
+!>       2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) )
+!>    If WHTSVD == 3 :: CGESVDQ ::
+!>       LRWORK_SVD = obtainable by a query
+!>    If WHTSVD == 4 :: CGEJSV ::
+!>       LRWORK_SVD = obtainable by a query
+!>    If on entry LRWORK = -1, then a workspace query is
+!>    assumed and the procedure only computes the minimal
+!>    real workspace length and returns it in RWORK(1).
+!>    \endverbatim
+!.....
+!>    \param[out] IWORK
+!>    \verbatim
+!>    IWORK (workspace/output) INTEGER LIWORK-by-1 array
+!>    Workspace that is required only if WHTSVD equals
+!>    2 , 3 or 4. (See the description of WHTSVD).
+!>    If on entry LZWORK = -1, LRWORK = -1 or LIWORK = -1, then the
+!>    minimal length of IWORK is computed and returned in
+!>    IWORK(1). See the description of LIWORK.
+!>    \endverbatim
+!.....
+!>    \param[in] LIWORK
+!>    \verbatim
+!>    LIWORK (input) INTEGER
+!>    The minimal length of the workspace vector IWORK.
+!>    If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1
+!>    If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N))
+!>    If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1)
+!>    If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N)
+!>    If on entry LIWORK = -1, then a workspace query is
+!>    assumed and the procedure only computes the minimal
+!>    and the optimal workspace lengths for  ZWORK, RWORK and
+!>    IWORK. See the descriptions of ZWORK, RWORK and IWORK.
+!>    \endverbatim
+!.....
+!>    \param[out] INFO
+!>    \verbatim
+!>    INFO (output) INTEGER
+!>    -i < 0 :: On entry, the i-th argument had an
+!>              illegal value
+!>       = 0 :: Successful return.
+!>       = 1 :: Void input. Quick exit (M=0 or N=0).
+!>       = 2 :: The SVD computation of X did not converge.
+!>              Suggestion: Check the input data and/or
+!>              repeat with different WHTSVD.
+!>       = 3 :: The computation of the eigenvalues did not
+!>              converge.
+!>       = 4 :: If data scaling was requested on input and
+!>              the procedure found inconsistency in the data
+!>              such that for some column index i,
+!>              X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set
+!>              to zero if JOBS=='C'. The computation proceeds
+!>              with original or modified data and warning
+!>              flag is set with INFO=4.
+!>    \endverbatim
+!
+!  Authors:
+!  ========
+!
+!> \author Zlatko Drmac
+!
+!> \ingroup gedmd
+!
+!.............................................................
+!.............................................................
+      SUBROUTINE CGEDMD( JOBS, JOBZ, JOBR, JOBF,  WHTSVD,   &
+                         M, N, X, LDX, Y, LDY, NRNK, TOL,   &
+                         K, EIGS, Z, LDZ, RES, B,    LDB,   &
+                         W, LDW,  S, LDS, ZWORK,  LZWORK,   &
+                         RWORK, LRWORK, IWORK, LIWORK, INFO )
+!
+!  -- LAPACK driver routine                                           --
+!
+!  -- LAPACK is a software package provided by University of          --
+!  -- Tennessee, University of California Berkeley, University of     --
+!  -- Colorado Denver and NAG Ltd..                                   --
+!
+!.....
+      USE                   iso_fortran_env
+      IMPLICIT NONE
+      INTEGER, PARAMETER :: WP = real32
+!
+!     Scalar arguments
+!     ~~~~~~~~~~~~~~~~
+      CHARACTER, INTENT(IN)   :: JOBS,   JOBZ,  JOBR,  JOBF
+      INTEGER,   INTENT(IN)   :: WHTSVD, M, N,   LDX,  LDY, &
+                                 NRNK, LDZ, LDB, LDW,  LDS, &
+                                 LIWORK, LRWORK, LZWORK
+      INTEGER,       INTENT(OUT)  :: K, INFO
+      REAL(KIND=WP), INTENT(IN)   ::    TOL
+!
+!     Array arguments
+!     ~~~~~~~~~~~~~~~
+      COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*)
+      COMPLEX(KIND=WP), INTENT(OUT)   :: Z(LDZ,*), B(LDB,*), &
+                                         W(LDW,*), S(LDS,*)
+      COMPLEX(KIND=WP), INTENT(OUT)   :: EIGS(*)
+      COMPLEX(KIND=WP), INTENT(OUT)   :: ZWORK(*)
+      REAL(KIND=WP),    INTENT(OUT)   :: RES(*)
+      REAL(KIND=WP),    INTENT(OUT)   :: RWORK(*)
+      INTEGER,          INTENT(OUT)   :: IWORK(*)
+!
+!     Parameters
+!     ~~~~~~~~~~
+      REAL(KIND=WP), PARAMETER ::  ONE = 1.0_WP
+      REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP
+      COMPLEX(KIND=WP), PARAMETER ::  ZONE = ( 1.0_WP, 0.0_WP )
+      COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP )
+!
+!     Local scalars
+!     ~~~~~~~~~~~~~
+      REAL(KIND=WP) :: OFL,   ROOTSC, SCALE,  SMALL,   &
+                       SSUM,  XSCL1,  XSCL2
+      INTEGER       ::  i,  j, IMINWR,  INFO1, INFO2,   &
+                        LWRKEV, LWRSDD, LWRSVD, LWRSVJ, &
+                       LWRSVQ, MLWORK, MWRKEV, MWRSDD, &
+                       MWRSVD, MWRSVJ, MWRSVQ, NUMRNK, &
+                       OLWORK, MLRWRK
+      LOGICAL       ::  BADXY, LQUERY, SCCOLX, SCCOLY, &
+                        WNTEX, WNTREF, WNTRES, WNTVEC
+      CHARACTER     ::  JOBZL, T_OR_N
+      CHARACTER     ::  JSVOPT
+!
+!     Local arrays
+!     ~~~~~~~~~~~~
+      REAL(KIND=WP) :: RDUMMY(2)
+!
+!     External functions (BLAS and LAPACK)
+!     ~~~~~~~~~~~~~~~~~
+      REAL(KIND=WP) CLANGE, SLAMCH, SCNRM2
+      EXTERNAL      CLANGE, SLAMCH, SCNRM2, ICAMAX
+      INTEGER                               ICAMAX
+      LOGICAL       SISNAN, LSAME
+      EXTERNAL      SISNAN, LSAME
+!
+!     External subroutines (BLAS and LAPACK)
+!     ~~~~~~~~~~~~~~~~~~~~
+      EXTERNAL      CAXPY,  CGEMM,  CSSCAL
+      EXTERNAL      CGEEV,  CGEJSV, CGESDD, CGESVD, CGESVDQ, &
+                    CLACPY, CLASCL, CLASSQ, XERBLA
+!
+!     Intrinsic functions
+!     ~~~~~~~~~~~~~~~~~~~
+      INTRINSIC     FLOAT, INT, MAX, SQRT
+!............................................................
+!
+!    Test the input arguments
+!
+      WNTRES = LSAME(JOBR,'R')
+      SCCOLX = LSAME(JOBS,'S') .OR. LSAME(JOBS,'C')
+      SCCOLY = LSAME(JOBS,'Y')
+      WNTVEC = LSAME(JOBZ,'V')
+      WNTREF = LSAME(JOBF,'R')
+      WNTEX  = LSAME(JOBF,'E')
+      INFO   = 0
+      LQUERY = ( ( LZWORK == -1 ) .OR. ( LIWORK == -1 ) &
+                                  .OR. ( LRWORK == -1 ) )
+!
+      IF ( .NOT. (SCCOLX .OR. SCCOLY .OR. &
+                                  LSAME(JOBS,'N')) )   THEN
+          INFO = -1
+      ELSE IF ( .NOT. (WNTVEC .OR. LSAME(JOBZ,'N')        &
+                              .OR. LSAME(JOBZ,'F')) )  THEN
+          INFO = -2
+      ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR.  &
+                ( WNTRES .AND. (.NOT.WNTVEC) ) )       THEN
+          INFO = -3
+      ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR.             &
+                LSAME(JOBF,'N') ) )                    THEN
+          INFO = -4
+      ELSE IF ( .NOT.((WHTSVD == 1) .OR. (WHTSVD == 2) .OR.  &
+                      (WHTSVD == 3) .OR. (WHTSVD == 4) )) THEN
+          INFO = -5
+      ELSE IF ( M < 0 )   THEN
+          INFO = -6
+      ELSE IF ( ( N < 0 ) .OR. ( N > M ) ) THEN
+          INFO = -7
+      ELSE IF ( LDX < M ) THEN
+          INFO = -9
+      ELSE IF ( LDY < M ) THEN
+          INFO = -11
+      ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR. &
+                ((NRNK >= 1).AND.(NRNK <=N ))) )      THEN
+          INFO = -12
+      ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) )  THEN
+          INFO = -13
+      ELSE IF ( LDZ < M ) THEN
+          INFO = -17
+      ELSE IF ( (WNTREF .OR. WNTEX ) .AND. ( LDB < M ) ) THEN
+          INFO = -20
+      ELSE IF ( LDW < N ) THEN
+          INFO = -22
+      ELSE IF ( LDS < N ) THEN
+          INFO = -24
+      END IF
+!
+      IF ( INFO == 0 ) THEN
+          ! Compute the minimal and the optimal workspace
+          ! requirements. Simulate running the code and
+          ! determine minimal and optimal sizes of the
+          ! workspace at any moment of the run.
+         IF ( N == 0 ) THEN
+             ! Quick return. All output except K is void.
+             ! INFO=1 signals the void input.
+             ! In case of a workspace query, the default
+             ! minimal workspace lengths are returned.
+            IF ( LQUERY ) THEN
+                IWORK(1) = 1
+                RWORK(1) = 1
+                ZWORK(1) = 2
+                ZWORK(2) = 2
+            ELSE
+               K   =  0
+            END IF
+            INFO = 1
+            RETURN
+         END IF
+
+         IMINWR = 1
+         MLRWRK = MAX(1,N)
+         MLWORK = 2
+         OLWORK = 2
+         SELECT CASE ( WHTSVD )
+         CASE (1)
+             ! The following is specified as the minimal
+             ! length of WORK in the definition of CGESVD:
+             ! MWRSVD = MAX(1,2*MIN(M,N)+MAX(M,N))
+             MWRSVD = MAX(1,2*MIN(M,N)+MAX(M,N))
+             MLWORK = MAX(MLWORK,MWRSVD)
+             MLRWRK = MAX(MLRWRK,N + 5*MIN(M,N))
+             IF ( LQUERY ) THEN
+                CALL CGESVD( 'O', 'S', M, N, X, LDX, RWORK, &
+                     B, LDB, W, LDW, ZWORK, -1, RDUMMY, INFO1 )
+                LWRSVD = INT( ZWORK(1) )
+                OLWORK = MAX(OLWORK,LWRSVD)
+             END IF
+         CASE (2)
+             ! The following is specified as the minimal
+             ! length of WORK in the definition of CGESDD:
+             ! MWRSDD = 2*min(M,N)*min(M,N)+2*min(M,N)+max(M,N).
+             ! RWORK length: 5*MIN(M,N)*MIN(M,N)+7*MIN(M,N)
+             ! In LAPACK 3.10.1 RWORK is defined differently.
+             ! Below we take max over the two versions.
+             ! IMINWR = 8*MIN(M,N)
+             MWRSDD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N)
+             MLWORK = MAX(MLWORK,MWRSDD)
+             IMINWR = 8*MIN(M,N)
+             MLRWRK = MAX( MLRWRK,  N +                    &
+                      MAX( 5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), &
+                           5*MIN(M,N)*MIN(M,N)+5*MIN(M,N), &
+                           2*MAX(M,N)*MIN(M,N)+            &
+                           2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) )
+             IF ( LQUERY ) THEN
+                CALL CGESDD( 'O', M, N, X, LDX, RWORK, B,     &
+                     LDB, W, LDW, ZWORK, -1, RDUMMY, IWORK, INFO1 )
+                LWRSDD = MAX(MWRSDD,INT( ZWORK(1) ))
+                OLWORK = MAX(OLWORK,LWRSDD)
+             END IF
+         CASE (3)
+             CALL CGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, &
+                  X, LDX, RWORK, Z, LDZ, W, LDW, NUMRNK,  &
+                  IWORK, -1, ZWORK, -1, RDUMMY, -1, INFO1 )
+             IMINWR = IWORK(1)
+             MWRSVQ = INT(ZWORK(2))
+             MLWORK = MAX(MLWORK,MWRSVQ)
+             MLRWRK = MAX(MLRWRK,N + INT(RDUMMY(1)))
+             IF ( LQUERY ) THEN
+                LWRSVQ = INT(ZWORK(1))
+                OLWORK = MAX(OLWORK,LWRSVQ)
+             END IF
+         CASE (4)
+             JSVOPT = 'J'
+             CALL CGEJSV( 'F', 'U', JSVOPT, 'N', 'N', 'P', M, &
+                   N, X, LDX, RWORK, Z, LDZ, W, LDW,       &
+                   ZWORK, -1, RDUMMY, -1, IWORK, INFO1 )
+             IMINWR = IWORK(1)
+             MWRSVJ = INT(ZWORK(2))
+             MLWORK = MAX(MLWORK,MWRSVJ)
+             MLRWRK = MAX(MLRWRK,N + MAX(7,INT(RDUMMY(1))))
+             IF ( LQUERY ) THEN
+                LWRSVJ = INT(ZWORK(1))
+                OLWORK = MAX(OLWORK,LWRSVJ)
+             END IF
+         END SELECT
+         IF ( WNTVEC .OR. WNTEX .OR. LSAME(JOBZ,'F') ) THEN
+             JOBZL = 'V'
+         ELSE
+             JOBZL = 'N'
+         END IF
+         ! Workspace calculation to the CGEEV call
+         MWRKEV = MAX( 1, 2*N )
+         MLWORK = MAX(MLWORK,MWRKEV)
+         MLRWRK = MAX(MLRWRK,N+2*N)
+         IF ( LQUERY ) THEN
+             CALL CGEEV( 'N', JOBZL, N, S, LDS, EIGS, &
+              W, LDW, W, LDW, ZWORK, -1, RWORK, INFO1 ) ! LAPACK CALL
+                LWRKEV = INT(ZWORK(1))
+                OLWORK = MAX( OLWORK, LWRKEV )
+                OLWORK = MAX( 2, OLWORK )
+         END IF
+!
+         IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -30
+         IF ( LRWORK < MLRWRK .AND. (.NOT.LQUERY) ) INFO = -28
+         IF ( LZWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -26
+
+      END IF
+!
+      IF( INFO /= 0 ) THEN
+         CALL XERBLA( 'CGEDMD', -INFO )
+         RETURN
+      ELSE IF ( LQUERY ) THEN
+!     Return minimal and optimal workspace sizes
+          IWORK(1) = IMINWR
+          RWORK(1) = MLRWRK
+          ZWORK(1) = MLWORK
+          ZWORK(2) = OLWORK
+          RETURN
+      END IF
+!............................................................
+!
+      OFL   = SLAMCH('O')*SLAMCH('P')
+      SMALL = SLAMCH('S')
+      BADXY = .FALSE.
+!
+!     <1> Optional scaling of the snapshots (columns of X, Y)
+!     ==========================================================
+      IF ( SCCOLX ) THEN
+          ! The columns of X will be normalized.
+          ! To prevent overflows, the column norms of X are
+          ! carefully computed using CLASSQ.
+          K = 0
+          DO i = 1, N
+            !WORK(i) = SCNRM2( M, X(1,i), 1 )
+            SSUM  = ONE
+            SCALE = ZERO
+            CALL CLASSQ( M, X(1,i), 1, SCALE, SSUM )
+            IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN
+                K    =  0
+                INFO = -8
+                CALL XERBLA('CGEDMD',-INFO)
+            END IF
+            IF ( (SCALE /= ZERO) .AND. (SSUM /= ZERO) ) THEN
+               ROOTSC = SQRT(SSUM)
+               IF ( SCALE .GE. (OFL / ROOTSC) ) THEN
+!                 Norm of X(:,i) overflows. First, X(:,i)
+!                 is scaled by
+!                 ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2.
+!                 Next, the norm of X(:,i) is stored without
+!                 overflow as WORK(i) = - SCALE * (ROOTSC/M),
+!                 the minus sign indicating the 1/M factor.
+!                 Scaling is performed without overflow, and
+!                 underflow may occur in the smallest entries
+!                 of X(:,i). The relative backward and forward
+!                 errors are small in the ell_2 norm.
+                  CALL CLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, &
+                               M, 1, X(1,i), LDX, INFO2 )
+                  RWORK(i) = - SCALE * ( ROOTSC / FLOAT(M) )
+               ELSE
+!                 X(:,i) will be scaled to unit 2-norm
+                  RWORK(i) =   SCALE * ROOTSC
+                  CALL CLASCL( 'G',0, 0, RWORK(i), ONE, M, 1, &
+                               X(1,i), LDX, INFO2 )             ! LAPACK CALL
+!                 X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i)          ! INTRINSIC
+               END IF
+            ELSE
+               RWORK(i) = ZERO
+               K = K + 1
+            END IF
+          END DO
+          IF ( K == N ) THEN
+          ! All columns of X are zero. Return error code -8.
+          ! (the 8th input variable had an illegal value)
+          K = 0
+          INFO = -8
+          CALL XERBLA('CGEDMD',-INFO)
+          RETURN
+          END IF
+          DO i = 1, N
+!           Now, apply the same scaling to the columns of Y.
+            IF ( RWORK(i) >  ZERO ) THEN
+                CALL CSSCAL( M, ONE/RWORK(i), Y(1,i), 1 ) ! BLAS CALL
+!               Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i)      ! INTRINSIC
+            ELSE IF ( RWORK(i) < ZERO ) THEN
+                CALL CLASCL( 'G', 0, 0, -RWORK(i),          &
+                     ONE/FLOAT(M), M, 1, Y(1,i), LDY, INFO2 ) ! LAPACK CALL
+            ELSE IF ( ABS(Y(ICAMAX(M, Y(1,i),1),i ))  &
+                                            /= ZERO ) THEN
+!               X(:,i) is zero vector. For consistency,
+!               Y(:,i) should also be zero. If Y(:,i) is not
+!               zero, then the data might be inconsistent or
+!               corrupted. If JOBS == 'C', Y(:,i) is set to
+!               zero and a warning flag is raised.
+!               The computation continues but the
+!               situation will be reported in the output.
+                BADXY = .TRUE.
+                IF ( LSAME(JOBS,'C')) &
+                CALL CSSCAL( M, ZERO, Y(1,i), 1 )  ! BLAS CALL
+            END IF
+          END DO
+      END IF
+  !
+      IF ( SCCOLY ) THEN
+          ! The columns of Y will be normalized.
+          ! To prevent overflows, the column norms of Y are
+          ! carefully computed using CLASSQ.
+          DO i = 1, N
+            !RWORK(i) = SCNRM2( M, Y(1,i), 1 )
+            SSUM  = ONE
+            SCALE = ZERO
+            CALL CLASSQ( M, Y(1,i), 1, SCALE, SSUM )
+            IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN
+                K    =  0
+                INFO = -10
+                CALL XERBLA('CGEDMD',-INFO)
+            END IF
+            IF ( SCALE /= ZERO  .AND. (SSUM /= ZERO) ) THEN
+               ROOTSC = SQRT(SSUM)
+               IF ( SCALE .GE. (OFL / ROOTSC) ) THEN
+!                 Norm of Y(:,i) overflows. First, Y(:,i)
+!                 is scaled by
+!                 ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2.
+!                 Next, the norm of Y(:,i) is stored without
+!                 overflow as RWORK(i) = - SCALE * (ROOTSC/M),
+!                 the minus sign indicating the 1/M factor.
+!                 Scaling is performed without overflow, and
+!                 underflow may occur in the smallest entries
+!                 of Y(:,i). The relative backward and forward
+!                 errors are small in the ell_2 norm.
+                  CALL CLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, &
+                               M, 1, Y(1,i), LDY, INFO2 )
+                  RWORK(i) = - SCALE * ( ROOTSC / FLOAT(M) )
+               ELSE
+!                 Y(:,i) will be scaled to unit 2-norm
+                  RWORK(i) =   SCALE * ROOTSC
+                  CALL CLASCL( 'G',0, 0, RWORK(i), ONE, M, 1, &
+                               Y(1,i), LDY, INFO2 )              ! LAPACK CALL
+!                 Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i)          ! INTRINSIC
+               END IF
+            ELSE
+               RWORK(i) = ZERO
+            END IF
+         END DO
+         DO i = 1, N
+!           Now, apply the same scaling to the columns of X.
+            IF ( RWORK(i) >  ZERO ) THEN
+                CALL CSSCAL( M, ONE/RWORK(i), X(1,i), 1 )  ! BLAS CALL
+!               X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i)      ! INTRINSIC
+            ELSE IF ( RWORK(i) < ZERO ) THEN
+                CALL CLASCL( 'G', 0, 0, -RWORK(i),          &
+                     ONE/FLOAT(M), M, 1, X(1,i), LDX, INFO2 ) ! LAPACK CALL
+            ELSE IF ( ABS(X(ICAMAX(M, X(1,i),1),i ))  &
+                                           /= ZERO ) THEN
+!               Y(:,i) is zero vector.  If X(:,i) is not
+!               zero, then a warning flag is raised.
+!               The computation continues but the
+!               situation will be reported in the output.
+                BADXY = .TRUE.
+            END IF
+         END DO
+       END IF
+!
+!     <2> SVD of the data snapshot matrix X.
+!     =====================================
+!     The left singular vectors are stored in the array X.
+!     The right singular vectors are in the array W.
+!     The array W will later on contain the eigenvectors
+!     of a Rayleigh quotient.
+      NUMRNK = N
+      SELECT CASE ( WHTSVD )
+         CASE (1)
+             CALL CGESVD( 'O', 'S', M, N, X, LDX, RWORK, B, &
+                  LDB, W, LDW, ZWORK, LZWORK,  RWORK(N+1), INFO1 ) ! LAPACK CALL
+             T_OR_N = 'C'
+         CASE (2)
+            CALL CGESDD( 'O', M, N, X, LDX, RWORK, B, LDB, W, &
+                 LDW, ZWORK, LZWORK, RWORK(N+1), IWORK, INFO1 )   ! LAPACK CALL
+            T_OR_N = 'C'
+         CASE (3)
+              CALL CGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, &
+                   X, LDX, RWORK, Z, LDZ, W, LDW, &
+                   NUMRNK, IWORK, LIWORK, ZWORK,     &
+                   LZWORK, RWORK(N+1), LRWORK-N, INFO1)     ! LAPACK CALL
+              CALL CLACPY( 'A', M, NUMRNK, Z, LDZ, X, LDX )   ! LAPACK CALL
+         T_OR_N = 'C'
+         CASE (4)
+              CALL CGEJSV( 'F', 'U', JSVOPT, 'N', 'N', 'P', M, &
+                   N, X, LDX, RWORK, Z, LDZ, W, LDW, &
+                   ZWORK, LZWORK, RWORK(N+1), LRWORK-N, IWORK, INFO1 )    ! LAPACK CALL
+              CALL CLACPY( 'A', M, N, Z, LDZ, X, LDX )   ! LAPACK CALL
+              T_OR_N = 'N'
+              XSCL1 = RWORK(N+1)
+              XSCL2 = RWORK(N+2)
+              IF ( XSCL1 /=  XSCL2 ) THEN
+                 ! This is an exceptional situation. If the
+                 ! data matrices are not scaled and the
+                 ! largest singular value of X overflows.
+                 ! In that case CGEJSV can return the SVD
+                 ! in scaled form. The scaling factor can be used
+                 ! to rescale the data (X and Y).
+                 CALL CLASCL( 'G', 0, 0, XSCL1, XSCL2, M, N, Y, LDY, INFO2  )
+              END IF
+      END SELECT
+!
+      IF ( INFO1 > 0 ) THEN
+         ! The SVD selected subroutine did not converge.
+         ! Return with an error code.
+         INFO = 2
+         RETURN
+      END IF
+!
+      IF ( RWORK(1) == ZERO ) THEN
+          ! The largest computed singular value of (scaled)
+          ! X is zero. Return error code -8
+          ! (the 8th input variable had an illegal value).
+          K = 0
+          INFO = -8
+          CALL XERBLA('CGEDMD',-INFO)
+          RETURN
+      END IF
+!
+      !<3> Determine the numerical rank of the data
+      !    snapshots matrix X. This depends on the
+      !    parameters NRNK and TOL.
+
+      SELECT CASE ( NRNK )
+          CASE ( -1 )
+               K = 1
+               DO i = 2, NUMRNK
+                 IF ( ( RWORK(i) <= RWORK(1)*TOL ) .OR. &
+                      ( RWORK(i) <= SMALL ) ) EXIT
+                 K = K + 1
+               END DO
+          CASE ( -2 )
+               K = 1
+               DO i = 1, NUMRNK-1
+                 IF ( ( RWORK(i+1) <= RWORK(i)*TOL  ) .OR. &
+                      ( RWORK(i) <= SMALL ) ) EXIT
+                 K = K + 1
+               END DO
+          CASE DEFAULT
+               K = 1
+               DO i = 2, NRNK
+                  IF ( RWORK(i) <= SMALL ) EXIT
+                  K = K + 1
+               END DO
+          END SELECT
+      !   Now, U = X(1:M,1:K) is the SVD/POD basis for the
+      !   snapshot data in the input matrix X.
+
+      !<4> Compute the Rayleigh quotient S = U^H * A * U.
+      !    Depending on the requested outputs, the computation
+      !    is organized to compute additional auxiliary
+      !    matrices (for the residuals and refinements).
+      !
+      !    In all formulas below, we need V_k*Sigma_k^(-1)
+      !    where either V_k is in W(1:N,1:K), or V_k^H is in
+      !    W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)).
+      IF ( LSAME(T_OR_N, 'N') ) THEN
+          DO i = 1, K
+           CALL CSSCAL( N, ONE/RWORK(i), W(1,i), 1 )   ! BLAS CALL
+           ! W(1:N,i) = (ONE/RWORK(i)) * W(1:N,i)      ! INTRINSIC
+          END DO
+      ELSE
+          ! This non-unit stride access is due to the fact
+          ! that CGESVD, CGESVDQ and CGESDD return the
+          ! adjoint matrix of the right singular vectors.
+          !DO i = 1, K
+          ! CALL DSCAL( N, ONE/RWORK(i), W(i,1), LDW )  ! BLAS CALL
+          ! ! W(i,1:N) = (ONE/RWORK(i)) * W(i,1:N)      ! INTRINSIC
+          !END DO
+          DO i = 1, K
+              RWORK(N+i) = ONE/RWORK(i)
+          END DO
+          DO j = 1, N
+             DO i = 1, K
+                 W(i,j) = CMPLX(RWORK(N+i),ZERO,KIND=WP)*W(i,j)
+             END DO
+          END DO
+      END IF
+!
+      IF ( WNTREF ) THEN
+         !
+         ! Need A*U(:,1:K)=Y*V_k*inv(diag(RWORK(1:K)))
+         ! for computing the refined Ritz vectors
+         ! (optionally, outside CGEDMD).
+          CALL CGEMM( 'N', T_OR_N, M, K, N, ZONE, Y, LDY, W, &
+                      LDW, ZZERO, Z, LDZ )                       ! BLAS CALL
+          ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(W(1:K,1:N)))  ! INTRINSIC, for T_OR_N=='T'
+          ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K))             ! INTRINSIC, for T_OR_N=='N'
+          !
+          ! At this point Z contains
+          ! A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and
+          ! this is needed for computing the residuals.
+          ! This matrix is  returned in the array B and
+          ! it can be used to compute refined Ritz vectors.
+          CALL CLACPY( 'A', M, K, Z, LDZ, B, LDB )   ! BLAS CALL
+          ! B(1:M,1:K) = Z(1:M,1:K)                  ! INTRINSIC
+
+          CALL CGEMM( 'C', 'N', K, K, M, ZONE, X, LDX, Z, &
+                      LDZ, ZZERO, S, LDS )                       ! BLAS CALL
+          ! S(1:K,1:K) = MATMUL(CONJG(TRANSPOSE(X(1:M,1:K))),Z(1:M,1:K)) ! INTRINSIC
+          ! At this point S = U^H * A * U is the Rayleigh quotient.
+      ELSE
+        ! A * U(:,1:K) is not explicitly needed and the
+        ! computation is organized differently. The Rayleigh
+        ! quotient is computed more efficiently.
+        CALL CGEMM( 'C', 'N', K, N, M, ZONE, X, LDX, Y, LDY, &
+                   ZZERO, Z, LDZ )                                  ! BLAS CALL
+        ! Z(1:K,1:N) = MATMUL( TRANSPOSE(X(1:M,1:K)), Y(1:M,1:N) )  ! INTRINSIC
+        !
+        CALL CGEMM( 'N', T_OR_N, K, K, N, ZONE, Z, LDZ, W, &
+                    LDW, ZZERO, S, LDS )                        ! BLAS CALL
+        ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRINSIC, for T_OR_N=='T'
+        ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K)))          ! INTRINSIC, for T_OR_N=='N'
+        ! At this point S = U^H * A * U is the Rayleigh quotient.
+        ! If the residuals are requested, save scaled V_k into Z.
+        ! Recall that V_k or V_k^H is stored in W.
+        IF ( WNTRES .OR. WNTEX ) THEN
+          IF ( LSAME(T_OR_N, 'N') ) THEN
+              CALL CLACPY( 'A', N, K, W, LDW, Z, LDZ )
+          ELSE
+              CALL CLACPY( 'A', K, N, W, LDW, Z, LDZ )
+          END IF
+        END IF
+      END IF
+!
+      !<5> Compute the Ritz values and (if requested) the
+      !   right eigenvectors of the Rayleigh quotient.
+      !
+      CALL CGEEV( 'N', JOBZL, K, S, LDS, EIGS, W, &
+           LDW, W, LDW, ZWORK, LZWORK, RWORK(N+1), INFO1 )  ! LAPACK CALL
+      !
+      ! W(1:K,1:K) contains the eigenvectors of the Rayleigh
+      ! quotient.  See the description of Z.
+      ! Also, see the description of CGEEV.
+      IF ( INFO1 > 0 ) THEN
+         ! CGEEV failed to compute the eigenvalues and
+         ! eigenvectors of the Rayleigh quotient.
+         INFO = 3
+         RETURN
+      END IF
+!
+      ! <6> Compute the eigenvectors (if requested) and,
+      ! the residuals (if requested).
+      !
+      IF ( WNTVEC .OR. WNTEX ) THEN
+          IF ( WNTRES ) THEN
+              IF ( WNTREF ) THEN
+                ! Here, if the refinement is requested, we have
+                ! A*U(:,1:K) already computed and stored in Z.
+                ! For the residuals, need Y = A * U(:,1:K) * W.
+                CALL CGEMM( 'N', 'N', M, K, K, ZONE, Z, LDZ, W, &
+                           LDW, ZZERO, Y, LDY )              ! BLAS CALL
+                ! Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K)       ! INTRINSIC
+                ! This frees Z; Y contains A * U(:,1:K) * W.
+              ELSE
+                ! Compute S = V_k * Sigma_k^(-1) * W, where
+                ! V_k * Sigma_k^(-1) (or its adjoint) is stored in Z
+                CALL CGEMM( T_OR_N, 'N', N, K, K, ZONE, Z, LDZ, &
+                           W, LDW, ZZERO, S, LDS)
+                ! Then, compute Z = Y * S =
+                ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) =
+                ! = A * U(:,1:K) * W(1:K,1:K)
+                CALL CGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, &
+                           LDS, ZZERO, Z, LDZ)
+                ! Save a copy of Z into Y and free Z for holding
+                ! the Ritz vectors.
+                CALL CLACPY( 'A', M, K, Z, LDZ, Y, LDY )
+                IF ( WNTEX ) CALL CLACPY( 'A', M, K, Z, LDZ, B, LDB )
+              END IF
+          ELSE IF ( WNTEX ) THEN
+              ! Compute S = V_k * Sigma_k^(-1) * W, where
+                ! V_k * Sigma_k^(-1) is stored in Z
+                CALL CGEMM( T_OR_N, 'N', N, K, K, ZONE, Z, LDZ, &
+                           W, LDW, ZZERO, S, LDS)
+                ! Then, compute Z = Y * S =
+                ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) =
+                ! = A * U(:,1:K) * W(1:K,1:K)
+                CALL CGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, &
+                           LDS, ZZERO, B, LDB)
+                ! The above call replaces the following two calls
+                ! that were used in the developing-testing phase.
+                ! CALL CGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, &
+                !           LDS, ZZERO, Z, LDZ)
+                ! Save a copy of Z into Y and free Z for holding
+                ! the Ritz vectors.
+                ! CALL CLACPY( 'A', M, K, Z, LDZ, B, LDB )
+          END IF
+!
+          ! Compute the Ritz vectors
+          IF ( WNTVEC ) CALL CGEMM( 'N', 'N', M, K, K, ZONE, X, LDX, W, LDW, &
+                       ZZERO, Z, LDZ )                          ! BLAS CALL
+          ! Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K))         ! INTRINSIC
+!
+          IF ( WNTRES ) THEN
+             DO i = 1, K
+                CALL CAXPY( M, -EIGS(i), Z(1,i), 1, Y(1,i), 1 )       ! BLAS CALL
+                ! Y(1:M,i) = Y(1:M,i) - EIGS(i) * Z(1:M,i)            ! INTRINSIC
+                RES(i) = SCNRM2( M, Y(1,i), 1)                        ! BLAS CALL
+             END DO
+          END IF
+      END IF
+!
+      IF ( WHTSVD == 4 ) THEN
+          RWORK(N+1) = XSCL1
+          RWORK(N+2) = XSCL2
+      END IF
+!
+!     Successful exit.
+      IF ( .NOT. BADXY ) THEN
+         INFO = 0
+      ELSE
+         ! A warning on possible data inconsistency.
+         ! This should be a rare event.
+         INFO = 4
+      END IF
+!............................................................
+      RETURN
+!     ......
+      END SUBROUTINE CGEDMD
+
diff --git a/lapack-netlib/SRC/cgedmdq.c b/lapack-netlib/SRC/cgedmdq.c
new file mode 100644
index 000000000..6e3a1faca
--- /dev/null
+++ b/lapack-netlib/SRC/cgedmdq.c
@@ -0,0 +1,1289 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64) /* BLASLONG/BLASULONG: signed/unsigned "long" index types, chosen per platform */
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64 /* ILP64 build: blasint is BLASLONG; otherwise it is plain int */
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x) /* absolute value matching the long long blasint */
+#else
+#define blasabs(x) labs(x) /* absolute value matching the long blasint */
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x) /* absolute value matching the int blasint */
+#endif
+
+typedef blasint integer; /* integer type used by the translated code; same width as blasint */
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real; /* single-precision real */
+typedef double doublereal; /* double-precision real */
+typedef struct { real r, i; } complex; /* single-precision complex stored as a (re, im) pair */
+typedef struct { doublereal r, i; } doublecomplex; /* double-precision complex stored as a (re, im) pair */
+#ifdef _MSC_VER /* MSVC: convert to/from the _Fcomplex/_Dcomplex struct types */
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} /* value of *z as _Fcomplex */
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} /* value of *z as _Dcomplex */
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} /* same storage viewed as _Fcomplex */
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} /* same storage viewed as _Dcomplex */
+#else /* other compilers: convert to/from the native _Complex types */
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} /* value of *z as _Complex float */
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} /* value of *z as _Complex double */
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} /* same storage viewed as _Complex float */
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} /* same storage viewed as _Complex double */
+#endif
+#define pCf(z) (*_pCf(z)) /* assignable native-complex view of the complex object z points to */
+#define pCd(z) (*_pCd(z)) /* assignable native-complex view of the doublecomplex object z points to */
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccosf(Cf(Z)); } /* *R = cos(*Z), single-precision routine like c_sin/c_exp */
+#ifdef _MSC_VER /* no complex '/' operator here: *c = *a / *b via the (r, i) fields; operands are copied first so c may alias a or b */
+#define c_div(c, a, b) {real f2c_ar_ = (a)->r, f2c_ai_ = (a)->i, f2c_br_ = (b)->r, f2c_bi_ = (b)->i, f2c_d_ = f2c_br_*f2c_br_ + f2c_bi_*f2c_bi_; (c)->r = (f2c_ar_*f2c_br_ + f2c_ai_*f2c_bi_)/f2c_d_; (c)->i = (f2c_ai_*f2c_br_ - f2c_ar_*f2c_bi_)/f2c_d_;}
+#define z_div(c, a, b) {doublereal f2c_ar_ = (a)->r, f2c_ai_ = (a)->i, f2c_br_ = (b)->r, f2c_bi_ = (b)->i, f2c_d_ = f2c_br_*f2c_br_ + f2c_bi_*f2c_bi_; (c)->r = (f2c_ar_*f2c_br_ + f2c_ai_*f2c_bi_)/f2c_d_; (c)->i = (f2c_ai_*f2c_br_ - f2c_ar_*f2c_bi_)/f2c_d_;}
+#else /* native complex types: *c = *a / *b with the built-in operator */
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),Cd(B));} /* *R = (*A)**(*B); A and B are doublecomplex pointers, converted with Cd */
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) { /* returns x**n for an integer exponent n */
+	float result = 1.0; unsigned long int bits; /* running product; exponent bits still to process */
+	if (n == 0) return result; /* a zero exponent yields 1 for every x */
+	if (n < 0) { n = -n; x = 1/x; } /* negative exponent: raise 1/x to the power |n| */
+	bits = n;
+	for (;;) { /* square-and-multiply, least significant bit first */
+		if (bits & 01) result *= x;
+		if ((bits >>= 1) == 0) break; /* no bits left: skip the unused final squaring */
+		x *= x;
+	}
+	return result;
+}
+static double dpow_ui(double x, integer n) { /* returns x**n for an integer exponent n */
+	double result = 1.0; unsigned long int bits; /* running product; exponent bits still to process */
+	if (n == 0) return result; /* a zero exponent yields 1 for every x */
+	if (n < 0) { n = -n; x = 1/x; } /* negative exponent: raise 1/x to the power |n| */
+	bits = n;
+	for (;;) { /* square-and-multiply, least significant bit first */
+		if (bits & 01) result *= x;
+		if ((bits >>= 1) == 0) break; /* no bits left: skip the unused final squaring */
+		x *= x;
+	}
+	return result;
+}
+#ifdef _MSC_VER /* MSVC variant: no complex operators, so the arithmetic is spelled out on (r, i) */
+static _Fcomplex cpow_ui(complex x, integer n) { /* returns x**n; NOTE(review): pow_ci passes Cf(a), an _Fcomplex, to this 'complex' parameter - confirm */
+	complex pow={1.0,0.0}, t; real d; unsigned long int u; /* t: snapshot for in-place products; d: |x|^2 */
+	if(n != 0) {
+		if(n < 0) n = -n, d = x.r*x.r + x.i*x.i, x.r = x.r/d, x.i = -x.i/d; /* x = 1/x = conj(x)/|x|^2 */
+		for(u = n; ; ) { /* square-and-multiply over the bits of n */
+			if(u & 01) t = pow, pow.r = t.r*x.r - t.i*x.i, pow.i = t.r*x.i + t.i*x.r; /* pow *= x */
+			if(u >>= 1) t = x, x.r = t.r*t.r - t.i*t.i, x.i = 2*t.r*t.i; /* x *= x */
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else /* native complex arithmetic */
+static _Complex float cpow_ui(_Complex float x, integer n) { /* returns x**n for an integer exponent n */
+	_Complex float pow=1.0; unsigned long int u; /* running product; exponent bits still to process */
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x; /* negative exponent: raise 1/x to the power |n| */
+		for(u = n; ; ) { /* square-and-multiply over the bits of n */
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER /* MSVC variant: no complex operators, so the arithmetic is spelled out on _Val[0] (re) and _Val[1] (im) */
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) { /* returns x**n for an integer exponent n */
+	_Dcomplex pow={1.0,0.0}, t; double d; unsigned long int u; /* t: snapshot for in-place products; d: |x|^2 */
+	if(n != 0) {
+		if(n < 0) n = -n, d = x._Val[0]*x._Val[0] + x._Val[1]*x._Val[1], x._Val[0] = x._Val[0]/d, x._Val[1] = -x._Val[1]/d; /* x = 1/x = conj(x)/|x|^2 */
+		for(u = n; ; ) { /* square-and-multiply over the bits of n */
+			if(u & 01) t = pow, pow._Val[0] = t._Val[0]*x._Val[0] - t._Val[1]*x._Val[1], pow._Val[1] = t._Val[0]*x._Val[1] + t._Val[1]*x._Val[0]; /* pow *= x */
+			if(u >>= 1) t = x, x._Val[0] = t._Val[0]*t._Val[0] - t._Val[1]*t._Val[1], x._Val[1] = 2*t._Val[0]*t._Val[1]; /* x *= x */
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else /* native complex arithmetic */
+static _Complex double zpow_ui(_Complex double x, integer n) { /* returns x**n for an integer exponent n */
+	_Complex double pow=1.0; unsigned long int u; /* running product; exponent bits still to process */
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x; /* negative exponent: raise 1/x to the power |n| */
+		for(u = n; ; ) { /* square-and-multiply over the bits of n */
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) { /* returns x**n in integer arithmetic */
+	integer acc = 1; unsigned long int bits; /* running product; exponent bits still to process */
+	if (n <= 0) {
+		/* Non-positive exponents: only bases 1 and -1 keep a nonzero integer result. */
+		if (n == 0 || x == 1) return 1;
+		if (x != -1) return x == 0 ? 1/x : 0; /* note: the x == 0 case evaluates 1/0 */
+		n = -n; /* x == -1: the result depends only on the parity of |n| */
+	}
+	bits = n;
+	for (;;) { /* square-and-multiply, least significant bit first */
+		if (bits & 01) acc *= x;
+		bits >>= 1;
+		if (bits == 0) break; /* no bits left: skip the unused final squaring */
+		x *= x;
+	}
+	return acc;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{	/* 1-based position, counted from s, of the first largest entry among w[s-1..e-1]; n is not used */
+	double best = w[s-1]; integer pos = s, idx = s; /* start from the first entry of the range */
+	while (++idx <= e)
+		if (w[idx-1] > best) { pos = idx; best = w[idx-1]; } /* strict '>' keeps the earliest maximum */
+	return pos - s + 1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{	/* 1-based position, counted from s, of the first largest entry among w[s-1..e-1]; n is not used */
+	float best = w[s-1]; integer pos = s, idx = s; /* start from the first entry of the range */
+	while (++idx <= e)
+		if (w[idx-1] > best) { pos = idx; best = w[idx-1]; } /* strict '>' keeps the earliest maximum */
+	return pos - s + 1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { /* *z = sum over i of conj(x_i) * y_i, n terms, strides incx/incy */
+	integer n = *n_, incx = *incx_, incy = *incy_, i; /* by-reference arguments copied into locals */
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0}; /* no complex operators here: real and imaginary sums are formed by hand */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + conjg(x(i)) * y(i) */
+			zdotc._Val[0] += x[i].r * y[i].r + x[i].i * y[i].i;
+			zdotc._Val[1] += x[i].r * y[i].i - x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) { /* same sum with strides; element i is read at offset i*inc */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r + x[i*incx].i * y[i*incy].i;
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i - x[i*incx].i * y[i*incy].r;
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0; /* native complex accumulator */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + conjg(x(i)) * y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* same sum with strides; element i is read at offset i*inc */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) { /* *z = sum over i of conj(x_i) * y_i, n terms, strides incx/incy */
+	integer n = *n_, incx = *incx_, incy = *incy_, i; /* by-reference arguments copied into locals */
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0}; /* no complex operators here: real and imaginary sums are formed by hand */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i)) * y(i) */
+			zdotc._Val[0] += x[i].r * y[i].r + x[i].i * y[i].i;
+			zdotc._Val[1] += x[i].r * y[i].i - x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) { /* same sum with strides; element i is read at offset i*inc */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r + x[i*incx].i * y[i*incy].i;
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i - x[i*incx].i * y[i*incy].r;
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0; /* native complex accumulator */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i)) * y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* same sum with strides; element i is read at offset i*inc */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { /* *z = sum over i of x_i * y_i (no conjugation), n terms, strides incx/incy */
+	integer n = *n_, incx = *incx_, incy = *incy_, i; /* by-reference arguments copied into locals */
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0}; /* no complex operators here: real and imaginary sums are formed by hand */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc._Val[0] += x[i].r * y[i].r - x[i].i * y[i].i;
+			zdotc._Val[1] += x[i].r * y[i].i + x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) { /* same sum with strides; element i is read at offset i*inc */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r - x[i*incx].i * y[i*incy].i;
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i + x[i*incx].i * y[i*incy].r;
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0; /* native complex accumulator */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* same sum with strides; element i is read at offset i*inc */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) { /* *z = sum over i of x_i * y_i (no conjugation), n terms, strides incx/incy */
+	integer n = *n_, incx = *incx_, incy = *incy_, i; /* by-reference arguments copied into locals */
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0}; /* no complex operators here: real and imaginary sums are formed by hand */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc._Val[0] += x[i].r * y[i].r - x[i].i * y[i].i;
+			zdotc._Val[1] += x[i].r * y[i].i + x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) { /* same sum with strides; element i is read at offset i*inc */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r - x[i*incx].i * y[i*incy].i;
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i + x[i*incx].i * y[i*incy].r;
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0; /* native complex accumulator */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* same sum with strides; element i is read at offset i*inc */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static integer c_n1 = -1;
+
+/* Subroutine */ int cgedmdq_(char *jobs, char *jobz, char *jobr, char *jobq, 
+	char *jobt, char *jobf, integer *whtsvd, integer *m, integer *n, 
+	complex *f, integer *ldf, complex *x, integer *ldx, complex *y, 
+	integer *ldy, integer *nrnk, real *tol, integer *k, complex *eigs, 
+	complex *z__, integer *ldz, real *res, complex *b, integer *ldb, 
+	complex *v, integer *ldv, complex *s, integer *lds, complex *zwork, 
+	integer *lzwork, real *work, integer *lwork, integer *iwork, integer *
+	liwork, integer *info)
+{
+    /* System generated locals */
+    integer f_dim1, f_offset, x_dim1, x_offset, y_dim1, y_offset, z_dim1, 
+	    z_offset, b_dim1, b_offset, v_dim1, v_offset, s_dim1, s_offset, 
+	    i__1, i__2;
+
+    /* Local variables */
+    real zero;
+    integer info1;
+    extern logical lsame_(char *, char *);
+    char jobvl[1];
+    integer minmn;
+    logical wantq;
+    integer mlwqr, olwqr;
+    logical wntex;
+    complex zzero;
+    extern /* Subroutine */ int cgedmd_(char *, char *, char *, char *, 
+	    integer *, integer *, integer *, complex *, integer *, complex *, 
+	    integer *, integer *, real *, integer *, complex *, complex *, 
+	    integer *, real *, complex *, integer *, complex *, integer *, 
+	    complex *, integer *, complex *, integer *, real *, integer *, 
+	    integer *, integer *, integer *), 
+	    cgeqrf_(integer *, integer *, complex *, integer *, complex *, 
+	    complex *, integer *, integer *), clacpy_(char *, integer *, 
+	    integer *, complex *, integer *, complex *, integer *), 
+	    claset_(char *, integer *, integer *, complex *, complex *, 
+	    complex *, integer *), xerbla_(char *, integer *);
+    integer mlwdmd, olwdmd;
+    logical sccolx, sccoly;
+    extern /* Subroutine */ int cungqr_(integer *, integer *, integer *, 
+	    complex *, integer *, complex *, complex *, integer *, integer *);
+    integer iminwr;
+    logical wntvec, wntvcf;
+    integer mlwgqr;
+    logical wntref;
+    integer mlwork, olwgqr, olwork, mlrwrk, mlwmqr, olwmqr;
+    logical lquery, wntres, wnttrf, wntvcq;
+    extern /* Subroutine */ int cunmqr_(char *, char *, integer *, integer *, 
+	    integer *, complex *, integer *, complex *, complex *, integer *, 
+	    complex *, integer *, integer *);
+    real one;
+
+/* March 2023 */
+/* ..... */
+/*      USE                   iso_fortran_env */
+/*      INTEGER, PARAMETER :: WP = real32 */
+/* ..... */
+/*     Scalar arguments */
+/*     Array arguments */
+/* ..... */
+/*     Purpose */
+/*     ======= */
+/*     CGEDMDQ computes the Dynamic Mode Decomposition (DMD) for */
+/*     a pair of data snapshot matrices, using a QR factorization */
+/*     based compression of the data. For the input matrices */
+/*     X and Y such that Y = A*X with an inaccessible matrix */
+/*     A, CGEDMDQ computes a certain number of Ritz pairs of A using */
+/*     the standard Rayleigh-Ritz extraction from a subspace of */
+/*     range(X) that is determined using the leading left singular */
+/*     vectors of X. Optionally, CGEDMDQ returns the residuals */
+/*     of the computed Ritz pairs, the information needed for */
+/*     a refinement of the Ritz vectors, or the eigenvectors of */
+/*     the Exact DMD. */
+/*     For further details see the references listed */
+/*     below. For more details of the implementation see [3]. */
+
+/*     References */
+/*     ========== */
+/*     [1] P. Schmid: Dynamic mode decomposition of numerical */
+/*         and experimental data, */
+/*         Journal of Fluid Mechanics 656, 5-28, 2010. */
+/*     [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal */
+/*         decompositions: analysis and enhancements, */
+/*         SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. */
+/*     [3] Z. Drmac: A LAPACK implementation of the Dynamic */
+/*         Mode Decomposition I. Technical report. AIMDyn Inc. */
+/*         and LAPACK Working Note 298. */
+/*     [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. */
+/*         Brunton, N. Kutz: On Dynamic Mode Decomposition: */
+/*         Theory and Applications, Journal of Computational */
+/*         Dynamics 1(2), 391 -421, 2014. */
+
+/*     Developed and supported by: */
+/*     =========================== */
+/*     Developed and coded by Zlatko Drmac, Faculty of Science, */
+/*     University of Zagreb;  drmac@math.hr */
+/*     In cooperation with */
+/*     AIMdyn Inc., Santa Barbara, CA. */
+/*     and supported by */
+/*     - DARPA SBIR project "Koopman Operator-Based Forecasting */
+/*     for Nonstationary Processes from Near-Term, Limited */
+/*     Observational Data" Contract No: W31P4Q-21-C-0007 */
+/*     - DARPA PAI project "Physics-Informed Machine Learning */
+/*     Methodologies" Contract No: HR0011-18-9-0033 */
+/*     - DARPA MoDyL project "A Data-Driven, Operator-Theoretic */
+/*     Framework for Space-Time Analysis of Process Dynamics" */
+/*     Contract No: HR0011-16-C-0116 */
+/*     Any opinions, findings and conclusions or recommendations */
+/*     expressed in this material are those of the author and */
+/*     do not necessarily reflect the views of the DARPA SBIR */
+/*     Program Office. */
+/* ============================================================ */
+/*     Distribution Statement A: */
+/*     Approved for Public Release, Distribution Unlimited. */
+/*     Cleared by DARPA on September 29, 2022 */
+/* ============================================================ */
+/* ...................................................................... */
+/*     Arguments */
+/*     ========= */
+/*     JOBS (input) CHARACTER*1 */
+/*     Determines whether the initial data snapshots are scaled */
+/*     by a diagonal matrix. The data snapshots are the columns */
+/*     of F. The leading N-1 columns of F are denoted X and the */
+/*     trailing N-1 columns are denoted Y. */
+/*     'S' :: The data snapshots matrices X and Y are multiplied */
+/*            with a diagonal matrix D so that X*D has unit */
+/*            nonzero columns (in the Euclidean 2-norm) */
+/*     'C' :: The snapshots are scaled as with the 'S' option. */
+/*            If it is found that an i-th column of X is zero */
+/*            vector and the corresponding i-th column of Y is */
+/*            non-zero, then the i-th column of Y is set to */
+/*            zero and a warning flag is raised. */
+/*     'Y' :: The data snapshots matrices X and Y are multiplied */
+/*            by a diagonal matrix D so that Y*D has unit */
+/*            nonzero columns (in the Euclidean 2-norm) */
+/*     'N' :: No data scaling. */
+/* ..... */
+/*     JOBZ (input) CHARACTER*1 */
+/*     Determines whether the eigenvectors (Koopman modes) will */
+/*     be computed. */
+/*     'V' :: The eigenvectors (Koopman modes) will be computed */
+/*            and returned in the matrix Z. */
+/*            See the description of Z. */
+/*     'F' :: The eigenvectors (Koopman modes) will be returned */
+/*            in factored form as the product Z*V, where Z */
+/*            is orthonormal and V contains the eigenvectors */
+/*            of the corresponding Rayleigh quotient. */
+/*            See the descriptions of F, V, Z. */
+/*     'Q' :: The eigenvectors (Koopman modes) will be returned */
+/*            in factored form as the product Q*Z, where Z */
+/*            contains the eigenvectors of the compression of the */
+/*            underlying discretised operator onto the span of */
+/*            the data snapshots. See the descriptions of F, V, Z. */
+/*            Q is from the initial QR factorization. */
+/*     'N' :: The eigenvectors are not computed. */
+/* ..... */
+/*     JOBR (input) CHARACTER*1 */
+/*     Determines whether to compute the residuals. */
+/*     'R' :: The residuals for the computed eigenpairs will */
+/*            be computed and stored in the array RES. */
+/*            See the description of RES. */
+/*            For this option to be legal, JOBZ must be 'V'. */
+/*     'N' :: The residuals are not computed. */
+/* ..... */
+/*     JOBQ (input) CHARACTER*1 */
+/*     Specifies whether to explicitly compute and return the */
+/*     unitary matrix from the QR factorization. */
+/*     'Q' :: The matrix Q of the QR factorization of the data */
+/*            snapshot matrix is computed and stored in the */
+/*            array F. See the description of F. */
+/*     'N' :: The matrix Q is not explicitly computed. */
+/* ..... */
+/*     JOBT (input) CHARACTER*1 */
+/*     Specifies whether to return the upper triangular factor */
+/*     from the QR factorization. */
+/*     'R' :: The matrix R of the QR factorization of the data */
+/*            snapshot matrix F is returned in the array Y. */
+/*            See the description of Y and Further details. */
+/*     'N' :: The matrix R is not returned. */
+/* ..... */
+/*     JOBF (input) CHARACTER*1 */
+/*     Specifies whether to store information needed for post- */
+/*     processing (e.g. computing refined Ritz vectors) */
+/*     'R' :: The matrix needed for the refinement of the Ritz */
+/*            vectors is computed and stored in the array B. */
+/*            See the description of B. */
+/*     'E' :: The unscaled eigenvectors of the Exact DMD are */
+/*            computed and returned in the array B. See the */
+/*            description of B. */
+/*     'N' :: No eigenvector refinement data is computed. */
+/*     To be useful on exit, this option needs JOBQ='Q'. */
+/* ..... */
+/*     WHTSVD (input) INTEGER, WHTSVD in { 1, 2, 3, 4 } */
+/*     Allows for a selection of the SVD algorithm from the */
+/*     LAPACK library. */
+/*     1 :: CGESVD (the QR SVD algorithm) */
+/*     2 :: CGESDD (the Divide and Conquer algorithm; if enough */
+/*          workspace available, this is the fastest option) */
+/*     3 :: CGESVDQ (the preconditioned QR SVD  ; this and 4 */
+/*          are the most accurate options) */
+/*     4 :: CGEJSV (the preconditioned Jacobi SVD; this and 3 */
+/*          are the most accurate options) */
+/*     For the four methods above, a significant difference in */
+/*     the accuracy of small singular values is possible if */
+/*     the snapshots vary in norm so that X is severely */
+/*     ill-conditioned. If small (smaller than EPS*||X||) */
+/*     singular values are of interest and JOBS=='N',  then */
+/*     the options (3, 4) give the most accurate results, where */
+/*     the option 4 is slightly better and with stronger */
+/*     theoretical background. */
+/*     If JOBS=='S', i.e. the columns of X will be normalized, */
+/*     then all methods give nearly equally accurate results. */
+/* ..... */
+/*     M (input) INTEGER, M >= 0 */
+/*     The state space dimension (the number of rows of F). */
+/* ..... */
+/*     N (input) INTEGER, 0 <= N <= M */
+/*     The number of data snapshots from a single trajectory, */
+/*     taken at equidistant discrete times. This is the */
+/*     number of columns of F. */
+/* ..... */
+/*     F (input/output) COMPLEX(KIND=WP) M-by-N array */
+/*     > On entry, */
+/*     the columns of F are the sequence of data snapshots */
+/*     from a single trajectory, taken at equidistant discrete */
+/*     times. It is assumed that the column norms of F are */
+/*     in the range of the normalized floating point numbers. */
+/*     < On exit, */
+/*     If JOBQ == 'Q', the array F contains the orthogonal */
+/*     matrix/factor of the QR factorization of the initial */
+/*     data snapshots matrix F. See the description of JOBQ. */
+/*     If JOBQ == 'N', the entries in F strictly below the main */
+/*     diagonal contain, column-wise, the information on the */
+/*     Householder vectors, as returned by CGEQRF. The */
+/*     remaining information to restore the orthogonal matrix */
+/*     of the initial QR factorization is stored in ZWORK(1:MIN(M,N)). */
+/*     See the description of ZWORK. */
+/* ..... */
+/*     LDF (input) INTEGER, LDF >= M */
+/*     The leading dimension of the array F. */
+/* ..... */
+/*     X (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N-1) array */
+/*     X is used as workspace to hold representations of the */
+/*     leading N-1 snapshots in the orthonormal basis computed */
+/*     in the QR factorization of F. */
+/*     On exit, the leading K columns of X contain the leading */
+/*     K left singular vectors of the above described content */
+/*     of X. To lift them to the space of the left singular */
+/*     vectors U(:,1:K) of the input data, pre-multiply with the */
+/*     Q factor from the initial QR factorization. */
+/*     See the descriptions of F, K, V  and Z. */
+/* ..... */
+/*     LDX (input) INTEGER, LDX >= N */
+/*     The leading dimension of the array X. */
+/* ..... */
+/*     Y (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N) array */
+/*     Y is used as workspace to hold representations of the */
+/*     trailing N-1 snapshots in the orthonormal basis computed */
+/*     in the QR factorization of F. */
+/*     On exit, */
+/*     If JOBT == 'R', Y contains the MIN(M,N)-by-N upper */
+/*     triangular factor from the QR factorization of the data */
+/*     snapshot matrix F. */
+/* ..... */
+/*     LDY (input) INTEGER , LDY >= N */
+/*     The leading dimension of the array Y. */
+/* ..... */
+/*     NRNK (input) INTEGER */
+/*     Determines the mode how to compute the numerical rank, */
+/*     i.e. how to truncate small singular values of the input */
+/*     matrix X. On input, if */
+/*     NRNK = -1 :: i-th singular value sigma(i) is truncated */
+/*                  if sigma(i) <= TOL*sigma(1) */
+/*                  This option is recommended. */
+/*     NRNK = -2 :: i-th singular value sigma(i) is truncated */
+/*                  if sigma(i) <= TOL*sigma(i-1) */
+/*                  This option is included for R&D purposes. */
+/*                  It requires highly accurate SVD, which */
+/*                  may not be feasible. */
+/*     The numerical rank can be enforced by using positive */
+/*     value of NRNK as follows: */
+/*     0 < NRNK <= N-1 :: at most NRNK largest singular values */
+/*     will be used. If the number of the computed nonzero */
+/*     singular values is less than NRNK, then only those */
+/*     nonzero values will be used and the actually used */
+/*     dimension is less than NRNK. The actual number of */
+/*     the nonzero singular values is returned in the variable */
+/*     K. See the description of K. */
+/* ..... */
+/*     TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */
+/*     The tolerance for truncating small singular values. */
+/*     See the description of NRNK. */
+/* ..... */
+/*     K (output) INTEGER,  0 <= K <= N */
+/*     The dimension of the SVD/POD basis for the leading N-1 */
+/*     data snapshots (columns of F) and the number of the */
+/*     computed Ritz pairs. The value of K is determined */
+/*     according to the rule set by the parameters NRNK and */
+/*     TOL. See the descriptions of NRNK and TOL. */
+/* ..... */
+/*     EIGS (output) COMPLEX(KIND=WP) (N-1)-by-1 array */
+/*     The leading K (K<=N-1) entries of EIGS contain */
+/*     the computed eigenvalues (Ritz values). */
+/*     See the descriptions of K, and Z. */
+/* ..... */
+/*     Z (workspace/output) COMPLEX(KIND=WP)  M-by-(N-1) array */
+/*     If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) */
+/*     is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. */
+/*     If JOBZ == 'F', then the Z(:,i)'s are given implicitly as */
+/*     Z*V, where Z contains orthonormal matrix (the product of */
+/*     Q from the initial QR factorization and the SVD/POD_basis */
+/*     returned by CGEDMD in X) and the second factor (the */
+/*     eigenvectors of the Rayleigh quotient) is in the array V, */
+/*     as returned by CGEDMD. That is,  X(:,1:K)*V(:,i) */
+/*     is an eigenvector corresponding to EIGS(i). The columns */
+/*     of V(1:K,1:K) are the computed eigenvectors of the */
+/*     K-by-K Rayleigh quotient. */
+/*     See the descriptions of EIGS, X and V. */
+/* ..... */
+/*     LDZ (input) INTEGER , LDZ >= M */
+/*     The leading dimension of the array Z. */
+/* ..... */
+/*     RES (output) REAL(KIND=WP) (N-1)-by-1 array */
+/*     RES(1:K) contains the residuals for the K computed */
+/*     Ritz pairs, */
+/*     RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i) ||_2. */
+/*     See the description of EIGS and Z. */
+/* ..... */
+/*     B (output) COMPLEX(KIND=WP)  MIN(M,N)-by-(N-1) array. */
+/*     IF JOBF =='R', B(1:N,1:K) contains A*U(:,1:K), and can */
+/*     be used for computing the refined vectors; see further */
+/*     details in the provided references. */
+/*     If JOBF == 'E', B(1:N,1:K) contains */
+/*     A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */
+/*     Exact DMD, up to scaling by the inverse eigenvalues. */
+/*     In both cases, the content of B can be lifted to the */
+/*     original dimension of the input data by pre-multiplying */
+/*     with the Q factor from the initial QR factorization. */
+/*     Here A denotes a compression of the underlying operator. */
+/*     See the descriptions of F and X. */
+/*     If JOBF =='N', then B is not referenced. */
+/* ..... */
+/*     LDB (input) INTEGER, LDB >= MIN(M,N) */
+/*     The leading dimension of the array B. */
+/* ..... */
+/*     V (workspace/output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array */
+/*     On exit, V(1:K,1:K) contains the K eigenvectors of */
+/*     the Rayleigh quotient. The Ritz vectors */
+/*     (returned in Z) are the product of Q from the initial QR */
+/*     factorization (see the description of F) X (see the */
+/*     description of X) and V. */
+/* ..... */
+/*     LDV (input) INTEGER, LDV >= N-1 */
+/*     The leading dimension of the array V. */
+/* ..... */
+/*     S (output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array */
+/*     The array S(1:K,1:K) is used for the matrix Rayleigh */
+/*     quotient. This content is overwritten during */
+/*     the eigenvalue decomposition by CGEEV. */
+/*     See the description of K. */
+/* ..... */
+/*     LDS (input) INTEGER, LDS >= N-1 */
+/*     The leading dimension of the array S. */
+/* ..... */
+/*     ZWORK (workspace/output) COMPLEX(KIND=WP) LWORK-by-1 array */
+/*     On exit, */
+/*     ZWORK(1:MIN(M,N)) contains the scalar factors of the */
+/*     elementary reflectors as returned by CGEQRF of the */
+/*     M-by-N input matrix F. */
+/*     If the call to CGEDMDQ is only workspace query, then */
+/*     ZWORK(1) contains the minimal complex workspace length and */
+/*     ZWORK(2) is the optimal complex workspace length. */
+/*     Hence, the length of work is at least 2. */
+/*     See the description of LZWORK. */
+/* ..... */
+/*     LZWORK (input) INTEGER */
+/*     The minimal length of the  workspace vector ZWORK. */
+/*     LZWORK is calculated as follows: */
+/*     Let MLWQR  = N (minimal workspace for CGEQRF[M,N]) */
+/*         MLWDMD = minimal workspace for CGEDMD (see the */
+/*                  description of LWORK in CGEDMD) */
+/*         MLWMQR = N (minimal workspace for */
+/*                    CUNMQR['L','N',M,N,N]) */
+/*         MLWGQR = N (minimal workspace for CUNGQR[M,N,N]) */
+/*         MINMN  = MIN(M,N) */
+/*     Then */
+/*     LZWORK = MAX(2, MIN(M,N)+MLWQR, MINMN+MLWDMD) */
+/*     is further updated as follows: */
+/*        if   JOBZ == 'V' or JOBZ == 'F' THEN */
+/*             LZWORK = MAX( LZWORK, MINMN+MLWMQR ) */
+/*        if   JOBQ == 'Q' THEN */
+/*             LZWORK = MAX( LZWORK, MINMN+MLWGQR) */
+
+/* ..... */
+/*     WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array */
+/*     On exit, */
+/*     WORK(1:N-1) contains the singular values of */
+/*     the input submatrix F(1:M,1:N-1). */
+/*     If the call to CGEDMDQ is only workspace query, then */
+/*     WORK(1) contains the minimal workspace length and */
+/*     WORK(2) is the optimal workspace length. Hence, the */
+/*     length of WORK is at least 2. */
+/*     See the description of LWORK. */
+/* ..... */
+/*     LWORK (input) INTEGER */
+/*     The minimal length of the  workspace vector WORK. */
+/*     LWORK is the same as in CGEDMD, because in CGEDMDQ */
+/*     only CGEDMD requires real workspace for snapshots */
+/*     of dimensions MIN(M,N)-by-(N-1). */
+/*     If on entry LWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     and the optimal workspace lengths for both WORK and */
+/*     IWORK. See the descriptions of WORK and IWORK. */
+/* ..... */
+/*     IWORK (workspace/output) INTEGER LIWORK-by-1 array */
+/*     Workspace that is required only if WHTSVD equals */
+/*     2 , 3 or 4. (See the description of WHTSVD). */
+/*     If on entry LWORK =-1 or LIWORK=-1, then the */
+/*     minimal length of IWORK is computed and returned in */
+/*     IWORK(1). See the description of LIWORK. */
+/* ..... */
+/*     LIWORK (input) INTEGER */
+/*     The minimal length of the workspace vector IWORK. */
+/*     If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */
+/*     Let M1=MIN(M,N), N1=N-1. Then */
+/*     If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) */
+/*     If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) */
+/*     If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) */
+/*     If on entry LIWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     and the optimal workspace lengths for both WORK and */
+/*     IWORK. See the descriptions of WORK and IWORK. */
+/* ..... */
+/*     INFO (output) INTEGER */
+/*     -i < 0 :: On entry, the i-th argument had an */
+/*               illegal value */
+/*        = 0 :: Successful return. */
+/*        = 1 :: Void input. Quick exit (N=0 or N=1; covers M=0). */
+/*        = 2 :: The SVD computation of X did not converge. */
+/*               Suggestion: Check the input data and/or */
+/*               repeat with different WHTSVD. */
+/*        = 3 :: The computation of the eigenvalues did not */
+/*               converge. */
+/*        = 4 :: If data scaling was requested on input and */
+/*               the procedure found inconsistency in the data */
+/*               such that for some column index i, */
+/*               X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */
+/*               to zero if JOBS=='C'. The computation proceeds */
+/*               with original or modified data and warning */
+/*               flag is set with INFO=4. */
+/* ............................................................. */
+/* ............................................................. */
+/*     Parameters */
+/*     ~~~~~~~~~~ */
+/*     COMPLEX(KIND=WP), PARAMETER ::  ZONE = ( 1.0_WP, 0.0_WP ) */
+
+/*     Local scalars */
+/*     ~~~~~~~~~~~~~ */
+
+/*     External functions (BLAS and LAPACK) */
+/*     ~~~~~~~~~~~~~~~~~ */
+
+/*     External subroutines (BLAS and LAPACK) */
+/*     ~~~~~~~~~~~~~~~~~~~~ */
+/*     External subroutines */
+/*     ~~~~~~~~~~~~~~~~~~~~ */
+/*     Intrinsic functions */
+/*     ~~~~~~~~~~~~~~~~~~~ */
+/* .......................................................... */
+    /* Parameter adjustments */
+    f_dim1 = *ldf;
+    f_offset = 1 + f_dim1 * 1;
+    f -= f_offset;
+    x_dim1 = *ldx;
+    x_offset = 1 + x_dim1 * 1;
+    x -= x_offset;
+    y_dim1 = *ldy;
+    y_offset = 1 + y_dim1 * 1;
+    y -= y_offset;
+    --eigs;
+    z_dim1 = *ldz;
+    z_offset = 1 + z_dim1 * 1;
+    z__ -= z_offset;
+    --res;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    v_dim1 = *ldv;
+    v_offset = 1 + v_dim1 * 1;
+    v -= v_offset;
+    s_dim1 = *lds;
+    s_offset = 1 + s_dim1 * 1;
+    s -= s_offset;
+    --zwork;
+    --work;
+    --iwork;
+
+    /* Function Body */
+    one = 1.f;
+    zero = 0.f;
+    zzero.r = 0.f, zzero.i = 0.f;
+
+/*    Test the input arguments */
+    wntres = lsame_(jobr, "R");
+    sccolx = lsame_(jobs, "S") || lsame_(jobs, "C");
+    sccoly = lsame_(jobs, "Y");
+    wntvec = lsame_(jobz, "V");
+    wntvcf = lsame_(jobz, "F");
+    wntvcq = lsame_(jobz, "Q");
+    wntref = lsame_(jobf, "R");
+    wntex = lsame_(jobf, "E");
+    wantq = lsame_(jobq, "Q");
+    wnttrf = lsame_(jobt, "R");
+    minmn = f2cmin(*m,*n);
+    *info = 0;
+    lquery = *lwork == -1 || *liwork == -1;
+
+    if (! (sccolx || sccoly || lsame_(jobs, "N"))) {
+	*info = -1;
+    } else if (! (wntvec || wntvcf || wntvcq || lsame_(jobz, "N"))) {
+	*info = -2;
+    } else if (! (wntres || lsame_(jobr, "N")) || 
+	    wntres && lsame_(jobz, "N")) {
+	*info = -3;
+    } else if (! (wantq || lsame_(jobq, "N"))) {
+	*info = -4;
+    } else if (! (wnttrf || lsame_(jobt, "N"))) {
+	*info = -5;
+    } else if (! (wntref || wntex || lsame_(jobf, "N")))
+	     {
+	*info = -6;
+    } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == 
+	    4)) {
+	*info = -7;
+    } else if (*m < 0) {
+	*info = -8;
+    } else if (*n < 0 || *n > *m + 1) {
+	*info = -9;
+    } else if (*ldf < *m) {
+	*info = -11;
+    } else if (*ldx < minmn) {
+	*info = -13;
+    } else if (*ldy < minmn) {
+	*info = -15;
+    } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) {
+	*info = -16;
+    } else if (*tol < zero || *tol >= one) {
+	*info = -17;
+    } else if (*ldz < *m) {
+	*info = -21;
+    } else if ((wntref || wntex) && *ldb < minmn) {
+	*info = -24;
+    } else if (*ldv < *n - 1) {
+	*info = -26;
+    } else if (*lds < *n - 1) {
+	*info = -28;
+    }
+
+    if (wntvec || wntvcf || wntvcq) {
+	*(unsigned char *)jobvl = 'V';
+    } else {
+	*(unsigned char *)jobvl = 'N';
+    }
+    if (*info == 0) {
+/* Compute the minimal and the optimal workspace */
+/* requirements. Simulate running the code and */
+/* determine minimal and optimal sizes of the */
+/* workspace at any moment of the run. */
+	if (*n == 0 || *n == 1) {
+/* All output except K is void. INFO=1 signals */
+/* the void input. In case of a workspace query, */
+/* the minimal workspace lengths are returned. */
+	    if (lquery) {
+		iwork[1] = 1;
+		work[1] = 2.f;
+		work[2] = 2.f;
+	    } else {
+		*k = 0;
+	    }
+	    *info = 1;
+	    return 0;
+	}
+	mlrwrk = 2;
+	mlwork = 2;
+	olwork = 2;
+	iminwr = 1;
+	mlwqr = f2cmax(1,*n);
+/* Minimal workspace length for CGEQRF. */
+/* Computing MAX */
+	i__1 = mlwork, i__2 = minmn + mlwqr;
+	mlwork = f2cmax(i__1,i__2);
+	if (lquery) {
+	    cgeqrf_(m, n, &f[f_offset], ldf, &zwork[1], &zwork[1], &c_n1, &
+		    info1);
+	    olwqr = (integer) zwork[1].r;
+/* Computing MAX */
+	    i__1 = olwork, i__2 = minmn + olwqr;
+	    olwork = f2cmax(i__1,i__2);
+	}
+	i__1 = *n - 1;
+	cgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], 
+		ldx, &y[y_offset], ldy, nrnk, tol, k, &eigs[1], &z__[z_offset]
+		, ldz, &res[1], &b[b_offset], ldb, &v[v_offset], ldv, &s[
+		s_offset], lds, &zwork[1], lzwork, &work[1], &c_n1, &iwork[1],
+		 liwork, &info1);
+	mlwdmd = (integer) zwork[1].r;
+/* Computing MAX */
+	i__1 = mlwork, i__2 = minmn + mlwdmd;
+	mlwork = f2cmax(i__1,i__2);
+/* Computing MAX */
+	i__1 = mlrwrk, i__2 = (integer) work[1];
+	mlrwrk = f2cmax(i__1,i__2);
+	iminwr = f2cmax(iminwr,iwork[1]);
+	if (lquery) {
+	    olwdmd = (integer) zwork[2].r;
+/* Computing MAX */
+	    i__1 = olwork, i__2 = minmn + olwdmd;
+	    olwork = f2cmax(i__1,i__2);
+	}
+	if (wntvec || wntvcf) {
+	    mlwmqr = f2cmax(1,*n);
+/* Computing MAX */
+	    i__1 = mlwork, i__2 = minmn + mlwmqr;
+	    mlwork = f2cmax(i__1,i__2);
+	    if (lquery) {
+		cunmqr_("L", "N", m, n, &minmn, &f[f_offset], ldf, &zwork[1], 
+			&z__[z_offset], ldz, &zwork[1], &c_n1, &info1);
+		olwmqr = (integer) zwork[1].r;
+/* Computing MAX */
+		i__1 = olwork, i__2 = minmn + olwmqr;
+		olwork = f2cmax(i__1,i__2);
+	    }
+	}
+	if (wantq) {
+	    mlwgqr = f2cmax(1,*n);
+/* Computing MAX */
+	    i__1 = mlwork, i__2 = minmn + mlwgqr;
+	    mlwork = f2cmax(i__1,i__2);
+	    if (lquery) {
+		cungqr_(m, &minmn, &minmn, &f[f_offset], ldf, &zwork[1], &
+			zwork[1], &c_n1, &info1);
+		olwgqr = (integer) zwork[1].r;
+/* Computing MAX */
+		i__1 = olwork, i__2 = minmn + olwgqr;
+		olwork = f2cmax(i__1,i__2);
+	    }
+	}
+	if (*liwork < iminwr && ! lquery) {
+	    *info = -34;
+	}
+	if (*lwork < mlrwrk && ! lquery) {
+	    *info = -32;
+	}
+	if (*lzwork < mlwork && ! lquery) {
+	    *info = -30;
+	}
+    }
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("CGEDMDQ", &i__1);
+	return 0;
+    } else if (lquery) {
+/*     Return minimal and optimal workspace sizes */
+	iwork[1] = iminwr;
+	zwork[1].r = (real) mlwork, zwork[1].i = 0.f;
+	zwork[2].r = (real) olwork, zwork[2].i = 0.f;
+	work[1] = (real) mlrwrk;
+	work[2] = (real) mlrwrk;
+	return 0;
+    }
+/* ..... */
+/*     Initial QR factorization that is used to represent the */
+/*     snapshots as elements of lower dimensional subspace. */
+/*     For large scale computation with M >>N , at this place */
+/*     one can use an out of core QRF. */
+
+    i__1 = *lzwork - minmn;
+    cgeqrf_(m, n, &f[f_offset], ldf, &zwork[1], &zwork[minmn + 1], &i__1, &
+	    info1);
+
+/*     Define X and Y as the snapshots representations in the */
+/*     orthogonal basis computed in the QR factorization. */
+/*     X corresponds to the leading N-1 and Y to the trailing */
+/*     N-1 snapshots. */
+    i__1 = *n - 1;
+    claset_("L", &minmn, &i__1, &zzero, &zzero, &x[x_offset], ldx);
+    i__1 = *n - 1;
+    clacpy_("U", &minmn, &i__1, &f[f_offset], ldf, &x[x_offset], ldx);
+    i__1 = *n - 1;
+    clacpy_("A", &minmn, &i__1, &f[(f_dim1 << 1) + 1], ldf, &y[y_offset], ldy);
+    if (*m >= 3) {
+	i__1 = minmn - 2;
+	i__2 = *n - 2;
+	claset_("L", &i__1, &i__2, &zzero, &zzero, &y[y_dim1 + 3], ldy);
+    }
+
+/*     Compute the DMD of the projected snapshot pairs (X,Y) */
+    i__1 = *n - 1;
+    i__2 = *lzwork - minmn;
+    cgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], ldx,
+	     &y[y_offset], ldy, nrnk, tol, k, &eigs[1], &z__[z_offset], ldz, &
+	    res[1], &b[b_offset], ldb, &v[v_offset], ldv, &s[s_offset], lds, &
+	    zwork[minmn + 1], &i__2, &work[1], lwork, &iwork[1], liwork, &
+	    info1);
+    if (info1 == 2 || info1 == 3) {
+/* Return with error code. See CGEDMD for details. */
+	*info = info1;
+	return 0;
+    } else {
+	*info = info1;
+    }
+
+/*     The Ritz vectors (Koopman modes) can be explicitly */
+/*     formed or returned in factored form. */
+    if (wntvec) {
+/* Compute the eigenvectors explicitly. */
+	if (*m > minmn) {
+	    i__1 = *m - minmn;
+	    claset_("A", &i__1, k, &zzero, &zzero, &z__[minmn + 1 + z_dim1], 
+		    ldz);
+	}
+	i__1 = *lzwork - minmn;
+	cunmqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &zwork[1], &z__[
+		z_offset], ldz, &zwork[minmn + 1], &i__1, &info1);
+    } else if (wntvcf) {
+/*   Return the Ritz vectors (eigenvectors) in factored */
+/*   form Z*V, where Z contains orthonormal matrix (the */
+/*   product of Q from the initial QR factorization and */
+/*   the SVD/POD_basis returned by CGEDMD in X) and the */
+/*   second factor (the eigenvectors of the Rayleigh */
+/*   quotient) is in the array V, as returned by CGEDMD. */
+	clacpy_("A", n, k, &x[x_offset], ldx, &z__[z_offset], ldz);
+	if (*m > *n) {
+	    i__1 = *m - *n;
+	    claset_("A", &i__1, k, &zzero, &zzero, &z__[*n + 1 + z_dim1], ldz);
+	}
+	i__1 = *lzwork - minmn;
+	cunmqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &zwork[1], &z__[
+		z_offset], ldz, &zwork[minmn + 1], &i__1, &info1);
+    }
+
+/*     Some optional output variables: */
+
+/*     The upper triangular factor R in the initial QR */
+/*     factorization is optionally returned in the array Y. */
+/*     This is useful if this call to CGEDMDQ is to be */
+/*     followed by a streaming DMD that is implemented in a */
+/*     QR compressed form. */
+    if (wnttrf) {
+/* Return the upper triangular R in Y */
+	claset_("A", &minmn, n, &zzero, &zzero, &y[y_offset], ldy);
+	clacpy_("U", &minmn, n, &f[f_offset], ldf, &y[y_offset], ldy);
+    }
+
+/*     The orthonormal/unitary factor Q in the initial QR */
+/*     factorization is optionally returned in the array F. */
+/*     Same as with the triangular factor above, this is */
+/*     useful in a streaming DMD. */
+    if (wantq) {
+/* Q overwrites F */
+	i__1 = *lzwork - minmn;
+	cungqr_(m, &minmn, &minmn, &f[f_offset], ldf, &zwork[1], &zwork[minmn 
+		+ 1], &i__1, &info1);
+    }
+
+    return 0;
+
+} /* cgedmdq_ */
+
diff --git a/lapack-netlib/SRC/cgedmdq.f90 b/lapack-netlib/SRC/cgedmdq.f90
new file mode 100644
index 000000000..52c1669c7
--- /dev/null
+++ b/lapack-netlib/SRC/cgedmdq.f90
@@ -0,0 +1,689 @@
+SUBROUTINE CGEDMDQ( JOBS,  JOBZ, JOBR, JOBQ, JOBT, JOBF,   &
+                    WHTSVD,   M, N, F, LDF,  X, LDX,  Y,   &
+                    LDY,   NRNK,  TOL,   K,  EIGS,         &
+                    Z, LDZ, RES,  B,     LDB,   V, LDV,    & 
+                    S, LDS, ZWORK, LZWORK, WORK,  LWORK,   &
+                    IWORK, LIWORK, INFO )
+! March 2023
+!.....
+      USE                   iso_fortran_env
+      IMPLICIT NONE 
+      INTEGER, PARAMETER :: WP = real32
+!.....      
+!     Scalar arguments       
+      CHARACTER, INTENT(IN)  :: JOBS, JOBZ, JOBR, JOBQ,    &
+                                JOBT, JOBF
+      INTEGER,   INTENT(IN)  :: WHTSVD, M, N,   LDF, LDX,  &
+                                LDY, NRNK, LDZ, LDB, LDV,  &
+                                LDS, LZWORK,  LWORK, LIWORK
+      INTEGER,   INTENT(OUT) :: INFO,   K      
+      REAL(KIND=WP), INTENT(IN)    ::   TOL     
+!     Array arguments      
+      COMPLEX(KIND=WP), INTENT(INOUT) :: F(LDF,*)
+      COMPLEX(KIND=WP), INTENT(OUT)   :: X(LDX,*), Y(LDY,*), &
+                                         Z(LDZ,*), B(LDB,*), &
+                                         V(LDV,*), S(LDS,*)
+      COMPLEX(KIND=WP), INTENT(OUT)   :: EIGS(*)
+      COMPLEX(KIND=WP), INTENT(OUT)   :: ZWORK(*)
+      REAL(KIND=WP), INTENT(OUT)   :: RES(*)
+      REAL(KIND=WP), INTENT(OUT)   :: WORK(*)  
+      INTEGER,       INTENT(OUT)   :: IWORK(*)
+!.....      
+!     Purpose  
+!     =======
+!     CGEDMDQ computes the Dynamic Mode Decomposition (DMD) for
+!     a pair of data snapshot matrices, using a QR factorization
+!     based compression of the data. For the input matrices
+!     X and Y such that Y = A*X with an inaccessible matrix
+!     A, CGEDMDQ computes a certain number of Ritz pairs of A using
+!     the standard Rayleigh-Ritz extraction from a subspace of
+!     range(X) that is determined using the leading left singular 
+!     vectors of X. Optionally, CGEDMDQ returns the residuals 
+!     of the computed Ritz pairs, the information needed for
+!     a refinement of the Ritz vectors, or the eigenvectors of
+!     the Exact DMD.
+!     For further details see the references listed
+!     below. For more details of the implementation see [3].      
+!
+!     References
+!     ==========
+!     [1] P. Schmid: Dynamic mode decomposition of numerical
+!         and experimental data,
+!         Journal of Fluid Mechanics 656, 5-28, 2010.
+!     [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal
+!         decompositions: analysis and enhancements,
+!         SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018.
+!     [3] Z. Drmac: A LAPACK implementation of the Dynamic
+!         Mode Decomposition I. Technical report. AIMDyn Inc.
+!         and LAPACK Working Note 298.      
+!     [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. 
+!         Brunton, N. Kutz: On Dynamic Mode Decomposition:
+!         Theory and Applications, Journal of Computational
+!         Dynamics 1(2), 391 -421, 2014.
+!
+!     Developed and supported by:
+!     ===========================
+!     Developed and coded by Zlatko Drmac, Faculty of Science,
+!     University of Zagreb;  drmac@math.hr
+!     In cooperation with
+!     AIMdyn Inc., Santa Barbara, CA.
+!     and supported by
+!     - DARPA SBIR project "Koopman Operator-Based Forecasting
+!     for Nonstationary Processes from Near-Term, Limited
+!     Observational Data" Contract No: W31P4Q-21-C-0007
+!     - DARPA PAI project "Physics-Informed Machine Learning
+!     Methodologies" Contract No: HR0011-18-9-0033
+!     - DARPA MoDyL project "A Data-Driven, Operator-Theoretic
+!     Framework for Space-Time Analysis of Process Dynamics"
+!     Contract No: HR0011-16-C-0116
+!     Any opinions, findings and conclusions or recommendations 
+!     expressed in this material are those of the author and 
+!     do not necessarily reflect the views of the DARPA SBIR 
+!     Program Office.      
+!============================================================
+!     Distribution Statement A: 
+!     Approved for Public Release, Distribution Unlimited.
+!     Cleared by DARPA on September 29, 2022      
+!============================================================      
+!......................................................................      
+!     Arguments
+!     =========
+!     JOBS (input) CHARACTER*1
+!     Determines whether the initial data snapshots are scaled
+!     by a diagonal matrix. The data snapshots are the columns
+!     of F. The leading N-1 columns of F are denoted X and the
+!     trailing N-1 columns are denoted Y. 
+!     'S' :: The data snapshots matrices X and Y are multiplied
+!            with a diagonal matrix D so that X*D has unit
+!            nonzero columns (in the Euclidean 2-norm)
+!     'C' :: The snapshots are scaled as with the 'S' option.
+!            If it is found that an i-th column of X is zero
+!            vector and the corresponding i-th column of Y is
+!            non-zero, then the i-th column of Y is set to
+!            zero and a warning flag is raised.
+!     'Y' :: The data snapshots matrices X and Y are multiplied
+!            by a diagonal matrix D so that Y*D has unit
+!            nonzero columns (in the Euclidean 2-norm)    
+!     'N' :: No data scaling.   
+!.....
+!     JOBZ (input) CHARACTER*1
+!     Determines whether the eigenvectors (Koopman modes) will
+!     be computed.
+!     'V' :: The eigenvectors (Koopman modes) will be computed
+!            and returned in the matrix Z.
+!            See the description of Z.
+!     'F' :: The eigenvectors (Koopman modes) will be returned
+!            in factored form as the product Z*V, where Z
+!            is orthonormal and V contains the eigenvectors
+!            of the corresponding Rayleigh quotient.
+!            See the descriptions of F, V, Z.
+!     'Q' :: The eigenvectors (Koopman modes) will be returned
+!            in factored form as the product Q*Z, where Z
+!            contains the eigenvectors of the compression of the
+!            underlying discretised operator onto the span of
+!            the data snapshots. See the descriptions of F, V, Z.   
+!            Q is from the initial QR factorization.    
+!     'N' :: The eigenvectors are not computed.  
+!.....      
+!     JOBR (input) CHARACTER*1 
+!     Determines whether to compute the residuals.
+!     'R' :: The residuals for the computed eigenpairs will
+!            be computed and stored in the array RES.
+!            See the description of RES.
+!            For this option to be legal, JOBZ must be 'V'.
+!     'N' :: The residuals are not computed.
+!.....
+!     JOBQ (input) CHARACTER*1 
+!     Specifies whether to explicitly compute and return the
+!     unitary matrix from the QR factorization.
+!     'Q' :: The matrix Q of the QR factorization of the data
+!            snapshot matrix is computed and stored in the
+!            array F. See the description of F.       
+!     'N' :: The matrix Q is not explicitly computed.
+!.....
+!     JOBT (input) CHARACTER*1 
+!     Specifies whether to return the upper triangular factor
+!     from the QR factorization.
+!     'R' :: The matrix R of the QR factorization of the data 
+!            snapshot matrix F is returned in the array Y.
+!            See the description of Y and Further details.       
+!     'N' :: The matrix R is not returned. 
+!.....
+!     JOBF (input) CHARACTER*1
+!     Specifies whether to store information needed for post-
+!     processing (e.g. computing refined Ritz vectors)
+!     'R' :: The matrix needed for the refinement of the Ritz
+!            vectors is computed and stored in the array B.
+!            See the description of B.
+!     'E' :: The unscaled eigenvectors of the Exact DMD are 
+!            computed and returned in the array B. See the
+!            description of B.
+!     'N' :: No eigenvector refinement data is computed.   
+!     To be useful on exit, this option needs JOBQ='Q'.    
+!.....
+!     WHTSVD (input) INTEGER, WHTSVD in { 1, 2, 3, 4 }
+!     Allows for a selection of the SVD algorithm from the
+!     LAPACK library.
+!     1 :: CGESVD (the QR SVD algorithm)
+!     2 :: CGESDD (the Divide and Conquer algorithm; if enough
+!          workspace available, this is the fastest option)
+!     3 :: CGESVDQ (the preconditioned QR SVD  ; this and 4
+!          are the most accurate options)
+!     4 :: CGEJSV (the preconditioned Jacobi SVD; this and 3
+!          are the most accurate options)
+!     For the four methods above, a significant difference in
+!     the accuracy of small singular values is possible if
+!     the snapshots vary in norm so that X is severely
+!     ill-conditioned. If small (smaller than EPS*||X||)
+!     singular values are of interest and JOBS=='N',  then
+!     the options (3, 4) give the most accurate results, where
+!     the option 4 is slightly better and with stronger 
+!     theoretical background.
+!     If JOBS=='S', i.e. the columns of X will be normalized,
+!     then all methods give nearly equally accurate results.
+!.....
+!     M (input) INTEGER, M >= 0 
+!     The state space dimension (the number of rows of F).
+!.....      
+!     N (input) INTEGER, 0 <= N <= M
+!     The number of data snapshots from a single trajectory,
+!     taken at equidistant discrete times. This is the 
+!     number of columns of F.
+!.....
+!     F (input/output) COMPLEX(KIND=WP) M-by-N array
+!     > On entry,
+!     the columns of F are the sequence of data snapshots 
+!     from a single trajectory, taken at equidistant discrete
+!     times. It is assumed that the column norms of F are 
+!     in the range of the normalized floating point numbers. 
+!     < On exit,
+!     If JOBQ == 'Q', the array F contains the orthogonal 
+!     matrix/factor of the QR factorization of the initial 
+!     data snapshots matrix F. See the description of JOBQ. 
+!     If JOBQ == 'N', the entries in F strictly below the main
+!     diagonal contain, column-wise, the information on the 
+!     Householder vectors, as returned by CGEQRF. The 
+!     remaining information to restore the orthogonal matrix
+!     of the initial QR factorization is stored in ZWORK(1:MIN(M,N)). 
+!     See the description of ZWORK.
+!.....
+!     LDF (input) INTEGER, LDF >= M 
+!     The leading dimension of the array F.
+!.....
+!     X (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N-1) array
+!     X is used as workspace to hold representations of the
+!     leading N-1 snapshots in the orthonormal basis computed
+!     in the QR factorization of F.
+!     On exit, the leading K columns of X contain the leading
+!     K left singular vectors of the above described content
+!     of X. To lift them to the space of the left singular
+!     vectors U(:,1:K) of the input data, pre-multiply with the 
+!     Q factor from the initial QR factorization. 
+!     See the descriptions of F, K, V  and Z.
+!.....      
+!     LDX (input) INTEGER, LDX >= N  
+!     The leading dimension of the array X. 
+!.....
+!     Y (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N) array
+!     Y is used as workspace to hold representations of the
+!     trailing N-1 snapshots in the orthonormal basis computed
+!     in the QR factorization of F.
+!     On exit, 
+!     If JOBT == 'R', Y contains the MIN(M,N)-by-N upper
+!     triangular factor from the QR factorization of the data
+!     snapshot matrix F.
+!.....      
+!     LDY (input) INTEGER , LDY >= N
+!     The leading dimension of the array Y.   
+!.....
+!     NRNK (input) INTEGER
+!     Determines the mode how to compute the numerical rank,
+!     i.e. how to truncate small singular values of the input
+!     matrix X. On input, if
+!     NRNK = -1 :: i-th singular value sigma(i) is truncated
+!                  if sigma(i) <= TOL*sigma(1)
+!                  This option is recommended.
+!     NRNK = -2 :: i-th singular value sigma(i) is truncated
+!                  if sigma(i) <= TOL*sigma(i-1)
+!                  This option is included for R&D purposes.
+!                  It requires highly accurate SVD, which
+!                  may not be feasible.      
+!     The numerical rank can be enforced by using positive 
+!     value of NRNK as follows: 
+!     0 < NRNK <= N-1 :: at most NRNK largest singular values
+!     will be used. If the number of the computed nonzero
+!     singular values is less than NRNK, then only those
+!     nonzero values will be used and the actually used
+!     dimension is less than NRNK. The actual number of
+!     the nonzero singular values is returned in the variable
+!     K. See the description of K.
+!.....
+!     TOL (input) REAL(KIND=WP), 0 <= TOL < 1
+!     The tolerance for truncating small singular values.
+!     See the description of NRNK.  
+!.....
+!     K (output) INTEGER,  0 <= K <= N 
+!     The dimension of the SVD/POD basis for the leading N-1
+!     data snapshots (columns of F) and the number of the 
+!     computed Ritz pairs. The value of K is determined
+!     according to the rule set by the parameters NRNK and 
+!     TOL. See the descriptions of NRNK and TOL. 
+!.....
+!     EIGS (output) COMPLEX(KIND=WP) (N-1)-by-1 array
+!     The leading K (K<=N-1) entries of EIGS contain
+!     the computed eigenvalues (Ritz values).
+!     See the descriptions of K, and Z.
+!.....
+!     Z (workspace/output) COMPLEX(KIND=WP)  M-by-(N-1) array
+!     If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i)
+!     is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1.
+!     If JOBZ == 'F', then the Z(:,i)'s are given implicitly as
+!     Z*V, where Z contains orthonormal matrix (the product of
+!     Q from the initial QR factorization and the SVD/POD_basis
+!     returned by CGEDMD in X) and the second factor (the 
+!     eigenvectors of the Rayleigh quotient) is in the array V, 
+!     as returned by CGEDMD. That is,  X(:,1:K)*V(:,i)
+!     is an eigenvector corresponding to EIGS(i). The columns 
+!     of V(1:K,1:K) are the computed eigenvectors of the 
+!     K-by-K Rayleigh quotient.  
+!     See the descriptions of EIGS, X and V.      
+!.....
+!     LDZ (input) INTEGER , LDZ >= M
+!     The leading dimension of the array Z.
+!.....
+!     RES (output) REAL(KIND=WP) (N-1)-by-1 array
+!     RES(1:K) contains the residuals for the K computed 
+!     Ritz pairs, 
+!     RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i))||_2.
+!     See the description of EIGS and Z.      
+!.....
+!     B (output) COMPLEX(KIND=WP)  MIN(M,N)-by-(N-1) array.
+!     IF JOBF =='R', B(1:N,1:K) contains A*U(:,1:K), and can
+!     be used for computing the refined vectors; see further 
+!     details in the provided references. 
+!     If JOBF == 'E', B(1:N,1:K) contains 
+!     A*U(:,1:K)*W(1:K,1:K), which are the vectors from the
+!     Exact DMD, up to scaling by the inverse eigenvalues.   
+!     In both cases, the content of B can be lifted to the 
+!     original dimension of the input data by pre-multiplying
+!     with the Q factor from the initial QR factorization. 
+!     Here A denotes a compression of the underlying operator.      
+!     See the descriptions of F and X.
+!     If JOBF =='N', then B is not referenced.
+!.....
+!     LDB (input) INTEGER, LDB >= MIN(M,N)
+!     The leading dimension of the array B.
+!.....
+!     V (workspace/output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array
+!     On exit, V(1:K,1:K) contains the K eigenvectors of
+!     the Rayleigh quotient. The Ritz vectors
+!     (returned in Z) are the product of Q from the initial QR
+!     factorization (see the description of F) X (see the 
+!     description of X) and V.
+!.....
+!     LDV (input) INTEGER, LDV >= N-1
+!     The leading dimension of the array V.
+!.....      
+!     S (output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array
+!     The array S(1:K,1:K) is used for the matrix Rayleigh
+!     quotient. This content is overwritten during
+!     the eigenvalue decomposition by CGEEV.
+!     See the description of K.
+!.....
+!     LDS (input) INTEGER, LDS >= N-1        
+!     The leading dimension of the array S.
+!.....
+!     ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array
+!     On exit, 
+!     ZWORK(1:MIN(M,N)) contains the scalar factors of the 
+!     elementary reflectors as returned by CGEQRF of the 
+!     M-by-N input matrix F.   
+!     If the call to CGEDMDQ is only workspace query, then
+!     ZWORK(1) contains the minimal complex workspace length and
+!     ZWORK(2) is the optimal complex workspace length. 
+!     Hence, the length of ZWORK is at least 2.
+!     See the description of LZWORK.      
+!.....      
+!     LZWORK (input) INTEGER
+!     The minimal length of the  workspace vector ZWORK.
+!     LZWORK is calculated as follows:
+!     Let MLWQR  = N (minimal workspace for CGEQRF[M,N])
+!         MLWDMD = minimal workspace for CGEDMD (see the
+!                  description of LWORK in CGEDMD)
+!         MLWMQR = N (minimal workspace for 
+!                    CUNMQR['L','N',M,N,N])
+!         MLWGQR = N (minimal workspace for CUNGQR[M,N,N])
+!         MINMN  = MIN(M,N)      
+!     Then
+!     LZWORK = MAX(2, MIN(M,N)+MLWQR, MINMN+MLWDMD)
+!     is further updated as follows:
+!        if   JOBZ == 'V' or JOBZ == 'F' THEN 
+!             LZWORK = MAX( LZWORK, MINMN+MLWMQR )
+!        if   JOBQ == 'Q' THEN
+!             LZWORK = MAX( LZWORK, MINMN+MLWGQR)      
+!
+!.....      
+!     WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array
+!     On exit,
+!     WORK(1:N-1) contains the singular values of 
+!     the input submatrix F(1:M,1:N-1).
+!     If the call to CGEDMDQ is only workspace query, then
+!     WORK(1) contains the minimal workspace length and
+!     WORK(2) is the optimal workspace length. Hence, the
+!     length of work is at least 2.
+!     See the description of LWORK.
+!.....
+!     LWORK (input) INTEGER
+!     The minimal length of the  workspace vector WORK.
+!     LWORK is the same as in CGEDMD, because in CGEDMDQ
+!     only CGEDMD requires real workspace for snapshots
+!     of dimensions MIN(M,N)-by-(N-1).
+!     If on entry LWORK = -1, then a workspace query is
+!     assumed and the procedure only computes the minimal
+!     and the optimal workspace lengths for both WORK and
+!     IWORK. See the descriptions of WORK and IWORK.          
+!.....
+!     IWORK (workspace/output) INTEGER LIWORK-by-1 array
+!     Workspace that is required only if WHTSVD equals
+!     2 , 3 or 4. (See the description of WHTSVD).
+!     If on entry LWORK =-1 or LIWORK=-1, then the
+!     minimal length of IWORK is computed and returned in
+!     IWORK(1). See the description of LIWORK.
+!.....
+!     LIWORK (input) INTEGER
+!     The minimal length of the workspace vector IWORK.
+!     If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1
+!     Let M1=MIN(M,N), N1=N-1. Then      
+!     If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N))
+!     If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1)
+!     If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N)
+!     If on entry LIWORK = -1, then a workspace query is
+!     assumed and the procedure only computes the minimal
+!     and the optimal workspace lengths for both WORK and
+!     IWORK. See the descriptions of WORK and IWORK.
+!..... 
+!     INFO (output) INTEGER
+!     -i < 0 :: On entry, the i-th argument had an
+!               illegal value
+!        = 0 :: Successful return.
+!        = 1 :: Void input. Quick exit (M=0 or N=0).
+!        = 2 :: The SVD computation of X did not converge.
+!               Suggestion: Check the input data and/or
+!               repeat with different WHTSVD.
+!        = 3 :: The computation of the eigenvalues did not
+!               converge.
+!        = 4 :: If data scaling was requested on input and
+!               the procedure found inconsistency in the data
+!               such that for some column index i,
+!               X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set
+!               to zero if JOBS=='C'. The computation proceeds
+!               with original or modified data and warning
+!               flag is set with INFO=4.  
+!.............................................................
+!.............................................................
+!     Parameters
+!     ~~~~~~~~~~      
+      REAL(KIND=WP), PARAMETER ::  ONE = 1.0_WP
+      REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP
+!     COMPLEX(KIND=WP), PARAMETER ::  ZONE = ( 1.0_WP, 0.0_WP )
+      COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP )
+!      
+!     Local scalars      
+!     ~~~~~~~~~~~~~
+      INTEGER           :: IMINWR, INFO1,  MINMN, MLRWRK,   &
+                           MLWDMD, MLWGQR, MLWMQR, MLWORK,  & 
+                           MLWQR,  OLWDMD, OLWGQR, OLWMQR,  &
+                           OLWORK, OLWQR
+      LOGICAL           :: LQUERY, SCCOLX, SCCOLY, WANTQ,  &
+                           WNTTRF, WNTRES, WNTVEC, WNTVCF, &
+                           WNTVCQ, WNTREF, WNTEX
+      CHARACTER(LEN=1)  :: JOBVL
+!      
+!     External functions (BLAS and LAPACK)
+!     ~~~~~~~~~~~~~~~~~
+      LOGICAL       LSAME
+      EXTERNAL      LSAME 
+!
+!     External subroutines (BLAS and LAPACK)
+!     ~~~~~~~~~~~~~~~~~~~~
+      EXTERNAL      CGEQRF, CLACPY, CLASET, CUNGQR, & 
+                    CUNMQR, XERBLA
+
+!     External subroutines
+!     ~~~~~~~~~~~~~~~~~~~~
+      EXTERNAL      CGEDMD 
+      
+!     Intrinsic functions
+!     ~~~~~~~~~~~~~~~~~~~
+      INTRINSIC      MAX, MIN, INT         
+ !..........................................................  
+ !
+ !    Test the input arguments    
+      WNTRES = LSAME(JOBR,'R')
+      SCCOLX = LSAME(JOBS,'S') .OR. LSAME( JOBS, 'C' )
+      SCCOLY = LSAME(JOBS,'Y')
+      WNTVEC = LSAME(JOBZ,'V')
+      WNTVCF = LSAME(JOBZ,'F')
+      WNTVCQ = LSAME(JOBZ,'Q')
+      WNTREF = LSAME(JOBF,'R') 
+      WNTEX  = LSAME(JOBF,'E')
+      WANTQ  = LSAME(JOBQ,'Q')
+      WNTTRF = LSAME(JOBT,'R')     
+      MINMN  = MIN(M,N)
+      INFO = 0 
+      LQUERY = ( ( LWORK == -1 ) .OR. ( LIWORK == -1 ) )
+!       
+      IF ( .NOT. (SCCOLX .OR. SCCOLY .OR.                &
+                                  LSAME(JOBS,'N')) )  THEN 
+          INFO = -1
+      ELSE IF ( .NOT. (WNTVEC .OR. WNTVCF .OR. WNTVCQ       &
+                              .OR. LSAME(JOBZ,'N')) ) THEN
+          INFO = -2
+      ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR.    & 
+          ( WNTRES .AND. LSAME(JOBZ,'N') ) ) THEN
+          INFO = -3
+      ELSE IF ( .NOT. (WANTQ .OR. LSAME(JOBQ,'N')) ) THEN
+           INFO = -4                 
+      ELSE IF ( .NOT. ( WNTTRF .OR. LSAME(JOBT,'N') ) )  THEN
+          INFO = -5
+       ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR.             & 
+                LSAME(JOBF,'N') ) )                     THEN
+          INFO = -6    
+      ELSE IF ( .NOT. ((WHTSVD == 1).OR.(WHTSVD == 2).OR.   &
+                       (WHTSVD == 3).OR.(WHTSVD == 4)) ) THEN
+          INFO = -7
+      ELSE IF ( M < 0 ) THEN
+          INFO = -8
+      ELSE IF ( ( N < 0 ) .OR. ( N > M+1 ) ) THEN
+          INFO = -9
+      ELSE IF ( LDF < M ) THEN
+          INFO = -11
+      ELSE IF ( LDX < MINMN ) THEN
+          INFO = -13
+      ELSE IF ( LDY < MINMN ) THEN
+          INFO = -15
+      ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR.    & 
+                       ((NRNK >= 1).AND.(NRNK <=N ))) )  THEN
+          INFO = -16
+      ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) ) THEN
+          INFO = -17
+      ELSE IF ( LDZ < M ) THEN
+          INFO = -21
+      ELSE IF ( (WNTREF.OR.WNTEX ).AND.( LDB < MINMN ) ) THEN
+          INFO = -24
+      ELSE IF ( LDV < N-1 ) THEN
+          INFO = -26
+      ELSE IF ( LDS < N-1 ) THEN
+          INFO = -28
+      END IF
+!      
+      IF ( WNTVEC .OR. WNTVCF .OR. WNTVCQ ) THEN
+          JOBVL = 'V'
+      ELSE
+          JOBVL = 'N'
+      END IF     
+      IF ( INFO == 0 ) THEN  
+          ! Compute the minimal and the optimal workspace
+          ! requirements. Simulate running the code and 
+          ! determine minimal and optimal sizes of the 
+          ! workspace at any moment of the run.         
+         IF ( ( N == 0 ) .OR. ( N == 1 ) ) THEN
+             ! All output except K is void. INFO=1 signals
+             ! the void input. In case of a workspace query,
+             ! the minimal workspace lengths are returned.
+            IF ( LQUERY ) THEN  
+               IWORK(1) = 1
+                WORK(1) = 2
+                WORK(2) = 2
+            ELSE                
+               K = 0
+            END IF             
+            INFO = 1  
+            RETURN
+         END IF     
+         
+         MLRWRK = 2
+         MLWORK = 2
+         OLWORK = 2 
+         IMINWR = 1
+         MLWQR  = MAX(1,N)  ! Minimal workspace length for CGEQRF.
+         MLWORK = MAX(MLWORK,MINMN + MLWQR) 
+
+         IF ( LQUERY ) THEN 
+             CALL CGEQRF( M, N, F, LDF, ZWORK, ZWORK, -1, &
+                          INFO1 )
+             OLWQR  = INT(ZWORK(1))
+             OLWORK = MAX(OLWORK,MINMN + OLWQR)           
+         END IF
+         CALL CGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN,& 
+                      N-1, X, LDX, Y, LDY, NRNK, TOL, K,     & 
+                      EIGS, Z, LDZ, RES,  B, LDB, V, LDV,    & 
+                      S, LDS, ZWORK, LZWORK, WORK, -1, IWORK,&
+                      LIWORK, INFO1 )
+         MLWDMD = INT(ZWORK(1))
+         MLWORK = MAX(MLWORK, MINMN + MLWDMD)
+         MLRWRK = MAX(MLRWRK, INT(WORK(1)))
+         IMINWR = MAX(IMINWR, IWORK(1))
+         IF ( LQUERY ) THEN 
+             OLWDMD = INT(ZWORK(2))
+             OLWORK = MAX(OLWORK, MINMN+OLWDMD)
+         END IF
+         IF ( WNTVEC .OR. WNTVCF ) THEN
+            MLWMQR = MAX(1,N) 
+            MLWORK = MAX(MLWORK, MINMN+MLWMQR)
+            IF ( LQUERY ) THEN
+               CALL CUNMQR( 'L','N', M, N, MINMN, F, LDF,  & 
+                            ZWORK, Z, LDZ, ZWORK, -1, INFO1 )
+               OLWMQR = INT(ZWORK(1))
+               OLWORK = MAX(OLWORK, MINMN+OLWMQR)
+            END IF
+         END IF  
+         IF ( WANTQ ) THEN
+            MLWGQR = MAX(1,N)
+            MLWORK = MAX(MLWORK, MINMN+MLWGQR)
+            IF ( LQUERY ) THEN 
+                CALL CUNGQR( M, MINMN, MINMN, F, LDF, ZWORK, &
+                             ZWORK, -1, INFO1 )               
+                OLWGQR = INT(ZWORK(1))
+                OLWORK = MAX(OLWORK, MINMN+OLWGQR)
+            END IF            
+         END IF          
+         IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -34
+         IF ( LWORK  < MLRWRK .AND. (.NOT.LQUERY) ) INFO = -32
+         IF ( LZWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -30
+      END IF  
+      IF( INFO /= 0 ) THEN
+         CALL XERBLA( 'CGEDMDQ', -INFO )
+         RETURN
+      ELSE IF ( LQUERY ) THEN
+!     Return minimal and optimal workspace sizes
+          IWORK(1) = IMINWR
+          ZWORK(1) = MLWORK
+          ZWORK(2) = OLWORK
+          WORK(1)  = MLRWRK
+          WORK(2)  = MLRWRK
+          RETURN
+      END IF   
+!.....
+!     Initial QR factorization that is used to represent the
+!     snapshots as elements of lower dimensional subspace.
+!     For large scale computation with M >>N , at this place 
+!     one can use an out of core QRF.
+!   
+      CALL CGEQRF( M, N, F, LDF, ZWORK,                & 
+                   ZWORK(MINMN+1), LZWORK-MINMN, INFO1 )
+!      
+!     Define X and Y as the snapshots representations in the
+!     orthogonal basis computed in the QR factorization.
+!     X corresponds to the leading N-1 and Y to the trailing
+!     N-1 snapshots.
+      CALL CLASET( 'L', MINMN, N-1, ZZERO,  ZZERO, X, LDX )
+      CALL CLACPY( 'U', MINMN, N-1, F,      LDF, X, LDX )
+      CALL CLACPY( 'A', MINMN, N-1, F(1,2), LDF, Y, LDY )
+      IF ( M >= 3 ) THEN
+          CALL CLASET( 'L', MINMN-2, N-2, ZZERO,  ZZERO, &
+                       Y(3,1), LDY )  
+      END IF
+!
+!     Compute the DMD of the projected snapshot pairs (X,Y)   
+      CALL CGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN, &
+                  N-1,  X, LDX, Y, LDY, NRNK,   TOL, K,    &
+                  EIGS, Z, LDZ, RES, B,  LDB,   V, LDV,    &
+                  S, LDS, ZWORK(MINMN+1), LZWORK-MINMN,    & 
+                  WORK,   LWORK, IWORK, LIWORK, INFO1 )
+      IF ( INFO1 == 2 .OR. INFO1 == 3 ) THEN
+          ! Return with error code. See CGEDMD for details.
+          INFO = INFO1
+          RETURN
+      ELSE
+          INFO = INFO1
+      END IF    
+!      
+!     The Ritz vectors (Koopman modes) can be explicitly 
+!     formed or returned in factored form.
+      IF ( WNTVEC ) THEN
+        ! Compute the eigenvectors explicitly.  
+        IF ( M > MINMN ) CALL CLASET( 'A', M-MINMN, K, ZZERO, &
+                                     ZZERO, Z(MINMN+1,1), LDZ )
+        CALL CUNMQR( 'L','N', M, K, MINMN, F, LDF, ZWORK, Z,  &
+             LDZ, ZWORK(MINMN+1), LZWORK-MINMN, INFO1 )
+      ELSE IF ( WNTVCF ) THEN   
+        !   Return the Ritz vectors (eigenvectors) in factored
+        !   form Z*V, where Z contains orthonormal matrix (the
+        !   product of Q from the initial QR factorization and 
+        !   the SVD/POD_basis returned by CGEDMD in X) and the 
+        !   second factor (the eigenvectors of the Rayleigh 
+        !   quotient) is in the array V, as returned by CGEDMD.
+        CALL CLACPY( 'A', MINMN, K, X, LDX, Z, LDZ ) ! X has MINMN rows (N may be M+1)
+        IF ( M > MINMN ) CALL CLASET( 'A', M-MINMN, K, ZZERO, &
+                                 ZZERO, Z(MINMN+1,1), LDZ )
+        CALL CUNMQR( 'L','N', M, K, MINMN, F, LDF, ZWORK, Z, &
+                    LDZ, ZWORK(MINMN+1), LZWORK-MINMN, INFO1 )
+      END IF
+!     
+!     Some optional output variables:
+!
+!     The upper triangular factor R in the initial QR 
+!     factorization is optionally returned in the array Y.
+!     This is useful if this call to CGEDMDQ is to be 
+      
+!     followed by a streaming DMD that is implemented in a 
+!     QR compressed form.
+      IF ( WNTTRF ) THEN ! Return the upper triangular R in Y 
+         CALL CLASET( 'A', MINMN, N, ZZERO,  ZZERO, Y, LDY )
+         CALL CLACPY( 'U', MINMN, N, F, LDF,        Y, LDY )
+      END IF    
+!
+!     The orthonormal/unitary factor Q in the initial QR 
+!     factorization is optionally returned in the array F. 
+!     Same as with the triangular factor above, this is 
+!     useful in a streaming DMD.
+      IF ( WANTQ ) THEN                   ! Q overwrites F 
+         CALL CUNGQR( M, MINMN, MINMN, F, LDF, ZWORK,     &
+                      ZWORK(MINMN+1), LZWORK-MINMN, INFO1 )  
+      END IF
+!      
+      RETURN
+!      
+      END SUBROUTINE CGEDMDQ
+    
\ No newline at end of file
diff --git a/lapack-netlib/SRC/cgees.f b/lapack-netlib/SRC/cgees.f
index 71acfdba3..2085dc49b 100644
--- a/lapack-netlib/SRC/cgees.f
+++ b/lapack-netlib/SRC/cgees.f
@@ -189,7 +189,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEeigen
+*> \ingroup gees
 *
 *  =====================================================================
       SUBROUTINE CGEES( JOBVS, SORT, SELECT, N, A, LDA, SDIM, W, VS,
@@ -230,13 +230,13 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CCOPY, CGEBAK, CGEBAL, CGEHRD, CHSEQR, CLACPY,
-     $                   CLASCL, CTRSEN, CUNGHR, SLABAD, XERBLA
+     $                   CLASCL, CTRSEN, CUNGHR, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               CLANGE, SLAMCH
-      EXTERNAL           LSAME, ILAENV, CLANGE, SLAMCH
+      REAL               CLANGE, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, CLANGE, SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, SQRT
@@ -292,7 +292,7 @@
                MAXWRK = MAX( MAXWRK, HSWORK )
             END IF
          END IF
-         WORK( 1 ) = MAXWRK
+         WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN
             INFO = -12
@@ -318,7 +318,6 @@
       EPS = SLAMCH( 'P' )
       SMLNUM = SLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -413,7 +412,7 @@
          CALL CCOPY( N, A, LDA+1, W, 1 )
       END IF
 *
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
       RETURN
 *
 *     End of CGEES
diff --git a/lapack-netlib/SRC/cgeesx.f b/lapack-netlib/SRC/cgeesx.f
index 782e36747..036ae90c2 100644
--- a/lapack-netlib/SRC/cgeesx.f
+++ b/lapack-netlib/SRC/cgeesx.f
@@ -230,7 +230,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEeigen
+*> \ingroup geesx
 *
 *  =====================================================================
       SUBROUTINE CGEESX( JOBVS, SORT, SELECT, SENSE, N, A, LDA, SDIM, W,
@@ -274,13 +274,13 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CCOPY, CGEBAK, CGEBAL, CGEHRD, CHSEQR, CLACPY,
-     $                   CLASCL, CTRSEN, CUNGHR, SLABAD, SLASCL, XERBLA
+     $                   CLASCL, CTRSEN, CUNGHR, SLASCL, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               CLANGE, SLAMCH
-      EXTERNAL           LSAME, ILAENV, CLANGE, SLAMCH
+      REAL               CLANGE, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, CLANGE, SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, SQRT
@@ -350,7 +350,7 @@
             IF( .NOT.WANTSN )
      $         LWRK = MAX( LWRK, ( N*N )/2 )
          END IF
-         WORK( 1 ) = LWRK
+         WORK( 1 ) = SROUNDUP_LWORK(LWRK)
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN
             INFO = -15
@@ -376,7 +376,6 @@
       EPS = SLAMCH( 'P' )
       SMLNUM = SLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -488,7 +487,7 @@
          END IF
       END IF
 *
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
       RETURN
 *
 *     End of CGEESX
diff --git a/lapack-netlib/SRC/cgeev.f b/lapack-netlib/SRC/cgeev.f
index a77525ef8..bb41599d1 100644
--- a/lapack-netlib/SRC/cgeev.f
+++ b/lapack-netlib/SRC/cgeev.f
@@ -172,7 +172,7 @@
 *
 *  @generated from zgeev.f, fortran z -> c, Tue Apr 19 01:47:44 2016
 *
-*> \ingroup complexGEeigen
+*> \ingroup geev
 *
 *  =====================================================================
       SUBROUTINE CGEEV( JOBVL, JOBVR, N, A, LDA, W, VL, LDVL, VR, LDVR,
@@ -212,14 +212,15 @@
       REAL   DUM( 1 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SLABAD, XERBLA, CSSCAL, CGEBAK, CGEBAL, CGEHRD,
+      EXTERNAL           XERBLA, CSSCAL, CGEBAK, CGEBAL, CGEHRD,
      $                   CHSEQR, CLACPY, CLASCL, CSCAL, CTREVC3, CUNGHR
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ISAMAX, ILAENV
-      REAL   SLAMCH, SCNRM2, CLANGE
-      EXTERNAL           LSAME, ISAMAX, ILAENV, SLAMCH, SCNRM2, CLANGE
+      REAL               SLAMCH, SCNRM2, CLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ISAMAX, ILAENV, SLAMCH, SCNRM2, CLANGE,
+     $                   SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          REAL, CMPLX, CONJG, AIMAG, MAX, SQRT
@@ -291,7 +292,7 @@
             HSWORK = INT( WORK(1) )
             MAXWRK = MAX( MAXWRK, HSWORK, MINWRK )
          END IF
-         WORK( 1 ) = MAXWRK
+         WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN
             INFO = -12
@@ -315,7 +316,6 @@
       EPS = SLAMCH( 'P' )
       SMLNUM = SLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -493,7 +493,7 @@
          END IF
       END IF
 *
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
       RETURN
 *
 *     End of CGEEV
diff --git a/lapack-netlib/SRC/cgeevx.f b/lapack-netlib/SRC/cgeevx.f
index 2388f5acc..5dbc394e9 100644
--- a/lapack-netlib/SRC/cgeevx.f
+++ b/lapack-netlib/SRC/cgeevx.f
@@ -279,7 +279,7 @@
 *
 *  @generated from zgeevx.f, fortran z -> c, Tue Apr 19 01:47:44 2016
 *
-*> \ingroup complexGEeigen
+*> \ingroup geevx
 *
 *  =====================================================================
       SUBROUTINE CGEEVX( BALANC, JOBVL, JOBVR, SENSE, N, A, LDA, W, VL,
@@ -323,15 +323,16 @@
       REAL   DUM( 1 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SLABAD, SLASCL, XERBLA, CSSCAL, CGEBAK, CGEBAL,
+      EXTERNAL           SLASCL, XERBLA, CSSCAL, CGEBAK, CGEBAL,
      $                   CGEHRD, CHSEQR, CLACPY, CLASCL, CSCAL, CTREVC3,
      $                   CTRSNA, CUNGHR
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ISAMAX, ILAENV
-      REAL   SLAMCH, SCNRM2, CLANGE
-      EXTERNAL           LSAME, ISAMAX, ILAENV, SLAMCH, SCNRM2, CLANGE
+      REAL               SLAMCH, SCNRM2, CLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ISAMAX, ILAENV, SLAMCH, SCNRM2, CLANGE,
+     $                   SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          REAL, CMPLX, CONJG, AIMAG, MAX, SQRT
@@ -434,7 +435,7 @@
             END IF
             MAXWRK = MAX( MAXWRK, MINWRK )
          END IF
-         WORK( 1 ) = MAXWRK
+         WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN
             INFO = -20
@@ -458,7 +459,6 @@
       EPS = SLAMCH( 'P' )
       SMLNUM = SLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -657,7 +657,7 @@
          END IF
       END IF
 *
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
       RETURN
 *
 *     End of CGEEVX
diff --git a/lapack-netlib/SRC/cgehrd.f b/lapack-netlib/SRC/cgehrd.f
index d9c050267..7ba87cc01 100644
--- a/lapack-netlib/SRC/cgehrd.f
+++ b/lapack-netlib/SRC/cgehrd.f
@@ -89,7 +89,7 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX array, dimension (LWORK)
+*>          WORK is COMPLEX array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
@@ -120,7 +120,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEcomputational
+*> \ingroup gehrd
 *
 *> \par Further Details:
 *  =====================
@@ -201,7 +201,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -221,13 +222,19 @@
          INFO = -8
       END IF
 *
+      NH = IHI - ILO + 1
       IF( INFO.EQ.0 ) THEN
 *
 *        Compute the workspace requirements
 *
-         NB = MIN( NBMAX, ILAENV( 1, 'CGEHRD', ' ', N, ILO, IHI, -1 ) )
-         LWKOPT = N*NB + TSIZE
-         WORK( 1 ) = LWKOPT
+         IF( NH.LE.1 ) THEN
+            LWKOPT = 1
+         ELSE
+            NB = MIN( NBMAX, ILAENV( 1, 'CGEHRD', ' ', N, ILO, IHI,
+     $                              -1 ) )
+            LWKOPT = N*NB + TSIZE
+         END IF
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -248,7 +255,6 @@
 *
 *     Quick return if possible
 *
-      NH = IHI - ILO + 1
       IF( NH.LE.1 ) THEN
          WORK( 1 ) = 1
          RETURN
@@ -268,7 +274,7 @@
 *
 *           Determine if workspace is large enough for blocked code
 *
-            IF( LWORK.LT.N*NB+TSIZE ) THEN
+            IF( LWORK.LT.LWKOPT ) THEN
 *
 *              Not enough workspace to use optimal NB:  determine the
 *              minimum value of NB, and reduce NB or force use of
@@ -344,7 +350,8 @@
 *     Use unblocked code to reduce the rest of the matrix
 *
       CALL CGEHD2( N, I, IHI, A, LDA, TAU, WORK, IINFO )
-      WORK( 1 ) = LWKOPT
+*
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cgejsv.f b/lapack-netlib/SRC/cgejsv.f
index 062ac182b..1fc75613e 100644
--- a/lapack-netlib/SRC/cgejsv.f
+++ b/lapack-netlib/SRC/cgejsv.f
@@ -52,10 +52,10 @@
 *> are computed and stored in the arrays U and V, respectively. The diagonal
 *> of [SIGMA] is computed and stored in the array SVA.
 *> \endverbatim
-*>
-*>  Arguments:
-*>  ==========
-*>
+*
+*  Arguments:
+*  ==========
+*
 *> \param[in] JOBA
 *> \verbatim
 *>          JOBA is CHARACTER*1
@@ -151,7 +151,7 @@
 *>         transposed A if A^* seems to be better with respect to convergence.
 *>         If the matrix is not square, JOBT is ignored.
 *>         The decision is based on two values of entropy over the adjoint
-*>         orbit of A^* * A. See the descriptions of WORK(6) and WORK(7).
+*>         orbit of A^* * A. See the descriptions of RWORK(6) and RWORK(7).
 *>       = 'T': transpose if entropy test indicates possibly faster
 *>         convergence of Jacobi process if A^* is taken as input. If A is
 *>         replaced with A^*, then the row pivoting is included automatically.
@@ -209,11 +209,11 @@
 *> \verbatim
 *>          SVA is REAL array, dimension (N)
 *>          On exit,
-*>          - For WORK(1)/WORK(2) = ONE: The singular values of A. During the
-*>            computation SVA contains Euclidean column norms of the
+*>          - For RWORK(1)/RWORK(2) = ONE: The singular values of A. During
+*>            the computation SVA contains Euclidean column norms of the
 *>            iterated matrices in the array A.
-*>          - For WORK(1) .NE. WORK(2): The singular values of A are
-*>            (WORK(1)/WORK(2)) * SVA(1:N). This factored form is used if
+*>          - For RWORK(1) .NE. RWORK(2): The singular values of A are
+*>            (RWORK(1)/RWORK(2)) * SVA(1:N). This factored form is used if
 *>            sigma_max(A) overflows or if small singular values have been
 *>            saved from underflow by scaling the input matrix A.
 *>          - If JOBR='R' then some of the singular values may be returned
@@ -252,7 +252,7 @@
 *>          If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of
 *>                         the right singular vectors;
 *>          If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N),
-*>                         then V is used as workspace if the pprocedure
+*>                         then V is used as workspace if the procedure
 *>                         replaces A with A^*. In that case, [U] is computed
 *>                         in V as right singular vectors of A^* and then
 *>                         copied back to the U array. This 'W' option is just
@@ -1819,7 +1819,7 @@
                IF ( CONDR2 .GE. COND_OK ) THEN
 *                 .. save the Householder vectors used for Q3
 *                 (this overwrites the copy of R2, as it will not be
-*                 needed in this branch, but it does not overwritte the
+*                 needed in this branch, but it does not overwrite the
 *                 Huseholder vectors of Q2.).
                   CALL CLACPY( 'U', NR, NR, V, LDV, CWORK(2*N+1), N )
 *                 .. and the rest of the information on Q3 is in
@@ -1842,7 +1842,7 @@
             END IF
 *
 *        Second preconditioning finished; continue with Jacobi SVD
-*        The input matrix is lower trinagular.
+*        The input matrix is lower triangular.
 *
 *        Recover the right singular vectors as solution of a well
 *        conditioned triangular matrix equation.
@@ -1886,7 +1886,7 @@
             ELSE IF ( CONDR2 .LT. COND_OK ) THEN
 *
 *              The matrix R2 is inverted. The solution of the matrix equation
-*              is Q3^* * V3 = the product of the Jacobi rotations (appplied to
+*              is Q3^* * V3 = the product of the Jacobi rotations (applied to
 *              the lower triangular L3 from the LQ factorization of
 *              R2=L3*Q3), pre-multiplied with the transposed Q3.
                CALL CGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U,
diff --git a/lapack-netlib/SRC/cgelq.f b/lapack-netlib/SRC/cgelq.f
index d66033166..24aaa982e 100644
--- a/lapack-netlib/SRC/cgelq.f
+++ b/lapack-netlib/SRC/cgelq.f
@@ -98,7 +98,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1 or -2, then a workspace query is assumed. The routine
 *>          only calculates the sizes of the T and WORK arrays, returns these
 *>          values as the first entries of the T and WORK arrays, and no error
@@ -166,6 +166,8 @@
 *>  the LQ factorization.
 *> \endverbatim
 *>
+*> \ingroup gelq
+*>
 *  =====================================================================
       SUBROUTINE CGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK,
      $                  INFO )
@@ -190,7 +192,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CGELQT, CLASWLQ, XERBLA
@@ -292,9 +295,9 @@
         T( 2 ) = MB
         T( 3 ) = NB
         IF( MINW ) THEN
-          WORK( 1 ) = LWMIN
+          WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
         ELSE
-          WORK( 1 ) = LWREQ
+          WORK( 1 ) = SROUNDUP_LWORK( LWREQ )
         END IF
       END IF
       IF( INFO.NE.0 ) THEN
@@ -319,7 +322,7 @@
      $                LWORK, INFO )
       END IF
 *
-      WORK( 1 ) = LWREQ
+      WORK( 1 ) = SROUNDUP_LWORK( LWREQ )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cgelqf.f b/lapack-netlib/SRC/cgelqf.f
index 37ef13a27..3847a958a 100644
--- a/lapack-netlib/SRC/cgelqf.f
+++ b/lapack-netlib/SRC/cgelqf.f
@@ -93,7 +93,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= max(1,M).
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M, otherwise.
 *>          For optimum performance LWORK >= M*NB, where NB is the
 *>          optimal blocksize.
 *>
@@ -118,7 +119,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEcomputational
+*> \ingroup gelqf
 *
 *> \par Further Details:
 *  =====================
@@ -167,16 +168,16 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
 *     Test the input arguments
 *
       INFO = 0
+      K = MIN( M, N )
       NB = ILAENV( 1, 'CGELQF', ' ', M, N, -1, -1 )
-      LWKOPT = M*NB
-      WORK( 1 ) = LWKOPT
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -184,19 +185,25 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN
-         INFO = -7
+      ELSE IF( .NOT.LQUERY ) THEN
+         IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) )
+     $      INFO = -7
       END IF
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'CGELQF', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
+         IF( K.EQ.0 ) THEN
+            LWKOPT = 1
+         ELSE
+            LWKOPT = M*NB
+         END IF
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
          RETURN
       END IF
 *
 *     Quick return if possible
 *
-      K = MIN( M, N )
       IF( K.EQ.0 ) THEN
          WORK( 1 ) = 1
          RETURN
@@ -266,7 +273,7 @@
      $   CALL CGELQ2( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK,
      $                IINFO )
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK( IWS )
       RETURN
 *
 *     End of CGELQF
diff --git a/lapack-netlib/SRC/cgelqt3.f b/lapack-netlib/SRC/cgelqt3.f
index 1dfbd3f2b..553087bf4 100644
--- a/lapack-netlib/SRC/cgelqt3.f
+++ b/lapack-netlib/SRC/cgelqt3.f
@@ -159,7 +159,8 @@
 *
 *        Compute Householder transform when M=1
 *
-         CALL CLARFG( N, A, A( 1, MIN( 2, N ) ), LDA, T )
+         CALL CLARFG( N, A( 1, 1 ), A( 1, MIN( 2, N ) ), LDA,
+     &                T( 1, 1 ) )
          T(1,1)=CONJG(T(1,1))
 *
       ELSE
diff --git a/lapack-netlib/SRC/cgelsd.f b/lapack-netlib/SRC/cgelsd.f
index fce4ca6e2..5d7eec68d 100644
--- a/lapack-netlib/SRC/cgelsd.f
+++ b/lapack-netlib/SRC/cgelsd.f
@@ -60,12 +60,6 @@
 *> singular values which are less than RCOND times the largest singular
 *> value.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
@@ -210,7 +204,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEsolve
+*> \ingroup gelsd
 *
 *> \par Contributors:
 *  ==================
@@ -255,13 +249,13 @@
 *     .. External Subroutines ..
       EXTERNAL           CGEBRD, CGELQF, CGEQRF, CLACPY,
      $                   CLALSD, CLASCL, CLASET, CUNMBR,
-     $                   CUNMLQ, CUNMQR, SLABAD, SLASCL,
+     $                   CUNMLQ, CUNMQR, SLASCL,
      $                   SLASET, XERBLA
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      REAL               CLANGE, SLAMCH
-      EXTERNAL           CLANGE, SLAMCH, ILAENV
+      REAL               CLANGE, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           CLANGE, SLAMCH, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          INT, LOG, MAX, MIN, REAL
@@ -373,7 +367,7 @@
             END IF
          END IF
          MINWRK = MIN( MINWRK, MAXWRK )
-         WORK( 1 ) = MAXWRK
+         WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
          IWORK( 1 ) = LIWORK
          RWORK( 1 ) = LRWORK
 *
@@ -402,7 +396,6 @@
       SFMIN = SLAMCH( 'S' )
       SMLNUM = SFMIN / EPS
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
 *
 *     Scale A if max entry outside range [SMLNUM,BIGNUM].
 *
@@ -653,7 +646,7 @@
       END IF
 *
    10 CONTINUE
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
       IWORK( 1 ) = LIWORK
       RWORK( 1 ) = LRWORK
       RETURN
diff --git a/lapack-netlib/SRC/cgelss.f b/lapack-netlib/SRC/cgelss.f
index da6b9092f..00d7f596a 100644
--- a/lapack-netlib/SRC/cgelss.f
+++ b/lapack-netlib/SRC/cgelss.f
@@ -170,7 +170,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEsolve
+*> \ingroup gelss
 *
 *  =====================================================================
       SUBROUTINE CGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK,
@@ -214,13 +214,12 @@
 *     .. External Subroutines ..
       EXTERNAL           CBDSQR, CCOPY, CGEBRD, CGELQF, CGEMM, CGEMV,
      $                   CGEQRF, CLACPY, CLASCL, CLASET, CSRSCL, CUNGBR,
-     $                   CUNMBR, CUNMLQ, CUNMQR, SLABAD, SLASCL, SLASET,
-     $                   XERBLA
+     $                   CUNMBR, CUNMLQ, CUNMQR, SLASCL, SLASET, XERBLA
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      REAL               CLANGE, SLAMCH
-      EXTERNAL           ILAENV, CLANGE, SLAMCH
+      REAL               CLANGE, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           ILAENV, CLANGE, SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -362,7 +361,7 @@
             END IF
             MAXWRK = MAX( MINWRK, MAXWRK )
          END IF
-         WORK( 1 ) = MAXWRK
+         WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY )
      $      INFO = -12
@@ -388,7 +387,6 @@
       SFMIN = SLAMCH( 'S' )
       SMLNUM = SFMIN / EPS
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
 *
 *     Scale A if max element outside range [SMLNUM,BIGNUM]
 *
@@ -540,7 +538,7 @@
      $                     LDB, CZERO, WORK, N )
                CALL CLACPY( 'G', N, BL, WORK, N, B( 1, I ), LDB )
    20       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
             CALL CGEMV( 'C', N, N, CONE, A, LDA, B, 1, CZERO, WORK, 1 )
             CALL CCOPY( N, WORK, 1, B, 1 )
          END IF
@@ -645,7 +643,7 @@
                CALL CLACPY( 'G', M, BL, WORK( IWORK ), M, B( 1, I ),
      $                      LDB )
    40       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
             CALL CGEMV( 'C', M, M, CONE, WORK( IL ), LDWORK, B( 1, 1 ),
      $                  1, CZERO, WORK( IWORK ), 1 )
             CALL CCOPY( M, WORK( IWORK ), 1, B( 1, 1 ), 1 )
@@ -737,7 +735,7 @@
      $                     LDB, CZERO, WORK, N )
                CALL CLACPY( 'F', N, BL, WORK, N, B( 1, I ), LDB )
    60       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
             CALL CGEMV( 'C', M, N, CONE, A, LDA, B, 1, CZERO, WORK, 1 )
             CALL CCOPY( N, WORK, 1, B, 1 )
          END IF
@@ -760,7 +758,7 @@
          CALL CLASCL( 'G', 0, 0, BIGNUM, BNRM, N, NRHS, B, LDB, INFO )
       END IF
    70 CONTINUE
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
       RETURN
 *
 *     End of CGELSS
diff --git a/lapack-netlib/SRC/cgelst.f b/lapack-netlib/SRC/cgelst.f
index 7d8e44ddf..b69626934 100644
--- a/lapack-netlib/SRC/cgelst.f
+++ b/lapack-netlib/SRC/cgelst.f
@@ -176,7 +176,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEsolve
+*> \ingroup gelst
 *
 *> \par Contributors:
 *  ==================
@@ -224,15 +224,15 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, CLANGE
-      EXTERNAL           LSAME, ILAENV, SLAMCH, CLANGE
+      REAL               SLAMCH, CLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SLAMCH, CLANGE, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CGELQT, CGEQRT, CGEMLQT, CGEMQRT, SLABAD,
+      EXTERNAL           CGELQT, CGEQRT, CGEMLQT, CGEMQRT,
      $                   CLASCL, CLASET, CTRTRS, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          REAL, MAX, MIN
+      INTRINSIC          MAX, MIN
 *     ..
 *     .. Executable Statements ..
 *
@@ -270,7 +270,7 @@
 *
          MNNRHS = MAX( MN, NRHS )
          LWOPT = MAX( 1, (MN+MNNRHS)*NB )
-         WORK( 1 ) = REAL( LWOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWOPT )
 *
       END IF
 *
@@ -285,7 +285,7 @@
 *
       IF( MIN( M, N, NRHS ).EQ.0 ) THEN
          CALL CLASET( 'Full', MAX( M, N ), NRHS, CZERO, CZERO, B, LDB )
-         WORK( 1 ) = REAL( LWOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWOPT )
          RETURN
       END IF
 *
@@ -311,7 +311,6 @@
 *
       SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
 *
 *     Scale A, B if max element outside range [SMLNUM,BIGNUM]
 *
@@ -334,7 +333,7 @@
 *        Matrix all zero. Return zero solution.
 *
          CALL CLASET( 'Full', MAX( M, N ), NRHS, CZERO, CZERO, B, LDB )
-         WORK( 1 ) = REAL( LWOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWOPT )
          RETURN
       END IF
 *
@@ -524,7 +523,7 @@
      $                INFO )
       END IF
 *
-      WORK( 1 ) = REAL( LWOPT )
+      WORK( 1 ) = SROUNDUP_LWORK( LWOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cgelsy.f b/lapack-netlib/SRC/cgelsy.f
index 67140f191..64bb10023 100644
--- a/lapack-netlib/SRC/cgelsy.f
+++ b/lapack-netlib/SRC/cgelsy.f
@@ -116,6 +116,7 @@
 *>          B is COMPLEX array, dimension (LDB,NRHS)
 *>          On entry, the M-by-NRHS right hand side matrix B.
 *>          On exit, the N-by-NRHS solution matrix X.
+*>          If M = 0 or N = 0, B is not referenced.
 *> \endverbatim
 *>
 *> \param[in] LDB
@@ -148,6 +149,7 @@
 *>          The effective rank of A, i.e., the order of the submatrix
 *>          R11.  This is the same as the order of the submatrix T11
 *>          in the complete orthogonal factorization of A.
+*>          If NRHS = 0, RANK = 0 on output.
 *> \endverbatim
 *>
 *> \param[out] WORK
diff --git a/lapack-netlib/SRC/cgemlq.f b/lapack-netlib/SRC/cgemlq.f
index e0cf78bc0..e5b02b669 100644
--- a/lapack-netlib/SRC/cgemlq.f
+++ b/lapack-netlib/SRC/cgemlq.f
@@ -110,16 +110,17 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) COMPLEX array, dimension (MAX(1,LWORK))
+*>          (workspace) COMPLEX array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1, then a workspace query is assumed. The routine
 *>          only calculates the size of the WORK array, returns this
-*>          value as WORK(1), and no error message related to WORK 
+*>          value as WORK(1), and no error message related to WORK
 *>          is issued by XERBLA.
 *> \endverbatim
 *>
@@ -143,7 +144,7 @@
 *>
 *> \verbatim
 *>
-*> These details are particular for this LAPACK implementation. Users should not 
+*> These details are particular for this LAPACK implementation. Users should not
 *> take them for granted. These details may change in the future, and are not likely
 *> true for another LAPACK implementation. These details are relevant if one wants
 *> to try to understand the code. They are not part of the interface.
@@ -159,11 +160,13 @@
 *>  block sizes MB and NB returned by ILAENV, CGELQ will use either
 *>  CLASWLQ (if the matrix is wide-and-short) or CGELQT to compute
 *>  the LQ factorization.
-*>  This version of CGEMLQ will use either CLAMSWLQ or CGEMLQT to 
+*>  This version of CGEMLQ will use either CLAMSWLQ or CGEMLQT to
 *>  multiply matrix Q by another matrix.
 *>  Further Details in CLAMSWLQ or CGEMLQT.
 *> \endverbatim
 *>
+*> \ingroup gemlq
+*>
 *  =====================================================================
       SUBROUTINE CGEMLQ( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE,
      $                   C, LDC, WORK, LWORK, INFO )
@@ -185,11 +188,12 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LEFT, RIGHT, TRAN, NOTRAN, LQUERY
-      INTEGER            MB, NB, LW, NBLCKS, MN
+      INTEGER            MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLAMSWLQ, CGEMLQT, XERBLA
@@ -201,7 +205,7 @@
 *
 *     Test the input arguments
 *
-      LQUERY  = LWORK.EQ.-1
+      LQUERY  = ( LWORK.EQ.-1 )
       NOTRAN  = LSAME( TRANS, 'N' )
       TRAN    = LSAME( TRANS, 'C' )
       LEFT    = LSAME( SIDE, 'L' )
@@ -216,6 +220,13 @@
         LW = M * MB
         MN = N
       END IF
+*
+      MINMNK = MIN( M, N, K )
+      IF( MINMNK.EQ.0 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = MAX( 1, LW )
+      END IF
 *
       IF( ( NB.GT.K ) .AND. ( MN.GT.K ) ) THEN
         IF( MOD( MN - K, NB - K ) .EQ. 0 ) THEN
@@ -244,12 +255,12 @@
         INFO = -9
       ELSE IF( LDC.LT.MAX( 1, M ) ) THEN
         INFO = -11
-      ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN
+      ELSE IF( ( LWORK.LT.LWMIN ) .AND. ( .NOT.LQUERY ) ) THEN
         INFO = -13
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-        WORK( 1 ) = REAL( LW )
+        WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -261,7 +272,7 @@
 *
 *     Quick return if possible
 *
-      IF( MIN( M, N, K ).EQ.0 ) THEN
+      IF( MINMNK.EQ.0 ) THEN
         RETURN
       END IF
 *
@@ -274,7 +285,7 @@
      $                 MB, C, LDC, WORK, LWORK, INFO )
       END IF
 *
-      WORK( 1 ) = REAL( LW )
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cgemqr.f b/lapack-netlib/SRC/cgemqr.f
index ea9de146e..0b7dd9dd7 100644
--- a/lapack-netlib/SRC/cgemqr.f
+++ b/lapack-netlib/SRC/cgemqr.f
@@ -111,16 +111,17 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) COMPLEX array, dimension (MAX(1,LWORK))
+*>          (workspace) COMPLEX array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1, then a workspace query is assumed. The routine
 *>          only calculates the size of the WORK array, returns this
-*>          value as WORK(1), and no error message related to WORK 
+*>          value as WORK(1), and no error message related to WORK
 *>          is issued by XERBLA.
 *> \endverbatim
 *>
@@ -144,7 +145,7 @@
 *>
 *> \verbatim
 *>
-*> These details are particular for this LAPACK implementation. Users should not 
+*> These details are particular for this LAPACK implementation. Users should not
 *> take them for granted. These details may change in the future, and are not likely
 *> true for another LAPACK implementation. These details are relevant if one wants
 *> to try to understand the code. They are not part of the interface.
@@ -166,6 +167,8 @@
 *>
 *> \endverbatim
 *>
+*> \ingroup gemqr
+*>
 *  =====================================================================
       SUBROUTINE CGEMQR( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE,
      $                   C, LDC, WORK, LWORK, INFO )
@@ -187,11 +190,12 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LEFT, RIGHT, TRAN, NOTRAN, LQUERY
-      INTEGER            MB, NB, LW, NBLCKS, MN
+      INTEGER            MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CGEMQRT, CLAMTSQR, XERBLA
@@ -203,7 +207,7 @@
 *
 *     Test the input arguments
 *
-      LQUERY  = LWORK.EQ.-1
+      LQUERY  = ( LWORK.EQ.-1 )
       NOTRAN  = LSAME( TRANS, 'N' )
       TRAN    = LSAME( TRANS, 'C' )
       LEFT    = LSAME( SIDE, 'L' )
@@ -218,6 +222,13 @@
         LW = MB * NB
         MN = N
       END IF
+*
+      MINMNK = MIN( M, N, K )
+      IF( MINMNK.EQ.0 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = MAX( 1, LW )
+      END IF
 *
       IF( ( MB.GT.K ) .AND. ( MN.GT.K ) ) THEN
         IF( MOD( MN - K, MB - K ).EQ.0 ) THEN
@@ -251,7 +262,7 @@
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-        WORK( 1 ) = LW
+        WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -263,7 +274,7 @@
 *
 *     Quick return if possible
 *
-      IF( MIN( M, N, K ).EQ.0 ) THEN
+      IF( MINMNK.EQ.0 ) THEN
         RETURN
       END IF
 *
@@ -276,7 +287,7 @@
      $                 NB, C, LDC, WORK, LWORK, INFO )
       END IF
 *
-      WORK( 1 ) = LW
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cgeqlf.f b/lapack-netlib/SRC/cgeqlf.f
index d2c11c269..6c67344c5 100644
--- a/lapack-netlib/SRC/cgeqlf.f
+++ b/lapack-netlib/SRC/cgeqlf.f
@@ -88,7 +88,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= max(1,N).
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise.
 *>          For optimum performance LWORK >= N*NB, where NB is
 *>          the optimal blocksize.
 *>
@@ -113,7 +114,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEcomputational
+*> \ingroup geqlf
 *
 *> \par Further Details:
 *  =====================
@@ -162,7 +163,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -186,10 +188,11 @@
             NB = ILAENV( 1, 'CGEQLF', ' ', M, N, -1, -1 )
             LWKOPT = N*NB
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
-         IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN
-            INFO = -7
+         IF( .NOT.LQUERY ) THEN
+            IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) )
+     $         INFO = -7
          END IF
       END IF
 *
@@ -276,7 +279,7 @@
       IF( MU.GT.0 .AND. NU.GT.0 )
      $   CALL CGEQL2( MU, NU, A, LDA, TAU, WORK, IINFO )
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK( IWS )
       RETURN
 *
 *     End of CGEQLF
diff --git a/lapack-netlib/SRC/cgeqp3rk.c b/lapack-netlib/SRC/cgeqp3rk.c
new file mode 100644
index 000000000..54e7fb140
--- /dev/null
+++ b/lapack-netlib/SRC/cgeqp3rk.c
@@ -0,0 +1,1071 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) {
+	float acc = 1.0; unsigned long int bits;
+	if(n == 0) return acc;	/* x**0 is 1 */
+	if(n < 0) n = -n, x = 1/x;	/* negative exponent: raise the reciprocal instead */
+	bits = n;
+	do {	/* binary exponentiation, lowest exponent bit first */
+		if(bits & 01) acc *= x;
+		bits >>= 1;
+		if(bits) x *= x;	/* no squaring after the last bit */
+	} while(bits);
+	return acc;
+}
+static double dpow_ui(double x, integer n) {
+	double acc = 1.0; unsigned long int bits;
+	if(n == 0) return acc;	/* x**0 is 1 */
+	if(n < 0) n = -n, x = 1/x;	/* negative exponent: raise the reciprocal instead */
+	bits = n;
+	do {	/* binary exponentiation, lowest exponent bit first */
+		if(bits & 01) acc *= x;
+		bits >>= 1;
+		if(bits) x *= x;	/* no squaring after the last bit */
+	} while(bits);
+	return acc;
+}
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) {	/* MSVC variant: builds the result from separate real/imaginary fields */
+	complex pow={1.0,0.0}; unsigned long int u;
+		if(n != 0) {
+		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;	/* NOTE(review): inverts r and i separately, not a complex reciprocal - confirm */
+		for(u = n; ; ) {
+			if(u & 01) pow.r *= x.r, pow.i *= x.i;	/* NOTE(review): component-wise product, not a complex product - confirm */
+			if(u >>= 1) x.r *= x.r, x.i *= x.i;
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) {	/* x**n for integer n by repeated squaring */
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;	/* negative exponent: raise the reciprocal instead */
+		for(u = n; ; ) {	/* consume the exponent bits, lowest first */
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {	/* MSVC variant: works on the _Val[0]/_Val[1] parts separately */
+	_Dcomplex pow={1.0,0.0}; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];	/* NOTE(review): not a complex reciprocal - confirm */
+		for(u = n; ; ) {
+			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];	/* NOTE(review): component-wise, not a complex product - confirm */
+			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) {	/* x**n for integer n by repeated squaring */
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;	/* negative exponent: raise the reciprocal instead */
+		for(u = n; ; ) {	/* consume the exponent bits, lowest first */
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) {	/* integer power x**n; non-positive n is resolved by the special cases below */
+	integer pow; unsigned long int u;
+	if (n <= 0) {
+		if (n == 0 || x == 1) pow = 1;
+		else if (x != -1) pow = x == 0 ? 1/x : 0;	/* n < 0: result is 0 unless x == 0, where 1/x (integer division by zero) is evaluated */
+		else n = -n;	/* x == -1: flip the sign of n and fall through to the loop below */
+	}
+	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {	/* in practice entered only when n > 0 (including the negated x == -1 case) */
+		u = n;
+		for(pow = 1; ; ) {	/* repeated squaring over the bits of n, lowest first */
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{	/* 1-based position, counted from s, of the first largest entry among w[s-1..e-1]; n is not used */
+	double best = w[s-1]; integer pos = s, idx;
+	for(idx = s+1; idx <= e; idx++)
+		if (w[idx-1] > best) { best = w[idx-1]; pos = idx; }
+	return pos - s + 1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{	/* 1-based position, counted from s, of the first largest entry among w[s-1..e-1]; n is not used */
+	float best = w[s-1]; integer pos = s, idx;
+	for(idx = s+1; idx <= e; idx++)
+		if (w[idx-1] > best) { best = w[idx-1]; pos = idx; }
+	return pos - s + 1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {	/* *z = sum over i in [0,n) of conj(x[i*incx]) * y[i*incy] */
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};	/* NOTE(review): this branch multiplies real and imaginary parts separately - confirm against a true complex product */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;	/* running sum; the function's closing brace lives inside each preprocessor branch */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {	/* *z = sum over i in [0,n) of conj(x[i*incx]) * y[i*incy] */
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};	/* NOTE(review): this branch multiplies real and imaginary parts separately - confirm against a true complex product */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;	/* running sum; the function's closing brace lives inside each preprocessor branch */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif	
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {	/* *z = sum over i in [0,n) of x[i*incx] * y[i*incy], no conjugation */
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};	/* NOTE(review): this branch multiplies real and imaginary parts separately - confirm against a true complex product */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;	/* running sum; the function's closing brace lives inside each preprocessor branch */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {	/* *z = sum over i in [0,n) of x[i*incx] * y[i*incy], no conjugation */
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};	/* NOTE(review): this branch multiplies real and imaginary parts separately - confirm against a true complex product */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;	/* running sum; the function's closing brace lives inside each preprocessor branch */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static integer c__1 = 1;
+static integer c_n1 = -1;
+static integer c__3 = 3;
+static integer c__2 = 2;
+
+/* Subroutine */ int cgeqp3rk_(integer *m, integer *n, integer *nrhs, integer 
+	*kmax, real *abstol, real *reltol, complex *a, integer *lda, integer *
+	k, real *maxc2nrmk, real *relmaxc2nrmk, integer *jpiv, complex *tau, 
+	complex *work, integer *lwork, real *rwork, integer *iwork, integer *
+	info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2;
+    real r__1, r__2;
+    complex q__1;
+
+    /* Local variables */
+    extern /* Subroutine */ int claqp3rk_(integer *, integer *, integer *, 
+	    integer *, integer *, real *, real *, integer *, real *, complex *
+	    , integer *, logical *, integer *, real *, real *, integer *, 
+	    complex *, real *, real *, complex *, complex *, integer *, 
+	    integer *, integer *);
+    real maxc2nrm;
+    logical done;
+    integer jmax, j, jmaxc2nrm, jmaxb, nbmin, iinfo, n_sub__, minmn;
+    real myhugeval;
+    extern real scnrm2_(integer *, complex *, integer *);
+    integer jb, nb, kf, nx;
+    extern real slamch_(char *);
+    real safmin;
+    extern /* Subroutine */ int xerbla_(char *, integer *);
+    extern integer ilaenv_(integer *, char *, char *, integer *, integer *, 
+	    integer *, integer *, ftnlen, ftnlen), isamax_(integer *, real *, 
+	    integer *);
+    extern logical sisnan_(real *);
+    integer kp1, lwkopt;
+    logical lquery;
+    integer jbf;
+    real eps;
+    integer iws, ioffset;
+    extern /* Subroutine */ int claqp2rk_(integer *, integer *, integer *, 
+	    integer *, integer *, real *, real *, integer *, real *, complex *
+	    , integer *, integer *, real *, real *, integer *, complex *, 
+	    real *, real *, complex *, integer *);
+
+
+/*  -- LAPACK computational routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+/*  CGEQP3RK: truncated (or full) Householder QR factorization with column */
+/*  pivoting of the M-by-N matrix A; always returns 0, status is in *info. */
+/*  ===================================================================== */
+
+
+/*     Test input arguments */
+/*     ==================== */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --jpiv;
+    --tau;
+    --work;
+    --rwork;
+    --iwork;
+
+    /* Function Body */
+    *info = 0;
+    lquery = *lwork == -1;
+    if (*m < 0) {
+	*info = -1;
+    } else if (*n < 0) {
+	*info = -2;
+    } else if (*nrhs < 0) {
+	*info = -3;
+    } else if (*kmax < 0) {
+	*info = -4;
+    } else if (sisnan_(abstol)) {
+	*info = -5;
+    } else if (sisnan_(reltol)) {
+	*info = -6;
+    } else if (*lda < f2cmax(1,*m)) {
+	*info = -8;
+    }
+
+/*     If the input parameters M, N, NRHS, KMAX, LDA are valid: */
+/*       a) Test the input workspace size LWORK for the minimum */
+/*          size requirement IWS. */
+/*       b) Determine the optimal block size NB and optimal */
+/*          workspace size LWKOPT to be returned in WORK(1) */
+/*          in case of (1) LWORK < IWS, (2) LQUERY = .TRUE., */
+/*          (3) when routine exits. */
+/*     Here, IWS is the minimum workspace required for unblocked */
+/*     code. */
+
+    if (*info == 0) {
+	minmn = f2cmin(*m,*n);
+	if (minmn == 0) {
+	    iws = 1;
+	    lwkopt = 1;
+	} else {
+
+/*           Minimal workspace size in case of using only unblocked */
+/*           BLAS 2 code in CLAQP2RK. */
+/*           1) CLAQP2RK: N+NRHS-1 to use in WORK array that is used */
+/*              in CLARF subroutine inside CLAQP2RK to apply an */
+/*              elementary reflector from the left. */
+/*           TOTAL_WORK_SIZE = N + NRHS - 1 */
+
+	    iws = *n + *nrhs - 1;
+
+/*           Assign to NB optimal block size. */
+
+	    nb = ilaenv_(&c__1, "CGEQP3RK", " ", m, n, &c_n1, &c_n1, (ftnlen)
+		    8, (ftnlen)1);
+
+/*           A formula for the optimal workspace size in case of using */
+/*           both unblocked BLAS 2 in CLAQP2RK and blocked BLAS 3 code */
+/*           in CLAQP3RK. */
+/*           1) CGEQP3RK, CLAQP2RK, CLAQP3RK: 2*N to store full and */
+/*              partial column 2-norms. */
+/*           2) CLAQP2RK: N+NRHS-1 to use in WORK array that is used */
+/*              in CLARF subroutine to apply an elementary reflector */
+/*              from the left. */
+/*           3) CLAQP3RK: NB*(N+NRHS) to use in the work array F that */
+/*              is used to apply a block reflector from */
+/*              the left. */
+/*           4) CLAQP3RK: NB to use in the auxiliary array AUX. */
+/*           Sizes (2) and ((3) + (4)) should intersect, therefore */
+/*           TOTAL_WORK_SIZE = 2*N + NB*( N+NRHS+1 ), given NBMIN=2. */
+
+	    lwkopt = (*n << 1) + nb * (*n + *nrhs + 1);
+	}
+	q__1.r = (real) lwkopt, q__1.i = 0.f;
+	work[1].r = q__1.r, work[1].i = q__1.i;
+
+	if (*lwork < iws && ! lquery) {
+	    *info = -15;
+	}
+    }
+
+/*      NOTE: The optimal workspace size is returned in WORK(1), if */
+/*            the input parameters M, N, NRHS, KMAX, LDA are valid. */
+
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("CGEQP3RK", &i__1);
+	return 0;
+    } else if (lquery) {
+	return 0;
+    }
+
+/*     Quick return if possible for M=0 or N=0. */
+
+    if (minmn == 0) {
+	*k = 0;
+	*maxc2nrmk = 0.f;
+	*relmaxc2nrmk = 0.f;
+	q__1.r = (real) lwkopt, q__1.i = 0.f;
+	work[1].r = q__1.r, work[1].i = q__1.i;
+	return 0;
+    }
+
+/*     ================================================================== */
+
+/*     Initialize column pivot array JPIV. */
+
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+	jpiv[j] = j;
+    }
+
+/*     ================================================================== */
+
+/*     Initialize storage for partial and exact column 2-norms. */
+/*     a) The elements RWORK(1:N) are used to store partial column */
+/*        2-norms of the matrix A, and may decrease in each computation */
+/*        step; initialize to the values of complete columns 2-norms. */
+/*     b) The elements RWORK(N+1:2*N) are used to store complete column */
+/*        2-norms of the matrix A, they are not changed during the */
+/*        computation; initialize the values of complete columns 2-norms. */
+
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+	rwork[j] = scnrm2_(m, &a[j * a_dim1 + 1], &c__1);
+	rwork[*n + j] = rwork[j];
+    }
+
+/*     ================================================================== */
+
+/*     Compute the pivot column index and the maximum column 2-norm */
+/*     for the whole original matrix stored in A(1:M,1:N). */
+
+    kp1 = isamax_(n, &rwork[1], &c__1);
+    maxc2nrm = rwork[kp1];	/* largest column 2-norm of the original A; every test below reads it */
+/*     ==================================================================. */
+
+    if (sisnan_(&maxc2nrm)) {
+
+/*        Check if the matrix A contains NaN, set INFO parameter */
+/*        to the column number where the first NaN is found and return */
+/*        from the routine. */
+
+	*k = 0;
+	*info = kp1;
+
+/*        Set MAXC2NRMK and  RELMAXC2NRMK to NaN. */
+
+	*maxc2nrmk = maxc2nrm;
+	*relmaxc2nrmk = maxc2nrm;
+
+/*        Array TAU is not set and contains undefined elements. */
+
+	q__1.r = (real) lwkopt, q__1.i = 0.f;
+	work[1].r = q__1.r, work[1].i = q__1.i;
+	return 0;
+    }
+
+/*     =================================================================== */
+
+    if (maxc2nrm == 0.f) {
+
+/*        Check if the matrix A is a zero matrix, set array TAU and */
+/*        return from the routine. */
+
+	*k = 0;
+	*maxc2nrmk = 0.f;
+	*relmaxc2nrmk = 0.f;
+
+	i__1 = minmn;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = j;
+	    tau[i__2].r = 0.f, tau[i__2].i = 0.f;
+	}
+
+	q__1.r = (real) lwkopt, q__1.i = 0.f;
+	work[1].r = q__1.r, work[1].i = q__1.i;
+	return 0;
+
+    }
+
+/*     =================================================================== */
+
+    myhugeval = slamch_("Overflow");
+
+    if (maxc2nrm > myhugeval) {
+
+/*        Check if the matrix A contains +Inf or -Inf, set INFO parameter */
+/*        to the column number, where the first +/-Inf  is found plus N, */
+/*        and continue the computation. */
+
+	*info = *n + kp1;
+
+    }
+
+/*     ================================================================== */
+
+/*     Quick return if possible for the case when the first */
+/*     stopping criterion is satisfied, i.e. KMAX = 0. */
+
+    if (*kmax == 0) {
+	*k = 0;
+	*maxc2nrmk = maxc2nrm;
+	*relmaxc2nrmk = 1.f;
+	i__1 = minmn;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = j;
+	    tau[i__2].r = 0.f, tau[i__2].i = 0.f;
+	}
+	q__1.r = (real) lwkopt, q__1.i = 0.f;
+	work[1].r = q__1.r, work[1].i = q__1.i;
+	return 0;
+    }
+
+/*     ================================================================== */
+
+    eps = slamch_("Epsilon");
+
+/*     Adjust ABSTOL */
+
+    if (*abstol >= 0.f) {
+	safmin = slamch_("Safe minimum");
+/* Computing MAX */
+	r__1 = *abstol, r__2 = safmin * 2.f;
+	*abstol = f2cmax(r__1,r__2);
+    }
+
+/*     Adjust RELTOL */
+
+    if (*reltol >= 0.f) {
+	*reltol = f2cmax(*reltol,eps);
+    }
+
+/*     =================================================================== */
+
+/*     JMAX is the maximum index of the column to be factorized, */
+/*     which is also limited by the first stopping criterion KMAX. */
+
+    jmax = f2cmin(*kmax,minmn);
+
+/*     =================================================================== */
+
+/*     Quick return if possible for the case when the second or third */
+/*     stopping criterion for the whole original matrix is satisfied, */
+/*     i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL */
+/*     (which is ONE <= RELTOL). */
+
+    if (maxc2nrm <= *abstol || 1.f <= *reltol) {
+
+	*k = 0;
+	*maxc2nrmk = maxc2nrm;
+	*relmaxc2nrmk = 1.f;
+
+	i__1 = minmn;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = j;
+	    tau[i__2].r = 0.f, tau[i__2].i = 0.f;
+	}
+
+	q__1.r = (real) lwkopt, q__1.i = 0.f;
+	work[1].r = q__1.r, work[1].i = q__1.i;
+	return 0;
+    }
+
+/*     ================================================================== */
+/*     Factorize columns */
+/*     ================================================================== */
+
+/*     Determine the block size. */
+
+    nbmin = 2;
+    nx = 0;
+
+    if (nb > 1 && nb < minmn) {
+
+/*        Determine when to cross over from blocked to unblocked code. */
+/*        (for N less than NX, unblocked code should be used). */
+
+/* Computing MAX */
+	i__1 = 0, i__2 = ilaenv_(&c__3, "CGEQP3RK", " ", m, n, &c_n1, &c_n1, (
+		ftnlen)8, (ftnlen)1);
+	nx = f2cmax(i__1,i__2);
+
+	if (nx < minmn) {
+
+/*           Determine if workspace is large enough for blocked code. */
+
+	    if (*lwork < lwkopt) {
+
+/*              Not enough workspace to use optimal block size that */
+/*              is currently stored in NB. */
+/*              Reduce NB and determine the minimum value of NB. */
+
+		nb = (*lwork - (*n << 1)) / (*n + 1);
+/* Computing MAX */
+		i__1 = 2, i__2 = ilaenv_(&c__2, "CGEQP3RK", " ", m, n, &c_n1, 
+			&c_n1, (ftnlen)8, (ftnlen)1);
+		nbmin = f2cmax(i__1,i__2);
+/*              NOTE(review): divisor N+1 ignores NRHS used in LWKOPT; confirm. */
+	    }
+	}
+    }
+
+/*     ================================================================== */
+
+/*     DONE is the boolean flag to represent the case when the */
+/*     factorization completed in the block factorization routine, */
+/*     before the end of the block. */
+
+    done = FALSE_;
+
+/*     J is the column index. */
+
+    j = 1;
+
+/*     (1) Use blocked code initially. */
+
+/*     JMAXB is the maximum column index of the block, when the */
+/*     blocked code is used, is also limited by the first stopping */
+/*     criterion KMAX. */
+
+/* Computing MIN */
+    i__1 = *kmax, i__2 = minmn - nx;
+    jmaxb = f2cmin(i__1,i__2);
+
+    if (nb >= nbmin && nb < jmax && jmaxb > 0) {
+
+/*        Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here: */
+/*        J   is the column index of a column block; */
+/*        JB  is the column block size to pass to block factorization */
+/*            routine in a loop step; */
+/*        JBF is the number of columns that were actually factorized */
+/*            that was returned by the block factorization routine */
+/*            in a loop step, JBF <= JB; */
+/*        N_SUB is the number of columns in the submatrix; */
+/*        IOFFSET is the number of rows that should not be factorized. */
+
+	while(j <= jmaxb) {
+
+/* Computing MIN */
+	    i__1 = nb, i__2 = jmaxb - j + 1;
+	    jb = f2cmin(i__1,i__2);
+	    n_sub__ = *n - j + 1;
+	    ioffset = j - 1;
+
+/*           Factorize JB columns among the columns A(J:N). */
+
+	    i__1 = *n + *nrhs - j + 1;
+	    claqp3rk_(m, &n_sub__, nrhs, &ioffset, &jb, abstol, reltol, &kp1, 
+		    &maxc2nrm, &a[j * a_dim1 + 1], lda, &done, &jbf, 
+		    maxc2nrmk, relmaxc2nrmk, &jpiv[j], &tau[j], &rwork[j], &
+		    rwork[*n + j], &work[1], &work[jb + 1], &i__1, &iwork[1], 
+		    &iinfo);
+
+/*           Set INFO on the first occurrence of Inf. */
+
+	    if (iinfo > n_sub__ && *info == 0) {
+		*info = (ioffset << 1) + iinfo;
+	    }
+
+	    if (done) {
+
+/*              Either the submatrix is zero before the end of the */
+/*              column block, or ABSTOL or RELTOL criterion is */
+/*              satisfied before the end of the column block, we can */
+/*              return from the routine. Perform the following before */
+/*              returning: */
+/*                a) Set the number of factorized columns K, */
+/*                   K = IOFFSET + JBF from the last call of blocked */
+/*                   routine. */
+/*                NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned */
+/*                         by the block factorization routine; */
+/*                      2) The remaining TAUs are set to ZERO by the */
+/*                         block factorization routine. */
+
+		*k = ioffset + jbf;
+
+/*              Set INFO on the first occurrence of NaN, NaN takes */
+/*              precedence over Inf. */
+
+		if (iinfo <= n_sub__ && iinfo > 0) {
+		    *info = ioffset + iinfo;
+		}
+
+/*              Return from the routine. */
+
+		q__1.r = (real) lwkopt, q__1.i = 0.f;
+		work[1].r = q__1.r, work[1].i = q__1.i;
+
+		return 0;
+
+	    }
+
+	    j += jbf;
+
+	}
+
+    }
+
+/*     Use unblocked code to factor the last or only block. */
+/*     J = JMAX+1 means we factorized the maximum possible number of */
+/*     columns, that is in ELSE clause we need to compute */
+/*     the MAXC2NORM and RELMAXC2NORM to return after we processed */
+/*     the blocks. */
+
+    if (j <= jmax) {
+
+/*        N_SUB is the number of columns in the submatrix; */
+/*        IOFFSET is the number of rows that should not be factorized. */
+
+	n_sub__ = *n - j + 1;
+	ioffset = j - 1;
+
+	i__1 = jmax - j + 1;
+	claqp2rk_(m, &n_sub__, nrhs, &ioffset, &i__1, abstol, reltol, &kp1, &
+		maxc2nrm, &a[j * a_dim1 + 1], lda, &kf, maxc2nrmk, 
+		relmaxc2nrmk, &jpiv[j], &tau[j], &rwork[j], &rwork[*n + j], &
+		work[1], &iinfo);
+
+/*        ABSTOL or RELTOL criterion is satisfied when the number of */
+/*        the factorized columns KF is smaller than the number */
+/*        of columns JMAX-J+1 supplied to be factorized by the */
+/*        unblocked routine, we can return from */
+/*        the routine. Perform the following before returning: */
+/*           a) Set the number of factorized columns K, */
+/*           b) MAXC2NRMK and RELMAXC2NRMK are returned by the */
+/*              unblocked factorization routine above. */
+
+	*k = j - 1 + kf;
+
+/*        Set INFO on the first exception occurrence. */
+
+/*        Set INFO on the first exception occurrence of Inf or NaN, */
+/*        (NaN takes precedence over Inf). */
+
+	if (iinfo > n_sub__ && *info == 0) {
+	    *info = (ioffset << 1) + iinfo;
+	} else if (iinfo <= n_sub__ && iinfo > 0) {
+	    *info = ioffset + iinfo;
+	}
+
+    } else {
+
+/*        Compute the return values for blocked code. */
+
+/*        Set the number of factorized columns if the unblocked routine */
+/*        was not called. */
+
+	*k = jmax;
+
+/*        If there exists a residual matrix after the blocked code: */
+/*           1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the */
+/*              residual matrix, otherwise set them to ZERO; */
+/*           2) Set TAU(K+1:MINMN) to ZERO. */
+
+	if (*k < minmn) {
+	    i__1 = *n - *k;
+	    jmaxc2nrm = *k + isamax_(&i__1, &rwork[*k + 1], &c__1);
+	    *maxc2nrmk = rwork[jmaxc2nrm];
+	    if (*k == 0) {
+		*relmaxc2nrmk = 1.f;
+	    } else {
+		*relmaxc2nrmk = *maxc2nrmk / maxc2nrm;
+	    }
+
+	    i__1 = minmn;
+	    for (j = *k + 1; j <= i__1; ++j) {
+		i__2 = j;
+		tau[i__2].r = 0.f, tau[i__2].i = 0.f;
+	    }
+
+	} else {
+	    *maxc2nrmk = 0.f;
+	    *relmaxc2nrmk = 0.f;
+
+	}
+
+/*     END IF( J.LE.JMAX ) THEN */
+
+    }
+
+    q__1.r = (real) lwkopt, q__1.i = 0.f;
+    work[1].r = q__1.r, work[1].i = q__1.i;
+
+    return 0;
+
+/*     End of CGEQP3RK */
+
+} /* cgeqp3rk_ */
+
diff --git a/lapack-netlib/SRC/cgeqp3rk.f b/lapack-netlib/SRC/cgeqp3rk.f
new file mode 100644
index 000000000..731c44edb
--- /dev/null
+++ b/lapack-netlib/SRC/cgeqp3rk.f
@@ -0,0 +1,1093 @@
+*> \brief \b CGEQP3RK computes a truncated Householder QR factorization with column pivoting of a complex m-by-n matrix A by using Level 3 BLAS and overwrites m-by-nrhs matrix B with Q**H * B.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CGEQP3RK + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/cgeqp3rk.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/cgeqp3rk.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/cgeqp3rk.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA,
+*      $                     K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+*      $                     WORK, LWORK, RWORK, IWORK, INFO )
+*       IMPLICIT NONE
+*
+*      .. Scalar Arguments ..
+*       INTEGER            INFO, K, KMAX, LDA, LWORK, M, N, NRHS
+*       REAL               ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL
+*      ..
+*      .. Array Arguments ..
+*       INTEGER            IWORK( * ), JPIV( * )
+*       REAL               RWORK( * )
+*       COMPLEX            A( LDA, * ), TAU( * ), WORK( * )
+*      ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CGEQP3RK performs two tasks simultaneously:
+*>
+*> Task 1: The routine computes a truncated (rank K) or full rank
+*> Householder QR factorization with column pivoting of a complex
+*> M-by-N matrix A using Level 3 BLAS. K is the number of columns
+*> that were factorized, i.e. factorization rank of the
+*> factor R, K <= min(M,N).
+*>
+*>  A * P(K) = Q(K) * R(K)  =
+*>
+*>        = Q(K) * ( R11(K) R12(K) ) = Q(K) * (   R(K)_approx    )
+*>                 ( 0      R22(K) )          ( 0  R(K)_residual ),
+*>
+*> where:
+*>
+*>  P(K)            is an N-by-N permutation matrix;
+*>  Q(K)            is an M-by-M unitary matrix;
+*>  R(K)_approx   = ( R11(K), R12(K) ) is a rank K approximation of the
+*>                    full rank factor R with K-by-K upper-triangular
+*>                    R11(K) and K-by-(N-K) rectangular R12(K). The diagonal
+*>                    entries of R11(K) appear in non-increasing order
+*>                    of absolute value, and absolute values of all of
+*>                    them exceed the maximum column 2-norm of R22(K)
+*>                    up to roundoff error.
+*>  R(K)_residual = R22(K) is the residual of a rank K approximation
+*>                    of the full rank factor R. It is
+*>                    an (M-K)-by-(N-K) rectangular matrix;
+*>  0               is an (M-K)-by-K zero matrix.
+*>
+*> Task 2: At the same time, the routine overwrites a complex M-by-NRHS
+*> matrix B with  Q(K)**H * B  using Level 3 BLAS.
+*>
+*> =====================================================================
+*>
+*> The matrices A and B are stored on input in the array A as
+*> the left and right blocks A(1:M,1:N) and A(1:M, N+1:N+NRHS)
+*> respectively.
+*>
+*>                                  N     NRHS
+*>             array_A   =   M  [ mat_A, mat_B ]
+*>
+*> The truncation criteria (i.e. when to stop the factorization)
+*> can be any of the following:
+*>
+*>   1) The input parameter KMAX, the maximum number of columns
+*>      KMAX to factorize, i.e. the factorization rank is limited
+*>      to KMAX. If KMAX >= min(M,N), the criterion is not used.
+*>
+*>   2) The input parameter ABSTOL, the absolute tolerance for
+*>      the maximum column 2-norm of the residual matrix R22(K). This
+*>      means that the factorization stops if this norm is less than or
+*>      equal to ABSTOL. If ABSTOL < 0.0, the criterion is not used.
+*>
+*>   3) The input parameter RELTOL, the tolerance for the maximum
+*>      column 2-norm of the residual matrix R22(K) divided
+*>      by the maximum column 2-norm of the original matrix A, which
+*>      is equal to abs(R(1,1)). This means that the factorization stops
+*>      when the ratio of the maximum column 2-norm of R22(K) to
+*>      the maximum column 2-norm of A is less than or equal to RELTOL.
+*>      If RELTOL < 0.0, the criterion is not used.
+*>
+*>   4) In case neither stopping criterion ABSTOL nor RELTOL is used,
+*>      when the residual matrix R22(K) is a zero matrix at some
+*>      factorization step K. ( This stopping criterion is implicit. )
+*>
+*>  The algorithm stops when any of these conditions is first
+*>  satisfied, otherwise the whole matrix A is factorized.
+*>
+*>  To factorize the whole matrix A, use the values
+*>  KMAX >= min(M,N), ABSTOL < 0.0 and RELTOL < 0.0.
+*>
+*>  The routine returns:
+*>     a) Q(K), R(K)_approx = ( R11(K), R12(K) ),
+*>        R(K)_residual = R22(K), P(K), i.e. the resulting matrices
+*>        of the factorization; P(K) is represented by JPIV,
+*>        ( if K = min(M,N), R(K)_approx is the full factor R,
+*>        and there is no residual matrix R(K)_residual);
+*>     b) K, the number of columns that were factorized,
+*>        i.e. factorization rank;
+*>     c) MAXC2NRMK, the maximum column 2-norm of the residual
+*>        matrix R(K)_residual = R22(K),
+*>        ( if K = min(M,N), MAXC2NRMK = 0.0 );
+*>     d) RELMAXC2NRMK equals MAXC2NRMK divided by MAXC2NRM, the maximum
+*>        column 2-norm of the original matrix A, which is equal
+*>        to abs(R(1,1)), ( if K = min(M,N), RELMAXC2NRMK = 0.0 );
+*>     e) Q(K)**H * B, the matrix B with the unitary
+*>        transformation Q(K)**H applied on the left.
+*>
+*> The N-by-N permutation matrix P(K) is stored in a compact form in
+*> the integer array JPIV. For 1 <= j <= N, column j
+*> of the matrix A was interchanged with column JPIV(j).
+*>
+*> The M-by-M unitary matrix Q is represented as a product
+*> of elementary Householder reflectors
+*>
+*>     Q(K) = H(1) *  H(2) * . . . * H(K),
+*>
+*> where K is the number of columns that were factorized.
+*>
+*> Each H(j) has the form
+*>
+*>     H(j) = I - tau * v * v**H,
+*>
+*> where 1 <= j <= K and
+*>   I    is an M-by-M identity matrix,
+*>   tau  is a complex scalar,
+*>   v    is a complex vector with v(1:j-1) = 0 and v(j) = 1.
+*>
+*> v(j+1:M) is stored on exit in A(j+1:M,j) and tau in TAU(j).
+*>
+*> See the Further Details section for more information.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e. the number of
+*>          columns of the matrix B. NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in] KMAX
+*> \verbatim
+*>          KMAX is INTEGER
+*>
+*>          The first factorization stopping criterion. KMAX >= 0.
+*>
+*>          The maximum number of columns of the matrix A to factorize,
+*>          i.e. the maximum factorization rank.
+*>
+*>          a) If KMAX >= min(M,N), then this stopping criterion
+*>                is not used, the routine factorizes columns
+*>                depending on ABSTOL and RELTOL.
+*>
+*>          b) If KMAX = 0, then this stopping criterion is
+*>                satisfied on input and the routine exits immediately.
+*>                This means that the factorization is not performed,
+*>                the matrices A and B are not modified, and
+*>                the matrix A is itself the residual.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is REAL
+*>
+*>          The second factorization stopping criterion, cannot be NaN.
+*>
+*>          The absolute tolerance (stopping threshold) for
+*>          maximum column 2-norm of the residual matrix R22(K).
+*>          The algorithm converges (stops the factorization) when
+*>          the maximum column 2-norm of the residual matrix R22(K)
+*>          is less than or equal to ABSTOL. Let SAFMIN = SLAMCH('S').
+*>
+*>          a) If ABSTOL is NaN, then no computation is performed
+*>                and an error message ( INFO = -5 ) is issued
+*>                by XERBLA.
+*>
+*>          b) If ABSTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and RELTOL.
+*>                This includes the case ABSTOL = -Inf.
+*>
+*>          c) If 0.0 <= ABSTOL < 2*SAFMIN, then ABSTOL = 2*SAFMIN
+*>                is used. This includes the case ABSTOL = -0.0.
+*>
+*>          d) If 2*SAFMIN <= ABSTOL then the input value
+*>                of ABSTOL is used.
+*>
+*>          Let MAXC2NRM be the maximum column 2-norm of the
+*>          whole original matrix A.
+*>          If ABSTOL chosen above is >= MAXC2NRM, then this
+*>          stopping criterion is satisfied on input and routine exits
+*>          immediately after MAXC2NRM is computed. The routine
+*>          returns MAXC2NRM in MAXC2NRMK,
+*>          and 1.0 in RELMAXC2NRMK.
+*>          This includes the case ABSTOL = +Inf. This means that the
+*>          factorization is not performed, the matrices A and B are not
+*>          modified, and the matrix A is itself the residual.
+*> \endverbatim
+*>
+*> \param[in] RELTOL
+*> \verbatim
+*>          RELTOL is REAL
+*>
+*>          The third factorization stopping criterion, cannot be NaN.
+*>
+*>          The tolerance (stopping threshold) for the ratio
+*>          abs(R(K+1,K+1))/abs(R(1,1)) of the maximum column 2-norm of
+*>          the residual matrix R22(K) to the maximum column 2-norm of
+*>          the original matrix A. The algorithm converges (stops the
+*>          factorization), when abs(R(K+1,K+1))/abs(R(1,1)) is less
+*>          than or equal to RELTOL. Let EPS = SLAMCH('E').
+*>
+*>          a) If RELTOL is NaN, then no computation is performed
+*>                and an error message ( INFO = -6 ) is issued
+*>                by XERBLA.
+*>
+*>          b) If RELTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and ABSTOL.
+*>                This includes the case RELTOL = -Inf.
+*>
+*>          c) If 0.0 <= RELTOL < EPS, then RELTOL = EPS is used.
+*>                This includes the case RELTOL = -0.0.
+*>
+*>          d) If EPS <= RELTOL then the input value of RELTOL
+*>                is used.
+*>
+*>          Let MAXC2NRM be the maximum column 2-norm of the
+*>          whole original matrix A.
+*>          If RELTOL chosen above is >= 1.0, then this stopping
+*>          criterion is satisfied on input and routine exits
+*>          immediately after MAXC2NRM is computed.
+*>          The routine returns MAXC2NRM in MAXC2NRMK,
+*>          and 1.0 in RELMAXC2NRMK.
+*>          This includes the case RELTOL = +Inf. This means that the
+*>          factorization is not performed, the matrices A and B are not
+*>          modified, and the matrix A is itself the residual.
+*>
+*>          NOTE: We recommend that RELTOL satisfy
+*>                min( 10*max(M,N)*EPS, sqrt(EPS) ) <= RELTOL
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX array, dimension (LDA,N+NRHS)
+*>
+*>          On entry:
+*>
+*>          a) The subarray A(1:M,1:N) contains the M-by-N matrix A.
+*>          b) The subarray A(1:M,N+1:N+NRHS) contains the M-by-NRHS
+*>             matrix B.
+*>
+*>                                  N     NRHS
+*>              array_A   =   M  [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>
+*>          a) The subarray A(1:M,1:N) contains parts of the factors
+*>             of the matrix A:
+*>
+*>            1) If K = 0, A(1:M,1:N) contains the original matrix A.
+*>            2) If K > 0, A(1:M,1:N) contains parts of the
+*>            factors:
+*>
+*>              1. The elements below the diagonal of the subarray
+*>                 A(1:M,1:K) together with TAU(1:K) represent the
+*>                 unitary matrix Q(K) as a product of K Householder
+*>                 elementary reflectors.
+*>
+*>              2. The elements on and above the diagonal of
+*>                 the subarray A(1:K,1:N) contain K-by-N
+*>                 upper-trapezoidal matrix
+*>                 R(K)_approx = ( R11(K), R12(K) ).
+*>                 NOTE: If K=min(M,N), i.e. full rank factorization,
+*>                       then R(K)_approx is the full factor R which
+*>                       is upper-trapezoidal. If, in addition, M>=N,
+*>                       then R is upper-triangular.
+*>
+*>              3. The subarray A(K+1:M,K+1:N) contains (M-K)-by-(N-K)
+*>                 rectangular matrix R(K)_residual = R22(K).
+*>
+*>          b) If NRHS > 0, the subarray A(1:M,N+1:N+NRHS) contains
+*>             the M-by-NRHS product Q(K)**H * B.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*>          This is the leading dimension for both matrices, A and B.
+*> \endverbatim
+*>
+*> \param[out] K
+*> \verbatim
+*>          K is INTEGER
+*>          Factorization rank of the matrix A, i.e. the rank of
+*>          the factor R, which is the same as the number of non-zero
+*>          rows of the factor R. 0 <= K <= min(M,KMAX,N).
+*>
+*>          K also represents the number of non-zero Householder
+*>          vectors.
+*>
+*>          NOTE: If K = 0, a) the arrays A and B are not modified;
+*>                          b) the array TAU(1:min(M,N)) is set to ZERO,
+*>                             if the matrix A does not contain NaN,
+*>                             otherwise the elements TAU(1:min(M,N))
+*>                             are undefined;
+*>                          c) the elements of the array JPIV are set
+*>                             as follows: for j = 1:N, JPIV(j) = j.
+*> \endverbatim
+*>
+*> \param[out] MAXC2NRMK
+*> \verbatim
+*>          MAXC2NRMK is REAL
+*>          The maximum column 2-norm of the residual matrix R22(K),
+*>          when the factorization stopped at rank K. MAXC2NRMK >= 0.
+*>
+*>          a) If K = 0, i.e. the factorization was not performed,
+*>             the matrix A was not modified and is itself a residual
+*>             matrix, then MAXC2NRMK equals the maximum column 2-norm
+*>             of the original matrix A.
+*>
+*>          b) If 0 < K < min(M,N), then MAXC2NRMK is returned.
+*>
+*>          c) If K = min(M,N), i.e. the whole matrix A was
+*>             factorized and there is no residual matrix,
+*>             then MAXC2NRMK = 0.0.
+*>
+*>          NOTE: MAXC2NRMK in the factorization step K would equal
+*>                R(K+1,K+1) in the next factorization step K+1.
+*> \endverbatim
+*>
+*> \param[out] RELMAXC2NRMK
+*> \verbatim
+*>          RELMAXC2NRMK is REAL
+*>          The ratio MAXC2NRMK / MAXC2NRM of the maximum column
+*>          2-norm of the residual matrix R22(K) (when the factorization
+*>          stopped at rank K) to the maximum column 2-norm of the
+*>          whole original matrix A. RELMAXC2NRMK >= 0.
+*>
+*>          a) If K = 0, i.e. the factorization was not performed,
+*>             the matrix A was not modified and is itself a residual
+*>             matrix, then RELMAXC2NRMK = 1.0.
+*>
+*>          b) If 0 < K < min(M,N), then
+*>                RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM is returned.
+*>
+*>          c) If K = min(M,N), i.e. the whole matrix A was
+*>             factorized and there is no residual matrix,
+*>             then RELMAXC2NRMK = 0.0.
+*>
+*>         NOTE: RELMAXC2NRMK in the factorization step K would equal
+*>               abs(R(K+1,K+1))/abs(R(1,1)) in the next factorization
+*>               step K+1.
+*> \endverbatim
+*>
+*> \param[out] JPIV
+*> \verbatim
+*>          JPIV is INTEGER array, dimension (N)
+*>          Column pivot indices. For 1 <= j <= N, column j
+*>          of the matrix A was interchanged with column JPIV(j).
+*>
+*>          The elements of the array JPIV(1:N) are always set
+*>          by the routine, for example, even  when no columns
+*>          were factorized, i.e. when K = 0, the elements are
+*>          set as JPIV(j) = j for j = 1:N.
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is COMPLEX array, dimension (min(M,N))
+*>          The scalar factors of the elementary reflectors.
+*>
+*>          If 0 < K <= min(M,N), only the elements TAU(1:K) of
+*>          the array TAU are modified by the factorization.
+*>          After the factorization is computed, if no NaN was found
+*>          during the factorization, the remaining elements
+*>          TAU(K+1:min(M,N)) are set to zero, otherwise the
+*>          elements TAU(K+1:min(M,N)) are not set and therefore
+*>          undefined.
+*>          ( If K = 0, all elements of TAU are set to zero, if
+*>          the matrix A does not contain NaN. )
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and
+*>          LWORK >= N+NRHS-1, otherwise.
+*>          For optimal performance LWORK >= NB*( N+NRHS+1 ),
+*>          where NB is the optimal block size for CGEQP3RK returned
+*>          by ILAENV. Minimal block size MINNB=2.
+*>
+*>          NOTE: The decision, whether to use unblocked BLAS 2
+*>          or blocked BLAS 3 code is based not only on the dimension
+*>          LWORK of the available workspace WORK, but also on the
+*>          matrix A dimension N via crossover point NX returned
+*>          by ILAENV. (For N less than NX, unblocked code should be
+*>          used.)
+*>
+*>          If LWORK = -1, then a workspace query is assumed;
+*>          the routine only calculates the optimal size of the WORK
+*>          array, returns this value as the first entry of the WORK
+*>          array, and no error message related to LWORK is issued
+*>          by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is REAL array, dimension (2*N)
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (N-1).
+*>          Is a work array. ( IWORK is used to store indices
+*>          of "bad" columns for norm downdating in the residual
+*>          matrix in the blocked step auxiliary subroutine CLAQP3RK ).
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          1) INFO = 0: successful exit.
+*>          2) INFO < 0: if INFO = -i, the i-th argument had an
+*>                       illegal value.
+*>          3) If INFO = j_1, where 1 <= j_1 <= N, then NaN was
+*>             detected and the routine stops the computation.
+*>             The j_1-th column of the matrix A or the j_1-th
+*>             element of array TAU contains the first occurrence
+*>             of NaN in the factorization step K+1 ( when K columns
+*>             have been factorized ).
+*>
+*>             On exit:
+*>             K                  is set to the number of
+*>                                   factorized columns without
+*>                                   exception.
+*>             MAXC2NRMK          is set to NaN.
+*>             RELMAXC2NRMK       is set to NaN.
+*>             TAU(K+1:min(M,N))  is not set and contains undefined
+*>                                   elements. If j_1=K+1, TAU(K+1)
+*>                                   may contain NaN.
+*>          4) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN
+*>             was detected, but +Inf (or -Inf) was detected and
+*>             the routine continues the computation until completion.
+*>             The (j_2-N)-th column of the matrix A contains the first
+*>             occurrence of +Inf (or -Inf) in the factorization
+*>             step K+1 ( when K columns have been factorized ).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup geqp3rk
+*
+*> \par Further Details:
+*  =====================
+*
+*> \verbatim
+*> CGEQP3RK is based on the same BLAS3 Householder QR factorization
+*> algorithm with column pivoting as in CGEQP3 routine which uses
+*> CLARFG routine to generate Householder reflectors
+*> for QR factorization.
+*>
+*> We can also write:
+*>
+*>   A = A_approx(K) + A_residual(K)
+*>
+*> The low rank approximation matrix A(K)_approx from
+*> the truncated QR factorization of rank K of the matrix A is:
+*>
+*>   A(K)_approx = Q(K) * ( R(K)_approx ) * P(K)**T
+*>                        (     0     0 )
+*>
+*>               = Q(K) * ( R11(K) R12(K) ) * P(K)**T
+*>                        (      0      0 )
+*>
+*> The residual A_residual(K) of the matrix A is:
+*>
+*>   A_residual(K) = Q(K) * ( 0              0 ) * P(K)**T =
+*>                          ( 0  R(K)_residual )
+*>
+*>                 = Q(K) * ( 0        0 ) * P(K)**T
+*>                          ( 0   R22(K) )
+*>
+*> The truncated (rank K) factorization guarantees that
+*> the maximum column 2-norm of A_residual(K) is less than
+*> or equal to MAXC2NRMK up to roundoff error.
+*>
+*> NOTE: An approximation of the null vectors
+*>       of A can be easily computed from R11(K)
+*>       and R12(K):
+*>
+*>       Null( A(K) )_approx = P * ( inv(R11(K)) * R12(K) )
+*>                                 (         -I           )
+*>
+*> \endverbatim
+*
+*> \par References:
+*  ================
+*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996.
+*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain.
+*> X. Sun, Computer Science Dept., Duke University, USA.
+*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA.
+*> A BLAS-3 version of the QR factorization with column pivoting.
+*> LAPACK Working Note 114
+*> \htmlonly
+*> <a href="https://www.netlib.org/lapack/lawnspdf/lawn114.pdf">https://www.netlib.org/lapack/lawnspdf/lawn114.pdf</a>
+*> \endhtmlonly
+*> and in
+*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998.
+*> \htmlonly
+*> <a href="https://doi.org/10.1137/S1064827595296732">https://doi.org/10.1137/S1064827595296732</a>
+*> \endhtmlonly
+*>
+*> [2] A partial column norm updating strategy developed in 2006.
+*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia.
+*> On the failure of rank revealing QR factorization software – a case study.
+*> LAPACK Working Note 176.
+*> \htmlonly
+*> <a href="http://www.netlib.org/lapack/lawnspdf/lawn176.pdf">http://www.netlib.org/lapack/lawnspdf/lawn176.pdf</a>
+*> \endhtmlonly
+*> and in
+*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages.
+*> \htmlonly
+*> <a href="https://doi.org/10.1145/1377612.1377616">https://doi.org/10.1145/1377612.1377616</a>
+*> \endhtmlonly
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*>  November  2023, Igor Kozachenko, James Demmel,
+*>                  EECS Department,
+*>                  University of California, Berkeley, USA.
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE CGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA,
+     $                     K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+     $                     WORK, LWORK, RWORK, IWORK, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO, K, KF, KMAX, LDA, LWORK, M, N, NRHS
+      REAL               ABSTOL,  MAXC2NRMK, RELMAXC2NRMK, RELTOL
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IWORK( * ), JPIV( * )
+      REAL               RWORK( * )
+      COMPLEX            A( LDA, * ), TAU( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            INB, INBMIN, IXOVER
+      PARAMETER          ( INB = 1, INBMIN = 2, IXOVER = 3 )
+      REAL               ZERO, ONE, TWO
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0, TWO = 2.0E+0 )
+      COMPLEX            CZERO
+      PARAMETER          ( CZERO = ( 0.0E+0, 0.0E+0 ) )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LQUERY, DONE
+      INTEGER            IINFO, IOFFSET, IWS, J, JB, JBF, JMAXB, JMAX,
+     $                   JMAXC2NRM, KP1, LWKOPT, MINMN, N_SUB, NB,
+     $                   NBMIN, NX
+      REAL               EPS, HUGEVAL, MAXC2NRM, SAFMIN
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           CLAQP2RK, CLAQP3RK, XERBLA
+*     ..
+*     .. External Functions ..
+      LOGICAL            SISNAN
+      INTEGER            ISAMAX, ILAENV
+      REAL               SLAMCH, SCNRM2, SROUNDUP_LWORK
+      EXTERNAL           SISNAN, SLAMCH, SCNRM2, ISAMAX, ILAENV,
+     $                   SROUNDUP_LWORK
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          CMPLX, MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test input arguments
+*     ====================
+*
+      INFO = 0
+      LQUERY = ( LWORK.EQ.-1 )
+      IF( M.LT.0 ) THEN
+         INFO = -1
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -2
+      ELSE IF( NRHS.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( KMAX.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( SISNAN( ABSTOL ) ) THEN
+         INFO = -5
+      ELSE IF( SISNAN( RELTOL ) ) THEN
+         INFO = -6
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -8
+      END IF
+*
+*     If the input parameters M, N, NRHS, KMAX, LDA are valid:
+*       a) Test the input workspace size LWORK for the minimum
+*          size requirement IWS.
+*       b) Determine the optimal block size NB and optimal
+*          workspace size LWKOPT to be returned in WORK(1)
+*          in case of (1) LWORK < IWS, (2) LQUERY = .TRUE.,
+*          (3) when routine exits.
+*     Here, IWS is the minimum workspace required for unblocked
+*     code.
+*
+      IF( INFO.EQ.0 ) THEN
+         MINMN = MIN( M, N )
+         IF( MINMN.EQ.0 ) THEN
+            IWS = 1
+            LWKOPT = 1
+         ELSE
+*
+*           Minimal workspace size in case of using only unblocked
+*           BLAS 2 code in CLAQP2RK.
+*           1) CLAQP2RK: N+NRHS-1 to use in WORK array that is used
+*              in CLARF subroutine inside CLAQP2RK to apply an
+*              elementary reflector from the left.
+*           TOTAL_WORK_SIZE = N + NRHS - 1
+*
+            IWS = N + NRHS - 1
+*
+*           Assign to NB optimal block size.
+*
+            NB = ILAENV( INB, 'CGEQP3RK', ' ', M, N, -1, -1 )
+*
+*           A formula for the optimal workspace size in case of using
+*           both unblocked BLAS 2 in CLAQP2RK and blocked BLAS 3 code
+*           in CLAQP3RK.
+*           1) CGEQP3RK, CLAQP2RK, CLAQP3RK: 2*N to store full and
+*              partial column 2-norms.
+*           2) CLAQP2RK: N+NRHS-1 to use in WORK array that is used
+*              in CLARF subroutine to apply an elementary reflector
+*              from the left.
+*           3) CLAQP3RK: NB*(N+NRHS) to use in the work array F that
+*              is used to apply a block reflector from
+*              the left.
+*           4) CLAQP3RK: NB to use in the auxiliary array AUX.
+*           Sizes (2) and ((3) + (4)) should intersect, therefore
+*           TOTAL_WORK_SIZE = 2*N + NB*( N+NRHS+1 ), given NBMIN=2.
+*
+            LWKOPT = 2*N + NB*( N+NRHS+1 )
+         END IF
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+*
+         IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN
+            INFO = -15
+         END IF
+      END IF
+*
+*      NOTE: The optimal workspace size is returned in WORK(1), if
+*            the input parameters M, N, NRHS, KMAX, LDA are valid.
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'CGEQP3RK', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible for M=0 or N=0.
+*
+      IF( MINMN.EQ.0 ) THEN
+         K = 0
+         MAXC2NRMK = ZERO
+         RELMAXC2NRMK = ZERO
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*
+*     Initialize column pivot array JPIV.
+*
+      DO J = 1, N
+         JPIV( J ) = J
+      END DO
+*
+*     ==================================================================
+*
+*     Initialize storage for partial and exact column 2-norms.
+*     a) The elements RWORK(1:N) are used to store partial column
+*        2-norms of the matrix A, and may decrease in each computation
+*        step; initialize to the values of complete columns 2-norms.
+*     b) The elements RWORK(N+1:2*N) are used to store complete column
+*        2-norms of the matrix A, they are not changed during the
+*        computation; initialize the values of complete columns 2-norms.
+*
+      DO J = 1, N
+         RWORK( J ) = SCNRM2( M, A( 1, J ), 1 )
+         RWORK( N+J ) = RWORK( J )
+      END DO
+*
+*     ==================================================================
+*
+*     Compute the pivot column index and the maximum column 2-norm
+*     for the whole original matrix stored in A(1:M,1:N).
+*
+      KP1 = ISAMAX( N, RWORK( 1 ), 1 )
+*
+*     ==================================================================.
+*
+      IF( SISNAN( MAXC2NRM ) ) THEN
+*
+*        Check if the matrix A contains NaN, set INFO parameter
+*        to the column number where the first NaN is found and return
+*        from the routine.
+*
+         K = 0
+         INFO = KP1
+*
+*        Set MAXC2NRMK and  RELMAXC2NRMK to NaN.
+*
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = MAXC2NRM
+*
+*        Array TAU is not set and contains undefined elements.
+*
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+         RETURN
+      END IF
+*
+*     ===================================================================
+*
+      IF( MAXC2NRM.EQ.ZERO ) THEN
+*
+*        Check if the matrix A is a zero matrix, set array TAU and
+*        return from the routine.
+*
+         K = 0
+         MAXC2NRMK = ZERO
+         RELMAXC2NRMK = ZERO
+*
+         DO J = 1, MINMN
+            TAU( J ) = CZERO
+         END DO
+*
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+         RETURN
+*
+      END IF
+*
+*     ===================================================================
+*
+      HUGEVAL = SLAMCH( 'Overflow' )
+*
+      IF( MAXC2NRM.GT.HUGEVAL ) THEN
+*
+*        Check if the matrix A contains +Inf or -Inf, set INFO parameter
+*        to the column number, where the first +/-Inf  is found plus N,
+*        and continue the computation.
+*
+         INFO = N + KP1
+*
+      END IF
+*
+*     ==================================================================
+*
+*     Quick return if possible for the case when the first
+*     stopping criterion is satisfied, i.e. KMAX = 0.
+*
+      IF( KMAX.EQ.0 ) THEN
+         K = 0
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = ONE
+         DO J = 1, MINMN
+            TAU( J ) = CZERO
+         END DO
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*
+      EPS = SLAMCH('Epsilon')
+*
+*     Adjust ABSTOL
+*
+      IF( ABSTOL.GE.ZERO ) THEN
+         SAFMIN = SLAMCH('Safe minimum')
+         ABSTOL = MAX( ABSTOL, TWO*SAFMIN )
+      END IF
+*
+*     Adjust RELTOL
+*
+      IF( RELTOL.GE.ZERO ) THEN
+         RELTOL = MAX( RELTOL, EPS )
+      END IF
+*
+*     ===================================================================
+*
+*     JMAX is the maximum index of the column to be factorized,
+*     which is also limited by the first stopping criterion KMAX.
+*
+      JMAX = MIN( KMAX, MINMN )
+*
+*     ===================================================================
+*
+*     Quick return if possible for the case when the second or third
+*     stopping criterion for the whole original matrix is satisfied,
+*     i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL
+*     (which is ONE <= RELTOL).
+*
+      IF( MAXC2NRM.LE.ABSTOL .OR. ONE.LE.RELTOL ) THEN
+*
+         K = 0
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = ONE
+*
+         DO J = 1, MINMN
+            TAU( J ) = CZERO
+         END DO
+*
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*     Factorize columns
+*     ==================================================================
+*
+*     Determine the block size.
+*
+      NBMIN = 2
+      NX = 0
+*
+      IF( ( NB.GT.1 ) .AND. ( NB.LT.MINMN ) ) THEN
+*
+*        Determine when to cross over from blocked to unblocked code.
+*        (for N less than NX, unblocked code should be used).
+*
+         NX = MAX( 0, ILAENV( IXOVER, 'CGEQP3RK', ' ', M, N, -1, -1 ) )
+*
+         IF( NX.LT.MINMN ) THEN
+*
+*           Determine if workspace is large enough for blocked code.
+*
+            IF( LWORK.LT.LWKOPT ) THEN
+*
+*              Not enough workspace to use optimal block size that
+*              is currently stored in NB.
+*              Reduce NB and determine the minimum value of NB.
+*
+               NB = ( LWORK-2*N ) / ( N+1 )
+               NBMIN = MAX( 2, ILAENV( INBMIN, 'CGEQP3RK', ' ', M, N,
+     $                 -1, -1 ) )
+*
+            END IF
+         END IF
+      END IF
+*
+*     ==================================================================
+*
+*     DONE is the boolean flag to represent the case when the
+*     factorization completed in the block factorization routine,
+*     before the end of the block.
+*
+      DONE = .FALSE.
+*
+*     J is the column index.
+*
+      J = 1
+*
+*     (1) Use blocked code initially.
+*
+*     JMAXB is the maximum column index of the block when the
+*     blocked code is used; it is also limited by the first
+*     stopping criterion KMAX.
+*
+      JMAXB = MIN( KMAX, MINMN - NX )
+*
+      IF( NB.GE.NBMIN .AND. NB.LT.JMAX .AND. JMAXB.GT.0 ) THEN
+*
+*        Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here:
+*        J   is the column index of a column block;
+*        JB  is the column block size to pass to block factorization
+*            routine in a loop step;
+*        JBF is the number of columns that were actually factorized
+*            that was returned by the block factorization routine
+*            in a loop step, JBF <= JB;
+*        N_SUB is the number of columns in the submatrix;
+*        IOFFSET is the number of rows that should not be factorized.
+*
+         DO WHILE( J.LE.JMAXB )
+*
+            JB = MIN( NB, JMAXB-J+1 )
+            N_SUB = N-J+1
+            IOFFSET = J-1
+*
+*           Factorize JB columns among the columns A(J:N).
+*
+            CALL CLAQP3RK( M, N_SUB, NRHS, IOFFSET, JB, ABSTOL,
+     $                     RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA,
+     $                     DONE, JBF, MAXC2NRMK, RELMAXC2NRMK,
+     $                     JPIV( J ), TAU( J ),
+     $                     RWORK( J ), RWORK( N+J ),
+     $                     WORK( 1 ), WORK( JB+1 ),
+     $                     N+NRHS-J+1, IWORK, IINFO )
+*
+*           Set INFO on the first occurrence of Inf.
+*
+            IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN
+               INFO = 2*IOFFSET + IINFO
+            END IF
+*
+            IF( DONE ) THEN
+*
+*              Either the submatrix is zero before the end of the
+*              column block, or ABSTOL or RELTOL criterion is
+*              satisfied before the end of the column block, we can
+*              return from the routine. Perform the following before
+*              returning:
+*                a) Set the number of factorized columns K,
+*                   K = IOFFSET + JBF from the last call of blocked
+*                   routine.
+*                NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned
+*                         by the block factorization routine;
+*                      2) The remaining TAUs are set to ZERO by the
+*                         block factorization routine.
+*
+               K = IOFFSET + JBF
+*
+*              Set INFO on the first occurrence of NaN, NaN takes
+*              precedence over Inf.
+*
+               IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN
+                  INFO = IOFFSET + IINFO
+               END IF
+*
+*              Return from the routine.
+*
+               WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+*
+               RETURN
+*
+            END IF
+*
+            J = J + JBF
+*
+         END DO
+*
+      END IF
+*
+*     Use unblocked code to factor the last or only block.
+*     J = JMAX+1 means that we factorized the maximum possible
+*     number of columns; in that case, in the ELSE clause, we need
+*     to compute MAXC2NRMK and RELMAXC2NRMK to return after we
+*     processed the blocks.
+*
+      IF( J.LE.JMAX ) THEN
+*
+*        N_SUB is the number of columns in the submatrix;
+*        IOFFSET is the number of rows that should not be factorized.
+*
+         N_SUB = N-J+1
+         IOFFSET = J-1
+*
+         CALL CLAQP2RK( M, N_SUB, NRHS, IOFFSET, JMAX-J+1,
+     $                  ABSTOL, RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA,
+     $                  KF, MAXC2NRMK, RELMAXC2NRMK, JPIV( J ),
+     $                  TAU( J ), RWORK( J ), RWORK( N+J ),
+     $                  WORK( 1 ), IINFO )
+*
+*        ABSTOL or RELTOL criterion is satisfied when the number of
+*        the factorized columns KF is smaller than the number
+*        of columns JMAX-J+1 supplied to be factorized by the
+*        unblocked routine, we can return from
+*        the routine. Perform the following before returning:
+*           a) Set the number of factorized columns K,
+*           b) MAXC2NRMK and RELMAXC2NRMK are returned by the
+*              unblocked factorization routine above.
+*
+         K = J - 1 + KF
+*
+*        Set INFO on the first exception occurrence.
+*
+*        Set INFO on the first exception occurrence of Inf or NaN,
+*        (NaN takes precedence over Inf).
+*
+         IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN
+            INFO = 2*IOFFSET + IINFO
+         ELSE IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN
+            INFO = IOFFSET + IINFO
+         END IF
+*
+      ELSE
+*
+*        Compute the return values for blocked code.
+*
+*        Set the number of factorized columns if the unblocked routine
+*        was not called.
+*
+            K = JMAX
+*
+*        If there exists a residual matrix after the blocked code:
+*           1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the
+*              residual matrix, otherwise set them to ZERO;
+*           2) Set TAU(K+1:MINMN) to ZERO.
+*
+         IF( K.LT.MINMN ) THEN
+            JMAXC2NRM = K + ISAMAX( N-K, RWORK( K+1 ), 1 )
+            MAXC2NRMK = RWORK( JMAXC2NRM )
+            IF( K.EQ.0 ) THEN
+               RELMAXC2NRMK = ONE
+            ELSE
+               RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM
+            END IF
+*
+            DO J = K + 1, MINMN
+               TAU( J ) = CZERO
+            END DO
+*
+         ELSE
+            MAXC2NRMK = ZERO
+            RELMAXC2NRMK = ZERO
+*
+         END IF
+*
+*     END IF( J.LE.JMAX ) THEN
+*
+      END IF
+*
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+*
+      RETURN
+*
+*     End of CGEQP3RK
+*
+      END
diff --git a/lapack-netlib/SRC/cgeqr.f b/lapack-netlib/SRC/cgeqr.f
index d10e3da65..3617594d0 100644
--- a/lapack-netlib/SRC/cgeqr.f
+++ b/lapack-netlib/SRC/cgeqr.f
@@ -99,7 +99,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1 or -2, then a workspace query is assumed. The routine
 *>          only calculates the sizes of the T and WORK arrays, returns these
 *>          values as the first entries of the T and WORK arrays, and no error
@@ -168,6 +168,8 @@
 *>
 *> \endverbatim
 *>
+*> \ingroup geqr
+*>
 *  =====================================================================
       SUBROUTINE CGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK,
      $                  INFO )
@@ -188,11 +190,12 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY, LMINWS, MINT, MINW
-      INTEGER            MB, NB, MINTSZ, NBLCKS
+      INTEGER            MB, NB, MINTSZ, NBLCKS, LWMIN, LWREQ
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLATSQR, CGEQRT, XERBLA
@@ -244,8 +247,10 @@
 *
 *     Determine if the workspace size satisfies minimal size
 *
+      LWMIN = MAX( 1, N )
+      LWREQ = MAX( 1, N*NB )
       LMINWS = .FALSE.
-      IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.NB*N )
+      IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.LWREQ )
      $    .AND. ( LWORK.GE.N ) .AND. ( TSIZE.GE.MINTSZ )
      $    .AND. ( .NOT.LQUERY ) ) THEN
         IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) ) THEN
@@ -253,7 +258,7 @@
           NB = 1
           MB = M
         END IF
-        IF( LWORK.LT.NB*N ) THEN
+        IF( LWORK.LT.LWREQ ) THEN
           LMINWS = .TRUE.
           NB = 1
         END IF
@@ -268,7 +273,7 @@
       ELSE IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 )
      $   .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN
         INFO = -6
-      ELSE IF( ( LWORK.LT.MAX( 1, N*NB ) ) .AND. ( .NOT.LQUERY )
+      ELSE IF( ( LWORK.LT.LWREQ ) .AND. ( .NOT.LQUERY )
      $   .AND. ( .NOT.LMINWS ) ) THEN
         INFO = -8
       END IF
@@ -282,9 +287,9 @@
         T( 2 ) = MB
         T( 3 ) = NB
         IF( MINW ) THEN
-          WORK( 1 ) = MAX( 1, N )
+          WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
         ELSE
-          WORK( 1 ) = MAX( 1, NB*N )
+          WORK( 1 ) = SROUNDUP_LWORK( LWREQ )
         END IF
       END IF
       IF( INFO.NE.0 ) THEN
@@ -309,7 +314,7 @@
      $                LWORK, INFO )
       END IF
 *
-      WORK( 1 ) = MAX( 1, NB*N )
+      WORK( 1 ) = SROUNDUP_LWORK( LWREQ )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cgeqrf.f b/lapack-netlib/SRC/cgeqrf.f
index d71bd5b33..bf22a2cd3 100644
--- a/lapack-netlib/SRC/cgeqrf.f
+++ b/lapack-netlib/SRC/cgeqrf.f
@@ -121,7 +121,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEcomputational
+*> \ingroup geqrf
 *
 *> \par Further Details:
 *  =====================
@@ -170,7 +170,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -199,7 +200,7 @@
          ELSE
             LWKOPT = N*NB
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
          RETURN
       END IF
 *
@@ -274,7 +275,7 @@
      $   CALL CGEQR2( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK,
      $                IINFO )
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of CGEQRF
diff --git a/lapack-netlib/SRC/cgeqrfp.f b/lapack-netlib/SRC/cgeqrfp.f
index 995404f43..5b6226c67 100644
--- a/lapack-netlib/SRC/cgeqrfp.f
+++ b/lapack-netlib/SRC/cgeqrfp.f
@@ -97,7 +97,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= max(1,N).
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise.
 *>          For optimum performance LWORK >= N*NB, where NB is
 *>          the optimal blocksize.
 *>
@@ -122,7 +123,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEcomputational
+*> \ingroup geqrfp
 *
 *> \par Further Details:
 *  =====================
@@ -162,8 +163,8 @@
 *
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, IINFO, IWS, K, LDWORK, LWKOPT, NB,
-     $                   NBMIN, NX
+      INTEGER            I, IB, IINFO, IWS, K, LDWORK, LWKMIN, LWKOPT,
+     $                   NB, NBMIN, NX
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CGEQR2P, CLARFB, CLARFT, XERBLA
@@ -173,7 +174,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -181,8 +183,16 @@
 *
       INFO = 0
       NB = ILAENV( 1, 'CGEQRF', ' ', M, N, -1, -1 )
-      LWKOPT = N*NB
-      WORK( 1 ) = LWKOPT
+      K = MIN( M, N )
+      IF( K.EQ.0 ) THEN
+         LWKMIN = 1
+         LWKOPT = 1
+      ELSE
+         LWKMIN = N
+         LWKOPT = N*NB
+      END IF
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+*
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -190,7 +200,7 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -7
       END IF
       IF( INFO.NE.0 ) THEN
@@ -202,7 +212,6 @@
 *
 *     Quick return if possible
 *
-      K = MIN( M, N )
       IF( K.EQ.0 ) THEN
          WORK( 1 ) = 1
          RETURN
@@ -210,7 +219,7 @@
 *
       NBMIN = 2
       NX = 0
-      IWS = N
+      IWS = LWKMIN
       IF( NB.GT.1 .AND. NB.LT.K ) THEN
 *
 *        Determine when to cross over from blocked to unblocked code.
@@ -272,7 +281,7 @@
      $   CALL CGEQR2P( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK,
      $                IINFO )
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK( IWS )
       RETURN
 *
 *     End of CGEQRFP
diff --git a/lapack-netlib/SRC/cgerqf.f b/lapack-netlib/SRC/cgerqf.f
index d2247844c..6f914c892 100644
--- a/lapack-netlib/SRC/cgerqf.f
+++ b/lapack-netlib/SRC/cgerqf.f
@@ -114,7 +114,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEcomputational
+*> \ingroup gerqf
 *
 *> \par Further Details:
 *  =====================
@@ -163,7 +163,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -187,7 +188,7 @@
             NB = ILAENV( 1, 'CGERQF', ' ', M, N, -1, -1 )
             LWKOPT = M*NB
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF ( .NOT.LQUERY ) THEN
             IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) )
@@ -278,7 +279,7 @@
       IF( MU.GT.0 .AND. NU.GT.0 )
      $   CALL CGERQ2( MU, NU, A, LDA, TAU, WORK, IINFO )
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of CGERQF
diff --git a/lapack-netlib/SRC/cgesdd.f b/lapack-netlib/SRC/cgesdd.f
index b824374d3..1838629ae 100644
--- a/lapack-netlib/SRC/cgesdd.f
+++ b/lapack-netlib/SRC/cgesdd.f
@@ -53,12 +53,6 @@
 *>
 *> Note that the routine returns VT = V**H, not V.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/cgesvd.f b/lapack-netlib/SRC/cgesvd.f
index 239b13431..6165a6acf 100644
--- a/lapack-netlib/SRC/cgesvd.f
+++ b/lapack-netlib/SRC/cgesvd.f
@@ -206,7 +206,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEsing
+*> \ingroup gesvd
 *
 *  =====================================================================
       SUBROUTINE CGESVD( JOBU, JOBVT, M, N, A, LDA, S, U, LDU, VT, LDVT,
@@ -259,8 +259,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               CLANGE, SLAMCH
-      EXTERNAL           LSAME, ILAENV, CLANGE, SLAMCH
+      REAL               CLANGE, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, CLANGE, SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN, SQRT
@@ -615,7 +615,7 @@
             END IF
          END IF
          MAXWRK = MAX( MINWRK, MAXWRK )
-         WORK( 1 ) = MAXWRK
+         WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN
             INFO = -13
@@ -3694,7 +3694,7 @@
 *
 *     Return optimal workspace in WORK(1)
 *
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cgesvdq.f b/lapack-netlib/SRC/cgesvdq.f
index 1a587eca3..95091c2c0 100644
--- a/lapack-netlib/SRC/cgesvdq.f
+++ b/lapack-netlib/SRC/cgesvdq.f
@@ -363,7 +363,7 @@
 *>   an optimal implementation would do all necessary scaling before calling
 *>   CGESVD and the scaling in CGESVD can be switched off.
 *>   3. Other comments related to code optimization are given in comments in the
-*>   code, enlosed in [[double brackets]].
+*>   code, enclosed in [[double brackets]].
 *> \endverbatim
 *
 *> \par Bugs, examples and comments
diff --git a/lapack-netlib/SRC/cgesvdx.f b/lapack-netlib/SRC/cgesvdx.f
index fbdb121ca..e1856a65f 100644
--- a/lapack-netlib/SRC/cgesvdx.f
+++ b/lapack-netlib/SRC/cgesvdx.f
@@ -208,7 +208,7 @@
 *> \param[out] WORK
 *> \verbatim
 *>          WORK is COMPLEX array, dimension (MAX(1,LWORK))
-*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK;
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
@@ -261,7 +261,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEsing
+*> \ingroup gesvdx
 *
 *  =====================================================================
       SUBROUTINE CGESVDX( JOBU, JOBVT, RANGE, M, N, A, LDA, VL, VU,
@@ -312,8 +312,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, CLANGE
-      EXTERNAL           LSAME, ILAENV, SLAMCH, CLANGE
+      REAL               SLAMCH, CLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SLAMCH, CLANGE, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN, SQRT
@@ -448,7 +448,7 @@
             END IF
          END IF
          MAXWRK = MAX( MAXWRK, MINWRK )
-         WORK( 1 ) = CMPLX( REAL( MAXWRK ), ZERO )
+         WORK( 1 ) = SROUNDUP_LWORK( MAXWRK )
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN
             INFO = -19
@@ -464,7 +464,7 @@
 *
 *     Quick return if possible
 *
-      IF( M.EQ.0 .OR. N.EQ.0 ) THEN
+      IF( MINMN.EQ.0 ) THEN
          RETURN
       END IF
 *
@@ -846,7 +846,7 @@
 *
 *     Return optimal workspace in WORK(1)
 *
-      WORK( 1 ) = CMPLX( REAL( MAXWRK ), ZERO )
+      WORK( 1 ) = SROUNDUP_LWORK( MAXWRK )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cgesvj.f b/lapack-netlib/SRC/cgesvj.f
index 149cf5e48..b9c8f1709 100644
--- a/lapack-netlib/SRC/cgesvj.f
+++ b/lapack-netlib/SRC/cgesvj.f
@@ -208,15 +208,17 @@
 *> \verbatim
 *>          CWORK is COMPLEX array, dimension (max(1,LWORK))
 *>          Used as workspace.
-*>          If on entry LWORK = -1, then a workspace query is assumed and
-*>          no computation is done; CWORK(1) is set to the minial (and optimal)
-*>          length of CWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER.
-*>          Length of CWORK, LWORK >= M+N.
+*>          Length of CWORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M+N, otherwise.
+*>
+*>          If on entry LWORK = -1, then a workspace query is assumed and
+*>          no computation is done; CWORK(1) is set to the minimal (and optimal)
+*>          length of CWORK.
 *> \endverbatim
 *>
 *> \param[in,out] RWORK
@@ -247,15 +249,17 @@
 *>          RWORK(6) = the largest absolute value over all sines of the
 *>                    Jacobi rotation angles in the last sweep. It can be
 *>                    useful for a post festum analysis.
-*>         If on entry LRWORK = -1, then a workspace query is assumed and
-*>         no computation is done; RWORK(1) is set to the minial (and optimal)
-*>         length of RWORK.
 *> \endverbatim
 *>
 *> \param[in] LRWORK
 *> \verbatim
 *>         LRWORK is INTEGER
-*>         Length of RWORK, LRWORK >= MAX(6,N).
+*>         Length of RWORK.
+*>         LRWORK >= 1, if MIN(M,N) = 0, and LRWORK >= MAX(6,N), otherwise.
+*>
+*>         If on entry LRWORK = -1, then a workspace query is assumed and
+*>         no computation is done; RWORK(1) is set to the minimal (and optimal)
+*>         length of RWORK.
 *> \endverbatim
 *>
 *> \param[out] INFO
@@ -276,7 +280,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEcomputational
+*> \ingroup gesvj
 *
 *> \par Further Details:
 *  =====================
@@ -374,16 +378,17 @@
       PARAMETER  ( NSWEEP = 30 )
 *     ..
 *     .. Local Scalars ..
-      COMPLEX AAPQ, OMPQ
-      REAL    AAPP, AAPP0, AAPQ1, AAQQ, APOAQ, AQOAP, BIG,
-     $        BIGTHETA, CS, CTOL, EPSLN, MXAAPQ,
-     $        MXSINJ, ROOTBIG, ROOTEPS, ROOTSFMIN, ROOTTOL,
-     $        SKL, SFMIN, SMALL, SN, T, TEMP1, THETA, THSIGN, TOL
-      INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1,
-     $        ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34,
-     $        N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, SWBAND
-      LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, ROTOK,
-     $        RSVEC, UCTOL, UPPER
+      COMPLEX    AAPQ, OMPQ
+      REAL       AAPP, AAPP0, AAPQ1, AAQQ, APOAQ, AQOAP, BIG,
+     $           BIGTHETA, CS, CTOL, EPSLN, MXAAPQ,
+     $           MXSINJ, ROOTBIG, ROOTEPS, ROOTSFMIN, ROOTTOL,
+     $           SKL, SFMIN, SMALL, SN, T, TEMP1, THETA, THSIGN, TOL
+      INTEGER    BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1,
+     $           ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34,
+     $           N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, SWBAND,
+     $           MINMN, LWMIN, LRWMIN
+      LOGICAL    APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, ROTOK,
+     $           RSVEC, UCTOL, UPPER
 *     ..
 *     ..
 *     .. Intrinsic Functions ..
@@ -398,8 +403,8 @@
       INTEGER            ISAMAX
       EXTERNAL           ISAMAX
 *     from LAPACK
-      REAL               SLAMCH
-      EXTERNAL           SLAMCH
+      REAL               SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           SLAMCH, SROUNDUP_LWORK
       LOGICAL            LSAME
       EXTERNAL           LSAME
 *     ..
@@ -422,7 +427,16 @@
       UPPER = LSAME( JOBA, 'U' )
       LOWER = LSAME( JOBA, 'L' )
 *
-      LQUERY = ( LWORK .EQ. -1 ) .OR. ( LRWORK .EQ. -1 )
+      MINMN = MIN( M, N )
+      IF( MINMN.EQ.0 ) THEN
+         LWMIN  = 1
+         LRWMIN = 1
+      ELSE
+         LWMIN  = M + N
+         LRWMIN = MAX( 6, N )
+      END IF
+*
+      LQUERY = ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 )
       IF( .NOT.( UPPER .OR. LOWER .OR. LSAME( JOBA, 'G' ) ) ) THEN
          INFO = -1
       ELSE IF( .NOT.( LSVEC .OR. UCTOL .OR. LSAME( JOBU, 'N' ) ) ) THEN
@@ -442,9 +456,9 @@
          INFO = -11
       ELSE IF( UCTOL .AND. ( RWORK( 1 ).LE.ONE ) ) THEN
          INFO = -12
-      ELSE IF( LWORK.LT.( M+N ) .AND. ( .NOT.LQUERY ) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. ( .NOT.LQUERY ) ) THEN
          INFO = -13
-      ELSE IF( LRWORK.LT.MAX( N, 6 ) .AND. ( .NOT.LQUERY ) ) THEN
+      ELSE IF( LRWORK.LT.LRWMIN .AND. ( .NOT.LQUERY ) ) THEN
          INFO = -15
       ELSE
          INFO = 0
@@ -454,15 +468,15 @@
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'CGESVJ', -INFO )
          RETURN
-      ELSE IF ( LQUERY ) THEN
-         CWORK(1) = M + N
-         RWORK(1) = MAX( N, 6 )
+      ELSE IF( LQUERY ) THEN
+         CWORK( 1 ) = SROUNDUP_LWORK( LWMIN )
+         RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN )
          RETURN
       END IF
 *
 * #:) Quick return for void matrix
 *
-      IF( ( M.EQ.0 ) .OR. ( N.EQ.0 ) )RETURN
+      IF( MINMN.EQ.0 ) RETURN
 *
 *     Set numerical parameters
 *     The stopping criterion for Jacobi rotations is
diff --git a/lapack-netlib/SRC/cgetf2.f b/lapack-netlib/SRC/cgetf2.f
index aac989970..995ee40ec 100644
--- a/lapack-netlib/SRC/cgetf2.f
+++ b/lapack-netlib/SRC/cgetf2.f
@@ -101,7 +101,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEcomputational
+*> \ingroup getf2
 *
 *  =====================================================================
       SUBROUTINE CGETF2( M, N, A, LDA, IPIV, INFO )
@@ -126,16 +126,14 @@
      $                   ZERO = ( 0.0E+0, 0.0E+0 ) )
 *     ..
 *     .. Local Scalars ..
-      REAL               SFMIN
-      INTEGER            I, J, JP
+      INTEGER            J, JP
 *     ..
 *     .. External Functions ..
-      REAL               SLAMCH
       INTEGER            ICAMAX
-      EXTERNAL           SLAMCH, ICAMAX
+      EXTERNAL           ICAMAX
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CGERU, CSCAL, CSWAP, XERBLA
+      EXTERNAL           CGERU, CRSCL, CSWAP, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -161,10 +159,6 @@
 *
       IF( M.EQ.0 .OR. N.EQ.0 )
      $   RETURN
-*
-*     Compute machine safe minimum
-*
-      SFMIN = SLAMCH('S')
 *
       DO 10 J = 1, MIN( M, N )
 *
@@ -181,15 +175,8 @@
 *
 *           Compute elements J+1:M of J-th column.
 *
-            IF( J.LT.M ) THEN
-               IF( ABS(A( J, J )) .GE. SFMIN ) THEN
-                  CALL CSCAL( M-J, ONE / A( J, J ), A( J+1, J ), 1 )
-               ELSE
-                  DO 20 I = 1, M-J
-                     A( J+I, J ) = A( J+I, J ) / A( J, J )
-   20             CONTINUE
-               END IF
-            END IF
+            IF( J.LT.M )
+     $         CALL CRSCL( M-J, A( J, J ), A( J+1, J ), 1 )
 *
          ELSE IF( INFO.EQ.0 ) THEN
 *
diff --git a/lapack-netlib/SRC/cgetri.f b/lapack-netlib/SRC/cgetri.f
index bd7fc286c..2eb3da7ab 100644
--- a/lapack-netlib/SRC/cgetri.f
+++ b/lapack-netlib/SRC/cgetri.f
@@ -107,7 +107,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEcomputational
+*> \ingroup getri
 *
 *  =====================================================================
       SUBROUTINE CGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO )
@@ -138,7 +138,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CGEMM, CGEMV, CSWAP, CTRSM, CTRTRI, XERBLA
@@ -152,8 +153,8 @@
 *
       INFO = 0
       NB = ILAENV( 1, 'CGETRI', ' ', N, -1, -1, -1 )
-      LWKOPT = N*NB
-      WORK( 1 ) = LWKOPT
+      LWKOPT = MAX( 1, N*NB )
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       LQUERY = ( LWORK.EQ.-1 )
       IF( N.LT.0 ) THEN
          INFO = -1
@@ -251,7 +252,7 @@
      $      CALL CSWAP( N, A( 1, J ), 1, A( 1, JP ), 1 )
    60 CONTINUE
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK( IWS )
       RETURN
 *
 *     End of CGETRI
diff --git a/lapack-netlib/SRC/cgetsls.f b/lapack-netlib/SRC/cgetsls.f
index 8a4d02224..3f43dc8de 100644
--- a/lapack-netlib/SRC/cgetsls.f
+++ b/lapack-netlib/SRC/cgetsls.f
@@ -127,7 +127,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1 or -2, then a workspace query is assumed.
 *>          If LWORK = -1, the routine calculates optimal size of WORK for the
 *>          optimal performance and returns this value in WORK(1).
@@ -154,7 +154,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEsolve
+*> \ingroup getsls
 *
 *  =====================================================================
       SUBROUTINE CGETSLS( TRANS, M, N, NRHS, A, LDA, B, LDB,
@@ -191,15 +191,15 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               SLAMCH, CLANGE
-      EXTERNAL           LSAME, SLABAD, SLAMCH, CLANGE
+      REAL               SLAMCH, CLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, CLANGE, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CGEQR, CGEMQR, CLASCL, CLASET,
      $                   CTRTRS, XERBLA, CGELQ, CGEMLQ
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          REAL, MAX, MIN, INT
+      INTRINSIC          MAX, MIN, INT
 *     ..
 *     .. Executable Statements ..
 *
@@ -229,7 +229,10 @@
 *
 *     Determine the optimum and minimum LWORK
 *
-       IF( M.GE.N ) THEN
+       IF( MIN( M, N, NRHS ).EQ.0 ) THEN
+         WSIZEO = 1
+         WSIZEM = 1
+       ELSE IF ( M.GE.N ) THEN
          CALL CGEQR( M, N, A, LDA, TQ, -1, WORKQ, -1, INFO2 )
          TSZO = INT( TQ( 1 ) )
          LWO  = INT( WORKQ( 1 ) )
@@ -265,7 +268,7 @@
           INFO = -10
        END IF
 *
-       WORK( 1 ) = REAL( WSIZEO )
+       WORK( 1 ) = SROUNDUP_LWORK( WSIZEO )
 *
       END IF
 *
@@ -274,7 +277,7 @@
         RETURN
       END IF
       IF( LQUERY ) THEN
-        IF( LWORK.EQ.-2 ) WORK( 1 ) = REAL( WSIZEM )
+        IF( LWORK.EQ.-2 ) WORK( 1 ) = SROUNDUP_LWORK( WSIZEM )
         RETURN
       END IF
       IF( LWORK.LT.WSIZEO ) THEN
@@ -297,7 +300,6 @@
 *
        SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' )
        BIGNUM = ONE / SMLNUM
-       CALL SLABAD( SMLNUM, BIGNUM )
 *
 *     Scale A, B if max element outside range [SMLNUM,BIGNUM]
 *
@@ -485,7 +487,7 @@
       END IF
 *
    50 CONTINUE
-      WORK( 1 ) = REAL( TSZO + LWO )
+      WORK( 1 ) = SROUNDUP_LWORK( TSZO + LWO )
       RETURN
 *
 *     End of CGETSLS
diff --git a/lapack-netlib/SRC/cgetsqrhrt.f b/lapack-netlib/SRC/cgetsqrhrt.f
index 4e4dc1d4a..087e9bc7f 100644
--- a/lapack-netlib/SRC/cgetsqrhrt.f
+++ b/lapack-netlib/SRC/cgetsqrhrt.f
@@ -131,13 +131,15 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          The dimension of the array WORK.
-*>          LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ),
+*>          If MIN(M,N) = 0, LWORK >= 1, else
+*>          LWORK >= MAX( 1, LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ),
 *>          where
 *>             NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)),
 *>             NB1LOCAL = MIN(NB1,N).
 *>             LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL,
 *>             LW1 = NB1LOCAL * N,
-*>             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ),
+*>             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ).
+*>
 *>          If LWORK = -1, then a workspace query is assumed.
 *>          The routine only calculates the optimal size of the WORK
 *>          array, returns this value as the first entry of the WORK
@@ -160,7 +162,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup comlpexOTHERcomputational
+*> \ingroup getsqrhrt
 *
 *> \par Contributors:
 *  ==================
@@ -200,6 +202,10 @@
       INTEGER            I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT,
      $                   NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS
 *     ..
+*     .. External Functions ..
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
+*     ..
 *     .. External Subroutines ..
       EXTERNAL           CCOPY, CLATSQR, CUNGTSQR_ROW, CUNHR_COL,
      $                   XERBLA
@@ -212,7 +218,7 @@
 *     Test the input arguments
 *
       INFO = 0
-      LQUERY  = LWORK.EQ.-1
+      LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
       ELSE IF( N.LT.0 .OR. M.LT.N ) THEN
@@ -225,7 +231,7 @@
          INFO = -5
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -7
-      ELSE IF( LDT.LT.MAX( 1,  MIN( NB2, N ) ) ) THEN
+      ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN
          INFO = -9
       ELSE
 *
@@ -263,8 +269,9 @@
             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) )
 *
             LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) )
+            LWORKOPT = MAX( 1, LWORKOPT )
 *
-            IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN
+            IF( LWORK.LT.LWORKOPT .AND. .NOT.LQUERY ) THEN
                INFO = -11
             END IF
 *
@@ -277,14 +284,14 @@
          CALL XERBLA( 'CGETSQRHRT', -INFO )
          RETURN
       ELSE IF ( LQUERY ) THEN
-         WORK( 1 ) = CMPLX( LWORKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT )
          RETURN
       END IF
 *
 *     Quick return if possible
 *
       IF( MIN( M, N ).EQ.0 ) THEN
-         WORK( 1 ) = CMPLX( LWORKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT )
          RETURN
       END IF
 *
@@ -341,9 +348,9 @@
          END IF
       END DO
 *
-      WORK( 1 ) = CMPLX( LWORKOPT )
+      WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT )
       RETURN
 *
 *     End of CGETSQRHRT
 *
-      END
\ No newline at end of file
+      END
diff --git a/lapack-netlib/SRC/cgges.f b/lapack-netlib/SRC/cgges.f
index c54174da4..0ff848735 100644
--- a/lapack-netlib/SRC/cgges.f
+++ b/lapack-netlib/SRC/cgges.f
@@ -261,7 +261,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEeigen
+*> \ingroup gges
 *
 *  =====================================================================
       SUBROUTINE CGGES( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, LDB,
@@ -312,14 +312,13 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CGEQRF, CGGBAK, CGGBAL, CGGHRD, CHGEQZ, CLACPY,
-     $                   CLASCL, CLASET, CTGSEN, CUNGQR, CUNMQR, SLABAD,
-     $                   XERBLA
+     $                   CLASCL, CLASET, CTGSEN, CUNGQR, CUNMQR, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               CLANGE, SLAMCH
-      EXTERNAL           LSAME, ILAENV, CLANGE, SLAMCH
+      REAL               CLANGE, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, CLANGE, SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, SQRT
@@ -390,7 +389,7 @@
             LWKOPT = MAX( LWKOPT, N +
      $                    N*ILAENV( 1, 'CUNGQR', ' ', N, 1, N, -1 ) )
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY )
      $      INFO = -18
@@ -415,7 +414,6 @@
       EPS = SLAMCH( 'P' )
       SMLNUM = SLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -587,7 +585,7 @@
 *
    30 CONTINUE
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cgges3.f b/lapack-netlib/SRC/cgges3.f
index aac9f9510..c1ca79688 100644
--- a/lapack-netlib/SRC/cgges3.f
+++ b/lapack-netlib/SRC/cgges3.f
@@ -215,7 +215,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= MAX(1,2*N).
+*>          For good performance, LWORK must generally be larger.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -260,7 +261,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEeigen
+*> \ingroup gges3
 *
 *  =====================================================================
       SUBROUTINE CGGES3( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B,
@@ -300,7 +301,8 @@
       LOGICAL            CURSL, ILASCL, ILBSCL, ILVSL, ILVSR, LASTSL,
      $                   LQUERY, WANTST
       INTEGER            I, ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT,
-     $                   ILO, IRIGHT, IROWS, IRWRK, ITAU, IWRK, LWKOPT
+     $                   ILO, IRIGHT, IROWS, IRWRK, ITAU, IWRK, LWKOPT,
+     $                   LWKMIN
       REAL               ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, PVSL,
      $                   PVSR, SMLNUM
 *     ..
@@ -310,13 +312,12 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CGEQRF, CGGBAK, CGGBAL, CGGHD3, CLAQZ0, CLACPY,
-     $                   CLASCL, CLASET, CTGSEN, CUNGQR, CUNMQR, SLABAD,
-     $                   XERBLA
+     $                   CLASCL, CLASET, CTGSEN, CUNGQR, CUNMQR, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               CLANGE, SLAMCH
-      EXTERNAL           LSAME, CLANGE, SLAMCH
+      REAL               CLANGE, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           LSAME, CLANGE, SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, SQRT
@@ -353,6 +354,8 @@
 *
       INFO = 0
       LQUERY = ( LWORK.EQ.-1 )
+      LWKMIN = MAX( 1, 2*N )
+*
       IF( IJOBVL.LE.0 ) THEN
          INFO = -1
       ELSE IF( IJOBVR.LE.0 ) THEN
@@ -369,7 +372,7 @@
          INFO = -14
       ELSE IF( LDVSR.LT.1 .OR. ( ILVSR .AND. LDVSR.LT.N ) ) THEN
          INFO = -16
-      ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -18
       END IF
 *
@@ -377,29 +380,33 @@
 *
       IF( INFO.EQ.0 ) THEN
          CALL CGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR )
-         LWKOPT = MAX( 1,  N + INT ( WORK( 1 ) ) )
+         LWKOPT = MAX( LWKMIN, N + INT( WORK( 1 ) ) )
          CALL CUNMQR( 'L', 'C', N, N, N, B, LDB, WORK, A, LDA, WORK,
      $                -1, IERR )
-         LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) )
+         LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) )
          IF( ILVSL ) THEN
             CALL CUNGQR( N, N, N, VSL, LDVSL, WORK, WORK, -1,
      $                   IERR )
-            LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) )
+            LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) )
          END IF
          CALL CGGHD3( JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, VSL,
      $                LDVSL, VSR, LDVSR, WORK, -1, IERR )
-         LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) )
+         LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) )
          CALL CLAQZ0( 'S', JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB,
      $                ALPHA, BETA, VSL, LDVSL, VSR, LDVSR, WORK, -1,
      $                RWORK, 0, IERR )
-         LWKOPT = MAX( LWKOPT, INT ( WORK( 1 ) ) )
+         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
          IF( WANTST ) THEN
             CALL CTGSEN( 0, ILVSL, ILVSR, BWORK, N, A, LDA, B, LDB,
      $                   ALPHA, BETA, VSL, LDVSL, VSR, LDVSR, SDIM,
      $                   PVSL, PVSR, DIF, WORK, -1, IDUM, 1, IERR )
-            LWKOPT = MAX( LWKOPT, INT ( WORK( 1 ) ) )
+            LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
+         END IF
+         IF( N.EQ.0 ) THEN
+            WORK( 1 ) = 1
+         ELSE
+            WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
          END IF
-         WORK( 1 ) = CMPLX( LWKOPT )
       END IF
 
 *
@@ -422,7 +429,6 @@
       EPS = SLAMCH( 'P' )
       SMLNUM = SLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -585,7 +591,7 @@
 *
    30 CONTINUE
 *
-      WORK( 1 ) = CMPLX( LWKOPT )
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cggesx.f b/lapack-netlib/SRC/cggesx.f
index 6385a74c1..3bf460fac 100644
--- a/lapack-netlib/SRC/cggesx.f
+++ b/lapack-netlib/SRC/cggesx.f
@@ -320,7 +320,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEeigen
+*> \ingroup ggesx
 *
 *  =====================================================================
       SUBROUTINE CGGESX( JOBVSL, JOBVSR, SORT, SELCTG, SENSE, N, A, LDA,
@@ -373,14 +373,13 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CGEQRF, CGGBAK, CGGBAL, CGGHRD, CHGEQZ, CLACPY,
-     $                   CLASCL, CLASET, CTGSEN, CUNGQR, CUNMQR, SLABAD,
-     $                   XERBLA
+     $                   CLASCL, CLASET, CTGSEN, CUNGQR, CUNMQR, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               CLANGE, SLAMCH
-      EXTERNAL           LSAME, ILAENV, CLANGE, SLAMCH
+      REAL               CLANGE, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, CLANGE, SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, SQRT
@@ -476,7 +475,7 @@
             MAXWRK = 1
             LWRK   = 1
          END IF
-         WORK( 1 ) = LWRK
+         WORK( 1 ) = SROUNDUP_LWORK(LWRK)
          IF( WANTSN .OR. N.EQ.0 ) THEN
             LIWMIN = 1
          ELSE
@@ -510,7 +509,6 @@
       EPS = SLAMCH( 'P' )
       SMLNUM = SLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -705,7 +703,7 @@
 *
    40 CONTINUE
 *
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
       IWORK( 1 ) = LIWMIN
 *
       RETURN
diff --git a/lapack-netlib/SRC/cggev.f b/lapack-netlib/SRC/cggev.f
index c1c28a180..cf16e3079 100644
--- a/lapack-netlib/SRC/cggev.f
+++ b/lapack-netlib/SRC/cggev.f
@@ -209,7 +209,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEeigen
+*> \ingroup ggev
 *
 *  =====================================================================
       SUBROUTINE CGGEV( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHA, BETA,
@@ -254,14 +254,13 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CGEQRF, CGGBAK, CGGBAL, CGGHRD, CHGEQZ, CLACPY,
-     $                   CLASCL, CLASET, CTGEVC, CUNGQR, CUNMQR, SLABAD,
-     $                   XERBLA
+     $                   CLASCL, CLASET, CTGEVC, CUNGQR, CUNMQR, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               CLANGE, SLAMCH
-      EXTERNAL           LSAME, ILAENV, CLANGE, SLAMCH
+      REAL               CLANGE, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, CLANGE, SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, AIMAG, MAX, REAL, SQRT
@@ -336,7 +335,7 @@
             LWKOPT = MAX( LWKOPT, N +
      $                 N*ILAENV( 1, 'CUNGQR', ' ', N, 1, N, -1 ) )
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY )
      $      INFO = -15
@@ -359,7 +358,6 @@
       EPS = SLAMCH( 'E' )*SLAMCH( 'B' )
       SMLNUM = SLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -547,7 +545,7 @@
       IF( ILBSCL )
      $   CALL CLASCL( 'G', 0, 0, BNRMTO, BNRM, N, 1, BETA, N, IERR )
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of CGGEV
diff --git a/lapack-netlib/SRC/cggev3.f b/lapack-netlib/SRC/cggev3.f
index 9483ecdeb..d2b75aebc 100644
--- a/lapack-netlib/SRC/cggev3.f
+++ b/lapack-netlib/SRC/cggev3.f
@@ -174,7 +174,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= MAX(1,2*N).
+*>          For good performance, LWORK must generally be larger.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -208,7 +209,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEeigen
+*> \ingroup ggev3
 *
 *  =====================================================================
       SUBROUTINE CGGEV3( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHA, BETA,
@@ -243,7 +244,7 @@
       CHARACTER          CHTEMP
       INTEGER            ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, ILO,
      $                   IN, IRIGHT, IROWS, IRWRK, ITAU, IWRK, JC, JR,
-     $                   LWKOPT
+     $                   LWKOPT, LWKMIN
       REAL               ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS,
      $                   SMLNUM, TEMP
       COMPLEX            X
@@ -253,13 +254,12 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CGEQRF, CGGBAK, CGGBAL, CGGHD3, CLAQZ0, CLACPY,
-     $                   CLASCL, CLASET, CTGEVC, CUNGQR, CUNMQR, SLABAD,
-     $                   XERBLA
+     $                   CLASCL, CLASET, CTGEVC, CUNGQR, CUNMQR, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               CLANGE, SLAMCH
-      EXTERNAL           LSAME, CLANGE, SLAMCH
+      REAL               CLANGE, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           LSAME, CLANGE, SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, AIMAG, MAX, REAL, SQRT
@@ -301,6 +301,7 @@
 *
       INFO = 0
       LQUERY = ( LWORK.EQ.-1 )
+      LWKMIN = MAX( 1, 2*N )
       IF( IJOBVL.LE.0 ) THEN
          INFO = -1
       ELSE IF( IJOBVR.LE.0 ) THEN
@@ -315,7 +316,7 @@
          INFO = -11
       ELSE IF( LDVR.LT.1 .OR. ( ILVR .AND. LDVR.LT.N ) ) THEN
          INFO = -13
-      ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -15
       END IF
 *
@@ -323,7 +324,7 @@
 *
       IF( INFO.EQ.0 ) THEN
          CALL CGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR )
-         LWKOPT = MAX( N,  N+INT( WORK( 1 ) ) )
+         LWKOPT = MAX( LWKMIN, N+INT( WORK( 1 ) ) )
          CALL CUNMQR( 'L', 'C', N, N, N, B, LDB, WORK, A, LDA, WORK,
      $                -1, IERR )
          LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) )
@@ -348,7 +349,11 @@
      $                   RWORK, 0, IERR )
             LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) )
          END IF
-         WORK( 1 ) = CMPLX( LWKOPT )
+         IF( N.EQ.0 ) THEN
+            WORK( 1 ) = 1
+         ELSE
+            WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+         END IF
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -368,7 +373,6 @@
       EPS = SLAMCH( 'E' )*SLAMCH( 'B' )
       SMLNUM = SLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -549,7 +553,7 @@
       IF( ILBSCL )
      $   CALL CLASCL( 'G', 0, 0, BNRMTO, BNRM, N, 1, BETA, N, IERR )
 *
-      WORK( 1 ) = CMPLX( LWKOPT )
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       RETURN
 *
 *     End of CGGEV3
diff --git a/lapack-netlib/SRC/cggevx.f b/lapack-netlib/SRC/cggevx.f
index 405c9c3b5..fa4e92682 100644
--- a/lapack-netlib/SRC/cggevx.f
+++ b/lapack-netlib/SRC/cggevx.f
@@ -335,7 +335,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEeigen
+*> \ingroup ggevx
 *
 *> \par Further Details:
 *  =====================
@@ -416,13 +416,13 @@
 *     .. External Subroutines ..
       EXTERNAL           CGEQRF, CGGBAK, CGGBAL, CGGHRD, CHGEQZ, CLACPY,
      $                   CLASCL, CLASET, CTGEVC, CTGSNA, CUNGQR, CUNMQR,
-     $                   SLABAD, SLASCL, XERBLA
+     $                   SLASCL, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               CLANGE, SLAMCH
-      EXTERNAL           LSAME, ILAENV, CLANGE, SLAMCH
+      REAL               CLANGE, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, CLANGE, SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, AIMAG, MAX, REAL, SQRT
@@ -521,7 +521,7 @@
      $                       N*ILAENV( 1, 'CUNGQR', ' ', N, 1, N, 0 ) )
             END IF
          END IF
-         WORK( 1 ) = MAXWRK
+         WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN
             INFO = -25
@@ -545,7 +545,6 @@
       EPS = SLAMCH( 'P' )
       SMLNUM = SLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -793,7 +792,7 @@
       IF( ILBSCL )
      $   CALL CLASCL( 'G', 0, 0, BNRMTO, BNRM, N, 1, BETA, N, IERR )
 *
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
       RETURN
 *
 *     End of CGGEVX
diff --git a/lapack-netlib/SRC/cggglm.f b/lapack-netlib/SRC/cggglm.f
index fb384b651..0d36deca6 100644
--- a/lapack-netlib/SRC/cggglm.f
+++ b/lapack-netlib/SRC/cggglm.f
@@ -177,7 +177,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHEReigen
+*> \ingroup ggglm
 *
 *  =====================================================================
       SUBROUTINE CGGGLM( N, M, P, A, LDA, B, LDB, D, X, Y, WORK, LWORK,
@@ -213,7 +213,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          INT, MAX, MIN
@@ -252,7 +253,7 @@
             LWKMIN = M + N + P
             LWKOPT = M + NP + MAX( N, P )*NB
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
             INFO = -12
diff --git a/lapack-netlib/SRC/cgghd3.f b/lapack-netlib/SRC/cgghd3.f
index 1074b4828..f7175a72c 100644
--- a/lapack-netlib/SRC/cgghd3.f
+++ b/lapack-netlib/SRC/cgghd3.f
@@ -180,14 +180,14 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX array, dimension (LWORK)
+*>          WORK is COMPLEX array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in]  LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of the array WORK.  LWORK >= 1.
+*>          The length of the array WORK. LWORK >= 1.
 *>          For optimum performance LWORK >= 6*N*NB, where NB is the
 *>          optimal blocksize.
 *>
@@ -212,7 +212,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup gghd3
 *
 *> \par Further Details:
 *  =====================
@@ -265,7 +265,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           ILAENV, LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CGGHRD, CLARTG, CLASET, CUNM22, CROT, CGEMM,
@@ -280,8 +281,13 @@
 *
       INFO = 0
       NB = ILAENV( 1, 'CGGHD3', ' ', N, ILO, IHI, -1 )
-      LWKOPT = MAX( 6*N*NB, 1 )
-      WORK( 1 ) = CMPLX( LWKOPT )
+      NH = IHI - ILO + 1
+      IF( NH.LE.1 ) THEN
+         LWKOPT = 1
+      ELSE
+         LWKOPT = 6*N*NB
+      END IF
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       INITQ = LSAME( COMPQ, 'I' )
       WANTQ = INITQ .OR. LSAME( COMPQ, 'V' )
       INITZ = LSAME( COMPZ, 'I' )
@@ -330,7 +336,6 @@
 *
 *     Quick return if possible
 *
-      NH = IHI - ILO + 1
       IF( NH.LE.1 ) THEN
          WORK( 1 ) = CONE
          RETURN
@@ -888,7 +893,8 @@
       IF ( JCOL.LT.IHI )
      $   CALL CGGHRD( COMPQ2, COMPZ2, N, JCOL, IHI, A, LDA, B, LDB, Q,
      $                LDQ, Z, LDZ, IERR )
-      WORK( 1 ) = CMPLX( LWKOPT )
+*
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cgglse.f b/lapack-netlib/SRC/cgglse.f
index cca20dfed..b1c562385 100644
--- a/lapack-netlib/SRC/cgglse.f
+++ b/lapack-netlib/SRC/cgglse.f
@@ -172,7 +172,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERsolve
+*> \ingroup gglse
 *
 *  =====================================================================
       SUBROUTINE CGGLSE( M, N, P, A, LDA, B, LDB, C, D, X, WORK, LWORK,
@@ -207,7 +207,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          INT, MAX, MIN
@@ -246,7 +247,7 @@
             LWKMIN = M + N + P
             LWKOPT = P + MN + MAX( M, N )*NB
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
             INFO = -12
diff --git a/lapack-netlib/SRC/cggqrf.f b/lapack-netlib/SRC/cggqrf.f
index 0185f4e0d..309f170e8 100644
--- a/lapack-netlib/SRC/cggqrf.f
+++ b/lapack-netlib/SRC/cggqrf.f
@@ -173,7 +173,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup ggqrf
 *
 *> \par Further Details:
 *  =====================
@@ -236,7 +236,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          INT, MAX, MIN
@@ -250,8 +251,8 @@
       NB2 = ILAENV( 1, 'CGERQF', ' ', N, P, -1, -1 )
       NB3 = ILAENV( 1, 'CUNMQR', ' ', N, M, P, -1 )
       NB = MAX( NB1, NB2, NB3 )
-      LWKOPT = MAX( N, M, P)*NB
-      WORK( 1 ) = LWKOPT
+      LWKOPT = MAX( 1, MAX( N, M, P )*NB )
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       LQUERY = ( LWORK.EQ.-1 )
       IF( N.LT.0 ) THEN
          INFO = -1
@@ -287,7 +288,7 @@
 *     RQ factorization of N-by-P matrix B: B = T*Z.
 *
       CALL CGERQF( N, P, B, LDB, TAUB, WORK, LWORK, INFO )
-      WORK( 1 ) = MAX( LOPT, INT( WORK( 1 ) ) )
+      WORK( 1 ) = SROUNDUP_LWORK( MAX( LOPT, INT( WORK( 1 ) ) ) )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cggrqf.f b/lapack-netlib/SRC/cggrqf.f
index 5227100da..8470a1ce2 100644
--- a/lapack-netlib/SRC/cggrqf.f
+++ b/lapack-netlib/SRC/cggrqf.f
@@ -172,7 +172,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup ggrqf
 *
 *> \par Further Details:
 *  =====================
@@ -235,7 +235,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          INT, MAX, MIN
@@ -249,8 +250,8 @@
       NB2 = ILAENV( 1, 'CGEQRF', ' ', P, N, -1, -1 )
       NB3 = ILAENV( 1, 'CUNMRQ', ' ', M, N, P, -1 )
       NB = MAX( NB1, NB2, NB3 )
-      LWKOPT = MAX( N, M, P)*NB
-      WORK( 1 ) = LWKOPT
+      LWKOPT = MAX( 1, MAX( N, M, P )*NB )
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -287,7 +288,7 @@
 *     QR factorization of P-by-N matrix B: B = Z*T
 *
       CALL CGEQRF( P, N, B, LDB, TAUB, WORK, LWORK, INFO )
-      WORK( 1 ) = MAX( LOPT, INT( WORK( 1 ) ) )
+      WORK( 1 ) = SROUNDUP_LWORK( MAX( LOPT, INT( WORK( 1 ) ) ) )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cggsvd3.f b/lapack-netlib/SRC/cggsvd3.f
index f248aebd5..4c4b85bae 100644
--- a/lapack-netlib/SRC/cggsvd3.f
+++ b/lapack-netlib/SRC/cggsvd3.f
@@ -278,7 +278,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -333,7 +333,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGEsing
+*> \ingroup ggsvd3
 *
 *> \par Contributors:
 *  ==================
diff --git a/lapack-netlib/SRC/cggsvp3.f b/lapack-netlib/SRC/cggsvp3.f
index 008a053a2..e19f7efd5 100644
--- a/lapack-netlib/SRC/cggsvp3.f
+++ b/lapack-netlib/SRC/cggsvp3.f
@@ -233,7 +233,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -256,7 +256,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup ggsvp3
 *
 *> \par Further Details:
 *  =====================
diff --git a/lapack-netlib/SRC/cgsvj0.f b/lapack-netlib/SRC/cgsvj0.f
index 37853a10a..7426ecdf1 100644
--- a/lapack-netlib/SRC/cgsvj0.f
+++ b/lapack-netlib/SRC/cgsvj0.f
@@ -52,10 +52,10 @@
 *>          Specifies whether the output from this procedure is used
 *>          to compute the matrix V:
 *>          = 'V': the product of the Jacobi rotations is accumulated
-*>                 by postmulyiplying the N-by-N array V.
+*>                 by postmultiplying the N-by-N array V.
 *>                (See the description of V.)
 *>          = 'A': the product of the Jacobi rotations is accumulated
-*>                 by postmulyiplying the MV-by-N array V.
+*>                 by postmultiplying the MV-by-N array V.
 *>                (See the descriptions of MV and V.)
 *>          = 'N': the Jacobi rotations are not accumulated.
 *> \endverbatim
@@ -117,7 +117,7 @@
 *> \param[in] MV
 *> \verbatim
 *>          MV is INTEGER
-*>          If JOBV = 'A', then MV rows of V are post-multipled by a
+*>          If JOBV = 'A', then MV rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then MV is not referenced.
 *> \endverbatim
@@ -125,9 +125,9 @@
 *> \param[in,out] V
 *> \verbatim
 *>          V is COMPLEX array, dimension (LDV,N)
-*>          If JOBV = 'V' then N rows of V are post-multipled by a
+*>          If JOBV = 'V' then N rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
-*>          If JOBV = 'A' then MV rows of V are post-multipled by a
+*>          If JOBV = 'A' then MV rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then V is not referenced.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/cgsvj1.f b/lapack-netlib/SRC/cgsvj1.f
index f1a5204e3..18dd690cc 100644
--- a/lapack-netlib/SRC/cgsvj1.f
+++ b/lapack-netlib/SRC/cgsvj1.f
@@ -75,10 +75,10 @@
 *>          Specifies whether the output from this procedure is used
 *>          to compute the matrix V:
 *>          = 'V': the product of the Jacobi rotations is accumulated
-*>                 by postmulyiplying the N-by-N array V.
+*>                 by postmultiplying the N-by-N array V.
 *>                (See the description of V.)
 *>          = 'A': the product of the Jacobi rotations is accumulated
-*>                 by postmulyiplying the MV-by-N array V.
+*>                 by postmultiplying the MV-by-N array V.
 *>                (See the descriptions of MV and V.)
 *>          = 'N': the Jacobi rotations are not accumulated.
 *> \endverbatim
@@ -147,7 +147,7 @@
 *> \param[in] MV
 *> \verbatim
 *>          MV is INTEGER
-*>          If JOBV = 'A', then MV rows of V are post-multipled by a
+*>          If JOBV = 'A', then MV rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then MV is not referenced.
 *> \endverbatim
@@ -155,9 +155,9 @@
 *> \param[in,out] V
 *> \verbatim
 *>          V is COMPLEX array, dimension (LDV,N)
-*>          If JOBV = 'V' then N rows of V are post-multipled by a
+*>          If JOBV = 'V' then N rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
-*>          If JOBV = 'A' then MV rows of V are post-multipled by a
+*>          If JOBV = 'A' then MV rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then V is not referenced.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/chbev_2stage.f b/lapack-netlib/SRC/chbev_2stage.f
index 123d84729..f84d8d3d4 100644
--- a/lapack-netlib/SRC/chbev_2stage.f
+++ b/lapack-netlib/SRC/chbev_2stage.f
@@ -132,7 +132,7 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The length of the array WORK. LWORK >= 1, when N <= 1;
-*>          otherwise  
+*>          otherwise
 *>          If JOBZ = 'N' and N > 1, LWORK must be queried.
 *>                                   LWORK = MAX(1, dimension) where
 *>                                   dimension = (2KD+1)*N + KD*NTHREADS
@@ -171,7 +171,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHEReigen
+*> \ingroup hbev_2stage
 *
 *> \par Further Details:
 *  =====================
@@ -189,7 +189,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -197,11 +197,11 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *
@@ -240,8 +240,9 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV2STAGE
-      REAL               SLAMCH, CLANHB
-      EXTERNAL           LSAME, SLAMCH, CLANHB, ILAENV2STAGE
+      REAL               SLAMCH, CLANHB, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, CLANHB, ILAENV2STAGE,
+     $                   SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SSCAL, SSTERF, XERBLA, CLASCL, CSTEQR,
@@ -276,7 +277,7 @@
       IF( INFO.EQ.0 ) THEN
          IF( N.LE.1 ) THEN
             LWMIN = 1
-            WORK( 1 ) = LWMIN
+            WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          ELSE
             IB    = ILAENV2STAGE( 2, 'CHETRD_HB2ST', JOBZ,
      $                            N, KD, -1, -1 )
@@ -285,7 +286,7 @@
             LWTRD = ILAENV2STAGE( 4, 'CHETRD_HB2ST', JOBZ,
      $                            N, KD, IB, -1 )
             LWMIN = LHTRD + LWTRD
-            WORK( 1 )  = LWMIN
+            WORK( 1 )  = SROUNDUP_LWORK(LWMIN)
          ENDIF
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
@@ -351,7 +352,7 @@
       LLWORK  = LWORK - INDWRK + 1
 *
       CALL CHETRD_HB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W,
-     $                    RWORK( INDE ), WORK( INDHOUS ), LHTRD, 
+     $                    RWORK( INDE ), WORK( INDHOUS ), LHTRD,
      $                    WORK( INDWRK ), LLWORK, IINFO )
 *
 *     For eigenvalues only, call SSTERF.  For eigenvectors, call CSTEQR.
@@ -377,7 +378,7 @@
 *
 *     Set WORK(1) to optimal workspace size.
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/chbevd.f b/lapack-netlib/SRC/chbevd.f
index 1598f4de5..a5afe6b76 100644
--- a/lapack-netlib/SRC/chbevd.f
+++ b/lapack-netlib/SRC/chbevd.f
@@ -41,12 +41,6 @@
 *> a complex Hermitian band matrix A.  If eigenvectors are desired, it
 *> uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
@@ -207,7 +201,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHEReigen
+*> \ingroup hbevd
 *
 *  =====================================================================
       SUBROUTINE CHBEVD( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ, WORK,
@@ -245,8 +239,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               CLANHB, SLAMCH
-      EXTERNAL           LSAME, CLANHB, SLAMCH
+      REAL               CLANHB, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           LSAME, CLANHB, SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CGEMM, CHBTRD, CLACPY, CLASCL, CSTEDC, SSCAL,
@@ -294,7 +288,7 @@
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          RWORK( 1 ) = LRWMIN
          IWORK( 1 ) = LIWMIN
 *
@@ -388,7 +382,7 @@
          CALL SSCAL( IMAX, ONE / SIGMA, W, 1 )
       END IF
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       RWORK( 1 ) = LRWMIN
       IWORK( 1 ) = LIWMIN
       RETURN
diff --git a/lapack-netlib/SRC/chbevd_2stage.f b/lapack-netlib/SRC/chbevd_2stage.f
index 340c546e8..3c9c8ecc0 100644
--- a/lapack-netlib/SRC/chbevd_2stage.f
+++ b/lapack-netlib/SRC/chbevd_2stage.f
@@ -47,12 +47,6 @@
 *> the reduction to tridiagonal.  If eigenvectors are desired, it
 *> uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/chbevx_2stage.f b/lapack-netlib/SRC/chbevx_2stage.f
index 22bced45f..1d609dfbd 100644
--- a/lapack-netlib/SRC/chbevx_2stage.f
+++ b/lapack-netlib/SRC/chbevx_2stage.f
@@ -22,7 +22,7 @@
 *
 *       SUBROUTINE CHBEVX_2STAGE( JOBZ, RANGE, UPLO, N, KD, AB, LDAB,
 *                                 Q, LDQ, VL, VU, IL, IU, ABSTOL, M, W,
-*                                 Z, LDZ, WORK, LWORK, RWORK, IWORK, 
+*                                 Z, LDZ, WORK, LWORK, RWORK, IWORK,
 *                                 IFAIL, INFO )
 *
 *       IMPLICIT NONE
@@ -233,7 +233,7 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The length of the array WORK. LWORK >= 1, when N <= 1;
-*>          otherwise  
+*>          otherwise
 *>          If JOBZ = 'N' and N > 1, LWORK must be queried.
 *>                                   LWORK = MAX(1, dimension) where
 *>                                   dimension = (2KD+1)*N + KD*NTHREADS
@@ -285,7 +285,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHEReigen
+*> \ingroup hbevx_2stage
 *
 *> \par Further Details:
 *  =====================
@@ -303,7 +303,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -311,18 +311,18 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *
 *  =====================================================================
       SUBROUTINE CHBEVX_2STAGE( JOBZ, RANGE, UPLO, N, KD, AB, LDAB,
      $                          Q, LDQ, VL, VU, IL, IU, ABSTOL, M, W,
-     $                          Z, LDZ, WORK, LWORK, RWORK, IWORK, 
+     $                          Z, LDZ, WORK, LWORK, RWORK, IWORK,
      $                          IFAIL, INFO )
 *
       IMPLICIT NONE
@@ -367,8 +367,9 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV2STAGE
-      REAL               SLAMCH, CLANHB
-      EXTERNAL           LSAME, SLAMCH, CLANHB, ILAENV2STAGE
+      REAL               SLAMCH, CLANHB, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, CLANHB, ILAENV2STAGE,
+     $                   SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SCOPY, SSCAL, SSTEBZ, SSTERF, XERBLA, CCOPY,
@@ -424,16 +425,16 @@
       IF( INFO.EQ.0 ) THEN
          IF( N.LE.1 ) THEN
             LWMIN = 1
-            WORK( 1 ) = LWMIN
+            WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          ELSE
             IB    = ILAENV2STAGE( 2, 'CHETRD_HB2ST', JOBZ,
      $                            N, KD, -1, -1 )
-            LHTRD = ILAENV2STAGE( 3, 'CHETRD_HB2ST', JOBZ, 
+            LHTRD = ILAENV2STAGE( 3, 'CHETRD_HB2ST', JOBZ,
      $                            N, KD, IB, -1 )
-            LWTRD = ILAENV2STAGE( 4, 'CHETRD_HB2ST', JOBZ, 
+            LWTRD = ILAENV2STAGE( 4, 'CHETRD_HB2ST', JOBZ,
      $                            N, KD, IB, -1 )
             LWMIN = LHTRD + LWTRD
-            WORK( 1 )  = LWMIN
+            WORK( 1 )  = SROUNDUP_LWORK(LWMIN)
          ENDIF
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
@@ -637,7 +638,7 @@
 *
 *     Set WORK(1) to optimal workspace size.
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/chbgvd.f b/lapack-netlib/SRC/chbgvd.f
index c4ad20753..00fb2b5f5 100644
--- a/lapack-netlib/SRC/chbgvd.f
+++ b/lapack-netlib/SRC/chbgvd.f
@@ -46,12 +46,6 @@
 *> and banded, and B is also positive definite.  If eigenvectors are
 *> desired, it uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
@@ -238,7 +232,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHEReigen
+*> \ingroup hbgvd
 *
 *> \par Contributors:
 *  ==================
@@ -281,7 +275,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SSTERF, XERBLA, CGEMM, CHBGST, CHBTRD, CLACPY,
@@ -328,7 +323,7 @@
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          RWORK( 1 ) = LRWMIN
          IWORK( 1 ) = LIWMIN
 *
@@ -394,7 +389,7 @@
          CALL CLACPY( 'A', N, N, WORK( INDWK2 ), N, Z, LDZ )
       END IF
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       RWORK( 1 ) = LRWMIN
       IWORK( 1 ) = LIWMIN
       RETURN
diff --git a/lapack-netlib/SRC/cheev.f b/lapack-netlib/SRC/cheev.f
index fb8e451df..60df7d8b8 100644
--- a/lapack-netlib/SRC/cheev.f
+++ b/lapack-netlib/SRC/cheev.f
@@ -132,7 +132,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEeigen
+*> \ingroup heev
 *
 *  =====================================================================
       SUBROUTINE CHEEV( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, RWORK,
@@ -169,8 +169,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               CLANHE, SLAMCH
-      EXTERNAL           ILAENV, LSAME, CLANHE, SLAMCH
+      REAL               CLANHE, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, CLANHE, SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHETRD, CLASCL, CSTEQR, CUNGTR, SSCAL, SSTERF,
@@ -201,7 +201,7 @@
       IF( INFO.EQ.0 ) THEN
          NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 )
          LWKOPT = MAX( 1, ( NB+1 )*N )
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.MAX( 1, 2*N-1 ) .AND. .NOT.LQUERY )
      $      INFO = -8
@@ -286,7 +286,7 @@
 *
 *     Set WORK(1) to optimal complex workspace size.
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cheev_2stage.f b/lapack-netlib/SRC/cheev_2stage.f
index fb7989d9f..4e1cecc64 100644
--- a/lapack-netlib/SRC/cheev_2stage.f
+++ b/lapack-netlib/SRC/cheev_2stage.f
@@ -106,12 +106,12 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The length of the array WORK. LWORK >= 1, when N <= 1;
-*>          otherwise  
+*>          otherwise
 *>          If JOBZ = 'N' and N > 1, LWORK must be queried.
 *>                                   LWORK = MAX(1, dimension) where
 *>                                   dimension = max(stage1,stage2) + (KD+1)*N + N
-*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
-*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB)
+*>                                               + max(2*KD*KD, KD*NTHREADS)
 *>                                               + (KD+1)*N + N
 *>                                   where KD is the blocking size of the reduction,
 *>                                   FACTOPTNB is the blocking used by the QR or LQ
@@ -149,7 +149,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEeigen
+*> \ingroup heev_2stage
 *
 *> \par Further Details:
 *  =====================
@@ -167,7 +167,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -175,11 +175,11 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *
@@ -220,8 +220,9 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV2STAGE
-      REAL               SLAMCH, CLANHE
-      EXTERNAL           LSAME, SLAMCH, CLANHE, ILAENV2STAGE
+      REAL               SLAMCH, CLANHE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, CLANHE, ILAENV2STAGE,
+     $                   SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SSCAL, SSTERF, XERBLA, CLASCL, CSTEQR,
@@ -255,7 +256,7 @@
          LHTRD = ILAENV2STAGE( 3, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
          LWTRD = ILAENV2STAGE( 4, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
          LWMIN = N + LHTRD + LWTRD
-         WORK( 1 )  = LWMIN
+         WORK( 1 )  = SROUNDUP_LWORK(LWMIN)
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
      $      INFO = -8
@@ -314,7 +315,7 @@
       LLWORK  = LWORK - INDWRK + 1
 *
       CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, RWORK( INDE ),
-     $                    WORK( INDTAU ), WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDTAU ), WORK( INDHOUS ), LHTRD,
      $                    WORK( INDWRK ), LLWORK, IINFO )
 *
 *     For eigenvalues only, call SSTERF.  For eigenvectors, first call
@@ -343,7 +344,7 @@
 *
 *     Set WORK(1) to optimal complex workspace size.
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cheevd.f b/lapack-netlib/SRC/cheevd.f
index 2ddf74b98..9b62a2df6 100644
--- a/lapack-netlib/SRC/cheevd.f
+++ b/lapack-netlib/SRC/cheevd.f
@@ -41,12 +41,6 @@
 *> complex Hermitian matrix A.  If eigenvectors are desired, it uses a
 *> divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
@@ -122,8 +116,7 @@
 *>
 *> \param[out] RWORK
 *> \verbatim
-*>          RWORK is REAL array,
-*>                                         dimension (LRWORK)
+*>          RWORK is REAL array, dimension (MAX(1,LRWORK))
 *>          On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK.
 *> \endverbatim
 *>
@@ -186,7 +179,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEeigen
+*> \ingroup heevd
 *
 *> \par Further Details:
 *  =====================
@@ -236,8 +229,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               CLANHE, SLAMCH
-      EXTERNAL           ILAENV, LSAME, CLANHE, SLAMCH
+      REAL               CLANHE, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, CLANHE, SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHETRD, CLACPY, CLASCL, CSTEDC, CUNMTR, SSCAL,
@@ -288,8 +281,8 @@
             LROPT = LRWMIN
             LIOPT = LIWMIN
          END IF
-         WORK( 1 ) = LOPT
-         RWORK( 1 ) = LROPT
+         WORK( 1 ) = SROUNDUP_LWORK( LOPT )
+         RWORK( 1 ) = SROUNDUP_LWORK( LROPT )
          IWORK( 1 ) = LIOPT
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
@@ -384,8 +377,8 @@
          CALL SSCAL( IMAX, ONE / SIGMA, W, 1 )
       END IF
 *
-      WORK( 1 ) = LOPT
-      RWORK( 1 ) = LROPT
+      WORK( 1 ) = SROUNDUP_LWORK( LOPT )
+      RWORK( 1 ) = SROUNDUP_LWORK( LROPT )
       IWORK( 1 ) = LIOPT
 *
       RETURN
diff --git a/lapack-netlib/SRC/cheevd_2stage.f b/lapack-netlib/SRC/cheevd_2stage.f
index 830e13d30..a0e8843ae 100644
--- a/lapack-netlib/SRC/cheevd_2stage.f
+++ b/lapack-netlib/SRC/cheevd_2stage.f
@@ -46,12 +46,6 @@
 *> the reduction to tridiagonal.  If eigenvectors are desired, it uses a
 *> divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/cheevr.f b/lapack-netlib/SRC/cheevr.f
index b8854b182..ad5c8cd4a 100644
--- a/lapack-netlib/SRC/cheevr.f
+++ b/lapack-netlib/SRC/cheevr.f
@@ -272,7 +272,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of the array WORK.  LWORK >= max(1,2*N).
+*>          The length of the array WORK.
+*>          If N <= 1, LWORK >= 1, else LWORK >= 2*N.
 *>          For optimal efficiency, LWORK >= (NB+1)*N,
 *>          where NB is the max of the blocksize for CHETRD and for
 *>          CUNMTR as returned by ILAENV.
@@ -294,7 +295,8 @@
 *> \param[in] LRWORK
 *> \verbatim
 *>          LRWORK is INTEGER
-*>          The length of the array RWORK.  LRWORK >= max(1,24*N).
+*>          The length of the array RWORK.
+*>          If N <= 1, LRWORK >= 1, else LRWORK >= 24*N.
 *>
 *>          If LRWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal sizes of the WORK, RWORK
@@ -313,7 +315,8 @@
 *> \param[in] LIWORK
 *> \verbatim
 *>          LIWORK is INTEGER
-*>          The dimension of the array IWORK.  LIWORK >= max(1,10*N).
+*>          The dimension of the array IWORK.
+*>          If N <= 1, LIWORK >= 1, else LIWORK >= 10*N.
 *>
 *>          If LIWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal sizes of the WORK, RWORK
@@ -338,7 +341,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEeigen
+*> \ingroup heevr
 *
 *> \par Contributors:
 *  ==================
@@ -392,8 +395,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               CLANSY, SLAMCH
-      EXTERNAL           LSAME, ILAENV, CLANSY, SLAMCH
+      REAL               CLANSY, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, CLANSY, SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHETRD, CSSCAL, CSTEMR, CSTEIN, CSWAP, CUNMTR,
@@ -417,9 +420,15 @@
       LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 ) .OR.
      $         ( LIWORK.EQ.-1 ) )
 *
-      LRWMIN = MAX( 1, 24*N )
-      LIWMIN = MAX( 1, 10*N )
-      LWMIN = MAX( 1, 2*N )
+      IF( N.LE.1 ) THEN
+         LWMIN  = 1
+         LRWMIN = 1
+         LIWMIN = 1
+      ELSE
+         LWMIN  = 2*N
+         LRWMIN = 24*N
+         LIWMIN = 10*N
+      END IF
 *
       INFO = 0
       IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN
@@ -454,8 +463,8 @@
          NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 )
          NB = MAX( NB, ILAENV( 1, 'CUNMTR', UPLO, N, -1, -1, -1 ) )
          LWKOPT = MAX( ( NB+1 )*N, LWMIN )
-         WORK( 1 ) = LWKOPT
-         RWORK( 1 ) = LRWMIN
+         WORK( 1 )  = SROUNDUP_LWORK( LWKOPT )
+         RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN )
          IWORK( 1 ) = LIWMIN
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
@@ -483,7 +492,7 @@
       END IF
 *
       IF( N.EQ.1 ) THEN
-         WORK( 1 ) = 2
+         WORK( 1 ) = 1
          IF( ALLEIG .OR. INDEIG ) THEN
             M = 1
             W( 1 ) = REAL( A( 1, 1 ) )
@@ -710,8 +719,8 @@
 *
 *     Set WORK(1) to optimal workspace size.
 *
-      WORK( 1 ) = LWKOPT
-      RWORK( 1 ) = LRWMIN
+      WORK( 1 )  = SROUNDUP_LWORK( LWKOPT )
+      RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN )
       IWORK( 1 ) = LIWMIN
 *
       RETURN
diff --git a/lapack-netlib/SRC/cheevr_2stage.f b/lapack-netlib/SRC/cheevr_2stage.f
index 0332a09bc..e06925fcd 100644
--- a/lapack-netlib/SRC/cheevr_2stage.f
+++ b/lapack-netlib/SRC/cheevr_2stage.f
@@ -265,7 +265,7 @@
 *>          indicating the nonzero elements in Z. The i-th eigenvector
 *>          is nonzero only in elements ISUPPZ( 2*i-1 ) through
 *>          ISUPPZ( 2*i ). This is an output of CSTEMR (tridiagonal
-*>          matrix). The support of the eigenvectors of A is typically 
+*>          matrix). The support of the eigenvectors of A is typically
 *>          1:N because of the unitary transformations applied by CUNMTR.
 *>          Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1
 *> \endverbatim
@@ -279,12 +279,13 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  
+*>          The dimension of the array WORK.
+*>          If N <= 1,               LWORK must be at least 1.
 *>          If JOBZ = 'N' and N > 1, LWORK must be queried.
 *>                                   LWORK = MAX(1, 26*N, dimension) where
 *>                                   dimension = max(stage1,stage2) + (KD+1)*N + N
-*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
-*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB)
+*>                                               + max(2*KD*KD, KD*NTHREADS)
 *>                                               + (KD+1)*N + N
 *>                                   where KD is the blocking size of the reduction,
 *>                                   FACTOPTNB is the blocking used by the QR or LQ
@@ -310,7 +311,8 @@
 *> \param[in] LRWORK
 *> \verbatim
 *>          LRWORK is INTEGER
-*>          The length of the array RWORK.  LRWORK >= max(1,24*N).
+*>          The length of the array RWORK.
+*>          If N <= 1, LRWORK >= 1, else LRWORK >= 24*N.
 *>
 *>          If LRWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal sizes of the WORK, RWORK
@@ -329,7 +331,8 @@
 *> \param[in] LIWORK
 *> \verbatim
 *>          LIWORK is INTEGER
-*>          The dimension of the array IWORK.  LIWORK >= max(1,10*N).
+*>          The dimension of the array IWORK.
+*>          If N <= 1, LIWORK >= 1, else LIWORK >= 10*N.
 *>
 *>          If LIWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal sizes of the WORK, RWORK
@@ -354,7 +357,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEeigen
+*> \ingroup heevr_2stage
 *
 *> \par Contributors:
 *  ==================
@@ -382,7 +385,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -390,11 +393,11 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *
@@ -443,8 +446,9 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV, ILAENV2STAGE
-      REAL               SLAMCH, CLANSY
-      EXTERNAL           LSAME, SLAMCH, CLANSY, ILAENV, ILAENV2STAGE
+      REAL               SLAMCH, CLANSY, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, CLANSY, ILAENV, ILAENV2STAGE,
+     $                   SROUNDUP_LWORK 
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SCOPY, SSCAL, SSTEBZ, SSTERF, XERBLA, CSSCAL,
@@ -472,9 +476,16 @@
       IB     = ILAENV2STAGE( 2, 'CHETRD_2STAGE', JOBZ, N, KD, -1, -1 )
       LHTRD  = ILAENV2STAGE( 3, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
       LWTRD  = ILAENV2STAGE( 4, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
-      LWMIN  = N + LHTRD + LWTRD
-      LRWMIN = MAX( 1, 24*N )
-      LIWMIN = MAX( 1, 10*N )
+*
+      IF( N.LE.1 ) THEN
+         LWMIN  = 1
+         LRWMIN = 1
+         LIWMIN = 1
+      ELSE
+         LWMIN  = N + LHTRD + LWTRD
+         LRWMIN = 24*N
+         LIWMIN = 10*N
+      END IF
 *
       INFO = 0
       IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
@@ -506,8 +517,8 @@
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         WORK( 1 )  = LWMIN
-         RWORK( 1 ) = LRWMIN
+         WORK( 1 )  = SROUNDUP_LWORK( LWMIN )
+         RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN )
          IWORK( 1 ) = LIWMIN
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
@@ -535,7 +546,7 @@
       END IF
 *
       IF( N.EQ.1 ) THEN
-         WORK( 1 ) = 2
+         WORK( 1 ) = 1
          IF( ALLEIG .OR. INDEIG ) THEN
             M = 1
             W( 1 ) = REAL( A( 1, 1 ) )
@@ -643,9 +654,9 @@
 *
 *     Call CHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form.
 *
-      CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ), 
+      CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ),
      $                    RWORK( INDRE ), WORK( INDTAU ),
-     $                    WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDHOUS ), LHTRD,
      $                    WORK( INDWK ), LLWORK, IINFO )
 *
 *     If all eigenvalues are desired
@@ -666,7 +677,7 @@
             CALL SCOPY( N-1, RWORK( INDRE ), 1, RWORK( INDREE ), 1 )
             CALL SCOPY( N, RWORK( INDRD ), 1, RWORK( INDRDD ), 1 )
 *
-            IF (ABSTOL .LE. TWO*N*EPS) THEN
+            IF ( ABSTOL .LE. TWO*N*EPS ) THEN
                TRYRAC = .TRUE.
             ELSE
                TRYRAC = .FALSE.
@@ -765,8 +776,8 @@
 *
 *     Set WORK(1) to optimal workspace size.
 *
-      WORK( 1 )  = LWMIN
-      RWORK( 1 ) = LRWMIN
+      WORK( 1 )  = SROUNDUP_LWORK( LWMIN )
+      RWORK( 1 ) = SROUNDUP_LWORK( LRWMIN )
       IWORK( 1 ) = LIWMIN
 *
       RETURN
diff --git a/lapack-netlib/SRC/cheevx.f b/lapack-netlib/SRC/cheevx.f
index 1cec902aa..a8a2bde63 100644
--- a/lapack-netlib/SRC/cheevx.f
+++ b/lapack-netlib/SRC/cheevx.f
@@ -250,7 +250,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEeigen
+*> \ingroup heevx
 *
 *  =====================================================================
       SUBROUTINE CHEEVX( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU, IL, IU,
@@ -294,8 +294,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, CLANHE
-      EXTERNAL           LSAME, ILAENV, SLAMCH, CLANHE
+      REAL               SLAMCH, CLANHE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SLAMCH, CLANHE, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SCOPY, SSCAL, SSTEBZ, SSTERF, XERBLA, CSSCAL,
@@ -348,14 +348,14 @@
       IF( INFO.EQ.0 ) THEN
          IF( N.LE.1 ) THEN
             LWKMIN = 1
-            WORK( 1 ) = LWKMIN
+            LWKOPT = 1
          ELSE
             LWKMIN = 2*N
             NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 )
             NB = MAX( NB, ILAENV( 1, 'CUNMTR', UPLO, N, -1, -1, -1 ) )
-            LWKOPT = MAX( 1, ( NB + 1 )*N )
-            WORK( 1 ) = LWKOPT
+            LWKOPT = ( NB + 1 )*N
          END IF
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
          IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY )
      $      INFO = -17
@@ -552,7 +552,7 @@
 *
 *     Set WORK(1) to optimal complex workspace size.
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cheevx_2stage.f b/lapack-netlib/SRC/cheevx_2stage.f
index 04f6e30e8..70a681ec4 100644
--- a/lapack-netlib/SRC/cheevx_2stage.f
+++ b/lapack-netlib/SRC/cheevx_2stage.f
@@ -209,12 +209,12 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The length of the array WORK. LWORK >= 1, when N <= 1;
-*>          otherwise  
+*>          otherwise
 *>          If JOBZ = 'N' and N > 1, LWORK must be queried.
 *>                                   LWORK = MAX(1, 8*N, dimension) where
 *>                                   dimension = max(stage1,stage2) + (KD+1)*N + N
-*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
-*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB)
+*>                                               + max(2*KD*KD, KD*NTHREADS)
 *>                                               + (KD+1)*N + N
 *>                                   where KD is the blocking size of the reduction,
 *>                                   FACTOPTNB is the blocking used by the QR or LQ
@@ -265,7 +265,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEeigen
+*> \ingroup heevx_2stage
 *
 *> \par Further Details:
 *  =====================
@@ -283,7 +283,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -291,11 +291,11 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *
@@ -335,7 +335,7 @@
       CHARACTER          ORDER
       INTEGER            I, IINFO, IMAX, INDD, INDE, INDEE, INDIBL,
      $                   INDISP, INDIWK, INDRWK, INDTAU, INDWRK, ISCALE,
-     $                   ITMP1, J, JJ, LLWORK, 
+     $                   ITMP1, J, JJ, LLWORK,
      $                   NSPLIT, LWMIN, LHTRD, LWTRD, KD, IB, INDHOUS
       REAL               ABSTLL, ANRM, BIGNUM, EPS, RMAX, RMIN, SAFMIN,
      $                   SIGMA, SMLNUM, TMP1, VLL, VUU
@@ -343,8 +343,9 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV2STAGE
-      REAL               SLAMCH, CLANHE
-      EXTERNAL           LSAME, SLAMCH, CLANHE, ILAENV2STAGE
+      REAL               SLAMCH, CLANHE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, CLANHE, ILAENV2STAGE,
+     $                   SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SCOPY, SSCAL, SSTEBZ, SSTERF, XERBLA, CSSCAL,
@@ -397,7 +398,7 @@
       IF( INFO.EQ.0 ) THEN
          IF( N.LE.1 ) THEN
             LWMIN = 1
-            WORK( 1 ) = LWMIN
+            WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          ELSE
             KD    = ILAENV2STAGE( 1, 'CHETRD_2STAGE', JOBZ,
      $                            N, -1, -1, -1 )
@@ -408,7 +409,7 @@
             LWTRD = ILAENV2STAGE( 4, 'CHETRD_2STAGE', JOBZ,
      $                            N, KD, IB, -1 )
             LWMIN = N + LHTRD + LWTRD
-            WORK( 1 )  = LWMIN
+            WORK( 1 )  = SROUNDUP_LWORK(LWMIN)
          END IF
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
@@ -499,7 +500,7 @@
       LLWORK  = LWORK - INDWRK + 1
 *
       CALL CHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDD ),
-     $                    RWORK( INDE ), WORK( INDTAU ), 
+     $                    RWORK( INDE ), WORK( INDTAU ),
      $                    WORK( INDHOUS ), LHTRD, WORK( INDWRK ),
      $                    LLWORK, IINFO )
 *
@@ -610,7 +611,7 @@
 *
 *     Set WORK(1) to optimal complex workspace size.
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/chegv.f b/lapack-netlib/SRC/chegv.f
index f7675a19f..53f9d5196 100644
--- a/lapack-netlib/SRC/chegv.f
+++ b/lapack-netlib/SRC/chegv.f
@@ -160,7 +160,7 @@
 *>                    i off-diagonal elements of an intermediate
 *>                    tridiagonal form did not converge to zero;
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
@@ -173,7 +173,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEeigen
+*> \ingroup hegv
 *
 *  =====================================================================
       SUBROUTINE CHEGV( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W, WORK,
@@ -206,7 +206,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           ILAENV, LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHEEV, CHEGST, CPOTRF, CTRMM, CTRSM, XERBLA
@@ -240,7 +241,7 @@
       IF( INFO.EQ.0 ) THEN
          NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 )
          LWKOPT = MAX( 1, ( NB + 1 )*N )
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.MAX( 1, 2*N-1 ) .AND. .NOT.LQUERY ) THEN
             INFO = -11
@@ -309,7 +310,7 @@
          END IF
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/chegv_2stage.f b/lapack-netlib/SRC/chegv_2stage.f
index 472581c4b..8de1f7f06 100644
--- a/lapack-netlib/SRC/chegv_2stage.f
+++ b/lapack-netlib/SRC/chegv_2stage.f
@@ -144,12 +144,12 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The length of the array WORK. LWORK >= 1, when N <= 1;
-*>          otherwise  
+*>          otherwise
 *>          If JOBZ = 'N' and N > 1, LWORK must be queried.
 *>                                   LWORK = MAX(1, dimension) where
 *>                                   dimension = max(stage1,stage2) + (KD+1)*N + N
-*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
-*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB)
+*>                                               + max(2*KD*KD, KD*NTHREADS)
 *>                                               + (KD+1)*N + N
 *>                                   where KD is the blocking size of the reduction,
 *>                                   FACTOPTNB is the blocking used by the QR or LQ
@@ -179,7 +179,7 @@
 *>                    i off-diagonal elements of an intermediate
 *>                    tridiagonal form did not converge to zero;
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
@@ -192,7 +192,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEeigen
+*> \ingroup hegv_2stage
 *
 *> \par Further Details:
 *  =====================
@@ -210,7 +210,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -218,11 +218,11 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *
@@ -259,7 +259,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV2STAGE
-      EXTERNAL           LSAME, ILAENV2STAGE
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV2STAGE, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, CHEGST, CPOTRF, CTRMM, CTRSM,
@@ -297,7 +298,7 @@
          LHTRD = ILAENV2STAGE( 3, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
          LWTRD = ILAENV2STAGE( 4, 'CHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
          LWMIN = N + LHTRD + LWTRD
-         WORK( 1 )  = LWMIN
+         WORK( 1 )  = SROUNDUP_LWORK(LWMIN)
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
             INFO = -11
@@ -327,7 +328,7 @@
 *     Transform problem to standard eigenvalue problem and solve.
 *
       CALL CHEGST( ITYPE, UPLO, N, A, LDA, B, LDB, INFO )
-      CALL CHEEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, 
+      CALL CHEEV_2STAGE( JOBZ, UPLO, N, A, LDA, W,
      $                   WORK, LWORK, RWORK, INFO )
 *
       IF( WANTZ ) THEN
@@ -367,7 +368,7 @@
          END IF
       END IF
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/chegvd.f b/lapack-netlib/SRC/chegvd.f
index 4b7f43d52..d2dc941e6 100644
--- a/lapack-netlib/SRC/chegvd.f
+++ b/lapack-netlib/SRC/chegvd.f
@@ -43,12 +43,6 @@
 *> B are assumed to be Hermitian and B is also positive definite.
 *> If eigenvectors are desired, it uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
@@ -212,7 +206,7 @@
 *>                    the submatrix lying in rows and columns INFO/(N+1)
 *>                    through mod(INFO,N+1);
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
@@ -225,7 +219,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEeigen
+*> \ingroup hegvd
 *
 *> \par Further Details:
 *  =====================
@@ -274,7 +268,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHEEVD, CHEGST, CPOTRF, CTRMM, CTRSM, XERBLA
@@ -322,7 +317,7 @@
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         WORK( 1 ) = LOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LOPT)
          RWORK( 1 ) = LROPT
          IWORK( 1 ) = LIOPT
 *
@@ -398,7 +393,7 @@
          END IF
       END IF
 *
-      WORK( 1 ) = LOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LOPT)
       RWORK( 1 ) = LROPT
       IWORK( 1 ) = LIOPT
 *
diff --git a/lapack-netlib/SRC/chegvx.f b/lapack-netlib/SRC/chegvx.f
index 6e428242d..172d0571e 100644
--- a/lapack-netlib/SRC/chegvx.f
+++ b/lapack-netlib/SRC/chegvx.f
@@ -280,7 +280,7 @@
 *>                    i eigenvectors failed to converge.  Their indices
 *>                    are stored in array IFAIL.
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
@@ -293,7 +293,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEeigen
+*> \ingroup hegvx
 *
 *> \par Contributors:
 *  ==================
@@ -335,7 +335,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           ILAENV, LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHEEVX, CHEGST, CPOTRF, CTRMM, CTRSM, XERBLA
@@ -390,7 +391,7 @@
       IF( INFO.EQ.0 ) THEN
          NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 )
          LWKOPT = MAX( 1, ( NB + 1 )*N )
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN
             INFO = -20
@@ -464,7 +465,7 @@
 *
 *     Set WORK(1) to optimal complex workspace size.
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/chesv.f b/lapack-netlib/SRC/chesv.f
index 238fb0a94..cea1235b7 100644
--- a/lapack-netlib/SRC/chesv.f
+++ b/lapack-netlib/SRC/chesv.f
@@ -163,7 +163,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEsolve
+*> \ingroup hesv
 *
 *  =====================================================================
       SUBROUTINE CHESV( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK,
@@ -191,7 +191,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, CHETRF, CHETRS, CHETRS2
@@ -226,7 +227,7 @@
             NB = ILAENV( 1, 'CHETRF', UPLO, N, -1, -1, -1 )
             LWKOPT = N*NB
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -259,7 +260,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/chesv_aa.f b/lapack-netlib/SRC/chesv_aa.f
index c9b97e09c..0f41c9332 100644
--- a/lapack-netlib/SRC/chesv_aa.f
+++ b/lapack-netlib/SRC/chesv_aa.f
@@ -154,7 +154,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEsolve
+*> \ingroup hesv_aa
 *
 *  =====================================================================
       SUBROUTINE CHESV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK,
@@ -177,12 +177,13 @@
 *
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            LWKOPT, LWKOPT_HETRF, LWKOPT_HETRS
+      INTEGER            LWKMIN, LWKOPT, LWKOPT_HETRF, LWKOPT_HETRS
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, CHETRF_AA, CHETRS_AA
@@ -196,6 +197,7 @@
 *
       INFO = 0
       LQUERY = ( LWORK.EQ.-1 )
+      LWKMIN = MAX( 1, 2*N, 3*N-2 )
       IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
@@ -206,18 +208,18 @@
          INFO = -5
       ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
          INFO = -8
-      ELSE IF( LWORK.LT.MAX( 2*N, 3*N-2 ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -10
       END IF
 *
       IF( INFO.EQ.0 ) THEN
          CALL CHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, -1, INFO )
-         LWKOPT_HETRF = INT( WORK(1) )
+         LWKOPT_HETRF = INT( WORK( 1 ) )
          CALL CHETRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK,
      $                   -1, INFO )
-         LWKOPT_HETRS = INT( WORK(1) )
-         LWKOPT = MAX( LWKOPT_HETRF, LWKOPT_HETRS )
-         WORK( 1 ) = LWKOPT
+         LWKOPT_HETRS = INT( WORK( 1 ) )
+         LWKOPT = MAX( LWKMIN, LWKOPT_HETRF, LWKOPT_HETRS )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -239,7 +241,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/chesv_aa_2stage.f b/lapack-netlib/SRC/chesv_aa_2stage.f
index e474c6fad..05ebd9253 100644
--- a/lapack-netlib/SRC/chesv_aa_2stage.f
+++ b/lapack-netlib/SRC/chesv_aa_2stage.f
@@ -87,7 +87,7 @@
 *>          triangular part of the matrix A, and the strictly upper
 *>          triangular part of A is not referenced.
 *>
-*>          On exit, L is stored below (or above) the subdiaonal blocks,
+*>          On exit, L is stored below (or above) the subdiagonal blocks,
 *>          when UPLO  is 'L' (or 'U').
 *> \endverbatim
 *>
@@ -99,14 +99,14 @@
 *>
 *> \param[out] TB
 *> \verbatim
-*>          TB is COMPLEX array, dimension (LTB)
+*>          TB is COMPLEX array, dimension (MAX(1,LTB)).
 *>          On exit, details of the LU factorization of the band matrix.
 *> \endverbatim
 *>
 *> \param[in] LTB
 *> \verbatim
 *>          LTB is INTEGER
-*>          The size of the array TB. LTB >= 4*N, internally
+*>          The size of the array TB. LTB >= MAX(1,4*N), internally
 *>          used to select NB such that LTB >= (3*NB+1)*N.
 *>
 *>          If LTB = -1, then a workspace query is assumed; the
@@ -146,14 +146,15 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX workspace of size LWORK
+*>          WORK is COMPLEX workspace of size (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The size of WORK. LWORK >= N, internally used to select NB
-*>          such that LWORK >= N*NB.
+*>          The size of WORK. LWORK >= MAX(1,N), internally used to
+*>          select NB such that LWORK >= N*NB.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal size of the WORK array,
@@ -177,7 +178,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexSYcomputational
+*> \ingroup hesv_aa_2stage
 *
 *  =====================================================================
       SUBROUTINE CHESV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB,
@@ -203,11 +204,12 @@
 *
 *     .. Local Scalars ..
       LOGICAL            UPPER, TQUERY, WQUERY
-      INTEGER            LWKOPT
+      INTEGER            LWKMIN, LWKOPT
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHETRF_AA_2STAGE, CHETRS_AA_2STAGE,
@@ -224,6 +226,7 @@
       UPPER = LSAME( UPLO, 'U' )
       WQUERY = ( LWORK.EQ.-1 )
       TQUERY = ( LTB.EQ.-1 )
+      LWKMIN = MAX( 1, N )
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
@@ -232,18 +235,19 @@
          INFO = -3
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -5
-      ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN
+      ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN
          INFO = -7
       ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
          INFO = -11
-      ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.WQUERY ) THEN
          INFO = -13
       END IF
 *
       IF( INFO.EQ.0 ) THEN
          CALL CHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV,
      $                          IPIV2, WORK, -1, INFO )
-         LWKOPT = INT( WORK(1) )
+         LWKOPT = MAX( LWKMIN, INT( WORK( 1 ) ) )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -253,7 +257,6 @@
          RETURN
       END IF
 *
-*
 *     Compute the factorization A = U**H*T*U or A = L*T*L**H.
 *
       CALL CHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2,
@@ -267,7 +270,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/chesv_rk.f b/lapack-netlib/SRC/chesv_rk.f
index e123fa299..268a55e23 100644
--- a/lapack-netlib/SRC/chesv_rk.f
+++ b/lapack-netlib/SRC/chesv_rk.f
@@ -205,7 +205,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEsolve
+*> \ingroup hesv_rk
 *
 *> \par Contributors:
 *  ==================
@@ -247,7 +247,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, CHETRF_RK, CHETRS_3
@@ -282,7 +283,7 @@
             CALL CHETRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO )
             LWKOPT = INT( WORK( 1 ) )
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -304,7 +305,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/chesv_rook.f b/lapack-netlib/SRC/chesv_rook.f
index 8e0b1a88f..2a0d3fdaf 100644
--- a/lapack-netlib/SRC/chesv_rook.f
+++ b/lapack-netlib/SRC/chesv_rook.f
@@ -184,7 +184,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEsolve
+*> \ingroup hesv_rook
 *>
 *> \verbatim
 *>
@@ -225,7 +225,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, CHETRF_ROOK, CHETRS_ROOK
@@ -260,7 +261,7 @@
             NB = ILAENV( 1, 'CHETRF_ROOK', UPLO, N, -1, -1, -1 )
             LWKOPT = N*NB
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -283,7 +284,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/chesvx.f b/lapack-netlib/SRC/chesvx.f
index 6da49bdcf..bdaad55ec 100644
--- a/lapack-netlib/SRC/chesvx.f
+++ b/lapack-netlib/SRC/chesvx.f
@@ -276,7 +276,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEsolve
+*> \ingroup hesvx
 *
 *  =====================================================================
       SUBROUTINE CHESVX( FACT, UPLO, N, NRHS, A, LDA, AF, LDAF, IPIV, B,
@@ -307,14 +307,14 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY, NOFACT
-      INTEGER            LWKOPT, NB
+      INTEGER            LWKMIN, LWKOPT, NB
       REAL               ANORM
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               CLANHE, SLAMCH
-      EXTERNAL           ILAENV, LSAME, CLANHE, SLAMCH
+      REAL               CLANHE, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, CLANHE, SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHECON, CHERFS, CHETRF, CHETRS, CLACPY, XERBLA
@@ -329,6 +329,7 @@
       INFO = 0
       NOFACT = LSAME( FACT, 'N' )
       LQUERY = ( LWORK.EQ.-1 )
+      LWKMIN = MAX( 1, 2*N )
       IF( .NOT.NOFACT .AND. .NOT.LSAME( FACT, 'F' ) ) THEN
          INFO = -1
       ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) )
@@ -346,17 +347,17 @@
          INFO = -11
       ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
          INFO = -13
-      ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -18
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         LWKOPT = MAX( 1, 2*N )
+         LWKOPT = LWKMIN
          IF( NOFACT ) THEN
             NB = ILAENV( 1, 'CHETRF', UPLO, N, -1, -1, -1 )
             LWKOPT = MAX( LWKOPT, N*NB )
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -405,7 +406,7 @@
       IF( RCOND.LT.SLAMCH( 'Epsilon' ) )
      $   INFO = N + 1
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/chetf2_rk.f b/lapack-netlib/SRC/chetf2_rk.f
index 4bb032382..e687ec64a 100644
--- a/lapack-netlib/SRC/chetf2_rk.f
+++ b/lapack-netlib/SRC/chetf2_rk.f
@@ -480,7 +480,7 @@
                   A( J, K ) = CONJG( A( P, J ) )
                   A( P, J ) = T
    14          CONTINUE
-*              (3) Swap and conjugate corner elements at row-col interserction
+*              (3) Swap and conjugate corner elements at row-col intersection
                A( P, K ) = CONJG( A( P, K ) )
 *              (4) Swap diagonal elements at row-col intersection
                R1 = REAL( A( K, K ) )
@@ -508,7 +508,7 @@
                   A( J, KK ) = CONJG( A( KP, J ) )
                   A( KP, J ) = T
    15          CONTINUE
-*              (3) Swap and conjugate corner elements at row-col interserction
+*              (3) Swap and conjugate corner elements at row-col intersection
                A( KP, KK ) = CONJG( A( KP, KK ) )
 *              (4) Swap diagonal elements at row-col intersection
                R1 = REAL( A( KK, KK ) )
@@ -834,7 +834,7 @@
                   A( J, K ) = CONJG( A( P, J ) )
                   A( P, J ) = T
    44          CONTINUE
-*              (3) Swap and conjugate corner elements at row-col interserction
+*              (3) Swap and conjugate corner elements at row-col intersection
                A( P, K ) = CONJG( A( P, K ) )
 *              (4) Swap diagonal elements at row-col intersection
                R1 = REAL( A( K, K ) )
@@ -862,7 +862,7 @@
                   A( J, KK ) = CONJG( A( KP, J ) )
                   A( KP, J ) = T
    45          CONTINUE
-*              (3) Swap and conjugate corner elements at row-col interserction
+*              (3) Swap and conjugate corner elements at row-col intersection
                A( KP, KK ) = CONJG( A( KP, KK ) )
 *              (4) Swap diagonal elements at row-col intersection
                R1 = REAL( A( KK, KK ) )
diff --git a/lapack-netlib/SRC/chetf2_rook.f b/lapack-netlib/SRC/chetf2_rook.f
index ee4eaf68f..49fba1bda 100644
--- a/lapack-netlib/SRC/chetf2_rook.f
+++ b/lapack-netlib/SRC/chetf2_rook.f
@@ -420,7 +420,7 @@
                   A( J, K ) = CONJG( A( P, J ) )
                   A( P, J ) = T
    14          CONTINUE
-*              (3) Swap and conjugate corner elements at row-col interserction
+*              (3) Swap and conjugate corner elements at row-col intersection
                A( P, K ) = CONJG( A( P, K ) )
 *              (4) Swap diagonal elements at row-col intersection
                R1 = REAL( A( K, K ) )
@@ -441,7 +441,7 @@
                   A( J, KK ) = CONJG( A( KP, J ) )
                   A( KP, J ) = T
    15          CONTINUE
-*              (3) Swap and conjugate corner elements at row-col interserction
+*              (3) Swap and conjugate corner elements at row-col intersection
                A( KP, KK ) = CONJG( A( KP, KK ) )
 *              (4) Swap diagonal elements at row-col intersection
                R1 = REAL( A( KK, KK ) )
@@ -733,7 +733,7 @@
                   A( J, K ) = CONJG( A( P, J ) )
                   A( P, J ) = T
    44          CONTINUE
-*              (3) Swap and conjugate corner elements at row-col interserction
+*              (3) Swap and conjugate corner elements at row-col intersection
                A( P, K ) = CONJG( A( P, K ) )
 *              (4) Swap diagonal elements at row-col intersection
                R1 = REAL( A( K, K ) )
@@ -754,7 +754,7 @@
                   A( J, KK ) = CONJG( A( KP, J ) )
                   A( KP, J ) = T
    45          CONTINUE
-*              (3) Swap and conjugate corner elements at row-col interserction
+*              (3) Swap and conjugate corner elements at row-col intersection
                A( KP, KK ) = CONJG( A( KP, KK ) )
 *              (4) Swap diagonal elements at row-col intersection
                R1 = REAL( A( KK, KK ) )
diff --git a/lapack-netlib/SRC/chetrd_2stage.f b/lapack-netlib/SRC/chetrd_2stage.f
index f5ad35f27..ec7075798 100644
--- a/lapack-netlib/SRC/chetrd_2stage.f
+++ b/lapack-netlib/SRC/chetrd_2stage.f
@@ -4,23 +4,23 @@
 *
 *  =========== DOCUMENTATION ===========
 *
-* Online html documentation available at 
-*            http://www.netlib.org/lapack/explore-html/ 
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
-*> Download CHETRD_2STAGE + dependencies 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/chetrd_2stage.f"> 
-*> [TGZ]</a> 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/chetrd_2stage.f"> 
-*> [ZIP]</a> 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/chetrd_2stage.f"> 
+*> Download CHETRD_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/chetrd_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/chetrd_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/chetrd_2stage.f">
 *> [TXT]</a>
-*> \endhtmlonly 
+*> \endhtmlonly
 *
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, 
+*       SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU,
 *                                 HOUS2, LHOUS2, WORK, LWORK, INFO )
 *
 *       IMPLICIT NONE
@@ -34,7 +34,7 @@
 *       COMPLEX            A( LDA, * ), TAU( * ),
 *                          HOUS2( * ), WORK( * )
 *       ..
-*  
+*
 *
 *> \par Purpose:
 *  =============
@@ -52,11 +52,11 @@
 *> \param[in] VECT
 *> \verbatim
 *>          VECT is CHARACTER*1
-*>          = 'N':  No need for the Housholder representation, 
+*>          = 'N':  No need for the Householder representation,
 *>                  in particular for the second stage (Band to
 *>                  tridiagonal) and thus LHOUS2 is of size max(1, 4*N);
-*>          = 'V':  the Householder representation is needed to 
-*>                  either generate Q1 Q2 or to apply Q1 Q2, 
+*>          = 'V':  the Householder representation is needed to
+*>                  either generate Q1 Q2 or to apply Q1 Q2,
 *>                  then LHOUS2 is to be queried and computed.
 *>                  (NOT AVAILABLE IN THIS RELEASE).
 *> \endverbatim
@@ -86,7 +86,7 @@
 *>          triangular part of A is not referenced.
 *>          On exit, if UPLO = 'U', the band superdiagonal
 *>          of A are overwritten by the corresponding elements of the
-*>          internal band-diagonal matrix AB, and the elements above 
+*>          internal band-diagonal matrix AB, and the elements above
 *>          the KD superdiagonal, with the array TAU, represent the unitary
 *>          matrix Q1 as a product of elementary reflectors; if UPLO
 *>          = 'L', the diagonal and band subdiagonal of A are over-
@@ -117,13 +117,13 @@
 *> \param[out] TAU
 *> \verbatim
 *>          TAU is COMPLEX array, dimension (N-KD)
-*>          The scalar factors of the elementary reflectors of 
+*>          The scalar factors of the elementary reflectors of
 *>          the first stage (see Further Details).
 *> \endverbatim
 *>
 *> \param[out] HOUS2
 *> \verbatim
-*>          HOUS2 is COMPLEX array, dimension (LHOUS2)
+*>          HOUS2 is COMPLEX array, dimension (MAX(1,LHOUS2))
 *>          Stores the Householder representation of the stage2
 *>          band to tridiagonal.
 *> \endverbatim
@@ -132,6 +132,8 @@
 *> \verbatim
 *>          LHOUS2 is INTEGER
 *>          The dimension of the array HOUS2.
+*>          LHOUS2 >= 1.
+*>
 *>          If LWORK = -1, or LHOUS2=-1,
 *>          then a query is assumed; the routine
 *>          only calculates the optimal size of the HOUS2 array, returns
@@ -143,13 +145,16 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX array, dimension (LWORK)
+*>          WORK is COMPLEX array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK. LWORK = MAX(1, dimension)
+*>          The dimension of the array WORK.
+*>          If N = 0, LWORK >= 1, else LWORK = MAX(1, dimension).
+*>
 *>          If LWORK = -1, or LHOUS2 = -1,
 *>          then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -157,9 +162,9 @@
 *>          message related to LWORK is issued by XERBLA.
 *>          LWORK = MAX(1, dimension) where
 *>          dimension   = max(stage1,stage2) + (KD+1)*N
-*>                      = N*KD + N*max(KD+1,FACTOPTNB) 
-*>                        + max(2*KD*KD, KD*NTHREADS) 
-*>                        + (KD+1)*N 
+*>                      = N*KD + N*max(KD+1,FACTOPTNB)
+*>                        + max(2*KD*KD, KD*NTHREADS)
+*>                        + (KD+1)*N
 *>          where KD is the blocking size of the reduction,
 *>          FACTOPTNB is the blocking used by the QR or LQ
 *>          algorithm, usually FACTOPTNB=128 is a good choice
@@ -177,12 +182,12 @@
 *  Authors:
 *  ========
 *
-*> \author Univ. of Tennessee 
-*> \author Univ. of California Berkeley 
-*> \author Univ. of Colorado Denver 
-*> \author NAG Ltd. 
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
 *
-*> \ingroup complexHEcomputational
+*> \ingroup hetrd_2stage
 *
 *> \par Further Details:
 *  =====================
@@ -202,7 +207,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -210,16 +215,16 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *>
 *  =====================================================================
-      SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, 
+      SUBROUTINE CHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU,
      $                          HOUS2, LHOUS2, WORK, LWORK, INFO )
 *
       IMPLICIT NONE
@@ -250,7 +255,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV2STAGE
-      EXTERNAL           LSAME, ILAENV2STAGE
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV2STAGE, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -265,10 +271,13 @@
 *
       KD     = ILAENV2STAGE( 1, 'CHETRD_2STAGE', VECT, N, -1, -1, -1 )
       IB     = ILAENV2STAGE( 2, 'CHETRD_2STAGE', VECT, N, KD, -1, -1 )
-      LHMIN  = ILAENV2STAGE( 3, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 )
-      LWMIN  = ILAENV2STAGE( 4, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 )
-*      WRITE(*,*),'CHETRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO,
-*     $            LHMIN, LWMIN
+      IF( N.EQ.0 ) THEN
+         LHMIN = 1
+         LWMIN = 1
+      ELSE
+         LHMIN = ILAENV2STAGE( 3, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 )
+         LWMIN = ILAENV2STAGE( 4, 'CHETRD_2STAGE', VECT, N, KD, IB, -1 )
+      END IF
 *
       IF( .NOT.LSAME( VECT, 'N' ) ) THEN
          INFO = -1
@@ -285,8 +294,8 @@
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         HOUS2( 1 ) = LHMIN
-         WORK( 1 )  = LWMIN
+         HOUS2( 1 ) = SROUNDUP_LWORK( LHMIN )
+         WORK( 1 )  = SROUNDUP_LWORK( LWMIN )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -309,14 +318,14 @@
       LWRK  = LWORK-LDAB*N
       ABPOS = 1
       WPOS  = ABPOS + LDAB*N
-      CALL CHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, 
+      CALL CHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB,
      $                   TAU, WORK( WPOS ), LWRK, INFO )
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'CHETRD_HE2HB', -INFO )
          RETURN
       END IF
-      CALL CHETRD_HB2ST( 'Y', VECT, UPLO, N, KD, 
-     $                   WORK( ABPOS ), LDAB, D, E, 
+      CALL CHETRD_HB2ST( 'Y', VECT, UPLO, N, KD,
+     $                   WORK( ABPOS ), LDAB, D, E,
      $                   HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO )
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'CHETRD_HB2ST', -INFO )
@@ -324,8 +333,7 @@
       END IF
 *
 *
-      HOUS2( 1 ) = LHMIN
-      WORK( 1 )  = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       RETURN
 *
 *     End of CHETRD_2STAGE
diff --git a/lapack-netlib/SRC/chetrd_hb2st.F b/lapack-netlib/SRC/chetrd_hb2st.F
index e047ab720..b0d3e45fb 100644
--- a/lapack-netlib/SRC/chetrd_hb2st.F
+++ b/lapack-netlib/SRC/chetrd_hb2st.F
@@ -18,7 +18,7 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE CHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, 
+*       SUBROUTINE CHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB,
 *                               D, E, HOUS, LHOUS, WORK, LWORK, INFO )
 *
 *       #if defined(_OPENMP)
@@ -53,12 +53,12 @@
 *> \param[in] STAGE1
 *> \verbatim
 *>          STAGE1 is CHARACTER*1
-*>          = 'N':  "No": to mention that the stage 1 of the reduction  
+*>          = 'N':  "No": to mention that the stage 1 of the reduction
 *>                  from dense to band using the chetrd_he2hb routine
-*>                  was not called before this routine to reproduce AB. 
-*>                  In other term this routine is called as standalone. 
-*>          = 'Y':  "Yes": to mention that the stage 1 of the 
-*>                  reduction from dense to band using the chetrd_he2hb 
+*>                  was not called before this routine to reproduce AB.
+*>                  In other words, this routine is called standalone.
+*>          = 'Y':  "Yes": to mention that the stage 1 of the
+*>                  reduction from dense to band using the chetrd_he2hb
 *>                  routine has been called to produce AB (e.g., AB is
 *>                  the output of chetrd_he2hb.
 *> \endverbatim
@@ -66,10 +66,10 @@
 *> \param[in] VECT
 *> \verbatim
 *>          VECT is CHARACTER*1
-*>          = 'N':  No need for the Housholder representation, 
+*>          = 'N':  No need for the Householder representation,
 *>                  and thus LHOUS is of size max(1, 4*N);
-*>          = 'V':  the Householder representation is needed to 
-*>                  either generate or to apply Q later on, 
+*>          = 'V':  the Householder representation is needed to
+*>                  either generate or to apply Q later on,
 *>                  then LHOUS is to be queried and computed.
 *>                  (NOT AVAILABLE IN THIS RELEASE).
 *> \endverbatim
@@ -132,34 +132,39 @@
 *>
 *> \param[out] HOUS
 *> \verbatim
-*>          HOUS is COMPLEX array, dimension LHOUS, that
-*>          store the Householder representation.
+*>          HOUS is COMPLEX array, dimension (MAX(1,LHOUS))
+*>          Stores the Householder representation.
 *> \endverbatim
 *>
 *> \param[in] LHOUS
 *> \verbatim
 *>          LHOUS is INTEGER
-*>          The dimension of the array HOUS. LHOUS = MAX(1, dimension)
-*>          If LWORK = -1, or LHOUS=-1,
+*>          The dimension of the array HOUS.
+*>          If N = 0 or KD <= 1, LHOUS >= 1, else LHOUS = MAX(1, dimension).
+*>
+*>          If LWORK = -1, or LHOUS = -1,
 *>          then a query is assumed; the routine
 *>          only calculates the optimal size of the HOUS array, returns
 *>          this value as the first entry of the HOUS array, and no error
 *>          message related to LHOUS is issued by XERBLA.
 *>          LHOUS = MAX(1, dimension) where
 *>          dimension = 4*N if VECT='N'
-*>          not available now if VECT='H'     
+*>          not available now if VECT='V'
 *> \endverbatim
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX array, dimension LWORK.
+*>          WORK is COMPLEX array, dimension (MAX(1,LWORK)).
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK. LWORK = MAX(1, dimension)
-*>          If LWORK = -1, or LHOUS=-1,
+*>          The dimension of the array WORK.
+*>          If N = 0 or KD <= 1, LWORK >= 1, else LWORK = MAX(1, dimension).
+*>
+*>          If LWORK = -1, or LHOUS = -1,
 *>          then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
@@ -188,7 +193,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup hetrd_hb2st
 *
 *> \par Further Details:
 *  =====================
@@ -208,7 +213,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -216,16 +221,16 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *>
 *  =====================================================================
-      SUBROUTINE CHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, 
+      SUBROUTINE CHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB,
      $                         D, E, HOUS, LHOUS, WORK, LWORK, INFO )
 *
 *
@@ -259,11 +264,11 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY, WANTQ, UPPER, AFTERS1
-      INTEGER            I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, 
+      INTEGER            I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST,
      $                   ED, STIND, EDIND, BLKLASTIND, COLPT, THED,
      $                   STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID,
-     $                   NBTILES, TTYPE, TID, NTHREADS, DEBUG,
-     $                   ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, 
+     $                   NBTILES, TTYPE, TID, NTHREADS,
+     $                   ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS,
      $                   INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU,
      $                   SICEV, SIZETAU, LDV, LHMIN, LWMIN
       REAL               ABSTMP
@@ -277,15 +282,15 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      INTEGER            ILAENV2STAGE 
-      EXTERNAL           LSAME, ILAENV2STAGE
+      INTEGER            ILAENV2STAGE
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV2STAGE, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
 *     Determine the minimal workspace size required.
 *     Test the input parameters
 *
-      DEBUG   = 0
       INFO    = 0
       AFTERS1 = LSAME( STAGE1, 'Y' )
       WANTQ   = LSAME( VECT, 'V' )
@@ -294,9 +299,14 @@
 *
 *     Determine the block size, the workspace size and the hous size.
 *
-      IB     = ILAENV2STAGE( 2, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 )
-      LHMIN  = ILAENV2STAGE( 3, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 )
-      LWMIN  = ILAENV2STAGE( 4, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 )
+      IB       = ILAENV2STAGE( 2, 'CHETRD_HB2ST', VECT, N, KD, -1, -1 )
+      IF( N.EQ.0 .OR. KD.LE.1 ) THEN
+         LHMIN = 1
+         LWMIN = 1
+      ELSE
+         LHMIN = ILAENV2STAGE( 3, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 )
+         LWMIN = ILAENV2STAGE( 4, 'CHETRD_HB2ST', VECT, N, KD, IB, -1 )
+      END IF
 *
       IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
          INFO = -1
@@ -317,8 +327,8 @@
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         HOUS( 1 ) = LHMIN
-         WORK( 1 ) = LWMIN
+         HOUS( 1 ) = SROUNDUP_LWORK( LHMIN )
+         WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -358,7 +368,7 @@
           ABDPOS   = KD + 1
           ABOFDPOS = KD
       ELSE
-          APOS     = INDA 
+          APOS     = INDA
           AWPOS    = INDA + KD + 1
           DPOS     = APOS
           OFDPOS   = DPOS + 1
@@ -366,11 +376,11 @@
           ABOFDPOS = 2
 
       ENDIF
-*      
-*     Case KD=0: 
-*     The matrix is diagonal. We just copy it (convert to "real" for 
-*     complex because D is double and the imaginary part should be 0) 
-*     and store it in D. A sequential code here is better or 
+*
+*     Case KD=0:
+*     The matrix is diagonal. We just copy it (convert to "real" for
+*     complex because D is real and the imaginary part should be 0)
+*     and store it in D. A sequential code here is better or
 *     in a parallel environment it might need two cores for D and E
 *
       IF( KD.EQ.0 ) THEN
@@ -385,17 +395,17 @@
           WORK( 1 ) = 1
           RETURN
       END IF
-*      
-*     Case KD=1: 
-*     The matrix is already Tridiagonal. We have to make diagonal 
+*
+*     Case KD=1:
+*     The matrix is already Tridiagonal. We have to make diagonal
 *     and offdiagonal elements real, and store them in D and E.
-*     For that, for real precision just copy the diag and offdiag 
-*     to D and E while for the COMPLEX case the bulge chasing is  
-*     performed to convert the hermetian tridiagonal to symmetric 
-*     tridiagonal. A simpler conversion formula might be used, but then 
+*     For that, for real precision just copy the diag and offdiag
+*     to D and E while for the COMPLEX case the bulge chasing is
+*     performed to convert the Hermitian tridiagonal to symmetric
+*     tridiagonal. A simpler conversion formula might be used, but then
 *     updating the Q matrix will be required and based if Q is generated
-*     or not this might complicate the story. 
-*      
+*     or not this might complicate the story.
+*
       IF( KD.EQ.1 ) THEN
           DO 50 I = 1, N
               D( I ) = REAL( AB( ABDPOS, I ) )
@@ -444,7 +454,7 @@ C                 END IF
           RETURN
       END IF
 *
-*     Main code start here. 
+*     Main code starts here.
 *     Reduce the hermitian band of A to a tridiagonal matrix.
 *
       THGRSIZ   = N
@@ -453,7 +463,7 @@ C                 END IF
       NBTILES   = CEILING( REAL(N)/REAL(KD) )
       STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) )
       THGRNB    = CEILING( REAL(N-1)/REAL(THGRSIZ) )
-*      
+*
       CALL CLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA )
       CALL CLASET( "A", KD,   N, ZERO, ZERO, WORK( AWPOS ), LDA )
 *
@@ -462,7 +472,7 @@ C                 END IF
 *
 #if defined(_OPENMP)
 !$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND )
-!$OMP$         PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) 
+!$OMP$         PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID )
 !$OMP$         PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND )
 !$OMP$         SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK)
 !$OMP$         SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA )
@@ -471,7 +481,7 @@ C                 END IF
 #endif
 *
 *     main bulge chasing loop
-*      
+*
       DO 100 THGRID = 1, THGRNB
           STT  = (THGRID-1)*THGRSIZ+1
           THED = MIN( (STT + THGRSIZ -1), (N-1))
@@ -482,7 +492,7 @@ C                 END IF
                   ST = STT
                   DO 130 SWEEPID = ST, ED
                       DO 140 K = 1, GRSIZ
-                          MYID  = (I-SWEEPID)*(STEPERCOL*GRSIZ) 
+                          MYID  = (I-SWEEPID)*(STEPERCOL*GRSIZ)
      $                           + (M-1)*GRSIZ + K
                           IF ( MYID.EQ.1 ) THEN
                               TTYPE = 1
@@ -508,16 +518,16 @@ C                 END IF
                           ENDIF
 *
 *                         Call the kernel
-*                             
+*
 #if defined(_OPENMP) && _OPENMP >= 201307
-                          IF( TTYPE.NE.1 ) THEN      
+                          IF( TTYPE.NE.1 ) THEN
 !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
 !$OMP$     DEPEND(in:WORK(MYID-1))
 !$OMP$     DEPEND(out:WORK(MYID))
                               TID      = OMP_GET_THREAD_NUM()
-                              CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+                              CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE,
      $                             STIND, EDIND, SWEEPID, N, KD, IB,
-     $                             WORK ( INDA ), LDA, 
+     $                             WORK ( INDA ), LDA,
      $                             HOUS( INDV ), HOUS( INDTAU ), LDV,
      $                             WORK( INDW + TID*KD ) )
 !$OMP END TASK
@@ -525,20 +535,20 @@ C                 END IF
 !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
 !$OMP$     DEPEND(out:WORK(MYID))
                               TID      = OMP_GET_THREAD_NUM()
-                              CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+                              CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE,
      $                             STIND, EDIND, SWEEPID, N, KD, IB,
-     $                             WORK ( INDA ), LDA, 
+     $                             WORK ( INDA ), LDA,
      $                             HOUS( INDV ), HOUS( INDTAU ), LDV,
      $                             WORK( INDW + TID*KD ) )
 !$OMP END TASK
                           ENDIF
 #else
-                          CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+                          CALL CHB2ST_KERNELS( UPLO, WANTQ, TTYPE,
      $                         STIND, EDIND, SWEEPID, N, KD, IB,
-     $                         WORK ( INDA ), LDA, 
+     $                         WORK ( INDA ), LDA,
      $                         HOUS( INDV ), HOUS( INDTAU ), LDV,
-     $                         WORK( INDW + TID*KD ) )
-#endif 
+     $                         WORK( INDW ) )
+#endif
                           IF ( BLKLASTIND.GE.(N-1) ) THEN
                               STT = STT + 1
                               EXIT
@@ -553,14 +563,14 @@ C                 END IF
 !$OMP END MASTER
 !$OMP END PARALLEL
 #endif
-*      
+*
 *     Copy the diagonal from A to D. Note that D is REAL thus only
 *     the Real part is needed, the imaginary part should be zero.
 *
       DO 150 I = 1, N
           D( I ) = REAL( WORK( DPOS+(I-1)*LDA ) )
   150 CONTINUE
-*      
+*
 *     Copy the off diagonal from A to E. Note that E is REAL thus only
 *     the Real part is needed, the imaginary part should be zero.
 *
@@ -574,11 +584,10 @@ C                 END IF
   170     CONTINUE
       ENDIF
 *
-      HOUS( 1 ) = LHMIN
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       RETURN
 *
 *     End of CHETRD_HB2ST
 *
       END
-      
+
diff --git a/lapack-netlib/SRC/chetrd_he2hb.f b/lapack-netlib/SRC/chetrd_he2hb.f
index 904555c10..42e71e0b2 100644
--- a/lapack-netlib/SRC/chetrd_he2hb.f
+++ b/lapack-netlib/SRC/chetrd_he2hb.f
@@ -123,8 +123,8 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX array, dimension (LWORK)
-*>          On exit, if INFO = 0, or if LWORK=-1, 
+*>          WORK is COMPLEX array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, or if LWORK = -1,
 *>          WORK(1) returns the size of LWORK.
 *> \endverbatim
 *>
@@ -132,7 +132,9 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK which should be calculated
-*>          by a workspace query. LWORK = MAX(1, LWORK_QUERY)
+*>          by a workspace query.
+*>          If N <= KD+1, LWORK >= 1, else LWORK = MAX(1, LWORK_QUERY).
+*>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
@@ -158,7 +160,7 @@
 *> \author Univ. of Colorado Denver 
 *> \author NAG Ltd. 
 *
-*> \ingroup complexHEcomputational
+*> \ingroup hetrd_he2hb
 *
 *> \par Further Details:
 *  =====================
@@ -283,7 +285,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV2STAGE 
-      EXTERNAL           LSAME, ILAENV2STAGE
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV2STAGE, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -293,8 +296,12 @@
       INFO   = 0
       UPPER  = LSAME( UPLO, 'U' )
       LQUERY = ( LWORK.EQ.-1 )
-      LWMIN  = ILAENV2STAGE( 4, 'CHETRD_HE2HB', '', N, KD, -1, -1 )
-      
+      IF( N.LE.KD+1 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = ILAENV2STAGE( 4, 'CHETRD_HE2HB', '', N, KD, -1, -1 )
+      END IF
+*
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
@@ -313,7 +320,7 @@
          CALL XERBLA( 'CHETRD_HE2HB', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
          RETURN
       END IF
 *
@@ -506,7 +513,7 @@
 
       END IF
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       RETURN
 *
 *     End of CHETRD_HE2HB
diff --git a/lapack-netlib/SRC/chetrf.f b/lapack-netlib/SRC/chetrf.f
index 484e76256..2836e30bc 100644
--- a/lapack-netlib/SRC/chetrf.f
+++ b/lapack-netlib/SRC/chetrf.f
@@ -107,7 +107,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK.  LWORK >=1.  For best performance
+*>          The length of WORK. LWORK >= 1. For best performance
 *>          LWORK >= N*NB, where NB is the block size returned by ILAENV.
 *> \endverbatim
 *>
@@ -130,7 +130,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEcomputational
+*> \ingroup hetrf
 *
 *> \par Further Details:
 *  =====================
@@ -197,7 +197,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHETF2, CLAHEF, XERBLA
@@ -227,8 +228,8 @@
 *        Determine the block size
 *
          NB = ILAENV( 1, 'CHETRF', UPLO, N, -1, -1, -1 )
-         LWKOPT = N*NB
-         WORK( 1 ) = LWKOPT
+         LWKOPT = MAX( 1, N*NB )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -346,7 +347,7 @@
       END IF
 *
    40 CONTINUE
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       RETURN
 *
 *     End of CHETRF
diff --git a/lapack-netlib/SRC/chetrf_aa.f b/lapack-netlib/SRC/chetrf_aa.f
index 54567b8cd..51410a6ed 100644
--- a/lapack-netlib/SRC/chetrf_aa.f
+++ b/lapack-netlib/SRC/chetrf_aa.f
@@ -74,7 +74,7 @@
 *>
 *>          On exit, the tridiagonal matrix is stored in the diagonals
 *>          and the subdiagonals of A just below (or above) the diagonals,
-*>          and L is stored below (or above) the subdiaonals, when UPLO
+*>          and L is stored below (or above) the subdiagonals, when UPLO
 *>          is 'L' (or 'U').
 *> \endverbatim
 *>
@@ -101,8 +101,10 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK.  LWORK >= 2*N. For optimum performance
-*>          LWORK >= N*(1+NB), where NB is the optimal blocksize.
+*>          The length of WORK.
+*>          LWORK >= 1, if N <= 1, and LWORK >= 2*N, otherwise.
+*>          For optimum performance LWORK >= N*(1+NB), where NB is
+*>          the optimal blocksize, returned by ILAENV.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -125,10 +127,10 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEcomputational
+*> \ingroup hetrf_aa
 *
 *  =====================================================================
-      SUBROUTINE CHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO)
+      SUBROUTINE CHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
@@ -152,14 +154,15 @@
 *
 *     .. Local Scalars ..
       LOGICAL      LQUERY, UPPER
-      INTEGER      J, LWKOPT
+      INTEGER      J, LWKMIN, LWKOPT
       INTEGER      NB, MJ, NJ, K1, K2, J1, J2, J3, JB
       COMPLEX      ALPHA
 *     ..
 *     .. External Functions ..
       LOGICAL      LSAME
       INTEGER      ILAENV
-      EXTERNAL     LSAME, ILAENV
+      REAL         SROUNDUP_LWORK
+      EXTERNAL     LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL     CLAHEF_AA, CGEMM, CCOPY, CSWAP, CSCAL, XERBLA
@@ -178,19 +181,26 @@
       INFO = 0
       UPPER = LSAME( UPLO, 'U' )
       LQUERY = ( LWORK.EQ.-1 )
+      IF( N.LE.1 ) THEN
+         LWKMIN = 1
+         LWKOPT = 1
+      ELSE
+         LWKMIN = 2*N
+         LWKOPT = (NB+1)*N
+      END IF
+*
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF( LWORK.LT.( 2*N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -7
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         LWKOPT = (NB+1)*N
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -202,11 +212,11 @@
 *
 *     Quick return
 *
-      IF ( N.EQ.0 ) THEN
+      IF( N.EQ.0 ) THEN
           RETURN
       ENDIF
       IPIV( 1 ) = 1
-      IF ( N.EQ.1 ) THEN
+      IF( N.EQ.1 ) THEN
          A( 1, 1 ) = REAL( A( 1, 1 ) )
          RETURN
       END IF
@@ -459,7 +469,7 @@
       END IF
 *
    20 CONTINUE
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       RETURN
 *
 *     End of CHETRF_AA
diff --git a/lapack-netlib/SRC/chetrf_aa_2stage.f b/lapack-netlib/SRC/chetrf_aa_2stage.f
index 20a9859e8..a79343753 100644
--- a/lapack-netlib/SRC/chetrf_aa_2stage.f
+++ b/lapack-netlib/SRC/chetrf_aa_2stage.f
@@ -75,7 +75,7 @@
 *>          triangular part of the matrix A, and the strictly upper
 *>          triangular part of A is not referenced.
 *>
-*>          On exit, L is stored below (or above) the subdiaonal blocks,
+*>          On exit, L is stored below (or above) the subdiagonal blocks,
 *>          when UPLO  is 'L' (or 'U').
 *> \endverbatim
 *>
@@ -87,14 +87,14 @@
 *>
 *> \param[out] TB
 *> \verbatim
-*>          TB is COMPLEX array, dimension (LTB)
+*>          TB is COMPLEX array, dimension (MAX(1,LTB))
 *>          On exit, details of the LU factorization of the band matrix.
 *> \endverbatim
 *>
 *> \param[in] LTB
 *> \verbatim
 *>          LTB is INTEGER
-*>          The size of the array TB. LTB >= 4*N, internally
+*>          The size of the array TB. LTB >= MAX(1,4*N), internally
 *>          used to select NB such that LTB >= (3*NB+1)*N.
 *>
 *>          If LTB = -1, then a workspace query is assumed; the
@@ -121,14 +121,14 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX workspace of size LWORK
+*>          WORK is COMPLEX workspace of size (MAX(1,LWORK))
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The size of WORK. LWORK >= N, internally used to select NB
-*>          such that LWORK >= N*NB.
+*>          The size of WORK. LWORK >= MAX(1,N), internally used
+*>          to select NB such that LWORK >= N*NB.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal size of the WORK array,
@@ -152,7 +152,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexSYcomputational
+*> \ingroup hetrf_aa_2stage
 *
 *  =====================================================================
       SUBROUTINE CHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV,
@@ -188,7 +188,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
       
 *     ..
 *     .. External Subroutines ..
@@ -213,9 +214,9 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF ( LTB .LT. 4*N .AND. .NOT.TQUERY ) THEN
+      ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN
          INFO = -6
-      ELSE IF ( LWORK .LT. N .AND. .NOT.WQUERY ) THEN
+      ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.WQUERY ) THEN
          INFO = -10
       END IF
 *
@@ -229,10 +230,10 @@
       NB = ILAENV( 1, 'CHETRF_AA_2STAGE', UPLO, N, -1, -1, -1 )
       IF( INFO.EQ.0 ) THEN
          IF( TQUERY ) THEN
-            TB( 1 ) = (3*NB+1)*N
+            TB( 1 ) = SROUNDUP_LWORK( MAX( 1, (3*NB+1)*N ) )
          END IF
          IF( WQUERY ) THEN
-            WORK( 1 ) = N*NB
+            WORK( 1 ) = SROUNDUP_LWORK( MAX( 1, N*NB ) )
          END IF
       END IF
       IF( TQUERY .OR. WQUERY ) THEN
@@ -241,7 +242,7 @@
 *
 *     Quick return
 *
-      IF ( N.EQ.0 ) THEN
+      IF( N.EQ.0 ) THEN
          RETURN
       ENDIF
 *
diff --git a/lapack-netlib/SRC/chetrf_rk.f b/lapack-netlib/SRC/chetrf_rk.f
index 3f60f4370..a13c740e3 100644
--- a/lapack-netlib/SRC/chetrf_rk.f
+++ b/lapack-netlib/SRC/chetrf_rk.f
@@ -177,14 +177,14 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX array, dimension ( MAX(1,LWORK) ).
+*>          WORK is COMPLEX array, dimension (MAX(1,LWORK)).
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK.  LWORK >=1.  For best performance
+*>          The length of WORK.  LWORK >= 1.  For best performance
 *>          LWORK >= N*NB, where NB is the block size returned
 *>          by ILAENV.
 *>
@@ -229,7 +229,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEcomputational
+*> \ingroup hetrf_rk
 *
 *> \par Further Details:
 *  =====================
@@ -280,7 +280,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLAHEF_RK, CHETF2_RK, CSWAP, XERBLA
@@ -310,8 +311,8 @@
 *        Determine the block size
 *
          NB = ILAENV( 1, 'CHETRF_RK', UPLO, N, -1, -1, -1 )
-         LWKOPT = N*NB
-         WORK( 1 ) = LWKOPT
+         LWKOPT = MAX( 1, N*NB )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -487,7 +488,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       RETURN
 *
 *     End of CHETRF_RK
diff --git a/lapack-netlib/SRC/chetrf_rook.f b/lapack-netlib/SRC/chetrf_rook.f
index 805e0f4cb..df0323520 100644
--- a/lapack-netlib/SRC/chetrf_rook.f
+++ b/lapack-netlib/SRC/chetrf_rook.f
@@ -122,7 +122,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK.  LWORK >=1.  For best performance
+*>          The length of WORK.  LWORK >= 1.  For best performance
 *>          LWORK >= N*NB, where NB is the block size returned by ILAENV.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
@@ -150,7 +150,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEcomputational
+*> \ingroup hetrf_rook
 *
 *> \par Further Details:
 *  =====================
@@ -232,7 +232,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLAHEF_ROOK, CHETF2_ROOK, XERBLA
@@ -263,7 +264,7 @@
 *
          NB = ILAENV( 1, 'CHETRF_ROOK', UPLO, N, -1, -1, -1 )
          LWKOPT = MAX( 1, N*NB )
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -386,7 +387,7 @@
       END IF
 *
    40 CONTINUE
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       RETURN
 *
 *     End of CHETRF_ROOK
diff --git a/lapack-netlib/SRC/chetri2.f b/lapack-netlib/SRC/chetri2.f
index 2865a6440..f15065ae7 100644
--- a/lapack-netlib/SRC/chetri2.f
+++ b/lapack-netlib/SRC/chetri2.f
@@ -88,16 +88,16 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX array, dimension (N+NB+1)*(NB+3)
+*>          WORK is COMPLEX array, dimension (MAX(1,LWORK))
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
-*>          WORK is size >= (N+NB+1)*(NB+3)
+*>          If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3).
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>           calculates:
+*>          calculates:
 *>              - the optimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array,
 *>              - and no error message related to LWORK is issued by XERBLA.
@@ -120,7 +120,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEcomputational
+*> \ingroup hetri2
 *
 *  =====================================================================
       SUBROUTINE CHETRI2( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO )
@@ -147,7 +147,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHETRI2X, CHETRI, XERBLA
@@ -159,9 +160,13 @@
       INFO = 0
       UPPER = LSAME( UPLO, 'U' )
       LQUERY = ( LWORK.EQ.-1 )
+*
 *     Get blocksize
+*
       NBMAX = ILAENV( 1, 'CHETRF', UPLO, N, -1, -1, -1 )
-      IF ( NBMAX .GE. N ) THEN
+      IF( N.EQ.0 ) THEN
+         MINSIZE = 1
+      ELSE IF( NBMAX.GE.N ) THEN
          MINSIZE = N
       ELSE
          MINSIZE = (N+NBMAX+1)*(NBMAX+3)
@@ -173,28 +178,29 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF (LWORK .LT. MINSIZE .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.MINSIZE .AND. .NOT.LQUERY ) THEN
          INFO = -7
       END IF
-*
-*     Quick return if possible
-*
 *
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'CHETRI2', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         WORK(1)=MINSIZE
+         WORK( 1 ) = SROUNDUP_LWORK( MINSIZE )
          RETURN
       END IF
+*
+*     Quick return if possible
+*
       IF( N.EQ.0 )
      $   RETURN
 
-      IF( NBMAX .GE. N ) THEN
+      IF( NBMAX.GE.N ) THEN
          CALL CHETRI( UPLO, N, A, LDA, IPIV, WORK, INFO )
       ELSE
          CALL CHETRI2X( UPLO, N, A, LDA, IPIV, WORK, NBMAX, INFO )
       END IF
+*
       RETURN
 *
 *     End of CHETRI2
diff --git a/lapack-netlib/SRC/chetri_3.f b/lapack-netlib/SRC/chetri_3.f
index cc2318b22..ccfce5070 100644
--- a/lapack-netlib/SRC/chetri_3.f
+++ b/lapack-netlib/SRC/chetri_3.f
@@ -119,16 +119,17 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX array, dimension (N+NB+1)*(NB+3).
+*>          WORK is COMPLEX array, dimension (MAX(1,LWORK)).
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK. LWORK >= (N+NB+1)*(NB+3).
+*>          The length of WORK.
+*>          If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3).
 *>
-*>          If LDWORK = -1, then a workspace query is assumed;
+*>          If LWORK = -1, then a workspace query is assumed;
 *>          the routine only calculates the optimal size of the optimal
 *>          size of the WORK array, returns this value as the first
 *>          entry of the WORK array, and no error message related to
@@ -152,7 +153,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEcomputational
+*> \ingroup hetri_3
 *
 *> \par Contributors:
 *  ==================
@@ -190,7 +191,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHETRI_3X, XERBLA
@@ -208,8 +210,13 @@
 *
 *     Determine the block size
 *
-      NB = MAX( 1, ILAENV( 1, 'CHETRI_3', UPLO, N, -1, -1, -1 ) )
-      LWKOPT = ( N+NB+1 ) * ( NB+3 )
+      IF( N.EQ.0 ) THEN
+         LWKOPT = 1
+      ELSE
+         NB = MAX( 1, ILAENV( 1, 'CHETRI_3', UPLO, N, -1, -1, -1 ) )
+         LWKOPT = ( N+NB+1 ) * ( NB+3 )
+      END IF
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
@@ -217,7 +224,7 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF ( LWORK .LT. LWKOPT .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN
          INFO = -8
       END IF
 *
@@ -225,7 +232,6 @@
          CALL XERBLA( 'CHETRI_3', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         WORK( 1 ) = LWKOPT
          RETURN
       END IF
 *
@@ -236,7 +242,7 @@
 *
       CALL CHETRI_3X( UPLO, N, A, LDA, E, IPIV, WORK, NB, INFO )
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/chetrs_aa.f b/lapack-netlib/SRC/chetrs_aa.f
index 2546adb2d..07179ab92 100644
--- a/lapack-netlib/SRC/chetrs_aa.f
+++ b/lapack-netlib/SRC/chetrs_aa.f
@@ -105,7 +105,13 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK. LWORK >= max(1,3*N-2).
+*>          The dimension of the array WORK.
+*>          If MIN(N,NRHS) = 0, LWORK >= 1, else LWORK >= 3*N-2.
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the minimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
 *> \endverbatim
 *>
 *> \param[out] INFO
@@ -123,7 +129,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEcomputational
+*> \ingroup hetrs_aa
 *
 *  =====================================================================
       SUBROUTINE CHETRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB,
@@ -151,23 +157,30 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY, UPPER
-      INTEGER            K, KP, LWKOPT
+      INTEGER            K, KP, LWKMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLACPY, CLACGV, CGTSV, CSWAP, CTRSM, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          MAX
+      INTRINSIC          MIN, MAX
 *     ..
 *     .. Executable Statements ..
 *
       INFO = 0
       UPPER = LSAME( UPLO, 'U' )
       LQUERY = ( LWORK.EQ.-1 )
+      IF( MIN( N, NRHS ).EQ.0 ) THEN
+         LWKMIN = 1
+      ELSE
+         LWKMIN = 3*N-2
+      END IF
+*
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
@@ -178,21 +191,20 @@
          INFO = -5
       ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
          INFO = -8
-      ELSE IF( LWORK.LT.MAX( 1, 3*N-2 ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -10
       END IF
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'CHETRS_AA', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         LWKOPT = (3*N-2)
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK( LWKMIN )
          RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( N.EQ.0 .OR. NRHS.EQ.0 )
+      IF( MIN( N, NRHS ).EQ.0 )
      $   RETURN
 *
       IF( UPPER ) THEN
diff --git a/lapack-netlib/SRC/chpevd.f b/lapack-netlib/SRC/chpevd.f
index c44462394..2449783a2 100644
--- a/lapack-netlib/SRC/chpevd.f
+++ b/lapack-netlib/SRC/chpevd.f
@@ -41,12 +41,6 @@
 *> a complex Hermitian matrix A in packed storage.  If eigenvectors are
 *> desired, it uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
@@ -192,7 +186,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHEReigen
+*> \ingroup hpevd
 *
 *  =====================================================================
       SUBROUTINE CHPEVD( JOBZ, UPLO, N, AP, W, Z, LDZ, WORK, LWORK,
@@ -229,8 +223,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               CLANHP, SLAMCH
-      EXTERNAL           LSAME, CLANHP, SLAMCH
+      REAL               CLANHP, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           LSAME, CLANHP, SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHPTRD, CSSCAL, CSTEDC, CUPMTR, SSCAL, SSTERF,
@@ -274,7 +268,7 @@
                LIWMIN = 1
             END IF
          END IF
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          RWORK( 1 ) = LRWMIN
          IWORK( 1 ) = LIWMIN
 *
@@ -365,7 +359,7 @@
          CALL SSCAL( IMAX, ONE / SIGMA, W, 1 )
       END IF
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       RWORK( 1 ) = LRWMIN
       IWORK( 1 ) = LIWMIN
       RETURN
diff --git a/lapack-netlib/SRC/chpgv.f b/lapack-netlib/SRC/chpgv.f
index 417f10121..660724e05 100644
--- a/lapack-netlib/SRC/chpgv.f
+++ b/lapack-netlib/SRC/chpgv.f
@@ -144,7 +144,7 @@
 *>                    i off-diagonal elements of an intermediate
 *>                    tridiagonal form did not convergeto zero;
 *>             > N:   if INFO = N + i, for 1 <= i <= n, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/chpgvd.f b/lapack-netlib/SRC/chpgvd.f
index 65d08b783..57ac4fc72 100644
--- a/lapack-netlib/SRC/chpgvd.f
+++ b/lapack-netlib/SRC/chpgvd.f
@@ -44,12 +44,6 @@
 *> positive definite.
 *> If eigenvectors are desired, it uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
@@ -205,7 +199,7 @@
 *>                    i off-diagonal elements of an intermediate
 *>                    tridiagonal form did not convergeto zero;
 *>             > N:   if INFO = N + i, for 1 <= i <= n, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
@@ -218,7 +212,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHEReigen
+*> \ingroup hpgvd
 *
 *> \par Contributors:
 *  ==================
@@ -252,7 +246,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHPEVD, CHPGST, CPPTRF, CTPMV, CTPSV, XERBLA
@@ -298,7 +293,7 @@
             END IF
          END IF
 *
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          RWORK( 1 ) = LRWMIN
          IWORK( 1 ) = LIWMIN
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
@@ -380,7 +375,7 @@
          END IF
       END IF
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       RWORK( 1 ) = LRWMIN
       IWORK( 1 ) = LIWMIN
       RETURN
diff --git a/lapack-netlib/SRC/chpgvx.f b/lapack-netlib/SRC/chpgvx.f
index 711daf55f..2646800cc 100644
--- a/lapack-netlib/SRC/chpgvx.f
+++ b/lapack-netlib/SRC/chpgvx.f
@@ -250,7 +250,7 @@
 *>                    i eigenvectors failed to converge.  Their indices
 *>                    are stored in array IFAIL.
 *>             > N:   if INFO = N + i, for 1 <= i <= n, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/chseqr.f b/lapack-netlib/SRC/chseqr.f
index 007f72f59..56ff01fc6 100644
--- a/lapack-netlib/SRC/chseqr.f
+++ b/lapack-netlib/SRC/chseqr.f
@@ -216,7 +216,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup hseqr
 *
 *> \par Contributors:
 *  ==================
@@ -343,7 +343,8 @@
 *     .. External Functions ..
       INTEGER            ILAENV
       LOGICAL            LSAME
-      EXTERNAL           ILAENV, LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CCOPY, CLACPY, CLAHQR, CLAQR0, CLASET, XERBLA
diff --git a/lapack-netlib/SRC/cla_gbrfsx_extended.f b/lapack-netlib/SRC/cla_gbrfsx_extended.f
index 6d43c8325..e8b96fd6e 100644
--- a/lapack-netlib/SRC/cla_gbrfsx_extended.f
+++ b/lapack-netlib/SRC/cla_gbrfsx_extended.f
@@ -18,7 +18,7 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE CLA_GBRFSX_EXTENDED ( PREC_TYPE, TRANS_TYPE, N, KL, KU,
+*       SUBROUTINE CLA_GBRFSX_EXTENDED( PREC_TYPE, TRANS_TYPE, N, KL, KU,
 *                                       NRHS, AB, LDAB, AFB, LDAFB, IPIV,
 *                                       COLEQU, C, B, LDB, Y, LDY,
 *                                       BERR_OUT, N_NORMS, ERR_BNDS_NORM,
@@ -400,7 +400,7 @@
 *> \ingroup complexGBcomputational
 *
 *  =====================================================================
-      SUBROUTINE CLA_GBRFSX_EXTENDED ( PREC_TYPE, TRANS_TYPE, N, KL, KU,
+      SUBROUTINE CLA_GBRFSX_EXTENDED( PREC_TYPE, TRANS_TYPE, N, KL, KU,
      $                                NRHS, AB, LDAB, AFB, LDAFB, IPIV,
      $                                COLEQU, C, B, LDB, Y, LDY,
      $                                BERR_OUT, N_NORMS, ERR_BNDS_NORM,
@@ -651,7 +651,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN
                CALL CAXPY( N, (1.0E+0,0.0E+0), DY, 1, Y(1,J), 1 )
diff --git a/lapack-netlib/SRC/cla_gerfsx_extended.f b/lapack-netlib/SRC/cla_gerfsx_extended.f
index 4b1031101..e524db5ad 100644
--- a/lapack-netlib/SRC/cla_gerfsx_extended.f
+++ b/lapack-netlib/SRC/cla_gerfsx_extended.f
@@ -637,7 +637,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN
                CALL CAXPY( N, (1.0E+0,0.0E+0), DY, 1, Y(1,J), 1 )
diff --git a/lapack-netlib/SRC/cla_herfsx_extended.f b/lapack-netlib/SRC/cla_herfsx_extended.f
index 6d007ef58..19f845692 100644
--- a/lapack-netlib/SRC/cla_herfsx_extended.f
+++ b/lapack-netlib/SRC/cla_herfsx_extended.f
@@ -654,7 +654,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN
                CALL CAXPY( N, CMPLX(1.0), DY, 1, Y(1,J), 1 )
diff --git a/lapack-netlib/SRC/cla_porfsx_extended.f b/lapack-netlib/SRC/cla_porfsx_extended.f
index 9ced9b1b9..2dafecb35 100644
--- a/lapack-netlib/SRC/cla_porfsx_extended.f
+++ b/lapack-netlib/SRC/cla_porfsx_extended.f
@@ -625,7 +625,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF (Y_PREC_STATE .LT. EXTRA_Y) THEN
                CALL CAXPY( N, CMPLX(1.0), DY, 1, Y(1,J), 1 )
diff --git a/lapack-netlib/SRC/cla_porpvgrw.f b/lapack-netlib/SRC/cla_porpvgrw.f
index 1eb706d1a..78cd19da9 100644
--- a/lapack-netlib/SRC/cla_porpvgrw.f
+++ b/lapack-netlib/SRC/cla_porpvgrw.f
@@ -140,9 +140,9 @@
 *     .. Executable Statements ..
       UPPER = LSAME( 'Upper', UPLO )
 *
-*     SPOTRF will have factored only the NCOLSxNCOLS leading minor, so
-*     we restrict the growth search to that minor and use only the first
-*     2*NCOLS workspace entries.
+*     SPOTRF will have factored only the NCOLSxNCOLS leading submatrix,
+*     so we restrict the growth search to that submatrix and use only
+*     the first 2*NCOLS workspace entries.
 *
       RPVGRW = 1.0
       DO I = 1, 2*NCOLS
diff --git a/lapack-netlib/SRC/cla_syrfsx_extended.f b/lapack-netlib/SRC/cla_syrfsx_extended.f
index 4fe538a98..95f969731 100644
--- a/lapack-netlib/SRC/cla_syrfsx_extended.f
+++ b/lapack-netlib/SRC/cla_syrfsx_extended.f
@@ -654,7 +654,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN
                CALL CAXPY( N, CMPLX(1.0), DY, 1, Y(1,J), 1 )
diff --git a/lapack-netlib/SRC/claed7.f b/lapack-netlib/SRC/claed7.f
index 9d2f97141..72f1417d4 100644
--- a/lapack-netlib/SRC/claed7.f
+++ b/lapack-netlib/SRC/claed7.f
@@ -363,7 +363,7 @@
             RETURN
          END IF
 *
-*     Prepare the INDXQ sorting premutation.
+*     Prepare the INDXQ sorting permutation.
 *
          N1 = K
          N2 = N - K
diff --git a/lapack-netlib/SRC/claed8.f b/lapack-netlib/SRC/claed8.f
index c15a0365f..1600087ab 100644
--- a/lapack-netlib/SRC/claed8.f
+++ b/lapack-netlib/SRC/claed8.f
@@ -18,7 +18,7 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE CLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMDA,
+*       SUBROUTINE CLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMBDA,
 *                          Q2, LDQ2, W, INDXP, INDX, INDXQ, PERM, GIVPTR,
 *                          GIVCOL, GIVNUM, INFO )
 *
@@ -29,7 +29,7 @@
 *       .. Array Arguments ..
 *       INTEGER            GIVCOL( 2, * ), INDX( * ), INDXP( * ),
 *      $                   INDXQ( * ), PERM( * )
-*       REAL               D( * ), DLAMDA( * ), GIVNUM( 2, * ), W( * ),
+*       REAL               D( * ), DLAMBDA( * ), GIVNUM( 2, * ), W( * ),
 *      $                   Z( * )
 *       COMPLEX            Q( LDQ, * ), Q2( LDQ2, * )
 *       ..
@@ -122,9 +122,9 @@
 *>         destroyed during the updating process.
 *> \endverbatim
 *>
-*> \param[out] DLAMDA
+*> \param[out] DLAMBDA
 *> \verbatim
-*>          DLAMDA is REAL array, dimension (N)
+*>          DLAMBDA is REAL array, dimension (N)
 *>         Contains a copy of the first K eigenvalues which will be used
 *>         by SLAED3 to form the secular equation.
 *> \endverbatim
@@ -222,7 +222,7 @@
 *> \ingroup complexOTHERcomputational
 *
 *  =====================================================================
-      SUBROUTINE CLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMDA,
+      SUBROUTINE CLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMBDA,
      $                   Q2, LDQ2, W, INDXP, INDX, INDXQ, PERM, GIVPTR,
      $                   GIVCOL, GIVNUM, INFO )
 *
@@ -237,7 +237,7 @@
 *     .. Array Arguments ..
       INTEGER            GIVCOL( 2, * ), INDX( * ), INDXP( * ),
      $                   INDXQ( * ), PERM( * )
-      REAL               D( * ), DLAMDA( * ), GIVNUM( 2, * ), W( * ),
+      REAL               D( * ), DLAMBDA( * ), GIVNUM( 2, * ), W( * ),
      $                   Z( * )
       COMPLEX            Q( LDQ, * ), Q2( LDQ2, * )
 *     ..
@@ -322,14 +322,14 @@
          INDXQ( I ) = INDXQ( I ) + CUTPNT
    20 CONTINUE
       DO 30 I = 1, N
-         DLAMDA( I ) = D( INDXQ( I ) )
+         DLAMBDA( I ) = D( INDXQ( I ) )
          W( I ) = Z( INDXQ( I ) )
    30 CONTINUE
       I = 1
       J = CUTPNT + 1
-      CALL SLAMRG( N1, N2, DLAMDA, 1, 1, INDX )
+      CALL SLAMRG( N1, N2, DLAMBDA, 1, 1, INDX )
       DO 40 I = 1, N
-         D( I ) = DLAMDA( INDX( I ) )
+         D( I ) = DLAMBDA( INDX( I ) )
          Z( I ) = W( INDX( I ) )
    40 CONTINUE
 *
@@ -438,7 +438,7 @@
          ELSE
             K = K + 1
             W( K ) = Z( JLAM )
-            DLAMDA( K ) = D( JLAM )
+            DLAMBDA( K ) = D( JLAM )
             INDXP( K ) = JLAM
             JLAM = J
          END IF
@@ -450,19 +450,19 @@
 *
       K = K + 1
       W( K ) = Z( JLAM )
-      DLAMDA( K ) = D( JLAM )
+      DLAMBDA( K ) = D( JLAM )
       INDXP( K ) = JLAM
 *
   100 CONTINUE
 *
-*     Sort the eigenvalues and corresponding eigenvectors into DLAMDA
+*     Sort the eigenvalues and corresponding eigenvectors into DLAMBDA
 *     and Q2 respectively.  The eigenvalues/vectors which were not
-*     deflated go into the first K slots of DLAMDA and Q2 respectively,
+*     deflated go into the first K slots of DLAMBDA and Q2 respectively,
 *     while those which were deflated go into the last N - K slots.
 *
       DO 110 J = 1, N
          JP = INDXP( J )
-         DLAMDA( J ) = D( JP )
+         DLAMBDA( J ) = D( JP )
          PERM( J ) = INDXQ( INDX( JP ) )
          CALL CCOPY( QSIZ, Q( 1, PERM( J ) ), 1, Q2( 1, J ), 1 )
   110 CONTINUE
@@ -471,7 +471,7 @@
 *     into the last N - K slots of D and Q respectively.
 *
       IF( K.LT.N ) THEN
-         CALL SCOPY( N-K, DLAMDA( K+1 ), 1, D( K+1 ), 1 )
+         CALL SCOPY( N-K, DLAMBDA( K+1 ), 1, D( K+1 ), 1 )
          CALL CLACPY( 'A', QSIZ, N-K, Q2( 1, K+1 ), LDQ2, Q( 1, K+1 ),
      $                LDQ )
       END IF
diff --git a/lapack-netlib/SRC/clals0.f b/lapack-netlib/SRC/clals0.f
index e981fc36f..0b545d5d7 100644
--- a/lapack-netlib/SRC/clals0.f
+++ b/lapack-netlib/SRC/clals0.f
@@ -392,6 +392,11 @@
      $                ( POLES( I, 2 ).EQ.ZERO ) ) THEN
                      RWORK( I ) = ZERO
                   ELSE
+*
+*                    Use calls to the function SLAMC3 to enforce the
+*                    parentheses (x+y)+z. The goal is to prevent
+*                    optimizing compilers from doing x+(y+z).
+*
                      RWORK( I ) = POLES( I, 2 )*Z( I ) /
      $                            ( SLAMC3( POLES( I, 2 ), DSIGJ )-
      $                            DIFLJ ) / ( POLES( I, 2 )+DJ )
@@ -470,6 +475,11 @@
                   IF( Z( J ).EQ.ZERO ) THEN
                      RWORK( I ) = ZERO
                   ELSE
+*
+*                    Use calls to the function SLAMC3 to enforce the
+*                    parentheses (x+y)+z. The goal is to prevent
+*                    optimizing compilers from doing x+(y+z).
+*
                      RWORK( I ) = Z( J ) / ( SLAMC3( DSIGJ, -POLES( I+1,
      $                            2 ) )-DIFR( I, 1 ) ) /
      $                            ( DSIGJ+POLES( I, 1 ) ) / DIFR( I, 2 )
diff --git a/lapack-netlib/SRC/clalsa.f b/lapack-netlib/SRC/clalsa.f
index 06883ab20..fceb380f5 100644
--- a/lapack-netlib/SRC/clalsa.f
+++ b/lapack-netlib/SRC/clalsa.f
@@ -42,9 +42,9 @@
 *>
 *> \verbatim
 *>
-*> CLALSA is an itermediate step in solving the least squares problem
+*> CLALSA is an intermediate step in solving the least squares problem
 *> by computing the SVD of the coefficient matrix in compact form (The
-*> singular vectors are computed as products of simple orthorgonal
+*> singular vectors are computed as products of simple orthogonal
 *> matrices.).
 *>
 *> If ICOMPQ = 0, CLALSA applies the inverse of the left singular vector
diff --git a/lapack-netlib/SRC/clalsd.f b/lapack-netlib/SRC/clalsd.f
index a2da9a925..bdd6b31c5 100644
--- a/lapack-netlib/SRC/clalsd.f
+++ b/lapack-netlib/SRC/clalsd.f
@@ -48,12 +48,6 @@
 *> problem; in this case a minimum norm solution is returned.
 *> The actual singular values are returned in D in ascending order.
 *>
-*> This code makes very mild assumptions about floating point
-*> arithmetic. It will work on machines with a guard digit in
-*> add/subtract, or on those binary machines without guard digits
-*> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2.
-*> It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/clamswlq.f b/lapack-netlib/SRC/clamswlq.f
index 1606cc611..8f474a3ab 100644
--- a/lapack-netlib/SRC/clamswlq.f
+++ b/lapack-netlib/SRC/clamswlq.f
@@ -127,17 +127,20 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) COMPLEX array, dimension (MAX(1,LWORK))
+*>          (workspace) COMPLEX array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
-*>          If SIDE = 'L', LWORK >= max(1,NB) * MB;
-*>          if SIDE = 'R', LWORK >= max(1,M) * MB.
+*>          If MIN(M,N,K) = 0, LWORK >= 1.
+*>          If SIDE = 'L', LWORK >= max(1,N*MB).
+*>          If SIDE = 'R', LWORK >= max(1,M*MB).
+*>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>          only calculates the optimal size of the WORK array, returns
+*>          only calculates the minimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
 *> \endverbatim
@@ -189,92 +192,104 @@
 *>     SIAM J. Sci. Comput, vol. 34, no. 1, 2012
 *> \endverbatim
 *>
+*> \ingroup lamswlq
+*>
 *  =====================================================================
       SUBROUTINE CLAMSWLQ( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T,
-     $    LDT, C, LDC, WORK, LWORK, INFO )
+     $                     LDT, C, LDC, WORK, LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
-      CHARACTER         SIDE, TRANS
-      INTEGER           INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC
+      CHARACTER          SIDE, TRANS
+      INTEGER            INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC
 *     ..
 *     .. Array Arguments ..
-      COMPLEX           A( LDA, * ), WORK( * ), C(LDC, * ),
-     $      T( LDT, * )
+      COMPLEX            A( LDA, * ), WORK( * ), C( LDC, * ),
+     $                   T( LDT, * )
 *     ..
 *
 * =====================================================================
 *
 *     ..
 *     .. Local Scalars ..
-      LOGICAL    LEFT, RIGHT, TRAN, NOTRAN, LQUERY
-      INTEGER    I, II, KK, LW, CTR
+      LOGICAL            LEFT, RIGHT, TRAN, NOTRAN, LQUERY
+      INTEGER            I, II, KK, LW, CTR, MINMNK, LWMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
+*     ..
 *     .. External Subroutines ..
-      EXTERNAL    CTPMLQT, CGEMLQT, XERBLA
+      EXTERNAL           CTPMLQT, CGEMLQT, XERBLA
 *     ..
 *     .. Executable Statements ..
 *
 *     Test the input arguments
 *
-      LQUERY  = LWORK.LT.0
+      INFO = 0
+      LQUERY  = ( LWORK.EQ.-1 )
       NOTRAN  = LSAME( TRANS, 'N' )
       TRAN    = LSAME( TRANS, 'C' )
       LEFT    = LSAME( SIDE, 'L' )
       RIGHT   = LSAME( SIDE, 'R' )
-      IF (LEFT) THEN
+      IF( LEFT ) THEN
         LW = N * MB
       ELSE
         LW = M * MB
       END IF
 *
-      INFO = 0
+      MINMNK = MIN( M, N, K )
+      IF( MINMNK.EQ.0 ) THEN
+        LWMIN = 1
+      ELSE
+        LWMIN = MAX( 1, LW )
+      END IF
+*
       IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN
-         INFO = -1
+        INFO = -1
       ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN
-         INFO = -2
+        INFO = -2
       ELSE IF( K.LT.0 ) THEN
         INFO = -5
       ELSE IF( M.LT.K ) THEN
         INFO = -3
       ELSE IF( N.LT.0 ) THEN
         INFO = -4
-      ELSE IF( K.LT.MB .OR. MB.LT.1) THEN
+      ELSE IF( K.LT.MB .OR. MB.LT.1 ) THEN
         INFO = -6
       ELSE IF( LDA.LT.MAX( 1, K ) ) THEN
         INFO = -9
-      ELSE IF( LDT.LT.MAX( 1, MB) ) THEN
+      ELSE IF( LDT.LT.MAX( 1, MB ) ) THEN
         INFO = -11
       ELSE IF( LDC.LT.MAX( 1, M ) ) THEN
-         INFO = -13
-      ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN
+        INFO = -13
+      ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN
         INFO = -15
       END IF
 *
+      IF( INFO.EQ.0 ) THEN
+        WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
+      END IF
       IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'CLAMSWLQ', -INFO )
-        WORK(1) = LW
         RETURN
-      ELSE IF (LQUERY) THEN
-        WORK(1) = LW
+      ELSE IF( LQUERY ) THEN
         RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( MIN(M,N,K).EQ.0 ) THEN
+      IF( MINMNK.EQ.0 ) THEN
         RETURN
       END IF
 *
       IF((NB.LE.K).OR.(NB.GE.MAX(M,N,K))) THEN
         CALL CGEMLQT( SIDE, TRANS, M, N, K, MB, A, LDA,
-     $        T, LDT, C, LDC, WORK, INFO)
+     $        T, LDT, C, LDC, WORK, INFO )
         RETURN
       END IF
 *
@@ -401,7 +416,7 @@
 *
       END IF
 *
-      WORK(1) = LW
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       RETURN
 *
 *     End of CLAMSWLQ
diff --git a/lapack-netlib/SRC/clamtsqr.f b/lapack-netlib/SRC/clamtsqr.f
index 5677420ac..13625087f 100644
--- a/lapack-netlib/SRC/clamtsqr.f
+++ b/lapack-netlib/SRC/clamtsqr.f
@@ -128,22 +128,24 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) COMPLEX array, dimension (MAX(1,LWORK))
-*>
+*>          (workspace) COMPLEX array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
+*>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
+*>          If MIN(M,N,K) = 0, LWORK >= 1.
+*>          If SIDE = 'L', LWORK >= max(1,N*NB).
+*>          If SIDE = 'R', LWORK >= max(1,MB*NB).
 *>
-*>          If SIDE = 'L', LWORK >= max(1,N)*NB;
-*>          if SIDE = 'R', LWORK >= max(1,MB)*NB.
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>          only calculates the optimal size of the WORK array, returns
+*>          only calculates the minimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
-*>
 *> \endverbatim
+*>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
@@ -191,46 +193,51 @@
 *>     SIAM J. Sci. Comput, vol. 34, no. 1, 2012
 *> \endverbatim
 *>
+*> \ingroup lamtsqr
+*>
 *  =====================================================================
       SUBROUTINE CLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T,
-     $        LDT, C, LDC, WORK, LWORK, INFO )
+     $                     LDT, C, LDC, WORK, LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
-      CHARACTER         SIDE, TRANS
-      INTEGER           INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC
+      CHARACTER          SIDE, TRANS
+      INTEGER            INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC
 *     ..
 *     .. Array Arguments ..
-      COMPLEX        A( LDA, * ), WORK( * ), C(LDC, * ),
-     $                T( LDT, * )
+      COMPLEX            A( LDA, * ), WORK( * ), C( LDC, * ),
+     $                   T( LDT, * )
 *     ..
 *
 * =====================================================================
 *
 *     ..
 *     .. Local Scalars ..
-      LOGICAL    LEFT, RIGHT, TRAN, NOTRAN, LQUERY
-      INTEGER    I, II, KK, LW, CTR, Q
+      LOGICAL            LEFT, RIGHT, TRAN, NOTRAN, LQUERY
+      INTEGER            I, II, KK, LW, CTR, Q, MINMNK, LWMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
+*     ..
 *     .. External Subroutines ..
-      EXTERNAL   CGEMQRT, CTPMQRT, XERBLA
+      EXTERNAL           CGEMQRT, CTPMQRT, XERBLA
 *     ..
 *     .. Executable Statements ..
 *
 *     Test the input arguments
 *
-      LQUERY  = LWORK.LT.0
+      INFO = 0
+      LQUERY  = ( LWORK.EQ.-1 )
       NOTRAN  = LSAME( TRANS, 'N' )
       TRAN    = LSAME( TRANS, 'C' )
       LEFT    = LSAME( SIDE, 'L' )
       RIGHT   = LSAME( SIDE, 'R' )
-      IF (LEFT) THEN
+      IF( LEFT ) THEN
         LW = N * NB
         Q = M
       ELSE
@@ -238,11 +245,17 @@
         Q = N
       END IF
 *
-      INFO = 0
+      MINMNK = MIN( M, N, K )
+      IF( MINMNK.EQ.0 ) THEN
+        LWMIN = 1
+      ELSE
+        LWMIN = MAX( 1, LW )
+      END IF
+*
       IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN
-         INFO = -1
+        INFO = -1
       ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN
-         INFO = -2
+        INFO = -2
       ELSE IF( M.LT.K ) THEN
         INFO = -3
       ELSE IF( N.LT.0 ) THEN
@@ -253,38 +266,38 @@
         INFO = -7
       ELSE IF( LDA.LT.MAX( 1, Q ) ) THEN
         INFO = -9
-      ELSE IF( LDT.LT.MAX( 1, NB) ) THEN
+      ELSE IF( LDT.LT.MAX( 1, NB ) ) THEN
         INFO = -11
       ELSE IF( LDC.LT.MAX( 1, M ) ) THEN
-         INFO = -13
-      ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN
+        INFO = -13
+      ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN
         INFO = -15
       END IF
 *
-*     Determine the block size if it is tall skinny or short and wide
-*
-      IF( INFO.EQ.0)  THEN
-          WORK(1) = LW
+      IF( INFO.EQ.0 ) THEN
+        WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       END IF
 *
       IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'CLAMTSQR', -INFO )
         RETURN
-      ELSE IF (LQUERY) THEN
-       RETURN
+      ELSE IF( LQUERY ) THEN
+        RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( MIN(M,N,K).EQ.0 ) THEN
+      IF( MINMNK.EQ.0 ) THEN
         RETURN
       END IF
+*
+*     Determine the block size if it is tall skinny or short and wide
 *
       IF((MB.LE.K).OR.(MB.GE.MAX(M,N,K))) THEN
         CALL CGEMQRT( SIDE, TRANS, M, N, K, NB, A, LDA,
-     $        T, LDT, C, LDC, WORK, INFO)
+     $        T, LDT, C, LDC, WORK, INFO )
         RETURN
-       END IF
+      END IF
 *
       IF(LEFT.AND.NOTRAN) THEN
 *
@@ -409,7 +422,7 @@
 *
       END IF
 *
-      WORK(1) = LW
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       RETURN
 *
 *     End of CLAMTSQR
diff --git a/lapack-netlib/SRC/claqp2rk.c b/lapack-netlib/SRC/claqp2rk.c
new file mode 100644
index 000000000..4184c5927
--- /dev/null
+++ b/lapack-netlib/SRC/claqp2rk.c
@@ -0,0 +1,943 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) { /* returns x**n for an integer exponent n */
+	float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x; /* negative exponent: compute (1/x)**|n| */
+		for(u = n; ; ) { /* binary exponentiation over the bits of u */
+			if(u & 01) pow *= x; /* low bit set: fold current power of x into the result */
+			if(u >>= 1) x *= x; /* more bits remain: square the base */
+			else break;
+		}
+	}
+	return pow;
+}
+static double dpow_ui(double x, integer n) { /* returns x**n for an integer exponent n */
+	double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x; /* negative exponent: compute (1/x)**|n| */
+		for(u = n; ; ) { /* binary exponentiation over the bits of u */
+			if(u & 01) pow *= x; /* low bit set: fold current power of x into the result */
+			if(u >>= 1) x *= x; /* more bits remain: square the base */
+			else break;
+		}
+	}
+	return pow;
+}
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) { /* MSVC variant: integer power of a single-precision complex */
+	complex pow={1.0,0.0}; unsigned long int u;
+		if(n != 0) {
+		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; /* NOTE(review): inverts the parts separately, not a complex reciprocal - confirm */
+		for(u = n; ; ) {
+			if(u & 01) pow.r *= x.r, pow.i *= x.i; /* NOTE(review): part-wise product, not a complex multiply - confirm */
+			if(u >>= 1) x.r *= x.r, x.i *= x.i; /* NOTE(review): part-wise squaring, not a complex square - confirm */
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) { /* returns x**n using native complex arithmetic */
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x; /* negative exponent: compute (1/x)**|n| */
+		for(u = n; ; ) { /* binary exponentiation over the bits of u */
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) { /* MSVC variant: integer power of a double-precision complex */
+	_Dcomplex pow={1.0,0.0}; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; /* NOTE(review): inverts the parts separately, not a complex reciprocal - confirm */
+		for(u = n; ; ) {
+			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; /* NOTE(review): part-wise product, not a complex multiply - confirm */
+			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; /* NOTE(review): part-wise squaring, not a complex square - confirm */
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) { /* returns x**n using native complex arithmetic */
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x; /* negative exponent: compute (1/x)**|n| */
+		for(u = n; ; ) { /* binary exponentiation over the bits of u */
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) { /* integer power x**n with integer result */
+	integer pow; unsigned long int u;
+	if (n <= 0) { /* non-positive exponents are resolved here without the loop, except x == -1 */
+		if (n == 0 || x == 1) pow = 1;
+		else if (x != -1) pow = x == 0 ? 1/x : 0; /* result is 0 unless x == 0, where 1/x deliberately divides by zero */
+		else n = -n; /* x == -1: the sign of the exponent is irrelevant, use |n| in the loop below */
+	}
+	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { /* positive exponent (including the negated x == -1 case) */
+		u = n;
+		for(pow = 1; ; ) { /* binary exponentiation over the bits of u */
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n) /* n is not used in the body */
+{
+	double m; integer i, mi; /* m: running maximum, mi: its 1-based index in w */
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++) /* scan 1-based positions s..e of w */
+		if (w[i-1]>m) mi=i ,m=w[i-1]; /* strict '>' keeps the first occurrence of the maximum */
+	return mi-s+1; /* position of the maximum relative to s, counted from 1 */
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n) /* n is not used in the body */
+{
+	float m; integer i, mi; /* m: running maximum, mi: its 1-based index in w */
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++) /* scan 1-based positions s..e of w */
+		if (w[i-1]>m) mi=i ,m=w[i-1]; /* strict '>' keeps the first occurrence of the maximum */
+	return mi-s+1; /* position of the maximum relative to s, counted from 1 */
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i; /* *z = sum over i<n of conj(x[i*incx]) * y[i*incy] */
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0}; /* MSVC has no native complex operators: expand conj(x)*y by hand */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + conjg(x(i)) * y(i) */
+			zdotc._Val[0] += x[i].r * y[i].r + x[i].i * y[i].i;
+			zdotc._Val[1] += x[i].r * y[i].i - x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + conjg(x(i*incx)) * y(i*incy) */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r + x[i*incx].i * y[i*incy].i;
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i - x[i*incx].i * y[i*incy].r;
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + conjg(x(i)) * y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + conjg(x(i*incx)) * y(i*incy) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i; /* *z = sum over i<n of conj(x[i*incx]) * y[i*incy] */
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0}; /* MSVC has no native complex operators: expand conj(x)*y by hand */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i)) * y(i) */
+			zdotc._Val[0] += x[i].r * y[i].r + x[i].i * y[i].i;
+			zdotc._Val[1] += x[i].r * y[i].i - x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i*incx)) * y(i*incy) */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r + x[i*incx].i * y[i*incy].i;
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i - x[i*incx].i * y[i*incy].r;
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i)) * y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i*incx)) * y(i*incy) */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif	
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i; /* *z = sum over i<n of x[i*incx] * y[i*incy], no conjugation */
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0}; /* MSVC has no native complex operators: expand x*y by hand */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc._Val[0] += x[i].r * y[i].r - x[i].i * y[i].i;
+			zdotc._Val[1] += x[i].r * y[i].i + x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i*incx) * y(i*incy) */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r - x[i*incx].i * y[i*incy].i;
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i + x[i*incx].i * y[i*incy].r;
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i*incx) * y(i*incy) */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i; /* *z = sum over i<n of x[i*incx] * y[i*incy], no conjugation */
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0}; /* MSVC has no native complex operators: expand x*y by hand */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc._Val[0] += x[i].r * y[i].r - x[i].i * y[i].i;
+			zdotc._Val[1] += x[i].r * y[i].i + x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i*incx) * y(i*incy) */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r - x[i*incx].i * y[i*incy].i;
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i + x[i*incx].i * y[i*incy].r;
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i*incx) * y(i*incy) */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static integer c__1 = 1;
+
+/* Subroutine */ int claqp2rk_(integer *m, integer *n, integer *nrhs, integer 
+	*ioffset, integer *kmax, real *abstol, real *reltol, integer *kp1, 
+	real *maxc2nrm, complex *a, integer *lda, integer *k, real *maxc2nrmk,
+	 real *relmaxc2nrmk, integer *jpiv, complex *tau, real *vn1, real *
+	vn2, complex *work, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2, i__3;
+    real r__1;
+    complex q__1;
+
+    /* Local variables */
+    complex aikk;
+    real temp, temp2;
+    integer i__, j;
+    real tol3z;
+    integer jmaxc2nrm;
+    extern /* Subroutine */ int clarf_(char *, integer *, integer *, complex *
+	    , integer *, complex *, complex *, integer *, complex *), 
+	    cswap_(integer *, complex *, integer *, complex *, integer *);
+    integer itemp, minmnfact;
+    real myhugeval;
+    integer minmnupdt;
+    extern real scnrm2_(integer *, complex *, integer *);
+    integer kk, kp;
+    extern /* Subroutine */ int clarfg_(integer *, complex *, complex *, 
+	    integer *, complex *);
+    extern real slamch_(char *);
+    extern integer isamax_(integer *, real *, integer *);
+    real taunan;
+    extern logical sisnan_(real *);
+
+/*  Purpose: column-pivoted Householder QR of A(IOFFSET+1:M,1:N), at most KMAX steps; K returns the count done. */
+/*  -- LAPACK auxiliary routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+
+/*  ===================================================================== */
+
+
+/*     Initialize INFO */
+
+    /* Parameter adjustments: shift the pointers so arrays are indexed from 1, as in the Fortran source */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --jpiv;
+    --tau;
+    --vn1;
+    --vn2;
+    --work;
+
+    /* Function Body */
+    *info = 0;
+
+/*     MINMNFACT is the smallest dimension of the submatrix */
+/*     A(IOFFSET+1:M,1:N) to be factorized. */
+
+/*     MINMNUPDT is the smallest dimension */
+/*     of the subarray A(IOFFSET+1:M,1:N+NRHS) to be updated, which */
+/*     contains the submatrices A(IOFFSET+1:M,1:N) and */
+/*     B(IOFFSET+1:M,1:NRHS) as column blocks. */
+
+/* Computing MIN */
+    i__1 = *m - *ioffset;
+    minmnfact = f2cmin(i__1,*n);
+/* Computing MIN */
+    i__1 = *m - *ioffset, i__2 = *n + *nrhs;
+    minmnupdt = f2cmin(i__1,i__2);
+    *kmax = f2cmin(*kmax,minmnfact);
+    tol3z = sqrt(slamch_("Epsilon"));
+    myhugeval = slamch_("Overflow");
+
+/*     Compute the factorization, KK is the main loop index. */
+
+    i__1 = *kmax;
+    for (kk = 1; kk <= i__1; ++kk) {
+
+	i__ = *ioffset + kk;
+
+	if (i__ == 1) {
+
+/*           ============================================================ */
+
+/*           We are at the first column of the original whole matrix A, */
+/*           therefore we use the computed KP1 and MAXC2NRM from the */
+/*           main routine. */
+
+	    kp = *kp1;
+
+/*           ============================================================ */
+
+	} else {
+
+/*           ============================================================ */
+
+/*           Determine the pivot column in KK-th step, i.e. the index */
+/*           of the column with the maximum 2-norm in the */
+/*           submatrix A(I:M,K:N). */
+
+	    i__2 = *n - kk + 1;
+	    kp = kk - 1 + isamax_(&i__2, &vn1[kk], &c__1);
+
+/*           Determine the maximum column 2-norm and the relative maximum */
+/*           column 2-norm of the submatrix A(I:M,KK:N) in step KK. */
+/*           RELMAXC2NRMK  will be computed later, after some condition */
+/*           checks on MAXC2NRMK. */
+
+	    *maxc2nrmk = vn1[kp];
+
+/*           ============================================================ */
+
+/*           Check if the submatrix A(I:M,KK:N) contains NaN, and set */
+/*           INFO parameter to the column number, where the first NaN */
+/*           is found and return from the routine. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (sisnan_(maxc2nrmk)) {
+
+/*              Set K, the number of factorized columns. */
+/*              that are not zero. */
+
+		*k = kk - 1;
+		*info = *k + kp;
+
+/*               Set RELMAXC2NRMK to NaN. */
+
+		*relmaxc2nrmk = *maxc2nrmk;
+
+/*               Array TAU(K+1:MINMNFACT) is not set and contains */
+/*               undefined elements. */
+
+		return 0;
+	    }
+
+/*           ============================================================ */
+
+/*           Quick return, if the submatrix A(I:M,KK:N) is */
+/*           a zero matrix. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (*maxc2nrmk == 0.f) {
+
+/*              Set K, the number of factorized columns. */
+/*              that are not zero. */
+
+		*k = kk - 1;
+		*relmaxc2nrmk = 0.f;
+
+/*              Set TAUs corresponding to the columns that were not */
+/*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO. */
+
+		i__2 = minmnfact;
+		for (j = kk; j <= i__2; ++j) {
+		    i__3 = j;
+		    tau[i__3].r = 0.f, tau[i__3].i = 0.f;
+		}
+
+/*              Return from the routine. */
+
+		return 0;
+
+	    }
+
+/*           ============================================================ */
+
+/*           Check if the submatrix A(I:M,KK:N) contains Inf, */
+/*           set INFO parameter to the column number, where */
+/*           the first Inf is found plus N, and continue */
+/*           the computation. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (*info == 0 && *maxc2nrmk > myhugeval) {
+		*info = *n + kk - 1 + kp;
+	    }
+
+/*           ============================================================ */
+
+/*           Test for the second and third stopping criteria. */
+/*           NOTE: There is no need to test for ABSTOL >= ZERO, since */
+/*           MAXC2NRMK is non-negative. Similarly, there is no need */
+/*           to test for RELTOL >= ZERO, since RELMAXC2NRMK is */
+/*           non-negative. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+	    *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm;
+
+	    if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) {
+
+/*              Set K, the number of factorized columns. */
+
+		*k = kk - 1;
+
+/*              Set TAUs corresponding to the columns that were not */
+/*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO. */
+
+		i__2 = minmnfact;
+		for (j = kk; j <= i__2; ++j) {
+		    i__3 = j;
+		    tau[i__3].r = 0.f, tau[i__3].i = 0.f;
+		}
+
+/*              Return from the routine. */
+
+		return 0;
+
+	    }
+
+/*           ============================================================ */
+
+/*           End ELSE of IF(I.EQ.1) */
+
+	}
+
+/*        =============================================================== */
+
+/*        If the pivot column is not the first column of the */
+/*        subblock A(1:M,KK:N): */
+/*        1) swap the KK-th column and the KP-th pivot column */
+/*           in A(1:M,1:N); */
+/*        2) copy the KK-th element into the KP-th element of the partial */
+/*           and exact 2-norm vectors VN1 and VN2. ( Swap is not needed */
+/*           for VN1 and VN2 since we use the element with the index */
+/*           larger than KK in the next loop step.) */
+/*        3) Save the pivot interchange with the indices relative to */
+/*           the original matrix A, not the block A(1:M,1:N). */
+
+	if (kp != kk) {
+	    cswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[kk * a_dim1 + 1], &c__1);
+	    vn1[kp] = vn1[kk];
+	    vn2[kp] = vn2[kk];
+	    itemp = jpiv[kp];
+	    jpiv[kp] = jpiv[kk];
+	    jpiv[kk] = itemp;
+	}
+
+/*        Generate elementary reflector H(KK) using the column A(I:M,KK), */
+/*        if the column has more than one element, otherwise */
+/*        the elementary reflector would be an identity matrix, */
+/*        and TAU(KK) = CZERO. */
+
+	if (i__ < *m) {
+	    i__2 = *m - i__ + 1;
+	    clarfg_(&i__2, &a[i__ + kk * a_dim1], &a[i__ + 1 + kk * a_dim1], &
+		    c__1, &tau[kk]);
+	} else {
+	    i__2 = kk;
+	    tau[i__2].r = 0.f, tau[i__2].i = 0.f;
+	}
+
+/*        Check if TAU(KK) contains NaN, set INFO parameter */
+/*        to the column number where NaN is found and return from */
+/*        the routine. */
+/*        NOTE: There is no need to check TAU(KK) for Inf, */
+/*        since CLARFG cannot produce TAU(KK) or Householder vector */
+/*        below the diagonal containing Inf. Only BETA on the diagonal, */
+/*        returned by CLARFG can contain Inf, which requires */
+/*        TAU(KK) to contain NaN. Therefore, this case of generating Inf */
+/*        by CLARFG is covered by checking TAU(KK) for NaN. */
+
+	i__2 = kk;
+	r__1 = tau[i__2].r;
+	if (sisnan_(&r__1)) {
+	    i__2 = kk;
+	    taunan = tau[i__2].r;
+	} else /* if(complicated condition) */ {
+	    r__1 = r_imag(&tau[kk]);
+	    if (sisnan_(&r__1)) {
+		taunan = r_imag(&tau[kk]);
+	    } else {
+		taunan = 0.f;
+	    }
+	}
+
+	if (sisnan_(&taunan)) {
+	    *k = kk - 1;
+	    *info = kk;
+
+/*           Set MAXC2NRMK and  RELMAXC2NRMK to NaN. */
+
+	    *maxc2nrmk = taunan;
+	    *relmaxc2nrmk = taunan;
+
+/*           Array TAU(KK:MINMNFACT) is not set and contains */
+/*           undefined elements, except the first element TAU(KK) = NaN. */
+
+	    return 0;
+	}
+
+/*        Apply H(KK)**H to A(I:M,KK+1:N+NRHS) from the left. */
+/*        ( If M >= N, then at KK = N there is no residual matrix, */
+/*         i.e. no columns of A to update, only columns of B. */
+/*         If M < N, then at KK = M-IOFFSET, I = M and we have a */
+/*         one-row residual matrix in A and the elementary */
+/*         reflector is a unit matrix, TAU(KK) = CZERO, i.e. no update */
+/*         is needed for the residual matrix in A and the */
+/*         right-hand-side-matrix in B. */
+/*         Therefore, we update only if */
+/*         KK < MINMNUPDT = f2cmin(M-IOFFSET, N+NRHS) */
+/*         condition is satisfied, not only KK < N+NRHS ) */
+
+	if (kk < minmnupdt) {
+	    i__2 = i__ + kk * a_dim1;
+	    aikk.r = a[i__2].r, aikk.i = a[i__2].i;
+	    i__2 = i__ + kk * a_dim1;
+	    a[i__2].r = 1.f, a[i__2].i = 0.f;
+	    i__2 = *m - i__ + 1;
+	    i__3 = *n + *nrhs - kk;
+	    r_cnjg(&q__1, &tau[kk]);
+	    clarf_("Left", &i__2, &i__3, &a[i__ + kk * a_dim1], &c__1, &q__1, 
+		    &a[i__ + (kk + 1) * a_dim1], lda, &work[1]);
+	    i__2 = i__ + kk * a_dim1;
+	    a[i__2].r = aikk.r, a[i__2].i = aikk.i;
+	}
+
+	if (kk < minmnfact) {
+
+/*           Update the partial column 2-norms for the residual matrix, */
+/*           only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. */
+/*           when KK < f2cmin(M-IOFFSET, N). */
+
+	    i__2 = *n;
+	    for (j = kk + 1; j <= i__2; ++j) {
+		if (vn1[j] != 0.f) {
+
+/*                 NOTE: The following lines follow from the analysis in */
+/*                 Lapack Working Note 176. */
+
+/* Computing 2nd power */
+		    r__1 = c_abs(&a[i__ + j * a_dim1]) / vn1[j];
+		    temp = 1.f - r__1 * r__1;
+		    temp = f2cmax(temp,0.f);
+/* Computing 2nd power */
+		    r__1 = vn1[j] / vn2[j];
+		    temp2 = temp * (r__1 * r__1);
+		    if (temp2 <= tol3z) {
+
+/*                    Compute the column 2-norm for the partial */
+/*                    column A(I+1:M,J) by explicitly computing it, */
+/*                    and store it in both partial 2-norm vector VN1 */
+/*                    and exact column 2-norm vector VN2. */
+
+			i__3 = *m - i__;
+			vn1[j] = scnrm2_(&i__3, &a[i__ + 1 + j * a_dim1], &
+				c__1);
+			vn2[j] = vn1[j];
+
+		    } else {
+
+/*                    Update the column 2-norm for the partial */
+/*                    column A(I+1:M,J) by removing one */
+/*                    element A(I,J) and store it in partial */
+/*                    2-norm vector VN1. */
+
+			vn1[j] *= sqrt(temp);
+
+		    }
+		}
+	    }
+
+	}
+
+/*     End factorization loop */
+
+    }
+
+/*     If we reached this point, all columns have been factorized, */
+/*     i.e. no condition was triggered to exit the routine. */
+/*     Set the number of factorized columns. */
+
+    *k = *kmax;
+
+/*     We reached the end of the loop, i.e. all KMAX columns were */
+/*     factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before */
+/*     we return. */
+
+    if (*k < minmnfact) {
+
+	i__1 = *n - *k;
+	jmaxc2nrm = *k + isamax_(&i__1, &vn1[*k + 1], &c__1);
+	*maxc2nrmk = vn1[jmaxc2nrm];
+
+	if (*k == 0) {
+	    *relmaxc2nrmk = 1.f;
+	} else {
+	    *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm;
+	}
+
+    } else {
+	*maxc2nrmk = 0.f;
+	*relmaxc2nrmk = 0.f;
+    }
+
+/*     We reached the end of the loop, i.e. all KMAX columns were */
+/*     factorized, set TAUs corresponding to the columns that were */
+/*     not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to CZERO. */
+
+    i__1 = minmnfact;
+    for (j = *k + 1; j <= i__1; ++j) {
+	i__2 = j;
+	tau[i__2].r = 0.f, tau[i__2].i = 0.f;
+    }
+
+    return 0;
+
+/*     End of CLAQP2RK */
+
+} /* claqp2rk_ */
+
diff --git a/lapack-netlib/SRC/claqp2rk.f b/lapack-netlib/SRC/claqp2rk.f
new file mode 100644
index 000000000..6b1db085a
--- /dev/null
+++ b/lapack-netlib/SRC/claqp2rk.f
@@ -0,0 +1,726 @@
+*> \brief \b CLAQP2RK computes truncated QR factorization with column pivoting of a complex matrix block using Level 2 BLAS and overwrites a complex m-by-nrhs matrix B with Q**H * B.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CLAQP2RK + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/claqp2rk.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/claqp2rk.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/claqp2rk.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE CLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL,
+*     $                     KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK,
+*     $                     RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK,
+*     $                     INFO )
+*      IMPLICIT NONE
+*
+*     .. Scalar Arguments ..
+*      INTEGER            INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS
+*      REAL               ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+*     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+*      INTEGER            JPIV( * )
+*      REAL               VN1( * ), VN2( * )
+*      COMPLEX            A( LDA, * ), TAU( * ), WORK( * )
+*     $
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CLAQP2RK computes a truncated (rank K) or full rank Householder QR
+*> factorization with column pivoting of the complex matrix
+*> block A(IOFFSET+1:M,1:N) as
+*>
+*>   A * P(K) = Q(K) * R(K).
+*>
+*> The routine uses Level 2 BLAS. The block A(1:IOFFSET,1:N)
+*> is accordingly pivoted, but not factorized.
+*>
+*> The routine also overwrites the right-hand-sides matrix block B
+*> stored in A(IOFFSET+1:M,N+1:N+NRHS) with Q(K)**H * B.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of
+*>          columns of the matrix B. NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in] IOFFSET
+*> \verbatim
+*>          IOFFSET is INTEGER
+*>          The number of rows of the matrix A that must be pivoted
+*>          but not factorized. IOFFSET >= 0.
+*>
+*>          IOFFSET also represents the number of columns of the whole
+*>          original matrix A_orig that have been factorized
+*>          in the previous steps.
+*> \endverbatim
+*>
+*> \param[in] KMAX
+*> \verbatim
+*>          KMAX is INTEGER
+*>
+*>          The first factorization stopping criterion. KMAX >= 0.
+*>
+*>          The maximum number of columns of the matrix A to factorize,
+*>          i.e. the maximum factorization rank.
+*>
+*>          a) If KMAX >= min(M-IOFFSET,N), then this stopping
+*>                criterion is not used, factorize columns
+*>                depending on ABSTOL and RELTOL.
+*>
+*>          b) If KMAX = 0, then this stopping criterion is
+*>             satisfied on input and no column is factorized.
+*>             The matrices A and B and the array JPIV are not
+*>             modified, K is set to 0, and TAU(1:min(M-IOFFSET,N))
+*>             is set to zero.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is REAL, cannot be NaN.
+*>
+*>          The second factorization stopping criterion.
+*>
+*>          The absolute tolerance (stopping threshold) for
+*>          maximum column 2-norm of the residual matrix.
+*>          The algorithm converges (stops the factorization) when
+*>          the maximum column 2-norm of the residual matrix
+*>          is less than or equal to ABSTOL.
+*>
+*>          a) If ABSTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and RELTOL.
+*>                This includes the case ABSTOL = -Inf.
+*>
+*>          b) If 0.0 <= ABSTOL then the input value
+*>                of ABSTOL is used.
+*> \endverbatim
+*>
+*> \param[in] RELTOL
+*> \verbatim
+*>          RELTOL is REAL, cannot be NaN.
+*>
+*>          The third factorization stopping criterion.
+*>
+*>          The tolerance (stopping threshold) for the ratio of the
+*>          maximum column 2-norm of the residual matrix to the maximum
+*>          column 2-norm of the original matrix A_orig. The algorithm
+*>          converges (stops the factorization), when this ratio is
+*>          less than or equal to RELTOL.
+*>
+*>          a) If RELTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and ABSTOL.
+*>                This includes the case RELTOL = -Inf.
+*>
+*>          b) If 0.0 <= RELTOL then the input value of RELTOL
+*>                is used.
+*> \endverbatim
+*>
+*> \param[in] KP1
+*> \verbatim
+*>          KP1 is INTEGER
+*>          The index of the column with the maximum 2-norm in
+*>          the whole original matrix A_orig determined in the
+*>          main routine CGEQP3RK. 1 <= KP1 <= N_orig_mat.
+*> \endverbatim
+*>
+*> \param[in] MAXC2NRM
+*> \verbatim
+*>          MAXC2NRM is REAL
+*>          The maximum column 2-norm of the whole original
+*>          matrix A_orig computed in the main routine CGEQP3RK.
+*>          MAXC2NRM >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX array, dimension (LDA,N+NRHS)
+*>          On entry:
+*>              the M-by-N matrix A and M-by-NRHS matrix B, as in
+*>
+*>                                  N     NRHS
+*>              array_A   =   M  [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>          1. The elements in block A(IOFFSET+1:M,1:K) below
+*>             the diagonal together with the array TAU represent
+*>             the unitary matrix Q(K) as a product of elementary
+*>             reflectors.
+*>          2. The upper triangular block of the matrix A stored
+*>             in A(IOFFSET+1:M,1:K) is the triangular factor obtained.
+*>          3. The block of the matrix A stored in A(1:IOFFSET,1:N)
+*>             has been accordingly pivoted, but not factorized.
+*>          4. The rest of the array A, block A(IOFFSET+1:M,K+1:N+NRHS).
+*>             The left part A(IOFFSET+1:M,K+1:N) of this block
+*>             contains the residual of the matrix A, and,
+*>             if NRHS > 0, the right part of the block
+*>             A(IOFFSET+1:M,N+1:N+NRHS) contains the block of
+*>             the right-hand-side matrix B. Both these blocks have been
+*>             updated by multiplication from the left by Q(K)**H.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] K
+*> \verbatim
+*>          K is INTEGER
+*>          Factorization rank of the matrix A, i.e. the rank of
+*>          the factor R, which is the same as the number of non-zero
+*>          rows of the factor R. 0 <= K <= min(M-IOFFSET,KMAX,N).
+*>
+*>          K also represents the number of non-zero Householder
+*>          vectors.
+*> \endverbatim
+*>
+*> \param[out] MAXC2NRMK
+*> \verbatim
+*>          MAXC2NRMK is REAL
+*>          The maximum column 2-norm of the residual matrix,
+*>          when the factorization stopped at rank K. MAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] RELMAXC2NRMK
+*> \verbatim
+*>          RELMAXC2NRMK is REAL
+*>          The ratio MAXC2NRMK / MAXC2NRM of the maximum column
+*>          2-norm of the residual matrix (when the factorization
+*>          stopped at rank K) to the maximum column 2-norm of the
+*>          whole original matrix A. RELMAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] JPIV
+*> \verbatim
+*>          JPIV is INTEGER array, dimension (N)
+*>          Column pivot indices, for 1 <= j <= N, column j
+*>          of the matrix A was interchanged with column JPIV(j).
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is COMPLEX array, dimension (min(M-IOFFSET,N))
+*>          The scalar factors of the elementary reflectors.
+*> \endverbatim
+*>
+*> \param[in,out] VN1
+*> \verbatim
+*>          VN1 is REAL array, dimension (N)
+*>          The vector with the partial column norms.
+*> \endverbatim
+*>
+*> \param[in,out] VN2
+*> \verbatim
+*>          VN2 is REAL array, dimension (N)
+*>          The vector with the exact column norms.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX array, dimension (N+NRHS-1)
+*>          Used in CLARF subroutine to apply an elementary
+*>          reflector from the left.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          1) INFO = 0: successful exit.
+*>          2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was
+*>             detected and the routine stops the computation.
+*>             The j_1-th column of the matrix A or the j_1-th
+*>             element of array TAU contains the first occurrence
+*>             of NaN in the factorization step K+1 ( when K columns
+*>             have been factorized ).
+*>
+*>             On exit:
+*>             K                  is set to the number of
+*>                                   factorized columns without
+*>                                   exception.
+*>             MAXC2NRMK          is set to NaN.
+*>             RELMAXC2NRMK       is set to NaN.
+*>             TAU(K+1:min(M,N))  is not set and contains undefined
+*>                                   elements. If j_1=K+1, TAU(K+1)
+*>                                   may contain NaN.
+*>          3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN
+*>             was detected, but +Inf (or -Inf) was detected and
+*>             the routine continues the computation until completion.
+*>             The (j_2-N)-th column of the matrix A contains the first
+*>             occurrence of +Inf (or -Inf) in the factorization
+*>             step K+1 ( when K columns have been factorized ).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup laqp2rk
+*
+*> \par References:
+*  ================
+*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996.
+*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain.
+*> X. Sun, Computer Science Dept., Duke University, USA.
+*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA.
+*> A BLAS-3 version of the QR factorization with column pivoting.
+*> LAPACK Working Note 114
+*> \htmlonly
+*> <a href="https://www.netlib.org/lapack/lawnspdf/lawn114.pdf">https://www.netlib.org/lapack/lawnspdf/lawn114.pdf</a>
+*> \endhtmlonly
+*> and in
+*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998.
+*> \htmlonly
+*> <a href="https://doi.org/10.1137/S1064827595296732">https://doi.org/10.1137/S1064827595296732</a>
+*> \endhtmlonly
+*>
+*> [2] A partial column norm updating strategy developed in 2006.
+*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia.
+*> On the failure of rank revealing QR factorization software – a case study.
+*> LAPACK Working Note 176.
+*> \htmlonly
+*> <a href="http://www.netlib.org/lapack/lawnspdf/lawn176.pdf">http://www.netlib.org/lapack/lawnspdf/lawn176.pdf</a>
+*> \endhtmlonly
+*> and in
+*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages.
+*> \htmlonly
+*> <a href="https://doi.org/10.1145/1377612.1377616">https://doi.org/10.1145/1377612.1377616</a>
+*> \endhtmlonly
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*>  November  2023, Igor Kozachenko, James Demmel,
+*>                  EECS Department,
+*>                  University of California, Berkeley, USA.
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE CLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL,
+     $                     KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK,
+     $                     RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK,
+     $                     INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS
+      REAL               ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+      INTEGER            JPIV( * )
+      REAL               VN1( * ), VN2( * )
+      COMPLEX            A( LDA, * ), TAU( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+      COMPLEX            CZERO, CONE
+      PARAMETER          ( CZERO = ( 0.0E+0, 0.0E+0 ),
+     $                   CONE = ( 1.0E+0, 0.0E+0 ) )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            I, ITEMP, J, JMAXC2NRM, KK, KP, MINMNFACT,
+     $                   MINMNUPDT
+      REAL               HUGEVAL, TAUNAN, TEMP, TEMP2, TOL3Z
+      COMPLEX            AIKK
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           CLARF, CLARFG, CSWAP
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, REAL, CONJG, AIMAG, MAX, MIN, SQRT
+*     ..
+*     .. External Functions ..
+      LOGICAL            SISNAN
+      INTEGER            ISAMAX
+      REAL               SLAMCH, SCNRM2
+      EXTERNAL           SISNAN, SLAMCH, ISAMAX, SCNRM2
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize INFO
+*
+      INFO = 0
+*
+*     MINMNFACT is the smallest dimension of the submatrix
+*     A(IOFFSET+1:M,1:N) to be factorized.
+*
+*     MINMNUPDT is the smallest dimension of the subarray
+*     A(IOFFSET+1:M,1:N+NRHS) to be updated, which contains the
+*     submatrices A(IOFFSET+1:M,1:N) and B(IOFFSET+1:M,1:NRHS)
+*     as column blocks. NOTE: KMAX is clamped to MINMNFACT in place.
+*
+      MINMNFACT = MIN( M-IOFFSET, N )
+      MINMNUPDT = MIN( M-IOFFSET, N+NRHS )
+      KMAX = MIN( KMAX, MINMNFACT )
+      TOL3Z = SQRT( SLAMCH( 'Epsilon' ) )
+      HUGEVAL = SLAMCH( 'Overflow' )
+*
+*     Compute the factorization, KK is the column loop index.
+*
+      DO KK = 1, KMAX
+*
+         I = IOFFSET + KK
+*
+         IF( I.EQ.1 ) THEN
+*
+*           ============================================================
+*
+*           We are at the first column of the original whole matrix A,
+*           therefore we use the computed KP1 and MAXC2NRM from the
+*           main routine.
+*
+            KP = KP1
+*
+*           ============================================================
+*
+         ELSE
+*
+*           ============================================================
+*
+*           Determine the pivot column in KK-th step, i.e. the index
+*           of the column with the maximum 2-norm in the
+*           submatrix A(I:M,KK:N).
+*
+            KP = ( KK-1 ) + ISAMAX( N-KK+1, VN1( KK ), 1 )
+*
+*           Determine the maximum column 2-norm and the relative maximum
+*           column 2-norm of the submatrix A(I:M,KK:N) in step KK.
+*           RELMAXC2NRMK will be computed later, after some condition
+*           checks on MAXC2NRMK.
+*
+            MAXC2NRMK = VN1( KP )
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,KK:N) contains NaN, and set
+*           INFO parameter to the column number, where the first NaN
+*           is found and return from the routine.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( SISNAN( MAXC2NRMK ) ) THEN
+*
+*              Set K, the number of factorized columns.
+*              NOTE(review): KP already lies in KK:N, so K + KP may
+*              exceed the documented 1 <= INFO <= N range - confirm.
+               K = KK - 1
+               INFO = K + KP
+*
+*              Set RELMAXC2NRMK to NaN.
+*
+               RELMAXC2NRMK = MAXC2NRMK
+*
+*              Array TAU(K+1:MINMNFACT) is not set and contains
+*              undefined elements.
+*
+               RETURN
+            END IF
+*
+*           ============================================================
+*
+*           Quick return, if the submatrix A(I:M,KK:N) is
+*           a zero matrix.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( MAXC2NRMK.EQ.ZERO ) THEN
+*
+*              Set K, the number of factorized columns
+*              that are not zero.
+*
+               K = KK - 1
+               RELMAXC2NRMK = ZERO
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO.
+*
+               DO J = KK, MINMNFACT
+                  TAU( J ) = CZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,KK:N) contains Inf,
+*           set INFO parameter to the column number, where
+*           the first Inf is found plus N, and continue
+*           the computation.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN
+               INFO = N + KK - 1 + KP
+            END IF
+*
+*           ============================================================
+*
+*           Test for the second and third stopping criteria.
+*           NOTE: There is no need to test for ABSTOL >= ZERO, since
+*           MAXC2NRMK is non-negative. Similarly, there is no need
+*           to test for RELTOL >= ZERO, since RELMAXC2NRMK is
+*           non-negative.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            RELMAXC2NRMK =  MAXC2NRMK / MAXC2NRM
+*
+            IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN
+*
+*              Set K, the number of factorized columns.
+*
+               K = KK - 1
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO.
+*
+               DO J = KK, MINMNFACT
+                  TAU( J ) = CZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           End ELSE of IF(I.EQ.1)
+*
+         END IF
+*
+*        ===============================================================
+*
+*        If the pivot column is not the first column of the
+*        subblock A(1:M,KK:N):
+*        1) swap the KK-th column and the KP-th pivot column
+*           in A(1:M,1:N);
+*        2) copy the KK-th element into the KP-th element of the partial
+*           and exact 2-norm vectors VN1 and VN2. ( Swap is not needed
+*           for VN1 and VN2 since we use the element with the index
+*           larger than KK in the next loop step.)
+*        3) Save the pivot interchange with the indices relative to the
+*           original matrix A, not the block A(1:M,1:N).
+*
+         IF( KP.NE.KK ) THEN
+            CALL CSWAP( M, A( 1, KP ), 1, A( 1, KK ), 1 )
+            VN1( KP ) = VN1( KK )
+            VN2( KP ) = VN2( KK )
+            ITEMP = JPIV( KP )
+            JPIV( KP ) = JPIV( KK )
+            JPIV( KK ) = ITEMP
+         END IF
+*
+*        Generate elementary reflector H(KK) using the column A(I:M,KK),
+*        if the column has more than one element, otherwise
+*        the elementary reflector would be an identity matrix,
+*        and TAU(KK) = CZERO.
+*
+         IF( I.LT.M ) THEN
+            CALL CLARFG( M-I+1, A( I, KK ), A( I+1, KK ), 1,
+     $                   TAU( KK ) )
+         ELSE
+            TAU( KK ) = CZERO
+         END IF
+*
+*        Check if TAU(KK) contains NaN, set INFO parameter
+*        to the column number where NaN is found and return from
+*        the routine.
+*        NOTE: There is no need to check TAU(KK) for Inf,
+*        since CLARFG cannot produce TAU(KK) or Householder vector
+*        below the diagonal containing Inf. Only BETA on the diagonal,
+*        returned by CLARFG can contain Inf, which requires
+*        TAU(KK) to contain NaN. Therefore, this case of generating Inf
+*        by CLARFG is covered by checking TAU(KK) for NaN.
+*        AIMAG is the standard intrinsic ( IMAG is an extension ).
+         IF( SISNAN( REAL( TAU(KK) ) ) ) THEN
+            TAUNAN = REAL( TAU(KK) )
+         ELSE IF( SISNAN( AIMAG( TAU(KK) ) ) ) THEN
+            TAUNAN = AIMAG( TAU(KK) )
+         ELSE
+            TAUNAN = ZERO
+         END IF
+*
+         IF( SISNAN( TAUNAN ) ) THEN
+            K = KK - 1
+            INFO = KK
+*
+*           Set MAXC2NRMK and RELMAXC2NRMK to NaN.
+*
+            MAXC2NRMK = TAUNAN
+            RELMAXC2NRMK = TAUNAN
+*
+*           Array TAU(KK:MINMNFACT) is not set and contains
+*           undefined elements, except the first element TAU(KK) = NaN.
+*
+            RETURN
+         END IF
+*
+*        Apply H(KK)**H to A(I:M,KK+1:N+NRHS) from the left.
+*        ( If M >= N, then at KK = N there is no residual matrix,
+*         i.e. no columns of A to update, only columns of B.
+*         If M < N, then at KK = M-IOFFSET, I = M and we have a
+*         one-row residual matrix in A and the elementary
+*         reflector is a unit matrix, TAU(KK) = CZERO, i.e. no update
+*         is needed for the residual matrix in A and the
+*         right-hand-side-matrix in B.
+*         Therefore, we update only if
+*         KK < MINMNUPDT = min(M-IOFFSET, N+NRHS)
+*         condition is satisfied, not only KK < N+NRHS )
+*
+         IF( KK.LT.MINMNUPDT ) THEN
+            AIKK = A( I, KK )
+            A( I, KK ) = CONE
+            CALL CLARF( 'Left', M-I+1, N+NRHS-KK, A( I, KK ), 1,
+     $                  CONJG( TAU( KK ) ), A( I, KK+1 ), LDA,
+     $                  WORK( 1 ) )
+            A( I, KK ) = AIKK
+         END IF
+*
+         IF( KK.LT.MINMNFACT ) THEN
+*
+*           Update the partial column 2-norms for the residual matrix,
+*           only if the residual matrix A(I+1:M,KK+1:N) exists, i.e.
+*           when KK < min(M-IOFFSET, N).
+*
+            DO J = KK + 1, N
+               IF( VN1( J ).NE.ZERO ) THEN
+*
+*                 NOTE: The following lines follow from the analysis in
+*                 Lapack Working Note 176.
+*
+                  TEMP = ONE - ( ABS( A( I, J ) ) / VN1( J ) )**2
+                  TEMP = MAX( TEMP, ZERO )
+                  TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2
+                  IF( TEMP2 .LE. TOL3Z ) THEN
+*
+*                    Compute the column 2-norm for the partial
+*                    column A(I+1:M,J) by explicitly computing it,
+*                    and store it in both partial 2-norm vector VN1
+*                    and exact column 2-norm vector VN2.
+*
+                     VN1( J ) = SCNRM2( M-I, A( I+1, J ), 1 )
+                     VN2( J ) = VN1( J )
+*
+                  ELSE
+*
+*                    Update the column 2-norm for the partial
+*                    column A(I+1:M,J) by removing one
+*                    element A(I,J) and store it in partial
+*                    2-norm vector VN1.
+*
+                     VN1( J ) = VN1( J )*SQRT( TEMP )
+*
+                  END IF
+               END IF
+            END DO
+*
+         END IF
+*
+*     End factorization loop
+*
+      END DO
+*
+*     If we reached this point, all columns have been factorized,
+*     i.e. no condition was triggered to exit the routine.
+*     Set the number of factorized columns.
+*
+      K = KMAX
+*
+*     We reached the end of the loop, i.e. all KMAX columns were
+*     factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before
+*     we return.
+*
+      IF( K.LT.MINMNFACT ) THEN
+*
+         JMAXC2NRM = K + ISAMAX( N-K, VN1( K+1 ), 1 )
+         MAXC2NRMK = VN1( JMAXC2NRM )
+*
+         IF( K.EQ.0 ) THEN
+            RELMAXC2NRMK = ONE
+         ELSE
+            RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM
+         END IF
+*
+      ELSE
+         MAXC2NRMK = ZERO
+         RELMAXC2NRMK = ZERO
+      END IF
+*
+*     We reached the end of the loop, i.e. all KMAX columns were
+*     factorized, set TAUs corresponding to the columns that were
+*     not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to CZERO.
+*
+      DO J = K + 1, MINMNFACT
+         TAU( J ) = CZERO
+      END DO
+*
+      RETURN
+*
+*     End of CLAQP2RK
+*
+      END
diff --git a/lapack-netlib/SRC/claqp3rk.c b/lapack-netlib/SRC/claqp3rk.c
new file mode 100644
index 000000000..ca305fab7
--- /dev/null
+++ b/lapack-netlib/SRC/claqp3rk.c
@@ -0,0 +1,1152 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64) /* _WIN64 builds: BLASLONG is long long */
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else /* all other targets: BLASLONG is long */
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64 /* ILP64 interface: blasint takes the width of BLASLONG */
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x) /* absolute value matching long long */
+#else
+#define blasabs(x) labs(x) /* absolute value matching long */
+#endif
+#else /* default interface: blasint is plain int */
+typedef int blasint;
+#define blasabs(x) abs(x) /* absolute value matching int */
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER /* this branch uses the _Fcomplex/_Dcomplex types */
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} /* copy the (r,i) fields of *z into an _Fcomplex value */
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} /* copy the (r,i) fields of *z into a _Dcomplex value */
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} /* pointer cast only, no copy; presumably relies on both types sharing one layout */
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} /* double-precision counterpart of _pCf */
+#else /* otherwise use the _Complex float/double types */
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} /* build r + i*I from the fields of *z */
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} /* double-precision counterpart of Cf */
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} /* pointer cast only, no copy; presumably relies on both types sharing one layout */
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} /* double-precision counterpart of _pCf */
+#endif
+#define pCf(z) (*_pCf(z)) /* dereferenced _pCf: assignable view of *z as a native single-precision complex */
+#define pCd(z) (*_pCd(z)) /* dereferenced _pCd: assignable view of *z as a native double-precision complex */
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER /* no native complex '/' here: divide via the struct fields (Smith's scaling) and store through c; operands are read first so c may alias a or b */
+#define c_div(c, a, b) { real __ar=(a)->r, __ai=(a)->i, __br=(b)->r, __bi=(b)->i, __t, __d; if (fabsf(__br) >= fabsf(__bi)) { __t=__bi/__br; __d=__br+__bi*__t; (c)->r=(__ar+__ai*__t)/__d; (c)->i=(__ai-__ar*__t)/__d; } else { __t=__br/__bi; __d=__br*__t+__bi; (c)->r=(__ar*__t+__ai)/__d; (c)->i=(__ai*__t-__ar)/__d; } }
+#define z_div(c, a, b) { doublereal __ar=(a)->r, __ai=(a)->i, __br=(b)->r, __bi=(b)->i, __t, __d; if (fabs(__br) >= fabs(__bi)) { __t=__bi/__br; __d=__br+__bi*__t; (c)->r=(__ar+__ai*__t)/__d; (c)->i=(__ai-__ar*__t)/__d; } else { __t=__br/__bi; __d=__br*__t+__bi; (c)->r=(__ar*__t+__ai)/__d; (c)->i=(__ai*__t-__ar)/__d; } }
+#else /* native _Complex division, stored through the pCf/pCd lvalue views */
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),Cd(B));} /* exponent B is a doublecomplex*, converted like A */
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) {
+	/* Returns x**n by square-and-multiply; a negative n is handled as (1/x)**(-n). */
+	float acc = 1.0; unsigned long int bits;
+	if(n == 0) return acc;
+	if(n < 0) { n = -n; x = 1/x; }
+	for(bits = n; ; x *= x) {
+		if(bits & 01) acc *= x;
+		bits >>= 1;
+		if(bits == 0) break;
+	}
+	return acc;
+}
+static double dpow_ui(double x, integer n) {
+	/* Returns x**n by square-and-multiply; a negative n is handled as (1/x)**(-n). */
+	double acc = 1.0; unsigned long int bits;
+	if(n == 0) return acc;
+	if(n < 0) { n = -n; x = 1/x; }
+	for(bits = n; ; x *= x) {
+		if(bits & 01) acc *= x;
+		bits >>= 1;
+		if(bits == 0) break;
+	}
+	return acc;
+}
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) { /* x**n by repeated squaring; complex ops spelled out in real arithmetic */
+	complex pow={1.0,0.0}; unsigned long int u; float t;
+	if(n != 0) {
+		if(n < 0) { n = -n; t = x.r*x.r + x.i*x.i; x.r = x.r/t; x.i = -x.i/t; } /* x = 1/x = conj(x)/|x|^2 */
+		for(u = n; ; ) {
+			if(u & 01) { t = pow.r*x.r - pow.i*x.i; pow.i = pow.r*x.i + pow.i*x.r; pow.r = t; } /* pow *= x */
+			if(u >>= 1) { t = x.r*x.r - x.i*x.i; x.i = 2*x.r*x.i; x.r = t; } /* x *= x */
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) { /* x**n by repeated squaring; n < 0 uses 1/x */
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) { /* x**n by repeated squaring; complex ops spelled out in real arithmetic */
+	_Dcomplex pow={1.0,0.0}; unsigned long int u; double t;
+	if(n != 0) {
+		if(n < 0) { n = -n; t = x._Val[0]*x._Val[0] + x._Val[1]*x._Val[1]; x._Val[0] = x._Val[0]/t; x._Val[1] = -x._Val[1]/t; } /* x = 1/x */
+		for(u = n; ; ) {
+			if(u & 01) { t = pow._Val[0]*x._Val[0] - pow._Val[1]*x._Val[1]; pow._Val[1] = pow._Val[0]*x._Val[1] + pow._Val[1]*x._Val[0]; pow._Val[0] = t; } /* pow *= x */
+			if(u >>= 1) { t = x._Val[0]*x._Val[0] - x._Val[1]*x._Val[1]; x._Val[1] = 2*x._Val[0]*x._Val[1]; x._Val[0] = t; } /* x *= x */
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) { /* x**n by repeated squaring; n < 0 uses 1/x */
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) {
+	/* Integer power x**n. For a negative exponent the result is 0 unless */
+	/* x is 1 or -1; x == 0 with n < 0 evaluates 1/x (integer division by */
+	/* zero). Positive exponents use square-and-multiply. */
+	integer result = 1; unsigned long int bits;
+	if (n == 0 || x == 1) return 1;
+	if (n < 0) {
+		if (x != -1) return x == 0 ? 1/x : 0;
+		n = -n;
+	}
+	for (bits = n; ; x *= x) {
+		if (bits & 01) result *= x;
+		bits >>= 1;
+		if (bits == 0) break;
+	}
+	return result;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{	/* 1-based position, counted from s, of the first largest entry of w(s:e); n is not used. */
+	integer pos, best = s; double top = w[s-1];
+	for(pos = s+1; pos <= e; ++pos)
+		if (w[pos-1] > top) { best = pos; top = w[pos-1]; }
+	return best-s+1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{	/* 1-based position, counted from s, of the first largest entry of w(s:e); n is not used. */
+	integer pos, best = s; float top = w[s-1];
+	for(pos = s+1; pos <= e; ++pos)
+		if (w[pos-1] > top) { best = pos; top = w[pos-1]; }
+	return best-s+1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i; /* *z = sum over i in [0,n) of conj(x[i*incx]) * y[i*incy] */
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0}; /* conj(x)*y expanded into real arithmetic on the _Val parts */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* re += xr*yr + xi*yi ; im += xr*yi - xi*yr */
+			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0] + Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
+			zdotc._Val[1] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[1] - Cf(&x[i])._Val[1] * Cf(&y[i])._Val[0];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* same update with strides; NOTE(review): negative strides index before x/y */
+			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0] + Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
+			zdotc._Val[1] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[1] - Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[0];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + conjg(x(i)) * y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* strided form; NOTE(review): negative strides index before x/y */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i; /* *z = sum over i in [0,n) of conj(x[i*incx]) * y[i*incy] */
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0}; /* conj(x)*y expanded into real arithmetic on the _Val parts */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* re += xr*yr + xi*yi ; im += xr*yi - xi*yr */
+			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0] + Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
+			zdotc._Val[1] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[1] - Cd(&x[i])._Val[1] * Cd(&y[i])._Val[0];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* same update with strides; NOTE(review): negative strides index before x/y */
+			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0] + Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
+			zdotc._Val[1] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[1] - Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[0];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i)) * y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* strided form; NOTE(review): negative strides index before x/y */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif	
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i; /* *z = sum over i in [0,n) of x[i*incx] * y[i*incy] (no conjugation) */
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0}; /* x*y expanded into real arithmetic on the _Val parts */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* re += xr*yr - xi*yi ; im += xr*yi + xi*yr */
+			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0] - Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
+			zdotc._Val[1] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[1] + Cf(&x[i])._Val[1] * Cf(&y[i])._Val[0];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* same update with strides; NOTE(review): negative strides index before x/y */
+			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0] - Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
+			zdotc._Val[1] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[1] + Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[0];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* strided form; NOTE(review): negative strides index before x/y */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i; /* *z = sum over i in [0,n) of x[i*incx] * y[i*incy] (no conjugation) */
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0}; /* x*y expanded into real arithmetic on the _Val parts */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* re += xr*yr - xi*yi ; im += xr*yi + xi*yr */
+			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0] - Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
+			zdotc._Val[1] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[1] + Cd(&x[i])._Val[1] * Cd(&y[i])._Val[0];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* same update with strides; NOTE(review): negative strides index before x/y */
+			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0] - Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
+			zdotc._Val[1] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[1] + Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[0];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* strided form; NOTE(review): negative strides index before x/y */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static complex c_b1 = {0.f,0.f};
+static complex c_b2 = {1.f,0.f};
+static integer c__1 = 1;
+
+/* Subroutine */ int claqp3rk_(integer *m, integer *n, integer *nrhs, integer 
+	*ioffset, integer *nb, real *abstol, real *reltol, integer *kp1, real 
+	*maxc2nrm, complex *a, integer *lda, logical *done, integer *kb, real 
+	*maxc2nrmk, real *relmaxc2nrmk, integer *jpiv, complex *tau, real *
+	vn1, real *vn2, complex *auxv, complex *f, integer *ldf, integer *
+	iwork, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, f_dim1, f_offset, i__1, i__2, i__3;
+    real r__1, r__2;
+    complex q__1;
+
+    /* Local variables */
+    real temp, temp2;
+    integer i__, j, k;
+    real tol3z;
+    extern /* Subroutine */ int cgemm_(char *, char *, integer *, integer *, 
+	    integer *, complex *, complex *, integer *, complex *, integer *, 
+	    complex *, complex *, integer *), cgemv_(char *, 
+	    integer *, integer *, complex *, complex *, integer *, complex *, 
+	    integer *, complex *, complex *, integer *), cswap_(
+	    integer *, complex *, integer *, complex *, integer *);
+    integer itemp, minmnfact;
+    real myhugeval;
+    integer minmnupdt;
+    extern real scnrm2_(integer *, complex *, integer *);
+    integer if__, kp;
+    extern /* Subroutine */ int clarfg_(integer *, complex *, complex *, 
+	    integer *, complex *);
+    extern real slamch_(char *);
+    integer lsticc;
+    extern integer isamax_(integer *, real *, integer *);
+    real taunan;
+    extern logical sisnan_(real *);
+    complex aik;
+
+
+/*  -- LAPACK auxiliary routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+/*  Factorizes at most NB columns of A by truncated QR with column pivoting. */
+/*  Always returns 0; status is reported through *info, *done and *kb. */
+
+/*  ===================================================================== */
+
+
+/*     Initialize INFO */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --jpiv;
+    --tau;
+    --vn1;
+    --vn2;
+    --auxv;
+    f_dim1 = *ldf;
+    f_offset = 1 + f_dim1 * 1;
+    f -= f_offset;
+    --iwork;
+
+    /* Function Body */
+    *info = 0;
+
+/*     MINMNFACT is the smallest dimension of the submatrix */
+/*     A(IOFFSET+1:M,1:N) to be factorized. */
+
+/* Computing MIN */
+    i__1 = *m - *ioffset;
+    minmnfact = f2cmin(i__1,*n);
+/* Computing MIN */
+    i__1 = *m - *ioffset, i__2 = *n + *nrhs;
+    minmnupdt = f2cmin(i__1,i__2);
+    *nb = f2cmin(*nb,minmnfact);
+    tol3z = sqrt(slamch_("Epsilon"));
+    myhugeval = slamch_("Overflow");
+
+/*     Compute factorization in a while loop over NB columns, */
+/*     K is the column index in the block A(1:M,1:N). */
+
+    k = 0;
+    lsticc = 0;
+    *done = FALSE_;
+
+    while(k < *nb && lsticc == 0) {
+	++k;
+	i__ = *ioffset + k;
+
+	if (i__ == 1) {
+
+/*           We are at the first column of the original whole matrix A_orig, */
+/*           therefore we use the computed KP1 and MAXC2NRM from the */
+/*           main routine. */
+
+	    kp = *kp1;
+
+	} else {
+
+/*           Determine the pivot column in K-th step, i.e. the index */
+/*           of the column with the maximum 2-norm in the */
+/*           submatrix A(I:M,K:N). */
+
+	    i__1 = *n - k + 1;
+	    kp = k - 1 + isamax_(&i__1, &vn1[k], &c__1);
+
+/*           Determine the maximum column 2-norm and the relative maximum */
+/*           column 2-norm of the submatrix A(I:M,K:N) in step K. */
+
+	    *maxc2nrmk = vn1[kp];
+
+/*           ============================================================ */
+
+/*           Check if the submatrix A(I:M,K:N) contains NaN, set */
+/*           INFO parameter to the column number, where the first NaN */
+/*           is found and return from the routine. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (sisnan_(maxc2nrmk)) {
+
+		*done = TRUE_;
+
+/*              Set KB, the number of factorized partial columns */
+/*                      that are non-zero in each step in the block, */
+/*                      i.e. the rank of the factor R. */
+/*              Set IF, the number of processed rows in the block, which */
+/*                      is the same as the number of processed rows in */
+/*                      the original whole matrix A_orig. */
+
+		*kb = k - 1;
+		if__ = i__ - 1;
+		*info = *kb + kp;
+
+/*              Set RELMAXC2NRMK to NaN. */
+
+		*relmaxc2nrmk = *maxc2nrmk;
+
+/*              There is no need to apply the block reflector to the */
+/*              residual of the matrix A stored in A(KB+1:M,KB+1:N), */
+/*              since the submatrix contains NaN and we stop */
+/*              the computation. */
+/*              But, we need to apply the block reflector to the residual */
+/*              right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */
+/*              residual right hand sides exist.  This occurs */
+/*              when ( NRHS > 0 AND KB < (M-IOFFSET) ): */
+
+/*              A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */
+/*                               A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. */
+		if (*nrhs > 0 && *kb < *m - *ioffset) {
+		    i__1 = *m - if__;
+		    q__1.r = -1.f, q__1.i = 0.f;
+		    cgemm_("No transpose", "Conjugate transpose", &i__1, nrhs,
+			     kb, &q__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 
+			    + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * 
+			    a_dim1], lda);
+		}
+
+/*              There is no need to recompute the 2-norm of the */
+/*              difficult columns, since we stop the factorization. */
+
+/*              Array TAU(KB+1:MINMNFACT) is not set and contains */
+/*              undefined elements. */
+
+/*              Return from the routine. */
+
+		return 0;
+	    }
+
+/*           Quick return, if the submatrix A(I:M,K:N) is */
+/*           a zero matrix. We need to check it only if the column index */
+/*           (same as row index) is larger than 1, since the condition */
+/*           for the whole original matrix A_orig is checked in the main */
+/*           routine. */
+
+	    if (*maxc2nrmk == 0.f) {
+
+		*done = TRUE_;
+
+/*              Set KB, the number of factorized partial columns */
+/*                      that are non-zero in each step in the block, */
+/*                      i.e. the rank of the factor R. */
+/*              Set IF, the number of processed rows in the block, which */
+/*                      is the same as the number of processed rows in */
+/*                      the original whole matrix A_orig. */
+
+		*kb = k - 1;
+		if__ = i__ - 1;
+		*relmaxc2nrmk = 0.f;
+
+/*              There is no need to apply the block reflector to the */
+/*              residual of the matrix A stored in A(KB+1:M,KB+1:N), */
+/*              since the submatrix is zero and we stop the computation. */
+/*              But, we need to apply the block reflector to the residual */
+/*              right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */
+/*              residual right hand sides exist.  This occurs */
+/*              when ( NRHS > 0 AND KB < (M-IOFFSET) ): */
+
+/*              A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */
+/*                               A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. */
+
+		if (*nrhs > 0 && *kb < *m - *ioffset) {
+		    i__1 = *m - if__;
+		    q__1.r = -1.f, q__1.i = 0.f;
+		    cgemm_("No transpose", "Conjugate transpose", &i__1, nrhs,
+			     kb, &q__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 
+			    + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * 
+			    a_dim1], lda);
+		}
+
+/*              There is no need to recompute the 2-norm of the */
+/*              difficult columns, since we stop the factorization. */
+
+/*              Set TAUs corresponding to the columns that were not */
+/*              factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, */
+/*              which is equivalent to setting TAU(K:MINMNFACT) = CZERO. */
+
+		i__1 = minmnfact;
+		for (j = k; j <= i__1; ++j) {
+		    i__2 = j;
+		    tau[i__2].r = 0.f, tau[i__2].i = 0.f;
+		}
+
+/*              Return from the routine. */
+
+		return 0;
+
+	    }
+
+/*           ============================================================ */
+
+/*           Check if the submatrix A(I:M,K:N) contains Inf, */
+/*           set INFO parameter to the column number, where */
+/*           the first Inf is found plus N, and continue */
+/*           the computation. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (*info == 0 && *maxc2nrmk > myhugeval) {
+		*info = *n + k - 1 + kp;
+	    }
+
+/*           ============================================================ */
+
+/*           Test for the second and third tolerance stopping criteria. */
+/*           NOTE: There is no need to test for ABSTOL.GE.ZERO, since */
+/*           MAXC2NRMK is non-negative. Similarly, there is no need */
+/*           to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is */
+/*           non-negative. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm;
+
+	    if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) {
+
+		*done = TRUE_;
+
+/*              Set KB, the number of factorized partial columns */
+/*                      that are non-zero in each step in the block, */
+/*                      i.e. the rank of the factor R. */
+/*              Set IF, the number of processed rows in the block, which */
+/*                      is the same as the number of processed rows in */
+/*                      the original whole matrix A_orig; */
+
+		*kb = k - 1;
+		if__ = i__ - 1;
+
+/*              Apply the block reflector to the residual of the */
+/*              matrix A and the residual of the right hand sides B, if */
+/*              the residual matrix and/or the residual of the right */
+/*              hand sides exist,  i.e. if the submatrix */
+/*              A(I+1:M,KB+1:N+NRHS) exists.  This occurs when */
+/*                 KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */
+
+/*              A(IF+1:M,KB+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */
+/*                             A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. */
+
+		if (*kb < minmnupdt) {
+		    i__1 = *m - if__;
+		    i__2 = *n + *nrhs - *kb;
+		    q__1.r = -1.f, q__1.i = 0.f;
+		    cgemm_("No transpose", "Conjugate transpose", &i__1, &
+			    i__2, kb, &q__1, &a[if__ + 1 + a_dim1], lda, &f[*
+			    kb + 1 + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*kb 
+			    + 1) * a_dim1], lda);
+		}
+
+/*              There is no need to recompute the 2-norm of the */
+/*              difficult columns, since we stop the factorization. */
+
+/*              Set TAUs corresponding to the columns that were not */
+/*              factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, */
+/*              which is equivalent to setting TAU(K:MINMNFACT) = CZERO. */
+
+		i__1 = minmnfact;
+		for (j = k; j <= i__1; ++j) {
+		    i__2 = j;
+		    tau[i__2].r = 0.f, tau[i__2].i = 0.f;
+		}
+
+/*              Return from the routine. */
+
+		return 0;
+
+	    }
+
+/*           ============================================================ */
+
+/*           End ELSE of IF(I.EQ.1) */
+
+	}
+
+/*        =============================================================== */
+
+/*        If the pivot column is not the first column of the */
+/*        subblock A(1:M,K:N): */
+/*        1) swap the K-th column and the KP-th pivot column */
+/*           in A(1:M,1:N); */
+/*        2) swap the K-th row and the KP-th row in F(1:N,1:K-1) */
+/*        3) copy the K-th element into the KP-th element of the partial */
+/*           and exact 2-norm vectors VN1 and VN2. (Swap is not needed */
+/*           for VN1 and VN2 since we use the element with the index */
+/*           larger than K in the next loop step.) */
+/*        4) Save the pivot interchange with the indices relative to */
+/*           the original matrix A_orig, not the block A(1:M,1:N). */
+
+	if (kp != k) {
+	    cswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1);
+	    i__1 = k - 1;
+	    cswap_(&i__1, &f[kp + f_dim1], ldf, &f[k + f_dim1], ldf);
+	    vn1[kp] = vn1[k];
+	    vn2[kp] = vn2[k];
+	    itemp = jpiv[kp];
+	    jpiv[kp] = jpiv[k];
+	    jpiv[k] = itemp;
+	}
+
+/*        Apply previous Householder reflectors to column K: */
+/*        A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**H. */
+
+	if (k > 1) {
+	    i__1 = k - 1;
+	    for (j = 1; j <= i__1; ++j) {
+		i__2 = k + j * f_dim1;
+		r_cnjg(&q__1, &f[k + j * f_dim1]);
+		f[i__2].r = q__1.r, f[i__2].i = q__1.i;
+	    }
+	    i__1 = *m - i__ + 1;
+	    i__2 = k - 1;
+	    q__1.r = -1.f, q__1.i = 0.f;
+	    cgemv_("No transpose", &i__1, &i__2, &q__1, &a[i__ + a_dim1], lda,
+		     &f[k + f_dim1], ldf, &c_b2, &a[i__ + k * a_dim1], &c__1);
+	    i__1 = k - 1;
+	    for (j = 1; j <= i__1; ++j) {
+		i__2 = k + j * f_dim1;
+		r_cnjg(&q__1, &f[k + j * f_dim1]);
+		f[i__2].r = q__1.r, f[i__2].i = q__1.i;
+	    }
+	}
+
+/*        Generate elementary reflector H(k) using the column A(I:M,K). */
+
+	if (i__ < *m) {
+	    i__1 = *m - i__ + 1;
+	    clarfg_(&i__1, &a[i__ + k * a_dim1], &a[i__ + 1 + k * a_dim1], &
+		    c__1, &tau[k]);
+	} else {
+	    i__1 = k;
+	    tau[i__1].r = 0.f, tau[i__1].i = 0.f;
+	}
+
+/*        Check if TAU(K) contains NaN, set INFO parameter */
+/*        to the column number where NaN is found and return from */
+/*        the routine. */
+/*        NOTE: There is no need to check TAU(K) for Inf, */
+/*        since CLARFG cannot produce TAU(KK) or Householder vector */
+/*        below the diagonal containing Inf. Only BETA on the diagonal, */
+/*        returned by CLARFG can contain Inf, which requires */
+/*        TAU(K) to contain NaN. Therefore, this case of generating Inf */
+/*        by CLARFG is covered by checking TAU(K) for NaN. */
+
+	i__1 = k;
+	r__1 = tau[i__1].r;
+	if (sisnan_(&r__1)) {
+	    i__1 = k;
+	    taunan = tau[i__1].r;
+	} else /* if(complicated condition) */ {
+	    r__1 = r_imag(&tau[k]);
+	    if (sisnan_(&r__1)) {
+		taunan = r_imag(&tau[k]);
+	    } else {
+		taunan = 0.f;
+	    }
+	}
+
+	if (sisnan_(&taunan)) {
+
+	    *done = TRUE_;
+
+/*           Set KB, the number of factorized partial columns */
+/*                   that are non-zero in each step in the block, */
+/*                   i.e. the rank of the factor R. */
+/*           Set IF, the number of processed rows in the block, which */
+/*                   is the same as the number of processed rows in */
+/*                   the original whole matrix A_orig. */
+
+	    *kb = k - 1;
+	    if__ = i__ - 1;
+	    *info = k;
+
+/*           Set MAXC2NRMK and  RELMAXC2NRMK to NaN. */
+
+	    *maxc2nrmk = taunan;
+	    *relmaxc2nrmk = taunan;
+
+/*           There is no need to apply the block reflector to the */
+/*           residual of the matrix A stored in A(KB+1:M,KB+1:N), */
+/*           since the submatrix contains NaN and we stop */
+/*           the computation. */
+/*           But, we need to apply the block reflector to the residual */
+/*           right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */
+/*           residual right hand sides exist.  This occurs */
+/*           when ( NRHS > 0 AND KB < (M-IOFFSET) ): */
+
+/*           A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */
+/*                            A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. */
+
+	    if (*nrhs > 0 && *kb < *m - *ioffset) {
+		i__1 = *m - if__;
+		q__1.r = -1.f, q__1.i = 0.f;
+		cgemm_("No transpose", "Conjugate transpose", &i__1, nrhs, kb,
+			 &q__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + 
+			f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * a_dim1],
+			 lda);
+	    }
+
+/*           There is no need to recompute the 2-norm of the */
+/*           difficult columns, since we stop the factorization. */
+
+/*           Array TAU(KB+1:MINMNFACT) is not set and contains */
+/*           undefined elements. */
+
+/*           Return from the routine. */
+
+	    return 0;
+	}
+
+/*        =============================================================== */
+
+	i__1 = i__ + k * a_dim1;
+	aik.r = a[i__1].r, aik.i = a[i__1].i;
+	i__1 = i__ + k * a_dim1;
+	a[i__1].r = 1.f, a[i__1].i = 0.f;
+
+/*        =============================================================== */
+
+/*        Compute the current K-th column of F: */
+/*          1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**H * A(I:M,K). */
+
+	if (k < *n + *nrhs) {
+	    i__1 = *m - i__ + 1;
+	    i__2 = *n + *nrhs - k;
+	    cgemv_("Conjugate transpose", &i__1, &i__2, &tau[k], &a[i__ + (k 
+		    + 1) * a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b1, &
+		    f[k + 1 + k * f_dim1], &c__1);
+	}
+
+/*           2) Zero out elements above and on the diagonal of the */
+/*              column K in matrix F, i.e elements F(1:K,K). */
+
+	i__1 = k;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = j + k * f_dim1;
+	    f[i__2].r = 0.f, f[i__2].i = 0.f;
+	}
+
+/*         3) Incremental updating of the K-th column of F: */
+/*        F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**H */
+/*                    * A(I:M,K). */
+
+	if (k > 1) {
+	    i__1 = *m - i__ + 1;
+	    i__2 = k - 1;
+	    i__3 = k;
+	    q__1.r = -tau[i__3].r, q__1.i = -tau[i__3].i;
+	    cgemv_("Conjugate Transpose", &i__1, &i__2, &q__1, &a[i__ + 
+		    a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b1, &auxv[1]
+		    , &c__1);
+
+	    i__1 = *n + *nrhs;
+	    i__2 = k - 1;
+	    cgemv_("No transpose", &i__1, &i__2, &c_b2, &f[f_dim1 + 1], ldf, &
+		    auxv[1], &c__1, &c_b2, &f[k * f_dim1 + 1], &c__1);
+	}
+
+/*        =============================================================== */
+
+/*        Update the current I-th row of A: */
+/*        A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) */
+/*                         - A(I,1:K)*F(K+1:N+NRHS,1:K)**H. */
+
+	if (k < *n + *nrhs) {
+	    i__1 = *n + *nrhs - k;
+	    q__1.r = -1.f, q__1.i = 0.f;
+	    cgemm_("No transpose", "Conjugate transpose", &c__1, &i__1, &k, &
+		    q__1, &a[i__ + a_dim1], lda, &f[k + 1 + f_dim1], ldf, &
+		    c_b2, &a[i__ + (k + 1) * a_dim1], lda);
+	}
+
+	i__1 = i__ + k * a_dim1;
+	a[i__1].r = aik.r, a[i__1].i = aik.i;
+
+/*        Update the partial column 2-norms for the residual matrix, */
+/*        only if the residual matrix A(I+1:M,K+1:N) exists, i.e. */
+/*        when K < MINMNFACT = f2cmin( M-IOFFSET, N ). */
+
+	if (k < minmnfact) {
+
+	    i__1 = *n;
+	    for (j = k + 1; j <= i__1; ++j) {
+		if (vn1[j] != 0.f) {
+
+/*                 NOTE: The following lines follow from the analysis in */
+/*                 Lapack Working Note 176. */
+
+		    temp = c_abs(&a[i__ + j * a_dim1]) / vn1[j];
+/* Computing MAX */
+		    r__1 = 0.f, r__2 = (temp + 1.f) * (1.f - temp);
+		    temp = f2cmax(r__1,r__2);
+/* Computing 2nd power */
+		    r__1 = vn1[j] / vn2[j];
+		    temp2 = temp * (r__1 * r__1);
+		    if (temp2 <= tol3z) {
+
+/*                    At J-index, we have a difficult column for the */
+/*                    update of the 2-norm. Save the index of the previous */
+/*                    difficult column in IWORK(J-1). */
+/*                    NOTE: LSTICC > 1, therefore we can use IWORK only */
+/*                    with N-1 elements, where the elements are */
+/*                    shifted by 1 to the left. */
+
+			iwork[j - 1] = lsticc;
+
+/*                    Set the index of the last difficult column LSTICC. */
+
+			lsticc = j;
+
+		    } else {
+			vn1[j] *= sqrt(temp);
+		    }
+		}
+	    }
+
+	}
+
+/*        End of while loop. */
+
+    }
+
+/*     Now, after the loop: */
+/*        Set KB, the number of factorized columns in the block; */
+/*        Set IF, the number of processed rows in the block, which */
+/*                is the same as the number of processed rows in */
+/*                the original whole matrix A_orig, IF = IOFFSET + KB. */
+
+    *kb = k;
+    if__ = *ioffset + k; /* equals i__ after any iteration, and stays defined if the loop never ran */
+
+/*     Apply the block reflector to the residual of the matrix A */
+/*     and the residual of the right hand sides B, if the residual */
+/*     matrix and/or the residual of the right hand sides */
+/*     exist,  i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. */
+/*     This occurs when KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */
+
+/*     A(IF+1:M,KB+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */
+/*                         A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. */
+
+    if (*kb < minmnupdt) {
+	i__1 = *m - if__;
+	i__2 = *n + *nrhs - *kb;
+	q__1.r = -1.f, q__1.i = 0.f;
+	cgemm_("No transpose", "Conjugate transpose", &i__1, &i__2, kb, &q__1,
+		 &a[if__ + 1 + a_dim1], lda, &f[*kb + 1 + f_dim1], ldf, &c_b2,
+		 &a[if__ + 1 + (*kb + 1) * a_dim1], lda);
+    }
+
+/*     Recompute the 2-norm of the difficult columns. */
+/*     Loop over the index of the difficult columns from the largest */
+/*     to the smallest index. */
+
+    while(lsticc > 0) {
+
+/*        LSTICC, the index of the last difficult column, is greater */
+/*        than 1. */
+/*        ITEMP is the index of the previous difficult column. */
+
+	itemp = iwork[lsticc - 1];
+
+/*        Compute the 2-norm explicitly for the last difficult column and */
+/*        save it in the partial and exact 2-norm vectors VN1 and VN2. */
+
+/*        NOTE: The computation of VN1( LSTICC ) relies on the fact that */
+/*        SCNRM2 does not fail on vectors with norm below the value of */
+/*        SQRT(SLAMCH('S')) */
+
+	i__1 = *m - if__;
+	vn1[lsticc] = scnrm2_(&i__1, &a[if__ + 1 + lsticc * a_dim1], &c__1);
+	vn2[lsticc] = vn1[lsticc];
+
+/*        Downdate the index of the last difficult column to */
+/*        the index of the previous difficult column. */
+
+	lsticc = itemp;
+
+    }
+
+    return 0;
+
+/*     End of CLAQP3RK */
+
+} /* claqp3rk_ */
+
diff --git a/lapack-netlib/SRC/claqp3rk.f b/lapack-netlib/SRC/claqp3rk.f
new file mode 100644
index 000000000..3703bcbd6
--- /dev/null
+++ b/lapack-netlib/SRC/claqp3rk.f
@@ -0,0 +1,947 @@
+*> \brief \b CLAQP3RK computes a step of truncated QR factorization with column pivoting of a complex m-by-n matrix A using Level 3 BLAS and overwrites a complex m-by-nrhs matrix B with Q**H * B.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CLAQP3RK + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/claqp3rk.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/claqp3rk.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/claqp3rk.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE CLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL,
+*     $                     RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB,
+*     $                     MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+*     $                     VN1, VN2, AUXV, F, LDF, IWORK, INFO )
+*      IMPLICIT NONE
+*      LOGICAL            DONE
+*      INTEGER            INFO, IOFFSET, KB, KP1, LDA, LDF, M, N,
+*     $                   NB, NRHS
+*      REAL               ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+*     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+*      INTEGER            IWORK( * ), JPIV( * )
+*      REAL               VN1( * ), VN2( * )
+*      COMPLEX            A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * )
+*     ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CLAQP3RK computes a step of truncated QR factorization with column
+*> pivoting of a complex M-by-N matrix A block A(IOFFSET+1:M,1:N)
+*> by using Level 3 BLAS as
+*>
+*>   A * P(KB) = Q(KB) * R(KB).
+*>
+*> The routine tries to factorize NB columns from A starting from
+*> the row IOFFSET+1 and updates the residual matrix with BLAS 3
+*> xGEMM. The number of actually factorized columns is returned
+*> in KB and may be smaller than NB.
+*>
+*> Block A(1:IOFFSET,1:N) is accordingly pivoted, but not factorized.
+*>
+*> The routine also overwrites the right-hand-sides B matrix stored
+*> in A(IOFFSET+1:M,N+1:N+NRHS) with Q(KB)**H * B.
+*>
+*> Cases when the number of factorized columns KB < NB:
+*>
+*> (1) In some cases, due to catastrophic cancellations, it cannot
+*> factorize all NB columns and needs to update the residual matrix.
+*> Hence, the actual number of factorized columns in the block returned
+*> in KB is smaller than NB. The logical DONE is returned as FALSE.
+*> The factorization of the whole original matrix A_orig must proceed
+*> with the next block.
+*>
+*> (2) Whenever the stopping criterion ABSTOL or RELTOL is satisfied,
+*> the factorization of the whole original matrix A_orig is stopped,
+*> the logical DONE is returned as TRUE. The number of factorized
+*> columns which is smaller than NB is returned in KB.
+*>
+*> (3) In case both stopping criteria ABSTOL and RELTOL are not used,
+*> and when the residual matrix is a zero matrix in some factorization
+*> step KB, the factorization of the whole original matrix A_orig is
+*> stopped, the logical DONE is returned as TRUE. The number of
+*> factorized columns which is smaller than NB is returned in KB.
+*>
+*> (4) Whenever NaN is detected in the matrix A or in the array TAU,
+*> the factorization of the whole original matrix A_orig is stopped,
+*> the logical DONE is returned as TRUE. The number of factorized
+*> columns which is smaller than NB is returned in KB. The INFO
+*> parameter is set to the column index of the first NaN occurrence.
+*>
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. N >= 0
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of
+*>          columns of the matrix B. NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in] IOFFSET
+*> \verbatim
+*>          IOFFSET is INTEGER
+*>          The number of rows of the matrix A that must be pivoted
+*>          but not factorized. IOFFSET >= 0.
+*>
+*>          IOFFSET also represents the number of columns of the whole
+*>          original matrix A_orig that have been factorized
+*>          in the previous steps.
+*> \endverbatim
+*>
+*> \param[in] NB
+*> \verbatim
+*>          NB is INTEGER
+*>          Factorization block size, i.e the number of columns
+*>          to factorize in the matrix A. 0 <= NB
+*>
+*>          If NB = 0, then the routine exits immediately.
+*>             This means that the factorization is not performed,
+*>             the matrices A and B and the arrays TAU, JPIV
+*>             are not modified.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is REAL, cannot be NaN.
+*>
+*>          The absolute tolerance (stopping threshold) for
+*>          maximum column 2-norm of the residual matrix.
+*>          The algorithm converges (stops the factorization) when
+*>          the maximum column 2-norm of the residual matrix
+*>          is less than or equal to ABSTOL.
+*>
+*>          a) If ABSTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on NB and RELTOL.
+*>                This includes the case ABSTOL = -Inf.
+*>
+*>          b) If 0.0 <= ABSTOL then the input value
+*>                of ABSTOL is used.
+*> \endverbatim
+*>
+*> \param[in] RELTOL
+*> \verbatim
+*>          RELTOL is REAL, cannot be NaN.
+*>
+*>          The tolerance (stopping threshold) for the ratio of the
+*>          maximum column 2-norm of the residual matrix to the maximum
+*>          column 2-norm of the original matrix A_orig. The algorithm
+*>          converges (stops the factorization), when this ratio is
+*>          less than or equal to RELTOL.
+*>
+*>          a) If RELTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on NB and ABSTOL.
+*>                This includes the case RELTOL = -Inf.
+*>
+*>          b) If 0.0 <= RELTOL then the input value of RELTOL
+*>                is used.
+*> \endverbatim
+*>
+*> \param[in] KP1
+*> \verbatim
+*>          KP1 is INTEGER
+*>          The index of the column with the maximum 2-norm in
+*>          the whole original matrix A_orig determined in the
+*>          main routine CGEQP3RK. 1 <= KP1 <= N_orig.
+*> \endverbatim
+*>
+*> \param[in] MAXC2NRM
+*> \verbatim
+*>          MAXC2NRM is REAL
+*>          The maximum column 2-norm of the whole original
+*>          matrix A_orig computed in the main routine CGEQP3RK.
+*>          MAXC2NRM >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX array, dimension (LDA,N+NRHS)
+*>          On entry:
+*>              the M-by-N matrix A and M-by-NRHS matrix B, as in
+*>
+*>                                  N     NRHS
+*>              array_A   =   M  [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>          1. The elements in block A(IOFFSET+1:M,1:KB) below
+*>             the diagonal together with the array TAU represent
+*>             the unitary matrix Q(KB) as a product of elementary
+*>             reflectors.
+*>          2. The upper triangular block of the matrix A stored
+*>             in A(IOFFSET+1:M,1:KB) is the triangular factor obtained.
+*>          3. The block of the matrix A stored in A(1:IOFFSET,1:N)
+*>             has been accordingly pivoted, but not factorized.
+*>          4. The rest of the array A, block A(IOFFSET+1:M,KB+1:N+NRHS).
+*>             The left part A(IOFFSET+1:M,KB+1:N) of this block
+*>             contains the residual of the matrix A, and,
+*>             if NRHS > 0, the right part of the block
+*>             A(IOFFSET+1:M,N+1:N+NRHS) contains the block of
+*>             the right-hand-side matrix B. Both these blocks have been
+*>             updated by multiplication from the left by Q(KB)**H.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] DONE
+*> \verbatim
+*>          DONE is LOGICAL
+*>          TRUE: a) if the factorization completed before processing
+*>                   all min(M-IOFFSET,NB,N) columns due to ABSTOL
+*>                   or RELTOL criterion,
+*>                b) if the factorization completed before processing
+*>                   all min(M-IOFFSET,NB,N) columns due to the
+*>                   residual matrix being a ZERO matrix.
+*>                c) when NaN was detected in the matrix A
+*>                   or in the array TAU.
+*>          FALSE: otherwise.
+*> \endverbatim
+*>
+*> \param[out] KB
+*> \verbatim
+*>          KB is INTEGER
+*>          Factorization rank of the matrix A, i.e. the rank of
+*>          the factor R, which is the same as the number of non-zero
+*>          rows of the factor R.  0 <= KB <= min(M-IOFFSET,NB,N).
+*>
+*>          KB also represents the number of non-zero Householder
+*>          vectors.
+*> \endverbatim
+*>
+*> \param[out] MAXC2NRMK
+*> \verbatim
+*>          MAXC2NRMK is REAL
+*>          The maximum column 2-norm of the residual matrix,
+*>          when the factorization stopped at rank KB. MAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] RELMAXC2NRMK
+*> \verbatim
+*>          RELMAXC2NRMK is REAL
+*>          The ratio MAXC2NRMK / MAXC2NRM of the maximum column
+*>          2-norm of the residual matrix (when the factorization
+*>          stopped at rank KB) to the maximum column 2-norm of the
+*>          original matrix A_orig. RELMAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] JPIV
+*> \verbatim
+*>          JPIV is INTEGER array, dimension (N)
+*>          Column pivot indices, for 1 <= j <= N, column j
+*>          of the matrix A was interchanged with column JPIV(j).
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is COMPLEX array, dimension (min(M-IOFFSET,N))
+*>          The scalar factors of the elementary reflectors.
+*> \endverbatim
+*>
+*> \param[in,out] VN1
+*> \verbatim
+*>          VN1 is REAL array, dimension (N)
+*>          The vector with the partial column norms.
+*> \endverbatim
+*>
+*> \param[in,out] VN2
+*> \verbatim
+*>          VN2 is REAL array, dimension (N)
+*>          The vector with the exact column norms.
+*> \endverbatim
+*>
+*> \param[out] AUXV
+*> \verbatim
+*>          AUXV is COMPLEX array, dimension (NB)
+*>          Auxiliary vector.
+*> \endverbatim
+*>
+*> \param[out] F
+*> \verbatim
+*>          F is COMPLEX array, dimension (LDF,NB)
+*>          Matrix F**H = L*(Y**H)*A.
+*> \endverbatim
+*>
+*> \param[in] LDF
+*> \verbatim
+*>          LDF is INTEGER
+*>          The leading dimension of the array F. LDF >= max(1,N+NRHS).
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (N-1).
+*>          Is a work array. ( IWORK is used to store indices
+*>          of "bad" columns for norm downdating in the residual
+*>          matrix ).
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          1) INFO = 0: successful exit.
+*>          2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was
+*>             detected and the routine stops the computation.
+*>             The j_1-th column of the matrix A or the j_1-th
+*>             element of array TAU contains the first occurrence
+*>             of NaN in the factorization step KB+1 ( when KB columns
+*>             have been factorized ).
+*>
+*>             On exit:
+*>             KB                  is set to the number of
+*>                                    factorized columns without
+*>                                    exception.
+*>             MAXC2NRMK           is set to NaN.
+*>             RELMAXC2NRMK        is set to NaN.
+*>             TAU(KB+1:min(M,N))     is not set and contains undefined
+*>                                    elements. If j_1=KB+1, TAU(KB+1)
+*>                                    may contain NaN.
+*>          3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN
+*>             was detected, but +Inf (or -Inf) was detected and
+*>             the routine continues the computation until completion.
+*>             The (j_2-N)-th column of the matrix A contains the first
+*>             occurrence of +Inf (or -Inf) in the factorization
+*>             step KB+1 ( when KB columns have been factorized ).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup laqp3rk
+*
+*> \par References:
+*  ================
+*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996.
+*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain.
+*> X. Sun, Computer Science Dept., Duke University, USA.
+*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA.
+*> A BLAS-3 version of the QR factorization with column pivoting.
+*> LAPACK Working Note 114
+*> \htmlonly
+*> <a href="https://www.netlib.org/lapack/lawnspdf/lawn114.pdf">https://www.netlib.org/lapack/lawnspdf/lawn114.pdf</a>
+*> \endhtmlonly
+*> and in
+*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998.
+*> \htmlonly
+*> <a href="https://doi.org/10.1137/S1064827595296732">https://doi.org/10.1137/S1064827595296732</a>
+*> \endhtmlonly
+*>
+*> [2] A partial column norm updating strategy developed in 2006.
+*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia.
+*> On the failure of rank revealing QR factorization software – a case study.
+*> LAPACK Working Note 176.
+*> \htmlonly
+*> <a href="http://www.netlib.org/lapack/lawnspdf/lawn176.pdf">http://www.netlib.org/lapack/lawnspdf/lawn176.pdf</a>
+*> \endhtmlonly
+*> and in
+*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages.
+*> \htmlonly
+*> <a href="https://doi.org/10.1145/1377612.1377616">https://doi.org/10.1145/1377612.1377616</a>
+*> \endhtmlonly
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*>  November  2023, Igor Kozachenko, James Demmel,
+*>                  EECS Department,
+*>                  University of California, Berkeley, USA.
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE CLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL,
+     $                     RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB,
+     $                     MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+     $                     VN1, VN2, AUXV, F, LDF, IWORK, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      LOGICAL            DONE
+      INTEGER            INFO, IOFFSET, KB, KP1, LDA, LDF, M, N,
+     $                   NB, NRHS
+      REAL               ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IWORK( * ), JPIV( * )
+      REAL               VN1( * ), VN2( * )
+      COMPLEX            A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+      COMPLEX            CZERO, CONE
+      PARAMETER          ( CZERO = ( 0.0E+0, 0.0E+0 ),
+     $                   CONE = ( 1.0E+0, 0.0E+0 ) )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            ITEMP, J, K, MINMNFACT, MINMNUPDT,
+     $                   LSTICC, KP, I, IF
+      REAL               HUGEVAL, TAUNAN, TEMP, TEMP2, TOL3Z
+      COMPLEX            AIK
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           CGEMM, CGEMV, CLARFG, CSWAP
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, REAL, CONJG, IMAG, MAX, MIN, SQRT
+*     ..
+*     .. External Functions ..
+      LOGICAL            SISNAN
+      INTEGER            ISAMAX
+      REAL               SLAMCH, SCNRM2
+      EXTERNAL           SISNAN, SLAMCH, ISAMAX, SCNRM2
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize INFO
+*
+      INFO = 0
+*
+*     MINMNFACT in the smallest dimension of the submatrix
+*     A(IOFFSET+1:M,1:N) to be factorized.
+*
+      MINMNFACT = MIN( M-IOFFSET, N )
+      MINMNUPDT = MIN( M-IOFFSET, N+NRHS )
+      NB = MIN( NB, MINMNFACT )
+      TOL3Z = SQRT( SLAMCH( 'Epsilon' ) )
+      HUGEVAL = SLAMCH( 'Overflow' )
+*
+*     Compute factorization in a while loop over NB columns,
+*     K is the column index in the block A(1:M,1:N).
+*
+      K = 0
+      LSTICC = 0
+      DONE = .FALSE.
+*
+      DO WHILE ( K.LT.NB .AND. LSTICC.EQ.0 )
+         K = K + 1
+         I = IOFFSET + K
+*
+         IF( I.EQ.1 ) THEN
+*
+*           We are at the first column of the original whole matrix A_orig,
+*           therefore we use the computed KP1 and MAXC2NRM from the
+*           main routine.
+*
+            KP = KP1
+*
+         ELSE
+*
+*           Determine the pivot column in K-th step, i.e. the index
+*           of the column with the maximum 2-norm in the
+*           submatrix A(I:M,K:N).
+*
+            KP = ( K-1 ) + ISAMAX( N-K+1, VN1( K ), 1 )
+*
+*           Determine the maximum column 2-norm and the relative maximum
+*           column 2-norm of the submatrix A(I:M,K:N) in step K.
+*
+            MAXC2NRMK = VN1( KP )
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,K:N) contains NaN, set
+*           INFO parameter to the column number, where the first NaN
+*           is found and return from the routine.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( SISNAN( MAXC2NRMK ) ) THEN
+*
+               DONE = .TRUE.
+*
+*              Set KB, the number of factorized partial columns
+*                      that are non-zero in each step in the block,
+*                      i.e. the rank of the factor R.
+*              Set IF, the number of processed rows in the block, which
+*                      is the same as the number of processed rows in
+*                      the original whole matrix A_orig.
+*
+               KB = K - 1
+               IF = I - 1
+               INFO = KB + KP
+*
+*              Set RELMAXC2NRMK to NaN.
+*
+               RELMAXC2NRMK = MAXC2NRMK
+*
+*              There is no need to apply the block reflector to the
+*              residual of the matrix A stored in A(KB+1:M,KB+1:N),
+*              since the submatrix contains NaN and we stop
+*              the computation.
+*              But, we need to apply the block reflector to the residual
+*              right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the
+*              residual right hand sides exist.  This occurs
+*              when ( NRHS != 0 AND KB <= (M-IOFFSET) ):
+*
+*              A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) -
+*                               A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H.
+
+               IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN
+                  CALL CGEMM( 'No transpose', 'Conjugate transpose',
+     $                  M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA,
+     $                  F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA )
+               END IF
+*
+*              There is no need to recompute the 2-norm of the
+*              difficult columns, since we stop the factorization.
+*
+*              Array TAU(KF+1:MINMNFACT) is not set and contains
+*              undefined elements.
+*
+*              Return from the routine.
+*
+               RETURN
+            END IF
+*
+*           Quick return, if the submatrix A(I:M,K:N) is
+*           a zero matrix. We need to check it only if the column index
+*           (same as row index) is larger than 1, since the condition
+*           for the whole original matrix A_orig is checked in the main
+*           routine.
+*
+            IF( MAXC2NRMK.EQ.ZERO ) THEN
+*
+               DONE = .TRUE.
+*
+*              Set KB, the number of factorized partial columns
+*                      that are non-zero in each step in the block,
+*                      i.e. the rank of the factor R.
+*              Set IF, the number of processed rows in the block, which
+*                      is the same as the number of processed rows in
+*                      the original whole matrix A_orig.
+*
+               KB = K - 1
+               IF = I - 1
+               RELMAXC2NRMK = ZERO
+*
+*              There is no need to apply the block reflector to the
+*              residual of the matrix A stored in A(KB+1:M,KB+1:N),
+*              since the submatrix is zero and we stop the computation.
+*              But, we need to apply the block reflector to the residual
+*              right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the
+*              residual right hand sides exist.  This occurs
+*              when ( NRHS != 0 AND KB <= (M-IOFFSET) ):
+*
+*              A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) -
+*                               A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H.
+*
+               IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN
+                  CALL CGEMM( 'No transpose', 'Conjugate transpose',
+     $                  M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA,
+     $                  F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA )
+               END IF
+*
+*              There is no need to recompute the 2-norm of the
+*              difficult columns, since we stop the factorization.
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO,
+*              which is equivalent to seting TAU(K:MINMNFACT) = CZERO.
+*
+               DO J = K, MINMNFACT
+                  TAU( J ) = CZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,K:N) contains Inf,
+*           set INFO parameter to the column number, where
+*           the first Inf is found plus N, and continue
+*           the computation.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN
+               INFO = N + K - 1 + KP
+            END IF
+*
+*           ============================================================
+*
+*           Test for the second and third tolerance stopping criteria.
+*           NOTE: There is no need to test for ABSTOL.GE.ZERO, since
+*           MAXC2NRMK is non-negative. Similarly, there is no need
+*           to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is
+*           non-negative.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            RELMAXC2NRMK =  MAXC2NRMK / MAXC2NRM
+*
+            IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN
+*
+               DONE = .TRUE.
+*
+*              Set KB, the number of factorized partial columns
+*                      that are non-zero in each step in the block,
+*                      i.e. the rank of the factor R.
+*              Set IF, the number of processed rows in the block, which
+*                      is the same as the number of processed rows in
+*                      the original whole matrix A_orig;
+*
+                  KB = K - 1
+                  IF = I - 1
+*
+*              Apply the block reflector to the residual of the
+*              matrix A and the residual of the right hand sides B, if
+*              the residual matrix and and/or the residual of the right
+*              hand sides exist,  i.e. if the submatrix
+*              A(I+1:M,KB+1:N+NRHS) exists.  This occurs when
+*                 KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ):
+*
+*              A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) -
+*                             A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H.
+*
+               IF( KB.LT.MINMNUPDT ) THEN
+                  CALL CGEMM( 'No transpose', 'Conjugate transpose',
+     $                  M-IF, N+NRHS-KB, KB,-CONE, A( IF+1, 1 ), LDA,
+     $                  F( KB+1, 1 ), LDF, CONE, A( IF+1, KB+1 ), LDA )
+               END IF
+*
+*              There is no need to recompute the 2-norm of the
+*              difficult columns, since we stop the factorization.
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO,
+*              which is equivalent to seting TAU(K:MINMNFACT) = CZERO.
+*
+               DO J = K, MINMNFACT
+                  TAU( J ) = CZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           End ELSE of IF(I.EQ.1)
+*
+         END IF
+*
+*        ===============================================================
+*
+*        If the pivot column is not the first column of the
+*        subblock A(1:M,K:N):
+*        1) swap the K-th column and the KP-th pivot column
+*           in A(1:M,1:N);
+*        2) swap the K-th row and the KP-th row in F(1:N,1:K-1)
+*        3) copy the K-th element into the KP-th element of the partial
+*           and exact 2-norm vectors VN1 and VN2. (Swap is not needed
+*           for VN1 and VN2 since we use the element with the index
+*           larger than K in the next loop step.)
+*        4) Save the pivot interchange with the indices relative to the
+*           the original matrix A_orig, not the block A(1:M,1:N).
+*
+         IF( KP.NE.K ) THEN
+            CALL CSWAP( M, A( 1, KP ), 1, A( 1, K ), 1 )
+            CALL CSWAP( K-1, F( KP, 1 ), LDF, F( K, 1 ), LDF )
+            VN1( KP ) = VN1( K )
+            VN2( KP ) = VN2( K )
+            ITEMP = JPIV( KP )
+            JPIV( KP ) = JPIV( K )
+            JPIV( K ) = ITEMP
+         END IF
+*
+*        Apply previous Householder reflectors to column K:
+*        A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**H.
+*
+         IF( K.GT.1 ) THEN
+            DO J = 1, K - 1
+               F( K, J ) = CONJG( F( K, J ) )
+            END DO
+            CALL CGEMV( 'No transpose', M-I+1, K-1, -CONE, A( I, 1 ),
+     $                  LDA, F( K, 1 ), LDF, CONE, A( I, K ), 1 )
+            DO J = 1, K - 1
+               F( K, J ) = CONJG( F( K, J ) )
+            END DO
+         END IF
+*
+*        Generate elementary reflector H(k) using the column A(I:M,K).
+*
+         IF( I.LT.M ) THEN
+            CALL CLARFG( M-I+1, A( I, K ), A( I+1, K ), 1, TAU( K ) )
+         ELSE
+            TAU( K ) = CZERO
+         END IF
+*
+*        Check if TAU(K) contains NaN, set INFO parameter
+*        to the column number where NaN is found and return from
+*        the routine.
+*        NOTE: There is no need to check TAU(K) for Inf,
+*        since CLARFG cannot produce TAU(KK) or Householder vector
+*        below the diagonal containing Inf. Only BETA on the diagonal,
+*        returned by CLARFG can contain Inf, which requires
+*        TAU(K) to contain NaN. Therefore, this case of generating Inf
+*        by CLARFG is covered by checking TAU(K) for NaN.
+*
+         IF( SISNAN( REAL( TAU(K) ) ) ) THEN
+            TAUNAN = REAL( TAU(K) )
+         ELSE IF( SISNAN( IMAG( TAU(K) ) ) ) THEN
+            TAUNAN = IMAG( TAU(K) )
+         ELSE
+            TAUNAN = ZERO
+         END IF
+*
+         IF( SISNAN( TAUNAN ) ) THEN
+*
+            DONE = .TRUE.
+*
+*           Set KB, the number of factorized partial columns
+*                   that are non-zero in each step in the block,
+*                   i.e. the rank of the factor R.
+*           Set IF, the number of processed rows in the block, which
+*                   is the same as the number of processed rows in
+*                   the original whole matrix A_orig.
+*
+            KB = K - 1
+            IF = I - 1
+            INFO = K
+*
+*           Set MAXC2NRMK and  RELMAXC2NRMK to NaN.
+*
+            MAXC2NRMK = TAUNAN
+            RELMAXC2NRMK = TAUNAN
+*
+*           There is no need to apply the block reflector to the
+*           residual of the matrix A stored in A(KB+1:M,KB+1:N),
+*           since the submatrix contains NaN and we stop
+*           the computation.
+*           But, we need to apply the block reflector to the residual
+*           right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the
+*           residual right hand sides exist.  This occurs
+*           when ( NRHS != 0 AND KB <= (M-IOFFSET) ):
+*
+*           A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) -
+*                            A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H.
+*
+            IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN
+               CALL CGEMM( 'No transpose', 'Conjugate transpose',
+     $               M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA,
+     $               F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA )
+            END IF
+*
+*           There is no need to recompute the 2-norm of the
+*           difficult columns, since we stop the factorization.
+*
+*           Array TAU(KF+1:MINMNFACT) is not set and contains
+*           undefined elements.
+*
+*           Return from the routine.
+*
+            RETURN
+         END IF
+*
+*        ===============================================================
+*
+         AIK = A( I, K )
+         A( I, K ) = CONE
+*
+*        ===============================================================
+*
+*        Compute the current K-th column of F:
+*          1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**H * A(I:M,K).
+*
+         IF( K.LT.N+NRHS ) THEN
+            CALL CGEMV( 'Conjugate transpose', M-I+1, N+NRHS-K,
+     $                  TAU( K ), A( I, K+1 ), LDA, A( I, K ), 1,
+     $                  CZERO, F( K+1, K ), 1 )
+         END IF
+*
+*           2) Zero out elements above and on the diagonal of the
+*              column K in matrix F, i.e elements F(1:K,K).
+*
+         DO J = 1, K
+            F( J, K ) = CZERO
+         END DO
+*
+*         3) Incremental updating of the K-th column of F:
+*        F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**H
+*                    * A(I:M,K).
+*
+         IF( K.GT.1 ) THEN
+            CALL CGEMV( 'Conjugate Transpose', M-I+1, K-1, -TAU( K ),
+     $                   A( I, 1 ), LDA, A( I, K ), 1, CZERO,
+     $                   AUXV( 1 ), 1 )
+*
+            CALL CGEMV( 'No transpose', N+NRHS, K-1, CONE,
+     $                  F( 1, 1 ), LDF, AUXV( 1 ), 1, CONE,
+     $                  F( 1, K ), 1 )
+         END IF
+*
+*        ===============================================================
+*
+*        Update the current I-th row of A:
+*        A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS)
+*                         - A(I,1:K)*F(K+1:N+NRHS,1:K)**H.
+*
+         IF( K.LT.N+NRHS ) THEN
+            CALL CGEMM( 'No transpose', 'Conjugate transpose',
+     $            1, N+NRHS-K, K, -CONE, A( I, 1 ), LDA,
+     $            F( K+1, 1 ), LDF, CONE, A( I, K+1 ), LDA )
+         END IF
+*
+         A( I, K ) = AIK
+*
+*        Update the partial column 2-norms for the residual matrix,
+*        only if the residual matrix A(I+1:M,K+1:N) exists, i.e.
+*        when K < MINMNFACT = min( M-IOFFSET, N ).
+*
+         IF( K.LT.MINMNFACT ) THEN
+*
+            DO J = K + 1, N
+               IF( VN1( J ).NE.ZERO ) THEN
+*
+*                 NOTE: The following lines follow from the analysis in
+*                 Lapack Working Note 176.
+*
+                  TEMP = ABS( A( I, J ) ) / VN1( J )
+                  TEMP = MAX( ZERO, ( ONE+TEMP )*( ONE-TEMP ) )
+                  TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2
+                  IF( TEMP2.LE.TOL3Z ) THEN
+*
+*                    At J-index, we have a difficult column for the
+*                    update of the 2-norm. Save the index of the previous
+*                    difficult column in IWORK(J-1).
+*                    NOTE: LSTICC > 1, therefore we can use IWORK only
+*                    with N-1 elements, where the elements are
+*                    shifted by 1 to the left.
+*
+                     IWORK( J-1 ) = LSTICC
+*
+*                    Set the index of the last difficult column LSTICC.
+*
+                     LSTICC = J
+*
+                  ELSE
+                     VN1( J ) = VN1( J )*SQRT( TEMP )
+                  END IF
+               END IF
+            END DO
+*
+         END IF
+*
+*        End of while loop.
+*
+      END DO
+*
+*     Now, after the loop:
+*        Set KB, the number of factorized columns in the block;
+*        Set IF, the number of processed rows in the block, which
+*                is the same as the number of processed rows in
+*                the original whole matrix A_orig, IF = IOFFSET + KB.
+*
+      KB = K
+      IF = I
+*
+*     Apply the block reflector to the residual of the matrix A
+*     and the residual of the right hand sides B, if the residual
+*     matrix and/or the residual of the right hand sides
+*     exist,  i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists.
+*     This occurs when KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ):
+*
+*     A(IF+1:M,KB+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) -
+*                         A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H.
+*
+      IF( KB.LT.MINMNUPDT ) THEN
+         CALL CGEMM( 'No transpose', 'Conjugate transpose',
+     $         M-IF, N+NRHS-KB, KB, -CONE, A( IF+1, 1 ), LDA,
+     $         F( KB+1, 1 ), LDF, CONE, A( IF+1, KB+1 ), LDA )
+      END IF
+*
+*     Recompute the 2-norm of the difficult columns.
+*     Loop over the index of the difficult columns from the largest
+*     to the smallest index.
+*
+      DO WHILE( LSTICC.GT.0 )
+*
+*        LSTICC, the index of the last difficult column, is greater
+*        than 1.
+*        ITEMP is the index of the previous difficult column.
+*
+         ITEMP = IWORK( LSTICC-1 )
+*
+*        Compute the 2-norm explicitly for the last difficult column and
+*        save it in the partial and exact 2-norm vectors VN1 and VN2.
+*
+*        NOTE: The computation of VN1( LSTICC ) relies on the fact that
+*        SCNRM2 does not fail on vectors with norm below the value of
+*        SQRT(SLAMCH('S'))
+*
+         VN1( LSTICC ) = SCNRM2( M-IF, A( IF+1, LSTICC ), 1 )
+         VN2( LSTICC ) = VN1( LSTICC )
+*
+*        Downdate the index of the last difficult column to
+*        the index of the previous difficult column.
+*
+         LSTICC = ITEMP
+*
+      END DO
+*
+      RETURN
+*
+*     End of CLAQP3RK
+*
+      END
diff --git a/lapack-netlib/SRC/claqz0.f b/lapack-netlib/SRC/claqz0.f
index 6de40e06c..c6cc5847d 100644
--- a/lapack-netlib/SRC/claqz0.f
+++ b/lapack-netlib/SRC/claqz0.f
@@ -89,7 +89,7 @@
 *>      Anal., 29(2006), pp. 199--227.
 *>
 *> Ref: T. Steel, D. Camps, K. Meerbergen, R. Vandebril "A multishift,
-*>      multipole rational QZ method with agressive early deflation"
+*>      multipole rational QZ method with aggressive early deflation"
 *> \endverbatim
 *
 *  Arguments:
@@ -310,7 +310,7 @@
       CHARACTER :: JBCMPZ*3
 
 *     External Functions
-      EXTERNAL :: XERBLA, CHGEQZ, CLAQZ2, CLAQZ3, CLASET, SLABAD,
+      EXTERNAL :: XERBLA, CHGEQZ, CLAQZ2, CLAQZ3, CLASET,
      $            CLARTG, CROT
       REAL, EXTERNAL :: SLAMCH, CLANHS
       LOGICAL, EXTERNAL :: LSAME
@@ -462,7 +462,6 @@
 *     Get machine constants
       SAFMIN = SLAMCH( 'SAFE MINIMUM' )
       SAFMAX = ONE/SAFMIN
-      CALL SLABAD( SAFMIN, SAFMAX )
       ULP = SLAMCH( 'PRECISION' )
       SMLNUM = SAFMIN*( REAL( N )/ULP )
 
@@ -533,7 +532,7 @@
          DO WHILE ( K.GE.ISTART2 )
 
             IF( ABS( B( K, K ) ) .LT. BTOL ) THEN
-*              A diagonal element of B is negligable, move it
+*              A diagonal element of B is negligible, move it
 *              to the top and deflate it
                
                DO K2 = K, ISTART2+1, -1
diff --git a/lapack-netlib/SRC/clarfb_gett.f b/lapack-netlib/SRC/clarfb_gett.f
index ee6959ed8..5f042e345 100644
--- a/lapack-netlib/SRC/clarfb_gett.f
+++ b/lapack-netlib/SRC/clarfb_gett.f
@@ -452,7 +452,7 @@
          IF( LNOTIDENT ) THEN
 *
 *           col2_(2) Compute W2: = (V1**H) * W2 = (A1**H) * W2,
-*           V1 is not an identy matrix, but unit lower-triangular
+*           V1 is not an identity matrix, but unit lower-triangular
 *           V1 stored in A1 (diagonal ones are not stored).
 *
 *
diff --git a/lapack-netlib/SRC/clarfgp.f b/lapack-netlib/SRC/clarfgp.f
index b584484c7..980e93612 100644
--- a/lapack-netlib/SRC/clarfgp.f
+++ b/lapack-netlib/SRC/clarfgp.f
@@ -97,7 +97,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERauxiliary
+*> \ingroup larfgp
 *
 *  =====================================================================
       SUBROUTINE CLARFGP( N, ALPHA, X, INCX, TAU )
@@ -122,7 +122,7 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            J, KNT
-      REAL               ALPHI, ALPHR, BETA, BIGNUM, SMLNUM, XNORM
+      REAL               ALPHI, ALPHR, BETA, BIGNUM, EPS, SMLNUM, XNORM
       COMPLEX            SAVEALPHA
 *     ..
 *     .. External Functions ..
@@ -143,37 +143,28 @@
          RETURN
       END IF
 *
+      EPS = SLAMCH( 'Precision' )
       XNORM = SCNRM2( N-1, X, INCX )
       ALPHR = REAL( ALPHA )
       ALPHI = AIMAG( ALPHA )
 *
-      IF( XNORM.EQ.ZERO ) THEN
+      IF( XNORM.LE.EPS*ABS(ALPHA) .AND. ALPHI.EQ.ZERO ) THEN
 *
 *        H  =  [1-alpha/abs(alpha) 0; 0 I], sign chosen so ALPHA >= 0.
 *
-         IF( ALPHI.EQ.ZERO ) THEN
-            IF( ALPHR.GE.ZERO ) THEN
-*              When TAU.eq.ZERO, the vector is special-cased to be
-*              all zeros in the application routines.  We do not need
-*              to clear it.
-               TAU = ZERO
-            ELSE
-*              However, the application routines rely on explicit
-*              zero checks when TAU.ne.ZERO, and we must clear X.
-               TAU = TWO
-               DO J = 1, N-1
-                  X( 1 + (J-1)*INCX ) = ZERO
-               END DO
-               ALPHA = -ALPHA
-            END IF
+         IF( ALPHR.GE.ZERO ) THEN
+*           When TAU.eq.ZERO, the vector is special-cased to be
+*           all zeros in the application routines.  We do not need
+*           to clear it.
+            TAU = ZERO
          ELSE
-*           Only "reflecting" the diagonal entry to be real and non-negative.
-            XNORM = SLAPY2( ALPHR, ALPHI )
-            TAU = CMPLX( ONE - ALPHR / XNORM, -ALPHI / XNORM )
+*           However, the application routines rely on explicit
+*           zero checks when TAU.ne.ZERO, and we must clear X.
+            TAU = TWO
             DO J = 1, N-1
                X( 1 + (J-1)*INCX ) = ZERO
             END DO
-            ALPHA = XNORM
+            ALPHA = -ALPHA
          END IF
       ELSE
 *
diff --git a/lapack-netlib/SRC/classq.f90 b/lapack-netlib/SRC/classq.f90
index cb4e7971f..c5f793cc0 100644
--- a/lapack-netlib/SRC/classq.f90
+++ b/lapack-netlib/SRC/classq.f90
@@ -34,28 +34,15 @@
 !>
 !> \verbatim
 !>
-!> CLASSQ  returns the values  scl  and  smsq  such that
+!> CLASSQ returns the values scale_out and sumsq_out such that
 !>
-!>    ( scl**2 )*smsq = x( 1 )**2 +...+ x( n )**2 + ( scale**2 )*sumsq,
+!>    (scale_out**2)*sumsq_out = x( 1 )**2 +...+ x( n )**2 + (scale**2)*sumsq,
 !>
-!> where  x( i ) = X( 1 + ( i - 1 )*INCX ). The value of  sumsq  is
+!> where x( i ) = X( 1 + ( i - 1 )*INCX ). The value of sumsq is
 !> assumed to be non-negative.
 !>
 !> scale and sumsq must be supplied in SCALE and SUMSQ and
-!> scl and smsq are overwritten on SCALE and SUMSQ respectively.
-!>
-!> If scale * sqrt( sumsq ) > tbig then
-!>    we require:   scale >= sqrt( TINY*EPS ) / sbig   on entry,
-!> and if 0 < scale * sqrt( sumsq ) < tsml then
-!>    we require:   scale <= sqrt( HUGE ) / ssml       on entry,
-!> where
-!>    tbig -- upper threshold for values whose square is representable;
-!>    sbig -- scaling constant for big numbers; \see la_constants.f90
-!>    tsml -- lower threshold for values whose square is representable;
-!>    ssml -- scaling constant for small numbers; \see la_constants.f90
-!> and
-!>    TINY*EPS -- tiniest representable number;
-!>    HUGE     -- biggest representable number.
+!> scale_out and sumsq_out are overwritten on SCALE and SUMSQ respectively.
 !>
 !> \endverbatim
 !
@@ -72,7 +59,7 @@
 !> \verbatim
 !>          X is COMPLEX array, dimension (1+(N-1)*abs(INCX))
 !>          The vector for which a scaled sum of squares is computed.
-!>             x( i )  = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n.
+!>             x( i ) = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n.
 !> \endverbatim
 !>
 !> \param[in] INCX
@@ -82,24 +69,24 @@
 !>          If INCX > 0, X(1+(i-1)*INCX) = x(i) for 1 <= i <= n
 !>          If INCX < 0, X(1-(n-i)*INCX) = x(i) for 1 <= i <= n
 !>          If INCX = 0, x isn't a vector so there is no need to call
-!>          this subroutine.  If you call it anyway, it will count x(1)
+!>          this subroutine. If you call it anyway, it will count x(1)
 !>          in the vector norm N times.
 !> \endverbatim
 !>
 !> \param[in,out] SCALE
 !> \verbatim
 !>          SCALE is REAL
-!>          On entry, the value  scale  in the equation above.
-!>          On exit, SCALE is overwritten with  scl , the scaling factor
+!>          On entry, the value scale in the equation above.
+!>          On exit, SCALE is overwritten by scale_out, the scaling factor
 !>          for the sum of squares.
 !> \endverbatim
 !>
 !> \param[in,out] SUMSQ
 !> \verbatim
 !>          SUMSQ is REAL
-!>          On entry, the value  sumsq  in the equation above.
-!>          On exit, SUMSQ is overwritten with  smsq , the basic sum of
-!>          squares from which  scl  has been factored out.
+!>          On entry, the value sumsq in the equation above.
+!>          On exit, SUMSQ is overwritten by sumsq_out, the basic sum of
+!>          squares from which scale_out has been factored out.
 !> \endverbatim
 !
 !  Authors:
@@ -130,10 +117,10 @@
 !>
 !> \endverbatim
 !
-!> \ingroup OTHERauxiliary
+!> \ingroup lassq
 !
 !  =====================================================================
-subroutine CLASSQ( n, x, incx, scl, sumsq )
+subroutine CLASSQ( n, x, incx, scale, sumsq )
    use LA_CONSTANTS, &
       only: wp=>sp, zero=>szero, one=>sone, &
             sbig=>ssbig, ssml=>sssml, tbig=>stbig, tsml=>stsml
@@ -145,7 +132,7 @@ subroutine CLASSQ( n, x, incx, scl, sumsq )
 !
 !  .. Scalar Arguments ..
    integer :: incx, n
-   real(wp) :: scl, sumsq
+   real(wp) :: scale, sumsq
 !  ..
 !  .. Array Arguments ..
    complex(wp) :: x(*)
@@ -158,10 +145,10 @@ subroutine CLASSQ( n, x, incx, scl, sumsq )
 !
 !  Quick return if possible
 !
-   if( LA_ISNAN(scl) .or. LA_ISNAN(sumsq) ) return
-   if( sumsq == zero ) scl = one
-   if( scl == zero ) then
-      scl = one
+   if( LA_ISNAN(scale) .or. LA_ISNAN(sumsq) ) return
+   if( sumsq == zero ) scale = one
+   if( scale == zero ) then
+      scale = one
       sumsq = zero
    end if
    if (n <= 0) then
@@ -207,15 +194,27 @@ subroutine CLASSQ( n, x, incx, scl, sumsq )
 !  Put the existing sum of squares into one of the accumulators
 !
    if( sumsq > zero ) then
-      ax = scl*sqrt( sumsq )
+      ax = scale*sqrt( sumsq )
       if (ax > tbig) then
-!        We assume scl >= sqrt( TINY*EPS ) / sbig
-         abig = abig + (scl*sbig)**2 * sumsq
+         if (scale > one) then
+            scale = scale * sbig
+            abig = abig + scale * (scale * sumsq)
+         else
+            ! sumsq > tbig^2 => (sbig * (sbig * sumsq)) is representable
+            abig = abig + scale * (scale * (sbig * (sbig * sumsq)))
+         end if
       else if (ax < tsml) then
-!        We assume scl <= sqrt( HUGE ) / ssml
-         if (notbig) asml = asml + (scl*ssml)**2 * sumsq
+         if (notbig) then
+            if (scale < one) then
+               scale = scale * ssml
+               asml = asml + scale * (scale * sumsq)
+            else
+               ! sumsq < tsml^2 => (ssml * (ssml * sumsq)) is representable
+               asml = asml + scale * (scale * (ssml * (ssml * sumsq)))
+            end if
+         end if
       else
-         amed = amed + scl**2 * sumsq
+         amed = amed + scale * (scale * sumsq)
       end if
    end if
 !
@@ -229,7 +228,7 @@ subroutine CLASSQ( n, x, incx, scl, sumsq )
       if (amed > zero .or. LA_ISNAN(amed)) then
          abig = abig + (amed*sbig)*sbig
       end if
-      scl = one / sbig
+      scale = one / sbig
       sumsq = abig
    else if (asml > zero) then
 !
@@ -245,17 +244,17 @@ subroutine CLASSQ( n, x, incx, scl, sumsq )
             ymin = asml
             ymax = amed
          end if
-         scl = one
+         scale = one
          sumsq = ymax**2*( one + (ymin/ymax)**2 )
       else
-         scl = one / ssml
+         scale = one / ssml
          sumsq = asml
       end if
    else
 !
 !     Otherwise all values are mid-range or zero
 !
-      scl = one
+      scale = one
       sumsq = amed
    end if
    return
diff --git a/lapack-netlib/SRC/claswlq.f b/lapack-netlib/SRC/claswlq.f
index 63cbd02c6..2044e055c 100644
--- a/lapack-netlib/SRC/claswlq.f
+++ b/lapack-netlib/SRC/claswlq.f
@@ -96,21 +96,24 @@
 *>          The leading dimension of the array T.  LDT >= MB.
 *> \endverbatim
 *>
-*>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) COMPLEX array, dimension (MAX(1,LWORK))
-*>
+*>          (workspace) COMPLEX array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
+*>
 *> \param[in] LWORK
 *> \verbatim
-*>          The dimension of the array WORK.  LWORK >= MB*M.
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MB*M, otherwise.
+*>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>          only calculates the optimal size of the WORK array, returns
+*>          only calculates the minimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
-*>
 *> \endverbatim
+*>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
@@ -158,40 +161,42 @@
 *>     SIAM J. Sci. Comput, vol. 34, no. 1, 2012
 *> \endverbatim
 *>
+*> \ingroup laswlq
+*>
 *  =====================================================================
       SUBROUTINE CLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK,
-     $                  INFO)
+     $                    INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
 *
 *     .. Scalar Arguments ..
-      INTEGER           INFO, LDA, M, N, MB, NB, LWORK, LDT
+      INTEGER            INFO, LDA, M, N, MB, NB, LWORK, LDT
 *     ..
 *     .. Array Arguments ..
-      COMPLEX           A( LDA, * ), WORK( * ), T( LDT, *)
+      COMPLEX            A( LDA, * ), WORK( * ), T( LDT, * )
 *     ..
 *
 *  =====================================================================
 *
 *     ..
 *     .. Local Scalars ..
-      LOGICAL    LQUERY
-      INTEGER    I, II, KK, CTR
+      LOGICAL            LQUERY
+      INTEGER            I, II, KK, CTR, MINMN, LWMIN
 *     ..
 *     .. EXTERNAL FUNCTIONS ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      INTEGER            ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
+*     ..
 *     .. EXTERNAL SUBROUTINES ..
       EXTERNAL           CGELQT, CTPLQT, XERBLA
+*     ..
 *     .. INTRINSIC FUNCTIONS ..
       INTRINSIC          MAX, MIN, MOD
 *     ..
-*     .. EXTERNAL FUNCTIONS ..
-      INTEGER            ILAENV
-      EXTERNAL           ILAENV
-*     ..
 *     .. EXECUTABLE STATEMENTS ..
 *
 *     TEST THE INPUT ARGUMENTS
@@ -199,12 +204,19 @@
       INFO = 0
 *
       LQUERY = ( LWORK.EQ.-1 )
+*
+      MINMN = MIN( M, N )
+      IF( MINMN.EQ.0 ) THEN
+        LWMIN = 1
+      ELSE
+        LWMIN = M*MB
+      END IF
 *
       IF( M.LT.0 ) THEN
         INFO = -1
       ELSE IF( N.LT.0 .OR. N.LT.M ) THEN
         INFO = -2
-      ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 )) THEN
+      ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 ) ) THEN
         INFO = -3
       ELSE IF( NB.LE.0 ) THEN
         INFO = -4
@@ -212,60 +224,61 @@
         INFO = -6
       ELSE IF( LDT.LT.MB ) THEN
         INFO = -8
-      ELSE IF( ( LWORK.LT.M*MB) .AND. (.NOT.LQUERY) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN
         INFO = -10
       END IF
-      IF( INFO.EQ.0)  THEN
-      WORK(1) = MB*M
+*
+      IF( INFO.EQ.0 ) THEN
+        WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       END IF
 *
       IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'CLASWLQ', -INFO )
         RETURN
-      ELSE IF (LQUERY) THEN
-       RETURN
+      ELSE IF( LQUERY ) THEN
+        RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( MIN(M,N).EQ.0 ) THEN
-          RETURN
+      IF( MINMN.EQ.0 ) THEN
+        RETURN
       END IF
 *
 *     The LQ Decomposition
 *
-       IF((M.GE.N).OR.(NB.LE.M).OR.(NB.GE.N)) THEN
+      IF( (M.GE.N) .OR. (NB.LE.M) .OR. (NB.GE.N) ) THEN
         CALL CGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO)
         RETURN
-       END IF
+      END IF
 *
-       KK = MOD((N-M),(NB-M))
-       II=N-KK+1
+      KK = MOD((N-M),(NB-M))
+      II = N-KK+1
 *
-*      Compute the LQ factorization of the first block A(1:M,1:NB)
+*     Compute the LQ factorization of the first block A(1:M,1:NB)
 *
-       CALL CGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO)
-       CTR = 1
+      CALL CGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO)
+      CTR = 1
 *
-       DO I = NB+1, II-NB+M , (NB-M)
+      DO I = NB+1, II-NB+M , (NB-M)
 *
-*      Compute the QR factorization of the current block A(1:M,I:I+NB-M)
+*       Compute the LQ factorization of the current block A(1:M,I:I+NB-M)
 *
-         CALL CTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ),
+        CALL CTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ),
      $                  LDA, T(1,CTR*M+1),
      $                  LDT, WORK, INFO )
-         CTR = CTR + 1
-       END DO
+        CTR = CTR + 1
+      END DO
 *
 *     Compute the QR factorization of the last block A(1:M,II:N)
 *
-       IF (II.LE.N) THEN
+      IF( II.LE.N ) THEN
         CALL CTPLQT( M, KK, 0, MB, A(1,1), LDA, A( 1, II ),
      $                  LDA, T(1,CTR*M+1), LDT,
      $                  WORK, INFO )
-       END IF
+      END IF
 *
-      WORK( 1 ) = M * MB
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       RETURN
 *
 *     End of CLASWLQ
diff --git a/lapack-netlib/SRC/clatdf.f b/lapack-netlib/SRC/clatdf.f
index 5445e387e..1e7d71669 100644
--- a/lapack-netlib/SRC/clatdf.f
+++ b/lapack-netlib/SRC/clatdf.f
@@ -227,7 +227,7 @@
             BM = RHS( J ) - CONE
             SPLUS = ONE
 *
-*           Lockahead for L- part RHS(1:N-1) = +-1
+*           Look-ahead for L- part RHS(1:N-1) = +-1
 *           SPLUS and SMIN computed more efficiently than in BSOLVE[1].
 *
             SPLUS = SPLUS + REAL( CDOTC( N-J, Z( J+1, J ), 1, Z( J+1,
diff --git a/lapack-netlib/SRC/clatrs3.f b/lapack-netlib/SRC/clatrs3.f
index a902f1ed0..354141a8b 100644
--- a/lapack-netlib/SRC/clatrs3.f
+++ b/lapack-netlib/SRC/clatrs3.f
@@ -152,13 +152,17 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is REAL array, dimension (LWORK).
+*>          WORK is REAL array, dimension (MAX(1,LWORK)).
 *>          On exit, if INFO = 0, WORK(1) returns the optimal size of
 *>          WORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
+*> \verbatim
 *>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>
+*>          If MIN(N,NRHS) = 0, LWORK >= 1, else
 *>          LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where
 *>          NBA = (N + NB - 1)/NB and NB is the optimal block size.
 *>
@@ -166,6 +170,7 @@
 *>          only calculates the optimal dimensions of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
+*> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
@@ -182,7 +187,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleOTHERauxiliary
+*> \ingroup latrs3
 *> \par Further Details:
 *  =====================
 *  \verbatim
@@ -257,15 +262,16 @@
       LOGICAL            LQUERY, NOTRAN, NOUNIT, UPPER
       INTEGER            AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J,
      $                   JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2,
-     $                   LANRM, LDS, LSCALE, NB, NBA, NBX, RHS
+     $                   LANRM, LDS, LSCALE, NB, NBA, NBX, RHS, LWMIN
       REAL               ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC,
      $                   SCAMIN, SMLNUM, TMAX
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, CLANGE, SLARMM
-      EXTERNAL           ILAENV, LSAME, SLAMCH, CLANGE, SLARMM
+      REAL               SLAMCH, CLANGE, SLARMM, SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SLAMCH, CLANGE, SLARMM,
+     $                   SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLATRS, CSSCAL, XERBLA
@@ -296,15 +302,24 @@
 *     row. WORK( I + KK * LDS ) is the scale factor of the vector
 *     segment associated with the I-th block row and the KK-th vector
 *     in the block column.
+*
       LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) )
       LDS = NBA
+*
 *     The second part stores upper bounds of the triangular A. There are
 *     a total of NBA x NBA blocks, of which only the upper triangular
 *     part or the lower triangular part is referenced. The upper bound of
 *     the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ).
+*
       LANRM = NBA * NBA
       AWRK = LSCALE
-      WORK( 1 ) = LSCALE + LANRM
+*
+      IF( MIN( N, NRHS ).EQ.0 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = LSCALE + LANRM
+      END IF
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
 *
 *     Test the input parameters.
 *
@@ -326,7 +341,7 @@
          INFO = -8
       ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
          INFO = -10
-      ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN
+      ELSE IF( .NOT.LQUERY .AND. LWORK.LT.LWMIN ) THEN
          INFO = -14
       END IF
       IF( INFO.NE.0 ) THEN
@@ -577,7 +592,7 @@
 *              Prepare the linear update to be executed with GEMM.
 *              For each column, compute a consistent scaling, a
 *              scaling factor to survive the linear update, and
-*              rescale the column segments, if necesssary. Then
+*              rescale the column segments, if necessary. Then
 *              the linear update is safely executed.
 *
                DO KK = 1, K2-K1
@@ -659,6 +674,9 @@
             END IF
          END DO
       END DO
+*
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
+*
       RETURN
 *
 *     End of CLATRS3
diff --git a/lapack-netlib/SRC/clatsqr.f b/lapack-netlib/SRC/clatsqr.f
index 7a0b85487..67403693f 100644
--- a/lapack-netlib/SRC/clatsqr.f
+++ b/lapack-netlib/SRC/clatsqr.f
@@ -101,14 +101,18 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) COMPLEX array, dimension (MAX(1,LWORK))
+*>          (workspace) COMPLEX array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
-*>          The dimension of the array WORK.  LWORK >= NB*N.
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= NB*N, otherwise.
+*>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>          only calculates the optimal size of the WORK array, returns
+*>          only calculates the minimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
 *> \endverbatim
@@ -160,33 +164,38 @@
 *>     SIAM J. Sci. Comput, vol. 34, no. 1, 2012
 *> \endverbatim
 *>
+*> \ingroup latsqr
+*>
 *  =====================================================================
       SUBROUTINE CLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK,
-     $                    LWORK, INFO)
+     $                    LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
 *
 *     .. Scalar Arguments ..
-      INTEGER           INFO, LDA, M, N, MB, NB, LDT, LWORK
+      INTEGER            INFO, LDA, M, N, MB, NB, LDT, LWORK
 *     ..
 *     .. Array Arguments ..
-      COMPLEX           A( LDA, * ), WORK( * ), T(LDT, *)
+      COMPLEX            A( LDA, * ), WORK( * ), T( LDT, * )
 *     ..
 *
 *  =====================================================================
 *
 *     ..
 *     .. Local Scalars ..
-      LOGICAL    LQUERY
-      INTEGER    I, II, KK, CTR
+      LOGICAL            LQUERY
+      INTEGER            I, II, KK, CTR, LWMIN, MINMN
 *     ..
 *     .. EXTERNAL FUNCTIONS ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
+*     ..
 *     .. EXTERNAL SUBROUTINES ..
-      EXTERNAL    CGEQRT, CTPQRT, XERBLA
+      EXTERNAL           CGEQRT, CTPQRT, XERBLA
+*     ..
 *     .. INTRINSIC FUNCTIONS ..
       INTRINSIC          MAX, MIN, MOD
 *     ..
@@ -197,6 +206,13 @@
       INFO = 0
 *
       LQUERY = ( LWORK.EQ.-1 )
+*
+      MINMN = MIN( M, N )
+      IF( MINMN.EQ.0 ) THEN
+        LWMIN = 1
+      ELSE
+        LWMIN = N*NB
+      END IF
 *
       IF( M.LT.0 ) THEN
         INFO = -1
@@ -204,64 +220,65 @@
         INFO = -2
       ELSE IF( MB.LT.1 ) THEN
         INFO = -3
-      ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 )) THEN
+      ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 ) ) THEN
         INFO = -4
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
         INFO = -6
       ELSE IF( LDT.LT.NB ) THEN
         INFO = -8
-      ELSE IF( LWORK.LT.(N*NB) .AND. (.NOT.LQUERY) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN
         INFO = -10
       END IF
-      IF( INFO.EQ.0)  THEN
-        WORK(1) = NB*N
+*
+      IF( INFO.EQ.0 ) THEN
+        WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       END IF
       IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'CLATSQR', -INFO )
         RETURN
-      ELSE IF (LQUERY) THEN
-       RETURN
+      ELSE IF( LQUERY ) THEN
+        RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( MIN(M,N).EQ.0 ) THEN
-          RETURN
+      IF( MINMN.EQ.0 ) THEN
+        RETURN
       END IF
 *
 *     The QR Decomposition
 *
-       IF ((MB.LE.N).OR.(MB.GE.M)) THEN
-         CALL CGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO)
-         RETURN
-       END IF
-       KK = MOD((M-N),(MB-N))
-       II=M-KK+1
+      IF ( (MB.LE.N) .OR. (MB.GE.M) ) THEN
+        CALL CGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO )
+        RETURN
+      END IF
+      KK = MOD((M-N),(MB-N))
+      II = M-KK+1
 *
-*      Compute the QR factorization of the first block A(1:MB,1:N)
+*     Compute the QR factorization of the first block A(1:MB,1:N)
 *
-       CALL CGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO )
-       CTR = 1
+      CALL CGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO )
+      CTR = 1
 *
-       DO I = MB+1, II-MB+N ,  (MB-N)
+      DO I = MB+1, II-MB+N, (MB-N)
 *
-*      Compute the QR factorization of the current block A(I:I+MB-N,1:N)
+*       Compute the QR factorization of the current block A(I:I+MB-N,1:N)
 *
-         CALL CTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA,
+        CALL CTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA,
      $                 T(1,CTR * N + 1),
-     $                  LDT, WORK, INFO )
-         CTR = CTR + 1
-       END DO
+     $                 LDT, WORK, INFO )
+        CTR = CTR + 1
+      END DO
 *
-*      Compute the QR factorization of the last block A(II:M,1:N)
+*     Compute the QR factorization of the last block A(II:M,1:N)
 *
-       IF (II.LE.M) THEN
-         CALL CTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA,
+      IF( II.LE.M ) THEN
+        CALL CTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA,
      $                 T(1, CTR * N + 1), LDT,
-     $                  WORK, INFO )
-       END IF
+     $                 WORK, INFO )
+      END IF
 *
-      work( 1 ) = N*NB
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       RETURN
 *
 *     End of CLATSQR
diff --git a/lapack-netlib/SRC/cpbsv.f b/lapack-netlib/SRC/cpbsv.f
index 248abbc1f..889bbde08 100644
--- a/lapack-netlib/SRC/cpbsv.f
+++ b/lapack-netlib/SRC/cpbsv.f
@@ -119,9 +119,9 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i of A is not
-*>                positive definite, so the factorization could not be
-*>                completed, and the solution has not been computed.
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                of A is not positive, so the factorization could not
+*>                be completed, and the solution has not been computed.
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/cpbsvx.f b/lapack-netlib/SRC/cpbsvx.f
index 652e18501..975c87768 100644
--- a/lapack-netlib/SRC/cpbsvx.f
+++ b/lapack-netlib/SRC/cpbsvx.f
@@ -70,7 +70,7 @@
 *>    where U is an upper triangular band matrix, and L is a lower
 *>    triangular band matrix.
 *>
-*> 3. If the leading i-by-i principal minor is not positive definite,
+*> 3. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A.  If the reciprocal of the condition number is less than machine
@@ -280,10 +280,10 @@
 *>          = 0: successful exit
 *>          < 0: if INFO = -i, the i-th argument had an illegal value
 *>          > 0: if INFO = i, and i is
-*>                <= N:  the leading minor of order i of A is
-*>                       not positive definite, so the factorization
-*>                       could not be completed, and the solution has not
-*>                       been computed. RCOND = 0 is returned.
+*>                <= N:  the leading principal minor of order i of A
+*>                       is not positive, so the factorization could not
+*>                       be completed, and the solution has not been
+*>                       computed. RCOND = 0 is returned.
 *>                = N+1: U is nonsingular, but RCOND is less than machine
 *>                       precision, meaning that the matrix is singular
 *>                       to working precision.  Nevertheless, the
diff --git a/lapack-netlib/SRC/cpbtf2.f b/lapack-netlib/SRC/cpbtf2.f
index 0be2c0a7f..f5bc9b3a6 100644
--- a/lapack-netlib/SRC/cpbtf2.f
+++ b/lapack-netlib/SRC/cpbtf2.f
@@ -97,8 +97,8 @@
 *>          INFO is INTEGER
 *>          = 0: successful exit
 *>          < 0: if INFO = -k, the k-th argument had an illegal value
-*>          > 0: if INFO = k, the leading minor of order k is not
-*>               positive definite, and the factorization could not be
+*>          > 0: if INFO = k, the leading principal minor of order k
+*>               is not positive, and the factorization could not be
 *>               completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/cpbtrf.f b/lapack-netlib/SRC/cpbtrf.f
index a4c18efb3..af60780c8 100644
--- a/lapack-netlib/SRC/cpbtrf.f
+++ b/lapack-netlib/SRC/cpbtrf.f
@@ -92,8 +92,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/cpftrf.f b/lapack-netlib/SRC/cpftrf.f
index cbaab6832..12799c6f7 100644
--- a/lapack-netlib/SRC/cpftrf.f
+++ b/lapack-netlib/SRC/cpftrf.f
@@ -91,8 +91,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *>
 *>  Further Notes on RFP Format:
diff --git a/lapack-netlib/SRC/cposv.f b/lapack-netlib/SRC/cposv.f
index f37dfa3c0..ea6fc37db 100644
--- a/lapack-netlib/SRC/cposv.f
+++ b/lapack-netlib/SRC/cposv.f
@@ -110,9 +110,9 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i of A is not
-*>                positive definite, so the factorization could not be
-*>                completed, and the solution has not been computed.
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                of A is not positive, so the factorization could not
+*>                be completed, and the solution has not been computed.
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/cposvx.f b/lapack-netlib/SRC/cposvx.f
index 78b9f4db1..322a26447 100644
--- a/lapack-netlib/SRC/cposvx.f
+++ b/lapack-netlib/SRC/cposvx.f
@@ -70,7 +70,7 @@
 *>    where U is an upper triangular matrix and L is a lower triangular
 *>    matrix.
 *>
-*> 3. If the leading i-by-i principal minor is not positive definite,
+*> 3. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A.  If the reciprocal of the condition number is less than machine
@@ -276,10 +276,10 @@
 *>          = 0: successful exit
 *>          < 0: if INFO = -i, the i-th argument had an illegal value
 *>          > 0: if INFO = i, and i is
-*>                <= N:  the leading minor of order i of A is
-*>                       not positive definite, so the factorization
-*>                       could not be completed, and the solution has not
-*>                       been computed. RCOND = 0 is returned.
+*>                <= N:  the leading principal minor of order i of A
+*>                       is not positive, so the factorization could not
+*>                       be completed, and the solution has not been
+*>                       computed. RCOND = 0 is returned.
 *>                = N+1: U is nonsingular, but RCOND is less than machine
 *>                       precision, meaning that the matrix is singular
 *>                       to working precision.  Nevertheless, the
diff --git a/lapack-netlib/SRC/cposvxx.f b/lapack-netlib/SRC/cposvxx.f
index 7834c75da..c40a2d856 100644
--- a/lapack-netlib/SRC/cposvxx.f
+++ b/lapack-netlib/SRC/cposvxx.f
@@ -87,7 +87,7 @@
 *>    where U is an upper triangular matrix and L is a lower triangular
 *>    matrix.
 *>
-*>    3. If the leading i-by-i principal minor is not positive definite,
+*>    3. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A (see argument RCOND).  If the reciprocal of the condition number
diff --git a/lapack-netlib/SRC/cpotf2.f b/lapack-netlib/SRC/cpotf2.f
index 2f4658bae..d84988949 100644
--- a/lapack-netlib/SRC/cpotf2.f
+++ b/lapack-netlib/SRC/cpotf2.f
@@ -89,8 +89,8 @@
 *>          INFO is INTEGER
 *>          = 0: successful exit
 *>          < 0: if INFO = -k, the k-th argument had an illegal value
-*>          > 0: if INFO = k, the leading minor of order k is not
-*>               positive definite, and the factorization could not be
+*>          > 0: if INFO = k, the leading principal minor of order k
+*>               is not positive, and the factorization could not be
 *>               completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/cpotrf.f b/lapack-netlib/SRC/cpotrf.f
index 6aba3103e..e2b120a49 100644
--- a/lapack-netlib/SRC/cpotrf.f
+++ b/lapack-netlib/SRC/cpotrf.f
@@ -87,8 +87,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/cpotrf2.f b/lapack-netlib/SRC/cpotrf2.f
index e1eae3e9d..ea2e4ca98 100644
--- a/lapack-netlib/SRC/cpotrf2.f
+++ b/lapack-netlib/SRC/cpotrf2.f
@@ -86,8 +86,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/cppsv.f b/lapack-netlib/SRC/cppsv.f
index 1e6f02695..a8fd660c4 100644
--- a/lapack-netlib/SRC/cppsv.f
+++ b/lapack-netlib/SRC/cppsv.f
@@ -104,9 +104,9 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i of A is not
-*>                positive definite, so the factorization could not be
-*>                completed, and the solution has not been computed.
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                of A is not positive, so the factorization could not
+*>                be completed, and the solution has not been computed.
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/cppsvx.f b/lapack-netlib/SRC/cppsvx.f
index f6f07538c..2ef02100f 100644
--- a/lapack-netlib/SRC/cppsvx.f
+++ b/lapack-netlib/SRC/cppsvx.f
@@ -69,7 +69,7 @@
 *>    where U is an upper triangular matrix, L is a lower triangular
 *>    matrix, and **H indicates conjugate transpose.
 *>
-*> 3. If the leading i-by-i principal minor is not positive definite,
+*> 3. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A.  If the reciprocal of the condition number is less than machine
@@ -262,10 +262,10 @@
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *>          > 0:  if INFO = i, and i is
-*>                <= N:  the leading minor of order i of A is
-*>                       not positive definite, so the factorization
-*>                       could not be completed, and the solution has not
-*>                       been computed. RCOND = 0 is returned.
+*>                <= N:  the leading principal minor of order i of A
+*>                       is not positive, so the factorization could not
+*>                       be completed, and the solution has not been
+*>                       computed. RCOND = 0 is returned.
 *>                = N+1: U is nonsingular, but RCOND is less than machine
 *>                       precision, meaning that the matrix is singular
 *>                       to working precision.  Nevertheless, the
diff --git a/lapack-netlib/SRC/cpptrf.f b/lapack-netlib/SRC/cpptrf.f
index 4e81458cb..e36f834cb 100644
--- a/lapack-netlib/SRC/cpptrf.f
+++ b/lapack-netlib/SRC/cpptrf.f
@@ -79,9 +79,9 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
-*>                completed.
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
+*>                completed.
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/cpteqr.f b/lapack-netlib/SRC/cpteqr.f
index e3af59041..fc9c44908 100644
--- a/lapack-netlib/SRC/cpteqr.f
+++ b/lapack-netlib/SRC/cpteqr.f
@@ -123,8 +123,8 @@
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value.
 *>          > 0:  if INFO = i, and i is:
 *>                <= N  the Cholesky factorization of the matrix could
-*>                      not be performed because the i-th principal minor
-*>                      was not positive definite.
+*>                      not be performed because the leading principal
+*>                      minor of order i was not positive.
 *>                > N   the SVD algorithm failed to converge;
 *>                      if INFO = N+i, i off-diagonal elements of the
 *>                      bidiagonal factor did not converge to zero.
diff --git a/lapack-netlib/SRC/cptsv.f b/lapack-netlib/SRC/cptsv.f
index 20ee32bbe..4c16f6a0a 100644
--- a/lapack-netlib/SRC/cptsv.f
+++ b/lapack-netlib/SRC/cptsv.f
@@ -94,8 +94,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the solution has not been
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the solution has not been
 *>                computed.  The factorization has not been completed
 *>                unless i = N.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/cptsvx.f b/lapack-netlib/SRC/cptsvx.f
index db63a3c36..6f7d8cf5b 100644
--- a/lapack-netlib/SRC/cptsvx.f
+++ b/lapack-netlib/SRC/cptsvx.f
@@ -60,7 +60,7 @@
 *>    factorization can also be regarded as having the form
 *>    A = U**H*D*U.
 *>
-*> 2. If the leading i-by-i principal minor is not positive definite,
+*> 2. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A.  If the reciprocal of the condition number is less than machine
@@ -205,10 +205,10 @@
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *>          > 0:  if INFO = i, and i is
-*>                <= N:  the leading minor of order i of A is
-*>                       not positive definite, so the factorization
-*>                       could not be completed, and the solution has not
-*>                       been computed. RCOND = 0 is returned.
+*>                <= N:  the leading principal minor of order i of A
+*>                       is not positive, so the factorization could not
+*>                       be completed, and the solution has not been
+*>                       computed. RCOND = 0 is returned.
 *>                = N+1: U is nonsingular, but RCOND is less than machine
 *>                       precision, meaning that the matrix is singular
 *>                       to working precision.  Nevertheless, the
diff --git a/lapack-netlib/SRC/cpttrf.f b/lapack-netlib/SRC/cpttrf.f
index c6379e923..111343b78 100644
--- a/lapack-netlib/SRC/cpttrf.f
+++ b/lapack-netlib/SRC/cpttrf.f
@@ -71,8 +71,8 @@
 *>          INFO is INTEGER
 *>          = 0: successful exit
 *>          < 0: if INFO = -k, the k-th argument had an illegal value
-*>          > 0: if INFO = k, the leading minor of order k is not
-*>               positive definite; if k < N, the factorization could not
+*>          > 0: if INFO = k, the leading principal minor of order k
+*>               is not positive; if k < N, the factorization could not
 *>               be completed, while if k = N, the factorization was
 *>               completed, but D(N) <= 0.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/crscl.c b/lapack-netlib/SRC/crscl.c
new file mode 100644
index 000000000..7c87553d5
--- /dev/null
+++ b/lapack-netlib/SRC/crscl.c
@@ -0,0 +1,735 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccosf(Cf(Z)); }	/* single-precision ccosf, matching c_sin/c_exp/c_log */
+#ifdef _MSC_VER	/* no complex operators here: c = a*conj(b)/|b|^2, via temporaries so c may alias a or b */
+#define c_div(c, a, b) {real cdiv_den_=(b)->r*(b)->r+(b)->i*(b)->i, cdiv_re_=((a)->r*(b)->r+(a)->i*(b)->i)/cdiv_den_, cdiv_im_=((a)->i*(b)->r-(a)->r*(b)->i)/cdiv_den_; (c)->r=cdiv_re_; (c)->i=cdiv_im_;}
+#define z_div(c, a, b) {doublereal zdiv_den_=(b)->r*(b)->r+(b)->i*(b)->i, zdiv_re_=((a)->r*(b)->r+(a)->i*(b)->i)/zdiv_den_, zdiv_im_=((a)->i*(b)->r-(a)->r*(b)->i)/zdiv_den_; (c)->r=zdiv_re_; (c)->i=zdiv_im_;}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),Cd(B));}	/* B is a doublecomplex*, so convert it like A */
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) (ceil(w))	/* parenthesised so the macro is usable as an expression */
+#define myhuge_(w) (HUGE_VAL)	/* argument is ignored; always the largest double */
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) {	/* x**n by repeated squaring; n < 0 yields (1/x)**|n| */
+	float pow=1.0; BLASULONG u;	/* BLASULONG covers every integer value, unlike unsigned long on Win64 ILP64 */
+	if(n != 0) {
+		if(n < 0) x = 1/x;
+		for(u = n < 0 ? (BLASULONG)0 - (BLASULONG)n : (BLASULONG)n; ; ) {	/* |n| without signed overflow */
+			if(u & 01) pow *= x;	/* this bit of the exponent is set */
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static double dpow_ui(double x, integer n) {	/* x**n by repeated squaring; n < 0 yields (1/x)**|n| */
+	double pow=1.0; BLASULONG u;	/* BLASULONG covers every integer value, unlike unsigned long on Win64 ILP64 */
+	if(n != 0) {
+		if(n < 0) x = 1/x;
+		for(u = n < 0 ? (BLASULONG)0 - (BLASULONG)n : (BLASULONG)n; ; ) {	/* |n| without signed overflow */
+			if(u & 01) pow *= x;	/* this bit of the exponent is set */
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) {	/* complex x**n; NOTE(review): pow_ci passes Cf(a), an _Fcomplex - confirm the parameter type */
+	complex pow={1.0,0.0}; BLASULONG u; real t;
+	if(n != 0) {
+		if(n < 0) t = x.r*x.r + x.i*x.i, x.r = x.r/t, x.i = -x.i/t;	/* x = 1/x = conj(x)/|x|^2 */
+		for(u = n < 0 ? (BLASULONG)0 - (BLASULONG)n : (BLASULONG)n; ; ) {	/* |n| without signed overflow */
+			if(u & 01) t = pow.r*x.r - pow.i*x.i, pow.i = pow.r*x.i + pow.i*x.r, pow.r = t;	/* pow *= x */
+			if(u >>= 1) t = x.r*x.r - x.i*x.i, x.i = 2*x.r*x.i, x.r = t;	/* x *= x */
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) {	/* complex x**n by repeated squaring */
+	_Complex float pow=1.0; BLASULONG u;	/* BLASULONG covers every integer value, even for ILP64 */
+	if(n != 0) {
+		if(n < 0) x = 1/x;
+		for(u = n < 0 ? (BLASULONG)0 - (BLASULONG)n : (BLASULONG)n; ; ) {	/* |n| without signed overflow */
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {	/* complex x**n by repeated squaring, spelled out by components */
+	_Dcomplex pow={1.0,0.0}; BLASULONG u; doublereal t;
+	if(n != 0) {
+		if(n < 0) t = x._Val[0]*x._Val[0] + x._Val[1]*x._Val[1], x._Val[0] = x._Val[0]/t, x._Val[1] = -x._Val[1]/t;	/* x = conj(x)/|x|^2 */
+		for(u = n < 0 ? (BLASULONG)0 - (BLASULONG)n : (BLASULONG)n; ; ) {	/* |n| without signed overflow */
+			if(u & 01) t = pow._Val[0]*x._Val[0] - pow._Val[1]*x._Val[1], pow._Val[1] = pow._Val[0]*x._Val[1] + pow._Val[1]*x._Val[0], pow._Val[0] = t;	/* pow *= x */
+			if(u >>= 1) t = x._Val[0]*x._Val[0] - x._Val[1]*x._Val[1], x._Val[1] = 2*x._Val[0]*x._Val[1], x._Val[0] = t;	/* x *= x */
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) {	/* complex x**n by repeated squaring */
+	_Complex double pow=1.0; BLASULONG u;	/* BLASULONG covers every integer value, even for ILP64 */
+	if(n != 0) {
+		if(n < 0) x = 1/x;
+		for(u = n < 0 ? (BLASULONG)0 - (BLASULONG)n : (BLASULONG)n; ; ) {	/* |n| without signed overflow */
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) {	/* integer x**n with truncating integer results */
+	integer pow = 1; BLASULONG u;
+	if (n <= 0) {
+		if (n == 0 || x == 1) return 1;
+		/* |x| > 1 to a negative power truncates to 0; 0 to a negative power */
+		/* divides by zero on purpose, exactly as the original code did     */
+		if (x != -1) return x == 0 ? 1/x : 0;
+	}
+	/* here n > 0, or x == -1 with n < 0 where only the parity of |n| matters */
+	u = n < 0 ? (BLASULONG)0 - (BLASULONG)n : (BLASULONG)n;
+	for( ; ; ) {
+		if(u & 01) pow *= x;
+		if(u >>= 1) x *= x;
+		else break;
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{	/* 1-based position, counted from s, of the first largest entry among w(s..e); n is not used */
+	integer k, best = s;
+	for (k = s + 1; k <= e; k++)
+		if (w[k-1] > w[best-1]) best = k;	/* strict '>' keeps the earliest maximum */
+	return best - s + 1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{	/* 1-based position, counted from s, of the first largest entry among w(s..e); n is not used */
+	integer k, best = s;
+	for (k = s + 1; k <= e; k++)
+		if (w[k-1] > w[best-1]) best = k;	/* strict '>' keeps the earliest maximum */
+	return best - s + 1;
+}
+/*
+ * cdotc_ -- inline stand-in for BLAS CDOTC, result returned through z:
+ *     *z = sum over k = 1..n of conjg(x(k)) * y(k)
+ * n_, incx_ and incy_ are passed by reference, Fortran style.  A negative
+ * increment walks that vector backwards starting at offset (1-n)*inc, the
+ * same rule the reference BLAS applies.  n <= 0 stores zero in *z.
+ */
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+	/* 0-based offset of the first element visited in each vector */
+	integer ix = incx < 0 ? (1 - n) * incx : 0;
+	integer iy = incy < 0 ? (1 - n) * incy : 0;
+#ifdef _MSC_VER
+	/* MSVC has no complex operators, so expand conj(x)*y by components: */
+	/* (xr - i xi)(yr + i yi) = (xr yr + xi yi) + i (xr yi - xi yr)      */
+	_Fcomplex zdotc = {0.0, 0.0};
+	for (i = 0; i < n; i++, ix += incx, iy += incy) {
+		zdotc._Val[0] += x[ix].r * y[iy].r + x[ix].i * y[iy].i;
+		zdotc._Val[1] += x[ix].r * y[iy].i - x[ix].i * y[iy].r;
+	}
+	pCf(z) = zdotc;
+}
+#else
+	/* native C99 complex arithmetic */
+	_Complex float zdotc = 0.0;
+	for (i = 0; i < n; i++, ix += incx, iy += incy) {
+		zdotc += conjf(Cf(&x[ix])) * Cf(&y[iy]);
+	}
+	pCf(z) = zdotc;
+}
+#endif
+/*
+ * zdotc_ -- inline stand-in for BLAS ZDOTC, result returned through z:
+ *     *z = sum over k = 1..n of conjg(x(k)) * y(k)
+ * n_, incx_ and incy_ are passed by reference, Fortran style.  A negative
+ * increment walks that vector backwards starting at offset (1-n)*inc, the
+ * same rule the reference BLAS applies.  n <= 0 stores zero in *z.
+ */
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+	/* 0-based offset of the first element visited in each vector */
+	integer ix = incx < 0 ? (1 - n) * incx : 0;
+	integer iy = incy < 0 ? (1 - n) * incy : 0;
+#ifdef _MSC_VER
+	/* MSVC has no complex operators, so expand conj(x)*y by components: */
+	/* (xr - i xi)(yr + i yi) = (xr yr + xi yi) + i (xr yi - xi yr)      */
+	_Dcomplex zdotc = {0.0, 0.0};
+	for (i = 0; i < n; i++, ix += incx, iy += incy) {
+		zdotc._Val[0] += x[ix].r * y[iy].r + x[ix].i * y[iy].i;
+		zdotc._Val[1] += x[ix].r * y[iy].i - x[ix].i * y[iy].r;
+	}
+	pCd(z) = zdotc;
+}
+#else
+	/* native C99 complex arithmetic */
+	_Complex double zdotc = 0.0;
+	for (i = 0; i < n; i++, ix += incx, iy += incy) {
+		zdotc += conj(Cd(&x[ix])) * Cd(&y[iy]);
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*
+ * cdotu_ -- inline stand-in for BLAS CDOTU, result returned through z:
+ *     *z = sum over k = 1..n of x(k) * y(k)      (no conjugation)
+ * n_, incx_ and incy_ are passed by reference, Fortran style.  A negative
+ * increment walks that vector backwards starting at offset (1-n)*inc, the
+ * same rule the reference BLAS applies.  n <= 0 stores zero in *z.
+ */
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+	/* 0-based offset of the first element visited in each vector */
+	integer ix = incx < 0 ? (1 - n) * incx : 0;
+	integer iy = incy < 0 ? (1 - n) * incy : 0;
+#ifdef _MSC_VER
+	/* MSVC has no complex operators, so expand x*y by components:  */
+	/* (xr + i xi)(yr + i yi) = (xr yr - xi yi) + i (xr yi + xi yr) */
+	_Fcomplex zdotc = {0.0, 0.0};
+	for (i = 0; i < n; i++, ix += incx, iy += incy) {
+		zdotc._Val[0] += x[ix].r * y[iy].r - x[ix].i * y[iy].i;
+		zdotc._Val[1] += x[ix].r * y[iy].i + x[ix].i * y[iy].r;
+	}
+	pCf(z) = zdotc;
+}
+#else
+	/* native C99 complex arithmetic */
+	_Complex float zdotc = 0.0;
+	for (i = 0; i < n; i++, ix += incx, iy += incy) {
+		zdotc += Cf(&x[ix]) * Cf(&y[iy]);
+	}
+	pCf(z) = zdotc;
+}
+#endif
+/*
+ * zdotu_ -- inline stand-in for BLAS ZDOTU, result returned through z:
+ *     *z = sum over k = 1..n of x(k) * y(k)      (no conjugation)
+ * n_, incx_ and incy_ are passed by reference, Fortran style.  A negative
+ * increment walks that vector backwards starting at offset (1-n)*inc, the
+ * same rule the reference BLAS applies.  n <= 0 stores zero in *z.
+ */
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+	/* 0-based offset of the first element visited in each vector */
+	integer ix = incx < 0 ? (1 - n) * incx : 0;
+	integer iy = incy < 0 ? (1 - n) * incy : 0;
+#ifdef _MSC_VER
+	/* MSVC has no complex operators, so expand x*y by components:  */
+	/* (xr + i xi)(yr + i yi) = (xr yr - xi yi) + i (xr yi + xi yr) */
+	_Dcomplex zdotc = {0.0, 0.0};
+	for (i = 0; i < n; i++, ix += incx, iy += incy) {
+		zdotc._Val[0] += x[ix].r * y[iy].r - x[ix].i * y[iy].i;
+		zdotc._Val[1] += x[ix].r * y[iy].i + x[ix].i * y[iy].r;
+	}
+	pCd(z) = zdotc;
+}
+#else
+	/* native C99 complex arithmetic */
+	_Complex double zdotc = 0.0;
+	for (i = 0; i < n; i++, ix += incx, iy += incy) {
+		zdotc += Cd(&x[ix]) * Cd(&y[iy]);
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* > \brief \b CRSCL multiplies a vector by the reciprocal of a complex scalar. */
+
+/*  =========== DOCUMENTATION =========== */
+
+/* Online html documentation available at */
+/*            http://www.netlib.org/lapack/explore-html/ */
+
+/* > \htmlonly */
+/* > Download CRSCL + dependencies */
+/* > <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/crscl.f
+"> */
+/* > [TGZ]</a> */
+/* > <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/crscl.f
+"> */
+/* > [ZIP]</a> */
+/* > <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/crscl.f
+"> */
+/* > [TXT]</a> */
+/* > \endhtmlonly */
+
+/*  Definition: */
+/*  =========== */
+
+/*       SUBROUTINE CRSCL( N, A, X, INCX ) */
+
+/*       INTEGER            INCX, N */
+/*       COMPLEX            A */
+/*       COMPLEX            X( * ) */
+
+
+/* > \par Purpose: */
+/*  ============= */
+/* > */
+/* > \verbatim */
+/* > */
+/* > CRSCL multiplies an n-element complex vector x by the complex scalar */
+/* > 1/a.  This is done without overflow or underflow as long as */
+/* > the final result x/a does not overflow or underflow. */
+/* > \endverbatim */
+
+/*  Arguments: */
+/*  ========== */
+
+/* > \param[in] N */
+/* > \verbatim */
+/* >          N is INTEGER */
+/* >          The number of components of the vector x. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] A */
+/* > \verbatim */
+/* >          A is COMPLEX */
+/* >          The scalar a which is used to divide each component of x. */
+/* >          A must not be 0, or the subroutine will divide by zero. */
+/* > \endverbatim */
+/* > */
+/* > \param[in,out] X */
+/* > \verbatim */
+/* >          X is COMPLEX array, dimension */
+/* >                         (1+(N-1)*abs(INCX)) */
+/* >          The n-element vector x. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] INCX */
+/* > \verbatim */
+/* >          INCX is INTEGER */
+/* >          The increment between successive values of the vector X. */
+/* >          > 0:  X(1) = x(1) and X(1+(i-1)*INCX) = x(i),     1< i<= n */
+/* > \endverbatim */
+
+/*  Authors: */
+/*  ======== */
+
+/* > \author Univ. of Tennessee */
+/* > \author Univ. of California Berkeley */
+/* > \author Univ. of Colorado Denver */
+/* > \author NAG Ltd. */
+
+/* > \ingroup complexOTHERauxiliary */
+
+/*  ===================================================================== */
+/* Subroutine: scales x in place by 1/a */ int crscl_(integer *n, complex *a, complex *x, integer *incx)
+{
+    /* System generated locals */
+    real r__1, r__2;
+    complex q__1;
+
+    /* Local variables */
+    real absi, absr;
+    extern /* Subroutine */ int cscal_(integer *, complex *, complex *, 
+	    integer *);
+    real ai, ar, ui, ov, ur;
+    extern real slamch_(char *);
+    extern /* Subroutine */ int csscal_(integer *, real *, complex *, integer 
+	    *);
+    real safmin, safmax;
+    extern /* Subroutine */ int csrscl_(integer *, real *, complex *, integer 
+	    *);
+
+
+/*  -- LAPACK auxiliary routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+
+/* ===================================================================== */
+
+
+/*     Quick return if possible (the n <= 0 test follows the pointer adjustment) */
+
+    /* Parameter adjustments: shift x so that x[1] addresses the caller's first element */
+    --x;
+
+    /* Function Body */
+    if (*n <= 0) {
+	return 0;
+    }
+
+/*     Get machine parameters from slamch_: "S" -> safmin, "O" -> ov; safmax = 1/safmin */
+
+    safmin = slamch_("S");
+    safmax = 1.f / safmin;
+    ov = slamch_("O");
+
+/*     Split a into its real/imaginary parts (ar, ai) and their magnitudes (absr, absi). */
+
+    ar = a->r;
+    ai = r_imag(a);
+    absr = abs(ar);
+    absi = abs(ai);
+
+    if (ai == 0.f) {
+/*        If a (alpha) is real, i.e. ai == 0, delegate to csrscl_ with the real divisor ar */
+	csrscl_(n, &ar, &x[1], incx);
+
+    } else if (ar == 0.f) {
+/*        If alpha has a zero real part (a = i*ai), then 1/a = -i/ai; we follow the same */
+/*        rules as if alpha were real, splitting the factor in two when absi is huge or tiny. */
+	if (absi > safmax) {
+	    csscal_(n, &safmin, &x[1], incx);
+	    r__1 = -safmax / ai;
+	    q__1.r = 0.f, q__1.i = r__1;
+	    cscal_(n, &q__1, &x[1], incx);
+	} else if (absi < safmin) {
+	    r__1 = -safmin / ai;
+	    q__1.r = 0.f, q__1.i = r__1;
+	    cscal_(n, &q__1, &x[1], incx);
+	    csscal_(n, &safmax, &x[1], incx);
+	} else {
+	    r__1 = -1.f / ai;
+	    q__1.r = 0.f, q__1.i = r__1;
+	    cscal_(n, &q__1, &x[1], incx);
+	}
+
+    } else {
+/*        The following numbers can be computed. */
+/*        1/ur is the real part and -1/ui is the imaginary part of 1/alpha. */
+/*        Note that ar and ai (a and b) are both different from zero in this branch. */
+/*        NaNs are only possible if either: */
+/*        1. alphaR or alphaI is NaN. */
+/*        2. alphaR and alphaI are both infinite, in which case it makes sense */
+/*        to propagate a NaN. */
+	ur = ar + ai * (ai / ar);
+	ui = ai + ar * (ar / ai);
+
+	if (abs(ur) < safmin || abs(ui) < safmin) {
+/*           Both alphaR and alphaI are very small: scale the factor by safmin, then x by safmax. */
+	    r__1 = safmin / ur;
+	    r__2 = -safmin / ui;
+	    q__1.r = r__1, q__1.i = r__2;
+	    cscal_(n, &q__1, &x[1], incx);
+	    csscal_(n, &safmax, &x[1], incx);
+	} else if (abs(ur) > safmax || abs(ui) > safmax) {
+	    if (absr > ov || absi > ov) {
+/*              absr or absi exceeds ov here (original note: a and b are both Inf). No need for scaling. */
+		r__1 = 1.f / ur;
+		r__2 = -1.f / ui;
+		q__1.r = r__1, q__1.i = r__2;
+		cscal_(n, &q__1, &x[1], incx);
+	    } else {
+		csscal_(n, &safmin, &x[1], incx);
+		if (abs(ur) > ov || abs(ui) > ov) {
+/*                 Infs were generated in ur or ui. Recompute them pre-scaled by safmin to avoid them. */
+		    if (absr >= absi) {
+/*                    ABS( UR ) <= ABS( UI ) */
+			ur = safmin * ar + safmin * (ai * (ai / ar));
+			ui = safmin * ai + ar * (safmin * ar / ai);
+		    } else {
+/*                    ABS( UR ) > ABS( UI ) */
+			ur = safmin * ar + ai * (safmin * ai / ar);
+			ui = safmin * ai + safmin * (ar * (ar / ai));
+		    }
+		    r__1 = 1.f / ur;
+		    r__2 = -1.f / ui;
+		    q__1.r = r__1, q__1.i = r__2;
+		    cscal_(n, &q__1, &x[1], incx);
+		} else {
+		    r__1 = safmax / ur;
+		    r__2 = -safmax / ui;
+		    q__1.r = r__1, q__1.i = r__2;
+		    cscal_(n, &q__1, &x[1], incx);
+		}
+	    }
+	} else {
+	    r__1 = 1.f / ur;
+	    r__2 = -1.f / ui;
+	    q__1.r = r__1, q__1.i = r__2;
+	    cscal_(n, &q__1, &x[1], incx);
+	}
+    }
+
+    return 0;
+
+/*     End of CRSCL */
+
+} /* crscl_ */
+
diff --git a/lapack-netlib/SRC/crscl.f b/lapack-netlib/SRC/crscl.f
new file mode 100644
index 000000000..22919cd62
--- /dev/null
+++ b/lapack-netlib/SRC/crscl.f
@@ -0,0 +1,202 @@
+*> \brief \b CRSCL multiplies a vector by the reciprocal of a complex scalar.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CRSCL + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/crscl.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/crscl.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/crscl.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CRSCL( N, A, X, INCX )
+*
+*       .. Scalar Arguments ..
+*       INTEGER            INCX, N
+*       COMPLEX            A
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX            X( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CRSCL multiplies an n-element complex vector x by the complex scalar
+*> 1/a.  This is done without overflow or underflow as long as
+*> the final result x/a does not overflow or underflow.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of components of the vector x.
+*> \endverbatim
+*>
+*> \param[in] A
+*> \verbatim
+*>          A is COMPLEX
+*>          The scalar a which is used to divide each component of x.
+*>          A must not be 0, or the subroutine will divide by zero.
+*> \endverbatim
+*>
+*> \param[in,out] X
+*> \verbatim
+*>          X is COMPLEX array, dimension
+*>                         (1+(N-1)*abs(INCX))
+*>          The n-element vector x.
+*> \endverbatim
+*>
+*> \param[in] INCX
+*> \verbatim
+*>          INCX is INTEGER
+*>          The increment between successive values of the vector X.
+*>          > 0:  X(1) = x(1) and X(1+(i-1)*INCX) = x(i),     1< i<= n
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup complexOTHERauxiliary
+*
+*  =====================================================================
+      SUBROUTINE CRSCL( N, A, X, INCX )
+*     Scales the N-element complex vector X in place by 1/A.
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            INCX, N
+      COMPLEX            A
+*     ..
+*     .. Array Arguments ..
+      COMPLEX            X( * )
+*     ..
+*
+* =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      REAL               SAFMAX, SAFMIN, OV, AR, AI, ABSR, ABSI, UR
+     %                   , UI
+*     ..
+*     .. External Functions (NOTE(review): CLADIV is declared but never called below) ..
+      REAL               SLAMCH
+      COMPLEX            CLADIV
+      EXTERNAL           SLAMCH, CLADIV
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           CSCAL, CSSCAL, CSRSCL
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS
+*     ..
+*     .. Executable Statements ..
+*
+*     Quick return if possible
+*
+      IF( N.LE.0 )
+     $   RETURN
+*
+*     Get machine parameters from SLAMCH: 'S' -> SAFMIN, 'O' -> OV; SAFMAX = 1/SAFMIN
+*
+      SAFMIN = SLAMCH( 'S' )
+      SAFMAX = ONE / SAFMIN
+      OV   = SLAMCH( 'O' )
+*
+*     Split A into its real/imaginary parts (AR, AI) and their magnitudes (ABSR, ABSI).
+*
+      AR = REAL( A )
+      AI = AIMAG( A )
+      ABSR = ABS( AR )
+      ABSI = ABS( AI )
+*
+      IF( AI.EQ.ZERO ) THEN
+*        If A (alpha) is real, i.e. AI = 0, delegate to CSRSCL with the real divisor AR
+         CALL CSRSCL( N, AR, X, INCX )
+*
+      ELSE IF( AR.EQ.ZERO ) THEN
+*        If alpha has a zero real part (A = i*AI), then 1/A = -i/AI; we follow the same
+*        rules as if alpha were real, splitting the factor in two when ABSI is huge or tiny.
+         IF( ABSI.GT.SAFMAX ) THEN
+            CALL CSSCAL( N, SAFMIN, X, INCX )
+            CALL CSCAL( N, CMPLX( ZERO, -SAFMAX / AI ), X, INCX )
+         ELSE IF( ABSI.LT.SAFMIN ) THEN
+            CALL CSCAL( N, CMPLX( ZERO, -SAFMIN / AI ), X, INCX )
+            CALL CSSCAL( N, SAFMAX, X, INCX )
+         ELSE
+            CALL CSCAL( N, CMPLX( ZERO, -ONE / AI ), X, INCX )
+         END IF
+*
+      ELSE
+*        The following numbers can be computed.
+*        1/UR is the real part and -1/UI is the imaginary part of 1/alpha.
+*        Note that AR and AI (a and b) are both different from zero in this branch.
+*        NaNs are only possible if either:
+*        1. alphaR or alphaI is NaN.
+*        2. alphaR and alphaI are both infinite, in which case it makes sense
+*        to propagate a NaN.
+         UR = AR + AI * ( AI / AR )
+         UI = AI + AR * ( AR / AI )
+*
+         IF( (ABS( UR ).LT.SAFMIN).OR.(ABS( UI ).LT.SAFMIN) ) THEN
+*           Both alphaR and alphaI are very small: scale the factor by SAFMIN, then X by SAFMAX.
+            CALL CSCAL( N, CMPLX( SAFMIN / UR, -SAFMIN / UI ), X, INCX )
+            CALL CSSCAL( N, SAFMAX, X, INCX )
+         ELSE IF( (ABS( UR ).GT.SAFMAX).OR.(ABS( UI ).GT.SAFMAX) ) THEN
+            IF( (ABSR.GT.OV).OR.(ABSI.GT.OV) ) THEN
+*              ABSR or ABSI exceeds OV here (original note: a and b are both Inf). No need for scaling.
+               CALL CSCAL( N, CMPLX( ONE / UR, -ONE / UI ), X, INCX )
+            ELSE
+               CALL CSSCAL( N, SAFMIN, X, INCX )
+               IF( (ABS( UR ).GT.OV).OR.(ABS( UI ).GT.OV) ) THEN
+*                 Infs were generated in UR or UI. Recompute them pre-scaled by SAFMIN to avoid them.
+                  IF( ABSR.GE.ABSI ) THEN
+*                    ABS( UR ) <= ABS( UI )
+                     UR = (SAFMIN * AR) + SAFMIN * (AI * ( AI / AR ))
+                     UI = (SAFMIN * AI) + AR * ( (SAFMIN * AR) / AI )
+                  ELSE
+*                    ABS( UR ) > ABS( UI )
+                     UR = (SAFMIN * AR) + AI * ( (SAFMIN * AI) / AR )
+                     UI = (SAFMIN * AI) + SAFMIN * (AR * ( AR / AI ))
+                  END IF
+                  CALL CSCAL( N, CMPLX( ONE / UR, -ONE / UI ), X, INCX )
+               ELSE
+                  CALL CSCAL( N, CMPLX( SAFMAX / UR, -SAFMAX / UI ),
+     $                        X, INCX )
+               END IF
+            END IF
+         ELSE
+            CALL CSCAL( N, CMPLX( ONE / UR, -ONE / UI ), X, INCX )
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of CRSCL
+*
+      END
diff --git a/lapack-netlib/SRC/cstedc.f b/lapack-netlib/SRC/cstedc.f
index a57d9eaef..d7db591b3 100644
--- a/lapack-netlib/SRC/cstedc.f
+++ b/lapack-netlib/SRC/cstedc.f
@@ -43,12 +43,6 @@
 *> be found if CHETRD or CHPTRD or CHBTRD has been used to reduce this
 *> matrix to tridiagonal form.
 *>
-*> This code makes very mild assumptions about floating point
-*> arithmetic. It will work on machines with a guard digit in
-*> add/subtract, or on those binary machines without guard digits
-*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
-*> It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.  See SLAED3 for details.
 *> \endverbatim
 *
 *  Arguments:
@@ -198,7 +192,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup stedc
 *
 *> \par Contributors:
 *  ==================
@@ -239,8 +233,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANST
-      EXTERNAL           ILAENV, LSAME, SLAMCH, SLANST
+      REAL               SLAMCH, SLANST, SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SLAMCH, SLANST, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, CLACPY, CLACRM, CLAED0, CSTEQR, CSWAP,
@@ -301,7 +295,7 @@
             LRWMIN = 1 + 4*N + 2*N**2
             LIWMIN = 3 + 5*N
          END IF
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          RWORK( 1 ) = LRWMIN
          IWORK( 1 ) = LIWMIN
 *
@@ -472,7 +466,7 @@
       END IF
 *
    70 CONTINUE
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       RWORK( 1 ) = LRWMIN
       IWORK( 1 ) = LIWMIN
 *
diff --git a/lapack-netlib/SRC/cstegr.f b/lapack-netlib/SRC/cstegr.f
index 9d6e06da3..a162d5076 100644
--- a/lapack-netlib/SRC/cstegr.f
+++ b/lapack-netlib/SRC/cstegr.f
@@ -56,7 +56,7 @@
 *>
 *> Note : CSTEGR and CSTEMR work only on machines which follow
 *> IEEE-754 floating-point standard in their handling of infinities and
-*> NaNs.  Normal execution may create these exceptiona values and hence
+*> NaNs.  Normal execution may create these exceptional values and hence
 *> may abort due to a floating point exception in environments which
 *> do not conform to the IEEE-754 standard.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/cstemr.f b/lapack-netlib/SRC/cstemr.f
index d49684db3..46b20d880 100644
--- a/lapack-netlib/SRC/cstemr.f
+++ b/lapack-netlib/SRC/cstemr.f
@@ -320,7 +320,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup stemr
 *
 *> \par Contributors:
 *  ==================
@@ -329,7 +329,8 @@
 *> Jim Demmel, University of California, Berkeley, USA \n
 *> Inderjit Dhillon, University of Texas, Austin, USA \n
 *> Osni Marques, LBNL/NERSC, USA \n
-*> Christof Voemel, University of California, Berkeley, USA
+*> Christof Voemel, University of California, Berkeley, USA \n
+*> Aravindh Krishnamoorthy, FAU, Erlangen, Germany \n
 *
 *  =====================================================================
       SUBROUTINE CSTEMR( JOBZ, RANGE, N, D, E, VL, VU, IL, IU,
@@ -361,7 +362,8 @@
      $                     MINRGP = 3.0E-3 )
 *     ..
 *     .. Local Scalars ..
-      LOGICAL            ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY
+      LOGICAL            ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY,
+     $                   LAESWAP
       INTEGER            I, IBEGIN, IEND, IFIRST, IIL, IINDBL, IINDW,
      $                   IINDWK, IINFO, IINSPL, IIU, ILAST, IN, INDD,
      $                   INDE2, INDERR, INDGP, INDGRS, INDWRK, ITMP,
@@ -374,8 +376,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               SLAMCH, SLANST
-      EXTERNAL           LSAME, SLAMCH, SLANST
+      REAL               SLAMCH, SLANST, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, SLANST, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLARRV, CSWAP, SCOPY, SLAE2, SLAEV2, SLARRC,
@@ -397,6 +399,7 @@
 *
       LQUERY = ( ( LWORK.EQ.-1 ).OR.( LIWORK.EQ.-1 ) )
       ZQUERY = ( NZC.EQ.-1 )
+      LAESWAP = .FALSE.
 
 *     SSTEMR needs WORK of size 6*N, IWORK of size 3*N.
 *     In addition, SLARRE needs WORK of size 6*N, IWORK of size 5*N.
@@ -459,7 +462,7 @@
       RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) )
 *
       IF( INFO.EQ.0 ) THEN
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          IWORK( 1 ) = LIWMIN
 *
          IF( WANTZ .AND. ALLEIG ) THEN
@@ -519,6 +522,15 @@
          ELSE IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
             CALL SLAEV2( D(1), E(1), D(2), R1, R2, CS, SN )
          END IF
+*        D/S/LAE2 and D/S/LAEV2 outputs satisfy |R1| >= |R2|. However,
+*        the following code requires R1 >= R2. Hence, we correct
+*        the order of R1, R2, CS, SN if R1 < R2 before further processing.
+         IF( R1.LT.R2 ) THEN
+            E(2) = R1
+            R1 = R2
+            R2 = E(2)
+            LAESWAP = .TRUE.
+         ENDIF
          IF( ALLEIG.OR.
      $      (VALEIG.AND.(R2.GT.WL).AND.
      $                  (R2.LE.WU)).OR.
@@ -526,8 +538,13 @@
             M = M+1
             W( M ) = R2
             IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
-               Z( 1, M ) = -SN
-               Z( 2, M ) = CS
+               IF( LAESWAP ) THEN
+                  Z( 1, M ) = CS
+                  Z( 2, M ) = SN
+               ELSE
+                  Z( 1, M ) = -SN
+                  Z( 2, M ) = CS
+               ENDIF
 *              Note: At most one of SN and CS can be zero.
                IF (SN.NE.ZERO) THEN
                   IF (CS.NE.ZERO) THEN
@@ -550,8 +567,13 @@
             M = M+1
             W( M ) = R1
             IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
-               Z( 1, M ) = CS
-               Z( 2, M ) = SN
+               IF( LAESWAP ) THEN
+                  Z( 1, M ) = -SN
+                  Z( 2, M ) = CS
+               ELSE
+                  Z( 1, M ) = CS
+                  Z( 2, M ) = SN
+               ENDIF
 *              Note: At most one of SN and CS can be zero.
                IF (SN.NE.ZERO) THEN
                   IF (CS.NE.ZERO) THEN
@@ -779,7 +801,7 @@
       ENDIF
 *
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       IWORK( 1 ) = LIWMIN
       RETURN
 *
diff --git a/lapack-netlib/SRC/csyconvf.f b/lapack-netlib/SRC/csyconvf.f
index f3b46f0b4..febbc411c 100644
--- a/lapack-netlib/SRC/csyconvf.f
+++ b/lapack-netlib/SRC/csyconvf.f
@@ -39,7 +39,7 @@
 *> CSYTRF provided on entry in parameter A into the factorization
 *> output format used in CSYTRF_RK (or CSYTRF_BK) that is stored
 *> on exit in parameters A and E. It also converts in place details of
-*> the intechanges stored in IPIV from the format used in CSYTRF into
+*> the interchanges stored in IPIV from the format used in CSYTRF into
 *> the format used in CSYTRF_RK (or CSYTRF_BK).
 *>
 *> If parameter WAY = 'R':
@@ -48,7 +48,7 @@
 *> (or CSYTRF_BK) provided on entry in parameters A and E into
 *> the factorization output format used in CSYTRF that is stored
 *> on exit in parameter A. It also converts in place details of
-*> the intechanges stored in IPIV from the format used in CSYTRF_RK
+*> the interchanges stored in IPIV from the format used in CSYTRF_RK
 *> (or CSYTRF_BK) into the format used in CSYTRF.
 *>
 *> CSYCONVF can also convert in Hermitian matrix case, i.e. between
@@ -325,7 +325,7 @@
                   END IF
 *
 *                 Convert IPIV
-*                 There is no interchnge of rows i and and IPIV(i),
+*                 There is no interchange of rows i and IPIV(i),
 *                 so this should be reflected in IPIV format for
 *                 *SYTRF_RK ( or *SYTRF_BK)
 *
@@ -469,7 +469,7 @@
                   END IF
 *
 *                 Convert IPIV
-*                 There is no interchnge of rows i and and IPIV(i),
+*                 There is no interchange of rows i and IPIV(i),
 *                 so this should be reflected in IPIV format for
 *                 *SYTRF_RK ( or *SYTRF_BK)
 *
@@ -535,7 +535,7 @@
 *
 *           Revert VALUE
 *           Assign subdiagonal entries of D from array E to
-*           subgiagonal entries of A.
+*           subdiagonal entries of A.
 *
             I = 1
             DO WHILE ( I.LE.N-1 )
diff --git a/lapack-netlib/SRC/csyconvf_rook.f b/lapack-netlib/SRC/csyconvf_rook.f
index a5b9d82da..0da34e0fa 100644
--- a/lapack-netlib/SRC/csyconvf_rook.f
+++ b/lapack-netlib/SRC/csyconvf_rook.f
@@ -520,7 +520,7 @@
 *
 *           Revert VALUE
 *           Assign subdiagonal entries of D from array E to
-*           subgiagonal entries of A.
+*           subdiagonal entries of A.
 *
             I = 1
             DO WHILE ( I.LE.N-1 )
diff --git a/lapack-netlib/SRC/csysv.f b/lapack-netlib/SRC/csysv.f
index 4ddabf62f..a2d1e7cbe 100644
--- a/lapack-netlib/SRC/csysv.f
+++ b/lapack-netlib/SRC/csysv.f
@@ -163,7 +163,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexSYsolve
+*> \ingroup hesv
 *
 *  =====================================================================
       SUBROUTINE CSYSV( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK,
@@ -190,7 +190,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, CSYTRF, CSYTRS, CSYTRS2
@@ -225,7 +226,7 @@
             CALL CSYTRF( UPLO, N, A, LDA, IPIV, WORK, -1, INFO )
             LWKOPT = INT( WORK( 1 ) )
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -258,7 +259,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/csysv_aa.f b/lapack-netlib/SRC/csysv_aa.f
index 8548c2789..571a91123 100644
--- a/lapack-netlib/SRC/csysv_aa.f
+++ b/lapack-netlib/SRC/csysv_aa.f
@@ -154,7 +154,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexSYsolve
+*> \ingroup hesv_aa
 *
 *  =====================================================================
       SUBROUTINE CSYSV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK,
@@ -182,7 +182,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           ILAENV, LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, CSYTRF_AA, CSYTRS_AA
@@ -217,7 +218,7 @@
      $                   -1, INFO )
          LWKOPT_SYTRS = INT( WORK(1) )
          LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS )
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -239,7 +240,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/csysv_aa_2stage.f b/lapack-netlib/SRC/csysv_aa_2stage.f
index d8881a748..10119d8ba 100644
--- a/lapack-netlib/SRC/csysv_aa_2stage.f
+++ b/lapack-netlib/SRC/csysv_aa_2stage.f
@@ -87,7 +87,7 @@
 *>          triangular part of the matrix A, and the strictly upper
 *>          triangular part of A is not referenced.
 *>
-*>          On exit, L is stored below (or above) the subdiaonal blocks,
+*>          On exit, L is stored below (or above) the subdiagonal blocks,
 *>          when UPLO  is 'L' (or 'U').
 *> \endverbatim
 *>
@@ -177,7 +177,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexSYcomputational
+*> \ingroup hesv_aa_2stage
 *
 *  =====================================================================
       SUBROUTINE CSYSV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB,
@@ -207,7 +207,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CSYTRF_AA_2STAGE, 
@@ -267,7 +268,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/csysv_rk.f b/lapack-netlib/SRC/csysv_rk.f
index ef5334dcd..cb98ab1dc 100644
--- a/lapack-netlib/SRC/csysv_rk.f
+++ b/lapack-netlib/SRC/csysv_rk.f
@@ -205,7 +205,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexSYsolve
+*> \ingroup hesv_rk
 *
 *> \par Contributors:
 *  ==================
@@ -247,7 +247,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, CSYTRF_RK, CSYTRS_3
@@ -282,7 +283,7 @@
             CALL CSYTRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO )
             LWKOPT = INT( WORK( 1 ) )
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -304,7 +305,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/csysv_rook.f b/lapack-netlib/SRC/csysv_rook.f
index aad594e21..8798ddfb2 100644
--- a/lapack-netlib/SRC/csysv_rook.f
+++ b/lapack-netlib/SRC/csysv_rook.f
@@ -181,7 +181,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexSYsolve
+*> \ingroup hesv_rook
 *
 *> \par Contributors:
 *  ==================
@@ -223,7 +223,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, CSYTRF_ROOK, CSYTRS_ROOK
@@ -258,7 +259,7 @@
             CALL CSYTRF_ROOK( UPLO, N, A, LDA, IPIV, WORK, -1, INFO )
             LWKOPT = INT( WORK( 1 ) )
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -281,7 +282,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/csysvx.f b/lapack-netlib/SRC/csysvx.f
index 2afa082a9..3c7a37889 100644
--- a/lapack-netlib/SRC/csysvx.f
+++ b/lapack-netlib/SRC/csysvx.f
@@ -276,7 +276,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexSYsolve
+*> \ingroup hesvx
 *
 *  =====================================================================
       SUBROUTINE CSYSVX( FACT, UPLO, N, NRHS, A, LDA, AF, LDAF, IPIV, B,
@@ -313,8 +313,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               CLANSY, SLAMCH
-      EXTERNAL           ILAENV, LSAME, CLANSY, SLAMCH
+      REAL               CLANSY, SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, CLANSY, SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLACPY, CSYCON, CSYRFS, CSYTRF, CSYTRS, XERBLA
@@ -356,7 +356,7 @@
             NB = ILAENV( 1, 'CSYTRF', UPLO, N, -1, -1, -1 )
             LWKOPT = MAX( LWKOPT, N*NB )
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -405,7 +405,7 @@
       IF( RCOND.LT.SLAMCH( 'Epsilon' ) )
      $   INFO = N + 1
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/csytrf.f b/lapack-netlib/SRC/csytrf.f
index ebf228f18..519e78490 100644
--- a/lapack-netlib/SRC/csytrf.f
+++ b/lapack-netlib/SRC/csytrf.f
@@ -135,7 +135,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexSYcomputational
+*> \ingroup hetrf
 *
 *> \par Further Details:
 *  =====================
@@ -202,7 +202,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLASYF, CSYTF2, XERBLA
@@ -232,8 +233,8 @@
 *        Determine the block size
 *
          NB = ILAENV( 1, 'CSYTRF', UPLO, N, -1, -1, -1 )
-         LWKOPT = N*NB
-         WORK( 1 ) = LWKOPT
+         LWKOPT = MAX( 1, N*NB )
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -351,7 +352,7 @@
       END IF
 *
    40 CONTINUE
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of CSYTRF
diff --git a/lapack-netlib/SRC/csytrf_aa.f b/lapack-netlib/SRC/csytrf_aa.f
index b1165a425..cf994913d 100644
--- a/lapack-netlib/SRC/csytrf_aa.f
+++ b/lapack-netlib/SRC/csytrf_aa.f
@@ -74,7 +74,7 @@
 *>
 *>          On exit, the tridiagonal matrix is stored in the diagonals
 *>          and the subdiagonals of A just below (or above) the diagonals,
-*>          and L is stored below (or above) the subdiaonals, when UPLO
+*>          and L is stored below (or above) the subdiagonals, when UPLO
 *>          is 'L' (or 'U').
 *> \endverbatim
 *>
@@ -125,7 +125,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexSYcomputational
+*> \ingroup hetrf_aa
 *
 *  =====================================================================
       SUBROUTINE CSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO)
@@ -159,7 +159,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLASYF_AA, CGEMM, CGEMV, CSCAL, CSWAP, CCOPY,
@@ -191,7 +192,7 @@
 *
       IF( INFO.EQ.0 ) THEN
          LWKOPT = (NB+1)*N
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -457,7 +458,7 @@
       END IF
 *
    20 CONTINUE
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of CSYTRF_AA
diff --git a/lapack-netlib/SRC/csytrf_aa_2stage.f b/lapack-netlib/SRC/csytrf_aa_2stage.f
index 8a2cfd7bc..e56aedaf6 100644
--- a/lapack-netlib/SRC/csytrf_aa_2stage.f
+++ b/lapack-netlib/SRC/csytrf_aa_2stage.f
@@ -75,7 +75,7 @@
 *>          triangular part of the matrix A, and the strictly upper
 *>          triangular part of A is not referenced.
 *>
-*>          On exit, L is stored below (or above) the subdiaonal blocks,
+*>          On exit, L is stored below (or above) the subdiagonal blocks,
 *>          when UPLO  is 'L' (or 'U').
 *> \endverbatim
 *>
@@ -152,7 +152,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexSYcomputational
+*> \ingroup hetrf_aa_2stage
 *
 *  =====================================================================
       SUBROUTINE CSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV,
@@ -188,7 +188,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CCOPY, CGBTRF, CGEMM, CGETRF, CLACPY,  
@@ -230,7 +231,7 @@
             TB( 1 ) = (3*NB+1)*N
          END IF
          IF( WQUERY ) THEN
-            WORK( 1 ) = N*NB
+            WORK( 1 ) = SROUNDUP_LWORK(N*NB)
          END IF
       END IF
       IF( TQUERY .OR. WQUERY ) THEN
diff --git a/lapack-netlib/SRC/csytrf_rk.f b/lapack-netlib/SRC/csytrf_rk.f
index 9c2b7182f..de39bda41 100644
--- a/lapack-netlib/SRC/csytrf_rk.f
+++ b/lapack-netlib/SRC/csytrf_rk.f
@@ -229,7 +229,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexSYcomputational
+*> \ingroup hetrf_rk
 *
 *> \par Further Details:
 *  =====================
@@ -280,7 +280,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLASYF_RK, CSYTF2_RK, CSWAP, XERBLA
@@ -310,8 +311,8 @@
 *        Determine the block size
 *
          NB = ILAENV( 1, 'CSYTRF_RK', UPLO, N, -1, -1, -1 )
-         LWKOPT = N*NB
-         WORK( 1 ) = LWKOPT
+         LWKOPT = MAX( 1, N*NB )
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -487,7 +488,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of CSYTRF_RK
diff --git a/lapack-netlib/SRC/csytrf_rook.f b/lapack-netlib/SRC/csytrf_rook.f
index ce7c1e586..72fe0629f 100644
--- a/lapack-netlib/SRC/csytrf_rook.f
+++ b/lapack-netlib/SRC/csytrf_rook.f
@@ -146,7 +146,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexSYcomputational
+*> \ingroup hetrf_rook
 *
 *> \par Further Details:
 *  =====================
@@ -228,7 +228,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLASYF_ROOK, CSYTF2_ROOK, XERBLA
@@ -259,7 +260,7 @@
 *
          NB = ILAENV( 1, 'CSYTRF_ROOK', UPLO, N, -1, -1, -1 )
          LWKOPT = MAX( 1, N*NB )
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -382,7 +383,7 @@
       END IF
 *
    40 CONTINUE
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of CSYTRF_ROOK
diff --git a/lapack-netlib/SRC/csytri_3.f b/lapack-netlib/SRC/csytri_3.f
index 279f62853..604d84b21 100644
--- a/lapack-netlib/SRC/csytri_3.f
+++ b/lapack-netlib/SRC/csytri_3.f
@@ -152,7 +152,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexSYcomputational
+*> \ingroup hetri_3
 *
 *> \par Contributors:
 *  ==================
@@ -190,7 +190,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CSYTRI_3X, XERBLA
@@ -225,7 +226,7 @@
          CALL XERBLA( 'CSYTRI_3', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
          RETURN
       END IF
 *
@@ -236,7 +237,7 @@
 *
       CALL CSYTRI_3X( UPLO, N, A, LDA, E, IPIV, WORK, NB, INFO )
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/csytrs_aa.f b/lapack-netlib/SRC/csytrs_aa.f
index 1f6ea40af..7f63539a6 100644
--- a/lapack-netlib/SRC/csytrs_aa.f
+++ b/lapack-netlib/SRC/csytrs_aa.f
@@ -123,7 +123,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexSYcomputational
+*> \ingroup hetrs_aa
 *
 *  =====================================================================
       SUBROUTINE CSYTRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB,
@@ -155,7 +155,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLACPY, CGTSV, CSWAP, CTRSM, XERBLA
@@ -186,7 +187,7 @@
          RETURN
       ELSE IF( LQUERY ) THEN
          LWKOPT = (3*N-2)
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
          RETURN
       END IF
 *
diff --git a/lapack-netlib/SRC/ctgevc.f b/lapack-netlib/SRC/ctgevc.f
index 4e5289cb2..22144f259 100644
--- a/lapack-netlib/SRC/ctgevc.f
+++ b/lapack-netlib/SRC/ctgevc.f
@@ -53,7 +53,7 @@
 *>
 *>    S*x = w*P*x,  (y**H)*S = w*(y**H)*P,
 *>
-*> where y**H denotes the conjugate tranpose of y.
+*> where y**H denotes the conjugate transpose of y.
 *> The eigenvalues are not input to this routine, but are computed
 *> directly from the diagonal elements of S and P.
 *>
@@ -154,7 +154,7 @@
 *> \verbatim
 *>          VR is COMPLEX array, dimension (LDVR,MM)
 *>          On entry, if SIDE = 'R' or 'B' and HOWMNY = 'B', VR must
-*>          contain an N-by-N matrix Q (usually the unitary matrix Z
+*>          contain an N-by-N matrix Z (usually the unitary matrix Z
 *>          of right Schur vectors returned by CHGEQZ).
 *>          On exit, if SIDE = 'R' or 'B', VR contains:
 *>          if HOWMNY = 'A', the matrix X of right eigenvectors of (S,P);
@@ -259,7 +259,7 @@
       EXTERNAL           LSAME, SLAMCH, CLADIV
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CGEMV, SLABAD, XERBLA
+      EXTERNAL           CGEMV, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, AIMAG, CMPLX, CONJG, MAX, MIN, REAL
@@ -367,7 +367,6 @@
 *
       SAFMIN = SLAMCH( 'Safe minimum' )
       BIG = ONE / SAFMIN
-      CALL SLABAD( SAFMIN, BIG )
       ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' )
       SMALL = SAFMIN*N / ULP
       BIG = ONE / SMALL
diff --git a/lapack-netlib/SRC/ctgsen.f b/lapack-netlib/SRC/ctgsen.f
index f9b6cd10c..180e96b32 100644
--- a/lapack-netlib/SRC/ctgsen.f
+++ b/lapack-netlib/SRC/ctgsen.f
@@ -290,7 +290,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup tgsen
 *
 *> \par Further Details:
 *  =====================
@@ -339,7 +339,7 @@
 *>            [ kron(In2, B11)  -kron(B22**H, In1) ].
 *>
 *>  Here, Inx is the identity matrix of size nx and A22**H is the
-*>  conjuguate transpose of A22. kron(X, Y) is the Kronecker product between
+*>  conjugate transpose of A22. kron(X, Y) is the Kronecker product between
 *>  the matrices X and Y.
 *>
 *>  When DIF(2) is small, small changes in (A, B) can cause large changes
@@ -467,6 +467,10 @@
 *     .. Local Arrays ..
       INTEGER            ISAVE( 3 )
 *     ..
+*     .. External Functions ..
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
+*     ..
 *     .. External Subroutines ..
       REAL               SLAMCH
       EXTERNAL           CLACN2, CLACPY, CLASSQ, CSCAL, CTGEXC, CTGSYL,
@@ -537,7 +541,7 @@
          LIWMIN = 1
       END IF
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) =  SROUNDUP_LWORK(LWMIN)
       IWORK( 1 ) = LIWMIN
 *
       IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
@@ -771,7 +775,7 @@
 *
    70 CONTINUE
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) =  SROUNDUP_LWORK(LWMIN)
       IWORK( 1 ) = LIWMIN
 *
       RETURN
diff --git a/lapack-netlib/SRC/ctgsna.f b/lapack-netlib/SRC/ctgsna.f
index 2295dc5cc..50498c413 100644
--- a/lapack-netlib/SRC/ctgsna.f
+++ b/lapack-netlib/SRC/ctgsna.f
@@ -213,7 +213,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup tgsna
 *
 *> \par Further Details:
 *  =====================
@@ -343,12 +343,13 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               SCNRM2, SLAMCH, SLAPY2
+      REAL               SCNRM2, SLAMCH, SLAPY2, SROUNDUP_LWORK
       COMPLEX            CDOTC
-      EXTERNAL           LSAME, SCNRM2, SLAMCH, SLAPY2, CDOTC
+      EXTERNAL           LSAME, SCNRM2, SLAMCH, SLAPY2, SROUNDUP_LWORK,
+     $                   CDOTC
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CGEMV, CLACPY, CTGEXC, CTGSYL, SLABAD, XERBLA
+      EXTERNAL           CGEMV, CLACPY, CTGEXC, CTGSYL, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, CMPLX, MAX
@@ -402,7 +403,7 @@
          ELSE
             LWMIN = N
          END IF
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
 *
          IF( MM.LT.M ) THEN
             INFO = -15
@@ -428,7 +429,6 @@
       EPS = SLAMCH( 'P' )
       SMLNUM = SLAMCH( 'S' ) / EPS
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       KS = 0
       DO 20 K = 1, N
 *
@@ -508,7 +508,7 @@
          END IF
 *
    20 CONTINUE
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       RETURN
 *
 *     End of CTGSNA
diff --git a/lapack-netlib/SRC/ctgsyl.f b/lapack-netlib/SRC/ctgsyl.f
index ae1437125..620556399 100644
--- a/lapack-netlib/SRC/ctgsyl.f
+++ b/lapack-netlib/SRC/ctgsyl.f
@@ -260,7 +260,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexSYcomputational
+*> \ingroup tgsyl
 *
 *> \par Contributors:
 *  ==================
@@ -329,7 +329,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CGEMM, CLACPY, CLASET, CSCAL, CTGSY2, XERBLA
@@ -382,7 +383,7 @@
          ELSE
             LWMIN = 1
          END IF
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
             INFO = -20
@@ -683,7 +684,7 @@
   210    CONTINUE
       END IF
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ctrevc3.f b/lapack-netlib/SRC/ctrevc3.f
index 0f58696b2..13cbf553f 100644
--- a/lapack-netlib/SRC/ctrevc3.f
+++ b/lapack-netlib/SRC/ctrevc3.f
@@ -222,7 +222,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup trevc3
 *
 *> \par Further Details:
 *  =====================
@@ -278,12 +278,13 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV, ICAMAX
-      REAL               SLAMCH, SCASUM
-      EXTERNAL           LSAME, ILAENV, ICAMAX, SLAMCH, SCASUM
+      REAL               SLAMCH, SCASUM, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, ICAMAX, SLAMCH, SCASUM,
+     $                   SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, CCOPY, CLASET, CSSCAL, CGEMM, CGEMV,
-     $                   CLATRS, CLACPY, SLABAD
+     $                   CLATRS, CLACPY
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, REAL, CMPLX, CONJG, AIMAG, MAX
@@ -321,9 +322,9 @@
 *
       INFO = 0
       NB = ILAENV( 1, 'CTREVC', SIDE // HOWMNY, N, -1, -1, -1 )
-      MAXWRK = N + 2*N*NB
-      WORK(1) = MAXWRK
-      RWORK(1) = N
+      MAXWRK = MAX( 1, N + 2*N*NB )
+      WORK(1) = SROUNDUP_LWORK(MAXWRK)
+      RWORK(1) = MAX( 1, N )
       LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 )
       IF( .NOT.RIGHTV .AND. .NOT.LEFTV ) THEN
          INFO = -1
@@ -371,7 +372,6 @@
 *
       UNFL = SLAMCH( 'Safe minimum' )
       OVFL = ONE / UNFL
-      CALL SLABAD( UNFL, OVFL )
       ULP = SLAMCH( 'Precision' )
       SMLNUM = UNFL*( N / ULP )
 *
diff --git a/lapack-netlib/SRC/ctrexc.f b/lapack-netlib/SRC/ctrexc.f
index ea64ddf9b..2bc0348fb 100644
--- a/lapack-netlib/SRC/ctrexc.f
+++ b/lapack-netlib/SRC/ctrexc.f
@@ -40,7 +40,7 @@
 *>
 *> The Schur form T is reordered by a unitary similarity transformation
 *> Z**H*T*Z, and optionally the matrix Q of Schur vectors is updated by
-*> postmultplying it with Z.
+*> postmultiplying it with Z.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/ctrsen.f b/lapack-netlib/SRC/ctrsen.f
index d93b97be6..9d59f6bf2 100644
--- a/lapack-netlib/SRC/ctrsen.f
+++ b/lapack-netlib/SRC/ctrsen.f
@@ -182,7 +182,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup trsen
 *
 *> \par Further Details:
 *  =====================
@@ -293,8 +293,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               CLANGE
-      EXTERNAL           LSAME, CLANGE
+      REAL               CLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, CLANGE, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLACN2, CLACPY, CTREXC, CTRSYL, XERBLA
@@ -350,7 +350,7 @@
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -444,7 +444,7 @@
          W( K ) = T( K, K )
    50 CONTINUE
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ctzrzf.f b/lapack-netlib/SRC/ctzrzf.f
index b21f092ce..ac3f59400 100644
--- a/lapack-netlib/SRC/ctzrzf.f
+++ b/lapack-netlib/SRC/ctzrzf.f
@@ -116,7 +116,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup tzrzf
 *
 *> \par Contributors:
 *  ==================
@@ -179,7 +179,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -207,7 +208,7 @@
             LWKOPT = M*NB
             LWKMIN = MAX( 1, M )
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
             INFO = -7
@@ -301,7 +302,7 @@
       IF( MU.GT.0 )
      $   CALL CLATRZ( MU, N, N-M, A, LDA, TAU, WORK )
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cunbdb.f b/lapack-netlib/SRC/cunbdb.f
index a41895dc8..b45dcfde6 100644
--- a/lapack-netlib/SRC/cunbdb.f
+++ b/lapack-netlib/SRC/cunbdb.f
@@ -255,7 +255,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unbdb
 *
 *> \par Further Details:
 *  =====================
@@ -320,9 +320,9 @@
 *
 *     ..
 *     .. External Functions ..
-      REAL               SCNRM2
+      REAL               SCNRM2, SROUNDUP_LWORK
       LOGICAL            LSAME
-      EXTERNAL           SCNRM2, LSAME
+      EXTERNAL           SCNRM2, SROUNDUP_LWORK, LSAME
 *     ..
 *     .. Intrinsic Functions
       INTRINSIC          ATAN2, COS, MAX, MIN, SIN
@@ -377,7 +377,7 @@
       IF( INFO .EQ. 0 ) THEN
          LWORKOPT = M - Q
          LWORKMIN = M - Q
-         WORK(1) = LWORKOPT
+         WORK(1) = SROUNDUP_LWORK(LWORKOPT)
          IF( LWORK .LT. LWORKMIN .AND. .NOT. LQUERY ) THEN
             INFO = -21
          END IF
diff --git a/lapack-netlib/SRC/cunbdb1.f b/lapack-netlib/SRC/cunbdb1.f
index 740e38a85..a4875ab5b 100644
--- a/lapack-netlib/SRC/cunbdb1.f
+++ b/lapack-netlib/SRC/cunbdb1.f
@@ -37,7 +37,7 @@
 *>\verbatim
 *>
 *> CUNBDB1 simultaneously bidiagonalizes the blocks of a tall and skinny
-*> matrix X with orthonomal columns:
+*> matrix X with orthonormal columns:
 *>
 *>                            [ B11 ]
 *>      [ X11 ]   [ P1 |    ] [  0  ]
@@ -173,7 +173,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unbdb1
 *
 *> \par Further Details:
 *  =====================
@@ -230,8 +230,8 @@
       EXTERNAL           CLACGV
 *     ..
 *     .. External Functions ..
-      REAL               SCNRM2
-      EXTERNAL           SCNRM2
+      REAL               SCNRM2, SROUNDUP_LWORK
+      EXTERNAL           SCNRM2, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Function ..
       INTRINSIC          ATAN2, COS, MAX, SIN, SQRT
@@ -264,7 +264,7 @@
          LORBDB5 = Q-2
          LWORKOPT = MAX( ILARF+LLARF-1, IORBDB5+LORBDB5-1 )
          LWORKMIN = LWORKOPT
-         WORK(1) = LWORKOPT
+         WORK(1) = SROUNDUP_LWORK(LWORKOPT)
          IF( LWORK .LT. LWORKMIN .AND. .NOT.LQUERY ) THEN
            INFO = -14
          END IF
diff --git a/lapack-netlib/SRC/cunbdb2.f b/lapack-netlib/SRC/cunbdb2.f
index b45db6100..6399964f8 100644
--- a/lapack-netlib/SRC/cunbdb2.f
+++ b/lapack-netlib/SRC/cunbdb2.f
@@ -37,7 +37,7 @@
 *>\verbatim
 *>
 *> CUNBDB2 simultaneously bidiagonalizes the blocks of a tall and skinny
-*> matrix X with orthonomal columns:
+*> matrix X with orthonormal columns:
 *>
 *>                            [ B11 ]
 *>      [ X11 ]   [ P1 |    ] [  0  ]
@@ -173,7 +173,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unbdb2
 *
 *> \par Further Details:
 *  =====================
@@ -231,8 +231,8 @@
      $                   XERBLA
 *     ..
 *     .. External Functions ..
-      REAL               SCNRM2
-      EXTERNAL           SCNRM2
+      REAL               SCNRM2, SROUNDUP_LWORK
+      EXTERNAL           SCNRM2, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Function ..
       INTRINSIC          ATAN2, COS, MAX, SIN, SQRT
@@ -265,7 +265,7 @@
          LORBDB5 = Q-1
          LWORKOPT = MAX( ILARF+LLARF-1, IORBDB5+LORBDB5-1 )
          LWORKMIN = LWORKOPT
-         WORK(1) = LWORKOPT
+         WORK(1) = SROUNDUP_LWORK(LWORKOPT)
          IF( LWORK .LT. LWORKMIN .AND. .NOT.LQUERY ) THEN
            INFO = -14
          END IF
diff --git a/lapack-netlib/SRC/cunbdb3.f b/lapack-netlib/SRC/cunbdb3.f
index b532bfbc8..d02460597 100644
--- a/lapack-netlib/SRC/cunbdb3.f
+++ b/lapack-netlib/SRC/cunbdb3.f
@@ -37,7 +37,7 @@
 *>\verbatim
 *>
 *> CUNBDB3 simultaneously bidiagonalizes the blocks of a tall and skinny
-*> matrix X with orthonomal columns:
+*> matrix X with orthonormal columns:
 *>
 *>                            [ B11 ]
 *>      [ X11 ]   [ P1 |    ] [  0  ]
@@ -173,7 +173,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unbdb3
 *
 *> \par Further Details:
 *  =====================
@@ -229,8 +229,8 @@
       EXTERNAL           CLARF, CLARFGP, CUNBDB5, CSROT, CLACGV, XERBLA
 *     ..
 *     .. External Functions ..
-      REAL               SCNRM2
-      EXTERNAL           SCNRM2
+      REAL               SCNRM2, SROUNDUP_LWORK
+      EXTERNAL           SCNRM2, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Function ..
       INTRINSIC          ATAN2, COS, MAX, SIN, SQRT
@@ -263,7 +263,7 @@
          LORBDB5 = Q-1
          LWORKOPT = MAX( ILARF+LLARF-1, IORBDB5+LORBDB5-1 )
          LWORKMIN = LWORKOPT
-         WORK(1) = LWORKOPT
+         WORK(1) = SROUNDUP_LWORK(LWORKOPT)
          IF( LWORK .LT. LWORKMIN .AND. .NOT.LQUERY ) THEN
            INFO = -14
          END IF
diff --git a/lapack-netlib/SRC/cunbdb4.f b/lapack-netlib/SRC/cunbdb4.f
index 117f23d08..33acc1ee5 100644
--- a/lapack-netlib/SRC/cunbdb4.f
+++ b/lapack-netlib/SRC/cunbdb4.f
@@ -38,7 +38,7 @@
 *>\verbatim
 *>
 *> CUNBDB4 simultaneously bidiagonalizes the blocks of a tall and skinny
-*> matrix X with orthonomal columns:
+*> matrix X with orthonormal columns:
 *>
 *>                            [ B11 ]
 *>      [ X11 ]   [ P1 |    ] [  0  ]
@@ -183,7 +183,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unbdb4
 *
 *> \par Further Details:
 *  =====================
@@ -242,8 +242,8 @@
      $                   XERBLA
 *     ..
 *     .. External Functions ..
-      REAL               SCNRM2
-      EXTERNAL           SCNRM2
+      REAL               SCNRM2, SROUNDUP_LWORK
+      EXTERNAL           SCNRM2, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Function ..
       INTRINSIC          ATAN2, COS, MAX, SIN, SQRT
@@ -277,7 +277,7 @@
          LWORKOPT = ILARF + LLARF - 1
          LWORKOPT = MAX( LWORKOPT, IORBDB5 + LORBDB5 - 1 )
          LWORKMIN = LWORKOPT
-         WORK(1) = LWORKOPT
+         WORK(1) = SROUNDUP_LWORK(LWORKOPT)
          IF( LWORK .LT. LWORKMIN .AND. .NOT.LQUERY ) THEN
            INFO = -14
          END IF
diff --git a/lapack-netlib/SRC/cunbdb5.f b/lapack-netlib/SRC/cunbdb5.f
index d2ff4e700..22513cf8b 100644
--- a/lapack-netlib/SRC/cunbdb5.f
+++ b/lapack-netlib/SRC/cunbdb5.f
@@ -148,7 +148,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unbdb5
 *
 *  =====================================================================
       SUBROUTINE CUNBDB5( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
@@ -169,18 +169,21 @@
 *  =====================================================================
 *
 *     .. Parameters ..
+      REAL               REALZERO
+      PARAMETER          ( REALZERO = 0.0E0 )
       COMPLEX            ONE, ZERO
       PARAMETER          ( ONE = (1.0E0,0.0E0), ZERO = (0.0E0,0.0E0) )
 *     ..
 *     .. Local Scalars ..
       INTEGER            CHILDINFO, I, J
+      REAL               EPS, NORM, SCL, SSQ
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CUNBDB6, XERBLA
+      EXTERNAL           CLASSQ, CUNBDB6, CSCAL, XERBLA
 *     ..
 *     .. External Functions ..
-      REAL               SCNRM2
-      EXTERNAL           SCNRM2
+      REAL               SLAMCH, SCNRM2
+      EXTERNAL           SLAMCH, SCNRM2
 *     ..
 *     .. Intrinsic Function ..
       INTRINSIC          MAX
@@ -213,16 +216,33 @@
          RETURN
       END IF
 *
-*     Project X onto the orthogonal complement of Q
+      EPS = SLAMCH( 'Precision' )
 *
-      CALL CUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, LDQ2,
-     $              WORK, LWORK, CHILDINFO )
+*     Project X onto the orthogonal complement of Q if X is nonzero
 *
-*     If the projection is nonzero, then return
+      SCL = REALZERO
+      SSQ = REALZERO
+      CALL CLASSQ( M1, X1, INCX1, SCL, SSQ )
+      CALL CLASSQ( M2, X2, INCX2, SCL, SSQ )
+      NORM = SCL * SQRT( SSQ )
 *
-      IF( SCNRM2(M1,X1,INCX1) .NE. ZERO
-     $    .OR. SCNRM2(M2,X2,INCX2) .NE. ZERO ) THEN
-         RETURN
+      IF( NORM .GT. N * EPS ) THEN
+*        Scale vector to unit norm to avoid problems in the caller code.
+*        Computing the reciprocal is undesirable but
+*         * xLASCL cannot be used because of the vector increments and
+*         * the round-off error has a negligible impact on
+*           orthogonalization.
+         CALL CSCAL( M1, ONE / NORM, X1, INCX1 )
+         CALL CSCAL( M2, ONE / NORM, X2, INCX2 )
+         CALL CUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
+     $              LDQ2, WORK, LWORK, CHILDINFO )
+*
+*        If the projection is nonzero, then return
+*
+         IF( SCNRM2(M1,X1,INCX1) .NE. REALZERO
+     $       .OR. SCNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN
+            RETURN
+         END IF
       END IF
 *
 *     Project each standard basis vector e_1,...,e_M1 in turn, stopping
@@ -238,8 +258,8 @@
          END DO
          CALL CUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
      $                 LDQ2, WORK, LWORK, CHILDINFO )
-         IF( SCNRM2(M1,X1,INCX1) .NE. ZERO
-     $       .OR. SCNRM2(M2,X2,INCX2) .NE. ZERO ) THEN
+         IF( SCNRM2(M1,X1,INCX1) .NE. REALZERO
+     $       .OR. SCNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN
             RETURN
          END IF
       END DO
@@ -257,8 +277,8 @@
          X2(I) = ONE
          CALL CUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
      $                 LDQ2, WORK, LWORK, CHILDINFO )
-         IF( SCNRM2(M1,X1,INCX1) .NE. ZERO
-     $       .OR. SCNRM2(M2,X2,INCX2) .NE. ZERO ) THEN
+         IF( SCNRM2(M1,X1,INCX1) .NE. REALZERO
+     $       .OR. SCNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN
             RETURN
          END IF
       END DO
diff --git a/lapack-netlib/SRC/cunbdb6.f b/lapack-netlib/SRC/cunbdb6.f
index b93a389d6..566fd76b7 100644
--- a/lapack-netlib/SRC/cunbdb6.f
+++ b/lapack-netlib/SRC/cunbdb6.f
@@ -41,9 +41,8 @@
 *> with respect to the columns of
 *>      Q = [ Q1 ] .
 *>          [ Q2 ]
-*> The Euclidean norm of X must be one and the columns of Q must be
-*> orthonormal. The orthogonalized vector will be zero if and only if it
-*> lies entirely in the range of Q.
+*> The columns of Q must be orthonormal. The orthogonalized vector will
+*> be zero if and only if it lies entirely in the range of Q.
 *>
 *> The projection is computed with at most two iterations of the
 *> classical Gram-Schmidt algorithm, see
@@ -152,7 +151,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unbdb6
 *
 *  =====================================================================
       SUBROUTINE CUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
@@ -174,7 +173,7 @@
 *
 *     .. Parameters ..
       REAL               ALPHA, REALONE, REALZERO
-      PARAMETER          ( ALPHA = 0.01E0, REALONE = 1.0E0,
+      PARAMETER          ( ALPHA = 0.83E0, REALONE = 1.0E0,
      $                     REALZERO = 0.0E0 )
       COMPLEX            NEGONE, ONE, ZERO
       PARAMETER          ( NEGONE = (-1.0E0,0.0E0), ONE = (1.0E0,0.0E0),
@@ -223,14 +222,16 @@
 *
       EPS = SLAMCH( 'Precision' )
 *
+*     Compute the Euclidean norm of X
+*
+      SCL = REALZERO
+      SSQ = REALZERO
+      CALL CLASSQ( M1, X1, INCX1, SCL, SSQ )
+      CALL CLASSQ( M2, X2, INCX2, SCL, SSQ )
+      NORM = SCL * SQRT( SSQ )
+*
 *     First, project X onto the orthogonal complement of Q's column
 *     space
-*
-*     Christoph Conrads: In debugging mode the norm should be computed
-*     and an assertion added comparing the norm with one. Alas, Fortran
-*     never made it into 1989 when assert() was introduced into the C
-*     programming language.
-      NORM = REALONE
 *
       IF( M1 .EQ. 0 ) THEN
          DO I = 1, N
diff --git a/lapack-netlib/SRC/cuncsd.f b/lapack-netlib/SRC/cuncsd.f
index 3653a396a..003daaab4 100644
--- a/lapack-netlib/SRC/cuncsd.f
+++ b/lapack-netlib/SRC/cuncsd.f
@@ -308,7 +308,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup uncsd
 *
 *  =====================================================================
       RECURSIVE SUBROUTINE CUNCSD( JOBU1, JOBU2, JOBV1T, JOBV2T, TRANS,
@@ -365,7 +365,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions
       INTRINSIC          INT, MAX, MIN
@@ -504,7 +505,8 @@
      $              IORBDB + LORBDBWORKOPT ) - 1
          LWORKMIN = MAX( IORGQR + LORGQRWORKMIN, IORGLQ + LORGLQWORKMIN,
      $              IORBDB + LORBDBWORKMIN ) - 1
-         WORK(1) = MAX(LWORKOPT,LWORKMIN)
+         LWORKOPT = MAX(LWORKOPT,LWORKMIN)
+         WORK(1) = SROUNDUP_LWORK(LWORKOPT)
 *
          IF( LWORK .LT. LWORKMIN
      $       .AND. .NOT. ( LQUERY .OR. LRQUERY ) ) THEN
diff --git a/lapack-netlib/SRC/cuncsd2by1.f b/lapack-netlib/SRC/cuncsd2by1.f
index 8360d5932..128e82cec 100644
--- a/lapack-netlib/SRC/cuncsd2by1.f
+++ b/lapack-netlib/SRC/cuncsd2by1.f
@@ -212,13 +212,13 @@
 *>          LRWORK is INTEGER
 *>          The dimension of the array RWORK.
 *>
-*>          If LRWORK = -1, then a workspace query is assumed; the routine
+*>          If LRWORK=-1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK and RWORK
 *>          arrays, returns this value as the first entry of the WORK
 *>          and RWORK array, respectively, and no error message related
 *>          to LWORK or LRWORK is issued by XERBLA.
 *> \endverbatim
-*
+*>
 *> \param[out] IWORK
 *> \verbatim
 *>          IWORK is INTEGER array, dimension (M-MIN(P,M-P,Q,M-Q))
@@ -247,7 +247,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup uncsd2by1
 *
 *  =====================================================================
       SUBROUTINE CUNCSD2BY1( JOBU1, JOBU2, JOBV1T, M, P, Q, X11, LDX11,
@@ -299,7 +299,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Function ..
       INTRINSIC          INT, MAX, MIN
@@ -508,7 +509,7 @@
          LWORKOPT = MAX( IORBDB+LORBDB-1,
      $                   IORGQR+LORGQROPT-1,
      $                   IORGLQ+LORGLQOPT-1 )
-         WORK(1) = LWORKOPT
+         WORK(1) = SROUNDUP_LWORK(LWORKOPT)
          IF( LWORK .LT. LWORKMIN .AND. .NOT.LQUERY ) THEN
             INFO = -19
          END IF
diff --git a/lapack-netlib/SRC/cungbr.f b/lapack-netlib/SRC/cungbr.f
index a31a53d79..2f0208fdb 100644
--- a/lapack-netlib/SRC/cungbr.f
+++ b/lapack-netlib/SRC/cungbr.f
@@ -150,7 +150,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexGBcomputational
+*> \ingroup ungbr
 *
 *  =====================================================================
       SUBROUTINE CUNGBR( VECT, M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
@@ -180,7 +180,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CUNGLQ, CUNGQR, XERBLA
@@ -241,7 +242,7 @@
          CALL XERBLA( 'CUNGBR', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
          RETURN
       END IF
 *
@@ -327,7 +328,7 @@
             END IF
          END IF
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of CUNGBR
diff --git a/lapack-netlib/SRC/cunghr.f b/lapack-netlib/SRC/cunghr.f
index 4f8a0a263..3aa3fb1ae 100644
--- a/lapack-netlib/SRC/cunghr.f
+++ b/lapack-netlib/SRC/cunghr.f
@@ -119,7 +119,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unghr
 *
 *  =====================================================================
       SUBROUTINE CUNGHR( N, ILO, IHI, A, LDA, TAU, WORK, LWORK, INFO )
@@ -151,7 +151,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -178,7 +179,7 @@
       IF( INFO.EQ.0 ) THEN
          NB = ILAENV( 1, 'CUNGQR', ' ', NH, NH, NH, -1 )
          LWKOPT = MAX( 1, NH )*NB
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -230,7 +231,7 @@
          CALL CUNGQR( NH, NH, NH, A( ILO+1, ILO+1 ), LDA, TAU( ILO ),
      $                WORK, LWORK, IINFO )
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of CUNGHR
diff --git a/lapack-netlib/SRC/cunglq.f b/lapack-netlib/SRC/cunglq.f
index e250e036c..353715054 100644
--- a/lapack-netlib/SRC/cunglq.f
+++ b/lapack-netlib/SRC/cunglq.f
@@ -120,7 +120,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unglq
 *
 *  =====================================================================
       SUBROUTINE CUNGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
@@ -155,7 +155,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -164,7 +165,7 @@
       INFO = 0
       NB = ILAENV( 1, 'CUNGLQ', ' ', M, N, K, -1 )
       LWKOPT = MAX( 1, M )*NB
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -278,7 +279,7 @@
    50    CONTINUE
       END IF
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of CUNGLQ
diff --git a/lapack-netlib/SRC/cungql.f b/lapack-netlib/SRC/cungql.f
index d3b812a62..ed2f6803c 100644
--- a/lapack-netlib/SRC/cungql.f
+++ b/lapack-netlib/SRC/cungql.f
@@ -121,7 +121,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup ungql
 *
 *  =====================================================================
       SUBROUTINE CUNGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
@@ -156,7 +156,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -181,7 +182,7 @@
             NB = ILAENV( 1, 'CUNGQL', ' ', M, N, K, -1 )
             LWKOPT = N*NB
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN
             INFO = -8
diff --git a/lapack-netlib/SRC/cungqr.f b/lapack-netlib/SRC/cungqr.f
index 5010ae0df..b6e8cc59a 100644
--- a/lapack-netlib/SRC/cungqr.f
+++ b/lapack-netlib/SRC/cungqr.f
@@ -121,7 +121,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup ungqr
 *
 *  =====================================================================
       SUBROUTINE CUNGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
@@ -156,7 +156,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -165,7 +166,7 @@
       INFO = 0
       NB = ILAENV( 1, 'CUNGQR', ' ', M, N, K, -1 )
       LWKOPT = MAX( 1, N )*NB
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -279,7 +280,7 @@
    50    CONTINUE
       END IF
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of CUNGQR
diff --git a/lapack-netlib/SRC/cungrq.f b/lapack-netlib/SRC/cungrq.f
index 1593ff938..aceaac0b8 100644
--- a/lapack-netlib/SRC/cungrq.f
+++ b/lapack-netlib/SRC/cungrq.f
@@ -121,7 +121,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup ungrq
 *
 *  =====================================================================
       SUBROUTINE CUNGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
@@ -156,7 +156,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -181,7 +182,7 @@
             NB = ILAENV( 1, 'CUNGRQ', ' ', M, N, K, -1 )
             LWKOPT = M*NB
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN
             INFO = -8
@@ -286,7 +287,7 @@
    50    CONTINUE
       END IF
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of CUNGRQ
diff --git a/lapack-netlib/SRC/cungtr.f b/lapack-netlib/SRC/cungtr.f
index 26ff0428e..27f197340 100644
--- a/lapack-netlib/SRC/cungtr.f
+++ b/lapack-netlib/SRC/cungtr.f
@@ -116,7 +116,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup ungtr
 *
 *  =====================================================================
       SUBROUTINE CUNGTR( UPLO, N, A, LDA, TAU, WORK, LWORK, INFO )
@@ -147,7 +147,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           ILAENV, LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CUNGQL, CUNGQR, XERBLA
@@ -179,7 +180,7 @@
            NB = ILAENV( 1, 'CUNGQR', ' ', N-1, N-1, N-1, -1 )
          END IF
          LWKOPT = MAX( 1, N-1 )*NB
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -245,7 +246,7 @@
      $                   LWORK, IINFO )
          END IF
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of CUNGTR
diff --git a/lapack-netlib/SRC/cungtsqr.f b/lapack-netlib/SRC/cungtsqr.f
index 64ccb9731..1734be2aa 100644
--- a/lapack-netlib/SRC/cungtsqr.f
+++ b/lapack-netlib/SRC/cungtsqr.f
@@ -133,6 +133,7 @@
 *>
 *> \param[in] LWORK
 *> \verbatim
+*>          LWORK is INTEGER
 *>          The dimension of the array WORK.  LWORK >= (M+NB)*N.
 *>          If LWORK = -1, then a workspace query is assumed.
 *>          The routine only calculates the optimal size of the WORK
@@ -302,4 +303,4 @@
 *
 *     End of CUNGTSQR
 *
-      END
\ No newline at end of file
+      END
diff --git a/lapack-netlib/SRC/cunmbr.f b/lapack-netlib/SRC/cunmbr.f
index cef6025b0..a21c486e9 100644
--- a/lapack-netlib/SRC/cunmbr.f
+++ b/lapack-netlib/SRC/cunmbr.f
@@ -189,7 +189,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unmbr
 *
 *  =====================================================================
       SUBROUTINE CUNMBR( VECT, SIDE, TRANS, M, N, K, A, LDA, TAU, C,
@@ -218,7 +218,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           ILAENV, LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CUNMLQ, CUNMQR, XERBLA
@@ -290,7 +291,7 @@
          ELSE
             LWKOPT = 1
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -367,7 +368,7 @@
      $                   TAU, C( I1, I2 ), LDC, WORK, LWORK, IINFO )
          END IF
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of CUNMBR
diff --git a/lapack-netlib/SRC/cunmhr.f b/lapack-netlib/SRC/cunmhr.f
index af3140d5f..29bb631f1 100644
--- a/lapack-netlib/SRC/cunmhr.f
+++ b/lapack-netlib/SRC/cunmhr.f
@@ -171,7 +171,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unmhr
 *
 *  =====================================================================
       SUBROUTINE CUNMHR( SIDE, TRANS, M, N, ILO, IHI, A, LDA, TAU, C,
@@ -199,7 +199,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           ILAENV, LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CUNMQR, XERBLA
@@ -253,7 +254,7 @@
             NB = ILAENV( 1, 'CUNMQR', SIDE // TRANS, M, NH, NH, -1 )
          END IF
          LWKOPT = NW*NB
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -285,7 +286,7 @@
       CALL CUNMQR( SIDE, TRANS, MI, NI, NH, A( ILO+1, ILO ), LDA,
      $             TAU( ILO ), C( I1, I2 ), LDC, WORK, LWORK, IINFO )
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of CUNMHR
diff --git a/lapack-netlib/SRC/cunmlq.f b/lapack-netlib/SRC/cunmlq.f
index 25a410770..4da1af1d5 100644
--- a/lapack-netlib/SRC/cunmlq.f
+++ b/lapack-netlib/SRC/cunmlq.f
@@ -160,7 +160,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unmlq
 *
 *  =====================================================================
       SUBROUTINE CUNMLQ( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC,
@@ -195,7 +195,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLARFB, CLARFT, CUNML2, XERBLA
@@ -250,7 +251,7 @@
      $                               K, -1 ) )
             LWKOPT = NW*NB + TSIZE
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -343,7 +344,7 @@
      $                   C( IC, JC ), LDC, WORK, LDWORK )
    10    CONTINUE
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of CUNMLQ
diff --git a/lapack-netlib/SRC/cunmql.f b/lapack-netlib/SRC/cunmql.f
index 3c7166066..84fc29d32 100644
--- a/lapack-netlib/SRC/cunmql.f
+++ b/lapack-netlib/SRC/cunmql.f
@@ -160,7 +160,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unmql
 *
 *  =====================================================================
       SUBROUTINE CUNMQL( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC,
@@ -194,7 +194,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLARFB, CLARFT, CUNM2L, XERBLA
@@ -249,7 +250,7 @@
      $                               K, -1 ) )
             LWKOPT = NW*NB + TSIZE
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -332,7 +333,7 @@
      $                   WORK, LDWORK )
    10    CONTINUE
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of CUNMQL
diff --git a/lapack-netlib/SRC/cunmqr.f b/lapack-netlib/SRC/cunmqr.f
index 7e59d7129..7d85a861f 100644
--- a/lapack-netlib/SRC/cunmqr.f
+++ b/lapack-netlib/SRC/cunmqr.f
@@ -160,7 +160,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unmqr
 *
 *  =====================================================================
       SUBROUTINE CUNMQR( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC,
@@ -194,7 +194,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLARFB, CLARFT, CUNM2R, XERBLA
@@ -245,7 +246,7 @@
          NB = MIN( NBMAX, ILAENV( 1, 'CUNMQR', SIDE // TRANS, M, N, K,
      $        -1 ) )
          LWKOPT = NW*NB + TSIZE
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -331,7 +332,7 @@
      $                   C( IC, JC ), LDC, WORK, LDWORK )
    10    CONTINUE
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of CUNMQR
diff --git a/lapack-netlib/SRC/cunmrq.f b/lapack-netlib/SRC/cunmrq.f
index 5a233f604..f02cfd9a9 100644
--- a/lapack-netlib/SRC/cunmrq.f
+++ b/lapack-netlib/SRC/cunmrq.f
@@ -160,7 +160,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unmrq
 *
 *  =====================================================================
       SUBROUTINE CUNMRQ( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC,
@@ -195,7 +195,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLARFB, CLARFT, CUNMR2, XERBLA
@@ -250,7 +251,7 @@
      $                               K, -1 ) )
             LWKOPT = NW*NB + TSIZE
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -337,7 +338,7 @@
      $                   WORK, LDWORK )
    10    CONTINUE
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of CUNMRQ
diff --git a/lapack-netlib/SRC/cunmrz.f b/lapack-netlib/SRC/cunmrz.f
index 8e06f2329..9ccf1878b 100644
--- a/lapack-netlib/SRC/cunmrz.f
+++ b/lapack-netlib/SRC/cunmrz.f
@@ -168,7 +168,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unmrz
 *
 *> \par Contributors:
 *  ==================
@@ -213,7 +213,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLARZB, CLARZT, CUNMR3, XERBLA
@@ -271,7 +272,7 @@
      $                               K, -1 ) )
             LWKOPT = NW*NB + TSIZE
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -371,7 +372,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/cunmtr.f b/lapack-netlib/SRC/cunmtr.f
index 097dba91e..6eafc15c4 100644
--- a/lapack-netlib/SRC/cunmtr.f
+++ b/lapack-netlib/SRC/cunmtr.f
@@ -164,7 +164,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unmtr
 *
 *  =====================================================================
       SUBROUTINE CUNMTR( SIDE, UPLO, TRANS, M, N, A, LDA, TAU, C, LDC,
@@ -192,7 +192,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           ILAENV, LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CUNMQL, CUNMQR, XERBLA
@@ -256,7 +257,7 @@
             END IF
          END IF
          LWKOPT = NW*NB
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -301,7 +302,7 @@
          CALL CUNMQR( SIDE, TRANS, MI, NI, NQ-1, A( 2, 1 ), LDA, TAU,
      $                C( I1, I2 ), LDC, WORK, LWORK, IINFO )
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of CUNMTR
diff --git a/lapack-netlib/SRC/dbbcsd.f b/lapack-netlib/SRC/dbbcsd.f
index 4fcf9ab5d..913f96a73 100644
--- a/lapack-netlib/SRC/dbbcsd.f
+++ b/lapack-netlib/SRC/dbbcsd.f
@@ -805,7 +805,7 @@
                CALL DLARTGP( B22BULGE, B22E(I-1), WORK(IU2SN+I-1),
      $                       WORK(IU2CS+I-1), R )
             ELSE IF( NU .LT. MU ) THEN
-               CALL DLARTGS( B21E(I), B21E(I+1), NU, WORK(IU2CS+I-1),
+               CALL DLARTGS( B21E(I), B21D(I+1), NU, WORK(IU2CS+I-1),
      $                       WORK(IU2SN+I-1) )
             ELSE
                CALL DLARTGS( B22D(I), B22E(I), MU, WORK(IU2CS+I-1),
diff --git a/lapack-netlib/SRC/dbdsdc.f b/lapack-netlib/SRC/dbdsdc.f
index 99fe82296..4b6c3e694 100644
--- a/lapack-netlib/SRC/dbdsdc.f
+++ b/lapack-netlib/SRC/dbdsdc.f
@@ -45,13 +45,6 @@
 *> respectively. DBDSDC can be used to compute all singular values,
 *> and optionally, singular vectors or singular vectors in compact form.
 *>
-*> This code makes very mild assumptions about floating point
-*> arithmetic. It will work on machines with a guard digit in
-*> add/subtract, or on those binary machines without guard digits
-*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
-*> It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.  See DLASD3 for details.
-*>
 *> The code currently calls DLASDQ if singular values only are desired.
 *> However, it can be slightly modified to compute singular values
 *> using the divide and conquer method.
diff --git a/lapack-netlib/SRC/dbdsqr.f b/lapack-netlib/SRC/dbdsqr.f
index c220a5875..bc697a007 100644
--- a/lapack-netlib/SRC/dbdsqr.f
+++ b/lapack-netlib/SRC/dbdsqr.f
@@ -278,7 +278,7 @@
      $                   MAXITDIVN, NM1, NM12, NM13, OLDLL, OLDM
       DOUBLE PRECISION   ABSE, ABSS, COSL, COSR, CS, EPS, F, G, H, MU,
      $                   OLDCS, OLDSN, R, SHIFT, SIGMN, SIGMX, SINL,
-     $                   SINR, SLL, SMAX, SMIN, SMINL, SMINOA,
+     $                   SINR, SLL, SMAX, SMIN, SMINOA,
      $                   SN, THRESH, TOL, TOLMUL, UNFL
 *     ..
 *     .. External Functions ..
@@ -391,7 +391,7 @@
       DO 30 I = 1, N - 1
          SMAX = MAX( SMAX, ABS( E( I ) ) )
    30 CONTINUE
-      SMINL = ZERO
+      SMIN = ZERO
       IF( TOL.GE.ZERO ) THEN
 *
 *        Relative accuracy desired
@@ -451,7 +451,6 @@
       IF( TOL.LT.ZERO .AND. ABS( D( M ) ).LE.THRESH )
      $   D( M ) = ZERO
       SMAX = ABS( D( M ) )
-      SMIN = SMAX
       DO 70 LLL = 1, M - 1
          LL = M - LLL
          ABSS = ABS( D( LL ) )
@@ -460,7 +459,6 @@
      $      D( LL ) = ZERO
          IF( ABSE.LE.THRESH )
      $      GO TO 80
-         SMIN = MIN( SMIN, ABSS )
          SMAX = MAX( SMAX, ABSS, ABSE )
    70 CONTINUE
       LL = 0
@@ -542,14 +540,14 @@
 *           apply convergence criterion forward
 *
             MU = ABS( D( LL ) )
-            SMINL = MU
+            SMIN = MU
             DO 100 LLL = LL, M - 1
                IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN
                   E( LLL ) = ZERO
                   GO TO 60
                END IF
                MU = ABS( D( LLL+1 ) )*( MU / ( MU+ABS( E( LLL ) ) ) )
-               SMINL = MIN( SMINL, MU )
+               SMIN = MIN( SMIN, MU )
   100       CONTINUE
          END IF
 *
@@ -570,14 +568,14 @@
 *           apply convergence criterion backward
 *
             MU = ABS( D( M ) )
-            SMINL = MU
+            SMIN = MU
             DO 110 LLL = M - 1, LL, -1
                IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN
                   E( LLL ) = ZERO
                   GO TO 60
                END IF
                MU = ABS( D( LLL ) )*( MU / ( MU+ABS( E( LLL ) ) ) )
-               SMINL = MIN( SMINL, MU )
+               SMIN = MIN( SMIN, MU )
   110       CONTINUE
          END IF
       END IF
@@ -587,7 +585,7 @@
 *     Compute shift.  First, test if shifting would ruin relative
 *     accuracy, and if so set the shift to zero.
 *
-      IF( TOL.GE.ZERO .AND. N*TOL*( SMINL / SMAX ).LE.
+      IF( TOL.GE.ZERO .AND. N*TOL*( SMIN / SMAX ).LE.
      $    MAX( EPS, HNDRTH*TOL ) ) THEN
 *
 *        Use a zero shift to avoid loss of relative accuracy
diff --git a/lapack-netlib/SRC/dbdsvdx.f b/lapack-netlib/SRC/dbdsvdx.f
index 4ccd4edad..4668a88f2 100644
--- a/lapack-netlib/SRC/dbdsvdx.f
+++ b/lapack-netlib/SRC/dbdsvdx.f
@@ -45,7 +45,7 @@
 *>
 *>  Given an upper bidiagonal B with diagonal D = [ d_1 d_2 ... d_N ]
 *>  and superdiagonal E = [ e_1 e_2 ... e_N-1 ], DBDSVDX computes the
-*>  singular value decompositon of B through the eigenvalues and
+*>  singular value decomposition of B through the eigenvalues and
 *>  eigenvectors of the N*2-by-N*2 tridiagonal matrix
 *>
 *>        |  0  d_1                |
diff --git a/lapack-netlib/SRC/dgebal.f b/lapack-netlib/SRC/dgebal.f
index 821c7704a..f7b38b378 100644
--- a/lapack-netlib/SRC/dgebal.f
+++ b/lapack-netlib/SRC/dgebal.f
@@ -153,6 +153,9 @@
 *>
 *>  Modified by Tzu-Yi Chen, Computer Science Division, University of
 *>    California at Berkeley, USA
+*>
+*>  Refactored by Evert Provoost, Department of Computer Science,
+*>    KU Leuven, Belgium
 *> \endverbatim
 *>
 *  =====================================================================
@@ -181,8 +184,8 @@
       PARAMETER          ( FACTOR = 0.95D+0 )
 *     ..
 *     .. Local Scalars ..
-      LOGICAL            NOCONV
-      INTEGER            I, ICA, IEXC, IRA, J, K, L, M
+      LOGICAL            NOCONV, CANSWAP
+      INTEGER            I, ICA, IRA, J, K, L
       DOUBLE PRECISION   C, CA, F, G, R, RA, S, SFMAX1, SFMAX2, SFMIN1,
      $                   SFMIN2
 *     ..
@@ -214,177 +217,192 @@
          RETURN
       END IF
 *
-      K = 1
-      L = N
+*     Quick returns.
 *
-      IF( N.EQ.0 )
-     $   GO TO 210
+      IF( N.EQ.0 ) THEN
+         ILO = 1
+         IHI = 0
+         RETURN
+      END IF
 *
       IF( LSAME( JOB, 'N' ) ) THEN
-         DO 10 I = 1, N
+         DO I = 1, N
             SCALE( I ) = ONE
-   10    CONTINUE
-         GO TO 210
+         END DO
+         ILO = 1
+         IHI = N
+         RETURN
       END IF
 *
-      IF( LSAME( JOB, 'S' ) )
-     $   GO TO 120
-*
-*     Permutation to isolate eigenvalues if possible
-*
-      GO TO 50
-*
-*     Row and column exchange.
-*
-   20 CONTINUE
-      SCALE( M ) = J
-      IF( J.EQ.M )
-     $   GO TO 30
-*
-      CALL DSWAP( L, A( 1, J ), 1, A( 1, M ), 1 )
-      CALL DSWAP( N-K+1, A( J, K ), LDA, A( M, K ), LDA )
-*
-   30 CONTINUE
-      GO TO ( 40, 80 )IEXC
-*
-*     Search for rows isolating an eigenvalue and push them down.
-*
-   40 CONTINUE
-      IF( L.EQ.1 )
-     $   GO TO 210
-      L = L - 1
-*
-   50 CONTINUE
-      DO 70 J = L, 1, -1
+*     Permutation to isolate eigenvalues if possible.
 *
-         DO 60 I = 1, L
-            IF( I.EQ.J )
-     $         GO TO 60
-            IF( A( J, I ).NE.ZERO )
-     $         GO TO 70
-   60    CONTINUE
-*
-         M = L
-         IEXC = 1
-         GO TO 20
-   70 CONTINUE
-*
-      GO TO 90
+      K = 1
+      L = N
 *
-*     Search for columns isolating an eigenvalue and push them left.
+      IF( .NOT.LSAME( JOB, 'S' ) ) THEN
 *
-   80 CONTINUE
-      K = K + 1
+*        Row and column exchange.
 *
-   90 CONTINUE
-      DO 110 J = K, L
+         NOCONV = .TRUE.
+         DO WHILE( NOCONV )
+*
+*           Search for rows isolating an eigenvalue and push them down.
+*
+            NOCONV = .FALSE.
+            DO I = L, 1, -1
+               CANSWAP = .TRUE.
+               DO J = 1, L
+                  IF( I.NE.J .AND. A( I, J ).NE.ZERO ) THEN
+                     CANSWAP = .FALSE.
+                     EXIT
+                  END IF
+               END DO
+*
+               IF( CANSWAP ) THEN
+                  SCALE( L ) = I
+                  IF( I.NE.L ) THEN
+                     CALL DSWAP( L, A( 1, I ), 1, A( 1, L ), 1 )
+                     CALL DSWAP( N-K+1, A( I, K ), LDA, A( L, K ), LDA )
+                  END IF
+                  NOCONV = .TRUE.
+*
+                  IF( L.EQ.1 ) THEN
+                     ILO = 1
+                     IHI = 1
+                     RETURN
+                  END IF
+*
+                  L = L - 1
+               END IF
+            END DO
+*
+         END DO
+
+         NOCONV = .TRUE.
+         DO WHILE( NOCONV )
+*
+*           Search for columns isolating an eigenvalue and push them left.
+*
+            NOCONV = .FALSE.
+            DO J = K, L
+               CANSWAP = .TRUE.
+               DO I = K, L
+                  IF( I.NE.J .AND. A( I, J ).NE.ZERO ) THEN
+                     CANSWAP = .FALSE.
+                     EXIT
+                  END IF
+               END DO
+*
+               IF( CANSWAP ) THEN
+                  SCALE( K ) = J
+                  IF( J.NE.K ) THEN
+                     CALL DSWAP( L, A( 1, J ), 1, A( 1, K ), 1 )
+                     CALL DSWAP( N-K+1, A( J, K ), LDA, A( K, K ), LDA )
+                  END IF
+                  NOCONV = .TRUE.
+*
+                  K = K + 1
+               END IF
+            END DO
+*
+         END DO
 *
-         DO 100 I = K, L
-            IF( I.EQ.J )
-     $         GO TO 100
-            IF( A( I, J ).NE.ZERO )
-     $         GO TO 110
-  100    CONTINUE
+      END IF
 *
-         M = K
-         IEXC = 2
-         GO TO 20
-  110 CONTINUE
+*     Initialize SCALE for non-permuted submatrix.
 *
-  120 CONTINUE
-      DO 130 I = K, L
+      DO I = K, L
          SCALE( I ) = ONE
-  130 CONTINUE
+      END DO
 *
-      IF( LSAME( JOB, 'P' ) )
-     $   GO TO 210
+*     If we only had to permute, we are done.
+*
+      IF( LSAME( JOB, 'P' ) ) THEN
+         ILO = K
+         IHI = L
+         RETURN
+      END IF
 *
 *     Balance the submatrix in rows K to L.
 *
-*     Iterative loop for norm reduction
+*     Iterative loop for norm reduction.
 *
       SFMIN1 = DLAMCH( 'S' ) / DLAMCH( 'P' )
       SFMAX1 = ONE / SFMIN1
       SFMIN2 = SFMIN1*SCLFAC
       SFMAX2 = ONE / SFMIN2
 *
-  140 CONTINUE
-      NOCONV = .FALSE.
+      NOCONV = .TRUE.
+      DO WHILE( NOCONV )
+         NOCONV = .FALSE.
 *
-      DO 200 I = K, L
+         DO I = K, L
 *
-         C = DNRM2( L-K+1, A( K, I ), 1 )
-         R = DNRM2( L-K+1, A( I, K ), LDA )
-         ICA = IDAMAX( L, A( 1, I ), 1 )
-         CA = ABS( A( ICA, I ) )
-         IRA = IDAMAX( N-K+1, A( I, K ), LDA )
-         RA = ABS( A( I, IRA+K-1 ) )
+            C = DNRM2( L-K+1, A( K, I ), 1 )
+            R = DNRM2( L-K+1, A( I, K ), LDA )
+            ICA = IDAMAX( L, A( 1, I ), 1 )
+            CA = ABS( A( ICA, I ) )
+            IRA = IDAMAX( N-K+1, A( I, K ), LDA )
+            RA = ABS( A( I, IRA+K-1 ) )
 *
-*        Guard against zero C or R due to underflow.
+*           Guard against zero C or R due to underflow.
 *
-         IF( C.EQ.ZERO .OR. R.EQ.ZERO )
-     $      GO TO 200
-         G = R / SCLFAC
-         F = ONE
-         S = C + R
-  160    CONTINUE
-         IF( C.GE.G .OR. MAX( F, C, CA ).GE.SFMAX2 .OR.
-     $       MIN( R, G, RA ).LE.SFMIN2 )GO TO 170
-            IF( DISNAN( C+F+CA+R+G+RA ) ) THEN
+            IF( C.EQ.ZERO .OR. R.EQ.ZERO ) CYCLE
 *
 *           Exit if NaN to avoid infinite loop
 *
-            INFO = -3
-            CALL XERBLA( 'DGEBAL', -INFO )
-            RETURN
-         END IF
-         F = F*SCLFAC
-         C = C*SCLFAC
-         CA = CA*SCLFAC
-         R = R / SCLFAC
-         G = G / SCLFAC
-         RA = RA / SCLFAC
-         GO TO 160
-*
-  170    CONTINUE
-         G = C / SCLFAC
-  180    CONTINUE
-         IF( G.LT.R .OR. MAX( R, RA ).GE.SFMAX2 .OR.
-     $       MIN( F, C, G, CA ).LE.SFMIN2 )GO TO 190
-         F = F / SCLFAC
-         C = C / SCLFAC
-         G = G / SCLFAC
-         CA = CA / SCLFAC
-         R = R*SCLFAC
-         RA = RA*SCLFAC
-         GO TO 180
-*
-*        Now balance.
-*
-  190    CONTINUE
-         IF( ( C+R ).GE.FACTOR*S )
-     $      GO TO 200
-         IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN
-            IF( F*SCALE( I ).LE.SFMIN1 )
-     $         GO TO 200
-         END IF
-         IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN
-            IF( SCALE( I ).GE.SFMAX1 / F )
-     $         GO TO 200
-         END IF
-         G = ONE / F
-         SCALE( I ) = SCALE( I )*F
-         NOCONV = .TRUE.
-*
-         CALL DSCAL( N-K+1, G, A( I, K ), LDA )
-         CALL DSCAL( L, F, A( 1, I ), 1 )
-*
-  200 CONTINUE
-*
-      IF( NOCONV )
-     $   GO TO 140
+            IF( DISNAN( C+CA+R+RA ) ) THEN
+               INFO = -3
+               CALL XERBLA( 'DGEBAL', -INFO )
+               RETURN
+            END IF
+*
+            G = R / SCLFAC
+            F = ONE
+            S = C + R
+*
+            DO WHILE( C.LT.G .AND. MAX( F, C, CA ).LT.SFMAX2 .AND.
+     $                MIN( R, G, RA ).GT.SFMIN2 )
+               F = F*SCLFAC
+               C = C*SCLFAC
+               CA = CA*SCLFAC
+               R = R / SCLFAC
+               G = G / SCLFAC
+               RA = RA / SCLFAC
+            END DO
+*
+            G = C / SCLFAC
+*
+            DO WHILE( G.GE.R .AND. MAX( R, RA ).LT.SFMAX2 .AND.
+     $                MIN( F, C, G, CA ).GT.SFMIN2 )
+               F = F / SCLFAC
+               C = C / SCLFAC
+               G = G / SCLFAC
+               CA = CA / SCLFAC
+               R = R*SCLFAC
+               RA = RA*SCLFAC
+            END DO
+*
+*           Now balance.
+*
+            IF( ( C+R ).GE.FACTOR*S ) CYCLE
+            IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN
+               IF( F*SCALE( I ).LE.SFMIN1 ) CYCLE
+            END IF
+            IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN
+               IF( SCALE( I ).GE.SFMAX1 / F ) CYCLE
+            END IF
+            G = ONE / F
+            SCALE( I ) = SCALE( I )*F
+            NOCONV = .TRUE.
+*
+            CALL DSCAL( N-K+1, G, A( I, K ), LDA )
+            CALL DSCAL( L, F, A( 1, I ), 1 )
+*
+         END DO
+*
+      END DO
 *
-  210 CONTINUE
       ILO = K
       IHI = L
 *
diff --git a/lapack-netlib/SRC/dgebrd.f b/lapack-netlib/SRC/dgebrd.f
index 0f0d1651a..ac11d48a0 100644
--- a/lapack-netlib/SRC/dgebrd.f
+++ b/lapack-netlib/SRC/dgebrd.f
@@ -122,7 +122,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of the array WORK.  LWORK >= max(1,M,N).
+*>          The length of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(M,N), otherwise.
 *>          For optimum performance LWORK >= (M+N)*NB, where NB
 *>          is the optimal blocksize.
 *>
@@ -147,7 +148,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleGEcomputational
+*> \ingroup gebrd
 *
 *> \par Further Details:
 *  =====================
@@ -223,8 +224,8 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IINFO, J, LDWRKX, LDWRKY, LWKOPT, MINMN, NB,
-     $                   NBMIN, NX, WS
+      INTEGER            I, IINFO, J, LDWRKX, LDWRKY, LWKMIN, LWKOPT,
+     $                   MINMN, NB, NBMIN, NX, WS
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           DGEBD2, DGEMM, DLABRD, XERBLA
@@ -241,9 +242,17 @@
 *     Test the input parameters
 *
       INFO = 0
-      NB = MAX( 1, ILAENV( 1, 'DGEBRD', ' ', M, N, -1, -1 ) )
-      LWKOPT = ( M+N )*NB
+      MINMN = MIN( M, N )
+      IF( MINMN.EQ.0 ) THEN
+         LWKMIN = 1
+         LWKOPT = 1
+      ELSE
+         LWKMIN = MAX( M, N )
+         NB = MAX( 1, ILAENV( 1, 'DGEBRD', ' ', M, N, -1, -1 ) )
+         LWKOPT = ( M+N )*NB
+      ENDIF
       WORK( 1 ) = DBLE( LWKOPT )
+*
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -251,7 +260,7 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, M, N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -10
       END IF
       IF( INFO.LT.0 ) THEN
@@ -263,7 +272,6 @@
 *
 *     Quick return if possible
 *
-      MINMN = MIN( M, N )
       IF( MINMN.EQ.0 ) THEN
          WORK( 1 ) = 1
          RETURN
@@ -282,7 +290,7 @@
 *        Determine when to switch from blocked to unblocked code.
 *
          IF( NX.LT.MINMN ) THEN
-            WS = ( M+N )*NB
+            WS = LWKOPT
             IF( LWORK.LT.WS ) THEN
 *
 *              Not enough work space for the optimal NB, consider using
diff --git a/lapack-netlib/SRC/dgecon.f b/lapack-netlib/SRC/dgecon.f
index 1ad302ae3..a543eb03a 100644
--- a/lapack-netlib/SRC/dgecon.f
+++ b/lapack-netlib/SRC/dgecon.f
@@ -105,8 +105,15 @@
 *> \verbatim
 *>          INFO is INTEGER
 *>          = 0:  successful exit
-*>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          =-5:  if ANORM is NAN or negative.
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value.
+*>                NaNs are illegal values for ANORM, and they propagate to
+*>                the output parameter RCOND.
+*>                Infinity is illegal for ANORM, and it propagates to the output
+*>                parameter RCOND as 0.
+*>          = 1:  if RCOND = NaN, or
+*>                   RCOND = Inf, or
+*>                   the computed norm of the inverse of A is 0.
+*>                In the latter, RCOND = 0 is returned.
 *> \endverbatim
 *
 *  Authors:
@@ -117,7 +124,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleGEcomputational
+*> \ingroup gecon
 *
 *  =====================================================================
       SUBROUTINE DGECON( NORM, N, A, LDA, ANORM, RCOND, WORK, IWORK,
@@ -147,7 +154,7 @@
       LOGICAL            ONENRM
       CHARACTER          NORMIN
       INTEGER            IX, KASE, KASE1
-      DOUBLE PRECISION   AINVNM, SCALE, SL, SMLNUM, SU
+      DOUBLE PRECISION   AINVNM, SCALE, SL, SMLNUM, SU, HUGEVAL
 *     ..
 *     .. Local Arrays ..
       INTEGER            ISAVE( 3 )
@@ -165,6 +172,8 @@
       INTRINSIC          ABS, MAX
 *     ..
 *     .. Executable Statements ..
+*
+      HUGEVAL = DLAMCH( 'Overflow' )
 *
 *     Test the input parameters.
 *
@@ -176,7 +185,7 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF( ANORM.LT.ZERO .OR. DISNAN( ANORM ) ) THEN
+      ELSE IF( ANORM.LT.ZERO ) THEN
          INFO = -5
       END IF
       IF( INFO.NE.0 ) THEN
@@ -192,6 +201,13 @@
          RETURN
       ELSE IF( ANORM.EQ.ZERO ) THEN
          RETURN
+      ELSE IF( DISNAN( ANORM ) ) THEN
+         RCOND = ANORM
+         INFO = -5
+         RETURN
+      ELSE IF( ANORM.GT.HUGEVAL ) THEN
+         INFO = -5
+         RETURN
       END IF
 *
       SMLNUM = DLAMCH( 'Safe minimum' )
@@ -248,8 +264,17 @@
 *
 *     Compute the estimate of the reciprocal condition number.
 *
-      IF( AINVNM.NE.ZERO )
-     $   RCOND = ( ONE / AINVNM ) / ANORM
+      IF( AINVNM.NE.ZERO ) THEN
+         RCOND = ( ONE / AINVNM ) / ANORM
+      ELSE
+         INFO = 1
+         RETURN
+      END IF
+*
+*     Check for NaNs and Infs
+*
+      IF( DISNAN( RCOND ) .OR. RCOND.GT.HUGEVAL )
+     $   INFO = 1
 *
    20 CONTINUE
       RETURN
diff --git a/lapack-netlib/SRC/dgedmd.c b/lapack-netlib/SRC/dgedmd.c
new file mode 100644
index 000000000..66b4d5da6
--- /dev/null
+++ b/lapack-netlib/SRC/dgedmd.c
@@ -0,0 +1,1753 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER /* NOTE(review): the MSVC variants below divide real by real and imaginary by imaginary (not a complex quotient), and Cf()/Cd() return by value, so the assignments appear to target a temporary rather than *c -- confirm before relying on them */
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else /* native C99 complex: *c = (*a) / (*b) using the compiler's complex division */
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) { /* returns x raised to the integer power n (single precision) */
+	float pow=1.0; unsigned long int u; /* pow accumulates the result; u holds the exponent bits still to process */
+	if(n != 0) { /* n == 0 skips the loop and returns 1.0 */
+		if(n < 0) n = -n, x = 1/x; /* negative exponent: use the reciprocal base with |n| */
+		for(u = n; ; ) { /* square-and-multiply, consuming u from the low bit upward */
+			if(u & 01) pow *= x; /* current bit set: fold this power of x into the result */
+			if(u >>= 1) x *= x; /* more bits remain: square the base for the next bit */
+			else break; /* exponent exhausted */
+		}
+	}
+	return pow;
+}
+static double dpow_ui(double x, integer n) { /* returns x raised to the integer power n (double precision) */
+	double pow=1.0; unsigned long int u; /* pow accumulates the result; u holds the exponent bits still to process */
+	if(n != 0) { /* n == 0 skips the loop and returns 1.0 */
+		if(n < 0) n = -n, x = 1/x; /* negative exponent: use the reciprocal base with |n| */
+		for(u = n; ; ) { /* square-and-multiply, consuming u from the low bit upward */
+			if(u & 01) pow *= x; /* current bit set: fold this power of x into the result */
+			if(u >>= 1) x *= x; /* more bits remain: square the base for the next bit */
+			else break; /* exponent exhausted */
+		}
+	}
+	return pow;
+}
+#ifdef _MSC_VER /* MSVC build: result is returned as _Fcomplex, the input arrives as the f2c struct */
+static _Fcomplex cpow_ui(complex x, integer n) { /* integer power of a single-precision complex value */
+	complex pow={1.0,0.0}; unsigned long int u; /* pow starts at 1+0i; u holds the remaining exponent bits */
+		if(n != 0) {
+		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; /* NOTE(review): inverts r and i separately, which is not the complex reciprocal -- confirm intent */
+		for(u = n; ; ) {
+			if(u & 01) pow.r *= x.r, pow.i *= x.i; /* NOTE(review): component-wise product, not a complex multiply -- confirm intent */
+			if(u >>= 1) x.r *= x.r, x.i *= x.i; /* NOTE(review): component-wise squaring, not a complex square -- confirm intent */
+			else break; /* exponent exhausted */
+		}
+	}
+	_Fcomplex p={pow.r, pow.i}; /* repack the f2c struct into the MSVC complex type */
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) { /* integer power of a single-precision complex value (C99 complex arithmetic) */
+	_Complex float pow=1.0; unsigned long int u; /* pow accumulates the result; u holds the remaining exponent bits */
+	if(n != 0) { /* n == 0 skips the loop and returns 1 */
+		if(n < 0) n = -n, x = 1/x; /* negative exponent: use the complex reciprocal with |n| */
+		for(u = n; ; ) { /* square-and-multiply, consuming u from the low bit upward */
+			if(u & 01) pow *= x; /* current bit set: fold this power of x into the result */
+			if(u >>= 1) x *= x; /* more bits remain: square the base for the next bit */
+			else break; /* exponent exhausted */
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER /* MSVC build: argument and result use _Dcomplex and its _Val[2] storage */
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) { /* integer power of a double-precision complex value */
+	_Dcomplex pow={1.0,0.0}; unsigned long int u; /* pow starts at 1+0i; u holds the remaining exponent bits */
+	if(n != 0) {
+		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; /* NOTE(review): inverts the two parts separately, which is not the complex reciprocal -- confirm intent */
+		for(u = n; ; ) {
+			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; /* NOTE(review): component-wise product, not a complex multiply -- confirm intent */
+			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; /* NOTE(review): component-wise squaring, not a complex square -- confirm intent */
+			else break; /* exponent exhausted */
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]}; /* copy of the accumulated value to return */
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) { /* integer power of a double-precision complex value (C99 complex arithmetic) */
+	_Complex double pow=1.0; unsigned long int u; /* pow accumulates the result; u holds the remaining exponent bits */
+	if(n != 0) { /* n == 0 skips the loop and returns 1 */
+		if(n < 0) n = -n, x = 1/x; /* negative exponent: use the complex reciprocal with |n| */
+		for(u = n; ; ) { /* square-and-multiply, consuming u from the low bit upward */
+			if(u & 01) pow *= x; /* current bit set: fold this power of x into the result */
+			if(u >>= 1) x *= x; /* more bits remain: square the base for the next bit */
+			else break; /* exponent exhausted */
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) { /* integer x raised to the integer power n, with integer (truncating) results */
+	integer pow; unsigned long int u; /* pow is assigned on every path below before it is returned */
+	if (n <= 0) { /* non-positive exponents are resolved here without looping, except x == -1 */
+		if (n == 0 || x == 1) pow = 1; /* x**0 and 1**n are both 1 */
+		else if (x != -1) pow = x == 0 ? 1/x : 0; /* n < 0: result is 0, except x == 0 which evaluates 1/x (an integer division by zero) */
+		else n = -n; /* x == -1 with n < 0: flip n positive so the loop below computes (-1)**|n| */
+	}
+	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { /* loop for positive n (including the flipped x == -1 case) */
+		u = n;
+		for(pow = 1; ; ) { /* square-and-multiply, consuming u from the low bit upward */
+			if(u & 01) pow *= x; /* current bit set: fold this power of x into the result */
+			if(u >>= 1) x *= x; /* more bits remain: square the base for the next bit */
+			else break; /* exponent exhausted */
+		}
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n) /* position of the largest of w[s-1..e-1]; the parameter n is not used in the body */
+{
+	double m; integer i, mi; /* m = largest value seen so far, mi = its 1-based index in w */
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++) /* s and e are 1-based, inclusive bounds */
+		if (w[i-1]>m) mi=i ,m=w[i-1]; /* strict '>' keeps the first occurrence when values tie */
+	return mi-s+1; /* 1-based position relative to s */
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n) /* position of the largest of w[s-1..e-1]; the parameter n is not used in the body */
+{
+	float m; integer i, mi; /* m = largest value seen so far, mi = its 1-based index in w */
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++) /* s and e are 1-based, inclusive bounds */
+		if (w[i-1]>m) mi=i ,m=w[i-1]; /* strict '>' keeps the first occurrence when values tie */
+	return mi-s+1; /* 1-based position relative to s */
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i; /* stores sum over i of conj(x(i))*y(i) into *z; element i is read at x[i*incx] and y[i*incy] */
+#ifdef _MSC_VER /* MSVC has no complex operators, so the product is expanded by hand on the r/i struct fields */
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += x[i].r * y[i].r + x[i].i * y[i].i; /* Re(conj(x)*y) = xr*yr + xi*yi */
+			zdotc._Val[1] += x[i].r * y[i].i - x[i].i * y[i].r; /* Im(conj(x)*y) = xr*yi - xi*yr */
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r + x[i*incx].i * y[i*incy].i; /* Re(conj(x)*y) */
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i - x[i*incx].i * y[i*incy].r; /* Im(conj(x)*y) */
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i; /* stores sum over i of conj(x(i))*y(i) into *z; element i is read at x[i*incx] and y[i*incy] */
+#ifdef _MSC_VER /* MSVC has no complex operators, so the product is expanded by hand on the r/i struct fields */
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += x[i].r * y[i].r + x[i].i * y[i].i; /* Re(conj(x)*y) = xr*yr + xi*yi */
+			zdotc._Val[1] += x[i].r * y[i].i - x[i].i * y[i].r; /* Im(conj(x)*y) = xr*yi - xi*yr */
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r + x[i*incx].i * y[i*incy].i; /* Re(conj(x)*y) */
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i - x[i*incx].i * y[i*incy].r; /* Im(conj(x)*y) */
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i; /* stores the unconjugated sum over i of x(i)*y(i) into *z; element i is read at x[i*incx] and y[i*incy] */
+#ifdef _MSC_VER /* MSVC has no complex operators, so the product is expanded by hand on the r/i struct fields */
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i)*y(i) */
+			zdotc._Val[0] += x[i].r * y[i].r - x[i].i * y[i].i; /* Re(x*y) = xr*yr - xi*yi */
+			zdotc._Val[1] += x[i].r * y[i].i + x[i].i * y[i].r; /* Im(x*y) = xr*yi + xi*yr */
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i)*y(i) */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r - x[i*incx].i * y[i*incy].i; /* Re(x*y) */
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i + x[i*incx].i * y[i*incy].r; /* Im(x*y) */
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i)*y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i)*y(i) */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i; /* stores the unconjugated sum over i of x(i)*y(i) into *z; element i is read at x[i*incx] and y[i*incy] */
+#ifdef _MSC_VER /* MSVC has no complex operators, so the product is expanded by hand on the r/i struct fields */
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i)*y(i) */
+			zdotc._Val[0] += x[i].r * y[i].r - x[i].i * y[i].i; /* Re(x*y) = xr*yr - xi*yi */
+			zdotc._Val[1] += x[i].r * y[i].i + x[i].i * y[i].r; /* Im(x*y) = xr*yi + xi*yr */
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i)*y(i) */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r - x[i*incx].i * y[i*incy].i; /* Re(x*y) */
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i + x[i*incx].i * y[i*incy].r; /* Im(x*y) */
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i)*y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i)*y(i) */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static integer c_n1 = -1;
+static integer c__1 = 1;
+static integer c__0 = 0;
+static integer c__2 = 2;
+
+/* Subroutine */ int dgedmd_(char *jobs, char *jobz, char *jobr, char *jobf, 
+	integer *whtsvd, integer *m, integer *n, doublereal *x, integer *ldx, 
+	doublereal *y, integer *ldy, integer *nrnk, doublereal *tol, integer *
+	k, doublereal *reig, doublereal *imeig, doublereal *z__, integer *ldz,
+	 doublereal *res, doublereal *b, integer *ldb, doublereal *w, integer 
+	*ldw, doublereal *s, integer *lds, doublereal *work, integer *lwork, 
+	integer *iwork, integer *liwork, integer *info)
+{
+    /* System generated locals */
+    integer x_dim1, x_offset, y_dim1, y_offset, z_dim1, z_offset, b_dim1, 
+	    b_offset, w_dim1, w_offset, s_dim1, s_offset, i__1, i__2;
+    doublereal d__1, d__2;
+
+    /* Local variables */
+    doublereal zero, ssum;
+    integer info1, info2;
+    extern doublereal dnrm2_(integer *, doublereal *, integer *);
+    doublereal xscl1, xscl2;
+    integer i__, j;
+    extern /* Subroutine */ int dscal_(integer *, doublereal *, doublereal *, 
+	    integer *);
+    doublereal scale;
+    extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, 
+	    integer *, doublereal *, doublereal *, integer *, doublereal *, 
+	    integer *, doublereal *, doublereal *, integer *),
+	     dgeev_(char *, char *, integer *, doublereal *, integer *, 
+	    doublereal *, doublereal *, doublereal *, integer *, doublereal *,
+	     integer *, doublereal *, integer *, integer *);
+    extern logical lsame_(char *, char *);
+    logical badxy;
+    doublereal small;
+    char jobzl[1];
+    extern /* Subroutine */ int daxpy_(integer *, doublereal *, doublereal *, 
+	    integer *, doublereal *, integer *);
+    logical wntex;
+    doublereal ab[4]	/* was [2][2] */;
+    extern doublereal dlamch_(char *), dlange_(char *, integer *, 
+	    integer *, doublereal *, integer *, doublereal *);
+    extern /* Subroutine */ int dgesdd_(char *, integer *, integer *, 
+	    doublereal *, integer *, doublereal *, doublereal *, integer *, 
+	    doublereal *, integer *, doublereal *, integer *, integer *, 
+	    integer *), dlascl_(char *, integer *, integer *, 
+	    doublereal *, doublereal *, integer *, integer *, doublereal *, 
+	    integer *, integer *);
+    extern integer idamax_(integer *, doublereal *, integer *);
+    extern logical disnan_(doublereal *);
+    extern /* Subroutine */ int dgesvd_(char *, char *, integer *, integer *, 
+	    doublereal *, integer *, doublereal *, doublereal *, integer *, 
+	    doublereal *, integer *, doublereal *, integer *, integer *), dlacpy_(char *, integer *, integer *, doublereal 
+	    *, integer *, doublereal *, integer *), xerbla_(char *, 
+	    integer *);
+    char t_or_n__[1];
+    extern /* Subroutine */ int dgejsv_(char *, char *, char *, char *, char *
+	    , char *, integer *, integer *, doublereal *, integer *, 
+	    doublereal *, doublereal *, integer *, doublereal *, integer *, 
+	    doublereal *, integer *, integer *, integer *), dlassq_(integer *, doublereal *, 
+	    integer *, doublereal *, doublereal *);
+    logical sccolx, sccoly;
+    integer lwrsdd, mwrsdd, iminwr;
+    logical wntref, wntvec;
+    doublereal rootsc;
+    integer lwrkev, mlwork, mwrkev, numrnk, olwork;
+    doublereal rdummy[2];
+    integer lwrsvd, mwrsvd;
+    logical lquery, wntres;
+    char jsvopt[1];
+    extern /* Subroutine */ int mecago_();
+    integer mwrsvj, lwrsvq, mwrsvq;
+    doublereal rdummy2[2], ofl, one;
+    extern /* Subroutine */ int dgesvdq_(char *, char *, char *, char *, char 
+	    *, integer *, integer *, doublereal *, integer *, doublereal *, 
+	    doublereal *, integer *, doublereal *, integer *, integer *, 
+	    integer *, integer *, doublereal *, integer *, doublereal *, 
+	    integer *, integer *);
+
+/* March 2023 */
+/* ..... */
+/*      USE                   iso_fortran_env */
+/*      INTEGER, PARAMETER :: WP = real64 */
+/* ..... */
+/*     Scalar arguments */
+/*     Array arguments */
+/* ............................................................ */
+/*     Purpose */
+/*     ======= */
+/*     DGEDMD computes the Dynamic Mode Decomposition (DMD) for */
+/*     a pair of data snapshot matrices. For the input matrices */
+/*     X and Y such that Y = A*X with an inaccessible matrix */
+/*     A, DGEDMD computes a certain number of Ritz pairs of A using */
+/*     the standard Rayleigh-Ritz extraction from a subspace of */
+/*     range(X) that is determined using the leading left singular */
+/*     vectors of X. Optionally, DGEDMD returns the residuals */
+/*     of the computed Ritz pairs, the information needed for */
+/*     a refinement of the Ritz vectors, or the eigenvectors of */
+/*     the Exact DMD. */
+/*     For further details see the references listed */
+/*     below. For more details of the implementation see [3]. */
+
+/*     References */
+/*     ========== */
+/*     [1] P. Schmid: Dynamic mode decomposition of numerical */
+/*         and experimental data, */
+/*         Journal of Fluid Mechanics 656, 5-28, 2010. */
+/*     [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal */
+/*         decompositions: analysis and enhancements, */
+/*         SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. */
+/*     [3] Z. Drmac: A LAPACK implementation of the Dynamic */
+/*         Mode Decomposition I. Technical report. AIMDyn Inc. */
+/*         and LAPACK Working Note 298. */
+/*     [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. */
+/*         Brunton, N. Kutz: On Dynamic Mode Decomposition: */
+/*         Theory and Applications, Journal of Computational */
+/*         Dynamics 1(2), 391 -421, 2014. */
+
+/* ...................................................................... */
+/*     Developed and supported by: */
+/*     =========================== */
+/*     Developed and coded by Zlatko Drmac, Faculty of Science, */
+/*     University of Zagreb;  drmac@math.hr */
+/*     In cooperation with */
+/*     AIMdyn Inc., Santa Barbara, CA. */
+/*     and supported by */
+/*     - DARPA SBIR project "Koopman Operator-Based Forecasting */
+/*     for Nonstationary Processes from Near-Term, Limited */
+/*     Observational Data" Contract No: W31P4Q-21-C-0007 */
+/*     - DARPA PAI project "Physics-Informed Machine Learning */
+/*     Methodologies" Contract No: HR0011-18-9-0033 */
+/*     - DARPA MoDyL project "A Data-Driven, Operator-Theoretic */
+/*     Framework for Space-Time Analysis of Process Dynamics" */
+/*     Contract No: HR0011-16-C-0116 */
+/*     Any opinions, findings and conclusions or recommendations */
+/*     expressed in this material are those of the author and */
+/*     do not necessarily reflect the views of the DARPA SBIR */
+/*     Program Office */
+/* ============================================================ */
+/*     Distribution Statement A: */
+/*     Approved for Public Release, Distribution Unlimited. */
+/*     Cleared by DARPA on September 29, 2022 */
+/* ============================================================ */
+/* ............................................................ */
+/*     Arguments */
+/*     ========= */
+/*     JOBS (input) CHARACTER*1 */
+/*     Determines whether the initial data snapshots are scaled */
+/*     by a diagonal matrix. */
+/*     'S' :: The data snapshots matrices X and Y are multiplied */
+/*            with a diagonal matrix D so that X*D has unit */
+/*            nonzero columns (in the Euclidean 2-norm) */
+/*     'C' :: The snapshots are scaled as with the 'S' option. */
+/*            If it is found that an i-th column of X is zero */
+/*            vector and the corresponding i-th column of Y is */
+/*            non-zero, then the i-th column of Y is set to */
+/*            zero and a warning flag is raised. */
+/*     'Y' :: The data snapshots matrices X and Y are multiplied */
+/*            by a diagonal matrix D so that Y*D has unit */
+/*            nonzero columns (in the Euclidean 2-norm) */
+/*     'N' :: No data scaling. */
+/* ..... */
+/*     JOBZ (input) CHARACTER*1 */
+/*     Determines whether the eigenvectors (Koopman modes) will */
+/*     be computed. */
+/*     'V' :: The eigenvectors (Koopman modes) will be computed */
+/*            and returned in the matrix Z. */
+/*            See the description of Z. */
+/*     'F' :: The eigenvectors (Koopman modes) will be returned */
+/*            in factored form as the product X(:,1:K)*W, where X */
+/*            contains a POD basis (leading left singular vectors */
+/*            of the data matrix X) and W contains the eigenvectors */
+/*            of the corresponding Rayleigh quotient. */
+/*            See the descriptions of K, X, W, Z. */
+/*     'N' :: The eigenvectors are not computed. */
+/* ..... */
+/*     JOBR (input) CHARACTER*1 */
+/*     Determines whether to compute the residuals. */
+/*     'R' :: The residuals for the computed eigenpairs will be */
+/*            computed and stored in the array RES. */
+/*            See the description of RES. */
+/*            For this option to be legal, JOBZ must be 'V'. */
+/*     'N' :: The residuals are not computed. */
+/* ..... */
+/*     JOBF (input) CHARACTER*1 */
+/*     Specifies whether to store information needed for post- */
+/*     processing (e.g. computing refined Ritz vectors) */
+/*     'R' :: The matrix needed for the refinement of the Ritz */
+/*            vectors is computed and stored in the array B. */
+/*            See the description of B. */
+/*     'E' :: The unscaled eigenvectors of the Exact DMD are */
+/*            computed and returned in the array B. See the */
+/*            description of B. */
+/*     'N' :: No eigenvector refinement data is computed. */
+/* ..... */
+/*     WHTSVD (input) INTEGER, WHTSVD in { 1, 2, 3, 4 } */
+/*     Allows for a selection of the SVD algorithm from the */
+/*     LAPACK library. */
+/*     1 :: DGESVD (the QR SVD algorithm) */
+/*     2 :: DGESDD (the Divide and Conquer algorithm; if enough */
+/*          workspace available, this is the fastest option) */
+/*     3 :: DGESVDQ (the preconditioned QR SVD  ; this and 4 */
+/*          are the most accurate options) */
+/*     4 :: DGEJSV (the preconditioned Jacobi SVD; this and 3 */
+/*          are the most accurate options) */
+/*     For the four methods above, a significant difference in */
+/*     the accuracy of small singular values is possible if */
+/*     the snapshots vary in norm so that X is severely */
+/*     ill-conditioned. If small (smaller than EPS*||X||) */
+/*     singular values are of interest and JOBS=='N',  then */
+/*     the options (3, 4) give the most accurate results, where */
+/*     the option 4 is slightly better and with stronger */
+/*     theoretical background. */
+/*     If JOBS=='S', i.e. the columns of X will be normalized, */
+/*     then all methods give nearly equally accurate results. */
+/* ..... */
+/*     M (input) INTEGER, M>= 0 */
+/*     The state space dimension (the row dimension of X, Y). */
+/* ..... */
+/*     N (input) INTEGER, 0 <= N <= M */
+/*     The number of data snapshot pairs */
+/*     (the number of columns of X and Y). */
+/* ..... */
+/*     X (input/output) REAL(KIND=WP) M-by-N array */
+/*     > On entry, X contains the data snapshot matrix X. It is */
+/*     assumed that the column norms of X are in the range of */
+/*     the normalized floating point numbers. */
+/*     < On exit, the leading K columns of X contain a POD basis, */
+/*     i.e. the leading K left singular vectors of the input */
+/*     data matrix X, U(:,1:K). All N columns of X contain all */
+/*     left singular vectors of the input matrix X. */
+/*     See the descriptions of K, Z and W. */
+/* ..... */
+/*     LDX (input) INTEGER, LDX >= M */
+/*     The leading dimension of the array X. */
+/* ..... */
+/*     Y (input/workspace/output) REAL(KIND=WP) M-by-N array */
+/*     > On entry, Y contains the data snapshot matrix Y */
+/*     < On exit, */
+/*     If JOBR == 'R', the leading K columns of Y  contain */
+/*     the residual vectors for the computed Ritz pairs. */
+/*     See the description of RES. */
+/*     If JOBR == 'N', Y contains the original input data, */
+/*                     scaled according to the value of JOBS. */
+/* ..... */
+/*     LDY (input) INTEGER , LDY >= M */
+/*     The leading dimension of the array Y. */
+/* ..... */
+/*     NRNK (input) INTEGER */
+/*     Determines the mode how to compute the numerical rank, */
+/*     i.e. how to truncate small singular values of the input */
+/*     matrix X. On input, if */
+/*     NRNK = -1 :: i-th singular value sigma(i) is truncated */
+/*                  if sigma(i) <= TOL*sigma(1). */
+/*                  This option is recommended. */
+/*     NRNK = -2 :: i-th singular value sigma(i) is truncated */
+/*                  if sigma(i) <= TOL*sigma(i-1) */
+/*                  This option is included for R&D purposes. */
+/*                  It requires highly accurate SVD, which */
+/*                  may not be feasible. */
+
+/*     The numerical rank can be enforced by using positive */
+/*     value of NRNK as follows: */
+/*     0 < NRNK <= N :: at most NRNK largest singular values */
+/*     will be used. If the number of the computed nonzero */
+/*     singular values is less than NRNK, then only those */
+/*     nonzero values will be used and the actually used */
+/*     dimension is less than NRNK. The actual number of */
+/*     the nonzero singular values is returned in the variable */
+/*     K. See the descriptions of TOL and  K. */
+/* ..... */
+/*     TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */
+/*     The tolerance for truncating small singular values. */
+/*     See the description of NRNK. */
+/* ..... */
+/*     K (output) INTEGER,  0 <= K <= N */
+/*     The dimension of the POD basis for the data snapshot */
+/*     matrix X and the number of the computed Ritz pairs. */
+/*     The value of K is determined according to the rule set */
+/*     by the parameters NRNK and TOL. */
+/*     See the descriptions of NRNK and TOL. */
+/* ..... */
+/*     REIG (output) REAL(KIND=WP) N-by-1 array */
+/*     The leading K (K<=N) entries of REIG contain */
+/*     the real parts of the computed eigenvalues */
+/*     REIG(1:K) + sqrt(-1)*IMEIG(1:K). */
+/*     See the descriptions of K, IMEIG, and Z. */
+/* ..... */
+/*     IMEIG (output) REAL(KIND=WP) N-by-1 array */
+/*     The leading K (K<=N) entries of IMEIG contain */
+/*     the imaginary parts of the computed eigenvalues */
+/*     REIG(1:K) + sqrt(-1)*IMEIG(1:K). */
+/*     The eigenvalues are determined as follows: */
+/*     If IMEIG(i) == 0, then the corresponding eigenvalue is */
+/*     real, LAMBDA(i) = REIG(i). */
+/*     If IMEIG(i)>0, then the corresponding complex */
+/*     conjugate pair of eigenvalues reads */
+/*     LAMBDA(i)   = REIG(i) + sqrt(-1)*IMEIG(i) */
+/*     LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMEIG(i) */
+/*     That is, complex conjugate pairs have consecutive */
+/*     indices (i,i+1), with the positive imaginary part */
+/*     listed first. */
+/*     See the descriptions of K, REIG, and Z. */
+/* ..... */
+/*     Z (workspace/output) REAL(KIND=WP)  M-by-N array */
+/*     If JOBZ =='V' then */
+/*        Z contains real Ritz vectors as follows: */
+/*        If IMEIG(i)=0, then Z(:,i) is an eigenvector of */
+/*        the i-th Ritz value; ||Z(:,i)||_2=1. */
+/*        If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then */
+/*        [Z(:,i) Z(:,i+1)] span an invariant subspace and */
+/*        the Ritz values extracted from this subspace are */
+/*        REIG(i) + sqrt(-1)*IMEIG(i) and */
+/*        REIG(i) - sqrt(-1)*IMEIG(i). */
+/*        The corresponding eigenvectors are */
+/*        Z(:,i) + sqrt(-1)*Z(:,i+1) and */
+/*        Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. */
+/*        || Z(:,i:i+1)||_F = 1. */
+/*     If JOBZ == 'F', then the above descriptions hold for */
+/*     the columns of X(:,1:K)*W(1:K,1:K), where the columns */
+/*     of W(1:K,1:K) are the computed eigenvectors of the */
+/*     K-by-K Rayleigh quotient. The columns of W(1:K,1:K) */
+/*     are similarly structured: If IMEIG(i) == 0 then */
+/*     X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0 */
+/*     then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and */
+/*          X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1) */
+/*     are the eigenvectors of LAMBDA(i), LAMBDA(i+1). */
+/*     See the descriptions of REIG, IMEIG, X and W. */
+/* ..... */
+/*     LDZ (input) INTEGER , LDZ >= M */
+/*     The leading dimension of the array Z. */
+/* ..... */
+/*     RES (output) REAL(KIND=WP) N-by-1 array */
+/*     RES(1:K) contains the residuals for the K computed */
+/*     Ritz pairs. */
+/*     If LAMBDA(i) is real, then */
+/*        RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i) ||_2. */
+/*     If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair */
+/*     then */
+/*     RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F */
+/*     where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] */
+/*               [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. */
+/*     It holds that */
+/*     RES(i)   = || A*ZC(:,i)   - LAMBDA(i)  *ZC(:,i)   ||_2 */
+/*     RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 */
+/*     where ZC(:,i)   =  Z(:,i) + sqrt(-1)*Z(:,i+1) */
+/*           ZC(:,i+1) =  Z(:,i) - sqrt(-1)*Z(:,i+1) */
+/*     See the description of REIG, IMEIG and Z. */
+/* ..... */
+/*     B (output) REAL(KIND=WP)  M-by-N array. */
+/*     IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can */
+/*     be used for computing the refined vectors; see further */
+/*     details in the provided references. */
+/*     If JOBF == 'E', B(1:M,1:K) contains */
+/*     A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */
+/*     Exact DMD, up to scaling by the inverse eigenvalues. */
+/*     If JOBF =='N', then B is not referenced. */
+/*     See the descriptions of X, W, K. */
+/* ..... */
+/*     LDB (input) INTEGER, LDB >= M */
+/*     The leading dimension of the array B. */
+/* ..... */
+/*     W (workspace/output) REAL(KIND=WP) N-by-N array */
+/*     On exit, W(1:K,1:K) contains the K computed */
+/*     eigenvectors of the matrix Rayleigh quotient (real and */
+/*     imaginary parts for each complex conjugate pair of the */
+/*     eigenvalues). The Ritz vectors (returned in Z) are the */
+/*     product of X (containing a POD basis for the input */
+/*     matrix X) and W. See the descriptions of K, S, X and Z. */
+/*     W is also used as a workspace to temporarily store the */
+/*     right singular vectors of X. */
+/* ..... */
+/*     LDW (input) INTEGER, LDW >= N */
+/*     The leading dimension of the array W. */
+/* ..... */
+/*     S (workspace/output) REAL(KIND=WP) N-by-N array */
+/*     The array S(1:K,1:K) is used for the matrix Rayleigh */
+/*     quotient. This content is overwritten during */
+/*     the eigenvalue decomposition by DGEEV. */
+/*     See the description of K. */
+/* ..... */
+/*     LDS (input) INTEGER, LDS >= N */
+/*     The leading dimension of the array S. */
+/* ..... */
+/*     WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array */
+/*     On exit, WORK(1:N) contains the singular values of */
+/*     X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). */
+/*     If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain */
+/*     scaling factor WORK(N+2)/WORK(N+1) used to scale X */
+/*     and Y to avoid overflow in the SVD of X. */
+/*     This may be of interest if the scaling option is off */
+/*     and as many as possible smallest eigenvalues are */
+/*     desired to the highest feasible accuracy. */
+/*     If the call to DGEDMD is only workspace query, then */
+/*     WORK(1) contains the minimal workspace length and */
+/*     WORK(2) is the optimal workspace length. Hence, the */
+/*     length of WORK is at least 2. */
+/*     See the description of LWORK. */
+/* ..... */
+/*     LWORK (input) INTEGER */
+/*     The minimal length of the workspace vector WORK. */
+/*     LWORK is calculated as follows: */
+/*     If WHTSVD == 1 :: */
+/*        If JOBZ == 'V', then */
+/*        LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)). */
+/*        If JOBZ == 'N'  then */
+/*        LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)). */
+/*        Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal */
+/*        workspace length of DGESVD. */
+/*     If WHTSVD == 2 :: */
+/*        If JOBZ == 'V', then */
+/*        LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)) */
+/*        If JOBZ == 'N', then */
+/*        LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)) */
+/*        Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the */
+/*        minimal workspace length of DGESDD. */
+/*     If WHTSVD == 3 :: */
+/*        If JOBZ == 'V', then */
+/*        LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) */
+/*        If JOBZ == 'N', then */
+/*        LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) */
+/*        Here LWORK_SVD = N+M+MAX(3*N+1, */
+/*                        MAX(1,3*N+M,5*N),MAX(1,N)) */
+/*        is the minimal workspace length of DGESVDQ. */
+/*     If WHTSVD == 4 :: */
+/*        If JOBZ == 'V', then */
+/*        LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) */
+/*        If JOBZ == 'N', then */
+/*        LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) */
+/*        Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the */
+/*        minimal workspace length of DGEJSV. */
+/*     The above expressions are not simplified in order to */
+/*     make the usage of WORK more transparent, and for */
+/*     easier checking. In any case, LWORK >= 2. */
+/*     If on entry LWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     and the optimal workspace lengths for both WORK and */
+/*     IWORK. See the descriptions of WORK and IWORK. */
+/* ..... */
+/*     IWORK (workspace/output) INTEGER LIWORK-by-1 array */
+/*     Workspace that is required only if WHTSVD equals */
+/*     2 , 3 or 4. (See the description of WHTSVD). */
+/*     If on entry LWORK =-1 or LIWORK=-1, then the */
+/*     minimal length of IWORK is computed and returned in */
+/*     IWORK(1). See the description of LIWORK. */
+/* ..... */
+/*     LIWORK (input) INTEGER */
+/*     The minimal length of the workspace vector IWORK. */
+/*     If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */
+/*     If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) */
+/*     If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) */
+/*     If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) */
+/*     If on entry LIWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     and the optimal workspace lengths for both WORK and */
+/*     IWORK. See the descriptions of WORK and IWORK. */
+/* ..... */
+/*     INFO (output) INTEGER */
+/*     -i < 0 :: On entry, the i-th argument had an */
+/*               illegal value */
+/*        = 0 :: Successful return. */
+/*        = 1 :: Void input. Quick exit (M=0 or N=0). */
+/*        = 2 :: The SVD computation of X did not converge. */
+/*               Suggestion: Check the input data and/or */
+/*               repeat with different WHTSVD. */
+/*        = 3 :: The computation of the eigenvalues did not */
+/*               converge. */
+/*        = 4 :: If data scaling was requested on input and */
+/*               the procedure found inconsistency in the data */
+/*               such that for some column index i, */
+/*               X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */
+/*               to zero if JOBS=='C'. The computation proceeds */
+/*               with original or modified data and warning */
+/*               flag is set with INFO=4. */
+/* ............................................................. */
+/* ............................................................. */
+/*     Parameters */
+/*     ~~~~~~~~~~ */
+/*     Local scalars */
+/*     ~~~~~~~~~~~~~ */
+/*     Local arrays */
+/*     ~~~~~~~~~~~~ */
+/*     External functions (BLAS and LAPACK) */
+/*     ~~~~~~~~~~~~~~~~~ */
+/*     External subroutines (BLAS and LAPACK) */
+/*     ~~~~~~~~~~~~~~~~~~~~ */
+/*     Intrinsic functions */
+/*     ~~~~~~~~~~~~~~~~~~~ */
+/* ............................................................ */
+    /* Parameter adjustments */
+    x_dim1 = *ldx;
+    x_offset = 1 + x_dim1 * 1;
+    x -= x_offset;
+    y_dim1 = *ldy;
+    y_offset = 1 + y_dim1 * 1;
+    y -= y_offset;
+    --reig;
+    --imeig;
+    z_dim1 = *ldz;
+    z_offset = 1 + z_dim1 * 1;
+    z__ -= z_offset;
+    --res;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    w_dim1 = *ldw;
+    w_offset = 1 + w_dim1 * 1;
+    w -= w_offset;
+    s_dim1 = *lds;
+    s_offset = 1 + s_dim1 * 1;
+    s -= s_offset;
+    --work;
+    --iwork;
+
+    /* Function Body */
+    one = 1.f;
+    zero = 0.f;
+
+/*    Test the input arguments */
+
+    wntres = lsame_(jobr, "R");
+    sccolx = lsame_(jobs, "S") || lsame_(jobs, "C");
+    sccoly = lsame_(jobs, "Y");
+    wntvec = lsame_(jobz, "V");
+    wntref = lsame_(jobf, "R");
+    wntex = lsame_(jobf, "E");
+    *info = 0;
+    lquery = *lwork == -1 || *liwork == -1;
+
+    if (! (sccolx || sccoly || lsame_(jobs, "N"))) {
+	*info = -1;
+    } else if (! (wntvec || lsame_(jobz, "N") || lsame_(
+	    jobz, "F"))) {
+	*info = -2;
+    } else if (! (wntres || lsame_(jobr, "N")) || 
+	    wntres && ! wntvec) {
+	*info = -3;
+    } else if (! (wntref || wntex || lsame_(jobf, "N")))
+	     {
+	*info = -4;
+    } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == 
+	    4)) {
+	*info = -5;
+    } else if (*m < 0) {
+	*info = -6;
+    } else if (*n < 0 || *n > *m) {
+	*info = -7;
+    } else if (*ldx < *m) {
+	*info = -9;
+    } else if (*ldy < *m) {
+	*info = -11;
+    } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) {
+	*info = -12;
+    } else if (*tol < zero || *tol >= one) {
+	*info = -13;
+    } else if (*ldz < *m) {
+	*info = -18;
+    } else if ((wntref || wntex) && *ldb < *m) {
+	*info = -21;
+    } else if (*ldw < *n) {
+	*info = -23;
+    } else if (*lds < *n) {
+	*info = -25;
+    }
+
+    if (*info == 0) {
+/* Compute the minimal and the optimal workspace */
+/* requirements. Simulate running the code and */
+/* determine minimal and optimal sizes of the */
+/* workspace at any moment of the run. */
+	if (*n == 0) {
+/* Quick return. All output except K is void. */
+/* INFO=1 signals the void input. */
+/* In case of a workspace query, the default */
+/* minimal workspace lengths are returned. */
+	    if (lquery) {
+		iwork[1] = 1;
+		work[1] = 2.;
+		work[2] = 2.;
+	    } else {
+		*k = 0;
+	    }
+	    *info = 1;
+	    return 0;
+	}
+	mlwork = f2cmax(2,*n);
+	olwork = f2cmax(2,*n);
+	iminwr = 1;
+/*         SELECT CASE ( WHTSVD ) */
+	if (*whtsvd == 1) {
+/* The following is specified as the minimal */
+/* length of WORK in the definition of DGESVD: */
+/* MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) */
+/* Computing MAX */
+	    i__1 = 1, i__2 = f2cmin(*m,*n) * 3 + f2cmax(*m,*n), i__1 = f2cmax(i__1,
+		    i__2), i__2 = f2cmin(*m,*n) * 5;
+	    mwrsvd = f2cmax(i__1,i__2);
+/* Computing MAX */
+	    i__1 = mlwork, i__2 = *n + mwrsvd;
+	    mlwork = f2cmax(i__1,i__2);
+	    if (lquery) {
+		dgesvd_("O", "S", m, n, &x[x_offset], ldx, &work[1], &b[
+			b_offset], ldb, &w[w_offset], ldw, rdummy, &c_n1, &
+			info1);
+/* Computing MAX */
+		i__1 = mwrsvd, i__2 = (integer) rdummy[0];
+		lwrsvd = f2cmax(i__1,i__2);
+/* Computing MAX */
+		i__1 = olwork, i__2 = *n + lwrsvd;
+		olwork = f2cmax(i__1,i__2);
+	    }
+	} else if (*whtsvd == 2) {
+/* The following is specified as the minimal */
+/* length of WORK in the definition of DGESDD: */
+/* MWRSDD = 3*MIN(M,N)*MIN(M,N) + */
+/* MAX( MAX(M,N),5*MIN(M,N)*MIN(M,N)+4*MIN(M,N) ) */
+/* IMINWR = 8*MIN(M,N) */
+/* Computing MAX */
+	    i__1 = f2cmax(*m,*n), i__2 = f2cmin(*m,*n) * 5 * f2cmin(*m,*n) + (f2cmin(*m,*
+		    n) << 2);
+	    mwrsdd = f2cmin(*m,*n) * 3 * f2cmin(*m,*n) + f2cmax(i__1,i__2);
+/* Computing MAX */
+	    i__1 = mlwork, i__2 = *n + mwrsdd;
+	    mlwork = f2cmax(i__1,i__2);
+	    iminwr = f2cmin(*m,*n) << 3;
+	    if (lquery) {
+		dgesdd_("O", m, n, &x[x_offset], ldx, &work[1], &b[b_offset], 
+			ldb, &w[w_offset], ldw, rdummy, &c_n1, &iwork[1], &
+			info1);
+/* Computing MAX */
+		i__1 = mwrsdd, i__2 = (integer) rdummy[0];
+		lwrsdd = f2cmax(i__1,i__2);
+/* Computing MAX */
+		i__1 = olwork, i__2 = *n + lwrsdd;
+		olwork = f2cmax(i__1,i__2);
+	    }
+	} else if (*whtsvd == 3) {
+/* LWQP3 = 3*N+1 */
+/* LWORQ = MAX(N, 1) */
+/* MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) */
+/* MWRSVQ = N + MAX( LWQP3, MWRSVD, LWORQ ) + MAX(M,2) */
+/* MLWORK = N +  MWRSVQ */
+/* IMINWR = M+N-1 */
+	    dgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &work[
+		    1], &z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, &
+		    iwork[1], liwork, rdummy, &c_n1, rdummy2, &c_n1, &info1);
+	    iminwr = iwork[1];
+	    mwrsvq = (integer) rdummy[1];
+/* Computing MAX */
+	    i__1 = mlwork, i__2 = *n + mwrsvq + (integer) rdummy2[0];
+	    mlwork = f2cmax(i__1,i__2);
+	    if (lquery) {
+/* Computing MAX */
+		i__1 = mwrsvq, i__2 = (integer) rdummy[0];
+		lwrsvq = f2cmax(i__1,i__2);
+/* Computing MAX */
+		i__1 = olwork, i__2 = *n + lwrsvq + (integer) rdummy2[0];
+		olwork = f2cmax(i__1,i__2);
+	    }
+	} else if (*whtsvd == 4) {
+	    *(unsigned char *)jsvopt = 'J';
+/* Formula for JSVOPT='V': MWRSVJ = MAX( 7, 2*M+N, 6*N+2*N*N ); JSVOPT='J' is set above, so the bound computed below is used instead. */
+/* Computing MAX */
+	    i__1 = 7, i__2 = (*m << 1) + *n, i__1 = f2cmax(i__1,i__2), i__2 = (*
+		    n << 2) + *n * *n, i__1 = f2cmax(i__1,i__2), i__2 = (*n << 1)
+		     + *n * *n + 6;
+	    mwrsvj = f2cmax(i__1,i__2);
+/* Computing MAX */
+	    i__1 = mlwork, i__2 = *n + mwrsvj;
+	    mlwork = f2cmax(i__1,i__2);
+/* Computing MAX */
+	    i__1 = 3, i__2 = *m + *n * 3;
+	    iminwr = f2cmax(i__1,i__2);
+	    if (lquery) {
+/* Computing MAX */
+		i__1 = olwork, i__2 = *n + mwrsvj;
+		olwork = f2cmax(i__1,i__2);
+	    }
+/*         END SELECT */
+	}
+	if (wntvec || wntex || lsame_(jobz, "F")) {
+	    *(unsigned char *)jobzl = 'V';
+	} else {
+	    *(unsigned char *)jobzl = 'N';
+	}
+/* Workspace calculation to the DGEEV call */
+	if (lsame_(jobzl, "V")) {
+/* Computing MAX */
+	    i__1 = 1, i__2 = *n << 2;
+	    mwrkev = f2cmax(i__1,i__2);
+	} else {
+/* Computing MAX */
+	    i__1 = 1, i__2 = *n * 3;
+	    mwrkev = f2cmax(i__1,i__2);
+	}
+/* Computing MAX */
+	i__1 = mlwork, i__2 = *n + mwrkev;
+	mlwork = f2cmax(i__1,i__2);
+	if (lquery) {
+	    dgeev_("N", jobzl, n, &s[s_offset], lds, &reig[1], &imeig[1], &w[
+		    w_offset], ldw, &w[w_offset], ldw, rdummy, &c_n1, &info1);
+/* Computing MAX */
+	    i__1 = mwrkev, i__2 = (integer) rdummy[0];
+	    lwrkev = f2cmax(i__1,i__2);
+/* Computing MAX */
+	    i__1 = olwork, i__2 = *n + lwrkev;
+	    olwork = f2cmax(i__1,i__2);
+	}
+
+	if (*liwork < iminwr && ! lquery) {
+	    *info = -29;
+	}
+	if (*lwork < mlwork && ! lquery) {
+	    *info = -27;
+	}
+    }
+
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("DGEDMD", &i__1);
+	return 0;
+    } else if (lquery) {
+/*     Return minimal and optimal workspace sizes */
+	iwork[1] = iminwr;
+	work[1] = (doublereal) mlwork;
+	work[2] = (doublereal) olwork;
+	return 0;
+    }
+/* ............................................................ */
+
+    ofl = dlamch_("O");
+    small = dlamch_("S");
+    badxy = FALSE_;
+
+/*     <1> Optional scaling of the snapshots (columns of X, Y) */
+/*     ========================================================== */
+    if (sccolx) {
+/* The columns of X will be normalized. */
+/* To prevent overflows, the column norms of X are */
+/* carefully computed using DLASSQ. */
+	*k = 0;
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+/* WORK(i) = DNRM2( M, X(1,i), 1 ) */
+	    scale = zero;
+	    dlassq_(m, &x[i__ * x_dim1 + 1], &c__1, &scale, &ssum);
+	    if (disnan_(&scale) || disnan_(&ssum)) {
+		*k = 0;
+		*info = -8;
+		i__2 = -(*info);
+		xerbla_("DGEDMD", &i__2);
+	    }
+	    if (scale != zero && ssum != zero) {
+		rootsc = sqrt(ssum);
+		if (scale >= ofl / rootsc) {
+/*                 Norm of X(:,i) overflows. First, X(:,i) */
+/*                 is scaled by */
+/*                 ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2. */
+/*                 Next, the norm of X(:,i) is stored without */
+/*                 overflow as WORK(i) = - SCALE * (ROOTSC/M), */
+/*                 the minus sign indicating the 1/M factor. */
+/*                 Scaling is performed without overflow, and */
+/*                 underflow may occur in the smallest entries */
+/*                 of X(:,i). The relative backward and forward */
+/*                 errors are small in the ell_2 norm. */
+		    d__1 = one / rootsc;
+		    dlascl_("G", &c__0, &c__0, &scale, &d__1, m, &c__1, &x[
+			    i__ * x_dim1 + 1], m, &info2);
+		    work[i__] = -scale * (rootsc / (doublereal) (*m));
+		} else {
+/*                 X(:,i) will be scaled to unit 2-norm */
+		    work[i__] = scale * rootsc;
+		    dlascl_("G", &c__0, &c__0, &work[i__], &one, m, &c__1, &x[
+			    i__ * x_dim1 + 1], m, &info2);
+/*                 X(1:M,i) = (ONE/WORK(i)) * X(1:M,i)          ! INTRINSIC */
+/* LAPACK */
+		}
+	    } else {
+		work[i__] = zero;
+		++(*k);
+	    }
+	}
+	if (*k == *n) {
+/* All columns of X are zero. Return error code -8. */
+/* (the 8th input variable had an illegal value) */
+	    *k = 0;
+	    *info = -8;
+	    i__1 = -(*info);
+	    xerbla_("DGEDMD", &i__1);
+	    return 0;
+	}
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+/*           Now, apply the same scaling to the columns of Y. */
+	    if (work[i__] > zero) {
+		d__1 = one / work[i__];
+		dscal_(m, &d__1, &y[i__ * y_dim1 + 1], &c__1);
+/*               Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i)      ! INTRINSIC */
+/* BLAS CALL */
+	    } else if (work[i__] < zero) {
+		d__1 = -work[i__];
+		d__2 = one / (doublereal) (*m);
+		dlascl_("G", &c__0, &c__0, &d__1, &d__2, m, &c__1, &y[i__ * 
+			y_dim1 + 1], m, &info2);
+/* LAPACK CALL */
+	    } else if (y[idamax_(m, &y[i__ * y_dim1 + 1], &c__1) + i__ * 
+		    y_dim1] != zero) {
+/*               X(:,i) is zero vector. For consistency, */
+/*               Y(:,i) should also be zero. If Y(:,i) is not */
+/*               zero, then the data might be inconsistent or */
+/*               corrupted. If JOBS == 'C', Y(:,i) is set to */
+/*               zero and a warning flag is raised. */
+/*               The computation continues but the */
+/*               situation will be reported in the output. */
+		badxy = TRUE_;
+		if (lsame_(jobs, "C")) {
+		    dscal_(m, &zero, &y[i__ * y_dim1 + 1], &c__1);
+		}
+/* BLAS CALL */
+	    }
+	}
+    }
+
+    if (sccoly) {
+/* The columns of Y will be normalized. */
+/* To prevent overflows, the column norms of Y are */
+/* carefully computed using DLASSQ. */
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+/* WORK(i) = DNRM2( M, Y(1,i), 1 ) */
+	    scale = zero;
+	    dlassq_(m, &y[i__ * y_dim1 + 1], &c__1, &scale, &ssum);
+	    if (disnan_(&scale) || disnan_(&ssum)) {
+		*k = 0;
+		*info = -10;
+		i__2 = -(*info);
+		xerbla_("DGEDMD", &i__2);
+	    }
+	    if (scale != zero && ssum != zero) {
+		rootsc = sqrt(ssum);
+		if (scale >= ofl / rootsc) {
+/*                 Norm of Y(:,i) overflows. First, Y(:,i) */
+/*                 is scaled by */
+/*                 ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2. */
+/*                 Next, the norm of Y(:,i) is stored without */
+/*                 overflow as WORK(i) = - SCALE * (ROOTSC/M), */
+/*                 the minus sign indicating the 1/M factor. */
+/*                 Scaling is performed without overflow, and */
+/*                 underflow may occur in the smallest entries */
+/*                 of Y(:,i). The relative backward and forward */
+/*                 errors are small in the ell_2 norm. */
+		    d__1 = one / rootsc;
+		    dlascl_("G", &c__0, &c__0, &scale, &d__1, m, &c__1, &y[
+			    i__ * y_dim1 + 1], m, &info2);
+		    work[i__] = -scale * (rootsc / (doublereal) (*m));
+		} else {
+/*                 Y(:,i) will be scaled to unit 2-norm */
+		    work[i__] = scale * rootsc;
+		    dlascl_("G", &c__0, &c__0, &work[i__], &one, m, &c__1, &y[
+			    i__ * y_dim1 + 1], m, &info2);
+/*                 Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i)          ! INTRINSIC */
+/* LAPACK */
+		}
+	    } else {
+		work[i__] = zero;
+	    }
+	}
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+/*           Now, apply the same scaling to the columns of X. */
+	    if (work[i__] > zero) {
+		d__1 = one / work[i__];
+		dscal_(m, &d__1, &x[i__ * x_dim1 + 1], &c__1);
+/*               X(1:M,i) = (ONE/WORK(i)) * X(1:M,i)      ! INTRINSIC */
+/* BLAS CALL */
+	    } else if (work[i__] < zero) {
+		d__1 = -work[i__];
+		d__2 = one / (doublereal) (*m);
+		dlascl_("G", &c__0, &c__0, &d__1, &d__2, m, &c__1, &x[i__ * 
+			x_dim1 + 1], m, &info2);
+/* LAPACK CALL */
+	    } else if (x[idamax_(m, &x[i__ * x_dim1 + 1], &c__1) + i__ * 
+		    x_dim1] != zero) {
+/*               Y(:,i) is zero vector.  If X(:,i) is not */
+/*               zero, then a warning flag is raised. */
+/*               The computation continues but the */
+/*               situation will be reported in the output. */
+		badxy = TRUE_;
+	    }
+	}
+    }
+
+/*     <2> SVD of the data snapshot matrix X. */
+/*     ===================================== */
+/*     The left singular vectors are stored in the array X. */
+/*     The right singular vectors are in the array W. */
+/*     The array W will later on contain the eigenvectors */
+/*     of a Rayleigh quotient. */
+    numrnk = *n;
+/*      SELECT CASE ( WHTSVD ) */
+    if (*whtsvd == 1) {
+	i__1 = *lwork - *n;
+	dgesvd_("O", "S", m, n, &x[x_offset], ldx, &work[1], &b[b_offset], 
+		ldb, &w[w_offset], ldw, &work[*n + 1], &i__1, &info1);
+/* LAPACK CALL */
+	*(unsigned char *)t_or_n__ = 'T';
+    } else if (*whtsvd == 2) {
+	i__1 = *lwork - *n;
+	dgesdd_("O", m, n, &x[x_offset], ldx, &work[1], &b[b_offset], ldb, &w[
+		w_offset], ldw, &work[*n + 1], &i__1, &iwork[1], &info1);
+/* LAPACK CALL */
+	*(unsigned char *)t_or_n__ = 'T';
+    } else if (*whtsvd == 3) {
+	i__1 = *lwork - *n - f2cmax(2,*m);
+	i__2 = f2cmax(2,*m);
+	dgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &work[1], &
+		z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, &iwork[1], 
+		liwork, &work[*n + f2cmax(2,*m) + 1], &i__1, &work[*n + 1], &
+		i__2, &info1);
+/* LAPACK CALL */
+	dlacpy_("A", m, &numrnk, &z__[z_offset], ldz, &x[x_offset], ldx);
+/* LAPACK CALL */
+	*(unsigned char *)t_or_n__ = 'T';
+    } else if (*whtsvd == 4) {
+	i__1 = *lwork - *n;
+	dgejsv_("F", "U", jsvopt, "N", "N", "P", m, n, &x[x_offset], ldx, &
+		work[1], &z__[z_offset], ldz, &w[w_offset], ldw, &work[*n + 1]
+		, &i__1, &iwork[1], &info1);
+/* LAPACK CALL */
+	dlacpy_("A", m, n, &z__[z_offset], ldz, &x[x_offset], ldx);
+/* LAPACK CALL */
+	*(unsigned char *)t_or_n__ = 'N';
+	xscl1 = work[*n + 1];
+	xscl2 = work[*n + 2];
+	if (xscl1 != xscl2) {
+/* This is an exceptional situation. If the */
+/* data matrices are not scaled and the */
+/* largest singular value of X overflows. */
+/* In that case DGEJSV can return the SVD */
+/* in scaled form. The scaling factor can be used */
+/* to rescale the data (X and Y). */
+	    dlascl_("G", &c__0, &c__0, &xscl1, &xscl2, m, n, &y[y_offset], 
+		    ldy, &info2);
+	}
+/*      END SELECT */
+    }
+
+    if (info1 > 0) {
+/* The SVD selected subroutine did not converge. */
+/* Return with an error code. */
+	*info = 2;
+	return 0;
+    }
+
+    if (work[1] == zero) {
+/* The largest computed singular value of (scaled) */
+/* X is zero. Return error code -8 */
+/* (the 8th input variable had an illegal value). */
+	*k = 0;
+	*info = -8;
+	i__1 = -(*info);
+	xerbla_("DGEDMD", &i__1);
+	return 0;
+    }
+
+/* <3> Determine the numerical rank of the data */
+/*    snapshots matrix X. This depends on the */
+/*    parameters NRNK and TOL. */
+/*      SELECT CASE ( NRNK ) */
+    if (*nrnk == -1) {
+	*k = 1;
+	i__1 = numrnk;
+	for (i__ = 2; i__ <= i__1; ++i__) {
+	    if (work[i__] <= work[1] * *tol || work[i__] <= small) {
+		myexit_();
+	    }
+	    ++(*k);
+	}
+    } else if (*nrnk == -2) {
+	*k = 1;
+	i__1 = numrnk - 1;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    if (work[i__ + 1] <= work[i__] * *tol || work[i__] <= small) {
+		myexit_();
+	    }
+	    ++(*k);
+	}
+    } else {
+	*k = 1;
+	i__1 = *nrnk;
+	for (i__ = 2; i__ <= i__1; ++i__) {
+	    if (work[i__] <= small) {
+		myexit_();
+	    }
+	    ++(*k);
+	}
+/*          END SELECT */
+    }
+/*   Now, U = X(1:M,1:K) is the SVD/POD basis for the */
+/*   snapshot data in the input matrix X. */
+/* <4> Compute the Rayleigh quotient S = U^T * A * U. */
+/*    Depending on the requested outputs, the computation */
+/*    is organized to compute additional auxiliary */
+/*    matrices (for the residuals and refinements). */
+
+/*    In all formulas below, we need V_k*Sigma_k^(-1) */
+/*    where either V_k is in W(1:N,1:K), or V_k^T is in */
+/*    W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)). */
+    if (lsame_(t_or_n__, "N")) {
+	i__1 = *k;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    d__1 = one / work[i__];
+	    dscal_(n, &d__1, &w[i__ * w_dim1 + 1], &c__1);
+/* W(1:N,i) = (ONE/WORK(i)) * W(1:N,i)      ! INTRINSIC */
+/* BLAS CALL */
+	}
+    } else {
+/* This non-unit stride access is due to the fact */
+/* that DGESVD, DGESVDQ and DGESDD return the */
+/* transposed matrix of the right singular vectors. */
+/* DO i = 1, K */
+/* CALL DSCAL( N, ONE/WORK(i), W(i,1), LDW )    ! BLAS CALL */
+/* ! W(i,1:N) = (ONE/WORK(i)) * W(i,1:N)      ! INTRINSIC */
+/* END DO */
+	i__1 = *k;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    work[*n + i__] = one / work[i__];
+	}
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = *k;
+	    for (i__ = 1; i__ <= i__2; ++i__) {
+		w[i__ + j * w_dim1] = work[*n + i__] * w[i__ + j * w_dim1];
+	    }
+	}
+    }
+
+    if (wntref) {
+
+/* Need A*U(:,1:K)=Y*V_k*inv(diag(WORK(1:K))) */
+/* for computing the refined Ritz vectors */
+/* (optionally, outside DGEDMD). */
+	dgemm_("N", t_or_n__, m, k, n, &one, &y[y_offset], ldy, &w[w_offset], 
+		ldw, &zero, &z__[z_offset], ldz);
+/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(W(1:K,1:N)))  ! INTRINSIC */
+/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K))             ! INTRINSIC */
+
+/* At this point Z contains */
+/* A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and */
+/* this is needed for computing the residuals. */
+/* This matrix is  returned in the array B and */
+/* it can be used to compute refined Ritz vectors. */
+/* BLAS */
+	dlacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb);
+/* B(1:M,1:K) = Z(1:M,1:K)                  ! INTRINSIC */
+/* BLAS CALL */
+	dgemm_("T", "N", k, k, m, &one, &x[x_offset], ldx, &z__[z_offset], 
+		ldz, &zero, &s[s_offset], lds);
+/* S(1:K,1:K) = MATMUL(TRANSPOSE(X(1:M,1:K)),Z(1:M,1:K)) ! INTRINSIC */
+/* At this point S = U^T * A * U is the Rayleigh quotient. */
+/* BLAS */
+    } else {
+/* A * U(:,1:K) is not explicitly needed and the */
+/* computation is organized differently. The Rayleigh */
+/* quotient is computed more efficiently. */
+	dgemm_("T", "N", k, n, m, &one, &x[x_offset], ldx, &y[y_offset], ldy, 
+		&zero, &z__[z_offset], ldz);
+/* Z(1:K,1:N) = MATMUL( TRANSPOSE(X(1:M,1:K)), Y(1:M,1:N) )  ! IN */
+/* In the two DGEMM calls here, can use K for LDZ. */
+/* B */
+	dgemm_("N", t_or_n__, k, k, n, &one, &z__[z_offset], ldz, &w[w_offset]
+		, ldw, &zero, &s[s_offset], lds);
+/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRIN */
+/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K)))          ! INTRIN */
+/* At this point S = U^T * A * U is the Rayleigh quotient. */
+/* If the residuals are requested, save scaled V_k into Z. */
+/* Recall that V_k or V_k^T is stored in W. */
+/* BLAS */
+	if (wntres || wntex) {
+	    if (lsame_(t_or_n__, "N")) {
+		dlacpy_("A", n, k, &w[w_offset], ldw, &z__[z_offset], ldz);
+	    } else {
+		dlacpy_("A", k, n, &w[w_offset], ldw, &z__[z_offset], ldz);
+	    }
+	}
+    }
+
+/* <5> Compute the Ritz values and (if requested) the */
+/*   right eigenvectors of the Rayleigh quotient. */
+
+    i__1 = *lwork - *n;
+    dgeev_("N", jobzl, k, &s[s_offset], lds, &reig[1], &imeig[1], &w[w_offset]
+	    , ldw, &w[w_offset], ldw, &work[*n + 1], &i__1, &info1);
+
+/* W(1:K,1:K) contains the eigenvectors of the Rayleigh */
+/* quotient. Even in the case of complex spectrum, all */
+/* computation is done in real arithmetic. REIG and */
+/* IMEIG are the real and the imaginary parts of the */
+/* eigenvalues, so that the spectrum is given as */
+/* REIG(:) + sqrt(-1)*IMEIG(:). Complex conjugate pairs */
+/* are listed at consecutive positions. For such a */
+/* complex conjugate pair of the eigenvalues, the */
+/* corresponding eigenvectors are also a complex */
+/* conjugate pair with the real and imaginary parts */
+/* stored column-wise in W at the corresponding */
+/* consecutive column indices. See the description of Z. */
+/* Also, see the description of DGEEV. */
+/* LAPACK C */
+    if (info1 > 0) {
+/* DGEEV failed to compute the eigenvalues and */
+/* eigenvectors of the Rayleigh quotient. */
+	*info = 3;
+	return 0;
+    }
+
+/* <6> Compute the eigenvectors (if requested) and, */
+/* the residuals (if requested). */
+
+    if (wntvec || wntex) {
+	if (wntres) {
+	    if (wntref) {
+/* Here, if the refinement is requested, we have */
+/* A*U(:,1:K) already computed and stored in Z. */
+/* For the residuals, need Y = A * U(:,1:K) * W. */
+		dgemm_("N", "N", m, k, k, &one, &z__[z_offset], ldz, &w[
+			w_offset], ldw, &zero, &y[y_offset], ldy);
+/* Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K)       ! INTRINSIC */
+/* This frees Z; Y contains A * U(:,1:K) * W. */
+/* BLAS CALL */
+	    } else {
+/* Compute S = V_k * Sigma_k^(-1) * W, where */
+/* V_k * Sigma_k^(-1) is stored in Z */
+		dgemm_(t_or_n__, "N", n, k, k, &one, &z__[z_offset], ldz, &w[
+			w_offset], ldw, &zero, &s[s_offset], lds);
+/* Then, compute Z = Y * S = */
+/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */
+/* = A * U(:,1:K) * W(1:K,1:K) */
+		dgemm_("N", "N", m, k, n, &one, &y[y_offset], ldy, &s[
+			s_offset], lds, &zero, &z__[z_offset], ldz);
+/* Save a copy of Z into Y and free Z for holding */
+/* the Ritz vectors. */
+		dlacpy_("A", m, k, &z__[z_offset], ldz, &y[y_offset], ldy);
+		if (wntex) {
+		    dlacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb);
+		}
+	    }
+	} else if (wntex) {
+/* Compute S = V_k * Sigma_k^(-1) * W, where */
+/* V_k * Sigma_k^(-1) is stored in Z */
+	    dgemm_(t_or_n__, "N", n, k, k, &one, &z__[z_offset], ldz, &w[
+		    w_offset], ldw, &zero, &s[s_offset], lds);
+/* Then, compute Z = Y * S = */
+/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */
+/* = A * U(:,1:K) * W(1:K,1:K) */
+	    dgemm_("N", "N", m, k, n, &one, &y[y_offset], ldy, &s[s_offset], 
+		    lds, &zero, &b[b_offset], ldb);
+/* The above call replaces the following two calls */
+/* that were used in the developing-testing phase. */
+/* CALL DGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, & */
+/*           LDS, ZERO, Z, LDZ) */
+/* Save a copy of Z into B and free Z for holding */
+/* the Ritz vectors. */
+/* CALL DLACPY( 'A', M, K, Z, LDZ, B, LDB ) */
+	}
+
+/* Compute the real form of the Ritz vectors */
+	if (wntvec) {
+	    dgemm_("N", "N", m, k, k, &one, &x[x_offset], ldx, &w[w_offset], 
+		    ldw, &zero, &z__[z_offset], ldz);
+	}
+/* Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K))         ! INTRINSIC */
+
+/* BLAS CALL */
+	if (wntres) {
+	    i__ = 1;
+	    while(i__ <= *k) {
+		if (imeig[i__] == zero) {
+/* have a real eigenvalue with real eigenvector */
+		    d__1 = -reig[i__];
+		    daxpy_(m, &d__1, &z__[i__ * z_dim1 + 1], &c__1, &y[i__ * 
+			    y_dim1 + 1], &c__1);
+/* Y(1:M,i) = Y(1:M,i) - REIG(i) * Z(1:M,i)            ! */
+
+		    res[i__] = dnrm2_(m, &y[i__ * y_dim1 + 1], &c__1);
+
+		    ++i__;
+		} else {
+/* Have a complex conjugate pair */
+/* REIG(i) +- sqrt(-1)*IMEIG(i). */
+/* Since all computation is done in real */
+/* arithmetic, the formula for the residual */
+/* is recast for real representation of the */
+/* complex conjugate eigenpair. See the */
+/* description of RES. */
+		    ab[0] = reig[i__];
+		    ab[1] = -imeig[i__];
+		    ab[2] = imeig[i__];
+		    ab[3] = reig[i__];
+		    d__1 = -one;
+		    dgemm_("N", "N", m, &c__2, &c__2, &d__1, &z__[i__ * 
+			    z_dim1 + 1], ldz, ab, &c__2, &one, &y[i__ * 
+			    y_dim1 + 1], ldy);
+/* Y(1:M,i:i+1) = Y(1:M,i:i+1) - Z(1:M,i:i+1) * AB   ! INT */
+/* BL */
+		    res[i__] = dlange_("F", m, &c__2, &y[i__ * y_dim1 + 1], 
+			    ldy, &work[*n + 1]);
+/* LA */
+		    res[i__ + 1] = res[i__];
+		    i__ += 2;
+		}
+	    }
+	}
+    }
+
+    if (*whtsvd == 4) {
+	work[*n + 1] = xscl1;
+	work[*n + 2] = xscl2;
+    }
+
+/*     Successful exit. */
+    if (! badxy) {
+	*info = 0;
+    } else {
+/* A warning on possible data inconsistency. */
+/* This should be a rare event. */
+	*info = 4;
+    }
+/* ............................................................ */
+    return 0;
+/*     ...... */
+} /* dgedmd_ */
+
diff --git a/lapack-netlib/SRC/dgedmd.f90 b/lapack-netlib/SRC/dgedmd.f90
new file mode 100644
index 000000000..15df48fe9
--- /dev/null
+++ b/lapack-netlib/SRC/dgedmd.f90
@@ -0,0 +1,1206 @@
+!> \brief \b DGEDMD computes the Dynamic Mode Decomposition (DMD) for a pair of data snapshot matrices.
+!
+!  =========== DOCUMENTATION ===========
+!
+!  Definition:
+!  ===========
+!
+!     SUBROUTINE DGEDMD( JOBS, JOBZ, JOBR, JOBF,  WHTSVD,  &
+!                        M, N, X, LDX, Y, LDY, NRNK, TOL,  &
+!                        K, REIG,  IMEIG,   Z, LDZ,  RES,  &
+!                        B, LDB, W,  LDW,   S, LDS,        &
+!                        WORK, LWORK, IWORK, LIWORK, INFO )
+!
+!.....
+!     USE                   iso_fortran_env
+!     IMPLICIT NONE
+!     INTEGER, PARAMETER :: WP = real64
+!.....
+!     Scalar arguments
+!     CHARACTER, INTENT(IN)   :: JOBS,   JOBZ,  JOBR,  JOBF
+!     INTEGER,   INTENT(IN)   :: WHTSVD, M, N,   LDX,  LDY, &
+!                                NRNK, LDZ, LDB, LDW,  LDS, &
+!                                LWORK,  LIWORK
+!     INTEGER,   INTENT(OUT)  :: K, INFO
+!     REAL(KIND=WP), INTENT(IN)  :: TOL
+!     Array arguments
+!     REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*)
+!     REAL(KIND=WP), INTENT(OUT)   :: Z(LDZ,*), B(LDB,*), &
+!                                     W(LDW,*), S(LDS,*)
+!     REAL(KIND=WP), INTENT(OUT)   :: REIG(*),  IMEIG(*), &
+!                                     RES(*)
+!     REAL(KIND=WP), INTENT(OUT)   :: WORK(*)
+!     INTEGER,       INTENT(OUT)   :: IWORK(*)
+!
+!............................................................
+!>    \par Purpose:
+!     =============
+!>    \verbatim
+!>    DGEDMD computes the Dynamic Mode Decomposition (DMD) for
+!>    a pair of data snapshot matrices. For the input matrices
+!>    X and Y such that Y = A*X with an inaccessible matrix
+!>    A, DGEDMD computes a certain number of Ritz pairs of A using
+!>    the standard Rayleigh-Ritz extraction from a subspace of
+!>    range(X) that is determined using the leading left singular
+!>    vectors of X. Optionally, DGEDMD returns the residuals
+!>    of the computed Ritz pairs, the information needed for
+!>    a refinement of the Ritz vectors, or the eigenvectors of
+!>    the Exact DMD.
+!>    For further details see the references listed
+!>    below. For more details of the implementation see [3].
+!>    \endverbatim
+!............................................................
+!>    \par References:
+!     ================
+!>    \verbatim
+!>    [1] P. Schmid: Dynamic mode decomposition of numerical
+!>        and experimental data,
+!>        Journal of Fluid Mechanics 656, 5-28, 2010.
+!>    [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal
+!>        decompositions: analysis and enhancements,
+!>        SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018.
+!>    [3] Z. Drmac: A LAPACK implementation of the Dynamic
+!>        Mode Decomposition I. Technical report. AIMDyn Inc.
+!>        and LAPACK Working Note 298.
+!>    [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L.
+!>        Brunton, N. Kutz: On Dynamic Mode Decomposition:
+!>        Theory and Applications, Journal of Computational
+!>        Dynamics 1(2), 391-421, 2014.
+!>    \endverbatim
+!......................................................................
+!>    \par Developed and supported by:
+!     ================================
+!>    \verbatim
+!>    Developed and coded by Zlatko Drmac, Faculty of Science,
+!>    University of Zagreb;  drmac@math.hr
+!>    In cooperation with
+!>    AIMdyn Inc., Santa Barbara, CA.
+!>    and supported by
+!>    - DARPA SBIR project "Koopman Operator-Based Forecasting
+!>    for Nonstationary Processes from Near-Term, Limited
+!>    Observational Data" Contract No: W31P4Q-21-C-0007
+!>    - DARPA PAI project "Physics-Informed Machine Learning
+!>    Methodologies" Contract No: HR0011-18-9-0033
+!>    - DARPA MoDyL project "A Data-Driven, Operator-Theoretic
+!>    Framework for Space-Time Analysis of Process Dynamics"
+!>    Contract No: HR0011-16-C-0116
+!>    Any opinions, findings and conclusions or recommendations
+!>    expressed in this material are those of the author and
+!>    do not necessarily reflect the views of the DARPA SBIR
+!>    Program Office
+!>    \endverbatim
+!......................................................................
+!>    \par Distribution Statement A:
+!     ==============================
+!>    \verbatim
+!>    Approved for Public Release, Distribution Unlimited.
+!>    Cleared by DARPA on September 29, 2022
+!>    \endverbatim
+!......................................................................
+!     Arguments
+!     =========
+!
+!>    \param[in] JOBS
+!>    \verbatim
+!>    JOBS (input) is CHARACTER*1
+!>    Determines whether the initial data snapshots are scaled
+!>    by a diagonal matrix.
+!>    'S' :: The data snapshots matrices X and Y are multiplied
+!>           with a diagonal matrix D so that X*D has unit
+!>           nonzero columns (in the Euclidean 2-norm)
+!>    'C' :: The snapshots are scaled as with the 'S' option.
+!>           If it is found that an i-th column of X is zero
+!>           vector and the corresponding i-th column of Y is
+!>           non-zero, then the i-th column of Y is set to
+!>           zero and a warning flag is raised.
+!>    'Y' :: The data snapshots matrices X and Y are multiplied
+!>           by a diagonal matrix D so that Y*D has unit
+!>           nonzero columns (in the Euclidean 2-norm)
+!>    'N' :: No data scaling.
+!>    \endverbatim
+!.....
+!>    \param[in] JOBZ
+!>    \verbatim
+!>    JOBZ (input) CHARACTER*1
+!>    Determines whether the eigenvectors (Koopman modes) will
+!>    be computed.
+!>    'V' :: The eigenvectors (Koopman modes) will be computed
+!>           and returned in the matrix Z.
+!>           See the description of Z.
+!>    'F' :: The eigenvectors (Koopman modes) will be returned
+!>           in factored form as the product X(:,1:K)*W, where X
+!>           contains a POD basis (leading left singular vectors
+!>           of the data matrix X) and W contains the eigenvectors
+!>           of the corresponding Rayleigh quotient.
+!>           See the descriptions of K, X, W, Z.
+!>    'N' :: The eigenvectors are not computed.
+!>    \endverbatim
+!.....
+!>    \param[in] JOBR
+!>    \verbatim
+!>    JOBR (input) CHARACTER*1
+!>    Determines whether to compute the residuals.
+!>    'R' :: The residuals for the computed eigenpairs will be
+!>           computed and stored in the array RES.
+!>           See the description of RES.
+!>           For this option to be legal, JOBZ must be 'V'.
+!>    'N' :: The residuals are not computed.
+!>    \endverbatim
+!.....
+!>    \param[in] JOBF
+!>    \verbatim
+!>    JOBF (input) CHARACTER*1
+!>    Specifies whether to store information needed for post-
+!>    processing (e.g. computing refined Ritz vectors)
+!>    'R' :: The matrix needed for the refinement of the Ritz
+!>           vectors is computed and stored in the array B.
+!>           See the description of B.
+!>    'E' :: The unscaled eigenvectors of the Exact DMD are
+!>           computed and returned in the array B. See the
+!>           description of B.
+!>    'N' :: No eigenvector refinement data is computed.
+!>    \endverbatim
+!.....
+!>    \param[in] WHTSVD
+!>    \verbatim
+!>    WHTSVD (input) INTEGER, WHTSVD in { 1, 2, 3, 4 }
+!>    Allows for a selection of the SVD algorithm from the
+!>    LAPACK library.
+!>    1 :: DGESVD (the QR SVD algorithm)
+!>    2 :: DGESDD (the Divide and Conquer algorithm; if enough
+!>         workspace available, this is the fastest option)
+!>    3 :: DGESVDQ (the preconditioned QR SVD; this and 4
+!>         are the most accurate options)
+!>    4 :: DGEJSV (the preconditioned Jacobi SVD; this and 3
+!>         are the most accurate options)
+!>    For the four methods above, a significant difference in
+!>    the accuracy of small singular values is possible if
+!>    the snapshots vary in norm so that X is severely
+!>    ill-conditioned. If small (smaller than EPS*||X||)
+!>    singular values are of interest and JOBS=='N',  then
+!>    the options (3, 4) give the most accurate results, where
+!>    the option 4 is slightly better and with stronger
+!>    theoretical background.
+!>    If JOBS=='S', i.e. the columns of X will be normalized,
+!>    then all methods give nearly equally accurate results.
+!>    \endverbatim
+!.....
+!>    \param[in] M
+!>    \verbatim
+!>    M (input) INTEGER, M>= 0
+!>    The state space dimension (the row dimension of X, Y).
+!>    \endverbatim
+!.....
+!>    \param[in] N
+!>    \verbatim
+!>    N (input) INTEGER, 0 <= N <= M
+!>    The number of data snapshot pairs
+!>    (the number of columns of X and Y).
+!>    \endverbatim
+!.....
+!>    \param[in,out] X
+!>    \verbatim
+!>    X (input/output) REAL(KIND=WP) M-by-N array
+!>    > On entry, X contains the data snapshot matrix X. It is
+!>    assumed that the column norms of X are in the range of
+!>    the normalized floating point numbers.
+!>    < On exit, the leading K columns of X contain a POD basis,
+!>    i.e. the leading K left singular vectors of the input
+!>    data matrix X, U(:,1:K). All N columns of X contain all
+!>    left singular vectors of the input matrix X.
+!>    See the descriptions of K, Z and W.
+!>    \endverbatim
+!.....
+!>    \param[in] LDX
+!>    \verbatim
+!>    LDX (input) INTEGER, LDX >= M
+!>    The leading dimension of the array X.
+!>    \endverbatim
+!.....
+!>    \param[in,out] Y
+!>    \verbatim
+!>    Y (input/workspace/output) REAL(KIND=WP) M-by-N array
+!>    > On entry, Y contains the data snapshot matrix Y
+!>    < On exit,
+!>    If JOBR == 'R', the leading K columns of Y  contain
+!>    the residual vectors for the computed Ritz pairs.
+!>    See the description of RES.
+!>    If JOBR == 'N', Y contains the original input data,
+!>                    scaled according to the value of JOBS.
+!>    \endverbatim
+!.....
+!>    \param[in] LDY
+!>    \verbatim
+!>    LDY (input) INTEGER , LDY >= M
+!>    The leading dimension of the array Y.
+!>    \endverbatim
+!.....
+!>    \param[in] NRNK
+!>    \verbatim
+!>    NRNK (input) INTEGER
+!>    Determines how the numerical rank is computed,
+!>    i.e. how small singular values of the input
+!>    matrix X are truncated. On input, if
+!>    NRNK = -1 :: i-th singular value sigma(i) is truncated
+!>                 if sigma(i) <= TOL*sigma(1).
+!>                 This option is recommended.
+!>    NRNK = -2 :: i-th singular value sigma(i) is truncated
+!>                 if sigma(i) <= TOL*sigma(i-1)
+!>                 This option is included for R&D purposes.
+!>                 It requires highly accurate SVD, which
+!>                 may not be feasible.
+!>
+!>    The numerical rank can be enforced by using positive
+!>    value of NRNK as follows:
+!>    0 < NRNK <= N :: at most NRNK largest singular values
+!>    will be used. If the number of the computed nonzero
+!>    singular values is less than NRNK, then only those
+!>    nonzero values will be used and the actually used
+!>    dimension is less than NRNK. The actual number of
+!>    the nonzero singular values is returned in the variable
+!>    K. See the descriptions of TOL and  K.
+!>    \endverbatim
+!.....
+!>    \param[in] TOL
+!>    \verbatim
+!>    TOL (input) REAL(KIND=WP), 0 <= TOL < 1
+!>    The tolerance for truncating small singular values.
+!>    See the description of NRNK.
+!>    \endverbatim
+!.....
+!>    \param[out] K
+!>    \verbatim
+!>    K (output) INTEGER,  0 <= K <= N
+!>    The dimension of the POD basis for the data snapshot
+!>    matrix X and the number of the computed Ritz pairs.
+!>    The value of K is determined according to the rule set
+!>    by the parameters NRNK and TOL.
+!>    See the descriptions of NRNK and TOL.
+!>    \endverbatim
+!.....
+!>    \param[out] REIG
+!>    \verbatim
+!>    REIG (output) REAL(KIND=WP) N-by-1 array
+!>    The leading K (K<=N) entries of REIG contain
+!>    the real parts of the computed eigenvalues
+!>    REIG(1:K) + sqrt(-1)*IMEIG(1:K).
+!>    See the descriptions of K, IMEIG, and Z.
+!>    \endverbatim
+!.....
+!>    \param[out] IMEIG
+!>    \verbatim
+!>    IMEIG (output) REAL(KIND=WP) N-by-1 array
+!>    The leading K (K<=N) entries of IMEIG contain
+!>    the imaginary parts of the computed eigenvalues
+!>    REIG(1:K) + sqrt(-1)*IMEIG(1:K).
+!>    The eigenvalues are determined as follows:
+!>    If IMEIG(i) == 0, then the corresponding eigenvalue is
+!>    real, LAMBDA(i) = REIG(i).
+!>    If IMEIG(i)>0, then the corresponding complex
+!>    conjugate pair of eigenvalues reads
+!>    LAMBDA(i)   = REIG(i) + sqrt(-1)*IMEIG(i)
+!>    LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMEIG(i)
+!>    That is, complex conjugate pairs have consecutive
+!>    indices (i,i+1), with the positive imaginary part
+!>    listed first.
+!>    See the descriptions of K, REIG, and Z.
+!>    \endverbatim
+!.....
+!>    \param[out] Z
+!>    \verbatim
+!>    Z (workspace/output) REAL(KIND=WP)  M-by-N array
+!>    If JOBZ =='V' then
+!>       Z contains real Ritz vectors as follows:
+!>       If IMEIG(i)=0, then Z(:,i) is an eigenvector of
+!>       the i-th Ritz value; ||Z(:,i)||_2=1.
+!>       If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then
+!>       [Z(:,i) Z(:,i+1)] span an invariant subspace and
+!>       the Ritz values extracted from this subspace are
+!>       REIG(i) + sqrt(-1)*IMEIG(i) and
+!>       REIG(i) - sqrt(-1)*IMEIG(i).
+!>       The corresponding eigenvectors are
+!>       Z(:,i) + sqrt(-1)*Z(:,i+1) and
+!>       Z(:,i) - sqrt(-1)*Z(:,i+1), respectively.
+!>       || Z(:,i:i+1)||_F = 1.
+!>    If JOBZ == 'F', then the above descriptions hold for
+!>    the columns of X(:,1:K)*W(1:K,1:K), where the columns
+!>    of W(1:k,1:K) are the computed eigenvectors of the
+!>    K-by-K Rayleigh quotient. The columns of W(1:K,1:K)
+!>    are similarly structured: If IMEIG(i) == 0 then
+!>    X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0
+!>    then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and
+!>         X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1)
+!>    are the eigenvectors of LAMBDA(i), LAMBDA(i+1).
+!>    See the descriptions of REIG, IMEIG, X and W.
+!>    \endverbatim
+!.....
+!>    \param[in] LDZ
+!>    \verbatim
+!>    LDZ (input) INTEGER , LDZ >= M
+!>    The leading dimension of the array Z.
+!>    \endverbatim
+!.....
+!>    \param[out] RES
+!>    \verbatim
+!>    RES (output) REAL(KIND=WP) N-by-1 array
+!>    RES(1:K) contains the residuals for the K computed
+!>    Ritz pairs.
+!>    If LAMBDA(i) is real, then
+!>       RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i) ||_2.
+!>    If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair
+!>    then
+!>    RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F
+!>    where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ]
+!>              [-imag(LAMBDA(i)) real(LAMBDA(i)) ].
+!>    It holds that
+!>    RES(i)   = || A*ZC(:,i)   - LAMBDA(i)  *ZC(:,i)   ||_2
+!>    RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2
+!>    where ZC(:,i)   =  Z(:,i) + sqrt(-1)*Z(:,i+1)
+!>          ZC(:,i+1) =  Z(:,i) - sqrt(-1)*Z(:,i+1)
+!>    See the description of REIG, IMEIG and Z.
+!>    \endverbatim
+!.....
+!>    \param[out] B
+!>    \verbatim
+!>    B (output) REAL(KIND=WP)  M-by-N array.
+!>    If JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can
+!>    be used for computing the refined vectors; see further
+!>    details in the provided references.
+!>    If JOBF == 'E', B(1:M,1:K) contains
+!>    A*U(:,1:K)*W(1:K,1:K), which are the vectors from the
+!>    Exact DMD, up to scaling by the inverse eigenvalues.
+!>    If JOBF =='N', then B is not referenced.
+!>    See the descriptions of X, W, K.
+!>    \endverbatim
+!.....
+!>    \param[in] LDB
+!>    \verbatim
+!>    LDB (input) INTEGER, LDB >= M
+!>    The leading dimension of the array B.
+!>    \endverbatim
+!.....
+!>    \param[out] W
+!>    \verbatim
+!>    W (workspace/output) REAL(KIND=WP) N-by-N array
+!>    On exit, W(1:K,1:K) contains the K computed
+!>    eigenvectors of the matrix Rayleigh quotient (real and
+!>    imaginary parts for each complex conjugate pair of the
+!>    eigenvalues). The Ritz vectors (returned in Z) are the
+!>    product of X (containing a POD basis for the input
+!>    matrix X) and W. See the descriptions of K, S, X and Z.
+!>    W is also used as a workspace to temporarily store the
+!>    right singular vectors of X.
+!>    \endverbatim
+!.....
+!>    \param[in] LDW
+!>    \verbatim
+!>    LDW (input) INTEGER, LDW >= N
+!>    The leading dimension of the array W.
+!>    \endverbatim
+!.....
+!>    \param[out] S
+!>    \verbatim
+!>    S (workspace/output) REAL(KIND=WP) N-by-N array
+!>    The array S(1:K,1:K) is used for the matrix Rayleigh
+!>    quotient. This content is overwritten during
+!>    the eigenvalue decomposition by DGEEV.
+!>    See the description of K.
+!>    \endverbatim
+!.....
+!>    \param[in] LDS
+!>    \verbatim
+!>    LDS (input) INTEGER, LDS >= N
+!>    The leading dimension of the array S.
+!>    \endverbatim
+!.....
+!>    \param[out] WORK
+!>    \verbatim
+!>    WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array
+!>    On exit, WORK(1:N) contains the singular values of
+!>    X (for JOBS=='N') or column scaled X (JOBS=='S', 'C').
+!>    If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain
+!>    scaling factor WORK(N+2)/WORK(N+1) used to scale X
+!>    and Y to avoid overflow in the SVD of X.
+!>    This may be of interest if the scaling option is off
+!>    and as many as possible smallest eigenvalues are
+!>    desired to the highest feasible accuracy.
+!>    If the call to DGEDMD is only workspace query, then
+!>    WORK(1) contains the minimal workspace length and
+!>    WORK(2) is the optimal workspace length. Hence, the
+!>    length of WORK is at least 2.
+!>    See the description of LWORK.
+!>    \endverbatim
+!.....
+!>    \param[in] LWORK
+!>    \verbatim
+!>    LWORK (input) INTEGER
+!>    The minimal length of the workspace vector WORK.
+!>    LWORK is calculated as follows:
+!>    If WHTSVD == 1 ::
+!>       If JOBZ == 'V', then
+!>       LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)).
+!>       If JOBZ == 'N'  then
+!>       LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)).
+!>       Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal
+!>       workspace length of DGESVD.
+!>    If WHTSVD == 2 ::
+!>       If JOBZ == 'V', then
+!>       LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N))
+!>       If JOBZ == 'N', then
+!>       LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N))
+!>       Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the
+!>       minimal workspace length of DGESDD.
+!>    If WHTSVD == 3 ::
+!>       If JOBZ == 'V', then
+!>       LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N))
+!>       If JOBZ == 'N', then
+!>       LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N))
+!>       Here LWORK_SVD = N+M+MAX(3*N+1,
+!>                       MAX(1,3*N+M,5*N),MAX(1,N))
+!>       is the minimal workspace length of DGESVDQ.
+!>    If WHTSVD == 4 ::
+!>       If JOBZ == 'V', then
+!>       LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N))
+!>       If JOBZ == 'N', then
+!>       LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N))
+!>       Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the
+!>       minimal workspace length of DGEJSV.
+!>    The above expressions are not simplified in order to
+!>    make the usage of WORK more transparent, and for
+!>    easier checking. In any case, LWORK >= 2.
+!>    If on entry LWORK = -1, then a workspace query is
+!>    assumed and the procedure only computes the minimal
+!>    and the optimal workspace lengths for both WORK and
+!>    IWORK. See the descriptions of WORK and IWORK.
+!>    \endverbatim
+!.....
+!>    \param[out] IWORK
+!>    \verbatim
+!>    IWORK (workspace/output) INTEGER LIWORK-by-1 array
+!>    Workspace that is required only if WHTSVD equals
+!>    2 , 3 or 4. (See the description of WHTSVD).
+!>    If on entry LWORK =-1 or LIWORK=-1, then the
+!>    minimal length of IWORK is computed and returned in
+!>    IWORK(1). See the description of LIWORK.
+!>    \endverbatim
+!.....
+!>    \param[in] LIWORK
+!>    \verbatim
+!>    LIWORK (input) INTEGER
+!>    The minimal length of the workspace vector IWORK.
+!>    If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1
+!>    If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N))
+!>    If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1)
+!>    If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N)
+!>    If on entry LIWORK = -1, then a workspace query is
+!>    assumed and the procedure only computes the minimal
+!>    and the optimal workspace lengths for both WORK and
+!>    IWORK. See the descriptions of WORK and IWORK.
+!>    \endverbatim
+!.....
+!>    \param[out] INFO
+!>    \verbatim
+!>    INFO (output) INTEGER
+!>    -i < 0 :: On entry, the i-th argument had an
+!>              illegal value
+!>       = 0 :: Successful return.
+!>       = 1 :: Void input. Quick exit (M=0 or N=0).
+!>       = 2 :: The SVD computation of X did not converge.
+!>              Suggestion: Check the input data and/or
+!>              repeat with different WHTSVD.
+!>       = 3 :: The computation of the eigenvalues did not
+!>              converge.
+!>       = 4 :: If data scaling was requested on input and
+!>              the procedure found inconsistency in the data
+!>              such that for some column index i,
+!>              X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set
+!>              to zero if JOBS=='C'. The computation proceeds
+!>              with original or modified data and warning
+!>              flag is set with INFO=4.
+!>    \endverbatim
+!
+!  Authors:
+!  ========
+!
+!> \author Zlatko Drmac
+!
+!> \ingroup gedmd
+!
+!.............................................................
+!.............................................................
+      SUBROUTINE DGEDMD( JOBS, JOBZ, JOBR, JOBF,  WHTSVD,  &
+                         M, N, X, LDX, Y, LDY, NRNK, TOL,  &
+                         K, REIG,  IMEIG,   Z, LDZ,  RES,  &
+                         B, LDB, W,  LDW,   S, LDS,        &
+                         WORK, LWORK, IWORK, LIWORK, INFO )
+!
+!  -- LAPACK driver routine                                           --
+!
+!  -- LAPACK is a software package provided by University of          --
+!  -- Tennessee, University of California Berkeley, University of     --
+!  -- Colorado Denver and NAG Ltd..                                   --
+!
+!.....
+      USE                   iso_fortran_env
+      IMPLICIT NONE
+      INTEGER, PARAMETER :: WP = real64
+!
+!     Scalar arguments
+!     ~~~~~~~~~~~~~~~~
+      CHARACTER, INTENT(IN)   :: JOBS,   JOBZ,  JOBR,  JOBF
+      INTEGER,   INTENT(IN)   :: WHTSVD, M, N,   LDX,  LDY, &
+                                 NRNK, LDZ, LDB, LDW,  LDS, &
+                                 LWORK,  LIWORK
+      INTEGER,   INTENT(OUT)  :: K, INFO
+      REAL(KIND=WP), INTENT(IN)  :: TOL
+!
+!     Array arguments
+!     ~~~~~~~~~~~~~~~
+      REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*)
+      REAL(KIND=WP), INTENT(OUT)   :: Z(LDZ,*), B(LDB,*), &
+                                      W(LDW,*), S(LDS,*)
+      REAL(KIND=WP), INTENT(OUT)   :: REIG(*),  IMEIG(*), &
+                                      RES(*)
+      REAL(KIND=WP), INTENT(OUT)   :: WORK(*)
+      INTEGER,       INTENT(OUT)   :: IWORK(*)
+!
+!     Parameters
+!     ~~~~~~~~~~
+      REAL(KIND=WP), PARAMETER ::  ONE = 1.0_WP
+      REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP
+!
+!     Local scalars
+!     ~~~~~~~~~~~~~
+      REAL(KIND=WP) :: OFL,    ROOTSC, SCALE,  SMALL,  &
+                       SSUM,   XSCL1,  XSCL2
+      INTEGER       :: i,   j, IMINWR,  INFO1, INFO2,  &
+                       LWRKEV, LWRSDD, LWRSVD,         &
+                       LWRSVQ, MLWORK, MWRKEV, MWRSDD, &
+                       MWRSVD, MWRSVJ, MWRSVQ, NUMRNK, &
+                       OLWORK
+      LOGICAL       :: BADXY,  LQUERY, SCCOLX, SCCOLY, &
+                       WNTEX,  WNTREF, WNTRES, WNTVEC
+      CHARACTER     :: JOBZL,  T_OR_N
+      CHARACTER     :: JSVOPT
+!
+!     Local arrays
+!     ~~~~~~~~~~~~
+      REAL(KIND=WP) :: AB(2,2), RDUMMY(2), RDUMMY2(2)
+!
+!     External functions (BLAS and LAPACK)
+!     ~~~~~~~~~~~~~~~~~
+      REAL(KIND=WP) DLANGE, DLAMCH, DNRM2
+      EXTERNAL      DLANGE, DLAMCH, DNRM2, IDAMAX
+      INTEGER       IDAMAX
+      LOGICAL       DISNAN, LSAME
+      EXTERNAL      DISNAN, LSAME
+!
+!     External subroutines (BLAS and LAPACK)
+!     ~~~~~~~~~~~~~~~~~~~~
+      EXTERNAL      DAXPY,  DGEMM,  DSCAL
+      EXTERNAL      DGEEV,  DGEJSV, DGESDD, DGESVD, DGESVDQ, &
+                    DLACPY, DLASCL, DLASSQ, XERBLA
+!
+!     Intrinsic functions
+!     ~~~~~~~~~~~~~~~~~~~
+      INTRINSIC     DBLE, INT, MAX, SQRT
+!............................................................
+!
+!    Test the input arguments
+!
+      WNTRES = LSAME(JOBR,'R')
+      SCCOLX = LSAME(JOBS,'S') .OR. LSAME(JOBS,'C')
+      SCCOLY = LSAME(JOBS,'Y')
+      WNTVEC = LSAME(JOBZ,'V')
+      WNTREF = LSAME(JOBF,'R')
+      WNTEX  = LSAME(JOBF,'E')
+      INFO   = 0
+      LQUERY = ( ( LWORK == -1 ) .OR. ( LIWORK == -1 ) )
+!
+      IF ( .NOT. (SCCOLX .OR. SCCOLY .OR. &
+                                  LSAME(JOBS,'N')) )   THEN
+          INFO = -1
+      ELSE IF ( .NOT. (WNTVEC .OR. LSAME(JOBZ,'N')        &
+                              .OR. LSAME(JOBZ,'F')) )  THEN
+          INFO = -2
+      ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR.  &
+                ( WNTRES .AND. (.NOT.WNTVEC) ) )       THEN
+          INFO = -3
+      ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR.             &
+                LSAME(JOBF,'N') ) )                    THEN
+          INFO = -4
+      ELSE IF ( .NOT.((WHTSVD == 1) .OR. (WHTSVD == 2) .OR.  &
+                      (WHTSVD == 3) .OR. (WHTSVD == 4) )) THEN
+          INFO = -5
+      ELSE IF ( M < 0 )   THEN
+          INFO = -6
+      ELSE IF ( ( N < 0 ) .OR. ( N > M ) ) THEN
+          INFO = -7
+      ELSE IF ( LDX < M ) THEN
+          INFO = -9
+      ELSE IF ( LDY < M ) THEN
+          INFO = -11
+      ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR. &
+                ((NRNK >= 1).AND.(NRNK <=N ))) )      THEN
+          INFO = -12
+      ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) )  THEN
+          INFO = -13
+      ELSE IF ( LDZ < M ) THEN
+          INFO = -18
+      ELSE IF ( (WNTREF .OR. WNTEX ) .AND. ( LDB < M ) ) THEN
+          INFO = -21
+      ELSE IF ( LDW < N ) THEN
+          INFO = -23
+      ELSE IF ( LDS < N ) THEN
+          INFO = -25
+      END IF
+!
+      IF ( INFO == 0 ) THEN
+          ! Compute the minimal and the optimal workspace
+          ! requirements. Simulate running the code and
+          ! determine minimal and optimal sizes of the
+          ! workspace at any moment of the run.
+         IF ( N == 0 ) THEN
+             ! Quick return. All output except K is void.
+             ! INFO=1 signals the void input.
+             ! In case of a workspace query, the default
+             ! minimal workspace lengths are returned.
+            IF ( LQUERY ) THEN
+                IWORK(1) = 1
+                WORK(1)  = 2
+                WORK(2)  = 2
+            ELSE
+               K = 0
+            END IF
+            INFO = 1
+            RETURN
+         END IF
+         MLWORK = MAX(2,N)
+         OLWORK = MAX(2,N)
+         IMINWR = 1
+         SELECT CASE ( WHTSVD )
+         CASE (1)
+             ! The following is specified as the minimal
+             ! length of WORK in the definition of DGESVD:
+             ! MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N))
+             MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N))
+             MLWORK = MAX(MLWORK,N + MWRSVD)
+             IF ( LQUERY ) THEN
+                CALL DGESVD( 'O', 'S', M, N, X, LDX, WORK, &
+                           B, LDB, W, LDW, RDUMMY, -1, INFO1 )
+                LWRSVD = MAX( MWRSVD, INT( RDUMMY(1) ) )
+                OLWORK = MAX(OLWORK,N + LWRSVD)
+             END IF
+         CASE (2)
+             ! The following is specified as the minimal
+             ! length of WORK in the definition of DGESDD:
+             ! MWRSDD = 3*MIN(M,N)*MIN(M,N) +
+             ! MAX( MAX(M,N),5*MIN(M,N)*MIN(M,N)+4*MIN(M,N) )
+             ! IMINWR = 8*MIN(M,N)
+             MWRSDD = 3*MIN(M,N)*MIN(M,N) +                &
+              MAX( MAX(M,N),5*MIN(M,N)*MIN(M,N)+4*MIN(M,N) )
+             MLWORK = MAX(MLWORK,N + MWRSDD)
+             IMINWR = 8*MIN(M,N)
+             IF ( LQUERY ) THEN
+                CALL DGESDD( 'O', M, N, X, LDX, WORK, B,     &
+                     LDB, W, LDW, RDUMMY, -1, IWORK, INFO1 )
+                LWRSDD = MAX( MWRSDD, INT( RDUMMY(1) ) )
+                OLWORK = MAX(OLWORK,N + LWRSDD)
+             END IF
+         CASE (3)
+             !LWQP3 = 3*N+1
+             !LWORQ = MAX(N, 1)
+             !MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N))
+             !MWRSVQ = N + MAX( LWQP3, MWRSVD, LWORQ ) + MAX(M,2)
+             !MLWORK = N +  MWRSVQ
+             !IMINWR = M+N-1
+             CALL DGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, &
+                             X, LDX, WORK, Z, LDZ, W, LDW,   &
+                             NUMRNK, IWORK, LIWORK, RDUMMY,  &
+                             -1, RDUMMY2, -1, INFO1 )
+             IMINWR = IWORK(1)
+             MWRSVQ = INT(RDUMMY(2))
+             MLWORK = MAX(MLWORK,N+MWRSVQ+INT(RDUMMY2(1)))
+             IF ( LQUERY ) THEN
+                LWRSVQ = MAX( MWRSVQ, INT(RDUMMY(1)) )
+                OLWORK = MAX(OLWORK,N+LWRSVQ+INT(RDUMMY2(1)))
+             END IF
+         CASE (4)
+             JSVOPT = 'J'
+             !MWRSVJ = MAX( 7, 2*M+N, 6*N+2*N*N ) ! for JSVOPT='V'
+             MWRSVJ = MAX( 7, 2*M+N, 4*N+N*N, 2*N+N*N+6 )
+             MLWORK = MAX(MLWORK,N+MWRSVJ)
+             IMINWR = MAX( 3, M+3*N )
+             IF ( LQUERY ) THEN
+                OLWORK =  MAX(OLWORK,N+MWRSVJ)
+             END IF
+         END SELECT
+         IF ( WNTVEC .OR. WNTEX .OR. LSAME(JOBZ,'F') ) THEN
+             JOBZL = 'V'
+         ELSE
+             JOBZL = 'N'
+         END IF
+         ! Workspace calculation to the DGEEV call
+         IF ( LSAME(JOBZL,'V') ) THEN
+             MWRKEV = MAX( 1, 4*N )
+         ELSE
+             MWRKEV = MAX( 1, 3*N )
+         END IF
+         MLWORK = MAX(MLWORK,N+MWRKEV)
+         IF ( LQUERY ) THEN
+                CALL DGEEV( 'N', JOBZL, N, S, LDS, REIG, &
+                    IMEIG, W, LDW, W, LDW, RDUMMY, -1, INFO1 )
+                LWRKEV = MAX( MWRKEV, INT(RDUMMY(1)) )
+                OLWORK = MAX( OLWORK, N+LWRKEV )
+         END IF
+!
+         IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -29
+         IF (  LWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -27
+      END IF
+!
+      IF( INFO /= 0 ) THEN
+         CALL XERBLA( 'DGEDMD', -INFO )
+         RETURN
+      ELSE IF ( LQUERY ) THEN
+!     Return minimal and optimal workspace sizes
+          IWORK(1) = IMINWR
+          WORK(1)  = MLWORK
+          WORK(2)  = OLWORK
+          RETURN
+      END IF
+!............................................................
+!
+      OFL   = DLAMCH('O')
+      SMALL = DLAMCH('S')
+      BADXY = .FALSE.
+!
+!     <1> Optional scaling of the snapshots (columns of X, Y)
+!     ==========================================================
+      IF ( SCCOLX ) THEN
+          ! The columns of X will be normalized.
+          ! To prevent overflows, the column norms of X are
+          ! carefully computed using DLASSQ.
+          K = 0
+          DO i = 1, N
+            !WORK(i) = DNRM2( M, X(1,i), 1 )
+            SSUM  = ONE
+            SCALE = ZERO
+            CALL DLASSQ( M, X(1,i), 1, SCALE, SSUM )
+            IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN
+                K    =  0
+                INFO = -8
+                CALL XERBLA('DGEDMD',-INFO)
+            END IF
+            IF ( (SCALE /= ZERO) .AND. (SSUM /= ZERO) ) THEN
+               ROOTSC = SQRT(SSUM)
+               IF ( SCALE .GE. (OFL / ROOTSC) ) THEN
+!                 Norm of X(:,i) overflows. First, X(:,i)
+!                 is scaled by
+!                 ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2.
+!                 Next, the norm of X(:,i) is stored without
+!                 overflow as WORK(i) = - SCALE * (ROOTSC/M),
+!                 the minus sign indicating the 1/M factor.
+!                 Scaling is performed without overflow, and
+!                 underflow may occur in the smallest entries
+!                 of X(:,i). The relative backward and forward
+!                 errors are small in the ell_2 norm.
+                  CALL DLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, &
+                               M, 1, X(1,i), M, INFO2 )
+                  WORK(i) = - SCALE * ( ROOTSC / DBLE(M) )
+               ELSE
+!                 X(:,i) will be scaled to unit 2-norm
+                  WORK(i) =   SCALE * ROOTSC
+                  CALL DLASCL( 'G',0, 0, WORK(i), ONE, M, 1, &
+                               X(1,i), M, INFO2 )              ! LAPACK CALL
+!                 X(1:M,i) = (ONE/WORK(i)) * X(1:M,i)          ! INTRINSIC
+               END IF
+            ELSE
+               WORK(i) = ZERO
+               K = K + 1
+            END IF
+          END DO
+          IF ( K == N ) THEN
+          ! All columns of X are zero. Return error code -8.
+          ! (the 8th input variable had an illegal value)
+          K = 0
+          INFO = -8
+          CALL XERBLA('DGEDMD',-INFO)
+          RETURN
+          END IF
+          DO i = 1, N
+!           Now, apply the same scaling to the columns of Y.
+            IF ( WORK(i) >  ZERO ) THEN
+                CALL DSCAL( M, ONE/WORK(i), Y(1,i), 1 )  ! BLAS CALL
+!               Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i)      ! INTRINSIC
+            ELSE IF ( WORK(i) < ZERO ) THEN
+                CALL DLASCL( 'G', 0, 0, -WORK(i),          &
+                     ONE/DBLE(M), M, 1, Y(1,i), M, INFO2 ) ! LAPACK CALL
+            ELSE IF ( Y(IDAMAX(M, Y(1,i),1),i )  &
+                                            /= ZERO ) THEN
+!               X(:,i) is zero vector. For consistency,
+!               Y(:,i) should also be zero. If Y(:,i) is not
+!               zero, then the data might be inconsistent or
+!               corrupted. If JOBS == 'C', Y(:,i) is set to
+!               zero and a warning flag is raised.
+!               The computation continues but the
+!               situation will be reported in the output.
+                BADXY = .TRUE.
+                IF ( LSAME(JOBS,'C')) &
+                CALL DSCAL( M, ZERO, Y(1,i), 1 )  ! BLAS CALL
+            END IF
+          END DO
+      END IF
+  !
+      IF ( SCCOLY ) THEN
+          ! The columns of Y will be normalized.
+          ! To prevent overflows, the column norms of Y are
+          ! carefully computed using DLASSQ.
+          DO i = 1, N
+            !WORK(i) = DNRM2( M, Y(1,i), 1 )
+            SSUM  = ONE
+            SCALE = ZERO
+            CALL DLASSQ( M, Y(1,i), 1, SCALE, SSUM )
+            IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN
+                K    =  0
+                INFO = -10
+                CALL XERBLA('DGEDMD',-INFO)
+            END IF
+            IF ( SCALE /= ZERO  .AND. (SSUM /= ZERO) ) THEN
+               ROOTSC = SQRT(SSUM)
+               IF ( SCALE .GE. (OFL / ROOTSC) ) THEN
+!                 Norm of Y(:,i) overflows. First, Y(:,i)
+!                 is scaled by
+!                 ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2.
+!                 Next, the norm of Y(:,i) is stored without
+!                 overflow as WORK(i) = - SCALE * (ROOTSC/M),
+!                 the minus sign indicating the 1/M factor.
+!                 Scaling is performed without overflow, and
+!                 underflow may occur in the smallest entries
+!                 of Y(:,i). The relative backward and forward
+!                 errors are small in the ell_2 norm.
+                  CALL DLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, &
+                               M, 1, Y(1,i), M, INFO2 )
+                  WORK(i) = - SCALE * ( ROOTSC / DBLE(M) )
+               ELSE
+!                 Y(:,i) will be scaled to unit 2-norm
+                  WORK(i) =   SCALE * ROOTSC
+                  CALL DLASCL( 'G',0, 0, WORK(i), ONE, M, 1, &
+                               Y(1,i), M, INFO2 )              ! LAPACK CALL
+!                 Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i)          ! INTRINSIC
+               END IF
+            ELSE
+               WORK(i) = ZERO
+            END IF
+         END DO
+         DO i = 1, N
+!           Now, apply the same scaling to the columns of X.
+            IF ( WORK(i) >  ZERO ) THEN
+                CALL DSCAL( M, ONE/WORK(i), X(1,i), 1 )  ! BLAS CALL
+!               X(1:M,i) = (ONE/WORK(i)) * X(1:M,i)      ! INTRINSIC
+            ELSE IF ( WORK(i) < ZERO ) THEN
+                CALL DLASCL( 'G', 0, 0, -WORK(i),          &
+                     ONE/DBLE(M), M, 1, X(1,i), M, INFO2 ) ! LAPACK CALL
+            ELSE IF ( X(IDAMAX(M, X(1,i),1),i )  &
+                                           /= ZERO ) THEN
+!               Y(:,i) is zero vector.  If X(:,i) is not
+!               zero, then a warning flag is raised.
+!               The computation continues but the
+!               situation will be reported in the output.
+                BADXY = .TRUE.
+            END IF
+         END DO
+       END IF
+!
+!     <2> SVD of the data snapshot matrix X.
+!     =====================================
+!     The left singular vectors are stored in the array X.
+!     The right singular vectors are in the array W.
+!     The array W will later on contain the eigenvectors
+!     of a Rayleigh quotient.
+      NUMRNK = N
+      SELECT CASE ( WHTSVD )
+         CASE (1)
+             CALL DGESVD( 'O', 'S', M, N, X, LDX, WORK, B, &
+                  LDB, W, LDW, WORK(N+1), LWORK-N, INFO1 ) ! LAPACK CALL
+             T_OR_N = 'T'
+         CASE (2)
+            CALL DGESDD( 'O', M, N, X, LDX, WORK, B, LDB, W, &
+                 LDW, WORK(N+1), LWORK-N, IWORK, INFO1 )   ! LAPACK CALL
+            T_OR_N = 'T'
+         CASE (3)
+              CALL DGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, &
+                   X, LDX, WORK, Z, LDZ, W, LDW, &
+                   NUMRNK, IWORK, LIWORK, WORK(N+MAX(2,M)+1),&
+                   LWORK-N-MAX(2,M), WORK(N+1), MAX(2,M), INFO1)     ! LAPACK CALL
+              CALL DLACPY( 'A', M, NUMRNK, Z, LDZ, X, LDX )   ! LAPACK CALL
+         T_OR_N = 'T'
+         CASE (4)
+              CALL DGEJSV( 'F', 'U', JSVOPT, 'N', 'N', 'P', M, &
+                   N, X, LDX, WORK, Z, LDZ, W, LDW, &
+                   WORK(N+1), LWORK-N, IWORK, INFO1 )    ! LAPACK CALL
+              CALL DLACPY( 'A', M, N, Z, LDZ, X, LDX )   ! LAPACK CALL
+              T_OR_N = 'N'
+              XSCL1 = WORK(N+1)
+              XSCL2 = WORK(N+2)
+              IF ( XSCL1 /=  XSCL2 ) THEN
+                 ! This is an exceptional situation: if the
+                 ! data matrices are not scaled and the
+                 ! largest singular value of X overflows,
+                 ! DGEJSV can return the SVD in scaled form.
+                 ! The scaling factor can then be used
+                 ! to rescale the data (X and Y).
+                 CALL DLASCL( 'G', 0, 0, XSCL1, XSCL2, M, N, Y, LDY, INFO2  )
+              END IF
+      END SELECT
+!
+      IF ( INFO1 > 0 ) THEN
+         ! The SVD selected subroutine did not converge.
+         ! Return with an error code.
+         INFO = 2
+         RETURN
+      END IF
+!
+      IF ( WORK(1) == ZERO ) THEN
+          ! The largest computed singular value of (scaled)
+          ! X is zero. Return error code -8
+          ! (the 8th input variable had an illegal value).
+          K = 0
+          INFO = -8
+          CALL XERBLA('DGEDMD',-INFO)
+          RETURN
+      END IF
+!
+      !<3> Determine the numerical rank of the data
+      !    snapshots matrix X. This depends on the
+      !    parameters NRNK and TOL.
+
+      SELECT CASE ( NRNK )
+          CASE ( -1 )
+               K = 1
+               DO i = 2, NUMRNK
+                 IF ( ( WORK(i) <= WORK(1)*TOL ) .OR. &
+                      ( WORK(i) <= SMALL ) ) EXIT
+                 K = K + 1
+               END DO
+          CASE ( -2 )
+               K = 1
+               DO i = 1, NUMRNK-1
+                 IF ( ( WORK(i+1) <= WORK(i)*TOL  ) .OR. &
+                      ( WORK(i) <= SMALL ) ) EXIT
+                 K = K + 1
+               END DO
+          CASE DEFAULT
+               K = 1
+               DO i = 2, NRNK
+                  IF ( WORK(i) <= SMALL ) EXIT
+                  K = K + 1
+               END DO
+          END SELECT
+      !   Now, U = X(1:M,1:K) is the SVD/POD basis for the
+      !   snapshot data in the input matrix X.
+
+      !<4> Compute the Rayleigh quotient S = U^T * A * U.
+      !    Depending on the requested outputs, the computation
+      !    is organized to compute additional auxiliary
+      !    matrices (for the residuals and refinements).
+      !
+      !    In all formulas below, we need V_k*Sigma_k^(-1)
+      !    where either V_k is in W(1:N,1:K), or V_k^T is in
+      !    W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)).
+      IF ( LSAME(T_OR_N, 'N') ) THEN
+          DO i = 1, K
+           CALL DSCAL( N, ONE/WORK(i), W(1,i), 1 )    ! BLAS CALL
+           ! W(1:N,i) = (ONE/WORK(i)) * W(1:N,i)      ! INTRINSIC
+          END DO
+      ELSE
+          ! This non-unit stride access is due to the fact
+          ! that DGESVD, DGESVDQ and DGESDD return the
+          ! transposed matrix of the right singular vectors.
+          !DO i = 1, K
+          ! CALL DSCAL( N, ONE/WORK(i), W(i,1), LDW )    ! BLAS CALL
+          ! ! W(i,1:N) = (ONE/WORK(i)) * W(i,1:N)      ! INTRINSIC
+          !END DO
+          DO i = 1, K
+              WORK(N+i) = ONE/WORK(i)
+          END DO
+          DO j = 1, N
+             DO i = 1, K
+                 W(i,j) = (WORK(N+i))*W(i,j)
+             END DO
+          END DO
+      END IF
+!
+      IF ( WNTREF ) THEN
+         !
+         ! Need A*U(:,1:K)=Y*V_k*inv(diag(WORK(1:K)))
+         ! for computing the refined Ritz vectors
+         ! (optionally, outside DGEDMD).
+          CALL DGEMM( 'N', T_OR_N, M, K, N, ONE, Y, LDY, W, &
+                      LDW, ZERO, Z, LDZ )                        ! BLAS CALL
+          ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(W(1:K,1:N)))  ! INTRINSIC, for T_OR_N=='T'
+          ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K))             ! INTRINSIC, for T_OR_N=='N'
+          !
+          ! At this point Z contains
+          ! A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and
+          ! this is needed for computing the residuals.
+          ! This matrix is  returned in the array B and
+          ! it can be used to compute refined Ritz vectors.
+          CALL DLACPY( 'A', M, K, Z, LDZ, B, LDB )   ! BLAS CALL
+          ! B(1:M,1:K) = Z(1:M,1:K)                  ! INTRINSIC
+
+          CALL DGEMM( 'T', 'N', K, K, M, ONE, X, LDX, Z, &
+                      LDZ, ZERO, S, LDS )                        ! BLAS CALL
+          ! S(1:K,1:K) = MATMUL(TRANSPOSE(X(1:M,1:K)),Z(1:M,1:K)) ! INTRINSIC
+          ! At this point S = U^T * A * U is the Rayleigh quotient.
+      ELSE
+        ! A * U(:,1:K) is not explicitly needed and the
+        ! computation is organized differently. The Rayleigh
+        ! quotient is computed more efficiently.
+        CALL DGEMM( 'T', 'N', K, N, M, ONE, X, LDX, Y, LDY, &
+                   ZERO, Z, LDZ )                                   ! BLAS CALL
+        ! Z(1:K,1:N) = MATMUL( TRANSPOSE(X(1:M,1:K)), Y(1:M,1:N) )  ! INTRINSIC
+        ! In the two DGEMM calls here, can use K for LDZ.
+        CALL DGEMM( 'N', T_OR_N, K, K, N, ONE, Z, LDZ, W, &
+                    LDW, ZERO, S, LDS )                         ! BLAS CALL
+        ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRINSIC, for T_OR_N=='T'
+        ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K)))          ! INTRINSIC, for T_OR_N=='N'
+        ! At this point S = U^T * A * U is the Rayleigh quotient.
+        ! If the residuals are requested, save scaled V_k into Z.
+        ! Recall that V_k or V_k^T is stored in W.
+        IF ( WNTRES .OR. WNTEX ) THEN
+          IF ( LSAME(T_OR_N, 'N') ) THEN
+              CALL DLACPY( 'A', N, K, W, LDW, Z, LDZ )
+          ELSE
+              CALL DLACPY( 'A', K, N, W, LDW, Z, LDZ )
+          END IF
+        END IF
+      END IF
+!
+      !<5> Compute the Ritz values and (if requested) the
+      !   right eigenvectors of the Rayleigh quotient.
+      !
+      CALL DGEEV( 'N', JOBZL, K, S, LDS, REIG, IMEIG, W, &
+                  LDW, W, LDW, WORK(N+1), LWORK-N, INFO1 )   ! LAPACK CALL
+      !
+      ! W(1:K,1:K) contains the eigenvectors of the Rayleigh
+      ! quotient. Even in the case of complex spectrum, all
+      ! computation is done in real arithmetic. REIG and
+      ! IMEIG are the real and the imaginary parts of the
+      ! eigenvalues, so that the spectrum is given as
+      ! REIG(:) + sqrt(-1)*IMEIG(:). Complex conjugate pairs
+      ! are listed at consecutive positions. For such a
+      ! complex conjugate pair of the eigenvalues, the
+      ! corresponding eigenvectors are also a complex
+      ! conjugate pair with the real and imaginary parts
+      ! stored column-wise in W at the corresponding
+      ! consecutive column indices. See the description of Z.
+      ! Also, see the description of DGEEV.
+      IF ( INFO1 > 0 ) THEN
+         ! DGEEV failed to compute the eigenvalues and
+         ! eigenvectors of the Rayleigh quotient.
+         INFO = 3
+         RETURN
+      END IF
+!
+      ! <6> Compute the eigenvectors (if requested) and,
+      ! the residuals (if requested).
+      !
+      IF ( WNTVEC .OR. WNTEX ) THEN
+      IF ( WNTRES ) THEN
+          IF ( WNTREF ) THEN
+            ! Here, if the refinement is requested, we have
+            ! A*U(:,1:K) already computed and stored in Z.
+            ! For the residuals, need Y = A * U(:,1:K) * W.
+            CALL DGEMM( 'N', 'N', M, K, K, ONE, Z, LDZ, W, &
+                       LDW, ZERO, Y, LDY )               ! BLAS CALL
+            ! Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K)       ! INTRINSIC
+            ! This frees Z; Y contains A * U(:,1:K) * W.
+          ELSE
+            ! Compute S = V_k * Sigma_k^(-1) * W, where
+            ! V_k * Sigma_k^(-1) is stored in Z
+            CALL DGEMM( T_OR_N, 'N', N, K, K, ONE, Z, LDZ, &
+                       W, LDW, ZERO, S, LDS)
+            ! Then, compute Z = Y * S =
+            ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) =
+            ! = A * U(:,1:K) * W(1:K,1:K)
+            CALL DGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, &
+                       LDS, ZERO, Z, LDZ)
+            ! Save a copy of Z into Y and free Z for holding
+            ! the Ritz vectors.
+            CALL DLACPY( 'A', M, K, Z, LDZ, Y, LDY )
+            IF ( WNTEX ) CALL DLACPY( 'A', M, K, Z, LDZ, B, LDB )
+          END IF
+      ELSE IF ( WNTEX ) THEN
+          ! Compute S = V_k * Sigma_k^(-1) * W, where
+            ! V_k * Sigma_k^(-1) is stored in Z
+            CALL DGEMM( T_OR_N, 'N', N, K, K, ONE, Z, LDZ, &
+                       W, LDW, ZERO, S, LDS )
+            ! Then, compute Z = Y * S =
+            ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) =
+            ! = A * U(:,1:K) * W(1:K,1:K)
+            CALL DGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, &
+                       LDS, ZERO, B, LDB )
+            ! The above call replaces the following two calls
+            ! that were used in the developing-testing phase.
+            ! CALL DGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, &
+            !           LDS, ZERO, Z, LDZ)
+            ! Save a copy of Z into B and free Z for holding
+            ! the Ritz vectors.
+            ! CALL DLACPY( 'A', M, K, Z, LDZ, B, LDB )
+      END IF
+!
+      ! Compute the real form of the Ritz vectors
+      IF ( WNTVEC ) CALL DGEMM( 'N', 'N', M, K, K, ONE, X, LDX, W, LDW, &
+                   ZERO, Z, LDZ )                           ! BLAS CALL
+      ! Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K))         ! INTRINSIC
+!
+      IF ( WNTRES ) THEN
+         i = 1
+         DO WHILE ( i <= K )
+            IF ( IMEIG(i) == ZERO ) THEN
+                ! have a real eigenvalue with real eigenvector
+                CALL DAXPY( M, -REIG(i), Z(1,i), 1, Y(1,i), 1 )       ! BLAS CALL
+                ! Y(1:M,i) = Y(1:M,i) - REIG(i) * Z(1:M,i)            ! INTRINSIC
+                RES(i) = DNRM2( M, Y(1,i), 1)                         ! BLAS CALL
+                i = i + 1
+            ELSE
+               ! Have a complex conjugate pair
+               ! REIG(i) +- sqrt(-1)*IMEIG(i).
+               ! Since all computation is done in real
+               ! arithmetic, the formula for the residual
+               ! is recast for real representation of the
+               ! complex conjugate eigenpair. See the
+               ! description of RES.
+               AB(1,1) =  REIG(i)
+               AB(2,1) = -IMEIG(i)
+               AB(1,2) =  IMEIG(i)
+               AB(2,2) =  REIG(i)
+               CALL DGEMM( 'N', 'N', M, 2, 2, -ONE, Z(1,i), &
+                           LDZ, AB, 2, ONE, Y(1,i), LDY )          ! BLAS CALL
+               ! Y(1:M,i:i+1) = Y(1:M,i:i+1) - Z(1:M,i:i+1) * AB   ! INTRINSIC
+               RES(i)   = DLANGE( 'F', M, 2, Y(1,i), LDY, &
+                                  WORK(N+1) )                      ! LAPACK CALL
+               RES(i+1) = RES(i)
+               i = i + 2
+            END IF
+         END DO
+      END IF
+      END IF
+!
+      IF ( WHTSVD == 4 ) THEN
+          WORK(N+1) = XSCL1
+          WORK(N+2) = XSCL2
+      END IF
+!
+!     Successful exit.
+      IF ( .NOT. BADXY ) THEN
+         INFO = 0
+      ELSE
+         ! A warning on possible data inconsistency.
+         ! This should be a rare event.
+         INFO = 4
+      END IF
+!............................................................
+      RETURN
+!     ......
+      END SUBROUTINE DGEDMD
diff --git a/lapack-netlib/SRC/dgedmdq.c b/lapack-netlib/SRC/dgedmdq.c
new file mode 100644
index 000000000..a743a3156
--- /dev/null
+++ b/lapack-netlib/SRC/dgedmdq.c
@@ -0,0 +1,1300 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64) /* BLASLONG/BLASULONG: long long under _WIN64, long everywhere else */
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64 /* blasint is BLASLONG when LAPACK_ILP64 is defined, otherwise int */
+typedef BLASLONG blasint;
+#if defined(_WIN64) /* blasabs picks the abs variant that matches blasint's width */
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer; /* integer type used by the helpers below; follows blasint */
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex; /* single-precision complex as a (re, im) struct */
+typedef struct { doublereal r, i; } doublecomplex; /* double-precision complex as a (re, im) struct */
+#ifdef _MSC_VER /* Cf/Cd: load a struct complex by value as the compiler's own complex type */
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} /* pointer cast only; assumes both types share one layout - TODO confirm */
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else /* non-MSVC compilers: same four helpers expressed with C99 _Complex */
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z)) /* lvalue view of *z as the compiler's complex type, used as an assignment target */
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1) /* integer truth constants */
+#define FALSE_ (0)
+
+/* Extern defaults to the keyword 'extern' unless it was already defined (originally noted as "for use with -E"). */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff: parameter structs, one per group of Fortran I/O statements named in the comments below. */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/* cilist: external read, write */
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/* icilist: internal read, write */
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/* olist: open */
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/* cllist: close */
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/* alist: rewind, backspace, endfile */
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inlist: inquire; each char* field is followed by a length field for it */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen; /* declared ftnint where the other length fields use ftnlen; both are int */
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points: one member per scalar type declared above */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist: describes one variable (name, address, dims, type code) */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist { /* a named group of nvars Vardesc pointers */
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x)) /* type-generic; replaces any function named abs from here on and evaluates x twice */
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b)) /* min/max evaluate the selected argument twice: avoid side effects */
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1) /* parsed as ((a) >> (b)) & 1: the value of bit b of a */
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b))) /* a with bit b cleared */
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b))) /* a with bit b set */
+
+#define abort_() { sig_die("Fortran abort routine called", 1); } /* terminates through sig_die */
+#define c_abs(z) (cabsf(Cf(z))) /* modulus of the single-precision complex *z */
+#define c_cos(R,Z) { pCf(R)=ccosf(Cf(Z)); } /* *R = cos(*Z); ccosf keeps the single-precision type, as c_exp/c_sin/c_sqrt do */
+#ifdef _MSC_VER /* c_div/z_div: *c = *a / *b. Here the quotient is formed on the (r, i) fields; operands are copied first so c may alias a or b. */
+#define c_div(c, a, b) { complex cdiv_a_ = *(a), cdiv_b_ = *(b); real cdiv_d_ = cdiv_b_.r*cdiv_b_.r + cdiv_b_.i*cdiv_b_.i; (c)->r = (cdiv_a_.r*cdiv_b_.r + cdiv_a_.i*cdiv_b_.i)/cdiv_d_; (c)->i = (cdiv_a_.i*cdiv_b_.r - cdiv_a_.r*cdiv_b_.i)/cdiv_d_; }
+#define z_div(c, a, b) { doublecomplex zdiv_a_ = *(a), zdiv_b_ = *(b); doublereal zdiv_d_ = zdiv_b_.r*zdiv_b_.r + zdiv_b_.i*zdiv_b_.i; (c)->r = (zdiv_a_.r*zdiv_b_.r + zdiv_a_.i*zdiv_b_.i)/zdiv_d_; (c)->i = (zdiv_a_.i*zdiv_b_.r - zdiv_a_.r*zdiv_b_.i)/zdiv_d_; }
+#else /* compilers with C99 _Complex divide natively and store through pCf/pCd */
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));} /* c_* macros: single-precision complex; the result is stored through pointer R */
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x))) /* d_*, r_* and i_* macros receive pointers and dereference them */
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); } /* *R = complex conjugate of *Z (double precision) */
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); } /* *R = complex conjugate of *Z (single precision) */
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 ) /* positive difference: a - b when a > b, otherwise 0 */
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) /* truncate toward zero */
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x))) /* same expression as d_int */
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) ) /* the constant is log10(e), so this is log10(*x) */
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x))) /* nearest integer, halves rounded away from zero; takes a value, not a pointer */
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a))) /* |a| with the sign of b; b == 0 counts as non-negative */
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n) /* the length argument is returned unchanged; s is ignored */
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) /* pow_XY: base of kind X raised to an exponent of kind Y, both passed by pointer */
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));} /* NOTE(review): A is converted with Cd() but B is only dereferenced; if B is a doublecomplex* this cannot compile - confirm */
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; } /* concatenates *np pieces into lpp, at most llp chars, blank-padding the rest */
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) /* compares only the first min(c,d) characters */
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } /* copies up to min(C,D) chars, stops at a NUL in B, adds no padding */
+#define sig_die(s, kill) { exit(1); } /* both arguments are ignored */
+#define s_stop(s, n) {exit(0);} /* both arguments are ignored */
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break; /* expands to a bare statement including the semicolon */
+#define mycycle_() continue; /* expands to a bare statement including the semicolon */
+#define myceiling_(w) {ceil(w)} /* NOTE(review): the braces make this unusable as an expression - confirm it is never expanded */
+#define myhuge_(w) {HUGE_VAL} /* NOTE(review): same brace issue as myceiling_ */
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) /* always dispatches to the double-precision dmaxloc_ */
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus /* L_fp: pointer to a function returning logical */
+typedef logical (*L_fp)(...); /* C++ needs the explicit variadic form */
+#else
+typedef logical (*L_fp)(); /* C: parameter list left unspecified */
+#endif
+
+static float spow_ui(float x, integer n) { /* x**n for an integer exponent, by repeated squaring */
+	float acc = 1.0; unsigned long int bits;
+	if (n == 0) return acc;            /* x**0 is 1 whatever x is */
+	if (n < 0) { n = -n; x = 1/x; }    /* negative power: continue with the reciprocal base */
+	/* Walk the exponent from its lowest bit upward; x always holds the */
+	/* original base raised to the power of two of the current bit. */
+	for (bits = n; ; x *= x) {
+		if (bits & 01) acc *= x;       /* this bit contributes a factor */
+		if ((bits >>= 1) == 0) break;  /* nothing left: skip the final squaring */
+	}
+	return acc;
+}
+static double dpow_ui(double x, integer n) { /* x**n for an integer exponent, by repeated squaring */
+	double acc = 1.0; unsigned long int bits;
+	if (n == 0) return acc;            /* x**0 is 1 whatever x is */
+	if (n < 0) { n = -n; x = 1/x; }    /* negative power: continue with the reciprocal base */
+	/* Walk the exponent from its lowest bit upward; x always holds the */
+	/* original base raised to the power of two of the current bit. */
+	for (bits = n; ; x *= x) {
+		if (bits & 01) acc *= x;       /* this bit contributes a factor */
+		if ((bits >>= 1) == 0) break;  /* nothing left: skip the final squaring */
+	}
+	return acc;
+}
+#ifdef _MSC_VER /* cpow_ui: x**n for single-precision complex x and integer n. MSVC branch: complex arithmetic spelled out on (r, i). */
+static _Fcomplex cpow_ui(complex x, integer n) {
+	complex pow={1.0,0.0}; unsigned long int u; real t;
+	if(n != 0) {
+		if(n < 0) n = -n, t = x.r*x.r + x.i*x.i, x.r = x.r/t, x.i = -x.i/t; /* x = 1/x = conj(x)/|x|^2 */
+		for(u = n; ; ) {
+			if(u & 01) t = pow.r*x.r - pow.i*x.i, pow.i = pow.r*x.i + pow.i*x.r, pow.r = t; /* pow *= x */
+			if(u >>= 1) t = x.r*x.r - x.i*x.i, x.i = 2*x.r*x.i, x.r = t; /* x *= x */
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else /* C99 _Complex branch: the operators already perform complex arithmetic */
+static _Complex float cpow_ui(_Complex float x, integer n) {
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x; /* negative power: use the reciprocal base */
+		for(u = n; ; ) { /* square-and-multiply over the bits of n */
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER /* zpow_ui: x**n for double-precision complex x and integer n. MSVC branch: complex arithmetic spelled out on _Val[0] (re) and _Val[1] (im). */
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
+	_Dcomplex pow={1.0,0.0}; unsigned long int u; double t;
+	if(n != 0) {
+		if(n < 0) n = -n, t = x._Val[0]*x._Val[0] + x._Val[1]*x._Val[1], x._Val[0] = x._Val[0]/t, x._Val[1] = -x._Val[1]/t; /* x = 1/x */
+		for(u = n; ; ) {
+			if(u & 01) t = pow._Val[0]*x._Val[0] - pow._Val[1]*x._Val[1], pow._Val[1] = pow._Val[0]*x._Val[1] + pow._Val[1]*x._Val[0], pow._Val[0] = t; /* pow *= x */
+			if(u >>= 1) t = x._Val[0]*x._Val[0] - x._Val[1]*x._Val[1], x._Val[1] = 2*x._Val[0]*x._Val[1], x._Val[0] = t; /* x *= x */
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else /* C99 _Complex branch: the operators already perform complex arithmetic */
+static _Complex double zpow_ui(_Complex double x, integer n) {
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x; /* negative power: use the reciprocal base */
+		for(u = n; ; ) { /* square-and-multiply over the bits of n */
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) { /* integer power x**n */
+	integer result = 1; unsigned long int bits;
+	if (n <= 0) {
+		/* Non-positive exponents are answered directly unless x == -1. */
+		if (n == 0 || x == 1) return 1;
+		/* x == 0 performs the integer division 1/0 exactly as the original */
+		/* expression does; every other base that reaches here yields 0. */
+		if (x != -1) return x == 0 ? 1/x : 0;
+		n = -n;  /* x == -1: carry on below with the negated exponent */
+	}
+	/* Repeated squaring over the bits of the exponent. */
+	for (bits = n; ; x *= x) {
+		if (bits & 01) result *= x;
+		if ((bits >>= 1) == 0) break;
+	}
+	return result;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{	/* position (1-based, counted from s) of the first largest of w[s-1..e-1]; n is not used */
+	double top = w[s-1]; integer pos, best = s;
+	for (pos = s + 1; pos <= e; ++pos)
+		if (w[pos-1] > top) { best = pos; top = w[pos-1]; }
+	return best - s + 1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{	/* position (1-based, counted from s) of the first largest of w[s-1..e-1]; n is not used */
+	float top = w[s-1]; integer pos, best = s;
+	for (pos = s + 1; pos <= e; ++pos)
+		if (w[pos-1] > top) { best = pos; top = w[pos-1]; }
+	return best - s + 1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i; /* *z = sum over i of conjg(x(i)) * y(i); elements are read at x[i*incx], y[i*incy] */
+#ifdef _MSC_VER /* no complex operators here: conj(x)*y = (xr*yr + xi*yi) + i*(xr*yi - xi*yr) */
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + conjg(x(i))* y(i) */
+			zdotc._Val[0] += x[i].r * y[i].r + x[i].i * y[i].i;
+			zdotc._Val[1] += x[i].r * y[i].i - x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + conjg(x(i))* y(i) */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r + x[i*incx].i * y[i*incy].i;
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i - x[i*incx].i * y[i*incy].r;
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + conjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + conjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i; /* *z = sum over i of dconjg(x(i)) * y(i); elements are read at x[i*incx], y[i*incy] */
+#ifdef _MSC_VER /* no complex operators here: conj(x)*y = (xr*yr + xi*yi) + i*(xr*yi - xi*yr) */
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += x[i].r * y[i].r + x[i].i * y[i].i;
+			zdotc._Val[1] += x[i].r * y[i].i - x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r + x[i*incx].i * y[i*incy].i;
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i - x[i*incx].i * y[i*incy].r;
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i; /* *z = sum over i of x(i) * y(i), no conjugation; elements are read at x[i*incx], y[i*incy] */
+#ifdef _MSC_VER /* no complex operators here: x*y = (xr*yr - xi*yi) + i*(xr*yi + xi*yr) */
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc._Val[0] += x[i].r * y[i].r - x[i].i * y[i].i;
+			zdotc._Val[1] += x[i].r * y[i].i + x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r - x[i*incx].i * y[i*incy].i;
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i + x[i*incx].i * y[i*incy].r;
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+/* zdotu_: unconjugated dot product of two double-complex vectors.
+   Stores  sum_{i=0}^{n-1} x[i*incx] * y[i*incy]  into *z.
+   n_, incx_ and incy_ are passed by reference (Fortran style) and are
+   read once on entry.  Elements are addressed as x[i*incx] / y[i*incy]
+   counting up from index 0; when n <= 0 the result is 0. */
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	/* This branch works on the _Val[] components directly, so the complex
+	   product has to be expanded by hand:
+	   (a + b*i) * (c + d*i) = (a*c - b*d) + (a*d + b*c)*i. */
+	_Dcomplex zdotc = {0.0, 0.0};
+	_Dcomplex xv, yv;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			xv = Cd(&x[i]);
+			yv = Cd(&y[i]);
+			zdotc._Val[0] += xv._Val[0] * yv._Val[0] - xv._Val[1] * yv._Val[1];
+			zdotc._Val[1] += xv._Val[0] * yv._Val[1] + xv._Val[1] * yv._Val[0];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			xv = Cd(&x[i*incx]);
+			yv = Cd(&y[i*incy]);
+			zdotc._Val[0] += xv._Val[0] * yv._Val[0] - xv._Val[1] * yv._Val[1];
+			zdotc._Val[1] += xv._Val[0] * yv._Val[1] + xv._Val[1] * yv._Val[0];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		/* unit-stride case kept separate from the strided loop */
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+/* Passed as LWORK = -1 to request a workspace query from callees. */
+static integer c_n1 = -1;
+
+/* dgedmdq_: QR-compressed Dynamic Mode Decomposition driver. */
+/* Validates the arguments, factors F with dgeqrf_, passes the projected */
+/* snapshot pair (X,Y) to dgedmd_, and then optionally lifts the Ritz */
+/* vectors with dormqr_, returns R in Y, and forms Q in F with dorgqr_. */
+/* The function always returns 0; status is reported through *info. */
+/* The arguments are documented in full in the comment block in the body. */
+/* Subroutine */ int dgedmdq_(char *jobs, char *jobz, char *jobr, char *jobq, 
+	char *jobt, char *jobf, integer *whtsvd, integer *m, integer *n, 
+	doublereal *f, integer *ldf, doublereal *x, integer *ldx, doublereal *
+	y, integer *ldy, integer *nrnk, doublereal *tol, integer *k, 
+	doublereal *reig, doublereal *imeig, doublereal *z__, integer *ldz, 
+	doublereal *res, doublereal *b, integer *ldb, doublereal *v, integer *
+	ldv, doublereal *s, integer *lds, doublereal *work, integer *lwork, 
+	integer *iwork, integer *liwork, integer *info)
+{
+    /* System generated locals */
+    integer f_dim1, f_offset, x_dim1, x_offset, y_dim1, y_offset, z_dim1, 
+	    z_offset, b_dim1, b_offset, v_dim1, v_offset, s_dim1, s_offset, 
+	    i__1, i__2;
+
+    /* Local variables */
+    doublereal zero;
+    integer info1;
+    extern logical lsame_(char *, char *);
+    char jobvl[1];
+    integer minmn;
+    logical wantq;
+    integer mlwqr, olwqr;
+    logical wntex;
+    extern /* Subroutine */ int dgedmd_(char *, char *, char *, char *, 
+	    integer *, integer *, integer *, doublereal *, integer *, 
+	    doublereal *, integer *, integer *, doublereal *, integer *, 
+	    doublereal *, doublereal *, doublereal *, integer *, doublereal *,
+	     doublereal *, integer *, doublereal *, integer *, doublereal *, 
+	    integer *, doublereal *, integer *, integer *, integer *, integer 
+	    *), dgeqrf_(integer *, integer *, 
+	    doublereal *, integer *, doublereal *, doublereal *, integer *, 
+	    integer *), dlacpy_(char *, integer *, integer *, doublereal *, 
+	    integer *, doublereal *, integer *), dlaset_(char *, 
+	    integer *, integer *, doublereal *, doublereal *, doublereal *, 
+	    integer *), xerbla_(char *, integer *);
+    integer mlwdmd, olwdmd;
+    logical sccolx, sccoly;
+    extern /* Subroutine */ int dorgqr_(integer *, integer *, integer *, 
+	    doublereal *, integer *, doublereal *, doublereal *, integer *, 
+	    integer *), dormqr_(char *, char *, integer *, integer *, integer 
+	    *, doublereal *, integer *, doublereal *, doublereal *, integer *,
+	     doublereal *, integer *, integer *);
+    integer iminwr;
+    logical wntvec, wntvcf;
+    integer mlwgqr;
+    logical wntref;
+    integer mlwork, olwgqr, olwork;
+    doublereal rdummy[2];
+    integer mlwmqr, olwmqr;
+    logical lquery, wntres, wnttrf, wntvcq;
+    doublereal one;
+
+/* March 2023 */
+/* ..... */
+/*      USE                   iso_fortran_env */
+/*      INTEGER, PARAMETER :: WP = real64 */
+/* ..... */
+/*     Scalar arguments */
+/*     Array arguments */
+/* ..... */
+/*     Purpose */
+/*     ======= */
+/*     DGEDMDQ computes the Dynamic Mode Decomposition (DMD) for */
+/*     a pair of data snapshot matrices, using a QR factorization */
+/*     based compression of the data. For the input matrices */
+/*     X and Y such that Y = A*X with an unaccessible matrix */
+/*     A, DGEDMDQ computes a certain number of Ritz pairs of A using */
+/*     the standard Rayleigh-Ritz extraction from a subspace of */
+/*     range(X) that is determined using the leading left singular */
+/*     vectors of X. Optionally, DGEDMDQ returns the residuals */
+/*     of the computed Ritz pairs, the information needed for */
+/*     a refinement of the Ritz vectors, or the eigenvectors of */
+/*     the Exact DMD. */
+/*     For further details see the references listed */
+/*     below. For more details of the implementation see [3]. */
+
+/*     References */
+/*     ========== */
+/*     [1] P. Schmid: Dynamic mode decomposition of numerical */
+/*         and experimental data, */
+/*         Journal of Fluid Mechanics 656, 5-28, 2010. */
+/*     [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal */
+/*         decompositions: analysis and enhancements, */
+/*         SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. */
+/*     [3] Z. Drmac: A LAPACK implementation of the Dynamic */
+/*         Mode Decomposition I. Technical report. AIMDyn Inc. */
+/*         and LAPACK Working Note 298. */
+/*     [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. */
+/*         Brunton, N. Kutz: On Dynamic Mode Decomposition: */
+/*         Theory and Applications, Journal of Computational */
+/*         Dynamics 1(2), 391 -421, 2014. */
+
+/*     Developed and supported by: */
+/*     =========================== */
+/*     Developed and coded by Zlatko Drmac, Faculty of Science, */
+/*     University of Zagreb;  drmac@math.hr */
+/*     In cooperation with */
+/*     AIMdyn Inc., Santa Barbara, CA. */
+/*     and supported by */
+/*     - DARPA SBIR project "Koopman Operator-Based Forecasting */
+/*     for Nonstationary Processes from Near-Term, Limited */
+/*     Observational Data" Contract No: W31P4Q-21-C-0007 */
+/*     - DARPA PAI project "Physics-Informed Machine Learning */
+/*     Methodologies" Contract No: HR0011-18-9-0033 */
+/*     - DARPA MoDyL project "A Data-Driven, Operator-Theoretic */
+/*     Framework for Space-Time Analysis of Process Dynamics" */
+/*     Contract No: HR0011-16-C-0116 */
+/*     Any opinions, findings and conclusions or recommendations */
+/*     expressed in this material are those of the author and */
+/*     do not necessarily reflect the views of the DARPA SBIR */
+/*     Program Office. */
+/* ============================================================ */
+/*     Distribution Statement A: */
+/*     Approved for Public Release, Distribution Unlimited. */
+/*     Cleared by DARPA on September 29, 2022 */
+/* ============================================================ */
+/* ...................................................................... */
+/*     Arguments */
+/*     ========= */
+/*     JOBS (input) CHARACTER*1 */
+/*     Determines whether the initial data snapshots are scaled */
+/*     by a diagonal matrix. The data snapshots are the columns */
+/*     of F. The leading N-1 columns of F are denoted X and the */
+/*     trailing N-1 columns are denoted Y. */
+/*     'S' :: The data snapshots matrices X and Y are multiplied */
+/*            with a diagonal matrix D so that X*D has unit */
+/*            nonzero columns (in the Euclidean 2-norm) */
+/*     'C' :: The snapshots are scaled as with the 'S' option. */
+/*            If it is found that an i-th column of X is zero */
+/*            vector and the corresponding i-th column of Y is */
+/*            non-zero, then the i-th column of Y is set to */
+/*            zero and a warning flag is raised. */
+/*     'Y' :: The data snapshots matrices X and Y are multiplied */
+/*            by a diagonal matrix D so that Y*D has unit */
+/*            nonzero columns (in the Euclidean 2-norm) */
+/*     'N' :: No data scaling. */
+/* ..... */
+/*     JOBZ (input) CHARACTER*1 */
+/*     Determines whether the eigenvectors (Koopman modes) will */
+/*     be computed. */
+/*     'V' :: The eigenvectors (Koopman modes) will be computed */
+/*            and returned in the matrix Z. */
+/*            See the description of Z. */
+/*     'F' :: The eigenvectors (Koopman modes) will be returned */
+/*            in factored form as the product Z*V, where Z */
+/*            is orthonormal and V contains the eigenvectors */
+/*            of the corresponding Rayleigh quotient. */
+/*            See the descriptions of F, V, Z. */
+/*     'Q' :: The eigenvectors (Koopman modes) will be returned */
+/*            in factored form as the product Q*Z, where Z */
+/*            contains the eigenvectors of the compression of the */
+/*            underlying discretized operator onto the span of */
+/*            the data snapshots. See the descriptions of F, V, Z. */
+/*            Q is from the initial QR factorization. */
+/*     'N' :: The eigenvectors are not computed. */
+/* ..... */
+/*     JOBR (input) CHARACTER*1 */
+/*     Determines whether to compute the residuals. */
+/*     'R' :: The residuals for the computed eigenpairs will */
+/*            be computed and stored in the array RES. */
+/*            See the description of RES. */
+/*            For this option to be legal, JOBZ must be 'V'. */
+/*     'N' :: The residuals are not computed. */
+/* ..... */
+/*     JOBQ (input) CHARACTER*1 */
+/*     Specifies whether to explicitly compute and return the */
+/*     orthogonal matrix from the QR factorization. */
+/*     'Q' :: The matrix Q of the QR factorization of the data */
+/*            snapshot matrix is computed and stored in the */
+/*            array F. See the description of F. */
+/*     'N' :: The matrix Q is not explicitly computed. */
+/* ..... */
+/*     JOBT (input) CHARACTER*1 */
+/*     Specifies whether to return the upper triangular factor */
+/*     from the QR factorization. */
+/*     'R' :: The matrix R of the QR factorization of the data */
+/*            snapshot matrix F is returned in the array Y. */
+/*            See the description of Y and Further details. */
+/*     'N' :: The matrix R is not returned. */
+/* ..... */
+/*     JOBF (input) CHARACTER*1 */
+/*     Specifies whether to store information needed for post- */
+/*     processing (e.g. computing refined Ritz vectors) */
+/*     'R' :: The matrix needed for the refinement of the Ritz */
+/*            vectors is computed and stored in the array B. */
+/*            See the description of B. */
+/*     'E' :: The unscaled eigenvectors of the Exact DMD are */
+/*            computed and returned in the array B. See the */
+/*            description of B. */
+/*     'N' :: No eigenvector refinement data is computed. */
+/*     To be useful on exit, this option needs JOBQ='Q'. */
+/* ..... */
+/*     WHTSVD (input) INTEGER, WHTSVD in { 1, 2, 3, 4 } */
+/*     Allows for a selection of the SVD algorithm from the */
+/*     LAPACK library. */
+/*     1 :: DGESVD (the QR SVD algorithm) */
+/*     2 :: DGESDD (the Divide and Conquer algorithm; if enough */
+/*          workspace available, this is the fastest option) */
+/*     3 :: DGESVDQ (the preconditioned QR SVD  ; this and 4 */
+/*          are the most accurate options) */
+/*     4 :: DGEJSV (the preconditioned Jacobi SVD; this and 3 */
+/*          are the most accurate options) */
+/*     For the four methods above, a significant difference in */
+/*     the accuracy of small singular values is possible if */
+/*     the snapshots vary in norm so that X is severely */
+/*     ill-conditioned. If small (smaller than EPS*||X||) */
+/*     singular values are of interest and JOBS=='N',  then */
+/*     the options (3, 4) give the most accurate results, where */
+/*     the option 4 is slightly better and with stronger */
+/*     theoretical background. */
+/*     If JOBS=='S', i.e. the columns of X will be normalized, */
+/*     then all methods give nearly equally accurate results. */
+/* ..... */
+/*     M (input) INTEGER, M >= 0 */
+/*     The state space dimension (the number of rows of F). */
+/* ..... */
+/*     N (input) INTEGER, 0 <= N <= M */
+/*     The number of data snapshots from a single trajectory, */
+/*     taken at equidistant discrete times. This is the */
+/*     number of columns of F. */
+/* ..... */
+/*     F (input/output) REAL(KIND=WP) M-by-N array */
+/*     > On entry, */
+/*     the columns of F are the sequence of data snapshots */
+/*     from a single trajectory, taken at equidistant discrete */
+/*     times. It is assumed that the column norms of F are */
+/*     in the range of the normalized floating point numbers. */
+/*     < On exit, */
+/*     If JOBQ == 'Q', the array F contains the orthogonal */
+/*     matrix/factor of the QR factorization of the initial */
+/*     data snapshots matrix F. See the description of JOBQ. */
+/*     If JOBQ == 'N', the entries in F strictly below the main */
+/*     diagonal contain, column-wise, the information on the */
+/*     Householder vectors, as returned by DGEQRF. The */
+/*     remaining information to restore the orthogonal matrix */
+/*     of the initial QR factorization is stored in WORK(1:N). */
+/*     See the description of WORK. */
+/* ..... */
+/*     LDF (input) INTEGER, LDF >= M */
+/*     The leading dimension of the array F. */
+/* ..... */
+/*     X (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array */
+/*     X is used as workspace to hold representations of the */
+/*     leading N-1 snapshots in the orthonormal basis computed */
+/*     in the QR factorization of F. */
+/*     On exit, the leading K columns of X contain the leading */
+/*     K left singular vectors of the above described content */
+/*     of X. To lift them to the space of the left singular */
+/*     vectors U(:,1:K) of the input data, pre-multiply with the */
+/*     Q factor from the initial QR factorization. */
+/*     See the descriptions of F, K, V  and Z. */
+/* ..... */
+/*     LDX (input) INTEGER, LDX >= N */
+/*     The leading dimension of the array X. */
+/* ..... */
+/*     Y (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array */
+/*     Y is used as workspace to hold representations of the */
+/*     trailing N-1 snapshots in the orthonormal basis computed */
+/*     in the QR factorization of F. */
+/*     On exit, */
+/*     If JOBT == 'R', Y contains the MIN(M,N)-by-N upper */
+/*     triangular factor from the QR factorization of the data */
+/*     snapshot matrix F. */
+/* ..... */
+/*     LDY (input) INTEGER , LDY >= N */
+/*     The leading dimension of the array Y. */
+/* ..... */
+/*     NRNK (input) INTEGER */
+/*     Determines the mode how to compute the numerical rank, */
+/*     i.e. how to truncate small singular values of the input */
+/*     matrix X. On input, if */
+/*     NRNK = -1 :: i-th singular value sigma(i) is truncated */
+/*                  if sigma(i) <= TOL*sigma(1) */
+/*                  This option is recommended. */
+/*     NRNK = -2 :: i-th singular value sigma(i) is truncated */
+/*                  if sigma(i) <= TOL*sigma(i-1) */
+/*                  This option is included for R&D purposes. */
+/*                  It requires highly accurate SVD, which */
+/*                  may not be feasible. */
+/*     The numerical rank can be enforced by using positive */
+/*     value of NRNK as follows: */
+/*     0 < NRNK <= N-1 :: at most NRNK largest singular values */
+/*     will be used. If the number of the computed nonzero */
+/*     singular values is less than NRNK, then only those */
+/*     nonzero values will be used and the actually used */
+/*     dimension is less than NRNK. The actual number of */
+/*     the nonzero singular values is returned in the variable */
+/*     K. See the description of K. */
+/* ..... */
+/*     TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */
+/*     The tolerance for truncating small singular values. */
+/*     See the description of NRNK. */
+/* ..... */
+/*     K (output) INTEGER,  0 <= K <= N */
+/*     The dimension of the SVD/POD basis for the leading N-1 */
+/*     data snapshots (columns of F) and the number of the */
+/*     computed Ritz pairs. The value of K is determined */
+/*     according to the rule set by the parameters NRNK and */
+/*     TOL. See the descriptions of NRNK and TOL. */
+/* ..... */
+/*     REIG (output) REAL(KIND=WP) (N-1)-by-1 array */
+/*     The leading K (K<=N) entries of REIG contain */
+/*     the real parts of the computed eigenvalues */
+/*     REIG(1:K) + sqrt(-1)*IMEIG(1:K). */
+/*     See the descriptions of K, IMEIG, Z. */
+/* ..... */
+/*     IMEIG (output) REAL(KIND=WP) (N-1)-by-1 array */
+/*     The leading K (K<N) entries of IMEIG contain */
+/*     the imaginary parts of the computed eigenvalues */
+/*     REIG(1:K) + sqrt(-1)*IMEIG(1:K). */
+/*     The eigenvalues are determined as follows: */
+/*     If IMEIG(i) == 0, then the corresponding eigenvalue is */
+/*     real, LAMBDA(i) = REIG(i). */
+/*     If IMEIG(i)>0, then the corresponding complex */
+/*     conjugate pair of eigenvalues reads */
+/*     LAMBDA(i)   = REIG(i) + sqrt(-1)*IMEIG(i) */
+/*     LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMEIG(i) */
+/*     That is, complex conjugate pairs have consecutive */
+/*     indices (i,i+1), with the positive imaginary part */
+/*     listed first. */
+/*     See the descriptions of K, REIG, Z. */
+/* ..... */
+/*     Z (workspace/output) REAL(KIND=WP)  M-by-(N-1) array */
+/*     If JOBZ =='V' then */
+/*        Z contains real Ritz vectors as follows: */
+/*        If IMEIG(i)=0, then Z(:,i) is an eigenvector of */
+/*        the i-th Ritz value. */
+/*        If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then */
+/*        [Z(:,i) Z(:,i+1)] span an invariant subspace and */
+/*        the Ritz values extracted from this subspace are */
+/*        REIG(i) + sqrt(-1)*IMEIG(i) and */
+/*        REIG(i) - sqrt(-1)*IMEIG(i). */
+/*        The corresponding eigenvectors are */
+/*        Z(:,i) + sqrt(-1)*Z(:,i+1) and */
+/*        Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. */
+/*     If JOBZ == 'F', then the above descriptions hold for */
+/*     the columns of Z*V, where the columns of V are the */
+/*     eigenvectors of the K-by-K Rayleigh quotient, and Z is */
+/*     orthonormal. The columns of V are similarly structured: */
+/*     If IMEIG(i) == 0 then Z*V(:,i) is an eigenvector, and if */
+/*     IMEIG(i) > 0 then Z*V(:,i)+sqrt(-1)*Z*V(:,i+1) and */
+/*                       Z*V(:,i)-sqrt(-1)*Z*V(:,i+1) */
+/*     are the eigenvectors of LAMBDA(i), LAMBDA(i+1). */
+/*     See the descriptions of REIG, IMEIG, X and V. */
+/* ..... */
+/*     LDZ (input) INTEGER , LDZ >= M */
+/*     The leading dimension of the array Z. */
+/* ..... */
+/*     RES (output) REAL(KIND=WP) (N-1)-by-1 array */
+/*     RES(1:K) contains the residuals for the K computed */
+/*     Ritz pairs. */
+/*     If LAMBDA(i) is real, then */
+/*        RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i) ||_2. */
+/*     If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair */
+/*     then */
+/*     RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F */
+/*     where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] */
+/*               [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. */
+/*     It holds that */
+/*     RES(i)   = || A*ZC(:,i)   - LAMBDA(i)  *ZC(:,i)   ||_2 */
+/*     RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 */
+/*     where ZC(:,i)   =  Z(:,i) + sqrt(-1)*Z(:,i+1) */
+/*           ZC(:,i+1) =  Z(:,i) - sqrt(-1)*Z(:,i+1) */
+/*     See the description of Z. */
+/* ..... */
+/*     B (output) REAL(KIND=WP)  MIN(M,N)-by-(N-1) array. */
+/*     IF JOBF =='R', B(1:N,1:K) contains A*U(:,1:K), and can */
+/*     be used for computing the refined vectors; see further */
+/*     details in the provided references. */
+/*     If JOBF == 'E', B(1:N,1:K) contains */
+/*     A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */
+/*     Exact DMD, up to scaling by the inverse eigenvalues. */
+/*     In both cases, the content of B can be lifted to the */
+/*     original dimension of the input data by pre-multiplying */
+/*     with the Q factor from the initial QR factorization. */
+/*     Here A denotes a compression of the underlying operator. */
+/*     See the descriptions of F and X. */
+/*     If JOBF =='N', then B is not referenced. */
+/* ..... */
+/*     LDB (input) INTEGER, LDB >= MIN(M,N) */
+/*     The leading dimension of the array B. */
+/* ..... */
+/*     V (workspace/output) REAL(KIND=WP) (N-1)-by-(N-1) array */
+/*     On exit, V(1:K,1:K) contains the K eigenvectors of */
+/*     the Rayleigh quotient. The eigenvectors of a complex */
+/*     conjugate pair of eigenvalues are returned in real form */
+/*     as explained in the description of Z. The Ritz vectors */
+/*     (returned in Z) are the product of X and V; see */
+/*     the descriptions of X and Z. */
+/* ..... */
+/*     LDV (input) INTEGER, LDV >= N-1 */
+/*     The leading dimension of the array V. */
+/* ..... */
+/*     S (output) REAL(KIND=WP) (N-1)-by-(N-1) array */
+/*     The array S(1:K,1:K) is used for the matrix Rayleigh */
+/*     quotient. This content is overwritten during */
+/*     the eigenvalue decomposition by DGEEV. */
+/*     See the description of K. */
+/* ..... */
+/*     LDS (input) INTEGER, LDS >= N-1 */
+/*     The leading dimension of the array S. */
+/* ..... */
+/*     WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array */
+/*     On exit, */
+/*     WORK(1:MIN(M,N)) contains the scalar factors of the */
+/*     elementary reflectors as returned by DGEQRF of the */
+/*     M-by-N input matrix F. */
+/*     WORK(MIN(M,N)+1:MIN(M,N)+N-1) contains the singular values of */
+/*     the input submatrix F(1:M,1:N-1). */
+/*     If the call to DGEDMDQ is only workspace query, then */
+/*     WORK(1) contains the minimal workspace length and */
+/*     WORK(2) is the optimal workspace length. Hence, the */
+/*     length of work is at least 2. */
+/*     See the description of LWORK. */
+/* ..... */
+/*     LWORK (input) INTEGER */
+/*     The minimal length of the  workspace vector WORK. */
+/*     LWORK is calculated as follows: */
+/*     Let MLWQR  = N (minimal workspace for DGEQRF[M,N]) */
+/*         MLWDMD = minimal workspace for DGEDMD (see the */
+/*                  description of LWORK in DGEDMD) for */
+/*                  snapshots of dimensions MIN(M,N)-by-(N-1) */
+/*         MLWMQR = N (minimal workspace for */
+/*                    DORMQR['L','N',M,N,N]) */
+/*         MLWGQR = N (minimal workspace for DORGQR[M,N,N]) */
+/*     Then */
+/*     LWORK = MAX(N+MLWQR, N+MLWDMD) */
+/*     is updated as follows: */
+/*        if   JOBZ == 'V' or JOBZ == 'F' THEN */
+/*             LWORK = MAX( LWORK, MIN(M,N)+N-1+MLWMQR ) */
+/*        if   JOBQ == 'Q' THEN */
+/*             LWORK = MAX( LWORK, MIN(M,N)+N-1+MLWGQR) */
+/*     If on entry LWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     and the optimal workspace lengths for both WORK and */
+/*     IWORK. See the descriptions of WORK and IWORK. */
+/* ..... */
+/*     IWORK (workspace/output) INTEGER LIWORK-by-1 array */
+/*     Workspace that is required only if WHTSVD equals */
+/*     2 , 3 or 4. (See the description of WHTSVD). */
+/*     If on entry LWORK =-1 or LIWORK=-1, then the */
+/*     minimal length of IWORK is computed and returned in */
+/*     IWORK(1). See the description of LIWORK. */
+/* ..... */
+/*     LIWORK (input) INTEGER */
+/*     The minimal length of the workspace vector IWORK. */
+/*     If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */
+/*     Let M1=MIN(M,N), N1=N-1. Then */
+/*     If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M1,N1)) */
+/*     If WHTSVD == 3, then LIWORK >= MAX(1,M1+N1-1) */
+/*     If WHTSVD == 4, then LIWORK >= MAX(3,M1+3*N1) */
+/*     If on entry LIWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     and the optimal workspace lengths for both WORK and */
+/*     IWORK. See the descriptions of WORK and IWORK. */
+/* ..... */
+/*     INFO (output) INTEGER */
+/*     -i < 0 :: On entry, the i-th argument had an */
+/*               illegal value */
+/*        = 0 :: Successful return. */
+/*        = 1 :: Void input. Quick exit (M=0 or N=0). */
+/*        = 2 :: The SVD computation of X did not converge. */
+/*               Suggestion: Check the input data and/or */
+/*               repeat with different WHTSVD. */
+/*        = 3 :: The computation of the eigenvalues did not */
+/*               converge. */
+/*        = 4 :: If data scaling was requested on input and */
+/*               the procedure found inconsistency in the data */
+/*               such that for some column index i, */
+/*               X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */
+/*               to zero if JOBS=='C'. The computation proceeds */
+/*               with original or modified data and warning */
+/*               flag is set with INFO=4. */
+/* ............................................................. */
+/* ............................................................. */
+/*     Parameters */
+/*     ~~~~~~~~~~ */
+
+/*     Local scalars */
+/*     ~~~~~~~~~~~~~ */
+
+/*     Local array */
+/*     ~~~~~~~~~~~ */
+
+/*     External functions (BLAS and LAPACK) */
+/*     ~~~~~~~~~~~~~~~~~ */
+
+/*     External subroutines (BLAS and LAPACK) */
+/*     ~~~~~~~~~~~~~~~~~~~~ */
+/*     External subroutines */
+/*     ~~~~~~~~~~~~~~~~~~~~ */
+/*     Intrinsic functions */
+/*     ~~~~~~~~~~~~~~~~~~~ */
+/* .......................................................... */
+    /* Parameter adjustments */
+    /* Shift every array pointer so the 1-based, column-major Fortran */
+    /* indexing a(i,j) -> a[i + j*a_dim1] can be used below. */
+    f_dim1 = *ldf;
+    f_offset = 1 + f_dim1 * 1;
+    f -= f_offset;
+    x_dim1 = *ldx;
+    x_offset = 1 + x_dim1 * 1;
+    x -= x_offset;
+    y_dim1 = *ldy;
+    y_offset = 1 + y_dim1 * 1;
+    y -= y_offset;
+    --reig;
+    --imeig;
+    z_dim1 = *ldz;
+    z_offset = 1 + z_dim1 * 1;
+    z__ -= z_offset;
+    --res;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    v_dim1 = *ldv;
+    v_offset = 1 + v_dim1 * 1;
+    v -= v_offset;
+    s_dim1 = *lds;
+    s_offset = 1 + s_dim1 * 1;
+    s -= s_offset;
+    --work;
+    --iwork;
+
+    /* Function Body */
+    zero = 0.f;
+    one = 1.f;
+
+/*    Test the input arguments */
+    wntres = lsame_(jobr, "R");
+    sccolx = lsame_(jobs, "S") || lsame_(jobs, "C");
+    sccoly = lsame_(jobs, "Y");
+    wntvec = lsame_(jobz, "V");
+    wntvcf = lsame_(jobz, "F");
+    wntvcq = lsame_(jobz, "Q");
+    wntref = lsame_(jobf, "R");
+    wntex = lsame_(jobf, "E");
+    wantq = lsame_(jobq, "Q");
+    wnttrf = lsame_(jobt, "R");
+    minmn = f2cmin(*m,*n);
+    *info = 0;
+    lquery = *lwork == -1 || *liwork == -1;
+
+/*    -INFO is the 1-based position of the first offending argument. */
+    if (! (sccolx || sccoly || lsame_(jobs, "N"))) {
+	*info = -1;
+    } else if (! (wntvec || wntvcf || wntvcq || lsame_(jobz, "N"))) {
+	*info = -2;
+    } else if (! (wntres || lsame_(jobr, "N")) || 
+	    wntres && lsame_(jobz, "N")) {
+	*info = -3;
+    } else if (! (wantq || lsame_(jobq, "N"))) {
+	*info = -4;
+    } else if (! (wnttrf || lsame_(jobt, "N"))) {
+	*info = -5;
+    } else if (! (wntref || wntex || lsame_(jobf, "N")))
+	     {
+	*info = -6;
+    } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == 
+	    4)) {
+	*info = -7;
+    } else if (*m < 0) {
+	*info = -8;
+    } else if (*n < 0 || *n > *m + 1) {
+	*info = -9;
+    } else if (*ldf < *m) {
+	*info = -11;
+    } else if (*ldx < minmn) {
+	*info = -13;
+    } else if (*ldy < minmn) {
+	*info = -15;
+    } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) {
+	*info = -16;
+    } else if (*tol < zero || *tol >= one) {
+	*info = -17;
+    } else if (*ldz < *m) {
+	*info = -22;
+    } else if ((wntref || wntex) && *ldb < minmn) {
+	*info = -25;
+    } else if (*ldv < *n - 1) {
+	*info = -27;
+    } else if (*lds < *n - 1) {
+	*info = -29;
+    }
+
+/*    JOBVL is the JOBZ value forwarded to DGEDMD: any of 'V','F','Q' */
+/*    maps to 'V', everything else to 'N'. */
+    if (wntvec || wntvcf || wntvcq) {
+	*(unsigned char *)jobvl = 'V';
+    } else {
+	*(unsigned char *)jobvl = 'N';
+    }
+    if (*info == 0) {
+/* Compute the minimal and the optimal workspace */
+/* requirements. Simulate running the code and */
+/* determine minimal and optimal sizes of the */
+/* workspace at any moment of the run. */
+	if (*n == 0 || *n == 1) {
+/* All output except K is void. INFO=1 signals */
+/* the void input. In case of a workspace query, */
+/* the minimal workspace lengths are returned. */
+	    if (lquery) {
+		iwork[1] = 1;
+		work[1] = 2.;
+		work[2] = 2.;
+	    } else {
+		*k = 0;
+	    }
+	    *info = 1;
+	    return 0;
+	}
+	mlwqr = f2cmax(1,*n);
+/* Minimal workspace length for DGEQRF. */
+	mlwork = minmn + mlwqr;
+	if (lquery) {
+	    dgeqrf_(m, n, &f[f_offset], ldf, &work[1], rdummy, &c_n1, &info1);
+	    olwqr = (integer) rdummy[0];
+	    olwork = f2cmin(*m,*n) + olwqr;
+	}
+/* Workspace query to DGEDMD (LWORK = -1); it is issued on every call */
+/* because its minimal length feeds the LWORK check below. */
+	i__1 = *n - 1;
+	dgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], 
+		ldx, &y[y_offset], ldy, nrnk, tol, k, &reig[1], &imeig[1], &
+		z__[z_offset], ldz, &res[1], &b[b_offset], ldb, &v[v_offset], 
+		ldv, &s[s_offset], lds, &work[1], &c_n1, &iwork[1], liwork, &
+		info1);
+	mlwdmd = (integer) work[1];
+/* Computing MAX */
+	i__1 = mlwork, i__2 = minmn + mlwdmd;
+	mlwork = f2cmax(i__1,i__2);
+	iminwr = iwork[1];
+	if (lquery) {
+	    olwdmd = (integer) work[2];
+/* Computing MAX */
+	    i__1 = olwork, i__2 = minmn + olwdmd;
+	    olwork = f2cmax(i__1,i__2);
+	}
+	if (wntvec || wntvcf) {
+	    mlwmqr = f2cmax(1,*n);
+/* Computing MAX */
+	    i__1 = mlwork, i__2 = minmn + *n - 1 + mlwmqr;
+	    mlwork = f2cmax(i__1,i__2);
+	    if (lquery) {
+		dormqr_("L", "N", m, n, &minmn, &f[f_offset], ldf, &work[1], &
+			z__[z_offset], ldz, &work[1], &c_n1, &info1);
+		olwmqr = (integer) work[1];
+/* Computing MAX */
+		i__1 = olwork, i__2 = minmn + *n - 1 + olwmqr;
+		olwork = f2cmax(i__1,i__2);
+	    }
+	}
+	if (wantq) {
+	    mlwgqr = *n;
+/* Computing MAX */
+	    i__1 = mlwork, i__2 = minmn + *n - 1 + mlwgqr;
+	    mlwork = f2cmax(i__1,i__2);
+	    if (lquery) {
+		dorgqr_(m, &minmn, &minmn, &f[f_offset], ldf, &work[1], &work[
+			1], &c_n1, &info1);
+		olwgqr = (integer) work[1];
+/* Computing MAX */
+		i__1 = olwork, i__2 = minmn + *n - 1 + olwgqr;
+		olwork = f2cmax(i__1,i__2);
+	    }
+	}
+	iminwr = f2cmax(1,iminwr);
+	mlwork = f2cmax(2,mlwork);
+	if (*lwork < mlwork && ! lquery) {
+	    *info = -31;
+	}
+	if (*liwork < iminwr && ! lquery) {
+	    *info = -33;
+	}
+    }
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("DGEDMDQ", &i__1);
+	return 0;
+    } else if (lquery) {
+/*     Return minimal and optimal workspace sizes */
+	iwork[1] = iminwr;
+	work[1] = (doublereal) mlwork;
+	work[2] = (doublereal) olwork;
+	return 0;
+    }
+/* ..... */
+/*     Initial QR factorization that is used to represent the */
+/*     snapshots as elements of lower dimensional subspace. */
+/*     For large scale computation with M >>N , at this place */
+/*     one can use an out of core QRF. */
+
+/*     WORK(1:MINMN) receives the reflector scalars; the rest of WORK */
+/*     is scratch for DGEQRF. */
+    i__1 = *lwork - minmn;
+    dgeqrf_(m, n, &f[f_offset], ldf, &work[1], &work[minmn + 1], &i__1, &
+	    info1);
+
+/*     Define X and Y as the snapshots representations in the */
+/*     orthogonal basis computed in the QR factorization. */
+/*     X corresponds to the leading N-1 and Y to the trailing */
+/*     N-1 snapshots. */
+    i__1 = *n - 1;
+    dlaset_("L", &minmn, &i__1, &zero, &zero, &x[x_offset], ldx);
+    i__1 = *n - 1;
+    dlacpy_("U", &minmn, &i__1, &f[f_offset], ldf, &x[x_offset], ldx);
+    i__1 = *n - 1;
+    dlacpy_("A", &minmn, &i__1, &f[(f_dim1 << 1) + 1], ldf, &y[y_offset], ldy);
+/*     Y starts at column 2 of F, so it also picked up entries stored */
+/*     below the diagonal of F; zero the part of Y from row 3 down. */
+    if (*m >= 3) {
+	i__1 = minmn - 2;
+	i__2 = *n - 2;
+	dlaset_("L", &i__1, &i__2, &zero, &zero, &y[y_dim1 + 3], ldy);
+    }
+
+/*     Compute the DMD of the projected snapshot pairs (X,Y) */
+    i__1 = *n - 1;
+    i__2 = *lwork - minmn;
+    dgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], ldx,
+	     &y[y_offset], ldy, nrnk, tol, k, &reig[1], &imeig[1], &z__[
+	    z_offset], ldz, &res[1], &b[b_offset], ldb, &v[v_offset], ldv, &s[
+	    s_offset], lds, &work[minmn + 1], &i__2, &iwork[1], liwork, &
+	    info1);
+    if (info1 == 2 || info1 == 3) {
+/* Return with error code. See DGEDMD for details. */
+	*info = info1;
+	return 0;
+    } else {
+	*info = info1;
+    }
+
+/*     The Ritz vectors (Koopman modes) can be explicitly */
+/*     formed or returned in factored form. */
+    if (wntvec) {
+/* Compute the eigenvectors explicitly. */
+	if (*m > minmn) {
+	    i__1 = *m - minmn;
+	    dlaset_("A", &i__1, k, &zero, &zero, &z__[minmn + 1 + z_dim1], 
+		    ldz);
+	}
+	i__1 = *lwork - (minmn + *n - 1);
+	dormqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &work[1], &z__[
+		z_offset], ldz, &work[minmn + *n], &i__1, &info1);
+    } else if (wntvcf) {
+/*   Return the Ritz vectors (eigenvectors) in factored */
+/*   form Z*V, where Z contains orthonormal matrix (the */
+/*   product of Q from the initial QR factorization and */
+/*   the SVD/POD_basis returned by DGEDMD in X) and the */
+/*   second factor (the eigenvectors of the Rayleigh */
+/*   quotient) is in the array V, as returned by DGEDMD. */
+/*   X has MINMN rows and LDX is only validated against MINMN, */
+/*   so MINMN rows are copied rather than N.  The two agree when */
+/*   N <= M; for the admissible case N = M+1, copying N rows */
+/*   would step one row past the end of both X and Z. */
+	dlacpy_("A", &minmn, k, &x[x_offset], ldx, &z__[z_offset], ldz);
+	if (*m > minmn) {
+	    i__1 = *m - minmn;
+	    dlaset_("A", &i__1, k, &zero, &zero, &z__[minmn + 1 + z_dim1], 
+		    ldz);
+	}
+	i__1 = *lwork - (minmn + *n - 1);
+	dormqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &work[1], &z__[
+		z_offset], ldz, &work[minmn + *n], &i__1, &info1);
+    }
+
+/*     Some optional output variables: */
+
+/*     The upper triangular factor R in the initial QR */
+/*     factorization is optionally returned in the array Y. */
+/*     This is useful if this call to DGEDMDQ is to be */
+/*     followed by a streaming DMD that is implemented in a */
+/*     QR compressed form. */
+    if (wnttrf) {
+/* Return the upper triangular R in Y */
+	dlaset_("A", &minmn, n, &zero, &zero, &y[y_offset], ldy);
+	dlacpy_("U", &minmn, n, &f[f_offset], ldf, &y[y_offset], ldy);
+    }
+
+/*     The orthonormal/orthogonal factor Q in the initial QR */
+/*     factorization is optionally returned in the array F. */
+/*     Same as with the triangular factor above, this is */
+/*     useful in a streaming DMD. */
+    if (wantq) {
+/* Q overwrites F */
+	i__1 = *lwork - (minmn + *n - 1);
+	dorgqr_(m, &minmn, &minmn, &f[f_offset], ldf, &work[1], &work[minmn + 
+		*n], &i__1, &info1);
+    }
+
+    return 0;
+
+} /* dgedmdq_ */
+
diff --git a/lapack-netlib/SRC/dgedmdq.f90 b/lapack-netlib/SRC/dgedmdq.f90
new file mode 100644
index 000000000..bedfba472
--- /dev/null
+++ b/lapack-netlib/SRC/dgedmdq.f90
@@ -0,0 +1,704 @@
+SUBROUTINE DGEDMDQ( JOBS,  JOBZ, JOBR, JOBQ, JOBT, JOBF,   &
+                    WHTSVD,   M, N, F, LDF,  X, LDX,  Y,   &
+                    LDY,   NRNK,  TOL,   K,  REIG, IMEIG,  &
+                    Z, LDZ, RES,  B,     LDB,   V, LDV,    & 
+                    S, LDS, WORK, LWORK, IWORK, LIWORK, INFO )
+! March 2023
+!.....
+      USE                   iso_fortran_env
+      IMPLICIT NONE
+      INTEGER, PARAMETER :: WP = real64      
+!.....      
+!     Scalar arguments       
+      CHARACTER, INTENT(IN)  :: JOBS, JOBZ, JOBR, JOBQ,    &
+                                JOBT, JOBF
+      INTEGER,   INTENT(IN)  :: WHTSVD, M, N,   LDF, LDX,  &
+                                LDY, NRNK, LDZ, LDB, LDV,  &
+                                LDS, LWORK,  LIWORK
+      INTEGER,   INTENT(OUT) :: INFO,    K      
+      REAL(KIND=WP), INTENT(IN)    ::   TOL     
+!     Array arguments      
+      REAL(KIND=WP), INTENT(INOUT) :: F(LDF,*)
+      REAL(KIND=WP), INTENT(OUT)   :: X(LDX,*), Y(LDY,*),  &
+                                      Z(LDZ,*), B(LDB,*),  &
+                                      V(LDV,*), S(LDS,*)
+      REAL(KIND=WP), INTENT(OUT)   :: REIG(*),  IMEIG(*),  &
+                                      RES(*)
+      REAL(KIND=WP), INTENT(OUT)   :: WORK(*)  
+      INTEGER,       INTENT(OUT)   :: IWORK(*)
+!.....      
+!     Purpose  
+!     =======
+!     DGEDMDQ computes the Dynamic Mode Decomposition (DMD) for
+!     a pair of data snapshot matrices, using a QR factorization
+!     based compression of the data. For the input matrices
+!     X and Y such that Y = A*X with an inaccessible matrix
+!     A, DGEDMDQ computes a certain number of Ritz pairs of A using
+!     the standard Rayleigh-Ritz extraction from a subspace of
+!     range(X) that is determined using the leading left singular 
+!     vectors of X. Optionally, DGEDMDQ returns the residuals 
+!     of the computed Ritz pairs, the information needed for
+!     a refinement of the Ritz vectors, or the eigenvectors of
+!     the Exact DMD.
+!     For further details see the references listed
+!     below. For more details of the implementation see [3].      
+!
+!     References
+!     ==========
+!     [1] P. Schmid: Dynamic mode decomposition of numerical
+!         and experimental data,
+!         Journal of Fluid Mechanics 656, 5-28, 2010.
+!     [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal
+!         decompositions: analysis and enhancements,
+!         SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018.
+!     [3] Z. Drmac: A LAPACK implementation of the Dynamic
+!         Mode Decomposition I. Technical report. AIMDyn Inc.
+!         and LAPACK Working Note 298.      
+!     [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. 
+!         Brunton, N. Kutz: On Dynamic Mode Decomposition:
+!         Theory and Applications, Journal of Computational
+!         Dynamics 1(2), 391 -421, 2014.
+!
+!     Developed and supported by:
+!     ===========================
+!     Developed and coded by Zlatko Drmac, Faculty of Science,
+!     University of Zagreb;  drmac@math.hr
+!     In cooperation with
+!     AIMdyn Inc., Santa Barbara, CA.
+!     and supported by
+!     - DARPA SBIR project "Koopman Operator-Based Forecasting
+!     for Nonstationary Processes from Near-Term, Limited
+!     Observational Data" Contract No: W31P4Q-21-C-0007
+!     - DARPA PAI project "Physics-Informed Machine Learning
+!     Methodologies" Contract No: HR0011-18-9-0033
+!     - DARPA MoDyL project "A Data-Driven, Operator-Theoretic
+!     Framework for Space-Time Analysis of Process Dynamics"
+!     Contract No: HR0011-16-C-0116
+!     Any opinions, findings and conclusions or recommendations 
+!     expressed in this material are those of the author and 
+!     do not necessarily reflect the views of the DARPA SBIR 
+!     Program Office.      
+!============================================================
+!     Distribution Statement A: 
+!     Approved for Public Release, Distribution Unlimited.
+!     Cleared by DARPA on September 29, 2022  
+!============================================================      
+!......................................................................      
+!     Arguments
+!     =========
+!     JOBS (input) CHARACTER*1
+!     Determines whether the initial data snapshots are scaled
+!     by a diagonal matrix. The data snapshots are the columns
+!     of F. The leading N-1 columns of F are denoted X and the
+!     trailing N-1 columns are denoted Y. 
+!     'S' :: The data snapshots matrices X and Y are multiplied
+!            with a diagonal matrix D so that X*D has unit
+!            nonzero columns (in the Euclidean 2-norm)
+!     'C' :: The snapshots are scaled as with the 'S' option.
+!            If it is found that an i-th column of X is zero
+!            vector and the corresponding i-th column of Y is
+!            non-zero, then the i-th column of Y is set to
+!            zero and a warning flag is raised.
+!     'Y' :: The data snapshots matrices X and Y are multiplied
+!            by a diagonal matrix D so that Y*D has unit
+!            nonzero columns (in the Euclidean 2-norm)    
+!     'N' :: No data scaling.   
+!.....
+!     JOBZ (input) CHARACTER*1
+!     Determines whether the eigenvectors (Koopman modes) will
+!     be computed.
+!     'V' :: The eigenvectors (Koopman modes) will be computed
+!            and returned in the matrix Z.
+!            See the description of Z.
+!     'F' :: The eigenvectors (Koopman modes) will be returned
+!            in factored form as the product Z*V, where Z
+!            is orthonormal and V contains the eigenvectors
+!            of the corresponding Rayleigh quotient.
+!            See the descriptions of F, V, Z.
+!     'Q' :: The eigenvectors (Koopman modes) will be returned
+!            in factored form as the product Q*Z, where Z
+!            contains the eigenvectors of the compression of the
+!            underlying discretized operator onto the span of
+!            the data snapshots. See the descriptions of F, V, Z.  
+!            Q is from the initial QR factorization.      
+!     'N' :: The eigenvectors are not computed.  
+!.....      
+!     JOBR (input) CHARACTER*1 
+!     Determines whether to compute the residuals.
+!     'R' :: The residuals for the computed eigenpairs will
+!            be computed and stored in the array RES.
+!            See the description of RES.
+!            For this option to be legal, JOBZ must be 'V'.
+!     'N' :: The residuals are not computed.
+!.....
+!     JOBQ (input) CHARACTER*1 
+!     Specifies whether to explicitly compute and return the
+!     orthogonal matrix from the QR factorization.
+!     'Q' :: The matrix Q of the QR factorization of the data
+!            snapshot matrix is computed and stored in the
+!            array F. See the description of F.       
+!     'N' :: The matrix Q is not explicitly computed.
+!.....
+!     JOBT (input) CHARACTER*1 
+!     Specifies whether to return the upper triangular factor
+!     from the QR factorization.
+!     'R' :: The matrix R of the QR factorization of the data 
+!            snapshot matrix F is returned in the array Y.
+!            See the description of Y and Further details.       
+!     'N' :: The matrix R is not returned.    
+!.....
+!     JOBF (input) CHARACTER*1
+!     Specifies whether to store information needed for post-
+!     processing (e.g. computing refined Ritz vectors)
+!     'R' :: The matrix needed for the refinement of the Ritz
+!            vectors is computed and stored in the array B.
+!            See the description of B.
+!     'E' :: The unscaled eigenvectors of the Exact DMD are 
+!            computed and returned in the array B. See the
+!            description of B.
+!     'N' :: No eigenvector refinement data is computed.
+!     To be useful on exit, this option needs JOBQ='Q'.
+!.....      
+!     WHTSVD (input) INTEGER, WHTSVD in { 1, 2, 3, 4 }
+!     Allows for a selection of the SVD algorithm from the
+!     LAPACK library.
+!     1 :: DGESVD (the QR SVD algorithm)
+!     2 :: DGESDD (the Divide and Conquer algorithm; if enough
+!          workspace available, this is the fastest option)
+!     3 :: DGESVDQ (the preconditioned QR SVD  ; this and 4
+!          are the most accurate options)
+!     4 :: DGEJSV (the preconditioned Jacobi SVD; this and 3
+!          are the most accurate options)
+!     For the four methods above, a significant difference in
+!     the accuracy of small singular values is possible if
+!     the snapshots vary in norm so that X is severely
+!     ill-conditioned. If small (smaller than EPS*||X||)
+!     singular values are of interest and JOBS=='N',  then
+!     the options (3, 4) give the most accurate results, where
+!     the option 4 is slightly better and with stronger 
+!     theoretical background.
+!     If JOBS=='S', i.e. the columns of X will be normalized,
+!     then all methods give nearly equally accurate results.
+!.....
+!     M (input) INTEGER, M >= 0 
+!     The state space dimension (the number of rows of F).
+!.....      
+!     N (input) INTEGER, 0 <= N <= M
+!     The number of data snapshots from a single trajectory,
+!     taken at equidistant discrete times. This is the 
+!     number of columns of F.
+!.....
+!     F (input/output) REAL(KIND=WP) M-by-N array
+!     > On entry,
+!     the columns of F are the sequence of data snapshots 
+!     from a single trajectory, taken at equidistant discrete
+!     times. It is assumed that the column norms of F are 
+!     in the range of the normalized floating point numbers. 
+!     < On exit,
+!     If JOBQ == 'Q', the array F contains the orthogonal 
+!     matrix/factor of the QR factorization of the initial 
+!     data snapshots matrix F. See the description of JOBQ. 
+!     If JOBQ == 'N', the entries in F strictly below the main
+!     diagonal contain, column-wise, the information on the 
+!     Householder vectors, as returned by DGEQRF. The 
+!     remaining information to restore the orthogonal matrix
+!     of the initial QR factorization is stored in WORK(1:N). 
+!     See the description of WORK.
+!.....
+!     LDF (input) INTEGER, LDF >= M 
+!     The leading dimension of the array F.
+!.....
+!     X (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array
+!     X is used as workspace to hold representations of the
+!     leading N-1 snapshots in the orthonormal basis computed
+!     in the QR factorization of F.
+!     On exit, the leading K columns of X contain the leading
+!     K left singular vectors of the above described content
+!     of X. To lift them to the space of the left singular
+!     vectors U(:,1:K) of the input data, pre-multiply with the
+!     Q factor from the initial QR factorization. 
+!     See the descriptions of F, K, V  and Z.
+!.....      
+!     LDX (input) INTEGER, LDX >= N  
+!     The leading dimension of the array X. 
+!.....
+!     Y (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array
+!     Y is used as workspace to hold representations of the
+!     trailing N-1 snapshots in the orthonormal basis computed
+!     in the QR factorization of F.
+!     On exit, 
+!     If JOBT == 'R', Y contains the MIN(M,N)-by-N upper
+!     triangular factor from the QR factorization of the data
+!     snapshot matrix F.
+!.....      
+!     LDY (input) INTEGER , LDY >= N
+!     The leading dimension of the array Y.   
+!.....
+!     NRNK (input) INTEGER
+!     Determines the mode how to compute the numerical rank,
+!     i.e. how to truncate small singular values of the input
+!     matrix X. On input, if
+!     NRNK = -1 :: i-th singular value sigma(i) is truncated
+!                  if sigma(i) <= TOL*sigma(1)
+!                  This option is recommended.  
+!     NRNK = -2 :: i-th singular value sigma(i) is truncated
+!                  if sigma(i) <= TOL*sigma(i-1)
+!                  This option is included for R&D purposes.
+!                  It requires highly accurate SVD, which
+!                  may not be feasible.       
+!     The numerical rank can be enforced by using positive 
+!     value of NRNK as follows: 
+!     0 < NRNK <= N-1 :: at most NRNK largest singular values
+!     will be used. If the number of the computed nonzero
+!     singular values is less than NRNK, then only those
+!     nonzero values will be used and the actually used
+!     dimension is less than NRNK. The actual number of
+!     the nonzero singular values is returned in the variable
+!     K. See the description of K.
+!.....
+!     TOL (input) REAL(KIND=WP), 0 <= TOL < 1
+!     The tolerance for truncating small singular values.
+!     See the description of NRNK.  
+!.....
+!     K (output) INTEGER,  0 <= K <= N 
+!     The dimension of the SVD/POD basis for the leading N-1
+!     data snapshots (columns of F) and the number of the 
+!     computed Ritz pairs. The value of K is determined
+!     according to the rule set by the parameters NRNK and 
+!     TOL. See the descriptions of NRNK and TOL. 
+!.....
+!     REIG (output) REAL(KIND=WP) (N-1)-by-1 array
+!     The leading K (K<=N) entries of REIG contain
+!     the real parts of the computed eigenvalues
+!     REIG(1:K) + sqrt(-1)*IMEIG(1:K).
+!     See the descriptions of K, IMEIG, Z.
+!.....
+!     IMEIG (output) REAL(KIND=WP) (N-1)-by-1 array
+!     The leading K (K<N) entries of IMEIG contain
+!     the imaginary parts of the computed eigenvalues
+!     REIG(1:K) + sqrt(-1)*IMEIG(1:K).
+!     The eigenvalues are determined as follows:
+!     If IMEIG(i) == 0, then the corresponding eigenvalue is
+!     real, LAMBDA(i) = REIG(i).
+!     If IMEIG(i)>0, then the corresponding complex
+!     conjugate pair of eigenvalues reads
+!     LAMBDA(i)   = REIG(i) + sqrt(-1)*IMEIG(i)
+!     LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMEIG(i)
+!     That is, complex conjugate pairs have consecutive
+!     indices (i,i+1), with the positive imaginary part
+!     listed first.
+!     See the descriptions of K, REIG, Z.     
+!.....      
+!     Z (workspace/output) REAL(KIND=WP)  M-by-(N-1) array
+!     If JOBZ =='V' then
+!        Z contains real Ritz vectors as follows:
+!        If IMEIG(i)=0, then Z(:,i) is an eigenvector of
+!        the i-th Ritz value.
+!        If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then
+!        [Z(:,i) Z(:,i+1)] span an invariant subspace and
+!        the Ritz values extracted from this subspace are
+!        REIG(i) + sqrt(-1)*IMEIG(i) and
+!        REIG(i) - sqrt(-1)*IMEIG(i).
+!        The corresponding eigenvectors are
+!        Z(:,i) + sqrt(-1)*Z(:,i+1) and
+!        Z(:,i) - sqrt(-1)*Z(:,i+1), respectively.
+!     If JOBZ == 'F', then the above descriptions hold for
+!     the columns of Z*V, where the columns of V are the
+!     eigenvectors of the K-by-K Rayleigh quotient, and Z is
+!     orthonormal. The columns of V are similarly structured:
+!     If IMEIG(i) == 0 then Z*V(:,i) is an eigenvector, and if 
+!     IMEIG(i) > 0 then Z*V(:,i)+sqrt(-1)*Z*V(:,i+1) and
+!                       Z*V(:,i)-sqrt(-1)*Z*V(:,i+1)
+!     are the eigenvectors of LAMBDA(i), LAMBDA(i+1).
+!     See the descriptions of REIG, IMEIG, X and V.
+!.....
+!     LDZ (input) INTEGER , LDZ >= M
+!     The leading dimension of the array Z.
+!.....
+!     RES (output) REAL(KIND=WP) (N-1)-by-1 array
+!     RES(1:K) contains the residuals for the K computed 
+!     Ritz pairs.       
+!     If LAMBDA(i) is real, then
+!        RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i))||_2.
+!     If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair
+!     then
+!     RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F
+!     where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ]
+!               [-imag(LAMBDA(i)) real(LAMBDA(i)) ].
+!     It holds that
+!     RES(i)   = || A*ZC(:,i)   - LAMBDA(i)  *ZC(:,i)   ||_2
+!     RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2
+!     where ZC(:,i)   =  Z(:,i) + sqrt(-1)*Z(:,i+1)
+!           ZC(:,i+1) =  Z(:,i) - sqrt(-1)*Z(:,i+1)
+!     See the description of Z.
+!.....
+!     B (output) REAL(KIND=WP)  MIN(M,N)-by-(N-1) array.
+!     IF JOBF =='R', B(1:N,1:K) contains A*U(:,1:K), and can
+!     be used for computing the refined vectors; see further 
+!     details in the provided references. 
+!     If JOBF == 'E', B(1:N,1:K) contains
+!     A*U(:,1:K)*W(1:K,1:K), which are the vectors from the
+!     Exact DMD, up to scaling by the inverse eigenvalues.   
+!     In both cases, the content of B can be lifted to the 
+!     original dimension of the input data by pre-multiplying
+!     with the Q factor from the initial QR factorization.
+!     Here A denotes a compression of the underlying operator.
+!     See the descriptions of F and X.
+!     If JOBF =='N', then B is not referenced.
+!.....
+!     LDB (input) INTEGER, LDB >= MIN(M,N)
+!     The leading dimension of the array B.
+!.....      
+!     V (workspace/output) REAL(KIND=WP) (N-1)-by-(N-1) array
+!     On exit, V(1:K,1:K) contains the K eigenvectors of
+!     the Rayleigh quotient. The eigenvectors of a complex
+!     conjugate pair of eigenvalues are returned in real form
+!     as explained in the description of Z. The Ritz vectors
+!     (returned in Z) are the product of X and V; see
+!     the descriptions of X and Z.
+!.....
+!     LDV (input) INTEGER, LDV >= N-1
+!     The leading dimension of the array V.
+!.....      
+!     S (output) REAL(KIND=WP) (N-1)-by-(N-1) array
+!     The array S(1:K,1:K) is used for the matrix Rayleigh
+!     quotient. This content is overwritten during
+!     the eigenvalue decomposition by DGEEV.
+!     See the description of K.
+!.....
+!     LDS (input) INTEGER, LDS >= N-1        
+!     The leading dimension of the array S.
+!.....
+!     WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array
+!     On exit, 
+!     WORK(1:MIN(M,N)) contains the scalar factors of the 
+!     elementary reflectors as returned by DGEQRF of the 
+!     M-by-N input matrix F.
+!     WORK(MIN(M,N)+1:MIN(M,N)+N-1) contains the singular values of 
+!     the input submatrix F(1:M,1:N-1).
+!     If the call to DGEDMDQ is only workspace query, then
+!     WORK(1) contains the minimal workspace length and
+!     WORK(2) is the optimal workspace length. Hence, the
+!     length of work is at least 2.
+!     See the description of LWORK.
+!.....
+!     LWORK (input) INTEGER
+!     The minimal length of the  workspace vector WORK.
+!     LWORK is calculated as follows:
+!     Let MLWQR  = N (minimal workspace for DGEQRF[M,N])
+!         MLWDMD = minimal workspace for DGEDMD (see the
+!                  description of LWORK in DGEDMD) for 
+!                  snapshots of dimensions MIN(M,N)-by-(N-1)
+!         MLWMQR = N (minimal workspace for 
+!                    DORMQR['L','N',M,N,N])
+!         MLWGQR = N (minimal workspace for DORGQR[M,N,N])
+!     Then
+!     LWORK = MAX(N+MLWQR, N+MLWDMD)
+!     is updated as follows:
+!        if   JOBZ == 'V' or JOBZ == 'F' THEN 
+!             LWORK = MAX( LWORK, MIN(M,N)+N-1+MLWMQR )
+!        if   JOBQ == 'Q' THEN
+!             LWORK = MAX( LWORK, MIN(M,N)+N-1+MLWGQR)
+!     If on entry LWORK = -1, then a workspace query is
+!     assumed and the procedure only computes the minimal
+!     and the optimal workspace lengths for both WORK and
+!     IWORK. See the descriptions of WORK and IWORK.          
+!.....
+!     IWORK (workspace/output) INTEGER LIWORK-by-1 array
+!     Workspace that is required only if WHTSVD equals
+!     2 , 3 or 4. (See the description of WHTSVD).
+!     If on entry LWORK =-1 or LIWORK=-1, then the
+!     minimal length of IWORK is computed and returned in
+!     IWORK(1). See the description of LIWORK.
+!.....
+!     LIWORK (input) INTEGER
+!     The minimal length of the workspace vector IWORK.
+!     If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1
+!     Let M1=MIN(M,N), N1=N-1. Then    
+!     If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M1,N1))
+!     If WHTSVD == 3, then LIWORK >= MAX(1,M1+N1-1)
+!     If WHTSVD == 4, then LIWORK >= MAX(3,M1+3*N1)
+!     If on entry LIWORK = -1, then a workspace query is
+!     assumed and the procedure only computes the minimal
+!     and the optimal workspace lengths for both WORK and
+!     IWORK. See the descriptions of WORK and IWORK.
+!..... 
+!     INFO (output) INTEGER
+!     -i < 0 :: On entry, the i-th argument had an
+!               illegal value
+!        = 0 :: Successful return.
+!        = 1 :: Void input. Quick exit (M=0, N=0 or N=1).
+!        = 2 :: The SVD computation of X did not converge.
+!               Suggestion: Check the input data and/or
+!               repeat with different WHTSVD.
+!        = 3 :: The computation of the eigenvalues did not
+!               converge.
+!        = 4 :: If data scaling was requested on input and
+!               the procedure found inconsistency in the data
+!               such that for some column index i,
+!               X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set
+!               to zero if JOBS=='C'. The computation proceeds
+!               with original or modified data and warning
+!               flag is set with INFO=4.  
+!.............................................................
+!.............................................................
+!     Parameters
+!     ~~~~~~~~~~      
+      REAL(KIND=WP), PARAMETER ::  ONE = 1.0_WP
+      REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP
+!      
+!     Local scalars      
+!     ~~~~~~~~~~~~~
+      INTEGER           :: IMINWR, INFO1,  MLWDMD, MLWGQR, &
+                           MLWMQR, MLWORK, MLWQR,  MINMN,  & 
+                           OLWDMD, OLWGQR, OLWMQR, OLWORK, &
+                           OLWQR
+      LOGICAL           :: LQUERY, SCCOLX, SCCOLY, WANTQ,  &
+                           WNTTRF, WNTRES, WNTVEC, WNTVCF, &
+                           WNTVCQ, WNTREF, WNTEX
+      CHARACTER(LEN=1)  :: JOBVL
+!      
+!     Local array      
+!     ~~~~~~~~~~~      
+      REAL(KIND=WP) :: RDUMMY(2)
+!      
+!     External functions (BLAS and LAPACK)
+!     ~~~~~~~~~~~~~~~~~
+      LOGICAL       LSAME
+      EXTERNAL      LSAME 
+!
+!     External subroutines (BLAS and LAPACK)
+!     ~~~~~~~~~~~~~~~~~~~~
+      EXTERNAL      DGEMM 
+      EXTERNAL      DGEQRF, DLACPY, DLASET, DORGQR, & 
+                    DORMQR, XERBLA
+
+!     External subroutines
+!     ~~~~~~~~~~~~~~~~~~~~
+      EXTERNAL      DGEDMD 
+      
+!     Intrinsic functions
+!     ~~~~~~~~~~~~~~~~~~~
+      INTRINSIC      MAX, MIN, INT         
+ !..........................................................  
+ !
+ !    Test the input arguments    
+      WNTRES = LSAME(JOBR,'R')
+      SCCOLX = LSAME(JOBS,'S') .OR. LSAME( JOBS, 'C' )
+      SCCOLY = LSAME(JOBS,'Y')
+      WNTVEC = LSAME(JOBZ,'V')
+      WNTVCF = LSAME(JOBZ,'F')
+      WNTVCQ = LSAME(JOBZ,'Q')
+      WNTREF = LSAME(JOBF,'R')
+      WNTEX  = LSAME(JOBF,'E')
+      WANTQ  = LSAME(JOBQ,'Q')
+      WNTTRF = LSAME(JOBT,'R')
+      MINMN  = MIN(M,N)
+      INFO = 0
+      LQUERY = ( ( LWORK == -1 ) .OR. ( LIWORK == -1 ) )
+!     INFO = -i below flags the i-th dummy argument as illegal.
+      IF ( .NOT. (SCCOLX .OR. SCCOLY .OR.                &
+                                  LSAME(JOBS,'N')) )  THEN
+          INFO = -1
+      ELSE IF ( .NOT. (WNTVEC .OR. WNTVCF .OR. WNTVCQ    &
+                              .OR. LSAME(JOBZ,'N')) ) THEN
+          INFO = -2
+      ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR.    &
+          ( WNTRES .AND. LSAME(JOBZ,'N') ) ) THEN
+          INFO = -3
+      ELSE IF ( .NOT. (WANTQ .OR. LSAME(JOBQ,'N')) ) THEN
+          INFO = -4
+      ELSE IF ( .NOT. ( WNTTRF .OR. LSAME(JOBT,'N') ) )  THEN
+          INFO = -5
+      ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR.             &
+                LSAME(JOBF,'N') ) )                    THEN
+          INFO = -6
+      ELSE IF ( .NOT. ((WHTSVD == 1).OR.(WHTSVD == 2).OR.   &
+                       (WHTSVD == 3).OR.(WHTSVD == 4)) ) THEN
+          INFO = -7
+      ELSE IF ( M < 0 ) THEN
+          INFO = -8
+      ELSE IF ( ( N < 0 ) .OR. ( N > M+1 ) ) THEN
+          INFO = -9
+      ELSE IF ( LDF < M ) THEN
+          INFO = -11
+      ELSE IF ( LDX < MINMN ) THEN
+          INFO = -13
+      ELSE IF ( LDY < MINMN ) THEN
+          INFO = -15
+      ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR.    &
+                       ((NRNK >= 1).AND.(NRNK <=N ))) )  THEN
+          INFO = -16
+      ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) ) THEN
+          INFO = -17
+      ELSE IF ( LDZ < M ) THEN
+          INFO = -22
+      ELSE IF ( (WNTREF.OR.WNTEX ).AND.( LDB < MINMN ) ) THEN
+          INFO = -25
+      ELSE IF ( LDV < N-1 ) THEN
+          INFO = -27
+      ELSE IF ( LDS < N-1 ) THEN
+          INFO = -29
+      END IF
+!     DGEDMD is asked for vectors ('V') for JOBZ = 'V', 'F' or 'Q'.
+      IF ( WNTVEC .OR. WNTVCF .OR. WNTVCQ ) THEN
+          JOBVL = 'V'
+      ELSE
+          JOBVL = 'N'
+      END IF
+      IF ( INFO == 0 ) THEN
+          ! Compute the minimal and the optimal workspace
+          ! requirements. Simulate running the code and
+          ! determine minimal and optimal sizes of the
+          ! workspace at any moment of the run.
+         IF ( ( N == 0 ) .OR. ( N == 1 ) ) THEN
+             ! All output except K is void. INFO=1 signals
+             ! the void input. In case of a workspace query,
+             ! the minimal workspace lengths are returned.
+            IF ( LQUERY ) THEN
+               IWORK(1) = 1
+                WORK(1) = 2
+                WORK(2) = 2
+            ELSE
+               K = 0
+            END IF
+            INFO = 1
+            RETURN
+         END IF
+         MLWQR  = MAX(1,N)  ! Minimal workspace length for DGEQRF.
+         MLWORK = MINMN + MLWQR
+         IF ( LQUERY ) THEN
+             CALL DGEQRF( M, N, F, LDF, WORK, RDUMMY, -1, &
+                          INFO1 )
+             OLWQR  = INT(RDUMMY(1))
+             OLWORK = MIN(M,N) + OLWQR
+         END IF
+         CALL DGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN,&
+                      N-1, X, LDX, Y, LDY, NRNK, TOL, K,     &
+                      REIG, IMEIG, Z, LDZ, RES,  B, LDB,     &
+                      V, LDV, S, LDS, WORK, -1, IWORK,       &
+                      LIWORK, INFO1 )
+         MLWDMD = INT(WORK(1))
+         MLWORK = MAX(MLWORK, MINMN + MLWDMD)
+         IMINWR = IWORK(1)
+         IF ( LQUERY ) THEN
+             OLWDMD = INT(WORK(2))
+             OLWORK = MAX(OLWORK, MINMN+OLWDMD)
+         END IF
+         IF ( WNTVEC .OR. WNTVCF ) THEN
+            MLWMQR = MAX(1,N)
+            MLWORK = MAX(MLWORK,MINMN+N-1+MLWMQR)
+            IF ( LQUERY ) THEN
+               CALL DORMQR( 'L','N', M, N, MINMN, F, LDF,  &
+                            WORK, Z, LDZ, WORK, -1, INFO1 )
+               OLWMQR = INT(WORK(1))
+               OLWORK = MAX(OLWORK,MINMN+N-1+OLWMQR)
+            END IF
+         END IF
+         IF ( WANTQ ) THEN
+            MLWGQR = N
+            MLWORK = MAX(MLWORK,MINMN+N-1+MLWGQR)
+            IF ( LQUERY ) THEN
+                CALL DORGQR( M, MINMN, MINMN, F, LDF, WORK, &
+                             WORK, -1, INFO1 )
+                OLWGQR = INT(WORK(1))
+                OLWORK = MAX(OLWORK,MINMN+N-1+OLWGQR)
+            END IF
+         END IF
+         IMINWR = MAX( 1, IMINWR )
+         MLWORK = MAX( 2, MLWORK )
+         IF (  LWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -31
+         IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -33
+      END IF
+      IF( INFO /= 0 ) THEN
+         CALL XERBLA( 'DGEDMDQ', -INFO )
+         RETURN
+      ELSE IF ( LQUERY ) THEN
+!     Return minimal and optimal workspace sizes
+          IWORK(1) = IMINWR
+          WORK(1)  = MLWORK
+          WORK(2)  = OLWORK
+          RETURN
+      END IF
+!.....
+!     Initial QR factorization that is used to represent the
+!     snapshots as elements of lower dimensional subspace.
+!     For large scale computation with M >>N , at this place
+!     one can use an out of core QRF.
+!     WORK(1:MINMN) receives the scalar factors of the reflectors.
+      CALL DGEQRF( M, N, F, LDF, WORK,               &
+                   WORK(MINMN+1), LWORK-MINMN, INFO1 )
+!
+!     Define X and Y as the snapshots representations in the
+!     orthogonal basis computed in the QR factorization.
+!     X corresponds to the leading N-1 and Y to the trailing
+!     N-1 snapshots.
+      CALL DLASET( 'L', MINMN, N-1, ZERO,  ZERO, X, LDX )
+      CALL DLACPY( 'U', MINMN, N-1, F,      LDF, X, LDX )
+      CALL DLACPY( 'A', MINMN, N-1, F(1,2), LDF, Y, LDY )
+      IF ( MINMN >= 3 ) THEN
+          CALL DLASET( 'L', MINMN-2, N-2, ZERO,  ZERO, &
+                       Y(3,1), LDY )
+      END IF
+!     (Guarded by MINMN so Y(3,1) is referenced only if row 3 exists.)
+!     Compute the DMD of the projected snapshot pairs (X,Y)
+      CALL DGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN,  &
+                   N-1, X, LDX, Y, LDY, NRNK,   TOL, K,     &
+                   REIG, IMEIG, Z, LDZ, RES, B, LDB, V,     &
+                   LDV, S, LDS, WORK(MINMN+1), LWORK-MINMN, &
+                   IWORK, LIWORK, INFO1 )
+      IF ( INFO1 == 2 .OR. INFO1 == 3 ) THEN
+          ! Return with error code. See DGEDMD for details.
+          INFO = INFO1
+          RETURN
+      ELSE
+          INFO = INFO1
+      END IF
+!
+!     The Ritz vectors (Koopman modes) can be explicitly
+!     formed or returned in factored form.
+      IF ( WNTVEC ) THEN
+        ! Compute the eigenvectors explicitly.
+        IF ( M > MINMN ) CALL DLASET( 'A', M-MINMN, K, ZERO, &
+                                     ZERO, Z(MINMN+1,1), LDZ )
+        CALL DORMQR( 'L','N', M, K, MINMN, F, LDF, WORK, Z,  &
+             LDZ, WORK(MINMN+N), LWORK-(MINMN+N-1), INFO1 )
+      ELSE IF ( WNTVCF ) THEN
+        !   Return the Ritz vectors (eigenvectors) in factored
+        !   form Z*V: Z is orthonormal (Q from the initial QR
+        !   factorization times the SVD/POD basis returned by
+        !   DGEDMD in X) and V holds the eigenvectors of the
+        !   Rayleigh quotient. X has MINMN rows, so only MINMN
+        !   rows are copied (MINMN < N when N = M+1).
+        CALL DLACPY( 'A', MINMN, K, X, LDX, Z, LDZ )
+        IF ( M > MINMN ) CALL DLASET( 'A', M-MINMN, K, ZERO, &
+                                     ZERO, Z(MINMN+1,1), LDZ )
+        CALL DORMQR( 'L','N', M, K, MINMN, F, LDF, WORK, Z,  &
+             LDZ, WORK(MINMN+N), LWORK-(MINMN+N-1), INFO1 )
+      END IF
+!
+!     Some optional output variables:
+!
+!     The upper triangular factor R in the initial QR
+!     factorization is optionally returned in the array Y.
+!     This is useful if this call to DGEDMDQ is to be
+!     followed by a streaming DMD that is implemented in a
+!     QR compressed form.
+      IF ( WNTTRF ) THEN ! Return the upper triangular R in Y
+         CALL DLASET( 'A', MINMN, N, ZERO,  ZERO, Y, LDY )
+         CALL DLACPY( 'U', MINMN, N, F, LDF,      Y, LDY )
+      END IF
+!
+!     The orthonormal/orthogonal factor Q in the initial QR
+!     factorization is optionally returned in the array F.
+!     Same as with the triangular factor above, this is
+!     useful in a streaming DMD.
+      IF ( WANTQ ) THEN               ! Q overwrites F
+         CALL DORGQR( M, MINMN, MINMN, F, LDF, WORK, &
+              WORK(MINMN+N), LWORK-(MINMN+N-1), INFO1 )
+      END IF
+!
+      RETURN
+!
+      END SUBROUTINE DGEDMDQ
+    
\ No newline at end of file
diff --git a/lapack-netlib/SRC/dgehrd.f b/lapack-netlib/SRC/dgehrd.f
index a40c61cb6..d95bbd182 100644
--- a/lapack-netlib/SRC/dgehrd.f
+++ b/lapack-netlib/SRC/dgehrd.f
@@ -89,7 +89,7 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is DOUBLE PRECISION array, dimension (LWORK)
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
@@ -120,7 +120,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleGEcomputational
+*> \ingroup gehrd
 *
 *> \par Further Details:
 *  =====================
@@ -173,7 +173,7 @@
       INTEGER            IHI, ILO, INFO, LDA, LWORK, N
 *     ..
 *     .. Array Arguments ..
-      DOUBLE PRECISION  A( LDA, * ), TAU( * ), WORK( * )
+      DOUBLE PRECISION   A( LDA, * ), TAU( * ), WORK( * )
 *     ..
 *
 *  =====================================================================
@@ -182,7 +182,7 @@
       INTEGER            NBMAX, LDT, TSIZE
       PARAMETER          ( NBMAX = 64, LDT = NBMAX+1,
      $                     TSIZE = LDT*NBMAX )
-      DOUBLE PRECISION  ZERO, ONE
+      DOUBLE PRECISION   ZERO, ONE
       PARAMETER          ( ZERO = 0.0D+0,
      $                     ONE = 1.0D+0 )
 *     ..
@@ -190,7 +190,7 @@
       LOGICAL            LQUERY
       INTEGER            I, IB, IINFO, IWT, J, LDWORK, LWKOPT, NB,
      $                   NBMIN, NH, NX
-      DOUBLE PRECISION  EI
+      DOUBLE PRECISION   EI
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           DAXPY, DGEHD2, DGEMM, DLAHR2, DLARFB, DTRMM,
@@ -221,12 +221,18 @@
          INFO = -8
       END IF
 *
+      NH = IHI - ILO + 1
       IF( INFO.EQ.0 ) THEN
 *
 *        Compute the workspace requirements
 *
-         NB = MIN( NBMAX, ILAENV( 1, 'DGEHRD', ' ', N, ILO, IHI, -1 ) )
-         LWKOPT = N*NB + TSIZE
+         IF( NH.LE.1 ) THEN
+            LWKOPT = 1
+         ELSE
+            NB = MIN( NBMAX, ILAENV( 1, 'DGEHRD', ' ', N, ILO, IHI,
+     $                              -1 ) )
+            LWKOPT = N*NB + TSIZE
+         ENDIF
          WORK( 1 ) = LWKOPT
       END IF
 *
@@ -248,7 +254,6 @@
 *
 *     Quick return if possible
 *
-      NH = IHI - ILO + 1
       IF( NH.LE.1 ) THEN
          WORK( 1 ) = 1
          RETURN
@@ -268,7 +273,7 @@
 *
 *           Determine if workspace is large enough for blocked code
 *
-            IF( LWORK.LT.N*NB+TSIZE ) THEN
+            IF( LWORK.LT.LWKOPT ) THEN
 *
 *              Not enough workspace to use optimal NB:  determine the
 *              minimum value of NB, and reduce NB or force use of
@@ -344,6 +349,7 @@
 *     Use unblocked code to reduce the rest of the matrix
 *
       CALL DGEHD2( N, I, IHI, A, LDA, TAU, WORK, IINFO )
+*
       WORK( 1 ) = LWKOPT
 *
       RETURN
diff --git a/lapack-netlib/SRC/dgejsv.f b/lapack-netlib/SRC/dgejsv.f
index 83d16c30e..1db85e9c2 100644
--- a/lapack-netlib/SRC/dgejsv.f
+++ b/lapack-netlib/SRC/dgejsv.f
@@ -253,7 +253,7 @@
 *>          If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of
 *>                         the right singular vectors;
 *>          If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N),
-*>                         then V is used as workspace if the pprocedure
+*>                         then V is used as workspace if the procedure
 *>                         replaces A with A^t. In that case, [U] is computed
 *>                         in V as right singular vectors of A^t and then
 *>                         copied back to the U array. This 'W' option is just
@@ -362,7 +362,7 @@
 *>
 *> \param[out] IWORK
 *> \verbatim
-*>          IWORK is INTEGER array, dimension (M+3*N).
+*>          IWORK is INTEGER array, dimension (MAX(3,M+3*N)).
 *>          On exit,
 *>          IWORK(1) = the numerical rank determined after the initial
 *>                     QR factorization with pivoting. See the descriptions
@@ -1386,7 +1386,7 @@
                IF ( CONDR2 .GE. COND_OK ) THEN
 *                 .. save the Householder vectors used for Q3
 *                 (this overwrites the copy of R2, as it will not be
-*                 needed in this branch, but it does not overwritte the
+*                 needed in this branch, but it does not overwrite the
 *                 Huseholder vectors of Q2.).
                   CALL DLACPY( 'U', NR, NR, V, LDV, WORK(2*N+1), N )
 *                 .. and the rest of the information on Q3 is in
@@ -1409,7 +1409,7 @@
             END IF
 *
 *        Second preconditioning finished; continue with Jacobi SVD
-*        The input matrix is lower trinagular.
+*        The input matrix is lower triangular.
 *
 *        Recover the right singular vectors as solution of a well
 *        conditioned triangular matrix equation.
@@ -1454,7 +1454,7 @@
 * :)           .. the input matrix A is very likely a relative of
 *              the Kahan matrix :)
 *              The matrix R2 is inverted. The solution of the matrix equation
-*              is Q3^T*V3 = the product of the Jacobi rotations (appplied to
+*              is Q3^T*V3 = the product of the Jacobi rotations (applied to
 *              the lower triangular L3 from the LQ factorization of
 *              R2=L3*Q3), pre-multiplied with the transposed Q3.
                CALL DGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U,
diff --git a/lapack-netlib/SRC/dgelq.f b/lapack-netlib/SRC/dgelq.f
index 013b6c356..255e8732f 100644
--- a/lapack-netlib/SRC/dgelq.f
+++ b/lapack-netlib/SRC/dgelq.f
@@ -98,7 +98,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1 or -2, then a workspace query is assumed. The routine
 *>          only calculates the sizes of the T and WORK arrays, returns these
 *>          values as the first entries of the T and WORK arrays, and no error
@@ -166,6 +166,8 @@
 *>  the LQ factorization.
 *> \endverbatim
 *>
+*> \ingroup gelq
+*>
 *  =====================================================================
       SUBROUTINE DGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK,
      $                  INFO )
diff --git a/lapack-netlib/SRC/dgelqf.f b/lapack-netlib/SRC/dgelqf.f
index ed3372f96..f0eb00a55 100644
--- a/lapack-netlib/SRC/dgelqf.f
+++ b/lapack-netlib/SRC/dgelqf.f
@@ -93,7 +93,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= max(1,M).
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M, otherwise.
 *>          For optimum performance LWORK >= M*NB, where NB is the
 *>          optimal blocksize.
 *>
@@ -118,7 +119,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleGEcomputational
+*> \ingroup gelqf
 *
 *> \par Further Details:
 *  =====================
@@ -174,9 +175,8 @@
 *     Test the input arguments
 *
       INFO = 0
+      K = MIN( M, N )
       NB = ILAENV( 1, 'DGELQF', ' ', M, N, -1, -1 )
-      LWKOPT = M*NB
-      WORK( 1 ) = LWKOPT
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -184,19 +184,25 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN
-         INFO = -7
+      ELSE IF( .NOT.LQUERY ) THEN
+         IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) )
+     $      INFO = -7
       END IF
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'DGELQF', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
+         IF( K.EQ.0 ) THEN
+            LWKOPT = 1
+         ELSE
+            LWKOPT = M*NB
+         END IF
+         WORK( 1 ) = LWKOPT
          RETURN
       END IF
 *
 *     Quick return if possible
 *
-      K = MIN( M, N )
       IF( K.EQ.0 ) THEN
          WORK( 1 ) = 1
          RETURN
diff --git a/lapack-netlib/SRC/dgelqt3.f b/lapack-netlib/SRC/dgelqt3.f
index 5bcc06a80..ee3bdceb4 100644
--- a/lapack-netlib/SRC/dgelqt3.f
+++ b/lapack-netlib/SRC/dgelqt3.f
@@ -173,7 +173,8 @@
 *
 *        Compute Householder transform when M=1
 *
-         CALL DLARFG( N, A, A( 1, MIN( 2, N ) ), LDA, T )
+         CALL DLARFG( N, A ( 1, 1 ), A( 1, MIN( 2, N ) ), LDA,
+     &                T( 1, 1) )
 *
       ELSE
 *
diff --git a/lapack-netlib/SRC/dgelsd.f b/lapack-netlib/SRC/dgelsd.f
index b3b3d8b2d..7dc564f48 100644
--- a/lapack-netlib/SRC/dgelsd.f
+++ b/lapack-netlib/SRC/dgelsd.f
@@ -59,12 +59,6 @@
 *> singular values which are less than RCOND times the largest singular
 *> value.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
@@ -194,7 +188,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleGEsolve
+*> \ingroup gelsd
 *
 *> \par Contributors:
 *  ==================
@@ -234,7 +228,7 @@
       DOUBLE PRECISION   ANRM, BIGNUM, BNRM, EPS, SFMIN, SMLNUM
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DGEBRD, DGELQF, DGEQRF, DLABAD, DLACPY, DLALSD,
+      EXTERNAL           DGEBRD, DGELQF, DGEQRF, DLACPY, DLALSD,
      $                   DLASCL, DLASET, DORMBR, DORMLQ, DORMQR, XERBLA
 *     ..
 *     .. External Functions ..
@@ -282,7 +276,7 @@
      $       LOG( TWO ) ) + 1, 0 )
 *
       IF( INFO.EQ.0 ) THEN
-         MAXWRK = 0
+         MAXWRK = 1
          LIWORK = 3*MINMN*NLVL + 11*MINMN
          MM = M
          IF( M.GE.N .AND. M.GE.MNTHR ) THEN
@@ -378,7 +372,6 @@
       SFMIN = DLAMCH( 'S' )
       SMLNUM = SFMIN / EPS
       BIGNUM = ONE / SMLNUM
-      CALL DLABAD( SMLNUM, BIGNUM )
 *
 *     Scale A if max entry outside range [SMLNUM,BIGNUM].
 *
diff --git a/lapack-netlib/SRC/dgelss.f b/lapack-netlib/SRC/dgelss.f
index c4190f2e0..38449be7f 100644
--- a/lapack-netlib/SRC/dgelss.f
+++ b/lapack-netlib/SRC/dgelss.f
@@ -164,7 +164,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleGEsolve
+*> \ingroup gelss
 *
 *  =====================================================================
       SUBROUTINE DGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK,
@@ -203,7 +203,7 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           DBDSQR, DCOPY, DGEBRD, DGELQF, DGEMM, DGEMV,
-     $                   DGEQRF, DLABAD, DLACPY, DLASCL, DLASET, DORGBR,
+     $                   DGEQRF, DLACPY, DLASCL, DLASET, DORGBR,
      $                   DORMBR, DORMLQ, DORMQR, DRSCL, XERBLA
 *     ..
 *     .. External Functions ..
@@ -385,7 +385,6 @@
       SFMIN = DLAMCH( 'S' )
       SMLNUM = SFMIN / EPS
       BIGNUM = ONE / SMLNUM
-      CALL DLABAD( SMLNUM, BIGNUM )
 *
 *     Scale A if max element outside range [SMLNUM,BIGNUM]
 *
@@ -529,7 +528,7 @@
      $                     LDB, ZERO, WORK, N )
                CALL DLACPY( 'G', N, BL, WORK, N, B( 1, I ), LDB )
    20       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
             CALL DGEMV( 'T', N, N, ONE, A, LDA, B, 1, ZERO, WORK, 1 )
             CALL DCOPY( N, WORK, 1, B, 1 )
          END IF
@@ -626,7 +625,7 @@
                CALL DLACPY( 'G', M, BL, WORK( IWORK ), M, B( 1, I ),
      $                      LDB )
    40       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
             CALL DGEMV( 'T', M, M, ONE, WORK( IL ), LDWORK, B( 1, 1 ),
      $                  1, ZERO, WORK( IWORK ), 1 )
             CALL DCOPY( M, WORK( IWORK ), 1, B( 1, 1 ), 1 )
@@ -712,7 +711,7 @@
      $                     LDB, ZERO, WORK, N )
                CALL DLACPY( 'F', N, BL, WORK, N, B( 1, I ), LDB )
    60       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
             CALL DGEMV( 'T', M, N, ONE, A, LDA, B, 1, ZERO, WORK, 1 )
             CALL DCOPY( N, WORK, 1, B, 1 )
          END IF
diff --git a/lapack-netlib/SRC/dgelsy.f b/lapack-netlib/SRC/dgelsy.f
index aebab9264..e9fcd9682 100644
--- a/lapack-netlib/SRC/dgelsy.f
+++ b/lapack-netlib/SRC/dgelsy.f
@@ -115,6 +115,7 @@
 *>          B is DOUBLE PRECISION array, dimension (LDB,NRHS)
 *>          On entry, the M-by-NRHS right hand side matrix B.
 *>          On exit, the N-by-NRHS solution matrix X.
+*>          If M = 0 or N = 0, B is not referenced.
 *> \endverbatim
 *>
 *> \param[in] LDB
@@ -147,6 +148,7 @@
 *>          The effective rank of A, i.e., the order of the submatrix
 *>          R11.  This is the same as the order of the submatrix T11
 *>          in the complete orthogonal factorization of A.
+*>          If NRHS = 0, RANK = 0 on output.
 *> \endverbatim
 *>
 *> \param[out] WORK
diff --git a/lapack-netlib/SRC/dgemlq.f b/lapack-netlib/SRC/dgemlq.f
index 3ba209105..757683f46 100644
--- a/lapack-netlib/SRC/dgemlq.f
+++ b/lapack-netlib/SRC/dgemlq.f
@@ -111,16 +111,17 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*>          (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1, then a workspace query is assumed. The routine
 *>          only calculates the size of the WORK array, returns this
-*>          value as WORK(1), and no error message related to WORK 
+*>          value as WORK(1), and no error message related to WORK
 *>          is issued by XERBLA.
 *> \endverbatim
 *>
@@ -144,7 +145,7 @@
 *>
 *> \verbatim
 *>
-*> These details are particular for this LAPACK implementation. Users should not 
+*> These details are particular for this LAPACK implementation. Users should not
 *> take them for granted. These details may change in the future, and are not likely
 *> true for another LAPACK implementation. These details are relevant if one wants
 *> to try to understand the code. They are not part of the interface.
@@ -160,11 +161,13 @@
 *>  block sizes MB and NB returned by ILAENV, DGELQ will use either
 *>  DLASWLQ (if the matrix is wide-and-short) or DGELQT to compute
 *>  the LQ factorization.
-*>  This version of DGEMLQ will use either DLAMSWLQ or DGEMLQT to 
+*>  This version of DGEMLQ will use either DLAMSWLQ or DGEMLQT to
 *>  multiply matrix Q by another matrix.
 *>  Further Details in DLAMSWLQ or DGEMLQT.
 *> \endverbatim
 *>
+*> \ingroup gemlq
+*>
 *  =====================================================================
       SUBROUTINE DGEMLQ( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE,
      $                   C, LDC, WORK, LWORK, INFO )
@@ -186,7 +189,7 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LEFT, RIGHT, TRAN, NOTRAN, LQUERY
-      INTEGER            MB, NB, LW, NBLCKS, MN
+      INTEGER            MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -202,7 +205,7 @@
 *
 *     Test the input arguments
 *
-      LQUERY  = LWORK.EQ.-1
+      LQUERY  = ( LWORK.EQ.-1 )
       NOTRAN  = LSAME( TRANS, 'N' )
       TRAN    = LSAME( TRANS, 'T' )
       LEFT    = LSAME( SIDE, 'L' )
@@ -217,6 +220,13 @@
         LW = M * MB
         MN = N
       END IF
+*
+      MINMNK = MIN( M, N, K )
+      IF( MINMNK.EQ.0 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = MAX( 1, LW )
+      END IF
 *
       IF( ( NB.GT.K ) .AND. ( MN.GT.K ) ) THEN
         IF( MOD( MN - K, NB - K ) .EQ. 0 ) THEN
@@ -245,12 +255,12 @@
         INFO = -9
       ELSE IF( LDC.LT.MAX( 1, M ) ) THEN
         INFO = -11
-      ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
         INFO = -13
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-        WORK( 1 ) = LW
+        WORK( 1 ) = LWMIN
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -262,7 +272,7 @@
 *
 *     Quick return if possible
 *
-      IF( MIN( M, N, K ).EQ.0 ) THEN
+      IF( MINMNK.EQ.0 ) THEN
         RETURN
       END IF
 *
@@ -275,7 +285,7 @@
      $                 MB, C, LDC, WORK, LWORK, INFO )
       END IF
 *
-      WORK( 1 ) = LW
+      WORK( 1 ) = LWMIN
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/dgemqr.f b/lapack-netlib/SRC/dgemqr.f
index 022cf21e4..608815483 100644
--- a/lapack-netlib/SRC/dgemqr.f
+++ b/lapack-netlib/SRC/dgemqr.f
@@ -111,16 +111,17 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*>          (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1, then a workspace query is assumed. The routine
 *>          only calculates the size of the WORK array, returns this
-*>          value as WORK(1), and no error message related to WORK 
+*>          value as WORK(1), and no error message related to WORK
 *>          is issued by XERBLA.
 *> \endverbatim
 *>
@@ -144,7 +145,7 @@
 *>
 *> \verbatim
 *>
-*> These details are particular for this LAPACK implementation. Users should not 
+*> These details are particular for this LAPACK implementation. Users should not
 *> take them for granted. These details may change in the future, and are not likely
 *> true for another LAPACK implementation. These details are relevant if one wants
 *> to try to understand the code. They are not part of the interface.
@@ -160,12 +161,14 @@
 *>  block sizes MB and NB returned by ILAENV, DGEQR will use either
 *>  DLATSQR (if the matrix is tall-and-skinny) or DGEQRT to compute
 *>  the QR factorization.
-*>  This version of DGEMQR will use either DLAMTSQR or DGEMQRT to 
+*>  This version of DGEMQR will use either DLAMTSQR or DGEMQRT to
 *>  multiply matrix Q by another matrix.
 *>  Further Details in DLATMSQR or DGEMQRT.
 *>
 *> \endverbatim
 *>
+*> \ingroup gemqr
+*>
 *  =====================================================================
       SUBROUTINE DGEMQR( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE,
      $                   C, LDC, WORK, LWORK, INFO )
@@ -187,7 +190,7 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LEFT, RIGHT, TRAN, NOTRAN, LQUERY
-      INTEGER            MB, NB, LW, NBLCKS, MN
+      INTEGER            MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -203,7 +206,7 @@
 *
 *     Test the input arguments
 *
-      LQUERY  = LWORK.EQ.-1
+      LQUERY  = ( LWORK.EQ.-1 )
       NOTRAN  = LSAME( TRANS, 'N' )
       TRAN    = LSAME( TRANS, 'T' )
       LEFT    = LSAME( SIDE, 'L' )
@@ -218,6 +221,13 @@
         LW = MB * NB
         MN = N
       END IF
+*
+      MINMNK = MIN( M, N, K )
+      IF( MINMNK.EQ.0 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = MAX( 1, LW )
+      END IF
 *
       IF( ( MB.GT.K ) .AND. ( MN.GT.K ) ) THEN
         IF( MOD( MN - K, MB - K ).EQ.0 ) THEN
@@ -246,12 +256,12 @@
         INFO = -9
       ELSE IF( LDC.LT.MAX( 1, M ) ) THEN
         INFO = -11
-      ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
         INFO = -13
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-        WORK( 1 ) = LW
+        WORK( 1 ) = LWMIN
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -263,7 +273,7 @@
 *
 *     Quick return if possible
 *
-      IF( MIN( M, N, K ).EQ.0 ) THEN
+      IF( MINMNK.EQ.0 ) THEN
         RETURN
       END IF
 *
@@ -276,7 +286,7 @@
      $                 NB, C, LDC, WORK, LWORK, INFO )
       END IF
 *
-      WORK( 1 ) = LW
+      WORK( 1 ) = LWMIN
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/dgeqlf.f b/lapack-netlib/SRC/dgeqlf.f
index b8ac0b1a0..a72d9dc76 100644
--- a/lapack-netlib/SRC/dgeqlf.f
+++ b/lapack-netlib/SRC/dgeqlf.f
@@ -88,7 +88,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= max(1,N).
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise.
 *>          For optimum performance LWORK >= N*NB, where NB is the
 *>          optimal blocksize.
 *>
@@ -113,7 +114,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleGEcomputational
+*> \ingroup geqlf
 *
 *> \par Further Details:
 *  =====================
@@ -188,8 +189,9 @@
          END IF
          WORK( 1 ) = LWKOPT
 *
-         IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN
-            INFO = -7
+         IF( .NOT.LQUERY ) THEN
+            IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) )
+     $         INFO = -7
          END IF
       END IF
 *
diff --git a/lapack-netlib/SRC/dgeqp3rk.c b/lapack-netlib/SRC/dgeqp3rk.c
new file mode 100644
index 000000000..17a78dd5a
--- /dev/null
+++ b/lapack-netlib/SRC/dgeqp3rk.c
@@ -0,0 +1,1059 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+/* Raise x to the integer power n by repeated squaring; a negative n uses 1/x. */
+static float spow_ui(float x, integer n) {
+	float result = 1.0;
+	unsigned long int bits;
+	if(n == 0) return result;
+	if(n < 0) { n = -n; x = 1/x; }
+	for(bits = n; ; x *= x) {	/* square the base between binary digits of n */
+		if(bits & 1) result *= x;	/* current digit is set: fold the base in */
+		if((bits >>= 1) == 0) break;	/* no digits left: leave before squaring again */
+	}
+	return result;
+}
+/* Raise x to the integer power n by repeated squaring; a negative n uses 1/x. */
+static double dpow_ui(double x, integer n) {
+	double result = 1.0;
+	unsigned long int bits;
+	if(n == 0) return result;
+	if(n < 0) { n = -n; x = 1/x; }
+	for(bits = n; ; x *= x) {	/* square the base between binary digits of n */
+		if(bits & 1) result *= x;	/* current digit is set: fold the base in */
+		if((bits >>= 1) == 0) break;	/* no digits left: leave before squaring again */
+	}
+	return result;
+}
+#ifdef _MSC_VER
+/* x**n for a single-precision complex x; the complex products are written out on the (r, i) parts. */
+static _Fcomplex cpow_ui(complex x, integer n) {
+	complex pow={1.0,0.0}, t; real d; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, d = x.r*x.r + x.i*x.i, x.r = x.r/d, x.i = -x.i/d;	/* 1/x = conj(x)/|x|^2 */
+		for(u = n; ; ) {
+			if(u & 01) t = pow, pow.r = t.r*x.r - t.i*x.i, pow.i = t.r*x.i + t.i*x.r;	/* pow *= x */
+			if(!(u >>= 1)) break;
+			t = x, x.r = t.r*t.r - t.i*t.i, x.i = 2*t.r*t.i;	/* x *= x */
+		}
+	}
+	_Fcomplex p={pow.r, pow.i}; return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) {
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+/* x**n for a double-precision complex x; the complex products are written out on _Val[0] (re) and _Val[1] (im). */
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
+	_Dcomplex pow={1.0,0.0}, t; double d; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, d = x._Val[0]*x._Val[0] + x._Val[1]*x._Val[1], x._Val[0] = x._Val[0]/d, x._Val[1] = -x._Val[1]/d;	/* 1/x = conj(x)/|x|^2 */
+		for(u = n; ; ) {
+			if(u & 01) t = pow, pow._Val[0] = t._Val[0]*x._Val[0] - t._Val[1]*x._Val[1], pow._Val[1] = t._Val[0]*x._Val[1] + t._Val[1]*x._Val[0];	/* pow *= x */
+			if(!(u >>= 1)) break;
+			t = x, x._Val[0] = t._Val[0]*t._Val[0] - t._Val[1]*t._Val[1], x._Val[1] = 2*t._Val[0]*t._Val[1];	/* x *= x */
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]}; return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) {
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+/* Integer power x**n; non-positive exponents are resolved up front, the rest by repeated squaring. */
+static integer pow_ii(integer x, integer n) {
+	integer result = 1; unsigned long int bits;
+	if (n <= 0) {
+		/* x**0 and 1**n are both 1 */
+		if (n == 0 || x == 1) return result;
+		/* other bases except -1 give 0 for n < 0; x == 0 evaluates 1/x (integer division by zero) */
+		if (x != -1) return x == 0 ? 1/x : 0;
+		/* x == -1: continue below with the negated exponent */
+		n = -n;
+	}
+	for(bits = n; ; x *= x) {
+		if(bits & 1) result *= x;
+		if((bits >>= 1) == 0) break;
+	}
+	return result;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{	/* 1-based position, counted from s, of the first largest entry in w[s-1] .. w[e-1]; n is never read */
+	double top = w[s-1]; integer best = s, k;
+	for(k = s+1; k <= e; ++k)
+		if (w[k-1] > top) { best = k; top = w[k-1]; }
+	return best - s + 1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{	/* 1-based position, counted from s, of the first largest entry in w[s-1] .. w[e-1]; n is never read */
+	float top = w[s-1]; integer best = s, k;
+	for(k = s+1; k <= e; ++k)
+		if (w[k-1] > top) { best = k; top = w[k-1]; }
+	return best - s + 1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {	/* *z = sum over i of conj(x[i*incx]) * y[i*incy] */
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* conj(x)*y = (xr*yr + xi*yi) + i*(xr*yi - xi*yr) */
+			zdotc._Val[0] += x[i].r * y[i].r + x[i].i * y[i].i;
+			zdotc._Val[1] += x[i].r * y[i].i - x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) { /* same product on strided operands */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r + x[i*incx].i * y[i*incy].i;
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i - x[i*incx].i * y[i*incy].r;
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) { /* *z = sum over i in [0,n) of conjg(x[i*incx]) * y[i*incy] */
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0}; /* accessed through _Val[]: real and imaginary parts are accumulated by hand */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc += dconjg(x(i))*y(i); (a-bi)(c+di) = (ac+bd) + (ad-bc)i */
+			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0] + Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
+			zdotc._Val[1] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[1] - Cd(&x[i])._Val[1] * Cd(&y[i])._Val[0];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* same product, strided access */
+			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0] + Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
+			zdotc._Val[1] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[1] - Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[0];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* same product, strided access */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { /* *z = sum over i in [0,n) of x[i*incx] * y[i*incy], no conjugation */
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0}; /* accessed through _Val[]: real and imaginary parts are accumulated by hand */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc += x(i)*y(i); (a+bi)(c+di) = (ac-bd) + (ad+bc)i */
+			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0] - Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
+			zdotc._Val[1] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[1] + Cf(&x[i])._Val[1] * Cf(&y[i])._Val[0];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* same product, strided access */
+			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0] - Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
+			zdotc._Val[1] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[1] + Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[0];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* same product, strided access */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) { /* *z = sum over i in [0,n) of x[i*incx] * y[i*incy], no conjugation */
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0}; /* accessed through _Val[]: real and imaginary parts are accumulated by hand */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc += x(i)*y(i); (a+bi)(c+di) = (ac-bd) + (ad+bc)i */
+			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0] - Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
+			zdotc._Val[1] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[1] + Cd(&x[i])._Val[1] * Cd(&y[i])._Val[0];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* same product, strided access */
+			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0] - Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
+			zdotc._Val[1] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[1] + Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[0];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* same product, strided access */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values (dgeqp3rk_ passes these by address to its callees) */
+
+static integer c__1 = 1;	/* first ilaenv_ argument for the NB query; also the stride given to dnrm2_ and idamax_ */
+static integer c_n1 = -1;	/* passed for the last two integer arguments of every ilaenv_ call */
+static integer c__3 = 3;	/* first ilaenv_ argument for the NX (blocked/unblocked crossover) query */
+static integer c__2 = 2;	/* first ilaenv_ argument for the NBMIN query */
+
+/* Subroutine */ int dgeqp3rk_(integer *m, integer *n, integer *nrhs, integer 
+	*kmax, doublereal *abstol, doublereal *reltol, doublereal *a, integer 
+	*lda, integer *k, doublereal *maxc2nrmk, doublereal *relmaxc2nrmk, 
+	integer *jpiv, doublereal *tau, doublereal *work, integer *lwork, 
+	integer *iwork, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2;
+    doublereal d__1, d__2;
+
+    /* Local variables */
+    extern /* Subroutine */ int dlaqp2rk_(integer *, integer *, integer *, 
+	    integer *, integer *, doublereal *, doublereal *, integer *, 
+	    doublereal *, doublereal *, integer *, integer *, doublereal *, 
+	    doublereal *, integer *, doublereal *, doublereal *, doublereal *,
+	     doublereal *, integer *), dlaqp3rk_(integer *, integer *, 
+	    integer *, integer *, integer *, doublereal *, doublereal *, 
+	    integer *, doublereal *, doublereal *, integer *, logical *, 
+	    integer *, doublereal *, doublereal *, integer *, doublereal *, 
+	    doublereal *, doublereal *, doublereal *, doublereal *, integer *,
+	     integer *, integer *);
+    doublereal maxc2nrm;
+    logical done;
+    integer jmax;
+    extern doublereal dnrm2_(integer *, doublereal *, integer *);
+    integer j, jmaxc2nrm, jmaxb, nbmin, iinfo, n_sub__, minmn;
+    doublereal myhugeval;
+    integer jb, nb, kf;
+    extern doublereal dlamch_(char *);
+    extern integer idamax_(integer *, doublereal *, integer *);
+    integer nx;
+    doublereal safmin;
+    extern /* Subroutine */ int xerbla_(char *, integer *);
+    extern logical disnan_(doublereal *);
+    extern integer ilaenv_(integer *, char *, char *, integer *, integer *, 
+	    integer *, integer *, ftnlen, ftnlen);
+    integer kp1, lwkopt;
+    logical lquery;
+    integer jbf;
+    doublereal eps;
+    integer iws, ioffset;
+
+/*  -- LAPACK computational routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+/*  Purpose: truncated (rank K) Householder QR factorization with column */
+/*  pivoting of A(1:M,1:N); Q(K)**T is also applied to A(1:M,N+1:N+NRHS). */
+/*  Stops after KMAX columns, or once the largest residual column 2-norm */
+/*  is <= ABSTOL or <= RELTOL times the largest column 2-norm of A. */
+/*  Always returns 0; *info < 0 flags a bad argument, > 0 a NaN/Inf in A. */
+/*  ===================================================================== */
+/*     Test input arguments */
+/*     ==================== */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --jpiv;
+    --tau;
+    --work;
+    --iwork;
+
+    /* Function Body */
+    *info = 0;
+    lquery = *lwork == -1;
+    if (*m < 0) {
+	*info = -1;
+    } else if (*n < 0) {
+	*info = -2;
+    } else if (*nrhs < 0) {
+	*info = -3;
+    } else if (*kmax < 0) {
+	*info = -4;
+    } else if (disnan_(abstol)) {
+	*info = -5;
+    } else if (disnan_(reltol)) {
+	*info = -6;
+    } else if (*lda < f2cmax(1,*m)) {
+	*info = -8;
+    }
+
+/*     If the input parameters M, N, NRHS, KMAX, LDA are valid: */
+/*       a) Test the input workspace size LWORK for the minimum */
+/*          size requirement IWS. */
+/*       b) Determine the optimal block size NB and optimal */
+/*          workspace size LWKOPT to be returned in WORK(1) */
+/*          in case of (1) LWORK < IWS, (2) LQUERY = .TRUE., */
+/*          (3) when routine exits. */
+/*     Here, IWS is the minimum workspace required for unblocked */
+/*     code. */
+
+    if (*info == 0) {
+	minmn = f2cmin(*m,*n);
+	if (minmn == 0) {
+	    iws = 1;
+	    lwkopt = 1;
+	} else {
+
+/*           Minimal workspace size in case of using only unblocked */
+/*           BLAS 2 code in DLAQP2RK. */
+/*           1) DGEQP3RK and DLAQP2RK: 2*N to store full and partial */
+/*              column 2-norms. */
+/*           2) DLAQP2RK: N+NRHS-1 to use in WORK array that is used */
+/*              in DLARF subroutine inside DLAQP2RK to apply an */
+/*              elementary reflector from the left. */
+/*           TOTAL_WORK_SIZE = 3*N + NRHS - 1 */
+
+	    iws = *n * 3 + *nrhs - 1;
+
+/*           Assign to NB optimal block size. */
+
+	    nb = ilaenv_(&c__1, "DGEQP3RK", " ", m, n, &c_n1, &c_n1, (ftnlen)
+		    8, (ftnlen)1);
+
+/*           A formula for the optimal workspace size in case of using */
+/*           both unblocked BLAS 2 in DLAQP2RK and blocked BLAS 3 code */
+/*           in DLAQP3RK. */
+/*           1) DGEQP3RK, DLAQP2RK, DLAQP3RK: 2*N to store full and */
+/*              partial column 2-norms. */
+/*           2) DLAQP2RK: N+NRHS-1 to use in WORK array that is used */
+/*              in DLARF subroutine to apply an elementary reflector */
+/*              from the left. */
+/*           3) DLAQP3RK: NB*(N+NRHS) to use in the work array F that */
+/*              is used to apply a block reflector from */
+/*              the left. */
+/*           4) DLAQP3RK: NB to use in the auxiliary array AUX. */
+/*           Sizes (2) and ((3) + (4)) should intersect, therefore */
+/*           TOTAL_WORK_SIZE = 2*N + NB*( N+NRHS+1 ), given NBMIN=2. */
+
+	    lwkopt = (*n << 1) + nb * (*n + *nrhs + 1);
+	}
+	work[1] = (doublereal) lwkopt;
+
+	if (*lwork < iws && ! lquery) {
+	    *info = -15;
+	}
+    }
+
+/*      NOTE: The optimal workspace size is returned in WORK(1), if */
+/*            the input parameters M, N, NRHS, KMAX, LDA are valid. */
+
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("DGEQP3RK", &i__1);
+	return 0;
+    } else if (lquery) {
+	return 0;
+    }
+
+/*     Quick return if possible for M=0 or N=0. */
+
+    if (minmn == 0) {
+	*k = 0;
+	*maxc2nrmk = 0.;
+	*relmaxc2nrmk = 0.;
+	work[1] = (doublereal) lwkopt;
+	return 0;
+    }
+
+/*     ================================================================== */
+
+/*     Initialize column pivot array JPIV. */
+
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+	jpiv[j] = j;
+    }
+
+/*     ================================================================== */
+
+/*     Initialize storage for partial and exact column 2-norms. */
+/*     a) The elements WORK(1:N) are used to store partial column */
+/*        2-norms of the matrix A, and may decrease in each computation */
+/*        step; initialize to the values of complete columns 2-norms. */
+/*     b) The elements WORK(N+1:2*N) are used to store complete column */
+/*        2-norms of the matrix A, they are not changed during the */
+/*        computation; initialize the values of complete columns 2-norms. */
+
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+	work[j] = dnrm2_(m, &a[j * a_dim1 + 1], &c__1);
+	work[*n + j] = work[j];
+    }
+
+/*     ================================================================== */
+
+/*     Compute the pivot column index and the maximum column 2-norm */
+/*     for the whole original matrix stored in A(1:M,1:N). */
+
+    kp1 = idamax_(n, &work[1], &c__1);
+    maxc2nrm = work[kp1];
+
+/*     ==================================================================. */
+
+    if (disnan_(&maxc2nrm)) {
+
+/*        Check if the matrix A contains NaN, set INFO parameter */
+/*        to the column number where the first NaN is found and return */
+/*        from the routine. */
+
+	*k = 0;
+	*info = kp1;
+
+/*        Set MAXC2NRMK and  RELMAXC2NRMK to NaN. */
+
+	*maxc2nrmk = maxc2nrm;
+	*relmaxc2nrmk = maxc2nrm;
+
+/*        Array TAU is not set and contains undefined elements. */
+
+	work[1] = (doublereal) lwkopt;
+	return 0;
+    }
+
+/*     =================================================================== */
+
+    if (maxc2nrm == 0.) {
+
+/*        Check if the matrix A is a zero matrix, set array TAU and */
+/*        return from the routine. */
+
+	*k = 0;
+	*maxc2nrmk = 0.;
+	*relmaxc2nrmk = 0.;
+
+	i__1 = minmn;
+	for (j = 1; j <= i__1; ++j) {
+	    tau[j] = 0.;
+	}
+
+	work[1] = (doublereal) lwkopt;
+	return 0;
+
+    }
+
+/*     =================================================================== */
+
+    myhugeval = dlamch_("Overflow");
+
+    if (maxc2nrm > myhugeval) {
+
+/*        Check if the matrix A contains +Inf or -Inf, set INFO parameter */
+/*        to the column number, where the first +/-Inf  is found plus N, */
+/*        and continue the computation. */
+
+	*info = *n + kp1;
+
+    }
+
+/*     ================================================================== */
+
+/*     Quick return if possible for the case when the first */
+/*     stopping criterion is satisfied, i.e. KMAX = 0. */
+
+    if (*kmax == 0) {
+	*k = 0;
+	*maxc2nrmk = maxc2nrm;
+	*relmaxc2nrmk = 1.;
+	i__1 = minmn;
+	for (j = 1; j <= i__1; ++j) {
+	    tau[j] = 0.;
+	}
+	work[1] = (doublereal) lwkopt;
+	return 0;
+    }
+
+/*     ================================================================== */
+
+    eps = dlamch_("Epsilon");
+
+/*     Adjust ABSTOL */
+
+    if (*abstol >= 0.) {
+	safmin = dlamch_("Safe minimum");
+/* Computing MAX */
+	d__1 = *abstol, d__2 = safmin * 2.;
+	*abstol = f2cmax(d__1,d__2);
+    }
+
+/*     Adjust RELTOL */
+
+    if (*reltol >= 0.) {
+	*reltol = f2cmax(*reltol,eps);
+    }
+
+/*     =================================================================== */
+
+/*     JMAX is the maximum index of the column to be factorized, */
+/*     which is also limited by the first stopping criterion KMAX. */
+
+    jmax = f2cmin(*kmax,minmn);
+
+/*     =================================================================== */
+
+/*     Quick return if possible for the case when the second or third */
+/*     stopping criterion for the whole original matrix is satisfied, */
+/*     i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL */
+/*     (which is ONE <= RELTOL). */
+
+    if (maxc2nrm <= *abstol || 1. <= *reltol) {
+
+	*k = 0;
+	*maxc2nrmk = maxc2nrm;
+	*relmaxc2nrmk = 1.;
+
+	i__1 = minmn;
+	for (j = 1; j <= i__1; ++j) {
+	    tau[j] = 0.;
+	}
+
+	work[1] = (doublereal) lwkopt;
+	return 0;
+    }
+
+/*     ================================================================== */
+/*     Factorize columns */
+/*     ================================================================== */
+
+/*     Determine the block size. */
+
+    nbmin = 2;
+    nx = 0;
+
+    if (nb > 1 && nb < minmn) {
+
+/*        Determine when to cross over from blocked to unblocked code. */
+/*        (for N less than NX, unblocked code should be used). */
+
+/* Computing MAX */
+	i__1 = 0, i__2 = ilaenv_(&c__3, "DGEQP3RK", " ", m, n, &c_n1, &c_n1, (
+		ftnlen)8, (ftnlen)1);
+	nx = f2cmax(i__1,i__2);
+
+	if (nx < minmn) {
+
+/*           Determine if workspace is large enough for blocked code. */
+
+	    if (*lwork < lwkopt) {
+
+/*              Not enough workspace to use optimal block size that */
+/*              is currently stored in NB. */
+/*              Reduce NB and determine the minimum value of NB. */
+/*              NB must keep 2*N + NB*(N+NRHS+1) <= LWORK (cf. LWKOPT). */
+		nb = (*lwork - (*n << 1)) / (*n + *nrhs + 1);
+/* Computing MAX */
+		i__1 = 2, i__2 = ilaenv_(&c__2, "DGEQP3RK", " ", m, n, &c_n1, 
+			&c_n1, (ftnlen)8, (ftnlen)1);
+		nbmin = f2cmax(i__1,i__2);
+
+	    }
+	}
+    }
+
+/*     ================================================================== */
+
+/*     DONE is the boolean flag to represent the case when the */
+/*     factorization completed in the block factorization routine, */
+/*     before the end of the block. */
+
+    done = FALSE_;
+
+/*     J is the column index. */
+
+    j = 1;
+
+/*     (1) Use blocked code initially. */
+
+/*     JMAXB is the maximum column index of the block, when the */
+/*     blocked code is used, is also limited by the first stopping */
+/*     criterion KMAX. */
+
+/* Computing MIN */
+    i__1 = *kmax, i__2 = minmn - nx;
+    jmaxb = f2cmin(i__1,i__2);
+
+    if (nb >= nbmin && nb < jmax && jmaxb > 0) {
+
+/*        Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here: */
+/*        J   is the column index of a column block; */
+/*        JB  is the column block size to pass to block factorization */
+/*            routine in a loop step; */
+/*        JBF is the number of columns that were actually factorized */
+/*            that was returned by the block factorization routine */
+/*            in a loop step, JBF <= JB; */
+/*        N_SUB is the number of columns in the submatrix; */
+/*        IOFFSET is the number of rows that should not be factorized. */
+
+	while(j <= jmaxb) {
+
+/* Computing MIN */
+	    i__1 = nb, i__2 = jmaxb - j + 1;
+	    jb = f2cmin(i__1,i__2);
+	    n_sub__ = *n - j + 1;
+	    ioffset = j - 1;
+
+/*           Factorize JB columns among the columns A(J:N). */
+
+	    i__1 = *n + *nrhs - j + 1;
+	    dlaqp3rk_(m, &n_sub__, nrhs, &ioffset, &jb, abstol, reltol, &kp1, 
+		    &maxc2nrm, &a[j * a_dim1 + 1], lda, &done, &jbf, 
+		    maxc2nrmk, relmaxc2nrmk, &jpiv[j], &tau[j], &work[j], &
+		    work[*n + j], &work[(*n << 1) + 1], &work[(*n << 1) + jb 
+		    + 1], &i__1, &iwork[1], &iinfo);
+
+/*           Set INFO on the first occurrence of Inf. */
+
+	    if (iinfo > n_sub__ && *info == 0) {
+		*info = (ioffset << 1) + iinfo;
+	    }
+
+	    if (done) {
+
+/*              Either the submatrix is zero before the end of the */
+/*              column block, or ABSTOL or RELTOL criterion is */
+/*              satisfied before the end of the column block, we can */
+/*              return from the routine. Perform the following before */
+/*              returning: */
+/*                a) Set the number of factorized columns K, */
+/*                   K = IOFFSET + JBF from the last call of blocked */
+/*                   routine. */
+/*                NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned */
+/*                         by the block factorization routine; */
+/*                      2) The remaining TAUs are set to ZERO by the */
+/*                         block factorization routine. */
+
+		*k = ioffset + jbf;
+
+/*              Set INFO on the first occurrence of NaN, NaN takes */
+/*              precedence over Inf. */
+
+		if (iinfo <= n_sub__ && iinfo > 0) {
+		    *info = ioffset + iinfo;
+		}
+
+/*              Return from the routine. */
+
+		work[1] = (doublereal) lwkopt;
+
+		return 0;
+
+	    }
+
+	    j += jbf;
+
+	}
+
+    }
+
+/*     Use unblocked code to factor the last or only block. */
+/*     J = JMAX+1 means we factorized the maximum possible number of */
+/*     columns, that is in ELSE clause we need to compute */
+/*     the MAXC2NORM and RELMAXC2NORM to return after we processed */
+/*     the blocks. */
+
+    if (j <= jmax) {
+
+/*        N_SUB is the number of columns in the submatrix; */
+/*        IOFFSET is the number of rows that should not be factorized. */
+
+	n_sub__ = *n - j + 1;
+	ioffset = j - 1;
+
+	i__1 = jmax - j + 1;
+	dlaqp2rk_(m, &n_sub__, nrhs, &ioffset, &i__1, abstol, reltol, &kp1, &
+		maxc2nrm, &a[j * a_dim1 + 1], lda, &kf, maxc2nrmk, 
+		relmaxc2nrmk, &jpiv[j], &tau[j], &work[j], &work[*n + j], &
+		work[(*n << 1) + 1], &iinfo);
+
+/*        ABSTOL or RELTOL criterion is satisfied when the number of */
+/*        the factorized columns KF is smaller than the number */
+/*        of columns JMAX-J+1 supplied to be factorized by the */
+/*        unblocked routine, we can return from */
+/*        the routine. Perform the following before returning: */
+/*           a) Set the number of factorized columns K, */
+/*           b) MAXC2NRMK and RELMAXC2NRMK are returned by the */
+/*              unblocked factorization routine above. */
+
+	*k = j - 1 + kf;
+
+/*        Set INFO on the first exception occurrence. */
+
+/*        Set INFO on the first exception occurrence of Inf or NaN, */
+/*        (NaN takes precedence over Inf). */
+
+	if (iinfo > n_sub__ && *info == 0) {
+	    *info = (ioffset << 1) + iinfo;
+	} else if (iinfo <= n_sub__ && iinfo > 0) {
+	    *info = ioffset + iinfo;
+	}
+
+    } else {
+
+/*        Compute the return values for blocked code. */
+
+/*        Set the number of factorized columns if the unblocked routine */
+/*        was not called. */
+
+	*k = jmax;
+
+/*        If there exists a residual matrix after the blocked code: */
+/*           1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the */
+/*              residual matrix, otherwise set them to ZERO; */
+/*           2) Set TAU(K+1:MINMN) to ZERO. */
+
+	if (*k < minmn) {
+	    i__1 = *n - *k;
+	    jmaxc2nrm = *k + idamax_(&i__1, &work[*k + 1], &c__1);
+	    *maxc2nrmk = work[jmaxc2nrm];
+	    if (*k == 0) {
+		*relmaxc2nrmk = 1.;
+	    } else {
+		*relmaxc2nrmk = *maxc2nrmk / maxc2nrm;
+	    }
+
+	    i__1 = minmn;
+	    for (j = *k + 1; j <= i__1; ++j) {
+		tau[j] = 0.;
+	    }
+	} else {
+	    *maxc2nrmk = 0.; *relmaxc2nrmk = 0.; /* K = MINMN: no residual matrix is left */
+	}
+/*     END IF( J.LE.JMAX ) THEN */
+
+    }
+
+    work[1] = (doublereal) lwkopt;
+
+    return 0;
+
+/*     End of DGEQP3RK */
+
+} /* dgeqp3rk_ */
+
diff --git a/lapack-netlib/SRC/dgeqp3rk.f b/lapack-netlib/SRC/dgeqp3rk.f
new file mode 100644
index 000000000..b8e41b39c
--- /dev/null
+++ b/lapack-netlib/SRC/dgeqp3rk.f
@@ -0,0 +1,1082 @@
+*> \brief \b DGEQP3RK computes a truncated Householder QR factorization with column pivoting of a real m-by-n matrix A by using Level 3 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DGEQP3RK + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dgeqp3rk.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dgeqp3rk.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dgeqp3rk.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA,
+*      $                     K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+*      $                     WORK, LWORK, IWORK, INFO )
+*       IMPLICIT NONE
+*
+*      .. Scalar Arguments ..
+*       INTEGER            INFO, K, KMAX, LDA, LWORK, M, N, NRHS
+*       DOUBLE PRECISION   ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL
+*      ..
+*      .. Array Arguments ..
+*       INTEGER            IWORK( * ), JPIV( * )
+*       DOUBLE PRECISION   A( LDA, * ), TAU( * ), WORK( * )
+*      ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DGEQP3RK performs two tasks simultaneously:
+*>
+*> Task 1: The routine computes a truncated (rank K) or full rank
+*> Householder QR factorization with column pivoting of a real
+*> M-by-N matrix A using Level 3 BLAS. K is the number of columns
+*> that were factorized, i.e. factorization rank of the
+*> factor R, K <= min(M,N).
+*>
+*>  A * P(K) = Q(K) * R(K)  =
+*>
+*>        = Q(K) * ( R11(K) R12(K) ) = Q(K) * (   R(K)_approx    )
+*>                 ( 0      R22(K) )          ( 0  R(K)_residual ),
+*>
+*> where:
+*>
+*>  P(K)            is an N-by-N permutation matrix;
+*>  Q(K)            is an M-by-M orthogonal matrix;
+*>  R(K)_approx   = ( R11(K), R12(K) ) is a rank K approximation of the
+*>                    full rank factor R with K-by-K upper-triangular
+*>                    R11(K) and K-by-(N-K) rectangular R12(K). The diagonal
+*>                    entries of R11(K) appear in non-increasing order
+*>                    of absolute value, and absolute values of all of
+*>                    them exceed the maximum column 2-norm of R22(K)
+*>                    up to roundoff error.
+*>  R(K)_residual = R22(K) is the residual of a rank K approximation
+*>                    of the full rank factor R. It is
+*>                    an (M-K)-by-(N-K) rectangular matrix;
+*>  0               is an (M-K)-by-K zero matrix.
+*>
+*> Task 2: At the same time, the routine overwrites a real M-by-NRHS
+*> matrix B with  Q(K)**T * B  using Level 3 BLAS.
+*>
+*> =====================================================================
+*>
+*> The matrices A and B are stored on input in the array A as
+*> the left and right blocks A(1:M,1:N) and A(1:M, N+1:N+NRHS)
+*> respectively.
+*>
+*>                                  N     NRHS
+*>             array_A   =   M  [ mat_A, mat_B ]
+*>
+*> The truncation criteria (i.e. when to stop the factorization)
+*> can be any of the following:
+*>
+*>   1) The input parameter KMAX, the maximum number of columns
+*>      KMAX to factorize, i.e. the factorization rank is limited
+*>      to KMAX. If KMAX >= min(M,N), the criterion is not used.
+*>
+*>   2) The input parameter ABSTOL, the absolute tolerance for
+*>      the maximum column 2-norm of the residual matrix R22(K). This
+*>      means that the factorization stops if this norm is less or
+*>      equal to ABSTOL. If ABSTOL < 0.0, the criterion is not used.
+*>
+*>   3) The input parameter RELTOL, the tolerance for the maximum
+*>      column 2-norm of the residual matrix R22(K) divided
+*>      by the maximum column 2-norm of the original matrix A, which
+*>      is equal to abs(R(1,1)). This means that the factorization stops
+*>      when the ratio of the maximum column 2-norm of R22(K) to
+*>      the maximum column 2-norm of A is less than or equal to RELTOL.
+*>      If RELTOL < 0.0, the criterion is not used.
+*>
+*>   4) In case neither of the stopping criteria ABSTOL and RELTOL is
+*>      used: when the residual matrix R22(K) is a zero matrix in some
+*>      factorization step K. ( This stopping criterion is implicit. )
+*>
+*>  The algorithm stops when any of these conditions is first
+*>  satisfied, otherwise the whole matrix A is factorized.
+*>
+*>  To factorize the whole matrix A, use the values
+*>  KMAX >= min(M,N), ABSTOL < 0.0 and RELTOL < 0.0.
+*>
+*>  The routine returns:
+*>     a) Q(K), R(K)_approx = ( R11(K), R12(K) ),
+*>        R(K)_residual = R22(K), P(K), i.e. the resulting matrices
+*>        of the factorization; P(K) is represented by JPIV,
+*>        ( if K = min(M,N), R(K)_approx is the full factor R,
+*>        and there is no residual matrix R(K)_residual);
+*>     b) K, the number of columns that were factorized,
+*>        i.e. factorization rank;
+*>     c) MAXC2NRMK, the maximum column 2-norm of the residual
+*>        matrix R(K)_residual = R22(K),
+*>        ( if K = min(M,N), MAXC2NRMK = 0.0 );
+*>     d) RELMAXC2NRMK equals MAXC2NRMK divided by MAXC2NRM, the maximum
+*>        column 2-norm of the original matrix A, which is equal
+*>        to abs(R(1,1)), ( if K = min(M,N), RELMAXC2NRMK = 0.0 );
+*>     e) Q(K)**T * B, the matrix B with the orthogonal
+*>        transformation Q(K)**T applied on the left.
+*>
+*> The N-by-N permutation matrix P(K) is stored in a compact form in
+*> the integer array JPIV. For 1 <= j <= N, column j
+*> of the matrix A was interchanged with column JPIV(j).
+*>
+*> The M-by-M orthogonal matrix Q is represented as a product
+*> of elementary Householder reflectors
+*>
+*>     Q(K) = H(1) *  H(2) * . . . * H(K),
+*>
+*> where K is the number of columns that were factorized.
+*>
+*> Each H(j) has the form
+*>
+*>     H(j) = I - tau * v * v**T,
+*>
+*> where 1 <= j <= K and
+*>   I    is an M-by-M identity matrix,
+*>   tau  is a real scalar,
+*>   v    is a real vector with v(1:j-1) = 0 and v(j) = 1.
+*>
+*> v(j+1:M) is stored on exit in A(j+1:M,j) and tau in TAU(j).
+*>
+*> See the Further Details section for more information.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e. the number of
+*>          columns of the matrix B. NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in] KMAX
+*> \verbatim
+*>          KMAX is INTEGER
+*>
+*>          The first factorization stopping criterion. KMAX >= 0.
+*>
+*>          The maximum number of columns of the matrix A to factorize,
+*>          i.e. the maximum factorization rank.
+*>
+*>          a) If KMAX >= min(M,N), then this stopping criterion
+*>                is not used, the routine factorizes columns
+*>                depending on ABSTOL and RELTOL.
+*>
+*>          b) If KMAX = 0, then this stopping criterion is
+*>                satisfied on input and the routine exits immediately.
+*>                This means that the factorization is not performed,
+*>                the matrices A and B are not modified, and
+*>                the matrix A is itself the residual.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is DOUBLE PRECISION
+*>
+*>          The second factorization stopping criterion, cannot be NaN.
+*>
+*>          The absolute tolerance (stopping threshold) for
+*>          maximum column 2-norm of the residual matrix R22(K).
+*>          The algorithm converges (stops the factorization) when
+*>          the maximum column 2-norm of the residual matrix R22(K)
+*>          is less than or equal to ABSTOL. Let SAFMIN = DLAMCH('S').
+*>
+*>          a) If ABSTOL is NaN, then no computation is performed
+*>                and an error message ( INFO = -5 ) is issued
+*>                by XERBLA.
+*>
+*>          b) If ABSTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and RELTOL.
+*>                This includes the case ABSTOL = -Inf.
+*>
+*>          c) If 0.0 <= ABSTOL < 2*SAFMIN, then ABSTOL = 2*SAFMIN
+*>                is used. This includes the case ABSTOL = -0.0.
+*>
+*>          d) If 2*SAFMIN <= ABSTOL then the input value
+*>                of ABSTOL is used.
+*>
+*>          Let MAXC2NRM be the maximum column 2-norm of the
+*>          whole original matrix A.
+*>          If ABSTOL chosen above is >= MAXC2NRM, then this
+*>          stopping criterion is satisfied on input and routine exits
+*>          immediately after MAXC2NRM is computed. The routine
+*>          returns MAXC2NRM in MAXC2NRMK,
+*>          and 1.0 in RELMAXC2NRMK.
+*>          This includes the case ABSTOL = +Inf. This means that the
+*>          factorization is not performed, the matrices A and B are not
+*>          modified, and the matrix A is itself the residual.
+*> \endverbatim
+*>
+*> \param[in] RELTOL
+*> \verbatim
+*>          RELTOL is DOUBLE PRECISION
+*>
+*>          The third factorization stopping criterion, cannot be NaN.
+*>
+*>          The tolerance (stopping threshold) for the ratio
+*>          abs(R(K+1,K+1))/abs(R(1,1)) of the maximum column 2-norm of
+*>          the residual matrix R22(K) to the maximum column 2-norm of
+*>          the original matrix A. The algorithm converges (stops the
+*>          factorization), when abs(R(K+1,K+1))/abs(R(1,1)) is less
+*>          than or equal to RELTOL. Let EPS = DLAMCH('E').
+*>
+*>          a) If RELTOL is NaN, then no computation is performed
+*>                and an error message ( INFO = -6 ) is issued
+*>                by XERBLA.
+*>
+*>          b) If RELTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and ABSTOL.
+*>                This includes the case RELTOL = -Inf.
+*>
+*>          c) If 0.0 <= RELTOL < EPS, then RELTOL = EPS is used.
+*>                This includes the case RELTOL = -0.0.
+*>
+*>          d) If EPS <= RELTOL then the input value of RELTOL
+*>                is used.
+*>
+*>          Let MAXC2NRM be the maximum column 2-norm of the
+*>          whole original matrix A.
+*>          If RELTOL chosen above is >= 1.0, then this stopping
+*>          criterion is satisfied on input and routine exits
+*>          immediately after MAXC2NRM is computed.
+*>          The routine returns MAXC2NRM in MAXC2NRMK,
+*>          and 1.0 in RELMAXC2NRMK.
+*>          This includes the case RELTOL = +Inf. This means that the
+*>          factorization is not performed, the matrices A and B are not
+*>          modified, and the matrix A is itself the residual.
+*>
+*>          NOTE: We recommend that RELTOL satisfy
+*>                min( max(M,N)*EPS, sqrt(EPS) ) <= RELTOL
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is DOUBLE PRECISION array, dimension (LDA,N+NRHS)
+*>
+*>          On entry:
+*>
+*>          a) The subarray A(1:M,1:N) contains the M-by-N matrix A.
+*>          b) The subarray A(1:M,N+1:N+NRHS) contains the M-by-NRHS
+*>             matrix B.
+*>
+*>                                  N     NRHS
+*>              array_A   =   M  [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>
+*>          a) The subarray A(1:M,1:N) contains parts of the factors
+*>             of the matrix A:
+*>
+*>            1) If K = 0, A(1:M,1:N) contains the original matrix A.
+*>            2) If K > 0, A(1:M,1:N) contains parts of the
+*>            factors:
+*>
+*>              1. The elements below the diagonal of the subarray
+*>                 A(1:M,1:K) together with TAU(1:K) represent the
+*>                 orthogonal matrix Q(K) as a product of K Householder
+*>                 elementary reflectors.
+*>
+*>              2. The elements on and above the diagonal of
+*>                 the subarray A(1:K,1:N) contain K-by-N
+*>                 upper-trapezoidal matrix
+*>                 R(K)_approx = ( R11(K), R12(K) ).
+*>                 NOTE: If K=min(M,N), i.e. full rank factorization,
+*>                       then R_approx(K) is the full factor R which
+*>                       is upper-trapezoidal. If, in addition, M>=N,
+*>                       then R is upper-triangular.
+*>
+*>              3. The subarray A(K+1:M,K+1:N) contains (M-K)-by-(N-K)
+*>                 rectangular matrix R(K)_residual = R22(K).
+*>
+*>          b) If NRHS > 0, the subarray A(1:M,N+1:N+NRHS) contains
+*>             the M-by-NRHS product Q(K)**T * B.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*>          This is the leading dimension for both matrices, A and B.
+*> \endverbatim
+*>
+*> \param[out] K
+*> \verbatim
+*>          K is INTEGER
+*>          Factorization rank of the matrix A, i.e. the rank of
+*>          the factor R, which is the same as the number of non-zero
+*>          rows of the factor R. 0 <= K <= min(M,KMAX,N).
+*>
+*>          K also represents the number of non-zero Householder
+*>          vectors.
+*>
+*>          NOTE: If K = 0, a) the arrays A and B are not modified;
+*>                          b) the array TAU(1:min(M,N)) is set to ZERO,
+*>                             if the matrix A does not contain NaN,
+*>                             otherwise the elements TAU(1:min(M,N))
+*>                             are undefined;
+*>                          c) the elements of the array JPIV are set
+*>                             as follows: for j = 1:N, JPIV(j) = j.
+*> \endverbatim
+*>
+*> \param[out] MAXC2NRMK
+*> \verbatim
+*>          MAXC2NRMK is DOUBLE PRECISION
+*>          The maximum column 2-norm of the residual matrix R22(K),
+*>          when the factorization stopped at rank K. MAXC2NRMK >= 0.
+*>
+*>          a) If K = 0, i.e. the factorization was not performed,
+*>             the matrix A was not modified and is itself a residual
+*>             matrix, then MAXC2NRMK equals the maximum column 2-norm
+*>             of the original matrix A.
+*>
+*>          b) If 0 < K < min(M,N), then MAXC2NRMK is returned.
+*>
+*>          c) If K = min(M,N), i.e. the whole matrix A was
+*>             factorized and there is no residual matrix,
+*>             then MAXC2NRMK = 0.0.
+*>
+*>          NOTE: MAXC2NRMK in the factorization step K would equal
+*>                R(K+1,K+1) in the next factorization step K+1.
+*> \endverbatim
+*>
+*> \param[out] RELMAXC2NRMK
+*> \verbatim
+*>          RELMAXC2NRMK is DOUBLE PRECISION
+*>          The ratio MAXC2NRMK / MAXC2NRM of the maximum column
+*>          2-norm of the residual matrix R22(K) (when the factorization
+*>          stopped at rank K) to the maximum column 2-norm of the
+*>          whole original matrix A. RELMAXC2NRMK >= 0.
+*>
+*>          a) If K = 0, i.e. the factorization was not performed,
+*>             the matrix A was not modified and is itself a residual
+*>             matrix, then RELMAXC2NRMK = 1.0.
+*>
+*>          b) If 0 < K < min(M,N), then
+*>                RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM is returned.
+*>
+*>          c) If K = min(M,N), i.e. the whole matrix A was
+*>             factorized and there is no residual matrix,
+*>             then RELMAXC2NRMK = 0.0.
+*>
+*>         NOTE: RELMAXC2NRMK in the factorization step K would equal
+*>               abs(R(K+1,K+1))/abs(R(1,1)) in the next factorization
+*>               step K+1.
+*> \endverbatim
+*>
+*> \param[out] JPIV
+*> \verbatim
+*>          JPIV is INTEGER array, dimension (N)
+*>          Column pivot indices. For 1 <= j <= N, column j
+*>          of the matrix A was interchanged with column JPIV(j).
+*>
+*>          The elements of the array JPIV(1:N) are always set
+*>          by the routine, for example, even when no columns
+*>          were factorized, i.e. when K = 0, the elements are
+*>          set as JPIV(j) = j for j = 1:N.
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is DOUBLE PRECISION array, dimension (min(M,N))
+*>          The scalar factors of the elementary reflectors.
+*>
+*>          If 0 < K <= min(M,N), only the elements TAU(1:K) of
+*>          the array TAU are modified by the factorization.
+*>          After the factorization is computed, if no NaN was found
+*>          during the factorization, the remaining elements
+*>          TAU(K+1:min(M,N)) are set to zero, otherwise the
+*>          elements TAU(K+1:min(M,N)) are not set and therefore
+*>          undefined.
+*>          ( If K = 0, all elements of TAU are set to zero, if
+*>          the matrix A does not contain NaN. )
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and
+*>          LWORK >= (3*N+NRHS-1), otherwise.
+*>          For optimal performance LWORK >= (2*N + NB*( N+NRHS+1 )),
+*>          where NB is the optimal block size for DGEQP3RK returned
+*>          by ILAENV. Minimal block size MINNB=2.
+*>
+*>          NOTE: The decision, whether to use unblocked BLAS 2
+*>          or blocked BLAS 3 code is based not only on the dimension
+*>          LWORK of the available workspace WORK, but also on the
+*>          matrix A dimension N via crossover point NX returned
+*>          by ILAENV. (For N less than NX, unblocked code should be
+*>          used.)
+*>
+*>          If LWORK = -1, then a workspace query is assumed;
+*>          the routine only calculates the optimal size of the WORK
+*>          array, returns this value as the first entry of the WORK
+*>          array, and no error message related to LWORK is issued
+*>          by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (N-1).
+*>          Is a work array. ( IWORK is used to store indices
+*>          of "bad" columns for norm downdating in the residual
+*>          matrix in the blocked step auxiliary subroutine DLAQP3RK ).
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          1) INFO = 0: successful exit.
+*>          2) INFO < 0: if INFO = -i, the i-th argument had an
+*>                       illegal value.
+*>          3) If INFO = j_1, where 1 <= j_1 <= N, then NaN was
+*>             detected and the routine stops the computation.
+*>             The j_1-th column of the matrix A or the j_1-th
+*>             element of array TAU contains the first occurrence
+*>             of NaN in the factorization step K+1 ( when K columns
+*>             have been factorized ).
+*>
+*>             On exit:
+*>             K                  is set to the number of
+*>                                   factorized columns without
+*>                                   exception.
+*>             MAXC2NRMK          is set to NaN.
+*>             RELMAXC2NRMK       is set to NaN.
+*>             TAU(K+1:min(M,N))  is not set and contains undefined
+*>                                   elements. If j_1=K+1, TAU(K+1)
+*>                                   may contain NaN.
+*>          4) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN
+*>             was detected, but +Inf (or -Inf) was detected and
+*>             the routine continues the computation until completion.
+*>             The (j_2-N)-th column of the matrix A contains the first
+*>             occurrence of +Inf (or -Inf) in the factorization
+*>             step K+1 ( when K columns have been factorized ).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup geqp3rk
+*
+*> \par Further Details:
+*  =====================
+*
+*> \verbatim
+*> DGEQP3RK is based on the same BLAS3 Householder QR factorization
+*> algorithm with column pivoting as in DGEQP3 routine which uses
+*> DLARFG routine to generate Householder reflectors
+*> for QR factorization.
+*>
+*> We can also write:
+*>
+*>   A = A_approx(K) + A_residual(K)
+*>
+*> The low rank approximation matrix A(K)_approx from
+*> the truncated QR factorization of rank K of the matrix A is:
+*>
+*>   A(K)_approx = Q(K) * ( R(K)_approx ) * P(K)**T
+*>                        (     0     0 )
+*>
+*>               = Q(K) * ( R11(K) R12(K) ) * P(K)**T
+*>                        (      0      0 )
+*>
+*> The residual A_residual(K) of the matrix A is:
+*>
+*>   A_residual(K) = Q(K) * ( 0              0 ) * P(K)**T =
+*>                          ( 0  R(K)_residual )
+*>
+*>                 = Q(K) * ( 0        0 ) * P(K)**T
+*>                          ( 0   R22(K) )
+*>
+*> The truncated (rank K) factorization guarantees that
+*> the maximum column 2-norm of A_residual(K) is less than
+*> or equal to MAXC2NRMK up to roundoff error.
+*>
+*> NOTE: An approximation of the null vectors
+*>       of A can be easily computed from R11(K)
+*>       and R12(K):
+*>
+*>       Null( A(K) )_approx = P * ( inv(R11(K)) * R12(K) )
+*>                                 (         -I           )
+*>
+*> \endverbatim
+*
+*> \par References:
+*  ================
+*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996.
+*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain.
+*> X. Sun, Computer Science Dept., Duke University, USA.
+*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA.
+*> A BLAS-3 version of the QR factorization with column pivoting.
+*> LAPACK Working Note 114
+*> \htmlonly
+*> <a href="https://www.netlib.org/lapack/lawnspdf/lawn114.pdf">https://www.netlib.org/lapack/lawnspdf/lawn114.pdf</a>
+*> \endhtmlonly
+*> and in
+*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998.
+*> \htmlonly
+*> <a href="https://doi.org/10.1137/S1064827595296732">https://doi.org/10.1137/S1064827595296732</a>
+*> \endhtmlonly
+*>
+*> [2] A partial column norm updating strategy developed in 2006.
+*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia.
+*> On the failure of rank revealing QR factorization software – a case study.
+*> LAPACK Working Note 176.
+*> \htmlonly
+*> <a href="http://www.netlib.org/lapack/lawnspdf/lawn176.pdf">http://www.netlib.org/lapack/lawnspdf/lawn176.pdf</a>
+*> \endhtmlonly
+*> and in
+*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages.
+*> \htmlonly
+*> <a href="https://doi.org/10.1145/1377612.1377616">https://doi.org/10.1145/1377612.1377616</a>
+*> \endhtmlonly
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*>  November  2023, Igor Kozachenko, James Demmel,
+*>                  EECS Department,
+*>                  University of California, Berkeley, USA.
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE DGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA,
+     $                     K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+     $                     WORK, LWORK, IWORK, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO, K, KMAX, LDA, LWORK, M, N, NRHS
+      DOUBLE PRECISION   ABSTOL,  MAXC2NRMK, RELMAXC2NRMK, RELTOL
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IWORK( * ), JPIV( * )
+      DOUBLE PRECISION   A( LDA, * ), TAU( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            INB, INBMIN, IXOVER
+      PARAMETER          ( INB = 1, INBMIN = 2, IXOVER = 3 )
+      DOUBLE PRECISION   ZERO, ONE, TWO
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0, TWO = 2.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LQUERY, DONE
+      INTEGER            IINFO, IOFFSET, IWS, J, JB, JBF, JMAXB, JMAX,
+     $                   JMAXC2NRM, KF, KP1, LWKOPT, MINMN, N_SUB, NB,
+     $                   NBMIN, NX
+      DOUBLE PRECISION   EPS, HUGEVAL, MAXC2NRM, SAFMIN
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DLAQP2RK, DLAQP3RK, XERBLA
+*     ..
+*     .. External Functions ..
+      LOGICAL            DISNAN
+      INTEGER            IDAMAX, ILAENV
+      DOUBLE PRECISION   DLAMCH, DNRM2
+      EXTERNAL           DISNAN, DLAMCH, DNRM2, IDAMAX, ILAENV
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          DBLE, MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test input arguments
+*     ====================
+*
+      INFO = 0
+      LQUERY = ( LWORK.EQ.-1 )
+      IF( M.LT.0 ) THEN
+         INFO = -1
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -2
+      ELSE IF( NRHS.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( KMAX.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( DISNAN( ABSTOL ) ) THEN
+         INFO = -5
+      ELSE IF( DISNAN( RELTOL ) ) THEN
+         INFO = -6
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -8
+      END IF
+*
+*     If the input parameters M, N, NRHS, KMAX, LDA are valid:
+*       a) Test the input workspace size LWORK for the minimum
+*          size requirement IWS.
+*       b) Determine the optimal block size NB and optimal
+*          workspace size LWKOPT to be returned in WORK(1)
+*          in case of (1) LWORK < IWS, (2) LQUERY = .TRUE.,
+*          (3) when routine exits.
+*     Here, IWS is the minimum workspace required for unblocked
+*     code.
+*
+      IF( INFO.EQ.0 ) THEN
+         MINMN = MIN( M, N )
+         IF( MINMN.EQ.0 ) THEN
+            IWS = 1
+            LWKOPT = 1
+         ELSE
+*
+*           Minimal workspace size in case of using only unblocked
+*           BLAS 2 code in DLAQP2RK.
+*           1) DGEQP3RK and DLAQP2RK: 2*N to store full and partial
+*              column 2-norms.
+*           2) DLAQP2RK: N+NRHS-1 to use in WORK array that is used
+*              in DLARF subroutine inside DLAQP2RK to apply an
+*              elementary reflector from the left.
+*           TOTAL_WORK_SIZE = 3*N + NRHS - 1
+*
+            IWS = 3*N + NRHS - 1
+*
+*           Assign to NB optimal block size.
+*
+            NB = ILAENV( INB, 'DGEQP3RK', ' ', M, N, -1, -1 )
+*
+*           A formula for the optimal workspace size in case of using
+*           both unblocked BLAS 2 in DLAQP2RK and blocked BLAS 3 code
+*           in DLAQP3RK.
+*           1) DGEQP3RK, DLAQP2RK, DLAQP3RK: 2*N to store full and
+*              partial column 2-norms.
+*           2) DLAQP2RK: N+NRHS-1 to use in WORK array that is used
+*              in DLARF subroutine to apply an elementary reflector
+*              from the left.
+*           3) DLAQP3RK: NB*(N+NRHS) to use in the work array F that
+*              is used to apply a block reflector from
+*              the left.
+*           4) DLAQP3RK: NB to use in the auxiliary array AUX.
+*           Sizes (2) and ((3) + (4)) should intersect, therefore
+*           TOTAL_WORK_SIZE = 2*N + NB*( N+NRHS+1 ), given NBMIN=2.
+*
+            LWKOPT = 2*N + NB*( N+NRHS+1 )
+         END IF
+         WORK( 1 ) = DBLE( LWKOPT )
+*
+         IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN
+            INFO = -15
+         END IF
+      END IF
+*
+*     NOTE: The optimal workspace size is returned in WORK(1), if
+*           the input parameters M, N, NRHS, KMAX, LDA are valid.
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'DGEQP3RK', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible for M=0 or N=0.
+*
+      IF( MINMN.EQ.0 ) THEN
+         K = 0
+         MAXC2NRMK = ZERO
+         RELMAXC2NRMK = ZERO
+         WORK( 1 ) = DBLE( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*
+*     Initialize column pivot array JPIV.
+*
+      DO J = 1, N
+         JPIV( J ) = J
+      END DO
+*
+*     ==================================================================
+*
+*     Initialize storage for partial and exact column 2-norms.
+*     a) The elements WORK(1:N) are used to store partial column
+*        2-norms of the matrix A, and may decrease in each computation
+*        step; initialize to the values of complete columns 2-norms.
+*     b) The elements WORK(N+1:2*N) are used to store complete column
+*        2-norms of the matrix A, they are not changed during the
+*        computation; initialize the values of complete columns 2-norms.
+*
+      DO J = 1, N
+         WORK( J ) = DNRM2( M, A( 1, J ), 1 )
+         WORK( N+J ) = WORK( J )
+      END DO
+*
+*     ==================================================================
+*
+*     Compute the pivot column index and the maximum column 2-norm
+*     for the whole original matrix stored in A(1:M,1:N).
+*
+      KP1 = IDAMAX( N, WORK( 1 ), 1 )
+      MAXC2NRM = WORK( KP1 )
+*
+*     ==================================================================
+*
+      IF( DISNAN( MAXC2NRM ) ) THEN
+*
+*        Check if the matrix A contains NaN, set INFO parameter
+*        to the column number where the first NaN is found and return
+*        from the routine.
+*
+         K = 0
+         INFO = KP1
+*
+*        Set MAXC2NRMK and  RELMAXC2NRMK to NaN.
+*
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = MAXC2NRM
+*
+*        Array TAU is not set and contains undefined elements.
+*
+         WORK( 1 ) = DBLE( LWKOPT )
+         RETURN
+      END IF
+*
+*     ===================================================================
+*
+      IF( MAXC2NRM.EQ.ZERO ) THEN
+*
+*        Check if the matrix A is a zero matrix, set array TAU and
+*        return from the routine.
+*
+         K = 0
+         MAXC2NRMK = ZERO
+         RELMAXC2NRMK = ZERO
+*
+         DO J = 1, MINMN
+            TAU( J ) = ZERO
+         END DO
+*
+         WORK( 1 ) = DBLE( LWKOPT )
+         RETURN
+*
+      END IF
+*
+*     ===================================================================
+*
+      HUGEVAL = DLAMCH( 'Overflow' )
+*
+      IF( MAXC2NRM.GT.HUGEVAL ) THEN
+*
+*        Check if the matrix A contains +Inf or -Inf, set INFO parameter
+*        to the column number, where the first +/-Inf  is found plus N,
+*        and continue the computation.
+*
+         INFO = N + KP1
+*
+      END IF
+*
+*     ==================================================================
+*
+*     Quick return if possible for the case when the first
+*     stopping criterion is satisfied, i.e. KMAX = 0.
+*
+      IF( KMAX.EQ.0 ) THEN
+         K = 0
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = ONE
+         DO J = 1, MINMN
+            TAU( J ) = ZERO
+         END DO
+         WORK( 1 ) = DBLE( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*
+      EPS = DLAMCH('Epsilon')
+*
+*     Adjust ABSTOL
+*
+      IF( ABSTOL.GE.ZERO ) THEN
+         SAFMIN = DLAMCH('Safe minimum')
+         ABSTOL = MAX( ABSTOL, TWO*SAFMIN )
+      END IF
+*
+*     Adjust RELTOL
+*
+      IF( RELTOL.GE.ZERO ) THEN
+         RELTOL = MAX( RELTOL, EPS )
+      END IF
+*
+*     ===================================================================
+*
+*     JMAX is the maximum index of the column to be factorized,
+*     which is also limited by the first stopping criterion KMAX.
+*
+      JMAX = MIN( KMAX, MINMN )
+*
+*     ===================================================================
+*
+*     Quick return if possible for the case when the second or third
+*     stopping criterion for the whole original matrix is satisfied,
+*     i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL
+*     (which is ONE <= RELTOL).
+*
+      IF( MAXC2NRM.LE.ABSTOL .OR. ONE.LE.RELTOL ) THEN
+*
+         K = 0
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = ONE
+*
+         DO J = 1, MINMN
+            TAU( J ) = ZERO
+         END DO
+*
+         WORK( 1 ) = DBLE( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*     Factorize columns
+*     ==================================================================
+*
+*     Determine the block size.
+*
+      NBMIN = 2
+      NX = 0
+*
+      IF( ( NB.GT.1 ) .AND. ( NB.LT.MINMN ) ) THEN
+*
+*        Determine when to cross over from blocked to unblocked code.
+*        (for N less than NX, unblocked code should be used).
+*
+         NX = MAX( 0, ILAENV( IXOVER, 'DGEQP3RK', ' ', M, N, -1, -1 ))
+*
+         IF( NX.LT.MINMN ) THEN
+*
+*           Determine if workspace is large enough for blocked code.
+*
+            IF( LWORK.LT.LWKOPT ) THEN
+*
+*              Not enough workspace to use optimal block size that
+*              is currently stored in NB. Reduce NB so that
+*              2*N + NB*( N+NRHS+1 ) <= LWORK, and determine NBMIN.
+*
+               NB = ( LWORK-2*N ) / ( N+NRHS+1 )
+               NBMIN = MAX( 2, ILAENV( INBMIN, 'DGEQP3RK', ' ', M, N,
+     $                 -1, -1 ) )
+*
+            END IF
+         END IF
+      END IF
+*
+*     ==================================================================
+*
+*     DONE is the boolean flag to represent the case when the
+*     factorization completed in the block factorization routine,
+*     before the end of the block.
+*
+      DONE = .FALSE.
+*
+*     J is the column index.
+*
+      J = 1
+*
+*     (1) Use blocked code initially.
+*
+*     JMAXB is the maximum column index of the block, when the
+*     blocked code is used, is also limited by the first stopping
+*     criterion KMAX.
+*
+      JMAXB = MIN( KMAX, MINMN - NX )
+*
+      IF( NB.GE.NBMIN .AND. NB.LT.JMAX .AND. JMAXB.GT.0 ) THEN
+*
+*        Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here:
+*        J   is the column index of a column block;
+*        JB  is the column block size to pass to block factorization
+*            routine in a loop step;
+*        JBF is the number of columns that were actually factorized
+*            that was returned by the block factorization routine
+*            in a loop step, JBF <= JB;
+*        N_SUB is the number of columns in the submatrix;
+*        IOFFSET is the number of rows that should not be factorized.
+*
+         DO WHILE( J.LE.JMAXB )
+*
+            JB = MIN( NB, JMAXB-J+1 )
+            N_SUB = N-J+1
+            IOFFSET = J-1
+*
+*           Factorize JB columns among the columns A(J:N).
+*
+            CALL DLAQP3RK( M, N_SUB, NRHS, IOFFSET, JB, ABSTOL,
+     $                     RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA,
+     $                     DONE, JBF, MAXC2NRMK, RELMAXC2NRMK,
+     $                     JPIV( J ), TAU( J ),
+     $                     WORK( J ), WORK( N+J ),
+     $                     WORK( 2*N+1 ), WORK( 2*N+JB+1 ),
+     $                     N+NRHS-J+1, IWORK, IINFO )
+*
+*           Set INFO on the first occurrence of Inf.
+*
+            IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN
+               INFO = 2*IOFFSET + IINFO
+            END IF
+*
+            IF( DONE ) THEN
+*
+*              Either the submatrix is zero before the end of the
+*              column block, or ABSTOL or RELTOL criterion is
+*              satisfied before the end of the column block, we can
+*              return from the routine. Perform the following before
+*              returning:
+*                a) Set the number of factorized columns K,
+*                   K = IOFFSET + JBF from the last call of blocked
+*                   routine.
+*                NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned
+*                         by the block factorization routine;
+*                      2) The remaining TAUs are set to ZERO by the
+*                         block factorization routine.
+*
+               K = IOFFSET + JBF
+*
+*              Set INFO on the first occurrence of NaN, NaN takes
+*              precedence over Inf.
+*
+               IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN
+                  INFO = IOFFSET + IINFO
+               END IF
+*
+*              Return from the routine.
+*
+               WORK( 1 ) = DBLE( LWKOPT )
+*
+               RETURN
+*
+            END IF
+*
+            J = J + JBF
+*
+         END DO
+*
+      END IF
+*
+*     Use unblocked code to factor the last or only block.
+*     J = JMAX+1 means we factorized the maximum possible number of
+*     columns, that is in ELSE clause we need to compute
+*     the MAXC2NRMK and RELMAXC2NRMK to return after we processed
+*     the blocks.
+*
+      IF( J.LE.JMAX ) THEN
+*
+*        N_SUB is the number of columns in the submatrix;
+*        IOFFSET is the number of rows that should not be factorized.
+*
+         N_SUB = N-J+1
+         IOFFSET = J-1
+*
+         CALL DLAQP2RK( M, N_SUB, NRHS, IOFFSET, JMAX-J+1,
+     $                  ABSTOL, RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA,
+     $                  KF, MAXC2NRMK, RELMAXC2NRMK, JPIV( J ),
+     $                  TAU( J ), WORK( J ), WORK( N+J ),
+     $                  WORK( 2*N+1 ), IINFO )
+*
+*        ABSTOL or RELTOL criterion is satisfied when the number of
+*        the factorized columns KF is smaller than the number
+*        of columns JMAX-J+1 supplied to be factorized by the
+*        unblocked routine, we can return from
+*        the routine. Perform the following before returning:
+*           a) Set the number of factorized columns K,
+*           b) MAXC2NRMK and RELMAXC2NRMK are returned by the
+*              unblocked factorization routine above.
+*
+         K = J - 1 + KF
+*
+*        Set INFO on the first exception occurrence of Inf or NaN
+*        (NaN takes precedence over Inf):
+*          a) IINFO > N_SUB is handled as Inf, only if INFO is unset;
+*          b) 0 < IINFO <= N_SUB is handled as NaN, overriding INFO.
+*
+         IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN
+            INFO = 2*IOFFSET + IINFO
+         ELSE IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN
+            INFO = IOFFSET + IINFO
+         END IF
+*
+      ELSE
+*
+*        Compute the return values for blocked code.
+*
+*        Set the number of factorized columns if the unblocked routine
+*        was not called.
+*
+         K = JMAX
+*
+*        If there exists a residual matrix after the blocked code:
+*           1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the
+*              residual matrix, otherwise set them to ZERO;
+*           2) Set TAU(K+1:MINMN) to ZERO.
+*
+         IF( K.LT.MINMN ) THEN
+            JMAXC2NRM = K + IDAMAX( N-K, WORK( K+1 ), 1 )
+            MAXC2NRMK = WORK( JMAXC2NRM )
+            IF( K.EQ.0 ) THEN
+               RELMAXC2NRMK = ONE
+            ELSE
+               RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM
+            END IF
+*
+            DO J = K + 1, MINMN
+               TAU( J ) = ZERO
+            END DO
+*
+         END IF
+*
+*     END IF( J.LE.JMAX ) THEN
+*
+      END IF
+*
+      WORK( 1 ) = DBLE( LWKOPT )
+*
+      RETURN
+*
+*     End of DGEQP3RK
+*
+      END
diff --git a/lapack-netlib/SRC/dgeqr.f b/lapack-netlib/SRC/dgeqr.f
index eac8930ce..6ed8f211f 100644
--- a/lapack-netlib/SRC/dgeqr.f
+++ b/lapack-netlib/SRC/dgeqr.f
@@ -99,7 +99,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1 or -2, then a workspace query is assumed. The routine
 *>          only calculates the sizes of the T and WORK arrays, returns these
 *>          values as the first entries of the T and WORK arrays, and no error
@@ -168,6 +168,8 @@
 *>
 *> \endverbatim
 *>
+*> \ingroup geqr
+*>
 *  =====================================================================
       SUBROUTINE DGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK,
      $                  INFO )
@@ -188,7 +190,7 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY, LMINWS, MINT, MINW
-      INTEGER            MB, NB, MINTSZ, NBLCKS
+      INTEGER            MB, NB, MINTSZ, NBLCKS, LWMIN, LWREQ
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -244,8 +246,10 @@
 *
 *     Determine if the workspace size satisfies minimal size
 *
+      LWMIN = MAX( 1, N )
+      LWREQ = MAX( 1, N*NB )
       LMINWS = .FALSE.
-      IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.NB*N )
+      IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.LWREQ )
      $    .AND. ( LWORK.GE.N ) .AND. ( TSIZE.GE.MINTSZ )
      $    .AND. ( .NOT.LQUERY ) ) THEN
         IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) ) THEN
@@ -253,7 +257,7 @@
           NB = 1
           MB = M
         END IF
-        IF( LWORK.LT.NB*N ) THEN
+        IF( LWORK.LT.LWREQ ) THEN
           LMINWS = .TRUE.
           NB = 1
         END IF
@@ -268,7 +272,7 @@
       ELSE IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 )
      $   .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN
         INFO = -6
-      ELSE IF( ( LWORK.LT.MAX( 1, N*NB ) ) .AND. ( .NOT.LQUERY )
+      ELSE IF( ( LWORK.LT.LWREQ ) .AND. ( .NOT.LQUERY )
      $   .AND. ( .NOT.LMINWS ) ) THEN
         INFO = -8
       END IF
@@ -282,9 +286,9 @@
         T( 2 ) = MB
         T( 3 ) = NB
         IF( MINW ) THEN
-          WORK( 1 ) = MAX( 1, N )
+          WORK( 1 ) = LWMIN
         ELSE
-          WORK( 1 ) = MAX( 1, NB*N )
+          WORK( 1 ) = LWREQ
         END IF
       END IF
       IF( INFO.NE.0 ) THEN
@@ -309,7 +313,7 @@
      $                LWORK, INFO )
       END IF
 *
-      WORK( 1 ) = MAX( 1, NB*N )
+      WORK( 1 ) = LWREQ
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/dgeqrfp.f b/lapack-netlib/SRC/dgeqrfp.f
index 46d2ee479..aa757e96c 100644
--- a/lapack-netlib/SRC/dgeqrfp.f
+++ b/lapack-netlib/SRC/dgeqrfp.f
@@ -97,7 +97,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= max(1,N).
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise.
 *>          For optimum performance LWORK >= N*NB, where NB is
 *>          the optimal blocksize.
 *>
@@ -122,7 +123,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleGEcomputational
+*> \ingroup geqrfp
 *
 *> \par Further Details:
 *  =====================
@@ -162,8 +163,8 @@
 *
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, IINFO, IWS, K, LDWORK, LWKOPT, NB,
-     $                   NBMIN, NX
+      INTEGER            I, IB, IINFO, IWS, K, LDWORK, LWKMIN, LWKOPT,
+     $                   NB, NBMIN, NX
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           DGEQR2P, DLARFB, DLARFT, XERBLA
@@ -181,8 +182,16 @@
 *
       INFO = 0
       NB = ILAENV( 1, 'DGEQRF', ' ', M, N, -1, -1 )
-      LWKOPT = N*NB
+      K = MIN( M, N )
+      IF( K.EQ.0 ) THEN
+         LWKMIN = 1
+         LWKOPT = 1
+      ELSE
+         LWKMIN = N
+         LWKOPT = N*NB
+      END IF
       WORK( 1 ) = LWKOPT
+*
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -190,7 +199,7 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -7
       END IF
       IF( INFO.NE.0 ) THEN
@@ -202,7 +211,6 @@
 *
 *     Quick return if possible
 *
-      K = MIN( M, N )
       IF( K.EQ.0 ) THEN
          WORK( 1 ) = 1
          RETURN
@@ -210,7 +218,7 @@
 *
       NBMIN = 2
       NX = 0
-      IWS = N
+      IWS = LWKMIN
       IF( NB.GT.1 .AND. NB.LT.K ) THEN
 *
 *        Determine when to cross over from blocked to unblocked code.
diff --git a/lapack-netlib/SRC/dgerqf.f b/lapack-netlib/SRC/dgerqf.f
index cca9d6367..435239cc7 100644
--- a/lapack-netlib/SRC/dgerqf.f
+++ b/lapack-netlib/SRC/dgerqf.f
@@ -114,7 +114,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleGEcomputational
+*> \ingroup gerqf
 *
 *> \par Further Details:
 *  =====================
@@ -189,7 +189,7 @@
          END IF
          WORK( 1 ) = LWKOPT
 *
-         IF ( .NOT.LQUERY ) THEN
+         IF( .NOT.LQUERY ) THEN
             IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) )
      $         INFO = -7
          END IF
diff --git a/lapack-netlib/SRC/dgesdd.f b/lapack-netlib/SRC/dgesdd.f
index 738a122e8..87a4e702d 100644
--- a/lapack-netlib/SRC/dgesdd.f
+++ b/lapack-netlib/SRC/dgesdd.f
@@ -55,12 +55,6 @@
 *>
 *> Note that the routine returns VT = V**T, not V.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/dgesvdq.f b/lapack-netlib/SRC/dgesvdq.f
index 6f9ac703e..a514def8b 100644
--- a/lapack-netlib/SRC/dgesvdq.f
+++ b/lapack-netlib/SRC/dgesvdq.f
@@ -365,7 +365,7 @@
 *>   an optimal implementation would do all necessary scaling before calling
 *>   CGESVD and the scaling in CGESVD can be switched off.
 *>   3. Other comments related to code optimization are given in comments in the
-*>   code, enlosed in [[double brackets]].
+*>   code, enclosed in [[double brackets]].
 *> \endverbatim
 *
 *> \par Bugs, examples and comments
diff --git a/lapack-netlib/SRC/dgesvj.f b/lapack-netlib/SRC/dgesvj.f
index 5fdb21e45..198bfb0a5 100644
--- a/lapack-netlib/SRC/dgesvj.f
+++ b/lapack-netlib/SRC/dgesvj.f
@@ -208,7 +208,7 @@
 *>
 *> \param[in,out] WORK
 *> \verbatim
-*>          WORK is DOUBLE PRECISION array, dimension (LWORK)
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK))
 *>          On entry :
 *>          If JOBU = 'C' :
 *>          WORK(1) = CTOL, where CTOL defines the threshold for convergence.
@@ -239,7 +239,12 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          length of WORK, WORK >= MAX(6,M+N)
+*>          The length of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(6,M+N), otherwise.
+*>
+*>          If on entry LWORK = -1, then a workspace query is assumed and
+*>          no computation is done; WORK(1) is set to the minimal (and optimal)
+*>          length of WORK.
 *> \endverbatim
 *>
 *> \param[out] INFO
@@ -260,7 +265,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleGEcomputational
+*> \ingroup gesvj
 *
 *> \par Further Details:
 *  =====================
@@ -365,9 +370,9 @@
       INTEGER            BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1,
      $                   ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34,
      $                   N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP,
-     $                   SWBAND
-      LOGICAL            APPLV, GOSCALE, LOWER, LSVEC, NOSCALE, ROTOK,
-     $                   RSVEC, UCTOL, UPPER
+     $                   SWBAND, MINMN, LWMIN
+      LOGICAL            APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE,
+     $                   ROTOK, RSVEC, UCTOL, UPPER
 *     ..
 *     .. Local Arrays ..
       DOUBLE PRECISION   FASTR( 5 )
@@ -408,6 +413,14 @@
       UPPER = LSAME( JOBA, 'U' )
       LOWER = LSAME( JOBA, 'L' )
 *
+      MINMN = MIN( M, N )
+      IF( MINMN.EQ.0 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = MAX( 6, M+N )
+      END IF
+*
+      LQUERY = ( LWORK.EQ.-1 )
       IF( .NOT.( UPPER .OR. LOWER .OR. LSAME( JOBA, 'G' ) ) ) THEN
          INFO = -1
       ELSE IF( .NOT.( LSVEC .OR. UCTOL .OR. LSAME( JOBU, 'N' ) ) ) THEN
@@ -427,7 +440,7 @@
          INFO = -11
       ELSE IF( UCTOL .AND. ( WORK( 1 ).LE.ONE ) ) THEN
          INFO = -12
-      ELSE IF( LWORK.LT.MAX( M+N, 6 ) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. ( .NOT.LQUERY ) ) THEN
          INFO = -13
       ELSE
          INFO = 0
@@ -437,11 +450,14 @@
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'DGESVJ', -INFO )
          RETURN
+      ELSE IF( LQUERY ) THEN
+         WORK( 1 ) = LWMIN
+         RETURN
       END IF
 *
 * #:) Quick return for void matrix
 *
-      IF( ( M.EQ.0 ) .OR. ( N.EQ.0 ) )RETURN
+      IF( MINMN.EQ.0 ) RETURN
 *
 *     Set numerical parameters
 *     The stopping criterion for Jacobi rotations is
diff --git a/lapack-netlib/SRC/dgetri.f b/lapack-netlib/SRC/dgetri.f
index 92ef90c18..7b5a3a1b6 100644
--- a/lapack-netlib/SRC/dgetri.f
+++ b/lapack-netlib/SRC/dgetri.f
@@ -107,7 +107,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleGEcomputational
+*> \ingroup getri
 *
 *  =====================================================================
       SUBROUTINE DGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO )
@@ -151,8 +151,9 @@
 *
       INFO = 0
       NB = ILAENV( 1, 'DGETRI', ' ', N, -1, -1, -1 )
-      LWKOPT = N*NB
+      LWKOPT = MAX( 1, N*NB )
       WORK( 1 ) = LWKOPT
+*
       LQUERY = ( LWORK.EQ.-1 )
       IF( N.LT.0 ) THEN
          INFO = -1
diff --git a/lapack-netlib/SRC/dgetsls.f b/lapack-netlib/SRC/dgetsls.f
index 25f4c12c2..73b505ff7 100644
--- a/lapack-netlib/SRC/dgetsls.f
+++ b/lapack-netlib/SRC/dgetsls.f
@@ -127,7 +127,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1 or -2, then a workspace query is assumed.
 *>          If LWORK = -1, the routine calculates optimal size of WORK for the
 *>          optimal performance and returns this value in WORK(1).
@@ -154,7 +154,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleGEsolve
+*> \ingroup getsls
 *
 *  =====================================================================
       SUBROUTINE DGETSLS( TRANS, M, N, NRHS, A, LDA, B, LDB,
@@ -189,7 +189,7 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       DOUBLE PRECISION   DLAMCH, DLANGE
-      EXTERNAL           LSAME, DLABAD, DLAMCH, DLANGE
+      EXTERNAL           LSAME, DLAMCH, DLANGE
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           DGEQR, DGEMQR, DLASCL, DLASET,
@@ -226,7 +226,10 @@
 *
 *     Determine the optimum and minimum LWORK
 *
-       IF( M.GE.N ) THEN
+       IF( MIN( M, N, NRHS ).EQ.0 ) THEN
+         WSIZEM = 1
+         WSIZEO = 1
+       ELSE IF( M.GE.N ) THEN
          CALL DGEQR( M, N, A, LDA, TQ, -1, WORKQ, -1, INFO2 )
          TSZO = INT( TQ( 1 ) )
          LWO  = INT( WORKQ( 1 ) )
@@ -294,7 +297,6 @@
 *
        SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' )
        BIGNUM = ONE / SMLNUM
-       CALL DLABAD( SMLNUM, BIGNUM )
 *
 *     Scale A, B if max element outside range [SMLNUM,BIGNUM]
 *
diff --git a/lapack-netlib/SRC/dgetsqrhrt.f b/lapack-netlib/SRC/dgetsqrhrt.f
index 668deeba8..682c7c30f 100644
--- a/lapack-netlib/SRC/dgetsqrhrt.f
+++ b/lapack-netlib/SRC/dgetsqrhrt.f
@@ -130,14 +130,17 @@
 *>
 *> \param[in] LWORK
 *> \verbatim
+*>          LWORK is INTEGER
 *>          The dimension of the array WORK.
-*>          LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ),
+*>          If MIN(M,N) = 0, LWORK >= 1, else
+*>          LWORK >= MAX( 1, LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ),
 *>          where
 *>             NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)),
 *>             NB1LOCAL = MIN(NB1,N).
 *>             LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL,
 *>             LW1 = NB1LOCAL * N,
-*>             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ),
+*>             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ).
+*>
 *>          If LWORK = -1, then a workspace query is assumed.
 *>          The routine only calculates the optimal size of the WORK
 *>          array, returns this value as the first entry of the WORK
@@ -160,7 +163,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleOTHERcomputational
+*> \ingroup getsqrhrt
 *
 *> \par Contributors:
 *  ==================
@@ -212,7 +215,7 @@
 *     Test the input arguments
 *
       INFO = 0
-      LQUERY  = LWORK.EQ.-1
+      LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
       ELSE IF( N.LT.0 .OR. M.LT.N ) THEN
@@ -225,7 +228,7 @@
          INFO = -5
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -7
-      ELSE IF( LDT.LT.MAX( 1,  MIN( NB2, N ) ) ) THEN
+      ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN
          INFO = -9
       ELSE
 *
@@ -263,8 +266,9 @@
             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) )
 *
             LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) )
+            LWORKOPT = MAX( 1, LWORKOPT )
 *
-            IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN
+            IF( LWORK.LT.LWORKOPT .AND. .NOT.LQUERY ) THEN
                INFO = -11
             END IF
 *
@@ -346,4 +350,4 @@
 *
 *     End of DGETSQRHRT
 *
-      END
\ No newline at end of file
+      END
diff --git a/lapack-netlib/SRC/dgges.f b/lapack-netlib/SRC/dgges.f
index 31db23715..b9ffc7982 100644
--- a/lapack-netlib/SRC/dgges.f
+++ b/lapack-netlib/SRC/dgges.f
@@ -234,8 +234,8 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
-*>          If N = 0, LWORK >= 1, else LWORK >= 8*N+16.
-*>          For good performance , LWORK must generally be larger.
+*>          If N = 0, LWORK >= 1, else LWORK >= MAX(8*N,6*N+16).
+*>          For good performance, LWORK must generally be larger.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -275,7 +275,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleGEeigen
+*> \ingroup gges
 *
 *  =====================================================================
       SUBROUTINE DGGES( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, LDB,
@@ -321,9 +321,8 @@
       DOUBLE PRECISION   DIF( 2 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DGEQRF, DGGBAK, DGGBAL, DGGHRD, DHGEQZ, DLABAD,
-     $                   DLACPY, DLASCL, DLASET, DORGQR, DORMQR, DTGSEN,
-     $                   XERBLA
+      EXTERNAL           DGEQRF, DGGBAK, DGGBAL, DGGHRD, DHGEQZ, DLACPY,
+     $                   DLASCL, DLASET, DORGQR, DORMQR, DTGSEN, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -431,7 +430,6 @@
       EPS = DLAMCH( 'P' )
       SAFMIN = DLAMCH( 'S' )
       SAFMAX = ONE / SAFMIN
-      CALL DLABAD( SAFMIN, SAFMAX )
       SMLNUM = SQRT( SAFMIN ) / EPS
       BIGNUM = ONE / SMLNUM
 *
diff --git a/lapack-netlib/SRC/dgges3.f b/lapack-netlib/SRC/dgges3.f
index 7b00d294a..2ef55951a 100644
--- a/lapack-netlib/SRC/dgges3.f
+++ b/lapack-netlib/SRC/dgges3.f
@@ -234,6 +234,8 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
+*>          If N = 0, LWORK >= 1, else LWORK >= 6*N+16.
+*>          For good performance, LWORK must generally be larger.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -273,7 +275,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleGEeigen
+*> \ingroup gges3
 *
 *  =====================================================================
       SUBROUTINE DGGES3( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B,
@@ -309,7 +311,8 @@
       LOGICAL            CURSL, ILASCL, ILBSCL, ILVSL, ILVSR, LASTSL,
      $                   LQUERY, LST2SL, WANTST
       INTEGER            I, ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT,
-     $                   ILO, IP, IRIGHT, IROWS, ITAU, IWRK, LWKOPT
+     $                   ILO, IP, IRIGHT, IROWS, ITAU, IWRK, LWKOPT,
+     $                   LWKMIN
       DOUBLE PRECISION   ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, PVSL,
      $                   PVSR, SAFMAX, SAFMIN, SMLNUM
 *     ..
@@ -318,9 +321,8 @@
       DOUBLE PRECISION   DIF( 2 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DGEQRF, DGGBAK, DGGBAL, DGGHD3, DLAQZ0, DLABAD,
-     $                   DLACPY, DLASCL, DLASET, DORGQR, DORMQR, DTGSEN,
-     $                   XERBLA
+      EXTERNAL           DGEQRF, DGGBAK, DGGBAL, DGGHD3, DLAQZ0, DLACPY,
+     $                   DLASCL, DLASET, DORGQR, DORMQR, DTGSEN, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -362,6 +364,12 @@
 *
       INFO = 0
       LQUERY = ( LWORK.EQ.-1 )
+      IF( N.EQ.0 ) THEN
+         LWKMIN = 1
+      ELSE
+         LWKMIN = 6*N+16
+      END IF
+*
       IF( IJOBVL.LE.0 ) THEN
          INFO = -1
       ELSE IF( IJOBVR.LE.0 ) THEN
@@ -378,7 +386,7 @@
          INFO = -15
       ELSE IF( LDVSR.LT.1 .OR. ( ILVSR .AND. LDVSR.LT.N ) ) THEN
          INFO = -17
-      ELSE IF( LWORK.LT.6*N+16 .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -19
       END IF
 *
@@ -386,29 +394,33 @@
 *
       IF( INFO.EQ.0 ) THEN
          CALL DGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR )
-         LWKOPT = MAX( 6*N+16, 3*N+INT( WORK ( 1 ) ) )
+         LWKOPT = MAX( LWKMIN, 3*N+INT( WORK( 1 ) ) )
          CALL DORMQR( 'L', 'T', N, N, N, B, LDB, WORK, A, LDA, WORK,
      $                -1, IERR )
-         LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) )
+         LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) )
          IF( ILVSL ) THEN
             CALL DORGQR( N, N, N, VSL, LDVSL, WORK, WORK, -1, IERR )
-            LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) )
+            LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) )
          END IF
          CALL DGGHD3( JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, VSL,
      $                LDVSL, VSR, LDVSR, WORK, -1, IERR )
-         LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) )
+         LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) )
          CALL DLAQZ0( 'S', JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB,
      $                ALPHAR, ALPHAI, BETA, VSL, LDVSL, VSR, LDVSR,
      $                WORK, -1, 0, IERR )
-         LWKOPT = MAX( LWKOPT, 2*N+INT( WORK ( 1 ) ) )
+         LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) )
          IF( WANTST ) THEN
             CALL DTGSEN( 0, ILVSL, ILVSR, BWORK, N, A, LDA, B, LDB,
      $                   ALPHAR, ALPHAI, BETA, VSL, LDVSL, VSR, LDVSR,
      $                   SDIM, PVSL, PVSR, DIF, WORK, -1, IDUM, 1,
      $                   IERR )
-            LWKOPT = MAX( LWKOPT, 2*N+INT( WORK ( 1 ) ) )
+            LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) )
+         END IF
+         IF( N.EQ.0 ) THEN
+            WORK( 1 ) = 1
+         ELSE
+            WORK( 1 ) = LWKOPT
          END IF
-         WORK( 1 ) = LWKOPT
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -430,7 +442,6 @@
       EPS = DLAMCH( 'P' )
       SAFMIN = DLAMCH( 'S' )
       SAFMAX = ONE / SAFMIN
-      CALL DLABAD( SAFMIN, SAFMAX )
       SMLNUM = SQRT( SAFMIN ) / EPS
       BIGNUM = ONE / SMLNUM
 *
diff --git a/lapack-netlib/SRC/dggev3.f b/lapack-netlib/SRC/dggev3.f
index 4bbe8a40f..b970c04c4 100644
--- a/lapack-netlib/SRC/dggev3.f
+++ b/lapack-netlib/SRC/dggev3.f
@@ -188,7 +188,9 @@
 *>
 *> \param[in] LWORK
 *> \verbatim
-*>          LWORK is INTEGER
+*>          LWORK is INTEGER.
+*>          The dimension of the array WORK. LWORK >= MAX(1,8*N).
+*>          For good performance, LWORK should generally be larger.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -217,7 +219,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleGEeigen
+*> \ingroup ggev3
 *
 *  =====================================================================
       SUBROUTINE DGGEV3( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHAR,
@@ -248,7 +250,8 @@
       LOGICAL            ILASCL, ILBSCL, ILV, ILVL, ILVR, LQUERY
       CHARACTER          CHTEMP
       INTEGER            ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, ILO,
-     $                   IN, IRIGHT, IROWS, ITAU, IWRK, JC, JR, LWKOPT
+     $                   IN, IRIGHT, IROWS, ITAU, IWRK, JC, JR, LWKOPT,
+     $                   LWKMIN
       DOUBLE PRECISION   ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS,
      $                   SMLNUM, TEMP
 *     ..
@@ -256,9 +259,8 @@
       LOGICAL            LDUMMA( 1 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DGEQRF, DGGBAK, DGGBAL, DGGHD3, DLAQZ0, DLABAD,
-     $                   DLACPY, DLASCL, DLASET, DORGQR, DORMQR, DTGEVC,
-     $                   XERBLA
+      EXTERNAL           DGEQRF, DGGBAK, DGGBAL, DGGHD3, DLAQZ0, DLACPY,
+     $                   DLASCL, DLASET, DORGQR, DORMQR, DTGEVC, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -299,6 +301,7 @@
 *
       INFO = 0
       LQUERY = ( LWORK.EQ.-1 )
+      LWKMIN = MAX( 1, 8*N )
       IF( IJOBVL.LE.0 ) THEN
          INFO = -1
       ELSE IF( IJOBVR.LE.0 ) THEN
@@ -313,7 +316,7 @@
          INFO = -12
       ELSE IF( LDVR.LT.1 .OR. ( ILVR .AND. LDVR.LT.N ) ) THEN
          INFO = -14
-      ELSE IF( LWORK.LT.MAX( 1, 8*N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -16
       END IF
 *
@@ -321,13 +324,13 @@
 *
       IF( INFO.EQ.0 ) THEN
          CALL DGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR )
-         LWKOPT = MAX(1, 8*N, 3*N+INT( WORK( 1 ) ) )
+         LWKOPT = MAX( LWKMIN, 3*N+INT( WORK( 1 ) ) )
          CALL DORMQR( 'L', 'T', N, N, N, B, LDB, WORK, A, LDA, WORK, -1,
      $                IERR )
-         LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) )
+         LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) )
          IF( ILVL ) THEN
             CALL DORGQR( N, N, N, VL, LDVL, WORK, WORK, -1, IERR )
-            LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) )
+            LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) )
          END IF
          IF( ILV ) THEN
             CALL DGGHD3( JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, VL,
@@ -336,18 +339,21 @@
             CALL DLAQZ0( 'S', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB,
      $                   ALPHAR, ALPHAI, BETA, VL, LDVL, VR, LDVR,
      $                   WORK, -1, 0, IERR )
-            LWKOPT = MAX( LWKOPT, 2*N+INT( WORK ( 1 ) ) )
+            LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) )
          ELSE
             CALL DGGHD3( 'N', 'N', N, 1, N, A, LDA, B, LDB, VL, LDVL,
      $                   VR, LDVR, WORK, -1, IERR )
-            LWKOPT = MAX( LWKOPT, 3*N+INT( WORK ( 1 ) ) )
+            LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) )
             CALL DLAQZ0( 'E', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB,
      $                   ALPHAR, ALPHAI, BETA, VL, LDVL, VR, LDVR,
      $                   WORK, -1, 0, IERR )
-            LWKOPT = MAX( LWKOPT, 2*N+INT( WORK ( 1 ) ) )
+            LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) )
+         END IF
+         IF( N.EQ.0 ) THEN
+            WORK( 1 ) = 1
+         ELSE
+            WORK( 1 ) = LWKOPT
          END IF
-
-         WORK( 1 ) = LWKOPT
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -367,7 +373,6 @@
       EPS = DLAMCH( 'P' )
       SMLNUM = DLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL DLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
diff --git a/lapack-netlib/SRC/dgghd3.f b/lapack-netlib/SRC/dgghd3.f
index 43d7a77df..21a668573 100644
--- a/lapack-netlib/SRC/dgghd3.f
+++ b/lapack-netlib/SRC/dgghd3.f
@@ -179,14 +179,14 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is DOUBLE PRECISION array, dimension (LWORK)
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
-*> \param[in]  LWORK
+*> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of the array WORK.  LWORK >= 1.
+*>          The length of the array WORK. LWORK >= 1.
 *>          For optimum performance LWORK >= 6*N*NB, where NB is the
 *>          optimal blocksize.
 *>
@@ -211,7 +211,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleOTHERcomputational
+*> \ingroup gghd3
 *
 *> \par Further Details:
 *  =====================
@@ -275,7 +275,12 @@
 *
       INFO = 0
       NB = ILAENV( 1, 'DGGHD3', ' ', N, ILO, IHI, -1 )
-      LWKOPT = MAX( 6*N*NB, 1 )
+      NH = IHI - ILO + 1
+      IF( NH.LE.1 ) THEN
+         LWKOPT = 1
+      ELSE
+         LWKOPT = 6*N*NB
+      END IF
       WORK( 1 ) = DBLE( LWKOPT )
       INITQ = LSAME( COMPQ, 'I' )
       WANTQ = INITQ .OR. LSAME( COMPQ, 'V' )
@@ -325,7 +330,6 @@
 *
 *     Quick return if possible
 *
-      NH = IHI - ILO + 1
       IF( NH.LE.1 ) THEN
          WORK( 1 ) = ONE
          RETURN
@@ -885,6 +889,7 @@
       IF ( JCOL.LT.IHI )
      $   CALL DGGHRD( COMPQ2, COMPZ2, N, JCOL, IHI, A, LDA, B, LDB, Q,
      $                LDQ, Z, LDZ, IERR )
+*
       WORK( 1 ) = DBLE( LWKOPT )
 *
       RETURN
diff --git a/lapack-netlib/SRC/dggqrf.f b/lapack-netlib/SRC/dggqrf.f
index 39d27a5c9..edac7f22f 100644
--- a/lapack-netlib/SRC/dggqrf.f
+++ b/lapack-netlib/SRC/dggqrf.f
@@ -173,7 +173,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleOTHERcomputational
+*> \ingroup ggqrf
 *
 *> \par Further Details:
 *  =====================
@@ -250,7 +250,7 @@
       NB2 = ILAENV( 1, 'DGERQF', ' ', N, P, -1, -1 )
       NB3 = ILAENV( 1, 'DORMQR', ' ', N, M, P, -1 )
       NB = MAX( NB1, NB2, NB3 )
-      LWKOPT = MAX( N, M, P )*NB
+      LWKOPT = MAX( 1, MAX( N, M, P )*NB )
       WORK( 1 ) = LWKOPT
       LQUERY = ( LWORK.EQ.-1 )
       IF( N.LT.0 ) THEN
@@ -287,6 +287,7 @@
 *     RQ factorization of N-by-P matrix B: B = T*Z.
 *
       CALL DGERQF( N, P, B, LDB, TAUB, WORK, LWORK, INFO )
+*
       WORK( 1 ) = MAX( LOPT, INT( WORK( 1 ) ) )
 *
       RETURN
diff --git a/lapack-netlib/SRC/dggrqf.f b/lapack-netlib/SRC/dggrqf.f
index ddf4104c5..3b1024c1c 100644
--- a/lapack-netlib/SRC/dggrqf.f
+++ b/lapack-netlib/SRC/dggrqf.f
@@ -172,7 +172,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleOTHERcomputational
+*> \ingroup ggrqf
 *
 *> \par Further Details:
 *  =====================
@@ -249,7 +249,7 @@
       NB2 = ILAENV( 1, 'DGEQRF', ' ', P, N, -1, -1 )
       NB3 = ILAENV( 1, 'DORMRQ', ' ', M, N, P, -1 )
       NB = MAX( NB1, NB2, NB3 )
-      LWKOPT = MAX( N, M, P )*NB
+      LWKOPT = MAX( 1, MAX( N, M, P )*NB )
       WORK( 1 ) = LWKOPT
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
diff --git a/lapack-netlib/SRC/dggsvd3.f b/lapack-netlib/SRC/dggsvd3.f
index 503f0d8cc..ee4d11e86 100644
--- a/lapack-netlib/SRC/dggsvd3.f
+++ b/lapack-netlib/SRC/dggsvd3.f
@@ -278,7 +278,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -328,7 +328,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleGEsing
+*> \ingroup ggsvd3
 *
 *> \par Contributors:
 *  ==================
diff --git a/lapack-netlib/SRC/dggsvp3.f b/lapack-netlib/SRC/dggsvp3.f
index 4e1db3117..485d95b36 100644
--- a/lapack-netlib/SRC/dggsvp3.f
+++ b/lapack-netlib/SRC/dggsvp3.f
@@ -227,7 +227,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -250,7 +250,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleOTHERcomputational
+*> \ingroup ggsvp3
 *
 *> \par Further Details:
 *  =====================
diff --git a/lapack-netlib/SRC/dgsvj0.f b/lapack-netlib/SRC/dgsvj0.f
index 197a9a626..1008aa8c6 100644
--- a/lapack-netlib/SRC/dgsvj0.f
+++ b/lapack-netlib/SRC/dgsvj0.f
@@ -52,10 +52,10 @@
 *>          Specifies whether the output from this procedure is used
 *>          to compute the matrix V:
 *>          = 'V': the product of the Jacobi rotations is accumulated
-*>                 by postmulyiplying the N-by-N array V.
+*>                 by postmultiplying the N-by-N array V.
 *>                (See the description of V.)
 *>          = 'A': the product of the Jacobi rotations is accumulated
-*>                 by postmulyiplying the MV-by-N array V.
+*>                 by postmultiplying the MV-by-N array V.
 *>                (See the descriptions of MV and V.)
 *>          = 'N': the Jacobi rotations are not accumulated.
 *> \endverbatim
@@ -117,7 +117,7 @@
 *> \param[in] MV
 *> \verbatim
 *>          MV is INTEGER
-*>          If JOBV = 'A', then MV rows of V are post-multipled by a
+*>          If JOBV = 'A', then MV rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then MV is not referenced.
 *> \endverbatim
@@ -125,9 +125,9 @@
 *> \param[in,out] V
 *> \verbatim
 *>          V is DOUBLE PRECISION array, dimension (LDV,N)
-*>          If JOBV = 'V' then N rows of V are post-multipled by a
+*>          If JOBV = 'V' then N rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
-*>          If JOBV = 'A' then MV rows of V are post-multipled by a
+*>          If JOBV = 'A' then MV rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then V is not referenced.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dgsvj1.f b/lapack-netlib/SRC/dgsvj1.f
index 11d1dde2b..f1a099b2a 100644
--- a/lapack-netlib/SRC/dgsvj1.f
+++ b/lapack-netlib/SRC/dgsvj1.f
@@ -75,10 +75,10 @@
 *>          Specifies whether the output from this procedure is used
 *>          to compute the matrix V:
 *>          = 'V': the product of the Jacobi rotations is accumulated
-*>                 by postmulyiplying the N-by-N array V.
+*>                 by postmultiplying the N-by-N array V.
 *>                (See the description of V.)
 *>          = 'A': the product of the Jacobi rotations is accumulated
-*>                 by postmulyiplying the MV-by-N array V.
+*>                 by postmultiplying the MV-by-N array V.
 *>                (See the descriptions of MV and V.)
 *>          = 'N': the Jacobi rotations are not accumulated.
 *> \endverbatim
@@ -147,7 +147,7 @@
 *> \param[in] MV
 *> \verbatim
 *>          MV is INTEGER
-*>          If JOBV = 'A', then MV rows of V are post-multipled by a
+*>          If JOBV = 'A', then MV rows of V are post-multiplied by a
 *>                         sequence of Jacobi rotations.
 *>          If JOBV = 'N', then MV is not referenced.
 *> \endverbatim
@@ -155,9 +155,9 @@
 *> \param[in,out] V
 *> \verbatim
 *>          V is DOUBLE PRECISION array, dimension (LDV,N)
-*>          If JOBV = 'V', then N rows of V are post-multipled by a
+*>          If JOBV = 'V', then N rows of V are post-multiplied by a
 *>                         sequence of Jacobi rotations.
-*>          If JOBV = 'A', then MV rows of V are post-multipled by a
+*>          If JOBV = 'A', then MV rows of V are post-multiplied by a
 *>                         sequence of Jacobi rotations.
 *>          If JOBV = 'N', then V is not referenced.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dla_gbrfsx_extended.f b/lapack-netlib/SRC/dla_gbrfsx_extended.f
index 5454b1bfc..b331178a9 100644
--- a/lapack-netlib/SRC/dla_gbrfsx_extended.f
+++ b/lapack-netlib/SRC/dla_gbrfsx_extended.f
@@ -645,7 +645,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF (Y_PREC_STATE .LT. EXTRA_Y) THEN
                CALL DAXPY( N, 1.0D+0, DY, 1, Y(1,J), 1 )
diff --git a/lapack-netlib/SRC/dla_gerfsx_extended.f b/lapack-netlib/SRC/dla_gerfsx_extended.f
index 92b0d76d4..bf8260d04 100644
--- a/lapack-netlib/SRC/dla_gerfsx_extended.f
+++ b/lapack-netlib/SRC/dla_gerfsx_extended.f
@@ -625,7 +625,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN
                CALL DAXPY( N, 1.0D+0, DY, 1, Y( 1, J ), 1 )
diff --git a/lapack-netlib/SRC/dla_porfsx_extended.f b/lapack-netlib/SRC/dla_porfsx_extended.f
index 5c8850fef..94d1087cd 100644
--- a/lapack-netlib/SRC/dla_porfsx_extended.f
+++ b/lapack-netlib/SRC/dla_porfsx_extended.f
@@ -617,7 +617,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF (Y_PREC_STATE .LT. EXTRA_Y) THEN
                CALL DAXPY( N, 1.0D+0, DY, 1, Y(1,J), 1 )
diff --git a/lapack-netlib/SRC/dla_porpvgrw.f b/lapack-netlib/SRC/dla_porpvgrw.f
index 93ad3eb6a..00fdd7ae1 100644
--- a/lapack-netlib/SRC/dla_porpvgrw.f
+++ b/lapack-netlib/SRC/dla_porpvgrw.f
@@ -134,9 +134,9 @@
 *
       UPPER = LSAME( 'Upper', UPLO )
 *
-*     DPOTRF will have factored only the NCOLSxNCOLS leading minor, so
-*     we restrict the growth search to that minor and use only the first
-*     2*NCOLS workspace entries.
+*     DPOTRF will have factored only the NCOLSxNCOLS leading submatrix,
+*     so we restrict the growth search to that submatrix and use only
+*     the first 2*NCOLS workspace entries.
 *
       RPVGRW = 1.0D+0
       DO I = 1, 2*NCOLS
diff --git a/lapack-netlib/SRC/dla_syrfsx_extended.f b/lapack-netlib/SRC/dla_syrfsx_extended.f
index e1cde6fc2..2e06a622b 100644
--- a/lapack-netlib/SRC/dla_syrfsx_extended.f
+++ b/lapack-netlib/SRC/dla_syrfsx_extended.f
@@ -647,7 +647,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF (Y_PREC_STATE .LT. EXTRA_Y) THEN
                CALL DAXPY( N, 1.0D+0, DY, 1, Y(1,J), 1 )
diff --git a/lapack-netlib/SRC/dlaed2.f b/lapack-netlib/SRC/dlaed2.f
index 9b1f1e093..1a53650e8 100644
--- a/lapack-netlib/SRC/dlaed2.f
+++ b/lapack-netlib/SRC/dlaed2.f
@@ -18,7 +18,7 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE DLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMDA, W,
+*       SUBROUTINE DLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMBDA, W,
 *                          Q2, INDX, INDXC, INDXP, COLTYP, INFO )
 *
 *       .. Scalar Arguments ..
@@ -28,7 +28,7 @@
 *       .. Array Arguments ..
 *       INTEGER            COLTYP( * ), INDX( * ), INDXC( * ), INDXP( * ),
 *      $                   INDXQ( * )
-*       DOUBLE PRECISION   D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ),
+*       DOUBLE PRECISION   D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ),
 *      $                   W( * ), Z( * )
 *       ..
 *
@@ -123,9 +123,9 @@
 *>         process.
 *> \endverbatim
 *>
-*> \param[out] DLAMDA
+*> \param[out] DLAMBDA
 *> \verbatim
-*>          DLAMDA is DOUBLE PRECISION array, dimension (N)
+*>          DLAMBDA is DOUBLE PRECISION array, dimension (N)
 *>         A copy of the first K eigenvalues which will be used by
 *>         DLAED3 to form the secular equation.
 *> \endverbatim
@@ -148,7 +148,7 @@
 *> \param[out] INDX
 *> \verbatim
 *>          INDX is INTEGER array, dimension (N)
-*>         The permutation used to sort the contents of DLAMDA into
+*>         The permutation used to sort the contents of DLAMBDA into
 *>         ascending order.
 *> \endverbatim
 *>
@@ -207,7 +207,7 @@
 *>  Modified by Francoise Tisseur, University of Tennessee
 *>
 *  =====================================================================
-      SUBROUTINE DLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMDA, W,
+      SUBROUTINE DLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMBDA, W,
      $                   Q2, INDX, INDXC, INDXP, COLTYP, INFO )
 *
 *  -- LAPACK computational routine --
@@ -221,7 +221,7 @@
 *     .. Array Arguments ..
       INTEGER            COLTYP( * ), INDX( * ), INDXC( * ), INDXP( * ),
      $                   INDXQ( * )
-      DOUBLE PRECISION   D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ),
+      DOUBLE PRECISION   D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ),
      $                   W( * ), Z( * )
 *     ..
 *
@@ -300,9 +300,9 @@
 *     re-integrate the deflated parts from the last pass
 *
       DO 20 I = 1, N
-         DLAMDA( I ) = D( INDXQ( I ) )
+         DLAMBDA( I ) = D( INDXQ( I ) )
    20 CONTINUE
-      CALL DLAMRG( N1, N2, DLAMDA, 1, 1, INDXC )
+      CALL DLAMRG( N1, N2, DLAMBDA, 1, 1, INDXC )
       DO 30 I = 1, N
          INDX( I ) = INDXQ( INDXC( I ) )
    30 CONTINUE
@@ -324,11 +324,11 @@
          DO 40 J = 1, N
             I = INDX( J )
             CALL DCOPY( N, Q( 1, I ), 1, Q2( IQ2 ), 1 )
-            DLAMDA( J ) = D( I )
+            DLAMBDA( J ) = D( I )
             IQ2 = IQ2 + N
    40    CONTINUE
          CALL DLACPY( 'A', N, N, Q2, N, Q, LDQ )
-         CALL DCOPY( N, DLAMDA, 1, D, 1 )
+         CALL DCOPY( N, DLAMBDA, 1, D, 1 )
          GO TO 190
       END IF
 *
@@ -421,7 +421,7 @@
             PJ = NJ
          ELSE
             K = K + 1
-            DLAMDA( K ) = D( PJ )
+            DLAMBDA( K ) = D( PJ )
             W( K ) = Z( PJ )
             INDXP( K ) = PJ
             PJ = NJ
@@ -433,7 +433,7 @@
 *     Record the last eigenvalue.
 *
       K = K + 1
-      DLAMDA( K ) = D( PJ )
+      DLAMBDA( K ) = D( PJ )
       W( K ) = Z( PJ )
       INDXP( K ) = PJ
 *
@@ -470,9 +470,9 @@
          PSM( CT ) = PSM( CT ) + 1
   130 CONTINUE
 *
-*     Sort the eigenvalues and corresponding eigenvectors into DLAMDA
+*     Sort the eigenvalues and corresponding eigenvectors into DLAMBDA
 *     and Q2 respectively.  The eigenvalues/vectors which were not
-*     deflated go into the first K slots of DLAMDA and Q2 respectively,
+*     deflated go into the first K slots of DLAMBDA and Q2 respectively,
 *     while those which were deflated go into the last N - K slots.
 *
       I = 1
diff --git a/lapack-netlib/SRC/dlaed3.f b/lapack-netlib/SRC/dlaed3.f
index c58944e60..f9982c89e 100644
--- a/lapack-netlib/SRC/dlaed3.f
+++ b/lapack-netlib/SRC/dlaed3.f
@@ -18,7 +18,7 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE DLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMDA, Q2, INDX,
+*       SUBROUTINE DLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMBDA, Q2, INDX,
 *                          CTOT, W, S, INFO )
 *
 *       .. Scalar Arguments ..
@@ -27,7 +27,7 @@
 *       ..
 *       .. Array Arguments ..
 *       INTEGER            CTOT( * ), INDX( * )
-*       DOUBLE PRECISION   D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ),
+*       DOUBLE PRECISION   D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ),
 *      $                   S( * ), W( * )
 *       ..
 *
@@ -44,12 +44,6 @@
 *> being combined by the matrix of eigenvectors of the K-by-K system
 *> which is solved here.
 *>
-*> This code makes very mild assumptions about floating point
-*> arithmetic. It will work on machines with a guard digit in
-*> add/subtract, or on those binary machines without guard digits
-*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
-*> It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
@@ -104,14 +98,12 @@
 *>          RHO >= 0 required.
 *> \endverbatim
 *>
-*> \param[in,out] DLAMDA
+*> \param[in] DLAMBDA
 *> \verbatim
-*>          DLAMDA is DOUBLE PRECISION array, dimension (K)
+*>          DLAMBDA is DOUBLE PRECISION array, dimension (K)
 *>          The first K elements of this array contain the old roots
 *>          of the deflated updating problem.  These are the poles
-*>          of the secular equation. May be changed on output by
-*>          having lowest order bit set to zero on Cray X-MP, Cray Y-MP,
-*>          Cray-2, or Cray C-90, as described above.
+*>          of the secular equation.
 *> \endverbatim
 *>
 *> \param[in] Q2
@@ -180,7 +172,7 @@
 *>  Modified by Francoise Tisseur, University of Tennessee
 *>
 *  =====================================================================
-      SUBROUTINE DLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMDA, Q2, INDX,
+      SUBROUTINE DLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMBDA, Q2, INDX,
      $                   CTOT, W, S, INFO )
 *
 *  -- LAPACK computational routine --
@@ -193,7 +185,7 @@
 *     ..
 *     .. Array Arguments ..
       INTEGER            CTOT( * ), INDX( * )
-      DOUBLE PRECISION   D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ),
+      DOUBLE PRECISION   D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ),
      $                   S( * ), W( * )
 *     ..
 *
@@ -208,8 +200,8 @@
       DOUBLE PRECISION   TEMP
 *     ..
 *     .. External Functions ..
-      DOUBLE PRECISION   DLAMC3, DNRM2
-      EXTERNAL           DLAMC3, DNRM2
+      DOUBLE PRECISION   DNRM2
+      EXTERNAL           DNRM2
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           DCOPY, DGEMM, DLACPY, DLAED4, DLASET, XERBLA
@@ -240,29 +232,9 @@
       IF( K.EQ.0 )
      $   RETURN
 *
-*     Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can
-*     be computed with high relative accuracy (barring over/underflow).
-*     This is a problem on machines without a guard digit in
-*     add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2).
-*     The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I),
-*     which on any of these machines zeros out the bottommost
-*     bit of DLAMDA(I) if it is 1; this makes the subsequent
-*     subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation
-*     occurs. On binary machines with a guard digit (almost all
-*     machines) it does not change DLAMDA(I) at all. On hexadecimal
-*     and decimal machines with a guard digit, it slightly
-*     changes the bottommost bits of DLAMDA(I). It does not account
-*     for hexadecimal or decimal machines without guard digits
-*     (we know of none). We use a subroutine call to compute
-*     2*DLAMBDA(I) to prevent optimizing compilers from eliminating
-*     this code.
-*
-      DO 10 I = 1, K
-         DLAMDA( I ) = DLAMC3( DLAMDA( I ), DLAMDA( I ) ) - DLAMDA( I )
-   10 CONTINUE
 *
       DO 20 J = 1, K
-         CALL DLAED4( K, J, DLAMDA, W, Q( 1, J ), RHO, D( J ), INFO )
+         CALL DLAED4( K, J, DLAMBDA, W, Q( 1, J ), RHO, D( J ), INFO )
 *
 *        If the zero finder fails, the computation is terminated.
 *
@@ -293,10 +265,10 @@
       CALL DCOPY( K, Q, LDQ+1, W, 1 )
       DO 60 J = 1, K
          DO 40 I = 1, J - 1
-            W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) )
+            W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) )
    40    CONTINUE
          DO 50 I = J + 1, K
-            W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) )
+            W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) )
    50    CONTINUE
    60 CONTINUE
       DO 70 I = 1, K
diff --git a/lapack-netlib/SRC/dlaed8.f b/lapack-netlib/SRC/dlaed8.f
index 3631fb456..5d1d9144d 100644
--- a/lapack-netlib/SRC/dlaed8.f
+++ b/lapack-netlib/SRC/dlaed8.f
@@ -19,7 +19,7 @@
 *  ===========
 *
 *       SUBROUTINE DLAED8( ICOMPQ, K, N, QSIZ, D, Q, LDQ, INDXQ, RHO,
-*                          CUTPNT, Z, DLAMDA, Q2, LDQ2, W, PERM, GIVPTR,
+*                          CUTPNT, Z, DLAMBDA, Q2, LDQ2, W, PERM, GIVPTR,
 *                          GIVCOL, GIVNUM, INDXP, INDX, INFO )
 *
 *       .. Scalar Arguments ..
@@ -30,7 +30,7 @@
 *       .. Array Arguments ..
 *       INTEGER            GIVCOL( 2, * ), INDX( * ), INDXP( * ),
 *      $                   INDXQ( * ), PERM( * )
-*       DOUBLE PRECISION   D( * ), DLAMDA( * ), GIVNUM( 2, * ),
+*       DOUBLE PRECISION   D( * ), DLAMBDA( * ), GIVNUM( 2, * ),
 *      $                   Q( LDQ, * ), Q2( LDQ2, * ), W( * ), Z( * )
 *       ..
 *
@@ -141,9 +141,9 @@
 *>         process.
 *> \endverbatim
 *>
-*> \param[out] DLAMDA
+*> \param[out] DLAMBDA
 *> \verbatim
-*>          DLAMDA is DOUBLE PRECISION array, dimension (N)
+*>          DLAMBDA is DOUBLE PRECISION array, dimension (N)
 *>         A copy of the first K eigenvalues which will be used by
 *>         DLAED3 to form the secular equation.
 *> \endverbatim
@@ -238,7 +238,7 @@
 *
 *  =====================================================================
       SUBROUTINE DLAED8( ICOMPQ, K, N, QSIZ, D, Q, LDQ, INDXQ, RHO,
-     $                   CUTPNT, Z, DLAMDA, Q2, LDQ2, W, PERM, GIVPTR,
+     $                   CUTPNT, Z, DLAMBDA, Q2, LDQ2, W, PERM, GIVPTR,
      $                   GIVCOL, GIVNUM, INDXP, INDX, INFO )
 *
 *  -- LAPACK computational routine --
@@ -253,7 +253,7 @@
 *     .. Array Arguments ..
       INTEGER            GIVCOL( 2, * ), INDX( * ), INDXP( * ),
      $                   INDXQ( * ), PERM( * )
-      DOUBLE PRECISION   D( * ), DLAMDA( * ), GIVNUM( 2, * ),
+      DOUBLE PRECISION   D( * ), DLAMBDA( * ), GIVNUM( 2, * ),
      $                   Q( LDQ, * ), Q2( LDQ2, * ), W( * ), Z( * )
 *     ..
 *
@@ -339,14 +339,14 @@
          INDXQ( I ) = INDXQ( I ) + CUTPNT
    20 CONTINUE
       DO 30 I = 1, N
-         DLAMDA( I ) = D( INDXQ( I ) )
+         DLAMBDA( I ) = D( INDXQ( I ) )
          W( I ) = Z( INDXQ( I ) )
    30 CONTINUE
       I = 1
       J = CUTPNT + 1
-      CALL DLAMRG( N1, N2, DLAMDA, 1, 1, INDX )
+      CALL DLAMRG( N1, N2, DLAMBDA, 1, 1, INDX )
       DO 40 I = 1, N
-         D( I ) = DLAMDA( INDX( I ) )
+         D( I ) = DLAMBDA( INDX( I ) )
          Z( I ) = W( INDX( I ) )
    40 CONTINUE
 *
@@ -464,7 +464,7 @@
          ELSE
             K = K + 1
             W( K ) = Z( JLAM )
-            DLAMDA( K ) = D( JLAM )
+            DLAMBDA( K ) = D( JLAM )
             INDXP( K ) = JLAM
             JLAM = J
          END IF
@@ -476,26 +476,26 @@
 *
       K = K + 1
       W( K ) = Z( JLAM )
-      DLAMDA( K ) = D( JLAM )
+      DLAMBDA( K ) = D( JLAM )
       INDXP( K ) = JLAM
 *
   110 CONTINUE
 *
-*     Sort the eigenvalues and corresponding eigenvectors into DLAMDA
+*     Sort the eigenvalues and corresponding eigenvectors into DLAMBDA
 *     and Q2 respectively.  The eigenvalues/vectors which were not
-*     deflated go into the first K slots of DLAMDA and Q2 respectively,
+*     deflated go into the first K slots of DLAMBDA and Q2 respectively,
 *     while those which were deflated go into the last N - K slots.
 *
       IF( ICOMPQ.EQ.0 ) THEN
          DO 120 J = 1, N
             JP = INDXP( J )
-            DLAMDA( J ) = D( JP )
+            DLAMBDA( J ) = D( JP )
             PERM( J ) = INDXQ( INDX( JP ) )
   120    CONTINUE
       ELSE
          DO 130 J = 1, N
             JP = INDXP( J )
-            DLAMDA( J ) = D( JP )
+            DLAMBDA( J ) = D( JP )
             PERM( J ) = INDXQ( INDX( JP ) )
             CALL DCOPY( QSIZ, Q( 1, PERM( J ) ), 1, Q2( 1, J ), 1 )
   130    CONTINUE
@@ -506,9 +506,9 @@
 *
       IF( K.LT.N ) THEN
          IF( ICOMPQ.EQ.0 ) THEN
-            CALL DCOPY( N-K, DLAMDA( K+1 ), 1, D( K+1 ), 1 )
+            CALL DCOPY( N-K, DLAMBDA( K+1 ), 1, D( K+1 ), 1 )
          ELSE
-            CALL DCOPY( N-K, DLAMDA( K+1 ), 1, D( K+1 ), 1 )
+            CALL DCOPY( N-K, DLAMBDA( K+1 ), 1, D( K+1 ), 1 )
             CALL DLACPY( 'A', QSIZ, N-K, Q2( 1, K+1 ), LDQ2,
      $                   Q( 1, K+1 ), LDQ )
          END IF
diff --git a/lapack-netlib/SRC/dlaed9.f b/lapack-netlib/SRC/dlaed9.f
index b88cdd907..0d209c2c2 100644
--- a/lapack-netlib/SRC/dlaed9.f
+++ b/lapack-netlib/SRC/dlaed9.f
@@ -18,15 +18,15 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE DLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMDA, W,
-*                          S, LDS, INFO )
+*       SUBROUTINE DLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMBDA,
+*                          W, S, LDS, INFO )
 *
 *       .. Scalar Arguments ..
 *       INTEGER            INFO, K, KSTART, KSTOP, LDQ, LDS, N
 *       DOUBLE PRECISION   RHO
 *       ..
 *       .. Array Arguments ..
-*       DOUBLE PRECISION   D( * ), DLAMDA( * ), Q( LDQ, * ), S( LDS, * ),
+*       DOUBLE PRECISION   D( * ), DLAMBDA( * ), Q( LDQ, * ), S( LDS, * ),
 *      $                   W( * )
 *       ..
 *
@@ -96,9 +96,9 @@
 *>          RHO >= 0 required.
 *> \endverbatim
 *>
-*> \param[in] DLAMDA
+*> \param[in] DLAMBDA
 *> \verbatim
-*>          DLAMDA is DOUBLE PRECISION array, dimension (K)
+*>          DLAMBDA is DOUBLE PRECISION array, dimension (K)
 *>          The first K elements of this array contain the old roots
 *>          of the deflated updating problem.  These are the poles
 *>          of the secular equation.
@@ -151,8 +151,8 @@
 *> at Berkeley, USA
 *
 *  =====================================================================
-      SUBROUTINE DLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMDA, W,
-     $                   S, LDS, INFO )
+      SUBROUTINE DLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMBDA,
+     $                   W, S, LDS, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
@@ -163,7 +163,7 @@
       DOUBLE PRECISION   RHO
 *     ..
 *     .. Array Arguments ..
-      DOUBLE PRECISION   D( * ), DLAMDA( * ), Q( LDQ, * ), S( LDS, * ),
+      DOUBLE PRECISION   D( * ), DLAMBDA( * ), Q( LDQ, * ), S( LDS, * ),
      $                   W( * )
 *     ..
 *
@@ -174,8 +174,8 @@
       DOUBLE PRECISION   TEMP
 *     ..
 *     .. External Functions ..
-      DOUBLE PRECISION   DLAMC3, DNRM2
-      EXTERNAL           DLAMC3, DNRM2
+      DOUBLE PRECISION   DNRM2
+      EXTERNAL           DNRM2
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           DCOPY, DLAED4, XERBLA
@@ -212,30 +212,9 @@
 *
       IF( K.EQ.0 )
      $   RETURN
-*
-*     Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can
-*     be computed with high relative accuracy (barring over/underflow).
-*     This is a problem on machines without a guard digit in
-*     add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2).
-*     The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I),
-*     which on any of these machines zeros out the bottommost
-*     bit of DLAMDA(I) if it is 1; this makes the subsequent
-*     subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation
-*     occurs. On binary machines with a guard digit (almost all
-*     machines) it does not change DLAMDA(I) at all. On hexadecimal
-*     and decimal machines with a guard digit, it slightly
-*     changes the bottommost bits of DLAMDA(I). It does not account
-*     for hexadecimal or decimal machines without guard digits
-*     (we know of none). We use a subroutine call to compute
-*     2*DLAMBDA(I) to prevent optimizing compilers from eliminating
-*     this code.
-*
-      DO 10 I = 1, N
-         DLAMDA( I ) = DLAMC3( DLAMDA( I ), DLAMDA( I ) ) - DLAMDA( I )
-   10 CONTINUE
 *
       DO 20 J = KSTART, KSTOP
-         CALL DLAED4( K, J, DLAMDA, W, Q( 1, J ), RHO, D( J ), INFO )
+         CALL DLAED4( K, J, DLAMBDA, W, Q( 1, J ), RHO, D( J ), INFO )
 *
 *        If the zero finder fails, the computation is terminated.
 *
@@ -261,10 +240,10 @@
       CALL DCOPY( K, Q, LDQ+1, W, 1 )
       DO 70 J = 1, K
          DO 50 I = 1, J - 1
-            W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) )
+            W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) )
    50    CONTINUE
          DO 60 I = J + 1, K
-            W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) )
+            W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) )
    60    CONTINUE
    70 CONTINUE
       DO 80 I = 1, K
diff --git a/lapack-netlib/SRC/dlals0.f b/lapack-netlib/SRC/dlals0.f
index cfca22280..928405e22 100644
--- a/lapack-netlib/SRC/dlals0.f
+++ b/lapack-netlib/SRC/dlals0.f
@@ -389,6 +389,11 @@
      $                ( POLES( I, 2 ).EQ.ZERO ) ) THEN
                      WORK( I ) = ZERO
                   ELSE
+*
+*                    Use calls to the function DLAMC3 to enforce the
+*                    parentheses (x+y)+z. The goal is to prevent
+*                    optimizing compilers from doing x+(y+z).
+*
                      WORK( I ) = POLES( I, 2 )*Z( I ) /
      $                           ( DLAMC3( POLES( I, 2 ), DSIGJ )-
      $                           DIFLJ ) / ( POLES( I, 2 )+DJ )
@@ -440,6 +445,11 @@
                   IF( Z( J ).EQ.ZERO ) THEN
                      WORK( I ) = ZERO
                   ELSE
+*
+*                    Use calls to the function DLAMC3 to enforce the
+*                    parentheses (x+y)+z. The goal is to prevent
+*                    optimizing compilers from doing x+(y+z).
+*
                      WORK( I ) = Z( J ) / ( DLAMC3( DSIGJ, -POLES( I+1,
      $                           2 ) )-DIFR( I, 1 ) ) /
      $                           ( DSIGJ+POLES( I, 1 ) ) / DIFR( I, 2 )
diff --git a/lapack-netlib/SRC/dlalsa.f b/lapack-netlib/SRC/dlalsa.f
index da8e0fa17..d89f5d2f9 100644
--- a/lapack-netlib/SRC/dlalsa.f
+++ b/lapack-netlib/SRC/dlalsa.f
@@ -43,9 +43,9 @@
 *>
 *> \verbatim
 *>
-*> DLALSA is an itermediate step in solving the least squares problem
+*> DLALSA is an intermediate step in solving the least squares problem
 *> by computing the SVD of the coefficient matrix in compact form (The
-*> singular vectors are computed as products of simple orthorgonal
+*> singular vectors are computed as products of simple orthogonal
 *> matrices.).
 *>
 *> If ICOMPQ = 0, DLALSA applies the inverse of the left singular vector
diff --git a/lapack-netlib/SRC/dlalsd.f b/lapack-netlib/SRC/dlalsd.f
index d22c45dc6..706ac4c90 100644
--- a/lapack-netlib/SRC/dlalsd.f
+++ b/lapack-netlib/SRC/dlalsd.f
@@ -47,12 +47,6 @@
 *> problem; in this case a minimum norm solution is returned.
 *> The actual singular values are returned in D in ascending order.
 *>
-*> This code makes very mild assumptions about floating point
-*> arithmetic. It will work on machines with a guard digit in
-*> add/subtract, or on those binary machines without guard digits
-*> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2.
-*> It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/dlamswlq.f b/lapack-netlib/SRC/dlamswlq.f
index 70e78f4b1..07ef1bd57 100644
--- a/lapack-netlib/SRC/dlamswlq.f
+++ b/lapack-netlib/SRC/dlamswlq.f
@@ -127,17 +127,20 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*>          (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
-*>          If SIDE = 'L', LWORK >= max(1,NB) * MB;
-*>          if SIDE = 'R', LWORK >= max(1,M) * MB.
+*>
+*>          If MIN(M,N,K) = 0, LWORK >= 1.
+*>          If SIDE = 'L', LWORK >= max(1,N*MB).
+*>          If SIDE = 'R', LWORK >= max(1,M*MB).
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>          only calculates the optimal size of the WORK array, returns
+*>          only calculates the minimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
 *> \endverbatim
@@ -189,29 +192,31 @@
 *>     SIAM J. Sci. Comput, vol. 34, no. 1, 2012
 *> \endverbatim
 *>
+*> \ingroup lamswlq
+*>
 *  =====================================================================
       SUBROUTINE DLAMSWLQ( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T,
-     $    LDT, C, LDC, WORK, LWORK, INFO )
+     $                     LDT, C, LDC, WORK, LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
-      CHARACTER         SIDE, TRANS
-      INTEGER           INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC
+      CHARACTER          SIDE, TRANS
+      INTEGER            INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC
 *     ..
 *     .. Array Arguments ..
-      DOUBLE PRECISION A( LDA, * ), WORK( * ), C(LDC, * ),
-     $      T( LDT, * )
+      DOUBLE PRECISION   A( LDA, * ), WORK( * ), C( LDC, * ),
+     $                   T( LDT, * )
 *     ..
 *
 * =====================================================================
 *
 *     ..
 *     .. Local Scalars ..
-      LOGICAL    LEFT, RIGHT, TRAN, NOTRAN, LQUERY
-      INTEGER    I, II, KK, CTR, LW
+      LOGICAL            LEFT, RIGHT, TRAN, NOTRAN, LQUERY
+      INTEGER            I, II, KK, CTR, LW, MINMNK, LWMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -223,52 +228,60 @@
 *
 *     Test the input arguments
 *
-      LQUERY  = LWORK.LT.0
+      LQUERY  = ( LWORK.EQ.-1 )
       NOTRAN  = LSAME( TRANS, 'N' )
       TRAN    = LSAME( TRANS, 'T' )
       LEFT    = LSAME( SIDE, 'L' )
       RIGHT   = LSAME( SIDE, 'R' )
-      IF (LEFT) THEN
+      IF( LEFT ) THEN
         LW = N * MB
       ELSE
         LW = M * MB
       END IF
+*
+      MINMNK = MIN( M, N, K )
+      IF( MINMNK.EQ.0 ) THEN
+        LWMIN = 1
+      ELSE
+        LWMIN = MAX( 1, LW )
+      END IF
 *
       INFO = 0
       IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN
-         INFO = -1
+        INFO = -1
       ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN
-         INFO = -2
+        INFO = -2
       ELSE IF( K.LT.0 ) THEN
         INFO = -5
       ELSE IF( M.LT.K ) THEN
         INFO = -3
       ELSE IF( N.LT.0 ) THEN
         INFO = -4
-      ELSE IF( K.LT.MB .OR. MB.LT.1) THEN
+      ELSE IF( K.LT.MB .OR. MB.LT.1 ) THEN
         INFO = -6
       ELSE IF( LDA.LT.MAX( 1, K ) ) THEN
         INFO = -9
-      ELSE IF( LDT.LT.MAX( 1, MB) ) THEN
+      ELSE IF( LDT.LT.MAX( 1, MB ) ) THEN
         INFO = -11
       ELSE IF( LDC.LT.MAX( 1, M ) ) THEN
-         INFO = -13
-      ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN
+        INFO = -13
+      ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN
         INFO = -15
       END IF
 *
+      IF( INFO.EQ.0 ) THEN
+        WORK( 1 ) = LWMIN
+      END IF
       IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'DLAMSWLQ', -INFO )
-        WORK(1) = LW
         RETURN
-      ELSE IF (LQUERY) THEN
-        WORK(1) = LW
+      ELSE IF( LQUERY ) THEN
         RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( MIN(M,N,K).EQ.0 ) THEN
+      IF( MINMNK.EQ.0 ) THEN
         RETURN
       END IF
 *
@@ -402,7 +415,8 @@
 *
       END IF
 *
-      WORK(1) = LW
+      WORK( 1 ) = LWMIN
+*
       RETURN
 *
 *     End of DLAMSWLQ
diff --git a/lapack-netlib/SRC/dlamtsqr.f b/lapack-netlib/SRC/dlamtsqr.f
index 962a31476..023db5ac9 100644
--- a/lapack-netlib/SRC/dlamtsqr.f
+++ b/lapack-netlib/SRC/dlamtsqr.f
@@ -128,22 +128,24 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
-*>
+*>          (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
+*>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
+*>          If MIN(M,N,K) = 0, LWORK >= 1.
+*>          If SIDE = 'L', LWORK >= max(1,N*NB).
+*>          If SIDE = 'R', LWORK >= max(1,MB*NB).
 *>
-*>          If SIDE = 'L', LWORK >= max(1,N)*NB;
-*>          if SIDE = 'R', LWORK >= max(1,MB)*NB.
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>          only calculates the optimal size of the WORK array, returns
+*>          only calculates the minimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
-*>
 *> \endverbatim
+*>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
@@ -191,29 +193,31 @@
 *>     SIAM J. Sci. Comput, vol. 34, no. 1, 2012
 *> \endverbatim
 *>
+*> \ingroup lamtsqr
+*>
 *  =====================================================================
       SUBROUTINE DLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T,
-     $        LDT, C, LDC, WORK, LWORK, INFO )
+     $                     LDT, C, LDC, WORK, LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
-      CHARACTER         SIDE, TRANS
-      INTEGER           INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC
+      CHARACTER          SIDE, TRANS
+      INTEGER            INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC
 *     ..
 *     .. Array Arguments ..
-      DOUBLE PRECISION A( LDA, * ), WORK( * ), C(LDC, * ),
-     $                T( LDT, * )
+      DOUBLE PRECISION   A( LDA, * ), WORK( * ), C( LDC, * ),
+     $                   T( LDT, * )
 *     ..
 *
 * =====================================================================
 *
 *     ..
 *     .. Local Scalars ..
-      LOGICAL    LEFT, RIGHT, TRAN, NOTRAN, LQUERY
-      INTEGER    I, II, KK, LW, CTR, Q
+      LOGICAL            LEFT, RIGHT, TRAN, NOTRAN, LQUERY
+      INTEGER            I, II, KK, LW, CTR, Q, MINMNK, LWMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -225,12 +229,13 @@
 *
 *     Test the input arguments
 *
-      LQUERY  = LWORK.LT.0
+      INFO = 0
+      LQUERY  = ( LWORK.EQ.-1 )
       NOTRAN  = LSAME( TRANS, 'N' )
       TRAN    = LSAME( TRANS, 'T' )
       LEFT    = LSAME( SIDE, 'L' )
       RIGHT   = LSAME( SIDE, 'R' )
-      IF (LEFT) THEN
+      IF( LEFT ) THEN
         LW = N * NB
         Q = M
       ELSE
@@ -238,11 +243,17 @@
         Q = N
       END IF
 *
-      INFO = 0
+      MINMNK = MIN( M, N, K )
+      IF( MINMNK.EQ.0 ) THEN
+        LWMIN = 1
+      ELSE
+        LWMIN = MAX( 1, LW )
+      END IF
+*
       IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN
-         INFO = -1
+        INFO = -1
       ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN
-         INFO = -2
+        INFO = -2
       ELSE IF( M.LT.K ) THEN
         INFO = -3
       ELSE IF( N.LT.0 ) THEN
@@ -253,38 +264,38 @@
         INFO = -7
       ELSE IF( LDA.LT.MAX( 1, Q ) ) THEN
         INFO = -9
-      ELSE IF( LDT.LT.MAX( 1, NB) ) THEN
+      ELSE IF( LDT.LT.MAX( 1, NB ) ) THEN
         INFO = -11
       ELSE IF( LDC.LT.MAX( 1, M ) ) THEN
-         INFO = -13
-      ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN
+        INFO = -13
+      ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN
         INFO = -15
       END IF
 *
-*     Determine the block size if it is tall skinny or short and wide
-*
-      IF( INFO.EQ.0)  THEN
-          WORK(1) = LW
+      IF( INFO.EQ.0 ) THEN
+        WORK( 1 ) = LWMIN
       END IF
 *
       IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'DLAMTSQR', -INFO )
         RETURN
-      ELSE IF (LQUERY) THEN
-       RETURN
+      ELSE IF( LQUERY ) THEN
+        RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( MIN(M,N,K).EQ.0 ) THEN
+      IF( MINMNK.EQ.0 ) THEN
         RETURN
       END IF
+*
+*     Determine the block size if it is tall skinny or short and wide
 *
       IF((MB.LE.K).OR.(MB.GE.MAX(M,N,K))) THEN
         CALL DGEMQRT( SIDE, TRANS, M, N, K, NB, A, LDA,
-     $        T, LDT, C, LDC, WORK, INFO)
+     $        T, LDT, C, LDC, WORK, INFO )
         RETURN
-       END IF
+      END IF
 *
       IF(LEFT.AND.NOTRAN) THEN
 *
@@ -410,7 +421,8 @@
 *
       END IF
 *
-      WORK(1) = LW
+      WORK( 1 ) = LWMIN
+*
       RETURN
 *
 *     End of DLAMTSQR
diff --git a/lapack-netlib/SRC/dlaqp2rk.c b/lapack-netlib/SRC/dlaqp2rk.c
new file mode 100644
index 000000000..de216ad97
--- /dev/null
+++ b/lapack-netlib/SRC/dlaqp2rk.c
@@ -0,0 +1,923 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)	/* Win64 branch picks long long; every other target uses long */
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+/* blasint is BLASLONG under LAPACK_ILP64 and int otherwise; blasabs() is the abs variant of matching width. */
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+/* `integer` is the type used for every integer argument and local of the translated routine in this file. */
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;	/* single precision: r = real part, i = imaginary part */
+typedef struct { doublereal r, i; } doublecomplex;	/* double precision: r = real part, i = imaginary part */
+#ifdef _MSC_VER	/* this branch uses the _Fcomplex/_Dcomplex types instead of _Complex */
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}	/* struct -> native value (copy) */
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}	/* struct -> native value (copy) */
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}	/* same storage viewed as native complex */
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}	/* same storage viewed as native complex */
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}	/* struct -> native value (copy) */
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}	/* struct -> native value (copy) */
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}	/* same storage viewed as native complex */
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}	/* same storage viewed as native complex */
+#endif
+#define pCf(z) (*_pCf(z))	/* assignable native-complex view of the struct that z points to */
+#define pCd(z) (*_pCd(z))	/* assignable native-complex view of the struct that z points to */
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/* cilist: descriptor for an external read/write; prologue type only, nothing in this file references it */
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/* icilist: descriptor for an internal read/write; prologue type only, nothing in this file references it */
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/* olist: descriptor for an OPEN statement; prologue type only, nothing in this file references it */
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/* cllist: descriptor for a CLOSE statement; prologue type only, nothing in this file references it */
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/* alist: descriptor shared by rewind, backspace and endfile; prologue type only, unreferenced in this file */
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inlist: descriptor for an INQUIRE statement; prologue type only, nothing in this file references it */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;	/* NOTE(review): the only *len field here declared ftnint rather than ftnlen (both are int) */
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points: one member per scalar type declared above */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+/* Prologue type only: nothing in this file references Multitype. */
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist: describes one variable (Namelist.vars points at these) */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;	/* prologue type only; unreferenced by the routine in this file */
+
+struct Namelist {	/* a named group: nvars is presumably the length of vars -- TODO confirm against f2c.h */
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;	/* prologue type only; unreferenced by the routine in this file */
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))	/* type-generic macro that takes over the name abs; evaluates x twice */
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))	/* evaluates the selected argument twice: avoid side effects */
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))	/* evaluates the selected argument twice: avoid side effects */
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)	/* value of bit b of a (0 or 1) */
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))	/* a with bit b cleared */
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))	/* a with bit b set */
+
+/* Single-precision complex helpers (plus z_div); every argument is a pointer to a complex/doublecomplex struct. */
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccosf(Cf(Z)); }
+#ifdef _MSC_VER	/* no complex operators here: c = a*conj(b)/|b|^2 is spelled out; operands are copied first so c may alias a or b */
+#define c_div(c, a, b) {real f2c_ar_ = (a)->r, f2c_ai_ = (a)->i, f2c_br_ = (b)->r, f2c_bi_ = (b)->i, f2c_dn_ = f2c_br_*f2c_br_ + f2c_bi_*f2c_bi_; (c)->r = (f2c_ar_*f2c_br_ + f2c_ai_*f2c_bi_)/f2c_dn_; (c)->i = (f2c_ai_*f2c_br_ - f2c_ar_*f2c_bi_)/f2c_dn_;}
+#define z_div(c, a, b) {doublereal f2c_ar_ = (a)->r, f2c_ai_ = (a)->i, f2c_br_ = (b)->r, f2c_bi_ = (b)->i, f2c_dn_ = f2c_br_*f2c_br_ + f2c_bi_*f2c_bi_; (c)->r = (f2c_ar_*f2c_br_ + f2c_ai_*f2c_bi_)/f2c_dn_; (c)->i = (f2c_ai_*f2c_br_ - f2c_ar_*f2c_bi_)/f2c_dn_;}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))	/* here and below: each argument is a pointer and is dereferenced by the macro */
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )	/* positive difference: a-b if a > b, else 0 */
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))	/* truncate toward zero */
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))	/* same expansion as d_int (double floor) */
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )	/* log10(x) = log10(e) * ln(x) */
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )	/* same expansion as d_lg10 (double log) */
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))	/* nearest integer, halves away from zero; takes a value, not a pointer */
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))	/* |a| if b >= 0, else -|a|; takes values */
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))	/* resolves to the abs macro defined in this prologue */
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)	/* length is whatever the caller passed; s is ignored */
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))	/* real ** real; both arguments are pointers */
+#define pow_si(B,E) spow_ui(*(B),*(E))	/* float ** integer */
+#define pow_ri(B,E) spow_ui(*(B),*(E))	/* float ** integer */
+#define pow_di(B,E) dpow_ui(*(B),*(E))	/* double ** integer */
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}	/* *p = double complex ** integer */
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}	/* *p = single complex ** integer */
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),Cd(B));}	/* *R = complex ** complex; B is a doublecomplex*, so it is converted with Cd() like A */
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }	/* concatenates the *np strings rpp[i] (lengths rnp[i]) into lpp, truncating at llp chars and blank-filling the rest */
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))	/* compares only the first min(c,d) characters */
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }	/* copies at most min(C,D) chars, stops at a NUL in B, writes no padding */
+#define sig_die(s, kill) { exit(1); }	/* both arguments are ignored */
+#define s_stop(s, n) {exit(0);}	/* both arguments are ignored */
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";	/* version banner string; not referenced elsewhere in this file */
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) (ceil(w))	/* parenthesised so it works in any expression, not only as a braced initializer */
+#define myhuge_(w) (HUGE_VAL)	/* w is ignored; parenthesised for the same reason */
+/* mymaxloc_ always forwards to the double-precision dmaxloc_; s and e are pointers, w and n are passed through. */
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus	/* C++ needs (...) to say "any arguments"; C uses an empty parameter list */
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();	/* pointer to a logical-valued function with unspecified parameters */
+#endif
+
+static float spow_ui(float x, integer n) { /* x**n by binary exponentiation; n < 0 inverts x first */
+	float acc = 1.0; unsigned long int bits;
+	if (n == 0) return acc;
+	if (n < 0) { n = -n; x = 1/x; }
+	bits = n;
+	do {
+		if (bits & 01) acc *= x;	/* lowest exponent bit set: fold the current power in */
+		bits >>= 1;
+		if (bits != 0) x *= x;	/* square only while exponent bits remain */
+	} while (bits != 0);
+	return acc;
+}
+static double dpow_ui(double x, integer n) { /* x**n by binary exponentiation; n < 0 inverts x first */
+	double acc = 1.0; unsigned long int bits;
+	if (n == 0) return acc;
+	if (n < 0) { n = -n; x = 1/x; }
+	bits = n;
+	do {
+		if (bits & 01) acc *= x;	/* lowest exponent bit set: fold the current power in */
+		bits >>= 1;
+		if (bits != 0) x *= x;	/* square only while exponent bits remain */
+	} while (bits != 0);
+	return acc;
+}
+#ifdef _MSC_VER
+/* cpow_ui: x**n by repeated squaring; this branch has no complex operators, so each product is expanded by hand. */
+static _Fcomplex cpow_ui(complex x, integer n) {
+	complex pow={1.0,0.0}; unsigned long int u; real t;
+	if(n != 0) {
+		if(n < 0) n = -n, t = x.r*x.r + x.i*x.i, x.r = x.r/t, x.i = -x.i/t; /* x = 1/x = conj(x)/|x|^2 */
+		for(u = n; ; ) {
+			if(u & 01) t = pow.r*x.r - pow.i*x.i, pow.i = pow.r*x.i + pow.i*x.r, pow.r = t; /* pow *= x */
+			if(u >>= 1) t = x.r*x.r - x.i*x.i, x.i = 2*x.r*x.i, x.r = t; /* x *= x */
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i}; return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) { /* same algorithm using native complex arithmetic */
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+/* zpow_ui: x**n by repeated squaring; this branch has no complex operators, so each product is expanded by hand. */
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
+	_Dcomplex pow={1.0,0.0}; unsigned long int u; double t;
+	if(n != 0) {
+		if(n < 0) n = -n, t = x._Val[0]*x._Val[0] + x._Val[1]*x._Val[1], x._Val[0] = x._Val[0]/t, x._Val[1] = -x._Val[1]/t; /* x = 1/x */
+		for(u = n; ; ) {
+			if(u & 01) t = pow._Val[0]*x._Val[0] - pow._Val[1]*x._Val[1], pow._Val[1] = pow._Val[0]*x._Val[1] + pow._Val[1]*x._Val[0], pow._Val[0] = t; /* pow *= x */
+			if(u >>= 1) t = x._Val[0]*x._Val[0] - x._Val[1]*x._Val[1], x._Val[1] = 2*x._Val[0]*x._Val[1], x._Val[0] = t; /* x *= x */
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]}; return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) { /* same algorithm using native complex arithmetic */
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) { /* integer power x**n with an integer result */
+	integer result = 1; unsigned long int bits;
+	if (n == 0) return 1;
+	if (n < 0) {
+		if (x == 1) return 1;
+		/* other bases: 0 for |x| > 1, the expression 1/x (division by zero) for x == 0, +-1 for x == -1 */
+		if (x != -1) return x == 0 ? 1/x : 0;
+		n = -n;
+	}
+	bits = n;
+	for (;;) {
+		if (bits & 01) result *= x;
+		if ((bits >>= 1) == 0) break;
+		x *= x;
+	}
+	return result;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{	/* 1-based position, counted from s, of the first largest value among w(s)..w(e); n is unused */
+	integer best = s, k; double top = w[s-1];
+	for (k = s + 1; k <= e; ++k)
+		if (w[k-1] > top) { top = w[k-1]; best = k; }
+	return best - s + 1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{	/* 1-based position, counted from s, of the first largest value among w(s)..w(e); n is unused */
+	integer best = s, k; float top = w[s-1];
+	for (k = s + 1; k <= e; ++k)
+		if (w[k-1] > top) { top = w[k-1]; best = k; }
+	return best - s + 1;
+}
+/* cdotc_: *z = sum_k conj(x_k) * y_k over n single-precision complex elements,
+   x read with stride *incx_ and y with stride *incy_.  A negative stride walks
+   its vector backwards starting at offset (1-n)*stride, as the reference BLAS
+   does, so no element in front of the array is read.  For n <= 0 the loop does
+   not run and zero is stored. */
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+	integer ix = incx < 0 ? (1 - n) * incx : 0;	/* index of the first x element used */
+	integer iy = incy < 0 ? (1 - n) * incy : 0;	/* index of the first y element used */
+#ifdef _MSC_VER
+	/* No complex operators in this branch, so conj(x)*y is expanded into its
+	   real and imaginary parts (a pairwise product of the parts is not it). */
+	_Fcomplex zdotc = {0.0, 0.0};
+	for (i=0;i<n;i++) { /* zdotc = zdotc + conjg(x(i)) * y(i) */
+		zdotc._Val[0] += x[ix].r * y[iy].r + x[ix].i * y[iy].i;
+		zdotc._Val[1] += x[ix].r * y[iy].i - x[ix].i * y[iy].r;
+		ix += incx;
+		iy += incy;
+	}
+#else
+	/* Native complex arithmetic performs the product directly. */
+	_Complex float zdotc = 0.0;
+	for (i=0;i<n;i++) { /* zdotc = zdotc + conjg(x(i)) * y(i) */
+		zdotc += conjf(Cf(&x[ix])) * Cf(&y[iy]);
+		ix += incx;
+		iy += incy;
+	}
+#endif
+	/* Store through the native-complex view of *z. */
+	pCf(z) = zdotc;
+}
+/* zdotc_: *z = sum_k conj(x_k) * y_k over n double-precision complex elements,
+   x read with stride *incx_ and y with stride *incy_.  A negative stride walks
+   its vector backwards starting at offset (1-n)*stride, as the reference BLAS
+   does, so no element in front of the array is read.  For n <= 0 the loop does
+   not run and zero is stored. */
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+	integer ix = incx < 0 ? (1 - n) * incx : 0;	/* index of the first x element used */
+	integer iy = incy < 0 ? (1 - n) * incy : 0;	/* index of the first y element used */
+#ifdef _MSC_VER
+	/* No complex operators in this branch, so conj(x)*y is expanded into its
+	   real and imaginary parts (a pairwise product of the parts is not it). */
+	_Dcomplex zdotc = {0.0, 0.0};
+	for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i)) * y(i) */
+		zdotc._Val[0] += x[ix].r * y[iy].r + x[ix].i * y[iy].i;
+		zdotc._Val[1] += x[ix].r * y[iy].i - x[ix].i * y[iy].r;
+		ix += incx;
+		iy += incy;
+	}
+#else
+	/* Native complex arithmetic performs the product directly. */
+	_Complex double zdotc = 0.0;
+	for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i)) * y(i) */
+		zdotc += conj(Cd(&x[ix])) * Cd(&y[iy]);
+		ix += incx;
+		iy += incy;
+	}
+#endif
+	/* Store through the native-complex view of *z. */
+	pCd(z) = zdotc;
+}
+/* cdotu_: *z = sum_k x_k * y_k (no conjugation) over n single-precision complex
+   elements, x read with stride *incx_ and y with stride *incy_.  A negative
+   stride walks its vector backwards starting at offset (1-n)*stride, as the
+   reference BLAS does, so no element in front of the array is read.  For
+   n <= 0 the loop does not run and zero is stored. */
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+	integer ix = incx < 0 ? (1 - n) * incx : 0;	/* index of the first x element used */
+	integer iy = incy < 0 ? (1 - n) * incy : 0;	/* index of the first y element used */
+#ifdef _MSC_VER
+	/* No complex operators in this branch, so x*y is expanded into its real
+	   and imaginary parts (a pairwise product of the parts is not it). */
+	_Fcomplex zdotc = {0.0, 0.0};
+	for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+		zdotc._Val[0] += x[ix].r * y[iy].r - x[ix].i * y[iy].i;
+		zdotc._Val[1] += x[ix].r * y[iy].i + x[ix].i * y[iy].r;
+		ix += incx;
+		iy += incy;
+	}
+#else
+	/* Native complex arithmetic performs the product directly. */
+	_Complex float zdotc = 0.0;
+	for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+		zdotc += Cf(&x[ix]) * Cf(&y[iy]);
+		ix += incx;
+		iy += incy;
+	}
+#endif
+	/* Store through the native-complex view of *z. */
+	pCf(z) = zdotc;
+}
+/* zdotu_: *z = sum_k x_k * y_k (no conjugation) over n double-precision complex
+   elements, x read with stride *incx_ and y with stride *incy_.  A negative
+   stride walks its vector backwards starting at offset (1-n)*stride, as the
+   reference BLAS does, so no element in front of the array is read.  For
+   n <= 0 the loop does not run and zero is stored. */
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+	integer ix = incx < 0 ? (1 - n) * incx : 0;	/* index of the first x element used */
+	integer iy = incy < 0 ? (1 - n) * incy : 0;	/* index of the first y element used */
+#ifdef _MSC_VER
+	/* No complex operators in this branch, so x*y is expanded into its real
+	   and imaginary parts (a pairwise product of the parts is not it). */
+	_Dcomplex zdotc = {0.0, 0.0};
+	for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+		zdotc._Val[0] += x[ix].r * y[iy].r - x[ix].i * y[iy].i;
+		zdotc._Val[1] += x[ix].r * y[iy].i + x[ix].i * y[iy].r;
+		ix += incx;
+		iy += incy;
+	}
+#else
+	/* Native complex arithmetic performs the product directly. */
+	_Complex double zdotc = 0.0;
+	for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+		zdotc += Cd(&x[ix]) * Cd(&y[iy]);
+		ix += incx;
+		iy += incy;
+	}
+#endif
+	/* Store through the native-complex view of *z. */
+	pCd(z) = zdotc;
+}
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static integer c__1 = 1;	/* the constant 1, passed by address as the stride argument of the BLAS/LAPACK calls below */
+
+/* Subroutine */ int dlaqp2rk_(integer *m, integer *n, integer *nrhs, integer 
+	*ioffset, integer *kmax, doublereal *abstol, doublereal *reltol, 
+	integer *kp1, doublereal *maxc2nrm, doublereal *a, integer *lda, 
+	integer *k, doublereal *maxc2nrmk, doublereal *relmaxc2nrmk, integer *
+	jpiv, doublereal *tau, doublereal *vn1, doublereal *vn2, doublereal *
+	work, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2, i__3;
+    doublereal d__1, d__2;
+
+    /* Local variables */
+    doublereal aikk, temp;
+    extern doublereal dnrm2_(integer *, doublereal *, integer *);
+    doublereal temp2;
+    integer i__, j;
+    doublereal tol3z;
+    integer jmaxc2nrm;
+    extern /* Subroutine */ int dlarf_(char *, integer *, integer *, 
+	    doublereal *, integer *, doublereal *, doublereal *, integer *, 
+	    doublereal *);
+    integer itemp;
+    extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *, 
+	    doublereal *, integer *);
+    integer minmnfact;
+    doublereal myhugeval;
+    integer minmnupdt, kk;
+    extern doublereal dlamch_(char *);
+    integer kp;
+    extern /* Subroutine */ int dlarfg_(integer *, doublereal *, doublereal *,
+	     integer *, doublereal *);
+    extern integer idamax_(integer *, doublereal *, integer *);
+    extern logical disnan_(doublereal *);
+
+
+/*  -- LAPACK auxiliary routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+/*  Purpose: factorize at most KMAX columns of the block A(IOFFSET+1:M,1:N) by Householder QR */
+/*  with column pivoting, applying each reflector to columns KK+1:N+NRHS as soon as it is built. */
+/*  ===================================================================== */
+/*  On return K is the number of factorized columns, MAXC2NRMK the largest column 2-norm left in */
+/*  the unfactorized part and RELMAXC2NRMK its ratio to MAXC2NRM; INFO > 0 flags a NaN or Inf. */
+/*     Initialize INFO */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --jpiv;
+    --tau;
+    --vn1;
+    --vn2;
+    --work;
+
+    /* Function Body */
+    *info = 0;
+
+/*     MINMNFACT is the smallest dimension of the submatrix */
+/*     A(IOFFSET+1:M,1:N) to be factorized. */
+
+/*     MINMNUPDT is the smallest dimension */
+/*     of the subarray A(IOFFSET+1:M,1:N+NRHS) to be updated, which */
+/*     contains the submatrices A(IOFFSET+1:M,1:N) and */
+/*     B(IOFFSET+1:M,1:NRHS) as column blocks. */
+
+/* Computing MIN */
+    i__1 = *m - *ioffset;
+    minmnfact = f2cmin(i__1,*n);
+/* Computing MIN */
+    i__1 = *m - *ioffset, i__2 = *n + *nrhs;
+    minmnupdt = f2cmin(i__1,i__2);
+    *kmax = f2cmin(*kmax,minmnfact);	/* note: the caller's KMAX is clamped in place */
+    tol3z = sqrt(dlamch_("Epsilon"));
+    myhugeval = dlamch_("Overflow");
+
+/*     Compute the factorization, KK is the column loop index. */
+
+    i__1 = *kmax;
+    for (kk = 1; kk <= i__1; ++kk) {
+
+	i__ = *ioffset + kk;
+
+	if (i__ == 1) {
+
+/*           ============================================================ */
+
+/*           We are at the first column of the original whole matrix A, */
+/*           therefore we use the computed KP1 and MAXC2NRM from the */
+/*           main routine. */
+
+	    kp = *kp1;
+
+/*           ============================================================ */
+
+	} else {
+
+/*           ============================================================ */
+
+/*           Determine the pivot column in KK-th step, i.e. the index */
+/*           of the column with the maximum 2-norm in the */
+/*           submatrix A(I:M,K:N). */
+
+	    i__2 = *n - kk + 1;
+	    kp = kk - 1 + idamax_(&i__2, &vn1[kk], &c__1);
+
+/*           Determine the maximum column 2-norm and the relative maximum */
+/*           column 2-norm of the submatrix A(I:M,KK:N) in step KK. */
+/*           RELMAXC2NRMK  will be computed later, after some condition */
+/*           checks on MAXC2NRMK. */
+
+	    *maxc2nrmk = vn1[kp];
+
+/*           ============================================================ */
+
+/*           Check if the submatrix A(I:M,KK:N) contains NaN, and set */
+/*           INFO parameter to the column number, where the first NaN */
+/*           is found and return from the routine. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (disnan_(maxc2nrmk)) {
+
+/*              Set K, the number of factorized columns */
+/*              that are not zero. */
+
+		*k = kk - 1;
+		*info = *k + kp;
+
+/*               Set RELMAXC2NRMK to NaN. */
+
+		*relmaxc2nrmk = *maxc2nrmk;
+
+/*               Array TAU(K+1:MINMNFACT) is not set and contains */
+/*               undefined elements. */
+
+		return 0;
+	    }
+
+/*           ============================================================ */
+
+/*           Quick return, if the submatrix A(I:M,KK:N) is */
+/*           a zero matrix. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (*maxc2nrmk == 0.) {
+
+/*              Set K, the number of factorized columns */
+/*              that are not zero. */
+
+		*k = kk - 1;
+		*relmaxc2nrmk = 0.;
+
+/*              Set TAUs corresponding to the columns that were not */
+/*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO. */
+
+		i__2 = minmnfact;
+		for (j = kk; j <= i__2; ++j) {
+		    tau[j] = 0.;
+		}
+
+/*              Return from the routine. */
+
+		return 0;
+
+	    }
+
+/*           ============================================================ */
+
+/*           Check if the submatrix A(I:M,KK:N) contains Inf, */
+/*           set INFO parameter to the column number, where */
+/*           the first Inf is found plus N, and continue */
+/*           the computation. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (*info == 0 && *maxc2nrmk > myhugeval) {
+		*info = *n + kk - 1 + kp;
+	    }
+
+/*           ============================================================ */
+
+/*           Test for the second and third stopping criteria. */
+/*           NOTE: There is no need to test for ABSTOL >= ZERO, since */
+/*           MAXC2NRMK is non-negative. Similarly, there is no need */
+/*           to test for RELTOL >= ZERO, since RELMAXC2NRMK is */
+/*           non-negative. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+	    *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm;
+
+	    if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) {
+
+/*              Set K, the number of factorized columns. */
+
+		*k = kk - 1;
+
+/*              Set TAUs corresponding to the columns that were not */
+/*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO. */
+
+		i__2 = minmnfact;
+		for (j = kk; j <= i__2; ++j) {
+		    tau[j] = 0.;
+		}
+
+/*              Return from the routine. */
+
+		return 0;
+
+	    }
+
+/*           ============================================================ */
+
+/*           End ELSE of IF(I.EQ.1) */
+
+	}
+
+/*        =============================================================== */
+
+/*        If the pivot column is not the first column of the */
+/*        subblock A(1:M,KK:N): */
+/*        1) swap the KK-th column and the KP-th pivot column */
+/*           in A(1:M,1:N); */
+/*        2) copy the KK-th element into the KP-th element of the partial */
+/*           and exact 2-norm vectors VN1 and VN2. ( Swap is not needed */
+/*           for VN1 and VN2 since we use the element with the index */
+/*           larger than KK in the next loop step.) */
+/*        3) Save the pivot interchange with the indices relative to the */
+/*           original matrix A, not the block A(1:M,1:N). */
+
+	if (kp != kk) {
+	    dswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[kk * a_dim1 + 1], &c__1);
+	    vn1[kp] = vn1[kk];
+	    vn2[kp] = vn2[kk];
+	    itemp = jpiv[kp];
+	    jpiv[kp] = jpiv[kk];
+	    jpiv[kk] = itemp;
+	}
+
+/*        Generate elementary reflector H(KK) using the column A(I:M,KK), */
+/*        if the column has more than one element, otherwise */
+/*        the elementary reflector would be an identity matrix, */
+/*        and TAU(KK) = ZERO. */
+
+	if (i__ < *m) {
+	    i__2 = *m - i__ + 1;
+	    dlarfg_(&i__2, &a[i__ + kk * a_dim1], &a[i__ + 1 + kk * a_dim1], &
+		    c__1, &tau[kk]);
+	} else {
+	    tau[kk] = 0.;
+	}
+
+/*        Check if TAU(KK) contains NaN, set INFO parameter */
+/*        to the column number where NaN is found and return from */
+/*        the routine. */
+/*        NOTE: There is no need to check TAU(KK) for Inf, */
+/*        since DLARFG cannot produce TAU(KK) or Householder vector */
+/*        below the diagonal containing Inf. Only BETA on the diagonal, */
+/*        returned by DLARFG can contain Inf, which requires */
+/*        TAU(KK) to contain NaN. Therefore, this case of generating Inf */
+/*        by DLARFG is covered by checking TAU(KK) for NaN. */
+
+	if (disnan_(&tau[kk])) {
+	    *k = kk - 1;
+	    *info = kk;
+
+/*           Set MAXC2NRMK and  RELMAXC2NRMK to NaN. */
+
+	    *maxc2nrmk = tau[kk];
+	    *relmaxc2nrmk = tau[kk];
+
+/*           Array TAU(KK:MINMNFACT) is not set and contains */
+/*           undefined elements, except the first element TAU(KK) = NaN. */
+
+	    return 0;
+	}
+
+/*        Apply H(KK)**T to A(I:M,KK+1:N+NRHS) from the left. */
+/*        ( If M >= N, then at KK = N there is no residual matrix, */
+/*         i.e. no columns of A to update, only columns of B. */
+/*         If M < N, then at KK = M-IOFFSET, I = M and we have a */
+/*         one-row residual matrix in A and the elementary */
+/*         reflector is a unit matrix, TAU(KK) = ZERO, i.e. no update */
+/*         is needed for the residual matrix in A and the */
+/*         right-hand-side-matrix in B. */
+/*         Therefore, we update only if */
+/*         KK < MINMNUPDT = MIN(M-IOFFSET, N+NRHS) */
+/*         condition is satisfied, not only KK < N+NRHS ) */
+
+	if (kk < minmnupdt) {
+	    aikk = a[i__ + kk * a_dim1];
+	    a[i__ + kk * a_dim1] = 1.;
+	    i__2 = *m - i__ + 1;
+	    i__3 = *n + *nrhs - kk;
+	    dlarf_("Left", &i__2, &i__3, &a[i__ + kk * a_dim1], &c__1, &tau[
+		    kk], &a[i__ + (kk + 1) * a_dim1], lda, &work[1]);
+	    a[i__ + kk * a_dim1] = aikk;
+	}
+
+	if (kk < minmnfact) {
+
+/*           Update the partial column 2-norms for the residual matrix, */
+/*           only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. */
+/*           when KK < MIN(M-IOFFSET, N). */
+
+	    i__2 = *n;
+	    for (j = kk + 1; j <= i__2; ++j) {
+		if (vn1[j] != 0.) {
+
+/*                 NOTE: The following lines follow from the analysis in */
+/*                 Lapack Working Note 176. */
+
+/* Computing 2nd power */
+		    d__2 = (d__1 = a[i__ + j * a_dim1], abs(d__1)) / vn1[j];
+		    temp = 1. - d__2 * d__2;
+		    temp = f2cmax(temp,0.);
+/* Computing 2nd power */
+		    d__1 = vn1[j] / vn2[j];
+		    temp2 = temp * (d__1 * d__1);
+		    if (temp2 <= tol3z) {
+
+/*                    Compute the column 2-norm for the partial */
+/*                    column A(I+1:M,J) by explicitly computing it, */
+/*                    and store it in both partial 2-norm vector VN1 */
+/*                    and exact column 2-norm vector VN2. */
+
+			i__3 = *m - i__;
+			vn1[j] = dnrm2_(&i__3, &a[i__ + 1 + j * a_dim1], &
+				c__1);
+			vn2[j] = vn1[j];
+
+		    } else {
+
+/*                    Update the column 2-norm for the partial */
+/*                    column A(I+1:M,J) by removing one */
+/*                    element A(I,J) and store it in partial */
+/*                    2-norm vector VN1. */
+
+			vn1[j] *= sqrt(temp);
+
+		    }
+		}
+	    }
+
+	}
+
+/*     End factorization loop */
+
+    }
+
+/*     If we reached this point, all columns have been factorized, */
+/*     i.e. no condition was triggered to exit the routine. */
+/*     Set the number of factorized columns. */
+
+    *k = *kmax;
+
+/*     We reached the end of the loop, i.e. all KMAX columns were */
+/*     factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before */
+/*     we return. */
+
+    if (*k < minmnfact) {
+
+	i__1 = *n - *k;
+	jmaxc2nrm = *k + idamax_(&i__1, &vn1[*k + 1], &c__1);
+	*maxc2nrmk = vn1[jmaxc2nrm];
+
+	if (*k == 0) {
+	    *relmaxc2nrmk = 1.;
+	} else {
+	    *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm;
+	}
+
+    } else {
+	*maxc2nrmk = 0.;
+	*relmaxc2nrmk = 0.;
+    }
+
+/*     We reached the end of the loop, i.e. all KMAX columns were */
+/*     factorized, set TAUs corresponding to the columns that were */
+/*     not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to ZERO. */
+
+    i__1 = minmnfact;
+    for (j = *k + 1; j <= i__1; ++j) {
+	tau[j] = 0.;
+    }
+
+    return 0;
+
+/*     End of DLAQP2RK */
+
+} /* dlaqp2rk_ */
+
diff --git a/lapack-netlib/SRC/dlaqp2rk.f b/lapack-netlib/SRC/dlaqp2rk.f
new file mode 100644
index 000000000..aecd6bb69
--- /dev/null
+++ b/lapack-netlib/SRC/dlaqp2rk.f
@@ -0,0 +1,713 @@
+*> \brief \b DLAQP2RK computes truncated QR factorization with column pivoting of a real matrix block using Level 2 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DLAQP2RK + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dlaqp2rk.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dlaqp2rk.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dlaqp2rk.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE DLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL,
+*     $                     KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK,
+*     $                     RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK,
+*     $                     INFO )
+*      IMPLICIT NONE
+*
+*     .. Scalar Arguments ..
+*      INTEGER            INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS
+*      DOUBLE PRECISION   ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+*     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+*      INTEGER            JPIV( * )
+*      DOUBLE PRECISION   A( LDA, * ), TAU( * ), VN1( * ), VN2( * ),
+*     $                   WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DLAQP2RK computes a truncated (rank K) or full rank Householder QR
+*> factorization with column pivoting of a real matrix
+*> block A(IOFFSET+1:M,1:N) as
+*>
+*>   A * P(K) = Q(K) * R(K).
+*>
+*> The routine uses Level 2 BLAS. The block A(1:IOFFSET,1:N)
+*> is accordingly pivoted, but not factorized.
+*>
+*> The routine also overwrites the right-hand-sides matrix block B
+*> stored in A(IOFFSET+1:M,N+1:N+NRHS) with Q(K)**T * B.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of
+*>          columns of the matrix B. NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in] IOFFSET
+*> \verbatim
+*>          IOFFSET is INTEGER
+*>          The number of rows of the matrix A that must be pivoted
+*>          but not factorized. IOFFSET >= 0.
+*>
+*>          IOFFSET also represents the number of columns of the whole
+*>          original matrix A_orig that have been factorized
+*>          in the previous steps.
+*> \endverbatim
+*>
+*> \param[in] KMAX
+*> \verbatim
+*>          KMAX is INTEGER
+*>
+*>          The first factorization stopping criterion. KMAX >= 0.
+*>
+*>          The maximum number of columns of the matrix A to factorize,
+*>          i.e. the maximum factorization rank.
+*>
+*>          a) If KMAX >= min(M-IOFFSET,N), then this stopping
+*>                criterion is not used, factorize columns
+*>                depending on ABSTOL and RELTOL.
+*>
+*>          b) If KMAX = 0, then this stopping criterion is
+*>             satisfied on input and the routine exits immediately.
+*>             This means that the factorization is not performed,
+*>             the matrices A and B and the arrays TAU, JPIV
+*>             are not modified.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is DOUBLE PRECISION, cannot be NaN.
+*>
+*>          The second factorization stopping criterion.
+*>
+*>          The absolute tolerance (stopping threshold) for
+*>          maximum column 2-norm of the residual matrix.
+*>          The algorithm converges (stops the factorization) when
+*>          the maximum column 2-norm of the residual matrix
+*>          is less than or equal to ABSTOL.
+*>
+*>          a) If ABSTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and RELTOL.
+*>                This includes the case ABSTOL = -Inf.
+*>
+*>          b) If 0.0 <= ABSTOL then the input value
+*>                of ABSTOL is used.
+*> \endverbatim
+*>
+*> \param[in] RELTOL
+*> \verbatim
+*>          RELTOL is DOUBLE PRECISION, cannot be NaN.
+*>
+*>          The third factorization stopping criterion.
+*>
+*>          The tolerance (stopping threshold) for the ratio of the
+*>          maximum column 2-norm of the residual matrix to the maximum
+*>          column 2-norm of the original matrix A_orig. The algorithm
+*>          converges (stops the factorization), when this ratio is
+*>          less than or equal to RELTOL.
+*>
+*>          a) If RELTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and ABSTOL.
+*>                This includes the case RELTOL = -Inf.
+*>
+*>          b) If 0.0 <= RELTOL then the input value of RELTOL
+*>                is used.
+*> \endverbatim
+*>
+*> \param[in] KP1
+*> \verbatim
+*>          KP1 is INTEGER
+*>          The index of the column with the maximum 2-norm in
+*>          the whole original matrix A_orig determined in the
+*>          main routine DGEQP3RK. 1 <= KP1 <= N_orig_mat.
+*> \endverbatim
+*>
+*> \param[in] MAXC2NRM
+*> \verbatim
+*>          MAXC2NRM is DOUBLE PRECISION
+*>          The maximum column 2-norm of the whole original
+*>          matrix A_orig computed in the main routine DGEQP3RK.
+*>          MAXC2NRM >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is DOUBLE PRECISION array, dimension (LDA,N+NRHS)
+*>          On entry:
+*>              the M-by-N matrix A and M-by-NRHS matrix B, as in
+*>
+*>                                  N     NRHS
+*>              array_A   =   M  [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>          1. The elements in block A(IOFFSET+1:M,1:K) below
+*>             the diagonal together with the array TAU represent
+*>             the orthogonal matrix Q(K) as a product of elementary
+*>             reflectors.
+*>          2. The upper triangular block of the matrix A stored
+*>             in A(IOFFSET+1:M,1:K) is the triangular factor obtained.
+*>          3. The block of the matrix A stored in A(1:IOFFSET,1:N)
+*>             has been accordingly pivoted, but not factorized.
+*>          4. The rest of the array A, block A(IOFFSET+1:M,K+1:N+NRHS).
+*>             The left part A(IOFFSET+1:M,K+1:N) of this block
+*>             contains the residual of the matrix A, and,
+*>             if NRHS > 0, the right part of the block
+*>             A(IOFFSET+1:M,N+1:N+NRHS) contains the block of
+*>             the right-hand-side matrix B. Both these blocks have been
+*>             updated by multiplication from the left by Q(K)**T.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] K
+*> \verbatim
+*>          K is INTEGER
+*>          Factorization rank of the matrix A, i.e. the rank of
+*>          the factor R, which is the same as the number of non-zero
+*>          rows of the factor R. 0 <= K <= min(M-IOFFSET,KMAX,N).
+*>
+*>          K also represents the number of non-zero Householder
+*>          vectors.
+*> \endverbatim
+*>
+*> \param[out] MAXC2NRMK
+*> \verbatim
+*>          MAXC2NRMK is DOUBLE PRECISION
+*>          The maximum column 2-norm of the residual matrix,
+*>          when the factorization stopped at rank K. MAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] RELMAXC2NRMK
+*> \verbatim
+*>          RELMAXC2NRMK is DOUBLE PRECISION
+*>          The ratio MAXC2NRMK / MAXC2NRM of the maximum column
+*>          2-norm of the residual matrix (when the factorization
+*>          stopped at rank K) to the maximum column 2-norm of the
+*>          whole original matrix A. RELMAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] JPIV
+*> \verbatim
+*>          JPIV is INTEGER array, dimension (N)
+*>          Column pivot indices, for 1 <= j <= N, column j
+*>          of the matrix A was interchanged with column JPIV(j).
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is DOUBLE PRECISION array, dimension (min(M-IOFFSET,N))
+*>          The scalar factors of the elementary reflectors.
+*> \endverbatim
+*>
+*> \param[in,out] VN1
+*> \verbatim
+*>          VN1 is DOUBLE PRECISION array, dimension (N)
+*>          The vector with the partial column norms.
+*> \endverbatim
+*>
+*> \param[in,out] VN2
+*> \verbatim
+*>          VN2 is DOUBLE PRECISION array, dimension (N)
+*>          The vector with the exact column norms.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is DOUBLE PRECISION array, dimension (N+NRHS-1)
+*>          Used in DLARF subroutine to apply an elementary
+*>          reflector from the left.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          1) INFO = 0: successful exit.
+*>          2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was
+*>             detected and the routine stops the computation.
+*>             The j_1-th column of the matrix A or the j_1-th
+*>             element of array TAU contains the first occurrence
+*>             of NaN in the factorization step K+1 ( when K columns
+*>             have been factorized ).
+*>
+*>             On exit:
+*>             K                  is set to the number of
+*>                                   factorized columns without
+*>                                   exception.
+*>             MAXC2NRMK          is set to NaN.
+*>             RELMAXC2NRMK       is set to NaN.
+*>             TAU(K+1:min(M-IOFFSET,N)) is not set and contains
+*>                                   undefined elements. If j_1=K+1,
+*>                                   TAU(K+1) may contain NaN.
+*>          3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN
+*>             was detected, but +Inf (or -Inf) was detected and
+*>             the routine continues the computation until completion.
+*>             The (j_2-N)-th column of the matrix A contains the first
+*>             occurrence of +Inf (or -Inf) in the factorization
+*>             step K+1 ( when K columns have been factorized ).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup laqp2rk
+*
+*> \par References:
+*  ================
+*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996.
+*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain.
+*> X. Sun, Computer Science Dept., Duke University, USA.
+*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA.
+*> A BLAS-3 version of the QR factorization with column pivoting.
+*> LAPACK Working Note 114
+*> \htmlonly
+*> <a href="https://www.netlib.org/lapack/lawnspdf/lawn114.pdf">https://www.netlib.org/lapack/lawnspdf/lawn114.pdf</a>
+*> \endhtmlonly
+*> and in
+*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998.
+*> \htmlonly
+*> <a href="https://doi.org/10.1137/S1064827595296732">https://doi.org/10.1137/S1064827595296732</a>
+*> \endhtmlonly
+*>
+*> [2] A partial column norm updating strategy developed in 2006.
+*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia.
+*> On the failure of rank revealing QR factorization software – a case study.
+*> LAPACK Working Note 176.
+*> \htmlonly
+*> <a href="http://www.netlib.org/lapack/lawnspdf/lawn176.pdf">http://www.netlib.org/lapack/lawnspdf/lawn176.pdf</a>
+*> \endhtmlonly
+*> and in
+*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages.
+*> \htmlonly
+*> <a href="https://doi.org/10.1145/1377612.1377616">https://doi.org/10.1145/1377612.1377616</a>
+*> \endhtmlonly
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*>  November  2023, Igor Kozachenko, James Demmel,
+*>                  EECS Department,
+*>                  University of California, Berkeley, USA.
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE DLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL,
+     $                     KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK,
+     $                     RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK,
+     $                     INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS
+      DOUBLE PRECISION   ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+      INTEGER            JPIV( * )
+      DOUBLE PRECISION   A( LDA, * ), TAU( * ), VN1( * ), VN2( * ),
+     $                   WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            I, ITEMP, J, JMAXC2NRM, KK, KLIM, KP,
+     $                   MINMNFACT, MINMNUPDT
+      DOUBLE PRECISION   AIKK, HUGEVAL, TEMP, TEMP2, TOL3Z
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DLARF, DLARFG, DSWAP
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN, SQRT
+*     ..
+*     .. External Functions ..
+      LOGICAL            DISNAN
+      INTEGER            IDAMAX
+      DOUBLE PRECISION   DLAMCH, DNRM2
+      EXTERNAL           DISNAN, DLAMCH, IDAMAX, DNRM2
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize INFO
+*
+      INFO = 0
+*
+*     MINMNFACT is the smallest dimension of the submatrix
+*     A(IOFFSET+1:M,1:N) to be factorized. MINMNUPDT is the smallest
+*     dimension of the subarray A(IOFFSET+1:M,1:N+NRHS) to be updated
+*     (A(IOFFSET+1:M,1:N) and B(IOFFSET+1:M,1:NRHS) as column blocks).
+*     KLIM is KMAX clamped to MINMNFACT; it is kept in a local so that
+*     the input-only argument KMAX is never written (the actual
+*     argument may be an expression or a constant in the caller).
+*
+      MINMNFACT = MIN( M-IOFFSET, N )
+      MINMNUPDT = MIN( M-IOFFSET, N+NRHS )
+      KLIM = MIN( KMAX, MINMNFACT )
+      TOL3Z = SQRT( DLAMCH( 'Epsilon' ) )
+      HUGEVAL = DLAMCH( 'Overflow' )
+*
+*     Compute the factorization, KK is the column loop index.
+*
+      DO KK = 1, KLIM
+*
+         I = IOFFSET + KK
+*
+         IF( I.EQ.1 ) THEN
+*
+*           ============================================================
+*
+*           We are at the first column of the original whole matrix A,
+*           therefore we use the computed KP1 and MAXC2NRM from the
+*           main routine.
+*
+
+            KP = KP1
+*
+*           ============================================================
+*
+         ELSE
+*
+*           ============================================================
+*
+*           Determine the pivot column in KK-th step, i.e. the index
+*           of the column with the maximum 2-norm in the
+*           submatrix A(I:M,KK:N).
+*
+            KP = ( KK-1 ) + IDAMAX( N-KK+1, VN1( KK ), 1 )
+*
+*           Determine the maximum column 2-norm and the relative maximum
+*           column 2-norm of the submatrix A(I:M,KK:N) in step KK.
+*           RELMAXC2NRMK will be computed later, after some condition
+*           checks on MAXC2NRMK.
+*
+            MAXC2NRMK = VN1( KP )
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,KK:N) contains NaN, and set
+*           INFO parameter to the column number, where the first NaN
+*           is found and return from the routine.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( DISNAN( MAXC2NRMK ) ) THEN
+*
+*              Set K, the number of columns factorized
+*              before NaN was detected, and set INFO.
+*
+               K = KK - 1
+               INFO = K + KP
+*
+*              Set RELMAXC2NRMK to NaN.
+*
+               RELMAXC2NRMK = MAXC2NRMK
+*
+*              Array TAU(K+1:MINMNFACT) is not set and contains
+*              undefined elements.
+*
+               RETURN
+            END IF
+*
+*           ============================================================
+*
+*           Quick return, if the submatrix A(I:M,KK:N) is
+*           a zero matrix.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( MAXC2NRMK.EQ.ZERO ) THEN
+*
+*              Set K, the number of factorized columns
+*              (the columns that are not zero).
+*
+               K = KK - 1
+               RELMAXC2NRMK = ZERO
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO.
+*
+               DO J = KK, MINMNFACT
+                  TAU( J ) = ZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,KK:N) contains Inf,
+*           set INFO parameter to the column number, where
+*           the first Inf is found plus N, and continue
+*           the computation.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN
+               INFO = N + KK - 1 + KP
+            END IF
+*
+*           ============================================================
+*
+*           Test for the second and third stopping criteria.
+*           NOTE: There is no need to test for ABSTOL >= ZERO, since
+*           MAXC2NRMK is non-negative. Similarly, there is no need
+*           to test for RELTOL >= ZERO, since RELMAXC2NRMK is
+*           non-negative.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+
+            RELMAXC2NRMK =  MAXC2NRMK / MAXC2NRM
+*
+            IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN
+*
+*              Set K, the number of factorized columns.
+*
+               K = KK - 1
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO.
+*
+               DO J = KK, MINMNFACT
+                  TAU( J ) = ZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           End ELSE of IF(I.EQ.1)
+*
+         END IF
+*
+*        ===============================================================
+*
+*        If the pivot column is not the first column of the
+*        subblock A(1:M,KK:N):
+*        1) swap the KK-th column and the KP-th pivot column
+*           in A(1:M,1:N);
+*        2) copy the KK-th element into the KP-th element of the partial
+*           and exact 2-norm vectors VN1 and VN2. ( Swap is not needed
+*           for VN1 and VN2 since we use the element with the index
+*           larger than KK in the next loop step.)
+*        3) Save the pivot interchange with the indices relative to the
+*           original matrix A, not the block A(1:M,1:N).
+*
+         IF( KP.NE.KK ) THEN
+            CALL DSWAP( M, A( 1, KP ), 1, A( 1, KK ), 1 )
+            VN1( KP ) = VN1( KK )
+            VN2( KP ) = VN2( KK )
+            ITEMP = JPIV( KP )
+            JPIV( KP ) = JPIV( KK )
+            JPIV( KK ) = ITEMP
+         END IF
+*
+*        Generate elementary reflector H(KK) using the column A(I:M,KK),
+*        if the column has more than one element, otherwise
+*        the elementary reflector would be an identity matrix,
+*        and TAU(KK) = ZERO.
+*
+         IF( I.LT.M ) THEN
+            CALL DLARFG( M-I+1, A( I, KK ), A( I+1, KK ), 1,
+     $                   TAU( KK ) )
+         ELSE
+            TAU( KK ) = ZERO
+         END IF
+*
+*        Check if TAU(KK) contains NaN, set INFO parameter
+*        to the column number where NaN is found and return from
+*        the routine.
+*        NOTE: There is no need to check TAU(KK) for Inf,
+*        since DLARFG cannot produce TAU(KK) or Householder vector
+*        below the diagonal containing Inf. Only BETA on the diagonal,
+*        returned by DLARFG can contain Inf, which requires
+*        TAU(KK) to contain NaN. Therefore, this case of generating Inf
+*        by DLARFG is covered by checking TAU(KK) for NaN.
+*
+         IF( DISNAN( TAU(KK) ) ) THEN
+            K = KK - 1
+            INFO = KK
+*
+*           Set MAXC2NRMK and  RELMAXC2NRMK to NaN.
+*
+            MAXC2NRMK = TAU( KK )
+            RELMAXC2NRMK = TAU( KK )
+*
+*           Array TAU(KK:MINMNFACT) is not set and contains
+*           undefined elements, except the first element TAU(KK) = NaN.
+*
+            RETURN
+         END IF
+*
+*        Apply H(KK)**T to A(I:M,KK+1:N+NRHS) from the left.
+*        ( If M >= N, then at KK = N there is no residual matrix,
+*         i.e. no columns of A to update, only columns of B.
+*         If M < N, then at KK = M-IOFFSET, I = M and we have a
+*         one-row residual matrix in A and the elementary
+*         reflector is a unit matrix, TAU(KK) = ZERO, i.e. no update
+*         is needed for the residual matrix in A and the
+*         right-hand-side-matrix in B.
+*         Therefore, we update only if
+*         KK < MINMNUPDT = min(M-IOFFSET, N+NRHS)
+*         condition is satisfied, not only KK < N+NRHS )
+*
+         IF( KK.LT.MINMNUPDT ) THEN
+            AIKK = A( I, KK )
+            A( I, KK ) = ONE
+            CALL DLARF( 'Left', M-I+1, N+NRHS-KK, A( I, KK ), 1,
+     $                  TAU( KK ), A( I, KK+1 ), LDA, WORK( 1 ) )
+            A( I, KK ) = AIKK
+         END IF
+*
+         IF( KK.LT.MINMNFACT ) THEN
+*
+*           Update the partial column 2-norms for the residual matrix,
+*           only if the residual matrix A(I+1:M,KK+1:N) exists, i.e.
+*           when KK < min(M-IOFFSET, N).
+*
+            DO J = KK + 1, N
+               IF( VN1( J ).NE.ZERO ) THEN
+*
+*                 NOTE: The following lines follow from the analysis in
+*                 Lapack Working Note 176.
+*
+                  TEMP = ONE - ( ABS( A( I, J ) ) / VN1( J ) )**2
+                  TEMP = MAX( TEMP, ZERO )
+                  TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2
+                  IF( TEMP2 .LE. TOL3Z ) THEN
+*
+*                    Compute the column 2-norm for the partial
+*                    column A(I+1:M,J) by explicitly computing it,
+*                    and store it in both partial 2-norm vector VN1
+*                    and exact column 2-norm vector VN2.
+*
+                     VN1( J ) = DNRM2( M-I, A( I+1, J ), 1 )
+                     VN2( J ) = VN1( J )
+*
+                  ELSE
+*
+*                    Update the column 2-norm for the partial
+*                    column A(I+1:M,J) by removing one
+*                    element A(I,J) and store it in partial
+*                    2-norm vector VN1.
+*
+                     VN1( J ) = VN1( J )*SQRT( TEMP )
+*
+                  END IF
+               END IF
+            END DO
+*
+         END IF
+*
+*     End factorization loop
+*
+      END DO
+*
+*     If we reached this point, all columns have been factorized,
+*     i.e. no condition was triggered to exit the routine.
+*     Set the number of factorized columns.
+*
+      K = KLIM
+*
+*     We reached the end of the loop, i.e. all KLIM columns were
+*     factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before
+*     we return.
+*
+      IF( K.LT.MINMNFACT ) THEN
+*
+         JMAXC2NRM = K + IDAMAX( N-K, VN1( K+1 ), 1 )
+         MAXC2NRMK = VN1( JMAXC2NRM )
+*
+         IF( K.EQ.0 ) THEN
+            RELMAXC2NRMK = ONE
+         ELSE
+            RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM
+         END IF
+*
+      ELSE
+         MAXC2NRMK = ZERO
+         RELMAXC2NRMK = ZERO
+      END IF
+*
+*     We reached the end of the loop, i.e. all KLIM columns were
+*     factorized, set TAUs corresponding to the columns that were
+*     not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to ZERO.
+*
+      DO J = K + 1, MINMNFACT
+         TAU( J ) = ZERO
+      END DO
+*
+      RETURN
+*
+*     End of DLAQP2RK
+*
+      END
diff --git a/lapack-netlib/SRC/dlaqp3rk.c b/lapack-netlib/SRC/dlaqp3rk.c
new file mode 100644
index 000000000..e8c61c257
--- /dev/null
+++ b/lapack-netlib/SRC/dlaqp3rk.c
@@ -0,0 +1,1113 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp))) /* power macros: operands are passed by address and dereferenced here */
+#define pow_si(B,E) spow_ui(*(B),*(E)) /* single precision base, integer exponent */
+#define pow_ri(B,E) spow_ui(*(B),*(E)) /* same expansion as pow_si */
+#define pow_di(B,E) dpow_ui(*(B),*(E)) /* double precision base, integer exponent */
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));} /* double complex base, integer exponent; result assigned to pCd(p) */
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));} /* single complex base, integer exponent; result assigned to pCf(p) */
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),Cd(B));} /* complex base AND complex exponent: both operands are converted with Cd() before cpow */
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; } /* concatenates *np pieces into lpp, truncating to llp characters and padding the rest with blanks */
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) /* compares only the first min(c,d) characters */
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } /* copies at most min(C,D) characters and stops at a NUL; no blank padding is done */
+#define sig_die(s, kill) { exit(1); } /* both arguments are ignored */
+#define s_stop(s, n) {exit(0);} /* both arguments are ignored */
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)} /* NOTE(review): expands to a braced block, so it cannot be used as an expression -- confirm intended use */
+#define myhuge_(w) {HUGE_VAL} /* NOTE(review): same braced-block form as myceiling_ */
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) /* always dispatches to the double precision helper */
+
+/* procedure parameter types for -A and -C++ */
+/* L_fp: pointer to a function returning logical; variadic under C++, declared without a prototype under C. */
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) { /* x**n for an integer exponent, by binary exponentiation */
+	float result = 1.0; unsigned long int bits;
+	if (n == 0) return result;
+	if (n < 0) { n = -n; x = 1/x; } /* negative exponent: continue with the reciprocal base */
+	for (bits = n; ; ) {
+		if (bits & 01) result *= x; /* low bit set: fold the current power into the result */
+		bits >>= 1;
+		if (bits == 0) break;
+		x *= x;
+	}
+	return result;
+}
+static double dpow_ui(double x, integer n) { /* x**n for an integer exponent, by binary exponentiation */
+	double result = 1.0; unsigned long int bits;
+	if (n == 0) return result;
+	if (n < 0) { n = -n; x = 1/x; } /* negative exponent: continue with the reciprocal base */
+	for (bits = n; ; ) {
+		if (bits & 01) result *= x; /* low bit set: fold the current power into the result */
+		bits >>= 1;
+		if (bits == 0) break;
+		x *= x;
+	}
+	return result;
+}
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) { /* x**n, integer n; complex arithmetic is written out on the (r,i) parts */
+	complex pow={1.0,0.0}; unsigned long int u; float t;
+	if(n != 0) {
+		if(n < 0) n = -n, t = x.r*x.r + x.i*x.i, x.r /= t, x.i /= -t; /* 1/x = conj(x)/|x|^2 */
+		for(u = n; ; ) {
+			if(u & 01) t = pow.r*x.r - pow.i*x.i, pow.i = pow.r*x.i + pow.i*x.r, pow.r = t; /* pow *= x */
+			if(u >>= 1) t = x.r*x.r - x.i*x.i, x.i *= 2*x.r, x.r = t; /* x *= x */
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) { /* x**n, integer n, by binary exponentiation */
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) { /* x**n, integer n; complex arithmetic is written out on the _Val[] parts */
+	_Dcomplex pow={1.0,0.0}; unsigned long int u; double t;
+	if(n != 0) {
+		if(n < 0) n = -n, t = x._Val[0]*x._Val[0] + x._Val[1]*x._Val[1], x._Val[0] /= t, x._Val[1] /= -t; /* 1/x = conj(x)/|x|^2 */
+		for(u = n; ; ) {
+			if(u & 01) t = pow._Val[0]*x._Val[0] - pow._Val[1]*x._Val[1], pow._Val[1] = pow._Val[0]*x._Val[1] + pow._Val[1]*x._Val[0], pow._Val[0] = t; /* pow *= x */
+			if(u >>= 1) t = x._Val[0]*x._Val[0] - x._Val[1]*x._Val[1], x._Val[1] *= 2*x._Val[0], x._Val[0] = t; /* x *= x */
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) { /* x**n, integer n, by binary exponentiation */
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) {
+	/* Integer power x**n.  For n <= 0 the result depends on x alone: */
+	/* 1 when n == 0 or x == 1; the integer division 1/x when x == 0; */
+	/* 0 for every other x except -1, which continues below with -n. */
+	integer pow = 1; unsigned long int u;
+	if (n <= 0) {
+		if (n == 0 || x == 1) return 1;
+		if (x != -1) return x == 0 ? 1/x : 0;
+		n = -n;
+	}
+	for (u = n; ; ) { /* square-and-multiply over the bits of n */
+		if (u & 01) pow *= x;
+		if ((u >>= 1) == 0) break;
+		x *= x;
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{	/* 1-based position, counted from s, of the first largest entry among w[s-1..e-1]; n is not used */
+	integer pos, best = s; double top = w[s-1];
+	for (pos = s+1; pos <= e; pos++)
+		if (w[pos-1] > top) { top = w[pos-1]; best = pos; }
+	return best-s+1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{	/* 1-based position, counted from s, of the first largest entry among w[s-1..e-1]; n is not used */
+	integer pos, best = s; float top = w[s-1];
+	for (pos = s+1; pos <= e; pos++)
+		if (w[pos-1] > top) { top = w[pos-1]; best = pos; }
+	return best-s+1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	/* Conjugated dot product: stores the sum over i in [0,n) of */
+	/* conj(x[i*incx]) * y[i*incy] into *z. */
+	/* NOTE(review): elements are addressed as x[i*incx] and y[i*incy]; a negative */
+	/* increment would index before the array -- confirm callers pass positive strides. */
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	/* The product is expanded on the _Val[] parts: conj(a+bi)*(c+di) = (a*c + b*d) + (a*d - b*c)i. */
+	_Fcomplex zdotc = {0.0, 0.0};
+	for (i=0;i<n;i++) {
+		_Fcomplex xv = Cf(&x[i*incx]), yv = Cf(&y[i*incy]);
+		zdotc._Val[0] += xv._Val[0] * yv._Val[0] + xv._Val[1] * yv._Val[1];
+		zdotc._Val[1] += xv._Val[0] * yv._Val[1] - xv._Val[1] * yv._Val[0];
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + conjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + conjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+/* end of cdotc_ */
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	/* Conjugated dot product: stores the sum over i in [0,n) of */
+	/* conj(x[i*incx]) * y[i*incy] into *z. */
+	/* NOTE(review): elements are addressed as x[i*incx] and y[i*incy]; a negative */
+	/* increment would index before the array -- confirm callers pass positive strides. */
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	/* The product is expanded on the _Val[] parts: conj(a+bi)*(c+di) = (a*c + b*d) + (a*d - b*c)i. */
+	_Dcomplex zdotc = {0.0, 0.0};
+	for (i=0;i<n;i++) {
+		_Dcomplex xv = Cd(&x[i*incx]), yv = Cd(&y[i*incy]);
+		zdotc._Val[0] += xv._Val[0] * yv._Val[0] + xv._Val[1] * yv._Val[1];
+		zdotc._Val[1] += xv._Val[0] * yv._Val[1] - xv._Val[1] * yv._Val[0];
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/* end of zdotc_ */
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	/* Unconjugated dot product: stores the sum over i in [0,n) of */
+	/* x[i*incx] * y[i*incy] into *z. */
+	/* NOTE(review): elements are addressed as x[i*incx] and y[i*incy]; a negative */
+	/* increment would index before the array -- confirm callers pass positive strides. */
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	/* The product is expanded on the _Val[] parts: (a+bi)*(c+di) = (a*c - b*d) + (a*d + b*c)i. */
+	_Fcomplex zdotc = {0.0, 0.0};
+	for (i=0;i<n;i++) {
+		_Fcomplex xv = Cf(&x[i*incx]), yv = Cf(&y[i*incy]);
+		zdotc._Val[0] += xv._Val[0] * yv._Val[0] - xv._Val[1] * yv._Val[1];
+		zdotc._Val[1] += xv._Val[0] * yv._Val[1] + xv._Val[1] * yv._Val[0];
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+/* end of cdotu_ */
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	/* Unconjugated dot product: stores the sum over i in [0,n) of */
+	/* x[i*incx] * y[i*incy] into *z. */
+	/* NOTE(review): elements are addressed as x[i*incx] and y[i*incy]; a negative */
+	/* increment would index before the array -- confirm callers pass positive strides. */
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	/* The product is expanded on the _Val[] parts: (a+bi)*(c+di) = (a*c - b*d) + (a*d + b*c)i. */
+	_Dcomplex zdotc = {0.0, 0.0};
+	for (i=0;i<n;i++) {
+		_Dcomplex xv = Cd(&x[i*incx]), yv = Cd(&y[i*incy]);
+		zdotc._Val[0] += xv._Val[0] * yv._Val[0] - xv._Val[1] * yv._Val[1];
+		zdotc._Val[1] += xv._Val[0] * yv._Val[1] + xv._Val[1] * yv._Val[0];
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/* end of zdotu_ */
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+/* Literals that the routine below passes by address to dgemm_/dgemv_/dswap_/idamax_/dnrm2_. */
+static integer c__1 = 1; /* integer one */
+static doublereal c_b7 = -1.; /* scalar -1 */
+static doublereal c_b8 = 1.; /* scalar +1 */
+static doublereal c_b30 = 0.; /* scalar 0 */
+
+/* Subroutine */ int dlaqp3rk_(integer *m, integer *n, integer *nrhs, integer 
+	*ioffset, integer *nb, doublereal *abstol, doublereal *reltol, 
+	integer *kp1, doublereal *maxc2nrm, doublereal *a, integer *lda, 
+	logical *done, integer *kb, doublereal *maxc2nrmk, doublereal *
+	relmaxc2nrmk, integer *jpiv, doublereal *tau, doublereal *vn1, 
+	doublereal *vn2, doublereal *auxv, doublereal *f, integer *ldf, 
+	integer *iwork, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, f_dim1, f_offset, i__1, i__2;
+    doublereal d__1, d__2;
+
+    /* Local variables */
+    doublereal temp;
+    extern doublereal dnrm2_(integer *, doublereal *, integer *);
+    doublereal temp2;
+    integer i__, j, k;
+    doublereal tol3z;
+    extern /* Subroutine */ int dgemm_(char *, char *, integer *, integer *, 
+	    integer *, doublereal *, doublereal *, integer *, doublereal *, 
+	    integer *, doublereal *, doublereal *, integer *),
+	     dgemv_(char *, integer *, integer *, doublereal *, doublereal *, 
+	    integer *, doublereal *, integer *, doublereal *, doublereal *, 
+	    integer *);
+    integer itemp;
+    extern /* Subroutine */ int dswap_(integer *, doublereal *, integer *, 
+	    doublereal *, integer *);
+    integer minmnfact;
+    doublereal myhugeval;
+    integer minmnupdt, if__;
+    extern doublereal dlamch_(char *);
+    integer kp;
+    extern /* Subroutine */ int dlarfg_(integer *, doublereal *, doublereal *,
+	     integer *, doublereal *);
+    extern integer idamax_(integer *, doublereal *, integer *);
+    extern logical disnan_(doublereal *);
+    integer lsticc;
+    doublereal aik;
+
+
+/*  -- LAPACK auxiliary routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+/*  Purpose: one blocked step (at most NB columns) of truncated QR with column */
+/*  pivoting on A(IOFFSET+1:M,1:N); columns N+1:N+NRHS of A are updated as well. */
+/*  ===================================================================== */
+
+
+/*     Initialize INFO */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --jpiv;
+    --tau;
+    --vn1;
+    --vn2;
+    --auxv;
+    f_dim1 = *ldf;
+    f_offset = 1 + f_dim1 * 1;
+    f -= f_offset;
+    --iwork;
+
+    /* Function Body */
+    *info = 0;
+
+/*     MINMNFACT is the smallest dimension of the submatrix */
+/*     A(IOFFSET+1:M,1:N) to be factorized. */
+
+/* Computing MIN */
+    i__1 = *m - *ioffset;
+    minmnfact = f2cmin(i__1,*n);
+/* Computing MIN */
+    i__1 = *m - *ioffset, i__2 = *n + *nrhs;
+    minmnupdt = f2cmin(i__1,i__2);
+    *nb = f2cmin(*nb,minmnfact);
+    tol3z = sqrt(dlamch_("Epsilon"));
+    myhugeval = dlamch_("Overflow");
+
+/*     Compute factorization in a while loop over NB columns, */
+/*     K is the column index in the block A(1:M,1:N). */
+
+    k = 0;
+    lsticc = 0;
+    *done = FALSE_;
+
+    while(k < *nb && lsticc == 0) {
+	++k;
+	i__ = *ioffset + k;
+
+	if (i__ == 1) {
+
+/*           We are at the first column of the original whole matrix A_orig, */
+/*           therefore we use the computed KP1 and MAXC2NRM from the */
+/*           main routine. */
+
+	    kp = *kp1;
+
+	} else {
+
+/*           Determine the pivot column in K-th step, i.e. the index */
+/*           of the column with the maximum 2-norm in the */
+/*           submatrix A(I:M,K:N). */
+
+	    i__1 = *n - k + 1;
+	    kp = k - 1 + idamax_(&i__1, &vn1[k], &c__1);
+
+/*           Determine the maximum column 2-norm and the relative maximum */
+/*           column 2-norm of the submatrix A(I:M,K:N) in step K. */
+
+	    *maxc2nrmk = vn1[kp];
+
+/*           ============================================================ */
+
+/*           Check if the submatrix A(I:M,K:N) contains NaN, set */
+/*           INFO parameter to the column number, where the first NaN */
+/*           is found and return from the routine. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (disnan_(maxc2nrmk)) {
+
+		*done = TRUE_;
+
+/*              Set KB, the number of factorized partial columns */
+/*                      that are non-zero in each step in the block, */
+/*                      i.e. the rank of the factor R. */
+/*              Set IF, the number of processed rows in the block, which */
+/*                      is the same as the number of processed rows in */
+/*                      the original whole matrix A_orig. */
+
+		*kb = k - 1;
+		if__ = i__ - 1;
+		*info = *kb + kp;
+
+/*              Set RELMAXC2NRMK to NaN. */
+
+		*relmaxc2nrmk = *maxc2nrmk;
+
+/*              There is no need to apply the block reflector to the */
+/*              residual of the matrix A stored in A(KB+1:M,KB+1:N), */
+/*              since the submatrix contains NaN and we stop */
+/*              the computation. */
+/*              But, we need to apply the block reflector to the residual */
+/*              right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */
+/*              residual right hand sides exist.  This occurs */
+/*              when ( NRHS != 0 AND KB < (M-IOFFSET) ): */
+
+/*              A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */
+/*                               A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. */
+		if (*nrhs > 0 && *kb < *m - *ioffset) {
+		    i__1 = *m - if__;
+		    dgemm_("No transpose", "Transpose", &i__1, nrhs, kb, &
+			    c_b7, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + 
+			    f_dim1], ldf, &c_b8, &a[if__ + 1 + (*n + 1) * 
+			    a_dim1], lda);
+		}
+
+/*              There is no need to recompute the 2-norm of the */
+/*              difficult columns, since we stop the factorization. */
+
+/*              Array TAU(KB+1:MINMNFACT) is not set and contains */
+/*              undefined elements. */
+
+/*              Return from the routine. */
+
+		return 0;
+	    }
+
+/*           Quick return, if the submatrix A(I:M,K:N) is */
+/*           a zero matrix. We need to check it only if the column index */
+/*           (same as row index) is larger than 1, since the condition */
+/*           for the whole original matrix A_orig is checked in the main */
+/*           routine. */
+
+	    if (*maxc2nrmk == 0.) {
+
+		*done = TRUE_;
+
+/*              Set KB, the number of factorized partial columns */
+/*                      that are non-zero in each step in the block, */
+/*                      i.e. the rank of the factor R. */
+/*              Set IF, the number of processed rows in the block, which */
+/*                      is the same as the number of processed rows in */
+/*                      the original whole matrix A_orig. */
+
+		*kb = k - 1;
+		if__ = i__ - 1;
+		*relmaxc2nrmk = 0.;
+
+/*              There is no need to apply the block reflector to the */
+/*              residual of the matrix A stored in A(KB+1:M,KB+1:N), */
+/*              since the submatrix is zero and we stop the computation. */
+/*              But, we need to apply the block reflector to the residual */
+/*              right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */
+/*              residual right hand sides exist.  This occurs */
+/*              when ( NRHS != 0 AND KB < (M-IOFFSET) ): */
+
+/*              A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */
+/*                               A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. */
+
+		if (*nrhs > 0 && *kb < *m - *ioffset) {
+		    i__1 = *m - if__;
+		    dgemm_("No transpose", "Transpose", &i__1, nrhs, kb, &
+			    c_b7, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + 
+			    f_dim1], ldf, &c_b8, &a[if__ + 1 + (*n + 1) * 
+			    a_dim1], lda);
+		}
+
+/*              There is no need to recompute the 2-norm of the */
+/*              difficult columns, since we stop the factorization. */
+
+/*              Set TAUs corresponding to the columns that were not */
+/*              factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, */
+/*              which is equivalent to setting TAU(K:MINMNFACT) = ZERO. */
+
+		i__1 = minmnfact;
+		for (j = k; j <= i__1; ++j) {
+		    tau[j] = 0.;
+		}
+
+/*              Return from the routine. */
+
+		return 0;
+
+	    }
+
+/*           ============================================================ */
+
+/*           Check if the submatrix A(I:M,K:N) contains Inf, */
+/*           set INFO parameter to the column number, where */
+/*           the first Inf is found plus N, and continue */
+/*           the computation. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (*info == 0 && *maxc2nrmk > myhugeval) {
+		*info = *n + k - 1 + kp;
+	    }
+
+/*           ============================================================ */
+
+/*           Test for the second and third tolerance stopping criteria. */
+/*           NOTE: There is no need to test for ABSTOL.GE.ZERO, since */
+/*           MAXC2NRMK is non-negative. Similarly, there is no need */
+/*           to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is */
+/*           non-negative. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm;
+
+	    if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) {
+
+		*done = TRUE_;
+
+/*              Set KB, the number of factorized partial columns */
+/*                      that are non-zero in each step in the block, */
+/*                      i.e. the rank of the factor R. */
+/*              Set IF, the number of processed rows in the block, which */
+/*                      is the same as the number of processed rows in */
+/*                      the original whole matrix A_orig; */
+
+		*kb = k - 1;
+		if__ = i__ - 1;
+
+/*              Apply the block reflector to the residual of the */
+/*              matrix A and the residual of the right hand sides B, if */
+/*              the residual matrix and/or the residual of the right */
+/*              hand sides exist,  i.e. if the submatrix */
+/*              A(I+1:M,KB+1:N+NRHS) exists.  This occurs when */
+/*                 KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */
+
+/*              A(IF+1:M,KB+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */
+/*                             A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. */
+
+		if (*kb < minmnupdt) {
+		    i__1 = *m - if__;
+		    i__2 = *n + *nrhs - *kb;
+		    dgemm_("No transpose", "Transpose", &i__1, &i__2, kb, &
+			    c_b7, &a[if__ + 1 + a_dim1], lda, &f[*kb + 1 + 
+			    f_dim1], ldf, &c_b8, &a[if__ + 1 + (*kb + 1) * 
+			    a_dim1], lda);
+		}
+
+/*              There is no need to recompute the 2-norm of the */
+/*              difficult columns, since we stop the factorization. */
+
+/*              Set TAUs corresponding to the columns that were not */
+/*              factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, */
+/*              which is equivalent to setting TAU(K:MINMNFACT) = ZERO. */
+
+		i__1 = minmnfact;
+		for (j = k; j <= i__1; ++j) {
+		    tau[j] = 0.;
+		}
+
+/*              Return from the routine. */
+
+		return 0;
+
+	    }
+
+/*           ============================================================ */
+
+/*           End ELSE of IF(I.EQ.1) */
+
+	}
+
+/*        =============================================================== */
+
+/*        If the pivot column is not the first column of the */
+/*        subblock A(1:M,K:N): */
+/*        1) swap the K-th column and the KP-th pivot column */
+/*           in A(1:M,1:N); */
+/*        2) swap the K-th row and the KP-th row in F(1:N,1:K-1) */
+/*        3) copy the K-th element into the KP-th element of the partial */
+/*           and exact 2-norm vectors VN1 and VN2. (Swap is not needed */
+/*           for VN1 and VN2 since we use the element with the index */
+/*           larger than K in the next loop step.) */
+/*        4) Save the pivot interchange with the indices relative to */
+/*           the original matrix A_orig, not the block A(1:M,1:N). */
+
+	if (kp != k) {
+	    dswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1);
+	    i__1 = k - 1;
+	    dswap_(&i__1, &f[kp + f_dim1], ldf, &f[k + f_dim1], ldf);
+	    vn1[kp] = vn1[k];
+	    vn2[kp] = vn2[k];
+	    itemp = jpiv[kp];
+	    jpiv[kp] = jpiv[k];
+	    jpiv[k] = itemp;
+	}
+
+/*        Apply previous Householder reflectors to column K: */
+/*        A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**T. */
+
+	if (k > 1) {
+	    i__1 = *m - i__ + 1;
+	    i__2 = k - 1;
+	    dgemv_("No transpose", &i__1, &i__2, &c_b7, &a[i__ + a_dim1], lda,
+		     &f[k + f_dim1], ldf, &c_b8, &a[i__ + k * a_dim1], &c__1);
+	}
+
+/*        Generate elementary reflector H(k) using the column A(I:M,K). */
+
+	if (i__ < *m) {
+	    i__1 = *m - i__ + 1;
+	    dlarfg_(&i__1, &a[i__ + k * a_dim1], &a[i__ + 1 + k * a_dim1], &
+		    c__1, &tau[k]);
+	} else {
+	    tau[k] = 0.;
+	}
+
+/*        Check if TAU(K) contains NaN, set INFO parameter */
+/*        to the column number where NaN is found and return from */
+/*        the routine. */
+/*        NOTE: There is no need to check TAU(K) for Inf, */
+/*        since DLARFG cannot produce TAU(K) or Householder vector */
+/*        below the diagonal containing Inf. Only BETA on the diagonal, */
+/*        returned by DLARFG can contain Inf, which requires */
+/*        TAU(K) to contain NaN. Therefore, this case of generating Inf */
+/*        by DLARFG is covered by checking TAU(K) for NaN. */
+
+	if (disnan_(&tau[k])) {
+
+	    *done = TRUE_;
+
+/*           Set KB, the number of factorized partial columns */
+/*                   that are non-zero in each step in the block, */
+/*                   i.e. the rank of the factor R. */
+/*           Set IF, the number of processed rows in the block, which */
+/*                   is the same as the number of processed rows in */
+/*                   the original whole matrix A_orig. */
+
+	    *kb = k - 1;
+	    if__ = i__ - 1;
+	    *info = k;
+
+/*           Set MAXC2NRMK and  RELMAXC2NRMK to NaN. */
+
+	    *maxc2nrmk = tau[k];
+	    *relmaxc2nrmk = tau[k];
+
+/*           There is no need to apply the block reflector to the */
+/*           residual of the matrix A stored in A(KB+1:M,KB+1:N), */
+/*           since the submatrix contains NaN and we stop */
+/*           the computation. */
+/*           But, we need to apply the block reflector to the residual */
+/*           right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */
+/*           residual right hand sides exist.  This occurs */
+/*           when ( NRHS != 0 AND KB < (M-IOFFSET) ): */
+
+/*           A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */
+/*                            A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. */
+
+	    if (*nrhs > 0 && *kb < *m - *ioffset) {
+		i__1 = *m - if__;
+		dgemm_("No transpose", "Transpose", &i__1, nrhs, kb, &c_b7, &
+			a[if__ + 1 + a_dim1], lda, &f[*n + 1 + f_dim1], ldf, &
+			c_b8, &a[if__ + 1 + (*n + 1) * a_dim1], lda);
+	    }
+
+/*           There is no need to recompute the 2-norm of the */
+/*           difficult columns, since we stop the factorization. */
+
+/*           Array TAU(KB+1:MINMNFACT) is not set and contains */
+/*           undefined elements. */
+
+/*           Return from the routine. */
+
+	    return 0;
+	}
+
+/*        =============================================================== */
+
+	aik = a[i__ + k * a_dim1];
+	a[i__ + k * a_dim1] = 1.;
+
+/*        =============================================================== */
+
+/*        Compute the current K-th column of F: */
+/*          1) F(K+1:N+NRHS,K) := tau(K) * A(I:M,K+1:N+NRHS)**T * A(I:M,K). */
+
+	if (k < *n + *nrhs) {
+	    i__1 = *m - i__ + 1;
+	    i__2 = *n + *nrhs - k;
+	    dgemv_("Transpose", &i__1, &i__2, &tau[k], &a[i__ + (k + 1) * 
+		    a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b30, &f[k + 
+		    1 + k * f_dim1], &c__1);
+	}
+
+/*           2) Zero out elements above and on the diagonal of the */
+/*              column K in matrix F, i.e elements F(1:K,K). */
+
+	i__1 = k;
+	for (j = 1; j <= i__1; ++j) {
+	    f[j + k * f_dim1] = 0.;
+	}
+
+/*         3) Incremental updating of the K-th column of F: */
+/*        F(1:N+NRHS,K) := F(1:N+NRHS,K) - tau(K) * F(1:N+NRHS,1:K-1) */
+/*                    * A(I:M,1:K-1)**T * A(I:M,K). */
+
+	if (k > 1) {
+	    i__1 = *m - i__ + 1;
+	    i__2 = k - 1;
+	    d__1 = -tau[k];
+	    dgemv_("Transpose", &i__1, &i__2, &d__1, &a[i__ + a_dim1], lda, &
+		    a[i__ + k * a_dim1], &c__1, &c_b30, &auxv[1], &c__1);
+
+	    i__1 = *n + *nrhs;
+	    i__2 = k - 1;
+	    dgemv_("No transpose", &i__1, &i__2, &c_b8, &f[f_dim1 + 1], ldf, &
+		    auxv[1], &c__1, &c_b8, &f[k * f_dim1 + 1], &c__1);
+	}
+
+/*        =============================================================== */
+
+/*        Update the current I-th row of A: */
+/*        A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) */
+/*                         - A(I,1:K)*F(K+1:N+NRHS,1:K)**T. */
+
+	if (k < *n + *nrhs) {
+	    i__1 = *n + *nrhs - k;
+	    dgemv_("No transpose", &i__1, &k, &c_b7, &f[k + 1 + f_dim1], ldf, 
+		    &a[i__ + a_dim1], lda, &c_b8, &a[i__ + (k + 1) * a_dim1], 
+		    lda);
+	}
+
+	a[i__ + k * a_dim1] = aik;
+
+/*        Update the partial column 2-norms for the residual matrix, */
+/*        only if the residual matrix A(I+1:M,K+1:N) exists, i.e. */
+/*        when K < MINMNFACT = f2cmin( M-IOFFSET, N ). */
+
+	if (k < minmnfact) {
+
+	    i__1 = *n;
+	    for (j = k + 1; j <= i__1; ++j) {
+		if (vn1[j] != 0.) {
+
+/*                 NOTE: The following lines follow from the analysis in */
+/*                 Lapack Working Note 176. */
+
+		    temp = (d__1 = a[i__ + j * a_dim1], abs(d__1)) / vn1[j];
+/* Computing MAX */
+		    d__1 = 0., d__2 = (temp + 1.) * (1. - temp);
+		    temp = f2cmax(d__1,d__2);
+/* Computing 2nd power */
+		    d__1 = vn1[j] / vn2[j];
+		    temp2 = temp * (d__1 * d__1);
+		    if (temp2 <= tol3z) {
+
+/*                    At J-index, we have a difficult column for the */
+/*                    update of the 2-norm. Save the index of the previous */
+/*                    difficult column in IWORK(J-1). */
+/*                    NOTE: LSTICC > 1, therefore we can use IWORK only */
+/*                    with N-1 elements, where the elements are */
+/*                    shifted by 1 to the left. */
+
+			iwork[j - 1] = lsticc;
+
+/*                    Set the index of the last difficult column LSTICC. */
+
+			lsticc = j;
+
+		    } else {
+			vn1[j] *= sqrt(temp);
+		    }
+		}
+	    }
+
+	}
+
+/*        End of while loop. */
+
+    }
+
+/*     Now, after the loop: */
+/*        Set KB, the number of factorized columns in the block; */
+/*        Set IF, the number of processed rows in the block, which */
+/*                is the same as the number of processed rows in */
+/*                the original whole matrix A_orig, IF = IOFFSET + KB. */
+
+    *kb = k;
+    if__ = i__;
+/*     NOTE(review): i__ is assigned only inside the loop above; this read assumes the loop ran at least once (NB >= 1 after clamping) -- confirm callers. */
+/*     Apply the block reflector to the residual of the matrix A */
+/*     and the residual of the right hand sides B, if the residual */
+/*     matrix and/or the residual of the right hand sides */
+/*     exist,  i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. */
+/*     This occurs when KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */
+
+/*     A(IF+1:M,KB+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */
+/*                         A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. */
+
+    if (*kb < minmnupdt) {
+	i__1 = *m - if__;
+	i__2 = *n + *nrhs - *kb;
+	dgemm_("No transpose", "Transpose", &i__1, &i__2, kb, &c_b7, &a[if__ 
+		+ 1 + a_dim1], lda, &f[*kb + 1 + f_dim1], ldf, &c_b8, &a[if__ 
+		+ 1 + (*kb + 1) * a_dim1], lda);
+    }
+
+/*     Recompute the 2-norm of the difficult columns. */
+/*     Loop over the index of the difficult columns from the largest */
+/*     to the smallest index. */
+
+    while(lsticc > 0) {
+
+/*        LSTICC, the index of the last difficult column, is greater */
+/*        than 1. */
+/*        ITEMP is the index of the previous difficult column. */
+
+	itemp = iwork[lsticc - 1];
+
+/*        Compute the 2-norm explicitly for the last difficult column and */
+/*        save it in the partial and exact 2-norm vectors VN1 and VN2. */
+
+/*        NOTE: The computation of VN1( LSTICC ) relies on the fact that */
+/*        DNRM2 does not fail on vectors with norm below the value of */
+/*        SQRT(DLAMCH('S')) */
+
+	i__1 = *m - if__;
+	vn1[lsticc] = dnrm2_(&i__1, &a[if__ + 1 + lsticc * a_dim1], &c__1);
+	vn2[lsticc] = vn1[lsticc];
+
+/*        Downdate the index of the last difficult column to */
+/*        the index of the previous difficult column. */
+
+	lsticc = itemp;
+
+    }
+
+    return 0;
+
+/*     End of DLAQP3RK */
+
+} /* dlaqp3rk_ */
+
diff --git a/lapack-netlib/SRC/dlaqp3rk.f b/lapack-netlib/SRC/dlaqp3rk.f
new file mode 100644
index 000000000..8139345ed
--- /dev/null
+++ b/lapack-netlib/SRC/dlaqp3rk.f
@@ -0,0 +1,935 @@
+*> \brief \b DLAQP3RK computes a step of truncated QR factorization with column pivoting of a real m-by-n matrix A using Level 3 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DLAQP3RK + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dlaqp3rk.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dlaqp3rk.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dlaqp3rk.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE DLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL,
+*     $                     RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB,
+*     $                     MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+*     $                     VN1, VN2, AUXV, F, LDF, IWORK, INFO )
+*      IMPLICIT NONE
+*      LOGICAL            DONE
+*      INTEGER            INFO, IOFFSET, KB, KP1, LDA, LDF, M, N,
+*     $                   NB, NRHS
+*      DOUBLE PRECISION   ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+*     $                   RELTOL
+*
+*     .. Scalar Arguments ..
+*      LOGICAL            DONE
+*      INTEGER            KB, LDA, LDF, M, N, NB, NRHS, IOFFSET
+*      DOUBLE PRECISION   ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+*     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+*      INTEGER            IWORK( * ), JPIV( * )
+*      DOUBLE PRECISION   A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ),
+*     $                   VN1( * ), VN2( * )
+*     ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DLAQP3RK computes a step of truncated QR factorization with column
+*> pivoting of a real M-by-N matrix A block A(IOFFSET+1:M,1:N)
+*> by using Level 3 BLAS as
+*>
+*>   A * P(KB) = Q(KB) * R(KB).
+*>
+*> The routine tries to factorize NB columns from A starting from
+*> the row IOFFSET+1 and updates the residual matrix with BLAS 3
+*> xGEMM. The number of actually factorized columns is returned
+*> in KB and can be smaller than NB.
+*>
+*> Block A(1:IOFFSET,1:N) is accordingly pivoted, but not factorized.
+*>
+*> The routine also overwrites the right-hand-sides B matrix stored
+*> in A(IOFFSET+1:M,N+1:N+NRHS) with Q(KB)**T * B.
+*>
+*> Cases when the number of factorized columns KB < NB:
+*>
+*> (1) In some cases, due to catastrophic cancellations, the routine
+*> cannot factorize all NB columns and needs to update the residual matrix.
+*> Hence, the actual number of factorized columns in the block returned
+*> in KB is smaller than NB. The logical DONE is returned as FALSE.
+*> The factorization of the whole original matrix A_orig must proceed
+*> with the next block.
+*>
+*> (2) Whenever the stopping criterion ABSTOL or RELTOL is satisfied,
+*> the factorization of the whole original matrix A_orig is stopped,
+*> the logical DONE is returned as TRUE. The number of factorized
+*> columns which is smaller than NB is returned in KB.
+*>
+*> (3) In case both stopping criteria ABSTOL and RELTOL are not used,
+*> and when the residual matrix is a zero matrix in some factorization
+*> step KB, the factorization of the whole original matrix A_orig is
+*> stopped, the logical DONE is returned as TRUE. The number of
+*> factorized columns which is smaller than NB is returned in KB.
+*>
+*> (4) Whenever NaN is detected in the matrix A or in the array TAU,
+*> the factorization of the whole original matrix A_orig is stopped,
+*> the logical DONE is returned as TRUE. The number of factorized
+*> columns which is smaller than NB is returned in KB. The INFO
+*> parameter is set to the column index of the first NaN occurrence.
+*>
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. N >= 0
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of
+*>          columns of the matrix B. NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in] IOFFSET
+*> \verbatim
+*>          IOFFSET is INTEGER
+*>          The number of rows of the matrix A that must be pivoted
+*>          but not factorized. IOFFSET >= 0.
+*>
+*>          IOFFSET also represents the number of columns of the whole
+*>          original matrix A_orig that have been factorized
+*>          in the previous steps.
+*> \endverbatim
+*>
+*> \param[in] NB
+*> \verbatim
+*>          NB is INTEGER
+*>          Factorization block size, i.e the number of columns
+*>          to factorize in the matrix A. 0 <= NB
+*>
+*>          If NB = 0, then the routine exits immediately.
+*>             This means that the factorization is not performed,
+*>             the matrices A and B and the arrays TAU, JPIV
+*>             are not modified.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is DOUBLE PRECISION, cannot be NaN.
+*>
+*>          The absolute tolerance (stopping threshold) for
+*>          maximum column 2-norm of the residual matrix.
+*>          The algorithm converges (stops the factorization) when
+*>          the maximum column 2-norm of the residual matrix
+*>          is less than or equal to ABSTOL.
+*>
+*>          a) If ABSTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on NB and RELTOL.
+*>                This includes the case ABSTOL = -Inf.
+*>
+*>          b) If 0.0 <= ABSTOL then the input value
+*>                of ABSTOL is used.
+*> \endverbatim
+*>
+*> \param[in] RELTOL
+*> \verbatim
+*>          RELTOL is DOUBLE PRECISION, cannot be NaN.
+*>
+*>          The tolerance (stopping threshold) for the ratio of the
+*>          maximum column 2-norm of the residual matrix to the maximum
+*>          column 2-norm of the original matrix A_orig. The algorithm
+*>          converges (stops the factorization), when this ratio is
+*>          less than or equal to RELTOL.
+*>
+*>          a) If RELTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on NB and ABSTOL.
+*>                This includes the case RELTOL = -Inf.
+*>
+*>          b) If 0.0 <= RELTOL then the input value of RELTOL
+*>                is used.
+*> \endverbatim
+*>
+*> \param[in] KP1
+*> \verbatim
+*>          KP1 is INTEGER
+*>          The index of the column with the maximum 2-norm in
+*>          the whole original matrix A_orig determined in the
+*>          main routine DGEQP3RK. 1 <= KP1 <= N_orig.
+*> \endverbatim
+*>
+*> \param[in] MAXC2NRM
+*> \verbatim
+*>          MAXC2NRM is DOUBLE PRECISION
+*>          The maximum column 2-norm of the whole original
+*>          matrix A_orig computed in the main routine DGEQP3RK.
+*>          MAXC2NRM >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is DOUBLE PRECISION array, dimension (LDA,N+NRHS)
+*>          On entry:
+*>              the M-by-N matrix A and M-by-NRHS matrix B, as in
+*>
+*>                                  N     NRHS
+*>              array_A   =   M  [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>          1. The elements in block A(IOFFSET+1:M,1:KB) below
+*>             the diagonal together with the array TAU represent
+*>             the orthogonal matrix Q(KB) as a product of elementary
+*>             reflectors.
+*>          2. The upper triangular block of the matrix A stored
+*>             in A(IOFFSET+1:M,1:KB) is the triangular factor obtained.
+*>          3. The block of the matrix A stored in A(1:IOFFSET,1:N)
+*>             has been accordingly pivoted, but not factorized.
+*>          4. The rest of the array A, block A(IOFFSET+1:M,KB+1:N+NRHS).
+*>             The left part A(IOFFSET+1:M,KB+1:N) of this block
+*>             contains the residual of the matrix A, and,
+*>             if NRHS > 0, the right part of the block
+*>             A(IOFFSET+1:M,N+1:N+NRHS) contains the block of
+*>             the right-hand-side matrix B. Both these blocks have been
+*>             updated by multiplication from the left by Q(KB)**T.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] DONE
+*> \verbatim
+*>          DONE is LOGICAL
+*>          TRUE: a) if the factorization completed before processing
+*>                   all min(M-IOFFSET,NB,N) columns due to ABSTOL
+*>                   or RELTOL criterion,
+*>                b) if the factorization completed before processing
+*>                   all min(M-IOFFSET,NB,N) columns due to the
+*>                   residual matrix being a ZERO matrix.
+*>                c) when NaN was detected in the matrix A
+*>                   or in the array TAU.
+*>          FALSE: otherwise.
+*> \endverbatim
+*>
+*> \param[out] KB
+*> \verbatim
+*>          KB is INTEGER
+*>          Factorization rank of the matrix A, i.e. the rank of
+*>          the factor R, which is the same as the number of non-zero
+*>          rows of the factor R.  0 <= KB <= min(M-IOFFSET,NB,N).
+*>
+*>          KB also represents the number of non-zero Householder
+*>          vectors.
+*> \endverbatim
+*>
+*> \param[out] MAXC2NRMK
+*> \verbatim
+*>          MAXC2NRMK is DOUBLE PRECISION
+*>          The maximum column 2-norm of the residual matrix,
+*>          when the factorization stopped at rank KB. MAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] RELMAXC2NRMK
+*> \verbatim
+*>          RELMAXC2NRMK is DOUBLE PRECISION
+*>          The ratio MAXC2NRMK / MAXC2NRM of the maximum column
+*>          2-norm of the residual matrix (when the factorization
+*>          stopped at rank KB) to the maximum column 2-norm of the
+*>          original matrix A_orig. RELMAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] JPIV
+*> \verbatim
+*>          JPIV is INTEGER array, dimension (N)
+*>          Column pivot indices, for 1 <= j <= N, column j
+*>          of the matrix A was interchanged with column JPIV(j).
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is DOUBLE PRECISION array, dimension (min(M-IOFFSET,N))
+*>          The scalar factors of the elementary reflectors.
+*> \endverbatim
+*>
+*> \param[in,out] VN1
+*> \verbatim
+*>          VN1 is DOUBLE PRECISION array, dimension (N)
+*>          The vector with the partial column norms.
+*> \endverbatim
+*>
+*> \param[in,out] VN2
+*> \verbatim
+*>          VN2 is DOUBLE PRECISION array, dimension (N)
+*>          The vector with the exact column norms.
+*> \endverbatim
+*>
+*> \param[out] AUXV
+*> \verbatim
+*>          AUXV is DOUBLE PRECISION array, dimension (NB)
+*>          Auxiliary vector.
+*> \endverbatim
+*>
+*> \param[out] F
+*> \verbatim
+*>          F is DOUBLE PRECISION array, dimension (LDF,NB)
+*>          Matrix F**T = L*(Y**T)*A.
+*> \endverbatim
+*>
+*> \param[in] LDF
+*> \verbatim
+*>          LDF is INTEGER
+*>          The leading dimension of the array F. LDF >= max(1,N+NRHS).
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (N-1).
+*>          Is a work array. ( IWORK is used to store indices
+*>          of "bad" columns for norm downdating in the residual
+*>          matrix ).
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          1) INFO = 0: successful exit.
+*>          2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was
+*>             detected and the routine stops the computation.
+*>             The j_1-th column of the matrix A or the j_1-th
+*>             element of array TAU contains the first occurrence
+*>             of NaN in the factorization step KB+1 ( when KB columns
+*>             have been factorized ).
+*>
+*>             On exit:
+*>             KB                  is set to the number of
+*>                                    factorized columns without
+*>                                    exception.
+*>             MAXC2NRMK           is set to NaN.
+*>             RELMAXC2NRMK        is set to NaN.
+*>             TAU(KB+1:min(M,N))     is not set and contains undefined
+*>                                    elements. If j_1=KB+1, TAU(KB+1)
+*>                                    may contain NaN.
+*>          3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN
+*>             was detected, but +Inf (or -Inf) was detected and
+*>             the routine continues the computation until completion.
+*>             The (j_2-N)-th column of the matrix A contains the first
+*>             occurrence of +Inf (or -Inf) in the factorization
+*>             step KB+1 ( when KB columns have been factorized ).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup laqp3rk
+*
+*> \par References:
+*  ================
+*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996.
+*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain.
+*> X. Sun, Computer Science Dept., Duke University, USA.
+*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA.
+*> A BLAS-3 version of the QR factorization with column pivoting.
+*> LAPACK Working Note 114
+*> \htmlonly
+*> <a href="https://www.netlib.org/lapack/lawnspdf/lawn114.pdf">https://www.netlib.org/lapack/lawnspdf/lawn114.pdf</a>
+*> \endhtmlonly
+*> and in
+*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998.
+*> \htmlonly
+*> <a href="https://doi.org/10.1137/S1064827595296732">https://doi.org/10.1137/S1064827595296732</a>
+*> \endhtmlonly
+*>
+*> [2] A partial column norm updating strategy developed in 2006.
+*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia.
+*> On the failure of rank revealing QR factorization software – a case study.
+*> LAPACK Working Note 176.
+*> \htmlonly
+*> <a href="http://www.netlib.org/lapack/lawnspdf/lawn176.pdf">http://www.netlib.org/lapack/lawnspdf/lawn176.pdf</a>
+*> \endhtmlonly
+*> and in
+*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages.
+*> \htmlonly
+*> <a href="https://doi.org/10.1145/1377612.1377616">https://doi.org/10.1145/1377612.1377616</a>
+*> \endhtmlonly
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*>  November  2023, Igor Kozachenko, James Demmel,
+*>                  EECS Department,
+*>                  University of California, Berkeley, USA.
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE DLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL,
+     $                     RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB,
+     $                     MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+     $                     VN1, VN2, AUXV, F, LDF, IWORK, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     Factorizes up to NB columns of the block A(IOFFSET+1:M,1:N)
+*     with column pivoting, accumulating the update matrix F, and then
+*     applies the accumulated block update to the remaining columns
+*     A(IF+1:M,KB+1:N+NRHS) with a single DGEMM call.
+*
+*     .. Scalar Arguments ..
+      LOGICAL            DONE
+      INTEGER            INFO, IOFFSET, KB, KP1, LDA, LDF, M, N,
+     $                   NB, NRHS
+      DOUBLE PRECISION   ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IWORK( * ), JPIV( * )
+      DOUBLE PRECISION   A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ),
+     $                   VN1( * ), VN2( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            ITEMP, J, K, MINMNFACT, MINMNUPDT,
+     $                   LSTICC, KP, I, IF
+      DOUBLE PRECISION   AIK, HUGEVAL, TEMP, TEMP2, TOL3Z
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DGEMM, DGEMV, DLARFG, DSWAP
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN, SQRT
+*     ..
+*     .. External Functions ..
+      LOGICAL            DISNAN
+      INTEGER            IDAMAX
+      DOUBLE PRECISION   DLAMCH, DNRM2
+      EXTERNAL           DISNAN, DLAMCH, IDAMAX, DNRM2
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize INFO
+*
+      INFO = 0
+*
+*     MINMNFACT is the smallest dimension of the submatrix
+*     A(IOFFSET+1:M,1:N) to be factorized.
+*     MINMNUPDT is the smallest dimension of the submatrix
+*     A(IOFFSET+1:M,1:N+NRHS) to be updated.
+*     NOTE: NB is overwritten here with MIN( NB, MINMNFACT ).
+*
+      MINMNFACT = MIN( M-IOFFSET, N )
+      MINMNUPDT = MIN( M-IOFFSET, N+NRHS )
+      NB = MIN( NB, MINMNFACT )
+      TOL3Z = SQRT( DLAMCH( 'Epsilon' ) )
+      HUGEVAL = DLAMCH( 'Overflow' )
+*
+*     Compute factorization in a while loop over NB columns,
+*     K is the column index in the block A(1:M,1:N).
+*     The loop also stops as soon as a difficult column has been
+*     recorded ( LSTICC.NE.0 ), so that its norm can be recomputed.
+*
+      K = 0
+      LSTICC = 0
+      DONE = .FALSE.
+*
+      DO WHILE ( K.LT.NB .AND. LSTICC.EQ.0 )
+         K = K + 1
+         I = IOFFSET + K
+*
+         IF( I.EQ.1 ) THEN
+*
+*           We are at the first column of the original whole matrix A_orig,
+*           therefore we use the computed KP1 and MAXC2NRM from the
+*           main routine.
+*
+            KP = KP1
+*
+         ELSE
+*
+*           Determine the pivot column in K-th step, i.e. the index
+*           of the column with the maximum 2-norm in the
+*           submatrix A(I:M,K:N).
+*
+            KP = ( K-1 ) + IDAMAX( N-K+1, VN1( K ), 1 )
+*
+*           Determine the maximum column 2-norm and the relative maximum
+*           column 2-norm of the submatrix A(I:M,K:N) in step K.
+*
+            MAXC2NRMK = VN1( KP )
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,K:N) contains NaN, set
+*           INFO parameter to the column number, where the first NaN
+*           is found and return from the routine.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( DISNAN( MAXC2NRMK ) ) THEN
+*
+               DONE = .TRUE.
+*
+*              Set KB, the number of factorized partial columns
+*                      that are non-zero in each step in the block,
+*                      i.e. the rank of the factor R.
+*              Set IF, the number of processed rows in the block, which
+*                      is the same as the number of processed rows in
+*                      the original whole matrix A_orig.
+*
+               KB = K - 1
+               IF = I - 1
+               INFO = KB + KP
+*
+*              Set RELMAXC2NRMK to NaN.
+*
+               RELMAXC2NRMK = MAXC2NRMK
+*
+*              There is no need to apply the block reflector to the
+*              residual of the matrix A stored in A(KB+1:M,KB+1:N),
+*              since the submatrix contains NaN and we stop
+*              the computation.
+*              But, we need to apply the block reflector to the residual
+*              right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the
+*              residual right hand sides exist.  This occurs
+*              when ( NRHS != 0 AND KB < (M-IOFFSET) ):
+*
+*              A(IF+1:M,N+1:N+NRHS) := A(IF+1:M,N+1:N+NRHS) -
+*                              A(IF+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T.
+*
+               IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN
+                  CALL DGEMM( 'No transpose', 'Transpose',
+     $                  M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA,
+     $                  F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA )
+               END IF
+*
+*              There is no need to recompute the 2-norm of the
+*              difficult columns, since we stop the factorization.
+*
+*              Array TAU(KB+1:MINMNFACT) is not set and contains
+*              undefined elements.
+*
+*              Return from the routine.
+*
+               RETURN
+            END IF
+*
+*           Quick return, if the submatrix A(I:M,K:N) is
+*           a zero matrix. We need to check it only if the column index
+*           (same as row index) is larger than 1, since the condition
+*           for the whole original matrix A_orig is checked in the main
+*           routine.
+*
+            IF( MAXC2NRMK.EQ.ZERO ) THEN
+*
+               DONE = .TRUE.
+*
+*              Set KB, the number of factorized partial columns
+*                      that are non-zero in each step in the block,
+*                      i.e. the rank of the factor R.
+*              Set IF, the number of processed rows in the block, which
+*                      is the same as the number of processed rows in
+*                      the original whole matrix A_orig.
+*
+               KB = K - 1
+               IF = I - 1
+               RELMAXC2NRMK = ZERO
+*
+*              There is no need to apply the block reflector to the
+*              residual of the matrix A stored in A(KB+1:M,KB+1:N),
+*              since the submatrix is zero and we stop the computation.
+*              But, we need to apply the block reflector to the residual
+*              right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the
+*              residual right hand sides exist.  This occurs
+*              when ( NRHS != 0 AND KB < (M-IOFFSET) ):
+*
+*              A(IF+1:M,N+1:N+NRHS) := A(IF+1:M,N+1:N+NRHS) -
+*                              A(IF+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T.
+*
+               IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN
+                  CALL DGEMM( 'No transpose', 'Transpose',
+     $                  M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA,
+     $                  F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA )
+               END IF
+*
+*              There is no need to recompute the 2-norm of the
+*              difficult columns, since we stop the factorization.
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO,
+*              which is equivalent to setting TAU(K:MINMNFACT) = ZERO.
+*
+               DO J = K, MINMNFACT
+                  TAU( J ) = ZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,K:N) contains Inf,
+*           set INFO parameter to the column number, where
+*           the first Inf is found plus N, and continue
+*           the computation.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN
+               INFO = N + K - 1 + KP
+            END IF
+*
+*           ============================================================
+*
+*           Test for the second and third tolerance stopping criteria.
+*           NOTE: There is no need to test for ABSTOL.GE.ZERO, since
+*           MAXC2NRMK is non-negative. Similarly, there is no need
+*           to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is
+*           non-negative.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            RELMAXC2NRMK =  MAXC2NRMK / MAXC2NRM
+*
+            IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN
+*
+               DONE = .TRUE.
+*
+*              Set KB, the number of factorized partial columns
+*                      that are non-zero in each step in the block,
+*                      i.e. the rank of the factor R.
+*              Set IF, the number of processed rows in the block, which
+*                      is the same as the number of processed rows in
+*                      the original whole matrix A_orig;
+*
+               KB = K - 1
+               IF = I - 1
+*
+*              Apply the block reflector to the residual of the
+*              matrix A and the residual of the right hand sides B, if
+*              the residual matrix and/or the residual of the right
+*              hand sides exist,  i.e. if the submatrix
+*              A(IF+1:M,KB+1:N+NRHS) exists.  This occurs when
+*                 KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ):
+*
+*              A(IF+1:M,KB+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) -
+*                             A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T.
+*
+               IF( KB.LT.MINMNUPDT ) THEN
+                  CALL DGEMM( 'No transpose', 'Transpose',
+     $                  M-IF, N+NRHS-KB, KB,-ONE, A( IF+1, 1 ), LDA,
+     $                  F( KB+1, 1 ), LDF, ONE, A( IF+1, KB+1 ), LDA )
+               END IF
+*
+*              There is no need to recompute the 2-norm of the
+*              difficult columns, since we stop the factorization.
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO,
+*              which is equivalent to setting TAU(K:MINMNFACT) = ZERO.
+*
+               DO J = K, MINMNFACT
+                  TAU( J ) = ZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           End ELSE of IF(I.EQ.1)
+*
+         END IF
+*
+*        ===============================================================
+*
+*        If the pivot column is not the first column of the
+*        subblock A(1:M,K:N):
+*        1) swap the K-th column and the KP-th pivot column
+*           in A(1:M,1:N);
+*        2) swap the K-th row and the KP-th row in F(1:N,1:K-1)
+*        3) copy the K-th element into the KP-th element of the partial
+*           and exact 2-norm vectors VN1 and VN2. (Swap is not needed
+*           for VN1 and VN2 since we use the element with the index
+*           larger than K in the next loop step.)
+*        4) Save the pivot interchange with the indices relative to
+*           the original matrix A_orig, not the block A(1:M,1:N).
+*
+         IF( KP.NE.K ) THEN
+            CALL DSWAP( M, A( 1, KP ), 1, A( 1, K ), 1 )
+            CALL DSWAP( K-1, F( KP, 1 ), LDF, F( K, 1 ), LDF )
+            VN1( KP ) = VN1( K )
+            VN2( KP ) = VN2( K )
+            ITEMP = JPIV( KP )
+            JPIV( KP ) = JPIV( K )
+            JPIV( K ) = ITEMP
+         END IF
+*
+*        Apply previous Householder reflectors to column K:
+*        A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**T.
+*
+         IF( K.GT.1 ) THEN
+            CALL DGEMV( 'No transpose', M-I+1, K-1, -ONE, A( I, 1 ),
+     $                  LDA, F( K, 1 ), LDF, ONE, A( I, K ), 1 )
+         END IF
+*
+*        Generate elementary reflector H(k) using the column A(I:M,K).
+*
+         IF( I.LT.M ) THEN
+            CALL DLARFG( M-I+1, A( I, K ), A( I+1, K ), 1, TAU( K ) )
+         ELSE
+            TAU( K ) = ZERO
+         END IF
+*
+*        Check if TAU(K) contains NaN, set INFO parameter
+*        to the column number where NaN is found and return from
+*        the routine.
+*        NOTE: There is no need to check TAU(K) for Inf,
+*        since DLARFG cannot produce TAU(K) or Householder vector
+*        below the diagonal containing Inf. Only BETA on the diagonal,
+*        returned by DLARFG can contain Inf, which requires
+*        TAU(K) to contain NaN. Therefore, this case of generating Inf
+*        by DLARFG is covered by checking TAU(K) for NaN.
+*
+         IF( DISNAN( TAU(K) ) ) THEN
+*
+            DONE = .TRUE.
+*
+*           Set KB, the number of factorized partial columns
+*                   that are non-zero in each step in the block,
+*                   i.e. the rank of the factor R.
+*           Set IF, the number of processed rows in the block, which
+*                   is the same as the number of processed rows in
+*                   the original whole matrix A_orig.
+*
+            KB = K - 1
+            IF = I - 1
+            INFO = K
+*
+*           Set MAXC2NRMK and  RELMAXC2NRMK to NaN.
+*
+            MAXC2NRMK = TAU( K )
+            RELMAXC2NRMK = TAU( K )
+*
+*           There is no need to apply the block reflector to the
+*           residual of the matrix A stored in A(KB+1:M,KB+1:N),
+*           since the submatrix contains NaN and we stop
+*           the computation.
+*           But, we need to apply the block reflector to the residual
+*           right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the
+*           residual right hand sides exist.  This occurs
+*           when ( NRHS != 0 AND KB < (M-IOFFSET) ):
+*
+*           A(IF+1:M,N+1:N+NRHS) := A(IF+1:M,N+1:N+NRHS) -
+*                           A(IF+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T.
+*
+            IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN
+               CALL DGEMM( 'No transpose', 'Transpose',
+     $               M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA,
+     $               F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA )
+            END IF
+*
+*           There is no need to recompute the 2-norm of the
+*           difficult columns, since we stop the factorization.
+*
+*           Array TAU(KB+1:MINMNFACT) is not set and contains
+*           undefined elements.
+*
+*           Return from the routine.
+*
+            RETURN
+         END IF
+*
+*        ===============================================================
+*
+*        Temporarily store ONE on the diagonal so that A(I:M,K) holds
+*        the full Householder vector; the saved value AIK is restored
+*        after the row update below.
+*
+         AIK = A( I, K )
+         A( I, K ) = ONE
+*
+*        ===============================================================
+*
+*        Compute the current K-th column of F:
+*          1) F(K+1:N+NRHS,K) := tau(K) * A(I:M,K+1:N+NRHS)**T
+*                                * A(I:M,K).
+*
+         IF( K.LT.N+NRHS ) THEN
+            CALL DGEMV( 'Transpose', M-I+1, N+NRHS-K,
+     $                  TAU( K ), A( I, K+1 ), LDA, A( I, K ), 1,
+     $                  ZERO, F( K+1, K ), 1 )
+         END IF
+*
+*           2) Zero out elements above and on the diagonal of the
+*              column K in matrix F, i.e elements F(1:K,K).
+*
+         DO J = 1, K
+            F( J, K ) = ZERO
+         END DO
+*
+*         3) Incremental updating of the K-th column of F:
+*        F(1:N+NRHS,K) := F(1:N+NRHS,K) - tau(K) * F(1:N+NRHS,1:K-1)
+*                         * A(I:M,1:K-1)**T * A(I:M,K).
+*
+         IF( K.GT.1 ) THEN
+            CALL DGEMV( 'Transpose', M-I+1, K-1, -TAU( K ),
+     $                  A( I, 1 ), LDA, A( I, K ), 1, ZERO,
+     $                  AUXV( 1 ), 1 )
+*
+            CALL DGEMV( 'No transpose', N+NRHS, K-1, ONE,
+     $                  F( 1, 1 ), LDF, AUXV( 1 ), 1, ONE,
+     $                  F( 1, K ), 1 )
+         END IF
+*
+*        ===============================================================
+*
+*        Update the current I-th row of A:
+*        A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS)
+*                         - A(I,1:K)*F(K+1:N+NRHS,1:K)**T.
+*
+         IF( K.LT.N+NRHS ) THEN
+            CALL DGEMV( 'No transpose', N+NRHS-K, K, -ONE,
+     $                  F( K+1, 1 ), LDF, A( I, 1 ), LDA, ONE,
+     $                  A( I, K+1 ), LDA )
+         END IF
+*
+         A( I, K ) = AIK
+*
+*        Update the partial column 2-norms for the residual matrix,
+*        only if the residual matrix A(I+1:M,K+1:N) exists, i.e.
+*        when K < MINMNFACT = min( M-IOFFSET, N ).
+*
+         IF( K.LT.MINMNFACT ) THEN
+*
+            DO J = K + 1, N
+               IF( VN1( J ).NE.ZERO ) THEN
+*
+*                 NOTE: The following lines follow from the analysis in
+*                 Lapack Working Note 176.
+*
+                  TEMP = ABS( A( I, J ) ) / VN1( J )
+                  TEMP = MAX( ZERO, ( ONE+TEMP )*( ONE-TEMP ) )
+                  TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2
+                  IF( TEMP2.LE.TOL3Z ) THEN
+*
+*                    At J-index, we have a difficult column for the
+*                    update of the 2-norm. Save the index of the previous
+*                    difficult column in IWORK(J-1).
+*                    NOTE: LSTICC > 1, therefore we can use IWORK only
+*                    with N-1 elements, where the elements are
+*                    shifted by 1 to the left.
+*
+                     IWORK( J-1 ) = LSTICC
+*
+*                    Set the index of the last difficult column LSTICC.
+*
+                     LSTICC = J
+*
+                  ELSE
+                     VN1( J ) = VN1( J )*SQRT( TEMP )
+                  END IF
+               END IF
+            END DO
+*
+         END IF
+*
+*        End of while loop.
+*
+      END DO
+*
+*     Now, after the loop:
+*        Set KB, the number of factorized columns in the block;
+*        Set IF, the number of processed rows in the block, which
+*                is the same as the number of processed rows in
+*                the original whole matrix A_orig, IF = IOFFSET + KB.
+*
+*     NOTE: IF is computed from IOFFSET and KB instead of being copied
+*     from I, because I is assigned only inside the loop body and is
+*     therefore undefined when the loop is never entered ( NB = 0
+*     after the MIN above ). When the loop has run at least once,
+*     I = IOFFSET + K, so both forms give the same value.
+*
+      KB = K
+      IF = IOFFSET + KB
+*
+*     Apply the block reflector to the residual of the matrix A
+*     and the residual of the right hand sides B, if the residual
+*     matrix and/or the residual of the right hand sides
+*     exist,  i.e. if the submatrix A(IF+1:M,KB+1:N+NRHS) exists.
+*     This occurs when KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ):
+*
+*     A(IF+1:M,KB+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) -
+*                         A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T.
+*
+      IF( KB.LT.MINMNUPDT ) THEN
+         CALL DGEMM( 'No transpose', 'Transpose',
+     $         M-IF, N+NRHS-KB, KB, -ONE, A( IF+1, 1 ), LDA,
+     $         F( KB+1, 1 ), LDF, ONE, A( IF+1, KB+1 ), LDA )
+      END IF
+*
+*     Recompute the 2-norm of the difficult columns.
+*     Loop over the index of the difficult columns from the largest
+*     to the smallest index.
+*
+      DO WHILE( LSTICC.GT.0 )
+*
+*        LSTICC, the index of the last difficult column, is greater
+*        than 1.
+*        ITEMP is the index of the previous difficult column.
+*
+         ITEMP = IWORK( LSTICC-1 )
+*
+*        Compute the 2-norm explicitly for the last difficult column and
+*        save it in the partial and exact 2-norm vectors VN1 and VN2.
+*
+*        NOTE: The computation of VN1( LSTICC ) relies on the fact that
+*        DNRM2 does not fail on vectors with norm below the value of
+*        SQRT(DLAMCH('S'))
+*
+         VN1( LSTICC ) = DNRM2( M-IF, A( IF+1, LSTICC ), 1 )
+         VN2( LSTICC ) = VN1( LSTICC )
+*
+*        Downdate the index of the last difficult column to
+*        the index of the previous difficult column.
+*
+         LSTICC = ITEMP
+*
+      END DO
+*
+      RETURN
+*
+*     End of DLAQP3RK
+*
+      END
diff --git a/lapack-netlib/SRC/dlaqz0.f b/lapack-netlib/SRC/dlaqz0.f
index c4cb95fd3..84cb96bcb 100644
--- a/lapack-netlib/SRC/dlaqz0.f
+++ b/lapack-netlib/SRC/dlaqz0.f
@@ -102,7 +102,7 @@
 *>      Anal., 29(2006), pp. 199--227.
 *>
 *> Ref: T. Steel, D. Camps, K. Meerbergen, R. Vandebril "A multishift,
-*>      multipole rational QZ method with agressive early deflation"
+*>      multipole rational QZ method with aggressive early deflation"
 *> \endverbatim
 *
 *  Arguments:
@@ -332,7 +332,7 @@
       CHARACTER :: JBCMPZ*3
 
 *     External Functions
-      EXTERNAL :: XERBLA, DHGEQZ, DLASET, DLAQZ3, DLAQZ4, DLABAD,
+      EXTERNAL :: XERBLA, DHGEQZ, DLASET, DLAQZ3, DLAQZ4,
      $            DLARTG, DROT
       DOUBLE PRECISION, EXTERNAL :: DLAMCH, DLANHS
       LOGICAL, EXTERNAL :: LSAME
@@ -482,7 +482,6 @@
 *     Get machine constants
       SAFMIN = DLAMCH( 'SAFE MINIMUM' )
       SAFMAX = ONE/SAFMIN
-      CALL DLABAD( SAFMIN, SAFMAX )
       ULP = DLAMCH( 'PRECISION' )
       SMLNUM = SAFMIN*( DBLE( N )/ULP )
 
@@ -567,7 +566,7 @@
          DO WHILE ( K.GE.ISTART2 )
 
             IF( ABS( B( K, K ) ) .LT. BTOL ) THEN
-*              A diagonal element of B is negligable, move it
+*              A diagonal element of B is negligible, move it
 *              to the top and deflate it
                
                DO K2 = K, ISTART2+1, -1
diff --git a/lapack-netlib/SRC/dlarfb_gett.f b/lapack-netlib/SRC/dlarfb_gett.f
index 10ab6461e..2c7ea59b0 100644
--- a/lapack-netlib/SRC/dlarfb_gett.f
+++ b/lapack-netlib/SRC/dlarfb_gett.f
@@ -451,7 +451,7 @@
          IF( LNOTIDENT ) THEN
 *
 *           col2_(2) Compute W2: = (V1**T) * W2 = (A1**T) * W2,
-*           V1 is not an identy matrix, but unit lower-triangular
+*           V1 is not an identity matrix, but unit lower-triangular
 *           V1 stored in A1 (diagonal ones are not stored).
 *
 *
diff --git a/lapack-netlib/SRC/dlarfgp.f b/lapack-netlib/SRC/dlarfgp.f
index 69845056d..a8cf1b31e 100644
--- a/lapack-netlib/SRC/dlarfgp.f
+++ b/lapack-netlib/SRC/dlarfgp.f
@@ -97,7 +97,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleOTHERauxiliary
+*> \ingroup larfgp
 *
 *  =====================================================================
       SUBROUTINE DLARFGP( N, ALPHA, X, INCX, TAU )
@@ -122,7 +122,7 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            J, KNT
-      DOUBLE PRECISION   BETA, BIGNUM, SAVEALPHA, SMLNUM, XNORM
+      DOUBLE PRECISION   BETA, BIGNUM, EPS, SAVEALPHA, SMLNUM, XNORM
 *     ..
 *     .. External Functions ..
       DOUBLE PRECISION   DLAMCH, DLAPY2, DNRM2
@@ -141,11 +141,12 @@
          RETURN
       END IF
 *
+      EPS = DLAMCH( 'Precision' )
       XNORM = DNRM2( N-1, X, INCX )
 *
-      IF( XNORM.EQ.ZERO ) THEN
+      IF( XNORM.LE.EPS*ABS(ALPHA) ) THEN
 *
-*        H  =  [+/-1, 0; I], sign chosen so ALPHA >= 0
+*        H  =  [+/-1, 0; I], sign chosen so ALPHA >= 0.
 *
          IF( ALPHA.GE.ZERO ) THEN
 *           When TAU.eq.ZERO, the vector is special-cased to be
diff --git a/lapack-netlib/SRC/dlarre.f b/lapack-netlib/SRC/dlarre.f
index 70f59b829..1cceed1a2 100644
--- a/lapack-netlib/SRC/dlarre.f
+++ b/lapack-netlib/SRC/dlarre.f
@@ -51,7 +51,7 @@
 *> DSTEMR to compute the eigenvectors of T.
 *> The accuracy varies depending on whether bisection is used to
 *> find a few eigenvalues or the dqds algorithm (subroutine DLASQ2) to
-*> conpute all and then discard any unwanted one.
+*> compute all and then discard any unwanted one.
 *> As an added benefit, DLARRE also outputs the n
 *> Gerschgorin intervals for the matrices L_i D_i L_i^T.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dlaruv.f b/lapack-netlib/SRC/dlaruv.f
index 0f5c9541d..1a4ce4009 100644
--- a/lapack-netlib/SRC/dlaruv.f
+++ b/lapack-netlib/SRC/dlaruv.f
@@ -382,6 +382,11 @@
      $                   1537 /
 *     ..
 *     .. Executable Statements ..
+*
+*     Quick return for N < 1
+      IF ( N < 1 ) THEN
+         RETURN
+      END IF
 *
       I1 = ISEED( 1 )
       I2 = ISEED( 2 )
diff --git a/lapack-netlib/SRC/dlas2.f b/lapack-netlib/SRC/dlas2.f
index ea929e86f..ff75e3942 100644
--- a/lapack-netlib/SRC/dlas2.f
+++ b/lapack-netlib/SRC/dlas2.f
@@ -93,9 +93,7 @@
 *>  infinite.
 *>
 *>  Overflow will not occur unless the largest singular value itself
-*>  overflows, or is within a few ulps of overflow. (On machines with
-*>  partial overflow, like the Cray, overflow may occur if the largest
-*>  singular value is within a factor of 2 of overflow.)
+*>  overflows, or is within a few ulps of overflow.
 *>
 *>  Underflow is harmless if underflow is gradual. Otherwise, results
 *>  may correspond to a matrix modified by perturbations of size near
diff --git a/lapack-netlib/SRC/dlasd0.f b/lapack-netlib/SRC/dlasd0.f
index 215dc8717..6d8d4e2db 100644
--- a/lapack-netlib/SRC/dlasd0.f
+++ b/lapack-netlib/SRC/dlasd0.f
@@ -79,10 +79,11 @@
 *>         On exit, E has been destroyed.
 *> \endverbatim
 *>
-*> \param[out] U
+*> \param[in,out] U
 *> \verbatim
 *>          U is DOUBLE PRECISION array, dimension (LDU, N)
-*>         On exit, U contains the left singular vectors.
+*>         On exit, U contains the left singular vectors,
+*>          if U is passed in as the (N, N) identity matrix.
 *> \endverbatim
 *>
 *> \param[in] LDU
@@ -91,10 +92,11 @@
 *>         On entry, leading dimension of U.
 *> \endverbatim
 *>
-*> \param[out] VT
+*> \param[in,out] VT
 *> \verbatim
 *>          VT is DOUBLE PRECISION array, dimension (LDVT, M)
-*>         On exit, VT**T contains the right singular vectors.
+*>         On exit, VT**T contains the right singular vectors,
+*>          if VT is passed in as the (M, M) identity matrix.
 *> \endverbatim
 *>
 *> \param[in] LDVT
diff --git a/lapack-netlib/SRC/dlasd3.f b/lapack-netlib/SRC/dlasd3.f
index df939efc5..44957377b 100644
--- a/lapack-netlib/SRC/dlasd3.f
+++ b/lapack-netlib/SRC/dlasd3.f
@@ -44,13 +44,6 @@
 *> appropriate calls to DLASD4 and then updates the singular
 *> vectors by matrix multiplication.
 *>
-*> This code makes very mild assumptions about floating point
-*> arithmetic. It will work on machines with a guard digit in
-*> add/subtract, or on those binary machines without guard digits
-*> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2.
-*> It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
-*>
 *> DLASD3 is called from DLASD1.
 *> \endverbatim
 *
@@ -103,7 +96,7 @@
 *>         The leading dimension of the array Q.  LDQ >= K.
 *> \endverbatim
 *>
-*> \param[in,out] DSIGMA
+*> \param[in] DSIGMA
 *> \verbatim
 *>          DSIGMA is DOUBLE PRECISION array, dimension(K)
 *>         The first K elements of this array contain the old roots
@@ -249,8 +242,8 @@
       DOUBLE PRECISION   RHO, TEMP
 *     ..
 *     .. External Functions ..
-      DOUBLE PRECISION   DLAMC3, DNRM2
-      EXTERNAL           DLAMC3, DNRM2
+      DOUBLE PRECISION   DNRM2
+      EXTERNAL           DNRM2
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           DCOPY, DGEMM, DLACPY, DLASCL, DLASD4, XERBLA
@@ -310,27 +303,6 @@
          RETURN
       END IF
 *
-*     Modify values DSIGMA(i) to make sure all DSIGMA(i)-DSIGMA(j) can
-*     be computed with high relative accuracy (barring over/underflow).
-*     This is a problem on machines without a guard digit in
-*     add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2).
-*     The following code replaces DSIGMA(I) by 2*DSIGMA(I)-DSIGMA(I),
-*     which on any of these machines zeros out the bottommost
-*     bit of DSIGMA(I) if it is 1; this makes the subsequent
-*     subtractions DSIGMA(I)-DSIGMA(J) unproblematic when cancellation
-*     occurs. On binary machines with a guard digit (almost all
-*     machines) it does not change DSIGMA(I) at all. On hexadecimal
-*     and decimal machines with a guard digit, it slightly
-*     changes the bottommost bits of DSIGMA(I). It does not account
-*     for hexadecimal or decimal machines without guard digits
-*     (we know of none). We use a subroutine call to compute
-*     2*DSIGMA(I) to prevent optimizing compilers from eliminating
-*     this code.
-*
-      DO 20 I = 1, K
-         DSIGMA( I ) = DLAMC3( DSIGMA( I ), DSIGMA( I ) ) - DSIGMA( I )
-   20 CONTINUE
-*
 *     Keep a copy of Z.
 *
       CALL DCOPY( K, Z, 1, Q, 1 )
diff --git a/lapack-netlib/SRC/dlasd8.f b/lapack-netlib/SRC/dlasd8.f
index a769bdb22..73c3ef0b4 100644
--- a/lapack-netlib/SRC/dlasd8.f
+++ b/lapack-netlib/SRC/dlasd8.f
@@ -121,14 +121,12 @@
 *>          The leading dimension of DIFR, must be at least K.
 *> \endverbatim
 *>
-*> \param[in,out] DSIGMA
+*> \param[in] DSIGMA
 *> \verbatim
 *>          DSIGMA is DOUBLE PRECISION array, dimension ( K )
 *>          On entry, the first K elements of this array contain the old
 *>          roots of the deflated updating problem.  These are the poles
 *>          of the secular equation.
-*>          On exit, the elements of DSIGMA may be very slightly altered
-*>          in value.
 *> \endverbatim
 *>
 *> \param[out] WORK
@@ -227,27 +225,6 @@
          RETURN
       END IF
 *
-*     Modify values DSIGMA(i) to make sure all DSIGMA(i)-DSIGMA(j) can
-*     be computed with high relative accuracy (barring over/underflow).
-*     This is a problem on machines without a guard digit in
-*     add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2).
-*     The following code replaces DSIGMA(I) by 2*DSIGMA(I)-DSIGMA(I),
-*     which on any of these machines zeros out the bottommost
-*     bit of DSIGMA(I) if it is 1; this makes the subsequent
-*     subtractions DSIGMA(I)-DSIGMA(J) unproblematic when cancellation
-*     occurs. On binary machines with a guard digit (almost all
-*     machines) it does not change DSIGMA(I) at all. On hexadecimal
-*     and decimal machines with a guard digit, it slightly
-*     changes the bottommost bits of DSIGMA(I). It does not account
-*     for hexadecimal or decimal machines without guard digits
-*     (we know of none). We use a subroutine call to compute
-*     2*DLAMBDA(I) to prevent optimizing compilers from eliminating
-*     this code.
-*
-      DO 10 I = 1, K
-         DSIGMA( I ) = DLAMC3( DSIGMA( I ), DSIGMA( I ) ) - DSIGMA( I )
-   10 CONTINUE
-*
 *     Book keeping.
 *
       IWK1 = 1
@@ -312,6 +289,11 @@
             DSIGJP = -DSIGMA( J+1 )
          END IF
          WORK( J ) = -Z( J ) / DIFLJ / ( DSIGMA( J )+DJ )
+*
+*        Use calls to the subroutine DLAMC3 to enforce the parentheses
+*        (x+y)+z. The goal is to prevent optimizing compilers
+*        from doing x+(y+z).
+*
          DO 60 I = 1, J - 1
             WORK( I ) = Z( I ) / ( DLAMC3( DSIGMA( I ), DSIGJ )-DIFLJ )
      $                   / ( DSIGMA( I )+DJ )
diff --git a/lapack-netlib/SRC/dlassq.f90 b/lapack-netlib/SRC/dlassq.f90
index fddd1bf38..37626844b 100644
--- a/lapack-netlib/SRC/dlassq.f90
+++ b/lapack-netlib/SRC/dlassq.f90
@@ -34,28 +34,15 @@
 !>
 !> \verbatim
 !>
-!> DLASSQ  returns the values  scl  and  smsq  such that
+!> DLASSQ returns the values scale_out and sumsq_out such that
 !>
-!>    ( scl**2 )*smsq = x( 1 )**2 +...+ x( n )**2 + ( scale**2 )*sumsq,
+!>    (scale_out**2)*sumsq_out = x( 1 )**2 +...+ x( n )**2 + (scale**2)*sumsq,
 !>
-!> where  x( i ) = X( 1 + ( i - 1 )*INCX ). The value of  sumsq  is
+!> where x( i ) = X( 1 + ( i - 1 )*INCX ). The value of sumsq is
 !> assumed to be non-negative.
 !>
 !> scale and sumsq must be supplied in SCALE and SUMSQ and
-!> scl and smsq are overwritten on SCALE and SUMSQ respectively.
-!>
-!> If scale * sqrt( sumsq ) > tbig then
-!>    we require:   scale >= sqrt( TINY*EPS ) / sbig   on entry,
-!> and if 0 < scale * sqrt( sumsq ) < tsml then
-!>    we require:   scale <= sqrt( HUGE ) / ssml       on entry,
-!> where
-!>    tbig -- upper threshold for values whose square is representable;
-!>    sbig -- scaling constant for big numbers; \see la_constants.f90
-!>    tsml -- lower threshold for values whose square is representable;
-!>    ssml -- scaling constant for small numbers; \see la_constants.f90
-!> and
-!>    TINY*EPS -- tiniest representable number;
-!>    HUGE     -- biggest representable number.
+!> scale_out and sumsq_out are overwritten on SCALE and SUMSQ respectively.
 !>
 !> \endverbatim
 !
@@ -72,7 +59,7 @@
 !> \verbatim
 !>          X is DOUBLE PRECISION array, dimension (1+(N-1)*abs(INCX))
 !>          The vector for which a scaled sum of squares is computed.
-!>             x( i )  = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n.
+!>             x( i ) = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n.
 !> \endverbatim
 !>
 !> \param[in] INCX
@@ -82,24 +69,24 @@
 !>          If INCX > 0, X(1+(i-1)*INCX) = x(i) for 1 <= i <= n
 !>          If INCX < 0, X(1-(n-i)*INCX) = x(i) for 1 <= i <= n
 !>          If INCX = 0, x isn't a vector so there is no need to call
-!>          this subroutine.  If you call it anyway, it will count x(1)
+!>          this subroutine. If you call it anyway, it will count x(1)
 !>          in the vector norm N times.
 !> \endverbatim
 !>
 !> \param[in,out] SCALE
 !> \verbatim
 !>          SCALE is DOUBLE PRECISION
-!>          On entry, the value  scale  in the equation above.
-!>          On exit, SCALE is overwritten with  scl , the scaling factor
+!>          On entry, the value scale in the equation above.
+!>          On exit, SCALE is overwritten by scale_out, the scaling factor
 !>          for the sum of squares.
 !> \endverbatim
 !>
 !> \param[in,out] SUMSQ
 !> \verbatim
 !>          SUMSQ is DOUBLE PRECISION
-!>          On entry, the value  sumsq  in the equation above.
-!>          On exit, SUMSQ is overwritten with  smsq , the basic sum of
-!>          squares from which  scl  has been factored out.
+!>          On entry, the value sumsq in the equation above.
+!>          On exit, SUMSQ is overwritten by sumsq_out, the basic sum of
+!>          squares from which scale_out has been factored out.
 !> \endverbatim
 !
 !  Authors:
@@ -130,10 +117,10 @@
 !>
 !> \endverbatim
 !
-!> \ingroup OTHERauxiliary
+!> \ingroup lassq
 !
 !  =====================================================================
-subroutine DLASSQ( n, x, incx, scl, sumsq )
+subroutine DLASSQ( n, x, incx, scale, sumsq )
    use LA_CONSTANTS, &
       only: wp=>dp, zero=>dzero, one=>done, &
             sbig=>dsbig, ssml=>dssml, tbig=>dtbig, tsml=>dtsml
@@ -145,7 +132,7 @@ subroutine DLASSQ( n, x, incx, scl, sumsq )
 !
 !  .. Scalar Arguments ..
    integer :: incx, n
-   real(wp) :: scl, sumsq
+   real(wp) :: scale, sumsq
 !  ..
 !  .. Array Arguments ..
    real(wp) :: x(*)
@@ -158,10 +145,10 @@ subroutine DLASSQ( n, x, incx, scl, sumsq )
 !
 !  Quick return if possible
 !
-   if( LA_ISNAN(scl) .or. LA_ISNAN(sumsq) ) return
-   if( sumsq == zero ) scl = one
-   if( scl == zero ) then
-      scl = one
+   if( LA_ISNAN(scale) .or. LA_ISNAN(sumsq) ) return
+   if( sumsq == zero ) scale = one
+   if( scale == zero ) then
+      scale = one
       sumsq = zero
    end if
    if (n <= 0) then
@@ -198,15 +185,27 @@ subroutine DLASSQ( n, x, incx, scl, sumsq )
 !  Put the existing sum of squares into one of the accumulators
 !
    if( sumsq > zero ) then
-      ax = scl*sqrt( sumsq )
+      ax = scale*sqrt( sumsq )
       if (ax > tbig) then
-!        We assume scl >= sqrt( TINY*EPS ) / sbig
-         abig = abig + (scl*sbig)**2 * sumsq
+         if (scale > one) then
+            scale = scale * sbig
+            abig = abig + scale * (scale * sumsq)
+         else
+            ! sumsq > tbig^2 => (sbig * (sbig * sumsq)) is representable
+            abig = abig + scale * (scale * (sbig * (sbig * sumsq)))
+         end if
       else if (ax < tsml) then
-!        We assume scl <= sqrt( HUGE ) / ssml
-         if (notbig) asml = asml + (scl*ssml)**2 * sumsq
+         if (notbig) then
+            if (scale < one) then
+               scale = scale * ssml
+               asml = asml + scale * (scale * sumsq)
+            else
+               ! sumsq < tsml^2 => (ssml * (ssml * sumsq)) is representable
+               asml = asml + scale * (scale * (ssml * (ssml * sumsq)))
+            end if
+         end if
       else
-         amed = amed + scl**2 * sumsq
+         amed = amed + scale * (scale * sumsq)
       end if
    end if
 !
@@ -220,7 +219,7 @@ subroutine DLASSQ( n, x, incx, scl, sumsq )
       if (amed > zero .or. LA_ISNAN(amed)) then
          abig = abig + (amed*sbig)*sbig
       end if
-      scl = one / sbig
+      scale = one / sbig
       sumsq = abig
    else if (asml > zero) then
 !
@@ -236,17 +235,17 @@ subroutine DLASSQ( n, x, incx, scl, sumsq )
             ymin = asml
             ymax = amed
          end if
-         scl = one
+         scale = one
          sumsq = ymax**2*( one + (ymin/ymax)**2 )
       else
-         scl = one / ssml
+         scale = one / ssml
          sumsq = asml
       end if
    else
 !
 !     Otherwise all values are mid-range or zero
 !
-      scl = one
+      scale = one
       sumsq = amed
    end if
    return
diff --git a/lapack-netlib/SRC/dlasv2.f b/lapack-netlib/SRC/dlasv2.f
index 64a06dee1..cb2bf51c4 100644
--- a/lapack-netlib/SRC/dlasv2.f
+++ b/lapack-netlib/SRC/dlasv2.f
@@ -124,9 +124,7 @@
 *>  infinite.
 *>
 *>  Overflow will not occur unless the largest singular value itself
-*>  overflows or is within a few ulps of overflow. (On machines with
-*>  partial overflow, like the Cray, overflow may occur if the largest
-*>  singular value is within a factor of 2 of overflow.)
+*>  overflows or is within a few ulps of overflow.
 *>
 *>  Underflow is harmless if underflow is gradual. Otherwise, results
 *>  may correspond to a matrix modified by perturbations of size near
diff --git a/lapack-netlib/SRC/dlaswlq.f b/lapack-netlib/SRC/dlaswlq.f
index fb8857145..636c12dc8 100644
--- a/lapack-netlib/SRC/dlaswlq.f
+++ b/lapack-netlib/SRC/dlaswlq.f
@@ -99,18 +99,22 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
-*>
+*>          (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
+*>
 *> \param[in] LWORK
 *> \verbatim
-*>          The dimension of the array WORK.  LWORK >= MB*M.
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MB*M, otherwise.
+*>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>          only calculates the optimal size of the WORK array, returns
+*>          only calculates the minimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
-*>
 *> \endverbatim
+*>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
@@ -158,33 +162,37 @@
 *>     SIAM J. Sci. Comput, vol. 34, no. 1, 2012
 *> \endverbatim
 *>
+*> \ingroup laswlq
+*>
 *  =====================================================================
       SUBROUTINE DLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK,
-     $                  INFO)
+     $                    INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
 *
 *     .. Scalar Arguments ..
-      INTEGER           INFO, LDA, M, N, MB, NB, LWORK, LDT
+      INTEGER            INFO, LDA, M, N, MB, NB, LWORK, LDT
 *     ..
 *     .. Array Arguments ..
-      DOUBLE PRECISION  A( LDA, * ), WORK( * ), T( LDT, *)
+      DOUBLE PRECISION   A( LDA, * ), WORK( * ), T( LDT, * )
 *     ..
 *
 *  =====================================================================
 *
 *     ..
 *     .. Local Scalars ..
-      LOGICAL    LQUERY
-      INTEGER    I, II, KK, CTR
+      LOGICAL            LQUERY
+      INTEGER            I, II, KK, CTR, MINMN, LWMIN
 *     ..
 *     .. EXTERNAL FUNCTIONS ..
       LOGICAL            LSAME
       EXTERNAL           LSAME
+*     ..
 *     .. EXTERNAL SUBROUTINES ..
       EXTERNAL           DGELQT, DTPLQT, XERBLA
+*     ..
 *     .. INTRINSIC FUNCTIONS ..
       INTRINSIC          MAX, MIN, MOD
 *     ..
@@ -195,12 +203,19 @@
       INFO = 0
 *
       LQUERY = ( LWORK.EQ.-1 )
+*
+      MINMN = MIN( M, N )
+      IF( MINMN.EQ.0 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = M*MB
+      END IF
 *
       IF( M.LT.0 ) THEN
         INFO = -1
       ELSE IF( N.LT.0 .OR. N.LT.M ) THEN
         INFO = -2
-      ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 )) THEN
+      ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 ) ) THEN
         INFO = -3
       ELSE IF( NB.LT.0 ) THEN
         INFO = -4
@@ -208,60 +223,62 @@
         INFO = -6
       ELSE IF( LDT.LT.MB ) THEN
         INFO = -8
-      ELSE IF( ( LWORK.LT.M*MB) .AND. (.NOT.LQUERY) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN
         INFO = -10
       END IF
-      IF( INFO.EQ.0)  THEN
-      WORK(1) = MB*M
+*
+      IF( INFO.EQ.0 ) THEN
+        WORK( 1 ) = LWMIN
       END IF
 *
       IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'DLASWLQ', -INFO )
         RETURN
-      ELSE IF (LQUERY) THEN
-       RETURN
+      ELSE IF( LQUERY ) THEN
+        RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( MIN(M,N).EQ.0 ) THEN
-          RETURN
+      IF( MINMN.EQ.0 ) THEN
+        RETURN
       END IF
 *
 *     The LQ Decomposition
 *
-       IF((M.GE.N).OR.(NB.LE.M).OR.(NB.GE.N)) THEN
-        CALL DGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO)
+      IF( (M.GE.N) .OR. (NB.LE.M) .OR. (NB.GE.N) ) THEN
+        CALL DGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO )
         RETURN
-       END IF
+      END IF
 *
-       KK = MOD((N-M),(NB-M))
-       II=N-KK+1
+      KK = MOD((N-M),(NB-M))
+      II = N-KK+1
 *
-*      Compute the LQ factorization of the first block A(1:M,1:NB)
+*     Compute the LQ factorization of the first block A(1:M,1:NB)
 *
-       CALL DGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO)
-       CTR = 1
+      CALL DGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO )
+      CTR = 1
 *
-       DO I = NB+1, II-NB+M , (NB-M)
+      DO I = NB+1, II-NB+M, (NB-M)
 *
-*      Compute the QR factorization of the current block A(1:M,I:I+NB-M)
+*       Compute the LQ factorization of the current block A(1:M,I:I+NB-M)
 *
-         CALL DTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ),
-     $                  LDA, T(1, CTR * M + 1),
-     $                  LDT, WORK, INFO )
-         CTR = CTR + 1
-       END DO
+        CALL DTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ),
+     $                 LDA, T(1, CTR * M + 1),
+     $                 LDT, WORK, INFO )
+        CTR = CTR + 1
+      END DO
 *
 *     Compute the QR factorization of the last block A(1:M,II:N)
 *
-       IF (II.LE.N) THEN
+      IF( II.LE.N ) THEN
         CALL DTPLQT( M, KK, 0, MB, A(1,1), LDA, A( 1, II ),
-     $                  LDA, T(1, CTR * M + 1), LDT,
-     $                  WORK, INFO )
-       END IF
+     $                 LDA, T(1, CTR * M + 1), LDT,
+     $                 WORK, INFO )
+      END IF
+*
+      WORK( 1 ) = LWMIN
 *
-      WORK( 1 ) = M * MB
       RETURN
 *
 *     End of DLASWLQ
diff --git a/lapack-netlib/SRC/dlatrs.f b/lapack-netlib/SRC/dlatrs.f
index be156bee2..b282f4227 100644
--- a/lapack-netlib/SRC/dlatrs.f
+++ b/lapack-netlib/SRC/dlatrs.f
@@ -261,6 +261,9 @@
       DOUBLE PRECISION   BIGNUM, GROW, REC, SMLNUM, SUMJ, TJJ, TJJS,
      $                   TMAX, TSCAL, USCAL, XBND, XJ, XMAX
 *     ..
+*     .. Local Arrays ..
+      DOUBLE PRECISION   WORK(1)
+*     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            IDAMAX
@@ -362,7 +365,7 @@
 *              A is upper triangular.
 *
                DO J = 2, N
-                  TMAX = MAX( DLANGE( 'M', J-1, 1, A( 1, J ), 1, SUMJ ),
+                  TMAX = MAX( DLANGE( 'M', J-1, 1, A( 1, J ), 1, WORK ),
      $                        TMAX )
                END DO
             ELSE
@@ -371,7 +374,7 @@
 *
                DO J = 1, N - 1
                   TMAX = MAX( DLANGE( 'M', N-J, 1, A( J+1, J ), 1,
-     $                        SUMJ ), TMAX )
+     $                        WORK ), TMAX )
                END DO
             END IF
 *
diff --git a/lapack-netlib/SRC/dlatrs3.f b/lapack-netlib/SRC/dlatrs3.f
index b4a98bc78..d18675b2d 100644
--- a/lapack-netlib/SRC/dlatrs3.f
+++ b/lapack-netlib/SRC/dlatrs3.f
@@ -151,13 +151,17 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is DOUBLE PRECISION array, dimension (LWORK).
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)).
 *>          On exit, if INFO = 0, WORK(1) returns the optimal size of
 *>          WORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
+*> \verbatim
 *>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>
+*>          If MIN(N,NRHS) = 0, LWORK >= 1, else
 *>          LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where
 *>          NBA = (N + NB - 1)/NB and NB is the optimal block size.
 *>
@@ -165,6 +169,7 @@
 *>          only calculates the optimal dimensions of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
+*> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
@@ -181,7 +186,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleOTHERauxiliary
+*> \ingroup latrs3
 *> \par Further Details:
 *  =====================
 *  \verbatim
@@ -253,7 +258,7 @@
       LOGICAL            LQUERY, NOTRAN, NOUNIT, UPPER
       INTEGER            AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J,
      $                   JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2,
-     $                   LANRM, LDS, LSCALE, NB, NBA, NBX, RHS
+     $                   LANRM, LDS, LSCALE, NB, NBA, NBX, RHS, LWMIN
       DOUBLE PRECISION   ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC,
      $                   SCAMIN, SMLNUM, TMAX
 *     ..
@@ -292,15 +297,24 @@
 *     row. WORK( I+KK*LDS ) is the scale factor of the vector
 *     segment associated with the I-th block row and the KK-th vector
 *     in the block column.
+*
       LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) )
       LDS = NBA
+*
 *     The second part stores upper bounds of the triangular A. There are
 *     a total of NBA x NBA blocks, of which only the upper triangular
 *     part or the lower triangular part is referenced. The upper bound of
 *     the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ).
+*
       LANRM = NBA * NBA
       AWRK = LSCALE
-      WORK( 1 ) = LSCALE + LANRM
+*
+      IF( MIN( N, NRHS ).EQ.0 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = LSCALE + LANRM
+      END IF
+      WORK( 1 ) = LWMIN
 *
 *     Test the input parameters
 *
@@ -322,7 +336,7 @@
          INFO = -8
       ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
          INFO = -10
-      ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN
+      ELSE IF( .NOT.LQUERY .AND. LWORK.LT.LWMIN ) THEN
          INFO = -14
       END IF
       IF( INFO.NE.0 ) THEN
@@ -574,7 +588,7 @@
 *              Prepare the linear update to be executed with GEMM.
 *              For each column, compute a consistent scaling, a
 *              scaling factor to survive the linear update, and
-*              rescale the column segments, if necesssary. Then
+*              rescale the column segments, if necessary. Then
 *              the linear update is safely executed.
 *
                DO KK = 1, K2-K1
@@ -649,6 +663,9 @@
             END IF
          END DO
       END DO
+*
+      WORK( 1 ) = LWMIN
+*
       RETURN
 *
 *     End of DLATRS3
diff --git a/lapack-netlib/SRC/dlatsqr.f b/lapack-netlib/SRC/dlatsqr.f
index f5cbb76bb..0000aab68 100644
--- a/lapack-netlib/SRC/dlatsqr.f
+++ b/lapack-netlib/SRC/dlatsqr.f
@@ -101,14 +101,18 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*>          (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
-*>          The dimension of the array WORK.  LWORK >= NB*N.
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= NB*N, otherwise.
+*>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>          only calculates the optimal size of the WORK array, returns
+*>          only calculates the minimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
 *> \endverbatim
@@ -160,27 +164,29 @@
 *>     SIAM J. Sci. Comput, vol. 34, no. 1, 2012
 *> \endverbatim
 *>
+*> \ingroup latsqr
+*>
 *  =====================================================================
       SUBROUTINE DLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK,
-     $                    LWORK, INFO)
+     $                    LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
 *
 *     .. Scalar Arguments ..
-      INTEGER           INFO, LDA, M, N, MB, NB, LDT, LWORK
+      INTEGER            INFO, LDA, M, N, MB, NB, LDT, LWORK
 *     ..
 *     .. Array Arguments ..
-      DOUBLE PRECISION  A( LDA, * ), WORK( * ), T(LDT, *)
+      DOUBLE PRECISION   A( LDA, * ), WORK( * ), T( LDT, * )
 *     ..
 *
 *  =====================================================================
 *
 *     ..
 *     .. Local Scalars ..
-      LOGICAL    LQUERY
-      INTEGER    I, II, KK, CTR
+      LOGICAL            LQUERY
+      INTEGER            I, II, KK, CTR, MINMN, LWMIN
 *     ..
 *     .. EXTERNAL FUNCTIONS ..
       LOGICAL            LSAME
@@ -197,6 +203,13 @@
       INFO = 0
 *
       LQUERY = ( LWORK.EQ.-1 )
+*
+      MINMN = MIN( M, N )
+      IF( MINMN.EQ.0 ) THEN
+        LWMIN = 1
+      ELSE
+        LWMIN = N*NB
+      END IF
 *
       IF( M.LT.0 ) THEN
         INFO = -1
@@ -204,65 +217,67 @@
         INFO = -2
       ELSE IF( MB.LT.1 ) THEN
         INFO = -3
-      ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 )) THEN
+      ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 ) ) THEN
         INFO = -4
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
         INFO = -6
       ELSE IF( LDT.LT.NB ) THEN
         INFO = -8
-      ELSE IF( LWORK.LT.(N*NB) .AND. (.NOT.LQUERY) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN
         INFO = -10
       END IF
-      IF( INFO.EQ.0)  THEN
-        WORK(1) = NB*N
+*
+      IF( INFO.EQ.0 ) THEN
+        WORK( 1 ) = LWMIN
       END IF
+*
       IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'DLATSQR', -INFO )
         RETURN
-      ELSE IF (LQUERY) THEN
-       RETURN
+      ELSE IF( LQUERY ) THEN
+        RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( MIN(M,N).EQ.0 ) THEN
-          RETURN
+      IF( MINMN.EQ.0 ) THEN
+        RETURN
       END IF
 *
 *     The QR Decomposition
 *
-       IF ((MB.LE.N).OR.(MB.GE.M)) THEN
-         CALL DGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO)
-         RETURN
-       END IF
+      IF( (MB.LE.N) .OR. (MB.GE.M) ) THEN
+        CALL DGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO )
+        RETURN
+      END IF
 *
-       KK = MOD((M-N),(MB-N))
-       II=M-KK+1
+      KK = MOD((M-N),(MB-N))
+      II = M-KK+1
 *
-*      Compute the QR factorization of the first block A(1:MB,1:N)
+*     Compute the QR factorization of the first block A(1:MB,1:N)
 *
-       CALL DGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO )
+      CALL DGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO )
 *
-       CTR = 1
-       DO I = MB+1, II-MB+N ,  (MB-N)
+      CTR = 1
+      DO I = MB+1, II-MB+N, (MB-N)
 *
-*      Compute the QR factorization of the current block A(I:I+MB-N,1:N)
+*       Compute the QR factorization of the current block A(I:I+MB-N,1:N)
 *
-         CALL DTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA,
-     $                 T(1, CTR * N + 1),
-     $                  LDT, WORK, INFO )
-         CTR = CTR + 1
-       END DO
+        CALL DTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA,
+     $                T(1, CTR * N + 1),
+     $                LDT, WORK, INFO )
+        CTR = CTR + 1
+      END DO
 *
-*      Compute the QR factorization of the last block A(II:M,1:N)
+*     Compute the QR factorization of the last block A(II:M,1:N)
 *
-       IF (II.LE.M) THEN
-         CALL DTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA,
-     $                 T(1, CTR * N + 1), LDT,
-     $                  WORK, INFO )
-       END IF
+      IF( II.LE.M ) THEN
+        CALL DTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA,
+     $                T(1, CTR * N + 1), LDT,
+     $                WORK, INFO )
+      END IF
 *
-      WORK( 1 ) = N*NB
+      WORK( 1 ) = LWMIN
       RETURN
 *
 *     End of DLATSQR
diff --git a/lapack-netlib/SRC/dorbdb1.f b/lapack-netlib/SRC/dorbdb1.f
index dae18f5df..b5b2d1362 100644
--- a/lapack-netlib/SRC/dorbdb1.f
+++ b/lapack-netlib/SRC/dorbdb1.f
@@ -37,7 +37,7 @@
 *>\verbatim
 *>
 *> DORBDB1 simultaneously bidiagonalizes the blocks of a tall and skinny
-*> matrix X with orthonomal columns:
+*> matrix X with orthonormal columns:
 *>
 *>                            [ B11 ]
 *>      [ X11 ]   [ P1 |    ] [  0  ]
diff --git a/lapack-netlib/SRC/dorbdb2.f b/lapack-netlib/SRC/dorbdb2.f
index a0dacbb16..0b4ad732c 100644
--- a/lapack-netlib/SRC/dorbdb2.f
+++ b/lapack-netlib/SRC/dorbdb2.f
@@ -37,7 +37,7 @@
 *>\verbatim
 *>
 *> DORBDB2 simultaneously bidiagonalizes the blocks of a tall and skinny
-*> matrix X with orthonomal columns:
+*> matrix X with orthonormal columns:
 *>
 *>                            [ B11 ]
 *>      [ X11 ]   [ P1 |    ] [  0  ]
diff --git a/lapack-netlib/SRC/dorbdb3.f b/lapack-netlib/SRC/dorbdb3.f
index 7b3727956..79b10a5d3 100644
--- a/lapack-netlib/SRC/dorbdb3.f
+++ b/lapack-netlib/SRC/dorbdb3.f
@@ -37,7 +37,7 @@
 *>\verbatim
 *>
 *> DORBDB3 simultaneously bidiagonalizes the blocks of a tall and skinny
-*> matrix X with orthonomal columns:
+*> matrix X with orthonormal columns:
 *>
 *>                            [ B11 ]
 *>      [ X11 ]   [ P1 |    ] [  0  ]
diff --git a/lapack-netlib/SRC/dorbdb4.f b/lapack-netlib/SRC/dorbdb4.f
index 08604be45..985be3277 100644
--- a/lapack-netlib/SRC/dorbdb4.f
+++ b/lapack-netlib/SRC/dorbdb4.f
@@ -38,7 +38,7 @@
 *>\verbatim
 *>
 *> DORBDB4 simultaneously bidiagonalizes the blocks of a tall and skinny
-*> matrix X with orthonomal columns:
+*> matrix X with orthonormal columns:
 *>
 *>                            [ B11 ]
 *>      [ X11 ]   [ P1 |    ] [  0  ]
diff --git a/lapack-netlib/SRC/dorbdb5.f b/lapack-netlib/SRC/dorbdb5.f
index 6e057a05f..cbd58ae54 100644
--- a/lapack-netlib/SRC/dorbdb5.f
+++ b/lapack-netlib/SRC/dorbdb5.f
@@ -148,7 +148,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleOTHERcomputational
+*> \ingroup unbdb5
 *
 *  =====================================================================
       SUBROUTINE DORBDB5( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
@@ -169,18 +169,21 @@
 *  =====================================================================
 *
 *     .. Parameters ..
+      DOUBLE PRECISION   REALZERO
+      PARAMETER          ( REALZERO = 0.0D0 )
       DOUBLE PRECISION   ONE, ZERO
       PARAMETER          ( ONE = 1.0D0, ZERO = 0.0D0 )
 *     ..
 *     .. Local Scalars ..
       INTEGER            CHILDINFO, I, J
+      DOUBLE PRECISION   EPS, NORM, SCL, SSQ
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DORBDB6, XERBLA
+      EXTERNAL           DLASSQ, DORBDB6, DSCAL, XERBLA
 *     ..
 *     .. External Functions ..
-      DOUBLE PRECISION   DNRM2
-      EXTERNAL           DNRM2
+      DOUBLE PRECISION   DLAMCH, DNRM2
+      EXTERNAL           DLAMCH, DNRM2
 *     ..
 *     .. Intrinsic Function ..
       INTRINSIC          MAX
@@ -213,16 +216,33 @@
          RETURN
       END IF
 *
-*     Project X onto the orthogonal complement of Q
+      EPS = DLAMCH( 'Precision' )
 *
-      CALL DORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, LDQ2,
-     $              WORK, LWORK, CHILDINFO )
+*     Project X onto the orthogonal complement of Q if norm(X) > N*EPS
 *
-*     If the projection is nonzero, then return
+      SCL = REALZERO
+      SSQ = REALZERO
+      CALL DLASSQ( M1, X1, INCX1, SCL, SSQ )
+      CALL DLASSQ( M2, X2, INCX2, SCL, SSQ )
+      NORM = SCL * SQRT( SSQ )
 *
-      IF( DNRM2(M1,X1,INCX1) .NE. ZERO
-     $    .OR. DNRM2(M2,X2,INCX2) .NE. ZERO ) THEN
-         RETURN
+      IF( NORM .GT. N * EPS ) THEN
+*        Scale vector to unit norm to avoid problems in the caller code.
+*        Computing the reciprocal is undesirable but
+*         * xLASCL cannot be used because of the vector increments and
+*         * the round-off error has a negligible impact on
+*           orthogonalization.
+         CALL DSCAL( M1, ONE / NORM, X1, INCX1 )
+         CALL DSCAL( M2, ONE / NORM, X2, INCX2 )
+         CALL DORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
+     $              LDQ2, WORK, LWORK, CHILDINFO )
+*
+*        If the projection is nonzero, then return
+*
+         IF( DNRM2(M1,X1,INCX1) .NE. REALZERO
+     $       .OR. DNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN
+            RETURN
+         END IF
       END IF
 *
 *     Project each standard basis vector e_1,...,e_M1 in turn, stopping
@@ -238,8 +258,8 @@
          END DO
          CALL DORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
      $                 LDQ2, WORK, LWORK, CHILDINFO )
-         IF( DNRM2(M1,X1,INCX1) .NE. ZERO
-     $       .OR. DNRM2(M2,X2,INCX2) .NE. ZERO ) THEN
+         IF( DNRM2(M1,X1,INCX1) .NE. REALZERO
+     $       .OR. DNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN
             RETURN
          END IF
       END DO
@@ -257,8 +277,8 @@
          X2(I) = ONE
          CALL DORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
      $                 LDQ2, WORK, LWORK, CHILDINFO )
-         IF( DNRM2(M1,X1,INCX1) .NE. ZERO
-     $       .OR. DNRM2(M2,X2,INCX2) .NE. ZERO ) THEN
+         IF( DNRM2(M1,X1,INCX1) .NE. REALZERO
+     $       .OR. DNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN
             RETURN
          END IF
       END DO
diff --git a/lapack-netlib/SRC/dorbdb6.f b/lapack-netlib/SRC/dorbdb6.f
index 45c8ba8a2..3e356d001 100644
--- a/lapack-netlib/SRC/dorbdb6.f
+++ b/lapack-netlib/SRC/dorbdb6.f
@@ -41,9 +41,8 @@
 *> with respect to the columns of
 *>      Q = [ Q1 ] .
 *>          [ Q2 ]
-*> The Euclidean norm of X must be one and the columns of Q must be
-*> orthonormal. The orthogonalized vector will be zero if and only if it
-*> lies entirely in the range of Q.
+*> The columns of Q must be orthonormal. The orthogonalized vector will
+*> be zero if and only if it lies entirely in the range of Q.
 *>
 *> The projection is computed with at most two iterations of the
 *> classical Gram-Schmidt algorithm, see
@@ -152,7 +151,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleOTHERcomputational
+*> \ingroup unbdb6
 *
 *  =====================================================================
       SUBROUTINE DORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
@@ -174,7 +173,7 @@
 *
 *     .. Parameters ..
       DOUBLE PRECISION   ALPHA, REALONE, REALZERO
-      PARAMETER          ( ALPHA = 0.01D0, REALONE = 1.0D0,
+      PARAMETER          ( ALPHA = 0.83D0, REALONE = 1.0D0,
      $                     REALZERO = 0.0D0 )
       DOUBLE PRECISION   NEGONE, ONE, ZERO
       PARAMETER          ( NEGONE = -1.0D0, ONE = 1.0D0, ZERO = 0.0D0 )
@@ -222,14 +221,16 @@
 *
       EPS = DLAMCH( 'Precision' )
 *
+*     Compute the Euclidean norm of X
+*
+      SCL = REALZERO
+      SSQ = REALZERO
+      CALL DLASSQ( M1, X1, INCX1, SCL, SSQ )
+      CALL DLASSQ( M2, X2, INCX2, SCL, SSQ )
+      NORM = SCL * SQRT( SSQ )
+*
 *     First, project X onto the orthogonal complement of Q's column
 *     space
-*
-*     Christoph Conrads: In debugging mode the norm should be computed
-*     and an assertion added comparing the norm with one. Alas, Fortran
-*     never made it into 1989 when assert() was introduced into the C
-*     programming language.
-      NORM = REALONE
 *
       IF( M1 .EQ. 0 ) THEN
          DO I = 1, N
diff --git a/lapack-netlib/SRC/dorgtsqr.f b/lapack-netlib/SRC/dorgtsqr.f
index 34d96b238..2e5cd40be 100644
--- a/lapack-netlib/SRC/dorgtsqr.f
+++ b/lapack-netlib/SRC/dorgtsqr.f
@@ -133,6 +133,7 @@
 *>
 *> \param[in] LWORK
 *> \verbatim
+*>          LWORK is INTEGER
 *>          The dimension of the array WORK.  LWORK >= (M+NB)*N.
 *>          If LWORK = -1, then a workspace query is assumed.
 *>          The routine only calculates the optimal size of the WORK
@@ -301,4 +302,4 @@
 *
 *     End of DORGTSQR
 *
-      END
\ No newline at end of file
+      END
diff --git a/lapack-netlib/SRC/dpbsv.f b/lapack-netlib/SRC/dpbsv.f
index a52e78309..2d8f06440 100644
--- a/lapack-netlib/SRC/dpbsv.f
+++ b/lapack-netlib/SRC/dpbsv.f
@@ -119,9 +119,9 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i of A is not
-*>                positive definite, so the factorization could not be
-*>                completed, and the solution has not been computed.
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                of A is not positive, so the factorization could not
+*>                be completed, and the solution has not been computed.
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/dpbsvx.f b/lapack-netlib/SRC/dpbsvx.f
index 1bf526fc7..142dda5da 100644
--- a/lapack-netlib/SRC/dpbsvx.f
+++ b/lapack-netlib/SRC/dpbsvx.f
@@ -71,7 +71,7 @@
 *>    where U is an upper triangular band matrix, and L is a lower
 *>    triangular band matrix.
 *>
-*> 3. If the leading i-by-i principal minor is not positive definite,
+*> 3. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A.  If the reciprocal of the condition number is less than machine
@@ -281,10 +281,10 @@
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *>          > 0:  if INFO = i, and i is
-*>                <= N:  the leading minor of order i of A is
-*>                       not positive definite, so the factorization
-*>                       could not be completed, and the solution has not
-*>                       been computed. RCOND = 0 is returned.
+*>                <= N:  the leading principal minor of order i of A
+*>                       is not positive, so the factorization could not
+*>                       be completed, and the solution has not been
+*>                       computed. RCOND = 0 is returned.
 *>                = N+1: U is nonsingular, but RCOND is less than machine
 *>                       precision, meaning that the matrix is singular
 *>                       to working precision.  Nevertheless, the
diff --git a/lapack-netlib/SRC/dpbtf2.f b/lapack-netlib/SRC/dpbtf2.f
index 534629294..1284c9ec8 100644
--- a/lapack-netlib/SRC/dpbtf2.f
+++ b/lapack-netlib/SRC/dpbtf2.f
@@ -97,8 +97,8 @@
 *>          INFO is INTEGER
 *>          = 0: successful exit
 *>          < 0: if INFO = -k, the k-th argument had an illegal value
-*>          > 0: if INFO = k, the leading minor of order k is not
-*>               positive definite, and the factorization could not be
+*>          > 0: if INFO = k, the leading principal minor of order k
+*>               is not positive, and the factorization could not be
 *>               completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/dpbtrf.f b/lapack-netlib/SRC/dpbtrf.f
index 8256f8938..29e9aaecd 100644
--- a/lapack-netlib/SRC/dpbtrf.f
+++ b/lapack-netlib/SRC/dpbtrf.f
@@ -92,8 +92,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/dpftrf.f b/lapack-netlib/SRC/dpftrf.f
index 980debaf2..312dcdf84 100644
--- a/lapack-netlib/SRC/dpftrf.f
+++ b/lapack-netlib/SRC/dpftrf.f
@@ -91,8 +91,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/dposv.f b/lapack-netlib/SRC/dposv.f
index ee2988e6f..cb76e9977 100644
--- a/lapack-netlib/SRC/dposv.f
+++ b/lapack-netlib/SRC/dposv.f
@@ -110,9 +110,9 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i of A is not
-*>                positive definite, so the factorization could not be
-*>                completed, and the solution has not been computed.
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                of A is not positive, so the factorization could not
+*>                be completed, and the solution has not been computed.
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/dposvx.f b/lapack-netlib/SRC/dposvx.f
index 4a0b9d605..faffff803 100644
--- a/lapack-netlib/SRC/dposvx.f
+++ b/lapack-netlib/SRC/dposvx.f
@@ -71,7 +71,7 @@
 *>    where U is an upper triangular matrix and L is a lower triangular
 *>    matrix.
 *>
-*> 3. If the leading i-by-i principal minor is not positive definite,
+*> 3. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A.  If the reciprocal of the condition number is less than machine
@@ -277,10 +277,10 @@
 *>          = 0: successful exit
 *>          < 0: if INFO = -i, the i-th argument had an illegal value
 *>          > 0: if INFO = i, and i is
-*>                <= N:  the leading minor of order i of A is
-*>                       not positive definite, so the factorization
-*>                       could not be completed, and the solution has not
-*>                       been computed. RCOND = 0 is returned.
+*>                <= N:  the leading principal minor of order i of A
+*>                       is not positive, so the factorization could not
+*>                       be completed, and the solution has not been
+*>                       computed. RCOND = 0 is returned.
 *>                = N+1: U is nonsingular, but RCOND is less than machine
 *>                       precision, meaning that the matrix is singular
 *>                       to working precision.  Nevertheless, the
diff --git a/lapack-netlib/SRC/dposvxx.f b/lapack-netlib/SRC/dposvxx.f
index e74b23c80..b5336cf63 100644
--- a/lapack-netlib/SRC/dposvxx.f
+++ b/lapack-netlib/SRC/dposvxx.f
@@ -88,7 +88,7 @@
 *>    where U is an upper triangular matrix and L is a lower triangular
 *>    matrix.
 *>
-*>    3. If the leading i-by-i principal minor is not positive definite,
+*>    3. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A (see argument RCOND).  If the reciprocal of the condition number
diff --git a/lapack-netlib/SRC/dpotf2.f b/lapack-netlib/SRC/dpotf2.f
index 08fa4957f..30da5c3f3 100644
--- a/lapack-netlib/SRC/dpotf2.f
+++ b/lapack-netlib/SRC/dpotf2.f
@@ -89,8 +89,8 @@
 *>          INFO is INTEGER
 *>          = 0: successful exit
 *>          < 0: if INFO = -k, the k-th argument had an illegal value
-*>          > 0: if INFO = k, the leading minor of order k is not
-*>               positive definite, and the factorization could not be
+*>          > 0: if INFO = k, the leading principal minor of order k
+*>               is not positive, and the factorization could not be
 *>               completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/dpotrf.f b/lapack-netlib/SRC/dpotrf.f
index 1679fc3cd..65509feb8 100644
--- a/lapack-netlib/SRC/dpotrf.f
+++ b/lapack-netlib/SRC/dpotrf.f
@@ -87,8 +87,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/dpotrf2.f b/lapack-netlib/SRC/dpotrf2.f
index 6c28ce6d6..aaf9b9c58 100644
--- a/lapack-netlib/SRC/dpotrf2.f
+++ b/lapack-netlib/SRC/dpotrf2.f
@@ -86,8 +86,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/dppsv.f b/lapack-netlib/SRC/dppsv.f
index 435703b08..1888005d9 100644
--- a/lapack-netlib/SRC/dppsv.f
+++ b/lapack-netlib/SRC/dppsv.f
@@ -104,9 +104,9 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i of A is not
-*>                positive definite, so the factorization could not be
-*>                completed, and the solution has not been computed.
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                of A is not positive, so the factorization could not
+*>                be completed, and the solution has not been computed.
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/dppsvx.f b/lapack-netlib/SRC/dppsvx.f
index cb41d39ee..3b08fc821 100644
--- a/lapack-netlib/SRC/dppsvx.f
+++ b/lapack-netlib/SRC/dppsvx.f
@@ -69,7 +69,7 @@
 *>    where U is an upper triangular matrix and L is a lower triangular
 *>    matrix.
 *>
-*> 3. If the leading i-by-i principal minor is not positive definite,
+*> 3. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A.  If the reciprocal of the condition number is less than machine
@@ -262,10 +262,10 @@
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *>          > 0:  if INFO = i, and i is
-*>                <= N:  the leading minor of order i of A is
-*>                       not positive definite, so the factorization
-*>                       could not be completed, and the solution has not
-*>                       been computed. RCOND = 0 is returned.
+*>                <= N:  the leading principal minor of order i of A
+*>                       is not positive, so the factorization could not
+*>                       be completed, and the solution has not been
+*>                       computed. RCOND = 0 is returned.
 *>                = N+1: U is nonsingular, but RCOND is less than machine
 *>                       precision, meaning that the matrix is singular
 *>                       to working precision.  Nevertheless, the
diff --git a/lapack-netlib/SRC/dpptrf.f b/lapack-netlib/SRC/dpptrf.f
index 2d8de5110..d9eed910d 100644
--- a/lapack-netlib/SRC/dpptrf.f
+++ b/lapack-netlib/SRC/dpptrf.f
@@ -79,8 +79,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/dpteqr.f b/lapack-netlib/SRC/dpteqr.f
index aa1f1a80c..d07b065b0 100644
--- a/lapack-netlib/SRC/dpteqr.f
+++ b/lapack-netlib/SRC/dpteqr.f
@@ -123,8 +123,8 @@
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value.
 *>          > 0:  if INFO = i, and i is:
 *>                <= N  the Cholesky factorization of the matrix could
-*>                      not be performed because the i-th principal minor
-*>                      was not positive definite.
+*>                      not be performed because the leading principal
+*>                      minor of order i was not positive.
 *>                > N   the SVD algorithm failed to converge;
 *>                      if INFO = N+i, i off-diagonal elements of the
 *>                      bidiagonal factor did not converge to zero.
diff --git a/lapack-netlib/SRC/dptsv.f b/lapack-netlib/SRC/dptsv.f
index addc34b88..41d8cff15 100644
--- a/lapack-netlib/SRC/dptsv.f
+++ b/lapack-netlib/SRC/dptsv.f
@@ -93,8 +93,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the solution has not been
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the solution has not been
 *>                computed.  The factorization has not been completed
 *>                unless i = N.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dptsvx.f b/lapack-netlib/SRC/dptsvx.f
index 7fb6cf436..fcbf5aa3b 100644
--- a/lapack-netlib/SRC/dptsvx.f
+++ b/lapack-netlib/SRC/dptsvx.f
@@ -59,7 +59,7 @@
 *>    factorization can also be regarded as having the form
 *>    A = U**T*D*U.
 *>
-*> 2. If the leading i-by-i principal minor is not positive definite,
+*> 2. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A.  If the reciprocal of the condition number is less than machine
@@ -199,10 +199,10 @@
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *>          > 0:  if INFO = i, and i is
-*>                <= N:  the leading minor of order i of A is
-*>                       not positive definite, so the factorization
-*>                       could not be completed, and the solution has not
-*>                       been computed. RCOND = 0 is returned.
+*>                <= N:  the leading principal minor of order i of A
+*>                       is not positive, so the factorization could not
+*>                       be completed, and the solution has not been
+*>                       computed. RCOND = 0 is returned.
 *>                = N+1: U is nonsingular, but RCOND is less than machine
 *>                       precision, meaning that the matrix is singular
 *>                       to working precision.  Nevertheless, the
diff --git a/lapack-netlib/SRC/dpttrf.f b/lapack-netlib/SRC/dpttrf.f
index e0022e3ad..156e845a3 100644
--- a/lapack-netlib/SRC/dpttrf.f
+++ b/lapack-netlib/SRC/dpttrf.f
@@ -70,8 +70,8 @@
 *>          INFO is INTEGER
 *>          = 0: successful exit
 *>          < 0: if INFO = -k, the k-th argument had an illegal value
-*>          > 0: if INFO = k, the leading minor of order k is not
-*>               positive definite; if k < N, the factorization could not
+*>          > 0: if INFO = k, the leading principal minor of order k
+*>               is not positive; if k < N, the factorization could not
 *>               be completed, while if k = N, the factorization was
 *>               completed, but D(N) <= 0.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dsbevd.f b/lapack-netlib/SRC/dsbevd.f
index 3eb4ed8df..350c0a9f0 100644
--- a/lapack-netlib/SRC/dsbevd.f
+++ b/lapack-netlib/SRC/dsbevd.f
@@ -40,12 +40,6 @@
 *> a real symmetric band matrix A. If eigenvectors are desired, it uses
 *> a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/dsbevd_2stage.f b/lapack-netlib/SRC/dsbevd_2stage.f
index 45a64b478..82997c850 100644
--- a/lapack-netlib/SRC/dsbevd_2stage.f
+++ b/lapack-netlib/SRC/dsbevd_2stage.f
@@ -45,12 +45,6 @@
 *> the reduction to tridiagonal. If eigenvectors are desired, it uses
 *> a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/dsbgvd.f b/lapack-netlib/SRC/dsbgvd.f
index 30b016611..0ab3177ac 100644
--- a/lapack-netlib/SRC/dsbgvd.f
+++ b/lapack-netlib/SRC/dsbgvd.f
@@ -43,12 +43,6 @@
 *> banded, and B is also positive definite.  If eigenvectors are
 *> desired, it uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/dsbgvx.f b/lapack-netlib/SRC/dsbgvx.f
index 20de17931..53deae580 100644
--- a/lapack-netlib/SRC/dsbgvx.f
+++ b/lapack-netlib/SRC/dsbgvx.f
@@ -267,7 +267,7 @@
 *>                  Their indices are stored in IFAIL.
 *>          > N:  DPBSTF returned an error code; i.e.,
 *>                if INFO = N + i, for 1 <= i <= N, then the leading
-*>                minor of order i of B is not positive definite.
+*>                principal minor of order i of B is not positive.
 *>                The factorization of B could not be completed and
 *>                no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dspevd.f b/lapack-netlib/SRC/dspevd.f
index d9d6c8917..05aa91b03 100644
--- a/lapack-netlib/SRC/dspevd.f
+++ b/lapack-netlib/SRC/dspevd.f
@@ -40,12 +40,6 @@
 *> of a real symmetric matrix A in packed storage. If eigenvectors are
 *> desired, it uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/dspgv.f b/lapack-netlib/SRC/dspgv.f
index d8ec3b1a4..96041c301 100644
--- a/lapack-netlib/SRC/dspgv.f
+++ b/lapack-netlib/SRC/dspgv.f
@@ -139,7 +139,7 @@
 *>                    i off-diagonal elements of an intermediate
 *>                    tridiagonal form did not converge to zero.
 *>             > N:   if INFO = n + i, for 1 <= i <= n, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dspgvd.f b/lapack-netlib/SRC/dspgvd.f
index df215ae1a..24c2309c3 100644
--- a/lapack-netlib/SRC/dspgvd.f
+++ b/lapack-netlib/SRC/dspgvd.f
@@ -44,12 +44,6 @@
 *> positive definite.
 *> If eigenvectors are desired, it uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
@@ -184,7 +178,7 @@
 *>                    i off-diagonal elements of an intermediate
 *>                    tridiagonal form did not converge to zero;
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dspgvx.f b/lapack-netlib/SRC/dspgvx.f
index ec93147aa..5afd73d02 100644
--- a/lapack-netlib/SRC/dspgvx.f
+++ b/lapack-netlib/SRC/dspgvx.f
@@ -245,7 +245,7 @@
 *>                    i eigenvectors failed to converge.  Their indices
 *>                    are stored in array IFAIL.
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dsposv.f b/lapack-netlib/SRC/dsposv.f
index c3b8de0e3..0bd75698d 100644
--- a/lapack-netlib/SRC/dsposv.f
+++ b/lapack-netlib/SRC/dsposv.f
@@ -177,8 +177,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i of (DOUBLE
-*>                PRECISION) A is not positive definite, so the
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                of (DOUBLE PRECISION) A is not positive, so the
 *>                factorization could not be completed, and the solution
 *>                has not been computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dstedc.f b/lapack-netlib/SRC/dstedc.f
index 2ed84afaa..6d533664b 100644
--- a/lapack-netlib/SRC/dstedc.f
+++ b/lapack-netlib/SRC/dstedc.f
@@ -42,12 +42,6 @@
 *> found if DSYTRD or DSPTRD or DSBTRD has been used to reduce this
 *> matrix to tridiagonal form.
 *>
-*> This code makes very mild assumptions about floating point
-*> arithmetic. It will work on machines with a guard digit in
-*> add/subtract, or on those binary machines without guard digits
-*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
-*> It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.  See DLAED3 for details.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/dstegr.f b/lapack-netlib/SRC/dstegr.f
index 01ec101d8..598c60e8d 100644
--- a/lapack-netlib/SRC/dstegr.f
+++ b/lapack-netlib/SRC/dstegr.f
@@ -56,7 +56,7 @@
 *>
 *> Note : DSTEGR and DSTEMR work only on machines which follow
 *> IEEE-754 floating-point standard in their handling of infinities and
-*> NaNs.  Normal execution may create these exceptiona values and hence
+*> NaNs.  Normal execution may create these exceptional values and hence
 *> may abort due to a floating point exception in environments which
 *> do not conform to the IEEE-754 standard.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dstemr.f b/lapack-netlib/SRC/dstemr.f
index d0c71ddd9..44a33423e 100644
--- a/lapack-netlib/SRC/dstemr.f
+++ b/lapack-netlib/SRC/dstemr.f
@@ -303,7 +303,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleOTHERcomputational
+*> \ingroup stemr
 *
 *> \par Contributors:
 *  ==================
@@ -312,7 +312,8 @@
 *> Jim Demmel, University of California, Berkeley, USA \n
 *> Inderjit Dhillon, University of Texas, Austin, USA \n
 *> Osni Marques, LBNL/NERSC, USA \n
-*> Christof Voemel, University of California, Berkeley, USA
+*> Christof Voemel, University of California, Berkeley, USA \n
+*> Aravindh Krishnamoorthy, FAU, Erlangen, Germany \n
 *
 *  =====================================================================
       SUBROUTINE DSTEMR( JOBZ, RANGE, N, D, E, VL, VU, IL, IU,
@@ -344,7 +345,8 @@
      $                     MINRGP = 1.0D-3 )
 *     ..
 *     .. Local Scalars ..
-      LOGICAL            ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY
+      LOGICAL            ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY,
+     $                   LAESWAP
       INTEGER            I, IBEGIN, IEND, IFIRST, IIL, IINDBL, IINDW,
      $                   IINDWK, IINFO, IINSPL, IIU, ILAST, IN, INDD,
      $                   INDE2, INDERR, INDGP, INDGRS, INDWRK, ITMP,
@@ -380,6 +382,7 @@
 *
       LQUERY = ( ( LWORK.EQ.-1 ).OR.( LIWORK.EQ.-1 ) )
       ZQUERY = ( NZC.EQ.-1 )
+      LAESWAP = .FALSE.
 
 *     DSTEMR needs WORK of size 6*N, IWORK of size 3*N.
 *     In addition, DLARRE needs WORK of size 6*N, IWORK of size 5*N.
@@ -502,6 +505,15 @@
          ELSE IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
             CALL DLAEV2( D(1), E(1), D(2), R1, R2, CS, SN )
          END IF
+*        D/S/LAE2 and D/S/LAEV2 outputs satisfy |R1| >= |R2|. However,
+*        the following code requires R1 >= R2. Hence, we correct
+*        the order of R1, R2, CS, SN if R1 < R2 before further processing.
+         IF( R1.LT.R2 ) THEN
+            E(2) = R1
+            R1 = R2
+            R2 = E(2)
+            LAESWAP = .TRUE.
+         ENDIF
          IF( ALLEIG.OR.
      $      (VALEIG.AND.(R2.GT.WL).AND.
      $                  (R2.LE.WU)).OR.
@@ -509,8 +521,13 @@
             M = M+1
             W( M ) = R2
             IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
-               Z( 1, M ) = -SN
-               Z( 2, M ) = CS
+               IF( LAESWAP ) THEN
+                  Z( 1, M ) = CS
+                  Z( 2, M ) = SN
+               ELSE
+                  Z( 1, M ) = -SN
+                  Z( 2, M ) = CS
+               ENDIF
 *              Note: At most one of SN and CS can be zero.
                IF (SN.NE.ZERO) THEN
                   IF (CS.NE.ZERO) THEN
@@ -533,8 +550,13 @@
             M = M+1
             W( M ) = R1
             IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
-               Z( 1, M ) = CS
-               Z( 2, M ) = SN
+               IF( LAESWAP ) THEN
+                  Z( 1, M ) = -SN
+                  Z( 2, M ) = CS
+               ELSE
+                  Z( 1, M ) = CS
+                  Z( 2, M ) = SN
+               ENDIF
 *              Note: At most one of SN and CS can be zero.
                IF (SN.NE.ZERO) THEN
                   IF (CS.NE.ZERO) THEN
diff --git a/lapack-netlib/SRC/dstevd.f b/lapack-netlib/SRC/dstevd.f
index 507f39b2b..54717df3d 100644
--- a/lapack-netlib/SRC/dstevd.f
+++ b/lapack-netlib/SRC/dstevd.f
@@ -40,12 +40,6 @@
 *> real symmetric tridiagonal matrix. If eigenvectors are desired, it
 *> uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/dsyconvf.f b/lapack-netlib/SRC/dsyconvf.f
index 9e7a5af0e..005c98a03 100644
--- a/lapack-netlib/SRC/dsyconvf.f
+++ b/lapack-netlib/SRC/dsyconvf.f
@@ -39,7 +39,7 @@
 *> DSYTRF provided on entry in parameter A into the factorization
 *> output format used in DSYTRF_RK (or DSYTRF_BK) that is stored
 *> on exit in parameters A and E. It also converts in place details of
-*> the intechanges stored in IPIV from the format used in DSYTRF into
+*> the interchanges stored in IPIV from the format used in DSYTRF into
 *> the format used in DSYTRF_RK (or DSYTRF_BK).
 *>
 *> If parameter WAY = 'R':
@@ -48,7 +48,7 @@
 *> (or DSYTRF_BK) provided on entry in parameters A and E into
 *> the factorization output format used in DSYTRF that is stored
 *> on exit in parameter A. It also converts in place details of
-*> the intechanges stored in IPIV from the format used in DSYTRF_RK
+*> the interchanges stored in IPIV from the format used in DSYTRF_RK
 *> (or DSYTRF_BK) into the format used in DSYTRF.
 *> \endverbatim
 *
@@ -322,7 +322,7 @@
                   END IF
 *
 *                 Convert IPIV
-*                 There is no interchnge of rows i and and IPIV(i),
+*                 There is no interchange of rows i and IPIV(i),
 *                 so this should be reflected in IPIV format for
 *                 *SYTRF_RK ( or *SYTRF_BK)
 *
@@ -466,7 +466,7 @@
                   END IF
 *
 *                 Convert IPIV
-*                 There is no interchnge of rows i and and IPIV(i),
+*                 There is no interchange of rows i and IPIV(i),
 *                 so this should be reflected in IPIV format for
 *                 *SYTRF_RK ( or *SYTRF_BK)
 *
@@ -532,7 +532,7 @@
 *
 *           Revert VALUE
 *           Assign subdiagonal entries of D from array E to
-*           subgiagonal entries of A.
+*           subdiagonal entries of A.
 *
             I = 1
             DO WHILE ( I.LE.N-1 )
diff --git a/lapack-netlib/SRC/dsyconvf_rook.f b/lapack-netlib/SRC/dsyconvf_rook.f
index d7f529358..c3f2083bd 100644
--- a/lapack-netlib/SRC/dsyconvf_rook.f
+++ b/lapack-netlib/SRC/dsyconvf_rook.f
@@ -517,7 +517,7 @@
 *
 *           Revert VALUE
 *           Assign subdiagonal entries of D from array E to
-*           subgiagonal entries of A.
+*           subdiagonal entries of A.
 *
             I = 1
             DO WHILE ( I.LE.N-1 )
diff --git a/lapack-netlib/SRC/dsyev_2stage.f b/lapack-netlib/SRC/dsyev_2stage.f
index 50d51d992..286366bfe 100644
--- a/lapack-netlib/SRC/dsyev_2stage.f
+++ b/lapack-netlib/SRC/dsyev_2stage.f
@@ -20,7 +20,7 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, 
+*       SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
 *                                INFO )
 *
 *       IMPLICIT NONE
@@ -97,7 +97,7 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is DOUBLE PRECISION array, dimension LWORK
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
@@ -105,12 +105,12 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The length of the array WORK. LWORK >= 1, when N <= 1;
-*>          otherwise  
+*>          otherwise
 *>          If JOBZ = 'N' and N > 1, LWORK must be queried.
 *>                                   LWORK = MAX(1, dimension) where
 *>                                   dimension = max(stage1,stage2) + (KD+1)*N + 2*N
-*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
-*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB)
+*>                                               + max(2*KD*KD, KD*NTHREADS)
 *>                                               + (KD+1)*N + 2*N
 *>                                   where KD is the blocking size of the reduction,
 *>                                   FACTOPTNB is the blocking used by the QR or LQ
@@ -143,7 +143,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleSYeigen
+*> \ingroup heev_2stage
 *
 *> \par Further Details:
 *  =====================
@@ -161,7 +161,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -169,16 +169,16 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *
 *  =====================================================================
-      SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, 
+      SUBROUTINE DSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
      $                         INFO )
 *
       IMPLICIT NONE
@@ -305,7 +305,7 @@
       LLWORK  = LWORK - INDWRK + 1
 *
       CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK( INDE ),
-     $                    WORK( INDTAU ), WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDTAU ), WORK( INDHOUS ), LHTRD,
      $                    WORK( INDWRK ), LLWORK, IINFO )
 *
 *     For eigenvalues only, call DSTERF.  For eigenvectors, first call
diff --git a/lapack-netlib/SRC/dsyevd.f b/lapack-netlib/SRC/dsyevd.f
index eaaecd8d9..adcfcb373 100644
--- a/lapack-netlib/SRC/dsyevd.f
+++ b/lapack-netlib/SRC/dsyevd.f
@@ -40,13 +40,6 @@
 *> real symmetric matrix A. If eigenvectors are desired, it uses a
 *> divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
-*>
 *> Because of large use of BLAS of level 3, DSYEVD needs N**2 more
 *> workspace than DSYEVX.
 *> \endverbatim
@@ -103,8 +96,7 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is DOUBLE PRECISION array,
-*>                                         dimension (LWORK)
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
@@ -167,7 +159,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleSYeigen
+*> \ingroup heevd
 *
 *> \par Contributors:
 *  ==================
diff --git a/lapack-netlib/SRC/dsyevd_2stage.f b/lapack-netlib/SRC/dsyevd_2stage.f
index 0eae8ad06..d5a68c35d 100644
--- a/lapack-netlib/SRC/dsyevd_2stage.f
+++ b/lapack-netlib/SRC/dsyevd_2stage.f
@@ -45,12 +45,6 @@
 *> the reduction to tridiagonal. If eigenvectors are desired, it uses a
 *> divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/dsyevr.f b/lapack-netlib/SRC/dsyevr.f
index 698691533..8647b0162 100644
--- a/lapack-netlib/SRC/dsyevr.f
+++ b/lapack-netlib/SRC/dsyevr.f
@@ -271,7 +271,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= max(1,26*N).
+*>          The dimension of the array WORK.
+*>          If N <= 1, LWORK >= 1, else LWORK >= 26*N.
 *>          For optimal efficiency, LWORK >= (NB+6)*N,
 *>          where NB is the max of the blocksize for DSYTRD and DORMTR
 *>          returned by ILAENV.
@@ -285,13 +286,14 @@
 *> \param[out] IWORK
 *> \verbatim
 *>          IWORK is INTEGER array, dimension (MAX(1,LIWORK))
-*>          On exit, if INFO = 0, IWORK(1) returns the optimal LWORK.
+*>          On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.
 *> \endverbatim
 *>
 *> \param[in] LIWORK
 *> \verbatim
 *>          LIWORK is INTEGER
-*>          The dimension of the array IWORK.  LIWORK >= max(1,10*N).
+*>          The dimension of the array IWORK.
+*>          If N <= 1, LIWORK >= 1, else LIWORK >= 10*N.
 *>
 *>          If LIWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal size of the IWORK array,
@@ -315,7 +317,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleSYeigen
+*> \ingroup heevr
 *
 *> \par Contributors:
 *  ==================
@@ -390,8 +392,13 @@
 *
       LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LIWORK.EQ.-1 ) )
 *
-      LWMIN = MAX( 1, 26*N )
-      LIWMIN = MAX( 1, 10*N )
+      IF( N.LE.1 ) THEN
+         LWMIN  = 1
+         LIWMIN = 1
+      ELSE
+         LWMIN  = 26*N
+         LIWMIN = 10*N
+      END IF
 *
       INFO = 0
       IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN
@@ -450,7 +457,7 @@
       END IF
 *
       IF( N.EQ.1 ) THEN
-         WORK( 1 ) = 7
+         WORK( 1 ) = 1
          IF( ALLEIG .OR. INDEIG ) THEN
             M = 1
             W( 1 ) = A( 1, 1 )
diff --git a/lapack-netlib/SRC/dsyevr_2stage.f b/lapack-netlib/SRC/dsyevr_2stage.f
index 09242bbd3..63d5e3159 100644
--- a/lapack-netlib/SRC/dsyevr_2stage.f
+++ b/lapack-netlib/SRC/dsyevr_2stage.f
@@ -263,7 +263,7 @@
 *>          indicating the nonzero elements in Z. The i-th eigenvector
 *>          is nonzero only in elements ISUPPZ( 2*i-1 ) through
 *>          ISUPPZ( 2*i ). This is an output of DSTEMR (tridiagonal
-*>          matrix). The support of the eigenvectors of A is typically 
+*>          matrix). The support of the eigenvectors of A is typically
 *>          1:N because of the orthogonal transformations applied by DORMTR.
 *>          Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1
 *> \endverbatim
@@ -277,12 +277,13 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  
+*>          The dimension of the array WORK.
+*>          If N <= 1,               LWORK must be at least 1.
 *>          If JOBZ = 'N' and N > 1, LWORK must be queried.
 *>                                   LWORK = MAX(1, 26*N, dimension) where
 *>                                   dimension = max(stage1,stage2) + (KD+1)*N + 5*N
-*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
-*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB)
+*>                                               + max(2*KD*KD, KD*NTHREADS)
 *>                                               + (KD+1)*N + 5*N
 *>                                   where KD is the blocking size of the reduction,
 *>                                   FACTOPTNB is the blocking used by the QR or LQ
@@ -300,13 +301,14 @@
 *> \param[out] IWORK
 *> \verbatim
 *>          IWORK is INTEGER array, dimension (MAX(1,LIWORK))
-*>          On exit, if INFO = 0, IWORK(1) returns the optimal LWORK.
+*>          On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.
 *> \endverbatim
 *>
 *> \param[in] LIWORK
 *> \verbatim
 *>          LIWORK is INTEGER
-*>          The dimension of the array IWORK.  LIWORK >= max(1,10*N).
+*>          The dimension of the array IWORK.
+*>          If N <= 1, LIWORK >= 1, else LIWORK >= 10*N.
 *>
 *>          If LIWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal size of the IWORK array,
@@ -330,7 +332,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleSYeigen
+*> \ingroup heevr_2stage
 *
 *> \par Contributors:
 *  ==================
@@ -358,7 +360,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -366,11 +368,11 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *
@@ -444,8 +446,14 @@
       IB     = ILAENV2STAGE( 2, 'DSYTRD_2STAGE', JOBZ, N, KD, -1, -1 )
       LHTRD  = ILAENV2STAGE( 3, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
       LWTRD  = ILAENV2STAGE( 4, 'DSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
-      LWMIN  = MAX( 26*N, 5*N + LHTRD + LWTRD )
-      LIWMIN = MAX( 1, 10*N )
+*
+      IF( N.LE.1 ) THEN
+         LWMIN  = 1
+         LIWMIN = 1
+      ELSE
+         LWMIN  = MAX( 26*N, 5*N + LHTRD + LWTRD )
+         LIWMIN = 10*N
+      END IF
 *
       INFO = 0
       IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
@@ -484,7 +492,7 @@
 *         NB = ILAENV( 1, 'DSYTRD', UPLO, N, -1, -1, -1 )
 *         NB = MAX( NB, ILAENV( 1, 'DORMTR', UPLO, N, -1, -1, -1 ) )
 *         LWKOPT = MAX( ( NB+1 )*N, LWMIN )
-         WORK( 1 ) = LWMIN
+         WORK( 1 )  = LWMIN
          IWORK( 1 ) = LIWMIN
       END IF
 *
@@ -504,7 +512,7 @@
       END IF
 *
       IF( N.EQ.1 ) THEN
-         WORK( 1 ) = 7
+         WORK( 1 ) = 1
          IF( ALLEIG .OR. INDEIG ) THEN
             M = 1
             W( 1 ) = A( 1, 1 )
@@ -608,7 +616,7 @@
 *     Call DSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form.
 *
 *
-      CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ), 
+      CALL DSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ),
      $                    WORK( INDE ), WORK( INDTAU ), WORK( INDHOUS ),
      $                    LHTRD, WORK( INDWK ), LLWORK, IINFO )
 *
@@ -727,7 +735,7 @@
 *
 *     Set WORK(1) to optimal workspace size.
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 )  = LWMIN
       IWORK( 1 ) = LIWMIN
 *
       RETURN
diff --git a/lapack-netlib/SRC/dsyevx.f b/lapack-netlib/SRC/dsyevx.f
index 99719874b..fd6a78e32 100644
--- a/lapack-netlib/SRC/dsyevx.f
+++ b/lapack-netlib/SRC/dsyevx.f
@@ -244,7 +244,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleSYeigen
+*> \ingroup heevx
 *
 *  =====================================================================
       SUBROUTINE DSYEVX( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU, IL, IU,
@@ -338,14 +338,14 @@
       IF( INFO.EQ.0 ) THEN
          IF( N.LE.1 ) THEN
             LWKMIN = 1
-            WORK( 1 ) = LWKMIN
+            LWKOPT = 1
          ELSE
             LWKMIN = 8*N
             NB = ILAENV( 1, 'DSYTRD', UPLO, N, -1, -1, -1 )
             NB = MAX( NB, ILAENV( 1, 'DORMTR', UPLO, N, -1, -1, -1 ) )
             LWKOPT = MAX( LWKMIN, ( NB + 3 )*N )
-            WORK( 1 ) = LWKOPT
          END IF
+         WORK( 1 ) = LWKOPT
 *
          IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY )
      $      INFO = -17
diff --git a/lapack-netlib/SRC/dsygv.f b/lapack-netlib/SRC/dsygv.f
index 5208dbb1f..02a4cc3ed 100644
--- a/lapack-netlib/SRC/dsygv.f
+++ b/lapack-netlib/SRC/dsygv.f
@@ -154,7 +154,7 @@
 *>                    i off-diagonal elements of an intermediate
 *>                    tridiagonal form did not converge to zero;
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dsygv_2stage.f b/lapack-netlib/SRC/dsygv_2stage.f
index 5c71ebf94..383304267 100644
--- a/lapack-netlib/SRC/dsygv_2stage.f
+++ b/lapack-netlib/SRC/dsygv_2stage.f
@@ -173,7 +173,7 @@
 *>                    i off-diagonal elements of an intermediate
 *>                    tridiagonal form did not converge to zero;
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dsygvd.f b/lapack-netlib/SRC/dsygvd.f
index 3b38665a7..41a384c80 100644
--- a/lapack-netlib/SRC/dsygvd.f
+++ b/lapack-netlib/SRC/dsygvd.f
@@ -42,12 +42,6 @@
 *> B are assumed to be symmetric and B is also positive definite.
 *> If eigenvectors are desired, it uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
@@ -190,7 +184,7 @@
 *>                    the submatrix lying in rows and columns INFO/(N+1)
 *>                    through mod(INFO,N+1);
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dsygvx.f b/lapack-netlib/SRC/dsygvx.f
index 3fa55b97c..2dc27e8a8 100644
--- a/lapack-netlib/SRC/dsygvx.f
+++ b/lapack-netlib/SRC/dsygvx.f
@@ -270,7 +270,7 @@
 *>                    i eigenvectors failed to converge.  Their indices
 *>                    are stored in array IFAIL.
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dsysv_aa.f b/lapack-netlib/SRC/dsysv_aa.f
index 8dab5a384..0a96ecd7e 100644
--- a/lapack-netlib/SRC/dsysv_aa.f
+++ b/lapack-netlib/SRC/dsysv_aa.f
@@ -154,7 +154,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleSYsolve
+*> \ingroup hesv_aa
 *
 *  =====================================================================
       SUBROUTINE DSYSV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK,
@@ -177,7 +177,7 @@
 *
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            LWKOPT, LWKOPT_SYTRF, LWKOPT_SYTRS
+      INTEGER            LWKMIN, LWKOPT, LWKOPT_SYTRF, LWKOPT_SYTRS
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -196,6 +196,7 @@
 *
       INFO = 0
       LQUERY = ( LWORK.EQ.-1 )
+      LWKMIN = MAX( 1, 2*N, 3*N-2 )
       IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
@@ -206,17 +207,17 @@
          INFO = -5
       ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
          INFO = -8
-      ELSE IF( LWORK.LT.MAX(2*N, 3*N-2) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -10
       END IF
 *
       IF( INFO.EQ.0 ) THEN
          CALL DSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, -1, INFO )
-         LWKOPT_SYTRF = INT( WORK(1) )
+         LWKOPT_SYTRF = INT( WORK( 1 ) )
          CALL DSYTRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK,
      $                   -1, INFO )
-         LWKOPT_SYTRS = INT( WORK(1) )
-         LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS )
+         LWKOPT_SYTRS = INT( WORK( 1 ) )
+         LWKOPT = MAX( LWKMIN, LWKOPT_SYTRF, LWKOPT_SYTRS )
          WORK( 1 ) = LWKOPT
       END IF
 *
diff --git a/lapack-netlib/SRC/dsysv_aa_2stage.f b/lapack-netlib/SRC/dsysv_aa_2stage.f
index 1100702ba..90dd0a38a 100644
--- a/lapack-netlib/SRC/dsysv_aa_2stage.f
+++ b/lapack-netlib/SRC/dsysv_aa_2stage.f
@@ -89,7 +89,7 @@
 *>          triangular part of the matrix A, and the strictly upper
 *>          triangular part of A is not referenced.
 *>
-*>          On exit, L is stored below (or above) the subdiaonal blocks,
+*>          On exit, L is stored below (or above) the subdiagonal blocks,
 *>          when UPLO  is 'L' (or 'U').
 *> \endverbatim
 *>
@@ -101,14 +101,14 @@
 *>
 *> \param[out] TB
 *> \verbatim
-*>          TB is DOUBLE PRECISION array, dimension (LTB)
+*>          TB is DOUBLE PRECISION array, dimension (MAX(1,LTB))
 *>          On exit, details of the LU factorization of the band matrix.
 *> \endverbatim
 *>
 *> \param[in] LTB
 *> \verbatim
 *>          LTB is INTEGER
-*>          The size of the array TB. LTB >= 4*N, internally
+*>          The size of the array TB. LTB >= MAX(1,4*N), internally
 *>          used to select NB such that LTB >= (3*NB+1)*N.
 *>
 *>          If LTB = -1, then a workspace query is assumed; the
@@ -148,14 +148,15 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is DOUBLE PRECISION workspace of size LWORK
+*>          WORK is DOUBLE PRECISION workspace of size (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The size of WORK. LWORK >= N, internally used to select NB
-*>          such that LWORK >= N*NB.
+*>          The size of WORK. LWORK >= MAX(1,N), internally used to
+*>          select NB such that LWORK >= N*NB.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal size of the WORK array,
@@ -179,7 +180,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleSYsolve
+*> \ingroup hesv_aa_2stage
 *
 *  =====================================================================
       SUBROUTINE DSYSV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB,
@@ -205,7 +206,7 @@
 *
 *     .. Local Scalars ..
       LOGICAL            UPPER, TQUERY, WQUERY
-      INTEGER            LWKOPT
+      INTEGER            LWKMIN, LWKOPT
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -226,6 +227,7 @@
       UPPER = LSAME( UPLO, 'U' )
       WQUERY = ( LWORK.EQ.-1 )
       TQUERY = ( LTB.EQ.-1 )
+      LWKMIN = MAX( 1, N )
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
@@ -234,18 +236,19 @@
          INFO = -3
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -5
-      ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN
+      ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN
          INFO = -7
       ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
          INFO = -11
-      ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.WQUERY ) THEN
          INFO = -13
       END IF
 *
       IF( INFO.EQ.0 ) THEN
          CALL DSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV,
      $                          IPIV2, WORK, -1, INFO )
-         LWKOPT = INT( WORK(1) )
+         LWKOPT = MAX( LWKMIN, INT( WORK( 1 ) ) )
+         WORK( 1 ) = LWKOPT
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -255,7 +258,6 @@
          RETURN
       END IF
 *
-*
 *     Compute the factorization A = U**T*T*U or A = L*T*L**T.
 *
       CALL DSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2,
diff --git a/lapack-netlib/SRC/dsysvx.f b/lapack-netlib/SRC/dsysvx.f
index a30831e72..b2b8210ca 100644
--- a/lapack-netlib/SRC/dsysvx.f
+++ b/lapack-netlib/SRC/dsysvx.f
@@ -275,7 +275,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleSYsolve
+*> \ingroup hesvx
 *
 *  =====================================================================
       SUBROUTINE DSYSVX( FACT, UPLO, N, NRHS, A, LDA, AF, LDAF, IPIV, B,
@@ -305,7 +305,7 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY, NOFACT
-      INTEGER            LWKOPT, NB
+      INTEGER            LWKMIN, LWKOPT, NB
       DOUBLE PRECISION   ANORM
 *     ..
 *     .. External Functions ..
@@ -327,6 +327,7 @@
       INFO = 0
       NOFACT = LSAME( FACT, 'N' )
       LQUERY = ( LWORK.EQ.-1 )
+      LWKMIN = MAX( 1, 3*N )
       IF( .NOT.NOFACT .AND. .NOT.LSAME( FACT, 'F' ) ) THEN
          INFO = -1
       ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) )
@@ -344,12 +345,12 @@
          INFO = -11
       ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
          INFO = -13
-      ELSE IF( LWORK.LT.MAX( 1, 3*N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -18
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         LWKOPT = MAX( 1, 3*N )
+         LWKOPT = LWKMIN
          IF( NOFACT ) THEN
             NB = ILAENV( 1, 'DSYTRF', UPLO, N, -1, -1, -1 )
             LWKOPT = MAX( LWKOPT, N*NB )
diff --git a/lapack-netlib/SRC/dsytrd.f b/lapack-netlib/SRC/dsytrd.f
index 3dcfc3db2..58d4b633b 100644
--- a/lapack-netlib/SRC/dsytrd.f
+++ b/lapack-netlib/SRC/dsytrd.f
@@ -139,7 +139,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleSYcomputational
+*> \ingroup hetrd
 *
 *> \par Further Details:
 *  =====================
@@ -247,7 +247,7 @@
 *        Determine the block size.
 *
          NB = ILAENV( 1, 'DSYTRD', UPLO, N, -1, -1, -1 )
-         LWKOPT = N*NB
+         LWKOPT = MAX( 1, N*NB )
          WORK( 1 ) = LWKOPT
       END IF
 *
diff --git a/lapack-netlib/SRC/dsytrd_2stage.f b/lapack-netlib/SRC/dsytrd_2stage.f
index 8ae77d3e4..a88ac1c73 100644
--- a/lapack-netlib/SRC/dsytrd_2stage.f
+++ b/lapack-netlib/SRC/dsytrd_2stage.f
@@ -4,23 +4,23 @@
 *
 *  =========== DOCUMENTATION ===========
 *
-* Online html documentation available at 
-*            http://www.netlib.org/lapack/explore-html/ 
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
-*> Download DSYTRD_2STAGE + dependencies 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsytrd_2stage.f"> 
-*> [TGZ]</a> 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsytrd_2stage.f"> 
-*> [ZIP]</a> 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsytrd_2stage.f"> 
+*> Download DSYTRD_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsytrd_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsytrd_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsytrd_2stage.f">
 *> [TXT]</a>
-*> \endhtmlonly 
+*> \endhtmlonly
 *
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, 
+*       SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU,
 *                                 HOUS2, LHOUS2, WORK, LWORK, INFO )
 *
 *       IMPLICIT NONE
@@ -34,7 +34,7 @@
 *       DOUBLE PRECISION   A( LDA, * ), TAU( * ),
 *                          HOUS2( * ), WORK( * )
 *       ..
-*  
+*
 *
 *> \par Purpose:
 *  =============
@@ -52,11 +52,11 @@
 *> \param[in] VECT
 *> \verbatim
 *>          VECT is CHARACTER*1
-*>          = 'N':  No need for the Housholder representation, 
+*>          = 'N':  No need for the Householder representation,
 *>                  in particular for the second stage (Band to
 *>                  tridiagonal) and thus LHOUS2 is of size max(1, 4*N);
-*>          = 'V':  the Householder representation is needed to 
-*>                  either generate Q1 Q2 or to apply Q1 Q2, 
+*>          = 'V':  the Householder representation is needed to
+*>                  either generate Q1 Q2 or to apply Q1 Q2,
 *>                  then LHOUS2 is to be queried and computed.
 *>                  (NOT AVAILABLE IN THIS RELEASE).
 *> \endverbatim
@@ -86,7 +86,7 @@
 *>          triangular part of A is not referenced.
 *>          On exit, if UPLO = 'U', the band superdiagonal
 *>          of A are overwritten by the corresponding elements of the
-*>          internal band-diagonal matrix AB, and the elements above 
+*>          internal band-diagonal matrix AB, and the elements above
 *>          the KD superdiagonal, with the array TAU, represent the orthogonal
 *>          matrix Q1 as a product of elementary reflectors; if UPLO
 *>          = 'L', the diagonal and band subdiagonal of A are over-
@@ -117,13 +117,13 @@
 *> \param[out] TAU
 *> \verbatim
 *>          TAU is DOUBLE PRECISION array, dimension (N-KD)
-*>          The scalar factors of the elementary reflectors of 
+*>          The scalar factors of the elementary reflectors of
 *>          the first stage (see Further Details).
 *> \endverbatim
 *>
 *> \param[out] HOUS2
 *> \verbatim
-*>          HOUS2 is DOUBLE PRECISION array, dimension (LHOUS2)
+*>          HOUS2 is DOUBLE PRECISION array, dimension (MAX(1,LHOUS2))
 *>          Stores the Householder representation of the stage2
 *>          band to tridiagonal.
 *> \endverbatim
@@ -132,6 +132,8 @@
 *> \verbatim
 *>          LHOUS2 is INTEGER
 *>          The dimension of the array HOUS2.
+*>          LHOUS2 >= 1.
+*>
 *>          If LWORK = -1, or LHOUS2 = -1,
 *>          then a query is assumed; the routine
 *>          only calculates the optimal size of the HOUS2 array, returns
@@ -143,23 +145,26 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is DOUBLE PRECISION array, dimension (LWORK)
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK. LWORK = MAX(1, dimension)
-*>          If LWORK = -1, or LHOUS2=-1,
+*>          The dimension of the array WORK.
+*>          If N = 0, LWORK >= 1, else LWORK = MAX(1, dimension).
+*>
+*>          If LWORK = -1, or LHOUS2 = -1,
 *>          then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
 *>          LWORK = MAX(1, dimension) where
 *>          dimension   = max(stage1,stage2) + (KD+1)*N
-*>                      = N*KD + N*max(KD+1,FACTOPTNB) 
-*>                        + max(2*KD*KD, KD*NTHREADS) 
-*>                        + (KD+1)*N 
+*>                      = N*KD + N*max(KD+1,FACTOPTNB)
+*>                        + max(2*KD*KD, KD*NTHREADS)
+*>                        + (KD+1)*N
 *>          where KD is the blocking size of the reduction,
 *>          FACTOPTNB is the blocking used by the QR or LQ
 *>          algorithm, usually FACTOPTNB=128 is a good choice
@@ -177,12 +182,12 @@
 *  Authors:
 *  ========
 *
-*> \author Univ. of Tennessee 
-*> \author Univ. of California Berkeley 
-*> \author Univ. of Colorado Denver 
-*> \author NAG Ltd. 
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
 *
-*> \ingroup doubleSYcomputational
+*> \ingroup hetrd_2stage
 *
 *> \par Further Details:
 *  =====================
@@ -202,7 +207,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -210,16 +215,16 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *>
 *  =====================================================================
-      SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, 
+      SUBROUTINE DSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU,
      $                          HOUS2, LHOUS2, WORK, LWORK, INFO )
 *
       IMPLICIT NONE
@@ -265,10 +270,13 @@
 *
       KD     = ILAENV2STAGE( 1, 'DSYTRD_2STAGE', VECT, N, -1, -1, -1 )
       IB     = ILAENV2STAGE( 2, 'DSYTRD_2STAGE', VECT, N, KD, -1, -1 )
-      LHMIN  = ILAENV2STAGE( 3, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 )
-      LWMIN  = ILAENV2STAGE( 4, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 )
-*      WRITE(*,*),'DSYTRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO,
-*     $            LHMIN, LWMIN
+      IF( N.EQ.0 ) THEN
+         LHMIN = 1
+         LWMIN = 1
+      ELSE
+         LHMIN = ILAENV2STAGE( 3, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 )
+         LWMIN = ILAENV2STAGE( 4, 'DSYTRD_2STAGE', VECT, N, KD, IB, -1 )
+      END IF
 *
       IF( .NOT.LSAME( VECT, 'N' ) ) THEN
          INFO = -1
@@ -309,14 +317,14 @@
       LWRK  = LWORK-LDAB*N
       ABPOS = 1
       WPOS  = ABPOS + LDAB*N
-      CALL DSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, 
+      CALL DSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB,
      $                   TAU, WORK( WPOS ), LWRK, INFO )
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'DSYTRD_SY2SB', -INFO )
          RETURN
       END IF
-      CALL DSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD, 
-     $                   WORK( ABPOS ), LDAB, D, E, 
+      CALL DSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD,
+     $                   WORK( ABPOS ), LDAB, D, E,
      $                   HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO )
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'DSYTRD_SB2ST', -INFO )
@@ -324,8 +332,7 @@
       END IF
 *
 *
-      HOUS2( 1 ) = LHMIN
-      WORK( 1 )  = LWMIN
+      WORK( 1 ) = LWMIN
       RETURN
 *
 *     End of DSYTRD_2STAGE
diff --git a/lapack-netlib/SRC/dsytrd_sb2st.F b/lapack-netlib/SRC/dsytrd_sb2st.F
index fabc10756..04d03d587 100644
--- a/lapack-netlib/SRC/dsytrd_sb2st.F
+++ b/lapack-netlib/SRC/dsytrd_sb2st.F
@@ -18,7 +18,7 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, 
+*       SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB,
 *                               D, E, HOUS, LHOUS, WORK, LWORK, INFO )
 *
 *       #if defined(_OPENMP)
@@ -53,12 +53,12 @@
 *> \param[in] STAGE1
 *> \verbatim
 *>          STAGE1 is CHARACTER*1
-*>          = 'N':  "No": to mention that the stage 1 of the reduction  
+*>          = 'N':  "No": to mention that the stage 1 of the reduction
 *>                  from dense to band using the dsytrd_sy2sb routine
-*>                  was not called before this routine to reproduce AB. 
-*>                  In other term this routine is called as standalone. 
-*>          = 'Y':  "Yes": to mention that the stage 1 of the 
-*>                  reduction from dense to band using the dsytrd_sy2sb 
+*>                  was not called before this routine to reproduce AB.
+*>                  In other words, this routine is called standalone.
+*>          = 'Y':  "Yes": to mention that the stage 1 of the
+*>                  reduction from dense to band using the dsytrd_sy2sb
 *>                  routine has been called to produce AB (e.g., AB is
 *>                  the output of dsytrd_sy2sb.
 *> \endverbatim
@@ -66,10 +66,10 @@
 *> \param[in] VECT
 *> \verbatim
 *>          VECT is CHARACTER*1
-*>          = 'N':  No need for the Housholder representation, 
+*>          = 'N':  No need for the Householder representation,
 *>                  and thus LHOUS is of size max(1, 4*N);
-*>          = 'V':  the Householder representation is needed to 
-*>                  either generate or to apply Q later on, 
+*>          = 'V':  the Householder representation is needed to
+*>                  either generate or to apply Q later on,
 *>                  then LHOUS is to be queried and computed.
 *>                  (NOT AVAILABLE IN THIS RELEASE).
 *> \endverbatim
@@ -132,34 +132,39 @@
 *>
 *> \param[out] HOUS
 *> \verbatim
-*>          HOUS is DOUBLE PRECISION array, dimension LHOUS, that
-*>          store the Householder representation.
+*>          HOUS is DOUBLE PRECISION array, dimension (MAX(1,LHOUS))
+*>          Stores the Householder representation.
 *> \endverbatim
 *>
 *> \param[in] LHOUS
 *> \verbatim
 *>          LHOUS is INTEGER
-*>          The dimension of the array HOUS. LHOUS = MAX(1, dimension)
-*>          If LWORK = -1, or LHOUS=-1,
+*>          The dimension of the array HOUS.
+*>          If N = 0 or KD <= 1, LHOUS >= 1, else LHOUS = MAX(1, dimension).
+*>
+*>          If LWORK = -1, or LHOUS = -1,
 *>          then a query is assumed; the routine
 *>          only calculates the optimal size of the HOUS array, returns
 *>          this value as the first entry of the HOUS array, and no error
 *>          message related to LHOUS is issued by XERBLA.
 *>          LHOUS = MAX(1, dimension) where
 *>          dimension = 4*N if VECT='N'
-*>          not available now if VECT='H'     
+*>          not available now if VECT='V'
 *> \endverbatim
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is DOUBLE PRECISION array, dimension LWORK.
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK. LWORK = MAX(1, dimension)
-*>          If LWORK = -1, or LHOUS=-1,
+*>          The dimension of the array WORK.
+*>          If N = 0 or KD <= 1, LWORK >= 1, else LWORK = MAX(1, dimension).
+*>
+*>          If LWORK = -1, or LHOUS = -1,
 *>          then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
@@ -188,7 +193,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup real16OTHERcomputational
+*> \ingroup hetrd_hb2st
 *
 *> \par Further Details:
 *  =====================
@@ -208,7 +213,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -216,16 +221,16 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *>
 *  =====================================================================
-      SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, 
+      SUBROUTINE DSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB,
      $                         D, E, HOUS, LHOUS, WORK, LWORK, INFO )
 *
 #if defined(_OPENMP)
@@ -258,11 +263,11 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY, WANTQ, UPPER, AFTERS1
-      INTEGER            I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, 
+      INTEGER            I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST,
      $                   ED, STIND, EDIND, BLKLASTIND, COLPT, THED,
      $                   STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID,
-     $                   NBTILES, TTYPE, TID, NTHREADS, DEBUG,
-     $                   ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, 
+     $                   NBTILES, TTYPE, TID, NTHREADS,
+     $                   ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS,
      $                   INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU,
      $                   SIDEV, SIZETAU, LDV, LHMIN, LWMIN
 *     ..
@@ -274,7 +279,7 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      INTEGER            ILAENV2STAGE 
+      INTEGER            ILAENV2STAGE
       EXTERNAL           LSAME, ILAENV2STAGE
 *     ..
 *     .. Executable Statements ..
@@ -282,7 +287,6 @@
 *     Determine the minimal workspace size required.
 *     Test the input parameters
 *
-      DEBUG   = 0
       INFO    = 0
       AFTERS1 = LSAME( STAGE1, 'Y' )
       WANTQ   = LSAME( VECT, 'V' )
@@ -291,9 +295,14 @@
 *
 *     Determine the block size, the workspace size and the hous size.
 *
-      IB     = ILAENV2STAGE( 2, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 )
-      LHMIN  = ILAENV2STAGE( 3, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 )
-      LWMIN  = ILAENV2STAGE( 4, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 )
+      IB       = ILAENV2STAGE( 2, 'DSYTRD_SB2ST', VECT, N, KD, -1, -1 )
+      IF( N.EQ.0 .OR. KD.LE.1 ) THEN
+         LHMIN = 1
+         LWMIN = 1
+      ELSE
+         LHMIN = ILAENV2STAGE( 3, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 )
+         LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SB2ST', VECT, N, KD, IB, -1 )
+      END IF
 *
       IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
          INFO = -1
@@ -355,7 +364,7 @@
           ABDPOS   = KD + 1
           ABOFDPOS = KD
       ELSE
-          APOS     = INDA 
+          APOS     = INDA
           AWPOS    = INDA + KD + 1
           DPOS     = APOS
           OFDPOS   = DPOS + 1
@@ -363,11 +372,11 @@
           ABOFDPOS = 2
 
       ENDIF
-*      
-*     Case KD=0: 
-*     The matrix is diagonal. We just copy it (convert to "real" for 
-*     real because D is double and the imaginary part should be 0) 
-*     and store it in D. A sequential code here is better or 
+*
+*     Case KD=0:
+*     The matrix is diagonal. We just copy it (convert to "real" for
+*     real because D is double and the imaginary part should be 0)
+*     and store it in D. A sequential code here is better or
 *     in a parallel environment it might need two cores for D and E
 *
       IF( KD.EQ.0 ) THEN
@@ -382,17 +391,17 @@
           WORK( 1 ) = 1
           RETURN
       END IF
-*      
-*     Case KD=1: 
-*     The matrix is already Tridiagonal. We have to make diagonal 
+*
+*     Case KD=1:
+*     The matrix is already Tridiagonal. We have to make diagonal
 *     and offdiagonal elements real, and store them in D and E.
-*     For that, for real precision just copy the diag and offdiag 
-*     to D and E while for the COMPLEX case the bulge chasing is  
-*     performed to convert the hermetian tridiagonal to symmetric 
-*     tridiagonal. A simpler conversion formula might be used, but then 
+*     For that, for real precision just copy the diag and offdiag
+*     to D and E while for the COMPLEX case the bulge chasing is
+*     performed to convert the Hermitian tridiagonal to symmetric
+*     tridiagonal. A simpler conversion formula might be used, but then
 *     updating the Q matrix will be required and based if Q is generated
-*     or not this might complicate the story. 
-*      
+*     or not this might complicate the story.
+*
       IF( KD.EQ.1 ) THEN
           DO 50 I = 1, N
               D( I ) = ( AB( ABDPOS, I ) )
@@ -413,7 +422,7 @@
           RETURN
       END IF
 *
-*     Main code start here. 
+*     Main code starts here.
 *     Reduce the symmetric band of A to a tridiagonal matrix.
 *
       THGRSIZ   = N
@@ -422,7 +431,7 @@
       NBTILES   = CEILING( REAL(N)/REAL(KD) )
       STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) )
       THGRNB    = CEILING( REAL(N-1)/REAL(THGRSIZ) )
-*      
+*
       CALL DLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA )
       CALL DLASET( "A", KD,   N, ZERO, ZERO, WORK( AWPOS ), LDA )
 *
@@ -431,7 +440,7 @@
 *
 #if defined(_OPENMP)
 !$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND )
-!$OMP$         PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) 
+!$OMP$         PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID )
 !$OMP$         PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND )
 !$OMP$         SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK)
 !$OMP$         SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA )
@@ -440,7 +449,7 @@
 #endif
 *
 *     main bulge chasing loop
-*      
+*
       DO 100 THGRID = 1, THGRNB
           STT  = (THGRID-1)*THGRSIZ+1
           THED = MIN( (STT + THGRSIZ -1), (N-1))
@@ -451,7 +460,7 @@
                   ST = STT
                   DO 130 SWEEPID = ST, ED
                       DO 140 K = 1, GRSIZ
-                          MYID  = (I-SWEEPID)*(STEPERCOL*GRSIZ) 
+                          MYID  = (I-SWEEPID)*(STEPERCOL*GRSIZ)
      $                           + (M-1)*GRSIZ + K
                           IF ( MYID.EQ.1 ) THEN
                               TTYPE = 1
@@ -477,16 +486,16 @@
                           ENDIF
 *
 *                         Call the kernel
-*                             
+*
 #if defined(_OPENMP) &&  _OPENMP >= 201307
-                          IF( TTYPE.NE.1 ) THEN      
+                          IF( TTYPE.NE.1 ) THEN
 !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
 !$OMP$     DEPEND(in:WORK(MYID-1))
 !$OMP$     DEPEND(out:WORK(MYID))
                               TID      = OMP_GET_THREAD_NUM()
-                              CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+                              CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE,
      $                             STIND, EDIND, SWEEPID, N, KD, IB,
-     $                             WORK ( INDA ), LDA, 
+     $                             WORK ( INDA ), LDA,
      $                             HOUS( INDV ), HOUS( INDTAU ), LDV,
      $                             WORK( INDW + TID*KD ) )
 !$OMP END TASK
@@ -494,20 +503,20 @@
 !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
 !$OMP$     DEPEND(out:WORK(MYID))
                               TID      = OMP_GET_THREAD_NUM()
-                              CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+                              CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE,
      $                             STIND, EDIND, SWEEPID, N, KD, IB,
-     $                             WORK ( INDA ), LDA, 
+     $                             WORK ( INDA ), LDA,
      $                             HOUS( INDV ), HOUS( INDTAU ), LDV,
      $                             WORK( INDW + TID*KD ) )
 !$OMP END TASK
                           ENDIF
 #else
-                          CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+                          CALL DSB2ST_KERNELS( UPLO, WANTQ, TTYPE,
      $                         STIND, EDIND, SWEEPID, N, KD, IB,
-     $                         WORK ( INDA ), LDA, 
+     $                         WORK ( INDA ), LDA,
      $                         HOUS( INDV ), HOUS( INDTAU ), LDV,
-     $                         WORK( INDW + TID*KD ) )
-#endif 
+     $                         WORK( INDW ) )
+#endif
                           IF ( BLKLASTIND.GE.(N-1) ) THEN
                               STT = STT + 1
                               EXIT
@@ -522,14 +531,14 @@
 !$OMP END MASTER
 !$OMP END PARALLEL
 #endif
-*      
+*
 *     Copy the diagonal from A to D. Note that D is REAL thus only
 *     the Real part is needed, the imaginary part should be zero.
 *
       DO 150 I = 1, N
           D( I ) = ( WORK( DPOS+(I-1)*LDA ) )
   150 CONTINUE
-*      
+*
 *     Copy the off diagonal from A to E. Note that E is REAL thus only
 *     the Real part is needed, the imaginary part should be zero.
 *
@@ -543,11 +552,10 @@
   170     CONTINUE
       ENDIF
 *
-      HOUS( 1 ) = LHMIN
       WORK( 1 ) = LWMIN
       RETURN
 *
 *     End of DSYTRD_SB2ST
 *
       END
-      
+
diff --git a/lapack-netlib/SRC/dsytrd_sy2sb.f b/lapack-netlib/SRC/dsytrd_sy2sb.f
index 1660b5c7e..38acc71f1 100644
--- a/lapack-netlib/SRC/dsytrd_sy2sb.f
+++ b/lapack-netlib/SRC/dsytrd_sy2sb.f
@@ -123,8 +123,8 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is DOUBLE PRECISION array, dimension (LWORK)
-*>          On exit, if INFO = 0, or if LWORK=-1, 
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, or if LWORK = -1,
 *>          WORK(1) returns the size of LWORK.
 *> \endverbatim
 *>
@@ -132,7 +132,9 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK which should be calculated
-*>          by a workspace query. LWORK = MAX(1, LWORK_QUERY)
+*>          by a workspace query.
+*>          If N <= KD+1, LWORK >= 1, else LWORK = MAX(1, LWORK_QUERY)
+*>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
@@ -158,7 +160,7 @@
 *> \author Univ. of Colorado Denver 
 *> \author NAG Ltd. 
 *
-*> \ingroup doubleSYcomputational
+*> \ingroup hetrd_he2hb
 *
 *> \par Further Details:
 *  =====================
@@ -293,8 +295,12 @@
       INFO   = 0
       UPPER  = LSAME( UPLO, 'U' )
       LQUERY = ( LWORK.EQ.-1 )
-      LWMIN  = ILAENV2STAGE( 4, 'DSYTRD_SY2SB', '', N, KD, -1, -1 )
-      
+      IF( N.LE.KD+1 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = ILAENV2STAGE( 4, 'DSYTRD_SY2SB', ' ', N, KD, -1, -1 )
+      END IF
+*
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
diff --git a/lapack-netlib/SRC/dsytrf.f b/lapack-netlib/SRC/dsytrf.f
index a39b03283..2a1a2d4dc 100644
--- a/lapack-netlib/SRC/dsytrf.f
+++ b/lapack-netlib/SRC/dsytrf.f
@@ -107,7 +107,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK.  LWORK >=1.  For best performance
+*>          The length of WORK.  LWORK >= 1.  For best performance
 *>          LWORK >= N*NB, where NB is the block size returned by ILAENV.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
@@ -135,7 +135,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleSYcomputational
+*> \ingroup hetrf
 *
 *> \par Further Details:
 *  =====================
@@ -232,7 +232,7 @@
 *        Determine the block size
 *
          NB = ILAENV( 1, 'DSYTRF', UPLO, N, -1, -1, -1 )
-         LWKOPT = N*NB
+         LWKOPT = MAX( 1, N*NB )
          WORK( 1 ) = LWKOPT
       END IF
 *
@@ -352,6 +352,7 @@
       END IF
 *
    40 CONTINUE
+*
       WORK( 1 ) = LWKOPT
       RETURN
 *
diff --git a/lapack-netlib/SRC/dsytrf_aa.f b/lapack-netlib/SRC/dsytrf_aa.f
index 346737953..924d4c165 100644
--- a/lapack-netlib/SRC/dsytrf_aa.f
+++ b/lapack-netlib/SRC/dsytrf_aa.f
@@ -74,7 +74,7 @@
 *>
 *>          On exit, the tridiagonal matrix is stored in the diagonals
 *>          and the subdiagonals of A just below (or above) the diagonals,
-*>          and L is stored below (or above) the subdiaonals, when UPLO
+*>          and L is stored below (or above) the subdiagonals, when UPLO
 *>          is 'L' (or 'U').
 *> \endverbatim
 *>
@@ -101,8 +101,10 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK.  LWORK >= MAX(1,2*N). For optimum performance
-*>          LWORK >= N*(1+NB), where NB is the optimal blocksize.
+*>          The length of WORK.
+*>          LWORK >= 1, if N <= 1, and LWORK >= 2*N, otherwise.
+*>          For optimum performance LWORK >= N*(1+NB), where NB is
+*>          the optimal blocksize, returned by ILAENV.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -125,10 +127,10 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleSYcomputational
+*> \ingroup hetrf_aa
 *
 *  =====================================================================
-      SUBROUTINE DSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO)
+      SUBROUTINE DSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
@@ -152,7 +154,7 @@
 *
 *     .. Local Scalars ..
       LOGICAL            LQUERY, UPPER
-      INTEGER            J, LWKOPT
+      INTEGER            J, LWKMIN, LWKOPT
       INTEGER            NB, MJ, NJ, K1, K2, J1, J2, J3, JB
       DOUBLE PRECISION   ALPHA
 *     ..
@@ -179,18 +181,25 @@
       INFO = 0
       UPPER = LSAME( UPLO, 'U' )
       LQUERY = ( LWORK.EQ.-1 )
+      IF( N.LE.1 ) THEN
+         LWKMIN = 1
+         LWKOPT = 1
+      ELSE
+         LWKMIN = 2*N
+         LWKOPT = (NB+1)*N
+      END IF
+*
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -7
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         LWKOPT = (NB+1)*N
          WORK( 1 ) = LWKOPT
       END IF
 *
@@ -203,11 +212,11 @@
 *
 *     Quick return
 *
-      IF ( N.EQ.0 ) THEN
+      IF( N.EQ.0 ) THEN
           RETURN
       ENDIF
       IPIV( 1 ) = 1
-      IF ( N.EQ.1 ) THEN
+      IF( N.EQ.1 ) THEN
          RETURN
       END IF
 *
diff --git a/lapack-netlib/SRC/dsytrf_aa_2stage.f b/lapack-netlib/SRC/dsytrf_aa_2stage.f
index b7f5f07c2..fae95bab2 100644
--- a/lapack-netlib/SRC/dsytrf_aa_2stage.f
+++ b/lapack-netlib/SRC/dsytrf_aa_2stage.f
@@ -75,7 +75,7 @@
 *>          triangular part of the matrix A, and the strictly upper
 *>          triangular part of A is not referenced.
 *>
-*>          On exit, L is stored below (or above) the subdiaonal blocks,
+*>          On exit, L is stored below (or above) the subdiagonal blocks,
 *>          when UPLO  is 'L' (or 'U').
 *> \endverbatim
 *>
@@ -87,14 +87,14 @@
 *>
 *> \param[out] TB
 *> \verbatim
-*>          TB is DOUBLE PRECISION array, dimension (LTB)
+*>          TB is DOUBLE PRECISION array, dimension (MAX(1,LTB))
 *>          On exit, details of the LU factorization of the band matrix.
 *> \endverbatim
 *>
 *> \param[in] LTB
 *> \verbatim
 *>          LTB is INTEGER
-*>          The size of the array TB. LTB >= 4*N, internally
+*>          The size of the array TB. LTB >= MAX(1,4*N), internally
 *>          used to select NB such that LTB >= (3*NB+1)*N.
 *>
 *>          If LTB = -1, then a workspace query is assumed; the
@@ -121,14 +121,14 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is DOUBLE PRECISION workspace of size LWORK
+*>          WORK is DOUBLE PRECISION workspace of size (MAX(1,LWORK))
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The size of WORK. LWORK >= N, internally used to select NB
-*>          such that LWORK >= N*NB.
+*>          The size of WORK. LWORK >= MAX(1,N), internally used
+*>          to select NB such that LWORK >= N*NB.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal size of the WORK array,
@@ -152,7 +152,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleSYcomputational
+*> \ingroup hetrf_aa_2stage
 *
 *  =====================================================================
       SUBROUTINE DSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV,
@@ -211,9 +211,9 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF ( LTB .LT. 4*N .AND. .NOT.TQUERY ) THEN
+      ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN
          INFO = -6
-      ELSE IF ( LWORK .LT. N .AND. .NOT.WQUERY ) THEN
+      ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.WQUERY ) THEN
          INFO = -10
       END IF
 *
@@ -227,10 +227,10 @@
       NB = ILAENV( 1, 'DSYTRF_AA_2STAGE', UPLO, N, -1, -1, -1 )
       IF( INFO.EQ.0 ) THEN
          IF( TQUERY ) THEN
-            TB( 1 ) = (3*NB+1)*N
+            TB( 1 ) = MAX( 1, (3*NB+1)*N )
          END IF
          IF( WQUERY ) THEN
-            WORK( 1 ) = N*NB
+            WORK( 1 ) = MAX( 1, N*NB )
          END IF
       END IF
       IF( TQUERY .OR. WQUERY ) THEN
@@ -239,7 +239,7 @@
 *
 *     Quick return
 *
-      IF ( N.EQ.0 ) THEN
+      IF( N.EQ.0 ) THEN
          RETURN
       ENDIF
 *
diff --git a/lapack-netlib/SRC/dsytrf_rk.f b/lapack-netlib/SRC/dsytrf_rk.f
index 7341b9263..0717eb076 100644
--- a/lapack-netlib/SRC/dsytrf_rk.f
+++ b/lapack-netlib/SRC/dsytrf_rk.f
@@ -177,14 +177,14 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is DOUBLE PRECISION array, dimension ( MAX(1,LWORK) ).
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)).
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK.  LWORK >=1.  For best performance
+*>          The length of WORK.  LWORK >= 1.  For best performance
 *>          LWORK >= N*NB, where NB is the block size returned
 *>          by ILAENV.
 *>
@@ -229,7 +229,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleSYcomputational
+*> \ingroup hetrf_rk
 *
 *> \par Further Details:
 *  =====================
@@ -310,7 +310,7 @@
 *        Determine the block size
 *
          NB = ILAENV( 1, 'DSYTRF_RK', UPLO, N, -1, -1, -1 )
-         LWKOPT = N*NB
+         LWKOPT = MAX( 1, N*NB )
          WORK( 1 ) = LWKOPT
       END IF
 *
diff --git a/lapack-netlib/SRC/dsytrf_rook.f b/lapack-netlib/SRC/dsytrf_rook.f
index 2f00d1802..316663485 100644
--- a/lapack-netlib/SRC/dsytrf_rook.f
+++ b/lapack-netlib/SRC/dsytrf_rook.f
@@ -118,7 +118,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK.  LWORK >=1.  For best performance
+*>          The length of WORK.  LWORK >= 1.  For best performance
 *>          LWORK >= N*NB, where NB is the block size returned by ILAENV.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
@@ -146,7 +146,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleSYcomputational
+*> \ingroup hetrf_rook
 *
 *> \par Further Details:
 *  =====================
diff --git a/lapack-netlib/SRC/dsytri2.f b/lapack-netlib/SRC/dsytri2.f
index dbcdcdb58..5960d3992 100644
--- a/lapack-netlib/SRC/dsytri2.f
+++ b/lapack-netlib/SRC/dsytri2.f
@@ -88,16 +88,16 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is DOUBLE PRECISION array, dimension (N+NB+1)*(NB+3)
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK))
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
-*>          WORK is size >= (N+NB+1)*(NB+3)
+*>          If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3).
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>           calculates:
+*>          calculates:
 *>              - the optimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array,
 *>              - and no error message related to LWORK is issued by XERBLA.
@@ -120,7 +120,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleSYcomputational
+*> \ingroup hetri2
 *
 *  =====================================================================
       SUBROUTINE DSYTRI2( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO )
@@ -159,9 +159,13 @@
       INFO = 0
       UPPER = LSAME( UPLO, 'U' )
       LQUERY = ( LWORK.EQ.-1 )
+*
 *     Get blocksize
+*
       NBMAX = ILAENV( 1, 'DSYTRI2', UPLO, N, -1, -1, -1 )
-      IF ( NBMAX .GE. N ) THEN
+      IF( N.EQ.0 ) THEN
+         MINSIZE = 1
+      ELSE IF( NBMAX.GE.N ) THEN
          MINSIZE = N
       ELSE
          MINSIZE = (N+NBMAX+1)*(NBMAX+3)
@@ -173,28 +177,29 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF (LWORK .LT. MINSIZE .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.MINSIZE .AND. .NOT.LQUERY ) THEN
          INFO = -7
       END IF
-*
-*     Quick return if possible
-*
 *
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'DSYTRI2', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         WORK(1)=MINSIZE
+         WORK( 1 ) = MINSIZE
          RETURN
       END IF
+*
+*     Quick return if possible
+*
       IF( N.EQ.0 )
      $   RETURN
 
-      IF( NBMAX .GE. N ) THEN
+      IF( NBMAX.GE.N ) THEN
          CALL DSYTRI( UPLO, N, A, LDA, IPIV, WORK, INFO )
       ELSE
          CALL DSYTRI2X( UPLO, N, A, LDA, IPIV, WORK, NBMAX, INFO )
       END IF
+*
       RETURN
 *
 *     End of DSYTRI2
diff --git a/lapack-netlib/SRC/dsytri_3.f b/lapack-netlib/SRC/dsytri_3.f
index 86d69cdfd..50834c605 100644
--- a/lapack-netlib/SRC/dsytri_3.f
+++ b/lapack-netlib/SRC/dsytri_3.f
@@ -119,16 +119,17 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is DOUBLE PRECISION array, dimension (N+NB+1)*(NB+3).
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK)).
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK. LWORK >= (N+NB+1)*(NB+3).
+*>          The length of WORK.
+*>          If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3).
 *>
-*>          If LDWORK = -1, then a workspace query is assumed;
+*>          If LWORK = -1, then a workspace query is assumed;
 *>          the routine only calculates the optimal size of the optimal
 *>          size of the WORK array, returns this value as the first
 *>          entry of the WORK array, and no error message related to
@@ -152,7 +153,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleSYcomputational
+*> \ingroup hetri_3
 *
 *> \par Contributors:
 *  ==================
@@ -208,8 +209,13 @@
 *
 *     Determine the block size
 *
-      NB = MAX( 1, ILAENV( 1, 'DSYTRI_3', UPLO, N, -1, -1, -1 ) )
-      LWKOPT = ( N+NB+1 ) * ( NB+3 )
+      IF( N.EQ.0 ) THEN
+         LWKOPT = 1
+      ELSE
+         NB = MAX( 1, ILAENV( 1, 'DSYTRI_3', UPLO, N, -1, -1, -1 ) )
+         LWKOPT = ( N+NB+1 ) * ( NB+3 )
+      END IF
+      WORK( 1 ) = LWKOPT
 *
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
@@ -217,7 +223,7 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF ( LWORK .LT. LWKOPT .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN
          INFO = -8
       END IF
 *
@@ -225,7 +231,6 @@
          CALL XERBLA( 'DSYTRI_3', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         WORK( 1 ) = LWKOPT
          RETURN
       END IF
 *
diff --git a/lapack-netlib/SRC/dsytrs_aa.f b/lapack-netlib/SRC/dsytrs_aa.f
index 26b11a2a0..f0016cb7f 100644
--- a/lapack-netlib/SRC/dsytrs_aa.f
+++ b/lapack-netlib/SRC/dsytrs_aa.f
@@ -105,7 +105,13 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK. LWORK >= max(1,3*N-2).
+*>          The dimension of the array WORK.
+*>          If MIN(N,NRHS) = 0, LWORK >= 1, else LWORK >= 3*N-2.
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the minimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
 *> \endverbatim
 *>
 *> \param[out] INFO
@@ -123,7 +129,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleSYcomputational
+*> \ingroup hetrs_aa
 *
 *  =====================================================================
       SUBROUTINE DSYTRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB,
@@ -151,7 +157,7 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY, UPPER
-      INTEGER            K, KP, LWKOPT
+      INTEGER            K, KP, LWKMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -161,13 +167,19 @@
       EXTERNAL           DLACPY, DGTSV, DSWAP, DTRSM, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          MAX
+      INTRINSIC          MIN, MAX
 *     ..
 *     .. Executable Statements ..
 *
       INFO = 0
       UPPER = LSAME( UPLO, 'U' )
       LQUERY = ( LWORK.EQ.-1 )
+      IF( MIN( N, NRHS ).EQ.0 ) THEN
+         LWKMIN = 1
+      ELSE
+         LWKMIN = 3*N-2
+      END IF
+*
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
@@ -178,21 +190,20 @@
          INFO = -5
       ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
          INFO = -8
-      ELSE IF( LWORK.LT.MAX( 1, 3*N-2 ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -10
       END IF
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'DSYTRS_AA', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         LWKOPT = (3*N-2)
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = LWKMIN
          RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( N.EQ.0 .OR. NRHS.EQ.0 )
+      IF( MIN( N, NRHS ).EQ.0 )
      $   RETURN
 *
       IF( UPPER ) THEN
diff --git a/lapack-netlib/SRC/dtgevc.f b/lapack-netlib/SRC/dtgevc.f
index e7084664c..be70b2083 100644
--- a/lapack-netlib/SRC/dtgevc.f
+++ b/lapack-netlib/SRC/dtgevc.f
@@ -52,7 +52,7 @@
 *>
 *>    S*x = w*P*x,  (y**H)*S = w*(y**H)*P,
 *>
-*> where y**H denotes the conjugate tranpose of y.
+*> where y**H denotes the conjugate transpose of y.
 *> The eigenvalues are not input to this routine, but are computed
 *> directly from the diagonal blocks of S and P.
 *>
@@ -337,7 +337,7 @@
       EXTERNAL           LSAME, DLAMCH
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DGEMV, DLABAD, DLACPY, DLAG2, DLALN2, XERBLA
+      EXTERNAL           DGEMV, DLACPY, DLAG2, DLALN2, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, MIN
@@ -463,7 +463,6 @@
 *
       SAFMIN = DLAMCH( 'Safe minimum' )
       BIG = ONE / SAFMIN
-      CALL DLABAD( SAFMIN, BIG )
       ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' )
       SMALL = SAFMIN*N / ULP
       BIG = ONE / SMALL
diff --git a/lapack-netlib/SRC/dtgex2.f b/lapack-netlib/SRC/dtgex2.f
index 00c23a833..1ccd92c8f 100644
--- a/lapack-netlib/SRC/dtgex2.f
+++ b/lapack-netlib/SRC/dtgex2.f
@@ -254,7 +254,7 @@
      $                   THRESHA, THRESHB
 *     ..
 *     .. Local Arrays ..
-      INTEGER            IWORK( LDST )
+      INTEGER            IWORK( LDST + 2 )
       DOUBLE PRECISION   AI( 2 ), AR( 2 ), BE( 2 ), IR( LDST, LDST ),
      $                   IRCOP( LDST, LDST ), LI( LDST, LDST ),
      $                   LICOP( LDST, LDST ), S( LDST, LDST ),
diff --git a/lapack-netlib/SRC/dtgsna.f b/lapack-netlib/SRC/dtgsna.f
index b9b3ad8af..013dc91bd 100644
--- a/lapack-netlib/SRC/dtgsna.f
+++ b/lapack-netlib/SRC/dtgsna.f
@@ -632,8 +632,8 @@
                C1 = TWO*( ALPHAR*ALPHAR+ALPHAI*ALPHAI+BETA*BETA )
                C2 = FOUR*BETA*BETA*ALPHAI*ALPHAI
                ROOT1 = C1 + SQRT( C1*C1-4.0D0*C2 )
-               ROOT2 = C2 / ROOT1
                ROOT1 = ROOT1 / TWO
+               ROOT2 = C2 / ROOT1
                COND = MIN( SQRT( ROOT1 ), SQRT( ROOT2 ) )
             END IF
 *
diff --git a/lapack-netlib/SRC/dtrevc3.f b/lapack-netlib/SRC/dtrevc3.f
index a4651e788..c8c04ad13 100644
--- a/lapack-netlib/SRC/dtrevc3.f
+++ b/lapack-netlib/SRC/dtrevc3.f
@@ -298,7 +298,7 @@
 *
       INFO = 0
       NB = ILAENV( 1, 'DTREVC', SIDE // HOWMNY, N, -1, -1, -1 )
-      MAXWRK = N + 2*N*NB
+      MAXWRK = MAX( 1, N + 2*N*NB )
       WORK(1) = MAXWRK
       LQUERY = ( LWORK.EQ.-1 )
       IF( .NOT.RIGHTV .AND. .NOT.LEFTV ) THEN
diff --git a/lapack-netlib/SRC/dtrsyl3.f b/lapack-netlib/SRC/dtrsyl3.f
index c44ec3808..31a5230ba 100644
--- a/lapack-netlib/SRC/dtrsyl3.f
+++ b/lapack-netlib/SRC/dtrsyl3.f
@@ -1220,7 +1220,7 @@
 *
          SCALOC = MIN( BIGNUM / SCAL, ONE / BUF )
          BUF = BUF * SCALOC
-         CALL DLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK )
+         CALL DLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK(1) )
       END IF
 *
 *     Combine with buffer scaling factor. SCALE will be flushed if
diff --git a/lapack-netlib/SRC/ilaenv.c b/lapack-netlib/SRC/ilaenv.c
index c47224a0c..8f3b2db8e 100644
--- a/lapack-netlib/SRC/ilaenv.c
+++ b/lapack-netlib/SRC/ilaenv.c
@@ -191,7 +191,7 @@ typedef struct Namelist Namelist;
 #define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
 #ifdef _MSC_VER
 #define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
-#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/df(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
 #else
 #define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
 #define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
@@ -252,11 +252,11 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
 #define myexit_() break;
-#define mycycle() continue;
-#define myceiling(w) {ceil(w)}
-#define myhuge(w) {HUGE_VAL}
+#define mycycle_() continue;
+#define myceiling_(w) ceil(w)
+#define myhuge_(w) HUGE_VAL
 //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
-#define mymaxloc(w,s,e,n) {dmaxloc_(w,*(s),*(e),n)}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
 
 /* procedure parameter types for -A and -C++ */
 
@@ -509,12 +509,18 @@ static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integ
 
 
 
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
 
 /* Table of constant values */
 
 static integer c__1 = 1;
-static real c_b174 = 0.f;
-static real c_b175 = 1.f;
+static real c_b179 = 0.f;
+static real c_b180 = 1.f;
 static integer c__0 = 0;
 
 /* > \brief \b ILAENV */
@@ -599,9 +605,9 @@ f"> */
 /* >          = 9: maximum size of the subproblems at the bottom of the */
 /* >               computation tree in the divide-and-conquer algorithm */
 /* >               (used by xGELSD and xGESDD) */
-/* >          =10: ieee NaN arithmetic can be trusted not to trap */
+/* >          =10: ieee infinity and NaN arithmetic can be trusted not to trap */
 /* >          =11: infinity arithmetic can be trusted not to trap */
-/* >          12 <= ISPEC <= 16: */
+/* >          12 <= ISPEC <= 17: */
 /* >               xHSEQR or related subroutines, */
 /* >               see IPARMQ for detailed explanation */
 /* > \endverbatim */
@@ -652,9 +658,7 @@ f"> */
 /* > \author Univ. of Colorado Denver */
 /* > \author NAG Ltd. */
 
-/* > \date November 2019 */
-
-/* > \ingroup OTHERauxiliary */
+/* > \ingroup ilaenv */
 
 /* > \par Further Details: */
 /*  ===================== */
@@ -685,7 +689,7 @@ integer ilaenv_(integer *ispec, char *name__, char *opts, integer *n1,
 	opts_len)
 {
     /* System generated locals */
-    integer ret_val;
+    integer ret_val, i__1, i__2, i__3;
 
     /* Local variables */
     logical twostage;
@@ -702,10 +706,9 @@ integer ilaenv_(integer *ispec, char *name__, char *opts, integer *n1,
 	    integer *, integer *);
 
 
-/*  -- LAPACK auxiliary routine (version 3.9.0) -- */
+/*  -- LAPACK auxiliary routine -- */
 /*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
 /*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
-/*     November 2019 */
 
 
 /*  ===================================================================== */
@@ -728,6 +731,7 @@ integer ilaenv_(integer *ispec, char *name__, char *opts, integer *n1,
 	case 14:  goto L160;
 	case 15:  goto L160;
 	case 16:  goto L160;
+	case 17:  goto L160;
     }
 
 /*     Invalid value for ISPEC */
@@ -908,6 +912,12 @@ L50:
 	    } else {
 		nb = 64;
 	    }
+	} else if (s_cmp(subnam + 3, "QP3RK", (ftnlen)5, (ftnlen)5) == 0) {
+	    if (sname) {
+		nb = 32;
+	    } else {
+		nb = 32;
+	    }
 	}
     } else if (s_cmp(c2, "PO", (ftnlen)2, (ftnlen)2) == 0) {
 	if (s_cmp(c3, "TRF", (ftnlen)3, (ftnlen)3) == 0) {
@@ -1034,6 +1044,21 @@ L50:
 	    } else {
 		nb = 64;
 	    }
+	} else if (s_cmp(c3, "SYL", (ftnlen)3, (ftnlen)3) == 0) {
+/*           The upper bound is to prevent overly aggressive scaling. */
+	    if (sname) {
+/* Computing MIN */
+/* Computing MAX */
+		i__2 = 48, i__3 = (f2cmin(*n1,*n2) << 4) / 100;
+		i__1 = f2cmax(i__2,i__3);
+		nb = f2cmin(i__1,240);
+	    } else {
+/* Computing MIN */
+/* Computing MAX */
+		i__2 = 24, i__3 = (f2cmin(*n1,*n2) << 3) / 100;
+		i__1 = f2cmax(i__2,i__3);
+		nb = f2cmin(i__1,80);
+	    }
 	}
     } else if (s_cmp(c2, "LA", (ftnlen)2, (ftnlen)2) == 0) {
 	if (s_cmp(c3, "UUM", (ftnlen)3, (ftnlen)3) == 0) {
@@ -1042,6 +1067,12 @@ L50:
 	    } else {
 		nb = 64;
 	    }
+	} else if (s_cmp(c3, "TRS", (ftnlen)3, (ftnlen)3) == 0) {
+	    if (sname) {
+		nb = 32;
+	    } else {
+		nb = 32;
+	    }
 	}
     } else if (sname && s_cmp(c2, "ST", (ftnlen)2, (ftnlen)2) == 0) {
 	if (s_cmp(c3, "EBZ", (ftnlen)3, (ftnlen)3) == 0) {
@@ -1093,6 +1124,12 @@ L60:
 	    } else {
 		nbmin = 2;
 	    }
+	} else if (s_cmp(subnam + 3, "QP3RK", (ftnlen)5, (ftnlen)5) == 0) {
+	    if (sname) {
+		nbmin = 2;
+	    } else {
+		nbmin = 2;
+	    }
 	}
     } else if (s_cmp(c2, "SY", (ftnlen)2, (ftnlen)2) == 0) {
 	if (s_cmp(c3, "TRF", (ftnlen)3, (ftnlen)3) == 0) {
@@ -1184,6 +1221,12 @@ L70:
 	    } else {
 		nx = 128;
 	    }
+	} else if (s_cmp(subnam + 3, "QP3RK", (ftnlen)5, (ftnlen)5) == 0) {
+	    if (sname) {
+		nx = 128;
+	    } else {
+		nx = 128;
+	    }
 	}
     } else if (s_cmp(c2, "SY", (ftnlen)2, (ftnlen)2) == 0) {
 	if (sname && s_cmp(c3, "TRD", (ftnlen)3, (ftnlen)3) == 0) {
@@ -1270,29 +1313,29 @@ L130:
 
 L140:
 
-/*     ISPEC = 10: ieee NaN arithmetic can be trusted not to trap */
+/*     ISPEC = 10: ieee infinity and NaN arithmetic can be trusted not to trap */
 
 /*     ILAENV = 0 */
     ret_val = 1;
     if (ret_val == 1) {
-	ret_val = ieeeck_(&c__1, &c_b174, &c_b175);
+	ret_val = ieeeck_(&c__1, &c_b179, &c_b180);
     }
     return ret_val;
 
 L150:
 
-/*     ISPEC = 11: infinity arithmetic can be trusted not to trap */
+/*     ISPEC = 11: ieee infinity arithmetic can be trusted not to trap */
 
 /*     ILAENV = 0 */
     ret_val = 1;
     if (ret_val == 1) {
-	ret_val = ieeeck_(&c__0, &c_b174, &c_b175);
+	ret_val = ieeeck_(&c__0, &c_b179, &c_b180);
     }
     return ret_val;
 
 L160:
 
-/*     12 <= ISPEC <= 16: xHSEQR or related subroutines. */
+/*     12 <= ISPEC <= 17: xHSEQR or related subroutines. */
 
     ret_val = iparmq_(ispec, name__, opts, n1, n2, n3, n4)
 	    ;
diff --git a/lapack-netlib/SRC/ilaenv.f b/lapack-netlib/SRC/ilaenv.f
index a639e0375..e74a2b35e 100644
--- a/lapack-netlib/SRC/ilaenv.f
+++ b/lapack-netlib/SRC/ilaenv.f
@@ -132,7 +132,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup OTHERauxiliary
+*> \ingroup ilaenv
 *
 *> \par Further Details:
 *  =====================
@@ -355,6 +355,12 @@
             ELSE
                NB = 64
             END IF
+         ELSE IF( SUBNAM( 4: 8 ).EQ.'QP3RK' ) THEN
+            IF( SNAME ) THEN
+               NB = 32
+            ELSE
+               NB = 32
+            END IF
          END IF
       ELSE IF( C2.EQ.'PO' ) THEN
          IF( C3.EQ.'TRF' ) THEN
@@ -541,7 +547,14 @@
             ELSE
                NBMIN = 2
             END IF
+         ELSE IF( SUBNAM( 4: 8 ).EQ.'QP3RK' ) THEN
+            IF( SNAME ) THEN
+               NBMIN = 2
+            ELSE
+               NBMIN = 2
+            END IF
          END IF
+
       ELSE IF( C2.EQ.'SY' ) THEN
          IF( C3.EQ.'TRF' ) THEN
             IF( SNAME ) THEN
@@ -618,6 +631,12 @@
             ELSE
                NX = 128
             END IF
+         ELSE IF( SUBNAM( 4: 8 ).EQ.'QP3RK' ) THEN
+            IF( SNAME ) THEN
+               NX = 128
+            ELSE
+               NX = 128
+            END IF
          END IF
       ELSE IF( C2.EQ.'SY' ) THEN
          IF( SNAME .AND. C3.EQ.'TRD' ) THEN
diff --git a/lapack-netlib/SRC/iparam2stage.F b/lapack-netlib/SRC/iparam2stage.F
index c701c2be0..52d507e9d 100644
--- a/lapack-netlib/SRC/iparam2stage.F
+++ b/lapack-netlib/SRC/iparam2stage.F
@@ -89,14 +89,14 @@
 *>
 *> \param[in] NBI
 *> \verbatim
-*>          NBI is INTEGER which is the used in the reduciton, 
+*>          NBI is INTEGER which is used in the reduction,
 *>          (e.g., the size of the band), needed to compute workspace
 *>          and LHOUS2.
 *> \endverbatim
 *>
 *> \param[in] IBI
 *> \verbatim
-*>          IBI is INTEGER which represent the IB of the reduciton,
+*>          IBI is INTEGER which represents the IB of the reduction,
 *>          needed to compute workspace and LHOUS2.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/sbbcsd.f b/lapack-netlib/SRC/sbbcsd.f
index d5720cb33..2a619cb71 100644
--- a/lapack-netlib/SRC/sbbcsd.f
+++ b/lapack-netlib/SRC/sbbcsd.f
@@ -805,7 +805,7 @@
                CALL SLARTGP( B22BULGE, B22E(I-1), WORK(IU2SN+I-1),
      $                       WORK(IU2CS+I-1), R )
             ELSE IF( NU .LT. MU ) THEN
-               CALL SLARTGS( B21E(I), B21E(I+1), NU, WORK(IU2CS+I-1),
+               CALL SLARTGS( B21E(I), B21D(I+1), NU, WORK(IU2CS+I-1),
      $                       WORK(IU2SN+I-1) )
             ELSE
                CALL SLARTGS( B22D(I), B22E(I), MU, WORK(IU2CS+I-1),
diff --git a/lapack-netlib/SRC/sbdsdc.f b/lapack-netlib/SRC/sbdsdc.f
index 18a404497..2a6cc9970 100644
--- a/lapack-netlib/SRC/sbdsdc.f
+++ b/lapack-netlib/SRC/sbdsdc.f
@@ -45,13 +45,6 @@
 *> respectively. SBDSDC can be used to compute all singular values,
 *> and optionally, singular vectors or singular vectors in compact form.
 *>
-*> This code makes very mild assumptions about floating point
-*> arithmetic. It will work on machines with a guard digit in
-*> add/subtract, or on those binary machines without guard digits
-*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
-*> It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.  See SLASD3 for details.
-*>
 *> The code currently calls SLASDQ if singular values only are desired.
 *> However, it can be slightly modified to compute singular values
 *> using the divide and conquer method.
diff --git a/lapack-netlib/SRC/sbdsqr.f b/lapack-netlib/SRC/sbdsqr.f
index c798baaf5..880f0607b 100644
--- a/lapack-netlib/SRC/sbdsqr.f
+++ b/lapack-netlib/SRC/sbdsqr.f
@@ -277,7 +277,7 @@
      $                   MAXITDIVN, NM1, NM12, NM13, OLDLL, OLDM
       REAL               ABSE, ABSS, COSL, COSR, CS, EPS, F, G, H, MU,
      $                   OLDCS, OLDSN, R, SHIFT, SIGMN, SIGMX, SINL,
-     $                   SINR, SLL, SMAX, SMIN, SMINL,  SMINOA,
+     $                   SINR, SLL, SMAX, SMIN, SMINOA,
      $                   SN, THRESH, TOL, TOLMUL, UNFL
 *     ..
 *     .. External Functions ..
@@ -390,7 +390,7 @@
       DO 30 I = 1, N - 1
          SMAX = MAX( SMAX, ABS( E( I ) ) )
    30 CONTINUE
-      SMINL = ZERO
+      SMIN = ZERO
       IF( TOL.GE.ZERO ) THEN
 *
 *        Relative accuracy desired
@@ -450,7 +450,6 @@
       IF( TOL.LT.ZERO .AND. ABS( D( M ) ).LE.THRESH )
      $   D( M ) = ZERO
       SMAX = ABS( D( M ) )
-      SMIN = SMAX
       DO 70 LLL = 1, M - 1
          LL = M - LLL
          ABSS = ABS( D( LL ) )
@@ -459,7 +458,6 @@
      $      D( LL ) = ZERO
          IF( ABSE.LE.THRESH )
      $      GO TO 80
-         SMIN = MIN( SMIN, ABSS )
          SMAX = MAX( SMAX, ABSS, ABSE )
    70 CONTINUE
       LL = 0
@@ -541,14 +539,14 @@
 *           apply convergence criterion forward
 *
             MU = ABS( D( LL ) )
-            SMINL = MU
+            SMIN = MU
             DO 100 LLL = LL, M - 1
                IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN
                   E( LLL ) = ZERO
                   GO TO 60
                END IF
                MU = ABS( D( LLL+1 ) )*( MU / ( MU+ABS( E( LLL ) ) ) )
-               SMINL = MIN( SMINL, MU )
+               SMIN = MIN( SMIN, MU )
   100       CONTINUE
          END IF
 *
@@ -569,14 +567,14 @@
 *           apply convergence criterion backward
 *
             MU = ABS( D( M ) )
-            SMINL = MU
+            SMIN = MU
             DO 110 LLL = M - 1, LL, -1
                IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN
                   E( LLL ) = ZERO
                   GO TO 60
                END IF
                MU = ABS( D( LLL ) )*( MU / ( MU+ABS( E( LLL ) ) ) )
-               SMINL = MIN( SMINL, MU )
+               SMIN = MIN( SMIN, MU )
   110       CONTINUE
          END IF
       END IF
@@ -586,7 +584,7 @@
 *     Compute shift.  First, test if shifting would ruin relative
 *     accuracy, and if so set the shift to zero.
 *
-      IF( TOL.GE.ZERO .AND. N*TOL*( SMINL / SMAX ).LE.
+      IF( TOL.GE.ZERO .AND. N*TOL*( SMIN / SMAX ).LE.
      $    MAX( EPS, HNDRTH*TOL ) ) THEN
 *
 *        Use a zero shift to avoid loss of relative accuracy
diff --git a/lapack-netlib/SRC/sbdsvdx.f b/lapack-netlib/SRC/sbdsvdx.f
index dcd554f1d..d7cb2dc83 100644
--- a/lapack-netlib/SRC/sbdsvdx.f
+++ b/lapack-netlib/SRC/sbdsvdx.f
@@ -45,7 +45,7 @@
 *>
 *>  Given an upper bidiagonal B with diagonal D = [ d_1 d_2 ... d_N ]
 *>  and superdiagonal E = [ e_1 e_2 ... e_N-1 ], SBDSVDX computes the
-*>  singular value decompositon of B through the eigenvalues and
+*>  singular value decomposition of B through the eigenvalues and
 *>  eigenvectors of the N*2-by-N*2 tridiagonal matrix
 *>
 *>        |  0  d_1                |
diff --git a/lapack-netlib/SRC/sgebal.f b/lapack-netlib/SRC/sgebal.f
index f519c8c57..7c115fb6c 100644
--- a/lapack-netlib/SRC/sgebal.f
+++ b/lapack-netlib/SRC/sgebal.f
@@ -153,6 +153,9 @@
 *>
 *>  Modified by Tzu-Yi Chen, Computer Science Division, University of
 *>    California at Berkeley, USA
+*>
+*>  Refactored by Evert Provoost, Department of Computer Science,
+*>    KU Leuven, Belgium
 *> \endverbatim
 *>
 *  =====================================================================
@@ -181,8 +184,8 @@
       PARAMETER          ( FACTOR = 0.95E+0 )
 *     ..
 *     .. Local Scalars ..
-      LOGICAL            NOCONV
-      INTEGER            I, ICA, IEXC, IRA, J, K, L, M
+      LOGICAL            NOCONV, CANSWAP
+      INTEGER            I, ICA, IRA, J, K, L
       REAL               C, CA, F, G, R, RA, S, SFMAX1, SFMAX2, SFMIN1,
      $                   SFMIN2
 *     ..
@@ -197,7 +200,7 @@
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, MIN
-*
+*     ..
 *     Test the input parameters
 *
       INFO = 0
@@ -214,176 +217,192 @@
          RETURN
       END IF
 *
-      K = 1
-      L = N
+*     Quick returns.
 *
-      IF( N.EQ.0 )
-     $   GO TO 210
+      IF( N.EQ.0 ) THEN
+         ILO = 1
+         IHI = 0
+         RETURN
+      END IF
 *
       IF( LSAME( JOB, 'N' ) ) THEN
-         DO 10 I = 1, N
+         DO I = 1, N
             SCALE( I ) = ONE
-   10    CONTINUE
-         GO TO 210
+         END DO
+         ILO = 1
+         IHI = N
+         RETURN
       END IF
 *
-      IF( LSAME( JOB, 'S' ) )
-     $   GO TO 120
-*
-*     Permutation to isolate eigenvalues if possible
-*
-      GO TO 50
-*
-*     Row and column exchange.
-*
-   20 CONTINUE
-      SCALE( M ) = J
-      IF( J.EQ.M )
-     $   GO TO 30
-*
-      CALL SSWAP( L, A( 1, J ), 1, A( 1, M ), 1 )
-      CALL SSWAP( N-K+1, A( J, K ), LDA, A( M, K ), LDA )
-*
-   30 CONTINUE
-      GO TO ( 40, 80 )IEXC
-*
-*     Search for rows isolating an eigenvalue and push them down.
-*
-   40 CONTINUE
-      IF( L.EQ.1 )
-     $   GO TO 210
-      L = L - 1
+*     Permutation to isolate eigenvalues if possible.
 *
-   50 CONTINUE
-      DO 70 J = L, 1, -1
-*
-         DO 60 I = 1, L
-            IF( I.EQ.J )
-     $         GO TO 60
-            IF( A( J, I ).NE.ZERO )
-     $         GO TO 70
-   60    CONTINUE
-*
-         M = L
-         IEXC = 1
-         GO TO 20
-   70 CONTINUE
-*
-      GO TO 90
+      K = 1
+      L = N
 *
-*     Search for columns isolating an eigenvalue and push them left.
+      IF( .NOT.LSAME( JOB, 'S' ) ) THEN
 *
-   80 CONTINUE
-      K = K + 1
+*        Row and column exchange.
 *
-   90 CONTINUE
-      DO 110 J = K, L
+         NOCONV = .TRUE.
+         DO WHILE( NOCONV )
+*
+*           Search for rows isolating an eigenvalue and push them down.
+*
+            NOCONV = .FALSE.
+            DO I = L, 1, -1
+               CANSWAP = .TRUE.
+               DO J = 1, L
+                  IF( I.NE.J .AND. A( I, J ).NE.ZERO ) THEN
+                     CANSWAP = .FALSE.
+                     EXIT
+                  END IF
+               END DO
+*
+               IF( CANSWAP ) THEN
+                  SCALE( L ) = I
+                  IF( I.NE.L ) THEN
+                     CALL SSWAP( L, A( 1, I ), 1, A( 1, L ), 1 )
+                     CALL SSWAP( N-K+1, A( I, K ), LDA, A( L, K ), LDA )
+                  END IF
+                  NOCONV = .TRUE.
+*
+                  IF( L.EQ.1 ) THEN
+                     ILO = 1
+                     IHI = 1
+                     RETURN
+                  END IF
+*
+                  L = L - 1
+               END IF
+            END DO
+*
+         END DO
+
+         NOCONV = .TRUE.
+         DO WHILE( NOCONV )
+*
+*           Search for columns isolating an eigenvalue and push them left.
+*
+            NOCONV = .FALSE.
+            DO J = K, L
+               CANSWAP = .TRUE.
+               DO I = K, L
+                  IF( I.NE.J .AND. A( I, J ).NE.ZERO ) THEN
+                     CANSWAP = .FALSE.
+                     EXIT
+                  END IF
+               END DO
+*
+               IF( CANSWAP ) THEN
+                  SCALE( K ) = J
+                  IF( J.NE.K ) THEN
+                     CALL SSWAP( L, A( 1, J ), 1, A( 1, K ), 1 )
+                     CALL SSWAP( N-K+1, A( J, K ), LDA, A( K, K ), LDA )
+                  END IF
+                  NOCONV = .TRUE.
+*
+                  K = K + 1
+               END IF
+            END DO
+*
+         END DO
 *
-         DO 100 I = K, L
-            IF( I.EQ.J )
-     $         GO TO 100
-            IF( A( I, J ).NE.ZERO )
-     $         GO TO 110
-  100    CONTINUE
+      END IF
 *
-         M = K
-         IEXC = 2
-         GO TO 20
-  110 CONTINUE
+*     Initialize SCALE for non-permuted submatrix.
 *
-  120 CONTINUE
-      DO 130 I = K, L
+      DO I = K, L
          SCALE( I ) = ONE
-  130 CONTINUE
+      END DO
 *
-      IF( LSAME( JOB, 'P' ) )
-     $   GO TO 210
+*     If we only had to permute, we are done.
+*
+      IF( LSAME( JOB, 'P' ) ) THEN
+         ILO = K
+         IHI = L
+         RETURN
+      END IF
 *
 *     Balance the submatrix in rows K to L.
 *
-*     Iterative loop for norm reduction
+*     Iterative loop for norm reduction.
 *
       SFMIN1 = SLAMCH( 'S' ) / SLAMCH( 'P' )
       SFMAX1 = ONE / SFMIN1
       SFMIN2 = SFMIN1*SCLFAC
       SFMAX2 = ONE / SFMIN2
-  140 CONTINUE
-      NOCONV = .FALSE.
-*
-      DO 200 I = K, L
-*
-         C = SNRM2( L-K+1, A( K, I ), 1 )
-         R = SNRM2( L-K+1, A( I, K ), LDA )
-         ICA = ISAMAX( L, A( 1, I ), 1 )
-         CA = ABS( A( ICA, I ) )
-         IRA = ISAMAX( N-K+1, A( I, K ), LDA )
-         RA = ABS( A( I, IRA+K-1 ) )
-*
-*        Guard against zero C or R due to underflow.
-*
-         IF( C.EQ.ZERO .OR. R.EQ.ZERO )
-     $      GO TO 200
-         G = R / SCLFAC
-         F = ONE
-         S = C + R
-  160    CONTINUE
-         IF( C.GE.G .OR. MAX( F, C, CA ).GE.SFMAX2 .OR.
-     $       MIN( R, G, RA ).LE.SFMIN2 )GO TO 170
-         F = F*SCLFAC
-         C = C*SCLFAC
-         CA = CA*SCLFAC
-         R = R / SCLFAC
-         G = G / SCLFAC
-         RA = RA / SCLFAC
-         GO TO 160
-*
-  170    CONTINUE
-         G = C / SCLFAC
-  180    CONTINUE
-         IF( G.LT.R .OR. MAX( R, RA ).GE.SFMAX2 .OR.
-     $       MIN( F, C, G, CA ).LE.SFMIN2 )GO TO 190
-            IF( SISNAN( C+F+CA+R+G+RA ) ) THEN
 *
-*           Exit if NaN to avoid infinite loop
+      NOCONV = .TRUE.
+      DO WHILE( NOCONV )
+         NOCONV = .FALSE.
 *
-            INFO = -3
-            CALL XERBLA( 'SGEBAL', -INFO )
-            RETURN
-         END IF
-         F = F / SCLFAC
-         C = C / SCLFAC
-         G = G / SCLFAC
-         CA = CA / SCLFAC
-         R = R*SCLFAC
-         RA = RA*SCLFAC
-         GO TO 180
-*
-*        Now balance.
-*
-  190    CONTINUE
-         IF( ( C+R ).GE.FACTOR*S )
-     $      GO TO 200
-         IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN
-            IF( F*SCALE( I ).LE.SFMIN1 )
-     $         GO TO 200
-         END IF
-         IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN
-            IF( SCALE( I ).GE.SFMAX1 / F )
-     $         GO TO 200
-         END IF
-         G = ONE / F
-         SCALE( I ) = SCALE( I )*F
-         NOCONV = .TRUE.
+         DO I = K, L
+*
+            C = SNRM2( L-K+1, A( K, I ), 1 )
+            R = SNRM2( L-K+1, A( I, K ), LDA )
+            ICA = ISAMAX( L, A( 1, I ), 1 )
+            CA = ABS( A( ICA, I ) )
+            IRA = ISAMAX( N-K+1, A( I, K ), LDA )
+            RA = ABS( A( I, IRA+K-1 ) )
 *
-         CALL SSCAL( N-K+1, G, A( I, K ), LDA )
-         CALL SSCAL( L, F, A( 1, I ), 1 )
+*           Guard against zero C or R due to underflow.
 *
-  200 CONTINUE
+            IF( C.EQ.ZERO .OR. R.EQ.ZERO ) CYCLE
+*
+*           Exit if NaN to avoid infinite loop
 *
-      IF( NOCONV )
-     $   GO TO 140
+            IF( SISNAN( C+CA+R+RA ) ) THEN
+               INFO = -3
+               CALL XERBLA( 'SGEBAL', -INFO )
+               RETURN
+            END IF
+*
+            G = R / SCLFAC
+            F = ONE
+            S = C + R
+*
+            DO WHILE( C.LT.G .AND. MAX( F, C, CA ).LT.SFMAX2 .AND.
+     $                MIN( R, G, RA ).GT.SFMIN2 )
+               F = F*SCLFAC
+               C = C*SCLFAC
+               CA = CA*SCLFAC
+               R = R / SCLFAC
+               G = G / SCLFAC
+               RA = RA / SCLFAC
+            END DO
+*
+            G = C / SCLFAC
+*
+            DO WHILE( G.GE.R .AND. MAX( R, RA ).LT.SFMAX2 .AND.
+     $                MIN( F, C, G, CA ).GT.SFMIN2 )
+               F = F / SCLFAC
+               C = C / SCLFAC
+               G = G / SCLFAC
+               CA = CA / SCLFAC
+               R = R*SCLFAC
+               RA = RA*SCLFAC
+            END DO
+*
+*           Now balance.
+*
+            IF( ( C+R ).GE.FACTOR*S ) CYCLE
+            IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN
+               IF( F*SCALE( I ).LE.SFMIN1 ) CYCLE
+            END IF
+            IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN
+               IF( SCALE( I ).GE.SFMAX1 / F ) CYCLE
+            END IF
+            G = ONE / F
+            SCALE( I ) = SCALE( I )*F
+            NOCONV = .TRUE.
+*
+            CALL SSCAL( N-K+1, G, A( I, K ), LDA )
+            CALL SSCAL( L, F, A( 1, I ), 1 )
+*
+         END DO
+*
+      END DO
 *
-  210 CONTINUE
       ILO = K
       IHI = L
 *
diff --git a/lapack-netlib/SRC/sgebrd.f b/lapack-netlib/SRC/sgebrd.f
index 08701164c..b33ad0b1f 100644
--- a/lapack-netlib/SRC/sgebrd.f
+++ b/lapack-netlib/SRC/sgebrd.f
@@ -122,7 +122,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of the array WORK.  LWORK >= max(1,M,N).
+*>          The length of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(M,N), otherwise.
 *>          For optimum performance LWORK >= (M+N)*NB, where NB
 *>          is the optimal blocksize.
 *>
@@ -147,7 +148,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEcomputational
+*> \ingroup gebrd
 *
 *> \par Further Details:
 *  =====================
@@ -223,27 +224,35 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IINFO, J, LDWRKX, LDWRKY, LWKOPT, MINMN, NB,
-     $                   NBMIN, NX, WS
+      INTEGER            I, IINFO, J, LDWRKX, LDWRKY, LWKMIN, LWKOPT,
+     $                   MINMN, NB, NBMIN, NX, WS
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGEBD2, SGEMM, SLABRD, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          MAX, MIN, REAL
+      INTRINSIC          MAX, MIN
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
 *     Test the input parameters
 *
       INFO = 0
-      NB = MAX( 1, ILAENV( 1, 'SGEBRD', ' ', M, N, -1, -1 ) )
-      LWKOPT = ( M+N )*NB
-      WORK( 1 ) = REAL( LWKOPT )
+      MINMN = MIN( M, N )
+      IF( MINMN.EQ.0 ) THEN
+         LWKMIN = 1
+         LWKOPT = 1
+      ELSE
+         LWKMIN = MAX( M, N )
+         NB = MAX( 1, ILAENV( 1, 'SGEBRD', ' ', M, N, -1, -1 ) )
+         LWKOPT = ( M+N )*NB
+      ENDIF
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -251,7 +260,7 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, M, N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -10
       END IF
       IF( INFO.LT.0 ) THEN
@@ -263,7 +272,6 @@
 *
 *     Quick return if possible
 *
-      MINMN = MIN( M, N )
       IF( MINMN.EQ.0 ) THEN
          WORK( 1 ) = 1
          RETURN
@@ -282,7 +290,7 @@
 *        Determine when to switch from blocked to unblocked code.
 *
          IF( NX.LT.MINMN ) THEN
-            WS = ( M+N )*NB
+            WS = LWKOPT
             IF( LWORK.LT.WS ) THEN
 *
 *              Not enough work space for the optimal NB, consider using
@@ -341,7 +349,8 @@
 *
       CALL SGEBD2( M-I+1, N-I+1, A( I, I ), LDA, D( I ), E( I ),
      $             TAUQ( I ), TAUP( I ), WORK, IINFO )
-      WORK( 1 ) = WS
+*
+      WORK( 1 ) = SROUNDUP_LWORK( WS )
       RETURN
 *
 *     End of SGEBRD
diff --git a/lapack-netlib/SRC/sgecon.f b/lapack-netlib/SRC/sgecon.f
index 86aeea73b..82f463ebb 100644
--- a/lapack-netlib/SRC/sgecon.f
+++ b/lapack-netlib/SRC/sgecon.f
@@ -105,8 +105,15 @@
 *> \verbatim
 *>          INFO is INTEGER
 *>          = 0:  successful exit
-*>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          =-5:  if ANORM is NAN or negative.
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value.
+*>                NaNs are illegal values for ANORM, and they propagate to
+*>                the output parameter RCOND.
+*>                Infinity is illegal for ANORM, and it propagates to the output
+*>                parameter RCOND as 0.
+*>          = 1:  if RCOND = NaN, or
+*>                   RCOND = Inf, or
+*>                   the computed norm of the inverse of A is 0.
+*>                In the latter, RCOND = 0 is returned.
 *> \endverbatim
 *
 *  Authors:
@@ -117,7 +124,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEcomputational
+*> \ingroup gecon
 *
 *  =====================================================================
       SUBROUTINE SGECON( NORM, N, A, LDA, ANORM, RCOND, WORK, IWORK,
@@ -147,7 +154,7 @@
       LOGICAL            ONENRM
       CHARACTER          NORMIN
       INTEGER            IX, KASE, KASE1
-      REAL               AINVNM, SCALE, SL, SMLNUM, SU
+      REAL               AINVNM, SCALE, SL, SMLNUM, SU, HUGEVAL
 *     ..
 *     .. Local Arrays ..
       INTEGER            ISAVE( 3 )
@@ -165,6 +172,8 @@
       INTRINSIC          ABS, MAX
 *     ..
 *     .. Executable Statements ..
+*
+      HUGEVAL = SLAMCH( 'Overflow' )
 *
 *     Test the input parameters.
 *
@@ -176,7 +185,7 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF( ANORM.LT.ZERO .OR. SISNAN( ANORM ) ) THEN
+      ELSE IF( ANORM.LT.ZERO ) THEN
          INFO = -5
       END IF
       IF( INFO.NE.0 ) THEN
@@ -192,6 +201,13 @@
          RETURN
       ELSE IF( ANORM.EQ.ZERO ) THEN
          RETURN
+      ELSE IF( SISNAN( ANORM ) ) THEN
+         RCOND = ANORM
+         INFO = -5
+         RETURN
+      ELSE IF( ANORM.GT.HUGEVAL ) THEN
+         INFO = -5
+         RETURN
       END IF
 *
       SMLNUM = SLAMCH( 'Safe minimum' )
@@ -248,8 +264,17 @@
 *
 *     Compute the estimate of the reciprocal condition number.
 *
-      IF( AINVNM.NE.ZERO )
-     $   RCOND = ( ONE / AINVNM ) / ANORM
+      IF( AINVNM.NE.ZERO ) THEN
+         RCOND = ( ONE / AINVNM ) / ANORM
+      ELSE
+         INFO = 1
+         RETURN
+      END IF
+*
+*     Check for NaNs and Infs
+*
+      IF( SISNAN( RCOND ) .OR. RCOND.GT.HUGEVAL )
+     $   INFO = 1
 *
    20 CONTINUE
       RETURN
diff --git a/lapack-netlib/SRC/sgedmd.c b/lapack-netlib/SRC/sgedmd.c
new file mode 100644
index 000000000..c8f3a5964
--- /dev/null
+++ b/lapack-netlib/SRC/sgedmd.c
@@ -0,0 +1,1746 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER /* MSVC branch: _Fcomplex/_Dcomplex stand in for the C99 _Complex types */
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} /* value copy of *z as a native float complex */
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} /* value copy of *z as a native double complex */
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} /* same storage viewed through a native pointer (cast only) */
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} /* same storage viewed through a native pointer (cast only) */
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} /* value of *z as r + i*I */
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} /* value of *z as r + i*I */
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} /* same storage viewed through a native pointer (cast only) */
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} /* same storage viewed through a native pointer (cast only) */
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+/* spow_ui: x raised to the integer power n by repeated squaring; n < 0 uses 1/x. */
+static float spow_ui(float x, integer n) {
+	float acc = 1.0; unsigned long int bits;
+	if(n == 0) return acc;
+	if(n < 0) { n = -n; x = 1/x; }
+	for(bits = n; ; ) {
+		if(bits & 01) acc *= x;	/* low bit set: fold the current power of x in */
+		if((bits >>= 1) == 0) break;
+		x *= x;
+	}
+	return acc;
+}
+/* dpow_ui: double-precision x**n for integer n (binary exponentiation; n < 0 uses 1/x). */
+static double dpow_ui(double x, integer n) {
+	double result = 1.0; unsigned long int e;
+	if(n == 0) return result;
+	if(n < 0) { n = -n; x = 1/x; }
+	for(e = n; e != 0; ) {
+		if(e & 01) result *= x;
+		e >>= 1;
+		if(e != 0) x *= x;	/* square only while exponent bits remain */
+	}
+	return result;
+}
+/* cpow_ui: x**n for a single-precision complex x and integer n, by repeated squaring; n < 0 starts from 1/x. */
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) { /* NOTE(review): pow_ci passes Cf(a), an _Fcomplex - confirm the parameter type */
+	complex pow={1.0,0.0}; unsigned long int u; real t, d;
+	if(n != 0) {
+		if(n < 0) n = -n, d = x.r*x.r + x.i*x.i, x.r = x.r/d, x.i = -x.i/d; /* 1/x = conj(x)/|x|^2 */
+		for(u = n; ; ) {
+			if(u & 01) t = pow.r*x.r - pow.i*x.i, pow.i = pow.r*x.i + pow.i*x.r, pow.r = t; /* pow *= x (full complex product) */
+			if(u >>= 1) t = x.r*x.r - x.i*x.i, x.i = 2*x.r*x.i, x.r = t; /* x *= x */
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i}; return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) {
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+/* zpow_ui: x**n for a double-precision complex x and integer n, by repeated squaring; n < 0 starts from 1/x. */
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
+	_Dcomplex pow={1.0,0.0}; unsigned long int u; double t, d;
+	if(n != 0) {
+		if(n < 0) n = -n, d = x._Val[0]*x._Val[0] + x._Val[1]*x._Val[1], x._Val[0] = x._Val[0]/d, x._Val[1] = -x._Val[1]/d; /* 1/x = conj(x)/|x|^2 */
+		for(u = n; ; ) {
+			if(u & 01) t = pow._Val[0]*x._Val[0] - pow._Val[1]*x._Val[1], pow._Val[1] = pow._Val[0]*x._Val[1] + pow._Val[1]*x._Val[0], pow._Val[0] = t; /* pow *= x */
+			if(u >>= 1) t = x._Val[0]*x._Val[0] - x._Val[1]*x._Val[1], x._Val[1] = 2*x._Val[0]*x._Val[1], x._Val[0] = t; /* x *= x */
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]}; return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) {
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) { /* integer power x**n, including n <= 0 */
+	integer pow; unsigned long int u;
+	if (n <= 0) {
+		if (n == 0 || x == 1) pow = 1; /* x**0 and 1**n are both 1 */
+		else if (x != -1) pow = x == 0 ? 1/x : 0; /* n < 0: result is 0 unless x == 0, where 1/x is an integer division by zero */
+		else n = -n; /* x == -1 with n < 0: continue below with the exponent -n */
+	}
+	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { /* not entered when pow was already assigned above */
+		u = n;
+		for(pow = 1; ; ) { /* square-and-multiply over the bits of u */
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{	/* 1-based position, counted from s, of the first largest entry among w[s-1] .. w[e-1] */
+	integer pos, best = s; double top = w[s-1]; /* n is accepted but never read */
+	for(pos = s+1; pos <= e; pos++)
+		if (w[pos-1] > top) { best = pos; top = w[pos-1]; }
+	return best-s+1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{	/* 1-based position, counted from s, of the first largest entry among w[s-1] .. w[e-1] */
+	integer pos, best = s; float top = w[s-1]; /* n is accepted but never read */
+	for(pos = s+1; pos <= e; pos++)
+		if (w[pos-1] > top) { best = pos; top = w[pos-1]; }
+	return best-s+1;
+}
+/* cdotc_: conjugated dot product, *z = sum over i = 0..n-1 of conj(x[i*incx]) * y[i*incy].
+   n_, incx_ and incy_ are passed by pointer; elements are indexed upward from x and y,
+   so the increments act as plain strides.
+   The _MSC_VER branch forms the real and imaginary parts of each product explicitly. */
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc += conj(x[i]) * y[i] */
+			zdotc._Val[0] += x[i].r * y[i].r + x[i].i * y[i].i;
+			zdotc._Val[1] += x[i].r * y[i].i - x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) { /* same sum with strides incx, incy */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r + x[i*incx].i * y[i*incy].i;
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i - x[i*incx].i * y[i*incy].r;
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+	} else {
+		for (i=0;i<n;i++) zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+	}
+	pCf(z) = zdotc;
+}
+#endif
+/* zdotc_: conjugated dot product, *z = sum over i = 0..n-1 of conj(x[i*incx]) * y[i*incy].
+   n_, incx_ and incy_ are passed by pointer; elements are indexed upward from x and y,
+   so the increments act as plain strides.
+   The _MSC_VER branch forms the real and imaginary parts of each product explicitly. */
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc += conj(x[i]) * y[i] */
+			zdotc._Val[0] += x[i].r * y[i].r + x[i].i * y[i].i;
+			zdotc._Val[1] += x[i].r * y[i].i - x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) { /* same sum with strides incx, incy */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r + x[i*incx].i * y[i*incy].i;
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i - x[i*incx].i * y[i*incy].r;
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+	} else {
+		for (i=0;i<n;i++) zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/* cdotu_: unconjugated dot product, *z = sum over i = 0..n-1 of x[i*incx] * y[i*incy].
+   n_, incx_ and incy_ are passed by pointer; elements are indexed upward from x and y,
+   so the increments act as plain strides.
+   The _MSC_VER branch forms the real and imaginary parts of each product explicitly. */
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc += x[i] * y[i] (no conjugation) */
+			zdotc._Val[0] += x[i].r * y[i].r - x[i].i * y[i].i;
+			zdotc._Val[1] += x[i].r * y[i].i + x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) { /* same sum with strides incx, incy */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r - x[i*incx].i * y[i*incy].i;
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i + x[i*incx].i * y[i*incy].r;
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) zdotc += Cf(&x[i]) * Cf(&y[i]);
+	} else {
+		for (i=0;i<n;i++) zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+	}
+	pCf(z) = zdotc;
+}
+#endif
+/* zdotu_: unconjugated dot product, *z = sum over i = 0..n-1 of x[i*incx] * y[i*incy].
+   n_, incx_ and incy_ are passed by pointer; elements are indexed upward from x and y,
+   so the increments act as plain strides.
+   The _MSC_VER branch forms the real and imaginary parts of each product explicitly. */
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc += x[i] * y[i] (no conjugation) */
+			zdotc._Val[0] += x[i].r * y[i].r - x[i].i * y[i].i;
+			zdotc._Val[1] += x[i].r * y[i].i + x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) { /* same sum with strides incx, incy */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r - x[i*incx].i * y[i*incy].i;
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i + x[i*incx].i * y[i*incy].r;
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) zdotc += Cd(&x[i]) * Cd(&y[i]);
+	} else {
+		for (i=0;i<n;i++) zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static integer c_n1 = -1;
+static integer c__1 = 1;
+static integer c__0 = 0;
+static integer c__2 = 2;
+
+/* Subroutine */ int sgedmd_(char *jobs, char *jobz, char *jobr, char *jobf, 
+	integer *whtsvd, integer *m, integer *n, real *x, integer *ldx, real *
+	y, integer *ldy, integer *nrnk, real *tol, integer *k, real *reig, 
+	real *imeig, real *z__, integer *ldz, real *res, real *b, integer *
+	ldb, real *w, integer *ldw, real *s, integer *lds, real *work, 
+	integer *lwork, integer *iwork, integer *liwork, integer *info)
+{
+    /* System generated locals */
+    integer x_dim1, x_offset, y_dim1, y_offset, z_dim1, z_offset, b_dim1, 
+	    b_offset, w_dim1, w_offset, s_dim1, s_offset, i__1, i__2;
+    real r__1, r__2;
+
+    /* Local variables */
+    real zero, ssum;
+    integer info1, info2;
+    real xscl1, xscl2;
+    extern real snrm2_(integer *, real *, integer *);
+    integer i__, j;
+    real scale;
+    extern logical lsame_(char *, char *);
+    extern /* Subroutine */ int sscal_(integer *, real *, real *, integer *);
+    logical badxy;
+    real small;
+    extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, 
+	    integer *, real *, real *, integer *, real *, integer *, real *, 
+	    real *, integer *), sgeev_(char *, char *, 
+	    integer *, real *, integer *, real *, real *, real *, integer *, 
+	    real *, integer *, real *, integer *, integer *);
+    char jobzl[1];
+    extern /* Subroutine */ int saxpy_(integer *, real *, real *, integer *, 
+	    real *, integer *);
+    logical wntex;
+    real ab[4]	/* was [2][2] */;
+    extern real slamch_(char *), slange_(char *, integer *, integer *,
+	     real *, integer *, real *);
+    extern /* Subroutine */ int sgesdd_(char *, integer *, integer *, real *, 
+	    integer *, real *, real *, integer *, real *, integer *, real *, 
+	    integer *, integer *, integer *), xerbla_(char *, integer 
+	    *);
+    char t_or_n__[1];
+    extern /* Subroutine */ int slascl_(char *, integer *, integer *, real *, 
+	    real *, integer *, integer *, real *, integer *, integer *);
+    extern integer isamax_(integer *, real *, integer *);
+    logical sccolx, sccoly;
+    extern logical sisnan_(real *);
+    extern /* Subroutine */ int sgesvd_(char *, char *, integer *, integer *, 
+	    real *, integer *, real *, real *, integer *, real *, integer *, 
+	    real *, integer *, integer *);
+    integer lwrsdd, mwrsdd;
+    extern /* Subroutine */ int sgejsv_(char *, char *, char *, char *, char *
+	    , char *, integer *, integer *, real *, integer *, real *, real *,
+	     integer *, real *, integer *, real *, integer *, integer *, 
+	    integer *), 
+	    slacpy_(char *, integer *, integer *, real *, integer *, real *, 
+	    integer *);
+    integer iminwr;
+    logical wntref, wntvec;
+    real rootsc;
+    integer lwrkev, mlwork, mwrkev, numrnk, olwork;
+    real rdummy[2];
+    integer lwrsvd, mwrsvd;
+    logical lquery, wntres;
+    char jsvopt[1];
+    extern /* Subroutine */ int slassq_(integer *, real *, integer *, real *, 
+	    real *), mecago_();
+    integer mwrsvj, lwrsvq, mwrsvq;
+    real rdummy2[2], ofl, one;
+    extern /* Subroutine */ int sgesvdq_(char *, char *, char *, char *, char 
+	    *, integer *, integer *, real *, integer *, real *, real *, 
+	    integer *, real *, integer *, integer *, integer *, integer *, 
+	    real *, integer *, real *, integer *, integer *);
+
+/* March 2023 */
+/* ..... */
+/*      USE                   iso_fortran_env */
+/*      INTEGER, PARAMETER :: WP = real32 */
+/* ..... */
+/*     Scalar arguments */
+/*     Array arguments */
+/* ............................................................ */
+/*     Purpose */
+/*     ======= */
+/*     SGEDMD computes the Dynamic Mode Decomposition (DMD) for */
+/*     a pair of data snapshot matrices. For the input matrices */
+/*     X and Y such that Y = A*X with an inaccessible matrix */
+/*     A, SGEDMD computes a certain number of Ritz pairs of A using */
+/*     the standard Rayleigh-Ritz extraction from a subspace of */
+/*     range(X) that is determined using the leading left singular */
+/*     vectors of X. Optionally, SGEDMD returns the residuals */
+/*     of the computed Ritz pairs, the information needed for */
+/*     a refinement of the Ritz vectors, or the eigenvectors of */
+/*     the Exact DMD. */
+/*     For further details see the references listed */
+/*     below. For more details of the implementation see [3]. */
+
+/*     References */
+/*     ========== */
+/*     [1] P. Schmid: Dynamic mode decomposition of numerical */
+/*         and experimental data, */
+/*         Journal of Fluid Mechanics 656, 5-28, 2010. */
+/*     [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal */
+/*         decompositions: analysis and enhancements, */
+/*         SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. */
+/*     [3] Z. Drmac: A LAPACK implementation of the Dynamic */
+/*         Mode Decomposition I. Technical report. AIMDyn Inc. */
+/*         and LAPACK Working Note 298. */
+/*     [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. */
+/*         Brunton, N. Kutz: On Dynamic Mode Decomposition: */
+/*         Theory and Applications, Journal of Computational */
+/*         Dynamics 1(2), 391 -421, 2014. */
+
+/* ...................................................................... */
+/*     Developed and supported by: */
+/*     =========================== */
+/*     Developed and coded by Zlatko Drmac, Faculty of Science, */
+/*     University of Zagreb;  drmac@math.hr */
+/*     In cooperation with */
+/*     AIMdyn Inc., Santa Barbara, CA. */
+/*     and supported by */
+/*     - DARPA SBIR project "Koopman Operator-Based Forecasting */
+/*     for Nonstationary Processes from Near-Term, Limited */
+/*     Observational Data" Contract No: W31P4Q-21-C-0007 */
+/*     - DARPA PAI project "Physics-Informed Machine Learning */
+/*     Methodologies" Contract No: HR0011-18-9-0033 */
+/*     - DARPA MoDyL project "A Data-Driven, Operator-Theoretic */
+/*     Framework for Space-Time Analysis of Process Dynamics" */
+/*     Contract No: HR0011-16-C-0116 */
+/*     Any opinions, findings and conclusions or recommendations */
+/*     expressed in this material are those of the author and */
+/*     do not necessarily reflect the views of the DARPA SBIR */
+/*     Program Office */
+/* ============================================================ */
+/*     Distribution Statement A: */
+/*     Approved for Public Release, Distribution Unlimited. */
+/*     Cleared by DARPA on September 29, 2022 */
+/* ============================================================ */
+/* ...................................................................... */
+/*     Arguments */
+/*     ========= */
+/*     JOBS (input) CHARACTER*1 */
+/*     Determines whether the initial data snapshots are scaled */
+/*     by a diagonal matrix. */
+/*     'S' :: The data snapshots matrices X and Y are multiplied */
+/*            with a diagonal matrix D so that X*D has unit */
+/*            nonzero columns (in the Euclidean 2-norm) */
+/*     'C' :: The snapshots are scaled as with the 'S' option. */
+/*            If it is found that an i-th column of X is zero */
+/*            vector and the corresponding i-th column of Y is */
+/*            non-zero, then the i-th column of Y is set to */
+/*            zero and a warning flag is raised. */
+/*     'Y' :: The data snapshots matrices X and Y are multiplied */
+/*            by a diagonal matrix D so that Y*D has unit */
+/*            nonzero columns (in the Euclidean 2-norm) */
+/*     'N' :: No data scaling. */
+/* ..... */
+/*     JOBZ (input) CHARACTER*1 */
+/*     Determines whether the eigenvectors (Koopman modes) will */
+/*     be computed. */
+/*     'V' :: The eigenvectors (Koopman modes) will be computed */
+/*            and returned in the matrix Z. */
+/*            See the description of Z. */
+/*     'F' :: The eigenvectors (Koopman modes) will be returned */
+/*            in factored form as the product X(:,1:K)*W, where X */
+/*            contains a POD basis (leading left singular vectors */
+/*            of the data matrix X) and W contains the eigenvectors */
+/*            of the corresponding Rayleigh quotient. */
+/*            See the descriptions of K, X, W, Z. */
+/*     'N' :: The eigenvectors are not computed. */
+/* ..... */
+/*     JOBR (input) CHARACTER*1 */
+/*     Determines whether to compute the residuals. */
+/*     'R' :: The residuals for the computed eigenpairs will be */
+/*            computed and stored in the array RES. */
+/*            See the description of RES. */
+/*            For this option to be legal, JOBZ must be 'V'. */
+/*     'N' :: The residuals are not computed. */
+/* ..... */
+/*     JOBF (input) CHARACTER*1 */
+/*     Specifies whether to store information needed for post- */
+/*     processing (e.g. computing refined Ritz vectors) */
+/*     'R' :: The matrix needed for the refinement of the Ritz */
+/*            vectors is computed and stored in the array B. */
+/*            See the description of B. */
+/*     'E' :: The unscaled eigenvectors of the Exact DMD are */
+/*            computed and returned in the array B. See the */
+/*            description of B. */
+/*     'N' :: No eigenvector refinement data is computed. */
+/* ..... */
+/*     WHTSVD (input) INTEGER, WHTSVD in { 1, 2, 3, 4 } */
+/*     Allows for a selection of the SVD algorithm from the */
+/*     LAPACK library. */
+/*     1 :: SGESVD (the QR SVD algorithm) */
+/*     2 :: SGESDD (the Divide and Conquer algorithm; if enough */
+/*          workspace available, this is the fastest option) */
+/*     3 :: SGESVDQ (the preconditioned QR SVD  ; this and 4 */
+/*          are the most accurate options) */
+/*     4 :: SGEJSV (the preconditioned Jacobi SVD; this and 3 */
+/*          are the most accurate options) */
+/*     For the four methods above, a significant difference in */
+/*     the accuracy of small singular values is possible if */
+/*     the snapshots vary in norm so that X is severely */
+/*     ill-conditioned. If small (smaller than EPS*||X||) */
+/*     singular values are of interest and JOBS=='N',  then */
+/*     the options (3, 4) give the most accurate results, where */
+/*     the option 4 is slightly better and with stronger */
+/*     theoretical background. */
+/*     If JOBS=='S', i.e. the columns of X will be normalized, */
+/*     then all methods give nearly equally accurate results. */
+/* ..... */
+/*     M (input) INTEGER, M>= 0 */
+/*     The state space dimension (the row dimension of X, Y). */
+/* ..... */
+/*     N (input) INTEGER, 0 <= N <= M */
+/*     The number of data snapshot pairs */
+/*     (the number of columns of X and Y). */
+/* ..... */
+/*     X (input/output) REAL(KIND=WP) M-by-N array */
+/*     > On entry, X contains the data snapshot matrix X. It is */
+/*     assumed that the column norms of X are in the range of */
+/*     the normalized floating point numbers. */
+/*     < On exit, the leading K columns of X contain a POD basis, */
+/*     i.e. the leading K left singular vectors of the input */
+/*     data matrix X, U(:,1:K). All N columns of X contain all */
+/*     left singular vectors of the input matrix X. */
+/*     See the descriptions of K, Z and W. */
+/* ..... */
+/*     LDX (input) INTEGER, LDX >= M */
+/*     The leading dimension of the array X. */
+/* ..... */
+/*     Y (input/workspace/output) REAL(KIND=WP) M-by-N array */
+/*     > On entry, Y contains the data snapshot matrix Y */
+/*     < On exit, */
+/*     If JOBR == 'R', the leading K columns of Y  contain */
+/*     the residual vectors for the computed Ritz pairs. */
+/*     See the description of RES. */
+/*     If JOBR == 'N', Y contains the original input data, */
+/*                     scaled according to the value of JOBS. */
+/* ..... */
+/*     LDY (input) INTEGER , LDY >= M */
+/*     The leading dimension of the array Y. */
+/* ..... */
+/*     NRNK (input) INTEGER */
+/*     Determines the mode how to compute the numerical rank, */
+/*     i.e. how to truncate small singular values of the input */
+/*     matrix X. On input, if */
+/*     NRNK = -1 :: i-th singular value sigma(i) is truncated */
+/*                  if sigma(i) <= TOL*sigma(1) */
+/*                  This option is recommended. */
+/*     NRNK = -2 :: i-th singular value sigma(i) is truncated */
+/*                  if sigma(i) <= TOL*sigma(i-1) */
+/*                  This option is included for R&D purposes. */
+/*                  It requires highly accurate SVD, which */
+/*                  may not be feasible. */
+/*     The numerical rank can be enforced by using positive */
+/*     value of NRNK as follows: */
+/*     0 < NRNK <= N :: at most NRNK largest singular values */
+/*     will be used. If the number of the computed nonzero */
+/*     singular values is less than NRNK, then only those */
+/*     nonzero values will be used and the actually used */
+/*     dimension is less than NRNK. The actual number of */
+/*     the nonzero singular values is returned in the variable */
+/*     K. See the descriptions of TOL and  K. */
+/* ..... */
+/*     TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */
+/*     The tolerance for truncating small singular values. */
+/*     See the description of NRNK. */
+/* ..... */
+/*     K (output) INTEGER,  0 <= K <= N */
+/*     The dimension of the POD basis for the data snapshot */
+/*     matrix X and the number of the computed Ritz pairs. */
+/*     The value of K is determined according to the rule set */
+/*     by the parameters NRNK and TOL. */
+/*     See the descriptions of NRNK and TOL. */
+/* ..... */
+/*     REIG (output) REAL(KIND=WP) N-by-1 array */
+/*     The leading K (K<=N) entries of REIG contain */
+/*     the real parts of the computed eigenvalues */
+/*     REIG(1:K) + sqrt(-1)*IMEIG(1:K). */
+/*     See the descriptions of K, IMEIG, and Z. */
+/* ..... */
+/*     IMEIG (output) REAL(KIND=WP) N-by-1 array */
+/*     The leading K (K<=N) entries of IMEIG contain */
+/*     the imaginary parts of the computed eigenvalues */
+/*     REIG(1:K) + sqrt(-1)*IMEIG(1:K). */
+/*     The eigenvalues are determined as follows: */
+/*     If IMEIG(i) == 0, then the corresponding eigenvalue is */
+/*     real, LAMBDA(i) = REIG(i). */
+/*     If IMEIG(i)>0, then the corresponding complex */
+/*     conjugate pair of eigenvalues reads */
+/*     LAMBDA(i)   = REIG(i) + sqrt(-1)*IMEIG(i) */
+/*     LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMEIG(i) */
+/*     That is, complex conjugate pairs have consecutive */
+/*     indices (i,i+1), with the positive imaginary part */
+/*     listed first. */
+/*     See the descriptions of K, REIG, and Z. */
+/* ..... */
+/*     Z (workspace/output) REAL(KIND=WP)  M-by-N array */
+/*     If JOBZ =='V' then */
+/*        Z contains real Ritz vectors as follows: */
+/*        If IMEIG(i)=0, then Z(:,i) is an eigenvector of */
+/*        the i-th Ritz value; ||Z(:,i)||_2=1. */
+/*        If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then */
+/*        [Z(:,i) Z(:,i+1)] span an invariant subspace and */
+/*        the Ritz values extracted from this subspace are */
+/*        REIG(i) + sqrt(-1)*IMEIG(i) and */
+/*        REIG(i) - sqrt(-1)*IMEIG(i). */
+/*        The corresponding eigenvectors are */
+/*        Z(:,i) + sqrt(-1)*Z(:,i+1) and */
+/*        Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. */
+/*        || Z(:,i:i+1)||_F = 1. */
+/*     If JOBZ == 'F', then the above descriptions hold for */
+/*     the columns of X(:,1:K)*W(1:K,1:K), where the columns */
+/*     of W(1:k,1:K) are the computed eigenvectors of the */
+/*     K-by-K Rayleigh quotient. The columns of W(1:K,1:K) */
+/*     are similarly structured: If IMEIG(i) == 0 then */
+/*     X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0 */
+/*     then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and */
+/*          X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1) */
+/*     are the eigenvectors of LAMBDA(i), LAMBDA(i+1). */
+/*     See the descriptions of REIG, IMEIG, X and W. */
+/* ..... */
+/*     LDZ (input) INTEGER , LDZ >= M */
+/*     The leading dimension of the array Z. */
+/* ..... */
+/*     RES (output) REAL(KIND=WP) N-by-1 array */
+/*     RES(1:K) contains the residuals for the K computed */
+/*     Ritz pairs. */
+/*     If LAMBDA(i) is real, then */
+/*        RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i) ||_2. */
+/*     If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair */
+/*     then */
+/*     RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F */
+/*     where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] */
+/*               [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. */
+/*     It holds that */
+/*     RES(i)   = || A*ZC(:,i)   - LAMBDA(i)  *ZC(:,i)   ||_2 */
+/*     RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 */
+/*     where ZC(:,i)   =  Z(:,i) + sqrt(-1)*Z(:,i+1) */
+/*           ZC(:,i+1) =  Z(:,i) - sqrt(-1)*Z(:,i+1) */
+/*     See the description of REIG, IMEIG and Z. */
+/* ..... */
+/*     B (output) REAL(KIND=WP)  M-by-N array. */
+/*     IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can */
+/*     be used for computing the refined vectors; see further */
+/*     details in the provided references. */
+/*     If JOBF == 'E', B(1:M,1:K) contains */
+/*     A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */
+/*     Exact DMD, up to scaling by the inverse eigenvalues. */
+/*     If JOBF =='N', then B is not referenced. */
+/*     See the descriptions of X, W, K. */
+/* ..... */
+/*     LDB (input) INTEGER, LDB >= M */
+/*     The leading dimension of the array B. */
+/* ..... */
+/*     W (workspace/output) REAL(KIND=WP) N-by-N array */
+/*     On exit, W(1:K,1:K) contains the K computed */
+/*     eigenvectors of the matrix Rayleigh quotient (real and */
+/*     imaginary parts for each complex conjugate pair of the */
+/*     eigenvalues). The Ritz vectors (returned in Z) are the */
+/*     product of X (containing a POD basis for the input */
+/*     matrix X) and W. See the descriptions of K, S, X and Z. */
+/*     W is also used as a workspace to temporarily store the */
+/*     right singular vectors of X. */
+/* ..... */
+/*     LDW (input) INTEGER, LDW >= N */
+/*     The leading dimension of the array W. */
+/* ..... */
+/*     S (workspace/output) REAL(KIND=WP) N-by-N array */
+/*     The array S(1:K,1:K) is used for the matrix Rayleigh */
+/*     quotient. This content is overwritten during */
+/*     the eigenvalue decomposition by SGEEV. */
+/*     See the description of K. */
+/* ..... */
+/*     LDS (input) INTEGER, LDS >= N */
+/*     The leading dimension of the array S. */
+/* ..... */
+/*     WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array */
+/*     On exit, WORK(1:N) contains the singular values of */
+/*     X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). */
+/*     If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain */
+/*     scaling factor WORK(N+2)/WORK(N+1) used to scale X */
+/*     and Y to avoid overflow in the SVD of X. */
+/*     This may be of interest if the scaling option is off */
+/*     and as many as possible smallest eigenvalues are */
+/*     desired to the highest feasible accuracy. */
+/*     If the call to SGEDMD is only workspace query, then */
+/*     WORK(1) contains the minimal workspace length and */
+/*     WORK(2) is the optimal workspace length. Hence, the */
+/*     length of work is at least 2. */
+/*     See the description of LWORK. */
+/* ..... */
+/*     LWORK (input) INTEGER */
+/*     The minimal length of the workspace vector WORK. */
+/*     LWORK is calculated as follows: */
+/*     If WHTSVD == 1 :: */
+/*        If JOBZ == 'V', then */
+/*        LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)). */
+/*        If JOBZ == 'N'  then */
+/*        LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)). */
+/*        Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal */
+/*        workspace length of SGESVD. */
+/*     If WHTSVD == 2 :: */
+/*        If JOBZ == 'V', then */
+/*        LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)) */
+/*        If JOBZ == 'N', then */
+/*        LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)) */
+/*        Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the */
+/*        minimal workspace length of SGESDD. */
+/*     If WHTSVD == 3 :: */
+/*        If JOBZ == 'V', then */
+/*        LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) */
+/*        If JOBZ == 'N', then */
+/*        LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) */
+/*        Here LWORK_SVD = N+M+MAX(3*N+1, */
+/*                        MAX(1,3*N+M,5*N),MAX(1,N)) */
+/*        is the minimal workspace length of SGESVDQ. */
+/*     If WHTSVD == 4 :: */
+/*        If JOBZ == 'V', then */
+/*        LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N)) */
+/*        If JOBZ == 'N', then */
+/*        LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N)) */
+/*        Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the */
+/*        minimal workspace length of SGEJSV. */
+/*     The above expressions are not simplified in order to */
+/*     make the usage of WORK more transparent, and for */
+/*     easier checking. In any case, LWORK >= 2. */
+/*     If on entry LWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     and the optimal workspace lengths for both WORK and */
+/*     IWORK. See the descriptions of WORK and IWORK. */
+/* ..... */
+/*     IWORK (workspace/output) INTEGER LIWORK-by-1 array */
+/*     Workspace that is required only if WHTSVD equals */
+/*     2 , 3 or 4. (See the description of WHTSVD). */
+/*     If on entry LWORK =-1 or LIWORK=-1, then the */
+/*     minimal length of IWORK is computed and returned in */
+/*     IWORK(1). See the description of LIWORK. */
+/* ..... */
+/*     LIWORK (input) INTEGER */
+/*     The minimal length of the workspace vector IWORK. */
+/*     If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */
+/*     If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) */
+/*     If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) */
+/*     If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) */
+/*     If on entry LIWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     and the optimal workspace lengths for both WORK and */
+/*     IWORK. See the descriptions of WORK and IWORK. */
+/* ..... */
+/*     INFO (output) INTEGER */
+/*     -i < 0 :: On entry, the i-th argument had an */
+/*               illegal value */
+/*        = 0 :: Successful return. */
+/*        = 1 :: Void input. Quick exit (M=0 or N=0). */
+/*        = 2 :: The SVD computation of X did not converge. */
+/*               Suggestion: Check the input data and/or */
+/*               repeat with different WHTSVD. */
+/*        = 3 :: The computation of the eigenvalues did not */
+/*               converge. */
+/*        = 4 :: If data scaling was requested on input and */
+/*               the procedure found inconsistency in the data */
+/*               such that for some column index i, */
+/*               X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */
+/*               to zero if JOBS=='C'. The computation proceeds */
+/*               with original or modified data and warning */
+/*               flag is set with INFO=4. */
+/* ............................................................. */
+/* ............................................................. */
+/*     Parameters */
+/*     ~~~~~~~~~~ */
+/*     Local scalars */
+/*     ~~~~~~~~~~~~~ */
+/*     Local arrays */
+/*     ~~~~~~~~~~~~ */
+/*     External functions (BLAS and LAPACK) */
+/*     ~~~~~~~~~~~~~~~~~ */
+/*     External subroutines (BLAS and LAPACK) */
+/*     ~~~~~~~~~~~~~~~~~~~~ */
+/*     Intrinsic functions */
+/*     ~~~~~~~~~~~~~~~~~~~ */
+/* ............................................................ */
+    /* Parameter adjustments */
+    x_dim1 = *ldx;
+    x_offset = 1 + x_dim1 * 1;
+    x -= x_offset;
+    y_dim1 = *ldy;
+    y_offset = 1 + y_dim1 * 1;
+    y -= y_offset;
+    --reig;
+    --imeig;
+    z_dim1 = *ldz;
+    z_offset = 1 + z_dim1 * 1;
+    z__ -= z_offset;
+    --res;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    w_dim1 = *ldw;
+    w_offset = 1 + w_dim1 * 1;
+    w -= w_offset;
+    s_dim1 = *lds;
+    s_offset = 1 + s_dim1 * 1;
+    s -= s_offset;
+    --work;
+    --iwork;
+
+    /* Function Body */
+    one = 1.f;
+    zero = 0.f;
+
+/*    Test the input arguments */
+
+    wntres = lsame_(jobr, "R");
+    sccolx = lsame_(jobs, "S") || lsame_(jobs, "C");
+    sccoly = lsame_(jobs, "Y");
+    wntvec = lsame_(jobz, "V");
+    wntref = lsame_(jobf, "R");
+    wntex = lsame_(jobf, "E");
+    *info = 0;
+    lquery = *lwork == -1 || *liwork == -1;
+
+    if (! (sccolx || sccoly || lsame_(jobs, "N"))) {
+	*info = -1;
+    } else if (! (wntvec || lsame_(jobz, "N") || lsame_(
+	    jobz, "F"))) {
+	*info = -2;
+    } else if (! (wntres || lsame_(jobr, "N")) || 
+	    wntres && ! wntvec) {
+	*info = -3;
+    } else if (! (wntref || wntex || lsame_(jobf, "N")))
+	     {
+	*info = -4;
+    } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == 
+	    4)) {
+	*info = -5;
+    } else if (*m < 0) {
+	*info = -6;
+    } else if (*n < 0 || *n > *m) {
+	*info = -7;
+    } else if (*ldx < *m) {
+	*info = -9;
+    } else if (*ldy < *m) {
+	*info = -11;
+    } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) {
+	*info = -12;
+    } else if (*tol < zero || *tol >= one) {
+	*info = -13;
+    } else if (*ldz < *m) {
+	*info = -18;
+    } else if ((wntref || wntex) && *ldb < *m) {
+	*info = -21;
+    } else if (*ldw < *n) {
+	*info = -23;
+    } else if (*lds < *n) {
+	*info = -25;
+    }
+
+    if (*info == 0) {
+/* Compute the minimal and the optimal workspace */
+/* requirements. Simulate running the code and */
+/* determine minimal and optimal sizes of the */
+/* workspace at any moment of the run. */
+	if (*n == 0) {
+/* Quick return. All output except K is void. */
+/* INFO=1 signals the void input. */
+/* In case of a workspace query, the default */
+/* minimal workspace lengths are returned. */
+	    if (lquery) {
+		iwork[1] = 1;
+		work[1] = 2.f;
+		work[2] = 2.f;
+	    } else {
+		*k = 0;
+	    }
+	    *info = 1;
+	    return 0;
+	}
+	mlwork = f2cmax(2,*n);
+	olwork = f2cmax(2,*n);
+	iminwr = 1;
+/*         SELECT CASE ( WHTSVD ) */
+	if (*whtsvd == 1) {
+/* The following is specified as the minimal */
+/* length of WORK in the definition of SGESVD: */
+/* MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) */
+/* Computing MAX */
+	    i__1 = 1, i__2 = f2cmin(*m,*n) * 3 + f2cmax(*m,*n), i__1 = f2cmax(i__1,
+		    i__2), i__2 = f2cmin(*m,*n) * 5;
+	    mwrsvd = f2cmax(i__1,i__2);
+/* Computing MAX */
+	    i__1 = mlwork, i__2 = *n + mwrsvd;
+	    mlwork = f2cmax(i__1,i__2);
+	    if (lquery) {
+		sgesvd_("O", "S", m, n, &x[x_offset], ldx, &work[1], &b[
+			b_offset], ldb, &w[w_offset], ldw, rdummy, &c_n1, &
+			info1);
+/* Computing MAX */
+		i__1 = mwrsvd, i__2 = (integer) rdummy[0];
+		lwrsvd = f2cmax(i__1,i__2);
+/* Computing MAX */
+		i__1 = olwork, i__2 = *n + lwrsvd;
+		olwork = f2cmax(i__1,i__2);
+	    }
+	} else if (*whtsvd == 2) {
+/* The following is specified as the minimal */
+/* length of WORK in the definition of SGESDD: */
+/* MWRSDD = 3*MIN(M,N)*MIN(M,N) + */
+/* MAX( MAX(M,N),5*MIN(M,N)*MIN(M,N)+4*MIN(M,N) ) */
+/* IMINWR = 8*MIN(M,N) */
+/* Computing MAX */
+	    i__1 = f2cmax(*m,*n), i__2 = f2cmin(*m,*n) * 5 * f2cmin(*m,*n) + (f2cmin(*m,*
+		    n) << 2);
+	    mwrsdd = f2cmin(*m,*n) * 3 * f2cmin(*m,*n) + f2cmax(i__1,i__2);
+/* Computing MAX */
+	    i__1 = mlwork, i__2 = *n + mwrsdd;
+	    mlwork = f2cmax(i__1,i__2);
+	    iminwr = f2cmin(*m,*n) << 3;
+	    if (lquery) {
+		sgesdd_("O", m, n, &x[x_offset], ldx, &work[1], &b[b_offset], 
+			ldb, &w[w_offset], ldw, rdummy, &c_n1, &iwork[1], &
+			info1);
+/* Computing MAX */
+		i__1 = mwrsdd, i__2 = (integer) rdummy[0];
+		lwrsdd = f2cmax(i__1,i__2);
+/* Computing MAX */
+		i__1 = olwork, i__2 = *n + lwrsdd;
+		olwork = f2cmax(i__1,i__2);
+	    }
+	} else if (*whtsvd == 3) {
+/* LWQP3 = 3*N+1 */
+/* LWORQ = MAX(N, 1) */
+/* MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N)) */
+/* MWRSVQ = N + MAX( LWQP3, MWRSVD, LWORQ )+ MAX(M,2) */
+/* MLWORK = N + MWRSVQ */
+/* IMINWR = M+N-1 */
+	    sgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &work[
+		    1], &z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, &
+		    iwork[1], &c_n1, rdummy, &c_n1, rdummy2, &c_n1, &info1);
+	    iminwr = iwork[1];
+	    mwrsvq = (integer) rdummy[1];
+/* Computing MAX */
+	    i__1 = mlwork, i__2 = *n + mwrsvq + (integer) rdummy2[0];
+	    mlwork = f2cmax(i__1,i__2);
+	    if (lquery) {
+		lwrsvq = (integer) rdummy[0];
+/* Computing MAX */
+		i__1 = olwork, i__2 = *n + lwrsvq + (integer) rdummy2[0];
+		olwork = f2cmax(i__1,i__2);
+	    }
+	} else if (*whtsvd == 4) {
+	    *(unsigned char *)jsvopt = 'J';
+/* MWRSVJ = MAX( 7, 2*M+N, 6*N+2*N*N )! for JSVOPT='V' */
+/* Computing MAX */
+	    i__1 = 7, i__2 = (*m << 1) + *n, i__1 = f2cmax(i__1,i__2), i__2 = (*
+		    n << 2) + *n * *n, i__1 = f2cmax(i__1,i__2), i__2 = (*n << 1)
+		     + *n * *n + 6;
+	    mwrsvj = f2cmax(i__1,i__2);
+/* Computing MAX */
+	    i__1 = mlwork, i__2 = *n + mwrsvj;
+	    mlwork = f2cmax(i__1,i__2);
+/* Computing MAX */
+	    i__1 = 3, i__2 = *m + *n * 3;
+	    iminwr = f2cmax(i__1,i__2);
+	    if (lquery) {
+/* Computing MAX */
+		i__1 = olwork, i__2 = *n + mwrsvj;
+		olwork = f2cmax(i__1,i__2);
+	    }
+	}
+/*         END SELECT */
+	if (wntvec || wntex || lsame_(jobz, "F")) {
+	    *(unsigned char *)jobzl = 'V';
+	} else {
+	    *(unsigned char *)jobzl = 'N';
+	}
+/* Workspace calculation to the SGEEV call */
+	if (lsame_(jobzl, "V")) {
+/* Computing MAX */
+	    i__1 = 1, i__2 = *n << 2;
+	    mwrkev = f2cmax(i__1,i__2);
+	} else {
+/* Computing MAX */
+	    i__1 = 1, i__2 = *n * 3;
+	    mwrkev = f2cmax(i__1,i__2);
+	}
+/* Computing MAX */
+	i__1 = mlwork, i__2 = *n + mwrkev;
+	mlwork = f2cmax(i__1,i__2);
+	if (lquery) {
+	    sgeev_("N", jobzl, n, &s[s_offset], lds, &reig[1], &imeig[1], &w[
+		    w_offset], ldw, &w[w_offset], ldw, rdummy, &c_n1, &info1);
+/* Computing MAX */
+	    i__1 = mwrkev, i__2 = (integer) rdummy[0];
+	    lwrkev = f2cmax(i__1,i__2);
+/* Computing MAX */
+	    i__1 = olwork, i__2 = *n + lwrkev;
+	    olwork = f2cmax(i__1,i__2);
+	}
+
+	if (*liwork < iminwr && ! lquery) {
+	    *info = -29;
+	}
+	if (*lwork < mlwork && ! lquery) {
+	    *info = -27;
+	}
+    }
+
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("SGEDMD", &i__1);
+	return 0;
+    } else if (lquery) {
+/*     Return minimal and optimal workspace sizes */
+	iwork[1] = iminwr;
+	work[1] = (real) mlwork;
+	work[2] = (real) olwork;
+	return 0;
+    }
+/* ............................................................ */
+
+    ofl = slamch_("O");
+    small = slamch_("S");
+    badxy = FALSE_;
+
+/*     <1> Optional scaling of the snapshots (columns of X, Y) */
+/*     ========================================================== */
+    if (sccolx) {
+/* The columns of X will be normalized. */
+/* To prevent overflows, the column norms of X are */
+/* carefully computed using SLASSQ. */
+	*k = 0;
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+/* WORK(i) = SNRM2( M, X(1,i), 1 ) */
+	    scale = zero;
+	    slassq_(m, &x[i__ * x_dim1 + 1], &c__1, &scale, &ssum);
+	    if (sisnan_(&scale) || sisnan_(&ssum)) {
+		*k = 0;
+		*info = -8;
+		i__2 = -(*info);
+		xerbla_("SGEDMD", &i__2);
+	    }
+	    if (scale != zero && ssum != zero) {
+		rootsc = sqrt(ssum);
+		if (scale >= ofl / rootsc) {
+/*                 Norm of X(:,i) overflows. First, X(:,i) */
+/*                 is scaled by */
+/*                 ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2. */
+/*                 Next, the norm of X(:,i) is stored without */
+/*                 overflow as WORK(i) = - SCALE * (ROOTSC/M), */
+/*                 the minus sign indicating the 1/M factor. */
+/*                 Scaling is performed without overflow, and */
+/*                 underflow may occur in the smallest entries */
+/*                 of X(:,i). The relative backward and forward */
+/*                 errors are small in the ell_2 norm. */
+		    r__1 = one / rootsc;
+		    slascl_("G", &c__0, &c__0, &scale, &r__1, m, &c__1, &x[
+			    i__ * x_dim1 + 1], m, &info2);
+		    work[i__] = -scale * (rootsc / (real) (*m));
+		} else {
+/*                 X(:,i) will be scaled to unit 2-norm */
+		    work[i__] = scale * rootsc;
+		    slascl_("G", &c__0, &c__0, &work[i__], &one, m, &c__1, &x[
+			    i__ * x_dim1 + 1], m, &info2);
+/*                 X(1:M,i) = (ONE/WORK(i)) * X(1:M,i)          ! INTRINSIC */
+/* LAPACK */
+		}
+	    } else {
+		work[i__] = zero;
+		++(*k);
+	    }
+	}
+	if (*k == *n) {
+/* All columns of X are zero. Return error code -8. */
+/* (the 8th input variable had an illegal value) */
+	    *k = 0;
+	    *info = -8;
+	    i__1 = -(*info);
+	    xerbla_("SGEDMD", &i__1);
+	    return 0;
+	}
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+/*           Now, apply the same scaling to the columns of Y. */
+	    if (work[i__] > zero) {
+		r__1 = one / work[i__];
+		sscal_(m, &r__1, &y[i__ * y_dim1 + 1], &c__1);
+/*               Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i)      ! INTRINSIC */
+/* BLAS CALL */
+	    } else if (work[i__] < zero) {
+		r__1 = -work[i__];
+		r__2 = one / (real) (*m);
+		slascl_("G", &c__0, &c__0, &r__1, &r__2, m, &c__1, &y[i__ * 
+			y_dim1 + 1], m, &info2);
+/* LAPACK CA */
+	    } else if (y[isamax_(m, &y[i__ * y_dim1 + 1], &c__1) + i__ * 
+		    y_dim1] != zero) {
+/*               X(:,i) is zero vector. For consistency, */
+/*               Y(:,i) should also be zero. If Y(:,i) is not */
+/*               zero, then the data might be inconsistent or */
+/*               corrupted. If JOBS == 'C', Y(:,i) is set to */
+/*               zero and a warning flag is raised. */
+/*               The computation continues but the */
+/*               situation will be reported in the output. */
+		badxy = TRUE_;
+		if (lsame_(jobs, "C")) {
+		    sscal_(m, &zero, &y[i__ * y_dim1 + 1], &c__1);
+		}
+/* BLAS CALL */
+	    }
+	}
+    }
+
+    if (sccoly) {
+/* The columns of Y will be normalized. */
+/* To prevent overflows, the column norms of Y are */
+/* carefully computed using SLASSQ. */
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+/* WORK(i) = SNRM2( M, Y(1,i), 1 ) */
+	    scale = zero;
+	    slassq_(m, &y[i__ * y_dim1 + 1], &c__1, &scale, &ssum);
+	    if (sisnan_(&scale) || sisnan_(&ssum)) {
+		*k = 0;
+		*info = -10;
+		i__2 = -(*info);
+		xerbla_("SGEDMD", &i__2);
+	    }
+	    if (scale != zero && ssum != zero) {
+		rootsc = sqrt(ssum);
+		if (scale >= ofl / rootsc) {
+/*                 Norm of Y(:,i) overflows. First, Y(:,i) */
+/*                 is scaled by */
+/*                 ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2. */
+/*                 Next, the norm of Y(:,i) is stored without */
+/*                 overflow as WORK(i) = - SCALE * (ROOTSC/M), */
+/*                 the minus sign indicating the 1/M factor. */
+/*                 Scaling is performed without overflow, and */
+/*                 underflow may occur in the smallest entries */
+/*                 of Y(:,i). The relative backward and forward */
+/*                 errors are small in the ell_2 norm. */
+		    r__1 = one / rootsc;
+		    slascl_("G", &c__0, &c__0, &scale, &r__1, m, &c__1, &y[
+			    i__ * y_dim1 + 1], m, &info2);
+		    work[i__] = -scale * (rootsc / (real) (*m));
+		} else {
+/*                 Y(:,i) will be scaled to unit 2-norm */
+		    work[i__] = scale * rootsc;
+		    slascl_("G", &c__0, &c__0, &work[i__], &one, m, &c__1, &y[
+			    i__ * y_dim1 + 1], m, &info2);
+/*                 Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i)          ! INTRINSIC */
+/* LAPACK */
+		}
+	    } else {
+		work[i__] = zero;
+	    }
+	}
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+/*           Now, apply the same scaling to the columns of X. */
+	    if (work[i__] > zero) {
+		r__1 = one / work[i__];
+		sscal_(m, &r__1, &x[i__ * x_dim1 + 1], &c__1);
+/*               X(1:M,i) = (ONE/WORK(i)) * X(1:M,i)      ! INTRINSIC */
+/* BLAS CALL */
+	    } else if (work[i__] < zero) {
+		r__1 = -work[i__];
+		r__2 = one / (real) (*m);
+		slascl_("G", &c__0, &c__0, &r__1, &r__2, m, &c__1, &x[i__ * 
+			x_dim1 + 1], m, &info2);
+/* LAPACK CA */
+	    } else if (x[isamax_(m, &x[i__ * x_dim1 + 1], &c__1) + i__ * 
+		    x_dim1] != zero) {
+/*               Y(:,i) is zero vector.  If X(:,i) is not */
+/*               zero, then a warning flag is raised. */
+/*               The computation continues but the */
+/*               situation will be reported in the output. */
+		badxy = TRUE_;
+	    }
+	}
+    }
+
+/*     <2> SVD of the data snapshot matrix X. */
+/*     ===================================== */
+/*     The left singular vectors are stored in the array X. */
+/*     The right singular vectors are in the array W. */
+/*     The array W will later on contain the eigenvectors */
+/*     of a Rayleigh quotient. */
+    numrnk = *n;
+/*      SELECT CASE ( WHTSVD ) */
+    if (*whtsvd == 1) {
+	i__1 = *lwork - *n;
+	sgesvd_("O", "S", m, n, &x[x_offset], ldx, &work[1], &b[b_offset], 
+		ldb, &w[w_offset], ldw, &work[*n + 1], &i__1, &info1);
+/* LAPACK CAL */
+	*(unsigned char *)t_or_n__ = 'T';
+    } else if (*whtsvd == 2) {
+	i__1 = *lwork - *n;
+	sgesdd_("O", m, n, &x[x_offset], ldx, &work[1], &b[b_offset], ldb, &w[
+		w_offset], ldw, &work[*n + 1], &i__1, &iwork[1], &info1);
+/* LAPACK CAL */
+	*(unsigned char *)t_or_n__ = 'T';
+    } else if (*whtsvd == 3) {
+	i__1 = *lwork - *n - f2cmax(2,*m);
+	i__2 = f2cmax(2,*m);
+	sgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &work[1], &
+		z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, &iwork[1], 
+		liwork, &work[*n + f2cmax(2,*m) + 1], &i__1, &work[*n + 1], &
+		i__2, &info1);
+
+	slacpy_("A", m, &numrnk, &z__[z_offset], ldz, &x[x_offset], ldx);
+/* LAPACK C */
+	*(unsigned char *)t_or_n__ = 'T';
+    } else if (*whtsvd == 4) {
+	i__1 = *lwork - *n;
+	sgejsv_("F", "U", jsvopt, "N", "N", "P", m, n, &x[x_offset], ldx, &
+		work[1], &z__[z_offset], ldz, &w[w_offset], ldw, &work[*n + 1]
+		, &i__1, &iwork[1], &info1);
+/* LAPACK CALL */
+	slacpy_("A", m, n, &z__[z_offset], ldz, &x[x_offset], ldx);
+/* LAPACK CALL */
+	*(unsigned char *)t_or_n__ = 'N';
+	xscl1 = work[*n + 1];
+	xscl2 = work[*n + 2];
+	if (xscl1 != xscl2) {
+/* This is an exceptional situation. If the */
+/* data matrices are not scaled and the */
+/* largest singular value of X overflows. */
+/* In that case SGEJSV can return the SVD */
+/* in scaled form. The scaling factor can be used */
+/* to rescale the data (X and Y). */
+	    slascl_("G", &c__0, &c__0, &xscl1, &xscl2, m, n, &y[y_offset], 
+		    ldy, &info2);
+	}
+/*      END SELECT */
+    }
+
+    if (info1 > 0) {
+/* The SVD selected subroutine did not converge. */
+/* Return with an error code. */
+	*info = 2;
+	return 0;
+    }
+
+    if (work[1] == zero) {
+/* The largest computed singular value of (scaled) */
+/* X is zero. Return error code -8 */
+/* (the 8th input variable had an illegal value). */
+	*k = 0;
+	*info = -8;
+	i__1 = -(*info);
+	xerbla_("SGEDMD", &i__1);
+	return 0;
+    }
+
+/* <3> Determine the numerical rank of the data */
+/*    snapshots matrix X. This depends on the */
+/*    parameters NRNK and TOL. */
+/*      SELECT CASE ( NRNK ) */
+    if (*nrnk == -1) {
+	*k = 1;
+	i__1 = numrnk;
+	for (i__ = 2; i__ <= i__1; ++i__) {
+	    if (work[i__] <= work[1] * *tol || work[i__] <= small) {
+		myexit_();
+	    }
+	    ++(*k);
+	}
+    } else if (*nrnk == -2) {
+	*k = 1;
+	i__1 = numrnk - 1;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    if (work[i__ + 1] <= work[i__] * *tol || work[i__] <= small) {
+		myexit_();
+	    }
+	    ++(*k);
+	}
+    } else {
+	*k = 1;
+	i__1 = *nrnk;
+	for (i__ = 2; i__ <= i__1; ++i__) {
+	    if (work[i__] <= small) {
+		myexit_();
+	    }
+	    ++(*k);
+	}
+/*          END SELECT */
+    }
+/*   Now, U = X(1:M,1:K) is the SVD/POD basis for the */
+/*   snapshot data in the input matrix X. */
+/* <4> Compute the Rayleigh quotient S = U^T * A * U. */
+/*    Depending on the requested outputs, the computation */
+/*    is organized to compute additional auxiliary */
+/*    matrices (for the residuals and refinements). */
+
+/*    In all formulas below, we need V_k*Sigma_k^(-1) */
+/*    where either V_k is in W(1:N,1:K), or V_k^T is in */
+/*    W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)). */
+    if (lsame_(t_or_n__, "N")) {
+	i__1 = *k;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    r__1 = one / work[i__];
+	    sscal_(n, &r__1, &w[i__ * w_dim1 + 1], &c__1);
+/* W(1:N,i) = (ONE/WORK(i)) * W(1:N,i)      ! INTRINSIC */
+/* BLAS CALL */
+	}
+    } else {
+/* This non-unit stride access is due to the fact */
+/* that SGESVD, SGESVDQ and SGESDD return the */
+/* transposed matrix of the right singular vectors. */
+/* DO i = 1, K */
+/* CALL SSCAL( N, ONE/WORK(i), W(i,1), LDW )    ! BLAS CALL */
+/* ! W(i,1:N) = (ONE/WORK(i)) * W(i,1:N)      ! INTRINSIC */
+/* END DO */
+	i__1 = *k;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    work[*n + i__] = one / work[i__];
+	}
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = *k;
+	    for (i__ = 1; i__ <= i__2; ++i__) {
+		w[i__ + j * w_dim1] = work[*n + i__] * w[i__ + j * w_dim1];
+	    }
+	}
+    }
+
+    if (wntref) {
+
+/* Need A*U(:,1:K)=Y*V_k*inv(diag(WORK(1:K))) */
+/* for computing the refined Ritz vectors */
+/* (optionally, outside SGEDMD). */
+	sgemm_("N", t_or_n__, m, k, n, &one, &y[y_offset], ldy, &w[w_offset], 
+		ldw, &zero, &z__[z_offset], ldz);
+/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(W(1:K,1:N)))  ! INTRI */
+/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K))             ! INTRI */
+
+/* At this point Z contains */
+/* A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and */
+/* this is needed for computing the residuals. */
+/* This matrix is  returned in the array B and */
+/* it can be used to compute refined Ritz vectors. */
+/* BLAS */
+	slacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb);
+/* B(1:M,1:K) = Z(1:M,1:K)                  ! INTRINSIC */
+/* BLAS CALL */
+	sgemm_("T", "N", k, k, m, &one, &x[x_offset], ldx, &z__[z_offset], 
+		ldz, &zero, &s[s_offset], lds);
+/* S(1:K,1:K) = MATMUL(TRANSPOSE(X(1:M,1:K)),Z(1:M,1:K)) ! INTRI */
+/* At this point S = U^T * A * U is the Rayleigh quotient. */
+/* BLAS */
+    } else {
+/* A * U(:,1:K) is not explicitly needed and the */
+/* computation is organized differently. The Rayleigh */
+/* quotient is computed more efficiently. */
+	sgemm_("T", "N", k, n, m, &one, &x[x_offset], ldx, &y[y_offset], ldy, 
+		&zero, &z__[z_offset], ldz);
+/* Z(1:K,1:N) = MATMUL( TRANSPOSE(X(1:M,1:K)), Y(1:M,1:N) )  ! IN */
+/* In the two SGEMM calls here, can use K for LDZ */
+/* B */
+	sgemm_("N", t_or_n__, k, k, n, &one, &z__[z_offset], ldz, &w[w_offset]
+		, ldw, &zero, &s[s_offset], lds);
+/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRIN */
+/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K)))          ! INTRIN */
+/* At this point S = U^T * A * U is the Rayleigh quotient. */
+/* If the residuals are requested, save scaled V_k into Z. */
+/* Recall that V_k or V_k^T is stored in W. */
+/* BLAS */
+	if (wntres || wntex) {
+	    if (lsame_(t_or_n__, "N")) {
+		slacpy_("A", n, k, &w[w_offset], ldw, &z__[z_offset], ldz);
+	    } else {
+		slacpy_("A", k, n, &w[w_offset], ldw, &z__[z_offset], ldz);
+	    }
+	}
+    }
+
+/* <5> Compute the Ritz values and (if requested) the */
+/*   right eigenvectors of the Rayleigh quotient. */
+
+    i__1 = *lwork - *n;
+    sgeev_("N", jobzl, k, &s[s_offset], lds, &reig[1], &imeig[1], &w[w_offset]
+	    , ldw, &w[w_offset], ldw, &work[*n + 1], &i__1, &info1);
+
+/* W(1:K,1:K) contains the eigenvectors of the Rayleigh */
+/* quotient. Even in the case of complex spectrum, all */
+/* computation is done in real arithmetic. REIG and */
+/* IMEIG are the real and the imaginary parts of the */
+/* eigenvalues, so that the spectrum is given as */
+/* REIG(:) + sqrt(-1)*IMEIG(:). Complex conjugate pairs */
+/* are listed at consecutive positions. For such a */
+/* complex conjugate pair of the eigenvalues, the */
+/* corresponding eigenvectors are also a complex */
+/* conjugate pair with the real and imaginary parts */
+/* stored column-wise in W at the corresponding */
+/* consecutive column indices. See the description of Z. */
+/* Also, see the description of SGEEV. */
+/* LAPACK C */
+    if (info1 > 0) {
+/* SGEEV failed to compute the eigenvalues and */
+/* eigenvectors of the Rayleigh quotient. */
+	*info = 3;
+	return 0;
+    }
+
+/* <6> Compute the eigenvectors (if requested) and, */
+/* the residuals (if requested). */
+
+    if (wntvec || wntex) {
+	if (wntres) {
+	    if (wntref) {
+/* Here, if the refinement is requested, we have */
+/* A*U(:,1:K) already computed and stored in Z. */
+/* For the residuals, need Y = A * U(:,1:K) * W. */
+		sgemm_("N", "N", m, k, k, &one, &z__[z_offset], ldz, &w[
+			w_offset], ldw, &zero, &y[y_offset], ldy);
+/* Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K)       ! INTRINSIC */
+/* This frees Z; Y contains A * U(:,1:K) * W. */
+/* BLAS CALL */
+	    } else {
+/* Compute S = V_k * Sigma_k^(-1) * W, where */
+/* V_k * Sigma_k^(-1) is stored in Z */
+		sgemm_(t_or_n__, "N", n, k, k, &one, &z__[z_offset], ldz, &w[
+			w_offset], ldw, &zero, &s[s_offset], lds);
+/* Then, compute Z = Y * S = */
+/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */
+/* = A * U(:,1:K) * W(1:K,1:K) */
+		sgemm_("N", "N", m, k, n, &one, &y[y_offset], ldy, &s[
+			s_offset], lds, &zero, &z__[z_offset], ldz);
+/* Save a copy of Z into Y and free Z for holding */
+/* the Ritz vectors. */
+		slacpy_("A", m, k, &z__[z_offset], ldz, &y[y_offset], ldy);
+		if (wntex) {
+		    slacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb);
+		}
+	    }
+	} else if (wntex) {
+/* Compute S = V_k * Sigma_k^(-1) * W, where */
+/* V_k * Sigma_k^(-1) is stored in Z */
+	    sgemm_(t_or_n__, "N", n, k, k, &one, &z__[z_offset], ldz, &w[
+		    w_offset], ldw, &zero, &s[s_offset], lds);
+/* Then, compute Z = Y * S = */
+/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */
+/* = A * U(:,1:K) * W(1:K,1:K) */
+	    sgemm_("N", "N", m, k, n, &one, &y[y_offset], ldy, &s[s_offset], 
+		    lds, &zero, &b[b_offset], ldb);
+/* The above call replaces the following two calls */
+/* that were used in the developing-testing phase. */
+/* CALL SGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, & */
+/*           LDS, ZERO, Z, LDZ) */
+/* Save a copy of Z into B and free Z for holding */
+/* the Ritz vectors. */
+/* CALL SLACPY( 'A', M, K, Z, LDZ, B, LDB ) */
+	}
+
+/* Compute the real form of the Ritz vectors */
+	if (wntvec) {
+	    sgemm_("N", "N", m, k, k, &one, &x[x_offset], ldx, &w[w_offset], 
+		    ldw, &zero, &z__[z_offset], ldz);
+	}
+/* Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K))         ! INTRINSIC */
+
+/* BLAS CALL */
+	if (wntres) {
+	    i__ = 1;
+	    while(i__ <= *k) {
+		if (imeig[i__] == zero) {
+/* have a real eigenvalue with real eigenvector */
+		    r__1 = -reig[i__];
+		    saxpy_(m, &r__1, &z__[i__ * z_dim1 + 1], &c__1, &y[i__ * 
+			    y_dim1 + 1], &c__1);
+/* Y(1:M,i) = Y(1:M,i) - REIG(i) * Z(1:M,i)            ! */
+
+		    res[i__] = snrm2_(m, &y[i__ * y_dim1 + 1], &c__1);
+		    ++i__;
+		} else {
+/* Have a complex conjugate pair */
+/* REIG(i) +- sqrt(-1)*IMEIG(i). */
+/* Since all computation is done in real */
+/* arithmetic, the formula for the residual */
+/* is recast for real representation of the */
+/* complex conjugate eigenpair. See the */
+/* description of RES. */
+		    ab[0] = reig[i__];
+		    ab[1] = -imeig[i__];
+		    ab[2] = imeig[i__];
+		    ab[3] = reig[i__];
+		    r__1 = -one;
+		    sgemm_("N", "N", m, &c__2, &c__2, &r__1, &z__[i__ * 
+			    z_dim1 + 1], ldz, ab, &c__2, &one, &y[i__ * 
+			    y_dim1 + 1], ldy);
+/* Y(1:M,i:i+1) = Y(1:M,i:i+1) - Z(1:M,i:i+1) * AB   ! INT */
+/* BL */
+		    res[i__] = slange_("F", m, &c__2, &y[i__ * y_dim1 + 1], 
+			    ldy, &work[*n + 1]);
+/* LA */
+		    res[i__ + 1] = res[i__];
+		    i__ += 2;
+		}
+	    }
+	}
+    }
+
+    if (*whtsvd == 4) {
+	work[*n + 1] = xscl1;
+	work[*n + 2] = xscl2;
+    }
+
+/*     Successful exit. */
+    if (! badxy) {
+	*info = 0;
+    } else {
+/* A warning on possible data inconsistency. */
+/* This should be a rare event. */
+	*info = 4;
+    }
+/* ............................................................ */
+    return 0;
+/*     ...... */
+} /* sgedmd_ */
+
diff --git a/lapack-netlib/SRC/sgedmd.f90 b/lapack-netlib/SRC/sgedmd.f90
new file mode 100644
index 000000000..4860e8898
--- /dev/null
+++ b/lapack-netlib/SRC/sgedmd.f90
@@ -0,0 +1,1206 @@
+!> \brief \b SGEDMD computes the Dynamic Mode Decomposition (DMD) for a pair of data snapshot matrices.
+!
+!  =========== DOCUMENTATION ===========
+!
+!  Definition:
+!  ===========
+!
+!     SUBROUTINE SGEDMD( JOBS, JOBZ, JOBR, JOBF,  WHTSVD,  &
+!                        M, N, X, LDX, Y, LDY, NRNK, TOL,  &
+!                        K, REIG,  IMEIG,   Z, LDZ,  RES,  &
+!                        B, LDB, W,  LDW,   S, LDS,        &
+!                        WORK, LWORK, IWORK, LIWORK, INFO )
+!.....
+!     USE                   iso_fortran_env
+!     IMPLICIT NONE
+!     INTEGER, PARAMETER :: WP = real32
+!.....
+!     Scalar arguments
+!     CHARACTER, INTENT(IN)   :: JOBS,   JOBZ,  JOBR,  JOBF
+!     INTEGER,   INTENT(IN)   :: WHTSVD, M, N,   LDX,  LDY, &
+!                                NRNK, LDZ, LDB, LDW,  LDS, &
+!                                LWORK,  LIWORK
+!     INTEGER,   INTENT(OUT)  :: K, INFO
+!     REAL(KIND=WP), INTENT(IN) ::  TOL
+!     Array arguments
+!     REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*)
+!     REAL(KIND=WP), INTENT(OUT)   :: Z(LDZ,*), B(LDB,*), &
+!                                     W(LDW,*), S(LDS,*)
+!     REAL(KIND=WP), INTENT(OUT)   :: REIG(*),  IMEIG(*), &
+!                                     RES(*)
+!     REAL(KIND=WP), INTENT(OUT)   :: WORK(*)
+!     INTEGER,       INTENT(OUT)   :: IWORK(*)
+!
+!............................................................
+!>    \par Purpose:
+!     =============
+!>    \verbatim
+!>    SGEDMD computes the Dynamic Mode Decomposition (DMD) for
+!>    a pair of data snapshot matrices. For the input matrices
+!>    X and Y such that Y = A*X with an inaccessible matrix
+!>    A, SGEDMD computes a certain number of Ritz pairs of A using
+!>    the standard Rayleigh-Ritz extraction from a subspace of
+!>    range(X) that is determined using the leading left singular
+!>    vectors of X. Optionally, SGEDMD returns the residuals
+!>    of the computed Ritz pairs, the information needed for
+!>    a refinement of the Ritz vectors, or the eigenvectors of
+!>    the Exact DMD.
+!>    For further details see the references listed
+!>    below. For more details of the implementation see [3].
+!>    \endverbatim
+!............................................................
+!>    \par References:
+!     ================
+!>    \verbatim
+!>    [1] P. Schmid: Dynamic mode decomposition of numerical
+!>        and experimental data,
+!>        Journal of Fluid Mechanics 656, 5-28, 2010.
+!>    [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal
+!>        decompositions: analysis and enhancements,
+!>        SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018.
+!>    [3] Z. Drmac: A LAPACK implementation of the Dynamic
+!>        Mode Decomposition I. Technical report. AIMDyn Inc.
+!>        and LAPACK Working Note 298.
+!>    [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L.
+!>        Brunton, N. Kutz: On Dynamic Mode Decomposition:
+!>        Theory and Applications, Journal of Computational
+!>        Dynamics 1(2), 391-421, 2014.
+!>    \endverbatim
+!......................................................................
+!>    \par Developed and supported by:
+!     ================================
+!>    \verbatim
+!>    Developed and coded by Zlatko Drmac, Faculty of Science,
+!>    University of Zagreb;  drmac@math.hr
+!>    In cooperation with
+!>    AIMdyn Inc., Santa Barbara, CA.
+!>    and supported by
+!>    - DARPA SBIR project "Koopman Operator-Based Forecasting
+!>    for Nonstationary Processes from Near-Term, Limited
+!>    Observational Data" Contract No: W31P4Q-21-C-0007
+!>    - DARPA PAI project "Physics-Informed Machine Learning
+!>    Methodologies" Contract No: HR0011-18-9-0033
+!>    - DARPA MoDyL project "A Data-Driven, Operator-Theoretic
+!>    Framework for Space-Time Analysis of Process Dynamics"
+!>    Contract No: HR0011-16-C-0116
+!>    Any opinions, findings and conclusions or recommendations
+!>    expressed in this material are those of the author and
+!>    do not necessarily reflect the views of the DARPA SBIR
+!>    Program Office
+!>    \endverbatim
+!......................................................................
+!>    \par Distribution Statement A:
+!     ==============================
+!>    \verbatim
+!>    Distribution Statement A:
+!>    Approved for Public Release, Distribution Unlimited.
+!>    Cleared by DARPA on September 29, 2022
+!>    \endverbatim
+!============================================================
+!     Arguments
+!     =========
+!
+!>    \param[in] JOBS
+!>    \verbatim
+!>    JOBS (input) CHARACTER*1
+!>    Determines whether the initial data snapshots are scaled
+!>    by a diagonal matrix.
+!>    'S' :: The data snapshots matrices X and Y are multiplied
+!>           with a diagonal matrix D so that X*D has unit
+!>           nonzero columns (in the Euclidean 2-norm)
+!>    'C' :: The snapshots are scaled as with the 'S' option.
+!>           If it is found that an i-th column of X is zero
+!>           vector and the corresponding i-th column of Y is
+!>           non-zero, then the i-th column of Y is set to
+!>           zero and a warning flag is raised.
+!>    'Y' :: The data snapshots matrices X and Y are multiplied
+!>           by a diagonal matrix D so that Y*D has unit
+!>           nonzero columns (in the Euclidean 2-norm)
+!>    'N' :: No data scaling.
+!>    \endverbatim
+!.....
+!>    \param[in] JOBZ
+!>    \verbatim
+!>    JOBZ (input) CHARACTER*1
+!>    Determines whether the eigenvectors (Koopman modes) will
+!>    be computed.
+!>    'V' :: The eigenvectors (Koopman modes) will be computed
+!>           and returned in the matrix Z.
+!>           See the description of Z.
+!>    'F' :: The eigenvectors (Koopman modes) will be returned
+!>           in factored form as the product X(:,1:K)*W, where X
+!>           contains a POD basis (leading left singular vectors
+!>           of the data matrix X) and W contains the eigenvectors
+!>           of the corresponding Rayleigh quotient.
+!>           See the descriptions of K, X, W, Z.
+!>    'N' :: The eigenvectors are not computed.
+!>    \endverbatim
+!.....
+!>    \param[in] JOBR
+!>    \verbatim
+!>    JOBR (input) CHARACTER*1
+!>    Determines whether to compute the residuals.
+!>    'R' :: The residuals for the computed eigenpairs will be
+!>           computed and stored in the array RES.
+!>           See the description of RES.
+!>           For this option to be legal, JOBZ must be 'V'.
+!>    'N' :: The residuals are not computed.
+!>    \endverbatim
+!.....
+!>    \param[in] JOBF
+!>    \verbatim
+!>    JOBF (input) CHARACTER*1
+!>    Specifies whether to store information needed for post-
+!>    processing (e.g. computing refined Ritz vectors)
+!>    'R' :: The matrix needed for the refinement of the Ritz
+!>           vectors is computed and stored in the array B.
+!>           See the description of B.
+!>    'E' :: The unscaled eigenvectors of the Exact DMD are
+!>           computed and returned in the array B. See the
+!>           description of B.
+!>    'N' :: No eigenvector refinement data is computed.
+!>    \endverbatim
+!.....
+!>    \param[in] WHTSVD
+!>    \verbatim
+!>    WHTSVD (input) INTEGER, WHTSVD in { 1, 2, 3, 4 }
+!>    Allows for a selection of the SVD algorithm from the
+!>    LAPACK library.
+!>    1 :: SGESVD (the QR SVD algorithm)
+!>    2 :: SGESDD (the Divide and Conquer algorithm; if enough
+!>         workspace available, this is the fastest option)
+!>    3 :: SGESVDQ (the preconditioned QR SVD; this and 4
+!>         are the most accurate options)
+!>    4 :: SGEJSV (the preconditioned Jacobi SVD; this and 3
+!>         are the most accurate options)
+!>    For the four methods above, a significant difference in
+!>    the accuracy of small singular values is possible if
+!>    the snapshots vary in norm so that X is severely
+!>    ill-conditioned. If small (smaller than EPS*||X||)
+!>    singular values are of interest and JOBS=='N',  then
+!>    the options (3, 4) give the most accurate results, where
+!>    the option 4 is slightly better and with stronger
+!>    theoretical background.
+!>    If JOBS=='S', i.e. the columns of X will be normalized,
+!>    then all methods give nearly equally accurate results.
+!>    \endverbatim
+!.....
+!>    \param[in] M
+!>    \verbatim
+!>    M (input) INTEGER, M>= 0
+!>    The state space dimension (the row dimension of X, Y).
+!>    \endverbatim
+!.....
+!>    \param[in] N
+!>    \verbatim
+!>    N (input) INTEGER, 0 <= N <= M
+!>    The number of data snapshot pairs
+!>    (the number of columns of X and Y).
+!>    \endverbatim
+!.....
+!>    \param[in,out] X
+!>    \verbatim
+!>    X (input/output) REAL(KIND=WP) M-by-N array
+!>    > On entry, X contains the data snapshot matrix X. It is
+!>    assumed that the column norms of X are in the range of
+!>    the normalized floating point numbers.
+!>    < On exit, the leading K columns of X contain a POD basis,
+!>    i.e. the leading K left singular vectors of the input
+!>    data matrix X, U(:,1:K). All N columns of X contain all
+!>    left singular vectors of the input matrix X.
+!>    See the descriptions of K, Z and W.
+!>    \endverbatim
+!.....
+!>    \param[in] LDX
+!>    \verbatim
+!>    LDX (input) INTEGER, LDX >= M
+!>    The leading dimension of the array X.
+!>    \endverbatim
+!.....
+!>    \param[in,out] Y
+!>    \verbatim
+!>    Y (input/workspace/output) REAL(KIND=WP) M-by-N array
+!>    > On entry, Y contains the data snapshot matrix Y
+!>    < On exit,
+!>    If JOBR == 'R', the leading K columns of Y  contain
+!>    the residual vectors for the computed Ritz pairs.
+!>    See the description of RES.
+!>    If JOBR == 'N', Y contains the original input data,
+!>                    scaled according to the value of JOBS.
+!>    \endverbatim
+!.....
+!>    \param[in] LDY
+!>    \verbatim
+!>    LDY (input) INTEGER , LDY >= M
+!>    The leading dimension of the array Y.
+!>    \endverbatim
+!.....
+!>    \param[in] NRNK
+!>    \verbatim
+!>    NRNK (input) INTEGER
+!>    Determines how the numerical rank is computed,
+!>    i.e. how to truncate small singular values of the input
+!>    matrix X. On input, if
+!>    NRNK = -1 :: i-th singular value sigma(i) is truncated
+!>                 if sigma(i) <= TOL*sigma(1)
+!>                 This option is recommended.
+!>    NRNK = -2 :: i-th singular value sigma(i) is truncated
+!>                 if sigma(i) <= TOL*sigma(i-1)
+!>                 This option is included for R&D purposes.
+!>                 It requires highly accurate SVD, which
+!>                 may not be feasible.
+!>    The numerical rank can be enforced by using positive
+!>    value of NRNK as follows:
+!>    0 < NRNK <= N :: at most NRNK largest singular values
+!>    will be used. If the number of the computed nonzero
+!>    singular values is less than NRNK, then only those
+!>    nonzero values will be used and the actually used
+!>    dimension is less than NRNK. The actual number of
+!>    the nonzero singular values is returned in the variable
+!>    K. See the descriptions of TOL and  K.
+!>    \endverbatim
+!.....
+!>    \param[in] TOL
+!>    \verbatim
+!>    TOL (input) REAL(KIND=WP), 0 <= TOL < 1
+!>    The tolerance for truncating small singular values.
+!>    See the description of NRNK.
+!>    \endverbatim
+!.....
+!>    \param[out] K
+!>    \verbatim
+!>    K (output) INTEGER,  0 <= K <= N
+!>    The dimension of the POD basis for the data snapshot
+!>    matrix X and the number of the computed Ritz pairs.
+!>    The value of K is determined according to the rule set
+!>    by the parameters NRNK and TOL.
+!>    See the descriptions of NRNK and TOL.
+!>    \endverbatim
+!.....
+!>    \param[out] REIG
+!>    \verbatim
+!>    REIG (output) REAL(KIND=WP) N-by-1 array
+!>    The leading K (K<=N) entries of REIG contain
+!>    the real parts of the computed eigenvalues
+!>    REIG(1:K) + sqrt(-1)*IMEIG(1:K).
+!>    See the descriptions of K, IMEIG, and Z.
+!>    \endverbatim
+!.....
+!>    \param[out] IMEIG
+!>    \verbatim
+!>    IMEIG (output) REAL(KIND=WP) N-by-1 array
+!>    The leading K (K<=N) entries of IMEIG contain
+!>    the imaginary parts of the computed eigenvalues
+!>    REIG(1:K) + sqrt(-1)*IMEIG(1:K).
+!>    The eigenvalues are determined as follows:
+!>    If IMEIG(i) == 0, then the corresponding eigenvalue is
+!>    real, LAMBDA(i) = REIG(i).
+!>    If IMEIG(i)>0, then the corresponding complex
+!>    conjugate pair of eigenvalues reads
+!>    LAMBDA(i)   = REIG(i) + sqrt(-1)*IMEIG(i)
+!>    LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMEIG(i)
+!>    That is, complex conjugate pairs have consecutive
+!>    indices (i,i+1), with the positive imaginary part
+!>    listed first.
+!>    See the descriptions of K, REIG, and Z.
+!>    \endverbatim
+!.....
+!>    \param[out] Z
+!>    \verbatim
+!>    Z (workspace/output) REAL(KIND=WP)  M-by-N array
+!>    If JOBZ =='V' then
+!>       Z contains real Ritz vectors as follows:
+!>       If IMEIG(i)=0, then Z(:,i) is an eigenvector of
+!>       the i-th Ritz value; ||Z(:,i)||_2=1.
+!>       If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then
+!>       [Z(:,i) Z(:,i+1)] span an invariant subspace and
+!>       the Ritz values extracted from this subspace are
+!>       REIG(i) + sqrt(-1)*IMEIG(i) and
+!>       REIG(i) - sqrt(-1)*IMEIG(i).
+!>       The corresponding eigenvectors are
+!>       Z(:,i) + sqrt(-1)*Z(:,i+1) and
+!>       Z(:,i) - sqrt(-1)*Z(:,i+1), respectively.
+!>       || Z(:,i:i+1)||_F = 1.
+!>    If JOBZ == 'F', then the above descriptions hold for
+!>    the columns of X(:,1:K)*W(1:K,1:K), where the columns
+!>    of W(1:K,1:K) are the computed eigenvectors of the
+!>    K-by-K Rayleigh quotient. The columns of W(1:K,1:K)
+!>    are similarly structured: If IMEIG(i) == 0 then
+!>    X(:,1:K)*W(:,i) is an eigenvector, and if IMEIG(i)>0
+!>    then X(:,1:K)*W(:,i)+sqrt(-1)*X(:,1:K)*W(:,i+1) and
+!>         X(:,1:K)*W(:,i)-sqrt(-1)*X(:,1:K)*W(:,i+1)
+!>    are the eigenvectors of LAMBDA(i), LAMBDA(i+1).
+!>    See the descriptions of REIG, IMEIG, X and W.
+!>    \endverbatim
+!.....
+!>    \param[in] LDZ
+!>    \verbatim
+!>    LDZ (input) INTEGER , LDZ >= M
+!>    The leading dimension of the array Z.
+!>    \endverbatim
+!.....
+!>    \param[out] RES
+!>    \verbatim
+!>    RES (output) REAL(KIND=WP) N-by-1 array
+!>    RES(1:K) contains the residuals for the K computed
+!>    Ritz pairs.
+!>    If LAMBDA(i) is real, then
+!>       RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i) ||_2.
+!>    If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair
+!>    then
+!>    RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F
+!>    where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ]
+!>              [-imag(LAMBDA(i)) real(LAMBDA(i)) ].
+!>    It holds that
+!>    RES(i)   = || A*ZC(:,i)   - LAMBDA(i)  *ZC(:,i)   ||_2
+!>    RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2
+!>    where ZC(:,i)   =  Z(:,i) + sqrt(-1)*Z(:,i+1)
+!>          ZC(:,i+1) =  Z(:,i) - sqrt(-1)*Z(:,i+1)
+!>    See the description of REIG, IMEIG and Z.
+!>    \endverbatim
+!.....
+!>    \param[out] B
+!>    \verbatim
+!>    B (output) REAL(KIND=WP)  M-by-N array.
+!>    If JOBF == 'R', B(1:M,1:K) contains A*U(:,1:K), and can
+!>    be used for computing the refined vectors; see further
+!>    details in the provided references.
+!>    If JOBF == 'E', B(1:M,1:K) contains
+!>    A*U(:,1:K)*W(1:K,1:K), which are the vectors from the
+!>    Exact DMD, up to scaling by the inverse eigenvalues.
+!>    If JOBF =='N', then B is not referenced.
+!>    See the descriptions of X, W, K.
+!>    \endverbatim
+!.....
+!>    \param[in] LDB
+!>    \verbatim
+!>    LDB (input) INTEGER, LDB >= M
+!>    The leading dimension of the array B.
+!>    \endverbatim
+!.....
+!>    \param[out] W
+!>    \verbatim
+!>    W (workspace/output) REAL(KIND=WP) N-by-N array
+!>    On exit, W(1:K,1:K) contains the K computed
+!>    eigenvectors of the matrix Rayleigh quotient (real and
+!>    imaginary parts for each complex conjugate pair of the
+!>    eigenvalues). The Ritz vectors (returned in Z) are the
+!>    product of X (containing a POD basis for the input
+!>    matrix X) and W. See the descriptions of K, S, X and Z.
+!>    W is also used as a workspace to temporarily store the
+!>    right singular vectors of X.
+!>    \endverbatim
+!.....
+!>    \param[in] LDW
+!>    \verbatim
+!>    LDW (input) INTEGER, LDW >= N
+!>    The leading dimension of the array W.
+!>    \endverbatim
+!.....
+!>    \param[out] S
+!>    \verbatim
+!>    S (workspace/output) REAL(KIND=WP) N-by-N array
+!>    The array S(1:K,1:K) is used for the matrix Rayleigh
+!>    quotient. This content is overwritten during
+!>    the eigenvalue decomposition by SGEEV.
+!>    See the description of K.
+!>    \endverbatim
+!.....
+!>    \param[in] LDS
+!>    \verbatim
+!>    LDS (input) INTEGER, LDS >= N
+!>    The leading dimension of the array S.
+!>    \endverbatim
+!.....
+!>    \param[out] WORK
+!>    \verbatim
+!>    WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array
+!>    On exit, WORK(1:N) contains the singular values of
+!>    X (for JOBS=='N') or column scaled X (JOBS=='S', 'C').
+!>    If WHTSVD==4, then WORK(N+1) and WORK(N+2) contain
+!>    scaling factor WORK(N+2)/WORK(N+1) used to scale X
+!>    and Y to avoid overflow in the SVD of X.
+!>    This may be of interest if the scaling option is off
+!>    and as many as possible smallest eigenvalues are
+!>    desired to the highest feasible accuracy.
+!>    If the call to SGEDMD is only workspace query, then
+!>    WORK(1) contains the minimal workspace length and
+!>    WORK(2) is the optimal workspace length. Hence, the
+!>    length of work is at least 2.
+!>    See the description of LWORK.
+!>    \endverbatim
+!.....
+!>    \param[in] LWORK
+!>    \verbatim
+!>    LWORK (input) INTEGER
+!>    The minimal length of the workspace vector WORK.
+!>    LWORK is calculated as follows:
+!>    If WHTSVD == 1 ::
+!>       If JOBZ == 'V', then
+!>       LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N)).
+!>       If JOBZ == 'N'  then
+!>       LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N)).
+!>       Here LWORK_SVD = MAX(1,3*N+M,5*N) is the minimal
+!>       workspace length of SGESVD.
+!>    If WHTSVD == 2 ::
+!>       If JOBZ == 'V', then
+!>       LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,4*N))
+!>       If JOBZ == 'N', then
+!>       LWORK >= MAX(2, N + LWORK_SVD, N+MAX(1,3*N))
+!>       Here LWORK_SVD = MAX(M, 5*N*N+4*N)+3*N*N is the
+!>       minimal workspace length of SGESDD.
+!>    If WHTSVD == 3 ::
+!>       If JOBZ == 'V', then
+!>       LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N))
+!>       If JOBZ == 'N', then
+!>       LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N))
+!>       Here LWORK_SVD = N+M+MAX(3*N+1,
+!>                       MAX(1,3*N+M,5*N),MAX(1,N))
+!>       is the minimal workspace length of SGESVDQ.
+!>    If WHTSVD == 4 ::
+!>       If JOBZ == 'V', then
+!>       LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,4*N))
+!>       If JOBZ == 'N', then
+!>       LWORK >= MAX(2, N+LWORK_SVD,N+MAX(1,3*N))
+!>       Here LWORK_SVD = MAX(7,2*M+N,6*N+2*N*N) is the
+!>       minimal workspace length of SGEJSV.
+!>    The above expressions are not simplified in order to
+!>    make the usage of WORK more transparent, and for
+!>    easier checking. In any case, LWORK >= 2.
+!>    If on entry LWORK = -1, then a workspace query is
+!>    assumed and the procedure only computes the minimal
+!>    and the optimal workspace lengths for both WORK and
+!>    IWORK. See the descriptions of WORK and IWORK.
+!>    \endverbatim
+!.....
+!>    \param[out] IWORK
+!>    \verbatim
+!>    IWORK (workspace/output) INTEGER LIWORK-by-1 array
+!>    Workspace that is required only if WHTSVD equals
+!>    2 , 3 or 4. (See the description of WHTSVD).
+!>    If on entry LWORK =-1 or LIWORK=-1, then the
+!>    minimal length of IWORK is computed and returned in
+!>    IWORK(1). See the description of LIWORK.
+!>    \endverbatim
+!.....
+!>    \param[in] LIWORK
+!>    \verbatim
+!>    LIWORK (input) INTEGER
+!>    The minimal length of the workspace vector IWORK.
+!>    If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1
+!>    If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N))
+!>    If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1)
+!>    If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N)
+!>    If on entry LIWORK = -1, then a workspace query is
+!>    assumed and the procedure only computes the minimal
+!>    and the optimal workspace lengths for both WORK and
+!>    IWORK. See the descriptions of WORK and IWORK.
+!>    \endverbatim
+!.....
+!>    \param[out] INFO
+!>    \verbatim
+!>    INFO (output) INTEGER
+!>    -i < 0 :: On entry, the i-th argument had an
+!>              illegal value
+!>       = 0 :: Successful return.
+!>       = 1 :: Void input. Quick exit (M=0 or N=0).
+!>       = 2 :: The SVD computation of X did not converge.
+!>              Suggestion: Check the input data and/or
+!>              repeat with different WHTSVD.
+!>       = 3 :: The computation of the eigenvalues did not
+!>              converge.
+!>       = 4 :: If data scaling was requested on input and
+!>              the procedure found inconsistency in the data
+!>              such that for some column index i,
+!>              X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set
+!>              to zero if JOBS=='C'. The computation proceeds
+!>              with original or modified data and warning
+!>              flag is set with INFO=4.
+!>    \endverbatim
+!
+!  Authors:
+!  ========
+!
+!> \author Zlatko Drmac
+!
+!> \ingroup gedmd
+!
+!.............................................................
+!.............................................................
+      SUBROUTINE SGEDMD( JOBS, JOBZ, JOBR, JOBF,  WHTSVD,  &
+                         M, N, X, LDX, Y, LDY, NRNK, TOL,  &
+                         K, REIG,  IMEIG,   Z, LDZ,  RES,  &
+                         B, LDB, W,  LDW,   S, LDS,        &
+                         WORK, LWORK, IWORK, LIWORK, INFO )
+!
+!  -- LAPACK driver routine                                           --
+!
+!  -- LAPACK is a software package provided by University of          --
+!  -- Tennessee, University of California Berkeley, University of     --
+!  -- Colorado Denver and NAG Ltd..                                   --
+!
+!.....
+      USE                   iso_fortran_env
+      IMPLICIT NONE
+      INTEGER, PARAMETER :: WP = real32
+!
+!     Scalar arguments
+!     ~~~~~~~~~~~~~~~~
+      CHARACTER, INTENT(IN)   :: JOBS,   JOBZ,  JOBR,  JOBF
+      INTEGER,   INTENT(IN)   :: WHTSVD, M, N,   LDX,  LDY, &
+                                 NRNK, LDZ, LDB, LDW,  LDS, &
+                                 LWORK,  LIWORK
+      INTEGER,   INTENT(OUT)  :: K, INFO
+      REAL(KIND=WP), INTENT(IN) ::  TOL
+!
+!     Array arguments
+!     ~~~~~~~~~~~~~~~
+      REAL(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*)
+      REAL(KIND=WP), INTENT(OUT)   :: Z(LDZ,*), B(LDB,*), &
+                                      W(LDW,*), S(LDS,*)
+      REAL(KIND=WP), INTENT(OUT)   :: REIG(*),  IMEIG(*), &
+                                      RES(*)
+      REAL(KIND=WP), INTENT(OUT)   :: WORK(*)
+      INTEGER,       INTENT(OUT)   :: IWORK(*)
+!
+!     Parameters
+!     ~~~~~~~~~~
+      REAL(KIND=WP), PARAMETER ::  ONE = 1.0_WP
+      REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP
+!
+!     Local scalars
+!     ~~~~~~~~~~~~~
+      REAL(KIND=WP) :: OFL,   ROOTSC, SCALE,  SMALL,   &
+                       SSUM,  XSCL1,  XSCL2
+      INTEGER       ::  i,  j, IMINWR,  INFO1, INFO2,  &
+                       LWRKEV, LWRSDD, LWRSVD, &
+                       LWRSVQ, MLWORK, MWRKEV, MWRSDD, &
+                       MWRSVD, MWRSVJ, MWRSVQ, NUMRNK, &
+                       OLWORK
+      LOGICAL       ::  BADXY, LQUERY, SCCOLX, SCCOLY, &
+                        WNTEX, WNTREF, WNTRES, WNTVEC
+      CHARACTER     ::  JOBZL, T_OR_N
+      CHARACTER     ::  JSVOPT
+!
+!     Local arrays
+!     ~~~~~~~~~~~~
+      REAL(KIND=WP) :: AB(2,2), RDUMMY(2), RDUMMY2(2)
+!
+!     External functions (BLAS and LAPACK)
+!     ~~~~~~~~~~~~~~~~~
+      REAL(KIND=WP) SLANGE, SLAMCH, SNRM2
+      EXTERNAL      SLANGE, SLAMCH, SNRM2, ISAMAX
+      INTEGER       ISAMAX
+      LOGICAL       SISNAN, LSAME
+      EXTERNAL      SISNAN, LSAME
+!
+!     External subroutines (BLAS and LAPACK)
+!     ~~~~~~~~~~~~~~~~~~~~
+      EXTERNAL      SAXPY,  SGEMM,  SSCAL
+      EXTERNAL      SGEEV,  SGEJSV, SGESDD, SGESVD, SGESVDQ, &
+                    SLACPY, SLASCL, SLASSQ, XERBLA
+!
+!     Intrinsic functions
+!     ~~~~~~~~~~~~~~~~~~~
+      INTRINSIC     INT, FLOAT, MAX, SQRT
+!............................................................
+!
+!    Test the input arguments
+!
+      WNTRES = LSAME(JOBR,'R')
+      SCCOLX = LSAME(JOBS,'S') .OR. LSAME(JOBS,'C')
+      SCCOLY = LSAME(JOBS,'Y')
+      WNTVEC = LSAME(JOBZ,'V')
+      WNTREF = LSAME(JOBF,'R')
+      WNTEX  = LSAME(JOBF,'E')
+      INFO   = 0
+      LQUERY = ( ( LWORK == -1 ) .OR. ( LIWORK == -1 ) )
+!
+      IF ( .NOT. (SCCOLX .OR. SCCOLY .OR. &
+                                  LSAME(JOBS,'N')) )   THEN
+          INFO = -1
+      ELSE IF ( .NOT. (WNTVEC .OR. LSAME(JOBZ,'N')        &
+                              .OR. LSAME(JOBZ,'F')) )  THEN
+          INFO = -2
+      ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR.  &
+                ( WNTRES .AND. (.NOT.WNTVEC) ) )       THEN
+          INFO = -3
+      ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR.             &
+                LSAME(JOBF,'N') ) )                    THEN
+          INFO = -4
+      ELSE IF ( .NOT.((WHTSVD == 1) .OR. (WHTSVD == 2) .OR.  &
+                      (WHTSVD == 3) .OR. (WHTSVD == 4) )) THEN
+          INFO = -5
+      ELSE IF ( M < 0 )   THEN
+          INFO = -6
+      ELSE IF ( ( N < 0 ) .OR. ( N > M ) ) THEN
+          INFO = -7
+      ELSE IF ( LDX < M ) THEN
+          INFO = -9
+      ELSE IF ( LDY < M ) THEN
+          INFO = -11
+      ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR. &
+                ((NRNK >= 1).AND.(NRNK <=N ))) )      THEN
+          INFO = -12
+      ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) )  THEN
+          INFO = -13
+      ELSE IF ( LDZ < M ) THEN
+          INFO = -18
+      ELSE IF ( (WNTREF .OR. WNTEX ) .AND. ( LDB < M ) ) THEN
+          INFO = -21
+      ELSE IF ( LDW < N ) THEN
+          INFO = -23
+      ELSE IF ( LDS < N ) THEN
+          INFO = -25
+      END IF
+!
+      IF ( INFO == 0 ) THEN
+          ! Compute the minimal and the optimal workspace
+          ! requirements. Simulate running the code and
+          ! determine minimal and optimal sizes of the
+          ! workspace at any moment of the run.
+         IF ( N == 0 ) THEN
+             ! Quick return. All output except K is void.
+             ! INFO=1 signals the void input.
+             ! In case of a workspace query, the default
+             ! minimal workspace lengths are returned.
+            IF ( LQUERY ) THEN
+                IWORK(1) = 1
+                WORK(1)  = 2
+                WORK(2)  = 2
+            ELSE
+               K = 0
+            END IF
+            INFO = 1
+            RETURN
+         END IF
+         MLWORK = MAX(2,N)
+         OLWORK = MAX(2,N)
+         IMINWR = 1
+         SELECT CASE ( WHTSVD )
+         CASE (1)
+             ! The following is specified as the minimal
+             ! length of WORK in the definition of SGESVD:
+             ! MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N))
+             MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N))
+             MLWORK = MAX(MLWORK,N + MWRSVD)
+             IF ( LQUERY ) THEN
+                CALL SGESVD( 'O', 'S', M, N, X, LDX, WORK, &
+                           B, LDB, W, LDW, RDUMMY, -1, INFO1 )
+                LWRSVD = MAX( MWRSVD, INT( RDUMMY(1) ) )
+                OLWORK = MAX(OLWORK,N + LWRSVD)
+             END IF
+         CASE (2)
+             ! The following is specified as the minimal
+             ! length of WORK in the definition of SGESDD:
+             ! MWRSDD = 3*MIN(M,N)*MIN(M,N) +
+             ! MAX( MAX(M,N),5*MIN(M,N)*MIN(M,N)+4*MIN(M,N) )
+             ! IMINWR = 8*MIN(M,N)
+             MWRSDD = 3*MIN(M,N)*MIN(M,N) +                &
+              MAX( MAX(M,N),5*MIN(M,N)*MIN(M,N)+4*MIN(M,N) )
+             MLWORK = MAX(MLWORK,N + MWRSDD)
+             IMINWR = 8*MIN(M,N)
+             IF ( LQUERY ) THEN
+                CALL SGESDD( 'O', M, N, X, LDX, WORK, B,     &
+                     LDB, W, LDW, RDUMMY, -1, IWORK, INFO1 )
+                LWRSDD = MAX( MWRSDD, INT( RDUMMY(1) ) )
+                OLWORK = MAX(OLWORK,N + LWRSDD)
+             END IF
+         CASE (3)
+             !LWQP3 = 3*N+1
+             !LWORQ = MAX(N, 1)
+             !MWRSVD = MAX(1,3*MIN(M,N)+MAX(M,N),5*MIN(M,N))
+             !MWRSVQ = N + MAX( LWQP3, MWRSVD, LWORQ )+ MAX(M,2)
+             !MLWORK = N + MWRSVQ
+             !IMINWR = M+N-1
+             CALL SGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, &
+                          X, LDX, WORK, Z, LDZ, W, LDW,   &
+                             NUMRNK, IWORK, -1, RDUMMY,   &
+                             -1, RDUMMY2, -1, INFO1 )
+             IMINWR = IWORK(1)
+             MWRSVQ = INT(RDUMMY(2))
+             MLWORK = MAX(MLWORK,N+MWRSVQ+INT(RDUMMY2(1)))
+             IF ( LQUERY ) THEN
+                LWRSVQ = INT(RDUMMY(1))
+                OLWORK = MAX(OLWORK,N+LWRSVQ+INT(RDUMMY2(1)))
+             END IF
+         CASE (4)
+             JSVOPT = 'J'
+             !MWRSVJ = MAX( 7, 2*M+N, 6*N+2*N*N )! for JSVOPT='V'
+             MWRSVJ = MAX( 7, 2*M+N, 4*N+N*N, 2*N+N*N+6 )
+             MLWORK = MAX(MLWORK,N+MWRSVJ)
+             IMINWR = MAX( 3, M+3*N )
+             IF ( LQUERY ) THEN
+                OLWORK = MAX(OLWORK,N+MWRSVJ)
+             END IF
+         END SELECT
+         IF ( WNTVEC .OR. WNTEX .OR. LSAME(JOBZ,'F') ) THEN
+             JOBZL = 'V'
+         ELSE
+             JOBZL = 'N'
+         END IF
+         ! Workspace calculation to the SGEEV call
+         IF ( LSAME(JOBZL,'V') ) THEN
+             MWRKEV = MAX( 1, 4*N )
+         ELSE
+             MWRKEV = MAX( 1, 3*N )
+         END IF
+         MLWORK = MAX(MLWORK,N+MWRKEV)
+         IF ( LQUERY ) THEN
+                CALL SGEEV( 'N', JOBZL, N, S, LDS, REIG, &
+                    IMEIG, W, LDW, W, LDW, RDUMMY, -1, INFO1 )
+                LWRKEV = MAX( MWRKEV, INT(RDUMMY(1)) )
+                OLWORK = MAX( OLWORK, N+LWRKEV )
+         END IF
+!
+         IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -29
+         IF (  LWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -27
+      END IF
+!
+      IF( INFO /= 0 ) THEN
+         CALL XERBLA( 'SGEDMD', -INFO )
+         RETURN
+      ELSE IF ( LQUERY ) THEN
+!     Return minimal and optimal workspace sizes
+          IWORK(1) = IMINWR
+          WORK(1)  = MLWORK
+          WORK(2)  = OLWORK
+          RETURN
+      END IF
+!............................................................
+!
+      OFL   = SLAMCH('O')
+      SMALL = SLAMCH('S')
+      BADXY = .FALSE.
+!
+!     <1> Optional scaling of the snapshots (columns of X, Y)
+!     ==========================================================
+      IF ( SCCOLX ) THEN
+          ! The columns of X will be normalized.
+          ! To prevent overflows, the column norms of X are
+          ! carefully computed using SLASSQ.
+          K = 0
+          DO i = 1, N
+            !WORK(i) = DNRM2( M, X(1,i), 1 )
+            SSUM  = ONE
+            SCALE = ZERO
+            CALL SLASSQ( M, X(1,i), 1, SCALE, SSUM )
+            IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN
+                K    =  0
+                INFO = -8
+                CALL XERBLA('SGEDMD',-INFO)
+            END IF
+            IF ( (SCALE /= ZERO) .AND. (SSUM /= ZERO) ) THEN
+               ROOTSC = SQRT(SSUM)
+               IF ( SCALE .GE. (OFL / ROOTSC) ) THEN
+!                 Norm of X(:,i) overflows. First, X(:,i)
+!                 is scaled by
+!                 ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2.
+!                 Next, the norm of X(:,i) is stored without
+!                 overflow as WORK(i) = - SCALE * (ROOTSC/M),
+!                 the minus sign indicating the 1/M factor.
+!                 Scaling is performed without overflow, and
+!                 underflow may occur in the smallest entries
+!                 of X(:,i). The relative backward and forward
+!                 errors are small in the ell_2 norm.
+                  CALL SLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, &
+                               M, 1, X(1,i), M, INFO2 )
+                  WORK(i) = - SCALE * ( ROOTSC / FLOAT(M) )
+               ELSE
+!                 X(:,i) will be scaled to unit 2-norm
+                  WORK(i) =   SCALE * ROOTSC
+                  CALL SLASCL( 'G',0, 0, WORK(i), ONE, M, 1, &
+                               X(1,i), M, INFO2 )              ! LAPACK CALL
+!                 X(1:M,i) = (ONE/WORK(i)) * X(1:M,i)          ! INTRINSIC
+               END IF
+            ELSE
+               WORK(i) = ZERO
+               K = K + 1
+            END IF
+          END DO
+          IF ( K == N ) THEN
+          ! All columns of X are zero. Return error code -8.
+          ! (the 8th input variable had an illegal value)
+          K = 0
+          INFO = -8
+          CALL XERBLA('SGEDMD',-INFO)
+          RETURN
+          END IF
+          DO i = 1, N
+!           Now, apply the same scaling to the columns of Y.
+            IF ( WORK(i) >  ZERO ) THEN
+                CALL SSCAL( M, ONE/WORK(i), Y(1,i), 1 )  ! BLAS CALL
+!               Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i)      ! INTRINSIC
+            ELSE IF ( WORK(i) < ZERO ) THEN
+                CALL SLASCL( 'G', 0, 0, -WORK(i),          &
+                     ONE/FLOAT(M), M, 1, Y(1,i), M, INFO2 ) ! LAPACK CALL
+            ELSE IF ( Y(ISAMAX(M, Y(1,i),1),i )  &
+                                            /= ZERO ) THEN
+!               X(:,i) is zero vector. For consistency,
+!               Y(:,i) should also be zero. If Y(:,i) is not
+!               zero, then the data might be inconsistent or
+!               corrupted. If JOBS == 'C', Y(:,i) is set to
+!               zero and a warning flag is raised.
+!               The computation continues but the
+!               situation will be reported in the output.
+                BADXY = .TRUE.
+                IF ( LSAME(JOBS,'C')) &
+                CALL SSCAL( M, ZERO, Y(1,i), 1 )  ! BLAS CALL
+            END IF
+          END DO
+      END IF
+  !
+      IF ( SCCOLY ) THEN
+          ! The columns of Y will be normalized.
+          ! To prevent overflows, the column norms of Y are
+          ! carefully computed using SLASSQ.
+          DO i = 1, N
+            !WORK(i) = DNRM2( M, Y(1,i), 1 )
+            SSUM  = ONE
+            SCALE = ZERO
+            CALL SLASSQ( M, Y(1,i), 1, SCALE, SSUM )
+            IF ( SISNAN(SCALE) .OR. SISNAN(SSUM) ) THEN
+                K    =  0
+                INFO = -10
+                CALL XERBLA('SGEDMD',-INFO)
+            END IF
+            IF ( SCALE /= ZERO  .AND. (SSUM /= ZERO) ) THEN
+               ROOTSC = SQRT(SSUM)
+               IF ( SCALE .GE. (OFL / ROOTSC) ) THEN
+!                 Norm of Y(:,i) overflows. First, Y(:,i)
+!                 is scaled by
+!                 ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2.
+!                 Next, the norm of Y(:,i) is stored without
+!                 overflow as WORK(i) = - SCALE * (ROOTSC/M),
+!                 the minus sign indicating the 1/M factor.
+!                 Scaling is performed without overflow, and
+!                 underflow may occur in the smallest entries
+!                 of Y(:,i). The relative backward and forward
+!                 errors are small in the ell_2 norm.
+                  CALL SLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, &
+                               M, 1, Y(1,i), M, INFO2 )
+                  WORK(i) = - SCALE * ( ROOTSC / FLOAT(M) )
+               ELSE
+!                 Y(:,i) will be scaled to unit 2-norm
+                  WORK(i) =   SCALE * ROOTSC
+                  CALL SLASCL( 'G',0, 0, WORK(i), ONE, M, 1, &
+                               Y(1,i), M, INFO2 )              ! LAPACK CALL
+!                 Y(1:M,i) = (ONE/WORK(i)) * Y(1:M,i)          ! INTRINSIC
+               END IF
+            ELSE
+               WORK(i) = ZERO
+            END IF
+         END DO
+         DO i = 1, N
+!           Now, apply the same scaling to the columns of X.
+            IF ( WORK(i) >  ZERO ) THEN
+                CALL SSCAL( M, ONE/WORK(i), X(1,i), 1 )  ! BLAS CALL
+!               X(1:M,i) = (ONE/WORK(i)) * X(1:M,i)      ! INTRINSIC
+            ELSE IF ( WORK(i) < ZERO ) THEN
+                CALL SLASCL( 'G', 0, 0, -WORK(i),          &
+                     ONE/FLOAT(M), M, 1, X(1,i), M, INFO2 ) ! LAPACK CALL
+            ELSE IF ( X(ISAMAX(M, X(1,i),1),i )  &
+                                           /= ZERO ) THEN
+!               Y(:,i) is zero vector.  If X(:,i) is not
+!               zero, then a warning flag is raised.
+!               The computation continues but the
+!               situation will be reported in the output.
+                BADXY = .TRUE.
+            END IF
+         END DO
+       END IF
+!
+!     <2> SVD of the data snapshot matrix X.
+!     =====================================
+!     The left singular vectors are stored in the array X.
+!     The right singular vectors are in the array W.
+!     The array W will later on contain the eigenvectors
+!     of a Rayleigh quotient.
+      NUMRNK = N
+      SELECT CASE ( WHTSVD )
+         CASE (1)
+             CALL SGESVD( 'O', 'S', M, N, X, LDX, WORK, B, &
+                  LDB, W, LDW, WORK(N+1), LWORK-N, INFO1 ) ! LAPACK CALL
+             T_OR_N = 'T'
+         CASE (2)
+            CALL SGESDD( 'O', M, N, X, LDX, WORK, B, LDB, W, &
+                 LDW, WORK(N+1), LWORK-N, IWORK, INFO1 )   ! LAPACK CALL
+            T_OR_N = 'T'
+         CASE (3)
+              CALL SGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, &
+                   X, LDX, WORK, Z, LDZ, W, LDW, &
+                   NUMRNK, IWORK, LIWORK, WORK(N+MAX(2,M)+1),&
+                   LWORK-N-MAX(2,M), WORK(N+1), MAX(2,M), INFO1)     ! LAPACK CALL
+              CALL SLACPY( 'A', M, NUMRNK, Z, LDZ, X, LDX )   ! LAPACK CALL
+         T_OR_N = 'T'
+         CASE (4)
+              CALL SGEJSV( 'F', 'U', JSVOPT, 'N', 'N', 'P', M, &
+                   N, X, LDX, WORK, Z, LDZ, W, LDW, &
+                   WORK(N+1), LWORK-N, IWORK, INFO1 )    ! LAPACK CALL
+              CALL SLACPY( 'A', M, N, Z, LDZ, X, LDX )   ! LAPACK CALL
+              T_OR_N = 'N'
+              XSCL1 = WORK(N+1)
+              XSCL2 = WORK(N+2)
+              IF ( XSCL1 /=  XSCL2 ) THEN
+                 ! This is an exceptional situation. It arises if
+                 ! the data matrices are not scaled and the
+                 ! largest singular value of X overflows.
+                 ! In that case SGEJSV can return the SVD
+                 ! in scaled form. The scaling factor can be used
+                 ! to rescale the data (X and Y).
+                 CALL SLASCL( 'G', 0, 0, XSCL1, XSCL2, M, N, Y, LDY, INFO2  )
+              END IF
+      END SELECT
+!
+      IF ( INFO1 > 0 ) THEN
+         ! The SVD selected subroutine did not converge.
+         ! Return with an error code.
+         INFO = 2
+         RETURN
+      END IF
+!
+      IF ( WORK(1) == ZERO ) THEN
+          ! The largest computed singular value of (scaled)
+          ! X is zero. Return error code -8
+          ! (the 8th input variable had an illegal value).
+          K = 0
+          INFO = -8
+          CALL XERBLA('SGEDMD',-INFO)
+          RETURN
+      END IF
+!
+      !<3> Determine the numerical rank of the data
+      !    snapshots matrix X. This depends on the
+      !    parameters NRNK and TOL.
+
+      SELECT CASE ( NRNK )
+          CASE ( -1 )
+               K = 1
+               DO i = 2, NUMRNK
+                 IF ( ( WORK(i) <= WORK(1)*TOL ) .OR. &
+                      ( WORK(i) <= SMALL ) ) EXIT
+                 K = K + 1
+               END DO
+          CASE ( -2 )
+               K = 1
+               DO i = 1, NUMRNK-1
+                 IF ( ( WORK(i+1) <= WORK(i)*TOL  ) .OR. &
+                      ( WORK(i) <= SMALL ) ) EXIT
+                 K = K + 1
+               END DO
+          CASE DEFAULT
+               K = 1
+               DO i = 2, NRNK
+                  IF ( WORK(i) <= SMALL ) EXIT
+                  K = K + 1
+               END DO
+          END SELECT
+      !   Now, U = X(1:M,1:K) is the SVD/POD basis for the
+      !   snapshot data in the input matrix X.
+
+      !<4> Compute the Rayleigh quotient S = U^T * A * U.
+      !    Depending on the requested outputs, the computation
+      !    is organized to compute additional auxiliary
+      !    matrices (for the residuals and refinements).
+      !
+      !    In all formulas below, we need V_k*Sigma_k^(-1)
+      !    where either V_k is in W(1:N,1:K), or V_k^T is in
+      !    W(1:K,1:N). Here Sigma_k=diag(WORK(1:K)).
+      IF ( LSAME(T_OR_N, 'N') ) THEN
+          DO i = 1, K
+           CALL SSCAL( N, ONE/WORK(i), W(1,i), 1 )    ! BLAS CALL
+           ! W(1:N,i) = (ONE/WORK(i)) * W(1:N,i)      ! INTRINSIC
+          END DO
+      ELSE
+          ! This non-unit stride access is due to the fact
+          ! that SGESVD, SGESVDQ and SGESDD return the
+          ! transposed matrix of the right singular vectors.
+          !DO i = 1, K
+          ! CALL SSCAL( N, ONE/WORK(i), W(i,1), LDW )    ! BLAS CALL
+          ! ! W(i,1:N) = (ONE/WORK(i)) * W(i,1:N)      ! INTRINSIC
+          !END DO
+          DO i = 1, K
+              WORK(N+i) = ONE/WORK(i)
+          END DO
+          DO j = 1, N
+             DO i = 1, K
+                 W(i,j) = (WORK(N+i))*W(i,j)
+             END DO
+          END DO
+      END IF
+!
+      IF ( WNTREF ) THEN
+         !
+         ! Need A*U(:,1:K)=Y*V_k*inv(diag(WORK(1:K)))
+         ! for computing the refined Ritz vectors
+         ! (optionally, outside SGEDMD).
+          CALL SGEMM( 'N', T_OR_N, M, K, N, ONE, Y, LDY, W, &
+                      LDW, ZERO, Z, LDZ )                        ! BLAS CALL
+          ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(W(1:K,1:N)))  ! INTRINSIC, for T_OR_N=='T'
+          ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K))             ! INTRINSIC, for T_OR_N=='N'
+          !
+          ! At this point Z contains
+          ! A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and
+          ! this is needed for computing the residuals.
+          ! This matrix is  returned in the array B and
+          ! it can be used to compute refined Ritz vectors.
+          CALL SLACPY( 'A', M, K, Z, LDZ, B, LDB )   ! BLAS CALL
+          ! B(1:M,1:K) = Z(1:M,1:K)                  ! INTRINSIC
+
+          CALL SGEMM( 'T', 'N', K, K, M, ONE, X, LDX, Z, &
+                      LDZ, ZERO, S, LDS )                        ! BLAS CALL
+          ! S(1:K,1:K) = MATMUL(TRANSPOSE(X(1:M,1:K)),Z(1:M,1:K)) ! INTRINSIC
+          ! At this point S = U^T * A * U is the Rayleigh quotient.
+      ELSE
+        ! A * U(:,1:K) is not explicitly needed and the
+        ! computation is organized differently. The Rayleigh
+        ! quotient is computed more efficiently.
+        CALL SGEMM( 'T', 'N', K, N, M, ONE, X, LDX, Y, LDY, &
+                   ZERO, Z, LDZ )                                   ! BLAS CALL
+        ! Z(1:K,1:N) = MATMUL( TRANSPOSE(X(1:M,1:K)), Y(1:M,1:N) )  ! INTRINSIC
+        ! In the two SGEMM calls here, can use K for LDZ
+        CALL SGEMM( 'N', T_OR_N, K, K, N, ONE, Z, LDZ, W, &
+                    LDW, ZERO, S, LDS )                         ! BLAS CALL
+        ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(W(1:K,1:N))) ! INTRINSIC, for T_OR_N=='T'
+        ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K)))          ! INTRINSIC, for T_OR_N=='N'
+        ! At this point S = U^T * A * U is the Rayleigh quotient.
+        ! If the residuals are requested, save scaled V_k into Z.
+        ! Recall that V_k or V_k^T is stored in W.
+        IF ( WNTRES .OR. WNTEX ) THEN
+          IF ( LSAME(T_OR_N, 'N') ) THEN
+              CALL SLACPY( 'A', N, K, W, LDW, Z, LDZ )
+          ELSE
+              CALL SLACPY( 'A', K, N, W, LDW, Z, LDZ )
+          END IF
+        END IF
+      END IF
+!
+      !<5> Compute the Ritz values and (if requested) the
+      !   right eigenvectors of the Rayleigh quotient.
+      !
+      CALL SGEEV( 'N', JOBZL, K, S, LDS, REIG, IMEIG, W, &
+                  LDW, W, LDW, WORK(N+1), LWORK-N, INFO1 )   ! LAPACK CALL
+      !
+      ! W(1:K,1:K) contains the eigenvectors of the Rayleigh
+      ! quotient. Even in the case of complex spectrum, all
+      ! computation is done in real arithmetic. REIG and
+      ! IMEIG are the real and the imaginary parts of the
+      ! eigenvalues, so that the spectrum is given as
+      ! REIG(:) + sqrt(-1)*IMEIG(:). Complex conjugate pairs
+      ! are listed at consecutive positions. For such a
+      ! complex conjugate pair of the eigenvalues, the
+      ! corresponding eigenvectors are also a complex
+      ! conjugate pair with the real and imaginary parts
+      ! stored column-wise in W at the corresponding
+      ! consecutive column indices. See the description of Z.
+      ! Also, see the description of SGEEV.
+      IF ( INFO1 > 0 ) THEN
+         ! SGEEV failed to compute the eigenvalues and
+         ! eigenvectors of the Rayleigh quotient.
+         INFO = 3
+         RETURN
+      END IF
+!
+      ! <6> Compute the eigenvectors (if requested) and,
+      ! the residuals (if requested).
+      !
+      IF ( WNTVEC .OR. WNTEX ) THEN
+      IF ( WNTRES ) THEN
+          IF ( WNTREF ) THEN
+            ! Here, if the refinement is requested, we have
+            ! A*U(:,1:K) already computed and stored in Z.
+            ! For the residuals, need Y = A * U(:,1:K) * W.
+            CALL SGEMM( 'N', 'N', M, K, K, ONE, Z, LDZ, W, &
+                       LDW, ZERO, Y, LDY )               ! BLAS CALL
+            ! Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K)       ! INTRINSIC
+            ! This frees Z; Y contains A * U(:,1:K) * W.
+          ELSE
+            ! Compute S = V_k * Sigma_k^(-1) * W, where
+            ! V_k * Sigma_k^(-1) is stored in Z
+            CALL SGEMM( T_OR_N, 'N', N, K, K, ONE, Z, LDZ, &
+                       W, LDW, ZERO, S, LDS )
+            ! Then, compute Z = Y * S =
+            ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) =
+            ! = A * U(:,1:K) * W(1:K,1:K)
+            CALL SGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, &
+                       LDS, ZERO, Z, LDZ )
+            ! Save a copy of Z into Y and free Z for holding
+            ! the Ritz vectors.
+            CALL SLACPY( 'A', M, K, Z, LDZ, Y, LDY )
+            IF ( WNTEX ) CALL SLACPY( 'A', M, K, Z, LDZ, B, LDB )
+          END IF
+      ELSE IF ( WNTEX ) THEN
+          ! Compute S = V_k * Sigma_k^(-1) * W, where
+            ! V_k * Sigma_k^(-1) is stored in Z
+            CALL SGEMM( T_OR_N, 'N', N, K, K, ONE, Z, LDZ, &
+                       W, LDW, ZERO, S, LDS )
+            ! Then, compute Z = Y * S =
+            ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) =
+            ! = A * U(:,1:K) * W(1:K,1:K)
+            CALL SGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, &
+                       LDS, ZERO, B, LDB )
+            ! The above call replaces the following two calls
+            ! that were used in the developing-testing phase.
+            ! CALL SGEMM( 'N', 'N', M, K, N, ONE, Y, LDY, S, &
+            !           LDS, ZERO, Z, LDZ)
+            ! Save a copy of Z into B and free Z for holding
+            ! the Ritz vectors.
+            ! CALL SLACPY( 'A', M, K, Z, LDZ, B, LDB )
+      END IF
+!
+      ! Compute the real form of the Ritz vectors
+      IF ( WNTVEC ) CALL SGEMM( 'N', 'N', M, K, K, ONE, X, LDX, W, LDW, &
+                   ZERO, Z, LDZ )                           ! BLAS CALL
+      ! Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K))         ! INTRINSIC
+!
+      IF ( WNTRES ) THEN
+         i = 1
+         DO WHILE ( i <= K )
+            IF ( IMEIG(i) == ZERO ) THEN
+                ! have a real eigenvalue with real eigenvector
+                CALL SAXPY( M, -REIG(i), Z(1,i), 1, Y(1,i), 1 )       ! BLAS CALL
+                ! Y(1:M,i) = Y(1:M,i) - REIG(i) * Z(1:M,i)            ! INTRINSIC
+                RES(i) = SNRM2( M, Y(1,i), 1 )                         ! BLAS CALL
+                i = i + 1
+            ELSE
+               ! Have a complex conjugate pair
+               ! REIG(i) +- sqrt(-1)*IMEIG(i).
+               ! Since all computation is done in real
+               ! arithmetic, the formula for the residual
+               ! is recast for real representation of the
+               ! complex conjugate eigenpair. See the
+               ! description of RES.
+               AB(1,1) =  REIG(i)
+               AB(2,1) = -IMEIG(i)
+               AB(1,2) =  IMEIG(i)
+               AB(2,2) =  REIG(i)
+               CALL SGEMM( 'N', 'N', M, 2, 2, -ONE, Z(1,i), &
+                           LDZ, AB, 2, ONE, Y(1,i), LDY )          ! BLAS CALL
+               ! Y(1:M,i:i+1) = Y(1:M,i:i+1) - Z(1:M,i:i+1) * AB   ! INTRINSIC
+               RES(i)   = SLANGE( 'F', M, 2, Y(1,i), LDY, &
+                                  WORK(N+1) )                      ! LAPACK CALL
+               RES(i+1) = RES(i)
+               i = i + 2
+            END IF
+         END DO
+      END IF
+      END IF
+!
+      IF ( WHTSVD == 4 ) THEN
+          WORK(N+1) = XSCL1
+          WORK(N+2) = XSCL2
+      END IF
+!
+!     Successful exit.
+      IF ( .NOT. BADXY ) THEN
+         INFO = 0
+      ELSE
+         ! A warning on possible data inconsistency.
+         ! This should be a rare event.
+         INFO = 4
+      END IF
+!............................................................
+      RETURN
+!     ......
+      END SUBROUTINE SGEDMD
+
diff --git a/lapack-netlib/SRC/sgedmdq.c b/lapack-netlib/SRC/sgedmdq.c
new file mode 100644
index 000000000..0adf3bda3
--- /dev/null
+++ b/lapack-netlib/SRC/sgedmdq.c
@@ -0,0 +1,1296 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)	/* 64-bit Windows: the long-index types are long long */
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else	/* every other target: plain long */
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64	/* ILP64 build: blasint takes the width of BLASLONG */
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)	/* abs variant matching long long */
+#else
+#define blasabs(x) labs(x)	/* abs variant matching long */
+#endif
+#else	/* default build: blasint is a plain int */
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;	/* integer type used by the routines below (e.g. the *pow_ui helpers) */
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER	/* MSVC branch: bridge the r/i structs to _Fcomplex / _Dcomplex */
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}	/* value of *z as _Fcomplex */
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}	/* value of *z as _Dcomplex */
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}	/* same storage, pointer cast only */
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}	/* same storage, pointer cast only */
+#else	/* other compilers: bridge the r/i structs to _Complex float / _Complex double */
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}	/* value of *z as r + i*I */
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}	/* value of *z as r + i*I */
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}	/* same storage, pointer cast only */
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}	/* same storage, pointer cast only */
+#endif
+#define pCf(z) (*_pCf(z))	/* *z as an lvalue of the compiler's complex type (assigned to by c_div, c_exp, ...) */
+#define pCd(z) (*_pCd(z))	/* double-precision counterpart (assigned to by z_div, z_exp, ...) */
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; } /* concatenates the *np pieces rpp[i] (lengths rnp[i]) into lpp, stopping after llp characters and blank-filling what is left */
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d)))) /* compares only the first min(c,d) characters */
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; } /* copies at most min(C,D) characters and stops at a NUL in B; A is not blank-padded */
+#define sig_die(s, kill) { exit(1); } /* both arguments are ignored */
+#define s_stop(s, n) {exit(0);} /* both arguments are ignored */
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break; /* the expansion already contains the semicolon */
+#define mycycle_() continue; /* the expansion already contains the semicolon */
+#define myceiling_(w) {ceil(w)} /* NOTE(review): the braces make the expansion a block, not an expression - confirm nothing relies on its value */
+#define myhuge_(w) {HUGE_VAL} /* NOTE(review): same brace issue as myceiling_; w is ignored */
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) /* always the double-precision helper; s and e are dereferenced, n is passed through */
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...); /* C++: pointer to a function returning logical, variadic parameter list */
+#else
+typedef logical (*L_fp)(); /* C: pointer to a function returning logical, no prototype for the parameters */
+#endif
+
+/* x**n in single precision by binary exponentiation; n < 0 switches the base to 1/x. */
+static float spow_ui(float x, integer n) {
+	float acc = 1.0; unsigned long int bits;
+	if (n == 0) return acc;
+	if (n < 0) { n = -n; x = 1/x; }
+	for (bits = n; ; x *= x) { /* the base is squared only while exponent bits remain */
+		if (bits & 01) acc *= x;
+		bits >>= 1;
+		if (bits == 0) break;
+	}
+	return acc;
+}
+/* x**n in double precision by binary exponentiation; n < 0 switches the base to 1/x. */
+static double dpow_ui(double x, integer n) {
+	double acc = 1.0; unsigned long int bits;
+	if (n == 0) return acc;
+	if (n < 0) { n = -n; x = 1/x; }
+	for (bits = n; ; x *= x) { /* the base is squared only while exponent bits remain */
+		if (bits & 01) acc *= x;
+		bits >>= 1;
+		if (bits == 0) break;
+	}
+	return acc;
+}
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) { /* x**n, single-precision complex; arithmetic is spelled out on (r, i) */
+	complex pow={1.0,0.0}; unsigned long int u; float t;
+	if(n != 0) {
+		if(n < 0) n = -n, t = x.r*x.r + x.i*x.i, x.r = x.r/t, x.i = -x.i/t; /* 1/x = conj(x)/|x|^2 */
+		for(u = n; ; ) {
+			if(u & 01) t = pow.r*x.r - pow.i*x.i, pow.i = pow.r*x.i + pow.i*x.r, pow.r = t; /* pow *= x */
+			if(u >>= 1) t = x.r*x.r - x.i*x.i, x.i = 2*x.r*x.i, x.r = t; /* x *= x */
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) { /* x**n by binary exponentiation; n < 0 uses 1/x */
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) { /* x**n, double-precision complex; _Val[0] is the real part, _Val[1] the imaginary part */
+	_Dcomplex pow={1.0,0.0}; unsigned long int u; double t;
+	if(n != 0) {
+		if(n < 0) n = -n, t = x._Val[0]*x._Val[0] + x._Val[1]*x._Val[1], x._Val[0] = x._Val[0]/t, x._Val[1] = -x._Val[1]/t; /* 1/x = conj(x)/|x|^2 */
+		for(u = n; ; ) {
+			if(u & 01) t = pow._Val[0]*x._Val[0] - pow._Val[1]*x._Val[1], pow._Val[1] = pow._Val[0]*x._Val[1] + pow._Val[1]*x._Val[0], pow._Val[0] = t; /* pow *= x */
+			if(u >>= 1) t = x._Val[0]*x._Val[0] - x._Val[1]*x._Val[1], x._Val[1] = 2*x._Val[0]*x._Val[1], x._Val[0] = t; /* x *= x */
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) { /* x**n by binary exponentiation; n < 0 uses 1/x */
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+/* Integer power x**n; for n <= 0 only x == 1 and x == -1 take the multiply loop or return 1. */
+static integer pow_ii(integer x, integer n) {
+	integer acc = 1;
+	unsigned long int bits;
+	if (n <= 0) {
+		if (n == 0 || x == 1) return 1;
+		/* negative exponent: the result is 0, except that x == 0 evaluates 1/x */
+		if (x != -1) return x == 0 ? 1/x : 0;
+		n = -n; /* x == -1: continue with the magnitude of the exponent */
+	}
+	for (bits = n; ; x *= x) {
+		if (bits & 01) acc *= x;
+		bits >>= 1;
+		if (bits == 0) break;
+	}
+	return acc;
+}
+/* Position (1-based, counted from s) of the first largest entry among w[s-1..e-1]; n is unused. */
+static integer dmaxloc_(double *w, integer s, integer e, integer *n) {
+	integer k, best = s; double top = w[s-1];
+	for (k = s + 1; k <= e; ++k)
+		if (w[k-1] > top) { best = k; top = w[k-1]; }
+	return best - s + 1;
+}
+/* Position (1-based, counted from s) of the first largest entry among w[s-1..e-1]; n is unused. */
+static integer smaxloc_(float *w, integer s, integer e, integer *n) {
+	integer k, best = s; float top = w[s-1];
+	for (k = s + 1; k <= e; ++k)
+		if (w[k-1] > top) { best = k; top = w[k-1]; }
+	return best - s + 1;
+}
+/* cdotc_: conjugated single-precision complex dot product.
+   Stores sum_{i=0}^{n-1} conjg(x[i*incx]) * y[i*incy] into *z; n_, incx_ and
+   incy_ are passed by reference. Elements are addressed as i*inc from the
+   pointers given, so a negative stride reads before x / y (reference BLAS
+   instead starts from the far end of the vector in that case). */
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	/* _Fcomplex has no arithmetic operators, so conjg(a)*b is expanded by hand. */
+	_Fcomplex zdotc = {0.0, 0.0};
+	for (i=0;i<n;i++) { /* zdotc = zdotc + conjg(x(i))* y(i) */
+		const _Fcomplex a = Cf(&x[i*incx]), b = Cf(&y[i*incy]);
+		zdotc._Val[0] += a._Val[0] * b._Val[0] + a._Val[1] * b._Val[1];
+		zdotc._Val[1] += a._Val[0] * b._Val[1] - a._Val[1] * b._Val[0];
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + conjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + conjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+/* zdotc_: conjugated double-precision complex dot product.
+   Stores sum_{i=0}^{n-1} conjg(x[i*incx]) * y[i*incy] into *z; n_, incx_ and
+   incy_ are passed by reference. Elements are addressed as i*inc from the
+   pointers given, so a negative stride reads before x / y (reference BLAS
+   instead starts from the far end of the vector in that case). */
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	/* _Dcomplex has no arithmetic operators, so conjg(a)*b is expanded by hand. */
+	_Dcomplex zdotc = {0.0, 0.0};
+	for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+		const _Dcomplex a = Cd(&x[i*incx]), b = Cd(&y[i*incy]);
+		zdotc._Val[0] += a._Val[0] * b._Val[0] + a._Val[1] * b._Val[1];
+		zdotc._Val[1] += a._Val[0] * b._Val[1] - a._Val[1] * b._Val[0];
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/* cdotu_: unconjugated single-precision complex dot product.
+   Stores sum_{i=0}^{n-1} x[i*incx] * y[i*incy] into *z; n_, incx_ and incy_
+   are passed by reference. Elements are addressed as i*inc from the pointers
+   given, so a negative stride reads before x / y (reference BLAS instead
+   starts from the far end of the vector in that case). */
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	/* _Fcomplex has no arithmetic operators, so a*b is expanded by hand. */
+	_Fcomplex zdotc = {0.0, 0.0};
+	for (i=0;i<n;i++) { /* zdotc = zdotc + x(i)* y(i) */
+		const _Fcomplex a = Cf(&x[i*incx]), b = Cf(&y[i*incy]);
+		zdotc._Val[0] += a._Val[0] * b._Val[0] - a._Val[1] * b._Val[1];
+		zdotc._Val[1] += a._Val[0] * b._Val[1] + a._Val[1] * b._Val[0];
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i)* y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i)* y(i) */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+/* zdotu_: unconjugated double-precision complex dot product.
+   Stores sum_{i=0}^{n-1} x[i*incx] * y[i*incy] into *z; n_, incx_ and incy_
+   are passed by reference. Elements are addressed as i*inc from the pointers
+   given, so a negative stride reads before x / y (reference BLAS instead
+   starts from the far end of the vector in that case). */
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	/* _Dcomplex has no arithmetic operators, so a*b is expanded by hand. */
+	_Dcomplex zdotc = {0.0, 0.0};
+	for (i=0;i<n;i++) { /* zdotc = zdotc + x(i)* y(i) */
+		const _Dcomplex a = Cd(&x[i*incx]), b = Cd(&y[i*incy]);
+		zdotc._Val[0] += a._Val[0] * b._Val[0] - a._Val[1] * b._Val[1];
+		zdotc._Val[1] += a._Val[0] * b._Val[1] + a._Val[1] * b._Val[0];
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i)* y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i)* y(i) */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static integer c_n1 = -1;
+
+/* Subroutine */ int sgedmdq_(char *jobs, char *jobz, char *jobr, char *jobq, 
+	char *jobt, char *jobf, integer *whtsvd, integer *m, integer *n, real 
+	*f, integer *ldf, real *x, integer *ldx, real *y, integer *ldy, 
+	integer *nrnk, real *tol, integer *k, real *reig, real *imeig, real *
+	z__, integer *ldz, real *res, real *b, integer *ldb, real *v, integer 
+	*ldv, real *s, integer *lds, real *work, integer *lwork, integer *
+	iwork, integer *liwork, integer *info)
+{
+    /* System generated locals */
+    integer f_dim1, f_offset, x_dim1, x_offset, y_dim1, y_offset, z_dim1, 
+	    z_offset, b_dim1, b_offset, v_dim1, v_offset, s_dim1, s_offset, 
+	    i__1, i__2;
+
+    /* Local variables */
+    real zero;
+    integer info1;
+    extern logical lsame_(char *, char *);
+    char jobvl[1];
+    integer minmn;
+    logical wantq;
+    integer mlwqr, olwqr;
+    logical wntex;
+    extern /* Subroutine */ int sgedmd_(char *, char *, char *, char *, 
+	    integer *, integer *, integer *, real *, integer *, real *, 
+	    integer *, integer *, real *, integer *, real *, real *, real *, 
+	    integer *, real *, real *, integer *, real *, integer *, real *, 
+	    integer *, real *, integer *, integer *, integer *, integer *), xerbla_(char *, integer *);
+    integer mlwdmd, olwdmd;
+    extern /* Subroutine */ int sgeqrf_(integer *, integer *, real *, integer 
+	    *, real *, real *, integer *, integer *);
+    logical sccolx, sccoly;
+    extern /* Subroutine */ int slacpy_(char *, integer *, integer *, real *, 
+	    integer *, real *, integer *), slaset_(char *, integer *, 
+	    integer *, real *, real *, real *, integer *);
+    integer iminwr;
+    logical wntvec, wntvcf;
+    integer mlwgqr;
+    logical wntref;
+    integer mlwork, olwgqr, olwork;
+    real rdummy[2];
+    integer mlwmqr, olwmqr;
+    logical lquery, wntres, wnttrf, wntvcq;
+    extern /* Subroutine */ int sorgqr_(integer *, integer *, integer *, real 
+	    *, integer *, real *, real *, integer *, integer *), sormqr_(char 
+	    *, char *, integer *, integer *, integer *, real *, integer *, 
+	    real *, real *, integer *, real *, integer *, integer *);
+    real one;
+
+/* March 2023 */
+/* ..... */
+/*      USE                   iso_fortran_env */
+/*      INTEGER, PARAMETER :: WP = real32 */
+/* ..... */
+/*     Scalar arguments */
+/*     Array arguments */
+/* ..... */
+/*     Purpose */
+/*     ======= */
+/*     SGEDMDQ computes the Dynamic Mode Decomposition (DMD) for */
+/*     a pair of data snapshot matrices, using a QR factorization */
+/*     based compression of the data. For the input matrices */
+/*     X and Y such that Y = A*X with an unaccessible matrix */
+/*     A, SGEDMDQ computes a certain number of Ritz pairs of A using */
+/*     the standard Rayleigh-Ritz extraction from a subspace of */
+/*     range(X) that is determined using the leading left singular */
+/*     vectors of X. Optionally, SGEDMDQ returns the residuals */
+/*     of the computed Ritz pairs, the information needed for */
+/*     a refinement of the Ritz vectors, or the eigenvectors of */
+/*     the Exact DMD. */
+/*     For further details see the references listed */
+/*     below. For more details of the implementation see [3]. */
+
+/*     References */
+/*     ========== */
+/*     [1] P. Schmid: Dynamic mode decomposition of numerical */
+/*         and experimental data, */
+/*         Journal of Fluid Mechanics 656, 5-28, 2010. */
+/*     [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal */
+/*         decompositions: analysis and enhancements, */
+/*         SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. */
+/*     [3] Z. Drmac: A LAPACK implementation of the Dynamic */
+/*         Mode Decomposition I. Technical report. AIMDyn Inc. */
+/*         and LAPACK Working Note 298. */
+/*     [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. */
+/*         Brunton, N. Kutz: On Dynamic Mode Decomposition: */
+/*         Theory and Applications, Journal of Computational */
+/*         Dynamics 1(2), 391 -421, 2014. */
+
+/*     Developed and supported by: */
+/*     =========================== */
+/*     Developed and coded by Zlatko Drmac, Faculty of Science, */
+/*     University of Zagreb;  drmac@math.hr */
+/*     In cooperation with */
+/*     AIMdyn Inc., Santa Barbara, CA. */
+/*     and supported by */
+/*     - DARPA SBIR project "Koopman Operator-Based Forecasting */
+/*     for Nonstationary Processes from Near-Term, Limited */
+/*     Observational Data" Contract No: W31P4Q-21-C-0007 */
+/*     - DARPA PAI project "Physics-Informed Machine Learning */
+/*     Methodologies" Contract No: HR0011-18-9-0033 */
+/*     - DARPA MoDyL project "A Data-Driven, Operator-Theoretic */
+/*     Framework for Space-Time Analysis of Process Dynamics" */
+/*     Contract No: HR0011-16-C-0116 */
+/*     Any opinions, findings and conclusions or recommendations */
+/*     expressed in this material are those of the author and */
+/*     do not necessarily reflect the views of the DARPA SBIR */
+/*     Program Office. */
+/* ============================================================ */
+/*     Distribution Statement A: */
+/*     Approved for Public Release, Distribution Unlimited. */
+/*     Cleared by DARPA on September 29, 2022 */
+/* ============================================================ */
+/* ...................................................................... */
+/*     Arguments */
+/*     ========= */
+/*     JOBS (input) CHARACTER*1 */
+/*     Determines whether the initial data snapshots are scaled */
+/*     by a diagonal matrix. The data snapshots are the columns */
+/*     of F. The leading N-1 columns of F are denoted X and the */
+/*     trailing N-1 columns are denoted Y. */
+/*     'S' :: The data snapshots matrices X and Y are multiplied */
+/*            with a diagonal matrix D so that X*D has unit */
+/*            nonzero columns (in the Euclidean 2-norm) */
+/*     'C' :: The snapshots are scaled as with the 'S' option. */
+/*            If it is found that an i-th column of X is zero */
+/*            vector and the corresponding i-th column of Y is */
+/*            non-zero, then the i-th column of Y is set to */
+/*            zero and a warning flag is raised. */
+/*     'Y' :: The data snapshots matrices X and Y are multiplied */
+/*            by a diagonal matrix D so that Y*D has unit */
+/*            nonzero columns (in the Euclidean 2-norm) */
+/*     'N' :: No data scaling. */
+/* ..... */
+/*     JOBZ (input) CHARACTER*1 */
+/*     Determines whether the eigenvectors (Koopman modes) will */
+/*     be computed. */
+/*     'V' :: The eigenvectors (Koopman modes) will be computed */
+/*            and returned in the matrix Z. */
+/*            See the description of Z. */
+/*     'F' :: The eigenvectors (Koopman modes) will be returned */
+/*            in factored form as the product Z*V, where Z */
+/*            is orthonormal and V contains the eigenvectors */
+/*            of the corresponding Rayleigh quotient. */
+/*            See the descriptions of F, V, Z. */
+/*     'Q' :: The eigenvectors (Koopman modes) will be returned */
+/*            in factored form as the product Q*Z, where Z */
+/*            contains the eigenvectors of the compression of the */
+/*            underlying discretized operator onto the span of */
+/*            the data snapshots. See the descriptions of F, V, Z. */
+/*            Q is from the initial QR factorization. */
+/*     'N' :: The eigenvectors are not computed. */
+/* ..... */
+/*     JOBR (input) CHARACTER*1 */
+/*     Determines whether to compute the residuals. */
+/*     'R' :: The residuals for the computed eigenpairs will */
+/*            be computed and stored in the array RES. */
+/*            See the description of RES. */
+/*            For this option to be legal, JOBZ must be 'V'. */
+/*     'N' :: The residuals are not computed. */
+/* ..... */
+/*     JOBQ (input) CHARACTER*1 */
+/*     Specifies whether to explicitly compute and return the */
+/*     orthogonal matrix from the QR factorization. */
+/*     'Q' :: The matrix Q of the QR factorization of the data */
+/*            snapshot matrix is computed and stored in the */
+/*            array F. See the description of F. */
+/*     'N' :: The matrix Q is not explicitly computed. */
+/* ..... */
+/*     JOBT (input) CHARACTER*1 */
+/*     Specifies whether to return the upper triangular factor */
+/*     from the QR factorization. */
+/*     'R' :: The matrix R of the QR factorization of the data */
+/*            snapshot matrix F is returned in the array Y. */
+/*            See the description of Y and Further details. */
+/*     'N' :: The matrix R is not returned. */
+/* ..... */
+/*     JOBF (input) CHARACTER*1 */
+/*     Specifies whether to store information needed for post- */
+/*     processing (e.g. computing refined Ritz vectors) */
+/*     'R' :: The matrix needed for the refinement of the Ritz */
+/*            vectors is computed and stored in the array B. */
+/*            See the description of B. */
+/*     'E' :: The unscaled eigenvectors of the Exact DMD are */
+/*            computed and returned in the array B. See the */
+/*            description of B. */
+/*     'N' :: No eigenvector refinement data is computed. */
+/*     To be useful on exit, this option needs JOBQ='Q'. */
+/* ..... */
+/*     WHTSVD (input) INTEGER, WHTSVD in { 1, 2, 3, 4 } */
+/*     Allows for a selection of the SVD algorithm from the */
+/*     LAPACK library. */
+/*     1 :: SGESVD (the QR SVD algorithm) */
+/*     2 :: SGESDD (the Divide and Conquer algorithm; if enough */
+/*          workspace available, this is the fastest option) */
+/*     3 :: SGESVDQ (the preconditioned QR SVD  ; this and 4 */
+/*          are the most accurate options) */
+/*     4 :: SGEJSV (the preconditioned Jacobi SVD; this and 3 */
+/*          are the most accurate options) */
+/*     For the four methods above, a significant difference in */
+/*     the accuracy of small singular values is possible if */
+/*     the snapshots vary in norm so that X is severely */
+/*     ill-conditioned. If small (smaller than EPS*||X||) */
+/*     singular values are of interest and JOBS=='N',  then */
+/*     the options (3, 4) give the most accurate results, where */
+/*     the option 4 is slightly better and with stronger */
+/*     theoretical background. */
+/*     If JOBS=='S', i.e. the columns of X will be normalized, */
+/*     then all methods give nearly equally accurate results. */
+/* ..... */
+/*     M (input) INTEGER, M >= 0 */
+/*     The state space dimension (the number of rows of F) */
+/* ..... */
+/*     N (input) INTEGER, 0 <= N <= M */
+/*     The number of data snapshots from a single trajectory, */
+/*     taken at equidistant discrete times. This is the */
+/*     number of columns of F. */
+/* ..... */
+/*     F (input/output) REAL(KIND=WP) M-by-N array */
+/*     > On entry, */
+/*     the columns of F are the sequence of data snapshots */
+/*     from a single trajectory, taken at equidistant discrete */
+/*     times. It is assumed that the column norms of F are */
+/*     in the range of the normalized floating point numbers. */
+/*     < On exit, */
+/*     If JOBQ == 'Q', the array F contains the orthogonal */
+/*     matrix/factor of the QR factorization of the initial */
+/*     data snapshots matrix F. See the description of JOBQ. */
+/*     If JOBQ == 'N', the entries in F strictly below the main */
+/*     diagonal contain, column-wise, the information on the */
+/*     Householder vectors, as returned by SGEQRF. The */
+/*     remaining information to restore the orthogonal matrix */
+/*     of the initial QR factorization is stored in WORK(1:N). */
+/*     See the description of WORK. */
+/* ..... */
+/*     LDF (input) INTEGER, LDF >= M */
+/*     The leading dimension of the array F. */
+/* ..... */
+/*     X (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array */
+/*     X is used as workspace to hold representations of the */
+/*     leading N-1 snapshots in the orthonormal basis computed */
+/*     in the QR factorization of F. */
+/*     On exit, the leading K columns of X contain the leading */
+/*     K left singular vectors of the above described content */
+/*     of X. To lift them to the space of the left singular */
+/*     vectors U(:,1:K) of the input data, pre-multiply with the */
+/*     Q factor from the initial QR factorization. */
+/*     See the descriptions of F, K, V  and Z. */
+/* ..... */
+/*     LDX (input) INTEGER, LDX >= N */
+/*     The leading dimension of the array X */
+/* ..... */
+/*     Y (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array */
+/*     Y is used as workspace to hold representations of the */
+/*     trailing N-1 snapshots in the orthonormal basis computed */
+/*     in the QR factorization of F. */
+/*     On exit, */
+/*     If JOBT == 'R', Y contains the MIN(M,N)-by-N upper */
+/*     triangular factor from the QR factorization of the data */
+/*     snapshot matrix F. */
+/* ..... */
+/*     LDY (input) INTEGER , LDY >= N */
+/*     The leading dimension of the array Y */
+/* ..... */
+/*     NRNK (input) INTEGER */
+/*     Determines the mode how to compute the numerical rank, */
+/*     i.e. how to truncate small singular values of the input */
+/*     matrix X. On input, if */
+/*     NRNK = -1 :: i-th singular value sigma(i) is truncated */
+/*                  if sigma(i) <= TOL*sigma(1) */
+/*                  This option is recommended. */
+/*     NRNK = -2 :: i-th singular value sigma(i) is truncated */
+/*                  if sigma(i) <= TOL*sigma(i-1) */
+/*                  This option is included for R&D purposes. */
+/*                  It requires highly accurate SVD, which */
+/*                  may not be feasible. */
+/*     The numerical rank can be enforced by using positive */
+/*     value of NRNK as follows: */
+/*     0 < NRNK <= N-1 :: at most NRNK largest singular values */
+/*     will be used. If the number of the computed nonzero */
+/*     singular values is less than NRNK, then only those */
+/*     nonzero values will be used and the actually used */
+/*     dimension is less than NRNK. The actual number of */
+/*     the nonzero singular values is returned in the variable */
+/*     K. See the description of K. */
+/* ..... */
+/*     TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */
+/*     The tolerance for truncating small singular values. */
+/*     See the description of NRNK. */
+/* ..... */
+/*     K (output) INTEGER,  0 <= K <= N */
+/*     The dimension of the SVD/POD basis for the leading N-1 */
+/*     data snapshots (columns of F) and the number of the */
+/*     computed Ritz pairs. The value of K is determined */
+/*     according to the rule set by the parameters NRNK and */
+/*     TOL. See the descriptions of NRNK and TOL. */
+/* ..... */
+/*     REIG (output) REAL(KIND=WP) (N-1)-by-1 array */
+/*     The leading K (K<=N) entries of REIG contain */
+/*     the real parts of the computed eigenvalues */
+/*     REIG(1:K) + sqrt(-1)*IMEIG(1:K). */
+/*     See the descriptions of K, IMEIG, Z. */
+/* ..... */
+/*     IMEIG (output) REAL(KIND=WP) (N-1)-by-1 array */
+/*     The leading K (K<N) entries of IMEIG contain */
+/*     the imaginary parts of the computed eigenvalues */
+/*     REIG(1:K) + sqrt(-1)*IMEIG(1:K). */
+/*     The eigenvalues are determined as follows: */
+/*     If IMEIG(i) == 0, then the corresponding eigenvalue is */
+/*     real, LAMBDA(i) = REIG(i). */
+/*     If IMEIG(i)>0, then the corresponding complex */
+/*     conjugate pair of eigenvalues reads */
+/*     LAMBDA(i)   = REIG(i) + sqrt(-1)*IMEIG(i) */
+/*     LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMEIG(i) */
+/*     That is, complex conjugate pairs have consecutive */
+/*     indices (i,i+1), with the positive imaginary part */
+/*     listed first. */
+/*     See the descriptions of K, REIG, Z. */
+/* ..... */
+/*     Z (workspace/output) REAL(KIND=WP)  M-by-(N-1) array */
+/*     If JOBZ =='V' then */
+/*        Z contains real Ritz vectors as follows: */
+/*        If IMEIG(i)=0, then Z(:,i) is an eigenvector of */
+/*        the i-th Ritz value. */
+/*        If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then */
+/*        [Z(:,i) Z(:,i+1)] span an invariant subspace and */
+/*        the Ritz values extracted from this subspace are */
+/*        REIG(i) + sqrt(-1)*IMEIG(i) and */
+/*        REIG(i) - sqrt(-1)*IMEIG(i). */
+/*        The corresponding eigenvectors are */
+/*        Z(:,i) + sqrt(-1)*Z(:,i+1) and */
+/*        Z(:,i) - sqrt(-1)*Z(:,i+1), respectively. */
+/*     If JOBZ == 'F', then the above descriptions hold for */
+/*     the columns of Z*V, where the columns of V are the */
+/*     eigenvectors of the K-by-K Rayleigh quotient, and Z is */
+/*     orthonormal. The columns of V are similarly structured: */
+/*     If IMEIG(i) == 0 then Z*V(:,i) is an eigenvector, and if */
+/*     IMEIG(i) > 0 then Z*V(:,i)+sqrt(-1)*Z*V(:,i+1) and */
+/*                       Z*V(:,i)-sqrt(-1)*Z*V(:,i+1) */
+/*     are the eigenvectors of LAMBDA(i), LAMBDA(i+1). */
+/*     See the descriptions of REIG, IMEIG, X and V. */
+/* ..... */
+/*     LDZ (input) INTEGER , LDZ >= M */
+/*     The leading dimension of the array Z. */
+/* ..... */
+/*     RES (output) REAL(KIND=WP) (N-1)-by-1 array */
+/*     RES(1:K) contains the residuals for the K computed */
+/*     Ritz pairs. */
+/*     If LAMBDA(i) is real, then */
+/*        RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i) ||_2. */
+/*     If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair */
+/*     then */
+/*     RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F */
+/*     where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ] */
+/*               [-imag(LAMBDA(i)) real(LAMBDA(i)) ]. */
+/*     It holds that */
+/*     RES(i)   = || A*ZC(:,i)   - LAMBDA(i)  *ZC(:,i)   ||_2 */
+/*     RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2 */
+/*     where ZC(:,i)   =  Z(:,i) + sqrt(-1)*Z(:,i+1) */
+/*           ZC(:,i+1) =  Z(:,i) - sqrt(-1)*Z(:,i+1) */
+/*     See the description of Z. */
+/* ..... */
+/*     B (output) REAL(KIND=WP)  MIN(M,N)-by-(N-1) array. */
+/*     IF JOBF =='R', B(1:N,1:K) contains A*U(:,1:K), and can */
+/*     be used for computing the refined vectors; see further */
+/*     details in the provided references. */
+/*     If JOBF == 'E', B(1:N,1:K) contains */
+/*     A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */
+/*     Exact DMD, up to scaling by the inverse eigenvalues. */
+/*     In both cases, the content of B can be lifted to the */
+/*     original dimension of the input data by pre-multiplying */
+/*     with the Q factor from the initial QR factorization. */
+/*     Here A denotes a compression of the underlying operator. */
+/*     See the descriptions of F and X. */
+/*     If JOBF =='N', then B is not referenced. */
+/* ..... */
+/*     LDB (input) INTEGER, LDB >= MIN(M,N) */
+/*     The leading dimension of the array B. */
+/* ..... */
+/*     V (workspace/output) REAL(KIND=WP) (N-1)-by-(N-1) array */
+/*     On exit, V(1:K,1:K) contains the K eigenvectors of */
+/*     the Rayleigh quotient. The eigenvectors of a complex */
+/*     conjugate pair of eigenvalues are returned in real form */
+/*     as explained in the description of Z. The Ritz vectors */
+/*     (returned in Z) are the product of X and V; see */
+/*     the descriptions of X and Z. */
+/* ..... */
+/*     LDV (input) INTEGER, LDV >= N-1 */
+/*     The leading dimension of the array V. */
+/* ..... */
+/*     S (output) REAL(KIND=WP) (N-1)-by-(N-1) array */
+/*     The array S(1:K,1:K) is used for the matrix Rayleigh */
+/*     quotient. This content is overwritten during */
+/*     the eigenvalue decomposition by SGEEV. */
+/*     See the description of K. */
+/* ..... */
+/*     LDS (input) INTEGER, LDS >= N-1 */
+/*     The leading dimension of the array S. */
+/* ..... */
+/*     WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array */
+/*     On exit, */
+/*     WORK(1:MIN(M,N)) contains the scalar factors of the */
+/*     elementary reflectors as returned by SGEQRF of the */
+/*     M-by-N input matrix F. */
+/*     WORK(MIN(M,N)+1:MIN(M,N)+N-1) contains the singular values of */
+/*     the input submatrix F(1:M,1:N-1). */
+/*     If the call to SGEDMDQ is only workspace query, then */
+/*     WORK(1) contains the minimal workspace length and */
+/*     WORK(2) is the optimal workspace length. Hence, the */
+/*     length of work is at least 2. */
+/*     See the description of LWORK. */
+/* ..... */
+/*     LWORK (input) INTEGER */
+/*     The minimal length of the  workspace vector WORK. */
+/*     LWORK is calculated as follows: */
+/*     Let MLWQR  = N (minimal workspace for SGEQRF[M,N]) */
+/*         MLWDMD = minimal workspace for SGEDMD (see the */
+/*                  description of LWORK in SGEDMD) for */
+/*                  snapshots of dimensions MIN(M,N)-by-(N-1) */
+/*         MLWMQR = N (minimal workspace for */
+/*                    SORMQR['L','N',M,N,N]) */
+/*         MLWGQR = N (minimal workspace for SORGQR[M,N,N]) */
+/*     Then */
+/*     LWORK = MAX(N+MLWQR, N+MLWDMD) */
+/*     is updated as follows: */
+/*        if   JOBZ == 'V' or JOBZ == 'F' THEN */
+/*             LWORK = MAX( LWORK,MIN(M,N)+N-1 +MLWMQR ) */
+/*        if   JOBQ == 'Q' THEN */
+/*             LWORK = MAX( LWORK,MIN(M,N)+N-1+MLWGQR) */
+/*     If on entry LWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     and the optimal workspace lengths for both WORK and */
+/*     IWORK. See the descriptions of WORK and IWORK. */
+/* ..... */
+/*     IWORK (workspace/output) INTEGER LIWORK-by-1 array */
+/*     Workspace that is required only if WHTSVD equals */
+/*     2 , 3 or 4. (See the description of WHTSVD). */
+/*     If on entry LWORK =-1 or LIWORK=-1, then the */
+/*     minimal length of IWORK is computed and returned in */
+/*     IWORK(1). See the description of LIWORK. */
+/* ..... */
+/*     LIWORK (input) INTEGER */
+/*     The minimal length of the workspace vector IWORK. */
+/*     If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */
+/*     Let M1=MIN(M,N), N1=N-1. Then */
+/*     If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M1,N1)) */
+/*     If WHTSVD == 3, then LIWORK >= MAX(1,M1+N1-1) */
+/*     If WHTSVD == 4, then LIWORK >= MAX(3,M1+3*N1) */
+/*     If on entry LIWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     and the optimal workspace lengths for both WORK and */
+/*     IWORK. See the descriptions of WORK and IWORK. */
+/* ..... */
+/*     INFO (output) INTEGER */
+/*     -i < 0 :: On entry, the i-th argument had an */
+/*               illegal value */
+/*        = 0 :: Successful return. */
+/*        = 1 :: Void input. Quick exit (M=0 or N=0). */
+/*        = 2 :: The SVD computation of X did not converge. */
+/*               Suggestion: Check the input data and/or */
+/*               repeat with different WHTSVD. */
+/*        = 3 :: The computation of the eigenvalues did not */
+/*               converge. */
+/*        = 4 :: If data scaling was requested on input and */
+/*               the procedure found inconsistency in the data */
+/*               such that for some column index i, */
+/*               X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */
+/*               to zero if JOBS=='C'. The computation proceeds */
+/*               with original or modified data and warning */
+/*               flag is set with INFO=4. */
+/* ............................................................. */
+/* ............................................................. */
+/*     Parameters */
+/*     ~~~~~~~~~~ */
+
+/*     Local scalars */
+/*     ~~~~~~~~~~~~~ */
+
+/*     Local array */
+/*     ~~~~~~~~~~~ */
+
+/*     External functions (BLAS and LAPACK) */
+/*     ~~~~~~~~~~~~~~~~~ */
+
+/*     External subroutines (BLAS and LAPACK) */
+/*     ~~~~~~~~~~~~~~~~~~~~ */
+/*     External subroutines */
+/*     ~~~~~~~~~~~~~~~~~~~~ */
+/*     Intrinsic functions */
+/*     ~~~~~~~~~~~~~~~~~~~ */
+    /* Parameter adjustments */
+    f_dim1 = *ldf;
+    f_offset = 1 + f_dim1 * 1;
+    f -= f_offset;
+    x_dim1 = *ldx;
+    x_offset = 1 + x_dim1 * 1;
+    x -= x_offset;
+    y_dim1 = *ldy;
+    y_offset = 1 + y_dim1 * 1;
+    y -= y_offset;
+    --reig;
+    --imeig;
+    z_dim1 = *ldz;
+    z_offset = 1 + z_dim1 * 1;
+    z__ -= z_offset;
+    --res;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    v_dim1 = *ldv;
+    v_offset = 1 + v_dim1 * 1;
+    v -= v_offset;
+    s_dim1 = *lds;
+    s_offset = 1 + s_dim1 * 1;
+    s -= s_offset;
+    --work;
+    --iwork;
+
+    /* Function Body */
+    one = 1.f;
+    zero = 0.f;
+/* .......................................................... */
+
+/*    Test the input arguments */
+    wntres = lsame_(jobr, "R");
+    sccolx = lsame_(jobs, "S") || lsame_(jobs, "C");
+    sccoly = lsame_(jobs, "Y");
+    wntvec = lsame_(jobz, "V");
+    wntvcf = lsame_(jobz, "F");
+    wntvcq = lsame_(jobz, "Q");
+    wntref = lsame_(jobf, "R");
+    wntex = lsame_(jobf, "E");
+    wantq = lsame_(jobq, "Q");
+    wnttrf = lsame_(jobt, "R");
+    minmn = f2cmin(*m,*n);
+    *info = 0;
+    lquery = *lwork == -1 || *liwork == -1;
+
+    if (! (sccolx || sccoly || lsame_(jobs, "N"))) {
+	*info = -1;
+    } else if (! (wntvec || wntvcf || wntvcq || lsame_(jobz, "N"))) {
+	*info = -2;
+    } else if (! (wntres || lsame_(jobr, "N")) || 
+	    wntres && lsame_(jobz, "N")) {
+	*info = -3;
+    } else if (! (wantq || lsame_(jobq, "N"))) {
+	*info = -4;
+    } else if (! (wnttrf || lsame_(jobt, "N"))) {
+	*info = -5;
+    } else if (! (wntref || wntex || lsame_(jobf, "N")))
+	     {
+	*info = -6;
+    } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == 
+	    4)) {
+	*info = -7;
+    } else if (*m < 0) {
+	*info = -8;
+    } else if (*n < 0 || *n > *m + 1) {
+	*info = -9;
+    } else if (*ldf < *m) {
+	*info = -11;
+    } else if (*ldx < minmn) {
+	*info = -13;
+    } else if (*ldy < minmn) {
+	*info = -15;
+    } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) {
+	*info = -16;
+    } else if (*tol < zero || *tol >= one) {
+	*info = -17;
+    } else if (*ldz < *m) {
+	*info = -22;
+    } else if ((wntref || wntex) && *ldb < minmn) {
+	*info = -25;
+    } else if (*ldv < *n - 1) {
+	*info = -27;
+    } else if (*lds < *n - 1) {
+	*info = -29;
+    }
+
+    if (wntvec || wntvcf) {
+	*(unsigned char *)jobvl = 'V';
+    } else {
+	*(unsigned char *)jobvl = 'N';
+    }
+    if (*info == 0) {
+/* Compute the minimal and the optimal workspace */
+/* requirements. Simulate running the code and */
+/* determine minimal and optimal sizes of the */
+/* workspace at any moment of the run. */
+	if (*n == 0 || *n == 1) {
+/* All output except K is void. INFO=1 signals */
+/* the void input. In case of a workspace query, */
+/* the minimal workspace lengths are returned. */
+	    if (lquery) {
+		iwork[1] = 1;
+		work[1] = 2.f;
+		work[2] = 2.f;
+	    } else {
+		*k = 0;
+	    }
+	    *info = 1;
+	    return 0;
+	}
+	mlwqr = f2cmax(1,*n);
+/* Minimal workspace length for SGEQRF. */
+	mlwork = f2cmin(*m,*n) + mlwqr;
+	if (lquery) {
+	    sgeqrf_(m, n, &f[f_offset], ldf, &work[1], rdummy, &c_n1, &info1);
+	    olwqr = (integer) rdummy[0];
+	    olwork = f2cmin(*m,*n) + olwqr;
+	}
+	i__1 = *n - 1;
+	sgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], 
+		ldx, &y[y_offset], ldy, nrnk, tol, k, &reig[1], &imeig[1], &
+		z__[z_offset], ldz, &res[1], &b[b_offset], ldb, &v[v_offset], 
+		ldv, &s[s_offset], lds, &work[1], &c_n1, &iwork[1], liwork, &
+		info1);
+	mlwdmd = (integer) work[1];
+/* Computing MAX */
+	i__1 = mlwork, i__2 = minmn + mlwdmd;
+	mlwork = f2cmax(i__1,i__2);
+	iminwr = iwork[1];
+	if (lquery) {
+	    olwdmd = (integer) work[2];
+/* Computing MAX */
+	    i__1 = olwork, i__2 = minmn + olwdmd;
+	    olwork = f2cmax(i__1,i__2);
+	}
+	if (wntvec || wntvcf) {
+	    mlwmqr = f2cmax(1,*n);
+/* Computing MAX */
+	    i__1 = mlwork, i__2 = minmn + *n - 1 + mlwmqr;
+	    mlwork = f2cmax(i__1,i__2);
+	    if (lquery) {
+		sormqr_("L", "N", m, n, &minmn, &f[f_offset], ldf, &work[1], &
+			z__[z_offset], ldz, &work[1], &c_n1, &info1);
+		olwmqr = (integer) work[1];
+/* Computing MAX */
+		i__1 = olwork, i__2 = minmn + *n - 1 + olwmqr;
+		olwork = f2cmax(i__1,i__2);
+	    }
+	}
+	if (wantq) {
+	    mlwgqr = *n;
+/* Computing MAX */
+	    i__1 = mlwork, i__2 = minmn + *n - 1 + mlwgqr;
+	    mlwork = f2cmax(i__1,i__2);
+	    if (lquery) {
+		sorgqr_(m, &minmn, &minmn, &f[f_offset], ldf, &work[1], &work[
+			1], &c_n1, &info1);
+		olwgqr = (integer) work[1];
+/* Computing MAX */
+		i__1 = olwork, i__2 = minmn + *n - 1 + olwgqr;
+		olwork = f2cmax(i__1,i__2);
+	    }
+	}
+	iminwr = f2cmax(1,iminwr);
+	mlwork = f2cmax(2,mlwork);
+	if (*lwork < mlwork && ! lquery) {
+	    *info = -31;
+	}
+	if (*liwork < iminwr && ! lquery) {
+	    *info = -33;
+	}
+    }
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("SGEDMDQ", &i__1);
+	return 0;
+    } else if (lquery) {
+/*     Return minimal and optimal workspace sizes */
+	iwork[1] = iminwr;
+	work[1] = (real) mlwork;
+	work[2] = (real) olwork;
+	return 0;
+    }
+/* ..... */
+/*     Initial QR factorization that is used to represent the */
+/*     snapshots as elements of lower dimensional subspace. */
+/*     For large scale computation with M >>N , at this place */
+/*     one can use an out of core QRF. */
+
+    i__1 = *lwork - minmn;
+    sgeqrf_(m, n, &f[f_offset], ldf, &work[1], &work[minmn + 1], &i__1, &
+	    info1);
+
+/*     Define X and Y as the snapshots representations in the */
+/*     orthogonal basis computed in the QR factorization. */
+/*     X corresponds to the leading N-1 and Y to the trailing */
+/*     N-1 snapshots. */
+    i__1 = *n - 1;
+    slaset_("L", &minmn, &i__1, &zero, &zero, &x[x_offset], ldx);
+    i__1 = *n - 1;
+    slacpy_("U", &minmn, &i__1, &f[f_offset], ldf, &x[x_offset], ldx);
+    i__1 = *n - 1;
+    slacpy_("A", &minmn, &i__1, &f[(f_dim1 << 1) + 1], ldf, &y[y_offset], ldy);
+    if (*m >= 3) {
+	i__1 = minmn - 2;
+	i__2 = *n - 2;
+	slaset_("L", &i__1, &i__2, &zero, &zero, &y[y_dim1 + 3], ldy);
+    }
+
+/*     Compute the DMD of the projected snapshot pairs (X,Y) */
+    i__1 = *n - 1;
+    i__2 = *lwork - minmn;
+    sgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], ldx,
+	     &y[y_offset], ldy, nrnk, tol, k, &reig[1], &imeig[1], &z__[
+	    z_offset], ldz, &res[1], &b[b_offset], ldb, &v[v_offset], ldv, &s[
+	    s_offset], lds, &work[minmn + 1], &i__2, &iwork[1], liwork, &
+	    info1);
+    if (info1 == 2 || info1 == 3) {
+/* Return with error code. */
+	*info = info1;
+	return 0;
+    } else {
+	*info = info1;
+    }
+
+/*     The Ritz vectors (Koopman modes) can be explicitly */
+/*     formed or returned in factored form. */
+    if (wntvec) {
+/* Compute the eigenvectors explicitly. */
+	if (*m > minmn) {
+	    i__1 = *m - minmn;
+	    slaset_("A", &i__1, k, &zero, &zero, &z__[minmn + 1 + z_dim1], 
+		    ldz);
+	}
+	i__1 = *lwork - (minmn + *n - 1);
+	sormqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &work[1], &z__[
+		z_offset], ldz, &work[minmn + *n], &i__1, &info1);
+    } else if (wntvcf) {
+/*   Return the Ritz vectors (eigenvectors) in factored */
+/*   form Z*V, where Z contains orthonormal matrix (the */
+/*   product of Q from the initial QR factorization and */
+/*   the SVD/POD_basis returned by SGEDMD in X) and the */
+/*   second factor (the eigenvectors of the Rayleigh */
+/*   quotient) is in the array V, as returned by SGEDMD. */
+	slacpy_("A", n, k, &x[x_offset], ldx, &z__[z_offset], ldz);
+	if (*m > *n) {
+	    i__1 = *m - *n;
+	    slaset_("A", &i__1, k, &zero, &zero, &z__[*n + 1 + z_dim1], ldz);
+	}
+	i__1 = *lwork - (minmn + *n - 1);
+	sormqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &work[1], &z__[
+		z_offset], ldz, &work[minmn + *n], &i__1, &info1);
+    }
+
+/*     Some optional output variables: */
+
+/*     The upper triangular factor in the initial QR */
+/*     factorization is optionally returned in the array Y. */
+/*     This is useful if this call to SGEDMDQ is to be */
+/*     followed by a streaming DMD that is implemented in a */
+/*     QR compressed form. */
+    if (wnttrf) {
+/* Return the upper triangular R in Y */
+	slaset_("A", &minmn, n, &zero, &zero, &y[y_offset], ldy);
+	slacpy_("U", &minmn, n, &f[f_offset], ldf, &y[y_offset], ldy);
+    }
+
+/*     The orthonormal/orthogonal factor in the initial QR */
+/*     factorization is optionally returned in the array F. */
+/*     Same as with the triangular factor above, this is */
+/*     useful in a streaming DMD. */
+    if (wantq) {
+/* Q overwrites F */
+	i__1 = *lwork - (minmn + *n - 1);
+	sorgqr_(m, &minmn, &minmn, &f[f_offset], ldf, &work[1], &work[minmn + 
+		*n], &i__1, &info1);
+    }
+
+    return 0;
+
+} /* sgedmdq_ */
+
diff --git a/lapack-netlib/SRC/sgedmdq.f90 b/lapack-netlib/SRC/sgedmdq.f90
new file mode 100644
index 000000000..acd5d56c6
--- /dev/null
+++ b/lapack-netlib/SRC/sgedmdq.f90
@@ -0,0 +1,703 @@
+SUBROUTINE SGEDMDQ( JOBS,  JOBZ, JOBR, JOBQ, JOBT, JOBF,   &
+                    WHTSVD,   M, N, F, LDF,  X, LDX,  Y,   &
+                    LDY,   NRNK,  TOL,   K,  REIG, IMEIG,  &
+                    Z, LDZ, RES,  B,     LDB,   V, LDV,    & 
+                    S, LDS, WORK, LWORK, IWORK, LIWORK, INFO )
+! March 2023
+!.....
+      USE                   iso_fortran_env 
+      IMPLICIT NONE
+      INTEGER, PARAMETER :: WP = real32     
+!.....      
+!     Scalar arguments       
+      CHARACTER, INTENT(IN)  :: JOBS, JOBZ, JOBR, JOBQ,    &
+                                JOBT, JOBF
+      INTEGER,   INTENT(IN)  :: WHTSVD, M, N,   LDF, LDX,  &
+                                LDY, NRNK, LDZ, LDB, LDV,  &
+                                LDS, LWORK,  LIWORK
+      INTEGER,   INTENT(OUT) :: INFO,   K      
+      REAL(KIND=WP), INTENT(IN)    ::   TOL     
+!     Array arguments      
+      REAL(KIND=WP), INTENT(INOUT) :: F(LDF,*)
+      REAL(KIND=WP), INTENT(OUT)   :: X(LDX,*), Y(LDY,*),  &
+                                      Z(LDZ,*), B(LDB,*),  &
+                                      V(LDV,*), S(LDS,*)
+      REAL(KIND=WP), INTENT(OUT)   :: REIG(*),  IMEIG(*),  &
+                                      RES(*)
+      REAL(KIND=WP), INTENT(OUT)   :: WORK(*)  
+      INTEGER,       INTENT(OUT)   :: IWORK(*)
+!.....      
+!     Purpose  
+!     =======
+!     SGEDMDQ computes the Dynamic Mode Decomposition (DMD) for
+!     a pair of data snapshot matrices, using a QR factorization
+!     based compression of the data. For the input matrices
+!     X and Y such that Y = A*X with an unaccessible matrix
+!     A, SGEDMDQ computes a certain number of Ritz pairs of A using
+!     the standard Rayleigh-Ritz extraction from a subspace of
+!     range(X) that is determined using the leading left singular 
+!     vectors of X. Optionally, SGEDMDQ returns the residuals 
+!     of the computed Ritz pairs, the information needed for
+!     a refinement of the Ritz vectors, or the eigenvectors of
+!     the Exact DMD.
+!     For further details see the references listed
+!     below. For more details of the implementation see [3].      
+!
+!     References
+!     ==========
+!     [1] P. Schmid: Dynamic mode decomposition of numerical
+!         and experimental data,
+!         Journal of Fluid Mechanics 656, 5-28, 2010.
+!     [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal
+!         decompositions: analysis and enhancements,
+!         SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018.
+!     [3] Z. Drmac: A LAPACK implementation of the Dynamic
+!         Mode Decomposition I. Technical report. AIMDyn Inc.
+!         and LAPACK Working Note 298.      
+!     [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. 
+!         Brunton, N. Kutz: On Dynamic Mode Decomposition:
+!         Theory and Applications, Journal of Computational
+!         Dynamics 1(2), 391 -421, 2014.
+!
+!     Developed and supported by:
+!     ===========================
+!     Developed and coded by Zlatko Drmac, Faculty of Science,
+!     University of Zagreb;  drmac@math.hr
+!     In cooperation with
+!     AIMdyn Inc., Santa Barbara, CA.
+!     and supported by
+!     - DARPA SBIR project "Koopman Operator-Based Forecasting
+!     for Nonstationary Processes from Near-Term, Limited
+!     Observational Data" Contract No: W31P4Q-21-C-0007
+!     - DARPA PAI project "Physics-Informed Machine Learning
+!     Methodologies" Contract No: HR0011-18-9-0033
+!     - DARPA MoDyL project "A Data-Driven, Operator-Theoretic
+!     Framework for Space-Time Analysis of Process Dynamics"
+!     Contract No: HR0011-16-C-0116
+!     Any opinions, findings and conclusions or recommendations 
+!     expressed in this material are those of the author and 
+!     do not necessarily reflect the views of the DARPA SBIR 
+!     Program Office.      
+!============================================================
+!     Distribution Statement A: 
+!     Approved for Public Release, Distribution Unlimited.
+!     Cleared by DARPA on September 29, 2022
+!============================================================      
+!......................................................................      
+!     Arguments
+!     =========
+!     JOBS (input) CHARACTER*1
+!     Determines whether the initial data snapshots are scaled
+!     by a diagonal matrix. The data snapshots are the columns
+!     of F. The leading N-1 columns of F are denoted X and the
+!     trailing N-1 columns are denoted Y. 
+!     'S' :: The data snapshots matrices X and Y are multiplied
+!            with a diagonal matrix D so that X*D has unit
+!            nonzero columns (in the Euclidean 2-norm)
+!     'C' :: The snapshots are scaled as with the 'S' option.
+!            If it is found that an i-th column of X is zero
+!            vector and the corresponding i-th column of Y is
+!            non-zero, then the i-th column of Y is set to
+!            zero and a warning flag is raised.
+!     'Y' :: The data snapshots matrices X and Y are multiplied
+!            by a diagonal matrix D so that Y*D has unit
+!            nonzero columns (in the Euclidean 2-norm)    
+!     'N' :: No data scaling.   
+!.....
+!     JOBZ (input) CHARACTER*1
+!     Determines whether the eigenvectors (Koopman modes) will
+!     be computed.
+!     'V' :: The eigenvectors (Koopman modes) will be computed
+!            and returned in the matrix Z.
+!            See the description of Z.
+!     'F' :: The eigenvectors (Koopman modes) will be returned
+!            in factored form as the product Z*V, where Z
+!            is orthonormal and V contains the eigenvectors
+!            of the corresponding Rayleigh quotient.
+!            See the descriptions of F, V, Z.
+!     'Q' :: The eigenvectors (Koopman modes) will be returned
+!            in factored form as the product Q*Z, where Z
+!            contains the eigenvectors of the compression of the
+!            underlying discretized operator onto the span of
+!            the data snapshots. See the descriptions of F, V, Z. 
+!            Q is from the initial QR factorization.  
+!     'N' :: The eigenvectors are not computed.  
+!.....      
+!     JOBR (input) CHARACTER*1 
+!     Determines whether to compute the residuals.
+!     'R' :: The residuals for the computed eigenpairs will
+!            be computed and stored in the array RES.
+!            See the description of RES.
+!            For this option to be legal, JOBZ must be 'V'.
+!     'N' :: The residuals are not computed.
+!.....
+!     JOBQ (input) CHARACTER*1 
+!     Specifies whether to explicitly compute and return the
+!     orthogonal matrix from the QR factorization.
+!     'Q' :: The matrix Q of the QR factorization of the data
+!            snapshot matrix is computed and stored in the
+!            array F. See the description of F.       
+!     'N' :: The matrix Q is not explicitly computed.
+!.....
+!     JOBT (input) CHARACTER*1 
+!     Specifies whether to return the upper triangular factor
+!     from the QR factorization.
+!     'R' :: The matrix R of the QR factorization of the data 
+!            snapshot matrix F is returned in the array Y.
+!            See the description of Y and Further details.       
+!     'N' :: The matrix R is not returned.    
+!.....
+!     JOBF (input) CHARACTER*1
+!     Specifies whether to store information needed for post-
+!     processing (e.g. computing refined Ritz vectors)
+!     'R' :: The matrix needed for the refinement of the Ritz
+!            vectors is computed and stored in the array B.
+!            See the description of B.
+!     'E' :: The unscaled eigenvectors of the Exact DMD are 
+!            computed and returned in the array B. See the
+!            description of B.
+!     'N' :: No eigenvector refinement data is computed.   
+!     To be useful on exit, this option needs JOBQ='Q'.      
+!.....
+!     WHTSVD (input) INTEGER, WHTSVD in { 1, 2, 3, 4 }
+!     Allows for a selection of the SVD algorithm from the
+!     LAPACK library.
+!     1 :: SGESVD (the QR SVD algorithm)
+!     2 :: SGESDD (the Divide and Conquer algorithm; if enough
+!          workspace available, this is the fastest option)
+!     3 :: SGESVDQ (the preconditioned QR SVD  ; this and 4
+!          are the most accurate options)
+!     4 :: SGEJSV (the preconditioned Jacobi SVD; this and 3
+!          are the most accurate options)
+!     For the four methods above, a significant difference in
+!     the accuracy of small singular values is possible if
+!     the snapshots vary in norm so that X is severely
+!     ill-conditioned. If small (smaller than EPS*||X||)
+!     singular values are of interest and JOBS=='N',  then
+!     the options (3, 4) give the most accurate results, where
+!     the option 4 is slightly better and with stronger 
+!     theoretical background.
+!     If JOBS=='S', i.e. the columns of X will be normalized,
+!     then all methods give nearly equally accurate results.
+!.....
+!     M (input) INTEGER, M >= 0 
+!     The state space dimension (the number of rows of F)
+!.....      
+!     N (input) INTEGER, 0 <= N <= M
+!     The number of data snapshots from a single trajectory,
+!     taken at equidistant discrete times. This is the 
+!     number of columns of F.
+!.....
+!     F (input/output) REAL(KIND=WP) M-by-N array
+!     > On entry,
+!     the columns of F are the sequence of data snapshots 
+!     from a single trajectory, taken at equidistant discrete
+!     times. It is assumed that the column norms of F are 
+!     in the range of the normalized floating point numbers. 
+!     < On exit,
+!     If JOBQ == 'Q', the array F contains the orthogonal 
+!     matrix/factor of the QR factorization of the initial 
+!     data snapshots matrix F. See the description of JOBQ. 
+!     If JOBQ == 'N', the entries in F strictly below the main
+!     diagonal contain, column-wise, the information on the 
+!     Householder vectors, as returned by SGEQRF. The 
+!     remaining information to restore the orthogonal matrix
+!     of the initial QR factorization is stored in WORK(1:N). 
+!     See the description of WORK.
+!.....
+!     LDF (input) INTEGER, LDF >= M 
+!     The leading dimension of the array F.
+!.....
+!     X (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array
+!     X is used as workspace to hold representations of the
+!     leading N-1 snapshots in the orthonormal basis computed
+!     in the QR factorization of F.
+!     On exit, the leading K columns of X contain the leading
+!     K left singular vectors of the above described content
+!     of X. To lift them to the space of the left singular
+!     vectors U(:,1:K) of the input data, pre-multiply with the
+!     Q factor from the initial QR factorization. 
+!     See the descriptions of F, K, V  and Z.
+!.....      
+!     LDX (input) INTEGER, LDX >= N  
+!     The leading dimension of the array X 
+!.....
+!     Y (workspace/output) REAL(KIND=WP) MIN(M,N)-by-(N-1) array
+!     Y is used as workspace to hold representations of the
+!     trailing N-1 snapshots in the orthonormal basis computed
+!     in the QR factorization of F.
+!     On exit, 
+!     If JOBT == 'R', Y contains the MIN(M,N)-by-N upper
+!     triangular factor from the QR factorization of the data
+!     snapshot matrix F.
+!.....      
+!     LDY (input) INTEGER , LDY >= N
+!     The leading dimension of the array Y   
+!.....
+!     NRNK (input) INTEGER
+!     Determines the mode how to compute the numerical rank,
+!     i.e. how to truncate small singular values of the input
+!     matrix X. On input, if
+!     NRNK = -1 :: i-th singular value sigma(i) is truncated
+!                  if sigma(i) <= TOL*sigma(1)
+!                  This option is recommended.   
+!     NRNK = -2 :: i-th singular value sigma(i) is truncated
+!                  if sigma(i) <= TOL*sigma(i-1)
+!                  This option is included for R&D purposes.
+!                  It requires highly accurate SVD, which
+!                  may not be feasible.     
+!     The numerical rank can be enforced by using positive 
+!     value of NRNK as follows: 
+!     0 < NRNK <= N-1 :: at most NRNK largest singular values
+!     will be used. If the number of the computed nonzero
+!     singular values is less than NRNK, then only those
+!     nonzero values will be used and the actually used
+!     dimension is less than NRNK. The actual number of
+!     the nonzero singular values is returned in the variable
+!     K. See the description of K.
+!.....
+!     TOL (input) REAL(KIND=WP), 0 <= TOL < 1
+!     The tolerance for truncating small singular values.
+!     See the description of NRNK.  
+!.....
+!     K (output) INTEGER,  0 <= K <= N 
+!     The dimension of the SVD/POD basis for the leading N-1
+!     data snapshots (columns of F) and the number of the 
+!     computed Ritz pairs. The value of K is determined
+!     according to the rule set by the parameters NRNK and 
+!     TOL. See the descriptions of NRNK and TOL. 
+!.....
+!     REIG (output) REAL(KIND=WP) (N-1)-by-1 array
+!     The leading K (K<=N) entries of REIG contain
+!     the real parts of the computed eigenvalues
+!     REIG(1:K) + sqrt(-1)*IMEIG(1:K).
+!     See the descriptions of K, IMEIG, Z.
+!.....
+!     IMEIG (output) REAL(KIND=WP) (N-1)-by-1 array
+!     The leading K (K<N) entries of IMEIG contain
+!     the imaginary parts of the computed eigenvalues
+!     REIG(1:K) + sqrt(-1)*IMEIG(1:K).
+!     The eigenvalues are determined as follows:
+!     If IMEIG(i) == 0, then the corresponding eigenvalue is
+!     real, LAMBDA(i) = REIG(i).
+!     If IMEIG(i)>0, then the corresponding complex
+!     conjugate pair of eigenvalues reads
+!     LAMBDA(i)   = REIG(i) + sqrt(-1)*IMEIG(i)
+!     LAMBDA(i+1) = REIG(i) - sqrt(-1)*IMEIG(i)
+!     That is, complex conjugate pairs have consecutive
+!     indices (i,i+1), with the positive imaginary part
+!     listed first.
+!     See the descriptions of K, REIG, Z.     
+!.....      
+!     Z (workspace/output) REAL(KIND=WP)  M-by-(N-1) array
+!     If JOBZ =='V' then
+!        Z contains real Ritz vectors as follows:
+!        If IMEIG(i)=0, then Z(:,i) is an eigenvector of
+!        the i-th Ritz value.
+!        If IMEIG(i) > 0 (and IMEIG(i+1) < 0) then
+!        [Z(:,i) Z(:,i+1)] span an invariant subspace and
+!        the Ritz values extracted from this subspace are
+!        REIG(i) + sqrt(-1)*IMEIG(i) and
+!        REIG(i) - sqrt(-1)*IMEIG(i).
+!        The corresponding eigenvectors are
+!        Z(:,i) + sqrt(-1)*Z(:,i+1) and
+!        Z(:,i) - sqrt(-1)*Z(:,i+1), respectively.
+!     If JOBZ == 'F', then the above descriptions hold for
+!     the columns of Z*V, where the columns of V are the
+!     eigenvectors of the K-by-K Rayleigh quotient, and Z is
+!     orthonormal. The columns of V are similarly structured:
+!     If IMEIG(i) == 0 then Z*V(:,i) is an eigenvector, and if 
+!     IMEIG(i) > 0 then Z*V(:,i)+sqrt(-1)*Z*V(:,i+1) and
+!                       Z*V(:,i)-sqrt(-1)*Z*V(:,i+1)
+!     are the eigenvectors of LAMBDA(i), LAMBDA(i+1).
+!     See the descriptions of REIG, IMEIG, X and V.
+!.....
+!     LDZ (input) INTEGER , LDZ >= M
+!     The leading dimension of the array Z.
+!.....
+!     RES (output) REAL(KIND=WP) (N-1)-by-1 array
+!     RES(1:K) contains the residuals for the K computed 
+!     Ritz pairs.       
+!     If LAMBDA(i) is real, then
+!        RES(i) = || A * Z(:,i) - LAMBDA(i)*Z(:,i) ||_2.
+!     If [LAMBDA(i), LAMBDA(i+1)] is a complex conjugate pair
+!     then
+!     RES(i)=RES(i+1) = || A * Z(:,i:i+1) - Z(:,i:i+1) *B||_F
+!     where B = [ real(LAMBDA(i)) imag(LAMBDA(i)) ]
+!               [-imag(LAMBDA(i)) real(LAMBDA(i)) ].
+!     It holds that
+!     RES(i)   = || A*ZC(:,i)   - LAMBDA(i)  *ZC(:,i)   ||_2
+!     RES(i+1) = || A*ZC(:,i+1) - LAMBDA(i+1)*ZC(:,i+1) ||_2
+!     where ZC(:,i)   =  Z(:,i) + sqrt(-1)*Z(:,i+1)
+!           ZC(:,i+1) =  Z(:,i) - sqrt(-1)*Z(:,i+1)
+!     See the description of Z.
+!.....
+!     B (output) REAL(KIND=WP)  MIN(M,N)-by-(N-1) array.
+!     IF JOBF =='R', B(1:N,1:K) contains A*U(:,1:K), and can
+!     be used for computing the refined vectors; see further 
+!     details in the provided references. 
+!     If JOBF == 'E', B(1:N,1:K) contains
+!     A*U(:,1:K)*W(1:K,1:K), which are the vectors from the
+!     Exact DMD, up to scaling by the inverse eigenvalues.   
+!     In both cases, the content of B can be lifted to the 
+!     original dimension of the input data by pre-multiplying
+!     with the Q factor from the initial QR factorization.     
+!     Here A denotes a compression of the underlying operator.      
+!     See the descriptions of F and X.
+!     If JOBF =='N', then B is not referenced.
+!.....
+!     LDB (input) INTEGER, LDB >= MIN(M,N)
+!     The leading dimension of the array B.
+!.....      
+!     V (workspace/output) REAL(KIND=WP) (N-1)-by-(N-1) array
+!     On exit, V(1:K,1:K) contains the K eigenvectors of
+!     the Rayleigh quotient. The eigenvectors of a complex
+!     conjugate pair of eigenvalues are returned in real form
+!     as explained in the description of Z. The Ritz vectors
+!     (returned in Z) are the product of X and V; see
+!     the descriptions of X and Z.
+!.....
+!     LDV (input) INTEGER, LDV >= N-1
+!     The leading dimension of the array V.
+!.....      
+!     S (output) REAL(KIND=WP) (N-1)-by-(N-1) array
+!     The array S(1:K,1:K) is used for the matrix Rayleigh
+!     quotient. This content is overwritten during
+!     the eigenvalue decomposition by SGEEV.
+!     See the description of K.
+!.....
+!     LDS (input) INTEGER, LDS >= N-1        
+!     The leading dimension of the array S.
+!.....
+!     WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array
+!     On exit, 
+!     WORK(1:MIN(M,N)) contains the scalar factors of the 
+!     elementary reflectors as returned by SGEQRF of the 
+!     M-by-N input matrix F.
+!     WORK(MIN(M,N)+1:MIN(M,N)+N-1) contains the singular values of 
+!     the input submatrix F(1:M,1:N-1).
+!     If the call to SGEDMDQ is only workspace query, then
+!     WORK(1) contains the minimal workspace length and
+!     WORK(2) is the optimal workspace length. Hence, the
+!     length of work is at least 2.
+!     See the description of LWORK.
+!.....
+!     LWORK (input) INTEGER
+!     The minimal length of the  workspace vector WORK.
+!     LWORK is calculated as follows:
+!     Let MLWQR  = N (minimal workspace for SGEQRF[M,N])
+!         MLWDMD = minimal workspace for SGEDMD (see the
+!                  description of LWORK in SGEDMD) for 
+!                  snapshots of dimensions MIN(M,N)-by-(N-1)
+!         MLWMQR = N (minimal workspace for 
+!                    SORMQR['L','N',M,N,N])
+!         MLWGQR = N (minimal workspace for SORGQR[M,N,N])
+!     Then
+!     LWORK = MAX(N+MLWQR, N+MLWDMD)
+!     is updated as follows:
+!        if   JOBZ == 'V' or JOBZ == 'F' THEN 
+!             LWORK = MAX( LWORK,MIN(M,N)+N-1 +MLWMQR )
+!        if   JOBQ == 'Q' THEN
+!             LWORK = MAX( LWORK,MIN(M,N)+N-1+MLWGQR)
+!     If on entry LWORK = -1, then a workspace query is
+!     assumed and the procedure only computes the minimal
+!     and the optimal workspace lengths for both WORK and
+!     IWORK. See the descriptions of WORK and IWORK.          
+!.....
+!     IWORK (workspace/output) INTEGER LIWORK-by-1 array
+!     Workspace that is required only if WHTSVD equals
+!     2 , 3 or 4. (See the description of WHTSVD).
+!     If on entry LWORK =-1 or LIWORK=-1, then the
+!     minimal length of IWORK is computed and returned in
+!     IWORK(1). See the description of LIWORK.
+!.....
+!     LIWORK (input) INTEGER
+!     The minimal length of the workspace vector IWORK.
+!     If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1
+!     Let M1=MIN(M,N), N1=N-1. Then
+!     If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M1,N1))
+!     If WHTSVD == 3, then LIWORK >= MAX(1,M1+N1-1)
+!     If WHTSVD == 4, then LIWORK >= MAX(3,M1+3*N1)
+!     If on entry LIWORK = -1, then a workspace query is
+!     assumed and the procedure only computes the minimal
+!     and the optimal workspace lengths for both WORK and
+!     IWORK. See the descriptions of WORK and IWORK.
+!..... 
+!     INFO (output) INTEGER
+!     -i < 0 :: On entry, the i-th argument had an
+!               illegal value
+!        = 0 :: Successful return.
+!        = 1 :: Void input. Quick exit (M=0, N=0 or N=1).
+!        = 2 :: The SVD computation of X did not converge.
+!               Suggestion: Check the input data and/or
+!               repeat with different WHTSVD.
+!        = 3 :: The computation of the eigenvalues did not
+!               converge.
+!        = 4 :: If data scaling was requested on input and
+!               the procedure found inconsistency in the data
+!               such that for some column index i,
+!               X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set
+!               to zero if JOBS=='C'. The computation proceeds
+!               with original or modified data and warning
+!               flag is set with INFO=4.  
+!.............................................................
+!.............................................................
+!     Parameters
+!     ~~~~~~~~~~      
+      REAL(KIND=WP), PARAMETER ::  ONE = 1.0_WP
+      REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP
+!      
+!     Local scalars      
+!     ~~~~~~~~~~~~~
+      INTEGER           :: IMINWR, INFO1,  MLWDMD, MLWGQR, &
+                           MLWMQR, MLWORK, MLWQR,  MINMN,  & 
+                           OLWDMD, OLWGQR, OLWMQR, OLWORK, &
+                           OLWQR
+      LOGICAL           :: LQUERY, SCCOLX, SCCOLY, WANTQ,  &
+                           WNTTRF, WNTRES, WNTVEC, WNTVCF, &
+                           WNTVCQ, WNTREF, WNTEX
+      CHARACTER(LEN=1)  :: JOBVL
+!      
+!     Local array      
+!     ~~~~~~~~~~~      
+      REAL(KIND=WP) :: RDUMMY(2)
+!      
+!     External functions (BLAS and LAPACK)
+!     ~~~~~~~~~~~~~~~~~
+      LOGICAL       LSAME
+      EXTERNAL      LSAME 
+!
+!     External subroutines (BLAS and LAPACK)
+!     ~~~~~~~~~~~~~~~~~~~~
+      EXTERNAL      SGEMM 
+      EXTERNAL      SGEQRF, SLACPY, SLASET, SORGQR, & 
+                    SORMQR, XERBLA
+
+!     External subroutines
+!     ~~~~~~~~~~~~~~~~~~~~
+      EXTERNAL      SGEDMD 
+      
+!     Intrinsic functions
+!     ~~~~~~~~~~~~~~~~~~~
+      INTRINSIC      MAX, MIN, INT         
+ !..........................................................  
+ !
+ !    Test the input arguments    
+      WNTRES = LSAME(JOBR,'R')
+      SCCOLX = LSAME(JOBS,'S') .OR. LSAME( JOBS, 'C' )
+      SCCOLY = LSAME(JOBS,'Y')
+      WNTVEC = LSAME(JOBZ,'V') 
+      WNTVCF = LSAME(JOBZ,'F')
+      WNTVCQ = LSAME(JOBZ,'Q')
+      WNTREF = LSAME(JOBF,'R') 
+      WNTEX  = LSAME(JOBF,'E')
+      WANTQ  = LSAME(JOBQ,'Q')
+      WNTTRF = LSAME(JOBT,'R')     
+      MINMN  = MIN(M,N)
+      INFO = 0 
+      LQUERY = ( ( LWORK == -1 ) .OR. ( LIWORK == -1 ) )
+!       
+      IF ( .NOT. (SCCOLX .OR. SCCOLY .OR. LSAME(JOBS,'N')) )  THEN 
+          INFO = -1
+      ELSE IF ( .NOT. (WNTVEC .OR. WNTVCF .OR. WNTVCQ       &
+                              .OR. LSAME(JOBZ,'N')) ) THEN
+          INFO = -2
+      ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR.    & 
+          ( WNTRES .AND. LSAME(JOBZ,'N') ) ) THEN
+          INFO = -3
+      ELSE IF ( .NOT. (WANTQ .OR. LSAME(JOBQ,'N')) ) THEN
+          INFO = -4                 
+      ELSE IF ( .NOT. ( WNTTRF .OR. LSAME(JOBT,'N') ) )  THEN
+          INFO = -5
+      ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR.             & 
+                LSAME(JOBF,'N') ) )                    THEN
+          INFO = -6    
+      ELSE IF ( .NOT. ((WHTSVD == 1).OR.(WHTSVD == 2).OR.   &
+                       (WHTSVD == 3).OR.(WHTSVD == 4)) ) THEN
+          INFO = -7
+      ELSE IF ( M < 0 ) THEN
+          INFO = -8
+      ELSE IF ( ( N < 0 ) .OR. ( N > M+1 ) ) THEN
+          INFO = -9
+      ELSE IF ( LDF < M ) THEN
+          INFO = -11
+      ELSE IF ( LDX < MINMN ) THEN
+          INFO = -13
+      ELSE IF ( LDY < MINMN ) THEN
+          INFO = -15
+      ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR.    & 
+                       ((NRNK >= 1).AND.(NRNK <=N ))) )  THEN
+          INFO = -16
+      ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) ) THEN
+          INFO = -17
+      ELSE IF ( LDZ < M ) THEN
+          INFO = -22
+      ELSE IF ( (WNTREF.OR.WNTEX ).AND.( LDB < MINMN ) ) THEN
+          INFO = -25
+      ELSE IF ( LDV < N-1 ) THEN
+          INFO = -27
+      ELSE IF ( LDS < N-1 ) THEN
+          INFO = -29
+      END IF
+!      
+      IF ( WNTVEC .OR. WNTVCF ) THEN
+          JOBVL = 'V'
+      ELSE
+          JOBVL = 'N'
+      END IF     
+      IF ( INFO == 0 ) THEN  
+          ! Compute the minimal and the optimal workspace
+          ! requirements. Simulate running the code and 
+          ! determine minimal and optimal sizes of the 
+          ! workspace at any moment of the run.         
+         IF ( ( N == 0 ) .OR. ( N == 1 ) ) THEN
+             ! All output except K is void. INFO=1 signals
+             ! the void input. In case of a workspace query,
+             ! the minimal workspace lengths are returned.
+            IF ( LQUERY ) THEN  
+               IWORK(1) = 1
+                WORK(1) = 2
+                WORK(2) = 2
+            ELSE                
+               K = 0
+            END IF             
+            INFO = 1  
+            RETURN
+         END IF     
+         MLWQR  = MAX(1,N)  ! Minimal workspace length for SGEQRF.
+         MLWORK = MIN(M,N) + MLWQR 
+         IF ( LQUERY ) THEN 
+             CALL SGEQRF( M, N, F, LDF, WORK, RDUMMY, -1, &
+                          INFO1 )
+             OLWQR = INT(RDUMMY(1))
+             OLWORK = MIN(M,N) + OLWQR           
+         END IF
+         CALL SGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN,& 
+                      N-1, X, LDX, Y, LDY, NRNK, TOL, K,     & 
+                      REIG, IMEIG, Z, LDZ, RES,  B, LDB,     & 
+                      V, LDV, S, LDS, WORK, -1, IWORK,       &
+                      LIWORK, INFO1 )
+         MLWDMD = INT(WORK(1))
+         MLWORK = MAX(MLWORK, MINMN + MLWDMD)
+         IMINWR = IWORK(1)
+         IF ( LQUERY ) THEN 
+             OLWDMD = INT(WORK(2))
+             OLWORK = MAX(OLWORK, MINMN+OLWDMD)
+         END IF
+         IF ( WNTVEC .OR. WNTVCF ) THEN
+            MLWMQR = MAX(1,N) 
+            MLWORK = MAX(MLWORK,MINMN+N-1+MLWMQR)
+            IF ( LQUERY ) THEN
+               CALL SORMQR( 'L','N', M, N, MINMN, F, LDF,  & 
+                            WORK, Z, LDZ, WORK, -1, INFO1 )
+               OLWMQR = INT(WORK(1))
+               OLWORK = MAX(OLWORK,MINMN+N-1+OLWMQR)
+            END IF
+         END IF  
+         IF ( WANTQ ) THEN
+            MLWGQR = N
+            MLWORK = MAX(MLWORK,MINMN+N-1+MLWGQR)
+            IF ( LQUERY ) THEN 
+                CALL SORGQR( M, MINMN, MINMN, F, LDF, WORK, &
+                             WORK, -1, INFO1 )        
+                OLWGQR = INT(WORK(1))
+                OLWORK = MAX(OLWORK,MINMN+N-1+OLWGQR)
+            END IF            
+         END IF   
+         IMINWR = MAX( 1, IMINWR )
+         MLWORK = MAX( 2, MLWORK )      
+         IF (  LWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -31
+         IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -33
+      END IF  
+      IF( INFO /= 0 ) THEN
+         CALL XERBLA( 'SGEDMDQ', -INFO )
+         RETURN
+      ELSE IF ( LQUERY ) THEN
+!     Return minimal and optimal workspace sizes
+          IWORK(1) = IMINWR
+          WORK(1)  = MLWORK
+          WORK(2)  = OLWORK
+          RETURN
+      END IF   
+!.....	  
+!     Initial QR factorization that is used to represent the
+!     snapshots as elements of lower dimensional subspace.
+!     For large scale computation with M >> N, at this point
+!     one can use an out-of-core QRF.
+!   
+      CALL SGEQRF( M, N, F, LDF, WORK,         & 
+                   WORK(MINMN+1), LWORK-MINMN, INFO1 )
+!      
+!     Define X and Y as the snapshots representations in the
+!     orthogonal basis computed in the QR factorization.
+!     X corresponds to the leading N-1 and Y to the trailing
+!     N-1 snapshots.
+      CALL SLASET( 'L', MINMN, N-1, ZERO,  ZERO, X, LDX )
+      CALL SLACPY( 'U', MINMN, N-1, F,      LDF, X, LDX )
+      CALL SLACPY( 'A', MINMN, N-1, F(1,2), LDF, Y, LDY )
+      IF ( M >= 3 ) THEN
+          CALL SLASET( 'L', MINMN-2, N-2, ZERO,  ZERO, &
+                       Y(3,1), LDY )  
+      END IF
+!
+!     Compute the DMD of the projected snapshot pairs (X,Y)   
+      CALL SGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN,  &
+                   N-1, X, LDX, Y, LDY, NRNK,   TOL, K,     &
+                   REIG, IMEIG, Z, LDZ, RES, B, LDB, V,     &
+                   LDV, S, LDS, WORK(MINMN+1), LWORK-MINMN, IWORK,  & 
+                   LIWORK, INFO1 )
+      IF ( INFO1 == 2 .OR. INFO1 == 3 ) THEN
+          ! Return with error code.
+          INFO = INFO1
+          RETURN
+      ELSE
+          INFO = INFO1
+      END IF    
+!      
+!     The Ritz vectors (Koopman modes) can be explicitly 
+!     formed or returned in factored form.
+      IF ( WNTVEC ) THEN
+        ! Compute the eigenvectors explicitly.  
+        IF ( M > MINMN ) CALL SLASET( 'A', M-MINMN, K, ZERO, &
+                                     ZERO, Z(MINMN+1,1), LDZ )
+        CALL SORMQR( 'L','N', M, K, MINMN, F, LDF, WORK, Z,  &
+                     LDZ, WORK(MINMN+N), LWORK-(MINMN+N-1), INFO1 )
+      ELSE IF ( WNTVCF ) THEN   
+        !   Return the Ritz vectors (eigenvectors) in factored
+        !   form Z*V, where Z contains an orthonormal matrix (the
+        !   product of Q from the initial QR factorization and 
+        !   the SVD/POD_basis returned by SGEDMD in X) and the 
+        !   second factor (the eigenvectors of the Rayleigh 
+        !   quotient) is in the array V, as returned by SGEDMD.
+        CALL SLACPY( 'A', N, K, X, LDX, Z, LDZ )
+        IF ( M > N ) CALL SLASET( 'A', M-N, K, ZERO, ZERO,   & 
+                                  Z(N+1,1), LDZ )
+        CALL SORMQR( 'L','N', M, K, MINMN, F, LDF, WORK, Z,  &
+             LDZ, WORK(MINMN+N), LWORK-(MINMN+N-1), INFO1 )
+      END IF
+!     
+!     Some optional output variables:
+!
+!     The upper triangular factor in the initial QR 
+!     factorization is optionally returned in the array Y.
+!     This is useful if this call to SGEDMDQ is to be 
+!     followed by a streaming DMD that is implemented in a 
+!     QR compressed form.
+      IF ( WNTTRF ) THEN ! Return the upper triangular R in Y 
+         CALL SLASET( 'A', MINMN, N, ZERO,  ZERO, Y, LDY )
+         CALL SLACPY( 'U', MINMN, N, F, LDF,      Y, LDY )
+      END IF    
+!
+!     The orthonormal/orthogonal factor in the initial QR 
+!     factorization is optionally returned in the array F. 
+!     Same as with the triangular factor above, this is 
+!     useful in a streaming DMD.
+      IF ( WANTQ ) THEN  ! Q overwrites F 
+         CALL SORGQR( M, MINMN, MINMN, F, LDF, WORK, &
+              WORK(MINMN+N), LWORK-(MINMN+N-1), INFO1 )  
+      END IF
+!      
+      RETURN
+!      
+      END SUBROUTINE SGEDMDQ
+    
\ No newline at end of file
diff --git a/lapack-netlib/SRC/sgees.f b/lapack-netlib/SRC/sgees.f
index 6febd549c..4418ea064 100644
--- a/lapack-netlib/SRC/sgees.f
+++ b/lapack-netlib/SRC/sgees.f
@@ -208,7 +208,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEeigen
+*> \ingroup gees
 *
 *  =====================================================================
       SUBROUTINE SGEES( JOBVS, SORT, SELECT, N, A, LDA, SDIM, WR, WI,
@@ -250,14 +250,14 @@
       REAL               DUM( 1 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SCOPY, SGEBAK, SGEBAL, SGEHRD, SHSEQR, SLABAD,
-     $                   SLACPY, SLASCL, SORGHR, SSWAP, STRSEN, XERBLA
+      EXTERNAL           SCOPY, SGEBAK, SGEBAL, SGEHRD, SHSEQR, SLACPY,
+     $                   SLASCL, SORGHR, SSWAP, STRSEN, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANGE
-      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE
+      REAL               SLAMCH, SLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, SQRT
@@ -312,7 +312,7 @@
                MAXWRK = MAX( MAXWRK, N + HSWORK )
             END IF
          END IF
-         WORK( 1 ) = MAXWRK
+         WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN
             INFO = -13
@@ -338,7 +338,6 @@
       EPS = SLAMCH( 'P' )
       SMLNUM = SLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -524,7 +523,7 @@
    30    CONTINUE
       END IF
 *
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
       RETURN
 *
 *     End of SGEES
diff --git a/lapack-netlib/SRC/sgeesx.f b/lapack-netlib/SRC/sgeesx.f
index 6810fe7c8..cabe9f1f7 100644
--- a/lapack-netlib/SRC/sgeesx.f
+++ b/lapack-netlib/SRC/sgeesx.f
@@ -272,7 +272,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEeigen
+*> \ingroup geesx
 *
 *  =====================================================================
       SUBROUTINE SGEESX( JOBVS, SORT, SELECT, SENSE, N, A, LDA, SDIM,
@@ -317,14 +317,14 @@
       REAL               DUM( 1 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SCOPY, SGEBAK, SGEBAL, SGEHRD, SHSEQR, SLABAD,
+      EXTERNAL           SCOPY, SGEBAK, SGEBAL, SGEHRD, SHSEQR,
      $                   SLACPY, SLASCL, SORGHR, SSWAP, STRSEN, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANGE
-      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE
+      REAL               SLAMCH, SLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, SQRT
@@ -398,7 +398,7 @@
      $         LIWRK = ( N*N )/4
          END IF
          IWORK( 1 ) = LIWRK
-         WORK( 1 ) = LWRK
+         WORK( 1 ) = SROUNDUP_LWORK(LWRK)
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN
             INFO = -16
@@ -426,7 +426,6 @@
       EPS = SLAMCH( 'P' )
       SMLNUM = SLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -634,7 +633,7 @@
    30    CONTINUE
       END IF
 *
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
       IF( WANTSV .OR. WANTSB ) THEN
          IWORK( 1 ) = SDIM*(N-SDIM)
       ELSE
diff --git a/lapack-netlib/SRC/sgeev.f b/lapack-netlib/SRC/sgeev.f
index ed1724721..93f993265 100644
--- a/lapack-netlib/SRC/sgeev.f
+++ b/lapack-netlib/SRC/sgeev.f
@@ -184,7 +184,7 @@
 *
 *  @generated from dgeev.f, fortran d -> s, Tue Apr 19 01:47:44 2016
 *
-*> \ingroup realGEeigen
+*> \ingroup geev
 *
 *  =====================================================================
       SUBROUTINE SGEEV( JOBVL, JOBVR, N, A, LDA, WR, WI, VL, LDVL, VR,
@@ -223,16 +223,15 @@
       REAL   DUM( 1 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SGEBAK, SGEBAL, SGEHRD, SHSEQR, SLABAD, SLACPY,
-     $                   SLARTG, SLASCL, SORGHR, SROT, SSCAL, STREVC3,
-     $                   XERBLA
+      EXTERNAL           SGEBAK, SGEBAL, SGEHRD, SHSEQR, SLACPY, SLARTG,
+     $                   SLASCL, SORGHR, SROT, SSCAL, STREVC3, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ISAMAX, ILAENV
-      REAL   SLAMCH, SLANGE, SLAPY2, SNRM2
+      REAL               SLAMCH, SLANGE, SLAPY2, SNRM2, SROUNDUP_LWORK
       EXTERNAL           LSAME, ISAMAX, ILAENV, SLAMCH, SLANGE, SLAPY2,
-     $                   SNRM2
+     $                   SNRM2, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, SQRT
@@ -312,7 +311,7 @@
             END IF
             MAXWRK = MAX( MAXWRK, MINWRK )
          END IF
-         WORK( 1 ) = MAXWRK
+         WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN
             INFO = -13
@@ -336,7 +335,6 @@
       EPS = SLAMCH( 'P' )
       SMLNUM = SLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -519,7 +517,7 @@
          END IF
       END IF
 *
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
       RETURN
 *
 *     End of SGEEV
diff --git a/lapack-netlib/SRC/sgeevx.f b/lapack-netlib/SRC/sgeevx.f
index ed1ea1cb9..b0af78605 100644
--- a/lapack-netlib/SRC/sgeevx.f
+++ b/lapack-netlib/SRC/sgeevx.f
@@ -297,7 +297,7 @@
 *
 *  @generated from dgeevx.f, fortran d -> s, Tue Apr 19 01:47:44 2016
 *
-*> \ingroup realGEeigen
+*> \ingroup geevx
 *
 *  =====================================================================
       SUBROUTINE SGEEVX( BALANC, JOBVL, JOBVR, SENSE, N, A, LDA, WR, WI,
@@ -341,16 +341,16 @@
       REAL               DUM( 1 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SGEBAK, SGEBAL, SGEHRD, SHSEQR, SLABAD, SLACPY,
+      EXTERNAL           SGEBAK, SGEBAL, SGEHRD, SHSEQR, SLACPY,
      $                   SLARTG, SLASCL, SORGHR, SROT, SSCAL, STREVC3,
      $                   STRSNA, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ISAMAX, ILAENV
-      REAL               SLAMCH, SLANGE, SLAPY2, SNRM2
+      REAL               SLAMCH, SLANGE, SLAPY2, SNRM2, SROUNDUP_LWORK
       EXTERNAL           LSAME, ISAMAX, ILAENV, SLAMCH, SLANGE, SLAPY2,
-     $                   SNRM2
+     $                   SNRM2, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, SQRT
@@ -453,7 +453,7 @@
             END IF
             MAXWRK = MAX( MAXWRK, MINWRK )
          END IF
-         WORK( 1 ) = MAXWRK
+         WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN
             INFO = -21
@@ -477,7 +477,6 @@
       EPS = SLAMCH( 'P' )
       SMLNUM = SLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -684,7 +683,7 @@
          END IF
       END IF
 *
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
       RETURN
 *
 *     End of SGEEVX
diff --git a/lapack-netlib/SRC/sgehrd.f b/lapack-netlib/SRC/sgehrd.f
index 41b9aa78e..cfa17e156 100644
--- a/lapack-netlib/SRC/sgehrd.f
+++ b/lapack-netlib/SRC/sgehrd.f
@@ -89,7 +89,7 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is REAL array, dimension (LWORK)
+*>          WORK is REAL array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
@@ -120,7 +120,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEcomputational
+*> \ingroup gehrd
 *
 *> \par Further Details:
 *  =====================
@@ -173,7 +173,7 @@
       INTEGER            IHI, ILO, INFO, LDA, LWORK, N
 *     ..
 *     .. Array Arguments ..
-      REAL              A( LDA, * ), TAU( * ), WORK( * )
+      REAL               A( LDA, * ), TAU( * ), WORK( * )
 *     ..
 *
 *  =====================================================================
@@ -182,7 +182,7 @@
       INTEGER            NBMAX, LDT, TSIZE
       PARAMETER          ( NBMAX = 64, LDT = NBMAX+1,
      $                     TSIZE = LDT*NBMAX )
-      REAL              ZERO, ONE
+      REAL               ZERO, ONE
       PARAMETER          ( ZERO = 0.0E+0,
      $                     ONE = 1.0E+0 )
 *     ..
@@ -190,7 +190,7 @@
       LOGICAL            LQUERY
       INTEGER            I, IB, IINFO, IWT, J, LDWORK, LWKOPT, NB,
      $                   NBMIN, NH, NX
-      REAL              EI
+      REAL               EI
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SAXPY, SGEHD2, SGEMM, SLAHR2, SLARFB, STRMM,
@@ -201,7 +201,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -221,13 +222,19 @@
          INFO = -8
       END IF
 *
+      NH = IHI - ILO + 1
       IF( INFO.EQ.0 ) THEN
 *
 *       Compute the workspace requirements
 *
-         NB = MIN( NBMAX, ILAENV( 1, 'SGEHRD', ' ', N, ILO, IHI, -1 ) )
-         LWKOPT = N*NB + TSIZE
-         WORK( 1 ) = LWKOPT
+         IF( NH.LE.1 ) THEN
+            LWKOPT = 1
+         ELSE
+            NB = MIN( NBMAX, ILAENV( 1, 'SGEHRD', ' ', N, ILO, IHI,
+     $                              -1 ) )
+            LWKOPT = N*NB + TSIZE
+         ENDIF
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -248,7 +255,6 @@
 *
 *     Quick return if possible
 *
-      NH = IHI - ILO + 1
       IF( NH.LE.1 ) THEN
          WORK( 1 ) = 1
          RETURN
@@ -268,7 +274,7 @@
 *
 *           Determine if workspace is large enough for blocked code
 *
-            IF( LWORK.LT.N*NB+TSIZE ) THEN
+            IF( LWORK.LT.LWKOPT ) THEN
 *
 *              Not enough workspace to use optimal NB:  determine the
 *              minimum value of NB, and reduce NB or force use of
@@ -344,7 +350,8 @@
 *     Use unblocked code to reduce the rest of the matrix
 *
       CALL SGEHD2( N, I, IHI, A, LDA, TAU, WORK, IINFO )
-      WORK( 1 ) = LWKOPT
+*
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/sgejsv.f b/lapack-netlib/SRC/sgejsv.f
index 923573bdb..42d2b8313 100644
--- a/lapack-netlib/SRC/sgejsv.f
+++ b/lapack-netlib/SRC/sgejsv.f
@@ -253,7 +253,7 @@
 *>          If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of
 *>                         the right singular vectors;
 *>          If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N),
-*>                         then V is used as workspace if the pprocedure
+*>                         then V is used as workspace if the procedure
 *>                         replaces A with A^t. In that case, [U] is computed
 *>                         in V as right singular vectors of A^t and then
 *>                         copied back to the U array. This 'W' option is just
@@ -1386,7 +1386,7 @@
                IF ( CONDR2 .GE. COND_OK ) THEN
 *                 .. save the Householder vectors used for Q3
 *                 (this overwrites the copy of R2, as it will not be
-*                 needed in this branch, but it does not overwritte the
+*                 needed in this branch, but it does not overwrite the
 *                 Huseholder vectors of Q2.).
                   CALL SLACPY( 'U', NR, NR, V, LDV, WORK(2*N+1), N )
 *                 .. and the rest of the information on Q3 is in
@@ -1409,7 +1409,7 @@
             END IF
 *
 *        Second preconditioning finished; continue with Jacobi SVD
-*        The input matrix is lower trinagular.
+*        The input matrix is lower triangular.
 *
 *        Recover the right singular vectors as solution of a well
 *        conditioned triangular matrix equation.
@@ -1454,7 +1454,7 @@
 * :)           .. the input matrix A is very likely a relative of
 *              the Kahan matrix :)
 *              The matrix R2 is inverted. The solution of the matrix equation
-*              is Q3^T*V3 = the product of the Jacobi rotations (appplied to
+*              is Q3^T*V3 = the product of the Jacobi rotations (applied to
 *              the lower triangular L3 from the LQ factorization of
 *              R2=L3*Q3), pre-multiplied with the transposed Q3.
                CALL SGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U,
diff --git a/lapack-netlib/SRC/sgelq.f b/lapack-netlib/SRC/sgelq.f
index 9209f918e..75f02675d 100644
--- a/lapack-netlib/SRC/sgelq.f
+++ b/lapack-netlib/SRC/sgelq.f
@@ -98,7 +98,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1 or -2, then a workspace query is assumed. The routine
 *>          only calculates the sizes of the T and WORK arrays, returns these
 *>          values as the first entries of the T and WORK arrays, and no error
@@ -166,6 +166,8 @@
 *>  the LQ factorization.
 *> \endverbatim
 *>
+*> \ingroup gelq
+*>
 *  =====================================================================
       SUBROUTINE SGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK,
      $                  INFO )
@@ -190,7 +192,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGELQT, SLASWLQ, XERBLA
@@ -292,9 +295,9 @@
         T( 2 ) = MB
         T( 3 ) = NB
         IF( MINW ) THEN
-          WORK( 1 ) = LWMIN
+          WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
         ELSE
-          WORK( 1 ) = LWREQ
+          WORK( 1 ) = SROUNDUP_LWORK( LWREQ )
         END IF
       END IF
       IF( INFO.NE.0 ) THEN
@@ -319,7 +322,7 @@
      $                LWORK, INFO )
       END IF
 *
-      WORK( 1 ) = LWREQ
+      WORK( 1 ) = SROUNDUP_LWORK( LWREQ )
       RETURN
 *
 *     End of SGELQ
diff --git a/lapack-netlib/SRC/sgelqf.f b/lapack-netlib/SRC/sgelqf.f
index 24d8ab19c..3b3913d84 100644
--- a/lapack-netlib/SRC/sgelqf.f
+++ b/lapack-netlib/SRC/sgelqf.f
@@ -93,7 +93,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= max(1,M).
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M, otherwise.
 *>          For optimum performance LWORK >= M*NB, where NB is the
 *>          optimal blocksize.
 *>
@@ -118,7 +119,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEcomputational
+*> \ingroup gelqf
 *
 *> \par Further Details:
 *  =====================
@@ -167,16 +168,16 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
 *     Test the input arguments
 *
       INFO = 0
+      K = MIN( M, N )
       NB = ILAENV( 1, 'SGELQF', ' ', M, N, -1, -1 )
-      LWKOPT = M*NB
-      WORK( 1 ) = LWKOPT
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -184,19 +185,25 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN
-         INFO = -7
+      ELSE IF( .NOT.LQUERY ) THEN
+         IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) )
+     $      INFO = -7
       END IF
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'SGELQF', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
+         IF( K.EQ.0 ) THEN
+            LWKOPT = 1
+         ELSE
+            LWKOPT = M*NB
+         END IF
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
          RETURN
       END IF
 *
 *     Quick return if possible
 *
-      K = MIN( M, N )
       IF( K.EQ.0 ) THEN
          WORK( 1 ) = 1
          RETURN
@@ -266,7 +273,7 @@
      $   CALL SGELQ2( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK,
      $                IINFO )
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK( IWS )
       RETURN
 *
 *     End of SGELQF
diff --git a/lapack-netlib/SRC/sgelqt3.f b/lapack-netlib/SRC/sgelqt3.f
index 23816b4c8..82f5c1cf4 100644
--- a/lapack-netlib/SRC/sgelqt3.f
+++ b/lapack-netlib/SRC/sgelqt3.f
@@ -158,7 +158,8 @@
 *
 *        Compute Householder transform when M=1
 *
-         CALL SLARFG( N, A, A( 1, MIN( 2, N ) ), LDA, T )
+         CALL SLARFG( N, A( 1, 1 ), A( 1, MIN( 2, N ) ), LDA,
+     &                T( 1, 1 ) )
 *
       ELSE
 *
diff --git a/lapack-netlib/SRC/sgels.f b/lapack-netlib/SRC/sgels.f
index ea02c3318..b58f70c9e 100644
--- a/lapack-netlib/SRC/sgels.f
+++ b/lapack-netlib/SRC/sgels.f
@@ -175,7 +175,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEsolve
+*> \ingroup gels
 *
 *  =====================================================================
       SUBROUTINE SGELS( TRANS, M, N, NRHS, A, LDA, B, LDB, WORK, LWORK,
@@ -210,15 +210,15 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANGE
-      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE
+      REAL               SLAMCH, SLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SGELQF, SGEQRF, SLABAD, SLASCL, SLASET, SORMLQ,
+      EXTERNAL           SGELQF, SGEQRF, SLASCL, SLASET, SORMLQ,
      $                   SORMQR, STRTRS, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          MAX, MIN, REAL
+      INTRINSIC          MAX, MIN
 *     ..
 *     .. Executable Statements ..
 *
@@ -273,7 +273,7 @@
          END IF
 *
          WSIZE = MAX( 1, MN + MAX( MN, NRHS )*NB )
-         WORK( 1 ) = REAL( WSIZE )
+         WORK( 1 ) = SROUNDUP_LWORK( WSIZE )
 *
       END IF
 *
@@ -295,7 +295,6 @@
 *
       SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
 *
 *     Scale A, B if max element outside range [SMLNUM,BIGNUM]
 *
@@ -492,7 +491,7 @@
       END IF
 *
    50 CONTINUE
-      WORK( 1 ) = REAL( WSIZE )
+      WORK( 1 ) = SROUNDUP_LWORK( WSIZE )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/sgelsd.f b/lapack-netlib/SRC/sgelsd.f
index f5f17d34c..2818213f4 100644
--- a/lapack-netlib/SRC/sgelsd.f
+++ b/lapack-netlib/SRC/sgelsd.f
@@ -59,12 +59,6 @@
 *> singular values which are less than RCOND times the largest singular
 *> value.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
@@ -195,7 +189,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEsolve
+*> \ingroup gelsd
 *
 *> \par Contributors:
 *  ==================
@@ -235,13 +229,13 @@
       REAL               ANRM, BIGNUM, BNRM, EPS, SFMIN, SMLNUM
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SGEBRD, SGELQF, SGEQRF, SLABAD, SLACPY, SLALSD,
-     $                   SLASCL, SLASET, SORMBR, SORMLQ, SORMQR, XERBLA
+      EXTERNAL           SGEBRD, SGELQF, SGEQRF, SLACPY, SLALSD, SLASCL,
+     $                   SLASET, SORMBR, SORMLQ, SORMQR, XERBLA
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANGE
-      EXTERNAL           SLAMCH, SLANGE, ILAENV
+      REAL               SLAMCH, SLANGE, SROUNDUP_LWORK
+      EXTERNAL           SLAMCH, SLANGE, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          INT, LOG, MAX, MIN, REAL
@@ -354,7 +348,7 @@
             END IF
          END IF
          MINWRK = MIN( MINWRK, MAXWRK )
-         WORK( 1 ) = MAXWRK
+         WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
          IWORK( 1 ) = LIWORK
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN
@@ -382,7 +376,6 @@
       SFMIN = SLAMCH( 'S' )
       SMLNUM = SFMIN / EPS
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
 *
 *     Scale A if max entry outside range [SMLNUM,BIGNUM].
 *
@@ -621,7 +614,7 @@
       END IF
 *
    10 CONTINUE
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
       IWORK( 1 ) = LIWORK
       RETURN
 *
diff --git a/lapack-netlib/SRC/sgelss.f b/lapack-netlib/SRC/sgelss.f
index 9aed4329f..2e4b0cdd5 100644
--- a/lapack-netlib/SRC/sgelss.f
+++ b/lapack-netlib/SRC/sgelss.f
@@ -164,7 +164,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEsolve
+*> \ingroup gelss
 *
 *  =====================================================================
       SUBROUTINE SGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK,
@@ -202,13 +202,13 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SBDSQR, SCOPY, SGEBRD, SGELQF, SGEMM, SGEMV,
-     $                   SGEQRF, SLABAD, SLACPY, SLASCL, SLASET, SORGBR,
+     $                   SGEQRF, SLACPY, SLASCL, SLASET, SORGBR,
      $                   SORMBR, SORMLQ, SORMQR, SRSCL, XERBLA
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANGE
-      EXTERNAL           ILAENV, SLAMCH, SLANGE
+      REAL               SLAMCH, SLANGE, SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -355,7 +355,7 @@
             END IF
             MAXWRK = MAX( MINWRK, MAXWRK )
          END IF
-         WORK( 1 ) = MAXWRK
+         WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY )
      $      INFO = -12
@@ -381,7 +381,6 @@
       SFMIN = SLAMCH( 'S' )
       SMLNUM = SFMIN / EPS
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
 *
 *     Scale A if max element outside range [SMLNUM,BIGNUM]
 *
@@ -525,7 +524,7 @@
      $                     LDB, ZERO, WORK, N )
                CALL SLACPY( 'G', N, BL, WORK, N, B( 1, I ), LDB )
    20       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
             CALL SGEMV( 'T', N, N, ONE, A, LDA, B, 1, ZERO, WORK, 1 )
             CALL SCOPY( N, WORK, 1, B, 1 )
          END IF
@@ -622,7 +621,7 @@
                CALL SLACPY( 'G', M, BL, WORK( IWORK ), M, B( 1, I ),
      $                      LDB )
    40       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
             CALL SGEMV( 'T', M, M, ONE, WORK( IL ), LDWORK, B( 1, 1 ),
      $                  1, ZERO, WORK( IWORK ), 1 )
             CALL SCOPY( M, WORK( IWORK ), 1, B( 1, 1 ), 1 )
@@ -708,7 +707,7 @@
      $                     LDB, ZERO, WORK, N )
                CALL SLACPY( 'F', N, BL, WORK, N, B( 1, I ), LDB )
    60       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
             CALL SGEMV( 'T', M, N, ONE, A, LDA, B, 1, ZERO, WORK, 1 )
             CALL SCOPY( N, WORK, 1, B, 1 )
          END IF
@@ -732,7 +731,7 @@
       END IF
 *
    70 CONTINUE
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
       RETURN
 *
 *     End of SGELSS
diff --git a/lapack-netlib/SRC/sgelst.f b/lapack-netlib/SRC/sgelst.f
index 5377bc720..b89918656 100644
--- a/lapack-netlib/SRC/sgelst.f
+++ b/lapack-netlib/SRC/sgelst.f
@@ -176,7 +176,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEsolve
+*> \ingroup gelst
 *
 *> \par Contributors:
 *  ==================
@@ -222,15 +222,15 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANGE
-      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE
+      REAL               SLAMCH, SLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SGELQT, SGEQRT, SGEMLQT, SGEMQRT, SLABAD,
+      EXTERNAL           SGELQT, SGEQRT, SGEMLQT, SGEMQRT,
      $                   SLASCL, SLASET, STRTRS, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          REAL, MAX, MIN
+      INTRINSIC          MAX, MIN
 *     ..
 *     .. Executable Statements ..
 *
@@ -268,7 +268,7 @@
 *
          MNNRHS = MAX( MN, NRHS )
          LWOPT = MAX( 1, (MN+MNNRHS)*NB )
-         WORK( 1 ) = REAL( LWOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWOPT )
 *
       END IF
 *
@@ -283,7 +283,7 @@
 *
       IF( MIN( M, N, NRHS ).EQ.0 ) THEN
          CALL SLASET( 'Full', MAX( M, N ), NRHS, ZERO, ZERO, B, LDB )
-         WORK( 1 ) = REAL( LWOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWOPT )
          RETURN
       END IF
 *
@@ -309,7 +309,6 @@
 *
       SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
 *
 *     Scale A, B if max element outside range [SMLNUM,BIGNUM]
 *
@@ -332,7 +331,7 @@
 *        Matrix all zero. Return zero solution.
 *
          CALL SLASET( 'Full', MAX( M, N ), NRHS, ZERO, ZERO, B, LDB )
-         WORK( 1 ) = REAL( LWOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWOPT )
          RETURN
       END IF
 *
@@ -522,7 +521,7 @@
      $                INFO )
       END IF
 *
-      WORK( 1 ) = REAL( LWOPT )
+      WORK( 1 ) = SROUNDUP_LWORK( LWOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/sgelsy.f b/lapack-netlib/SRC/sgelsy.f
index 9c60f78a7..c7f5069de 100644
--- a/lapack-netlib/SRC/sgelsy.f
+++ b/lapack-netlib/SRC/sgelsy.f
@@ -115,6 +115,7 @@
 *>          B is REAL array, dimension (LDB,NRHS)
 *>          On entry, the M-by-NRHS right hand side matrix B.
 *>          On exit, the N-by-NRHS solution matrix X.
+*>          If M = 0 or N = 0, B is not referenced.
 *> \endverbatim
 *>
 *> \param[in] LDB
@@ -147,6 +148,7 @@
 *>          The effective rank of A, i.e., the order of the submatrix
 *>          R11.  This is the same as the order of the submatrix T11
 *>          in the complete orthogonal factorization of A.
+*>          If NRHS = 0, RANK = 0 on output.
 *> \endverbatim
 *>
 *> \param[out] WORK
@@ -189,7 +191,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEsolve
+*> \ingroup gelsy
 *
 *> \par Contributors:
 *  ==================
@@ -232,11 +234,11 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANGE
-      EXTERNAL           ILAENV, SLAMCH, SLANGE
+      REAL               SLAMCH, SLANGE, SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SCOPY, SGEQP3, SLABAD, SLAIC1, SLASCL, SLASET,
+      EXTERNAL           SCOPY, SGEQP3, SLAIC1, SLASCL, SLASET,
      $                   SORMQR, SORMRZ, STRSM, STZRZF, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
@@ -280,7 +282,7 @@
             LWKOPT = MAX( LWKMIN,
      $                    MN + 2*N + NB*( N + 1 ), 2*MN + NB*NRHS )
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
             INFO = -12
@@ -305,7 +307,6 @@
 *
       SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
 *
 *     Scale A, B if max entries outside range [SMLNUM,BIGNUM]
 *
@@ -467,7 +468,7 @@
       END IF
 *
    70 CONTINUE
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/sgemlq.f b/lapack-netlib/SRC/sgemlq.f
index 536abf184..7e4d9bf65 100644
--- a/lapack-netlib/SRC/sgemlq.f
+++ b/lapack-netlib/SRC/sgemlq.f
@@ -110,16 +110,17 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) REAL array, dimension (MAX(1,LWORK))
+*>          (workspace) REAL array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1, then a workspace query is assumed. The routine
 *>          only calculates the size of the WORK array, returns this
-*>          value as WORK(1), and no error message related to WORK 
+*>          value as WORK(1), and no error message related to WORK
 *>          is issued by XERBLA.
 *> \endverbatim
 *>
@@ -143,7 +144,7 @@
 *>
 *> \verbatim
 *>
-*> These details are particular for this LAPACK implementation. Users should not 
+*> These details are particular for this LAPACK implementation. Users should not
 *> take them for granted. These details may change in the future, and are not likely
 *> true for another LAPACK implementation. These details are relevant if one wants
 *> to try to understand the code. They are not part of the interface.
@@ -159,11 +160,13 @@
 *>  block sizes MB and NB returned by ILAENV, SGELQ will use either
 *>  SLASWLQ (if the matrix is wide-and-short) or SGELQT to compute
 *>  the LQ factorization.
-*>  This version of SGEMLQ will use either SLAMSWLQ or SGEMLQT to 
+*>  This version of SGEMLQ will use either SLAMSWLQ or SGEMLQT to
 *>  multiply matrix Q by another matrix.
 *>  Further Details in SLAMSWLQ or SGEMLQT.
 *> \endverbatim
 *>
+*> \ingroup gemlq
+*>
 *  =====================================================================
       SUBROUTINE SGEMLQ( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE,
      $                   C, LDC, WORK, LWORK, INFO )
@@ -185,12 +188,16 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LEFT, RIGHT, TRAN, NOTRAN, LQUERY
-      INTEGER            MB, NB, LW, NBLCKS, MN
+      INTEGER            MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       EXTERNAL           LSAME
 *     ..
+*     .. External Functions ..
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
+*     ..
 *     .. External Subroutines ..
       EXTERNAL           SLAMSWLQ, SGEMLQT, XERBLA
 *     ..
@@ -201,7 +208,7 @@
 *
 *     Test the input arguments
 *
-      LQUERY  = LWORK.EQ.-1
+      LQUERY  = ( LWORK.EQ.-1 )
       NOTRAN  = LSAME( TRANS, 'N' )
       TRAN    = LSAME( TRANS, 'T' )
       LEFT    = LSAME( SIDE, 'L' )
@@ -216,6 +223,13 @@
         LW = M * MB
         MN = N
       END IF
+*
+      MINMNK = MIN( M, N, K )
+      IF( MINMNK.EQ.0 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = MAX( 1, LW )
+      END IF
 *
       IF( ( NB.GT.K ) .AND. ( MN.GT.K ) ) THEN
         IF( MOD( MN - K, NB - K ) .EQ. 0 ) THEN
@@ -244,12 +258,12 @@
         INFO = -9
       ELSE IF( LDC.LT.MAX( 1, M ) ) THEN
         INFO = -11
-      ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
         INFO = -13
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-        WORK( 1 ) = REAL( LW )
+        WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -261,7 +275,7 @@
 *
 *     Quick return if possible
 *
-      IF( MIN( M, N, K ).EQ.0 ) THEN
+      IF( MINMNK.EQ.0 ) THEN
         RETURN
       END IF
 *
@@ -274,7 +288,7 @@
      $                 MB, C, LDC, WORK, LWORK, INFO )
       END IF
 *
-      WORK( 1 ) = REAL( LW )
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/sgemqr.f b/lapack-netlib/SRC/sgemqr.f
index 2a9257459..19bf467b8 100644
--- a/lapack-netlib/SRC/sgemqr.f
+++ b/lapack-netlib/SRC/sgemqr.f
@@ -120,7 +120,7 @@
 *>          The dimension of the array WORK.
 *>          If LWORK = -1, then a workspace query is assumed. The routine
 *>          only calculates the size of the WORK array, returns this
-*>          value as WORK(1), and no error message related to WORK 
+*>          value as WORK(1), and no error message related to WORK
 *>          is issued by XERBLA.
 *> \endverbatim
 *>
@@ -144,7 +144,7 @@
 *>
 *> \verbatim
 *>
-*> These details are particular for this LAPACK implementation. Users should not 
+*> These details are particular for this LAPACK implementation. Users should not
 *> take them for granted. These details may change in the future, and are not likely
 *> true for another LAPACK implementation. These details are relevant if one wants
 *> to try to understand the code. They are not part of the interface.
@@ -160,12 +160,14 @@
 *>  block sizes MB and NB returned by ILAENV, SGEQR will use either
 *>  SLATSQR (if the matrix is tall-and-skinny) or SGEQRT to compute
 *>  the QR factorization.
-*>  This version of SGEMQR will use either SLAMTSQR or SGEMQRT to 
+*>  This version of SGEMQR will use either SLAMTSQR or SGEMQRT to
 *>  multiply matrix Q by another matrix.
 *>  Further Details in SLAMTSQR or SGEMQRT.
 *>
 *> \endverbatim
 *>
+*> \ingroup gemqr
+*>
 *  =====================================================================
       SUBROUTINE SGEMQR( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE,
      $                   C, LDC, WORK, LWORK, INFO )
@@ -187,11 +189,13 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LEFT, RIGHT, TRAN, NOTRAN, LQUERY
-      INTEGER            MB, NB, LW, NBLCKS, MN
+      INTEGER            MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGEMQRT, SLAMTSQR, XERBLA
@@ -203,7 +207,7 @@
 *
 *     Test the input arguments
 *
-      LQUERY  = LWORK.EQ.-1
+      LQUERY  = ( LWORK.EQ.-1 )
       NOTRAN  = LSAME( TRANS, 'N' )
       TRAN    = LSAME( TRANS, 'T' )
       LEFT    = LSAME( SIDE, 'L' )
@@ -218,6 +222,13 @@
         LW = MB * NB
         MN = N
       END IF
+*
+      MINMNK = MIN( M, N, K )
+      IF( MINMNK.EQ.0 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = MAX( 1, LW )
+      END IF
 *
       IF( ( MB.GT.K ) .AND. ( MN.GT.K ) ) THEN
         IF( MOD( MN - K, MB - K ).EQ.0 ) THEN
@@ -246,12 +257,12 @@
         INFO = -9
       ELSE IF( LDC.LT.MAX( 1, M ) ) THEN
         INFO = -11
-      ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
         INFO = -13
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-        WORK( 1 ) = LW
+        WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -263,7 +274,7 @@
 *
 *     Quick return if possible
 *
-      IF( MIN( M, N, K ).EQ.0 ) THEN
+      IF( MINMNK.EQ.0 ) THEN
         RETURN
       END IF
 *
@@ -276,7 +287,7 @@
      $                 NB, C, LDC, WORK, LWORK, INFO )
       END IF
 *
-      WORK( 1 ) = LW
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/sgeqlf.f b/lapack-netlib/SRC/sgeqlf.f
index efecfbb3c..14942b765 100644
--- a/lapack-netlib/SRC/sgeqlf.f
+++ b/lapack-netlib/SRC/sgeqlf.f
@@ -88,7 +88,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= max(1,N).
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise.
 *>          For optimum performance LWORK >= N*NB, where NB is the
 *>          optimal blocksize.
 *>
@@ -113,7 +114,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEcomputational
+*> \ingroup geqlf
 *
 *> \par Further Details:
 *  =====================
@@ -162,7 +163,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -186,10 +188,11 @@
             NB = ILAENV( 1, 'SGEQLF', ' ', M, N, -1, -1 )
             LWKOPT = N*NB
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
-         IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN
-            INFO = -7
+         IF( .NOT.LQUERY ) THEN
+            IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) )
+     $         INFO = -7
          END IF
       END IF
 *
@@ -276,7 +279,7 @@
       IF( MU.GT.0 .AND. NU.GT.0 )
      $   CALL SGEQL2( MU, NU, A, LDA, TAU, WORK, IINFO )
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of SGEQLF
diff --git a/lapack-netlib/SRC/sgeqp3.f b/lapack-netlib/SRC/sgeqp3.f
index 493bdae6a..9f2f40b2e 100644
--- a/lapack-netlib/SRC/sgeqp3.f
+++ b/lapack-netlib/SRC/sgeqp3.f
@@ -120,7 +120,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEcomputational
+*> \ingroup geqp3
 *
 *> \par Further Details:
 *  =====================
@@ -177,8 +177,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      REAL               SNRM2
-      EXTERNAL           ILAENV, SNRM2
+      REAL               SNRM2, SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SNRM2, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          INT, MAX, MIN
@@ -205,7 +205,7 @@
             NB = ILAENV( INB, 'SGEQRF', ' ', M, N, -1, -1 )
             LWKOPT = 2*N + ( N + 1 )*NB
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN
             INFO = -8
@@ -347,7 +347,7 @@
 *
       END IF
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of SGEQP3
diff --git a/lapack-netlib/SRC/sgeqp3rk.c b/lapack-netlib/SRC/sgeqp3rk.c
new file mode 100644
index 000000000..fe52901bf
--- /dev/null
+++ b/lapack-netlib/SRC/sgeqp3rk.c
@@ -0,0 +1,1055 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) { /* Returns x raised to the integer power n (single precision). */
+	float pow=1.0; unsigned long int u; /* pow accumulates the result; u holds the exponent bits still to process */
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x; /* negative exponent: invert the base and continue with -n */
+		for(u = n; ; ) { /* square-and-multiply over the bits of u, lowest bit first */
+			if(u & 01) pow *= x; /* low bit set: multiply the current power of x into the result */
+			if(u >>= 1) x *= x; /* bits remain: square the base for the next bit */
+			else break;
+		}
+	}
+	return pow;
+}
+static double dpow_ui(double x, integer n) { /* Returns x raised to the integer power n (double precision). */
+	double pow=1.0; unsigned long int u; /* pow accumulates the result; u holds the exponent bits still to process */
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x; /* negative exponent: invert the base and continue with -n */
+		for(u = n; ; ) { /* square-and-multiply over the bits of u, lowest bit first */
+			if(u & 01) pow *= x; /* low bit set: multiply the current power of x into the result */
+			if(u >>= 1) x *= x; /* bits remain: square the base for the next bit */
+			else break;
+		}
+	}
+	return pow;
+}
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) { /* Returns complex x**n; MSVC has no _Complex operators, so the arithmetic is spelled out */
+	complex pow={1.0,0.0}; unsigned long int u; real t; /* t is scratch for the real part while the imaginary part is updated */
+	if(n != 0) {
+		if(n < 0) { t = x.r*x.r + x.i*x.i; n = -n; x.r = x.r/t; x.i = -x.i/t; } /* x = 1/x = conj(x)/|x|^2 */
+		for(u = n; ; ) { /* square-and-multiply over the bits of u */
+			if(u & 01) { t = pow.r*x.r - pow.i*x.i; pow.i = pow.r*x.i + pow.i*x.r; pow.r = t; } /* pow *= x (complex product) */
+			if(u >>= 1) { t = x.r*x.r - x.i*x.i; x.i = 2*x.r*x.i; x.r = t; } /* x *= x (complex square) */
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) { /* Returns complex x**n using the compiler's _Complex arithmetic */
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x; /* negative exponent: invert the base and continue with -n */
+		for(u = n; ; ) { /* square-and-multiply over the bits of u */
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) { /* Returns complex x**n; MSVC has no _Complex operators, so the arithmetic is spelled out */
+	_Dcomplex pow={1.0,0.0}; unsigned long int u; double t; /* t is scratch for the real part while the imaginary part is updated */
+	if(n != 0) {
+		if(n < 0) { t = x._Val[0]*x._Val[0] + x._Val[1]*x._Val[1]; n = -n; x._Val[0] = x._Val[0]/t; x._Val[1] = -x._Val[1]/t; } /* x = 1/x = conj(x)/|x|^2 */
+		for(u = n; ; ) { /* square-and-multiply over the bits of u */
+			if(u & 01) { t = pow._Val[0]*x._Val[0] - pow._Val[1]*x._Val[1]; pow._Val[1] = pow._Val[0]*x._Val[1] + pow._Val[1]*x._Val[0]; pow._Val[0] = t; } /* pow *= x (complex product) */
+			if(u >>= 1) { t = x._Val[0]*x._Val[0] - x._Val[1]*x._Val[1]; x._Val[1] = 2*x._Val[0]*x._Val[1]; x._Val[0] = t; } /* x *= x (complex square) */
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) { /* Returns complex x**n using the compiler's _Complex arithmetic */
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x; /* negative exponent: invert the base and continue with -n */
+		for(u = n; ; ) { /* square-and-multiply over the bits of u */
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) { /* Returns x**n in integer arithmetic, including non-positive exponents. */
+	integer pow; unsigned long int u; /* pow is the result; u holds the exponent bits for the loop below */
+	if (n <= 0) {
+		if (n == 0 || x == 1) pow = 1; /* x**0 and 1**n are both 1 */
+		else if (x != -1) pow = x == 0 ? 1/x : 0; /* n < 0: result is 0 unless x == 0, where 1/x is an integer division by zero */
+		else n = -n; /* x == -1 with n < 0: flip the sign of n and let the loop below compute the result */
+	}
+	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { /* positive exponent, or x == -1 with a nonzero exponent */
+		u = n;
+		for(pow = 1; ; ) { /* square-and-multiply over the bits of u */
+			if(u & 01) pow *= x; /* low bit set: multiply the current power of x into the result */
+			if(u >>= 1) x *= x; /* bits remain: square the base for the next bit */
+			else break;
+		}
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n) /* Position of the largest of w[s-1..e-1], counted from 1 at index s; n is not used here */
+{
+	double m; integer i, mi; /* m is the running maximum, mi the 1-based index where it was found */
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++) /* s and e are 1-based bounds, hence the -1 when indexing w */
+		if (w[i-1]>m) mi=i ,m=w[i-1]; /* strict > keeps the first occurrence when values tie */
+	return mi-s+1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n) /* Position of the largest of w[s-1..e-1], counted from 1 at index s; n is not used here */
+{
+	float m; integer i, mi; /* m is the running maximum, mi the 1-based index where it was found */
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++) /* s and e are 1-based bounds, hence the -1 when indexing w */
+		if (w[i-1]>m) mi=i ,m=w[i-1]; /* strict > keeps the first occurrence when values tie */
+	return mi-s+1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { /* Stores in z the sum of conj(x[i*incx]) times y[i*incy] for i = 0..n-1 */
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0}; /* MSVC has no _Complex operators: accumulate real and imaginary parts explicitly */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += x[i].r * y[i].r + x[i].i * y[i].i;
+			zdotc._Val[1] += x[i].r * y[i].i - x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r + x[i*incx].i * y[i*incy].i;
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i - x[i*incx].i * y[i*incy].r;
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) { /* Stores in z the sum of conj(x[i*incx]) times y[i*incy] for i = 0..n-1 */
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0}; /* MSVC has no _Complex operators: accumulate real and imaginary parts explicitly */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += x[i].r * y[i].r + x[i].i * y[i].i;
+			zdotc._Val[1] += x[i].r * y[i].i - x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r + x[i*incx].i * y[i*incy].i;
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i - x[i*incx].i * y[i*incy].r;
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { /* Stores in z the sum of x[i*incx] times y[i*incy] (no conjugation) for i = 0..n-1 */
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0}; /* MSVC has no _Complex operators: accumulate real and imaginary parts explicitly */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i)* y(i) */
+			zdotc._Val[0] += x[i].r * y[i].r - x[i].i * y[i].i;
+			zdotc._Val[1] += x[i].r * y[i].i + x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i)* y(i) */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r - x[i*incx].i * y[i*incy].i;
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i + x[i*incx].i * y[i*incy].r;
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i)* y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i)* y(i) */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) { /* Stores in z the sum of x[i*incx] times y[i*incy] (no conjugation) for i = 0..n-1 */
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0}; /* MSVC has no _Complex operators: accumulate real and imaginary parts explicitly */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i)* y(i) */
+			zdotc._Val[0] += x[i].r * y[i].r - x[i].i * y[i].i;
+			zdotc._Val[1] += x[i].r * y[i].i + x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i)* y(i) */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r - x[i*incx].i * y[i*incy].i;
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i + x[i*incx].i * y[i*incy].r;
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i)* y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i)* y(i) */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static integer c__1 = 1;
+static integer c_n1 = -1;
+static integer c__3 = 3;
+static integer c__2 = 2;
+
+/* Subroutine */ int sgeqp3rk_(integer *m, integer *n, integer *nrhs, integer 
+	*kmax, real *abstol, real *reltol, real *a, integer *lda, integer *k, 
+	real *maxc2nrmk, real *relmaxc2nrmk, integer *jpiv, real *tau, real *
+	work, integer *lwork, integer *iwork, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2;
+    real r__1, r__2;
+
+    /* Local variables */
+    real maxc2nrm;
+    extern /* Subroutine */ int slaqp2rk_(integer *, integer *, integer *, 
+	    integer *, integer *, real *, real *, integer *, real *, real *, 
+	    integer *, integer *, real *, real *, integer *, real *, real *, 
+	    real *, real *, integer *), slaqp3rk_(integer *, integer *, 
+	    integer *, integer *, integer *, real *, real *, integer *, real *
+	    , real *, integer *, logical *, integer *, real *, real *, 
+	    integer *, real *, real *, real *, real *, real *, integer *, 
+	    integer *, integer *);
+    logical done;
+    integer jmax;
+    extern real snrm2_(integer *, real *, integer *);
+    integer j, jmaxc2nrm, jmaxb, nbmin, iinfo, n_sub__, minmn;
+    real myhugeval;
+    integer jb, nb, kf, nx;
+    extern real slamch_(char *);
+    real safmin;
+    extern /* Subroutine */ int xerbla_(char *, integer *);
+    extern integer ilaenv_(integer *, char *, char *, integer *, integer *, 
+	    integer *, integer *, ftnlen, ftnlen), isamax_(integer *, real *, 
+	    integer *);
+    extern logical sisnan_(real *);
+    integer kp1, lwkopt;
+    logical lquery;
+    integer jbf;
+    real eps;
+    integer iws, ioffset;
+
+/*  Purpose: truncated Householder QR factorization with column pivoting of */
+/*  A(1:M,1:N); the block B = A(1:M,N+1:N+NRHS) is overwritten by Q**T * B. */
+/*  -- LAPACK computational routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+/*  Outputs: K (columns factorized), MAXC2NRMK and RELMAXC2NRMK (residual max */
+/*  column 2-norm, absolute and relative), JPIV, TAU, WORK(1) = LWKOPT. */
+/*  ===================================================================== */
+/*  INFO: < 0 bad argument; 1..N column of first NaN; N+1..2N first Inf (+N). */
+
+
+/*     Test input arguments */
+/*     ==================== */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --jpiv;
+    --tau;
+    --work;
+    --iwork;
+
+    /* Function Body */
+    *info = 0;
+    lquery = *lwork == -1;
+    if (*m < 0) {
+	*info = -1;
+    } else if (*n < 0) {
+	*info = -2;
+    } else if (*nrhs < 0) {
+	*info = -3;
+    } else if (*kmax < 0) {
+	*info = -4;
+    } else if (sisnan_(abstol)) {
+	*info = -5;
+    } else if (sisnan_(reltol)) {
+	*info = -6;
+    } else if (*lda < f2cmax(1,*m)) {
+	*info = -8;
+    }
+
+/*     If the input parameters M, N, NRHS, KMAX, LDA are valid: */
+/*       a) Test the input workspace size LWORK for the minimum */
+/*          size requirement IWS. */
+/*       b) Determine the optimal block size NB and optimal */
+/*          workspace size LWKOPT to be returned in WORK(1) */
+/*          in case of (1) LWORK < IWS, (2) LQUERY = .TRUE., */
+/*          (3) when routine exits. */
+/*     Here, IWS is the minimum workspace required for unblocked */
+/*     code. */
+
+    if (*info == 0) {
+	minmn = f2cmin(*m,*n);
+	if (minmn == 0) {
+	    iws = 1;
+	    lwkopt = 1;
+	} else {
+
+/*           Minimal workspace size in case of using only unblocked */
+/*           BLAS 2 code in SLAQP2RK. */
+/*           1) SGEQP3RK and SLAQP2RK: 2*N to store full and partial */
+/*              column 2-norms. */
+/*           2) SLAQP2RK: N+NRHS-1 to use in WORK array that is used */
+/*              in SLARF subroutine inside SLAQP2RK to apply an */
+/*              elementary reflector from the left. */
+/*           TOTAL_WORK_SIZE = 3*N + NRHS - 1 */
+
+	    iws = *n * 3 + *nrhs - 1;
+
+/*           Assign to NB optimal block size. */
+
+	    nb = ilaenv_(&c__1, "SGEQP3RK", " ", m, n, &c_n1, &c_n1, (ftnlen)
+		    8, (ftnlen)1);
+
+/*           A formula for the optimal workspace size in case of using */
+/*           both unblocked BLAS 2 in SLAQP2RK and blocked BLAS 3 code */
+/*           in SLAQP3RK. */
+/*           1) SGEQP3RK, SLAQP2RK, SLAQP3RK: 2*N to store full and */
+/*              partial column 2-norms. */
+/*           2) SLAQP2RK: N+NRHS-1 to use in WORK array that is used */
+/*              in SLARF subroutine to apply an elementary reflector */
+/*              from the left. */
+/*           3) SLAQP3RK: NB*(N+NRHS) to use in the work array F that */
+/*              is used to apply a block reflector from */
+/*              the left. */
+/*           4) SLAQP3RK: NB to use in the auxiliary array AUX. */
+/*           Sizes (2) and ((3) + (4)) should intersect, therefore */
+/*           TOTAL_WORK_SIZE = 2*N + NB*( N+NRHS+1 ), given NBMIN=2. */
+
+	    lwkopt = (*n << 1) + nb * (*n + *nrhs + 1);
+	}
+	work[1] = (real) lwkopt;
+
+	if (*lwork < iws && ! lquery) {
+	    *info = -15;
+	}
+    }
+
+/*      NOTE: The optimal workspace size is returned in WORK(1), if */
+/*            the input parameters M, N, NRHS, KMAX, LDA are valid. */
+
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("SGEQP3RK", &i__1);
+	return 0;
+    } else if (lquery) {
+	return 0;
+    }
+
+/*     Quick return if possible for M=0 or N=0. */
+
+    if (minmn == 0) {
+	*k = 0;
+	*maxc2nrmk = 0.f;
+	*relmaxc2nrmk = 0.f;
+	work[1] = (real) lwkopt;
+	return 0;
+    }
+
+/*     ================================================================== */
+
+/*     Initialize column pivot array JPIV. */
+
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+	jpiv[j] = j;
+    }
+
+/*     ================================================================== */
+
+/*     Initialize storage for partial and exact column 2-norms. */
+/*     a) The elements WORK(1:N) are used to store partial column */
+/*        2-norms of the matrix A, and may decrease in each computation */
+/*        step; initialize to the values of complete columns 2-norms. */
+/*     b) The elements WORK(N+1:2*N) are used to store complete column */
+/*        2-norms of the matrix A, they are not changed during the */
+/*        computation; initialize the values of complete columns 2-norms. */
+
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+	work[j] = snrm2_(m, &a[j * a_dim1 + 1], &c__1);
+	work[*n + j] = work[j];
+    }
+
+/*     ================================================================== */
+
+/*     Compute the pivot column index and the maximum column 2-norm */
+/*     for the whole original matrix stored in A(1:M,1:N). */
+
+    kp1 = isamax_(n, &work[1], &c__1);
+    maxc2nrm = work[kp1];
+
+/*     ==================================================================. */
+
+    if (sisnan_(&maxc2nrm)) {
+
+/*        Check if the matrix A contains NaN, set INFO parameter */
+/*        to the column number where the first NaN is found and return */
+/*        from the routine. */
+
+	*k = 0;
+	*info = kp1;
+
+/*        Set MAXC2NRMK and  RELMAXC2NRMK to NaN. */
+
+	*maxc2nrmk = maxc2nrm;
+	*relmaxc2nrmk = maxc2nrm;
+
+/*        Array TAU is not set and contains undefined elements. */
+
+	work[1] = (real) lwkopt;
+	return 0;
+    }
+
+/*     =================================================================== */
+
+    if (maxc2nrm == 0.f) {
+
+/*        Check if the matrix A is a zero matrix, set array TAU and */
+/*        return from the routine. */
+
+	*k = 0;
+	*maxc2nrmk = 0.f;
+	*relmaxc2nrmk = 0.f;
+
+	i__1 = minmn;
+	for (j = 1; j <= i__1; ++j) {
+	    tau[j] = 0.f;
+	}
+
+	work[1] = (real) lwkopt;
+	return 0;
+
+    }
+
+/*     =================================================================== */
+
+    myhugeval = slamch_("Overflow");
+
+    if (maxc2nrm > myhugeval) {
+
+/*        Check if the matrix A contains +Inf or -Inf, set INFO parameter */
+/*        to the column number, where the first +/-Inf  is found plus N, */
+/*        and continue the computation. */
+
+	*info = *n + kp1;
+
+    }
+
+/*     ================================================================== */
+
+/*     Quick return if possible for the case when the first */
+/*     stopping criterion is satisfied, i.e. KMAX = 0. */
+
+    if (*kmax == 0) {
+	*k = 0;
+	*maxc2nrmk = maxc2nrm;
+	*relmaxc2nrmk = 1.f;
+	i__1 = minmn;
+	for (j = 1; j <= i__1; ++j) {
+	    tau[j] = 0.f;
+	}
+	work[1] = (real) lwkopt;
+	return 0;
+    }
+
+/*     ================================================================== */
+
+    eps = slamch_("Epsilon");
+
+/*     Adjust ABSTOL */
+
+    if (*abstol >= 0.f) {
+	safmin = slamch_("Safe minimum");
+/* Computing MAX */
+	r__1 = *abstol, r__2 = safmin * 2.f;
+	*abstol = f2cmax(r__1,r__2);
+    }
+
+/*     Adjust RELTOL */
+
+    if (*reltol >= 0.f) {
+	*reltol = f2cmax(*reltol,eps);
+    }
+
+/*     =================================================================== */
+
+/*     JMAX is the maximum index of the column to be factorized, */
+/*     which is also limited by the first stopping criterion KMAX. */
+
+    jmax = f2cmin(*kmax,minmn);
+
+/*     =================================================================== */
+
+/*     Quick return if possible for the case when the second or third */
+/*     stopping criterion for the whole original matrix is satisfied, */
+/*     i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL */
+/*     (which is ONE <= RELTOL). */
+
+    if (maxc2nrm <= *abstol || 1.f <= *reltol) {
+
+	*k = 0;
+	*maxc2nrmk = maxc2nrm;
+	*relmaxc2nrmk = 1.f;
+
+	i__1 = minmn;
+	for (j = 1; j <= i__1; ++j) {
+	    tau[j] = 0.f;
+	}
+
+	work[1] = (real) lwkopt;
+	return 0;
+    }
+
+/*     ================================================================== */
+/*     Factorize columns */
+/*     ================================================================== */
+
+/*     Determine the block size. */
+
+    nbmin = 2;
+    nx = 0;
+
+    if (nb > 1 && nb < minmn) {
+
+/*        Determine when to cross over from blocked to unblocked code. */
+/*        (for N less than NX, unblocked code should be used). */
+
+/* Computing MAX */
+	i__1 = 0, i__2 = ilaenv_(&c__3, "SGEQP3RK", " ", m, n, &c_n1, &c_n1, (
+		ftnlen)8, (ftnlen)1);
+	nx = f2cmax(i__1,i__2);
+
+	if (nx < minmn) {
+
+/*           Determine if workspace is large enough for blocked code. */
+
+	    if (*lwork < lwkopt) {
+
+/*              Not enough workspace to use optimal block size that */
+/*              is currently stored in NB. */
+/*              Reduce NB and determine the minimum value of NB. */
+/*              AUX and F need NB*(N+NRHS+1) past the 2*N norms, as in LWKOPT. */
+		nb = (*lwork - (*n << 1)) / (*n + *nrhs + 1);
+/* Computing MAX */
+		i__1 = 2, i__2 = ilaenv_(&c__2, "SGEQP3RK", " ", m, n, &c_n1, 
+			&c_n1, (ftnlen)8, (ftnlen)1);
+		nbmin = f2cmax(i__1,i__2);
+
+	    }
+	}
+    }
+
+/*     ================================================================== */
+
+/*     DONE is the boolean flag to represent the case when the */
+/*     factorization completed in the block factorization routine, */
+/*     before the end of the block. */
+
+    done = FALSE_;
+
+/*     J is the column index. */
+
+    j = 1;
+
+/*     (1) Use blocked code initially. */
+
+/*     JMAXB is the maximum column index of the block, when the */
+/*     blocked code is used, is also limited by the first stopping */
+/*     criterion KMAX. */
+
+/* Computing MIN */
+    i__1 = *kmax, i__2 = minmn - nx;
+    jmaxb = f2cmin(i__1,i__2);
+
+    if (nb >= nbmin && nb < jmax && jmaxb > 0) {
+
+/*        Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here: */
+/*        J   is the column index of a column block; */
+/*        JB  is the column block size to pass to block factorization */
+/*            routine in a loop step; */
+/*        JBF is the number of columns that were actually factorized */
+/*            that was returned by the block factorization routine */
+/*            in a loop step, JBF <= JB; */
+/*        N_SUB is the number of columns in the submatrix; */
+/*        IOFFSET is the number of rows that should not be factorized. */
+
+	while(j <= jmaxb) {
+
+/* Computing MIN */
+	    i__1 = nb, i__2 = jmaxb - j + 1;
+	    jb = f2cmin(i__1,i__2);
+	    n_sub__ = *n - j + 1;
+	    ioffset = j - 1;
+
+/*           Factorize JB columns among the columns A(J:N). */
+
+	    i__1 = *n + *nrhs - j + 1;
+	    slaqp3rk_(m, &n_sub__, nrhs, &ioffset, &jb, abstol, reltol, &kp1, 
+		    &maxc2nrm, &a[j * a_dim1 + 1], lda, &done, &jbf, 
+		    maxc2nrmk, relmaxc2nrmk, &jpiv[j], &tau[j], &work[j], &
+		    work[*n + j], &work[(*n << 1) + 1], &work[(*n << 1) + jb 
+		    + 1], &i__1, &iwork[1], &iinfo);
+
+/*           Set INFO on the first occurrence of Inf. */
+
+	    if (iinfo > n_sub__ && *info == 0) {
+		*info = (ioffset << 1) + iinfo;
+	    }
+
+	    if (done) {
+
+/*              Either the submatrix is zero before the end of the */
+/*              column block, or ABSTOL or RELTOL criterion is */
+/*              satisfied before the end of the column block, we can */
+/*              return from the routine. Perform the following before */
+/*              returning: */
+/*                a) Set the number of factorized columns K, */
+/*                   K = IOFFSET + JBF from the last call of blocked */
+/*                   routine. */
+/*                NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned */
+/*                         by the block factorization routine; */
+/*                      2) The remaining TAUs are set to ZERO by the */
+/*                         block factorization routine. */
+
+		*k = ioffset + jbf;
+
+/*              Set INFO on the first occurrence of NaN, NaN takes */
+/*              precedence over Inf. */
+
+		if (iinfo <= n_sub__ && iinfo > 0) {
+		    *info = ioffset + iinfo;
+		}
+
+/*              Return from the routine. */
+
+		work[1] = (real) lwkopt;
+
+		return 0;
+
+	    }
+
+	    j += jbf;
+
+	}
+
+    }
+
+/*     Use unblocked code to factor the last or only block. */
+/*     J = JMAX+1 means we factorized the maximum possible number of */
+/*     columns, that is in ELSE clause we need to compute */
+/*     the MAXC2NORM and RELMAXC2NORM to return after we processed */
+/*     the blocks. */
+
+    if (j <= jmax) {
+
+/*        N_SUB is the number of columns in the submatrix; */
+/*        IOFFSET is the number of rows that should not be factorized. */
+
+	n_sub__ = *n - j + 1;
+	ioffset = j - 1;
+
+	i__1 = jmax - j + 1;
+	slaqp2rk_(m, &n_sub__, nrhs, &ioffset, &i__1, abstol, reltol, &kp1, &
+		maxc2nrm, &a[j * a_dim1 + 1], lda, &kf, maxc2nrmk, 
+		relmaxc2nrmk, &jpiv[j], &tau[j], &work[j], &work[*n + j], &
+		work[(*n << 1) + 1], &iinfo);
+
+/*        ABSTOL or RELTOL criterion is satisfied when the number of */
+/*        the factorized columns KF is smaller than the number */
+/*        of columns JMAX-J+1 supplied to be factorized by the */
+/*        unblocked routine, we can return from */
+/*        the routine. Perform the following before returning: */
+/*           a) Set the number of factorized columns K, */
+/*           b) MAXC2NRMK and RELMAXC2NRMK are returned by the */
+/*              unblocked factorization routine above. */
+
+	*k = j - 1 + kf;
+
+/*        Set INFO on the first exception occurrence. */
+
+/*        Set INFO on the first exception occurrence of Inf or NaN, */
+/*        (NaN takes precedence over Inf). */
+
+	if (iinfo > n_sub__ && *info == 0) {
+	    *info = (ioffset << 1) + iinfo;
+	} else if (iinfo <= n_sub__ && iinfo > 0) {
+	    *info = ioffset + iinfo;
+	}
+
+    } else {
+
+/*        Compute the return values for blocked code. */
+
+/*        Set the number of factorized columns if the unblocked routine */
+/*        was not called. */
+
+	*k = jmax;
+
+/*        If there exists a residual matrix after the blocked code: */
+/*           1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the */
+/*              residual matrix, otherwise set them to ZERO; */
+/*           2) Set TAU(K+1:MINMN) to ZERO. */
+
+	if (*k < minmn) {
+	    i__1 = *n - *k;
+	    jmaxc2nrm = *k + isamax_(&i__1, &work[*k + 1], &c__1);
+	    *maxc2nrmk = work[jmaxc2nrm];
+	    if (*k == 0) {
+		*relmaxc2nrmk = 1.f;
+	    } else {
+		*relmaxc2nrmk = *maxc2nrmk / maxc2nrm;
+	    }
+
+	    i__1 = minmn;
+	    for (j = *k + 1; j <= i__1; ++j) {
+		tau[j] = 0.f;
+	    }
+
+	}
+
+/*     END IF( J.LE.JMAX ) THEN */
+
+    }
+
+    work[1] = (real) lwkopt;
+
+    return 0;
+
+/*     End of SGEQP3RK */
+
+} /* sgeqp3rk_ */
+
diff --git a/lapack-netlib/SRC/sgeqp3rk.f b/lapack-netlib/SRC/sgeqp3rk.f
new file mode 100644
index 000000000..d3a335b88
--- /dev/null
+++ b/lapack-netlib/SRC/sgeqp3rk.f
@@ -0,0 +1,1083 @@
+*> \brief \b SGEQP3RK computes a truncated Householder QR factorization with column pivoting of a real m-by-n matrix A by using Level 3 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SGEQP3RK + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/sgeqp3rk.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/sgeqp3rk.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/sgeqp3rk.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA,
+*      $                     K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+*      $                     WORK, LWORK, IWORK, INFO )
+*       IMPLICIT NONE
+*
+*      .. Scalar Arguments ..
+*       INTEGER            INFO, K, KMAX, LDA, LWORK, M, N, NRHS
+*       REAL               ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL
+*      ..
+*      .. Array Arguments ..
+*       INTEGER            IWORK( * ), JPIV( * )
+*       REAL               A( LDA, * ), TAU( * ), WORK( * )
+*      ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SGEQP3RK performs two tasks simultaneously:
+*>
+*> Task 1: The routine computes a truncated (rank K) or full rank
+*> Householder QR factorization with column pivoting of a real
+*> M-by-N matrix A using Level 3 BLAS. K is the number of columns
+*> that were factorized, i.e. factorization rank of the
+*> factor R, K <= min(M,N).
+*>
+*>  A * P(K) = Q(K) * R(K)  =
+*>
+*>        = Q(K) * ( R11(K) R12(K) ) = Q(K) * (   R(K)_approx    )
+*>                 ( 0      R22(K) )          ( 0  R(K)_residual ),
+*>
+*> where:
+*>
+*>  P(K)            is an N-by-N permutation matrix;
+*>  Q(K)            is an M-by-M orthogonal matrix;
+*>  R(K)_approx   = ( R11(K), R12(K) ) is a rank K approximation of the
+*>                    full rank factor R with K-by-K upper-triangular
+*>                    R11(K) and K-by-(N-K) rectangular R12(K). The diagonal
+*>                    entries of R11(K) appear in non-increasing order
+*>                    of absolute value, and absolute values of all of
+*>                    them exceed the maximum column 2-norm of R22(K)
+*>                    up to roundoff error.
+*>  R(K)_residual = R22(K) is the residual of a rank K approximation
+*>                    of the full rank factor R. It is
+*>                    an (M-K)-by-(N-K) rectangular matrix;
+*>  0               is an (M-K)-by-K zero matrix.
+*>
+*> Task 2: At the same time, the routine overwrites a real M-by-NRHS
+*> matrix B with  Q(K)**T * B  using Level 3 BLAS.
+*>
+*> =====================================================================
+*>
+*> The matrices A and B are stored on input in the array A as
+*> the left and right blocks A(1:M,1:N) and A(1:M, N+1:N+NRHS)
+*> respectively.
+*>
+*>                                  N     NRHS
+*>             array_A   =   M  [ mat_A, mat_B ]
+*>
+*> The truncation criteria (i.e. when to stop the factorization)
+*> can be any of the following:
+*>
+*>   1) The input parameter KMAX, the maximum number of columns
+*>      KMAX to factorize, i.e. the factorization rank is limited
+*>      to KMAX. If KMAX >= min(M,N), the criterion is not used.
+*>
+*>   2) The input parameter ABSTOL, the absolute tolerance for
+*>      the maximum column 2-norm of the residual matrix R22(K). This
+*>      means that the factorization stops if this norm is less or
+*>      equal to ABSTOL. If ABSTOL < 0.0, the criterion is not used.
+*>
+*>   3) The input parameter RELTOL, the tolerance for the maximum
+*>      column 2-norm of the residual matrix R22(K) divided
+*>      by the maximum column 2-norm of the original matrix A, which
+*>      is equal to abs(R(1,1)). This means that the factorization stops
+*>      when the ratio of the maximum column 2-norm of R22(K) to
+*>      the maximum column 2-norm of A is less than or equal to RELTOL.
+*>      If RELTOL < 0.0, the criterion is not used.
+*>
+*>   4) In case both stopping criteria ABSTOL and RELTOL are not used,
+*>      and when the residual matrix R22(K) is a zero matrix in some
+*>      factorization step K. ( This stopping criterion is implicit. )
+*>
+*>  The algorithm stops when any of these conditions is first
+*>  satisfied, otherwise the whole matrix A is factorized.
+*>
+*>  To factorize the whole matrix A, use the values
+*>  KMAX >= min(M,N), ABSTOL < 0.0 and RELTOL < 0.0.
+*>
+*>  The routine returns:
+*>     a) Q(K), R(K)_approx = ( R11(K), R12(K) ),
+*>        R(K)_residual = R22(K), P(K), i.e. the resulting matrices
+*>        of the factorization; P(K) is represented by JPIV,
+*>        ( if K = min(M,N), R(K)_approx is the full factor R,
+*>        and there is no residual matrix R(K)_residual);
+*>     b) K, the number of columns that were factorized,
+*>        i.e. factorization rank;
+*>     c) MAXC2NRMK, the maximum column 2-norm of the residual
+*>        matrix R(K)_residual = R22(K),
+*>        ( if K = min(M,N), MAXC2NRMK = 0.0 );
+*>     d) RELMAXC2NRMK equals MAXC2NRMK divided by MAXC2NRM, the maximum
+*>        column 2-norm of the original matrix A, which is equal
+*>        to abs(R(1,1)), ( if K = min(M,N), RELMAXC2NRMK = 0.0 );
+*>     e) Q(K)**T * B, the matrix B with the orthogonal
+*>        transformation Q(K)**T applied on the left.
+*>
+*> The N-by-N permutation matrix P(K) is stored in a compact form in
+*> the integer array JPIV. For 1 <= j <= N, column j
+*> of the matrix A was interchanged with column JPIV(j).
+*>
+*> The M-by-M orthogonal matrix Q is represented as a product
+*> of elementary Householder reflectors
+*>
+*>     Q(K) = H(1) *  H(2) * . . . * H(K),
+*>
+*> where K is the number of columns that were factorized.
+*>
+*> Each H(j) has the form
+*>
+*>     H(j) = I - tau * v * v**T,
+*>
+*> where 1 <= j <= K and
+*>   I    is an M-by-M identity matrix,
+*>   tau  is a real scalar,
+*>   v    is a real vector with v(1:j-1) = 0 and v(j) = 1.
+*>
+*> v(j+1:M) is stored on exit in A(j+1:M,j) and tau in TAU(j).
+*>
+*> See the Further Details section for more information.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e. the number of
+*>          columns of the matrix B. NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in] KMAX
+*> \verbatim
+*>          KMAX is INTEGER
+*>
+*>          The first factorization stopping criterion. KMAX >= 0.
+*>
+*>          The maximum number of columns of the matrix A to factorize,
+*>          i.e. the maximum factorization rank.
+*>
+*>          a) If KMAX >= min(M,N), then this stopping criterion
+*>                is not used, the routine factorizes columns
+*>                depending on ABSTOL and RELTOL.
+*>
+*>          b) If KMAX = 0, then this stopping criterion is
+*>                satisfied on input and the routine exits immediately.
+*>                This means that the factorization is not performed,
+*>                the matrices A and B are not modified, and
+*>                the matrix A is itself the residual.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is REAL
+*>
+*>          The second factorization stopping criterion, cannot be NaN.
+*>
+*>          The absolute tolerance (stopping threshold) for
+*>          maximum column 2-norm of the residual matrix R22(K).
+*>          The algorithm converges (stops the factorization) when
+*>          the maximum column 2-norm of the residual matrix R22(K)
+*>          is less than or equal to ABSTOL. Let SAFMIN = SLAMCH('S').
+*>
+*>          a) If ABSTOL is NaN, then no computation is performed
+*>                and an error message ( INFO = -5 ) is issued
+*>                by XERBLA.
+*>
+*>          b) If ABSTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and RELTOL.
+*>                This includes the case ABSTOL = -Inf.
+*>
+*>          c) If 0.0 <= ABSTOL < 2*SAFMIN, then ABSTOL = 2*SAFMIN
+*>                is used. This includes the case ABSTOL = -0.0.
+*>
+*>          d) If 2*SAFMIN <= ABSTOL then the input value
+*>                of ABSTOL is used.
+*>
+*>          Let MAXC2NRM be the maximum column 2-norm of the
+*>          whole original matrix A.
+*>          If ABSTOL chosen above is >= MAXC2NRM, then this
+*>          stopping criterion is satisfied on input and routine exits
+*>          immediately after MAXC2NRM is computed. The routine
+*>          returns MAXC2NRM in MAXC2NRMK,
+*>          and 1.0 in RELMAXC2NRMK.
+*>          This includes the case ABSTOL = +Inf. This means that the
+*>          factorization is not performed, the matrices A and B are not
+*>          modified, and the matrix A is itself the residual.
+*> \endverbatim
+*>
+*> \param[in] RELTOL
+*> \verbatim
+*>          RELTOL is REAL
+*>
+*>          The third factorization stopping criterion, cannot be NaN.
+*>
+*>          The tolerance (stopping threshold) for the ratio
+*>          abs(R(K+1,K+1))/abs(R(1,1)) of the maximum column 2-norm of
+*>          the residual matrix R22(K) to the maximum column 2-norm of
+*>          the original matrix A. The algorithm converges (stops the
+*>          factorization), when abs(R(K+1,K+1))/abs(R(1,1)) is less
+*>          than or equal to RELTOL. Let EPS = SLAMCH('E').
+*>
+*>          a) If RELTOL is NaN, then no computation is performed
+*>                and an error message ( INFO = -6 ) is issued
+*>                by XERBLA.
+*>
+*>          b) If RELTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and ABSTOL.
+*>                This includes the case RELTOL = -Inf.
+*>
+*>          c) If 0.0 <= RELTOL < EPS, then RELTOL = EPS is used.
+*>                This includes the case RELTOL = -0.0.
+*>
+*>          d) If EPS <= RELTOL then the input value of RELTOL
+*>                is used.
+*>
+*>          Let MAXC2NRM be the maximum column 2-norm of the
+*>          whole original matrix A.
+*>          If RELTOL chosen above is >= 1.0, then this stopping
+*>          criterion is satisfied on input and routine exits
+*>          immediately after MAXC2NRM is computed.
+*>          The routine returns MAXC2NRM in MAXC2NRMK,
+*>          and 1.0 in RELMAXC2NRMK.
+*>          This includes the case RELTOL = +Inf. This means that the
+*>          factorization is not performed, the matrices A and B are not
+*>          modified, and the matrix A is itself the residual.
+*>
+*>          NOTE: We recommend that RELTOL satisfy
+*>                min( max(M,N)*EPS, sqrt(EPS) ) <= RELTOL
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is REAL array, dimension (LDA,N+NRHS)
+*>
+*>          On entry:
+*>
+*>          a) The subarray A(1:M,1:N) contains the M-by-N matrix A.
+*>          b) The subarray A(1:M,N+1:N+NRHS) contains the M-by-NRHS
+*>             matrix B.
+*>
+*>                                  N     NRHS
+*>              array_A   =   M  [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>
+*>          a) The subarray A(1:M,1:N) contains parts of the factors
+*>             of the matrix A:
+*>
+*>            1) If K = 0, A(1:M,1:N) contains the original matrix A.
+*>            2) If K > 0, A(1:M,1:N) contains parts of the
+*>            factors:
+*>
+*>              1. The elements below the diagonal of the subarray
+*>                 A(1:M,1:K) together with TAU(1:K) represent the
+*>                 orthogonal matrix Q(K) as a product of K Householder
+*>                 elementary reflectors.
+*>
+*>              2. The elements on and above the diagonal of
+*>                 the subarray A(1:K,1:N) contain K-by-N
+*>                 upper-trapezoidal matrix
+*>                 R(K)_approx = ( R11(K), R12(K) ).
+*>                 NOTE: If K=min(M,N), i.e. full rank factorization,
+*>                       then R_approx(K) is the full factor R which
+*>                       is upper-trapezoidal. If, in addition, M>=N,
+*>                       then R is upper-triangular.
+*>
+*>              3. The subarray A(K+1:M,K+1:N) contains (M-K)-by-(N-K)
+*>                 rectangular matrix R(K)_residual = R22(K).
+*>
+*>          b) If NRHS > 0, the subarray A(1:M,N+1:N+NRHS) contains
+*>             the M-by-NRHS product Q(K)**T * B.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*>          This is the leading dimension for both matrices, A and B.
+*> \endverbatim
+*>
+*> \param[out] K
+*> \verbatim
+*>          K is INTEGER
+*>          Factorization rank of the matrix A, i.e. the rank of
+*>          the factor R, which is the same as the number of non-zero
+*>          rows of the factor R. 0 <= K <= min(M,KMAX,N).
+*>
+*>          K also represents the number of non-zero Householder
+*>          vectors.
+*>
+*>          NOTE: If K = 0, a) the arrays A and B are not modified;
+*>                          b) the array TAU(1:min(M,N)) is set to ZERO,
+*>                             if the matrix A does not contain NaN,
+*>                             otherwise the elements TAU(1:min(M,N))
+*>                             are undefined;
+*>                          c) the elements of the array JPIV are set
+*>                             as follows: for j = 1:N, JPIV(j) = j.
+*> \endverbatim
+*>
+*> \param[out] MAXC2NRMK
+*> \verbatim
+*>          MAXC2NRMK is REAL
+*>          The maximum column 2-norm of the residual matrix R22(K),
+*>          when the factorization stopped at rank K. MAXC2NRMK >= 0.
+*>
+*>          a) If K = 0, i.e. the factorization was not performed,
+*>             the matrix A was not modified and is itself a residual
+*>             matrix, then MAXC2NRMK equals the maximum column 2-norm
+*>             of the original matrix A.
+*>
+*>          b) If 0 < K < min(M,N), then MAXC2NRMK is returned.
+*>
+*>          c) If K = min(M,N), i.e. the whole matrix A was
+*>             factorized and there is no residual matrix,
+*>             then MAXC2NRMK = 0.0.
+*>
+*>          NOTE: MAXC2NRMK in the factorization step K would equal
+*>                R(K+1,K+1) in the next factorization step K+1.
+*> \endverbatim
+*>
+*> \param[out] RELMAXC2NRMK
+*> \verbatim
+*>          RELMAXC2NRMK is REAL
+*>          The ratio MAXC2NRMK / MAXC2NRM of the maximum column
+*>          2-norm of the residual matrix R22(K) (when the factorization
+*>          stopped at rank K) to the maximum column 2-norm of the
+*>          whole original matrix A. RELMAXC2NRMK >= 0.
+*>
+*>          a) If K = 0, i.e. the factorization was not performed,
+*>             the matrix A was not modified and is itself a residual
+*>             matrix, then RELMAXC2NRMK = 1.0.
+*>
+*>          b) If 0 < K < min(M,N), then
+*>                RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM is returned.
+*>
+*>          c) If K = min(M,N), i.e. the whole matrix A was
+*>             factorized and there is no residual matrix,
+*>             then RELMAXC2NRMK = 0.0.
+*>
+*>         NOTE: RELMAXC2NRMK in the factorization step K would equal
+*>               abs(R(K+1,K+1))/abs(R(1,1)) in the next factorization
+*>               step K+1.
+*> \endverbatim
+*>
+*> \param[out] JPIV
+*> \verbatim
+*>          JPIV is INTEGER array, dimension (N)
+*>          Column pivot indices. For 1 <= j <= N, column j
+*>          of the matrix A was interchanged with column JPIV(j).
+*>
+*>          The elements of the array JPIV(1:N) are always set
+*>          by the routine, for example, even  when no columns
+*>          were factorized, i.e. when K = 0, the elements are
+*>          set as JPIV(j) = j for j = 1:N.
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is REAL array, dimension (min(M,N))
+*>          The scalar factors of the elementary reflectors.
+*>
+*>          If 0 < K <= min(M,N), only the elements TAU(1:K) of
+*>          the array TAU are modified by the factorization.
+*>          After the factorization is computed, if no NaN was found
+*>          during the factorization, the remaining elements
+*>          TAU(K+1:min(M,N)) are set to zero, otherwise the
+*>          elements TAU(K+1:min(M,N)) are not set and therefore
+*>          undefined.
+*>          ( If K = 0, all elements of TAU are set to zero, if
+*>          the matrix A does not contain NaN. )
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is REAL array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and
+*>          LWORK >= (3*N+NRHS-1), otherwise.
+*>          For optimal performance LWORK >= (2*N + NB*( N+NRHS+1 )),
+*>          where NB is the optimal block size for SGEQP3RK returned
+*>          by ILAENV. Minimal block size MINNB=2.
+*>
+*>          NOTE: The decision, whether to use unblocked BLAS 2
+*>          or blocked BLAS 3 code is based not only on the dimension
+*>          LWORK of the available workspace WORK, but also on the
+*>          matrix A dimension N via crossover point NX returned
+*>          by ILAENV. (For N less than NX, unblocked code should be
+*>          used.)
+*>
+*>          If LWORK = -1, then a workspace query is assumed;
+*>          the routine only calculates the optimal size of the WORK
+*>          array, returns this value as the first entry of the WORK
+*>          array, and no error message related to LWORK is issued
+*>          by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (N-1).
+*>          Is a work array. ( IWORK is used to store indices
+*>          of "bad" columns for norm downdating in the residual
+*>          matrix in the blocked step auxiliary subroutine SLAQP3RK ).
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          1) INFO = 0: successful exit.
+*>          2) INFO < 0: if INFO = -i, the i-th argument had an
+*>                       illegal value.
+*>          3) If INFO = j_1, where 1 <= j_1 <= N, then NaN was
+*>             detected and the routine stops the computation.
+*>             The j_1-th column of the matrix A or the j_1-th
+*>             element of array TAU contains the first occurrence
+*>             of NaN in the factorization step K+1 ( when K columns
+*>             have been factorized ).
+*>
+*>             On exit:
+*>             K                  is set to the number of
+*>                                   factorized columns without
+*>                                   exception.
+*>             MAXC2NRMK          is set to NaN.
+*>             RELMAXC2NRMK       is set to NaN.
+*>             TAU(K+1:min(M,N))  is not set and contains undefined
+*>                                   elements. If j_1=K+1, TAU(K+1)
+*>                                   may contain NaN.
+*>          4) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN
+*>             was detected, but +Inf (or -Inf) was detected and
+*>             the routine continues the computation until completion.
+*>             The (j_2-N)-th column of the matrix A contains the first
+*>             occurrence of +Inf (or -Inf) in the factorization
+*>             step K+1 ( when K columns have been factorized ).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup geqp3rk
+*
+*> \par Further Details:
+*  =====================
+*
+*> \verbatim
+*> SGEQP3RK is based on the same BLAS3 Householder QR factorization
+*> algorithm with column pivoting as in SGEQP3 routine which uses
+*> SLARFG routine to generate Householder reflectors
+*> for QR factorization.
+*>
+*> We can also write:
+*>
+*>   A = A_approx(K) + A_residual(K)
+*>
+*> The low rank approximation matrix A(K)_approx from
+*> the truncated QR factorization of rank K of the matrix A is:
+*>
+*>   A(K)_approx = Q(K) * ( R(K)_approx ) * P(K)**T
+*>                        (     0     0 )
+*>
+*>               = Q(K) * ( R11(K) R12(K) ) * P(K)**T
+*>                        (      0      0 )
+*>
+*> The residual A_residual(K) of the matrix A is:
+*>
+*>   A_residual(K) = Q(K) * ( 0              0 ) * P(K)**T =
+*>                          ( 0  R(K)_residual )
+*>
+*>                 = Q(K) * ( 0        0 ) * P(K)**T
+*>                          ( 0   R22(K) )
+*>
+*> The truncated (rank K) factorization guarantees that
+*> the maximum column 2-norm of A_residual(K) is less than
+*> or equal to MAXC2NRMK up to roundoff error.
+*>
+*> NOTE: An approximation of the null vectors
+*>       of A can be easily computed from R11(K)
+*>       and R12(K):
+*>
+*>       Null( A(K) )_approx = P * ( inv(R11(K)) * R12(K) )
+*>                                 (         -I           )
+*>
+*> \endverbatim
+*
+*> \par References:
+*  ================
+*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996.
+*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain.
+*> X. Sun, Computer Science Dept., Duke University, USA.
+*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA.
+*> A BLAS-3 version of the QR factorization with column pivoting.
+*> LAPACK Working Note 114
+*> \htmlonly
+*> <a href="https://www.netlib.org/lapack/lawnspdf/lawn114.pdf">https://www.netlib.org/lapack/lawnspdf/lawn114.pdf</a>
+*> \endhtmlonly
+*> and in
+*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998.
+*> \htmlonly
+*> <a href="https://doi.org/10.1137/S1064827595296732">https://doi.org/10.1137/S1064827595296732</a>
+*> \endhtmlonly
+*>
+*> [2] A partial column norm updating strategy developed in 2006.
+*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia.
+*> On the failure of rank revealing QR factorization software – a case study.
+*> LAPACK Working Note 176.
+*> \htmlonly
+*> <a href="http://www.netlib.org/lapack/lawnspdf/lawn176.pdf">http://www.netlib.org/lapack/lawnspdf/lawn176.pdf</a>
+*> \endhtmlonly
+*> and in
+*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages.
+*> \htmlonly
+*> <a href="https://doi.org/10.1145/1377612.1377616">https://doi.org/10.1145/1377612.1377616</a>
+*> \endhtmlonly
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*>  November  2023, Igor Kozachenko, James Demmel,
+*>                  EECS Department,
+*>                  University of California, Berkeley, USA.
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE SGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA,
+     $                     K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+     $                     WORK, LWORK, IWORK, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO, K, KMAX, LDA, LWORK, M, N, NRHS
+      REAL               ABSTOL,  MAXC2NRMK, RELMAXC2NRMK, RELTOL
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IWORK( * ), JPIV( * )
+      REAL               A( LDA, * ), TAU( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            INB, INBMIN, IXOVER
+      PARAMETER          ( INB = 1, INBMIN = 2, IXOVER = 3 )
+      REAL               ZERO, ONE, TWO
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0, TWO = 2.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LQUERY, DONE
+      INTEGER            IINFO, IOFFSET, IWS, J, JB, JBF, JMAXB, JMAX,
+     $                   JMAXC2NRM, KF, KP1, LWKOPT, MINMN, N_SUB, NB,
+     $                   NBMIN, NX
+      REAL               EPS, HUGEVAL, MAXC2NRM, SAFMIN
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SLAQP2RK, SLAQP3RK, XERBLA
+*     ..
+*     .. External Functions ..
+      LOGICAL            SISNAN
+      INTEGER            ISAMAX, ILAENV
+      REAL               SLAMCH, SNRM2, SROUNDUP_LWORK
+      EXTERNAL           SISNAN, SLAMCH, SNRM2, ISAMAX, ILAENV,
+     $                   SROUNDUP_LWORK
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          REAL, MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test input arguments
+*     ====================
+*
+      INFO = 0
+      LQUERY = ( LWORK.EQ.-1 )
+      IF( M.LT.0 ) THEN
+         INFO = -1
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -2
+      ELSE IF( NRHS.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( KMAX.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( SISNAN( ABSTOL ) ) THEN
+         INFO = -5
+      ELSE IF( SISNAN( RELTOL ) ) THEN
+         INFO = -6
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -8
+      END IF
+*
+*     If the input parameters M, N, NRHS, KMAX, LDA are valid:
+*       a) Test the input workspace size LWORK for the minimum
+*          size requirement IWS.
+*       b) Determine the optimal block size NB and optimal
+*          workspace size LWKOPT to be returned in WORK(1)
+*          in case of (1) LWORK < IWS, (2) LQUERY = .TRUE.,
+*          (3) when routine exits.
+*     Here, IWS is the minimum workspace required for unblocked
+*     code.
+*
+      IF( INFO.EQ.0 ) THEN
+         MINMN = MIN( M, N )
+         IF( MINMN.EQ.0 ) THEN
+            IWS = 1
+            LWKOPT = 1
+         ELSE
+*
+*           Minimal workspace size in case of using only unblocked
+*           BLAS 2 code in SLAQP2RK.
+*           1) SGEQP3RK and SLAQP2RK: 2*N to store full and partial
+*              column 2-norms.
+*           2) SLAQP2RK: N+NRHS-1 to use in WORK array that is used
+*              in SLARF subroutine inside SLAQP2RK to apply an
+*              elementary reflector from the left.
+*           TOTAL_WORK_SIZE = 3*N + NRHS - 1
+*
+            IWS = 3*N + NRHS - 1
+*
+*           Assign to NB optimal block size.
+*
+            NB = ILAENV( INB, 'SGEQP3RK', ' ', M, N, -1, -1 )
+*
+*           A formula for the optimal workspace size in case of using
+*           both unblocked BLAS 2 in SLAQP2RK and blocked BLAS 3 code
+*           in SLAQP3RK.
+*           1) SGEQP3RK, SLAQP2RK, SLAQP3RK: 2*N to store full and
+*              partial column 2-norms.
+*           2) SLAQP2RK: N+NRHS-1 to use in WORK array that is used
+*              in SLARF subroutine to apply an elementary reflector
+*              from the left.
+*           3) SLAQP3RK: NB*(N+NRHS) to use in the work array F that
+*              is used to apply a block reflector from
+*              the left.
+*           4) SLAQP3RK: NB to use in the auxiliary array AUX.
+*           Sizes (2) and ((3) + (4)) should intersect, therefore
+*           TOTAL_WORK_SIZE = 2*N + NB*( N+NRHS+1 ), given NBMIN=2.
+*
+            LWKOPT = 2*N + NB*( N+NRHS+1 )
+         END IF
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+*
+         IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN
+            INFO = -15
+         END IF
+      END IF
+*
+*      NOTE: The optimal workspace size is returned in WORK(1), if
+*            the input parameters M, N, NRHS, KMAX, LDA are valid.
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'SGEQP3RK', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible for M=0 or N=0.
+*
+      IF( MINMN.EQ.0 ) THEN
+         K = 0
+         MAXC2NRMK = ZERO
+         RELMAXC2NRMK = ZERO
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*
+*     Initialize column pivot array JPIV.
+*
+      DO J = 1, N
+         JPIV( J ) = J
+      END DO
+*
+*     ==================================================================
+*
+*     Initialize storage for partial and exact column 2-norms.
+*     a) The elements WORK(1:N) are used to store partial column
+*        2-norms of the matrix A, and may decrease in each computation
+*        step; initialize to the values of complete columns 2-norms.
+*     b) The elements WORK(N+1:2*N) are used to store complete column
+*        2-norms of the matrix A, they are not changed during the
+*        computation; initialize the values of complete columns 2-norms.
+*
+      DO J = 1, N
+         WORK( J ) = SNRM2( M, A( 1, J ), 1 )
+         WORK( N+J ) = WORK( J )
+      END DO
+*
+*     ==================================================================
+*
+*     Compute the pivot column index and the maximum column 2-norm
+*     for the whole original matrix stored in A(1:M,1:N).
+*
+      KP1 = ISAMAX( N, WORK( 1 ), 1 )
+      MAXC2NRM = WORK( KP1 )
+*
+*     ==================================================================.
+*
+      IF( SISNAN( MAXC2NRM ) ) THEN
+*
+*        Check if the matrix A contains NaN, set INFO parameter
+*        to the column number where the first NaN is found and return
+*        from the routine.
+*
+         K = 0
+         INFO = KP1
+*
+*        Set MAXC2NRMK and  RELMAXC2NRMK to NaN.
+*
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = MAXC2NRM
+*
+*        Array TAU is not set and contains undefined elements.
+*
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+         RETURN
+      END IF
+*
+*     ===================================================================
+*
+      IF( MAXC2NRM.EQ.ZERO ) THEN
+*
+*        Check if the matrix A is a zero matrix, set array TAU and
+*        return from the routine.
+*
+         K = 0
+         MAXC2NRMK = ZERO
+         RELMAXC2NRMK = ZERO
+*
+         DO J = 1, MINMN
+            TAU( J ) = ZERO
+         END DO
+*
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+         RETURN
+*
+      END IF
+*
+*     ===================================================================
+*
+      HUGEVAL = SLAMCH( 'Overflow' )
+*
+      IF( MAXC2NRM.GT.HUGEVAL ) THEN
+*
+*        Check if the matrix A contains +Inf or -Inf, set INFO parameter
+*        to the column number, where the first +/-Inf  is found plus N,
+*        and continue the computation.
+*
+         INFO = N + KP1
+*
+      END IF
+*
+*     ==================================================================
+*
+*     Quick return if possible for the case when the first
+*     stopping criterion is satisfied, i.e. KMAX = 0.
+*
+      IF( KMAX.EQ.0 ) THEN
+         K = 0
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = ONE
+         DO J = 1, MINMN
+            TAU( J ) = ZERO
+         END DO
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*
+      EPS = SLAMCH('Epsilon')
+*
+*     Adjust ABSTOL (NOTE: the dummy argument ABSTOL is overwritten).
+*
+      IF( ABSTOL.GE.ZERO ) THEN
+         SAFMIN = SLAMCH('Safe minimum')
+         ABSTOL = MAX( ABSTOL, TWO*SAFMIN )
+      END IF
+*
+*     Adjust RELTOL (NOTE: the dummy argument RELTOL is overwritten).
+*
+      IF( RELTOL.GE.ZERO ) THEN
+         RELTOL = MAX( RELTOL, EPS )
+      END IF
+*
+*     ===================================================================
+*
+*     JMAX is the maximum index of the column to be factorized,
+*     which is also limited by the first stopping criterion KMAX.
+*
+      JMAX = MIN( KMAX, MINMN )
+*
+*     ===================================================================
+*
+*     Quick return if possible for the case when the second or third
+*     stopping criterion for the whole original matrix is satisfied,
+*     i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL
+*     (which is ONE <= RELTOL).
+*
+      IF( MAXC2NRM.LE.ABSTOL .OR. ONE.LE.RELTOL ) THEN
+*
+         K = 0
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = ONE
+*
+         DO J = 1, MINMN
+            TAU( J ) = ZERO
+         END DO
+*
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*     Factorize columns
+*     ==================================================================
+*
+*     Determine the block size.
+*
+      NBMIN = 2
+      NX = 0
+*
+      IF( ( NB.GT.1 ) .AND. ( NB.LT.MINMN ) ) THEN
+*
+*        Determine when to cross over from blocked to unblocked code.
+*        (for N less than NX, unblocked code should be used).
+*
+         NX = MAX( 0, ILAENV( IXOVER, 'SGEQP3RK', ' ', M, N, -1, -1 ))
+*
+         IF( NX.LT.MINMN ) THEN
+*
+*           Determine if workspace is large enough for blocked code.
+*
+            IF( LWORK.LT.LWKOPT ) THEN
+*
+*              Not enough workspace for the optimal block size in NB.
+*              Reduce NB so that 2*N + NB*( N+NRHS+1 ) fits into LWORK,
+*              and determine the minimum value of NB.
+*
+               NB = ( LWORK-2*N ) / ( N+NRHS+1 )
+               NBMIN = MAX( 2, ILAENV( INBMIN, 'SGEQP3RK', ' ', M, N,
+     $                 -1, -1 ) )
+*
+            END IF
+         END IF
+      END IF
+*
+*     ==================================================================
+*
+*     DONE is the boolean flag to represent the case when the
+*     factorization completed in the block factorization routine,
+*     before the end of the block.
+*
+      DONE = .FALSE.
+*
+*     J is the column index.
+*
+      J = 1
+*
+*     (1) Use blocked code initially.
+*
+*     JMAXB is the maximum column index of the block, when the
+*     blocked code is used, is also limited by the first stopping
+*     criterion KMAX.
+*
+      JMAXB = MIN( KMAX, MINMN - NX )
+*
+      IF( NB.GE.NBMIN .AND. NB.LT.JMAX .AND. JMAXB.GT.0 ) THEN
+*
+*        Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here:
+*        J   is the column index of a column block;
+*        JB  is the column block size to pass to block factorization
+*            routine in a loop step;
+*        JBF is the number of columns that were actually factorized
+*            that was returned by the block factorization routine
+*            in a loop step, JBF <= JB;
+*        N_SUB is the number of columns in the submatrix;
+*        IOFFSET is the number of rows that should not be factorized.
+*
+         DO WHILE( J.LE.JMAXB )
+*
+            JB = MIN( NB, JMAXB-J+1 )
+            N_SUB = N-J+1
+            IOFFSET = J-1
+*
+*           Factorize JB columns among the columns A(J:N).
+*
+            CALL SLAQP3RK( M, N_SUB, NRHS, IOFFSET, JB, ABSTOL,
+     $                     RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA,
+     $                     DONE, JBF, MAXC2NRMK, RELMAXC2NRMK,
+     $                     JPIV( J ), TAU( J ),
+     $                     WORK( J ), WORK( N+J ),
+     $                     WORK( 2*N+1 ), WORK( 2*N+JB+1 ),
+     $                     N+NRHS-J+1, IWORK, IINFO )
+*
+*           Set INFO on the first occurrence of Inf.
+*
+            IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN
+               INFO = 2*IOFFSET + IINFO
+            END IF
+*
+            IF( DONE ) THEN
+*
+*              Either the submatrix is zero before the end of the
+*              column block, or ABSTOL or RELTOL criterion is
+*              satisfied before the end of the column block, we can
+*              return from the routine. Perform the following before
+*              returning:
+*                a) Set the number of factorized columns K,
+*                   K = IOFFSET + JBF from the last call of blocked
+*                   routine.
+*                NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned
+*                         by the block factorization routine;
+*                      2) The remaining TAUs are set to ZERO by the
+*                         block factorization routine.
+*
+               K = IOFFSET + JBF
+*
+*              Set INFO on the first occurrence of NaN, NaN takes
+*              precedence over Inf.
+*
+               IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN
+                  INFO = IOFFSET + IINFO
+               END IF
+*
+*              Return from the routine.
+*
+               WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+*
+               RETURN
+*
+            END IF
+*
+            J = J + JBF
+*
+         END DO
+*
+      END IF
+*
+*     Use unblocked code to factor the last or only block.
+*     J = JMAX+1 means we factorized the maximum possible number of
+*     columns, that is in ELSE clause we need to compute
+*     the MAXC2NRMK and RELMAXC2NRMK to return after we processed
+*     the blocks.
+*
+      IF( J.LE.JMAX ) THEN
+*
+*        N_SUB is the number of columns in the submatrix;
+*        IOFFSET is the number of rows that should not be factorized.
+*
+         N_SUB = N-J+1
+         IOFFSET = J-1
+*
+         CALL SLAQP2RK( M, N_SUB, NRHS, IOFFSET, JMAX-J+1,
+     $                  ABSTOL, RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA,
+     $                  KF, MAXC2NRMK, RELMAXC2NRMK, JPIV( J ),
+     $                  TAU( J ), WORK( J ), WORK( N+J ),
+     $                  WORK( 2*N+1 ), IINFO )
+*
+*        ABSTOL or RELTOL criterion is satisfied when the number of
+*        the factorized columns KF is smaller than the number
+*        of columns JMAX-J+1 supplied to be factorized by the
+*        unblocked routine, we can return from
+*        the routine. Perform the following before returning:
+*           a) Set the number of factorized columns K,
+*           b) MAXC2NRMK and RELMAXC2NRMK are returned by the
+*              unblocked factorization routine above.
+*
+         K = J - 1 + KF
+*
+*        Set INFO on the first exception occurrence.
+*
+*        Set INFO on the first exception occurrence of Inf or NaN,
+*        (NaN takes precedence over Inf).
+*
+         IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN
+            INFO = 2*IOFFSET + IINFO
+         ELSE IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN
+            INFO = IOFFSET + IINFO
+         END IF
+*
+      ELSE
+*
+*        Compute the return values for blocked code.
+*
+*        Set the number of factorized columns if the unblocked routine
+*        was not called.
+*
+         K = JMAX
+*
+*        If there exists a residual matrix after the blocked code:
+*           1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the
+*              residual matrix, otherwise set them to ZERO;
+*           2) Set TAU(K+1:MINMN) to ZERO.
+*
+         IF( K.LT.MINMN ) THEN
+            JMAXC2NRM = K + ISAMAX( N-K, WORK( K+1 ), 1 )
+            MAXC2NRMK = WORK( JMAXC2NRM )
+            IF( K.EQ.0 ) THEN
+               RELMAXC2NRMK = ONE
+            ELSE
+               RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM
+            END IF
+*
+            DO J = K + 1, MINMN
+               TAU( J ) = ZERO
+            END DO
+*
+         END IF
+*
+*     END IF( J.LE.JMAX ) THEN
+*
+      END IF
+*
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+*
+      RETURN
+*
+*     End of SGEQP3RK
+*
+      END
diff --git a/lapack-netlib/SRC/sgeqr.f b/lapack-netlib/SRC/sgeqr.f
index 6f41a92ea..79a515e1c 100644
--- a/lapack-netlib/SRC/sgeqr.f
+++ b/lapack-netlib/SRC/sgeqr.f
@@ -99,7 +99,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1 or -2, then a workspace query is assumed. The routine
 *>          only calculates the sizes of the T and WORK arrays, returns these
 *>          values as the first entries of the T and WORK arrays, and no error
@@ -168,6 +168,8 @@
 *>
 *> \endverbatim
 *>
+*> \ingroup geqr
+*>
 *  =====================================================================
       SUBROUTINE SGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK,
      $                  INFO )
@@ -188,11 +190,13 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY, LMINWS, MINT, MINW
-      INTEGER            MB, NB, MINTSZ, NBLCKS
+      INTEGER            MB, NB, MINTSZ, NBLCKS, LWMIN, LWREQ
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLATSQR, SGEQRT, XERBLA
@@ -244,8 +248,10 @@
 *
 *     Determine if the workspace size satisfies minimal size
 *
+      LWMIN = MAX( 1, N )
+      LWREQ = MAX( 1, N*NB )
       LMINWS = .FALSE.
-      IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.NB*N )
+      IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.LWREQ )
      $    .AND. ( LWORK.GE.N ) .AND. ( TSIZE.GE.MINTSZ )
      $    .AND. ( .NOT.LQUERY ) ) THEN
         IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) ) THEN
@@ -253,7 +259,7 @@
           NB = 1
           MB = M
         END IF
-        IF( LWORK.LT.NB*N ) THEN
+        IF( LWORK.LT.LWREQ ) THEN
           LMINWS = .TRUE.
           NB = 1
         END IF
@@ -268,7 +274,7 @@
       ELSE IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 )
      $   .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN
         INFO = -6
-      ELSE IF( ( LWORK.LT.MAX( 1, N*NB ) ) .AND. ( .NOT.LQUERY )
+      ELSE IF( ( LWORK.LT.LWREQ ) .AND. ( .NOT.LQUERY )
      $   .AND. ( .NOT.LMINWS ) ) THEN
         INFO = -8
       END IF
@@ -282,9 +288,9 @@
         T( 2 ) = MB
         T( 3 ) = NB
         IF( MINW ) THEN
-          WORK( 1 ) = MAX( 1, N )
+          WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
         ELSE
-          WORK( 1 ) = MAX( 1, NB*N )
+          WORK( 1 ) = SROUNDUP_LWORK( LWREQ )
         END IF
       END IF
       IF( INFO.NE.0 ) THEN
@@ -309,7 +315,7 @@
      $                LWORK, INFO )
       END IF
 *
-      WORK( 1 ) = MAX( 1, NB*N )
+      WORK( 1 ) = SROUNDUP_LWORK( LWREQ )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/sgeqrf.f b/lapack-netlib/SRC/sgeqrf.f
index b24615f7a..689fe1aea 100644
--- a/lapack-netlib/SRC/sgeqrf.f
+++ b/lapack-netlib/SRC/sgeqrf.f
@@ -121,7 +121,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEcomputational
+*> \ingroup geqrf
 *
 *> \par Further Details:
 *  =====================
@@ -170,7 +170,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -199,7 +200,7 @@
          ELSE
             LWKOPT = N*NB
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
          RETURN
       END IF
 *
@@ -274,7 +275,7 @@
      $   CALL SGEQR2( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK,
      $                IINFO )
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of SGEQRF
diff --git a/lapack-netlib/SRC/sgeqrfp.f b/lapack-netlib/SRC/sgeqrfp.f
index 03d33654b..37747c512 100644
--- a/lapack-netlib/SRC/sgeqrfp.f
+++ b/lapack-netlib/SRC/sgeqrfp.f
@@ -97,7 +97,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= max(1,N).
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise.
 *>          For optimum performance LWORK >= N*NB, where NB is
 *>          the optimal blocksize.
 *>
@@ -122,7 +123,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEcomputational
+*> \ingroup geqrfp
 *
 *> \par Further Details:
 *  =====================
@@ -162,8 +163,8 @@
 *
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, IINFO, IWS, K, LDWORK, LWKOPT, NB,
-     $                   NBMIN, NX
+      INTEGER            I, IB, IINFO, IWS, K, LDWORK, LWKMIN, LWKOPT,
+     $                   NB, NBMIN, NX
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGEQR2P, SLARFB, SLARFT, XERBLA
@@ -174,6 +175,8 @@
 *     .. External Functions ..
       INTEGER            ILAENV
       EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -181,8 +184,16 @@
 *
       INFO = 0
       NB = ILAENV( 1, 'SGEQRF', ' ', M, N, -1, -1 )
-      LWKOPT = N*NB
-      WORK( 1 ) = LWKOPT
+      K = MIN( M, N )
+      IF( K.EQ.0 ) THEN
+         LWKMIN = 1
+         LWKOPT = 1
+      ELSE
+         LWKMIN = N
+         LWKOPT = N*NB
+      END IF
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+*
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -190,7 +201,7 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -7
       END IF
       IF( INFO.NE.0 ) THEN
@@ -210,7 +221,7 @@
 *
       NBMIN = 2
       NX = 0
-      IWS = N
+      IWS = LWKMIN
       IF( NB.GT.1 .AND. NB.LT.K ) THEN
 *
 *        Determine when to cross over from blocked to unblocked code.
@@ -272,7 +283,7 @@
      $   CALL SGEQR2P( M-I+1, N-I+1, A( I, I ), LDA, TAU( I ), WORK,
      $                IINFO )
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK( IWS )
       RETURN
 *
 *     End of SGEQRFP
diff --git a/lapack-netlib/SRC/sgerqf.f b/lapack-netlib/SRC/sgerqf.f
index 037cd5345..1d3400a1f 100644
--- a/lapack-netlib/SRC/sgerqf.f
+++ b/lapack-netlib/SRC/sgerqf.f
@@ -114,7 +114,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEcomputational
+*> \ingroup gerqf
 *
 *> \par Further Details:
 *  =====================
@@ -163,7 +163,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -187,7 +188,7 @@
             NB = ILAENV( 1, 'SGERQF', ' ', M, N, -1, -1 )
             LWKOPT = M*NB
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF ( .NOT.LQUERY ) THEN
             IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) )
@@ -278,7 +279,7 @@
       IF( MU.GT.0 .AND. NU.GT.0 )
      $   CALL SGERQ2( MU, NU, A, LDA, TAU, WORK, IINFO )
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of SGERQF
diff --git a/lapack-netlib/SRC/sgesdd.f b/lapack-netlib/SRC/sgesdd.f
index d3b5e3ba1..d271bb757 100644
--- a/lapack-netlib/SRC/sgesdd.f
+++ b/lapack-netlib/SRC/sgesdd.f
@@ -55,12 +55,6 @@
 *>
 *> Note that the routine returns VT = V**T, not V.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/sgesvd.f b/lapack-netlib/SRC/sgesvd.f
index 83321ffaa..d3fa94582 100644
--- a/lapack-netlib/SRC/sgesvd.f
+++ b/lapack-netlib/SRC/sgesvd.f
@@ -203,7 +203,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEsing
+*> \ingroup gesvd
 *
 *  =====================================================================
       SUBROUTINE SGESVD( JOBU, JOBVT, M, N, A, LDA, S, U, LDU, VT, LDVT,
@@ -251,8 +251,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANGE
-      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE
+      REAL               SLAMCH, SLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN, SQRT
@@ -628,7 +628,7 @@
             END IF
          END IF
          MAXWRK = MAX( MAXWRK, MINWRK )
-         WORK( 1 ) = MAXWRK
+         WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN
             INFO = -13
@@ -3493,7 +3493,7 @@
 *
 *     Return optimal workspace in WORK(1)
 *
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/sgesvdq.f b/lapack-netlib/SRC/sgesvdq.f
index ebbfc70b1..6fb328b8c 100644
--- a/lapack-netlib/SRC/sgesvdq.f
+++ b/lapack-netlib/SRC/sgesvdq.f
@@ -365,7 +365,7 @@
 *>   an optimal implementation would do all necessary scaling before calling
 *>   CGESVD and the scaling in CGESVD can be switched off.
 *>   3. Other comments related to code optimization are given in comments in the
-*>   code, enlosed in [[double brackets]].
+*>   code, enclosed in [[double brackets]].
 *> \endverbatim
 *
 *> \par Bugs, examples and comments
diff --git a/lapack-netlib/SRC/sgesvdx.f b/lapack-netlib/SRC/sgesvdx.f
index b6495dbd4..8b55b9b2e 100644
--- a/lapack-netlib/SRC/sgesvdx.f
+++ b/lapack-netlib/SRC/sgesvdx.f
@@ -254,7 +254,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEsing
+*> \ingroup gesvdx
 *
 *  =====================================================================
       SUBROUTINE SGESVDX( JOBU, JOBVT, RANGE, M, N, A, LDA, VL, VU,
@@ -301,8 +301,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANGE
-      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE
+      REAL               SLAMCH, SLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN, SQRT
@@ -456,7 +456,7 @@
             END IF
          END IF
          MAXWRK = MAX( MAXWRK, MINWRK )
-         WORK( 1 ) = REAL( MAXWRK )
+         WORK( 1 ) = SROUNDUP_LWORK( MAXWRK )
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN
              INFO = -19
@@ -822,7 +822,7 @@
 *
 *     Return optimal workspace in WORK(1)
 *
-      WORK( 1 ) = REAL( MAXWRK )
+      WORK( 1 ) = SROUNDUP_LWORK( MAXWRK )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/sgesvj.f b/lapack-netlib/SRC/sgesvj.f
index 3f53a5a15..36aed2853 100644
--- a/lapack-netlib/SRC/sgesvj.f
+++ b/lapack-netlib/SRC/sgesvj.f
@@ -208,7 +208,7 @@
 *>
 *> \param[in,out] WORK
 *> \verbatim
-*>          WORK is REAL array, dimension (LWORK)
+*>          WORK is REAL array, dimension (MAX(1,LWORK))
 *>          On entry,
 *>          If JOBU = 'C' :
 *>          WORK(1) = CTOL, where CTOL defines the threshold for convergence.
@@ -239,7 +239,12 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>         length of WORK, WORK >= MAX(6,M+N)
+*>          Length of WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(6,M+N), otherwise.
+*>
+*>          If on entry LWORK = -1, then a workspace query is assumed and
+*>          no computation is done; WORK(1) is set to the minimal (and optimal)
+*>          length of WORK.
 *> \endverbatim
 *>
 *> \param[out] INFO
@@ -260,7 +265,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEcomputational
+*> \ingroup gesvj
 *
 *> \par Further Details:
 *  =====================
@@ -351,9 +356,9 @@
       INTEGER            BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1,
      $                   ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34,
      $                   N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP,
-     $                   SWBAND
-      LOGICAL            APPLV, GOSCALE, LOWER, LSVEC, NOSCALE, ROTOK,
-     $                   RSVEC, UCTOL, UPPER
+     $                   SWBAND, MINMN, LWMIN
+      LOGICAL            APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE,
+     $                   ROTOK, RSVEC, UCTOL, UPPER
 *     ..
 *     .. Local Arrays ..
       REAL               FASTR( 5 )
@@ -369,8 +374,8 @@
       INTEGER            ISAMAX
       EXTERNAL           ISAMAX
 *     from LAPACK
-      REAL               SLAMCH
-      EXTERNAL           SLAMCH
+      REAL               SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           SLAMCH, SROUNDUP_LWORK
       LOGICAL            LSAME
       EXTERNAL           LSAME
 *     ..
@@ -394,6 +399,14 @@
       UPPER = LSAME( JOBA, 'U' )
       LOWER = LSAME( JOBA, 'L' )
 *
+      MINMN = MIN( M, N )
+      IF( MINMN.EQ.0 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = MAX( 6, M+N )
+      END IF
+*
+      LQUERY = ( LWORK.EQ.-1 )
       IF( .NOT.( UPPER .OR. LOWER .OR. LSAME( JOBA, 'G' ) ) ) THEN
          INFO = -1
       ELSE IF( .NOT.( LSVEC .OR. UCTOL .OR. LSAME( JOBU, 'N' ) ) ) THEN
@@ -413,7 +426,7 @@
          INFO = -11
       ELSE IF( UCTOL .AND. ( WORK( 1 ).LE.ONE ) ) THEN
          INFO = -12
-      ELSE IF( LWORK.LT.MAX( M+N, 6 ) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. ( .NOT.LQUERY ) ) THEN
          INFO = -13
       ELSE
          INFO = 0
@@ -423,11 +436,14 @@
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'SGESVJ', -INFO )
          RETURN
+      ELSE IF( LQUERY ) THEN
+         WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
+         RETURN
       END IF
 *
 * #:) Quick return for void matrix
 *
-      IF( ( M.EQ.0 ) .OR. ( N.EQ.0 ) )RETURN
+      IF( MINMN.EQ.0 ) RETURN
 *
 *     Set numerical parameters
 *     The stopping criterion for Jacobi rotations is
diff --git a/lapack-netlib/SRC/sgetri.f b/lapack-netlib/SRC/sgetri.f
index 749ede9a7..7b06bb63d 100644
--- a/lapack-netlib/SRC/sgetri.f
+++ b/lapack-netlib/SRC/sgetri.f
@@ -107,7 +107,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEcomputational
+*> \ingroup getri
 *
 *  =====================================================================
       SUBROUTINE SGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO )
@@ -138,6 +138,8 @@
 *     .. External Functions ..
       INTEGER            ILAENV
       EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGEMM, SGEMV, SSWAP, STRSM, STRTRI, XERBLA
@@ -151,8 +153,9 @@
 *
       INFO = 0
       NB = ILAENV( 1, 'SGETRI', ' ', N, -1, -1, -1 )
-      LWKOPT = N*NB
-      WORK( 1 ) = LWKOPT
+      LWKOPT = MAX( 1, N*NB )
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+*
       LQUERY = ( LWORK.EQ.-1 )
       IF( N.LT.0 ) THEN
          INFO = -1
@@ -250,7 +253,7 @@
      $      CALL SSWAP( N, A( 1, J ), 1, A( 1, JP ), 1 )
    60 CONTINUE
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK( IWS )
       RETURN
 *
 *     End of SGETRI
diff --git a/lapack-netlib/SRC/sgetsls.f b/lapack-netlib/SRC/sgetsls.f
index e6ce705fa..08a427a8b 100644
--- a/lapack-netlib/SRC/sgetsls.f
+++ b/lapack-netlib/SRC/sgetsls.f
@@ -127,7 +127,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1 or -2, then a workspace query is assumed.
 *>          If LWORK = -1, the routine calculates optimal size of WORK for the
 *>          optimal performance and returns this value in WORK(1).
@@ -154,7 +154,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEsolve
+*> \ingroup getsls
 *
 *  =====================================================================
       SUBROUTINE SGETSLS( TRANS, M, N, NRHS, A, LDA, B, LDB,
@@ -188,15 +188,15 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               SLAMCH, SLANGE
-      EXTERNAL           LSAME, SLABAD, SLAMCH, SLANGE
+      REAL               SLAMCH, SLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, SLANGE, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGEQR, SGEMQR, SLASCL, SLASET,
      $                   STRTRS, XERBLA, SGELQ, SGEMLQ
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          REAL, MAX, MIN, INT
+      INTRINSIC          MAX, MIN, INT
 *     ..
 *     .. Executable Statements ..
 *
@@ -226,7 +226,10 @@
 *
 *     Determine the optimum and minimum LWORK
 *
-       IF( M.GE.N ) THEN
+       IF( MIN( M, N, NRHS ).EQ.0 ) THEN
+         WSIZEO = 1
+         WSIZEM = 1
+       ELSE IF( M.GE.N ) THEN
          CALL SGEQR( M, N, A, LDA, TQ, -1, WORKQ, -1, INFO2 )
          TSZO = INT( TQ( 1 ) )
          LWO  = INT( WORKQ( 1 ) )
@@ -262,7 +265,7 @@
           INFO = -10
        END IF
 *
-       WORK( 1 ) = REAL( WSIZEO )
+       WORK( 1 ) = SROUNDUP_LWORK( WSIZEO )
 *
       END IF
 *
@@ -271,7 +274,7 @@
         RETURN
       END IF
       IF( LQUERY ) THEN
-        IF( LWORK.EQ.-2 ) WORK( 1 ) = REAL( WSIZEM )
+        IF( LWORK.EQ.-2 ) WORK( 1 ) = SROUNDUP_LWORK( WSIZEM )
         RETURN
       END IF
       IF( LWORK.LT.WSIZEO ) THEN
@@ -294,7 +297,6 @@
 *
        SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' )
        BIGNUM = ONE / SMLNUM
-       CALL SLABAD( SMLNUM, BIGNUM )
 *
 *     Scale A, B if max element outside range [SMLNUM,BIGNUM]
 *
@@ -482,7 +484,7 @@
       END IF
 *
    50 CONTINUE
-      WORK( 1 ) = REAL( TSZO + LWO )
+      WORK( 1 ) = SROUNDUP_LWORK( TSZO + LWO )
       RETURN
 *
 *     End of SGETSLS
diff --git a/lapack-netlib/SRC/sgetsqrhrt.f b/lapack-netlib/SRC/sgetsqrhrt.f
index f9580da7b..7ade8a66c 100644
--- a/lapack-netlib/SRC/sgetsqrhrt.f
+++ b/lapack-netlib/SRC/sgetsqrhrt.f
@@ -130,14 +130,17 @@
 *>
 *> \param[in] LWORK
 *> \verbatim
+*>          LWORK is INTEGER
 *>          The dimension of the array WORK.
-*>          LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ),
+*>          If MIN(M,N) = 0, LWORK >= 1, else
+*>          LWORK >= MAX( 1, LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ),
 *>          where
 *>             NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)),
 *>             NB1LOCAL = MIN(NB1,N).
 *>             LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL,
 *>             LW1 = NB1LOCAL * N,
-*>             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ),
+*>             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ).
+*>
 *>          If LWORK = -1, then a workspace query is assumed.
 *>          The routine only calculates the optimal size of the WORK
 *>          array, returns this value as the first entry of the WORK
@@ -160,7 +163,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup singleOTHERcomputational
+*> \ingroup getsqrhrt
 *
 *> \par Contributors:
 *  ==================
@@ -200,6 +203,10 @@
       INTEGER            I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT,
      $                   NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS
 *     ..
+*     .. External Functions ..
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
+*     ..
 *     .. External Subroutines ..
       EXTERNAL           SCOPY, SLATSQR, SORGTSQR_ROW, SORHR_COL,
      $                   XERBLA
@@ -212,7 +219,7 @@
 *     Test the input arguments
 *
       INFO = 0
-      LQUERY  = LWORK.EQ.-1
+      LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
       ELSE IF( N.LT.0 .OR. M.LT.N ) THEN
@@ -225,7 +232,7 @@
          INFO = -5
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -7
-      ELSE IF( LDT.LT.MAX( 1,  MIN( NB2, N ) ) ) THEN
+      ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN
          INFO = -9
       ELSE
 *
@@ -263,8 +270,9 @@
             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) )
 *
             LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) )
+            LWORKOPT = MAX( 1, LWORKOPT )
 *
-            IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN
+            IF( LWORK.LT.LWORKOPT .AND. .NOT.LQUERY ) THEN
                INFO = -11
             END IF
 *
@@ -277,14 +285,14 @@
          CALL XERBLA( 'SGETSQRHRT', -INFO )
          RETURN
       ELSE IF ( LQUERY ) THEN
-         WORK( 1 ) = REAL( LWORKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT )
          RETURN
       END IF
 *
 *     Quick return if possible
 *
       IF( MIN( M, N ).EQ.0 ) THEN
-         WORK( 1 ) = REAL( LWORKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT )
          RETURN
       END IF
 *
@@ -341,9 +349,9 @@
          END IF
       END DO
 *
-      WORK( 1 ) = REAL( LWORKOPT )
+      WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT )
       RETURN
 *
 *     End of SGETSQRHRT
 *
-      END
\ No newline at end of file
+      END
diff --git a/lapack-netlib/SRC/sgges.f b/lapack-netlib/SRC/sgges.f
index 3834aea00..8f42882dd 100644
--- a/lapack-netlib/SRC/sgges.f
+++ b/lapack-netlib/SRC/sgges.f
@@ -275,7 +275,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEeigen
+*> \ingroup gges
 *
 *  =====================================================================
       SUBROUTINE SGGES( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B, LDB,
@@ -321,15 +321,14 @@
       REAL               DIF( 2 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SGEQRF, SGGBAK, SGGBAL, SGGHRD, SHGEQZ, SLABAD,
-     $                   SLACPY, SLASCL, SLASET, SORGQR, SORMQR, STGSEN,
-     $                   XERBLA
+      EXTERNAL           SGEQRF, SGGBAK, SGGBAL, SGGHRD, SHGEQZ, SLACPY,
+     $                   SLASCL, SLASET, SORGQR, SORMQR, STGSEN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANGE
-      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE
+      REAL               SLAMCH, SLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, SQRT
@@ -406,7 +405,7 @@
             MINWRK = 1
             MAXWRK = 1
          END IF
-         WORK( 1 ) = MAXWRK
+         WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY )
      $      INFO = -19
@@ -431,7 +430,6 @@
       EPS = SLAMCH( 'P' )
       SAFMIN = SLAMCH( 'S' )
       SAFMAX = ONE / SAFMIN
-      CALL SLABAD( SAFMIN, SAFMAX )
       SMLNUM = SQRT( SAFMIN ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -668,7 +666,7 @@
 *
    40 CONTINUE
 *
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/sgges3.f b/lapack-netlib/SRC/sgges3.f
index b27704ff5..e90cd6947 100644
--- a/lapack-netlib/SRC/sgges3.f
+++ b/lapack-netlib/SRC/sgges3.f
@@ -234,6 +234,8 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
+*>          If N = 0, LWORK >= 1, else LWORK >= 6*N+16.
+*>          For good performance, LWORK must generally be larger.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -273,7 +275,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEeigen
+*> \ingroup gges3
 *
 *  =====================================================================
       SUBROUTINE SGGES3( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B,
@@ -309,7 +311,8 @@
       LOGICAL            CURSL, ILASCL, ILBSCL, ILVSL, ILVSR, LASTSL,
      $                   LQUERY, LST2SL, WANTST
       INTEGER            I, ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT,
-     $                   ILO, IP, IRIGHT, IROWS, ITAU, IWRK, LWKOPT
+     $                   ILO, IP, IRIGHT, IROWS, ITAU, IWRK, LWKOPT,
+     $                   LWKMIN
       REAL               ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, PVSL,
      $                   PVSR, SAFMAX, SAFMIN, SMLNUM
 *     ..
@@ -318,14 +321,13 @@
       REAL               DIF( 2 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SGEQRF, SGGBAK, SGGBAL, SGGHD3, SLAQZ0, SLABAD,
-     $                   SLACPY, SLASCL, SLASET, SORGQR, SORMQR, STGSEN,
-     $                   XERBLA
+      EXTERNAL           SGEQRF, SGGBAK, SGGBAL, SGGHD3, SLAQZ0, SLACPY,
+     $                   SLASCL, SLASET, SORGQR, SORMQR, STGSEN, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               SLAMCH, SLANGE
-      EXTERNAL           LSAME, SLAMCH, SLANGE
+      REAL               SLAMCH, SLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, SLANGE, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, SQRT
@@ -362,6 +364,12 @@
 *
       INFO = 0
       LQUERY = ( LWORK.EQ.-1 )
+      IF( N.EQ.0 ) THEN
+         LWKMIN = 1
+      ELSE
+         LWKMIN = 6*N+16
+      END IF
+*
       IF( IJOBVL.LE.0 ) THEN
          INFO = -1
       ELSE IF( IJOBVR.LE.0 ) THEN
@@ -378,7 +386,7 @@
          INFO = -15
       ELSE IF( LDVSR.LT.1 .OR. ( ILVSR .AND. LDVSR.LT.N ) ) THEN
          INFO = -17
-      ELSE IF( LWORK.LT.6*N+16 .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -19
       END IF
 *
@@ -386,7 +394,7 @@
 *
       IF( INFO.EQ.0 ) THEN
          CALL SGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR )
-         LWKOPT = MAX( 6*N+16, 3*N+INT( WORK( 1 ) ) )
+         LWKOPT = MAX( LWKMIN, 3*N+INT( WORK( 1 ) ) )
          CALL SORMQR( 'L', 'T', N, N, N, B, LDB, WORK, A, LDA, WORK,
      $                -1, IERR )
          LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) )
@@ -408,7 +416,11 @@
      $                   IERR )
             LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) )
          END IF
-         WORK( 1 ) = LWKOPT
+         IF( N.EQ.0 ) THEN
+            WORK( 1 ) = 1
+         ELSE
+            WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+         END IF
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -422,6 +434,7 @@
 *
       IF( N.EQ.0 ) THEN
          SDIM = 0
+         WORK( 1 ) = 1
          RETURN
       END IF
 *
@@ -430,7 +443,6 @@
       EPS = SLAMCH( 'P' )
       SAFMIN = SLAMCH( 'S' )
       SAFMAX = ONE / SAFMIN
-      CALL SLABAD( SAFMIN, SAFMAX )
       SMLNUM = SQRT( SAFMIN ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -659,7 +671,7 @@
 *
    40 CONTINUE
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/sggesx.f b/lapack-netlib/SRC/sggesx.f
index a6c0443ba..e5a14fc19 100644
--- a/lapack-netlib/SRC/sggesx.f
+++ b/lapack-netlib/SRC/sggesx.f
@@ -337,7 +337,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEeigen
+*> \ingroup ggesx
 *
 *> \par Further Details:
 *  =====================
@@ -405,15 +405,14 @@
       REAL               DIF( 2 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SGEQRF, SGGBAK, SGGBAL, SGGHRD, SHGEQZ, SLABAD,
-     $                   SLACPY, SLASCL, SLASET, SORGQR, SORMQR, STGSEN,
-     $                   XERBLA
+      EXTERNAL           SGEQRF, SGGBAK, SGGBAL, SGGHRD, SHGEQZ, SLACPY,
+     $                   SLASCL, SLASET, SORGQR, SORMQR, STGSEN, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANGE
-      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE
+      REAL               SLAMCH, SLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, SQRT
@@ -510,7 +509,7 @@
             MAXWRK = 1
             LWRK   = 1
          END IF
-         WORK( 1 ) = LWRK
+         WORK( 1 ) = SROUNDUP_LWORK(LWRK)
          IF( WANTSN .OR. N.EQ.0 ) THEN
             LIWMIN = 1
          ELSE
@@ -544,7 +543,6 @@
       EPS = SLAMCH( 'P' )
       SAFMIN = SLAMCH( 'S' )
       SAFMAX = ONE / SAFMIN
-      CALL SLABAD( SAFMIN, SAFMAX )
       SMLNUM = SQRT( SAFMIN ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -807,7 +805,7 @@
 *
    50 CONTINUE
 *
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
       IWORK( 1 ) = LIWMIN
 *
       RETURN
diff --git a/lapack-netlib/SRC/sggev.f b/lapack-netlib/SRC/sggev.f
index 69744b72b..cacad7cac 100644
--- a/lapack-netlib/SRC/sggev.f
+++ b/lapack-netlib/SRC/sggev.f
@@ -218,7 +218,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEeigen
+*> \ingroup ggev
 *
 *  =====================================================================
       SUBROUTINE SGGEV( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHAR, ALPHAI,
@@ -257,15 +257,14 @@
       LOGICAL            LDUMMA( 1 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SGEQRF, SGGBAK, SGGBAL, SGGHRD, SHGEQZ, SLABAD,
-     $                   SLACPY, SLASCL, SLASET, SORGQR, SORMQR, STGEVC,
-     $                   XERBLA
+      EXTERNAL           SGEQRF, SGGBAK, SGGBAL, SGGHRD, SHGEQZ, SLACPY,
+     $                   SLASCL, SLASET, SORGQR, SORMQR, STGEVC, XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANGE
-      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE
+      REAL               SLAMCH, SLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, SQRT
@@ -335,7 +334,7 @@
             MAXWRK = MAX( MAXWRK, N*( 7 +
      $                 ILAENV( 1, 'SORGQR', ' ', N, 1, N, -1 ) ) )
          END IF
-         WORK( 1 ) = MAXWRK
+         WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY )
      $      INFO = -16
@@ -358,7 +357,6 @@
       EPS = SLAMCH( 'P' )
       SMLNUM = SLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -581,7 +579,7 @@
          CALL SLASCL( 'G', 0, 0, BNRMTO, BNRM, N, 1, BETA, N, IERR )
       END IF
 *
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
       RETURN
 *
 *     End of SGGEV
diff --git a/lapack-netlib/SRC/sggev3.f b/lapack-netlib/SRC/sggev3.f
index 945c3a017..d788d1147 100644
--- a/lapack-netlib/SRC/sggev3.f
+++ b/lapack-netlib/SRC/sggev3.f
@@ -189,6 +189,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
+*>          The dimension of the array WORK. LWORK >= MAX(1,8*N).
+*>          For good performance, LWORK should generally be larger.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -217,7 +219,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEeigen
+*> \ingroup ggev3
 *
 *  =====================================================================
       SUBROUTINE SGGEV3( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHAR,
@@ -248,7 +250,8 @@
       LOGICAL            ILASCL, ILBSCL, ILV, ILVL, ILVR, LQUERY
       CHARACTER          CHTEMP
       INTEGER            ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, ILO,
-     $                   IN, IRIGHT, IROWS, ITAU, IWRK, JC, JR, LWKOPT
+     $                   IN, IRIGHT, IROWS, ITAU, IWRK, JC, JR, LWKOPT,
+     $                   LWKMIN
       REAL               ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS,
      $                   SMLNUM, TEMP
 *     ..
@@ -256,14 +259,13 @@
       LOGICAL            LDUMMA( 1 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SGEQRF, SGGBAK, SGGBAL, SGGHD3, SLAQZ0, SLABAD,
-     $                   SLACPY, SLASCL, SLASET, SORGQR, SORMQR, STGEVC,
-     $                   XERBLA
+      EXTERNAL           SGEQRF, SGGBAK, SGGBAL, SGGHD3, SLAQZ0, SLACPY,
+     $                   SLASCL, SLASET, SORGQR, SORMQR, STGEVC
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               SLAMCH, SLANGE
-      EXTERNAL           LSAME, SLAMCH, SLANGE
+      REAL               SLAMCH, SLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, SLANGE, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, SQRT
@@ -299,6 +301,7 @@
 *
       INFO = 0
       LQUERY = ( LWORK.EQ.-1 )
+      LWKMIN = MAX( 1, 8*N )
       IF( IJOBVL.LE.0 ) THEN
          INFO = -1
       ELSE IF( IJOBVR.LE.0 ) THEN
@@ -313,7 +316,7 @@
          INFO = -12
       ELSE IF( LDVR.LT.1 .OR. ( ILVR .AND. LDVR.LT.N ) ) THEN
          INFO = -14
-      ELSE IF( LWORK.LT.MAX( 1, 8*N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -16
       END IF
 *
@@ -321,28 +324,31 @@
 *
       IF( INFO.EQ.0 ) THEN
          CALL SGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR )
-         LWKOPT = MAX( 1, 8*N, 3*N+INT ( WORK( 1 ) ) )
+         LWKOPT = MAX( LWKMIN, 3*N+INT( WORK( 1 ) ) )
          CALL SORMQR( 'L', 'T', N, N, N, B, LDB, WORK, A, LDA, WORK,
      $                -1, IERR )
-         LWKOPT = MAX( LWKOPT, 3*N+INT ( WORK( 1 ) ) )
+         LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) )
          CALL SGGHD3( JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB, VL, LDVL,
      $                VR, LDVR, WORK, -1, IERR )
-         LWKOPT = MAX( LWKOPT, 3*N+INT ( WORK( 1 ) ) )
+         LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) )
          IF( ILVL ) THEN
             CALL SORGQR( N, N, N, VL, LDVL, WORK, WORK, -1, IERR )
-            LWKOPT = MAX( LWKOPT, 3*N+INT ( WORK( 1 ) ) )
+            LWKOPT = MAX( LWKOPT, 3*N+INT( WORK( 1 ) ) )
             CALL SLAQZ0( 'S', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB,
      $                   ALPHAR, ALPHAI, BETA, VL, LDVL, VR, LDVR,
      $                   WORK, -1, 0, IERR )
-            LWKOPT = MAX( LWKOPT, 2*N+INT ( WORK( 1 ) ) )
+            LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) )
          ELSE
             CALL SLAQZ0( 'E', JOBVL, JOBVR, N, 1, N, A, LDA, B, LDB,
      $                   ALPHAR, ALPHAI, BETA, VL, LDVL, VR, LDVR,
      $                   WORK, -1, 0, IERR )
-            LWKOPT = MAX( LWKOPT, 2*N+INT ( WORK( 1 ) ) )
+            LWKOPT = MAX( LWKOPT, 2*N+INT( WORK( 1 ) ) )
+         END IF
+         IF( N.EQ.0 ) THEN
+            WORK( 1 ) = 1
+         ELSE
+            WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
          END IF
-         WORK( 1 ) = REAL( LWKOPT )
-*
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -362,7 +368,6 @@
       EPS = SLAMCH( 'P' )
       SMLNUM = SLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -578,7 +583,7 @@
          CALL SLASCL( 'G', 0, 0, BNRMTO, BNRM, N, 1, BETA, N, IERR )
       END IF
 *
-      WORK( 1 ) = REAL( LWKOPT )
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       RETURN
 *
 *     End of SGGEV3
diff --git a/lapack-netlib/SRC/sggevx.f b/lapack-netlib/SRC/sggevx.f
index bb05f499a..63164a021 100644
--- a/lapack-netlib/SRC/sggevx.f
+++ b/lapack-netlib/SRC/sggevx.f
@@ -352,7 +352,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEeigen
+*> \ingroup ggevx
 *
 *> \par Further Details:
 *  =====================
@@ -427,15 +427,15 @@
       LOGICAL            LDUMMA( 1 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SGEQRF, SGGBAK, SGGBAL, SGGHRD, SHGEQZ, SLABAD,
-     $                   SLACPY, SLASCL, SLASET, SORGQR, SORMQR, STGEVC,
-     $                   STGSNA, XERBLA
+      EXTERNAL           SGEQRF, SGGBAK, SGGBAL, SGGHRD, SHGEQZ, SLACPY,
+     $                   SLASCL, SLASET, SORGQR, SORMQR, STGEVC, STGSNA,
+     $                   XERBLA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANGE
-      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE
+      REAL               SLAMCH, SLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANGE, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, SQRT
@@ -532,7 +532,7 @@
      $                       N*ILAENV( 1, 'SORGQR', ' ', N, 1, N, 0 ) )
             END IF
          END IF
-         WORK( 1 ) = MAXWRK
+         WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
 *
          IF( LWORK.LT.MINWRK .AND. .NOT.LQUERY ) THEN
             INFO = -26
@@ -557,7 +557,6 @@
       EPS = SLAMCH( 'P' )
       SMLNUM = SLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
@@ -855,7 +854,7 @@
          CALL SLASCL( 'G', 0, 0, BNRMTO, BNRM, N, 1, BETA, N, IERR )
       END IF
 *
-      WORK( 1 ) = MAXWRK
+      WORK( 1 ) = SROUNDUP_LWORK(MAXWRK)
       RETURN
 *
 *     End of SGGEVX
diff --git a/lapack-netlib/SRC/sggglm.f b/lapack-netlib/SRC/sggglm.f
index 56b4dba52..37094e4f2 100644
--- a/lapack-netlib/SRC/sggglm.f
+++ b/lapack-netlib/SRC/sggglm.f
@@ -177,7 +177,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHEReigen
+*> \ingroup ggglm
 *
 *  =====================================================================
       SUBROUTINE SGGGLM( N, M, P, A, LDA, B, LDB, D, X, Y, WORK, LWORK,
@@ -212,7 +212,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          INT, MAX, MIN
@@ -251,7 +252,7 @@
             LWKMIN = M + N + P
             LWKOPT = M + NP + MAX( N, P )*NB
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
             INFO = -12
diff --git a/lapack-netlib/SRC/sgghd3.f b/lapack-netlib/SRC/sgghd3.f
index 23acf6ec5..01e57088a 100644
--- a/lapack-netlib/SRC/sgghd3.f
+++ b/lapack-netlib/SRC/sgghd3.f
@@ -179,14 +179,14 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is REAL array, dimension (LWORK)
+*>          WORK is REAL array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
-*> \param[in]  LWORK
+*> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of the array WORK.  LWORK >= 1.
+*>          The length of the array WORK. LWORK >= 1.
 *>          For optimum performance LWORK >= 6*N*NB, where NB is the
 *>          optimal blocksize.
 *>
@@ -211,7 +211,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup gghd3
 *
 *> \par Further Details:
 *  =====================
@@ -260,14 +260,15 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           ILAENV, LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGGHRD, SLARTG, SLASET, SORM22, SROT, SGEMM,
      $                   SGEMV, STRMV, SLACPY, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          REAL, MAX
+      INTRINSIC          MAX
 *     ..
 *     .. Executable Statements ..
 *
@@ -275,8 +276,13 @@
 *
       INFO = 0
       NB = ILAENV( 1, 'SGGHD3', ' ', N, ILO, IHI, -1 )
-      LWKOPT = MAX( 6*N*NB, 1 )
-      WORK( 1 ) = REAL( LWKOPT )
+      NH = IHI - ILO + 1
+      IF( NH.LE.1 ) THEN
+         LWKOPT = 1
+      ELSE
+         LWKOPT = 6*N*NB
+      END IF
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       INITQ = LSAME( COMPQ, 'I' )
       WANTQ = INITQ .OR. LSAME( COMPQ, 'V' )
       INITZ = LSAME( COMPZ, 'I' )
@@ -325,7 +331,6 @@
 *
 *     Quick return if possible
 *
-      NH = IHI - ILO + 1
       IF( NH.LE.1 ) THEN
          WORK( 1 ) = ONE
          RETURN
@@ -885,7 +890,8 @@
       IF ( JCOL.LT.IHI )
      $   CALL SGGHRD( COMPQ2, COMPZ2, N, JCOL, IHI, A, LDA, B, LDB, Q,
      $                LDQ, Z, LDZ, IERR )
-      WORK( 1 ) = REAL( LWKOPT )
+*
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/sgglse.f b/lapack-netlib/SRC/sgglse.f
index 59addc3f4..53e3f8e45 100644
--- a/lapack-netlib/SRC/sgglse.f
+++ b/lapack-netlib/SRC/sgglse.f
@@ -172,7 +172,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERsolve
+*> \ingroup gglse
 *
 *  =====================================================================
       SUBROUTINE SGGLSE( M, N, P, A, LDA, B, LDB, C, D, X, WORK, LWORK,
@@ -207,7 +207,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          INT, MAX, MIN
@@ -246,7 +247,7 @@
             LWKMIN = M + N + P
             LWKOPT = P + MN + MAX( M, N )*NB
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
             INFO = -12
diff --git a/lapack-netlib/SRC/sggqrf.f b/lapack-netlib/SRC/sggqrf.f
index 59b498da5..d32b48410 100644
--- a/lapack-netlib/SRC/sggqrf.f
+++ b/lapack-netlib/SRC/sggqrf.f
@@ -173,7 +173,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup ggqrf
 *
 *> \par Further Details:
 *  =====================
@@ -237,6 +237,8 @@
 *     .. External Functions ..
       INTEGER            ILAENV
       EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          INT, MAX, MIN
@@ -250,8 +252,9 @@
       NB2 = ILAENV( 1, 'SGERQF', ' ', N, P, -1, -1 )
       NB3 = ILAENV( 1, 'SORMQR', ' ', N, M, P, -1 )
       NB = MAX( NB1, NB2, NB3 )
-      LWKOPT = MAX( N, M, P )*NB
-      WORK( 1 ) = LWKOPT
+      LWKOPT = MAX( 1, MAX( N, M, P )*NB )
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
+*
       LQUERY = ( LWORK.EQ.-1 )
       IF( N.LT.0 ) THEN
          INFO = -1
@@ -287,7 +290,9 @@
 *     RQ factorization of N-by-P matrix B: B = T*Z.
 *
       CALL SGERQF( N, P, B, LDB, TAUB, WORK, LWORK, INFO )
-      WORK( 1 ) = MAX( LOPT, INT( WORK( 1 ) ) )
+      LWKOPT = MAX( LOPT, INT( WORK( 1 ) ) )
+*
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/sggrqf.f b/lapack-netlib/SRC/sggrqf.f
index 8b7d4786a..b3842ec2a 100644
--- a/lapack-netlib/SRC/sggrqf.f
+++ b/lapack-netlib/SRC/sggrqf.f
@@ -172,7 +172,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup ggrqf
 *
 *> \par Further Details:
 *  =====================
@@ -235,7 +235,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          INT, MAX, MIN
@@ -249,8 +250,8 @@
       NB2 = ILAENV( 1, 'SGEQRF', ' ', P, N, -1, -1 )
       NB3 = ILAENV( 1, 'SORMRQ', ' ', M, N, P, -1 )
       NB = MAX( NB1, NB2, NB3 )
-      LWKOPT = MAX( N, M, P)*NB
-      WORK( 1 ) = LWKOPT
+      LWKOPT = MAX( 1, MAX( N, M, P )*NB )
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -287,7 +288,8 @@
 *     QR factorization of P-by-N matrix B: B = Z*T
 *
       CALL SGEQRF( P, N, B, LDB, TAUB, WORK, LWORK, INFO )
-      WORK( 1 ) = MAX( LOPT, INT( WORK( 1 ) ) )
+      LWKOPT = MAX( LOPT, INT( WORK( 1 ) ) )
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/sggsvd3.f b/lapack-netlib/SRC/sggsvd3.f
index 9077f2ea8..cee630593 100644
--- a/lapack-netlib/SRC/sggsvd3.f
+++ b/lapack-netlib/SRC/sggsvd3.f
@@ -278,7 +278,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -328,7 +328,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEsing
+*> \ingroup ggsvd3
 *
 *> \par Contributors:
 *  ==================
@@ -372,8 +372,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               SLAMCH, SLANGE
-      EXTERNAL           LSAME, SLAMCH, SLANGE
+      REAL               SLAMCH, SLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, SLANGE, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SCOPY, SGGSVP3, STGSJA, XERBLA
@@ -429,7 +429,7 @@
          LWKOPT = N + INT( WORK( 1 ) )
          LWKOPT = MAX( 2*N, LWKOPT )
          LWKOPT = MAX( 1, LWKOPT )
-         WORK( 1 ) = REAL( LWKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -492,7 +492,7 @@
          END IF
    20 CONTINUE
 *
-      WORK( 1 ) = REAL( LWKOPT )
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       RETURN
 *
 *     End of SGGSVD3
diff --git a/lapack-netlib/SRC/sggsvp3.f b/lapack-netlib/SRC/sggsvp3.f
index 4f76b32bc..8e90d770c 100644
--- a/lapack-netlib/SRC/sggsvp3.f
+++ b/lapack-netlib/SRC/sggsvp3.f
@@ -227,7 +227,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -250,7 +250,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup ggsvp3
 *
 *> \par Further Details:
 *  =====================
@@ -301,6 +301,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGEQP3, SGEQR2, SGERQ2, SLACPY, SLAPMT,
@@ -365,7 +367,7 @@
          CALL SGEQP3( M, N, A, LDA, IWORK, TAU, WORK, -1, INFO )
          LWKOPT = MAX( LWKOPT, INT( WORK ( 1 ) ) )
          LWKOPT = MAX( 1, LWKOPT )
-         WORK( 1 ) = REAL( LWKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -560,7 +562,7 @@
 *
       END IF
 *
-      WORK( 1 ) = REAL( LWKOPT )
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       RETURN
 *
 *     End of SGGSVP3
diff --git a/lapack-netlib/SRC/sgsvj0.f b/lapack-netlib/SRC/sgsvj0.f
index 23b6f0077..7e5fddeb0 100644
--- a/lapack-netlib/SRC/sgsvj0.f
+++ b/lapack-netlib/SRC/sgsvj0.f
@@ -52,10 +52,10 @@
 *>          Specifies whether the output from this procedure is used
 *>          to compute the matrix V:
 *>          = 'V': the product of the Jacobi rotations is accumulated
-*>                 by postmulyiplying the N-by-N array V.
+*>                 by postmultiplying the N-by-N array V.
 *>                (See the description of V.)
 *>          = 'A': the product of the Jacobi rotations is accumulated
-*>                 by postmulyiplying the MV-by-N array V.
+*>                 by postmultiplying the MV-by-N array V.
 *>                (See the descriptions of MV and V.)
 *>          = 'N': the Jacobi rotations are not accumulated.
 *> \endverbatim
@@ -117,7 +117,7 @@
 *> \param[in] MV
 *> \verbatim
 *>          MV is INTEGER
-*>          If JOBV = 'A', then MV rows of V are post-multipled by a
+*>          If JOBV = 'A', then MV rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then MV is not referenced.
 *> \endverbatim
@@ -125,9 +125,9 @@
 *> \param[in,out] V
 *> \verbatim
 *>          V is REAL array, dimension (LDV,N)
-*>          If JOBV = 'V' then N rows of V are post-multipled by a
+*>          If JOBV = 'V' then N rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
-*>          If JOBV = 'A' then MV rows of V are post-multipled by a
+*>          If JOBV = 'A' then MV rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then V is not referenced.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/sgsvj1.f b/lapack-netlib/SRC/sgsvj1.f
index 0345ccb42..f1d1f7239 100644
--- a/lapack-netlib/SRC/sgsvj1.f
+++ b/lapack-netlib/SRC/sgsvj1.f
@@ -75,10 +75,10 @@
 *>          Specifies whether the output from this procedure is used
 *>          to compute the matrix V:
 *>          = 'V': the product of the Jacobi rotations is accumulated
-*>                 by postmulyiplying the N-by-N array V.
+*>                 by postmultiplying the N-by-N array V.
 *>                (See the description of V.)
 *>          = 'A': the product of the Jacobi rotations is accumulated
-*>                 by postmulyiplying the MV-by-N array V.
+*>                 by postmultiplying the MV-by-N array V.
 *>                (See the descriptions of MV and V.)
 *>          = 'N': the Jacobi rotations are not accumulated.
 *> \endverbatim
@@ -147,7 +147,7 @@
 *> \param[in] MV
 *> \verbatim
 *>          MV is INTEGER
-*>          If JOBV = 'A', then MV rows of V are post-multipled by a
+*>          If JOBV = 'A', then MV rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then MV is not referenced.
 *> \endverbatim
@@ -155,9 +155,9 @@
 *> \param[in,out] V
 *> \verbatim
 *>          V is REAL array, dimension (LDV,N)
-*>          If JOBV = 'V' then N rows of V are post-multipled by a
+*>          If JOBV = 'V' then N rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
-*>          If JOBV = 'A' then MV rows of V are post-multipled by a
+*>          If JOBV = 'A' then MV rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then V is not referenced.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/shgeqz.f b/lapack-netlib/SRC/shgeqz.f
index 6543f8cb1..9ad64d2bf 100644
--- a/lapack-netlib/SRC/shgeqz.f
+++ b/lapack-netlib/SRC/shgeqz.f
@@ -282,7 +282,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEcomputational
+*> \ingroup hgeqz
 *
 *> \par Further Details:
 *  =====================
@@ -346,8 +346,9 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               SLAMCH, SLANHS, SLAPY2, SLAPY3
-      EXTERNAL           LSAME, SLAMCH, SLANHS, SLAPY2, SLAPY3
+      REAL               SLAMCH, SLANHS, SLAPY2, SLAPY3, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, SLANHS, SLAPY2, SLAPY3,
+     $                   SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLAG2, SLARFG, SLARTG, SLASET, SLASV2, SROT,
@@ -1364,7 +1365,7 @@
 *     Exit (other than argument error) -- return optimal workspace size
 *
   420 CONTINUE
-      WORK( 1 ) = REAL( N )
+      WORK( 1 ) = SROUNDUP_LWORK( N )
       RETURN
 *
 *     End of SHGEQZ
diff --git a/lapack-netlib/SRC/shseqr.f b/lapack-netlib/SRC/shseqr.f
index 3b8d4c4d8..68b9fe6bd 100644
--- a/lapack-netlib/SRC/shseqr.f
+++ b/lapack-netlib/SRC/shseqr.f
@@ -233,7 +233,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup hseqr
 *
 *> \par Contributors:
 *  ==================
@@ -358,7 +358,8 @@
 *     .. External Functions ..
       INTEGER            ILAENV
       LOGICAL            LSAME
-      EXTERNAL           ILAENV, LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLACPY, SLAHQR, SLAQR0, SLASET, XERBLA
@@ -373,7 +374,7 @@
       WANTT = LSAME( JOB, 'S' )
       INITZ = LSAME( COMPZ, 'I' )
       WANTZ = INITZ .OR. LSAME( COMPZ, 'V' )
-      WORK( 1 ) = REAL( MAX( 1, N ) )
+      WORK( 1 ) = SROUNDUP_LWORK( MAX( 1, N ) )
       LQUERY = LWORK.EQ.-1
 *
       INFO = 0
diff --git a/lapack-netlib/SRC/sla_gbrfsx_extended.f b/lapack-netlib/SRC/sla_gbrfsx_extended.f
index 499d6bf90..c79330cb5 100644
--- a/lapack-netlib/SRC/sla_gbrfsx_extended.f
+++ b/lapack-netlib/SRC/sla_gbrfsx_extended.f
@@ -644,7 +644,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF (Y_PREC_STATE .LT. EXTRA_Y) THEN
                CALL SAXPY( N, 1.0, DY, 1, Y(1,J), 1 )
diff --git a/lapack-netlib/SRC/sla_gerfsx_extended.f b/lapack-netlib/SRC/sla_gerfsx_extended.f
index de05d8eb4..1a19a5071 100644
--- a/lapack-netlib/SRC/sla_gerfsx_extended.f
+++ b/lapack-netlib/SRC/sla_gerfsx_extended.f
@@ -628,7 +628,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN
                CALL SAXPY( N, 1.0, DY, 1, Y( 1, J ), 1 )
diff --git a/lapack-netlib/SRC/sla_porfsx_extended.f b/lapack-netlib/SRC/sla_porfsx_extended.f
index ada4cad21..9f33b14dd 100644
--- a/lapack-netlib/SRC/sla_porfsx_extended.f
+++ b/lapack-netlib/SRC/sla_porfsx_extended.f
@@ -617,7 +617,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF (Y_PREC_STATE .LT. EXTRA_Y) THEN
                CALL SAXPY( N, 1.0, DY, 1, Y(1,J), 1 )
diff --git a/lapack-netlib/SRC/sla_porpvgrw.f b/lapack-netlib/SRC/sla_porpvgrw.f
index a97e7f72a..8064bf7fa 100644
--- a/lapack-netlib/SRC/sla_porpvgrw.f
+++ b/lapack-netlib/SRC/sla_porpvgrw.f
@@ -132,9 +132,9 @@
 *
       UPPER = LSAME( 'Upper', UPLO )
 *
-*     SPOTRF will have factored only the NCOLSxNCOLS leading minor, so
-*     we restrict the growth search to that minor and use only the first
-*     2*NCOLS workspace entries.
+*     SPOTRF will have factored only the NCOLSxNCOLS leading submatrix,
+*     so we restrict the growth search to that submatrix and use only
+*     the first 2*NCOLS workspace entries.
 *
       RPVGRW = 1.0
       DO I = 1, 2*NCOLS
diff --git a/lapack-netlib/SRC/sla_syrfsx_extended.f b/lapack-netlib/SRC/sla_syrfsx_extended.f
index d5096be02..2fa3b1c50 100644
--- a/lapack-netlib/SRC/sla_syrfsx_extended.f
+++ b/lapack-netlib/SRC/sla_syrfsx_extended.f
@@ -647,7 +647,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF (Y_PREC_STATE .LT. EXTRA_Y) THEN
                CALL SAXPY( N, 1.0, DY, 1, Y(1,J), 1 )
diff --git a/lapack-netlib/SRC/slaed2.f b/lapack-netlib/SRC/slaed2.f
index 16500e74c..cadf53555 100644
--- a/lapack-netlib/SRC/slaed2.f
+++ b/lapack-netlib/SRC/slaed2.f
@@ -18,7 +18,7 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE SLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMDA, W,
+*       SUBROUTINE SLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMBDA, W,
 *                          Q2, INDX, INDXC, INDXP, COLTYP, INFO )
 *
 *       .. Scalar Arguments ..
@@ -28,7 +28,7 @@
 *       .. Array Arguments ..
 *       INTEGER            COLTYP( * ), INDX( * ), INDXC( * ), INDXP( * ),
 *      $                   INDXQ( * )
-*       REAL               D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ),
+*       REAL               D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ),
 *      $                   W( * ), Z( * )
 *       ..
 *
@@ -123,9 +123,9 @@
 *>         process.
 *> \endverbatim
 *>
-*> \param[out] DLAMDA
+*> \param[out] DLAMBDA
 *> \verbatim
-*>          DLAMDA is REAL array, dimension (N)
+*>          DLAMBDA is REAL array, dimension (N)
 *>         A copy of the first K eigenvalues which will be used by
 *>         SLAED3 to form the secular equation.
 *> \endverbatim
@@ -148,7 +148,7 @@
 *> \param[out] INDX
 *> \verbatim
 *>          INDX is INTEGER array, dimension (N)
-*>         The permutation used to sort the contents of DLAMDA into
+*>         The permutation used to sort the contents of DLAMBDA into
 *>         ascending order.
 *> \endverbatim
 *>
@@ -207,7 +207,7 @@
 *>  Modified by Francoise Tisseur, University of Tennessee
 *>
 *  =====================================================================
-      SUBROUTINE SLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMDA, W,
+      SUBROUTINE SLAED2( K, N, N1, D, Q, LDQ, INDXQ, RHO, Z, DLAMBDA, W,
      $                   Q2, INDX, INDXC, INDXP, COLTYP, INFO )
 *
 *  -- LAPACK computational routine --
@@ -221,7 +221,7 @@
 *     .. Array Arguments ..
       INTEGER            COLTYP( * ), INDX( * ), INDXC( * ), INDXP( * ),
      $                   INDXQ( * )
-      REAL               D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ),
+      REAL               D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ),
      $                   W( * ), Z( * )
 *     ..
 *
@@ -300,9 +300,9 @@
 *     re-integrate the deflated parts from the last pass
 *
       DO 20 I = 1, N
-         DLAMDA( I ) = D( INDXQ( I ) )
+         DLAMBDA( I ) = D( INDXQ( I ) )
    20 CONTINUE
-      CALL SLAMRG( N1, N2, DLAMDA, 1, 1, INDXC )
+      CALL SLAMRG( N1, N2, DLAMBDA, 1, 1, INDXC )
       DO 30 I = 1, N
          INDX( I ) = INDXQ( INDXC( I ) )
    30 CONTINUE
@@ -324,11 +324,11 @@
          DO 40 J = 1, N
             I = INDX( J )
             CALL SCOPY( N, Q( 1, I ), 1, Q2( IQ2 ), 1 )
-            DLAMDA( J ) = D( I )
+            DLAMBDA( J ) = D( I )
             IQ2 = IQ2 + N
    40    CONTINUE
          CALL SLACPY( 'A', N, N, Q2, N, Q, LDQ )
-         CALL SCOPY( N, DLAMDA, 1, D, 1 )
+         CALL SCOPY( N, DLAMBDA, 1, D, 1 )
          GO TO 190
       END IF
 *
@@ -421,7 +421,7 @@
             PJ = NJ
          ELSE
             K = K + 1
-            DLAMDA( K ) = D( PJ )
+            DLAMBDA( K ) = D( PJ )
             W( K ) = Z( PJ )
             INDXP( K ) = PJ
             PJ = NJ
@@ -433,7 +433,7 @@
 *     Record the last eigenvalue.
 *
       K = K + 1
-      DLAMDA( K ) = D( PJ )
+      DLAMBDA( K ) = D( PJ )
       W( K ) = Z( PJ )
       INDXP( K ) = PJ
 *
@@ -470,9 +470,9 @@
          PSM( CT ) = PSM( CT ) + 1
   130 CONTINUE
 *
-*     Sort the eigenvalues and corresponding eigenvectors into DLAMDA
+*     Sort the eigenvalues and corresponding eigenvectors into DLAMBDA
 *     and Q2 respectively.  The eigenvalues/vectors which were not
-*     deflated go into the first K slots of DLAMDA and Q2 respectively,
+*     deflated go into the first K slots of DLAMBDA and Q2 respectively,
 *     while those which were deflated go into the last N - K slots.
 *
       I = 1
diff --git a/lapack-netlib/SRC/slaed3.f b/lapack-netlib/SRC/slaed3.f
index e84f22be1..44c601f91 100644
--- a/lapack-netlib/SRC/slaed3.f
+++ b/lapack-netlib/SRC/slaed3.f
@@ -18,7 +18,7 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE SLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMDA, Q2, INDX,
+*       SUBROUTINE SLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMBDA, Q2, INDX,
 *                          CTOT, W, S, INFO )
 *
 *       .. Scalar Arguments ..
@@ -27,7 +27,7 @@
 *       ..
 *       .. Array Arguments ..
 *       INTEGER            CTOT( * ), INDX( * )
-*       REAL               D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ),
+*       REAL               D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ),
 *      $                   S( * ), W( * )
 *       ..
 *
@@ -44,12 +44,6 @@
 *> being combined by the matrix of eigenvectors of the K-by-K system
 *> which is solved here.
 *>
-*> This code makes very mild assumptions about floating point
-*> arithmetic. It will work on machines with a guard digit in
-*> add/subtract, or on those binary machines without guard digits
-*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
-*> It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
@@ -104,14 +98,12 @@
 *>          RHO >= 0 required.
 *> \endverbatim
 *>
-*> \param[in,out] DLAMDA
+*> \param[in] DLAMBDA
 *> \verbatim
-*>          DLAMDA is REAL array, dimension (K)
+*>          DLAMBDA is REAL array, dimension (K)
 *>          The first K elements of this array contain the old roots
 *>          of the deflated updating problem.  These are the poles
-*>          of the secular equation. May be changed on output by
-*>          having lowest order bit set to zero on Cray X-MP, Cray Y-MP,
-*>          Cray-2, or Cray C-90, as described above.
+*>          of the secular equation.
 *> \endverbatim
 *>
 *> \param[in] Q2
@@ -180,7 +172,7 @@
 *>  Modified by Francoise Tisseur, University of Tennessee
 *>
 *  =====================================================================
-      SUBROUTINE SLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMDA, Q2, INDX,
+      SUBROUTINE SLAED3( K, N, N1, D, Q, LDQ, RHO, DLAMBDA, Q2, INDX,
      $                   CTOT, W, S, INFO )
 *
 *  -- LAPACK computational routine --
@@ -193,7 +185,7 @@
 *     ..
 *     .. Array Arguments ..
       INTEGER            CTOT( * ), INDX( * )
-      REAL               D( * ), DLAMDA( * ), Q( LDQ, * ), Q2( * ),
+      REAL               D( * ), DLAMBDA( * ), Q( LDQ, * ), Q2( * ),
      $                   S( * ), W( * )
 *     ..
 *
@@ -208,8 +200,8 @@
       REAL               TEMP
 *     ..
 *     .. External Functions ..
-      REAL               SLAMC3, SNRM2
-      EXTERNAL           SLAMC3, SNRM2
+      REAL               SNRM2
+      EXTERNAL           SNRM2
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SCOPY, SGEMM, SLACPY, SLAED4, SLASET, XERBLA
@@ -239,30 +231,9 @@
 *
       IF( K.EQ.0 )
      $   RETURN
-*
-*     Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can
-*     be computed with high relative accuracy (barring over/underflow).
-*     This is a problem on machines without a guard digit in
-*     add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2).
-*     The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I),
-*     which on any of these machines zeros out the bottommost
-*     bit of DLAMDA(I) if it is 1; this makes the subsequent
-*     subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation
-*     occurs. On binary machines with a guard digit (almost all
-*     machines) it does not change DLAMDA(I) at all. On hexadecimal
-*     and decimal machines with a guard digit, it slightly
-*     changes the bottommost bits of DLAMDA(I). It does not account
-*     for hexadecimal or decimal machines without guard digits
-*     (we know of none). We use a subroutine call to compute
-*     2*DLAMBDA(I) to prevent optimizing compilers from eliminating
-*     this code.
-*
-      DO 10 I = 1, K
-         DLAMDA( I ) = SLAMC3( DLAMDA( I ), DLAMDA( I ) ) - DLAMDA( I )
-   10 CONTINUE
 *
       DO 20 J = 1, K
-         CALL SLAED4( K, J, DLAMDA, W, Q( 1, J ), RHO, D( J ), INFO )
+         CALL SLAED4( K, J, DLAMBDA, W, Q( 1, J ), RHO, D( J ), INFO )
 *
 *        If the zero finder fails, the computation is terminated.
 *
@@ -293,10 +264,10 @@
       CALL SCOPY( K, Q, LDQ+1, W, 1 )
       DO 60 J = 1, K
          DO 40 I = 1, J - 1
-            W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) )
+            W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) )
    40    CONTINUE
          DO 50 I = J + 1, K
-            W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) )
+            W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) )
    50    CONTINUE
    60 CONTINUE
       DO 70 I = 1, K
diff --git a/lapack-netlib/SRC/slaed8.f b/lapack-netlib/SRC/slaed8.f
index 9c8ba440c..9dd8a15f6 100644
--- a/lapack-netlib/SRC/slaed8.f
+++ b/lapack-netlib/SRC/slaed8.f
@@ -19,7 +19,7 @@
 *  ===========
 *
 *       SUBROUTINE SLAED8( ICOMPQ, K, N, QSIZ, D, Q, LDQ, INDXQ, RHO,
-*                          CUTPNT, Z, DLAMDA, Q2, LDQ2, W, PERM, GIVPTR,
+*                          CUTPNT, Z, DLAMBDA, Q2, LDQ2, W, PERM, GIVPTR,
 *                          GIVCOL, GIVNUM, INDXP, INDX, INFO )
 *
 *       .. Scalar Arguments ..
@@ -30,7 +30,7 @@
 *       .. Array Arguments ..
 *       INTEGER            GIVCOL( 2, * ), INDX( * ), INDXP( * ),
 *      $                   INDXQ( * ), PERM( * )
-*       REAL               D( * ), DLAMDA( * ), GIVNUM( 2, * ),
+*       REAL               D( * ), DLAMBDA( * ), GIVNUM( 2, * ),
 *      $                   Q( LDQ, * ), Q2( LDQ2, * ), W( * ), Z( * )
 *       ..
 *
@@ -141,9 +141,9 @@
 *>         process.
 *> \endverbatim
 *>
-*> \param[out] DLAMDA
+*> \param[out] DLAMBDA
 *> \verbatim
-*>          DLAMDA is REAL array, dimension (N)
+*>          DLAMBDA is REAL array, dimension (N)
 *>         A copy of the first K eigenvalues which will be used by
 *>         SLAED3 to form the secular equation.
 *> \endverbatim
@@ -238,7 +238,7 @@
 *
 *  =====================================================================
       SUBROUTINE SLAED8( ICOMPQ, K, N, QSIZ, D, Q, LDQ, INDXQ, RHO,
-     $                   CUTPNT, Z, DLAMDA, Q2, LDQ2, W, PERM, GIVPTR,
+     $                   CUTPNT, Z, DLAMBDA, Q2, LDQ2, W, PERM, GIVPTR,
      $                   GIVCOL, GIVNUM, INDXP, INDX, INFO )
 *
 *  -- LAPACK computational routine --
@@ -253,7 +253,7 @@
 *     .. Array Arguments ..
       INTEGER            GIVCOL( 2, * ), INDX( * ), INDXP( * ),
      $                   INDXQ( * ), PERM( * )
-      REAL               D( * ), DLAMDA( * ), GIVNUM( 2, * ),
+      REAL               D( * ), DLAMBDA( * ), GIVNUM( 2, * ),
      $                   Q( LDQ, * ), Q2( LDQ2, * ), W( * ), Z( * )
 *     ..
 *
@@ -339,14 +339,14 @@
          INDXQ( I ) = INDXQ( I ) + CUTPNT
    20 CONTINUE
       DO 30 I = 1, N
-         DLAMDA( I ) = D( INDXQ( I ) )
+         DLAMBDA( I ) = D( INDXQ( I ) )
          W( I ) = Z( INDXQ( I ) )
    30 CONTINUE
       I = 1
       J = CUTPNT + 1
-      CALL SLAMRG( N1, N2, DLAMDA, 1, 1, INDX )
+      CALL SLAMRG( N1, N2, DLAMBDA, 1, 1, INDX )
       DO 40 I = 1, N
-         D( I ) = DLAMDA( INDX( I ) )
+         D( I ) = DLAMBDA( INDX( I ) )
          Z( I ) = W( INDX( I ) )
    40 CONTINUE
 *
@@ -464,7 +464,7 @@
          ELSE
             K = K + 1
             W( K ) = Z( JLAM )
-            DLAMDA( K ) = D( JLAM )
+            DLAMBDA( K ) = D( JLAM )
             INDXP( K ) = JLAM
             JLAM = J
          END IF
@@ -476,26 +476,26 @@
 *
       K = K + 1
       W( K ) = Z( JLAM )
-      DLAMDA( K ) = D( JLAM )
+      DLAMBDA( K ) = D( JLAM )
       INDXP( K ) = JLAM
 *
   110 CONTINUE
 *
-*     Sort the eigenvalues and corresponding eigenvectors into DLAMDA
+*     Sort the eigenvalues and corresponding eigenvectors into DLAMBDA
 *     and Q2 respectively.  The eigenvalues/vectors which were not
-*     deflated go into the first K slots of DLAMDA and Q2 respectively,
+*     deflated go into the first K slots of DLAMBDA and Q2 respectively,
 *     while those which were deflated go into the last N - K slots.
 *
       IF( ICOMPQ.EQ.0 ) THEN
          DO 120 J = 1, N
             JP = INDXP( J )
-            DLAMDA( J ) = D( JP )
+            DLAMBDA( J ) = D( JP )
             PERM( J ) = INDXQ( INDX( JP ) )
   120    CONTINUE
       ELSE
          DO 130 J = 1, N
             JP = INDXP( J )
-            DLAMDA( J ) = D( JP )
+            DLAMBDA( J ) = D( JP )
             PERM( J ) = INDXQ( INDX( JP ) )
             CALL SCOPY( QSIZ, Q( 1, PERM( J ) ), 1, Q2( 1, J ), 1 )
   130    CONTINUE
@@ -506,9 +506,9 @@
 *
       IF( K.LT.N ) THEN
          IF( ICOMPQ.EQ.0 ) THEN
-            CALL SCOPY( N-K, DLAMDA( K+1 ), 1, D( K+1 ), 1 )
+            CALL SCOPY( N-K, DLAMBDA( K+1 ), 1, D( K+1 ), 1 )
          ELSE
-            CALL SCOPY( N-K, DLAMDA( K+1 ), 1, D( K+1 ), 1 )
+            CALL SCOPY( N-K, DLAMBDA( K+1 ), 1, D( K+1 ), 1 )
             CALL SLACPY( 'A', QSIZ, N-K, Q2( 1, K+1 ), LDQ2,
      $                   Q( 1, K+1 ), LDQ )
          END IF
diff --git a/lapack-netlib/SRC/slaed9.f b/lapack-netlib/SRC/slaed9.f
index 4d07416e9..d1b7b29fd 100644
--- a/lapack-netlib/SRC/slaed9.f
+++ b/lapack-netlib/SRC/slaed9.f
@@ -18,15 +18,15 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE SLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMDA, W,
-*                          S, LDS, INFO )
+*       SUBROUTINE SLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMBDA,
+*                          W, S, LDS, INFO )
 *
 *       .. Scalar Arguments ..
 *       INTEGER            INFO, K, KSTART, KSTOP, LDQ, LDS, N
 *       REAL               RHO
 *       ..
 *       .. Array Arguments ..
-*       REAL               D( * ), DLAMDA( * ), Q( LDQ, * ), S( LDS, * ),
+*       REAL               D( * ), DLAMBDA( * ), Q( LDQ, * ), S( LDS, * ),
 *      $                   W( * )
 *       ..
 *
@@ -96,9 +96,9 @@
 *>          RHO >= 0 required.
 *> \endverbatim
 *>
-*> \param[in] DLAMDA
+*> \param[in] DLAMBDA
 *> \verbatim
-*>          DLAMDA is REAL array, dimension (K)
+*>          DLAMBDA is REAL array, dimension (K)
 *>          The first K elements of this array contain the old roots
 *>          of the deflated updating problem.  These are the poles
 *>          of the secular equation.
@@ -151,8 +151,8 @@
 *> at Berkeley, USA
 *
 *  =====================================================================
-      SUBROUTINE SLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMDA, W,
-     $                   S, LDS, INFO )
+      SUBROUTINE SLAED9( K, KSTART, KSTOP, N, D, Q, LDQ, RHO, DLAMBDA,
+     $                   W, S, LDS, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
@@ -163,7 +163,7 @@
       REAL               RHO
 *     ..
 *     .. Array Arguments ..
-      REAL               D( * ), DLAMDA( * ), Q( LDQ, * ), S( LDS, * ),
+      REAL               D( * ), DLAMBDA( * ), Q( LDQ, * ), S( LDS, * ),
      $                   W( * )
 *     ..
 *
@@ -174,8 +174,8 @@
       REAL               TEMP
 *     ..
 *     .. External Functions ..
-      REAL               SLAMC3, SNRM2
-      EXTERNAL           SLAMC3, SNRM2
+      REAL               SNRM2
+      EXTERNAL           SNRM2
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SCOPY, SLAED4, XERBLA
@@ -212,30 +212,9 @@
 *
       IF( K.EQ.0 )
      $   RETURN
-*
-*     Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can
-*     be computed with high relative accuracy (barring over/underflow).
-*     This is a problem on machines without a guard digit in
-*     add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2).
-*     The following code replaces DLAMDA(I) by 2*DLAMDA(I)-DLAMDA(I),
-*     which on any of these machines zeros out the bottommost
-*     bit of DLAMDA(I) if it is 1; this makes the subsequent
-*     subtractions DLAMDA(I)-DLAMDA(J) unproblematic when cancellation
-*     occurs. On binary machines with a guard digit (almost all
-*     machines) it does not change DLAMDA(I) at all. On hexadecimal
-*     and decimal machines with a guard digit, it slightly
-*     changes the bottommost bits of DLAMDA(I). It does not account
-*     for hexadecimal or decimal machines without guard digits
-*     (we know of none). We use a subroutine call to compute
-*     2*DLAMBDA(I) to prevent optimizing compilers from eliminating
-*     this code.
-*
-      DO 10 I = 1, N
-         DLAMDA( I ) = SLAMC3( DLAMDA( I ), DLAMDA( I ) ) - DLAMDA( I )
-   10 CONTINUE
 *
       DO 20 J = KSTART, KSTOP
-         CALL SLAED4( K, J, DLAMDA, W, Q( 1, J ), RHO, D( J ), INFO )
+         CALL SLAED4( K, J, DLAMBDA, W, Q( 1, J ), RHO, D( J ), INFO )
 *
 *        If the zero finder fails, the computation is terminated.
 *
@@ -261,10 +240,10 @@
       CALL SCOPY( K, Q, LDQ+1, W, 1 )
       DO 70 J = 1, K
          DO 50 I = 1, J - 1
-            W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) )
+            W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) )
    50    CONTINUE
          DO 60 I = J + 1, K
-            W( I ) = W( I )*( Q( I, J ) / ( DLAMDA( I )-DLAMDA( J ) ) )
+            W( I ) = W( I )*( Q( I, J )/( DLAMBDA( I )-DLAMBDA( J ) ) )
    60    CONTINUE
    70 CONTINUE
       DO 80 I = 1, K
diff --git a/lapack-netlib/SRC/slals0.f b/lapack-netlib/SRC/slals0.f
index 7d44e2864..f168dc653 100644
--- a/lapack-netlib/SRC/slals0.f
+++ b/lapack-netlib/SRC/slals0.f
@@ -389,6 +389,11 @@
      $                ( POLES( I, 2 ).EQ.ZERO ) ) THEN
                      WORK( I ) = ZERO
                   ELSE
+*
+*                    Use calls to the subroutine SLAMC3 to enforce the
+*                    parentheses (x+y)+z. The goal is to prevent
+*                    optimizing compilers from doing x+(y+z).
+*
                      WORK( I ) = POLES( I, 2 )*Z( I ) /
      $                           ( SLAMC3( POLES( I, 2 ), DSIGJ )-
      $                           DIFLJ ) / ( POLES( I, 2 )+DJ )
@@ -440,6 +445,11 @@
                   IF( Z( J ).EQ.ZERO ) THEN
                      WORK( I ) = ZERO
                   ELSE
+*
+*                    Use calls to the subroutine SLAMC3 to enforce the
+*                    parentheses (x+y)+z. The goal is to prevent
+*                    optimizing compilers from doing x+(y+z).
+*
                      WORK( I ) = Z( J ) / ( SLAMC3( DSIGJ, -POLES( I+1,
      $                           2 ) )-DIFR( I, 1 ) ) /
      $                           ( DSIGJ+POLES( I, 1 ) ) / DIFR( I, 2 )
diff --git a/lapack-netlib/SRC/slalsa.f b/lapack-netlib/SRC/slalsa.f
index 465455e4a..95becc76e 100644
--- a/lapack-netlib/SRC/slalsa.f
+++ b/lapack-netlib/SRC/slalsa.f
@@ -43,9 +43,9 @@
 *>
 *> \verbatim
 *>
-*> SLALSA is an itermediate step in solving the least squares problem
+*> SLALSA is an intermediate step in solving the least squares problem
 *> by computing the SVD of the coefficient matrix in compact form (The
-*> singular vectors are computed as products of simple orthorgonal
+*> singular vectors are computed as products of simple orthogonal
 *> matrices.).
 *>
 *> If ICOMPQ = 0, SLALSA applies the inverse of the left singular vector
diff --git a/lapack-netlib/SRC/slalsd.f b/lapack-netlib/SRC/slalsd.f
index 2197f728e..9943a52d9 100644
--- a/lapack-netlib/SRC/slalsd.f
+++ b/lapack-netlib/SRC/slalsd.f
@@ -47,12 +47,6 @@
 *> problem; in this case a minimum norm solution is returned.
 *> The actual singular values are returned in D in ascending order.
 *>
-*> This code makes very mild assumptions about floating point
-*> arithmetic. It will work on machines with a guard digit in
-*> add/subtract, or on those binary machines without guard digits
-*> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2.
-*> It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/slamswlq.f b/lapack-netlib/SRC/slamswlq.f
index d4996b1f2..432afaded 100644
--- a/lapack-netlib/SRC/slamswlq.f
+++ b/lapack-netlib/SRC/slamswlq.f
@@ -127,17 +127,20 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) REAL array, dimension (MAX(1,LWORK))
+*>          (workspace) REAL array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
-*>          If SIDE = 'L', LWORK >= max(1,NB) * MB;
-*>          if SIDE = 'R', LWORK >= max(1,M) * MB.
+*>
+*>          If MIN(M,N,K) = 0, LWORK >= 1.
+*>          If SIDE = 'L', LWORK >= max(1,NB*MB).
+*>          If SIDE = 'R', LWORK >= max(1,M*MB).
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>          only calculates the optimal size of the WORK array, returns
+*>          only calculates the minimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
 *> \endverbatim
@@ -189,33 +192,38 @@
 *>     SIAM J. Sci. Comput, vol. 34, no. 1, 2012
 *> \endverbatim
 *>
+*> \ingroup lamswlq
+*>
 *  =====================================================================
       SUBROUTINE SLAMSWLQ( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T,
-     $    LDT, C, LDC, WORK, LWORK, INFO )
+     $                     LDT, C, LDC, WORK, LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
-      CHARACTER         SIDE, TRANS
-      INTEGER           INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC
+      CHARACTER          SIDE, TRANS
+      INTEGER            INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC
 *     ..
 *     .. Array Arguments ..
-      REAL              A( LDA, * ), WORK( * ), C(LDC, * ),
-     $      T( LDT, * )
+      REAL               A( LDA, * ), WORK( * ), C( LDC, * ),
+     $                   T( LDT, * )
 *     ..
 *
 * =====================================================================
 *
 *     ..
 *     .. Local Scalars ..
-      LOGICAL    LEFT, RIGHT, TRAN, NOTRAN, LQUERY
-      INTEGER    I, II, KK, LW, CTR
+      LOGICAL            LEFT, RIGHT, TRAN, NOTRAN, LQUERY
+      INTEGER            I, II, KK, LW, CTR, MINMNK, LWMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
+*     ..
 *     .. External Subroutines ..
       EXTERNAL           STPMLQT, SGEMLQT, XERBLA
 *     ..
@@ -223,52 +231,60 @@
 *
 *     Test the input arguments
 *
-      LQUERY  = LWORK.LT.0
+      LQUERY  = ( LWORK.EQ.-1 )
       NOTRAN  = LSAME( TRANS, 'N' )
       TRAN    = LSAME( TRANS, 'T' )
       LEFT    = LSAME( SIDE, 'L' )
       RIGHT   = LSAME( SIDE, 'R' )
-      IF (LEFT) THEN
+      IF( LEFT ) THEN
         LW = N * MB
       ELSE
         LW = M * MB
       END IF
+*
+      MINMNK = MIN( M, N, K )
+      IF( MINMNK.EQ.0 ) THEN
+        LWMIN = 1
+      ELSE
+        LWMIN = MAX( 1, LW )
+      END IF
 *
       INFO = 0
       IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN
-         INFO = -1
+        INFO = -1
       ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN
-         INFO = -2
+        INFO = -2
       ELSE IF( K.LT.0 ) THEN
         INFO = -5
       ELSE IF( M.LT.K ) THEN
         INFO = -3
       ELSE IF( N.LT.0 ) THEN
         INFO = -4
-      ELSE IF( K.LT.MB .OR. MB.LT.1) THEN
+      ELSE IF( K.LT.MB .OR. MB.LT.1 ) THEN
         INFO = -6
       ELSE IF( LDA.LT.MAX( 1, K ) ) THEN
         INFO = -9
-      ELSE IF( LDT.LT.MAX( 1, MB) ) THEN
+      ELSE IF( LDT.LT.MAX( 1, MB ) ) THEN
         INFO = -11
       ELSE IF( LDC.LT.MAX( 1, M ) ) THEN
-         INFO = -13
-      ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN
+        INFO = -13
+      ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN
         INFO = -15
       END IF
 *
+      IF( INFO.EQ.0 ) THEN
+        WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
+      END IF
       IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'SLAMSWLQ', -INFO )
-        WORK(1) = LW
         RETURN
-      ELSE IF (LQUERY) THEN
-        WORK(1) = LW
+      ELSE IF( LQUERY ) THEN
         RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( MIN(M,N,K).EQ.0 ) THEN
+      IF( MINMNK.EQ.0 ) THEN
         RETURN
       END IF
 *
@@ -402,7 +418,7 @@
 *
       END IF
 *
-      WORK(1) = LW
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       RETURN
 *
 *     End of SLAMSWLQ
diff --git a/lapack-netlib/SRC/slamtsqr.f b/lapack-netlib/SRC/slamtsqr.f
index 960b794de..f9b167aea 100644
--- a/lapack-netlib/SRC/slamtsqr.f
+++ b/lapack-netlib/SRC/slamtsqr.f
@@ -128,22 +128,24 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) REAL array, dimension (MAX(1,LWORK))
-*>
+*>          (workspace) REAL array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
+*>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
+*>          If MIN(M,N,K) = 0, LWORK >= 1.
+*>          If SIDE = 'L', LWORK >= max(1,N*NB).
+*>          If SIDE = 'R', LWORK >= max(1,MB*NB).
 *>
-*>          If SIDE = 'L', LWORK >= max(1,N)*NB;
-*>          if SIDE = 'R', LWORK >= max(1,MB)*NB.
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>          only calculates the optimal size of the WORK array, returns
+*>          only calculates the minimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
-*>
 *> \endverbatim
+*>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
@@ -191,33 +193,38 @@
 *>     SIAM J. Sci. Comput, vol. 34, no. 1, 2012
 *> \endverbatim
 *>
+*> \ingroup lamtsqr
+*>
 *  =====================================================================
       SUBROUTINE SLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T,
-     $        LDT, C, LDC, WORK, LWORK, INFO )
+     $                     LDT, C, LDC, WORK, LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
-      CHARACTER         SIDE, TRANS
-      INTEGER           INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC
+      CHARACTER          SIDE, TRANS
+      INTEGER            INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC
 *     ..
 *     .. Array Arguments ..
-      REAL              A( LDA, * ), WORK( * ), C(LDC, * ),
-     $                T( LDT, * )
+      REAL               A( LDA, * ), WORK( * ), C( LDC, * ),
+     $                   T( LDT, * )
 *     ..
 *
 * =====================================================================
 *
 *     ..
 *     .. Local Scalars ..
-      LOGICAL    LEFT, RIGHT, TRAN, NOTRAN, LQUERY
-      INTEGER    I, II, KK, LW, CTR, Q
+      LOGICAL            LEFT, RIGHT, TRAN, NOTRAN, LQUERY
+      INTEGER            I, II, KK, LW, CTR, Q, MINMNK, LWMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
+*     ..
 *     .. External Subroutines ..
       EXTERNAL           SGEMQRT, STPMQRT, XERBLA
 *     ..
@@ -225,12 +232,13 @@
 *
 *     Test the input arguments
 *
-      LQUERY  = LWORK.LT.0
+      INFO = 0
+      LQUERY  = ( LWORK.EQ.-1 )
       NOTRAN  = LSAME( TRANS, 'N' )
       TRAN    = LSAME( TRANS, 'T' )
       LEFT    = LSAME( SIDE, 'L' )
       RIGHT   = LSAME( SIDE, 'R' )
-      IF (LEFT) THEN
+      IF( LEFT ) THEN
         LW = N * NB
         Q = M
       ELSE
@@ -238,11 +246,17 @@
         Q = N
       END IF
 *
-      INFO = 0
+      MINMNK = MIN( M, N, K )
+      IF( MINMNK.EQ.0 ) THEN
+        LWMIN = 1
+      ELSE
+        LWMIN = MAX( 1, LW )
+      END IF
+*
       IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN
-         INFO = -1
+        INFO = -1
       ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN
-         INFO = -2
+        INFO = -2
       ELSE IF( M.LT.K ) THEN
         INFO = -3
       ELSE IF( N.LT.0 ) THEN
@@ -253,38 +267,38 @@
         INFO = -7
       ELSE IF( LDA.LT.MAX( 1, Q ) ) THEN
         INFO = -9
-      ELSE IF( LDT.LT.MAX( 1, NB) ) THEN
+      ELSE IF( LDT.LT.MAX( 1, NB ) ) THEN
         INFO = -11
       ELSE IF( LDC.LT.MAX( 1, M ) ) THEN
-         INFO = -13
-      ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN
+        INFO = -13
+      ELSE IF( LWORK.LT.LWMIN. AND. (.NOT.LQUERY) ) THEN
         INFO = -15
       END IF
 *
-*     Determine the block size if it is tall skinny or short and wide
-*
-      IF( INFO.EQ.0)  THEN
-          WORK(1) = LW
+      IF( INFO.EQ.0 ) THEN
+        WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       END IF
 *
       IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'SLAMTSQR', -INFO )
         RETURN
-      ELSE IF (LQUERY) THEN
-       RETURN
+      ELSE IF( LQUERY ) THEN
+        RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( MIN(M,N,K).EQ.0 ) THEN
+      IF( MINMNK.EQ.0 ) THEN
         RETURN
       END IF
+*
+*     Determine the block size if it is tall skinny or short and wide
 *
       IF((MB.LE.K).OR.(MB.GE.MAX(M,N,K))) THEN
         CALL SGEMQRT( SIDE, TRANS, M, N, K, NB, A, LDA,
-     $        T, LDT, C, LDC, WORK, INFO)
+     $        T, LDT, C, LDC, WORK, INFO )
         RETURN
-       END IF
+      END IF
 *
       IF(LEFT.AND.NOTRAN) THEN
 *
@@ -410,7 +424,7 @@
 *
       END IF
 *
-      WORK(1) = LW
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       RETURN
 *
 *     End of SLAMTSQR
diff --git a/lapack-netlib/SRC/slaqp2rk.c b/lapack-netlib/SRC/slaqp2rk.c
new file mode 100644
index 000000000..0bfa71ab9
--- /dev/null
+++ b/lapack-netlib/SRC/slaqp2rk.c
@@ -0,0 +1,918 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) {
+	float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static double dpow_ui(double x, integer n) {
+	double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) {
+	complex pow={1.0,0.0}; unsigned long int u;
+		if(n != 0) {
+		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
+		for(u = n; ; ) {
+			if(u & 01) pow.r *= x.r, pow.i *= x.i;
+			if(u >>= 1) x.r *= x.r, x.i *= x.i;
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) {
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
+	_Dcomplex pow={1.0,0.0}; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
+		for(u = n; ; ) {
+			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
+			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) {
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) {
+	integer pow; unsigned long int u;
+	if (n <= 0) {
+		if (n == 0 || x == 1) pow = 1;
+		else if (x != -1) pow = x == 0 ? 1/x : 0;
+		else n = -n;
+	}
+	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
+		u = n;
+		for(pow = 1; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{
+	double m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{
+	float m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
+		if (w[i-1]>m) mi=i ,m=w[i-1];
+	return mi-s+1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif	
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static integer c__1 = 1;
+
+/* Subroutine */ int slaqp2rk_(integer *m, integer *n, integer *nrhs, integer 
+	*ioffset, integer *kmax, real *abstol, real *reltol, integer *kp1, 
+	real *maxc2nrm, real *a, integer *lda, integer *k, real *maxc2nrmk, 
+	real *relmaxc2nrmk, integer *jpiv, real *tau, real *vn1, real *vn2, 
+	real *work, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2, i__3;
+    real r__1, r__2;
+
+    /* Local variables */
+    real aikk, temp, temp2;
+    extern real snrm2_(integer *, real *, integer *);
+    integer i__, j;
+    real tol3z;
+    integer jmaxc2nrm;
+    extern /* Subroutine */ int slarf_(char *, integer *, integer *, real *, 
+	    integer *, real *, real *, integer *, real *);
+    integer itemp, minmnfact;
+    extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, 
+	    integer *);
+    real myhugeval;
+    integer minmnupdt, kk, kp;
+    extern real slamch_(char *);
+    extern /* Subroutine */ int slarfg_(integer *, real *, real *, integer *, 
+	    real *);
+    extern integer isamax_(integer *, real *, integer *);
+    extern logical sisnan_(real *);
+
+
+/*  -- LAPACK auxiliary routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+
+/*  ===================================================================== */
+
+
+/*     Initialize INFO */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --jpiv;
+    --tau;
+    --vn1;
+    --vn2;
+    --work;
+
+    /* Function Body */
+    *info = 0;
+
+/*     MINMNFACT is the smallest dimension of the submatrix */
+/*     A(IOFFSET+1:M,1:N) to be factorized. */
+
+/*     MINMNUPDT is the smallest dimension */
+/*     of the subarray A(IOFFSET+1:M,1:N+NRHS) to be updated, which */
+/*     contains the submatrices A(IOFFSET+1:M,1:N) and */
+/*     B(IOFFSET+1:M,1:NRHS) as column blocks. */
+
+/* Computing MIN */
+    i__1 = *m - *ioffset;
+    minmnfact = f2cmin(i__1,*n);
+/* Computing MIN */
+    i__1 = *m - *ioffset, i__2 = *n + *nrhs;
+    minmnupdt = f2cmin(i__1,i__2);
+    *kmax = f2cmin(*kmax,minmnfact);
+    tol3z = sqrt(slamch_("Epsilon"));
+    myhugeval = slamch_("Overflow");
+
+/*     Compute the factorization, KK is the main loop index. */
+
+    i__1 = *kmax;
+    for (kk = 1; kk <= i__1; ++kk) {
+
+	i__ = *ioffset + kk;
+
+	if (i__ == 1) {
+
+/*           ============================================================ */
+
+/*           We are at the first column of the original whole matrix A, */
+/*           therefore we use the computed KP1 and MAXC2NRM from the */
+/*           main routine. */
+
+	    kp = *kp1;
+
+/*           ============================================================ */
+
+	} else {
+
+/*           ============================================================ */
+
+/*           Determine the pivot column in KK-th step, i.e. the index */
+/*           of the column with the maximum 2-norm in the */
+/*           submatrix A(I:M,KK:N). */
+
+	    i__2 = *n - kk + 1;
+	    kp = kk - 1 + isamax_(&i__2, &vn1[kk], &c__1);
+
+/*           Determine the maximum column 2-norm and the relative maximum */
+/*           column 2-norm of the submatrix A(I:M,KK:N) in step KK. */
+/*           RELMAXC2NRMK will be computed later, after some condition */
+/*           checks on MAXC2NRMK. */
+
+	    *maxc2nrmk = vn1[kp];
+
+/*           ============================================================ */
+
+/*           Check if the submatrix A(I:M,KK:N) contains NaN, and set */
+/*           INFO parameter to the column number, where the first NaN */
+/*           is found and return from the routine. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (sisnan_(maxc2nrmk)) {
+
+/*              Set K, the number of factorized columns. */
+/*              that are not zero. */
+
+		*k = kk - 1;
+		*info = *k + kp;
+
+/*               Set RELMAXC2NRMK to NaN. */
+
+		*relmaxc2nrmk = *maxc2nrmk;
+
+/*               Array TAU(K+1:MINMNFACT) is not set and contains */
+/*               undefined elements. */
+
+		return 0;
+	    }
+
+/*           ============================================================ */
+
+/*           Quick return, if the submatrix A(I:M,KK:N) is */
+/*           a zero matrix. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (*maxc2nrmk == 0.f) {
+
+/*              Set K, the number of factorized columns. */
+/*              that are not zero. */
+
+		*k = kk - 1;
+		*relmaxc2nrmk = 0.f;
+
+/*              Set TAUs corresponding to the columns that were not */
+/*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO. */
+
+		i__2 = minmnfact;
+		for (j = kk; j <= i__2; ++j) {
+		    tau[j] = 0.f;
+		}
+
+/*              Return from the routine. */
+
+		return 0;
+
+	    }
+
+/*           ============================================================ */
+
+/*           Check if the submatrix A(I:M,KK:N) contains Inf, */
+/*           set INFO parameter to the column number, where */
+/*           the first Inf is found plus N, and continue */
+/*           the computation. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (*info == 0 && *maxc2nrmk > myhugeval) {
+		*info = *n + kk - 1 + kp;
+	    }
+
+/*           ============================================================ */
+
+/*           Test for the second and third stopping criteria. */
+/*           NOTE: There is no need to test for ABSTOL >= ZERO, since */
+/*           MAXC2NRMK is non-negative. Similarly, there is no need */
+/*           to test for RELTOL >= ZERO, since RELMAXC2NRMK is */
+/*           non-negative. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+	    *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm;
+
+	    if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) {
+
+/*              Set K, the number of factorized columns. */
+
+		*k = kk - 1;
+
+/*              Set TAUs corresponding to the columns that were not */
+/*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO. */
+
+		i__2 = minmnfact;
+		for (j = kk; j <= i__2; ++j) {
+		    tau[j] = 0.f;
+		}
+
+/*              Return from the routine. */
+
+		return 0;
+
+	    }
+
+/*           ============================================================ */
+
+/*           End ELSE of IF(I.EQ.1) */
+
+	}
+
+/*        =============================================================== */
+
+/*        If the pivot column is not the first column of the */
+/*        subblock A(1:M,KK:N): */
+/*        1) swap the KK-th column and the KP-th pivot column */
+/*           in A(1:M,1:N); */
+/*        2) copy the KK-th element into the KP-th element of the partial */
+/*           and exact 2-norm vectors VN1 and VN2. ( Swap is not needed */
+/*           for VN1 and VN2 since we use the element with the index */
+/*           larger than KK in the next loop step.) */
+/*        3) Save the pivot interchange with the indices relative to the */
+/*           the original matrix A, not the block A(1:M,1:N). */
+
+	if (kp != kk) {
+	    sswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[kk * a_dim1 + 1], &c__1);
+	    vn1[kp] = vn1[kk];
+	    vn2[kp] = vn2[kk];
+	    itemp = jpiv[kp];
+	    jpiv[kp] = jpiv[kk];
+	    jpiv[kk] = itemp;
+	}
+
+/*        Generate elementary reflector H(KK) using the column A(I:M,KK), */
+/*        if the column has more than one element, otherwise */
+/*        the elementary reflector would be an identity matrix, */
+/*        and TAU(KK) = ZERO. */
+
+	if (i__ < *m) {
+	    i__2 = *m - i__ + 1;
+	    slarfg_(&i__2, &a[i__ + kk * a_dim1], &a[i__ + 1 + kk * a_dim1], &
+		    c__1, &tau[kk]);
+	} else {
+	    tau[kk] = 0.f;
+	}
+
+/*        Check if TAU(KK) contains NaN, set INFO parameter */
+/*        to the column number where NaN is found and return from */
+/*        the routine. */
+/*        NOTE: There is no need to check TAU(KK) for Inf, */
+/*        since SLARFG cannot produce TAU(KK) or Householder vector */
+/*        below the diagonal containing Inf. Only BETA on the diagonal, */
+/*        returned by SLARFG can contain Inf, which requires */
+/*        TAU(KK) to contain NaN. Therefore, this case of generating Inf */
+/*        by SLARFG is covered by checking TAU(KK) for NaN. */
+
+	if (sisnan_(&tau[kk])) {
+	    *k = kk - 1;
+	    *info = kk;
+
+/*           Set MAXC2NRMK and  RELMAXC2NRMK to NaN. */
+
+	    *maxc2nrmk = tau[kk];
+	    *relmaxc2nrmk = tau[kk];
+
+/*           Array TAU(KK:MINMNFACT) is not set and contains */
+/*           undefined elements, except the first element TAU(KK) = NaN. */
+
+	    return 0;
+	}
+
+/*        Apply H(KK)**T to A(I:M,KK+1:N+NRHS) from the left. */
+/*        ( If M >= N, then at KK = N there is no residual matrix, */
+/*         i.e. no columns of A to update, only columns of B. */
+/*         If M < N, then at KK = M-IOFFSET, I = M and we have a */
+/*         one-row residual matrix in A and the elementary */
+/*         reflector is a unit matrix, TAU(KK) = ZERO, i.e. no update */
+/*         is needed for the residual matrix in A and the */
+/*         right-hand-side-matrix in B. */
+/*         Therefore, we update only if */
+/*         KK < MINMNUPDT = f2cmin(M-IOFFSET, N+NRHS) */
+/*         condition is satisfied, not only KK < N+NRHS ) */
+
+	if (kk < minmnupdt) {
+	    aikk = a[i__ + kk * a_dim1];
+	    a[i__ + kk * a_dim1] = 1.f;
+	    i__2 = *m - i__ + 1;
+	    i__3 = *n + *nrhs - kk;
+	    slarf_("Left", &i__2, &i__3, &a[i__ + kk * a_dim1], &c__1, &tau[
+		    kk], &a[i__ + (kk + 1) * a_dim1], lda, &work[1]);
+	    a[i__ + kk * a_dim1] = aikk;
+	}
+
+	if (kk < minmnfact) {
+
+/*           Update the partial column 2-norms for the residual matrix, */
+/*           only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. */
+/*           when KK < f2cmin(M-IOFFSET, N). */
+
+	    i__2 = *n;
+	    for (j = kk + 1; j <= i__2; ++j) {
+		if (vn1[j] != 0.f) {
+
+/*                 NOTE: The following lines follow from the analysis in */
+/*                 Lapack Working Note 176. */
+
+/* Computing 2nd power */
+		    r__2 = (r__1 = a[i__ + j * a_dim1], abs(r__1)) / vn1[j];
+		    temp = 1.f - r__2 * r__2;
+		    temp = f2cmax(temp,0.f);
+/* Computing 2nd power */
+		    r__1 = vn1[j] / vn2[j];
+		    temp2 = temp * (r__1 * r__1);
+		    if (temp2 <= tol3z) {
+
+/*                    Compute the column 2-norm for the partial */
+/*                    column A(I+1:M,J) by explicitly computing it, */
+/*                    and store it in both partial 2-norm vector VN1 */
+/*                    and exact column 2-norm vector VN2. */
+
+			i__3 = *m - i__;
+			vn1[j] = snrm2_(&i__3, &a[i__ + 1 + j * a_dim1], &
+				c__1);
+			vn2[j] = vn1[j];
+
+		    } else {
+
+/*                    Update the column 2-norm for the partial */
+/*                    column A(I+1:M,J) by removing one */
+/*                    element A(I,J) and store it in partial */
+/*                    2-norm vector VN1. */
+
+			vn1[j] *= sqrt(temp);
+
+		    }
+		}
+	    }
+
+	}
+
+/*     End factorization loop */
+
+    }
+
+/*     If we reached this point, all colunms have been factorized, */
+/*     i.e. no condition was triggered to exit the routine. */
+/*     Set the number of factorized columns. */
+
+    *k = *kmax;
+
+/*     We reached the end of the loop, i.e. all KMAX columns were */
+/*     factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before */
+/*     we return. */
+
+    if (*k < minmnfact) {
+
+	i__1 = *n - *k;
+	jmaxc2nrm = *k + isamax_(&i__1, &vn1[*k + 1], &c__1);
+	*maxc2nrmk = vn1[jmaxc2nrm];
+
+	if (*k == 0) {
+	    *relmaxc2nrmk = 1.f;
+	} else {
+	    *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm;
+	}
+
+    } else {
+	*maxc2nrmk = 0.f;
+	*relmaxc2nrmk = 0.f;
+    }
+
+/*     We reached the end of the loop, i.e. all KMAX columns were */
+/*     factorized, set TAUs corresponding to the columns that were */
+/*     not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to ZERO. */
+
+    i__1 = minmnfact;
+    for (j = *k + 1; j <= i__1; ++j) {
+	tau[j] = 0.f;
+    }
+
+    return 0;
+
+/*     End of SLAQP2RK */
+
+} /* slaqp2rk_ */
+
diff --git a/lapack-netlib/SRC/slaqp2rk.f b/lapack-netlib/SRC/slaqp2rk.f
new file mode 100644
index 000000000..f88b0ce90
--- /dev/null
+++ b/lapack-netlib/SRC/slaqp2rk.f
@@ -0,0 +1,713 @@
+*> \brief \b SLAQP2RK computes truncated QR factorization with column pivoting of a real matrix block using Level 2 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SLAQP2RK + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/slaqp2rk.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/slaqp2rk.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/slaqp2rk.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE SLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL,
+*     $                     KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK,
+*     $                     RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK,
+*     $                     INFO )
+*      IMPLICIT NONE
+*
+*     .. Scalar Arguments ..
+*      INTEGER            INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS
+*      REAL               ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+*     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+*      INTEGER            JPIV( * )
+*      REAL               A( LDA, * ), TAU( * ), VN1( * ), VN2( * ),
+*     $                   WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SLAQP2RK computes a truncated (rank K) or full rank Householder QR
+*> factorization with column pivoting of a real matrix
+*> block A(IOFFSET+1:M,1:N) as
+*>
+*>   A * P(K) = Q(K) * R(K).
+*>
+*> The routine uses Level 2 BLAS. The block A(1:IOFFSET,1:N)
+*> is accordingly pivoted, but not factorized.
+*>
+*> The routine also overwrites the right-hand-sides matrix block B
+*> stored in A(IOFFSET+1:M,N+1:N+NRHS) with Q(K)**T * B.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of
+*>          columns of the matrix B. NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in] IOFFSET
+*> \verbatim
+*>          IOFFSET is INTEGER
+*>          The number of rows of the matrix A that must be pivoted
+*>          but not factorized. IOFFSET >= 0.
+*>
+*>          IOFFSET also represents the number of columns of the whole
+*>          original matrix A_orig that have been factorized
+*>          in the previous steps.
+*> \endverbatim
+*>
+*> \param[in] KMAX
+*> \verbatim
+*>          KMAX is INTEGER
+*>
+*>          The first factorization stopping criterion. KMAX >= 0.
+*>
+*>          The maximum number of columns of the matrix A to factorize,
+*>          i.e. the maximum factorization rank.
+*>
+*>          a) If KMAX >= min(M-IOFFSET,N), then this stopping
+*>                criterion is not used, factorize columns
+*>                depending on ABSTOL and RELTOL.
+*>
+*>          b) If KMAX = 0, then this stopping criterion is
+*>             satisfied on input and the routine exits immediately.
+*>             This means that the factorization is not performed,
+*>             the matrices A and B and the arrays TAU, JPIV
+*>             are not modified.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is REAL, cannot be NaN.
+*>
+*>          The second factorization stopping criterion.
+*>
+*>          The absolute tolerance (stopping threshold) for
+*>          maximum column 2-norm of the residual matrix.
+*>          The algorithm converges (stops the factorization) when
+*>          the maximum column 2-norm of the residual matrix
+*>          is less than or equal to ABSTOL.
+*>
+*>          a) If ABSTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and RELTOL.
+*>                This includes the case ABSTOL = -Inf.
+*>
+*>          b) If 0.0 <= ABSTOL then the input value
+*>                of ABSTOL is used.
+*> \endverbatim
+*>
+*> \param[in] RELTOL
+*> \verbatim
+*>          RELTOL is REAL, cannot be NaN.
+*>
+*>          The third factorization stopping criterion.
+*>
+*>          The tolerance (stopping threshold) for the ratio of the
+*>          maximum column 2-norm of the residual matrix to the maximum
+*>          column 2-norm of the original matrix A_orig. The algorithm
+*>          converges (stops the factorization), when this ratio is
+*>          less than or equal to RELTOL.
+*>
+*>          a) If RELTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and ABSTOL.
+*>                This includes the case RELTOL = -Inf.
+*>
+*>          b) If 0.0 <= RELTOL then the input value of RELTOL
+*>                is used.
+*> \endverbatim
+*>
+*> \param[in] KP1
+*> \verbatim
+*>          KP1 is INTEGER
+*>          The index of the column with the maximum 2-norm in
+*>          the whole original matrix A_orig determined in the
+*>          main routine SGEQP3RK. 1 <= KP1 <= N_orig_mat.
+*> \endverbatim
+*>
+*> \param[in] MAXC2NRM
+*> \verbatim
+*>          MAXC2NRM is REAL
+*>          The maximum column 2-norm of the whole original
+*>          matrix A_orig computed in the main routine SGEQP3RK.
+*>          MAXC2NRM >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is REAL array, dimension (LDA,N+NRHS)
+*>          On entry:
+*>              the M-by-N matrix A and M-by-NRHS matrix B, as in
+*>
+*>                                  N     NRHS
+*>              array_A   =   M  [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>          1. The elements in block A(IOFFSET+1:M,1:K) below
+*>             the diagonal together with the array TAU represent
+*>             the orthogonal matrix Q(K) as a product of elementary
+*>             reflectors.
+*>          2. The upper triangular block of the matrix A stored
+*>             in A(IOFFSET+1:M,1:K) is the triangular factor obtained.
+*>          3. The block of the matrix A stored in A(1:IOFFSET,1:N)
+*>             has been accordingly pivoted, but not factorized.
+*>          4. The rest of the array A, block A(IOFFSET+1:M,K+1:N+NRHS).
+*>             The left part A(IOFFSET+1:M,K+1:N) of this block
+*>             contains the residual of the matrix A, and,
+*>             if NRHS > 0, the right part of the block
+*>             A(IOFFSET+1:M,N+1:N+NRHS) contains the block of
+*>             the right-hand-side matrix B. Both these blocks have been
+*>             updated by multiplication from the left by Q(K)**T.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] K
+*> \verbatim
+*>          K is INTEGER
+*>          Factorization rank of the matrix A, i.e. the rank of
+*>          the factor R, which is the same as the number of non-zero
+*>          rows of the factor R. 0 <= K <= min(M-IOFFSET,KMAX,N).
+*>
+*>          K also represents the number of non-zero Householder
+*>          vectors.
+*> \endverbatim
+*>
+*> \param[out] MAXC2NRMK
+*> \verbatim
+*>          MAXC2NRMK is REAL
+*>          The maximum column 2-norm of the residual matrix,
+*>          when the factorization stopped at rank K. MAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] RELMAXC2NRMK
+*> \verbatim
+*>          RELMAXC2NRMK is REAL
+*>          The ratio MAXC2NRMK / MAXC2NRM of the maximum column
+*>          2-norm of the residual matrix (when the factorization
+*>          stopped at rank K) to the maximum column 2-norm of the
+*>          whole original matrix A. RELMAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] JPIV
+*> \verbatim
+*>          JPIV is INTEGER array, dimension (N)
+*>          Column pivot indices, for 1 <= j <= N, column j
+*>          of the matrix A was interchanged with column JPIV(j).
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is REAL array, dimension (min(M-IOFFSET,N))
+*>          The scalar factors of the elementary reflectors.
+*> \endverbatim
+*>
+*> \param[in,out] VN1
+*> \verbatim
+*>          VN1 is REAL array, dimension (N)
+*>          The vector with the partial column norms.
+*> \endverbatim
+*>
+*> \param[in,out] VN2
+*> \verbatim
+*>          VN2 is REAL array, dimension (N)
+*>          The vector with the exact column norms.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is REAL array, dimension (N-1)
+*>          Used in SLARF subroutine to apply an elementary
+*>          reflector from the left.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          1) INFO = 0: successful exit.
+*>          2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was
+*>             detected and the routine stops the computation.
+*>             The j_1-th column of the matrix A or the j_1-th
+*>             element of array TAU contains the first occurrence
+*>             of NaN in the factorization step K+1 ( when K columns
+*>             have been factorized ).
+*>
+*>             On exit:
+*>             K                  is set to the number of
+*>                                   factorized columns without
+*>                                   exception.
+*>             MAXC2NRMK          is set to NaN.
+*>             RELMAXC2NRMK       is set to NaN.
+*>             TAU(K+1:min(M,N))  is not set and contains undefined
+*>                                   elements. If j_1=K+1, TAU(K+1)
+*>                                   may contain NaN.
+*>          3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN
+*>             was detected, but +Inf (or -Inf) was detected and
+*>             the routine continues the computation until completion.
+*>             The (j_2-N)-th column of the matrix A contains the first
+*>             occurrence of +Inf (or -Inf) in the factorization
+*>             step K+1 ( when K columns have been factorized ).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup laqp2rk
+*
+*> \par References:
+*  ================
+*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996.
+*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain.
+*> X. Sun, Computer Science Dept., Duke University, USA.
+*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA.
+*> A BLAS-3 version of the QR factorization with column pivoting.
+*> LAPACK Working Note 114
+*> \htmlonly
+*> <a href="https://www.netlib.org/lapack/lawnspdf/lawn114.pdf">https://www.netlib.org/lapack/lawnspdf/lawn114.pdf</a>
+*> \endhtmlonly
+*> and in
+*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998.
+*> \htmlonly
+*> <a href="https://doi.org/10.1137/S1064827595296732">https://doi.org/10.1137/S1064827595296732</a>
+*> \endhtmlonly
+*>
+*> [2] A partial column norm updating strategy developed in 2006.
+*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia.
+*> On the failure of rank revealing QR factorization software – a case study.
+*> LAPACK Working Note 176.
+*> \htmlonly
+*> <a href="http://www.netlib.org/lapack/lawnspdf/lawn176.pdf">http://www.netlib.org/lapack/lawnspdf/lawn176.pdf</a>
+*> \endhtmlonly
+*> and in
+*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages.
+*> \htmlonly
+*> <a href="https://doi.org/10.1145/1377612.1377616">https://doi.org/10.1145/1377612.1377616</a>
+*> \endhtmlonly
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*>  November  2023, Igor Kozachenko, James Demmel,
+*>                  EECS Department,
+*>                  University of California, Berkeley, USA.
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE SLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL,
+     $                     KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK,
+     $                     RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK,
+     $                     INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS
+      REAL               ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+      INTEGER            JPIV( * )
+      REAL               A( LDA, * ), TAU( * ), VN1( * ), VN2( * ),
+     $                   WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            I, ITEMP, J, JMAXC2NRM, KK, KP, MINMNFACT,
+     $                   MINMNUPDT
+      REAL               AIKK, HUGEVAL, TEMP, TEMP2, TOL3Z
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SLARF, SLARFG, SSWAP
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN, SQRT
+*     ..
+*     .. External Functions ..
+      LOGICAL            SISNAN
+      INTEGER            ISAMAX
+      REAL               SLAMCH, SNRM2
+      EXTERNAL           SISNAN, SLAMCH, ISAMAX, SNRM2
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize INFO
+*
+      INFO = 0
+*
+*     MINMNFACT is the smallest dimension of the submatrix
+*     A(IOFFSET+1:M,1:N) to be factorized.
+*
+*     MINMNUPDT is the smallest dimension
+*     of the subarray A(IOFFSET+1:M,1:N+NRHS) to be updated, which
+*     contains the submatrices A(IOFFSET+1:M,1:N) and
+*     B(IOFFSET+1:M,1:NRHS) as column blocks.
+*
+      MINMNFACT = MIN( M-IOFFSET, N )
+      MINMNUPDT = MIN( M-IOFFSET, N+NRHS )
+      KMAX = MIN( KMAX, MINMNFACT )
+      TOL3Z = SQRT( SLAMCH( 'Epsilon' ) )
+      HUGEVAL = SLAMCH( 'Overflow' )
+*
+*     Compute the factorization, KK is the main loop index.
+*
+      DO KK = 1, KMAX
+*
+         I = IOFFSET + KK
+*
+         IF( I.EQ.1 ) THEN
+*
+*           ============================================================
+*
+*           We are at the first column of the original whole matrix A,
+*           therefore we use the computed KP1 and MAXC2NRM from the
+*           main routine.
+*
+
+            KP = KP1
+*
+*           ============================================================
+*
+         ELSE
+*
+*           ============================================================
+*
+*           Determine the pivot column in KK-th step, i.e. the index
+*           of the column with the maximum 2-norm in the
+*           submatrix A(I:M,KK:N).
+*
+            KP = ( KK-1 ) + ISAMAX( N-KK+1, VN1( KK ), 1 )
+*
+*           Determine the maximum column 2-norm and the relative maximum
+*           column 2-norm of the submatrix A(I:M,KK:N) in step KK.
+*           RELMAXC2NRMK  will be computed later, after some condition
+*           checks on MAXC2NRMK.
+*
+            MAXC2NRMK = VN1( KP )
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,KK:N) contains NaN, and set
+*           INFO parameter to the column number, where the first NaN
+*           is found and return from the routine.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( SISNAN( MAXC2NRMK ) ) THEN
+*
+*              Set K, the number of factorized columns
+*              that are not zero.
+*
+                K = KK - 1
+                INFO = K + KP
+*
+*               Set RELMAXC2NRMK to NaN.
+*
+                RELMAXC2NRMK = MAXC2NRMK
+*
+*               Array TAU(K+1:MINMNFACT) is not set and contains
+*               undefined elements.
+*
+               RETURN
+            END IF
+*
+*           ============================================================
+*
+*           Quick return, if the submatrix A(I:M,KK:N) is
+*           a zero matrix.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( MAXC2NRMK.EQ.ZERO ) THEN
+*
+*              Set K, the number of factorized columns
+*              that are not zero.
+*
+               K = KK - 1
+               RELMAXC2NRMK = ZERO
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO.
+*
+               DO J = KK, MINMNFACT
+                  TAU( J ) = ZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,KK:N) contains Inf,
+*           set INFO parameter to the column number, where
+*           the first Inf is found plus N, and continue
+*           the computation.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN
+               INFO = N + KK - 1 + KP
+            END IF
+*
+*           ============================================================
+*
+*           Test for the second and third stopping criteria.
+*           NOTE: There is no need to test for ABSTOL >= ZERO, since
+*           MAXC2NRMK is non-negative. Similarly, there is no need
+*           to test for RELTOL >= ZERO, since RELMAXC2NRMK is
+*           non-negative.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+
+            RELMAXC2NRMK =  MAXC2NRMK / MAXC2NRM
+*
+            IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN
+*
+*              Set K, the number of factorized columns.
+*
+               K = KK - 1
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to ZERO.
+*
+               DO J = KK, MINMNFACT
+                  TAU( J ) = ZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           End ELSE of IF(I.EQ.1)
+*
+         END IF
+*
+*        ===============================================================
+*
+*        If the pivot column is not the first column of the
+*        subblock A(1:M,KK:N):
+*        1) swap the KK-th column and the KP-th pivot column
+*           in A(1:M,1:N);
+*        2) copy the KK-th element into the KP-th element of the partial
+*           and exact 2-norm vectors VN1 and VN2. ( Swap is not needed
+*           for VN1 and VN2 since we use the element with the index
+*           larger than KK in the next loop step.)
+*        3) Save the pivot interchange with the indices relative to the
+*           original matrix A, not the block A(1:M,1:N).
+*
+         IF( KP.NE.KK ) THEN
+            CALL SSWAP( M, A( 1, KP ), 1, A( 1, KK ), 1 )
+            VN1( KP ) = VN1( KK )
+            VN2( KP ) = VN2( KK )
+            ITEMP = JPIV( KP )
+            JPIV( KP ) = JPIV( KK )
+            JPIV( KK ) = ITEMP
+         END IF
+*
+*        Generate elementary reflector H(KK) using the column A(I:M,KK),
+*        if the column has more than one element, otherwise
+*        the elementary reflector would be an identity matrix,
+*        and TAU(KK) = ZERO.
+*
+         IF( I.LT.M ) THEN
+            CALL SLARFG( M-I+1, A( I, KK ), A( I+1, KK ), 1,
+     $                   TAU( KK ) )
+         ELSE
+            TAU( KK ) = ZERO
+         END IF
+*
+*        Check if TAU(KK) contains NaN, set INFO parameter
+*        to the column number where NaN is found and return from
+*        the routine.
+*        NOTE: There is no need to check TAU(KK) for Inf,
+*        since SLARFG cannot produce TAU(KK) or Householder vector
+*        below the diagonal containing Inf. Only BETA on the diagonal,
+*        returned by SLARFG can contain Inf, which requires
+*        TAU(KK) to contain NaN. Therefore, this case of generating Inf
+*        by SLARFG is covered by checking TAU(KK) for NaN.
+*
+         IF( SISNAN( TAU(KK) ) ) THEN
+            K = KK - 1
+            INFO = KK
+*
+*           Set MAXC2NRMK and  RELMAXC2NRMK to NaN.
+*
+            MAXC2NRMK = TAU( KK )
+            RELMAXC2NRMK = TAU( KK )
+*
+*           Array TAU(KK:MINMNFACT) is not set and contains
+*           undefined elements, except the first element TAU(KK) = NaN.
+*
+            RETURN
+         END IF
+*
+*        Apply H(KK)**T to A(I:M,KK+1:N+NRHS) from the left.
+*        ( If M >= N, then at KK = N there is no residual matrix,
+*         i.e. no columns of A to update, only columns of B.
+*         If M < N, then at KK = M-IOFFSET, I = M and we have a
+*         one-row residual matrix in A and the elementary
+*         reflector is a unit matrix, TAU(KK) = ZERO, i.e. no update
+*         is needed for the residual matrix in A and the
+*         right-hand-side-matrix in B.
+*         Therefore, we update only if
+*         KK < MINMNUPDT = min(M-IOFFSET, N+NRHS)
+*         condition is satisfied, not only KK < N+NRHS )
+*
+         IF( KK.LT.MINMNUPDT ) THEN
+            AIKK = A( I, KK )
+            A( I, KK ) = ONE
+            CALL SLARF( 'Left', M-I+1, N+NRHS-KK, A( I, KK ), 1,
+     $                  TAU( KK ), A( I, KK+1 ), LDA, WORK( 1 ) )
+            A( I, KK ) = AIKK
+         END IF
+*
+         IF( KK.LT.MINMNFACT ) THEN
+*
+*           Update the partial column 2-norms for the residual matrix,
+*           only if the residual matrix A(I+1:M,KK+1:N) exists, i.e.
+*           when KK < min(M-IOFFSET, N).
+*
+            DO J = KK + 1, N
+               IF( VN1( J ).NE.ZERO ) THEN
+*
+*                 NOTE: The following lines follow from the analysis in
+*                 Lapack Working Note 176.
+*
+                  TEMP = ONE - ( ABS( A( I, J ) ) / VN1( J ) )**2
+                  TEMP = MAX( TEMP, ZERO )
+                  TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2
+                  IF( TEMP2 .LE. TOL3Z ) THEN
+*
+*                    Compute the column 2-norm for the partial
+*                    column A(I+1:M,J) by explicitly computing it,
+*                    and store it in both partial 2-norm vector VN1
+*                    and exact column 2-norm vector VN2.
+*
+                     VN1( J ) = SNRM2( M-I, A( I+1, J ), 1 )
+                     VN2( J ) = VN1( J )
+*
+                  ELSE
+*
+*                    Update the column 2-norm for the partial
+*                    column A(I+1:M,J) by removing one
+*                    element A(I,J) and store it in partial
+*                    2-norm vector VN1.
+*
+                     VN1( J ) = VN1( J )*SQRT( TEMP )
+*
+                  END IF
+               END IF
+            END DO
+*
+         END IF
+*
+*     End factorization loop
+*
+      END DO
+*
+*     If we reached this point, all columns have been factorized,
+*     i.e. no condition was triggered to exit the routine.
+*     Set the number of factorized columns.
+*
+      K = KMAX
+*
+*     We reached the end of the loop, i.e. all KMAX columns were
+*     factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before
+*     we return.
+*
+      IF( K.LT.MINMNFACT ) THEN
+*
+         JMAXC2NRM = K + ISAMAX( N-K, VN1( K+1 ), 1 )
+         MAXC2NRMK = VN1( JMAXC2NRM )
+*
+         IF( K.EQ.0 ) THEN
+            RELMAXC2NRMK = ONE
+         ELSE
+            RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM
+         END IF
+*
+      ELSE
+         MAXC2NRMK = ZERO
+         RELMAXC2NRMK = ZERO
+      END IF
+*
+*     We reached the end of the loop, i.e. all KMAX columns were
+*     factorized, set TAUs corresponding to the columns that were
+*     not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to ZERO.
+*
+      DO J = K + 1, MINMNFACT
+         TAU( J ) = ZERO
+      END DO
+*
+      RETURN
+*
+*     End of SLAQP2RK
+*
+      END
diff --git a/lapack-netlib/SRC/slaqp3rk.c b/lapack-netlib/SRC/slaqp3rk.c
new file mode 100644
index 000000000..e3632538b
--- /dev/null
+++ b/lapack-netlib/SRC/slaqp3rk.c
@@ -0,0 +1,1109 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER /* MSVC C has no operator/ on _Fcomplex/_Dcomplex: form the complex quotient a/b on the r/i fields (operands are read first, so c may alias a or b) */
+#define c_div(c, a, b) { real __ar = (a)->r, __ai = (a)->i, __br = (b)->r, __bi = (b)->i, __d = __br*__br + __bi*__bi; (c)->r = (__ar*__br + __ai*__bi)/__d; (c)->i = (__ai*__br - __ar*__bi)/__d; }
+#define z_div(c, a, b) { doublereal __ar = (a)->r, __ai = (a)->i, __br = (b)->r, __bi = (b)->i, __d = __br*__br + __bi*__bi; (c)->r = (__ar*__br + __ai*__bi)/__d; (c)->i = (__ai*__br - __ar*__bi)/__d; }
+#else /* C99 complex types: *c = *a / *b using the native complex division */
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),Cd(B));} /* *R = (*A) ** (*B); R, A, B are doublecomplex pointers, so B must be converted with Cd() like A */
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) { /* x**n by repeated squaring; n < 0 uses 1/x */
+	float acc = 1.0; unsigned long int bits;
+	if(n == 0) return acc;
+	if(n < 0) { n = -n; x = 1/x; }
+	bits = n;
+	while(bits != 0) {
+		if(bits & 01) acc *= x;	/* current low bit set: fold this power of x in */
+		bits >>= 1;
+		if(bits != 0) x *= x;	/* square only while exponent bits remain */
+	}
+	return acc;
+}
+static double dpow_ui(double x, integer n) { /* x**n by repeated squaring; n < 0 uses 1/x */
+	double acc = 1.0; unsigned long int bits;
+	if(n == 0) return acc;
+	if(n < 0) { n = -n; x = 1/x; }
+	bits = n;
+	while(bits != 0) {
+		if(bits & 01) acc *= x;	/* current low bit set: fold this power of x in */
+		bits >>= 1;
+		if(bits != 0) x *= x;	/* square only while exponent bits remain */
+	}
+	return acc;
+}
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) { /* x**n by repeated squaring, using true complex arithmetic on the r/i fields */
+	complex pow={1.0,0.0}; unsigned long int u; real t;
+	if(n != 0) {
+		if(n < 0) { n = -n; t = x.r*x.r + x.i*x.i; x.r = x.r/t; x.i = -x.i/t; } /* x = 1/x = conj(x)/|x|^2 */
+		for(u = n; ; ) {
+			if(u & 01) { t = pow.r*x.r - pow.i*x.i; pow.i = pow.r*x.i + pow.i*x.r; pow.r = t; } /* pow *= x */
+			if(u >>= 1) { t = x.r*x.r - x.i*x.i; x.i = 2*x.r*x.i; x.r = t; } /* x *= x */
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) { /* x**n by repeated squaring; n < 0 uses 1/x */
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) { /* x**n by repeated squaring, using true complex arithmetic on _Val[0] (re) and _Val[1] (im) */
+	_Dcomplex pow={1.0,0.0}; unsigned long int u; double t;
+	if(n != 0) {
+		if(n < 0) { n = -n; t = x._Val[0]*x._Val[0] + x._Val[1]*x._Val[1]; x._Val[0] = x._Val[0]/t; x._Val[1] = -x._Val[1]/t; } /* x = 1/x = conj(x)/|x|^2 */
+		for(u = n; ; ) {
+			if(u & 01) { t = pow._Val[0]*x._Val[0] - pow._Val[1]*x._Val[1]; pow._Val[1] = pow._Val[0]*x._Val[1] + pow._Val[1]*x._Val[0]; pow._Val[0] = t; } /* pow *= x */
+			if(u >>= 1) { t = x._Val[0]*x._Val[0] - x._Val[1]*x._Val[1]; x._Val[1] = 2*x._Val[0]*x._Val[1]; x._Val[0] = t; } /* x *= x */
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) { /* x**n by repeated squaring; n < 0 uses 1/x */
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) {
+	/* Integer power x**n.  For n < 0 the result is 0 unless x is 1 or -1; */
+	/* x == 0 with n < 0 evaluates 1/x, i.e. an integer division by zero. */
+	integer acc = 1; unsigned long int bits;
+	if (n == 0 || x == 1) return acc;
+	if (n < 0) {
+		if (x != -1) return x == 0 ? 1/x : 0;
+		n = -n;	/* x == -1: the sign depends only on the parity of |n| */
+	}
+	bits = n;
+	while (bits != 0) {
+		if(bits & 01) acc *= x;
+		bits >>= 1;
+		if(bits != 0) x *= x;
+	}
+	return acc;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{	/* 1-based position, counted from s, of the first largest entry among w[s-1..e-1]; n is not used */
+	double best = w[s-1]; integer pos = s, idx;
+	for (idx = s + 1; idx <= e; ++idx)
+		if (w[idx-1] > best) { pos = idx; best = w[idx-1]; }
+	return pos - s + 1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{	/* 1-based position, counted from s, of the first largest entry among w[s-1..e-1]; n is not used */
+	float best = w[s-1]; integer pos = s, idx;
+	for (idx = s + 1; idx <= e; ++idx)
+		if (w[idx-1] > best) { pos = idx; best = w[idx-1]; }
+	return pos - s + 1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i; /* *z = sum over i of conjg(x(i)) * y(i); elements are addressed as x[i*incx], y[i*incy] */
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + conjg(x(i)) * y(i): re = xr*yr + xi*yi, im = xr*yi - xi*yr */
+			zdotc._Val[0] += x[i].r * y[i].r + x[i].i * y[i].i;
+			zdotc._Val[1] += x[i].r * y[i].i - x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + conjg(x(i)) * y(i), strided access */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r + x[i*incx].i * y[i*incy].i;
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i - x[i*incx].i * y[i*incy].r;
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + conjg(x(i)) * y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + conjg(x(i)) * y(i), strided access */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i; /* *z = sum over i of dconjg(x(i)) * y(i); elements are addressed as x[i*incx], y[i*incy] */
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i)) * y(i): re = xr*yr + xi*yi, im = xr*yi - xi*yr */
+			zdotc._Val[0] += x[i].r * y[i].r + x[i].i * y[i].i;
+			zdotc._Val[1] += x[i].r * y[i].i - x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i)) * y(i), strided access */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r + x[i*incx].i * y[i*incy].i;
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i - x[i*incx].i * y[i*incy].r;
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i)) * y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i)) * y(i), strided access */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i; /* *z = sum over i of x(i) * y(i), no conjugation; elements are addressed as x[i*incx], y[i*incy] */
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i): re = xr*yr - xi*yi, im = xr*yi + xi*yr */
+			zdotc._Val[0] += x[i].r * y[i].r - x[i].i * y[i].i;
+			zdotc._Val[1] += x[i].r * y[i].i + x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i), strided access */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r - x[i*incx].i * y[i*incy].i;
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i + x[i*incx].i * y[i*incy].r;
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i), strided access */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i; /* *z = sum over i of x(i) * y(i), no conjugation; elements are addressed as x[i*incx], y[i*incy] */
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i): re = xr*yr - xi*yi, im = xr*yi + xi*yr */
+			zdotc._Val[0] += x[i].r * y[i].r - x[i].i * y[i].i;
+			zdotc._Val[1] += x[i].r * y[i].i + x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i), strided access */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r - x[i*incx].i * y[i*incy].i;
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i + x[i*incx].i * y[i*incy].r;
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i), strided access */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static integer c__1 = 1;
+static real c_b7 = -1.f;
+static real c_b8 = 1.f;
+static real c_b30 = 0.f;
+
+/* Subroutine */ int slaqp3rk_(integer *m, integer *n, integer *nrhs, integer 
+	*ioffset, integer *nb, real *abstol, real *reltol, integer *kp1, real 
+	*maxc2nrm, real *a, integer *lda, logical *done, integer *kb, real *
+	maxc2nrmk, real *relmaxc2nrmk, integer *jpiv, real *tau, real *vn1, 
+	real *vn2, real *auxv, real *f, integer *ldf, integer *iwork, integer 
+	*info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, f_dim1, f_offset, i__1, i__2;
+    real r__1, r__2;
+
+    /* Local variables */
+    real temp, temp2;
+    extern real snrm2_(integer *, real *, integer *);
+    integer i__, j, k;
+    real tol3z;
+    extern /* Subroutine */ int sgemm_(char *, char *, integer *, integer *, 
+	    integer *, real *, real *, integer *, real *, integer *, real *, 
+	    real *, integer *);
+    integer itemp;
+    extern /* Subroutine */ int sgemv_(char *, integer *, integer *, real *, 
+	    real *, integer *, real *, integer *, real *, real *, integer *);
+    integer minmnfact;
+    extern /* Subroutine */ int sswap_(integer *, real *, integer *, real *, 
+	    integer *);
+    real myhugeval;
+    integer minmnupdt, if__, kp;
+    extern real slamch_(char *);
+    extern /* Subroutine */ int slarfg_(integer *, real *, real *, integer *, 
+	    real *);
+    integer lsticc;
+    extern integer isamax_(integer *, real *, integer *);
+    extern logical sisnan_(real *);
+    real aik;
+
+
+/*  -- LAPACK auxiliary routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+
+/*  ===================================================================== */
+
+
+/*     Initialize INFO */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --jpiv;
+    --tau;
+    --vn1;
+    --vn2;
+    --auxv;
+    f_dim1 = *ldf;
+    f_offset = 1 + f_dim1 * 1;
+    f -= f_offset;
+    --iwork;
+
+    /* Function Body */
+    *info = 0;
+
+/*     MINMNFACT is the smallest dimension of the submatrix */
+/*     A(IOFFSET+1:M,1:N) to be factorized. */
+
+/* Computing MIN */
+    i__1 = *m - *ioffset;
+    minmnfact = f2cmin(i__1,*n);
+/* Computing MIN */
+    i__1 = *m - *ioffset, i__2 = *n + *nrhs;
+    minmnupdt = f2cmin(i__1,i__2);
+    *nb = f2cmin(*nb,minmnfact);
+    tol3z = sqrt(slamch_("Epsilon"));
+    myhugeval = slamch_("Overflow");
+
+/*     Compute factorization in a while loop over NB columns, */
+/*     K is the column index in the block A(1:M,1:N). */
+
+    k = 0;
+    lsticc = 0;
+    *done = FALSE_;
+
+    while(k < *nb && lsticc == 0) {
+	++k;
+	i__ = *ioffset + k;
+
+	if (i__ == 1) {
+
+/*           We are at the first column of the original whole matrix A_orig, */
+/*           therefore we use the computed KP1 and MAXC2NRM from the */
+/*           main routine. */
+
+	    kp = *kp1;
+
+	} else {
+
+/*           Determine the pivot column in K-th step, i.e. the index */
+/*           of the column with the maximum 2-norm in the */
+/*           submatrix A(I:M,K:N). */
+
+	    i__1 = *n - k + 1;
+	    kp = k - 1 + isamax_(&i__1, &vn1[k], &c__1);
+
+/*           Determine the maximum column 2-norm and the relative maximum */
+/*           column 2-norm of the submatrix A(I:M,K:N) in step K. */
+
+	    *maxc2nrmk = vn1[kp];
+
+/*           ============================================================ */
+
+/*           Check if the submatrix A(I:M,K:N) contains NaN, set */
+/*           INFO parameter to the column number, where the first NaN */
+/*           is found and return from the routine. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (sisnan_(maxc2nrmk)) {
+
+		*done = TRUE_;
+
+/*              Set KB, the number of factorized partial columns */
+/*                      that are non-zero in each step in the block, */
+/*                      i.e. the rank of the factor R. */
+/*              Set IF, the number of processed rows in the block, which */
+/*                      is the same as the number of processed rows in */
+/*                      the original whole matrix A_orig. */
+
+		*kb = k - 1;
+		if__ = i__ - 1;
+		*info = *kb + kp;
+
+/*              Set RELMAXC2NRMK to NaN. */
+
+		*relmaxc2nrmk = *maxc2nrmk;
+
+/*              There is no need to apply the block reflector to the */
+/*              residual of the matrix A stored in A(KB+1:M,KB+1:N), */
+/*              since the submatrix contains NaN and we stop */
+/*              the computation. */
+/*              But, we need to apply the block reflector to the residual */
+/*              right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */
+/*              residual right hand sides exist.  This occurs */
+/*              when ( NRHS != 0 AND KB < (M-IOFFSET) ): */
+
+/*              A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */
+/*                               A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. */
+		if (*nrhs > 0 && *kb < *m - *ioffset) {
+		    i__1 = *m - if__;
+		    sgemm_("No transpose", "Transpose", &i__1, nrhs, kb, &
+			    c_b7, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + 
+			    f_dim1], ldf, &c_b8, &a[if__ + 1 + (*n + 1) * 
+			    a_dim1], lda);
+		}
+
+/*              There is no need to recompute the 2-norm of the */
+/*              difficult columns, since we stop the factorization. */
+
+/*              Array TAU(KF+1:MINMNFACT) is not set and contains */
+/*              undefined elements. */
+
+/*              Return from the routine. */
+
+		return 0;
+	    }
+
+/*           Quick return, if the submatrix A(I:M,K:N) is */
+/*           a zero matrix. We need to check it only if the column index */
+/*           (same as row index) is larger than 1, since the condition */
+/*           for the whole original matrix A_orig is checked in the main */
+/*           routine. */
+
+	    if (*maxc2nrmk == 0.f) {
+
+		*done = TRUE_;
+
+/*              Set KB, the number of factorized partial columns */
+/*                      that are non-zero in each step in the block, */
+/*                      i.e. the rank of the factor R. */
+/*              Set IF, the number of processed rows in the block, which */
+/*                      is the same as the number of processed rows in */
+/*                      the original whole matrix A_orig. */
+
+		*kb = k - 1;
+		if__ = i__ - 1;
+		*relmaxc2nrmk = 0.f;
+
+/*              There is no need to apply the block reflector to the */
+/*              residual of the matrix A stored in A(KB+1:M,KB+1:N), */
+/*              since the submatrix is zero and we stop the computation. */
+/*              But, we need to apply the block reflector to the residual */
+/*              right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */
+/*              residual right hand sides exist.  This occurs */
+/*              when ( NRHS != 0 AND KB < (M-IOFFSET) ): */
+
+/*              A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */
+/*                               A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. */
+
+		if (*nrhs > 0 && *kb < *m - *ioffset) {
+		    i__1 = *m - if__;
+		    sgemm_("No transpose", "Transpose", &i__1, nrhs, kb, &
+			    c_b7, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + 
+			    f_dim1], ldf, &c_b8, &a[if__ + 1 + (*n + 1) * 
+			    a_dim1], lda);
+		}
+
+/*              There is no need to recompute the 2-norm of the */
+/*              difficult columns, since we stop the factorization. */
+
+/*              Set TAUs corresponding to the columns that were not */
+/*              factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, */
+/*              which is equivalent to setting TAU(K:MINMNFACT) = ZERO. */
+
+		i__1 = minmnfact;
+		for (j = k; j <= i__1; ++j) {
+		    tau[j] = 0.f;
+		}
+
+/*              Return from the routine. */
+
+		return 0;
+
+	    }
+
+/*           ============================================================ */
+
+/*           Check if the submatrix A(I:M,K:N) contains Inf, */
+/*           set INFO parameter to the column number, where */
+/*           the first Inf is found plus N, and continue */
+/*           the computation. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (*info == 0 && *maxc2nrmk > myhugeval) {
+		*info = *n + k - 1 + kp;
+	    }
+
+/*           ============================================================ */
+
+/*           Test for the second and third tolerance stopping criteria. */
+/*           NOTE: There is no need to test for ABSTOL.GE.ZERO, since */
+/*           MAXC2NRMK is non-negative. Similarly, there is no need */
+/*           to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is */
+/*           non-negative. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm;
+
+	    if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) {
+
+		*done = TRUE_;
+
+/*              Set KB, the number of factorized partial columns */
+/*                      that are non-zero in each step in the block, */
+/*                      i.e. the rank of the factor R. */
+/*              Set IF, the number of processed rows in the block, which */
+/*                      is the same as the number of processed rows in */
+/*                      the original whole matrix A_orig; */
+
+		*kb = k - 1;
+		if__ = i__ - 1;
+
+/*              Apply the block reflector to the residual of the */
+/*              matrix A and the residual of the right hand sides B, if */
+/*              the residual matrix and/or the residual of the right */
+/*              hand sides exist,  i.e. if the submatrix */
+/*              A(I+1:M,KB+1:N+NRHS) exists.  This occurs when */
+/*                 KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */
+
+/*              A(IF+1:M,KB+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */
+/*                             A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. */
+
+		if (*kb < minmnupdt) {
+		    i__1 = *m - if__;
+		    i__2 = *n + *nrhs - *kb;
+		    sgemm_("No transpose", "Transpose", &i__1, &i__2, kb, &
+			    c_b7, &a[if__ + 1 + a_dim1], lda, &f[*kb + 1 + 
+			    f_dim1], ldf, &c_b8, &a[if__ + 1 + (*kb + 1) * 
+			    a_dim1], lda);
+		}
+
+/*              There is no need to recompute the 2-norm of the */
+/*              difficult columns, since we stop the factorization. */
+
+/*              Set TAUs corresponding to the columns that were not */
+/*              factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO, */
+/*              which is equivalent to setting TAU(K:MINMNFACT) = ZERO. */
+
+		i__1 = minmnfact;
+		for (j = k; j <= i__1; ++j) {
+		    tau[j] = 0.f;
+		}
+
+/*              Return from the routine. */
+
+		return 0;
+
+	    }
+
+/*           ============================================================ */
+
+/*           End ELSE of IF(I.EQ.1) */
+
+	}
+
+/*        =============================================================== */
+
+/*        If the pivot column is not the first column of the */
+/*        subblock A(1:M,K:N): */
+/*        1) swap the K-th column and the KP-th pivot column */
+/*           in A(1:M,1:N); */
+/*        2) swap the K-th row and the KP-th row in F(1:N,1:K-1) */
+/*        3) copy the K-th element into the KP-th element of the partial */
+/*           and exact 2-norm vectors VN1 and VN2. (Swap is not needed */
+/*           for VN1 and VN2 since we use the element with the index */
+/*           larger than K in the next loop step.) */
+/*        4) Save the pivot interchange with the indices relative to the */
+/*           the original matrix A_orig, not the block A(1:M,1:N). */
+
+	if (kp != k) {
+	    sswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1);
+	    i__1 = k - 1;
+	    sswap_(&i__1, &f[kp + f_dim1], ldf, &f[k + f_dim1], ldf);
+	    vn1[kp] = vn1[k];
+	    vn2[kp] = vn2[k];
+	    itemp = jpiv[kp];
+	    jpiv[kp] = jpiv[k];
+	    jpiv[k] = itemp;
+	}
+
+/*        Apply previous Householder reflectors to column K: */
+/*        A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**T. */
+
+	if (k > 1) {
+	    i__1 = *m - i__ + 1;
+	    i__2 = k - 1;
+	    sgemv_("No transpose", &i__1, &i__2, &c_b7, &a[i__ + a_dim1], lda,
+		     &f[k + f_dim1], ldf, &c_b8, &a[i__ + k * a_dim1], &c__1);
+	}
+
+/*        Generate elementary reflector H(k) using the column A(I:M,K). */
+
+	if (i__ < *m) {
+	    i__1 = *m - i__ + 1;
+	    slarfg_(&i__1, &a[i__ + k * a_dim1], &a[i__ + 1 + k * a_dim1], &
+		    c__1, &tau[k]);
+	} else {
+	    tau[k] = 0.f;
+	}
+
+/*        Check if TAU(K) contains NaN, set INFO parameter */
+/*        to the column number where NaN is found and return from */
+/*        the routine. */
+/*        NOTE: There is no need to check TAU(K) for Inf, */
+/*        since SLARFG cannot produce TAU(K) or Householder vector */
+/*        below the diagonal containing Inf. Only BETA on the diagonal, */
+/*        returned by SLARFG can contain Inf, which requires */
+/*        TAU(K) to contain NaN. Therefore, this case of generating Inf */
+/*        by SLARFG is covered by checking TAU(K) for NaN. */
+
+	if (sisnan_(&tau[k])) {
+
+	    *done = TRUE_;
+
+/*           Set KB, the number of factorized partial columns */
+/*                   that are non-zero in each step in the block, */
+/*                   i.e. the rank of the factor R. */
+/*           Set IF, the number of processed rows in the block, which */
+/*                   is the same as the number of processed rows in */
+/*                   the original whole matrix A_orig. */
+
+	    *kb = k - 1;
+	    if__ = i__ - 1;
+	    *info = k;
+
+/*           Set MAXC2NRMK and  RELMAXC2NRMK to NaN. */
+
+	    *maxc2nrmk = tau[k];
+	    *relmaxc2nrmk = tau[k];
+
+/*           There is no need to apply the block reflector to the */
+/*           residual of the matrix A stored in A(KB+1:M,KB+1:N), */
+/*           since the submatrix contains NaN and we stop */
+/*           the computation. */
+/*           But, we need to apply the block reflector to the residual */
+/*           right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */
+/*           residual right hand sides exist.  This occurs */
+/*           when ( NRHS != 0 AND KB < (M-IOFFSET) ): */
+
+/*           A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */
+/*                            A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T. */
+
+	    if (*nrhs > 0 && *kb < *m - *ioffset) {
+		i__1 = *m - if__;
+		sgemm_("No transpose", "Transpose", &i__1, nrhs, kb, &c_b7, &
+			a[if__ + 1 + a_dim1], lda, &f[*n + 1 + f_dim1], ldf, &
+			c_b8, &a[if__ + 1 + (*n + 1) * a_dim1], lda);
+	    }
+
+/*           There is no need to recompute the 2-norm of the */
+/*           difficult columns, since we stop the factorization. */
+
+/*           Array TAU(KF+1:MINMNFACT) is not set and contains */
+/*           undefined elements. */
+
+/*           Return from the routine. */
+
+	    return 0;
+	}
+
+/*        =============================================================== */
+
+	aik = a[i__ + k * a_dim1];
+	a[i__ + k * a_dim1] = 1.f;
+
+/*        =============================================================== */
+
+/*        Compute the current K-th column of F: */
+/*          1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**T * A(I:M,K). */
+
+	if (k < *n + *nrhs) {
+	    i__1 = *m - i__ + 1;
+	    i__2 = *n + *nrhs - k;
+	    sgemv_("Transpose", &i__1, &i__2, &tau[k], &a[i__ + (k + 1) * 
+		    a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b30, &f[k + 
+		    1 + k * f_dim1], &c__1);
+	}
+
+/*           2) Zero out elements above and on the diagonal of the */
+/*              column K in matrix F, i.e elements F(1:K,K). */
+
+	i__1 = k;
+	for (j = 1; j <= i__1; ++j) {
+	    f[j + k * f_dim1] = 0.f;
+	}
+
+/*         3) Incremental updating of the K-th column of F: */
+/*        F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**T */
+/*                    * A(I:M,K). */
+
+	if (k > 1) {
+	    i__1 = *m - i__ + 1;
+	    i__2 = k - 1;
+	    r__1 = -tau[k];
+	    sgemv_("Transpose", &i__1, &i__2, &r__1, &a[i__ + a_dim1], lda, &
+		    a[i__ + k * a_dim1], &c__1, &c_b30, &auxv[1], &c__1);
+
+	    i__1 = *n + *nrhs;
+	    i__2 = k - 1;
+	    sgemv_("No transpose", &i__1, &i__2, &c_b8, &f[f_dim1 + 1], ldf, &
+		    auxv[1], &c__1, &c_b8, &f[k * f_dim1 + 1], &c__1);
+	}
+
+/*        =============================================================== */
+
+/*        Update the current I-th row of A: */
+/*        A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) */
+/*                         - A(I,1:K)*F(K+1:N+NRHS,1:K)**T. */
+
+	if (k < *n + *nrhs) {
+	    i__1 = *n + *nrhs - k;
+	    sgemv_("No transpose", &i__1, &k, &c_b7, &f[k + 1 + f_dim1], ldf, 
+		    &a[i__ + a_dim1], lda, &c_b8, &a[i__ + (k + 1) * a_dim1], 
+		    lda);
+	}
+
+	a[i__ + k * a_dim1] = aik;
+
+/*        Update the partial column 2-norms for the residual matrix, */
+/*        only if the residual matrix A(I+1:M,K+1:N) exists, i.e. */
+/*        when K < MINMNFACT = f2cmin( M-IOFFSET, N ). */
+
+	if (k < minmnfact) {
+
+	    i__1 = *n;
+	    for (j = k + 1; j <= i__1; ++j) {
+		if (vn1[j] != 0.f) {
+
+/*                 NOTE: The following lines follow from the analysis in */
+/*                 Lapack Working Note 176. */
+
+		    temp = (r__1 = a[i__ + j * a_dim1], abs(r__1)) / vn1[j];
+/* Computing MAX */
+		    r__1 = 0.f, r__2 = (temp + 1.f) * (1.f - temp);
+		    temp = f2cmax(r__1,r__2);
+/* Computing 2nd power */
+		    r__1 = vn1[j] / vn2[j];
+		    temp2 = temp * (r__1 * r__1);
+		    if (temp2 <= tol3z) {
+
+/*                    At J-index, we have a difficult column for the */
+/*                    update of the 2-norm. Save the index of the previous */
+/*                    difficult column in IWORK(J-1). */
+/*                    NOTE: LSTICC > 1, therefore we can use IWORK only */
+/*                    with N-1 elements, where the elements are */
+/*                    shifted by 1 to the left. */
+
+			iwork[j - 1] = lsticc;
+
+/*                    Set the index of the last difficult column LSTICC. */
+
+			lsticc = j;
+
+		    } else {
+			vn1[j] *= sqrt(temp);
+		    }
+		}
+	    }
+
+	}
+
+/*        End of while loop. */
+
+    }
+
+/*     Now, after the loop: */
+/*        Set KB, the number of factorized columns in the block; */
+/*        Set IF, the number of processed rows in the block, which */
+/*                is the same as the number of processed rows in */
+/*                the original whole matrix A_orig, IF = IOFFSET + KB. */
+
+    *kb = k;
+    if__ = i__;
+
+/*     Apply the block reflector to the residual of the matrix A */
+/*     and the residual of the right hand sides B, if the residual */
+/*     matrix and/or the residual of the right hand sides */
+/*     exist,  i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. */
+/*     This occurs when KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */
+
+/*     A(IF+1:M,KB+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */
+/*                         A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T. */
+
+    if (*kb < minmnupdt) {
+	i__1 = *m - if__;
+	i__2 = *n + *nrhs - *kb;
+	sgemm_("No transpose", "Transpose", &i__1, &i__2, kb, &c_b7, &a[if__ 
+		+ 1 + a_dim1], lda, &f[*kb + 1 + f_dim1], ldf, &c_b8, &a[if__ 
+		+ 1 + (*kb + 1) * a_dim1], lda);
+    }
+
+/*     Recompute the 2-norm of the difficult columns. */
+/*     Loop over the index of the difficult columns from the largest */
+/*     to the smallest index. */
+
+    while(lsticc > 0) {
+
+/*        LSTICC, the index of the last difficult column, is greater */
+/*        than 1. */
+/*        ITEMP is the index of the previous difficult column. */
+
+	itemp = iwork[lsticc - 1];
+
+/*        Compute the 2-norm explicitly for the last difficult column and */
+/*        save it in the partial and exact 2-norm vectors VN1 and VN2. */
+
+/*        NOTE: The computation of VN1( LSTICC ) relies on the fact that */
+/*        SNRM2 does not fail on vectors with norm below the value of */
+/*        SQRT(SLAMCH('S')) */
+
+	i__1 = *m - if__;
+	vn1[lsticc] = snrm2_(&i__1, &a[if__ + 1 + lsticc * a_dim1], &c__1);
+	vn2[lsticc] = vn1[lsticc];
+
+/*        Downdate the index of the last difficult column to */
+/*        the index of the previous difficult column. */
+
+	lsticc = itemp;
+
+    }
+
+    return 0;
+
+/*     End of SLAQP3RK */
+
+} /* slaqp3rk_ */
+
diff --git a/lapack-netlib/SRC/slaqp3rk.f b/lapack-netlib/SRC/slaqp3rk.f
new file mode 100644
index 000000000..b2dc2b334
--- /dev/null
+++ b/lapack-netlib/SRC/slaqp3rk.f
@@ -0,0 +1,935 @@
+*> \brief \b SLAQP3RK computes a step of truncated QR factorization with column pivoting of a real m-by-n matrix A using Level 3 BLAS and overwrites a real m-by-nrhs matrix B with Q**T * B.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SLAQP3RK + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/slaqp3rk.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/slaqp3rk.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/slaqp3rk.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE SLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL,
+*     $                     RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB,
+*     $                     MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+*     $                     VN1, VN2, AUXV, F, LDF, IWORK, INFO )
+*      IMPLICIT NONE
+*      LOGICAL            DONE
+*      INTEGER            INFO, IOFFSET, KB, KP1, LDA, LDF, M, N,
+*     $                   NB, NRHS
+*      REAL               ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+*     $                   RELTOL
+*
+*     .. Scalar Arguments ..
+*      LOGICAL            DONE
+*      INTEGER            KB, LDA, LDF, M, N, NB, NRHS, IOFFSET
+*      REAL               ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+*     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+*      INTEGER            IWORK( * ), JPIV( * )
+*      REAL               A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ),
+*     $                   VN1( * ), VN2( * )
+*     ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SLAQP3RK computes a step of truncated QR factorization with column
+*> pivoting of a real M-by-N matrix A block A(IOFFSET+1:M,1:N)
+*> by using Level 3 BLAS as
+*>
+*>   A * P(KB) = Q(KB) * R(KB).
+*>
+*> The routine tries to factorize NB columns from A starting from
+*> the row IOFFSET+1 and updates the residual matrix with BLAS 3
+*> xGEMM. The number of actually factorized columns is returned
+*> in KB and may be smaller than NB.
+*>
+*> Block A(1:IOFFSET,1:N) is accordingly pivoted, but not factorized.
+*>
+*> The routine also overwrites the right-hand-sides B matrix stored
+*> in A(IOFFSET+1:M,N+1:N+NRHS) with Q(KB)**T * B.
+*>
+*> Cases when the number of factorized columns KB < NB:
+*>
+*> (1) In some cases, due to catastrophic cancellations, it cannot
+*> factorize all NB columns and needs to update the residual matrix.
+*> Hence, the actual number of factorized columns in the block returned
+*> in KB is smaller than NB. The logical DONE is returned as FALSE.
+*> The factorization of the whole original matrix A_orig must proceed
+*> with the next block.
+*>
+*> (2) Whenever the stopping criterion ABSTOL or RELTOL is satisfied,
+*> the factorization of the whole original matrix A_orig is stopped,
+*> the logical DONE is returned as TRUE. The number of factorized
+*> columns which is smaller than NB is returned in KB.
+*>
+*> (3) In case both stopping criteria ABSTOL and RELTOL are not used,
+*> and when the residual matrix is a zero matrix in some factorization
+*> step KB, the factorization of the whole original matrix A_orig is
+*> stopped, the logical DONE is returned as TRUE. The number of
+*> factorized columns which is smaller than NB is returned in KB.
+*>
+*> (4) Whenever NaN is detected in the matrix A or in the array TAU,
+*> the factorization of the whole original matrix A_orig is stopped,
+*> the logical DONE is returned as TRUE. The number of factorized
+*> columns which is smaller than NB is returned in KB. The INFO
+*> parameter is set to the column index of the first NaN occurrence.
+*>
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. N >= 0
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of
+*>          columns of the matrix B. NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in] IOFFSET
+*> \verbatim
+*>          IOFFSET is INTEGER
+*>          The number of rows of the matrix A that must be pivoted
+*>          but not factorized. IOFFSET >= 0.
+*>
+*>          IOFFSET also represents the number of columns of the whole
+*>          original matrix A_orig that have been factorized
+*>          in the previous steps.
+*> \endverbatim
+*>
+*> \param[in] NB
+*> \verbatim
+*>          NB is INTEGER
+*>          Factorization block size, i.e the number of columns
+*>          to factorize in the matrix A. 0 <= NB
+*>
+*>          If NB = 0, then the routine exits immediately.
+*>             This means that the factorization is not performed,
+*>             the matrices A and B and the arrays TAU, JPIV
+*>             are not modified.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is REAL, cannot be NaN.
+*>
+*>          The absolute tolerance (stopping threshold) for
+*>          maximum column 2-norm of the residual matrix.
+*>          The algorithm converges (stops the factorization) when
+*>          the maximum column 2-norm of the residual matrix
+*>          is less than or equal to ABSTOL.
+*>
+*>          a) If ABSTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on NB and RELTOL.
+*>                This includes the case ABSTOL = -Inf.
+*>
+*>          b) If 0.0 <= ABSTOL then the input value
+*>                of ABSTOL is used.
+*> \endverbatim
+*>
+*> \param[in] RELTOL
+*> \verbatim
+*>          RELTOL is REAL, cannot be NaN.
+*>
+*>          The tolerance (stopping threshold) for the ratio of the
+*>          maximum column 2-norm of the residual matrix to the maximum
+*>          column 2-norm of the original matrix A_orig. The algorithm
+*>          converges (stops the factorization), when this ratio is
+*>          less than or equal to RELTOL.
+*>
+*>          a) If RELTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on NB and ABSTOL.
+*>                This includes the case RELTOL = -Inf.
+*>
+*>          b) If 0.0 <= RELTOL then the input value of RELTOL
+*>                is used.
+*> \endverbatim
+*>
+*> \param[in] KP1
+*> \verbatim
+*>          KP1 is INTEGER
+*>          The index of the column with the maximum 2-norm in
+*>          the whole original matrix A_orig determined in the
+*>          main routine SGEQP3RK. 1 <= KP1 <= N_orig.
+*> \endverbatim
+*>
+*> \param[in] MAXC2NRM
+*> \verbatim
+*>          MAXC2NRM is REAL
+*>          The maximum column 2-norm of the whole original
+*>          matrix A_orig computed in the main routine SGEQP3RK.
+*>          MAXC2NRM >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is REAL array, dimension (LDA,N+NRHS)
+*>          On entry:
+*>              the M-by-N matrix A and M-by-NRHS matrix B, as in
+*>
+*>                                  N     NRHS
+*>              array_A   =   M  [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>          1. The elements in block A(IOFFSET+1:M,1:KB) below
+*>             the diagonal together with the array TAU represent
+*>             the orthogonal matrix Q(KB) as a product of elementary
+*>             reflectors.
+*>          2. The upper triangular block of the matrix A stored
+*>             in A(IOFFSET+1:M,1:KB) is the triangular factor obtained.
+*>          3. The block of the matrix A stored in A(1:IOFFSET,1:N)
+*>             has been accordingly pivoted, but not factorized.
+*>          4. The rest of the array A, block A(IOFFSET+1:M,KB+1:N+NRHS).
+*>             The left part A(IOFFSET+1:M,KB+1:N) of this block
+*>             contains the residual of the matrix A, and,
+*>             if NRHS > 0, the right part of the block
+*>             A(IOFFSET+1:M,N+1:N+NRHS) contains the block of
+*>             the right-hand-side matrix B. Both these blocks have been
+*>             updated by multiplication from the left by Q(KB)**T.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] DONE
+*> \verbatim
+*>          DONE is LOGICAL
+*>          TRUE: a) if the factorization completed before processing
+*>                   all min(M-IOFFSET,NB,N) columns due to ABSTOL
+*>                   or RELTOL criterion,
+*>                b) if the factorization completed before processing
+*>                   all min(M-IOFFSET,NB,N) columns due to the
+*>                   residual matrix being a ZERO matrix.
+*>                c) when NaN was detected in the matrix A
+*>                   or in the array TAU.
+*>          FALSE: otherwise.
+*> \endverbatim
+*>
+*> \param[out] KB
+*> \verbatim
+*>          KB is INTEGER
+*>          Factorization rank of the matrix A, i.e. the rank of
+*>          the factor R, which is the same as the number of non-zero
+*>          rows of the factor R.  0 <= KB <= min(M-IOFFSET,NB,N).
+*>
+*>          KB also represents the number of non-zero Householder
+*>          vectors.
+*> \endverbatim
+*>
+*> \param[out] MAXC2NRMK
+*> \verbatim
+*>          MAXC2NRMK is REAL
+*>          The maximum column 2-norm of the residual matrix,
+*>          when the factorization stopped at rank KB. MAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] RELMAXC2NRMK
+*> \verbatim
+*>          RELMAXC2NRMK is REAL
+*>          The ratio MAXC2NRMK / MAXC2NRM of the maximum column
+*>          2-norm of the residual matrix (when the factorization
+*>          stopped at rank KB) to the maximum column 2-norm of the
+*>          original matrix A_orig. RELMAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] JPIV
+*> \verbatim
+*>          JPIV is INTEGER array, dimension (N)
+*>          Column pivot indices, for 1 <= j <= N, column j
+*>          of the matrix A was interchanged with column JPIV(j).
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is REAL array, dimension (min(M-IOFFSET,N))
+*>          The scalar factors of the elementary reflectors.
+*> \endverbatim
+*>
+*> \param[in,out] VN1
+*> \verbatim
+*>          VN1 is REAL array, dimension (N)
+*>          The vector with the partial column norms.
+*> \endverbatim
+*>
+*> \param[in,out] VN2
+*> \verbatim
+*>          VN2 is REAL array, dimension (N)
+*>          The vector with the exact column norms.
+*> \endverbatim
+*>
+*> \param[out] AUXV
+*> \verbatim
+*>          AUXV is REAL array, dimension (NB)
+*>          Auxiliary vector.
+*> \endverbatim
+*>
+*> \param[out] F
+*> \verbatim
+*>          F is REAL array, dimension (LDF,NB)
+*>          Matrix F**T = L*(Y**T)*A.
+*> \endverbatim
+*>
+*> \param[in] LDF
+*> \verbatim
+*>          LDF is INTEGER
+*>          The leading dimension of the array F. LDF >= max(1,N+NRHS).
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (N-1).
+*>          Is a work array. ( IWORK is used to store indices
+*>          of "bad" columns for norm downdating in the residual
+*>          matrix ).
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          1) INFO = 0: successful exit.
+*>          2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was
+*>             detected and the routine stops the computation.
+*>             The j_1-th column of the matrix A or the j_1-th
+*>             element of array TAU contains the first occurrence
+*>             of NaN in the factorization step KB+1 ( when KB columns
+*>             have been factorized ).
+*>
+*>             On exit:
+*>             KB                  is set to the number of
+*>                                    factorized columns without
+*>                                    exception.
+*>             MAXC2NRMK           is set to NaN.
+*>             RELMAXC2NRMK        is set to NaN.
+*>             TAU(KB+1:min(M,N))     is not set and contains undefined
+*>                                    elements. If j_1=KB+1, TAU(KB+1)
+*>                                    may contain NaN.
+*>          3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN
+*>             was detected, but +Inf (or -Inf) was detected and
+*>             the routine continues the computation until completion.
+*>             The (j_2-N)-th column of the matrix A contains the first
+*>             occurrence of +Inf (or -Inf) in the factorization
+*>             step KB+1 ( when KB columns have been factorized ).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup laqp3rk
+*
+*> \par References:
+*  ================
+*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996.
+*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain.
+*> X. Sun, Computer Science Dept., Duke University, USA.
+*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA.
+*> A BLAS-3 version of the QR factorization with column pivoting.
+*> LAPACK Working Note 114
+*> \htmlonly
+*> <a href="https://www.netlib.org/lapack/lawnspdf/lawn114.pdf">https://www.netlib.org/lapack/lawnspdf/lawn114.pdf</a>
+*> \endhtmlonly
+*> and in
+*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998.
+*> \htmlonly
+*> <a href="https://doi.org/10.1137/S1064827595296732">https://doi.org/10.1137/S1064827595296732</a>
+*> \endhtmlonly
+*>
+*> [2] A partial column norm updating strategy developed in 2006.
+*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia.
+*> On the failure of rank revealing QR factorization software – a case study.
+*> LAPACK Working Note 176.
+*> \htmlonly
+*> <a href="http://www.netlib.org/lapack/lawnspdf/lawn176.pdf">http://www.netlib.org/lapack/lawnspdf/lawn176.pdf</a>
+*> \endhtmlonly
+*> and in
+*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages.
+*> \htmlonly
+*> <a href="https://doi.org/10.1145/1377612.1377616">https://doi.org/10.1145/1377612.1377616</a>
+*> \endhtmlonly
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*>  November  2023, Igor Kozachenko, James Demmel,
+*>                  EECS Department,
+*>                  University of California, Berkeley, USA.
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE SLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL,
+     $                     RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB,
+     $                     MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+     $                     VN1, VN2, AUXV, F, LDF, IWORK, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      LOGICAL            DONE
+      INTEGER            INFO, IOFFSET, KB, KP1, LDA, LDF, M, N,
+     $                   NB, NRHS
+      REAL               ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IWORK( * ), JPIV( * )
+      REAL               A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * ),
+     $                   VN1( * ), VN2( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            ITEMP, J, K, MINMNFACT, MINMNUPDT,
+     $                   LSTICC, KP, I, IF
+      REAL               AIK, HUGEVAL, TEMP, TEMP2, TOL3Z
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SGEMM, SGEMV, SLARFG, SSWAP
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN, SQRT
+*     ..
+*     .. External Functions ..
+      LOGICAL            SISNAN
+      INTEGER            ISAMAX
+      REAL               SLAMCH, SNRM2
+      EXTERNAL           SISNAN, SLAMCH, ISAMAX, SNRM2
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize INFO
+*
+      INFO = 0
+*
+*     MINMNFACT is the smallest dimension of the submatrix
+*     A(IOFFSET+1:M,1:N) to be factorized.
+*
+      MINMNFACT = MIN( M-IOFFSET, N )
+      MINMNUPDT = MIN( M-IOFFSET, N+NRHS )
+      NB = MIN( NB, MINMNFACT )
+      TOL3Z = SQRT( SLAMCH( 'Epsilon' ) )
+      HUGEVAL = SLAMCH( 'Overflow' )
+*
+*     Compute factorization in a while loop over NB columns,
+*     K is the column index in the block A(1:M,1:N).
+*
+      K = 0
+      LSTICC = 0
+      DONE = .FALSE.
+*
+      DO WHILE ( K.LT.NB .AND. LSTICC.EQ.0 )
+         K = K + 1
+         I = IOFFSET + K
+*
+         IF( I.EQ.1 ) THEN
+*
+*           We are at the first column of the original whole matrix A_orig,
+*           therefore we use the computed KP1 and MAXC2NRM from the
+*           main routine.
+*
+            KP = KP1
+*
+         ELSE
+*
+*           Determine the pivot column in K-th step, i.e. the index
+*           of the column with the maximum 2-norm in the
+*           submatrix A(I:M,K:N).
+*
+            KP = ( K-1 ) + ISAMAX( N-K+1, VN1( K ), 1 )
+*
+*           Determine the maximum column 2-norm and the relative maximum
+*           column 2-norm of the submatrix A(I:M,K:N) in step K.
+*
+            MAXC2NRMK = VN1( KP )
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,K:N) contains NaN, set
+*           INFO parameter to the column number, where the first NaN
+*           is found and return from the routine.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( SISNAN( MAXC2NRMK ) ) THEN
+*
+               DONE = .TRUE.
+*
+*              Set KB, the number of factorized partial columns
+*                      that are non-zero in each step in the block,
+*                      i.e. the rank of the factor R.
+*              Set IF, the number of processed rows in the block, which
+*                      is the same as the number of processed rows in
+*                      the original whole matrix A_orig.
+*
+               KB = K - 1
+               IF = I - 1
+               INFO = KB + KP
+*
+*              Set RELMAXC2NRMK to NaN.
+*
+               RELMAXC2NRMK = MAXC2NRMK
+*
+*              There is no need to apply the block reflector to the
+*              residual of the matrix A stored in A(KB+1:M,KB+1:N),
+*              since the submatrix contains NaN and we stop
+*              the computation.
+*              But, we need to apply the block reflector to the residual
+*              right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the
+*              residual right hand sides exist.  This occurs
+*              when ( NRHS != 0 AND KB < (M-IOFFSET) ):
+*
+*              A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) -
+*                               A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T.
+
+               IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN
+                  CALL SGEMM( 'No transpose', 'Transpose',
+     $                  M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA,
+     $                  F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA )
+               END IF
+*
+*              There is no need to recompute the 2-norm of the
+*              difficult columns, since we stop the factorization.
+*
+*              Array TAU(KB+1:MINMNFACT) is not set and contains
+*              undefined elements.
+*
+*              Return from the routine.
+*
+               RETURN
+            END IF
+*
+*           Quick return, if the submatrix A(I:M,K:N) is
+*           a zero matrix. We need to check it only if the column index
+*           (same as row index) is larger than 1, since the condition
+*           for the whole original matrix A_orig is checked in the main
+*           routine.
+*
+            IF( MAXC2NRMK.EQ.ZERO ) THEN
+*
+               DONE = .TRUE.
+*
+*              Set KB, the number of factorized partial columns
+*                      that are non-zero in each step in the block,
+*                      i.e. the rank of the factor R.
+*              Set IF, the number of processed rows in the block, which
+*                      is the same as the number of processed rows in
+*                      the original whole matrix A_orig.
+*
+               KB = K - 1
+               IF = I - 1
+               RELMAXC2NRMK = ZERO
+*
+*              There is no need to apply the block reflector to the
+*              residual of the matrix A stored in A(KB+1:M,KB+1:N),
+*              since the submatrix is zero and we stop the computation.
+*              But, we need to apply the block reflector to the residual
+*              right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the
+*              residual right hand sides exist.  This occurs
+*              when ( NRHS != 0 AND KB < (M-IOFFSET) ):
+*
+*              A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) -
+*                               A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T.
+*
+               IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN
+                  CALL SGEMM( 'No transpose', 'Transpose',
+     $                  M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA,
+     $                  F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA )
+               END IF
+*
+*              There is no need to recompute the 2-norm of the
+*              difficult columns, since we stop the factorization.
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO,
+*              which is equivalent to setting TAU(K:MINMNFACT) = ZERO.
+*
+               DO J = K, MINMNFACT
+                  TAU( J ) = ZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,K:N) contains Inf,
+*           set INFO parameter to the column number, where
+*           the first Inf is found plus N, and continue
+*           the computation.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN
+               INFO = N + K - 1 + KP
+            END IF
+*
+*           ============================================================
+*
+*           Test for the second and third tolerance stopping criteria.
+*           NOTE: There is no need to test for ABSTOL.GE.ZERO, since
+*           MAXC2NRMK is non-negative. Similarly, there is no need
+*           to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is
+*           non-negative.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            RELMAXC2NRMK =  MAXC2NRMK / MAXC2NRM
+*
+            IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN
+*
+               DONE = .TRUE.
+*
+*              Set KB, the number of factorized partial columns
+*                      that are non-zero in each step in the block,
+*                      i.e. the rank of the factor R.
+*              Set IF, the number of processed rows in the block, which
+*                      is the same as the number of processed rows in
+*                      the original whole matrix A_orig;
+*
+                  KB = K - 1
+                  IF = I - 1
+*
+*              Apply the block reflector to the residual of the
+*              matrix A and the residual of the right hand sides B, if
+*              the residual matrix and/or the residual of the right
+*              hand sides exist,  i.e. if the submatrix
+*              A(I+1:M,KB+1:N+NRHS) exists.  This occurs when
+*                 KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ):
+*
+*              A(IF+1:M,KB+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) -
+*                             A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T.
+*
+               IF( KB.LT.MINMNUPDT ) THEN
+                  CALL SGEMM( 'No transpose', 'Transpose',
+     $                  M-IF, N+NRHS-KB, KB,-ONE, A( IF+1, 1 ), LDA,
+     $                  F( KB+1, 1 ), LDF, ONE, A( IF+1, KB+1 ), LDA )
+               END IF
+*
+*              There is no need to recompute the 2-norm of the
+*              difficult columns, since we stop the factorization.
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = ZERO,
+*              which is equivalent to setting TAU(K:MINMNFACT) = ZERO.
+*
+               DO J = K, MINMNFACT
+                  TAU( J ) = ZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           End ELSE of IF(I.EQ.1)
+*
+         END IF
+*
+*        ===============================================================
+*
+*        If the pivot column is not the first column of the
+*        subblock A(1:M,K:N):
+*        1) swap the K-th column and the KP-th pivot column
+*           in A(1:M,1:N);
+*        2) swap the K-th row and the KP-th row in F(1:N,1:K-1)
+*        3) copy the K-th element into the KP-th element of the partial
+*           and exact 2-norm vectors VN1 and VN2. (Swap is not needed
+*           for VN1 and VN2 since we use the element with the index
+*           larger than K in the next loop step.)
+*        4) Save the pivot interchange with the indices relative to the
+*           original matrix A_orig, not the block A(1:M,1:N).
+*
+         IF( KP.NE.K ) THEN
+            CALL SSWAP( M, A( 1, KP ), 1, A( 1, K ), 1 )
+            CALL SSWAP( K-1, F( KP, 1 ), LDF, F( K, 1 ), LDF )
+            VN1( KP ) = VN1( K )
+            VN2( KP ) = VN2( K )
+            ITEMP = JPIV( KP )
+            JPIV( KP ) = JPIV( K )
+            JPIV( K ) = ITEMP
+         END IF
+*
+*        Apply previous Householder reflectors to column K:
+*        A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**T.
+*
+         IF( K.GT.1 ) THEN
+            CALL SGEMV( 'No transpose', M-I+1, K-1, -ONE, A( I, 1 ),
+     $                  LDA, F( K, 1 ), LDF, ONE, A( I, K ), 1 )
+         END IF
+*
+*        Generate elementary reflector H(k) using the column A(I:M,K).
+*
+         IF( I.LT.M ) THEN
+            CALL SLARFG( M-I+1, A( I, K ), A( I+1, K ), 1, TAU( K ) )
+         ELSE
+            TAU( K ) = ZERO
+         END IF
+*
+*        Check if TAU(K) contains NaN, set INFO parameter
+*        to the column number where NaN is found and return from
+*        the routine.
+*        NOTE: There is no need to check TAU(K) for Inf,
+*        since SLARFG cannot produce TAU(K) or Householder vector
+*        below the diagonal containing Inf. Only BETA on the diagonal,
+*        returned by SLARFG can contain Inf, which requires
+*        TAU(K) to contain NaN. Therefore, this case of generating Inf
+*        by SLARFG is covered by checking TAU(K) for NaN.
+*
+         IF( SISNAN( TAU(K) ) ) THEN
+*
+            DONE = .TRUE.
+*
+*           Set KB, the number of factorized partial columns
+*                   that are non-zero in each step in the block,
+*                   i.e. the rank of the factor R.
+*           Set IF, the number of processed rows in the block, which
+*                   is the same as the number of processed rows in
+*                   the original whole matrix A_orig.
+*
+            KB = K - 1
+            IF = I - 1
+            INFO = K
+*
+*           Set MAXC2NRMK and  RELMAXC2NRMK to NaN.
+*
+            MAXC2NRMK = TAU( K )
+            RELMAXC2NRMK = TAU( K )
+*
+*           There is no need to apply the block reflector to the
+*           residual of the matrix A stored in A(KB+1:M,KB+1:N),
+*           since the submatrix contains NaN and we stop
+*           the computation.
+*           But, we need to apply the block reflector to the residual
+*           right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the
+*           residual right hand sides exist.  This occurs
+*           when ( NRHS != 0 AND KB < (M-IOFFSET) ):
+*
+*           A(IF+1:M,N+1:N+NRHS) := A(IF+1:M,N+1:N+NRHS) -
+*                            A(IF+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**T.
+*
+            IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN
+               CALL SGEMM( 'No transpose', 'Transpose',
+     $               M-IF, NRHS, KB, -ONE, A( IF+1, 1 ), LDA,
+     $               F( N+1, 1 ), LDF, ONE, A( IF+1, N+1 ), LDA )
+            END IF
+*
+*           There is no need to recompute the 2-norm of the
+*           difficult columns, since we stop the factorization.
+*
+*           Array TAU(KF+1:MINMNFACT) is not set and contains
+*           undefined elements.
+*
+*           Return from the routine.
+*
+            RETURN
+         END IF
+*
+*        ===============================================================
+*
+         AIK = A( I, K )
+         A( I, K ) = ONE
+*
+*        ===============================================================
+*
+*        Compute the current K-th column of F:
+*          1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**T * A(I:M,K).
+*
+         IF( K.LT.N+NRHS ) THEN
+            CALL SGEMV( 'Transpose', M-I+1, N+NRHS-K,
+     $                  TAU( K ), A( I, K+1 ), LDA, A( I, K ), 1,
+     $                  ZERO, F( K+1, K ), 1 )
+         END IF
+*
+*           2) Zero out elements above and on the diagonal of the
+*              column K in matrix F, i.e. elements F(1:K,K).
+*
+         DO J = 1, K
+            F( J, K ) = ZERO
+         END DO
+*
+*         3) Incremental updating of the K-th column of F:
+*        F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**T
+*                    * A(I:M,K).
+*
+         IF( K.GT.1 ) THEN
+            CALL SGEMV( 'Transpose', M-I+1, K-1, -TAU( K ),
+     $                  A( I, 1 ), LDA, A( I, K ), 1, ZERO,
+     $                  AUXV( 1 ), 1 )
+*
+            CALL SGEMV( 'No transpose', N+NRHS, K-1, ONE,
+     $                  F( 1, 1 ), LDF, AUXV( 1 ), 1, ONE,
+     $                  F( 1, K ), 1 )
+         END IF
+*
+*        ===============================================================
+*
+*        Update the current I-th row of A:
+*        A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS)
+*                         - A(I,1:K)*F(K+1:N+NRHS,1:K)**T.
+*
+         IF( K.LT.N+NRHS ) THEN
+            CALL SGEMV( 'No transpose', N+NRHS-K, K, -ONE,
+     $                  F( K+1, 1 ), LDF, A( I, 1 ), LDA, ONE,
+     $                  A( I, K+1 ), LDA )
+         END IF
+*
+         A( I, K ) = AIK
+*
+*        Update the partial column 2-norms for the residual matrix,
+*        only if the residual matrix A(I+1:M,K+1:N) exists, i.e.
+*        when K < MINMNFACT = min( M-IOFFSET, N ).
+*
+         IF( K.LT.MINMNFACT ) THEN
+*
+            DO J = K + 1, N
+               IF( VN1( J ).NE.ZERO ) THEN
+*
+*                 NOTE: The following lines follow from the analysis in
+*                 Lapack Working Note 176.
+*
+                  TEMP = ABS( A( I, J ) ) / VN1( J )
+                  TEMP = MAX( ZERO, ( ONE+TEMP )*( ONE-TEMP ) )
+                  TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2
+                  IF( TEMP2.LE.TOL3Z ) THEN
+*
+*                    At J-index, we have a difficult column for the
+*                    update of the 2-norm. Save the index of the previous
+*                    difficult column in IWORK(J-1).
+*                    NOTE: LSTICC > 1, therefore we can use IWORK only
+*                    with N-1 elements, where the elements are
+*                    shifted by 1 to the left.
+*
+                     IWORK( J-1 ) = LSTICC
+*
+*                    Set the index of the last difficult column LSTICC.
+*
+                     LSTICC = J
+*
+                  ELSE
+                     VN1( J ) = VN1( J )*SQRT( TEMP )
+                  END IF
+               END IF
+            END DO
+*
+         END IF
+*
+*        End of while loop.
+*
+      END DO
+*
+*     Now, after the loop:
+*        Set KB, the number of factorized columns in the block;
+*        Set IF, the number of processed rows in the block, which
+*                is the same as the number of processed rows in
+*                the original whole matrix A_orig, IF = IOFFSET + KB.
+*
+      KB = K
+      IF = I
+*
+*     Apply the block reflector to the residual of the matrix A
+*     and the residual of the right hand sides B, if the residual
+*     matrix and/or the residual of the right hand sides
+*     exist,  i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists.
+*     This occurs when KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ):
+*
+*     A(IF+1:M,KB+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) -
+*                         A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**T.
+*
+      IF( KB.LT.MINMNUPDT ) THEN
+         CALL SGEMM( 'No transpose', 'Transpose',
+     $         M-IF, N+NRHS-KB, KB, -ONE, A( IF+1, 1 ), LDA,
+     $         F( KB+1, 1 ), LDF, ONE, A( IF+1, KB+1 ), LDA )
+      END IF
+*
+*     Recompute the 2-norm of the difficult columns.
+*     Loop over the index of the difficult columns from the largest
+*     to the smallest index.
+*
+      DO WHILE( LSTICC.GT.0 )
+*
+*        LSTICC, the index of the last difficult column, is greater
+*        than 1.
+*        ITEMP is the index of the previous difficult column.
+*
+         ITEMP = IWORK( LSTICC-1 )
+*
+*        Compute the 2-norm explicitly for the last difficult column and
+*        save it in the partial and exact 2-norm vectors VN1 and VN2.
+*
+*        NOTE: The computation of VN1( LSTICC ) relies on the fact that
+*        SNRM2 does not fail on vectors with norm below the value of
+*        SQRT(SLAMCH('S'))
+*
+         VN1( LSTICC ) = SNRM2( M-IF, A( IF+1, LSTICC ), 1 )
+         VN2( LSTICC ) = VN1( LSTICC )
+*
+*        Downdate the index of the last difficult column to
+*        the index of the previous difficult column.
+*
+         LSTICC = ITEMP
+*
+      END DO
+*
+      RETURN
+*
+*     End of SLAQP3RK
+*
+      END
diff --git a/lapack-netlib/SRC/slaqr2.f b/lapack-netlib/SRC/slaqr2.f
index 62c4ef5eb..caf79fd1c 100644
--- a/lapack-netlib/SRC/slaqr2.f
+++ b/lapack-netlib/SRC/slaqr2.f
@@ -263,7 +263,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERauxiliary
+*> \ingroup laqr2
 *
 *> \par Contributors:
 *  ==================
@@ -305,11 +305,11 @@
       LOGICAL            BULGE, SORTED
 *     ..
 *     .. External Functions ..
-      REAL               SLAMCH
-      EXTERNAL           SLAMCH
+      REAL               SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SCOPY, SGEHRD, SGEMM, SLABAD, SLACPY, SLAHQR,
+      EXTERNAL           SCOPY, SGEHRD, SGEMM, SLACPY, SLAHQR,
      $                   SLANV2, SLARF, SLARFG, SLASET, SORMHR, STREXC
 *     ..
 *     .. Intrinsic Functions ..
@@ -343,7 +343,7 @@
 *     ==== Quick return in case of workspace query. ====
 *
       IF( LWORK.EQ.-1 ) THEN
-         WORK( 1 ) = REAL( LWKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
          RETURN
       END IF
 *
@@ -362,7 +362,6 @@
 *
       SAFMIN = SLAMCH( 'SAFE MINIMUM' )
       SAFMAX = ONE / SAFMIN
-      CALL SLABAD( SAFMIN, SAFMAX )
       ULP = SLAMCH( 'PRECISION' )
       SMLNUM = SAFMIN*( REAL( N ) / ULP )
 *
@@ -674,7 +673,7 @@
 *
 *      ==== Return optimal workspace. ====
 *
-      WORK( 1 ) = REAL( LWKOPT )
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
 *     ==== End of SLAQR2 ====
 *
diff --git a/lapack-netlib/SRC/slaqr3.f b/lapack-netlib/SRC/slaqr3.f
index 519ccd6ed..d3ffb0f96 100644
--- a/lapack-netlib/SRC/slaqr3.f
+++ b/lapack-netlib/SRC/slaqr3.f
@@ -260,7 +260,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERauxiliary
+*> \ingroup laqr3
 *
 *> \par Contributors:
 *  ==================
@@ -302,14 +302,13 @@
       LOGICAL            BULGE, SORTED
 *     ..
 *     .. External Functions ..
-      REAL               SLAMCH
+      REAL               SLAMCH, SROUNDUP_LWORK
       INTEGER            ILAENV
-      EXTERNAL           SLAMCH, ILAENV
+      EXTERNAL           SLAMCH, SROUNDUP_LWORK, ILAENV
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SCOPY, SGEHRD, SGEMM, SLABAD, SLACPY, SLAHQR,
-     $                   SLANV2, SLAQR4, SLARF, SLARFG, SLASET, SORMHR,
-     $                   STREXC
+      EXTERNAL           SCOPY, SGEHRD, SGEMM, SLACPY, SLAHQR, SLANV2,
+     $                   SLAQR4, SLARF, SLARFG, SLASET, SORMHR, STREXC
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, INT, MAX, MIN, REAL, SQRT
@@ -348,7 +347,7 @@
 *     ==== Quick return in case of workspace query. ====
 *
       IF( LWORK.EQ.-1 ) THEN
-         WORK( 1 ) = REAL( LWKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
          RETURN
       END IF
 *
@@ -367,7 +366,6 @@
 *
       SAFMIN = SLAMCH( 'SAFE MINIMUM' )
       SAFMAX = ONE / SAFMIN
-      CALL SLABAD( SAFMIN, SAFMAX )
       ULP = SLAMCH( 'PRECISION' )
       SMLNUM = SAFMIN*( REAL( N ) / ULP )
 *
@@ -685,7 +683,7 @@
 *
 *      ==== Return optimal workspace. ====
 *
-      WORK( 1 ) = REAL( LWKOPT )
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
 *     ==== End of SLAQR3 ====
 *
diff --git a/lapack-netlib/SRC/slaqr4.f b/lapack-netlib/SRC/slaqr4.f
index 1f0a51c85..d6721df97 100644
--- a/lapack-netlib/SRC/slaqr4.f
+++ b/lapack-netlib/SRC/slaqr4.f
@@ -239,7 +239,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERauxiliary
+*> \ingroup laqr4
 *
 *> \par Contributors:
 *  ==================
@@ -316,7 +316,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Local Arrays ..
       REAL               ZDUM( 1, 1 )
@@ -325,7 +326,7 @@
       EXTERNAL           SLACPY, SLAHQR, SLANV2, SLAQR2, SLAQR5
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          ABS, INT, MAX, MIN, MOD, REAL
+      INTRINSIC          ABS, INT, MAX, MIN, MOD
 *     ..
 *     .. Executable Statements ..
       INFO = 0
@@ -401,7 +402,7 @@
 *        ==== Quick return in case of workspace query. ====
 *
          IF( LWORK.EQ.-1 ) THEN
-            WORK( 1 ) = REAL( LWKOPT )
+            WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
             RETURN
          END IF
 *
@@ -732,7 +733,7 @@
 *
 *     ==== Return the optimal value of LWORK. ====
 *
-      WORK( 1 ) = REAL( LWKOPT )
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
 *     ==== End of SLAQR4 ====
 *
diff --git a/lapack-netlib/SRC/slaqz0.f b/lapack-netlib/SRC/slaqz0.f
index 2e06f9d42..c128093e4 100644
--- a/lapack-netlib/SRC/slaqz0.f
+++ b/lapack-netlib/SRC/slaqz0.f
@@ -100,7 +100,7 @@
 *>      Anal., 29(2006), pp. 199--227.
 *>
 *> Ref: T. Steel, D. Camps, K. Meerbergen, R. Vandebril "A multishift,
-*>      multipole rational QZ method with agressive early deflation"
+*>      multipole rational QZ method with aggressive early deflation"
 *> \endverbatim
 *
 *  Arguments:
@@ -294,7 +294,7 @@
 *
 *> \date May 2020
 *
-*> \ingroup doubleGEcomputational
+*> \ingroup laqz0
 *>
 *  =====================================================================
       RECURSIVE SUBROUTINE SLAQZ0( WANTS, WANTQ, WANTZ, N, ILO, IHI, A,
@@ -329,9 +329,9 @@
       CHARACTER :: JBCMPZ*3
 
 *     External Functions
-      EXTERNAL :: XERBLA, SHGEQZ, SLAQZ3, SLAQZ4, SLASET, SLABAD,
+      EXTERNAL :: XERBLA, SHGEQZ, SLAQZ3, SLAQZ4, SLASET,
      $            SLARTG, SROT
-      REAL, EXTERNAL :: SLAMCH, SLANHS
+      REAL, EXTERNAL :: SLAMCH, SLANHS, SROUNDUP_LWORK
       LOGICAL, EXTERNAL :: LSAME
       INTEGER, EXTERNAL :: ILAENV
 
@@ -461,7 +461,7 @@
 
       LWORKREQ = MAX( ITEMP1+2*NW**2, ITEMP2+2*NBR**2 )
       IF ( LWORK .EQ.-1 ) THEN
-         WORK( 1 ) = REAL( LWORKREQ )
+         WORK( 1 ) = SROUNDUP_LWORK( LWORKREQ )
          RETURN
       ELSE IF ( LWORK .LT. LWORKREQ ) THEN
          INFO = -19
@@ -479,7 +479,6 @@
 *     Get machine constants
       SAFMIN = SLAMCH( 'SAFE MINIMUM' )
       SAFMAX = ONE/SAFMIN
-      CALL SLABAD( SAFMIN, SAFMAX )
       ULP = SLAMCH( 'PRECISION' )
       SMLNUM = SAFMIN*( REAL( N )/ULP )
 
@@ -564,7 +563,7 @@
          DO WHILE ( K.GE.ISTART2 )
 
             IF( ABS( B( K, K ) ) .LT. BTOL ) THEN
-*              A diagonal element of B is negligable, move it
+*              A diagonal element of B is negligible, move it
 *              to the top and deflate it
                
                DO K2 = K, ISTART2+1, -1
diff --git a/lapack-netlib/SRC/slaqz3.f b/lapack-netlib/SRC/slaqz3.f
index edb8a6012..979381364 100644
--- a/lapack-netlib/SRC/slaqz3.f
+++ b/lapack-netlib/SRC/slaqz3.f
@@ -228,7 +228,7 @@
 *
 *> \date May 2020
 *
-*> \ingroup doubleGEcomputational
+*> \ingroup laqz3
 *>
 *  =====================================================================
       RECURSIVE SUBROUTINE SLAQZ3( ILSCHUR, ILQ, ILZ, N, ILO, IHI, NW,
@@ -258,9 +258,9 @@
       REAL :: S, SMLNUM, ULP, SAFMIN, SAFMAX, C1, S1, TEMP
 
 *     External Functions
-      EXTERNAL :: XERBLA, STGEXC, SLABAD, SLAQZ0, SLACPY, SLASET,
+      EXTERNAL :: XERBLA, STGEXC, SLAQZ0, SLACPY, SLASET,
      $            SLAQZ2, SROT, SLARTG, SLAG2, SGEMM
-      REAL, EXTERNAL :: SLAMCH
+      REAL, EXTERNAL :: SLAMCH, SROUNDUP_LWORK
 
       INFO = 0
 
@@ -286,7 +286,7 @@
       LWORKREQ = MAX( LWORKREQ, N*NW, 2*NW**2+N )
       IF ( LWORK .EQ.-1 ) THEN
 *        workspace query, quick return
-         WORK( 1 ) = LWORKREQ
+         WORK( 1 ) = SROUNDUP_LWORK(LWORKREQ)
          RETURN
       ELSE IF ( LWORK .LT. LWORKREQ ) THEN
          INFO = -26
@@ -300,7 +300,6 @@
 *     Get machine constants
       SAFMIN = SLAMCH( 'SAFE MINIMUM' )
       SAFMAX = ONE/SAFMIN
-      CALL SLABAD( SAFMIN, SAFMAX )
       ULP = SLAMCH( 'PRECISION' )
       SMLNUM = SAFMIN*( REAL( N )/ULP )
 
diff --git a/lapack-netlib/SRC/slaqz4.f b/lapack-netlib/SRC/slaqz4.f
index 3c307dd47..95b2784c5 100644
--- a/lapack-netlib/SRC/slaqz4.f
+++ b/lapack-netlib/SRC/slaqz4.f
@@ -204,7 +204,7 @@
 *
 *> \date May 2020
 *
-*> \ingroup doubleGEcomputational
+*> \ingroup laqz4
 *>
 *  =====================================================================
       SUBROUTINE SLAQZ4( ILSCHUR, ILQ, ILZ, N, ILO, IHI, NSHIFTS,
@@ -236,6 +236,7 @@
 *     External functions
       EXTERNAL :: XERBLA, SGEMM, SLAQZ1, SLAQZ2, SLASET, SLARTG, SROT,
      $            SLACPY
+      REAL, EXTERNAL :: SROUNDUP_LWORK
 
       INFO = 0
       IF ( NBLOCK_DESIRED .LT. NSHIFTS+1 ) THEN
@@ -243,7 +244,7 @@
       END IF
       IF ( LWORK .EQ.-1 ) THEN
 *        workspace query, quick return
-         WORK( 1 ) = N*NBLOCK_DESIRED
+         WORK( 1 ) = SROUNDUP_LWORK(N*NBLOCK_DESIRED)
          RETURN
       ELSE IF ( LWORK .LT. N*NBLOCK_DESIRED ) THEN
          INFO = -25
diff --git a/lapack-netlib/SRC/slarfb_gett.f b/lapack-netlib/SRC/slarfb_gett.f
index 7719f2965..f1fdef790 100644
--- a/lapack-netlib/SRC/slarfb_gett.f
+++ b/lapack-netlib/SRC/slarfb_gett.f
@@ -451,7 +451,7 @@
          IF( LNOTIDENT ) THEN
 *
 *           col2_(2) Compute W2: = (V1**T) * W2 = (A1**T) * W2,
-*           V1 is not an identy matrix, but unit lower-triangular
+*           V1 is not an identity matrix, but unit lower-triangular
 *           V1 stored in A1 (diagonal ones are not stored).
 *
 *
diff --git a/lapack-netlib/SRC/slarfgp.f b/lapack-netlib/SRC/slarfgp.f
index df42980c4..c28274c2c 100644
--- a/lapack-netlib/SRC/slarfgp.f
+++ b/lapack-netlib/SRC/slarfgp.f
@@ -97,7 +97,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERauxiliary
+*> \ingroup larfgp
 *
 *  =====================================================================
       SUBROUTINE SLARFGP( N, ALPHA, X, INCX, TAU )
@@ -122,7 +122,7 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            J, KNT
-      REAL               BETA, BIGNUM, SAVEALPHA, SMLNUM, XNORM
+      REAL               BETA, BIGNUM, EPS, SAVEALPHA, SMLNUM, XNORM
 *     ..
 *     .. External Functions ..
       REAL               SLAMCH, SLAPY2, SNRM2
@@ -141,9 +141,10 @@
          RETURN
       END IF
 *
+      EPS = SLAMCH( 'Precision' )
       XNORM = SNRM2( N-1, X, INCX )
 *
-      IF( XNORM.EQ.ZERO ) THEN
+      IF( XNORM.LE.EPS*ABS(ALPHA) ) THEN
 *
 *        H  =  [+/-1, 0; I], sign chosen so ALPHA >= 0.
 *
diff --git a/lapack-netlib/SRC/slarre.f b/lapack-netlib/SRC/slarre.f
index 2e34ca5a6..e1b52c385 100644
--- a/lapack-netlib/SRC/slarre.f
+++ b/lapack-netlib/SRC/slarre.f
@@ -51,7 +51,7 @@
 *> SSTEMR to compute the eigenvectors of T.
 *> The accuracy varies depending on whether bisection is used to
 *> find a few eigenvalues or the dqds algorithm (subroutine SLASQ2) to
-*> conpute all and then discard any unwanted one.
+*> compute all and then discard any unwanted one.
 *> As an added benefit, SLARRE also outputs the n
 *> Gerschgorin intervals for the matrices L_i D_i L_i^T.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/slaruv.f b/lapack-netlib/SRC/slaruv.f
index c25dc2e2f..cd37a1c47 100644
--- a/lapack-netlib/SRC/slaruv.f
+++ b/lapack-netlib/SRC/slaruv.f
@@ -382,6 +382,11 @@
      $                   1537 /
 *     ..
 *     .. Executable Statements ..
+*
+*     Quick return for N < 1
+      IF ( N < 1 ) THEN
+         RETURN
+      END IF
 *
       I1 = ISEED( 1 )
       I2 = ISEED( 2 )
diff --git a/lapack-netlib/SRC/slas2.f b/lapack-netlib/SRC/slas2.f
index 6ae86204c..02ecbf434 100644
--- a/lapack-netlib/SRC/slas2.f
+++ b/lapack-netlib/SRC/slas2.f
@@ -93,9 +93,7 @@
 *>  infinite.
 *>
 *>  Overflow will not occur unless the largest singular value itself
-*>  overflows, or is within a few ulps of overflow. (On machines with
-*>  partial overflow, like the Cray, overflow may occur if the largest
-*>  singular value is within a factor of 2 of overflow.)
+*>  overflows, or is within a few ulps of overflow.
 *>
 *>  Underflow is harmless if underflow is gradual. Otherwise, results
 *>  may correspond to a matrix modified by perturbations of size near
diff --git a/lapack-netlib/SRC/slasd0.f b/lapack-netlib/SRC/slasd0.f
index a45f741a9..c8a5c7838 100644
--- a/lapack-netlib/SRC/slasd0.f
+++ b/lapack-netlib/SRC/slasd0.f
@@ -79,10 +79,11 @@
 *>         On exit, E has been destroyed.
 *> \endverbatim
 *>
-*> \param[out] U
+*> \param[in,out] U
 *> \verbatim
 *>          U is REAL array, dimension (LDU, N)
-*>         On exit, U contains the left singular vectors.
+*>         On exit, U contains the left singular vectors,
+*>          if U passed in as (N, N) Identity.
 *> \endverbatim
 *>
 *> \param[in] LDU
@@ -91,10 +92,11 @@
 *>         On entry, leading dimension of U.
 *> \endverbatim
 *>
-*> \param[out] VT
+*> \param[in,out] VT
 *> \verbatim
 *>          VT is REAL array, dimension (LDVT, M)
-*>         On exit, VT**T contains the right singular vectors.
+*>         On exit, VT**T contains the right singular vectors,
+*>          if VT passed in as (M, M) Identity.
 *> \endverbatim
 *>
 *> \param[in] LDVT
diff --git a/lapack-netlib/SRC/slasd3.f b/lapack-netlib/SRC/slasd3.f
index f9420f88a..8f74743c2 100644
--- a/lapack-netlib/SRC/slasd3.f
+++ b/lapack-netlib/SRC/slasd3.f
@@ -44,13 +44,6 @@
 *> appropriate calls to SLASD4 and then updates the singular
 *> vectors by matrix multiplication.
 *>
-*> This code makes very mild assumptions about floating point
-*> arithmetic. It will work on machines with a guard digit in
-*> add/subtract, or on those binary machines without guard digits
-*> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2.
-*> It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
-*>
 *> SLASD3 is called from SLASD1.
 *> \endverbatim
 *
@@ -103,7 +96,7 @@
 *>         The leading dimension of the array Q.  LDQ >= K.
 *> \endverbatim
 *>
-*> \param[in,out] DSIGMA
+*> \param[in] DSIGMA
 *> \verbatim
 *>          DSIGMA is REAL array, dimension(K)
 *>         The first K elements of this array contain the old roots
@@ -249,8 +242,8 @@
       REAL               RHO, TEMP
 *     ..
 *     .. External Functions ..
-      REAL               SLAMC3, SNRM2
-      EXTERNAL           SLAMC3, SNRM2
+      REAL               SNRM2
+      EXTERNAL           SNRM2
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SCOPY, SGEMM, SLACPY, SLASCL, SLASD4, XERBLA
@@ -310,27 +303,6 @@
          RETURN
       END IF
 *
-*     Modify values DSIGMA(i) to make sure all DSIGMA(i)-DSIGMA(j) can
-*     be computed with high relative accuracy (barring over/underflow).
-*     This is a problem on machines without a guard digit in
-*     add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2).
-*     The following code replaces DSIGMA(I) by 2*DSIGMA(I)-DSIGMA(I),
-*     which on any of these machines zeros out the bottommost
-*     bit of DSIGMA(I) if it is 1; this makes the subsequent
-*     subtractions DSIGMA(I)-DSIGMA(J) unproblematic when cancellation
-*     occurs. On binary machines with a guard digit (almost all
-*     machines) it does not change DSIGMA(I) at all. On hexadecimal
-*     and decimal machines with a guard digit, it slightly
-*     changes the bottommost bits of DSIGMA(I). It does not account
-*     for hexadecimal or decimal machines without guard digits
-*     (we know of none). We use a subroutine call to compute
-*     2*DSIGMA(I) to prevent optimizing compilers from eliminating
-*     this code.
-*
-      DO 20 I = 1, K
-         DSIGMA( I ) = SLAMC3( DSIGMA( I ), DSIGMA( I ) ) - DSIGMA( I )
-   20 CONTINUE
-*
 *     Keep a copy of Z.
 *
       CALL SCOPY( K, Z, 1, Q, 1 )
diff --git a/lapack-netlib/SRC/slasd8.f b/lapack-netlib/SRC/slasd8.f
index 43b171e5f..df5002367 100644
--- a/lapack-netlib/SRC/slasd8.f
+++ b/lapack-netlib/SRC/slasd8.f
@@ -121,14 +121,12 @@
 *>          The leading dimension of DIFR, must be at least K.
 *> \endverbatim
 *>
-*> \param[in,out] DSIGMA
+*> \param[in] DSIGMA
 *> \verbatim
 *>          DSIGMA is REAL array, dimension ( K )
 *>          On entry, the first K elements of this array contain the old
 *>          roots of the deflated updating problem.  These are the poles
 *>          of the secular equation.
-*>          On exit, the elements of DSIGMA may be very slightly altered
-*>          in value.
 *> \endverbatim
 *>
 *> \param[out] WORK
@@ -227,27 +225,6 @@
          RETURN
       END IF
 *
-*     Modify values DSIGMA(i) to make sure all DSIGMA(i)-DSIGMA(j) can
-*     be computed with high relative accuracy (barring over/underflow).
-*     This is a problem on machines without a guard digit in
-*     add/subtract (Cray XMP, Cray YMP, Cray C 90 and Cray 2).
-*     The following code replaces DSIGMA(I) by 2*DSIGMA(I)-DSIGMA(I),
-*     which on any of these machines zeros out the bottommost
-*     bit of DSIGMA(I) if it is 1; this makes the subsequent
-*     subtractions DSIGMA(I)-DSIGMA(J) unproblematic when cancellation
-*     occurs. On binary machines with a guard digit (almost all
-*     machines) it does not change DSIGMA(I) at all. On hexadecimal
-*     and decimal machines with a guard digit, it slightly
-*     changes the bottommost bits of DSIGMA(I). It does not account
-*     for hexadecimal or decimal machines without guard digits
-*     (we know of none). We use a subroutine call to compute
-*     2*DLAMBDA(I) to prevent optimizing compilers from eliminating
-*     this code.
-*
-      DO 10 I = 1, K
-         DSIGMA( I ) = SLAMC3( DSIGMA( I ), DSIGMA( I ) ) - DSIGMA( I )
-   10 CONTINUE
-*
 *     Book keeping.
 *
       IWK1 = 1
@@ -312,6 +289,11 @@
             DSIGJP = -DSIGMA( J+1 )
          END IF
          WORK( J ) = -Z( J ) / DIFLJ / ( DSIGMA( J )+DJ )
+*
+*        Use calls to the subroutine SLAMC3 to enforce the parentheses
+*        (x+y)+z. The goal is to prevent optimizing compilers
+*        from doing x+(y+z).
+*
          DO 60 I = 1, J - 1
             WORK( I ) = Z( I ) / ( SLAMC3( DSIGMA( I ), DSIGJ )-DIFLJ )
      $                   / ( DSIGMA( I )+DJ )
diff --git a/lapack-netlib/SRC/slassq.f90 b/lapack-netlib/SRC/slassq.f90
index 19f49402b..c8959f4a7 100644
--- a/lapack-netlib/SRC/slassq.f90
+++ b/lapack-netlib/SRC/slassq.f90
@@ -34,28 +34,15 @@
 !>
 !> \verbatim
 !>
-!> SLASSQ  returns the values  scl  and  smsq  such that
+!> SLASSQ returns the values scale_out and sumsq_out such that
 !>
-!>    ( scl**2 )*smsq = x( 1 )**2 +...+ x( n )**2 + ( scale**2 )*sumsq,
+!>    (scale_out**2)*sumsq_out = x( 1 )**2 +...+ x( n )**2 + (scale**2)*sumsq,
 !>
-!> where  x( i ) = X( 1 + ( i - 1 )*INCX ). The value of  sumsq  is
+!> where x( i ) = X( 1 + ( i - 1 )*INCX ). The value of sumsq is
 !> assumed to be non-negative.
 !>
 !> scale and sumsq must be supplied in SCALE and SUMSQ and
-!> scl and smsq are overwritten on SCALE and SUMSQ respectively.
-!>
-!> If scale * sqrt( sumsq ) > tbig then
-!>    we require:   scale >= sqrt( TINY*EPS ) / sbig   on entry,
-!> and if 0 < scale * sqrt( sumsq ) < tsml then
-!>    we require:   scale <= sqrt( HUGE ) / ssml       on entry,
-!> where
-!>    tbig -- upper threshold for values whose square is representable;
-!>    sbig -- scaling constant for big numbers; \see la_constants.f90
-!>    tsml -- lower threshold for values whose square is representable;
-!>    ssml -- scaling constant for small numbers; \see la_constants.f90
-!> and
-!>    TINY*EPS -- tiniest representable number;
-!>    HUGE     -- biggest representable number.
+!> scale_out and sumsq_out are overwritten on SCALE and SUMSQ respectively.
 !>
 !> \endverbatim
 !
@@ -72,7 +59,7 @@
 !> \verbatim
 !>          X is REAL array, dimension (1+(N-1)*abs(INCX))
 !>          The vector for which a scaled sum of squares is computed.
-!>             x( i )  = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n.
+!>             x( i ) = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n.
 !> \endverbatim
 !>
 !> \param[in] INCX
@@ -82,24 +69,24 @@
 !>          If INCX > 0, X(1+(i-1)*INCX) = x(i) for 1 <= i <= n
 !>          If INCX < 0, X(1-(n-i)*INCX) = x(i) for 1 <= i <= n
 !>          If INCX = 0, x isn't a vector so there is no need to call
-!>          this subroutine.  If you call it anyway, it will count x(1)
+!>          this subroutine. If you call it anyway, it will count x(1)
 !>          in the vector norm N times.
 !> \endverbatim
 !>
 !> \param[in,out] SCALE
 !> \verbatim
 !>          SCALE is REAL
-!>          On entry, the value  scale  in the equation above.
-!>          On exit, SCALE is overwritten with  scl , the scaling factor
+!>          On entry, the value scale in the equation above.
+!>          On exit, SCALE is overwritten by scale_out, the scaling factor
 !>          for the sum of squares.
 !> \endverbatim
 !>
 !> \param[in,out] SUMSQ
 !> \verbatim
 !>          SUMSQ is REAL
-!>          On entry, the value  sumsq  in the equation above.
-!>          On exit, SUMSQ is overwritten with  smsq , the basic sum of
-!>          squares from which  scl  has been factored out.
+!>          On entry, the value sumsq in the equation above.
+!>          On exit, SUMSQ is overwritten by sumsq_out, the basic sum of
+!>          squares from which scale_out has been factored out.
 !> \endverbatim
 !
 !  Authors:
@@ -130,10 +117,10 @@
 !>
 !> \endverbatim
 !
-!> \ingroup OTHERauxiliary
+!> \ingroup lassq
 !
 !  =====================================================================
-subroutine SLASSQ( n, x, incx, scl, sumsq )
+subroutine SLASSQ( n, x, incx, scale, sumsq )
    use LA_CONSTANTS, &
       only: wp=>sp, zero=>szero, one=>sone, &
             sbig=>ssbig, ssml=>sssml, tbig=>stbig, tsml=>stsml
@@ -145,7 +132,7 @@ subroutine SLASSQ( n, x, incx, scl, sumsq )
 !
 !  .. Scalar Arguments ..
    integer :: incx, n
-   real(wp) :: scl, sumsq
+   real(wp) :: scale, sumsq
 !  ..
 !  .. Array Arguments ..
    real(wp) :: x(*)
@@ -158,10 +145,10 @@ subroutine SLASSQ( n, x, incx, scl, sumsq )
 !
 !  Quick return if possible
 !
-   if( LA_ISNAN(scl) .or. LA_ISNAN(sumsq) ) return
-   if( sumsq == zero ) scl = one
-   if( scl == zero ) then
-      scl = one
+   if( LA_ISNAN(scale) .or. LA_ISNAN(sumsq) ) return
+   if( sumsq == zero ) scale = one
+   if( scale == zero ) then
+      scale = one
       sumsq = zero
    end if
    if (n <= 0) then
@@ -198,15 +185,27 @@ subroutine SLASSQ( n, x, incx, scl, sumsq )
 !  Put the existing sum of squares into one of the accumulators
 !
    if( sumsq > zero ) then
-      ax = scl*sqrt( sumsq )
+      ax = scale*sqrt( sumsq )
       if (ax > tbig) then
-!        We assume scl >= sqrt( TINY*EPS ) / sbig
-         abig = abig + (scl*sbig)**2 * sumsq
+         if (scale > one) then
+            scale = scale * sbig
+            abig = abig + scale * (scale * sumsq)
+         else
+            ! sumsq > tbig^2 => (sbig * (sbig * sumsq)) is representable
+            abig = abig + scale * (scale * (sbig * (sbig * sumsq)))
+         end if
       else if (ax < tsml) then
-!        We assume scl <= sqrt( HUGE ) / ssml
-         if (notbig) asml = asml + (scl*ssml)**2 * sumsq
+         if (notbig) then
+            if (scale < one) then
+               scale = scale * ssml
+               asml = asml + scale * (scale * sumsq)
+            else
+               ! sumsq < tsml^2 => (ssml * (ssml * sumsq)) is representable
+               asml = asml + scale * (scale * (ssml * (ssml * sumsq)))
+            end if
+         end if
       else
-         amed = amed + scl**2 * sumsq
+         amed = amed + scale * (scale * sumsq)
       end if
    end if
 !
@@ -220,7 +219,7 @@ subroutine SLASSQ( n, x, incx, scl, sumsq )
       if (amed > zero .or. LA_ISNAN(amed)) then
          abig = abig + (amed*sbig)*sbig
       end if
-      scl = one / sbig
+      scale = one / sbig
       sumsq = abig
    else if (asml > zero) then
 !
@@ -236,17 +235,17 @@ subroutine SLASSQ( n, x, incx, scl, sumsq )
             ymin = asml
             ymax = amed
          end if
-         scl = one
+         scale = one
          sumsq = ymax**2*( one + (ymin/ymax)**2 )
       else
-         scl = one / ssml
+         scale = one / ssml
          sumsq = asml
       end if
    else
 !
 !     Otherwise all values are mid-range or zero
 !
-      scl = one
+      scale = one
       sumsq = amed
    end if
    return
diff --git a/lapack-netlib/SRC/slasv2.f b/lapack-netlib/SRC/slasv2.f
index 6b98e9a0c..bf5d3ea0e 100644
--- a/lapack-netlib/SRC/slasv2.f
+++ b/lapack-netlib/SRC/slasv2.f
@@ -124,9 +124,7 @@
 *>  infinite.
 *>
 *>  Overflow will not occur unless the largest singular value itself
-*>  overflows or is within a few ulps of overflow. (On machines with
-*>  partial overflow, like the Cray, overflow may occur if the largest
-*>  singular value is within a factor of 2 of overflow.)
+*>  overflows or is within a few ulps of overflow.
 *>
 *>  Underflow is harmless if underflow is gradual. Otherwise, results
 *>  may correspond to a matrix modified by perturbations of size near
diff --git a/lapack-netlib/SRC/slaswlq.f b/lapack-netlib/SRC/slaswlq.f
index bc7cf343d..594c646db 100644
--- a/lapack-netlib/SRC/slaswlq.f
+++ b/lapack-netlib/SRC/slaswlq.f
@@ -96,21 +96,24 @@
 *>          The leading dimension of the array T.  LDT >= MB.
 *> \endverbatim
 *>
-*>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) REAL array, dimension (MAX(1,LWORK))
-*>
+*>          (workspace) REAL array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
+*>
 *> \param[in] LWORK
 *> \verbatim
-*>          The dimension of the array WORK.  LWORK >= MB * M.
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MB*M, otherwise.
+*>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>          only calculates the optimal size of the WORK array, returns
+*>          only calculates the minimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
-*>
 *> \endverbatim
+*>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
@@ -158,33 +161,39 @@
 *>     SIAM J. Sci. Comput, vol. 34, no. 1, 2012
 *> \endverbatim
 *>
+*> \ingroup laswlq
+*>
 *  =====================================================================
       SUBROUTINE SLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK,
-     $                  INFO)
+     $                    INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
 *
 *     .. Scalar Arguments ..
-      INTEGER           INFO, LDA, M, N, MB, NB, LWORK, LDT
+      INTEGER            INFO, LDA, M, N, MB, NB, LWORK, LDT
 *     ..
 *     .. Array Arguments ..
-      REAL              A( LDA, * ), WORK( * ), T( LDT, *)
+      REAL               A( LDA, * ), WORK( * ), T( LDT, * )
 *     ..
 *
 *  =====================================================================
 *
 *     ..
 *     .. Local Scalars ..
-      LOGICAL    LQUERY
-      INTEGER    I, II, KK, CTR
+      LOGICAL            LQUERY
+      INTEGER            I, II, KK, CTR, MINMN, LWMIN
 *     ..
 *     .. EXTERNAL FUNCTIONS ..
       LOGICAL            LSAME
       EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
+*     ..
 *     .. EXTERNAL SUBROUTINES ..
       EXTERNAL           SGELQT, SGEQRT, STPLQT, STPQRT, XERBLA
+*     ..
 *     .. INTRINSIC FUNCTIONS ..
       INTRINSIC          MAX, MIN, MOD
 *     ..
@@ -195,12 +204,19 @@
       INFO = 0
 *
       LQUERY = ( LWORK.EQ.-1 )
+*
+      MINMN = MIN( M, N )
+      IF( MINMN.EQ.0 ) THEN
+        LWMIN = 1
+      ELSE
+        LWMIN = M*MB
+      END IF
 *
       IF( M.LT.0 ) THEN
         INFO = -1
       ELSE IF( N.LT.0 .OR. N.LT.M ) THEN
         INFO = -2
-      ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 )) THEN
+      ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 ) ) THEN
         INFO = -3
       ELSE IF( NB.LE.0 ) THEN
         INFO = -4
@@ -208,60 +224,60 @@
         INFO = -6
       ELSE IF( LDT.LT.MB ) THEN
         INFO = -8
-      ELSE IF( ( LWORK.LT.M*MB) .AND. (.NOT.LQUERY) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN
         INFO = -10
       END IF
-      IF( INFO.EQ.0)  THEN
-      WORK(1) = MB*M
+      IF( INFO.EQ.0 ) THEN
+        WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       END IF
 *
       IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'SLASWLQ', -INFO )
         RETURN
-      ELSE IF (LQUERY) THEN
-       RETURN
+      ELSE IF( LQUERY ) THEN
+        RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( MIN(M,N).EQ.0 ) THEN
-          RETURN
+      IF( MINMN.EQ.0 ) THEN
+        RETURN
       END IF
 *
 *     The LQ Decomposition
 *
-       IF((M.GE.N).OR.(NB.LE.M).OR.(NB.GE.N)) THEN
-        CALL SGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO)
+      IF( (M.GE.N) .OR. (NB.LE.M) .OR. (NB.GE.N) ) THEN
+        CALL SGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO )
         RETURN
-       END IF
+      END IF
 *
-       KK = MOD((N-M),(NB-M))
-       II=N-KK+1
+      KK = MOD((N-M),(NB-M))
+      II = N-KK+1
 *
-*      Compute the LQ factorization of the first block A(1:M,1:NB)
+*     Compute the LQ factorization of the first block A(1:M,1:NB)
 *
-       CALL SGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO)
-       CTR = 1
+      CALL SGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO )
+      CTR = 1
 *
-       DO I = NB+1, II-NB+M , (NB-M)
+      DO I = NB+1, II-NB+M, (NB-M)
 *
-*      Compute the QR factorization of the current block A(1:M,I:I+NB-M)
+*       Compute the LQ factorization of the current block A(1:M,I:I+NB-M)
 *
-         CALL STPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ),
-     $                  LDA, T(1, CTR * M + 1),
-     $                  LDT, WORK, INFO )
-         CTR = CTR + 1
-       END DO
+        CALL STPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ),
+     $                 LDA, T(1, CTR * M + 1),
+     $                 LDT, WORK, INFO )
+        CTR = CTR + 1
+      END DO
 *
 *     Compute the QR factorization of the last block A(1:M,II:N)
 *
-       IF (II.LE.N) THEN
+      IF( II.LE.N ) THEN
         CALL STPLQT( M, KK, 0, MB, A(1,1), LDA, A( 1, II ),
-     $                  LDA, T(1, CTR * M + 1), LDT,
-     $                  WORK, INFO )
-       END IF
+     $                 LDA, T(1, CTR * M + 1), LDT,
+     $                 WORK, INFO )
+      END IF
 *
-      WORK( 1 ) = M * MB
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       RETURN
 *
 *     End of SLASWLQ
diff --git a/lapack-netlib/SRC/slatrs.f b/lapack-netlib/SRC/slatrs.f
index 0761d656f..9765ea3d7 100644
--- a/lapack-netlib/SRC/slatrs.f
+++ b/lapack-netlib/SRC/slatrs.f
@@ -261,6 +261,9 @@
       REAL               BIGNUM, GROW, REC, SMLNUM, SUMJ, TJJ, TJJS,
      $                   TMAX, TSCAL, USCAL, XBND, XJ, XMAX
 *     ..
+*     .. Local Arrays ..
+      REAL               WORK (1)
+*     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ISAMAX
@@ -362,7 +365,7 @@
 *              A is upper triangular.
 *
                DO J = 2, N
-                  TMAX = MAX( SLANGE( 'M', J-1, 1, A( 1, J ), 1, SUMJ ),
+                  TMAX = MAX( SLANGE( 'M', J-1, 1, A( 1, J ), 1, WORK ),
      $                        TMAX )
                END DO
             ELSE
@@ -371,7 +374,7 @@
 *
                DO J = 1, N - 1
                   TMAX = MAX( SLANGE( 'M', N-J, 1, A( J+1, J ), 1,
-     $                        SUMJ ), TMAX )
+     $                        WORK ), TMAX )
                END DO
             END IF
 *
diff --git a/lapack-netlib/SRC/slatrs3.f b/lapack-netlib/SRC/slatrs3.f
index c3a08e524..17052289e 100644
--- a/lapack-netlib/SRC/slatrs3.f
+++ b/lapack-netlib/SRC/slatrs3.f
@@ -151,13 +151,16 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is REAL array, dimension (LWORK).
-*>          On exit, if INFO = 0, WORK(1) returns the optimal size of
-*>          WORK.
+*>          WORK is REAL array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
+*> \verbatim
 *>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>
+*>          If MIN(N,NRHS) = 0, LWORK >= 1, else
 *>          LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where
 *>          NBA = (N + NB - 1)/NB and NB is the optimal block size.
 *>
@@ -165,6 +168,7 @@
 *>          only calculates the optimal dimensions of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
+*> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
@@ -181,7 +185,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleOTHERauxiliary
+*> \ingroup latrs3
 *> \par Further Details:
 *  =====================
 *  \verbatim
@@ -253,7 +257,7 @@
       LOGICAL            LQUERY, NOTRAN, NOUNIT, UPPER
       INTEGER            AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J,
      $                   JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2,
-     $                   LANRM, LDS, LSCALE, NB, NBA, NBX, RHS
+     $                   LANRM, LDS, LSCALE, NB, NBA, NBX, RHS, LWMIN
       REAL               ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC,
      $                   SCAMIN, SMLNUM, TMAX
 *     ..
@@ -264,7 +268,8 @@
       EXTERNAL           ILAENV, LSAME, SLAMCH, SLANGE, SLARMM
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SLATRS, SSCAL, XERBLA
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           SLATRS, SSCAL, SROUNDUP_LWORK, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, MIN
@@ -292,15 +297,24 @@
 *     row. WORK( I + KK * LDS ) is the scale factor of the vector
 *     segment associated with the I-th block row and the KK-th vector
 *     in the block column.
+*
       LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) )
       LDS = NBA
+*
 *     The second part stores upper bounds of the triangular A. There are
 *     a total of NBA x NBA blocks, of which only the upper triangular
 *     part or the lower triangular part is referenced. The upper bound of
 *     the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ).
+*
       LANRM = NBA * NBA
       AWRK = LSCALE
-      WORK( 1 ) = LSCALE + LANRM
+*
+      IF( MIN( N, NRHS ).EQ.0 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = LSCALE + LANRM
+      END IF
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
 *
 *     Test the input parameters.
 *
@@ -322,7 +336,7 @@
          INFO = -8
       ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
          INFO = -10
-      ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN
+      ELSE IF( .NOT.LQUERY .AND. LWORK.LT.LWMIN ) THEN
          INFO = -14
       END IF
       IF( INFO.NE.0 ) THEN
@@ -574,7 +588,7 @@
 *              Prepare the linear update to be executed with GEMM.
 *              For each column, compute a consistent scaling, a
 *              scaling factor to survive the linear update, and
-*              rescale the column segments, if necesssary. Then
+*              rescale the column segments, if necessary. Then
 *              the linear update is safely executed.
 *
                DO KK = 1, K2-K1
@@ -650,6 +664,10 @@
          END DO
       END DO
+*
+*     Report the workspace size in WORK( 1 ) before returning.
+*
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       RETURN
 *
 *     End of SLATRS3
 *
diff --git a/lapack-netlib/SRC/slatsqr.f b/lapack-netlib/SRC/slatsqr.f
index 33966c01f..4730815b5 100644
--- a/lapack-netlib/SRC/slatsqr.f
+++ b/lapack-netlib/SRC/slatsqr.f
@@ -101,14 +101,18 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) REAL array, dimension (MAX(1,LWORK))
+*>          (workspace) REAL array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
-*>          The dimension of the array WORK.  LWORK >= NB*N.
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= NB*N, otherwise.
+*>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>          only calculates the optimal size of the WORK array, returns
+*>          only calculates the minimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
 *> \endverbatim
@@ -160,33 +164,39 @@
 *>     SIAM J. Sci. Comput, vol. 34, no. 1, 2012
 *> \endverbatim
 *>
+*> \ingroup latsqr
+*>
 *  =====================================================================
       SUBROUTINE SLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK,
-     $                    LWORK, INFO)
+     $                    LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
 *
 *     .. Scalar Arguments ..
-      INTEGER           INFO, LDA, M, N, MB, NB, LDT, LWORK
+      INTEGER            INFO, LDA, M, N, MB, NB, LDT, LWORK
 *     ..
 *     .. Array Arguments ..
-      REAL  A( LDA, * ), WORK( * ), T(LDT, *)
+      REAL               A( LDA, * ), WORK( * ), T( LDT, * )
 *     ..
 *
 *  =====================================================================
 *
 *     ..
 *     .. Local Scalars ..
-      LOGICAL    LQUERY
-      INTEGER    I, II, KK, CTR
+      LOGICAL            LQUERY
+      INTEGER            I, II, KK, CTR, MINMN, LWMIN
 *     ..
 *     .. EXTERNAL FUNCTIONS ..
       LOGICAL            LSAME
       EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
+*     ..
 *     .. EXTERNAL SUBROUTINES ..
       EXTERNAL           SGEQRT, STPQRT, XERBLA
+*     ..
 *     .. INTRINSIC FUNCTIONS ..
       INTRINSIC          MAX, MIN, MOD
 *     ..
@@ -197,6 +207,13 @@
       INFO = 0
 *
       LQUERY = ( LWORK.EQ.-1 )
+*
+      MINMN = MIN( M, N )
+      IF( MINMN.EQ.0 ) THEN
+        LWMIN = 1
+      ELSE
+        LWMIN = N*NB
+      END IF
 *
       IF( M.LT.0 ) THEN
         INFO = -1
@@ -204,64 +221,65 @@
         INFO = -2
       ELSE IF( MB.LT.1 ) THEN
         INFO = -3
-      ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 )) THEN
+      ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 ) ) THEN
         INFO = -4
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
         INFO = -6
       ELSE IF( LDT.LT.NB ) THEN
         INFO = -8
-      ELSE IF( LWORK.LT.(N*NB) .AND. (.NOT.LQUERY) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN
         INFO = -10
       END IF
-      IF( INFO.EQ.0)  THEN
-        WORK(1) = NB*N
+*
+      IF( INFO.EQ.0 )  THEN
+        WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       END IF
       IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'SLATSQR', -INFO )
         RETURN
-      ELSE IF (LQUERY) THEN
-       RETURN
+      ELSE IF( LQUERY ) THEN
+        RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( MIN(M,N).EQ.0 ) THEN
-          RETURN
+      IF( MINMN.EQ.0 ) THEN
+        RETURN
       END IF
 *
 *     The QR Decomposition
 *
-       IF ((MB.LE.N).OR.(MB.GE.M)) THEN
-         CALL SGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO)
-         RETURN
-       END IF
-       KK = MOD((M-N),(MB-N))
-       II=M-KK+1
+      IF( (MB.LE.N) .OR. (MB.GE.M) ) THEN
+        CALL SGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO )
+        RETURN
+      END IF
+      KK = MOD((M-N),(MB-N))
+      II = M-KK+1
 *
-*      Compute the QR factorization of the first block A(1:MB,1:N)
+*     Compute the QR factorization of the first block A(1:MB,1:N)
 *
-       CALL SGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO )
+      CALL SGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO )
 *
-       CTR = 1
-       DO I = MB+1, II-MB+N ,  (MB-N)
+      CTR = 1
+      DO I = MB+1, II-MB+N, (MB-N)
 *
-*      Compute the QR factorization of the current block A(I:I+MB-N,1:N)
+*       Compute the QR factorization of the current block A(I:I+MB-N,1:N)
 *
-         CALL STPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA,
-     $                 T(1, CTR * N + 1),
-     $                  LDT, WORK, INFO )
-         CTR = CTR + 1
-       END DO
+        CALL STPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA,
+     $                T(1, CTR * N + 1),
+     $                LDT, WORK, INFO )
+        CTR = CTR + 1
+      END DO
 *
-*      Compute the QR factorization of the last block A(II:M,1:N)
+*     Compute the QR factorization of the last block A(II:M,1:N)
 *
-       IF (II.LE.M) THEN
-         CALL STPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA,
-     $                 T(1, CTR * N + 1), LDT,
-     $                  WORK, INFO )
-       END IF
+      IF( II.LE.M ) THEN
+        CALL STPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA,
+     $                T(1, CTR * N + 1), LDT,
+     $                WORK, INFO )
+      END IF
 *
-      work( 1 ) = N*NB
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       RETURN
 *
 *     End of SLATSQR
diff --git a/lapack-netlib/SRC/sorbdb1.f b/lapack-netlib/SRC/sorbdb1.f
index c860f4366..191e5742a 100644
--- a/lapack-netlib/SRC/sorbdb1.f
+++ b/lapack-netlib/SRC/sorbdb1.f
@@ -37,7 +37,7 @@
 *>\verbatim
 *>
 *> SORBDB1 simultaneously bidiagonalizes the blocks of a tall and skinny
-*> matrix X with orthonomal columns:
+*> matrix X with orthonormal columns:
 *>
 *>                            [ B11 ]
 *>      [ X11 ]   [ P1 |    ] [  0  ]
diff --git a/lapack-netlib/SRC/sorbdb2.f b/lapack-netlib/SRC/sorbdb2.f
index 484d352f8..b2ff34bb1 100644
--- a/lapack-netlib/SRC/sorbdb2.f
+++ b/lapack-netlib/SRC/sorbdb2.f
@@ -37,7 +37,7 @@
 *>\verbatim
 *>
 *> SORBDB2 simultaneously bidiagonalizes the blocks of a tall and skinny
-*> matrix X with orthonomal columns:
+*> matrix X with orthonormal columns:
 *>
 *>                            [ B11 ]
 *>      [ X11 ]   [ P1 |    ] [  0  ]
diff --git a/lapack-netlib/SRC/sorbdb3.f b/lapack-netlib/SRC/sorbdb3.f
index 6209b24ee..99478c5d0 100644
--- a/lapack-netlib/SRC/sorbdb3.f
+++ b/lapack-netlib/SRC/sorbdb3.f
@@ -37,7 +37,7 @@
 *>\verbatim
 *>
 *> SORBDB3 simultaneously bidiagonalizes the blocks of a tall and skinny
-*> matrix X with orthonomal columns:
+*> matrix X with orthonormal columns:
 *>
 *>                            [ B11 ]
 *>      [ X11 ]   [ P1 |    ] [  0  ]
diff --git a/lapack-netlib/SRC/sorbdb4.f b/lapack-netlib/SRC/sorbdb4.f
index bf60fb7bb..0fef5b759 100644
--- a/lapack-netlib/SRC/sorbdb4.f
+++ b/lapack-netlib/SRC/sorbdb4.f
@@ -38,7 +38,7 @@
 *>\verbatim
 *>
 *> SORBDB4 simultaneously bidiagonalizes the blocks of a tall and skinny
-*> matrix X with orthonomal columns:
+*> matrix X with orthonormal columns:
 *>
 *>                            [ B11 ]
 *>      [ X11 ]   [ P1 |    ] [  0  ]
diff --git a/lapack-netlib/SRC/sorbdb5.f b/lapack-netlib/SRC/sorbdb5.f
index 8c67aedfb..8fb88876f 100644
--- a/lapack-netlib/SRC/sorbdb5.f
+++ b/lapack-netlib/SRC/sorbdb5.f
@@ -148,7 +148,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup unbdb5
 *
 *  =====================================================================
       SUBROUTINE SORBDB5( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
@@ -169,18 +169,21 @@
 *  =====================================================================
 *
 *     .. Parameters ..
+      REAL               REALZERO
+      PARAMETER          ( REALZERO = 0.0E0 )
       REAL               ONE, ZERO
       PARAMETER          ( ONE = 1.0E0, ZERO = 0.0E0 )
 *     ..
 *     .. Local Scalars ..
       INTEGER            CHILDINFO, I, J
+      REAL               EPS, NORM, SCL, SSQ
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SORBDB6, XERBLA
+      EXTERNAL           SLASSQ, SORBDB6, SSCAL, XERBLA
 *     ..
 *     .. External Functions ..
-      REAL               SNRM2
-      EXTERNAL           SNRM2
+      REAL               SLAMCH, SNRM2
+      EXTERNAL           SLAMCH, SNRM2
 *     ..
 *     .. Intrinsic Function ..
       INTRINSIC          MAX
@@ -213,16 +216,33 @@
          RETURN
       END IF
 *
-*     Project X onto the orthogonal complement of Q
+      EPS = SLAMCH( 'Precision' )
 *
-      CALL SORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, LDQ2,
-     $              WORK, LWORK, CHILDINFO )
+*     Project X onto the orthogonal complement of Q if X is nonzero
 *
-*     If the projection is nonzero, then return
+      SCL = REALZERO
+      SSQ = REALZERO
+      CALL SLASSQ( M1, X1, INCX1, SCL, SSQ )
+      CALL SLASSQ( M2, X2, INCX2, SCL, SSQ )
+      NORM = SCL * SQRT( SSQ )
 *
-      IF( SNRM2(M1,X1,INCX1) .NE. ZERO
-     $    .OR. SNRM2(M2,X2,INCX2) .NE. ZERO ) THEN
-         RETURN
+      IF( NORM .GT. N * EPS ) THEN
+*        Scale vector to unit norm to avoid problems in the caller code.
+*        Computing the reciprocal is undesirable but
+*         * xLASCL cannot be used because of the vector increments and
+*         * the round-off error has a negligible impact on
+*           orthogonalization.
+         CALL SSCAL( M1, ONE / NORM, X1, INCX1 )
+         CALL SSCAL( M2, ONE / NORM, X2, INCX2 )
+         CALL SORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
+     $              LDQ2, WORK, LWORK, CHILDINFO )
+*
+*        If the projection is nonzero, then return
+*
+         IF( SNRM2(M1,X1,INCX1) .NE. REALZERO
+     $       .OR. SNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN
+            RETURN
+         END IF
       END IF
 *
 *     Project each standard basis vector e_1,...,e_M1 in turn, stopping
@@ -238,8 +258,8 @@
          END DO
          CALL SORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
      $                 LDQ2, WORK, LWORK, CHILDINFO )
-         IF( SNRM2(M1,X1,INCX1) .NE. ZERO
-     $       .OR. SNRM2(M2,X2,INCX2) .NE. ZERO ) THEN
+         IF( SNRM2(M1,X1,INCX1) .NE. REALZERO
+     $       .OR. SNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN
             RETURN
          END IF
       END DO
@@ -257,8 +277,8 @@
          X2(I) = ONE
          CALL SORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
      $                 LDQ2, WORK, LWORK, CHILDINFO )
-         IF( SNRM2(M1,X1,INCX1) .NE. ZERO
-     $       .OR. SNRM2(M2,X2,INCX2) .NE. ZERO ) THEN
+         IF( SNRM2(M1,X1,INCX1) .NE. REALZERO
+     $       .OR. SNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN
             RETURN
          END IF
       END DO
diff --git a/lapack-netlib/SRC/sorbdb6.f b/lapack-netlib/SRC/sorbdb6.f
index b2449e3be..eac177722 100644
--- a/lapack-netlib/SRC/sorbdb6.f
+++ b/lapack-netlib/SRC/sorbdb6.f
@@ -41,9 +41,8 @@
 *> with respect to the columns of
 *>      Q = [ Q1 ] .
 *>          [ Q2 ]
-*> The Euclidean norm of X must be one and the columns of Q must be
-*> orthonormal. The orthogonalized vector will be zero if and only if it
-*> lies entirely in the range of Q.
+*> The columns of Q must be orthonormal. The orthogonalized vector will
+*> be zero if and only if it lies entirely in the range of Q.
 *>
 *> The projection is computed with at most two iterations of the
 *> classical Gram-Schmidt algorithm, see
@@ -152,7 +151,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup unbdb6
 *
 *  =====================================================================
       SUBROUTINE SORBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
@@ -174,7 +173,7 @@
 *
 *     .. Parameters ..
       REAL               ALPHA, REALONE, REALZERO
-      PARAMETER          ( ALPHA = 0.01E0, REALONE = 1.0E0,
+      PARAMETER          ( ALPHA = 0.83E0, REALONE = 1.0E0,
      $                     REALZERO = 0.0E0 )
       REAL               NEGONE, ONE, ZERO
       PARAMETER          ( NEGONE = -1.0E0, ONE = 1.0E0, ZERO = 0.0E0 )
@@ -222,14 +221,16 @@
 *
       EPS = SLAMCH( 'Precision' )
 *
+*     Compute the Euclidean norm of X
+*
+      SCL = REALZERO
+      SSQ = REALZERO
+      CALL SLASSQ( M1, X1, INCX1, SCL, SSQ )
+      CALL SLASSQ( M2, X2, INCX2, SCL, SSQ )
+      NORM = SCL * SQRT( SSQ )
+*
 *     First, project X onto the orthogonal complement of Q's column
 *     space
-*
-*     Christoph Conrads: In debugging mode the norm should be computed
-*     and an assertion added comparing the norm with one. Alas, Fortran
-*     never made it into 1989 when assert() was introduced into the C
-*     programming language.
-      NORM = REALONE
 *
       IF( M1 .EQ. 0 ) THEN
          DO I = 1, N
diff --git a/lapack-netlib/SRC/sorgbr.f b/lapack-netlib/SRC/sorgbr.f
index b1a5c03a2..46f4ab130 100644
--- a/lapack-netlib/SRC/sorgbr.f
+++ b/lapack-netlib/SRC/sorgbr.f
@@ -150,7 +150,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGBcomputational
+*> \ingroup ungbr
 *
 *  =====================================================================
       SUBROUTINE SORGBR( VECT, M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
@@ -179,7 +179,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SORGLQ, SORGQR, XERBLA
@@ -240,7 +241,7 @@
          CALL XERBLA( 'SORGBR', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
          RETURN
       END IF
 *
@@ -326,7 +327,7 @@
             END IF
          END IF
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of SORGBR
diff --git a/lapack-netlib/SRC/sorghr.f b/lapack-netlib/SRC/sorghr.f
index f65cd898c..624ede282 100644
--- a/lapack-netlib/SRC/sorghr.f
+++ b/lapack-netlib/SRC/sorghr.f
@@ -119,7 +119,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup unghr
 *
 *  =====================================================================
       SUBROUTINE SORGHR( N, ILO, IHI, A, LDA, TAU, WORK, LWORK, INFO )
@@ -150,7 +150,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -177,7 +178,7 @@
       IF( INFO.EQ.0 ) THEN
          NB = ILAENV( 1, 'SORGQR', ' ', NH, NH, NH, -1 )
          LWKOPT = MAX( 1, NH )*NB
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -229,7 +230,7 @@
          CALL SORGQR( NH, NH, NH, A( ILO+1, ILO+1 ), LDA, TAU( ILO ),
      $                WORK, LWORK, IINFO )
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of SORGHR
diff --git a/lapack-netlib/SRC/sorglq.f b/lapack-netlib/SRC/sorglq.f
index b1d107964..30f6d5d48 100644
--- a/lapack-netlib/SRC/sorglq.f
+++ b/lapack-netlib/SRC/sorglq.f
@@ -120,7 +120,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup unglq
 *
 *  =====================================================================
       SUBROUTINE SORGLQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
@@ -155,7 +155,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -164,7 +165,7 @@
       INFO = 0
       NB = ILAENV( 1, 'SORGLQ', ' ', M, N, K, -1 )
       LWKOPT = MAX( 1, M )*NB
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -278,7 +279,7 @@
    50    CONTINUE
       END IF
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of SORGLQ
diff --git a/lapack-netlib/SRC/sorgql.f b/lapack-netlib/SRC/sorgql.f
index 34ab5edef..f104e64b2 100644
--- a/lapack-netlib/SRC/sorgql.f
+++ b/lapack-netlib/SRC/sorgql.f
@@ -121,7 +121,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup ungql
 *
 *  =====================================================================
       SUBROUTINE SORGQL( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
@@ -156,7 +156,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -181,7 +182,7 @@
             NB = ILAENV( 1, 'SORGQL', ' ', M, N, K, -1 )
             LWKOPT = N*NB
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN
             INFO = -8
@@ -285,7 +286,7 @@
    50    CONTINUE
       END IF
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of SORGQL
diff --git a/lapack-netlib/SRC/sorgqr.f b/lapack-netlib/SRC/sorgqr.f
index 056de54d7..a87ea6c65 100644
--- a/lapack-netlib/SRC/sorgqr.f
+++ b/lapack-netlib/SRC/sorgqr.f
@@ -121,7 +121,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup ungqr
 *
 *  =====================================================================
       SUBROUTINE SORGQR( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
@@ -156,7 +156,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -165,7 +166,7 @@
       INFO = 0
       NB = ILAENV( 1, 'SORGQR', ' ', M, N, K, -1 )
       LWKOPT = MAX( 1, N )*NB
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -279,7 +280,7 @@
    50    CONTINUE
       END IF
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of SORGQR
diff --git a/lapack-netlib/SRC/sorgrq.f b/lapack-netlib/SRC/sorgrq.f
index d9b6ccbe6..331f20904 100644
--- a/lapack-netlib/SRC/sorgrq.f
+++ b/lapack-netlib/SRC/sorgrq.f
@@ -121,7 +121,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup ungrq
 *
 *  =====================================================================
       SUBROUTINE SORGRQ( M, N, K, A, LDA, TAU, WORK, LWORK, INFO )
@@ -156,7 +156,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -181,7 +182,7 @@
             NB = ILAENV( 1, 'SORGRQ', ' ', M, N, K, -1 )
             LWKOPT = M*NB
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN
             INFO = -8
@@ -285,7 +286,7 @@
    50    CONTINUE
       END IF
 *
-      WORK( 1 ) = IWS
+      WORK( 1 ) = SROUNDUP_LWORK(IWS)
       RETURN
 *
 *     End of SORGRQ
diff --git a/lapack-netlib/SRC/sorgtr.f b/lapack-netlib/SRC/sorgtr.f
index 67bde00cb..6a1dc3034 100644
--- a/lapack-netlib/SRC/sorgtr.f
+++ b/lapack-netlib/SRC/sorgtr.f
@@ -116,7 +116,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup ungtr
 *
 *  =====================================================================
       SUBROUTINE SORGTR( UPLO, N, A, LDA, TAU, WORK, LWORK, INFO )
@@ -146,7 +146,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           ILAENV, LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SORGQL, SORGQR, XERBLA
@@ -178,7 +179,7 @@
            NB = ILAENV( 1, 'SORGQR', ' ', N-1, N-1, N-1, -1 )
          END IF
          LWKOPT = MAX( 1, N-1 )*NB
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -244,7 +245,7 @@
      $                   LWORK, IINFO )
          END IF
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of SORGTR
diff --git a/lapack-netlib/SRC/sorgtsqr.f b/lapack-netlib/SRC/sorgtsqr.f
index a755fa4a4..0be27af77 100644
--- a/lapack-netlib/SRC/sorgtsqr.f
+++ b/lapack-netlib/SRC/sorgtsqr.f
@@ -133,6 +133,7 @@
 *>
 *> \param[in] LWORK
 *> \verbatim
+*>          LWORK is INTEGER
 *>          The dimension of the array WORK.  LWORK >= (M+NB)*N.
 *>          If LWORK = -1, then a workspace query is assumed.
 *>          The routine only calculates the optimal size of the WORK
@@ -156,7 +157,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup singleOTHERcomputational
+*> \ingroup ungtsqr
 *
 *> \par Contributors:
 *  ==================
@@ -195,11 +196,15 @@
       LOGICAL            LQUERY
       INTEGER            IINFO, LDC, LWORKOPT, LC, LW, NBLOCAL, J
 *     ..
+*     .. External Functions ..
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
+*     ..
 *     .. External Subroutines ..
       EXTERNAL           SCOPY, SLAMTSQR, SLASET, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          REAL, MAX, MIN
+      INTRINSIC          MAX, MIN
 *     ..
 *     .. Executable Statements ..
 *
@@ -256,14 +261,14 @@
          CALL XERBLA( 'SORGTSQR', -INFO )
          RETURN
       ELSE IF ( LQUERY ) THEN
-         WORK( 1 ) = REAL( LWORKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT )
          RETURN
       END IF
 *
 *     Quick return if possible
 *
       IF( MIN( M, N ).EQ.0 ) THEN
-         WORK( 1 ) = REAL( LWORKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT )
          RETURN
       END IF
 *
@@ -296,9 +301,9 @@
          CALL SCOPY( M, WORK( (J-1)*LDC + 1 ), 1, A( 1, J ), 1 )
       END DO
 *
-      WORK( 1 ) = REAL( LWORKOPT )
+      WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT )
       RETURN
 *
 *     End of SORGTSQR
 *
-      END
\ No newline at end of file
+      END
diff --git a/lapack-netlib/SRC/sorgtsqr_row.f b/lapack-netlib/SRC/sorgtsqr_row.f
index d2a2150cd..5a1e1ff07 100644
--- a/lapack-netlib/SRC/sorgtsqr_row.f
+++ b/lapack-netlib/SRC/sorgtsqr_row.f
@@ -169,7 +169,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup sigleOTHERcomputational
+*> \ingroup ungtsqr_row
 *
 *> \par Contributors:
 *  ==================
@@ -213,11 +213,15 @@
 *     .. Local Arrays ..
       REAL               DUMMY( 1, 1 )
 *     ..
+*     .. External Functions ..
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
+*     ..
 *     .. External Subroutines ..
       EXTERNAL           SLARFB_GETT, SLASET, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          REAL, MAX, MIN
+      INTRINSIC          MAX, MIN
 *     ..
 *     .. Executable Statements ..
 *
@@ -255,14 +259,14 @@
          CALL XERBLA( 'SORGTSQR_ROW', -INFO )
          RETURN
       ELSE IF ( LQUERY ) THEN
-         WORK( 1 ) = REAL( LWORKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT )
          RETURN
       END IF
 *
 *     Quick return if possible
 *
       IF( MIN( M, N ).EQ.0 ) THEN
-         WORK( 1 ) = REAL( LWORKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT )
          RETURN
       END IF
 *
@@ -371,7 +375,7 @@
 *
       END DO
 *
-      WORK( 1 ) = REAL( LWORKOPT )
+      WORK( 1 ) = SROUNDUP_LWORK( LWORKOPT )
       RETURN
 *
 *     End of SORGTSQR_ROW
diff --git a/lapack-netlib/SRC/sorm22.f b/lapack-netlib/SRC/sorm22.f
index 15096870a..886adb2cf 100644
--- a/lapack-netlib/SRC/sorm22.f
+++ b/lapack-netlib/SRC/sorm22.f
@@ -155,7 +155,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexOTHERcomputational
+*> \ingroup unm22
 *
 *  =====================================================================
       SUBROUTINE SORM22( SIDE, TRANS, M, N, N1, N2, Q, LDQ, C, LDC,
@@ -187,13 +187,14 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGEMM, SLACPY, STRMM, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          REAL, MAX, MIN
+      INTRINSIC          MAX, MIN
 *     ..
 *     .. Executable Statements ..
 *
@@ -237,7 +238,7 @@
 *
       IF( INFO.EQ.0 ) THEN
          LWKOPT = M*N
-         WORK( 1 ) = REAL( LWKOPT )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -430,7 +431,7 @@
          END IF
       END IF
 *
-      WORK( 1 ) = REAL( LWKOPT )
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       RETURN
 *
 *     End of SORM22
diff --git a/lapack-netlib/SRC/sormbr.f b/lapack-netlib/SRC/sormbr.f
index efe5be41a..e2dccc363 100644
--- a/lapack-netlib/SRC/sormbr.f
+++ b/lapack-netlib/SRC/sormbr.f
@@ -188,7 +188,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup unmbr
 *
 *  =====================================================================
       SUBROUTINE SORMBR( VECT, SIDE, TRANS, M, N, K, A, LDA, TAU, C,
@@ -217,7 +217,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           ILAENV, LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SORMLQ, SORMQR, XERBLA
@@ -285,7 +286,7 @@
             END IF
          END IF
          LWKOPT = NW*NB
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -363,7 +364,7 @@
      $                   TAU, C( I1, I2 ), LDC, WORK, LWORK, IINFO )
          END IF
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of SORMBR
diff --git a/lapack-netlib/SRC/sormhr.f b/lapack-netlib/SRC/sormhr.f
index 2d2053af4..e033feb38 100644
--- a/lapack-netlib/SRC/sormhr.f
+++ b/lapack-netlib/SRC/sormhr.f
@@ -171,7 +171,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup unmhr
 *
 *  =====================================================================
       SUBROUTINE SORMHR( SIDE, TRANS, M, N, ILO, IHI, A, LDA, TAU, C,
@@ -199,7 +199,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           ILAENV, LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SORMQR, XERBLA
@@ -253,7 +254,7 @@
             NB = ILAENV( 1, 'SORMQR', SIDE // TRANS, M, NH, NH, -1 )
          END IF
          LWKOPT = NW*NB
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -285,7 +286,7 @@
       CALL SORMQR( SIDE, TRANS, MI, NI, NH, A( ILO+1, ILO ), LDA,
      $             TAU( ILO ), C( I1, I2 ), LDC, WORK, LWORK, IINFO )
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of SORMHR
diff --git a/lapack-netlib/SRC/sormlq.f b/lapack-netlib/SRC/sormlq.f
index ee996e560..1a32568b6 100644
--- a/lapack-netlib/SRC/sormlq.f
+++ b/lapack-netlib/SRC/sormlq.f
@@ -160,7 +160,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup unmlq
 *
 *  =====================================================================
       SUBROUTINE SORMLQ( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC,
@@ -195,7 +195,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLARFB, SLARFT, SORML2, XERBLA
@@ -246,7 +247,7 @@
          NB = MIN( NBMAX, ILAENV( 1, 'SORMLQ', SIDE // TRANS, M, N, K,
      $             -1 ) )
          LWKOPT = NW*NB + TSIZE
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -338,7 +339,7 @@
      $                   C( IC, JC ), LDC, WORK, LDWORK )
    10    CONTINUE
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of SORMLQ
diff --git a/lapack-netlib/SRC/sormql.f b/lapack-netlib/SRC/sormql.f
index 72a8d22ee..9564d4141 100644
--- a/lapack-netlib/SRC/sormql.f
+++ b/lapack-netlib/SRC/sormql.f
@@ -160,7 +160,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup unmql
 *
 *  =====================================================================
       SUBROUTINE SORMQL( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC,
@@ -194,7 +194,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLARFB, SLARFT, SORM2L, XERBLA
@@ -249,7 +250,7 @@
      $                               K, -1 ) )
             LWKOPT = NW*NB + TSIZE
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -330,7 +331,7 @@
      $                   WORK, LDWORK )
    10    CONTINUE
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of SORMQL
diff --git a/lapack-netlib/SRC/sormqr.f b/lapack-netlib/SRC/sormqr.f
index 5d4256f09..adb1203df 100644
--- a/lapack-netlib/SRC/sormqr.f
+++ b/lapack-netlib/SRC/sormqr.f
@@ -160,7 +160,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup unmqr
 *
 *  =====================================================================
       SUBROUTINE SORMQR( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC,
@@ -194,7 +194,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLARFB, SLARFT, SORM2R, XERBLA
@@ -245,7 +246,7 @@
          NB = MIN( NBMAX, ILAENV( 1, 'SORMQR', SIDE // TRANS, M, N, K,
      $        -1 ) )
          LWKOPT = NW*NB + TSIZE
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -331,7 +332,7 @@
      $                   C( IC, JC ), LDC, WORK, LDWORK )
    10    CONTINUE
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of SORMQR
diff --git a/lapack-netlib/SRC/sormrq.f b/lapack-netlib/SRC/sormrq.f
index 62fcdacdb..f091f0507 100644
--- a/lapack-netlib/SRC/sormrq.f
+++ b/lapack-netlib/SRC/sormrq.f
@@ -160,7 +160,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup unmrq
 *
 *  =====================================================================
       SUBROUTINE SORMRQ( SIDE, TRANS, M, N, K, A, LDA, TAU, C, LDC,
@@ -195,7 +195,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLARFB, SLARFT, SORMR2, XERBLA
@@ -250,7 +251,7 @@
      $                               K, -1 ) )
             LWKOPT = NW*NB + TSIZE
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -337,7 +338,7 @@
      $                   WORK, LDWORK )
    10    CONTINUE
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of SORMRQ
diff --git a/lapack-netlib/SRC/sormrz.f b/lapack-netlib/SRC/sormrz.f
index cdadd62b5..b037a984b 100644
--- a/lapack-netlib/SRC/sormrz.f
+++ b/lapack-netlib/SRC/sormrz.f
@@ -168,7 +168,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup unmrz
 *
 *> \par Contributors:
 *  ==================
@@ -213,7 +213,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLARZB, SLARZT, SORMR3, XERBLA
@@ -271,7 +272,7 @@
      $                               K, -1 ) )
             LWKOPT = NW*NB + TSIZE
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -367,7 +368,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/sormtr.f b/lapack-netlib/SRC/sormtr.f
index 3ba749fee..1bc87768f 100644
--- a/lapack-netlib/SRC/sormtr.f
+++ b/lapack-netlib/SRC/sormtr.f
@@ -164,7 +164,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup unmtr
 *
 *  =====================================================================
       SUBROUTINE SORMTR( SIDE, UPLO, TRANS, M, N, A, LDA, TAU, C, LDC,
@@ -192,7 +192,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           ILAENV, LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SORMQL, SORMQR, XERBLA
@@ -256,7 +257,7 @@
             END IF
          END IF
          LWKOPT = NW*NB
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -301,7 +302,7 @@
          CALL SORMQR( SIDE, TRANS, MI, NI, NQ-1, A( 2, 1 ), LDA, TAU,
      $                C( I1, I2 ), LDC, WORK, LWORK, IINFO )
       END IF
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of SORMTR
diff --git a/lapack-netlib/SRC/spbsv.f b/lapack-netlib/SRC/spbsv.f
index 2d084424e..8929321ea 100644
--- a/lapack-netlib/SRC/spbsv.f
+++ b/lapack-netlib/SRC/spbsv.f
@@ -119,9 +119,9 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i of A is not
-*>                positive definite, so the factorization could not be
-*>                completed, and the solution has not been computed.
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                of A is not positive, so the factorization could not
+*>                be completed, and the solution has not been computed.
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/spbsvx.f b/lapack-netlib/SRC/spbsvx.f
index 27907eb85..6abb6caae 100644
--- a/lapack-netlib/SRC/spbsvx.f
+++ b/lapack-netlib/SRC/spbsvx.f
@@ -71,7 +71,7 @@
 *>    where U is an upper triangular band matrix, and L is a lower
 *>    triangular band matrix.
 *>
-*> 3. If the leading i-by-i principal minor is not positive definite,
+*> 3. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A.  If the reciprocal of the condition number is less than machine
@@ -281,10 +281,10 @@
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *>          > 0:  if INFO = i, and i is
-*>                <= N:  the leading minor of order i of A is
-*>                       not positive definite, so the factorization
-*>                       could not be completed, and the solution has not
-*>                       been computed. RCOND = 0 is returned.
+*>                <= N:  the leading principal minor of order i of A
+*>                       is not positive, so the factorization could not
+*>                       be completed, and the solution has not been
+*>                       computed. RCOND = 0 is returned.
 *>                = N+1: U is nonsingular, but RCOND is less than machine
 *>                       precision, meaning that the matrix is singular
 *>                       to working precision.  Nevertheless, the
diff --git a/lapack-netlib/SRC/spbtf2.f b/lapack-netlib/SRC/spbtf2.f
index ff08bc327..b5aac1ccd 100644
--- a/lapack-netlib/SRC/spbtf2.f
+++ b/lapack-netlib/SRC/spbtf2.f
@@ -97,8 +97,8 @@
 *>          INFO is INTEGER
 *>          = 0: successful exit
 *>          < 0: if INFO = -k, the k-th argument had an illegal value
-*>          > 0: if INFO = k, the leading minor of order k is not
-*>               positive definite, and the factorization could not be
+*>          > 0: if INFO = k, the leading principal minor of order k
+*>               is not positive, and the factorization could not be
 *>               completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/spbtrf.f b/lapack-netlib/SRC/spbtrf.f
index ef5dcbb98..d3ae216e5 100644
--- a/lapack-netlib/SRC/spbtrf.f
+++ b/lapack-netlib/SRC/spbtrf.f
@@ -92,8 +92,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/spftrf.f b/lapack-netlib/SRC/spftrf.f
index 9e2c11eea..0ed0e3abd 100644
--- a/lapack-netlib/SRC/spftrf.f
+++ b/lapack-netlib/SRC/spftrf.f
@@ -91,8 +91,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/sposv.f b/lapack-netlib/SRC/sposv.f
index fe0a35a56..336332d2d 100644
--- a/lapack-netlib/SRC/sposv.f
+++ b/lapack-netlib/SRC/sposv.f
@@ -110,9 +110,9 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i of A is not
-*>                positive definite, so the factorization could not be
-*>                completed, and the solution has not been computed.
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                of A is not positive, so the factorization could not
+*>                be completed, and the solution has not been computed.
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/sposvx.f b/lapack-netlib/SRC/sposvx.f
index bcf38c7ea..0770897d2 100644
--- a/lapack-netlib/SRC/sposvx.f
+++ b/lapack-netlib/SRC/sposvx.f
@@ -71,7 +71,7 @@
 *>    where U is an upper triangular matrix and L is a lower triangular
 *>    matrix.
 *>
-*> 3. If the leading i-by-i principal minor is not positive definite,
+*> 3. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A.  If the reciprocal of the condition number is less than machine
@@ -277,10 +277,10 @@
 *>          = 0: successful exit
 *>          < 0: if INFO = -i, the i-th argument had an illegal value
 *>          > 0: if INFO = i, and i is
-*>                <= N:  the leading minor of order i of A is
-*>                       not positive definite, so the factorization
-*>                       could not be completed, and the solution has not
-*>                       been computed. RCOND = 0 is returned.
+*>                <= N:  the leading principal minor of order i of A
+*>                       is not positive, so the factorization could not
+*>                       be completed, and the solution has not been
+*>                       computed. RCOND = 0 is returned.
 *>                = N+1: U is nonsingular, but RCOND is less than machine
 *>                       precision, meaning that the matrix is singular
 *>                       to working precision.  Nevertheless, the
diff --git a/lapack-netlib/SRC/sposvxx.f b/lapack-netlib/SRC/sposvxx.f
index 08adf1d58..19e599f64 100644
--- a/lapack-netlib/SRC/sposvxx.f
+++ b/lapack-netlib/SRC/sposvxx.f
@@ -88,7 +88,7 @@
 *>    where U is an upper triangular matrix and L is a lower triangular
 *>    matrix.
 *>
-*>    3. If the leading i-by-i principal minor is not positive definite,
+*>    3. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A (see argument RCOND).  If the reciprocal of the condition number
diff --git a/lapack-netlib/SRC/spotf2.f b/lapack-netlib/SRC/spotf2.f
index 5b3504834..773b768b2 100644
--- a/lapack-netlib/SRC/spotf2.f
+++ b/lapack-netlib/SRC/spotf2.f
@@ -89,8 +89,8 @@
 *>          INFO is INTEGER
 *>          = 0: successful exit
 *>          < 0: if INFO = -k, the k-th argument had an illegal value
-*>          > 0: if INFO = k, the leading minor of order k is not
-*>               positive definite, and the factorization could not be
+*>          > 0: if INFO = k, the leading principal minor of order k
+*>               is not positive, and the factorization could not be
 *>               completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/spotrf.f b/lapack-netlib/SRC/spotrf.f
index 5d5771c86..12ef58a40 100644
--- a/lapack-netlib/SRC/spotrf.f
+++ b/lapack-netlib/SRC/spotrf.f
@@ -87,8 +87,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/spotrf2.f b/lapack-netlib/SRC/spotrf2.f
index ae0484ce1..ef731ffa2 100644
--- a/lapack-netlib/SRC/spotrf2.f
+++ b/lapack-netlib/SRC/spotrf2.f
@@ -86,8 +86,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/sppsv.f b/lapack-netlib/SRC/sppsv.f
index 2d3fb3d91..1f48dd4db 100644
--- a/lapack-netlib/SRC/sppsv.f
+++ b/lapack-netlib/SRC/sppsv.f
@@ -104,9 +104,9 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i of A is not
-*>                positive definite, so the factorization could not be
-*>                completed, and the solution has not been computed.
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                of A is not positive, so the factorization could not
+*>                be completed, and the solution has not been computed.
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/sppsvx.f b/lapack-netlib/SRC/sppsvx.f
index 7d71efcd5..bd2da20ee 100644
--- a/lapack-netlib/SRC/sppsvx.f
+++ b/lapack-netlib/SRC/sppsvx.f
@@ -69,7 +69,7 @@
 *>    where U is an upper triangular matrix and L is a lower triangular
 *>    matrix.
 *>
-*> 3. If the leading i-by-i principal minor is not positive definite,
+*> 3. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A.  If the reciprocal of the condition number is less than machine
@@ -262,10 +262,10 @@
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *>          > 0:  if INFO = i, and i is
-*>                <= N:  the leading minor of order i of A is
-*>                       not positive definite, so the factorization
-*>                       could not be completed, and the solution has not
-*>                       been computed. RCOND = 0 is returned.
+*>                <= N:  the leading principal minor of order i of A
+*>                       is not positive, so the factorization could not
+*>                       be completed, and the solution has not been
+*>                       computed. RCOND = 0 is returned.
 *>                = N+1: U is nonsingular, but RCOND is less than machine
 *>                       precision, meaning that the matrix is singular
 *>                       to working precision.  Nevertheless, the
diff --git a/lapack-netlib/SRC/spptrf.f b/lapack-netlib/SRC/spptrf.f
index 0f90399cc..be51c4824 100644
--- a/lapack-netlib/SRC/spptrf.f
+++ b/lapack-netlib/SRC/spptrf.f
@@ -79,8 +79,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/spteqr.f b/lapack-netlib/SRC/spteqr.f
index 0d6bf911b..6e6e9aa45 100644
--- a/lapack-netlib/SRC/spteqr.f
+++ b/lapack-netlib/SRC/spteqr.f
@@ -123,8 +123,8 @@
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value.
 *>          > 0:  if INFO = i, and i is:
 *>                <= N  the Cholesky factorization of the matrix could
-*>                      not be performed because the i-th principal minor
-*>                      was not positive definite.
+*>                      not be performed because the leading principal
+*>                      minor of order i was not positive.
 *>                > N   the SVD algorithm failed to converge;
 *>                      if INFO = N+i, i off-diagonal elements of the
 *>                      bidiagonal factor did not converge to zero.
diff --git a/lapack-netlib/SRC/sptsv.f b/lapack-netlib/SRC/sptsv.f
index 46aadf4c3..f11c22594 100644
--- a/lapack-netlib/SRC/sptsv.f
+++ b/lapack-netlib/SRC/sptsv.f
@@ -93,8 +93,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the solution has not been
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the solution has not been
 *>                computed.  The factorization has not been completed
 *>                unless i = N.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/sptsvx.f b/lapack-netlib/SRC/sptsvx.f
index 8dd04b054..eaa691cef 100644
--- a/lapack-netlib/SRC/sptsvx.f
+++ b/lapack-netlib/SRC/sptsvx.f
@@ -59,7 +59,7 @@
 *>    factorization can also be regarded as having the form
 *>    A = U**T*D*U.
 *>
-*> 2. If the leading i-by-i principal minor is not positive definite,
+*> 2. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A.  If the reciprocal of the condition number is less than machine
@@ -199,10 +199,10 @@
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *>          > 0:  if INFO = i, and i is
-*>                <= N:  the leading minor of order i of A is
-*>                       not positive definite, so the factorization
-*>                       could not be completed, and the solution has not
-*>                       been computed. RCOND = 0 is returned.
+*>                <= N:  the leading principal minor of order i of A
+*>                       is not positive, so the factorization could not
+*>                       be completed, and the solution has not been
+*>                       computed. RCOND = 0 is returned.
 *>                = N+1: U is nonsingular, but RCOND is less than machine
 *>                       precision, meaning that the matrix is singular
 *>                       to working precision.  Nevertheless, the
diff --git a/lapack-netlib/SRC/spttrf.f b/lapack-netlib/SRC/spttrf.f
index 2217a7338..e083b7456 100644
--- a/lapack-netlib/SRC/spttrf.f
+++ b/lapack-netlib/SRC/spttrf.f
@@ -70,8 +70,8 @@
 *>          INFO is INTEGER
 *>          = 0: successful exit
 *>          < 0: if INFO = -k, the k-th argument had an illegal value
-*>          > 0: if INFO = k, the leading minor of order k is not
-*>               positive definite; if k < N, the factorization could not
+*>          > 0: if INFO = k, the leading principal minor of order k
+*>               is not positive; if k < N, the factorization could not
 *>               be completed, while if k = N, the factorization was
 *>               completed, but D(N) <= 0.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/ssbev_2stage.f b/lapack-netlib/SRC/ssbev_2stage.f
index 5752c1ecc..71ace4e27 100644
--- a/lapack-netlib/SRC/ssbev_2stage.f
+++ b/lapack-netlib/SRC/ssbev_2stage.f
@@ -131,7 +131,7 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The length of the array WORK. LWORK >= 1, when N <= 1;
-*>          otherwise  
+*>          otherwise
 *>          If JOBZ = 'N' and N > 1, LWORK must be queried.
 *>                                   LWORK = MAX(1, dimension) where
 *>                                   dimension = (2KD+1)*N + KD*NTHREADS + N
@@ -164,7 +164,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHEReigen
+*> \ingroup hbev_2stage
 *
 *> \par Further Details:
 *  =====================
@@ -182,7 +182,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -190,11 +190,11 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *
@@ -232,12 +232,13 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV2STAGE
-      REAL               SLAMCH, SLANSB
-      EXTERNAL           LSAME, SLAMCH, SLANSB, ILAENV2STAGE
+      REAL               SLAMCH, SLANSB, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, SLANSB, ILAENV2STAGE,
+     $                   SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLASCL, SSCAL, SSTEQR, SSTERF, XERBLA,
-     $                   SSYTRD_SB2ST 
+     $                   SSYTRD_SB2ST
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          SQRT
@@ -268,7 +269,7 @@
       IF( INFO.EQ.0 ) THEN
          IF( N.LE.1 ) THEN
             LWMIN = 1
-            WORK( 1 ) = LWMIN
+            WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          ELSE
             IB    = ILAENV2STAGE( 2, 'SSYTRD_SB2ST', JOBZ,
      $                            N, KD, -1, -1 )
@@ -277,7 +278,7 @@
             LWTRD = ILAENV2STAGE( 4, 'SSYTRD_SB2ST', JOBZ,
      $                            N, KD, IB, -1 )
             LWMIN = N + LHTRD + LWTRD
-            WORK( 1 )  = LWMIN
+            WORK( 1 )  = SROUNDUP_LWORK(LWMIN)
          ENDIF
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
@@ -343,7 +344,7 @@
       LLWORK  = LWORK - INDWRK + 1
 *
       CALL SSYTRD_SB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W,
-     $                    WORK( INDE ), WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDE ), WORK( INDHOUS ), LHTRD,
      $                    WORK( INDWRK ), LLWORK, IINFO )
 *
 *     For eigenvalues only, call SSTERF.  For eigenvectors, call SSTEQR.
@@ -368,7 +369,7 @@
 *
 *     Set WORK(1) to optimal workspace size.
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssbevd.f b/lapack-netlib/SRC/ssbevd.f
index bcf14ce85..e4118dbed 100644
--- a/lapack-netlib/SRC/ssbevd.f
+++ b/lapack-netlib/SRC/ssbevd.f
@@ -40,12 +40,6 @@
 *> a real symmetric band matrix A. If eigenvectors are desired, it uses
 *> a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
@@ -185,7 +179,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHEReigen
+*> \ingroup hbevd
 *
 *  =====================================================================
       SUBROUTINE SSBEVD( JOBZ, UPLO, N, KD, AB, LDAB, W, Z, LDZ, WORK,
@@ -219,8 +213,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               SLAMCH, SLANSB
-      EXTERNAL           LSAME, SLAMCH, SLANSB
+      REAL               SLAMCH, SLANSB, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, SLANSB, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGEMM, SLACPY, SLASCL, SSBTRD, SSCAL, SSTEDC,
@@ -265,7 +259,7 @@
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          IWORK( 1 ) = LIWMIN
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
@@ -348,7 +342,7 @@
       IF( ISCALE.EQ.1 )
      $   CALL SSCAL( N, ONE / SIGMA, W, 1 )
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       IWORK( 1 ) = LIWMIN
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssbevd_2stage.f b/lapack-netlib/SRC/ssbevd_2stage.f
index 9687ee024..de3f1c010 100644
--- a/lapack-netlib/SRC/ssbevd_2stage.f
+++ b/lapack-netlib/SRC/ssbevd_2stage.f
@@ -45,12 +45,6 @@
 *> the reduction to tridiagonal. If eigenvectors are desired, it uses
 *> a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
@@ -140,7 +134,7 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The length of the array WORK. LWORK >= 1, when N <= 1;
-*>          otherwise  
+*>          otherwise
 *>          If JOBZ = 'N' and N > 1, LWORK must be queried.
 *>                                   LWORK = MAX(1, dimension) where
 *>                                   dimension = (2KD+1)*N + KD*NTHREADS + N
@@ -194,7 +188,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHEReigen
+*> \ingroup hbevd_2stage
 *
 *> \par Further Details:
 *  =====================
@@ -212,7 +206,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -220,11 +214,11 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *
@@ -264,8 +258,9 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV2STAGE
-      REAL               SLAMCH, SLANSB
-      EXTERNAL           LSAME, SLAMCH, SLANSB, ILAENV2STAGE
+      REAL               SLAMCH, SLANSB, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, SLANSB, ILAENV2STAGE,
+     $                   SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGEMM, SLACPY, SLASCL, SSCAL, SSTEDC,
@@ -313,7 +308,7 @@
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         WORK( 1 )  = LWMIN
+         WORK( 1 )  = SROUNDUP_LWORK(LWMIN)
          IWORK( 1 ) = LIWMIN
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
@@ -380,7 +375,7 @@
       LLWRK2  = LWORK - INDWK2 + 1
 *
       CALL SSYTRD_SB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, W,
-     $                    WORK( INDE ), WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDE ), WORK( INDHOUS ), LHTRD,
      $                    WORK( INDWRK ), LLWORK, IINFO )
 *
 *     For eigenvalues only, call SSTERF.  For eigenvectors, call SSTEDC.
@@ -400,7 +395,7 @@
       IF( ISCALE.EQ.1 )
      $   CALL SSCAL( N, ONE / SIGMA, W, 1 )
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       IWORK( 1 ) = LIWMIN
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssbevx_2stage.f b/lapack-netlib/SRC/ssbevx_2stage.f
index 224b676b7..d25d3639a 100644
--- a/lapack-netlib/SRC/ssbevx_2stage.f
+++ b/lapack-netlib/SRC/ssbevx_2stage.f
@@ -235,7 +235,7 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The length of the array WORK. LWORK >= 1, when N <= 1;
-*>          otherwise  
+*>          otherwise
 *>          If JOBZ = 'N' and N > 1, LWORK must be queried.
 *>                                   LWORK = MAX(1, 7*N, dimension) where
 *>                                   dimension = (2KD+1)*N + KD*NTHREADS + 2*N
@@ -281,7 +281,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHEReigen
+*> \ingroup hbevx_2stage
 *
 *> \par Further Details:
 *  =====================
@@ -299,7 +299,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -307,11 +307,11 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *
@@ -357,8 +357,9 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV2STAGE
-      REAL               SLAMCH, SLANSB
-      EXTERNAL           LSAME, SLAMCH, SLANSB, ILAENV2STAGE
+      REAL               SLAMCH, SLANSB, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, SLANSB, ILAENV2STAGE,
+     $                   SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SCOPY, SGEMV, SLACPY, SLASCL, SSCAL,
@@ -414,7 +415,7 @@
       IF( INFO.EQ.0 ) THEN
          IF( N.LE.1 ) THEN
             LWMIN = 1
-            WORK( 1 ) = LWMIN
+            WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          ELSE
             IB    = ILAENV2STAGE( 2, 'SSYTRD_SB2ST', JOBZ,
      $                            N, KD, -1, -1 )
@@ -423,7 +424,7 @@
             LWTRD = ILAENV2STAGE( 4, 'SSYTRD_SB2ST', JOBZ,
      $                            N, KD, IB, -1 )
             LWMIN = 2*N + LHTRD + LWTRD
-            WORK( 1 )  = LWMIN
+            WORK( 1 )  = SROUNDUP_LWORK(LWMIN)
          ENDIF
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY )
@@ -513,7 +514,7 @@
       LLWORK  = LWORK - INDWRK + 1
 *
       CALL SSYTRD_SB2ST( "N", JOBZ, UPLO, N, KD, AB, LDAB, WORK( INDD ),
-     $                    WORK( INDE ), WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDE ), WORK( INDHOUS ), LHTRD,
      $                    WORK( INDWRK ), LLWORK, IINFO )
 *
 *     If all eigenvalues are desired and ABSTOL is less than or equal
@@ -624,7 +625,7 @@
 *
 *     Set WORK(1) to optimal workspace size.
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssbgvd.f b/lapack-netlib/SRC/ssbgvd.f
index 6dd1fe952..f872e5464 100644
--- a/lapack-netlib/SRC/ssbgvd.f
+++ b/lapack-netlib/SRC/ssbgvd.f
@@ -43,12 +43,6 @@
 *> banded, and B is also positive definite.  If eigenvectors are
 *> desired, it uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
@@ -214,7 +208,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHEReigen
+*> \ingroup hbgvd
 *
 *> \par Contributors:
 *  ==================
@@ -253,7 +247,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGEMM, SLACPY, SPBSTF, SSBGST, SSBTRD, SSTEDC,
@@ -298,7 +293,7 @@
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          IWORK( 1 ) = LIWMIN
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
@@ -359,7 +354,7 @@
          CALL SLACPY( 'A', N, N, WORK( INDWK2 ), N, Z, LDZ )
       END IF
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       IWORK( 1 ) = LIWMIN
 *
       RETURN
diff --git a/lapack-netlib/SRC/ssbgvx.f b/lapack-netlib/SRC/ssbgvx.f
index 271f35964..3607fae82 100644
--- a/lapack-netlib/SRC/ssbgvx.f
+++ b/lapack-netlib/SRC/ssbgvx.f
@@ -267,7 +267,7 @@
 *>                  Their indices are stored in IFAIL.
 *>          > N:  SPBSTF returned an error code; i.e.,
 *>                if INFO = N + i, for 1 <= i <= N, then the leading
-*>                minor of order i of B is not positive definite.
+*>                principal minor of order i of B is not positive.
 *>                The factorization of B could not be completed and
 *>                no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/sspevd.f b/lapack-netlib/SRC/sspevd.f
index 56329da34..1aae48d1d 100644
--- a/lapack-netlib/SRC/sspevd.f
+++ b/lapack-netlib/SRC/sspevd.f
@@ -40,12 +40,6 @@
 *> of a real symmetric matrix A in packed storage. If eigenvectors are
 *> desired, it uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
@@ -170,7 +164,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHEReigen
+*> \ingroup hpevd
 *
 *  =====================================================================
       SUBROUTINE SSPEVD( JOBZ, UPLO, N, AP, W, Z, LDZ, WORK, LWORK,
@@ -204,8 +198,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               SLAMCH, SLANSP
-      EXTERNAL           LSAME, SLAMCH, SLANSP
+      REAL               SLAMCH, SLANSP, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, SLANSP, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SOPMTR, SSCAL, SSPTRD, SSTEDC, SSTERF, XERBLA
@@ -246,7 +240,7 @@
             END IF
          END IF
          IWORK( 1 ) = LIWMIN
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
             INFO = -9
@@ -325,7 +319,7 @@
       IF( ISCALE.EQ.1 )
      $   CALL SSCAL( N, ONE / SIGMA, W, 1 )
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       IWORK( 1 ) = LIWMIN
       RETURN
 *
diff --git a/lapack-netlib/SRC/sspgv.f b/lapack-netlib/SRC/sspgv.f
index c73e94e60..e8bc66e5d 100644
--- a/lapack-netlib/SRC/sspgv.f
+++ b/lapack-netlib/SRC/sspgv.f
@@ -139,7 +139,7 @@
 *>                    i off-diagonal elements of an intermediate
 *>                    tridiagonal form did not converge to zero.
 *>             > N:   if INFO = n + i, for 1 <= i <= n, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/sspgvd.f b/lapack-netlib/SRC/sspgvd.f
index 73862ed1b..c1e14594b 100644
--- a/lapack-netlib/SRC/sspgvd.f
+++ b/lapack-netlib/SRC/sspgvd.f
@@ -44,12 +44,6 @@
 *> positive definite.
 *> If eigenvectors are desired, it uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
@@ -184,7 +178,7 @@
 *>                    i off-diagonal elements of an intermediate
 *>                    tridiagonal form did not converge to zero;
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
@@ -197,7 +191,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHEReigen
+*> \ingroup hpgvd
 *
 *> \par Contributors:
 *  ==================
@@ -231,7 +225,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SPPTRF, SSPEVD, SSPGST, STPMV, STPSV, XERBLA
@@ -273,7 +268,7 @@
                LWMIN = 2*N
             END IF
          END IF
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          IWORK( 1 ) = LIWMIN
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
             INFO = -11
@@ -351,7 +346,7 @@
          END IF
       END IF
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       IWORK( 1 ) = LIWMIN
 *
       RETURN
diff --git a/lapack-netlib/SRC/sspgvx.f b/lapack-netlib/SRC/sspgvx.f
index de581543a..6d5b4ed3d 100644
--- a/lapack-netlib/SRC/sspgvx.f
+++ b/lapack-netlib/SRC/sspgvx.f
@@ -245,7 +245,7 @@
 *>                    i eigenvectors failed to converge.  Their indices
 *>                    are stored in array IFAIL.
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/sstedc.f b/lapack-netlib/SRC/sstedc.f
index 925b03422..0e1cb4258 100644
--- a/lapack-netlib/SRC/sstedc.f
+++ b/lapack-netlib/SRC/sstedc.f
@@ -42,12 +42,6 @@
 *> found if SSYTRD or SSPTRD or SSBTRD has been used to reduce this
 *> matrix to tridiagonal form.
 *>
-*> This code makes very mild assumptions about floating point
-*> arithmetic. It will work on machines with a guard digit in
-*> add/subtract, or on those binary machines without guard digits
-*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
-*> It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.  See SLAED3 for details.
 *> \endverbatim
 *
 *  Arguments:
@@ -173,7 +167,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup auxOTHERcomputational
+*> \ingroup stedc
 *
 *> \par Contributors:
 *  ==================
@@ -214,8 +208,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANST
-      EXTERNAL           ILAENV, LSAME, SLAMCH, SLANST
+      REAL               SLAMCH, SLANST, SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SLAMCH, SLANST, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGEMM, SLACPY, SLAED0, SLASCL, SLASET, SLASRT,
@@ -274,7 +268,7 @@
                LIWMIN = 3 + 5*N
             END IF
          END IF
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          IWORK( 1 ) = LIWMIN
 *
          IF( LWORK.LT.LWMIN .AND. .NOT. LQUERY ) THEN
@@ -469,7 +463,7 @@
       END IF
 *
    50 CONTINUE
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       IWORK( 1 ) = LIWMIN
 *
       RETURN
diff --git a/lapack-netlib/SRC/sstegr.f b/lapack-netlib/SRC/sstegr.f
index 2e2975fdf..2967a6969 100644
--- a/lapack-netlib/SRC/sstegr.f
+++ b/lapack-netlib/SRC/sstegr.f
@@ -56,7 +56,7 @@
 *>
 *> Note : SSTEGR and SSTEMR work only on machines which follow
 *> IEEE-754 floating-point standard in their handling of infinities and
-*> NaNs.  Normal execution may create these exceptiona values and hence
+*> NaNs.  Normal execution may create these exceptional values and hence
 *> may abort due to a floating point exception in environments which
 *> do not conform to the IEEE-754 standard.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/sstemr.f b/lapack-netlib/SRC/sstemr.f
index 3a9bbe784..62cfa3d4d 100644
--- a/lapack-netlib/SRC/sstemr.f
+++ b/lapack-netlib/SRC/sstemr.f
@@ -303,7 +303,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup stemr
 *
 *> \par Contributors:
 *  ==================
@@ -312,7 +312,8 @@
 *> Jim Demmel, University of California, Berkeley, USA \n
 *> Inderjit Dhillon, University of Texas, Austin, USA \n
 *> Osni Marques, LBNL/NERSC, USA \n
-*> Christof Voemel, University of California, Berkeley, USA
+*> Christof Voemel, University of California, Berkeley, USA \n
+*> Aravindh Krishnamoorthy, FAU, Erlangen, Germany \n
 *
 *  =====================================================================
       SUBROUTINE SSTEMR( JOBZ, RANGE, N, D, E, VL, VU, IL, IU,
@@ -344,7 +345,8 @@
      $                     MINRGP = 3.0E-3 )
 *     ..
 *     .. Local Scalars ..
-      LOGICAL            ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY
+      LOGICAL            ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY,
+     $                   LAESWAP
       INTEGER            I, IBEGIN, IEND, IFIRST, IIL, IINDBL, IINDW,
      $                   IINDWK, IINFO, IINSPL, IIU, ILAST, IN, INDD,
      $                   INDE2, INDERR, INDGP, INDGRS, INDWRK, ITMP,
@@ -357,8 +359,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               SLAMCH, SLANST
-      EXTERNAL           LSAME, SLAMCH, SLANST
+      REAL               SLAMCH, SLANST, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, SLANST, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SCOPY, SLAE2, SLAEV2, SLARRC, SLARRE, SLARRJ,
@@ -378,6 +380,7 @@
 *
       LQUERY = ( ( LWORK.EQ.-1 ).OR.( LIWORK.EQ.-1 ) )
       ZQUERY = ( NZC.EQ.-1 )
+      LAESWAP = .FALSE.
 
 *     SSTEMR needs WORK of size 6*N, IWORK of size 3*N.
 *     In addition, SLARRE needs WORK of size 6*N, IWORK of size 5*N.
@@ -440,7 +443,7 @@
       RMAX = MIN( SQRT( BIGNUM ), ONE / SQRT( SQRT( SAFMIN ) ) )
 *
       IF( INFO.EQ.0 ) THEN
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          IWORK( 1 ) = LIWMIN
 *
          IF( WANTZ .AND. ALLEIG ) THEN
@@ -500,6 +503,15 @@
          ELSE IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
             CALL SLAEV2( D(1), E(1), D(2), R1, R2, CS, SN )
          END IF
+*        D/S/LAE2 and D/S/LAEV2 outputs satisfy |R1| >= |R2|. However,
+*        the following code requires R1 >= R2. Hence, we correct
+*        the order of R1, R2, CS, SN if R1 < R2 before further processing.
+         IF( R1.LT.R2 ) THEN
+            E(2) = R1
+            R1 = R2
+            R2 = E(2)
+            LAESWAP = .TRUE.
+         ENDIF
          IF( ALLEIG.OR.
      $      (VALEIG.AND.(R2.GT.WL).AND.
      $                  (R2.LE.WU)).OR.
@@ -507,8 +519,13 @@
             M = M+1
             W( M ) = R2
             IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
-               Z( 1, M ) = -SN
-               Z( 2, M ) = CS
+               IF( LAESWAP ) THEN
+                  Z( 1, M ) = CS
+                  Z( 2, M ) = SN
+               ELSE
+                  Z( 1, M ) = -SN
+                  Z( 2, M ) = CS
+               ENDIF
 *              Note: At most one of SN and CS can be zero.
                IF (SN.NE.ZERO) THEN
                   IF (CS.NE.ZERO) THEN
@@ -531,8 +548,13 @@
             M = M+1
             W( M ) = R1
             IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
-               Z( 1, M ) = CS
-               Z( 2, M ) = SN
+               IF( LAESWAP ) THEN
+                  Z( 1, M ) = -SN
+                  Z( 2, M ) = CS
+               ELSE
+                  Z( 1, M ) = CS
+                  Z( 2, M ) = SN
+               ENDIF
 *              Note: At most one of SN and CS can be zero.
                IF (SN.NE.ZERO) THEN
                   IF (CS.NE.ZERO) THEN
@@ -760,7 +782,7 @@
       ENDIF
 *
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       IWORK( 1 ) = LIWMIN
       RETURN
 *
diff --git a/lapack-netlib/SRC/sstevd.f b/lapack-netlib/SRC/sstevd.f
index bc5b5aaab..4fc2a6311 100644
--- a/lapack-netlib/SRC/sstevd.f
+++ b/lapack-netlib/SRC/sstevd.f
@@ -40,12 +40,6 @@
 *> real symmetric tridiagonal matrix. If eigenvectors are desired, it
 *> uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
@@ -155,7 +149,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHEReigen
+*> \ingroup stevd
 *
 *  =====================================================================
       SUBROUTINE SSTEVD( JOBZ, N, D, E, Z, LDZ, WORK, LWORK, IWORK,
@@ -188,8 +182,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               SLAMCH, SLANST
-      EXTERNAL           LSAME, SLAMCH, SLANST
+      REAL               SLAMCH, SLANST, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, SLANST, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SSCAL, SSTEDC, SSTERF, XERBLA
@@ -221,7 +215,7 @@
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          IWORK( 1 ) = LIWMIN
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
@@ -289,7 +283,7 @@
       IF( ISCALE.EQ.1 )
      $   CALL SSCAL( N, ONE / SIGMA, D, 1 )
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       IWORK( 1 ) = LIWMIN
 *
       RETURN
diff --git a/lapack-netlib/SRC/sstevr.f b/lapack-netlib/SRC/sstevr.f
index 2ab63eb3e..42f49b11b 100644
--- a/lapack-netlib/SRC/sstevr.f
+++ b/lapack-netlib/SRC/sstevr.f
@@ -287,7 +287,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHEReigen
+*> \ingroup stevr
 *
 *> \par Contributors:
 *  ==================
@@ -336,8 +336,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANST
-      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANST
+      REAL               SLAMCH, SLANST, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANST, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SCOPY, SSCAL, SSTEBZ, SSTEMR, SSTEIN, SSTERF,
@@ -389,7 +389,7 @@
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          IWORK( 1 ) = LIWMIN
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
@@ -570,7 +570,7 @@
 *      IF (wantz .and. INDEIG ) Z( 1,1) = Z(1,1) / 1.002 + .002
 *
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       IWORK( 1 ) = LIWMIN
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssyconvf.f b/lapack-netlib/SRC/ssyconvf.f
index 6defc1f0e..af55da51a 100644
--- a/lapack-netlib/SRC/ssyconvf.f
+++ b/lapack-netlib/SRC/ssyconvf.f
@@ -39,7 +39,7 @@
 *> SSYTRF provided on entry in parameter A into the factorization
 *> output format used in SSYTRF_RK (or SSYTRF_BK) that is stored
 *> on exit in parameters A and E. It also converts in place details of
-*> the intechanges stored in IPIV from the format used in SSYTRF into
+*> the interchanges stored in IPIV from the format used in SSYTRF into
 *> the format used in SSYTRF_RK (or SSYTRF_BK).
 *>
 *> If parameter WAY = 'R':
@@ -48,7 +48,7 @@
 *> (or SSYTRF_BK) provided on entry in parameters A and E into
 *> the factorization output format used in SSYTRF that is stored
 *> on exit in parameter A. It also converts in place details of
-*> the intechanges stored in IPIV from the format used in SSYTRF_RK
+*> the interchanges stored in IPIV from the format used in SSYTRF_RK
 *> (or SSYTRF_BK) into the format used in SSYTRF.
 *> \endverbatim
 *
@@ -322,7 +322,7 @@
                   END IF
 *
 *                 Convert IPIV
-*                 There is no interchnge of rows i and and IPIV(i),
+*                 There is no interchange of rows i and IPIV(i),
 *                 so this should be reflected in IPIV format for
 *                 *SYTRF_RK ( or *SYTRF_BK)
 *
@@ -466,7 +466,7 @@
                   END IF
 *
 *                 Convert IPIV
-*                 There is no interchnge of rows i and and IPIV(i),
+*                 There is no interchange of rows i and IPIV(i),
 *                 so this should be reflected in IPIV format for
 *                 *SYTRF_RK ( or *SYTRF_BK)
 *
@@ -532,7 +532,7 @@
 *
 *           Revert VALUE
 *           Assign subdiagonal entries of D from array E to
-*           subgiagonal entries of A.
+*           subdiagonal entries of A.
 *
             I = 1
             DO WHILE ( I.LE.N-1 )
diff --git a/lapack-netlib/SRC/ssyconvf_rook.f b/lapack-netlib/SRC/ssyconvf_rook.f
index c59f257bb..efd7c57fe 100644
--- a/lapack-netlib/SRC/ssyconvf_rook.f
+++ b/lapack-netlib/SRC/ssyconvf_rook.f
@@ -517,7 +517,7 @@
 *
 *           Revert VALUE
 *           Assign subdiagonal entries of D from array E to
-*           subgiagonal entries of A.
+*           subdiagonal entries of A.
 *
             I = 1
             DO WHILE ( I.LE.N-1 )
diff --git a/lapack-netlib/SRC/ssyev.f b/lapack-netlib/SRC/ssyev.f
index 03ed326b8..638445f04 100644
--- a/lapack-netlib/SRC/ssyev.f
+++ b/lapack-netlib/SRC/ssyev.f
@@ -125,7 +125,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYeigen
+*> \ingroup heev
 *
 *  =====================================================================
       SUBROUTINE SSYEV( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, INFO )
@@ -158,8 +158,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANSY
-      EXTERNAL           ILAENV, LSAME, SLAMCH, SLANSY
+      REAL               SLAMCH, SLANSY, SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SLAMCH, SLANSY, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLASCL, SORGTR, SSCAL, SSTEQR, SSTERF, SSYTRD,
@@ -190,7 +190,7 @@
       IF( INFO.EQ.0 ) THEN
          NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 )
          LWKOPT = MAX( 1, ( NB+2 )*N )
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.MAX( 1, 3*N-1 ) .AND. .NOT.LQUERY )
      $      INFO = -8
@@ -274,7 +274,7 @@
 *
 *     Set WORK(1) to optimal workspace size.
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssyev_2stage.f b/lapack-netlib/SRC/ssyev_2stage.f
index a6fa30cc8..519ee334d 100644
--- a/lapack-netlib/SRC/ssyev_2stage.f
+++ b/lapack-netlib/SRC/ssyev_2stage.f
@@ -20,7 +20,7 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE SSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, 
+*       SUBROUTINE SSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
 *                                INFO )
 *
 *       IMPLICIT NONE
@@ -105,12 +105,12 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The length of the array WORK. LWORK >= 1, when N <= 1;
-*>          otherwise  
+*>          otherwise
 *>          If JOBZ = 'N' and N > 1, LWORK must be queried.
 *>                                   LWORK = MAX(1, dimension) where
 *>                                   dimension = max(stage1,stage2) + (KD+1)*N + 2*N
-*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
-*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB)
+*>                                               + max(2*KD*KD, KD*NTHREADS)
 *>                                               + (KD+1)*N + 2*N
 *>                                   where KD is the blocking size of the reduction,
 *>                                   FACTOPTNB is the blocking used by the QR or LQ
@@ -143,7 +143,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYeigen
+*> \ingroup heev_2stage
 *
 *> \par Further Details:
 *  =====================
@@ -161,7 +161,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -169,16 +169,16 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *
 *  =====================================================================
-      SUBROUTINE SSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK, 
+      SUBROUTINE SSYEV_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK, LWORK,
      $                         INFO )
 *
       IMPLICIT NONE
@@ -211,8 +211,9 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV2STAGE
-      REAL               SLAMCH, SLANSY
-      EXTERNAL           LSAME, SLAMCH, SLANSY, ILAENV2STAGE
+      REAL               SLAMCH, SLANSY, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, SLANSY, ILAENV2STAGE,
+     $                   SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLASCL, SORGTR, SSCAL, SSTEQR, SSTERF,
@@ -305,7 +306,7 @@
       LLWORK  = LWORK - INDWRK + 1
 *
       CALL SSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, W, WORK( INDE ),
-     $                    WORK( INDTAU ), WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDTAU ), WORK( INDHOUS ), LHTRD,
      $                    WORK( INDWRK ), LLWORK, IINFO )
 *
 *     For eigenvalues only, call SSTERF.  For eigenvectors, first call
@@ -336,7 +337,7 @@
 *
 *     Set WORK(1) to optimal workspace size.
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssyevd.f b/lapack-netlib/SRC/ssyevd.f
index ac0d0284d..2ae44fc81 100644
--- a/lapack-netlib/SRC/ssyevd.f
+++ b/lapack-netlib/SRC/ssyevd.f
@@ -40,13 +40,6 @@
 *> real symmetric matrix A. If eigenvectors are desired, it uses a
 *> divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
-*>
 *> Because of large use of BLAS of level 3, SSYEVD needs N**2 more
 *> workspace than SSYEVX.
 *> \endverbatim
@@ -103,8 +96,7 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is REAL array,
-*>                                         dimension (LWORK)
+*>          WORK is REAL array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
@@ -167,7 +159,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYeigen
+*> \ingroup heevd
 *
 *> \par Contributors:
 *  ==================
@@ -211,8 +203,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANSY
-      EXTERNAL           ILAENV, LSAME, SLAMCH, SLANSY
+      REAL               SLAMCH, SLANSY, SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SLAMCH, SLANSY, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLACPY, SLASCL, SORMTR, SSCAL, SSTEDC, SSTERF,
@@ -258,7 +250,7 @@
      $                  N*ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) )
             LIOPT = LIWMIN
          END IF
-         WORK( 1 ) = LOPT
+         WORK( 1 ) = SROUNDUP_LWORK( LOPT )
          IWORK( 1 ) = LIOPT
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
@@ -342,7 +334,7 @@
       IF( ISCALE.EQ.1 )
      $   CALL SSCAL( N, ONE / SIGMA, W, 1 )
 *
-      WORK( 1 ) = LOPT
+      WORK( 1 ) = SROUNDUP_LWORK( LOPT )
       IWORK( 1 ) = LIOPT
 *
       RETURN
diff --git a/lapack-netlib/SRC/ssyevd_2stage.f b/lapack-netlib/SRC/ssyevd_2stage.f
index f3fde6b4a..e63e280a7 100644
--- a/lapack-netlib/SRC/ssyevd_2stage.f
+++ b/lapack-netlib/SRC/ssyevd_2stage.f
@@ -45,12 +45,6 @@
 *> the reduction to tridiagonal. If eigenvectors are desired, it uses a
 *> divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/ssyevr.f b/lapack-netlib/SRC/ssyevr.f
index d8e4ce3ea..870facd60 100644
--- a/lapack-netlib/SRC/ssyevr.f
+++ b/lapack-netlib/SRC/ssyevr.f
@@ -271,7 +271,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= max(1,26*N).
+*>          The dimension of the array WORK.
+*>          If N <= 1, LWORK >= 1, else LWORK >= 26*N.
 *>          For optimal efficiency, LWORK >= (NB+6)*N,
 *>          where NB is the max of the blocksize for SSYTRD and SORMTR
 *>          returned by ILAENV.
@@ -292,7 +293,8 @@
 *> \param[in] LIWORK
 *> \verbatim
 *>          LIWORK is INTEGER
-*>          The dimension of the array IWORK.  LIWORK >= max(1,10*N).
+*>          The dimension of the array IWORK.
+*>          If N <= 1, LIWORK >= 1, else LIWORK >= 10*N.
 *>
 *>          If LIWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal sizes of the WORK and
@@ -317,7 +319,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYeigen
+*> \ingroup heevr
 *
 *> \par Contributors:
 *  ==================
@@ -368,8 +370,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANSY
-      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANSY
+      REAL               SLAMCH, SLANSY, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANSY, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SCOPY, SORMTR, SSCAL, SSTEBZ, SSTEMR, SSTEIN,
@@ -392,8 +394,13 @@
 *
       LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LIWORK.EQ.-1 ) )
 *
-      LWMIN = MAX( 1, 26*N )
-      LIWMIN = MAX( 1, 10*N )
+      IF( N.LE.1 ) THEN
+         LWMIN  = 1
+         LIWMIN = 1
+      ELSE
+         LWMIN  = 26*N
+         LIWMIN = 10*N
+      END IF
 *
       INFO = 0
       IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN
@@ -428,7 +435,7 @@
          NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 )
          NB = MAX( NB, ILAENV( 1, 'SORMTR', UPLO, N, -1, -1, -1 ) )
          LWKOPT = MAX( ( NB+1 )*N, LWMIN )
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
          IWORK( 1 ) = LIWMIN
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
@@ -677,7 +684,7 @@
 *
 *     Set WORK(1) to optimal workspace size.
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       IWORK( 1 ) = LIWMIN
 *
       RETURN
diff --git a/lapack-netlib/SRC/ssyevr_2stage.f b/lapack-netlib/SRC/ssyevr_2stage.f
index 8ab2844c6..471e25977 100644
--- a/lapack-netlib/SRC/ssyevr_2stage.f
+++ b/lapack-netlib/SRC/ssyevr_2stage.f
@@ -263,7 +263,7 @@
 *>          indicating the nonzero elements in Z. The i-th eigenvector
 *>          is nonzero only in elements ISUPPZ( 2*i-1 ) through
 *>          ISUPPZ( 2*i ). This is an output of SSTEMR (tridiagonal
-*>          matrix). The support of the eigenvectors of A is typically 
+*>          matrix). The support of the eigenvectors of A is typically
 *>          1:N because of the orthogonal transformations applied by SORMTR.
 *>          Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1
 *> \endverbatim
@@ -277,12 +277,13 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  
+*>          The dimension of the array WORK.
+*>          If N <= 1,               LWORK must be at least 1.
 *>          If JOBZ = 'N' and N > 1, LWORK must be queried.
 *>                                   LWORK = MAX(1, 26*N, dimension) where
 *>                                   dimension = max(stage1,stage2) + (KD+1)*N + 5*N
-*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
-*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB)
+*>                                               + max(2*KD*KD, KD*NTHREADS)
 *>                                               + (KD+1)*N + 5*N
 *>                                   where KD is the blocking size of the reduction,
 *>                                   FACTOPTNB is the blocking used by the QR or LQ
@@ -300,13 +301,14 @@
 *> \param[out] IWORK
 *> \verbatim
 *>          IWORK is INTEGER array, dimension (MAX(1,LIWORK))
-*>          On exit, if INFO = 0, IWORK(1) returns the optimal LWORK.
+*>          On exit, if INFO = 0, IWORK(1) returns the optimal LIWORK.
 *> \endverbatim
 *>
 *> \param[in] LIWORK
 *> \verbatim
 *>          LIWORK is INTEGER
-*>          The dimension of the array IWORK.  LIWORK >= max(1,10*N).
+*>          The dimension of the array IWORK.
+*>          If N <= 1, LIWORK >= 1, else LIWORK >= 10*N.
 *>
 *>          If LIWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal size of the IWORK array,
@@ -330,7 +332,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYeigen
+*> \ingroup heevr_2stage
 *
 *> \par Contributors:
 *  ==================
@@ -358,7 +360,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -366,11 +368,11 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *
@@ -416,8 +418,9 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV, ILAENV2STAGE
-      REAL               SLAMCH, SLANSY
-      EXTERNAL           LSAME, SLAMCH, SLANSY, ILAENV, ILAENV2STAGE
+      REAL               SLAMCH, SLANSY, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, SLANSY, SROUNDUP_LWORK, ILAENV,
+     $                   ILAENV2STAGE
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SCOPY, SORMTR, SSCAL, SSTEBZ, SSTEMR, SSTEIN,
@@ -444,8 +447,14 @@
       IB     = ILAENV2STAGE( 2, 'SSYTRD_2STAGE', JOBZ, N, KD, -1, -1 )
       LHTRD  = ILAENV2STAGE( 3, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
       LWTRD  = ILAENV2STAGE( 4, 'SSYTRD_2STAGE', JOBZ, N, KD, IB, -1 )
-      LWMIN  = MAX( 26*N, 5*N + LHTRD + LWTRD )
-      LIWMIN = MAX( 1, 10*N )
+*
+      IF( N.LE.1 ) THEN
+         LWMIN  = 1
+         LIWMIN = 1
+      ELSE
+         LWMIN  = MAX( 26*N, 5*N + LHTRD + LWTRD )
+         LIWMIN = 10*N
+      END IF
 *
       INFO = 0
       IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
@@ -484,7 +493,7 @@
 *         NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 )
 *         NB = MAX( NB, ILAENV( 1, 'SORMTR', UPLO, N, -1, -1, -1 ) )
 *         LWKOPT = MAX( ( NB+1 )*N, LWMIN )
-         WORK( 1 ) = LWMIN
+         WORK( 1 )  = SROUNDUP_LWORK( LWMIN )
          IWORK( 1 ) = LIWMIN
       END IF
 *
@@ -504,7 +513,7 @@
       END IF
 *
       IF( N.EQ.1 ) THEN
-         WORK( 1 ) = 26
+         WORK( 1 ) = 1
          IF( ALLEIG .OR. INDEIG ) THEN
             M = 1
             W( 1 ) = A( 1, 1 )
@@ -608,7 +617,7 @@
 *     Call SSYTRD_2STAGE to reduce symmetric matrix to tridiagonal form.
 *
 *
-      CALL SSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ), 
+      CALL SSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ),
      $                    WORK( INDE ), WORK( INDTAU ), WORK( INDHOUS ),
      $                    LHTRD, WORK( INDWK ), LLWORK, IINFO )
 *
@@ -732,7 +741,7 @@
 *
 *     Set WORK(1) to optimal workspace size.
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 )  = SROUNDUP_LWORK( LWMIN )
       IWORK( 1 ) = LIWMIN
 *
       RETURN
diff --git a/lapack-netlib/SRC/ssyevx.f b/lapack-netlib/SRC/ssyevx.f
index 11776e8c5..aaed6dad5 100644
--- a/lapack-netlib/SRC/ssyevx.f
+++ b/lapack-netlib/SRC/ssyevx.f
@@ -244,7 +244,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYeigen
+*> \ingroup heevx
 *
 *  =====================================================================
       SUBROUTINE SSYEVX( JOBZ, RANGE, UPLO, N, A, LDA, VL, VU, IL, IU,
@@ -285,8 +285,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANSY
-      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANSY
+      REAL               SLAMCH, SLANSY, SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SLAMCH, SLANSY, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SCOPY, SLACPY, SORGTR, SORMTR, SSCAL, SSTEBZ,
@@ -338,14 +338,14 @@
       IF( INFO.EQ.0 ) THEN
          IF( N.LE.1 ) THEN
             LWKMIN = 1
-            WORK( 1 ) = LWKMIN
+            LWKOPT = 1
          ELSE
             LWKMIN = 8*N
             NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 )
             NB = MAX( NB, ILAENV( 1, 'SORMTR', UPLO, N, -1, -1, -1 ) )
             LWKOPT = MAX( LWKMIN, ( NB + 3 )*N )
-            WORK( 1 ) = LWKOPT
          END IF
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
          IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY )
      $      INFO = -17
@@ -542,7 +542,7 @@
 *
 *     Set WORK(1) to optimal workspace size.
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssyevx_2stage.f b/lapack-netlib/SRC/ssyevx_2stage.f
index 1a2225c87..a8585e5f7 100644
--- a/lapack-netlib/SRC/ssyevx_2stage.f
+++ b/lapack-netlib/SRC/ssyevx_2stage.f
@@ -208,12 +208,12 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The length of the array WORK. LWORK >= 1, when N <= 1;
-*>          otherwise  
+*>          otherwise
 *>          If JOBZ = 'N' and N > 1, LWORK must be queried.
 *>                                   LWORK = MAX(1, 8*N, dimension) where
 *>                                   dimension = max(stage1,stage2) + (KD+1)*N + 3*N
-*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
-*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB)
+*>                                               + max(2*KD*KD, KD*NTHREADS)
 *>                                               + (KD+1)*N + 3*N
 *>                                   where KD is the blocking size of the reduction,
 *>                                   FACTOPTNB is the blocking used by the QR or LQ
@@ -259,7 +259,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYeigen
+*> \ingroup heevx_2stage
 *
 *> \par Further Details:
 *  =====================
@@ -277,7 +277,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -285,11 +285,11 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *
@@ -334,8 +334,9 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV2STAGE
-      REAL               SLAMCH, SLANSY
-      EXTERNAL           LSAME, SLAMCH, SLANSY, ILAENV2STAGE
+      REAL               SLAMCH, SLANSY, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLAMCH, SLANSY, ILAENV2STAGE,
+     $                   SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SCOPY, SLACPY, SORGTR, SORMTR, SSCAL, SSTEBZ,
@@ -388,7 +389,7 @@
       IF( INFO.EQ.0 ) THEN
          IF( N.LE.1 ) THEN
             LWMIN = 1
-            WORK( 1 ) = LWMIN
+            WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          ELSE
             KD    = ILAENV2STAGE( 1, 'SSYTRD_2STAGE', JOBZ,
      $                            N, -1, -1, -1 )
@@ -487,7 +488,7 @@
       INDWRK  = INDHOUS + LHTRD
       LLWORK  = LWORK - INDWRK + 1
 *
-      CALL SSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ), 
+      CALL SSYTRD_2STAGE( JOBZ, UPLO, N, A, LDA, WORK( INDD ),
      $                    WORK( INDE ), WORK( INDTAU ), WORK( INDHOUS ),
      $                    LHTRD, WORK( INDWRK ), LLWORK, IINFO )
 *
@@ -600,7 +601,7 @@
 *
 *     Set WORK(1) to optimal workspace size.
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssygv.f b/lapack-netlib/SRC/ssygv.f
index 270957fce..3a79f5431 100644
--- a/lapack-netlib/SRC/ssygv.f
+++ b/lapack-netlib/SRC/ssygv.f
@@ -154,7 +154,7 @@
 *>                    i off-diagonal elements of an intermediate
 *>                    tridiagonal form did not converge to zero;
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
@@ -167,7 +167,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYeigen
+*> \ingroup hegv
 *
 *  =====================================================================
       SUBROUTINE SSYGV( ITYPE, JOBZ, UPLO, N, A, LDA, B, LDB, W, WORK,
@@ -199,7 +199,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           ILAENV, LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SPOTRF, SSYEV, SSYGST, STRMM, STRSM, XERBLA
@@ -234,7 +235,7 @@
          LWKMIN = MAX( 1, 3*N - 1 )
          NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 )
          LWKOPT = MAX( LWKMIN, ( NB + 2 )*N )
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
             INFO = -11
@@ -303,7 +304,7 @@
          END IF
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of SSYGV
diff --git a/lapack-netlib/SRC/ssygv_2stage.f b/lapack-netlib/SRC/ssygv_2stage.f
index 49f357d90..8719d8c7a 100644
--- a/lapack-netlib/SRC/ssygv_2stage.f
+++ b/lapack-netlib/SRC/ssygv_2stage.f
@@ -143,12 +143,12 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The length of the array WORK. LWORK >= 1, when N <= 1;
-*>          otherwise  
+*>          otherwise
 *>          If JOBZ = 'N' and N > 1, LWORK must be queried.
 *>                                   LWORK = MAX(1, dimension) where
 *>                                   dimension = max(stage1,stage2) + (KD+1)*N + 2*N
-*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
-*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB)
+*>                                               + max(2*KD*KD, KD*NTHREADS)
 *>                                               + (KD+1)*N + 2*N
 *>                                   where KD is the blocking size of the reduction,
 *>                                   FACTOPTNB is the blocking used by the QR or LQ
@@ -173,7 +173,7 @@
 *>                    i off-diagonal elements of an intermediate
 *>                    tridiagonal form did not converge to zero;
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
@@ -186,7 +186,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYeigen
+*> \ingroup hegv_2stage
 *
 *> \par Further Details:
 *  =====================
@@ -204,7 +204,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -212,11 +212,11 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *
@@ -252,7 +252,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV2STAGE
-      EXTERNAL           LSAME, ILAENV2STAGE
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV2STAGE, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SPOTRF, SSYGST, STRMM, STRSM, XERBLA,
@@ -359,7 +360,7 @@
          END IF
       END IF
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       RETURN
 *
 *     End of SSYGV_2STAGE
diff --git a/lapack-netlib/SRC/ssygvd.f b/lapack-netlib/SRC/ssygvd.f
index 7c7e0de01..a90d1afb7 100644
--- a/lapack-netlib/SRC/ssygvd.f
+++ b/lapack-netlib/SRC/ssygvd.f
@@ -42,12 +42,6 @@
 *> B are assumed to be symmetric and B is also positive definite.
 *> If eigenvectors are desired, it uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
@@ -190,7 +184,7 @@
 *>                    the submatrix lying in rows and columns INFO/(N+1)
 *>                    through mod(INFO,N+1);
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
@@ -203,7 +197,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYeigen
+*> \ingroup hegvd
 *
 *> \par Further Details:
 *  =====================
@@ -251,7 +245,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SPOTRF, SSYEVD, SSYGST, STRMM, STRSM, XERBLA
@@ -295,7 +290,7 @@
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         WORK( 1 ) = LOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LOPT)
          IWORK( 1 ) = LIOPT
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
@@ -367,7 +362,7 @@
          END IF
       END IF
 *
-      WORK( 1 ) = LOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LOPT)
       IWORK( 1 ) = LIOPT
 *
       RETURN
diff --git a/lapack-netlib/SRC/ssygvx.f b/lapack-netlib/SRC/ssygvx.f
index e93da60cc..16adefa22 100644
--- a/lapack-netlib/SRC/ssygvx.f
+++ b/lapack-netlib/SRC/ssygvx.f
@@ -270,7 +270,7 @@
 *>                    i eigenvectors failed to converge.  Their indices
 *>                    are stored in array IFAIL.
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
@@ -283,7 +283,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYeigen
+*> \ingroup hegvx
 *
 *> \par Contributors:
 *  ==================
@@ -324,7 +324,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           ILAENV, LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SPOTRF, SSYEVX, SSYGST, STRMM, STRSM, XERBLA
@@ -380,7 +381,7 @@
          LWKMIN = MAX( 1, 8*N )
          NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 )
          LWKOPT = MAX( LWKMIN, ( NB + 3 )*N )
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
             INFO = -20
@@ -453,7 +454,7 @@
 *
 *     Set WORK(1) to optimal workspace size.
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssysv.f b/lapack-netlib/SRC/ssysv.f
index 06a42dfb7..523ea66c1 100644
--- a/lapack-netlib/SRC/ssysv.f
+++ b/lapack-netlib/SRC/ssysv.f
@@ -163,7 +163,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYsolve
+*> \ingroup hesv
 *
 *  =====================================================================
       SUBROUTINE SSYSV( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK,
@@ -190,7 +190,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, SSYTRF, SSYTRS, SSYTRS2
@@ -225,7 +226,7 @@
             CALL SSYTRF( UPLO, N, A, LDA, IPIV, WORK, -1, INFO )
             LWKOPT = INT( WORK( 1 ) )
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -258,7 +259,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssysv_aa.f b/lapack-netlib/SRC/ssysv_aa.f
index 5661332c5..711a275e1 100644
--- a/lapack-netlib/SRC/ssysv_aa.f
+++ b/lapack-netlib/SRC/ssysv_aa.f
@@ -154,7 +154,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYsolve
+*> \ingroup hesv_aa
 *
 *  =====================================================================
       SUBROUTINE SSYSV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK,
@@ -177,11 +177,13 @@
 *
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            LWKOPT, LWKOPT_SYTRF, LWKOPT_SYTRS
+      INTEGER            LWKMIN, LWKOPT, LWKOPT_SYTRF, LWKOPT_SYTRS
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, SSYTRS_AA, SSYTRF_AA
@@ -195,6 +197,7 @@
 *
       INFO = 0
       LQUERY = ( LWORK.EQ.-1 )
+      LWKMIN = MAX( 1, 2*N, 3*N-2 )
       IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
@@ -205,18 +208,18 @@
          INFO = -5
       ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
          INFO = -8
-      ELSE IF( LWORK.LT.MAX(2*N, 3*N-2) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -10
       END IF
 *
       IF( INFO.EQ.0 ) THEN
          CALL SSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, -1, INFO )
-         LWKOPT_SYTRF = INT( WORK(1) )
+         LWKOPT_SYTRF = INT( WORK( 1 ) )
          CALL SSYTRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK,
      $                   -1, INFO )
-         LWKOPT_SYTRS = INT( WORK(1) )
-         LWKOPT = MAX( LWKOPT_SYTRF, LWKOPT_SYTRS )
-         WORK( 1 ) = LWKOPT
+         LWKOPT_SYTRS = INT( WORK( 1 ) )
+         LWKOPT = MAX( LWKMIN, LWKOPT_SYTRF, LWKOPT_SYTRS )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -238,7 +241,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssysv_aa_2stage.f b/lapack-netlib/SRC/ssysv_aa_2stage.f
index b9eac8342..fb068b3bf 100644
--- a/lapack-netlib/SRC/ssysv_aa_2stage.f
+++ b/lapack-netlib/SRC/ssysv_aa_2stage.f
@@ -88,7 +88,7 @@
 *>          triangular part of the matrix A, and the strictly upper
 *>          triangular part of A is not referenced.
 *>
-*>          On exit, L is stored below (or above) the subdiaonal blocks,
+*>          On exit, L is stored below (or above) the subdiagonal blocks,
 *>          when UPLO  is 'L' (or 'U').
 *> \endverbatim
 *>
@@ -100,14 +100,14 @@
 *>
 *> \param[out] TB
 *> \verbatim
-*>          TB is REAL array, dimension (LTB)
+*>          TB is REAL array, dimension (MAX(1,LTB))
 *>          On exit, details of the LU factorization of the band matrix.
 *> \endverbatim
 *>
 *> \param[in] LTB
 *> \verbatim
 *>          LTB is INTEGER
-*>          The size of the array TB. LTB >= 4*N, internally
+*>          The size of the array TB. LTB >= MAX(1,4*N), internally
 *>          used to select NB such that LTB >= (3*NB+1)*N.
 *>
 *>          If LTB = -1, then a workspace query is assumed; the
@@ -147,14 +147,15 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is REAL workspace of size LWORK
+*>          WORK is REAL workspace of size (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The size of WORK. LWORK >= N, internally used to select NB
-*>          such that LWORK >= N*NB.
+*>          The size of WORK. LWORK >= MAX(1,N), internally used to
+*>          select NB such that LWORK >= N*NB.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal size of the WORK array,
@@ -178,7 +179,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYsolve
+*> \ingroup hesv_aa_2stage
 *
 *  =====================================================================
       SUBROUTINE SSYSV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB,
@@ -204,11 +205,13 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            UPPER, TQUERY, WQUERY
-      INTEGER            LWKOPT
+      INTEGER            LWKMIN, LWKOPT
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SSYTRF_AA_2STAGE, SSYTRS_AA_2STAGE,
@@ -225,6 +228,7 @@
       UPPER = LSAME( UPLO, 'U' )
       WQUERY = ( LWORK.EQ.-1 )
       TQUERY = ( LTB.EQ.-1 )
+      LWKMIN = MAX( 1, N )
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
@@ -233,18 +237,19 @@
          INFO = -3
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -5
-      ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN
+      ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN
          INFO = -7
       ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
          INFO = -11
-      ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.WQUERY ) THEN
          INFO = -13
       END IF
 *
       IF( INFO.EQ.0 ) THEN
          CALL SSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV,
      $                          IPIV2, WORK, -1, INFO )
-         LWKOPT = INT( WORK(1) )
+         LWKOPT = MAX( LWKMIN, INT( WORK( 1 ) ) )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -254,7 +259,6 @@
          RETURN
       END IF
 *
-*
 *     Compute the factorization A = U**T*T*U or A = L*T*L**T.
 *
       CALL SSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2,
@@ -268,7 +272,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssysv_rk.f b/lapack-netlib/SRC/ssysv_rk.f
index 9a7dfa4bb..abf862d66 100644
--- a/lapack-netlib/SRC/ssysv_rk.f
+++ b/lapack-netlib/SRC/ssysv_rk.f
@@ -205,7 +205,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup singleSYsolve
+*> \ingroup hesv_rk
 *
 *> \par Contributors:
 *  ==================
@@ -247,7 +247,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, SSYTRF_RK, SSYTRS_3
@@ -282,7 +283,7 @@
             CALL SSYTRF_RK( UPLO, N, A, LDA, E, IPIV, WORK, -1, INFO )
             LWKOPT = INT( WORK( 1 ) )
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -305,7 +306,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssysv_rook.f b/lapack-netlib/SRC/ssysv_rook.f
index fb7ba8c53..c5c77e562 100644
--- a/lapack-netlib/SRC/ssysv_rook.f
+++ b/lapack-netlib/SRC/ssysv_rook.f
@@ -181,7 +181,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYsolve
+*> \ingroup hesv_rook
 *
 *> \par Contributors:
 *  ==================
@@ -223,7 +223,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, SSYTRF_ROOK, SSYTRS_ROOK
@@ -258,7 +259,7 @@
             CALL SSYTRF_ROOK( UPLO, N, A, LDA, IPIV, WORK, -1, INFO )
             LWKOPT = INT( WORK( 1 ) )
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -281,7 +282,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssysvx.f b/lapack-netlib/SRC/ssysvx.f
index b19ce2641..06a6413f1 100644
--- a/lapack-netlib/SRC/ssysvx.f
+++ b/lapack-netlib/SRC/ssysvx.f
@@ -275,7 +275,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYsolve
+*> \ingroup hesvx
 *
 *  =====================================================================
       SUBROUTINE SSYSVX( FACT, UPLO, N, NRHS, A, LDA, AF, LDAF, IPIV, B,
@@ -305,14 +305,14 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY, NOFACT
-      INTEGER            LWKOPT, NB
+      INTEGER            LWKMIN, LWKOPT, NB
       REAL               ANORM
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      REAL               SLAMCH, SLANSY
-      EXTERNAL           ILAENV, LSAME, SLAMCH, SLANSY
+      REAL               SLAMCH, SLANSY, SROUNDUP_LWORK
+      EXTERNAL           ILAENV, LSAME, SLAMCH, SLANSY, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLACPY, SSYCON, SSYRFS, SSYTRF, SSYTRS, XERBLA
@@ -327,6 +327,7 @@
       INFO = 0
       NOFACT = LSAME( FACT, 'N' )
       LQUERY = ( LWORK.EQ.-1 )
+      LWKMIN = MAX( 1, 3*N )
       IF( .NOT.NOFACT .AND. .NOT.LSAME( FACT, 'F' ) ) THEN
          INFO = -1
       ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) )
@@ -344,17 +345,17 @@
          INFO = -11
       ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
          INFO = -13
-      ELSE IF( LWORK.LT.MAX( 1, 3*N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -18
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         LWKOPT = MAX( 1, 3*N )
+         LWKOPT = LWKMIN
          IF( NOFACT ) THEN
             NB = ILAENV( 1, 'SSYTRF', UPLO, N, -1, -1, -1 )
             LWKOPT = MAX( LWKOPT, N*NB )
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -404,7 +405,7 @@
       IF( RCOND.LT.SLAMCH( 'Epsilon' ) )
      $   INFO = N + 1
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssytrd.f b/lapack-netlib/SRC/ssytrd.f
index f09ad9ab4..f4fbecdc9 100644
--- a/lapack-netlib/SRC/ssytrd.f
+++ b/lapack-netlib/SRC/ssytrd.f
@@ -139,7 +139,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYcomputational
+*> \ingroup hetrd
 *
 *> \par Further Details:
 *  =====================
@@ -223,7 +223,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -248,7 +249,7 @@
 *
          NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 )
          LWKOPT = N*NB
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -365,7 +366,7 @@
      $                TAU( I ), IINFO )
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
       RETURN
 *
 *     End of SSYTRD
diff --git a/lapack-netlib/SRC/ssytrd_2stage.f b/lapack-netlib/SRC/ssytrd_2stage.f
index 5d70ae0d4..5b401c3d0 100644
--- a/lapack-netlib/SRC/ssytrd_2stage.f
+++ b/lapack-netlib/SRC/ssytrd_2stage.f
@@ -4,23 +4,23 @@
 *
 *  =========== DOCUMENTATION ===========
 *
-* Online html documentation available at 
-*            http://www.netlib.org/lapack/explore-html/ 
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
-*> Download SSYTRD_2STAGE + dependencies 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssytrd_2stage.f"> 
-*> [TGZ]</a> 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssytrd_2stage.f"> 
-*> [ZIP]</a> 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssytrd_2stage.f"> 
+*> Download SSYTRD_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssytrd_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssytrd_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssytrd_2stage.f">
 *> [TXT]</a>
-*> \endhtmlonly 
+*> \endhtmlonly
 *
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, 
+*       SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU,
 *                                 HOUS2, LHOUS2, WORK, LWORK, INFO )
 *
 *       IMPLICIT NONE
@@ -34,7 +34,7 @@
 *       REAL               A( LDA, * ), TAU( * ),
 *                          HOUS2( * ), WORK( * )
 *       ..
-*  
+*
 *
 *> \par Purpose:
 *  =============
@@ -52,11 +52,11 @@
 *> \param[in] VECT
 *> \verbatim
 *>          VECT is CHARACTER*1
-*>          = 'N':  No need for the Housholder representation, 
+*>          = 'N':  No need for the Householder representation,
 *>                  in particular for the second stage (Band to
 *>                  tridiagonal) and thus LHOUS2 is of size max(1, 4*N);
-*>          = 'V':  the Householder representation is needed to 
-*>                  either generate Q1 Q2 or to apply Q1 Q2, 
+*>          = 'V':  the Householder representation is needed to
+*>                  either generate Q1 Q2 or to apply Q1 Q2,
 *>                  then LHOUS2 is to be queried and computed.
 *>                  (NOT AVAILABLE IN THIS RELEASE).
 *> \endverbatim
@@ -86,7 +86,7 @@
 *>          triangular part of A is not referenced.
 *>          On exit, if UPLO = 'U', the band superdiagonal
 *>          of A are overwritten by the corresponding elements of the
-*>          internal band-diagonal matrix AB, and the elements above 
+*>          internal band-diagonal matrix AB, and the elements above
 *>          the KD superdiagonal, with the array TAU, represent the orthogonal
 *>          matrix Q1 as a product of elementary reflectors; if UPLO
 *>          = 'L', the diagonal and band subdiagonal of A are over-
@@ -117,13 +117,13 @@
 *> \param[out] TAU
 *> \verbatim
 *>          TAU is REAL array, dimension (N-KD)
-*>          The scalar factors of the elementary reflectors of 
+*>          The scalar factors of the elementary reflectors of
 *>          the first stage (see Further Details).
 *> \endverbatim
 *>
 *> \param[out] HOUS2
 *> \verbatim
-*>          HOUS2 is REAL array, dimension (LHOUS2)
+*>          HOUS2 is REAL array, dimension (MAX(1,LHOUS2))
 *>          Stores the Householder representation of the stage2
 *>          band to tridiagonal.
 *> \endverbatim
@@ -132,6 +132,8 @@
 *> \verbatim
 *>          LHOUS2 is INTEGER
 *>          The dimension of the array HOUS2.
+*>          LHOUS2 >= 1.
+*>
 *>          If LWORK = -1, or LHOUS2 = -1,
 *>          then a query is assumed; the routine
 *>          only calculates the optimal size of the HOUS2 array, returns
@@ -149,17 +151,19 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK. LWORK = MAX(1, dimension)
-*>          If LWORK = -1, or LHOUS2=-1,
+*>          The dimension of the array WORK.
+*>          If N = 0, LWORK >= 1, else LWORK = MAX(1, dimension).
+*>
+*>          If LWORK = -1, or LHOUS2 = -1,
 *>          then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
 *>          LWORK = MAX(1, dimension) where
 *>          dimension   = max(stage1,stage2) + (KD+1)*N
-*>                      = N*KD + N*max(KD+1,FACTOPTNB) 
-*>                        + max(2*KD*KD, KD*NTHREADS) 
-*>                        + (KD+1)*N 
+*>                      = N*KD + N*max(KD+1,FACTOPTNB)
+*>                        + max(2*KD*KD, KD*NTHREADS)
+*>                        + (KD+1)*N
 *>          where KD is the blocking size of the reduction,
 *>          FACTOPTNB is the blocking used by the QR or LQ
 *>          algorithm, usually FACTOPTNB=128 is a good choice
@@ -177,12 +181,12 @@
 *  Authors:
 *  ========
 *
-*> \author Univ. of Tennessee 
-*> \author Univ. of California Berkeley 
-*> \author Univ. of Colorado Denver 
-*> \author NAG Ltd. 
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
 *
-*> \ingroup realSYcomputational
+*> \ingroup hetrd_2stage
 *
 *> \par Further Details:
 *  =====================
@@ -202,7 +206,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -210,16 +214,16 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *>
 *  =====================================================================
-      SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, 
+      SUBROUTINE SSYTRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU,
      $                          HOUS2, LHOUS2, WORK, LWORK, INFO )
 *
       IMPLICIT NONE
@@ -265,10 +269,13 @@
 *
       KD     = ILAENV2STAGE( 1, 'SSYTRD_2STAGE', VECT, N, -1, -1, -1 )
       IB     = ILAENV2STAGE( 2, 'SSYTRD_2STAGE', VECT, N, KD, -1, -1 )
-      LHMIN  = ILAENV2STAGE( 3, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 )
-      LWMIN  = ILAENV2STAGE( 4, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 )
-*      WRITE(*,*),'SSYTRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO,
-*     $            LHMIN, LWMIN
+      IF( N.EQ.0 ) THEN
+         LHMIN = 1
+         LWMIN = 1
+      ELSE
+         LHMIN = ILAENV2STAGE( 3, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 )
+         LWMIN = ILAENV2STAGE( 4, 'SSYTRD_2STAGE', VECT, N, KD, IB, -1 )
+      END IF
 *
       IF( .NOT.LSAME( VECT, 'N' ) ) THEN
          INFO = -1
@@ -309,14 +316,14 @@
       LWRK  = LWORK-LDAB*N
       ABPOS = 1
       WPOS  = ABPOS + LDAB*N
-      CALL SSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, 
+      CALL SSYTRD_SY2SB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB,
      $                   TAU, WORK( WPOS ), LWRK, INFO )
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'SSYTRD_SY2SB', -INFO )
          RETURN
       END IF
-      CALL SSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD, 
-     $                   WORK( ABPOS ), LDAB, D, E, 
+      CALL SSYTRD_SB2ST( 'Y', VECT, UPLO, N, KD,
+     $                   WORK( ABPOS ), LDAB, D, E,
      $                   HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO )
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'SSYTRD_SB2ST', -INFO )
@@ -324,8 +331,7 @@
       END IF
 *
 *
-      HOUS2( 1 ) = LHMIN
-      WORK( 1 )  = LWMIN
+      WORK( 1 ) = LWMIN
       RETURN
 *
 *     End of SSYTRD_2STAGE
diff --git a/lapack-netlib/SRC/ssytrd_sb2st.F b/lapack-netlib/SRC/ssytrd_sb2st.F
index faeeff5f4..111eaa93e 100644
--- a/lapack-netlib/SRC/ssytrd_sb2st.F
+++ b/lapack-netlib/SRC/ssytrd_sb2st.F
@@ -18,7 +18,7 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE SSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, 
+*       SUBROUTINE SSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB,
 *                               D, E, HOUS, LHOUS, WORK, LWORK, INFO )
 *
 *       #if defined(_OPENMP)
@@ -53,12 +53,12 @@
 *> \param[in] STAGE1
 *> \verbatim
 *>          STAGE1 is CHARACTER*1
-*>          = 'N':  "No": to mention that the stage 1 of the reduction  
+*>          = 'N':  "No": to mention that the stage 1 of the reduction
 *>                  from dense to band using the ssytrd_sy2sb routine
-*>                  was not called before this routine to reproduce AB. 
-*>                  In other term this routine is called as standalone. 
-*>          = 'Y':  "Yes": to mention that the stage 1 of the 
-*>                  reduction from dense to band using the ssytrd_sy2sb 
+*>                  was not called before this routine to reproduce AB.
+*>                  In other words, this routine is called standalone.
+*>          = 'Y':  "Yes": to mention that the stage 1 of the
+*>                  reduction from dense to band using the ssytrd_sy2sb
 *>                  routine has been called to produce AB (e.g., AB is
 *>                  the output of ssytrd_sy2sb.
 *> \endverbatim
@@ -66,10 +66,10 @@
 *> \param[in] VECT
 *> \verbatim
 *>          VECT is CHARACTER*1
-*>          = 'N':  No need for the Housholder representation, 
+*>          = 'N':  No need for the Householder representation,
 *>                  and thus LHOUS is of size max(1, 4*N);
-*>          = 'V':  the Householder representation is needed to 
-*>                  either generate or to apply Q later on, 
+*>          = 'V':  the Householder representation is needed to
+*>                  either generate or to apply Q later on,
 *>                  then LHOUS is to be queried and computed.
 *>                  (NOT AVAILABLE IN THIS RELEASE).
 *> \endverbatim
@@ -132,34 +132,39 @@
 *>
 *> \param[out] HOUS
 *> \verbatim
-*>          HOUS is REAL array, dimension LHOUS, that
-*>          store the Householder representation.
+*>          HOUS is REAL array, dimension (MAX(1,LHOUS))
+*>          Stores the Householder representation.
 *> \endverbatim
 *>
 *> \param[in] LHOUS
 *> \verbatim
 *>          LHOUS is INTEGER
-*>          The dimension of the array HOUS. LHOUS = MAX(1, dimension)
-*>          If LWORK = -1, or LHOUS=-1,
+*>          The dimension of the array HOUS.
+*>          If N = 0 or KD <= 1, LHOUS >= 1, else LHOUS = MAX(1, dimension)
+*>
+*>          If LWORK = -1, or LHOUS = -1,
 *>          then a query is assumed; the routine
 *>          only calculates the optimal size of the HOUS array, returns
 *>          this value as the first entry of the HOUS array, and no error
 *>          message related to LHOUS is issued by XERBLA.
 *>          LHOUS = MAX(1, dimension) where
 *>          dimension = 4*N if VECT='N'
-*>          not available now if VECT='H'     
+*>          not available now if VECT='V'
 *> \endverbatim
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is REAL array, dimension LWORK.
+*>          WORK is REAL array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK. LWORK = MAX(1, dimension)
-*>          If LWORK = -1, or LHOUS=-1,
+*>          The dimension of the array WORK.
+*>          IF N = 0 or KD <= 1, LWORK >= 1, else LWORK = MAX(1, dimension)
+*>
+*>          If LWORK = -1, or LHOUS = -1,
 *>          then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
@@ -188,7 +193,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup real16OTHERcomputational
+*> \ingroup hetrd_hb2st
 *
 *> \par Further Details:
 *  =====================
@@ -208,7 +213,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -216,16 +221,16 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *>
 *  =====================================================================
-      SUBROUTINE SSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, 
+      SUBROUTINE SSYTRD_SB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB,
      $                         D, E, HOUS, LHOUS, WORK, LWORK, INFO )
 *
 #if defined(_OPENMP)
@@ -258,11 +263,11 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY, WANTQ, UPPER, AFTERS1
-      INTEGER            I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, 
+      INTEGER            I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST,
      $                   ED, STIND, EDIND, BLKLASTIND, COLPT, THED,
      $                   STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID,
-     $                   NBTILES, TTYPE, TID, NTHREADS, DEBUG,
-     $                   ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, 
+     $                   NBTILES, TTYPE, TID, NTHREADS,
+     $                   ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS,
      $                   INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU,
      $                   SISEV, SIZETAU, LDV, LHMIN, LWMIN
 *     ..
@@ -274,15 +279,15 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      INTEGER            ILAENV2STAGE 
-      EXTERNAL           LSAME, ILAENV2STAGE
+      INTEGER            ILAENV2STAGE
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV2STAGE, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
 *     Determine the minimal workspace size required.
 *     Test the input parameters
 *
-      DEBUG   = 0
       INFO    = 0
       AFTERS1 = LSAME( STAGE1, 'Y' )
       WANTQ   = LSAME( VECT, 'V' )
@@ -291,9 +296,14 @@
 *
 *     Determine the block size, the workspace size and the hous size.
 *
-      IB     = ILAENV2STAGE( 2, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 )
-      LHMIN  = ILAENV2STAGE( 3, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 )
-      LWMIN  = ILAENV2STAGE( 4, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 )
+      IB       = ILAENV2STAGE( 2, 'SSYTRD_SB2ST', VECT, N, KD, -1, -1 )
+      IF( N.EQ.0 .OR. KD.LE.1 ) THEN
+         LHMIN = 1
+         LWMIN = 1
+      ELSE
+         LHMIN = ILAENV2STAGE( 3, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 )
+         LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SB2ST', VECT, N, KD, IB, -1 )
+      END IF
 *
       IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
          INFO = -1
@@ -314,8 +324,8 @@
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         HOUS( 1 ) = LHMIN
-         WORK( 1 ) = LWMIN
+         HOUS( 1 ) = SROUNDUP_LWORK( LHMIN )
+         WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -355,7 +365,7 @@
           ABDPOS   = KD + 1
           ABOFDPOS = KD
       ELSE
-          APOS     = INDA 
+          APOS     = INDA
           AWPOS    = INDA + KD + 1
           DPOS     = APOS
           OFDPOS   = DPOS + 1
@@ -363,11 +373,11 @@
           ABOFDPOS = 2
 
       ENDIF
-*      
-*     Case KD=0: 
-*     The matrix is diagonal. We just copy it (convert to "real" for 
-*     real because D is double and the imaginary part should be 0) 
-*     and store it in D. A sequential code here is better or 
+*
+*     Case KD=0:
+*     The matrix is diagonal. We just copy it (convert to "real" for
+*     real because D is double and the imaginary part should be 0)
+*     and store it in D. A sequential code here is better or
 *     in a parallel environment it might need two cores for D and E
 *
       IF( KD.EQ.0 ) THEN
@@ -382,17 +392,17 @@
           WORK( 1 ) = 1
           RETURN
       END IF
-*      
-*     Case KD=1: 
-*     The matrix is already Tridiagonal. We have to make diagonal 
+*
+*     Case KD=1:
+*     The matrix is already Tridiagonal. We have to make diagonal
 *     and offdiagonal elements real, and store them in D and E.
-*     For that, for real precision just copy the diag and offdiag 
-*     to D and E while for the COMPLEX case the bulge chasing is  
-*     performed to convert the hermetian tridiagonal to symmetric 
-*     tridiagonal. A simpler conversion formula might be used, but then 
+*     For that, for real precision just copy the diag and offdiag
+*     to D and E while for the COMPLEX case the bulge chasing is
+*     performed to convert the Hermitian tridiagonal to symmetric
+*     tridiagonal. A simpler conversion formula might be used, but then
 *     updating the Q matrix will be required and based if Q is generated
-*     or not this might complicate the story. 
-*      
+*     or not this might complicate the story.
+*
       IF( KD.EQ.1 ) THEN
           DO 50 I = 1, N
               D( I ) = ( AB( ABDPOS, I ) )
@@ -413,7 +423,7 @@
           RETURN
       END IF
 *
-*     Main code start here. 
+*     Main code starts here.
 *     Reduce the symmetric band of A to a tridiagonal matrix.
 *
       THGRSIZ   = N
@@ -422,7 +432,7 @@
       NBTILES   = CEILING( REAL(N)/REAL(KD) )
       STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) )
       THGRNB    = CEILING( REAL(N-1)/REAL(THGRSIZ) )
-*      
+*
       CALL SLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA )
       CALL SLASET( "A", KD,   N, ZERO, ZERO, WORK( AWPOS ), LDA )
 *
@@ -431,7 +441,7 @@
 *
 #if defined(_OPENMP)
 !$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND )
-!$OMP$         PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) 
+!$OMP$         PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID )
 !$OMP$         PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND )
 !$OMP$         SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK)
 !$OMP$         SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA )
@@ -440,7 +450,7 @@
 #endif
 *
 *     main bulge chasing loop
-*      
+*
       DO 100 THGRID = 1, THGRNB
           STT  = (THGRID-1)*THGRSIZ+1
           THED = MIN( (STT + THGRSIZ -1), (N-1))
@@ -451,7 +461,7 @@
                   ST = STT
                   DO 130 SWEEPID = ST, ED
                       DO 140 K = 1, GRSIZ
-                          MYID  = (I-SWEEPID)*(STEPERCOL*GRSIZ) 
+                          MYID  = (I-SWEEPID)*(STEPERCOL*GRSIZ)
      $                           + (M-1)*GRSIZ + K
                           IF ( MYID.EQ.1 ) THEN
                               TTYPE = 1
@@ -477,16 +487,16 @@
                           ENDIF
 *
 *                         Call the kernel
-*                             
+*
 #if defined(_OPENMP) && _OPENMP >= 201307
-                          IF( TTYPE.NE.1 ) THEN      
+                          IF( TTYPE.NE.1 ) THEN
 !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
 !$OMP$     DEPEND(in:WORK(MYID-1))
 !$OMP$     DEPEND(out:WORK(MYID))
                               TID      = OMP_GET_THREAD_NUM()
-                              CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+                              CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE,
      $                             STIND, EDIND, SWEEPID, N, KD, IB,
-     $                             WORK ( INDA ), LDA, 
+     $                             WORK ( INDA ), LDA,
      $                             HOUS( INDV ), HOUS( INDTAU ), LDV,
      $                             WORK( INDW + TID*KD ) )
 !$OMP END TASK
@@ -494,20 +504,20 @@
 !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
 !$OMP$     DEPEND(out:WORK(MYID))
                               TID      = OMP_GET_THREAD_NUM()
-                              CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+                              CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE,
      $                             STIND, EDIND, SWEEPID, N, KD, IB,
-     $                             WORK ( INDA ), LDA, 
+     $                             WORK ( INDA ), LDA,
      $                             HOUS( INDV ), HOUS( INDTAU ), LDV,
      $                             WORK( INDW + TID*KD ) )
 !$OMP END TASK
                           ENDIF
 #else
-                          CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+                          CALL SSB2ST_KERNELS( UPLO, WANTQ, TTYPE,
      $                         STIND, EDIND, SWEEPID, N, KD, IB,
-     $                         WORK ( INDA ), LDA, 
+     $                         WORK ( INDA ), LDA,
      $                         HOUS( INDV ), HOUS( INDTAU ), LDV,
-     $                         WORK( INDW + TID*KD ) )
-#endif 
+     $                         WORK( INDW ) )
+#endif
                           IF ( BLKLASTIND.GE.(N-1) ) THEN
                               STT = STT + 1
                               EXIT
@@ -522,14 +532,14 @@
 !$OMP END MASTER
 !$OMP END PARALLEL
 #endif
-*      
+*
 *     Copy the diagonal from A to D. Note that D is REAL thus only
 *     the Real part is needed, the imaginary part should be zero.
 *
       DO 150 I = 1, N
           D( I ) = ( WORK( DPOS+(I-1)*LDA ) )
   150 CONTINUE
-*      
+*
 *     Copy the off diagonal from A to E. Note that E is REAL thus only
 *     the Real part is needed, the imaginary part should be zero.
 *
@@ -543,11 +553,10 @@
   170     CONTINUE
       ENDIF
 *
-      HOUS( 1 ) = LHMIN
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       RETURN
 *
 *     End of SSYTRD_SB2ST
 *
       END
-      
+
diff --git a/lapack-netlib/SRC/ssytrd_sy2sb.f b/lapack-netlib/SRC/ssytrd_sy2sb.f
index 2c92cd14a..3996e07bb 100644
--- a/lapack-netlib/SRC/ssytrd_sy2sb.f
+++ b/lapack-netlib/SRC/ssytrd_sy2sb.f
@@ -124,7 +124,7 @@
 *> \param[out] WORK
 *> \verbatim
 *>          WORK is REAL array, dimension (LWORK)
-*>          On exit, if INFO = 0, or if LWORK=-1, 
+*>          On exit, if INFO = 0, or if LWORK = -1,
 *>          WORK(1) returns the size of LWORK.
 *> \endverbatim
 *>
@@ -132,7 +132,9 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK which should be calculated
-*>          by a workspace query. LWORK = MAX(1, LWORK_QUERY)
+*>          by a workspace query.
+*>          If N <= KD+1, LWORK >= 1, else LWORK = MAX(1, LWORK_QUERY)
+*>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
@@ -158,7 +160,7 @@
 *> \author Univ. of Colorado Denver 
 *> \author NAG Ltd. 
 *
-*> \ingroup realSYcomputational
+*> \ingroup hetrd_he2hb
 *
 *> \par Further Details:
 *  =====================
@@ -283,7 +285,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV2STAGE 
-      EXTERNAL           LSAME, ILAENV2STAGE
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV2STAGE, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -293,8 +296,12 @@
       INFO   = 0
       UPPER  = LSAME( UPLO, 'U' )
       LQUERY = ( LWORK.EQ.-1 )
-      LWMIN  = ILAENV2STAGE( 4, 'SSYTRD_SY2SB', '', N, KD, -1, -1 )
-      
+      IF( N.LE.KD+1 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = ILAENV2STAGE( 4, 'SSYTRD_SY2SB', '', N, KD, -1, -1 )
+      END IF
+*
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
@@ -313,7 +320,7 @@
          CALL XERBLA( 'SSYTRD_SY2SB', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
          RETURN
       END IF
 *
@@ -506,7 +513,7 @@
 
       END IF
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK( LWMIN )
       RETURN
 *
 *     End of SSYTRD_SY2SB
diff --git a/lapack-netlib/SRC/ssytrf.f b/lapack-netlib/SRC/ssytrf.f
index d188589b9..55f3a4f0f 100644
--- a/lapack-netlib/SRC/ssytrf.f
+++ b/lapack-netlib/SRC/ssytrf.f
@@ -135,7 +135,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYcomputational
+*> \ingroup hetrf
 *
 *> \par Further Details:
 *  =====================
@@ -202,7 +202,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLASYF, SSYTF2, XERBLA
@@ -232,8 +233,8 @@
 *        Determine the block size
 *
          NB = ILAENV( 1, 'SSYTRF', UPLO, N, -1, -1, -1 )
-         LWKOPT = N*NB
-         WORK( 1 ) = LWKOPT
+         LWKOPT = MAX( 1, N*NB )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -352,7 +353,8 @@
       END IF
 *
    40 CONTINUE
-      WORK( 1 ) = LWKOPT
+*
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       RETURN
 *
 *     End of SSYTRF
diff --git a/lapack-netlib/SRC/ssytrf_aa.f b/lapack-netlib/SRC/ssytrf_aa.f
index 13e849cdc..af32fb064 100644
--- a/lapack-netlib/SRC/ssytrf_aa.f
+++ b/lapack-netlib/SRC/ssytrf_aa.f
@@ -74,7 +74,7 @@
 *>
 *>          On exit, the tridiagonal matrix is stored in the diagonals
 *>          and the subdiagonals of A just below (or above) the diagonals,
-*>          and L is stored below (or above) the subdiaonals, when UPLO
+*>          and L is stored below (or above) the subdiagonals, when UPLO
 *>          is 'L' (or 'U').
 *> \endverbatim
 *>
@@ -101,8 +101,10 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK.  LWORK >= MAX(1,2*N). For optimum performance
-*>          LWORK >= N*(1+NB), where NB is the optimal blocksize.
+*>          The length of WORK.
+*>          LWORK >= 1, if N <= 1, and LWORK >= 2*N, otherwise.
+*>          For optimum performance LWORK >= N*(1+NB), where NB is
+*>          the optimal blocksize, returned by ILAENV.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -125,10 +127,10 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYcomputational
+*> \ingroup hetrf_aa
 *
 *  =====================================================================
-      SUBROUTINE SSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO)
+      SUBROUTINE SSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
@@ -142,24 +144,25 @@
 *     ..
 *     .. Array Arguments ..
       INTEGER            IPIV( * )
-      REAL   A( LDA, * ), WORK( * )
+      REAL               A( LDA, * ), WORK( * )
 *     ..
 *
 *  =====================================================================
 *     .. Parameters ..
-      REAL   ZERO, ONE
+      REAL               ZERO, ONE
       PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
 *
 *     .. Local Scalars ..
       LOGICAL            LQUERY, UPPER
-      INTEGER            J, LWKOPT
+      INTEGER            J, LWKMIN, LWKOPT
       INTEGER            NB, MJ, NJ, K1, K2, J1, J2, J3, JB
-      REAL   ALPHA
+      REAL               ALPHA
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLASYF_AA, SGEMV, SSCAL, SCOPY, SSWAP, SGEMM,
@@ -179,19 +182,26 @@
       INFO = 0
       UPPER = LSAME( UPLO, 'U' )
       LQUERY = ( LWORK.EQ.-1 )
+      IF( N.LE.1 ) THEN
+         LWKMIN = 1
+         LWKOPT = 1
+      ELSE
+         LWKMIN = 2*N
+         LWKOPT = (NB+1)*N
+      END IF
+*
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -7
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         LWKOPT = (NB+1)*N
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -203,11 +213,11 @@
 *
 *     Quick return
 *
-      IF ( N.EQ.0 ) THEN
+      IF( N.EQ.0 ) THEN
           RETURN
       ENDIF
       IPIV( 1 ) = 1
-      IF ( N.EQ.1 ) THEN
+      IF( N.EQ.1 ) THEN
          RETURN
       END IF
 *
@@ -457,7 +467,8 @@
       END IF
 *
    20 CONTINUE
-      WORK( 1 ) = LWKOPT
+*
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       RETURN
 *
 *     End of SSYTRF_AA
diff --git a/lapack-netlib/SRC/ssytrf_aa_2stage.f b/lapack-netlib/SRC/ssytrf_aa_2stage.f
index 94b3aa4b5..6b5cdee1b 100644
--- a/lapack-netlib/SRC/ssytrf_aa_2stage.f
+++ b/lapack-netlib/SRC/ssytrf_aa_2stage.f
@@ -75,7 +75,7 @@
 *>          triangular part of the matrix A, and the strictly upper
 *>          triangular part of A is not referenced.
 *>
-*>          On exit, L is stored below (or above) the subdiaonal blocks,
+*>          On exit, L is stored below (or above) the subdiagonal blocks,
 *>          when UPLO  is 'L' (or 'U').
 *> \endverbatim
 *>
@@ -94,7 +94,7 @@
 *> \param[in] LTB
 *> \verbatim
 *>          LTB is INTEGER
-*>          The size of the array TB. LTB >= 4*N, internally
+*>          The size of the array TB. LTB >= MAX(1,4*N), internally
 *>          used to select NB such that LTB >= (3*NB+1)*N.
 *>
 *>          If LTB = -1, then a workspace query is assumed; the
@@ -121,14 +121,14 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is REAL workspace of size LWORK
+*>          WORK is REAL workspace of size (MAX(1,LWORK))
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The size of WORK. LWORK >= N, internally used to select NB
-*>          such that LWORK >= N*NB.
+*>          The size of WORK. LWORK >= MAX(1,N), internally used to
+*>          select NB such that LWORK >= N*NB.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal size of the WORK array,
@@ -152,7 +152,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYcomputational
+*> \ingroup hetrf_aa_2stage
 *
 *  =====================================================================
       SUBROUTINE SSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV,
@@ -187,7 +187,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, SCOPY, SLACPY,
@@ -211,9 +212,9 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF ( LTB .LT. 4*N .AND. .NOT.TQUERY ) THEN
+      ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN
          INFO = -6
-      ELSE IF ( LWORK .LT. N .AND. .NOT.WQUERY ) THEN
+      ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.WQUERY ) THEN
          INFO = -10
       END IF
 *
@@ -227,10 +228,10 @@
       NB = ILAENV( 1, 'SSYTRF_AA_2STAGE', UPLO, N, -1, -1, -1 )
       IF( INFO.EQ.0 ) THEN
          IF( TQUERY ) THEN
-            TB( 1 ) = (3*NB+1)*N
+            TB( 1 ) = SROUNDUP_LWORK( MAX( 1, (3*NB+1)*N ) )
          END IF
          IF( WQUERY ) THEN
-            WORK( 1 ) = N*NB
+            WORK( 1 ) = SROUNDUP_LWORK( MAX( 1, N*NB ) )
          END IF
       END IF
       IF( TQUERY .OR. WQUERY ) THEN
@@ -239,7 +240,7 @@
 *
 *     Quick return
 *
-      IF ( N.EQ.0 ) THEN
+      IF( N.EQ.0 ) THEN
          RETURN
       ENDIF
 *
diff --git a/lapack-netlib/SRC/ssytrf_rk.f b/lapack-netlib/SRC/ssytrf_rk.f
index ec84fcb1b..89ecf38fd 100644
--- a/lapack-netlib/SRC/ssytrf_rk.f
+++ b/lapack-netlib/SRC/ssytrf_rk.f
@@ -177,14 +177,14 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is REAL array, dimension ( MAX(1,LWORK) ).
+*>          WORK is REAL array, dimension (MAX(1,LWORK)).
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK.  LWORK >=1.  For best performance
+*>          The length of WORK.  LWORK >= 1.  For best performance
 *>          LWORK >= N*NB, where NB is the block size returned
 *>          by ILAENV.
 *>
@@ -229,7 +229,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup singleSYcomputational
+*> \ingroup hetrf_rk
 *
 *> \par Further Details:
 *  =====================
@@ -280,7 +280,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLASYF_RK, SSYTF2_RK, SSWAP,  XERBLA
@@ -310,8 +311,8 @@
 *        Determine the block size
 *
          NB = ILAENV( 1, 'SSYTRF_RK', UPLO, N, -1, -1, -1 )
-         LWKOPT = N*NB
-         WORK( 1 ) = LWKOPT
+         LWKOPT = MAX( 1, N*NB )
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -487,7 +488,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       RETURN
 *
 *     End of SSYTRF_RK
diff --git a/lapack-netlib/SRC/ssytrf_rook.f b/lapack-netlib/SRC/ssytrf_rook.f
index 653289e2b..7c2cbbc57 100644
--- a/lapack-netlib/SRC/ssytrf_rook.f
+++ b/lapack-netlib/SRC/ssytrf_rook.f
@@ -118,7 +118,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK.  LWORK >=1.  For best performance
+*>          The length of WORK.  LWORK >= 1.  For best performance
 *>          LWORK >= N*NB, where NB is the block size returned by ILAENV.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
@@ -146,7 +146,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYcomputational
+*> \ingroup hetrf_rook
 *
 *> \par Further Details:
 *  =====================
@@ -228,7 +228,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLASYF_ROOK, SSYTF2_ROOK, XERBLA
@@ -259,7 +260,7 @@
 *
          NB = ILAENV( 1, 'SSYTRF_ROOK', UPLO, N, -1, -1, -1 )
          LWKOPT = MAX( 1, N*NB )
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -382,7 +383,8 @@
       END IF
 *
    40 CONTINUE
-      WORK( 1 ) = LWKOPT
+*
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
       RETURN
 *
 *     End of SSYTRF_ROOK
diff --git a/lapack-netlib/SRC/ssytri2.f b/lapack-netlib/SRC/ssytri2.f
index 49f6cad65..fd1c53473 100644
--- a/lapack-netlib/SRC/ssytri2.f
+++ b/lapack-netlib/SRC/ssytri2.f
@@ -88,16 +88,16 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is REAL array, dimension (N+NB+1)*(NB+3)
+*>          WORK is REAL array, dimension (MAX(1,LWORK))
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
-*>          WORK is size >= (N+NB+1)*(NB+3)
+*>          If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3).
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>           calculates:
+*>          calculates:
 *>              - the optimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array,
 *>              - and no error message related to LWORK is issued by XERBLA.
@@ -120,7 +120,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYcomputational
+*> \ingroup hetri2
 *
 *  =====================================================================
       SUBROUTINE SSYTRI2( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO )
@@ -147,7 +147,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SSYTRI, SSYTRI2X, XERBLA
@@ -159,9 +160,13 @@
       INFO = 0
       UPPER = LSAME( UPLO, 'U' )
       LQUERY = ( LWORK.EQ.-1 )
+*
 *     Get blocksize
+*
       NBMAX = ILAENV( 1, 'SSYTRF', UPLO, N, -1, -1, -1 )
-      IF ( NBMAX .GE. N ) THEN
+      IF( N.EQ.0 ) THEN
+         MINSIZE = 1
+      ELSE IF( NBMAX.GE.N ) THEN
          MINSIZE = N
       ELSE
          MINSIZE = (N+NBMAX+1)*(NBMAX+3)
@@ -173,28 +178,29 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF (LWORK .LT. MINSIZE .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.MINSIZE .AND. .NOT.LQUERY ) THEN
          INFO = -7
       END IF
-*
-*     Quick return if possible
-*
 *
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'SSYTRI2', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         WORK(1)=MINSIZE
+         WORK( 1 ) = SROUNDUP_LWORK( MINSIZE )
          RETURN
       END IF
+*
+*     Quick return if possible
+*
       IF( N.EQ.0 )
      $   RETURN
-
-      IF( NBMAX .GE. N ) THEN
+*
+      IF( NBMAX.GE.N ) THEN
          CALL SSYTRI( UPLO, N, A, LDA, IPIV, WORK, INFO )
       ELSE
          CALL SSYTRI2X( UPLO, N, A, LDA, IPIV, WORK, NBMAX, INFO )
       END IF
+*
       RETURN
 *
 *     End of SSYTRI2
diff --git a/lapack-netlib/SRC/ssytri_3.f b/lapack-netlib/SRC/ssytri_3.f
index 58d5df92a..f0152a149 100644
--- a/lapack-netlib/SRC/ssytri_3.f
+++ b/lapack-netlib/SRC/ssytri_3.f
@@ -119,16 +119,17 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is REAL array, dimension (N+NB+1)*(NB+3).
+*>          WORK is REAL array, dimension (MAX(1,LWORK)).
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK. LWORK >= (N+NB+1)*(NB+3).
+*>          The length of WORK.
+*>          If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3).
 *>
-*>          If LDWORK = -1, then a workspace query is assumed;
+*>          If LWORK = -1, then a workspace query is assumed;
 *>          the routine only calculates the optimal size of the optimal
 *>          size of the WORK array, returns this value as the first
 *>          entry of the WORK array, and no error message related to
@@ -152,7 +153,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup singleSYcomputational
+*> \ingroup hetri_3
 *
 *> \par Contributors:
 *  ==================
@@ -190,7 +191,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SSYTRI_3X, XERBLA
@@ -208,8 +210,13 @@
 *
 *     Determine the block size
 *
-      NB = MAX( 1, ILAENV( 1, 'SSYTRI_3', UPLO, N, -1, -1, -1 ) )
-      LWKOPT = ( N+NB+1 ) * ( NB+3 )
+      IF( N.EQ.0 ) THEN
+         LWKOPT = 1
+      ELSE
+         NB = MAX( 1, ILAENV( 1, 'SSYTRI_3', UPLO, N, -1, -1, -1 ) )
+         LWKOPT = ( N+NB+1 ) * ( NB+3 )
+      END IF
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
@@ -217,7 +224,7 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF ( LWORK .LT. LWKOPT .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKOPT .AND. .NOT.LQUERY ) THEN
          INFO = -8
       END IF
 *
@@ -225,7 +232,6 @@
          CALL XERBLA( 'SSYTRI_3', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         WORK( 1 ) = LWKOPT
          RETURN
       END IF
 *
@@ -236,7 +242,7 @@
 *
       CALL SSYTRI_3X( UPLO, N, A, LDA, E, IPIV, WORK, NB, INFO )
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK( LWKOPT )
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/ssytrs_aa.f b/lapack-netlib/SRC/ssytrs_aa.f
index 3cfa2a206..265cf0c1d 100644
--- a/lapack-netlib/SRC/ssytrs_aa.f
+++ b/lapack-netlib/SRC/ssytrs_aa.f
@@ -105,7 +105,13 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK. LWORK >= max(1,3*N-2).
+*>          The dimension of the array WORK.
+*>          If MIN(N,NRHS) = 0, LWORK >= 1, else LWORK >= 3*N-2.
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the minimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
 *> \endverbatim
 *>
 *> \param[out] INFO
@@ -123,7 +129,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYcomputational
+*> \ingroup hetrs_aa
 *
 *  =====================================================================
       SUBROUTINE SSYTRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB,
@@ -141,7 +147,7 @@
 *     ..
 *     .. Array Arguments ..
       INTEGER            IPIV( * )
-      REAL   A( LDA, * ), B( LDB, * ), WORK( * )
+      REAL               A( LDA, * ), B( LDB, * ), WORK( * )
 *     ..
 *
 *  =====================================================================
@@ -151,23 +157,31 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY, UPPER
-      INTEGER            K, KP, LWKOPT
+      INTEGER            K, KP, LWKMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       EXTERNAL           LSAME
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGTSV, SSWAP, SLACPY, STRSM, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          MAX
+      INTRINSIC          MIN, MAX
 *     ..
 *     .. Executable Statements ..
 *
       INFO = 0
       UPPER = LSAME( UPLO, 'U' )
       LQUERY = ( LWORK.EQ.-1 )
+      IF( MIN( N, NRHS ).EQ.0 ) THEN
+         LWKMIN = 1
+      ELSE
+         LWKMIN = 3*N-2
+      END IF
+*
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
@@ -178,21 +192,20 @@
          INFO = -5
       ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
          INFO = -8
-      ELSE IF( LWORK.LT.MAX( 1, 3*N-2 ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -10
       END IF
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'SSYTRS_AA', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         LWKOPT = (3*N-2)
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK( LWKMIN )
          RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( N.EQ.0 .OR. NRHS.EQ.0 )
+      IF( MIN( N, NRHS ).EQ.0 )
      $   RETURN
 *
       IF( UPPER ) THEN
diff --git a/lapack-netlib/SRC/stgevc.f b/lapack-netlib/SRC/stgevc.f
index 15fc88c4b..be4cb1829 100644
--- a/lapack-netlib/SRC/stgevc.f
+++ b/lapack-netlib/SRC/stgevc.f
@@ -52,7 +52,7 @@
 *>
 *>    S*x = w*P*x,  (y**H)*S = w*(y**H)*P,
 *>
-*> where y**H denotes the conjugate tranpose of y.
+*> where y**H denotes the conjugate transpose of y.
 *> The eigenvalues are not input to this routine, but are computed
 *> directly from the diagonal blocks of S and P.
 *>
@@ -337,7 +337,7 @@
       EXTERNAL           LSAME, SLAMCH
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SGEMV, SLABAD, SLACPY, SLAG2, SLALN2, XERBLA
+      EXTERNAL           SGEMV, SLACPY, SLAG2, SLALN2, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, MIN
@@ -463,7 +463,6 @@
 *
       SAFMIN = SLAMCH( 'Safe minimum' )
       BIG = ONE / SAFMIN
-      CALL SLABAD( SAFMIN, BIG )
       ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' )
       SMALL = SAFMIN*N / ULP
       BIG = ONE / SMALL
diff --git a/lapack-netlib/SRC/stgex2.f b/lapack-netlib/SRC/stgex2.f
index d61da2327..885739ab5 100644
--- a/lapack-netlib/SRC/stgex2.f
+++ b/lapack-netlib/SRC/stgex2.f
@@ -255,7 +255,7 @@
      $                   THRESHA, THRESHB
 *     ..
 *     .. Local Arrays ..
-      INTEGER            IWORK( LDST )
+      INTEGER            IWORK( LDST + 2 )
       REAL               AI( 2 ), AR( 2 ), BE( 2 ), IR( LDST, LDST ),
      $                   IRCOP( LDST, LDST ), LI( LDST, LDST ),
      $                   LICOP( LDST, LDST ), S( LDST, LDST ),
diff --git a/lapack-netlib/SRC/stgexc.f b/lapack-netlib/SRC/stgexc.f
index d1ad79936..d68eb5fc7 100644
--- a/lapack-netlib/SRC/stgexc.f
+++ b/lapack-netlib/SRC/stgexc.f
@@ -195,7 +195,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realGEcomputational
+*> \ingroup tgexc
 *
 *> \par Contributors:
 *  ==================
@@ -241,6 +241,10 @@
       LOGICAL            LQUERY
       INTEGER            HERE, LWMIN, NBF, NBL, NBNEXT
 *     ..
+*     .. External Functions ..
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           SROUNDUP_LWORK
+*     ..
 *     .. External Subroutines ..
       EXTERNAL           STGEX2, XERBLA
 *     ..
@@ -533,7 +537,7 @@
      $      GO TO 20
       END IF
       ILST = HERE
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       RETURN
 *
 *     End of STGEXC
diff --git a/lapack-netlib/SRC/stgsen.f b/lapack-netlib/SRC/stgsen.f
index f1103d740..ac9c4677a 100644
--- a/lapack-netlib/SRC/stgsen.f
+++ b/lapack-netlib/SRC/stgsen.f
@@ -304,7 +304,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup tgsen
 *
 *> \par Further Details:
 *  =====================
@@ -490,8 +490,8 @@
      $                   XERBLA
 *     ..
 *     .. External Functions ..
-      REAL               SLAMCH
-      EXTERNAL           SLAMCH
+      REAL               SLAMCH, SROUNDUP_LWORK
+      EXTERNAL           SLAMCH, SROUNDUP_LWORK
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, SIGN, SQRT
@@ -571,7 +571,7 @@
          LIWMIN = 1
       END IF
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       IWORK( 1 ) = LIWMIN
 *
       IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
@@ -852,7 +852,7 @@
          END IF
    70 CONTINUE
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       IWORK( 1 ) = LIWMIN
 *
       RETURN
diff --git a/lapack-netlib/SRC/stgsna.f b/lapack-netlib/SRC/stgsna.f
index 40f822915..e8cb28b95 100644
--- a/lapack-netlib/SRC/stgsna.f
+++ b/lapack-netlib/SRC/stgsna.f
@@ -230,7 +230,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup tgsna
 *
 *> \par Further Details:
 *  =====================
@@ -416,8 +416,9 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               SDOT, SLAMCH, SLAPY2, SNRM2
-      EXTERNAL           LSAME, SDOT, SLAMCH, SLAPY2, SNRM2
+      REAL               SDOT, SLAMCH, SLAPY2, SNRM2, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SDOT, SLAMCH, SLAPY2, SNRM2,
+     $                   SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGEMV, SLACPY, SLAG2, STGEXC, STGSYL, XERBLA
@@ -490,7 +491,7 @@
          ELSE
             LWMIN = N
          END IF
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
 *
          IF( MM.LT.M ) THEN
             INFO = -15
@@ -632,8 +633,8 @@
                C1 = TWO*( ALPHAR*ALPHAR+ALPHAI*ALPHAI+BETA*BETA )
                C2 = FOUR*BETA*BETA*ALPHAI*ALPHAI
                ROOT1 = C1 + SQRT( C1*C1-4.0*C2 )
-               ROOT2 = C2 / ROOT1
                ROOT1 = ROOT1 / TWO
+               ROOT2 = C2 / ROOT1
                COND = MIN( SQRT( ROOT1 ), SQRT( ROOT2 ) )
             END IF
 *
@@ -689,7 +690,7 @@
      $      KS = KS + 1
 *
    20 CONTINUE
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       RETURN
 *
 *     End of STGSNA
diff --git a/lapack-netlib/SRC/stgsyl.f b/lapack-netlib/SRC/stgsyl.f
index 733c8ab9c..07a82e380 100644
--- a/lapack-netlib/SRC/stgsyl.f
+++ b/lapack-netlib/SRC/stgsyl.f
@@ -261,7 +261,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realSYcomputational
+*> \ingroup tgsyl
 *
 *> \par Contributors:
 *  ==================
@@ -331,7 +331,8 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SGEMM, SLACPY, SLASET, SSCAL, STGSY2, XERBLA
@@ -384,7 +385,7 @@
          ELSE
             LWMIN = 1
          END IF
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
 *
          IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
             INFO = -20
@@ -670,7 +671,7 @@
 *
       END IF
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/strevc3.f b/lapack-netlib/SRC/strevc3.f
index 5af57123b..253cbc24c 100644
--- a/lapack-netlib/SRC/strevc3.f
+++ b/lapack-netlib/SRC/strevc3.f
@@ -298,7 +298,7 @@
 *
       INFO = 0
       NB = ILAENV( 1, 'STREVC', SIDE // HOWMNY, N, -1, -1, -1 )
-      MAXWRK = N + 2*N*NB
+      MAXWRK = MAX( 1, N + 2*N*NB )
       WORK(1) = MAXWRK
       LQUERY = ( LWORK.EQ.-1 )
       IF( .NOT.RIGHTV .AND. .NOT.LEFTV ) THEN
diff --git a/lapack-netlib/SRC/strsen.f b/lapack-netlib/SRC/strsen.f
index c0f75fb12..f7a05ae8b 100644
--- a/lapack-netlib/SRC/strsen.f
+++ b/lapack-netlib/SRC/strsen.f
@@ -231,7 +231,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup trsen
 *
 *> \par Further Details:
 *  =====================
@@ -346,8 +346,8 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      REAL               SLANGE
-      EXTERNAL           LSAME, SLANGE
+      REAL               SLANGE, SROUNDUP_LWORK
+      EXTERNAL           LSAME, SLANGE, SROUNDUP_LWORK
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLACN2, SLACPY, STREXC, STRSYL, XERBLA
@@ -427,7 +427,7 @@
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         WORK( 1 ) = LWMIN
+         WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
          IWORK( 1 ) = LIWMIN
       END IF
 *
@@ -558,7 +558,7 @@
          END IF
    60 CONTINUE
 *
-      WORK( 1 ) = LWMIN
+      WORK( 1 ) = SROUNDUP_LWORK(LWMIN)
       IWORK( 1 ) = LIWMIN
 *
       RETURN
diff --git a/lapack-netlib/SRC/strsyl3.f b/lapack-netlib/SRC/strsyl3.f
index 28762c2ed..ef3f2da83 100644
--- a/lapack-netlib/SRC/strsyl3.f
+++ b/lapack-netlib/SRC/strsyl3.f
@@ -1223,7 +1223,7 @@
 *
          SCALOC = MIN( BIGNUM / SCAL, ONE / BUF )
          BUF = BUF * SCALOC
-         CALL SLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK )
+         CALL SLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK(1) )
       END IF
 *
 *     Combine with buffer scaling factor. SCALE will be flushed if
diff --git a/lapack-netlib/SRC/stzrzf.f b/lapack-netlib/SRC/stzrzf.f
index e8cbb56b6..516bea5d4 100644
--- a/lapack-netlib/SRC/stzrzf.f
+++ b/lapack-netlib/SRC/stzrzf.f
@@ -116,7 +116,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERcomputational
+*> \ingroup tzrzf
 *
 *> \par Contributors:
 *  ==================
@@ -179,7 +179,8 @@
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
-      EXTERNAL           ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@@ -207,7 +208,7 @@
             LWKOPT = M*NB
             LWKMIN = MAX( 1, M )
          END IF
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
          IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
             INFO = -7
@@ -301,7 +302,7 @@
       IF( MU.GT.0 )
      $   CALL SLATRZ( MU, N, N-M, A, LDA, TAU, WORK )
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/zbbcsd.f b/lapack-netlib/SRC/zbbcsd.f
index 757e98c71..6601f4a06 100644
--- a/lapack-netlib/SRC/zbbcsd.f
+++ b/lapack-netlib/SRC/zbbcsd.f
@@ -804,7 +804,7 @@
                CALL DLARTGP( B22BULGE, B22E(I-1), RWORK(IU2SN+I-1),
      $                       RWORK(IU2CS+I-1), R )
             ELSE IF( NU .LT. MU ) THEN
-               CALL DLARTGS( B21E(I), B21E(I+1), NU, RWORK(IU2CS+I-1),
+               CALL DLARTGS( B21E(I), B21D(I+1), NU, RWORK(IU2CS+I-1),
      $                       RWORK(IU2SN+I-1) )
             ELSE
                CALL DLARTGS( B22D(I), B22E(I), MU, RWORK(IU2CS+I-1),
diff --git a/lapack-netlib/SRC/zbdsqr.f b/lapack-netlib/SRC/zbdsqr.f
index 2ad6142a7..865bb9dd5 100644
--- a/lapack-netlib/SRC/zbdsqr.f
+++ b/lapack-netlib/SRC/zbdsqr.f
@@ -204,6 +204,17 @@
 *>          algorithm through its inner loop. The algorithms stops
 *>          (and so fails to converge) if the number of passes
 *>          through the inner loop exceeds MAXITR*N**2.
+*>
+*> \endverbatim
+*
+*> \par Note:
+*  ===========
+*>
+*> \verbatim
+*>  Bug report from Cezary Dendek.
+*>  On November 3rd 2023, the INTEGER variable MAXIT = MAXITR*N**2 was
+*>  removed since it can overflow pretty easily (for N greater than or
+*>  equal to 18,919). We instead use MAXITDIVN = MAXITR*N.
 *> \endverbatim
 *
 *  Authors:
@@ -214,7 +225,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16OTHERcomputational
+*> \ingroup bdsqr
 *
 *  =====================================================================
       SUBROUTINE ZBDSQR( UPLO, N, NCVT, NRU, NCC, D, E, VT, LDVT, U,
@@ -255,11 +266,11 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LOWER, ROTATE
-      INTEGER            I, IDIR, ISUB, ITER, J, LL, LLL, M, MAXIT, NM1,
-     $                   NM12, NM13, OLDLL, OLDM
+      INTEGER            I, IDIR, ISUB, ITER, ITERDIVN, J, LL, LLL, M,
+     $                   MAXITDIVN, NM1, NM12, NM13, OLDLL, OLDM
       DOUBLE PRECISION   ABSE, ABSS, COSL, COSR, CS, EPS, F, G, H, MU,
      $                   OLDCS, OLDSN, R, SHIFT, SIGMN, SIGMX, SINL,
-     $                   SINR, SLL, SMAX, SMIN, SMINL, SMINOA,
+     $                   SINR, SLL, SMAX, SMIN, SMINOA,
      $                   SN, THRESH, TOL, TOLMUL, UNFL
 *     ..
 *     .. External Functions ..
@@ -372,7 +383,7 @@
       DO 30 I = 1, N - 1
          SMAX = MAX( SMAX, ABS( E( I ) ) )
    30 CONTINUE
-      SMINL = ZERO
+      SMIN = ZERO
       IF( TOL.GE.ZERO ) THEN
 *
 *        Relative accuracy desired
@@ -389,20 +400,21 @@
    40    CONTINUE
    50    CONTINUE
          SMINOA = SMINOA / SQRT( DBLE( N ) )
-         THRESH = MAX( TOL*SMINOA, MAXITR*N*N*UNFL )
+         THRESH = MAX( TOL*SMINOA, MAXITR*(N*(N*UNFL)) )
       ELSE
 *
 *        Absolute accuracy desired
 *
-         THRESH = MAX( ABS( TOL )*SMAX, MAXITR*N*N*UNFL )
+         THRESH = MAX( ABS( TOL )*SMAX, MAXITR*(N*(N*UNFL)) )
       END IF
 *
 *     Prepare for main iteration loop for the singular values
 *     (MAXIT is the maximum number of passes through the inner
 *     loop permitted before nonconvergence signalled.)
 *
-      MAXIT = MAXITR*N*N
-      ITER = 0
+      MAXITDIVN = MAXITR*N
+      ITERDIVN = 0
+      ITER = -1
       OLDLL = -1
       OLDM = -1
 *
@@ -418,15 +430,18 @@
 *
       IF( M.LE.1 )
      $   GO TO 160
-      IF( ITER.GT.MAXIT )
-     $   GO TO 200
+      IF( ITER.GE.N ) THEN
+         ITER = ITER - N
+         ITERDIVN = ITERDIVN + 1
+         IF( ITERDIVN.GE.MAXITDIVN )
+     $      GO TO 200
+      END IF
 *
 *     Find diagonal block of matrix to work on
 *
       IF( TOL.LT.ZERO .AND. ABS( D( M ) ).LE.THRESH )
      $   D( M ) = ZERO
       SMAX = ABS( D( M ) )
-      SMIN = SMAX
       DO 70 LLL = 1, M - 1
          LL = M - LLL
          ABSS = ABS( D( LL ) )
@@ -435,7 +450,6 @@
      $      D( LL ) = ZERO
          IF( ABSE.LE.THRESH )
      $      GO TO 80
-         SMIN = MIN( SMIN, ABSS )
          SMAX = MAX( SMAX, ABSS, ABSE )
    70 CONTINUE
       LL = 0
@@ -517,14 +531,14 @@
 *           apply convergence criterion forward
 *
             MU = ABS( D( LL ) )
-            SMINL = MU
+            SMIN = MU
             DO 100 LLL = LL, M - 1
                IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN
                   E( LLL ) = ZERO
                   GO TO 60
                END IF
                MU = ABS( D( LLL+1 ) )*( MU / ( MU+ABS( E( LLL ) ) ) )
-               SMINL = MIN( SMINL, MU )
+               SMIN = MIN( SMIN, MU )
   100       CONTINUE
          END IF
 *
@@ -545,14 +559,14 @@
 *           apply convergence criterion backward
 *
             MU = ABS( D( M ) )
-            SMINL = MU
+            SMIN = MU
             DO 110 LLL = M - 1, LL, -1
                IF( ABS( E( LLL ) ).LE.TOL*MU ) THEN
                   E( LLL ) = ZERO
                   GO TO 60
                END IF
                MU = ABS( D( LLL ) )*( MU / ( MU+ABS( E( LLL ) ) ) )
-               SMINL = MIN( SMINL, MU )
+               SMIN = MIN( SMIN, MU )
   110       CONTINUE
          END IF
       END IF
@@ -562,7 +576,7 @@
 *     Compute shift.  First, test if shifting would ruin relative
 *     accuracy, and if so set the shift to zero.
 *
-      IF( TOL.GE.ZERO .AND. N*TOL*( SMINL / SMAX ).LE.
+      IF( TOL.GE.ZERO .AND. N*TOL*( SMIN / SMAX ).LE.
      $    MAX( EPS, HNDRTH*TOL ) ) THEN
 *
 *        Use a zero shift to avoid loss of relative accuracy
diff --git a/lapack-netlib/SRC/zcposv.f b/lapack-netlib/SRC/zcposv.f
index 9b1940d60..649d2c049 100644
--- a/lapack-netlib/SRC/zcposv.f
+++ b/lapack-netlib/SRC/zcposv.f
@@ -187,10 +187,10 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i of
-*>                (COMPLEX*16) A is not positive definite, so the
-*>                factorization could not be completed, and the solution
-*>                has not been computed.
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                of (COMPLEX*16) A is not positive, so the factorization
+*>                could not be completed, and the solution has not been
+*>                computed.
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/zgebal.f b/lapack-netlib/SRC/zgebal.f
index d4a9e39f1..a467991d4 100644
--- a/lapack-netlib/SRC/zgebal.f
+++ b/lapack-netlib/SRC/zgebal.f
@@ -89,7 +89,7 @@
 *> \param[out] IHI
 *> \verbatim
 *>          IHI is INTEGER
-*>          ILO and IHI are set to INTEGER such that on exit
+*>          ILO and IHI are set to integers such that on exit
 *>          A(i,j) = 0 if i > j and j = 1,...,ILO-1 or I = IHI+1,...,N.
 *>          If JOB = 'N' or 'S', ILO = 1 and IHI = N.
 *> \endverbatim
@@ -155,6 +155,9 @@
 *>
 *>  Modified by Tzu-Yi Chen, Computer Science Division, University of
 *>    California at Berkeley, USA
+*>
+*>  Refactored by Evert Provoost, Department of Computer Science,
+*>    KU Leuven, Belgium
 *> \endverbatim
 *>
 *  =====================================================================
@@ -184,8 +187,8 @@
       PARAMETER          ( FACTOR = 0.95D+0 )
 *     ..
 *     .. Local Scalars ..
-      LOGICAL            NOCONV
-      INTEGER            I, ICA, IEXC, IRA, J, K, L, M
+      LOGICAL            NOCONV, CANSWAP
+      INTEGER            I, ICA, IRA, J, K, L
       DOUBLE PRECISION   C, CA, F, G, R, RA, S, SFMAX1, SFMAX2, SFMIN1,
      $                   SFMIN2
 *     ..
@@ -217,176 +220,194 @@
          RETURN
       END IF
 *
-      K = 1
-      L = N
+*     Quick returns.
 *
-      IF( N.EQ.0 )
-     $   GO TO 210
+      IF( N.EQ.0 ) THEN
+         ILO = 1
+         IHI = 0
+         RETURN
+      END IF
 *
       IF( LSAME( JOB, 'N' ) ) THEN
-         DO 10 I = 1, N
+         DO I = 1, N
             SCALE( I ) = ONE
-   10    CONTINUE
-         GO TO 210
+         END DO
+         ILO = 1
+         IHI = N
+         RETURN
       END IF
 *
-      IF( LSAME( JOB, 'S' ) )
-     $   GO TO 120
-*
-*     Permutation to isolate eigenvalues if possible
-*
-      GO TO 50
-*
-*     Row and column exchange.
-*
-   20 CONTINUE
-      SCALE( M ) = J
-      IF( J.EQ.M )
-     $   GO TO 30
-*
-      CALL ZSWAP( L, A( 1, J ), 1, A( 1, M ), 1 )
-      CALL ZSWAP( N-K+1, A( J, K ), LDA, A( M, K ), LDA )
-*
-   30 CONTINUE
-      GO TO ( 40, 80 )IEXC
-*
-*     Search for rows isolating an eigenvalue and push them down.
-*
-   40 CONTINUE
-      IF( L.EQ.1 )
-     $   GO TO 210
-      L = L - 1
-*
-   50 CONTINUE
-      DO 70 J = L, 1, -1
+*     Permutation to isolate eigenvalues if possible.
 *
-         DO 60 I = 1, L
-            IF( I.EQ.J )
-     $         GO TO 60
-            IF( DBLE( A( J, I ) ).NE.ZERO .OR. DIMAG( A( J, I ) ).NE.
-     $          ZERO )GO TO 70
-   60    CONTINUE
-*
-         M = L
-         IEXC = 1
-         GO TO 20
-   70 CONTINUE
-*
-      GO TO 90
+      K = 1
+      L = N
 *
-*     Search for columns isolating an eigenvalue and push them left.
+      IF( .NOT.LSAME( JOB, 'S' ) ) THEN
 *
-   80 CONTINUE
-      K = K + 1
+*        Row and column exchange.
 *
-   90 CONTINUE
-      DO 110 J = K, L
+         NOCONV = .TRUE.
+         DO WHILE( NOCONV )
+*
+*           Search for rows isolating an eigenvalue and push them down.
+*
+            NOCONV = .FALSE.
+            DO I = L, 1, -1
+               CANSWAP = .TRUE.
+               DO J = 1, L
+                  IF( I.NE.J .AND. ( DBLE( A( I, J ) ).NE.ZERO .OR.
+     $                DIMAG( A( I, J ) ).NE.ZERO ) ) THEN
+                     CANSWAP = .FALSE.
+                     EXIT
+                  END IF
+               END DO
+*
+               IF( CANSWAP ) THEN
+                  SCALE( L ) = I
+                  IF( I.NE.L ) THEN
+                     CALL ZSWAP( L, A( 1, I ), 1, A( 1, L ), 1 )
+                     CALL ZSWAP( N-K+1, A( I, K ), LDA, A( L, K ), LDA )
+                  END IF
+                  NOCONV = .TRUE.
+*
+                  IF( L.EQ.1 ) THEN
+                     ILO = 1
+                     IHI = 1
+                     RETURN
+                  END IF
+*
+                  L = L - 1
+               END IF
+            END DO
+*
+         END DO
+
+         NOCONV = .TRUE.
+         DO WHILE( NOCONV )
+*
+*           Search for columns isolating an eigenvalue and push them left.
+*
+            NOCONV = .FALSE.
+            DO J = K, L
+               CANSWAP = .TRUE.
+               DO I = K, L
+                  IF( I.NE.J .AND. ( DBLE( A( I, J ) ).NE.ZERO .OR.
+     $                DIMAG( A( I, J ) ).NE.ZERO ) ) THEN
+                     CANSWAP = .FALSE.
+                     EXIT
+                  END IF
+               END DO
+*
+               IF( CANSWAP ) THEN
+                  SCALE( K ) = J
+                  IF( J.NE.K ) THEN
+                     CALL ZSWAP( L, A( 1, J ), 1, A( 1, K ), 1 )
+                     CALL ZSWAP( N-K+1, A( J, K ), LDA, A( K, K ), LDA )
+                  END IF
+                  NOCONV = .TRUE.
+*
+                  K = K + 1
+               END IF
+            END DO
+*
+         END DO
 *
-         DO 100 I = K, L
-            IF( I.EQ.J )
-     $         GO TO 100
-            IF( DBLE( A( I, J ) ).NE.ZERO .OR. DIMAG( A( I, J ) ).NE.
-     $          ZERO )GO TO 110
-  100    CONTINUE
+      END IF
 *
-         M = K
-         IEXC = 2
-         GO TO 20
-  110 CONTINUE
+*     Initialize SCALE for non-permuted submatrix.
 *
-  120 CONTINUE
-      DO 130 I = K, L
+      DO I = K, L
          SCALE( I ) = ONE
-  130 CONTINUE
+      END DO
 *
-      IF( LSAME( JOB, 'P' ) )
-     $   GO TO 210
+*     If we only had to permute, we are done.
+*
+      IF( LSAME( JOB, 'P' ) ) THEN
+         ILO = K
+         IHI = L
+         RETURN
+      END IF
 *
 *     Balance the submatrix in rows K to L.
 *
-*     Iterative loop for norm reduction
+*     Iterative loop for norm reduction.
 *
       SFMIN1 = DLAMCH( 'S' ) / DLAMCH( 'P' )
       SFMAX1 = ONE / SFMIN1
       SFMIN2 = SFMIN1*SCLFAC
       SFMAX2 = ONE / SFMIN2
-  140 CONTINUE
-      NOCONV = .FALSE.
-*
-      DO 200 I = K, L
-*
-         C = DZNRM2( L-K+1, A( K, I ), 1 )
-         R = DZNRM2( L-K+1, A( I, K ), LDA )
-         ICA = IZAMAX( L, A( 1, I ), 1 )
-         CA = ABS( A( ICA, I ) )
-         IRA = IZAMAX( N-K+1, A( I, K ), LDA )
-         RA = ABS( A( I, IRA+K-1 ) )
-*
-*        Guard against zero C or R due to underflow.
-*
-         IF( C.EQ.ZERO .OR. R.EQ.ZERO )
-     $      GO TO 200
-         G = R / SCLFAC
-         F = ONE
-         S = C + R
-  160    CONTINUE
-         IF( C.GE.G .OR. MAX( F, C, CA ).GE.SFMAX2 .OR.
-     $       MIN( R, G, RA ).LE.SFMIN2 )GO TO 170
-            IF( DISNAN( C+F+CA+R+G+RA ) ) THEN
 *
-*           Exit if NaN to avoid infinite loop
+      NOCONV = .TRUE.
+      DO WHILE( NOCONV )
+         NOCONV = .FALSE.
 *
-            INFO = -3
-            CALL XERBLA( 'ZGEBAL', -INFO )
-            RETURN
-         END IF
-         F = F*SCLFAC
-         C = C*SCLFAC
-         CA = CA*SCLFAC
-         R = R / SCLFAC
-         G = G / SCLFAC
-         RA = RA / SCLFAC
-         GO TO 160
-*
-  170    CONTINUE
-         G = C / SCLFAC
-  180    CONTINUE
-         IF( G.LT.R .OR. MAX( R, RA ).GE.SFMAX2 .OR.
-     $       MIN( F, C, G, CA ).LE.SFMIN2 )GO TO 190
-         F = F / SCLFAC
-         C = C / SCLFAC
-         G = G / SCLFAC
-         CA = CA / SCLFAC
-         R = R*SCLFAC
-         RA = RA*SCLFAC
-         GO TO 180
-*
-*        Now balance.
-*
-  190    CONTINUE
-         IF( ( C+R ).GE.FACTOR*S )
-     $      GO TO 200
-         IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN
-            IF( F*SCALE( I ).LE.SFMIN1 )
-     $         GO TO 200
-         END IF
-         IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN
-            IF( SCALE( I ).GE.SFMAX1 / F )
-     $         GO TO 200
-         END IF
-         G = ONE / F
-         SCALE( I ) = SCALE( I )*F
-         NOCONV = .TRUE.
+         DO I = K, L
 *
-         CALL ZDSCAL( N-K+1, G, A( I, K ), LDA )
-         CALL ZDSCAL( L, F, A( 1, I ), 1 )
+            C = DZNRM2( L-K+1, A( K, I ), 1 )
+            R = DZNRM2( L-K+1, A( I, K ), LDA )
+            ICA = IZAMAX( L, A( 1, I ), 1 )
+            CA = ABS( A( ICA, I ) )
+            IRA = IZAMAX( N-K+1, A( I, K ), LDA )
+            RA = ABS( A( I, IRA+K-1 ) )
 *
-  200 CONTINUE
+*           Guard against zero C or R due to underflow.
+*
+            IF( C.EQ.ZERO .OR. R.EQ.ZERO ) CYCLE
+*
+*           Exit if NaN to avoid infinite loop
 *
-      IF( NOCONV )
-     $   GO TO 140
+            IF( DISNAN( C+CA+R+RA ) ) THEN
+               INFO = -3
+               CALL XERBLA( 'ZGEBAL', -INFO )
+               RETURN
+            END IF
+*
+            G = R / SCLFAC
+            F = ONE
+            S = C + R
+*
+            DO WHILE( C.LT.G .AND. MAX( F, C, CA ).LT.SFMAX2 .AND.
+     $                MIN( R, G, RA ).GT.SFMIN2 )
+               F = F*SCLFAC
+               C = C*SCLFAC
+               CA = CA*SCLFAC
+               R = R / SCLFAC
+               G = G / SCLFAC
+               RA = RA / SCLFAC
+            END DO
+*
+            G = C / SCLFAC
+*
+            DO WHILE( G.GE.R .AND. MAX( R, RA ).LT.SFMAX2 .AND.
+     $                MIN( F, C, G, CA ).GT.SFMIN2 )
+               F = F / SCLFAC
+               C = C / SCLFAC
+               G = G / SCLFAC
+               CA = CA / SCLFAC
+               R = R*SCLFAC
+               RA = RA*SCLFAC
+            END DO
+*
+*           Now balance.
+*
+            IF( ( C+R ).GE.FACTOR*S ) CYCLE
+            IF( F.LT.ONE .AND. SCALE( I ).LT.ONE ) THEN
+               IF( F*SCALE( I ).LE.SFMIN1 ) CYCLE
+            END IF
+            IF( F.GT.ONE .AND. SCALE( I ).GT.ONE ) THEN
+               IF( SCALE( I ).GE.SFMAX1 / F ) CYCLE
+            END IF
+            G = ONE / F
+            SCALE( I ) = SCALE( I )*F
+            NOCONV = .TRUE.
+*
+            CALL ZDSCAL( N-K+1, G, A( I, K ), LDA )
+            CALL ZDSCAL( L, F, A( 1, I ), 1 )
+*
+         END DO
+*
+      END DO
 *
-  210 CONTINUE
       ILO = K
       IHI = L
 *
diff --git a/lapack-netlib/SRC/zgebrd.f b/lapack-netlib/SRC/zgebrd.f
index f1791c6a4..c1a6169a7 100644
--- a/lapack-netlib/SRC/zgebrd.f
+++ b/lapack-netlib/SRC/zgebrd.f
@@ -122,7 +122,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of the array WORK.  LWORK >= max(1,M,N).
+*>          The length of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MAX(M,N), otherwise.
 *>          For optimum performance LWORK >= (M+N)*NB, where NB
 *>          is the optimal blocksize.
 *>
@@ -147,7 +148,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16GEcomputational
+*> \ingroup gebrd
 *
 *> \par Further Details:
 *  =====================
@@ -223,8 +224,8 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IINFO, J, LDWRKX, LDWRKY, LWKOPT, MINMN, NB,
-     $                   NBMIN, NX, WS
+      INTEGER            I, IINFO, J, LDWRKX, LDWRKY, LWKMIN, LWKOPT,
+     $                   MINMN, NB, NBMIN, NX, WS
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, ZGEBD2, ZGEMM, ZLABRD
@@ -241,9 +242,17 @@
 *     Test the input parameters
 *
       INFO = 0
-      NB = MAX( 1, ILAENV( 1, 'ZGEBRD', ' ', M, N, -1, -1 ) )
-      LWKOPT = ( M+N )*NB
+      MINMN = MIN( M, N )
+      IF( MINMN.EQ.0 ) THEN
+         LWKMIN = 1
+         LWKOPT = 1
+      ELSE
+         LWKMIN = MAX( M, N )
+         NB = MAX( 1, ILAENV( 1, 'ZGEBRD', ' ', M, N, -1, -1 ) )
+         LWKOPT = ( M+N )*NB
+      END IF
       WORK( 1 ) = DBLE( LWKOPT )
+*
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -251,7 +260,7 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, M, N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -10
       END IF
       IF( INFO.LT.0 ) THEN
@@ -263,7 +272,6 @@
 *
 *     Quick return if possible
 *
-      MINMN = MIN( M, N )
       IF( MINMN.EQ.0 ) THEN
          WORK( 1 ) = 1
          RETURN
@@ -282,7 +290,7 @@
 *        Determine when to switch from blocked to unblocked code.
 *
          IF( NX.LT.MINMN ) THEN
-            WS = ( M+N )*NB
+            WS = LWKOPT
             IF( LWORK.LT.WS ) THEN
 *
 *              Not enough work space for the optimal NB, consider using
diff --git a/lapack-netlib/SRC/zgecon.f b/lapack-netlib/SRC/zgecon.f
index 9cbfe35bc..ef567d7c2 100644
--- a/lapack-netlib/SRC/zgecon.f
+++ b/lapack-netlib/SRC/zgecon.f
@@ -105,8 +105,15 @@
 *> \verbatim
 *>          INFO is INTEGER
 *>          = 0:  successful exit
-*>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          =-5:  if ANORM is NAN or negative.
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value.
+*>                NaNs are illegal values for ANORM, and they propagate to
+*>                the output parameter RCOND.
+*>                Infinity is illegal for ANORM, and it propagates to the output
+*>                parameter RCOND as 0.
+*>          = 1:  if RCOND = NaN, or
+*>                   RCOND = Inf, or
+*>                   the computed norm of the inverse of A is 0.
+*>                In the latter, RCOND = 0 is returned.
 *> \endverbatim
 *
 *  Authors:
@@ -117,7 +124,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16GEcomputational
+*> \ingroup gecon
 *
 *  =====================================================================
       SUBROUTINE ZGECON( NORM, N, A, LDA, ANORM, RCOND, WORK, RWORK,
@@ -147,7 +154,7 @@
       LOGICAL            ONENRM
       CHARACTER          NORMIN
       INTEGER            IX, KASE, KASE1
-      DOUBLE PRECISION   AINVNM, SCALE, SL, SMLNUM, SU
+      DOUBLE PRECISION   AINVNM, SCALE, SL, SMLNUM, SU, HUGEVAL
       COMPLEX*16         ZDUM
 *     ..
 *     .. Local Arrays ..
@@ -172,6 +179,8 @@
       CABS1( ZDUM ) = ABS( DBLE( ZDUM ) ) + ABS( DIMAG( ZDUM ) )
 *     ..
 *     .. Executable Statements ..
+*
+      HUGEVAL = DLAMCH( 'Overflow' )
 *
 *     Test the input parameters.
 *
@@ -183,7 +192,7 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF( ANORM.LT.ZERO .OR. DISNAN( ANORM ) ) THEN
+      ELSE IF( ANORM.LT.ZERO ) THEN
          INFO = -5
       END IF
       IF( INFO.NE.0 ) THEN
@@ -199,6 +208,13 @@
          RETURN
       ELSE IF( ANORM.EQ.ZERO ) THEN
          RETURN
+      ELSE IF( DISNAN( ANORM ) ) THEN
+         RCOND = ANORM
+         INFO = -5
+         RETURN
+      ELSE IF( ANORM.GT.HUGEVAL ) THEN
+         INFO = -5
+         RETURN
       END IF
 *
       SMLNUM = DLAMCH( 'Safe minimum' )
@@ -256,8 +272,17 @@
 *
 *     Compute the estimate of the reciprocal condition number.
 *
-      IF( AINVNM.NE.ZERO )
-     $   RCOND = ( ONE / AINVNM ) / ANORM
+      IF( AINVNM.NE.ZERO ) THEN
+         RCOND = ( ONE / AINVNM ) / ANORM
+      ELSE
+         INFO = 1
+         RETURN
+      END IF
+*
+*     Check for NaNs and Infs
+*
+      IF( DISNAN( RCOND ) .OR. RCOND.GT.HUGEVAL )
+     $   INFO = 1
 *
    20 CONTINUE
       RETURN
diff --git a/lapack-netlib/SRC/zgedmd.c b/lapack-netlib/SRC/zgedmd.c
new file mode 100644
index 000000000..c1b39ba3e
--- /dev/null
+++ b/lapack-netlib/SRC/zgedmd.c
@@ -0,0 +1,1676 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER /* MSVC build: values travel as the _Fcomplex/_Dcomplex types */
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;} /* load *z by value */
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;} /* load *z by value */
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;} /* reinterpret the pointer; no copy */
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;} /* reinterpret the pointer; no copy */
+#else /* other compilers: values travel as C99 _Complex float/double */
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;} /* build r + i*I from the struct fields */
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;} /* build r + i*I from the struct fields */
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;} /* reinterpret the pointer; no copy */
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;} /* reinterpret the pointer; no copy */
+#endif /* _MSC_VER */
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER /* no complex operators here: a/b = a*conj(b)/|b|^2, stored through the pointer c (c may alias a or b) */
+#define c_div(c, a, b) {real f2c_ar=(a)->r, f2c_ai=(a)->i, f2c_br=(b)->r, f2c_bi=(b)->i, f2c_d=f2c_br*f2c_br+f2c_bi*f2c_bi; (c)->r=(f2c_ar*f2c_br+f2c_ai*f2c_bi)/f2c_d; (c)->i=(f2c_ai*f2c_br-f2c_ar*f2c_bi)/f2c_d;}
+#define z_div(c, a, b) {doublereal f2c_ar=(a)->r, f2c_ai=(a)->i, f2c_br=(b)->r, f2c_bi=(b)->i, f2c_d=f2c_br*f2c_br+f2c_bi*f2c_bi; (c)->r=(f2c_ar*f2c_br+f2c_ai*f2c_bi)/f2c_d; (c)->i=(f2c_ai*f2c_br-f2c_ar*f2c_bi)/f2c_d;}
+#else /* native _Complex division; c, a, b are pointers to complex / doublecomplex */
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+/* x**n by binary exponentiation; a negative n yields (1/x)**|n|. */
+static float spow_ui(float x, integer n) {
+	float pow=1.0; BLASULONG u = (BLASULONG)n; /* BLASULONG is at least as wide as integer */
+	if(n < 0) {
+		u = 0 - u; x = 1/x; /* |n| in unsigned arithmetic: defined even for the most negative n */
+	}
+	for( ; u != 0; u >>= 1) {
+		if(u & 01) pow *= x;
+		if(u > 1) x *= x; /* no squaring once the last exponent bit is consumed */
+	}
+	return pow;
+}
+/* x**n by binary exponentiation; a negative n yields (1/x)**|n|. */
+static double dpow_ui(double x, integer n) {
+	double pow=1.0; BLASULONG u = (BLASULONG)n; /* BLASULONG is at least as wide as integer */
+	if(n < 0) {
+		u = 0 - u; x = 1/x; /* |n| in unsigned arithmetic: defined even for the most negative n */
+	}
+	for( ; u != 0; u >>= 1) {
+		if(u & 01) pow *= x;
+		if(u > 1) x *= x; /* no squaring once the last exponent bit is consumed */
+	}
+	return pow;
+}
+#ifdef _MSC_VER
+/* x**n for single-precision complex x, with the complex arithmetic written out. */
+static _Fcomplex cpow_ui(complex x, integer n) {
+	float re=1.0f, im=0.0f, xr=x.r, xi=x.i, t; BLASULONG u = (BLASULONG)n;
+	if(n < 0) { /* 1/x = conj(x)/|x|^2; |n| is taken in unsigned arithmetic */
+		u = 0 - u; t = xr*xr + xi*xi; xr = xr/t; xi = -xi/t;
+	}
+	for( ; u != 0; u >>= 1) {
+		if(u & 01) t = re*xr - im*xi, im = re*xi + im*xr, re = t; /* (re,im) *= x */
+		if(u > 1) t = xr*xr - xi*xi, xi = 2*xr*xi, xr = t; /* x *= x */
+	}
+	_Fcomplex p={re, im};
+	return p;
+}
+#else
+/* x**n by binary exponentiation; a negative n yields (1/x)**|n|. */
+static _Complex float cpow_ui(_Complex float x, integer n) {
+	_Complex float pow=1.0; BLASULONG u = (BLASULONG)n; /* BLASULONG is at least as wide as integer */
+	if(n < 0) {
+		u = 0 - u; x = 1/x; /* |n| in unsigned arithmetic: defined even for the most negative n */
+	}
+	for( ; u != 0; u >>= 1) {
+		if(u & 01) pow *= x;
+		if(u > 1) x *= x; /* no squaring once the last exponent bit is consumed */
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+/* x**n for double-precision complex x, with the complex arithmetic written out. */
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
+	double re=1.0, im=0.0, xr=x._Val[0], xi=x._Val[1], t; BLASULONG u = (BLASULONG)n;
+	if(n < 0) { /* 1/x = conj(x)/|x|^2; |n| is taken in unsigned arithmetic */
+		u = 0 - u; t = xr*xr + xi*xi; xr = xr/t; xi = -xi/t;
+	}
+	for( ; u != 0; u >>= 1) {
+		if(u & 01) t = re*xr - im*xi, im = re*xi + im*xr, re = t; /* (re,im) *= x */
+		if(u > 1) t = xr*xr - xi*xi, xi = 2*xr*xi, xr = t; /* x *= x */
+	}
+	_Dcomplex p = {re, im};
+	return p;
+}
+#else
+/* x**n by binary exponentiation; a negative n yields (1/x)**|n|. */
+static _Complex double zpow_ui(_Complex double x, integer n) {
+	_Complex double pow=1.0; BLASULONG u = (BLASULONG)n; /* BLASULONG is at least as wide as integer */
+	if(n < 0) {
+		u = 0 - u; x = 1/x; /* |n| in unsigned arithmetic: defined even for the most negative n */
+	}
+	for( ; u != 0; u >>= 1) {
+		if(u & 01) pow *= x;
+		if(u > 1) x *= x; /* no squaring once the last exponent bit is consumed */
+	}
+	return pow;
+}
+#endif
+/*
+ * pow_ii: integer power x**n in integer arithmetic.
+ *
+ *   x  base
+ *   n  exponent
+ *
+ * Returns 1 when n == 0 or x == 1. For n < 0 the result is 0 unless
+ * x is 1 or -1; for x == -1 the sign follows the parity of |n|.
+ * For n > 0 the power is formed by repeated squaring.
+ */
+static integer pow_ii(integer x, integer n) {
+	integer pow = 1; unsigned long int u;
+	if (n == 0 || x == 1) return 1;
+	if (n < 0) {
+		/* x == 0 evaluates 1/x, an integer division by zero, exactly as
+		   before - presumably deliberate, to fault on 0 raised to a
+		   negative power (TODO confirm). */
+		if (x != -1) return x == 0 ? 1/x : 0;
+		/* x == -1: take |n| in unsigned arithmetic; negating the most
+		   negative integer as a signed value would overflow. */
+		u = 0UL - (unsigned long int)n;
+	}
+	else u = (unsigned long int)n;
+	for( ; ; ) {
+		if(u & 01) pow *= x;
+		if(u >>= 1) x *= x;
+		else break;
+	}
+	return pow;
+}
+/*
+ * dmaxloc_: position of the largest entry of w over the 1-based index
+ * range s..e.
+ *
+ *   w     array of doubles, addressed as w[index-1]
+ *   s, e  first and last 1-based index examined
+ *   n     not used
+ *
+ * Returns the position relative to s (1 means w[s-1] itself). Only a
+ * strictly greater value replaces the current maximum, so the first of
+ * several equal maxima is reported.
+ */
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{
+	integer best = s, idx;
+	double top = w[s-1];
+	for (idx = s + 1; idx <= e; idx++) {
+		if (w[idx-1] > top) {
+			best = idx;
+			top = w[idx-1];
+		}
+	}
+	return best - s + 1;
+}
+/*
+ * smaxloc_: position of the largest entry of w over the 1-based index
+ * range s..e (single-precision counterpart of dmaxloc_).
+ *
+ *   w     array of floats, addressed as w[index-1]
+ *   s, e  first and last 1-based index examined
+ *   n     not used
+ *
+ * Returns the position relative to s (1 means w[s-1] itself). Only a
+ * strictly greater value replaces the current maximum, so the first of
+ * several equal maxima is reported.
+ */
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{
+	integer best = s, idx;
+	float top = w[s-1];
+	for (idx = s + 1; idx <= e; idx++) {
+		if (w[idx-1] > top) {
+			best = idx;
+			top = w[idx-1];
+		}
+	}
+	return best - s + 1;
+}
+/*
+ * cdotc_: conjugated dot product of two single-precision complex vectors,
+ *         *z = sum over k of conj(x_k) * y_k.
+ *
+ *   z            receives the result (zero when *n_ <= 0)
+ *   n_           number of elements
+ *   x, incx_     first vector (the conjugated operand) and its increment
+ *   y, incy_     second vector and its increment
+ *
+ * A negative increment traverses that vector backwards, starting at
+ * element (1-n)*inc, as in the reference BLAS routine.
+ */
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+	/* Starting offsets: 0 for non-negative increments, otherwise the far
+	   end of the vector so that every index stays inside the array. */
+	integer ix = incx < 0 ? (1 - n) * incx : 0;
+	integer iy = incy < 0 ? (1 - n) * incy : 0;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	for (i=0;i<n;i++, ix+=incx, iy+=incy) { /* zdotc = zdotc + conjg(x(i))* y(i) */
+		/* conj(x)*y = (xr*yr + xi*yi) + i*(xr*yi - xi*yr) */
+		zdotc._Val[0] += x[ix].r * y[iy].r + x[ix].i * y[iy].i;
+		zdotc._Val[1] += x[ix].r * y[iy].i - x[ix].i * y[iy].r;
+	}
+#else
+	_Complex float zdotc = 0.0;
+	for (i=0;i<n;i++, ix+=incx, iy+=incy) { /* zdotc = zdotc + conjg(x(i))* y(i) */
+		zdotc += conjf(Cf(&x[ix])) * Cf(&y[iy]);
+	}
+#endif
+	pCf(z) = zdotc;
+}
+/*
+ * zdotc_: conjugated dot product of two double-precision complex vectors,
+ *         *z = sum over k of conj(x_k) * y_k.
+ *
+ *   z            receives the result (zero when *n_ <= 0)
+ *   n_           number of elements
+ *   x, incx_     first vector (the conjugated operand) and its increment
+ *   y, incy_     second vector and its increment
+ *
+ * A negative increment traverses that vector backwards, starting at
+ * element (1-n)*inc, as in the reference BLAS routine.
+ */
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+	/* Starting offsets: 0 for non-negative increments, otherwise the far
+	   end of the vector so that every index stays inside the array. */
+	integer ix = incx < 0 ? (1 - n) * incx : 0;
+	integer iy = incy < 0 ? (1 - n) * incy : 0;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	for (i=0;i<n;i++, ix+=incx, iy+=incy) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+		/* conj(x)*y = (xr*yr + xi*yi) + i*(xr*yi - xi*yr) */
+		zdotc._Val[0] += x[ix].r * y[iy].r + x[ix].i * y[iy].i;
+		zdotc._Val[1] += x[ix].r * y[iy].i - x[ix].i * y[iy].r;
+	}
+#else
+	_Complex double zdotc = 0.0;
+	for (i=0;i<n;i++, ix+=incx, iy+=incy) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+		zdotc += conj(Cd(&x[ix])) * Cd(&y[iy]);
+	}
+#endif
+	pCd(z) = zdotc;
+}
+/*
+ * cdotu_: unconjugated dot product of two single-precision complex
+ *         vectors, *z = sum over k of x_k * y_k.
+ *
+ *   z            receives the result (zero when *n_ <= 0)
+ *   n_           number of elements
+ *   x, incx_     first vector and its increment
+ *   y, incy_     second vector and its increment
+ *
+ * A negative increment traverses that vector backwards, starting at
+ * element (1-n)*inc, as in the reference BLAS routine.
+ */
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+	/* Starting offsets: 0 for non-negative increments, otherwise the far
+	   end of the vector so that every index stays inside the array. */
+	integer ix = incx < 0 ? (1 - n) * incx : 0;
+	integer iy = incy < 0 ? (1 - n) * incy : 0;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	for (i=0;i<n;i++, ix+=incx, iy+=incy) { /* zdotc = zdotc + x(i)* y(i) */
+		/* x*y = (xr*yr - xi*yi) + i*(xr*yi + xi*yr) */
+		zdotc._Val[0] += x[ix].r * y[iy].r - x[ix].i * y[iy].i;
+		zdotc._Val[1] += x[ix].r * y[iy].i + x[ix].i * y[iy].r;
+	}
+#else
+	_Complex float zdotc = 0.0;
+	for (i=0;i<n;i++, ix+=incx, iy+=incy) { /* zdotc = zdotc + x(i)* y(i) */
+		zdotc += Cf(&x[ix]) * Cf(&y[iy]);
+	}
+#endif
+	pCf(z) = zdotc;
+}
+/*
+ * zdotu_: unconjugated dot product of two double-precision complex
+ *         vectors, *z = sum over k of x_k * y_k.
+ *
+ *   z            receives the result (zero when *n_ <= 0)
+ *   n_           number of elements
+ *   x, incx_     first vector and its increment
+ *   y, incy_     second vector and its increment
+ *
+ * A negative increment traverses that vector backwards, starting at
+ * element (1-n)*inc, as in the reference BLAS routine.
+ */
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+	/* Starting offsets: 0 for non-negative increments, otherwise the far
+	   end of the vector so that every index stays inside the array. */
+	integer ix = incx < 0 ? (1 - n) * incx : 0;
+	integer iy = incy < 0 ? (1 - n) * incy : 0;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	for (i=0;i<n;i++, ix+=incx, iy+=incy) { /* zdotc = zdotc + x(i)* y(i) */
+		/* x*y = (xr*yr - xi*yi) + i*(xr*yi + xi*yr) */
+		zdotc._Val[0] += x[ix].r * y[iy].r - x[ix].i * y[iy].i;
+		zdotc._Val[1] += x[ix].r * y[iy].i + x[ix].i * y[iy].r;
+	}
+#else
+	_Complex double zdotc = 0.0;
+	for (i=0;i<n;i++, ix+=incx, iy+=incy) { /* zdotc = zdotc + x(i)* y(i) */
+		zdotc += Cd(&x[ix]) * Cd(&y[iy]);
+	}
+#endif
+	pCd(z) = zdotc;
+}
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+/* Integer constants held in static storage so that their addresses can be
+   handed to routines taking integer arguments by pointer. */
+/* -1: passed as &c_n1 for the workspace-length arguments of the SVD
+   workspace-query calls in zgedmd_. */
+static integer c_n1 = -1;
+static integer c__1 = 1;
+static integer c__0 = 0;
+
+/* Subroutine */ int zgedmd_(char *jobs, char *jobz, char *jobr, char *jobf, 
+	integer *whtsvd, integer *m, integer *n, doublecomplex *x, integer *
+	ldx, doublecomplex *y, integer *ldy, integer *nrnk, doublereal *tol, 
+	integer *k, doublecomplex *eigs, doublecomplex *z__, integer *ldz, 
+	doublereal *res, doublecomplex *b, integer *ldb, doublecomplex *w, 
+	integer *ldw, doublecomplex *s, integer *lds, doublecomplex *zwork, 
+	integer *lzwork, doublereal *rwork, integer *lrwork, integer *iwork, 
+	integer *liwork, integer *info)
+{
+    /* System generated locals */
+    integer x_dim1, x_offset, y_dim1, y_offset, z_dim1, z_offset, b_dim1, 
+	    b_offset, w_dim1, w_offset, s_dim1, s_offset, i__1, i__2, i__3, 
+	    i__4, i__5;
+    doublereal d__1, d__2;
+    doublecomplex z__1, z__2;
+
+    /* Local variables */
+    doublecomplex zone;
+    doublereal zero, ssum;
+    integer info1, info2;
+    doublereal xscl1, xscl2;
+    integer i__, j;
+    doublereal scale;
+    extern logical lsame_(char *, char *);
+    logical badxy;
+    doublereal small;
+    extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, 
+	    integer *, doublecomplex *, doublecomplex *, integer *, 
+	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
+	    integer *);
+    char jobzl[1];
+    extern /* Subroutine */ int zgeev_(char *, char *, integer *, 
+	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
+	    integer *, doublecomplex *, integer *, doublecomplex *, integer *,
+	     doublereal *, integer *);
+    logical wntex;
+    doublecomplex zzero;
+    extern /* Subroutine */ int zaxpy_(integer *, doublecomplex *, 
+	    doublecomplex *, integer *, doublecomplex *, integer *);
+    extern doublereal dznrm2_(integer *, doublecomplex *, integer *), dlamch_(
+	    char *);
+    extern logical disnan_(doublereal *);
+    extern /* Subroutine */ int xerbla_(char *, integer *);
+    char t_or_n__[1];
+    extern /* Subroutine */ int zdscal_(integer *, doublereal *, 
+	    doublecomplex *, integer *), zgesdd_(char *, integer *, integer *,
+	     doublecomplex *, integer *, doublereal *, doublecomplex *, 
+	    integer *, doublecomplex *, integer *, doublecomplex *, integer *,
+	     doublereal *, integer *, integer *), zlascl_(char *, 
+	    integer *, integer *, doublereal *, doublereal *, integer *, 
+	    integer *, doublecomplex *, integer *, integer *);
+    extern integer izamax_(integer *, doublecomplex *, integer *);
+    logical sccolx, sccoly;
+    integer lwrsdd, mwrsdd;
+    extern /* Subroutine */ int zgesvd_(char *, char *, integer *, integer *, 
+	    doublecomplex *, integer *, doublereal *, doublecomplex *, 
+	    integer *, doublecomplex *, integer *, doublecomplex *, integer *,
+	     doublereal *, integer *), zlacpy_(char *, 
+	    integer *, integer *, doublecomplex *, integer *, doublecomplex *,
+	     integer *);
+    integer iminwr;
+    logical wntref, wntvec;
+    doublereal rootsc;
+    integer lwrkev, mlwork, mwrkev, numrnk, olwork, lwrsvd, mwrsvd, mlrwrk;
+    logical lquery, wntres;
+    char jsvopt[1];
+    integer lwrsvj, mwrsvj;
+    doublereal rdummy[2];
+    extern /* Subroutine */ int zgejsv_(char *, char *, char *, char *, char *
+	    , char *, integer *, integer *, doublecomplex *, integer *, 
+	    doublereal *, doublecomplex *, integer *, doublecomplex *, 
+	    integer *, doublecomplex *, integer *, doublereal *, integer *, 
+	    integer *, integer *), zlassq_(integer *, doublecomplex *, integer *, 
+	    doublereal *, doublereal *), mecago_();
+    integer lwrsvq, mwrsvq;
+    doublereal ofl, one;
+    extern /* Subroutine */ int zgesvdq_(char *, char *, char *, char *, char 
+	    *, integer *, integer *, doublecomplex *, integer *, doublereal *,
+	     doublecomplex *, integer *, doublecomplex *, integer *, integer *
+	    , integer *, integer *, doublecomplex *, integer *, doublereal *, 
+	    integer *, integer *);
+
+/* March 2023 */
+/* ..... */
+/*      USE                   iso_fortran_env */
+/*      INTEGER, PARAMETER :: WP = real64 */
+/* ..... */
+/*     Scalar arguments */
+/*     Array arguments */
+/* ............................................................ */
+/*     Purpose */
+/*     ======= */
+/*     ZGEDMD computes the Dynamic Mode Decomposition (DMD) for */
+/*     a pair of data snapshot matrices. For the input matrices */
+/*     X and Y such that Y = A*X with an inaccessible matrix */
+/*     A, ZGEDMD computes a certain number of Ritz pairs of A using */
+/*     the standard Rayleigh-Ritz extraction from a subspace of */
+/*     range(X) that is determined using the leading left singular */
+/*     vectors of X. Optionally, ZGEDMD returns the residuals */
+/*     of the computed Ritz pairs, the information needed for */
+/*     a refinement of the Ritz vectors, or the eigenvectors of */
+/*     the Exact DMD. */
+/*     For further details see the references listed */
+/*     below. For more details of the implementation see [3]. */
+
+/*     References */
+/*     ========== */
+/*     [1] P. Schmid: Dynamic mode decomposition of numerical */
+/*         and experimental data, */
+/*         Journal of Fluid Mechanics 656, 5-28, 2010. */
+/*     [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal */
+/*         decompositions: analysis and enhancements, */
+/*         SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. */
+/*     [3] Z. Drmac: A LAPACK implementation of the Dynamic */
+/*         Mode Decomposition I. Technical report. AIMDyn Inc. */
+/*         and LAPACK Working Note 298. */
+/*     [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. */
+/*         Brunton, N. Kutz: On Dynamic Mode Decomposition: */
+/*         Theory and Applications, Journal of Computational */
+/*         Dynamics 1(2), 391 -421, 2014. */
+
+/* ...................................................................... */
+/*     Developed and supported by: */
+/*     =========================== */
+/*     Developed and coded by Zlatko Drmac, Faculty of Science, */
+/*     University of Zagreb;  drmac@math.hr */
+/*     In cooperation with */
+/*     AIMdyn Inc., Santa Barbara, CA. */
+/*     and supported by */
+/*     - DARPA SBIR project "Koopman Operator-Based Forecasting */
+/*     for Nonstationary Processes from Near-Term, Limited */
+/*     Observational Data" Contract No: W31P4Q-21-C-0007 */
+/*     - DARPA PAI project "Physics-Informed Machine Learning */
+/*     Methodologies" Contract No: HR0011-18-9-0033 */
+/*     - DARPA MoDyL project "A Data-Driven, Operator-Theoretic */
+/*     Framework for Space-Time Analysis of Process Dynamics" */
+/*     Contract No: HR0011-16-C-0116 */
+/*     Any opinions, findings and conclusions or recommendations */
+/*     expressed in this material are those of the author and */
+/*     do not necessarily reflect the views of the DARPA SBIR */
+/*     Program Office */
+/* ============================================================ */
+/*     Distribution Statement A: */
+/*     Approved for Public Release, Distribution Unlimited. */
+/*     Cleared by DARPA on September 29, 2022 */
+/* ============================================================ */
+/* ............................................................ */
+/*     Arguments */
+/*     ========= */
+/*     JOBS (input) CHARACTER*1 */
+/*     Determines whether the initial data snapshots are scaled */
+/*     by a diagonal matrix. */
+/*     'S' :: The data snapshots matrices X and Y are multiplied */
+/*            with a diagonal matrix D so that X*D has unit */
+/*            nonzero columns (in the Euclidean 2-norm) */
+/*     'C' :: The snapshots are scaled as with the 'S' option. */
+/*            If it is found that an i-th column of X is zero */
+/*            vector and the corresponding i-th column of Y is */
+/*            non-zero, then the i-th column of Y is set to */
+/*            zero and a warning flag is raised. */
+/*     'Y' :: The data snapshots matrices X and Y are multiplied */
+/*            by a diagonal matrix D so that Y*D has unit */
+/*            nonzero columns (in the Euclidean 2-norm) */
+/*     'N' :: No data scaling. */
+/* ..... */
+/*     JOBZ (input) CHARACTER*1 */
+/*     Determines whether the eigenvectors (Koopman modes) will */
+/*     be computed. */
+/*     'V' :: The eigenvectors (Koopman modes) will be computed */
+/*            and returned in the matrix Z. */
+/*            See the description of Z. */
+/*     'F' :: The eigenvectors (Koopman modes) will be returned */
+/*            in factored form as the product X(:,1:K)*W, where X */
+/*            contains a POD basis (leading left singular vectors */
+/*            of the data matrix X) and W contains the eigenvectors */
+/*            of the corresponding Rayleigh quotient. */
+/*            See the descriptions of K, X, W, Z. */
+/*     'N' :: The eigenvectors are not computed. */
+/* ..... */
+/*     JOBR (input) CHARACTER*1 */
+/*     Determines whether to compute the residuals. */
+/*     'R' :: The residuals for the computed eigenpairs will be */
+/*            computed and stored in the array RES. */
+/*            See the description of RES. */
+/*            For this option to be legal, JOBZ must be 'V'. */
+/*     'N' :: The residuals are not computed. */
+/* ..... */
+/*     JOBF (input) CHARACTER*1 */
+/*     Specifies whether to store information needed for post- */
+/*     processing (e.g. computing refined Ritz vectors) */
+/*     'R' :: The matrix needed for the refinement of the Ritz */
+/*            vectors is computed and stored in the array B. */
+/*            See the description of B. */
+/*     'E' :: The unscaled eigenvectors of the Exact DMD are */
+/*            computed and returned in the array B. See the */
+/*            description of B. */
+/*     'N' :: No eigenvector refinement data is computed. */
+/* ..... */
+/*     WHTSVD (input) INTEGER, WHTSVD in { 1, 2, 3, 4 } */
+/*     Allows for a selection of the SVD algorithm from the */
+/*     LAPACK library. */
+/*     1 :: ZGESVD (the QR SVD algorithm) */
+/*     2 :: ZGESDD (the Divide and Conquer algorithm; if enough */
+/*          workspace available, this is the fastest option) */
+/*     3 :: ZGESVDQ (the preconditioned QR SVD  ; this and 4 */
+/*          are the most accurate options) */
+/*     4 :: ZGEJSV (the preconditioned Jacobi SVD; this and 3 */
+/*          are the most accurate options) */
+/*     For the four methods above, a significant difference in */
+/*     the accuracy of small singular values is possible if */
+/*     the snapshots vary in norm so that X is severely */
+/*     ill-conditioned. If small (smaller than EPS*||X||) */
+/*     singular values are of interest and JOBS=='N',  then */
+/*     the options (3, 4) give the most accurate results, where */
+/*     the option 4 is slightly better and with stronger */
+/*     theoretical background. */
+/*     If JOBS=='S', i.e. the columns of X will be normalized, */
+/*     then all methods give nearly equally accurate results. */
+/* ..... */
+/*     M (input) INTEGER, M>= 0 */
+/*     The state space dimension (the row dimension of X, Y). */
+/* ..... */
+/*     N (input) INTEGER, 0 <= N <= M */
+/*     The number of data snapshot pairs */
+/*     (the number of columns of X and Y). */
+/* ..... */
+/*     X (input/output) COMPLEX(KIND=WP) M-by-N array */
+/*   > On entry, X contains the data snapshot matrix X. It is */
+/*     assumed that the column norms of X are in the range of */
+/*     the normalized floating point numbers. */
+/*   < On exit, the leading K columns of X contain a POD basis, */
+/*     i.e. the leading K left singular vectors of the input */
+/*     data matrix X, U(:,1:K). All N columns of X contain all */
+/*     left singular vectors of the input matrix X. */
+/*     See the descriptions of K, Z and W. */
+/* ..... */
+/*     LDX (input) INTEGER, LDX >= M */
+/*     The leading dimension of the array X. */
+/* ..... */
+/*     Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array */
+/*   > On entry, Y contains the data snapshot matrix Y */
+/*   < On exit, */
+/*     If JOBR == 'R', the leading K columns of Y  contain */
+/*     the residual vectors for the computed Ritz pairs. */
+/*     See the description of RES. */
+/*     If JOBR == 'N', Y contains the original input data, */
+/*                     scaled according to the value of JOBS. */
+/* ..... */
+/*     LDY (input) INTEGER , LDY >= M */
+/*     The leading dimension of the array Y. */
+/* ..... */
+/*     NRNK (input) INTEGER */
+/*     Determines the mode how to compute the numerical rank, */
+/*     i.e. how to truncate small singular values of the input */
+/*     matrix X. On input, if */
+/*     NRNK = -1 :: i-th singular value sigma(i) is truncated */
+/*                  if sigma(i) <= TOL*sigma(1) */
+/*                  This option is recommended. */
+/*     NRNK = -2 :: i-th singular value sigma(i) is truncated */
+/*                  if sigma(i) <= TOL*sigma(i-1) */
+/*                  This option is included for R&D purposes. */
+/*                  It requires highly accurate SVD, which */
+/*                  may not be feasible. */
+/*     The numerical rank can be enforced by using positive */
+/*     value of NRNK as follows: */
+/*     0 < NRNK <= N :: at most NRNK largest singular values */
+/*     will be used. If the number of the computed nonzero */
+/*     singular values is less than NRNK, then only those */
+/*     nonzero values will be used and the actually used */
+/*     dimension is less than NRNK. The actual number of */
+/*     the nonzero singular values is returned in the variable */
+/*     K. See the descriptions of TOL and  K. */
+/* ..... */
+/*     TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */
+/*     The tolerance for truncating small singular values. */
+/*     See the description of NRNK. */
+/* ..... */
+/*     K (output) INTEGER,  0 <= K <= N */
+/*     The dimension of the POD basis for the data snapshot */
+/*     matrix X and the number of the computed Ritz pairs. */
+/*     The value of K is determined according to the rule set */
+/*     by the parameters NRNK and TOL. */
+/*     See the descriptions of NRNK and TOL. */
+/* ..... */
+/*     EIGS (output) COMPLEX(KIND=WP) N-by-1 array */
+/*     The leading K (K<=N) entries of EIGS contain */
+/*     the computed eigenvalues (Ritz values). */
+/*     See the descriptions of K, and Z. */
+/* ..... */
+/*     Z (workspace/output) COMPLEX(KIND=WP)  M-by-N array */
+/*     If JOBZ =='V' then Z contains the  Ritz vectors.  Z(:,i) */
+/*     is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. */
+/*     If JOBZ == 'F', then the Z(:,i)'s are given implicitly as */
+/*     the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i) */
+/*     is an eigenvector corresponding to EIGS(i). The columns */
+/*     of W(1:k,1:K) are the computed eigenvectors of the */
+/*     K-by-K Rayleigh quotient. */
+/*     See the descriptions of EIGS, X and W. */
+/* ..... */
+/*     LDZ (input) INTEGER , LDZ >= M */
+/*     The leading dimension of the array Z. */
+/* ..... */
+/*     RES (output) REAL(KIND=WP) N-by-1 array */
+/*     RES(1:K) contains the residuals for the K computed */
+/*     Ritz pairs, */
+/*     RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i) ||_2. */
+/*     See the description of EIGS and Z. */
+/* ..... */
+/*     B (output) COMPLEX(KIND=WP)  M-by-N array. */
+/*     IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can */
+/*     be used for computing the refined vectors; see further */
+/*     details in the provided references. */
+/*     If JOBF == 'E', B(1:M,1:K) contains */
+/*     A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */
+/*     Exact DMD, up to scaling by the inverse eigenvalues. */
+/*     If JOBF =='N', then B is not referenced. */
+/*     See the descriptions of X, W, K. */
+/* ..... */
+/*     LDB (input) INTEGER, LDB >= M */
+/*     The leading dimension of the array B. */
+/* ..... */
+/*     W (workspace/output) COMPLEX(KIND=WP) N-by-N array */
+/*     On exit, W(1:K,1:K) contains the K computed */
+/*     eigenvectors of the matrix Rayleigh quotient. */
+/*     The Ritz vectors (returned in Z) are the */
+/*     product of X (containing a POD basis for the input */
+/*     matrix X) and W. See the descriptions of K, S, X and Z. */
+/*     W is also used as a workspace to temporarily store the */
+/*     right singular vectors of X. */
+/* ..... */
+/*     LDW (input) INTEGER, LDW >= N */
+/*     The leading dimension of the array W. */
+/* ..... */
+/*     S (workspace/output) COMPLEX(KIND=WP) N-by-N array */
+/*     The array S(1:K,1:K) is used for the matrix Rayleigh */
+/*     quotient. This content is overwritten during */
+/*     the eigenvalue decomposition by ZGEEV. */
+/*     See the description of K. */
+/* ..... */
+/*     LDS (input) INTEGER, LDS >= N */
+/*     The leading dimension of the array S. */
+/* ..... */
+/*     ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array */
+/*     ZWORK is used as complex workspace in the complex SVD, as */
+/*     specified by WHTSVD (1,2, 3 or 4) and for ZGEEV for computing */
+/*     the eigenvalues of a Rayleigh quotient. */
+/*     If the call to ZGEDMD is only workspace query, then */
+/*     ZWORK(1) contains the minimal complex workspace length and */
+/*     ZWORK(2) is the optimal complex workspace length. */
+/*     Hence, the length of ZWORK is at least 2. */
+/*     See the description of LZWORK. */
+/* ..... */
+/*     LZWORK (input) INTEGER */
+/*     The minimal length of the workspace vector ZWORK. */
+/*     LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_ZGEEV), */
+/*     where LZWORK_ZGEEV = MAX( 1, 2*N )  and the minimal */
+/*     LZWORK_SVD is calculated as follows */
+/*     If WHTSVD == 1 :: ZGESVD :: */
+/*        LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N)) */
+/*     If WHTSVD == 2 :: ZGESDD :: */
+/*        LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N) */
+/*     If WHTSVD == 3 :: ZGESVDQ :: */
+/*        LZWORK_SVD = obtainable by a query */
+/*     If WHTSVD == 4 :: ZGEJSV :: */
+/*        LZWORK_SVD = obtainable by a query */
+/*     If on entry LZWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     and the optimal workspace lengths and returns them in */
+/*     ZWORK(1) and ZWORK(2), respectively. */
+/* ..... */
+/*     RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array */
+/*     On exit, RWORK(1:N) contains the singular values of */
+/*     X (for JOBS=='N') or column scaled X (JOBS=='S', 'C'). */
+/*     If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain */
+/*     scaling factor RWORK(N+2)/RWORK(N+1) used to scale X */
+/*     and Y to avoid overflow in the SVD of X. */
+/*     This may be of interest if the scaling option is off */
+/*     and as many as possible smallest eigenvalues are */
+/*     desired to the highest feasible accuracy. */
+/*     If the call to ZGEDMD is only workspace query, then */
+/*     RWORK(1) contains the minimal workspace length. */
+/*     See the description of LRWORK. */
+/* ..... */
+/*     LRWORK (input) INTEGER */
+/*     The minimal length of the workspace vector RWORK. */
+/*     LRWORK is calculated as follows: */
+/*     LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_ZGEEV), where */
+/*     LRWORK_ZGEEV = MAX(1,2*N) and LRWORK_SVD is the real workspace */
+/*     for the SVD subroutine determined by the input parameter */
+/*     WHTSVD. */
+/*     If WHTSVD == 1 :: ZGESVD :: */
+/*        LRWORK_SVD = 5*MIN(M,N) */
+/*     If WHTSVD == 2 :: ZGESDD :: */
+/*        LRWORK_SVD =  MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), */
+/*        2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) */
+/*     If WHTSVD == 3 :: ZGESVDQ :: */
+/*        LRWORK_SVD = obtainable by a query */
+/*     If WHTSVD == 4 :: ZGEJSV :: */
+/*        LRWORK_SVD = obtainable by a query */
+/*     If on entry LRWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     real workspace length and returns it in RWORK(1). */
+/* ..... */
+/*     IWORK (workspace/output) INTEGER LIWORK-by-1 array */
+/*     Workspace that is required only if WHTSVD equals */
+/*     2 , 3 or 4. (See the description of WHTSVD). */
+/*     If on entry LZWORK = -1 or LIWORK = -1, then the */
+/*     minimal length of IWORK is computed and returned in */
+/*     IWORK(1). See the description of LIWORK. */
+/* ..... */
+/*     LIWORK (input) INTEGER */
+/*     The minimal length of the workspace vector IWORK. */
+/*     If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */
+/*     If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N)) */
+/*     If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1) */
+/*     If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N) */
+/*     If on entry LIWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     and the optimal workspace lengths for  ZWORK, RWORK and */
+/*     IWORK. See the descriptions of ZWORK, RWORK and IWORK. */
+/* ..... */
+/*     INFO (output) INTEGER */
+/*     -i < 0 :: On entry, the i-th argument had an */
+/*               illegal value */
+/*        = 0 :: Successful return. */
+/*        = 1 :: Void input. Quick exit (M=0 or N=0). */
+/*        = 2 :: The SVD computation of X did not converge. */
+/*               Suggestion: Check the input data and/or */
+/*               repeat with different WHTSVD. */
+/*        = 3 :: The computation of the eigenvalues did not */
+/*               converge. */
+/*        = 4 :: If data scaling was requested on input and */
+/*               the procedure found inconsistency in the data */
+/*               such that for some column index i, */
+/*               X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */
+/*               to zero if JOBS=='C'. The computation proceeds */
+/*               with original or modified data and warning */
+/*               flag is set with INFO=4. */
+/* ............................................................. */
+/* ............................................................. */
+/*     Parameters */
+/*     ~~~~~~~~~~ */
+/*     Local scalars */
+/*     ~~~~~~~~~~~~~ */
+
+/*     Local arrays */
+/*     ~~~~~~~~~~~~ */
+/*     External functions (BLAS and LAPACK) */
+/*     ~~~~~~~~~~~~~~~~~ */
+/*     External subroutines (BLAS and LAPACK) */
+/*     ~~~~~~~~~~~~~~~~~~~~ */
+/*     Intrinsic functions */
+/*     ~~~~~~~~~~~~~~~~~~~ */
+/* ............................................................ */
+    /* Parameter adjustments */
+    x_dim1 = *ldx;
+    x_offset = 1 + x_dim1 * 1;
+    x -= x_offset;
+    y_dim1 = *ldy;
+    y_offset = 1 + y_dim1 * 1;
+    y -= y_offset;
+    --eigs;
+    z_dim1 = *ldz;
+    z_offset = 1 + z_dim1 * 1;
+    z__ -= z_offset;
+    --res;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    w_dim1 = *ldw;
+    w_offset = 1 + w_dim1 * 1;
+    w -= w_offset;
+    s_dim1 = *lds;
+    s_offset = 1 + s_dim1 * 1;
+    s -= s_offset;
+    --zwork;
+    --rwork;
+    --iwork;
+
+    /* Function Body */
+    zero = 0.f;
+    one = 1.f;
+    zzero.r = 0.f, zzero.i = 0.f;
+    zone.r = 1.f, zone.i = 0.f;
+
+/*    Test the input arguments */
+
+    wntres = lsame_(jobr, "R");
+    sccolx = lsame_(jobs, "S") || lsame_(jobs, "C");
+    sccoly = lsame_(jobs, "Y");
+    wntvec = lsame_(jobz, "V");
+    wntref = lsame_(jobf, "R");
+    wntex = lsame_(jobf, "E");
+    *info = 0;
+    lquery = *lzwork == -1 || *liwork == -1 || *lrwork == -1;
+
+    if (! (sccolx || sccoly || lsame_(jobs, "N"))) {
+	*info = -1;
+    } else if (! (wntvec || lsame_(jobz, "N") || lsame_(
+	    jobz, "F"))) {
+	*info = -2;
+    } else if (! (wntres || lsame_(jobr, "N")) || 
+	    wntres && ! wntvec) {
+	*info = -3;
+    } else if (! (wntref || wntex || lsame_(jobf, "N")))
+	     {
+	*info = -4;
+    } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == 
+	    4)) {
+	*info = -5;
+    } else if (*m < 0) {
+	*info = -6;
+    } else if (*n < 0 || *n > *m) {
+	*info = -7;
+    } else if (*ldx < *m) {
+	*info = -9;
+    } else if (*ldy < *m) {
+	*info = -11;
+    } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) {
+	*info = -12;
+    } else if (*tol < zero || *tol >= one) {
+	*info = -13;
+    } else if (*ldz < *m) {
+	*info = -17;
+    } else if ((wntref || wntex) && *ldb < *m) {
+	*info = -20;
+    } else if (*ldw < *n) {
+	*info = -22;
+    } else if (*lds < *n) {
+	*info = -24;
+    }
+
+    if (*info == 0) {
+/* Compute the minimal and the optimal workspace */
+/* requirements. Simulate running the code and */
+/* determine minimal and optimal sizes of the */
+/* workspace at any moment of the run. */
+	if (*n == 0) {
+/* Quick return. All output except K is void. */
+/* INFO=1 signals the void input. */
+/* In case of a workspace query, the default */
+/* minimal workspace lengths are returned. */
+	    if (lquery) {
+		iwork[1] = 1;
+		rwork[1] = 1.;
+		zwork[1].r = 2., zwork[1].i = 0.;
+		zwork[2].r = 2., zwork[2].i = 0.;
+	    } else {
+		*k = 0;
+	    }
+	    *info = 1;
+	    return 0;
+	}
+	iminwr = 1;
+	mlrwrk = f2cmax(1,*n);
+	mlwork = 2;
+	olwork = 2;
+/*         SELECT CASE ( WHTSVD ) */
+	if (*whtsvd == 1) {
+/* The following is specified as the minimal */
+/* length of WORK in the definition of ZGESVD: */
+/* MWRSVD = MAX(1,2*MIN(M,N)+MAX(M,N)) */
+/* Computing MAX */
+	    i__1 = 1, i__2 = (f2cmin(*m,*n) << 1) + f2cmax(*m,*n);
+	    mwrsvd = f2cmax(i__1,i__2);
+	    mlwork = f2cmax(mlwork,mwrsvd);
+/* Computing MAX */
+	    i__1 = mlrwrk, i__2 = *n + f2cmin(*m,*n) * 5;
+	    mlrwrk = f2cmax(i__1,i__2);
+	    if (lquery) {
+		zgesvd_("O", "S", m, n, &x[x_offset], ldx, &rwork[1], &b[
+			b_offset], ldb, &w[w_offset], ldw, &zwork[1], &c_n1, 
+			rdummy, &info1);
+		lwrsvd = (integer) zwork[1].r;
+		olwork = f2cmax(olwork,lwrsvd);
+	    }
+	} else if (*whtsvd == 2) {
+/* The following is specified as the minimal */
+/* length of WORK in the definition of ZGESDD: */
+/* MWRSDD = 2*f2cmin(M,N)*f2cmin(M,N)+2*f2cmin(M,N)+f2cmax(M,N). */
+/* RWORK length: 5*MIN(M,N)*MIN(M,N)+7*MIN(M,N) */
+/* In LAPACK 3.10.1 RWORK is defined differently. */
+/* Below we take f2cmax over the two versions. */
+/* IMINWR = 8*MIN(M,N) */
+	    mwrsdd = (f2cmin(*m,*n) << 1) * f2cmin(*m,*n) + (f2cmin(*m,*n) << 1) + f2cmax(
+		    *m,*n);
+	    mlwork = f2cmax(mlwork,mwrsdd);
+	    iminwr = f2cmin(*m,*n) << 3;
+/* Computing MAX */
+/* Computing MAX */
+	    i__3 = f2cmin(*m,*n) * 5 * f2cmin(*m,*n) + f2cmin(*m,*n) * 7, i__4 = f2cmin(*
+		    m,*n) * 5 * f2cmin(*m,*n) + f2cmin(*m,*n) * 5, i__3 = f2cmax(i__3,
+		    i__4), i__4 = (f2cmax(*m,*n) << 1) * f2cmin(*m,*n) + (f2cmin(*m,*n)
+		     << 1) * f2cmin(*m,*n) + f2cmin(*m,*n);
+	    i__1 = mlrwrk, i__2 = *n + f2cmax(i__3,i__4);
+	    mlrwrk = f2cmax(i__1,i__2);
+	    if (lquery) {
+		zgesdd_("O", m, n, &x[x_offset], ldx, &rwork[1], &b[b_offset],
+			 ldb, &w[w_offset], ldw, &zwork[1], &c_n1, rdummy, &
+			iwork[1], &info1);
+/* Computing MAX */
+		i__1 = mwrsdd, i__2 = (integer) zwork[1].r;
+		lwrsdd = f2cmax(i__1,i__2);
+/* Possible bug in ZGESDD optimal workspace size. */
+		olwork = f2cmax(olwork,lwrsdd);
+	    }
+	} else if (*whtsvd == 3) {
+	    zgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &rwork[
+		    1], &z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, &
+		    iwork[1], &c_n1, &zwork[1], &c_n1, rdummy, &c_n1, &info1);
+	    iminwr = iwork[1];
+	    mwrsvq = (integer) zwork[2].r;
+	    mlwork = f2cmax(mlwork,mwrsvq);
+/* Computing MAX */
+	    i__1 = mlrwrk, i__2 = *n + (integer) rdummy[0];
+	    mlrwrk = f2cmax(i__1,i__2);
+	    if (lquery) {
+		lwrsvq = (integer) zwork[1].r;
+		olwork = f2cmax(olwork,lwrsvq);
+	    }
+	} else if (*whtsvd == 4) {
+	    *(unsigned char *)jsvopt = 'J';
+	    zgejsv_("F", "U", jsvopt, "R", "N", "P", m, n, &x[x_offset], ldx, 
+		    &rwork[1], &z__[z_offset], ldz, &w[w_offset], ldw, &zwork[
+		    1], &c_n1, rdummy, &c_n1, &iwork[1], &info1);
+	    iminwr = iwork[1];
+	    mwrsvj = (integer) zwork[2].r;
+	    mlwork = f2cmax(mlwork,mwrsvj);
+/* Computing MAX */
+/* Computing MAX */
+	    i__3 = 7, i__4 = (integer) rdummy[0];
+	    i__1 = mlrwrk, i__2 = *n + f2cmax(i__3,i__4);
+	    mlrwrk = f2cmax(i__1,i__2);
+	    if (lquery) {
+		lwrsvj = (integer) zwork[1].r;
+		olwork = f2cmax(olwork,lwrsvj);
+	    }
+/*         END SELECT */
+	}
+	if (wntvec || wntex || lsame_(jobz, "F")) {
+	    *(unsigned char *)jobzl = 'V';
+	} else {
+	    *(unsigned char *)jobzl = 'N';
+	}
+/* Workspace calculation to the ZGEEV call */
+/* Computing MAX */
+	i__1 = 1, i__2 = *n << 1;
+	mwrkev = f2cmax(i__1,i__2);
+	mlwork = f2cmax(mlwork,mwrkev);
+/* Computing MAX */
+	i__1 = mlrwrk, i__2 = *n + (*n << 1);
+	mlrwrk = f2cmax(i__1,i__2);
+	if (lquery) {
+	    zgeev_("N", jobzl, n, &s[s_offset], lds, &eigs[1], &w[w_offset], 
+		    ldw, &w[w_offset], ldw, &zwork[1], &c_n1, &rwork[1], &
+		    info1);
+	    lwrkev = (integer) zwork[1].r;
+	    olwork = f2cmax(olwork,lwrkev);
+	}
+
+	if (*liwork < iminwr && ! lquery) {
+	    *info = -30;
+	}
+	if (*lrwork < mlrwrk && ! lquery) {
+	    *info = -28;
+	}
+	if (*lzwork < mlwork && ! lquery) {
+	    *info = -26;
+	}
+    }
+
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("ZGEDMD", &i__1);
+	return 0;
+    } else if (lquery) {
+/*     Return minimal and optimal workspace sizes */
+	iwork[1] = iminwr;
+	rwork[1] = (doublereal) mlrwrk;
+	zwork[1].r = (doublereal) mlwork, zwork[1].i = 0.;
+	zwork[2].r = (doublereal) olwork, zwork[2].i = 0.;
+	return 0;
+    }
+/* ............................................................ */
+
+    ofl = dlamch_("O");
+    small = dlamch_("S");
+    badxy = FALSE_;
+
+/*     <1> Optional scaling of the snapshots (columns of X, Y) */
+/*     ========================================================== */
+    if (sccolx) {
+/* The columns of X will be normalized. */
+/* To prevent overflows, the column norms of X are */
+/* carefully computed using ZLASSQ. */
+	*k = 0;
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+/* RWORK(i) = DZNRM2( M, X(1,i), 1 ) */
+	    scale = zero;
+	    zlassq_(m, &x[i__ * x_dim1 + 1], &c__1, &scale, &ssum);
+	    if (disnan_(&scale) || disnan_(&ssum)) {
+		*k = 0;
+		*info = -8;
+		i__2 = -(*info);
+		xerbla_("ZGEDMD", &i__2);
+	    }
+	    if (scale != zero && ssum != zero) {
+		rootsc = sqrt(ssum);
+		if (scale >= ofl / rootsc) {
+/*                 Norm of X(:,i) overflows. First, X(:,i) */
+/*                 is scaled by */
+/*                 ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2. */
+/*                 Next, the norm of X(:,i) is stored without */
+/*                 overflow as RWORK(i) = - SCALE * (ROOTSC/M), */
+/*                 the minus sign indicating the 1/M factor. */
+/*                 Scaling is performed without overflow, and */
+/*                 underflow may occur in the smallest entries */
+/*                 of X(:,i). The relative backward and forward */
+/*                 errors are small in the ell_2 norm. */
+		    d__1 = one / rootsc;
+		    zlascl_("G", &c__0, &c__0, &scale, &d__1, m, &c__1, &x[
+			    i__ * x_dim1 + 1], ldx, &info2);
+		    rwork[i__] = -scale * (rootsc / (doublereal) (*m));
+		} else {
+/*                 X(:,i) will be scaled to unit 2-norm */
+		    rwork[i__] = scale * rootsc;
+		    zlascl_("G", &c__0, &c__0, &rwork[i__], &one, m, &c__1, &
+			    x[i__ * x_dim1 + 1], ldx, &info2);
+/*                 X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i)   ! INTRINSIC */
+/* LAPACK CALL */
+		}
+	    } else {
+		rwork[i__] = zero;
+		++(*k);
+	    }
+	}
+	if (*k == *n) {
+/* All columns of X are zero. Return error code -8. */
+/* (the 8th input variable had an illegal value) */
+	    *k = 0;
+	    *info = -8;
+	    i__1 = -(*info);
+	    xerbla_("ZGEDMD", &i__1);
+	    return 0;
+	}
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+/*           Now, apply the same scaling to the columns of Y. */
+	    if (rwork[i__] > zero) {
+		d__1 = one / rwork[i__];
+		zdscal_(m, &d__1, &y[i__ * y_dim1 + 1], &c__1);
+/*               Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i)       ! INTRINSIC */
+/* BLAS CALL */
+	    } else if (rwork[i__] < zero) {
+		d__1 = -rwork[i__];
+		d__2 = one / (doublereal) (*m);
+		zlascl_("G", &c__0, &c__0, &d__1, &d__2, m, &c__1, &y[i__ * 
+			y_dim1 + 1], ldy, &info2);
+/* LAPACK C */
+	    } else if (z_abs(&y[izamax_(m, &y[i__ * y_dim1 + 1], &c__1) + i__ 
+		    * y_dim1]) != zero) {
+/*               X(:,i) is zero vector. For consistency, */
+/*               Y(:,i) should also be zero. If Y(:,i) is not */
+/*               zero, then the data might be inconsistent or */
+/*               corrupted. If JOBS == 'C', Y(:,i) is set to */
+/*               zero and a warning flag is raised. */
+/*               The computation continues but the */
+/*               situation will be reported in the output. */
+		badxy = TRUE_;
+		if (lsame_(jobs, "C")) {
+		    zdscal_(m, &zero, &y[i__ * y_dim1 + 1], &c__1);
+		}
+/* BLAS CALL */
+	    }
+	}
+    }
+
+    if (sccoly) {
+/* The columns of Y will be normalized. */
+/* To prevent overflows, the column norms of Y are */
+/* carefully computed using ZLASSQ. */
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+/* RWORK(i) = DZNRM2( M, Y(1,i), 1 ) */
+	    scale = zero;
+	    zlassq_(m, &y[i__ * y_dim1 + 1], &c__1, &scale, &ssum);
+	    if (disnan_(&scale) || disnan_(&ssum)) {
+		*k = 0;
+		*info = -10;
+		i__2 = -(*info);
+		xerbla_("ZGEDMD", &i__2);
+	    }
+	    if (scale != zero && ssum != zero) {
+		rootsc = sqrt(ssum);
+		if (scale >= ofl / rootsc) {
+/*                 Norm of Y(:,i) overflows. First, Y(:,i) */
+/*                 is scaled by */
+/*                 ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2. */
+/*                 Next, the norm of Y(:,i) is stored without */
+/*                 overflow as RWORK(i) = - SCALE * (ROOTSC/M), */
+/*                 the minus sign indicating the 1/M factor. */
+/*                 Scaling is performed without overflow, and */
+/*                 underflow may occur in the smallest entries */
+/*                 of Y(:,i). The relative backward and forward */
+/*                 errors are small in the ell_2 norm. */
+		    d__1 = one / rootsc;
+		    zlascl_("G", &c__0, &c__0, &scale, &d__1, m, &c__1, &y[
+			    i__ * y_dim1 + 1], ldy, &info2);
+		    rwork[i__] = -scale * (rootsc / (doublereal) (*m));
+		} else {
+/*                 Y(:,i) will be scaled to unit 2-norm */
+		    rwork[i__] = scale * rootsc;
+		    zlascl_("G", &c__0, &c__0, &rwork[i__], &one, m, &c__1, &
+			    y[i__ * y_dim1 + 1], ldy, &info2);
+/*                 Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i)          ! INTRINSIC */
+/* LAPAC */
+		}
+	    } else {
+		rwork[i__] = zero;
+	    }
+	}
+	i__1 = *n;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+/*           Now, apply the same scaling to the columns of X. */
+	    if (rwork[i__] > zero) {
+		d__1 = one / rwork[i__];
+		zdscal_(m, &d__1, &x[i__ * x_dim1 + 1], &c__1);
+/*               X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i)      ! INTRINSIC */
+/* BLAS CALL */
+	    } else if (rwork[i__] < zero) {
+		d__1 = -rwork[i__];
+		d__2 = one / (doublereal) (*m);
+		zlascl_("G", &c__0, &c__0, &d__1, &d__2, m, &c__1, &x[i__ * 
+			x_dim1 + 1], ldx, &info2);
+/* LAPACK C */
+	    } else if (z_abs(&x[izamax_(m, &x[i__ * x_dim1 + 1], &c__1) + i__ 
+		    * x_dim1]) != zero) {
+/*               Y(:,i) is zero vector.  If X(:,i) is not */
+/*               zero, then a warning flag is raised. */
+/*               The computation continues but the */
+/*               situation will be reported in the output. */
+		badxy = TRUE_;
+	    }
+	}
+    }
+
+/*     <2> SVD of the data snapshot matrix X. */
+/*     ===================================== */
+/*     The left singular vectors are stored in the array X. */
+/*     The right singular vectors are in the array W. */
+/*     The array W will later on contain the eigenvectors */
+/*     of a Rayleigh quotient. */
+    numrnk = *n;
+/*      SELECT CASE ( WHTSVD ) */
+    if (*whtsvd == 1) {
+	zgesvd_("O", "S", m, n, &x[x_offset], ldx, &rwork[1], &b[b_offset], 
+		ldb, &w[w_offset], ldw, &zwork[1], lzwork, &rwork[*n + 1], &
+		info1);
+/* LA */
+	*(unsigned char *)t_or_n__ = 'C';
+    } else if (*whtsvd == 2) {
+	zgesdd_("O", m, n, &x[x_offset], ldx, &rwork[1], &b[b_offset], ldb, &
+		w[w_offset], ldw, &zwork[1], lzwork, &rwork[*n + 1], &iwork[1]
+		, &info1);
+/* LAP */
+	*(unsigned char *)t_or_n__ = 'C';
+    } else if (*whtsvd == 3) {
+	i__1 = *lrwork - *n;
+	zgesvdq_("H", "P", "N", "R", "R", m, n, &x[x_offset], ldx, &rwork[1], 
+		&z__[z_offset], ldz, &w[w_offset], ldw, &numrnk, &iwork[1], 
+		liwork, &zwork[1], lzwork, &rwork[*n + 1], &i__1, &info1);
+/* LAPACK CA */
+	zlacpy_("A", m, &numrnk, &z__[z_offset], ldz, &x[x_offset], ldx);
+/* LAPACK C */
+	*(unsigned char *)t_or_n__ = 'C';
+    } else if (*whtsvd == 4) {
+	i__1 = *lrwork - *n;
+	zgejsv_("F", "U", jsvopt, "R", "N", "P", m, n, &x[x_offset], ldx, &
+		rwork[1], &z__[z_offset], ldz, &w[w_offset], ldw, &zwork[1], 
+		lzwork, &rwork[*n + 1], &i__1, &iwork[1], &info1);
+	zlacpy_("A", m, n, &z__[z_offset], ldz, &x[x_offset], ldx);
+/* LAPACK CALL */
+	*(unsigned char *)t_or_n__ = 'N';
+	xscl1 = rwork[*n + 1];
+	xscl2 = rwork[*n + 2];
+	if (xscl1 != xscl2) {
+/* This is an exceptional situation. If the */
+/* data matrices are not scaled and the */
+/* largest singular value of X overflows. */
+/* In that case ZGEJSV can return the SVD */
+/* in scaled form. The scaling factor can be used */
+/* to rescale the data (X and Y). */
+	    zlascl_("G", &c__0, &c__0, &xscl1, &xscl2, m, n, &y[y_offset], 
+		    ldy, &info2);
+	}
+/*      END SELECT */
+    }
+
+    if (info1 > 0) {
+/* The SVD selected subroutine did not converge. */
+/* Return with an error code. */
+	*info = 2;
+	return 0;
+    }
+
+    if (rwork[1] == zero) {
+/* The largest computed singular value of (scaled) */
+/* X is zero. Return error code -8 */
+/* (the 8th input variable had an illegal value). */
+	*k = 0;
+	*info = -8;
+	i__1 = -(*info);
+	xerbla_("ZGEDMD", &i__1);
+	return 0;
+    }
+
+/* <3> Determine the numerical rank of the data */
+/*    snapshots matrix X. This depends on the */
+/*    parameters NRNK and TOL. */
+/*      SELECT CASE ( NRNK ) */
+    if (*nrnk == -1) {
+	*k = 1;
+	i__1 = numrnk;
+	for (i__ = 2; i__ <= i__1; ++i__) {
+	    if (rwork[i__] <= rwork[1] * *tol || rwork[i__] <= small) {
+		myexit_();
+	    }
+	    ++(*k);
+	}
+    } else if (*nrnk == -2) {
+	*k = 1;
+	i__1 = numrnk - 1;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    if (rwork[i__ + 1] <= rwork[i__] * *tol || rwork[i__] <= small) {
+		myexit_();
+	    }
+	    ++(*k);
+	}
+    } else {
+	*k = 1;
+	i__1 = *nrnk;
+	for (i__ = 2; i__ <= i__1; ++i__) {
+	    if (rwork[i__] <= small) {
+		myexit_();
+	    }
+	    ++(*k);
+	}
+/*          END SELECT */
+    }
+/*   Now, U = X(1:M,1:K) is the SVD/POD basis for the */
+/*   snapshot data in the input matrix X. */
+/* <4> Compute the Rayleigh quotient S = U^H * A * U. */
+/*    Depending on the requested outputs, the computation */
+/*    is organized to compute additional auxiliary */
+/*    matrices (for the residuals and refinements). */
+
+/*    In all formulas below, we need V_k*Sigma_k^(-1) */
+/*    where either V_k is in W(1:N,1:K), or V_k^H is in */
+/*    W(1:K,1:N). Here Sigma_k=diag(RWORK(1:K)). */
+    if (lsame_(t_or_n__, "N")) {
+	i__1 = *k;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    d__1 = one / rwork[i__];
+	    zdscal_(n, &d__1, &w[i__ * w_dim1 + 1], &c__1);
+/* W(1:N,i) = (ONE/RWORK(i)) * W(1:N,i)      ! INTRINSIC */
+/* BLAS CALL */
+	}
+    } else {
+/* This non-unit stride access is due to the fact */
+/* that ZGESVD, ZGESVDQ and ZGESDD return the */
+/* adjoint matrix of the right singular vectors. */
+/* DO i = 1, K */
+/* CALL ZDSCAL( N, ONE/RWORK(i), W(i,1), LDW )    ! BLAS CALL */
+/* ! W(i,1:N) = (ONE/RWORK(i)) * W(i,1:N)      ! INTRINSIC */
+/* END DO */
+	i__1 = *k;
+	for (i__ = 1; i__ <= i__1; ++i__) {
+	    rwork[*n + i__] = one / rwork[i__];
+	}
+	i__1 = *n;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = *k;
+	    for (i__ = 1; i__ <= i__2; ++i__) {
+		i__3 = i__ + j * w_dim1;
+		i__4 = *n + i__;
+		z__2.r = rwork[i__4], z__2.i = zero;
+		i__5 = i__ + j * w_dim1;
+		z__1.r = z__2.r * w[i__5].r - z__2.i * w[i__5].i, z__1.i = 
+			z__2.r * w[i__5].i + z__2.i * w[i__5].r;
+		w[i__3].r = z__1.r, w[i__3].i = z__1.i;
+	    }
+	}
+    }
+
+    if (wntref) {
+
+/* Need A*U(:,1:K)=Y*V_k*inv(diag(RWORK(1:K))) */
+/* for computing the refined Ritz vectors */
+/* (optionally, outside ZGEDMD). */
+	zgemm_("N", t_or_n__, m, k, n, &zone, &y[y_offset], ldy, &w[w_offset],
+		 ldw, &zzero, &z__[z_offset], ldz);
+/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(CONJG(W(1:K,1:N)))) ! */
+/* Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K))                   ! */
+
+/* At this point Z contains */
+/* A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and */
+/* this is needed for computing the residuals. */
+/* This matrix is  returned in the array B and */
+/* it can be used to compute refined Ritz vectors. */
+/* BLA */
+	zlacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb);
+/* B(1:M,1:K) = Z(1:M,1:K)                  ! INTRINSIC */
+/* BLAS CALL */
+	zgemm_("C", "N", k, k, m, &zone, &x[x_offset], ldx, &z__[z_offset], 
+		ldz, &zzero, &s[s_offset], lds);
+/* S(1:K,1:K) = MATMUL(TRANSPOSE(CONJG(X(1:M,1:K))),Z(1:M,1:K)) */
+/* At this point S = U^H * A * U is the Rayleigh quotient. */
+/* BLA */
+    } else {
+/* A * U(:,1:K) is not explicitly needed and the */
+/* computation is organized differently. The Rayleigh */
+/* quotient is computed more efficiently. */
+	zgemm_("C", "N", k, n, m, &zone, &x[x_offset], ldx, &y[y_offset], ldy,
+		 &zzero, &z__[z_offset], ldz);
+/* Z(1:K,1:N) = MATMUL( TRANSPOSE(CONJG(X(1:M,1:K))), Y(1:M,1:N) */
+
+	zgemm_("N", t_or_n__, k, k, n, &zone, &z__[z_offset], ldz, &w[
+		w_offset], ldw, &zzero, &s[s_offset], lds);
+/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(CONJG(W(1:K,1:N)))) ! */
+/* S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K)))                 ! */
+/* At this point S = U^H * A * U is the Rayleigh quotient. */
+/* If the residuals are requested, save scaled V_k into Z. */
+/* Recall that V_k or V_k^H is stored in W. */
+/* BLAS */
+	if (wntres || wntex) {
+	    if (lsame_(t_or_n__, "N")) {
+		zlacpy_("A", n, k, &w[w_offset], ldw, &z__[z_offset], ldz);
+	    } else {
+		zlacpy_("A", k, n, &w[w_offset], ldw, &z__[z_offset], ldz);
+	    }
+	}
+    }
+
+/* <5> Compute the Ritz values and (if requested) the */
+/*   right eigenvectors of the Rayleigh quotient. */
+
+    zgeev_("N", jobzl, k, &s[s_offset], lds, &eigs[1], &w[w_offset], ldw, &w[
+	    w_offset], ldw, &zwork[1], lzwork, &rwork[*n + 1], &info1);
+
+/* W(1:K,1:K) contains the eigenvectors of the Rayleigh */
+/* quotient.  See the description of Z. */
+/* Also, see the description of ZGEEV. */
+/* LAPACK CALL */
+    if (info1 > 0) {
+/* ZGEEV failed to compute the eigenvalues and */
+/* eigenvectors of the Rayleigh quotient. */
+	*info = 3;
+	return 0;
+    }
+
+/* <6> Compute the eigenvectors (if requested) and, */
+/* the residuals (if requested). */
+
+    if (wntvec || wntex) {
+	if (wntres) {
+	    if (wntref) {
+/* Here, if the refinement is requested, we have */
+/* A*U(:,1:K) already computed and stored in Z. */
+/* For the residuals, need Y = A * U(:,1:K) * W. */
+		zgemm_("N", "N", m, k, k, &zone, &z__[z_offset], ldz, &w[
+			w_offset], ldw, &zzero, &y[y_offset], ldy);
+/* Y(1:M,1:K) = Z(1:M,1:K) * W(1:K,1:K)        ! INTRINSIC */
+/* This frees Z; Y contains A * U(:,1:K) * W. */
+/* BLAS CALL */
+	    } else {
+/* Compute S = V_k * Sigma_k^(-1) * W, where */
+/* V_k * Sigma_k^(-1) (or its adjoint) is stored in Z */
+		zgemm_(t_or_n__, "N", n, k, k, &zone, &z__[z_offset], ldz, &w[
+			w_offset], ldw, &zzero, &s[s_offset], lds);
+/* Then, compute Z = Y * S = */
+/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */
+/* = A * U(:,1:K) * W(1:K,1:K) */
+		zgemm_("N", "N", m, k, n, &zone, &y[y_offset], ldy, &s[
+			s_offset], lds, &zzero, &z__[z_offset], ldz);
+/* Save a copy of Z into Y and free Z for holding */
+/* the Ritz vectors. */
+		zlacpy_("A", m, k, &z__[z_offset], ldz, &y[y_offset], ldy);
+		if (wntex) {
+		    zlacpy_("A", m, k, &z__[z_offset], ldz, &b[b_offset], ldb);
+		}
+	    }
+	} else if (wntex) {
+/* Compute S = V_k * Sigma_k^(-1) * W, where */
+/* V_k * Sigma_k^(-1) is stored in Z */
+	    zgemm_(t_or_n__, "N", n, k, k, &zone, &z__[z_offset], ldz, &w[
+		    w_offset], ldw, &zzero, &s[s_offset], lds);
+/* Then, compute Z = Y * S = */
+/* = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) = */
+/* = A * U(:,1:K) * W(1:K,1:K) */
+	    zgemm_("N", "N", m, k, n, &zone, &y[y_offset], ldy, &s[s_offset], 
+		    lds, &zzero, &b[b_offset], ldb);
+/* The above call replaces the following two calls */
+/* that were used in the developing-testing phase. */
+/* CALL ZGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, & */
+/*           LDS, ZZERO, Z, LDZ) */
+/* Save a copy of Z into B and free Z for holding */
+/* the Ritz vectors. */
+/* CALL ZLACPY( 'A', M, K, Z, LDZ, B, LDB ) */
+	}
+
+/* Compute the Ritz vectors */
+	if (wntvec) {
+	    zgemm_("N", "N", m, k, k, &zone, &x[x_offset], ldx, &w[w_offset], 
+		    ldw, &zzero, &z__[z_offset], ldz);
+	}
+/* Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K))         ! INTRINSIC */
+
+/* BLAS CALL */
+	if (wntres) {
+	    i__1 = *k;
+	    for (i__ = 1; i__ <= i__1; ++i__) {
+		i__2 = i__;
+		z__1.r = -eigs[i__2].r, z__1.i = -eigs[i__2].i;
+		zaxpy_(m, &z__1, &z__[i__ * z_dim1 + 1], &c__1, &y[i__ * 
+			y_dim1 + 1], &c__1);
+/* Y(1:M,i) = Y(1:M,i) - EIGS(i) * Z(1:M,i)            ! INTR */
+/* BLAS */
+		res[i__] = dznrm2_(m, &y[i__ * y_dim1 + 1], &c__1);
+/* BLAS */
+	    }
+	}
+    }
+
+    if (*whtsvd == 4) {
+	rwork[*n + 1] = xscl1;
+	rwork[*n + 2] = xscl2;
+    }
+
+/*     Successful exit. */
+    if (! badxy) {
+	*info = 0;
+    } else {
+/* A warning on possible data inconsistency. */
+/* This should be a rare event. */
+	*info = 4;
+    }
+/* ............................................................ */
+    return 0;
+/*     ...... */
+} /* zgedmd_ */
+
diff --git a/lapack-netlib/SRC/zgedmd.f90 b/lapack-netlib/SRC/zgedmd.f90
new file mode 100644
index 000000000..5045cb166
--- /dev/null
+++ b/lapack-netlib/SRC/zgedmd.f90
@@ -0,0 +1,1148 @@
+!> \brief \b ZGEDMD computes the Dynamic Mode Decomposition (DMD) for a pair of data snapshot matrices.
+!
+!  =========== DOCUMENTATION ===========
+!
+!  Definition:
+!  ===========
+!
+!      SUBROUTINE ZGEDMD( JOBS, JOBZ, JOBR, JOBF,  WHTSVD,   &
+!                         M, N, X, LDX, Y, LDY, NRNK, TOL,   &
+!                         K, EIGS, Z, LDZ, RES, B,    LDB,   &
+!                         W, LDW,  S, LDS, ZWORK,  LZWORK,   &
+!                         RWORK, LRWORK, IWORK, LIWORK, INFO )
+!......
+!      USE                   iso_fortran_env
+!      IMPLICIT NONE
+!      INTEGER, PARAMETER :: WP = real64
+! 
+!......
+!      Scalar arguments
+!      CHARACTER, INTENT(IN)   :: JOBS,   JOBZ,  JOBR,  JOBF
+!      INTEGER,   INTENT(IN)   :: WHTSVD, M, N,   LDX,  LDY, &
+!                                 NRNK, LDZ, LDB, LDW,  LDS, &
+!                                 LIWORK, LRWORK, LZWORK
+!      INTEGER,       INTENT(OUT)  :: K, INFO
+!      REAL(KIND=WP), INTENT(IN)   ::    TOL
+!      Array arguments
+!      COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*)
+!      COMPLEX(KIND=WP), INTENT(OUT)   :: Z(LDZ,*), B(LDB,*), &
+!                                         W(LDW,*), S(LDS,*)
+!      COMPLEX(KIND=WP), INTENT(OUT)   :: EIGS(*)
+!      COMPLEX(KIND=WP), INTENT(OUT)   :: ZWORK(*)
+!      REAL(KIND=WP),    INTENT(OUT)   :: RES(*)
+!      REAL(KIND=WP),    INTENT(OUT)   :: RWORK(*)
+!      INTEGER,          INTENT(OUT)   :: IWORK(*)
+!
+!............................................................
+!>    \par Purpose:
+!     =============
+!>    \verbatim
+!>    ZGEDMD computes the Dynamic Mode Decomposition (DMD) for
+!>    a pair of data snapshot matrices. For the input matrices
+!>    X and Y such that Y = A*X with an inaccessible matrix
+!>    A, ZGEDMD computes a certain number of Ritz pairs of A using
+!>    the standard Rayleigh-Ritz extraction from a subspace of
+!>    range(X) that is determined using the leading left singular
+!>    vectors of X. Optionally, ZGEDMD returns the residuals
+!>    of the computed Ritz pairs, the information needed for
+!>    a refinement of the Ritz vectors, or the eigenvectors of
+!>    the Exact DMD.
+!>    For further details see the references listed
+!>    below. For more details of the implementation see [3].
+!>    \endverbatim
+!............................................................
+!>    \par References:
+!     ================
+!>    \verbatim
+!>    [1] P. Schmid: Dynamic mode decomposition of numerical
+!>        and experimental data,
+!>        Journal of Fluid Mechanics 656, 5-28, 2010.
+!>    [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal
+!>        decompositions: analysis and enhancements,
+!>        SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018.
+!>    [3] Z. Drmac: A LAPACK implementation of the Dynamic
+!>        Mode Decomposition I. Technical report. AIMDyn Inc.
+!>        and LAPACK Working Note 298.
+!>    [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L.
+!>        Brunton, N. Kutz: On Dynamic Mode Decomposition:
+!>        Theory and Applications, Journal of Computational
+!>        Dynamics 1(2), 391-421, 2014.
+!>    \endverbatim
+!......................................................................
+!>    \par Developed and supported by:
+!     ================================
+!>    \verbatim
+!>    Developed and coded by Zlatko Drmac, Faculty of Science,
+!>    University of Zagreb;  drmac@math.hr
+!>    In cooperation with
+!>    AIMdyn Inc., Santa Barbara, CA.
+!>    and supported by
+!>    - DARPA SBIR project "Koopman Operator-Based Forecasting
+!>    for Nonstationary Processes from Near-Term, Limited
+!>    Observational Data" Contract No: W31P4Q-21-C-0007
+!>    - DARPA PAI project "Physics-Informed Machine Learning
+!>    Methodologies" Contract No: HR0011-18-9-0033
+!>    - DARPA MoDyL project "A Data-Driven, Operator-Theoretic
+!>    Framework for Space-Time Analysis of Process Dynamics"
+!>    Contract No: HR0011-16-C-0116
+!>    Any opinions, findings and conclusions or recommendations
+!>    expressed in this material are those of the author and
+!>    do not necessarily reflect the views of the DARPA SBIR
+!>    Program Office
+!>    \endverbatim
+!......................................................................
+!>    \par Distribution Statement A:
+!     ==============================
+!>    \verbatim
+!>    Approved for Public Release, Distribution Unlimited.
+!>    Cleared by DARPA on September 29, 2022
+!>    \endverbatim
+!............................................................
+!     Arguments
+!     =========
+!
+!>    \param[in] JOBS
+!>    \verbatim
+!>    JOBS (input) CHARACTER*1
+!>    Determines whether the initial data snapshots are scaled
+!>    by a diagonal matrix.
+!>    'S' :: The data snapshots matrices X and Y are multiplied
+!>           with a diagonal matrix D so that X*D has unit
+!>           nonzero columns (in the Euclidean 2-norm)
+!>    'C' :: The snapshots are scaled as with the 'S' option.
+!>           If it is found that an i-th column of X is zero
+!>           vector and the corresponding i-th column of Y is
+!>           non-zero, then the i-th column of Y is set to
+!>           zero and a warning flag is raised.
+!>    'Y' :: The data snapshots matrices X and Y are multiplied
+!>           by a diagonal matrix D so that Y*D has unit
+!>           nonzero columns (in the Euclidean 2-norm)
+!>    'N' :: No data scaling.
+!>    \endverbatim
+!.....
+!>    \param[in] JOBZ
+!>    \verbatim
+!>    JOBZ (input) CHARACTER*1
+!>    Determines whether the eigenvectors (Koopman modes) will
+!>    be computed.
+!>    'V' :: The eigenvectors (Koopman modes) will be computed
+!>           and returned in the matrix Z.
+!>           See the description of Z.
+!>    'F' :: The eigenvectors (Koopman modes) will be returned
+!>           in factored form as the product X(:,1:K)*W, where X
+!>           contains a POD basis (leading left singular vectors
+!>           of the data matrix X) and W contains the eigenvectors
+!>           of the corresponding Rayleigh quotient.
+!>           See the descriptions of K, X, W, Z.
+!>    'N' :: The eigenvectors are not computed.
+!>    \endverbatim
+!.....
+!>    \param[in] JOBR
+!>    \verbatim
+!>    JOBR (input) CHARACTER*1
+!>    Determines whether to compute the residuals.
+!>    'R' :: The residuals for the computed eigenpairs will be
+!>           computed and stored in the array RES.
+!>           See the description of RES.
+!>           For this option to be legal, JOBZ must be 'V'.
+!>    'N' :: The residuals are not computed.
+!>    \endverbatim
+!.....
+!>    \param[in] JOBF
+!>    \verbatim
+!>    JOBF (input) CHARACTER*1
+!>    Specifies whether to store information needed for post-
+!>    processing (e.g. computing refined Ritz vectors)
+!>    'R' :: The matrix needed for the refinement of the Ritz
+!>           vectors is computed and stored in the array B.
+!>           See the description of B.
+!>    'E' :: The unscaled eigenvectors of the Exact DMD are
+!>           computed and returned in the array B. See the
+!>           description of B.
+!>    'N' :: No eigenvector refinement data is computed.
+!>    \endverbatim
+!.....
+!>    \param[in] WHTSVD
+!>    \verbatim
+!>    WHTSVD (input) INTEGER, WHTSVD in { 1, 2, 3, 4 }
+!>    Allows for a selection of the SVD algorithm from the
+!>    LAPACK library.
+!>    1 :: ZGESVD (the QR SVD algorithm)
+!>    2 :: ZGESDD (the Divide and Conquer algorithm; if enough
+!>         workspace available, this is the fastest option)
+!>    3 :: ZGESVDQ (the preconditioned QR SVD  ; this and 4
+!>         are the most accurate options)
+!>    4 :: ZGEJSV (the preconditioned Jacobi SVD; this and 3
+!>         are the most accurate options)
+!>    For the four methods above, a significant difference in
+!>    the accuracy of small singular values is possible if
+!>    the snapshots vary in norm so that X is severely
+!>    ill-conditioned. If small (smaller than EPS*||X||)
+!>    singular values are of interest and JOBS=='N',  then
+!>    the options (3, 4) give the most accurate results, where
+!>    the option 4 is slightly better and with stronger
+!>    theoretical background.
+!>    If JOBS=='S', i.e. the columns of X will be normalized,
+!>    then all methods give nearly equally accurate results.
+!>    \endverbatim
+!.....
+!>    \param[in] M
+!>    \verbatim
+!>    M (input) INTEGER, M>= 0
+!>    The state space dimension (the row dimension of X, Y).
+!>    \endverbatim
+!.....
+!>    \param[in] N
+!>    \verbatim
+!>    N (input) INTEGER, 0 <= N <= M
+!>    The number of data snapshot pairs
+!>    (the number of columns of X and Y).
+!>    \endverbatim
+!.....
+!>    \param[in] LDX
+!>    \verbatim
+!>    X (input/output) COMPLEX(KIND=WP) M-by-N array
+!>    > On entry, X contains the data snapshot matrix X. It is
+!>    assumed that the column norms of X are in the range of
+!>    the normalized floating point numbers.
+!>    < On exit, the leading K columns of X contain a POD basis,
+!>    i.e. the leading K left singular vectors of the input
+!>    data matrix X, U(:,1:K). All N columns of X contain all
+!>    left singular vectors of the input matrix X.
+!>    See the descriptions of K, Z and W.
+!.....
+!>    LDX (input) INTEGER, LDX >= M
+!>    The leading dimension of the array X.
+!>    \endverbatim
+!.....
+!>    \param[in,out] Y
+!>    \verbatim
+!>    Y (input/workspace/output) COMPLEX(KIND=WP) M-by-N array
+!>    > On entry, Y contains the data snapshot matrix Y
+!>    < On exit,
+!>    If JOBR == 'R', the leading K columns of Y  contain
+!>    the residual vectors for the computed Ritz pairs.
+!>    See the description of RES.
+!>    If JOBR == 'N', Y contains the original input data,
+!>                    scaled according to the value of JOBS.
+!>    \endverbatim
+!.....
+!>    \param[in] LDY
+!>    \verbatim
+!>    LDY (input) INTEGER , LDY >= M
+!>    The leading dimension of the array Y.
+!>    \endverbatim
+!.....
+!>    \param[in] NRNK
+!>    \verbatim
+!>    NRNK (input) INTEGER
+!>    Determines how the numerical rank is computed,
+!>    i.e. how to truncate small singular values of the input
+!>    matrix X. On input, if
+!>    NRNK = -1 :: i-th singular value sigma(i) is truncated
+!>                 if sigma(i) <= TOL*sigma(1)
+!>                 This option is recommended.
+!>    NRNK = -2 :: i-th singular value sigma(i) is truncated
+!>                 if sigma(i) <= TOL*sigma(i-1)
+!>                 This option is included for R&D purposes.
+!>                 It requires highly accurate SVD, which
+!>                 may not be feasible.
+!>    The numerical rank can be enforced by using positive
+!>    value of NRNK as follows:
+!>    0 < NRNK <= N :: at most NRNK largest singular values
+!>    will be used. If the number of the computed nonzero
+!>    singular values is less than NRNK, then only those
+!>    nonzero values will be used and the actually used
+!>    dimension is less than NRNK. The actual number of
+!>    the nonzero singular values is returned in the variable
+!>    K. See the descriptions of TOL and  K.
+!>    \endverbatim
+!.....
+!>    \param[in] TOL
+!>    \verbatim
+!>    TOL (input) REAL(KIND=WP), 0 <= TOL < 1
+!>    The tolerance for truncating small singular values.
+!>    See the description of NRNK.
+!>    \endverbatim
+!.....
+!>    \param[out] K
+!>    \verbatim
+!>    K (output) INTEGER,  0 <= K <= N
+!>    The dimension of the POD basis for the data snapshot
+!>    matrix X and the number of the computed Ritz pairs.
+!>    The value of K is determined according to the rule set
+!>    by the parameters NRNK and TOL.
+!>    See the descriptions of NRNK and TOL.
+!>    \endverbatim
+!.....
+!>    \param[out] EIGS
+!>    \verbatim
+!>    EIGS (output) COMPLEX(KIND=WP) N-by-1 array
+!>    The leading K (K<=N) entries of EIGS contain
+!>    the computed eigenvalues (Ritz values).
+!>    See the descriptions of K, and Z.
+!>    \endverbatim
+!.....
+!>    \param[out] Z
+!>    \verbatim
+!>    Z (workspace/output) COMPLEX(KIND=WP)  M-by-N array
+!>    If JOBZ =='V' then Z contains the  Ritz vectors.  Z(:,i)
+!>    is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1.
+!>    If JOBZ == 'F', then the Z(:,i)'s are given implicitly as
+!>    the columns of X(:,1:K)*W(1:K,1:K), i.e. X(:,1:K)*W(:,i)
+!>    is an eigenvector corresponding to EIGS(i). The columns
+!>    of W(1:k,1:K) are the computed eigenvectors of the
+!>    K-by-K Rayleigh quotient.
+!>    See the descriptions of EIGS, X and W.
+!>    \endverbatim
+!.....
+!>    \param[in] LDZ
+!>    \verbatim
+!>    LDZ (input) INTEGER , LDZ >= M
+!>    The leading dimension of the array Z.
+!>    \endverbatim
+!.....
+!>    \param[out] RES
+!>    \verbatim
+!>    RES (output) REAL(KIND=WP) N-by-1 array
+!>    RES(1:K) contains the residuals for the K computed
+!>    Ritz pairs,
+!>    RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i) ||_2.
+!>    See the description of EIGS and Z.
+!>    \endverbatim
+!.....
+!>    \param[out] B
+!>    \verbatim
+!>    B (output) COMPLEX(KIND=WP)  M-by-N array.
+!>    IF JOBF =='R', B(1:M,1:K) contains A*U(:,1:K), and can
+!>    be used for computing the refined vectors; see further
+!>    details in the provided references.
+!>    If JOBF == 'E', B(1:M,1:K) contains
+!>    A*U(:,1:K)*W(1:K,1:K), which are the vectors from the
+!>    Exact DMD, up to scaling by the inverse eigenvalues.
+!>    If JOBF =='N', then B is not referenced.
+!>    See the descriptions of X, W, K.
+!>    \endverbatim
+!.....
+!>    \param[in] LDB
+!>    \verbatim
+!>    LDB (input) INTEGER, LDB >= M
+!>    The leading dimension of the array B.
+!>    \endverbatim
+!.....
+!>    \param[out] W
+!>    \verbatim
+!>    W (workspace/output) COMPLEX(KIND=WP) N-by-N array
+!>    On exit, W(1:K,1:K) contains the K computed
+!>    eigenvectors of the matrix Rayleigh quotient.
+!>    The Ritz vectors (returned in Z) are the
+!>    product of X (containing a POD basis for the input
+!>    matrix X) and W. See the descriptions of K, S, X and Z.
+!>    W is also used as a workspace to temporarily store the
+!>    right singular vectors of X.
+!>    \endverbatim
+!.....
+!>    \param[in] LDW
+!>    \verbatim
+!>    LDW (input) INTEGER, LDW >= N
+!>    The leading dimension of the array W.
+!>    \endverbatim
+!.....
+!>    \param[out] S
+!>    \verbatim
+!>    S (workspace/output) COMPLEX(KIND=WP) N-by-N array
+!>    The array S(1:K,1:K) is used for the matrix Rayleigh
+!>    quotient. This content is overwritten during
+!>    the eigenvalue decomposition by ZGEEV.
+!>    See the description of K.
+!>    \endverbatim
+!.....
+!>    \param[in] LDS
+!>    \verbatim
+!>    LDS (input) INTEGER, LDS >= N
+!>    The leading dimension of the array S.
+!>    \endverbatim
+!.....
+!>    \param[out] ZWORK
+!>    \verbatim
+!>    ZWORK (workspace/output) COMPLEX(KIND=WP) LZWORK-by-1 array
+!>    ZWORK is used as complex workspace in the complex SVD, as
+!>    specified by WHTSVD (1,2, 3 or 4) and for ZGEEV for computing
+!>    the eigenvalues of a Rayleigh quotient.
+!>    If the call to ZGEDMD is only a workspace query, then
+!>    ZWORK(1) contains the minimal complex workspace length and
+!>    ZWORK(2) is the optimal complex workspace length.
+!>    Hence, the length of ZWORK must be at least 2.
+!>    See the description of LZWORK.
+!>    \endverbatim
+!.....
+!>    \param[in] LZWORK
+!>    \verbatim
+!>    LZWORK (input) INTEGER
+!>    The minimal length of the workspace vector ZWORK.
+!>    LZWORK is calculated as MAX(LZWORK_SVD, LZWORK_ZGEEV),
+!>    where LZWORK_ZGEEV = MAX( 1, 2*N )  and the minimal
+!>    LZWORK_SVD is calculated as follows
+!>    If WHTSVD == 1 :: ZGESVD ::
+!>       LZWORK_SVD = MAX(1,2*MIN(M,N)+MAX(M,N))
+!>    If WHTSVD == 2 :: ZGESDD ::
+!>       LZWORK_SVD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N)
+!>    If WHTSVD == 3 :: ZGESVDQ ::
+!>       LZWORK_SVD = obtainable by a query
+!>    If WHTSVD == 4 :: ZGEJSV ::
+!>       LZWORK_SVD = obtainable by a query
+!>    If on entry LZWORK = -1, then a workspace query is
+!>    assumed and the procedure only computes the minimal
+!>    and the optimal workspace lengths and returns them in
+!>    ZWORK(1) and ZWORK(2), respectively.
+!>    \endverbatim
+!.....
+!>    \param[out] RWORK
+!>    \verbatim
+!>    RWORK (workspace/output) REAL(KIND=WP) LRWORK-by-1 array
+!>    On exit, RWORK(1:N) contains the singular values of
+!>    X (for JOBS=='N') or column scaled X (JOBS=='S', 'C').
+!>    If WHTSVD==4, then RWORK(N+1) and RWORK(N+2) contain
+!>    scaling factor RWORK(N+2)/RWORK(N+1) used to scale X
+!>    and Y to avoid overflow in the SVD of X.
+!>    This may be of interest if the scaling option is off
+!>    and as many as possible smallest eigenvalues are
+!>    desired to the highest feasible accuracy.
+!>    If the call to ZGEDMD is only workspace query, then
+!>    RWORK(1) contains the minimal workspace length.
+!>    See the description of LRWORK.
+!>    \endverbatim
+!.....
+!>    \param[in] LRWORK
+!>    \verbatim
+!>    LRWORK (input) INTEGER
+!>    The minimal length of the workspace vector RWORK.
+!>    LRWORK is calculated as follows:
+!>    LRWORK = MAX(1, N+LRWORK_SVD,N+LRWORK_ZGEEV), where
+!>    LRWORK_ZGEEV = MAX(1,2*N) and LRWORK_SVD is the real workspace
+!>    for the SVD subroutine determined by the input parameter
+!>    WHTSVD.
+!>    If WHTSVD == 1 :: ZGESVD ::
+!>       LRWORK_SVD = 5*MIN(M,N)
+!>    If WHTSVD == 2 :: ZGESDD ::
+!>       LRWORK_SVD =  MAX(5*MIN(M,N)*MIN(M,N)+7*MIN(M,N),
+!>       2*MAX(M,N)*MIN(M,N)+2*MIN(M,N)*MIN(M,N)+MIN(M,N) )
+!>    If WHTSVD == 3 :: ZGESVDQ ::
+!>       LRWORK_SVD = obtainable by a query
+!>    If WHTSVD == 4 :: ZGEJSV ::
+!>       LRWORK_SVD = obtainable by a query
+!>    If on entry LRWORK = -1, then a workspace query is
+!>    assumed and the procedure only computes the minimal
+!>    real workspace length and returns it in RWORK(1).
+!>    \endverbatim
+!.....
+!>    \param[out] IWORK
+!>    \verbatim
+!>    IWORK (workspace/output) INTEGER LIWORK-by-1 array
+!>    Workspace that is required only if WHTSVD equals
+!>    2 , 3 or 4. (See the description of WHTSVD).
+!>    If on entry LZWORK = -1, LRWORK = -1 or LIWORK = -1,
+!>    then the minimal length of IWORK is computed and
+!>    returned in IWORK(1). See the description of LIWORK.
+!>    \endverbatim
+!.....
+!>    \param[in] LIWORK
+!>    \verbatim
+!>    LIWORK (input) INTEGER
+!>    The minimal length of the workspace vector IWORK.
+!>    If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1
+!>    If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M,N))
+!>    If WHTSVD == 3, then LIWORK >= MAX(1,M+N-1)
+!>    If WHTSVD == 4, then LIWORK >= MAX(3,M+3*N)
+!>    If on entry LIWORK = -1, then a workspace query is
+!>    assumed and the procedure only computes the minimal
+!>    and the optimal workspace lengths for  ZWORK, RWORK and
+!>    IWORK. See the descriptions of ZWORK, RWORK and IWORK.
+!>    \endverbatim
+!.....
+!>    \param[out] INFO
+!>    \verbatim
+!>    INFO (output) INTEGER
+!>    -i < 0 :: On entry, the i-th argument had an
+!>              illegal value
+!>       = 0 :: Successful return.
+!>       = 1 :: Void input. Quick exit (M=0 or N=0).
+!>       = 2 :: The SVD computation of X did not converge.
+!>              Suggestion: Check the input data and/or
+!>              repeat with different WHTSVD.
+!>       = 3 :: The computation of the eigenvalues did not
+!>              converge.
+!>       = 4 :: If data scaling was requested on input and
+!>              the procedure found inconsistency in the data
+!>              such that for some column index i,
+!>              X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set
+!>              to zero if JOBS=='C'. The computation proceeds
+!>              with original or modified data and warning
+!>              flag is set with INFO=4.
+!>    \endverbatim
+!
+!  Authors:
+!  ========
+!
+!> \author Zlatko Drmac
+!
+!> \ingroup gedmd
+!
+!.............................................................
+!.............................................................
+      SUBROUTINE ZGEDMD( JOBS, JOBZ, JOBR, JOBF,  WHTSVD,   &
+                         M, N, X, LDX, Y, LDY, NRNK, TOL,   &
+                         K, EIGS, Z, LDZ, RES, B,    LDB,   &
+                         W, LDW,  S, LDS, ZWORK,  LZWORK,   &
+                         RWORK, LRWORK, IWORK, LIWORK, INFO )
+!
+!  -- LAPACK driver routine                                           --
+!
+!  -- LAPACK is a software package provided by University of          --
+!  -- Tennessee, University of California Berkeley, University of     --
+!  -- Colorado Denver and NAG Ltd..                                   --
+!
+!.....
+      USE                   iso_fortran_env
+      IMPLICIT NONE
+      INTEGER, PARAMETER :: WP = real64
+!
+!     Scalar arguments
+!     ~~~~~~~~~~~~~~~~
+      CHARACTER, INTENT(IN)   :: JOBS,   JOBZ,  JOBR,  JOBF
+      INTEGER,   INTENT(IN)   :: WHTSVD, M, N,   LDX,  LDY, &
+                                 NRNK, LDZ, LDB, LDW,  LDS, &
+                                 LIWORK, LRWORK, LZWORK
+      INTEGER,       INTENT(OUT)  :: K, INFO
+      REAL(KIND=WP), INTENT(IN)   ::    TOL
+!
+!     Array arguments
+!     ~~~~~~~~~~~~~~~
+      COMPLEX(KIND=WP), INTENT(INOUT) :: X(LDX,*), Y(LDY,*)
+      COMPLEX(KIND=WP), INTENT(OUT)   :: Z(LDZ,*), B(LDB,*), &
+                                         W(LDW,*), S(LDS,*)
+      COMPLEX(KIND=WP), INTENT(OUT)   :: EIGS(*)
+      COMPLEX(KIND=WP), INTENT(OUT)   :: ZWORK(*)
+      REAL(KIND=WP),    INTENT(OUT)   :: RES(*)
+      REAL(KIND=WP),    INTENT(OUT)   :: RWORK(*)
+      INTEGER,          INTENT(OUT)   :: IWORK(*)
+!
+!     Parameters
+!     ~~~~~~~~~~
+      REAL(KIND=WP),    PARAMETER ::  ONE = 1.0_WP
+      REAL(KIND=WP),    PARAMETER :: ZERO = 0.0_WP
+      COMPLEX(KIND=WP), PARAMETER ::  ZONE = ( 1.0_WP, 0.0_WP )
+      COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP )
+!
+!     Local scalars
+!     ~~~~~~~~~~~~~
+      REAL(KIND=WP) :: OFL,   ROOTSC, SCALE,  SMALL,    &
+                       SSUM,  XSCL1,  XSCL2
+      INTEGER       ::  i,  j,  IMINWR,  INFO1, INFO2,  &
+                        LWRKEV, LWRSDD, LWRSVD, LWRSVJ, &
+                        LWRSVQ, MLWORK, MWRKEV, MWRSDD, &
+                        MWRSVD, MWRSVJ, MWRSVQ, NUMRNK, &
+                        OLWORK, MLRWRK
+      LOGICAL       ::  BADXY, LQUERY, SCCOLX, SCCOLY,  &
+                        WNTEX, WNTREF, WNTRES, WNTVEC
+      CHARACTER     ::  JOBZL, T_OR_N
+      CHARACTER     ::  JSVOPT
+!
+!     Local arrays
+!     ~~~~~~~~~~~~
+      REAL(KIND=WP) :: RDUMMY(2)
+!
+!     External functions (BLAS and LAPACK)
+!     ~~~~~~~~~~~~~~~~~
+      REAL(KIND=WP) ZLANGE, DLAMCH, DZNRM2
+      EXTERNAL      ZLANGE, DLAMCH, DZNRM2, IZAMAX
+      INTEGER                               IZAMAX
+      LOGICAL       DISNAN, LSAME
+      EXTERNAL      DISNAN, LSAME
+!
+!     External subroutines (BLAS and LAPACK)
+!     ~~~~~~~~~~~~~~~~~~~~
+      EXTERNAL      ZAXPY,  ZGEMM,  ZDSCAL
+      EXTERNAL      ZGEEV,  ZGEJSV, ZGESDD, ZGESVD, ZGESVDQ, &
+                    ZLACPY, ZLASCL, ZLASSQ, XERBLA
+!
+!     Intrinsic functions
+!     ~~~~~~~~~~~~~~~~~~~
+      INTRINSIC     DBLE, INT, MAX, SQRT
+!............................................................
+!
+!    Test the input arguments
+!
+      WNTRES = LSAME(JOBR,'R')
+      SCCOLX = LSAME(JOBS,'S') .OR. LSAME(JOBS,'C')
+      SCCOLY = LSAME(JOBS,'Y')
+      WNTVEC = LSAME(JOBZ,'V')
+      WNTREF = LSAME(JOBF,'R')
+      WNTEX  = LSAME(JOBF,'E')
+      INFO   = 0
+      LQUERY = ( ( LZWORK == -1 ) .OR. ( LIWORK == -1 ) &
+                                  .OR. ( LRWORK == -1 ) )
+!
+      IF ( .NOT. (SCCOLX .OR. SCCOLY .OR. &
+                                  LSAME(JOBS,'N')) )   THEN
+          INFO = -1
+      ELSE IF ( .NOT. (WNTVEC .OR. LSAME(JOBZ,'N')        &
+                              .OR. LSAME(JOBZ,'F')) )  THEN
+          INFO = -2
+      ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR.  &
+                ( WNTRES .AND. (.NOT.WNTVEC) ) )       THEN
+          INFO = -3
+      ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR.             &
+                LSAME(JOBF,'N') ) )                    THEN
+          INFO = -4
+      ELSE IF ( .NOT.((WHTSVD == 1) .OR. (WHTSVD == 2) .OR.  &
+                      (WHTSVD == 3) .OR. (WHTSVD == 4) )) THEN
+          INFO = -5
+      ELSE IF ( M < 0 )   THEN
+          INFO = -6
+      ELSE IF ( ( N < 0 ) .OR. ( N > M ) ) THEN
+          INFO = -7
+      ELSE IF ( LDX < M ) THEN
+          INFO = -9
+      ELSE IF ( LDY < M ) THEN
+          INFO = -11
+      ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR. &
+                ((NRNK >= 1).AND.(NRNK <=N ))) )      THEN
+          INFO = -12
+      ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) )  THEN
+          INFO = -13
+      ELSE IF ( LDZ < M ) THEN
+          INFO = -17
+      ELSE IF ( (WNTREF .OR. WNTEX ) .AND. ( LDB < M ) ) THEN
+          INFO = -20
+      ELSE IF ( LDW < N ) THEN
+          INFO = -22
+      ELSE IF ( LDS < N ) THEN
+          INFO = -24
+      END IF
+!
+      IF ( INFO == 0 ) THEN
+          ! Compute the minimal and the optimal workspace
+          ! requirements. Simulate running the code and
+          ! determine minimal and optimal sizes of the
+          ! workspace at any moment of the run.
+         IF ( N == 0 ) THEN
+             ! Quick return. All output except K is void.
+             ! INFO=1 signals the void input.
+             ! In case of a workspace query, the default
+             ! minimal workspace lengths are returned.
+            IF ( LQUERY ) THEN
+                IWORK(1) = 1
+                RWORK(1) = 1
+                ZWORK(1) = 2
+                ZWORK(2) = 2
+            ELSE
+               K   =  0
+            END IF
+            INFO = 1
+            RETURN
+         END IF
+
+         IMINWR = 1
+         MLRWRK = MAX(1,N)
+         MLWORK = 2
+         OLWORK = 2
+         SELECT CASE ( WHTSVD )
+         CASE (1)
+             ! The following is specified as the minimal
+             ! length of WORK in the definition of ZGESVD:
+             ! MWRSVD = MAX(1,2*MIN(M,N)+MAX(M,N))
+             MWRSVD = MAX(1,2*MIN(M,N)+MAX(M,N))
+             MLWORK = MAX(MLWORK,MWRSVD)
+             MLRWRK = MAX(MLRWRK,N + 5*MIN(M,N))
+             IF ( LQUERY ) THEN
+                CALL ZGESVD( 'O', 'S', M, N, X, LDX, RWORK, &
+                     B, LDB, W, LDW, ZWORK, -1, RDUMMY, INFO1 )
+                LWRSVD = INT( ZWORK(1) )
+                OLWORK = MAX(OLWORK,LWRSVD)
+             END IF
+         CASE (2)
+             ! The following is specified as the minimal
+             ! length of WORK in the definition of ZGESDD:
+             ! MWRSDD = 2*min(M,N)*min(M,N)+2*min(M,N)+max(M,N).
+             ! RWORK length: 5*MIN(M,N)*MIN(M,N)+7*MIN(M,N)
+             ! In LAPACK 3.10.1 RWORK is defined differently.
+             ! Below we take max over the two versions.
+             ! IMINWR = 8*MIN(M,N)
+             MWRSDD = 2*MIN(M,N)*MIN(M,N)+2*MIN(M,N)+MAX(M,N)
+             MLWORK = MAX(MLWORK,MWRSDD)
+             IMINWR = 8*MIN(M,N)
+             MLRWRK = MAX( MLRWRK,  N +                    &
+                      MAX( 5*MIN(M,N)*MIN(M,N)+7*MIN(M,N), &
+                           5*MIN(M,N)*MIN(M,N)+5*MIN(M,N), &
+                           2*MAX(M,N)*MIN(M,N)+            &
+                           2*MIN(M,N)*MIN(M,N)+MIN(M,N) ) )
+             IF ( LQUERY ) THEN
+                CALL ZGESDD( 'O', M, N, X, LDX, RWORK, B,LDB,&
+                     W, LDW, ZWORK, -1, RDUMMY, IWORK, INFO1 )
+                LWRSDD = MAX( MWRSDD,INT( ZWORK(1) ))
+                ! Possible bug in ZGESDD optimal workspace size.
+                OLWORK = MAX(OLWORK,LWRSDD)
+             END IF
+         CASE (3)
+             CALL ZGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, &
+                  X, LDX, RWORK, Z, LDZ, W, LDW, NUMRNK,  &
+                  IWORK, -1, ZWORK, -1, RDUMMY, -1, INFO1 )
+             IMINWR = IWORK(1)
+             MWRSVQ = INT(ZWORK(2))
+             MLWORK = MAX(MLWORK,MWRSVQ)
+             MLRWRK  = MAX(MLRWRK,N + INT(RDUMMY(1)))
+             IF ( LQUERY ) THEN
+                LWRSVQ = INT(ZWORK(1))
+                OLWORK = MAX(OLWORK,LWRSVQ)
+             END IF
+         CASE (4)
+             JSVOPT = 'J'
+             CALL ZGEJSV( 'F', 'U', JSVOPT, 'R', 'N', 'P', M, &
+                   N, X, LDX, RWORK, Z, LDZ, W, LDW,       &
+                   ZWORK, -1, RDUMMY, -1, IWORK, INFO1 )
+             IMINWR = IWORK(1)
+             MWRSVJ = INT(ZWORK(2))
+             MLWORK = MAX(MLWORK,MWRSVJ)
+             MLRWRK = MAX(MLRWRK,N + MAX(7,INT(RDUMMY(1))))
+             IF ( LQUERY ) THEN
+                LWRSVJ = INT(ZWORK(1))
+                OLWORK = MAX(OLWORK,LWRSVJ)
+             END IF
+         END SELECT
+         IF ( WNTVEC .OR. WNTEX .OR. LSAME(JOBZ,'F') ) THEN
+             JOBZL = 'V'
+         ELSE
+             JOBZL = 'N'
+         END IF
+         ! Workspace calculation to the ZGEEV call
+         MWRKEV = MAX( 1, 2*N )
+         MLWORK = MAX(MLWORK,MWRKEV)
+         MLRWRK = MAX(MLRWRK,N+2*N)
+         IF ( LQUERY ) THEN
+             CALL ZGEEV( 'N', JOBZL, N, S, LDS, EIGS, &
+              W, LDW, W, LDW, ZWORK, -1, RWORK, INFO1 )
+                LWRKEV = INT(ZWORK(1))
+                OLWORK = MAX( OLWORK, LWRKEV )
+         END IF
+!
+         IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -30
+         IF ( LRWORK < MLRWRK .AND. (.NOT.LQUERY) ) INFO = -28
+         IF ( LZWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -26
+
+      END IF
+!
+      IF( INFO /= 0 ) THEN
+         CALL XERBLA( 'ZGEDMD', -INFO )
+         RETURN
+      ELSE IF ( LQUERY ) THEN
+!     Return minimal and optimal workspace sizes
+          IWORK(1) = IMINWR
+          RWORK(1) = MLRWRK
+          ZWORK(1) = MLWORK
+          ZWORK(2) = OLWORK
+          RETURN
+      END IF
+!............................................................
+!
+      OFL   = DLAMCH('O')
+      SMALL = DLAMCH('S')
+      BADXY = .FALSE.
+!
+!     <1> Optional scaling of the snapshots (columns of X, Y)
+!     ==========================================================
+      IF ( SCCOLX ) THEN
+          ! The columns of X will be normalized.
+          ! To prevent overflows, the column norms of X are
+          ! carefully computed using ZLASSQ.
+          K = 0
+          DO i = 1, N
+            !WORK(i) = DZNRM2( M, X(1,i), 1 )
+            SSUM  = ONE
+            SCALE = ZERO
+            CALL ZLASSQ( M, X(1,i), 1, SCALE, SSUM )
+            IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN
+                K    =  0
+                INFO = -8
+                CALL XERBLA('ZGEDMD',-INFO)
+            END IF
+            IF ( (SCALE /= ZERO) .AND. (SSUM /= ZERO) ) THEN
+               ROOTSC = SQRT(SSUM)
+               IF ( SCALE .GE. (OFL / ROOTSC) ) THEN
+!                 Norm of X(:,i) overflows. First, X(:,i)
+!                 is scaled by
+!                 ( ONE / ROOTSC ) / SCALE = 1/||X(:,i)||_2.
+!                 Next, the norm of X(:,i) is stored without
+!                 overflow as RWORK(i) = - SCALE * (ROOTSC/M),
+!                 the minus sign indicating the 1/M factor.
+!                 Scaling is performed without overflow, and
+!                 underflow may occur in the smallest entries
+!                 of X(:,i). The relative backward and forward
+!                 errors are small in the ell_2 norm.
+                  CALL ZLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, &
+                               M, 1, X(1,i), LDX, INFO2 )
+                  RWORK(i) = - SCALE * ( ROOTSC / DBLE(M) )
+               ELSE
+!                 X(:,i) will be scaled to unit 2-norm
+                  RWORK(i) =   SCALE * ROOTSC
+                  CALL ZLASCL( 'G',0, 0, RWORK(i), ONE, M, 1, &
+                               X(1,i), LDX, INFO2 )      ! LAPACK CALL
+!                 X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i)   ! INTRINSIC
+               END IF
+            ELSE
+               RWORK(i) = ZERO
+               K = K + 1
+            END IF
+          END DO
+          IF ( K == N ) THEN
+             ! All columns of X are zero. Return error code -8.
+             ! (the 8th input variable had an illegal value)
+             K = 0
+             INFO = -8
+             CALL XERBLA('ZGEDMD',-INFO)
+             RETURN
+          END IF
+          DO i = 1, N
+!           Now, apply the same scaling to the columns of Y.
+            IF ( RWORK(i) >  ZERO ) THEN
+                CALL ZDSCAL( M, ONE/RWORK(i), Y(1,i), 1 )  ! BLAS CALL
+!               Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i)       ! INTRINSIC
+            ELSE IF ( RWORK(i) < ZERO ) THEN
+                CALL ZLASCL( 'G', 0, 0, -RWORK(i),          &
+                     ONE/DBLE(M), M, 1, Y(1,i), LDY, INFO2 ) ! LAPACK CALL
+            ELSE IF ( ABS(Y(IZAMAX(M, Y(1,i),1),i ))  &
+                                            /= ZERO ) THEN
+!               X(:,i) is zero vector. For consistency,
+!               Y(:,i) should also be zero. If Y(:,i) is not
+!               zero, then the data might be inconsistent or
+!               corrupted. If JOBS == 'C', Y(:,i) is set to
+!               zero and a warning flag is raised.
+!               The computation continues but the
+!               situation will be reported in the output.
+                BADXY = .TRUE.
+                IF ( LSAME(JOBS,'C')) &
+                CALL ZDSCAL( M, ZERO, Y(1,i), 1 )  ! BLAS CALL
+            END IF
+          END DO
+      END IF
+  !
+      IF ( SCCOLY ) THEN
+          ! The columns of Y will be normalized.
+          ! To prevent overflows, the column norms of Y are
+          ! carefully computed using ZLASSQ.
+          DO i = 1, N
+            !RWORK(i) = DZNRM2( M, Y(1,i), 1 )
+            SSUM  = ONE
+            SCALE = ZERO
+            CALL ZLASSQ( M, Y(1,i), 1, SCALE, SSUM )
+            IF ( DISNAN(SCALE) .OR. DISNAN(SSUM) ) THEN
+                K    =  0
+                INFO = -10
+                CALL XERBLA('ZGEDMD',-INFO)
+            END IF
+            IF ( SCALE /= ZERO  .AND. (SSUM /= ZERO) ) THEN
+               ROOTSC = SQRT(SSUM)
+               IF ( SCALE .GE. (OFL / ROOTSC) ) THEN
+!                 Norm of Y(:,i) overflows. First, Y(:,i)
+!                 is scaled by
+!                 ( ONE / ROOTSC ) / SCALE = 1/||Y(:,i)||_2.
+!                 Next, the norm of Y(:,i) is stored without
+!                 overflow as RWORK(i) = - SCALE * (ROOTSC/M),
+!                 the minus sign indicating the 1/M factor.
+!                 Scaling is performed without overflow, and
+!                 underflow may occur in the smallest entries
+!                 of Y(:,i). The relative backward and forward
+!                 errors are small in the ell_2 norm.
+                  CALL ZLASCL( 'G', 0, 0, SCALE, ONE/ROOTSC, &
+                               M, 1, Y(1,i), LDY, INFO2 )
+                  RWORK(i) = - SCALE * ( ROOTSC / DBLE(M) )
+               ELSE
+!                 Y(:,i) will be scaled to unit 2-norm
+                  RWORK(i) =   SCALE * ROOTSC
+                  CALL ZLASCL( 'G',0, 0, RWORK(i), ONE, M, 1, &
+                               Y(1,i), LDY, INFO2 )             ! LAPACK CALL
+!                 Y(1:M,i) = (ONE/RWORK(i)) * Y(1:M,i)          ! INTRINSIC
+               END IF
+            ELSE
+               RWORK(i) = ZERO
+            END IF
+         END DO
+         DO i = 1, N
+!           Now, apply the same scaling to the columns of X.
+            IF ( RWORK(i) >  ZERO ) THEN
+                CALL ZDSCAL( M, ONE/RWORK(i), X(1,i), 1 ) ! BLAS CALL
+!               X(1:M,i) = (ONE/RWORK(i)) * X(1:M,i)      ! INTRINSIC
+            ELSE IF ( RWORK(i) < ZERO ) THEN
+                CALL ZLASCL( 'G', 0, 0, -RWORK(i),          &
+                     ONE/DBLE(M), M, 1, X(1,i), LDX, INFO2 ) ! LAPACK CALL
+            ELSE IF ( ABS(X(IZAMAX(M, X(1,i),1),i ))  &
+                                           /= ZERO ) THEN
+!               Y(:,i) is zero vector.  If X(:,i) is not
+!               zero, then a warning flag is raised.
+!               The computation continues but the
+!               situation will be reported in the output.
+                BADXY = .TRUE.
+            END IF
+         END DO
+       END IF
+!
+!     <2> SVD of the data snapshot matrix X.
+!     =====================================
+!     The left singular vectors are stored in the array X.
+!     The right singular vectors are in the array W.
+!     The array W will later on contain the eigenvectors
+!     of a Rayleigh quotient.
+      NUMRNK = N
+      SELECT CASE ( WHTSVD )
+         CASE (1)
+             CALL ZGESVD( 'O', 'S', M, N, X, LDX, RWORK, B, &
+                  LDB, W, LDW, ZWORK, LZWORK,  RWORK(N+1), INFO1 ) ! LAPACK CALL
+             T_OR_N = 'C'
+         CASE (2)
+            CALL ZGESDD( 'O', M, N, X, LDX, RWORK, B, LDB, W, &
+                 LDW, ZWORK, LZWORK, RWORK(N+1), IWORK, INFO1 )   ! LAPACK CALL
+            T_OR_N = 'C'
+         CASE (3)
+              CALL ZGESVDQ( 'H', 'P', 'N', 'R', 'R', M, N, &
+                   X, LDX, RWORK, Z, LDZ, W, LDW, &
+                   NUMRNK, IWORK, LIWORK, ZWORK,     &
+                   LZWORK, RWORK(N+1), LRWORK-N, INFO1)     ! LAPACK CALL
+              CALL ZLACPY( 'A', M, NUMRNK, Z, LDZ, X, LDX )   ! LAPACK CALL
+         T_OR_N = 'C'
+         CASE (4)
+              CALL ZGEJSV( 'F', 'U', JSVOPT, 'R', 'N', 'P', M, &
+                   N, X, LDX, RWORK, Z, LDZ, W, LDW, &
+                   ZWORK, LZWORK, RWORK(N+1), LRWORK-N, IWORK, INFO1 )    ! LAPACK CALL
+              CALL ZLACPY( 'A', M, N, Z, LDZ, X, LDX )   ! LAPACK CALL
+              T_OR_N = 'N'
+              XSCL1 = RWORK(N+1)
+              XSCL2 = RWORK(N+2)
+              IF ( XSCL1 /=  XSCL2 ) THEN
+                 ! This is an exceptional situation that can
+                 ! arise if the data matrices are not scaled and
+                 ! the largest singular value of X overflows.
+                 ! In that case ZGEJSV can return the SVD
+                 ! in scaled form. The scaling factor can be used
+                 ! to rescale the data (X and Y).
+                 CALL ZLASCL( 'G', 0, 0, XSCL1, XSCL2, M, N, Y, LDY, INFO2  )
+              END IF
+      END SELECT
+!
+      IF ( INFO1 > 0 ) THEN
+         ! The SVD selected subroutine did not converge.
+         ! Return with an error code.
+         INFO = 2
+         RETURN
+      END IF
+!
+      IF ( RWORK(1) == ZERO ) THEN
+          ! The largest computed singular value of (scaled)
+          ! X is zero. Return error code -8
+          ! (the 8th input variable had an illegal value).
+          K = 0
+          INFO = -8
+          CALL XERBLA('ZGEDMD',-INFO)
+          RETURN
+      END IF
+!
+      !<3> Determine the numerical rank of the data
+      !    snapshots matrix X. This depends on the
+      !    parameters NRNK and TOL.
+
+      SELECT CASE ( NRNK )
+          CASE ( -1 )
+               K = 1
+               DO i = 2, NUMRNK
+                 IF ( ( RWORK(i) <= RWORK(1)*TOL ) .OR. &
+                      ( RWORK(i) <= SMALL ) ) EXIT
+                 K = K + 1
+               END DO
+          CASE ( -2 )
+               K = 1
+               DO i = 1, NUMRNK-1
+                 IF ( ( RWORK(i+1) <= RWORK(i)*TOL  ) .OR. &
+                      ( RWORK(i) <= SMALL ) ) EXIT
+                 K = K + 1
+               END DO
+          CASE DEFAULT
+               K = 1
+               DO i = 2, NRNK
+                  IF ( RWORK(i) <= SMALL ) EXIT
+                  K = K + 1
+               END DO
+          END SELECT
+      !   Now, U = X(1:M,1:K) is the SVD/POD basis for the
+      !   snapshot data in the input matrix X.
+
+      !<4> Compute the Rayleigh quotient S = U^H * A * U.
+      !    Depending on the requested outputs, the computation
+      !    is organized to compute additional auxiliary
+      !    matrices (for the residuals and refinements).
+      !
+      !    In all formulas below, we need V_k*Sigma_k^(-1)
+      !    where either V_k is in W(1:N,1:K), or V_k^H is in
+      !    W(1:K,1:N). Here Sigma_k=diag(RWORK(1:K)).
+      IF ( LSAME(T_OR_N, 'N') ) THEN
+          DO i = 1, K
+           CALL ZDSCAL( N, ONE/RWORK(i), W(1,i), 1 )    ! BLAS CALL
+           ! W(1:N,i) = (ONE/RWORK(i)) * W(1:N,i)      ! INTRINSIC
+          END DO
+      ELSE
+          ! This non-unit stride access is due to the fact
+          ! that ZGESVD, ZGESVDQ and ZGESDD return the
+          ! adjoint matrix of the right singular vectors.
+          !DO i = 1, K
+          ! CALL ZDSCAL( N, ONE/RWORK(i), W(i,1), LDW )    ! BLAS CALL
+          ! ! W(i,1:N) = (ONE/RWORK(i)) * W(i,1:N)      ! INTRINSIC
+          !END DO
+          DO i = 1, K
+              RWORK(N+i) = ONE/RWORK(i)
+          END DO
+          DO j = 1, N
+             DO i = 1, K
+                 W(i,j) = CMPLX(RWORK(N+i),ZERO,KIND=WP)*W(i,j)
+             END DO
+          END DO
+      END IF
+!
+      IF ( WNTREF ) THEN
+         !
+         ! Need A*U(:,1:K)=Y*V_k*inv(diag(RWORK(1:K)))
+         ! for computing the refined Ritz vectors
+         ! (optionally, outside ZGEDMD).
+          CALL ZGEMM( 'N', T_OR_N, M, K, N, ZONE, Y, LDY, W, &
+                      LDW, ZZERO, Z, LDZ )                       ! BLAS CALL
+          ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),TRANSPOSE(CONJG(W(1:K,1:N)))) ! INTRINSIC, for T_OR_N=='C'
+          ! Z(1:M,1:K)=MATMUL(Y(1:M,1:N),W(1:N,1:K))                   ! INTRINSIC, for T_OR_N=='N'
+          !
+          ! At this point Z contains
+          ! A * U(:,1:K) = Y * V_k * Sigma_k^(-1), and
+          ! this is needed for computing the residuals.
+          ! This matrix is  returned in the array B and
+          ! it can be used to compute refined Ritz vectors.
+          CALL ZLACPY( 'A', M, K, Z, LDZ, B, LDB )   ! BLAS CALL
+          ! B(1:M,1:K) = Z(1:M,1:K)                  ! INTRINSIC
+
+          CALL ZGEMM( 'C', 'N', K, K, M, ZONE, X, LDX, Z, &
+                      LDZ, ZZERO, S, LDS )                        ! BLAS CALL
+          ! S(1:K,1:K) = MATMUL(TRANSPOSE(CONJG(X(1:M,1:K))),Z(1:M,1:K)) ! INTRINSIC
+          ! At this point S = U^H * A * U is the Rayleigh quotient.
+      ELSE
+        ! A * U(:,1:K) is not explicitly needed and the
+        ! computation is organized differently. The Rayleigh
+        ! quotient is computed more efficiently.
+        CALL ZGEMM( 'C', 'N', K, N, M, ZONE, X, LDX, Y, LDY, &
+                   ZZERO, Z, LDZ )                                         ! BLAS CALL
+        ! Z(1:K,1:N) = MATMUL( TRANSPOSE(CONJG(X(1:M,1:K))), Y(1:M,1:N) )  ! INTRINSIC
+        !
+        CALL ZGEMM( 'N', T_OR_N, K, K, N, ZONE, Z, LDZ, W, &
+                    LDW, ZZERO, S, LDS )                         ! BLAS CALL
+        ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),TRANSPOSE(CONJG(W(1:K,1:N)))) ! INTRINSIC, for T_OR_N=='C'
+        ! S(1:K,1:K) = MATMUL(Z(1:K,1:N),(W(1:N,1:K)))                 ! INTRINSIC, for T_OR_N=='N'
+        ! At this point S = U^H * A * U is the Rayleigh quotient.
+        ! If the residuals are requested, save scaled V_k into Z.
+        ! Recall that V_k or V_k^H is stored in W.
+        IF ( WNTRES .OR. WNTEX ) THEN
+          IF ( LSAME(T_OR_N, 'N') ) THEN
+              CALL ZLACPY( 'A', N, K, W, LDW, Z, LDZ )
+          ELSE
+              CALL ZLACPY( 'A', K, N, W, LDW, Z, LDZ )
+          END IF
+        END IF
+      END IF
+!
+      !<5> Compute the Ritz values and (if requested) the
+      !   right eigenvectors of the Rayleigh quotient.
+      !
+      CALL ZGEEV( 'N', JOBZL, K, S, LDS, EIGS, W, LDW, &
+            W, LDW, ZWORK, LZWORK, RWORK(N+1), INFO1 )  ! LAPACK CALL
+      !
+      ! W(1:K,1:K) contains the eigenvectors of the Rayleigh
+      ! quotient.  See the description of Z.
+      ! Also, see the description of ZGEEV.
+      IF ( INFO1 > 0 ) THEN
+         ! ZGEEV failed to compute the eigenvalues and
+         ! eigenvectors of the Rayleigh quotient.
+         INFO = 3
+         RETURN
+      END IF
+!
+      ! <6> Compute the eigenvectors (if requested) and,
+      ! the residuals (if requested).
+      !
+      IF ( WNTVEC .OR. WNTEX ) THEN
+      IF ( WNTRES ) THEN
+          IF ( WNTREF ) THEN
+            ! Here, if the refinement is requested, we have
+            ! A*U(:,1:K) already computed and stored in Z.
+            ! For the residuals, need Y = A * U(:,1:K) * W.
+            CALL ZGEMM( 'N', 'N', M, K, K, ZONE, Z, LDZ, W, &
+                       LDW, ZZERO, Y, LDY )               ! BLAS CALL
+            ! Y(1:M,1:K) = MATMUL(Z(1:M,1:K), W(1:K,1:K)) ! INTRINSIC
+            ! This frees Z; Y contains A * U(:,1:K) * W.
+          ELSE
+            ! Compute S = V_k * Sigma_k^(-1) * W, where
+            ! V_k * Sigma_k^(-1) (or its adjoint) is stored in Z
+            CALL ZGEMM( T_OR_N, 'N', N, K, K, ZONE, Z, LDZ, &
+                       W, LDW, ZZERO, S, LDS )
+            ! Then, compute Z = Y * S =
+            ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) =
+            ! = A * U(:,1:K) * W(1:K,1:K)
+            CALL ZGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, &
+                       LDS, ZZERO, Z, LDZ )
+            ! Save a copy of Z into Y and free Z for holding
+            ! the Ritz vectors.
+            CALL ZLACPY( 'A', M, K, Z, LDZ, Y, LDY )
+            IF ( WNTEX ) CALL ZLACPY( 'A', M, K, Z, LDZ, B, LDB )
+          END IF
+      ELSE IF ( WNTEX ) THEN
+          ! Compute S = V_k * Sigma_k^(-1) * W, where
+            ! V_k * Sigma_k^(-1) is stored in Z
+            CALL ZGEMM( T_OR_N, 'N', N, K, K, ZONE, Z, LDZ, &
+                       W, LDW, ZZERO, S, LDS )
+            ! Then, compute Z = Y * S =
+            ! = Y * V_k * Sigma_k^(-1) * W(1:K,1:K) =
+            ! = A * U(:,1:K) * W(1:K,1:K)
+            CALL ZGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, &
+                       LDS, ZZERO, B, LDB )
+            ! The above call replaces the following two calls
+            ! that were used in the developing-testing phase.
+            ! CALL ZGEMM( 'N', 'N', M, K, N, ZONE, Y, LDY, S, &
+            !           LDS, ZZERO, Z, LDZ)
+            ! Save a copy of Z into B and free Z for holding
+            ! the Ritz vectors.
+            ! CALL ZLACPY( 'A', M, K, Z, LDZ, B, LDB )
+      END IF
+!
+      ! Compute the Ritz vectors
+      IF ( WNTVEC ) CALL ZGEMM( 'N', 'N', M, K, K, ZONE, X, LDX, W, LDW, &
+                   ZZERO, Z, LDZ )                          ! BLAS CALL
+      ! Z(1:M,1:K) = MATMUL(X(1:M,1:K), W(1:K,1:K))         ! INTRINSIC
+!
+      IF ( WNTRES ) THEN
+         DO i = 1, K
+            CALL ZAXPY( M, -EIGS(i), Z(1,i), 1, Y(1,i), 1 )       ! BLAS CALL
+            ! Y(1:M,i) = Y(1:M,i) - EIGS(i) * Z(1:M,i)            ! INTRINSIC
+            RES(i) = DZNRM2( M, Y(1,i), 1 )                       ! BLAS CALL
+         END DO
+      END IF
+      END IF
+!
+      IF ( WHTSVD == 4 ) THEN
+          RWORK(N+1) = XSCL1
+          RWORK(N+2) = XSCL2
+      END IF
+!
+!     Successful exit.
+      IF ( .NOT. BADXY ) THEN
+         INFO = 0
+      ELSE
+         ! A warning on possible data inconsistency.
+         ! This should be a rare event.
+         INFO = 4
+      END IF
+!............................................................
+      RETURN
+!     ......
+      END SUBROUTINE ZGEDMD
+
diff --git a/lapack-netlib/SRC/zgedmdq.c b/lapack-netlib/SRC/zgedmdq.c
new file mode 100644
index 000000000..1815f0814
--- /dev/null
+++ b/lapack-netlib/SRC/zgedmdq.c
@@ -0,0 +1,1293 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) (ceil(w))	/* plain expression (no braces) so the result can be assigned or compared */
+#define myhuge_(w) (HUGE_VAL)	/* the argument is ignored; expands to the HUGE_VAL constant of <math.h> */
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+/* x**n for an integer exponent (square-and-multiply); n < 0 gives (1/x)**|n|. */
+static float spow_ui(float x, integer n) {
+	float acc = 1.0; unsigned long int bits;
+	if (n == 0) return acc;	/* x**0 is 1 for every x */
+	if (n < 0) x = 1/x;	/* fold the sign of n into the base */
+	bits = n < 0 ? 0UL - (unsigned long int)n : (unsigned long int)n;	/* |n| without the signed overflow of n = -n */
+	for (;;) {
+		if (bits & 01) acc *= x;	/* low bit set: the current power of x contributes */
+		if ((bits >>= 1) == 0) return acc;	/* no bits left: done before squaring again */
+		x *= x;
+	}
+}
+/* x**n for an integer exponent (square-and-multiply); n < 0 gives (1/x)**|n|. */
+static double dpow_ui(double x, integer n) {
+	double acc = 1.0; unsigned long int bits;
+	if (n == 0) return acc;	/* x**0 is 1 for every x */
+	if (n < 0) x = 1/x;	/* fold the sign of n into the base */
+	bits = n < 0 ? 0UL - (unsigned long int)n : (unsigned long int)n;	/* |n| without the signed overflow of n = -n */
+	for (;;) {
+		if (bits & 01) acc *= x;	/* low bit set: the current power of x contributes */
+		if ((bits >>= 1) == 0) return acc;	/* no bits left: done before squaring again */
+		x *= x;
+	}
+}
+/* Complex x**n for an integer exponent (square-and-multiply); n < 0 gives (1/x)**|n|. */
+/* The _MSC_VER variant forms every complex product and the reciprocal from the parts explicitly. */
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) {
+	complex acc = {1.0, 0.0}, t; real d; unsigned long int bits;
+	bits = n < 0 ? 0UL - (unsigned long int)n : (unsigned long int)n;	/* |n| without signed overflow */
+	if (n < 0) {	/* 1/x = conj(x)/|x|^2, not (1/re, 1/im) */
+		d = x.r*x.r + x.i*x.i; x.r = x.r/d; x.i = -x.i/d;
+	}
+	for ( ; bits != 0; bits >>= 1) {
+		if (bits & 01) { t = acc; acc.r = t.r*x.r - t.i*x.i; acc.i = t.r*x.i + t.i*x.r; }	/* acc *= x */
+		if (bits > 1) { t = x; x.r = t.r*t.r - t.i*t.i; x.i = 2*t.r*t.i; }	/* x *= x, only if bits remain */
+	}
+	_Fcomplex p = {acc.r, acc.i};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) {
+	_Complex float acc = 1.0; unsigned long int bits;
+	bits = n < 0 ? 0UL - (unsigned long int)n : (unsigned long int)n;	/* |n| without signed overflow */
+	if (n < 0) x = 1/x;
+	for ( ; bits != 0; bits >>= 1) {
+		if (bits & 01) acc *= x;
+		if (bits > 1) x *= x;	/* skip the final, unused squaring */
+	}
+	return acc;
+}
+#endif
+/* Double-complex x**n for an integer exponent (square-and-multiply); n < 0 gives (1/x)**|n|. */
+/* The _MSC_VER variant forms every complex product and the reciprocal from the parts explicitly. */
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
+	double xr = x._Val[0], xi = x._Val[1], ar = 1.0, ai = 0.0, t, d; unsigned long int bits;
+	bits = n < 0 ? 0UL - (unsigned long int)n : (unsigned long int)n;	/* |n| without signed overflow */
+	if (n < 0) {	/* 1/x = conj(x)/|x|^2, not (1/re, 1/im) */
+		d = xr*xr + xi*xi; xr = xr/d; xi = -xi/d;
+	}
+	for ( ; bits != 0; bits >>= 1) {
+		if (bits & 01) { t = ar; ar = t*xr - ai*xi; ai = t*xi + ai*xr; }	/* acc *= x */
+		if (bits > 1) { t = xr; xr = t*t - xi*xi; xi = 2*t*xi; }	/* x *= x, only if bits remain */
+	}
+	_Dcomplex p = {ar, ai};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) {
+	_Complex double acc = 1.0; unsigned long int bits;
+	bits = n < 0 ? 0UL - (unsigned long int)n : (unsigned long int)n;	/* |n| without signed overflow */
+	if (n < 0) x = 1/x;
+	for ( ; bits != 0; bits >>= 1) {
+		if (bits & 01) acc *= x;
+		if (bits > 1) x *= x;	/* skip the final, unused squaring */
+	}
+	return acc;
+}
+#endif
+/* Integer power x**n; for n < 0 the result is 1 (x == 1), +/-1 (x == -1) or 0 (any other nonzero x). */
+static integer pow_ii(integer x, integer n) {
+	integer acc = 1; unsigned long int bits;
+	if (n == 0) return 1;	/* x**0 */
+	if (n < 0) {
+		if (x == 1) return 1;	/* 1**n */
+		/* x == 0 deliberately evaluates 1/x here, exactly as before */
+		if (x != -1) return x == 0 ? 1/x : 0;
+		n = -n;	/* x == -1: continue with the positive exponent */
+	}
+	for (bits = n; ; ) {
+		if (bits & 01) acc *= x;
+		if ((bits >>= 1) == 0) break;
+		x *= x;
+	}
+	return acc;
+}
+/* Position (1-based, counted from s) of the first largest entry among w[s-1] .. w[e-1]; n is not used. */
+static integer dmaxloc_(double *w, integer s, integer e, integer *n) {
+	double top = w[s-1]; integer pos, best = s;
+	for (pos = s + 1; pos <= e; ++pos)
+		if (w[pos-1] > top) { best = pos; top = w[pos-1]; }	/* strict '>' keeps the first maximum */
+	return best - s + 1;
+}
+/* Position (1-based, counted from s) of the first largest entry among w[s-1] .. w[e-1]; n is not used. */
+static integer smaxloc_(float *w, integer s, integer e, integer *n) {
+	float top = w[s-1]; integer pos, best = s;
+	for (pos = s + 1; pos <= e; ++pos)
+		if (w[pos-1] > top) { best = pos; top = w[pos-1]; }	/* strict '>' keeps the first maximum */
+	return best - s + 1;
+}
+/* Conjugated dot product: *z = sum over i of conj(x[i*incx]) * y[i*incy], i = 0..n-1. */
+/* Indexing always starts at element 0, so the increments are assumed non-negative */
+/* (unlike reference BLAS, no start offset is applied for negative strides). */
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	/* Accumulate the real and imaginary parts explicitly: */
+	/* conj(a)*b = (a.r*b.r + a.i*b.i) + i*(a.r*b.i - a.i*b.r). */
+	real re = 0.0f, im = 0.0f;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) {
+			re += x[i].r * y[i].r + x[i].i * y[i].i;
+			im += x[i].r * y[i].i - x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) {
+			re += x[i*incx].r * y[i*incy].r + x[i*incx].i * y[i*incy].i;
+			im += x[i*incx].r * y[i*incy].i - x[i*incx].i * y[i*incy].r;
+		}
+	}
+	z->r = re; z->i = im;
+#else
+	_Complex float acc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) acc += conjf(Cf(&x[i])) * Cf(&y[i]);	/* contiguous vectors */
+	} else {
+		for (i=0;i<n;i++) acc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+	}
+	pCf(z) = acc;
+#endif
+}
+/* Conjugated dot product: *z = sum over i of conj(x[i*incx]) * y[i*incy], i = 0..n-1. */
+/* Indexing always starts at element 0, so the increments are assumed non-negative */
+/* (unlike reference BLAS, no start offset is applied for negative strides). */
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	/* Accumulate the real and imaginary parts explicitly: */
+	/* conj(a)*b = (a.r*b.r + a.i*b.i) + i*(a.r*b.i - a.i*b.r). */
+	doublereal re = 0.0, im = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) {
+			re += x[i].r * y[i].r + x[i].i * y[i].i;
+			im += x[i].r * y[i].i - x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) {
+			re += x[i*incx].r * y[i*incy].r + x[i*incx].i * y[i*incy].i;
+			im += x[i*incx].r * y[i*incy].i - x[i*incx].i * y[i*incy].r;
+		}
+	}
+	z->r = re; z->i = im;
+#else
+	_Complex double acc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) acc += conj(Cd(&x[i])) * Cd(&y[i]);	/* contiguous vectors */
+	} else {
+		for (i=0;i<n;i++) acc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+	}
+	pCd(z) = acc;
+#endif
+}
+/* Unconjugated dot product: *z = sum over i of x[i*incx] * y[i*incy], i = 0..n-1. */
+/* Indexing always starts at element 0, so the increments are assumed non-negative */
+/* (unlike reference BLAS, no start offset is applied for negative strides). */
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	/* Accumulate the real and imaginary parts explicitly: */
+	/* a*b = (a.r*b.r - a.i*b.i) + i*(a.r*b.i + a.i*b.r). */
+	real re = 0.0f, im = 0.0f;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) {
+			re += x[i].r * y[i].r - x[i].i * y[i].i;
+			im += x[i].r * y[i].i + x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) {
+			re += x[i*incx].r * y[i*incy].r - x[i*incx].i * y[i*incy].i;
+			im += x[i*incx].r * y[i*incy].i + x[i*incx].i * y[i*incy].r;
+		}
+	}
+	z->r = re; z->i = im;
+#else
+	_Complex float acc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) acc += Cf(&x[i]) * Cf(&y[i]);	/* contiguous vectors, no conjugation */
+	} else {
+		for (i=0;i<n;i++) acc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+	}
+	pCf(z) = acc;
+#endif
+}
+/* Unconjugated dot product: *z = sum over i of x[i*incx] * y[i*incy], i = 0..n-1. */
+/* Indexing always starts at element 0, so the increments are assumed non-negative */
+/* (unlike reference BLAS, no start offset is applied for negative strides). */
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	/* Accumulate the real and imaginary parts explicitly: */
+	/* a*b = (a.r*b.r - a.i*b.i) + i*(a.r*b.i + a.i*b.r). */
+	doublereal re = 0.0, im = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) {
+			re += x[i].r * y[i].r - x[i].i * y[i].i;
+			im += x[i].r * y[i].i + x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) {
+			re += x[i*incx].r * y[i*incy].r - x[i*incx].i * y[i*incy].i;
+			im += x[i*incx].r * y[i*incy].i + x[i*incx].i * y[i*incy].r;
+		}
+	}
+	z->r = re; z->i = im;
+#else
+	_Complex double acc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) acc += Cd(&x[i]) * Cd(&y[i]);	/* contiguous vectors, no conjugation */
+	} else {
+		for (i=0;i<n;i++) acc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+	}
+	pCd(z) = acc;
+#endif
+}
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static integer c_n1 = -1;
+
+/* Subroutine */ int zgedmdq_(char *jobs, char *jobz, char *jobr, char *jobq, 
+	char *jobt, char *jobf, integer *whtsvd, integer *m, integer *n, 
+	doublecomplex *f, integer *ldf, doublecomplex *x, integer *ldx, 
+	doublecomplex *y, integer *ldy, integer *nrnk, doublereal *tol, 
+	integer *k, doublecomplex *eigs, doublecomplex *z__, integer *ldz, 
+	doublereal *res, doublecomplex *b, integer *ldb, doublecomplex *v, 
+	integer *ldv, doublecomplex *s, integer *lds, doublecomplex *zwork, 
+	integer *lzwork, doublereal *work, integer *lwork, integer *iwork, 
+	integer *liwork, integer *info)
+{
+    /* System generated locals */
+    integer f_dim1, f_offset, x_dim1, x_offset, y_dim1, y_offset, z_dim1, 
+	    z_offset, b_dim1, b_offset, v_dim1, v_offset, s_dim1, s_offset, 
+	    i__1, i__2;
+
+    /* Local variables */
+    doublereal zero;
+    integer info1;
+    extern logical lsame_(char *, char *);
+    char jobvl[1];
+    integer minmn;
+    logical wantq;
+    integer mlwqr, olwqr;
+    logical wntex;
+    doublecomplex zzero;
+    extern /* Subroutine */ int zgedmd_(char *, char *, char *, char *, 
+	    integer *, integer *, integer *, doublecomplex *, integer *, 
+	    doublecomplex *, integer *, integer *, doublereal *, integer *, 
+	    doublecomplex *, doublecomplex *, integer *, doublereal *, 
+	    doublecomplex *, integer *, doublecomplex *, integer *, 
+	    doublecomplex *, integer *, doublecomplex *, integer *, 
+	    doublereal *, integer *, integer *, integer *, integer *), xerbla_(char *, integer *);
+    integer mlwdmd, olwdmd;
+    logical sccolx, sccoly;
+    extern /* Subroutine */ int zgeqrf_(integer *, integer *, doublecomplex *,
+	     integer *, doublecomplex *, doublecomplex *, integer *, integer *
+	    ), zlacpy_(char *, integer *, integer *, doublecomplex *, integer 
+	    *, doublecomplex *, integer *), zlaset_(char *, integer *,
+	     integer *, doublecomplex *, doublecomplex *, doublecomplex *, 
+	    integer *);
+    integer iminwr;
+    logical wntvec, wntvcf;
+    integer mlwgqr;
+    logical wntref;
+    integer mlwork, olwgqr, olwork, mlrwrk, mlwmqr, olwmqr;
+    logical lquery, wntres, wnttrf, wntvcq;
+    extern /* Subroutine */ int zungqr_(integer *, integer *, integer *, 
+	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
+	    integer *, integer *), zunmqr_(char *, char *, integer *, integer 
+	    *, integer *, doublecomplex *, integer *, doublecomplex *, 
+	    doublecomplex *, integer *, doublecomplex *, integer *, integer *);
+    doublereal one;
+
+/* March 2023 */
+/* ..... */
+/*      USE                   iso_fortran_env */
+/*      INTEGER, PARAMETER :: WP = real64 */
+/* ..... */
+/*     Scalar arguments */
+/*     Array arguments */
+/* ..... */
+/*     Purpose */
+/*     ======= */
+/*     ZGEDMDQ computes the Dynamic Mode Decomposition (DMD) for */
+/*     a pair of data snapshot matrices, using a QR factorization */
+/*     based compression of the data. For the input matrices */
+/*     X and Y such that Y = A*X with an unaccessible matrix */
+/*     A, ZGEDMDQ computes a certain number of Ritz pairs of A using */
+/*     the standard Rayleigh-Ritz extraction from a subspace of */
+/*     range(X) that is determined using the leading left singular */
+/*     vectors of X. Optionally, ZGEDMDQ returns the residuals */
+/*     of the computed Ritz pairs, the information needed for */
+/*     a refinement of the Ritz vectors, or the eigenvectors of */
+/*     the Exact DMD. */
+/*     For further details see the references listed */
+/*     below. For more details of the implementation see [3]. */
+
+/*     References */
+/*     ========== */
+/*     [1] P. Schmid: Dynamic mode decomposition of numerical */
+/*         and experimental data, */
+/*         Journal of Fluid Mechanics 656, 5-28, 2010. */
+/*     [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal */
+/*         decompositions: analysis and enhancements, */
+/*         SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018. */
+/*     [3] Z. Drmac: A LAPACK implementation of the Dynamic */
+/*         Mode Decomposition I. Technical report. AIMDyn Inc. */
+/*         and LAPACK Working Note 298. */
+/*     [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. */
+/*         Brunton, N. Kutz: On Dynamic Mode Decomposition: */
+/*         Theory and Applications, Journal of Computational */
+/*         Dynamics 1(2), 391 -421, 2014. */
+
+/*     Developed and supported by: */
+/*     =========================== */
+/*     Developed and coded by Zlatko Drmac, Faculty of Science, */
+/*     University of Zagreb;  drmac@math.hr */
+/*     In cooperation with */
+/*     AIMdyn Inc., Santa Barbara, CA. */
+/*     and supported by */
+/*     - DARPA SBIR project "Koopman Operator-Based Forecasting */
+/*     for Nonstationary Processes from Near-Term, Limited */
+/*     Observational Data" Contract No: W31P4Q-21-C-0007 */
+/*     - DARPA PAI project "Physics-Informed Machine Learning */
+/*     Methodologies" Contract No: HR0011-18-9-0033 */
+/*     - DARPA MoDyL project "A Data-Driven, Operator-Theoretic */
+/*     Framework for Space-Time Analysis of Process Dynamics" */
+/*     Contract No: HR0011-16-C-0116 */
+/*     Any opinions, findings and conclusions or recommendations */
+/*     expressed in this material are those of the author and */
+/*     do not necessarily reflect the views of the DARPA SBIR */
+/*     Program Office. */
+/* ============================================================ */
+/*     Distribution Statement A: */
+/*     Approved for Public Release, Distribution Unlimited. */
+/*     Cleared by DARPA on September 29, 2022 */
+/* ============================================================ */
+/* ...................................................................... */
+/*     Arguments */
+/*     ========= */
+/*     JOBS (input) CHARACTER*1 */
+/*     Determines whether the initial data snapshots are scaled */
+/*     by a diagonal matrix. The data snapshots are the columns */
+/*     of F. The leading N-1 columns of F are denoted X and the */
+/*     trailing N-1 columns are denoted Y. */
+/*     'S' :: The data snapshots matrices X and Y are multiplied */
+/*            with a diagonal matrix D so that X*D has unit */
+/*            nonzero columns (in the Euclidean 2-norm) */
+/*     'C' :: The snapshots are scaled as with the 'S' option. */
+/*            If it is found that an i-th column of X is zero */
+/*            vector and the corresponding i-th column of Y is */
+/*            non-zero, then the i-th column of Y is set to */
+/*            zero and a warning flag is raised. */
+/*     'Y' :: The data snapshots matrices X and Y are multiplied */
+/*            by a diagonal matrix D so that Y*D has unit */
+/*            nonzero columns (in the Euclidean 2-norm) */
+/*     'N' :: No data scaling. */
+/* ..... */
+/*     JOBZ (input) CHARACTER*1 */
+/*     Determines whether the eigenvectors (Koopman modes) will */
+/*     be computed. */
+/*     'V' :: The eigenvectors (Koopman modes) will be computed */
+/*            and returned in the matrix Z. */
+/*            See the description of Z. */
+/*     'F' :: The eigenvectors (Koopman modes) will be returned */
+/*            in factored form as the product Z*V, where Z */
+/*            is orthonormal and V contains the eigenvectors */
+/*            of the corresponding Rayleigh quotient. */
+/*            See the descriptions of F, V, Z. */
+/*     'Q' :: The eigenvectors (Koopman modes) will be returned */
+/*            in factored form as the product Q*Z, where Z */
+/*            contains the eigenvectors of the compression of the */
+/*            underlying discretized operator onto the span of */
+/*            the data snapshots. See the descriptions of F, V, Z. */
+/*            Q is from the initial QR factorization. */
+/*     'N' :: The eigenvectors are not computed. */
+/* ..... */
+/*     JOBR (input) CHARACTER*1 */
+/*     Determines whether to compute the residuals. */
+/*     'R' :: The residuals for the computed eigenpairs will */
+/*            be computed and stored in the array RES. */
+/*            See the description of RES. */
+/*            For this option to be legal, JOBZ must be 'V'. */
+/*     'N' :: The residuals are not computed. */
+/* ..... */
+/*     JOBQ (input) CHARACTER*1 */
+/*     Specifies whether to explicitly compute and return the */
+/*     unitary matrix from the QR factorization. */
+/*     'Q' :: The matrix Q of the QR factorization of the data */
+/*            snapshot matrix is computed and stored in the */
+/*            array F. See the description of F. */
+/*     'N' :: The matrix Q is not explicitly computed. */
+/* ..... */
+/*     JOBT (input) CHARACTER*1 */
+/*     Specifies whether to return the upper triangular factor */
+/*     from the QR factorization. */
+/*     'R' :: The matrix R of the QR factorization of the data */
+/*            snapshot matrix F is returned in the array Y. */
+/*            See the description of Y and Further details. */
+/*     'N' :: The matrix R is not returned. */
+/* ..... */
+/*     JOBF (input) CHARACTER*1 */
+/*     Specifies whether to store information needed for post- */
+/*     processing (e.g. computing refined Ritz vectors) */
+/*     'R' :: The matrix needed for the refinement of the Ritz */
+/*            vectors is computed and stored in the array B. */
+/*            See the description of B. */
+/*     'E' :: The unscaled eigenvectors of the Exact DMD are */
+/*            computed and returned in the array B. See the */
+/*            description of B. */
+/*     'N' :: No eigenvector refinement data is computed. */
+/*     To be useful on exit, this option needs JOBQ='Q'. */
+/* ..... */
+/*     WHTSVD (input) INTEGER, WHTSVD in { 1, 2, 3, 4 } */
+/*     Allows for a selection of the SVD algorithm from the */
+/*     LAPACK library. */
+/*     1 :: ZGESVD (the QR SVD algorithm) */
+/*     2 :: ZGESDD (the Divide and Conquer algorithm; if enough */
+/*          workspace available, this is the fastest option) */
+/*     3 :: ZGESVDQ (the preconditioned QR SVD  ; this and 4 */
+/*          are the most accurate options) */
+/*     4 :: ZGEJSV (the preconditioned Jacobi SVD; this and 3 */
+/*          are the most accurate options) */
+/*     For the four methods above, a significant difference in */
+/*     the accuracy of small singular values is possible if */
+/*     the snapshots vary in norm so that X is severely */
+/*     ill-conditioned. If small (smaller than EPS*||X||) */
+/*     singular values are of interest and JOBS=='N',  then */
+/*     the options (3, 4) give the most accurate results, where */
+/*     the option 4 is slightly better and with stronger */
+/*     theoretical background. */
+/*     If JOBS=='S', i.e. the columns of X will be normalized, */
+/*     then all methods give nearly equally accurate results. */
+/* ..... */
+/*     M (input) INTEGER, M >= 0 */
+/*     The state space dimension (the number of rows of F). */
+/* ..... */
+/*     N (input) INTEGER, 0 <= N <= M */
+/*     The number of data snapshots from a single trajectory, */
+/*     taken at equidistant discrete times. This is the */
+/*     number of columns of F. */
+/* ..... */
+/*     F (input/output) COMPLEX(KIND=WP) M-by-N array */
+/*     > On entry, */
+/*     the columns of F are the sequence of data snapshots */
+/*     from a single trajectory, taken at equidistant discrete */
+/*     times. It is assumed that the column norms of F are */
+/*     in the range of the normalized floating point numbers. */
+/*     < On exit, */
+/*     If JOBQ == 'Q', the array F contains the orthogonal */
+/*     matrix/factor of the QR factorization of the initial */
+/*     data snapshots matrix F. See the description of JOBQ. */
+/*     If JOBQ == 'N', the entries in F strictly below the main */
+/*     diagonal contain, column-wise, the information on the */
+/*     Householder vectors, as returned by ZGEQRF. The */
+/*     remaining information to restore the orthogonal matrix */
+/*     of the initial QR factorization is stored in ZWORK(1:MIN(M,N)). */
+/*     See the description of ZWORK. */
+/* ..... */
+/*     LDF (input) INTEGER, LDF >= M */
+/*     The leading dimension of the array F. */
+/* ..... */
+/*     X (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N-1) array */
+/*     X is used as workspace to hold representations of the */
+/*     leading N-1 snapshots in the orthonormal basis computed */
+/*     in the QR factorization of F. */
+/*     On exit, the leading K columns of X contain the leading */
+/*     K left singular vectors of the above described content */
+/*     of X. To lift them to the space of the left singular */
+/*     vectors U(:,1:K) of the input data, pre-multiply with the */
+/*     Q factor from the initial QR factorization. */
+/*     See the descriptions of F, K, V  and Z. */
+/* ..... */
+/*     LDX (input) INTEGER, LDX >= N */
+/*     The leading dimension of the array X. */
+/* ..... */
+/*     Y (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N) array */
+/*     Y is used as workspace to hold representations of the */
+/*     trailing N-1 snapshots in the orthonormal basis computed */
+/*     in the QR factorization of F. */
+/*     On exit, */
+/*     If JOBT == 'R', Y contains the MIN(M,N)-by-N upper */
+/*     triangular factor from the QR factorization of the data */
+/*     snapshot matrix F. */
+/* ..... */
+/*     LDY (input) INTEGER , LDY >= N */
+/*     The leading dimension of the array Y. */
+/* ..... */
+/*     NRNK (input) INTEGER */
+/*     Determines the mode how to compute the numerical rank, */
+/*     i.e. how to truncate small singular values of the input */
+/*     matrix X. On input, if */
+/*     NRNK = -1 :: i-th singular value sigma(i) is truncated */
+/*                  if sigma(i) <= TOL*sigma(1) */
+/*                  This option is recommended. */
+/*     NRNK = -2 :: i-th singular value sigma(i) is truncated */
+/*                  if sigma(i) <= TOL*sigma(i-1) */
+/*                  This option is included for R&D purposes. */
+/*                  It requires highly accurate SVD, which */
+/*                  may not be feasible. */
+/*     The numerical rank can be enforced by using positive */
+/*     value of NRNK as follows: */
+/*     0 < NRNK <= N-1 :: at most NRNK largest singular values */
+/*     will be used. If the number of the computed nonzero */
+/*     singular values is less than NRNK, then only those */
+/*     nonzero values will be used and the actually used */
+/*     dimension is less than NRNK. The actual number of */
+/*     the nonzero singular values is returned in the variable */
+/*     K. See the description of K. */
+/* ..... */
+/*     TOL (input) REAL(KIND=WP), 0 <= TOL < 1 */
+/*     The tolerance for truncating small singular values. */
+/*     See the description of NRNK. */
+/* ..... */
+/*     K (output) INTEGER,  0 <= K <= N */
+/*     The dimension of the SVD/POD basis for the leading N-1 */
+/*     data snapshots (columns of F) and the number of the */
+/*     computed Ritz pairs. The value of K is determined */
+/*     according to the rule set by the parameters NRNK and */
+/*     TOL. See the descriptions of NRNK and TOL. */
+/* ..... */
+/*     EIGS (output) COMPLEX(KIND=WP) (N-1)-by-1 array */
+/*     The leading K (K<=N-1) entries of EIGS contain */
+/*     the computed eigenvalues (Ritz values). */
+/*     See the descriptions of K, and Z. */
+/* ..... */
+/*     Z (workspace/output) COMPLEX(KIND=WP)  M-by-(N-1) array */
+/*     If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i) */
+/*     is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1. */
+/*     If JOBZ == 'F', then the Z(:,i)'s are given implicitly as */
+/*     Z*V, where Z contains orthonormal matrix (the product of */
+/*     Q from the initial QR factorization and the SVD/POD_basis */
+/*     returned by ZGEDMD in X) and the second factor (the */
+/*     eigenvectors of the Rayleigh quotient) is in the array V, */
+/*     as returned by ZGEDMD. That is,  X(:,1:K)*V(:,i) */
+/*     is an eigenvector corresponding to EIGS(i). The columns */
+/*     of V(1:K,1:K) are the computed eigenvectors of the */
+/*     K-by-K Rayleigh quotient. */
+/*     See the descriptions of EIGS, X and V. */
+/* ..... */
+/*     LDZ (input) INTEGER , LDZ >= M */
+/*     The leading dimension of the array Z. */
+/* ..... */
+/*     RES (output) REAL(KIND=WP) (N-1)-by-1 array */
+/*     RES(1:K) contains the residuals for the K computed */
+/*     Ritz pairs, */
+/*     RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i) ||_2. */
+/*     See the description of EIGS and Z. */
+/* ..... */
+/*     B (output) COMPLEX(KIND=WP)  MIN(M,N)-by-(N-1) array. */
+/*     IF JOBF =='R', B(1:N,1:K) contains A*U(:,1:K), and can */
+/*     be used for computing the refined vectors; see further */
+/*     details in the provided references. */
+/*     If JOBF == 'E', B(1:N,1:K) contains */
+/*     A*U(:,1:K)*W(1:K,1:K), which are the vectors from the */
+/*     Exact DMD, up to scaling by the inverse eigenvalues. */
+/*     In both cases, the content of B can be lifted to the */
+/*     original dimension of the input data by pre-multiplying */
+/*     with the Q factor from the initial QR factorization. */
+/*     Here A denotes a compression of the underlying operator. */
+/*     See the descriptions of F and X. */
+/*     If JOBF =='N', then B is not referenced. */
+/* ..... */
+/*     LDB (input) INTEGER, LDB >= MIN(M,N) */
+/*     The leading dimension of the array B. */
+/* ..... */
+/*     V (workspace/output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array */
+/*     On exit, V(1:K,1:K) contains the K eigenvectors of */
+/*     the Rayleigh quotient. The Ritz vectors */
+/*     (returned in Z) are the product of Q from the initial QR */
+/*     factorization (see the description of F), X (see the */
+/*     description of X) and V. */
+/* ..... */
+/*     LDV (input) INTEGER, LDV >= N-1 */
+/*     The leading dimension of the array V. */
+/* ..... */
+/*     S (output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array */
+/*     The array S(1:K,1:K) is used for the matrix Rayleigh */
+/*     quotient. This content is overwritten during */
+/*     the eigenvalue decomposition by ZGEEV. */
+/*     See the description of K. */
+/* ..... */
+/*     LDS (input) INTEGER, LDS >= N-1 */
+/*     The leading dimension of the array S. */
+/* ..... */
+/*     ZWORK (workspace/output) COMPLEX(KIND=WP) LWORK-by-1 array */
+/*     On exit, */
+/*     ZWORK(1:MIN(M,N)) contains the scalar factors of the */
+/*     elementary reflectors as returned by ZGEQRF of the */
+/*     M-by-N input matrix F. */
+/*     If the call to ZGEDMDQ is only workspace query, then */
+/*     ZWORK(1) contains the minimal complex workspace length and */
+/*     ZWORK(2) is the optimal complex workspace length. */
+/*     Hence, the length of ZWORK is at least 2. */
+/*     See the description of LZWORK. */
+/* ..... */
+/*     LZWORK (input) INTEGER */
+/*     The minimal length of the  workspace vector ZWORK. */
+/*     LZWORK is calculated as follows: */
+/*     Let MLWQR  = N (minimal workspace for ZGEQRF[M,N]) */
+/*         MLWDMD = minimal workspace for ZGEDMD (see the */
+/*                  description of LWORK in ZGEDMD) */
+/*         MLWMQR = N (minimal workspace for */
+/*                    ZUNMQR['L','N',M,N,N]) */
+/*         MLWGQR = N (minimal workspace for ZUNGQR[M,N,N]) */
+/*         MINMN  = MIN(M,N) */
+/*     Then */
+/*     LZWORK = MAX(2, MIN(M,N)+MLWQR, MINMN+MLWDMD) */
+/*     is further updated as follows: */
+/*        if   JOBZ == 'V' or JOBZ == 'F' THEN */
+/*             LZWORK = MAX(LZWORK, MINMN+MLWMQR) */
+/*        if   JOBQ == 'Q' THEN */
+/*             LZWORK = MAX(LZWORK, MINMN+MLWGQR) */
+
+/* ..... */
+/*     WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array */
+/*     On exit, */
+/*     WORK(1:N-1) contains the singular values of */
+/*     the input submatrix F(1:M,1:N-1). */
+/*     If the call to ZGEDMDQ is only workspace query, then */
+/*     WORK(1) contains the minimal workspace length and */
+/*     WORK(2) is the optimal workspace length. Hence, the */
+/*     length of WORK is at least 2. */
+/*     See the description of LWORK. */
+/* ..... */
+/*     LWORK (input) INTEGER */
+/*     The minimal length of the  workspace vector WORK. */
+/*     LWORK is the same as in ZGEDMD, because in ZGEDMDQ */
+/*     only ZGEDMD requires real workspace for snapshots */
+/*     of dimensions MIN(M,N)-by-(N-1). */
+/*     If on entry LWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     and the optimal workspace length for WORK. */
+/* ..... */
+/*     IWORK (workspace/output) INTEGER LIWORK-by-1 array */
+/*     Workspace that is required only if WHTSVD equals */
+/*     2 , 3 or 4. (See the description of WHTSVD). */
+/*     If on entry LWORK =-1 or LIWORK=-1, then the */
+/*     minimal length of IWORK is computed and returned in */
+/*     IWORK(1). See the description of LIWORK. */
+/* ..... */
+/*     LIWORK (input) INTEGER */
+/*     The minimal length of the workspace vector IWORK. */
+/*     If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1 */
+/*     Let M1=MIN(M,N), N1=N-1. Then */
+/*     If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M1,N1)) */
+/*     If WHTSVD == 3, then LIWORK >= MAX(1,M1+N1-1) */
+/*     If WHTSVD == 4, then LIWORK >= MAX(3,M1+3*N1) */
+/*     If on entry LIWORK = -1, then a workspace query is */
+/*     assumed and the procedure only computes the minimal */
+/*     and the optimal workspace lengths for both WORK and */
+/*     IWORK. See the descriptions of WORK and IWORK. */
+/* ..... */
+/*     INFO (output) INTEGER */
+/*     -i < 0 :: On entry, the i-th argument had an */
+/*               illegal value */
+/*        = 0 :: Successful return. */
+/*        = 1 :: Void input. Quick exit (M=0 or N=0). */
+/*        = 2 :: The SVD computation of X did not converge. */
+/*               Suggestion: Check the input data and/or */
+/*               repeat with different WHTSVD. */
+/*        = 3 :: The computation of the eigenvalues did not */
+/*               converge. */
+/*        = 4 :: If data scaling was requested on input and */
+/*               the procedure found inconsistency in the data */
+/*               such that for some column index i, */
+/*               X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set */
+/*               to zero if JOBS=='C'. The computation proceeds */
+/*               with original or modified data and warning */
+/*               flag is set with INFO=4. */
+/* ............................................................. */
+/* ............................................................. */
+/*     Parameters */
+/*     ~~~~~~~~~~ */
+/*     COMPLEX(KIND=WP), PARAMETER ::  ZONE = ( 1.0_WP, 0.0_WP ) */
+
+/*     Local scalars */
+/*     ~~~~~~~~~~~~~ */
+
+/*     External functions (BLAS and LAPACK) */
+/*     ~~~~~~~~~~~~~~~~~ */
+
+/*     External subroutines (BLAS and LAPACK) */
+/*     ~~~~~~~~~~~~~~~~~~~~ */
+/*     External subroutines */
+/*     ~~~~~~~~~~~~~~~~~~~~ */
+/*     Intrinsic functions */
+/*     ~~~~~~~~~~~~~~~~~~~ */
+/* .......................................................... */
+    /* Parameter adjustments */
+    f_dim1 = *ldf;
+    f_offset = 1 + f_dim1 * 1;
+    f -= f_offset;
+    x_dim1 = *ldx;
+    x_offset = 1 + x_dim1 * 1;
+    x -= x_offset;
+    y_dim1 = *ldy;
+    y_offset = 1 + y_dim1 * 1;
+    y -= y_offset;
+    --eigs;
+    z_dim1 = *ldz;
+    z_offset = 1 + z_dim1 * 1;
+    z__ -= z_offset;
+    --res;
+    b_dim1 = *ldb;
+    b_offset = 1 + b_dim1 * 1;
+    b -= b_offset;
+    v_dim1 = *ldv;
+    v_offset = 1 + v_dim1 * 1;
+    v -= v_offset;
+    s_dim1 = *lds;
+    s_offset = 1 + s_dim1 * 1;
+    s -= s_offset;
+    --zwork;
+    --work;
+    --iwork;
+
+    /* Function Body */
+    one = 1.f;
+    zero = 0.f;
+    zzero.r = 0.f, zzero.i = 0.f;
+
+/*    Test the input arguments */
+    wntres = lsame_(jobr, "R");
+    sccolx = lsame_(jobs, "S") || lsame_(jobs, "C");
+    sccoly = lsame_(jobs, "Y");
+    wntvec = lsame_(jobz, "V");
+    wntvcf = lsame_(jobz, "F");
+    wntvcq = lsame_(jobz, "Q");
+    wntref = lsame_(jobf, "R");
+    wntex = lsame_(jobf, "E");
+    wantq = lsame_(jobq, "Q");
+    wnttrf = lsame_(jobt, "R");
+    minmn = f2cmin(*m,*n);
+    *info = 0;
+    lquery = *lzwork == -1 || *lwork == -1 || *liwork == -1;
+
+    if (! (sccolx || sccoly || lsame_(jobs, "N"))) {
+	*info = -1;
+    } else if (! (wntvec || wntvcf || wntvcq || lsame_(jobz, "N"))) {
+	*info = -2;
+    } else if (! (wntres || lsame_(jobr, "N")) || 
+	    wntres && lsame_(jobz, "N")) {
+	*info = -3;
+    } else if (! (wantq || lsame_(jobq, "N"))) {
+	*info = -4;
+    } else if (! (wnttrf || lsame_(jobt, "N"))) {
+	*info = -5;
+    } else if (! (wntref || wntex || lsame_(jobf, "N")))
+	     {
+	*info = -6;
+    } else if (! (*whtsvd == 1 || *whtsvd == 2 || *whtsvd == 3 || *whtsvd == 
+	    4)) {
+	*info = -7;
+    } else if (*m < 0) {
+	*info = -8;
+    } else if (*n < 0 || *n > *m + 1) {
+	*info = -9;
+    } else if (*ldf < *m) {
+	*info = -11;
+    } else if (*ldx < minmn) {
+	*info = -13;
+    } else if (*ldy < minmn) {
+	*info = -15;
+    } else if (! (*nrnk == -2 || *nrnk == -1 || *nrnk >= 1 && *nrnk <= *n)) {
+	*info = -16;
+    } else if (*tol < zero || *tol >= one) {
+	*info = -17;
+    } else if (*ldz < *m) {
+	*info = -21;
+    } else if ((wntref || wntex) && *ldb < minmn) {
+	*info = -24;
+    } else if (*ldv < *n - 1) {
+	*info = -26;
+    } else if (*lds < *n - 1) {
+	*info = -28;
+    }
+
+    if (wntvec || wntvcf || wntvcq) {
+	*(unsigned char *)jobvl = 'V';
+    } else {
+	*(unsigned char *)jobvl = 'N';
+    }
+    if (*info == 0) {
+/* Compute the minimal and the optimal workspace */
+/* requirements. Simulate running the code and */
+/* determine minimal and optimal sizes of the */
+/* workspace at any moment of the run. */
+	if (*n == 0 || *n == 1) {
+/* All output except K is void. INFO=1 signals */
+/* the void input. In case of a workspace query, */
+/* the minimal workspace lengths are returned. */
+	    if (lquery) {
+		iwork[1] = 1;
+		zwork[1].r = 2., zwork[1].i = 0.;
+		zwork[2].r = 2., zwork[2].i = 0.;
+		work[1] = 2.;
+		work[2] = 2.;
+	    } else {
+		*k = 0;
+	    }
+	    *info = 1;
+	    return 0;
+	}
+	mlrwrk = 2;
+	mlwork = 2;
+	olwork = 2;
+	iminwr = 1;
+	mlwqr = f2cmax(1,*n);
+/* Minimal workspace length for ZGEQRF. */
+/* Computing MAX */
+	i__1 = mlwork, i__2 = minmn + mlwqr;
+	mlwork = f2cmax(i__1,i__2);
+	if (lquery) {
+	    zgeqrf_(m, n, &f[f_offset], ldf, &zwork[1], &zwork[1], &c_n1, &
+		    info1);
+	    olwqr = (integer) zwork[1].r;
+/* Computing MAX */
+	    i__1 = olwork, i__2 = minmn + olwqr;
+	    olwork = f2cmax(i__1,i__2);
+	}
+	i__1 = *n - 1;
+	zgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], 
+		ldx, &y[y_offset], ldy, nrnk, tol, k, &eigs[1], &z__[z_offset]
+		, ldz, &res[1], &b[b_offset], ldb, &v[v_offset], ldv, &s[
+		s_offset], lds, &zwork[1], &c_n1, &work[1], &c_n1, &iwork[1], 
+		&c_n1, &info1);
+	mlwdmd = (integer) zwork[1].r;
+/* Computing MAX */
+	i__1 = mlwork, i__2 = minmn + mlwdmd;
+	mlwork = f2cmax(i__1,i__2);
+/* Computing MAX */
+	i__1 = mlrwrk, i__2 = (integer) work[1];
+	mlrwrk = f2cmax(i__1,i__2);
+	iminwr = f2cmax(iminwr,iwork[1]);
+	if (lquery) {
+	    olwdmd = (integer) zwork[2].r;
+/* Computing MAX */
+	    i__1 = olwork, i__2 = minmn + olwdmd;
+	    olwork = f2cmax(i__1,i__2);
+	}
+	if (wntvec || wntvcf) {
+	    mlwmqr = f2cmax(1,*n);
+/* Computing MAX */
+	    i__1 = mlwork, i__2 = minmn + mlwmqr;
+	    mlwork = f2cmax(i__1,i__2);
+	    if (lquery) {
+		zunmqr_("L", "N", m, n, &minmn, &f[f_offset], ldf, &zwork[1], 
+			&z__[z_offset], ldz, &zwork[1], &c_n1, &info1);
+		olwmqr = (integer) zwork[1].r;
+/* Computing MAX */
+		i__1 = olwork, i__2 = minmn + olwmqr;
+		olwork = f2cmax(i__1,i__2);
+	    }
+	}
+	if (wantq) {
+	    mlwgqr = f2cmax(1,*n);
+/* Computing MAX */
+	    i__1 = mlwork, i__2 = minmn + mlwgqr;
+	    mlwork = f2cmax(i__1,i__2);
+	    if (lquery) {
+		zungqr_(m, &minmn, &minmn, &f[f_offset], ldf, &zwork[1], &
+			zwork[1], &c_n1, &info1);
+		olwgqr = (integer) zwork[1].r;
+/* Computing MAX */
+		i__1 = olwork, i__2 = minmn + olwgqr;
+		olwork = f2cmax(i__1,i__2);
+	    }
+	}
+	if (*liwork < iminwr && ! lquery) {
+	    *info = -34;
+	}
+	if (*lwork < mlrwrk && ! lquery) {
+	    *info = -32;
+	}
+	if (*lzwork < mlwork && ! lquery) {
+	    *info = -30;
+	}
+    }
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("ZGEDMDQ", &i__1);
+	return 0;
+    } else if (lquery) {
+/*     Return minimal and optimal workspace sizes */
+	iwork[1] = iminwr;
+	zwork[1].r = (doublereal) mlwork, zwork[1].i = 0.;
+	zwork[2].r = (doublereal) olwork, zwork[2].i = 0.;
+	work[1] = (doublereal) mlrwrk;
+	work[2] = (doublereal) mlrwrk;
+	return 0;
+    }
+/* ..... */
+/*     Initial QR factorization that is used to represent the */
+/*     snapshots as elements of lower dimensional subspace. */
+/*     For large scale computation with M >> N, at this place */
+/*     one can use an out of core QRF. */
+
+    i__1 = *lzwork - minmn;
+    zgeqrf_(m, n, &f[f_offset], ldf, &zwork[1], &zwork[minmn + 1], &i__1, &
+	    info1);
+
+/*     Define X and Y as the snapshots representations in the */
+/*     orthogonal basis computed in the QR factorization. */
+/*     X corresponds to the leading N-1 and Y to the trailing */
+/*     N-1 snapshots. */
+    i__1 = *n - 1;
+    zlaset_("L", &minmn, &i__1, &zzero, &zzero, &x[x_offset], ldx);
+    i__1 = *n - 1;
+    zlacpy_("U", &minmn, &i__1, &f[f_offset], ldf, &x[x_offset], ldx);
+    i__1 = *n - 1;
+    zlacpy_("A", &minmn, &i__1, &f[(f_dim1 << 1) + 1], ldf, &y[y_offset], ldy);
+    if (*m >= 3) {
+	i__1 = minmn - 2;
+	i__2 = *n - 2;
+	zlaset_("L", &i__1, &i__2, &zzero, &zzero, &y[y_dim1 + 3], ldy);
+    }
+
+/*     Compute the DMD of the projected snapshot pairs (X,Y) */
+    i__1 = *n - 1;
+    i__2 = *lzwork - minmn;
+    zgedmd_(jobs, jobvl, jobr, jobf, whtsvd, &minmn, &i__1, &x[x_offset], ldx,
+	     &y[y_offset], ldy, nrnk, tol, k, &eigs[1], &z__[z_offset], ldz, &
+	    res[1], &b[b_offset], ldb, &v[v_offset], ldv, &s[s_offset], lds, &
+	    zwork[minmn + 1], &i__2, &work[1], lwork, &iwork[1], liwork, &
+	    info1);
+    if (info1 == 2 || info1 == 3) {
+/* Return with error code. See ZGEDMD for details. */
+	*info = info1;
+	return 0;
+    } else {
+	*info = info1;
+    }
+
+/*     The Ritz vectors (Koopman modes) can be explicitly */
+/*     formed or returned in factored form. */
+    if (wntvec) {
+/* Compute the eigenvectors explicitly. */
+	if (*m > minmn) {
+	    i__1 = *m - minmn;
+	    zlaset_("A", &i__1, k, &zzero, &zzero, &z__[minmn + 1 + z_dim1], 
+		    ldz);
+	}
+	i__1 = *lzwork - minmn;
+	zunmqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &zwork[1], &z__[
+		z_offset], ldz, &zwork[minmn + 1], &i__1, &info1);
+    } else if (wntvcf) {
+/*   Return the Ritz vectors (eigenvectors) in factored */
+/*   form Z*V, where Z contains orthonormal matrix (the */
+/*   product of Q from the initial QR factorization and */
+/*   the SVD/POD_basis returned by ZGEDMD in X) and the */
+/*   second factor (the eigenvectors of the Rayleigh */
+/*   quotient) is in the array V, as returned by ZGEDMD. */
+	zlacpy_("A", n, k, &x[x_offset], ldx, &z__[z_offset], ldz);
+	if (*m > *n) {
+	    i__1 = *m - *n;
+	    zlaset_("A", &i__1, k, &zzero, &zzero, &z__[*n + 1 + z_dim1], ldz);
+	}
+	i__1 = *lzwork - minmn;
+	zunmqr_("L", "N", m, k, &minmn, &f[f_offset], ldf, &zwork[1], &z__[
+		z_offset], ldz, &zwork[minmn + 1], &i__1, &info1);
+    }
+
+/*     Some optional output variables: */
+
+/*     The upper triangular factor R in the initial QR */
+/*     factorization is optionally returned in the array Y. */
+/*     This is useful if this call to ZGEDMDQ is to be */
+/*     followed by a streaming DMD that is implemented in a */
+/*     QR compressed form. */
+    if (wnttrf) {
+/* Return the upper triangular R in Y */
+	zlaset_("A", &minmn, n, &zzero, &zzero, &y[y_offset], ldy);
+	zlacpy_("U", &minmn, n, &f[f_offset], ldf, &y[y_offset], ldy);
+    }
+
+/*     The orthonormal/unitary factor Q in the initial QR */
+/*     factorization is optionally returned in the array F. */
+/*     Same as with the triangular factor above, this is */
+/*     useful in a streaming DMD. */
+    if (wantq) {
+/* Q overwrites F */
+	i__1 = *lzwork - minmn;
+	zungqr_(m, &minmn, &minmn, &f[f_offset], ldf, &zwork[1], &zwork[minmn 
+		+ 1], &i__1, &info1);
+    }
+
+    return 0;
+
+} /* zgedmdq_ */
+
diff --git a/lapack-netlib/SRC/zgedmdq.f90 b/lapack-netlib/SRC/zgedmdq.f90
new file mode 100644
index 000000000..51be72a32
--- /dev/null
+++ b/lapack-netlib/SRC/zgedmdq.f90
@@ -0,0 +1,689 @@
+SUBROUTINE ZGEDMDQ( JOBS,  JOBZ, JOBR, JOBQ, JOBT, JOBF,   &
+                    WHTSVD,   M, N, F, LDF,  X, LDX,  Y,   &
+                    LDY,   NRNK,  TOL,   K,  EIGS,         &
+                    Z, LDZ, RES,  B,     LDB,   V, LDV,    & 
+                    S, LDS, ZWORK, LZWORK, WORK,  LWORK,   &
+                    IWORK, LIWORK, INFO )
+! March 2023
+!.....
+      USE                   iso_fortran_env
+      IMPLICIT NONE
+      INTEGER, PARAMETER :: WP = real64
+!.....      
+!     Scalar arguments       
+      CHARACTER, INTENT(IN)  :: JOBS, JOBZ, JOBR, JOBQ,    &
+                                JOBT, JOBF
+      INTEGER,   INTENT(IN)  :: WHTSVD, M, N,   LDF, LDX,  &
+                                LDY, NRNK, LDZ, LDB, LDV,  &
+                                LDS, LZWORK,  LWORK, LIWORK
+      INTEGER,   INTENT(OUT) :: INFO,   K      
+      REAL(KIND=WP), INTENT(IN)    ::   TOL     
+!     Array arguments      
+      COMPLEX(KIND=WP), INTENT(INOUT) :: F(LDF,*)
+      COMPLEX(KIND=WP), INTENT(OUT)   :: X(LDX,*), Y(LDY,*), &
+                                         Z(LDZ,*), B(LDB,*), &
+                                         V(LDV,*), S(LDS,*)
+      COMPLEX(KIND=WP), INTENT(OUT)   :: EIGS(*)
+      COMPLEX(KIND=WP), INTENT(OUT)   :: ZWORK(*)
+      REAL(KIND=WP), INTENT(OUT)   :: RES(*)
+      REAL(KIND=WP), INTENT(OUT)   :: WORK(*)  
+      INTEGER,       INTENT(OUT)   :: IWORK(*)
+!.....      
+!     Purpose  
+!     =======
+!     ZGEDMDQ computes the Dynamic Mode Decomposition (DMD) for
+!     a pair of data snapshot matrices, using a QR factorization
+!     based compression of the data. For the input matrices
+!     X and Y such that Y = A*X with an unaccessible matrix
+!     A, ZGEDMDQ computes a certain number of Ritz pairs of A using
+!     the standard Rayleigh-Ritz extraction from a subspace of
+!     range(X) that is determined using the leading left singular 
+!     vectors of X. Optionally, ZGEDMDQ returns the residuals 
+!     of the computed Ritz pairs, the information needed for
+!     a refinement of the Ritz vectors, or the eigenvectors of
+!     the Exact DMD.
+!     For further details see the references listed
+!     below. For more details of the implementation see [3].      
+!
+!     References
+!     ==========
+!     [1] P. Schmid: Dynamic mode decomposition of numerical
+!         and experimental data,
+!         Journal of Fluid Mechanics 656, 5-28, 2010.
+!     [2] Z. Drmac, I. Mezic, R. Mohr: Data driven modal
+!         decompositions: analysis and enhancements,
+!         SIAM J. on Sci. Comp. 40 (4), A2253-A2285, 2018.
+!     [3] Z. Drmac: A LAPACK implementation of the Dynamic
+!         Mode Decomposition I. Technical report. AIMDyn Inc.
+!         and LAPACK Working Note 298.      
+!     [4] J. Tu, C. W. Rowley, D. M. Luchtenburg, S. L. 
+!         Brunton, N. Kutz: On Dynamic Mode Decomposition:
+!         Theory and Applications, Journal of Computational
+!         Dynamics 1(2), 391 -421, 2014.
+!
+!     Developed and supported by:
+!     ===========================
+!     Developed and coded by Zlatko Drmac, Faculty of Science,
+!     University of Zagreb;  drmac@math.hr
+!     In cooperation with
+!     AIMdyn Inc., Santa Barbara, CA.
+!     and supported by
+!     - DARPA SBIR project "Koopman Operator-Based Forecasting
+!     for Nonstationary Processes from Near-Term, Limited
+!     Observational Data" Contract No: W31P4Q-21-C-0007
+!     - DARPA PAI project "Physics-Informed Machine Learning
+!     Methodologies" Contract No: HR0011-18-9-0033
+!     - DARPA MoDyL project "A Data-Driven, Operator-Theoretic
+!     Framework for Space-Time Analysis of Process Dynamics"
+!     Contract No: HR0011-16-C-0116
+!     Any opinions, findings and conclusions or recommendations 
+!     expressed in this material are those of the author and 
+!     do not necessarily reflect the views of the DARPA SBIR 
+!     Program Office.      
+!============================================================
+!     Distribution Statement A: 
+!     Approved for Public Release, Distribution Unlimited.
+!     Cleared by DARPA on September 29, 2022
+!============================================================      
+!......................................................................      
+!     Arguments
+!     =========
+!     JOBS (input) CHARACTER*1
+!     Determines whether the initial data snapshots are scaled
+!     by a diagonal matrix. The data snapshots are the columns
+!     of F. The leading N-1 columns of F are denoted X and the
+!     trailing N-1 columns are denoted Y. 
+!     'S' :: The data snapshots matrices X and Y are multiplied
+!            with a diagonal matrix D so that X*D has unit
+!            nonzero columns (in the Euclidean 2-norm)
+!     'C' :: The snapshots are scaled as with the 'S' option.
+!            If it is found that an i-th column of X is zero
+!            vector and the corresponding i-th column of Y is
+!            non-zero, then the i-th column of Y is set to
+!            zero and a warning flag is raised.
+!     'Y' :: The data snapshots matrices X and Y are multiplied
+!            by a diagonal matrix D so that Y*D has unit
+!            nonzero columns (in the Euclidean 2-norm)    
+!     'N' :: No data scaling.   
+!.....
+!     JOBZ (input) CHARACTER*1
+!     Determines whether the eigenvectors (Koopman modes) will
+!     be computed.
+!     'V' :: The eigenvectors (Koopman modes) will be computed
+!            and returned in the matrix Z.
+!            See the description of Z.
+!     'F' :: The eigenvectors (Koopman modes) will be returned
+!            in factored form as the product Z*V, where Z
+!            is orthonormal and V contains the eigenvectors
+!            of the corresponding Rayleigh quotient.
+!            See the descriptions of F, V, Z.
+!     'Q' :: The eigenvectors (Koopman modes) will be returned
+!            in factored form as the product Q*Z, where Z
+!            contains the eigenvectors of the compression of the
+!            underlying discretized operator onto the span of
+!            the data snapshots. See the descriptions of F, V, Z.
+!            Q is from the initial QR factorization.  
+!     'N' :: The eigenvectors are not computed.  
+!.....      
+!     JOBR (input) CHARACTER*1 
+!     Determines whether to compute the residuals.
+!     'R' :: The residuals for the computed eigenpairs will
+!            be computed and stored in the array RES.
+!            See the description of RES.
+!            For this option to be legal, JOBZ must be 'V'.
+!     'N' :: The residuals are not computed.
+!.....
+!     JOBQ (input) CHARACTER*1 
+!     Specifies whether to explicitly compute and return the
+!     unitary matrix from the QR factorization.
+!     'Q' :: The matrix Q of the QR factorization of the data
+!            snapshot matrix is computed and stored in the
+!            array F. See the description of F.       
+!     'N' :: The matrix Q is not explicitly computed.
+!.....
+!     JOBT (input) CHARACTER*1 
+!     Specifies whether to return the upper triangular factor
+!     from the QR factorization.
+!     'R' :: The matrix R of the QR factorization of the data 
+!            snapshot matrix F is returned in the array Y.
+!            See the description of Y and Further details.       
+!     'N' :: The matrix R is not returned. 
+!.....
+!     JOBF (input) CHARACTER*1
+!     Specifies whether to store information needed for post-
+!     processing (e.g. computing refined Ritz vectors)
+!     'R' :: The matrix needed for the refinement of the Ritz
+!            vectors is computed and stored in the array B.
+!            See the description of B.
+!     'E' :: The unscaled eigenvectors of the Exact DMD are 
+!            computed and returned in the array B. See the
+!            description of B.
+!     'N' :: No eigenvector refinement data is computed.   
+!     To be useful on exit, this option needs JOBQ='Q'.    
+!.....
+!     WHTSVD (input) INTEGER, WHTSVD in { 1, 2, 3, 4 }
+!     Allows for a selection of the SVD algorithm from the
+!     LAPACK library.
+!     1 :: ZGESVD (the QR SVD algorithm)
+!     2 :: ZGESDD (the Divide and Conquer algorithm; if enough
+!          workspace available, this is the fastest option)
+!     3 :: ZGESVDQ (the preconditioned QR SVD  ; this and 4
+!          are the most accurate options)
+!     4 :: ZGEJSV (the preconditioned Jacobi SVD; this and 3
+!          are the most accurate options)
+!     For the four methods above, a significant difference in
+!     the accuracy of small singular values is possible if
+!     the snapshots vary in norm so that X is severely
+!     ill-conditioned. If small (smaller than EPS*||X||)
+!     singular values are of interest and JOBS=='N',  then
+!     the options (3, 4) give the most accurate results, where
+!     the option 4 is slightly better and with stronger 
+!     theoretical background.
+!     If JOBS=='S', i.e. the columns of X will be normalized,
+!     then all methods give nearly equally accurate results.
+!.....
+!     M (input) INTEGER, M >= 0 
+!     The state space dimension (the number of rows of F).
+!.....      
+!     N (input) INTEGER, 0 <= N <= M
+!     The number of data snapshots from a single trajectory,
+!     taken at equidistant discrete times. This is the 
+!     number of columns of F.
+!.....
+!     F (input/output) COMPLEX(KIND=WP) M-by-N array
+!     > On entry,
+!     the columns of F are the sequence of data snapshots 
+!     from a single trajectory, taken at equidistant discrete
+!     times. It is assumed that the column norms of F are 
+!     in the range of the normalized floating point numbers. 
+!     < On exit,
+!     If JOBQ == 'Q', the array F contains the orthogonal 
+!     matrix/factor of the QR factorization of the initial 
+!     data snapshots matrix F. See the description of JOBQ. 
+!     If JOBQ == 'N', the entries in F strictly below the main
+!     diagonal contain, column-wise, the information on the 
+!     Householder vectors, as returned by ZGEQRF. The 
+!     remaining information to restore the orthogonal matrix
+!     of the initial QR factorization is stored in ZWORK(1:MIN(M,N)). 
+!     See the description of ZWORK.
+!.....
+!     LDF (input) INTEGER, LDF >= M 
+!     The leading dimension of the array F.
+!.....
+!     X (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N-1) array
+!     X is used as workspace to hold representations of the
+!     leading N-1 snapshots in the orthonormal basis computed
+!     in the QR factorization of F.
+!     On exit, the leading K columns of X contain the leading
+!     K left singular vectors of the above described content
+!     of X. To lift them to the space of the left singular
+!     vectors U(:,1:K) of the input data, pre-multiply with the 
+!     Q factor from the initial QR factorization. 
+!     See the descriptions of F, K, V  and Z.
+!.....      
+!     LDX (input) INTEGER, LDX >= N  
+!     The leading dimension of the array X. 
+!.....
+!     Y (workspace/output) COMPLEX(KIND=WP) MIN(M,N)-by-(N) array
+!     Y is used as workspace to hold representations of the
+!     trailing N-1 snapshots in the orthonormal basis computed
+!     in the QR factorization of F.
+!     On exit, 
+!     If JOBT == 'R', Y contains the MIN(M,N)-by-N upper
+!     triangular factor from the QR factorization of the data
+!     snapshot matrix F.
+!.....      
+!     LDY (input) INTEGER , LDY >= N
+!     The leading dimension of the array Y.   
+!.....
+!     NRNK (input) INTEGER
+!     Determines the mode how to compute the numerical rank,
+!     i.e. how to truncate small singular values of the input
+!     matrix X. On input, if
+!     NRNK = -1 :: i-th singular value sigma(i) is truncated
+!                  if sigma(i) <= TOL*sigma(1)
+!                  This option is recommended.  
+!     NRNK = -2 :: i-th singular value sigma(i) is truncated
+!                  if sigma(i) <= TOL*sigma(i-1)
+!                  This option is included for R&D purposes.
+!                  It requires highly accurate SVD, which
+!                  may not be feasible.      
+!     The numerical rank can be enforced by using positive 
+!     value of NRNK as follows: 
+!     0 < NRNK <= N-1 :: at most NRNK largest singular values
+!     will be used. If the number of the computed nonzero
+!     singular values is less than NRNK, then only those
+!     nonzero values will be used and the actually used
+!     dimension is less than NRNK. The actual number of
+!     the nonzero singular values is returned in the variable
+!     K. See the description of K.
+!.....
+!     TOL (input) REAL(KIND=WP), 0 <= TOL < 1
+!     The tolerance for truncating small singular values.
+!     See the description of NRNK.  
+!.....
+!     K (output) INTEGER,  0 <= K <= N 
+!     The dimension of the SVD/POD basis for the leading N-1
+!     data snapshots (columns of F) and the number of the 
+!     computed Ritz pairs. The value of K is determined
+!     according to the rule set by the parameters NRNK and 
+!     TOL. See the descriptions of NRNK and TOL. 
+!.....
+!     EIGS (output) COMPLEX(KIND=WP) (N-1)-by-1 array
+!     The leading K (K<=N-1) entries of EIGS contain
+!     the computed eigenvalues (Ritz values).
+!     See the descriptions of K, and Z.
+!.....
+!     Z (workspace/output) COMPLEX(KIND=WP)  M-by-(N-1) array
+!     If JOBZ =='V' then Z contains the Ritz vectors. Z(:,i)
+!     is an eigenvector of the i-th Ritz value; ||Z(:,i)||_2=1.
+!     If JOBZ == 'F', then the Z(:,i)'s are given implicitly as
+!     Z*V, where Z contains orthonormal matrix (the product of
+!     Q from the initial QR factorization and the SVD/POD_basis
+!     returned by ZGEDMD in X) and the second factor (the 
+!     eigenvectors of the Rayleigh quotient) is in the array V, 
+!     as returned by ZGEDMD. That is,  X(:,1:K)*V(:,i)
+!     is an eigenvector corresponding to EIGS(i). The columns 
+!     of V(1:K,1:K) are the computed eigenvectors of the 
+!     K-by-K Rayleigh quotient.  
+!     See the descriptions of EIGS, X and V.      
+!.....
+!     LDZ (input) INTEGER , LDZ >= M
+!     The leading dimension of the array Z.
+!.....
+!     RES (output) REAL(KIND=WP) (N-1)-by-1 array
+!     RES(1:K) contains the residuals for the K computed 
+!     Ritz pairs, 
+!     RES(i) = || A * Z(:,i) - EIGS(i)*Z(:,i) ||_2.
+!     See the description of EIGS and Z.      
+!.....
+!     B (output) COMPLEX(KIND=WP)  MIN(M,N)-by-(N-1) array.
+!     IF JOBF =='R', B(1:N,1:K) contains A*U(:,1:K), and can
+!     be used for computing the refined vectors; see further 
+!     details in the provided references. 
+!     If JOBF == 'E', B(1:N,1:K) contains
+!     A*U(:,1:K)*W(1:K,1:K), which are the vectors from the
+!     Exact DMD, up to scaling by the inverse eigenvalues.   
+!     In both cases, the content of B can be lifted to the 
+!     original dimension of the input data by pre-multiplying
+!     with the Q factor from the initial QR factorization.   
+!     Here A denotes a compression of the underlying operator.      
+!     See the descriptions of F and X.
+!     If JOBF =='N', then B is not referenced.
+!.....
+!     LDB (input) INTEGER, LDB >= MIN(M,N)
+!     The leading dimension of the array B.
+!.....
+!     V (workspace/output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array
+!     On exit, V(1:K,1:K) contains the K eigenvectors of
+!     the Rayleigh quotient. The Ritz vectors
+!     (returned in Z) are the product of Q from the initial QR
+!     factorization (see the description of F), X (see the
+!     description of X) and V.
+!.....
+!     LDV (input) INTEGER, LDV >= N-1
+!     The leading dimension of the array V.
+!.....      
+!     S (output) COMPLEX(KIND=WP) (N-1)-by-(N-1) array
+!     The array S(1:K,1:K) is used for the matrix Rayleigh
+!     quotient. This content is overwritten during
+!     the eigenvalue decomposition by ZGEEV.
+!     See the description of K.
+!.....
+!     LDS (input) INTEGER, LDS >= N-1        
+!     The leading dimension of the array S.
+!.....
+!     ZWORK (workspace/output) COMPLEX(KIND=WP) LWORK-by-1 array
+!     On exit, 
+!     ZWORK(1:MIN(M,N)) contains the scalar factors of the 
+!     elementary reflectors as returned by ZGEQRF of the 
+!     M-by-N input matrix F.   
+!     If the call to ZGEDMDQ is only workspace query, then
+!     ZWORK(1) contains the minimal complex workspace length and
+!     ZWORK(2) is the optimal complex workspace length. 
+!     Hence, the length of work is at least 2.
+!     See the description of LZWORK.      
+!.....      
+!     LZWORK (input) INTEGER
+!     The minimal length of the  workspace vector ZWORK.
+!     LZWORK is calculated as follows:
+!     Let MLWQR  = N (minimal workspace for ZGEQRF[M,N])
+!         MLWDMD = minimal workspace for ZGEDMD (see the
+!                  description of LWORK in ZGEDMD)
+!         MLWMQR = N (minimal workspace for 
+!                    ZUNMQR['L','N',M,N,N])
+!         MLWGQR = N (minimal workspace for ZUNGQR[M,N,N])
+!         MINMN  = MIN(M,N)      
+!     Then
+!     LZWORK = MAX(2, MIN(M,N)+MLWQR, MINMN+MLWDMD)
+!     is further updated as follows:
+!        if   JOBZ == 'V' or JOBZ == 'F' THEN 
+!             LZWORK = MAX(LZWORK, MINMN+MLWMQR)
+!        if   JOBQ == 'Q' THEN
+!             LZWORK = MAX(LZWORK, MINMN+MLWGQR)
+!
+!.....      
+!     WORK (workspace/output) REAL(KIND=WP) LWORK-by-1 array
+!     On exit,
+!     WORK(1:N-1) contains the singular values of 
+!     the input submatrix F(1:M,1:N-1).
+!     If the call to ZGEDMDQ is only workspace query, then
+!     WORK(1) contains the minimal workspace length and
+!     WORK(2) is the optimal workspace length. Hence, the
+!     length of work is at least 2.
+!     See the description of LWORK.
+!.....
+!     LWORK (input) INTEGER
+!     The minimal length of the  workspace vector WORK.
+!     LWORK is the same as in ZGEDMD, because in ZGEDMDQ
+!     only ZGEDMD requires real workspace for snapshots
+!     of dimensions MIN(M,N)-by-(N-1). 
+!     If on entry LWORK = -1, then a workspace query is
+!     assumed and the procedure only computes the minimal
+!     and the optimal workspace length for WORK.          
+!.....
+!     IWORK (workspace/output) INTEGER LIWORK-by-1 array
+!     Workspace that is required only if WHTSVD equals
+!     2 , 3 or 4. (See the description of WHTSVD).
+!     If on entry LWORK =-1 or LIWORK=-1, then the
+!     minimal length of IWORK is computed and returned in
+!     IWORK(1). See the description of LIWORK.
+!.....
+!     LIWORK (input) INTEGER
+!     The minimal length of the workspace vector IWORK.
+!     If WHTSVD == 1, then only IWORK(1) is used; LIWORK >=1
+!     Let M1=MIN(M,N), N1=N-1. Then
+!     If WHTSVD == 2, then LIWORK >= MAX(1,8*MIN(M1,N1))
+!     If WHTSVD == 3, then LIWORK >= MAX(1,M1+N1-1)
+!     If WHTSVD == 4, then LIWORK >= MAX(3,M1+3*N1)
+!     If on entry LIWORK = -1, then a workspace query is
+!     assumed and the procedure only computes the minimal
+!     and the optimal workspace lengths for both WORK and
+!     IWORK. See the descriptions of WORK and IWORK.
+!..... 
+!     INFO (output) INTEGER
+!     -i < 0 :: On entry, the i-th argument had an
+!               illegal value
+!        = 0 :: Successful return.
+!        = 1 :: Void input. Quick exit (M=0, N=0 or N=1).
+!        = 2 :: The SVD computation of X did not converge.
+!               Suggestion: Check the input data and/or
+!               repeat with different WHTSVD.
+!        = 3 :: The computation of the eigenvalues did not
+!               converge.
+!        = 4 :: If data scaling was requested on input and
+!               the procedure found inconsistency in the data
+!               such that for some column index i,
+!               X(:,i) = 0 but Y(:,i) /= 0, then Y(:,i) is set
+!               to zero if JOBS=='C'. The computation proceeds
+!               with original or modified data and warning
+!               flag is set with INFO=4.  
+!.............................................................
+!.............................................................
+!     Parameters
+!     ~~~~~~~~~~      
+      REAL(KIND=WP), PARAMETER ::  ONE = 1.0_WP
+      REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP
+!     COMPLEX(KIND=WP), PARAMETER ::  ZONE = ( 1.0_WP, 0.0_WP )
+      COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP )
+!      
+!     Local scalars      
+!     ~~~~~~~~~~~~~
+      INTEGER           :: IMINWR, INFO1,  MINMN, MLRWRK,   &
+                           MLWDMD, MLWGQR, MLWMQR, MLWORK,  & 
+                           MLWQR,  OLWDMD, OLWGQR, OLWMQR,  &
+                           OLWORK, OLWQR
+      LOGICAL           :: LQUERY, SCCOLX, SCCOLY, WANTQ,  &
+                           WNTTRF, WNTRES, WNTVEC, WNTVCF, &
+                           WNTVCQ, WNTREF, WNTEX
+      CHARACTER(LEN=1)  :: JOBVL
+!      
+!     External functions (BLAS and LAPACK)
+!     ~~~~~~~~~~~~~~~~~
+      LOGICAL       LSAME
+      EXTERNAL      LSAME 
+!
+!     External subroutines (BLAS and LAPACK)
+!     ~~~~~~~~~~~~~~~~~~~~
+      EXTERNAL      ZGEQRF, ZLACPY, ZLASET, ZUNGQR, & 
+                    ZUNMQR, XERBLA
+
+!     External subroutines
+!     ~~~~~~~~~~~~~~~~~~~~
+      EXTERNAL      ZGEDMD 
+      
+!     Intrinsic functions
+!     ~~~~~~~~~~~~~~~~~~~
+      INTRINSIC      MAX, MIN, INT         
+ !..........................................................  
+ !
+ !    Test the input arguments    
+      WNTRES = LSAME(JOBR,'R')
+      SCCOLX = LSAME(JOBS,'S') .OR. LSAME( JOBS, 'C' )
+      SCCOLY = LSAME(JOBS,'Y')
+      WNTVEC = LSAME(JOBZ,'V')
+      WNTVCF = LSAME(JOBZ,'F')
+      WNTVCQ = LSAME(JOBZ,'Q') 
+      WNTREF = LSAME(JOBF,'R') 
+      WNTEX  = LSAME(JOBF,'E')
+      WANTQ  = LSAME(JOBQ,'Q')
+      WNTTRF = LSAME(JOBT,'R')     
+      MINMN  = MIN(M,N)
+      INFO = 0 
+      LQUERY = ( (LZWORK == -1) .OR. (LWORK == -1) .OR. (LIWORK == -1) )
+!       
+      IF ( .NOT. (SCCOLX .OR. SCCOLY .OR.                &
+                                  LSAME(JOBS,'N')) )  THEN 
+          INFO = -1
+      ELSE IF ( .NOT. (WNTVEC .OR. WNTVCF .OR. WNTVCQ    &
+                              .OR. LSAME(JOBZ,'N')) ) THEN
+          INFO = -2
+      ELSE IF ( .NOT. (WNTRES .OR. LSAME(JOBR,'N')) .OR.    & 
+          ( WNTRES .AND. LSAME(JOBZ,'N') ) ) THEN
+          INFO = -3
+      ELSE IF ( .NOT. (WANTQ .OR. LSAME(JOBQ,'N')) ) THEN
+           INFO = -4                 
+      ELSE IF ( .NOT. ( WNTTRF .OR. LSAME(JOBT,'N') ) )  THEN
+          INFO = -5
+       ELSE IF ( .NOT. (WNTREF .OR. WNTEX .OR.             & 
+                LSAME(JOBF,'N') ) )                     THEN
+          INFO = -6    
+      ELSE IF ( .NOT. ((WHTSVD == 1).OR.(WHTSVD == 2).OR.   &
+                       (WHTSVD == 3).OR.(WHTSVD == 4)) ) THEN
+          INFO = -7
+      ELSE IF ( M < 0 ) THEN
+          INFO = -8
+      ELSE IF ( ( N < 0 ) .OR. ( N > M+1 ) ) THEN
+          INFO = -9
+      ELSE IF ( LDF < M ) THEN
+          INFO = -11
+      ELSE IF ( LDX < MINMN ) THEN
+          INFO = -13
+      ELSE IF ( LDY < MINMN ) THEN
+          INFO = -15
+      ELSE IF ( .NOT. (( NRNK == -2).OR.(NRNK == -1).OR.    & 
+                       ((NRNK >= 1).AND.(NRNK <=N ))) )  THEN
+          INFO = -16
+      ELSE IF ( ( TOL < ZERO ) .OR. ( TOL >= ONE ) ) THEN
+          INFO = -17
+      ELSE IF ( LDZ < M ) THEN
+          INFO = -21
+      ELSE IF ( (WNTREF.OR.WNTEX ).AND.( LDB < MINMN ) ) THEN
+          INFO = -24
+      ELSE IF ( LDV < N-1 ) THEN
+          INFO = -26
+      ELSE IF ( LDS < N-1 ) THEN
+          INFO = -28
+      END IF
+!      
+      IF ( WNTVEC .OR. WNTVCF .OR. WNTVCQ ) THEN
+          JOBVL = 'V'
+      ELSE
+          JOBVL = 'N'
+      END IF     
+      IF ( INFO == 0 ) THEN  
+          ! Compute the minimal and the optimal workspace
+          ! requirements. Simulate running the code and 
+          ! determine minimal and optimal sizes of the 
+          ! workspace at any moment of the run.         
+         IF ( ( N == 0 ) .OR. ( N == 1 ) ) THEN
+             ! All output except K is void. INFO=1 signals
+             ! the void input. In case of a workspace query,
+             ! the minimal workspace lengths are returned.
+            IF ( LQUERY ) THEN  
+               IWORK(1) = 1
+               ZWORK(1) = 2
+               ZWORK(2) = 2
+               WORK(1)  = 2
+               WORK(2)  = 2
+            ELSE                
+               K = 0
+            END IF             
+            INFO = 1  
+            RETURN
+         END IF   
+         
+         MLRWRK = 2
+         MLWORK = 2
+         OLWORK = 2 
+         IMINWR = 1
+         MLWQR  = MAX(1,N)  ! Minimal workspace length for ZGEQRF.
+         MLWORK = MAX(MLWORK,MINMN + MLWQR) 
+         
+         IF ( LQUERY ) THEN 
+             CALL ZGEQRF( M, N, F, LDF, ZWORK, ZWORK, -1, &
+                          INFO1 )
+             OLWQR  = INT(ZWORK(1))
+             OLWORK = MAX(OLWORK,MINMN + OLWQR)         
+         END IF
+         CALL ZGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN,& 
+                      N-1, X, LDX, Y, LDY, NRNK, TOL, K,     & 
+                      EIGS, Z, LDZ, RES,  B, LDB, V, LDV,    & 
+                      S, LDS, ZWORK, -1, WORK, -1, IWORK,&
+                      -1, INFO1 )
+         MLWDMD = INT(ZWORK(1))
+         MLWORK = MAX(MLWORK, MINMN + MLWDMD)
+         MLRWRK = MAX(MLRWRK, INT(WORK(1)))
+         IMINWR = MAX(IMINWR, IWORK(1))
+         IF ( LQUERY ) THEN 
+             OLWDMD = INT(ZWORK(2))
+             OLWORK = MAX(OLWORK, MINMN+OLWDMD)
+         END IF
+         IF ( WNTVEC .OR. WNTVCF ) THEN
+            MLWMQR = MAX(1,N) 
+            MLWORK = MAX(MLWORK,MINMN+MLWMQR)
+            IF ( LQUERY ) THEN
+               CALL ZUNMQR( 'L','N', M, N, MINMN, F, LDF,  & 
+                            ZWORK, Z, LDZ, ZWORK, -1, INFO1 )
+               OLWMQR = INT(ZWORK(1))
+               OLWORK = MAX(OLWORK,MINMN+OLWMQR)
+            END IF
+         END IF  
+         IF ( WANTQ ) THEN
+            MLWGQR = MAX(1,N)
+            MLWORK = MAX(MLWORK,MINMN+MLWGQR)
+            IF ( LQUERY ) THEN 
+                CALL ZUNGQR( M, MINMN, MINMN, F, LDF, ZWORK, &
+                             ZWORK, -1, INFO1 )               
+                OLWGQR = INT(ZWORK(1))
+                OLWORK = MAX(OLWORK,MINMN+OLWGQR)
+            END IF            
+         END IF         
+         IF ( LIWORK < IMINWR .AND. (.NOT.LQUERY) ) INFO = -34
+         IF ( LWORK  < MLRWRK .AND. (.NOT.LQUERY) ) INFO = -32
+         IF ( LZWORK < MLWORK .AND. (.NOT.LQUERY) ) INFO = -30
+      END IF  
+      IF( INFO /= 0 ) THEN
+         CALL XERBLA( 'ZGEDMDQ', -INFO )
+         RETURN
+      ELSE IF ( LQUERY ) THEN
+!     Return minimal and optimal workspace sizes
+          IWORK(1) = IMINWR
+          ZWORK(1) = MLWORK
+          ZWORK(2) = OLWORK
+          WORK(1)  = MLRWRK
+          WORK(2)  = MLRWRK
+          RETURN
+      END IF   
+!.....	  
+!     Initial QR factorization that is used to represent the
+!     snapshots as elements of lower dimensional subspace.
+!     For large scale computation with M >> N, at this place 
+!     one can use an out of core QRF.
+!   
+      CALL ZGEQRF( M, N, F, LDF, ZWORK,                & 
+                   ZWORK(MINMN+1), LZWORK-MINMN, INFO1 )
+!      
+!     Define X and Y as the snapshots representations in the
+!     orthogonal basis computed in the QR factorization.
+!     X corresponds to the leading N-1 and Y to the trailing
+!     N-1 snapshots.
+      CALL ZLASET( 'L', MINMN, N-1, ZZERO,  ZZERO, X, LDX )
+      CALL ZLACPY( 'U', MINMN, N-1, F,      LDF, X, LDX )
+      CALL ZLACPY( 'A', MINMN, N-1, F(1,2), LDF, Y, LDY )
+      IF ( M >= 3 ) THEN
+          CALL ZLASET( 'L', MINMN-2, N-2, ZZERO,  ZZERO, &
+                       Y(3,1), LDY )  
+      END IF
+!
+!     Compute the DMD of the projected snapshot pairs (X,Y)   
+      CALL ZGEDMD( JOBS, JOBVL, JOBR, JOBF, WHTSVD, MINMN, &
+                  N-1,  X, LDX, Y, LDY, NRNK,   TOL, K,    &
+                  EIGS, Z, LDZ, RES, B,  LDB,   V, LDV,    &
+                  S, LDS, ZWORK(MINMN+1), LZWORK-MINMN, & 
+                  WORK,   LWORK, IWORK, LIWORK, INFO1 )
+      IF ( INFO1 == 2 .OR. INFO1 == 3 ) THEN
+          ! Return with error code. See ZGEDMD for details.
+          INFO = INFO1
+          RETURN
+      ELSE
+          INFO = INFO1
+      END IF    
+!      
+!     The Ritz vectors (Koopman modes) can be explicitly 
+!     formed or returned in factored form.
+      IF ( WNTVEC ) THEN
+        ! Compute the eigenvectors explicitly.  
+        IF ( M > MINMN ) CALL ZLASET( 'A', M-MINMN, K, ZZERO, &
+                                     ZZERO, Z(MINMN+1,1), LDZ )
+        CALL ZUNMQR( 'L','N', M, K, MINMN, F, LDF, ZWORK, Z,  &
+             LDZ, ZWORK(MINMN+1), LZWORK-MINMN, INFO1 )
+      ELSE IF ( WNTVCF ) THEN   
+        !   Return the Ritz vectors (eigenvectors) in factored
+        !   form Z*V, where Z contains orthonormal matrix (the
+        !   product of Q from the initial QR factorization and 
+        !   the SVD/POD_basis returned by ZGEDMD in X) and the 
+        !   second factor (the eigenvectors of the Rayleigh 
+        !   quotient) is in the array V, as returned by ZGEDMD.
+        CALL ZLACPY( 'A', N, K, X, LDX, Z, LDZ )
+        IF ( M > N ) CALL ZLASET( 'A', M-N, K, ZZERO, ZZERO, & 
+                                 Z(N+1,1), LDZ )
+        CALL ZUNMQR( 'L','N', M, K, MINMN, F, LDF, ZWORK, Z, &
+                    LDZ, ZWORK(MINMN+1), LZWORK-MINMN, INFO1 )
+      END IF
+!     
+!     Some optional output variables:
+!
+!     The upper triangular factor R in the initial QR 
+!     factorization is optionally returned in the array Y.
+!     This is useful if this call to ZGEDMDQ is to be 
+!     followed by a streaming DMD that is implemented in a 
+!     QR compressed form.
+      IF ( WNTTRF ) THEN ! Return the upper triangular R in Y 
+         CALL ZLASET( 'A', MINMN, N, ZZERO,  ZZERO, Y, LDY )
+         CALL ZLACPY( 'U', MINMN, N, F, LDF,        Y, LDY )
+      END IF    
+!
+!     The orthonormal/unitary factor Q in the initial QR 
+!     factorization is optionally returned in the array F. 
+!     Same as with the triangular factor above, this is 
+!     useful in a streaming DMD.
+      IF ( WANTQ ) THEN                   ! Q overwrites F 
+         CALL ZUNGQR( M, MINMN, MINMN, F, LDF, ZWORK,     &
+                      ZWORK(MINMN+1), LZWORK-MINMN, INFO1 )  
+      END IF
+!      
+      RETURN
+!      
+      END SUBROUTINE ZGEDMDQ
+    
\ No newline at end of file
diff --git a/lapack-netlib/SRC/zgehrd.f b/lapack-netlib/SRC/zgehrd.f
index e18493cf9..0f4424ded 100644
--- a/lapack-netlib/SRC/zgehrd.f
+++ b/lapack-netlib/SRC/zgehrd.f
@@ -89,7 +89,7 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX*16 array, dimension (LWORK)
+*>          WORK is COMPLEX*16 array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
@@ -120,7 +120,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16GEcomputational
+*> \ingroup gehrd
 *
 *> \par Further Details:
 *  =====================
@@ -173,7 +173,7 @@
       INTEGER            IHI, ILO, INFO, LDA, LWORK, N
 *     ..
 *     .. Array Arguments ..
-      COMPLEX*16        A( LDA, * ), TAU( * ), WORK( * )
+      COMPLEX*16         A( LDA, * ), TAU( * ), WORK( * )
 *     ..
 *
 *  =====================================================================
@@ -182,7 +182,7 @@
       INTEGER            NBMAX, LDT, TSIZE
       PARAMETER          ( NBMAX = 64, LDT = NBMAX+1,
      $                     TSIZE = LDT*NBMAX )
-      COMPLEX*16        ZERO, ONE
+      COMPLEX*16         ZERO, ONE
       PARAMETER          ( ZERO = ( 0.0D+0, 0.0D+0 ),
      $                     ONE = ( 1.0D+0, 0.0D+0 ) )
 *     ..
@@ -190,7 +190,7 @@
       LOGICAL            LQUERY
       INTEGER            I, IB, IINFO, IWT, J, LDWORK, LWKOPT, NB,
      $                   NBMIN, NH, NX
-      COMPLEX*16        EI
+      COMPLEX*16         EI
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           ZAXPY, ZGEHD2, ZGEMM, ZLAHR2, ZLARFB, ZTRMM,
@@ -221,12 +221,18 @@
          INFO = -8
       END IF
 *
+      NH = IHI - ILO + 1
       IF( INFO.EQ.0 ) THEN
 *
 *        Compute the workspace requirements
 *
-         NB = MIN( NBMAX, ILAENV( 1, 'ZGEHRD', ' ', N, ILO, IHI, -1 ) )
-         LWKOPT = N*NB + TSIZE
+         IF( NH.LE.1 ) THEN
+            LWKOPT = 1
+         ELSE
+            NB = MIN( NBMAX, ILAENV( 1, 'ZGEHRD', ' ', N, ILO, IHI,
+     $                              -1 ) )
+            LWKOPT = N*NB + TSIZE
+         END IF
          WORK( 1 ) = LWKOPT
       ENDIF
 *
@@ -248,7 +254,6 @@
 *
 *     Quick return if possible
 *
-      NH = IHI - ILO + 1
       IF( NH.LE.1 ) THEN
          WORK( 1 ) = 1
          RETURN
@@ -268,7 +273,7 @@
 *
 *           Determine if workspace is large enough for blocked code
 *
-            IF( LWORK.LT.N*NB+TSIZE ) THEN
+            IF( LWORK.LT.LWKOPT ) THEN
 *
 *              Not enough workspace to use optimal NB:  determine the
 *              minimum value of NB, and reduce NB or force use of
diff --git a/lapack-netlib/SRC/zgejsv.f b/lapack-netlib/SRC/zgejsv.f
index 5134ea891..b4bc531ab 100644
--- a/lapack-netlib/SRC/zgejsv.f
+++ b/lapack-netlib/SRC/zgejsv.f
@@ -52,10 +52,10 @@
 *> are computed and stored in the arrays U and V, respectively. The diagonal
 *> of [SIGMA] is computed and stored in the array SVA.
 *> \endverbatim
-*>
-*>  Arguments:
-*>  ==========
-*>
+*
+*  Arguments:
+*  ==========
+*
 *> \param[in] JOBA
 *> \verbatim
 *>          JOBA is CHARACTER*1
@@ -151,7 +151,7 @@
 *>         transposed A if A^* seems to be better with respect to convergence.
 *>         If the matrix is not square, JOBT is ignored. 
 *>         The decision is based on two values of entropy over the adjoint
-*>         orbit of A^* * A. See the descriptions of WORK(6) and WORK(7).
+*>         orbit of A^* * A. See the descriptions of RWORK(6) and RWORK(7).
 *>       = 'T': transpose if entropy test indicates possibly faster
 *>         convergence of Jacobi process if A^* is taken as input. If A is
 *>         replaced with A^*, then the row pivoting is included automatically.
@@ -209,11 +209,11 @@
 *> \verbatim
 *>          SVA is DOUBLE PRECISION array, dimension (N)
 *>          On exit,
-*>          - For WORK(1)/WORK(2) = ONE: The singular values of A. During the
-*>            computation SVA contains Euclidean column norms of the
+*>          - For RWORK(1)/RWORK(2) = ONE: The singular values of A. During
+*>            the computation SVA contains Euclidean column norms of the
 *>            iterated matrices in the array A.
-*>          - For WORK(1) .NE. WORK(2): The singular values of A are
-*>            (WORK(1)/WORK(2)) * SVA(1:N). This factored form is used if
+*>          - For RWORK(1) .NE. RWORK(2): The singular values of A are
+*>            (RWORK(1)/RWORK(2)) * SVA(1:N). This factored form is used if
 *>            sigma_max(A) overflows or if small singular values have been
 *>            saved from underflow by scaling the input matrix A.
 *>          - If JOBR='R' then some of the singular values may be returned
@@ -252,7 +252,7 @@
 *>          If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of
 *>                         the right singular vectors;
 *>          If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N),
-*>                         then V is used as workspace if the pprocedure
+*>                         then V is used as workspace if the procedure
 *>                         replaces A with A^*. In that case, [U] is computed
 *>                         in V as right singular vectors of A^* and then
 *>                         copied back to the U array. This 'W' option is just
@@ -1821,7 +1821,7 @@
                IF ( CONDR2 .GE. COND_OK ) THEN
 *                 .. save the Householder vectors used for Q3
 *                 (this overwrites the copy of R2, as it will not be
-*                 needed in this branch, but it does not overwritte the
+*                 needed in this branch, but it does not overwrite the
 *                 Huseholder vectors of Q2.).
                   CALL ZLACPY( 'U', NR, NR, V, LDV, CWORK(2*N+1), N )
 *                 .. and the rest of the information on Q3 is in
@@ -1844,7 +1844,7 @@
             END IF
 *
 *        Second preconditioning finished; continue with Jacobi SVD
-*        The input matrix is lower trinagular.
+*        The input matrix is lower triangular.
 *
 *        Recover the right singular vectors as solution of a well
 *        conditioned triangular matrix equation.
@@ -1888,7 +1888,7 @@
             ELSE IF ( CONDR2 .LT. COND_OK ) THEN
 *
 *              The matrix R2 is inverted. The solution of the matrix equation
-*              is Q3^* * V3 = the product of the Jacobi rotations (appplied to
+*              is Q3^* * V3 = the product of the Jacobi rotations (applied to
 *              the lower triangular L3 from the LQ factorization of
 *              R2=L3*Q3), pre-multiplied with the transposed Q3.
                CALL ZGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U,
diff --git a/lapack-netlib/SRC/zgelq.f b/lapack-netlib/SRC/zgelq.f
index de7c9a378..86610e801 100644
--- a/lapack-netlib/SRC/zgelq.f
+++ b/lapack-netlib/SRC/zgelq.f
@@ -98,7 +98,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1 or -2, then a workspace query is assumed. The routine
 *>          only calculates the sizes of the T and WORK arrays, returns these
 *>          values as the first entries of the T and WORK arrays, and no error
@@ -166,6 +166,8 @@
 *>  the LQ factorization.
 *> \endverbatim
 *>
+*> \ingroup gelq
+*>
 *  =====================================================================
       SUBROUTINE ZGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK,
      $                  INFO )
diff --git a/lapack-netlib/SRC/zgelqf.f b/lapack-netlib/SRC/zgelqf.f
index 6c295eece..e988ea818 100644
--- a/lapack-netlib/SRC/zgelqf.f
+++ b/lapack-netlib/SRC/zgelqf.f
@@ -93,7 +93,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= max(1,M).
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M, otherwise.
 *>          For optimum performance LWORK >= M*NB, where NB is the
 *>          optimal blocksize.
 *>
@@ -118,7 +119,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16GEcomputational
+*> \ingroup gelqf
 *
 *> \par Further Details:
 *  =====================
@@ -174,9 +175,8 @@
 *     Test the input arguments
 *
       INFO = 0
+      K = MIN( M, N )
       NB = ILAENV( 1, 'ZGELQF', ' ', M, N, -1, -1 )
-      LWKOPT = M*NB
-      WORK( 1 ) = LWKOPT
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -184,19 +184,25 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, M ) .AND. .NOT.LQUERY ) THEN
-         INFO = -7
+      ELSE IF( .NOT.LQUERY ) THEN
+         IF( LWORK.LE.0 .OR. ( N.GT.0 .AND. LWORK.LT.MAX( 1, M ) ) )
+     $      INFO = -7
       END IF
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'ZGELQF', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
+         IF( K.EQ.0 ) THEN
+            LWKOPT = 1
+         ELSE
+            LWKOPT = M*NB
+         END IF
+         WORK( 1 ) = LWKOPT
          RETURN
       END IF
 *
 *     Quick return if possible
 *
-      K = MIN( M, N )
       IF( K.EQ.0 ) THEN
          WORK( 1 ) = 1
          RETURN
diff --git a/lapack-netlib/SRC/zgelqt3.f b/lapack-netlib/SRC/zgelqt3.f
index 629a09472..1a71dc44e 100644
--- a/lapack-netlib/SRC/zgelqt3.f
+++ b/lapack-netlib/SRC/zgelqt3.f
@@ -174,7 +174,8 @@
 *
 *        Compute Householder transform when M=1
 *
-         CALL ZLARFG( N, A, A( 1, MIN( 2, N ) ), LDA, T )
+         CALL ZLARFG( N, A( 1, 1 ), A( 1, MIN( 2, N ) ), LDA,
+     &                T( 1, 1 ) )
          T(1,1)=CONJG(T(1,1))
 *
       ELSE
diff --git a/lapack-netlib/SRC/zgelsd.f b/lapack-netlib/SRC/zgelsd.f
index 01793e16c..b5bc768e8 100644
--- a/lapack-netlib/SRC/zgelsd.f
+++ b/lapack-netlib/SRC/zgelsd.f
@@ -60,12 +60,6 @@
 *> singular values which are less than RCOND times the largest singular
 *> value.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/zgelss.f b/lapack-netlib/SRC/zgelss.f
index be53ba95b..afdbaecf0 100644
--- a/lapack-netlib/SRC/zgelss.f
+++ b/lapack-netlib/SRC/zgelss.f
@@ -170,7 +170,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16GEsolve
+*> \ingroup gelss
 *
 *  =====================================================================
       SUBROUTINE ZGELSS( M, N, NRHS, A, LDA, B, LDB, S, RCOND, RANK,
@@ -212,10 +212,9 @@
       COMPLEX*16         DUM( 1 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DLABAD, DLASCL, DLASET, XERBLA, ZBDSQR, ZCOPY,
-     $                   ZDRSCL, ZGEBRD, ZGELQF, ZGEMM, ZGEMV, ZGEQRF,
-     $                   ZLACPY, ZLASCL, ZLASET, ZUNGBR, ZUNMBR, ZUNMLQ,
-     $                   ZUNMQR
+      EXTERNAL           DLASCL, DLASET, XERBLA, ZBDSQR, ZCOPY, ZDRSCL,
+     $                   ZGEBRD, ZGELQF, ZGEMM, ZGEMV, ZGEQRF, ZLACPY,
+     $                   ZLASCL, ZLASET, ZUNGBR, ZUNMBR, ZUNMLQ
 *     ..
 *     .. External Functions ..
       INTEGER            ILAENV
@@ -388,7 +387,6 @@
       SFMIN = DLAMCH( 'S' )
       SMLNUM = SFMIN / EPS
       BIGNUM = ONE / SMLNUM
-      CALL DLABAD( SMLNUM, BIGNUM )
 *
 *     Scale A if max element outside range [SMLNUM,BIGNUM]
 *
@@ -540,7 +538,7 @@
      $                     LDB, CZERO, WORK, N )
                CALL ZLACPY( 'G', N, BL, WORK, N, B( 1, I ), LDB )
    20       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
             CALL ZGEMV( 'C', N, N, CONE, A, LDA, B, 1, CZERO, WORK, 1 )
             CALL ZCOPY( N, WORK, 1, B, 1 )
          END IF
@@ -645,7 +643,7 @@
                CALL ZLACPY( 'G', M, BL, WORK( IWORK ), M, B( 1, I ),
      $                      LDB )
    40       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
             CALL ZGEMV( 'C', M, M, CONE, WORK( IL ), LDWORK, B( 1, 1 ),
      $                  1, CZERO, WORK( IWORK ), 1 )
             CALL ZCOPY( M, WORK( IWORK ), 1, B( 1, 1 ), 1 )
@@ -737,7 +735,7 @@
      $                     LDB, CZERO, WORK, N )
                CALL ZLACPY( 'F', N, BL, WORK, N, B( 1, I ), LDB )
    60       CONTINUE
-         ELSE
+         ELSE IF( NRHS.EQ.1 ) THEN
             CALL ZGEMV( 'C', M, N, CONE, A, LDA, B, 1, CZERO, WORK, 1 )
             CALL ZCOPY( N, WORK, 1, B, 1 )
          END IF
diff --git a/lapack-netlib/SRC/zgelsy.f b/lapack-netlib/SRC/zgelsy.f
index 65fa87ae9..497becf8b 100644
--- a/lapack-netlib/SRC/zgelsy.f
+++ b/lapack-netlib/SRC/zgelsy.f
@@ -116,6 +116,7 @@
 *>          B is COMPLEX*16 array, dimension (LDB,NRHS)
 *>          On entry, the M-by-NRHS right hand side matrix B.
 *>          On exit, the N-by-NRHS solution matrix X.
+*>          If M = 0 or N = 0, B is not referenced.
 *> \endverbatim
 *>
 *> \param[in] LDB
@@ -148,6 +149,7 @@
 *>          The effective rank of A, i.e., the order of the submatrix
 *>          R11.  This is the same as the order of the submatrix T11
 *>          in the complete orthogonal factorization of A.
+*>          If NRHS = 0, RANK = 0 on output.
 *> \endverbatim
 *>
 *> \param[out] WORK
diff --git a/lapack-netlib/SRC/zgemlq.f b/lapack-netlib/SRC/zgemlq.f
index 41cd1c059..11489087a 100644
--- a/lapack-netlib/SRC/zgemlq.f
+++ b/lapack-netlib/SRC/zgemlq.f
@@ -109,16 +109,17 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK))
+*>          (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1, then a workspace query is assumed. The routine
 *>          only calculates the size of the WORK array, returns this
-*>          value as WORK(1), and no error message related to WORK 
+*>          value as WORK(1), and no error message related to WORK
 *>          is issued by XERBLA.
 *> \endverbatim
 *>
@@ -142,7 +143,7 @@
 *>
 *> \verbatim
 *>
-*> These details are particular for this LAPACK implementation. Users should not 
+*> These details are particular for this LAPACK implementation. Users should not
 *> take them for granted. These details may change in the future, and are not likely
 *> true for another LAPACK implementation. These details are relevant if one wants
 *> to try to understand the code. They are not part of the interface.
@@ -158,11 +159,13 @@
 *>  block sizes MB and NB returned by ILAENV, ZGELQ will use either
 *>  ZLASWLQ (if the matrix is wide-and-short) or ZGELQT to compute
 *>  the LQ factorization.
-*>  This version of ZGEMLQ will use either ZLAMSWLQ or ZGEMLQT to 
+*>  This version of ZGEMLQ will use either ZLAMSWLQ or ZGEMLQT to
 *>  multiply matrix Q by another matrix.
 *>  Further Details in ZLAMSWLQ or ZGEMLQT.
 *> \endverbatim
 *>
+*> \ingroup gemlq
+*>
 *  =====================================================================
       SUBROUTINE ZGEMLQ( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE,
      $                   C, LDC, WORK, LWORK, INFO )
@@ -184,7 +187,7 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LEFT, RIGHT, TRAN, NOTRAN, LQUERY
-      INTEGER            MB, NB, LW, NBLCKS, MN
+      INTEGER            MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -200,7 +203,7 @@
 *
 *     Test the input arguments
 *
-      LQUERY  = LWORK.EQ.-1
+      LQUERY  = ( LWORK.EQ.-1 )
       NOTRAN  = LSAME( TRANS, 'N' )
       TRAN    = LSAME( TRANS, 'C' )
       LEFT    = LSAME( SIDE, 'L' )
@@ -215,6 +218,13 @@
         LW = M * MB
         MN = N
       END IF
+*
+      MINMNK = MIN( M, N, K )
+      IF( MINMNK.EQ.0 ) THEN
+        LWMIN = 1
+      ELSE
+        LWMIN = MAX( 1, LW )
+      END IF
 *
       IF( ( NB.GT.K ) .AND. ( MN.GT.K ) ) THEN
         IF( MOD( MN - K, NB - K ) .EQ. 0 ) THEN
@@ -243,7 +253,7 @@
         INFO = -9
       ELSE IF( LDC.LT.MAX( 1, M ) ) THEN
         INFO = -11
-      ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN
+      ELSE IF( ( LWORK.LT.LWMIN ) .AND. ( .NOT.LQUERY ) ) THEN
         INFO = -13
       END IF
 *
@@ -260,7 +270,7 @@
 *
 *     Quick return if possible
 *
-      IF( MIN( M, N, K ).EQ.0 ) THEN
+      IF( MINMNK.EQ.0 ) THEN
         RETURN
       END IF
 *
diff --git a/lapack-netlib/SRC/zgemqr.f b/lapack-netlib/SRC/zgemqr.f
index c83eaff2f..d14d74fe2 100644
--- a/lapack-netlib/SRC/zgemqr.f
+++ b/lapack-netlib/SRC/zgemqr.f
@@ -111,16 +111,17 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK))
+*>          (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1, then a workspace query is assumed. The routine
 *>          only calculates the size of the WORK array, returns this
-*>          value as WORK(1), and no error message related to WORK 
+*>          value as WORK(1), and no error message related to WORK
 *>          is issued by XERBLA.
 *> \endverbatim
 *>
@@ -144,7 +145,7 @@
 *>
 *> \verbatim
 *>
-*> These details are particular for this LAPACK implementation. Users should not 
+*> These details are particular for this LAPACK implementation. Users should not
 *> take them for granted. These details may change in the future, and are not likely
 *> true for another LAPACK implementation. These details are relevant if one wants
 *> to try to understand the code. They are not part of the interface.
@@ -166,6 +167,8 @@
 *>
 *> \endverbatim
 *>
+*> \ingroup gemqr
+*>
 *  =====================================================================
       SUBROUTINE ZGEMQR( SIDE, TRANS, M, N, K, A, LDA, T, TSIZE,
      $                   C, LDC, WORK, LWORK, INFO )
@@ -187,7 +190,7 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LEFT, RIGHT, TRAN, NOTRAN, LQUERY
-      INTEGER            MB, NB, LW, NBLCKS, MN
+      INTEGER            MB, NB, LW, NBLCKS, MN, MINMNK, LWMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -203,7 +206,7 @@
 *
 *     Test the input arguments
 *
-      LQUERY  = LWORK.EQ.-1
+      LQUERY  = ( LWORK.EQ.-1 )
       NOTRAN  = LSAME( TRANS, 'N' )
       TRAN    = LSAME( TRANS, 'C' )
       LEFT    = LSAME( SIDE, 'L' )
@@ -218,6 +221,13 @@
         LW = MB * NB
         MN = N
       END IF
+*
+      MINMNK = MIN( M, N, K )
+      IF( MINMNK.EQ.0 ) THEN
+        LWMIN = 1
+      ELSE
+        LWMIN = MAX( 1, LW )
+      END IF
 *
       IF( ( MB.GT.K ) .AND. ( MN.GT.K ) ) THEN
         IF( MOD( MN - K, MB - K ).EQ.0 ) THEN
@@ -246,12 +256,12 @@
         INFO = -9
       ELSE IF( LDC.LT.MAX( 1, M ) ) THEN
         INFO = -11
-      ELSE IF( ( LWORK.LT.MAX( 1, LW ) ) .AND. ( .NOT.LQUERY ) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. .NOT.LQUERY ) THEN
         INFO = -13
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-        WORK( 1 ) = LW
+        WORK( 1 ) = LWMIN
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -263,7 +273,7 @@
 *
 *     Quick return if possible
 *
-      IF( MIN( M, N, K ).EQ.0 ) THEN
+      IF( MINMNK.EQ.0 ) THEN
         RETURN
       END IF
 *
@@ -276,7 +286,7 @@
      $                 NB, C, LDC, WORK, LWORK, INFO )
       END IF
 *
-      WORK( 1 ) = LW
+      WORK( 1 ) = LWMIN
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/zgeqlf.f b/lapack-netlib/SRC/zgeqlf.f
index 94721540c..a27612c64 100644
--- a/lapack-netlib/SRC/zgeqlf.f
+++ b/lapack-netlib/SRC/zgeqlf.f
@@ -88,7 +88,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= max(1,N).
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise.
 *>          For optimum performance LWORK >= N*NB, where NB is
 *>          the optimal blocksize.
 *>
@@ -113,7 +114,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16GEcomputational
+*> \ingroup geqlf
 *
 *> \par Further Details:
 *  =====================
@@ -188,8 +189,9 @@
          END IF
          WORK( 1 ) = LWKOPT
 *
-         IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN
-            INFO = -7
+         IF( .NOT.LQUERY ) THEN
+            IF( LWORK.LE.0 .OR. ( M.GT.0 .AND. LWORK.LT.MAX( 1, N ) ) )
+     $         INFO = -7
          END IF
       END IF
 *
diff --git a/lapack-netlib/SRC/zgeqp3rk.c b/lapack-netlib/SRC/zgeqp3rk.c
new file mode 100644
index 000000000..0c8b41c2d
--- /dev/null
+++ b/lapack-netlib/SRC/zgeqp3rk.c
@@ -0,0 +1,1074 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccosf(Cf(Z)); } /* single-precision ccosf, consistent with c_exp/c_log/c_sin/c_sqrt */
+#ifdef _MSC_VER /* no complex '/' operator: compute a/b = a*conj(b)/|b|^2 on the struct fields; temporaries keep it safe when c aliases a or b */
+#define c_div(c, a, b) {real __d = (b)->r*(b)->r + (b)->i*(b)->i, __re = ((a)->r*(b)->r + (a)->i*(b)->i)/__d, __im = ((a)->i*(b)->r - (a)->r*(b)->i)/__d; (c)->r = __re; (c)->i = __im;}
+#define z_div(c, a, b) {doublereal __d = (b)->r*(b)->r + (b)->i*(b)->i, __re = ((a)->r*(b)->r + (a)->i*(b)->i)/__d, __im = ((a)->i*(b)->r - (a)->r*(b)->i)/__d; (c)->r = __re; (c)->i = __im;}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) (ceil(w)) /* parenthesized rather than braced so it can be used as an expression */
+#define myhuge_(w) (HUGE_VAL) /* argument is ignored; parenthesized so it can be used as an expression */
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+/* x**n by repeated squaring; for n < 0 the base is inverted and |n| is used. */
+static float spow_ui(float x, integer n) {
+	float acc=1.0;
+	unsigned long int bits;
+	if(n == 0) return acc;
+	if(n < 0) { n = -n; x = 1/x; }
+	for(bits = n; ; x *= x) {
+		if(bits & 01) acc *= x;
+		if(!(bits >>= 1)) break;
+	}
+	return acc;
+}
+/* x**n by repeated squaring; for n < 0 the base is inverted and |n| is used. */
+static double dpow_ui(double x, integer n) {
+	double acc=1.0;
+	unsigned long int bits;
+	if(n == 0) return acc;
+	if(n < 0) { n = -n; x = 1/x; }
+	for(bits = n; ; x *= x) {
+		if(bits & 01) acc *= x;
+		if(!(bits >>= 1)) break;
+	}
+	return acc;
+}
+#ifdef _MSC_VER /* no complex operators here: the complex arithmetic is spelled out on the real/imaginary parts */
+static _Fcomplex cpow_ui(complex x, integer n) { /* x**n by repeated squaring */
+	complex pow={1.0,0.0}; unsigned long int u; real t;
+	if(n != 0) {
+		if(n < 0) { n = -n; t = x.r*x.r + x.i*x.i; x.r = x.r/t; x.i = -x.i/t; } /* x = 1/x = conj(x)/|x|^2 */
+		for(u = n; ; ) {
+			if(u & 01) { t = pow.r*x.r - pow.i*x.i; pow.i = pow.r*x.i + pow.i*x.r; pow.r = t; } /* pow *= x */
+			if(u >>= 1) { t = x.r*x.r - x.i*x.i; x.i = 2*x.r*x.i; x.r = t; } /* x *= x */
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i};
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) { /* x**n by repeated squaring; n < 0 inverts x first */
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER /* no complex operators here: the complex arithmetic is spelled out on _Val[0] (real) and _Val[1] (imaginary) */
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) { /* x**n by repeated squaring */
+	_Dcomplex pow={1.0,0.0}; unsigned long int u; double t;
+	if(n != 0) {
+		if(n < 0) { n = -n; t = x._Val[0]*x._Val[0] + x._Val[1]*x._Val[1]; x._Val[0] = x._Val[0]/t; x._Val[1] = -x._Val[1]/t; } /* x = 1/x = conj(x)/|x|^2 */
+		for(u = n; ; ) {
+			if(u & 01) { t = pow._Val[0]*x._Val[0] - pow._Val[1]*x._Val[1]; pow._Val[1] = pow._Val[0]*x._Val[1] + pow._Val[1]*x._Val[0]; pow._Val[0] = t; } /* pow *= x */
+			if(u >>= 1) { t = x._Val[0]*x._Val[0] - x._Val[1]*x._Val[1]; x._Val[1] = 2*x._Val[0]*x._Val[1]; x._Val[0] = t; } /* x *= x */
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]};
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) { /* x**n by repeated squaring; n < 0 inverts x first */
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) { /* integer power x**n, with special handling of n <= 0 */
+	integer pow; unsigned long int u;
+	if (n <= 0) {
+		if (n == 0 || x == 1) pow = 1; /* x**0 and 1**n are both 1 */
+		else if (x != -1) pow = x == 0 ? 1/x : 0; /* negative power: result is 0, except x == 0 which evaluates 1/x (integer division by zero) */
+		else n = -n; /* x == -1: switch to the positive exponent and let the loop below compute the sign */
+	}
+	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { /* taken for n > 0, which includes the x == -1 case negated above */
+		u = n;
+		for(pow = 1; ; ) { /* repeated squaring: multiply on each set bit of u, square x per shift */
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{	/* 1-based position, counted from s, of the first largest entry among w[s-1..e-1]; n is not used */
+	double top = w[s-1]; integer best = s, pos;
+	for(pos = s+1; pos <= e; pos++)
+		if (w[pos-1] > top) { best = pos; top = w[pos-1]; }
+	return best-s+1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{	/* 1-based position, counted from s, of the first largest entry among w[s-1..e-1]; n is not used */
+	float top = w[s-1]; integer best = s, pos;
+	for(pos = s+1; pos <= e; pos++)
+		if (w[pos-1] > top) { best = pos; top = w[pos-1]; }
+	return best-s+1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { /* *z = sum over i of conj(x[i*incx]) * y[i*incy] */
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc += conjg(x(i)) * y(i), as a full complex product on the parts */
+			zdotc._Val[0] += x[i].r * y[i].r + x[i].i * y[i].i;
+			zdotc._Val[1] += x[i].r * y[i].i - x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) { /* same product over the strided elements */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r + x[i*incx].i * y[i*incy].i;
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i - x[i*incx].i * y[i*incy].r;
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + conjg(x(i)) * y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* same product over the strided elements */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) { /* *z = sum over i of conj(x[i*incx]) * y[i*incy] */
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc += dconjg(x(i)) * y(i), as a full complex product on the parts */
+			zdotc._Val[0] += x[i].r * y[i].r + x[i].i * y[i].i;
+			zdotc._Val[1] += x[i].r * y[i].i - x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) { /* same product over the strided elements */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r + x[i*incx].i * y[i*incy].i;
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i - x[i*incx].i * y[i*incy].r;
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i)) * y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* same product over the strided elements */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { /* *z = sum over i of x[i*incx] * y[i*incy], no conjugation */
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc += x(i) * y(i), as a full complex product on the parts */
+			zdotc._Val[0] += x[i].r * y[i].r - x[i].i * y[i].i;
+			zdotc._Val[1] += x[i].r * y[i].i + x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) { /* same product over the strided elements */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r - x[i*incx].i * y[i*incy].i;
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i + x[i*incx].i * y[i*incy].r;
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* same product over the strided elements */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) { /* *z = sum over i of x[i*incx] * y[i*incy], no conjugation */
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc += x(i) * y(i), as a full complex product on the parts */
+			zdotc._Val[0] += x[i].r * y[i].r - x[i].i * y[i].i;
+			zdotc._Val[1] += x[i].r * y[i].i + x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) { /* same product over the strided elements */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r - x[i*incx].i * y[i*incy].i;
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i + x[i*incx].i * y[i*incy].r;
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* same product over the strided elements */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static integer c__1 = 1;
+static integer c_n1 = -1;
+static integer c__3 = 3;
+static integer c__2 = 2;
+
+/* Subroutine */ int zgeqp3rk_(integer *m, integer *n, integer *nrhs, integer 
+	*kmax, doublereal *abstol, doublereal *reltol, doublecomplex *a, 
+	integer *lda, integer *k, doublereal *maxc2nrmk, doublereal *
+	relmaxc2nrmk, integer *jpiv, doublecomplex *tau, doublecomplex *work, 
+	integer *lwork, doublereal *rwork, integer *iwork, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2;
+    doublereal d__1, d__2;
+    doublecomplex z__1;
+
+    /* Local variables */
+    doublereal maxc2nrm;
+    logical done;
+    extern /* Subroutine */ int zlaqp2rk_(integer *, integer *, integer *, 
+	    integer *, integer *, doublereal *, doublereal *, integer *, 
+	    doublereal *, doublecomplex *, integer *, integer *, doublereal *,
+	     doublereal *, integer *, doublecomplex *, doublereal *, 
+	    doublereal *, doublecomplex *, integer *), zlaqp3rk_(integer *, 
+	    integer *, integer *, integer *, integer *, doublereal *, 
+	    doublereal *, integer *, doublereal *, doublecomplex *, integer *,
+	     logical *, integer *, doublereal *, doublereal *, integer *, 
+	    doublecomplex *, doublereal *, doublereal *, doublecomplex *, 
+	    doublecomplex *, integer *, integer *, integer *);
+    integer jmax, j, jmaxc2nrm, jmaxb, nbmin, iinfo, n_sub__, minmn;
+    doublereal myhugeval;
+    integer jb;
+    extern doublereal dznrm2_(integer *, doublecomplex *, integer *);
+    integer nb, kf;
+    extern doublereal dlamch_(char *);
+    extern integer idamax_(integer *, doublereal *, integer *);
+    integer nx;
+    doublereal safmin;
+    extern /* Subroutine */ int xerbla_(char *, integer *);
+    extern logical disnan_(doublereal *);
+    extern integer ilaenv_(integer *, char *, char *, integer *, integer *, 
+	    integer *, integer *, ftnlen, ftnlen);
+    integer kp1, lwkopt;
+    logical lquery;
+    integer jbf;
+    doublereal eps;
+    integer iws, ioffset;
+
+
+/*  -- LAPACK computational routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+
+/*  ===================================================================== */
+
+
+/*     Test input arguments */
+/*     ==================== */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --jpiv;
+    --tau;
+    --work;
+    --rwork;
+    --iwork;
+
+    /* Function Body */
+    *info = 0;
+    lquery = *lwork == -1;
+    if (*m < 0) {
+	*info = -1;
+    } else if (*n < 0) {
+	*info = -2;
+    } else if (*nrhs < 0) {
+	*info = -3;
+    } else if (*kmax < 0) {
+	*info = -4;
+    } else if (disnan_(abstol)) {
+	*info = -5;
+    } else if (disnan_(reltol)) {
+	*info = -6;
+    } else if (*lda < f2cmax(1,*m)) {
+	*info = -8;
+    }
+
+/*     If the input parameters M, N, NRHS, KMAX, LDA are valid: */
+/*       a) Test the input workspace size LWORK for the minimum */
+/*          size requirement IWS. */
+/*       b) Determine the optimal block size NB and optimal */
+/*          workspace size LWKOPT to be returned in WORK(1) */
+/*          in case of (1) LWORK < IWS, (2) LQUERY = .TRUE., */
+/*          (3) when routine exits. */
+/*     Here, IWS is the minimum workspace required for unblocked */
+/*     code. */
+
+    if (*info == 0) {
+	minmn = f2cmin(*m,*n);
+	if (minmn == 0) {
+	    iws = 1;
+	    lwkopt = 1;
+	} else {
+
+/*           Minimal workspace size in case of using only unblocked */
+/*           BLAS 2 code in ZLAQP2RK. */
+/*           1) ZLAQP2RK: N+NRHS-1 to use in WORK array that is used */
+/*              in ZLARF subroutine inside ZLAQP2RK to apply an */
+/*              elementary reflector from the left. */
+/*           TOTAL_WORK_SIZE = N + NRHS - 1 (column norms are kept in RWORK) */
+
+	    iws = *n + *nrhs - 1;
+
+/*           Assign to NB optimal block size. */
+
+	    nb = ilaenv_(&c__1, "ZGEQP3RK", " ", m, n, &c_n1, &c_n1, (ftnlen)
+		    8, (ftnlen)1);
+
+/*           A formula for the optimal workspace size in case of using */
+/*           both unblocked BLAS 2 in ZLAQP2RK and blocked BLAS 3 code */
+/*           in ZLAQP3RK. */
+/*           1) ZGEQP3RK, ZLAQP2RK, ZLAQP3RK: 2*N to store full and */
+/*              partial column 2-norms. */
+/*           2) ZLAQP2RK: N+NRHS-1 to use in WORK array that is used */
+/*              in ZLARF subroutine to apply an elementary reflector */
+/*              from the left. */
+/*           3) ZLAQP3RK: NB*(N+NRHS) to use in the work array F that */
+/*              is used to apply a block reflector from */
+/*              the left. */
+/*           4) ZLAQP3RK: NB to use in the auxiliary array AUX. */
+/*           Sizes (2) and ((3) + (4)) should intersect, therefore */
+/*           TOTAL_WORK_SIZE = 2*N + NB*( N+NRHS+1 ), given NBMIN=2. */
+
+	    lwkopt = (*n << 1) + nb * (*n + *nrhs + 1);
+	}
+	z__1.r = (doublereal) lwkopt, z__1.i = 0.;
+	work[1].r = z__1.r, work[1].i = z__1.i;
+
+	if (*lwork < iws && ! lquery) {
+	    *info = -15;
+	}
+    }
+
+/*      NOTE: The optimal workspace size is returned in WORK(1), if */
+/*            the input parameters M, N, NRHS, KMAX, LDA are valid. */
+
+    if (*info != 0) {
+	i__1 = -(*info);
+	xerbla_("ZGEQP3RK", &i__1);
+	return 0;
+    } else if (lquery) {
+	return 0;
+    }
+
+/*     Quick return if possible for M=0 or N=0. */
+
+    if (minmn == 0) {
+	*k = 0;
+	*maxc2nrmk = 0.;
+	*relmaxc2nrmk = 0.;
+	z__1.r = (doublereal) lwkopt, z__1.i = 0.;
+	work[1].r = z__1.r, work[1].i = z__1.i;
+	return 0;
+    }
+
+/*     ================================================================== */
+
+/*     Initialize column pivot array JPIV. */
+
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+	jpiv[j] = j;
+    }
+
+/*     ================================================================== */
+
+/*     Initialize storage for partial and exact column 2-norms. */
+/*     a) The elements RWORK(1:N) are used to store partial column */
+/*        2-norms of the matrix A, and may decrease in each computation */
+/*        step; initialize to the values of complete column 2-norms. */
+/*     b) The elements RWORK(N+1:2*N) are used to store complete column */
+/*        2-norms of the matrix A; they are not changed during the */
+/*        computation; initialize to the values of complete column 2-norms. */
+
+    i__1 = *n;
+    for (j = 1; j <= i__1; ++j) {
+	rwork[j] = dznrm2_(m, &a[j * a_dim1 + 1], &c__1);
+	rwork[*n + j] = rwork[j];
+    }
+
+/*     ================================================================== */
+
+/*     Compute the pivot column index KP1 of the maximum column 2-norm of */
+/*     A(1:M,1:N). NOTE(review): MAXC2NRM = RWORK(KP1) is never assigned. */
+
+    kp1 = idamax_(n, &rwork[1], &c__1);
+
+/*     ==================================================================. */
+
+    if (disnan_(&maxc2nrm)) {
+
+/*        Check if the matrix A contains NaN, set INFO parameter */
+/*        to the column number where the first NaN is found and return */
+/*        from the routine. */
+
+	*k = 0;
+	*info = kp1;
+
+/*        Set MAXC2NRMK and  RELMAXC2NRMK to NaN. */
+
+	*maxc2nrmk = maxc2nrm;
+	*relmaxc2nrmk = maxc2nrm;
+
+/*        Array TAU is not set and contains undefined elements. */
+
+	z__1.r = (doublereal) lwkopt, z__1.i = 0.;
+	work[1].r = z__1.r, work[1].i = z__1.i;
+	return 0;
+    }
+
+/*     =================================================================== */
+
+    if (maxc2nrm == 0.) {
+
+/*        Check is the matrix A is a zero matrix, set array TAU and */
+/*        return from the routine. */
+
+	*k = 0;
+	*maxc2nrmk = 0.;
+	*relmaxc2nrmk = 0.;
+
+	i__1 = minmn;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = j;
+	    tau[i__2].r = 0., tau[i__2].i = 0.;
+	}
+
+	z__1.r = (doublereal) lwkopt, z__1.i = 0.;
+	work[1].r = z__1.r, work[1].i = z__1.i;
+	return 0;
+
+    }
+
+/*     =================================================================== */
+
+    myhugeval = dlamch_("Overflow");
+
+    if (maxc2nrm > myhugeval) {
+
+/*        Check if the matrix A contains +Inf or -Inf, set INFO parameter */
+/*        to the column number, where the first +/-Inf  is found plus N, */
+/*        and continue the computation. */
+
+	*info = *n + kp1;
+
+    }
+
+/*     ================================================================== */
+
+/*     Quick return if possible for the case when the first */
+/*     stopping criterion is satisfied, i.e. KMAX = 0. */
+
+    if (*kmax == 0) {
+	*k = 0;
+	*maxc2nrmk = maxc2nrm;
+	*relmaxc2nrmk = 1.;
+	i__1 = minmn;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = j;
+	    tau[i__2].r = 0., tau[i__2].i = 0.;
+	}
+	z__1.r = (doublereal) lwkopt, z__1.i = 0.;
+	work[1].r = z__1.r, work[1].i = z__1.i;
+	return 0;
+    }
+
+/*     ================================================================== */
+
+    eps = dlamch_("Epsilon");
+
+/*     Adjust ABSTOL */
+
+    if (*abstol >= 0.) {
+	safmin = dlamch_("Safe minimum");
+/* Computing MAX */
+	d__1 = *abstol, d__2 = safmin * 2.;
+	*abstol = f2cmax(d__1,d__2);
+    }
+
+/*     Adjust RELTOL */
+
+    if (*reltol >= 0.) {
+	*reltol = f2cmax(*reltol,eps);
+    }
+
+/*     =================================================================== */
+
+/*     JMAX is the maximum index of the column to be factorized, */
+/*     which is also limited by the first stopping criterion KMAX. */
+
+    jmax = f2cmin(*kmax,minmn);
+
+/*     =================================================================== */
+
+/*     Quick return if possible for the case when the second or third */
+/*     stopping criterion for the whole original matrix is satified, */
+/*     i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL */
+/*     (which is ONE <= RELTOL). */
+
+    if (maxc2nrm <= *abstol || 1. <= *reltol) {
+
+	*k = 0;
+	*maxc2nrmk = maxc2nrm;
+	*relmaxc2nrmk = 1.;
+
+	i__1 = minmn;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = j;
+	    tau[i__2].r = 0., tau[i__2].i = 0.;
+	}
+
+	z__1.r = (doublereal) lwkopt, z__1.i = 0.;
+	work[1].r = z__1.r, work[1].i = z__1.i;
+	return 0;
+    }
+
+/*     ================================================================== */
+/*     Factorize columns */
+/*     ================================================================== */
+
+/*     Determine the block size. */
+
+    nbmin = 2;
+    nx = 0;
+
+    if (nb > 1 && nb < minmn) {
+
+/*        Determine when to cross over from blocked to unblocked code. */
+/*        (for N less than NX, unblocked code should be used). */
+
+/* Computing MAX */
+	i__1 = 0, i__2 = ilaenv_(&c__3, "ZGEQP3RK", " ", m, n, &c_n1, &c_n1, (
+		ftnlen)8, (ftnlen)1);
+	nx = f2cmax(i__1,i__2);
+
+	if (nx < minmn) {
+
+/*           Determine if workspace is large enough for blocked code. */
+
+	    if (*lwork < lwkopt) {
+
+/*              Not enough workspace to use optimal block size that */
+/*              is currently stored in NB. */
+/*              Reduce NB and determine the minimum value of NB. */
+
+		nb = (*lwork - (*n << 1)) / (*n + 1);
+/* Computing MAX */
+		i__1 = 2, i__2 = ilaenv_(&c__2, "ZGEQP3RK", " ", m, n, &c_n1, 
+			&c_n1, (ftnlen)8, (ftnlen)1);
+		nbmin = f2cmax(i__1,i__2);
+
+	    }
+	}
+    }
+
+/*     ================================================================== */
+
+/*     DONE is the boolean flag to rerpresent the case when the */
+/*     factorization completed in the block factorization routine, */
+/*     before the end of the block. */
+
+    done = FALSE_;
+
+/*     J is the column index. */
+
+    j = 1;
+
+/*     (1) Use blocked code initially. */
+
+/*     JMAXB is the maximum column index of the block, when the */
+/*     blocked code is used, is also limited by the first stopping */
+/*     criterion KMAX. */
+
+/* Computing MIN */
+    i__1 = *kmax, i__2 = minmn - nx;
+    jmaxb = f2cmin(i__1,i__2);
+
+    if (nb >= nbmin && nb < jmax && jmaxb > 0) {
+
+/*        Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here: */
+/*        J   is the column index of a column block; */
+/*        JB  is the column block size to pass to block factorization */
+/*            routine in a loop step; */
+/*        JBF is the number of columns that were actually factorized */
+/*            that was returned by the block factorization routine */
+/*            in a loop step, JBF <= JB; */
+/*        N_SUB is the number of columns in the submatrix; */
+/*        IOFFSET is the number of rows that should not be factorized. */
+
+	while(j <= jmaxb) {
+
+/* Computing MIN */
+	    i__1 = nb, i__2 = jmaxb - j + 1;
+	    jb = f2cmin(i__1,i__2);
+	    n_sub__ = *n - j + 1;
+	    ioffset = j - 1;
+
+/*           Factorize JB columns among the columns A(J:N). */
+
+	    i__1 = *n + *nrhs - j + 1;
+	    zlaqp3rk_(m, &n_sub__, nrhs, &ioffset, &jb, abstol, reltol, &kp1, 
+		    &maxc2nrm, &a[j * a_dim1 + 1], lda, &done, &jbf, 
+		    maxc2nrmk, relmaxc2nrmk, &jpiv[j], &tau[j], &rwork[j], &
+		    rwork[*n + j], &work[1], &work[jb + 1], &i__1, &iwork[1], 
+		    &iinfo);
+
+/*           Set INFO on the first occurence of Inf. */
+
+	    if (iinfo > n_sub__ && *info == 0) {
+		*info = (ioffset << 1) + iinfo;
+	    }
+
+	    if (done) {
+
+/*              Either the submatrix is zero before the end of the */
+/*              column block, or ABSTOL or RELTOL criterion is */
+/*              satisfied before the end of the column block, we can */
+/*              return from the routine. Perform the following before */
+/*              returning: */
+/*                a) Set the number of factorized columns K, */
+/*                   K = IOFFSET + JBF from the last call of blocked */
+/*                   routine. */
+/*                NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned */
+/*                         by the block factorization routine; */
+/*                      2) The remaining TAUs are set to ZERO by the */
+/*                         block factorization routine. */
+
+		*k = ioffset + jbf;
+
+/*              Set INFO on the first occurrence of NaN, NaN takes */
+/*              prcedence over Inf. */
+
+		if (iinfo <= n_sub__ && iinfo > 0) {
+		    *info = ioffset + iinfo;
+		}
+
+/*              Return from the routine. */
+
+		z__1.r = (doublereal) lwkopt, z__1.i = 0.;
+		work[1].r = z__1.r, work[1].i = z__1.i;
+
+		return 0;
+
+	    }
+
+	    j += jbf;
+
+	}
+
+    }
+
+/*     Use unblocked code to factor the last or only block. */
+/*     J = JMAX+1 means we factorized the maximum possible number of */
+/*     columns, that is in ELSE clause we need to compute */
+/*     the MAXC2NORM and RELMAXC2NORM to return after we processed */
+/*     the blocks. */
+
+    if (j <= jmax) {
+
+/*        N_SUB is the number of columns in the submatrix; */
+/*        IOFFSET is the number of rows that should not be factorized. */
+
+	n_sub__ = *n - j + 1;
+	ioffset = j - 1;
+
+	i__1 = jmax - j + 1;
+	zlaqp2rk_(m, &n_sub__, nrhs, &ioffset, &i__1, abstol, reltol, &kp1, &
+		maxc2nrm, &a[j * a_dim1 + 1], lda, &kf, maxc2nrmk, 
+		relmaxc2nrmk, &jpiv[j], &tau[j], &rwork[j], &rwork[*n + j], &
+		work[1], &iinfo);
+
+/*        ABSTOL or RELTOL criterion is satisfied when the number of */
+/*        the factorized columns KF is smaller then the  number */
+/*        of columns JMAX-J+1 supplied to be factorized by the */
+/*        unblocked routine, we can return from */
+/*        the routine. Perform the following before returning: */
+/*           a) Set the number of factorized columns K, */
+/*           b) MAXC2NRMK and RELMAXC2NRMK are returned by the */
+/*              unblocked factorization routine above. */
+
+	*k = j - 1 + kf;
+
+/*        Set INFO on the first exception occurence. */
+
+/*        Set INFO on the first exception occurence of Inf or NaN, */
+/*        (NaN takes precedence over Inf). */
+
+	if (iinfo > n_sub__ && *info == 0) {
+	    *info = (ioffset << 1) + iinfo;
+	} else if (iinfo <= n_sub__ && iinfo > 0) {
+	    *info = ioffset + iinfo;
+	}
+
+    } else {
+
+/*        Compute the return values for blocked code. */
+
+/*        Set the number of factorized columns if the unblocked routine */
+/*        was not called. */
+
+	*k = jmax;
+
+/*        If there exits a residual matrix after the blocked code: */
+/*           1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the */
+/*              residual matrix, otherwise set them to ZERO; */
+/*           2) Set TAU(K+1:MINMN) to ZERO. */
+
+	if (*k < minmn) {
+	    i__1 = *n - *k;
+	    jmaxc2nrm = *k + idamax_(&i__1, &rwork[*k + 1], &c__1);
+	    *maxc2nrmk = rwork[jmaxc2nrm];
+	    if (*k == 0) {
+		*relmaxc2nrmk = 1.;
+	    } else {
+		*relmaxc2nrmk = *maxc2nrmk / maxc2nrm;
+	    }
+
+	    i__1 = minmn;
+	    for (j = *k + 1; j <= i__1; ++j) {
+		i__2 = j;
+		tau[i__2].r = 0., tau[i__2].i = 0.;
+	    }
+
+	} else {
+	    *maxc2nrmk = 0.;
+	    *relmaxc2nrmk = 0.;
+
+	}
+
+/*     END IF( J.LE.JMAX ) THEN */
+
+    }
+
+    z__1.r = (doublereal) lwkopt, z__1.i = 0.;
+    work[1].r = z__1.r, work[1].i = z__1.i;
+
+    return 0;
+
+/*     End of ZGEQP3RK */
+
+} /* zgeqp3rk_ */
+
diff --git a/lapack-netlib/SRC/zgeqp3rk.f b/lapack-netlib/SRC/zgeqp3rk.f
new file mode 100644
index 000000000..01dcce0de
--- /dev/null
+++ b/lapack-netlib/SRC/zgeqp3rk.f
@@ -0,0 +1,1092 @@
+*> \brief \b ZGEQP3RK computes a truncated Householder QR factorization with column pivoting of a complex m-by-n matrix A by using Level 3 BLAS and overwrites m-by-nrhs matrix B with Q**H * B.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZGEQP3RK + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zgeqp3rk.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zgeqp3rk.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zgeqp3rk.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA,
+*      $                     K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+*      $                     WORK, LWORK, RWORK, IWORK, INFO )
+*       IMPLICIT NONE
+*
+*      .. Scalar Arguments ..
+*       INTEGER            INFO, K, KMAX, LDA, LWORK, M, N, NRHS
+*       DOUBLE PRECISION   ABSTOL, MAXC2NRMK, RELMAXC2NRMK, RELTOL
+*      ..
+*      .. Array Arguments ..
+*       INTEGER            IWORK( * ), JPIV( * )
+*       DOUBLE PRECISION   RWORK( * )
+*       COMPLEX*16         A( LDA, * ), TAU( * ), WORK( * )
+*      ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZGEQP3RK performs two tasks simultaneously:
+*>
+*> Task 1: The routine computes a truncated (rank K) or full rank
+*> Householder QR factorization with column pivoting of a complex
+*> M-by-N matrix A using Level 3 BLAS. K is the number of columns
+*> that were factorized, i.e. factorization rank of the
+*> factor R, K <= min(M,N).
+*>
+*>  A * P(K) = Q(K) * R(K)  =
+*>
+*>        = Q(K) * ( R11(K) R12(K) ) = Q(K) * (   R(K)_approx    )
+*>                 ( 0      R22(K) )          ( 0  R(K)_residual ),
+*>
+*> where:
+*>
+*>  P(K)            is an N-by-N permutation matrix;
+*>  Q(K)            is an M-by-M unitary matrix;
+*>  R(K)_approx   = ( R11(K), R12(K) ) is a rank K approximation of the
+*>                    full rank factor R with K-by-K upper-triangular
+*>                    R11(K) and K-by-N rectangular R12(K). The diagonal
+*>                    entries of R11(K) appear in non-increasing order
+*>                    of absolute value, and absolute values of all of
+*>                    them exceed the maximum column 2-norm of R22(K)
+*>                    up to roundoff error.
+*>  R(K)_residual = R22(K) is the residual of a rank K approximation
+*>                    of the full rank factor R. It is
+*>                    an (M-K)-by-(N-K) rectangular matrix;
+*>  0               is an (M-K)-by-K zero matrix.
+*>
+*> Task 2: At the same time, the routine overwrites a complex M-by-NRHS
+*> matrix B with  Q(K)**H * B  using Level 3 BLAS.
+*>
+*> =====================================================================
+*>
+*> The matrices A and B are stored on input in the array A as
+*> the left and right blocks A(1:M,1:N) and A(1:M, N+1:N+NRHS)
+*> respectively.
+*>
+*>                                  N     NRHS
+*>             array_A   =   M  [ mat_A, mat_B ]
+*>
+*> The truncation criteria (i.e. when to stop the factorization)
+*> can be any of the following:
+*>
+*>   1) The input parameter KMAX, the maximum number of columns
+*>      KMAX to factorize, i.e. the factorization rank is limited
+*>      to KMAX. If KMAX >= min(M,N), the criterion is not used.
+*>
+*>   2) The input parameter ABSTOL, the absolute tolerance for
+*>      the maximum column 2-norm of the residual matrix R22(K). This
+*>      means that the factorization stops if this norm is less or
+*>      equal to ABSTOL. If ABSTOL < 0.0, the criterion is not used.
+*>
+*>   3) The input parameter RELTOL, the tolerance for the maximum
+*>      column 2-norm of the residual matrix R22(K) divided
+*>      by the maximum column 2-norm of the original matrix A, which
+*>      is equal to abs(R(1,1)). This means that the factorization stops
+*>      when the ratio of the maximum column 2-norm of R22(K) to
+*>      the maximum column 2-norm of A is less than or equal to RELTOL.
+*>      If RELTOL < 0.0, the criterion is not used.
+*>
+*>   4) In case both stopping criteria ABSTOL and RELTOL are not used,
+*>      when the residual matrix R22(K) is a zero matrix in some
+*>      factorization step K. ( This stopping criterion is implicit. )
+*>
+*>  The algorithm stops when any of these conditions is first
+*>  satisfied, otherwise the whole matrix A is factorized.
+*>
+*>  To factorize the whole matrix A, use the values
+*>  KMAX >= min(M,N), ABSTOL < 0.0 and RELTOL < 0.0.
+*>
+*>  The routine returns:
+*>     a) Q(K), R(K)_approx = ( R11(K), R12(K) ),
+*>        R(K)_residual = R22(K), P(K), i.e. the resulting matrices
+*>        of the factorization; P(K) is represented by JPIV,
+*>        ( if K = min(M,N), R(K)_approx is the full factor R,
+*>        and there is no residual matrix R(K)_residual);
+*>     b) K, the number of columns that were factorized,
+*>        i.e. factorization rank;
+*>     c) MAXC2NRMK, the maximum column 2-norm of the residual
+*>        matrix R(K)_residual = R22(K),
+*>        ( if K = min(M,N), MAXC2NRMK = 0.0 );
+*>     d) RELMAXC2NRMK equals MAXC2NRMK divided by MAXC2NRM, the maximum
+*>        column 2-norm of the original matrix A, which is equal
+*>        to abs(R(1,1)), ( if K = min(M,N), RELMAXC2NRMK = 0.0 );
+*>     e) Q(K)**H * B, the matrix B with the unitary
+*>        transformation Q(K)**H applied on the left.
+*>
+*> The N-by-N permutation matrix P(K) is stored in a compact form in
+*> the integer array JPIV. For 1 <= j <= N, column j
+*> of the matrix A was interchanged with column JPIV(j).
+*>
+*> The M-by-M unitary matrix Q is represented as a product
+*> of elementary Householder reflectors
+*>
+*>     Q(K) = H(1) *  H(2) * . . . * H(K),
+*>
+*> where K is the number of columns that were factorized.
+*>
+*> Each H(j) has the form
+*>
+*>     H(j) = I - tau * v * v**H,
+*>
+*> where 1 <= j <= K and
+*>   I    is an M-by-M identity matrix,
+*>   tau  is a complex scalar,
+*>   v    is a complex vector with v(1:j-1) = 0 and v(j) = 1.
+*>
+*> v(j+1:M) is stored on exit in A(j+1:M,j) and tau in TAU(j).
+*>
+*> See the Further Details section for more information.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e. the number of
+*>          columns of the matrix B. NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in] KMAX
+*> \verbatim
+*>          KMAX is INTEGER
+*>
+*>          The first factorization stopping criterion. KMAX >= 0.
+*>
+*>          The maximum number of columns of the matrix A to factorize,
+*>          i.e. the maximum factorization rank.
+*>
+*>          a) If KMAX >= min(M,N), then this stopping criterion
+*>                is not used, the routine factorizes columns
+*>                depending on ABSTOL and RELTOL.
+*>
+*>          b) If KMAX = 0, then this stopping criterion is
+*>                satisfied on input and the routine exits immediately.
+*>                This means that the factorization is not performed,
+*>                the matrices A and B are not modified, and
+*>                the matrix A is itself the residual.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is DOUBLE PRECISION
+*>
+*>          The second factorization stopping criterion, cannot be NaN.
+*>
+*>          The absolute tolerance (stopping threshold) for
+*>          maximum column 2-norm of the residual matrix R22(K).
+*>          The algorithm converges (stops the factorization) when
+*>          the maximum column 2-norm of the residual matrix R22(K)
+*>          is less than or equal to ABSTOL. Let SAFMIN = DLAMCH('S').
+*>
+*>          a) If ABSTOL is NaN, then no computation is performed
+*>                and an error message ( INFO = -5 ) is issued
+*>                by XERBLA.
+*>
+*>          b) If ABSTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and RELTOL.
+*>                This includes the case ABSTOL = -Inf.
+*>
+*>          c) If 0.0 <= ABSTOL < 2*SAFMIN, then ABSTOL = 2*SAFMIN
+*>                is used. This includes the case ABSTOL = -0.0.
+*>
+*>          d) If 2*SAFMIN <= ABSTOL then the input value
+*>                of ABSTOL is used.
+*>
+*>          Let MAXC2NRM be the maximum column 2-norm of the
+*>          whole original matrix A.
+*>          If ABSTOL chosen above is >= MAXC2NRM, then this
+*>          stopping criterion is satisfied on input and routine exits
+*>          immediately after MAXC2NRM is computed. The routine
+*>          returns MAXC2NRM in MAXC2NRMK,
+*>          and 1.0 in RELMAXC2NRMK.
+*>          This includes the case ABSTOL = +Inf. This means that the
+*>          factorization is not performed, the matrices A and B are not
+*>          modified, and the matrix A is itself the residual.
+*> \endverbatim
+*>
+*> \param[in] RELTOL
+*> \verbatim
+*>          RELTOL is DOUBLE PRECISION
+*>
+*>          The third factorization stopping criterion, cannot be NaN.
+*>
+*>          The tolerance (stopping threshold) for the ratio
+*>          abs(R(K+1,K+1))/abs(R(1,1)) of the maximum column 2-norm of
+*>          the residual matrix R22(K) to the maximum column 2-norm of
+*>          the original matrix A. The algorithm converges (stops the
+*>          factorization), when abs(R(K+1,K+1))/abs(R(1,1)) is less
+*>          than or equal to RELTOL. Let EPS = DLAMCH('E').
+*>
+*>          a) If RELTOL is NaN, then no computation is performed
+*>                and an error message ( INFO = -6 ) is issued
+*>                by XERBLA.
+*>
+*>          b) If RELTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and ABSTOL.
+*>                This includes the case RELTOL = -Inf.
+*>
+*>          c) If 0.0 <= RELTOL < EPS, then RELTOL = EPS is used.
+*>                This includes the case RELTOL = -0.0.
+*>
+*>          d) If EPS <= RELTOL then the input value of RELTOL
+*>                is used.
+*>
+*>          Let MAXC2NRM be the maximum column 2-norm of the
+*>          whole original matrix A.
+*>          If RELTOL chosen above is >= 1.0, then this stopping
+*>          criterion is satisfied on input and routine exits
+*>          immediately after MAXC2NRM is computed.
+*>          The routine returns MAXC2NRM in MAXC2NRMK,
+*>          and 1.0 in RELMAXC2NRMK.
+*>          This includes the case RELTOL = +Inf. This means that the
+*>          factorization is not performed, the matrices A and B are not
+*>          modified, and the matrix A is itself the residual.
+*>
+*>          NOTE: We recommend that RELTOL satisfy
+*>                min( 10*max(M,N)*EPS, sqrt(EPS) ) <= RELTOL
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX*16 array, dimension (LDA,N+NRHS)
+*>
+*>          On entry:
+*>
+*>          a) The subarray A(1:M,1:N) contains the M-by-N matrix A.
+*>          b) The subarray A(1:M,N+1:N+NRHS) contains the M-by-NRHS
+*>             matrix B.
+*>
+*>                                  N     NRHS
+*>              array_A   =   M  [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>
+*>          a) The subarray A(1:M,1:N) contains parts of the factors
+*>             of the matrix A:
+*>
+*>            1) If K = 0, A(1:M,1:N) contains the original matrix A.
+*>            2) If K > 0, A(1:M,1:N) contains parts of the
+*>            factors:
+*>
+*>              1. The elements below the diagonal of the subarray
+*>                 A(1:M,1:K) together with TAU(1:K) represent the
+*>                 unitary matrix Q(K) as a product of K Householder
+*>                 elementary reflectors.
+*>
+*>              2. The elements on and above the diagonal of
+*>                 the subarray A(1:K,1:N) contain K-by-N
+*>                 upper-trapezoidal matrix
+*>                 R(K)_approx = ( R11(K), R12(K) ).
+*>                 NOTE: If K=min(M,N), i.e. full rank factorization,
+*>                       then R(K)_approx is the full factor R which
+*>                       is upper-trapezoidal. If, in addition, M>=N,
+*>                       then R is upper-triangular.
+*>
+*>              3. The subarray A(K+1:M,K+1:N) contains (M-K)-by-(N-K)
+*>                 rectangular matrix R(K)_residual = R22(K).
+*>
+*>          b) If NRHS > 0, the subarray A(1:M,N+1:N+NRHS) contains
+*>             the M-by-NRHS product Q(K)**H * B.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*>          This is the leading dimension for both matrices, A and B.
+*> \endverbatim
+*>
+*> \param[out] K
+*> \verbatim
+*>          K is INTEGER
+*>          Factorization rank of the matrix A, i.e. the rank of
+*>          the factor R, which is the same as the number of non-zero
+*>          rows of the factor R. 0 <= K <= min(M,KMAX,N).
+*>
+*>          K also represents the number of non-zero Householder
+*>          vectors.
+*>
+*>          NOTE: If K = 0, a) the arrays A and B are not modified;
+*>                          b) the array TAU(1:min(M,N)) is set to ZERO,
+*>                             if the matrix A does not contain NaN,
+*>                             otherwise the elements TAU(1:min(M,N))
+*>                             are undefined;
+*>                          c) the elements of the array JPIV are set
+*>                             as follows: for j = 1:N, JPIV(j) = j.
+*> \endverbatim
+*>
+*> \param[out] MAXC2NRMK
+*> \verbatim
+*>          MAXC2NRMK is DOUBLE PRECISION
+*>          The maximum column 2-norm of the residual matrix R22(K),
+*>          when the factorization stopped at rank K. MAXC2NRMK >= 0.
+*>
+*>          a) If K = 0, i.e. the factorization was not performed,
+*>             the matrix A was not modified and is itself a residual
+*>             matrix, then MAXC2NRMK equals the maximum column 2-norm
+*>             of the original matrix A.
+*>
+*>          b) If 0 < K < min(M,N), then MAXC2NRMK is returned.
+*>
+*>          c) If K = min(M,N), i.e. the whole matrix A was
+*>             factorized and there is no residual matrix,
+*>             then MAXC2NRMK = 0.0.
+*>
+*>          NOTE: MAXC2NRMK in the factorization step K would equal
+*>                R(K+1,K+1) in the next factorization step K+1.
+*> \endverbatim
+*>
+*> \param[out] RELMAXC2NRMK
+*> \verbatim
+*>          RELMAXC2NRMK is DOUBLE PRECISION
+*>          The ratio MAXC2NRMK / MAXC2NRM of the maximum column
+*>          2-norm of the residual matrix R22(K) (when the factorization
+*>          stopped at rank K) to the maximum column 2-norm of the
+*>          whole original matrix A. RELMAXC2NRMK >= 0.
+*>
+*>          a) If K = 0, i.e. the factorization was not performed,
+*>             the matrix A was not modified and is itself a residual
+*>             matrix, then RELMAXC2NRMK = 1.0.
+*>
+*>          b) If 0 < K < min(M,N), then
+*>                RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM is returned.
+*>
+*>          c) If K = min(M,N), i.e. the whole matrix A was
+*>             factorized and there is no residual matrix,
+*>             then RELMAXC2NRMK = 0.0.
+*>
+*>         NOTE: RELMAXC2NRMK in the factorization step K would equal
+*>               abs(R(K+1,K+1))/abs(R(1,1)) in the next factorization
+*>               step K+1.
+*> \endverbatim
+*>
+*> \param[out] JPIV
+*> \verbatim
+*>          JPIV is INTEGER array, dimension (N)
+*>          Column pivot indices. For 1 <= j <= N, column j
+*>          of the matrix A was interchanged with column JPIV(j).
+*>
+*>          The elements of the array JPIV(1:N) are always set
+*>          by the routine, for example, even  when no columns
+*>          were factorized, i.e. when K = 0, the elements are
+*>          set as JPIV(j) = j for j = 1:N.
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is COMPLEX*16 array, dimension (min(M,N))
+*>          The scalar factors of the elementary reflectors.
+*>
+*>          If 0 < K <= min(M,N), only the elements TAU(1:K) of
+*>          the array TAU are modified by the factorization.
+*>          After the factorization is computed, if no NaN was found
+*>          during the factorization, the remaining elements
+*>          TAU(K+1:min(M,N)) are set to zero, otherwise the
+*>          elements TAU(K+1:min(M,N)) are not set and therefore
+*>          undefined.
+*>          ( If K = 0, all elements of TAU are set to zero, if
+*>          the matrix A does not contain NaN. )
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX*16 array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and
+*>          LWORK >= N+NRHS-1, otherwise.
+*>          For optimal performance LWORK >= NB*( N+NRHS+1 ),
+*>          where NB is the optimal block size for ZGEQP3RK returned
+*>          by ILAENV. Minimal block size MINNB=2.
+*>
+*>          NOTE: The decision, whether to use unblocked BLAS 2
+*>          or blocked BLAS 3 code is based not only on the dimension
+*>          LWORK of the available workspace WORK, but also on the
+*>          matrix A dimension N via crossover point NX returned
+*>          by ILAENV. (For N less than NX, unblocked code should be
+*>          used.)
+*>
+*>          If LWORK = -1, then a workspace query is assumed;
+*>          the routine only calculates the optimal size of the WORK
+*>          array, returns this value as the first entry of the WORK
+*>          array, and no error message related to LWORK is issued
+*>          by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is DOUBLE PRECISION array, dimension (2*N)
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (N-1).
+*>          Is a work array. ( IWORK is used to store indices
+*>          of "bad" columns for norm downdating in the residual
+*>          matrix in the blocked step auxiliary subroutine ZLAQP3RK ).
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          1) INFO = 0: successful exit.
+*>          2) INFO < 0: if INFO = -i, the i-th argument had an
+*>                       illegal value.
+*>          3) If INFO = j_1, where 1 <= j_1 <= N, then NaN was
+*>             detected and the routine stops the computation.
+*>             The j_1-th column of the matrix A or the j_1-th
+*>             element of array TAU contains the first occurrence
+*>             of NaN in the factorization step K+1 ( when K columns
+*>             have been factorized ).
+*>
+*>             On exit:
+*>             K                  is set to the number of
+*>                                   factorized columns without
+*>                                   exception.
+*>             MAXC2NRMK          is set to NaN.
+*>             RELMAXC2NRMK       is set to NaN.
+*>             TAU(K+1:min(M,N))  is not set and contains undefined
+*>                                   elements. If j_1=K+1, TAU(K+1)
+*>                                   may contain NaN.
+*>          4) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN
+*>             was detected, but +Inf (or -Inf) was detected and
+*>             the routine continues the computation until completion.
+*>             The (j_2-N)-th column of the matrix A contains the first
+*>             occurrence of +Inf (or -Inf) in the factorization
+*>             step K+1 ( when K columns have been factorized ).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup geqp3rk
+*
+*> \par Further Details:
+*  =====================
+*
+*> \verbatim
+*> ZGEQP3RK is based on the same BLAS3 Householder QR factorization
+*> algorithm with column pivoting as in ZGEQP3 routine which uses
+*> ZLARFG routine to generate Householder reflectors
+*> for QR factorization.
+*>
+*> We can also write:
+*>
+*>   A = A_approx(K) + A_residual(K)
+*>
+*> The low rank approximation matrix A(K)_approx from
+*> the truncated QR factorization of rank K of the matrix A is:
+*>
+*>   A(K)_approx = Q(K) * ( R(K)_approx ) * P(K)**T
+*>                        (     0     0 )
+*>
+*>               = Q(K) * ( R11(K) R12(K) ) * P(K)**T
+*>                        (      0      0 )
+*>
+*> The residual A_residual(K) of the matrix A is:
+*>
+*>   A_residual(K) = Q(K) * ( 0              0 ) * P(K)**T =
+*>                          ( 0  R(K)_residual )
+*>
+*>                 = Q(K) * ( 0        0 ) * P(K)**T
+*>                          ( 0   R22(K) )
+*>
+*> The truncated (rank K) factorization guarantees that
+*> the maximum column 2-norm of A_residual(K) is less than
+*> or equal to MAXC2NRMK up to roundoff error.
+*>
+*> NOTE: An approximation of the null vectors
+*>       of A can be easily computed from R11(K)
+*>       and R12(K):
+*>
+*>       Null( A(K) )_approx = P * ( inv(R11(K)) * R12(K) )
+*>                                 (         -I           )
+*>
+*> \endverbatim
+*
+*> \par References:
+*  ================
+*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996.
+*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain.
+*> X. Sun, Computer Science Dept., Duke University, USA.
+*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA.
+*> A BLAS-3 version of the QR factorization with column pivoting.
+*> LAPACK Working Note 114
+*> \htmlonly
+*> <a href="https://www.netlib.org/lapack/lawnspdf/lawn114.pdf">https://www.netlib.org/lapack/lawnspdf/lawn114.pdf</a>
+*> \endhtmlonly
+*> and in
+*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998.
+*> \htmlonly
+*> <a href="https://doi.org/10.1137/S1064827595296732">https://doi.org/10.1137/S1064827595296732</a>
+*> \endhtmlonly
+*>
+*> [2] A partial column norm updating strategy developed in 2006.
+*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia.
+*> On the failure of rank revealing QR factorization software – a case study.
+*> LAPACK Working Note 176.
+*> \htmlonly
+*> <a href="http://www.netlib.org/lapack/lawnspdf/lawn176.pdf">http://www.netlib.org/lapack/lawnspdf/lawn176.pdf</a>
+*> \endhtmlonly
+*> and in
+*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages.
+*> \htmlonly
+*> <a href="https://doi.org/10.1145/1377612.1377616">https://doi.org/10.1145/1377612.1377616</a>
+*> \endhtmlonly
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*>  November  2023, Igor Kozachenko, James Demmel,
+*>                  EECS Department,
+*>                  University of California, Berkeley, USA.
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE ZGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL, A, LDA,
+     $                     K, MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+     $                     WORK, LWORK, RWORK, IWORK, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO, K, KMAX, LDA, LWORK, M, N, NRHS
+      DOUBLE PRECISION   ABSTOL,  MAXC2NRMK, RELMAXC2NRMK, RELTOL
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IWORK( * ), JPIV( * )
+      DOUBLE PRECISION   RWORK( * )
+      COMPLEX*16         A( LDA, * ), TAU( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            INB, INBMIN, IXOVER
+      PARAMETER          ( INB = 1, INBMIN = 2, IXOVER = 3 )
+      DOUBLE PRECISION   ZERO, ONE, TWO
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0, TWO = 2.0D+0 )
+      COMPLEX*16         CZERO
+      PARAMETER          ( CZERO = ( 0.0D+0, 0.0D+0 ) )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LQUERY, DONE
+      INTEGER            IINFO, IOFFSET, IWS, J, JB, JBF, JMAXB, JMAX,
+     $                   JMAXC2NRM, KF, KP1, LWKOPT, MINMN, N_SUB, NB,
+     $                   NBMIN, NX
+      DOUBLE PRECISION   EPS, HUGEVAL, MAXC2NRM, SAFMIN
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ZLAQP2RK, ZLAQP3RK, XERBLA
+*     ..
+*     .. External Functions ..
+      LOGICAL            DISNAN
+      INTEGER            IDAMAX, ILAENV
+      DOUBLE PRECISION   DLAMCH, DZNRM2
+      EXTERNAL           DISNAN, DLAMCH, DZNRM2, IDAMAX, ILAENV
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCMPLX, MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test input arguments
+*     ====================
+*
+      INFO = 0
+      LQUERY = ( LWORK.EQ.-1 )
+      IF( M.LT.0 ) THEN
+         INFO = -1
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -2
+      ELSE IF( NRHS.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( KMAX.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( DISNAN( ABSTOL ) ) THEN
+         INFO = -5
+      ELSE IF( DISNAN( RELTOL ) ) THEN
+         INFO = -6
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -8
+      END IF
+*
+*     If the input parameters M, N, NRHS, KMAX, LDA are valid:
+*       a) Test the input workspace size LWORK for the minimum
+*          size requirement IWS.
+*       b) Determine the optimal block size NB and optimal
+*          workspace size LWKOPT to be returned in WORK(1)
+*          in case of (1) LWORK < IWS, (2) LQUERY = .TRUE.,
+*          (3) when routine exits.
+*     Here, IWS is the minimum workspace required for unblocked
+*     code.
+*
+      IF( INFO.EQ.0 ) THEN
+         MINMN = MIN( M, N )
+         IF( MINMN.EQ.0 ) THEN
+            IWS = 1
+            LWKOPT = 1
+         ELSE
+*
+*           Minimal workspace size in case of using only unblocked
+*           BLAS 2 code in ZLAQP2RK.
+*           1) ZLAQP2RK: N+NRHS-1 to use in WORK array that is used
+*              in ZLARF subroutine inside ZLAQP2RK to apply an
+*              elementary reflector from the left.
+*           TOTAL_WORK_SIZE = N + NRHS - 1 (norms are kept in RWORK)
+*
+            IWS = N + NRHS - 1
+*
+*           Assign to NB optimal block size.
+*
+            NB = ILAENV( INB, 'ZGEQP3RK', ' ', M, N, -1, -1 )
+*
+*           A formula for the optimal workspace size in case of using
+*           both unblocked BLAS 2 in ZLAQP2RK and blocked BLAS 3 code
+*           in ZLAQP3RK.
+*           1) ZGEQP3RK, ZLAQP2RK, ZLAQP3RK: 2*N to store full and
+*              partial column 2-norms.
+*           2) ZLAQP2RK: N+NRHS-1 to use in WORK array that is used
+*              in ZLARF subroutine to apply an elementary reflector
+*              from the left.
+*           3) ZLAQP3RK: NB*(N+NRHS) to use in the work array F that
+*              is used to apply a block reflector from
+*              the left.
+*           4) ZLAQP3RK: NB to use in the auxiliary array AUX.
+*           Sizes (2) and ((3) + (4)) should intersect, therefore
+*           TOTAL_WORK_SIZE = 2*N + NB*( N+NRHS+1 ), given NBMIN=2.
+*
+            LWKOPT = 2*N + NB*( N+NRHS+1 )
+         END IF
+         WORK( 1 ) = DCMPLX( LWKOPT )
+*
+         IF( ( LWORK.LT.IWS ) .AND. .NOT.LQUERY ) THEN
+            INFO = -15
+         END IF
+      END IF
+*
+*      NOTE: The optimal workspace size is returned in WORK(1), if
+*            the input parameters M, N, NRHS, KMAX, LDA are valid.
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'ZGEQP3RK', -INFO )
+         RETURN
+      ELSE IF( LQUERY ) THEN
+         RETURN
+      END IF
+*
+*     Quick return if possible for M=0 or N=0.
+*
+      IF( MINMN.EQ.0 ) THEN
+         K = 0
+         MAXC2NRMK = ZERO
+         RELMAXC2NRMK = ZERO
+         WORK( 1 ) = DCMPLX( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*
+*     Initialize column pivot array JPIV.
+*
+      DO J = 1, N
+         JPIV( J ) = J
+      END DO
+*
+*     ==================================================================
+*
+*     Initialize storage for partial and exact column 2-norms.
+*     a) The elements RWORK(1:N) are used to store partial column
+*        2-norms of the matrix A, and may decrease in each computation
+*        step; initialize to the values of complete columns 2-norms.
+*     b) The elements RWORK(N+1:2*N) are used to store complete column
+*        2-norms of the matrix A, they are not changed during the
+*        computation; initialize the values of complete columns 2-norms.
+*
+      DO J = 1, N
+         RWORK( J ) = DZNRM2( M, A( 1, J ), 1 )
+         RWORK( N+J ) = RWORK( J )
+      END DO
+*
+*     ==================================================================
+*
+*     Compute the pivot column index and the maximum column 2-norm
+*     for the whole original matrix stored in A(1:M,1:N).
+*
+      KP1 = IDAMAX( N, RWORK( 1 ), 1 )
+      MAXC2NRM = RWORK( KP1 )
+*
+*     ==================================================================.
+*
+      IF( DISNAN( MAXC2NRM ) ) THEN
+*
+*        Check if the matrix A contains NaN, set INFO parameter
+*        to the column number where the first NaN is found and return
+*        from the routine.
+*
+         K = 0
+         INFO = KP1
+*
+*        Set MAXC2NRMK and  RELMAXC2NRMK to NaN.
+*
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = MAXC2NRM
+*
+*        Array TAU is not set and contains undefined elements.
+*
+         WORK( 1 ) = DCMPLX( LWKOPT )
+         RETURN
+      END IF
+*
+*     ===================================================================
+*
+      IF( MAXC2NRM.EQ.ZERO ) THEN
+*
+*        Check if the matrix A is a zero matrix, set array TAU and
+*        return from the routine.
+*
+         K = 0
+         MAXC2NRMK = ZERO
+         RELMAXC2NRMK = ZERO
+*
+         DO J = 1, MINMN
+            TAU( J ) = CZERO
+         END DO
+*
+         WORK( 1 ) = DCMPLX( LWKOPT )
+         RETURN
+*
+      END IF
+*
+*     ===================================================================
+*
+      HUGEVAL = DLAMCH( 'Overflow' )
+*
+      IF( MAXC2NRM.GT.HUGEVAL ) THEN
+*
+*        Check if the matrix A contains +Inf or -Inf, set INFO parameter
+*        to the column number, where the first +/-Inf  is found plus N,
+*        and continue the computation.
+*
+         INFO = N + KP1
+*
+      END IF
+*
+*     ==================================================================
+*
+*     Quick return if possible for the case when the first
+*     stopping criterion is satisfied, i.e. KMAX = 0.
+*
+      IF( KMAX.EQ.0 ) THEN
+         K = 0
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = ONE
+         DO J = 1, MINMN
+            TAU( J ) = CZERO
+         END DO
+         WORK( 1 ) = DCMPLX( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*
+      EPS = DLAMCH('Epsilon')
+*
+*     Adjust ABSTOL
+*
+      IF( ABSTOL.GE.ZERO ) THEN
+         SAFMIN = DLAMCH('Safe minimum')
+         ABSTOL = MAX( ABSTOL, TWO*SAFMIN )
+      END IF
+*
+*     Adjust RELTOL
+*
+      IF( RELTOL.GE.ZERO ) THEN
+         RELTOL = MAX( RELTOL, EPS )
+      END IF
+*
+*     ===================================================================
+*
+*     JMAX is the maximum index of the column to be factorized,
+*     which is also limited by the first stopping criterion KMAX.
+*
+      JMAX = MIN( KMAX, MINMN )
+*
+*     ===================================================================
+*
+*     Quick return if possible for the case when the second or third
+*     stopping criterion for the whole original matrix is satisfied,
+*     i.e. MAXC2NRM <= ABSTOL or RELMAXC2NRM <= RELTOL
+*     (which is ONE <= RELTOL).
+*
+      IF( MAXC2NRM.LE.ABSTOL .OR. ONE.LE.RELTOL ) THEN
+*
+         K = 0
+         MAXC2NRMK = MAXC2NRM
+         RELMAXC2NRMK = ONE
+*
+         DO J = 1, MINMN
+            TAU( J ) = CZERO
+         END DO
+*
+         WORK( 1 ) = DCMPLX( LWKOPT )
+         RETURN
+      END IF
+*
+*     ==================================================================
+*     Factorize columns
+*     ==================================================================
+*
+*     Determine the block size.
+*
+      NBMIN = 2
+      NX = 0
+*
+      IF( ( NB.GT.1 ) .AND. ( NB.LT.MINMN ) ) THEN
+*
+*        Determine when to cross over from blocked to unblocked code.
+*        (for N less than NX, unblocked code should be used).
+*
+         NX = MAX( 0, ILAENV( IXOVER, 'ZGEQP3RK', ' ', M, N, -1, -1 ) )
+*
+         IF( NX.LT.MINMN ) THEN
+*
+*           Determine if workspace is large enough for blocked code.
+*
+            IF( LWORK.LT.LWKOPT ) THEN
+*
+*              Not enough workspace to use optimal block size that
+*              is currently stored in NB.
+*              Reduce NB and determine the minimum value of NB.
+*              AUXV and F take NB*( N+NRHS+1 ) entries of WORK.
+*
+               NB = ( LWORK-2*N ) / ( N+NRHS+1 )
+               NBMIN = MAX( 2, ILAENV( INBMIN, 'ZGEQP3RK', ' ', M, N,
+     $                 -1, -1 ) )
+*
+            END IF
+         END IF
+      END IF
+*
+*     ==================================================================
+*
+*     DONE is the boolean flag to represent the case when the
+*     factorization completed in the block factorization routine,
+*     before the end of the block.
+*
+      DONE = .FALSE.
+*
+*     J is the column index.
+*
+      J = 1
+*
+*     (1) Use blocked code initially.
+*
+*     JMAXB is the maximum column index of the block, when the
+*     blocked code is used, is also limited by the first stopping
+*     criterion KMAX.
+*
+      JMAXB = MIN( KMAX, MINMN - NX )
+*
+      IF( NB.GE.NBMIN .AND. NB.LT.JMAX .AND. JMAXB.GT.0 ) THEN
+*
+*        Loop over the column blocks of the matrix A(1:M,1:JMAXB). Here:
+*        J   is the column index of a column block;
+*        JB  is the column block size to pass to block factorization
+*            routine in a loop step;
+*        JBF is the number of columns that were actually factorized
+*            that was returned by the block factorization routine
+*            in a loop step, JBF <= JB;
+*        N_SUB is the number of columns in the submatrix;
+*        IOFFSET is the number of rows that should not be factorized.
+*
+         DO WHILE( J.LE.JMAXB )
+*
+            JB = MIN( NB, JMAXB-J+1 )
+            N_SUB = N-J+1
+            IOFFSET = J-1
+*
+*           Factorize JB columns among the columns A(J:N).
+*
+            CALL ZLAQP3RK( M, N_SUB, NRHS, IOFFSET, JB, ABSTOL,
+     $                     RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA,
+     $                     DONE, JBF, MAXC2NRMK, RELMAXC2NRMK,
+     $                     JPIV( J ), TAU( J ),
+     $                     RWORK( J ), RWORK( N+J ),
+     $                     WORK( 1 ), WORK( JB+1 ),
+     $                     N+NRHS-J+1, IWORK, IINFO )
+*
+*           Set INFO on the first occurrence of Inf.
+*
+            IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN
+               INFO = 2*IOFFSET + IINFO
+            END IF
+*
+            IF( DONE ) THEN
+*
+*              Either the submatrix is zero before the end of the
+*              column block, or ABSTOL or RELTOL criterion is
+*              satisfied before the end of the column block, we can
+*              return from the routine. Perform the following before
+*              returning:
+*                a) Set the number of factorized columns K,
+*                   K = IOFFSET + JBF from the last call of blocked
+*                   routine.
+*                NOTE: 1) MAXC2NRMK and RELMAXC2NRMK are returned
+*                         by the block factorization routine;
+*                      2) The remaining TAUs are set to ZERO by the
+*                         block factorization routine.
+*
+               K = IOFFSET + JBF
+*
+*              Set INFO on the first occurrence of NaN, NaN takes
+*              precedence over Inf.
+*
+               IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN
+                  INFO = IOFFSET + IINFO
+               END IF
+*
+*              Return from the routine.
+*
+               WORK( 1 ) = DCMPLX( LWKOPT )
+*
+               RETURN
+*
+            END IF
+*
+            J = J + JBF
+*
+         END DO
+*
+      END IF
+*
+*     Use unblocked code to factor the last or only block.
+*     J = JMAX+1 means we factorized the maximum possible number of
+*     columns, that is in ELSE clause we need to compute
+*     the MAXC2NORM and RELMAXC2NORM to return after we processed
+*     the blocks.
+*
+      IF( J.LE.JMAX ) THEN
+*
+*        N_SUB is the number of columns in the submatrix;
+*        IOFFSET is the number of rows that should not be factorized.
+*
+         N_SUB = N-J+1
+         IOFFSET = J-1
+*
+         CALL ZLAQP2RK( M, N_SUB, NRHS, IOFFSET, JMAX-J+1,
+     $                  ABSTOL, RELTOL, KP1, MAXC2NRM, A( 1, J ), LDA,
+     $                  KF, MAXC2NRMK, RELMAXC2NRMK, JPIV( J ),
+     $                  TAU( J ), RWORK( J ), RWORK( N+J ),
+     $                  WORK( 1 ), IINFO )
+*
+*        ABSTOL or RELTOL criterion is satisfied when the number of
+*        the factorized columns KF is smaller than the number
+*        of columns JMAX-J+1 supplied to be factorized by the
+*        unblocked routine, we can return from
+*        the routine. Perform the following before returning:
+*           a) Set the number of factorized columns K,
+*           b) MAXC2NRMK and RELMAXC2NRMK are returned by the
+*              unblocked factorization routine above.
+*
+         K = J - 1 + KF
+*
+*        Set INFO on the first exception occurrence of Inf or NaN,
+*        (NaN takes precedence over Inf).
+*
+         IF( IINFO.GT.N_SUB .AND. INFO.EQ.0 ) THEN
+            INFO = 2*IOFFSET + IINFO
+         ELSE IF( IINFO.LE.N_SUB .AND. IINFO.GT.0 ) THEN
+            INFO = IOFFSET + IINFO
+         END IF
+*
+      ELSE
+*
+*        Compute the return values for blocked code.
+*
+*        Set the number of factorized columns if the unblocked routine
+*        was not called.
+*
+         K = JMAX
+*
+*        If there exists a residual matrix after the blocked code:
+*           1) compute the values of MAXC2NRMK, RELMAXC2NRMK of the
+*              residual matrix, otherwise set them to ZERO;
+*           2) Set TAU(K+1:MINMN) to ZERO.
+*
+         IF( K.LT.MINMN ) THEN
+            JMAXC2NRM = K + IDAMAX( N-K, RWORK( K+1 ), 1 )
+            MAXC2NRMK = RWORK( JMAXC2NRM )
+            IF( K.EQ.0 ) THEN
+               RELMAXC2NRMK = ONE
+            ELSE
+               RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM
+            END IF
+*
+            DO J = K + 1, MINMN
+               TAU( J ) = CZERO
+            END DO
+*
+         ELSE
+            MAXC2NRMK = ZERO
+            RELMAXC2NRMK = ZERO
+*
+         END IF
+*
+*     END IF( J.LE.JMAX ) THEN
+*
+      END IF
+*
+      WORK( 1 ) = DCMPLX( LWKOPT )
+*
+      RETURN
+*
+*     End of ZGEQP3RK
+*
+      END
diff --git a/lapack-netlib/SRC/zgeqr.f b/lapack-netlib/SRC/zgeqr.f
index 20a80d083..7df9c2403 100644
--- a/lapack-netlib/SRC/zgeqr.f
+++ b/lapack-netlib/SRC/zgeqr.f
@@ -99,7 +99,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1 or -2, then a workspace query is assumed. The routine
 *>          only calculates the sizes of the T and WORK arrays, returns these
 *>          values as the first entries of the T and WORK arrays, and no error
@@ -168,6 +168,8 @@
 *>
 *> \endverbatim
 *>
+*> \ingroup geqr
+*>
 *  =====================================================================
       SUBROUTINE ZGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK,
      $                  INFO )
@@ -188,7 +190,7 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY, LMINWS, MINT, MINW
-      INTEGER            MB, NB, MINTSZ, NBLCKS
+      INTEGER            MB, NB, MINTSZ, NBLCKS, LWMIN, LWREQ
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -244,8 +246,10 @@
 *
 *     Determine if the workspace size satisfies minimal size
 *
+      LWMIN = MAX( 1, N )
+      LWREQ = MAX( 1, N*NB )
       LMINWS = .FALSE.
-      IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.NB*N )
+      IF( ( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) .OR. LWORK.LT.LWREQ )
      $    .AND. ( LWORK.GE.N ) .AND. ( TSIZE.GE.MINTSZ )
      $    .AND. ( .NOT.LQUERY ) ) THEN
         IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 ) ) THEN
@@ -253,7 +257,7 @@
           NB = 1
           MB = M
         END IF
-        IF( LWORK.LT.NB*N ) THEN
+        IF( LWORK.LT.LWREQ ) THEN
           LMINWS = .TRUE.
           NB = 1
         END IF
@@ -268,7 +272,7 @@
       ELSE IF( TSIZE.LT.MAX( 1, NB*N*NBLCKS + 5 )
      $   .AND. ( .NOT.LQUERY ) .AND. ( .NOT.LMINWS ) ) THEN
         INFO = -6
-      ELSE IF( ( LWORK.LT.MAX( 1, N*NB ) ) .AND. ( .NOT.LQUERY )
+      ELSE IF( ( LWORK.LT.LWREQ ) .AND. ( .NOT.LQUERY )
      $   .AND. ( .NOT.LMINWS ) ) THEN
         INFO = -8
       END IF
@@ -282,9 +286,9 @@
         T( 2 ) = MB
         T( 3 ) = NB
         IF( MINW ) THEN
-          WORK( 1 ) = MAX( 1, N )
+          WORK( 1 ) = LWMIN
         ELSE
-          WORK( 1 ) = MAX( 1, NB*N )
+          WORK( 1 ) = LWREQ
         END IF
       END IF
       IF( INFO.NE.0 ) THEN
@@ -309,7 +313,7 @@
      $                LWORK, INFO )
       END IF
 *
-      WORK( 1 ) = MAX( 1, NB*N )
+      WORK( 1 ) = LWREQ
 *
       RETURN
 *
diff --git a/lapack-netlib/SRC/zgeqrfp.f b/lapack-netlib/SRC/zgeqrfp.f
index 73bcde667..3562de36e 100644
--- a/lapack-netlib/SRC/zgeqrfp.f
+++ b/lapack-netlib/SRC/zgeqrfp.f
@@ -97,7 +97,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  LWORK >= max(1,N).
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= N, otherwise.
 *>          For optimum performance LWORK >= N*NB, where NB is
 *>          the optimal blocksize.
 *>
@@ -122,7 +123,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16GEcomputational
+*> \ingroup geqrfp
 *
 *> \par Further Details:
 *  =====================
@@ -162,8 +163,8 @@
 *
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            I, IB, IINFO, IWS, K, LDWORK, LWKOPT, NB,
-     $                   NBMIN, NX
+      INTEGER            I, IB, IINFO, IWS, K, LDWORK, LWKMIN, LWKOPT,
+     $                   NB, NBMIN, NX
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           XERBLA, ZGEQR2P, ZLARFB, ZLARFT
@@ -181,8 +182,16 @@
 *
       INFO = 0
       NB = ILAENV( 1, 'ZGEQRF', ' ', M, N, -1, -1 )
-      LWKOPT = N*NB
+      K = MIN( M, N )
+      IF( K.EQ.0 ) THEN
+         LWKMIN = 1
+         LWKOPT = 1
+      ELSE
+         LWKMIN = N
+         LWKOPT = N*NB
+      END IF
       WORK( 1 ) = LWKOPT
+*
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
@@ -190,7 +199,7 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -7
       END IF
       IF( INFO.NE.0 ) THEN
@@ -202,7 +211,6 @@
 *
 *     Quick return if possible
 *
-      K = MIN( M, N )
       IF( K.EQ.0 ) THEN
          WORK( 1 ) = 1
          RETURN
@@ -210,7 +218,7 @@
 *
       NBMIN = 2
       NX = 0
-      IWS = N
+      IWS = LWKMIN
       IF( NB.GT.1 .AND. NB.LT.K ) THEN
 *
 *        Determine when to cross over from blocked to unblocked code.
diff --git a/lapack-netlib/SRC/zgesdd.f b/lapack-netlib/SRC/zgesdd.f
index 30d18a3a0..7f203afa5 100644
--- a/lapack-netlib/SRC/zgesdd.f
+++ b/lapack-netlib/SRC/zgesdd.f
@@ -53,12 +53,6 @@
 *>
 *> Note that the routine returns VT = V**H, not V.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/zgesvdq.f b/lapack-netlib/SRC/zgesvdq.f
index 05eb722f8..b990f7389 100644
--- a/lapack-netlib/SRC/zgesvdq.f
+++ b/lapack-netlib/SRC/zgesvdq.f
@@ -363,7 +363,7 @@
 *>   an optimal implementation would do all necessary scaling before calling
 *>   CGESVD and the scaling in CGESVD can be switched off.
 *>   3. Other comments related to code optimization are given in comments in the
-*>   code, enlosed in [[double brackets]].
+*>   code, enclosed in [[double brackets]].
 *> \endverbatim
 *
 *> \par Bugs, examples and comments
diff --git a/lapack-netlib/SRC/zgesvj.f b/lapack-netlib/SRC/zgesvj.f
index 6cd2335f2..2be45d826 100644
--- a/lapack-netlib/SRC/zgesvj.f
+++ b/lapack-netlib/SRC/zgesvj.f
@@ -200,23 +200,25 @@
 *> \verbatim
 *>          LDV is INTEGER
 *>          The leading dimension of the array V, LDV >= 1.
-*>          If JOBV = 'V', then LDV >= max(1,N).
-*>          If JOBV = 'A', then LDV >= max(1,MV) .
+*>          If JOBV = 'V', then LDV >= MAX(1,N).
+*>          If JOBV = 'A', then LDV >= MAX(1,MV) .
 *> \endverbatim
 *>
 *> \param[in,out] CWORK
 *> \verbatim
-*>          CWORK is COMPLEX*16 array, dimension (max(1,LWORK))
+*>          CWORK is COMPLEX*16 array, dimension (MAX(1,LWORK))
 *>          Used as workspace.
-*>          If on entry LWORK = -1, then a workspace query is assumed and
-*>          no computation is done; CWORK(1) is set to the minial (and optimal)
-*>          length of CWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER.
-*>          Length of CWORK, LWORK >= M+N.
+*>          Length of CWORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= M+N, otherwise.
+*>
+*>          If on entry LWORK = -1, then a workspace query is assumed and
+*>          no computation is done; CWORK(1) is set to the minimal (and optimal)
+*>          length of CWORK.
 *> \endverbatim
 *>
 *> \param[in,out] RWORK
@@ -247,15 +249,17 @@
 *>          RWORK(6) = the largest absolute value over all sines of the
 *>                    Jacobi rotation angles in the last sweep. It can be
 *>                    useful for a post festum analysis.
-*>         If on entry LRWORK = -1, then a workspace query is assumed and
-*>         no computation is done; RWORK(1) is set to the minial (and optimal)
-*>         length of RWORK.
 *> \endverbatim
 *>
 *> \param[in] LRWORK
 *> \verbatim
 *>         LRWORK is INTEGER
-*>         Length of RWORK, LRWORK >= MAX(6,N).
+*>         Length of RWORK.
+*>         LRWORK >= 1, if MIN(M,N) = 0, and LRWORK >= MAX(6,N), otherwise.
+*>
+*>         If on entry LRWORK = -1, then a workspace query is assumed and
+*>         no computation is done; RWORK(1) is set to the minimal (and optimal)
+*>         length of RWORK.
 *> \endverbatim
 *>
 *> \param[out] INFO
@@ -276,7 +280,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16GEcomputational
+*> \ingroup gesvj
 *
 *> \par Further Details:
 *  =====================
@@ -367,23 +371,25 @@
 *
 *     .. Local Parameters ..
       DOUBLE PRECISION   ZERO,         HALF,         ONE
-      PARAMETER  ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0)
-      COMPLEX*16      CZERO,                  CONE
-      PARAMETER  ( CZERO = (0.0D0, 0.0D0), CONE = (1.0D0, 0.0D0) )
-      INTEGER      NSWEEP
-      PARAMETER  ( NSWEEP = 30 )
+      PARAMETER        ( ZERO = 0.0D0, HALF = 0.5D0, ONE = 1.0D0)
+      COMPLEX*16         CZERO,                  CONE
+      PARAMETER        ( CZERO = (0.0D0, 0.0D0), CONE = (1.0D0, 0.0D0) )
+      INTEGER            NSWEEP
+      PARAMETER        ( NSWEEP = 30 )
 *     ..
 *     .. Local Scalars ..
-      COMPLEX*16 AAPQ, OMPQ
-      DOUBLE PRECISION AAPP, AAPP0, AAPQ1, AAQQ, APOAQ, AQOAP, BIG,
-     $       BIGTHETA, CS, CTOL, EPSLN, MXAAPQ,
-     $       MXSINJ, ROOTBIG, ROOTEPS, ROOTSFMIN, ROOTTOL,
-     $       SKL, SFMIN, SMALL, SN, T, TEMP1, THETA, THSIGN, TOL
-      INTEGER BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1,
-     $        ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34,
-     $        N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP, SWBAND
-      LOGICAL APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE, ROTOK,
-     $        RSVEC, UCTOL, UPPER
+      COMPLEX*16         AAPQ, OMPQ
+      DOUBLE PRECISION   AAPP, AAPP0, AAPQ1, AAQQ, APOAQ, AQOAP, BIG,
+     $                   BIGTHETA, CS, CTOL, EPSLN, MXAAPQ,
+     $                   MXSINJ, ROOTBIG, ROOTEPS, ROOTSFMIN, ROOTTOL,
+     $                   SKL, SFMIN, SMALL, SN, T, TEMP1, THETA, THSIGN,
+     $                   TOL
+      INTEGER            BLSKIP, EMPTSW, i, ibr, IERR, igl, IJBLSK, ir1,
+     $                   ISWROT, jbc, jgl, KBL, LKAHEAD, MVL, N2, N34,
+     $                   N4, NBL, NOTROT, p, PSKIPPED, q, ROWSKIP,
+     $                   SWBAND, MINMN, LWMIN, LRWMIN
+      LOGICAL            APPLV, GOSCALE, LOWER, LQUERY, LSVEC, NOSCALE,
+     $                   ROTOK, RSVEC, UCTOL, UPPER
 *     ..
 *     ..
 *     .. Intrinsic Functions ..
@@ -422,7 +428,16 @@
       UPPER = LSAME( JOBA, 'U' )
       LOWER = LSAME( JOBA, 'L' )
 *
-      LQUERY = ( LWORK .EQ. -1 ) .OR. ( LRWORK .EQ. -1 )
+      MINMN = MIN( M, N )
+      IF( MINMN.EQ.0 ) THEN
+         LWMIN  = 1
+         LRWMIN = 1
+      ELSE
+         LWMIN  = M+N
+         LRWMIN = MAX( 6, N )
+      END IF
+*
+      LQUERY = ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 )
       IF( .NOT.( UPPER .OR. LOWER .OR. LSAME( JOBA, 'G' ) ) ) THEN
          INFO = -1
       ELSE IF( .NOT.( LSVEC .OR. UCTOL .OR. LSAME( JOBU, 'N' ) ) ) THEN
@@ -442,9 +457,9 @@
          INFO = -11
       ELSE IF( UCTOL .AND. ( RWORK( 1 ).LE.ONE ) ) THEN
          INFO = -12
-      ELSE IF( ( LWORK.LT.( M+N ) ) .AND. ( .NOT.LQUERY ) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. ( .NOT.LQUERY ) ) THEN
          INFO = -13
-      ELSE IF( ( LRWORK.LT.MAX( N, 6 ) ) .AND. ( .NOT.LQUERY ) ) THEN
+      ELSE IF( LRWORK.LT.LRWMIN .AND. ( .NOT.LQUERY ) ) THEN
          INFO = -15
       ELSE
          INFO = 0
@@ -454,15 +469,15 @@
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'ZGESVJ', -INFO )
          RETURN
-      ELSE IF ( LQUERY ) THEN
-         CWORK(1) = M + N
-         RWORK(1) = MAX( N, 6 )
+      ELSE IF( LQUERY ) THEN
+         CWORK( 1 ) = LWMIN
+         RWORK( 1 ) = LRWMIN
          RETURN
       END IF
 *
 * #:) Quick return for void matrix
 *
-      IF( ( M.EQ.0 ) .OR. ( N.EQ.0 ) )RETURN
+      IF( MINMN.EQ.0 ) RETURN
 *
 *     Set numerical parameters
 *     The stopping criterion for Jacobi rotations is
diff --git a/lapack-netlib/SRC/zgetf2.f b/lapack-netlib/SRC/zgetf2.f
index c247f8645..7c63dbbee 100644
--- a/lapack-netlib/SRC/zgetf2.f
+++ b/lapack-netlib/SRC/zgetf2.f
@@ -101,7 +101,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16GEcomputational
+*> \ingroup getf2
 *
 *  =====================================================================
       SUBROUTINE ZGETF2( M, N, A, LDA, IPIV, INFO )
@@ -127,7 +127,7 @@
 *     ..
 *     .. Local Scalars ..
       DOUBLE PRECISION   SFMIN
-      INTEGER            I, J, JP
+      INTEGER            J, JP
 *     ..
 *     .. External Functions ..
       DOUBLE PRECISION   DLAMCH
@@ -135,7 +135,7 @@
       EXTERNAL           DLAMCH, IZAMAX
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           XERBLA, ZGERU, ZSCAL, ZSWAP
+      EXTERNAL           XERBLA, ZGERU, ZRSCL, ZSWAP
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN
@@ -181,15 +181,8 @@
 *
 *           Compute elements J+1:M of J-th column.
 *
-            IF( J.LT.M ) THEN
-               IF( ABS(A( J, J )) .GE. SFMIN ) THEN
-                  CALL ZSCAL( M-J, ONE / A( J, J ), A( J+1, J ), 1 )
-               ELSE
-                  DO 20 I = 1, M-J
-                     A( J+I, J ) = A( J+I, J ) / A( J, J )
-   20             CONTINUE
-               END IF
-            END IF
+            IF( J.LT.M )
+     $         CALL ZRSCL( M-J, A( J, J ), A( J+1, J ), 1 )
 *
          ELSE IF( INFO.EQ.0 ) THEN
 *
diff --git a/lapack-netlib/SRC/zgetri.f b/lapack-netlib/SRC/zgetri.f
index 41782841c..f3806a77c 100644
--- a/lapack-netlib/SRC/zgetri.f
+++ b/lapack-netlib/SRC/zgetri.f
@@ -107,7 +107,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16GEcomputational
+*> \ingroup getri
 *
 *  =====================================================================
       SUBROUTINE ZGETRI( N, A, LDA, IPIV, WORK, LWORK, INFO )
@@ -152,7 +152,7 @@
 *
       INFO = 0
       NB = ILAENV( 1, 'ZGETRI', ' ', N, -1, -1, -1 )
-      LWKOPT = N*NB
+      LWKOPT = MAX( 1, N*NB )
       WORK( 1 ) = LWKOPT
       LQUERY = ( LWORK.EQ.-1 )
       IF( N.LT.0 ) THEN
diff --git a/lapack-netlib/SRC/zgetsls.f b/lapack-netlib/SRC/zgetsls.f
index 17c6d5146..26311c611 100644
--- a/lapack-netlib/SRC/zgetsls.f
+++ b/lapack-netlib/SRC/zgetsls.f
@@ -127,7 +127,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>          If LWORK = -1 or -2, then a workspace query is assumed.
 *>          If LWORK = -1, the routine calculates optimal size of WORK for the
 *>          optimal performance and returns this value in WORK(1).
@@ -154,7 +154,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16GEsolve
+*> \ingroup getsls
 *
 *  =====================================================================
       SUBROUTINE ZGETSLS( TRANS, M, N, NRHS, A, LDA, B, LDB,
@@ -192,7 +192,7 @@
 *     .. External Functions ..
       LOGICAL            LSAME
       DOUBLE PRECISION   DLAMCH, ZLANGE
-      EXTERNAL           LSAME, DLABAD, DLAMCH, ZLANGE
+      EXTERNAL           LSAME, DLAMCH, ZLANGE
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           ZGEQR, ZGEMQR, ZLASCL, ZLASET,
@@ -229,7 +229,10 @@
 *
 *     Determine the optimum and minimum LWORK
 *
-       IF( M.GE.N ) THEN
+       IF( MIN( M, N, NRHS ).EQ.0 ) THEN
+         WSIZEO = 1
+         WSIZEM = 1
+       ELSE IF( M.GE.N ) THEN
          CALL ZGEQR( M, N, A, LDA, TQ, -1, WORKQ, -1, INFO2 )
          TSZO = INT( TQ( 1 ) )
          LWO  = INT( WORKQ( 1 ) )
@@ -297,7 +300,6 @@
 *
        SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' )
        BIGNUM = ONE / SMLNUM
-       CALL DLABAD( SMLNUM, BIGNUM )
 *
 *     Scale A, B if max element outside range [SMLNUM,BIGNUM]
 *
diff --git a/lapack-netlib/SRC/zgetsqrhrt.f b/lapack-netlib/SRC/zgetsqrhrt.f
index 5f0167937..e7ce993aa 100644
--- a/lapack-netlib/SRC/zgetsqrhrt.f
+++ b/lapack-netlib/SRC/zgetsqrhrt.f
@@ -131,13 +131,15 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          The dimension of the array WORK.
-*>          LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ),
+*>          If MIN(M,N) = 0, LWORK >= 1, else
+*>          LWORK >= MAX( 1, LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ),
 *>          where
 *>             NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)),
 *>             NB1LOCAL = MIN(NB1,N).
 *>             LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL,
 *>             LW1 = NB1LOCAL * N,
-*>             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ),
+*>             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ).
+*>
 *>          If LWORK = -1, then a workspace query is assumed.
 *>          The routine only calculates the optimal size of the WORK
 *>          array, returns this value as the first entry of the WORK
@@ -160,7 +162,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup comlpex16OTHERcomputational
+*> \ingroup getsqrhrt
 *
 *> \par Contributors:
 *  ==================
@@ -212,7 +214,7 @@
 *     Test the input arguments
 *
       INFO = 0
-      LQUERY  = LWORK.EQ.-1
+      LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
          INFO = -1
       ELSE IF( N.LT.0 .OR. M.LT.N ) THEN
@@ -225,7 +227,7 @@
          INFO = -5
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
          INFO = -7
-      ELSE IF( LDT.LT.MAX( 1,  MIN( NB2, N ) ) ) THEN
+      ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN
          INFO = -9
       ELSE
 *
@@ -263,8 +265,9 @@
             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) )
 *
             LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) )
+            LWORKOPT = MAX( 1, LWORKOPT )
 *
-            IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN
+            IF( LWORK.LT.LWORKOPT .AND. .NOT.LQUERY ) THEN
                INFO = -11
             END IF
 *
diff --git a/lapack-netlib/SRC/zgges3.f b/lapack-netlib/SRC/zgges3.f
index 8b3e44f88..8235c2543 100644
--- a/lapack-netlib/SRC/zgges3.f
+++ b/lapack-netlib/SRC/zgges3.f
@@ -215,7 +215,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= MAX(1,2*N).
+*>          For good performance, LWORK must generally be larger.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -260,7 +261,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16GEeigen
+*> \ingroup gges3
 *
 *  =====================================================================
       SUBROUTINE ZGGES3( JOBVSL, JOBVSR, SORT, SELCTG, N, A, LDA, B,
@@ -300,7 +301,8 @@
       LOGICAL            CURSL, ILASCL, ILBSCL, ILVSL, ILVSR, LASTSL,
      $                   LQUERY, WANTST
       INTEGER            I, ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT,
-     $                   ILO, IRIGHT, IROWS, IRWRK, ITAU, IWRK, LWKOPT
+     $                   ILO, IRIGHT, IROWS, IRWRK, ITAU, IWRK, LWKOPT,
+     $                   LWKMIN
       DOUBLE PRECISION   ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS, PVSL,
      $                   PVSR, SMLNUM
 *     ..
@@ -309,9 +311,8 @@
       DOUBLE PRECISION   DIF( 2 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DLABAD, XERBLA, ZGEQRF, ZGGBAK, ZGGBAL, ZGGHD3,
-     $                   ZLAQZ0, ZLACPY, ZLASCL, ZLASET, ZTGSEN, ZUNGQR,
-     $                   ZUNMQR
+      EXTERNAL           XERBLA, ZGEQRF, ZGGBAK, ZGGBAL, ZGGHD3, ZLAQZ0,
+     $                   ZLACPY, ZLASCL, ZLASET, ZTGSEN, ZUNGQR, ZUNMQR
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -353,6 +354,8 @@
 *
       INFO = 0
       LQUERY = ( LWORK.EQ.-1 )
+      LWKMIN = MAX( 1, 2*N )
+*
       IF( IJOBVL.LE.0 ) THEN
          INFO = -1
       ELSE IF( IJOBVR.LE.0 ) THEN
@@ -369,7 +372,7 @@
          INFO = -14
       ELSE IF( LDVSR.LT.1 .OR. ( ILVSR .AND. LDVSR.LT.N ) ) THEN
          INFO = -16
-      ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -18
       END IF
 *
@@ -377,28 +380,32 @@
 *
       IF( INFO.EQ.0 ) THEN
          CALL ZGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR )
-         LWKOPT = MAX( 1,  N + INT ( WORK( 1 ) ) )
+         LWKOPT = MAX( LWKMIN,  N + INT( WORK( 1 ) ) )
          CALL ZUNMQR( 'L', 'C', N, N, N, B, LDB, WORK, A, LDA, WORK,
      $                -1, IERR )
-         LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) )
+         LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) )
          IF( ILVSL ) THEN
             CALL ZUNGQR( N, N, N, VSL, LDVSL, WORK, WORK, -1, IERR )
             LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) )
          END IF
          CALL ZGGHD3( JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB, VSL,
      $                LDVSL, VSR, LDVSR, WORK, -1, IERR )
-         LWKOPT = MAX( LWKOPT, N + INT ( WORK( 1 ) ) )
+         LWKOPT = MAX( LWKOPT, N + INT( WORK( 1 ) ) )
          CALL ZLAQZ0( 'S', JOBVSL, JOBVSR, N, 1, N, A, LDA, B, LDB,
      $                ALPHA, BETA, VSL, LDVSL, VSR, LDVSR, WORK, -1,
      $                RWORK, 0, IERR )
-         LWKOPT = MAX( LWKOPT, INT ( WORK( 1 ) ) )
+         LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
          IF( WANTST ) THEN
             CALL ZTGSEN( 0, ILVSL, ILVSR, BWORK, N, A, LDA, B, LDB,
      $                   ALPHA, BETA, VSL, LDVSL, VSR, LDVSR, SDIM,
      $                   PVSL, PVSR, DIF, WORK, -1, IDUM, 1, IERR )
-            LWKOPT = MAX( LWKOPT, INT ( WORK( 1 ) ) )
+            LWKOPT = MAX( LWKOPT, INT( WORK( 1 ) ) )
+         END IF
+         IF( N.EQ.0 ) THEN
+            WORK( 1 ) = 1
+         ELSE
+            WORK( 1 ) = DCMPLX( LWKOPT )
          END IF
-         WORK( 1 ) = DCMPLX( LWKOPT )
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -420,7 +427,6 @@
       EPS = DLAMCH( 'P' )
       SMLNUM = DLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL DLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
diff --git a/lapack-netlib/SRC/zggev3.f b/lapack-netlib/SRC/zggev3.f
index 2d6c74582..0cc073470 100644
--- a/lapack-netlib/SRC/zggev3.f
+++ b/lapack-netlib/SRC/zggev3.f
@@ -174,7 +174,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= MAX(1,2*N).
+*>          For good performance, LWORK must generally be larger.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -208,7 +209,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16GEeigen
+*> \ingroup ggev3
 *
 *  =====================================================================
       SUBROUTINE ZGGEV3( JOBVL, JOBVR, N, A, LDA, B, LDB, ALPHA, BETA,
@@ -243,7 +244,7 @@
       CHARACTER          CHTEMP
       INTEGER            ICOLS, IERR, IHI, IJOBVL, IJOBVR, ILEFT, ILO,
      $                   IN, IRIGHT, IROWS, IRWRK, ITAU, IWRK, JC, JR,
-     $                   LWKOPT
+     $                   LWKMIN, LWKOPT
       DOUBLE PRECISION   ANRM, ANRMTO, BIGNUM, BNRM, BNRMTO, EPS,
      $                   SMLNUM, TEMP
       COMPLEX*16         X
@@ -252,9 +253,8 @@
       LOGICAL            LDUMMA( 1 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DLABAD, XERBLA, ZGEQRF, ZGGBAK, ZGGBAL, ZGGHD3,
-     $                   ZLAQZ0, ZLACPY, ZLASCL, ZLASET, ZTGEVC, ZUNGQR,
-     $                   ZUNMQR
+      EXTERNAL           XERBLA, ZGEQRF, ZGGBAK, ZGGBAL, ZGGHD3, ZLAQZ0,
+     $                   ZLACPY, ZLASCL, ZLASET, ZTGEVC, ZUNGQR, ZUNMQR
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -301,6 +301,7 @@
 *
       INFO = 0
       LQUERY = ( LWORK.EQ.-1 )
+      LWKMIN = MAX( 1, 2*N )
       IF( IJOBVL.LE.0 ) THEN
          INFO = -1
       ELSE IF( IJOBVR.LE.0 ) THEN
@@ -315,7 +316,7 @@
          INFO = -11
       ELSE IF( LDVR.LT.1 .OR. ( ILVR .AND. LDVR.LT.N ) ) THEN
          INFO = -13
-      ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -15
       END IF
 *
@@ -323,7 +324,7 @@
 *
       IF( INFO.EQ.0 ) THEN
          CALL ZGEQRF( N, N, B, LDB, WORK, WORK, -1, IERR )
-         LWKOPT = MAX( 1,  N+INT( WORK( 1 ) ) )
+         LWKOPT = MAX( LWKMIN, N+INT( WORK( 1 ) ) )
          CALL ZUNMQR( 'L', 'C', N, N, N, B, LDB, WORK, A, LDA, WORK,
      $                -1, IERR )
          LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) )
@@ -348,7 +349,11 @@
      $                   RWORK, 0, IERR )
             LWKOPT = MAX( LWKOPT, N+INT( WORK( 1 ) ) )
          END IF
-         WORK( 1 ) = DCMPLX( LWKOPT )
+         IF( N.EQ.0 ) THEN
+            WORK( 1 ) = 1
+         ELSE
+            WORK( 1 ) = DCMPLX( LWKOPT )
+         END IF
       END IF
 *
       IF( INFO.NE.0 ) THEN
@@ -368,7 +373,6 @@
       EPS = DLAMCH( 'E' )*DLAMCH( 'B' )
       SMLNUM = DLAMCH( 'S' )
       BIGNUM = ONE / SMLNUM
-      CALL DLABAD( SMLNUM, BIGNUM )
       SMLNUM = SQRT( SMLNUM ) / EPS
       BIGNUM = ONE / SMLNUM
 *
diff --git a/lapack-netlib/SRC/zgghd3.f b/lapack-netlib/SRC/zgghd3.f
index b29cdc70a..08343688d 100644
--- a/lapack-netlib/SRC/zgghd3.f
+++ b/lapack-netlib/SRC/zgghd3.f
@@ -176,14 +176,14 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX*16 array, dimension (LWORK)
+*>          WORK is COMPLEX*16 array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in]  LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of the array WORK.  LWORK >= 1.
+*>          The length of the array WORK. LWORK >= 1.
 *>          For optimum performance LWORK >= 6*N*NB, where NB is the
 *>          optimal blocksize.
 *>
@@ -208,7 +208,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16OTHERcomputational
+*> \ingroup gghd3
 *
 *> \par Further Details:
 *  =====================
@@ -275,7 +275,12 @@
 *
       INFO = 0
       NB = ILAENV( 1, 'ZGGHD3', ' ', N, ILO, IHI, -1 )
-      LWKOPT = MAX( 6*N*NB, 1 )
+      NH = IHI - ILO + 1
+      IF( NH.LE.1 ) THEN
+         LWKOPT = 1
+      ELSE
+         LWKOPT = 6*N*NB
+      END IF
       WORK( 1 ) = DCMPLX( LWKOPT )
       INITQ = LSAME( COMPQ, 'I' )
       WANTQ = INITQ .OR. LSAME( COMPQ, 'V' )
@@ -325,7 +330,6 @@
 *
 *     Quick return if possible
 *
-      NH = IHI - ILO + 1
       IF( NH.LE.1 ) THEN
          WORK( 1 ) = CONE
          RETURN
@@ -883,6 +887,7 @@
       IF ( JCOL.LT.IHI )
      $   CALL ZGGHRD( COMPQ2, COMPZ2, N, JCOL, IHI, A, LDA, B, LDB, Q,
      $                LDQ, Z, LDZ, IERR )
+*
       WORK( 1 ) = DCMPLX( LWKOPT )
 *
       RETURN
diff --git a/lapack-netlib/SRC/zggqrf.f b/lapack-netlib/SRC/zggqrf.f
index 0388b0874..d8636d663 100644
--- a/lapack-netlib/SRC/zggqrf.f
+++ b/lapack-netlib/SRC/zggqrf.f
@@ -173,7 +173,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16OTHERcomputational
+*> \ingroup ggqrf
 *
 *> \par Further Details:
 *  =====================
@@ -250,7 +250,7 @@
       NB2 = ILAENV( 1, 'ZGERQF', ' ', N, P, -1, -1 )
       NB3 = ILAENV( 1, 'ZUNMQR', ' ', N, M, P, -1 )
       NB = MAX( NB1, NB2, NB3 )
-      LWKOPT = MAX( N, M, P )*NB
+      LWKOPT = MAX( 1, MAX( N, M, P )*NB )
       WORK( 1 ) = LWKOPT
       LQUERY = ( LWORK.EQ.-1 )
       IF( N.LT.0 ) THEN
diff --git a/lapack-netlib/SRC/zggrqf.f b/lapack-netlib/SRC/zggrqf.f
index be912c772..69c14af24 100644
--- a/lapack-netlib/SRC/zggrqf.f
+++ b/lapack-netlib/SRC/zggrqf.f
@@ -172,7 +172,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16OTHERcomputational
+*> \ingroup ggrqf
 *
 *> \par Further Details:
 *  =====================
@@ -249,7 +249,7 @@
       NB2 = ILAENV( 1, 'ZGEQRF', ' ', P, N, -1, -1 )
       NB3 = ILAENV( 1, 'ZUNMRQ', ' ', M, N, P, -1 )
       NB = MAX( NB1, NB2, NB3 )
-      LWKOPT = MAX( N, M, P )*NB
+      LWKOPT = MAX( 1, MAX( N, M, P )*NB )
       WORK( 1 ) = LWKOPT
       LQUERY = ( LWORK.EQ.-1 )
       IF( M.LT.0 ) THEN
diff --git a/lapack-netlib/SRC/zggsvd3.f b/lapack-netlib/SRC/zggsvd3.f
index 71257a7c0..40624f5be 100644
--- a/lapack-netlib/SRC/zggsvd3.f
+++ b/lapack-netlib/SRC/zggsvd3.f
@@ -277,7 +277,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -332,7 +332,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16GEsing
+*> \ingroup ggsvd3
 *
 *> \par Contributors:
 *  ==================
diff --git a/lapack-netlib/SRC/zggsvp3.f b/lapack-netlib/SRC/zggsvp3.f
index f39ccdad3..7b465aaee 100644
--- a/lapack-netlib/SRC/zggsvp3.f
+++ b/lapack-netlib/SRC/zggsvp3.f
@@ -233,7 +233,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.
+*>          The dimension of the array WORK. LWORK >= 1.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -256,7 +256,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16OTHERcomputational
+*> \ingroup ggsvp3
 *
 *> \par Further Details:
 *  =====================
diff --git a/lapack-netlib/SRC/zgsvj0.f b/lapack-netlib/SRC/zgsvj0.f
index 672ab7590..075c243c2 100644
--- a/lapack-netlib/SRC/zgsvj0.f
+++ b/lapack-netlib/SRC/zgsvj0.f
@@ -52,10 +52,10 @@
 *>          Specifies whether the output from this procedure is used
 *>          to compute the matrix V:
 *>          = 'V': the product of the Jacobi rotations is accumulated
-*>                 by postmulyiplying the N-by-N array V.
+*>                 by postmultiplying the N-by-N array V.
 *>                (See the description of V.)
 *>          = 'A': the product of the Jacobi rotations is accumulated
-*>                 by postmulyiplying the MV-by-N array V.
+*>                 by postmultiplying the MV-by-N array V.
 *>                (See the descriptions of MV and V.)
 *>          = 'N': the Jacobi rotations are not accumulated.
 *> \endverbatim
@@ -117,7 +117,7 @@
 *> \param[in] MV
 *> \verbatim
 *>          MV is INTEGER
-*>          If JOBV = 'A', then MV rows of V are post-multipled by a
+*>          If JOBV = 'A', then MV rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then MV is not referenced.
 *> \endverbatim
@@ -125,9 +125,9 @@
 *> \param[in,out] V
 *> \verbatim
 *>          V is COMPLEX*16 array, dimension (LDV,N)
-*>          If JOBV = 'V' then N rows of V are post-multipled by a
+*>          If JOBV = 'V' then N rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
-*>          If JOBV = 'A' then MV rows of V are post-multipled by a
+*>          If JOBV = 'A' then MV rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then V is not referenced.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zgsvj1.f b/lapack-netlib/SRC/zgsvj1.f
index 2461a9555..ba71e155f 100644
--- a/lapack-netlib/SRC/zgsvj1.f
+++ b/lapack-netlib/SRC/zgsvj1.f
@@ -75,10 +75,10 @@
 *>          Specifies whether the output from this procedure is used
 *>          to compute the matrix V:
 *>          = 'V': the product of the Jacobi rotations is accumulated
-*>                 by postmulyiplying the N-by-N array V.
+*>                 by postmultiplying the N-by-N array V.
 *>                (See the description of V.)
 *>          = 'A': the product of the Jacobi rotations is accumulated
-*>                 by postmulyiplying the MV-by-N array V.
+*>                 by postmultiplying the MV-by-N array V.
 *>                (See the descriptions of MV and V.)
 *>          = 'N': the Jacobi rotations are not accumulated.
 *> \endverbatim
@@ -147,7 +147,7 @@
 *> \param[in] MV
 *> \verbatim
 *>          MV is INTEGER
-*>          If JOBV = 'A', then MV rows of V are post-multipled by a
+*>          If JOBV = 'A', then MV rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then MV is not referenced.
 *> \endverbatim
@@ -155,9 +155,9 @@
 *> \param[in,out] V
 *> \verbatim
 *>          V is COMPLEX*16 array, dimension (LDV,N)
-*>          If JOBV = 'V' then N rows of V are post-multipled by a
+*>          If JOBV = 'V' then N rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
-*>          If JOBV = 'A' then MV rows of V are post-multipled by a
+*>          If JOBV = 'A' then MV rows of V are post-multiplied by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then V is not referenced.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zhbevd.f b/lapack-netlib/SRC/zhbevd.f
index 0db551540..be9f01556 100644
--- a/lapack-netlib/SRC/zhbevd.f
+++ b/lapack-netlib/SRC/zhbevd.f
@@ -41,12 +41,6 @@
 *> a complex Hermitian band matrix A.  If eigenvectors are desired, it
 *> uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/zhbevd_2stage.f b/lapack-netlib/SRC/zhbevd_2stage.f
index 4522d5e79..e32c7125c 100644
--- a/lapack-netlib/SRC/zhbevd_2stage.f
+++ b/lapack-netlib/SRC/zhbevd_2stage.f
@@ -47,12 +47,6 @@
 *> the reduction to tridiagonal.  If eigenvectors are desired, it
 *> uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/zhbgvd.f b/lapack-netlib/SRC/zhbgvd.f
index b0664750e..4bd02168d 100644
--- a/lapack-netlib/SRC/zhbgvd.f
+++ b/lapack-netlib/SRC/zhbgvd.f
@@ -46,12 +46,6 @@
 *> and banded, and B is also positive definite.  If eigenvectors are
 *> desired, it uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/zheevd.f b/lapack-netlib/SRC/zheevd.f
index 7f58c7f72..8e86b9e88 100644
--- a/lapack-netlib/SRC/zheevd.f
+++ b/lapack-netlib/SRC/zheevd.f
@@ -41,12 +41,6 @@
 *> complex Hermitian matrix A.  If eigenvectors are desired, it uses a
 *> divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
@@ -122,8 +116,7 @@
 *>
 *> \param[out] RWORK
 *> \verbatim
-*>          RWORK is DOUBLE PRECISION array,
-*>                                         dimension (LRWORK)
+*>          RWORK is DOUBLE PRECISION array, dimension (MAX(1,LRWORK))
 *>          On exit, if INFO = 0, RWORK(1) returns the optimal LRWORK.
 *> \endverbatim
 *>
@@ -186,7 +179,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16HEeigen
+*> \ingroup heevd
 *
 *> \par Further Details:
 *  =====================
diff --git a/lapack-netlib/SRC/zheevd_2stage.f b/lapack-netlib/SRC/zheevd_2stage.f
index 9859b0d67..e697a9823 100644
--- a/lapack-netlib/SRC/zheevd_2stage.f
+++ b/lapack-netlib/SRC/zheevd_2stage.f
@@ -46,12 +46,6 @@
 *> the reduction to tridiagonal.  If eigenvectors are desired, it uses a
 *> divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/zheevr.f b/lapack-netlib/SRC/zheevr.f
index 1452e04a3..fe6e1a85f 100644
--- a/lapack-netlib/SRC/zheevr.f
+++ b/lapack-netlib/SRC/zheevr.f
@@ -272,7 +272,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of the array WORK.  LWORK >= max(1,2*N).
+*>          The length of the array WORK.
+*>          If N <= 1, LWORK >= 1, else LWORK >= 2*N.
 *>          For optimal efficiency, LWORK >= (NB+1)*N,
 *>          where NB is the max of the blocksize for ZHETRD and for
 *>          ZUNMTR as returned by ILAENV.
@@ -294,7 +295,8 @@
 *> \param[in] LRWORK
 *> \verbatim
 *>          LRWORK is INTEGER
-*>          The length of the array RWORK.  LRWORK >= max(1,24*N).
+*>          The length of the array RWORK.
+*>          If N <= 1, LRWORK >= 1, else LRWORK >= 24*N.
 *>
 *>          If LRWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal sizes of the WORK, RWORK
@@ -313,7 +315,8 @@
 *> \param[in] LIWORK
 *> \verbatim
 *>          LIWORK is INTEGER
-*>          The dimension of the array IWORK.  LIWORK >= max(1,10*N).
+*>          The dimension of the array IWORK.
+*>          If N <= 1, LIWORK >= 1, else LIWORK >= 10*N.
 *>
 *>          If LIWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal sizes of the WORK, RWORK
@@ -338,7 +341,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16HEeigen
+*> \ingroup heevr
 *
 *> \par Contributors:
 *  ==================
@@ -417,9 +420,15 @@
       LQUERY = ( ( LWORK.EQ.-1 ) .OR. ( LRWORK.EQ.-1 ) .OR.
      $         ( LIWORK.EQ.-1 ) )
 *
-      LRWMIN = MAX( 1, 24*N )
-      LIWMIN = MAX( 1, 10*N )
-      LWMIN = MAX( 1, 2*N )
+      IF( N.LE.1 ) THEN
+         LWMIN  = 1
+         LRWMIN = 1
+         LIWMIN = 1
+      ELSE
+         LWMIN  = 2*N
+         LRWMIN = 24*N
+         LIWMIN = 10*N
+      END IF
 *
       INFO = 0
       IF( .NOT.( WANTZ .OR. LSAME( JOBZ, 'N' ) ) ) THEN
@@ -454,7 +463,7 @@
          NB = ILAENV( 1, 'ZHETRD', UPLO, N, -1, -1, -1 )
          NB = MAX( NB, ILAENV( 1, 'ZUNMTR', UPLO, N, -1, -1, -1 ) )
          LWKOPT = MAX( ( NB+1 )*N, LWMIN )
-         WORK( 1 ) = LWKOPT
+         WORK( 1 )  = LWKOPT
          RWORK( 1 ) = LRWMIN
          IWORK( 1 ) = LIWMIN
 *
@@ -483,7 +492,7 @@
       END IF
 *
       IF( N.EQ.1 ) THEN
-         WORK( 1 ) = 2
+         WORK( 1 ) = 1
          IF( ALLEIG .OR. INDEIG ) THEN
             M = 1
             W( 1 ) = DBLE( A( 1, 1 ) )
@@ -710,7 +719,7 @@
 *
 *     Set WORK(1) to optimal workspace size.
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 )  = LWKOPT
       RWORK( 1 ) = LRWMIN
       IWORK( 1 ) = LIWMIN
 *
diff --git a/lapack-netlib/SRC/zheevr_2stage.f b/lapack-netlib/SRC/zheevr_2stage.f
index 5c576e633..b1cc7175f 100644
--- a/lapack-netlib/SRC/zheevr_2stage.f
+++ b/lapack-netlib/SRC/zheevr_2stage.f
@@ -265,7 +265,7 @@
 *>          indicating the nonzero elements in Z. The i-th eigenvector
 *>          is nonzero only in elements ISUPPZ( 2*i-1 ) through
 *>          ISUPPZ( 2*i ). This is an output of ZSTEMR (tridiagonal
-*>          matrix). The support of the eigenvectors of A is typically 
+*>          matrix). The support of the eigenvectors of A is typically
 *>          1:N because of the unitary transformations applied by ZUNMTR.
 *>          Implemented only for RANGE = 'A' or 'I' and IU - IL = N - 1
 *> \endverbatim
@@ -279,12 +279,13 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK.  
+*>          The dimension of the array WORK.
+*>          If N <= 1,               LWORK must be at least 1.
 *>          If JOBZ = 'N' and N > 1, LWORK must be queried.
 *>                                   LWORK = MAX(1, 26*N, dimension) where
 *>                                   dimension = max(stage1,stage2) + (KD+1)*N + N
-*>                                             = N*KD + N*max(KD+1,FACTOPTNB) 
-*>                                               + max(2*KD*KD, KD*NTHREADS) 
+*>                                             = N*KD + N*max(KD+1,FACTOPTNB)
+*>                                               + max(2*KD*KD, KD*NTHREADS)
 *>                                               + (KD+1)*N + N
 *>                                   where KD is the blocking size of the reduction,
 *>                                   FACTOPTNB is the blocking used by the QR or LQ
@@ -310,7 +311,8 @@
 *> \param[in] LRWORK
 *> \verbatim
 *>          LRWORK is INTEGER
-*>          The length of the array RWORK.  LRWORK >= max(1,24*N).
+*>          The length of the array RWORK.
+*>          If N <= 1, LRWORK >= 1, else LRWORK >= 24*N.
 *>
 *>          If LRWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal sizes of the WORK, RWORK
@@ -329,7 +331,8 @@
 *> \param[in] LIWORK
 *> \verbatim
 *>          LIWORK is INTEGER
-*>          The dimension of the array IWORK.  LIWORK >= max(1,10*N).
+*>          The dimension of the array IWORK.
+*>          If N <= 1, LIWORK >= 1, else LIWORK >= 10*N.
 *>
 *>          If LIWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal sizes of the WORK, RWORK
@@ -354,7 +357,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16HEeigen
+*> \ingroup heevr_2stage
 *
 *> \par Contributors:
 *  ==================
@@ -382,7 +385,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -390,11 +393,11 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *
@@ -472,9 +475,16 @@
       IB     = ILAENV2STAGE( 2, 'ZHETRD_2STAGE', JOBZ, N, KD, -1, -1 )
       LHTRD  = ILAENV2STAGE( 3, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
       LWTRD  = ILAENV2STAGE( 4, 'ZHETRD_2STAGE', JOBZ, N, KD, IB, -1 )
-      LWMIN  = N + LHTRD + LWTRD
-      LRWMIN = MAX( 1, 24*N )
-      LIWMIN = MAX( 1, 10*N )
+*
+      IF( N.LE.1 ) THEN
+         LWMIN  = 1
+         LRWMIN = 1
+         LIWMIN = 1
+      ELSE
+         LWMIN  = N + LHTRD + LWTRD
+         LRWMIN = 24*N
+         LIWMIN = 10*N
+      END IF
 *
       INFO = 0
       IF( .NOT.( LSAME( JOBZ, 'N' ) ) ) THEN
@@ -535,7 +545,7 @@
       END IF
 *
       IF( N.EQ.1 ) THEN
-         WORK( 1 ) = 2
+         WORK( 1 ) = 1
          IF( ALLEIG .OR. INDEIG ) THEN
             M = 1
             W( 1 ) = DBLE( A( 1, 1 ) )
@@ -643,9 +653,9 @@
 *
 *     Call ZHETRD_2STAGE to reduce Hermitian matrix to tridiagonal form.
 *
-      CALL ZHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ), 
+      CALL ZHETRD_2STAGE( JOBZ, UPLO, N, A, LDA, RWORK( INDRD ),
      $                    RWORK( INDRE ), WORK( INDTAU ),
-     $                    WORK( INDHOUS ), LHTRD, 
+     $                    WORK( INDHOUS ), LHTRD,
      $                    WORK( INDWK ), LLWORK, IINFO )
 *
 *     If all eigenvalues are desired
diff --git a/lapack-netlib/SRC/zhegv.f b/lapack-netlib/SRC/zhegv.f
index 41657e3be..c973bd0fc 100644
--- a/lapack-netlib/SRC/zhegv.f
+++ b/lapack-netlib/SRC/zhegv.f
@@ -160,7 +160,7 @@
 *>                    i off-diagonal elements of an intermediate
 *>                    tridiagonal form did not converge to zero;
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zhegv_2stage.f b/lapack-netlib/SRC/zhegv_2stage.f
index fda651e5e..91ac09311 100644
--- a/lapack-netlib/SRC/zhegv_2stage.f
+++ b/lapack-netlib/SRC/zhegv_2stage.f
@@ -179,7 +179,7 @@
 *>                    i off-diagonal elements of an intermediate
 *>                    tridiagonal form did not converge to zero;
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zhegvd.f b/lapack-netlib/SRC/zhegvd.f
index eeda656ad..c9ff55e3d 100644
--- a/lapack-netlib/SRC/zhegvd.f
+++ b/lapack-netlib/SRC/zhegvd.f
@@ -43,12 +43,6 @@
 *> B are assumed to be Hermitian and B is also positive definite.
 *> If eigenvectors are desired, it uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
@@ -212,7 +206,7 @@
 *>                    the submatrix lying in rows and columns INFO/(N+1)
 *>                    through mod(INFO,N+1);
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zhegvx.f b/lapack-netlib/SRC/zhegvx.f
index ac9f9ef1a..71ed1c4ca 100644
--- a/lapack-netlib/SRC/zhegvx.f
+++ b/lapack-netlib/SRC/zhegvx.f
@@ -280,7 +280,7 @@
 *>                    i eigenvectors failed to converge.  Their indices
 *>                    are stored in array IFAIL.
 *>             > N:   if INFO = N + i, for 1 <= i <= N, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zhesv_aa.f b/lapack-netlib/SRC/zhesv_aa.f
index df8498c7a..b3d4b3725 100644
--- a/lapack-netlib/SRC/zhesv_aa.f
+++ b/lapack-netlib/SRC/zhesv_aa.f
@@ -128,7 +128,7 @@
 *>          LWORK is INTEGER
 *>          The length of WORK.  LWORK >= MAX(1,2*N,3*N-2), and for best 
 *>          performance LWORK >= max(1,N*NB), where NB is the optimal
-*>          blocksize for ZHETRF.
+*>          blocksize for ZHETRF_AA.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -154,7 +154,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16HEsolve
+*> \ingroup hesv_aa
 *
 *  =====================================================================
       SUBROUTINE ZHESV_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK,
@@ -177,7 +177,7 @@
 *
 *     .. Local Scalars ..
       LOGICAL            LQUERY
-      INTEGER            LWKOPT, LWKOPT_HETRF, LWKOPT_HETRS
+      INTEGER            LWKMIN, LWKOPT, LWKOPT_HETRF, LWKOPT_HETRS
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -196,6 +196,7 @@
 *
       INFO = 0
       LQUERY = ( LWORK.EQ.-1 )
+      LWKMIN = MAX( 1, 2*N, 3*N-2 )
       IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
@@ -206,17 +207,17 @@
          INFO = -5
       ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
          INFO = -8
-      ELSE IF( LWORK.LT.MAX(2*N, 3*N-2) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -10
       END IF
 *
       IF( INFO.EQ.0 ) THEN
          CALL ZHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, -1, INFO )
-         LWKOPT_HETRF = INT( WORK(1) )
+         LWKOPT_HETRF = INT( WORK( 1 ) )
          CALL ZHETRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB, WORK,
      $                   -1, INFO )
-         LWKOPT_HETRS = INT( WORK(1) )
-         LWKOPT = MAX( LWKOPT_HETRF, LWKOPT_HETRS )
+         LWKOPT_HETRS = INT( WORK( 1 ) )
+         LWKOPT = MAX( LWKMIN, LWKOPT_HETRF, LWKOPT_HETRS )
          WORK( 1 ) = LWKOPT
       END IF
 *
diff --git a/lapack-netlib/SRC/zhesv_aa_2stage.f b/lapack-netlib/SRC/zhesv_aa_2stage.f
index ee1596855..c503b5554 100644
--- a/lapack-netlib/SRC/zhesv_aa_2stage.f
+++ b/lapack-netlib/SRC/zhesv_aa_2stage.f
@@ -88,7 +88,7 @@
 *>          triangular part of the matrix A, and the strictly upper
 *>          triangular part of A is not referenced.
 *>
-*>          On exit, L is stored below (or above) the subdiaonal blocks,
+*>          On exit, L is stored below (or above) the subdiagonal blocks,
 *>          when UPLO  is 'L' (or 'U').
 *> \endverbatim
 *>
@@ -100,14 +100,14 @@
 *>
 *> \param[out] TB
 *> \verbatim
-*>          TB is COMPLEX*16 array, dimension (LTB)
+*>          TB is COMPLEX*16 array, dimension (MAX(1,LTB)).
 *>          On exit, details of the LU factorization of the band matrix.
 *> \endverbatim
 *>
 *> \param[in] LTB
 *> \verbatim
 *>          LTB is INTEGER
-*>          The size of the array TB. LTB >= 4*N, internally
+*>          The size of the array TB. LTB >= MAX(1,4*N), internally
 *>          used to select NB such that LTB >= (3*NB+1)*N.
 *>
 *>          If LTB = -1, then a workspace query is assumed; the
@@ -147,14 +147,15 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX*16 workspace of size LWORK
+*>          WORK is COMPLEX*16 workspace of size (MAX(1,LWORK)).
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The size of WORK. LWORK >= N, internally used to select NB
-*>          such that LWORK >= N*NB.
+*>          The size of WORK. LWORK >= MAX(1,N), internally used to
+*>          select NB such that LWORK >= N*NB.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal size of the WORK array,
@@ -178,7 +179,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16HEsolve
+*> \ingroup hesv_aa_2stage
 *
 *  =====================================================================
       SUBROUTINE ZHESV_AA_2STAGE( UPLO, N, NRHS, A, LDA, TB, LTB,
@@ -208,7 +209,7 @@
 *
 *     .. Local Scalars ..
       LOGICAL            UPPER, TQUERY, WQUERY
-      INTEGER            LWKOPT
+      INTEGER            LWKOPT, LWKMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -229,6 +230,7 @@
       UPPER = LSAME( UPLO, 'U' )
       WQUERY = ( LWORK.EQ.-1 )
       TQUERY = ( LTB.EQ.-1 )
+      LWKMIN = MAX( 1, N )
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
@@ -237,18 +239,19 @@
          INFO = -3
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -5
-      ELSE IF( LTB.LT.( 4*N ) .AND. .NOT.TQUERY ) THEN
+      ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN
          INFO = -7
       ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
          INFO = -11
-      ELSE IF( LWORK.LT.N .AND. .NOT.WQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.WQUERY ) THEN
          INFO = -13
       END IF
 *
       IF( INFO.EQ.0 ) THEN
          CALL ZHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, -1, IPIV,
      $                          IPIV2, WORK, -1, INFO )
-         LWKOPT = INT( WORK(1) )
+         LWKOPT = MAX( LWKMIN, INT( WORK( 1 ) ) )
+         WORK( 1 ) = LWKOPT
       END IF
 *
       IF( INFO.NE.0 ) THEN
diff --git a/lapack-netlib/SRC/zhesvx.f b/lapack-netlib/SRC/zhesvx.f
index 485c81df6..64aa16674 100644
--- a/lapack-netlib/SRC/zhesvx.f
+++ b/lapack-netlib/SRC/zhesvx.f
@@ -234,8 +234,8 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK.  LWORK >= max(1,2*N), and for best
-*>          performance, when FACT = 'N', LWORK >= max(1,2*N,N*NB), where
+*>          The length of WORK.  LWORK >= MAX(1,2*N), and for best
+*>          performance, when FACT = 'N', LWORK >= MAX(1,2*N,N*NB), where
 *>          NB is the optimal blocksize for ZHETRF.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
@@ -276,7 +276,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16HEsolve
+*> \ingroup hesvx
 *
 *  =====================================================================
       SUBROUTINE ZHESVX( FACT, UPLO, N, NRHS, A, LDA, AF, LDAF, IPIV, B,
@@ -307,7 +307,7 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY, NOFACT
-      INTEGER            LWKOPT, NB
+      INTEGER            LWKOPT, LWKMIN, NB
       DOUBLE PRECISION   ANORM
 *     ..
 *     .. External Functions ..
@@ -329,6 +329,7 @@
       INFO = 0
       NOFACT = LSAME( FACT, 'N' )
       LQUERY = ( LWORK.EQ.-1 )
+      LWKMIN = MAX( 1, 2*N )
       IF( .NOT.NOFACT .AND. .NOT.LSAME( FACT, 'F' ) ) THEN
          INFO = -1
       ELSE IF( .NOT.LSAME( UPLO, 'U' ) .AND. .NOT.LSAME( UPLO, 'L' ) )
@@ -346,12 +347,12 @@
          INFO = -11
       ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
          INFO = -13
-      ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -18
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         LWKOPT = MAX( 1, 2*N )
+         LWKOPT = LWKMIN
          IF( NOFACT ) THEN
             NB = ILAENV( 1, 'ZHETRF', UPLO, N, -1, -1, -1 )
             LWKOPT = MAX( LWKOPT, N*NB )
diff --git a/lapack-netlib/SRC/zhetf2_rk.f b/lapack-netlib/SRC/zhetf2_rk.f
index 050c7993d..87df901aa 100644
--- a/lapack-netlib/SRC/zhetf2_rk.f
+++ b/lapack-netlib/SRC/zhetf2_rk.f
@@ -480,7 +480,7 @@
                   A( J, K ) = DCONJG( A( P, J ) )
                   A( P, J ) = T
    14          CONTINUE
-*              (3) Swap and conjugate corner elements at row-col interserction
+*              (3) Swap and conjugate corner elements at row-col intersection
                A( P, K ) = DCONJG( A( P, K ) )
 *              (4) Swap diagonal elements at row-col intersection
                R1 = DBLE( A( K, K ) )
@@ -508,7 +508,7 @@
                   A( J, KK ) = DCONJG( A( KP, J ) )
                   A( KP, J ) = T
    15          CONTINUE
-*              (3) Swap and conjugate corner elements at row-col interserction
+*              (3) Swap and conjugate corner elements at row-col intersection
                A( KP, KK ) = DCONJG( A( KP, KK ) )
 *              (4) Swap diagonal elements at row-col intersection
                R1 = DBLE( A( KK, KK ) )
@@ -834,7 +834,7 @@
                   A( J, K ) = DCONJG( A( P, J ) )
                   A( P, J ) = T
    44          CONTINUE
-*              (3) Swap and conjugate corner elements at row-col interserction
+*              (3) Swap and conjugate corner elements at row-col intersection
                A( P, K ) = DCONJG( A( P, K ) )
 *              (4) Swap diagonal elements at row-col intersection
                R1 = DBLE( A( K, K ) )
@@ -862,7 +862,7 @@
                   A( J, KK ) = DCONJG( A( KP, J ) )
                   A( KP, J ) = T
    45          CONTINUE
-*              (3) Swap and conjugate corner elements at row-col interserction
+*              (3) Swap and conjugate corner elements at row-col intersection
                A( KP, KK ) = DCONJG( A( KP, KK ) )
 *              (4) Swap diagonal elements at row-col intersection
                R1 = DBLE( A( KK, KK ) )
diff --git a/lapack-netlib/SRC/zhetf2_rook.f b/lapack-netlib/SRC/zhetf2_rook.f
index 94bb29736..91172f601 100644
--- a/lapack-netlib/SRC/zhetf2_rook.f
+++ b/lapack-netlib/SRC/zhetf2_rook.f
@@ -420,7 +420,7 @@
                   A( J, K ) = DCONJG( A( P, J ) )
                   A( P, J ) = T
    14          CONTINUE
-*              (3) Swap and conjugate corner elements at row-col interserction
+*              (3) Swap and conjugate corner elements at row-col intersection
                A( P, K ) = DCONJG( A( P, K ) )
 *              (4) Swap diagonal elements at row-col intersection
                R1 = DBLE( A( K, K ) )
@@ -441,7 +441,7 @@
                   A( J, KK ) = DCONJG( A( KP, J ) )
                   A( KP, J ) = T
    15          CONTINUE
-*              (3) Swap and conjugate corner elements at row-col interserction
+*              (3) Swap and conjugate corner elements at row-col intersection
                A( KP, KK ) = DCONJG( A( KP, KK ) )
 *              (4) Swap diagonal elements at row-col intersection
                R1 = DBLE( A( KK, KK ) )
@@ -733,7 +733,7 @@
                   A( J, K ) = DCONJG( A( P, J ) )
                   A( P, J ) = T
    44          CONTINUE
-*              (3) Swap and conjugate corner elements at row-col interserction
+*              (3) Swap and conjugate corner elements at row-col intersection
                A( P, K ) = DCONJG( A( P, K ) )
 *              (4) Swap diagonal elements at row-col intersection
                R1 = DBLE( A( K, K ) )
@@ -754,7 +754,7 @@
                   A( J, KK ) = DCONJG( A( KP, J ) )
                   A( KP, J ) = T
    45          CONTINUE
-*              (3) Swap and conjugate corner elements at row-col interserction
+*              (3) Swap and conjugate corner elements at row-col intersection
                A( KP, KK ) = DCONJG( A( KP, KK ) )
 *              (4) Swap diagonal elements at row-col intersection
                R1 = DBLE( A( KK, KK ) )
diff --git a/lapack-netlib/SRC/zhetrd_2stage.f b/lapack-netlib/SRC/zhetrd_2stage.f
index b9d2f0eb1..ab444894b 100644
--- a/lapack-netlib/SRC/zhetrd_2stage.f
+++ b/lapack-netlib/SRC/zhetrd_2stage.f
@@ -4,23 +4,23 @@
 *
 *  =========== DOCUMENTATION ===========
 *
-* Online html documentation available at 
-*            http://www.netlib.org/lapack/explore-html/ 
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
-*> Download ZHETRD_2STAGE + dependencies 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zhetrd_2stage.f"> 
-*> [TGZ]</a> 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zhetrd_2stage.f"> 
-*> [ZIP]</a> 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zhetrd_2stage.f"> 
+*> Download ZHETRD_2STAGE + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zhetrd_2stage.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zhetrd_2stage.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zhetrd_2stage.f">
 *> [TXT]</a>
-*> \endhtmlonly 
+*> \endhtmlonly
 *
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, 
+*       SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU,
 *                                 HOUS2, LHOUS2, WORK, LWORK, INFO )
 *
 *       IMPLICIT NONE
@@ -34,7 +34,7 @@
 *       COMPLEX*16         A( LDA, * ), TAU( * ),
 *                          HOUS2( * ), WORK( * )
 *       ..
-*  
+*
 *
 *> \par Purpose:
 *  =============
@@ -52,11 +52,11 @@
 *> \param[in] VECT
 *> \verbatim
 *>          VECT is CHARACTER*1
-*>          = 'N':  No need for the Housholder representation, 
+*>          = 'N':  No need for the Householder representation,
 *>                  in particular for the second stage (Band to
 *>                  tridiagonal) and thus LHOUS2 is of size max(1, 4*N);
-*>          = 'V':  the Householder representation is needed to 
-*>                  either generate Q1 Q2 or to apply Q1 Q2, 
+*>          = 'V':  the Householder representation is needed to
+*>                  either generate Q1 Q2 or to apply Q1 Q2,
 *>                  then LHOUS2 is to be queried and computed.
 *>                  (NOT AVAILABLE IN THIS RELEASE).
 *> \endverbatim
@@ -86,7 +86,7 @@
 *>          triangular part of A is not referenced.
 *>          On exit, if UPLO = 'U', the band superdiagonal
 *>          of A are overwritten by the corresponding elements of the
-*>          internal band-diagonal matrix AB, and the elements above 
+*>          internal band-diagonal matrix AB, and the elements above
 *>          the KD superdiagonal, with the array TAU, represent the unitary
 *>          matrix Q1 as a product of elementary reflectors; if UPLO
 *>          = 'L', the diagonal and band subdiagonal of A are over-
@@ -117,13 +117,13 @@
 *> \param[out] TAU
 *> \verbatim
 *>          TAU is COMPLEX*16 array, dimension (N-KD)
-*>          The scalar factors of the elementary reflectors of 
+*>          The scalar factors of the elementary reflectors of
 *>          the first stage (see Further Details).
 *> \endverbatim
 *>
 *> \param[out] HOUS2
 *> \verbatim
-*>          HOUS2 is COMPLEX*16 array, dimension (LHOUS2)
+*>          HOUS2 is COMPLEX*16 array, dimension (MAX(1,LHOUS2))
 *>          Stores the Householder representation of the stage2
 *>          band to tridiagonal.
 *> \endverbatim
@@ -132,6 +132,8 @@
 *> \verbatim
 *>          LHOUS2 is INTEGER
 *>          The dimension of the array HOUS2.
+*>          LHOUS2 >= 1.
+*>
 *>          If LWORK = -1, or LHOUS2 = -1,
 *>          then a query is assumed; the routine
 *>          only calculates the optimal size of the HOUS2 array, returns
@@ -143,23 +145,26 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX*16 array, dimension (LWORK)
+*>          WORK is COMPLEX*16 array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK. LWORK = MAX(1, dimension)
-*>          If LWORK = -1, or LHOUS2=-1,
+*>          The dimension of the array WORK.
+*>          If N = 0, LWORK >= 1, else LWORK = MAX(1, dimension).
+*>
+*>          If LWORK = -1, or LHOUS2 = -1,
 *>          then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
 *>          LWORK = MAX(1, dimension) where
 *>          dimension   = max(stage1,stage2) + (KD+1)*N
-*>                      = N*KD + N*max(KD+1,FACTOPTNB) 
-*>                        + max(2*KD*KD, KD*NTHREADS) 
-*>                        + (KD+1)*N 
+*>                      = N*KD + N*max(KD+1,FACTOPTNB)
+*>                        + max(2*KD*KD, KD*NTHREADS)
+*>                        + (KD+1)*N
 *>          where KD is the blocking size of the reduction,
 *>          FACTOPTNB is the blocking used by the QR or LQ
 *>          algorithm, usually FACTOPTNB=128 is a good choice
@@ -177,12 +182,12 @@
 *  Authors:
 *  ========
 *
-*> \author Univ. of Tennessee 
-*> \author Univ. of California Berkeley 
-*> \author Univ. of Colorado Denver 
-*> \author NAG Ltd. 
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
 *
-*> \ingroup complex16HEcomputational
+*> \ingroup hetrd_2stage
 *
 *> \par Further Details:
 *  =====================
@@ -202,7 +207,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -210,16 +215,16 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *>
 *  =====================================================================
-      SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU, 
+      SUBROUTINE ZHETRD_2STAGE( VECT, UPLO, N, A, LDA, D, E, TAU,
      $                          HOUS2, LHOUS2, WORK, LWORK, INFO )
 *
       IMPLICIT NONE
@@ -265,10 +270,13 @@
 *
       KD     = ILAENV2STAGE( 1, 'ZHETRD_2STAGE', VECT, N, -1, -1, -1 )
       IB     = ILAENV2STAGE( 2, 'ZHETRD_2STAGE', VECT, N, KD, -1, -1 )
-      LHMIN  = ILAENV2STAGE( 3, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 )
-      LWMIN  = ILAENV2STAGE( 4, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 )
-*      WRITE(*,*),'ZHETRD_2STAGE N KD UPLO LHMIN LWMIN ',N, KD, UPLO,
-*     $            LHMIN, LWMIN
+      IF( N.EQ.0 ) THEN
+         LHMIN = 1
+         LWMIN = 1
+      ELSE
+         LHMIN = ILAENV2STAGE( 3, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 )
+         LWMIN = ILAENV2STAGE( 4, 'ZHETRD_2STAGE', VECT, N, KD, IB, -1 )
+      END IF
 *
       IF( .NOT.LSAME( VECT, 'N' ) ) THEN
          INFO = -1
@@ -309,14 +317,14 @@
       LWRK  = LWORK-LDAB*N
       ABPOS = 1
       WPOS  = ABPOS + LDAB*N
-      CALL ZHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB, 
+      CALL ZHETRD_HE2HB( UPLO, N, KD, A, LDA, WORK( ABPOS ), LDAB,
      $                   TAU, WORK( WPOS ), LWRK, INFO )
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'ZHETRD_HE2HB', -INFO )
          RETURN
       END IF
-      CALL ZHETRD_HB2ST( 'Y', VECT, UPLO, N, KD, 
-     $                   WORK( ABPOS ), LDAB, D, E, 
+      CALL ZHETRD_HB2ST( 'Y', VECT, UPLO, N, KD,
+     $                   WORK( ABPOS ), LDAB, D, E,
      $                   HOUS2, LHOUS2, WORK( WPOS ), LWRK, INFO )
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'ZHETRD_HB2ST', -INFO )
@@ -324,7 +332,6 @@
       END IF
 *
 *
-      HOUS2( 1 ) = LHMIN
       WORK( 1 )  = LWMIN
       RETURN
 *
diff --git a/lapack-netlib/SRC/zhetrd_hb2st.F b/lapack-netlib/SRC/zhetrd_hb2st.F
index e839271a4..247497ab6 100644
--- a/lapack-netlib/SRC/zhetrd_hb2st.F
+++ b/lapack-netlib/SRC/zhetrd_hb2st.F
@@ -18,7 +18,7 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, 
+*       SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB,
 *                               D, E, HOUS, LHOUS, WORK, LWORK, INFO )
 *
 *       #if defined(_OPENMP)
@@ -53,12 +53,12 @@
 *> \param[in] STAGE1
 *> \verbatim
 *>          STAGE1 is CHARACTER*1
-*>          = 'N':  "No": to mention that the stage 1 of the reduction  
+*>          = 'N':  "No": to mention that the stage 1 of the reduction
 *>                  from dense to band using the zhetrd_he2hb routine
-*>                  was not called before this routine to reproduce AB. 
-*>                  In other term this routine is called as standalone. 
-*>          = 'Y':  "Yes": to mention that the stage 1 of the 
-*>                  reduction from dense to band using the zhetrd_he2hb 
+*>                  was not called before this routine to reproduce AB.
+*>                  In other words, this routine is called standalone.
+*>          = 'Y':  "Yes": to mention that the stage 1 of the
+*>                  reduction from dense to band using the zhetrd_he2hb
 *>                  routine has been called to produce AB (e.g., AB is
 *>                  the output of zhetrd_he2hb.
 *> \endverbatim
@@ -66,10 +66,10 @@
 *> \param[in] VECT
 *> \verbatim
 *>          VECT is CHARACTER*1
-*>          = 'N':  No need for the Housholder representation, 
+*>          = 'N':  No need for the Householder representation,
 *>                  and thus LHOUS is of size max(1, 4*N);
-*>          = 'V':  the Householder representation is needed to 
-*>                  either generate or to apply Q later on, 
+*>          = 'V':  the Householder representation is needed to
+*>                  either generate or to apply Q later on,
 *>                  then LHOUS is to be queried and computed.
 *>                  (NOT AVAILABLE IN THIS RELEASE).
 *> \endverbatim
@@ -132,34 +132,39 @@
 *>
 *> \param[out] HOUS
 *> \verbatim
-*>          HOUS is COMPLEX*16 array, dimension LHOUS, that
-*>          store the Householder representation.
+*>          HOUS is COMPLEX*16 array, dimension (MAX(1,LHOUS))
+*>          Stores the Householder representation.
 *> \endverbatim
 *>
 *> \param[in] LHOUS
 *> \verbatim
 *>          LHOUS is INTEGER
-*>          The dimension of the array HOUS. LHOUS = MAX(1, dimension)
-*>          If LWORK = -1, or LHOUS=-1,
+*>          The dimension of the array HOUS.
+*>          If N = 0 or KD <= 1, LHOUS >= 1, else LHOUS = MAX(1, dimension).
+*>
+*>          If LWORK = -1, or LHOUS = -1,
 *>          then a query is assumed; the routine
 *>          only calculates the optimal size of the HOUS array, returns
 *>          this value as the first entry of the HOUS array, and no error
 *>          message related to LHOUS is issued by XERBLA.
 *>          LHOUS = MAX(1, dimension) where
 *>          dimension = 4*N if VECT='N'
-*>          not available now if VECT='H'     
+*>          not available now if VECT='V'
 *> \endverbatim
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX*16 array, dimension LWORK.
+*>          WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)).
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK. LWORK = MAX(1, dimension)
-*>          If LWORK = -1, or LHOUS=-1,
+*>          The dimension of the array WORK.
+*>          If N = 0 or KD <= 1, LWORK >= 1, else LWORK = MAX(1, dimension).
+*>
+*>          If LWORK = -1, or LHOUS = -1,
 *>          then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
@@ -188,7 +193,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16OTHERcomputational
+*> \ingroup hetrd_hb2st
 *
 *> \par Further Details:
 *  =====================
@@ -208,7 +213,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -216,16 +221,16 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *>
 *  =====================================================================
-      SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB, 
+      SUBROUTINE ZHETRD_HB2ST( STAGE1, VECT, UPLO, N, KD, AB, LDAB,
      $                         D, E, HOUS, LHOUS, WORK, LWORK, INFO )
 *
 *
@@ -259,11 +264,11 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY, WANTQ, UPPER, AFTERS1
-      INTEGER            I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST, 
+      INTEGER            I, M, K, IB, SWEEPID, MYID, SHIFT, STT, ST,
      $                   ED, STIND, EDIND, BLKLASTIND, COLPT, THED,
      $                   STEPERCOL, GRSIZ, THGRSIZ, THGRNB, THGRID,
-     $                   NBTILES, TTYPE, TID, NTHREADS, DEBUG,
-     $                   ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS, 
+     $                   NBTILES, TTYPE, TID, NTHREADS,
+     $                   ABDPOS, ABOFDPOS, DPOS, OFDPOS, AWPOS,
      $                   INDA, INDW, APOS, SIZEA, LDA, INDV, INDTAU,
      $                   SIZEV, SIZETAU, LDV, LHMIN, LWMIN
       DOUBLE PRECISION   ABSTMP
@@ -277,7 +282,7 @@
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
-      INTEGER            ILAENV2STAGE 
+      INTEGER            ILAENV2STAGE
       EXTERNAL           LSAME, ILAENV2STAGE
 *     ..
 *     .. Executable Statements ..
@@ -285,7 +290,6 @@
 *     Determine the minimal workspace size required.
 *     Test the input parameters
 *
-      DEBUG   = 0
       INFO    = 0
       AFTERS1 = LSAME( STAGE1, 'Y' )
       WANTQ   = LSAME( VECT, 'V' )
@@ -294,9 +298,14 @@
 *
 *     Determine the block size, the workspace size and the hous size.
 *
-      IB     = ILAENV2STAGE( 2, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 )
-      LHMIN  = ILAENV2STAGE( 3, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
-      LWMIN  = ILAENV2STAGE( 4, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
+      IB       = ILAENV2STAGE( 2, 'ZHETRD_HB2ST', VECT, N, KD, -1, -1 )
+      IF( N.EQ.0 .OR. KD.LE.1 ) THEN
+         LHMIN = 1
+         LWMIN = 1
+      ELSE
+         LHMIN = ILAENV2STAGE( 3, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
+         LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HB2ST', VECT, N, KD, IB, -1 )
+      END IF
 *
       IF( .NOT.AFTERS1 .AND. .NOT.LSAME( STAGE1, 'N' ) ) THEN
          INFO = -1
@@ -358,7 +367,7 @@
           ABDPOS   = KD + 1
           ABOFDPOS = KD
       ELSE
-          APOS     = INDA 
+          APOS     = INDA
           AWPOS    = INDA + KD + 1
           DPOS     = APOS
           OFDPOS   = DPOS + 1
@@ -366,11 +375,11 @@
           ABOFDPOS = 2
 
       ENDIF
-*      
-*     Case KD=0: 
-*     The matrix is diagonal. We just copy it (convert to "real" for 
-*     complex because D is double and the imaginary part should be 0) 
-*     and store it in D. A sequential code here is better or 
+*
+*     Case KD=0:
+*     The matrix is diagonal. We just copy it (convert to "real" for
+*     complex because D is double and the imaginary part should be 0)
+*     and store it in D. A sequential code here is better or
 *     in a parallel environment it might need two cores for D and E
 *
       IF( KD.EQ.0 ) THEN
@@ -385,17 +394,17 @@
           WORK( 1 ) = 1
           RETURN
       END IF
-*      
-*     Case KD=1: 
-*     The matrix is already Tridiagonal. We have to make diagonal 
+*
+*     Case KD=1:
+*     The matrix is already Tridiagonal. We have to make diagonal
 *     and offdiagonal elements real, and store them in D and E.
-*     For that, for real precision just copy the diag and offdiag 
-*     to D and E while for the COMPLEX case the bulge chasing is  
-*     performed to convert the hermetian tridiagonal to symmetric 
-*     tridiagonal. A simpler conversion formula might be used, but then 
+*     For that, for real precision just copy the diag and offdiag
+*     to D and E while for the COMPLEX case the bulge chasing is
+*     performed to convert the hermitian tridiagonal to symmetric
+*     tridiagonal. A simpler conversion formula might be used, but then
 *     updating the Q matrix will be required and based if Q is generated
-*     or not this might complicate the story. 
-*      
+*     or not this might complicate the story.
+*
       IF( KD.EQ.1 ) THEN
           DO 50 I = 1, N
               D( I ) = DBLE( AB( ABDPOS, I ) )
@@ -444,7 +453,7 @@ C                 END IF
           RETURN
       END IF
 *
-*     Main code start here. 
+*     Main code starts here.
 *     Reduce the hermitian band of A to a tridiagonal matrix.
 *
       THGRSIZ   = N
@@ -453,7 +462,7 @@ C                 END IF
       NBTILES   = CEILING( REAL(N)/REAL(KD) )
       STEPERCOL = CEILING( REAL(SHIFT)/REAL(GRSIZ) )
       THGRNB    = CEILING( REAL(N-1)/REAL(THGRSIZ) )
-*      
+*
       CALL ZLACPY( "A", KD+1, N, AB, LDAB, WORK( APOS ), LDA )
       CALL ZLASET( "A", KD,   N, ZERO, ZERO, WORK( AWPOS ), LDA )
 *
@@ -462,7 +471,7 @@ C                 END IF
 *
 #if defined(_OPENMP)
 !$OMP PARALLEL PRIVATE( TID, THGRID, BLKLASTIND )
-!$OMP$         PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID ) 
+!$OMP$         PRIVATE( THED, I, M, K, ST, ED, STT, SWEEPID )
 !$OMP$         PRIVATE( MYID, TTYPE, COLPT, STIND, EDIND )
 !$OMP$         SHARED ( UPLO, WANTQ, INDV, INDTAU, HOUS, WORK)
 !$OMP$         SHARED ( N, KD, IB, NBTILES, LDA, LDV, INDA )
@@ -471,7 +480,7 @@ C                 END IF
 #endif
 *
 *     main bulge chasing loop
-*      
+*
       DO 100 THGRID = 1, THGRNB
           STT  = (THGRID-1)*THGRSIZ+1
           THED = MIN( (STT + THGRSIZ -1), (N-1))
@@ -482,7 +491,7 @@ C                 END IF
                   ST = STT
                   DO 130 SWEEPID = ST, ED
                       DO 140 K = 1, GRSIZ
-                          MYID  = (I-SWEEPID)*(STEPERCOL*GRSIZ) 
+                          MYID  = (I-SWEEPID)*(STEPERCOL*GRSIZ)
      $                           + (M-1)*GRSIZ + K
                           IF ( MYID.EQ.1 ) THEN
                               TTYPE = 1
@@ -508,17 +517,17 @@ C                 END IF
                           ENDIF
 *
 *                         Call the kernel
-*                             
+*
 #if defined(_OPENMP) &&  _OPENMP >= 201307
 
-                          IF( TTYPE.NE.1 ) THEN      
+                          IF( TTYPE.NE.1 ) THEN
 !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
 !$OMP$     DEPEND(in:WORK(MYID-1))
 !$OMP$     DEPEND(out:WORK(MYID))
                               TID      = OMP_GET_THREAD_NUM()
-                              CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+                              CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE,
      $                             STIND, EDIND, SWEEPID, N, KD, IB,
-     $                             WORK ( INDA ), LDA, 
+     $                             WORK ( INDA ), LDA,
      $                             HOUS( INDV ), HOUS( INDTAU ), LDV,
      $                             WORK( INDW + TID*KD ) )
 !$OMP END TASK
@@ -526,20 +535,20 @@ C                 END IF
 !$OMP TASK DEPEND(in:WORK(MYID+SHIFT-1))
 !$OMP$     DEPEND(out:WORK(MYID))
                               TID      = OMP_GET_THREAD_NUM()
-                              CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+                              CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE,
      $                             STIND, EDIND, SWEEPID, N, KD, IB,
-     $                             WORK ( INDA ), LDA, 
+     $                             WORK ( INDA ), LDA,
      $                             HOUS( INDV ), HOUS( INDTAU ), LDV,
      $                             WORK( INDW + TID*KD ) )
 !$OMP END TASK
                           ENDIF
 #else
-                          CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE, 
+                          CALL ZHB2ST_KERNELS( UPLO, WANTQ, TTYPE,
      $                         STIND, EDIND, SWEEPID, N, KD, IB,
-     $                         WORK ( INDA ), LDA, 
+     $                         WORK ( INDA ), LDA,
      $                         HOUS( INDV ), HOUS( INDTAU ), LDV,
-     $                         WORK( INDW + TID*KD ) )
-#endif 
+     $                         WORK( INDW ) )
+#endif
                           IF ( BLKLASTIND.GE.(N-1) ) THEN
                               STT = STT + 1
                               EXIT
@@ -554,14 +563,14 @@ C                 END IF
 !$OMP END MASTER
 !$OMP END PARALLEL
 #endif
-*      
+*
 *     Copy the diagonal from A to D. Note that D is REAL thus only
 *     the Real part is needed, the imaginary part should be zero.
 *
       DO 150 I = 1, N
           D( I ) = DBLE( WORK( DPOS+(I-1)*LDA ) )
   150 CONTINUE
-*      
+*
 *     Copy the off diagonal from A to E. Note that E is REAL thus only
 *     the Real part is needed, the imaginary part should be zero.
 *
@@ -575,11 +584,10 @@ C                 END IF
   170     CONTINUE
       ENDIF
 *
-      HOUS( 1 ) = LHMIN
       WORK( 1 ) = LWMIN
       RETURN
 *
 *     End of ZHETRD_HB2ST
 *
       END
-      
+
diff --git a/lapack-netlib/SRC/zhetrd_he2hb.f b/lapack-netlib/SRC/zhetrd_he2hb.f
index e1b2e1794..3e3bfa374 100644
--- a/lapack-netlib/SRC/zhetrd_he2hb.f
+++ b/lapack-netlib/SRC/zhetrd_he2hb.f
@@ -123,8 +123,8 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX*16 array, dimension (LWORK)
-*>          On exit, if INFO = 0, or if LWORK=-1, 
+*>          WORK is COMPLEX*16 array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, or if LWORK = -1,
 *>          WORK(1) returns the size of LWORK.
 *> \endverbatim
 *>
@@ -132,7 +132,9 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK which should be calculated
-*>          by a workspace query. LWORK = MAX(1, LWORK_QUERY)
+*>          by a workspace query.
+*>          If N <= KD+1, LWORK >= 1, else LWORK = MAX(1, LWORK_QUERY).
+*>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
@@ -158,7 +160,7 @@
 *> \author Univ. of Colorado Denver 
 *> \author NAG Ltd. 
 *
-*> \ingroup complex16HEcomputational
+*> \ingroup hetrd_he2hb
 *
 *> \par Further Details:
 *  =====================
@@ -293,8 +295,12 @@
       INFO   = 0
       UPPER  = LSAME( UPLO, 'U' )
       LQUERY = ( LWORK.EQ.-1 )
-      LWMIN  = ILAENV2STAGE( 4, 'ZHETRD_HE2HB', '', N, KD, -1, -1 )
-      
+      IF( N.LE.KD+1 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = ILAENV2STAGE( 4, 'ZHETRD_HE2HB', '', N, KD, -1, -1 )
+      END IF
+*
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
diff --git a/lapack-netlib/SRC/zhetrf.f b/lapack-netlib/SRC/zhetrf.f
index 78d4f71b8..433887108 100644
--- a/lapack-netlib/SRC/zhetrf.f
+++ b/lapack-netlib/SRC/zhetrf.f
@@ -107,7 +107,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK.  LWORK >=1.  For best performance
+*>          The length of WORK. LWORK >= 1. For best performance
 *>          LWORK >= N*NB, where NB is the block size returned by ILAENV.
 *> \endverbatim
 *>
@@ -130,7 +130,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16HEcomputational
+*> \ingroup hetrf
 *
 *> \par Further Details:
 *  =====================
@@ -227,7 +227,7 @@
 *        Determine the block size
 *
          NB = ILAENV( 1, 'ZHETRF', UPLO, N, -1, -1, -1 )
-         LWKOPT = N*NB
+         LWKOPT = MAX( 1, N*NB )
          WORK( 1 ) = LWKOPT
       END IF
 *
@@ -346,6 +346,7 @@
       END IF
 *
    40 CONTINUE
+*
       WORK( 1 ) = LWKOPT
       RETURN
 *
diff --git a/lapack-netlib/SRC/zhetrf_aa.f b/lapack-netlib/SRC/zhetrf_aa.f
index 56722e7e6..381c87d51 100644
--- a/lapack-netlib/SRC/zhetrf_aa.f
+++ b/lapack-netlib/SRC/zhetrf_aa.f
@@ -74,7 +74,7 @@
 *>
 *>          On exit, the tridiagonal matrix is stored in the diagonals
 *>          and the subdiagonals of A just below (or above) the diagonals,
-*>          and L is stored below (or above) the subdiaonals, when UPLO
+*>          and L is stored below (or above) the subdiagonals, when UPLO
 *>          is 'L' (or 'U').
 *> \endverbatim
 *>
@@ -101,8 +101,10 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK. LWORK >= MAX(1,2*N). For optimum performance
-*>          LWORK >= N*(1+NB), where NB is the optimal blocksize.
+*>          The length of WORK.
+*>          LWORK >= 1, if N <= 1, and LWORK >= 2*N, otherwise.
+*>          For optimum performance LWORK >= N*(1+NB), where NB is
+*>          the optimal blocksize, returned by ILAENV.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
@@ -125,10 +127,10 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16HEcomputational
+*> \ingroup hetrf_aa
 *
 *  =====================================================================
-      SUBROUTINE ZHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO)
+      SUBROUTINE ZHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
@@ -152,7 +154,7 @@
 *
 *     .. Local Scalars ..
       LOGICAL      LQUERY, UPPER
-      INTEGER      J, LWKOPT
+      INTEGER      J, LWKMIN, LWKOPT
       INTEGER      NB, MJ, NJ, K1, K2, J1, J2, J3, JB
       COMPLEX*16   ALPHA
 *     ..
@@ -178,18 +180,25 @@
       INFO = 0
       UPPER = LSAME( UPLO, 'U' )
       LQUERY = ( LWORK.EQ.-1 )
+      IF( N.LE.1 ) THEN
+         LWKMIN = 1
+         LWKOPT = 1
+      ELSE
+         LWKMIN = 2*N
+         LWKOPT = (NB+1)*N
+      END IF
+*
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF( LWORK.LT.MAX( 1, 2*N ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -7
       END IF
 *
       IF( INFO.EQ.0 ) THEN
-         LWKOPT = (NB+1)*N
          WORK( 1 ) = LWKOPT
       END IF
 *
@@ -202,11 +211,11 @@
 *
 *     Quick return
 *
-      IF ( N.EQ.0 ) THEN
+      IF( N.EQ.0 ) THEN
           RETURN
       ENDIF
       IPIV( 1 ) = 1
-      IF ( N.EQ.1 ) THEN
+      IF( N.EQ.1 ) THEN
          A( 1, 1 ) = DBLE( A( 1, 1 ) )
          RETURN
       END IF
diff --git a/lapack-netlib/SRC/zhetrf_aa_2stage.f b/lapack-netlib/SRC/zhetrf_aa_2stage.f
index 92f1c09b3..bab13a99d 100644
--- a/lapack-netlib/SRC/zhetrf_aa_2stage.f
+++ b/lapack-netlib/SRC/zhetrf_aa_2stage.f
@@ -75,7 +75,7 @@
 *>          triangular part of the matrix A, and the strictly upper
 *>          triangular part of A is not referenced.
 *>
-*>          On exit, L is stored below (or above) the subdiaonal blocks,
+*>          On exit, L is stored below (or above) the subdiagonal blocks,
 *>          when UPLO  is 'L' (or 'U').
 *> \endverbatim
 *>
@@ -87,14 +87,14 @@
 *>
 *> \param[out] TB
 *> \verbatim
-*>          TB is COMPLEX*16 array, dimension (LTB)
+*>          TB is COMPLEX*16 array, dimension (MAX(1,LTB))
 *>          On exit, details of the LU factorization of the band matrix.
 *> \endverbatim
 *>
 *> \param[in] LTB
 *> \verbatim
 *>          LTB is INTEGER
-*>          The size of the array TB. LTB >= 4*N, internally
+*>          The size of the array TB. LTB >= MAX(1,4*N), internally
 *>          used to select NB such that LTB >= (3*NB+1)*N.
 *>
 *>          If LTB = -1, then a workspace query is assumed; the
@@ -121,14 +121,14 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX*16 workspace of size LWORK
+*>          WORK is COMPLEX*16 workspace of size (MAX(1,LWORK))
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The size of WORK. LWORK >= N, internally used to select NB
-*>          such that LWORK >= N*NB.
+*>          The size of WORK. LWORK >= MAX(1,N), internally used to
+*>          select NB such that LWORK >= N*NB.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the
 *>          routine only calculates the optimal size of the WORK array,
@@ -152,7 +152,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16SYcomputational
+*> \ingroup hetrf_aa_2stage
 *
 *  =====================================================================
       SUBROUTINE ZHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV,
@@ -182,7 +182,7 @@
 *     .. Local Scalars ..
       LOGICAL            UPPER, TQUERY, WQUERY
       INTEGER            I, J, K, I1, I2, TD
-      INTEGER            LDTB, NB, KB, JB, NT, IINFO
+      INTEGER            LWKOPT, LDTB, NB, KB, JB, NT, IINFO
       COMPLEX*16         PIV
 *     ..
 *     .. External Functions ..
@@ -212,9 +212,9 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF ( LTB .LT. 4*N .AND. .NOT.TQUERY ) THEN
+      ELSE IF( LTB.LT.MAX( 1, 4*N ) .AND. .NOT.TQUERY ) THEN
          INFO = -6
-      ELSE IF ( LWORK .LT. N .AND. .NOT.WQUERY ) THEN
+      ELSE IF( LWORK.LT.MAX( 1, N ) .AND. .NOT.WQUERY ) THEN
          INFO = -10
       END IF
 *
@@ -228,10 +228,10 @@
       NB = ILAENV( 1, 'ZHETRF_AA_2STAGE', UPLO, N, -1, -1, -1 )
       IF( INFO.EQ.0 ) THEN
          IF( TQUERY ) THEN
-            TB( 1 ) = (3*NB+1)*N
+            TB( 1 ) = MAX( 1, (3*NB+1)*N )
          END IF
          IF( WQUERY ) THEN
-            WORK( 1 ) = N*NB
+            WORK( 1 ) = MAX( 1, N*NB )
          END IF
       END IF
       IF( TQUERY .OR. WQUERY ) THEN
@@ -240,7 +240,7 @@
 *
 *     Quick return
 *
-      IF ( N.EQ.0 ) THEN
+      IF( N.EQ.0 ) THEN
          RETURN
       ENDIF
 *
@@ -392,7 +392,7 @@
                CALL ZGETRF( N-(J+1)*NB, NB, 
      $                      WORK, N,
      $                      IPIV( (J+1)*NB+1 ), IINFO )
-c               IF (IINFO.NE.0 .AND. INFO.EQ.0) THEN
+c               IF( IINFO.NE.0 .AND. INFO.EQ.0 ) THEN
 c                  INFO = IINFO+(J+1)*NB
 c               END IF
 *
@@ -587,7 +587,7 @@ c               END IF
                CALL ZGETRF( N-(J+1)*NB, NB, 
      $                      A( (J+1)*NB+1, J*NB+1 ), LDA,
      $                      IPIV( (J+1)*NB+1 ), IINFO )
-c               IF (IINFO.NE.0 .AND. INFO.EQ.0) THEN
+c               IF( IINFO.NE.0 .AND. INFO.EQ.0 ) THEN
 c                  INFO = IINFO+(J+1)*NB
 c               END IF
 *         
diff --git a/lapack-netlib/SRC/zhetrf_rk.f b/lapack-netlib/SRC/zhetrf_rk.f
index 73dd9f9d0..7c505fa4d 100644
--- a/lapack-netlib/SRC/zhetrf_rk.f
+++ b/lapack-netlib/SRC/zhetrf_rk.f
@@ -177,14 +177,14 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX*16 array, dimension ( MAX(1,LWORK) ).
+*>          WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)).
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK.  LWORK >=1.  For best performance
+*>          The length of WORK.  LWORK >= 1.  For best performance
 *>          LWORK >= N*NB, where NB is the block size returned
 *>          by ILAENV.
 *>
@@ -229,7 +229,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16HEcomputational
+*> \ingroup hetrf_rk
 *
 *> \par Further Details:
 *  =====================
@@ -310,7 +310,7 @@
 *        Determine the block size
 *
          NB = ILAENV( 1, 'ZHETRF_RK', UPLO, N, -1, -1, -1 )
-         LWKOPT = N*NB
+         LWKOPT = MAX( 1, N*NB )
          WORK( 1 ) = LWKOPT
       END IF
 *
diff --git a/lapack-netlib/SRC/zhetrf_rook.f b/lapack-netlib/SRC/zhetrf_rook.f
index e9de47248..a56349092 100644
--- a/lapack-netlib/SRC/zhetrf_rook.f
+++ b/lapack-netlib/SRC/zhetrf_rook.f
@@ -122,7 +122,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The length of WORK.  LWORK >=1.  For best performance
+*>          The length of WORK.  LWORK >= 1.  For best performance
 *>          LWORK >= N*NB, where NB is the block size returned by ILAENV.
 *>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
@@ -150,7 +150,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16HEcomputational
+*> \ingroup hetrf_rook
 *
 *> \par Further Details:
 *  =====================
diff --git a/lapack-netlib/SRC/zhetri2.f b/lapack-netlib/SRC/zhetri2.f
index 384745c3a..1d932b866 100644
--- a/lapack-netlib/SRC/zhetri2.f
+++ b/lapack-netlib/SRC/zhetri2.f
@@ -88,16 +88,16 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX*16 array, dimension (N+NB+1)*(NB+3)
+*>          WORK is COMPLEX*16 array, dimension (MAX(1,LWORK)).
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
-*>          WORK is size >= (N+NB+1)*(NB+3)
+*>          If N = 0, LWORK >= 1, else LWORK >= (N+NB+1)*(NB+3).
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>           calculates:
+*>          calculates:
 *>              - the optimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array,
 *>              - and no error message related to LWORK is issued by XERBLA.
@@ -120,7 +120,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16HEcomputational
+*> \ingroup hetri2
 *
 *  =====================================================================
       SUBROUTINE ZHETRI2( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO )
@@ -159,9 +159,13 @@
       INFO = 0
       UPPER = LSAME( UPLO, 'U' )
       LQUERY = ( LWORK.EQ.-1 )
+*
 *     Get blocksize
+*
       NBMAX = ILAENV( 1, 'ZHETRF', UPLO, N, -1, -1, -1 )
-      IF ( NBMAX .GE. N ) THEN
+      IF( N.EQ.0 ) THEN
+         MINSIZE = 1
+      ELSE IF( NBMAX.GE.N ) THEN
          MINSIZE = N
       ELSE
          MINSIZE = (N+NBMAX+1)*(NBMAX+3)
@@ -173,28 +177,29 @@
          INFO = -2
       ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
          INFO = -4
-      ELSE IF (LWORK .LT. MINSIZE .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.MINSIZE .AND. .NOT.LQUERY ) THEN
          INFO = -7
       END IF
-*
-*     Quick return if possible
-*
 *
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'ZHETRI2', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         WORK(1)=MINSIZE
+         WORK( 1 ) = MINSIZE
          RETURN
       END IF
+*
+*     Quick return if possible
+*
       IF( N.EQ.0 )
      $   RETURN
 
-      IF( NBMAX .GE. N ) THEN
+      IF( NBMAX.GE.N ) THEN
          CALL ZHETRI( UPLO, N, A, LDA, IPIV, WORK, INFO )
       ELSE
          CALL ZHETRI2X( UPLO, N, A, LDA, IPIV, WORK, NBMAX, INFO )
       END IF
+*
       RETURN
 *
 *     End of ZHETRI2
diff --git a/lapack-netlib/SRC/zhetrs_aa.f b/lapack-netlib/SRC/zhetrs_aa.f
index 06ac1fd28..b7a1f7f07 100644
--- a/lapack-netlib/SRC/zhetrs_aa.f
+++ b/lapack-netlib/SRC/zhetrs_aa.f
@@ -106,7 +106,13 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          The dimension of the array WORK. LWORK >= max(1,3*N-2).
+*>          The dimension of the array WORK.
+*>          If MIN(N,NRHS) = 0, LWORK >= 1, else LWORK >= 3*N-2.
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates the minimal size of the WORK array, returns
+*>          this value as the first entry of the WORK array, and no error
+*>          message related to LWORK is issued by XERBLA.
 *> \endverbatim
 *>
 *> \param[out] INFO
@@ -124,7 +130,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16HEcomputational
+*> \ingroup hetrs_aa
 *
 *  =====================================================================
       SUBROUTINE ZHETRS_AA( UPLO, N, NRHS, A, LDA, IPIV, B, LDB,
@@ -152,7 +158,7 @@
 *     ..
 *     .. Local Scalars ..
       LOGICAL            LQUERY, UPPER
-      INTEGER            K, KP, LWKOPT
+      INTEGER            K, KP, LWKMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -162,13 +168,19 @@
       EXTERNAL           ZGTSV, ZSWAP, ZTRSM, ZLACGV, ZLACPY, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          MAX
+      INTRINSIC          MIN, MAX
 *     ..
 *     .. Executable Statements ..
 *
       INFO = 0
       UPPER = LSAME( UPLO, 'U' )
       LQUERY = ( LWORK.EQ.-1 )
+      IF( MIN( N, NRHS ).EQ.0 ) THEN
+         LWKMIN = 1
+      ELSE
+         LWKMIN = 3*N-2
+      END IF
+*
       IF( .NOT.UPPER .AND. .NOT.LSAME( UPLO, 'L' ) ) THEN
          INFO = -1
       ELSE IF( N.LT.0 ) THEN
@@ -179,21 +191,20 @@
          INFO = -5
       ELSE IF( LDB.LT.MAX( 1, N ) ) THEN
          INFO = -8
-      ELSE IF( LWORK.LT.MAX( 1, 3*N-2 ) .AND. .NOT.LQUERY ) THEN
+      ELSE IF( LWORK.LT.LWKMIN .AND. .NOT.LQUERY ) THEN
          INFO = -10
       END IF
       IF( INFO.NE.0 ) THEN
          CALL XERBLA( 'ZHETRS_AA', -INFO )
          RETURN
       ELSE IF( LQUERY ) THEN
-         LWKOPT = (3*N-2)
-         WORK( 1 ) = LWKOPT
+         WORK( 1 ) = LWKMIN
          RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( N.EQ.0 .OR. NRHS.EQ.0 )
+      IF( MIN( N, NRHS ).EQ.0 )
      $   RETURN
 *
       IF( UPPER ) THEN
diff --git a/lapack-netlib/SRC/zhpevd.f b/lapack-netlib/SRC/zhpevd.f
index 7625c8fe8..5260aaf14 100644
--- a/lapack-netlib/SRC/zhpevd.f
+++ b/lapack-netlib/SRC/zhpevd.f
@@ -41,12 +41,6 @@
 *> a complex Hermitian matrix A in packed storage.  If eigenvectors are
 *> desired, it uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/zhpgv.f b/lapack-netlib/SRC/zhpgv.f
index 72876b6e4..b92168555 100644
--- a/lapack-netlib/SRC/zhpgv.f
+++ b/lapack-netlib/SRC/zhpgv.f
@@ -144,7 +144,7 @@
 *>                    i off-diagonal elements of an intermediate
 *>                    tridiagonal form did not convergeto zero;
 *>             > N:   if INFO = N + i, for 1 <= i <= n, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zhpgvd.f b/lapack-netlib/SRC/zhpgvd.f
index e96e39738..dfe92067c 100644
--- a/lapack-netlib/SRC/zhpgvd.f
+++ b/lapack-netlib/SRC/zhpgvd.f
@@ -44,12 +44,6 @@
 *> positive definite.
 *> If eigenvectors are desired, it uses a divide and conquer algorithm.
 *>
-*> The divide and conquer algorithm makes very mild assumptions about
-*> floating point arithmetic. It will work on machines with a guard
-*> digit in add/subtract, or on those binary machines without guard
-*> digits which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or
-*> Cray-2. It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
@@ -205,7 +199,7 @@
 *>                    i off-diagonal elements of an intermediate
 *>                    tridiagonal form did not convergeto zero;
 *>             > N:   if INFO = N + i, for 1 <= i <= n, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zhpgvx.f b/lapack-netlib/SRC/zhpgvx.f
index 94d7f7733..de75b486b 100644
--- a/lapack-netlib/SRC/zhpgvx.f
+++ b/lapack-netlib/SRC/zhpgvx.f
@@ -250,7 +250,7 @@
 *>                    i eigenvectors failed to converge.  Their indices
 *>                    are stored in array IFAIL.
 *>             > N:   if INFO = N + i, for 1 <= i <= n, then the leading
-*>                    minor of order i of B is not positive definite.
+*>                    principal minor of order i of B is not positive.
 *>                    The factorization of B could not be completed and
 *>                    no eigenvalues or eigenvectors were computed.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zla_gbrfsx_extended.f b/lapack-netlib/SRC/zla_gbrfsx_extended.f
index fe4d635b1..2f57b7682 100644
--- a/lapack-netlib/SRC/zla_gbrfsx_extended.f
+++ b/lapack-netlib/SRC/zla_gbrfsx_extended.f
@@ -651,7 +651,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN
                CALL ZAXPY( N, (1.0D+0,0.0D+0), DY, 1, Y(1,J), 1 )
diff --git a/lapack-netlib/SRC/zla_gerfsx_extended.f b/lapack-netlib/SRC/zla_gerfsx_extended.f
index 9d618f294..22e45c5a9 100644
--- a/lapack-netlib/SRC/zla_gerfsx_extended.f
+++ b/lapack-netlib/SRC/zla_gerfsx_extended.f
@@ -636,7 +636,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN
                CALL ZAXPY( N, (1.0D+0,0.0D+0), DY, 1, Y(1,J), 1 )
diff --git a/lapack-netlib/SRC/zla_herfsx_extended.f b/lapack-netlib/SRC/zla_herfsx_extended.f
index a55dd9431..689460a02 100644
--- a/lapack-netlib/SRC/zla_herfsx_extended.f
+++ b/lapack-netlib/SRC/zla_herfsx_extended.f
@@ -655,7 +655,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN
                CALL ZAXPY( N, DCMPLX(1.0D+0), DY, 1, Y(1,J), 1 )
diff --git a/lapack-netlib/SRC/zla_porfsx_extended.f b/lapack-netlib/SRC/zla_porfsx_extended.f
index 12e05e049..e853494fc 100644
--- a/lapack-netlib/SRC/zla_porfsx_extended.f
+++ b/lapack-netlib/SRC/zla_porfsx_extended.f
@@ -626,7 +626,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF (Y_PREC_STATE .LT. EXTRA_Y) THEN
                CALL ZAXPY( N, DCMPLX(1.0D+0), DY, 1, Y(1,J), 1 )
diff --git a/lapack-netlib/SRC/zla_porpvgrw.f b/lapack-netlib/SRC/zla_porpvgrw.f
index 9b381a072..897589aa0 100644
--- a/lapack-netlib/SRC/zla_porpvgrw.f
+++ b/lapack-netlib/SRC/zla_porpvgrw.f
@@ -142,9 +142,9 @@
 *     .. Executable Statements ..
       UPPER = LSAME( 'Upper', UPLO )
 *
-*     DPOTRF will have factored only the NCOLSxNCOLS leading minor, so
-*     we restrict the growth search to that minor and use only the first
-*     2*NCOLS workspace entries.
+*     DPOTRF will have factored only the NCOLSxNCOLS leading submatrix,
+*     so we restrict the growth search to that submatrix and use only
+*     the first 2*NCOLS workspace entries.
 *
       RPVGRW = 1.0D+0
       DO I = 1, 2*NCOLS
diff --git a/lapack-netlib/SRC/zla_syrfsx_extended.f b/lapack-netlib/SRC/zla_syrfsx_extended.f
index d6c241499..fb1b9e2d7 100644
--- a/lapack-netlib/SRC/zla_syrfsx_extended.f
+++ b/lapack-netlib/SRC/zla_syrfsx_extended.f
@@ -655,7 +655,7 @@
             PREVNORMDX = NORMDX
             PREV_DZ_Z = DZ_Z
 *
-*           Update soluton.
+*           Update solution.
 *
             IF ( Y_PREC_STATE .LT. EXTRA_Y ) THEN
                CALL ZAXPY( N, DCMPLX(1.0D+0), DY, 1, Y(1,J), 1 )
diff --git a/lapack-netlib/SRC/zlaed7.f b/lapack-netlib/SRC/zlaed7.f
index 83f32d8b8..86e5ec6b5 100644
--- a/lapack-netlib/SRC/zlaed7.f
+++ b/lapack-netlib/SRC/zlaed7.f
@@ -363,7 +363,7 @@
             RETURN
          END IF
 *
-*     Prepare the INDXQ sorting premutation.
+*     Prepare the INDXQ sorting permutation.
 *
          N1 = K
          N2 = N - K
diff --git a/lapack-netlib/SRC/zlaed8.f b/lapack-netlib/SRC/zlaed8.f
index 995a673de..003725820 100644
--- a/lapack-netlib/SRC/zlaed8.f
+++ b/lapack-netlib/SRC/zlaed8.f
@@ -18,7 +18,7 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE ZLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMDA,
+*       SUBROUTINE ZLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMBDA,
 *                          Q2, LDQ2, W, INDXP, INDX, INDXQ, PERM, GIVPTR,
 *                          GIVCOL, GIVNUM, INFO )
 *
@@ -29,7 +29,7 @@
 *       .. Array Arguments ..
 *       INTEGER            GIVCOL( 2, * ), INDX( * ), INDXP( * ),
 *      $                   INDXQ( * ), PERM( * )
-*       DOUBLE PRECISION   D( * ), DLAMDA( * ), GIVNUM( 2, * ), W( * ),
+*       DOUBLE PRECISION   D( * ), DLAMBDA( * ), GIVNUM( 2, * ), W( * ),
 *      $                   Z( * )
 *       COMPLEX*16         Q( LDQ, * ), Q2( LDQ2, * )
 *       ..
@@ -122,9 +122,9 @@
 *>         destroyed during the updating process.
 *> \endverbatim
 *>
-*> \param[out] DLAMDA
+*> \param[out] DLAMBDA
 *> \verbatim
-*>          DLAMDA is DOUBLE PRECISION array, dimension (N)
+*>          DLAMBDA is DOUBLE PRECISION array, dimension (N)
 *>         Contains a copy of the first K eigenvalues which will be used
 *>         by DLAED3 to form the secular equation.
 *> \endverbatim
@@ -222,7 +222,7 @@
 *> \ingroup complex16OTHERcomputational
 *
 *  =====================================================================
-      SUBROUTINE ZLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMDA,
+      SUBROUTINE ZLAED8( K, N, QSIZ, Q, LDQ, D, RHO, CUTPNT, Z, DLAMBDA,
      $                   Q2, LDQ2, W, INDXP, INDX, INDXQ, PERM, GIVPTR,
      $                   GIVCOL, GIVNUM, INFO )
 *
@@ -237,7 +237,7 @@
 *     .. Array Arguments ..
       INTEGER            GIVCOL( 2, * ), INDX( * ), INDXP( * ),
      $                   INDXQ( * ), PERM( * )
-      DOUBLE PRECISION   D( * ), DLAMDA( * ), GIVNUM( 2, * ), W( * ),
+      DOUBLE PRECISION   D( * ), DLAMBDA( * ), GIVNUM( 2, * ), W( * ),
      $                   Z( * )
       COMPLEX*16         Q( LDQ, * ), Q2( LDQ2, * )
 *     ..
@@ -322,14 +322,14 @@
          INDXQ( I ) = INDXQ( I ) + CUTPNT
    20 CONTINUE
       DO 30 I = 1, N
-         DLAMDA( I ) = D( INDXQ( I ) )
+         DLAMBDA( I ) = D( INDXQ( I ) )
          W( I ) = Z( INDXQ( I ) )
    30 CONTINUE
       I = 1
       J = CUTPNT + 1
-      CALL DLAMRG( N1, N2, DLAMDA, 1, 1, INDX )
+      CALL DLAMRG( N1, N2, DLAMBDA, 1, 1, INDX )
       DO 40 I = 1, N
-         D( I ) = DLAMDA( INDX( I ) )
+         D( I ) = DLAMBDA( INDX( I ) )
          Z( I ) = W( INDX( I ) )
    40 CONTINUE
 *
@@ -438,7 +438,7 @@
          ELSE
             K = K + 1
             W( K ) = Z( JLAM )
-            DLAMDA( K ) = D( JLAM )
+            DLAMBDA( K ) = D( JLAM )
             INDXP( K ) = JLAM
             JLAM = J
          END IF
@@ -450,19 +450,19 @@
 *
       K = K + 1
       W( K ) = Z( JLAM )
-      DLAMDA( K ) = D( JLAM )
+      DLAMBDA( K ) = D( JLAM )
       INDXP( K ) = JLAM
 *
   100 CONTINUE
 *
-*     Sort the eigenvalues and corresponding eigenvectors into DLAMDA
+*     Sort the eigenvalues and corresponding eigenvectors into DLAMBDA
 *     and Q2 respectively.  The eigenvalues/vectors which were not
-*     deflated go into the first K slots of DLAMDA and Q2 respectively,
+*     deflated go into the first K slots of DLAMBDA and Q2 respectively,
 *     while those which were deflated go into the last N - K slots.
 *
       DO 110 J = 1, N
          JP = INDXP( J )
-         DLAMDA( J ) = D( JP )
+         DLAMBDA( J ) = D( JP )
          PERM( J ) = INDXQ( INDX( JP ) )
          CALL ZCOPY( QSIZ, Q( 1, PERM( J ) ), 1, Q2( 1, J ), 1 )
   110 CONTINUE
@@ -471,7 +471,7 @@
 *     into the last N - K slots of D and Q respectively.
 *
       IF( K.LT.N ) THEN
-         CALL DCOPY( N-K, DLAMDA( K+1 ), 1, D( K+1 ), 1 )
+         CALL DCOPY( N-K, DLAMBDA( K+1 ), 1, D( K+1 ), 1 )
          CALL ZLACPY( 'A', QSIZ, N-K, Q2( 1, K+1 ), LDQ2, Q( 1, K+1 ),
      $                LDQ )
       END IF
diff --git a/lapack-netlib/SRC/zlals0.f b/lapack-netlib/SRC/zlals0.f
index 7a7310042..79c0cf5e4 100644
--- a/lapack-netlib/SRC/zlals0.f
+++ b/lapack-netlib/SRC/zlals0.f
@@ -392,6 +392,11 @@
      $                ( POLES( I, 2 ).EQ.ZERO ) ) THEN
                      RWORK( I ) = ZERO
                   ELSE
+*
+*                    Use calls to the subroutine DLAMC3 to enforce the
+*                    parentheses (x+y)+z. The goal is to prevent
+*                    optimizing compilers from doing x+(y+z).
+*
                      RWORK( I ) = POLES( I, 2 )*Z( I ) /
      $                            ( DLAMC3( POLES( I, 2 ), DSIGJ )-
      $                            DIFLJ ) / ( POLES( I, 2 )+DJ )
@@ -470,6 +475,11 @@
                   IF( Z( J ).EQ.ZERO ) THEN
                      RWORK( I ) = ZERO
                   ELSE
+*
+*                    Use calls to the subroutine DLAMC3 to enforce the
+*                    parentheses (x+y)+z. The goal is to prevent
+*                    optimizing compilers from doing x+(y+z).
+*
                      RWORK( I ) = Z( J ) / ( DLAMC3( DSIGJ, -POLES( I+1,
      $                            2 ) )-DIFR( I, 1 ) ) /
      $                            ( DSIGJ+POLES( I, 1 ) ) / DIFR( I, 2 )
diff --git a/lapack-netlib/SRC/zlalsa.f b/lapack-netlib/SRC/zlalsa.f
index 73ccf3a8b..d419598d2 100644
--- a/lapack-netlib/SRC/zlalsa.f
+++ b/lapack-netlib/SRC/zlalsa.f
@@ -42,9 +42,9 @@
 *>
 *> \verbatim
 *>
-*> ZLALSA is an itermediate step in solving the least squares problem
+*> ZLALSA is an intermediate step in solving the least squares problem
 *> by computing the SVD of the coefficient matrix in compact form (The
-*> singular vectors are computed as products of simple orthorgonal
+*> singular vectors are computed as products of simple orthogonal
 *> matrices.).
 *>
 *> If ICOMPQ = 0, ZLALSA applies the inverse of the left singular vector
diff --git a/lapack-netlib/SRC/zlalsd.f b/lapack-netlib/SRC/zlalsd.f
index dca308e56..1d7358aa9 100644
--- a/lapack-netlib/SRC/zlalsd.f
+++ b/lapack-netlib/SRC/zlalsd.f
@@ -48,12 +48,6 @@
 *> problem; in this case a minimum norm solution is returned.
 *> The actual singular values are returned in D in ascending order.
 *>
-*> This code makes very mild assumptions about floating point
-*> arithmetic. It will work on machines with a guard digit in
-*> add/subtract, or on those binary machines without guard digits
-*> which subtract like the Cray XMP, Cray YMP, Cray C 90, or Cray 2.
-*> It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/zlamswlq.f b/lapack-netlib/SRC/zlamswlq.f
index 4abefa434..59a0a5558 100644
--- a/lapack-netlib/SRC/zlamswlq.f
+++ b/lapack-netlib/SRC/zlamswlq.f
@@ -127,17 +127,20 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK))
+*>          (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
-*>          If SIDE = 'L', LWORK >= max(1,NB) * MB;
-*>          if SIDE = 'R', LWORK >= max(1,M) * MB.
+*>          If MIN(M,N,K) = 0, LWORK >= 1.
+*>          If SIDE = 'L', LWORK >= max(1,NB*MB).
+*>          If SIDE = 'R', LWORK >= max(1,M*MB).
+*>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>          only calculates the optimal size of the WORK array, returns
+*>          only calculates the minimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
 *> \endverbatim
@@ -189,92 +192,103 @@
 *>     SIAM J. Sci. Comput, vol. 34, no. 1, 2012
 *> \endverbatim
 *>
+*> \ingroup lamswlq
+*>
 *  =====================================================================
       SUBROUTINE ZLAMSWLQ( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T,
-     $    LDT, C, LDC, WORK, LWORK, INFO )
+     $                     LDT, C, LDC, WORK, LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
-      CHARACTER         SIDE, TRANS
-      INTEGER           INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC
+      CHARACTER          SIDE, TRANS
+      INTEGER            INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC
 *     ..
 *     .. Array Arguments ..
-      COMPLEX*16        A( LDA, * ), WORK( * ), C(LDC, * ),
-     $      T( LDT, * )
+      COMPLEX*16         A( LDA, * ), WORK( * ), C( LDC, * ),
+     $                   T( LDT, * )
 *     ..
 *
 * =====================================================================
 *
 *     ..
 *     .. Local Scalars ..
-      LOGICAL    LEFT, RIGHT, TRAN, NOTRAN, LQUERY
-      INTEGER    I, II, KK, LW, CTR
+      LOGICAL            LEFT, RIGHT, TRAN, NOTRAN, LQUERY
+      INTEGER            I, II, KK, LW, CTR, MINMNK, LWMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       EXTERNAL           LSAME
+*     ..
 *     .. External Subroutines ..
-      EXTERNAL    ZTPMLQT, ZGEMLQT, XERBLA
+      EXTERNAL           ZTPMLQT, ZGEMLQT, XERBLA
 *     ..
 *     .. Executable Statements ..
 *
 *     Test the input arguments
 *
-      LQUERY  = LWORK.LT.0
+      INFO = 0
+      LQUERY  = ( LWORK.EQ.-1 )
       NOTRAN  = LSAME( TRANS, 'N' )
       TRAN    = LSAME( TRANS, 'C' )
       LEFT    = LSAME( SIDE, 'L' )
       RIGHT   = LSAME( SIDE, 'R' )
-      IF (LEFT) THEN
+      IF( LEFT ) THEN
         LW = N * MB
       ELSE
         LW = M * MB
       END IF
 *
-      INFO = 0
+      MINMNK = MIN( M, N, K )
+      IF( MINMNK.EQ.0 ) THEN
+        LWMIN = 1
+      ELSE
+        LWMIN = MAX( 1, LW )
+      END IF
+*
       IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN
-         INFO = -1
+        INFO = -1
       ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN
-         INFO = -2
+        INFO = -2
       ELSE IF( K.LT.0 ) THEN
         INFO = -5
       ELSE IF( M.LT.K ) THEN
         INFO = -3
       ELSE IF( N.LT.0 ) THEN
         INFO = -4
-      ELSE IF( K.LT.MB .OR. MB.LT.1) THEN
+      ELSE IF( K.LT.MB .OR. MB.LT.1 ) THEN
         INFO = -6
       ELSE IF( LDA.LT.MAX( 1, K ) ) THEN
         INFO = -9
-      ELSE IF( LDT.LT.MAX( 1, MB) ) THEN
+      ELSE IF( LDT.LT.MAX( 1, MB ) ) THEN
         INFO = -11
       ELSE IF( LDC.LT.MAX( 1, M ) ) THEN
-         INFO = -13
-      ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN
+        INFO = -13
+      ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN
         INFO = -15
       END IF
 *
+      IF( INFO.EQ.0 ) THEN
+        WORK( 1 ) = LWMIN
+      END IF
       IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'ZLAMSWLQ', -INFO )
-        WORK(1) = LW
         RETURN
-      ELSE IF (LQUERY) THEN
-        WORK(1) = LW
+      ELSE IF( LQUERY ) THEN
         RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( MIN(M,N,K).EQ.0 ) THEN
+      IF( MINMNK.EQ.0 ) THEN
         RETURN
       END IF
 *
       IF((NB.LE.K).OR.(NB.GE.MAX(M,N,K))) THEN
         CALL ZGEMLQT( SIDE, TRANS, M, N, K, MB, A, LDA,
-     $        T, LDT, C, LDC, WORK, INFO)
+     $        T, LDT, C, LDC, WORK, INFO )
         RETURN
       END IF
 *
@@ -403,7 +417,7 @@
 *
       END IF
 *
-      WORK(1) = LW
+      WORK( 1 ) = LWMIN
       RETURN
 *
 *     End of ZLAMSWLQ
diff --git a/lapack-netlib/SRC/zlamtsqr.f b/lapack-netlib/SRC/zlamtsqr.f
index 5030cb75f..03770c06e 100644
--- a/lapack-netlib/SRC/zlamtsqr.f
+++ b/lapack-netlib/SRC/zlamtsqr.f
@@ -128,22 +128,24 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK))
-*>
+*>          (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
+*>
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK.
+*>          If MIN(M,N,K) = 0, LWORK >= 1.
+*>          If SIDE = 'L', LWORK >= max(1,N*NB).
+*>          If SIDE = 'R', LWORK >= max(1,MB*NB).
 *>
-*>          If SIDE = 'L', LWORK >= max(1,N)*NB;
-*>          if SIDE = 'R', LWORK >= max(1,MB)*NB.
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>          only calculates the optimal size of the WORK array, returns
+*>          only calculates the minimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
-*>
 *> \endverbatim
+*>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
@@ -191,46 +193,50 @@
 *>     SIAM J. Sci. Comput, vol. 34, no. 1, 2012
 *> \endverbatim
 *>
+*> \ingroup lamtsqr
+*>
 *  =====================================================================
       SUBROUTINE ZLAMTSQR( SIDE, TRANS, M, N, K, MB, NB, A, LDA, T,
-     $        LDT, C, LDC, WORK, LWORK, INFO )
+     $                     LDT, C, LDC, WORK, LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *
 *     .. Scalar Arguments ..
-      CHARACTER         SIDE, TRANS
-      INTEGER           INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC
+      CHARACTER          SIDE, TRANS
+      INTEGER            INFO, LDA, M, N, K, MB, NB, LDT, LWORK, LDC
 *     ..
 *     .. Array Arguments ..
-      COMPLEX*16        A( LDA, * ), WORK( * ), C(LDC, * ),
-     $                T( LDT, * )
+      COMPLEX*16         A( LDA, * ), WORK( * ), C( LDC, * ),
+     $                   T( LDT, * )
 *     ..
 *
 * =====================================================================
 *
 *     ..
 *     .. Local Scalars ..
-      LOGICAL    LEFT, RIGHT, TRAN, NOTRAN, LQUERY
-      INTEGER    I, II, KK, LW, CTR, Q
+      LOGICAL            LEFT, RIGHT, TRAN, NOTRAN, LQUERY
+      INTEGER            I, II, KK, LW, CTR, Q, MINMNK, LWMIN
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
       EXTERNAL           LSAME
+*     ..
 *     .. External Subroutines ..
-      EXTERNAL   ZGEMQRT, ZTPMQRT, XERBLA
+      EXTERNAL           ZGEMQRT, ZTPMQRT, XERBLA
 *     ..
 *     .. Executable Statements ..
 *
 *     Test the input arguments
 *
-      LQUERY  = LWORK.LT.0
+      INFO = 0
+      LQUERY  = ( LWORK.EQ.-1 )
       NOTRAN  = LSAME( TRANS, 'N' )
       TRAN    = LSAME( TRANS, 'C' )
       LEFT    = LSAME( SIDE, 'L' )
       RIGHT   = LSAME( SIDE, 'R' )
-      IF (LEFT) THEN
+      IF( LEFT ) THEN
         LW = N * NB
         Q = M
       ELSE
@@ -238,11 +244,17 @@
         Q = N
       END IF
 *
-      INFO = 0
+      MINMNK = MIN( M, N, K )
+      IF( MINMNK.EQ.0 ) THEN
+        LWMIN = 1
+      ELSE
+        LWMIN = MAX( 1, LW )
+      END IF
+*
       IF( .NOT.LEFT .AND. .NOT.RIGHT ) THEN
-         INFO = -1
+        INFO = -1
       ELSE IF( .NOT.TRAN .AND. .NOT.NOTRAN ) THEN
-         INFO = -2
+        INFO = -2
       ELSE IF( M.LT.K ) THEN
         INFO = -3
       ELSE IF( N.LT.0 ) THEN
@@ -253,38 +265,38 @@
         INFO = -7
       ELSE IF( LDA.LT.MAX( 1, Q ) ) THEN
         INFO = -9
-      ELSE IF( LDT.LT.MAX( 1, NB) ) THEN
+      ELSE IF( LDT.LT.MAX( 1, NB ) ) THEN
         INFO = -11
       ELSE IF( LDC.LT.MAX( 1, M ) ) THEN
-         INFO = -13
-      ELSE IF(( LWORK.LT.MAX(1,LW)).AND.(.NOT.LQUERY)) THEN
+        INFO = -13
+      ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN
         INFO = -15
       END IF
 *
-*     Determine the block size if it is tall skinny or short and wide
-*
-      IF( INFO.EQ.0)  THEN
-          WORK(1) = LW
+      IF( INFO.EQ.0 )  THEN
+        WORK( 1 ) = LWMIN
       END IF
 *
       IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'ZLAMTSQR', -INFO )
         RETURN
-      ELSE IF (LQUERY) THEN
-       RETURN
+      ELSE IF( LQUERY ) THEN
+        RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( MIN(M,N,K).EQ.0 ) THEN
+      IF( MINMNK.EQ.0 ) THEN
         RETURN
       END IF
+*
+*     Determine the block size if it is tall skinny or short and wide
 *
       IF((MB.LE.K).OR.(MB.GE.MAX(M,N,K))) THEN
         CALL ZGEMQRT( SIDE, TRANS, M, N, K, NB, A, LDA,
-     $        T, LDT, C, LDC, WORK, INFO)
+     $        T, LDT, C, LDC, WORK, INFO )
         RETURN
-       END IF
+      END IF
 *
       IF(LEFT.AND.NOTRAN) THEN
 *
@@ -410,7 +422,7 @@
 *
       END IF
 *
-      WORK(1) = LW
+      WORK( 1 ) = LWMIN
       RETURN
 *
 *     End of ZLAMTSQR
diff --git a/lapack-netlib/SRC/zlaqp2rk.c b/lapack-netlib/SRC/zlaqp2rk.c
new file mode 100644
index 000000000..0d38e71fb
--- /dev/null
+++ b/lapack-netlib/SRC/zlaqp2rk.c
@@ -0,0 +1,947 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER
+#define c_div(c, a, b) {Cf(c)._Val[0] = (Cf(a)._Val[0]/Cf(b)._Val[0]); Cf(c)._Val[1]=(Cf(a)._Val[1]/Cf(b)._Val[1]);}
+#define z_div(c, a, b) {Cd(c)._Val[0] = (Cd(a)._Val[0]/Cd(b)._Val[0]); Cd(c)._Val[1]=(Cd(a)._Val[1]/Cd(b)._Val[1]);}
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) { /* returns x**n for an integer exponent n, by repeated squaring */
+	float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x; /* negative exponent: raise 1/x to the power |n| */
+		for(u = n; ; ) {
+			if(u & 01) pow *= x; /* low bit of u set: multiply the current square into the result */
+			if(u >>= 1) x *= x; /* bits remain: square x for the next bit */
+			else break;
+		}
+	}
+	return pow; /* still 1.0 when n == 0 */
+}
+static double dpow_ui(double x, integer n) { /* returns x**n for an integer exponent n, by repeated squaring */
+	double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x; /* negative exponent: raise 1/x to the power |n| */
+		for(u = n; ; ) {
+			if(u & 01) pow *= x; /* low bit of u set: multiply the current square into the result */
+			if(u >>= 1) x *= x; /* bits remain: square x for the next bit */
+			else break;
+		}
+	}
+	return pow; /* still 1.0 when n == 0 */
+}
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) { /* MSVC variant: integer power of a single-precision complex value */
+	complex pow={1.0,0.0}; unsigned long int u;
+		if(n != 0) {
+		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i; /* NOTE(review): inverts .r and .i separately, not a complex reciprocal - confirm intended */
+		for(u = n; ; ) {
+			if(u & 01) pow.r *= x.r, pow.i *= x.i; /* NOTE(review): component-wise product, not a complex multiply - confirm intended */
+			if(u >>= 1) x.r *= x.r, x.i *= x.i; /* NOTE(review): component-wise squaring as well */
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i}; /* repackage the f2c struct as the MSVC complex type */
+	return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) { /* returns x**n for an integer exponent n, by repeated squaring */
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x; /* negative exponent: raise 1/x to the power |n| */
+		for(u = n; ; ) {
+			if(u & 01) pow *= x; /* low bit of u set: multiply the current square into the result */
+			if(u >>= 1) x *= x; /* bits remain: square x for the next bit */
+			else break;
+		}
+	}
+	return pow; /* still 1.0 when n == 0 */
+}
+#endif
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) { /* MSVC variant: integer power of a double-precision complex value */
+	_Dcomplex pow={1.0,0.0}; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1]; /* NOTE(review): inverts each component separately, not a complex reciprocal - confirm intended */
+		for(u = n; ; ) {
+			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1]; /* NOTE(review): component-wise product, not a complex multiply - confirm intended */
+			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1]; /* NOTE(review): component-wise squaring as well */
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]}; /* plain copy of the accumulated value */
+	return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) { /* returns x**n for an integer exponent n, by repeated squaring */
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x; /* negative exponent: raise 1/x to the power |n| */
+		for(u = n; ; ) {
+			if(u & 01) pow *= x; /* low bit of u set: multiply the current square into the result */
+			if(u >>= 1) x *= x; /* bits remain: square x for the next bit */
+			else break;
+		}
+	}
+	return pow; /* still 1.0 when n == 0 */
+}
+#endif
+static integer pow_ii(integer x, integer n) { /* returns x**n computed entirely in integer arithmetic */
+	integer pow; unsigned long int u;
+	if (n <= 0) {
+		if (n == 0 || x == 1) pow = 1; /* x**0 and 1**n are both 1 */
+		else if (x != -1) pow = x == 0 ? 1/x : 0; /* negative power of |x| > 1 gives 0; x == 0 evaluates 1/x, an integer division by zero */
+		else n = -n; /* x == -1 with n < 0: the loop below computes (-1)**|n| */
+	}
+	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) { /* the negated operand is true only for x == -1 with n != 0 */
+		u = n;
+		for(pow = 1; ; ) { /* square-and-multiply over the bits of n */
+			if(u & 01) pow *= x; /* low bit of u set: multiply the current square into the result */
+			if(u >>= 1) x *= x; /* bits remain: square x for the next bit */
+			else break;
+		}
+	}
+	return pow;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n) /* position of the largest of w[s-1..e-1]; n is not used */
+{
+	double m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++) /* s and e are 1-based and inclusive; w[s-1] seeds the running maximum */
+		if (w[i-1]>m) mi=i ,m=w[i-1]; /* strict > keeps the first of several equal maxima */
+	return mi-s+1; /* 1-based position counted from s */
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n) /* position of the largest of w[s-1..e-1]; n is not used */
+{
+	float m; integer i, mi;
+	for(m=w[s-1], mi=s, i=s+1; i<=e; i++) /* s and e are 1-based and inclusive; w[s-1] seeds the running maximum */
+		if (w[i-1]>m) mi=i ,m=w[i-1]; /* strict > keeps the first of several equal maxima */
+	return mi-s+1; /* 1-based position counted from s */
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { /* conjugated dot product of x and y over n elements, stored in *z */
+	integer n = *n_, incx = *incx_, incy = *incy_, i; /* NOTE(review): x[i*incx] has no start offset for negative increments - confirm callers pass positive strides */
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0}; /* NOTE(review): this branch multiplies real*real and imag*imag separately, not a complex product - confirm intended */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0; /* running sum of conj(x[i]) * y[i] */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc; /* write the result through the output pointer */
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) { /* conjugated dot product of x and y over n elements, stored in *z */
+	integer n = *n_, incx = *incx_, incy = *incy_, i; /* NOTE(review): x[i*incx] has no start offset for negative increments - confirm callers pass positive strides */
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0}; /* NOTE(review): this branch multiplies real*real and imag*imag separately, not a complex product - confirm intended */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0; /* running sum of conj(x[i]) * y[i] */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc; /* write the result through the output pointer */
+}
+#endif	
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { /* unconjugated dot product of x and y over n elements, stored in *z */
+	integer n = *n_, incx = *incx_, incy = *incy_, i; /* NOTE(review): x[i*incx] has no start offset for negative increments - confirm callers pass positive strides */
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0}; /* NOTE(review): this branch multiplies real*real and imag*imag separately, not a complex product - confirm intended */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i), no conjugation */
+			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
+			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i), no conjugation */
+			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0; /* running sum of x[i] * y[i] */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i), no conjugation */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i), no conjugation */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc; /* write the result through the output pointer */
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) { /* unconjugated dot product of x and y over n elements, stored in *z */
+	integer n = *n_, incx = *incx_, incy = *incy_, i; /* NOTE(review): x[i*incx] has no start offset for negative increments - confirm callers pass positive strides */
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0}; /* NOTE(review): this branch multiplies real*real and imag*imag separately, not a complex product - confirm intended */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i), no conjugation */
+			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
+			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i), no conjugation */
+			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
+			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0; /* running sum of x[i] * y[i] */
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i), no conjugation */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i), no conjugation */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc; /* write the result through the output pointer */
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static integer c__1 = 1;
+
+/* Subroutine */ int zlaqp2rk_(integer *m, integer *n, integer *nrhs, integer 
+	*ioffset, integer *kmax, doublereal *abstol, doublereal *reltol, 
+	integer *kp1, doublereal *maxc2nrm, doublecomplex *a, integer *lda, 
+	integer *k, doublereal *maxc2nrmk, doublereal *relmaxc2nrmk, integer *
+	jpiv, doublecomplex *tau, doublereal *vn1, doublereal *vn2, 
+	doublecomplex *work, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, i__1, i__2, i__3;
+    doublereal d__1;
+    doublecomplex z__1;
+
+    /* Local variables */
+    doublecomplex aikk;
+    doublereal temp, temp2;
+    integer i__, j;
+    doublereal tol3z;
+    integer jmaxc2nrm, itemp;
+    extern /* Subroutine */ int zlarf_(char *, integer *, integer *, 
+	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
+	    integer *, doublecomplex *);
+    integer minmnfact;
+    extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, 
+	    doublecomplex *, integer *);
+    doublereal myhugeval;
+    integer minmnupdt;
+    extern doublereal dznrm2_(integer *, doublecomplex *, integer *);
+    integer kk;
+    extern doublereal dlamch_(char *);
+    integer kp;
+    extern integer idamax_(integer *, doublereal *, integer *);
+    extern logical disnan_(doublereal *);
+    extern /* Subroutine */ int zlarfg_(integer *, doublecomplex *, 
+	    doublecomplex *, integer *, doublecomplex *);
+    doublereal taunan;
+
+
+/*  -- LAPACK auxiliary routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+
+/*  ===================================================================== */
+
+
+/*     Initialize INFO */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --jpiv;
+    --tau;
+    --vn1;
+    --vn2;
+    --work;
+
+    /* Function Body */
+    *info = 0;
+
+/*     MINMNFACT is the smallest dimension of the submatrix */
+/*     A(IOFFSET+1:M,1:N) to be factorized. */
+
+/*     MINMNUPDT is the smallest dimension */
+/*     of the subarray A(IOFFSET+1:M,1:N+NRHS) to be updated, which */
+/*     contains the submatrices A(IOFFSET+1:M,1:N) and */
+/*     B(IOFFSET+1:M,1:NRHS) as column blocks. */
+
+/* Computing MIN */
+    i__1 = *m - *ioffset;
+    minmnfact = f2cmin(i__1,*n);
+/* Computing MIN */
+    i__1 = *m - *ioffset, i__2 = *n + *nrhs;
+    minmnupdt = f2cmin(i__1,i__2);
+    *kmax = f2cmin(*kmax,minmnfact);
+    tol3z = sqrt(dlamch_("Epsilon"));
+    myhugeval = dlamch_("Overflow");
+
+/*     Compute the factorization, KK is the main loop index. */
+
+    i__1 = *kmax;
+    for (kk = 1; kk <= i__1; ++kk) {
+
+	i__ = *ioffset + kk;
+
+	if (i__ == 1) {
+
+/*           ============================================================ */
+
+/*           We are at the first column of the original whole matrix A, */
+/*           therefore we use the computed KP1 and MAXC2NRM from the */
+/*           main routine. */
+
+	    kp = *kp1;
+
+/*           ============================================================ */
+
+	} else {
+
+/*           ============================================================ */
+
+/*           Determine the pivot column in KK-th step, i.e. the index */
+/*           of the column with the maximum 2-norm in the */
+/*           submatrix A(I:M,K:N). */
+
+	    i__2 = *n - kk + 1;
+	    kp = kk - 1 + idamax_(&i__2, &vn1[kk], &c__1);
+
+/*           Determine the maximum column 2-norm and the relative maximum */
+/*           column 2-norm of the submatrix A(I:M,KK:N) in step KK. */
+/*           RELMAXC2NRMK  will be computed later, after somecondition */
+/*           checks on MAXC2NRMK. */
+
+	    *maxc2nrmk = vn1[kp];
+
+/*           ============================================================ */
+
+/*           Check if the submatrix A(I:M,KK:N) contains NaN, and set */
+/*           INFO parameter to the column number, where the first NaN */
+/*           is found and return from the routine. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (disnan_(maxc2nrmk)) {
+
+/*              Set K, the number of factorized columns. */
+/*              that are not zero. */
+
+		*k = kk - 1;
+		*info = *k + kp;
+
+/*               Set RELMAXC2NRMK to NaN. */
+
+		*relmaxc2nrmk = *maxc2nrmk;
+
+/*               Array TAU(K+1:MINMNFACT) is not set and contains */
+/*               undefined elements. */
+
+		return 0;
+	    }
+
+/*           ============================================================ */
+
+/*           Quick return, if the submatrix A(I:M,KK:N) is */
+/*           a zero matrix. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (*maxc2nrmk == 0.) {
+
+/*              Set K, the number of factorized columns. */
+/*              that are not zero. */
+
+		*k = kk - 1;
+		*relmaxc2nrmk = 0.;
+
+/*              Set TAUs corresponding to the columns that were not */
+/*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO. */
+
+		i__2 = minmnfact;
+		for (j = kk; j <= i__2; ++j) {
+		    i__3 = j;
+		    tau[i__3].r = 0., tau[i__3].i = 0.;
+		}
+
+/*              Return from the routine. */
+
+		return 0;
+
+	    }
+
+/*           ============================================================ */
+
+/*           Check if the submatrix A(I:M,KK:N) contains Inf, */
+/*           set INFO parameter to the column number, where */
+/*           the first Inf is found plus N, and continue */
+/*           the computation. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (*info == 0 && *maxc2nrmk > myhugeval) {
+		*info = *n + kk - 1 + kp;
+	    }
+
+/*           ============================================================ */
+
+/*           Test for the second and third stopping criteria. */
+/*           NOTE: There is no need to test for ABSTOL >= ZERO, since */
+/*           MAXC2NRMK is non-negative. Similarly, there is no need */
+/*           to test for RELTOL >= ZERO, since RELMAXC2NRMK is */
+/*           non-negative. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+	    *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm;
+
+	    if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) {
+
+/*              Set K, the number of factorized columns. */
+
+		*k = kk - 1;
+
+/*              Set TAUs corresponding to the columns that were not */
+/*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO. */
+
+		i__2 = minmnfact;
+		for (j = kk; j <= i__2; ++j) {
+		    i__3 = j;
+		    tau[i__3].r = 0., tau[i__3].i = 0.;
+		}
+
+/*              Return from the routine. */
+
+		return 0;
+
+	    }
+
+/*           ============================================================ */
+
+/*           End ELSE of IF(I.EQ.1) */
+
+	}
+
+/*        =============================================================== */
+
+/*        If the pivot column is not the first column of the */
+/*        subblock A(1:M,KK:N): */
+/*        1) swap the KK-th column and the KP-th pivot column */
+/*           in A(1:M,1:N); */
+/*        2) copy the KK-th element into the KP-th element of the partial */
+/*           and exact 2-norm vectors VN1 and VN2. ( Swap is not needed */
+/*           for VN1 and VN2 since we use the element with the index */
+/*           larger than KK in the next loop step.) */
+/*        3) Save the pivot interchange with the indices relative to the */
+/*           the original matrix A, not the block A(1:M,1:N). */
+
+	if (kp != kk) {
+	    zswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[kk * a_dim1 + 1], &c__1);
+	    vn1[kp] = vn1[kk];
+	    vn2[kp] = vn2[kk];
+	    itemp = jpiv[kp];
+	    jpiv[kp] = jpiv[kk];
+	    jpiv[kk] = itemp;
+	}
+
+/*        Generate elementary reflector H(KK) using the column A(I:M,KK), */
+/*        if the column has more than one element, otherwise */
+/*        the elementary reflector would be an identity matrix, */
+/*        and TAU(KK) = CZERO. */
+
+	if (i__ < *m) {
+	    i__2 = *m - i__ + 1;
+	    zlarfg_(&i__2, &a[i__ + kk * a_dim1], &a[i__ + 1 + kk * a_dim1], &
+		    c__1, &tau[kk]);
+	} else {
+	    i__2 = kk;
+	    tau[i__2].r = 0., tau[i__2].i = 0.;
+	}
+
+/*        Check if TAU(KK) contains NaN, set INFO parameter */
+/*        to the column number where NaN is found and return from */
+/*        the routine. */
+/*        NOTE: There is no need to check TAU(KK) for Inf, */
+/*        since ZLARFG cannot produce TAU(KK) or Householder vector */
+/*        below the diagonal containing Inf. Only BETA on the diagonal, */
+/*        returned by ZLARFG can contain Inf, which requires */
+/*        TAU(KK) to contain NaN. Therefore, this case of generating Inf */
+/*        by ZLARFG is covered by checking TAU(KK) for NaN. */
+
+	i__2 = kk;
+	d__1 = tau[i__2].r;
+	if (disnan_(&d__1)) {
+	    i__2 = kk;
+	    taunan = tau[i__2].r;
+	} else /* if(complicated condition) */ {
+	    d__1 = d_imag(&tau[kk]);
+	    if (disnan_(&d__1)) {
+		taunan = d_imag(&tau[kk]);
+	    } else {
+		taunan = 0.;
+	    }
+	}
+
+	if (disnan_(&taunan)) {
+	    *k = kk - 1;
+	    *info = kk;
+
+/*           Set MAXC2NRMK and  RELMAXC2NRMK to NaN. */
+
+	    *maxc2nrmk = taunan;
+	    *relmaxc2nrmk = taunan;
+
+/*           Array TAU(KK:MINMNFACT) is not set and contains */
+/*           undefined elements, except the first element TAU(KK) = NaN. */
+
+	    return 0;
+	}
+
+/*        Apply H(KK)**H to A(I:M,KK+1:N+NRHS) from the left. */
+/*        ( If M >= N, then at KK = N there is no residual matrix, */
+/*         i.e. no columns of A to update, only columns of B. */
+/*         If M < N, then at KK = M-IOFFSET, I = M and we have a */
+/*         one-row residual matrix in A and the elementary */
+/*         reflector is a unit matrix, TAU(KK) = CZERO, i.e. no update */
+/*         is needed for the residual matrix in A and the */
+/*         right-hand-side-matrix in B. */
+/*         Therefore, we update only if */
+/*         KK < MINMNUPDT = f2cmin(M-IOFFSET, N+NRHS) */
+/*         condition is satisfied, not only KK < N+NRHS ) */
+
+	if (kk < minmnupdt) {
+	    i__2 = i__ + kk * a_dim1;
+	    aikk.r = a[i__2].r, aikk.i = a[i__2].i;
+	    i__2 = i__ + kk * a_dim1;
+	    a[i__2].r = 1., a[i__2].i = 0.;
+	    i__2 = *m - i__ + 1;
+	    i__3 = *n + *nrhs - kk;
+	    d_cnjg(&z__1, &tau[kk]);
+	    zlarf_("Left", &i__2, &i__3, &a[i__ + kk * a_dim1], &c__1, &z__1, 
+		    &a[i__ + (kk + 1) * a_dim1], lda, &work[1]);
+	    i__2 = i__ + kk * a_dim1;
+	    a[i__2].r = aikk.r, a[i__2].i = aikk.i;
+	}
+
+	if (kk < minmnfact) {
+
+/*           Update the partial column 2-norms for the residual matrix, */
+/*           only if the residual matrix A(I+1:M,KK+1:N) exists, i.e. */
+/*           when KK < f2cmin(M-IOFFSET, N). */
+
+	    i__2 = *n;
+	    for (j = kk + 1; j <= i__2; ++j) {
+		if (vn1[j] != 0.) {
+
+/*                 NOTE: The following lines follow from the analysis in */
+/*                 Lapack Working Note 176. */
+
+/* Computing 2nd power */
+		    d__1 = z_abs(&a[i__ + j * a_dim1]) / vn1[j];
+		    temp = 1. - d__1 * d__1;
+		    temp = f2cmax(temp,0.);
+/* Computing 2nd power */
+		    d__1 = vn1[j] / vn2[j];
+		    temp2 = temp * (d__1 * d__1);
+		    if (temp2 <= tol3z) {
+
+/*                    Compute the column 2-norm for the partial */
+/*                    column A(I+1:M,J) by explicitly computing it, */
+/*                    and store it in both partial 2-norm vector VN1 */
+/*                    and exact column 2-norm vector VN2. */
+
+			i__3 = *m - i__;
+			vn1[j] = dznrm2_(&i__3, &a[i__ + 1 + j * a_dim1], &
+				c__1);
+			vn2[j] = vn1[j];
+
+		    } else {
+
+/*                    Update the column 2-norm for the partial */
+/*                    column A(I+1:M,J) by removing one */
+/*                    element A(I,J) and store it in partial */
+/*                    2-norm vector VN1. */
+
+			vn1[j] *= sqrt(temp);
+
+		    }
+		}
+	    }
+
+	}
+
+/*     End factorization loop */
+
+    }
+
+/*     If we reached this point, all colunms have been factorized, */
+/*     i.e. no condition was triggered to exit the routine. */
+/*     Set the number of factorized columns. */
+
+    *k = *kmax;
+
+/*     We reached the end of the loop, i.e. all KMAX columns were */
+/*     factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before */
+/*     we return. */
+
+    if (*k < minmnfact) {
+
+	i__1 = *n - *k;
+	jmaxc2nrm = *k + idamax_(&i__1, &vn1[*k + 1], &c__1);
+	*maxc2nrmk = vn1[jmaxc2nrm];
+
+	if (*k == 0) {
+	    *relmaxc2nrmk = 1.;
+	} else {
+	    *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm;
+	}
+
+    } else {
+	*maxc2nrmk = 0.;
+	*relmaxc2nrmk = 0.;
+    }
+
+/*     We reached the end of the loop, i.e. all KMAX columns were */
+/*     factorized, set TAUs corresponding to the columns that were */
+/*     not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to CZERO. */
+
+    i__1 = minmnfact;
+    for (j = *k + 1; j <= i__1; ++j) {
+	i__2 = j;
+	tau[i__2].r = 0., tau[i__2].i = 0.;
+    }
+
+    return 0;
+
+/*     End of ZLAQP2RK */
+
+} /* zlaqp2rk_ */
+
diff --git a/lapack-netlib/SRC/zlaqp2rk.f b/lapack-netlib/SRC/zlaqp2rk.f
new file mode 100644
index 000000000..f6bf555c2
--- /dev/null
+++ b/lapack-netlib/SRC/zlaqp2rk.f
@@ -0,0 +1,726 @@
+*> \brief \b ZLAQP2RK computes truncated QR factorization with column pivoting of a complex matrix block using Level 2 BLAS and overwrites a complex m-by-nrhs matrix B with Q**H * B.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZLAQP2RK + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zlaqp2rk.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zlaqp2rk.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zlaqp2rk.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE ZLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL,
+*     $                     KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK,
+*     $                     RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK,
+*     $                     INFO )
+*      IMPLICIT NONE
+*
+*     .. Scalar Arguments ..
+*      INTEGER            INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS
+*      DOUBLE PRECISION   ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+*     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+*      INTEGER            JPIV( * )
+*      DOUBLE PRECISION   VN1( * ), VN2( * )
+*      COMPLEX*16         A( LDA, * ), TAU( * ), WORK( * )
+*     ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZLAQP2RK computes a truncated (rank K) or full rank Householder QR
+*> factorization with column pivoting of the complex matrix
+*> block A(IOFFSET+1:M,1:N) as
+*>
+*>   A * P(K) = Q(K) * R(K).
+*>
+*> The routine uses Level 2 BLAS. The block A(1:IOFFSET,1:N)
+*> is accordingly pivoted, but not factorized.
+*>
+*> The routine also overwrites the right-hand-sides matrix block B
+*> stored in A(IOFFSET+1:M,N+1:N+NRHS) with Q(K)**H * B.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. N >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of
+*>          columns of the matrix B. NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in] IOFFSET
+*> \verbatim
+*>          IOFFSET is INTEGER
+*>          The number of rows of the matrix A that must be pivoted
+*>          but not factorized. IOFFSET >= 0.
+*>
+*>          IOFFSET also represents the number of columns of the whole
+*>          original matrix A_orig that have been factorized
+*>          in the previous steps.
+*> \endverbatim
+*>
+*> \param[in] KMAX
+*> \verbatim
+*>          KMAX is INTEGER
+*>
+*>          The first factorization stopping criterion. KMAX >= 0.
+*>
+*>          The maximum number of columns of the matrix A to factorize,
+*>          i.e. the maximum factorization rank.
+*>
+*>          a) If KMAX >= min(M-IOFFSET,N), then this stopping
+*>                criterion is not used, factorize columns
+*>                depending on ABSTOL and RELTOL.
+*>
+*>          b) If KMAX = 0, then this stopping criterion is
+*>             satisfied on input and the routine exits immediately.
+*>             This means that the factorization is not performed,
+*>             the matrices A and B and the arrays TAU, JPIV
+*>             are not modified.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is DOUBLE PRECISION, cannot be NaN.
+*>
+*>          The second factorization stopping criterion.
+*>
+*>          The absolute tolerance (stopping threshold) for
+*>          maximum column 2-norm of the residual matrix.
+*>          The algorithm converges (stops the factorization) when
+*>          the maximum column 2-norm of the residual matrix
+*>          is less than or equal to ABSTOL.
+*>
+*>          a) If ABSTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and RELTOL.
+*>                This includes the case ABSTOL = -Inf.
+*>
+*>          b) If 0.0 <= ABSTOL then the input value
+*>                of ABSTOL is used.
+*> \endverbatim
+*>
+*> \param[in] RELTOL
+*> \verbatim
+*>          RELTOL is DOUBLE PRECISION, cannot be NaN.
+*>
+*>          The third factorization stopping criterion.
+*>
+*>          The tolerance (stopping threshold) for the ratio of the
+*>          maximum column 2-norm of the residual matrix to the maximum
+*>          column 2-norm of the original matrix A_orig. The algorithm
+*>          converges (stops the factorization), when this ratio is
+*>          less than or equal to RELTOL.
+*>
+*>          a) If RELTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on KMAX and ABSTOL.
+*>                This includes the case RELTOL = -Inf.
+*>
+*>          b) If 0.0 <= RELTOL then the input value of RELTOL
+*>                is used.
+*> \endverbatim
+*>
+*> \param[in] KP1
+*> \verbatim
+*>          KP1 is INTEGER
+*>          The index of the column with the maximum 2-norm in
+*>          the whole original matrix A_orig determined in the
+*>          main routine ZGEQP3RK. 1 <= KP1 <= N_orig_mat.
+*> \endverbatim
+*>
+*> \param[in] MAXC2NRM
+*> \verbatim
+*>          MAXC2NRM is DOUBLE PRECISION
+*>          The maximum column 2-norm of the whole original
+*>          matrix A_orig computed in the main routine ZGEQP3RK.
+*>          MAXC2NRM >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX*16 array, dimension (LDA,N+NRHS)
+*>          On entry:
+*>              the M-by-N matrix A and M-by-NRHS matrix B, as in
+*>
+*>                                  N     NRHS
+*>              array_A   =   M  [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>          1. The elements in block A(IOFFSET+1:M,1:K) below
+*>             the diagonal together with the array TAU represent
+*>             the unitary matrix Q(K) as a product of elementary
+*>             reflectors.
+*>          2. The upper triangular block of the matrix A stored
+*>             in A(IOFFSET+1:M,1:K) is the triangular factor obtained.
+*>          3. The block of the matrix A stored in A(1:IOFFSET,1:N)
+*>             has been accordingly pivoted, but not factorized.
+*>          4. The rest of the array A, block A(IOFFSET+1:M,K+1:N+NRHS).
+*>             The left part A(IOFFSET+1:M,K+1:N) of this block
+*>             contains the residual of the matrix A, and,
+*>             if NRHS > 0, the right part of the block
+*>             A(IOFFSET+1:M,N+1:N+NRHS) contains the block of
+*>             the right-hand-side matrix B. Both these blocks have been
+*>             updated by multiplication from the left by Q(K)**H.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] K
+*> \verbatim
+*>          K is INTEGER
+*>          Factorization rank of the matrix A, i.e. the rank of
+*>          the factor R, which is the same as the number of non-zero
+*>          rows of the factor R. 0 <= K <= min(M-IOFFSET,KMAX,N).
+*>
+*>          K also represents the number of non-zero Householder
+*>          vectors.
+*> \endverbatim
+*>
+*> \param[out] MAXC2NRMK
+*> \verbatim
+*>          MAXC2NRMK is DOUBLE PRECISION
+*>          The maximum column 2-norm of the residual matrix,
+*>          when the factorization stopped at rank K. MAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] RELMAXC2NRMK
+*> \verbatim
+*>          RELMAXC2NRMK is DOUBLE PRECISION
+*>          The ratio MAXC2NRMK / MAXC2NRM of the maximum column
+*>          2-norm of the residual matrix (when the factorization
+*>          stopped at rank K) to the maximum column 2-norm of the
+*>          whole original matrix A. RELMAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] JPIV
+*> \verbatim
+*>          JPIV is INTEGER array, dimension (N)
+*>          Column pivot indices, for 1 <= j <= N, column j
+*>          of the matrix A was interchanged with column JPIV(j).
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is COMPLEX*16 array, dimension (min(M-IOFFSET,N))
+*>          The scalar factors of the elementary reflectors.
+*> \endverbatim
+*>
+*> \param[in,out] VN1
+*> \verbatim
+*>          VN1 is DOUBLE PRECISION array, dimension (N)
+*>          The vector with the partial column norms.
+*> \endverbatim
+*>
+*> \param[in,out] VN2
+*> \verbatim
+*>          VN2 is DOUBLE PRECISION array, dimension (N)
+*>          The vector with the exact column norms.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX*16 array, dimension (N+NRHS-1)
+*>          Used in ZLARF subroutine to apply an elementary
+*>          reflector from the left.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          1) INFO = 0: successful exit.
+*>          2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was
+*>             detected and the routine stops the computation.
+*>             The j_1-th column of the matrix A or the j_1-th
+*>             element of array TAU contains the first occurrence
+*>             of NaN in the factorization step K+1 ( when K columns
+*>             have been factorized ).
+*>
+*>             On exit:
+*>             K                  is set to the number of
+*>                                   factorized columns without
+*>                                   exception.
+*>             MAXC2NRMK          is set to NaN.
+*>             RELMAXC2NRMK       is set to NaN.
+*>             TAU(K+1:min(M-IOFFSET,N))
+*>                                is not set and contains undefined
+*>                                   elements. If j_1=K+1, TAU(K+1)
+*>                                   may contain NaN.
+*>          3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN
+*>             was detected, but +Inf (or -Inf) was detected and
+*>             the routine continues the computation until completion.
+*>             The (j_2-N)-th column of the matrix A contains the first
+*>             occurrence of +Inf (or -Inf) in the factorization
+*>             step K+1 ( when K columns have been factorized ).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup laqp2rk
+*
+*> \par References:
+*  ================
+*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996.
+*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain.
+*> X. Sun, Computer Science Dept., Duke University, USA.
+*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA.
+*> A BLAS-3 version of the QR factorization with column pivoting.
+*> LAPACK Working Note 114
+*> \htmlonly
+*> <a href="https://www.netlib.org/lapack/lawnspdf/lawn114.pdf">https://www.netlib.org/lapack/lawnspdf/lawn114.pdf</a>
+*> \endhtmlonly
+*> and in
+*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998.
+*> \htmlonly
+*> <a href="https://doi.org/10.1137/S1064827595296732">https://doi.org/10.1137/S1064827595296732</a>
+*> \endhtmlonly
+*>
+*> [2] A partial column norm updating strategy developed in 2006.
+*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia.
+*> On the failure of rank revealing QR factorization software – a case study.
+*> LAPACK Working Note 176.
+*> \htmlonly
+*> <a href="http://www.netlib.org/lapack/lawnspdf/lawn176.pdf">http://www.netlib.org/lapack/lawnspdf/lawn176.pdf</a>
+*> \endhtmlonly
+*> and in
+*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages.
+*> \htmlonly
+*> <a href="https://doi.org/10.1145/1377612.1377616">https://doi.org/10.1145/1377612.1377616</a>
+*> \endhtmlonly
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*>  November  2023, Igor Kozachenko, James Demmel,
+*>                  EECS Department,
+*>                  University of California, Berkeley, USA.
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE ZLAQP2RK( M, N, NRHS, IOFFSET, KMAX, ABSTOL, RELTOL,
+     $                     KP1, MAXC2NRM, A, LDA, K, MAXC2NRMK,
+     $                     RELMAXC2NRMK, JPIV, TAU, VN1, VN2, WORK,
+     $                     INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO, IOFFSET, KP1, K, KMAX, LDA, M, N, NRHS
+      DOUBLE PRECISION   ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+      INTEGER            JPIV( * )
+      DOUBLE PRECISION   VN1( * ), VN2( * )
+      COMPLEX*16         A( LDA, * ), TAU( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+      COMPLEX*16         CZERO, CONE
+      PARAMETER          ( CZERO = ( 0.0D+0, 0.0D+0 ),
+     $                   CONE = ( 1.0D+0, 0.0D+0 ) )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            I, ITEMP, J, JMAXC2NRM, KK, KP, MINMNFACT,
+     $                   MINMNUPDT
+      DOUBLE PRECISION   HUGEVAL, TAUNAN, TEMP, TEMP2, TOL3Z
+      COMPLEX*16         AIKK
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ZLARF, ZLARFG, ZSWAP
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, DBLE, DCONJG, DIMAG, MAX, MIN, SQRT
+*     ..
+*     .. External Functions ..
+      LOGICAL            DISNAN
+      INTEGER            IDAMAX
+      DOUBLE PRECISION   DLAMCH, DZNRM2
+      EXTERNAL           DISNAN, DLAMCH, IDAMAX, DZNRM2
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize INFO
+*
+      INFO = 0
+*
+*     MINMNFACT in the smallest dimension of the submatrix
+*     A(IOFFSET+1:M,1:N) to be factorized.
+*
+*     MINMNUPDT is the smallest dimension
+*     of the subarray A(IOFFSET+1:M,1:N+NRHS) to be udated, which
+*     contains the submatrices A(IOFFSET+1:M,1:N) and
+*     B(IOFFSET+1:M,1:NRHS) as column blocks.
+*
+      MINMNFACT = MIN( M-IOFFSET, N )
+      MINMNUPDT = MIN( M-IOFFSET, N+NRHS )
+      KMAX = MIN( KMAX, MINMNFACT )
+      TOL3Z = SQRT( DLAMCH( 'Epsilon' ) )
+      HUGEVAL = DLAMCH( 'Overflow' )
+*
+*     Compute the factorization, KK is the lomn loop index.
+*
+      DO KK = 1, KMAX
+*
+         I = IOFFSET + KK
+*
+         IF( I.EQ.1 ) THEN
+*
+*           ============================================================
+*
+*           We are at the first column of the original whole matrix A,
+*           therefore we use the computed KP1 and MAXC2NRM from the
+*           main routine.
+*
+            KP = KP1
+*
+*           ============================================================
+*
+         ELSE
+*
+*           ============================================================
+*
+*           Determine the pivot column in KK-th step, i.e. the index
+*           of the column with the maximum 2-norm in the
+*           submatrix A(I:M,K:N).
+*
+            KP = ( KK-1 ) + IDAMAX( N-KK+1, VN1( KK ), 1 )
+*
+*           Determine the maximum column 2-norm and the relative maximum
+*           column 2-norm of the submatrix A(I:M,KK:N) in step KK.
+*           RELMAXC2NRMK  will be computed later, after somecondition
+*           checks on MAXC2NRMK.
+*
+            MAXC2NRMK = VN1( KP )
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,KK:N) contains NaN, and set
+*           INFO parameter to the column number, where the first NaN
+*           is found and return from the routine.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( DISNAN( MAXC2NRMK ) ) THEN
+*
+*              Set K, the number of factorized columns.
+*              that are not zero.
+*
+                K = KK - 1
+                INFO = K + KP
+*
+*               Set RELMAXC2NRMK to NaN.
+*
+                RELMAXC2NRMK = MAXC2NRMK
+*
+*               Array TAU(K+1:MINMNFACT) is not set and contains
+*               undefined elements.
+*
+               RETURN
+            END IF
+*
+*           ============================================================
+*
+*           Quick return, if the submatrix A(I:M,KK:N) is
+*           a zero matrix.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( MAXC2NRMK.EQ.ZERO ) THEN
+*
+*              Set K, the number of factorized columns.
+*              that are not zero.
+*
+               K = KK - 1
+               RELMAXC2NRMK = ZERO
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO.
+*
+               DO J = KK, MINMNFACT
+                  TAU( J ) = CZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,KK:N) contains Inf,
+*           set INFO parameter to the column number, where
+*           the first Inf is found plus N, and continue
+*           the computation.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN
+               INFO = N + KK - 1 + KP
+            END IF
+*
+*           ============================================================
+*
+*           Test for the second and third stopping criteria.
+*           NOTE: There is no need to test for ABSTOL >= ZERO, since
+*           MAXC2NRMK is non-negative. Similarly, there is no need
+*           to test for RELTOL >= ZERO, since RELMAXC2NRMK is
+*           non-negative.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+
+            RELMAXC2NRMK =  MAXC2NRMK / MAXC2NRM
+*
+            IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN
+*
+*              Set K, the number of factorized columns.
+*
+               K = KK - 1
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KK:MINMNFACT) to CZERO.
+*
+               DO J = KK, MINMNFACT
+                  TAU( J ) = CZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           End ELSE of IF(I.EQ.1)
+*
+         END IF
+*
+*        ===============================================================
+*
+*        If the pivot column is not the first column of the
+*        subblock A(1:M,KK:N):
+*        1) swap the KK-th column and the KP-th pivot column
+*           in A(1:M,1:N);
+*        2) copy the KK-th element into the KP-th element of the partial
+*           and exact 2-norm vectors VN1 and VN2. ( Swap is not needed
+*           for VN1 and VN2 since we use the element with the index
+*           larger than KK in the next loop step.)
+*        3) Save the pivot interchange with the indices relative to the
+*           the original matrix A, not the block A(1:M,1:N).
+*
+         IF( KP.NE.KK ) THEN
+            CALL ZSWAP( M, A( 1, KP ), 1, A( 1, KK ), 1 )
+            VN1( KP ) = VN1( KK )
+            VN2( KP ) = VN2( KK )
+            ITEMP = JPIV( KP )
+            JPIV( KP ) = JPIV( KK )
+            JPIV( KK ) = ITEMP
+         END IF
+*
+*        Generate elementary reflector H(KK) using the column A(I:M,KK),
+*        if the column has more than one element, otherwise
+*        the elementary reflector would be an identity matrix,
+*        and TAU(KK) = CZERO.
+*
+         IF( I.LT.M ) THEN
+            CALL ZLARFG( M-I+1, A( I, KK ), A( I+1, KK ), 1,
+     $                   TAU( KK ) )
+         ELSE
+            TAU( KK ) = CZERO
+         END IF
+*
+*        Check if TAU(KK) contains NaN, set INFO parameter
+*        to the column number where NaN is found and return from
+*        the routine.
+*        NOTE: There is no need to check TAU(KK) for Inf,
+*        since ZLARFG cannot produce TAU(KK) or Householder vector
+*        below the diagonal containing Inf. Only BETA on the diagonal,
+*        returned by ZLARFG can contain Inf, which requires
+*        TAU(KK) to contain NaN. Therefore, this case of generating Inf
+*        by ZLARFG is covered by checking TAU(KK) for NaN.
+*
+         IF( DISNAN( DBLE( TAU(KK) ) ) ) THEN
+            TAUNAN = DBLE( TAU(KK) )
+         ELSE IF( DISNAN( DIMAG( TAU(KK) ) ) ) THEN
+            TAUNAN = DIMAG( TAU(KK) )
+         ELSE
+            TAUNAN = ZERO
+         END IF
+*
+         IF( DISNAN( TAUNAN ) ) THEN
+            K = KK - 1
+            INFO = KK
+*
+*           Set MAXC2NRMK and  RELMAXC2NRMK to NaN.
+*
+            MAXC2NRMK = TAUNAN
+            RELMAXC2NRMK = TAUNAN
+*
+*           Array TAU(KK:MINMNFACT) is not set and contains
+*           undefined elements, except the first element TAU(KK) = NaN.
+*
+            RETURN
+         END IF
+*
+*        Apply H(KK)**H to A(I:M,KK+1:N+NRHS) from the left.
+*        ( If M >= N, then at KK = N there is no residual matrix,
+*         i.e. no columns of A to update, only columns of B.
+*         If M < N, then at KK = M-IOFFSET, I = M and we have a
+*         one-row residual matrix in A and the elementary
+*         reflector is a unit matrix, TAU(KK) = CZERO, i.e. no update
+*         is needed for the residual matrix in A and the
+*         right-hand-side-matrix in B.
+*         Therefore, we update only if
+*         KK < MINMNUPDT = min(M-IOFFSET, N+NRHS)
+*         condition is satisfied, not only KK < N+NRHS )
+*
+         IF( KK.LT.MINMNUPDT ) THEN
+            AIKK = A( I, KK )
+            A( I, KK ) = CONE
+            CALL ZLARF( 'Left', M-I+1, N+NRHS-KK, A( I, KK ), 1,
+     $                  DCONJG( TAU( KK ) ), A( I, KK+1 ), LDA,
+     $                  WORK( 1 ) )
+            A( I, KK ) = AIKK
+         END IF
+*
+         IF( KK.LT.MINMNFACT ) THEN
+*
+*           Update the partial column 2-norms for the residual matrix,
+*           only if the residual matrix A(I+1:M,KK+1:N) exists, i.e.
+*           when KK < min(M-IOFFSET, N).
+*
+            DO J = KK + 1, N
+               IF( VN1( J ).NE.ZERO ) THEN
+*
+*                 NOTE: The following lines follow from the analysis in
+*                 Lapack Working Note 176.
+*
+                  TEMP = ONE - ( ABS( A( I, J ) ) / VN1( J ) )**2
+                  TEMP = MAX( TEMP, ZERO )
+                  TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2
+                  IF( TEMP2 .LE. TOL3Z ) THEN
+*
+*                    Compute the column 2-norm for the partial
+*                    column A(I+1:M,J) by explicitly computing it,
+*                    and store it in both partial 2-norm vector VN1
+*                    and exact column 2-norm vector VN2.
+*
+                     VN1( J ) = DZNRM2( M-I, A( I+1, J ), 1 )
+                     VN2( J ) = VN1( J )
+*
+                  ELSE
+*
+*                    Update the column 2-norm for the partial
+*                    column A(I+1:M,J) by removing one
+*                    element A(I,J) and store it in partial
+*                    2-norm vector VN1.
+*
+                     VN1( J ) = VN1( J )*SQRT( TEMP )
+*
+                  END IF
+               END IF
+            END DO
+*
+         END IF
+*
+*     End factorization loop
+*
+      END DO
+*
+*     If we reached this point, all columns have been factorized,
+*     i.e. no condition was triggered to exit the routine.
+*     Set the number of factorized columns.
+*
+      K = KMAX
+*
+*     We reached the end of the loop, i.e. all KMAX columns were
+*     factorized, we need to set MAXC2NRMK and RELMAXC2NRMK before
+*     we return.
+*
+      IF( K.LT.MINMNFACT ) THEN
+*
+         JMAXC2NRM = K + IDAMAX( N-K, VN1( K+1 ), 1 )
+         MAXC2NRMK = VN1( JMAXC2NRM )
+*
+         IF( K.EQ.0 ) THEN
+            RELMAXC2NRMK = ONE
+         ELSE
+            RELMAXC2NRMK = MAXC2NRMK / MAXC2NRM
+         END IF
+*
+      ELSE
+         MAXC2NRMK = ZERO
+         RELMAXC2NRMK = ZERO
+      END IF
+*
+*     We reached the end of the loop, i.e. all KMAX columns were
+*     factorized, set TAUs corresponding to the columns that were
+*     not factorized to ZERO, i.e. TAU(K+1:MINMNFACT) set to CZERO.
+*
+      DO J = K + 1, MINMNFACT
+         TAU( J ) = CZERO
+      END DO
+*
+      RETURN
+*
+*     End of ZLAQP2RK
+*
+      END
diff --git a/lapack-netlib/SRC/zlaqp3rk.c b/lapack-netlib/SRC/zlaqp3rk.c
new file mode 100644
index 000000000..cb44e4d34
--- /dev/null
+++ b/lapack-netlib/SRC/zlaqp3rk.c
@@ -0,0 +1,1157 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccos(Cf(Z)); }
+#ifdef _MSC_VER /* no complex operators in MSVC C: divide via a/b = a*conj(b)/|b|^2; operands are copied first so c may alias a or b */
+#define c_div(c, a, b) { float _ar=(a)->r, _ai=(a)->i, _br=(b)->r, _bi=(b)->i, _d=_br*_br+_bi*_bi; (c)->r=(_ar*_br+_ai*_bi)/_d; (c)->i=(_ai*_br-_ar*_bi)/_d; }
+#define z_div(c, a, b) { double _ar=(a)->r, _ai=(a)->i, _br=(b)->r, _bi=(b)->i, _d=_br*_br+_bi*_bi; (c)->r=(_ar*_br+_ai*_bi)/_d; (c)->i=(_ai*_br-_ar*_bi)/_d; }
+#else
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),*(B));}
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+/* Raise x to the integer power n by repeated squaring (a negative n inverts x first). */
+static float spow_ui(float x, integer n) {
+	float result=1.0; unsigned long int bits;
+	if(n == 0) return result;
+	if(n < 0) { n = -n; x = 1/x; }
+	/* Scan the exponent bits from least significant; square x only while bits remain. */
+	for(bits = n; ; x *= x) {
+		if(bits & 01) result *= x;
+		if((bits >>= 1) == 0) break;
+	}
+	return result;
+}
+/* Raise x to the integer power n by repeated squaring (a negative n inverts x first). */
+static double dpow_ui(double x, integer n) {
+	double result=1.0; unsigned long int bits;
+	if(n == 0) return result;
+	if(n < 0) { n = -n; x = 1/x; }
+	/* Scan the exponent bits from least significant; square x only while bits remain. */
+	for(bits = n; ; x *= x) {
+		if(bits & 01) result *= x;
+		if((bits >>= 1) == 0) break;
+	}
+	return result;
+}
+/* Complex x**n for an integer exponent n, by repeated squaring (a negative n inverts x first). */
+#ifdef _MSC_VER
+static _Fcomplex cpow_ui(complex x, integer n) {
+	/* MSVC C offers no complex operators, so the product and the reciprocal are formed by hand. */
+	float re=1.0f, im=0.0f, t, d; unsigned long int u;
+	if(n < 0) { /* x := 1/x = conj(x)/|x|^2 */
+		n = -n; d = x.r*x.r + x.i*x.i; x.r = x.r/d; x.i = -x.i/d;
+	}
+	for(u = n; u != 0; u >>= 1) {
+		if(u & 01) { t = re*x.r - im*x.i; im = re*x.i + im*x.r; re = t; }
+		if(u > 1) { t = x.r*x.r - x.i*x.i; x.i = 2*x.r*x.i; x.r = t; }
+	}
+	{ _Fcomplex p={re, im}; return p; }
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) {
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+/* Double-complex x**n for an integer exponent n, by repeated squaring (a negative n inverts x first). */
+#ifdef _MSC_VER
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
+	/* MSVC C offers no complex operators, so the product and the reciprocal are formed by hand. */
+	double xr=x._Val[0], xi=x._Val[1], re=1.0, im=0.0, t, d; unsigned long int u;
+	if(n < 0) { /* x := 1/x = conj(x)/|x|^2 */
+		n = -n; d = xr*xr + xi*xi; xr = xr/d; xi = -xi/d;
+	}
+	for(u = n; u != 0; u >>= 1) {
+		if(u & 01) { t = re*xr - im*xi; im = re*xi + im*xr; re = t; }
+		if(u > 1) { t = xr*xr - xi*xi; xi = 2*xr*xi; xr = t; }
+	}
+	{ _Dcomplex p={re, im}; return p; }
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) {
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+/* Integer power x**n; for n < 0 the result is the integer value of 1/x**|n|. */
+static integer pow_ii(integer x, integer n) {
+	integer result = 1; unsigned long int bits;
+	/* A zero exponent or a base of one always yields one. */
+	if (n == 0 || x == 1) return result;
+	if (n < 0) {
+		/* Any base other than -1 gives 0 here; a zero base still evaluates 1/x, as before. */
+		if (x != -1) return x == 0 ? 1/x : 0;
+		/* Only x == -1 is left: its sign follows the parity of |n|, computed below. */
+		n = -n;
+	}
+	for (bits = n; ; x *= x) {
+		if (bits & 01) result *= x;
+		if ((bits >>= 1) == 0) break;
+	}
+	return result;
+}
+/* Position (1-based, counted from s) of the first largest entry among w[s-1..e-1]; n is not used. */
+static integer dmaxloc_(double *w, integer s, integer e, integer *n) {
+	double top = w[s-1]; integer best = s, idx;
+	for (idx = s + 1; idx <= e; idx++)
+		if (w[idx-1] > top) { best = idx; top = w[idx-1]; }
+	return best - s + 1;
+}
+/* Position (1-based, counted from s) of the first largest entry among w[s-1..e-1]; n is not used. */
+static integer smaxloc_(float *w, integer s, integer e, integer *n) {
+	float top = w[s-1]; integer best = s, idx;
+	for (idx = s + 1; idx <= e; idx++)
+		if (w[idx-1] > top) { best = idx; top = w[idx-1]; }
+	return best - s + 1;
+}
+/* cdotc_: z := sum over i of conjg(x(i))*y(i), the conjugated dot product of two */
+/* single-precision complex vectors of n elements, read at x[i*incx] and y[i*incy]. */
+/* NOTE(review): negative increments would index before x/y - confirm none are passed. */
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	/* MSVC C offers no complex operators, so expand the product by hand: */
+	/* conjg(x)*y = (xr*yr + xi*yi) + i*(xr*yi - xi*yr). */
+	float re = 0.0f, im = 0.0f;
+	for (i=0;i<n;i++) {
+		const complex *xp = &x[i*incx], *yp = &y[i*incy];
+		re += xp->r * yp->r + xp->i * yp->i;
+		im += xp->r * yp->i - xp->i * yp->r;
+	}
+	z->r = re;
+	z->i = im;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+/* zdotc_: z := sum over i of conjg(x(i))*y(i), the conjugated dot product of two */
+/* double-precision complex vectors of n elements, read at x[i*incx] and y[i*incy]. */
+/* NOTE(review): negative increments would index before x/y - confirm none are passed. */
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	/* MSVC C offers no complex operators, so expand the product by hand: */
+	/* conjg(x)*y = (xr*yr + xi*yi) + i*(xr*yi - xi*yr). */
+	double re = 0.0, im = 0.0;
+	for (i=0;i<n;i++) {
+		const doublecomplex *xp = &x[i*incx], *yp = &y[i*incy];
+		re += xp->r * yp->r + xp->i * yp->i;
+		im += xp->r * yp->i - xp->i * yp->r;
+	}
+	z->r = re;
+	z->i = im;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/* cdotu_: z := sum over i of x(i)*y(i), the unconjugated dot product of two */
+/* single-precision complex vectors of n elements, read at x[i*incx] and y[i*incy]. */
+/* NOTE(review): negative increments would index before x/y - confirm none are passed. */
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	/* MSVC C offers no complex operators, so expand the product by hand: */
+	/* x*y = (xr*yr - xi*yi) + i*(xr*yi + xi*yr). */
+	float re = 0.0f, im = 0.0f;
+	for (i=0;i<n;i++) {
+		const complex *xp = &x[i*incx], *yp = &y[i*incy];
+		re += xp->r * yp->r - xp->i * yp->i;
+		im += xp->r * yp->i + xp->i * yp->r;
+	}
+	z->r = re;
+	z->i = im;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i)* y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i)* y(i) */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+/* zdotu_: z := sum over i of x(i)*y(i), the unconjugated dot product of two */
+/* double-precision complex vectors of n elements, read at x[i*incx] and y[i*incy]. */
+/* NOTE(review): negative increments would index before x/y - confirm none are passed. */
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	/* MSVC C offers no complex operators, so expand the product by hand: */
+	/* x*y = (xr*yr - xi*yi) + i*(xr*yi + xi*yr). */
+	double re = 0.0, im = 0.0;
+	for (i=0;i<n;i++) {
+		const doublecomplex *xp = &x[i*incx], *yp = &y[i*incy];
+		re += xp->r * yp->r - xp->i * yp->i;
+		im += xp->r * yp->i + xp->i * yp->r;
+	}
+	z->r = re;
+	z->i = im;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i)* y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i)* y(i) */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* Table of constant values */
+
+static doublecomplex c_b1 = {0.,0.};
+static doublecomplex c_b2 = {1.,0.};
+static integer c__1 = 1;
+
+/* Subroutine */ int zlaqp3rk_(integer *m, integer *n, integer *nrhs, integer 
+	*ioffset, integer *nb, doublereal *abstol, doublereal *reltol, 
+	integer *kp1, doublereal *maxc2nrm, doublecomplex *a, integer *lda, 
+	logical *done, integer *kb, doublereal *maxc2nrmk, doublereal *
+	relmaxc2nrmk, integer *jpiv, doublecomplex *tau, doublereal *vn1, 
+	doublereal *vn2, doublecomplex *auxv, doublecomplex *f, integer *ldf, 
+	integer *iwork, integer *info)
+{
+    /* System generated locals */
+    integer a_dim1, a_offset, f_dim1, f_offset, i__1, i__2, i__3;
+    doublereal d__1, d__2;
+    doublecomplex z__1;
+
+    /* Local variables */
+    doublereal temp, temp2;
+    integer i__, j, k;
+    doublereal tol3z;
+    integer itemp;
+    extern /* Subroutine */ int zgemm_(char *, char *, integer *, integer *, 
+	    integer *, doublecomplex *, doublecomplex *, integer *, 
+	    doublecomplex *, integer *, doublecomplex *, doublecomplex *, 
+	    integer *), zgemv_(char *, integer *, integer *, 
+	    doublecomplex *, doublecomplex *, integer *, doublecomplex *, 
+	    integer *, doublecomplex *, doublecomplex *, integer *);
+    integer minmnfact;
+    extern /* Subroutine */ int zswap_(integer *, doublecomplex *, integer *, 
+	    doublecomplex *, integer *);
+    doublereal myhugeval;
+    integer minmnupdt;
+    extern doublereal dznrm2_(integer *, doublecomplex *, integer *);
+    integer if__;
+    extern doublereal dlamch_(char *);
+    integer kp;
+    extern integer idamax_(integer *, doublereal *, integer *);
+    extern logical disnan_(doublereal *);
+    integer lsticc;
+    extern /* Subroutine */ int zlarfg_(integer *, doublecomplex *, 
+	    doublecomplex *, integer *, doublecomplex *);
+    doublereal taunan;
+    doublecomplex aik;
+
+
+/*  -- LAPACK auxiliary routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+
+/*  ===================================================================== */
+
+
+/*     Initialize INFO */
+
+    /* Parameter adjustments */
+    a_dim1 = *lda;
+    a_offset = 1 + a_dim1 * 1;
+    a -= a_offset;
+    --jpiv;
+    --tau;
+    --vn1;
+    --vn2;
+    --auxv;
+    f_dim1 = *ldf;
+    f_offset = 1 + f_dim1 * 1;
+    f -= f_offset;
+    --iwork;
+
+    /* Function Body */
+    *info = 0;
+
+/*     MINMNFACT is the smallest dimension of the submatrix */
+/*     A(IOFFSET+1:M,1:N) to be factorized. */
+
+/* Computing MIN */
+    i__1 = *m - *ioffset;
+    minmnfact = f2cmin(i__1,*n);
+/* Computing MIN */
+    i__1 = *m - *ioffset, i__2 = *n + *nrhs;
+    minmnupdt = f2cmin(i__1,i__2);
+    *nb = f2cmin(*nb,minmnfact);
+    tol3z = sqrt(dlamch_("Epsilon"));
+    myhugeval = dlamch_("Overflow");
+
+/*     Compute factorization in a while loop over NB columns, */
+/*     K is the column index in the block A(1:M,1:N). */
+
+    k = 0;
+    lsticc = 0;
+    *done = FALSE_;
+
+    while(k < *nb && lsticc == 0) {
+	++k;
+	i__ = *ioffset + k;
+
+	if (i__ == 1) {
+
+/*           We are at the first column of the original whole matrix A_orig, */
+/*           therefore we use the computed KP1 and MAXC2NRM from the */
+/*           main routine. */
+
+	    kp = *kp1;
+
+	} else {
+
+/*           Determine the pivot column in K-th step, i.e. the index */
+/*           of the column with the maximum 2-norm in the */
+/*           submatrix A(I:M,K:N). */
+
+	    i__1 = *n - k + 1;
+	    kp = k - 1 + idamax_(&i__1, &vn1[k], &c__1);
+
+/*           Determine the maximum column 2-norm and the relative maximum */
+/*           column 2-norm of the submatrix A(I:M,K:N) in step K. */
+
+	    *maxc2nrmk = vn1[kp];
+
+/*           ============================================================ */
+
+/*           Check if the submatrix A(I:M,K:N) contains NaN, set */
+/*           INFO parameter to the column number, where the first NaN */
+/*           is found and return from the routine. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (disnan_(maxc2nrmk)) {
+
+		*done = TRUE_;
+
+/*              Set KB, the number of factorized partial columns */
+/*                      that are non-zero in each step in the block, */
+/*                      i.e. the rank of the factor R. */
+/*              Set IF, the number of processed rows in the block, which */
+/*                      is the same as the number of processed rows in */
+/*                      the original whole matrix A_orig. */
+
+		*kb = k - 1;
+		if__ = i__ - 1;
+		*info = *kb + kp;
+
+/*              Set RELMAXC2NRMK to NaN. */
+
+		*relmaxc2nrmk = *maxc2nrmk;
+
+/*              There is no need to apply the block reflector to the */
+/*              residual of the matrix A stored in A(KB+1:M,KB+1:N), */
+/*              since the submatrix contains NaN and we stop */
+/*              the computation. */
+/*              But, we need to apply the block reflector to the residual */
+/*              right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */
+/*              residual right hand sides exist.  This occurs */
+/*              when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */
+
+/*              A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */
+/*                               A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. */
+		if (*nrhs > 0 && *kb < *m - *ioffset) {
+		    i__1 = *m - if__;
+		    z__1.r = -1., z__1.i = 0.;
+		    zgemm_("No transpose", "Conjugate transpose", &i__1, nrhs,
+			     kb, &z__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 
+			    + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * 
+			    a_dim1], lda);
+		}
+
+/*              There is no need to recompute the 2-norm of the */
+/*              difficult columns, since we stop the factorization. */
+
+/*              Array TAU(KF+1:MINMNFACT) is not set and contains */
+/*              undefined elements. */
+
+/*              Return from the routine. */
+
+		return 0;
+	    }
+
+/*           Quick return, if the submatrix A(I:M,K:N) is */
+/*           a zero matrix. We need to check it only if the column index */
+/*           (same as row index) is larger than 1, since the condition */
+/*           for the whole original matrix A_orig is checked in the main */
+/*           routine. */
+
+	    if (*maxc2nrmk == 0.) {
+
+		*done = TRUE_;
+
+/*              Set KB, the number of factorized partial columns */
+/*                      that are non-zero in each step in the block, */
+/*                      i.e. the rank of the factor R. */
+/*              Set IF, the number of processed rows in the block, which */
+/*                      is the same as the number of processed rows in */
+/*                      the original whole matrix A_orig. */
+
+		*kb = k - 1;
+		if__ = i__ - 1;
+		*relmaxc2nrmk = 0.;
+
+/*              There is no need to apply the block reflector to the */
+/*              residual of the matrix A stored in A(KB+1:M,KB+1:N), */
+/*              since the submatrix is zero and we stop the computation. */
+/*              But, we need to apply the block reflector to the residual */
+/*              right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */
+/*              residual right hand sides exist.  This occurs */
+/*              when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */
+
+/*              A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */
+/*                               A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. */
+
+		if (*nrhs > 0 && *kb < *m - *ioffset) {
+		    i__1 = *m - if__;
+		    z__1.r = -1., z__1.i = 0.;
+		    zgemm_("No transpose", "Conjugate transpose", &i__1, nrhs,
+			     kb, &z__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 
+			    + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * 
+			    a_dim1], lda);
+		}
+
+/*              There is no need to recompute the 2-norm of the */
+/*              difficult columns, since we stop the factorization. */
+
+/*              Set TAUs corresponding to the columns that were not */
+/*              factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, */
+/*              which is equivalent to setting TAU(K:MINMNFACT) = CZERO. */
+
+		i__1 = minmnfact;
+		for (j = k; j <= i__1; ++j) {
+		    i__2 = j;
+		    tau[i__2].r = 0., tau[i__2].i = 0.;
+		}
+
+/*              Return from the routine. */
+
+		return 0;
+
+	    }
+
+/*           ============================================================ */
+
+/*           Check if the submatrix A(I:M,K:N) contains Inf, */
+/*           set INFO parameter to the column number, where */
+/*           the first Inf is found plus N, and continue */
+/*           the computation. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    if (*info == 0 && *maxc2nrmk > myhugeval) {
+		*info = *n + k - 1 + kp;
+	    }
+
+/*           ============================================================ */
+
+/*           Test for the second and third tolerance stopping criteria. */
+/*           NOTE: There is no need to test for ABSTOL.GE.ZERO, since */
+/*           MAXC2NRMK is non-negative. Similarly, there is no need */
+/*           to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is */
+/*           non-negative. */
+/*           We need to check the condition only if the */
+/*           column index (same as row index) of the original whole */
+/*           matrix is larger than 1, since the condition for whole */
+/*           original matrix is checked in the main routine. */
+
+	    *relmaxc2nrmk = *maxc2nrmk / *maxc2nrm;
+
+	    if (*maxc2nrmk <= *abstol || *relmaxc2nrmk <= *reltol) {
+
+		*done = TRUE_;
+
+/*              Set KB, the number of factorized partial columns */
+/*                      that are non-zero in each step in the block, */
+/*                      i.e. the rank of the factor R. */
+/*              Set IF, the number of processed rows in the block, which */
+/*                      is the same as the number of processed rows in */
+/*                      the original whole matrix A_orig; */
+
+		*kb = k - 1;
+		if__ = i__ - 1;
+
+/*              Apply the block reflector to the residual of the */
+/*              matrix A and the residual of the right hand sides B, if */
+/*              the residual matrix and/or the residual of the right */
+/*              hand sides exist,  i.e. if the submatrix */
+/*              A(I+1:M,KB+1:N+NRHS) exists.  This occurs when */
+/*                 KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */
+
+/*              A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */
+/*                             A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. */
+
+		if (*kb < minmnupdt) {
+		    i__1 = *m - if__;
+		    i__2 = *n + *nrhs - *kb;
+		    z__1.r = -1., z__1.i = 0.;
+		    zgemm_("No transpose", "Conjugate transpose", &i__1, &
+			    i__2, kb, &z__1, &a[if__ + 1 + a_dim1], lda, &f[*
+			    kb + 1 + f_dim1], ldf, &c_b2, &a[if__ + 1 + (*kb 
+			    + 1) * a_dim1], lda);
+		}
+
+/*              There is no need to recompute the 2-norm of the */
+/*              difficult columns, since we stop the factorization. */
+
+/*              Set TAUs corresponding to the columns that were not */
+/*              factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO, */
+/*              which is equivalent to setting TAU(K:MINMNFACT) = CZERO. */
+
+		i__1 = minmnfact;
+		for (j = k; j <= i__1; ++j) {
+		    i__2 = j;
+		    tau[i__2].r = 0., tau[i__2].i = 0.;
+		}
+
+/*              Return from the routine. */
+
+		return 0;
+
+	    }
+
+/*           ============================================================ */
+
+/*           End ELSE of IF(I.EQ.1) */
+
+	}
+
+/*        =============================================================== */
+
+/*        If the pivot column is not the first column of the */
+/*        subblock A(1:M,K:N): */
+/*        1) swap the K-th column and the KP-th pivot column */
+/*           in A(1:M,1:N); */
+/*        2) swap the K-th row and the KP-th row in F(1:N,1:K-1) */
+/*        3) copy the K-th element into the KP-th element of the partial */
+/*           and exact 2-norm vectors VN1 and VN2. (Swap is not needed */
+/*           for VN1 and VN2 since we use the element with the index */
+/*           larger than K in the next loop step.) */
+/*        4) Save the pivot interchange with the indices relative to the */
+/*           the original matrix A_orig, not the block A(1:M,1:N). */
+
+	if (kp != k) {
+	    zswap_(m, &a[kp * a_dim1 + 1], &c__1, &a[k * a_dim1 + 1], &c__1);
+	    i__1 = k - 1;
+	    zswap_(&i__1, &f[kp + f_dim1], ldf, &f[k + f_dim1], ldf);
+	    vn1[kp] = vn1[k];
+	    vn2[kp] = vn2[k];
+	    itemp = jpiv[kp];
+	    jpiv[kp] = jpiv[k];
+	    jpiv[k] = itemp;
+	}
+
+/*        Apply previous Householder reflectors to column K: */
+/*        A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**H. */
+
+	if (k > 1) {
+	    i__1 = k - 1;
+	    for (j = 1; j <= i__1; ++j) {
+		i__2 = k + j * f_dim1;
+		d_cnjg(&z__1, &f[k + j * f_dim1]);
+		f[i__2].r = z__1.r, f[i__2].i = z__1.i;
+	    }
+	    i__1 = *m - i__ + 1;
+	    i__2 = k - 1;
+	    z__1.r = -1., z__1.i = 0.;
+	    zgemv_("No transpose", &i__1, &i__2, &z__1, &a[i__ + a_dim1], lda,
+		     &f[k + f_dim1], ldf, &c_b2, &a[i__ + k * a_dim1], &c__1);
+	    i__1 = k - 1;
+	    for (j = 1; j <= i__1; ++j) {
+		i__2 = k + j * f_dim1;
+		d_cnjg(&z__1, &f[k + j * f_dim1]);
+		f[i__2].r = z__1.r, f[i__2].i = z__1.i;
+	    }
+	}
+
+/*        Generate elementary reflector H(k) using the column A(I:M,K). */
+
+	if (i__ < *m) {
+	    i__1 = *m - i__ + 1;
+	    zlarfg_(&i__1, &a[i__ + k * a_dim1], &a[i__ + 1 + k * a_dim1], &
+		    c__1, &tau[k]);
+	} else {
+	    i__1 = k;
+	    tau[i__1].r = 0., tau[i__1].i = 0.;
+	}
+
+/*        Check if TAU(K) contains NaN, set INFO parameter */
+/*        to the column number where NaN is found and return from */
+/*        the routine. */
+/*        NOTE: There is no need to check TAU(K) for Inf, */
+/*        since ZLARFG cannot produce TAU(KK) or Householder vector */
+/*        below the diagonal containing Inf. Only BETA on the diagonal, */
+/*        returned by ZLARFG can contain Inf, which requires */
+/*        TAU(K) to contain NaN. Therefore, this case of generating Inf */
+/*        by ZLARFG is covered by checking TAU(K) for NaN. */
+
+	i__1 = k;
+	d__1 = tau[i__1].r;
+	if (disnan_(&d__1)) {
+	    i__1 = k;
+	    taunan = tau[i__1].r;
+	} else /* if(complicated condition) */ {
+	    d__1 = d_imag(&tau[k]);
+	    if (disnan_(&d__1)) {
+		taunan = d_imag(&tau[k]);
+	    } else {
+		taunan = 0.;
+	    }
+	}
+
+	if (disnan_(&taunan)) {
+
+	    *done = TRUE_;
+
+/*           Set KB, the number of factorized partial columns */
+/*                   that are non-zero in each step in the block, */
+/*                   i.e. the rank of the factor R. */
+/*           Set IF, the number of processed rows in the block, which */
+/*                   is the same as the number of processed rows in */
+/*                   the original whole matrix A_orig. */
+
+	    *kb = k - 1;
+	    if__ = i__ - 1;
+	    *info = k;
+
+/*           Set MAXC2NRMK and  RELMAXC2NRMK to NaN. */
+
+	    *maxc2nrmk = taunan;
+	    *relmaxc2nrmk = taunan;
+
+/*           There is no need to apply the block reflector to the */
+/*           residual of the matrix A stored in A(KB+1:M,KB+1:N), */
+/*           since the submatrix contains NaN and we stop */
+/*           the computation. */
+/*           But, we need to apply the block reflector to the residual */
+/*           right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the */
+/*           residual right hand sides exist.  This occurs */
+/*           when ( NRHS != 0 AND KB <= (M-IOFFSET) ): */
+
+/*           A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) - */
+/*                            A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H. */
+
+	    if (*nrhs > 0 && *kb < *m - *ioffset) {
+		i__1 = *m - if__;
+		z__1.r = -1., z__1.i = 0.;
+		zgemm_("No transpose", "Conjugate transpose", &i__1, nrhs, kb,
+			 &z__1, &a[if__ + 1 + a_dim1], lda, &f[*n + 1 + 
+			f_dim1], ldf, &c_b2, &a[if__ + 1 + (*n + 1) * a_dim1],
+			 lda);
+	    }
+
+/*           There is no need to recompute the 2-norm of the */
+/*           difficult columns, since we stop the factorization. */
+
+/*           Array TAU(KF+1:MINMNFACT) is not set and contains */
+/*           undefined elements. */
+
+/*           Return from the routine. */
+
+	    return 0;
+	}
+
+/*        =============================================================== */
+
+	i__1 = i__ + k * a_dim1;
+	aik.r = a[i__1].r, aik.i = a[i__1].i;
+	i__1 = i__ + k * a_dim1;
+	a[i__1].r = 1., a[i__1].i = 0.;
+
+/*        =============================================================== */
+
+/*        Compute the current K-th column of F: */
+/*          1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**H * A(I:M,K). */
+
+	if (k < *n + *nrhs) {
+	    i__1 = *m - i__ + 1;
+	    i__2 = *n + *nrhs - k;
+	    zgemv_("Conjugate transpose", &i__1, &i__2, &tau[k], &a[i__ + (k 
+		    + 1) * a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b1, &
+		    f[k + 1 + k * f_dim1], &c__1);
+	}
+
+/*           2) Zero out elements above and on the diagonal of the */
+/*              column K in matrix F, i.e elements F(1:K,K). */
+
+	i__1 = k;
+	for (j = 1; j <= i__1; ++j) {
+	    i__2 = j + k * f_dim1;
+	    f[i__2].r = 0., f[i__2].i = 0.;
+	}
+
+/*         3) Incremental updating of the K-th column of F: */
+/*        F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**H */
+/*                    * A(I:M,K). */
+
+	if (k > 1) {
+	    i__1 = *m - i__ + 1;
+	    i__2 = k - 1;
+	    i__3 = k;
+	    z__1.r = -tau[i__3].r, z__1.i = -tau[i__3].i;
+	    zgemv_("Conjugate Transpose", &i__1, &i__2, &z__1, &a[i__ + 
+		    a_dim1], lda, &a[i__ + k * a_dim1], &c__1, &c_b1, &auxv[1]
+		    , &c__1);
+
+	    i__1 = *n + *nrhs;
+	    i__2 = k - 1;
+	    zgemv_("No transpose", &i__1, &i__2, &c_b2, &f[f_dim1 + 1], ldf, &
+		    auxv[1], &c__1, &c_b2, &f[k * f_dim1 + 1], &c__1);
+	}
+
+/*        =============================================================== */
+
+/*        Update the current I-th row of A: */
+/*        A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS) */
+/*                         - A(I,1:K)*F(K+1:N+NRHS,1:K)**H. */
+
+	if (k < *n + *nrhs) {
+	    i__1 = *n + *nrhs - k;
+	    z__1.r = -1., z__1.i = 0.;
+	    zgemm_("No transpose", "Conjugate transpose", &c__1, &i__1, &k, &
+		    z__1, &a[i__ + a_dim1], lda, &f[k + 1 + f_dim1], ldf, &
+		    c_b2, &a[i__ + (k + 1) * a_dim1], lda);
+	}
+
+	i__1 = i__ + k * a_dim1;
+	a[i__1].r = aik.r, a[i__1].i = aik.i;
+
+/*        Update the partial column 2-norms for the residual matrix, */
+/*        only if the residual matrix A(I+1:M,K+1:N) exists, i.e. */
+/*        when K < MINMNFACT = f2cmin( M-IOFFSET, N ). */
+
+	if (k < minmnfact) {
+
+	    i__1 = *n;
+	    for (j = k + 1; j <= i__1; ++j) {
+		if (vn1[j] != 0.) {
+
+/*                 NOTE: The following lines follow from the analysis in */
+/*                 Lapack Working Note 176. */
+
+		    temp = z_abs(&a[i__ + j * a_dim1]) / vn1[j];
+/* Computing MAX */
+		    d__1 = 0., d__2 = (temp + 1.) * (1. - temp);
+		    temp = f2cmax(d__1,d__2);
+/* Computing 2nd power */
+		    d__1 = vn1[j] / vn2[j];
+		    temp2 = temp * (d__1 * d__1);
+		    if (temp2 <= tol3z) {
+
+/*                    At J-index, we have a difficult column for the */
+/*                    update of the 2-norm. Save the index of the previous */
+/*                    difficult column in IWORK(J-1). */
+/*                    NOTE: LSTICC > 1, therefore we can use IWORK only */
+/*                    with N-1 elements, where the elements are */
+/*                    shifted by 1 to the left. */
+
+			iwork[j - 1] = lsticc;
+
+/*                    Set the index of the last difficult column LSTICC. */
+
+			lsticc = j;
+
+		    } else {
+			vn1[j] *= sqrt(temp);
+		    }
+		}
+	    }
+
+	}
+
+/*        End of while loop. */
+
+    }
+
+/*     Now, after the loop: */
+/*        Set KB, the number of factorized columns in the block; */
+/*        Set IF, the number of processed rows in the block, which */
+/*                is the same as the number of processed rows in */
+/*                the original whole matrix A_orig, IF = IOFFSET + KB. */
+
+    *kb = k;
+    if__ = i__;
+
+/*     Apply the block reflector to the residual of the matrix A */
+/*     and the residual of the right hand sides B, if the residual */
+/*     matrix and/or the residual of the right hand sides */
+/*     exist,  i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists. */
+/*     This occurs when KB < MINMNUPDT = f2cmin( M-IOFFSET, N+NRHS ): */
+
+/*     A(IF+1:M,K+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) - */
+/*                         A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H. */
+
+    if (*kb < minmnupdt) {
+	i__1 = *m - if__;
+	i__2 = *n + *nrhs - *kb;
+	z__1.r = -1., z__1.i = 0.;
+	zgemm_("No transpose", "Conjugate transpose", &i__1, &i__2, kb, &z__1,
+		 &a[if__ + 1 + a_dim1], lda, &f[*kb + 1 + f_dim1], ldf, &c_b2,
+		 &a[if__ + 1 + (*kb + 1) * a_dim1], lda);
+    }
+
+/*     Recompute the 2-norm of the difficult columns. */
+/*     Loop over the index of the difficult columns from the largest */
+/*     to the smallest index. */
+
+    while(lsticc > 0) {
+
+/*        LSTICC, the index of the last difficult column, is greater */
+/*        than 1. */
+/*        ITEMP is the index of the previous difficult column. */
+
+	itemp = iwork[lsticc - 1];
+
+/*        Compute the 2-norm explicitly for the last difficult column and */
+/*        save it in the partial and exact 2-norm vectors VN1 and VN2. */
+
+/*        NOTE: The computation of VN1( LSTICC ) relies on the fact that */
+/*        DZNRM2 does not fail on vectors with norm below the value of */
+/*        SQRT(DLAMCH('S')) */
+
+	i__1 = *m - if__;
+	vn1[lsticc] = dznrm2_(&i__1, &a[if__ + 1 + lsticc * a_dim1], &c__1);
+	vn2[lsticc] = vn1[lsticc];
+
+/*        Downdate the index of the last difficult column to */
+/*        the index of the previous difficult column. */
+
+	lsticc = itemp;
+
+    }
+
+    return 0;
+
+/*     End of ZLAQP3RK */
+
+} /* zlaqp3rk_ */
+
diff --git a/lapack-netlib/SRC/zlaqp3rk.f b/lapack-netlib/SRC/zlaqp3rk.f
new file mode 100644
index 000000000..0dd8bf8e3
--- /dev/null
+++ b/lapack-netlib/SRC/zlaqp3rk.f
@@ -0,0 +1,947 @@
+*> \brief \b ZLAQP3RK computes a step of truncated QR factorization with column pivoting of a complex m-by-n matrix A using Level 3 BLAS and overwrites a complex m-by-nrhs matrix B with Q**H * B.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZLAQP3RK + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zlaqp3rk.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zlaqp3rk.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zlaqp3rk.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE ZLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL,
+*     $                     RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB,
+*     $                     MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+*     $                     VN1, VN2, AUXV, F, LDF, IWORK, INFO )
+*      IMPLICIT NONE
+*      LOGICAL            DONE
+*      INTEGER            INFO, IOFFSET, KB, KP1, LDA, LDF, M, N,
+*     $                   NB, NRHS
+*      DOUBLE PRECISION   ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+*     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+*      INTEGER            IWORK( * ), JPIV( * )
+*      DOUBLE PRECISION   VN1( * ), VN2( * )
+*      COMPLEX*16         A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * )
+*     ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZLAQP3RK computes a step of truncated QR factorization with column
+*> pivoting of a complex M-by-N matrix A block A(IOFFSET+1:M,1:N)
+*> by using Level 3 BLAS as
+*>
+*>   A * P(KB) = Q(KB) * R(KB).
+*>
+*> The routine tries to factorize NB columns from A starting from
+*> the row IOFFSET+1 and updates the residual matrix with BLAS 3
+*> xGEMM. The number of actually factorized columns is returned
+*> in KB and may be smaller than NB.
+*>
+*> Block A(1:IOFFSET,1:N) is accordingly pivoted, but not factorized.
+*>
+*> The routine also overwrites the right-hand-sides B matrix stored
+*> in A(IOFFSET+1:M,N+1:N+NRHS) with Q(KB)**H * B.
+*>
+*> Cases when the number of factorized columns KB < NB:
+*>
+*> (1) In some cases, due to catastrophic cancellations, it cannot
+*> factorize all NB columns and needs to update the residual matrix.
+*> Hence, the actual number of factorized columns in the block returned
+*> in KB is smaller than NB. The logical DONE is returned as FALSE.
+*> The factorization of the whole original matrix A_orig must proceed
+*> with the next block.
+*>
+*> (2) Whenever the stopping criterion ABSTOL or RELTOL is satisfied,
+*> the factorization of the whole original matrix A_orig is stopped,
+*> the logical DONE is returned as TRUE. The number of factorized
+*> columns which is smaller than NB is returned in KB.
+*>
+*> (3) In case both stopping criteria ABSTOL and RELTOL are not used,
+*> and when the residual matrix is a zero matrix in some factorization
+*> step KB, the factorization of the whole original matrix A_orig is
+*> stopped, the logical DONE is returned as TRUE. The number of
+*> factorized columns which is smaller than NB is returned in KB.
+*>
+*> (4) Whenever NaN is detected in the matrix A or in the array TAU,
+*> the factorization of the whole original matrix A_orig is stopped,
+*> the logical DONE is returned as TRUE. The number of factorized
+*> columns which is smaller than NB is returned in KB. The INFO
+*> parameter is set to the column index of the first NaN occurrence.
+*>
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. N >= 0
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of
+*>          columns of the matrix B. NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in] IOFFSET
+*> \verbatim
+*>          IOFFSET is INTEGER
+*>          The number of rows of the matrix A that must be pivoted
+*>          but not factorized. IOFFSET >= 0.
+*>
+*>          IOFFSET also represents the number of columns of the whole
+*>          original matrix A_orig that have been factorized
+*>          in the previous steps.
+*> \endverbatim
+*>
+*> \param[in] NB
+*> \verbatim
+*>          NB is INTEGER
+*>          Factorization block size, i.e the number of columns
+*>          to factorize in the matrix A. 0 <= NB
+*>
+*>          If NB = 0, then the routine exits immediately.
+*>             This means that the factorization is not performed,
+*>             the matrices A and B and the arrays TAU, JPIV
+*>             are not modified.
+*> \endverbatim
+*>
+*> \param[in] ABSTOL
+*> \verbatim
+*>          ABSTOL is DOUBLE PRECISION, cannot be NaN.
+*>
+*>          The absolute tolerance (stopping threshold) for
+*>          maximum column 2-norm of the residual matrix.
+*>          The algorithm converges (stops the factorization) when
+*>          the maximum column 2-norm of the residual matrix
+*>          is less than or equal to ABSTOL.
+*>
+*>          a) If ABSTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on NB and RELTOL.
+*>                This includes the case ABSTOL = -Inf.
+*>
+*>          b) If 0.0 <= ABSTOL then the input value
+*>                of ABSTOL is used.
+*> \endverbatim
+*>
+*> \param[in] RELTOL
+*> \verbatim
+*>          RELTOL is DOUBLE PRECISION, cannot be NaN.
+*>
+*>          The tolerance (stopping threshold) for the ratio of the
+*>          maximum column 2-norm of the residual matrix to the maximum
+*>          column 2-norm of the original matrix A_orig. The algorithm
+*>          converges (stops the factorization), when this ratio is
+*>          less than or equal to RELTOL.
+*>
+*>          a) If RELTOL < 0.0, then this stopping criterion is not
+*>                used, the routine factorizes columns depending
+*>                on NB and ABSTOL.
+*>                This includes the case RELTOL = -Inf.
+*>
+*>          b) If 0.0 <= RELTOL then the input value of RELTOL
+*>                is used.
+*> \endverbatim
+*>
+*> \param[in] KP1
+*> \verbatim
+*>          KP1 is INTEGER
+*>          The index of the column with the maximum 2-norm in
+*>          the whole original matrix A_orig determined in the
+*>          main routine ZGEQP3RK. 1 <= KP1 <= N_orig.
+*> \endverbatim
+*>
+*> \param[in] MAXC2NRM
+*> \verbatim
+*>          MAXC2NRM is DOUBLE PRECISION
+*>          The maximum column 2-norm of the whole original
+*>          matrix A_orig computed in the main routine ZGEQP3RK.
+*>          MAXC2NRM >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX*16 array, dimension (LDA,N+NRHS)
+*>          On entry:
+*>              the M-by-N matrix A and M-by-NRHS matrix B, as in
+*>
+*>                                  N     NRHS
+*>              array_A   =   M  [ mat_A, mat_B ]
+*>
+*>          On exit:
+*>          1. The elements in block A(IOFFSET+1:M,1:KB) below
+*>             the diagonal together with the array TAU represent
+*>             the unitary matrix Q(KB) as a product of elementary
+*>             reflectors.
+*>          2. The upper triangular block of the matrix A stored
+*>             in A(IOFFSET+1:M,1:KB) is the triangular factor obtained.
+*>          3. The block of the matrix A stored in A(1:IOFFSET,1:N)
+*>             has been accordingly pivoted, but not factorized.
+*>          4. The rest of the array A, block A(IOFFSET+1:M,KB+1:N+NRHS).
+*>             The left part A(IOFFSET+1:M,KB+1:N) of this block
+*>             contains the residual of the matrix A, and,
+*>             if NRHS > 0, the right part of the block
+*>             A(IOFFSET+1:M,N+1:N+NRHS) contains the block of
+*>             the right-hand-side matrix B. Both these blocks have been
+*>             updated by multiplication from the left by Q(KB)**H.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] DONE
+*> \verbatim
+*>          DONE is LOGICAL
+*>          TRUE: a) if the factorization completed before processing
+*>                   all min(M-IOFFSET,NB,N) columns due to ABSTOL
+*>                   or RELTOL criterion,
+*>                b) if the factorization completed before processing
+*>                   all min(M-IOFFSET,NB,N) columns due to the
+*>                   residual matrix being a ZERO matrix.
+*>                c) when NaN was detected in the matrix A
+*>                   or in the array TAU.
+*>          FALSE: otherwise.
+*> \endverbatim
+*>
+*> \param[out] KB
+*> \verbatim
+*>          KB is INTEGER
+*>          Factorization rank of the matrix A, i.e. the rank of
+*>          the factor R, which is the same as the number of non-zero
+*>          rows of the factor R.  0 <= KB <= min(M-IOFFSET,NB,N).
+*>
+*>          KB also represents the number of non-zero Householder
+*>          vectors.
+*> \endverbatim
+*>
+*> \param[out] MAXC2NRMK
+*> \verbatim
+*>          MAXC2NRMK is DOUBLE PRECISION
+*>          The maximum column 2-norm of the residual matrix,
+*>          when the factorization stopped at rank KB. MAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] RELMAXC2NRMK
+*> \verbatim
+*>          RELMAXC2NRMK is DOUBLE PRECISION
+*>          The ratio MAXC2NRMK / MAXC2NRM of the maximum column
+*>          2-norm of the residual matrix (when the factorization
+*>          stopped at rank KB) to the maximum column 2-norm of the
+*>          original matrix A_orig. RELMAXC2NRMK >= 0.
+*> \endverbatim
+*>
+*> \param[out] JPIV
+*> \verbatim
+*>          JPIV is INTEGER array, dimension (N)
+*>          Column pivot indices, for 1 <= j <= N, column j
+*>          of the matrix A was interchanged with column JPIV(j).
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is COMPLEX*16 array, dimension (min(M-IOFFSET,N))
+*>          The scalar factors of the elementary reflectors.
+*> \endverbatim
+*>
+*> \param[in,out] VN1
+*> \verbatim
+*>          VN1 is DOUBLE PRECISION array, dimension (N)
+*>          The vector with the partial column norms.
+*> \endverbatim
+*>
+*> \param[in,out] VN2
+*> \verbatim
+*>          VN2 is DOUBLE PRECISION array, dimension (N)
+*>          The vector with the exact column norms.
+*> \endverbatim
+*>
+*> \param[out] AUXV
+*> \verbatim
+*>          AUXV is COMPLEX*16 array, dimension (NB)
+*>          Auxiliary vector.
+*> \endverbatim
+*>
+*> \param[out] F
+*> \verbatim
+*>          F is COMPLEX*16 array, dimension (LDF,NB)
+*>          Matrix F**H = L*(Y**H)*A.
+*> \endverbatim
+*>
+*> \param[in] LDF
+*> \verbatim
+*>          LDF is INTEGER
+*>          The leading dimension of the array F. LDF >= max(1,N+NRHS).
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (N-1).
+*>          Is a work array. ( IWORK is used to store indices
+*>          of "bad" columns for norm downdating in the residual
+*>          matrix ).
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          1) INFO = 0: successful exit.
+*>          2) If INFO = j_1, where 1 <= j_1 <= N, then NaN was
+*>             detected and the routine stops the computation.
+*>             The j_1-th column of the matrix A or the j_1-th
+*>             element of array TAU contains the first occurrence
+*>             of NaN in the factorization step KB+1 ( when KB columns
+*>             have been factorized ).
+*>
+*>             On exit:
+*>             KB                  is set to the number of
+*>                                    factorized columns without
+*>                                    exception.
+*>             MAXC2NRMK           is set to NaN.
+*>             RELMAXC2NRMK        is set to NaN.
+*>             TAU(KB+1:min(M,N))     is not set and contains undefined
+*>                                    elements. If j_1=KB+1, TAU(KB+1)
+*>                                    may contain NaN.
+*>          3) If INFO = j_2, where N+1 <= j_2 <= 2*N, then no NaN
+*>             was detected, but +Inf (or -Inf) was detected and
+*>             the routine continues the computation until completion.
+*>             The (j_2-N)-th column of the matrix A contains the first
+*>             occurrence of +Inf (or -Inf) in the factorization
+*>             step KB+1 ( when KB columns have been factorized ).
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup laqp3rk
+*
+*> \par References:
+*  ================
+*> [1] A Level 3 BLAS QR factorization algorithm with column pivoting developed in 1996.
+*> G. Quintana-Orti, Depto. de Informatica, Universidad Jaime I, Spain.
+*> X. Sun, Computer Science Dept., Duke University, USA.
+*> C. H. Bischof, Math. and Comp. Sci. Div., Argonne National Lab, USA.
+*> A BLAS-3 version of the QR factorization with column pivoting.
+*> LAPACK Working Note 114
+*> \htmlonly
+*> <a href="https://www.netlib.org/lapack/lawnspdf/lawn114.pdf">https://www.netlib.org/lapack/lawnspdf/lawn114.pdf</a>
+*> \endhtmlonly
+*> and in
+*> SIAM J. Sci. Comput., 19(5):1486-1494, Sept. 1998.
+*> \htmlonly
+*> <a href="https://doi.org/10.1137/S1064827595296732">https://doi.org/10.1137/S1064827595296732</a>
+*> \endhtmlonly
+*>
+*> [2] A partial column norm updating strategy developed in 2006.
+*> Z. Drmac and Z. Bujanovic, Dept. of Math., University of Zagreb, Croatia.
+*> On the failure of rank revealing QR factorization software – a case study.
+*> LAPACK Working Note 176.
+*> \htmlonly
+*> <a href="http://www.netlib.org/lapack/lawnspdf/lawn176.pdf">http://www.netlib.org/lapack/lawnspdf/lawn176.pdf</a>
+*> \endhtmlonly
+*> and in
+*> ACM Trans. Math. Softw. 35, 2, Article 12 (July 2008), 28 pages.
+*> \htmlonly
+*> <a href="https://doi.org/10.1145/1377612.1377616">https://doi.org/10.1145/1377612.1377616</a>
+*> \endhtmlonly
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*>  November  2023, Igor Kozachenko, James Demmel,
+*>                  EECS Department,
+*>                  University of California, Berkeley, USA.
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE ZLAQP3RK( M, N, NRHS, IOFFSET, NB, ABSTOL,
+     $                     RELTOL, KP1, MAXC2NRM, A, LDA, DONE, KB,
+     $                     MAXC2NRMK, RELMAXC2NRMK, JPIV, TAU,
+     $                     VN1, VN2, AUXV, F, LDF, IWORK, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      LOGICAL            DONE
+      INTEGER            INFO, IOFFSET, KB, KP1, LDA, LDF, M, N,
+     $                   NB, NRHS
+      DOUBLE PRECISION   ABSTOL, MAXC2NRM, MAXC2NRMK, RELMAXC2NRMK,
+     $                   RELTOL
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IWORK( * ), JPIV( * )
+      DOUBLE PRECISION   VN1( * ), VN2( * )
+      COMPLEX*16         A( LDA, * ), AUXV( * ), F( LDF, * ), TAU( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+      COMPLEX*16         CZERO, CONE
+      PARAMETER          ( CZERO = ( 0.0D+0, 0.0D+0 ),
+     $                   CONE = ( 1.0D+0, 0.0D+0 ) )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            ITEMP, J, K, MINMNFACT, MINMNUPDT,
+     $                   LSTICC, KP, I, IF
+      DOUBLE PRECISION   HUGEVAL, TAUNAN, TEMP, TEMP2, TOL3Z
+      COMPLEX*16         AIK
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ZGEMM, ZGEMV, ZLARFG, ZSWAP
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, DBLE, DCONJG, DIMAG, MAX, MIN, SQRT
+*     ..
+*     .. External Functions ..
+      LOGICAL            DISNAN
+      INTEGER            IDAMAX
+      DOUBLE PRECISION   DLAMCH, DZNRM2
+      EXTERNAL           DISNAN, DLAMCH, IDAMAX, DZNRM2
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize INFO
+*
+      INFO = 0
+*
+*     MINMNFACT is the smallest dimension of the submatrix
+*     A(IOFFSET+1:M,1:N) to be factorized.
+*
+      MINMNFACT = MIN( M-IOFFSET, N )
+      MINMNUPDT = MIN( M-IOFFSET, N+NRHS )
+      NB = MIN( NB, MINMNFACT )
+      TOL3Z = SQRT( DLAMCH( 'Epsilon' ) )
+      HUGEVAL = DLAMCH( 'Overflow' )
+*
+*     Compute factorization in a while loop over NB columns,
+*     K is the column index in the block A(1:M,1:N).
+*
+      K = 0
+      LSTICC = 0
+      DONE = .FALSE.
+*
+      DO WHILE ( K.LT.NB .AND. LSTICC.EQ.0 )
+         K = K + 1
+         I = IOFFSET + K
+*
+         IF( I.EQ.1 ) THEN
+*
+*           We are at the first column of the original whole matrix A_orig,
+*           therefore we use the computed KP1 and MAXC2NRM from the
+*           main routine.
+*
+            KP = KP1
+*
+         ELSE
+*
+*           Determine the pivot column in K-th step, i.e. the index
+*           of the column with the maximum 2-norm in the
+*           submatrix A(I:M,K:N).
+*
+            KP = ( K-1 ) + IDAMAX( N-K+1, VN1( K ), 1 )
+*
+*           Determine the maximum column 2-norm and the relative maximum
+*           column 2-norm of the submatrix A(I:M,K:N) in step K.
+*
+            MAXC2NRMK = VN1( KP )
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,K:N) contains NaN, set
+*           INFO parameter to the column number, where the first NaN
+*           is found and return from the routine.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( DISNAN( MAXC2NRMK ) ) THEN
+*
+               DONE = .TRUE.
+*
+*              Set KB, the number of factorized partial columns
+*                      that are non-zero in each step in the block,
+*                      i.e. the rank of the factor R.
+*              Set IF, the number of processed rows in the block, which
+*                      is the same as the number of processed rows in
+*                      the original whole matrix A_orig.
+*
+               KB = K - 1
+               IF = I - 1
+               INFO = KB + KP
+*
+*              Set RELMAXC2NRMK to NaN.
+*
+               RELMAXC2NRMK = MAXC2NRMK
+*
+*              There is no need to apply the block reflector to the
+*              residual of the matrix A stored in A(KB+1:M,KB+1:N),
+*              since the submatrix contains NaN and we stop
+*              the computation.
+*              But, we need to apply the block reflector to the residual
+*              right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the
+*              residual right hand sides exist.  This occurs
+*              when ( NRHS != 0 AND KB <= (M-IOFFSET) ):
+*
+*              A(I+1:M,N+1:N+NRHS) := A(I+1:M,N+1:N+NRHS) -
+*                               A(I+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H.
+
+               IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN
+                  CALL ZGEMM( 'No transpose', 'Conjugate transpose',
+     $                  M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA,
+     $                  F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA )
+               END IF
+*
+*              There is no need to recompute the 2-norm of the
+*              difficult columns, since we stop the factorization.
+*
+*              Array TAU(KF+1:MINMNFACT) is not set and contains
+*              undefined elements.
+*
+*              Return from the routine.
+*
+               RETURN
+            END IF
+*
+*           Quick return, if the submatrix A(I:M,K:N) is
+*           a zero matrix. We need to check it only if the column index
+*           (same as row index) is larger than 1, since the condition
+*           for the whole original matrix A_orig is checked in the main
+*           routine.
+*
+            IF( MAXC2NRMK.EQ.ZERO ) THEN
+*
+               DONE = .TRUE.
+*
+*              Set KB, the number of factorized partial columns
+*                      that are non-zero in each step in the block,
+*                      i.e. the rank of the factor R.
+*              Set IF, the number of processed rows in the block, which
+*                      is the same as the number of processed rows in
+*                      the original whole matrix A_orig.
+*
+               KB = K - 1
+               IF = I - 1
+               RELMAXC2NRMK = ZERO
+*
+*              There is no need to apply the block reflector to the
+*              residual of the matrix A stored in A(KB+1:M,KB+1:N),
+*              since the submatrix is zero and we stop the computation.
+*              But, we need to apply the block reflector to the residual
+*              right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the
+*              residual right hand sides exist.  This occurs
+*              when ( NRHS != 0 AND KB < (M-IOFFSET) ):
+*
+*              A(IF+1:M,N+1:N+NRHS) := A(IF+1:M,N+1:N+NRHS) -
+*                               A(IF+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H.
+*
+               IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN
+                  CALL ZGEMM( 'No transpose', 'Conjugate transpose',
+     $                  M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA,
+     $                  F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA )
+               END IF
+*
+*              There is no need to recompute the 2-norm of the
+*              difficult columns, since we stop the factorization.
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO,
+*              which is equivalent to setting TAU(K:MINMNFACT) = CZERO.
+*
+               DO J = K, MINMNFACT
+                  TAU( J ) = CZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           Check if the submatrix A(I:M,K:N) contains Inf,
+*           set INFO parameter to the column number, where
+*           the first Inf is found plus N, and continue
+*           the computation.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            IF( INFO.EQ.0 .AND. MAXC2NRMK.GT.HUGEVAL ) THEN
+               INFO = N + K - 1 + KP
+            END IF
+*
+*           ============================================================
+*
+*           Test for the second and third tolerance stopping criteria.
+*           NOTE: There is no need to test for ABSTOL.GE.ZERO, since
+*           MAXC2NRMK is non-negative. Similarly, there is no need
+*           to test for RELTOL.GE.ZERO, since RELMAXC2NRMK is
+*           non-negative.
+*           We need to check the condition only if the
+*           column index (same as row index) of the original whole
+*           matrix is larger than 1, since the condition for whole
+*           original matrix is checked in the main routine.
+*
+            RELMAXC2NRMK =  MAXC2NRMK / MAXC2NRM
+*
+            IF( MAXC2NRMK.LE.ABSTOL .OR. RELMAXC2NRMK.LE.RELTOL ) THEN
+*
+               DONE = .TRUE.
+*
+*              Set KB, the number of factorized partial columns
+*                      that are non-zero in each step in the block,
+*                      i.e. the rank of the factor R.
+*              Set IF, the number of processed rows in the block, which
+*                      is the same as the number of processed rows in
+*                      the original whole matrix A_orig;
+*
+                  KB = K - 1
+                  IF = I - 1
+*
+*              Apply the block reflector to the residual of the
+*              matrix A and the residual of the right hand sides B, if
+*              the residual matrix and/or the residual of the right
+*              hand sides exist,  i.e. if the submatrix
+*              A(I+1:M,KB+1:N+NRHS) exists.  This occurs when
+*                 KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ):
+*
+*              A(IF+1:M,KB+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) -
+*                             A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H.
+*
+               IF( KB.LT.MINMNUPDT ) THEN
+                  CALL ZGEMM( 'No transpose', 'Conjugate transpose',
+     $                  M-IF, N+NRHS-KB, KB,-CONE, A( IF+1, 1 ), LDA,
+     $                  F( KB+1, 1 ), LDF, CONE, A( IF+1, KB+1 ), LDA )
+               END IF
+*
+*              There is no need to recompute the 2-norm of the
+*              difficult columns, since we stop the factorization.
+*
+*              Set TAUs corresponding to the columns that were not
+*              factorized to ZERO, i.e. set TAU(KB+1:MINMNFACT) = CZERO,
+*              which is equivalent to setting TAU(K:MINMNFACT) = CZERO.
+*
+               DO J = K, MINMNFACT
+                  TAU( J ) = CZERO
+               END DO
+*
+*              Return from the routine.
+*
+               RETURN
+*
+            END IF
+*
+*           ============================================================
+*
+*           End ELSE of IF(I.EQ.1)
+*
+         END IF
+*
+*        ===============================================================
+*
+*        If the pivot column is not the first column of the
+*        subblock A(1:M,K:N):
+*        1) swap the K-th column and the KP-th pivot column
+*           in A(1:M,1:N);
+*        2) swap the K-th row and the KP-th row in F(1:N,1:K-1)
+*        3) copy the K-th element into the KP-th element of the partial
+*           and exact 2-norm vectors VN1 and VN2. (Swap is not needed
+*           for VN1 and VN2 since we use the element with the index
+*           larger than K in the next loop step.)
+*        4) Save the pivot interchange with the indices relative to the
+*           original matrix A_orig, not the block A(1:M,1:N).
+*
+         IF( KP.NE.K ) THEN
+            CALL ZSWAP( M, A( 1, KP ), 1, A( 1, K ), 1 )
+            CALL ZSWAP( K-1, F( KP, 1 ), LDF, F( K, 1 ), LDF )
+            VN1( KP ) = VN1( K )
+            VN2( KP ) = VN2( K )
+            ITEMP = JPIV( KP )
+            JPIV( KP ) = JPIV( K )
+            JPIV( K ) = ITEMP
+         END IF
+*
+*        Apply previous Householder reflectors to column K:
+*        A(I:M,K) := A(I:M,K) - A(I:M,1:K-1)*F(K,1:K-1)**H.
+*
+         IF( K.GT.1 ) THEN
+            DO J = 1, K - 1
+               F( K, J ) = DCONJG( F( K, J ) )
+            END DO
+            CALL ZGEMV( 'No transpose', M-I+1, K-1, -CONE, A( I, 1 ),
+     $                  LDA, F( K, 1 ), LDF, CONE, A( I, K ), 1 )
+            DO J = 1, K - 1
+               F( K, J ) = DCONJG( F( K, J ) )
+            END DO
+         END IF
+*
+*        Generate elementary reflector H(k) using the column A(I:M,K).
+*
+         IF( I.LT.M ) THEN
+            CALL ZLARFG( M-I+1, A( I, K ), A( I+1, K ), 1, TAU( K ) )
+         ELSE
+            TAU( K ) = CZERO
+         END IF
+*
+*        Check if TAU(K) contains NaN, set INFO parameter
+*        to the column number where NaN is found and return from
+*        the routine.
+*        NOTE: There is no need to check TAU(K) for Inf,
+*        since ZLARFG cannot produce TAU(K) or Householder vector
+*        below the diagonal containing Inf. Only BETA on the diagonal,
+*        returned by ZLARFG can contain Inf, which requires
+*        TAU(K) to contain NaN. Therefore, this case of generating Inf
+*        by ZLARFG is covered by checking TAU(K) for NaN.
+*
+         IF( DISNAN( DBLE( TAU(K) ) ) ) THEN
+            TAUNAN = DBLE( TAU(K) )
+         ELSE IF( DISNAN( DIMAG( TAU(K) ) ) ) THEN
+            TAUNAN = DIMAG( TAU(K) )
+         ELSE
+            TAUNAN = ZERO
+         END IF
+*
+         IF( DISNAN( TAUNAN ) ) THEN
+*
+            DONE = .TRUE.
+*
+*           Set KB, the number of factorized partial columns
+*                   that are non-zero in each step in the block,
+*                   i.e. the rank of the factor R.
+*           Set IF, the number of processed rows in the block, which
+*                   is the same as the number of processed rows in
+*                   the original whole matrix A_orig.
+*
+            KB = K - 1
+            IF = I - 1
+            INFO = K
+*
+*           Set MAXC2NRMK and  RELMAXC2NRMK to NaN.
+*
+            MAXC2NRMK = TAUNAN
+            RELMAXC2NRMK = TAUNAN
+*
+*           There is no need to apply the block reflector to the
+*           residual of the matrix A stored in A(KB+1:M,KB+1:N),
+*           since the submatrix contains NaN and we stop
+*           the computation.
+*           But, we need to apply the block reflector to the residual
+*           right hand sides stored in A(KB+1:M,N+1:N+NRHS), if the
+*           residual right hand sides exist.  This occurs
+*           when ( NRHS != 0 AND KB < (M-IOFFSET) ):
+*
+*           A(IF+1:M,N+1:N+NRHS) := A(IF+1:M,N+1:N+NRHS) -
+*                            A(IF+1:M,1:KB) * F(N+1:N+NRHS,1:KB)**H.
+*
+            IF( NRHS.GT.0 .AND. KB.LT.(M-IOFFSET) ) THEN
+               CALL ZGEMM( 'No transpose', 'Conjugate transpose',
+     $               M-IF, NRHS, KB, -CONE, A( IF+1, 1 ), LDA,
+     $               F( N+1, 1 ), LDF, CONE, A( IF+1, N+1 ), LDA )
+            END IF
+*
+*           There is no need to recompute the 2-norm of the
+*           difficult columns, since we stop the factorization.
+*
+*           Array TAU(KF+1:MINMNFACT) is not set and contains
+*           undefined elements.
+*
+*           Return from the routine.
+*
+            RETURN
+         END IF
+*
+*        ===============================================================
+*
+         AIK = A( I, K )
+         A( I, K ) = CONE
+*
+*        ===============================================================
+*
+*        Compute the current K-th column of F:
+*          1) F(K+1:N,K) := tau(K) * A(I:M,K+1:N)**H * A(I:M,K).
+*
+         IF( K.LT.N+NRHS ) THEN
+            CALL ZGEMV( 'Conjugate transpose', M-I+1, N+NRHS-K,
+     $                  TAU( K ), A( I, K+1 ), LDA, A( I, K ), 1,
+     $                  CZERO, F( K+1, K ), 1 )
+         END IF
+*
+*           2) Zero out elements above and on the diagonal of the
+*              column K in matrix F, i.e elements F(1:K,K).
+*
+         DO J = 1, K
+            F( J, K ) = CZERO
+         END DO
+*
+*         3) Incremental updating of the K-th column of F:
+*        F(1:N,K) := F(1:N,K) - tau(K) * F(1:N,1:K-1) * A(I:M,1:K-1)**H
+*                    * A(I:M,K).
+*
+         IF( K.GT.1 ) THEN
+            CALL ZGEMV( 'Conjugate Transpose', M-I+1, K-1, -TAU( K ),
+     $                   A( I, 1 ), LDA, A( I, K ), 1, CZERO,
+     $                   AUXV( 1 ), 1 )
+*
+            CALL ZGEMV( 'No transpose', N+NRHS, K-1, CONE,
+     $                  F( 1, 1 ), LDF, AUXV( 1 ), 1, CONE,
+     $                  F( 1, K ), 1 )
+         END IF
+*
+*        ===============================================================
+*
+*        Update the current I-th row of A:
+*        A(I,K+1:N+NRHS) := A(I,K+1:N+NRHS)
+*                         - A(I,1:K)*F(K+1:N+NRHS,1:K)**H.
+*
+         IF( K.LT.N+NRHS ) THEN
+            CALL ZGEMM( 'No transpose', 'Conjugate transpose',
+     $            1, N+NRHS-K, K, -CONE, A( I, 1 ), LDA,
+     $            F( K+1, 1 ), LDF, CONE, A( I, K+1 ), LDA )
+         END IF
+*
+         A( I, K ) = AIK
+*
+*        Update the partial column 2-norms for the residual matrix,
+*        only if the residual matrix A(I+1:M,K+1:N) exists, i.e.
+*        when K < MINMNFACT = min( M-IOFFSET, N ).
+*
+         IF( K.LT.MINMNFACT ) THEN
+*
+            DO J = K + 1, N
+               IF( VN1( J ).NE.ZERO ) THEN
+*
+*                 NOTE: The following lines follow from the analysis in
+*                 Lapack Working Note 176.
+*
+                  TEMP = ABS( A( I, J ) ) / VN1( J )
+                  TEMP = MAX( ZERO, ( ONE+TEMP )*( ONE-TEMP ) )
+                  TEMP2 = TEMP*( VN1( J ) / VN2( J ) )**2
+                  IF( TEMP2.LE.TOL3Z ) THEN
+*
+*                    At J-index, we have a difficult column for the
+*                    update of the 2-norm. Save the index of the previous
+*                    difficult column in IWORK(J-1).
+*                    NOTE: LSTICC > 1, therefore we can use IWORK only
+*                    with N-1 elements, where the elements are
+*                    shifted by 1 to the left.
+*
+                     IWORK( J-1 ) = LSTICC
+*
+*                    Set the index of the last difficult column LSTICC.
+*
+                     LSTICC = J
+*
+                  ELSE
+                     VN1( J ) = VN1( J )*SQRT( TEMP )
+                  END IF
+               END IF
+            END DO
+*
+         END IF
+*
+*        End of while loop.
+*
+      END DO
+*
+*     Now, after the loop:
+*        Set KB, the number of factorized columns in the block;
+*        Set IF, the number of processed rows in the block, which
+*                is the same as the number of processed rows in
+*                the original whole matrix A_orig, IF = IOFFSET + KB.
+*
+      KB = K
+      IF = I
+*
+*     Apply the block reflector to the residual of the matrix A
+*     and the residual of the right hand sides B, if the residual
+*     matrix and/or the residual of the right hand sides
+*     exist,  i.e. if the submatrix A(I+1:M,KB+1:N+NRHS) exists.
+*     This occurs when KB < MINMNUPDT = min( M-IOFFSET, N+NRHS ):
+*
+*     A(IF+1:M,KB+1:N+NRHS) := A(IF+1:M,KB+1:N+NRHS) -
+*                         A(IF+1:M,1:KB) * F(KB+1:N+NRHS,1:KB)**H.
+*
+      IF( KB.LT.MINMNUPDT ) THEN
+         CALL ZGEMM( 'No transpose', 'Conjugate transpose',
+     $         M-IF, N+NRHS-KB, KB, -CONE, A( IF+1, 1 ), LDA,
+     $         F( KB+1, 1 ), LDF, CONE, A( IF+1, KB+1 ), LDA )
+      END IF
+*
+*     Recompute the 2-norm of the difficult columns.
+*     Loop over the index of the difficult columns from the largest
+*     to the smallest index.
+*
+      DO WHILE( LSTICC.GT.0 )
+*
+*        LSTICC, the index of the last difficult column, is greater
+*        than 1.
+*        ITEMP is the index of the previous difficult column.
+*
+         ITEMP = IWORK( LSTICC-1 )
+*
+*        Compute the 2-norm explicitly for the last difficult column and
+*        save it in the partial and exact 2-norm vectors VN1 and VN2.
+*
+*        NOTE: The computation of VN1( LSTICC ) relies on the fact that
+*        DZNRM2 does not fail on vectors with norm below the value of
+*        SQRT(DLAMCH('S'))
+*
+         VN1( LSTICC ) = DZNRM2( M-IF, A( IF+1, LSTICC ), 1 )
+         VN2( LSTICC ) = VN1( LSTICC )
+*
+*        Downdate the index of the last difficult column to
+*        the index of the previous difficult column.
+*
+         LSTICC = ITEMP
+*
+      END DO
+*
+      RETURN
+*
+*     End of ZLAQP3RK
+*
+      END
diff --git a/lapack-netlib/SRC/zlaqz0.f b/lapack-netlib/SRC/zlaqz0.f
index 3e20200ed..dcb28850a 100644
--- a/lapack-netlib/SRC/zlaqz0.f
+++ b/lapack-netlib/SRC/zlaqz0.f
@@ -89,7 +89,7 @@
 *>      Anal., 29(2006), pp. 199--227.
 *>
 *> Ref: T. Steel, D. Camps, K. Meerbergen, R. Vandebril "A multishift,
-*>      multipole rational QZ method with agressive early deflation"
+*>      multipole rational QZ method with aggressive early deflation"
 *> \endverbatim
 *
 *  Arguments:
@@ -312,7 +312,7 @@
       CHARACTER :: JBCMPZ*3
 
 *     External Functions
-      EXTERNAL :: XERBLA, ZHGEQZ, ZLAQZ2, ZLAQZ3, ZLASET, DLABAD,
+      EXTERNAL :: XERBLA, ZHGEQZ, ZLAQZ2, ZLAQZ3, ZLASET,
      $            ZLARTG, ZROT
       DOUBLE PRECISION, EXTERNAL :: DLAMCH, ZLANHS
       LOGICAL, EXTERNAL :: LSAME
@@ -464,7 +464,6 @@
 *     Get machine constants
       SAFMIN = DLAMCH( 'SAFE MINIMUM' )
       SAFMAX = ONE/SAFMIN
-      CALL DLABAD( SAFMIN, SAFMAX )
       ULP = DLAMCH( 'PRECISION' )
       SMLNUM = SAFMIN*( DBLE( N )/ULP )
 
@@ -535,7 +534,7 @@
          DO WHILE ( K.GE.ISTART2 )
 
             IF( ABS( B( K, K ) ) .LT. BTOL ) THEN
-*              A diagonal element of B is negligable, move it
+*              A diagonal element of B is negligible, move it
 *              to the top and deflate it
                
                DO K2 = K, ISTART2+1, -1
diff --git a/lapack-netlib/SRC/zlarfb_gett.f b/lapack-netlib/SRC/zlarfb_gett.f
index 4a3c4dcf1..17d4b33aa 100644
--- a/lapack-netlib/SRC/zlarfb_gett.f
+++ b/lapack-netlib/SRC/zlarfb_gett.f
@@ -452,7 +452,7 @@
          IF( LNOTIDENT ) THEN
 *
 *           col2_(2) Compute W2: = (V1**H) * W2 = (A1**H) * W2,
-*           V1 is not an identy matrix, but unit lower-triangular
+*           V1 is not an identity matrix, but unit lower-triangular
 *           V1 stored in A1 (diagonal ones are not stored).
 *
 *
diff --git a/lapack-netlib/SRC/zlarfgp.f b/lapack-netlib/SRC/zlarfgp.f
index 77eba8e86..d54f2ea5d 100644
--- a/lapack-netlib/SRC/zlarfgp.f
+++ b/lapack-netlib/SRC/zlarfgp.f
@@ -97,7 +97,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16OTHERauxiliary
+*> \ingroup larfgp
 *
 *  =====================================================================
       SUBROUTINE ZLARFGP( N, ALPHA, X, INCX, TAU )
@@ -122,7 +122,7 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            J, KNT
-      DOUBLE PRECISION   ALPHI, ALPHR, BETA, BIGNUM, SMLNUM, XNORM
+      DOUBLE PRECISION   ALPHI, ALPHR, BETA, BIGNUM, EPS, SMLNUM, XNORM
       COMPLEX*16         SAVEALPHA
 *     ..
 *     .. External Functions ..
@@ -143,37 +143,28 @@
          RETURN
       END IF
 *
+      EPS = DLAMCH( 'Precision' )
       XNORM = DZNRM2( N-1, X, INCX )
       ALPHR = DBLE( ALPHA )
       ALPHI = DIMAG( ALPHA )
 *
-      IF( XNORM.EQ.ZERO ) THEN
+      IF( XNORM.LE.EPS*ABS(ALPHA) .AND. ALPHI.EQ.ZERO ) THEN
 *
 *        H  =  [1-alpha/abs(alpha) 0; 0 I], sign chosen so ALPHA >= 0.
 *
-         IF( ALPHI.EQ.ZERO ) THEN
-            IF( ALPHR.GE.ZERO ) THEN
-*              When TAU.eq.ZERO, the vector is special-cased to be
-*              all zeros in the application routines.  We do not need
-*              to clear it.
-               TAU = ZERO
-            ELSE
-*              However, the application routines rely on explicit
-*              zero checks when TAU.ne.ZERO, and we must clear X.
-               TAU = TWO
-               DO J = 1, N-1
-                  X( 1 + (J-1)*INCX ) = ZERO
-               END DO
-               ALPHA = -ALPHA
-            END IF
+         IF( ALPHR.GE.ZERO ) THEN
+*           When TAU.eq.ZERO, the vector is special-cased to be
+*           all zeros in the application routines.  We do not need
+*           to clear it.
+            TAU = ZERO
          ELSE
-*           Only "reflecting" the diagonal entry to be real and non-negative.
-            XNORM = DLAPY2( ALPHR, ALPHI )
-            TAU = DCMPLX( ONE - ALPHR / XNORM, -ALPHI / XNORM )
+*           However, the application routines rely on explicit
+*           zero checks when TAU.ne.ZERO, and we must clear X.
+            TAU = TWO
             DO J = 1, N-1
                X( 1 + (J-1)*INCX ) = ZERO
             END DO
-            ALPHA = XNORM
+            ALPHA = -ALPHA
          END IF
       ELSE
 *
diff --git a/lapack-netlib/SRC/zlassq.f90 b/lapack-netlib/SRC/zlassq.f90
index 9346dacac..c35214766 100644
--- a/lapack-netlib/SRC/zlassq.f90
+++ b/lapack-netlib/SRC/zlassq.f90
@@ -34,28 +34,15 @@
 !>
 !> \verbatim
 !>
-!> ZLASSQ  returns the values  scl  and  smsq  such that
+!> ZLASSQ returns the values scale_out and sumsq_out such that
 !>
-!>    ( scl**2 )*smsq = x( 1 )**2 +...+ x( n )**2 + ( scale**2 )*sumsq,
+!>    (scale_out**2)*sumsq_out = x( 1 )**2 +...+ x( n )**2 + (scale**2)*sumsq,
 !>
-!> where  x( i ) = X( 1 + ( i - 1 )*INCX ). The value of  sumsq  is
+!> where x( i ) = X( 1 + ( i - 1 )*INCX ). The value of sumsq is
 !> assumed to be non-negative.
 !>
 !> scale and sumsq must be supplied in SCALE and SUMSQ and
-!> scl and smsq are overwritten on SCALE and SUMSQ respectively.
-!>
-!> If scale * sqrt( sumsq ) > tbig then
-!>    we require:   scale >= sqrt( TINY*EPS ) / sbig   on entry,
-!> and if 0 < scale * sqrt( sumsq ) < tsml then
-!>    we require:   scale <= sqrt( HUGE ) / ssml       on entry,
-!> where
-!>    tbig -- upper threshold for values whose square is representable;
-!>    sbig -- scaling constant for big numbers; \see la_constants.f90
-!>    tsml -- lower threshold for values whose square is representable;
-!>    ssml -- scaling constant for small numbers; \see la_constants.f90
-!> and
-!>    TINY*EPS -- tiniest representable number;
-!>    HUGE     -- biggest representable number.
+!> scale_out and sumsq_out are overwritten on SCALE and SUMSQ respectively.
 !>
 !> \endverbatim
 !
@@ -72,7 +59,7 @@
 !> \verbatim
 !>          X is DOUBLE COMPLEX array, dimension (1+(N-1)*abs(INCX))
 !>          The vector for which a scaled sum of squares is computed.
-!>             x( i )  = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n.
+!>             x( i ) = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n.
 !> \endverbatim
 !>
 !> \param[in] INCX
@@ -82,24 +69,24 @@
 !>          If INCX > 0, X(1+(i-1)*INCX) = x(i) for 1 <= i <= n
 !>          If INCX < 0, X(1-(n-i)*INCX) = x(i) for 1 <= i <= n
 !>          If INCX = 0, x isn't a vector so there is no need to call
-!>          this subroutine.  If you call it anyway, it will count x(1)
+!>          this subroutine. If you call it anyway, it will count x(1)
 !>          in the vector norm N times.
 !> \endverbatim
 !>
 !> \param[in,out] SCALE
 !> \verbatim
 !>          SCALE is DOUBLE PRECISION
-!>          On entry, the value  scale  in the equation above.
-!>          On exit, SCALE is overwritten with  scl , the scaling factor
+!>          On entry, the value scale in the equation above.
+!>          On exit, SCALE is overwritten by scale_out, the scaling factor
 !>          for the sum of squares.
 !> \endverbatim
 !>
 !> \param[in,out] SUMSQ
 !> \verbatim
 !>          SUMSQ is DOUBLE PRECISION
-!>          On entry, the value  sumsq  in the equation above.
-!>          On exit, SUMSQ is overwritten with  smsq , the basic sum of
-!>          squares from which  scl  has been factored out.
+!>          On entry, the value sumsq in the equation above.
+!>          On exit, SUMSQ is overwritten by sumsq_out, the basic sum of
+!>          squares from which scale_out has been factored out.
 !> \endverbatim
 !
 !  Authors:
@@ -130,10 +117,10 @@
 !>
 !> \endverbatim
 !
-!> \ingroup OTHERauxiliary
+!> \ingroup lassq
 !
 !  =====================================================================
-subroutine ZLASSQ( n, x, incx, scl, sumsq )
+subroutine ZLASSQ( n, x, incx, scale, sumsq )
    use LA_CONSTANTS, &
       only: wp=>dp, zero=>dzero, one=>done, &
             sbig=>dsbig, ssml=>dssml, tbig=>dtbig, tsml=>dtsml
@@ -145,7 +132,7 @@ subroutine ZLASSQ( n, x, incx, scl, sumsq )
 !
 !  .. Scalar Arguments ..
    integer :: incx, n
-   real(wp) :: scl, sumsq
+   real(wp) :: scale, sumsq
 !  ..
 !  .. Array Arguments ..
    complex(wp) :: x(*)
@@ -158,10 +145,10 @@ subroutine ZLASSQ( n, x, incx, scl, sumsq )
 !
 !  Quick return if possible
 !
-   if( LA_ISNAN(scl) .or. LA_ISNAN(sumsq) ) return
-   if( sumsq == zero ) scl = one
-   if( scl == zero ) then
-      scl = one
+   if( LA_ISNAN(scale) .or. LA_ISNAN(sumsq) ) return
+   if( sumsq == zero ) scale = one
+   if( scale == zero ) then
+      scale = one
       sumsq = zero
    end if
    if (n <= 0) then
@@ -207,15 +194,27 @@ subroutine ZLASSQ( n, x, incx, scl, sumsq )
 !  Put the existing sum of squares into one of the accumulators
 !
    if( sumsq > zero ) then
-      ax = scl*sqrt( sumsq )
+      ax = scale*sqrt( sumsq )
       if (ax > tbig) then
-!        We assume scl >= sqrt( TINY*EPS ) / sbig
-         abig = abig + (scl*sbig)**2 * sumsq
+         if (scale > one) then
+            scale = scale * sbig
+            abig = abig + scale * (scale * sumsq)
+         else
+            ! sumsq > tbig^2 => (sbig * (sbig * sumsq)) is representable
+            abig = abig + scale * (scale * (sbig * (sbig * sumsq)))
+         end if
       else if (ax < tsml) then
-!        We assume scl <= sqrt( HUGE ) / ssml
-         if (notbig) asml = asml + (scl*ssml)**2 * sumsq
+         if (notbig) then
+            if (scale < one) then
+               scale = scale * ssml
+               asml = asml + scale * (scale * sumsq)
+            else
+               ! sumsq < tsml^2 => (ssml * (ssml * sumsq)) is representable
+               asml = asml + scale * (scale * (ssml * (ssml * sumsq)))
+            end if
+         end if
       else
-         amed = amed + scl**2 * sumsq
+         amed = amed + scale * (scale * sumsq)
       end if
    end if
 !
@@ -229,7 +228,7 @@ subroutine ZLASSQ( n, x, incx, scl, sumsq )
       if (amed > zero .or. LA_ISNAN(amed)) then
          abig = abig + (amed*sbig)*sbig
       end if
-      scl = one / sbig
+      scale = one / sbig
       sumsq = abig
    else if (asml > zero) then
 !
@@ -245,17 +244,17 @@ subroutine ZLASSQ( n, x, incx, scl, sumsq )
             ymin = asml
             ymax = amed
          end if
-         scl = one
+         scale = one
          sumsq = ymax**2*( one + (ymin/ymax)**2 )
       else
-         scl = one / ssml
+         scale = one / ssml
          sumsq = asml
       end if
    else
 !
 !     Otherwise all values are mid-range or zero
 !
-      scl = one
+      scale = one
       sumsq = amed
    end if
    return
diff --git a/lapack-netlib/SRC/zlaswlq.f b/lapack-netlib/SRC/zlaswlq.f
index e4e703343..735207132 100644
--- a/lapack-netlib/SRC/zlaswlq.f
+++ b/lapack-netlib/SRC/zlaswlq.f
@@ -96,21 +96,23 @@
 *>          The leading dimension of the array T.  LDT >= MB.
 *> \endverbatim
 *>
-*>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK))
-*>
+*>          (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
 *> \param[in] LWORK
 *> \verbatim
-*>          The dimension of the array WORK.  LWORK >= MB*M.
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= MB*M, otherwise.
+*>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>          only calculates the optimal size of the WORK array, returns
+*>          only calculates the minimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
-*>
 *> \endverbatim
+*>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
@@ -158,33 +160,37 @@
 *>     SIAM J. Sci. Comput, vol. 34, no. 1, 2012
 *> \endverbatim
 *>
+*> \ingroup laswlq
+*>
 *  =====================================================================
       SUBROUTINE ZLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK,
-     $                  INFO)
+     $                    INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
 *
 *     .. Scalar Arguments ..
-      INTEGER           INFO, LDA, M, N, MB, NB, LWORK, LDT
+      INTEGER            INFO, LDA, M, N, MB, NB, LWORK, LDT
 *     ..
 *     .. Array Arguments ..
-      COMPLEX*16        A( LDA, * ), WORK( * ), T( LDT, *)
+      COMPLEX*16         A( LDA, * ), WORK( * ), T( LDT, * )
 *     ..
 *
 *  =====================================================================
 *
 *     ..
 *     .. Local Scalars ..
-      LOGICAL    LQUERY
-      INTEGER    I, II, KK, CTR
+      LOGICAL            LQUERY
+      INTEGER            I, II, KK, CTR, MINMN, LWMIN
 *     ..
 *     .. EXTERNAL FUNCTIONS ..
       LOGICAL            LSAME
       EXTERNAL           LSAME
+*     ..
 *     .. EXTERNAL SUBROUTINES ..
       EXTERNAL           ZGELQT, ZTPLQT, XERBLA
+*     ..
 *     .. INTRINSIC FUNCTIONS ..
       INTRINSIC          MAX, MIN, MOD
 *     ..
@@ -195,12 +201,19 @@
       INFO = 0
 *
       LQUERY = ( LWORK.EQ.-1 )
+*
+      MINMN = MIN( M, N )
+      IF( MINMN.EQ.0 ) THEN
+        LWMIN = 1
+      ELSE
+        LWMIN = M*MB
+      END IF
 *
       IF( M.LT.0 ) THEN
         INFO = -1
       ELSE IF( N.LT.0 .OR. N.LT.M ) THEN
         INFO = -2
-      ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 )) THEN
+      ELSE IF( MB.LT.1 .OR. ( MB.GT.M .AND. M.GT.0 ) ) THEN
         INFO = -3
       ELSE IF( NB.LE.0 ) THEN
         INFO = -4
@@ -208,60 +221,61 @@
         INFO = -6
       ELSE IF( LDT.LT.MB ) THEN
         INFO = -8
-      ELSE IF( ( LWORK.LT.M*MB) .AND. (.NOT.LQUERY) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN
         INFO = -10
       END IF
-      IF( INFO.EQ.0)  THEN
-      WORK(1) = MB*M
+*
+      IF( INFO.EQ.0 )  THEN
+        WORK( 1 ) = LWMIN
       END IF
 *
       IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'ZLASWLQ', -INFO )
         RETURN
-      ELSE IF (LQUERY) THEN
-       RETURN
+      ELSE IF( LQUERY ) THEN
+        RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( MIN(M,N).EQ.0 ) THEN
-          RETURN
+      IF( MINMN.EQ.0 ) THEN
+        RETURN
       END IF
 *
 *     The LQ Decomposition
 *
-       IF((M.GE.N).OR.(NB.LE.M).OR.(NB.GE.N)) THEN
-        CALL ZGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO)
+      IF( (M.GE.N) .OR. (NB.LE.M) .OR. (NB.GE.N) ) THEN
+        CALL ZGELQT( M, N, MB, A, LDA, T, LDT, WORK, INFO )
         RETURN
-       END IF
+      END IF
 *
-       KK = MOD((N-M),(NB-M))
-       II=N-KK+1
+      KK = MOD((N-M),(NB-M))
+      II = N-KK+1
 *
-*      Compute the LQ factorization of the first block A(1:M,1:NB)
+*     Compute the LQ factorization of the first block A(1:M,1:NB)
 *
-       CALL ZGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO)
-       CTR = 1
+      CALL ZGELQT( M, NB, MB, A(1,1), LDA, T, LDT, WORK, INFO )
+      CTR = 1
 *
-       DO I = NB+1, II-NB+M , (NB-M)
+      DO I = NB+1, II-NB+M, (NB-M)
 *
-*      Compute the QR factorization of the current block A(1:M,I:I+NB-M)
+*       Compute the QR factorization of the current block A(1:M,I:I+NB-M)
 *
-         CALL ZTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ),
-     $                  LDA, T(1, CTR * M + 1),
-     $                  LDT, WORK, INFO )
-         CTR = CTR + 1
-       END DO
+        CALL ZTPLQT( M, NB-M, 0, MB, A(1,1), LDA, A( 1, I ),
+     $                LDA, T(1, CTR * M + 1),
+     $                LDT, WORK, INFO )
+        CTR = CTR + 1
+      END DO
 *
 *     Compute the QR factorization of the last block A(1:M,II:N)
 *
-       IF (II.LE.N) THEN
+      IF( II.LE.N ) THEN
         CALL ZTPLQT( M, KK, 0, MB, A(1,1), LDA, A( 1, II ),
-     $                  LDA, T(1, CTR * M + 1), LDT,
-     $                  WORK, INFO )
-       END IF
+     $                LDA, T(1, CTR * M + 1), LDT,
+     $                WORK, INFO )
+      END IF
 *
-      WORK( 1 ) = M * MB
+      WORK( 1 ) = LWMIN
       RETURN
 *
 *     End of ZLASWLQ
diff --git a/lapack-netlib/SRC/zlatdf.f b/lapack-netlib/SRC/zlatdf.f
index 25e71edce..41714a3e0 100644
--- a/lapack-netlib/SRC/zlatdf.f
+++ b/lapack-netlib/SRC/zlatdf.f
@@ -227,7 +227,7 @@
             BM = RHS( J ) - CONE
             SPLUS = ONE
 *
-*           Lockahead for L- part RHS(1:N-1) = +-1
+*           Look-ahead for L- part RHS(1:N-1) = +-1
 *           SPLUS and SMIN computed more efficiently than in BSOLVE[1].
 *
             SPLUS = SPLUS + DBLE( ZDOTC( N-J, Z( J+1, J ), 1, Z( J+1,
diff --git a/lapack-netlib/SRC/zlatrs3.f b/lapack-netlib/SRC/zlatrs3.f
index fc1be0517..27eac839b 100644
--- a/lapack-netlib/SRC/zlatrs3.f
+++ b/lapack-netlib/SRC/zlatrs3.f
@@ -158,7 +158,11 @@
 *> \endverbatim
 *>
 *> \param[in] LWORK
+*> \verbatim
 *>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>
+*>          If MIN(N,NRHS) = 0, LWORK >= 1, else
 *>          LWORK >= MAX(1, 2*NBA * MAX(NBA, MIN(NRHS, 32)), where
 *>          NBA = (N + NB - 1)/NB and NB is the optimal block size.
 *>
@@ -166,6 +170,7 @@
 *>          only calculates the optimal dimensions of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
+*> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
@@ -182,7 +187,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleOTHERauxiliary
+*> \ingroup latrs3
 *> \par Further Details:
 *  =====================
 *  \verbatim
@@ -257,7 +262,7 @@
       LOGICAL            LQUERY, NOTRAN, NOUNIT, UPPER
       INTEGER            AWRK, I, IFIRST, IINC, ILAST, II, I1, I2, J,
      $                   JFIRST, JINC, JLAST, J1, J2, K, KK, K1, K2,
-     $                   LANRM, LDS, LSCALE, NB, NBA, NBX, RHS
+     $                   LANRM, LDS, LSCALE, NB, NBA, NBX, RHS, LWMIN
       DOUBLE PRECISION   ANRM, BIGNUM, BNRM, RSCAL, SCAL, SCALOC,
      $                   SCAMIN, SMLNUM, TMAX
 *     ..
@@ -296,15 +301,24 @@
 *     row. WORK( I + KK * LDS ) is the scale factor of the vector
 *     segment associated with the I-th block row and the KK-th vector
 *     in the block column.
+*
       LSCALE = NBA * MAX( NBA, MIN( NRHS, NBRHS ) )
       LDS = NBA
+*
 *     The second part stores upper bounds of the triangular A. There are
 *     a total of NBA x NBA blocks, of which only the upper triangular
 *     part or the lower triangular part is referenced. The upper bound of
 *     the block A( I, J ) is stored as WORK( AWRK + I + J * NBA ).
+*
       LANRM = NBA * NBA
       AWRK = LSCALE
-      WORK( 1 ) = LSCALE + LANRM
+*
+      IF( MIN( N, NRHS ).EQ.0 ) THEN
+         LWMIN = 1
+      ELSE
+         LWMIN = LSCALE + LANRM
+      END IF
+      WORK( 1 ) = LWMIN
 *
 *     Test the input parameters.
 *
@@ -326,7 +340,7 @@
          INFO = -8
       ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
          INFO = -10
-      ELSE IF( .NOT.LQUERY .AND. LWORK.LT.WORK( 1 ) ) THEN
+      ELSE IF( .NOT.LQUERY .AND. LWORK.LT.LWMIN ) THEN
          INFO = -14
       END IF
       IF( INFO.NE.0 ) THEN
@@ -577,7 +591,7 @@
 *              Prepare the linear update to be executed with GEMM.
 *              For each column, compute a consistent scaling, a
 *              scaling factor to survive the linear update, and
-*              rescale the column segments, if necesssary. Then
+*              rescale the column segments, if necessary. Then
 *              the linear update is safely executed.
 *
                DO KK = 1, K2 - K1
diff --git a/lapack-netlib/SRC/zlatsqr.f b/lapack-netlib/SRC/zlatsqr.f
index ffdbc68c0..24d00f28a 100644
--- a/lapack-netlib/SRC/zlatsqr.f
+++ b/lapack-netlib/SRC/zlatsqr.f
@@ -101,14 +101,18 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>         (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK))
+*>          (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the minimal LWORK.
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
-*>          The dimension of the array WORK.  LWORK >= NB*N.
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>          LWORK >= 1, if MIN(M,N) = 0, and LWORK >= NB*N, otherwise.
+*>
 *>          If LWORK = -1, then a workspace query is assumed; the routine
-*>          only calculates the optimal size of the WORK array, returns
+*>          only calculates the minimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
 *>          message related to LWORK is issued by XERBLA.
 *> \endverbatim
@@ -160,33 +164,37 @@
 *>     SIAM J. Sci. Comput, vol. 34, no. 1, 2012
 *> \endverbatim
 *>
+*> \ingroup latsqr
+*>
 *  =====================================================================
       SUBROUTINE ZLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK,
-     $                    LWORK, INFO)
+     $                    LWORK, INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
 *
 *     .. Scalar Arguments ..
-      INTEGER           INFO, LDA, M, N, MB, NB, LDT, LWORK
+      INTEGER            INFO, LDA, M, N, MB, NB, LDT, LWORK
 *     ..
 *     .. Array Arguments ..
-      COMPLEX*16        A( LDA, * ), WORK( * ), T(LDT, *)
+      COMPLEX*16         A( LDA, * ), WORK( * ), T( LDT, * )
 *     ..
 *
 *  =====================================================================
 *
 *     ..
 *     .. Local Scalars ..
-      LOGICAL    LQUERY
-      INTEGER    I, II, KK, CTR
+      LOGICAL            LQUERY
+      INTEGER            I, II, KK, CTR, LWMIN, MINMN
 *     ..
 *     .. EXTERNAL FUNCTIONS ..
       LOGICAL            LSAME
       EXTERNAL           LSAME
+*     ..
 *     .. EXTERNAL SUBROUTINES ..
-      EXTERNAL    ZGEQRT, ZTPQRT, XERBLA
+      EXTERNAL           ZGEQRT, ZTPQRT, XERBLA
+*     ..
 *     .. INTRINSIC FUNCTIONS ..
       INTRINSIC          MAX, MIN, MOD
 *     ..
@@ -197,6 +205,13 @@
       INFO = 0
 *
       LQUERY = ( LWORK.EQ.-1 )
+*
+      MINMN = MIN( M, N )
+      IF( MINMN.EQ.0 ) THEN
+        LWMIN = 1
+      ELSE
+        LWMIN = N*NB
+      END IF
 *
       IF( M.LT.0 ) THEN
         INFO = -1
@@ -204,64 +219,65 @@
         INFO = -2
       ELSE IF( MB.LT.1 ) THEN
         INFO = -3
-      ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 )) THEN
+      ELSE IF( NB.LT.1 .OR. ( NB.GT.N .AND. N.GT.0 ) ) THEN
         INFO = -4
       ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
         INFO = -6
       ELSE IF( LDT.LT.NB ) THEN
         INFO = -8
-      ELSE IF( LWORK.LT.(N*NB) .AND. (.NOT.LQUERY) ) THEN
+      ELSE IF( LWORK.LT.LWMIN .AND. (.NOT.LQUERY) ) THEN
         INFO = -10
       END IF
-      IF( INFO.EQ.0)  THEN
-        WORK(1) = NB*N
+*
+      IF( INFO.EQ.0 )  THEN
+        WORK( 1 ) = LWMIN
       END IF
       IF( INFO.NE.0 ) THEN
         CALL XERBLA( 'ZLATSQR', -INFO )
         RETURN
-      ELSE IF (LQUERY) THEN
-       RETURN
+      ELSE IF( LQUERY ) THEN
+        RETURN
       END IF
 *
 *     Quick return if possible
 *
-      IF( MIN(M,N).EQ.0 ) THEN
-          RETURN
+      IF( MINMN.EQ.0 ) THEN
+        RETURN
       END IF
 *
 *     The QR Decomposition
 *
-       IF ((MB.LE.N).OR.(MB.GE.M)) THEN
-         CALL ZGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO)
-         RETURN
-       END IF
-       KK = MOD((M-N),(MB-N))
-       II=M-KK+1
+      IF( (MB.LE.N) .OR. (MB.GE.M) ) THEN
+        CALL ZGEQRT( M, N, NB, A, LDA, T, LDT, WORK, INFO )
+        RETURN
+      END IF
+      KK = MOD((M-N),(MB-N))
+      II = M-KK+1
 *
-*      Compute the QR factorization of the first block A(1:MB,1:N)
+*     Compute the QR factorization of the first block A(1:MB,1:N)
 *
-       CALL ZGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO )
-       CTR = 1
+      CALL ZGEQRT( MB, N, NB, A(1,1), LDA, T, LDT, WORK, INFO )
+      CTR = 1
 *
-       DO I = MB+1, II-MB+N ,  (MB-N)
+      DO I = MB+1, II-MB+N, (MB-N)
 *
-*      Compute the QR factorization of the current block A(I:I+MB-N,1:N)
+*       Compute the QR factorization of the current block A(I:I+MB-N,1:N)
 *
-         CALL ZTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA,
-     $                 T(1, CTR * N + 1),
-     $                  LDT, WORK, INFO )
-         CTR = CTR + 1
-       END DO
+        CALL ZTPQRT( MB-N, N, 0, NB, A(1,1), LDA, A( I, 1 ), LDA,
+     $                T(1, CTR * N + 1),
+     $                LDT, WORK, INFO )
+        CTR = CTR + 1
+      END DO
 *
-*      Compute the QR factorization of the last block A(II:M,1:N)
+*     Compute the QR factorization of the last block A(II:M,1:N)
 *
-       IF (II.LE.M) THEN
-         CALL ZTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA,
-     $                 T(1,CTR * N + 1), LDT,
-     $                  WORK, INFO )
-       END IF
+      IF( II.LE.M ) THEN
+        CALL ZTPQRT( KK, N, 0, NB, A(1,1), LDA, A( II, 1 ), LDA,
+     $                T(1,CTR * N + 1), LDT,
+     $                WORK, INFO )
+      END IF
 *
-      work( 1 ) = N*NB
+      WORK( 1 ) = LWMIN
       RETURN
 *
 *     End of ZLATSQR
diff --git a/lapack-netlib/SRC/zpbsv.f b/lapack-netlib/SRC/zpbsv.f
index fe6baf8b5..ef212bbc3 100644
--- a/lapack-netlib/SRC/zpbsv.f
+++ b/lapack-netlib/SRC/zpbsv.f
@@ -119,9 +119,9 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i of A is not
-*>                positive definite, so the factorization could not be
-*>                completed, and the solution has not been computed.
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                of A is not positive, so the factorization could not
+*>                be completed, and the solution has not been computed.
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/zpbsvx.f b/lapack-netlib/SRC/zpbsvx.f
index 1efd2fd1e..724102376 100644
--- a/lapack-netlib/SRC/zpbsvx.f
+++ b/lapack-netlib/SRC/zpbsvx.f
@@ -70,7 +70,7 @@
 *>    where U is an upper triangular band matrix, and L is a lower
 *>    triangular band matrix.
 *>
-*> 3. If the leading i-by-i principal minor is not positive definite,
+*> 3. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A.  If the reciprocal of the condition number is less than machine
@@ -280,10 +280,10 @@
 *>          = 0: successful exit
 *>          < 0: if INFO = -i, the i-th argument had an illegal value
 *>          > 0: if INFO = i, and i is
-*>                <= N:  the leading minor of order i of A is
-*>                       not positive definite, so the factorization
-*>                       could not be completed, and the solution has not
-*>                       been computed. RCOND = 0 is returned.
+*>                <= N:  the leading principal minor of order i of A
+*>                       is not positive, so the factorization could not
+*>                       be completed, and the solution has not been
+*>                       computed. RCOND = 0 is returned.
 *>                = N+1: U is nonsingular, but RCOND is less than machine
 *>                       precision, meaning that the matrix is singular
 *>                       to working precision.  Nevertheless, the
diff --git a/lapack-netlib/SRC/zpbtf2.f b/lapack-netlib/SRC/zpbtf2.f
index 7b90af47e..39bf6c3fa 100644
--- a/lapack-netlib/SRC/zpbtf2.f
+++ b/lapack-netlib/SRC/zpbtf2.f
@@ -97,8 +97,8 @@
 *>          INFO is INTEGER
 *>          = 0: successful exit
 *>          < 0: if INFO = -k, the k-th argument had an illegal value
-*>          > 0: if INFO = k, the leading minor of order k is not
-*>               positive definite, and the factorization could not be
+*>          > 0: if INFO = k, the leading principal minor of order k
+*>               is not positive, and the factorization could not be
 *>               completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/zpbtrf.f b/lapack-netlib/SRC/zpbtrf.f
index 61cd1d434..80afb0836 100644
--- a/lapack-netlib/SRC/zpbtrf.f
+++ b/lapack-netlib/SRC/zpbtrf.f
@@ -92,8 +92,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/zpftrf.f b/lapack-netlib/SRC/zpftrf.f
index 6d18f2cb3..4d4a5116e 100644
--- a/lapack-netlib/SRC/zpftrf.f
+++ b/lapack-netlib/SRC/zpftrf.f
@@ -91,8 +91,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *>
 *>  Further Notes on RFP Format:
diff --git a/lapack-netlib/SRC/zposv.f b/lapack-netlib/SRC/zposv.f
index 3bb625876..0e91cde2f 100644
--- a/lapack-netlib/SRC/zposv.f
+++ b/lapack-netlib/SRC/zposv.f
@@ -110,9 +110,9 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i of A is not
-*>                positive definite, so the factorization could not be
-*>                completed, and the solution has not been computed.
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                of A is not positive, so the factorization could not
+*>                be completed, and the solution has not been computed.
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/zposvx.f b/lapack-netlib/SRC/zposvx.f
index f9e9b1d5f..6c06dbd57 100644
--- a/lapack-netlib/SRC/zposvx.f
+++ b/lapack-netlib/SRC/zposvx.f
@@ -70,7 +70,7 @@
 *>    where U is an upper triangular matrix and L is a lower triangular
 *>    matrix.
 *>
-*> 3. If the leading i-by-i principal minor is not positive definite,
+*> 3. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A.  If the reciprocal of the condition number is less than machine
@@ -276,10 +276,10 @@
 *>          = 0: successful exit
 *>          < 0: if INFO = -i, the i-th argument had an illegal value
 *>          > 0: if INFO = i, and i is
-*>                <= N:  the leading minor of order i of A is
-*>                       not positive definite, so the factorization
-*>                       could not be completed, and the solution has not
-*>                       been computed. RCOND = 0 is returned.
+*>                <= N:  the leading principal minor of order i of A
+*>                       is not positive, so the factorization could not
+*>                       be completed, and the solution has not been
+*>                       computed. RCOND = 0 is returned.
 *>                = N+1: U is nonsingular, but RCOND is less than machine
 *>                       precision, meaning that the matrix is singular
 *>                       to working precision.  Nevertheless, the
diff --git a/lapack-netlib/SRC/zposvxx.f b/lapack-netlib/SRC/zposvxx.f
index 6735fab71..3886c66f0 100644
--- a/lapack-netlib/SRC/zposvxx.f
+++ b/lapack-netlib/SRC/zposvxx.f
@@ -87,7 +87,7 @@
 *>    where U is an upper triangular matrix and L is a lower triangular
 *>    matrix.
 *>
-*>    3. If the leading i-by-i principal minor is not positive definite,
+*>    3. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A (see argument RCOND).  If the reciprocal of the condition number
diff --git a/lapack-netlib/SRC/zpotf2.f b/lapack-netlib/SRC/zpotf2.f
index eb88d617c..b48f3d654 100644
--- a/lapack-netlib/SRC/zpotf2.f
+++ b/lapack-netlib/SRC/zpotf2.f
@@ -89,8 +89,8 @@
 *>          INFO is INTEGER
 *>          = 0: successful exit
 *>          < 0: if INFO = -k, the k-th argument had an illegal value
-*>          > 0: if INFO = k, the leading minor of order k is not
-*>               positive definite, and the factorization could not be
+*>          > 0: if INFO = k, the leading principal minor of order k
+*>               is not positive, and the factorization could not be
 *>               completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/zpotrf.f b/lapack-netlib/SRC/zpotrf.f
index 3b7018276..3edae84fb 100644
--- a/lapack-netlib/SRC/zpotrf.f
+++ b/lapack-netlib/SRC/zpotrf.f
@@ -87,8 +87,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/zpotrf2.f b/lapack-netlib/SRC/zpotrf2.f
index 859ddc75f..67430f231 100644
--- a/lapack-netlib/SRC/zpotrf2.f
+++ b/lapack-netlib/SRC/zpotrf2.f
@@ -86,8 +86,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/zppsv.f b/lapack-netlib/SRC/zppsv.f
index 19536e204..f466266e4 100644
--- a/lapack-netlib/SRC/zppsv.f
+++ b/lapack-netlib/SRC/zppsv.f
@@ -104,9 +104,9 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i of A is not
-*>                positive definite, so the factorization could not be
-*>                completed, and the solution has not been computed.
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                of A is not positive, so the factorization could not
+*>                be completed, and the solution has not been computed.
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/zppsvx.f b/lapack-netlib/SRC/zppsvx.f
index f94badf78..60d07cbc7 100644
--- a/lapack-netlib/SRC/zppsvx.f
+++ b/lapack-netlib/SRC/zppsvx.f
@@ -69,7 +69,7 @@
 *>    where U is an upper triangular matrix, L is a lower triangular
 *>    matrix, and **H indicates conjugate transpose.
 *>
-*> 3. If the leading i-by-i principal minor is not positive definite,
+*> 3. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A.  If the reciprocal of the condition number is less than machine
@@ -262,10 +262,10 @@
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *>          > 0:  if INFO = i, and i is
-*>                <= N:  the leading minor of order i of A is
-*>                       not positive definite, so the factorization
-*>                       could not be completed, and the solution has not
-*>                       been computed. RCOND = 0 is returned.
+*>                <= N:  the leading principal minor of order i of A
+*>                       is not positive, so the factorization could not
+*>                       be completed, and the solution has not been
+*>                       computed. RCOND = 0 is returned.
 *>                = N+1: U is nonsingular, but RCOND is less than machine
 *>                       precision, meaning that the matrix is singular
 *>                       to working precision.  Nevertheless, the
diff --git a/lapack-netlib/SRC/zpptrf.f b/lapack-netlib/SRC/zpptrf.f
index a34d63913..a103e5eca 100644
--- a/lapack-netlib/SRC/zpptrf.f
+++ b/lapack-netlib/SRC/zpptrf.f
@@ -79,8 +79,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the factorization could not be
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the factorization could not be
 *>                completed.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/zpteqr.f b/lapack-netlib/SRC/zpteqr.f
index a81a6ad94..897136c76 100644
--- a/lapack-netlib/SRC/zpteqr.f
+++ b/lapack-netlib/SRC/zpteqr.f
@@ -123,8 +123,8 @@
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value.
 *>          > 0:  if INFO = i, and i is:
 *>                <= N  the Cholesky factorization of the matrix could
-*>                      not be performed because the i-th principal minor
-*>                      was not positive definite.
+*>                      not be performed because the leading principal
+*>                      minor of order i was not positive.
 *>                > N   the SVD algorithm failed to converge;
 *>                      if INFO = N+i, i off-diagonal elements of the
 *>                      bidiagonal factor did not converge to zero.
diff --git a/lapack-netlib/SRC/zptsv.f b/lapack-netlib/SRC/zptsv.f
index d53e17a34..191adaff6 100644
--- a/lapack-netlib/SRC/zptsv.f
+++ b/lapack-netlib/SRC/zptsv.f
@@ -94,8 +94,8 @@
 *>          INFO is INTEGER
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
-*>          > 0:  if INFO = i, the leading minor of order i is not
-*>                positive definite, and the solution has not been
+*>          > 0:  if INFO = i, the leading principal minor of order i
+*>                is not positive, and the solution has not been
 *>                computed.  The factorization has not been completed
 *>                unless i = N.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zptsvx.f b/lapack-netlib/SRC/zptsvx.f
index 4a40768f1..94409a7a1 100644
--- a/lapack-netlib/SRC/zptsvx.f
+++ b/lapack-netlib/SRC/zptsvx.f
@@ -60,7 +60,7 @@
 *>    factorization can also be regarded as having the form
 *>    A = U**H*D*U.
 *>
-*> 2. If the leading i-by-i principal minor is not positive definite,
+*> 2. If the leading principal minor of order i is not positive,
 *>    then the routine returns with INFO = i. Otherwise, the factored
 *>    form of A is used to estimate the condition number of the matrix
 *>    A.  If the reciprocal of the condition number is less than machine
@@ -205,10 +205,10 @@
 *>          = 0:  successful exit
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *>          > 0:  if INFO = i, and i is
-*>                <= N:  the leading minor of order i of A is
-*>                       not positive definite, so the factorization
-*>                       could not be completed, and the solution has not
-*>                       been computed. RCOND = 0 is returned.
+*>                <= N:  the leading principal minor of order i of A
+*>                       is not positive, so the factorization could not
+*>                       be completed, and the solution has not been
+*>                       computed. RCOND = 0 is returned.
 *>                = N+1: U is nonsingular, but RCOND is less than machine
 *>                       precision, meaning that the matrix is singular
 *>                       to working precision.  Nevertheless, the
diff --git a/lapack-netlib/SRC/zpttrf.f b/lapack-netlib/SRC/zpttrf.f
index a106ec419..75ef847c4 100644
--- a/lapack-netlib/SRC/zpttrf.f
+++ b/lapack-netlib/SRC/zpttrf.f
@@ -71,8 +71,8 @@
 *>          INFO is INTEGER
 *>          = 0: successful exit
 *>          < 0: if INFO = -k, the k-th argument had an illegal value
-*>          > 0: if INFO = k, the leading minor of order k is not
-*>               positive definite; if k < N, the factorization could not
+*>          > 0: if INFO = k, the leading principal minor of order k
+*>               is not positive; if k < N, the factorization could not
 *>               be completed, while if k = N, the factorization was
 *>               completed, but D(N) <= 0.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zrscl.c b/lapack-netlib/SRC/zrscl.c
new file mode 100644
index 000000000..2264b5465
--- /dev/null
+++ b/lapack-netlib/SRC/zrscl.c
@@ -0,0 +1,735 @@
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <complex.h>
+#ifdef complex
+#undef complex
+#endif
+#ifdef I
+#undef I
+#endif
+
+#if defined(_WIN64)
+typedef long long BLASLONG;
+typedef unsigned long long BLASULONG;
+#else
+typedef long BLASLONG;
+typedef unsigned long BLASULONG;
+#endif
+
+#ifdef LAPACK_ILP64
+typedef BLASLONG blasint;
+#if defined(_WIN64)
+#define blasabs(x) llabs(x)
+#else
+#define blasabs(x) labs(x)
+#endif
+#else
+typedef int blasint;
+#define blasabs(x) abs(x)
+#endif
+
+typedef blasint integer;
+
+typedef unsigned int uinteger;
+typedef char *address;
+typedef short int shortint;
+typedef float real;
+typedef double doublereal;
+typedef struct { real r, i; } complex;
+typedef struct { doublereal r, i; } doublecomplex;
+#ifdef _MSC_VER
+static inline _Fcomplex Cf(complex *z) {_Fcomplex zz={z->r , z->i}; return zz;}
+static inline _Dcomplex Cd(doublecomplex *z) {_Dcomplex zz={z->r , z->i};return zz;}
+static inline _Fcomplex * _pCf(complex *z) {return (_Fcomplex*)z;}
+static inline _Dcomplex * _pCd(doublecomplex *z) {return (_Dcomplex*)z;}
+#else
+static inline _Complex float Cf(complex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex double Cd(doublecomplex *z) {return z->r + z->i*_Complex_I;}
+static inline _Complex float * _pCf(complex *z) {return (_Complex float*)z;}
+static inline _Complex double * _pCd(doublecomplex *z) {return (_Complex double*)z;}
+#endif
+#define pCf(z) (*_pCf(z))
+#define pCd(z) (*_pCd(z))
+typedef int logical;
+typedef short int shortlogical;
+typedef char logical1;
+typedef char integer1;
+
+#define TRUE_ (1)
+#define FALSE_ (0)
+
+/* Extern is for use with -E */
+#ifndef Extern
+#define Extern extern
+#endif
+
+/* I/O stuff */
+
+typedef int flag;
+typedef int ftnlen;
+typedef int ftnint;
+
+/*external read, write*/
+typedef struct
+{	flag cierr;
+	ftnint ciunit;
+	flag ciend;
+	char *cifmt;
+	ftnint cirec;
+} cilist;
+
+/*internal read, write*/
+typedef struct
+{	flag icierr;
+	char *iciunit;
+	flag iciend;
+	char *icifmt;
+	ftnint icirlen;
+	ftnint icirnum;
+} icilist;
+
+/*open*/
+typedef struct
+{	flag oerr;
+	ftnint ounit;
+	char *ofnm;
+	ftnlen ofnmlen;
+	char *osta;
+	char *oacc;
+	char *ofm;
+	ftnint orl;
+	char *oblnk;
+} olist;
+
+/*close*/
+typedef struct
+{	flag cerr;
+	ftnint cunit;
+	char *csta;
+} cllist;
+
+/*rewind, backspace, endfile*/
+typedef struct
+{	flag aerr;
+	ftnint aunit;
+} alist;
+
+/* inquire */
+typedef struct
+{	flag inerr;
+	ftnint inunit;
+	char *infile;
+	ftnlen infilen;
+	ftnint	*inex;	/*parameters in standard's order*/
+	ftnint	*inopen;
+	ftnint	*innum;
+	ftnint	*innamed;
+	char	*inname;
+	ftnlen	innamlen;
+	char	*inacc;
+	ftnlen	inacclen;
+	char	*inseq;
+	ftnlen	inseqlen;
+	char 	*indir;
+	ftnlen	indirlen;
+	char	*infmt;
+	ftnlen	infmtlen;
+	char	*inform;
+	ftnint	informlen;
+	char	*inunf;
+	ftnlen	inunflen;
+	ftnint	*inrecl;
+	ftnint	*innrec;
+	char	*inblank;
+	ftnlen	inblanklen;
+} inlist;
+
+#define VOID void
+
+union Multitype {	/* for multiple entry points */
+	integer1 g;
+	shortint h;
+	integer i;
+	/* longint j; */
+	real r;
+	doublereal d;
+	complex c;
+	doublecomplex z;
+	};
+
+typedef union Multitype Multitype;
+
+struct Vardesc {	/* for Namelist */
+	char *name;
+	char *addr;
+	ftnlen *dims;
+	int  type;
+	};
+typedef struct Vardesc Vardesc;
+
+struct Namelist {
+	char *name;
+	Vardesc **vars;
+	int nvars;
+	};
+typedef struct Namelist Namelist;
+
+#define abs(x) ((x) >= 0 ? (x) : -(x))
+#define dabs(x) (fabs(x))
+#define f2cmin(a,b) ((a) <= (b) ? (a) : (b))
+#define f2cmax(a,b) ((a) >= (b) ? (a) : (b))
+#define dmin(a,b) (f2cmin(a,b))
+#define dmax(a,b) (f2cmax(a,b))
+#define bit_test(a,b)	((a) >> (b) & 1)
+#define bit_clear(a,b)	((a) & ~((uinteger)1 << (b)))
+#define bit_set(a,b)	((a) |  ((uinteger)1 << (b)))
+
+#define abort_() { sig_die("Fortran abort routine called", 1); }
+#define c_abs(z) (cabsf(Cf(z)))
+#define c_cos(R,Z) { pCf(R)=ccosf(Cf(Z)); } /* single-precision cosine, matching c_exp/c_log/c_sin/c_sqrt */
+#ifdef _MSC_VER /* no complex operators: expand *c = *a / *b on the r/i members (textbook formula, no overflow scaling); operands are read first so c may alias a or b */
+#define c_div(c, a, b) {real __ar=(a)->r, __ai=(a)->i, __br=(b)->r, __bi=(b)->i, __d=__br*__br+__bi*__bi; (c)->r=(__ar*__br+__ai*__bi)/__d; (c)->i=(__ai*__br-__ar*__bi)/__d;}
+#define z_div(c, a, b) {doublereal __ar=(a)->r, __ai=(a)->i, __br=(b)->r, __bi=(b)->i, __d=__br*__br+__bi*__bi; (c)->r=(__ar*__br+__ai*__bi)/__d; (c)->i=(__ai*__br-__ar*__bi)/__d;}
+#else /* native C99 complex division: *c = *a / *b */
+#define c_div(c, a, b) {pCf(c) = Cf(a)/Cf(b);}
+#define z_div(c, a, b) {pCd(c) = Cd(a)/Cd(b);}
+#endif
+#define c_exp(R, Z) {pCf(R) = cexpf(Cf(Z));}
+#define c_log(R, Z) {pCf(R) = clogf(Cf(Z));}
+#define c_sin(R, Z) {pCf(R) = csinf(Cf(Z));}
+//#define c_sqrt(R, Z) {*(R) = csqrtf(Cf(Z));}
+#define c_sqrt(R, Z) {pCf(R) = csqrtf(Cf(Z));}
+#define d_abs(x) (fabs(*(x)))
+#define d_acos(x) (acos(*(x)))
+#define d_asin(x) (asin(*(x)))
+#define d_atan(x) (atan(*(x)))
+#define d_atn2(x, y) (atan2(*(x),*(y)))
+#define d_cnjg(R, Z) { pCd(R) = conj(Cd(Z)); }
+#define r_cnjg(R, Z) { pCf(R) = conjf(Cf(Z)); }
+#define d_cos(x) (cos(*(x)))
+#define d_cosh(x) (cosh(*(x)))
+#define d_dim(__a, __b) ( *(__a) > *(__b) ? *(__a) - *(__b) : 0.0 )
+#define d_exp(x) (exp(*(x)))
+#define d_imag(z) (cimag(Cd(z)))
+#define r_imag(z) (cimagf(Cf(z)))
+#define d_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define r_int(__x) (*(__x)>0 ? floor(*(__x)) : -floor(- *(__x)))
+#define d_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define r_lg10(x) ( 0.43429448190325182765 * log(*(x)) )
+#define d_log(x) (log(*(x)))
+#define d_mod(x, y) (fmod(*(x), *(y)))
+#define u_nint(__x) ((__x)>=0 ? floor((__x) + .5) : -floor(.5 - (__x)))
+#define d_nint(x) u_nint(*(x))
+#define u_sign(__a,__b) ((__b) >= 0 ? ((__a) >= 0 ? (__a) : -(__a)) : -((__a) >= 0 ? (__a) : -(__a)))
+#define d_sign(a,b) u_sign(*(a),*(b))
+#define r_sign(a,b) u_sign(*(a),*(b))
+#define d_sin(x) (sin(*(x)))
+#define d_sinh(x) (sinh(*(x)))
+#define d_sqrt(x) (sqrt(*(x)))
+#define d_tan(x) (tan(*(x)))
+#define d_tanh(x) (tanh(*(x)))
+#define i_abs(x) abs(*(x))
+#define i_dnnt(x) ((integer)u_nint(*(x)))
+#define i_len(s, n) (n)
+#define i_nint(x) ((integer)u_nint(*(x)))
+#define i_sign(a,b) ((integer)u_sign((integer)*(a),(integer)*(b)))
+#define pow_dd(ap, bp) ( pow(*(ap), *(bp)))
+#define pow_si(B,E) spow_ui(*(B),*(E))
+#define pow_ri(B,E) spow_ui(*(B),*(E))
+#define pow_di(B,E) dpow_ui(*(B),*(E))
+#define pow_zi(p, a, b) {pCd(p) = zpow_ui(Cd(a), *(b));}
+#define pow_ci(p, a, b) {pCf(p) = cpow_ui(Cf(a), *(b));}
+#define pow_zz(R,A,B) {pCd(R) = cpow(Cd(A),Cd(B));} /* *R = (*A)**(*B); B is a doublecomplex pointer like A, so convert it with Cd */
+#define s_cat(lpp, rpp, rnp, np, llp) { 	ftnlen i, nc, ll; char *f__rp, *lp; 	ll = (llp); lp = (lpp); 	for(i=0; i < (int)*(np); ++i) {         	nc = ll; 	        if((rnp)[i] < nc) nc = (rnp)[i]; 	        ll -= nc;         	f__rp = (rpp)[i]; 	        while(--nc >= 0) *lp++ = *(f__rp)++;         } 	while(--ll >= 0) *lp++ = ' '; }
+#define s_cmp(a,b,c,d) ((integer)strncmp((a),(b),f2cmin((c),(d))))
+#define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
+#define sig_die(s, kill) { exit(1); }
+#define s_stop(s, n) {exit(0);}
+static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
+#define z_abs(z) (cabs(Cd(z)))
+#define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
+#define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
+#define myexit_() break;
+#define mycycle_() continue;
+#define myceiling_(w) {ceil(w)}
+#define myhuge_(w) {HUGE_VAL}
+//#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);}
+#define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n)
+
+/* procedure parameter types for -A and -C++ */
+
+#define F2C_proc_par_types 1
+#ifdef __cplusplus
+typedef logical (*L_fp)(...);
+#else
+typedef logical (*L_fp)();
+#endif
+
+static float spow_ui(float x, integer n) {
+	/* Returns x**n by binary exponentiation; n < 0 is evaluated as (1/x)**(-n). */
+	float acc = 1.0; unsigned long int bits;
+	if (n == 0) return acc;
+	if (n < 0) { n = -n; x = 1/x; }
+	/* Consume the exponent from its lowest bit; x is squared only while bits remain. */
+	for (bits = n; ; x *= x) {
+		if (bits & 01) acc *= x;
+		if (!(bits >>= 1)) break;
+	}
+	return acc;
+}
+static double dpow_ui(double x, integer n) {
+	/* Returns x**n by binary exponentiation; n < 0 is evaluated as (1/x)**(-n). */
+	double acc = 1.0; unsigned long int bits;
+	if (n == 0) return acc;
+	if (n < 0) { n = -n; x = 1/x; }
+	/* Consume the exponent from its lowest bit; x is squared only while bits remain. */
+	for (bits = n; ; x *= x) {
+		if (bits & 01) acc *= x;
+		if (!(bits >>= 1)) break;
+	}
+	return acc;
+}
+#ifdef _MSC_VER
+/* x**n for a single-precision complex x and integer n, by binary exponentiation (struct form). */
+static _Fcomplex cpow_ui(complex x, integer n) {
+	complex pow={1.0,0.0}; float t; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, t = x.r*x.r + x.i*x.i, x.r = x.r/t, x.i = -x.i/t; /* x = 1/x = conj(x)/|x|^2, not (1/re, 1/im) */
+		for(u = n; ; ) {
+			if(u & 01) t = pow.r*x.r - pow.i*x.i, pow.i = pow.r*x.i + pow.i*x.r, pow.r = t; /* pow *= x (complex product) */
+			if(u >>= 1) t = x.r*x.r - x.i*x.i, x.i = 2*x.r*x.i, x.r = t; /* x *= x (complex square) */
+			else break;
+		}
+	}
+	_Fcomplex p={pow.r, pow.i}; return p;
+}
+#else
+static _Complex float cpow_ui(_Complex float x, integer n) {
+	_Complex float pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+#ifdef _MSC_VER
+/* x**n for a double-precision complex x and integer n, by binary exponentiation (struct form). */
+static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
+	_Dcomplex pow={1.0,0.0}; double t; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, t = x._Val[0]*x._Val[0] + x._Val[1]*x._Val[1], x._Val[0] = x._Val[0]/t, x._Val[1] = -x._Val[1]/t; /* x = 1/x = conj(x)/|x|^2 */
+		for(u = n; ; ) {
+			if(u & 01) t = pow._Val[0]*x._Val[0] - pow._Val[1]*x._Val[1], pow._Val[1] = pow._Val[0]*x._Val[1] + pow._Val[1]*x._Val[0], pow._Val[0] = t; /* pow *= x */
+			if(u >>= 1) t = x._Val[0]*x._Val[0] - x._Val[1]*x._Val[1], x._Val[1] = 2*x._Val[0]*x._Val[1], x._Val[0] = t; /* x *= x */
+			else break;
+		}
+	}
+	_Dcomplex p = {pow._Val[0], pow._Val[1]}; return p;
+}
+#else
+static _Complex double zpow_ui(_Complex double x, integer n) {
+	_Complex double pow=1.0; unsigned long int u;
+	if(n != 0) {
+		if(n < 0) n = -n, x = 1/x;
+		for(u = n; ; ) {
+			if(u & 01) pow *= x;
+			if(u >>= 1) x *= x;
+			else break;
+		}
+	}
+	return pow;
+}
+#endif
+static integer pow_ii(integer x, integer n) {
+	/* Integer power x**n in integer arithmetic, with special cases for n <= 0. */
+	integer acc = 1; unsigned long int bits;
+	if (n <= 0) {
+		if (n == 0 || x == 1) return 1;
+		/* Any other base except -1 gives 0; x == 0 evaluates 1/x, an integer division by zero. */
+		if (x != -1) return x == 0 ? 1/x : 0;
+		/* x == -1 with n < 0: continue with the negated exponent. */
+		n = -n;
+	}
+	/* Binary exponentiation: x is squared only while exponent bits remain. */
+	for (bits = n; ; x *= x) {
+		if (bits & 01) acc *= x;
+		if (!(bits >>= 1)) break;
+	}
+	return acc;
+}
+static integer dmaxloc_(double *w, integer s, integer e, integer *n)
+{	/* 1-based offset from s of the first largest entry among w[s-1..e-1]; n is not used. */
+	integer k, best = s; double top = w[s-1];
+	for (k = s+1; k <= e; k++)
+		if (w[k-1] > top) { best = k; top = w[k-1]; }
+	return best-s+1;
+}
+static integer smaxloc_(float *w, integer s, integer e, integer *n)
+{	/* 1-based offset from s of the first largest entry among w[s-1..e-1]; n is not used. */
+	integer k, best = s; float top = w[s-1];
+	for (k = s+1; k <= e; k++)
+		if (w[k-1] > top) { best = k; top = w[k-1]; }
+	return best-s+1;
+}
+static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { /* *z = sum over i < n of conj(x[i*incx]) * y[i*incy] */
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc += conjg(x(i))*y(i): re = xr*yr + xi*yi, im = xr*yi - xi*yr */
+			zdotc._Val[0] += x[i].r * y[i].r + x[i].i * y[i].i;
+			zdotc._Val[1] += x[i].r * y[i].i - x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) { /* same product, strided; indices are i*incx and i*incy from the given base */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r + x[i*incx].i * y[i*incy].i;
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i - x[i*incx].i * y[i*incy].r;
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + conjg(x(i))* y(i) */
+			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + conjg(x(i))* y(i), strided */
+			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) { /* *z = sum over i < n of conj(x[i*incx]) * y[i*incy] */
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc += dconjg(x(i))*y(i): re = xr*yr + xi*yi, im = xr*yi - xi*yr */
+			zdotc._Val[0] += x[i].r * y[i].r + x[i].i * y[i].i;
+			zdotc._Val[1] += x[i].r * y[i].i - x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) { /* same product, strided; indices are i*incx and i*incy from the given base */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r + x[i*incx].i * y[i*incy].i;
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i - x[i*incx].i * y[i*incy].r;
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
+			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i), strided */
+			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) { /* *z = sum over i < n of x[i*incx] * y[i*incy], no conjugation */
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Fcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc += x(i)*y(i): re = xr*yr - xi*yi, im = xr*yi + xi*yr */
+			zdotc._Val[0] += x[i].r * y[i].r - x[i].i * y[i].i;
+			zdotc._Val[1] += x[i].r * y[i].i + x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) { /* same product, strided; indices are i*incx and i*incy from the given base */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r - x[i*incx].i * y[i*incy].i;
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i + x[i*incx].i * y[i*incy].r;
+		}
+	}
+	pCf(z) = zdotc;
+}
+#else
+	_Complex float zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc += Cf(&x[i]) * Cf(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i), strided */
+			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
+		}
+	}
+	pCf(z) = zdotc;
+}
+#endif
+static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) { /* *z = sum over i < n of x[i*incx] * y[i*incy], no conjugation */
+	integer n = *n_, incx = *incx_, incy = *incy_, i;
+#ifdef _MSC_VER
+	_Dcomplex zdotc = {0.0, 0.0};
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc += x(i)*y(i): re = xr*yr - xi*yi, im = xr*yi + xi*yr */
+			zdotc._Val[0] += x[i].r * y[i].r - x[i].i * y[i].i;
+			zdotc._Val[1] += x[i].r * y[i].i + x[i].i * y[i].r;
+		}
+	} else {
+		for (i=0;i<n;i++) { /* same product, strided; indices are i*incx and i*incy from the given base */
+			zdotc._Val[0] += x[i*incx].r * y[i*incy].r - x[i*incx].i * y[i*incy].i;
+			zdotc._Val[1] += x[i*incx].r * y[i*incy].i + x[i*incx].i * y[i*incy].r;
+		}
+	}
+	pCd(z) = zdotc;
+}
+#else
+	_Complex double zdotc = 0.0;
+	if (incx == 1 && incy == 1) {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i) */
+			zdotc += Cd(&x[i]) * Cd(&y[i]);
+		}
+	} else {
+		for (i=0;i<n;i++) { /* zdotc = zdotc + x(i) * y(i), strided */
+			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
+		}
+	}
+	pCd(z) = zdotc;
+}
+#endif
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/*  -- translated by f2c (version 20000121).
+   You must link the resulting object file with the libraries:
+	-lf2c -lm   (in that order)
+*/
+
+
+
+/* > \brief \b ZRSCL multiplies a vector by the reciprocal of a complex scalar. */
+
+/*  =========== DOCUMENTATION =========== */
+
+/* Online html documentation available at */
+/*            http://www.netlib.org/lapack/explore-html/ */
+
+/* > \htmlonly */
+/* > Download ZRSCL + dependencies */
+/* > <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zrscl.
+f"> */
+/* > [TGZ]</a> */
+/* > <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zrscl.
+f"> */
+/* > [ZIP]</a> */
+/* > <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zrscl.
+f"> */
+/* > [TXT]</a> */
+/* > \endhtmlonly */
+
+/*  Definition: */
+/*  =========== */
+
+/*       SUBROUTINE ZRSCL( N, A, X, INCX ) */
+
+/*       INTEGER            INCX, N */
+/*       COMPLEX*16         A */
+/*       COMPLEX*16         X( * ) */
+
+
+/* > \par Purpose: */
+/*  ============= */
+/* > */
+/* > \verbatim */
+/* > */
+/* > ZRSCL multiplies an n-element complex vector x by the complex scalar */
+/* > 1/a.  This is done without overflow or underflow as long as */
+/* > the final result x/a does not overflow or underflow. */
+/* > \endverbatim */
+
+/*  Arguments: */
+/*  ========== */
+
+/* > \param[in] N */
+/* > \verbatim */
+/* >          N is INTEGER */
+/* >          The number of components of the vector x. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] A */
+/* > \verbatim */
+/* >          A is COMPLEX*16 */
+/* >          The scalar a which is used to divide each component of x. */
+/* >          A must not be 0, or the subroutine will divide by zero. */
+/* > \endverbatim */
+/* > */
+/* > \param[in,out] X */
+/* > \verbatim */
+/* >          X is COMPLEX*16 array, dimension */
+/* >                         (1+(N-1)*abs(INCX)) */
+/* >          The n-element vector x. */
+/* > \endverbatim */
+/* > */
+/* > \param[in] INCX */
+/* > \verbatim */
+/* >          INCX is INTEGER */
+/* >          The increment between successive values of the vector X. */
+/* >          > 0:  X(1) = x(1) and X(1+(i-1)*INCX) = x(i),       1< i<= n */
+/* > \endverbatim */
+
+/*  Authors: */
+/*  ======== */
+
+/* > \author Univ. of Tennessee */
+/* > \author Univ. of California Berkeley */
+/* > \author Univ. of Colorado Denver */
+/* > \author NAG Ltd. */
+
+/* > \ingroup complex16OTHERauxiliary */
+
+/*  ===================================================================== */
+/* Subroutine */ int zrscl_(integer *n, doublecomplex *a, doublecomplex *x, 
+	integer *incx)
+{
+    /* System generated locals */
+    doublereal d__1, d__2;
+    doublecomplex z__1;
+
+    /* Local variables */
+    doublereal absi, absr;
+    extern /* Subroutine */ int zscal_(integer *, doublecomplex *, 
+	    doublecomplex *, integer *);
+    doublereal ai, ar;
+    extern doublereal dlamch_(char *);
+    doublereal ui, ov, ur, safmin, safmax;
+    extern /* Subroutine */ int zdscal_(integer *, doublereal *, 
+	    doublecomplex *, integer *), zdrscl_(integer *, doublereal *, 
+	    doublecomplex *, integer *);
+
+
+/*  -- LAPACK auxiliary routine -- */
+/*  -- LAPACK is a software package provided by Univ. of Tennessee,    -- */
+/*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- */
+
+
+/* ===================================================================== */
+/*     Divides the vector x by the complex scalar a by delegating to */
+/*     zdrscl_, zdscal_ and zscal_; every path returns 0. */
+
+/*     Quick return if possible */
+
+    /* Parameter adjustments */
+    --x; /* shift the base so that x[1] addresses the caller's first element */
+
+    /* Function Body */
+    if (*n <= 0) {
+	return 0;
+    }
+
+/*     Get machine parameters */
+
+    safmin = dlamch_("S");
+    safmax = 1. / safmin;
+    ov = dlamch_("O"); /* presumably the overflow threshold - confirm against dlamch_ */
+
+/*     Initialize constants related to A. */
+
+    ar = a->r;
+    ai = d_imag(a);
+    absr = abs(ar); /* abs must resolve to a floating-point-capable definition; it is declared outside this view */
+    absi = abs(ai);
+
+    if (ai == 0.) {
+/*        If alpha is real, then we can use zdrscl_ (real-scalar version) */
+	zdrscl_(n, &ar, &x[1], incx);
+
+    } else if (ar == 0.) {
+/*        If alpha has a zero real part, then we follow the same rules as if */
+/*        alpha were real. */
+	if (absi > safmax) {
+	    zdscal_(n, &safmin, &x[1], incx);
+	    d__1 = -safmax / ai;
+	    z__1.r = 0., z__1.i = d__1;
+	    zscal_(n, &z__1, &x[1], incx);
+	} else if (absi < safmin) {
+	    d__1 = -safmin / ai;
+	    z__1.r = 0., z__1.i = d__1;
+	    zscal_(n, &z__1, &x[1], incx);
+	    zdscal_(n, &safmax, &x[1], incx);
+	} else {
+	    d__1 = -1. / ai;
+	    z__1.r = 0., z__1.i = d__1;
+	    zscal_(n, &z__1, &x[1], incx);
+	}
+
+    } else {
+/*        The following numbers can be computed. */
+/*        They are the inverse of the real and imaginary parts of 1/alpha. */
+/*        Note that a and b are always different from zero. */
+/*        NaNs are only possible if either: */
+/*        1. alphaR or alphaI is NaN. */
+/*        2. alphaR and alphaI are both infinite, in which case it makes sense */
+/*        to propagate a NaN. */
+	ur = ar + ai * (ai / ar);
+	ui = ai + ar * (ar / ai);
+
+	if (abs(ur) < safmin || abs(ui) < safmin) {
+/*           This means that both alphaR and alphaI are very small. */
+	    d__1 = safmin / ur;
+	    d__2 = -safmin / ui;
+	    z__1.r = d__1, z__1.i = d__2;
+	    zscal_(n, &z__1, &x[1], incx);
+	    zdscal_(n, &safmax, &x[1], incx);
+	} else if (abs(ur) > safmax || abs(ui) > safmax) {
+	    if (absr > ov || absi > ov) {
+/*              This means that a and b are both Inf. No need for scaling. */
+		d__1 = 1. / ur;
+		d__2 = -1. / ui;
+		z__1.r = d__1, z__1.i = d__2;
+		zscal_(n, &z__1, &x[1], incx);
+	    } else {
+		zdscal_(n, &safmin, &x[1], incx);
+		if (abs(ur) > ov || abs(ui) > ov) {
+/*                 Infs were generated. We do proper scaling to avoid them. */
+		    if (absr >= absi) {
+/*                    ABS( UR ) <= ABS( UI ) */
+			ur = safmin * ar + safmin * (ai * (ai / ar));
+			ui = safmin * ai + ar * (safmin * ar / ai);
+		    } else {
+/*                    ABS( UR ) > ABS( UI ) */
+			ur = safmin * ar + ai * (safmin * ai / ar);
+			ui = safmin * ai + safmin * (ar * (ar / ai));
+		    }
+		    d__1 = 1. / ur;
+		    d__2 = -1. / ui;
+		    z__1.r = d__1, z__1.i = d__2;
+		    zscal_(n, &z__1, &x[1], incx);
+		} else {
+		    d__1 = safmax / ur;
+		    d__2 = -safmax / ui;
+		    z__1.r = d__1, z__1.i = d__2;
+		    zscal_(n, &z__1, &x[1], incx);
+		}
+	    }
+	} else {
+	    d__1 = 1. / ur;
+	    d__2 = -1. / ui;
+	    z__1.r = d__1, z__1.i = d__2;
+	    zscal_(n, &z__1, &x[1], incx);
+	}
+    }
+
+    return 0;
+
+/*     End of ZRSCL */
+
+} /* zrscl_ */
+
diff --git a/lapack-netlib/SRC/zrscl.f b/lapack-netlib/SRC/zrscl.f
new file mode 100644
index 000000000..970f6de75
--- /dev/null
+++ b/lapack-netlib/SRC/zrscl.f
@@ -0,0 +1,203 @@
+*> \brief \b ZRSCL multiplies a vector by the reciprocal of a complex scalar.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZRSCL + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zrscl.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zrscl.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zrscl.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZRSCL( N, A, X, INCX )
+*
+*       .. Scalar Arguments ..
+*       INTEGER            INCX, N
+*       COMPLEX*16         A
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX*16         X( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZRSCL multiplies an n-element complex vector x by the complex scalar
+*> 1/a.  This is done without overflow or underflow as long as
+*> the final result x/a does not overflow or underflow.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of components of the vector x.
+*> \endverbatim
+*>
+*> \param[in] A
+*> \verbatim
+*>          A is COMPLEX*16
+*>          The scalar a which is used to divide each component of x.
+*>          A must not be 0, or the subroutine will divide by zero.
+*> \endverbatim
+*>
+*> \param[in,out] X
+*> \verbatim
+*>          X is COMPLEX*16 array, dimension
+*>                         (1+(N-1)*abs(INCX))
+*>          The n-element vector x.
+*> \endverbatim
+*>
+*> \param[in] INCX
+*> \verbatim
+*>          INCX is INTEGER
+*>          The increment between successive values of the vector X.
+*>          > 0:  X(1) = x(1) and X(1+(i-1)*INCX) = x(i),       1< i<= n
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup complex16OTHERauxiliary
+*
+*  =====================================================================
+      SUBROUTINE ZRSCL( N, A, X, INCX )
+*
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            INCX, N
+      COMPLEX*16         A
+*     ..
+*     .. Array Arguments ..
+      COMPLEX*16         X( * )
+*     ..
+*
+* =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      DOUBLE PRECISION   SAFMAX, SAFMIN, OV, AR, AI, ABSR, ABSI, UR, UI
+*     ..
+*     .. External Functions ..
+      DOUBLE PRECISION   DLAMCH
+      EXTERNAL           DLAMCH
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ZDRSCL, ZDSCAL, ZSCAL
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, DBLE, DCMPLX, DIMAG
+*     ..
+*     .. Executable Statements ..
+*
+*     Quick return if possible
+*
+      IF( N.LE.0 )
+     $   RETURN
+*
+*     Get machine parameters
+*
+      SAFMIN = DLAMCH( 'S' )
+      SAFMAX = ONE / SAFMIN
+      OV   = DLAMCH( 'O' )
+*
+*     Initialize constants related to A.
+*
+      AR = DBLE( A )
+      AI = DIMAG( A )
+      ABSR = ABS( AR )
+      ABSI = ABS( AI )
+*
+      IF( AI.EQ.ZERO ) THEN
+*        If alpha is real, then we can use the real-scalar routine
+*        ZDRSCL
+         CALL ZDRSCL( N, AR, X, INCX )
+*
+      ELSE IF( AR.EQ.ZERO ) THEN
+*        If alpha has a zero real part, then we follow the same rules as if
+*        alpha were real.
+         IF( ABSI.GT.SAFMAX ) THEN
+            CALL ZDSCAL( N, SAFMIN, X, INCX )
+            CALL ZSCAL( N, DCMPLX( ZERO, -SAFMAX / AI ), X, INCX )
+         ELSE IF( ABSI.LT.SAFMIN ) THEN
+            CALL ZSCAL( N, DCMPLX( ZERO, -SAFMIN / AI ), X, INCX )
+            CALL ZDSCAL( N, SAFMAX, X, INCX )
+         ELSE
+            CALL ZSCAL( N, DCMPLX( ZERO, -ONE / AI ), X, INCX )
+         END IF
+*
+      ELSE
+*        The following numbers can be computed.
+*        They are the inverse of the real and imaginary parts of 1/alpha.
+*        Note that a and b are always different from zero.
+*        NaNs are only possible if either:
+*        1. alphaR or alphaI is NaN.
+*        2. alphaR and alphaI are both infinite, in which case it makes sense
+*        to propagate a NaN.
+         UR = AR + AI * ( AI / AR )
+         UI = AI + AR * ( AR / AI )
+*
+         IF( (ABS( UR ).LT.SAFMIN).OR.(ABS( UI ).LT.SAFMIN) ) THEN
+*           This means that both alphaR and alphaI are very small.
+            CALL ZSCAL( N, DCMPLX( SAFMIN / UR, -SAFMIN / UI ), X,
+     $                  INCX )
+            CALL ZDSCAL( N, SAFMAX, X, INCX )
+         ELSE IF( (ABS( UR ).GT.SAFMAX).OR.(ABS( UI ).GT.SAFMAX) ) THEN
+            IF( (ABSR.GT.OV).OR.(ABSI.GT.OV) ) THEN
+*              This means that a and b are both Inf. No need for scaling.
+               CALL ZSCAL( N, DCMPLX( ONE / UR, -ONE / UI ), X, INCX )
+            ELSE
+               CALL ZDSCAL( N, SAFMIN, X, INCX )
+               IF( (ABS( UR ).GT.OV).OR.(ABS( UI ).GT.OV) ) THEN
+*                 Infs were generated. We do proper scaling to avoid them.
+                  IF( ABSR.GE.ABSI ) THEN
+*                    ABS( UR ) <= ABS( UI )
+                     UR = (SAFMIN * AR) + SAFMIN * (AI * ( AI / AR ))
+                     UI = (SAFMIN * AI) + AR * ( (SAFMIN * AR) / AI )
+                  ELSE
+*                    ABS( UR ) > ABS( UI )
+                     UR = (SAFMIN * AR) + AI * ( (SAFMIN * AI) / AR )
+                     UI = (SAFMIN * AI) + SAFMIN * (AR * ( AR / AI ))
+                  END IF
+                  CALL ZSCAL( N, DCMPLX( ONE / UR, -ONE / UI ), X,
+     $                        INCX )
+               ELSE
+                  CALL ZSCAL( N, DCMPLX( SAFMAX / UR, -SAFMAX / UI ),
+     $                        X, INCX )
+               END IF
+            END IF
+         ELSE
+            CALL ZSCAL( N, DCMPLX( ONE / UR, -ONE / UI ), X, INCX )
+         END IF
+      END IF
+*
+      RETURN
+*
+*     End of ZRSCL
+*
+      END
diff --git a/lapack-netlib/SRC/zstedc.f b/lapack-netlib/SRC/zstedc.f
index 74d390af7..e62063a19 100644
--- a/lapack-netlib/SRC/zstedc.f
+++ b/lapack-netlib/SRC/zstedc.f
@@ -43,12 +43,6 @@
 *> be found if ZHETRD or ZHPTRD or ZHBTRD has been used to reduce this
 *> matrix to tridiagonal form.
 *>
-*> This code makes very mild assumptions about floating point
-*> arithmetic. It will work on machines with a guard digit in
-*> add/subtract, or on those binary machines without guard digits
-*> which subtract like the Cray X-MP, Cray Y-MP, Cray C-90, or Cray-2.
-*> It could conceivably fail on hexadecimal or decimal machines
-*> without guard digits, but we know of none.  See DLAED3 for details.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/zstegr.f b/lapack-netlib/SRC/zstegr.f
index ee43a0d21..3736a0517 100644
--- a/lapack-netlib/SRC/zstegr.f
+++ b/lapack-netlib/SRC/zstegr.f
@@ -56,7 +56,7 @@
 *>
 *> Note : ZSTEGR and ZSTEMR work only on machines which follow
 *> IEEE-754 floating-point standard in their handling of infinities and
-*> NaNs.  Normal execution may create these exceptiona values and hence
+*> NaNs.  Normal execution may create these exceptional values and hence
 *> may abort due to a floating point exception in environments which
 *> do not conform to the IEEE-754 standard.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zstemr.f b/lapack-netlib/SRC/zstemr.f
index b034198de..4eaf5ef97 100644
--- a/lapack-netlib/SRC/zstemr.f
+++ b/lapack-netlib/SRC/zstemr.f
@@ -320,7 +320,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16OTHERcomputational
+*> \ingroup stemr
 *
 *> \par Contributors:
 *  ==================
@@ -330,6 +330,7 @@
 *> Inderjit Dhillon, University of Texas, Austin, USA \n
 *> Osni Marques, LBNL/NERSC, USA \n
 *> Christof Voemel, University of California, Berkeley, USA \n
+*> Aravindh Krishnamoorthy, FAU, Erlangen, Germany \n
 *
 *  =====================================================================
       SUBROUTINE ZSTEMR( JOBZ, RANGE, N, D, E, VL, VU, IL, IU,
@@ -361,7 +362,8 @@
      $                     MINRGP = 1.0D-3 )
 *     ..
 *     .. Local Scalars ..
-      LOGICAL            ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY
+      LOGICAL            ALLEIG, INDEIG, LQUERY, VALEIG, WANTZ, ZQUERY,
+     $                   LAESWAP
       INTEGER            I, IBEGIN, IEND, IFIRST, IIL, IINDBL, IINDW,
      $                   IINDWK, IINFO, IINSPL, IIU, ILAST, IN, INDD,
      $                   INDE2, INDERR, INDGP, INDGRS, INDWRK, ITMP,
@@ -397,6 +399,7 @@
 *
       LQUERY = ( ( LWORK.EQ.-1 ).OR.( LIWORK.EQ.-1 ) )
       ZQUERY = ( NZC.EQ.-1 )
+      LAESWAP = .FALSE.
 
 *     DSTEMR needs WORK of size 6*N, IWORK of size 3*N.
 *     In addition, DLARRE needs WORK of size 6*N, IWORK of size 5*N.
@@ -519,6 +522,15 @@
          ELSE IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
             CALL DLAEV2( D(1), E(1), D(2), R1, R2, CS, SN )
          END IF
+*        D/S/LAE2 and D/S/LAEV2 outputs satisfy |R1| >= |R2|. However,
+*        the following code requires R1 >= R2. Hence, we correct
+*        the order of R1, R2, CS, SN if R1 < R2 before further processing.
+         IF( R1.LT.R2 ) THEN
+            E(2) = R1
+            R1 = R2
+            R2 = E(2)
+            LAESWAP = .TRUE.
+         ENDIF
          IF( ALLEIG.OR.
      $      (VALEIG.AND.(R2.GT.WL).AND.
      $                  (R2.LE.WU)).OR.
@@ -526,8 +538,13 @@
             M = M+1
             W( M ) = R2
             IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
-               Z( 1, M ) = -SN
-               Z( 2, M ) = CS
+               IF( LAESWAP ) THEN
+                  Z( 1, M ) = CS
+                  Z( 2, M ) = SN
+               ELSE
+                  Z( 1, M ) = -SN
+                  Z( 2, M ) = CS
+               ENDIF
 *              Note: At most one of SN and CS can be zero.
                IF (SN.NE.ZERO) THEN
                   IF (CS.NE.ZERO) THEN
@@ -550,8 +567,13 @@
             M = M+1
             W( M ) = R1
             IF( WANTZ.AND.(.NOT.ZQUERY) ) THEN
-               Z( 1, M ) = CS
-               Z( 2, M ) = SN
+               IF( LAESWAP ) THEN
+                  Z( 1, M ) = -SN
+                  Z( 2, M ) = CS
+               ELSE
+                  Z( 1, M ) = CS
+                  Z( 2, M ) = SN
+               ENDIF
 *              Note: At most one of SN and CS can be zero.
                IF (SN.NE.ZERO) THEN
                   IF (CS.NE.ZERO) THEN
diff --git a/lapack-netlib/SRC/zsyconvf.f b/lapack-netlib/SRC/zsyconvf.f
index eb49b0f3d..0958a5f77 100644
--- a/lapack-netlib/SRC/zsyconvf.f
+++ b/lapack-netlib/SRC/zsyconvf.f
@@ -39,7 +39,7 @@
 *> ZSYTRF provided on entry in parameter A into the factorization
 *> output format used in ZSYTRF_RK (or ZSYTRF_BK) that is stored
 *> on exit in parameters A and E. It also converts in place details of
-*> the intechanges stored in IPIV from the format used in ZSYTRF into
+*> the interchanges stored in IPIV from the format used in ZSYTRF into
 *> the format used in ZSYTRF_RK (or ZSYTRF_BK).
 *>
 *> If parameter WAY = 'R':
@@ -48,7 +48,7 @@
 *> (or ZSYTRF_BK) provided on entry in parameters A and E into
 *> the factorization output format used in ZSYTRF that is stored
 *> on exit in parameter A. It also converts in place details of
-*> the intechanges stored in IPIV from the format used in ZSYTRF_RK
+*> the interchanges stored in IPIV from the format used in ZSYTRF_RK
 *> (or ZSYTRF_BK) into the format used in ZSYTRF.
 *>
 *> ZSYCONVF can also convert in Hermitian matrix case, i.e. between
@@ -325,7 +325,7 @@
                   END IF
 *
 *                 Convert IPIV
-*                 There is no interchnge of rows i and and IPIV(i),
+*                 There is no interchange of rows i and IPIV(i),
 *                 so this should be reflected in IPIV format for
 *                 *SYTRF_RK ( or *SYTRF_BK)
 *
@@ -469,7 +469,7 @@
                   END IF
 *
 *                 Convert IPIV
-*                 There is no interchnge of rows i and and IPIV(i),
+*                 There is no interchange of rows i and IPIV(i),
 *                 so this should be reflected in IPIV format for
 *                 *SYTRF_RK ( or *SYTRF_BK)
 *
@@ -535,7 +535,7 @@
 *
 *           Revert VALUE
 *           Assign subdiagonal entries of D from array E to
-*           subgiagonal entries of A.
+*           subdiagonal entries of A.
 *
             I = 1
             DO WHILE ( I.LE.N-1 )
diff --git a/lapack-netlib/SRC/zsyconvf_rook.f b/lapack-netlib/SRC/zsyconvf_rook.f
index 3cfa694c3..62cca060b 100644
--- a/lapack-netlib/SRC/zsyconvf_rook.f
+++ b/lapack-netlib/SRC/zsyconvf_rook.f
@@ -520,7 +520,7 @@
 *
 *           Revert VALUE
 *           Assign subdiagonal entries of D from array E to
-*           subgiagonal entries of A.
+*           subdiagonal entries of A.
 *
             I = 1
             DO WHILE ( I.LE.N-1 )
diff --git a/lapack-netlib/SRC/zsysv_aa_2stage.f b/lapack-netlib/SRC/zsysv_aa_2stage.f
index 4f19630a9..701d73a38 100644
--- a/lapack-netlib/SRC/zsysv_aa_2stage.f
+++ b/lapack-netlib/SRC/zsysv_aa_2stage.f
@@ -87,7 +87,7 @@
 *>          triangular part of the matrix A, and the strictly upper
 *>          triangular part of A is not referenced.
 *>
-*>          On exit, L is stored below (or above) the subdiaonal blocks,
+*>          On exit, L is stored below (or above) the subdiagonal blocks,
 *>          when UPLO  is 'L' (or 'U').
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/zsytrf.f b/lapack-netlib/SRC/zsytrf.f
index a775a8758..dc9016c69 100644
--- a/lapack-netlib/SRC/zsytrf.f
+++ b/lapack-netlib/SRC/zsytrf.f
@@ -232,7 +232,7 @@
 *        Determine the block size
 *
          NB = ILAENV( 1, 'ZSYTRF', UPLO, N, -1, -1, -1 )
-         LWKOPT = N*NB
+         LWKOPT = MAX( 1, N*NB )
          WORK( 1 ) = LWKOPT
       END IF
 *
diff --git a/lapack-netlib/SRC/zsytrf_aa.f b/lapack-netlib/SRC/zsytrf_aa.f
index 73d257cfe..ddb19925a 100644
--- a/lapack-netlib/SRC/zsytrf_aa.f
+++ b/lapack-netlib/SRC/zsytrf_aa.f
@@ -74,7 +74,7 @@
 *>
 *>          On exit, the tridiagonal matrix is stored in the diagonals
 *>          and the subdiagonals of A just below (or above) the diagonals,
-*>          and L is stored below (or above) the subdiaonals, when UPLO
+*>          and L is stored below (or above) the subdiagonals, when UPLO
 *>          is 'L' (or 'U').
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/zsytrf_aa_2stage.f b/lapack-netlib/SRC/zsytrf_aa_2stage.f
index b731cb7a2..95b9fda0d 100644
--- a/lapack-netlib/SRC/zsytrf_aa_2stage.f
+++ b/lapack-netlib/SRC/zsytrf_aa_2stage.f
@@ -75,7 +75,7 @@
 *>          triangular part of the matrix A, and the strictly upper
 *>          triangular part of A is not referenced.
 *>
-*>          On exit, L is stored below (or above) the subdiaonal blocks,
+*>          On exit, L is stored below (or above) the subdiagonal blocks,
 *>          when UPLO  is 'L' (or 'U').
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/zsytrf_rk.f b/lapack-netlib/SRC/zsytrf_rk.f
index 3b398ce6c..af8b8d501 100644
--- a/lapack-netlib/SRC/zsytrf_rk.f
+++ b/lapack-netlib/SRC/zsytrf_rk.f
@@ -310,7 +310,7 @@
 *        Determine the block size
 *
          NB = ILAENV( 1, 'ZSYTRF_RK', UPLO, N, -1, -1, -1 )
-         LWKOPT = N*NB
+         LWKOPT = MAX( 1, N*NB )
          WORK( 1 ) = LWKOPT
       END IF
 *
diff --git a/lapack-netlib/SRC/ztgevc.f b/lapack-netlib/SRC/ztgevc.f
index 23bd36ddb..6dcbe582f 100644
--- a/lapack-netlib/SRC/ztgevc.f
+++ b/lapack-netlib/SRC/ztgevc.f
@@ -53,7 +53,7 @@
 *>
 *>    S*x = w*P*x,  (y**H)*S = w*(y**H)*P,
 *>
-*> where y**H denotes the conjugate tranpose of y.
+*> where y**H denotes the conjugate transpose of y.
 *> The eigenvalues are not input to this routine, but are computed
 *> directly from the diagonal elements of S and P.
 *>
@@ -154,7 +154,7 @@
 *> \verbatim
 *>          VR is COMPLEX*16 array, dimension (LDVR,MM)
 *>          On entry, if SIDE = 'R' or 'B' and HOWMNY = 'B', VR must
-*>          contain an N-by-N matrix Q (usually the unitary matrix Z
+*>          contain an N-by-N matrix Z (usually the unitary matrix Z
 *>          of right Schur vectors returned by ZHGEQZ).
 *>          On exit, if SIDE = 'R' or 'B', VR contains:
 *>          if HOWMNY = 'A', the matrix X of right eigenvectors of (S,P);
@@ -259,7 +259,7 @@
       EXTERNAL           LSAME, DLAMCH, ZLADIV
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DLABAD, XERBLA, ZGEMV
+      EXTERNAL           XERBLA, ZGEMV
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, DBLE, DCMPLX, DCONJG, DIMAG, MAX, MIN
@@ -367,7 +367,6 @@
 *
       SAFMIN = DLAMCH( 'Safe minimum' )
       BIG = ONE / SAFMIN
-      CALL DLABAD( SAFMIN, BIG )
       ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' )
       SMALL = SAFMIN*N / ULP
       BIG = ONE / SMALL
diff --git a/lapack-netlib/SRC/ztgsy2.f b/lapack-netlib/SRC/ztgsy2.f
index ee26b5e7b..0cae8939e 100644
--- a/lapack-netlib/SRC/ztgsy2.f
+++ b/lapack-netlib/SRC/ztgsy2.f
@@ -57,7 +57,7 @@
 *>        Z = [ kron(In, A)  -kron(B**H, Im) ]             (2)
 *>            [ kron(In, D)  -kron(E**H, Im) ],
 *>
-*> Ik is the identity matrix of size k and X**H is the conjuguate transpose of X.
+*> Ik is the identity matrix of size k and X**H is the conjugate transpose of X.
 *> kron(X, Y) is the Kronecker product between the matrices X and Y.
 *>
 *> If TRANS = 'C', y in the conjugate transposed system Z**H*y = scale*b
diff --git a/lapack-netlib/SRC/ztrevc3.f b/lapack-netlib/SRC/ztrevc3.f
index 6300e80ae..8fb144e0c 100644
--- a/lapack-netlib/SRC/ztrevc3.f
+++ b/lapack-netlib/SRC/ztrevc3.f
@@ -321,9 +321,9 @@
 *
       INFO = 0
       NB = ILAENV( 1, 'ZTREVC', SIDE // HOWMNY, N, -1, -1, -1 )
-      MAXWRK = N + 2*N*NB
+      MAXWRK = MAX( 1, N + 2*N*NB )
       WORK(1) = MAXWRK
-      RWORK(1) = N
+      RWORK(1) = MAX( 1, N )
       LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 )
       IF( .NOT.RIGHTV .AND. .NOT.LEFTV ) THEN
          INFO = -1
diff --git a/lapack-netlib/SRC/ztrexc.f b/lapack-netlib/SRC/ztrexc.f
index b92e63efa..32aae51e4 100644
--- a/lapack-netlib/SRC/ztrexc.f
+++ b/lapack-netlib/SRC/ztrexc.f
@@ -40,7 +40,7 @@
 *>
 *> The Schur form T is reordered by a unitary similarity transformation
 *> Z**H*T*Z, and optionally the matrix Q of Schur vectors is updated by
-*> postmultplying it with Z.
+*> postmultiplying it with Z.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/zunbdb1.f b/lapack-netlib/SRC/zunbdb1.f
index 767788a74..2fae170de 100644
--- a/lapack-netlib/SRC/zunbdb1.f
+++ b/lapack-netlib/SRC/zunbdb1.f
@@ -37,7 +37,7 @@
 *>\verbatim
 *>
 *> ZUNBDB1 simultaneously bidiagonalizes the blocks of a tall and skinny
-*> matrix X with orthonomal columns:
+*> matrix X with orthonormal columns:
 *>
 *>                            [ B11 ]
 *>      [ X11 ]   [ P1 |    ] [  0  ]
diff --git a/lapack-netlib/SRC/zunbdb2.f b/lapack-netlib/SRC/zunbdb2.f
index 46b08aa1e..28e78fc23 100644
--- a/lapack-netlib/SRC/zunbdb2.f
+++ b/lapack-netlib/SRC/zunbdb2.f
@@ -37,7 +37,7 @@
 *>\verbatim
 *>
 *> ZUNBDB2 simultaneously bidiagonalizes the blocks of a tall and skinny
-*> matrix X with orthonomal columns:
+*> matrix X with orthonormal columns:
 *>
 *>                            [ B11 ]
 *>      [ X11 ]   [ P1 |    ] [  0  ]
diff --git a/lapack-netlib/SRC/zunbdb3.f b/lapack-netlib/SRC/zunbdb3.f
index 74083e41a..9f32a7a88 100644
--- a/lapack-netlib/SRC/zunbdb3.f
+++ b/lapack-netlib/SRC/zunbdb3.f
@@ -37,7 +37,7 @@
 *>\verbatim
 *>
 *> ZUNBDB3 simultaneously bidiagonalizes the blocks of a tall and skinny
-*> matrix X with orthonomal columns:
+*> matrix X with orthonormal columns:
 *>
 *>                            [ B11 ]
 *>      [ X11 ]   [ P1 |    ] [  0  ]
diff --git a/lapack-netlib/SRC/zunbdb4.f b/lapack-netlib/SRC/zunbdb4.f
index 4672cfa67..a1db5eb79 100644
--- a/lapack-netlib/SRC/zunbdb4.f
+++ b/lapack-netlib/SRC/zunbdb4.f
@@ -38,7 +38,7 @@
 *>\verbatim
 *>
 *> ZUNBDB4 simultaneously bidiagonalizes the blocks of a tall and skinny
-*> matrix X with orthonomal columns:
+*> matrix X with orthonormal columns:
 *>
 *>                            [ B11 ]
 *>      [ X11 ]   [ P1 |    ] [  0  ]
diff --git a/lapack-netlib/SRC/zunbdb5.f b/lapack-netlib/SRC/zunbdb5.f
index 23174fe50..c451ae921 100644
--- a/lapack-netlib/SRC/zunbdb5.f
+++ b/lapack-netlib/SRC/zunbdb5.f
@@ -148,7 +148,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16OTHERcomputational
+*> \ingroup unbdb5
 *
 *  =====================================================================
       SUBROUTINE ZUNBDB5( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
@@ -169,18 +169,21 @@
 *  =====================================================================
 *
 *     .. Parameters ..
+      DOUBLE PRECISION   REALZERO
+      PARAMETER          ( REALZERO = 0.0D0 )
       COMPLEX*16         ONE, ZERO
       PARAMETER          ( ONE = (1.0D0,0.0D0), ZERO = (0.0D0,0.0D0) )
 *     ..
 *     .. Local Scalars ..
       INTEGER            CHILDINFO, I, J
+      DOUBLE PRECISION   EPS, NORM, SCL, SSQ
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ZUNBDB6, XERBLA
+      EXTERNAL           ZLASSQ, ZUNBDB6, ZSCAL, XERBLA
 *     ..
 *     .. External Functions ..
-      DOUBLE PRECISION   DZNRM2
-      EXTERNAL           DZNRM2
+      DOUBLE PRECISION   DLAMCH, DZNRM2
+      EXTERNAL           DLAMCH, DZNRM2
 *     ..
 *     .. Intrinsic Function ..
       INTRINSIC          MAX
@@ -213,16 +216,33 @@
          RETURN
       END IF
 *
-*     Project X onto the orthogonal complement of Q
+      EPS = DLAMCH( 'Precision' )
 *
-      CALL ZUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2, LDQ2,
-     $              WORK, LWORK, CHILDINFO )
+*     Project X onto the orthogonal complement of Q if X is nonzero
 *
-*     If the projection is nonzero, then return
+      SCL = REALZERO
+      SSQ = REALZERO
+      CALL ZLASSQ( M1, X1, INCX1, SCL, SSQ )
+      CALL ZLASSQ( M2, X2, INCX2, SCL, SSQ )
+      NORM = SCL * SQRT( SSQ )
 *
-      IF( DZNRM2(M1,X1,INCX1) .NE. ZERO
-     $    .OR. DZNRM2(M2,X2,INCX2) .NE. ZERO ) THEN
-         RETURN
+      IF( NORM .GT. N * EPS ) THEN
+*        Scale vector to unit norm to avoid problems in the caller code.
+*        Computing the reciprocal is undesirable but
+*         * xLASCL cannot be used because of the vector increments and
+*         * the round-off error has a negligible impact on
+*           orthogonalization.
+         CALL ZSCAL( M1, ONE / NORM, X1, INCX1 )
+         CALL ZSCAL( M2, ONE / NORM, X2, INCX2 )
+         CALL ZUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
+     $              LDQ2, WORK, LWORK, CHILDINFO )
+*
+*        If the projection is nonzero, then return
+*
+         IF( DZNRM2(M1,X1,INCX1) .NE. REALZERO
+     $       .OR. DZNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN
+            RETURN
+         END IF
       END IF
 *
 *     Project each standard basis vector e_1,...,e_M1 in turn, stopping
@@ -238,8 +258,8 @@
          END DO
          CALL ZUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
      $                 LDQ2, WORK, LWORK, CHILDINFO )
-         IF( DZNRM2(M1,X1,INCX1) .NE. ZERO
-     $       .OR. DZNRM2(M2,X2,INCX2) .NE. ZERO ) THEN
+         IF( DZNRM2(M1,X1,INCX1) .NE. REALZERO
+     $       .OR. DZNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN
             RETURN
          END IF
       END DO
@@ -257,8 +277,8 @@
          X2(I) = ONE
          CALL ZUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
      $                 LDQ2, WORK, LWORK, CHILDINFO )
-         IF( DZNRM2(M1,X1,INCX1) .NE. ZERO
-     $       .OR. DZNRM2(M2,X2,INCX2) .NE. ZERO ) THEN
+         IF( DZNRM2(M1,X1,INCX1) .NE. REALZERO
+     $       .OR. DZNRM2(M2,X2,INCX2) .NE. REALZERO ) THEN
             RETURN
          END IF
       END DO
diff --git a/lapack-netlib/SRC/zunbdb6.f b/lapack-netlib/SRC/zunbdb6.f
index ed666e449..ddc9dfc61 100644
--- a/lapack-netlib/SRC/zunbdb6.f
+++ b/lapack-netlib/SRC/zunbdb6.f
@@ -41,9 +41,8 @@
 *> with respect to the columns of
 *>      Q = [ Q1 ] .
 *>          [ Q2 ]
-*> The Euclidean norm of X must be one and the columns of Q must be
-*> orthonormal. The orthogonalized vector will be zero if and only if it
-*> lies entirely in the range of Q.
+*> The columns of Q must be orthonormal. The orthogonalized vector will
+*> be zero if and only if it lies entirely in the range of Q.
 *>
 *> The projection is computed with at most two iterations of the
 *> classical Gram-Schmidt algorithm, see
@@ -152,7 +151,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16OTHERcomputational
+*> \ingroup unbdb6
 *
 *  =====================================================================
       SUBROUTINE ZUNBDB6( M1, M2, N, X1, INCX1, X2, INCX2, Q1, LDQ1, Q2,
@@ -174,7 +173,7 @@
 *
 *     .. Parameters ..
       DOUBLE PRECISION   ALPHA, REALONE, REALZERO
-      PARAMETER          ( ALPHA = 0.01D0, REALONE = 1.0D0,
+      PARAMETER          ( ALPHA = 0.83D0, REALONE = 1.0D0,
      $                     REALZERO = 0.0D0 )
       COMPLEX*16         NEGONE, ONE, ZERO
       PARAMETER          ( NEGONE = (-1.0D0,0.0D0), ONE = (1.0D0,0.0D0),
@@ -223,14 +222,16 @@
 *
       EPS = DLAMCH( 'Precision' )
 *
+*     Compute the Euclidean norm of X
+*
+      SCL = REALZERO
+      SSQ = REALZERO
+      CALL ZLASSQ( M1, X1, INCX1, SCL, SSQ )
+      CALL ZLASSQ( M2, X2, INCX2, SCL, SSQ )
+      NORM = SCL * SQRT( SSQ )
+*
 *     First, project X onto the orthogonal complement of Q's column
 *     space
-*
-*     Christoph Conrads: In debugging mode the norm should be computed
-*     and an assertion added comparing the norm with one. Alas, Fortran
-*     never made it into 1989 when assert() was introduced into the C
-*     programming language.
-      NORM = REALONE
 *
       IF( M1 .EQ. 0 ) THEN
          DO I = 1, N
diff --git a/lapack-netlib/SRC/zuncsd2by1.f b/lapack-netlib/SRC/zuncsd2by1.f
index 2d861c2cc..399b598be 100644
--- a/lapack-netlib/SRC/zuncsd2by1.f
+++ b/lapack-netlib/SRC/zuncsd2by1.f
@@ -211,13 +211,13 @@
 *>          LRWORK is INTEGER
 *>          The dimension of the array RWORK.
 *>
-*>          If LRWORK = -1, then a workspace query is assumed; the routine
+*>          If LRWORK=-1, then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK and RWORK
 *>          arrays, returns this value as the first entry of the WORK
 *>          and RWORK array, respectively, and no error message related
 *>          to LWORK or LRWORK is issued by XERBLA.
 *> \endverbatim
-*
+*>
 *> \param[out] IWORK
 *> \verbatim
 *>          IWORK is INTEGER array, dimension (M-MIN(P,M-P,Q,M-Q))
diff --git a/lapack-netlib/SRC/zungtsqr.f b/lapack-netlib/SRC/zungtsqr.f
index 4f2d7dfdd..23e28ac5c 100644
--- a/lapack-netlib/SRC/zungtsqr.f
+++ b/lapack-netlib/SRC/zungtsqr.f
@@ -133,6 +133,7 @@
 *>
 *> \param[in] LWORK
 *> \verbatim
+*>          LWORK is INTEGER
 *>          The dimension of the array WORK.  LWORK >= (M+NB)*N.
 *>          If LWORK = -1, then a workspace query is assumed.
 *>          The routine only calculates the optimal size of the WORK
@@ -302,4 +303,4 @@
 *
 *     End of ZUNGTSQR
 *
-      END
\ No newline at end of file
+      END
diff --git a/lapack-netlib/TESTING/CMakeLists.txt b/lapack-netlib/TESTING/CMakeLists.txt
index b4e2223f7..d4e6f970d 100644
--- a/lapack-netlib/TESTING/CMakeLists.txt
+++ b/lapack-netlib/TESTING/CMakeLists.txt
@@ -54,6 +54,9 @@ add_lapack_test(sgqr.out gqr.in xeigtsts)
 add_lapack_test(sgsv.out gsv.in xeigtsts)
 add_lapack_test(scsd.out csd.in xeigtsts)
 add_lapack_test(slse.out lse.in xeigtsts)
+#
+# ======== SINGLE DMD EIG TESTS ===========================
+add_lapack_test(sdmd.out sdmd.in xdmdeigtsts)
 endif()
 
 if(BUILD_DOUBLE)
@@ -85,6 +88,9 @@ add_lapack_test(dgqr.out gqr.in xeigtstd)
 add_lapack_test(dgsv.out gsv.in xeigtstd)
 add_lapack_test(dcsd.out csd.in xeigtstd)
 add_lapack_test(dlse.out lse.in xeigtstd)
+#
+# ======== DOUBLE DMD EIG TESTS ===========================
+add_lapack_test(ddmd.out ddmd.in xdmdeigtstd)
 endif()
 
 if(BUILD_COMPLEX)
@@ -114,6 +120,9 @@ add_lapack_test(cgqr.out gqr.in xeigtstc)
 add_lapack_test(cgsv.out gsv.in xeigtstc)
 add_lapack_test(ccsd.out csd.in xeigtstc)
 add_lapack_test(clse.out lse.in xeigtstc)
+#
+# ======== COMPLEX DMD EIG TESTS ===========================
+add_lapack_test(cdmd.out cdmd.in xdmdeigtstc)
 endif()
 
 if(BUILD_COMPLEX16)
@@ -145,6 +154,9 @@ add_lapack_test(zgqr.out gqr.in xeigtstz)
 add_lapack_test(zgsv.out gsv.in xeigtstz)
 add_lapack_test(zcsd.out csd.in xeigtstz)
 add_lapack_test(zlse.out lse.in xeigtstz)
+#
+# ======== COMPLEX16 DMD EIG TESTS ===========================
+add_lapack_test(zdmd.out zdmd.in xdmdeigtstz)
 endif()
 
 
diff --git a/lapack-netlib/TESTING/EIG/CMakeLists.txt b/lapack-netlib/TESTING/EIG/CMakeLists.txt
index d252c7fa9..e4c4181b2 100644
--- a/lapack-netlib/TESTING/EIG/CMakeLists.txt
+++ b/lapack-netlib/TESTING/EIG/CMakeLists.txt
@@ -42,6 +42,8 @@ set(SEIGTST schkee.F
    sort03.f ssbt21.f ssgt01.f sslect.f sspt21.f sstt21.f
    sstt22.f ssyl01.f ssyt21.f ssyt22.f)
 
+set(SDMDEIGTST schkdmd.f90)
+
 set(CEIGTST cchkee.F
    cbdt01.f cbdt02.f cbdt03.f cbdt05.f
    cchkbb.f cchkbd.f cchkbk.f cchkbl.f cchkec.f
@@ -59,6 +61,8 @@ set(CEIGTST cchkee.F
    csgt01.f cslect.f csyl01.f
    cstt21.f cstt22.f cunt01.f cunt03.f)
 
+set(CDMDEIGTST cchkdmd.f90)
+
 set(DZIGTST dlafts.f dlahd2.f dlasum.f dlatb9.f dstech.f dstect.f
    dsvdch.f dsvdct.f dsxt1.f)
 
@@ -79,6 +83,8 @@ set(DEIGTST dchkee.F
    dort03.f dsbt21.f dsgt01.f dslect.f dspt21.f dstt21.f
    dstt22.f dsyl01.f dsyt21.f dsyt22.f)
 
+set(DDMDEIGTST dchkdmd.f90)
+
 set(ZEIGTST zchkee.F
    zbdt01.f zbdt02.f zbdt03.f zbdt05.f
    zchkbb.f zchkbd.f zchkbk.f zchkbl.f zchkec.f
@@ -96,24 +102,36 @@ set(ZEIGTST zchkee.F
    zsgt01.f zslect.f zsyl01.f
    zstt21.f zstt22.f zunt01.f zunt03.f)
 
+set(ZDMDEIGTST zchkdmd.f90)
+
 macro(add_eig_executable name)
   add_executable(${name} ${ARGN})
-  target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE})
+  target_link_libraries(${name} ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE})
+
+  if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
+	  string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
+    target_link_libraries(${name} omp pthread)
+  endif()
+
 #${TMGLIB} ../${LAPACK_LIBRARIES} ${BLAS_LIBRARIES})
 endmacro()
 
 if(BUILD_SINGLE)
 add_eig_executable(xeigtsts ${SEIGTST} ${SCIGTST} ${AEIGTST})
+add_eig_executable(xdmdeigtsts ${SDMDEIGTST})
 endif()
 
 if(BUILD_COMPLEX)
 add_eig_executable(xeigtstc ${CEIGTST} ${SCIGTST} ${AEIGTST})
+add_eig_executable(xdmdeigtstc ${CDMDEIGTST})
 endif()
 
 if(BUILD_DOUBLE)
 add_eig_executable(xeigtstd ${DEIGTST} ${DZIGTST} ${AEIGTST})
+add_eig_executable(xdmdeigtstd ${DDMDEIGTST})
 endif()
 
 if(BUILD_COMPLEX16)
 add_eig_executable(xeigtstz ${ZEIGTST} ${DZIGTST} ${AEIGTST})
+add_eig_executable(xdmdeigtstz ${ZDMDEIGTST})
 endif()
diff --git a/lapack-netlib/TESTING/EIG/Makefile b/lapack-netlib/TESTING/EIG/Makefile
index 942ae6982..9cf0fc95e 100644
--- a/lapack-netlib/TESTING/EIG/Makefile
+++ b/lapack-netlib/TESTING/EIG/Makefile
@@ -64,6 +64,8 @@ SEIGTST = schkee.o \
    sort03.o ssbt21.o ssgt01.o sslect.o sspt21.o sstt21.o \
    sstt22.o ssyl01.o ssyt21.o ssyt22.o
 
+SDMDEIGTST = schkdmd.o
+
 CEIGTST = cchkee.o \
    cbdt01.o cbdt02.o cbdt03.o cbdt05.o \
    cchkbb.o cchkbd.o cchkbk.o cchkbl.o cchkec.o \
@@ -81,6 +83,8 @@ CEIGTST = cchkee.o \
    csgt01.o cslect.o csyl01.o\
    cstt21.o cstt22.o cunt01.o cunt03.o
 
+CDMDEIGTST = cchkdmd.o
+
 DZIGTST = dlafts.o dlahd2.o dlasum.o dlatb9.o dstech.o dstect.o \
    dsvdch.o dsvdct.o dsxt1.o
 
@@ -101,6 +105,8 @@ DEIGTST = dchkee.o \
    dort03.o dsbt21.o dsgt01.o dslect.o dspt21.o dstt21.o \
    dstt22.o dsyl01.o dsyt21.o dsyt22.o
 
+DDMDEIGTST = dchkdmd.o
+
 ZEIGTST = zchkee.o \
    zbdt01.o zbdt02.o zbdt03.o zbdt05.o \
    zchkbb.o zchkbd.o zchkbk.o zchkbl.o zchkec.o \
@@ -118,14 +124,28 @@ ZEIGTST = zchkee.o \
    zsgt01.o zslect.o zsyl01.o\
    zstt21.o zstt22.o zunt01.o zunt03.o
 
+ZDMDEIGTST = zchkdmd.o
+
 .PHONY: all
 all: single complex double complex16
 
 .PHONY: single complex double complex16
-single: xeigtsts
-complex: xeigtstc
-double: xeigtstd
-complex16: xeigtstz
+single: xeigtsts xdmdeigtsts
+complex: xeigtstc xdmdeigtstc
+double: xeigtstd xdmdeigtstd
+complex16: xeigtstz xdmdeigtstz
+
+xdmdeigtsts: $(SDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB)
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
+
+xdmdeigtstc: $(CDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB)
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
+
+xdmdeigtstd: $(DDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB)
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
+
+xdmdeigtstz: $(ZDMDEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB)
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
 xeigtsts: $(SEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB)
 	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
@@ -139,6 +159,10 @@ xeigtstd: $(DEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB)
 xeigtstz: $(ZEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB)
 	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
+$(SDMDEIGTST): $(FRC)
+$(CDMDEIGTST): $(FRC)
+$(DDMDEIGTST): $(FRC)
+$(ZDMDEIGTST): $(FRC)
 $(AEIGTST): $(FRC)
 $(SCIGTST): $(FRC)
 $(DZIGTST): $(FRC)
@@ -155,7 +179,7 @@ clean: cleanobj cleanexe
 cleanobj:
 	rm -f *.o
 cleanexe:
-	rm -f xeigtst*
+	rm -f xeigtst* xdmdeigtst*
 
 schkee.o: schkee.F
 	$(FC) $(FFLAGS_DRV) -c -o $@ $<
@@ -165,3 +189,11 @@ cchkee.o: cchkee.F
 	$(FC) $(FFLAGS_DRV) -c -o $@ $<
 zchkee.o: zchkee.F
 	$(FC) $(FFLAGS_DRV) -c -o $@ $<
+schkdmd.o: schkdmd.f90
+	$(FC) $(FFLAGS_DRV) -c -o $@ $<
+cchkdmd.o: cchkdmd.f90
+	$(FC) $(FFLAGS_DRV) -c -o $@ $<
+dchkdmd.o: dchkdmd.f90
+	$(FC) $(FFLAGS_DRV) -c -o $@ $<
+zchkdmd.o: zchkdmd.f90
+	$(FC) $(FFLAGS_DRV) -c -o $@ $<
diff --git a/lapack-netlib/TESTING/EIG/alareq.f b/lapack-netlib/TESTING/EIG/alareq.f
index 2585a686a..2cbe6db38 100644
--- a/lapack-netlib/TESTING/EIG/alareq.f
+++ b/lapack-netlib/TESTING/EIG/alareq.f
@@ -28,12 +28,12 @@
 *> to evaluate the input line which requested NMATS matrix types for
 *> PATH.  The flow of control is as follows:
 *>
-*> If NMATS = NTYPES then
+*> IF NMATS = NTYPES THEN
 *>    DOTYPE(1:NTYPES) = .TRUE.
-*> else
+*> ELSE
 *>    Read the next input line for NMATS matrix types
 *>    Set DOTYPE(I) = .TRUE. for each valid type I
-*> endif
+*> END IF
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/TESTING/EIG/alarqg.f b/lapack-netlib/TESTING/EIG/alarqg.f
index 6e2e6e7ec..b9fb88c65 100644
--- a/lapack-netlib/TESTING/EIG/alarqg.f
+++ b/lapack-netlib/TESTING/EIG/alarqg.f
@@ -28,12 +28,12 @@
 *> to evaluate the input line which requested NMATS matrix types for
 *> PATH.  The flow of control is as follows:
 *>
-*> If NMATS = NTYPES then
+*> IF NMATS = NTYPES THEN
 *>    DOTYPE(1:NTYPES) = .TRUE.
-*> else
+*> ELSE
 *>    Read the next input line for NMATS matrix types
 *>    Set DOTYPE(I) = .TRUE. for each valid type I
-*> endif
+*> END IF
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/TESTING/EIG/cchkdmd.f90 b/lapack-netlib/TESTING/EIG/cchkdmd.f90
new file mode 100644
index 000000000..a9c181da9
--- /dev/null
+++ b/lapack-netlib/TESTING/EIG/cchkdmd.f90
@@ -0,0 +1,721 @@
+!   This is a test program for checking the implementations of
+!   the following subroutines
+!
+!   CGEDMD,  for computation of the
+!            Dynamic Mode Decomposition (DMD)
+!   CGEDMDQ, for computation of a
+!            QR factorization based compressed DMD
+!
+!   Developed and supported by:
+!   ===========================
+!   Developed and coded by Zlatko Drmac, Faculty of Science,
+!   University of Zagreb;  drmac@math.hr
+!   In cooperation with
+!   AIMdyn Inc., Santa Barbara, CA.
+!   ========================================================
+!   How to run the code (compiler, link info)
+!   ========================================================
+!   Compile as FORTRAN 90 (or later) and link with BLAS and
+!   LAPACK libraries.
+!   NOTE: The code is developed and tested on top of the
+!   Intel MKL library (versions 2022.0.3 and 2022.2.0),
+!   using the Intel Fortran compiler.
+!
+!   For developers of the C++ implementation
+!   ========================================================
+!   See the LAPACK++ and Template Numerical Toolkit (TNT)
+!
+!   Note on a development of the GPU HP implementation
+!   ========================================================
+!   Work in progress. See CUDA, MAGMA, SLATE.
+!   NOTE: The four SVD subroutines used in this code are
+!   included as a part of R&D and for the completeness.
+!   This was also an opportunity to test those SVD codes.
+!   If the scaling option is used all four are essentially
+!   equally good. For implementations on HP platforms,
+!   one can use whichever SVD is available.
+!............................................................
+
+!............................................................
+!............................................................
+!
+      PROGRAM DMD_TEST
+
+      use iso_fortran_env
+      IMPLICIT NONE
+      integer, parameter :: WP = real32
+!............................................................
+      REAL(KIND=WP), PARAMETER ::  ONE = 1.0_WP
+      REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP
+
+      COMPLEX(KIND=WP), PARAMETER ::  CONE = ( 1.0_WP, 0.0_WP )
+      COMPLEX(KIND=WP), PARAMETER :: CZERO = ( 0.0_WP, 0.0_WP )
+!............................................................
+      REAL(KIND=WP), ALLOCATABLE, DIMENSION(:)   :: RES, &
+                     RES1, RESEX, SINGVX, SINGVQX, WORK
+      INTEGER      , ALLOCATABLE, DIMENSION(:)   ::   IWORK
+      REAL(KIND=WP) :: WDUMMY(2)
+      INTEGER       :: IDUMMY(4), ISEED(4)
+      REAL(KIND=WP) :: ANORM, COND, CONDL, CONDR, EPS,       &
+                       TOL, TOL2, SVDIFF, TMP, TMP_AU,       &
+                       TMP_FQR, TMP_REZ, TMP_REZQ,  TMP_XW, &
+                       TMP_EX
+!............................................................
+      COMPLEX(KIND=WP) :: CMAX
+      INTEGER :: LCWORK
+      COMPLEX(KIND=WP), ALLOCATABLE, DIMENSION(:,:) ::  A, AC,  &
+                                 AU, F, F0, F1, S, W,  &
+                                 X, X0, Y, Y0, Y1, Z, Z1
+      COMPLEX(KIND=WP), ALLOCATABLE, DIMENSION(:)   ::  CDA, CDR, &
+                                       CDL, CEIGS, CEIGSA, CWORK
+      COMPLEX(KIND=WP) ::  CDUMMY(22), CDUM2X2(2,2)
+!............................................................
+      INTEGER :: K, KQ, LDF, LDS, LDA, LDAU, LDW, LDX, LDY,  &
+                 LDZ, LIWORK, LWORK, M, N, LLOOP, NRNK
+      INTEGER :: i, iJOBREF, iJOBZ, iSCALE, INFO, j,     &
+                 NFAIL, NFAIL_AU, NFAIL_F_QR, NFAIL_REZ,     &
+                 NFAIL_REZQ, NFAIL_SVDIFF, NFAIL_TOTAL, NFAILQ_TOTAL,  &
+                 NFAIL_Z_XV,  MODE, MODEL, MODER, WHTSVD
+      INTEGER :: iNRNK, iWHTSVD,  K_traj, LWMINOPT
+      CHARACTER :: GRADE, JOBREF, JOBZ, PIVTNG, RSIGN,   &
+                   SCALE, RESIDS, WANTQ, WANTR
+      LOGICAL :: TEST_QRDMD
+
+!..... external subroutines (BLAS and LAPACK)
+      EXTERNAL CAXPY, CGEEV, CGEMM, CGEMV, CLASCL
+!.....external subroutines DMD package
+!     subroutines under test
+      EXTERNAL CGEDMD, CGEDMDQ
+!..... external functions (BLAS and LAPACK)
+      EXTERNAL         SCNRM2, SLAMCH
+      REAL(KIND=WP) :: SCNRM2, SLAMCH
+      EXTERNAL         CLANGE
+      REAL(KIND=WP) :: CLANGE
+      EXTERNAL ICAMAX
+      INTEGER  ICAMAX
+      EXTERNAL LSAME
+      LOGICAL  LSAME
+
+      INTRINSIC ABS, INT, MIN, MAX, SIGN
+!............................................................
+
+
+      WRITE(*,*) 'COMPLEX CODE TESTING'
+
+      ! The test is always in pairs : ( CGEDMD and CGEDMDQ)
+      ! because the test includes comparing the results (in pairs).
+!.....................................................................................
+      ! This code by default performs tests on CGEDMDQ
+      ! Since the QR factorizations based algorithm is designed for
+      ! single trajectory data, only single trajectory tests will
+      ! be performed with xGEDMDQ.
+
+      WANTQ = 'Q'
+      WANTR = 'R'
+!.................................................................................
+
+      EPS = SLAMCH( 'P' )  ! machine precision WP
+
+      ! Global counters of failures of some particular tests
+      NFAIL      = 0
+      NFAIL_REZ  = 0
+      NFAIL_REZQ = 0
+      NFAIL_Z_XV = 0
+      NFAIL_F_QR = 0
+      NFAIL_AU   = 0
+      NFAIL_SVDIFF = 0
+      NFAIL_TOTAL  = 0
+      NFAILQ_TOTAL = 0
+
+      DO LLOOP = 1, 4
+
+      WRITE(*,*) 'L Loop Index = ', LLOOP
+
+      ! Set the dimensions of the problem ...
+      READ(*,*) M
+      WRITE(*,*) 'M = ', M
+      ! ... and the number of snapshots.
+      READ(*,*) N
+      WRITE(*,*) 'N = ', N
+
+      ! Test the dimensions
+      IF ( ( MIN(M,N) == 0 ) .OR. ( M < N )  ) THEN
+          WRITE(*,*) 'Bad dimensions. Required: M >= N > 0.'
+          STOP
+      END IF
+!.............
+      ! The seed inside the LLOOP so that each pass can be reproduced easily.
+      ISEED(1) = 4
+      ISEED(2) = 3
+      ISEED(3) = 2
+      ISEED(4) = 1
+
+      LDA  = M
+      LDF  = M
+      LDX  = M
+      LDY  = M
+      LDW  = N
+      LDZ  = M
+      LDAU = M
+      LDS  = N
+
+      TMP_XW  = ZERO
+      TMP_AU   = ZERO
+      TMP_REZ  = ZERO
+      TMP_REZQ = ZERO
+      SVDIFF   = ZERO
+      TMP_EX   = ZERO
+
+      ALLOCATE( A(LDA,M) )
+      ALLOCATE( AC(LDA,M) )
+      ALLOCATE( F(LDF,N+1) )
+      ALLOCATE( F0(LDF,N+1) )
+      ALLOCATE( F1(LDF,N+1) )
+      ALLOCATE( X(LDX,N) )
+      ALLOCATE( X0(LDX,N) )
+      ALLOCATE( Y(LDY,N+1) )
+      ALLOCATE( Y0(LDY,N+1) )
+      ALLOCATE( Y1(LDY,N+1) )
+      ALLOCATE( AU(LDAU,N) )
+      ALLOCATE( W(LDW,N) )
+      ALLOCATE( S(LDS,N) )
+      ALLOCATE( Z(LDZ,N) )
+      ALLOCATE( Z1(LDZ,N) )
+      ALLOCATE( RES(N) )
+      ALLOCATE( RES1(N) )
+      ALLOCATE( RESEX(N) )
+      ALLOCATE( CEIGS(N) )
+      ALLOCATE( SINGVX(N) )
+      ALLOCATE( SINGVQX(N) )
+
+      TOL  = 10*M*EPS
+      TOL2 = 10*M*N*EPS
+
+!.............
+
+      DO K_traj = 1, 2
+      !  Number of initial conditions in the simulation/trajectories (1 or 2)
+
+      COND   = 1.0D4
+      CMAX   = (1.0D1,1.0D1)
+      RSIGN  = 'F'
+      GRADE  = 'N'
+      MODEL  = 6
+      CONDL  = 1.0D1
+      MODER  = 6
+      CONDR  = 1.0D1
+      PIVTNG = 'N'
+      ! Loop over all parameter MODE values for CLATMR (+-1,..,+-6)
+
+      DO MODE = 1, 6
+
+      ALLOCATE( IWORK(2*M) )
+      ALLOCATE( CDA(M) )
+      ALLOCATE( CDL(M) )
+      ALLOCATE( CDR(M) )
+
+      CALL CLATMR( M, M, 'N', ISEED, 'N', CDA, MODE, COND, &
+                   CMAX, RSIGN, GRADE, CDL, MODEL,  CONDL, &
+                   CDR, MODER, CONDR, PIVTNG, IWORK, M, M, &
+                   ZERO, -ONE, 'N', A, LDA, IWORK(M+1), INFO )
+      DEALLOCATE( CDR )
+      DEALLOCATE( CDL )
+      DEALLOCATE( CDA )
+      DEALLOCATE( IWORK )
+
+      LCWORK = MAX(1,2*M)
+      ALLOCATE( CEIGSA(M) )
+      ALLOCATE( CWORK(LCWORK) )
+      ALLOCATE( WORK(2*M) )
+      AC(1:M,1:M) = A(1:M,1:M)
+      CALL CGEEV( 'N','N', M, AC, LDA, CEIGSA, CDUM2X2, 2, &
+                  CDUM2X2, 2, CWORK, LCWORK, WORK, INFO ) ! LAPACK CALL
+      DEALLOCATE(WORK)
+      DEALLOCATE(CWORK)
+
+      TMP = ABS(CEIGSA(ICAMAX(M, CEIGSA, 1))) ! The spectral radius of A
+      ! Scale the matrix A to have unit spectral radius.
+      CALL CLASCL( 'G',0, 0, TMP, ONE, M, M, &
+                   A, LDA, INFO )
+      CALL CLASCL( 'G',0, 0, TMP, ONE, M, 1, &
+                   CEIGSA, M, INFO )
+      ANORM = CLANGE( 'F', M, M, A, LDA, WDUMMY )
+
+      IF ( K_traj == 2 ) THEN
+          ! generate data as two trajectories
+          ! with two initial conditions
+          CALL CLARNV(2, ISEED, M, F(1,1) )
+          DO i = 1, N/2
+             CALL CGEMV( 'N', M, M, CONE, A, LDA, F(1,i), 1,  &
+                  CZERO, F(1,i+1), 1 )
+          END DO
+          X0(1:M,1:N/2) = F(1:M,1:N/2)
+          Y0(1:M,1:N/2) = F(1:M,2:N/2+1)
+
+          CALL CLARNV(2, ISEED, M, F(1,1) )
+          DO i = 1, N-N/2
+             CALL CGEMV( 'N', M, M, CONE, A, LDA, F(1,i), 1,  &
+                  CZERO, F(1,i+1), 1 )
+          END DO
+          X0(1:M,N/2+1:N) = F(1:M,1:N-N/2)
+          Y0(1:M,N/2+1:N) = F(1:M,2:N-N/2+1)
+      ELSE
+          CALL CLARNV(2, ISEED, M, F(1,1) )
+          DO i = 1, N
+             CALL CGEMV( 'N', M, M, CONE, A, M, F(1,i), 1,  &
+                  CZERO, F(1,i+1), 1 )
+          END DO
+          F0(1:M,1:N+1) = F(1:M,1:N+1)
+          X0(1:M,1:N) = F0(1:M,1:N)
+          Y0(1:M,1:N) = F0(1:M,2:N+1)
+      END IF
+
+      DEALLOCATE( CEIGSA )
+!........................................................................
+
+      DO iJOBZ = 1, 4
+
+          SELECT CASE ( iJOBZ )
+          CASE(1)
+              JOBZ   = 'V'
+              RESIDS = 'R'
+          CASE(2)
+              JOBZ   = 'V'
+              RESIDS = 'N'
+          CASE(3)
+              JOBZ   = 'F'
+              RESIDS = 'N'
+          CASE(4)
+              JOBZ   = 'N'
+              RESIDS = 'N'
+          END SELECT
+
+      DO iJOBREF = 1, 3
+
+          SELECT CASE ( iJOBREF )
+          CASE(1)
+              JOBREF = 'R'
+          CASE(2)
+              JOBREF = 'E'
+          CASE(3)
+              JOBREF = 'N'
+          END SELECT
+
+      DO iSCALE = 1, 4
+
+          SELECT CASE ( iSCALE )
+          CASE(1)
+              SCALE = 'S'
+          CASE(2)
+              SCALE = 'C'
+          CASE(3)
+              SCALE = 'Y'
+          CASE(4)
+              SCALE = 'N'
+          END SELECT
+
+      DO iNRNK = -1, -2, -1
+          NRNK   = iNRNK
+
+      DO iWHTSVD = 1,  3
+         ! Check the first three of the four options to compute the
+         ! POD basis via the SVD (WHTSVD = 1, 2, 3).
+         WHTSVD   = iWHTSVD
+
+      DO LWMINOPT = 1, 2
+         ! Workspace query for the minimal (1) and for the optimal
+         ! (2) workspace lengths determined by workspace query.
+
+      ! CGEDMD is always tested and its results are also used for
+      ! comparisons with CGEDMDQ.
+
+      X(1:M,1:N) = X0(1:M,1:N)
+      Y(1:M,1:N) = Y0(1:M,1:N)
+
+      CALL CGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD,  &
+                M,  N, X, LDX, Y, LDY, NRNK, TOL,  &
+                K, CEIGS, Z, LDZ,  RES,  &
+                AU, LDAU, W,  LDW,   S, LDS,        &
+                CDUMMY, -1, WDUMMY, -1, IDUMMY, -1, INFO )
+
+      IF ( (INFO .EQ. 2) .OR. ( INFO .EQ. 3 ) &
+                       .OR. ( INFO < 0 ) ) THEN
+        WRITE(*,*) 'Call to CGEDMD workspace query failed. &
+                   &Check the calling sequence and the code.'
+        WRITE(*,*) 'The error code is ', INFO
+        WRITE(*,*) 'The input parameters were ',      &
+        SCALE, JOBZ, RESIDS, JOBREF, WHTSVD,          &
+        M, N, LDX, LDY, NRNK, TOL, LDZ, LDAU, LDW, LDS
+        STOP
+      ELSE
+        !WRITE(*,*) '... done. Workspace length computed.'
+      END IF
+
+      LCWORK = INT(CDUMMY(LWMINOPT))
+      ALLOCATE(CWORK(LCWORK))
+      LIWORK = IDUMMY(1)
+      ALLOCATE(IWORK(LIWORK))
+      LWORK = INT(WDUMMY(1))
+      ALLOCATE(WORK(LWORK))
+
+      CALL CGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD,  &
+                   M,  N, X, LDX, Y, LDY, NRNK, TOL,  &
+                   K, CEIGS, Z, LDZ,  RES,  &
+                   AU, LDAU, W,  LDW,   S, LDS,        &
+                   CWORK, LCWORK, WORK, LWORK, IWORK, LIWORK, INFO )
+      IF ( INFO /= 0 ) THEN
+           WRITE(*,*) 'Call to CGEDMD failed. &
+           &Check the calling sequence and the code.'
+           WRITE(*,*) 'The error code is ', INFO
+           WRITE(*,*) 'The input parameters were ',&
+           SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, &
+           M, N, LDX, LDY, NRNK, TOL
+           STOP
+      END IF
+      SINGVX(1:N) = WORK(1:N)
+
+      !...... CGEDMD check point
+      IF ( LSAME(JOBZ,'V')  ) THEN
+          ! Check that Z = X*W, on return from CGEDMD
+          ! This checks that the returned eigenvectors in Z are
+          ! the product of the SVD/POD basis returned in X
+          ! and the eigenvectors of the Rayleigh quotient
+          ! returned in W
+          CALL CGEMM( 'N', 'N', M, K, K, CONE, X, LDX, W, LDW, &
+                      CZERO, Z1, LDZ )
+          TMP = ZERO
+          DO i = 1, K
+             CALL CAXPY( M, -CONE, Z(1,i), 1, Z1(1,i), 1)
+             TMP = MAX(TMP, SCNRM2( M, Z1(1,i), 1 ) )
+          END DO
+          TMP_XW = MAX(TMP_XW, TMP )
+          IF ( TMP_XW <= TOL ) THEN
+              !WRITE(*,*) ' :) .... OK .........CGEDMD PASSED.'
+          ELSE
+              NFAIL_Z_XV = NFAIL_Z_XV + 1
+              WRITE(*,*) ':( .................CGEDMD FAILED!', &
+                  'Check the code for implementation errors.'
+              WRITE(*,*) 'The input parameters were ',&
+                 SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, &
+                 M, N, LDX, LDY, NRNK, TOL
+          END IF
+      END IF
+      !...... CGEDMD check point
+
+      IF ( LSAME(JOBREF,'R') ) THEN
+           ! The matrix A*U is returned for computing refined Ritz vectors.
+           ! Check that A*U is computed correctly using the formula
+           ! A*U = Y * V * inv(SIGMA). This depends on the
+           ! accuracy in the computed singular values and vectors of X.
+           ! See the paper for an error analysis.
+           ! Note that the left singular vectors of the input matrix X
+           ! are returned in the array X.
+           CALL CGEMM( 'N', 'N', M, K, M, CONE, A, LDA, X, LDX, &
+                      CZERO, Z1, LDZ )
+          TMP = ZERO
+          DO i = 1, K
+             CALL CAXPY( M, -CONE, AU(1,i), 1, Z1(1,i), 1)
+             TMP = MAX( TMP, SCNRM2( M, Z1(1,i),1 ) * &
+                     SINGVX(K)/(ANORM*SINGVX(1)) )
+          END DO
+          TMP_AU = MAX( TMP_AU, TMP )
+          IF ( TMP <= TOL2 ) THEN
+              !WRITE(*,*) ':) .... OK .........CGEDMD PASSED.'
+          ELSE
+              NFAIL_AU = NFAIL_AU + 1
+              WRITE(*,*) ':( .................CGEDMD FAILED!', &
+                  'Check the code for implementation errors.'
+              WRITE(*,*) 'The input parameters were ',&
+                 SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, &
+                 M, N, LDX, LDY, NRNK, TOL2
+          END IF
+      ELSEIF ( LSAME(JOBREF,'E') ) THEN
+          ! The unscaled vectors of the Exact DMD are computed.
+          ! This option is included for the sake of completeness,
+          ! for users who prefer the Exact DMD vectors. The
+          ! returned vectors are in the real form, in the same way
+          ! as the Ritz vectors. Here we just save the vectors
+          ! and test them separately using a Matlab script.
+          CALL CGEMM( 'N', 'N', M, K, M, CONE, A, LDA, AU, LDAU, CZERO, Y1, LDY )
+
+          DO i=1, K
+             CALL CAXPY( M, -CEIGS(i), AU(1,i), 1, Y1(1,i), 1 )
+             RESEX(i) = SCNRM2( M, Y1(1,i), 1) / SCNRM2(M,AU(1,i),1)
+          END DO
+      END IF
+      !...... CGEDMD check point
+
+      IF ( LSAME(RESIDS, 'R') ) THEN
+          ! Compare the residuals returned by CGEDMD with the
+          ! explicitly computed residuals using the matrix A.
+          ! Compute explicitly Y1 = A*Z
+          CALL CGEMM( 'N', 'N', M, K, M, CONE, A, LDA, Z, LDZ, CZERO, Y1, LDY )
+          ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms
+          ! of the invariant subspaces that correspond to complex conjugate
+          ! pairs of eigenvalues. (See the description of Z in CGEDMD.)
+
+          DO i=1, K
+                ! have a real eigenvalue with real eigenvector
+                CALL CAXPY( M, -CEIGS(i), Z(1,i), 1, Y1(1,i), 1 )
+                RES1(i) = SCNRM2( M, Y1(1,i), 1)
+          END DO
+          TMP = ZERO
+          DO i = 1, K
+          TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * &
+                    SINGVX(K)/(ANORM*SINGVX(1)) )
+          END DO
+          TMP_REZ = MAX( TMP_REZ, TMP )
+          IF ( TMP <= TOL2 ) THEN
+              !WRITE(*,*) ':) .... OK ..........CGEDMD PASSED.'
+          ELSE
+              NFAIL_REZ = NFAIL_REZ + 1
+              WRITE(*,*) ':( ..................CGEDMD FAILED!', &
+                  'Check the code for implementation errors.'
+              WRITE(*,*) 'The input parameters were ',&
+                 SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, &
+                 M, N, LDX, LDY, NRNK, TOL
+          END IF
+
+
+         IF ( LSAME(JOBREF,'E') ) THEN
+            TMP = ZERO
+          DO i = 1, K
+          TMP = MAX( TMP, ABS(RES1(i) - RESEX(i))/(RES1(i)+RESEX(i)) )
+          END DO
+          TMP_EX = MAX(TMP_EX,TMP)
+         END IF
+
+      END IF
+
+      DEALLOCATE(CWORK)
+      DEALLOCATE(WORK)
+      DEALLOCATE(IWORK)
+
+!.......................................................................................................
+
+      IF ( K_traj == 1 ) THEN
+
+          F(1:M,1:N+1) = F0(1:M,1:N+1)
+          CALL CGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, JOBREF, &
+                    WHTSVD, M, N+1, F, LDF,  X, LDX,  Y, LDY,  &
+                    NRNK,  TOL, K, CEIGS, Z, LDZ, RES,  AU,  &
+                    LDAU, W, LDW, S, LDS, CDUMMY, -1,   &
+                    WDUMMY,  -1, IDUMMY, -1, INFO )
+
+          LCWORK = INT(CDUMMY(LWMINOPT))
+          ALLOCATE(CWORK(LCWORK))
+          LIWORK = IDUMMY(1)
+          ALLOCATE(IWORK(LIWORK))
+          LWORK = INT(WDUMMY(1))
+          ALLOCATE(WORK(LWORK))
+
+          CALL CGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, JOBREF, &
+                        WHTSVD, M, N+1, F, LDF,  X, LDX,  Y, LDY,  &
+                        NRNK,  TOL, KQ, CEIGS, Z, LDZ, RES,  AU,  &
+                        LDAU, W, LDW, S, LDS, CWORK, LCWORK,   &
+                        WORK,  LWORK, IWORK, LIWORK, INFO )
+          IF ( INFO /= 0 ) THEN
+                 WRITE(*,*) 'Call to CGEDMDQ failed. &
+                 &Check the calling sequence and the code.'
+                 WRITE(*,*) 'The error code is ', INFO
+                 WRITE(*,*) 'The input parameters were ',&
+                 SCALE, JOBZ, RESIDS, WANTQ, WANTR, WHTSVD, &
+                 M, N, LDX, LDY, NRNK, TOL
+                 STOP
+          END IF
+          SINGVQX(1:N) =WORK(1:N)
+
+          !..... CGEDMDQ check point
+
+          TMP = ZERO
+          DO i = 1, MIN(K, KQ)
+             TMP = MAX(TMP, ABS(SINGVX(i)-SINGVQX(i)) / &
+                                   SINGVX(1) )
+          END DO
+          SVDIFF = MAX( SVDIFF, TMP )
+          IF ( TMP > TOL2 ) THEN
+               WRITE(*,*) 'FAILED! Something was wrong with the run.'
+             NFAIL_SVDIFF = NFAIL_SVDIFF + 1
+          END IF
+          !..... CGEDMDQ check point
+
+          !..... CGEDMDQ check point
+          IF ( LSAME(WANTQ,'Q') .AND. LSAME(WANTR,'R') ) THEN
+             ! Check that the QR factors are computed and returned
+             ! as requested. The residual ||F-Q*R||_F / ||F||_F
+             ! is compared to M*N*EPS.
+             F1(1:M,1:N+1) = F0(1:M,1:N+1)
+             CALL CGEMM( 'N', 'N', M, N+1, MIN(M,N+1), -CONE, F, &
+                         LDF, Y, LDY, CONE, F1, LDF )
+             TMP_FQR = CLANGE( 'F', M, N+1, F1, LDF, WORK ) / &
+                   CLANGE( 'F', M, N+1, F0,  LDF, WORK )
+             IF ( TMP_FQR <= TOL2 ) THEN
+                !WRITE(*,*) ':) CGEDMDQ ........ PASSED.'
+             ELSE
+                WRITE(*,*) ':( CGEDMDQ ........ FAILED.'
+                NFAIL_F_QR = NFAIL_F_QR + 1
+             END IF
+          END IF
+          !..... CGEDMDQ checkpoint
+                 !..... CGEDMDQ checkpoint
+          IF ( LSAME(RESIDS, 'R') ) THEN
+              ! Compare the residuals returned by CGEDMDQ with the
+              ! explicitly computed residuals using the matrix A.
+              ! Compute explicitly Y1 = A*Z
+              CALL CGEMM( 'N', 'N', M, KQ, M, CONE, A, LDA, Z, LDZ, CZERO, Y1, LDY )
+              ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms
+              ! of the invariant subspaces that correspond to complex conjugate
+              ! pairs of eigenvalues. (See the description of Z in CGEDMDQ)
+              DO i = 1, KQ
+                    ! have a real eigenvalue with real eigenvector
+                    CALL CAXPY( M, -CEIGS(i), Z(1,i), 1, Y1(1,i), 1 )
+                    ! Y1(1:M,i) = Y1(1:M,i) - CEIGS(i)*Z(1:M,i)
+                    RES1(i) = SCNRM2( M, Y1(1,i), 1)
+              END DO
+              TMP = ZERO
+              DO i = 1, KQ
+              TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * &
+                  SINGVQX(KQ)/(ANORM*SINGVQX(1)) )
+              END DO
+              TMP_REZQ = MAX( TMP_REZQ, TMP )
+              IF ( TMP <= TOL2 ) THEN
+                  !WRITE(*,*) '.... OK ........ CGEDMDQ PASSED.'
+              ELSE
+                  NFAIL_REZQ = NFAIL_REZQ + 1
+                  WRITE(*,*) '................ CGEDMDQ FAILED!', &
+                      'Check the code for implementation errors.'
+              END IF
+          END IF
+
+          DEALLOCATE(CWORK)
+          DEALLOCATE(WORK)
+          DEALLOCATE(IWORK)
+
+      END IF
+
+      END DO   ! LWMINOPT
+      !write(*,*) 'LWMINOPT loop completed'
+      END DO   ! iWHTSVD
+      !write(*,*) 'WHTSVD loop completed'
+      END DO   ! iNRNK  -2:-1
+      !write(*,*) 'NRNK loop completed'
+      END DO   ! iSCALE  1:4
+      !write(*,*) 'SCALE loop completed'
+      END DO
+      !write(*,*) 'JOBREF loop completed'
+      END DO   ! iJOBZ
+      !write(*,*) 'JOBZ loop completed'
+
+      END DO ! MODE -6:6
+      !write(*,*) 'MODE loop completed'
+      END DO ! 1 or 2 trajectories
+      !write(*,*) 'trajectories  loop completed'
+
+      DEALLOCATE( A )
+      DEALLOCATE( AC )
+      DEALLOCATE( Z )
+      DEALLOCATE( F )
+      DEALLOCATE( F0 )
+      DEALLOCATE( F1 )
+      DEALLOCATE( X )
+      DEALLOCATE( X0 )
+      DEALLOCATE( Y )
+      DEALLOCATE( Y0 )
+      DEALLOCATE( Y1 )
+      DEALLOCATE( AU )
+      DEALLOCATE( W )
+      DEALLOCATE( S )
+      DEALLOCATE( Z1 )
+      DEALLOCATE( RES )
+      DEALLOCATE( RES1 )
+      DEALLOCATE( RESEX )
+      DEALLOCATE( CEIGS )
+      DEALLOCATE( SINGVX )
+      DEALLOCATE( SINGVQX )
+
+      END DO ! LLOOP
+
+      WRITE(*,*)
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*) ' Test summary for CGEDMD :'
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*)
+      IF ( NFAIL_Z_XV == 0 ) THEN
+          WRITE(*,*) '>>>> Z - U*V test PASSED.'
+      ELSE
+          WRITE(*,*) 'Z - U*V test FAILED ', NFAIL_Z_XV, ' time(s)'
+          WRITE(*,*) 'Max error ||Z-U*V||_F was ', TMP_XW
+          NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_z_XV
+      END IF
+
+      IF ( NFAIL_AU == 0 ) THEN
+          WRITE(*,*) '>>>> A*U test PASSED. '
+      ELSE
+          WRITE(*,*) 'A*U test FAILED ', NFAIL_AU, ' time(s)'
+          WRITE(*,*) 'Max A*U test adjusted error measure was ', TMP_AU
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_AU
+      END IF
+
+
+      IF ( NFAIL_REZ == 0 ) THEN
+         WRITE(*,*) '>>>> Rezidual computation test PASSED.'
+      ELSE
+        WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZ, 'time(s)'
+        WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZ
+        WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+        NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_REZ
+      END IF
+      IF ( NFAIL_TOTAL == 0 ) THEN
+        WRITE(*,*) '>>>> CGEDMD :: ALL TESTS PASSED.'
+      ELSE
+        WRITE(*,*) NFAIL_TOTAL, 'FAILURES!'
+        WRITE(*,*) '>>>>>>>>>>>>>> CGEDMD :: TESTS FAILED. CHECK THE IMPLEMENTATION.'
+      END IF
+
+      WRITE(*,*)
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*) ' Test summary for CGEDMDQ :'
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*)
+
+      ! Report the CGEDMD-vs-CGEDMDQ singular value comparison (counted against TOL2).
+      IF ( NFAIL_SVDIFF == 0 ) THEN
+        WRITE(*,*) '>>>> CGEDMD and CGEDMDQ computed singular values test PASSED.'
+      ELSE
+        WRITE(*,*) 'CGEDMD and CGEDMDQ discrepancies in &
+            &the singular values unacceptable ', &
+            NFAIL_SVDIFF, ' times. Test FAILED.'
+        WRITE(*,*) 'The maximal discrepancy in the singular values (relative to the norm) was ', SVDIFF
+        WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+        NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_SVDIFF
+      END IF
+      IF ( NFAIL_F_QR == 0 ) THEN
+        WRITE(*,*) '>>>> F - Q*R test PASSED.'
+      ELSE
+        WRITE(*,*) 'F - Q*R test FAILED ', NFAIL_F_QR, ' time(s)'
+        WRITE(*,*) 'The largest relative residual was ', TMP_FQR
+        WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+        NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_F_QR
+      END IF
+
+      IF ( NFAIL_REZQ == 0 ) THEN
+        WRITE(*,*) '>>>> Rezidual computation test PASSED.'
+      ELSE
+        WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZQ, 'time(s)'
+        WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZQ
+        WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+        NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_REZQ
+      END IF
+
+      IF ( NFAILQ_TOTAL == 0 ) THEN
+        WRITE(*,*) '>>>>>>> CGEDMDQ :: ALL TESTS PASSED.'
+      ELSE
+        WRITE(*,*) NFAILQ_TOTAL, 'FAILURES!'
+        WRITE(*,*) '>>>>>>> CGEDMDQ :: TESTS FAILED. CHECK THE IMPLEMENTATION.'
+      END IF
+
+      WRITE(*,*)
+      WRITE(*,*) 'Test completed.'
+      STOP
+      END
diff --git a/lapack-netlib/TESTING/EIG/cchkst.f b/lapack-netlib/TESTING/EIG/cchkst.f
index 95747d051..ed535e91f 100644
--- a/lapack-netlib/TESTING/EIG/cchkst.f
+++ b/lapack-netlib/TESTING/EIG/cchkst.f
@@ -364,7 +364,7 @@
 *> \verbatim
 *>          D1 is REAL array of
 *>                             dimension( max(NN) )
-*>          The eigenvalues of A, as computed by CSTEQR simlutaneously
+*>          The eigenvalues of A, as computed by CSTEQR simultaneously
 *>          with Z.  On exit, the eigenvalues in D1 correspond with the
 *>          matrix in A.
 *> \endverbatim
@@ -665,8 +665,7 @@
       EXTERNAL           CCOPY, CHET21, CHETRD, CHPT21, CHPTRD, CLACPY,
      $                   CLASET, CLATMR, CLATMS, CPTEQR, CSTEDC, CSTEMR,
      $                   CSTEIN, CSTEQR, CSTT21, CSTT22, CUNGTR, CUPGTR,
-     $                   SCOPY, SLABAD, SLASUM, SSTEBZ, SSTECH, SSTERF,
-     $                   XERBLA
+     $                   SCOPY, SLASUM, SSTEBZ, SSTECH, SSTERF, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, CONJG, INT, LOG, MAX, MIN, REAL, SQRT
@@ -733,7 +732,6 @@
 *
       UNFL = SLAMCH( 'Safe minimum' )
       OVFL = ONE / UNFL
-      CALL SLABAD( UNFL, OVFL )
       ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' )
       ULPINV = ONE / ULP
       LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) )
diff --git a/lapack-netlib/TESTING/EIG/cchkst2stg.f b/lapack-netlib/TESTING/EIG/cchkst2stg.f
index e4deb8ac8..668b51a36 100644
--- a/lapack-netlib/TESTING/EIG/cchkst2stg.f
+++ b/lapack-netlib/TESTING/EIG/cchkst2stg.f
@@ -385,7 +385,7 @@
 *> \verbatim
 *>          D1 is REAL array of
 *>                             dimension( max(NN) )
-*>          The eigenvalues of A, as computed by CSTEQR simlutaneously
+*>          The eigenvalues of A, as computed by CSTEQR simultaneously
 *>          with Z.  On exit, the eigenvalues in D1 correspond with the
 *>          matrix in A.
 *> \endverbatim
@@ -683,10 +683,10 @@
       EXTERNAL           ILAENV, SLAMCH, SLARND, SSXT1
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SCOPY, SLABAD, SLASUM, SSTEBZ, SSTECH, SSTERF,
-     $                   XERBLA, CCOPY, CHET21, CHETRD, CHPT21, CHPTRD,
-     $                   CLACPY, CLASET, CLATMR, CLATMS, CPTEQR, CSTEDC,
-     $                   CSTEMR, CSTEIN, CSTEQR, CSTT21, CSTT22, CUNGTR,
+      EXTERNAL           SCOPY, SLASUM, SSTEBZ, SSTECH, SSTERF, XERBLA,
+     $                   CCOPY, CHET21, CHETRD, CHPT21, CHPTRD, CLACPY,
+     $                   CLASET, CLATMR, CLATMS, CPTEQR, CSTEDC, CSTEMR,
+     $                   CSTEIN, CSTEQR, CSTT21, CSTT22, CUNGTR,
      $                   CUPGTR, CHETRD_2STAGE, SLASET
 *     ..
 *     .. Intrinsic Functions ..
@@ -754,7 +754,6 @@
 *
       UNFL = SLAMCH( 'Safe minimum' )
       OVFL = ONE / UNFL
-      CALL SLABAD( UNFL, OVFL )
       ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' )
       ULPINV = ONE / ULP
       LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) )
diff --git a/lapack-netlib/TESTING/EIG/cdrgsx.f b/lapack-netlib/TESTING/EIG/cdrgsx.f
index c5baeef5e..1729770a2 100644
--- a/lapack-netlib/TESTING/EIG/cdrgsx.f
+++ b/lapack-netlib/TESTING/EIG/cdrgsx.f
@@ -395,7 +395,7 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           ALASVM, CGESVD, CGET51, CGGESX, CLACPY, CLAKF2,
-     $                   CLASET, CLATM5, SLABAD, XERBLA
+     $                   CLASET, CLATM5, XERBLA
 *     ..
 *     .. Scalars in Common ..
       LOGICAL            FS
@@ -478,7 +478,6 @@
       ULPINV = ONE / ULP
       SMLNUM = SLAMCH( 'S' ) / ULP
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       THRSH2 = TEN*THRESH
       NTESTT = 0
       NERRS = 0
@@ -917,7 +916,7 @@
      $      / '  2:  A and B are upper triangular matrices, ',
      $      / '  3:  A and B are as type 2, but each second diagonal ',
      $      'block in A_11 and ', /
-     $      '      each third diaongal block in A_22 are 2x2 blocks,',
+     $      '      each third diagonal block in A_22 are 2x2 blocks,',
      $      / '  4:  A and B are block diagonal matrices, ',
      $      / '  5:  (A,B) has potentially close or common ',
      $      'eigenvalues.', / )
diff --git a/lapack-netlib/TESTING/EIG/cdrgvx.f b/lapack-netlib/TESTING/EIG/cdrgvx.f
index 830a39d86..4114b697f 100644
--- a/lapack-netlib/TESTING/EIG/cdrgvx.f
+++ b/lapack-netlib/TESTING/EIG/cdrgvx.f
@@ -56,7 +56,7 @@
 *>
 *>    | l**H * (beta A - alpha B) | / ( ulp max( |beta A|, |alpha B| ) )
 *>
-*>     where l**H is the conjugate tranpose of l.
+*>     where l**H is the conjugate transpose of l.
 *>
 *> (2) max over all right eigenvalue/-vector pairs (beta/alpha,r) of
 *>
diff --git a/lapack-netlib/TESTING/EIG/cdrvsg.f b/lapack-netlib/TESTING/EIG/cdrvsg.f
index d15b39d01..729976738 100644
--- a/lapack-netlib/TESTING/EIG/cdrvsg.f
+++ b/lapack-netlib/TESTING/EIG/cdrvsg.f
@@ -236,7 +236,7 @@
 *>
 *>  B       COMPLEX array, dimension (LDB , max(NN))
 *>          Used to hold the Hermitian positive definite matrix for
-*>          the generailzed problem.
+*>          the generalized problem.
 *>          On exit, B contains the last matrix actually
 *>          used.
 *>          Modified.
@@ -420,7 +420,7 @@
 *     .. External Subroutines ..
       EXTERNAL           CHBGV, CHBGVD, CHBGVX, CHEGV, CHEGVD, CHEGVX,
      $                   CHPGV, CHPGVD, CHPGVX, CLACPY, CLASET, CLATMR,
-     $                   CLATMS, CSGT01, SLABAD, SLAFTS, SLASUM, XERBLA
+     $                   CLATMS, CSGT01, SLAFTS, SLASUM, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, MIN, REAL, SQRT
@@ -481,7 +481,6 @@
 *
       UNFL = SLAMCH( 'Safe minimum' )
       OVFL = SLAMCH( 'Overflow' )
-      CALL SLABAD( UNFL, OVFL )
       ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' )
       ULPINV = ONE / ULP
       RTUNFL = SQRT( UNFL )
diff --git a/lapack-netlib/TESTING/EIG/cdrvsg2stg.f b/lapack-netlib/TESTING/EIG/cdrvsg2stg.f
index 8b8553773..f7d323247 100644
--- a/lapack-netlib/TESTING/EIG/cdrvsg2stg.f
+++ b/lapack-netlib/TESTING/EIG/cdrvsg2stg.f
@@ -242,7 +242,7 @@
 *>
 *>  B       COMPLEX array, dimension (LDB , max(NN))
 *>          Used to hold the Hermitian positive definite matrix for
-*>          the generailzed problem.
+*>          the generalized problem.
 *>          On exit, B contains the last matrix actually
 *>          used.
 *>          Modified.
@@ -426,7 +426,7 @@
       EXTERNAL           LSAME, SLAMCH, SLARND
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SLABAD, SLAFTS, SLASUM, XERBLA, CHBGV, CHBGVD,
+      EXTERNAL           SLAFTS, SLASUM, XERBLA, CHBGV, CHBGVD,
      $                   CHBGVX, CHEGV, CHEGVD, CHEGVX, CHPGV, CHPGVD,
      $                   CHPGVX, CLACPY, CLASET, CLATMR, CLATMS, CSGT01,
      $                   CHEGV_2STAGE
@@ -490,7 +490,6 @@
 *
       UNFL = SLAMCH( 'Safe minimum' )
       OVFL = SLAMCH( 'Overflow' )
-      CALL SLABAD( UNFL, OVFL )
       ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' )
       ULPINV = ONE / ULP
       RTUNFL = SQRT( UNFL )
diff --git a/lapack-netlib/TESTING/EIG/cdrvst.f b/lapack-netlib/TESTING/EIG/cdrvst.f
index 9c129c0e8..205f06f0d 100644
--- a/lapack-netlib/TESTING/EIG/cdrvst.f
+++ b/lapack-netlib/TESTING/EIG/cdrvst.f
@@ -204,7 +204,7 @@
 *>          Not modified.
 *>
 *>  D1      REAL array, dimension (max(NN))
-*>          The eigenvalues of A, as computed by CSTEQR simlutaneously
+*>          The eigenvalues of A, as computed by CSTEQR simultaneously
 *>          with Z.  On exit, the eigenvalues in D1 correspond with the
 *>          matrix in A.
 *>          Modified.
@@ -393,8 +393,8 @@
 *     .. External Subroutines ..
       EXTERNAL           ALASVM, CHBEV, CHBEVD, CHBEVX, CHEEV, CHEEVD,
      $                   CHEEVR, CHEEVX, CHET21, CHET22, CHPEV, CHPEVD,
-     $                   CHPEVX, CLACPY, CLASET, CLATMR, CLATMS, SLABAD,
-     $                   SLAFTS, XERBLA
+     $                   CHPEVX, CLACPY, CLASET, CLATMR, CLATMS, SLAFTS,
+     $                   XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, INT, LOG, MAX, MIN, REAL, SQRT
@@ -451,7 +451,6 @@
 *
       UNFL = SLAMCH( 'Safe minimum' )
       OVFL = SLAMCH( 'Overflow' )
-      CALL SLABAD( UNFL, OVFL )
       ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' )
       ULPINV = ONE / ULP
       RTUNFL = SQRT( UNFL )
diff --git a/lapack-netlib/TESTING/EIG/cdrvst2stg.f b/lapack-netlib/TESTING/EIG/cdrvst2stg.f
index 954c7fb87..258f1f370 100644
--- a/lapack-netlib/TESTING/EIG/cdrvst2stg.f
+++ b/lapack-netlib/TESTING/EIG/cdrvst2stg.f
@@ -204,7 +204,7 @@
 *>          Not modified.
 *>
 *>  D1      REAL             array, dimension (max(NN))
-*>          The eigenvalues of A, as computed by CSTEQR simlutaneously
+*>          The eigenvalues of A, as computed by CSTEQR simultaneously
 *>          with Z.  On exit, the eigenvalues in D1 correspond with the
 *>          matrix in A.
 *>          Modified.
@@ -391,7 +391,7 @@
       EXTERNAL           SLAMCH, SLARND, SSXT1
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALASVM, SLABAD, SLAFTS, XERBLA, CHBEV, CHBEVD,
+      EXTERNAL           ALASVM, SLAFTS, XERBLA, CHBEV, CHBEVD,
      $                   CHBEVX, CHEEV, CHEEVD, CHEEVR, CHEEVX, CHET21,
      $                   CHET22, CHPEV, CHPEVD, CHPEVX, CLACPY, CLASET,
      $                   CHEEVD_2STAGE, CHEEVR_2STAGE, CHEEVX_2STAGE,
@@ -453,7 +453,6 @@
 *
       UNFL = SLAMCH( 'Safe minimum' )
       OVFL = SLAMCH( 'Overflow' )
-      CALL SLABAD( UNFL, OVFL )
       ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' )
       ULPINV = ONE / ULP
       RTUNFL = SQRT( UNFL )
diff --git a/lapack-netlib/TESTING/EIG/cerrhs.f b/lapack-netlib/TESTING/EIG/cerrhs.f
index 0568a6d78..2dd86b8c9 100644
--- a/lapack-netlib/TESTING/EIG/cerrhs.f
+++ b/lapack-netlib/TESTING/EIG/cerrhs.f
@@ -21,8 +21,8 @@
 *>
 *> \verbatim
 *>
-*> CERRHS tests the error exits for CGEBAK, CGEBAL, CGEHRD, CUNGHR,
-*> CUNMHR, CHSEQR, CHSEIN, and CTREVC.
+*> CERRHS tests the error exits for CGEBAK, CGEBAL, CGEHRD, CGEHD2,
+*> CUNGHR, CUNMHR, CHSEQR, CHSEIN, CTREVC, and CTREVC3.
 *> \endverbatim
 *
 *  Arguments:
@@ -86,7 +86,7 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHKXER, CGEBAK, CGEBAL, CGEHRD, CHSEIN, CHSEQR,
-     $                   CUNGHR, CUNMHR, CTREVC
+     $                   CUNGHR, CUNMHR, CTREVC, CTREVC3, CGEHD2
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          REAL
@@ -193,6 +193,29 @@
          CALL CHKXER( 'CGEHRD', INFOT, NOUT, LERR, OK )
          NT = NT + 7
 *
+*        CGEHD2
+*
+         SRNAMT = 'CGEHD2'
+         INFOT = 1
+         CALL CGEHD2( -1, 1, 1, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'CGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL CGEHD2( 0, 0, 0, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'CGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL CGEHD2( 0, 2, 0, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'CGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 3
+         CALL CGEHD2( 1, 1, 0, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'CGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 3
+         CALL CGEHD2( 0, 1, 1, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'CGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL CGEHD2( 2, 1, 1, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'CGEHD2', INFOT, NOUT, LERR, OK )
+         NT = NT + 6
+*
 *        CUNGHR
 *
          SRNAMT = 'CUNGHR'
@@ -398,6 +421,47 @@
      $                RW, INFO )
          CALL CHKXER( 'CTREVC', INFOT, NOUT, LERR, OK )
          NT = NT + 7
+*
+*        CTREVC3
+*
+         SRNAMT = 'CTREVC3'
+         INFOT = 1
+         CALL CTREVC3( '/', 'A', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W,
+     $                 LW, RW, 1, INFO )
+         CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL CTREVC3( 'L', '/', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W,
+     $                 LW, RW, 1, INFO )
+         CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 4
+         CALL CTREVC3( 'L', 'A', SEL, -1, A, 1, VL, 1, VR, 1, 0, M, W,
+     $                 LW, RW, 1, INFO )
+         CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 6
+         CALL CTREVC3( 'L', 'A', SEL, 2, A, 1, VL, 2, VR, 1, 4, M, W,
+     $                 LW, RW, 2, INFO )
+         CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 8
+         CALL CTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W,
+     $                 LW, RW, 2, INFO )
+         CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 10
+         CALL CTREVC3( 'R', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W,
+     $                 LW, RW, 2, INFO )
+         CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 11
+         CALL CTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 1, M, W,
+     $                 LW, RW, 2, INFO )
+         CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 14
+         CALL CTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 2, M, W,
+     $                 2, RW, 2, INFO )
+         CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 16
+         CALL CTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 2, M, W,
+     $                 LW, RW, 1, INFO )
+         CALL CHKXER( 'CTREVC3', INFOT, NOUT, LERR, OK )
+         NT = NT + 9
       END IF
 *
 *     Print a summary line.
diff --git a/lapack-netlib/TESTING/EIG/cerrst.f b/lapack-netlib/TESTING/EIG/cerrst.f
index eef34b44b..d23eb14ea 100644
--- a/lapack-netlib/TESTING/EIG/cerrst.f
+++ b/lapack-netlib/TESTING/EIG/cerrst.f
@@ -21,7 +21,7 @@
 *>
 *> \verbatim
 *>
-*> CERRST tests the error exits for CHETRD, CUNGTR, CUNMTR, CHPTRD,
+*> CERRST tests the error exits for CHETRD, CHETD2, CUNGTR, CUNMTR, CHPTRD,
 *> CUNGTR, CUPMTR, CSTEQR, CSTEIN, CPTEQR, CHBTRD,
 *> CHEEV, CHEEVX, CHEEVD, CHBEV, CHBEVX, CHBEVD,
 *> CHPEV, CHPEVX, CHPEVD, and CSTEDC.
@@ -94,7 +94,7 @@
       EXTERNAL           CHBEV, CHBEVD, CHBEVX, CHBTRD, CHEEV, CHEEVD,
      $                   CHEEVR, CHEEVX, CHETRD, CHKXER, CHPEV, CHPEVD,
      $                   CHPEVX, CHPTRD, CPTEQR, CSTEDC, CSTEIN, CSTEQR,
-     $                   CUNGTR, CUNMTR, CUPGTR, CUPMTR,
+     $                   CUNGTR, CUNMTR, CUPGTR, CUPMTR, CHETD2,
      $                   CHEEVD_2STAGE, CHEEVR_2STAGE, CHEEVX_2STAGE,
      $                   CHEEV_2STAGE, CHBEV_2STAGE, CHBEVD_2STAGE,
      $                   CHBEVX_2STAGE, CHETRD_2STAGE, CHETRD_HE2HB,
@@ -156,6 +156,20 @@
          CALL CHKXER( 'CHETRD', INFOT, NOUT, LERR, OK )
          NT = NT + 4
 *
+*        CHETD2
+*
+         SRNAMT = 'CHETD2'
+         INFOT = 1
+         CALL CHETD2( '/', 0, A, 1, D, E, TAU, INFO )
+         CALL CHKXER( 'CHETD2', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL CHETD2( 'U', -1, A, 1, D, E, TAU, INFO )
+         CALL CHKXER( 'CHETD2', INFOT, NOUT, LERR, OK )
+         INFOT = 4
+         CALL CHETD2( 'U', 2, A, 1, D, E, TAU, INFO )
+         CALL CHKXER( 'CHETD2', INFOT, NOUT, LERR, OK )
+         NT = NT + 3
+*
 *        CHETRD_2STAGE
 *
          SRNAMT = 'CHETRD_2STAGE'
@@ -628,56 +642,56 @@
          SRNAMT = 'CHEEVX_2STAGE'
          INFOT = 1
          CALL CHEEVX_2STAGE( '/', 'A', 'U', 0, A, 1,
-     $                0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                0.0, 0.0, 0, 0, 0.0,
      $                M, X, Z, 1, W, 1, RW, IW, I3, INFO )
          CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 1
          CALL CHEEVX_2STAGE( 'V', 'A', 'U', 0, A, 1,
-     $                0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                0.0, 0.0, 0, 0, 0.0,
      $                M, X, Z, 1, W, 1, RW, IW, I3, INFO )
          CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 2
          CALL CHEEVX_2STAGE( 'N', '/', 'U', 0, A, 1,
-     $                0.0D0, 1.0D0, 1, 0, 0.0D0,
+     $                0.0, 1.0, 1, 0, 0.0,
      $                M, X, Z, 1, W, 1, RW, IW, I3, INFO )
          CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 3
          CALL CHEEVX_2STAGE( 'N', 'A', '/', 0, A, 1,
-     $                0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                0.0, 0.0, 0, 0, 0.0,
      $                M, X, Z, 1, W, 1, RW, IW, I3, INFO )
          INFOT = 4
          CALL CHEEVX_2STAGE( 'N', 'A', 'U', -1, A, 1,
-     $                0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                0.0, 0.0, 0, 0, 0.0,
      $                M, X, Z, 1, W, 1, RW, IW, I3, INFO )
          CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 6
          CALL CHEEVX_2STAGE( 'N', 'A', 'U', 2, A, 1,
-     $                0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                0.0, 0.0, 0, 0, 0.0,
      $                M, X, Z, 2, W, 3, RW, IW, I3, INFO )
          CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 8
          CALL CHEEVX_2STAGE( 'N', 'V', 'U', 1, A, 1,
-     $                0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                0.0, 0.0, 0, 0, 0.0,
      $                M, X, Z, 1, W, 1, RW, IW, I3, INFO )
          CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 9
          CALL CHEEVX_2STAGE( 'N', 'I', 'U', 1, A, 1,
-     $                0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                0.0, 0.0, 0, 0, 0.0,
      $                M, X, Z, 1, W, 1, RW, IW, I3, INFO )
          CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 10
          CALL CHEEVX_2STAGE( 'N', 'I', 'U', 2, A, 2,
-     $                0.0D0, 0.0D0, 2, 1, 0.0D0,
+     $                0.0, 0.0, 2, 1, 0.0,
      $                M, X, Z, 2, W, 3, RW, IW, I3, INFO )
          CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 15
          CALL CHEEVX_2STAGE( 'N', 'A', 'U', 2, A, 2,
-     $                0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                0.0, 0.0, 0, 0, 0.0,
      $                M, X, Z, 0, W, 3, RW, IW, I3, INFO )
          CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 17
          CALL CHEEVX_2STAGE( 'N', 'A', 'U', 2, A, 2,
-     $                0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                0.0, 0.0, 0, 0, 0.0,
      $                M, X, Z, 2, W, 0, RW, IW, I1, INFO )
          CALL CHKXER( 'CHEEVX_2STAGE', INFOT, NOUT, LERR, OK )
          NT = NT + 11
@@ -734,17 +748,17 @@
          CALL CHKXER( 'CHEEVR', INFOT, NOUT, LERR, OK )
          INFOT = 18
          CALL CHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0,
-     $                M, R, Z, 1, IW, Q, 2*N-1, RW, 24*N, IW( 2*N+1 ),
+     $                M, R, Z, 1, IW, Q, 0, RW, 24*N, IW( 2*N+1 ),
      $                10*N, INFO )
          CALL CHKXER( 'CHEEVR', INFOT, NOUT, LERR, OK )
          INFOT = 20
          CALL CHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0,
-     $                M, R, Z, 1, IW, Q, 2*N, RW, 24*N-1, IW( 2*N-1 ),
+     $                M, R, Z, 1, IW, Q, 2*N, RW, 0, IW( 2*N-1 ),
      $                10*N, INFO )
          CALL CHKXER( 'CHEEVR', INFOT, NOUT, LERR, OK )
          INFOT = 22
          CALL CHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0,
-     $                M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW, 10*N-1,
+     $                M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW, 0,
      $                INFO )
          CALL CHKXER( 'CHEEVR', INFOT, NOUT, LERR, OK )
          NT = NT + 12
@@ -755,80 +769,80 @@
          N = 1
          INFOT = 1
          CALL CHEEVR_2STAGE( '/', 'A', 'U', 0, A, 1,
-     $                0.0D0, 0.0D0, 1, 1, 0.0D0,
+     $                0.0, 0.0, 1, 1, 0.0,
      $                M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ),
      $                10*N, INFO )
          CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 1
          CALL CHEEVR_2STAGE( 'V', 'A', 'U', 0, A, 1,
-     $                0.0D0, 0.0D0, 1, 1, 0.0D0,
+     $                0.0, 0.0, 1, 1, 0.0,
      $                M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ),
      $                10*N, INFO )
          CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 2
          CALL CHEEVR_2STAGE( 'N', '/', 'U', 0, A, 1,
-     $                0.0D0, 0.0D0, 1, 1, 0.0D0,
+     $                0.0, 0.0, 1, 1, 0.0,
      $                M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ),
      $                10*N, INFO )
          CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 3
          CALL CHEEVR_2STAGE( 'N', 'A', '/', -1, A, 1,
-     $                0.0D0, 0.0D0, 1, 1, 0.0D0,
+     $                0.0, 0.0, 1, 1, 0.0,
      $                M, R, Z, 1, IW, Q, 2*N, RW, 24*N,
      $                IW( 2*N+1 ), 10*N, INFO )
          CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 4
          CALL CHEEVR_2STAGE( 'N', 'A', 'U', -1, A, 1,
-     $                0.0D0, 0.0D0, 1, 1, 0.0D0,
+     $                0.0, 0.0, 1, 1, 0.0,
      $                M, R, Z, 1, IW, Q, 2*N, RW, 24*N,
      $                IW( 2*N+1 ), 10*N, INFO )
          CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 6
          CALL CHEEVR_2STAGE( 'N', 'A', 'U', 2, A, 1,
-     $                0.0D0, 0.0D0, 1, 1, 0.0D0,
+     $                0.0, 0.0, 1, 1, 0.0,
      $                M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ),
      $                10*N, INFO )
          CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 8
          CALL CHEEVR_2STAGE( 'N', 'V', 'U', 1, A, 1,
-     $                0.0D0, 0.0D0, 1, 1, 0.0D0,
+     $                0.0, 0.0, 1, 1, 0.0,
      $                M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ),
      $                10*N, INFO )
          CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 9
          CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1,
-     $                0.0D0, 0.0D0, 0, 1, 0.0D0,
+     $                0.0, 0.0, 0, 1, 0.0,
      $                M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ),
      $                10*N, INFO )
          CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 10
          CALL CHEEVR_2STAGE( 'N', 'I', 'U', 2, A, 2,
-     $                0.0D0, 0.0D0, 2, 1, 0.0D0,
+     $                0.0, 0.0, 2, 1, 0.0,
      $                M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ),
      $                10*N, INFO )
          CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 15
          CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1,
-     $                0.0D0, 0.0D0, 1, 1, 0.0D0,
+     $                0.0, 0.0, 1, 1, 0.0,
      $                M, R, Z, 0, IW, Q, 2*N, RW, 24*N, IW( 2*N+1 ),
      $                10*N, INFO )
          CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 18
          CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1,
-     $                0.0D0, 0.0D0, 1, 1, 0.0D0,
-     $                M, R, Z, 1, IW, Q, 2*N-1, RW, 24*N, IW( 2*N+1 ),
+     $                0.0, 0.0, 1, 1, 0.0,
+     $                M, R, Z, 1, IW, Q, 0, RW, 24*N, IW( 2*N+1 ),
      $                10*N, INFO )
          CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 20
          CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1,
-     $                0.0D0, 0.0D0, 1, 1, 0.0D0,
-     $                M, R, Z, 1, IW, Q, 26*N, RW, 24*N-1, IW( 2*N-1 ),
+     $                0.0, 0.0, 1, 1, 0.0,
+     $                M, R, Z, 1, IW, Q, 26*N, RW, 0, IW( 2*N-1 ),
      $                10*N, INFO )
          CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 22
          CALL CHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1,
-     $                0.0D0, 0.0D0, 1, 1, 0.0D0,
-     $                M, R, Z, 1, IW, Q, 26*N, RW, 24*N, IW, 10*N-1,
+     $                0.0, 0.0, 1, 1, 0.0,
+     $                M, R, Z, 1, IW, Q, 26*N, RW, 24*N, IW, 0,
      $                INFO )
          CALL CHKXER( 'CHEEVR_2STAGE', INFOT, NOUT, LERR, OK )
          NT = NT + 13
@@ -1259,65 +1273,65 @@
          SRNAMT = 'CHBEVX_2STAGE'
          INFOT = 1
          CALL CHBEVX_2STAGE( '/', 'A', 'U', 0, 0, A, 1, Q, 1,
-     $                       0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                       0.0, 0.0, 0, 0, 0.0,
      $                       M, X, Z, 1, W, 0, RW, IW, I3, INFO )
          INFOT = 1
          CALL CHBEVX_2STAGE( 'V', 'A', 'U', 0, 0, A, 1, Q, 1,
-     $                       0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                       0.0, 0.0, 0, 0, 0.0,
      $                       M, X, Z, 1, W, 0, RW, IW, I3, INFO )
          CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 2
          CALL CHBEVX_2STAGE( 'N', '/', 'U', 0, 0, A, 1, Q, 1,
-     $                       0.0D0, 1.0D0, 1, 0, 0.0D0,
+     $                       0.0, 1.0, 1, 0, 0.0,
      $                       M, X, Z, 1, W, 0, RW, IW, I3, INFO )
          CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 3
          CALL CHBEVX_2STAGE( 'N', 'A', '/', 0, 0, A, 1, Q, 1,
-     $                       0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                       0.0, 0.0, 0, 0, 0.0,
      $                       M, X, Z, 1, W, 0, RW, IW, I3, INFO )
          INFOT = 4
          CALL CHBEVX_2STAGE( 'N', 'A', 'U', -1, 0, A, 1, Q, 1,
-     $                       0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                       0.0, 0.0, 0, 0, 0.0,
      $                       M, X, Z, 1, W, 0, RW, IW, I3, INFO )
          CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 5
          CALL CHBEVX_2STAGE( 'N', 'A', 'U', 0, -1, A, 1, Q, 1,
-     $                       0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                       0.0, 0.0, 0, 0, 0.0,
      $                       M, X, Z, 1, W, 0, RW, IW, I3, INFO )
          CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 7
          CALL CHBEVX_2STAGE( 'N', 'A', 'U', 2, 1, A, 1, Q, 2,
-     $                       0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                       0.0, 0.0, 0, 0, 0.0,
      $                       M, X, Z, 2, W, 0, RW, IW, I3, INFO )
          CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK )
 *         INFOT = 9
 *         CALL CHBEVX_2STAGE( 'V', 'A', 'U', 2, 0, A, 1, Q, 1,
-*     $                       0.0D0, 0.0D0, 0, 0, 0.0D0,
+*     $                       0.0, 0.0, 0, 0, 0.0,
 *     $                       M, X, Z, 2, W, 0, RW, IW, I3, INFO )
 *         CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 11
          CALL CHBEVX_2STAGE( 'N', 'V', 'U', 1, 0, A, 1, Q, 1,
-     $                       0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                       0.0, 0.0, 0, 0, 0.0,
      $                       M, X, Z, 1, W, 0, RW, IW, I3, INFO )
          CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 12
          CALL CHBEVX_2STAGE( 'N', 'I', 'U', 1, 0, A, 1, Q, 1,
-     $                       0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                       0.0, 0.0, 0, 0, 0.0,
      $                       M, X, Z, 1, W, 0, RW, IW, I3, INFO )
          CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 13
          CALL CHBEVX_2STAGE( 'N', 'I', 'U', 1, 0, A, 1, Q, 1,
-     $                       0.0D0, 0.0D0, 1, 2, 0.0D0,
+     $                       0.0, 0.0, 1, 2, 0.0,
      $                       M, X, Z, 1, W, 0, RW, IW, I3, INFO )
          CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 18
          CALL CHBEVX_2STAGE( 'N', 'A', 'U', 2, 0, A, 1, Q, 2,
-     $                       0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                       0.0, 0.0, 0, 0, 0.0,
      $                       M, X, Z, 0, W, 0, RW, IW, I3, INFO )
          CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 20
          CALL CHBEVX_2STAGE( 'N', 'A', 'U', 2, 0, A, 1, Q, 2,
-     $                       0.0D0, 0.0D0, 0, 0, 0.0D0,
+     $                       0.0, 0.0, 0, 0, 0.0,
      $                       M, X, Z, 1, W, 0, RW, IW, I3, INFO )
          CALL CHKXER( 'CHBEVX_2STAGE', INFOT, NOUT, LERR, OK )
          NT = NT + 12
diff --git a/lapack-netlib/TESTING/EIG/chkxer.f b/lapack-netlib/TESTING/EIG/chkxer.f
index fd00bb65a..70caf7e0a 100644
--- a/lapack-netlib/TESTING/EIG/chkxer.f
+++ b/lapack-netlib/TESTING/EIG/chkxer.f
@@ -61,7 +61,7 @@
       RETURN
 *
  9999 FORMAT( ' *** Illegal value of parameter number ', I2,
-     $      ' not detected by ', A6, ' ***' )
+     $      ' not detected by ', A, ' ***' )
 *
 *     End of CHKXER
 *
diff --git a/lapack-netlib/TESTING/EIG/csyl01.f b/lapack-netlib/TESTING/EIG/csyl01.f
index 82d790daa..8a3cd1ae5 100644
--- a/lapack-netlib/TESTING/EIG/csyl01.f
+++ b/lapack-netlib/TESTING/EIG/csyl01.f
@@ -120,14 +120,16 @@
       COMPLEX            RMUL
 *     ..
 *     .. Local Arrays ..
-      COMPLEX            A( MAXM, MAXM ), B( MAXN, MAXN ),
-     $                   C( MAXM, MAXN ), CC( MAXM, MAXN ),
-     $                   X( MAXM, MAXN ),
-     $                   DUML( MAXM ), DUMR( MAXN ),
+      COMPLEX            DUML( MAXM ), DUMR( MAXN ),
      $                   D( MAX( MAXM, MAXN ) )
-      REAL               SWORK( LDSWORK, 54 ), DUM( MAXN ), VM( 2 )
+      REAL               DUM( MAXN ), VM( 2 )
       INTEGER            ISEED( 4 ), IWORK( MAXM + MAXN + 2 )
 *     ..
+*     .. Allocatable Arrays ..
+      INTEGER            AllocateStatus
+      COMPLEX, DIMENSION(:,:), ALLOCATABLE :: A, B, C, CC, X
+      REAL,    DIMENSION(:,:), ALLOCATABLE :: SWORK
+*     ..
 *     .. External Functions ..
       LOGICAL            SISNAN
       REAL               SLAMCH, CLANGE
@@ -139,6 +141,20 @@
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, REAL, MAX
 *     ..
+*     .. Allocate memory dynamically ..
+      ALLOCATE ( A( MAXM, MAXM ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( B( MAXN, MAXN ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( C( MAXM, MAXN ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( CC( MAXM, MAXN ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( X( MAXM, MAXN ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( SWORK( LDSWORK, 54 ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+*     ..
 *     .. Executable Statements ..
 *
 *     Get machine parameters
@@ -286,6 +302,13 @@
             END DO
          END DO
       END DO
+*
+      DEALLOCATE (A, STAT = AllocateStatus)
+      DEALLOCATE (B, STAT = AllocateStatus)
+      DEALLOCATE (C, STAT = AllocateStatus)
+      DEALLOCATE (CC, STAT = AllocateStatus)
+      DEALLOCATE (X, STAT = AllocateStatus)
+      DEALLOCATE (SWORK, STAT = AllocateStatus)
 *
       RETURN
 *
diff --git a/lapack-netlib/TESTING/EIG/dchkdmd.f90 b/lapack-netlib/TESTING/EIG/dchkdmd.f90
new file mode 100644
index 000000000..4fbf7531b
--- /dev/null
+++ b/lapack-netlib/TESTING/EIG/dchkdmd.f90
@@ -0,0 +1,813 @@
+!     This is a test program for checking the implementations of
+!     the following subroutines
+!
+!     DGEDMD  for computation of the
+!             Dynamic Mode Decomposition (DMD)
+!     DGEDMDQ for computation of a
+!             QR factorization based compressed DMD
+!
+!     Developed and supported by:
+!     ===========================
+!     Developed and coded by Zlatko Drmac, Faculty of Science,
+!     University of Zagreb;  drmac@math.hr
+!     In cooperation with
+!     AIMdyn Inc., Santa Barbara, CA.
+!     ========================================================
+!     How to run the code (compiler, link info)
+!     ========================================================
+!     Compile as FORTRAN 90 (or later) and link with BLAS and
+!     LAPACK libraries.
+!     NOTE: The code is developed and tested on top of the
+!     Intel MKL library (versions 2022.0.3 and 2022.2.0),
+!     using the Intel Fortran compiler.
+!
+!     For developers of the C++ implementation
+!     ========================================================
+!     See the LAPACK++ and Template Numerical Toolkit (TNT)
+!
+!     Note on a development of the GPU HP implementation
+!     ========================================================
+!     Work in progress. See CUDA, MAGMA, SLATE.
+!     NOTE: The four SVD subroutines used in this code are
+!     included as a part of R&D and for the completeness.
+!     This was also an opportunity to test those SVD codes.
+!     If the scaling option is used all four are essentially
+!     equally good. For implementations on HP platforms,
+!     one can use whichever SVD is available.
+!...  .........................................................
+!     NOTE:
+!     When using the Intel MKL 2022.0.3 the subroutine xGESVDQ
+!     (optionally used in xGEDMD) may cause access violation
+!     error for x = S, D, C, Z, but only if called with the
+!     work space query. (At least in our Windows 10 MSVS 2019.)
+!     The problem can be mitigated by downloading the source
+!     code of xGESVDQ from the LAPACK repository and use it
+!     locally instead of the one in the MKL. This seems to
+!     indicate that the problem is indeed in the MKL.
+!     This problem did not appear with Intel MKL 2022.2.0.
+!
+!     NOTE:
+!     xGESDD seems to have a problem with workspace. In some
+!     cases the length of the optimal workspace is returned
+!     smaller than the minimal workspace, as specified in the
+!     code. As a precaution, all optimal workspaces are
+!     set as MAX(minimal, optimal).
+!     Latest implementations of complex xGESDD have different
+!     length of the real workspace. We use max value over
+!     two versions.
+!............................................................
+!............................................................
+!
+      PROGRAM DMD_TEST
+      use iso_fortran_env, only: real64
+      IMPLICIT NONE
+      integer, parameter :: WP = real64
+
+!............................................................
+      REAL(KIND=WP), PARAMETER ::  ONE = 1.0_WP
+      REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP
+!............................................................
+      REAL(KIND=WP), ALLOCATABLE, DIMENSION(:,:) ::          &
+                     A, AC, EIGA, LAMBDA, LAMBDAQ, F, F1, F2,&
+                     Z, Z1, S, AU, W, VA, X, X0, Y, Y0, Y1
+      REAL(KIND=WP), ALLOCATABLE, DIMENSION(:)   ::          &
+                     DA, DL, DR, REIG, REIGA, REIGQ, IEIG,   &
+                     IEIGA, IEIGQ,  RES, RES1, RESEX, SINGVX,&
+                     SINGVQX, WORK
+      INTEGER      , ALLOCATABLE, DIMENSION(:)   ::   IWORK
+      REAL(KIND=WP) :: AB(2,2),   WDUMMY(2)
+      INTEGER       :: IDUMMY(2), ISEED(4), RJOBDATA(8)
+      REAL(KIND=WP) :: ANORM, COND, CONDL, CONDR, DMAX, EPS, &
+                       TOL, TOL2, SVDIFF, TMP, TMP_AU,       &
+                       TMP_FQR, TMP_REZ, TMP_REZQ,  TMP_ZXW, &
+                       TMP_EX, XNORM, YNORM
+!............................................................
+      INTEGER :: K, KQ, LDF, LDS, LDA, LDAU, LDW, LDX, LDY,  &
+                 LDZ, LIWORK, LWORK, M, N, L, LLOOP, NRNK
+      INTEGER :: i, iJOBREF, iJOBZ, iSCALE, INFO, j, KDIFF,  &
+                 NFAIL, NFAIL_AU, NFAIL_F_QR, NFAIL_REZ,     &
+                 NFAIL_REZQ, NFAIL_SVDIFF, NFAIL_TOTAL, NFAILQ_TOTAL, &
+                 NFAIL_Z_XV, MODE, MODEL, MODER, WHTSVD
+      INTEGER    iNRNK, iWHTSVD, K_TRAJ, LWMINOPT
+      CHARACTER(LEN=1) GRADE, JOBREF, JOBZ, PIVTNG, RSIGN,   &
+                       SCALE, RESIDS, WANTQ, WANTR
+
+      LOGICAL  TEST_QRDMD
+!..... external subroutines (BLAS and LAPACK)
+      EXTERNAL DAXPY,  DGEEV, DGEMM, DGEMV, DLACPY, DLASCL
+      EXTERNAL DLARNV, DLATMR
+!.....external subroutines DMD package, part 1
+!     subroutines under test
+      EXTERNAL DGEDMD, DGEDMDQ
+
+!..... external functions (BLAS and LAPACK)
+      EXTERNAL         DLAMCH, DLANGE, DNRM2
+      REAL(KIND=WP) :: DLAMCH, DLANGE, DNRM2
+      EXTERNAL         LSAME
+      LOGICAL          LSAME
+
+      INTRINSIC ABS, INT, MIN, MAX
+!............................................................
+
+      ! The test is always in pairs : ( DGEDMD and DGEDMDQ )
+      ! because the test includes comparing the results (in pairs).
+!.....................................................................................
+      TEST_QRDMD = .TRUE. ! This code by default performs tests on DGEDMDQ
+                          ! Since the QR factorizations based algorithm is designed for
+                          ! single trajectory data, only single trajectory tests will
+                          ! be performed with xGEDMDQ.
+      WANTQ = 'Q'
+      WANTR = 'R'
+!.................................................................................
+
+      EPS = DLAMCH( 'P' )  ! machine precision DP
+
+      ! Global counters of failures of some particular tests
+      NFAIL      = 0
+      NFAIL_REZ  = 0
+      NFAIL_REZQ = 0
+      NFAIL_Z_XV = 0
+      NFAIL_F_QR = 0
+      NFAIL_AU   = 0
+      KDIFF      = 0
+      NFAIL_SVDIFF = 0
+      NFAIL_TOTAL  = 0
+      NFAILQ_TOTAL = 0
+
+
+      DO LLOOP = 1, 4
+
+      WRITE(*,*) 'L Loop Index = ', LLOOP
+
+      ! Set the dimensions of the problem ...
+      WRITE(*,*) 'M = '
+      READ(*,*) M
+      WRITE(*,*) M
+      ! ... and the number of snapshots.
+      WRITE(*,*) 'N = '
+      READ(*,*) N
+      WRITE(*,*) N
+
+      ! ... Test the dimensions
+      IF ( ( MIN(M,N) == 0 ) .OR. ( M < N )  ) THEN
+          WRITE(*,*) 'Bad dimensions. Required: M >= N > 0.'
+          STOP
+      END IF
+!.............
+      ! The seed inside the LLOOP so that each pass can be reproduced easily.
+
+      ISEED(1) = 4
+      ISEED(2) = 3
+      ISEED(3) = 2
+      ISEED(4) = 1
+
+      LDA  = M
+      LDF  = M
+      LDX  = MAX(M,N+1)
+      LDY  = MAX(M,N+1)
+      LDW  = N
+      LDZ  = M
+      LDAU = MAX(M,N+1)
+      LDS  = N
+
+      TMP_ZXW  = ZERO
+      TMP_AU   = ZERO
+      TMP_REZ  = ZERO
+      TMP_REZQ = ZERO
+      SVDIFF   = ZERO
+      TMP_EX   = ZERO
+
+      !
+      ! Test the subroutines on real data snapshots. All
+      ! computation is done in real arithmetic, even when
+      ! Koopman eigenvalues and modes are complex.
+      !
+      ! Allocate memory space
+      ALLOCATE( A(LDA,M) )
+      ALLOCATE( AC(LDA,M) )
+      ALLOCATE( DA(M) )
+      ALLOCATE( DL(M) )
+      ALLOCATE( F(LDF,N+1) )
+      ALLOCATE( F1(LDF,N+1) )
+      ALLOCATE( F2(LDF,N+1) )
+      ALLOCATE( X(LDX,N) )
+      ALLOCATE( X0(LDX,N) )
+      ALLOCATE( SINGVX(N) )
+      ALLOCATE( SINGVQX(N) )
+      ALLOCATE( Y(LDY,N+1) )
+      ALLOCATE( Y0(LDY,N+1) )
+      ALLOCATE( Y1(M,N+1) )
+      ALLOCATE( Z(LDZ,N) )
+      ALLOCATE( Z1(LDZ,N) )
+      ALLOCATE( RES(N)  )
+      ALLOCATE( RES1(N) )
+      ALLOCATE( RESEX(N) )
+      ALLOCATE( REIG(N) )
+      ALLOCATE( IEIG(N) )
+      ALLOCATE( REIGQ(N) )
+      ALLOCATE( IEIGQ(N) )
+      ALLOCATE( REIGA(M) )
+      ALLOCATE( IEIGA(M) )
+      ALLOCATE( VA(LDA,M) )
+      ALLOCATE( LAMBDA(N,2) )
+      ALLOCATE( LAMBDAQ(N,2) )
+      ALLOCATE( EIGA(M,2) )
+      ALLOCATE( W(LDW,N) )
+      ALLOCATE( AU(LDAU,N) )
+      ALLOCATE( S(N,N) )
+
+      TOL  = M*EPS
+      ! This mimics O(M*N)*EPS bound for accumulated roundoff error.
+      ! The factor 10 is somewhat arbitrary.
+      TOL2 = 10*M*N*EPS
+
+!.............
+
+      DO K_TRAJ = 1, 2
+      !  Number of initial conditions in the simulation/trajectories (1 or 2)
+
+      COND = 1.0D8
+      DMAX = 1.0D2
+      RSIGN = 'F'
+      GRADE = 'N'
+      MODEL = 6
+      CONDL = 1.0D2
+      MODER = 6
+      CONDR = 1.0D2
+      PIVTNG = 'N'
+
+      ! Loop over all parameter MODE values for DLATMR (+1,..,+6)
+      DO MODE = 1, 6
+
+      ALLOCATE( IWORK(2*M) )
+      ALLOCATE(DR(N))
+      CALL DLATMR( M, M, 'S', ISEED, 'N', DA, MODE, COND, &
+                   DMAX, RSIGN, GRADE, DL, MODEL,  CONDL, &
+                   DR, MODER, CONDR, PIVTNG, IWORK, M, M, &
+                   ZERO, -ONE, 'N', A, LDA, IWORK(M+1), INFO )
+      DEALLOCATE(IWORK)
+      DEALLOCATE(DR)
+
+      LWORK = 4*M+1
+      ALLOCATE(WORK(LWORK))
+      AC  = A
+      CALL DGEEV( 'N','V', M, AC, M, REIGA, IEIGA, VA, M, &
+                  VA, M, WORK, LWORK, INFO ) ! LAPACK CALL
+      DEALLOCATE(WORK)
+      TMP = ZERO
+      DO i = 1, M
+         EIGA(i,1) = REIGA(i)
+         EIGA(i,2) = IEIGA(i)
+         TMP = MAX( TMP, SQRT(REIGA(i)**2+IEIGA(i)**2))
+      END DO
+
+      ! Scale A to have the desirable spectral radius.
+      CALL DLASCL( 'G', 0, 0, TMP, ONE, M, M, A, M, INFO )
+      CALL DLASCL( 'G', 0, 0, TMP, ONE, M, 2, EIGA, M, INFO )
+
+      ! Compute the norm of A
+      ANORM = DLANGE( 'F', N, N, A, M, WDUMMY )
+
+      IF ( K_TRAJ == 2 ) THEN
+          ! generate data with two initial conditions
+      CALL DLARNV(2, ISEED, M, F1(1,1) )
+      F1(1:M,1) = 1.0E-10*F1(1:M,1)
+      DO i = 1, N/2
+         CALL DGEMV( 'N', M, M, ONE, A, M, F1(1,i), 1, ZERO, &
+              F1(1,i+1), 1 )
+      END DO
+      X0(1:M,1:N/2) = F1(1:M,1:N/2)
+      Y0(1:M,1:N/2) = F1(1:M,2:N/2+1)
+
+      CALL DLARNV(2, ISEED, M, F1(1,1) )
+      DO i = 1, N-N/2
+         CALL DGEMV( 'N', M, M, ONE, A, M, F1(1,i), 1, ZERO, &
+              F1(1,i+1), 1 )
+      END DO
+      X0(1:M,N/2+1:N) = F1(1:M,1:N-N/2)
+      Y0(1:M,N/2+1:N) = F1(1:M,2:N-N/2+1)
+      ELSE
+      CALL DLARNV(2, ISEED, M, F(1,1) )
+      DO i = 1, N
+         CALL DGEMV( 'N', M, M, ONE, A, M, F(1,i), 1, ZERO, &
+              F(1,i+1), 1 )
+      END DO
+      X0(1:M,1:N) = F(1:M,1:N)
+      Y0(1:M,1:N) = F(1:M,2:N+1)
+      END IF
+
+      XNORM = DLANGE( 'F', M, N, X0, LDX, WDUMMY )
+      YNORM = DLANGE( 'F', M, N, Y0, LDX, WDUMMY )
+!............................................................
+
+      DO iJOBZ = 1, 4
+
+          SELECT CASE ( iJOBZ )
+          CASE(1)
+              JOBZ   = 'V' ! Ritz vectors will be computed
+              RESIDS = 'R' ! Residuals will be computed
+          CASE(2)
+              JOBZ   = 'V'
+              RESIDS = 'N'
+          CASE(3)
+              JOBZ   = 'F' ! Ritz vectors in factored form
+              RESIDS = 'N'
+          CASE(4)
+              JOBZ   = 'N'
+              RESIDS = 'N'
+          END SELECT
+
+      DO iJOBREF = 1, 3
+
+          SELECT CASE ( iJOBREF )
+          CASE(1)
+              JOBREF = 'R' ! Data for refined Ritz vectors
+          CASE(2)
+              JOBREF = 'E' ! Exact DMD vectors
+          CASE(3)
+              JOBREF = 'N'
+          END SELECT
+
+      DO iSCALE = 1, 4
+
+          SELECT CASE ( iSCALE )
+          CASE(1)
+              SCALE = 'S' ! X data normalized
+          CASE(2)
+              SCALE = 'C' ! X normalized, consist. check
+          CASE(3)
+              SCALE = 'Y' ! Y data normalized
+          CASE(4)
+              SCALE = 'N'
+          END SELECT
+
+      DO iNRNK = -1, -2, -1
+          ! Two truncation strategies. The "-2" case for R&D
+          ! purposes only - it uses possibly low accuracy small
+          ! singular values, in which case the formulas used in
+          ! the DMD are highly sensitive.
+          NRNK   = iNRNK
+
+      DO iWHTSVD = 1, 4
+          ! Check all four options to compute the POD basis
+          ! via the SVD.
+          WHTSVD   = iWHTSVD
+
+      DO LWMINOPT = 1, 2
+          ! Workspace query for the minimal (1) and for the optimal
+          ! (2) workspace lengths determined by workspace query.
+
+      X(1:M,1:N) = X0(1:M,1:N)
+      Y(1:M,1:N) = Y0(1:M,1:N)
+
+      ! DGEDMD: Workspace query and workspace allocation
+      CALL DGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, M, &
+           N, X, LDX, Y, LDY, NRNK, TOL, K, REIG, IEIG, Z, &
+           LDZ, RES, AU, LDAU, W, LDW, S, LDS, WDUMMY, -1, &
+           IDUMMY, -1, INFO )
+
+      LIWORK = IDUMMY(1)
+      ALLOCATE( IWORK(LIWORK) )
+      LWORK = INT(WDUMMY(LWMINOPT))
+      ALLOCATE( WORK(LWORK) )
+
+      ! DGEDMD test: CALL DGEDMD
+      CALL DGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, M, &
+           N, X, LDX, Y, LDY, NRNK, TOL, K, REIG, IEIG, Z, &
+           LDZ, RES, AU, LDAU, W, LDW, S, LDS, WORK, LWORK,&
+           IWORK, LIWORK, INFO )
+
+      SINGVX(1:N) = WORK(1:N)
+
+      !...... DGEDMD check point
+      IF ( LSAME(JOBZ,'V')  ) THEN
+          ! Check that Z = X*W, on return from DGEDMD
+          ! This checks that the returned eigenvectors in Z are
+          ! the product of the SVD/POD basis returned in X
+          ! and the eigenvectors of the Rayleigh quotient
+          ! returned in W
+          CALL DGEMM( 'N', 'N', M, K, K, ONE, X, LDX, W, LDW, &
+                      ZERO, Z1, LDZ )
+          TMP = ZERO
+          DO i = 1, K
+             CALL DAXPY( M, -ONE, Z(1,i), 1, Z1(1,i), 1)
+             TMP = MAX(TMP, DNRM2( M, Z1(1,i), 1 ) )
+          END DO
+          TMP_ZXW = MAX(TMP_ZXW, TMP )
+
+          IF ( TMP_ZXW > 10*M*EPS ) THEN
+              NFAIL_Z_XV = NFAIL_Z_XV + 1
+              WRITE(*,*) ':( .................DGEDMD FAILED!', &
+                  'Check the code for implementation errors.'
+              WRITE(*,*) 'The input parameters were ',&
+                 SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, &
+                 M, N, LDX, LDY, NRNK, TOL
+          END IF
+
+      END IF
+
+      !...... DGEDMD check point
+      IF ( LSAME(JOBREF,'R') ) THEN
+          ! The matrix A*U is returned for computing refined Ritz vectors.
+          ! Check that A*U is computed correctly using the formula
+          ! A*U = Y * V * inv(SIGMA). This depends on the
+          ! accuracy in the computed singular values and vectors of X.
+          ! See the paper for an error analysis.
+          ! Note that the left singular vectors of the input matrix X
+          ! are returned in the array X.
+          CALL DGEMM( 'N', 'N', M, K, M, ONE, A, LDA, X, LDX, &
+                     ZERO, Z1, LDZ )
+          TMP = ZERO
+          DO i = 1, K
+              CALL DAXPY( M, -ONE, AU(1,i), 1, Z1(1,i), 1)
+              TMP = MAX( TMP, DNRM2( M, Z1(1,i),1 ) * &
+                       SINGVX(K)/(ANORM*SINGVX(1)) )
+          END DO
+          TMP_AU = MAX( TMP_AU, TMP )
+
+          IF ( TMP > TOL2 ) THEN
+              NFAIL_AU = NFAIL_AU + 1
+              WRITE(*,*) ':( .................DGEDMD FAILED!', &
+                  'Check the code for implementation errors.'
+              WRITE(*,*) 'The input parameters were ',&
+                 SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, &
+                 M, N, LDX, LDY, NRNK, TOL
+          END IF
+
+      ELSEIF ( LSAME(JOBREF,'E') ) THEN
+      ! The unscaled vectors of the Exact DMD are computed.
+      ! This option is included for the sake of completeness,
+      ! for users who prefer the Exact DMD vectors. The
+      ! returned vectors are in the real form, in the same way
+      ! as the Ritz vectors. Here we just save the vectors
+      ! and test them separately using a Matlab script.
+
+       CALL DGEMM( 'N', 'N', M, K, M, ONE, A, LDA, AU, LDAU, ZERO, Y1, M )
+       i=1
+       DO WHILE ( i <= K )
+           IF ( IEIG(i) == ZERO ) THEN
+           ! have a real eigenvalue with real eigenvector
+           CALL DAXPY( M, -REIG(i), AU(1,i), 1, Y1(1,i), 1 )
+           RESEX(i) = DNRM2( M, Y1(1,i), 1) / DNRM2(M,AU(1,i),1)
+           i = i + 1
+           ELSE
+           ! Have a complex conjugate pair
+           ! REIG(i) +- sqrt(-1)*IEIG(i).
+           ! Since all computation is done in real
+           ! arithmetic, the formula for the residual
+           ! is recast for real representation of the
+           ! complex conjugate eigenpair. See the
+           ! description of RES.
+           AB(1,1) =  REIG(i)
+           AB(2,1) = -IEIG(i)
+           AB(1,2) =  IEIG(i)
+           AB(2,2) =  REIG(i)
+           CALL DGEMM( 'N', 'N', M, 2, 2, -ONE, AU(1,i), &
+                       M, AB, 2, ONE, Y1(1,i), M )
+           RESEX(i)   = DLANGE( 'F', M, 2, Y1(1,i), M, &
+                        WORK )/ DLANGE( 'F', M, 2, AU(1,i), M, &
+                        WORK )
+           RESEX(i+1) = RESEX(i)
+           i = i + 2
+           END IF
+       END DO
+
+      END IF
+
+      !...... DGEDMD check point
+      IF ( LSAME(RESIDS, 'R') ) THEN
+          ! Compare the residuals returned by DGEDMD with the
+          ! explicitly computed residuals using the matrix A.
+          ! Compute explicitly Y1 = A*Z
+          CALL DGEMM( 'N', 'N', M, K, M, ONE, A, LDA, Z, LDZ, ZERO, Y1, M )
+          ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms
+          ! of the invariant subspaces that correspond to complex conjugate
+          ! pairs of eigenvalues. (See the description of Z in DGEDMD.)
+          i = 1
+          DO WHILE ( i <= K )
+              IF ( IEIG(i) == ZERO ) THEN
+                  ! have a real eigenvalue with real eigenvector
+                  CALL DAXPY( M, -REIG(i), Z(1,i), 1, Y1(1,i), 1 )
+                  RES1(i) = DNRM2( M, Y1(1,i), 1)
+                  i = i + 1
+              ELSE
+                  ! Have a complex conjugate pair
+                  ! REIG(i) +- sqrt(-1)*IEIG(i).
+                  ! Since all computation is done in real
+                  ! arithmetic, the formula for the residual
+                  ! is recast for real representation of the
+                  ! complex conjugate eigenpair. See the
+                  ! description of RES.
+                  AB(1,1) =  REIG(i)
+                  AB(2,1) = -IEIG(i)
+                  AB(1,2) =  IEIG(i)
+                  AB(2,2) =  REIG(i)
+                  CALL DGEMM( 'N', 'N', M, 2, 2, -ONE, Z(1,i), &
+                              M, AB, 2, ONE, Y1(1,i), M )
+                  RES1(i)   = DLANGE( 'F', M, 2, Y1(1,i), M, &
+                                     WORK )
+                  RES1(i+1) = RES1(i)
+                  i = i + 2
+              END IF
+          END DO
+          TMP = ZERO
+          DO i = 1, K
+              TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * &
+                        SINGVX(K)/(ANORM*SINGVX(1)) )
+          END DO
+          TMP_REZ = MAX( TMP_REZ, TMP )
+
+          IF ( TMP > TOL2 ) THEN
+              NFAIL_REZ = NFAIL_REZ + 1
+              WRITE(*,*) ':( ..................DGEDMD FAILED!', &
+                  'Check the code for implementation errors.'
+              WRITE(*,*) 'The input parameters were ',&
+                 SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, &
+                 M, N, LDX, LDY, NRNK, TOL
+          END IF
+
+          IF ( LSAME(JOBREF,'E') ) THEN
+              TMP = ZERO
+              DO i = 1, K
+                  TMP = MAX( TMP, ABS(RES1(i) - RESEX(i))/(RES1(i)+RESEX(i)) )
+              END DO
+              TMP_EX = MAX(TMP_EX,TMP)
+          END IF
+
+      END IF
+
+      !..... store the results for inspection
+      DO i = 1, K
+          LAMBDA(i,1) = REIG(i)
+          LAMBDA(i,2) = IEIG(i)
+      END DO
+
+      DEALLOCATE(IWORK)
+      DEALLOCATE(WORK)
+
+      !======================================================================
+      !     Now test the DGEDMDQ
+      !======================================================================
+      IF ( TEST_QRDMD .AND. (K_TRAJ == 1) ) THEN
+          RJOBDATA(2) = 1
+          F1 = F
+
+          ! DGEDMDQ test: Workspace query and workspace allocation
+          CALL DGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, &
+               JOBREF, WHTSVD, M, N+1, F1, LDF, X, LDX, Y, &
+               LDY, NRNK, TOL, KQ, REIGQ, IEIGQ, Z, LDZ,   &
+               RES, AU, LDAU, W, LDW, S, LDS, WDUMMY,      &
+               -1, IDUMMY, -1, INFO )
+          LIWORK = IDUMMY(1)
+          ALLOCATE( IWORK(LIWORK) )
+          LWORK = INT(WDUMMY(LWMINOPT))
+          ALLOCATE(WORK(LWORK))
+          ! DGEDMDQ test: CALL DGEDMDQ
+          CALL DGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, &
+               JOBREF, WHTSVD, M, N+1, F1, LDF, X, LDX, Y, &
+               LDY, NRNK, TOL, KQ, REIGQ, IEIGQ, Z, LDZ,   &
+               RES, AU, LDAU, W, LDW, S, LDS,              &
+               WORK, LWORK, IWORK, LIWORK, INFO )
+
+          SINGVQX(1:KQ) = WORK(MIN(M,N+1)+1: MIN(M,N+1)+KQ)
+
+          !..... DGEDMDQ check point
+          IF ( KQ /= K ) THEN
+              KDIFF = KDIFF+1
+          END IF
+
+          TMP = ZERO
+          DO i = 1, MIN(K, KQ)
+              TMP = MAX(TMP, ABS(SINGVX(i)-SINGVQX(i)) / &
+                                    SINGVX(1) )
+          END DO
+          SVDIFF = MAX( SVDIFF, TMP )
+          IF ( TMP > M*N*EPS ) THEN
+              WRITE(*,*) 'FAILED! Something was wrong with the run.'
+              NFAIL_SVDIFF = NFAIL_SVDIFF + 1
+              DO j =1, 3
+                  write(*,*) j, SINGVX(j), SINGVQX(j)
+                  read(*,*)
+              END DO
+          END IF
+
+          !..... DGEDMDQ check point
+          IF ( LSAME(WANTQ,'Q') .AND. LSAME(WANTR,'R') ) THEN
+              ! Check that the QR factors are computed and returned
+              ! as requested. The residual ||F-Q*R||_F / ||F||_F
+              ! is compared to M*N*EPS.
+              F2 = F
+              CALL DGEMM( 'N', 'N', M, N+1, MIN(M,N+1), -ONE, F1, &
+                          LDF, Y, LDY, ONE, F2, LDF )
+              TMP_FQR = DLANGE( 'F', M, N+1, F2, LDF, WORK ) / &
+                    DLANGE( 'F', M, N+1, F,  LDF, WORK )
+              IF ( TMP_FQR > TOL2 ) THEN
+                  WRITE(*,*) 'FAILED! Something was wrong with the run.'
+                  NFAIL_F_QR = NFAIL_F_QR + 1
+              END IF
+          END IF
+
+          !..... DGEDMDQ check point
+          IF ( LSAME(RESIDS, 'R') ) THEN
+              ! Compare the residuals returned by DGEDMDQ with the
+              ! explicitly computed residuals using the matrix A.
+              ! Compute explicitly Y1 = A*Z
+              CALL DGEMM( 'N', 'N', M, KQ, M, ONE, A, M, Z, M, ZERO, Y1, M )
+              ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms
+              ! of the invariant subspaces that correspond to complex conjugate
+              ! pairs of eigenvalues. (See the description of Z in DGEDMDQ.)
+              i = 1
+              DO WHILE ( i <= KQ )
+                  IF ( IEIGQ(i) == ZERO ) THEN
+                      ! have a real eigenvalue with real eigenvector
+                      CALL DAXPY( M, -REIGQ(i), Z(1,i), 1, Y1(1,i), 1 )
+                      ! Y1(1:M,i) = Y1(1:M,i) - REIGQ(i)*Z(1:M,i)
+                      RES1(i) = DNRM2( M, Y1(1,i), 1)
+                      i = i + 1
+                  ELSE
+                     ! Have a complex conjugate pair
+                     ! REIGQ(i) +- sqrt(-1)*IEIGQ(i).
+                     ! Since all computation is done in real
+                     ! arithmetic, the formula for the residual
+                     ! is recast for real representation of the
+                     ! complex conjugate eigenpair. See the
+                     ! description of RES.
+                     AB(1,1) =  REIGQ(i)
+                     AB(2,1) = -IEIGQ(i)
+                     AB(1,2) =  IEIGQ(i)
+                     AB(2,2) =  REIGQ(i)
+                     CALL DGEMM( 'N', 'N', M, 2, 2, -ONE, Z(1,i), &
+                                 M, AB, 2, ONE, Y1(1,i), M )             ! BLAS CALL
+                     ! Y1(1:M,i:i+1) = Y1(1:M,i:i+1) - Z(1:M,i:i+1) * AB ! INTRINSIC
+                     RES1(i)   = DLANGE( 'F', M, 2, Y1(1,i), M, &
+                                        WORK )                           ! LAPACK CALL
+                     RES1(i+1) = RES1(i)
+                     i = i + 2
+                  END IF
+              END DO
+              TMP = ZERO
+              DO i = 1, KQ
+                  TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * &
+                      SINGVQX(K)/(ANORM*SINGVQX(1)) )
+              END DO
+              TMP_REZQ = MAX( TMP_REZQ, TMP )
+              IF ( TMP > TOL2 ) THEN
+                  NFAIL_REZQ = NFAIL_REZQ + 1
+                  WRITE(*,*) '................ DGEDMDQ FAILED!', &
+                      'Check the code for implementation errors.'
+                  STOP
+              END IF
+
+          END IF
+
+          DO i = 1, KQ
+              LAMBDAQ(i,1) = REIGQ(i)
+              LAMBDAQ(i,2) = IEIGQ(i)
+          END DO
+
+          DEALLOCATE(WORK)
+          DEALLOCATE(IWORK)
+      END IF ! TEST_QRDMD
+!======================================================================
+
+      END DO ! LWMINOPT
+      !write(*,*) 'LWMINOPT loop completed'
+      END DO ! WHTSVD LOOP
+      !write(*,*) 'WHTSVD loop completed'
+      END DO ! NRNK LOOP
+      !write(*,*) 'NRNK loop completed'
+      END DO ! SCALE LOOP
+      !write(*,*) 'SCALE loop completed'
+      END DO ! JOBF LOOP
+      !write(*,*) 'JOBREF loop completed'
+      END DO ! JOBZ LOOP
+      !write(*,*) 'JOBZ loop completed'
+
+      END DO ! MODE -6:6
+      !write(*,*) 'MODE loop completed'
+      END DO ! 1 or 2 trajectories
+      !write(*,*) 'trajectories  loop completed'
+
+      DEALLOCATE(A)
+      DEALLOCATE(AC)
+      DEALLOCATE(DA)
+      DEALLOCATE(DL)
+      DEALLOCATE(F)
+      DEALLOCATE(F1)
+      DEALLOCATE(F2)
+      DEALLOCATE(X)
+      DEALLOCATE(X0)
+      DEALLOCATE(SINGVX)
+      DEALLOCATE(SINGVQX)
+      DEALLOCATE(Y)
+      DEALLOCATE(Y0)
+      DEALLOCATE(Y1)
+      DEALLOCATE(Z)
+      DEALLOCATE(Z1)
+      DEALLOCATE(RES)
+      DEALLOCATE(RES1)
+      DEALLOCATE(RESEX)
+      DEALLOCATE(REIG)
+      DEALLOCATE(IEIG)
+      DEALLOCATE(REIGQ)
+      DEALLOCATE(IEIGQ)
+      DEALLOCATE(REIGA)
+      DEALLOCATE(IEIGA)
+      DEALLOCATE(VA)
+      DEALLOCATE(LAMBDA)
+      DEALLOCATE(LAMBDAQ)
+      DEALLOCATE(EIGA)
+      DEALLOCATE(W)
+      DEALLOCATE(AU)
+      DEALLOCATE(S)
+
+!............................................................
+      !     Generate random M-by-M matrix A. Use DLATMR from
+      END DO ! LLOOP
+
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*) ' Test summary for DGEDMD :'
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*)
+      IF ( NFAIL_Z_XV == 0 ) THEN
+          WRITE(*,*) '>>>> Z - U*V test PASSED.'
+      ELSE
+          WRITE(*,*) 'Z - U*V test FAILED ', NFAIL_Z_XV, ' time(s)'
+          WRITE(*,*) 'Max error ||Z-U*V||_F was ', TMP_ZXW
+          NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_Z_XV
+      END IF
+      IF ( NFAIL_AU == 0 ) THEN
+          WRITE(*,*) '>>>> A*U test PASSED. '
+      ELSE
+          WRITE(*,*) 'A*U test FAILED ', NFAIL_AU, ' time(s)'
+          WRITE(*,*) 'Max A*U test adjusted error measure was ', TMP_AU
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_AU
+      END IF
+
+      IF ( NFAIL_REZ == 0 ) THEN
+          WRITE(*,*) '>>>> Rezidual computation test PASSED.'
+      ELSE
+          WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZ, 'time(s)'
+          WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZ
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_REZ
+      END IF
+
+      IF ( NFAIL_TOTAL == 0 ) THEN
+          WRITE(*,*) '>>>> DGEDMD :: ALL TESTS PASSED.'
+      ELSE
+          WRITE(*,*) NFAIL_TOTAL, 'FAILURES!'
+          WRITE(*,*) '>>>>>>>>>>>>>> DGEDMD :: TESTS FAILED. CHECK THE IMPLEMENTATION.'
+      END IF
+
+      IF ( TEST_QRDMD ) THEN
+      WRITE(*,*)
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*) ' Test summary for DGEDMDQ :'
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*)
+
+      IF ( NFAIL_SVDIFF == 0 ) THEN
+          WRITE(*,*) '>>>> DGEDMD and DGEDMDQ computed singular &
+              &values test PASSED.'
+      ELSE
+          WRITE(*,*) 'DGEDMD and DGEDMDQ discrepancies in &
+              &the singular values unacceptable ', &
+              NFAIL_SVDIFF, ' times. Test FAILED.'
+          WRITE(*,*) 'The maximal discrepancy in the singular values (relative to the norm) was ', SVDIFF
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_SVDIFF
+      END IF
+
+      IF ( NFAIL_F_QR == 0 ) THEN
+          WRITE(*,*) '>>>> F - Q*R test PASSED.'
+      ELSE
+          WRITE(*,*) 'F - Q*R test FAILED ', NFAIL_F_QR, ' time(s)'
+          WRITE(*,*) 'The largest relative residual was ', TMP_FQR
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_F_QR
+      END IF
+
+      IF ( NFAIL_REZQ == 0 ) THEN
+          WRITE(*,*) '>>>> Rezidual computation test PASSED.'
+      ELSE
+          WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZQ, 'time(s)'
+          WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZQ
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_REZQ
+      END IF
+
+      IF ( NFAILQ_TOTAL == 0 ) THEN
+          WRITE(*,*) '>>>>>>> DGEDMDQ :: ALL TESTS PASSED.'
+      ELSE
+         WRITE(*,*) NFAILQ_TOTAL, 'FAILURES!'
+         WRITE(*,*) '>>>>>>> DGEDMDQ :: TESTS FAILED. CHECK THE IMPLEMENTATION.'
+      END IF
+
+      END IF
+
+      WRITE(*,*)
+      WRITE(*,*) 'Test completed.'
+      STOP
+      END
diff --git a/lapack-netlib/TESTING/EIG/dchkst.f b/lapack-netlib/TESTING/EIG/dchkst.f
index 2e04f68c5..6e02c84fb 100644
--- a/lapack-netlib/TESTING/EIG/dchkst.f
+++ b/lapack-netlib/TESTING/EIG/dchkst.f
@@ -363,7 +363,7 @@
 *> \verbatim
 *>          D1 is DOUBLE PRECISION array of
 *>                             dimension( max(NN) )
-*>          The eigenvalues of A, as computed by DSTEQR simlutaneously
+*>          The eigenvalues of A, as computed by DSTEQR simultaneously
 *>          with Z.  On exit, the eigenvalues in D1 correspond with the
 *>          matrix in A.
 *> \endverbatim
@@ -645,10 +645,10 @@
       EXTERNAL           ILAENV, DLAMCH, DLARND, DSXT1
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DCOPY, DLABAD, DLACPY, DLASET, DLASUM, DLATMR,
-     $                   DLATMS, DOPGTR, DORGTR, DPTEQR, DSPT21, DSPTRD,
-     $                   DSTEBZ, DSTECH, DSTEDC, DSTEMR, DSTEIN, DSTEQR,
-     $                   DSTERF, DSTT21, DSTT22, DSYT21, DSYTRD, XERBLA
+      EXTERNAL           DCOPY, DLACPY, DLASET, DLASUM, DLATMR, DLATMS,
+     $                   DOPGTR, DORGTR, DPTEQR, DSPT21, DSPTRD, DSTEBZ,
+     $                   DSTECH, DSTEDC, DSTEMR, DSTEIN, DSTEQR, DSTERF,
+     $                   DSTT21, DSTT22, DSYT21, DSYTRD, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, DBLE, INT, LOG, MAX, MIN, SQRT
@@ -715,7 +715,6 @@
 *
       UNFL = DLAMCH( 'Safe minimum' )
       OVFL = ONE / UNFL
-      CALL DLABAD( UNFL, OVFL )
       ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' )
       ULPINV = ONE / ULP
       LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) )
diff --git a/lapack-netlib/TESTING/EIG/dchkst2stg.f b/lapack-netlib/TESTING/EIG/dchkst2stg.f
index 2c98b802d..0fbebc8b7 100644
--- a/lapack-netlib/TESTING/EIG/dchkst2stg.f
+++ b/lapack-netlib/TESTING/EIG/dchkst2stg.f
@@ -384,7 +384,7 @@
 *> \verbatim
 *>          D1 is DOUBLE PRECISION array of
 *>                             dimension( max(NN) )
-*>          The eigenvalues of A, as computed by DSTEQR simlutaneously
+*>          The eigenvalues of A, as computed by DSTEQR simultaneously
 *>          with Z.  On exit, the eigenvalues in D1 correspond with the
 *>          matrix in A.
 *> \endverbatim
@@ -666,10 +666,10 @@
       EXTERNAL           ILAENV, DLAMCH, DLARND, DSXT1
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DCOPY, DLABAD, DLACPY, DLASET, DLASUM, DLATMR,
-     $                   DLATMS, DOPGTR, DORGTR, DPTEQR, DSPT21, DSPTRD,
-     $                   DSTEBZ, DSTECH, DSTEDC, DSTEMR, DSTEIN, DSTEQR,
-     $                   DSTERF, DSTT21, DSTT22, DSYT21, DSYTRD, XERBLA,
+      EXTERNAL           DCOPY, DLACPY, DLASET, DLASUM, DLATMR, DLATMS,
+     $                   DOPGTR, DORGTR, DPTEQR, DSPT21, DSPTRD, DSTEBZ,
+     $                   DSTECH, DSTEDC, DSTEMR, DSTEIN, DSTEQR, DSTERF,
+     $                   DSTT21, DSTT22, DSYT21, DSYTRD, XERBLA,
      $                   DSYTRD_2STAGE
 *     ..
 *     .. Intrinsic Functions ..
@@ -737,7 +737,6 @@
 *
       UNFL = DLAMCH( 'Safe minimum' )
       OVFL = ONE / UNFL
-      CALL DLABAD( UNFL, OVFL )
       ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' )
       ULPINV = ONE / ULP
       LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) )
diff --git a/lapack-netlib/TESTING/EIG/ddrgsx.f b/lapack-netlib/TESTING/EIG/ddrgsx.f
index b3f5e23f4..82470ead3 100644
--- a/lapack-netlib/TESTING/EIG/ddrgsx.f
+++ b/lapack-netlib/TESTING/EIG/ddrgsx.f
@@ -400,7 +400,7 @@
       EXTERNAL           DLCTSX, ILAENV, DLAMCH, DLANGE
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALASVM, DGESVD, DGET51, DGET53, DGGESX, DLABAD,
+      EXTERNAL           ALASVM, DGESVD, DGET51, DGET53, DGGESX,
      $                   DLACPY, DLAKF2, DLASET, DLATM5, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
@@ -478,7 +478,6 @@
       ULPINV = ONE / ULP
       SMLNUM = DLAMCH( 'S' ) / ULP
       BIGNUM = ONE / SMLNUM
-      CALL DLABAD( SMLNUM, BIGNUM )
       THRSH2 = TEN*THRESH
       NTESTT = 0
       NERRS = 0
@@ -984,7 +983,7 @@
      $      / '  2:  A and B are upper triangular matrices, ',
      $      / '  3:  A and B are as type 2, but each second diagonal ',
      $      'block in A_11 and ', /
-     $      '      each third diaongal block in A_22 are 2x2 blocks,',
+     $      '      each third diagonal block in A_22 are 2x2 blocks,',
      $      / '  4:  A and B are block diagonal matrices, ',
      $      / '  5:  (A,B) has potentially close or common ',
      $      'eigenvalues.', / )
diff --git a/lapack-netlib/TESTING/EIG/ddrgvx.f b/lapack-netlib/TESTING/EIG/ddrgvx.f
index c63762134..3de72eb70 100644
--- a/lapack-netlib/TESTING/EIG/ddrgvx.f
+++ b/lapack-netlib/TESTING/EIG/ddrgvx.f
@@ -56,7 +56,7 @@
 *>
 *>    | l**H * (beta A - alpha B) | / ( ulp max( |beta A|, |alpha B| ) )
 *>
-*>     where l**H is the conjugate tranpose of l.
+*>     where l**H is the conjugate transpose of l.
 *>
 *> (2) max over all right eigenvalue/-vector pairs (beta/alpha,r) of
 *>
diff --git a/lapack-netlib/TESTING/EIG/ddrvsg.f b/lapack-netlib/TESTING/EIG/ddrvsg.f
index 2e9d3c643..72c373086 100644
--- a/lapack-netlib/TESTING/EIG/ddrvsg.f
+++ b/lapack-netlib/TESTING/EIG/ddrvsg.f
@@ -234,7 +234,7 @@
 *>
 *>  B       DOUBLE PRECISION array, dimension (LDB , max(NN))
 *>          Used to hold the symmetric positive definite matrix for
-*>          the generailzed problem.
+*>          the generalized problem.
 *>          On exit, B contains the last matrix actually
 *>          used.
 *>          Modified.
@@ -399,7 +399,7 @@
       EXTERNAL           LSAME, DLAMCH, DLARND
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DLABAD, DLACPY, DLAFTS, DLASET, DLASUM, DLATMR,
+      EXTERNAL           DLACPY, DLAFTS, DLASET, DLASUM, DLATMR,
      $                   DLATMS, DSBGV, DSBGVD, DSBGVX, DSGT01, DSPGV,
      $                   DSPGVD, DSPGVX, DSYGV, DSYGVD, DSYGVX, XERBLA
 *     ..
@@ -460,7 +460,6 @@
 *
       UNFL = DLAMCH( 'Safe minimum' )
       OVFL = DLAMCH( 'Overflow' )
-      CALL DLABAD( UNFL, OVFL )
       ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' )
       ULPINV = ONE / ULP
       RTUNFL = SQRT( UNFL )
diff --git a/lapack-netlib/TESTING/EIG/ddrvsg2stg.f b/lapack-netlib/TESTING/EIG/ddrvsg2stg.f
index 196c6b48e..0fe31cab1 100644
--- a/lapack-netlib/TESTING/EIG/ddrvsg2stg.f
+++ b/lapack-netlib/TESTING/EIG/ddrvsg2stg.f
@@ -240,7 +240,7 @@
 *>
 *>  B       DOUBLE PRECISION array, dimension (LDB , max(NN))
 *>          Used to hold the symmetric positive definite matrix for
-*>          the generailzed problem.
+*>          the generalized problem.
 *>          On exit, B contains the last matrix actually
 *>          used.
 *>          Modified.
@@ -408,7 +408,7 @@
       EXTERNAL           LSAME, DLAMCH, DLARND
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DLABAD, DLACPY, DLAFTS, DLASET, DLASUM, DLATMR,
+      EXTERNAL           DLACPY, DLAFTS, DLASET, DLASUM, DLATMR,
      $                   DLATMS, DSBGV, DSBGVD, DSBGVX, DSGT01, DSPGV,
      $                   DSPGVD, DSPGVX, DSYGV, DSYGVD, DSYGVX, XERBLA,
      $                   DSYGV_2STAGE
@@ -470,7 +470,6 @@
 *
       UNFL = DLAMCH( 'Safe minimum' )
       OVFL = DLAMCH( 'Overflow' )
-      CALL DLABAD( UNFL, OVFL )
       ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' )
       ULPINV = ONE / ULP
       RTUNFL = SQRT( UNFL )
diff --git a/lapack-netlib/TESTING/EIG/ddrvst.f b/lapack-netlib/TESTING/EIG/ddrvst.f
index a25077018..805fd8271 100644
--- a/lapack-netlib/TESTING/EIG/ddrvst.f
+++ b/lapack-netlib/TESTING/EIG/ddrvst.f
@@ -214,7 +214,7 @@
 *>          Not modified.
 *>
 *>  D1      DOUBLE PRECISION array, dimension (max(NN))
-*>          The eigenvalues of A, as computed by DSTEQR simlutaneously
+*>          The eigenvalues of A, as computed by DSTEQR simultaneously
 *>          with Z.  On exit, the eigenvalues in D1 correspond with the
 *>          matrix in A.
 *>          Modified.
@@ -502,11 +502,11 @@
       EXTERNAL           DLAMCH, DLARND, DSXT1
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALASVM, DLABAD, DLACPY, DLAFTS, DLASET, DLATMR,
-     $                   DLATMS, DSBEV, DSBEVD, DSBEVX, DSPEV, DSPEVD,
-     $                   DSPEVX, DSTEV, DSTEVD, DSTEVR, DSTEVX, DSTT21,
-     $                   DSTT22, DSYEV, DSYEVD, DSYEVR, DSYEVX, DSYT21,
-     $                   DSYT22, XERBLA
+      EXTERNAL           ALASVM, DLACPY, DLAFTS, DLASET, DLATMR, DLATMS,
+     $                   DSBEV, DSBEVD, DSBEVX, DSPEV, DSPEVD, DSPEVX,
+     $                   DSTEV, DSTEVD, DSTEVR, DSTEVX, DSTT21, DSTT22,
+     $                   DSYEV, DSYEVD, DSYEVR, DSYEVX, DSYT21, DSYT22,
+     $                   XERBLA
 *     ..
 *     .. Scalars in Common ..
       CHARACTER*32       SRNAMT
@@ -574,7 +574,6 @@
 *
       UNFL = DLAMCH( 'Safe minimum' )
       OVFL = DLAMCH( 'Overflow' )
-      CALL DLABAD( UNFL, OVFL )
       ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' )
       ULPINV = ONE / ULP
       RTUNFL = SQRT( UNFL )
diff --git a/lapack-netlib/TESTING/EIG/ddrvst2stg.f b/lapack-netlib/TESTING/EIG/ddrvst2stg.f
index c9a2632bb..e38671e38 100644
--- a/lapack-netlib/TESTING/EIG/ddrvst2stg.f
+++ b/lapack-netlib/TESTING/EIG/ddrvst2stg.f
@@ -214,7 +214,7 @@
 *>          Not modified.
 *>
 *>  D1      DOUBLE PRECISION array, dimension (max(NN))
-*>          The eigenvalues of A, as computed by DSTEQR simlutaneously
+*>          The eigenvalues of A, as computed by DSTEQR simultaneously
 *>          with Z.  On exit, the eigenvalues in D1 correspond with the
 *>          matrix in A.
 *>          Modified.
@@ -502,7 +502,7 @@
       EXTERNAL           DLAMCH, DLARND, DSXT1
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALASVM, DLABAD, DLACPY, DLAFTS, DLASET, DLATMR,
+      EXTERNAL           ALASVM, DLACPY, DLAFTS, DLASET, DLATMR,
      $                   DLATMS, DSBEV, DSBEVD, DSBEVX, DSPEV, DSPEVD,
      $                   DSPEVX, DSTEV, DSTEVD, DSTEVR, DSTEVX, DSTT21,
      $                   DSTT22, DSYEV, DSYEVD, DSYEVR, DSYEVX, DSYT21,
@@ -577,7 +577,6 @@
 *
       UNFL = DLAMCH( 'Safe minimum' )
       OVFL = DLAMCH( 'Overflow' )
-      CALL DLABAD( UNFL, OVFL )
       ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' )
       ULPINV = ONE / ULP
       RTUNFL = SQRT( UNFL )
diff --git a/lapack-netlib/TESTING/EIG/derrhs.f b/lapack-netlib/TESTING/EIG/derrhs.f
index fec41c0f0..583bebc9b 100644
--- a/lapack-netlib/TESTING/EIG/derrhs.f
+++ b/lapack-netlib/TESTING/EIG/derrhs.f
@@ -21,8 +21,8 @@
 *>
 *> \verbatim
 *>
-*> DERRHS tests the error exits for DGEBAK, SGEBAL, SGEHRD, DORGHR,
-*> DORMHR, DHSEQR, SHSEIN, and DTREVC.
+*> DERRHS tests the error exits for DGEBAK, DGEBAL, DGEHRD, DGEHD2,
+*> DORGHR, DORMHR, DHSEQR, DHSEIN, DTREVC, and DTREVC3.
 *> \endverbatim
 *
 *  Arguments:
@@ -86,7 +86,7 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHKXER, DGEBAK, DGEBAL, DGEHRD, DHSEIN, DHSEQR,
-     $                   DORGHR, DORMHR, DTREVC
+     $                   DORGHR, DORMHR, DTREVC, DTREVC3, DGEHD2
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          DBLE
@@ -194,6 +194,29 @@
          CALL CHKXER( 'DGEHRD', INFOT, NOUT, LERR, OK )
          NT = NT + 7
 *
+*        DGEHD2
+*
+         SRNAMT = 'DGEHD2'
+         INFOT = 1
+         CALL DGEHD2( -1, 1, 1, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'DGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL DGEHD2( 0, 0, 0, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'DGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL DGEHD2( 0, 2, 0, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'DGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 3
+         CALL DGEHD2( 1, 1, 0, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'DGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 3
+         CALL DGEHD2( 0, 1, 1, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'DGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL DGEHD2( 2, 1, 1, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'DGEHD2', INFOT, NOUT, LERR, OK )
+         NT = NT + 6
+*
 *        DORGHR
 *
          SRNAMT = 'DORGHR'
@@ -328,7 +351,11 @@
          CALL DHSEQR( 'E', 'V', 2, 1, 2, A, 2, WR, WI, C, 1, W, 1,
      $                INFO )
          CALL CHKXER( 'DHSEQR', INFOT, NOUT, LERR, OK )
-         NT = NT + 9
+         INFOT = 13
+         CALL DHSEQR( 'E', 'N', 2, 1, 2, A, 2, WR, WI, C, 1, W, 1,
+     $                INFO )
+         CALL CHKXER( 'DHSEQR', INFOT, NOUT, LERR, OK )
+         NT = NT + 10
 *
 *        DHSEIN
 *
@@ -399,6 +426,43 @@
      $                INFO )
          CALL CHKXER( 'DTREVC', INFOT, NOUT, LERR, OK )
          NT = NT + 7
+*
+*        DTREVC3
+*
+         SRNAMT = 'DTREVC3'
+         INFOT = 1
+         CALL DTREVC3( '/', 'A', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W,
+     $                 LW, INFO )
+         CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL DTREVC3( 'L', '/', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W,
+     $                 LW, INFO )
+         CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 4
+         CALL DTREVC3( 'L', 'A', SEL, -1, A, 1, VL, 1, VR, 1, 0, M, W,
+     $                 LW, INFO )
+         CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 6
+         CALL DTREVC3( 'L', 'A', SEL, 2, A, 1, VL, 2, VR, 1, 4, M, W,
+     $                 LW, INFO )
+         CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 8
+         CALL DTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W,
+     $                 LW, INFO )
+         CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 10
+         CALL DTREVC3( 'R', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W,
+     $                 LW, INFO )
+         CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 11
+         CALL DTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 1, M, W,
+     $                 LW, INFO )
+         CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 14
+         CALL DTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 2, M, W,
+     $                 2, INFO )
+         CALL CHKXER( 'DTREVC3', INFOT, NOUT, LERR, OK )
+         NT = NT + 8
       END IF
 *
 *     Print a summary line.
diff --git a/lapack-netlib/TESTING/EIG/derrst.f b/lapack-netlib/TESTING/EIG/derrst.f
index f297e5a7d..7d111e2e0 100644
--- a/lapack-netlib/TESTING/EIG/derrst.f
+++ b/lapack-netlib/TESTING/EIG/derrst.f
@@ -21,10 +21,10 @@
 *>
 *> \verbatim
 *>
-*> DERRST tests the error exits for DSYTRD, DORGTR, DORMTR, DSPTRD,
-*> DOPGTR, DOPMTR, DSTEQR, SSTERF, SSTEBZ, SSTEIN, DPTEQR, DSBTRD,
-*> DSYEV, SSYEVX, SSYEVD, DSBEV, SSBEVX, SSBEVD,
-*> DSPEV, SSPEVX, SSPEVD, DSTEV, SSTEVX, SSTEVD, and SSTEDC.
+*> DERRST tests the error exits for DSYTRD, DSYTD2, DORGTR, DORMTR, DSPTRD,
+*> DOPGTR, DOPMTR, DSTEQR, DSTERF, DSTEBZ, DSTEIN, DPTEQR, DSBTRD,
+*> DSYEV, DSYEVX, DSYEVD, DSBEV, DSBEVX, DSBEVD,
+*> DSPEV, DSPEVX, DSPEVD, DSTEV, DSTEVX, DSTEVD, and DSTEDC.
 *> DSYEVD_2STAGE, DSYEVR_2STAGE, DSYEVX_2STAGE,
 *> DSYEV_2STAGE, DSBEV_2STAGE, DSBEVD_2STAGE,
 *> DSBEVX_2STAGE, DSYTRD_2STAGE, DSYTRD_SY2SB,
@@ -95,7 +95,7 @@
      $                   DSBEV, DSBEVD, DSBEVX, DSBTRD, DSPEV, DSPEVD,
      $                   DSPEVX, DSPTRD, DSTEBZ, DSTEDC, DSTEIN, DSTEQR,
      $                   DSTERF, DSTEV, DSTEVD, DSTEVR, DSTEVX, DSYEV,
-     $                   DSYEVD, DSYEVR, DSYEVX, DSYTRD,
+     $                   DSYEVD, DSYEVR, DSYEVX, DSYTRD, DSYTD2,
      $                   DSYEVD_2STAGE, DSYEVR_2STAGE, DSYEVX_2STAGE,
      $                   DSYEV_2STAGE, DSBEV_2STAGE, DSBEVD_2STAGE,
      $                   DSBEVX_2STAGE, DSYTRD_2STAGE, DSYTRD_SY2SB,
@@ -157,6 +157,20 @@
          CALL CHKXER( 'DSYTRD', INFOT, NOUT, LERR, OK )
          NT = NT + 4
 *
+*        DSYTD2
+*
+         SRNAMT = 'DSYTD2'
+         INFOT = 1
+         CALL DSYTD2( '/', 0, A, 1, D, E, TAU, INFO )
+         CALL CHKXER( 'DSYTD2', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL DSYTD2( 'U', -1, A, 1, D, E, TAU, INFO )
+         CALL CHKXER( 'DSYTD2', INFOT, NOUT, LERR, OK )
+         INFOT = 4
+         CALL DSYTD2( 'U', 2, A, 1, D, E, TAU, INFO )
+         CALL CHKXER( 'DSYTD2', INFOT, NOUT, LERR, OK )
+         NT = NT + 3
+*
 *        DSYTRD_2STAGE
 *
          SRNAMT = 'DSYTRD_2STAGE'
@@ -721,12 +735,12 @@
          CALL CHKXER( 'DSYEVR', INFOT, NOUT, LERR, OK )
          INFOT = 18
          CALL DSYEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0,
-     $                M, R, Z, 1, IW, Q, 26*N-1, IW( 2*N+1 ), 10*N,
+     $                M, R, Z, 1, IW, Q, 0, IW( 2*N+1 ), 10*N,
      $                INFO )
          CALL CHKXER( 'DSYEVR', INFOT, NOUT, LERR, OK )
          INFOT = 20
          CALL DSYEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0,
-     $                M, R, Z, 1, IW, Q, 26*N, IW( 2*N+1 ), 10*N-1,
+     $                M, R, Z, 1, IW, Q, 26*N, IW( 2*N+1 ), 0,
      $                INFO )
          CALL CHKXER( 'DSYEVR', INFOT, NOUT, LERR, OK )
          NT = NT + 11
diff --git a/lapack-netlib/TESTING/EIG/dsyl01.f b/lapack-netlib/TESTING/EIG/dsyl01.f
index 782d2cd42..0ea481382 100644
--- a/lapack-netlib/TESTING/EIG/dsyl01.f
+++ b/lapack-netlib/TESTING/EIG/dsyl01.f
@@ -117,13 +117,15 @@
      $                   SCALE, SCALE3, SMLNUM, TNRM, XNRM
 *     ..
 *     .. Local Arrays ..
-      DOUBLE PRECISION   A( MAXM, MAXM ), B( MAXN, MAXN ),
-     $                   C( MAXM, MAXN ), CC( MAXM, MAXN ),
-     $                   X( MAXM, MAXN ),
-     $                   DUML( MAXM ), DUMR( MAXN ),
+      DOUBLE PRECISION   DUML( MAXM ), DUMR( MAXN ),
      $                   D( MAX( MAXM, MAXN ) ), DUM( MAXN ),
-     $                   SWORK( LDSWORK, 126 ), VM( 2 )
-      INTEGER            ISEED( 4 ), IWORK( MAXM + MAXN + 2 ), IDUM( 2 )
+     $                   VM( 2 )
+      INTEGER            ISEED( 4 ), IWORK( MAXM + MAXN + 2 )
+*     ..
+*     .. Allocatable Arrays ..
+      INTEGER            AllocateStatus
+      DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: A, B, C, CC, X,
+     $                   SWORK
 *     ..
 *     .. External Functions ..
       LOGICAL            DISNAN
@@ -136,6 +138,20 @@
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, DBLE, MAX
 *     ..
+*     .. Allocate memory dynamically ..
+      ALLOCATE ( A( MAXM, MAXM ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( B( MAXN, MAXN ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( C( MAXM, MAXN ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( CC( MAXM, MAXN ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( X( MAXM, MAXN ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( SWORK( LDSWORK, 126 ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+*     ..
 *     .. Executable Statements ..
 *
 *     Get machine parameters
@@ -280,6 +296,13 @@
             END DO
          END DO
       END DO
+*
+      DEALLOCATE (A, STAT = AllocateStatus)
+      DEALLOCATE (B, STAT = AllocateStatus)
+      DEALLOCATE (C, STAT = AllocateStatus)
+      DEALLOCATE (CC, STAT = AllocateStatus)
+      DEALLOCATE (X, STAT = AllocateStatus)
+      DEALLOCATE (SWORK, STAT = AllocateStatus)
 *
       RETURN
 *
diff --git a/lapack-netlib/TESTING/EIG/schkdmd.f90 b/lapack-netlib/TESTING/EIG/schkdmd.f90
new file mode 100644
index 000000000..77e3e46c0
--- /dev/null
+++ b/lapack-netlib/TESTING/EIG/schkdmd.f90
@@ -0,0 +1,792 @@
+!     This is a test program for checking the
+!     implementations of the following subroutines
+!
+!     SGEDMD  for computation of the
+!             Dynamic Mode Decomposition (DMD)
+!     SGEDMDQ for computation of a
+!             QR factorization based compressed DMD
+!
+!     Developed and supported by:
+!     ===========================
+!     Developed and coded by Zlatko Drmac, Faculty of Science,
+!     University of Zagreb;  drmac@math.hr
+!     In cooperation with
+!     AIMdyn Inc., Santa Barbara, CA.
+!     ========================================================
+!     How to run the code (compiler, link info)
+!     ========================================================
+!     Compile as FORTRAN 90 (or later) and link with BLAS and
+!     LAPACK libraries.
+!     NOTE: The code is developed and tested on top of the
+!     Intel MKL library (versions 2022.0.3 and 2022.2.0),
+!     using the Intel Fortran compiler.
+!
+!     For developers of the C++ implementation
+!     ========================================================
+!     See the LAPACK++ and Template Numerical Toolkit (TNT)
+!
+!     Note on a development of the GPU HP implementation
+!     ========================================================
+!     Work in progress. See CUDA, MAGMA, SLATE.
+!     NOTE: The four SVD subroutines used in this code are
+!     included as a part of R&D and for the completeness.
+!     This was also an opportunity to test those SVD codes.
+!     If the scaling option is used all four are essentially
+!     equally good. For implementations on HP platforms,
+!     one can use whichever SVD is available.
+!...  .........................................................
+!     NOTE:
+!     When using the Intel MKL 2022.0.3 the subroutine xGESVDQ
+!     (optionally used in xGEDMD) may cause access violation
+!     error for x = S, D, C, Z, but only if called with the
+!     work space query. (At least in our Windows 10 MSVS 2019.)
+!     The problem can be mitigated by downloading the source
+!     code of xGESVDQ from the LAPACK repository and using it
+!     locally instead of the one in the MKL. This seems to
+!     indicate that the problem is indeed in the MKL.
+!     This problem did not appear with Intel MKL 2022.2.0.
+!
+!     NOTE:
+!     xGESDD seems to have a problem with workspace. In some
+!     cases the length of the optimal workspace is returned
+!     smaller than the minimal workspace, as specified in the
+!     code. As a precaution, all optimal workspaces are
+!     set as MAX(minimal, optimal).
+!     Latest implementations of complex xGESDD have different
+!     length of the real workspace. We use max value over
+!     two versions.
+!............................................................
+!............................................................
+!
+      PROGRAM DMD_TEST
+      use iso_fortran_env, only: real32
+      IMPLICIT NONE
+      integer, parameter :: WP = real32
+
+!............................................................
+      REAL(KIND=WP), PARAMETER ::  ONE = 1.0_WP
+      REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP
+!............................................................
+      REAL(KIND=WP), ALLOCATABLE, DIMENSION(:,:) ::          &
+                     A, AC, EIGA, LAMBDA, LAMBDAQ, F, F1, F2,&
+                     Z, Z1, S, AU, W, VA, X, X0, Y, Y0, Y1
+      REAL(KIND=WP), ALLOCATABLE, DIMENSION(:)   ::          &
+                     DA, DL, DR, REIG, REIGA, REIGQ, IEIG,   &
+                     IEIGA, IEIGQ,  RES, RES1, RESEX, SINGVX,&
+                     SINGVQX, WORK
+      INTEGER      , ALLOCATABLE, DIMENSION(:)   ::   IWORK
+      REAL(KIND=WP) :: AB(2,2),   WDUMMY(2)
+      INTEGER       :: IDUMMY(2), ISEED(4), RJOBDATA(8)
+      REAL(KIND=WP) :: ANORM, COND, CONDL, CONDR, DMAX, EPS, &
+                       TOL, TOL2, SVDIFF, TMP, TMP_AU,       &
+                       TMP_FQR, TMP_REZ, TMP_REZQ,  TMP_ZXW, &
+                       TMP_EX, XNORM, YNORM
+!............................................................
+      INTEGER :: K, KQ, LDF, LDS, LDA, LDAU, LDW, LDX, LDY,  &
+                 LDZ, LIWORK, LWORK, M, N, L, LLOOP, NRNK
+      INTEGER :: i, iJOBREF, iJOBZ, iSCALE, INFO, KDIFF,     &
+                 NFAIL, NFAIL_AU, NFAIL_F_QR, NFAIL_REZ,     &
+                 NFAIL_REZQ, NFAIL_SVDIFF, NFAIL_TOTAL, NFAILQ_TOTAL, &
+                 NFAIL_Z_XV, MODE, MODEL, MODER, WHTSVD
+      INTEGER    iNRNK, iWHTSVD, K_TRAJ, LWMINOPT
+      CHARACTER(LEN=1) GRADE, JOBREF, JOBZ, PIVTNG, RSIGN,   &
+                       SCALE, RESIDS, WANTQ, WANTR
+
+      LOGICAL          TEST_QRDMD
+!..... external subroutines (BLAS and LAPACK)
+      EXTERNAL SAXPY,  SGEEV, SGEMM, SGEMV, SLACPY, SLASCL
+      EXTERNAL SLARNV, SLATMR
+!.....external subroutines DMD package, part 1
+!     subroutines under test
+      EXTERNAL SGEDMD, SGEDMDQ
+
+!..... external functions (BLAS and LAPACK)
+      EXTERNAL         SLAMCH, SLANGE, SNRM2
+      REAL(KIND=WP) :: SLAMCH, SLANGE, SNRM2
+      EXTERNAL         LSAME
+      LOGICAL          LSAME
+
+      INTRINSIC ABS, INT, MIN, MAX
+!............................................................
+
+      ! The test is always in pairs : ( SGEDMD and SGEDMDQ )
+      ! because the test includes comparing the results (in pairs).
+!.....................................................................................
+      TEST_QRDMD = .TRUE. ! This code by default performs tests on SGEDMDQ
+                          ! Since the QR factorizations based algorithm is designed for
+                          ! single trajectory data, only single trajectory tests will
+                          ! be performed with xGEDMDQ.
+      WANTQ = 'Q'
+      WANTR = 'R'
+!.................................................................................
+
+      EPS = SLAMCH( 'P' )  ! machine precision SP
+
+      ! Global counters of failures of some particular tests
+      NFAIL      = 0
+      NFAIL_REZ  = 0
+      NFAIL_REZQ = 0
+      NFAIL_Z_XV = 0
+      NFAIL_F_QR = 0
+      NFAIL_AU   = 0
+      KDIFF      = 0
+      NFAIL_SVDIFF = 0
+      NFAIL_TOTAL  = 0
+      NFAILQ_TOTAL = 0
+
+
+      DO LLOOP = 1, 4
+
+      WRITE(*,*) 'L Loop Index = ', LLOOP
+
+      ! Set the dimensions of the problem ...
+      WRITE(*,*) 'M = '
+      READ(*,*) M
+      WRITE(*,*) M
+      ! ... and the number of snapshots.
+      WRITE(*,*) 'N = '
+      READ(*,*) N
+      WRITE(*,*) N
+
+      ! ... Test the dimensions
+      IF ( ( MIN(M,N) == 0 ) .OR. ( M < N )  ) THEN
+          WRITE(*,*) 'Bad dimensions. Required: M >= N > 0.'
+          STOP
+      END IF
+!.............
+      ! The seed inside the LLOOP so that each pass can be reproduced easily.
+
+      ISEED(1) = 4
+      ISEED(2) = 3
+      ISEED(3) = 2
+      ISEED(4) = 1
+
+      LDA = M
+      LDF = M
+      LDX = MAX(M,N+1)
+      LDY = MAX(M,N+1)
+      LDW = N
+      LDZ = M
+      LDAU = MAX(M,N+1)
+      LDS = N
+
+      TMP_ZXW  = ZERO
+      TMP_AU   = ZERO
+      TMP_REZ  = ZERO
+      TMP_REZQ = ZERO
+      SVDIFF   = ZERO
+      TMP_EX   = ZERO
+
+      !
+      ! Test the subroutines on real data snapshots. All
+      ! computation is done in real arithmetic, even when
+      ! Koopman eigenvalues and modes are complex.
+      !
+      ! Allocate memory space
+      ALLOCATE( A(LDA,M) )
+      ALLOCATE( AC(LDA,M) )
+      ALLOCATE( DA(M) )
+      ALLOCATE( DL(M) )
+      ALLOCATE( F(LDF,N+1) )
+      ALLOCATE( F1(LDF,N+1) )
+      ALLOCATE( F2(LDF,N+1) )
+      ALLOCATE( X(LDX,N) )
+      ALLOCATE( X0(LDX,N) )
+      ALLOCATE( SINGVX(N) )
+      ALLOCATE( SINGVQX(N) )
+      ALLOCATE( Y(LDY,N+1) )
+      ALLOCATE( Y0(LDY,N+1) )
+      ALLOCATE( Y1(M,N+1) )
+      ALLOCATE( Z(LDZ,N) )
+      ALLOCATE( Z1(LDZ,N) )
+      ALLOCATE( RES(N)  )
+      ALLOCATE( RES1(N) )
+      ALLOCATE( RESEX(N) )
+      ALLOCATE( REIG(N) )
+      ALLOCATE( IEIG(N) )
+      ALLOCATE( REIGQ(N) )
+      ALLOCATE( IEIGQ(N) )
+      ALLOCATE( REIGA(M) )
+      ALLOCATE( IEIGA(M) )
+      ALLOCATE( VA(LDA,M) )
+      ALLOCATE( LAMBDA(N,2) )
+      ALLOCATE( LAMBDAQ(N,2) )
+      ALLOCATE( EIGA(M,2) )
+      ALLOCATE( W(LDW,N) )
+      ALLOCATE( AU(LDAU,N) )
+      ALLOCATE( S(N,N) )
+
+      TOL  = M*EPS
+      ! This mimics O(M*N)*EPS bound for accumulated roundoff error.
+      ! The factor 10 is somewhat arbitrary.
+      TOL2 = 10*M*N*EPS
+
+!.............
+
+      DO K_TRAJ = 1, 2
+      !  Number of initial conditions in the simulation/trajectories (1 or 2)
+
+      COND = 1.0D8
+      DMAX = 1.0D2
+      RSIGN = 'F'
+      GRADE = 'N'
+      MODEL = 6
+      CONDL = 1.0D2
+      MODER = 6
+      CONDR = 1.0D2
+      PIVTNG = 'N'
+
+      ! Loop over all parameter MODE values for SLATMR (+1,..,+6)
+      DO MODE = 1, 6
+
+      ALLOCATE( IWORK(2*M) )
+      ALLOCATE(DR(N))
+      CALL SLATMR( M, M, 'S', ISEED, 'N', DA, MODE, COND, &
+                   DMAX, RSIGN, GRADE, DL, MODEL,  CONDL, &
+                   DR, MODER, CONDR, PIVTNG, IWORK, M, M, &
+                   ZERO, -ONE, 'N', A, LDA, IWORK(M+1), INFO )
+      DEALLOCATE(IWORK)
+      DEALLOCATE(DR)
+
+      LWORK = 4*M+1
+      ALLOCATE(WORK(LWORK))
+      AC  = A
+      CALL SGEEV( 'N','V', M, AC, M, REIGA, IEIGA, VA, M, &
+                  VA, M, WORK, LWORK, INFO ) ! LAPACK CALL
+      DEALLOCATE(WORK)
+      TMP = ZERO
+      DO i = 1, M
+          EIGA(i,1) = REIGA(i)
+          EIGA(i,2) = IEIGA(i)
+          TMP = MAX( TMP, SQRT(REIGA(i)**2+IEIGA(i)**2))
+      END DO
+
+      ! Scale A to have the desirable spectral radius.
+      CALL SLASCL( 'G', 0, 0, TMP, ONE, M, M, A, M, INFO )
+      CALL SLASCL( 'G', 0, 0, TMP, ONE, M, 2, EIGA, M, INFO )
+
+      ! Compute the Frobenius norm of the full M-by-M matrix A
+      ANORM = SLANGE( 'F', M, M, A, LDA, WDUMMY )
+
+      IF ( K_TRAJ == 2 ) THEN
+          ! generate data with two initial conditions
+      CALL SLARNV(2, ISEED, M, F1(1,1) )
+      F1(1:M,1) = 1.0E-10*F1(1:M,1)
+      DO i = 1, N/2
+         CALL SGEMV( 'N', M, M, ONE, A, M, F1(1,i), 1, ZERO, &
+              F1(1,i+1), 1 )
+      END DO
+      X0(1:M,1:N/2) = F1(1:M,1:N/2)
+      Y0(1:M,1:N/2) = F1(1:M,2:N/2+1)
+
+      CALL SLARNV(2, ISEED, M, F1(1,1) )
+      DO i = 1, N-N/2
+         CALL SGEMV( 'N', M, M, ONE, A, M, F1(1,i), 1, ZERO, &
+              F1(1,i+1), 1 )
+      END DO
+      X0(1:M,N/2+1:N) = F1(1:M,1:N-N/2)
+      Y0(1:M,N/2+1:N) = F1(1:M,2:N-N/2+1)
+      ELSE
+          ! single trajectory
+      CALL SLARNV(2, ISEED, M, F(1,1) )
+      DO i = 1, N
+         CALL SGEMV( 'N', M, M, ONE, A, M, F(1,i), 1, ZERO, &
+              F(1,i+1), 1 )
+      END DO
+      X0(1:M,1:N) = F(1:M,1:N)
+      Y0(1:M,1:N) = F(1:M,2:N+1)
+      END IF
+
+      XNORM = SLANGE( 'F', M, N, X0, LDX, WDUMMY )  ! Frobenius norm of the X0 snapshots
+      YNORM = SLANGE( 'F', M, N, Y0, LDY, WDUMMY )  ! Y0 is allocated with leading dimension LDY
+!............................................................
+
+      DO iJOBZ = 1, 4
+
+          SELECT CASE ( iJOBZ )
+          CASE(1)
+              JOBZ   = 'V' ! Ritz vectors will be computed
+              RESIDS = 'R' ! Residuals will be computed
+          CASE(2)
+              JOBZ   = 'V'
+              RESIDS = 'N'
+          CASE(3)
+              JOBZ   = 'F' ! Ritz vectors in factored form
+              RESIDS = 'N'
+          CASE(4)
+              JOBZ   = 'N'
+              RESIDS = 'N'
+          END SELECT
+
+      DO iJOBREF = 1, 3
+
+          SELECT CASE ( iJOBREF )
+          CASE(1)
+              JOBREF = 'R' ! Data for refined Ritz vectors
+          CASE(2)
+              JOBREF = 'E' ! Exact DMD vectors
+          CASE(3)
+              JOBREF = 'N'
+          END SELECT
+
+      DO iSCALE = 1, 4
+
+          SELECT CASE ( iSCALE )
+          CASE(1)
+              SCALE = 'S' ! X data normalized
+          CASE(2)
+              SCALE = 'C' ! X normalized, consist. check
+          CASE(3)
+              SCALE = 'Y' ! Y data normalized
+          CASE(4)
+              SCALE = 'N'
+          END SELECT
+
+      DO iNRNK = -1, -2, -1
+          ! Two truncation strategies. The "-2" case for R&D
+          ! purposes only - it uses possibly low accuracy small
+          ! singular values, in which case the formulas used in
+          ! the DMD are highly sensitive.
+          NRNK   = iNRNK
+
+      DO iWHTSVD = 1, 4
+          ! Check all four options to compute the POD basis
+          ! via the SVD.
+          WHTSVD   = iWHTSVD
+
+      DO LWMINOPT = 1, 2
+          ! Workspace query for the minimal (1) and for the optimal
+          ! (2) workspace lengths determined by workspace query.
+
+       X(1:M,1:N) = X0(1:M,1:N)
+       Y(1:M,1:N) = Y0(1:M,1:N)
+
+       ! SGEDMD: Workspace query and workspace allocation
+       CALL SGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, M, &
+            N, X, LDX, Y, LDY, NRNK, TOL, K, REIG, IEIG, Z, &
+            LDZ, RES, AU, LDAU, W, LDW, S, LDS, WDUMMY, -1, &
+            IDUMMY, -1, INFO )
+
+       LIWORK = IDUMMY(1)
+       ALLOCATE( IWORK(LIWORK) )
+       LWORK = INT(WDUMMY(LWMINOPT))
+       ALLOCATE( WORK(LWORK) )
+
+       ! SGEDMD test: CALL SGEDMD
+       CALL SGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, M, &
+            N, X, LDX, Y, LDY, NRNK, TOL, K, REIG, IEIG, Z, &
+            LDZ, RES, AU, LDAU, W, LDW, S, LDS, WORK, LWORK,&
+            IWORK, LIWORK, INFO )
+
+       SINGVX(1:N) = WORK(1:N)
+
+       !...... SGEDMD check point
+       IF ( LSAME(JOBZ,'V')  ) THEN
+          ! Check that Z = X*W, on return from SGEDMD
+          ! This checks that the returned eigenvectors in Z are
+          ! the product of the SVD/POD basis returned in X
+          ! and the eigenvectors of the Rayleigh quotient
+          ! returned in W
+          CALL SGEMM( 'N', 'N', M, K, K, ONE, X, LDX, W, LDW, &
+                      ZERO, Z1, LDZ )
+          TMP = ZERO
+          DO i = 1, K
+             CALL SAXPY( M, -ONE, Z(1,i), 1, Z1(1,i), 1)
+             TMP = MAX(TMP, SNRM2( M, Z1(1,i), 1 ) )
+          END DO
+          TMP_ZXW = MAX(TMP_ZXW, TMP )
+
+          IF ( TMP > 10*M*EPS ) THEN  ! test this case's error, not the running maximum
+              NFAIL_Z_XV = NFAIL_Z_XV + 1
+          END IF
+
+       END IF
+
+       !...... SGEDMD check point
+       IF ( LSAME(JOBREF,'R') ) THEN
+           ! The matrix A*U is returned for computing refined Ritz vectors.
+           ! Check that A*U is computed correctly using the formula
+           ! A*U = Y * V * inv(SIGMA). This depends on the
+           ! accuracy in the computed singular values and vectors of X.
+           ! See the paper for an error analysis.
+           ! Note that the left singular vectors of the input matrix X
+           ! are returned in the array X.
+           CALL SGEMM( 'N', 'N', M, K, M, ONE, A, LDA, X, LDX, &
+                      ZERO, Z1, LDZ )
+           TMP = ZERO
+           DO i = 1, K
+              CALL SAXPY( M, -ONE, AU(1,i), 1, Z1(1,i), 1)
+              TMP = MAX( TMP, SNRM2( M, Z1(1,i),1 ) * &
+                       SINGVX(K)/(ANORM*SINGVX(1)) )
+           END DO
+           TMP_AU = MAX( TMP_AU, TMP )
+
+           IF ( TMP > TOL2 ) THEN
+               NFAIL_AU = NFAIL_AU + 1
+           END IF
+
+       ELSEIF ( LSAME(JOBREF,'E') ) THEN
+           ! The unscaled vectors of the Exact DMD are computed.
+           ! This option is included for the sake of completeness,
+           ! for users who prefer the Exact DMD vectors. The
+           ! returned vectors are in the real form, in the same way
+           ! as the Ritz vectors. Here the relative residuals RESEX of
+           ! these vectors are computed explicitly using the matrix A.
+
+           CALL SGEMM( 'N', 'N', M, K, M, ONE, A, LDA, AU, LDAU, ZERO, Y1, M )
+           i = 1
+           DO WHILE ( i <= K )
+              IF ( IEIG(i) == ZERO ) THEN
+                 ! have a real eigenvalue with real eigenvector
+                 CALL SAXPY( M, -REIG(i), AU(1,i), 1, Y1(1,i), 1 )
+                 RESEX(i) = SNRM2( M, Y1(1,i), 1) / SNRM2(M,AU(1,i),1)
+                 i = i + 1
+              ELSE
+                 ! Have a complex conjugate pair
+                 ! REIG(i) +- sqrt(-1)*IEIG(i).
+                 ! Since all computation is done in real
+                 ! arithmetic, the formula for the residual
+                 ! is recast for real representation of the
+                 ! complex conjugate eigenpair. See the
+                 ! description of RES. AU has leading dimension LDAU.
+                 AB(1,1) =  REIG(i)
+                 AB(2,1) = -IEIG(i)
+                 AB(1,2) =  IEIG(i)
+                 AB(2,2) =  REIG(i)
+                 CALL SGEMM( 'N', 'N', M, 2, 2, -ONE, AU(1,i), &
+                             LDAU, AB, 2, ONE, Y1(1,i), M )
+                 RESEX(i)   = SLANGE( 'F', M, 2, Y1(1,i), M, &
+                              WORK )/ SLANGE( 'F', M, 2, AU(1,i), LDAU, &
+                              WORK )
+                 RESEX(i+1) = RESEX(i)
+                 i = i + 2
+              END IF
+           END DO
+
+       END IF
+
+      !...... SGEDMD check point
+      IF ( LSAME(RESIDS, 'R') ) THEN
+          ! Compare the residuals returned by SGEDMD with the
+          ! explicitly computed residuals using the matrix A.
+          ! Compute explicitly Y1 = A*Z
+          CALL SGEMM( 'N', 'N', M, K, M, ONE, A, LDA, Z, LDZ, ZERO, Y1, M )
+          ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms
+          ! of the invariant subspaces that correspond to complex conjugate
+          ! pairs of eigenvalues. (See the description of Z in SGEDMD.)
+          i = 1
+          DO WHILE ( i <= K )
+            IF ( IEIG(i) == ZERO ) THEN
+                ! have a real eigenvalue with real eigenvector
+                CALL SAXPY( M, -REIG(i), Z(1,i), 1, Y1(1,i), 1 )
+                RES1(i) = SNRM2( M, Y1(1,i), 1)
+                i = i + 1
+            ELSE
+               ! Have a complex conjugate pair
+               ! REIG(i) +- sqrt(-1)*IMEIG(i).
+               ! Since all computation is done in real
+               ! arithmetic, the formula for the residual
+               ! is recast for real representation of the
+               ! complex conjugate eigenpair. See the
+               ! description of RES.
+               AB(1,1) =  REIG(i)
+               AB(2,1) = -IEIG(i)
+               AB(1,2) =  IEIG(i)
+               AB(2,2) =  REIG(i)
+               CALL SGEMM( 'N', 'N', M, 2, 2, -ONE, Z(1,i), &
+                           M, AB, 2, ONE, Y1(1,i), M )
+               RES1(i)   = SLANGE( 'F', M, 2, Y1(1,i), M, &
+                                  WORK )
+               RES1(i+1) = RES1(i)
+               i = i + 2
+            END IF
+          END DO
+          TMP = ZERO
+          DO i = 1, K
+          TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * &
+                    SINGVX(K)/(ANORM*SINGVX(1)) )
+          END DO
+          TMP_REZ = MAX( TMP_REZ, TMP )
+
+          IF ( TMP > TOL2 ) THEN
+              NFAIL_REZ = NFAIL_REZ + 1
+          END IF
+
+         IF ( LSAME(JOBREF,'E') ) THEN
+            TMP = ZERO
+          DO i = 1, K
+          TMP = MAX( TMP, ABS(RES1(i) - RESEX(i))/(RES1(i)+RESEX(i)) )
+          END DO
+          TMP_EX = MAX(TMP_EX,TMP)
+         END IF
+
+      END IF
+
+      ! ... store the results for inspection
+      DO i = 1, K
+          LAMBDA(i,1) = REIG(i)
+          LAMBDA(i,2) = IEIG(i)
+      END DO
+
+      DEALLOCATE(IWORK)
+      DEALLOCATE(WORK)
+
+      !======================================================================
+      !     Now test the SGEDMDQ, if requested.
+      !======================================================================
+      IF ( TEST_QRDMD .AND. (K_TRAJ == 1) ) THEN
+          RJOBDATA(2) = 1
+          F1 = F
+
+          ! SGEDMDQ test: Workspace query and workspace allocation
+          CALL SGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, &
+               JOBREF, WHTSVD, M, N+1, F1, LDF, X, LDX, Y, &
+               LDY, NRNK, TOL, KQ, REIGQ, IEIGQ, Z, LDZ,   &
+               RES, AU, LDAU, W, LDW, S, LDS, WDUMMY,      &
+               -1, IDUMMY, -1, INFO )
+          LIWORK = IDUMMY(1)
+          ALLOCATE( IWORK(LIWORK) )
+          LWORK = INT(WDUMMY(LWMINOPT))
+          ALLOCATE(WORK(LWORK))
+
+          ! SGEDMDQ test: CALL SGEDMDQ
+          CALL SGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, &
+               JOBREF, WHTSVD, M, N+1, F1, LDF, X, LDX, Y, &
+               LDY, NRNK, TOL, KQ, REIGQ, IEIGQ, Z, LDZ,   &
+               RES, AU, LDAU, W, LDW, S, LDS,              &
+               WORK, LWORK, IWORK, LIWORK, INFO )
+
+          SINGVQX(1:KQ) = WORK(MIN(M,N+1)+1: MIN(M,N+1)+KQ)
+
+          !..... SGEDMDQ check point
+          IF ( KQ /= K ) THEN
+             KDIFF = KDIFF+1
+          END IF
+
+          TMP = ZERO
+          DO i = 1, MIN(K, KQ)
+             TMP = MAX(TMP, ABS(SINGVX(i)-SINGVQX(i)) / &
+                                   SINGVX(1) )
+          END DO
+          SVDIFF = MAX( SVDIFF, TMP )
+          IF ( TMP > M*N*EPS ) THEN
+             NFAIL_SVDIFF = NFAIL_SVDIFF + 1
+          END IF
+
+          !..... SGEDMDQ check point
+          IF ( LSAME(WANTQ,'Q') .AND. LSAME(WANTR,'R') ) THEN
+             ! Check that the QR factors are computed and returned
+             ! as requested. The residual ||F-Q*R||_F / ||F||_F
+             ! is compared to M*N*EPS.
+             F2 = F
+             CALL SGEMM( 'N', 'N', M, N+1, MIN(M,N+1), -ONE, F1, &
+                         LDF, Y, LDY, ONE, F2, LDF )
+             TMP_FQR = SLANGE( 'F', M, N+1, F2, LDF, WORK ) / &
+                   SLANGE( 'F', M, N+1, F,  LDF, WORK )
+             IF ( TMP_FQR > TOL2 ) THEN
+                 NFAIL_F_QR = NFAIL_F_QR + 1
+             END IF
+          END IF
+
+          !..... SGEDMDQ checkpoint
+          IF ( LSAME(RESIDS, 'R') ) THEN
+              ! Compare the residuals returned by SGEDMDQ with the
+              ! explicitly computed residuals using the matrix A.
+              ! Compute explicitly Y1 = A*Z
+              CALL SGEMM( 'N', 'N', M, KQ, M, ONE, A, LDA, Z, LDZ, ZERO, Y1, M )
+              ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms
+              ! of the invariant subspaces that correspond to complex conjugate
+              ! pairs of eigenvalues. (See the description of Z in SGEDMDQ)
+              i = 1
+              DO WHILE ( i <= KQ )
+                IF ( IEIGQ(i) == ZERO ) THEN
+                    ! have a real eigenvalue with real eigenvector
+                    CALL SAXPY( M, -REIGQ(i), Z(1,i), 1, Y1(1,i), 1 )
+                    ! Y1(1:M,i) = Y1(1:M,i) - REIGQ(i)*Z(1:M,i)
+                    RES1(i) = SNRM2( M, Y1(1,i), 1)
+                    i = i + 1
+                ELSE
+                   ! Have a complex conjugate pair
+                   ! REIGQ(i) +- sqrt(-1)*IEIGQ(i).
+                   ! Since all computation is done in real
+                   ! arithmetic, the formula for the residual
+                   ! is recast for real representation of the
+                   ! complex conjugate eigenpair. See the
+                   ! description of RES.
+                   AB(1,1) =  REIGQ(i)
+                   AB(2,1) = -IEIGQ(i)
+                   AB(1,2) =  IEIGQ(i)
+                   AB(2,2) =  REIGQ(i)
+                   CALL SGEMM( 'N', 'N', M, 2, 2, -ONE, Z(1,i), &
+                               LDZ, AB, 2, ONE, Y1(1,i), M )           ! BLAS CALL
+                   ! Y1(1:M,i:i+1) = Y1(1:M,i:i+1) - Z(1:M,i:i+1) * AB ! INTRINSIC
+                   RES1(i)   = SLANGE( 'F', M, 2, Y1(1,i), M, &
+                                      WORK )                           ! LAPACK CALL
+                   RES1(i+1) = RES1(i)
+                   i = i + 2
+                END IF
+              END DO
+              TMP = ZERO
+              DO i = 1, KQ
+              TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * &
+                  SINGVQX(KQ)/(ANORM*SINGVQX(1)) )  ! only SINGVQX(1:KQ) is set by SGEDMDQ
+              END DO
+              TMP_REZQ = MAX( TMP_REZQ, TMP )
+              IF ( TMP > TOL2 ) THEN
+                  NFAIL_REZQ = NFAIL_REZQ + 1
+              END IF
+
+          END IF
+
+          DO i = 1, KQ
+              LAMBDAQ(i,1) = REIGQ(i)
+              LAMBDAQ(i,2) = IEIGQ(i)
+          END DO
+
+      DEALLOCATE(WORK)
+      DEALLOCATE(IWORK)
+      END IF            ! TEST_QRDMD
+!======================================================================
+
+      END DO ! LWMINOPT
+      !write(*,*) 'LWMINOPT loop completed'
+      END DO ! WHTSVD LOOP
+      !write(*,*) 'WHTSVD loop completed'
+      END DO ! NRNK LOOP
+      !write(*,*) 'NRNK loop completed'
+      END DO ! SCALE LOOP
+      !write(*,*) 'SCALE loop completed'
+      END DO ! JOBREF LOOP
+      !write(*,*) 'JOBREF loop completed'
+      END DO ! JOBZ LOOP
+      !write(*,*) 'JOBZ loop completed'
+
+      END DO ! MODE 1:6
+      !write(*,*) 'MODE loop completed'
+      END DO ! 1 or 2 trajectories
+      !write(*,*) 'trajectories  loop completed'
+
+      DEALLOCATE(A)
+      DEALLOCATE(AC)
+      DEALLOCATE(DA)
+      DEALLOCATE(DL)
+      DEALLOCATE(F)
+      DEALLOCATE(F1)
+      DEALLOCATE(F2)
+      DEALLOCATE(X)
+      DEALLOCATE(X0)
+      DEALLOCATE(SINGVX)
+      DEALLOCATE(SINGVQX)
+      DEALLOCATE(Y)
+      DEALLOCATE(Y0)
+      DEALLOCATE(Y1)
+      DEALLOCATE(Z)
+      DEALLOCATE(Z1)
+      DEALLOCATE(RES)
+      DEALLOCATE(RES1)
+      DEALLOCATE(RESEX)
+      DEALLOCATE(REIG)
+      DEALLOCATE(IEIG)
+      DEALLOCATE(REIGQ)
+      DEALLOCATE(IEIGQ)
+      DEALLOCATE(REIGA)
+      DEALLOCATE(IEIGA)
+      DEALLOCATE(VA)
+      DEALLOCATE(LAMBDA)
+      DEALLOCATE(LAMBDAQ)
+      DEALLOCATE(EIGA)
+      DEALLOCATE(W)
+      DEALLOCATE(AU)
+      DEALLOCATE(S)
+
+!............................................................
+      !     Arrays are released above so the next pass can use new M and N.
+      END DO ! LLOOP
+
+
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*) ' Test summary for SGEDMD :'
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*)
+      IF ( NFAIL_Z_XV == 0 ) THEN
+          WRITE(*,*) '>>>> Z - U*V test PASSED.'
+      ELSE
+          WRITE(*,*) 'Z - U*V test FAILED ', NFAIL_Z_XV, ' time(s)'
+          WRITE(*,*) 'Max error ||Z-U*V||_F was ', TMP_ZXW
+          NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_Z_XV
+      END IF
+      IF ( NFAIL_AU == 0 ) THEN
+          WRITE(*,*) '>>>> A*U test PASSED. '
+      ELSE
+          WRITE(*,*) 'A*U test FAILED ', NFAIL_AU, ' time(s)'
+          WRITE(*,*) 'Max A*U test adjusted error measure was ', TMP_AU
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_AU
+      END IF
+
+      IF ( NFAIL_REZ == 0 ) THEN
+          WRITE(*,*) '>>>> Rezidual computation test PASSED.'
+      ELSE
+          WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZ, 'time(s)'
+          WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZ
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_REZ
+      END IF
+
+      IF ( NFAIL_TOTAL == 0 ) THEN
+          WRITE(*,*) '>>>> SGEDMD :: ALL TESTS PASSED.'
+      ELSE
+          WRITE(*,*) NFAIL_TOTAL, 'FAILURES!'
+          WRITE(*,*) '>>>>>>>>>>>>>> SGEDMD :: TESTS FAILED. CHECK THE IMPLEMENTATION.'
+      END IF
+
+      IF ( TEST_QRDMD ) THEN
+      WRITE(*,*)
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*) ' Test summary for SGEDMDQ :'
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*)
+
+      IF ( NFAIL_SVDIFF == 0 ) THEN
+          WRITE(*,*) '>>>> SGEDMD and SGEDMDQ computed singular &
+              &values test PASSED.'
+      ELSE
+          WRITE(*,*) 'SGEDMD and SGEDMDQ discrepancies in &
+              &the singular values unacceptable ', &
+              NFAIL_SVDIFF, ' times. Test FAILED.'
+          WRITE(*,*) 'The maximal discrepancy in the singular values (relative to the norm) was ', SVDIFF
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_SVDIFF
+      END IF
+
+      IF ( NFAIL_F_QR == 0 ) THEN
+          WRITE(*,*) '>>>> F - Q*R test PASSED.'
+      ELSE
+          WRITE(*,*) 'F - Q*R test FAILED ', NFAIL_F_QR, ' time(s)'
+          WRITE(*,*) 'The largest relative residual was ', TMP_FQR
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_F_QR
+      END IF
+
+      IF ( NFAIL_REZQ == 0 ) THEN
+          WRITE(*,*) '>>>> Rezidual computation test PASSED.'
+      ELSE
+          WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZQ, 'time(s)'
+          WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZQ
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_REZQ
+      END IF
+
+      IF ( NFAILQ_TOTAL == 0 ) THEN
+          WRITE(*,*) '>>>>>>> SGEDMDQ :: ALL TESTS PASSED.'
+      ELSE
+         WRITE(*,*) NFAILQ_TOTAL, 'FAILURES!'
+         WRITE(*,*) '>>>>>>> SGEDMDQ :: TESTS FAILED. CHECK THE IMPLEMENTATION.'
+      END IF
+
+      END IF
+
+      WRITE(*,*)
+      WRITE(*,*) 'Test completed.'
+      STOP
+      END
diff --git a/lapack-netlib/TESTING/EIG/schkst.f b/lapack-netlib/TESTING/EIG/schkst.f
index 10622d77a..aecbdfe93 100644
--- a/lapack-netlib/TESTING/EIG/schkst.f
+++ b/lapack-netlib/TESTING/EIG/schkst.f
@@ -363,7 +363,7 @@
 *> \verbatim
 *>          D1 is REAL array of
 *>                             dimension( max(NN) )
-*>          The eigenvalues of A, as computed by SSTEQR simlutaneously
+*>          The eigenvalues of A, as computed by SSTEQR simultaneously
 *>          with Z.  On exit, the eigenvalues in D1 correspond with the
 *>          matrix in A.
 *> \endverbatim
@@ -645,10 +645,10 @@
       EXTERNAL           ILAENV, SLAMCH, SLARND, SSXT1
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SCOPY, SLABAD, SLACPY, SLASET, SLASUM, SLATMR,
-     $                   SLATMS, SOPGTR, SORGTR, SPTEQR, SSPT21, SSPTRD,
-     $                   SSTEBZ, SSTECH, SSTEDC, SSTEMR, SSTEIN, SSTEQR,
-     $                   SSTERF, SSTT21, SSTT22, SSYT21, SSYTRD, XERBLA
+      EXTERNAL           SCOPY, SLACPY, SLASET, SLASUM, SLATMR, SLATMS,
+     $                   SOPGTR, SORGTR, SPTEQR, SSPT21, SSPTRD, SSTEBZ,
+     $                   SSTECH, SSTEDC, SSTEMR, SSTEIN, SSTEQR, SSTERF,
+     $                   SSTT21, SSTT22, SSYT21, SSYTRD, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, INT, LOG, MAX, MIN, REAL, SQRT
@@ -715,7 +715,6 @@
 *
       UNFL = SLAMCH( 'Safe minimum' )
       OVFL = ONE / UNFL
-      CALL SLABAD( UNFL, OVFL )
       ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' )
       ULPINV = ONE / ULP
       LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) )
diff --git a/lapack-netlib/TESTING/EIG/schkst2stg.f b/lapack-netlib/TESTING/EIG/schkst2stg.f
index ac5a3fc39..49899a660 100644
--- a/lapack-netlib/TESTING/EIG/schkst2stg.f
+++ b/lapack-netlib/TESTING/EIG/schkst2stg.f
@@ -384,7 +384,7 @@
 *> \verbatim
 *>          D1 is REAL array of
 *>                             dimension( max(NN) )
-*>          The eigenvalues of A, as computed by SSTEQR simlutaneously
+*>          The eigenvalues of A, as computed by SSTEQR simultaneously
 *>          with Z.  On exit, the eigenvalues in D1 correspond with the
 *>          matrix in A.
 *> \endverbatim
@@ -666,10 +666,10 @@
       EXTERNAL           ILAENV, SLAMCH, SLARND, SSXT1
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SCOPY, SLABAD, SLACPY, SLASET, SLASUM, SLATMR,
-     $                   SLATMS, SOPGTR, SORGTR, SPTEQR, SSPT21, SSPTRD,
-     $                   SSTEBZ, SSTECH, SSTEDC, SSTEMR, SSTEIN, SSTEQR,
-     $                   SSTERF, SSTT21, SSTT22, SSYT21, SSYTRD, XERBLA,
+      EXTERNAL           SCOPY, SLACPY, SLASET, SLASUM, SLATMR, SLATMS,
+     $                   SOPGTR, SORGTR, SPTEQR, SSPT21, SSPTRD, SSTEBZ,
+     $                   SSTECH, SSTEDC, SSTEMR, SSTEIN, SSTEQR, SSTERF,
+     $                   SSTT21, SSTT22, SSYT21, SSYTRD, XERBLA,
      $                   SSYTRD_2STAGE
 *     ..
 *     .. Intrinsic Functions ..
@@ -737,7 +737,6 @@
 *
       UNFL = SLAMCH( 'Safe minimum' )
       OVFL = ONE / UNFL
-      CALL SLABAD( UNFL, OVFL )
       ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' )
       ULPINV = ONE / ULP
       LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) )
diff --git a/lapack-netlib/TESTING/EIG/sdrgsx.f b/lapack-netlib/TESTING/EIG/sdrgsx.f
index ce0d59214..2015cac1c 100644
--- a/lapack-netlib/TESTING/EIG/sdrgsx.f
+++ b/lapack-netlib/TESTING/EIG/sdrgsx.f
@@ -400,7 +400,7 @@
       EXTERNAL           SLCTSX, ILAENV, SLAMCH, SLANGE
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALASVM, SGESVD, SGET51, SGET53, SGGESX, SLABAD,
+      EXTERNAL           ALASVM, SGESVD, SGET51, SGET53, SGGESX,
      $                   SLACPY, SLAKF2, SLASET, SLATM5, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
@@ -479,7 +479,6 @@ c        MINWRK = MAX( 10*( NSIZE+1 ), 5*NSIZE*NSIZE / 2-2 )
       ULPINV = ONE / ULP
       SMLNUM = SLAMCH( 'S' ) / ULP
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
       THRSH2 = TEN*THRESH
       NTESTT = 0
       NERRS = 0
@@ -985,7 +984,7 @@ c        MINWRK = MAX( 10*( NSIZE+1 ), 5*NSIZE*NSIZE / 2-2 )
      $      / '  2:  A and B are upper triangular matrices, ',
      $      / '  3:  A and B are as type 2, but each second diagonal ',
      $      'block in A_11 and ', /
-     $      '      each third diaongal block in A_22 are 2x2 blocks,',
+     $      '      each third diagonal block in A_22 are 2x2 blocks,',
      $      / '  4:  A and B are block diagonal matrices, ',
      $      / '  5:  (A,B) has potentially close or common ',
      $      'eigenvalues.', / )
diff --git a/lapack-netlib/TESTING/EIG/sdrgvx.f b/lapack-netlib/TESTING/EIG/sdrgvx.f
index f6da720ad..91803f60a 100644
--- a/lapack-netlib/TESTING/EIG/sdrgvx.f
+++ b/lapack-netlib/TESTING/EIG/sdrgvx.f
@@ -57,7 +57,7 @@
 *>
 *>    | l**H * (beta A - alpha B) | / ( ulp max( |beta A|, |alpha B| ) )
 *>
-*>     where l**H is the conjugate tranpose of l.
+*>     where l**H is the conjugate transpose of l.
 *>
 *> (2) max over all right eigenvalue/-vector pairs (beta/alpha,r) of
 *>
diff --git a/lapack-netlib/TESTING/EIG/sdrvsg.f b/lapack-netlib/TESTING/EIG/sdrvsg.f
index 877579bcd..0c82b6f49 100644
--- a/lapack-netlib/TESTING/EIG/sdrvsg.f
+++ b/lapack-netlib/TESTING/EIG/sdrvsg.f
@@ -234,7 +234,7 @@
 *>
 *>  B       REAL array, dimension (LDB , max(NN))
 *>          Used to hold the symmetric positive definite matrix for
-*>          the generailzed problem.
+*>          the generalized problem.
 *>          On exit, B contains the last matrix actually
 *>          used.
 *>          Modified.
@@ -399,7 +399,7 @@
       EXTERNAL           LSAME, SLAMCH, SLARND
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SLABAD, SLACPY, SLAFTS, SLASET, SLASUM, SLATMR,
+      EXTERNAL           SLACPY, SLAFTS, SLASET, SLASUM, SLATMR,
      $                   SLATMS, SSBGV, SSBGVD, SSBGVX, SSGT01, SSPGV,
      $                   SSPGVD, SSPGVX, SSYGV, SSYGVD, SSYGVX, XERBLA
 *     ..
@@ -460,7 +460,6 @@
 *
       UNFL = SLAMCH( 'Safe minimum' )
       OVFL = SLAMCH( 'Overflow' )
-      CALL SLABAD( UNFL, OVFL )
       ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' )
       ULPINV = ONE / ULP
       RTUNFL = SQRT( UNFL )
diff --git a/lapack-netlib/TESTING/EIG/sdrvsg2stg.f b/lapack-netlib/TESTING/EIG/sdrvsg2stg.f
index ebd169977..38ed61628 100644
--- a/lapack-netlib/TESTING/EIG/sdrvsg2stg.f
+++ b/lapack-netlib/TESTING/EIG/sdrvsg2stg.f
@@ -240,7 +240,7 @@
 *>
 *>  B       REAL             array, dimension (LDB , max(NN))
 *>          Used to hold the symmetric positive definite matrix for
-*>          the generailzed problem.
+*>          the generalized problem.
 *>          On exit, B contains the last matrix actually
 *>          used.
 *>          Modified.
@@ -408,7 +408,7 @@
       EXTERNAL           LSAME, SLAMCH, SLARND
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SLABAD, SLACPY, SLAFTS, SLASET, SLASUM, SLATMR,
+      EXTERNAL           SLACPY, SLAFTS, SLASET, SLASUM, SLATMR,
      $                   SLATMS, SSBGV, SSBGVD, SSBGVX, SSGT01, SSPGV,
      $                   SSPGVD, SSPGVX, SSYGV, SSYGVD, SSYGVX, XERBLA,
      $                   SSYGV_2STAGE
@@ -470,7 +470,6 @@
 *
       UNFL = SLAMCH( 'Safe minimum' )
       OVFL = SLAMCH( 'Overflow' )
-      CALL SLABAD( UNFL, OVFL )
       ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' )
       ULPINV = ONE / ULP
       RTUNFL = SQRT( UNFL )
diff --git a/lapack-netlib/TESTING/EIG/sdrvst.f b/lapack-netlib/TESTING/EIG/sdrvst.f
index ea0cf66f9..be6d33cee 100644
--- a/lapack-netlib/TESTING/EIG/sdrvst.f
+++ b/lapack-netlib/TESTING/EIG/sdrvst.f
@@ -214,7 +214,7 @@
 *>          Not modified.
 *>
 *>  D1      REAL array, dimension (max(NN))
-*>          The eigenvalues of A, as computed by SSTEQR simlutaneously
+*>          The eigenvalues of A, as computed by SSTEQR simultaneously
 *>          with Z.  On exit, the eigenvalues in D1 correspond with the
 *>          matrix in A.
 *>          Modified.
@@ -502,11 +502,11 @@
       EXTERNAL           SLAMCH, SLARND, SSXT1
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALASVM, SLABAD, SLACPY, SLAFTS, SLASET, SLATMR,
-     $                   SLATMS, SSBEV, SSBEVD, SSBEVX, SSPEV, SSPEVD,
-     $                   SSPEVX, SSTEV, SSTEVD, SSTEVR, SSTEVX, SSTT21,
-     $                   SSTT22, SSYEV, SSYEVD, SSYEVR, SSYEVX, SSYT21,
-     $                   SSYT22, XERBLA
+      EXTERNAL           ALASVM, SLACPY, SLAFTS, SLASET, SLATMR, SLATMS,
+     $                   SSBEV, SSBEVD, SSBEVX, SSPEV, SSPEVD, SSPEVX,
+     $                   SSTEV, SSTEVD, SSTEVR, SSTEVX, SSTT21, SSTT22,
+     $                   SSYEV, SSYEVD, SSYEVR, SSYEVX, SSYT21, SSYT22,
+     $                   XERBLA
 *     ..
 *     .. Scalars in Common ..
       CHARACTER*32       SRNAMT
@@ -574,7 +574,6 @@
 *
       UNFL = SLAMCH( 'Safe minimum' )
       OVFL = SLAMCH( 'Overflow' )
-      CALL SLABAD( UNFL, OVFL )
       ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' )
       ULPINV = ONE / ULP
       RTUNFL = SQRT( UNFL )
diff --git a/lapack-netlib/TESTING/EIG/sdrvst2stg.f b/lapack-netlib/TESTING/EIG/sdrvst2stg.f
index a13a58b48..e05ec0749 100644
--- a/lapack-netlib/TESTING/EIG/sdrvst2stg.f
+++ b/lapack-netlib/TESTING/EIG/sdrvst2stg.f
@@ -214,7 +214,7 @@
 *>          Not modified.
 *>
 *>  D1      REAL             array, dimension (max(NN))
-*>          The eigenvalues of A, as computed by SSTEQR simlutaneously
+*>          The eigenvalues of A, as computed by SSTEQR simultaneously
 *>          with Z.  On exit, the eigenvalues in D1 correspond with the
 *>          matrix in A.
 *>          Modified.
@@ -502,7 +502,7 @@
       EXTERNAL           SLAMCH, SLARND, SSXT1
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALASVM, SLABAD, SLACPY, SLAFTS, SLASET, SLATMR,
+      EXTERNAL           ALASVM, SLACPY, SLAFTS, SLASET, SLATMR,
      $                   SLATMS, SSBEV, SSBEVD, SSBEVX, SSPEV, SSPEVD,
      $                   SSPEVX, SSTEV, SSTEVD, SSTEVR, SSTEVX, SSTT21,
      $                   SSTT22, SSYEV, SSYEVD, SSYEVR, SSYEVX, SSYT21,
@@ -577,7 +577,6 @@
 *
       UNFL = SLAMCH( 'Safe minimum' )
       OVFL = SLAMCH( 'Overflow' )
-      CALL SLABAD( UNFL, OVFL )
       ULP = SLAMCH( 'Epsilon' )*SLAMCH( 'Base' )
       ULPINV = ONE / ULP
       RTUNFL = SQRT( UNFL )
diff --git a/lapack-netlib/TESTING/EIG/serrhs.f b/lapack-netlib/TESTING/EIG/serrhs.f
index 8f0ff98a1..89b7303cd 100644
--- a/lapack-netlib/TESTING/EIG/serrhs.f
+++ b/lapack-netlib/TESTING/EIG/serrhs.f
@@ -21,8 +21,8 @@
 *>
 *> \verbatim
 *>
-*> SERRHS tests the error exits for SGEBAK, SGEBAL, SGEHRD, SORGHR,
-*> SORMHR, SHSEQR, SHSEIN, and STREVC.
+*> SERRHS tests the error exits for SGEBAK, SGEBAL, SGEHRD, SGEHD2,
+*> SORGHR, SORMHR, SHSEQR, SHSEIN, STREVC, and STREVC3.
 *> \endverbatim
 *
 *  Arguments:
@@ -85,7 +85,7 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHKXER, SGEBAK, SGEBAL, SGEHRD, SHSEIN, SHSEQR,
-     $                   SORGHR, SORMHR, STREVC
+     $                   SORGHR, SORMHR, STREVC, STREVC3, SGEHD2
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          REAL
@@ -193,6 +193,29 @@
          CALL CHKXER( 'SGEHRD', INFOT, NOUT, LERR, OK )
          NT = NT + 7
 *
+*        SGEHD2
+*
+         SRNAMT = 'SGEHD2'
+         INFOT = 1
+         CALL SGEHD2( -1, 1, 1, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'SGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL SGEHD2( 0, 0, 0, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'SGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL SGEHD2( 0, 2, 0, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'SGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 3
+         CALL SGEHD2( 1, 1, 0, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'SGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 3
+         CALL SGEHD2( 0, 1, 1, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'SGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL SGEHD2( 2, 1, 1, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'SGEHD2', INFOT, NOUT, LERR, OK )
+         NT = NT + 6
+*
 *        SORGHR
 *
          SRNAMT = 'SORGHR'
@@ -327,7 +350,11 @@
          CALL SHSEQR( 'E', 'V', 2, 1, 2, A, 2, WR, WI, C, 1, W, 1,
      $                INFO )
          CALL CHKXER( 'SHSEQR', INFOT, NOUT, LERR, OK )
-         NT = NT + 9
+         INFOT = 13
+         CALL SHSEQR( 'E', 'N', 2, 1, 2, A, 2, WR, WI, C, 1, W, 1,
+     $                INFO )
+         CALL CHKXER( 'SHSEQR', INFOT, NOUT, LERR, OK )
+         NT = NT + 10
 *
 *        SHSEIN
 *
@@ -398,6 +425,43 @@
      $                INFO )
          CALL CHKXER( 'STREVC', INFOT, NOUT, LERR, OK )
          NT = NT + 7
+*
+*        STREVC3
+*
+         SRNAMT = 'STREVC3'
+         INFOT = 1
+         CALL STREVC3( '/', 'A', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W,
+     $                 LW, INFO )
+         CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL STREVC3( 'L', '/', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W,
+     $                 LW, INFO )
+         CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 4
+         CALL STREVC3( 'L', 'A', SEL, -1, A, 1, VL, 1, VR, 1, 0, M, W,
+     $                 LW, INFO )
+         CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 6
+         CALL STREVC3( 'L', 'A', SEL, 2, A, 1, VL, 2, VR, 1, 4, M, W,
+     $                 LW, INFO )
+         CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 8
+         CALL STREVC3( 'L', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W,
+     $                 LW, INFO )
+         CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 10
+         CALL STREVC3( 'R', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W,
+     $                 LW, INFO )
+         CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 11
+         CALL STREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 1, M, W,
+     $                 LW, INFO )
+         CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 14
+         CALL STREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 2, M, W,
+     $                 2, INFO )
+         CALL CHKXER( 'STREVC3', INFOT, NOUT, LERR, OK )
+         NT = NT + 8
       END IF
 *
 *     Print a summary line.
diff --git a/lapack-netlib/TESTING/EIG/serrst.f b/lapack-netlib/TESTING/EIG/serrst.f
index afb6d4faf..408346382 100644
--- a/lapack-netlib/TESTING/EIG/serrst.f
+++ b/lapack-netlib/TESTING/EIG/serrst.f
@@ -21,7 +21,7 @@
 *>
 *> \verbatim
 *>
-*> SERRST tests the error exits for SSYTRD, SORGTR, SORMTR, SSPTRD,
+*> SERRST tests the error exits for SSYTRD, SSYTD2, SORGTR, SORMTR, SSPTRD,
 *> SOPGTR, SOPMTR, SSTEQR, SSTERF, SSTEBZ, SSTEIN, SPTEQR, SSBTRD,
 *> SSYEV, SSYEVX, SSYEVD, SSBEV, SSBEVX, SSBEVD,
 *> SSPEV, SSPEVX, SSPEVD, SSTEV, SSTEVX, SSTEVD, and SSTEDC.
@@ -95,7 +95,7 @@
      $                   SSBEV, SSBEVD, SSBEVX, SSBTRD, SSPEV, SSPEVD,
      $                   SSPEVX, SSPTRD, SSTEBZ, SSTEDC, SSTEIN, SSTEQR,
      $                   SSTERF, SSTEV, SSTEVD, SSTEVR, SSTEVX, SSYEV,
-     $                   SSYEVD, SSYEVR, SSYEVX, SSYTRD,
+     $                   SSYEVD, SSYEVR, SSYEVX, SSYTRD, SSYTD2,
      $                   SSYEVD_2STAGE, SSYEVR_2STAGE, SSYEVX_2STAGE,
      $                   SSYEV_2STAGE, SSBEV_2STAGE, SSBEVD_2STAGE,
      $                   SSBEVX_2STAGE, SSYTRD_2STAGE, SSYTRD_SY2SB,
@@ -157,6 +157,20 @@
          CALL CHKXER( 'SSYTRD', INFOT, NOUT, LERR, OK )
          NT = NT + 4
 *
+*        SSYTD2
+*
+         SRNAMT = 'SSYTD2'
+         INFOT = 1
+         CALL SSYTD2( '/', 0, A, 1, D, E, TAU, INFO )
+         CALL CHKXER( 'SSYTD2', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL SSYTD2( 'U', -1, A, 1, D, E, TAU, INFO )
+         CALL CHKXER( 'SSYTD2', INFOT, NOUT, LERR, OK )
+         INFOT = 4
+         CALL SSYTD2( 'U', 2, A, 1, D, E, TAU, INFO )
+         CALL CHKXER( 'SSYTD2', INFOT, NOUT, LERR, OK )
+         NT = NT + 3
+*
 *        SSYTRD_2STAGE
 *
          SRNAMT = 'SSYTRD_2STAGE'
@@ -719,12 +733,12 @@
          CALL CHKXER( 'SSYEVR', INFOT, NOUT, LERR, OK )
          INFOT = 18
          CALL SSYEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0,
-     $                M, R, Z, 1, IW, Q, 26*N-1, IW( 2*N+1 ), 10*N,
+     $                M, R, Z, 1, IW, Q, 0, IW( 2*N+1 ), 10*N,
      $                INFO )
          CALL CHKXER( 'SSYEVR', INFOT, NOUT, LERR, OK )
          INFOT = 20
          CALL SSYEVR( 'V', 'I', 'U', 1, A, 1, 0.0E0, 0.0E0, 1, 1, 0.0,
-     $                M, R, Z, 1, IW, Q, 26*N, IW( 2*N+1 ), 10*N-1,
+     $                M, R, Z, 1, IW, Q, 26*N, IW( 2*N+1 ), 0,
      $                INFO )
          CALL CHKXER( 'SSYEVR', INFOT, NOUT, LERR, OK )
          NT = NT + 11
diff --git a/lapack-netlib/TESTING/EIG/ssyl01.f b/lapack-netlib/TESTING/EIG/ssyl01.f
index 22d089dc8..fda30a3c0 100644
--- a/lapack-netlib/TESTING/EIG/ssyl01.f
+++ b/lapack-netlib/TESTING/EIG/ssyl01.f
@@ -117,13 +117,15 @@
      $                   SCALE, SCALE3, SMLNUM, TNRM, XNRM
 *     ..
 *     .. Local Arrays ..
-      REAL               A( MAXM, MAXM ), B( MAXN, MAXN ),
-     $                   C( MAXM, MAXN ), CC( MAXM, MAXN ),
-     $                   X( MAXM, MAXN ),
-     $                   DUML( MAXM ), DUMR( MAXN ),
+      REAL               DUML( MAXM ), DUMR( MAXN ),
      $                   D( MAX( MAXM, MAXN ) ), DUM( MAXN ),
-     $                   SWORK( LDSWORK, 54 ), VM( 2 )
-      INTEGER            ISEED( 4 ), IWORK( MAXM + MAXN + 2 ), IDUM( 2 )
+     $                   VM( 2 )
+      INTEGER            ISEED( 4 ), IWORK( MAXM + MAXN + 2 )
+*     ..
+*     .. Allocatable Arrays ..
+      INTEGER            AllocateStatus
+      REAL, DIMENSION(:,:), ALLOCATABLE :: A, B, C, CC, X,
+     $                   SWORK
 *     ..
 *     .. External Functions ..
       LOGICAL            SISNAN
@@ -136,6 +138,20 @@
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, REAL, MAX
 *     ..
+*     .. Allocate memory dynamically ..
+      ALLOCATE ( A( MAXM, MAXM ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( B( MAXN, MAXN ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( C( MAXM, MAXN ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( CC( MAXM, MAXN ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( X( MAXM, MAXN ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( SWORK( LDSWORK, 54 ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+*     ..
 *     .. Executable Statements ..
 *
 *     Get machine parameters
@@ -280,6 +296,13 @@
             END DO
          END DO
       END DO
+*
+      DEALLOCATE (A, STAT = AllocateStatus)
+      DEALLOCATE (B, STAT = AllocateStatus)
+      DEALLOCATE (C, STAT = AllocateStatus)
+      DEALLOCATE (CC, STAT = AllocateStatus)
+      DEALLOCATE (X, STAT = AllocateStatus)
+      DEALLOCATE (SWORK, STAT = AllocateStatus)
 *
       RETURN
 *
diff --git a/lapack-netlib/TESTING/EIG/zchkdmd.f90 b/lapack-netlib/TESTING/EIG/zchkdmd.f90
new file mode 100644
index 000000000..873d956c4
--- /dev/null
+++ b/lapack-netlib/TESTING/EIG/zchkdmd.f90
@@ -0,0 +1,745 @@
+!     This is a test program for checking the implementations of
+!     the following subroutines
+!
+!     ZGEDMD,  for computation of the
+!              Dynamic Mode Decomposition (DMD)
+!     ZGEDMDQ, for computation of a
+!              QR factorization based compressed DMD
+!
+!     Developed and supported by:
+!     ===========================
+!     Developed and coded by Zlatko Drmac, Faculty of Science,
+!     University of Zagreb;  drmac@math.hr
+!     In cooperation with
+!     AIMdyn Inc., Santa Barbara, CA.
+!     ========================================================
+!     How to run the code (compiler, link info)
+!     ========================================================
+!     Compile as FORTRAN 90 (or later) and link with BLAS and
+!     LAPACK libraries.
+!     NOTE: The code is developed and tested on top of the
+!     Intel MKL library (versions 2022.0.3 and 2022.2.0),
+!     using the Intel Fortran compiler.
+!
+!     For developers of the C++ implementation
+!     ========================================================
+!     See the LAPACK++ and Template Numerical Toolkit (TNT)
+!
+!     Note on a development of the GPU HP implementation
+!     ========================================================
+!     Work in progress. See CUDA, MAGMA, SLATE.
+!     NOTE: The four SVD subroutines used in this code are
+!     included as a part of R&D and for the completeness.
+!     This was also an opportunity to test those SVD codes.
+!     If the scaling option is used all four are essentially
+!     equally good. For implementations on HP platforms,
+!     one can use whichever SVD is available.
+!............................................................
+
+!............................................................
+!............................................................
+!
+      PROGRAM DMD_TEST
+      use iso_fortran_env, only: real64
+      IMPLICIT NONE
+      integer, parameter :: WP = real64
+
+!............................................................
+      REAL(KIND=WP), PARAMETER ::  ONE = 1.0_WP
+      REAL(KIND=WP), PARAMETER :: ZERO = 0.0_WP
+
+      COMPLEX(KIND=WP), PARAMETER ::  ZONE = ( 1.0_WP, 0.0_WP )
+      COMPLEX(KIND=WP), PARAMETER :: ZZERO = ( 0.0_WP, 0.0_WP )
+!............................................................
+      REAL(KIND=WP), ALLOCATABLE, DIMENSION(:)   :: RES, &
+                     RES1, RESEX, SINGVX, SINGVQX, WORK
+      INTEGER      , ALLOCATABLE, DIMENSION(:)   :: IWORK
+      REAL(KIND=WP) :: WDUMMY(2)
+      INTEGER       :: IDUMMY(4), ISEED(4)
+      REAL(KIND=WP) :: ANORM, COND, CONDL, CONDR, EPS,       &
+                       TOL, TOL2, SVDIFF, TMP, TMP_AU,       &
+                       TMP_FQR, TMP_REZ, TMP_REZQ,  TMP_ZXW, &
+                       TMP_EX
+
+!............................................................
+      COMPLEX(KIND=WP) :: ZMAX
+      INTEGER :: LZWORK
+      COMPLEX(KIND=WP), ALLOCATABLE, DIMENSION(:,:) ::  ZA, ZAC,  &
+                                 ZAU, ZF, ZF0, ZF1, ZS, ZW,       &
+                                 ZX, ZX0, ZY, ZY0, ZY1, ZZ, ZZ1
+      COMPLEX(KIND=WP), ALLOCATABLE, DIMENSION(:)   ::  ZDA, ZDR, &
+                                       ZDL, ZEIGS, ZEIGSA, ZWORK
+      COMPLEX(KIND=WP) ::  ZDUMMY(22), ZDUM2X2(2,2)
+!............................................................
+      INTEGER :: K, KQ, LDF, LDS, LDA, LDAU, LDW, LDX, LDY,  &
+                 LDZ, LIWORK, LWORK, M, N, LLOOP, NRNK, NRNKsp
+      INTEGER :: i, iJOBREF, iJOBZ, iSCALE, INFO, j,     &
+                 NFAIL, NFAIL_AU, NFAIL_F_QR, NFAIL_REZ,     &
+                 NFAIL_REZQ, NFAIL_SVDIFF, NFAIL_TOTAL, NFAILQ_TOTAL,  &
+                 NFAIL_Z_XV,  MODE, MODEL, MODER, WHTSVD,     &
+                 WHTSVDsp
+      INTEGER :: iNRNK, iWHTSVD,  K_TRAJ, LWMINOPT
+      CHARACTER :: GRADE, JOBREF, JOBZ, PIVTNG, RSIGN,   &
+                       SCALE, RESIDS, WANTQ, WANTR
+      LOGICAL :: TEST_QRDMD
+
+!.....external subroutines (BLAS and LAPACK)
+      EXTERNAL DAXPY,  DGEEV, DGEMM, DGEMV, DLACPY, DLASCL
+      EXTERNAL ZGEEV,  ZGEMV, ZLASCL
+      EXTERNAL ZLARNV, ZLATMR
+      EXTERNAL ZAXPY,  ZGEMM
+!.....external subroutines DMD package, part 1
+!     subroutines under test
+      EXTERNAL ZGEDMD, ZGEDMDQ
+!.....external functions (BLAS and LAPACK)
+      EXTERNAL         DLAMCH,  DZNRM2
+      REAL(KIND=WP) :: DLAMCH,  DZNRM2
+      REAL(KIND=WP) ::          ZLANGE
+      EXTERNAL IZAMAX
+      INTEGER  IZAMAX
+      EXTERNAL LSAME
+      LOGICAL  LSAME
+
+      INTRINSIC ABS, INT, MIN, MAX, SIGN
+!............................................................
+
+      ! The test is always in pairs : ( ZGEDMD and ZGEDMDQ )
+      ! because the test includes comparing the results (in pairs).
+!.....................................................................................
+      TEST_QRDMD = .TRUE. ! This code by default performs tests on ZGEDMDQ
+                          ! Since the QR factorizations based algorithm is designed for
+                          ! single trajectory data, only single trajectory tests will
+                          ! be performed with xGEDMDQ.
+      WANTQ = 'Q'
+      WANTR = 'R'
+!.................................................................................
+
+      EPS = DLAMCH( 'P' )  ! machine precision DP
+
+      ! Global counters of failures of some particular tests
+      NFAIL      = 0
+      NFAIL_REZ  = 0
+      NFAIL_REZQ = 0
+      NFAIL_Z_XV = 0
+      NFAIL_F_QR = 0
+      NFAIL_AU   = 0
+      NFAIL_SVDIFF = 0
+      NFAIL_TOTAL  = 0
+      NFAILQ_TOTAL = 0
+
+      DO LLOOP = 1, 4
+
+      WRITE(*,*) 'L Loop Index = ', LLOOP
+
+      ! Set the dimensions of the problem ...
+      WRITE(*,*) 'M = '
+      READ(*,*) M
+      WRITE(*,*) M
+      ! ... and the number of snapshots.
+      WRITE(*,*) 'N = '
+      READ(*,*) N
+      WRITE(*,*) N
+
+      ! ... Test the dimensions
+      IF ( ( MIN(M,N) == 0 ) .OR. ( M < N )  ) THEN
+          WRITE(*,*) 'Bad dimensions. Required: M >= N > 0.'
+          STOP
+      END IF
+!.............
+      ! The seed inside the LLOOP so that each pass can be reproduced easily.
+      ISEED(1) = 4
+      ISEED(2) = 3
+      ISEED(3) = 2
+      ISEED(4) = 1
+
+      LDA  = M
+      LDF  = M
+      LDX  = M
+      LDY  = M
+      LDW  = N
+      LDZ  = M
+      LDAU = M
+      LDS  = N
+
+      TMP_ZXW  = ZERO
+      TMP_AU   = ZERO
+      TMP_REZ  = ZERO
+      TMP_REZQ = ZERO
+      SVDIFF   = ZERO
+      TMP_EX   = ZERO
+
+      ALLOCATE( ZA(LDA,M) )
+      ALLOCATE( ZAC(LDA,M) )
+      ALLOCATE( ZF(LDF,N+1) )
+      ALLOCATE( ZF0(LDF,N+1) )
+      ALLOCATE( ZF1(LDF,N+1) )
+      ALLOCATE( ZX(LDX,N) )
+      ALLOCATE( ZX0(LDX,N) )
+      ALLOCATE( ZY(LDY,N+1) )
+      ALLOCATE( ZY0(LDY,N+1) )
+      ALLOCATE( ZY1(LDY,N+1) )
+      ALLOCATE( ZAU(LDAU,N) )
+      ALLOCATE( ZW(LDW,N) )
+      ALLOCATE( ZS(LDS,N) )
+      ALLOCATE( ZZ(LDZ,N) )
+      ALLOCATE( ZZ1(LDZ,N) )
+      ALLOCATE( RES(N) )
+      ALLOCATE( RES1(N) )
+      ALLOCATE( RESEX(N) )
+      ALLOCATE( ZEIGS(N) )
+      ALLOCATE( SINGVX(N) )
+      ALLOCATE( SINGVQX(N) )
+
+      TOL  = M*EPS
+      ! This mimics O(M*N)*EPS bound for accumulated roundoff error.
+      ! The factor 10 is somewhat arbitrary.
+      TOL2 = 10*M*N*EPS
+
+!.............
+
+      DO K_TRAJ = 1, 2
+      !  Number of initial conditions in the simulation/trajectories (1 or 2)
+
+      COND = 1.0D4
+      ZMAX = (1.0D1,1.0D1)
+      RSIGN = 'F'
+      GRADE = 'N'
+      MODEL = 6
+      CONDL = 1.0D1
+      MODER = 6
+      CONDR = 1.0D1
+      PIVTNG = 'N'
+
+      ! Loop over all parameter MODE values for ZLATMR (+1,..,+6)
+      DO MODE = 1, 6
+
+      ALLOCATE( IWORK(2*M) )
+      ALLOCATE( ZDA(M) )
+      ALLOCATE( ZDL(M) )
+      ALLOCATE( ZDR(M) )
+
+      CALL ZLATMR( M, M, 'N', ISEED, 'N', ZDA, MODE, COND, &
+                   ZMAX, RSIGN, GRADE, ZDL, MODEL,  CONDL, &
+                   ZDR, MODER, CONDR, PIVTNG, IWORK, M, M, &
+                   ZERO, -ONE, 'N', ZA, LDA, IWORK(M+1), INFO )
+      DEALLOCATE( ZDR )
+      DEALLOCATE( ZDL )
+      DEALLOCATE( ZDA )
+      DEALLOCATE( IWORK )
+
+      LZWORK = MAX(1,2*M)
+      ALLOCATE( ZEIGSA(M) )
+      ALLOCATE( ZWORK(LZWORK) )
+      ALLOCATE( WORK(2*M) )
+      ZAC(1:M,1:M) = ZA(1:M,1:M)
+      CALL ZGEEV( 'N','N', M, ZAC, LDA, ZEIGSA, ZDUM2X2, 2, &
+                  ZDUM2X2, 2, ZWORK, LZWORK, WORK, INFO ) ! LAPACK CALL
+      DEALLOCATE(WORK)
+      DEALLOCATE(ZWORK)
+
+      TMP = ABS(ZEIGSA(IZAMAX(M, ZEIGSA, 1))) ! The spectral radius of ZA
+      ! Scale the matrix ZA to have unit spectral radius.
+      CALL ZLASCL( 'G',0, 0, TMP, ONE, M, M, &
+                   ZA, LDA, INFO )
+      CALL ZLASCL( 'G',0, 0, TMP, ONE, M, 1, &
+                   ZEIGSA, M, INFO )
+      ANORM = ZLANGE( 'F', M, M, ZA, LDA, WDUMMY )
+
+      IF ( K_TRAJ == 2 ) THEN
+          ! generate data as two trajectories
+          ! with two initial conditions
+          CALL ZLARNV(2, ISEED, M, ZF(1,1) )
+          DO i = 1, N/2
+             CALL ZGEMV( 'N', M, M, ZONE, ZA, LDA, ZF(1,i), 1,  &
+                  ZZERO, ZF(1,i+1), 1 )
+          END DO
+          ZX0(1:M,1:N/2) = ZF(1:M,1:N/2)
+          ZY0(1:M,1:N/2) = ZF(1:M,2:N/2+1)
+
+          CALL ZLARNV(2, ISEED, M, ZF(1,1) )
+          DO i = 1, N-N/2
+             CALL ZGEMV( 'N', M, M, ZONE, ZA, LDA, ZF(1,i), 1,  &
+                  ZZERO, ZF(1,i+1), 1 )
+          END DO
+          ZX0(1:M,N/2+1:N) = ZF(1:M,1:N-N/2)
+          ZY0(1:M,N/2+1:N) = ZF(1:M,2:N-N/2+1)
+      ELSE
+          CALL ZLARNV(2, ISEED, M, ZF(1,1) )
+          DO i = 1, N
+             CALL ZGEMV( 'N', M, M, ZONE, ZA, M, ZF(1,i), 1,  &
+                  ZZERO, ZF(1,i+1), 1 )
+          END DO
+          ZF0(1:M,1:N+1) = ZF(1:M,1:N+1)
+          ZX0(1:M,1:N) = ZF0(1:M,1:N)
+          ZY0(1:M,1:N) = ZF0(1:M,2:N+1)
+      END IF
+
+      DEALLOCATE( ZEIGSA )
+!........................................................................
+
+      DO iJOBZ = 1, 4
+
+          SELECT CASE ( iJOBZ )
+          CASE(1)
+              JOBZ   = 'V'
+              RESIDS = 'R'
+          CASE(2)
+              JOBZ   = 'V'
+              RESIDS = 'N'
+          CASE(3)
+              JOBZ   = 'F'
+              RESIDS = 'N'
+          CASE(4)
+              JOBZ   = 'N'
+              RESIDS = 'N'
+          END SELECT
+
+      DO iJOBREF = 1, 3
+
+          SELECT CASE ( iJOBREF )
+          CASE(1)
+              JOBREF = 'R'
+          CASE(2)
+              JOBREF = 'E'
+          CASE(3)
+              JOBREF = 'N'
+          END SELECT
+
+      DO iSCALE = 1, 4
+
+          SELECT CASE ( iSCALE )
+          CASE(1)
+              SCALE = 'S'
+          CASE(2)
+              SCALE = 'C'
+          CASE(3)
+              SCALE = 'Y'
+          CASE(4)
+              SCALE = 'N'
+          END SELECT
+
+      DO iNRNK = -1, -2, -1
+         NRNK   = iNRNK
+         NRNKsp = iNRNK
+
+      DO iWHTSVD = 1,  3
+         ! Check options WHTSVD = 1, 2, 3 for computing the POD basis
+         ! via the SVD.
+         WHTSVD   = iWHTSVD
+         WHTSVDsp = iWHTSVD
+
+      DO LWMINOPT = 1, 2
+         ! Workspace query for the minimal (1) and for the optimal
+         ! (2) workspace lengths determined by workspace query.
+
+      ! ZGEDMD is always tested and its results are also used for
+      ! comparisons with ZGEDMDQ.
+
+      ZX(1:M,1:N) = ZX0(1:M,1:N)
+      ZY(1:M,1:N) = ZY0(1:M,1:N)
+
+      CALL ZGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD,   &
+                   M,  N, ZX, LDX, ZY, LDY, NRNK, TOL,    &
+                   K, ZEIGS, ZZ, LDZ,  RES, ZAU, LDAU,    &
+                   ZW,  LDW, ZS, LDS,  ZDUMMY, -1,        &
+                   WDUMMY, -1, IDUMMY, -1, INFO )
+      IF ( (INFO .EQ. 2) .OR. ( INFO .EQ. 3 ) &
+                          .OR. ( INFO < 0 ) ) THEN
+           WRITE(*,*) 'Call to ZGEDMD workspace query failed. &
+                      &Check the calling sequence and the code.'
+           WRITE(*,*) 'The error code is ', INFO
+           WRITE(*,*) 'The input parameters were ',      &
+           SCALE, JOBZ, RESIDS, JOBREF, WHTSVD,          &
+           M, N, LDX, LDY, NRNK, TOL, LDZ, LDAU, LDW, LDS
+           STOP
+      END IF
+
+      LZWORK = INT(ZDUMMY(LWMINOPT))
+      LWORK  = INT(WDUMMY(1))
+      LIWORK = IDUMMY(1)
+
+      ALLOCATE(ZWORK(LZWORK))
+      ALLOCATE( WORK(LWORK))
+      ALLOCATE(IWORK(LIWORK))
+
+      CALL ZGEDMD( SCALE, JOBZ, RESIDS, JOBREF, WHTSVD,  &
+                   M,  N, ZX, LDX, ZY, LDY, NRNK, TOL,   &
+                   K, ZEIGS, ZZ, LDZ,  RES, ZAU, LDAU,   &
+                   ZW,  LDW,  ZS, LDS, ZWORK,  LZWORK,   &
+                   WORK, LWORK, IWORK, LIWORK, INFO )
+
+      IF ( INFO /= 0 ) THEN
+           WRITE(*,*) 'Call to ZGEDMD failed. &
+           &Check the calling sequence and the code.'
+           WRITE(*,*) 'The error code is ', INFO
+           WRITE(*,*) 'The input parameters were ',&
+           SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, &
+           M, N, LDX, LDY, NRNK, TOL
+           STOP
+      END IF
+
+      SINGVX(1:N) = WORK(1:N)
+
+      !...... ZGEDMD check point
+      IF ( LSAME(JOBZ,'V')  ) THEN
+          ! Check that Z = X*W, on return from ZGEDMD
+          ! This checks that the returned eigenvectors in Z are
+          ! the product of the SVD/POD basis returned in X
+          ! and the eigenvectors of the Rayleigh quotient
+          ! returned in W
+          CALL ZGEMM( 'N', 'N', M, K, K, ZONE, ZX, LDX, ZW, LDW, &
+                      ZZERO, ZZ1, LDZ )
+          TMP = ZERO
+          DO i = 1, K
+             CALL ZAXPY( M, -ZONE, ZZ(1,i), 1, ZZ1(1,i), 1)
+             TMP = MAX(TMP, DZNRM2( M, ZZ1(1,i), 1 ) )
+          END DO
+          TMP_ZXW = MAX(TMP_ZXW, TMP )
+          IF ( TMP_ZXW <= 10*M*EPS ) THEN
+              !WRITE(*,*) ' :) .... OK .........ZGEDMD PASSED.'
+          ELSE
+              NFAIL_Z_XV = NFAIL_Z_XV + 1
+              WRITE(*,*) ':( .................ZGEDMD FAILED!', &
+                  'Check the code for implementation errors.'
+              WRITE(*,*) 'The input parameters were ',&
+                 SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, &
+                 M, N, LDX, LDY, NRNK, TOL
+          END IF
+      END IF
+
+
+      !...... ZGEDMD check point
+      IF ( LSAME(JOBREF,'R') ) THEN
+           ! The matrix A*U is returned for computing refined Ritz vectors.
+           ! Check that A*U is computed correctly using the formula
+           ! A*U = Y * V * inv(SIGMA). This depends on the
+           ! accuracy in the computed singular values and vectors of X.
+           ! See the paper for an error analysis.
+           ! Note that the left singular vectors of the input matrix X
+           ! are returned in the array X.
+           CALL ZGEMM( 'N', 'N', M, K, M, ZONE, ZA, LDA, ZX, LDX, &
+                      ZZERO, ZZ1, LDZ )
+          TMP = ZERO
+          DO i = 1, K
+            CALL ZAXPY( M, -ZONE, ZAU(1,i), 1, ZZ1(1,i), 1)
+            TMP = MAX( TMP, DZNRM2( M, ZZ1(1,i),1 ) * &
+                     SINGVX(K)/(ANORM*SINGVX(1)) )
+          END DO
+          TMP_AU = MAX( TMP_AU, TMP )
+          IF ( TMP <= TOL2 ) THEN
+              !WRITE(*,*) ':) .... OK .........ZGEDMD PASSED.'
+          ELSE
+              NFAIL_AU = NFAIL_AU + 1
+              WRITE(*,*) ':( .................ZGEDMD FAILED!', &
+                  'Check the code for implementation errors.'
+              WRITE(*,*) 'The input parameters were ',&
+                 SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, &
+                 M, N, LDX, LDY, NRNK, TOL
+          END IF
+      ELSEIF ( LSAME(JOBREF,'E') ) THEN
+       ! The unscaled vectors of the Exact DMD are computed.
+       ! This option is included for the sake of completeness,
+       ! for users who prefer the Exact DMD vectors. The
+       ! returned vectors are in the real form, in the same way
+       ! as the Ritz vectors. Here we just save the vectors
+       ! and test them separately using a Matlab script.
+
+
+       CALL ZGEMM( 'N', 'N', M, K, M, ZONE, ZA, LDA, ZAU, LDAU, ZZERO, ZY1, LDY )
+
+               DO i=1, K
+                  ! residual column: A*ZAU(:,i) - ZEIGS(i)*ZAU(:,i)
+                CALL ZAXPY( M, -ZEIGS(i), ZAU(1,i), 1, ZY1(1,i), 1 )
+                RESEX(i) = DZNRM2( M, ZY1(1,i), 1) / DZNRM2(M,ZAU(1,i),1)
+               END DO
+      END IF
+      !...... ZGEDMD check point
+
+      IF ( LSAME(RESIDS, 'R') ) THEN
+          ! Compare the residuals returned by ZGEDMD with the
+          ! explicitly computed residuals using the matrix A.
+          ! Compute explicitly Y1 = A*Z
+          CALL ZGEMM( 'N', 'N', M, K, M, ZONE, ZA, LDA, ZZ, LDZ, ZZERO, ZY1, LDY )
+          ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms
+          ! of the invariant subspaces that correspond to complex conjugate
+          ! pairs of eigenvalues. (See the description of Z in ZGEDMD.)
+
+          DO i=1, K
+                ! residual column: A*ZZ(:,i) - ZEIGS(i)*ZZ(:,i)
+                CALL ZAXPY( M, -ZEIGS(i), ZZ(1,i), 1, ZY1(1,i), 1 )
+                RES1(i) = DZNRM2( M, ZY1(1,i), 1)
+          END DO
+          TMP = ZERO
+          DO i = 1, K
+          TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * &
+                    SINGVX(K)/(ANORM*SINGVX(1)) )
+          END DO
+          TMP_REZ = MAX( TMP_REZ, TMP )
+          IF ( TMP <= TOL2 ) THEN
+              !WRITE(*,*) ':) .... OK ..........ZGEDMD PASSED.'
+          ELSE
+              NFAIL_REZ = NFAIL_REZ + 1
+              WRITE(*,*) ':( ..................ZGEDMD FAILED!', &
+                  'Check the code for implementation errors.'
+              WRITE(*,*) 'The input parameters were ',&
+                 SCALE, JOBZ, RESIDS, JOBREF, WHTSVD, &
+                 M, N, LDX, LDY, NRNK, TOL
+          END IF
+
+
+         IF ( LSAME(JOBREF,'E') ) THEN
+            TMP = ZERO
+          DO i = 1, K
+          TMP = MAX( TMP, ABS(RES1(i) - RESEX(i))/(RES1(i)+RESEX(i)) )
+          END DO
+          TMP_EX = MAX(TMP_EX,TMP)
+         END IF
+
+      END IF
+
+      DEALLOCATE(ZWORK)
+      DEALLOCATE(WORK)
+      DEALLOCATE(IWORK)
+
+      IF ( TEST_QRDMD .AND. (K_TRAJ == 1) ) THEN
+
+      ZF(1:M,1:N+1) = ZF0(1:M,1:N+1)
+
+      CALL ZGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, JOBREF, &
+                    WHTSVD, M, N+1, ZF, LDF,  ZX, LDX,  ZY, LDY,  &
+                    NRNK,  TOL, K, ZEIGS, ZZ, LDZ, RES,  ZAU,  &
+                    LDAU, ZW, LDW, ZS, LDS, ZDUMMY, -1,   &
+                    WDUMMY,  -1, IDUMMY, -1, INFO )
+
+      LZWORK = INT(ZDUMMY(LWMINOPT))
+      ALLOCATE( ZWORK(LZWORK) )
+      LIWORK = IDUMMY(1)
+      ALLOCATE(IWORK(LIWORK))
+      LWORK = INT(WDUMMY(1))
+      ALLOCATE(WORK(LWORK))
+
+      CALL ZGEDMDQ( SCALE, JOBZ, RESIDS, WANTQ, WANTR, JOBREF, &
+                    WHTSVD, M, N+1, ZF, LDF,  ZX, LDX,  ZY, LDY,  &
+                    NRNK,  TOL, KQ, ZEIGS, ZZ, LDZ, RES,  ZAU,  &
+                    LDAU, ZW, LDW, ZS, LDS, ZWORK, LZWORK,   &
+                    WORK,  LWORK, IWORK, LIWORK, INFO )
+
+      IF ( INFO /= 0 ) THEN
+             WRITE(*,*) 'Call to ZGEDMDQ failed. &
+             &Check the calling sequence and the code.'
+             WRITE(*,*) 'The error code is ', INFO
+             WRITE(*,*) 'The input parameters were ',&
+             SCALE, JOBZ, RESIDS, WANTQ, WANTR, WHTSVD, &
+             M, N, LDX, LDY, NRNK, TOL
+             STOP
+      END IF
+      SINGVQX(1:N) = WORK(1:N)
+
+      !..... ZGEDMDQ check point
+
+          IF ( 1 == 0 ) THEN
+              ! Comparison of ZGEDMD and ZGEDMDQ singular values disabled
+          TMP = ZERO
+          DO i = 1, MIN(K, KQ)
+             TMP = MAX(TMP, ABS(SINGVX(i)-SINGVQX(i)) / &
+                                   SINGVX(1) )
+          END DO
+          SVDIFF = MAX( SVDIFF, TMP )
+          IF ( TMP > M*N*EPS ) THEN
+             WRITE(*,*) 'FAILED! Something was wrong with the run.'
+             NFAIL_SVDIFF = NFAIL_SVDIFF + 1
+             DO j =1, 3
+                 write(*,*) j, SINGVX(j), SINGVQX(j)
+                 read(*,*)
+             END DO
+
+          END IF
+          END IF
+
+      !..... ZGEDMDQ check point
+      IF ( LSAME(WANTQ,'Q') .AND. LSAME(WANTR,'R') ) THEN
+         ! Check that the QR factors are computed and returned
+         ! as requested. The residual ||F-Q*R||_F / ||F||_F
+         ! is compared to M*N*EPS.
+         ZF1(1:M,1:N+1) = ZF0(1:M,1:N+1)
+         CALL ZGEMM( 'N', 'N', M, N+1, MIN(M,N+1), -ZONE, ZF, &
+                     LDF, ZY, LDY, ZONE, ZF1, LDF )
+         TMP_FQR = ZLANGE( 'F', M, N+1, ZF1, LDF, WORK ) / &
+               ZLANGE( 'F', M, N+1, ZF0,  LDF, WORK )
+         IF ( TMP_FQR > TOL2 ) THEN
+              WRITE(*,*) 'FAILED! Something was wrong with the run.'
+             NFAIL_F_QR = NFAIL_F_QR + 1
+         ELSE
+             !WRITE(*,*) '........ PASSED.'
+         END IF
+      END IF
+
+      !..... ZGEDMDQ check point
+      IF ( LSAME(RESIDS, 'R') ) THEN
+          ! Compare the residuals returned by ZGEDMDQ with the
+          ! explicitly computed residuals using the matrix A.
+          ! Compute explicitly Y1 = A*Z
+          CALL ZGEMM( 'N', 'N', M, KQ, M, ZONE, ZA, LDA, ZZ, LDZ, ZZERO, ZY1, LDY )
+          ! ... and then A*Z(:,i) - LAMBDA(i)*Z(:,i), using the real forms
+          ! of the invariant subspaces that correspond to complex conjugate
+          ! pairs of eigenvalues. (See the description of Z in ZGEDMDQ.)
+
+          DO i=1, KQ
+                ! residual column: A*ZZ(:,i) - ZEIGS(i)*ZZ(:,i)
+                CALL ZAXPY( M, -ZEIGS(i), ZZ(1,i), 1, ZY1(1,i), 1 )
+                ! ZY1(1:M,i) = ZY1(1:M,i) - ZEIGS(i)*ZZ(1:M,i)
+                RES1(i) = DZNRM2( M, ZY1(1,i), 1)
+          END DO
+          TMP = ZERO
+          DO i = 1, KQ
+          TMP = MAX( TMP, ABS(RES(i) - RES1(i)) * &
+              SINGVQX(KQ)/(ANORM*SINGVQX(1)) )
+          END DO
+          TMP_REZQ = MAX( TMP_REZQ, TMP )
+          IF ( TMP <= TOL2 ) THEN
+              !WRITE(*,*) '.... OK ........ ZGEDMDQ PASSED.'
+          ELSE
+              NFAIL_REZQ = NFAIL_REZQ + 1
+              WRITE(*,*) '................ ZGEDMDQ FAILED!', &
+                  'Check the code for implementation errors.'
+              STOP
+          END IF
+
+      END IF
+
+      DEALLOCATE( ZWORK )
+      DEALLOCATE( WORK  )
+      DEALLOCATE( IWORK )
+
+      END IF ! ZGEDMDQ
+
+!.......................................................................................................
+
+      END DO   ! LWMINOPT
+      !write(*,*) 'LWMINOPT loop completed'
+      END DO   ! iWHTSVD
+      !write(*,*) 'WHTSVD loop completed'
+      END DO   ! iNRNK  -2:-1
+      !write(*,*) 'NRNK loop completed'
+      END DO   ! iSCALE  1:4
+      !write(*,*) 'SCALE loop completed'
+      END DO
+      !write(*,*) 'JOBREF loop completed'
+      END DO   ! iJOBZ
+      !write(*,*) 'JOBZ loop completed'
+
+      END DO ! MODE -6:6
+      !write(*,*) 'MODE loop completed'
+      END DO ! 1 or 2 trajectories
+      !write(*,*) 'trajectories  loop completed'
+
+      DEALLOCATE( ZA )
+      DEALLOCATE( ZAC )
+      DEALLOCATE( ZZ )
+      DEALLOCATE( ZF )
+      DEALLOCATE( ZF0 )
+      DEALLOCATE( ZF1 )
+      DEALLOCATE( ZX )
+      DEALLOCATE( ZX0 )
+      DEALLOCATE( ZY )
+      DEALLOCATE( ZY0 )
+      DEALLOCATE( ZY1 )
+      DEALLOCATE( ZAU )
+      DEALLOCATE( ZW )
+      DEALLOCATE( ZS )
+      DEALLOCATE( ZZ1 )
+      DEALLOCATE( RES )
+      DEALLOCATE( RES1 )
+      DEALLOCATE( RESEX )
+      DEALLOCATE( ZEIGS )
+      DEALLOCATE( SINGVX )
+      DEALLOCATE( SINGVQX )
+
+      END DO ! LLOOP
+
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*) ' Test summary for ZGEDMD :'
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*)
+      IF ( NFAIL_Z_XV == 0 ) THEN
+         WRITE(*,*) '>>>> Z - U*V test PASSED.'
+      ELSE
+         WRITE(*,*) 'Z - U*V test FAILED ', NFAIL_Z_XV, ' time(s)'
+         WRITE(*,*) 'Max error ||Z-U*V||_F was ', TMP_ZXW
+         NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_Z_XV
+      END IF
+      IF ( NFAIL_AU == 0 ) THEN
+        WRITE(*,*) '>>>> A*U test PASSED. '
+      ELSE
+        WRITE(*,*) 'A*U test FAILED ', NFAIL_AU, ' time(s)'
+        WRITE(*,*) 'Max A*U test adjusted error measure was ', TMP_AU
+        WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+        NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_AU
+      END IF
+
+      IF ( NFAIL_REZ == 0 ) THEN
+        WRITE(*,*) '>>>> Rezidual computation test PASSED.'
+      ELSE
+        WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZ, 'time(s)'
+        WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZ
+        WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+        NFAIL_TOTAL = NFAIL_TOTAL + NFAIL_REZ
+      END IF
+
+      IF ( NFAIL_TOTAL == 0 ) THEN
+        WRITE(*,*) '>>>> ZGEDMD :: ALL TESTS PASSED.'
+      ELSE
+        WRITE(*,*) NFAIL_TOTAL, 'FAILURES!'
+        WRITE(*,*) '>>>>>>>>>>>>>> ZGEDMD :: TESTS FAILED. CHECK THE IMPLEMENTATION.'
+      END IF
+
+      IF ( TEST_QRDMD ) THEN
+      WRITE(*,*)
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*) ' Test summary for ZGEDMDQ :'
+      WRITE(*,*) '>>>>>>>>>>>>>>>>>>>>>>>>>>'
+      WRITE(*,*)
+
+      IF ( NFAIL_SVDIFF == 0 ) THEN
+          WRITE(*,*) '>>>> ZGEDMD and ZGEDMDQ computed singular &
+              &values test PASSED.'
+      ELSE
+         WRITE(*,*) 'ZGEDMD and ZGEDMDQ discrepancies in &
+             &the singular values unacceptable ', &
+             NFAIL_SVDIFF, ' times. Test FAILED.'
+         WRITE(*,*) 'The maximal discrepancy in the singular values (relative to the norm) was ', SVDIFF
+         WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+         NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_SVDIFF
+      END IF
+
+      IF ( NFAIL_F_QR == 0 ) THEN
+          WRITE(*,*) '>>>> F - Q*R test PASSED.'
+      ELSE
+          WRITE(*,*) 'F - Q*R test FAILED ', NFAIL_F_QR, ' time(s)'
+          WRITE(*,*) 'The largest relative residual was ', TMP_FQR
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_F_QR
+      END IF
+
+      IF ( NFAIL_REZQ == 0 ) THEN
+          WRITE(*,*) '>>>> Rezidual computation test PASSED.'
+      ELSE
+          WRITE(*,*) 'Rezidual computation test FAILED ', NFAIL_REZQ, 'time(s)'
+          WRITE(*,*) 'Max residual computing test adjusted error measure was ', TMP_REZQ
+          WRITE(*,*) 'It should be up to O(M*N) times EPS, EPS = ', EPS
+          NFAILQ_TOTAL = NFAILQ_TOTAL + NFAIL_REZQ
+      END IF
+
+      IF ( NFAILQ_TOTAL == 0 ) THEN
+          WRITE(*,*) '>>>>>>> ZGEDMDQ :: ALL TESTS PASSED.'
+      ELSE
+         WRITE(*,*) NFAILQ_TOTAL, 'FAILURES!'
+         WRITE(*,*) '>>>>>>> ZGEDMDQ :: TESTS FAILED. CHECK THE IMPLEMENTATION.'
+      END IF
+
+      END IF
+
+      WRITE(*,*)
+      WRITE(*,*) 'Test completed.'
+      STOP
+      END
diff --git a/lapack-netlib/TESTING/EIG/zchkst.f b/lapack-netlib/TESTING/EIG/zchkst.f
index 60496dde1..b77d94438 100644
--- a/lapack-netlib/TESTING/EIG/zchkst.f
+++ b/lapack-netlib/TESTING/EIG/zchkst.f
@@ -364,7 +364,7 @@
 *> \verbatim
 *>          D1 is DOUBLE PRECISION array of
 *>                             dimension( max(NN) )
-*>          The eigenvalues of A, as computed by ZSTEQR simlutaneously
+*>          The eigenvalues of A, as computed by ZSTEQR simultaneously
 *>          with Z.  On exit, the eigenvalues in D1 correspond with the
 *>          matrix in A.
 *> \endverbatim
@@ -662,11 +662,10 @@
       EXTERNAL           ILAENV, DLAMCH, DLARND, DSXT1
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DCOPY, DLABAD, DLASUM, DSTEBZ, DSTECH, DSTERF,
-     $                   XERBLA, ZCOPY, ZHET21, ZHETRD, ZHPT21, ZHPTRD,
-     $                   ZLACPY, ZLASET, ZLATMR, ZLATMS, ZPTEQR, ZSTEDC,
-     $                   ZSTEMR, ZSTEIN, ZSTEQR, ZSTT21, ZSTT22, ZUNGTR,
-     $                   ZUPGTR
+      EXTERNAL           DCOPY, DLASUM, DSTEBZ, DSTECH, DSTERF, XERBLA,
+     $                   ZCOPY, ZHET21, ZHETRD, ZHPT21, ZHPTRD, ZLACPY,
+     $                   ZLASET, ZLATMR, ZLATMS, ZPTEQR, ZSTEDC, ZSTEMR,
+     $                   ZSTEIN, ZSTEQR, ZSTT21, ZSTT22, ZUNGTR, ZUPGTR
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, DBLE, DCONJG, INT, LOG, MAX, MIN, SQRT
@@ -733,7 +732,6 @@
 *
       UNFL = DLAMCH( 'Safe minimum' )
       OVFL = ONE / UNFL
-      CALL DLABAD( UNFL, OVFL )
       ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' )
       ULPINV = ONE / ULP
       LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) )
diff --git a/lapack-netlib/TESTING/EIG/zchkst2stg.f b/lapack-netlib/TESTING/EIG/zchkst2stg.f
index b1ef80816..3b333ef1c 100644
--- a/lapack-netlib/TESTING/EIG/zchkst2stg.f
+++ b/lapack-netlib/TESTING/EIG/zchkst2stg.f
@@ -385,7 +385,7 @@
 *> \verbatim
 *>          D1 is DOUBLE PRECISION array of
 *>                             dimension( max(NN) )
-*>          The eigenvalues of A, as computed by ZSTEQR simlutaneously
+*>          The eigenvalues of A, as computed by ZSTEQR simultaneously
 *>          with Z.  On exit, the eigenvalues in D1 correspond with the
 *>          matrix in A.
 *> \endverbatim
@@ -683,11 +683,11 @@
       EXTERNAL           ILAENV, DLAMCH, DLARND, DSXT1
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DCOPY, DLABAD, DLASUM, DSTEBZ, DSTECH, DSTERF,
-     $                   XERBLA, ZCOPY, ZHET21, ZHETRD, ZHPT21, ZHPTRD,
-     $                   ZLACPY, ZLASET, ZLATMR, ZLATMS, ZPTEQR, ZSTEDC,
-     $                   ZSTEMR, ZSTEIN, ZSTEQR, ZSTT21, ZSTT22, ZUNGTR,
-     $                   ZUPGTR, ZHETRD_2STAGE, DLASET
+      EXTERNAL           DCOPY, DLASUM, DSTEBZ, DSTECH, DSTERF, XERBLA,
+     $                   ZCOPY, ZHET21, ZHETRD, ZHPT21, ZHPTRD, ZLACPY,
+     $                   ZLASET, ZLATMR, ZLATMS, ZPTEQR, ZSTEDC, ZSTEMR,
+     $                   ZSTEIN, ZSTEQR, ZSTT21, ZSTT22, ZUNGTR, ZUPGTR,
+     $                   ZHETRD_2STAGE, DLASET
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, DBLE, DCONJG, INT, LOG, MAX, MIN, SQRT
@@ -754,7 +754,6 @@
 *
       UNFL = DLAMCH( 'Safe minimum' )
       OVFL = ONE / UNFL
-      CALL DLABAD( UNFL, OVFL )
       ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' )
       ULPINV = ONE / ULP
       LOG2UI = INT( LOG( ULPINV ) / LOG( TWO ) )
diff --git a/lapack-netlib/TESTING/EIG/zdrgsx.f b/lapack-netlib/TESTING/EIG/zdrgsx.f
index 096ed3c22..a486873fa 100644
--- a/lapack-netlib/TESTING/EIG/zdrgsx.f
+++ b/lapack-netlib/TESTING/EIG/zdrgsx.f
@@ -394,7 +394,7 @@
       EXTERNAL           ZLCTSX, ILAENV, DLAMCH, ZLANGE
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALASVM, DLABAD, XERBLA, ZGESVD, ZGET51, ZGGESX,
+      EXTERNAL           ALASVM, XERBLA, ZGESVD, ZGET51, ZGGESX,
      $                   ZLACPY, ZLAKF2, ZLASET, ZLATM5
 *     ..
 *     .. Scalars in Common ..
@@ -479,7 +479,6 @@
       ULPINV = ONE / ULP
       SMLNUM = DLAMCH( 'S' ) / ULP
       BIGNUM = ONE / SMLNUM
-      CALL DLABAD( SMLNUM, BIGNUM )
       THRSH2 = TEN*THRESH
       NTESTT = 0
       NERRS = 0
@@ -918,7 +917,7 @@
      $      / '  2:  A and B are upper triangular matrices, ',
      $      / '  3:  A and B are as type 2, but each second diagonal ',
      $      'block in A_11 and ', /
-     $      '      each third diaongal block in A_22 are 2x2 blocks,',
+     $      '      each third diagonal block in A_22 are 2x2 blocks,',
      $      / '  4:  A and B are block diagonal matrices, ',
      $      / '  5:  (A,B) has potentially close or common ',
      $      'eigenvalues.', / )
diff --git a/lapack-netlib/TESTING/EIG/zdrgvx.f b/lapack-netlib/TESTING/EIG/zdrgvx.f
index 72e0303fb..813151f13 100644
--- a/lapack-netlib/TESTING/EIG/zdrgvx.f
+++ b/lapack-netlib/TESTING/EIG/zdrgvx.f
@@ -55,7 +55,7 @@
 *>
 *>    | l**H * (beta A - alpha B) | / ( ulp max( |beta A|, |alpha B| ) )
 *>
-*>     where l**H is the conjugate tranpose of l.
+*>     where l**H is the conjugate transpose of l.
 *>
 *> (2) max over all right eigenvalue/-vector pairs (beta/alpha,r) of
 *>
diff --git a/lapack-netlib/TESTING/EIG/zdrvsg.f b/lapack-netlib/TESTING/EIG/zdrvsg.f
index 71f1d6371..1796805ea 100644
--- a/lapack-netlib/TESTING/EIG/zdrvsg.f
+++ b/lapack-netlib/TESTING/EIG/zdrvsg.f
@@ -236,7 +236,7 @@
 *>
 *>  B       COMPLEX*16 array, dimension (LDB , max(NN))
 *>          Used to hold the Hermitian positive definite matrix for
-*>          the generailzed problem.
+*>          the generalized problem.
 *>          On exit, B contains the last matrix actually
 *>          used.
 *>          Modified.
@@ -418,7 +418,7 @@
       EXTERNAL           LSAME, DLAMCH, DLARND
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DLABAD, DLAFTS, DLASUM, XERBLA, ZHBGV, ZHBGVD,
+      EXTERNAL           DLAFTS, DLASUM, XERBLA, ZHBGV, ZHBGVD,
      $                   ZHBGVX, ZHEGV, ZHEGVD, ZHEGVX, ZHPGV, ZHPGVD,
      $                   ZHPGVX, ZLACPY, ZLASET, ZLATMR, ZLATMS, ZSGT01
 *     ..
@@ -481,7 +481,6 @@
 *
       UNFL = DLAMCH( 'Safe minimum' )
       OVFL = DLAMCH( 'Overflow' )
-      CALL DLABAD( UNFL, OVFL )
       ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' )
       ULPINV = ONE / ULP
       RTUNFL = SQRT( UNFL )
diff --git a/lapack-netlib/TESTING/EIG/zdrvsg2stg.f b/lapack-netlib/TESTING/EIG/zdrvsg2stg.f
index 4bdf2849e..c5ef4ce70 100644
--- a/lapack-netlib/TESTING/EIG/zdrvsg2stg.f
+++ b/lapack-netlib/TESTING/EIG/zdrvsg2stg.f
@@ -242,7 +242,7 @@
 *>
 *>  B       COMPLEX*16 array, dimension (LDB , max(NN))
 *>          Used to hold the Hermitian positive definite matrix for
-*>          the generailzed problem.
+*>          the generalized problem.
 *>          On exit, B contains the last matrix actually
 *>          used.
 *>          Modified.
@@ -426,7 +426,7 @@
       EXTERNAL           LSAME, DLAMCH, DLARND
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DLABAD, DLAFTS, DLASUM, XERBLA, ZHBGV, ZHBGVD,
+      EXTERNAL           DLAFTS, DLASUM, XERBLA, ZHBGV, ZHBGVD,
      $                   ZHBGVX, ZHEGV, ZHEGVD, ZHEGVX, ZHPGV, ZHPGVD,
      $                   ZHPGVX, ZLACPY, ZLASET, ZLATMR, ZLATMS, ZSGT01,
      $                   ZHEGV_2STAGE
@@ -490,7 +490,6 @@
 *
       UNFL = DLAMCH( 'Safe minimum' )
       OVFL = DLAMCH( 'Overflow' )
-      CALL DLABAD( UNFL, OVFL )
       ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' )
       ULPINV = ONE / ULP
       RTUNFL = SQRT( UNFL )
diff --git a/lapack-netlib/TESTING/EIG/zdrvst.f b/lapack-netlib/TESTING/EIG/zdrvst.f
index 384e58de1..f838f0af2 100644
--- a/lapack-netlib/TESTING/EIG/zdrvst.f
+++ b/lapack-netlib/TESTING/EIG/zdrvst.f
@@ -204,7 +204,7 @@
 *>          Not modified.
 *>
 *>  D1      DOUBLE PRECISION array, dimension (max(NN))
-*>          The eigenvalues of A, as computed by ZSTEQR simlutaneously
+*>          The eigenvalues of A, as computed by ZSTEQR simultaneously
 *>          with Z.  On exit, the eigenvalues in D1 correspond with the
 *>          matrix in A.
 *>          Modified.
@@ -391,10 +391,10 @@
       EXTERNAL           DLAMCH, DLARND, DSXT1
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALASVM, DLABAD, DLAFTS, XERBLA, ZHBEV, ZHBEVD,
-     $                   ZHBEVX, ZHEEV, ZHEEVD, ZHEEVR, ZHEEVX, ZHET21,
-     $                   ZHET22, ZHPEV, ZHPEVD, ZHPEVX, ZLACPY, ZLASET,
-     $                   ZLATMR, ZLATMS
+      EXTERNAL           ALASVM, DLAFTS, XERBLA, ZHBEV, ZHBEVD, ZHBEVX,
+     $                   ZHEEV, ZHEEVD, ZHEEVR, ZHEEVX, ZHET21, ZHET22,
+     $                   ZHPEV, ZHPEVD, ZHPEVX, ZLACPY, ZLASET, ZLATMR,
+     $                   ZLATMS
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, DBLE, INT, LOG, MAX, MIN, SQRT
@@ -451,7 +451,6 @@
 *
       UNFL = DLAMCH( 'Safe minimum' )
       OVFL = DLAMCH( 'Overflow' )
-      CALL DLABAD( UNFL, OVFL )
       ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' )
       ULPINV = ONE / ULP
       RTUNFL = SQRT( UNFL )
diff --git a/lapack-netlib/TESTING/EIG/zdrvst2stg.f b/lapack-netlib/TESTING/EIG/zdrvst2stg.f
index 4a88e5218..4b989b460 100644
--- a/lapack-netlib/TESTING/EIG/zdrvst2stg.f
+++ b/lapack-netlib/TESTING/EIG/zdrvst2stg.f
@@ -204,7 +204,7 @@
 *>          Not modified.
 *>
 *>  D1      DOUBLE PRECISION array, dimension (max(NN))
-*>          The eigenvalues of A, as computed by ZSTEQR simlutaneously
+*>          The eigenvalues of A, as computed by ZSTEQR simultaneously
 *>          with Z.  On exit, the eigenvalues in D1 correspond with the
 *>          matrix in A.
 *>          Modified.
@@ -391,7 +391,7 @@
       EXTERNAL           DLAMCH, DLARND, DSXT1
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALASVM, DLABAD, DLAFTS, XERBLA, ZHBEV, ZHBEVD,
+      EXTERNAL           ALASVM, DLAFTS, XERBLA, ZHBEV, ZHBEVD,
      $                   ZHBEVX, ZHEEV, ZHEEVD, ZHEEVR, ZHEEVX, ZHET21,
      $                   ZHET22, ZHPEV, ZHPEVD, ZHPEVX, ZLACPY, ZLASET,
      $                   ZHEEVD_2STAGE, ZHEEVR_2STAGE, ZHEEVX_2STAGE,
@@ -453,7 +453,6 @@
 *
       UNFL = DLAMCH( 'Safe minimum' )
       OVFL = DLAMCH( 'Overflow' )
-      CALL DLABAD( UNFL, OVFL )
       ULP = DLAMCH( 'Epsilon' )*DLAMCH( 'Base' )
       ULPINV = ONE / ULP
       RTUNFL = SQRT( UNFL )
diff --git a/lapack-netlib/TESTING/EIG/zerrhs.f b/lapack-netlib/TESTING/EIG/zerrhs.f
index 582338947..3a70a556e 100644
--- a/lapack-netlib/TESTING/EIG/zerrhs.f
+++ b/lapack-netlib/TESTING/EIG/zerrhs.f
@@ -21,8 +21,8 @@
 *>
 *> \verbatim
 *>
-*> ZERRHS tests the error exits for ZGEBAK, CGEBAL, CGEHRD, ZUNGHR,
-*> ZUNMHR, ZHSEQR, CHSEIN, and ZTREVC.
+*> ZERRHS tests the error exits for ZGEBAK, ZGEBAL, ZGEHRD, ZGEHD2,
+*> ZUNGHR, ZUNMHR, ZHSEQR, ZHSEIN, ZTREVC, and ZTREVC3.
 *> \endverbatim
 *
 *  Arguments:
@@ -86,7 +86,7 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHKXER, ZGEBAK, ZGEBAL, ZGEHRD, ZHSEIN, ZHSEQR,
-     $                   ZTREVC, ZUNGHR, ZUNMHR
+     $                   ZUNGHR, ZUNMHR, ZTREVC, ZTREVC3
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          DBLE
@@ -193,6 +193,29 @@
          CALL CHKXER( 'ZGEHRD', INFOT, NOUT, LERR, OK )
          NT = NT + 7
 *
+*        ZGEHD2
+*
+         SRNAMT = 'ZGEHD2'
+         INFOT = 1
+         CALL ZGEHD2( -1, 1, 1, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'ZGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL ZGEHD2( 0, 0, 0, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'ZGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL ZGEHD2( 0, 2, 0, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'ZGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 3
+         CALL ZGEHD2( 1, 1, 0, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'ZGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 3
+         CALL ZGEHD2( 0, 1, 1, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'ZGEHD2', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL ZGEHD2( 2, 1, 1, A, 1, TAU, W, INFO )
+         CALL CHKXER( 'ZGEHD2', INFOT, NOUT, LERR, OK )
+         NT = NT + 6
+*
 *        ZUNGHR
 *
          SRNAMT = 'ZUNGHR'
@@ -389,6 +412,47 @@
      $                INFO )
          CALL CHKXER( 'ZTREVC', INFOT, NOUT, LERR, OK )
          NT = NT + 7
+*
+*        ZTREVC3
+*
+         SRNAMT = 'ZTREVC3'
+         INFOT = 1
+         CALL ZTREVC3( '/', 'A', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W,
+     $                 LW, RW, 1, INFO )
+         CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL ZTREVC3( 'L', '/', SEL, 0, A, 1, VL, 1, VR, 1, 0, M, W,
+     $                 LW, RW, 1, INFO )
+         CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 4
+         CALL ZTREVC3( 'L', 'A', SEL, -1, A, 1, VL, 1, VR, 1, 0, M, W,
+     $                 LW, RW, 1, INFO )
+         CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 6
+         CALL ZTREVC3( 'L', 'A', SEL, 2, A, 1, VL, 2, VR, 1, 4, M, W,
+     $                 LW, RW, 2, INFO )
+         CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 8
+         CALL ZTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W,
+     $                 LW, RW, 2, INFO )
+         CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 10
+         CALL ZTREVC3( 'R', 'A', SEL, 2, A, 2, VL, 1, VR, 1, 4, M, W,
+     $                 LW, RW, 2, INFO )
+         CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 11
+         CALL ZTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 1, M, W,
+     $                 LW, RW, 2, INFO )
+         CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 14
+         CALL ZTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 2, M, W,
+     $                 2, RW, 2, INFO )
+         CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK )
+         INFOT = 16
+         CALL ZTREVC3( 'L', 'A', SEL, 2, A, 2, VL, 2, VR, 1, 2, M, W,
+     $                 LW, RW, 1, INFO )
+         CALL CHKXER( 'ZTREVC3', INFOT, NOUT, LERR, OK )
+         NT = NT + 9
       END IF
 *
 *     Print a summary line.
diff --git a/lapack-netlib/TESTING/EIG/zerrst.f b/lapack-netlib/TESTING/EIG/zerrst.f
index 5b0e6f820..31881c4de 100644
--- a/lapack-netlib/TESTING/EIG/zerrst.f
+++ b/lapack-netlib/TESTING/EIG/zerrst.f
@@ -23,7 +23,7 @@
 *>
 *> \verbatim
 *>
-*> ZERRST tests the error exits for ZHETRD, ZUNGTR, CUNMTR, ZHPTRD,
+*> ZERRST tests the error exits for ZHETRD, ZHETD2, ZUNGTR, CUNMTR, ZHPTRD,
 *> ZUNGTR, ZUPMTR, ZSTEQR, CSTEIN, ZPTEQR, ZHBTRD,
 *> ZHEEV, CHEEVX, CHEEVD, ZHBEV, CHBEVX, CHBEVD,
 *> ZHPEV, CHPEVX, CHPEVD, and ZSTEDC.
@@ -95,7 +95,7 @@
       EXTERNAL           CHKXER, ZHBEV, ZHBEVD, ZHBEVX, ZHBTRD, ZHEEV,
      $                   ZHEEVD, ZHEEVR, ZHEEVX, ZHETRD, ZHPEV, ZHPEVD,
      $                   ZHPEVX, ZHPTRD, ZPTEQR, ZSTEDC, ZSTEIN, ZSTEQR,
-     $                   ZUNGTR, ZUNMTR, ZUPGTR, ZUPMTR,
+     $                   ZUNGTR, ZUNMTR, ZUPGTR, ZUPMTR, ZHETD2,
      $                   ZHEEVD_2STAGE, ZHEEVR_2STAGE, ZHEEVX_2STAGE,
      $                   ZHEEV_2STAGE, ZHBEV_2STAGE, ZHBEVD_2STAGE,
      $                   ZHBEVX_2STAGE, ZHETRD_2STAGE
@@ -156,6 +156,20 @@
          CALL CHKXER( 'ZHETRD', INFOT, NOUT, LERR, OK )
          NT = NT + 4
 *
+*        ZHETD2
+*
+         SRNAMT = 'ZHETD2'
+         INFOT = 1
+         CALL ZHETD2( '/', 0, A, 1, D, E, TAU, INFO )
+         CALL CHKXER( 'ZHETD2', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL ZHETD2( 'U', -1, A, 1, D, E, TAU, INFO )
+         CALL CHKXER( 'ZHETD2', INFOT, NOUT, LERR, OK )
+         INFOT = 4
+         CALL ZHETD2( 'U', 2, A, 1, D, E, TAU, INFO )
+         CALL CHKXER( 'ZHETD2', INFOT, NOUT, LERR, OK )
+         NT = NT + 3
+*
 *        ZHETRD_2STAGE
 *
          SRNAMT = 'ZHETRD_2STAGE'
@@ -734,17 +748,17 @@
          CALL CHKXER( 'ZHEEVR', INFOT, NOUT, LERR, OK )
          INFOT = 18
          CALL ZHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0,
-     $                M, R, Z, 1, IW, Q, 2*N-1, RW, 24*N, IW( 2*N+1 ),
+     $                M, R, Z, 1, IW, Q, 0, RW, 24*N, IW( 2*N+1 ),
      $                10*N, INFO )
          CALL CHKXER( 'ZHEEVR', INFOT, NOUT, LERR, OK )
          INFOT = 20
          CALL ZHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0,
-     $                M, R, Z, 1, IW, Q, 2*N, RW, 24*N-1, IW( 2*N-1 ),
+     $                M, R, Z, 1, IW, Q, 2*N, RW, 0, IW( 2*N-1 ),
      $                10*N, INFO )
          CALL CHKXER( 'ZHEEVR', INFOT, NOUT, LERR, OK )
          INFOT = 22
          CALL ZHEEVR( 'V', 'I', 'U', 1, A, 1, 0.0D0, 0.0D0, 1, 1, 0.0D0,
-     $                M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW, 10*N-1,
+     $                M, R, Z, 1, IW, Q, 2*N, RW, 24*N, IW, 0,
      $                INFO )
          CALL CHKXER( 'ZHEEVR', INFOT, NOUT, LERR, OK )
          NT = NT + 12
@@ -816,19 +830,19 @@
          INFOT = 18
          CALL ZHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1,
      $                0.0D0, 0.0D0, 1, 1, 0.0D0,
-     $                M, R, Z, 1, IW, Q, 2*N-1, RW, 24*N, IW( 2*N+1 ),
+     $                M, R, Z, 1, IW, Q, 0, RW, 24*N, IW( 2*N+1 ),
      $                10*N, INFO )
          CALL CHKXER( 'ZHEEVR_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 20
          CALL ZHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1,
      $                0.0D0, 0.0D0, 1, 1, 0.0D0,
-     $                M, R, Z, 1, IW, Q, 26*N, RW, 24*N-1, IW( 2*N-1 ),
+     $                M, R, Z, 1, IW, Q, 26*N, RW, 0, IW( 2*N-1 ),
      $                10*N, INFO )
          CALL CHKXER( 'ZHEEVR_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 22
          CALL ZHEEVR_2STAGE( 'N', 'I', 'U', 1, A, 1,
      $                0.0D0, 0.0D0, 1, 1, 0.0D0,
-     $                M, R, Z, 1, IW, Q, 26*N, RW, 24*N, IW, 10*N-1,
+     $                M, R, Z, 1, IW, Q, 26*N, RW, 24*N, IW, 0,
      $                INFO )
          CALL CHKXER( 'ZHEEVR_2STAGE', INFOT, NOUT, LERR, OK )
          NT = NT + 13
diff --git a/lapack-netlib/TESTING/EIG/zsyl01.f b/lapack-netlib/TESTING/EIG/zsyl01.f
index 329f39dc4..5d26d494c 100644
--- a/lapack-netlib/TESTING/EIG/zsyl01.f
+++ b/lapack-netlib/TESTING/EIG/zsyl01.f
@@ -120,14 +120,16 @@
       COMPLEX*16         RMUL
 *     ..
 *     .. Local Arrays ..
-      COMPLEX*16         A( MAXM, MAXM ), B( MAXN, MAXN ),
-     $                   C( MAXM, MAXN ), CC( MAXM, MAXN ),
-     $                   X( MAXM, MAXN ),
-     $                   DUML( MAXM ), DUMR( MAXN ),
+      COMPLEX*16         DUML( MAXM ), DUMR( MAXN ),
      $                   D( MAX( MAXM, MAXN ) )
-      DOUBLE PRECISION   SWORK( LDSWORK, 103 ), DUM( MAXN ), VM( 2 )
+      DOUBLE PRECISION   DUM( MAXN ), VM( 2 )
       INTEGER            ISEED( 4 ), IWORK( MAXM + MAXN + 2 )
 *     ..
+*     .. Allocatable Arrays ..
+      INTEGER            AllocateStatus
+      COMPLEX*16,       DIMENSION(:,:), ALLOCATABLE :: A, B, C, CC, X
+      DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: SWORK
+*     ..
 *     .. External Functions ..
       LOGICAL            DISNAN
       DOUBLE PRECISION   DLAMCH, ZLANGE
@@ -139,6 +141,20 @@
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, DBLE, MAX, SQRT
 *     ..
+*     .. Allocate memory dynamically ..
+      ALLOCATE ( A( MAXM, MAXM ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( B( MAXN, MAXN ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( C( MAXM, MAXN ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( CC( MAXM, MAXN ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( X( MAXM, MAXN ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( SWORK( LDSWORK, 103 ), STAT = AllocateStatus )
+      IF( AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+*     ..
 *     .. Executable Statements ..
 *
 *     Get machine parameters
@@ -286,6 +302,13 @@
             END DO
          END DO
       END DO
+*
+      DEALLOCATE (A, STAT = AllocateStatus)
+      DEALLOCATE (B, STAT = AllocateStatus)
+      DEALLOCATE (C, STAT = AllocateStatus)
+      DEALLOCATE (CC, STAT = AllocateStatus)
+      DEALLOCATE (X, STAT = AllocateStatus)
+      DEALLOCATE (SWORK, STAT = AllocateStatus)
 *
       RETURN
 *
diff --git a/lapack-netlib/TESTING/LIN/CMakeLists.txt b/lapack-netlib/TESTING/LIN/CMakeLists.txt
index fc55b8a96..e406570e1 100644
--- a/lapack-netlib/TESTING/LIN/CMakeLists.txt
+++ b/lapack-netlib/TESTING/LIN/CMakeLists.txt
@@ -9,7 +9,7 @@ set(DZLNTST dlaord.f)
 set(SLINTST schkaa.F
    schkeq.f schkgb.f schkge.f schkgt.f
    schklq.f schkpb.f schkpo.f schkps.f schkpp.f
-   schkpt.f schkq3.f schkql.f schkqr.f schkrq.f
+   schkpt.f schkq3.f schkqp3rk.f schkql.f schkqr.f schkrq.f
    schksp.f schksy.f schksy_rook.f schksy_rk.f 
    schksy_aa.f schksy_aa_2stage.f 
    schktb.f schktp.f schktr.f
@@ -20,7 +20,7 @@ set(SLINTST schkaa.F
    serrgt.f serrlq.f serrls.f
    serrps.f serrql.f serrqp.f serrqr.f
    serrrq.f serrtr.f serrtz.f
-   sgbt01.f sgbt02.f sgbt05.f sgelqs.f sgeqls.f sgeqrs.f
+   sgbt01.f sgbt02.f sgbt05.f sgeqls.f
    sgerqs.f sget01.f sget02.f
    sget03.f sget04.f sget06.f sget07.f sgtt01.f sgtt02.f
    sgtt05.f slaptm.f slarhs.f slatb4.f slatb5.f slattb.f slattp.f
@@ -56,7 +56,7 @@ set(CLINTST cchkaa.F
    cchkhe.f cchkhe_rook.f cchkhe_rk.f 
    cchkhe_aa.f cchkhe_aa_2stage.f
    cchkhp.f cchklq.f cchkpb.f
-   cchkpo.f cchkps.f cchkpp.f cchkpt.f cchkq3.f cchkql.f
+   cchkpo.f cchkps.f cchkpp.f cchkpt.f cchkq3.f cchkqp3rk.f cchkql.f
    cchkqr.f cchkrq.f cchksp.f cchksy.f cchksy_rook.f cchksy_rk.f
    cchksy_aa.f cchksy_aa_2stage.f
    cchktb.f
@@ -70,7 +70,7 @@ set(CLINTST cchkaa.F
    cerrgt.f cerrlq.f
    cerrls.f cerrps.f cerrql.f cerrqp.f
    cerrqr.f cerrrq.f cerrtr.f cerrtz.f
-   cgbt01.f cgbt02.f cgbt05.f cgelqs.f cgeqls.f cgeqrs.f
+   cgbt01.f cgbt02.f cgbt05.f cgeqls.f
    cgerqs.f cget01.f cget02.f
    cget03.f cget04.f cget07.f cgtt01.f cgtt02.f
    cgtt05.f chet01.f chet01_rook.f chet01_3.f
@@ -110,7 +110,7 @@ endif()
 set(DLINTST dchkaa.F
    dchkeq.f dchkgb.f dchkge.f dchkgt.f
    dchklq.f dchkpb.f dchkpo.f dchkps.f dchkpp.f
-   dchkpt.f dchkq3.f dchkql.f dchkqr.f dchkrq.f
+   dchkpt.f dchkq3.f dchkqp3rk.f dchkql.f dchkqr.f dchkrq.f
    dchksp.f dchksy.f dchksy_rook.f dchksy_rk.f 
    dchksy_aa.f dchksy_aa_2stage.f
    dchktb.f dchktp.f dchktr.f
@@ -121,7 +121,7 @@ set(DLINTST dchkaa.F
    derrgt.f derrlq.f derrls.f
    derrps.f derrql.f derrqp.f derrqr.f
    derrrq.f derrtr.f derrtz.f
-   dgbt01.f dgbt02.f dgbt05.f dgelqs.f dgeqls.f dgeqrs.f
+   dgbt01.f dgbt02.f dgbt05.f dgeqls.f
    dgerqs.f dget01.f dget02.f
    dget03.f dget04.f dget06.f dget07.f dgtt01.f dgtt02.f
    dgtt05.f dlaptm.f dlarhs.f dlatb4.f dlatb5.f dlattb.f dlattp.f
@@ -158,7 +158,7 @@ set(ZLINTST zchkaa.F
    zchkhe.f zchkhe_rook.f zchkhe_rk.f 
    zchkhe_aa.f zchkhe_aa_2stage.f
    zchkhp.f zchklq.f zchkpb.f
-   zchkpo.f zchkps.f zchkpp.f zchkpt.f zchkq3.f zchkql.f
+   zchkpo.f zchkps.f zchkpp.f zchkpt.f zchkq3.f zchkqp3rk.f zchkql.f
    zchkqr.f zchkrq.f zchksp.f zchksy.f zchksy_rook.f zchksy_rk.f
    zchksy_aa.f  zchksy_aa_2stage.f
    zchktb.f
@@ -172,7 +172,7 @@ set(ZLINTST zchkaa.F
    zerrgt.f zerrlq.f
    zerrls.f zerrps.f zerrql.f zerrqp.f
    zerrqr.f zerrrq.f zerrtr.f zerrtz.f
-   zgbt01.f zgbt02.f zgbt05.f zgelqs.f zgeqls.f zgeqrs.f
+   zgbt01.f zgbt02.f zgbt05.f zgeqls.f
    zgerqs.f zget01.f zget02.f
    zget03.f zget04.f zget07.f zgtt01.f zgtt02.f
    zgtt05.f zhet01.f zhet01_rook.f zhet01_3.f
@@ -239,7 +239,11 @@ set(ZLINTSTRFP zchkrfp.f zdrvrfp.f zdrvrf1.f zdrvrf2.f zdrvrf3.f zdrvrf4.f zerrr
 
 macro(add_lin_executable name)
   add_executable(${name} ${ARGN})
-  target_link_libraries(${name} openblas${SUFFIX64_UNDERSCORE})
+  target_link_libraries(${name} ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE})
+  if (USE_OPENMP AND ("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") AND ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang"))
+    string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
+    target_link_libraries(${name} omp pthread)
+  endif()
 #${TMGLIB} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES})
 endmacro()
 
diff --git a/lapack-netlib/TESTING/LIN/Makefile b/lapack-netlib/TESTING/LIN/Makefile
index 54b26455e..714efa52a 100644
--- a/lapack-netlib/TESTING/LIN/Makefile
+++ b/lapack-netlib/TESTING/LIN/Makefile
@@ -45,7 +45,7 @@ DZLNTST = dlaord.o
 SLINTST = schkaa.o \
    schkeq.o schkgb.o schkge.o schkgt.o \
    schklq.o schkpb.o schkpo.o schkps.o schkpp.o \
-   schkpt.o schkq3.o schkql.o schkqr.o schkrq.o \
+   schkpt.o schkq3.o schkqp3rk.o schkql.o schkqr.o schkrq.o \
    schksp.o schksy.o schksy_rook.o schksy_rk.o  \
    schksy_aa.o schksy_aa_2stage.o schktb.o schktp.o schktr.o \
    schktz.o \
@@ -55,7 +55,7 @@ SLINTST = schkaa.o \
    serrgt.o serrlq.o serrls.o \
    serrps.o serrql.o serrqp.o serrqr.o \
    serrrq.o serrtr.o serrtz.o \
-   sgbt01.o sgbt02.o sgbt05.o sgelqs.o sgeqls.o sgeqrs.o \
+   sgbt01.o sgbt02.o sgbt05.o sgeqls.o \
    sgerqs.o sget01.o sget02.o \
    sget03.o sget04.o sget06.o sget07.o sgtt01.o sgtt02.o \
    sgtt05.o slaptm.o slarhs.o slatb4.o slatb5.o slattb.o slattp.o \
@@ -89,7 +89,7 @@ CLINTST = cchkaa.o \
    cchkeq.o cchkgb.o cchkge.o cchkgt.o \
    cchkhe.o cchkhe_rook.o cchkhe_rk.o  \
    cchkhe_aa.o cchkhe_aa_2stage.o cchkhp.o cchklq.o cchkpb.o \
-   cchkpo.o cchkps.o cchkpp.o cchkpt.o cchkq3.o cchkql.o \
+   cchkpo.o cchkps.o cchkpp.o cchkpt.o cchkq3.o cchkqp3rk.o cchkql.o \
    cchkqr.o cchkrq.o cchksp.o cchksy.o cchksy_rook.o cchksy_rk.o \
    cchksy_aa.o cchksy_aa_2stage.o cchktb.o \
    cchktp.o cchktr.o cchktz.o \
@@ -100,7 +100,7 @@ CLINTST = cchkaa.o \
    cerrgt.o cerrlq.o \
    cerrls.o cerrps.o cerrql.o cerrqp.o \
    cerrqr.o cerrrq.o cerrtr.o cerrtz.o \
-   cgbt01.o cgbt02.o cgbt05.o cgelqs.o cgeqls.o cgeqrs.o \
+   cgbt01.o cgbt02.o cgbt05.o cgeqls.o \
    cgerqs.o cget01.o cget02.o \
    cget03.o cget04.o cget07.o cgtt01.o cgtt02.o \
    cgtt05.o chet01.o chet01_rook.o chet01_3.o chet01_aa.o \
@@ -137,7 +137,7 @@ endif
 DLINTST = dchkaa.o \
    dchkeq.o dchkgb.o dchkge.o dchkgt.o \
    dchklq.o dchkpb.o dchkpo.o dchkps.o dchkpp.o \
-   dchkpt.o dchkq3.o dchkql.o dchkqr.o dchkrq.o \
+   dchkpt.o dchkq3.o dchkqp3rk.o dchkql.o dchkqr.o dchkrq.o \
    dchksp.o dchksy.o dchksy_rook.o dchksy_rk.o  \
    dchksy_aa.o dchksy_aa_2stage.o dchktb.o dchktp.o dchktr.o \
    dchktz.o \
@@ -147,7 +147,7 @@ DLINTST = dchkaa.o \
    derrgt.o derrlq.o derrls.o \
    derrps.o derrql.o derrqp.o derrqr.o \
    derrrq.o derrtr.o derrtz.o \
-   dgbt01.o dgbt02.o dgbt05.o dgelqs.o dgeqls.o dgeqrs.o \
+   dgbt01.o dgbt02.o dgbt05.o dgeqls.o \
    dgerqs.o dget01.o dget02.o \
    dget03.o dget04.o dget06.o dget07.o dgtt01.o dgtt02.o \
    dgtt05.o dlaptm.o dlarhs.o dlatb4.o dlatb5.o dlattb.o dlattp.o \
@@ -182,7 +182,7 @@ ZLINTST = zchkaa.o \
    zchkeq.o zchkgb.o zchkge.o zchkgt.o \
    zchkhe.o zchkhe_rook.o zchkhe_rk.o zchkhe_aa.o zchkhe_aa_2stage.o \
    zchkhp.o zchklq.o zchkpb.o \
-   zchkpo.o zchkps.o zchkpp.o zchkpt.o zchkq3.o zchkql.o \
+   zchkpo.o zchkps.o zchkpp.o zchkpt.o zchkq3.o zchkqp3rk.o zchkql.o \
    zchkqr.o zchkrq.o zchksp.o zchksy.o zchksy_rook.o zchksy_rk.o \
    zchksy_aa.o zchksy_aa_2stage.o zchktb.o \
    zchktp.o zchktr.o zchktz.o \
@@ -192,7 +192,7 @@ ZLINTST = zchkaa.o \
    zerrgt.o zerrlq.o \
    zerrls.o zerrps.o zerrql.o zerrqp.o \
    zerrqr.o zerrrq.o zerrtr.o zerrtz.o \
-   zgbt01.o zgbt02.o zgbt05.o zgelqs.o zgeqls.o zgeqrs.o \
+   zgbt01.o zgbt02.o zgbt05.o zgeqls.o \
    zgerqs.o zget01.o zget02.o \
    zget03.o zget04.o zget07.o zgtt01.o zgtt02.o \
    zgtt05.o zhet01.o zhet01_rook.o zhet01_3.o zhet01_aa.o \
diff --git a/lapack-netlib/TESTING/LIN/alaerh.f b/lapack-netlib/TESTING/LIN/alaerh.f
index 1845888a6..6c8a47f1e 100644
--- a/lapack-netlib/TESTING/LIN/alaerh.f
+++ b/lapack-netlib/TESTING/LIN/alaerh.f
@@ -797,6 +797,18 @@
             WRITE( NOUT, FMT = 9978 )
      $     SUBNAM(1:LEN_TRIM( SUBNAM )), INFO, M, N, IMAT
          END IF
+*
+      ELSE IF( LSAMEN( 2, P2, 'QK' ) ) THEN
+*
+*        xQK:  truncated QR factorization with pivoting
+*
+         IF( LSAMEN( 7, SUBNAM( 2: 8 ), 'GEQP3RK' )  ) THEN
+            WRITE( NOUT, FMT = 9930 )
+     $     SUBNAM(1:LEN_TRIM( SUBNAM )), INFO, M, N, KL, N5, IMAT
+         ELSE IF( LSAMEN( 5, SUBNAM( 2: 6 ), 'LATMS' ) ) THEN
+            WRITE( NOUT, FMT = 9978 )
+     $     SUBNAM(1:LEN_TRIM( SUBNAM )), INFO, M, N, IMAT
+         END IF
 *
       ELSE IF( LSAMEN( 2, P2, 'LQ' ) ) THEN
 *
@@ -1147,6 +1159,11 @@
 *     What we do next
 *
  9949 FORMAT( ' ==> Doing only the condition estimate for this case' )
+*
+*     SUBNAM, INFO, M, N, NX, NB, IMAT
+*
+ 9930 FORMAT( ' *** Error code from ', A, '=', I5, / ' ==> M =', I5,
+     $      ', N =', I5, ', NX =', I5, ', NB =', I4, ', type ', I2 )
 *
       RETURN
 *
diff --git a/lapack-netlib/TESTING/LIN/alahd.f b/lapack-netlib/TESTING/LIN/alahd.f
index f0423a23b..8f966c584 100644
--- a/lapack-netlib/TESTING/LIN/alahd.f
+++ b/lapack-netlib/TESTING/LIN/alahd.f
@@ -584,13 +584,27 @@
 *
 *        QR decomposition with column pivoting
 *
-         WRITE( IOUNIT, FMT = 9986 )PATH
+         WRITE( IOUNIT, FMT = 8006 )PATH
          WRITE( IOUNIT, FMT = 9969 )
          WRITE( IOUNIT, FMT = '( '' Test ratios:'' )' )
          WRITE( IOUNIT, FMT = 9940 )1
          WRITE( IOUNIT, FMT = 9939 )2
          WRITE( IOUNIT, FMT = 9938 )3
          WRITE( IOUNIT, FMT = '( '' Messages:'' )' )
+*
+      ELSE IF( LSAMEN( 2, P2, 'QK' ) ) THEN
+*
+*        truncated QR decomposition with column pivoting
+*
+         WRITE( IOUNIT, FMT = 8006 )PATH
+         WRITE( IOUNIT, FMT = 9871 )
+         WRITE( IOUNIT, FMT = '( '' Test ratios:'' )' )
+         WRITE( IOUNIT, FMT = 8060 )1
+         WRITE( IOUNIT, FMT = 8061 )2
+         WRITE( IOUNIT, FMT = 8062 )3
+         WRITE( IOUNIT, FMT = 8063 )4
+         WRITE( IOUNIT, FMT = 8064 )5
+         WRITE( IOUNIT, FMT = '( '' Messages:'' )' )
 *
       ELSE IF( LSAMEN( 2, P2, 'TZ' ) ) THEN
 *
@@ -777,8 +791,10 @@
      $       'triangular-pentagonal matrices' )
  8004 FORMAT( / 1X, A3, ':  TS factorization for ',
      $       'tall-skinny or short-wide matrices' )
- 8005 FORMAT( / 1X, A3, ':  Householder recostruction from TSQR',
+ 8005 FORMAT( / 1X, A3, ':  Householder reconstruction from TSQR',
      $       ' factorization output ', /,' for tall-skinny matrices.' )
+ 8006 FORMAT( / 1X, A3, ':  truncated QR factorization',
+     $        ' with column pivoting' )
 *
 *     GE matrix types
 *
@@ -922,6 +938,36 @@
      $      / 4X, '3. Geometric distribution', 10X,
      $      '6. Every second column fixed' )
 *
+*     QK matrix types
+*
+ 9871 FORMAT( 4X, ' 1. Zero matrix', /
+     $        4X, ' 2. Random, Diagonal, CNDNUM = 2', /
+     $        4X, ' 3. Random, Upper triangular, CNDNUM = 2', /
+     $        4X, ' 4. Random, Lower triangular, CNDNUM = 2', /
+     $        4X, ' 5. Random, First column is zero, CNDNUM = 2', /
+     $        4X, ' 6. Random, Last MINMN column is zero, CNDNUM = 2', /
+     $        4X, ' 7. Random, Last N column is zero, CNDNUM = 2', /
+     $        4X, ' 8. Random, Middle column in MINMN is zero,',
+     $               ' CNDNUM = 2', /
+     $        4X, ' 9. Random, First half of MINMN columns are zero,',
+     $                 ' CNDNUM = 2', /
+     $        4X, '10. Random, Last columns are zero starting from',
+     $                 ' MINMN/2+1, CNDNUM = 2', /
+     $        4X, '11. Random, Half MINMN columns in the middle are',
+     $                 ' zero starting from MINMN/2-(MINMN/2)/2+1,'
+     $                 ' CNDNUM = 2', /
+     $        4X, '12. Random, Odd columns are ZERO, CNDNUM = 2', /
+     $        4X, '13. Random, Even columns are ZERO, CNDNUM = 2', /
+     $        4X, '14. Random, CNDNUM = 2', /
+     $        4X, '15. Random, CNDNUM = sqrt(0.1/EPS)', /
+     $        4X, '16. Random, CNDNUM = 0.1/EPS', /
+     $        4X, '17. Random, CNDNUM = 0.1/EPS,',
+     $                 ' one small singular value S(N)=1/CNDNUM', /
+     $        4X, '18. Random, CNDNUM = 2, scaled near underflow,',
+     $                 ' NORM = SMALL = SAFMIN', /
+     $        4X, '19. Random, CNDNUM = 2, scaled near overflow,',
+     $            ' NORM = LARGE = 1.0/( 0.25 * ( SAFMIN / EPS ) )' )
+*
 *     TZ matrix types
 *
  9968 FORMAT( ' Matrix types (2-3 have condition 1/EPS):', / 4X,
@@ -1030,9 +1076,8 @@
      $      ' * norm(C) * EPS )' )
  9940 FORMAT( 3X, I2, ': norm(svd(A) - svd(R)) / ',
      $      '( M * norm(svd(R)) * EPS )' )
- 9939 FORMAT( 3X, I2, ': norm( A*P - Q*R )     / ( M * norm(A) * EPS )'
-     $       )
- 9938 FORMAT( 3X, I2, ': norm( I - Q''*Q )      / ( M * EPS )' )
+ 9939 FORMAT( 3X, I2, ': norm( A*P - Q*R ) / ( M * norm(A) * EPS )')
+ 9938 FORMAT( 3X, I2, ': norm( I - Q''*Q ) / ( M * EPS )' )
  9937 FORMAT( 3X, I2, ': norm( A - R*Q )       / ( M * norm(A) * EPS )'
      $       )
  9935 FORMAT( 3X, I2, ': norm( B - A * X )   / ',
@@ -1105,6 +1150,15 @@
  8054 FORMAT(3X,I2,': norm( C*Q - C*Q ) / ( M * norm(C) * EPS )' )
  8055 FORMAT(3X,I2,': norm( C*Q'' - C*Q'' ) / ( M * norm(C) * EPS )')
 
+ 8060 FORMAT( 3X, I2, ': 2-norm(svd(A) - svd(R)) / ',
+     $      '( max(M,N) * 2-norm(svd(R)) * EPS )' )
+ 8061 FORMAT( 3X, I2, ': 1-norm( A*P - Q*R ) / ( max(M,N) * 1-norm(A)',
+     $                ' * EPS )')
+ 8062 FORMAT( 3X, I2, ': 1-norm( I - Q''*Q ) / ( M * EPS )' )
+ 8063 FORMAT( 3X, I2, ': Returns 1.0D+100, if abs(R(K+1,K+1))',
+     $                 ' > abs(R(K,K)), where K=1:KFACT-1' )
+ 8064 FORMAT( 3X, I2, ': 1-norm(Q**T * B - Q**T * B ) / ( M * EPS )')
+
 *
       RETURN
 *
diff --git a/lapack-netlib/TESTING/LIN/alareq.f b/lapack-netlib/TESTING/LIN/alareq.f
index db18775eb..3f057fa48 100644
--- a/lapack-netlib/TESTING/LIN/alareq.f
+++ b/lapack-netlib/TESTING/LIN/alareq.f
@@ -28,12 +28,12 @@
 *> to evaluate the input line which requested NMATS matrix types for
 *> PATH.  The flow of control is as follows:
 *>
-*> If NMATS = NTYPES then
+*> IF NMATS = NTYPES THEN
 *>    DOTYPE(1:NTYPES) = .TRUE.
-*> else
+*> ELSE
 *>    Read the next input line for NMATS matrix types
 *>    Set DOTYPE(I) = .TRUE. for each valid type I
-*> endif
+*> END IF
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/TESTING/LIN/cchkaa.F b/lapack-netlib/TESTING/LIN/cchkaa.F
index ec1534ed4..57d95c741 100644
--- a/lapack-netlib/TESTING/LIN/cchkaa.F
+++ b/lapack-netlib/TESTING/LIN/cchkaa.F
@@ -69,6 +69,7 @@
 *> CLQ    8               List types on next line if 0 < NTYPES <  8
 *> CQL    8               List types on next line if 0 < NTYPES <  8
 *> CQP    6               List types on next line if 0 < NTYPES <  6
+*> CQK   19               List types on next line if 0 < NTYPES < 19
 *> CTZ    3               List types on next line if 0 < NTYPES <  3
 *> CLS    6               List types on next line if 0 < NTYPES <  6
 *> CEQ
@@ -153,12 +154,11 @@
      $                   NBVAL( MAXIN ), NBVAL2( MAXIN ),
      $                   NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ),
      $                   RANKVAL( MAXIN ), PIV( NMAX )
-      REAL               S( 2*NMAX )
-      COMPLEX            E( NMAX )
 *     ..
 *     .. Allocatable Arrays ..
       INTEGER AllocateStatus
-      REAL, DIMENSION(:), ALLOCATABLE :: RWORK
+      REAL, DIMENSION(:), ALLOCATABLE :: RWORK, S
+      COMPLEX, DIMENSION(:), ALLOCATABLE :: E
       COMPLEX, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK
 *     ..
 *     .. External Functions ..
@@ -170,14 +170,14 @@
       EXTERNAL           ALAREQ, CCHKEQ, CCHKGB, CCHKGE, CCHKGT, CCHKHE,
      $                   CCHKHE_ROOK, CCHKHE_RK, CCHKHE_AA, CCHKHP,
      $                   CCHKLQ, CCHKUNHR_COL, CCHKPB, CCHKPO, CCHKPS,
-     $                   CCHKPP, CCHKPT, CCHKQ3, CCHKQL, CCHKQR, CCHKRQ,
-     $                   CCHKSP, CCHKSY, CCHKSY_ROOK, CCHKSY_RK,
-     $                   CCHKSY_AA, CCHKTB,  CCHKTP, CCHKTR, CCHKTZ,
-     $                   CDRVGB, CDRVGE, CDRVGT, CDRVHE, CDRVHE_ROOK,
-     $                   CDRVHE_RK, CDRVHE_AA, CDRVHP, CDRVLS, CDRVPB,
-     $                   CDRVPO, CDRVPP, CDRVPT, CDRVSP, CDRVSY,
-     $                   CDRVSY_ROOK, CDRVSY_RK, CDRVSY_AA, ILAVER,
-     $                   CCHKQRT, CCHKQRTP
+     $                   CCHKPP, CCHKPT, CCHKQ3, CCHKQP3RK, CCHKQL,
+     $                   CCHKQR, CCHKRQ, CCHKSP, CCHKSY, CCHKSY_ROOK,
+     $                   CCHKSY_RK, CCHKSY_AA, CCHKTB, CCHKTP, CCHKTR,
+     $                   CCHKTZ, CDRVGB, CDRVGE, CDRVGT, CDRVHE,
+     $                   CDRVHE_ROOK, CDRVHE_RK, CDRVHE_AA, CDRVHP,
+     $                   CDRVLS, CDRVPB, CDRVPO, CDRVPP, CDRVPT, CDRVSP,
+     $                   CDRVSY, CDRVSY_ROOK, CDRVSY_RK, CDRVSY_AA,
+     $                   ILAVER, CCHKQRT, CCHKQRTP
 *     ..
 *     .. Scalars in Common ..
       LOGICAL            LERR, OK
@@ -203,6 +203,10 @@
       IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
       ALLOCATE ( WORK( NMAX, NMAX+MAXRHS+10 ), STAT = AllocateStatus )
       IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
+      ALLOCATE ( E( NMAX ), STAT = AllocateStatus )
+      IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
+      ALLOCATE ( S( 2*NMAX ), STAT = AllocateStatus)
+      IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
       ALLOCATE ( RWORK( 150*NMAX+2*MAXRHS ), STAT = AllocateStatus )
       IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
 *     ..
@@ -1109,6 +1113,23 @@
          ELSE
             WRITE( NOUT, FMT = 9989 )PATH
          END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN
+*
+*        QK: truncated QR factorization with pivoting
+*
+         NTYPES = 19
+         CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT )
+*
+         IF( TSTCHK ) THEN
+            CALL CCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL,
+     $                   NNB, NBVAL, NXVAL, THRESH, A( 1, 1 ),
+     $                   A( 1, 2 ), B( 1, 1 ), B( 1, 2 ),
+     $                   S( 1 ), B( 1, 4 ),
+     $                   WORK, RWORK, IWORK, NOUT )
+         ELSE
+            WRITE( NOUT, FMT = 9989 )PATH
+         END IF
 *
       ELSE IF( LSAMEN( 2, C2, 'LS' ) ) THEN
 *
@@ -1211,6 +1232,8 @@
 *
       DEALLOCATE (A, STAT = AllocateStatus)
       DEALLOCATE (B, STAT = AllocateStatus)
+      DEALLOCATE (E, STAT = AllocateStatus)
+      DEALLOCATE (S, STAT = AllocateStatus)
       DEALLOCATE (WORK, STAT = AllocateStatus)
       DEALLOCATE (RWORK,  STAT = AllocateStatus)
 *
diff --git a/lapack-netlib/TESTING/LIN/cchkhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/cchkhe_aa_2stage.f
index 30a61261f..d79978e55 100644
--- a/lapack-netlib/TESTING/LIN/cchkhe_aa_2stage.f
+++ b/lapack-netlib/TESTING/LIN/cchkhe_aa_2stage.f
@@ -433,9 +433,9 @@
 *                 block factorization, LWORK is the length of AINV.
 *
                   SRNAMT = 'CHETRF_AA_2STAGE'
-                  LWORK = MIN(N*NB, 3*NMAX*NMAX)
-                  CALL CHETRF_AA_2STAGE( UPLO, N, AFAC, LDA, 
-     $                                   AINV, (3*NB+1)*N, 
+                  LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX)
+                  CALL CHETRF_AA_2STAGE( UPLO, N, AFAC, LDA,
+     $                                   AINV, MAX( 1, (3*NB+1)*N ),
      $                                   IWORK, IWORK( 1+N ),
      $                                   WORK, LWORK,
      $                                   INFO )
@@ -517,7 +517,6 @@ c                  NT = 1
                      CALL CLACPY( 'Full', N, NRHS, B, LDA, X, LDA )
 *
                      SRNAMT = 'CHETRS_AA_2STAGE'
-                     LWORK = MAX( 1, 3*N-2 )
                      CALL CHETRS_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA,
      $                            AINV, (3*NB+1)*N, IWORK, IWORK( 1+N ),
      $                            X, LDA, INFO )
diff --git a/lapack-netlib/TESTING/LIN/cchklq.f b/lapack-netlib/TESTING/LIN/cchklq.f
index 54107d047..4499de36f 100644
--- a/lapack-netlib/TESTING/LIN/cchklq.f
+++ b/lapack-netlib/TESTING/LIN/cchklq.f
@@ -235,7 +235,7 @@
       REAL               RESULT( NTESTS )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALAERH, ALAHD, ALASUM, CERRLQ, CGELQS, CGET02,
+      EXTERNAL           ALAERH, ALAHD, ALASUM, CERRLQ, CGELS, CGET02,
      $                   CLACPY, CLARHS, CLATB4, CLATMS, CLQT01, CLQT02,
      $                   CLQT03, XLAENV
 *     ..
@@ -370,7 +370,7 @@
      $                               WORK, LWORK, RWORK, RESULT( 3 ) )
                         NT = NT + 4
 *
-*                       If M>=N and K=N, call CGELQS to solve a system
+*                       If M<=N and K=M, call CGELS to solve a system
 *                       with NRHS right hand sides and compute the
 *                       residual.
 *
@@ -387,14 +387,20 @@
 *
                            CALL CLACPY( 'Full', M, NRHS, B, LDA, X,
      $                                  LDA )
-                           SRNAMT = 'CGELQS'
-                           CALL CGELQS( M, N, NRHS, AF, LDA, TAU, X,
-     $                                  LDA, WORK, LWORK, INFO )
 *
-*                          Check error code from CGELQS.
+*                          Reset AF to the original matrix. CGELS
+*                          factors the matrix before solving the system.
+*
+                           CALL CLACPY( 'Full', M, N, A, LDA, AF, LDA )
+*
+                           SRNAMT = 'CGELS'
+                           CALL CGELS( 'No transpose', M, N, NRHS, AF,
+     $                                 LDA, X, LDA, WORK, LWORK, INFO )
+*
+*                          Check error code from CGELS.
 *
                            IF( INFO.NE.0 )
-     $                        CALL ALAERH( PATH, 'CGELQS', INFO, 0, ' ',
+     $                        CALL ALAERH( PATH, 'CGELS', INFO, 0, 'N',
      $                                     M, N, NRHS, -1, NB, IMAT,
      $                                     NFAIL, NERRS, NOUT )
 *
diff --git a/lapack-netlib/TESTING/LIN/cchkqp3rk.f b/lapack-netlib/TESTING/LIN/cchkqp3rk.f
new file mode 100644
index 000000000..79d6add72
--- /dev/null
+++ b/lapack-netlib/TESTING/LIN/cchkqp3rk.f
@@ -0,0 +1,836 @@
+*> \brief \b CCHKQP3RK
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE CCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL,
+*     $                      NNB, NBVAL, NXVAL, THRESH, A, COPYA,
+*     $                      B, COPYB, S, TAU,
+*     $                      WORK, RWORK, IWORK, NOUT )
+*      IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       INTEGER            NM, NN, NNB, NNS, NOUT
+*       REAL               THRESH
+*       ..
+*       .. Array Arguments ..
+*       LOGICAL            DOTYPE( * )
+*       INTEGER            IWORK( * ), MVAL( * ), NBVAL( * ), NVAL( * ),
+*      $                   NSVAL( * ), NXVAL( * )
+*       REAL               S( * ), RWORK( * )
+*       COMPLEX            A( * ), COPYA( * ), B( * ), COPYB( * ), TAU( * ), WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CCHKQP3RK tests CGEQP3RK.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] DOTYPE
+*> \verbatim
+*>          DOTYPE is LOGICAL array, dimension (NTYPES)
+*>          The matrix types to be used for testing.  Matrices of type j
+*>          (for 1 <= j <= NTYPES) are used for testing if DOTYPE(j) =
+*>          .TRUE.; if DOTYPE(j) = .FALSE., then type j is not used.
+*> \endverbatim
+*>
+*> \param[in] NM
+*> \verbatim
+*>          NM is INTEGER
+*>          The number of values of M contained in the vector MVAL.
+*> \endverbatim
+*>
+*> \param[in] MVAL
+*> \verbatim
+*>          MVAL is INTEGER array, dimension (NM)
+*>          The values of the matrix row dimension M.
+*> \endverbatim
+*>
+*> \param[in] NN
+*> \verbatim
+*>          NN is INTEGER
+*>          The number of values of N contained in the vector NVAL.
+*> \endverbatim
+*>
+*> \param[in] NVAL
+*> \verbatim
+*>          NVAL is INTEGER array, dimension (NN)
+*>          The values of the matrix column dimension N.
+*> \endverbatim
+*>
+*> \param[in] NNS
+*> \verbatim
+*>          NNS is INTEGER
+*>          The number of values of NRHS contained in the vector NSVAL.
+*> \endverbatim
+*>
+*> \param[in] NSVAL
+*> \verbatim
+*>          NSVAL is INTEGER array, dimension (NNS)
+*>          The values of the number of right hand sides NRHS.
+*> \endverbatim
+*> \param[in] NNB
+*> \verbatim
+*>          NNB is INTEGER
+*>          The number of values of NB and NX contained in the
+*>          vectors NBVAL and NXVAL.  The blocking parameters are used
+*>          in pairs (NB,NX).
+*> \endverbatim
+*>
+*> \param[in] NBVAL
+*> \verbatim
+*>          NBVAL is INTEGER array, dimension (NNB)
+*>          The values of the blocksize NB.
+*> \endverbatim
+*>
+*> \param[in] NXVAL
+*> \verbatim
+*>          NXVAL is INTEGER array, dimension (NNB)
+*>          The values of the crossover point NX.
+*> \endverbatim
+*>
+*> \param[in] THRESH
+*> \verbatim
+*>          THRESH is REAL
+*>          The threshold value for the test ratios.  A result is
+*>          included in the output file if RESULT >= THRESH.  To have
+*>          every test ratio printed, use THRESH = 0.
+*> \endverbatim
+*>
+*> \param[out] A
+*> \verbatim
+*>          A is COMPLEX array, dimension (MMAX*NMAX)
+*>          where MMAX is the maximum value of M in MVAL and NMAX is the
+*>          maximum value of N in NVAL.
+*> \endverbatim
+*>
+*> \param[out] COPYA
+*> \verbatim
+*>          COPYA is COMPLEX array, dimension (MMAX*NMAX)
+*> \endverbatim
+*>
+*> \param[out] B
+*> \verbatim
+*>          B is COMPLEX array, dimension (MMAX*NSMAX)
+*>          where MMAX is the maximum value of M in MVAL and NSMAX is the
+*>          maximum value of NRHS in NSVAL.
+*> \endverbatim
+*>
+*> \param[out] COPYB
+*> \verbatim
+*>          COPYB is COMPLEX array, dimension (MMAX*NSMAX)
+*> \endverbatim
+*>
+*> \param[out] S
+*> \verbatim
+*>          S is REAL array, dimension
+*>                      (min(MMAX,NMAX))
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is COMPLEX array, dimension (MMAX)
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX array, dimension
+*>                      (max(M*max(M,N) + 4*min(M,N) + max(M,N)))
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is REAL array, dimension (4*NMAX)
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (2*NMAX)
+*> \endverbatim
+*>
+*> \param[in] NOUT
+*> \verbatim
+*>          NOUT is INTEGER
+*>          The unit number for output.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup complex_lin
+*
+*  =====================================================================
+      SUBROUTINE CCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL,
+     $                      NNB, NBVAL, NXVAL, THRESH, A, COPYA,
+     $                      B, COPYB, S, TAU,
+     $                      WORK, RWORK, IWORK, NOUT )
+      IMPLICIT NONE
+*
+*  -- LAPACK test routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            NM, NN, NNB, NNS, NOUT
+      REAL               THRESH
+*     ..
+*     .. Array Arguments ..
+      LOGICAL            DOTYPE( * )
+      INTEGER            IWORK( * ), NBVAL( * ), MVAL( * ), NVAL( * ),
+     $                   NSVAL( * ), NXVAL( * )
+      REAL               S( * ), RWORK( * )
+      COMPLEX            A( * ), COPYA( * ), B( * ), COPYB( * ),
+     $                   TAU( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            NTYPES
+      PARAMETER          ( NTYPES = 19 )
+      INTEGER            NTESTS
+      PARAMETER          ( NTESTS = 5 )
+      REAL               ONE, ZERO, BIGNUM
+      COMPLEX            CONE, CZERO
+      PARAMETER          ( ONE = 1.0E+0, ZERO = 0.0E+0,
+     $                   CZERO = ( 0.0E+0, 0.0E+0 ),
+     $                   CONE = ( 1.0E+0, 0.0E+0 ),
+     $                   BIGNUM = 1.0E+38 )
+*     ..
+*     .. Local Scalars ..
+      CHARACTER          DIST, TYPE
+      CHARACTER*3        PATH
+      INTEGER            I, IHIGH, ILOW, IM, IMAT, IN, INC_ZERO,
+     $                   INB, IND_OFFSET_GEN,
+     $                   IND_IN, IND_OUT, INS, INFO,
+     $                   ISTEP, J, J_INC, J_FIRST_NZ, JB_ZERO,
+     $                   KFACT, KL, KMAX, KU, LDA, LW, LWORK,
+     $                   LWORK_MQR, M, MINMN, MINMNB_GEN, MODE, N,
+     $                   NB, NB_ZERO, NERRS, NFAIL, NB_GEN, NRHS,
+     $                   NRUN, NX, T
+      REAL               ANORM, CNDNUM, EPS, ABSTOL, RELTOL,
+     $                   DTEMP, MAXC2NRMK, RELMAXC2NRMK
+*     ..
+*     .. Local Arrays ..
+      INTEGER            ISEED( 4 ), ISEEDY( 4 )
+      REAL               RESULT( NTESTS ), RDUMMY( 1 )
+*     ..
+*     .. External Functions ..
+      REAL               SLAMCH, CQPT01, CQRT11, CQRT12, CLANGE
+      EXTERNAL           SLAMCH, CQPT01, CQRT11, CQRT12, CLANGE
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ALAERH, ALAHD, ALASUM, SLAORD, ICOPY, CAXPY,
+     $                   XLAENV, CGEQP3RK, CLACPY, CLASET, CLATB4,
+     $                   CLATMS, CUNMQR, CSWAP
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN, MOD, REAL
+*     ..
+*     .. Scalars in Common ..
+      LOGICAL            LERR, OK
+      CHARACTER*32       SRNAMT
+      INTEGER            INFOT, IOUNIT, CUNMQR_LWORK
+*     ..
+*     .. Common blocks ..
+      COMMON             / INFOC / INFOT, IOUNIT, OK, LERR
+      COMMON             / SRNAMC / SRNAMT
+*     ..
+*     .. Data statements ..
+      DATA               ISEEDY / 1988, 1989, 1990, 1991 /
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize constants and the random number seed.
+*
+      PATH( 1: 1 ) = 'Complex precision'
+      PATH( 2: 3 ) = 'QK'
+      NRUN = 0
+      NFAIL = 0
+      NERRS = 0
+      DO  I = 1, 4
+         ISEED( I ) = ISEEDY( I )
+      END DO
+      EPS = SLAMCH( 'Epsilon' )
+      INFOT = 0
+*
+      DO IM = 1, NM
+*
+*        Do for each value of M in MVAL.
+*
+         M = MVAL( IM )
+         LDA = MAX( 1, M )
+*
+         DO IN = 1, NN
+*
+*           Do for each value of N in NVAL.
+*
+            N = NVAL( IN )
+            MINMN = MIN( M, N )
+            LWORK = MAX( 1, M*MAX( M, N )+4*MINMN+MAX( M, N ),
+     $                   M*N + 2*MINMN + 4*N )
+*
+            DO INS = 1, NNS
+               NRHS = NSVAL( INS )
+*
+*                 Set up parameters with CLATB4 and generate
+*                 M-by-NRHS B matrix with CLATMS.
+*                 IMAT = 14:
+*                 Random matrix, CNDNUM = 2, NORM = ONE,
+*                 MODE = 3 (geometric distribution of singular values).
+*
+                  CALL CLATB4( PATH, 14, M, NRHS, TYPE, KL, KU, ANORM,
+     $                         MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'CLATMS'
+                  CALL CLATMS( M, NRHS, DIST, ISEED, TYPE, S, MODE,
+     $                         CNDNUM, ANORM, KL, KU, 'No packing',
+     $                         COPYB, LDA, WORK, INFO )
+*
+*                 Check error code from CLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'CLATMS', INFO, 0, ' ', M,
+     $                            NRHS, -1, -1, -1, 6, NFAIL, NERRS,
+     $                            NOUT )
+                     CYCLE
+                  END IF
+*
+               DO IMAT = 1, NTYPES
+*
+*              Do the tests only if DOTYPE( IMAT ) is true.
+*
+               IF( .NOT.DOTYPE( IMAT ) )
+     $            CYCLE
+*
+*              The type of distribution used to generate the random
+*              eigen-/singular values:
+*              ( 'S' for symmetric distribution ) => UNIFORM( -1, 1 )
+*
+*           Do for each type of NON-SYMMETRIC matrix:                               CNDNUM                     NORM                                     MODE
+*            1. Zero matrix
+*            2. Random, Diagonal, CNDNUM = 2                                        CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            3. Random, Upper triangular, CNDNUM = 2                                CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            4. Random, Lower triangular, CNDNUM = 2                                CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            5. Random, First column is zero, CNDNUM = 2                            CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            6. Random, Last MINMN column is zero, CNDNUM = 2                       CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            7. Random, Last N column is zero, CNDNUM = 2                           CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            8. Random, Middle column in MINMN is zero, CNDNUM = 2                  CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            9. Random, First half of MINMN columns are zero, CNDNUM = 2            CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           10. Random, Last columns are zero starting from MINMN/2+1, CNDNUM = 2   CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           11. Random, Half MINMN columns in the middle are zero starting
+*                  from  MINMN/2-(MINMN/2)/2+1, CNDNUM = 2                          CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           12. Random, Odd columns are ZERO, CNDNUM = 2                            CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           13. Random, Even columns are ZERO, CNDNUM = 2                           CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           14. Random, CNDNUM = 2                                                  CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           15. Random, CNDNUM = sqrt(0.1/EPS)                                      CNDNUM = BADC1 = sqrt(0.1/EPS)  ONE                                      3 ( geometric distribution of singular values )
+*           16. Random, CNDNUM = 0.1/EPS                                            CNDNUM = BADC2 = 0.1/EPS        ONE                                      3 ( geometric distribution of singular values )
+*           17. Random, CNDNUM = 0.1/EPS,                                           CNDNUM = BADC2 = 0.1/EPS        ONE                                      2 ( one small singular value, S(N)=1/CNDNUM )
+*                 one small singular value S(N)=1/CNDNUM
+*           18. Random, CNDNUM = 2, scaled near underflow                           CNDNUM = 2                      SMALL = SAFMIN
+*           19. Random, CNDNUM = 2, scaled near overflow                            CNDNUM = 2                      LARGE = 1.0/( 0.25 * ( SAFMIN / EPS ) )  3 ( geometric distribution of singular values )
+*
+               IF( IMAT.EQ.1 ) THEN
+*
+*                 Matrix 1: Zero matrix
+*
+                  CALL CLASET( 'Full', M, N, CZERO, CZERO, COPYA, LDA )
+                  DO I = 1, MINMN
+                     S( I ) = ZERO
+                  END DO
+*
+               ELSE IF( (IMAT.GE.2 .AND. IMAT.LE.4 )
+     $                  .OR. (IMAT.GE.14 .AND. IMAT.LE.19 ) ) THEN
+*
+*                 Matrices 2-4 and 14-19.
+*
+*                 Set up parameters with CLATB4 and generate a test
+*                 matrix with CLATMS.
+*
+                  CALL CLATB4( PATH, IMAT, M, N, TYPE, KL, KU, ANORM,
+     $                         MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'CLATMS'
+                  CALL CLATMS( M, N, DIST, ISEED, TYPE, S, MODE,
+     $                         CNDNUM, ANORM, KL, KU, 'No packing',
+     $                         COPYA, LDA, WORK, INFO )
+*
+*                 Check error code from CLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'CLATMS', INFO, 0, ' ', M, N,
+     $                            -1, -1, -1, IMAT, NFAIL, NERRS,
+     $                            NOUT )
+                     CYCLE
+                  END IF
+*
+                  CALL SLAORD( 'Decreasing', MINMN, S, 1 )
+*
+               ELSE IF( MINMN.GE.2
+     $                  .AND. IMAT.GE.5 .AND. IMAT.LE.13 ) THEN
+*
+*                 Rectangular matrices 5-13 that contain zero columns,
+*                 only for matrices MINMN >=2.
+*
+*                 JB_ZERO is the column index of ZERO block.
+*                 NB_ZERO is the column block size of ZERO block.
+*                 NB_GEN is the column block size of the
+*                 generated block.
+*                 J_INC is the non-zero column index increment
+*                 for matrix 12 and 13.
+*                 J_FIRST_NZ is the index of the first non-zero
+*                 column.
+*
+                  IF( IMAT.EQ.5 ) THEN
+*
+*                    First column is zero.
+*
+                     JB_ZERO = 1
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.6 ) THEN
+*
+*                    Last column MINMN is zero.
+*
+                     JB_ZERO = MINMN
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.7 ) THEN
+*
+*                    Last column N is zero.
+*
+                     JB_ZERO = N
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.8 ) THEN
+*
+*                    Middle column in MINMN is zero.
+*
+                     JB_ZERO = MINMN / 2 + 1
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.9 ) THEN
+*
+*                    First half of MINMN columns is zero.
+*
+                     JB_ZERO = 1
+                     NB_ZERO = MINMN / 2
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.10 ) THEN
+*
+*                    Last columns are zero columns,
+*                    starting from (MINMN / 2 + 1) column.
+*
+                     JB_ZERO = MINMN / 2 + 1
+                     NB_ZERO = N - JB_ZERO + 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.11 ) THEN
+*
+*                    Half of the columns in the middle of MINMN
+*                    columns is zero, starting from
+*                    MINMN/2 - (MINMN/2)/2 + 1 column.
+*
+                     JB_ZERO = MINMN / 2 - (MINMN / 2) / 2 + 1
+                     NB_ZERO = MINMN / 2
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.12 ) THEN
+*
+*                    Odd-numbered columns are zero,
+*
+                     NB_GEN = N / 2
+                     NB_ZERO = N - NB_GEN
+                     J_INC = 2
+                     J_FIRST_NZ = 2
+*
+                  ELSE IF( IMAT.EQ.13 ) THEN
+*
+*                    Even-numbered columns are zero.
+*
+                     NB_ZERO = N / 2
+                     NB_GEN = N - NB_ZERO
+                     J_INC = 2
+                     J_FIRST_NZ = 1
+*
+                  END IF
+*
+*
+*                 1) Set the first NB_ZERO columns in COPYA(1:M,1:N)
+*                    to zero.
+*
+                  CALL CLASET( 'Full', M, NB_ZERO, CZERO, CZERO,
+     $                         COPYA, LDA )
+*
+*                    2) Generate an M-by-(N-NB_ZERO) matrix with the
+*                       chosen singular value distribution
+*                       in COPYA(1:M,NB_ZERO+1:N).
+*
+                  CALL CLATB4( PATH, IMAT, M, NB_GEN, TYPE, KL, KU,
+     $                         ANORM, MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'CLATMS'
+*
+                  IND_OFFSET_GEN = NB_ZERO * LDA
+*
+                  CALL CLATMS( M, NB_GEN, DIST, ISEED, TYPE, S, MODE,
+     $                        CNDNUM, ANORM, KL, KU, 'No packing',
+     $                        COPYA( IND_OFFSET_GEN + 1 ), LDA,
+     $                        WORK, INFO )
+*
+*                 Check error code from CLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'CLATMS', INFO, 0, ' ', M,
+     $                            NB_GEN, -1, -1, -1, IMAT, NFAIL,
+     $                            NERRS, NOUT )
+                     CYCLE
+                  END IF
+*
+*                 3) Swap the generated columns from the right side
+*                 NB_GEN-size block in COPYA into correct column
+*                 positions.
+*
+                  IF( IMAT.EQ.6
+     $                    .OR. IMAT.EQ.7
+     $                    .OR. IMAT.EQ.8
+     $                    .OR. IMAT.EQ.10
+     $                    .OR. IMAT.EQ.11 ) THEN
+*
+*                    Move by swapping the generated columns
+*                    from the right NB_GEN-size block from
+*                    (NB_ZERO+1:NB_ZERO+JB_ZERO-1)
+*                    into columns (1:JB_ZERO-1).
+*
+                     DO J = 1, JB_ZERO-1, 1
+                        CALL CSWAP( M,
+     $                        COPYA( ( NB_ZERO+J-1)*LDA+1), 1,
+     $                        COPYA( (J-1)*LDA + 1 ), 1 )
+                     END DO
+*
+                  ELSE IF( IMAT.EQ.12 .OR. IMAT.EQ.13 ) THEN
+*
+*                    ( IMAT = 12, Odd-numbered ZERO columns. )
+*                    Swap the generated columns from the right
+*                    NB_GEN-size block into the even zero columns in the
+*                    left NB_ZERO-size block.
+*
+*                    ( IMAT = 13, Even-numbered ZERO columns. )
+*                    Swap the generated columns from the right
+*                    NB_GEN-size block into the odd zero columns in the
+*                    left NB_ZERO-size block.
+*
+                     DO J = 1, NB_GEN, 1
+                        IND_OUT = ( NB_ZERO+J-1 )*LDA + 1
+                        IND_IN = ( J_INC*(J-1)+(J_FIRST_NZ-1) )*LDA
+     $                            + 1
+                        CALL CSWAP( M,
+     $                              COPYA( IND_OUT ), 1,
+     $                              COPYA( IND_IN), 1 )
+                        END DO
+*
+                  END IF
+*
+*                 4) Order the singular values generated by
+*                    CLATMS in decreasing order and add trailing zeros
+*                    that correspond to zero columns.
+*                    The total number of singular values is MINMN.
+*
+                  MINMNB_GEN = MIN( M, NB_GEN )
+*
+                  CALL SLAORD( 'Decreasing', MINMNB_GEN, S, 1 )
+
+                  DO I = MINMNB_GEN+1, MINMN
+                     S( I ) = ZERO
+                  END DO
+*
+               ELSE
+*
+*                    IF(MINMN.LT.2) skip this size for this matrix type.
+*
+                     CYCLE
+               END IF
+*
+*              Initialize a copy array for a pivot array for CGEQP3RK.
+*
+               DO I = 1, N
+                  IWORK( I ) = 0
+               END DO
+*
+               DO INB = 1, NNB
+*
+*                 Do for each pair of values (NB,NX) in NBVAL and NXVAL.
+*
+                  NB = NBVAL( INB )
+                  CALL XLAENV( 1, NB )
+                  NX = NXVAL( INB )
+                  CALL XLAENV( 3, NX )
+*
+*                 We do MIN(M,N)+1 because we need a test for KMAX > N,
+*                 when KMAX is larger than MIN(M,N), KMAX should be
+*                 KMAX = MIN(M,N)
+*
+                  DO KMAX = 0, MIN(M,N)+1
+*
+*                 Get a working copy of COPYA into A( 1:M,1:N ).
+*                 Get a working copy of COPYB into A( 1:M,N+1:N+NRHS ).
+*                 Get a working copy of COPYB into B( 1:M,1:NRHS ).
+*                 Get a working copy of IWORK( 1:N ), filled with zeros,
+*                 into IWORK( N+1:2N ), which is used as the pivot array.
+*                 NOTE: IWORK( 2N+1:3N ) is going to be used as a WORK
+*                 array for the routine.
+*
+                  CALL CLACPY( 'All', M, N, COPYA, LDA, A, LDA )
+                  CALL CLACPY( 'All', M, NRHS, COPYB, LDA,
+     $                         A( LDA*N + 1 ),  LDA )
+                  CALL CLACPY( 'All', M, NRHS, COPYB, LDA,
+     $                         B,  LDA )
+                  CALL ICOPY( N, IWORK( 1 ), 1, IWORK( N+1 ), 1 )
+*
+                  ABSTOL = -1.0
+                  RELTOl = -1.0
+*
+*                 Compute the QR factorization with pivoting of A
+*
+                  LW = MAX( 1, MAX( 2*N + NB*( N+NRHS+1 ),
+     $                              3*N + NRHS - 1 ) )
+*
+*                 Compute CGEQP3RK factorization of A.
+*
+                  SRNAMT = 'CGEQP3RK'
+                  CALL CGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                           A, LDA, KFACT, MAXC2NRMK,
+     $                           RELMAXC2NRMK, IWORK( N+1 ), TAU,
+     $                           WORK, LW, RWORK, IWORK( 2*N+1 ),
+     $                           INFO )
+*
+*                 Check error code from CGEQP3RK.
+*
+                  IF( INFO.LT.0 )
+     $               CALL ALAERH( PATH, 'CGEQP3RK', INFO, 0, ' ',
+     $                            M, N, NX, -1, NB, IMAT,
+     $                            NFAIL, NERRS, NOUT )
+*
+                  IF( KFACT.EQ.MINMN ) THEN
+*
+*                 Compute test 1:
+*
+*                 This test is only for the full rank factorization of
+*                 the matrix A.
+*
+*                 Array S(1:min(M,N)) contains svd(A), the singular values
+*                 of the original matrix A in decreasing absolute value
+*                 order. The test computes svd(R), the vector of singular
+*                 values of the upper trapezoid of A(1:M,1:N) that
+*                 contains the factor R, in decreasing order. The test
+*                 returns the ratio:
+*
+*                 2-norm(svd(R) - svd(A)) / ( max(M,N) * 2-norm(svd(A)) * EPS )
+*
+                     RESULT( 1 ) = CQRT12( M, N, A, LDA, S, WORK,
+     $                                     LWORK , RWORK )
+*
+                     DO T = 1, 1
+                        IF( RESULT( T ).GE.THRESH ) THEN
+                           IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                        CALL ALAHD( NOUT, PATH )
+                           WRITE( NOUT, FMT = 9999 ) 'CGEQP3RK', M, N,
+     $                        NRHS, KMAX, ABSTOL, RELTOL, NB, NX,
+     $                        IMAT, T, RESULT( T )
+                           NFAIL = NFAIL + 1
+                        END IF
+                     END DO
+                     NRUN = NRUN + 1
+*
+*                   End test 1
+*
+                  END IF
+
+*                 Compute test 2:
+*
+*                 The test returns the ratio:
+*
+*                 1-norm( A*P - Q*R ) / ( max(M,N) * 1-norm(A) * EPS )
+*
+                  RESULT( 2 ) = CQPT01( M, N, KFACT, COPYA, A, LDA, TAU,
+     $                          IWORK( N+1 ), WORK, LWORK )
+*
+*                 Compute test 3:
+*
+*                 The test returns the ratio:
+*
+*                 1-norm( Q**T * Q - I ) / ( M * EPS )
+*
+                  RESULT( 3 ) = CQRT11( M, KFACT, A, LDA, TAU, WORK,
+     $                          LWORK )
+*
+*                 Print information about the tests that did not pass
+*                 the threshold.
+*
+                  DO T = 2, 3
+                     IF( RESULT( T ).GE.THRESH ) THEN
+                        IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                     CALL ALAHD( NOUT, PATH )
+                        WRITE( NOUT, FMT = 9999 ) 'CGEQP3RK', M, N,
+     $                      NRHS, KMAX, ABSTOL, RELTOL,
+     $                      NB, NX, IMAT, T, RESULT( T )
+                        NFAIL = NFAIL + 1
+                     END IF
+                  END DO
+                  NRUN = NRUN + 2
+*
+*                 Compute test 4:
+*
+*                 This test is only for the factorizations with the
+*                 rank greater than or equal to 2.
+*                 The elements on the diagonal of R should be non-
+*                 increasing in absolute value.
+*
+*                 The test result:
+*
+*                 RESULT( 4 ) is set to BIGNUM ( 1.0E+38 ) if
+*                 abs(R(K+1,K+1)) > abs(R(K,K)) for some K=1:KFACT-1
+*
+                  IF( MIN(KFACT, MINMN).GE.2 ) THEN
+*
+                     DO J = 1, KFACT-1, 1
+*
+                        DTEMP = (( ABS( A( (J-1)*M+J ) ) -
+     $                          ABS( A( (J)*M+J+1 ) ) ) /
+     $                          ABS( A(1) ) )
+*
+                        IF( DTEMP.LT.ZERO ) THEN
+                           RESULT( 4 ) = BIGNUM
+                        END IF
+*
+                     END DO
+*
+*                    Print information about the tests that did not
+*                    pass the threshold.
+*
+                     DO T = 4, 4
+                        IF( RESULT( T ).GE.THRESH ) THEN
+                           IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                        CALL ALAHD( NOUT, PATH )
+                           WRITE( NOUT, FMT = 9999 ) 'CGEQP3RK',
+     $                        M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                        NB, NX, IMAT, T,
+     $                        RESULT( T )
+                           NFAIL = NFAIL + 1
+                        END IF
+                     END DO
+                     NRUN = NRUN + 1
+*
+*                    End test 4.
+*
+                  END IF
+*
+*                 Compute test 5:
+*
+*                 This test is only for matrix A with min(M,N) > 0.
+*
+*                 The test returns the ratio:
+*
+*                 1-norm(Q**T * B - Q**T * B ) /
+*                       ( M * EPS )
+*
+*                 (1) Compute B:=Q**T * B in the matrix B.
+*
+                  IF( MINMN.GT.0 ) THEN
+*
+                     LWORK_MQR = MAX(1, NRHS)
+                     CALL CUNMQR( 'Left', 'Conjugate transpose',
+     $                         M, NRHS, KFACT, A, LDA, TAU, B, LDA,
+     $                         WORK, LWORK_MQR, INFO )
+*
+                     DO I = 1, NRHS
+*
+*                       Compare (N+I)-th column of A and I-th column of B.
+*
+                        CALL CAXPY( M, -CONE, A( ( N+I-1 )*LDA+1 ), 1,
+     $                                    B( ( I-1 )*LDA+1 ), 1 )
+                     END DO
+*
+                     RESULT( 5 ) =
+     $               ABS(
+     $               CLANGE( 'One-norm', M, NRHS, B, LDA, RDUMMY ) /
+     $               ( REAL( M )*SLAMCH( 'Epsilon' ) )
+     $               )
+*
+*                    Print information about the tests that did not pass
+*                    the threshold.
+*
+                     DO T = 5, 5
+                        IF( RESULT( T ).GE.THRESH ) THEN
+                           IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                        CALL ALAHD( NOUT, PATH )
+                           WRITE( NOUT, FMT = 9999 ) 'CGEQP3RK', M, N,
+     $                        NRHS, KMAX, ABSTOL, RELTOL,
+     $                        NB, NX, IMAT, T, RESULT( T )
+                           NFAIL = NFAIL + 1
+                        END IF
+                     END DO
+                     NRUN = NRUN + 1
+*
+*                    End compute test 5.
+*
+                  END IF
+*
+*                 END DO for KMAX = 0, MIN(M,N)+1
+*
+                  END DO
+*
+*                 END DO for INB = 1, NNB
+*
+               END DO
+*
+*              END DO  for IMAT = 1, NTYPES
+*
+               END DO
+*
+*              END DO for INS = 1, NNS
+*
+            END DO
+*
+*           END DO for IN = 1, NN
+*
+         END DO
+*
+*        END DO for IM = 1, NM
+*
+      END DO
+*
+*     Print a summary of the results.
+*
+      CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS )
+*
+ 9999 FORMAT( 1X, A, ' M =', I5, ', N =', I5, ', NRHS =', I5,
+     $        ', KMAX =', I5, ', ABSTOL =', G12.5,
+     $        ', RELTOL =', G12.5, ', NB =', I4, ', NX =', I4,
+     $        ', type ', I2, ', test ', I2, ', ratio =', G12.5 )
+*
+*     End of CCHKQP3RK
+*
+      END
diff --git a/lapack-netlib/TESTING/LIN/cchkqr.f b/lapack-netlib/TESTING/LIN/cchkqr.f
index 7ea178eaf..4fa7413f9 100644
--- a/lapack-netlib/TESTING/LIN/cchkqr.f
+++ b/lapack-netlib/TESTING/LIN/cchkqr.f
@@ -244,7 +244,7 @@
       EXTERNAL           CGENND
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALAERH, ALAHD, ALASUM, CERRQR, CGEQRS, CGET02,
+      EXTERNAL           ALAERH, ALAHD, ALASUM, CERRQR, CGELS, CGET02,
      $                   CLACPY, CLARHS, CLATB4, CLATMS, CQRT01,
      $                   CQRT01P, CQRT02, CQRT03, XLAENV
 *     ..
@@ -371,7 +371,7 @@
                          IF( .NOT. CGENND( M, N, AF, LDA ) )
      $                       RESULT( 9 ) = 2*THRESH
                         NT = NT + 1
-                    ELSE IF( M.GE.N ) THEN
+                     ELSE IF( M.GE.N ) THEN
 *
 *                       Test CUNGQR, using factorization
 *                       returned by CQRT01
@@ -388,7 +388,7 @@
      $                               WORK, LWORK, RWORK, RESULT( 3 ) )
                         NT = NT + 4
 *
-*                       If M>=N and K=N, call CGEQRS to solve a system
+*                       If M>=N and K=N, call CGELS to solve a system
 *                       with NRHS right hand sides and compute the
 *                       residual.
 *
@@ -405,14 +405,20 @@
 *
                            CALL CLACPY( 'Full', M, NRHS, B, LDA, X,
      $                                  LDA )
-                           SRNAMT = 'CGEQRS'
-                           CALL CGEQRS( M, N, NRHS, AF, LDA, TAU, X,
-     $                                  LDA, WORK, LWORK, INFO )
 *
-*                          Check error code from CGEQRS.
+*                          Reset AF to the original matrix. CGELS
+*                          factors the matrix before solving the system.
+*
+                           CALL CLACPY( 'Full', M, N, A, LDA, AF, LDA )
+*
+                           SRNAMT = 'CGELS'
+                           CALL CGELS( 'No transpose', M, N, NRHS, AF,
+     $                                 LDA, X, LDA, WORK, LWORK, INFO )
+*
+*                          Check error code from CGELS.
 *
                            IF( INFO.NE.0 )
-     $                        CALL ALAERH( PATH, 'CGEQRS', INFO, 0, ' ',
+     $                        CALL ALAERH( PATH, 'CGELS', INFO, 0, 'N',
      $                                     M, N, NRHS, -1, NB, IMAT,
      $                                     NFAIL, NERRS, NOUT )
 *
diff --git a/lapack-netlib/TESTING/LIN/cchktp.f b/lapack-netlib/TESTING/LIN/cchktp.f
index 18242ff54..e14f1062e 100644
--- a/lapack-netlib/TESTING/LIN/cchktp.f
+++ b/lapack-netlib/TESTING/LIN/cchktp.f
@@ -87,7 +87,7 @@
 *> \verbatim
 *>          NMAX is INTEGER
 *>          The leading dimension of the work arrays.  NMAX >= the
-*>          maximumm value of N in NVAL.
+*>          maximum value of N in NVAL.
 *> \endverbatim
 *>
 *> \param[out] AP
diff --git a/lapack-netlib/TESTING/LIN/cchktr.f b/lapack-netlib/TESTING/LIN/cchktr.f
index 4b09361d8..2953a2bd5 100644
--- a/lapack-netlib/TESTING/LIN/cchktr.f
+++ b/lapack-netlib/TESTING/LIN/cchktr.f
@@ -201,7 +201,8 @@
 *     .. Local Arrays ..
       CHARACTER          TRANSS( NTRAN ), UPLOS( 2 )
       INTEGER            ISEED( 4 ), ISEEDY( 4 )
-      REAL               RESULT( NTESTS ), SCALE3( 2 )
+      REAL               RESULT( NTESTS ), RWORK2( 2*NMAX ),
+     $                   SCALE3( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -542,10 +543,10 @@
                   SRNAMT = 'CLATRS3'
                   CALL CCOPY( N, X, 1, B, 1 )
                   CALL CCOPY( N, X, 1, B( N+1 ), 1 )
-                  CALL CSCAL( N, BIGNUM, B( N+1 ), 1 )
+                  CALL CSSCAL( N, BIGNUM, B( N+1 ), 1 )
                   CALL CLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA,
-     $                          B, MAX(1, N), SCALE3, RWORK, WORK, NMAX,
-     $                          INFO )
+     $                          B, MAX(1, N), SCALE3, RWORK, RWORK2,
+     $                          2*NMAX, INFO )
 *
 *                 Check error code from CLATRS3.
 *
diff --git a/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f
index 51cef512d..83e8a17b0 100644
--- a/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f
+++ b/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f
@@ -400,9 +400,9 @@
 *                    Factor the matrix and solve the system using CHESV_AA.
 *
                      SRNAMT = 'CHESV_AA_2STAGE '
-                     LWORK = MIN(N*NB, 3*NMAX*NMAX)
+                     LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX)
                      CALL CHESV_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA,
-     $                                 AINV, (3*NB+1)*N, 
+     $                                 AINV, MAX( 1, (3*NB+1)*N ),
      $                                 IWORK, IWORK( 1+N ),
      $                                 X, LDA, WORK, LWORK, INFO )
 *
diff --git a/lapack-netlib/TESTING/LIN/cerrhe.f b/lapack-netlib/TESTING/LIN/cerrhe.f
index d0c5cf6b5..1798aed57 100644
--- a/lapack-netlib/TESTING/LIN/cerrhe.f
+++ b/lapack-netlib/TESTING/LIN/cerrhe.f
@@ -133,7 +133,7 @@
       IF( LSAMEN( 2, C2, 'HE' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a Hermitian indefinite matrix with patrial
+*        of a Hermitian indefinite matrix with partial
 *        (Bunch-Kaufman) diagonal pivoting method.
 *
 *        CHETRF
@@ -576,7 +576,7 @@
          CALL CHKXER( 'CHETRS_AA_STAGE', INFOT, NOUT, LERR, OK )
 *
 *        Test error exits of the routines that use factorization
-*        of a Hermitian indefinite packed matrix with patrial
+*        of a Hermitian indefinite packed matrix with partial
 *        (Bunch-Kaufman) diagonal pivoting method.
 *
       ELSE IF( LSAMEN( 2, C2, 'HP' ) ) THEN
diff --git a/lapack-netlib/TESTING/LIN/cerrhex.f b/lapack-netlib/TESTING/LIN/cerrhex.f
index b6c889798..83b3a92ad 100644
--- a/lapack-netlib/TESTING/LIN/cerrhex.f
+++ b/lapack-netlib/TESTING/LIN/cerrhex.f
@@ -137,7 +137,7 @@
       IF( LSAMEN( 2, C2, 'HE' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a Hermitian indefinite matrix with patrial
+*        of a Hermitian indefinite matrix with partial
 *        (Bunch-Kaufman) diagonal pivoting method.
 *
 *        CHETRF
@@ -523,7 +523,7 @@
       ELSE IF( LSAMEN( 2, C2, 'HP' ) ) THEN
 *
 *     Test error exits of the routines that use factorization
-*     of a Hermitian indefinite packed matrix with patrial
+*     of a Hermitian indefinite packed matrix with partial
 *     (Bunch-Kaufman) diagonal pivoting method.
 *
 *        CHPTRF
diff --git a/lapack-netlib/TESTING/LIN/cerrlq.f b/lapack-netlib/TESTING/LIN/cerrlq.f
index 1036835b4..495adac0d 100644
--- a/lapack-netlib/TESTING/LIN/cerrlq.f
+++ b/lapack-netlib/TESTING/LIN/cerrlq.f
@@ -76,7 +76,7 @@
      $                   W( NMAX ), X( NMAX )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALAESM, CGELQ2, CGELQF, CGELQS, CHKXER, CUNGL2,
+      EXTERNAL           ALAESM, CGELQ2, CGELQF, CHKXER, CUNGL2,
      $                   CUNGLQ, CUNML2, CUNMLQ
 *     ..
 *     .. Scalars in Common ..
@@ -140,31 +140,6 @@
       CALL CGELQ2( 2, 1, A, 1, B, W, INFO )
       CALL CHKXER( 'CGELQ2', INFOT, NOUT, LERR, OK )
 *
-*     CGELQS
-*
-      SRNAMT = 'CGELQS'
-      INFOT = 1
-      CALL CGELQS( -1, 0, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'CGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 2
-      CALL CGELQS( 0, -1, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'CGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 2
-      CALL CGELQS( 2, 1, 0, A, 2, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'CGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 3
-      CALL CGELQS( 0, 0, -1, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'CGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 5
-      CALL CGELQS( 2, 2, 0, A, 1, X, B, 2, W, 1, INFO )
-      CALL CHKXER( 'CGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 8
-      CALL CGELQS( 1, 2, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'CGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 10
-      CALL CGELQS( 1, 1, 2, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'CGELQS', INFOT, NOUT, LERR, OK )
-*
 *     CUNGLQ
 *
       SRNAMT = 'CUNGLQ'
diff --git a/lapack-netlib/TESTING/LIN/cerrqr.f b/lapack-netlib/TESTING/LIN/cerrqr.f
index 21cf22936..30ce001eb 100644
--- a/lapack-netlib/TESTING/LIN/cerrqr.f
+++ b/lapack-netlib/TESTING/LIN/cerrqr.f
@@ -77,7 +77,7 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           ALAESM, CGEQR2, CGEQR2P, CGEQRF, CGEQRFP,
-     $                   CGEQRS, CHKXER, CUNG2R, CUNGQR, CUNM2R,
+     $                   CHKXER, CUNG2R, CUNGQR, CUNM2R,
      $                   CUNMQR
 *     ..
 *     .. Scalars in Common ..
@@ -170,31 +170,6 @@
       CALL CGEQR2P( 2, 1, A, 1, B, W, INFO )
       CALL CHKXER( 'CGEQR2P', INFOT, NOUT, LERR, OK )
 *
-*     CGEQRS
-*
-      SRNAMT = 'CGEQRS'
-      INFOT = 1
-      CALL CGEQRS( -1, 0, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'CGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 2
-      CALL CGEQRS( 0, -1, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'CGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 2
-      CALL CGEQRS( 1, 2, 0, A, 2, X, B, 2, W, 1, INFO )
-      CALL CHKXER( 'CGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 3
-      CALL CGEQRS( 0, 0, -1, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'CGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 5
-      CALL CGEQRS( 2, 1, 0, A, 1, X, B, 2, W, 1, INFO )
-      CALL CHKXER( 'CGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 8
-      CALL CGEQRS( 2, 1, 0, A, 2, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'CGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 10
-      CALL CGEQRS( 1, 1, 2, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'CGEQRS', INFOT, NOUT, LERR, OK )
-*
 *     CUNGQR
 *
       SRNAMT = 'CUNGQR'
diff --git a/lapack-netlib/TESTING/LIN/cerrsy.f b/lapack-netlib/TESTING/LIN/cerrsy.f
index a236d2d18..6d284be98 100644
--- a/lapack-netlib/TESTING/LIN/cerrsy.f
+++ b/lapack-netlib/TESTING/LIN/cerrsy.f
@@ -130,7 +130,7 @@
       IF( LSAMEN( 2, C2, 'SY' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a symmetric indefinite matrix with patrial
+*        of a symmetric indefinite matrix with partial
 *        (Bunch-Kaufman) diagonal pivoting method.
 *
 *        CSYTRF
@@ -469,7 +469,7 @@
       ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a symmetric indefinite packed matrix with patrial
+*        of a symmetric indefinite packed matrix with partial
 *        (Bunch-Kaufman) diagonal pivoting method.
 *
 *        CSPTRF
diff --git a/lapack-netlib/TESTING/LIN/cerrsyx.f b/lapack-netlib/TESTING/LIN/cerrsyx.f
index 34972668e..c8d269c84 100644
--- a/lapack-netlib/TESTING/LIN/cerrsyx.f
+++ b/lapack-netlib/TESTING/LIN/cerrsyx.f
@@ -135,7 +135,7 @@
       IF( LSAMEN( 2, C2, 'SY' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a symmetric indefinite matrix with patrial
+*        of a symmetric indefinite matrix with partial
 *        (Bunch-Kaufman) diagonal pivoting method.
 *
 *        CSYTRF
@@ -521,7 +521,7 @@
       ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a symmetric indefinite packed matrix with patrial
+*        of a symmetric indefinite packed matrix with partial
 *        (Bunch-Kaufman) diagonal pivoting method.
 *
 *        CSPTRF
diff --git a/lapack-netlib/TESTING/LIN/cerrtr.f b/lapack-netlib/TESTING/LIN/cerrtr.f
index 9ba784f62..ab83357f8 100644
--- a/lapack-netlib/TESTING/LIN/cerrtr.f
+++ b/lapack-netlib/TESTING/LIN/cerrtr.f
@@ -70,7 +70,7 @@
 *     .. Local Scalars ..
       CHARACTER*2        C2
       INTEGER            INFO
-      REAL               RCOND, SCALE
+      REAL               RCOND, SCALE, SCALES(0)
 *     ..
 *     .. Local Arrays ..
       REAL               R1( NMAX ), R2( NMAX ), RW( NMAX )
@@ -245,40 +245,40 @@
 *
          SRNAMT = 'CLATRS3'
          INFOT = 1
-         CALL CLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW,
-     $                 RW( 2 ), 1, INFO )
+         CALL CLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALES,
+     $                 RW, RW( 2 ), 1, INFO )
          CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 2
-         CALL CLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW,
-     $                 RW( 2 ), 1, INFO )
+         CALL CLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALES,
+     $                 RW, RW( 2 ), 1, INFO )
          CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 3
-         CALL CLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, RW,
-     $                 RW( 2 ), 1, INFO )
+         CALL CLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALES,
+     $                 RW, RW( 2 ), 1, INFO )
          CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 4
-         CALL CLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, RW,
-     $                 RW( 2 ), 1, INFO )
+         CALL CLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALES,
+     $                 RW, RW( 2 ), 1, INFO )
          CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 5
-         CALL CLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, RW,
-     $                 RW( 2 ), 1, INFO )
+         CALL CLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALES,
+     $                 RW, RW( 2 ), 1, INFO )
          CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 6
-         CALL CLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, RW,
-     $                 RW( 2 ), 1, INFO )
+         CALL CLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALES,
+     $                 RW, RW( 2 ), 1, INFO )
          CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 8
-         CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, RW,
-     $                 RW( 2 ), 1, INFO )
+         CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALES,
+     $                 RW, RW( 2 ), 1, INFO )
          CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 10
-         CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, RW,
-     $                 RW( 2 ), 1, INFO )
+         CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALES,
+     $                 RW, RW( 2 ), 1, INFO )
          CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 14
-         CALL CLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, RW,
-     $                 RW( 2 ), 0, INFO )
+         CALL CLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALES,
+     $                 RW, RW( 2 ), 0, INFO )
          CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK )
 *
 *     Test error exits for the packed triangular routines.
diff --git a/lapack-netlib/TESTING/LIN/cerrvx.f b/lapack-netlib/TESTING/LIN/cerrvx.f
index 701abd161..548d09d0c 100644
--- a/lapack-netlib/TESTING/LIN/cerrvx.f
+++ b/lapack-netlib/TESTING/LIN/cerrvx.f
@@ -87,7 +87,7 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CGBSV, CGBSVX, CGESV, CGESVX, CGTSV, CGTSVX,
-     $                   CHESV, CHESV_RK ,CHESV_ROOK, CHESVX, CHKXER,
+     $                   CHESV, CHESV_RK, CHESV_ROOK, CHESVX, CHKXER,
      $                   CHPSV, CHPSVX, CPBSV, CPBSVX, CPOSV, CPOSVX,
      $                   CPPSV, CPPSVX, CPTSV, CPTSVX, CSPSV, CSPSVX,
      $                   CSYSV, CSYSV_AA, CSYSV_RK, CSYSV_ROOK,
@@ -651,6 +651,9 @@
          INFOT = 3
          CALL CHESV_ROOK( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'CHESV_ROOK', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL CHESV_ROOK( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO )
+         CALL CHKXER( 'CHESV_ROOK', INFOT, NOUT, LERR, OK )
          INFOT = 8
          CALL CHESV_ROOK( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'CHESV_ROOK', INFOT, NOUT, LERR, OK )
@@ -710,9 +713,15 @@
          INFOT = 3
          CALL CHESV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'CHESV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL CHESV_AA( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO )
+         CALL CHKXER( 'CHESV_AA', INFOT, NOUT, LERR, OK )
          INFOT = 8
          CALL CHESV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'CHESV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 10
+         CALL CHESV_AA( 'U', 3, 1, A, 3, IP, B, 3, W, 6, INFO )
+         CALL CHKXER( 'CHESV_AA', INFOT, NOUT, LERR, OK )
 *
       ELSE IF( LSAMEN( 2, C2, 'H2' ) ) THEN
 *
@@ -733,16 +742,44 @@
          CALL CHKXER( 'CHESV_AA_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 5
          CALL CHESV_AA_2STAGE( 'U', 2, 1, A, 1, A, 1, IP, IP, B, 1,
+     $                         W, 1, INFO )
+         CALL CHKXER( 'CHESV_AA_2STAGE', INFOT, NOUT, LERR, OK )
+         INFOT = 7
+         CALL CHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2,
      $                         W, 1, INFO )
          CALL CHKXER( 'CHESV_AA_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 11
          CALL CHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1,
      $                         W, 1, INFO )
          CALL CHKXER( 'CHESV_AA_2STAGE', INFOT, NOUT, LERR, OK )
-         INFOT = 7
-         CALL CHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2,
+         INFOT = 13
+         CALL CHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 2,
      $                         W, 1, INFO )
          CALL CHKXER( 'CHESV_AA_2STAGE', INFOT, NOUT, LERR, OK )
+*
+      ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN
+*
+*        CSYSV_AA
+*
+         SRNAMT = 'CSYSV_AA'
+         INFOT = 1
+         CALL CSYSV_AA( '/', 0, 0, A, 1, IP, B, 1, W, 1, INFO )
+         CALL CHKXER( 'CSYSV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL CSYSV_AA( 'U', -1, 0, A, 1, IP, B, 1, W, 1, INFO )
+         CALL CHKXER( 'CSYSV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 3
+         CALL CSYSV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO )
+         CALL CHKXER( 'CSYSV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL CSYSV_AA( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO )
+         CALL CHKXER( 'CSYSV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 8
+         CALL CSYSV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO )
+         CALL CHKXER( 'CSYSV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 10
+         CALL CSYSV_AA( 'U', 3, 1, A, 3, IP, B, 3, W, 6, INFO )
+         CALL CHKXER( 'CSYSV_AA', INFOT, NOUT, LERR, OK )
 *
       ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN
 *
@@ -763,14 +800,18 @@
          CALL CHKXER( 'CSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 5
          CALL CSYSV_AA_2STAGE( 'U', 2, 1, A, 1, A, 1, IP, IP, B, 1,
+     $                         W, 1, INFO )
+         CALL CHKXER( 'CSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
+         INFOT = 7
+         CALL CSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2,
      $                         W, 1, INFO )
          CALL CHKXER( 'CSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 11
          CALL CSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1,
      $                         W, 1, INFO )
          CALL CHKXER( 'CSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
-         INFOT = 7
-         CALL CSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2,
+         INFOT = 13
+         CALL CSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 2,
      $                         W, 1, INFO )
          CALL CHKXER( 'CSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
 *
@@ -834,6 +875,9 @@
          INFOT = 3
          CALL CSYSV( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'CSYSV ', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL CSYSV( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO )
+         CALL CHKXER( 'CSYSV ', INFOT, NOUT, LERR, OK )
          INFOT = 8
          CALL CSYSV( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'CSYSV ', INFOT, NOUT, LERR, OK )
@@ -898,6 +942,9 @@
          INFOT = 3
          CALL CSYSV_ROOK( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'CSYSV_ROOK', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL CSYSV_ROOK( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO )
+         CALL CHKXER( 'CSYSV_ROOK', INFOT, NOUT, LERR, OK )
          INFOT = 8
          CALL CSYSV_ROOK( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'CSYSV_ROOK', INFOT, NOUT, LERR, OK )
diff --git a/lapack-netlib/TESTING/LIN/cgtt01.f b/lapack-netlib/TESTING/LIN/cgtt01.f
index e504515c5..64dc5f8fa 100644
--- a/lapack-netlib/TESTING/LIN/cgtt01.f
+++ b/lapack-netlib/TESTING/LIN/cgtt01.f
@@ -39,7 +39,7 @@
 *
 *> \param[in] N
 *> \verbatim
-*>          N is INTEGTER
+*>          N is INTEGER
 *>          The order of the matrix A.  N >= 0.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/LIN/cgtt02.f b/lapack-netlib/TESTING/LIN/cgtt02.f
index 702e66eed..8b951acd5 100644
--- a/lapack-netlib/TESTING/LIN/cgtt02.f
+++ b/lapack-netlib/TESTING/LIN/cgtt02.f
@@ -40,14 +40,14 @@
 *> \verbatim
 *>          TRANS is CHARACTER
 *>          Specifies the form of the residual.
-*>          = 'N':  B - A * X     (No transpose)
+*>          = 'N':  B - A    * X  (No transpose)
 *>          = 'T':  B - A**T * X  (Transpose)
 *>          = 'C':  B - A**H * X  (Conjugate transpose)
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
-*>          N is INTEGTER
+*>          N is INTEGER
 *>          The order of the matrix A.  N >= 0.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/LIN/chet01_3.f b/lapack-netlib/TESTING/LIN/chet01_3.f
index 58104a357..7e07b5790 100644
--- a/lapack-netlib/TESTING/LIN/chet01_3.f
+++ b/lapack-netlib/TESTING/LIN/chet01_3.f
@@ -188,7 +188,7 @@
          RETURN
       END IF
 *
-*     a) Revert to multiplyers of L
+*     a) Revert to multipliers of L
 *
       CALL CSYCONVF_ROOK( UPLO, 'R', N, AFAC, LDAFAC, E, IPIV, INFO )
 *
diff --git a/lapack-netlib/TESTING/LIN/clatb4.f b/lapack-netlib/TESTING/LIN/clatb4.f
index eeb0f03a9..233a8631a 100644
--- a/lapack-netlib/TESTING/LIN/clatb4.f
+++ b/lapack-netlib/TESTING/LIN/clatb4.f
@@ -154,9 +154,6 @@
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, SQRT
 *     ..
-*     .. External Subroutines ..
-      EXTERNAL           SLABAD
-*     ..
 *     .. Save statement ..
       SAVE               EPS, SMALL, LARGE, BADC1, BADC2, FIRST
 *     ..
@@ -174,11 +171,6 @@
          BADC1 = SQRT( BADC2 )
          SMALL = SLAMCH( 'Safe minimum' )
          LARGE = ONE / SMALL
-*
-*        If it looks like we're on a Cray, take the square root of
-*        SMALL and LARGE to avoid overflow and underflow problems.
-*
-         CALL SLABAD( SMALL, LARGE )
          SMALL = SHRINK*( SMALL / EPS )
          LARGE = ONE / SMALL
       END IF
@@ -233,6 +225,110 @@
          ELSE
             ANORM = ONE
          END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN
+*
+*        xQK: truncated QR with pivoting.
+*             Set parameters to generate a general
+*             M x N matrix.
+*
+*        Set TYPE, the type of matrix to be generated.  'N' is nonsymmetric.
+*
+         TYPE = 'N'
+*
+*        Set DIST, the type of distribution for the random
+*        number generator. 'S' is uniform on (-1,1) (symmetric).
+*
+         DIST = 'S'
+*
+*        Set the lower and upper bandwidths.
+*
+         IF( IMAT.EQ.2 ) THEN
+*
+*           2. Random, Diagonal, CNDNUM = 2
+*
+            KL = 0
+            KU = 0
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE IF( IMAT.EQ.3 ) THEN
+*
+*           3. Random, Upper triangular,  CNDNUM = 2
+*
+            KL = 0
+            KU = MAX( N-1, 0 )
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE IF( IMAT.EQ.4 ) THEN
+*
+*           4. Random, Lower triangular,  CNDNUM = 2
+*
+            KL = MAX( M-1, 0 )
+            KU = 0
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE
+*
+*           5.-19. Rectangular matrix
+*
+            KL = MAX( M-1, 0 )
+            KU = MAX( N-1, 0 )
+*
+            IF( IMAT.GE.5 .AND. IMAT.LE.14 ) THEN
+*
+*              5.-14. Random, CNDNUM = 2.
+*
+               CNDNUM = TWO
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.15 ) THEN
+*
+*              15. Random, CNDNUM = sqrt(0.1/EPS)
+*
+               CNDNUM = BADC1
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.16 ) THEN
+*
+*              16. Random, CNDNUM = 0.1/EPS
+*
+               CNDNUM = BADC2
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.17 ) THEN
+*
+*              17. Random, CNDNUM = 0.1/EPS,
+*                  one small singular value S(N)=1/CNDNUM
+*
+               CNDNUM = BADC2
+               ANORM = ONE
+               MODE = 2
+*
+            ELSE IF( IMAT.EQ.18 ) THEN
+*
+*              18. Random, scaled near underflow
+*
+               CNDNUM = TWO
+               ANORM = SMALL
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.19 ) THEN
+*
+*              19. Random, scaled near overflow
+*
+               CNDNUM = TWO
+               ANORM = LARGE
+               MODE = 3
+*
+            END IF
+*
+         END IF
 *
       ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN
 *
@@ -517,17 +613,18 @@
 *
 *        Set the norm and condition number.
 *
-         IF( IMAT.EQ.2 .OR. IMAT.EQ.8 ) THEN
+         MAT = ABS( IMAT )
+         IF( MAT.EQ.2 .OR. MAT.EQ.8 ) THEN
             CNDNUM = BADC1
-         ELSE IF( IMAT.EQ.3 .OR. IMAT.EQ.9 ) THEN
+         ELSE IF( MAT.EQ.3 .OR. MAT.EQ.9 ) THEN
             CNDNUM = BADC2
          ELSE
             CNDNUM = TWO
          END IF
 *
-         IF( IMAT.EQ.4 ) THEN
+         IF( MAT.EQ.4 ) THEN
             ANORM = SMALL
-         ELSE IF( IMAT.EQ.5 ) THEN
+         ELSE IF( MAT.EQ.5 ) THEN
             ANORM = LARGE
          ELSE
             ANORM = ONE
diff --git a/lapack-netlib/TESTING/LIN/clqt02.f b/lapack-netlib/TESTING/LIN/clqt02.f
index 24eb05399..20297f6cc 100644
--- a/lapack-netlib/TESTING/LIN/clqt02.f
+++ b/lapack-netlib/TESTING/LIN/clqt02.f
@@ -27,7 +27,7 @@
 *> \verbatim
 *>
 *> CLQT02 tests CUNGLQ, which generates an m-by-n matrix Q with
-*> orthonornmal rows that is defined as the product of k elementary
+*> orthonormal rows that is defined as the product of k elementary
 *> reflectors.
 *>
 *> Given the LQ factorization of an m-by-n matrix A, CLQT02 generates
diff --git a/lapack-netlib/TESTING/LIN/cptt01.f b/lapack-netlib/TESTING/LIN/cptt01.f
index e4520ec3d..3b117ad28 100644
--- a/lapack-netlib/TESTING/LIN/cptt01.f
+++ b/lapack-netlib/TESTING/LIN/cptt01.f
@@ -36,7 +36,7 @@
 *
 *> \param[in] N
 *> \verbatim
-*>          N is INTEGTER
+*>          N is INTEGER
 *>          The order of the matrix A.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/LIN/cptt02.f b/lapack-netlib/TESTING/LIN/cptt02.f
index da4f0e854..ffaef89bd 100644
--- a/lapack-netlib/TESTING/LIN/cptt02.f
+++ b/lapack-netlib/TESTING/LIN/cptt02.f
@@ -46,7 +46,7 @@
 *>
 *> \param[in] N
 *> \verbatim
-*>          N is INTEGTER
+*>          N is INTEGER
 *>          The order of the matrix A.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/LIN/cqlt02.f b/lapack-netlib/TESTING/LIN/cqlt02.f
index fc4685aa5..53080cd17 100644
--- a/lapack-netlib/TESTING/LIN/cqlt02.f
+++ b/lapack-netlib/TESTING/LIN/cqlt02.f
@@ -27,7 +27,7 @@
 *> \verbatim
 *>
 *> CQLT02 tests CUNGQL, which generates an m-by-n matrix Q with
-*> orthonornmal columns that is defined as the product of k elementary
+*> orthonormal columns that is defined as the product of k elementary
 *> reflectors.
 *>
 *> Given the QL factorization of an m-by-n matrix A, CQLT02 generates
diff --git a/lapack-netlib/TESTING/LIN/cqpt01.f b/lapack-netlib/TESTING/LIN/cqpt01.f
index 79fc2dc66..149c5bb7c 100644
--- a/lapack-netlib/TESTING/LIN/cqpt01.f
+++ b/lapack-netlib/TESTING/LIN/cqpt01.f
@@ -33,7 +33,8 @@
 *> Householder vectors, and the rest of AF contains a partially updated
 *> matrix.
 *>
-*> This function returns ||A*P - Q*R||/(||norm(A)||*eps*M)
+*> This function returns ||A*P - Q*R|| / ( ||norm(A)||*eps*max(M,N) )
+*> where || . || is the matrix one-norm.
 *> \endverbatim
 *
 *  Arguments:
@@ -172,28 +173,28 @@
 *
       NORMA = CLANGE( 'One-norm', M, N, A, LDA, RWORK )
 *
-      DO 30 J = 1, K
-         DO 10 I = 1, MIN( J, M )
+      DO J = 1, K
+         DO I = 1, MIN( J, M )
             WORK( ( J-1 )*M+I ) = AF( I, J )
-   10    CONTINUE
-         DO 20 I = J + 1, M
+         END DO
+         DO I = J + 1, M
             WORK( ( J-1 )*M+I ) = ZERO
-   20    CONTINUE
-   30 CONTINUE
-      DO 40 J = K + 1, N
+         END DO
+      END DO
+      DO J = K + 1, N
          CALL CCOPY( M, AF( 1, J ), 1, WORK( ( J-1 )*M+1 ), 1 )
-   40 CONTINUE
+      END DO
 *
       CALL CUNMQR( 'Left', 'No transpose', M, N, K, AF, LDA, TAU, WORK,
      $             M, WORK( M*N+1 ), LWORK-M*N, INFO )
 *
-      DO 50 J = 1, N
+      DO J = 1, N
 *
 *        Compare i-th column of QR and jpvt(i)-th column of A
 *
          CALL CAXPY( M, CMPLX( -ONE ), A( 1, JPVT( J ) ), 1,
      $               WORK( ( J-1 )*M+1 ), 1 )
-   50 CONTINUE
+      END DO
 *
       CQPT01 = CLANGE( 'One-norm', M, N, WORK, M, RWORK ) /
      $         ( REAL( MAX( M, N ) )*SLAMCH( 'Epsilon' ) )
diff --git a/lapack-netlib/TESTING/LIN/cqrt02.f b/lapack-netlib/TESTING/LIN/cqrt02.f
index 62f176cd8..db22496cf 100644
--- a/lapack-netlib/TESTING/LIN/cqrt02.f
+++ b/lapack-netlib/TESTING/LIN/cqrt02.f
@@ -27,7 +27,7 @@
 *> \verbatim
 *>
 *> CQRT02 tests CUNGQR, which generates an m-by-n matrix Q with
-*> orthonornmal columns that is defined as the product of k elementary
+*> orthonormal columns that is defined as the product of k elementary
 *> reflectors.
 *>
 *> Given the QR factorization of an m-by-n matrix A, CQRT02 generates
diff --git a/lapack-netlib/TESTING/LIN/cqrt11.f b/lapack-netlib/TESTING/LIN/cqrt11.f
index 494d5e9cd..a52084973 100644
--- a/lapack-netlib/TESTING/LIN/cqrt11.f
+++ b/lapack-netlib/TESTING/LIN/cqrt11.f
@@ -157,9 +157,9 @@
       CALL CUNM2R( 'Left', 'Conjugate transpose', M, M, K, A, LDA, TAU,
      $             WORK, M, WORK( M*M+1 ), INFO )
 *
-      DO 10 J = 1, M
+      DO J = 1, M
          WORK( ( J-1 )*M+J ) = WORK( ( J-1 )*M+J ) - ONE
-   10 CONTINUE
+      END DO
 *
       CQRT11 = CLANGE( 'One-norm', M, M, WORK, M, RDUMMY ) /
      $         ( REAL( M )*SLAMCH( 'Epsilon' ) )
diff --git a/lapack-netlib/TESTING/LIN/cqrt12.f b/lapack-netlib/TESTING/LIN/cqrt12.f
index 4c29423ae..0df2d833b 100644
--- a/lapack-netlib/TESTING/LIN/cqrt12.f
+++ b/lapack-netlib/TESTING/LIN/cqrt12.f
@@ -28,7 +28,7 @@
 *> CQRT12 computes the singular values `svlues' of the upper trapezoid
 *> of A(1:M,1:N) and returns the ratio
 *>
-*>      || s - svlues||/(||svlues||*eps*max(M,N))
+*>      || svlues - s ||/( ||s||*eps*max(M,N) )
 *> \endverbatim
 *
 *  Arguments:
@@ -125,8 +125,8 @@
       EXTERNAL           CLANGE, SASUM, SLAMCH, SNRM2
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CGEBD2, CLASCL, CLASET, SAXPY, SBDSQR, SLABAD,
-     $                   SLASCL, XERBLA
+      EXTERNAL           CGEBD2, CLASCL, CLASET, SAXPY, SBDSQR, SLASCL,
+     $                   XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          CMPLX, MAX, MIN, REAL
@@ -153,17 +153,16 @@
 *     Copy upper triangle of A into work
 *
       CALL CLASET( 'Full', M, N, CMPLX( ZERO ), CMPLX( ZERO ), WORK, M )
-      DO 20 J = 1, N
-         DO 10 I = 1, MIN( J, M )
+      DO J = 1, N
+         DO I = 1, MIN( J, M )
             WORK( ( J-1 )*M+I ) = A( I, J )
-   10    CONTINUE
-   20 CONTINUE
+         END DO
+      END DO
 *
 *     Get machine parameters
 *
       SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
 *
 *     Scale work if max entry outside range [SMLNUM,BIGNUM]
 *
@@ -207,9 +206,9 @@
 *
       ELSE
 *
-         DO 30 I = 1, MN
+         DO I = 1, MN
             RWORK( I ) = ZERO
-   30    CONTINUE
+         END DO
       END IF
 *
 *     Compare s and singular values of work
diff --git a/lapack-netlib/TESTING/LIN/crqt02.f b/lapack-netlib/TESTING/LIN/crqt02.f
index 8625c2f91..81c805dc4 100644
--- a/lapack-netlib/TESTING/LIN/crqt02.f
+++ b/lapack-netlib/TESTING/LIN/crqt02.f
@@ -27,7 +27,7 @@
 *> \verbatim
 *>
 *> CRQT02 tests CUNGRQ, which generates an m-by-n matrix Q with
-*> orthonornmal rows that is defined as the product of k elementary
+*> orthonormal rows that is defined as the product of k elementary
 *> reflectors.
 *>
 *> Given the RQ factorization of an m-by-n matrix A, CRQT02 generates
diff --git a/lapack-netlib/TESTING/LIN/crzt01.f b/lapack-netlib/TESTING/LIN/crzt01.f
index 6a75a8b31..94fdcb4a9 100644
--- a/lapack-netlib/TESTING/LIN/crzt01.f
+++ b/lapack-netlib/TESTING/LIN/crzt01.f
@@ -158,7 +158,7 @@
 *
 *     R = R * P(1) * ... *P(m)
 *
-      CALL CUNMRZ( 'Right', 'No tranpose', M, N, M, N-M, AF, LDA, TAU,
+      CALL CUNMRZ( 'Right', 'No transpose', M, N, M, N-M, AF, LDA, TAU,
      $             WORK, M, WORK( M*N+1 ), LWORK-M*N, INFO )
 *
 *     R = R - A
diff --git a/lapack-netlib/TESTING/LIN/csyt01_3.f b/lapack-netlib/TESTING/LIN/csyt01_3.f
index f0b316cef..879749a10 100644
--- a/lapack-netlib/TESTING/LIN/csyt01_3.f
+++ b/lapack-netlib/TESTING/LIN/csyt01_3.f
@@ -188,7 +188,7 @@
          RETURN
       END IF
 *
-*     a) Revert to multiplyers of L
+*     a) Revert to multipliers of L
 *
       CALL CSYCONVF_ROOK( UPLO, 'R', N, AFAC, LDAFAC, E, IPIV, INFO )
 *
diff --git a/lapack-netlib/TESTING/LIN/dchkaa.F b/lapack-netlib/TESTING/LIN/dchkaa.F
index ef9d7808c..6582cac13 100644
--- a/lapack-netlib/TESTING/LIN/dchkaa.F
+++ b/lapack-netlib/TESTING/LIN/dchkaa.F
@@ -63,6 +63,7 @@
 *> DLQ    8               List types on next line if 0 < NTYPES <  8
 *> DQL    8               List types on next line if 0 < NTYPES <  8
 *> DQP    6               List types on next line if 0 < NTYPES <  6
+*> DQK    19              List types on next line if 0 < NTYPES <  19
 *> DTZ    3               List types on next line if 0 < NTYPES <  3
 *> DLS    6               List types on next line if 0 < NTYPES <  6
 *> DEQ
@@ -149,12 +150,12 @@
      $                   NBVAL( MAXIN ), NBVAL2( MAXIN ),
      $                   NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ),
      $                   RANKVAL( MAXIN ), PIV( NMAX )
-      DOUBLE PRECISION   E( NMAX ), S( 2*NMAX )
 *     ..
 *     .. Allocatable Arrays ..
       INTEGER AllocateStatus
-      DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: RWORK
-      DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK      
+      DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: RWORK, S
+      DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: E
+      DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, LSAMEN
@@ -164,13 +165,13 @@
 *     .. External Subroutines ..
       EXTERNAL           ALAREQ, DCHKEQ, DCHKGB, DCHKGE, DCHKGT, DCHKLQ,
      $                   DCHKORHR_COL, DCHKPB, DCHKPO, DCHKPS, DCHKPP,
-     $                   DCHKPT, DCHKQ3, DCHKQL, DCHKQR, DCHKRQ, DCHKSP,
-     $                   DCHKSY, DCHKSY_ROOK, DCHKSY_RK, DCHKSY_AA,
-     $                   DCHKTB, DCHKTP, DCHKTR, DCHKTZ, DDRVGB, DDRVGE,
-     $                   DDRVGT, DDRVLS, DDRVPB, DDRVPO, DDRVPP, DDRVPT,
-     $                   DDRVSP, DDRVSY, DDRVSY_ROOK, DDRVSY_RK,
-     $                   DDRVSY_AA, ILAVER, DCHKLQTP, DCHKQRT, DCHKQRTP,
-     $                   DCHKLQT,DCHKTSQR
+     $                   DCHKPT, DCHKQ3, DCHKQP3RK, DCHKQL, DCHKQR,
+     $                   DCHKRQ, DCHKSP, DCHKSY, DCHKSY_ROOK, DCHKSY_RK,
+     $                   DCHKSY_AA, DCHKTB, DCHKTP, DCHKTR, DCHKTZ,
+     $                   DDRVGB, DDRVGE, DDRVGT, DDRVLS, DDRVPB, DDRVPO,
+     $                   DDRVPP, DDRVPT, DDRVSP, DDRVSY, DDRVSY_ROOK,
+     $                   DDRVSY_RK, DDRVSY_AA, ILAVER, DCHKLQTP, DCHKQRT,
+     $                   DCHKQRTP, DCHKLQT,DCHKTSQR
 *     ..
 *     .. Scalars in Common ..
       LOGICAL            LERR, OK
@@ -197,6 +198,10 @@
       IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
       ALLOCATE ( WORK( NMAX, 3*NMAX+MAXRHS+30 ), STAT = AllocateStatus )
       IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
+      ALLOCATE ( E( NMAX ), STAT = AllocateStatus )
+      IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
+      ALLOCATE ( S( 2*NMAX ), STAT = AllocateStatus )
+      IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
       ALLOCATE ( RWORK( 5*NMAX+2*MAXRHS ), STAT = AllocateStatus )
       IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
 *
@@ -919,9 +924,26 @@
          CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT )
 *
          IF( TSTCHK ) THEN
-            CALL DCHKQ3( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL, NXVAL,
-     $                   THRESH, A( 1, 1 ), A( 1, 2 ), B( 1, 1 ),
-     $                   B( 1, 3 ), WORK, IWORK, NOUT )
+            CALL DCHKQ3( DOTYPE, NM, MVAL, NN, NVAL, NNB, NBVAL,
+     $                   NXVAL, THRESH, A( 1, 1 ), A( 1, 2 ),
+     $                   B( 1, 1 ), B( 1, 3 ), WORK, IWORK, NOUT )
+         ELSE
+            WRITE( NOUT, FMT = 9989 )PATH
+         END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN
+*
+*        QK: truncated QR factorization with pivoting
+*
+         NTYPES = 19
+         CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT )
+*
+         IF( TSTCHK ) THEN
+            CALL DCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL,
+     $                      NNB, NBVAL, NXVAL, THRESH, A( 1, 1 ),
+     $                      A( 1, 2 ), B( 1, 1 ), B( 1, 2 ),
+     $                      B( 1, 3 ), B( 1, 4 ),
+     $                      WORK, IWORK, NOUT )
          ELSE
             WRITE( NOUT, FMT = 9989 )PATH
          END IF
@@ -1054,6 +1076,8 @@
 *
       DEALLOCATE (A, STAT = AllocateStatus)
       DEALLOCATE (B, STAT = AllocateStatus)
+      DEALLOCATE (E, STAT = AllocateStatus)
+      DEALLOCATE (S, STAT = AllocateStatus)
       DEALLOCATE (WORK, STAT = AllocateStatus)
       DEALLOCATE (RWORK,  STAT = AllocateStatus)
 *
diff --git a/lapack-netlib/TESTING/LIN/dchklq.f b/lapack-netlib/TESTING/LIN/dchklq.f
index 70af41fe0..a207e0056 100644
--- a/lapack-netlib/TESTING/LIN/dchklq.f
+++ b/lapack-netlib/TESTING/LIN/dchklq.f
@@ -235,7 +235,7 @@
       DOUBLE PRECISION   RESULT( NTESTS )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALAERH, ALAHD, ALASUM, DERRLQ, DGELQS, DGET02,
+      EXTERNAL           ALAERH, ALAHD, ALASUM, DERRLQ, DGELS, DGET02,
      $                   DLACPY, DLARHS, DLATB4, DLATMS, DLQT01, DLQT02,
      $                   DLQT03, XLAENV
 *     ..
@@ -373,7 +373,7 @@
      $                               WORK, LWORK, RWORK, RESULT( 3 ) )
                         NT = NT + 4
 *
-*                       If M>=N and K=N, call DGELQS to solve a system
+*                       If M<=N and K=M, call DGELS to solve a system
 *                       with NRHS right hand sides and compute the
 *                       residual.
 *
@@ -390,14 +390,20 @@
 *
                            CALL DLACPY( 'Full', M, NRHS, B, LDA, X,
      $                                  LDA )
-                           SRNAMT = 'DGELQS'
-                           CALL DGELQS( M, N, NRHS, AF, LDA, TAU, X,
-     $                                  LDA, WORK, LWORK, INFO )
 *
-*                          Check error code from DGELQS.
+*                          Reset AF to the original matrix. DGELS
+*                          factors the matrix before solving the system.
+*
+                           CALL DLACPY( 'Full', M, N, A, LDA, AF, LDA )
+*
+                           SRNAMT = 'DGELS'
+                           CALL DGELS( 'No transpose', M, N, NRHS, AF,
+     $                                 LDA, X, LDA, WORK, LWORK, INFO )
+*
+*                          Check error code from DGELS.
 *
                            IF( INFO.NE.0 )
-     $                        CALL ALAERH( PATH, 'DGELQS', INFO, 0, ' ',
+     $                        CALL ALAERH( PATH, 'DGELS', INFO, 0, 'N',
      $                                     M, N, NRHS, -1, NB, IMAT,
      $                                     NFAIL, NERRS, NOUT )
 *
diff --git a/lapack-netlib/TESTING/LIN/dchkq3.f b/lapack-netlib/TESTING/LIN/dchkq3.f
index 1fdf07252..494008fa8 100644
--- a/lapack-netlib/TESTING/LIN/dchkq3.f
+++ b/lapack-netlib/TESTING/LIN/dchkq3.f
@@ -30,7 +30,7 @@
 *>
 *> \verbatim
 *>
-*> DCHKQ3 tests DGEQP3.
+*> DCHKQ3 tests  DGEQP3.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/TESTING/LIN/dchkqp3rk.f b/lapack-netlib/TESTING/LIN/dchkqp3rk.f
new file mode 100644
index 000000000..434d2067e
--- /dev/null
+++ b/lapack-netlib/TESTING/LIN/dchkqp3rk.f
@@ -0,0 +1,832 @@
+*> \brief \b DCHKQP3RK
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE DCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL,
+*     $                      NNB, NBVAL, NXVAL, THRESH, A, COPYA,
+*     $                      B, COPYB, S, TAU,
+*     $                      WORK, IWORK, NOUT )
+*      IMPLICIT NONE
+*
+*  -- LAPACK test routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+*      INTEGER            NM, NN, NNS, NNB, NOUT
+*      DOUBLE PRECISION   THRESH
+*     ..
+*     .. Array Arguments ..
+*      LOGICAL            DOTYPE( * )
+*      INTEGER            IWORK( * ), MVAL( * ), NBVAL( * ), NSVAL( * ),
+*     $                   NVAL( * ), NXVAL( * )
+*      DOUBLE PRECISION   A( * ), COPYA( * ), B( * ), COPYB( * ),
+*     $                   S( * ), TAU( * ), WORK( * )
+*     ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DCHKQP3RK tests DGEQP3RK.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] DOTYPE
+*> \verbatim
+*>          DOTYPE is LOGICAL array, dimension (NTYPES)
+*>          The matrix types to be used for testing.  Matrices of type j
+*>          (for 1 <= j <= NTYPES) are used for testing if DOTYPE(j) =
+*>          .TRUE.; if DOTYPE(j) = .FALSE., then type j is not used.
+*> \endverbatim
+*>
+*> \param[in] NM
+*> \verbatim
+*>          NM is INTEGER
+*>          The number of values of M contained in the vector MVAL.
+*> \endverbatim
+*>
+*> \param[in] MVAL
+*> \verbatim
+*>          MVAL is INTEGER array, dimension (NM)
+*>          The values of the matrix row dimension M.
+*> \endverbatim
+*>
+*> \param[in] NN
+*> \verbatim
+*>          NN is INTEGER
+*>          The number of values of N contained in the vector NVAL.
+*> \endverbatim
+*>
+*> \param[in] NVAL
+*> \verbatim
+*>          NVAL is INTEGER array, dimension (NN)
+*>          The values of the matrix column dimension N.
+*> \endverbatim
+*>
+*> \param[in] NNS
+*> \verbatim
+*>          NNS is INTEGER
+*>          The number of values of NRHS contained in the vector NSVAL.
+*> \endverbatim
+*>
+*> \param[in] NSVAL
+*> \verbatim
+*>          NSVAL is INTEGER array, dimension (NNS)
+*>          The values of the number of right hand sides NRHS.
+*> \endverbatim
+*>
+*> \param[in] NNB
+*> \verbatim
+*>          NNB is INTEGER
+*>          The number of values of NB and NX contained in the
+*>          vectors NBVAL and NXVAL.  The blocking parameters are used
+*>          in pairs (NB,NX).
+*> \endverbatim
+*>
+*> \param[in] NBVAL
+*> \verbatim
+*>          NBVAL is INTEGER array, dimension (NNB)
+*>          The values of the blocksize NB.
+*> \endverbatim
+*>
+*> \param[in] NXVAL
+*> \verbatim
+*>          NXVAL is INTEGER array, dimension (NNB)
+*>          The values of the crossover point NX.
+*> \endverbatim
+*>
+*> \param[in] THRESH
+*> \verbatim
+*>          THRESH is DOUBLE PRECISION
+*>          The threshold value for the test ratios.  A result is
+*>          included in the output file if RESULT >= THRESH.  To have
+*>          every test ratio printed, use THRESH = 0.
+*> \endverbatim
+*>
+*> \param[out] A
+*> \verbatim
+*>          A is DOUBLE PRECISION array, dimension (MMAX*NMAX)
+*>          where MMAX is the maximum value of M in MVAL and NMAX is the
+*>          maximum value of N in NVAL.
+*> \endverbatim
+*>
+*> \param[out] COPYA
+*> \verbatim
+*>          COPYA is DOUBLE PRECISION array, dimension (MMAX*NMAX)
+*> \endverbatim
+*>
+*> \param[out] B
+*> \verbatim
+*>          B is DOUBLE PRECISION array, dimension (MMAX*NSMAX)
+*>          where MMAX is the maximum value of M in MVAL and NSMAX is the
+*>          maximum value of NRHS in NSVAL.
+*> \endverbatim
+*>
+*> \param[out] COPYB
+*> \verbatim
+*>          COPYB is DOUBLE PRECISION array, dimension (MMAX*NSMAX)
+*> \endverbatim
+*>
+*> \param[out] S
+*> \verbatim
+*>          S is DOUBLE PRECISION array, dimension
+*>                      (min(MMAX,NMAX))
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is DOUBLE PRECISION array, dimension (MMAX)
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is DOUBLE PRECISION array, dimension
+*>                      (MMAX*NMAX + 4*NMAX + MMAX)
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (3*NMAX)
+*> \endverbatim
+*>
+*> \param[in] NOUT
+*> \verbatim
+*>          NOUT is INTEGER
+*>          The unit number for output.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup double_lin
+*
+*  =====================================================================
+      SUBROUTINE DCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL,
+     $                      NNB, NBVAL, NXVAL, THRESH, A, COPYA,
+     $                      B, COPYB, S, TAU,
+     $                      WORK, IWORK, NOUT )
+      IMPLICIT NONE
+*
+*  -- LAPACK test routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            NM, NN, NNB, NNS, NOUT
+      DOUBLE PRECISION   THRESH
+*     ..
+*     .. Array Arguments ..
+      LOGICAL            DOTYPE( * )
+      INTEGER            IWORK( * ), NBVAL( * ), MVAL( * ), NVAL( * ),
+     $                   NSVAL( * ), NXVAL( * )
+      DOUBLE PRECISION   A( * ), COPYA( * ), B( * ), COPYB( * ),
+     $                   S( * ), TAU( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            NTYPES
+      PARAMETER          ( NTYPES = 19 )
+      INTEGER            NTESTS
+      PARAMETER          ( NTESTS = 5 )
+      DOUBLE PRECISION   ONE, ZERO, BIGNUM
+      PARAMETER          ( ONE = 1.0D+0, ZERO = 0.0D+0,
+     $                     BIGNUM = 1.0D+38 )
+*     ..
+*     .. Local Scalars ..
+      CHARACTER          DIST, TYPE
+      CHARACTER*3        PATH
+      INTEGER            I, IM, IMAT, IN, INB,
+     $                   IND_OFFSET_GEN,
+     $                   IND_IN, IND_OUT, INS, INFO,
+     $                   J, J_INC, J_FIRST_NZ, JB_ZERO,
+     $                   KFACT, KL, KMAX, KU, LDA, LW, LWORK,
+     $                   LWORK_MQR, M, MINMN, MINMNB_GEN, MODE, N,
+     $                   NB, NB_ZERO, NERRS, NFAIL, NB_GEN, NRHS,
+     $                   NRUN, NX, T
+      DOUBLE PRECISION   ANORM, CNDNUM, EPS, ABSTOL, RELTOL,
+     $                   DTEMP, MAXC2NRMK, RELMAXC2NRMK
+*     ..
+*     .. Local Arrays ..
+      INTEGER            ISEED( 4 ), ISEEDY( 4 )
+      DOUBLE PRECISION   RESULT( NTESTS ), RDUMMY( 1 )
+*     ..
+*     .. External Functions ..
+      DOUBLE PRECISION   DLAMCH, DLANGE, DQPT01, DQRT11,
+     $                   DQRT12
+      EXTERNAL           DLAMCH, DLANGE, DQPT01, DQRT11, DQRT12
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ALAERH, ALAHD, ALASUM, DAXPY, DGEQP3RK,
+     $                   DLACPY, DLAORD, DLASET, DLATB4, DLATMS,
+     $                   DORMQR, DSWAP, ICOPY, XLAENV
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, DBLE, MAX, MIN
+*     ..
+*     .. Scalars in Common ..
+      LOGICAL            LERR, OK
+      CHARACTER*32       SRNAMT
+      INTEGER            INFOT, IOUNIT
+*     ..
+*     .. Common blocks ..
+      COMMON             / INFOC / INFOT, IOUNIT, OK, LERR
+      COMMON             / SRNAMC / SRNAMT
+*     ..
+*     .. Data statements ..
+      DATA               ISEEDY / 1988, 1989, 1990, 1991 /
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize constants and the random number seed.
+*
+      PATH( 1: 1 ) = 'Double precision'
+      PATH( 2: 3 ) = 'QK'
+      NRUN = 0
+      NFAIL = 0
+      NERRS = 0
+      DO I = 1, 4
+         ISEED( I ) = ISEEDY( I )
+      END DO
+      EPS = DLAMCH( 'Epsilon' )
+      INFOT = 0
+*
+      DO IM = 1, NM
+*
+*        Do for each value of M in MVAL.
+*
+         M = MVAL( IM )
+         LDA = MAX( 1, M )
+*
+         DO IN = 1, NN
+*
+*           Do for each value of N in NVAL.
+*
+            N = NVAL( IN )
+            MINMN = MIN( M, N )
+            LWORK = MAX( 1, M*MAX( M, N )+4*MINMN+MAX( M, N ),
+     $                   M*N + 2*MINMN + 4*N )
+*
+            DO INS = 1, NNS
+               NRHS = NSVAL( INS )
+*
+*                 Set up parameters with DLATB4 and generate
+*                 M-by-NRHS B matrix with DLATMS.
+*                 IMAT = 14:
+*                 Random matrix, CNDNUM = 2, NORM = ONE,
+*                 MODE = 3 (geometric distribution of singular values).
+*
+                  CALL DLATB4( PATH, 14, M, NRHS, TYPE, KL, KU, ANORM,
+     $                         MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'DLATMS'
+                  CALL DLATMS( M, NRHS, DIST, ISEED, TYPE, S, MODE,
+     $                         CNDNUM, ANORM, KL, KU, 'No packing',
+     $                         COPYB, LDA, WORK, INFO )
+
+
+*
+*                 Check error code from DLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'DLATMS', INFO, 0, ' ', M,
+     $                            NRHS, -1, -1, -1, 6, NFAIL, NERRS,
+     $                            NOUT )
+                     CYCLE
+                  END IF
+*
+               DO IMAT = 1, NTYPES
+*
+*              Do the tests only if DOTYPE( IMAT ) is true.
+*
+               IF( .NOT.DOTYPE( IMAT ) )
+     $            CYCLE
+*
+*              The type of distribution used to generate the random
+*              eigen-/singular values:
+*              ( 'S' for symmetric distribution ) => UNIFORM( -1, 1 )
+*
+*           Do for each type of NON-SYMMETRIC matrix:                               CNDNUM                     NORM                                     MODE
+*            1. Zero matrix
+*            2. Random, Diagonal, CNDNUM = 2                                        CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            3. Random, Upper triangular, CNDNUM = 2                                CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            4. Random, Lower triangular, CNDNUM = 2                                CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            5. Random, First column is zero, CNDNUM = 2                            CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            6. Random, Last MINMN column is zero, CNDNUM = 2                       CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            7. Random, Last N column is zero, CNDNUM = 2                           CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            8. Random, Middle column in MINMN is zero, CNDNUM = 2                  CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            9. Random, First half of MINMN columns are zero, CNDNUM = 2            CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           10. Random, Last columns are zero starting from MINMN/2+1, CNDNUM = 2   CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           11. Random, Half MINMN columns in the middle are zero starting
+*                  from  MINMN/2-(MINMN/2)/2+1, CNDNUM = 2                          CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           12. Random, Odd columns are ZERO, CNDNUM = 2                            CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           13. Random, Even columns are ZERO, CNDNUM = 2                           CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           14. Random, CNDNUM = 2                                                  CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           15. Random, CNDNUM = sqrt(0.1/EPS)                                      CNDNUM = BADC1 = sqrt(0.1/EPS)  ONE                                      3 ( geometric distribution of singular values )
+*           16. Random, CNDNUM = 0.1/EPS                                            CNDNUM = BADC2 = 0.1/EPS        ONE                                      3 ( geometric distribution of singular values )
+*           17. Random, CNDNUM = 0.1/EPS,                                           CNDNUM = BADC2 = 0.1/EPS        ONE                                      2 ( one small singular value, S(N)=1/CNDNUM )
+*                 one small singular value S(N)=1/CNDNUM
+*           18. Random, CNDNUM = 2, scaled near underflow                           CNDNUM = 2                      SMALL = SAFMIN
+*           19. Random, CNDNUM = 2, scaled near overflow                            CNDNUM = 2                      LARGE = 1.0/( 0.25 * ( SAFMIN / EPS ) )  3 ( geometric distribution of singular values )
+*
+               IF( IMAT.EQ.1 ) THEN
+*
+*                 Matrix 1: Zero matrix
+*
+                  CALL DLASET( 'Full', M, N, ZERO, ZERO, COPYA, LDA )
+                  DO I = 1, MINMN
+                     S( I ) = ZERO
+                  END DO
+*
+               ELSE IF( (IMAT.GE.2 .AND. IMAT.LE.4 )
+     $                  .OR. (IMAT.GE.14 .AND. IMAT.LE.19 ) ) THEN
+*
+*                 Matrices 2-4 and 14-19.
+*
+*                 Set up parameters with DLATB4 and generate a test
+*                 matrix with DLATMS.
+*
+                  CALL DLATB4( PATH, IMAT, M, N, TYPE, KL, KU, ANORM,
+     $                         MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'DLATMS'
+                  CALL DLATMS( M, N, DIST, ISEED, TYPE, S, MODE,
+     $                         CNDNUM, ANORM, KL, KU, 'No packing',
+     $                         COPYA, LDA, WORK, INFO )
+*
+*                 Check error code from DLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'DLATMS', INFO, 0, ' ', M, N,
+     $                            -1, -1, -1, IMAT, NFAIL, NERRS,
+     $                            NOUT )
+                     CYCLE
+                  END IF
+*
+                  CALL DLAORD( 'Decreasing', MINMN, S, 1 )
+*
+               ELSE IF( MINMN.GE.2
+     $                  .AND. IMAT.GE.5 .AND. IMAT.LE.13 ) THEN
+*
+*                 Rectangular matrices 5-13 that contain zero columns,
+*                 only for matrices MINMN >=2.
+*
+*                 JB_ZERO is the column index of ZERO block.
+*                 NB_ZERO is the column block size of ZERO block.
+*                 NB_GEN is the column block size of the
+*                 generated block.
+*                 J_INC is the non-zero column index increment
+*                 for matrix 12 and 13.
+*                 J_FIRST_NZ is the index of the first non-zero
+*                 column.
+*
+                  IF( IMAT.EQ.5 ) THEN
+*
+*                    First column is zero.
+*
+                     JB_ZERO = 1
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.6 ) THEN
+*
+*                    Last column MINMN is zero.
+*
+                     JB_ZERO = MINMN
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.7 ) THEN
+*
+*                    Last column N is zero.
+*
+                     JB_ZERO = N
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.8 ) THEN
+*
+*                    Middle column in MINMN is zero.
+*
+                     JB_ZERO = MINMN / 2 + 1
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.9 ) THEN
+*
+*                    First half of MINMN columns is zero.
+*
+                     JB_ZERO = 1
+                     NB_ZERO = MINMN / 2
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.10 ) THEN
+*
+*                    Last columns are zero columns,
+*                    starting from (MINMN / 2 + 1) column.
+*
+                     JB_ZERO = MINMN / 2 + 1
+                     NB_ZERO = N - JB_ZERO + 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.11 ) THEN
+*
+*                    Half of the columns in the middle of MINMN
+*                    columns is zero, starting from
+*                    MINMN/2 - (MINMN/2)/2 + 1 column.
+*
+                     JB_ZERO = MINMN / 2 - (MINMN / 2) / 2 + 1
+                     NB_ZERO = MINMN / 2
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.12 ) THEN
+*
+*                    Odd-numbered columns are zero.
+*
+                     NB_GEN = N / 2
+                     NB_ZERO = N - NB_GEN
+                     J_INC = 2
+                     J_FIRST_NZ = 2
+*
+                  ELSE IF( IMAT.EQ.13 ) THEN
+*
+*                    Even-numbered columns are zero.
+*
+                     NB_ZERO = N / 2
+                     NB_GEN = N - NB_ZERO
+                     J_INC = 2
+                     J_FIRST_NZ = 1
+*
+                  END IF
+*
+*
+*                 1) Set the first NB_ZERO columns in COPYA(1:M,1:N)
+*                    to zero.
+*
+                  CALL DLASET( 'Full', M, NB_ZERO, ZERO, ZERO,
+     $                         COPYA, LDA )
+*
+*                 2) Generate an M-by-(N-NB_ZERO) matrix with the
+*                    chosen singular value distribution
+*                    in COPYA(1:M,NB_ZERO+1:N).
+*
+                  CALL DLATB4( PATH, IMAT, M, NB_GEN, TYPE, KL, KU,
+     $                         ANORM, MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'DLATMS'
+*
+                  IND_OFFSET_GEN = NB_ZERO * LDA
+*
+                  CALL DLATMS( M, NB_GEN, DIST, ISEED, TYPE, S, MODE,
+     $                        CNDNUM, ANORM, KL, KU, 'No packing',
+     $                        COPYA( IND_OFFSET_GEN + 1 ), LDA,
+     $                        WORK, INFO )
+*
+*                 Check error code from DLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'DLATMS', INFO, 0, ' ', M,
+     $                            NB_GEN, -1, -1, -1, IMAT, NFAIL,
+     $                            NERRS, NOUT )
+                     CYCLE
+                  END IF
+*
+*                 3) Swap the generated columns from the right side
+*                 NB_GEN-size block in COPYA into correct column
+*                 positions.
+*
+                  IF( IMAT.EQ.6
+     $                    .OR. IMAT.EQ.7
+     $                    .OR. IMAT.EQ.8
+     $                    .OR. IMAT.EQ.10
+     $                    .OR. IMAT.EQ.11 ) THEN
+*
+*                    Move by swapping the generated columns
+*                    from the right NB_GEN-size block from
+*                    (NB_ZERO+1:NB_ZERO+JB_ZERO-1)
+*                    into columns (1:JB_ZERO-1).
+*
+                     DO J = 1, JB_ZERO-1, 1
+                        CALL DSWAP( M,
+     $                        COPYA( ( NB_ZERO+J-1)*LDA+1), 1,
+     $                        COPYA( (J-1)*LDA + 1 ), 1 )
+                     END DO
+*
+                  ELSE IF( IMAT.EQ.12 .OR. IMAT.EQ.13 ) THEN
+*
+*                    ( IMAT = 12, Odd-numbered ZERO columns. )
+*                    Swap the generated columns from the right
+*                    NB_GEN-size block into the even zero columns in
+*                    the left NB_ZERO-size block.
+*
+*                    ( IMAT = 13, Even-numbered ZERO columns. )
+*                    Swap the generated columns from the right
+*                    NB_GEN-size block into the odd zero columns in
+*                    the left NB_ZERO-size block.
+*
+                     DO J = 1, NB_GEN, 1
+                        IND_OUT = ( NB_ZERO+J-1 )*LDA + 1
+                        IND_IN = ( J_INC*(J-1)+(J_FIRST_NZ-1) )*LDA
+     $                            + 1
+                        CALL DSWAP( M,
+     $                              COPYA( IND_OUT ), 1,
+     $                              COPYA( IND_IN), 1 )
+                     END DO
+*
+                  END IF
+*
+*                 4) Order the singular values generated by
+*                    DLATMS in decreasing order and add trailing zeros
+*                    that correspond to zero columns.
+*                    The total number of singular values is MINMN.
+*
+                  MINMNB_GEN = MIN( M, NB_GEN )
+                  CALL DLAORD( 'Decreasing', MINMNB_GEN, S, 1 )
+                  DO I = MINMNB_GEN+1, MINMN
+                     S( I ) = ZERO
+                  END DO
+*
+               ELSE
+*
+*                    IF(MINMN.LT.2) skip this size for this matrix type.
+*
+                     CYCLE
+               END IF
+*
+*              Initialize a copy array for a pivot array for DGEQP3RK.
+*
+               DO I = 1, N
+                  IWORK( I ) = 0
+               END DO
+*
+               DO INB = 1, NNB
+*
+*                 Do for each pair of values (NB,NX) in NBVAL and NXVAL.
+*
+                  NB = NBVAL( INB )
+                  CALL XLAENV( 1, NB )
+                  NX = NXVAL( INB )
+                  CALL XLAENV( 3, NX )
+*
+*                 We do MIN(M,N)+1 because we need a test for KMAX > N,
+*                 when KMAX is larger than MIN(M,N), KMAX should be
+*                 KMAX = MIN(M,N)
+*
+                  DO KMAX = 0, MIN(M,N)+1
+*
+*                 Get a working copy of COPYA into A( 1:M,1:N ).
+*                 Get a working copy of COPYB into
+*                 A( 1:M, (N+1):(N+NRHS) ) and into B( 1:M, 1:NRHS ).
+*                 Get a working copy of the zeroed IWORK( 1:N ) into
+*                 IWORK( N+1:2N ), which is used as the pivot array.
+*                 NOTE: IWORK( 2N+1:3N ) is going to be used as a WORK
+*                 array for the routine.
+*
+                  CALL DLACPY( 'All', M, N, COPYA, LDA, A, LDA )
+                  CALL DLACPY( 'All', M, NRHS, COPYB, LDA,
+     $                         A( LDA*N + 1 ),  LDA )
+                  CALL DLACPY( 'All', M, NRHS, COPYB, LDA,
+     $                         B,  LDA )
+                  CALL ICOPY( N, IWORK( 1 ), 1, IWORK( N+1 ), 1 )
+*
+                  ABSTOL = -ONE
+                  RELTOL = -ONE
+*
+*                 Workspace size passed to DGEQP3RK.
+*
+                  LW = MAX( 1, MAX( 2*N + NB*( N+NRHS+1 ),
+     $                              3*N + NRHS - 1 ) )
+*
+*                 Compute DGEQP3RK factorization of A.
+*
+                  SRNAMT = 'DGEQP3RK'
+                  CALL DGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                           A, LDA, KFACT, MAXC2NRMK,
+     $                           RELMAXC2NRMK, IWORK( N+1 ), TAU,
+     $                           WORK, LW, IWORK( 2*N+1 ), INFO )
+*
+*                 Check error code from DGEQP3RK.
+*
+                  IF( INFO.LT.0 )
+     $               CALL ALAERH( PATH, 'DGEQP3RK', INFO, 0, ' ',
+     $                            M, N, NX, -1, NB, IMAT,
+     $                            NFAIL, NERRS, NOUT )
+*
+*                 Compute test 1:
+*
+*                 This test is only for the full rank factorization of
+*                 the matrix A.
+*
+*                 Array S(1:min(M,N)) contains svd(A), the singular
+*                 values of the original matrix A in decreasing absolute
+*                 value order. The test computes svd(R), the vector of
+*                 singular values of the upper trapezoid of A(1:M,1:N)
+*                 that contains the factor R, in decreasing order. The
+*                 test returns the ratio:
+*
+*                 2-norm(svd(R) - svd(A)) / ( max(M,N) * 2-norm(svd(A)) * EPS )
+*
+                  IF( KFACT.EQ.MINMN ) THEN
+*
+                     RESULT( 1 ) = DQRT12( M, N, A, LDA, S, WORK,
+     $                                     LWORK )
+*
+                     DO T = 1, 1
+                        IF( RESULT( T ).GE.THRESH ) THEN
+                           IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                        CALL ALAHD( NOUT, PATH )
+                           WRITE( NOUT, FMT = 9999 ) 'DGEQP3RK', M, N,
+     $                        NRHS, KMAX, ABSTOL, RELTOL, NB, NX,
+     $                        IMAT, T, RESULT( T )
+                           NFAIL = NFAIL + 1
+                        END IF
+                     END DO
+                     NRUN = NRUN + 1
+*
+*                   End test 1
+*
+                  END IF
+*
+*                 Compute test 2:
+*
+*                 The test returns the ratio:
+*
+*                 1-norm( A*P - Q*R ) / ( max(M,N) * 1-norm(A) * EPS )
+*
+                  RESULT( 2 ) = DQPT01( M, N, KFACT, COPYA, A, LDA, TAU,
+     $                          IWORK( N+1 ), WORK, LWORK )
+*
+*                 Compute test 3:
+*
+*                 The test returns the ratio:
+*
+*                 1-norm( Q**T * Q - I ) / ( M * EPS )
+*
+                  RESULT( 3 ) = DQRT11( M, KFACT, A, LDA, TAU, WORK,
+     $                          LWORK )
+*
+*                 Print information about the tests that did not pass
+*                 the threshold.
+*
+                  DO T = 2, 3
+                     IF( RESULT( T ).GE.THRESH ) THEN
+                        IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                     CALL ALAHD( NOUT, PATH )
+                        WRITE( NOUT, FMT = 9999 ) 'DGEQP3RK', M, N,
+     $                      NRHS, KMAX, ABSTOL, RELTOL,
+     $                      NB, NX, IMAT, T, RESULT( T )
+                        NFAIL = NFAIL + 1
+                     END IF
+                  END DO
+                  NRUN = NRUN + 2
+*
+*                 Compute test 4:
+*
+*                 This test is only for factorizations with the rank
+*                 KFACT of at least 2.
+*                 The elements on the diagonal of R should be non-
+*                 increasing in absolute value.
+*
+*                 RESULT( 4 ) is reset to ZERO first, so that a BIGNUM
+*                 left by an earlier factorization is not reported
+*                 again. It is set to BIGNUM (1.0D+38) if
+*                 abs(R(K+1,K+1)) > abs(R(K,K)) for some K=1:KFACT-1.
+*
+                  IF( MIN(KFACT, MINMN).GE.2 ) THEN
+                     RESULT( 4 ) = ZERO
+                     DO J = 1, KFACT-1, 1
+
+                        DTEMP = (( ABS( A( (J-1)*LDA+J ) ) -
+     $                          ABS( A( (J)*LDA+J+1 ) ) ) /
+     $                          ABS( A(1) ) )
+*
+                        IF( DTEMP.LT.ZERO ) THEN
+                           RESULT( 4 ) = BIGNUM
+                        END IF
+*
+                     END DO
+*
+*                    Print information about the tests that did not
+*                    pass the threshold.
+*
+                     DO T = 4, 4
+                        IF( RESULT( T ).GE.THRESH ) THEN
+                           IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                        CALL ALAHD( NOUT, PATH )
+                           WRITE( NOUT, FMT = 9999 ) 'DGEQP3RK',
+     $                        M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                        NB, NX, IMAT, T,
+     $                        RESULT( T )
+                           NFAIL = NFAIL + 1
+                        END IF
+                     END DO
+                     NRUN = NRUN + 1
+*
+*                    End test 4.
+*
+                  END IF
+*
+*                 Compute test 5:
+*
+*                 This test is only for matrix A with min(M,N) > 0.
+*
+*                 The test returns the ratio:
+*
+*                 1-norm( Q**T * B [DORMQR] - Q**T * B [DGEQP3RK] )
+*                       / ( M * EPS )
+*
+*                 (1) Compute B:=Q**T * B in the matrix B.
+*
+                  IF( MINMN.GT.0 ) THEN
+*
+                     LWORK_MQR = MAX(1, NRHS)
+                     CALL DORMQR( 'Left', 'Transpose',
+     $                         M, NRHS, KFACT, A, LDA, TAU, B, LDA,
+     $                         WORK, LWORK_MQR, INFO )
+*
+                     DO I = 1, NRHS
+*
+*                       Compare N+I-th column of A and I-th column of B.
+*
+                        CALL DAXPY( M, -ONE, A( ( N+I-1 )*LDA+1 ), 1,
+     $                                 B( ( I-1 )*LDA+1 ), 1 )
+                     END DO
+*
+                   RESULT( 5 ) =
+     $               ABS(
+     $               DLANGE( 'One-norm', M, NRHS, B, LDA, RDUMMY ) /
+     $               ( DBLE( M )*EPS )
+     $               )
+*
+*                    Print information about the tests that did not pass
+*                    the threshold.
+*
+                     DO T = 5, 5
+                        IF( RESULT( T ).GE.THRESH ) THEN
+                           IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                        CALL ALAHD( NOUT, PATH )
+                           WRITE( NOUT, FMT = 9999 ) 'DGEQP3RK', M, N,
+     $                        NRHS, KMAX, ABSTOL, RELTOL,
+     $                        NB, NX, IMAT, T, RESULT( T )
+                           NFAIL = NFAIL + 1
+                        END IF
+                     END DO
+                     NRUN = NRUN + 1
+*
+*                    End compute test 5.
+*
+                  END IF
+*
+*                 END DO KMAX = 0, MIN(M,N)+1
+*
+                  END DO
+*
+*                 END DO for INB = 1, NNB
+*
+               END DO
+*
+*              END DO  for IMAT = 1, NTYPES
+*
+               END DO
+*
+*              END DO for INS = 1, NNS
+*
+            END DO
+*
+*           END DO for IN = 1, NN
+*
+         END DO
+*
+*        END DO for IM = 1, NM
+*
+      END DO
+*
+*     Print a summary of the results.
+*
+      CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS )
+*
+ 9999 FORMAT( 1X, A, ' M =', I5, ', N =', I5, ', NRHS =', I5,
+     $        ', KMAX =', I5, ', ABSTOL =', G12.5,
+     $        ', RELTOL =', G12.5, ', NB =', I4, ', NX =', I4,
+     $        ', type ', I2, ', test ', I2, ', ratio =', G12.5 )
+*
+*     End of DCHKQP3RK
+*
+      END
diff --git a/lapack-netlib/TESTING/LIN/dchkqr.f b/lapack-netlib/TESTING/LIN/dchkqr.f
index c729e61a9..8188d7a00 100644
--- a/lapack-netlib/TESTING/LIN/dchkqr.f
+++ b/lapack-netlib/TESTING/LIN/dchkqr.f
@@ -244,7 +244,7 @@
       EXTERNAL           DGENND
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALAERH, ALAHD, ALASUM, DERRQR, DGEQRS, DGET02,
+      EXTERNAL           ALAERH, ALAHD, ALASUM, DERRQR, DGELS, DGET02,
      $                   DLACPY, DLARHS, DLATB4, DLATMS, DQRT01,
      $                   DQRT01P, DQRT02, DQRT03, XLAENV
 *     ..
@@ -372,7 +372,7 @@
                          IF( .NOT. DGENND( M, N, AF, LDA ) )
      $                       RESULT( 9 ) = 2*THRESH
                         NT = NT + 1
-                    ELSE IF( M.GE.N ) THEN
+                     ELSE IF( M.GE.N ) THEN
 *
 *                       Test DORGQR, using factorization
 *                       returned by DQRT01
@@ -389,7 +389,7 @@
      $                               WORK, LWORK, RWORK, RESULT( 3 ) )
                         NT = NT + 4
 *
-*                       If M>=N and K=N, call DGEQRS to solve a system
+*                       If M>=N and K=N, call DGELS to solve a system
 *                       with NRHS right hand sides and compute the
 *                       residual.
 *
@@ -406,14 +406,20 @@
 *
                            CALL DLACPY( 'Full', M, NRHS, B, LDA, X,
      $                                  LDA )
-                           SRNAMT = 'DGEQRS'
-                           CALL DGEQRS( M, N, NRHS, AF, LDA, TAU, X,
-     $                                  LDA, WORK, LWORK, INFO )
 *
-*                          Check error code from DGEQRS.
+*                          Reset AF. DGELS overwrites the matrix with
+*                          its factorization.
+*
+                           CALL DLACPY( 'Full', M, N, A, LDA, AF, LDA )
+*
+                           SRNAMT = 'DGELS'
+                           CALL DGELS( 'No transpose', M, N, NRHS, AF,
+     $                                 LDA, X, LDA, WORK, LWORK, INFO )
+*
+*                          Check error code from DGELS.
 *
                            IF( INFO.NE.0 )
-     $                        CALL ALAERH( PATH, 'DGEQRS', INFO, 0, ' ',
+     $                        CALL ALAERH( PATH, 'DGELS', INFO, 0, 'N',
      $                                     M, N, NRHS, -1, NB, IMAT,
      $                                     NFAIL, NERRS, NOUT )
 *
diff --git a/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f b/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f
index bc4e77a5a..1940351a4 100644
--- a/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f
+++ b/lapack-netlib/TESTING/LIN/dchksy_aa_2stage.f
@@ -421,9 +421,9 @@
 *                 block factorization, LWORK is the length of AINV.
 *
                   SRNAMT = 'DSYTRF_AA_2STAGE'
-                  LWORK = MIN(N*NB, 3*NMAX*NMAX)
+                  LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX )
                   CALL DSYTRF_AA_2STAGE( UPLO, N, AFAC, LDA, 
-     $                                   AINV, (3*NB+1)*N, 
+     $                                   AINV, MAX( 1, (3*NB+1)*N ),
      $                                   IWORK, IWORK( 1+N ),
      $                                   WORK, LWORK,
      $                                   INFO )
@@ -503,7 +503,6 @@ c                  NT = 1
                      CALL DLACPY( 'Full', N, NRHS, B, LDA, X, LDA )
 *
                      SRNAMT = 'DSYTRS_AA_2STAGE'
-                     LWORK = MAX( 1, 3*N-2 )
                      CALL DSYTRS_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA,
      $                            AINV, (3*NB+1)*N, IWORK, IWORK( 1+N ),
      $                            X, LDA, INFO )
diff --git a/lapack-netlib/TESTING/LIN/dchktp.f b/lapack-netlib/TESTING/LIN/dchktp.f
index 9af6150ca..6db5b1376 100644
--- a/lapack-netlib/TESTING/LIN/dchktp.f
+++ b/lapack-netlib/TESTING/LIN/dchktp.f
@@ -86,7 +86,7 @@
 *> \verbatim
 *>          NMAX is INTEGER
 *>          The leading dimension of the work arrays.  NMAX >= the
-*>          maximumm value of N in NVAL.
+*>          maximum value of N in NVAL.
 *> \endverbatim
 *>
 *> \param[out] AP
diff --git a/lapack-netlib/TESTING/LIN/ddrvab.f b/lapack-netlib/TESTING/LIN/ddrvab.f
index 9110d8334..5fd32b951 100644
--- a/lapack-netlib/TESTING/LIN/ddrvab.f
+++ b/lapack-netlib/TESTING/LIN/ddrvab.f
@@ -346,7 +346,7 @@
                CALL DGET08( TRANS, N, N, NRHS, A, LDA, X, LDA, WORK,
      $                      LDA, RWORK, RESULT( 1 ) )
 *
-*              Check if the test passes the tesing.
+*              Check if the test passes the testing.
 *              Print information about the tests that did not
 *              pass the testing.
 *
diff --git a/lapack-netlib/TESTING/LIN/ddrvac.f b/lapack-netlib/TESTING/LIN/ddrvac.f
index bd463cee4..3ecbc6a23 100644
--- a/lapack-netlib/TESTING/LIN/ddrvac.f
+++ b/lapack-netlib/TESTING/LIN/ddrvac.f
@@ -365,7 +365,7 @@
                   CALL DPOT06( UPLO, N, NRHS, A, LDA, X, LDA, WORK,
      $               LDA, RWORK, RESULT( 1 ) )
 *
-*                 Check if the test passes the tesing.
+*                 Check if the test passes the testing.
 *                 Print information about the tests that did not
 *                 pass the testing.
 *
diff --git a/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f b/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f
index 91c9e8e9a..d04106ae3 100644
--- a/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f
+++ b/lapack-netlib/TESTING/LIN/ddrvsy_aa_2stage.f
@@ -400,9 +400,9 @@
 *                    Factor the matrix and solve the system using DSYSV_AA.
 *
                      SRNAMT = 'DSYSV_AA_2STAGE '
-                     LWORK = MIN(N*NB, 3*NMAX*NMAX)
+                     LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX )
                      CALL DSYSV_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA,
-     $                                 AINV, (3*NB+1)*N, 
+     $                                 AINV, MAX( 1, (3*NB+1)*N ),
      $                                 IWORK, IWORK( 1+N ),
      $                                 X, LDA, WORK, LWORK, INFO )
 *
diff --git a/lapack-netlib/TESTING/LIN/derrlq.f b/lapack-netlib/TESTING/LIN/derrlq.f
index d3cfcddd0..76ff4709e 100644
--- a/lapack-netlib/TESTING/LIN/derrlq.f
+++ b/lapack-netlib/TESTING/LIN/derrlq.f
@@ -76,7 +76,7 @@
      $                   W( NMAX ), X( NMAX )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALAESM, CHKXER, DGELQ2, DGELQF, DGELQS, DORGL2,
+      EXTERNAL           ALAESM, CHKXER, DGELQ2, DGELQF, DORGL2,
      $                   DORGLQ, DORML2, DORMLQ
 *     ..
 *     .. Scalars in Common ..
@@ -140,31 +140,6 @@
       CALL DGELQ2( 2, 1, A, 1, B, W, INFO )
       CALL CHKXER( 'DGELQ2', INFOT, NOUT, LERR, OK )
 *
-*     DGELQS
-*
-      SRNAMT = 'DGELQS'
-      INFOT = 1
-      CALL DGELQS( -1, 0, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'DGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 2
-      CALL DGELQS( 0, -1, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'DGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 2
-      CALL DGELQS( 2, 1, 0, A, 2, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'DGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 3
-      CALL DGELQS( 0, 0, -1, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'DGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 5
-      CALL DGELQS( 2, 2, 0, A, 1, X, B, 2, W, 1, INFO )
-      CALL CHKXER( 'DGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 8
-      CALL DGELQS( 1, 2, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'DGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 10
-      CALL DGELQS( 1, 1, 2, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'DGELQS', INFOT, NOUT, LERR, OK )
-*
 *     DORGLQ
 *
       SRNAMT = 'DORGLQ'
diff --git a/lapack-netlib/TESTING/LIN/derrqr.f b/lapack-netlib/TESTING/LIN/derrqr.f
index 03155b133..f7e850b80 100644
--- a/lapack-netlib/TESTING/LIN/derrqr.f
+++ b/lapack-netlib/TESTING/LIN/derrqr.f
@@ -77,7 +77,7 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           ALAESM, CHKXER, DGEQR2, DGEQR2P, DGEQRF,
-     $                   DGEQRFP, DGEQRS, DORG2R, DORGQR, DORM2R,
+     $                   DGEQRFP, DORG2R, DORGQR, DORM2R,
      $                   DORMQR
 *     ..
 *     .. Scalars in Common ..
@@ -170,31 +170,6 @@
       CALL DGEQR2P( 2, 1, A, 1, B, W, INFO )
       CALL CHKXER( 'DGEQR2P', INFOT, NOUT, LERR, OK )
 *
-*     DGEQRS
-*
-      SRNAMT = 'DGEQRS'
-      INFOT = 1
-      CALL DGEQRS( -1, 0, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'DGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 2
-      CALL DGEQRS( 0, -1, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'DGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 2
-      CALL DGEQRS( 1, 2, 0, A, 2, X, B, 2, W, 1, INFO )
-      CALL CHKXER( 'DGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 3
-      CALL DGEQRS( 0, 0, -1, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'DGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 5
-      CALL DGEQRS( 2, 1, 0, A, 1, X, B, 2, W, 1, INFO )
-      CALL CHKXER( 'DGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 8
-      CALL DGEQRS( 2, 1, 0, A, 2, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'DGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 10
-      CALL DGEQRS( 1, 1, 2, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'DGEQRS', INFOT, NOUT, LERR, OK )
-*
 *     DORGQR
 *
       SRNAMT = 'DORGQR'
diff --git a/lapack-netlib/TESTING/LIN/derrsy.f b/lapack-netlib/TESTING/LIN/derrsy.f
index af15a4b8e..eb08d4c7b 100644
--- a/lapack-netlib/TESTING/LIN/derrsy.f
+++ b/lapack-netlib/TESTING/LIN/derrsy.f
@@ -133,7 +133,7 @@
       IF( LSAMEN( 2, C2, 'SY' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a symmetric indefinite matrix with patrial
+*        of a symmetric indefinite matrix with partial
 *        (Bunch-Kaufman) pivoting.
 *
 *        DSYTRF
@@ -581,7 +581,7 @@
       ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a symmetric indefinite packed matrix with patrial
+*        of a symmetric indefinite packed matrix with partial
 *        (Bunch-Kaufman) pivoting.
 *
 *        DSPTRF
diff --git a/lapack-netlib/TESTING/LIN/derrsyx.f b/lapack-netlib/TESTING/LIN/derrsyx.f
index c2d14caab..495302158 100644
--- a/lapack-netlib/TESTING/LIN/derrsyx.f
+++ b/lapack-netlib/TESTING/LIN/derrsyx.f
@@ -138,7 +138,7 @@
       IF( LSAMEN( 2, C2, 'SY' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a symmetric indefinite matrix with patrial
+*        of a symmetric indefinite matrix with partial
 *        (Bunch-Kaufman) pivoting.
 *
 *        DSYTRF
@@ -528,7 +528,7 @@
       ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a symmetric indefinite packed matrix with patrial
+*        of a symmetric indefinite packed matrix with partial
 *        (Bunch-Kaufman) pivoting.
 *
 *        DSPTRF
diff --git a/lapack-netlib/TESTING/LIN/derrtr.f b/lapack-netlib/TESTING/LIN/derrtr.f
index d0580497d..878d9070c 100644
--- a/lapack-netlib/TESTING/LIN/derrtr.f
+++ b/lapack-netlib/TESTING/LIN/derrtr.f
@@ -71,7 +71,7 @@
 *     .. Local Scalars ..
       CHARACTER*2        C2
       INTEGER            INFO
-      DOUBLE PRECISION   RCOND, SCALE
+      DOUBLE PRECISION   RCOND, SCALE, SCALES(0)
 *     ..
 *     .. Local Arrays ..
       INTEGER            IW( NMAX )
@@ -250,40 +250,40 @@
 *
          SRNAMT = 'DLATRS3'
          INFOT = 1
-         CALL DLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W,
-     $                 W( 2 ), 1, INFO )
+         CALL DLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALES,
+     $                 W, W( 2 ), 1, INFO )
          CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 2
-         CALL DLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W,
-     $                 W( 2 ), 1, INFO )
+         CALL DLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALES,
+     $                 W, W( 2 ), 1, INFO )
          CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 3
-         CALL DLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, W,
-     $                 W( 2 ), 1, INFO )
+         CALL DLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALES,
+     $                 W, W( 2 ), 1, INFO )
          CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 4
-         CALL DLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, W,
-     $                 W( 2 ), 1, INFO )
+         CALL DLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALES,
+     $                 W, W( 2 ), 1, INFO )
          CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 5
-         CALL DLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, W,
-     $                 W( 2 ), 1, INFO )
+         CALL DLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALES,
+     $                 W, W( 2 ), 1, INFO )
          CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 6
-         CALL DLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, W,
-     $                 W( 2 ), 1, INFO )
+         CALL DLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALES,
+     $                 W, W( 2 ), 1, INFO )
          CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 8
-         CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, W,
-     $                 W( 2 ), 1, INFO )
+         CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALES,
+     $                 W, W( 2 ), 1, INFO )
          CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 10
-         CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, W,
-     $                 W( 2 ), 1, INFO )
+         CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALES,
+     $                 W, W( 2 ), 1, INFO )
          CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 14
-         CALL DLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, W,
-     $                 W( 2 ), 0, INFO )
+         CALL DLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALES,
+     $                 W, W( 2 ), 0, INFO )
          CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK )
 *
       ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN
diff --git a/lapack-netlib/TESTING/LIN/derrvx.f b/lapack-netlib/TESTING/LIN/derrvx.f
index 447b00bc4..f2d29f7a3 100644
--- a/lapack-netlib/TESTING/LIN/derrvx.f
+++ b/lapack-netlib/TESTING/LIN/derrvx.f
@@ -699,21 +699,27 @@
 *
       ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN
 *
-*        DSYSV_AA
-*
-        SRNAMT = 'DSYSV_AA'
-        INFOT = 1
-        CALL DSYSV_AA( '/', 0, 0, A, 1, IP, B, 1, W, 1, INFO )
-        CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK )
-        INFOT = 2
-        CALL DSYSV_AA( 'U', -1, 0, A, 1, IP, B, 1, W, 1, INFO )
-        CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK )
-        INFOT = 3
-        CALL DSYSV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO )
-        CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK )
-        INFOT = 8
-        CALL DSYSV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO )
-        CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK )
+*        DSYSV_AA
+*
+         SRNAMT = 'DSYSV_AA'
+         INFOT = 1
+         CALL DSYSV_AA( '/', 0, 0, A, 1, IP, B, 1, W, 1, INFO )
+         CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL DSYSV_AA( 'U', -1, 0, A, 1, IP, B, 1, W, 1, INFO )
+         CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 3
+         CALL DSYSV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO )
+         CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL DSYSV_AA( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO )
+         CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 8
+         CALL DSYSV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO )
+         CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 10
+         CALL DSYSV_AA( 'U', 3, 1, A, 3, IP, B, 3, W, 6, INFO )
+         CALL CHKXER( 'DSYSV_AA', INFOT, NOUT, LERR, OK )
 *
       ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN
 *
@@ -734,14 +740,18 @@
          CALL CHKXER( 'DSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 5
          CALL DSYSV_AA_2STAGE( 'U', 2, 1, A, 1, A, 1, IP, IP, B, 1,
+     $                         W, 1, INFO )
+         CALL CHKXER( 'DSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
+         INFOT = 7
+         CALL DSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2,
      $                         W, 1, INFO )
          CALL CHKXER( 'DSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 11
          CALL DSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1,
      $                         W, 1, INFO )
          CALL CHKXER( 'DSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
-         INFOT = 7
-         CALL DSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2,
+         INFOT = 13
+         CALL DSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 2,
      $                         W, 1, INFO )
          CALL CHKXER( 'DSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
 *
diff --git a/lapack-netlib/TESTING/LIN/dgtt01.f b/lapack-netlib/TESTING/LIN/dgtt01.f
index 3b5ff9e4c..6fd754db2 100644
--- a/lapack-netlib/TESTING/LIN/dgtt01.f
+++ b/lapack-netlib/TESTING/LIN/dgtt01.f
@@ -39,7 +39,7 @@
 *
 *> \param[in] N
 *> \verbatim
-*>          N is INTEGTER
+*>          N is INTEGER
 *>          The order of the matrix A.  N >= 0.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/LIN/dgtt02.f b/lapack-netlib/TESTING/LIN/dgtt02.f
index b3268b138..4fecff86e 100644
--- a/lapack-netlib/TESTING/LIN/dgtt02.f
+++ b/lapack-netlib/TESTING/LIN/dgtt02.f
@@ -41,14 +41,14 @@
 *> \verbatim
 *>          TRANS is CHARACTER
 *>          Specifies the form of the residual.
-*>          = 'N':  B - A * X  (No transpose)
+*>          = 'N':  B - A    * X  (No transpose)
 *>          = 'T':  B - A**T * X  (Transpose)
 *>          = 'C':  B - A**H * X  (Conjugate transpose = Transpose)
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
-*>          N is INTEGTER
+*>          N is INTEGER
 *>          The order of the matrix A.  N >= 0.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/LIN/dlatb4.f b/lapack-netlib/TESTING/LIN/dlatb4.f
index 8825d13e7..f3bccd45b 100644
--- a/lapack-netlib/TESTING/LIN/dlatb4.f
+++ b/lapack-netlib/TESTING/LIN/dlatb4.f
@@ -133,7 +133,7 @@
 *
 *     .. Parameters ..
       DOUBLE PRECISION   SHRINK, TENTH
-      PARAMETER          ( SHRINK = 0.25D0, TENTH = 0.1D+0 )
+      PARAMETER          ( SHRINK = 0.25D+0, TENTH = 0.1D+0 )
       DOUBLE PRECISION   ONE
       PARAMETER          ( ONE = 1.0D+0 )
       DOUBLE PRECISION   TWO
@@ -153,9 +153,6 @@
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, SQRT
 *     ..
-*     .. External Subroutines ..
-      EXTERNAL           DLABAD
-*     ..
 *     .. Save statement ..
       SAVE               EPS, SMALL, LARGE, BADC1, BADC2, FIRST
 *     ..
@@ -173,11 +170,6 @@
          BADC1 = SQRT( BADC2 )
          SMALL = DLAMCH( 'Safe minimum' )
          LARGE = ONE / SMALL
-*
-*        If it looks like we're on a Cray, take the square root of
-*        SMALL and LARGE to avoid overflow and underflow problems.
-*
-         CALL DLABAD( SMALL, LARGE )
          SMALL = SHRINK*( SMALL / EPS )
          LARGE = ONE / SMALL
       END IF
@@ -232,6 +224,110 @@
          ELSE
             ANORM = ONE
          END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN
+*
+*        xQK: truncated QR with pivoting.
+*             Set parameters to generate a general
+*             M x N matrix.
+*
+*        Set TYPE, the type of matrix to be generated.  'N' is nonsymmetric.
+*
+         TYPE = 'N'
+*
+*        Set DIST, the type of distribution for the random
+*        number generator. 'S' is symmetric uniform on ( -1, 1 ).
+*
+         DIST = 'S'
+*
+*        Set the lower and upper bandwidths.
+*
+         IF( IMAT.EQ.2 ) THEN
+*
+*           2. Random, Diagonal, CNDNUM = 2
+*
+            KL = 0
+            KU = 0
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE IF( IMAT.EQ.3 ) THEN
+*
+*           3. Random, Upper triangular,  CNDNUM = 2
+*
+            KL = 0
+            KU = MAX( N-1, 0 )
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE IF( IMAT.EQ.4 ) THEN
+*
+*          4. Random, Lower triangular,  CNDNUM = 2
+*
+            KL = MAX( M-1, 0 )
+            KU = 0
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE
+*
+*           5.-19. Rectangular matrix
+*
+            KL = MAX( M-1, 0 )
+            KU = MAX( N-1, 0 )
+*
+            IF( IMAT.GE.5 .AND. IMAT.LE.14 ) THEN
+*
+*              5.-14. Random, CNDNUM = 2.
+*
+               CNDNUM = TWO
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.15 ) THEN
+*
+*              15. Random, CNDNUM = sqrt(0.1/EPS)
+*
+               CNDNUM = BADC1
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.16 ) THEN
+*
+*              16. Random, CNDNUM = 0.1/EPS
+*
+               CNDNUM = BADC2
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.17 ) THEN
+*
+*              17. Random, CNDNUM = 0.1/EPS,
+*                  one small singular value S(N)=1/CNDNUM
+*
+               CNDNUM = BADC2
+               ANORM = ONE
+               MODE = 2
+*
+            ELSE IF( IMAT.EQ.18 ) THEN
+*
+*              18. Random, scaled near underflow
+*
+               CNDNUM = TWO
+               ANORM = SMALL
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.19 ) THEN
+*
+*              19. Random, scaled near overflow
+*
+               CNDNUM = TWO
+               ANORM = LARGE
+               MODE = 3
+*
+            END IF
+*
+         END IF
 *
       ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN
 *
@@ -518,17 +614,18 @@
 *
 *        Set the norm and condition number.
 *
-         IF( IMAT.EQ.2 .OR. IMAT.EQ.8 ) THEN
+         MAT = ABS( IMAT )
+         IF( MAT.EQ.2 .OR. MAT.EQ.8 ) THEN
             CNDNUM = BADC1
-         ELSE IF( IMAT.EQ.3 .OR. IMAT.EQ.9 ) THEN
+         ELSE IF( MAT.EQ.3 .OR. MAT.EQ.9 ) THEN
             CNDNUM = BADC2
          ELSE
             CNDNUM = TWO
          END IF
 *
-         IF( IMAT.EQ.4 ) THEN
+         IF( MAT.EQ.4 ) THEN
             ANORM = SMALL
-         ELSE IF( IMAT.EQ.5 ) THEN
+         ELSE IF( MAT.EQ.5 ) THEN
             ANORM = LARGE
          ELSE
             ANORM = ONE
diff --git a/lapack-netlib/TESTING/LIN/dlqt02.f b/lapack-netlib/TESTING/LIN/dlqt02.f
index 158cd0b79..672d82bbc 100644
--- a/lapack-netlib/TESTING/LIN/dlqt02.f
+++ b/lapack-netlib/TESTING/LIN/dlqt02.f
@@ -27,7 +27,7 @@
 *> \verbatim
 *>
 *> DLQT02 tests DORGLQ, which generates an m-by-n matrix Q with
-*> orthonornmal rows that is defined as the product of k elementary
+*> orthonormal rows that is defined as the product of k elementary
 *> reflectors.
 *>
 *> Given the LQ factorization of an m-by-n matrix A, DLQT02 generates
diff --git a/lapack-netlib/TESTING/LIN/dptt01.f b/lapack-netlib/TESTING/LIN/dptt01.f
index 2c6c440ff..9e9c6bd85 100644
--- a/lapack-netlib/TESTING/LIN/dptt01.f
+++ b/lapack-netlib/TESTING/LIN/dptt01.f
@@ -35,7 +35,7 @@
 *
 *> \param[in] N
 *> \verbatim
-*>          N is INTEGTER
+*>          N is INTEGER
 *>          The order of the matrix A.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/LIN/dptt02.f b/lapack-netlib/TESTING/LIN/dptt02.f
index a4802c696..e045b8779 100644
--- a/lapack-netlib/TESTING/LIN/dptt02.f
+++ b/lapack-netlib/TESTING/LIN/dptt02.f
@@ -35,7 +35,7 @@
 *
 *> \param[in] N
 *> \verbatim
-*>          N is INTEGTER
+*>          N is INTEGER
 *>          The order of the matrix A.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/LIN/dqlt02.f b/lapack-netlib/TESTING/LIN/dqlt02.f
index 950cfe67b..7799e0858 100644
--- a/lapack-netlib/TESTING/LIN/dqlt02.f
+++ b/lapack-netlib/TESTING/LIN/dqlt02.f
@@ -27,7 +27,7 @@
 *> \verbatim
 *>
 *> DQLT02 tests DORGQL, which generates an m-by-n matrix Q with
-*> orthonornmal columns that is defined as the product of k elementary
+*> orthonormal columns that is defined as the product of k elementary
 *> reflectors.
 *>
 *> Given the QL factorization of an m-by-n matrix A, DQLT02 generates
diff --git a/lapack-netlib/TESTING/LIN/dqpt01.f b/lapack-netlib/TESTING/LIN/dqpt01.f
index 8efbdc774..af3f5dd36 100644
--- a/lapack-netlib/TESTING/LIN/dqpt01.f
+++ b/lapack-netlib/TESTING/LIN/dqpt01.f
@@ -28,12 +28,13 @@
 *>
 *> DQPT01 tests the QR-factorization with pivoting of a matrix A.  The
 *> array AF contains the (possibly partial) QR-factorization of A, where
-*> the upper triangle of AF(1:k,1:k) is a partial triangular factor,
-*> the entries below the diagonal in the first k columns are the
+*> the upper triangle of AF(1:K,1:K) is a partial triangular factor,
+*> the entries below the diagonal in the first K columns are the
 *> Householder vectors, and the rest of AF contains a partially updated
 *> matrix.
 *>
-*> This function returns ||A*P - Q*R||/(||norm(A)||*eps*M)
+*> This function returns ||A*P - Q*R|| / ( ||norm(A)||*eps*max(M,N) ),
+*> where || . || is matrix one norm.
 *> \endverbatim
 *
 *  Arguments:
@@ -172,28 +173,41 @@
 *
       NORMA = DLANGE( 'One-norm', M, N, A, LDA, RWORK )
 *
-      DO 30 J = 1, K
-         DO 10 I = 1, MIN( J, M )
+      DO J = 1, K
+*
+*        Copy the upper triangular part of the factor R stored
+*        in AF(1:K,1:K) into the work array WORK.
+*
+         DO I = 1, MIN( J, M )
             WORK( ( J-1 )*M+I ) = AF( I, J )
-   10    CONTINUE
-         DO 20 I = J + 1, M
+         END DO
+*
+*        Zero out the elements below the diagonal in the work array.
+*
+         DO I = J + 1, M
             WORK( ( J-1 )*M+I ) = ZERO
-   20    CONTINUE
-   30 CONTINUE
-      DO 40 J = K + 1, N
+         END DO
+      END DO
+*
+*     Copy columns K+1:N from AF into the work array WORK.
+*     AF(1:K,K+1:N) contains the rectangular block of the upper trapezoidal
+*     factor R, AF(K+1:M,K+1:N) contains the partially updated residual
+*     matrix of R.
+*
+      DO J = K + 1, N
          CALL DCOPY( M, AF( 1, J ), 1, WORK( ( J-1 )*M+1 ), 1 )
-   40 CONTINUE
+      END DO
 *
       CALL DORMQR( 'Left', 'No transpose', M, N, K, AF, LDA, TAU, WORK,
      $             M, WORK( M*N+1 ), LWORK-M*N, INFO )
 *
-      DO 50 J = 1, N
+      DO J = 1, N
 *
-*        Compare i-th column of QR and jpvt(i)-th column of A
+*        Compare J-th column of QR and JPVT(J)-th column of A.
 *
          CALL DAXPY( M, -ONE, A( 1, JPVT( J ) ), 1, WORK( ( J-1 )*M+1 ),
      $               1 )
-   50 CONTINUE
+      END DO
 *
       DQPT01 = DLANGE( 'One-norm', M, N, WORK, M, RWORK ) /
      $         ( DBLE( MAX( M, N ) )*DLAMCH( 'Epsilon' ) )
diff --git a/lapack-netlib/TESTING/LIN/dqrt02.f b/lapack-netlib/TESTING/LIN/dqrt02.f
index d0e4349d1..d41c7b324 100644
--- a/lapack-netlib/TESTING/LIN/dqrt02.f
+++ b/lapack-netlib/TESTING/LIN/dqrt02.f
@@ -27,7 +27,7 @@
 *> \verbatim
 *>
 *> DQRT02 tests DORGQR, which generates an m-by-n matrix Q with
-*> orthonornmal columns that is defined as the product of k elementary
+*> orthonormal columns that is defined as the product of k elementary
 *> reflectors.
 *>
 *> Given the QR factorization of an m-by-n matrix A, DQRT02 generates
diff --git a/lapack-netlib/TESTING/LIN/dqrt11.f b/lapack-netlib/TESTING/LIN/dqrt11.f
index 33c7fab37..38bbeb822 100644
--- a/lapack-netlib/TESTING/LIN/dqrt11.f
+++ b/lapack-netlib/TESTING/LIN/dqrt11.f
@@ -157,9 +157,9 @@
       CALL DORM2R( 'Left', 'Transpose', M, M, K, A, LDA, TAU, WORK, M,
      $             WORK( M*M+1 ), INFO )
 *
-      DO 10 J = 1, M
+      DO J = 1, M
          WORK( ( J-1 )*M+J ) = WORK( ( J-1 )*M+J ) - ONE
-   10 CONTINUE
+      END DO
 *
       DQRT11 = DLANGE( 'One-norm', M, M, WORK, M, RDUMMY ) /
      $         ( DBLE( M )*DLAMCH( 'Epsilon' ) )
diff --git a/lapack-netlib/TESTING/LIN/dqrt12.f b/lapack-netlib/TESTING/LIN/dqrt12.f
index 278e01bf0..b8a124c59 100644
--- a/lapack-netlib/TESTING/LIN/dqrt12.f
+++ b/lapack-netlib/TESTING/LIN/dqrt12.f
@@ -26,7 +26,7 @@
 *> DQRT12 computes the singular values `svlues' of the upper trapezoid
 *> of A(1:M,1:N) and returns the ratio
 *>
-*>      || s - svlues||/(||svlues||*eps*max(M,N))
+*>      || svlues - s ||/(||s||*eps*max(M,N))
 *> \endverbatim
 *
 *  Arguments:
@@ -113,8 +113,7 @@
       EXTERNAL           DASUM, DLAMCH, DLANGE, DNRM2
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DAXPY, DBDSQR, DGEBD2, DLABAD, DLASCL, DLASET,
-     $                   XERBLA
+      EXTERNAL           DAXPY, DBDSQR, DGEBD2, DLASCL, DLASET, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          DBLE, MAX, MIN
@@ -145,17 +144,16 @@
 *     Copy upper triangle of A into work
 *
       CALL DLASET( 'Full', M, N, ZERO, ZERO, WORK, M )
-      DO 20 J = 1, N
-         DO 10 I = 1, MIN( J, M )
+      DO J = 1, N
+         DO I = 1, MIN( J, M )
             WORK( ( J-1 )*M+I ) = A( I, J )
-   10    CONTINUE
-   20 CONTINUE
+         END DO
+      END DO
 *
 *     Get machine parameters
 *
       SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' )
       BIGNUM = ONE / SMLNUM
-      CALL DLABAD( SMLNUM, BIGNUM )
 *
 *     Scale work if max entry outside range [SMLNUM,BIGNUM]
 *
@@ -199,16 +197,18 @@
 *
       ELSE
 *
-         DO 30 I = 1, MN
+         DO I = 1, MN
             WORK( M*N+I ) = ZERO
-   30    CONTINUE
+         END DO
       END IF
 *
 *     Compare s and singular values of work
 *
       CALL DAXPY( MN, -ONE, S, 1, WORK( M*N+1 ), 1 )
+*
       DQRT12 = DASUM( MN, WORK( M*N+1 ), 1 ) /
-     $         ( DLAMCH( 'Epsilon' )*DBLE( MAX( M, N ) ) )
+     $  ( DLAMCH('Epsilon') * DBLE( MAX( M, N ) ) )
+*
       IF( NRMSVL.NE.ZERO )
      $   DQRT12 = DQRT12 / NRMSVL
 *
diff --git a/lapack-netlib/TESTING/LIN/drqt02.f b/lapack-netlib/TESTING/LIN/drqt02.f
index 7fbb6a6d4..faf639109 100644
--- a/lapack-netlib/TESTING/LIN/drqt02.f
+++ b/lapack-netlib/TESTING/LIN/drqt02.f
@@ -27,7 +27,7 @@
 *> \verbatim
 *>
 *> DRQT02 tests DORGRQ, which generates an m-by-n matrix Q with
-*> orthonornmal rows that is defined as the product of k elementary
+*> orthonormal rows that is defined as the product of k elementary
 *> reflectors.
 *>
 *> Given the RQ factorization of an m-by-n matrix A, DRQT02 generates
diff --git a/lapack-netlib/TESTING/LIN/drzt01.f b/lapack-netlib/TESTING/LIN/drzt01.f
index 8e969aba7..7a88a2c20 100644
--- a/lapack-netlib/TESTING/LIN/drzt01.f
+++ b/lapack-netlib/TESTING/LIN/drzt01.f
@@ -158,7 +158,7 @@
 *
 *     R = R * P(1) * ... *P(m)
 *
-      CALL DORMRZ( 'Right', 'No tranpose', M, N, M, N-M, AF, LDA, TAU,
+      CALL DORMRZ( 'Right', 'No transpose', M, N, M, N-M, AF, LDA, TAU,
      $             WORK, M, WORK( M*N+1 ), LWORK-M*N, INFO )
 *
 *     R = R - A
diff --git a/lapack-netlib/TESTING/LIN/dsyt01_3.f b/lapack-netlib/TESTING/LIN/dsyt01_3.f
index 2a7d9d142..060a9caed 100644
--- a/lapack-netlib/TESTING/LIN/dsyt01_3.f
+++ b/lapack-netlib/TESTING/LIN/dsyt01_3.f
@@ -183,7 +183,7 @@
          RETURN
       END IF
 *
-*     a) Revert to multiplyers of L
+*     a) Revert to multipliers of L
 *
       CALL DSYCONVF_ROOK( UPLO, 'R', N, AFAC, LDAFAC, E, IPIV, INFO )
 *
diff --git a/lapack-netlib/TESTING/LIN/schkaa.F b/lapack-netlib/TESTING/LIN/schkaa.F
index a5b826d06..036b13924 100644
--- a/lapack-netlib/TESTING/LIN/schkaa.F
+++ b/lapack-netlib/TESTING/LIN/schkaa.F
@@ -63,6 +63,7 @@
 *> SLQ    8               List types on next line if 0 < NTYPES <  8
 *> SQL    8               List types on next line if 0 < NTYPES <  8
 *> SQP    6               List types on next line if 0 < NTYPES <  6
+*> SQK    19              List types on next line if 0 < NTYPES <  19
 *> STZ    3               List types on next line if 0 < NTYPES <  3
 *> SLS    6               List types on next line if 0 < NTYPES <  6
 *> SEQ
@@ -147,11 +148,11 @@
      $                   NBVAL( MAXIN ), NBVAL2( MAXIN ),
      $                   NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ),
      $                   RANKVAL( MAXIN ), PIV( NMAX )
-      REAL               E( NMAX ), S( 2*NMAX )
 *     ..
 *     .. Allocatable Arrays ..
       INTEGER AllocateStatus
-      REAL, DIMENSION(:), ALLOCATABLE :: RWORK
+      REAL, DIMENSION(:), ALLOCATABLE :: RWORK, S
+      REAL, DIMENSION(:), ALLOCATABLE :: E
       REAL, DIMENSION(:,:), ALLOCATABLE :: A, B, WORK
 *     ..
 *     .. External Functions ..
@@ -162,13 +163,13 @@
 *     .. External Subroutines ..
       EXTERNAL           ALAREQ, SCHKEQ, SCHKGB, SCHKGE, SCHKGT, SCHKLQ,
      $                   SCHKORHR_COL, SCHKPB, SCHKPO, SCHKPS, SCHKPP,
-     $                   SCHKPT, SCHKQ3, SCHKQL, SCHKQR, SCHKRQ, SCHKSP,
-     $                   SCHKSY, SCHKSY_ROOK, SCHKSY_RK, SCHKSY_AA,
-     $                   SCHKTB, SCHKTP, SCHKTR, SCHKTZ, SDRVGB, SDRVGE,
-     $                   SDRVGT, SDRVLS, SDRVPB, SDRVPO, SDRVPP, SDRVPT,
-     $                   SDRVSP, SDRVSY, SDRVSY_ROOK, SDRVSY_RK,
-     $                   SDRVSY_AA, ILAVER, SCHKLQTP, SCHKQRT, SCHKQRTP,
-     $                   SCHKLQT, SCHKTSQR
+     $                   SCHKPT, SCHKQ3, SCHKQP3RK, SCHKQL, SCHKQR,
+     $                   SCHKRQ, SCHKSP, SCHKSY, SCHKSY_ROOK, SCHKSY_RK,
+     $                   SCHKSY_AA, SCHKTB, SCHKTP, SCHKTR, SCHKTZ,
+     $                   SDRVGB, SDRVGE, SDRVGT, SDRVLS, SDRVPB, SDRVPO,
+     $                   SDRVPP, SDRVPT, SDRVSP, SDRVSY, SDRVSY_ROOK,
+     $                   SDRVSY_RK, SDRVSY_AA, ILAVER, SCHKLQTP, SCHKQRT,
+     $                   SCHKQRTP, SCHKLQT, SCHKTSQR
 *     ..
 *     .. Scalars in Common ..
       LOGICAL            LERR, OK
@@ -188,13 +189,17 @@
 *     ..
 *     .. Allocate memory dynamically ..
 *
-      ALLOCATE (A( ( KDMAX+1 )*NMAX, 7 ), STAT = AllocateStatus )
+      ALLOCATE ( A( ( KDMAX+1 )*NMAX, 7 ), STAT = AllocateStatus )
       IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
-      ALLOCATE (B( NMAX*MAXRHS, 4 ), STAT = AllocateStatus )
+      ALLOCATE ( B( NMAX*MAXRHS, 4 ), STAT = AllocateStatus )
       IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
-      ALLOCATE (WORK( NMAX, NMAX+MAXRHS+30 ) , STAT = AllocateStatus )
+      ALLOCATE ( WORK( NMAX, 3*NMAX+MAXRHS+30 ), STAT = AllocateStatus )
       IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
-      ALLOCATE (RWORK( 5*NMAX+2*MAXRHS ), STAT = AllocateStatus )
+      ALLOCATE ( E( NMAX ), STAT = AllocateStatus )
+      IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
+      ALLOCATE ( S( 2*NMAX ), STAT = AllocateStatus )
+      IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
+      ALLOCATE ( RWORK( 5*NMAX+2*MAXRHS ), STAT = AllocateStatus )
       IF (AllocateStatus /= 0) STOP "*** Not enough memory ***" 
 *     ..
 *     .. Executable Statements ..
@@ -920,6 +925,23 @@
          ELSE
             WRITE( NOUT, FMT = 9989 )PATH
          END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN
+*
+*        QK: truncated QR factorization with pivoting
+*
+         NTYPES = 19
+         CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT )
+*
+         IF( TSTCHK ) THEN
+            CALL SCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL,
+     $                      NNB, NBVAL, NXVAL, THRESH, A( 1, 1 ),
+     $                      A( 1, 2 ), B( 1, 1 ), B( 1, 2 ),
+     $                      B( 1, 3 ), B( 1, 4 ),
+     $                      WORK, IWORK, NOUT )
+         ELSE
+            WRITE( NOUT, FMT = 9989 )PATH
+         END IF
 *
       ELSE IF( LSAMEN( 2, C2, 'TZ' ) ) THEN
 *
@@ -1048,6 +1070,8 @@
 *
       DEALLOCATE (A, STAT = AllocateStatus)
       DEALLOCATE (B, STAT = AllocateStatus)
+      DEALLOCATE (E, STAT = AllocateStatus)
+      DEALLOCATE (S, STAT = AllocateStatus)
       DEALLOCATE (WORK, STAT = AllocateStatus)
       DEALLOCATE (RWORK,  STAT = AllocateStatus)
 *
diff --git a/lapack-netlib/TESTING/LIN/schklq.f b/lapack-netlib/TESTING/LIN/schklq.f
index cd66e8d10..9335503f9 100644
--- a/lapack-netlib/TESTING/LIN/schklq.f
+++ b/lapack-netlib/TESTING/LIN/schklq.f
@@ -235,7 +235,7 @@
       REAL               RESULT( NTESTS )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALAERH, ALAHD, ALASUM, SERRLQ, SGELQS, SGET02,
+      EXTERNAL           ALAERH, ALAHD, ALASUM, SERRLQ, SGET02,
      $                   SLACPY, SLARHS, SLATB4, SLATMS, SLQT01, SLQT02,
      $                   SLQT03, XLAENV
 *     ..
@@ -370,7 +370,7 @@
      $                               WORK, LWORK, RWORK, RESULT( 3 ) )
                         NT = NT + 4
 *
-*                       If M>=N and K=N, call SGELQS to solve a system
+*                       If M<=N and K=M, call SGELS to solve a system
 *                       with NRHS right hand sides and compute the
 *                       residual.
 *
@@ -387,14 +387,20 @@
 *
                            CALL SLACPY( 'Full', M, NRHS, B, LDA, X,
      $                                  LDA )
-                           SRNAMT = 'SGELQS'
-                           CALL SGELQS( M, N, NRHS, AF, LDA, TAU, X,
-     $                                  LDA, WORK, LWORK, INFO )
 *
-*                          Check error code from SGELQS.
+*                          Reset AF to the original matrix. SGELS
+*                          factors the matrix before solving the system.
+*
+                           CALL SLACPY( 'Full', M, N, A, LDA, AF, LDA )
+*
+                           SRNAMT = 'SGELS'
+                           CALL SGELS( 'No transpose', M, N, NRHS, AF,
+     $                                 LDA, X, LDA, WORK, LWORK, INFO )
+*
+*                          Check error code from SGELS.
 *
                            IF( INFO.NE.0 )
-     $                        CALL ALAERH( PATH, 'SGELQS', INFO, 0, ' ',
+     $                        CALL ALAERH( PATH, 'SGELS', INFO, 0, 'N',
      $                                     M, N, NRHS, -1, NB, IMAT,
      $                                     NFAIL, NERRS, NOUT )
 *
diff --git a/lapack-netlib/TESTING/LIN/schkqp3rk.f b/lapack-netlib/TESTING/LIN/schkqp3rk.f
new file mode 100644
index 000000000..36cf9370e
--- /dev/null
+++ b/lapack-netlib/TESTING/LIN/schkqp3rk.f
@@ -0,0 +1,831 @@
+*> \brief \b SCHKQP3RK
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE SCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL,
+*     $                      NNB, NBVAL, NXVAL, THRESH, A, COPYA,
+*     $                      B, COPYB, S, TAU,
+*     $                      WORK, IWORK, NOUT )
+*      IMPLICIT NONE
+*
+*  -- LAPACK test routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+*      INTEGER            NM, NN, NNS, NNB, NOUT
+*      REAL               THRESH
+*     ..
+*     .. Array Arguments ..
+*      LOGICAL            DOTYPE( * )
+*      INTEGER            IWORK( * ), MVAL( * ), NBVAL( * ), NSVAL( * ),
+*     $                   NVAL( * ), NXVAL( * )
+*      REAL               A( * ), COPYA( * ), B( * ), COPYB( * ),
+*     $                   S( * ), TAU( * ), WORK( * )
+*     ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SCHKQP3RK tests SGEQP3RK.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] DOTYPE
+*> \verbatim
+*>          DOTYPE is LOGICAL array, dimension (NTYPES)
+*>          The matrix types to be used for testing.  Matrices of type j
+*>          (for 1 <= j <= NTYPES) are used for testing if DOTYPE(j) =
+*>          .TRUE.; if DOTYPE(j) = .FALSE., then type j is not used.
+*> \endverbatim
+*>
+*> \param[in] NM
+*> \verbatim
+*>          NM is INTEGER
+*>          The number of values of M contained in the vector MVAL.
+*> \endverbatim
+*>
+*> \param[in] MVAL
+*> \verbatim
+*>          MVAL is INTEGER array, dimension (NM)
+*>          The values of the matrix row dimension M.
+*> \endverbatim
+*>
+*> \param[in] NN
+*> \verbatim
+*>          NN is INTEGER
+*>          The number of values of N contained in the vector NVAL.
+*> \endverbatim
+*>
+*> \param[in] NVAL
+*> \verbatim
+*>          NVAL is INTEGER array, dimension (NN)
+*>          The values of the matrix column dimension N.
+*> \endverbatim
+*>
+*> \param[in] NNS
+*> \verbatim
+*>          NNS is INTEGER
+*>          The number of values of NRHS contained in the vector NSVAL.
+*> \endverbatim
+*>
+*> \param[in] NSVAL
+*> \verbatim
+*>          NSVAL is INTEGER array, dimension (NNS)
+*>          The values of the number of right hand sides NRHS.
+*> \endverbatim
+*>
+*> \param[in] NNB
+*> \verbatim
+*>          NNB is INTEGER
+*>          The number of values of NB and NX contained in the
+*>          vectors NBVAL and NXVAL.  The blocking parameters are used
+*>          in pairs (NB,NX).
+*> \endverbatim
+*>
+*> \param[in] NBVAL
+*> \verbatim
+*>          NBVAL is INTEGER array, dimension (NNB)
+*>          The values of the blocksize NB.
+*> \endverbatim
+*>
+*> \param[in] NXVAL
+*> \verbatim
+*>          NXVAL is INTEGER array, dimension (NNB)
+*>          The values of the crossover point NX.
+*> \endverbatim
+*>
+*> \param[in] THRESH
+*> \verbatim
+*>          THRESH is REAL
+*>          The threshold value for the test ratios.  A result is
+*>          included in the output file if RESULT >= THRESH.  To have
+*>          every test ratio printed, use THRESH = 0.
+*> \endverbatim
+*>
+*> \param[out] A
+*> \verbatim
+*>          A is REAL array, dimension (MMAX*NMAX)
+*>          where MMAX is the maximum value of M in MVAL and NMAX is the
+*>          maximum value of N in NVAL.
+*> \endverbatim
+*>
+*> \param[out] COPYA
+*> \verbatim
+*>          COPYA is REAL array, dimension (MMAX*NMAX)
+*> \endverbatim
+*>
+*> \param[out] B
+*> \verbatim
+*>          B is REAL array, dimension (MMAX*NSMAX)
+*>          where MMAX is the maximum value of M in MVAL and NSMAX is the
+*>          maximum value of NRHS in NSVAL.
+*> \endverbatim
+*>
+*> \param[out] COPYB
+*> \verbatim
+*>          COPYB is REAL array, dimension (MMAX*NSMAX)
+*> \endverbatim
+*>
+*> \param[out] S
+*> \verbatim
+*>          S is REAL array, dimension
+*>                      (min(MMAX,NMAX))
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is REAL array, dimension (MMAX)
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is REAL array, dimension
+*>                      (MMAX*NMAX + 4*NMAX + MMAX)
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (3*NMAX)
+*> \endverbatim
+*>
+*> \param[in] NOUT
+*> \verbatim
+*>          NOUT is INTEGER
+*>          The unit number for output.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup single_lin
+*
+*  =====================================================================
+      SUBROUTINE SCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL,
+     $                      NNB, NBVAL, NXVAL, THRESH, A, COPYA,
+     $                      B, COPYB, S, TAU,
+     $                      WORK, IWORK, NOUT )
+      IMPLICIT NONE
+*
+*  -- LAPACK test routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            NM, NN, NNB, NNS, NOUT
+      REAL               THRESH
+*     ..
+*     .. Array Arguments ..
+      LOGICAL            DOTYPE( * )
+      INTEGER            IWORK( * ), NBVAL( * ), MVAL( * ), NVAL( * ),
+     $                   NSVAL( * ), NXVAL( * )
+      REAL               A( * ), COPYA( * ), B( * ), COPYB( * ),
+     $                   S( * ), TAU( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            NTYPES
+      PARAMETER          ( NTYPES = 19 )
+      INTEGER            NTESTS
+      PARAMETER          ( NTESTS = 5 )
+      REAL               ONE, ZERO, BIGNUM
+      PARAMETER          ( ONE = 1.0E+0, ZERO = 0.0E+0,
+     $                     BIGNUM = 1.0E+38 )
+*     ..
+*     .. Local Scalars ..
+      CHARACTER          DIST, TYPE
+      CHARACTER*3        PATH
+      INTEGER            I, IHIGH, ILOW, IM, IMAT, IN, INC_ZERO,
+     $                   INB, IND_OFFSET_GEN,
+     $                   IND_IN, IND_OUT, INS, INFO,
+     $                   ISTEP, J, J_INC, J_FIRST_NZ, JB_ZERO,
+     $                   KFACT, KL, KMAX, KU, LDA, LW, LWORK,
+     $                   LWORK_MQR, M, MINMN, MINMNB_GEN, MODE, N,
+     $                   NB, NB_ZERO, NERRS, NFAIL, NB_GEN, NRHS,
+     $                   NRUN, NX, T
+      REAL               ANORM, CNDNUM, EPS, ABSTOL, RELTOL,
+     $                   DTEMP, MAXC2NRMK, RELMAXC2NRMK
+*     ..
+*     .. Local Arrays ..
+      INTEGER            ISEED( 4 ), ISEEDY( 4 )
+      REAL               RESULT( NTESTS ), RDUMMY( 1 )
+*     ..
+*     .. External Functions ..
+      REAL               SLAMCH, SQPT01, SQRT11, SQRT12, SLANGE
+      EXTERNAL           SLAMCH, SQPT01, SQRT11, SQRT12, SLANGE
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ALAERH, ALAHD, ALASUM, SAXPY, SGEQP3RK,
+     $                   SLACPY, SLAORD, SLASET, SLATB4, SLATMS,
+     $                   SORMQR, SSWAP, ICOPY, XLAENV
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN, MOD, REAL
+*     ..
+*     .. Scalars in Common ..
+      LOGICAL            LERR, OK
+      CHARACTER*32       SRNAMT
+      INTEGER            INFOT, IOUNIT
+*     ..
+*     .. Common blocks ..
+      COMMON             / INFOC / INFOT, IOUNIT, OK, LERR
+      COMMON             / SRNAMC / SRNAMT
+*     ..
+*     .. Data statements ..
+      DATA               ISEEDY / 1988, 1989, 1990, 1991 /
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize constants and the random number seed.
+*
+      PATH( 1: 1 ) = 'Single precision'
+      PATH( 2: 3 ) = 'QK'
+      NRUN = 0
+      NFAIL = 0
+      NERRS = 0
+      DO I = 1, 4
+         ISEED( I ) = ISEEDY( I )
+      END DO
+      EPS = SLAMCH( 'Epsilon' )
+      INFOT = 0
+*
+      DO IM = 1, NM
+*
+*        Do for each value of M in MVAL.
+*
+         M = MVAL( IM )
+         LDA = MAX( 1, M )
+*
+         DO IN = 1, NN
+*
+*           Do for each value of N in NVAL.
+*
+            N = NVAL( IN )
+            MINMN = MIN( M, N )
+            LWORK = MAX( 1, M*MAX( M, N )+4*MINMN+MAX( M, N ),
+     $                   M*N + 2*MINMN + 4*N )
+*
+            DO INS = 1, NNS
+               NRHS = NSVAL( INS )
+*
+*                 Set up parameters with SLATB4 and generate
+*                 M-by-NRHS B matrix with SLATMS.
+*                 IMAT = 14:
+*                 Random matrix, CNDNUM = 2, NORM = ONE,
+*                 MODE = 3 (geometric distribution of singular values).
+*
+                  CALL SLATB4( PATH, 14, M, NRHS, TYPE, KL, KU, ANORM,
+     $                         MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'SLATMS'
+                  CALL SLATMS( M, NRHS, DIST, ISEED, TYPE, S, MODE,
+     $                         CNDNUM, ANORM, KL, KU, 'No packing',
+     $                         COPYB, LDA, WORK, INFO )
+
+
+*
+*                 Check error code from SLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'SLATMS', INFO, 0, ' ', M,
+     $                            NRHS, -1, -1, -1, 6, NFAIL, NERRS,
+     $                            NOUT )
+                     CYCLE
+                  END IF
+*
+               DO IMAT = 1, NTYPES
+*
+*              Do the tests only if DOTYPE( IMAT ) is true.
+*
+               IF( .NOT.DOTYPE( IMAT ) )
+     $            CYCLE
+*
+*              The type of distribution used to generate the random
+*              eigen-/singular values:
+*              ( 'S' for symmetric distribution ) => UNIFORM( -1, 1 )
+*
+*           Do for each type of NON-SYMMETRIC matrix:                               CNDNUM                     NORM                                     MODE
+*            1. Zero matrix
+*            2. Random, Diagonal, CNDNUM = 2                                        CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            3. Random, Upper triangular, CNDNUM = 2                                CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            4. Random, Lower triangular, CNDNUM = 2                                CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            5. Random, First column is zero, CNDNUM = 2                            CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            6. Random, Last MINMN column is zero, CNDNUM = 2                       CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            7. Random, Last N column is zero, CNDNUM = 2                           CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            8. Random, Middle column in MINMN is zero, CNDNUM = 2                  CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            9. Random, First half of MINMN columns are zero, CNDNUM = 2            CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           10. Random, Last columns are zero starting from MINMN/2+1, CNDNUM = 2   CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           11. Random, Half MINMN columns in the middle are zero starting
+*                  from  MINMN/2-(MINMN/2)/2+1, CNDNUM = 2                          CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           12. Random, Odd columns are ZERO, CNDNUM = 2                            CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           13. Random, Even columns are ZERO, CNDNUM = 2                           CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           14. Random, CNDNUM = 2                                                  CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           15. Random, CNDNUM = sqrt(0.1/EPS)                                      CNDNUM = BADC1 = sqrt(0.1/EPS)  ONE                                      3 ( geometric distribution of singular values )
+*           16. Random, CNDNUM = 0.1/EPS                                            CNDNUM = BADC2 = 0.1/EPS        ONE                                      3 ( geometric distribution of singular values )
+*           17. Random, CNDNUM = 0.1/EPS,                                           CNDNUM = BADC2 = 0.1/EPS        ONE                                      2 ( one small singular value, S(N)=1/CNDNUM )
+*                 one small singular value S(N)=1/CNDNUM
+*           18. Random, CNDNUM = 2, scaled near underflow                           CNDNUM = 2                      SMALL = SAFMIN
+*           19. Random, CNDNUM = 2, scaled near overflow                            CNDNUM = 2                      LARGE = 1.0/( 0.25 * ( SAFMIN / EPS ) )  3 ( geometric distribution of singular values )
+*
+               IF( IMAT.EQ.1 ) THEN
+*
+*                 Matrix 1: Zero matrix
+*
+                  CALL SLASET( 'Full', M, N, ZERO, ZERO, COPYA, LDA )
+                  DO I = 1, MINMN
+                     S( I ) = ZERO
+                  END DO
+*
+               ELSE IF( (IMAT.GE.2 .AND. IMAT.LE.4 )
+     $                  .OR. (IMAT.GE.14 .AND. IMAT.LE.19 ) ) THEN
+*
+*                 Matrices 2-4 and 14-19.
+*
+*                 Set up parameters with SLATB4 and generate a test
+*                 matrix with SLATMS.
+*
+                  CALL SLATB4( PATH, IMAT, M, N, TYPE, KL, KU, ANORM,
+     $                         MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'SLATMS'
+                  CALL SLATMS( M, N, DIST, ISEED, TYPE, S, MODE,
+     $                         CNDNUM, ANORM, KL, KU, 'No packing',
+     $                         COPYA, LDA, WORK, INFO )
+*
+*                 Check error code from SLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'SLATMS', INFO, 0, ' ', M, N,
+     $                            -1, -1, -1, IMAT, NFAIL, NERRS,
+     $                            NOUT )
+                     CYCLE
+                  END IF
+*
+                  CALL SLAORD( 'Decreasing', MINMN, S, 1 )
+*
+               ELSE IF( MINMN.GE.2
+     $                  .AND. IMAT.GE.5 .AND. IMAT.LE.13 ) THEN
+*
+*                 Rectangular matrices 5-13 that contain zero columns,
+*                 only for matrices MINMN >=2.
+*
+*                 JB_ZERO is the column index of ZERO block.
+*                 NB_ZERO is the column block size of ZERO block.
+*                 NB_GEN is the column block size of the
+*                 generated block.
+*                 J_INC is the non-zero column index increment
+*                 for matrix 12 and 13.
+*                 J_FIRST_NZ is the index of the first non-zero
+*                 column.
+*
+                  IF( IMAT.EQ.5 ) THEN
+*
+*                    First column is zero.
+*
+                     JB_ZERO = 1
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.6 ) THEN
+*
+*                    Last column MINMN is zero.
+*
+                     JB_ZERO = MINMN
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.7 ) THEN
+*
+*                    Last column N is zero.
+*
+                     JB_ZERO = N
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.8 ) THEN
+*
+*                    Middle column in MINMN is zero.
+*
+                     JB_ZERO = MINMN / 2 + 1
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.9 ) THEN
+*
+*                    First half of MINMN columns is zero.
+*
+                     JB_ZERO = 1
+                     NB_ZERO = MINMN / 2
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.10 ) THEN
+*
+*                    Last columns are zero columns,
+*                    starting from (MINMN / 2 + 1) column.
+*
+                     JB_ZERO = MINMN / 2 + 1
+                     NB_ZERO = N - JB_ZERO + 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.11 ) THEN
+*
+*                    Half of the columns in the middle of MINMN
+*                    columns is zero, starting from
+*                    MINMN/2 - (MINMN/2)/2 + 1 column.
+*
+                     JB_ZERO = MINMN / 2 - (MINMN / 2) / 2 + 1
+                     NB_ZERO = MINMN / 2
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.12 ) THEN
+*
+*                    Odd-numbered columns are zero,
+*
+                     NB_GEN = N / 2
+                     NB_ZERO = N - NB_GEN
+                     J_INC = 2
+                     J_FIRST_NZ = 2
+*
+                  ELSE IF( IMAT.EQ.13 ) THEN
+*
+*                    Even-numbered columns are zero.
+*
+                     NB_ZERO = N / 2
+                     NB_GEN = N - NB_ZERO
+                     J_INC = 2
+                     J_FIRST_NZ = 1
+*
+                  END IF
+*
+*
+*                 1) Set the first NB_ZERO columns in COPYA(1:M,1:N)
+*                    to zero.
+*
+                  CALL SLASET( 'Full', M, NB_ZERO, ZERO, ZERO,
+     $                         COPYA, LDA )
+*
+*                    2) Generate an M-by-(N-NB_ZERO) matrix with the
+*                       chosen singular value distribution
+*                       in COPYA(1:M,NB_ZERO+1:N).
+*
+                  CALL SLATB4( PATH, IMAT, M, NB_GEN, TYPE, KL, KU,
+     $                         ANORM, MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'SLATMS'
+*
+                  IND_OFFSET_GEN = NB_ZERO * LDA
+*
+                  CALL SLATMS( M, NB_GEN, DIST, ISEED, TYPE, S, MODE,
+     $                        CNDNUM, ANORM, KL, KU, 'No packing',
+     $                        COPYA( IND_OFFSET_GEN + 1 ), LDA,
+     $                        WORK, INFO )
+*
+*                 Check error code from SLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'SLATMS', INFO, 0, ' ', M,
+     $                            NB_GEN, -1, -1, -1, IMAT, NFAIL,
+     $                            NERRS, NOUT )
+                     CYCLE
+                  END IF
+*
+*                 3) Swap the generated columns from the right side
+*                 NB_GEN-size block in COPYA into correct column
+*                 positions.
+*
+                  IF( IMAT.EQ.6
+     $                    .OR. IMAT.EQ.7
+     $                    .OR. IMAT.EQ.8
+     $                    .OR. IMAT.EQ.10
+     $                    .OR. IMAT.EQ.11 ) THEN
+*
+*                    Move by swapping the generated columns
+*                    from the right NB_GEN-size block from
+*                    (NB_ZERO+1:NB_ZERO+JB_ZERO)
+*                    into columns (1:JB_ZERO-1).
+*
+                     DO J = 1, JB_ZERO-1, 1
+                        CALL SSWAP( M,
+     $                        COPYA( ( NB_ZERO+J-1)*LDA+1), 1,
+     $                        COPYA( (J-1)*LDA + 1 ), 1 )
+                     END DO
+*
+                  ELSE IF( IMAT.EQ.12 .OR. IMAT.EQ.13 ) THEN
+*
+*                    ( IMAT = 12, Odd-numbered ZERO columns. )
+*                    Swap the generated columns from the right
+*                    NB_GEN-size block into the even zero columns in the
+*                    left NB_ZERO-size block.
+*
+*                    ( IMAT = 13, Even-numbered ZERO columns. )
+*                    Swap the generated columns from the right
+*                    NB_GEN-size block into the odd zero columns in the
+*                    left NB_ZERO-size block.
+*
+                     DO J = 1, NB_GEN, 1
+                        IND_OUT = ( NB_ZERO+J-1 )*LDA + 1
+                        IND_IN = ( J_INC*(J-1)+(J_FIRST_NZ-1) )*LDA
+     $                            + 1
+                        CALL SSWAP( M,
+     $                              COPYA( IND_OUT ), 1,
+     $                              COPYA( IND_IN), 1 )
+                        END DO
+*
+                  END IF
+*
+*                 5) Order the singular values generated by
+*                    SLATMS in decreasing order and add trailing zeros
+*                    that correspond to zero columns.
+*                    The total number of singular values is MINMN.
+*
+                  MINMNB_GEN = MIN( M, NB_GEN )
+*
+                  DO I = MINMNB_GEN+1, MINMN
+                     S( I ) = ZERO
+                  END DO
+*
+               ELSE
+*
+*                    IF(MINMN.LT.2) skip this size for this matrix type.
+*
+                     CYCLE
+               END IF
+*
+*              Initialize a copy array for a pivot array for SGEQP3RK.
+*
+               DO I = 1, N
+                  IWORK( I ) = 0
+               END DO
+*
+               DO INB = 1, NNB
+*
+*                 Do for each pair of values (NB,NX) in NBVAL and NXVAL.
+*
+                  NB = NBVAL( INB )
+                  CALL XLAENV( 1, NB )
+                  NX = NXVAL( INB )
+                  CALL XLAENV( 3, NX )
+*
+*                 We loop up to MIN(M,N)+1 because we need a test for
+*                 KMAX > MIN(M,N); when KMAX is larger than MIN(M,N),
+*                 KMAX should be treated as KMAX = MIN(M,N).
+*
+                  DO KMAX = 0, MIN(M,N)+1
+*
+*                 Get a working copy of COPYA into A( 1:M,1:N ).
+*                 Get a working copy of COPYB into A( 1:M, N+1:N+NRHS ).
+*                 Get a working copy of COPYB into B( 1:M, 1:NRHS ).
+*                 Get a working copy of IWORK( 1:N ) with zeroes into
+*                 IWORK( N+1:2N ), which is going to be used as pivot array.
+*                 NOTE: IWORK(2N+1:3N) is going to be used as a WORK array
+*                 for the routine.
+*
+                  CALL SLACPY( 'All', M, N, COPYA, LDA, A, LDA )
+                  CALL SLACPY( 'All', M, NRHS, COPYB, LDA,
+     $                         A( LDA*N + 1 ),  LDA )
+                  CALL SLACPY( 'All', M, NRHS, COPYB, LDA,
+     $                         B,  LDA )
+                  CALL ICOPY( N, IWORK( 1 ), 1, IWORK( N+1 ), 1 )
+*
+                  ABSTOL = -1.0
+                  RELTOL = -1.0
+*
+*                 Compute the QR factorization with pivoting of A
+*
+                  LW = MAX( 1, MAX( 2*N + NB*( N+NRHS+1 ),
+     $                              3*N + NRHS - 1 ) )
+*
+*                 Compute SGEQP3RK factorization of A.
+*
+                  SRNAMT = 'SGEQP3RK'
+                  CALL SGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                           A, LDA, KFACT, MAXC2NRMK,
+     $                           RELMAXC2NRMK, IWORK( N+1 ), TAU,
+     $                           WORK, LW, IWORK( 2*N+1 ), INFO )
+*
+*                 Check error code from SGEQP3RK.
+*
+                  IF( INFO.LT.0 )
+     $               CALL ALAERH( PATH, 'SGEQP3RK', INFO, 0, ' ',
+     $                            M, N, NX, -1, NB, IMAT,
+     $                            NFAIL, NERRS, NOUT )
+*
+*                 Compute test 1:
+*
+*                 This test is only for the full rank factorization of
+*                 the matrix A.
+*
+*                 Array S(1:min(M,N)) contains svd(A) the singular values
+*                 of the original matrix A in decreasing absolute value
+*                 order. The test computes svd(R), the vector of singular
+*                 values of the upper trapezoid of A(1:M,1:N) that
+*                 contains the factor R, in decreasing order. The test
+*                 returns the ratio:
+*
+*                 2-norm(svd(R) - svd(A)) / ( max(M,N) * 2-norm(svd(A)) * EPS )
+*
+                  IF( KFACT.EQ.MINMN ) THEN
+*
+                     RESULT( 1 ) = SQRT12( M, N, A, LDA, S, WORK,
+     $                                     LWORK )
+*
+                     DO T = 1, 1
+                        IF( RESULT( T ).GE.THRESH ) THEN
+                           IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                        CALL ALAHD( NOUT, PATH )
+                           WRITE( NOUT, FMT = 9999 ) 'SGEQP3RK', M, N,
+     $                        NRHS, KMAX, ABSTOL, RELTOL, NB, NX,
+     $                        IMAT, T, RESULT( T )
+                           NFAIL = NFAIL + 1
+                        END IF
+                     END DO
+                     NRUN = NRUN + 1
+*
+*                   End test 1
+*
+                  END IF
+*
+*                 Compute test 2:
+*
+*                 The test returns the ratio:
+*
+*                 1-norm( A*P - Q*R ) / ( max(M,N) * 1-norm(A) * EPS )
+*
+                  RESULT( 2 ) = SQPT01( M, N, KFACT, COPYA, A, LDA, TAU,
+     $                          IWORK( N+1 ), WORK, LWORK )
+*
+*                 Compute test 3:
+*
+*                 The test returns the ratio:
+*
+*                 1-norm( Q**T * Q - I ) / ( M * EPS )
+*
+                  RESULT( 3 ) = SQRT11( M, KFACT, A, LDA, TAU, WORK,
+     $                          LWORK )
+*
+*                 Print information about the tests that did not pass
+*                 the threshold.
+*
+                  DO T = 2, 3
+                     IF( RESULT( T ).GE.THRESH ) THEN
+                        IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                     CALL ALAHD( NOUT, PATH )
+                        WRITE( NOUT, FMT = 9999 ) 'SGEQP3RK', M, N,
+     $                      NRHS, KMAX, ABSTOL, RELTOL,
+     $                      NB, NX, IMAT, T, RESULT( T )
+                        NFAIL = NFAIL + 1
+                     END IF
+                  END DO
+                  NRUN = NRUN + 2
+*
+*                 Compute test 4:
+*
+*                 This test is only for the factorizations with the
+*                 rank greater than or equal to 2.
+*                 The elements on the diagonal of R should be non-
+*                 increasing.
+*
+*                 The test returns the ratio:
+*
+*                 Returns 1.0D+100 if abs(R(K+1,K+1)) > abs(R(K,K)),
+*                 K=1:KFACT-1
+*
+                  IF( MIN(KFACT, MINMN).GE.2 ) THEN
+*
+                     DO J = 1, KFACT-1, 1
+
+                        DTEMP = (( ABS( A( (J-1)*M+J ) ) -
+     $                          ABS( A( (J)*M+J+1 ) ) ) /
+     $                          ABS( A(1) ) )
+*
+                        IF( DTEMP.LT.ZERO ) THEN
+                           RESULT( 4 ) = BIGNUM
+                        END IF
+*
+                     END DO
+*
+*                    Print information about the tests that did not
+*                    pass the threshold.
+*
+                     DO T = 4, 4
+                        IF( RESULT( T ).GE.THRESH ) THEN
+                           IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                        CALL ALAHD( NOUT, PATH )
+                           WRITE( NOUT, FMT = 9999 ) 'SGEQP3RK',
+     $                        M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                        NB, NX, IMAT, T,
+     $                        RESULT( T )
+                           NFAIL = NFAIL + 1
+                        END IF
+                     END DO
+                     NRUN = NRUN + 1
+*
+*                    End test 4.
+*
+                  END IF
+*
+*                 Compute test 5:
+*
+*                 This test is only for matrix A with min(M,N) > 0.
+*
+*                 The test returns the ratio:
+*
+*                 1-norm(Q**T * B - Q**T * B ) /
+*                       ( M * EPS )
+*
+*                 (1) Compute B:=Q**T * B in the matrix B.
+*
+                  IF( MINMN.GT.0 ) THEN
+*
+                     LWORK_MQR = MAX(1, NRHS)
+                     CALL SORMQR( 'Left', 'Transpose',
+     $                         M, NRHS, KFACT, A, LDA, TAU, B, LDA,
+     $                         WORK, LWORK_MQR, INFO )
+*
+                     DO I = 1, NRHS
+*
+*                       Compare column N+I of A with column I of B.
+*
+                        CALL SAXPY( M, -ONE, A( ( N+I-1 )*LDA+1 ), 1,
+     $                                 B( ( I-1 )*LDA+1 ), 1 )
+                     END DO
+*
+                   RESULT( 5 ) =
+     $               ABS(
+     $               SLANGE( 'One-norm', M, NRHS, B, LDA, RDUMMY ) /
+     $               ( REAL( M )*SLAMCH( 'Epsilon' ) )
+     $               )
+*
+*                    Print information about the tests that did not pass
+*                    the threshold.
+*
+                     DO T = 5, 5
+                        IF( RESULT( T ).GE.THRESH ) THEN
+                           IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                        CALL ALAHD( NOUT, PATH )
+                           WRITE( NOUT, FMT = 9999 ) 'SGEQP3RK', M, N,
+     $                        NRHS, KMAX, ABSTOL, RELTOL,
+     $                        NB, NX, IMAT, T, RESULT( T )
+                           NFAIL = NFAIL + 1
+                        END IF
+                     END DO
+                     NRUN = NRUN + 1
+*
+*                    End compute test 5.
+*
+                  END IF
+*
+*                 END DO KMAX = 1, MIN(M,N)+1
+*
+                  END DO
+*
+*                 END DO for INB = 1, NNB
+*
+               END DO
+*
+*              END DO  for IMAT = 1, NTYPES
+*
+               END DO
+*
+*              END DO for INS = 1, NNS
+*
+            END DO
+*
+*           END DO for IN = 1, NN
+*
+         END DO
+*
+*        END DO for IM = 1, NM
+*
+      END DO
+*
+*     Print a summary of the results.
+*
+      CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS )
+*
+ 9999 FORMAT( 1X, A, ' M =', I5, ', N =', I5, ', NRHS =', I5,
+     $        ', KMAX =', I5, ', ABSTOL =', G12.5,
+     $        ', RELTOL =', G12.5, ', NB =', I4, ', NX =', I4,
+     $        ', type ', I2, ', test ', I2, ', ratio =', G12.5 )
+*
+*     End of SCHKQP3RK
+*
+      END
diff --git a/lapack-netlib/TESTING/LIN/schkqr.f b/lapack-netlib/TESTING/LIN/schkqr.f
index 5c45ede9b..f72c8f1eb 100644
--- a/lapack-netlib/TESTING/LIN/schkqr.f
+++ b/lapack-netlib/TESTING/LIN/schkqr.f
@@ -244,7 +244,7 @@
       EXTERNAL           SGENND
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALAERH, ALAHD, ALASUM, SERRQR, SGEQRS, SGET02,
+      EXTERNAL           ALAERH, ALAHD, ALASUM, SERRQR, SGELS, SGET02,
      $                   SLACPY, SLARHS, SLATB4, SLATMS, SQRT01,
      $                   SQRT01P, SQRT02, SQRT03, XLAENV
 *     ..
@@ -388,7 +388,7 @@
      $                               WORK, LWORK, RWORK, RESULT( 3 ) )
                         NT = NT + 4
 *
-*                       If M>=N and K=N, call SGEQRS to solve a system
+*                       If M>=N and K=N, call SGELS to solve a system
 *                       with NRHS right hand sides and compute the
 *                       residual.
 *
@@ -405,14 +405,20 @@
 *
                            CALL SLACPY( 'Full', M, NRHS, B, LDA, X,
      $                                  LDA )
-                           SRNAMT = 'SGEQRS'
-                           CALL SGEQRS( M, N, NRHS, AF, LDA, TAU, X,
-     $                                  LDA, WORK, LWORK, INFO )
 *
-*                          Check error code from SGEQRS.
+*                          Reset AF to the original matrix. SGELS
+*                          factors the matrix before solving the system.
+*
+                           CALL SLACPY( 'Full', M, N, A, LDA, AF, LDA )
+*
+                           SRNAMT = 'SGELS'
+                           CALL SGELS( 'No transpose', M, N, NRHS, AF,
+     $                                 LDA, X, LDA, WORK, LWORK, INFO )
+*
+*                          Check error code from SGELS.
 *
                            IF( INFO.NE.0 )
-     $                        CALL ALAERH( PATH, 'SGEQRS', INFO, 0, ' ',
+     $                        CALL ALAERH( PATH, 'SGELS', INFO, 0, 'N',
      $                                     M, N, NRHS, -1, NB, IMAT,
      $                                     NFAIL, NERRS, NOUT )
 *
diff --git a/lapack-netlib/TESTING/LIN/schksy_aa_2stage.f b/lapack-netlib/TESTING/LIN/schksy_aa_2stage.f
index d3c27ae56..6490cd7c3 100644
--- a/lapack-netlib/TESTING/LIN/schksy_aa_2stage.f
+++ b/lapack-netlib/TESTING/LIN/schksy_aa_2stage.f
@@ -423,9 +423,9 @@
 *                 block factorization, LWORK is the length of AINV.
 *
                   SRNAMT = 'SSYTRF_AA_2STAGE'
-                  LWORK = MIN(N*NB, 3*NMAX*NMAX)
+                  LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX )
                   CALL SSYTRF_AA_2STAGE( UPLO, N, AFAC, LDA, 
-     $                                   AINV, (3*NB+1)*N, 
+     $                                   AINV, MAX( 1, (3*NB+1)*N ),
      $                                   IWORK, IWORK( 1+N ),
      $                                   WORK, LWORK,
      $                                   INFO )
@@ -505,7 +505,6 @@
                      CALL SLACPY( 'Full', N, NRHS, B, LDA, X, LDA )
 *
                      SRNAMT = 'SSYTRS_AA_2STAGE'
-                     LWORK = MAX( 1, 3*N-2 )
                      CALL SSYTRS_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA,
      $                            AINV, (3*NB+1)*N, IWORK, IWORK( 1+N ),
      $                            X, LDA, INFO )
diff --git a/lapack-netlib/TESTING/LIN/schktp.f b/lapack-netlib/TESTING/LIN/schktp.f
index ff05c1d97..a5243f651 100644
--- a/lapack-netlib/TESTING/LIN/schktp.f
+++ b/lapack-netlib/TESTING/LIN/schktp.f
@@ -86,7 +86,7 @@
 *> \verbatim
 *>          NMAX is INTEGER
 *>          The leading dimension of the work arrays.  NMAX >= the
-*>          maximumm value of N in NVAL.
+*>          maximum value of N in NVAL.
 *> \endverbatim
 *>
 *> \param[out] AP
diff --git a/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f b/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f
index aff32bce9..319b90805 100644
--- a/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f
+++ b/lapack-netlib/TESTING/LIN/sdrvsy_aa_2stage.f
@@ -400,9 +400,9 @@
 *                    Factor the matrix and solve the system using SSYSV_AA.
 *
                      SRNAMT = 'SSYSV_AA_2STAGE '
-                     LWORK = MIN(N*NB, 3*NMAX*NMAX)
+                     LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX )
                      CALL SSYSV_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA,
-     $                                 AINV, (3*NB+1)*N, 
+     $                                 AINV, MAX( 1, (3*NB+1)*N ),
      $                                 IWORK, IWORK( 1+N ),
      $                                 X, LDA, WORK, LWORK, INFO )
 *
diff --git a/lapack-netlib/TESTING/LIN/serrlq.f b/lapack-netlib/TESTING/LIN/serrlq.f
index 5bb0fe201..e5df8ce52 100644
--- a/lapack-netlib/TESTING/LIN/serrlq.f
+++ b/lapack-netlib/TESTING/LIN/serrlq.f
@@ -76,7 +76,7 @@
      $                   W( NMAX ), X( NMAX )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALAESM, CHKXER, SGELQ2, SGELQF, SGELQS, SORGL2,
+      EXTERNAL           ALAESM, CHKXER, SGELQ2, SGELQF, SORGL2,
      $                   SORGLQ, SORML2, SORMLQ
 *     ..
 *     .. Scalars in Common ..
@@ -140,31 +140,6 @@
       CALL SGELQ2( 2, 1, A, 1, B, W, INFO )
       CALL CHKXER( 'SGELQ2', INFOT, NOUT, LERR, OK )
 *
-*     SGELQS
-*
-      SRNAMT = 'SGELQS'
-      INFOT = 1
-      CALL SGELQS( -1, 0, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'SGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 2
-      CALL SGELQS( 0, -1, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'SGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 2
-      CALL SGELQS( 2, 1, 0, A, 2, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'SGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 3
-      CALL SGELQS( 0, 0, -1, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'SGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 5
-      CALL SGELQS( 2, 2, 0, A, 1, X, B, 2, W, 1, INFO )
-      CALL CHKXER( 'SGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 8
-      CALL SGELQS( 1, 2, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'SGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 10
-      CALL SGELQS( 1, 1, 2, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'SGELQS', INFOT, NOUT, LERR, OK )
-*
 *     SORGLQ
 *
       SRNAMT = 'SORGLQ'
diff --git a/lapack-netlib/TESTING/LIN/serrqr.f b/lapack-netlib/TESTING/LIN/serrqr.f
index 1ad40b7aa..e228813f7 100644
--- a/lapack-netlib/TESTING/LIN/serrqr.f
+++ b/lapack-netlib/TESTING/LIN/serrqr.f
@@ -77,7 +77,7 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           ALAESM, CHKXER, SGEQR2, SGEQR2P, SGEQRF,
-     $                   SGEQRFP, SGEQRS, SORG2R, SORGQR, SORM2R,
+     $                   SGEQRFP, SORG2R, SORGQR, SORM2R,
      $                   SORMQR
 *     ..
 *     .. Scalars in Common ..
@@ -170,31 +170,6 @@
       CALL SGEQR2P( 2, 1, A, 1, B, W, INFO )
       CALL CHKXER( 'SGEQR2P', INFOT, NOUT, LERR, OK )
 *
-*     SGEQRS
-*
-      SRNAMT = 'SGEQRS'
-      INFOT = 1
-      CALL SGEQRS( -1, 0, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'SGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 2
-      CALL SGEQRS( 0, -1, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'SGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 2
-      CALL SGEQRS( 1, 2, 0, A, 2, X, B, 2, W, 1, INFO )
-      CALL CHKXER( 'SGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 3
-      CALL SGEQRS( 0, 0, -1, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'SGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 5
-      CALL SGEQRS( 2, 1, 0, A, 1, X, B, 2, W, 1, INFO )
-      CALL CHKXER( 'SGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 8
-      CALL SGEQRS( 2, 1, 0, A, 2, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'SGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 10
-      CALL SGEQRS( 1, 1, 2, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'SGEQRS', INFOT, NOUT, LERR, OK )
-*
 *     SORGQR
 *
       SRNAMT = 'SORGQR'
diff --git a/lapack-netlib/TESTING/LIN/serrsy.f b/lapack-netlib/TESTING/LIN/serrsy.f
index c562b417b..ab422ba5c 100644
--- a/lapack-netlib/TESTING/LIN/serrsy.f
+++ b/lapack-netlib/TESTING/LIN/serrsy.f
@@ -133,7 +133,7 @@
       IF( LSAMEN( 2, C2, 'SY' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a symmetric indefinite matrix with patrial
+*        of a symmetric indefinite matrix with partial
 *        (Bunch-Kaufman) pivoting.
 *
 *        SSYTRF
@@ -581,7 +581,7 @@
       ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a symmetric indefinite packed matrix with patrial
+*        of a symmetric indefinite packed matrix with partial
 *        (Bunch-Kaufman) pivoting.
 *
 *        SSPTRF
diff --git a/lapack-netlib/TESTING/LIN/serrsyx.f b/lapack-netlib/TESTING/LIN/serrsyx.f
index ed47c37bd..cffd28a52 100644
--- a/lapack-netlib/TESTING/LIN/serrsyx.f
+++ b/lapack-netlib/TESTING/LIN/serrsyx.f
@@ -137,7 +137,7 @@
       IF( LSAMEN( 2, C2, 'SY' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a symmetric indefinite matrix with patrial
+*        of a symmetric indefinite matrix with partial
 *        (Bunch-Kaufman) pivoting.
 *
 *        SSYTRF
@@ -527,7 +527,7 @@
       ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a symmetric indefinite packed matrix with patrial
+*        of a symmetric indefinite packed matrix with partial
 *        (Bunch-Kaufman) pivoting.
 *
 *        SSPTRF
diff --git a/lapack-netlib/TESTING/LIN/serrtr.f b/lapack-netlib/TESTING/LIN/serrtr.f
index af1ce0a8e..391b54c3f 100644
--- a/lapack-netlib/TESTING/LIN/serrtr.f
+++ b/lapack-netlib/TESTING/LIN/serrtr.f
@@ -71,7 +71,7 @@
 *     .. Local Scalars ..
       CHARACTER*2        C2
       INTEGER            INFO
-      REAL               RCOND, SCALE
+      REAL               RCOND, SCALE, SCALES(0)
 *     ..
 *     .. Local Arrays ..
       INTEGER            IW( NMAX )
@@ -250,40 +250,40 @@
 *
          SRNAMT = 'SLATRS3'
          INFOT = 1
-         CALL SLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W,
-     $                 W( 2 ), 1, INFO )
+         CALL SLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALES,
+     $                 W, W( 2 ), 1, INFO )
          CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 2
-         CALL SLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W,
-     $                 W( 2 ), 1, INFO )
+         CALL SLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALES,
+     $                 W, W( 2 ), 1, INFO )
          CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 3
-         CALL SLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, W,
-     $                 W( 2 ), 1, INFO )
+         CALL SLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALES,
+     $                 W, W( 2 ), 1, INFO )
          CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 4
-         CALL SLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, W,
-     $                 W( 2 ), 1, INFO )
+         CALL SLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALES,
+     $                 W, W( 2 ), 1, INFO )
          CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 5
-         CALL SLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, W,
-     $                 W( 2 ), 1, INFO )
+         CALL SLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALES,
+     $                 W, W( 2 ), 1, INFO )
          CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 6
-         CALL SLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, W,
-     $                 W( 2 ), 1, INFO )
+         CALL SLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALES,
+     $                 W, W( 2 ), 1, INFO )
          CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 8
-         CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, W,
-     $                 W( 2 ), 1, INFO )
+         CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALES,
+     $                 W, W( 2 ), 1, INFO )
          CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 10
-         CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, W,
-     $                 W( 2 ), 1, INFO )
+         CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALES,
+     $                 W, W( 2 ), 1, INFO )
          CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 14
-         CALL SLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, W,
-     $                 W( 2 ), 0, INFO )
+         CALL SLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALES,
+     $                 W, W( 2 ), 0, INFO )
          CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK )
 *
       ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN
diff --git a/lapack-netlib/TESTING/LIN/serrvx.f b/lapack-netlib/TESTING/LIN/serrvx.f
index c3db47332..440f9113e 100644
--- a/lapack-netlib/TESTING/LIN/serrvx.f
+++ b/lapack-netlib/TESTING/LIN/serrvx.f
@@ -582,6 +582,9 @@
          INFOT = 3
          CALL SSYSV( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'SSYSV ', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL SSYSV( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO )
+         CALL CHKXER( 'SSYSV ', INFOT, NOUT, LERR, OK )
          INFOT = 8
          CALL SSYSV( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'SSYSV ', INFOT, NOUT, LERR, OK )
@@ -647,6 +650,9 @@
          INFOT = 3
          CALL SSYSV_ROOK( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'SSYSV_ROOK', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL SSYSV_ROOK( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO )
+         CALL CHKXER( 'SSYSV_ROOK', INFOT, NOUT, LERR, OK )
          INFOT = 8
          CALL SSYSV_ROOK( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'SSYSV_ROOK', INFOT, NOUT, LERR, OK )
@@ -694,7 +700,7 @@
 *
       ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN
 *
-*        SSYSV_AA
+*        SSYSV_AASEN
 *
          SRNAMT = 'SSYSV_AA'
          INFOT = 1
@@ -706,13 +712,19 @@
          INFOT = 3
          CALL SSYSV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'SSYSV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL SSYSV_AA( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO )
+         CALL CHKXER( 'SSYSV_AA', INFOT, NOUT, LERR, OK )
          INFOT = 8
          CALL SSYSV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'SSYSV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 10
+         CALL SSYSV_AA( 'U', 3, 1, A, 3, IP, B, 3, W, 6, INFO )
+         CALL CHKXER( 'SSYSV_AA', INFOT, NOUT, LERR, OK )
 *
       ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN
 *
-*        DSYSV_AASEN_2STAGE
+*        SSYSV_AASEN_2STAGE
 *
          SRNAMT = 'SSYSV_AA_2STAGE'
          INFOT = 1
@@ -729,14 +741,18 @@
          CALL CHKXER( 'SSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 5
          CALL SSYSV_AA_2STAGE( 'U', 2, 1, A, 1, A, 1, IP, IP, B, 1,
+     $                         W, 1, INFO )
+         CALL CHKXER( 'SSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
+         INFOT = 7
+         CALL SSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2,
      $                         W, 1, INFO )
          CALL CHKXER( 'SSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 11
          CALL SSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1,
      $                         W, 1, INFO )
          CALL CHKXER( 'SSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
-         INFOT = 7
-         CALL SSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2,
+         INFOT = 13
+         CALL SSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 2,
      $                         W, 1, INFO )
          CALL CHKXER( 'SSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
 *
diff --git a/lapack-netlib/TESTING/LIN/sgtt01.f b/lapack-netlib/TESTING/LIN/sgtt01.f
index 5d88c91f7..5d639af99 100644
--- a/lapack-netlib/TESTING/LIN/sgtt01.f
+++ b/lapack-netlib/TESTING/LIN/sgtt01.f
@@ -39,7 +39,7 @@
 *
 *> \param[in] N
 *> \verbatim
-*>          N is INTEGTER
+*>          N is INTEGER
 *>          The order of the matrix A.  N >= 0.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/LIN/sgtt02.f b/lapack-netlib/TESTING/LIN/sgtt02.f
index a4eb09f6b..6daea295f 100644
--- a/lapack-netlib/TESTING/LIN/sgtt02.f
+++ b/lapack-netlib/TESTING/LIN/sgtt02.f
@@ -41,14 +41,14 @@
 *> \verbatim
 *>          TRANS is CHARACTER
 *>          Specifies the form of the residual.
-*>          = 'N':  B - A * X  (No transpose)
+*>          = 'N':  B - A    * X  (No transpose)
 *>          = 'T':  B - A**T * X  (Transpose)
 *>          = 'C':  B - A**H * X  (Conjugate transpose = Transpose)
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
-*>          N is INTEGTER
+*>          N is INTEGER
 *>          The order of the matrix A.  N >= 0.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/LIN/slatb4.f b/lapack-netlib/TESTING/LIN/slatb4.f
index 94d29db40..72a310727 100644
--- a/lapack-netlib/TESTING/LIN/slatb4.f
+++ b/lapack-netlib/TESTING/LIN/slatb4.f
@@ -153,9 +153,6 @@
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, SQRT
 *     ..
-*     .. External Subroutines ..
-      EXTERNAL           SLABAD
-*     ..
 *     .. Save statement ..
       SAVE               EPS, SMALL, LARGE, BADC1, BADC2, FIRST
 *     ..
@@ -173,11 +170,6 @@
          BADC1 = SQRT( BADC2 )
          SMALL = SLAMCH( 'Safe minimum' )
          LARGE = ONE / SMALL
-*
-*        If it looks like we're on a Cray, take the square root of
-*        SMALL and LARGE to avoid overflow and underflow problems.
-*
-         CALL SLABAD( SMALL, LARGE )
          SMALL = SHRINK*( SMALL / EPS )
          LARGE = ONE / SMALL
       END IF
@@ -232,6 +224,110 @@
          ELSE
             ANORM = ONE
          END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN
+*
+*        xQK: truncated QR with pivoting.
+*             Set parameters to generate a general
+*             M x N matrix.
+*
+*        Set TYPE, the type of matrix to be generated.  'N' is nonsymmetric.
+*
+         TYPE = 'N'
+*
+*        Set DIST, the type of distribution for the random
+*        number generator. 'S' is symmetric, i.e. uniform on (-1,1).
+*
+         DIST = 'S'
+*
+*        Set the lower and upper bandwidths.
+*
+         IF( IMAT.EQ.2 ) THEN
+*
+*           2. Random, Diagonal, CNDNUM = 2
+*
+            KL = 0
+            KU = 0
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE IF( IMAT.EQ.3 ) THEN
+*
+*           3. Random, Upper triangular,  CNDNUM = 2
+*
+            KL = 0
+            KU = MAX( N-1, 0 )
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE IF( IMAT.EQ.4 ) THEN
+*
+*          4. Random, Lower triangular,  CNDNUM = 2
+*
+            KL = MAX( M-1, 0 )
+            KU = 0
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE
+*
+*           5.-19. Rectangular matrix
+*
+            KL = MAX( M-1, 0 )
+            KU = MAX( N-1, 0 )
+*
+            IF( IMAT.GE.5 .AND. IMAT.LE.14 ) THEN
+*
+*              5.-14. Random, CNDNUM = 2.
+*
+               CNDNUM = TWO
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.15 ) THEN
+*
+*              15. Random, CNDNUM = sqrt(0.1/EPS)
+*
+               CNDNUM = BADC1
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.16 ) THEN
+*
+*              16. Random, CNDNUM = 0.1/EPS
+*
+               CNDNUM = BADC2
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.17 ) THEN
+*
+*              17. Random, CNDNUM = 0.1/EPS,
+*                  one small singular value S(N)=1/CNDNUM
+*
+               CNDNUM = BADC2
+               ANORM = ONE
+               MODE = 2
+*
+            ELSE IF( IMAT.EQ.18 ) THEN
+*
+*              18. Random, scaled near underflow
+*
+               CNDNUM = TWO
+               ANORM = SMALL
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.19 ) THEN
+*
+*              19. Random, scaled near overflow
+*
+               CNDNUM = TWO
+               ANORM = LARGE
+               MODE = 3
+*
+            END IF
+*
+         END IF
 *
       ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN
 *
@@ -518,17 +614,18 @@
 *
 *        Set the norm and condition number.
 *
-         IF( IMAT.EQ.2 .OR. IMAT.EQ.8 ) THEN
+         MAT = ABS( IMAT )
+         IF( MAT.EQ.2 .OR. MAT.EQ.8 ) THEN
             CNDNUM = BADC1
-         ELSE IF( IMAT.EQ.3 .OR. IMAT.EQ.9 ) THEN
+         ELSE IF( MAT.EQ.3 .OR. MAT.EQ.9 ) THEN
             CNDNUM = BADC2
          ELSE
             CNDNUM = TWO
          END IF
 *
-         IF( IMAT.EQ.4 ) THEN
+         IF( MAT.EQ.4 ) THEN
             ANORM = SMALL
-         ELSE IF( IMAT.EQ.5 ) THEN
+         ELSE IF( MAT.EQ.5 ) THEN
             ANORM = LARGE
          ELSE
             ANORM = ONE
diff --git a/lapack-netlib/TESTING/LIN/slqt02.f b/lapack-netlib/TESTING/LIN/slqt02.f
index d59a8a01c..1c7bd4f0d 100644
--- a/lapack-netlib/TESTING/LIN/slqt02.f
+++ b/lapack-netlib/TESTING/LIN/slqt02.f
@@ -27,7 +27,7 @@
 *> \verbatim
 *>
 *> SLQT02 tests SORGLQ, which generates an m-by-n matrix Q with
-*> orthonornmal rows that is defined as the product of k elementary
+*> orthonormal rows that is defined as the product of k elementary
 *> reflectors.
 *>
 *> Given the LQ factorization of an m-by-n matrix A, SLQT02 generates
diff --git a/lapack-netlib/TESTING/LIN/sptt01.f b/lapack-netlib/TESTING/LIN/sptt01.f
index fef18dd2b..5b33bfad5 100644
--- a/lapack-netlib/TESTING/LIN/sptt01.f
+++ b/lapack-netlib/TESTING/LIN/sptt01.f
@@ -35,7 +35,7 @@
 *
 *> \param[in] N
 *> \verbatim
-*>          N is INTEGTER
+*>          N is INTEGER
 *>          The order of the matrix A.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/LIN/sptt02.f b/lapack-netlib/TESTING/LIN/sptt02.f
index 51a6a074a..2bd5535aa 100644
--- a/lapack-netlib/TESTING/LIN/sptt02.f
+++ b/lapack-netlib/TESTING/LIN/sptt02.f
@@ -35,7 +35,7 @@
 *
 *> \param[in] N
 *> \verbatim
-*>          N is INTEGTER
+*>          N is INTEGER
 *>          The order of the matrix A.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/LIN/sqlt02.f b/lapack-netlib/TESTING/LIN/sqlt02.f
index 5d381b950..f26352eb3 100644
--- a/lapack-netlib/TESTING/LIN/sqlt02.f
+++ b/lapack-netlib/TESTING/LIN/sqlt02.f
@@ -27,7 +27,7 @@
 *> \verbatim
 *>
 *> SQLT02 tests SORGQL, which generates an m-by-n matrix Q with
-*> orthonornmal columns that is defined as the product of k elementary
+*> orthonormal columns that is defined as the product of k elementary
 *> reflectors.
 *>
 *> Given the QL factorization of an m-by-n matrix A, SQLT02 generates
diff --git a/lapack-netlib/TESTING/LIN/sqpt01.f b/lapack-netlib/TESTING/LIN/sqpt01.f
index de0c80e53..f53686a65 100644
--- a/lapack-netlib/TESTING/LIN/sqpt01.f
+++ b/lapack-netlib/TESTING/LIN/sqpt01.f
@@ -33,7 +33,8 @@
 *> Householder vectors, and the rest of AF contains a partially updated
 *> matrix.
 *>
-*> This function returns ||A*P - Q*R||/(||norm(A)||*eps*M)
+*> This function returns ||A*P - Q*R|| / ( ||A||*eps*max(M,N) )
+*> where || . || is the matrix one norm.
 *> \endverbatim
 *
 *  Arguments:
@@ -172,28 +173,28 @@
 *
       NORMA = SLANGE( 'One-norm', M, N, A, LDA, RWORK )
 *
-      DO 30 J = 1, K
-         DO 10 I = 1, MIN( J, M )
+      DO J = 1, K
+         DO I = 1, MIN( J, M )
             WORK( ( J-1 )*M+I ) = AF( I, J )
-   10    CONTINUE
-         DO 20 I = J + 1, M
+         END DO
+         DO I = J + 1, M
             WORK( ( J-1 )*M+I ) = ZERO
-   20    CONTINUE
-   30 CONTINUE
-      DO 40 J = K + 1, N
+         END DO
+      END DO
+      DO J = K + 1, N
          CALL SCOPY( M, AF( 1, J ), 1, WORK( ( J-1 )*M+1 ), 1 )
-   40 CONTINUE
+      END DO
 *
       CALL SORMQR( 'Left', 'No transpose', M, N, K, AF, LDA, TAU, WORK,
      $             M, WORK( M*N+1 ), LWORK-M*N, INFO )
 *
-      DO 50 J = 1, N
+      DO J = 1, N
 *
 *        Compare i-th column of QR and jpvt(i)-th column of A
 *
          CALL SAXPY( M, -ONE, A( 1, JPVT( J ) ), 1, WORK( ( J-1 )*M+1 ),
      $               1 )
-   50 CONTINUE
+      END DO
 *
       SQPT01 = SLANGE( 'One-norm', M, N, WORK, M, RWORK ) /
      $         ( REAL( MAX( M, N ) )*SLAMCH( 'Epsilon' ) )
diff --git a/lapack-netlib/TESTING/LIN/sqrt02.f b/lapack-netlib/TESTING/LIN/sqrt02.f
index 72163f0a9..44b9c6270 100644
--- a/lapack-netlib/TESTING/LIN/sqrt02.f
+++ b/lapack-netlib/TESTING/LIN/sqrt02.f
@@ -27,7 +27,7 @@
 *> \verbatim
 *>
 *> SQRT02 tests SORGQR, which generates an m-by-n matrix Q with
-*> orthonornmal columns that is defined as the product of k elementary
+*> orthonormal columns that is defined as the product of k elementary
 *> reflectors.
 *>
 *> Given the QR factorization of an m-by-n matrix A, SQRT02 generates
diff --git a/lapack-netlib/TESTING/LIN/sqrt11.f b/lapack-netlib/TESTING/LIN/sqrt11.f
index d4422dacb..a3753adcf 100644
--- a/lapack-netlib/TESTING/LIN/sqrt11.f
+++ b/lapack-netlib/TESTING/LIN/sqrt11.f
@@ -157,9 +157,9 @@
       CALL SORM2R( 'Left', 'Transpose', M, M, K, A, LDA, TAU, WORK, M,
      $             WORK( M*M+1 ), INFO )
 *
-      DO 10 J = 1, M
+      DO J = 1, M
          WORK( ( J-1 )*M+J ) = WORK( ( J-1 )*M+J ) - ONE
-   10 CONTINUE
+      END DO
 *
       SQRT11 = SLANGE( 'One-norm', M, M, WORK, M, RDUMMY ) /
      $         ( REAL( M )*SLAMCH( 'Epsilon' ) )
diff --git a/lapack-netlib/TESTING/LIN/sqrt12.f b/lapack-netlib/TESTING/LIN/sqrt12.f
index 2eab0ee0d..46b359e07 100644
--- a/lapack-netlib/TESTING/LIN/sqrt12.f
+++ b/lapack-netlib/TESTING/LIN/sqrt12.f
@@ -26,7 +26,7 @@
 *> SQRT12 computes the singular values `svlues' of the upper trapezoid
 *> of A(1:M,1:N) and returns the ratio
 *>
-*>      || s - svlues||/(||svlues||*eps*max(M,N))
+*>      || svlues - s ||/(||s||*eps*max(M,N))
 *> \endverbatim
 *
 *  Arguments:
@@ -113,8 +113,7 @@
       EXTERNAL           SASUM, SLAMCH, SLANGE, SNRM2
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SAXPY, SBDSQR, SGEBD2, SLABAD, SLASCL, SLASET,
-     $                   XERBLA
+      EXTERNAL           SAXPY, SBDSQR, SGEBD2, SLASCL, SLASET, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          MAX, MIN, REAL
@@ -145,17 +144,16 @@
 *     Copy upper triangle of A into work
 *
       CALL SLASET( 'Full', M, N, ZERO, ZERO, WORK, M )
-      DO 20 J = 1, N
-         DO 10 I = 1, MIN( J, M )
+      DO J = 1, N
+         DO I = 1, MIN( J, M )
             WORK( ( J-1 )*M+I ) = A( I, J )
-   10    CONTINUE
-   20 CONTINUE
+         END DO
+      END DO
 *
 *     Get machine parameters
 *
       SMLNUM = SLAMCH( 'S' ) / SLAMCH( 'P' )
       BIGNUM = ONE / SMLNUM
-      CALL SLABAD( SMLNUM, BIGNUM )
 *
 *     Scale work if max entry outside range [SMLNUM,BIGNUM]
 *
@@ -199,9 +197,9 @@
 *
       ELSE
 *
-         DO 30 I = 1, MN
+         DO I = 1, MN
             WORK( M*N+I ) = ZERO
-   30    CONTINUE
+         END DO
       END IF
 *
 *     Compare s and singular values of work
diff --git a/lapack-netlib/TESTING/LIN/srqt02.f b/lapack-netlib/TESTING/LIN/srqt02.f
index ca0594f7a..a33c98ba4 100644
--- a/lapack-netlib/TESTING/LIN/srqt02.f
+++ b/lapack-netlib/TESTING/LIN/srqt02.f
@@ -27,7 +27,7 @@
 *> \verbatim
 *>
 *> SRQT02 tests SORGRQ, which generates an m-by-n matrix Q with
-*> orthonornmal rows that is defined as the product of k elementary
+*> orthonormal rows that is defined as the product of k elementary
 *> reflectors.
 *>
 *> Given the RQ factorization of an m-by-n matrix A, SRQT02 generates
diff --git a/lapack-netlib/TESTING/LIN/srzt01.f b/lapack-netlib/TESTING/LIN/srzt01.f
index 4478c0506..a2b8b29ba 100644
--- a/lapack-netlib/TESTING/LIN/srzt01.f
+++ b/lapack-netlib/TESTING/LIN/srzt01.f
@@ -158,7 +158,7 @@
 *
 *     R = R * P(1) * ... *P(m)
 *
-      CALL SORMRZ( 'Right', 'No tranpose', M, N, M, N-M, AF, LDA, TAU,
+      CALL SORMRZ( 'Right', 'No transpose', M, N, M, N-M, AF, LDA, TAU,
      $             WORK, M, WORK( M*N+1 ), LWORK-M*N, INFO )
 *
 *     R = R - A
diff --git a/lapack-netlib/TESTING/LIN/ssyt01_3.f b/lapack-netlib/TESTING/LIN/ssyt01_3.f
index 147553db9..951fcb7d6 100644
--- a/lapack-netlib/TESTING/LIN/ssyt01_3.f
+++ b/lapack-netlib/TESTING/LIN/ssyt01_3.f
@@ -183,7 +183,7 @@
          RETURN
       END IF
 *
-*     a) Revert to multiplyers of L
+*     a) Revert to multipliers of L
 *
       CALL SSYCONVF_ROOK( UPLO, 'R', N, AFAC, LDAFAC, E, IPIV, INFO )
 *
diff --git a/lapack-netlib/TESTING/LIN/zchkaa.F b/lapack-netlib/TESTING/LIN/zchkaa.F
index a118515a5..f1020f2d8 100644
--- a/lapack-netlib/TESTING/LIN/zchkaa.F
+++ b/lapack-netlib/TESTING/LIN/zchkaa.F
@@ -69,6 +69,7 @@
 *> ZLQ    8               List types on next line if 0 < NTYPES <  8
 *> ZQL    8               List types on next line if 0 < NTYPES <  8
 *> ZQP    6               List types on next line if 0 < NTYPES <  6
+*> ZQK   19               List types on next line if 0 < NTYPES < 19
 *> ZTZ    3               List types on next line if 0 < NTYPES <  3
 *> ZLS    6               List types on next line if 0 < NTYPES <  6
 *> ZEQ
@@ -153,12 +154,11 @@
      $                   NBVAL( MAXIN ), NBVAL2( MAXIN ),
      $                   NSVAL( MAXIN ), NVAL( MAXIN ), NXVAL( MAXIN ),
      $                   RANKVAL( MAXIN ), PIV( NMAX )
-      DOUBLE PRECISION   S( 2*NMAX )
-      COMPLEX*16         E( NMAX )
-*
-*    .. Allocatable Arrays ..
+*     ..
+*     .. Allocatable Arrays ..
       INTEGER AllocateStatus
-      DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE::   RWORK
+      DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE::  RWORK, S
+      COMPLEX*16, DIMENSION(:), ALLOCATABLE :: E
       COMPLEX*16, DIMENSION(:,:), ALLOCATABLE::  A, B, WORK
 *     ..
 *     .. External Functions ..
@@ -170,15 +170,16 @@
       EXTERNAL           ALAREQ, ZCHKEQ, ZCHKGB, ZCHKGE, ZCHKGT, ZCHKHE,
      $                   ZCHKHE_ROOK, ZCHKHE_RK, ZCHKHE_AA, ZCHKHP,
      $                   ZCHKLQ, ZCHKUNHR_COL, ZCHKPB, ZCHKPO, ZCHKPS,
-     $                   ZCHKPP, ZCHKPT, ZCHKQ3, ZCHKQL, ZCHKQR, ZCHKRQ,
-     $                   ZCHKSP, ZCHKSY, ZCHKSY_ROOK, ZCHKSY_RK,
-     $                   ZCHKSY_AA, ZCHKTB, ZCHKTP, ZCHKTR, ZCHKTZ,
-     $                   ZDRVGB, ZDRVGE, ZDRVGT, ZDRVHE, ZDRVHE_ROOK,
-     $                   ZDRVHE_RK, ZDRVHE_AA, ZDRVHE_AA_2STAGE, ZDRVHP,
-     $                   ZDRVLS, ZDRVPB,  ZDRVPO, ZDRVPP, ZDRVPT,
-     $                   ZDRVSP, ZDRVSY, ZDRVSY_ROOK, ZDRVSY_RK,
-     $                   ZDRVSY_AA, ZDRVSY_AA_2STAGE, ILAVER, ZCHKQRT,
-     $                   ZCHKQRTP, ZCHKLQT, ZCHKLQTP, ZCHKTSQR
+     $                   ZCHKPP, ZCHKPT, ZCHKQ3, ZCHKQP3RK, ZCHKQL,
+     $                   ZCHKQR, ZCHKRQ, ZCHKSP, ZCHKSY, ZCHKSY_ROOK,
+     $                   ZCHKSY_RK, ZCHKSY_AA, ZCHKTB, ZCHKTP, ZCHKTR,
+     $                   ZCHKTZ, ZDRVGB, ZDRVGE, ZDRVGT, ZDRVHE,
+     $                   ZDRVHE_ROOK, ZDRVHE_RK, ZDRVHE_AA,
+     $                   ZDRVHE_AA_2STAGE, ZDRVHP, ZDRVLS, ZDRVPB,
+     $                   ZDRVPO, ZDRVPP, ZDRVPT, ZDRVSP, ZDRVSY,
+     $                   ZDRVSY_ROOK, ZDRVSY_RK, ZDRVSY_AA,
+     $                   ZDRVSY_AA_2STAGE, ILAVER, ZCHKQRT, ZCHKQRTP,
+     $                   ZCHKLQT, ZCHKLQTP, ZCHKTSQR
 *     ..
 *     .. Scalars in Common ..
       LOGICAL            LERR, OK
@@ -197,13 +198,18 @@
       DATA               THREQ / 2.0D0 / , INTSTR / '0123456789' /
 *
 *     .. Allocate memory dynamically ..
-      ALLOCATE (RWORK( 150*NMAX+2*MAXRHS ), STAT = AllocateStatus)
+*
+      ALLOCATE ( A ( (KDMAX+1) * NMAX, 7 ), STAT = AllocateStatus)
+      IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
+      ALLOCATE ( B ( NMAX * MAXRHS, 4 ), STAT = AllocateStatus)
+      IF (AllocateStatus /= 0 ) STOP "*** Not enough memory ***"
+      ALLOCATE ( WORK ( NMAX, NMAX+MAXRHS+10 ), STAT = AllocateStatus)
       IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
-      ALLOCATE (A ((KDMAX+1) * NMAX, 7), STAT = AllocateStatus)
+      ALLOCATE ( E( NMAX ), STAT = AllocateStatus )
       IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
-      ALLOCATE (B (NMAX * MAXRHS, 4), STAT = AllocateStatus)
+      ALLOCATE ( S( 2*NMAX ), STAT = AllocateStatus)
       IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
-      ALLOCATE (WORK (NMAX, NMAX+MAXRHS+10), STAT = AllocateStatus)
+      ALLOCATE ( RWORK( 150*NMAX+2*MAXRHS ), STAT = AllocateStatus)
       IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
 *     ..
 *     .. Executable Statements ..
@@ -1109,6 +1115,23 @@
          ELSE
             WRITE( NOUT, FMT = 9989 )PATH
          END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN
+*
+*        QK: truncated QR factorization with pivoting
+*
+         NTYPES = 19
+         CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT )
+*
+         IF( TSTCHK ) THEN
+            CALL ZCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL,
+     $                   NNB, NBVAL, NXVAL, THRESH, A( 1, 1 ),
+     $                   A( 1, 2 ), B( 1, 1 ), B( 1, 2 ),
+     $                   S( 1 ), B( 1, 4 ),
+     $                   WORK, RWORK, IWORK, NOUT )
+         ELSE
+            WRITE( NOUT, FMT = 9989 )PATH
+         END IF
 *
       ELSE IF( LSAMEN( 2, C2, 'LS' ) ) THEN
 *
@@ -1245,6 +1268,8 @@
 *
       DEALLOCATE (A, STAT = AllocateStatus)
       DEALLOCATE (B, STAT = AllocateStatus)
+      DEALLOCATE (E, STAT = AllocateStatus)
+      DEALLOCATE (S, STAT = AllocateStatus)
       DEALLOCATE (RWORK, STAT = AllocateStatus)
       DEALLOCATE (WORK,  STAT = AllocateStatus)
 *
diff --git a/lapack-netlib/TESTING/LIN/zchkhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/zchkhe_aa_2stage.f
index 381fac9f2..51082f1d0 100644
--- a/lapack-netlib/TESTING/LIN/zchkhe_aa_2stage.f
+++ b/lapack-netlib/TESTING/LIN/zchkhe_aa_2stage.f
@@ -8,7 +8,7 @@
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE ZCHKHE_AA_2STAGE( DOTYPE, NN, NVAL, NNB, NBVAL, 
+*       SUBROUTINE ZCHKHE_AA_2STAGE( DOTYPE, NN, NVAL, NNB, NBVAL,
 *                             NNS, NSVAL, THRESH, TSTERR, NMAX, A,
 *                             AFAC, AINV, B, X, XACT, WORK, RWORK,
 *                             IWORK, NOUT )
@@ -185,7 +185,8 @@
       LOGICAL            DOTYPE( * )
       INTEGER            IWORK( * ), NBVAL( * ), NSVAL( * ), NVAL( * )
       COMPLEX*16         A( * ), AFAC( * ), AINV( * ), B( * ),
-     $                   RWORK( * ), WORK( * ), X( * ), XACT( * )
+     $                   WORK( * ), X( * ), XACT( * )
+      DOUBLE PRECISION   RWORK( * )
 *     ..
 *
 *  =====================================================================
@@ -430,9 +431,9 @@
 *                 block factorization, LWORK is the length of AINV.
 *
                   SRNAMT = 'ZHETRF_AA_2STAGE'
-                  LWORK = MIN(N*NB, 3*NMAX*NMAX)
-                  CALL ZHETRF_AA_2STAGE( UPLO, N, AFAC, LDA, 
-     $                                   AINV, (3*NB+1)*N, 
+                  LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX )
+                  CALL ZHETRF_AA_2STAGE( UPLO, N, AFAC, LDA,
+     $                                   AINV, MAX( 1, (3*NB+1)*N ),
      $                                   IWORK, IWORK( 1+N ),
      $                                   WORK, LWORK,
      $                                   INFO )
diff --git a/lapack-netlib/TESTING/LIN/zchklq.f b/lapack-netlib/TESTING/LIN/zchklq.f
index 371bb946b..ccef7b803 100644
--- a/lapack-netlib/TESTING/LIN/zchklq.f
+++ b/lapack-netlib/TESTING/LIN/zchklq.f
@@ -235,7 +235,7 @@
       DOUBLE PRECISION   RESULT( NTESTS )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALAERH, ALAHD, ALASUM, XLAENV, ZERRLQ, ZGELQS,
+      EXTERNAL           ALAERH, ALAHD, ALASUM, XLAENV, ZERRLQ, ZGELS,
      $                   ZGET02, ZLACPY, ZLARHS, ZLATB4, ZLATMS, ZLQT01,
      $                   ZLQT02, ZLQT03
 *     ..
@@ -370,7 +370,7 @@
      $                               WORK, LWORK, RWORK, RESULT( 3 ) )
                         NT = NT + 4
 *
-*                       If M>=N and K=N, call ZGELQS to solve a system
+*                       If M<=N and K=M, call ZGELS to solve a system
 *                       with NRHS right hand sides and compute the
 *                       residual.
 *
@@ -387,14 +387,20 @@
 *
                            CALL ZLACPY( 'Full', M, NRHS, B, LDA, X,
      $                                  LDA )
-                           SRNAMT = 'ZGELQS'
-                           CALL ZGELQS( M, N, NRHS, AF, LDA, TAU, X,
-     $                                  LDA, WORK, LWORK, INFO )
 *
-*                          Check error code from ZGELQS.
+*                          Reset AF to the original matrix. ZGELS
+*                          factors the matrix before solving the system.
+*
+                           CALL ZLACPY( 'Full', M, N, A, LDA, AF, LDA )
+*
+                           SRNAMT = 'ZGELS'
+                           CALL ZGELS( 'No transpose', M, N, NRHS, AF,
+     $                                 LDA, X, LDA, WORK, LWORK, INFO )
+*
+*                          Check error code from ZGELS.
 *
                            IF( INFO.NE.0 )
-     $                        CALL ALAERH( PATH, 'ZGELQS', INFO, 0, ' ',
+     $                        CALL ALAERH( PATH, 'ZGELS', INFO, 0, 'N',
      $                                     M, N, NRHS, -1, NB, IMAT,
      $                                     NFAIL, NERRS, NOUT )
 *
diff --git a/lapack-netlib/TESTING/LIN/zchkqp3rk.f b/lapack-netlib/TESTING/LIN/zchkqp3rk.f
new file mode 100644
index 000000000..302c7b1a8
--- /dev/null
+++ b/lapack-netlib/TESTING/LIN/zchkqp3rk.f
@@ -0,0 +1,836 @@
+*> \brief \b ZCHKQP3RK
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE ZCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL,
+*     $                      NNB, NBVAL, NXVAL, THRESH, A, COPYA,
+*     $                      B, COPYB, S, TAU,
+*     $                      WORK, RWORK, IWORK, NOUT )
+*      IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       INTEGER            NM, NN, NNB, NNS, NOUT
+*       DOUBLE PRECISION   THRESH
+*       ..
+*       .. Array Arguments ..
+*       LOGICAL            DOTYPE( * )
+*       INTEGER            IWORK( * ), MVAL( * ), NBVAL( * ), NVAL( * ),
+*      $                   NSVAL( * ), NXVAL( * )
+*       DOUBLE PRECISION   S( * ), RWORK( * )
+*       COMPLEX*16         A( * ), COPYA( * ), B( * ), COPYB( * ),
+*      $                   TAU( * ), WORK( * )
+*       ..
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZCHKQP3RK tests ZGEQP3RK.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] DOTYPE
+*> \verbatim
+*>          DOTYPE is LOGICAL array, dimension (NTYPES)
+*>          The matrix types to be used for testing.  Matrices of type j
+*>          (for 1 <= j <= NTYPES) are used for testing if DOTYPE(j) =
+*>          .TRUE.; if DOTYPE(j) = .FALSE., then type j is not used.
+*> \endverbatim
+*>
+*> \param[in] NM
+*> \verbatim
+*>          NM is INTEGER
+*>          The number of values of M contained in the vector MVAL.
+*> \endverbatim
+*>
+*> \param[in] MVAL
+*> \verbatim
+*>          MVAL is INTEGER array, dimension (NM)
+*>          The values of the matrix row dimension M.
+*> \endverbatim
+*>
+*> \param[in] NN
+*> \verbatim
+*>          NN is INTEGER
+*>          The number of values of N contained in the vector NVAL.
+*> \endverbatim
+*>
+*> \param[in] NVAL
+*> \verbatim
+*>          NVAL is INTEGER array, dimension (NN)
+*>          The values of the matrix column dimension N.
+*> \endverbatim
+*>
+*> \param[in] NNS
+*> \verbatim
+*>          NNS is INTEGER
+*>          The number of values of NRHS contained in the vector NSVAL.
+*> \endverbatim
+*>
+*> \param[in] NSVAL
+*> \verbatim
+*>          NSVAL is INTEGER array, dimension (NNS)
+*>          The values of the number of right hand sides NRHS.
+*> \endverbatim
+*> \param[in] NNB
+*> \verbatim
+*>          NNB is INTEGER
+*>          The number of values of NB and NX contained in the
+*>          vectors NBVAL and NXVAL.  The blocking parameters are used
+*>          in pairs (NB,NX).
+*> \endverbatim
+*>
+*> \param[in] NBVAL
+*> \verbatim
+*>          NBVAL is INTEGER array, dimension (NNB)
+*>          The values of the blocksize NB.
+*> \endverbatim
+*>
+*> \param[in] NXVAL
+*> \verbatim
+*>          NXVAL is INTEGER array, dimension (NNB)
+*>          The values of the crossover point NX.
+*> \endverbatim
+*>
+*> \param[in] THRESH
+*> \verbatim
+*>          THRESH is DOUBLE PRECISION
+*>          The threshold value for the test ratios.  A result is
+*>          included in the output file if RESULT >= THRESH.  To have
+*>          every test ratio printed, use THRESH = 0.
+*> \endverbatim
+*>
+*> \param[out] A
+*> \verbatim
+*>          A is COMPLEX*16 array, dimension (MMAX*NMAX)
+*>          where MMAX is the maximum value of M in MVAL and NMAX is the
+*>          maximum value of N in NVAL.
+*> \endverbatim
+*>
+*> \param[out] COPYA
+*> \verbatim
+*>          COPYA is COMPLEX*16 array, dimension (MMAX*NMAX)
+*> \endverbatim
+*>
+*> \param[out] B
+*> \verbatim
+*>          B is COMPLEX*16 array, dimension (MMAX*NSMAX)
+*>          where MMAX is the maximum value of M in MVAL and NSMAX is the
+*>          maximum value of NRHS in NSVAL.
+*> \endverbatim
+*>
+*> \param[out] COPYB
+*> \verbatim
+*>          COPYB is COMPLEX*16 array, dimension (MMAX*NSMAX)
+*> \endverbatim
+*>
+*> \param[out] S
+*> \verbatim
+*>          S is DOUBLE PRECISION array, dimension
+*>                      (min(MMAX,NMAX))
+*> \endverbatim
+*>
+*> \param[out] TAU
+*> \verbatim
+*>          TAU is COMPLEX*16 array, dimension (MMAX)
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX*16 array, dimension
+*>                      (max(M*max(M,N) + 4*min(M,N) + max(M,N)))
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is DOUBLE PRECISION array, dimension (4*NMAX)
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (2*NMAX)
+*> \endverbatim
+*>
+*> \param[in] NOUT
+*> \verbatim
+*>          NOUT is INTEGER
+*>          The unit number for output.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup complex16_lin
+*
+*  =====================================================================
+      SUBROUTINE ZCHKQP3RK( DOTYPE, NM, MVAL, NN, NVAL, NNS, NSVAL,
+     $                      NNB, NBVAL, NXVAL, THRESH, A, COPYA,
+     $                      B, COPYB, S, TAU,
+     $                      WORK, RWORK, IWORK, NOUT )
+      IMPLICIT NONE
+*
+*  -- LAPACK test routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            NM, NN, NNB, NNS, NOUT
+      DOUBLE PRECISION   THRESH
+*     ..
+*     .. Array Arguments ..
+      LOGICAL            DOTYPE( * )
+      INTEGER            IWORK( * ), NBVAL( * ), MVAL( * ), NVAL( * ),
+     $                   NSVAL( * ), NXVAL( * )
+      DOUBLE PRECISION   S( * ), RWORK( * )
+      COMPLEX*16         A( * ), COPYA( * ), B( * ), COPYB( * ),
+     $                   TAU( * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            NTYPES
+      PARAMETER          ( NTYPES = 19 )
+      INTEGER            NTESTS
+      PARAMETER          ( NTESTS = 5 )
+      DOUBLE PRECISION   ONE, ZERO, BIGNUM
+      COMPLEX*16         CONE, CZERO
+      PARAMETER          ( ONE = 1.0D+0, ZERO = 0.0D+0,
+     $                   CZERO = ( 0.0D+0, 0.0D+0 ),
+     $                   CONE = ( 1.0D+0, 0.0D+0 ),
+     $                   BIGNUM = 1.0D+38 )
+*     ..
+*     .. Local Scalars ..
+      CHARACTER          DIST, TYPE
+      CHARACTER*3        PATH
+      INTEGER            I, IHIGH, ILOW, IM, IMAT, IN, INC_ZERO,
+     $                   INB, IND_OFFSET_GEN,
+     $                   IND_IN, IND_OUT, INS, INFO,
+     $                   ISTEP, J, J_INC, J_FIRST_NZ, JB_ZERO,
+     $                   KFACT, KL, KMAX, KU, LDA, LW, LWORK,
+     $                   LWORK_MQR, M, MINMN, MINMNB_GEN, MODE, N,
+     $                   NB, NB_ZERO, NERRS, NFAIL, NB_GEN, NRHS,
+     $                   NRUN, NX, T
+      DOUBLE PRECISION   ANORM, CNDNUM, EPS, ABSTOL, RELTOL,
+     $                   DTEMP, MAXC2NRMK, RELMAXC2NRMK
+*     ..
+*     .. Local Arrays ..
+      INTEGER            ISEED( 4 ), ISEEDY( 4 )
+      DOUBLE PRECISION   RESULT( NTESTS ), RDUMMY( 1 )
+*     ..
+*     .. External Functions ..
+      DOUBLE PRECISION   DLAMCH, ZQPT01, ZQRT11, ZQRT12, ZLANGE
+      EXTERNAL           DLAMCH, ZQPT01, ZQRT11, ZQRT12, ZLANGE
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ALAERH, ALAHD, ALASUM, DLAORD, ICOPY, ZAXPY,
+     $                   XLAENV, ZGEQP3RK, ZLACPY, ZLASET, ZLATB4,
+     $                   ZLATMS, ZUNMQR, ZSWAP
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, DBLE, MAX, MIN, MOD
+*     ..
+*     .. Scalars in Common ..
+      LOGICAL            LERR, OK
+      CHARACTER*32       SRNAMT
+      INTEGER            INFOT, IOUNIT, ZUNMQR_LWORK
+*     ..
+*     .. Common blocks ..
+      COMMON             / INFOC / INFOT, IOUNIT, OK, LERR
+      COMMON             / SRNAMC / SRNAMT
+*     ..
+*     .. Data statements ..
+      DATA               ISEEDY / 1988, 1989, 1990, 1991 /
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize constants and the random number seed.
+*
+      PATH( 1: 1 ) = 'Zomplex precision'
+      PATH( 2: 3 ) = 'QK'
+      NRUN = 0
+      NFAIL = 0
+      NERRS = 0
+      DO  I = 1, 4
+         ISEED( I ) = ISEEDY( I )
+      END DO
+      EPS = DLAMCH( 'Epsilon' )
+      INFOT = 0
+*
+      DO IM = 1, NM
+*
+*        Do for each value of M in MVAL.
+*
+         M = MVAL( IM )
+         LDA = MAX( 1, M )
+*
+         DO IN = 1, NN
+*
+*           Do for each value of N in NVAL.
+*
+            N = NVAL( IN )
+            MINMN = MIN( M, N )
+            LWORK = MAX( 1, M*MAX( M, N )+4*MINMN+MAX( M, N ),
+     $                   M*N + 2*MINMN + 4*N )
+*
+            DO INS = 1, NNS
+               NRHS = NSVAL( INS )
+*
+*                 Set up parameters with ZLATB4 and generate
+*                 M-by-NRHS B matrix with ZLATMS.
+*                 IMAT = 14:
+*                 Random matrix, CNDNUM = 2, NORM = ONE,
+*                 MODE = 3 (geometric distribution of singular values).
+*
+                  CALL ZLATB4( PATH, 14, M, NRHS, TYPE, KL, KU, ANORM,
+     $                         MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'ZLATMS'
+                  CALL ZLATMS( M, NRHS, DIST, ISEED, TYPE, S, MODE,
+     $                         CNDNUM, ANORM, KL, KU, 'No packing',
+     $                         COPYB, LDA, WORK, INFO )
+*
+*                 Check error code from ZLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'ZLATMS', INFO, 0, ' ', M,
+     $                            NRHS, -1, -1, -1, 6, NFAIL, NERRS,
+     $                            NOUT )
+                     CYCLE
+                  END IF
+*
+               DO IMAT = 1, NTYPES
+*
+*              Do the tests only if DOTYPE( IMAT ) is true.
+*
+               IF( .NOT.DOTYPE( IMAT ) )
+     $            CYCLE
+*
+*              The type of distribution used to generate the random
+*              eigen-/singular values:
+*              ( 'S' for symmetric distribution ) => UNIFORM( -1, 1 )
+*
+*           Do for each type of NON-SYMMETRIC matrix:                               CNDNUM                     NORM                                     MODE
+*            1. Zero matrix
+*            2. Random, Diagonal, CNDNUM = 2                                        CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            3. Random, Upper triangular, CNDNUM = 2                                CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            4. Random, Lower triangular, CNDNUM = 2                                CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            5. Random, First column is zero, CNDNUM = 2                            CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            6. Random, Last MINMN column is zero, CNDNUM = 2                       CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            7. Random, Last N column is zero, CNDNUM = 2                           CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            8. Random, Middle column in MINMN is zero, CNDNUM = 2                  CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*            9. Random, First half of MINMN columns are zero, CNDNUM = 2            CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           10. Random, Last columns are zero starting from MINMN/2+1, CNDNUM = 2   CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           11. Random, Half MINMN columns in the middle are zero starting
+*                  from  MINMN/2-(MINMN/2)/2+1, CNDNUM = 2                          CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           12. Random, Odd columns are ZERO, CNDNUM = 2                            CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           13. Random, Even columns are ZERO, CNDNUM = 2                           CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           14. Random, CNDNUM = 2                                                  CNDNUM = 2                      ONE                                      3 ( geometric distribution of singular values )
+*           15. Random, CNDNUM = sqrt(0.1/EPS)                                      CNDNUM = BADC1 = sqrt(0.1/EPS)  ONE                                      3 ( geometric distribution of singular values )
+*           16. Random, CNDNUM = 0.1/EPS                                            CNDNUM = BADC2 = 0.1/EPS        ONE                                      3 ( geometric distribution of singular values )
+*           17. Random, CNDNUM = 0.1/EPS,                                           CNDNUM = BADC2 = 0.1/EPS        ONE                                      2 ( one small singular value, S(N)=1/CNDNUM )
+*                 one small singular value S(N)=1/CNDNUM
+*           18. Random, CNDNUM = 2, scaled near underflow                           CNDNUM = 2                      SMALL = SAFMIN
+*           19. Random, CNDNUM = 2, scaled near overflow                            CNDNUM = 2                      LARGE = 1.0/( 0.25 * ( SAFMIN / EPS ) )  3 ( geometric distribution of singular values )
+*
+               IF( IMAT.EQ.1 ) THEN
+*
+*                 Matrix 1: Zero matrix
+*
+                  CALL ZLASET( 'Full', M, N, CZERO, CZERO, COPYA, LDA )
+                  DO I = 1, MINMN
+                     S( I ) = ZERO
+                  END DO
+*
+               ELSE IF( (IMAT.GE.2 .AND. IMAT.LE.4 )
+     $                  .OR. (IMAT.GE.14 .AND. IMAT.LE.19 ) ) THEN
+*
+*                 Matrices 2-4 and 14-19.
+*
+*                 Set up parameters with ZLATB4 and generate a test
+*                 matrix with ZLATMS.
+*
+                  CALL ZLATB4( PATH, IMAT, M, N, TYPE, KL, KU, ANORM,
+     $                         MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'ZLATMS'
+                  CALL ZLATMS( M, N, DIST, ISEED, TYPE, S, MODE,
+     $                         CNDNUM, ANORM, KL, KU, 'No packing',
+     $                         COPYA, LDA, WORK, INFO )
+*
+*                 Check error code from ZLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'ZLATMS', INFO, 0, ' ', M, N,
+     $                            -1, -1, -1, IMAT, NFAIL, NERRS,
+     $                            NOUT )
+                     CYCLE
+                  END IF
+*
+                  CALL DLAORD( 'Decreasing', MINMN, S, 1 )
+*
+               ELSE IF( MINMN.GE.2
+     $                  .AND. IMAT.GE.5 .AND. IMAT.LE.13 ) THEN
+*
+*                 Rectangular matrices 5-13 that contain zero columns,
+*                 only for matrices MINMN >=2.
+*
+*                 JB_ZERO is the column index of ZERO block.
+*                 NB_ZERO is the column block size of ZERO block.
+*                 NB_GEN is the column block size of the
+*                 generated block.
+*                 J_INC is the non-zero column index increment
+*                 for matrices 12 and 13.
+*                 J_FIRST_NZ is the index of the first non-zero
+*                 column.
+*
+                  IF( IMAT.EQ.5 ) THEN
+*
+*                    First column is zero.
+*
+                     JB_ZERO = 1
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.6 ) THEN
+*
+*                    Last column MINMN is zero.
+*
+                     JB_ZERO = MINMN
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.7 ) THEN
+*
+*                    Last column N is zero.
+*
+                     JB_ZERO = N
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.8 ) THEN
+*
+*                    Middle column in MINMN is zero.
+*
+                     JB_ZERO = MINMN / 2 + 1
+                     NB_ZERO = 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.9 ) THEN
+*
+*                    First half of MINMN columns is zero.
+*
+                     JB_ZERO = 1
+                     NB_ZERO = MINMN / 2
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.10 ) THEN
+*
+*                    Last columns are zero columns,
+*                    starting from (MINMN / 2 + 1) column.
+*
+                     JB_ZERO = MINMN / 2 + 1
+                     NB_ZERO = N - JB_ZERO + 1
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.11 ) THEN
+*
+*                    Half of the columns in the middle of MINMN
+*                    columns is zero, starting from
+*                    MINMN/2 - (MINMN/2)/2 + 1 column.
+*
+                     JB_ZERO = MINMN / 2 - (MINMN / 2) / 2 + 1
+                     NB_ZERO = MINMN / 2
+                     NB_GEN = N - NB_ZERO
+*
+                  ELSE IF( IMAT.EQ.12 ) THEN
+*
+*                    Odd-numbered columns are zero,
+*
+                     NB_GEN = N / 2
+                     NB_ZERO = N - NB_GEN
+                     J_INC = 2
+                     J_FIRST_NZ = 2
+*
+                  ELSE IF( IMAT.EQ.13 ) THEN
+*
+*                    Even-numbered columns are zero.
+*
+                     NB_ZERO = N / 2
+                     NB_GEN = N - NB_ZERO
+                     J_INC = 2
+                     J_FIRST_NZ = 1
+*
+                  END IF
+*
+*
+*                 1) Set the first NB_ZERO columns in COPYA(1:M,1:N)
+*                    to zero.
+*
+                  CALL ZLASET( 'Full', M, NB_ZERO, CZERO, CZERO,
+     $                         COPYA, LDA )
+*
+*                    2) Generate an M-by-(N-NB_ZERO) matrix with the
+*                       chosen singular value distribution
+*                       in COPYA(1:M,NB_ZERO+1:N).
+*
+                  CALL ZLATB4( PATH, IMAT, M, NB_GEN, TYPE, KL, KU,
+     $                         ANORM, MODE, CNDNUM, DIST )
+*
+                  SRNAMT = 'ZLATMS'
+*
+                  IND_OFFSET_GEN = NB_ZERO * LDA
+*
+                  CALL ZLATMS( M, NB_GEN, DIST, ISEED, TYPE, S, MODE,
+     $                        CNDNUM, ANORM, KL, KU, 'No packing',
+     $                        COPYA( IND_OFFSET_GEN + 1 ), LDA,
+     $                        WORK, INFO )
+*
+*                 Check error code from ZLATMS.
+*
+                  IF( INFO.NE.0 ) THEN
+                     CALL ALAERH( PATH, 'ZLATMS', INFO, 0, ' ', M,
+     $                            NB_GEN, -1, -1, -1, IMAT, NFAIL,
+     $                            NERRS, NOUT )
+                     CYCLE
+                  END IF
+*
+*                 3) Swap the generated columns from the right side
+*                 NB_GEN-size block in COPYA into correct column
+*                 positions.
+*
+                  IF( IMAT.EQ.6
+     $                    .OR. IMAT.EQ.7
+     $                    .OR. IMAT.EQ.8
+     $                    .OR. IMAT.EQ.10
+     $                    .OR. IMAT.EQ.11 ) THEN
+*
+*                    Move by swapping the generated columns
+*                    from the right NB_GEN-size block from
+*                    (NB_ZERO+1:NB_ZERO+JB_ZERO-1)
+*                    into columns (1:JB_ZERO-1).
+*
+                     DO J = 1, JB_ZERO-1, 1
+                        CALL ZSWAP( M,
+     $                        COPYA( ( NB_ZERO+J-1)*LDA+1), 1,
+     $                        COPYA( (J-1)*LDA + 1 ), 1 )
+                     END DO
+*
+                  ELSE IF( IMAT.EQ.12 .OR. IMAT.EQ.13 ) THEN
+*
+*                    ( IMAT = 12, Odd-numbered ZERO columns. )
+*                    Swap the generated columns from the right
+*                    NB_GEN-size block into the even zero columns in the
+*                    left NB_ZERO-size block.
+*
+*                    ( IMAT = 13, Even-numbered ZERO columns. )
+*                    Swap the generated columns from the right
+*                    NB_GEN-size block into the odd zero columns in the
+*                    left NB_ZERO-size block.
+*
+                     DO J = 1, NB_GEN, 1
+                        IND_OUT = ( NB_ZERO+J-1 )*LDA + 1
+                        IND_IN = ( J_INC*(J-1)+(J_FIRST_NZ-1) )*LDA
+     $                            + 1
+                        CALL ZSWAP( M,
+     $                              COPYA( IND_OUT ), 1,
+     $                              COPYA( IND_IN), 1 )
+                        END DO
+*
+                  END IF
+*
+*                 5) Order the singular values generated by
+*                    ZLATMS in decreasing order and add trailing zeros
+*                    that correspond to zero columns.
+*                    The total number of singular values is MINMN.
+*
+                  MINMNB_GEN = MIN( M, NB_GEN )
+*
+                  CALL DLAORD( 'Decreasing', MINMNB_GEN, S, 1 )
+
+                  DO I = MINMNB_GEN+1, MINMN
+                     S( I ) = ZERO
+                  END DO
+*
+               ELSE
+*
+*                    IF(MINMN.LT.2) skip this size for this matrix type.
+*
+                     CYCLE
+               END IF
+*
+*              Initialize a copy array for a pivot array for ZGEQP3RK.
+*
+               DO I = 1, N
+                  IWORK( I ) = 0
+               END DO
+*
+               DO INB = 1, NNB
+*
+*                 Do for each pair of values (NB,NX) in NBVAL and NXVAL.
+*
+                  NB = NBVAL( INB )
+                  CALL XLAENV( 1, NB )
+                  NX = NXVAL( INB )
+                  CALL XLAENV( 3, NX )
+*
+*                 We do MIN(M,N)+1 because we need a test for KMAX > N,
+*                 when KMAX is larger than MIN(M,N), KMAX should be
+*                 KMAX = MIN(M,N)
+*
+                  DO KMAX = 0, MIN(M,N)+1
+*
+*                 Get a working copy of COPYA into A( 1:M,1:N ).
+*                 Get a working copy of COPYB into A( 1:M, (N+1):(N+NRHS) ).
+*                 Get a working copy of COPYB into B( 1:M, 1:NRHS ).
+*                 Get a working copy of IWORK(1:N) with zeroes into
+*                 IWORK( N+1:2N ), which is going to be used as pivot array.
+*                 NOTE: IWORK(2N+1:3N) is going to be used as a WORK array
+*                 for the routine.
+*
+                  CALL ZLACPY( 'All', M, N, COPYA, LDA, A, LDA )
+                  CALL ZLACPY( 'All', M, NRHS, COPYB, LDA,
+     $                         A( LDA*N + 1 ),  LDA )
+                  CALL ZLACPY( 'All', M, NRHS, COPYB, LDA,
+     $                         B,  LDA )
+                  CALL ICOPY( N, IWORK( 1 ), 1, IWORK( N+1 ), 1 )
+*
+                  ABSTOL = -1.0
+                  RELTOl = -1.0
+*
+*                 Compute the QR factorization with pivoting of A
+*
+                  LW = MAX( 1, MAX( 2*N + NB*( N+NRHS+1 ),
+     $                              3*N + NRHS - 1 ) )
+*
+*                 Compute ZGEQP3RK factorization of A.
+*
+                  SRNAMT = 'ZGEQP3RK'
+                  CALL ZGEQP3RK( M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                           A, LDA, KFACT, MAXC2NRMK,
+     $                           RELMAXC2NRMK, IWORK( N+1 ), TAU,
+     $                           WORK, LW, RWORK, IWORK( 2*N+1 ),
+     $                           INFO )
+*
+*                 Check error code from ZGEQP3RK.
+*
+                  IF( INFO.LT.0 )
+     $               CALL ALAERH( PATH, 'ZGEQP3RK', INFO, 0, ' ',
+     $                            M, N, NX, -1, NB, IMAT,
+     $                            NFAIL, NERRS, NOUT )
+*
+                  IF( KFACT.EQ.MINMN ) THEN
+*
+*                 Compute test 1:
+*
+*                 This test is only for the full rank factorization of
+*                 the matrix A.
+*
+*                 Array S(1:min(M,N)) contains svd(A) the singular values
+*                 of the original matrix A in decreasing absolute value
+*                 order. The test computes svd(R), the vector of singular
+*                 values of the upper trapezoid of A(1:M,1:N) that
+*                 contains the factor R, in decreasing order. The test
+*                 returns the ratio:
+*
+*                 2-norm(svd(R) - svd(A)) / ( max(M,N) * 2-norm(svd(A)) * EPS )
+*
+                     RESULT( 1 ) = ZQRT12( M, N, A, LDA, S, WORK,
+     $                                     LWORK , RWORK )
+*
+                     DO T = 1, 1
+                        IF( RESULT( T ).GE.THRESH ) THEN
+                           IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                        CALL ALAHD( NOUT, PATH )
+                           WRITE( NOUT, FMT = 9999 ) 'ZGEQP3RK', M, N,
+     $                        NRHS, KMAX, ABSTOL, RELTOL, NB, NX,
+     $                        IMAT, T, RESULT( T )
+                           NFAIL = NFAIL + 1
+                        END IF
+                     END DO
+                     NRUN = NRUN + 1
+*
+*                   End test 1
+*
+                  END IF
+
+*                 Compute test 2:
+*
+*                 The test returns the ratio:
+*
+*                 1-norm( A*P - Q*R ) / ( max(M,N) * 1-norm(A) * EPS )
+*
+                  RESULT( 2 ) = ZQPT01( M, N, KFACT, COPYA, A, LDA, TAU,
+     $                          IWORK( N+1 ), WORK, LWORK )
+*
+*                 Compute test 3:
+*
+*                 The test returns the ratio:
+*
+*                 1-norm( Q**T * Q - I ) / ( M * EPS )
+*
+                  RESULT( 3 ) = ZQRT11( M, KFACT, A, LDA, TAU, WORK,
+     $                          LWORK )
+*
+*                 Print information about the tests that did not pass
+*                 the threshold.
+*
+                  DO T = 2, 3
+                     IF( RESULT( T ).GE.THRESH ) THEN
+                        IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                     CALL ALAHD( NOUT, PATH )
+                        WRITE( NOUT, FMT = 9999 ) 'ZGEQP3RK', M, N,
+     $                      NRHS, KMAX, ABSTOL, RELTOL,
+     $                      NB, NX, IMAT, T, RESULT( T )
+                        NFAIL = NFAIL + 1
+                     END IF
+                  END DO
+                  NRUN = NRUN + 2
+*
+*                 Compute test 4:
+*
+*                 This test is only for the factorizations with the
+*                 rank greater than or equal to 2.
+*                 The elements on the diagonal of R should be non-
+*                 increasing.
+*
+*                 The test returns the ratio:
+*
+*                 Returns 1.0D+100 if abs(R(K+1,K+1)) > abs(R(K,K)),
+*                 K=1:KFACT-1
+*
+                  IF( MIN(KFACT, MINMN).GE.2 ) THEN
+*
+                     DO J = 1, KFACT-1, 1
+*
+                        DTEMP = (( ABS( A( (J-1)*M+J ) ) -
+     $                          ABS( A( (J)*M+J+1 ) ) ) /
+     $                          ABS( A(1) ) )
+*
+                        IF( DTEMP.LT.ZERO ) THEN
+                           RESULT( 4 ) = BIGNUM
+                        END IF
+*
+                     END DO
+*
+*                    Print information about the tests that did not
+*                    pass the threshold.
+*
+                     DO T = 4, 4
+                        IF( RESULT( T ).GE.THRESH ) THEN
+                           IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                        CALL ALAHD( NOUT, PATH )
+                           WRITE( NOUT, FMT = 9999 ) 'ZGEQP3RK',
+     $                        M, N, NRHS, KMAX, ABSTOL, RELTOL,
+     $                        NB, NX, IMAT, T,
+     $                        RESULT( T )
+                           NFAIL = NFAIL + 1
+                        END IF
+                     END DO
+                     NRUN = NRUN + 1
+*
+*                    End test 4.
+*
+                  END IF
+*
+*                 Compute test 5:
+*
+*                 This test is only for matrix A with min(M,N) > 0.
+*
+*                 The test returns the ratio:
+*
+*                 1-norm(Q**T * B - Q**T * B ) /
+*                       ( M * EPS )
+*
+*                 (1) Compute B:=Q**T * B in the matrix B.
+*
+                  IF( MINMN.GT.0 ) THEN
+*
+                     LWORK_MQR = MAX(1, NRHS)
+                     CALL ZUNMQR( 'Left', 'Conjugate transpose',
+     $                         M, NRHS, KFACT, A, LDA, TAU, B, LDA,
+     $                         WORK, LWORK_MQR, INFO )
+*
+                     DO I = 1, NRHS
+*
+*                       Compare (N+I)-th column of A and I-th column of B.
+*
+                        CALL ZAXPY( M, -CONE, A( ( N+I-1 )*LDA+1 ), 1,
+     $                                    B( ( I-1 )*LDA+1 ), 1 )
+                     END DO
+*
+                     RESULT( 5 ) =
+     $               ABS(
+     $               ZLANGE( 'One-norm', M, NRHS, B, LDA, RDUMMY ) /
+     $               ( DBLE( M )*DLAMCH( 'Epsilon' ) )
+     $               )
+*
+*                    Print information about the tests that did not pass
+*                    the threshold.
+*
+                     DO T = 5, 5
+                        IF( RESULT( T ).GE.THRESH ) THEN
+                           IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                        CALL ALAHD( NOUT, PATH )
+                           WRITE( NOUT, FMT = 9999 ) 'ZGEQP3RK', M, N,
+     $                        NRHS, KMAX, ABSTOL, RELTOL,
+     $                        NB, NX, IMAT, T, RESULT( T )
+                           NFAIL = NFAIL + 1
+                        END IF
+                     END DO
+                     NRUN = NRUN + 1
+*
+*                    End compute test 5.
+*
+                  END IF
+*
+*                 END DO for KMAX = 0, MIN(M,N)+1
+*
+                  END DO
+*
+*                 END DO for INB = 1, NNB
+*
+               END DO
+*
+*              END DO  for IMAT = 1, NTYPES
+*
+               END DO
+*
+*              END DO for INS = 1, NNS
+*
+            END DO
+*
+*           END DO for IN = 1, NN
+*
+         END DO
+*
+*        END DO for IM = 1, NM
+*
+      END DO
+*
+*     Print a summary of the results.
+*
+      CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS )
+*
+ 9999 FORMAT( 1X, A, ' M =', I5, ', N =', I5, ', NRHS =', I5,
+     $        ', KMAX =', I5, ', ABSTOL =', G12.5,
+     $        ', RELTOL =', G12.5, ', NB =', I4, ', NX =', I4,
+     $        ', type ', I2, ', test ', I2, ', ratio =', G12.5 )
+*
+*     End of ZCHKQP3RK
+*
+      END
diff --git a/lapack-netlib/TESTING/LIN/zchkqr.f b/lapack-netlib/TESTING/LIN/zchkqr.f
index a240d2da5..c088bacc9 100644
--- a/lapack-netlib/TESTING/LIN/zchkqr.f
+++ b/lapack-netlib/TESTING/LIN/zchkqr.f
@@ -244,7 +244,7 @@
       EXTERNAL           ZGENND
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALAERH, ALAHD, ALASUM, XLAENV, ZERRQR, ZGEQRS,
+      EXTERNAL           ALAERH, ALAHD, ALASUM, XLAENV, ZERRQR, ZGELS,
      $                   ZGET02, ZLACPY, ZLARHS, ZLATB4, ZLATMS, ZQRT01,
      $                   ZQRT01P, ZQRT02, ZQRT03
 *     ..
@@ -388,7 +388,7 @@
      $                               WORK, LWORK, RWORK, RESULT( 3 ) )
                         NT = NT + 4
 *
-*                       If M>=N and K=N, call ZGEQRS to solve a system
+*                       If M>=N and K=N, call ZGELS to solve a system
 *                       with NRHS right hand sides and compute the
 *                       residual.
 *
@@ -405,14 +405,20 @@
 *
                            CALL ZLACPY( 'Full', M, NRHS, B, LDA, X,
      $                                  LDA )
-                           SRNAMT = 'ZGEQRS'
-                           CALL ZGEQRS( M, N, NRHS, AF, LDA, TAU, X,
-     $                                  LDA, WORK, LWORK, INFO )
 *
-*                          Check error code from ZGEQRS.
+*                          Reset AF to the original matrix. ZGELS
+*                          factors the matrix before solving the system.
+*
+                           CALL ZLACPY( 'Full', M, N, A, LDA, AF, LDA )
+*
+                           SRNAMT = 'ZGELS'
+                           CALL ZGELS( 'No transpose', M, N, NRHS, AF,
+     $                                 LDA, X, LDA, WORK, LWORK, INFO )
+*
+*                          Check error code from ZGELS.
 *
                            IF( INFO.NE.0 )
-     $                        CALL ALAERH( PATH, 'ZGEQRS', INFO, 0, ' ',
+     $                        CALL ALAERH( PATH, 'ZGELS', INFO, 0, 'N',
      $                                     M, N, NRHS, -1, NB, IMAT,
      $                                     NFAIL, NERRS, NOUT )
 *
diff --git a/lapack-netlib/TESTING/LIN/zchktp.f b/lapack-netlib/TESTING/LIN/zchktp.f
index 1798c24e7..ab46f5ce5 100644
--- a/lapack-netlib/TESTING/LIN/zchktp.f
+++ b/lapack-netlib/TESTING/LIN/zchktp.f
@@ -87,7 +87,7 @@
 *> \verbatim
 *>          NMAX is INTEGER
 *>          The leading dimension of the work arrays.  NMAX >= the
-*>          maximumm value of N in NVAL.
+*>          maximum value of N in NVAL.
 *> \endverbatim
 *>
 *> \param[out] AP
diff --git a/lapack-netlib/TESTING/LIN/zchktr.f b/lapack-netlib/TESTING/LIN/zchktr.f
index 275ca2857..4af538124 100644
--- a/lapack-netlib/TESTING/LIN/zchktr.f
+++ b/lapack-netlib/TESTING/LIN/zchktr.f
@@ -201,7 +201,8 @@
 *     .. Local Arrays ..
       CHARACTER          TRANSS( NTRAN ), UPLOS( 2 )
       INTEGER            ISEED( 4 ), ISEEDY( 4 )
-      DOUBLE PRECISION   RESULT( NTESTS ), SCALE3( 2 )
+      DOUBLE PRECISION   RESULT( NTESTS ), RWORK2( 2*NMAX), 
+     $                   SCALE3( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -544,8 +545,8 @@
                   CALL ZCOPY( N, X, 1, B( N+1 ), 1 )
                   CALL ZDSCAL( N, BIGNUM, B( N+1 ), 1 )
                   CALL ZLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA,
-     $                          B, MAX(1, N), SCALE3, RWORK, WORK, NMAX,
-     $                          INFO )
+     $                          B, MAX(1, N), SCALE3, RWORK, RWORK2,
+     $                          2*NMAX, INFO )
 *
 *                 Check error code from ZLATRS3.
 *
diff --git a/lapack-netlib/TESTING/LIN/zdrvab.f b/lapack-netlib/TESTING/LIN/zdrvab.f
index 130515959..772eb08af 100644
--- a/lapack-netlib/TESTING/LIN/zdrvab.f
+++ b/lapack-netlib/TESTING/LIN/zdrvab.f
@@ -348,7 +348,7 @@
                CALL ZGET08( TRANS, N, N, NRHS, A, LDA, X, LDA, WORK,
      $                      LDA, RWORK, RESULT( 1 ) )
 *
-*              Check if the test passes the tesing.
+*              Check if the test passes the testing.
 *              Print information about the tests that did not
 *              pass the testing.
 *
diff --git a/lapack-netlib/TESTING/LIN/zdrvac.f b/lapack-netlib/TESTING/LIN/zdrvac.f
index 20f8eb1e5..bbf73a263 100644
--- a/lapack-netlib/TESTING/LIN/zdrvac.f
+++ b/lapack-netlib/TESTING/LIN/zdrvac.f
@@ -367,7 +367,7 @@
                   CALL ZPOT06( UPLO, N, NRHS, A, LDA, X, LDA, WORK,
      $               LDA, RWORK, RESULT( 1 ) )
 *
-*                 Check if the test passes the tesing.
+*                 Check if the test passes the testing.
 *                 Print information about the tests that did not
 *                 pass the testing.
 *
diff --git a/lapack-netlib/TESTING/LIN/zdrvhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/zdrvhe_aa_2stage.f
index 9401867e0..fcd774491 100644
--- a/lapack-netlib/TESTING/LIN/zdrvhe_aa_2stage.f
+++ b/lapack-netlib/TESTING/LIN/zdrvhe_aa_2stage.f
@@ -400,9 +400,9 @@
 *                    Factor the matrix and solve the system using ZHESV_AA.
 *
                      SRNAMT = 'ZHESV_AA_2STAGE '
-                     LWORK = MIN(N*NB, 3*NMAX*NMAX)
+                     LWORK = MIN( MAX( 1, N*NB ), 3*NMAX*NMAX )
                      CALL ZHESV_AA_2STAGE( UPLO, N, NRHS, AFAC, LDA,
-     $                                 AINV, (3*NB+1)*N, 
+     $                                 AINV, MAX( 1, (3*NB+1)*N ),
      $                                 IWORK, IWORK( 1+N ),
      $                                 X, LDA, WORK, LWORK, INFO )
 *
diff --git a/lapack-netlib/TESTING/LIN/zdrvsy_aa_2stage.f b/lapack-netlib/TESTING/LIN/zdrvsy_aa_2stage.f
index 21497477c..5406a76bf 100644
--- a/lapack-netlib/TESTING/LIN/zdrvsy_aa_2stage.f
+++ b/lapack-netlib/TESTING/LIN/zdrvsy_aa_2stage.f
@@ -229,7 +229,7 @@
 *     Test path
 *
       PATH( 1: 1 ) = 'Zomplex precision'
-      PATH( 2: 3 ) = 'H2'
+      PATH( 2: 3 ) = 'S2'
 *
 *     Path to generate matrices
 *
diff --git a/lapack-netlib/TESTING/LIN/zerrhe.f b/lapack-netlib/TESTING/LIN/zerrhe.f
index e49e5037c..40dd25d75 100644
--- a/lapack-netlib/TESTING/LIN/zerrhe.f
+++ b/lapack-netlib/TESTING/LIN/zerrhe.f
@@ -135,7 +135,7 @@
       IF( LSAMEN( 2, C2, 'HE' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a Hermitian indefinite matrix with patrial
+*        of a Hermitian indefinite matrix with partial
 *        (Bunch-Kaufman) diagonal pivoting method.
 *
 *        ZHETRF
@@ -580,7 +580,7 @@
       ELSE IF( LSAMEN( 2, C2, 'HP' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a Hermitian indefinite packed matrix with patrial
+*        of a Hermitian indefinite packed matrix with partial
 *        (Bunch-Kaufman) diagonal pivoting method.
 *
 *        ZHPTRF
diff --git a/lapack-netlib/TESTING/LIN/zerrhex.f b/lapack-netlib/TESTING/LIN/zerrhex.f
index b6e8b77ef..bdcdefff7 100644
--- a/lapack-netlib/TESTING/LIN/zerrhex.f
+++ b/lapack-netlib/TESTING/LIN/zerrhex.f
@@ -138,7 +138,7 @@
       OK = .TRUE.
 *
 *     Test error exits of the routines that use factorization
-*     of a Hermitian indefinite matrix with patrial
+*     of a Hermitian indefinite matrix with partial
 *     (Bunch-Kaufman) diagonal pivoting method.
 *
       IF( LSAMEN( 2, C2, 'HE' ) ) THEN
@@ -526,7 +526,7 @@
       ELSE IF( LSAMEN( 2, C2, 'HP' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a Hermitian indefinite packed matrix with patrial
+*        of a Hermitian indefinite packed matrix with partial
 *        (Bunch-Kaufman) diagonal pivoting method.
 *
 *        ZHPTRF
diff --git a/lapack-netlib/TESTING/LIN/zerrlq.f b/lapack-netlib/TESTING/LIN/zerrlq.f
index d8e5a8fe8..d91b4e4b3 100644
--- a/lapack-netlib/TESTING/LIN/zerrlq.f
+++ b/lapack-netlib/TESTING/LIN/zerrlq.f
@@ -76,7 +76,7 @@
      $                   W( NMAX ), X( NMAX )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ALAESM, CHKXER, ZGELQ2, ZGELQF, ZGELQS, ZUNGL2,
+      EXTERNAL           ALAESM, CHKXER, ZGELQ2, ZGELQF, ZUNGL2,
      $                   ZUNGLQ, ZUNML2, ZUNMLQ
 *     ..
 *     .. Scalars in Common ..
@@ -142,31 +142,6 @@
       CALL ZGELQ2( 2, 1, A, 1, B, W, INFO )
       CALL CHKXER( 'ZGELQ2', INFOT, NOUT, LERR, OK )
 *
-*     ZGELQS
-*
-      SRNAMT = 'ZGELQS'
-      INFOT = 1
-      CALL ZGELQS( -1, 0, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'ZGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 2
-      CALL ZGELQS( 0, -1, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'ZGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 2
-      CALL ZGELQS( 2, 1, 0, A, 2, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'ZGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 3
-      CALL ZGELQS( 0, 0, -1, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'ZGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 5
-      CALL ZGELQS( 2, 2, 0, A, 1, X, B, 2, W, 1, INFO )
-      CALL CHKXER( 'ZGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 8
-      CALL ZGELQS( 1, 2, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'ZGELQS', INFOT, NOUT, LERR, OK )
-      INFOT = 10
-      CALL ZGELQS( 1, 1, 2, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'ZGELQS', INFOT, NOUT, LERR, OK )
-*
 *     ZUNGLQ
 *
       SRNAMT = 'ZUNGLQ'
diff --git a/lapack-netlib/TESTING/LIN/zerrqr.f b/lapack-netlib/TESTING/LIN/zerrqr.f
index 114453d4c..3542c7a04 100644
--- a/lapack-netlib/TESTING/LIN/zerrqr.f
+++ b/lapack-netlib/TESTING/LIN/zerrqr.f
@@ -77,7 +77,7 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           ALAESM, CHKXER, ZGEQR2, ZGEQR2P, ZGEQRF,
-     $                   ZGEQRFP, ZGEQRS, ZUNG2R, ZUNGQR, ZUNM2R,
+     $                   ZGEQRFP, ZUNG2R, ZUNGQR, ZUNM2R,
      $                   ZUNMQR
 *     ..
 *     .. Scalars in Common ..
@@ -172,31 +172,6 @@
       CALL ZGEQR2P( 2, 1, A, 1, B, W, INFO )
       CALL CHKXER( 'ZGEQR2P', INFOT, NOUT, LERR, OK )
 *
-*     ZGEQRS
-*
-      SRNAMT = 'ZGEQRS'
-      INFOT = 1
-      CALL ZGEQRS( -1, 0, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'ZGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 2
-      CALL ZGEQRS( 0, -1, 0, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'ZGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 2
-      CALL ZGEQRS( 1, 2, 0, A, 2, X, B, 2, W, 1, INFO )
-      CALL CHKXER( 'ZGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 3
-      CALL ZGEQRS( 0, 0, -1, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'ZGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 5
-      CALL ZGEQRS( 2, 1, 0, A, 1, X, B, 2, W, 1, INFO )
-      CALL CHKXER( 'ZGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 8
-      CALL ZGEQRS( 2, 1, 0, A, 2, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'ZGEQRS', INFOT, NOUT, LERR, OK )
-      INFOT = 10
-      CALL ZGEQRS( 1, 1, 2, A, 1, X, B, 1, W, 1, INFO )
-      CALL CHKXER( 'ZGEQRS', INFOT, NOUT, LERR, OK )
-*
 *     ZUNGQR
 *
       SRNAMT = 'ZUNGQR'
diff --git a/lapack-netlib/TESTING/LIN/zerrsy.f b/lapack-netlib/TESTING/LIN/zerrsy.f
index a50f9d24e..932df1936 100644
--- a/lapack-netlib/TESTING/LIN/zerrsy.f
+++ b/lapack-netlib/TESTING/LIN/zerrsy.f
@@ -132,7 +132,7 @@
       IF( LSAMEN( 2, C2, 'SY' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a symmetric indefinite matrix with patrial
+*        of a symmetric indefinite matrix with partial
 *        (Bunch-Kaufman) diagonal pivoting method.
 *
 *        ZSYTRF
@@ -471,7 +471,7 @@
       ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a symmetric indefinite packed matrix with patrial
+*        of a symmetric indefinite packed matrix with partial
 *        (Bunch-Kaufman) pivoting.
 *
 *        ZSPTRF
diff --git a/lapack-netlib/TESTING/LIN/zerrsyx.f b/lapack-netlib/TESTING/LIN/zerrsyx.f
index 23d2a5a2b..9d5e71288 100644
--- a/lapack-netlib/TESTING/LIN/zerrsyx.f
+++ b/lapack-netlib/TESTING/LIN/zerrsyx.f
@@ -139,7 +139,7 @@
       IF( LSAMEN( 2, C2, 'SY' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a symmetric indefinite matrix with patrial
+*        of a symmetric indefinite matrix with partial
 *        (Bunch-Kaufman) diagonal pivoting method.
 *
 *        ZSYTRF
@@ -525,7 +525,7 @@
       ELSE IF( LSAMEN( 2, C2, 'SP' ) ) THEN
 *
 *        Test error exits of the routines that use factorization
-*        of a symmetric indefinite packed matrix with patrial
+*        of a symmetric indefinite packed matrix with partial
 *        (Bunch-Kaufman) pivoting.
 *
 *        ZSPTRF
diff --git a/lapack-netlib/TESTING/LIN/zerrtr.f b/lapack-netlib/TESTING/LIN/zerrtr.f
index 211b92154..640c39793 100644
--- a/lapack-netlib/TESTING/LIN/zerrtr.f
+++ b/lapack-netlib/TESTING/LIN/zerrtr.f
@@ -70,7 +70,7 @@
 *     .. Local Scalars ..
       CHARACTER*2        C2
       INTEGER            INFO
-      DOUBLE PRECISION   RCOND, SCALE
+      DOUBLE PRECISION   RCOND, SCALE, SCALES(0)
 *     ..
 *     .. Local Arrays ..
       DOUBLE PRECISION   R1( NMAX ), R2( NMAX ), RW( NMAX )
@@ -245,40 +245,40 @@
 *
          SRNAMT = 'ZLATRS3'
          INFOT = 1
-         CALL ZLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW,
-     $                 RW( 2 ), 1, INFO )
+         CALL ZLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALES,
+     $                 RW, RW( 2 ), 1, INFO )
          CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 2
-         CALL ZLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW,
-     $                 RW( 2 ), 1, INFO )
+         CALL ZLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALES,
+     $                 RW, RW( 2 ), 1, INFO )
          CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 3
-         CALL ZLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, RW,
-     $                 RW( 2 ), 1, INFO )
+         CALL ZLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALES,
+     $                 RW, RW( 2 ), 1, INFO )
          CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 4
-         CALL ZLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, RW,
-     $                 RW( 2 ), 1, INFO )
+         CALL ZLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALES,
+     $                 RW, RW( 2 ), 1, INFO )
          CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 5
-         CALL ZLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, RW,
-     $                 RW( 2 ), 1, INFO )
+         CALL ZLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALES,
+     $                 RW, RW( 2 ), 1, INFO )
          CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 6
-         CALL ZLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, RW,
-     $                 RW( 2 ), 1, INFO )
+         CALL ZLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALES,
+     $                 RW, RW( 2 ), 1, INFO )
          CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 8
-         CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, RW,
-     $                 RW( 2 ), 1, INFO )
+         CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALES,
+     $                 RW, RW( 2 ), 1, INFO )
          CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 10
-         CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, RW,
-     $                 RW( 2 ), 1, INFO )
+         CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALES,
+     $                 RW, RW( 2 ), 1, INFO )
          CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK )
          INFOT = 14
-         CALL ZLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, RW,
-     $                 RW( 2 ), 0, INFO )
+         CALL ZLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALES,
+     $                 RW, RW( 2 ), 0, INFO )
          CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK )
 *
 *     Test error exits for the packed triangular routines.
diff --git a/lapack-netlib/TESTING/LIN/zerrvx.f b/lapack-netlib/TESTING/LIN/zerrvx.f
index ea7823df3..80b3aaf4a 100644
--- a/lapack-netlib/TESTING/LIN/zerrvx.f
+++ b/lapack-netlib/TESTING/LIN/zerrvx.f
@@ -653,6 +653,9 @@
          INFOT = 3
          CALL ZHESV_ROOK( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'ZHESV_ROOK', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL ZHESV_ROOK( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO )
+         CALL CHKXER( 'ZHESV_ROOK', INFOT, NOUT, LERR, OK )
          INFOT = 8
          CALL ZHESV_ROOK( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'ZHESV_ROOK', INFOT, NOUT, LERR, OK )
@@ -700,21 +703,27 @@
 *
       ELSE IF( LSAMEN( 2, C2, 'HA' ) ) THEN
 *
-*        ZHESV_AA
-*
-        SRNAMT = 'ZHESV_AA'
-        INFOT = 1
-        CALL ZHESV_AA( '/', 0, 0, A, 1, IP, B, 1, W, 1, INFO )
-        CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK )
-        INFOT = 2
-        CALL ZHESV_AA( 'U', -1, 0, A, 1, IP, B, 1, W, 1, INFO )
-        CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK )
-        INFOT = 3
-        CALL ZHESV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO )
-        CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK )
-        INFOT = 8
-        CALL ZHESV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO )
-        CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK )
+*        ZHESV_AASEN
+*
+         SRNAMT = 'ZHESV_AA'
+         INFOT = 1
+         CALL ZHESV_AA( '/', 0, 0, A, 1, IP, B, 1, W, 1, INFO )
+         CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL ZHESV_AA( 'U', -1, 0, A, 1, IP, B, 1, W, 1, INFO )
+         CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 3
+         CALL ZHESV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO )
+         CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL ZHESV_AA( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO )
+         CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 8
+         CALL ZHESV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO )
+         CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 10
+         CALL ZHESV_AA( 'U', 3, 1, A, 3, IP, B, 3, W, 6, INFO )
+         CALL CHKXER( 'ZHESV_AA', INFOT, NOUT, LERR, OK )
 *
       ELSE IF( LSAMEN( 2, C2, 'H2' ) ) THEN
 *
@@ -735,16 +744,44 @@
          CALL CHKXER( 'ZHESV_AA_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 5
          CALL ZHESV_AA_2STAGE( 'U', 2, 1, A, 1, A, 1, IP, IP, B, 1,
+     $                         W, 1, INFO )
+         CALL CHKXER( 'ZHESV_AA_2STAGE', INFOT, NOUT, LERR, OK )
+         INFOT = 7
+         CALL ZHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2,
      $                         W, 1, INFO )
          CALL CHKXER( 'ZHESV_AA_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 11
          CALL ZHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1,
      $                         W, 1, INFO )
          CALL CHKXER( 'ZHESV_AA_2STAGE', INFOT, NOUT, LERR, OK )
-         INFOT = 7
-         CALL ZHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2,
+         INFOT = 13
+         CALL ZHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 2,
      $                         W, 1, INFO )
          CALL CHKXER( 'ZHESV_AA_2STAGE', INFOT, NOUT, LERR, OK )
+*
+      ELSE IF( LSAMEN( 2, C2, 'SA' ) ) THEN
+*
+*        ZSYSV_AASEN
+*
+         SRNAMT = 'ZSYSV_AA'
+         INFOT = 1
+         CALL ZSYSV_AA( '/', 0, 0, A, 1, IP, B, 1, W, 1, INFO )
+         CALL CHKXER( 'ZSYSV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL ZSYSV_AA( 'U', -1, 0, A, 1, IP, B, 1, W, 1, INFO )
+         CALL CHKXER( 'ZSYSV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 3
+         CALL ZSYSV_AA( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO )
+         CALL CHKXER( 'ZSYSV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL ZSYSV_AA( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO )
+         CALL CHKXER( 'ZSYSV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 8
+         CALL ZSYSV_AA( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO )
+         CALL CHKXER( 'ZSYSV_AA', INFOT, NOUT, LERR, OK )
+         INFOT = 10
+         CALL ZSYSV_AA( 'U', 3, 1, A, 3, IP, B, 3, W, 6, INFO )
+         CALL CHKXER( 'ZSYSV_AA', INFOT, NOUT, LERR, OK )
 *
       ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN
 *
@@ -765,17 +802,21 @@
          CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 5
          CALL ZSYSV_AA_2STAGE( 'U', 2, 1, A, 1, A, 1, IP, IP, B, 1,
+     $                         W, 1, INFO )
+         CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
+         INFOT = 7
+         CALL ZSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2,
      $                         W, 1, INFO )
          CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 11
          CALL ZSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1,
      $                         W, 1, INFO )
          CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
-         INFOT = 7
-         CALL ZSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2,
+         INFOT = 13
+         CALL ZSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 2,
      $                         W, 1, INFO )
          CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
-**
+*
       ELSE IF( LSAMEN( 2, C2, 'HP' ) ) THEN
 *
 *        ZHPSV
@@ -836,6 +877,9 @@
          INFOT = 3
          CALL ZSYSV( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'ZSYSV ', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL ZSYSV( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO )
+         CALL CHKXER( 'ZSYSV ', INFOT, NOUT, LERR, OK )
          INFOT = 8
          CALL ZSYSV( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'ZSYSV ', INFOT, NOUT, LERR, OK )
@@ -900,6 +944,9 @@
          INFOT = 3
          CALL ZSYSV_ROOK( 'U', 0, -1, A, 1, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'ZSYSV_ROOK', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL ZSYSV_ROOK( 'U', 2, 0, A, 1, IP, B, 2, W, 1, INFO )
+         CALL CHKXER( 'ZSYSV_ROOK', INFOT, NOUT, LERR, OK )
          INFOT = 8
          CALL ZSYSV_ROOK( 'U', 2, 0, A, 2, IP, B, 1, W, 1, INFO )
          CALL CHKXER( 'ZSYSV_ROOK', INFOT, NOUT, LERR, OK )
diff --git a/lapack-netlib/TESTING/LIN/zgtt01.f b/lapack-netlib/TESTING/LIN/zgtt01.f
index c63fdbc25..a8ac13d2d 100644
--- a/lapack-netlib/TESTING/LIN/zgtt01.f
+++ b/lapack-netlib/TESTING/LIN/zgtt01.f
@@ -39,7 +39,7 @@
 *
 *> \param[in] N
 *> \verbatim
-*>          N is INTEGTER
+*>          N is INTEGER
 *>          The order of the matrix A.  N >= 0.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/LIN/zgtt02.f b/lapack-netlib/TESTING/LIN/zgtt02.f
index 7362967be..f86fe9244 100644
--- a/lapack-netlib/TESTING/LIN/zgtt02.f
+++ b/lapack-netlib/TESTING/LIN/zgtt02.f
@@ -40,14 +40,14 @@
 *> \verbatim
 *>          TRANS is CHARACTER
 *>          Specifies the form of the residual.
-*>          = 'N':  B - A * X     (No transpose)
+*>          = 'N':  B - A    * X  (No transpose)
 *>          = 'T':  B - A**T * X  (Transpose)
 *>          = 'C':  B - A**H * X  (Conjugate transpose)
 *> \endverbatim
 *>
 *> \param[in] N
 *> \verbatim
-*>          N is INTEGTER
+*>          N is INTEGER
 *>          The order of the matrix A.  N >= 0.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/LIN/zhet01_3.f b/lapack-netlib/TESTING/LIN/zhet01_3.f
index 0a76404d6..5beed9042 100644
--- a/lapack-netlib/TESTING/LIN/zhet01_3.f
+++ b/lapack-netlib/TESTING/LIN/zhet01_3.f
@@ -188,7 +188,7 @@
          RETURN
       END IF
 *
-*     a) Revert to multiplyers of L
+*     a) Revert to multipliers of L
 *
       CALL ZSYCONVF_ROOK( UPLO, 'R', N, AFAC, LDAFAC, E, IPIV, INFO )
 *
diff --git a/lapack-netlib/TESTING/LIN/zlatb4.f b/lapack-netlib/TESTING/LIN/zlatb4.f
index a6977f4e9..a2b19f83d 100644
--- a/lapack-netlib/TESTING/LIN/zlatb4.f
+++ b/lapack-netlib/TESTING/LIN/zlatb4.f
@@ -154,9 +154,6 @@
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, SQRT
 *     ..
-*     .. External Subroutines ..
-      EXTERNAL           DLABAD
-*     ..
 *     .. Save statement ..
       SAVE               EPS, SMALL, LARGE, BADC1, BADC2, FIRST
 *     ..
@@ -174,11 +171,6 @@
          BADC1 = SQRT( BADC2 )
          SMALL = DLAMCH( 'Safe minimum' )
          LARGE = ONE / SMALL
-*
-*        If it looks like we're on a Cray, take the square root of
-*        SMALL and LARGE to avoid overflow and underflow problems.
-*
-         CALL DLABAD( SMALL, LARGE )
          SMALL = SHRINK*( SMALL / EPS )
          LARGE = ONE / SMALL
       END IF
@@ -233,6 +225,110 @@
          ELSE
             ANORM = ONE
          END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'QK' ) ) THEN
+*
+*        xQK: truncated QR with pivoting.
+*             Set parameters to generate a general
+*             M x N matrix.
+*
+*        Set TYPE, the type of matrix to be generated.  'N' is nonsymmetric.
+*
+         TYPE = 'N'
+*
+*        Set DIST, the type of distribution for the random
+*        number generator. 'S' is symmetric: uniform on ( -1, 1 ).
+*
+         DIST = 'S'
+*
+*        Set the lower and upper bandwidths.
+*
+         IF( IMAT.EQ.2 ) THEN
+*
+*           2. Random, Diagonal, CNDNUM = 2
+*
+            KL = 0
+            KU = 0
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE IF( IMAT.EQ.3 ) THEN
+*
+*           3. Random, Upper triangular,  CNDNUM = 2
+*
+            KL = 0
+            KU = MAX( N-1, 0 )
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE IF( IMAT.EQ.4 ) THEN
+*
+*           4. Random, Lower triangular,  CNDNUM = 2
+*
+            KL = MAX( M-1, 0 )
+            KU = 0
+            CNDNUM = TWO
+            ANORM = ONE
+            MODE = 3
+         ELSE
+*
+*           5.-19. Rectangular matrix
+*
+            KL = MAX( M-1, 0 )
+            KU = MAX( N-1, 0 )
+*
+            IF( IMAT.GE.5 .AND. IMAT.LE.14 ) THEN
+*
+*              5.-14. Random, CNDNUM = 2.
+*
+               CNDNUM = TWO
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.15 ) THEN
+*
+*              15. Random, CNDNUM = sqrt(0.1/EPS)
+*
+               CNDNUM = BADC1
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.16 ) THEN
+*
+*              16. Random, CNDNUM = 0.1/EPS
+*
+               CNDNUM = BADC2
+               ANORM = ONE
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.17 ) THEN
+*
+*              17. Random, CNDNUM = 0.1/EPS,
+*                  one small singular value S(N)=1/CNDNUM
+*
+               CNDNUM = BADC2
+               ANORM = ONE
+               MODE = 2
+*
+            ELSE IF( IMAT.EQ.18 ) THEN
+*
+*              18. Random, scaled near underflow
+*
+               CNDNUM = TWO
+               ANORM = SMALL
+               MODE = 3
+*
+            ELSE IF( IMAT.EQ.19 ) THEN
+*
+*              19. Random, scaled near overflow
+*
+               CNDNUM = TWO
+               ANORM = LARGE
+               MODE = 3
+*
+            END IF
+*
+         END IF
 *
       ELSE IF( LSAMEN( 2, C2, 'GE' ) ) THEN
 *
@@ -517,17 +613,18 @@
 *
 *        Set the norm and condition number.
 *
-         IF( IMAT.EQ.2 .OR. IMAT.EQ.8 ) THEN
+         MAT = ABS( IMAT )
+         IF( MAT.EQ.2 .OR. MAT.EQ.8 ) THEN
             CNDNUM = BADC1
-         ELSE IF( IMAT.EQ.3 .OR. IMAT.EQ.9 ) THEN
+         ELSE IF( MAT.EQ.3 .OR. MAT.EQ.9 ) THEN
             CNDNUM = BADC2
          ELSE
             CNDNUM = TWO
          END IF
 *
-         IF( IMAT.EQ.4 ) THEN
+         IF( MAT.EQ.4 ) THEN
             ANORM = SMALL
-         ELSE IF( IMAT.EQ.5 ) THEN
+         ELSE IF( MAT.EQ.5 ) THEN
             ANORM = LARGE
          ELSE
             ANORM = ONE
diff --git a/lapack-netlib/TESTING/LIN/zlqt02.f b/lapack-netlib/TESTING/LIN/zlqt02.f
index c55d76ccd..9ba98ff1f 100644
--- a/lapack-netlib/TESTING/LIN/zlqt02.f
+++ b/lapack-netlib/TESTING/LIN/zlqt02.f
@@ -27,7 +27,7 @@
 *> \verbatim
 *>
 *> ZLQT02 tests ZUNGLQ, which generates an m-by-n matrix Q with
-*> orthonornmal rows that is defined as the product of k elementary
+*> orthonormal rows that is defined as the product of k elementary
 *> reflectors.
 *>
 *> Given the LQ factorization of an m-by-n matrix A, ZLQT02 generates
diff --git a/lapack-netlib/TESTING/LIN/zptt01.f b/lapack-netlib/TESTING/LIN/zptt01.f
index e842c7e46..9f2359c2a 100644
--- a/lapack-netlib/TESTING/LIN/zptt01.f
+++ b/lapack-netlib/TESTING/LIN/zptt01.f
@@ -36,7 +36,7 @@
 *
 *> \param[in] N
 *> \verbatim
-*>          N is INTEGTER
+*>          N is INTEGER
 *>          The order of the matrix A.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/LIN/zptt02.f b/lapack-netlib/TESTING/LIN/zptt02.f
index 6e3a8aed0..8a1e2961c 100644
--- a/lapack-netlib/TESTING/LIN/zptt02.f
+++ b/lapack-netlib/TESTING/LIN/zptt02.f
@@ -46,7 +46,7 @@
 *>
 *> \param[in] N
 *> \verbatim
-*>          N is INTEGTER
+*>          N is INTEGER
 *>          The order of the matrix A.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/LIN/zqlt02.f b/lapack-netlib/TESTING/LIN/zqlt02.f
index 1f84cfa5a..7cb889931 100644
--- a/lapack-netlib/TESTING/LIN/zqlt02.f
+++ b/lapack-netlib/TESTING/LIN/zqlt02.f
@@ -27,7 +27,7 @@
 *> \verbatim
 *>
 *> ZQLT02 tests ZUNGQL, which generates an m-by-n matrix Q with
-*> orthonornmal columns that is defined as the product of k elementary
+*> orthonormal columns that is defined as the product of k elementary
 *> reflectors.
 *>
 *> Given the QL factorization of an m-by-n matrix A, ZQLT02 generates
diff --git a/lapack-netlib/TESTING/LIN/zqpt01.f b/lapack-netlib/TESTING/LIN/zqpt01.f
index 4e53f92c8..c69eb658f 100644
--- a/lapack-netlib/TESTING/LIN/zqpt01.f
+++ b/lapack-netlib/TESTING/LIN/zqpt01.f
@@ -33,7 +33,7 @@
 *> Householder vectors, and the rest of AF contains a partially updated
 *> matrix.
 *>
-*> This function returns ||A*P - Q*R||/(||norm(A)||*eps*M)
+*> This function returns ||A*P - Q*R|| / ( ||norm(A)||*eps*max(M,N) )
 *> \endverbatim
 *
 *  Arguments:
@@ -172,28 +172,28 @@
 *
       NORMA = ZLANGE( 'One-norm', M, N, A, LDA, RWORK )
 *
-      DO 30 J = 1, K
-         DO 10 I = 1, MIN( J, M )
+      DO J = 1, K
+         DO I = 1, MIN( J, M )
             WORK( ( J-1 )*M+I ) = AF( I, J )
-   10    CONTINUE
-         DO 20 I = J + 1, M
+         END DO
+         DO I = J + 1, M
             WORK( ( J-1 )*M+I ) = ZERO
-   20    CONTINUE
-   30 CONTINUE
-      DO 40 J = K + 1, N
+         END DO
+      END DO
+      DO J = K + 1, N
          CALL ZCOPY( M, AF( 1, J ), 1, WORK( ( J-1 )*M+1 ), 1 )
-   40 CONTINUE
+      END DO
 *
       CALL ZUNMQR( 'Left', 'No transpose', M, N, K, AF, LDA, TAU, WORK,
      $             M, WORK( M*N+1 ), LWORK-M*N, INFO )
 *
-      DO 50 J = 1, N
+      DO J = 1, N
 *
 *        Compare i-th column of QR and jpvt(i)-th column of A
 *
          CALL ZAXPY( M, DCMPLX( -ONE ), A( 1, JPVT( J ) ), 1,
      $               WORK( ( J-1 )*M+1 ), 1 )
-   50 CONTINUE
+      END DO
 *
       ZQPT01 = ZLANGE( 'One-norm', M, N, WORK, M, RWORK ) /
      $         ( DBLE( MAX( M, N ) )*DLAMCH( 'Epsilon' ) )
diff --git a/lapack-netlib/TESTING/LIN/zqrt02.f b/lapack-netlib/TESTING/LIN/zqrt02.f
index 2dbefaf84..a32703c49 100644
--- a/lapack-netlib/TESTING/LIN/zqrt02.f
+++ b/lapack-netlib/TESTING/LIN/zqrt02.f
@@ -27,7 +27,7 @@
 *> \verbatim
 *>
 *> ZQRT02 tests ZUNGQR, which generates an m-by-n matrix Q with
-*> orthonornmal columns that is defined as the product of k elementary
+*> orthonormal columns that is defined as the product of k elementary
 *> reflectors.
 *>
 *> Given the QR factorization of an m-by-n matrix A, ZQRT02 generates
diff --git a/lapack-netlib/TESTING/LIN/zqrt11.f b/lapack-netlib/TESTING/LIN/zqrt11.f
index c3be59c36..dc4af744f 100644
--- a/lapack-netlib/TESTING/LIN/zqrt11.f
+++ b/lapack-netlib/TESTING/LIN/zqrt11.f
@@ -158,9 +158,9 @@
       CALL ZUNM2R( 'Left', 'Conjugate transpose', M, M, K, A, LDA, TAU,
      $             WORK, M, WORK( M*M+1 ), INFO )
 *
-      DO 10 J = 1, M
+      DO J = 1, M
          WORK( ( J-1 )*M+J ) = WORK( ( J-1 )*M+J ) - ONE
-   10 CONTINUE
+      END DO
 *
       ZQRT11 = ZLANGE( 'One-norm', M, M, WORK, M, RDUMMY ) /
      $         ( DBLE( M )*DLAMCH( 'Epsilon' ) )
diff --git a/lapack-netlib/TESTING/LIN/zqrt12.f b/lapack-netlib/TESTING/LIN/zqrt12.f
index 0da6be157..91477b5ea 100644
--- a/lapack-netlib/TESTING/LIN/zqrt12.f
+++ b/lapack-netlib/TESTING/LIN/zqrt12.f
@@ -28,7 +28,7 @@
 *> ZQRT12 computes the singular values `svlues' of the upper trapezoid
 *> of A(1:M,1:N) and returns the ratio
 *>
-*>      || s - svlues||/(||svlues||*eps*max(M,N))
+*>      || svlues - s||/(||s||*eps*max(M,N))
 *> \endverbatim
 *
 *  Arguments:
@@ -125,8 +125,8 @@
       EXTERNAL           DASUM, DLAMCH, DNRM2, ZLANGE
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DAXPY, DBDSQR, DLABAD, DLASCL, XERBLA, ZGEBD2,
-     $                   ZLASCL, ZLASET
+      EXTERNAL           DAXPY, DBDSQR, DLASCL, XERBLA, ZGEBD2, ZLASCL,
+     $                   ZLASET
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          DBLE, DCMPLX, MAX, MIN
@@ -154,17 +154,16 @@
 *
       CALL ZLASET( 'Full', M, N, DCMPLX( ZERO ), DCMPLX( ZERO ), WORK,
      $             M )
-      DO 20 J = 1, N
-         DO 10 I = 1, MIN( J, M )
+      DO J = 1, N
+         DO I = 1, MIN( J, M )
             WORK( ( J-1 )*M+I ) = A( I, J )
-   10    CONTINUE
-   20 CONTINUE
+         END DO
+      END DO
 *
 *     Get machine parameters
 *
       SMLNUM = DLAMCH( 'S' ) / DLAMCH( 'P' )
       BIGNUM = ONE / SMLNUM
-      CALL DLABAD( SMLNUM, BIGNUM )
 *
 *     Scale work if max entry outside range [SMLNUM,BIGNUM]
 *
@@ -208,9 +207,9 @@
 *
       ELSE
 *
-         DO 30 I = 1, MN
+         DO I = 1, MN
             RWORK( I ) = ZERO
-   30    CONTINUE
+         END DO
       END IF
 *
 *     Compare s and singular values of work
@@ -218,6 +217,7 @@
       CALL DAXPY( MN, -ONE, S, 1, RWORK( 1 ), 1 )
       ZQRT12 = DASUM( MN, RWORK( 1 ), 1 ) /
      $         ( DLAMCH( 'Epsilon' )*DBLE( MAX( M, N ) ) )
+*
       IF( NRMSVL.NE.ZERO )
      $   ZQRT12 = ZQRT12 / NRMSVL
 *
diff --git a/lapack-netlib/TESTING/LIN/zrqt02.f b/lapack-netlib/TESTING/LIN/zrqt02.f
index 548321d00..c44e04cb6 100644
--- a/lapack-netlib/TESTING/LIN/zrqt02.f
+++ b/lapack-netlib/TESTING/LIN/zrqt02.f
@@ -27,7 +27,7 @@
 *> \verbatim
 *>
 *> ZRQT02 tests ZUNGRQ, which generates an m-by-n matrix Q with
-*> orthonornmal rows that is defined as the product of k elementary
+*> orthonormal rows that is defined as the product of k elementary
 *> reflectors.
 *>
 *> Given the RQ factorization of an m-by-n matrix A, ZRQT02 generates
diff --git a/lapack-netlib/TESTING/LIN/zrzt01.f b/lapack-netlib/TESTING/LIN/zrzt01.f
index 2eba7ba6f..b6e4d73f9 100644
--- a/lapack-netlib/TESTING/LIN/zrzt01.f
+++ b/lapack-netlib/TESTING/LIN/zrzt01.f
@@ -159,7 +159,7 @@
 *
 *     R = R * P(1) * ... *P(m)
 *
-      CALL ZUNMRZ( 'Right', 'No tranpose', M, N, M, N-M, AF, LDA, TAU,
+      CALL ZUNMRZ( 'Right', 'No transpose', M, N, M, N-M, AF, LDA, TAU,
      $             WORK, M, WORK( M*N+1 ), LWORK-M*N, INFO )
 *
 *     R = R - A
diff --git a/lapack-netlib/TESTING/LIN/zsyt01_3.f b/lapack-netlib/TESTING/LIN/zsyt01_3.f
index 202488db9..a572353f2 100644
--- a/lapack-netlib/TESTING/LIN/zsyt01_3.f
+++ b/lapack-netlib/TESTING/LIN/zsyt01_3.f
@@ -188,7 +188,7 @@
          RETURN
       END IF
 *
-*     a) Revert to multiplyers of L
+*     a) Revert to multipliers of L
 *
       CALL ZSYCONVF_ROOK( UPLO, 'R', N, AFAC, LDAFAC, E, IPIV, INFO )
 *
diff --git a/lapack-netlib/TESTING/MATGEN/clagge.c b/lapack-netlib/TESTING/MATGEN/clagge.c
index f05905bd7..62c33d01e 100644
--- a/lapack-netlib/TESTING/MATGEN/clagge.c
+++ b/lapack-netlib/TESTING/MATGEN/clagge.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/claghe.c b/lapack-netlib/TESTING/MATGEN/claghe.c
index 77ed94912..009329dd8 100644
--- a/lapack-netlib/TESTING/MATGEN/claghe.c
+++ b/lapack-netlib/TESTING/MATGEN/claghe.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/clagsy.c b/lapack-netlib/TESTING/MATGEN/clagsy.c
index de215bc63..5117f85db 100644
--- a/lapack-netlib/TESTING/MATGEN/clagsy.c
+++ b/lapack-netlib/TESTING/MATGEN/clagsy.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/clahilb.c b/lapack-netlib/TESTING/MATGEN/clahilb.c
index 95b00fb3e..1ee293c6f 100644
--- a/lapack-netlib/TESTING/MATGEN/clahilb.c
+++ b/lapack-netlib/TESTING/MATGEN/clahilb.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/clakf2.c b/lapack-netlib/TESTING/MATGEN/clakf2.c
index e15bc58ab..2b1b4e905 100644
--- a/lapack-netlib/TESTING/MATGEN/clakf2.c
+++ b/lapack-netlib/TESTING/MATGEN/clakf2.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/clarge.c b/lapack-netlib/TESTING/MATGEN/clarge.c
index 0215436b7..25801da89 100644
--- a/lapack-netlib/TESTING/MATGEN/clarge.c
+++ b/lapack-netlib/TESTING/MATGEN/clarge.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/clarnd.c b/lapack-netlib/TESTING/MATGEN/clarnd.c
index 260f246d5..26a22a3e0 100644
--- a/lapack-netlib/TESTING/MATGEN/clarnd.c
+++ b/lapack-netlib/TESTING/MATGEN/clarnd.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* > \brief \b CLARND */
diff --git a/lapack-netlib/TESTING/MATGEN/claror.c b/lapack-netlib/TESTING/MATGEN/claror.c
index cd0d15300..b0d73f37c 100644
--- a/lapack-netlib/TESTING/MATGEN/claror.c
+++ b/lapack-netlib/TESTING/MATGEN/claror.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/clarot.c b/lapack-netlib/TESTING/MATGEN/clarot.c
index f1e73ed5b..bd5f01cd3 100644
--- a/lapack-netlib/TESTING/MATGEN/clarot.c
+++ b/lapack-netlib/TESTING/MATGEN/clarot.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/clatm1.c b/lapack-netlib/TESTING/MATGEN/clatm1.c
index 665b16057..2651091c8 100644
--- a/lapack-netlib/TESTING/MATGEN/clatm1.c
+++ b/lapack-netlib/TESTING/MATGEN/clatm1.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,11 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
 static float spow_ui(float x, integer n) {
 	float pow=1.0; unsigned long int u;
@@ -279,235 +273,6 @@ static float spow_ui(float x, integer n) {
 	}
 	return pow;
 }
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/clatm2.c b/lapack-netlib/TESTING/MATGEN/clatm2.c
index cf60fa296..2dceff497 100644
--- a/lapack-netlib/TESTING/MATGEN/clatm2.c
+++ b/lapack-netlib/TESTING/MATGEN/clatm2.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* > \brief \b CLATM2 */
diff --git a/lapack-netlib/TESTING/MATGEN/clatm3.c b/lapack-netlib/TESTING/MATGEN/clatm3.c
index 58cd4e551..fcd8dbfcb 100644
--- a/lapack-netlib/TESTING/MATGEN/clatm3.c
+++ b/lapack-netlib/TESTING/MATGEN/clatm3.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* > \brief \b CLATM3 */
diff --git a/lapack-netlib/TESTING/MATGEN/clatm5.c b/lapack-netlib/TESTING/MATGEN/clatm5.c
index c2b81ccf3..8fbc1c0a6 100644
--- a/lapack-netlib/TESTING/MATGEN/clatm5.c
+++ b/lapack-netlib/TESTING/MATGEN/clatm5.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/clatm6.c b/lapack-netlib/TESTING/MATGEN/clatm6.c
index 96a3bd290..f6992a362 100644
--- a/lapack-netlib/TESTING/MATGEN/clatm6.c
+++ b/lapack-netlib/TESTING/MATGEN/clatm6.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/clatme.c b/lapack-netlib/TESTING/MATGEN/clatme.c
index a905f5608..fd0aec8ec 100644
--- a/lapack-netlib/TESTING/MATGEN/clatme.c
+++ b/lapack-netlib/TESTING/MATGEN/clatme.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/clatmr.c b/lapack-netlib/TESTING/MATGEN/clatmr.c
index 4a117a9d4..2f987f744 100644
--- a/lapack-netlib/TESTING/MATGEN/clatmr.c
+++ b/lapack-netlib/TESTING/MATGEN/clatmr.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/clatms.c b/lapack-netlib/TESTING/MATGEN/clatms.c
index 3f7011166..006b33b7a 100644
--- a/lapack-netlib/TESTING/MATGEN/clatms.c
+++ b/lapack-netlib/TESTING/MATGEN/clatms.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/clatmt.c b/lapack-netlib/TESTING/MATGEN/clatmt.c
index 3bb7ca4d9..f6a842861 100644
--- a/lapack-netlib/TESTING/MATGEN/clatmt.c
+++ b/lapack-netlib/TESTING/MATGEN/clatmt.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/dlagge.c b/lapack-netlib/TESTING/MATGEN/dlagge.c
index dcca84326..5482c3ff6 100644
--- a/lapack-netlib/TESTING/MATGEN/dlagge.c
+++ b/lapack-netlib/TESTING/MATGEN/dlagge.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/dlagsy.c b/lapack-netlib/TESTING/MATGEN/dlagsy.c
index 2812db86d..efcbe3f99 100644
--- a/lapack-netlib/TESTING/MATGEN/dlagsy.c
+++ b/lapack-netlib/TESTING/MATGEN/dlagsy.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/dlahilb.c b/lapack-netlib/TESTING/MATGEN/dlahilb.c
index dccfcd8c7..d671adee6 100644
--- a/lapack-netlib/TESTING/MATGEN/dlahilb.c
+++ b/lapack-netlib/TESTING/MATGEN/dlahilb.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/dlakf2.c b/lapack-netlib/TESTING/MATGEN/dlakf2.c
index 0b403540a..b4a09204b 100644
--- a/lapack-netlib/TESTING/MATGEN/dlakf2.c
+++ b/lapack-netlib/TESTING/MATGEN/dlakf2.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/dlaran.c b/lapack-netlib/TESTING/MATGEN/dlaran.c
index 3aed1fce0..f13ada834 100644
--- a/lapack-netlib/TESTING/MATGEN/dlaran.c
+++ b/lapack-netlib/TESTING/MATGEN/dlaran.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* > \brief \b DLARAN */
diff --git a/lapack-netlib/TESTING/MATGEN/dlarge.c b/lapack-netlib/TESTING/MATGEN/dlarge.c
index 5d8a81387..5cc7fbce8 100644
--- a/lapack-netlib/TESTING/MATGEN/dlarge.c
+++ b/lapack-netlib/TESTING/MATGEN/dlarge.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/dlarnd.c b/lapack-netlib/TESTING/MATGEN/dlarnd.c
index 8e0f43ee5..3119097bf 100644
--- a/lapack-netlib/TESTING/MATGEN/dlarnd.c
+++ b/lapack-netlib/TESTING/MATGEN/dlarnd.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* > \brief \b DLARND */
diff --git a/lapack-netlib/TESTING/MATGEN/dlaror.c b/lapack-netlib/TESTING/MATGEN/dlaror.c
index d9e2e46ae..fdd126174 100644
--- a/lapack-netlib/TESTING/MATGEN/dlaror.c
+++ b/lapack-netlib/TESTING/MATGEN/dlaror.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/dlarot.c b/lapack-netlib/TESTING/MATGEN/dlarot.c
index 1d7a4939d..3eadf15c6 100644
--- a/lapack-netlib/TESTING/MATGEN/dlarot.c
+++ b/lapack-netlib/TESTING/MATGEN/dlarot.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/dlatm1.c b/lapack-netlib/TESTING/MATGEN/dlatm1.c
index aa81469e7..aa58300a3 100644
--- a/lapack-netlib/TESTING/MATGEN/dlatm1.c
+++ b/lapack-netlib/TESTING/MATGEN/dlatm1.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,24 +260,7 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
 static double dpow_ui(double x, integer n) {
 	double pow=1.0; unsigned long int u;
 	if(n != 0) {
@@ -291,223 +273,6 @@ static double dpow_ui(double x, integer n) {
 	}
 	return pow;
 }
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* > \brief \b DLATM1 */
diff --git a/lapack-netlib/TESTING/MATGEN/dlatm2.c b/lapack-netlib/TESTING/MATGEN/dlatm2.c
index d74bc9168..7491e9829 100644
--- a/lapack-netlib/TESTING/MATGEN/dlatm2.c
+++ b/lapack-netlib/TESTING/MATGEN/dlatm2.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* > \brief \b DLATM2 */
diff --git a/lapack-netlib/TESTING/MATGEN/dlatm3.c b/lapack-netlib/TESTING/MATGEN/dlatm3.c
index 86f964ced..a9d26c7fc 100644
--- a/lapack-netlib/TESTING/MATGEN/dlatm3.c
+++ b/lapack-netlib/TESTING/MATGEN/dlatm3.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* > \brief \b DLATM3 */
diff --git a/lapack-netlib/TESTING/MATGEN/dlatm5.c b/lapack-netlib/TESTING/MATGEN/dlatm5.c
index 94b49d6e3..7f1c36428 100644
--- a/lapack-netlib/TESTING/MATGEN/dlatm5.c
+++ b/lapack-netlib/TESTING/MATGEN/dlatm5.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/dlatm6.c b/lapack-netlib/TESTING/MATGEN/dlatm6.c
index fee10724a..0b3fbb42f 100644
--- a/lapack-netlib/TESTING/MATGEN/dlatm6.c
+++ b/lapack-netlib/TESTING/MATGEN/dlatm6.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/dlatm7.c b/lapack-netlib/TESTING/MATGEN/dlatm7.c
index eb42e2b8b..c907feff4 100644
--- a/lapack-netlib/TESTING/MATGEN/dlatm7.c
+++ b/lapack-netlib/TESTING/MATGEN/dlatm7.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,24 +260,7 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
 static double dpow_ui(double x, integer n) {
 	double pow=1.0; unsigned long int u;
 	if(n != 0) {
@@ -291,223 +273,6 @@ static double dpow_ui(double x, integer n) {
 	}
 	return pow;
 }
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* > \brief \b DLATM7 */
diff --git a/lapack-netlib/TESTING/MATGEN/dlatme.c b/lapack-netlib/TESTING/MATGEN/dlatme.c
index a92c70ef2..e29df164c 100644
--- a/lapack-netlib/TESTING/MATGEN/dlatme.c
+++ b/lapack-netlib/TESTING/MATGEN/dlatme.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/dlatmr.c b/lapack-netlib/TESTING/MATGEN/dlatmr.c
index e6ce5456d..77e224840 100644
--- a/lapack-netlib/TESTING/MATGEN/dlatmr.c
+++ b/lapack-netlib/TESTING/MATGEN/dlatmr.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/dlatms.c b/lapack-netlib/TESTING/MATGEN/dlatms.c
index d56d26a50..c9a8b226e 100644
--- a/lapack-netlib/TESTING/MATGEN/dlatms.c
+++ b/lapack-netlib/TESTING/MATGEN/dlatms.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/dlatmt.c b/lapack-netlib/TESTING/MATGEN/dlatmt.c
index b61e8b779..6b0871257 100644
--- a/lapack-netlib/TESTING/MATGEN/dlatmt.c
+++ b/lapack-netlib/TESTING/MATGEN/dlatmt.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/slagge.c b/lapack-netlib/TESTING/MATGEN/slagge.c
index c3ec8f2b8..9213d9688 100644
--- a/lapack-netlib/TESTING/MATGEN/slagge.c
+++ b/lapack-netlib/TESTING/MATGEN/slagge.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/slagsy.c b/lapack-netlib/TESTING/MATGEN/slagsy.c
index 62614593d..fad4cf5fa 100644
--- a/lapack-netlib/TESTING/MATGEN/slagsy.c
+++ b/lapack-netlib/TESTING/MATGEN/slagsy.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/slahilb.c b/lapack-netlib/TESTING/MATGEN/slahilb.c
index ae633d3c9..7367d71a2 100644
--- a/lapack-netlib/TESTING/MATGEN/slahilb.c
+++ b/lapack-netlib/TESTING/MATGEN/slahilb.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/slakf2.c b/lapack-netlib/TESTING/MATGEN/slakf2.c
index 67be11662..388d630e1 100644
--- a/lapack-netlib/TESTING/MATGEN/slakf2.c
+++ b/lapack-netlib/TESTING/MATGEN/slakf2.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/slaran.c b/lapack-netlib/TESTING/MATGEN/slaran.c
index 43982d860..af9b56412 100644
--- a/lapack-netlib/TESTING/MATGEN/slaran.c
+++ b/lapack-netlib/TESTING/MATGEN/slaran.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* > \brief \b SLARAN */
diff --git a/lapack-netlib/TESTING/MATGEN/slarge.c b/lapack-netlib/TESTING/MATGEN/slarge.c
index 6b37e9400..d5fbd541c 100644
--- a/lapack-netlib/TESTING/MATGEN/slarge.c
+++ b/lapack-netlib/TESTING/MATGEN/slarge.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/slarnd.c b/lapack-netlib/TESTING/MATGEN/slarnd.c
index af9bf4a57..eabfa13c0 100644
--- a/lapack-netlib/TESTING/MATGEN/slarnd.c
+++ b/lapack-netlib/TESTING/MATGEN/slarnd.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* > \brief \b SLARND */
diff --git a/lapack-netlib/TESTING/MATGEN/slaror.c b/lapack-netlib/TESTING/MATGEN/slaror.c
index 48b532dfd..7e3065432 100644
--- a/lapack-netlib/TESTING/MATGEN/slaror.c
+++ b/lapack-netlib/TESTING/MATGEN/slaror.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/slarot.c b/lapack-netlib/TESTING/MATGEN/slarot.c
index a23e186e1..8d2f51f92 100644
--- a/lapack-netlib/TESTING/MATGEN/slarot.c
+++ b/lapack-netlib/TESTING/MATGEN/slarot.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/slatm1.c b/lapack-netlib/TESTING/MATGEN/slatm1.c
index 5c4cdc067..8acbdfa3e 100644
--- a/lapack-netlib/TESTING/MATGEN/slatm1.c
+++ b/lapack-netlib/TESTING/MATGEN/slatm1.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,11 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
 static float spow_ui(float x, integer n) {
 	float pow=1.0; unsigned long int u;
@@ -279,235 +273,6 @@ static float spow_ui(float x, integer n) {
 	}
 	return pow;
 }
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* > \brief \b SLATM1 */
diff --git a/lapack-netlib/TESTING/MATGEN/slatm2.c b/lapack-netlib/TESTING/MATGEN/slatm2.c
index e7b72006f..833ee5dea 100644
--- a/lapack-netlib/TESTING/MATGEN/slatm2.c
+++ b/lapack-netlib/TESTING/MATGEN/slatm2.c
@@ -261,253 +261,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* > \brief \b SLATM2 */
diff --git a/lapack-netlib/TESTING/MATGEN/slatm3.c b/lapack-netlib/TESTING/MATGEN/slatm3.c
index 4f9f5fee2..cdf96ef51 100644
--- a/lapack-netlib/TESTING/MATGEN/slatm3.c
+++ b/lapack-netlib/TESTING/MATGEN/slatm3.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* > \brief \b SLATM3 */
diff --git a/lapack-netlib/TESTING/MATGEN/slatm5.c b/lapack-netlib/TESTING/MATGEN/slatm5.c
index 24ee0915d..9122bc041 100644
--- a/lapack-netlib/TESTING/MATGEN/slatm5.c
+++ b/lapack-netlib/TESTING/MATGEN/slatm5.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/slatm6.c b/lapack-netlib/TESTING/MATGEN/slatm6.c
index c50fcd29f..6e3306d29 100644
--- a/lapack-netlib/TESTING/MATGEN/slatm6.c
+++ b/lapack-netlib/TESTING/MATGEN/slatm6.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/slatm7.c b/lapack-netlib/TESTING/MATGEN/slatm7.c
index e1b32ac9f..5ea2e32f6 100644
--- a/lapack-netlib/TESTING/MATGEN/slatm7.c
+++ b/lapack-netlib/TESTING/MATGEN/slatm7.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,11 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
 static float spow_ui(float x, integer n) {
 	float pow=1.0; unsigned long int u;
@@ -279,235 +273,6 @@ static float spow_ui(float x, integer n) {
 	}
 	return pow;
 }
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* > \brief \b SLATM7 */
diff --git a/lapack-netlib/TESTING/MATGEN/slatme.c b/lapack-netlib/TESTING/MATGEN/slatme.c
index a8a6b39a3..126c42121 100644
--- a/lapack-netlib/TESTING/MATGEN/slatme.c
+++ b/lapack-netlib/TESTING/MATGEN/slatme.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/slatmr.c b/lapack-netlib/TESTING/MATGEN/slatmr.c
index 01de66f54..cc227a9d8 100644
--- a/lapack-netlib/TESTING/MATGEN/slatmr.c
+++ b/lapack-netlib/TESTING/MATGEN/slatmr.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/slatms.c b/lapack-netlib/TESTING/MATGEN/slatms.c
index 8e85f5db8..95f005858 100644
--- a/lapack-netlib/TESTING/MATGEN/slatms.c
+++ b/lapack-netlib/TESTING/MATGEN/slatms.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/slatmt.c b/lapack-netlib/TESTING/MATGEN/slatmt.c
index e34bb8c03..cd1c3c7c1 100644
--- a/lapack-netlib/TESTING/MATGEN/slatmt.c
+++ b/lapack-netlib/TESTING/MATGEN/slatmt.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,253 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/zlagge.c b/lapack-netlib/TESTING/MATGEN/zlagge.c
index bfa33cf34..56d0318e2 100644
--- a/lapack-netlib/TESTING/MATGEN/zlagge.c
+++ b/lapack-netlib/TESTING/MATGEN/zlagge.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/zlaghe.c b/lapack-netlib/TESTING/MATGEN/zlaghe.c
index 06a8e31a3..63a8a05d6 100644
--- a/lapack-netlib/TESTING/MATGEN/zlaghe.c
+++ b/lapack-netlib/TESTING/MATGEN/zlaghe.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/zlagsy.c b/lapack-netlib/TESTING/MATGEN/zlagsy.c
index 431522ebe..8804bfbce 100644
--- a/lapack-netlib/TESTING/MATGEN/zlagsy.c
+++ b/lapack-netlib/TESTING/MATGEN/zlagsy.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/zlahilb.c b/lapack-netlib/TESTING/MATGEN/zlahilb.c
index 0e9cc1c4a..ff3d06525 100644
--- a/lapack-netlib/TESTING/MATGEN/zlahilb.c
+++ b/lapack-netlib/TESTING/MATGEN/zlahilb.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/zlakf2.c b/lapack-netlib/TESTING/MATGEN/zlakf2.c
index f7a90eef7..4c08bd112 100644
--- a/lapack-netlib/TESTING/MATGEN/zlakf2.c
+++ b/lapack-netlib/TESTING/MATGEN/zlakf2.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/zlarge.c b/lapack-netlib/TESTING/MATGEN/zlarge.c
index 4b06a818d..848cde007 100644
--- a/lapack-netlib/TESTING/MATGEN/zlarge.c
+++ b/lapack-netlib/TESTING/MATGEN/zlarge.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/zlarnd.c b/lapack-netlib/TESTING/MATGEN/zlarnd.c
index 014d48afb..ff1bf1467 100644
--- a/lapack-netlib/TESTING/MATGEN/zlarnd.c
+++ b/lapack-netlib/TESTING/MATGEN/zlarnd.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* > \brief \b ZLARND */
diff --git a/lapack-netlib/TESTING/MATGEN/zlaror.c b/lapack-netlib/TESTING/MATGEN/zlaror.c
index 6ada57b8a..c8a84f215 100644
--- a/lapack-netlib/TESTING/MATGEN/zlaror.c
+++ b/lapack-netlib/TESTING/MATGEN/zlaror.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/zlarot.c b/lapack-netlib/TESTING/MATGEN/zlarot.c
index 6b2e88b32..be951dcb5 100644
--- a/lapack-netlib/TESTING/MATGEN/zlarot.c
+++ b/lapack-netlib/TESTING/MATGEN/zlarot.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/zlatm1.c b/lapack-netlib/TESTING/MATGEN/zlatm1.c
index c75787730..447edc259 100644
--- a/lapack-netlib/TESTING/MATGEN/zlatm1.c
+++ b/lapack-netlib/TESTING/MATGEN/zlatm1.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,24 +260,7 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
 
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
 static double dpow_ui(double x, integer n) {
 	double pow=1.0; unsigned long int u;
 	if(n != 0) {
@@ -291,224 +273,6 @@ static double dpow_ui(double x, integer n) {
 	}
 	return pow;
 }
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/zlatm2.c b/lapack-netlib/TESTING/MATGEN/zlatm2.c
index e1689d875..e6fe75d06 100644
--- a/lapack-netlib/TESTING/MATGEN/zlatm2.c
+++ b/lapack-netlib/TESTING/MATGEN/zlatm2.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* > \brief \b ZLATM2 */
diff --git a/lapack-netlib/TESTING/MATGEN/zlatm3.c b/lapack-netlib/TESTING/MATGEN/zlatm3.c
index c35ffe4d9..6370a9d39 100644
--- a/lapack-netlib/TESTING/MATGEN/zlatm3.c
+++ b/lapack-netlib/TESTING/MATGEN/zlatm3.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* > \brief \b ZLATM3 */
diff --git a/lapack-netlib/TESTING/MATGEN/zlatm5.c b/lapack-netlib/TESTING/MATGEN/zlatm5.c
index 753ee0ce6..5ee6cc8ce 100644
--- a/lapack-netlib/TESTING/MATGEN/zlatm5.c
+++ b/lapack-netlib/TESTING/MATGEN/zlatm5.c
@@ -248,7 +248,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -262,254 +261,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/zlatm6.c b/lapack-netlib/TESTING/MATGEN/zlatm6.c
index e20ec7092..f394cb1df 100644
--- a/lapack-netlib/TESTING/MATGEN/zlatm6.c
+++ b/lapack-netlib/TESTING/MATGEN/zlatm6.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/zlatme.c b/lapack-netlib/TESTING/MATGEN/zlatme.c
index ded6ca214..11127aa51 100644
--- a/lapack-netlib/TESTING/MATGEN/zlatme.c
+++ b/lapack-netlib/TESTING/MATGEN/zlatme.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/zlatmr.c b/lapack-netlib/TESTING/MATGEN/zlatmr.c
index c42a25f2b..02a4cf16f 100644
--- a/lapack-netlib/TESTING/MATGEN/zlatmr.c
+++ b/lapack-netlib/TESTING/MATGEN/zlatmr.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/zlatms.c b/lapack-netlib/TESTING/MATGEN/zlatms.c
index f0bb66a44..87f26059d 100644
--- a/lapack-netlib/TESTING/MATGEN/zlatms.c
+++ b/lapack-netlib/TESTING/MATGEN/zlatms.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/MATGEN/zlatmt.c b/lapack-netlib/TESTING/MATGEN/zlatmt.c
index 551b17628..64c30ec17 100644
--- a/lapack-netlib/TESTING/MATGEN/zlatmt.c
+++ b/lapack-netlib/TESTING/MATGEN/zlatmt.c
@@ -247,7 +247,6 @@ typedef struct Namelist Namelist;
 #define s_copy(A,B,C,D) { int __i,__m; for (__i=0, __m=f2cmin((C),(D)); __i<__m && (B)[__i] != 0; ++__i) (A)[__i] = (B)[__i]; }
 #define sig_die(s, kill) { exit(1); }
 #define s_stop(s, n) {exit(0);}
-static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 #define z_abs(z) (cabs(Cd(z)))
 #define z_exp(R, Z) {pCd(R) = cexp(Cd(Z));}
 #define z_sqrt(R, Z) {pCd(R) = csqrt(Cd(Z));}
@@ -261,254 +260,6 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n";
 /* procedure parameter types for -A and -C++ */
 
 #define F2C_proc_par_types 1
-#ifdef __cplusplus
-typedef logical (*L_fp)(...);
-#else
-typedef logical (*L_fp)();
-#endif
-
-static float spow_ui(float x, integer n) {
-	float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static double dpow_ui(double x, integer n) {
-	double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#ifdef _MSC_VER
-static _Fcomplex cpow_ui(complex x, integer n) {
-	complex pow={1.0,0.0}; unsigned long int u;
-		if(n != 0) {
-		if(n < 0) n = -n, x.r = 1/x.r, x.i=1/x.i;
-		for(u = n; ; ) {
-			if(u & 01) pow.r *= x.r, pow.i *= x.i;
-			if(u >>= 1) x.r *= x.r, x.i *= x.i;
-			else break;
-		}
-	}
-	_Fcomplex p={pow.r, pow.i};
-	return p;
-}
-#else
-static _Complex float cpow_ui(_Complex float x, integer n) {
-	_Complex float pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-#ifdef _MSC_VER
-static _Dcomplex zpow_ui(_Dcomplex x, integer n) {
-	_Dcomplex pow={1.0,0.0}; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x._Val[0] = 1/x._Val[0], x._Val[1] =1/x._Val[1];
-		for(u = n; ; ) {
-			if(u & 01) pow._Val[0] *= x._Val[0], pow._Val[1] *= x._Val[1];
-			if(u >>= 1) x._Val[0] *= x._Val[0], x._Val[1] *= x._Val[1];
-			else break;
-		}
-	}
-	_Dcomplex p = {pow._Val[0], pow._Val[1]};
-	return p;
-}
-#else
-static _Complex double zpow_ui(_Complex double x, integer n) {
-	_Complex double pow=1.0; unsigned long int u;
-	if(n != 0) {
-		if(n < 0) n = -n, x = 1/x;
-		for(u = n; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-#endif
-static integer pow_ii(integer x, integer n) {
-	integer pow; unsigned long int u;
-	if (n <= 0) {
-		if (n == 0 || x == 1) pow = 1;
-		else if (x != -1) pow = x == 0 ? 1/x : 0;
-		else n = -n;
-	}
-	if ((n > 0) || !(n == 0 || x == 1 || x != -1)) {
-		u = n;
-		for(pow = 1; ; ) {
-			if(u & 01) pow *= x;
-			if(u >>= 1) x *= x;
-			else break;
-		}
-	}
-	return pow;
-}
-static integer dmaxloc_(double *w, integer s, integer e, integer *n)
-{
-	double m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static integer smaxloc_(float *w, integer s, integer e, integer *n)
-{
-	float m; integer i, mi;
-	for(m=w[s-1], mi=s, i=s+1; i<=e; i++)
-		if (w[i-1]>m) mi=i ,m=w[i-1];
-	return mi-s+1;
-}
-static inline void cdotc_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i]))._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i]))._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conjf(Cf(&x[i*incx]))._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conjf(Cf(&x[i*incx]))._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i])) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conjf(Cf(&x[i*incx])) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotc_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i]))._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i]))._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += conj(Cd(&x[i*incx]))._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += conj(Cd(&x[i*incx]))._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i])) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += conj(Cd(&x[i*incx])) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif	
-static inline void cdotu_(complex *z, integer *n_, complex *x, integer *incx_, complex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Fcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i])._Val[0] * Cf(&y[i])._Val[0];
-			zdotc._Val[1] += Cf(&x[i])._Val[1] * Cf(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cf(&x[i*incx])._Val[0] * Cf(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cf(&x[i*incx])._Val[1] * Cf(&y[i*incy])._Val[1];
-		}
-	}
-	pCf(z) = zdotc;
-}
-#else
-	_Complex float zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i]) * Cf(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cf(&x[i*incx]) * Cf(&y[i*incy]);
-		}
-	}
-	pCf(z) = zdotc;
-}
-#endif
-static inline void zdotu_(doublecomplex *z, integer *n_, doublecomplex *x, integer *incx_, doublecomplex *y, integer *incy_) {
-	integer n = *n_, incx = *incx_, incy = *incy_, i;
-#ifdef _MSC_VER
-	_Dcomplex zdotc = {0.0, 0.0};
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i])._Val[0] * Cd(&y[i])._Val[0];
-			zdotc._Val[1] += Cd(&x[i])._Val[1] * Cd(&y[i])._Val[1];
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc._Val[0] += Cd(&x[i*incx])._Val[0] * Cd(&y[i*incy])._Val[0];
-			zdotc._Val[1] += Cd(&x[i*incx])._Val[1] * Cd(&y[i*incy])._Val[1];
-		}
-	}
-	pCd(z) = zdotc;
-}
-#else
-	_Complex double zdotc = 0.0;
-	if (incx == 1 && incy == 1) {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i]) * Cd(&y[i]);
-		}
-	} else {
-		for (i=0;i<n;i++) { /* zdotc = zdotc + dconjg(x(i))* y(i) */
-			zdotc += Cd(&x[i*incx]) * Cd(&y[i*incy]);
-		}
-	}
-	pCd(z) = zdotc;
-}
-#endif
-/*  -- translated by f2c (version 20000121).
-   You must link the resulting object file with the libraries:
-	-lf2c -lm   (in that order)
-*/
-
-
-
 
 
 /* Table of constant values */
diff --git a/lapack-netlib/TESTING/Makefile b/lapack-netlib/TESTING/Makefile
index bdea2bfaa..3963260ac 100644
--- a/lapack-netlib/TESTING/Makefile
+++ b/lapack-netlib/TESTING/Makefile
@@ -61,6 +61,8 @@ SEIGTST= snep.out \
          scsd.out \
          slse.out
 
+SDMDEIGTST= sdmd.out
+
 CEIGTST= cnep.out \
          csep.out \
          cse2.out \
@@ -82,6 +84,8 @@ CEIGTST= cnep.out \
          ccsd.out \
          clse.out
 
+CDMDEIGTST= cdmd.out
+
 DEIGTST= dnep.out \
          dsep.out \
          dse2.out \
@@ -103,6 +107,8 @@ DEIGTST= dnep.out \
          dcsd.out \
          dlse.out
 
+DDMDEIGTST= ddmd.out
+
 ZEIGTST= znep.out \
          zsep.out \
          zse2.out \
@@ -124,6 +130,7 @@ ZEIGTST= znep.out \
          zcsd.out \
          zlse.out
 
+ZDMDEIGTST= zdmd.out
 
 SLINTST= stest.out
 
@@ -142,10 +149,10 @@ ZLINTST= ztest.out
 ZLINTSTPROTO= zctest.out ztest_rfp.out
 
 .PHONY: single complex double complex16
-single:         $(SLINTST) $(SEIGTST)
-complex:        $(CLINTST) $(CEIGTST)
-double:         $(DLINTST) $(DEIGTST)
-complex16:      $(ZLINTST) $(ZEIGTST)
+single:         $(SLINTST) $(SEIGTST) $(SDMDEIGTST)
+complex:        $(CLINTST) $(CEIGTST) $(CDMDEIGTST)
+double:         $(DLINTST) $(DEIGTST) $(DDMDEIGTST)
+complex16:      $(ZLINTST) $(ZEIGTST) $(ZDMDEIGTST)
 
 .PHONY: singleproto complexproto doubleproto complex16proto
 singleproto:    $(SLINTSTPROTO)
@@ -297,6 +304,10 @@ scsd.out: csd.in EIG/xeigtsts
 slse.out: lse.in EIG/xeigtsts
 	@echo LSE: Testing Constrained Linear Least Squares routines
 	./EIG/xeigtsts < lse.in > $@ 2>&1
+
+sdmd.out: sdmd.in EIG/xdmdeigtsts
+	@echo DMD: Testing Dynamic Mode Decomposition routines
+	./EIG/xdmdeigtsts < sdmd.in > $@ 2>&1
 #
 # ======== COMPLEX EIG TESTS ===========================
 
@@ -379,6 +390,10 @@ ccsd.out: csd.in EIG/xeigtstc
 clse.out: lse.in EIG/xeigtstc
 	@echo LSE: Testing Constrained Linear Least Squares routines
 	./EIG/xeigtstc < lse.in > $@ 2>&1
+
+cdmd.out: cdmd.in EIG/xdmdeigtstc
+	@echo DMD: Testing Dynamic Mode Decomposition routines
+	./EIG/xdmdeigtstc < cdmd.in > $@ 2>&1
 #
 # ======== DOUBLE EIG TESTS ===========================
 
@@ -461,6 +476,10 @@ dcsd.out: csd.in EIG/xeigtstd
 dlse.out: lse.in EIG/xeigtstd
 	@echo LSE: Testing Constrained Linear Least Squares routines
 	./EIG/xeigtstd < lse.in > $@ 2>&1
+
+ddmd.out: ddmd.in EIG/xdmdeigtstd
+	@echo DMD: Testing Dynamic Mode Decomposition routines
+	./EIG/xdmdeigtstd < ddmd.in > $@ 2>&1
 #
 # ======== COMPLEX16 EIG TESTS ===========================
 
@@ -543,6 +562,10 @@ zcsd.out: csd.in EIG/xeigtstz
 zlse.out: lse.in EIG/xeigtstz
 	@echo LSE: Testing Constrained Linear Least Squares routines
 	./EIG/xeigtstz < lse.in > $@ 2>&1
+
+zdmd.out: zdmd.in EIG/xdmdeigtstz
+	@echo DMD: Testing Dynamic Mode Decomposition routines
+	./EIG/xdmdeigtstz < zdmd.in > $@ 2>&1
 # ==============================================================================
 
 LIN/xlintsts: $(FRCLIN) $(FRC)
@@ -578,15 +601,27 @@ LIN/xlintstzc: $(FRCLIN) $(FRC)
 EIG/xeigtsts: $(FRCEIG) $(FRC)
 	$(MAKE) -C EIG xeigtsts
 
+EIG/xdmdeigtsts: $(FRCEIG) $(FRC)
+	$(MAKE) -C EIG xdmdeigtsts
+
 EIG/xeigtstc: $(FRCEIG) $(FRC)
 	$(MAKE) -C EIG xeigtstc
 
+EIG/xdmdeigtstc: $(FRCEIG) $(FRC)
+	$(MAKE) -C EIG xdmdeigtstc
+
 EIG/xeigtstd: $(FRCEIG) $(FRC)
 	$(MAKE) -C EIG xeigtstd
 
+EIG/xdmdeigtstd: $(FRCEIG) $(FRC)
+	$(MAKE) -C EIG xdmdeigtstd
+
 EIG/xeigtstz: $(FRCEIG) $(FRC)
 	$(MAKE) -C EIG xeigtstz
 
+EIG/xdmdeigtstz: $(FRCEIG) $(FRC)
+	$(MAKE) -C EIG xdmdeigtstz
+
 .PHONY: clean cleantest
 clean: cleantest
 cleantest:
diff --git a/lapack-netlib/TESTING/cdmd.in b/lapack-netlib/TESTING/cdmd.in
new file mode 100644
index 000000000..42d046e01
--- /dev/null
+++ b/lapack-netlib/TESTING/cdmd.in
@@ -0,0 +1,11 @@
+10
+5
+
+20
+10
+
+30
+11
+
+50
+20
diff --git a/lapack-netlib/TESTING/ctest.in b/lapack-netlib/TESTING/ctest.in
index a3588b4a1..74ff31ab8 100644
--- a/lapack-netlib/TESTING/ctest.in
+++ b/lapack-netlib/TESTING/ctest.in
@@ -42,6 +42,7 @@ CRQ    8               List types on next line if 0 < NTYPES <  8
 CLQ    8               List types on next line if 0 < NTYPES <  8
 CQL    8               List types on next line if 0 < NTYPES <  8
 CQP    6               List types on next line if 0 < NTYPES <  6
+CQK   19               List types on next line if 0 < NTYPES < 19
 CTZ    3               List types on next line if 0 < NTYPES <  3
 CLS    6               List types on next line if 0 < NTYPES <  6
 CEQ
diff --git a/lapack-netlib/TESTING/ddmd.in b/lapack-netlib/TESTING/ddmd.in
new file mode 100644
index 000000000..42d046e01
--- /dev/null
+++ b/lapack-netlib/TESTING/ddmd.in
@@ -0,0 +1,11 @@
+10
+5
+
+20
+10
+
+30
+11
+
+50
+20
diff --git a/lapack-netlib/TESTING/dgd.in b/lapack-netlib/TESTING/dgd.in
index ca24dd2a0..127b63789 100644
--- a/lapack-netlib/TESTING/dgd.in
+++ b/lapack-netlib/TESTING/dgd.in
@@ -1,6 +1,6 @@
 DGS               Data for the Real Nonsymmetric Schur Form Driver
 5                 Number of matrix dimensions
-2 6 10 12 20 30   Matrix dimensions
+2 10 12 20 30     Matrix dimensions
 1 1 1 2 1         Parameters NB, NBMIN, NXOVER, NS, NBCOL
 10                Threshold for test ratios
 .TRUE.            Put T to test the error exits
diff --git a/lapack-netlib/TESTING/dtest.in b/lapack-netlib/TESTING/dtest.in
index 29bb8b92e..1b6c7bd4a 100644
--- a/lapack-netlib/TESTING/dtest.in
+++ b/lapack-netlib/TESTING/dtest.in
@@ -36,6 +36,7 @@ DRQ    8               List types on next line if 0 < NTYPES <  8
 DLQ    8               List types on next line if 0 < NTYPES <  8
 DQL    8               List types on next line if 0 < NTYPES <  8
 DQP    6               List types on next line if 0 < NTYPES <  6
+DQK   19               List types on next line if 0 < NTYPES < 19
 DTZ    3               List types on next line if 0 < NTYPES <  3
 DLS    6               List types on next line if 0 < NTYPES <  6
 DEQ
diff --git a/lapack-netlib/TESTING/sdmd.in b/lapack-netlib/TESTING/sdmd.in
new file mode 100644
index 000000000..42d046e01
--- /dev/null
+++ b/lapack-netlib/TESTING/sdmd.in
@@ -0,0 +1,11 @@
+10
+5
+
+20
+10
+
+30
+11
+
+50
+20
diff --git a/lapack-netlib/TESTING/sgd.in b/lapack-netlib/TESTING/sgd.in
index 164b3cc35..efab6c257 100644
--- a/lapack-netlib/TESTING/sgd.in
+++ b/lapack-netlib/TESTING/sgd.in
@@ -1,6 +1,6 @@
 SGS               Data for the Real Nonsymmetric Schur Form Driver
 5                 Number of matrix dimensions
-2 6 10 12 20 30   Matrix dimensions
+2 10 12 20 30     Matrix dimensions
 1 1 1 2 1         Parameters NB, NBMIN, NXOVER, NS, NBCOL
 10                Threshold for test ratios
 .TRUE.            Put T to test the error exits
diff --git a/lapack-netlib/TESTING/stest.in b/lapack-netlib/TESTING/stest.in
index 27ac30040..7faa8b7a1 100644
--- a/lapack-netlib/TESTING/stest.in
+++ b/lapack-netlib/TESTING/stest.in
@@ -36,6 +36,7 @@ SRQ    8               List types on next line if 0 < NTYPES <  8
 SLQ    8               List types on next line if 0 < NTYPES <  8
 SQL    8               List types on next line if 0 < NTYPES <  8
 SQP    6               List types on next line if 0 < NTYPES <  6
+SQK   19               List types on next line if 0 < NTYPES < 19
 STZ    3               List types on next line if 0 < NTYPES <  3
 SLS    6               List types on next line if 0 < NTYPES <  6
 SEQ
diff --git a/lapack-netlib/TESTING/zdmd.in b/lapack-netlib/TESTING/zdmd.in
new file mode 100644
index 000000000..42d046e01
--- /dev/null
+++ b/lapack-netlib/TESTING/zdmd.in
@@ -0,0 +1,11 @@
+10
+5
+
+20
+10
+
+30
+11
+
+50
+20
diff --git a/lapack-netlib/TESTING/ztest.in b/lapack-netlib/TESTING/ztest.in
index 58da33d60..c83e82e45 100644
--- a/lapack-netlib/TESTING/ztest.in
+++ b/lapack-netlib/TESTING/ztest.in
@@ -42,6 +42,7 @@ ZRQ    8               List types on next line if 0 < NTYPES <  8
 ZLQ    8               List types on next line if 0 < NTYPES <  8
 ZQL    8               List types on next line if 0 < NTYPES <  8
 ZQP    6               List types on next line if 0 < NTYPES <  6
+ZQK   19               List types on next line if 0 < NTYPES < 19
 ZTZ    3               List types on next line if 0 < NTYPES <  3
 ZLS    6               List types on next line if 0 < NTYPES <  6
 ZEQ
diff --git a/lapack-netlib/cgbsvx.f b/lapack-netlib/cgbsvx.f
new file mode 100644
index 000000000..eaab5682c
--- /dev/null
+++ b/lapack-netlib/cgbsvx.f
@@ -0,0 +1,644 @@
+*> \brief <b> CGBSVX computes the solution to a system of linear equations A * X = B for GB matrices</b>
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CGBSVX + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/cgbsvx.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/cgbsvx.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/cgbsvx.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB,
+*                          LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX,
+*                          RCOND, FERR, BERR, WORK, RWORK, INFO )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          EQUED, FACT, TRANS
+*       INTEGER            INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS
+*       REAL               RCOND
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IPIV( * )
+*       REAL               BERR( * ), C( * ), FERR( * ), R( * ),
+*      $                   RWORK( * )
+*       COMPLEX            AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ),
+*      $                   WORK( * ), X( LDX, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CGBSVX uses the LU factorization to compute the solution to a complex
+*> system of linear equations A * X = B, A**T * X = B, or A**H * X = B,
+*> where A is a band matrix of order N with KL subdiagonals and KU
+*> superdiagonals, and X and B are N-by-NRHS matrices.
+*>
+*> Error bounds on the solution and a condition estimate are also
+*> provided.
+*> \endverbatim
+*
+*> \par Description:
+*  =================
+*>
+*> \verbatim
+*>
+*> The following steps are performed by this subroutine:
+*>
+*> 1. If FACT = 'E', real scaling factors are computed to equilibrate
+*>    the system:
+*>       TRANS = 'N':  diag(R)*A*diag(C)     *inv(diag(C))*X = diag(R)*B
+*>       TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
+*>       TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
+*>    Whether or not the system will be equilibrated depends on the
+*>    scaling of the matrix A, but if equilibration is used, A is
+*>    overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N')
+*>    or diag(C)*B (if TRANS = 'T' or 'C').
+*>
+*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the
+*>    matrix A (after equilibration if FACT = 'E') as
+*>       A = L * U,
+*>    where L is a product of permutation and unit lower triangular
+*>    matrices with KL subdiagonals, and U is upper triangular with
+*>    KL+KU superdiagonals.
+*>
+*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine
+*>    returns with INFO = i. Otherwise, the factored form of A is used
+*>    to estimate the condition number of the matrix A.  If the
+*>    reciprocal of the condition number is less than machine precision,
+*>    INFO = N+1 is returned as a warning, but the routine still goes on
+*>    to solve for X and compute error bounds as described below.
+*>
+*> 4. The system of equations is solved for X using the factored form
+*>    of A.
+*>
+*> 5. Iterative refinement is applied to improve the computed solution
+*>    matrix and calculate error bounds and backward error estimates
+*>    for it.
+*>
+*> 6. If equilibration was used, the matrix X is premultiplied by
+*>    diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so
+*>    that it solves the original system before equilibration.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] FACT
+*> \verbatim
+*>          FACT is CHARACTER*1
+*>          Specifies whether or not the factored form of the matrix A is
+*>          supplied on entry, and if not, whether the matrix A should be
+*>          equilibrated before it is factored.
+*>          = 'F':  On entry, AFB and IPIV contain the factored form of
+*>                  A.  If EQUED is not 'N', the matrix A has been
+*>                  equilibrated with scaling factors given by R and C.
+*>                  AB, AFB, and IPIV are not modified.
+*>          = 'N':  The matrix A will be copied to AFB and factored.
+*>          = 'E':  The matrix A will be equilibrated if necessary, then
+*>                  copied to AFB and factored.
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*>          TRANS is CHARACTER*1
+*>          Specifies the form of the system of equations.
+*>          = 'N':  A * X = B     (No transpose)
+*>          = 'T':  A**T * X = B  (Transpose)
+*>          = 'C':  A**H * X = B  (Conjugate transpose)
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of linear equations, i.e., the order of the
+*>          matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KL
+*> \verbatim
+*>          KL is INTEGER
+*>          The number of subdiagonals within the band of A.  KL >= 0.
+*> \endverbatim
+*>
+*> \param[in] KU
+*> \verbatim
+*>          KU is INTEGER
+*>          The number of superdiagonals within the band of A.  KU >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of columns
+*>          of the matrices B and X.  NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*>          AB is COMPLEX array, dimension (LDAB,N)
+*>          On entry, the matrix A in band storage, in rows 1 to KL+KU+1.
+*>          The j-th column of A is stored in the j-th column of the
+*>          array AB as follows:
+*>          AB(KU+1+i-j,j) = A(i,j) for max(1,j-KU)<=i<=min(N,j+KL)
+*>
+*>          If FACT = 'F' and EQUED is not 'N', then A must have been
+*>          equilibrated by the scaling factors in R and/or C.  AB is not
+*>          modified if FACT = 'F' or 'N', or if FACT = 'E' and
+*>          EQUED = 'N' on exit.
+*>
+*>          On exit, if EQUED .ne. 'N', A is scaled as follows:
+*>          EQUED = 'R':  A := diag(R) * A
+*>          EQUED = 'C':  A := A * diag(C)
+*>          EQUED = 'B':  A := diag(R) * A * diag(C).
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*>          LDAB is INTEGER
+*>          The leading dimension of the array AB.  LDAB >= KL+KU+1.
+*> \endverbatim
+*>
+*> \param[in,out] AFB
+*> \verbatim
+*>          AFB is COMPLEX array, dimension (LDAFB,N)
+*>          If FACT = 'F', then AFB is an input argument and on entry
+*>          contains details of the LU factorization of the band matrix
+*>          A, as computed by CGBTRF.  U is stored as an upper triangular
+*>          band matrix with KL+KU superdiagonals in rows 1 to KL+KU+1,
+*>          and the multipliers used during the factorization are stored
+*>          in rows KL+KU+2 to 2*KL+KU+1.  If EQUED .ne. 'N', then AFB is
+*>          the factored form of the equilibrated matrix A.
+*>
+*>          If FACT = 'N', then AFB is an output argument and on exit
+*>          returns details of the LU factorization of A.
+*>
+*>          If FACT = 'E', then AFB is an output argument and on exit
+*>          returns details of the LU factorization of the equilibrated
+*>          matrix A (see the description of AB for the form of the
+*>          equilibrated matrix).
+*> \endverbatim
+*>
+*> \param[in] LDAFB
+*> \verbatim
+*>          LDAFB is INTEGER
+*>          The leading dimension of the array AFB.  LDAFB >= 2*KL+KU+1.
+*> \endverbatim
+*>
+*> \param[in,out] IPIV
+*> \verbatim
+*>          IPIV is INTEGER array, dimension (N)
+*>          If FACT = 'F', then IPIV is an input argument and on entry
+*>          contains the pivot indices from the factorization A = L*U
+*>          as computed by CGBTRF; row i of the matrix was interchanged
+*>          with row IPIV(i).
+*>
+*>          If FACT = 'N', then IPIV is an output argument and on exit
+*>          contains the pivot indices from the factorization A = L*U
+*>          of the original matrix A.
+*>
+*>          If FACT = 'E', then IPIV is an output argument and on exit
+*>          contains the pivot indices from the factorization A = L*U
+*>          of the equilibrated matrix A.
+*> \endverbatim
+*>
+*> \param[in,out] EQUED
+*> \verbatim
+*>          EQUED is CHARACTER*1
+*>          Specifies the form of equilibration that was done.
+*>          = 'N':  No equilibration (always true if FACT = 'N').
+*>          = 'R':  Row equilibration, i.e., A has been premultiplied by
+*>                  diag(R).
+*>          = 'C':  Column equilibration, i.e., A has been postmultiplied
+*>                  by diag(C).
+*>          = 'B':  Both row and column equilibration, i.e., A has been
+*>                  replaced by diag(R) * A * diag(C).
+*>          EQUED is an input argument if FACT = 'F'; otherwise, it is an
+*>          output argument.
+*> \endverbatim
+*>
+*> \param[in,out] R
+*> \verbatim
+*>          R is REAL array, dimension (N)
+*>          The row scale factors for A.  If EQUED = 'R' or 'B', A is
+*>          multiplied on the left by diag(R); if EQUED = 'N' or 'C', R
+*>          is not accessed.  R is an input argument if FACT = 'F';
+*>          otherwise, R is an output argument.  If FACT = 'F' and
+*>          EQUED = 'R' or 'B', each element of R must be positive.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is REAL array, dimension (N)
+*>          The column scale factors for A.  If EQUED = 'C' or 'B', A is
+*>          multiplied on the right by diag(C); if EQUED = 'N' or 'R', C
+*>          is not accessed.  C is an input argument if FACT = 'F';
+*>          otherwise, C is an output argument.  If FACT = 'F' and
+*>          EQUED = 'C' or 'B', each element of C must be positive.
+*> \endverbatim
+*>
+*> \param[in,out] B
+*> \verbatim
+*>          B is COMPLEX array, dimension (LDB,NRHS)
+*>          On entry, the right hand side matrix B.
+*>          On exit,
+*>          if EQUED = 'N', B is not modified;
+*>          if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by
+*>          diag(R)*B;
+*>          if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is
+*>          overwritten by diag(C)*B.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*>          LDB is INTEGER
+*>          The leading dimension of the array B.  LDB >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] X
+*> \verbatim
+*>          X is COMPLEX array, dimension (LDX,NRHS)
+*>          If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X
+*>          to the original system of equations.  Note that A and B are
+*>          modified on exit if EQUED .ne. 'N', and the solution to the
+*>          equilibrated system is inv(diag(C))*X if TRANS = 'N' and
+*>          EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C'
+*>          and EQUED = 'R' or 'B'.
+*> \endverbatim
+*>
+*> \param[in] LDX
+*> \verbatim
+*>          LDX is INTEGER
+*>          The leading dimension of the array X.  LDX >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] RCOND
+*> \verbatim
+*>          RCOND is REAL
+*>          The estimate of the reciprocal condition number of the matrix
+*>          A after equilibration (if done).  If RCOND is less than the
+*>          machine precision (in particular, if RCOND = 0), the matrix
+*>          is singular to working precision.  This condition is
+*>          indicated by a return code of INFO > 0.
+*> \endverbatim
+*>
+*> \param[out] FERR
+*> \verbatim
+*>          FERR is REAL array, dimension (NRHS)
+*>          The estimated forward error bound for each solution vector
+*>          X(j) (the j-th column of the solution matrix X).
+*>          If XTRUE is the true solution corresponding to X(j), FERR(j)
+*>          is an estimated upper bound for the magnitude of the largest
+*>          element in (X(j) - XTRUE) divided by the magnitude of the
+*>          largest element in X(j).  The estimate is as reliable as
+*>          the estimate for RCOND, and is almost always a slight
+*>          overestimate of the true error.
+*> \endverbatim
+*>
+*> \param[out] BERR
+*> \verbatim
+*>          BERR is REAL array, dimension (NRHS)
+*>          The componentwise relative backward error of each solution
+*>          vector X(j) (i.e., the smallest relative change in
+*>          any element of A or B that makes X(j) an exact solution).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX array, dimension (2*N)
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is REAL array, dimension (MAX(1,N))
+*>          On exit, RWORK(1) contains the reciprocal pivot growth
+*>          factor norm(A)/norm(U). The "max absolute element" norm is
+*>          used. If RWORK(1) is much less than 1, then the stability
+*>          of the LU factorization of the (equilibrated) matrix A
+*>          could be poor. This also means that the solution X, condition
+*>          estimator RCOND, and forward error bound FERR could be
+*>          unreliable. If factorization fails with 0<INFO<=N, then
+*>          RWORK(1) contains the reciprocal pivot growth factor for the
+*>          leading INFO columns of A.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i, and i is
+*>                <= N:  U(i,i) is exactly zero.  The factorization
+*>                       has been completed, but the factor U is exactly
+*>                       singular, so the solution and error bounds
+*>                       could not be computed. RCOND = 0 is returned.
+*>                = N+1: U is nonsingular, but RCOND is less than machine
+*>                       precision, meaning that the matrix is singular
+*>                       to working precision.  Nevertheless, the
+*>                       solution and error bounds are computed because
+*>                       there are a number of situations where the
+*>                       computed solution can be more accurate than the
+*>                       value of RCOND would suggest.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup complexGBsolve
+*
+*  =====================================================================
+      SUBROUTINE CGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB,
+     $                   LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX,
+     $                   RCOND, FERR, BERR, WORK, RWORK, INFO )
+*
+*  -- LAPACK driver routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      CHARACTER          EQUED, FACT, TRANS
+      INTEGER            INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS
+      REAL               RCOND
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IPIV( * )
+      REAL               BERR( * ), C( * ), FERR( * ), R( * ),
+     $                   RWORK( * )
+      COMPLEX            AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ),
+     $                   WORK( * ), X( LDX, * )
+*     ..
+*
+*  =====================================================================
+*  Moved setting of INFO = N+1 so INFO does not subsequently get
+*  overwritten.  Sven, 17 Mar 05.
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU
+      CHARACTER          NORM
+      INTEGER            I, INFEQU, J, J1, J2
+      REAL               AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN,
+     $                   ROWCND, RPVGRW, SMLNUM
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      REAL               CLANGB, CLANTB, SLAMCH
+      EXTERNAL           LSAME, CLANGB, CLANTB, SLAMCH
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           CCOPY, CGBCON, CGBEQU, CGBRFS, CGBTRF, CGBTRS,
+     $                   CLACPY, CLAQGB, XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+      INFO = 0
+      NOFACT = LSAME( FACT, 'N' )
+      EQUIL = LSAME( FACT, 'E' )
+      NOTRAN = LSAME( TRANS, 'N' )
+      IF( NOFACT .OR. EQUIL ) THEN
+         EQUED = 'N'
+         ROWEQU = .FALSE.
+         COLEQU = .FALSE.
+      ELSE
+         ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+         COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         SMLNUM = SLAMCH( 'Safe minimum' )
+         BIGNUM = ONE / SMLNUM
+      END IF
+*
+*     Test the input parameters; INFO = -i flags the i-th argument.
+*
+      IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) )
+     $     THEN
+         INFO = -1
+      ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT.
+     $         LSAME( TRANS, 'C' ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( KL.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( KU.LT.0 ) THEN
+         INFO = -5
+      ELSE IF( NRHS.LT.0 ) THEN
+         INFO = -6
+      ELSE IF( LDAB.LT.KL+KU+1 ) THEN
+         INFO = -8
+      ELSE IF( LDAFB.LT.2*KL+KU+1 ) THEN
+         INFO = -10
+      ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT.
+     $         ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN
+         INFO = -12
+      ELSE
+         IF( ROWEQU ) THEN
+            RCMIN = BIGNUM
+            RCMAX = ZERO
+            DO 10 J = 1, N
+               RCMIN = MIN( RCMIN, R( J ) )
+               RCMAX = MAX( RCMAX, R( J ) )
+   10       CONTINUE
+            IF( RCMIN.LE.ZERO ) THEN
+               INFO = -13
+            ELSE IF( N.GT.0 ) THEN
+               ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM )
+            ELSE
+               ROWCND = ONE
+            END IF
+         END IF
+         IF( COLEQU .AND. INFO.EQ.0 ) THEN
+            RCMIN = BIGNUM
+            RCMAX = ZERO
+            DO 20 J = 1, N
+               RCMIN = MIN( RCMIN, C( J ) )
+               RCMAX = MAX( RCMAX, C( J ) )
+   20       CONTINUE
+            IF( RCMIN.LE.ZERO ) THEN
+               INFO = -14
+            ELSE IF( N.GT.0 ) THEN
+               COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM )
+            ELSE
+               COLCND = ONE
+            END IF
+         END IF
+         IF( INFO.EQ.0 ) THEN
+            IF( LDB.LT.MAX( 1, N ) ) THEN
+               INFO = -16
+            ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
+               INFO = -18
+            END IF
+         END IF
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'CGBSVX', -INFO )
+         RETURN
+      END IF
+*
+      IF( EQUIL ) THEN
+*
+*        Compute row and column scalings to equilibrate the matrix A.
+*
+         CALL CGBEQU( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND,
+     $                AMAX, INFEQU )
+         IF( INFEQU.EQ.0 ) THEN
+*
+*           Equilibrate the matrix.
+*
+            CALL CLAQGB( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND,
+     $                   AMAX, EQUED )
+            ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+            COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         END IF
+      END IF
+*
+*     Scale the right hand side: by R if no transpose, else by C.
+*
+      IF( NOTRAN ) THEN
+         IF( ROWEQU ) THEN
+            DO 40 J = 1, NRHS
+               DO 30 I = 1, N
+                  B( I, J ) = R( I )*B( I, J )
+   30          CONTINUE
+   40       CONTINUE
+         END IF
+      ELSE IF( COLEQU ) THEN
+         DO 60 J = 1, NRHS
+            DO 50 I = 1, N
+               B( I, J ) = C( I )*B( I, J )
+   50       CONTINUE
+   60    CONTINUE
+      END IF
+*
+      IF( NOFACT .OR. EQUIL ) THEN
+*
+*        Copy the band of A into AFB, then compute its LU factorization.
+*
+         DO 70 J = 1, N
+            J1 = MAX( J-KU, 1 )
+            J2 = MIN( J+KL, N )
+            CALL CCOPY( J2-J1+1, AB( KU+1-J+J1, J ), 1,
+     $                  AFB( KL+KU+1-J+J1, J ), 1 )
+   70    CONTINUE
+*
+         CALL CGBTRF( N, N, KL, KU, AFB, LDAFB, IPIV, INFO )
+*
+*        Return early if CGBTRF reported INFO > 0 (singular U).
+*
+         IF( INFO.GT.0 ) THEN
+*
+*           Compute the reciprocal pivot growth factor of the
+*           leading rank-deficient INFO columns of A.
+*
+            ANORM = ZERO
+            DO 90 J = 1, INFO
+               DO 80 I = MAX( KU+2-J, 1 ), MIN( N+KU+1-J, KL+KU+1 )
+                  ANORM = MAX( ANORM, ABS( AB( I, J ) ) )
+   80          CONTINUE
+   90       CONTINUE
+            RPVGRW = CLANTB( 'M', 'U', 'N', INFO, MIN( INFO-1, KL+KU ),
+     $                       AFB( MAX( 1, KL+KU+2-INFO ), 1 ), LDAFB,
+     $                       RWORK )
+            IF( RPVGRW.EQ.ZERO ) THEN
+               RPVGRW = ONE
+            ELSE
+               RPVGRW = ANORM / RPVGRW
+            END IF
+            RWORK( 1 ) = RPVGRW
+            RCOND = ZERO
+            RETURN
+         END IF
+      END IF
+*
+*     Compute the norm of A ('1' if NOTRAN, else 'I') and the
+*     reciprocal pivot growth factor RPVGRW.
+*
+      IF( NOTRAN ) THEN
+         NORM = '1'
+      ELSE
+         NORM = 'I'
+      END IF
+      ANORM = CLANGB( NORM, N, KL, KU, AB, LDAB, RWORK )
+      RPVGRW = CLANTB( 'M', 'U', 'N', N, KL+KU, AFB, LDAFB, RWORK )
+      IF( RPVGRW.EQ.ZERO ) THEN
+         RPVGRW = ONE
+      ELSE
+         RPVGRW = CLANGB( 'M', N, KL, KU, AB, LDAB, RWORK ) / RPVGRW
+      END IF
+*
+*     Compute the reciprocal of the condition number of A.
+*
+      CALL CGBCON( NORM, N, KL, KU, AFB, LDAFB, IPIV, ANORM, RCOND,
+     $             WORK, RWORK, INFO )
+*
+*     Compute the solution matrix X.
+*
+      CALL CLACPY( 'Full', N, NRHS, B, LDB, X, LDX )
+      CALL CGBTRS( TRANS, N, KL, KU, NRHS, AFB, LDAFB, IPIV, X, LDX,
+     $             INFO )
+*
+*     Use iterative refinement to improve the computed solution and
+*     compute error bounds and backward error estimates for it.
+*
+      CALL CGBRFS( TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, LDAFB, IPIV,
+     $             B, LDB, X, LDX, FERR, BERR, WORK, RWORK, INFO )
+*
+*     Undo the equilibration: rescale X (by C if NOTRAN, else by R)
+*     and divide FERR by the matching COLCND or ROWCND.
+*
+      IF( NOTRAN ) THEN
+         IF( COLEQU ) THEN
+            DO 110 J = 1, NRHS
+               DO 100 I = 1, N
+                  X( I, J ) = C( I )*X( I, J )
+  100          CONTINUE
+  110       CONTINUE
+            DO 120 J = 1, NRHS
+               FERR( J ) = FERR( J ) / COLCND
+  120       CONTINUE
+         END IF
+      ELSE IF( ROWEQU ) THEN
+         DO 140 J = 1, NRHS
+            DO 130 I = 1, N
+               X( I, J ) = R( I )*X( I, J )
+  130       CONTINUE
+  140    CONTINUE
+         DO 150 J = 1, NRHS
+            FERR( J ) = FERR( J ) / ROWCND
+  150    CONTINUE
+      END IF
+*
+*     Set INFO = N+1 if RCOND < eps (singular to working precision).
+*
+      IF( RCOND.LT.SLAMCH( 'Epsilon' ) )
+     $   INFO = N + 1
+*
+      RWORK( 1 ) = RPVGRW
+      RETURN
+*
+*     End of CGBSVX
+*
+      END
diff --git a/lapack-netlib/cgejsv.f b/lapack-netlib/cgejsv.f
new file mode 100644
index 000000000..51a6cee4e
--- /dev/null
+++ b/lapack-netlib/cgejsv.f
@@ -0,0 +1,2232 @@
+*> \brief \b CGEJSV
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CGEJSV + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/cgejsv.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/cgejsv.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/cgejsv.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*     SUBROUTINE CGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP,
+*                         M, N, A, LDA, SVA, U, LDU, V, LDV,
+*                         CWORK, LWORK, RWORK, LRWORK, IWORK, INFO )
+*
+*     .. Scalar Arguments ..
+*     IMPLICIT    NONE
+*     INTEGER     INFO, LDA, LDU, LDV, LWORK, M, N
+*     ..
+*     .. Array Arguments ..
+*     COMPLEX     A( LDA, * ),  U( LDU, * ), V( LDV, * ), CWORK( LWORK )
+*     REAL        SVA( N ), RWORK( LRWORK )
+*     INTEGER     IWORK( * )
+*     CHARACTER*1 JOBA, JOBP, JOBR, JOBT, JOBU, JOBV
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CGEJSV computes the singular value decomposition (SVD) of a complex M-by-N
+*> matrix [A], where M >= N. The SVD of [A] is written as
+*>
+*>              [A] = [U] * [SIGMA] * [V]^*,
+*>
+*> where [SIGMA] is an N-by-N (M-by-N) matrix which is zero except for its N
+*> diagonal elements, [U] is an M-by-N (or M-by-M) unitary matrix, and
+*> [V] is an N-by-N unitary matrix. The diagonal elements of [SIGMA] are
+*> the singular values of [A]. The columns of [U] and [V] are the left and
+*> the right singular vectors of [A], respectively. The matrices [U] and [V]
+*> are computed and stored in the arrays U and V, respectively. The diagonal
+*> of [SIGMA] is computed and stored in the array SVA.
+*> \endverbatim
+*>
+*>  Arguments:
+*>  ==========
+*>
+*> \param[in] JOBA
+*> \verbatim
+*>          JOBA is CHARACTER*1
+*>         Specifies the level of accuracy:
+*>       = 'C': This option works well (high relative accuracy) if A = B * D,
+*>              with well-conditioned B and arbitrary diagonal matrix D.
+*>              The accuracy cannot be spoiled by COLUMN scaling. The
+*>              accuracy of the computed output depends on the condition of
+*>              B, and the procedure aims at the best theoretical accuracy.
+*>              The relative error max_{i=1:N}|d sigma_i| / sigma_i is
+*>              bounded by f(M,N)*epsilon* cond(B), independent of D.
+*>              The input matrix is preprocessed with the QRF with column
+*>              pivoting. This initial preprocessing and preconditioning by
+*>              a rank revealing QR factorization is common for all values of
+*>              JOBA. Additional actions are specified as follows:
+*>       = 'E': Computation as with 'C' with an additional estimate of the
+*>              condition number of B. It provides a realistic error bound.
+*>       = 'F': If A = D1 * C * D2 with ill-conditioned diagonal scalings
+*>              D1, D2, and well-conditioned matrix C, this option gives
+*>              higher accuracy than the 'C' option. If the structure of the
+*>              input matrix is not known, and relative accuracy is
+*>              desirable, then this option is advisable. The input matrix A
+*>              is preprocessed with QR factorization with FULL (row and
+*>              column) pivoting.
+*>       = 'G': Computation as with 'F' with an additional estimate of the
+*>              condition number of B, where A=B*D. If A has heavily weighted
+*>              rows, then using this condition number gives too pessimistic
+*>              error bound.
+*>       = 'A': Small singular values are not well determined by the data 
+*>              and are considered as noisy; the matrix is treated as
+*>              numerically rank deficient. The error in the computed
+*>              singular values is bounded by f(m,n)*epsilon*||A||.
+*>              The computed SVD A = U * S * V^* restores A up to
+*>              f(m,n)*epsilon*||A||.
+*>              This gives the procedure the licence to discard (set to zero)
+*>              all singular values below N*epsilon*||A||.
+*>       = 'R': Similar to 'A'. The rank revealing property of the initial
+*>              QR factorization is used to reveal (using triangular factor)
+*>              a gap sigma_{r+1} < epsilon * sigma_r in which case the
+*>              numerical RANK is declared to be r. The SVD is computed with
+*>              absolute error bounds, but more accurately than with 'A'.
+*> \endverbatim
+*>
+*> \param[in] JOBU
+*> \verbatim
+*>          JOBU is CHARACTER*1
+*>         Specifies whether to compute the columns of U:
+*>       = 'U': N columns of U are returned in the array U.
+*>       = 'F': full set of M left sing. vectors is returned in the array U.
+*>       = 'W': U may be used as workspace of length M*N. See the description
+*>              of U.
+*>       = 'N': U is not computed.
+*> \endverbatim
+*>
+*> \param[in] JOBV
+*> \verbatim
+*>          JOBV is CHARACTER*1
+*>         Specifies whether to compute the matrix V:
+*>       = 'V': N columns of V are returned in the array V; Jacobi rotations
+*>              are not explicitly accumulated.
+*>       = 'J': N columns of V are returned in the array V, but they are
+*>              computed as the product of Jacobi rotations, if JOBT = 'N'.
+*>       = 'W': V may be used as workspace of length N*N. See the description
+*>              of V.
+*>       = 'N': V is not computed.
+*> \endverbatim
+*>
+*> \param[in] JOBR
+*> \verbatim
+*>          JOBR is CHARACTER*1
+*>         Specifies the RANGE for the singular values. Issues the licence to
+*>         set to zero small positive singular values if they are outside
+*>         specified range. If A .NE. 0 is scaled so that the largest singular
+*>         value of c*A is around SQRT(BIG), BIG=SLAMCH('O'), then JOBR issues
+*>         the licence to kill columns of A whose norm in c*A is less than
+*>         SQRT(SFMIN) (for JOBR = 'R'), or less than SMALL=SFMIN/EPSLN,
+*>         where SFMIN=SLAMCH('S'), EPSLN=SLAMCH('E').
+*>       = 'N': Do not kill small columns of c*A. This option assumes that
+*>              BLAS and QR factorizations and triangular solvers are
+*>              implemented to work in that range. If the condition of A
+*>              is greater than BIG, use CGESVJ.
+*>       = 'R': RESTRICTED range for sigma(c*A) is [SQRT(SFMIN), SQRT(BIG)]
+*>              (roughly, as described above). This option is recommended.
+*>                                             ===========================
+*>         For computing the singular values in the FULL range [SFMIN,BIG]
+*>         use CGESVJ.
+*> \endverbatim
+*>
+*> \param[in] JOBT
+*> \verbatim
+*>          JOBT is CHARACTER*1
+*>         If the matrix is square then the procedure may determine to use
+*>         transposed A if A^* seems to be better with respect to convergence.
+*>         If the matrix is not square, JOBT is ignored.
+*>         The decision is based on two values of entropy over the adjoint
+*>         orbit of A^* * A. See the descriptions of RWORK(6) and RWORK(7).
+*>       = 'T': transpose if entropy test indicates possibly faster
+*>         convergence of Jacobi process if A^* is taken as input. If A is
+*>         replaced with A^*, then the row pivoting is included automatically.
+*>       = 'N': do not speculate.
+*>         The option 'T' can be used to compute only the singular values, or
+*>         the full SVD (U, SIGMA and V). For only one set of singular vectors
+*>         (U or V), the caller should provide both U and V, as one of the
+*>         matrices is used as workspace if the matrix A is transposed.
+*>         The implementer can easily remove this constraint and make the
+*>         code more complicated. See the descriptions of U and V.
+*>         In general, this option is considered experimental, and 'N' should
+*>         be preferred. This is subject to changes in the future.
+*> \endverbatim
+*>
+*> \param[in] JOBP
+*> \verbatim
+*>          JOBP is CHARACTER*1
+*>         Issues the licence to introduce structured perturbations to drown
+*>         denormalized numbers. This licence should be active if the
+*>         denormals are poorly implemented, causing slow computation,
+*>         especially in cases of fast convergence (!). For details see [1,2].
+*>         For the sake of simplicity, these perturbations are included only
+*>         when the full SVD or only the singular values are requested. The
+*>         implementer/user can easily add the perturbation for the cases of
+*>         computing one set of singular vectors.
+*>       = 'P': introduce perturbation
+*>       = 'N': do not perturb
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>         The number of rows of the input matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>         The number of columns of the input matrix A. M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX array, dimension (LDA,N)
+*>          On entry, the M-by-N matrix A.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] SVA
+*> \verbatim
+*>          SVA is REAL array, dimension (N)
+*>          On exit,
+*>          - For RWORK(1)/RWORK(2) = ONE: The singular values of A. During
+*>            the computation SVA contains Euclidean column norms of the
+*>            iterated matrices in the array A.
+*>          - For RWORK(1) .NE. RWORK(2): The singular values of A are
+*>            (RWORK(1)/RWORK(2)) * SVA(1:N). This factored form is used if
+*>            sigma_max(A) overflows or if small singular values have been
+*>            saved from underflow by scaling the input matrix A.
+*>          - If JOBR='R' then some of the singular values may be returned
+*>            as exact zeros obtained by "set to zero" because they are
+*>            below the numerical rank threshold or are denormalized numbers.
+*> \endverbatim
+*>
+*> \param[out] U
+*> \verbatim
+*>          U is COMPLEX array, dimension ( LDU, N ) or ( LDU, M )
+*>          If JOBU = 'U', then U contains on exit the M-by-N matrix of
+*>                         the left singular vectors.
+*>          If JOBU = 'F', then U contains on exit the M-by-M matrix of
+*>                         the left singular vectors, including an ONB
+*>                         of the orthogonal complement of the Range(A).
+*>          If JOBU = 'W'  .AND. (JOBV = 'V' .AND. JOBT = 'T' .AND. M = N),
+*>                         then U is used as workspace if the procedure
+*>                         replaces A with A^*. In that case, [V] is computed
+*>                         in U as left singular vectors of A^* and then
+*>                         copied back to the V array. This 'W' option is just
+*>                         a reminder to the caller that in this case U is
+*>                         reserved as workspace of length N*N.
+*>          If JOBU = 'N'  U is not referenced, unless JOBT='T'.
+*> \endverbatim
+*>
+*> \param[in] LDU
+*> \verbatim
+*>          LDU is INTEGER
+*>          The leading dimension of the array U,  LDU >= 1.
+*>          IF  JOBU = 'U' or 'F' or 'W',  then LDU >= M.
+*> \endverbatim
+*>
+*> \param[out] V
+*> \verbatim
+*>          V is COMPLEX array, dimension ( LDV, N )
+*>          If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of
+*>                         the right singular vectors;
+*>          If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N),
+*>                         then V is used as workspace if the procedure
+*>                         replaces A with A^*. In that case, [U] is computed
+*>                         in V as right singular vectors of A^* and then
+*>                         copied back to the U array. This 'W' option is just
+*>                         a reminder to the caller that in this case V is
+*>                         reserved as workspace of length N*N.
+*>          If JOBV = 'N'  V is not referenced, unless JOBT='T'.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*>          LDV is INTEGER
+*>          The leading dimension of the array V,  LDV >= 1.
+*>          If JOBV = 'V' or 'J' or 'W', then LDV >= N.
+*> \endverbatim
+*>
+*> \param[out] CWORK
+*> \verbatim
+*>          CWORK is COMPLEX array, dimension (MAX(2,LWORK))
+*>          If the call to CGEJSV is a workspace query (indicated by LWORK=-1 or
+*>          LRWORK=-1), then on exit CWORK(1) contains the required length of 
+*>          CWORK for the job parameters used in the call.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          Length of CWORK to confirm proper allocation of workspace.
+*>          LWORK depends on the job:
+*>
+*>          1. If only SIGMA is needed ( JOBU = 'N', JOBV = 'N' ) and
+*>            1.1 .. no scaled condition estimate required (JOBA.NE.'E'.AND.JOBA.NE.'G'):
+*>               LWORK >= 2*N+1. This is the minimal requirement.
+*>               ->> For optimal performance (blocked code) the optimal value
+*>               is LWORK >= N + (N+1)*NB. Here NB is the optimal
+*>               block size for CGEQP3 and CGEQRF.
+*>               In general, optimal LWORK is computed as
+*>               LWORK >= max(N+LWORK(CGEQP3),N+LWORK(CGEQRF), LWORK(CGESVJ)).        
+*>            1.2. .. an estimate of the scaled condition number of A is
+*>               required (JOBA='E', or 'G'). In this case, the minimal
+*>               requirement is LWORK >= N*N + 2*N.
+*>               ->> For optimal performance (blocked code) the optimal value
+*>               is LWORK >= max(N+(N+1)*NB, N*N+2*N)=N**2+2*N.
+*>               In general, the optimal length LWORK is computed as
+*>               LWORK >= max(N+LWORK(CGEQP3),N+LWORK(CGEQRF), LWORK(CGESVJ),
+*>                            N*N+LWORK(CPOCON)).
+*>          2. If SIGMA and the right singular vectors are needed (JOBV = 'V'),
+*>             (JOBU = 'N')
+*>            2.1   .. no scaled condition estimate requested (JOBA.NE.'E'.AND.JOBA.NE.'G'):
+*>            -> the minimal requirement is LWORK >= 3*N.
+*>            -> For optimal performance, 
+*>               LWORK >= max(N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB,
+*>               where NB is the optimal block size for CGEQP3, CGEQRF, CGELQF,
+*>               CUNMLQ. In general, the optimal length LWORK is computed as
+*>               LWORK >= max(N+LWORK(CGEQP3), N+LWORK(CGESVJ),
+*>                       N+LWORK(CGELQF), 2*N+LWORK(CGEQRF), N+LWORK(CUNMLQ)).
+*>            2.2 .. an estimate of the scaled condition number of A is
+*>               required (JOBA='E', or 'G').
+*>            -> the minimal requirement is LWORK >= 3*N.      
+*>            -> For optimal performance, 
+*>               LWORK >= max(N+(N+1)*NB, 2*N,2*N+N*NB)=2*N+N*NB,
+*>               where NB is the optimal block size for CGEQP3, CGEQRF, CGELQF,
+*>               CUNMLQ. In general, the optimal length LWORK is computed as
+*>               LWORK >= max(N+LWORK(CGEQP3), LWORK(CPOCON), N+LWORK(CGESVJ),
+*>                       N+LWORK(CGELQF), 2*N+LWORK(CGEQRF), N+LWORK(CUNMLQ)).   
+*>          3. If SIGMA and the left singular vectors are needed
+*>            3.1  .. no scaled condition estimate requested (JOBA.NE.'E'.AND.JOBA.NE.'G'):
+*>            -> the minimal requirement is LWORK >= 3*N.
+*>            -> For optimal performance:
+*>               if JOBU = 'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB,
+*>               where NB is the optimal block size for CGEQP3, CGEQRF, CUNMQR.
+*>               In general, the optimal length LWORK is computed as
+*>               LWORK >= max(N+LWORK(CGEQP3), 2*N+LWORK(CGEQRF), N+LWORK(CUNMQR)). 
+*>            3.2  .. an estimate of the scaled condition number of A is
+*>               required (JOBA='E', or 'G').
+*>            -> the minimal requirement is LWORK >= 3*N.
+*>            -> For optimal performance:
+*>               if JOBU = 'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB,
+*>               where NB is the optimal block size for CGEQP3, CGEQRF, CUNMQR.
+*>               In general, the optimal length LWORK is computed as
+*>               LWORK >= max(N+LWORK(CGEQP3),N+LWORK(CPOCON),
+*>                        2*N+LWORK(CGEQRF), N+LWORK(CUNMQR)).
+*>
+*>          4. If the full SVD is needed: (JOBU = 'U' or JOBU = 'F') and
+*>            4.1. if JOBV = 'V'
+*>               the minimal requirement is LWORK >= 5*N+2*N*N.
+*>            4.2. if JOBV = 'J' the minimal requirement is
+*>               LWORK >= 4*N+N*N.
+*>            In both cases, the allocated CWORK can accommodate blocked runs
+*>            of CGEQP3, CGEQRF, CGELQF, CUNMQR, CUNMLQ.
+*> 
+*>          If the call to CGEJSV is a workspace query (indicated by LWORK=-1 or
+*>          LRWORK=-1), then on exit CWORK(1) contains the optimal and CWORK(2) contains the
+*>          minimal length of CWORK for the job parameters used in the call.        
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is REAL array, dimension (MAX(7,LRWORK))
+*>          On exit,
+*>          RWORK(1) = Determines the scaling factor SCALE = RWORK(1) / RWORK(2)
+*>                    such that SCALE*SVA(1:N) are the computed singular values
+*>                    of A. (See the description of SVA().)
+*>          RWORK(2) = See the description of RWORK(1).
+*>          RWORK(3) = SCONDA is an estimate for the condition number of
+*>                    column equilibrated A. (If JOBA = 'E' or 'G')
+*>                    SCONDA is an estimate of SQRT(||(R^* * R)^(-1)||_1).
+*>                    It is computed using CPOCON. It holds
+*>                    N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA
+*>                    where R is the triangular factor from the QRF of A.
+*>                    However, if R is truncated and the numerical rank is
+*>                    determined to be strictly smaller than N, SCONDA is
+*>                    returned as -1, thus indicating that the smallest
+*>                    singular values might be lost.
+*>
+*>          If full SVD is needed, the following two condition numbers are
+*>          useful for the analysis of the algorithm. They are provided for
+*>          a developer/implementer who is familiar with the details of
+*>          the method.
+*>
+*>          RWORK(4) = an estimate of the scaled condition number of the
+*>                    triangular factor in the first QR factorization.
+*>          RWORK(5) = an estimate of the scaled condition number of the
+*>                    triangular factor in the second QR factorization.
+*>          The following two parameters are computed if JOBT = 'T'.
+*>          They are provided for a developer/implementer who is familiar
+*>          with the details of the method.
+*>          RWORK(6) = the entropy of A^* * A :: this is the Shannon entropy
+*>                    of diag(A^* * A) / Trace(A^* * A) taken as point in the
+*>                    probability simplex.
+*>          RWORK(7) = the entropy of A * A^*. (See the description of RWORK(6).)
+*>          If the call to CGEJSV is a workspace query (indicated by LWORK=-1 or
+*>          LRWORK=-1), then on exit RWORK(1) contains the required length of
+*>          RWORK for the job parameters used in the call.
+*> \endverbatim
+*>
+*> \param[in] LRWORK
+*> \verbatim
+*>          LRWORK is INTEGER
+*>          Length of RWORK to confirm proper allocation of workspace.
+*>          LRWORK depends on the job:
+*>
+*>       1. If only the singular values are requested i.e. if
+*>          LSAME(JOBU,'N') .AND. LSAME(JOBV,'N')
+*>          then:
+*>          1.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'),
+*>               then: LRWORK = max( 7, 2 * M ).
+*>          1.2. Otherwise, LRWORK  = max( 7,  N ).
+*>       2. If singular values with the right singular vectors are requested
+*>          i.e. if
+*>          (LSAME(JOBV,'V').OR.LSAME(JOBV,'J')) .AND.
+*>          .NOT.(LSAME(JOBU,'U').OR.LSAME(JOBU,'F'))
+*>          then:
+*>          2.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'),
+*>          then LRWORK = max( 7, 2 * M ).
+*>          2.2. Otherwise, LRWORK  = max( 7,  N ).
+*>       3. If singular values with the left singular vectors are requested, i.e. if
+*>          (LSAME(JOBU,'U').OR.LSAME(JOBU,'F')) .AND.
+*>          .NOT.(LSAME(JOBV,'V').OR.LSAME(JOBV,'J'))
+*>          then:
+*>          3.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'),
+*>          then LRWORK = max( 7, 2 * M ).
+*>          3.2. Otherwise, LRWORK  = max( 7,  N ).
+*>       4. If singular values with both the left and the right singular vectors
+*>          are requested, i.e. if
+*>          (LSAME(JOBU,'U').OR.LSAME(JOBU,'F')) .AND.
+*>          (LSAME(JOBV,'V').OR.LSAME(JOBV,'J'))
+*>          then:
+*>          4.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'),
+*>          then LRWORK = max( 7, 2 * M ).
+*>          4.2. Otherwise, LRWORK  = max( 7, N ).
+*> 
+*>          If, on entry, LRWORK = -1 or LWORK=-1, a workspace query is assumed and 
+*>          the length of RWORK is returned in RWORK(1). 
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, of dimension at least 4, that further depends
+*>          on the job:
+*> 
+*>          1. If only the singular values are requested then:
+*>             If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) 
+*>             then the length of IWORK is N+M; otherwise the length of IWORK is N.
+*>          2. If the singular values and the right singular vectors are requested then:
+*>             If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) 
+*>             then the length of IWORK is N+M; otherwise the length of IWORK is N. 
+*>          3. If the singular values and the left singular vectors are requested then:
+*>             If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) 
+*>             then the length of IWORK is N+M; otherwise the length of IWORK is N. 
+*>          4. If the singular values with both the left and the right singular vectors
+*>             are requested, then:      
+*>             4.1. If LSAME(JOBV,'J') the length of IWORK is determined as follows:
+*>                  If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) 
+*>                  then the length of IWORK is N+M; otherwise the length of IWORK is N. 
+*>             4.2. If LSAME(JOBV,'V') the length of IWORK is determined as follows:
+*>                  If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) 
+*>                  then the length of IWORK is 2*N+M; otherwise the length of IWORK is 2*N.
+*>        
+*>          On exit,
+*>          IWORK(1) = the numerical rank determined after the initial
+*>                     QR factorization with pivoting. See the descriptions
+*>                     of JOBA and JOBR.
+*>          IWORK(2) = the number of the computed nonzero singular values
+*>          IWORK(3) = if nonzero, a warning message:
+*>                     If IWORK(3) = 1 then some of the column norms of A
+*>                     were denormalized floats. The requested high accuracy
+*>                     is not warranted by the data.
+*>          IWORK(4) = 1 or -1. If IWORK(4) = 1, then the procedure used A^* to
+*>                     do the job as specified by the JOB parameters.
+*>          If the call to CGEJSV is a workspace query (indicated by LWORK = -1 or
+*>          LRWORK = -1), then on exit IWORK(1) contains the required length of 
+*>          IWORK for the job parameters used in the call.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>           < 0:  if INFO = -i, then the i-th argument had an illegal value.
+*>           = 0:  successful exit;
+*>           > 0:  CGEJSV  did not converge in the maximal allowed number
+*>                 of sweeps. The computed values may be inaccurate.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup complexGEsing
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>  CGEJSV implements a preconditioned Jacobi SVD algorithm. It uses CGEQP3,
+*>  CGEQRF, and CGELQF as preprocessors and preconditioners. Optionally, an
+*>  additional row pivoting can be used as a preprocessor, which in some
+*>  cases results in much higher accuracy. An example is matrix A with the
+*>  structure A = D1 * C * D2, where D1, D2 are arbitrarily ill-conditioned
+*>  diagonal matrices and C is a well-conditioned matrix. In that case, complete
+*>  pivoting in the first QR factorizations provides accuracy dependent on the
+*>  condition number of C, and independent of D1, D2. Such higher accuracy is
+*>  not completely understood theoretically, but it works well in practice.
+*>  Further, if A can be written as A = B*D, with well-conditioned B and some
+*>  diagonal D, then the high accuracy is guaranteed, both theoretically and
+*>  in software, independent of D. For more details see [1], [2].
+*>     The computational range for the singular values can be the full range
+*>  ( UNDERFLOW,OVERFLOW ), provided that the machine arithmetic and the BLAS
+*>  & LAPACK routines called by CGEJSV are implemented to work in that range.
+*>  If that is not the case, then the restriction for safe computation with
+*>  the singular values in the range of normalized IEEE numbers is that the
+*>  spectral condition number kappa(A)=sigma_max(A)/sigma_min(A) does not
+*>  overflow. This code (CGEJSV) is best used in this restricted range,
+*>  meaning that singular values of magnitude below ||A||_2 / SLAMCH('O') are
+*>  returned as zeros. See JOBR for details on this.
+*>     Further, this implementation is somewhat slower than the one described
+*>  in [1,2] due to replacement of some non-LAPACK components, and because
+*>  the choice of some tuning parameters in the iterative part (CGESVJ) is
+*>  left to the implementer on a particular machine.
+*>     The rank revealing QR factorization (in this code: CGEQP3) should be
+*>  implemented as in [3]. We have a new version of CGEQP3 under development
+*>  that is more robust than the current one in LAPACK, with a cleaner cut in
+*>  rank deficient cases. It will be available in the SIGMA library [4].
+*>  If M is much larger than N, it is obvious that the initial QRF with
+*>  column pivoting can be preprocessed by the QRF without pivoting. That
+*>  well known trick is not used in CGEJSV because in some cases heavy row
+*>  weighting can be treated with complete pivoting. The overhead in cases
+*>  M much larger than N is then only due to pivoting, but the benefits in
+*>  terms of accuracy have prevailed. The implementer/user can incorporate
+*>  this extra QRF step easily. The implementer can also improve data movement
+*>  (matrix transpose, matrix copy, matrix transposed copy) - this
+*>  implementation of CGEJSV uses only the simplest, naive data movement.
+*> \endverbatim
+*
+*> \par Contributor:
+*  ==================
+*>
+*>  Zlatko Drmac (Zagreb, Croatia)
+*
+*> \par References:
+*  ================
+*>
+*> \verbatim
+*>
+*> [1] Z. Drmac and K. Veselic: New fast and accurate Jacobi SVD algorithm I.
+*>     SIAM J. Matrix Anal. Appl. Vol. 29, No. 4 (2008), pp. 1322-1342.
+*>     LAPACK Working note 169.
+*> [2] Z. Drmac and K. Veselic: New fast and accurate Jacobi SVD algorithm II.
+*>     SIAM J. Matrix Anal. Appl. Vol. 29, No. 4 (2008), pp. 1343-1362.
+*>     LAPACK Working note 170.
+*> [3] Z. Drmac and Z. Bujanovic: On the failure of rank-revealing QR
+*>     factorization software - a case study.
+*>     ACM Trans. Math. Softw. Vol. 35, No 2 (2008), pp. 1-28.
+*>     LAPACK Working note 176.
+*> [4] Z. Drmac: SIGMA - mathematical software library for accurate SVD, PSV,
+*>     QSVD, (H,K)-SVD computations.
+*>     Department of Mathematics, University of Zagreb, 2008, 2016.
+*> \endverbatim
+*
+*>  \par Bugs, examples and comments:
+*   =================================
+*>
+*>  Please report all bugs and send interesting examples and/or comments to
+*>  drmac@math.hr. Thank you.
+*>
+*  =====================================================================
+      SUBROUTINE CGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP,
+     $                   M, N, A, LDA, SVA, U, LDU, V, LDV,
+     $                   CWORK, LWORK, RWORK, LRWORK, IWORK, INFO )
+*
+*  -- LAPACK computational routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      IMPLICIT    NONE
+      INTEGER     INFO, LDA, LDU, LDV, LWORK, LRWORK, M, N
+*     ..
+*     .. Array Arguments ..
+      COMPLEX     A( LDA, * ), U( LDU, * ), V( LDV, * ), CWORK( LWORK )
+      REAL        SVA( N ), RWORK( LRWORK )
+      INTEGER     IWORK( * )
+      CHARACTER*1 JOBA, JOBP, JOBR, JOBT, JOBU, JOBV
+*     ..
+*
+*  ===========================================================================
+*
+*     .. Local Parameters ..
+      REAL        ZERO,         ONE
+      PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0 )
+      COMPLEX     CZERO,                    CONE
+      PARAMETER ( CZERO = ( 0.0E0, 0.0E0 ), CONE = ( 1.0E0, 0.0E0 ) )
+*     ..
+*     .. Local Scalars ..
+      COMPLEX CTEMP
+      REAL    AAPP,   AAQQ,   AATMAX, AATMIN, BIG,    BIG1,   COND_OK,
+     $        CONDR1, CONDR2, ENTRA,  ENTRAT, EPSLN,  MAXPRJ, SCALEM,
+     $        SCONDA, SFMIN,  SMALL,  TEMP1,  USCAL1, USCAL2, XSC
+      INTEGER IERR,   N1,     NR,     NUMRANK,        p, q,   WARNING
+      LOGICAL ALMORT, DEFR,   ERREST, GOSCAL,  JRACC,  KILL,   LQUERY,
+     $        LSVEC,  L2ABER, L2KILL, L2PERT,  L2RANK, L2TRAN, NOSCAL,
+     $        ROWPIV, RSVEC,  TRANSP
+*
+      INTEGER OPTWRK, MINWRK, MINRWRK, MINIWRK
+      INTEGER LWCON,  LWLQF, LWQP3, LWQRF, LWUNMLQ, LWUNMQR, LWUNMQRM,
+     $        LWSVDJ, LWSVDJV, LRWQP3, LRWCON, LRWSVDJ, IWOFF
+      INTEGER LWRK_CGELQF, LWRK_CGEQP3,  LWRK_CGEQP3N, LWRK_CGEQRF,  
+     $        LWRK_CGESVJ, LWRK_CGESVJV, LWRK_CGESVJU, LWRK_CUNMLQ, 
+     $        LWRK_CUNMQR, LWRK_CUNMQRM    
+*     ..
+*     .. Local Arrays
+      COMPLEX CDUMMY(1)
+      REAL    RDUMMY(1)
+*
+*     .. Intrinsic Functions ..
+      INTRINSIC ABS, CMPLX, CONJG, ALOG, MAX, MIN, REAL, NINT, SQRT
+*     ..
+*     .. External Functions ..
+      REAL      SLAMCH, SCNRM2
+      INTEGER   ISAMAX, ICAMAX
+      LOGICAL   LSAME
+      EXTERNAL  ISAMAX, ICAMAX, LSAME, SLAMCH, SCNRM2
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL  SLASSQ, CCOPY,  CGELQF, CGEQP3, CGEQRF, CLACPY, CLAPMR,
+     $          CLASCL, SLASCL, CLASET, CLASSQ, CLASWP, CUNGQR, CUNMLQ,
+     $          CUNMQR, CPOCON, SSCAL,  CSSCAL, CSWAP,  CTRSM,  CLACGV,
+     $          XERBLA
+*
+      EXTERNAL  CGESVJ
+*     ..
+*
+*     Test the input arguments
+*
+      LSVEC  = LSAME( JOBU, 'U' ) .OR. LSAME( JOBU, 'F' )
+      JRACC  = LSAME( JOBV, 'J' )
+      RSVEC  = LSAME( JOBV, 'V' ) .OR. JRACC
+      ROWPIV = LSAME( JOBA, 'F' ) .OR. LSAME( JOBA, 'G' )
+      L2RANK = LSAME( JOBA, 'R' )
+      L2ABER = LSAME( JOBA, 'A' )
+      ERREST = LSAME( JOBA, 'E' ) .OR. LSAME( JOBA, 'G' )
+      L2TRAN = LSAME( JOBT, 'T' ) .AND. ( M .EQ. N )
+      L2KILL = LSAME( JOBR, 'R' )
+      DEFR   = LSAME( JOBR, 'N' )
+      L2PERT = LSAME( JOBP, 'P' )
+*
+      LQUERY = ( LWORK .EQ. -1 ) .OR. ( LRWORK .EQ. -1 )
+*
+      IF ( .NOT.(ROWPIV .OR. L2RANK .OR. L2ABER .OR.
+     $     ERREST .OR. LSAME( JOBA, 'C' ) )) THEN
+         INFO = - 1
+      ELSE IF ( .NOT.( LSVEC .OR. LSAME( JOBU, 'N' ) .OR.
+     $   ( LSAME( JOBU, 'W' ) .AND. RSVEC .AND. L2TRAN ) ) ) THEN
+         INFO = - 2
+      ELSE IF ( .NOT.( RSVEC .OR. LSAME( JOBV, 'N' ) .OR.
+     $   ( LSAME( JOBV, 'W' ) .AND. LSVEC .AND. L2TRAN ) ) ) THEN
+         INFO = - 3
+      ELSE IF ( .NOT. ( L2KILL .OR. DEFR ) )    THEN
+         INFO = - 4
+      ELSE IF ( .NOT. ( LSAME(JOBT,'T') .OR. LSAME(JOBT,'N') ) ) THEN
+         INFO = - 5
+      ELSE IF ( .NOT. ( L2PERT .OR. LSAME( JOBP, 'N' ) ) ) THEN
+         INFO = - 6
+      ELSE IF ( M .LT. 0 ) THEN
+         INFO = - 7
+      ELSE IF ( ( N .LT. 0 ) .OR. ( N .GT. M ) ) THEN
+         INFO = - 8
+      ELSE IF ( LDA .LT. M ) THEN
+         INFO = - 10
+      ELSE IF ( LSVEC .AND. ( LDU .LT. M ) ) THEN
+         INFO = - 13
+      ELSE IF ( RSVEC .AND. ( LDV .LT. N ) ) THEN
+         INFO = - 15
+      ELSE
+*        #:)
+         INFO = 0
+      END IF
+*
+      IF ( INFO .EQ. 0 ) THEN 
+*         .. compute the minimal and the optimal workspace lengths 
+*         [[The expressions for computing the minimal and the optimal
+*         values of LCWORK, LRWORK are written with a lot of redundancy and
+*         can be simplified. However, this verbose form is useful for
+*         maintenance and modifications of the code.]]
+*
+*        .. minimal workspace length for CGEQP3 of an M x N matrix,
+*         CGEQRF of an N x N matrix, CGELQF of an N x N matrix,
+*         CUNMLQ for computing N x N matrix, CUNMQR for computing N x N
+*         matrix, CUNMQR for computing M x N matrix, respectively.
+          LWQP3 = N+1   
+          LWQRF = MAX( 1, N )
+          LWLQF = MAX( 1, N )
+          LWUNMLQ  = MAX( 1, N )
+          LWUNMQR  = MAX( 1, N )
+          LWUNMQRM = MAX( 1, M )
+*        .. minimal workspace length for CPOCON of an N x N matrix
+          LWCON = 2 * N 
+*        .. minimal workspace length for CGESVJ of an N x N matrix,
+*         without and with explicit accumulation of Jacobi rotations
+          LWSVDJ  = MAX( 2 * N, 1 )         
+          LWSVDJV = MAX( 2 * N, 1 )
+*         .. minimal REAL workspace length for CGEQP3, CPOCON, CGESVJ
+          LRWQP3  = 2 * N 
+          LRWCON  = N 
+          LRWSVDJ = N 
+          IF ( LQUERY ) THEN 
+              CALL CGEQP3( M, N, A, LDA, IWORK, CDUMMY, CDUMMY, -1, 
+     $             RDUMMY, IERR )
+              LWRK_CGEQP3 = INT( CDUMMY(1) )
+              CALL CGEQRF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR )
+              LWRK_CGEQRF = INT( CDUMMY(1) )
+              CALL CGELQF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR )
+              LWRK_CGELQF = INT( CDUMMY(1) )
+          END IF
+          MINWRK  = 2
+          OPTWRK  = 2
+          MINIWRK = N 
+          IF ( .NOT. (LSVEC .OR. RSVEC ) ) THEN
+*             .. minimal and optimal sizes of the complex workspace if
+*             only the singular values are requested
+              IF ( ERREST ) THEN 
+                  MINWRK = MAX( N+LWQP3, N**2+LWCON, N+LWQRF, LWSVDJ )
+              ELSE
+                  MINWRK = MAX( N+LWQP3, N+LWQRF, LWSVDJ )
+              END IF
+              IF ( LQUERY ) THEN 
+                  CALL CGESVJ( 'L', 'N', 'N', N, N, A, LDA, SVA, N, V, 
+     $                 LDV, CDUMMY, -1, RDUMMY, -1, IERR )
+                  LWRK_CGESVJ = INT( CDUMMY(1) )
+                  IF ( ERREST ) THEN 
+                      OPTWRK = MAX( N+LWRK_CGEQP3, N**2+LWCON, 
+     $                              N+LWRK_CGEQRF, LWRK_CGESVJ )
+                  ELSE
+                      OPTWRK = MAX( N+LWRK_CGEQP3, N+LWRK_CGEQRF, 
+     $                              LWRK_CGESVJ )
+                  END IF
+              END IF
+              IF ( L2TRAN .OR. ROWPIV ) THEN 
+                  IF ( ERREST ) THEN 
+                     MINRWRK = MAX( 7, 2*M,  LRWQP3, LRWCON, LRWSVDJ )
+                  ELSE
+                     MINRWRK = MAX( 7, 2*M,  LRWQP3, LRWSVDJ )
+                  END IF                 
+              ELSE
+                  IF ( ERREST ) THEN 
+                     MINRWRK = MAX( 7, LRWQP3, LRWCON, LRWSVDJ )
+                  ELSE
+                     MINRWRK = MAX( 7, LRWQP3, LRWSVDJ )
+                  END IF
+              END IF   
+              IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M 
+          ELSE IF ( RSVEC .AND. (.NOT.LSVEC) ) THEN
+*            .. minimal and optimal sizes of the complex workspace if the
+*            singular values and the right singular vectors are requested
+             IF ( ERREST ) THEN 
+                 MINWRK = MAX( N+LWQP3, LWCON, LWSVDJ, N+LWLQF,  
+     $                         2*N+LWQRF, N+LWSVDJ, N+LWUNMLQ )
+             ELSE
+                 MINWRK = MAX( N+LWQP3, LWSVDJ, N+LWLQF, 2*N+LWQRF, 
+     $                         N+LWSVDJ, N+LWUNMLQ )
+             END IF
+             IF ( LQUERY ) THEN
+                 CALL CGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A,
+     $                LDA, CDUMMY, -1, RDUMMY, -1, IERR )
+                 LWRK_CGESVJ = INT( CDUMMY(1) )
+                 CALL CUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY,
+     $                V, LDV, CDUMMY, -1, IERR )
+                 LWRK_CUNMLQ = INT( CDUMMY(1) )
+                 IF ( ERREST ) THEN 
+                 OPTWRK = MAX( N+LWRK_CGEQP3, LWCON, LWRK_CGESVJ, 
+     $                         N+LWRK_CGELQF, 2*N+LWRK_CGEQRF,
+     $                         N+LWRK_CGESVJ,  N+LWRK_CUNMLQ )
+                 ELSE
+                 OPTWRK = MAX( N+LWRK_CGEQP3, LWRK_CGESVJ,N+LWRK_CGELQF,
+     $                         2*N+LWRK_CGEQRF, N+LWRK_CGESVJ, 
+     $                         N+LWRK_CUNMLQ )
+                 END IF
+             END IF
+             IF ( L2TRAN .OR. ROWPIV ) THEN 
+                  IF ( ERREST ) THEN 
+                     MINRWRK = MAX( 7, 2*M,  LRWQP3, LRWSVDJ, LRWCON )
+                  ELSE
+                     MINRWRK = MAX( 7, 2*M,  LRWQP3, LRWSVDJ ) 
+                  END IF                  
+             ELSE
+                  IF ( ERREST ) THEN 
+                     MINRWRK = MAX( 7, LRWQP3, LRWSVDJ, LRWCON )
+                  ELSE
+                     MINRWRK = MAX( 7, LRWQP3, LRWSVDJ ) 
+                  END IF                 
+             END IF
+             IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M
+          ELSE IF ( LSVEC .AND. (.NOT.RSVEC) ) THEN  
+*            .. minimal and optimal sizes of the complex workspace if the
+*            singular values and the left singular vectors are requested
+             IF ( ERREST ) THEN
+                 MINWRK = N + MAX( LWQP3,LWCON,N+LWQRF,LWSVDJ,LWUNMQRM )
+             ELSE
+                 MINWRK = N + MAX( LWQP3, N+LWQRF, LWSVDJ, LWUNMQRM )
+             END IF
+             IF ( LQUERY ) THEN
+                 CALL CGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A,
+     $                LDA, CDUMMY, -1, RDUMMY, -1, IERR )
+                 LWRK_CGESVJ = INT( CDUMMY(1) )
+                 CALL CUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U,
+     $               LDU, CDUMMY, -1, IERR )
+                 LWRK_CUNMQRM = INT( CDUMMY(1) )
+                 IF ( ERREST ) THEN
+                 OPTWRK = N + MAX( LWRK_CGEQP3, LWCON, N+LWRK_CGEQRF,
+     $                             LWRK_CGESVJ, LWRK_CUNMQRM )
+                 ELSE
+                 OPTWRK = N + MAX( LWRK_CGEQP3, N+LWRK_CGEQRF,
+     $                             LWRK_CGESVJ, LWRK_CUNMQRM )
+                 END IF
+             END IF
+             IF ( L2TRAN .OR. ROWPIV ) THEN 
+                 IF ( ERREST ) THEN 
+                    MINRWRK = MAX( 7, 2*M,  LRWQP3, LRWSVDJ, LRWCON )
+                 ELSE
+                    MINRWRK = MAX( 7, 2*M,  LRWQP3, LRWSVDJ )
+                 END IF                 
+             ELSE
+                 IF ( ERREST ) THEN 
+                    MINRWRK = MAX( 7, LRWQP3, LRWSVDJ, LRWCON )
+                 ELSE
+                    MINRWRK = MAX( 7, LRWQP3, LRWSVDJ )
+                 END IF                
+             END IF 
+             IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M
+          ELSE
+*            .. minimal and optimal sizes of the complex workspace if the
+*            full SVD is requested
+             IF ( .NOT. JRACC ) THEN                
+                 IF ( ERREST ) THEN 
+                    MINWRK = MAX( N+LWQP3, N+LWCON,  2*N+N**2+LWCON, 
+     $                         2*N+LWQRF,         2*N+LWQP3, 
+     $                         2*N+N**2+N+LWLQF,  2*N+N**2+N+N**2+LWCON,
+     $                         2*N+N**2+N+LWSVDJ, 2*N+N**2+N+LWSVDJV, 
+     $                         2*N+N**2+N+LWUNMQR,2*N+N**2+N+LWUNMLQ, 
+     $                         N+N**2+LWSVDJ,   N+LWUNMQRM )
+                 ELSE
+                    MINWRK = MAX( N+LWQP3,        2*N+N**2+LWCON, 
+     $                         2*N+LWQRF,         2*N+LWQP3, 
+     $                         2*N+N**2+N+LWLQF,  2*N+N**2+N+N**2+LWCON,
+     $                         2*N+N**2+N+LWSVDJ, 2*N+N**2+N+LWSVDJV,
+     $                         2*N+N**2+N+LWUNMQR,2*N+N**2+N+LWUNMLQ,
+     $                         N+N**2+LWSVDJ,      N+LWUNMQRM ) 
+                 END IF 
+                 MINIWRK = MINIWRK + N 
+                 IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M
+             ELSE
+                 IF ( ERREST ) THEN 
+                    MINWRK = MAX( N+LWQP3, N+LWCON, 2*N+LWQRF, 
+     $                         2*N+N**2+LWSVDJV, 2*N+N**2+N+LWUNMQR, 
+     $                         N+LWUNMQRM )
+                 ELSE
+                    MINWRK = MAX( N+LWQP3, 2*N+LWQRF, 
+     $                         2*N+N**2+LWSVDJV, 2*N+N**2+N+LWUNMQR, 
+     $                         N+LWUNMQRM ) 
+                 END IF   
+                 IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M
+             END IF
+             IF ( LQUERY ) THEN
+                 CALL CUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U,
+     $                LDU, CDUMMY, -1, IERR )
+                 LWRK_CUNMQRM = INT( CDUMMY(1) )
+                 CALL CUNMQR( 'L', 'N', N, N, N, A, LDA, CDUMMY, U,
+     $                LDU, CDUMMY, -1, IERR )
+                 LWRK_CUNMQR = INT( CDUMMY(1) )
+                 IF ( .NOT. JRACC ) THEN
+                     CALL CGEQP3( N,N, A, LDA, IWORK, CDUMMY,CDUMMY, -1,
+     $                    RDUMMY, IERR )
+                     LWRK_CGEQP3N = INT( CDUMMY(1) )
+                     CALL CGESVJ( 'L', 'U', 'N', N, N, U, LDU, SVA,
+     $                    N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR )
+                     LWRK_CGESVJ = INT( CDUMMY(1) )
+                     CALL CGESVJ( 'U', 'U', 'N', N, N, U, LDU, SVA,
+     $                    N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR )
+                     LWRK_CGESVJU = INT( CDUMMY(1) )
+                     CALL CGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA,
+     $                    N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR )
+                     LWRK_CGESVJV = INT( CDUMMY(1) )
+                     CALL CUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY,
+     $                    V, LDV, CDUMMY, -1, IERR )
+                     LWRK_CUNMLQ = INT( CDUMMY(1) )
+                     IF ( ERREST ) THEN 
+                       OPTWRK = MAX( N+LWRK_CGEQP3, N+LWCON, 
+     $                          2*N+N**2+LWCON, 2*N+LWRK_CGEQRF, 
+     $                          2*N+LWRK_CGEQP3N, 
+     $                          2*N+N**2+N+LWRK_CGELQF,  
+     $                          2*N+N**2+N+N**2+LWCON,
+     $                          2*N+N**2+N+LWRK_CGESVJ, 
+     $                          2*N+N**2+N+LWRK_CGESVJV,               
+     $                          2*N+N**2+N+LWRK_CUNMQR,
+     $                          2*N+N**2+N+LWRK_CUNMLQ, 
+     $                          N+N**2+LWRK_CGESVJU,                  
+     $                          N+LWRK_CUNMQRM )
+                     ELSE
+                       OPTWRK = MAX( N+LWRK_CGEQP3,  
+     $                          2*N+N**2+LWCON, 2*N+LWRK_CGEQRF, 
+     $                          2*N+LWRK_CGEQP3N, 
+     $                          2*N+N**2+N+LWRK_CGELQF,  
+     $                          2*N+N**2+N+N**2+LWCON,
+     $                          2*N+N**2+N+LWRK_CGESVJ,               
+     $                          2*N+N**2+N+LWRK_CGESVJV, 
+     $                          2*N+N**2+N+LWRK_CUNMQR,
+     $                          2*N+N**2+N+LWRK_CUNMLQ, 
+     $                          N+N**2+LWRK_CGESVJU,
+     $                          N+LWRK_CUNMQRM )
+                     END IF                    
+                 ELSE
+                     CALL CGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA,
+     $                    N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR )
+                     LWRK_CGESVJV = INT( CDUMMY(1) )
+                     CALL CUNMQR( 'L', 'N', N, N, N, CDUMMY, N, CDUMMY,
+     $                    V, LDV, CDUMMY, -1, IERR )
+                     LWRK_CUNMQR = INT( CDUMMY(1) )
+                     CALL CUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U,
+     $                    LDU, CDUMMY, -1, IERR )
+                     LWRK_CUNMQRM = INT( CDUMMY(1) )
+                     IF ( ERREST ) THEN 
+                        OPTWRK = MAX( N+LWRK_CGEQP3, N+LWCON,   
+     $                           2*N+LWRK_CGEQRF, 2*N+N**2,  
+     $                           2*N+N**2+LWRK_CGESVJV,  
+     $                           2*N+N**2+N+LWRK_CUNMQR,N+LWRK_CUNMQRM )
+                     ELSE
+                        OPTWRK = MAX( N+LWRK_CGEQP3, 2*N+LWRK_CGEQRF,  
+     $                           2*N+N**2, 2*N+N**2+LWRK_CGESVJV, 
+     $                           2*N+N**2+N+LWRK_CUNMQR, 
+     $                           N+LWRK_CUNMQRM )   
+                     END IF                  
+                 END IF               
+             END IF 
+             IF ( L2TRAN .OR. ROWPIV ) THEN 
+                 MINRWRK = MAX( 7, 2*M,  LRWQP3, LRWSVDJ, LRWCON )
+             ELSE
+                 MINRWRK = MAX( 7, LRWQP3, LRWSVDJ, LRWCON )
+             END IF 
+          END IF
+          MINWRK = MAX( 2, MINWRK )
+          OPTWRK = MAX( OPTWRK, MINWRK )
+          IF ( LWORK  .LT. MINWRK  .AND. (.NOT.LQUERY) ) INFO = - 17
+          IF ( LRWORK .LT. MINRWRK .AND. (.NOT.LQUERY) ) INFO = - 19   
+      END IF
+*      
+      IF ( INFO .NE. 0 ) THEN
+*       #:(
+         CALL XERBLA( 'CGEJSV', - INFO )
+         RETURN
+      ELSE IF ( LQUERY ) THEN
+          CWORK(1) = OPTWRK
+          CWORK(2) = MINWRK
+          RWORK(1) = MINRWRK
+          IWORK(1) = MAX( 4, MINIWRK )
+          RETURN   
+      END IF
+*
+*     Quick return for void matrix (Y3K safe)
+* #:)
+      IF ( ( M .EQ. 0 ) .OR. ( N .EQ. 0 ) ) THEN
+         IWORK(1:4) = 0
+         RWORK(1:7) = 0
+         RETURN
+      ENDIF
+*
+*     Determine whether the matrix U should be M x N or M x M
+*
+      IF ( LSVEC ) THEN
+         N1 = N
+         IF ( LSAME( JOBU, 'F' ) ) N1 = M
+      END IF
+*
+*     Set numerical parameters
+*
+*!    NOTE: Make sure SLAMCH() does not fail on the target architecture.
+*
+      EPSLN = SLAMCH('Epsilon')
+      SFMIN = SLAMCH('SafeMinimum')
+      SMALL = SFMIN / EPSLN
+      BIG   = SLAMCH('O')
+*     BIG   = ONE / SFMIN
+*
+*     Initialize SVA(1:N) = diag( ||A e_i||_2 )_1^N
+*
+*(!)  If necessary, scale SVA() to protect the largest norm from
+*     overflow. It is possible that this scaling pushes the smallest
+*     column norm below the underflow threshold (extreme case).
+*
+      SCALEM  = ONE / SQRT(REAL(M)*REAL(N))
+      NOSCAL  = .TRUE.
+      GOSCAL  = .TRUE.
+      DO 1874 p = 1, N
+         AAPP = ZERO
+         AAQQ = ONE
+         CALL CLASSQ( M, A(1,p), 1, AAPP, AAQQ )
+         IF ( AAPP .GT. BIG ) THEN
+            INFO = - 9
+            CALL XERBLA( 'CGEJSV', -INFO )
+            RETURN
+         END IF
+         AAQQ = SQRT(AAQQ)
+         IF ( ( AAPP .LT. (BIG / AAQQ) ) .AND. NOSCAL  ) THEN
+            SVA(p)  = AAPP * AAQQ
+         ELSE
+            NOSCAL  = .FALSE.
+            SVA(p)  = AAPP * ( AAQQ * SCALEM )
+            IF ( GOSCAL ) THEN
+               GOSCAL = .FALSE.
+               CALL SSCAL( p-1, SCALEM, SVA, 1 )
+            END IF
+         END IF
+ 1874 CONTINUE
+*
+      IF ( NOSCAL ) SCALEM = ONE
+*
+      AAPP = ZERO
+      AAQQ = BIG
+      DO 4781 p = 1, N
+         AAPP = MAX( AAPP, SVA(p) )
+         IF ( SVA(p) .NE. ZERO ) AAQQ = MIN( AAQQ, SVA(p) )
+ 4781 CONTINUE
+*
+*     Quick return for zero M x N matrix
+* #:)
+      IF ( AAPP .EQ. ZERO ) THEN
+         IF ( LSVEC ) CALL CLASET( 'G', M, N1, CZERO, CONE, U, LDU )
+         IF ( RSVEC ) CALL CLASET( 'G', N, N,  CZERO, CONE, V, LDV )
+         RWORK(1) = ONE
+         RWORK(2) = ONE
+         IF ( ERREST ) RWORK(3) = ONE
+         IF ( LSVEC .AND. RSVEC ) THEN
+            RWORK(4) = ONE
+            RWORK(5) = ONE
+         END IF
+         IF ( L2TRAN ) THEN
+            RWORK(6) = ZERO
+            RWORK(7) = ZERO
+         END IF
+         IWORK(1) = 0
+         IWORK(2) = 0
+         IWORK(3) = 0
+         IWORK(4) = -1
+         RETURN
+      END IF
+*
+*     Issue warning if denormalized column norms detected. Override the
+*     high relative accuracy request. Issue licence to kill nonzero columns
+*     (set them to zero) whose norm is less than sigma_max / BIG (roughly).
+* #:(
+      WARNING = 0
+      IF ( AAQQ .LE. SFMIN ) THEN
+         L2RANK = .TRUE.
+         L2KILL = .TRUE.
+         WARNING = 1
+      END IF
+*
+*     Quick return for one-column matrix
+* #:)
+      IF ( N .EQ. 1 ) THEN
+*
+         IF ( LSVEC ) THEN
+            CALL CLASCL( 'G',0,0,SVA(1),SCALEM, M,1,A(1,1),LDA,IERR )
+            CALL CLACPY( 'A', M, 1, A, LDA, U, LDU )
+*           computing all M left singular vectors of the M x 1 matrix
+            IF ( N1 .NE. N  ) THEN
+              CALL CGEQRF( M, N, U,LDU, CWORK, CWORK(N+1),LWORK-N,IERR )
+              CALL CUNGQR( M,N1,1, U,LDU,CWORK,CWORK(N+1),LWORK-N,IERR )
+              CALL CCOPY( M, A(1,1), 1, U(1,1), 1 )
+            END IF
+         END IF
+         IF ( RSVEC ) THEN
+             V(1,1) = CONE
+         END IF
+         IF ( SVA(1) .LT. (BIG*SCALEM) ) THEN
+            SVA(1)  = SVA(1) / SCALEM
+            SCALEM  = ONE
+         END IF
+         RWORK(1) = ONE / SCALEM
+         RWORK(2) = ONE
+         IF ( SVA(1) .NE. ZERO ) THEN
+            IWORK(1) = 1
+            IF ( ( SVA(1) / SCALEM) .GE. SFMIN ) THEN
+               IWORK(2) = 1
+            ELSE
+               IWORK(2) = 0
+            END IF
+         ELSE
+            IWORK(1) = 0
+            IWORK(2) = 0
+         END IF
+         IWORK(3) = 0
+         IWORK(4) = -1
+         IF ( ERREST ) RWORK(3) = ONE
+         IF ( LSVEC .AND. RSVEC ) THEN
+            RWORK(4) = ONE
+            RWORK(5) = ONE
+         END IF
+         IF ( L2TRAN ) THEN
+            RWORK(6) = ZERO
+            RWORK(7) = ZERO
+         END IF
+         RETURN
+*
+      END IF
+*
+      TRANSP = .FALSE.
+*
+      AATMAX = -ONE
+      AATMIN =  BIG
+      IF ( ROWPIV .OR. L2TRAN ) THEN
+*
+*     Compute the row norms, needed to determine row pivoting sequence
+*     (in the case of heavily row weighted A, row pivoting is strongly
+*     advised) and to collect information needed to compare the
+*     structures of A * A^* and A^* * A (in the case L2TRAN.EQ..TRUE.).
+*
+         IF ( L2TRAN ) THEN
+            DO 1950 p = 1, M
+               XSC   = ZERO
+               TEMP1 = ONE
+               CALL CLASSQ( N, A(p,1), LDA, XSC, TEMP1 )
+*              CLASSQ gets both the ell_2 and the ell_infinity norm
+*              in one pass through the vector
+               RWORK(M+p)  = XSC * SCALEM
+               RWORK(p)    = XSC * (SCALEM*SQRT(TEMP1))
+               AATMAX = MAX( AATMAX, RWORK(p) )
+               IF (RWORK(p) .NE. ZERO) 
+     $            AATMIN = MIN(AATMIN,RWORK(p))
+ 1950       CONTINUE
+         ELSE
+            DO 1904 p = 1, M
+               RWORK(M+p) = SCALEM*ABS( A(p,ICAMAX(N,A(p,1),LDA)) )
+               AATMAX = MAX( AATMAX, RWORK(M+p) )
+               AATMIN = MIN( AATMIN, RWORK(M+p) )
+ 1904       CONTINUE
+         END IF
+*
+      END IF
+*
+*     For square matrix A try to determine whether A^*  would be better
+*     input for the preconditioned Jacobi SVD, with faster convergence.
+*     The decision is based on an O(N) function of the vector of column
+*     and row norms of A, based on the Shannon entropy. This should give
+*     the right choice in most cases when the difference actually matters.
+*     It may fail and pick the slower converging side.
+*
+      ENTRA  = ZERO
+      ENTRAT = ZERO
+      IF ( L2TRAN ) THEN
+*
+         XSC   = ZERO
+         TEMP1 = ONE
+         CALL SLASSQ( N, SVA, 1, XSC, TEMP1 )
+         TEMP1 = ONE / TEMP1
+*
+         ENTRA = ZERO
+         DO 1113 p = 1, N
+            BIG1  = ( ( SVA(p) / XSC )**2 ) * TEMP1
+            IF ( BIG1 .NE. ZERO ) ENTRA = ENTRA + BIG1 * ALOG(BIG1)
+ 1113    CONTINUE
+         ENTRA = - ENTRA / ALOG(REAL(N))
+*
+*        Now, SVA().^2/Trace(A^* * A) is a point in the probability simplex.
+*        It is derived from the diagonal of  A^* * A.  Do the same with the
+*        diagonal of A * A^*, compute the entropy of the corresponding
+*        probability distribution. Note that A * A^* and A^* * A have the
+*        same trace.
+*
+         ENTRAT = ZERO
+         DO 1114 p = 1, M
+            BIG1 = ( ( RWORK(p) / XSC )**2 ) * TEMP1
+            IF ( BIG1 .NE. ZERO ) ENTRAT = ENTRAT + BIG1 * ALOG(BIG1)
+ 1114    CONTINUE
+         ENTRAT = - ENTRAT / ALOG(REAL(M))
+*
+*        Analyze the entropies and decide A or A^*. Smaller entropy
+*        usually means better input for the algorithm.
+*
+         TRANSP = ( ENTRAT .LT. ENTRA )
+* 
+*        If A^* is better than A, take the adjoint of A. This is allowed
+*        only for square matrices, M=N.  
+         IF ( TRANSP ) THEN
+*           In an optimal implementation, this trivial transpose
+*           should be replaced with faster transpose.
+            DO 1115 p = 1, N - 1
+               A(p,p) = CONJG(A(p,p))
+               DO 1116 q = p + 1, N
+                   CTEMP = CONJG(A(q,p))
+                  A(q,p) = CONJG(A(p,q))
+                  A(p,q) = CTEMP
+ 1116          CONTINUE
+ 1115       CONTINUE
+            A(N,N) = CONJG(A(N,N))
+            DO 1117 p = 1, N
+               RWORK(M+p) = SVA(p)
+               SVA(p) = RWORK(p)
+*              previously computed row 2-norms are now column 2-norms
+*              of the transposed matrix
+ 1117       CONTINUE
+            TEMP1  = AAPP
+            AAPP   = AATMAX
+            AATMAX = TEMP1
+            TEMP1  = AAQQ
+            AAQQ   = AATMIN
+            AATMIN = TEMP1
+            KILL   = LSVEC
+            LSVEC  = RSVEC
+            RSVEC  = KILL
+            IF ( LSVEC ) N1 = N
+*
+            ROWPIV = .TRUE.
+         END IF
+*
+      END IF
+*     END IF L2TRAN
+*
+*     Scale the matrix so that its maximal singular value remains less
+*     than SQRT(BIG) -- the matrix is scaled so that its maximal column
+*     has Euclidean norm equal to SQRT(BIG/N). The only reason to keep
+*     SQRT(BIG) instead of BIG is the fact that CGEJSV uses LAPACK and
+*     BLAS routines that, in some implementations, are not capable of
+*     working in the full interval [SFMIN,BIG] and that they may provoke
+*     overflows in the intermediate results. If the singular values spread
+*     from SFMIN to BIG, then CGESVJ will compute them. So, in that case,
+*     one should use CGESVJ instead of CGEJSV.
+      BIG1   = SQRT( BIG )
+      TEMP1  = SQRT( BIG / REAL(N) )
+*     >> for future updates: allow bigger range, i.e. the largest column
+*     will be allowed up to BIG/N and CGESVJ will do the rest. However, for
+*     this all other (LAPACK) components must allow such a range.      
+*     TEMP1  = BIG/REAL(N)
+*     TEMP1  = BIG * EPSLN  this should 'almost' work with current LAPACK components
+      CALL SLASCL( 'G', 0, 0, AAPP, TEMP1, N, 1, SVA, N, IERR )
+      IF ( AAQQ .GT. (AAPP * SFMIN) ) THEN
+          AAQQ = ( AAQQ / AAPP ) * TEMP1
+      ELSE
+          AAQQ = ( AAQQ * TEMP1 ) / AAPP
+      END IF
+      TEMP1 = TEMP1 * SCALEM
+      CALL CLASCL( 'G', 0, 0, AAPP, TEMP1, M, N, A, LDA, IERR )
+*
+*     To undo scaling at the end of this procedure, multiply the
+*     computed singular values with USCAL2 / USCAL1.
+*
+      USCAL1 = TEMP1
+      USCAL2 = AAPP
+*
+      IF ( L2KILL ) THEN
+*        L2KILL enforces computation of nonzero singular values in
+*        the restricted range of condition number of the initial A,
+*        sigma_max(A) / sigma_min(A) approx. SQRT(BIG)/SQRT(SFMIN).
+         XSC = SQRT( SFMIN )
+      ELSE
+         XSC = SMALL
+*
+*        Now, if the condition number of A is too big,
+*        sigma_max(A) / sigma_min(A) .GT. SQRT(BIG/N) * EPSLN / SFMIN,
+*        as a precautionary measure, the full SVD is computed using CGESVJ
+*        with accumulated Jacobi rotations. This provides numerically
+*        more robust computation, at the cost of slightly increased run
+*        time. Depending on the concrete implementation of BLAS and LAPACK
+*        (i.e. how they behave in presence of extreme ill-conditioning) the
+*        implementor may decide to remove this switch.
+         IF ( ( AAQQ.LT.SQRT(SFMIN) ) .AND. LSVEC .AND. RSVEC ) THEN
+            JRACC = .TRUE.
+         END IF
+*
+      END IF
+      IF ( AAQQ .LT. XSC ) THEN
+         DO 700 p = 1, N
+            IF ( SVA(p) .LT. XSC ) THEN
+               CALL CLASET( 'A', M, 1, CZERO, CZERO, A(1,p), LDA )
+               SVA(p) = ZERO
+            END IF
+ 700     CONTINUE
+      END IF
+*
+*     Preconditioning using QR factorization with pivoting
+*
+      IF ( ROWPIV ) THEN
+*        Optional row permutation (Bjoerck row pivoting):
+*        A result by Cox and Higham shows that the Bjoerck's
+*        row pivoting combined with standard column pivoting
+*        has similar effect as Powell-Reid complete pivoting.
+*        The ell-infinity norms of A are made nonincreasing.
+         IF ( ( LSVEC .AND. RSVEC ) .AND. .NOT.( JRACC ) ) THEN 
+              IWOFF = 2*N
+         ELSE
+              IWOFF = N
+         END IF
+         DO 1952 p = 1, M - 1
+            q = ISAMAX( M-p+1, RWORK(M+p), 1 ) + p - 1
+            IWORK(IWOFF+p) = q
+            IF ( p .NE. q ) THEN
+               TEMP1      = RWORK(M+p)
+               RWORK(M+p) = RWORK(M+q)
+               RWORK(M+q) = TEMP1
+            END IF
+ 1952    CONTINUE
+         CALL CLASWP( N, A, LDA, 1, M-1, IWORK(IWOFF+1), 1 )
+      END IF
+*
+*     End of the preparation phase (scaling, optional sorting and
+*     transposing, optional flushing of small columns).
+*
+*     Preconditioning
+*
+*     If the full SVD is needed, the right singular vectors are computed
+*     from a matrix equation, and for that we need theoretical analysis
+*     of the Businger-Golub pivoting. So we use CGEQP3 as the first RR QRF.
+*     In all other cases the first RR QRF can be chosen by other criteria
+*     (e.g. speed by replacing global with restricted window pivoting, such
+*     as in xGEQPX from TOMS # 782). Good results will be obtained using
+*     xGEQPX with properly (!) chosen numerical parameters.
+*     Any improvement of CGEQP3 improves overall performance of CGEJSV.
+*
+*     A * P1 = Q1 * [ R1^* 0]^*:
+      DO 1963 p = 1, N
+*        .. all columns are free columns
+         IWORK(p) = 0
+ 1963 CONTINUE
+      CALL CGEQP3( M, N, A, LDA, IWORK, CWORK, CWORK(N+1), LWORK-N,
+     $             RWORK, IERR )
+*
+*     The upper triangular matrix R1 from the first QRF is inspected for
+*     rank deficiency and possibilities for deflation, or possible
+*     ill-conditioning. Depending on the user specified flag L2RANK,
+*     the procedure explores possibilities to reduce the numerical
+*     rank by inspecting the computed upper triangular factor. If
+*     L2RANK or L2ABER are up, then CGEJSV will compute the SVD of
+*     A + dA, where ||dA|| <= f(M,N)*EPSLN.
+*
+      NR = 1
+      IF ( L2ABER ) THEN
+*        Standard absolute error bound suffices. All sigma_i with
+*        sigma_i < N*EPSLN*||A|| are flushed to zero. This is an
+*        aggressive enforcement of lower numerical rank by introducing a
+*        backward error of the order of N*EPSLN*||A||.
+         TEMP1 = SQRT(REAL(N))*EPSLN
+         DO 3001 p = 2, N
+            IF ( ABS(A(p,p)) .GE. (TEMP1*ABS(A(1,1))) ) THEN
+               NR = NR + 1
+            ELSE
+               GO TO 3002
+            END IF
+ 3001    CONTINUE
+ 3002    CONTINUE
+      ELSE IF ( L2RANK ) THEN
+*        .. similarly as above, only slightly more gentle (less aggressive).
+*        Sudden drop on the diagonal of R1 is used as the criterion for
+*        close-to-rank-deficient.
+         TEMP1 = SQRT(SFMIN)
+         DO 3401 p = 2, N
+            IF ( ( ABS(A(p,p)) .LT. (EPSLN*ABS(A(p-1,p-1))) ) .OR.
+     $           ( ABS(A(p,p)) .LT. SMALL ) .OR.
+     $           ( L2KILL .AND. (ABS(A(p,p)) .LT. TEMP1) ) ) GO TO 3402
+            NR = NR + 1
+ 3401    CONTINUE
+ 3402    CONTINUE
+*
+      ELSE
+*        The goal is high relative accuracy. However, if the matrix
+*        has high scaled condition number the relative accuracy is in
+*        general not feasible. Later on, a condition number estimator
+*        will be deployed to estimate the scaled condition number.
+*        Here we just remove the underflowed part of the triangular
+*        factor. This prevents the situation in which the code is
+*        working hard to get the accuracy not warranted by the data.
+         TEMP1  = SQRT(SFMIN)
+         DO 3301 p = 2, N
+            IF ( ( ABS(A(p,p)) .LT. SMALL ) .OR.
+     $           ( L2KILL .AND. (ABS(A(p,p)) .LT. TEMP1) ) ) GO TO 3302
+            NR = NR + 1
+ 3301    CONTINUE
+ 3302    CONTINUE
+*
+      END IF
+*
+      ALMORT = .FALSE.
+      IF ( NR .EQ. N ) THEN
+         MAXPRJ = ONE
+         DO 3051 p = 2, N
+            TEMP1  = ABS(A(p,p)) / SVA(IWORK(p))
+            MAXPRJ = MIN( MAXPRJ, TEMP1 )
+ 3051    CONTINUE
+         IF ( MAXPRJ**2 .GE. ONE - REAL(N)*EPSLN ) ALMORT = .TRUE.
+      END IF
+*
+*
+      SCONDA = - ONE
+      CONDR1 = - ONE
+      CONDR2 = - ONE
+*
+      IF ( ERREST ) THEN
+         IF ( N .EQ. NR ) THEN
+            IF ( RSVEC ) THEN
+*              .. V is available as workspace
+               CALL CLACPY( 'U', N, N, A, LDA, V, LDV )
+               DO 3053 p = 1, N
+                  TEMP1 = SVA(IWORK(p))
+                  CALL CSSCAL( p, ONE/TEMP1, V(1,p), 1 )
+ 3053          CONTINUE
+               IF ( LSVEC )THEN
+                   CALL CPOCON( 'U', N, V, LDV, ONE, TEMP1,
+     $                  CWORK(N+1), RWORK, IERR )
+               ELSE
+                   CALL CPOCON( 'U', N, V, LDV, ONE, TEMP1,
+     $                  CWORK, RWORK, IERR )
+               END IF               
+*          
+            ELSE IF ( LSVEC ) THEN
+*              .. U is available as workspace
+               CALL CLACPY( 'U', N, N, A, LDA, U, LDU )
+               DO 3054 p = 1, N
+                  TEMP1 = SVA(IWORK(p))
+                  CALL CSSCAL( p, ONE/TEMP1, U(1,p), 1 )
+ 3054          CONTINUE
+               CALL CPOCON( 'U', N, U, LDU, ONE, TEMP1,
+     $              CWORK(N+1), RWORK, IERR )
+            ELSE
+               CALL CLACPY( 'U', N, N, A, LDA, CWORK, N )
+*[]            CALL CLACPY( 'U', N, N, A, LDA, CWORK(N+1), N )
+*              Change: here index shifted by N to the left, CWORK(1:N) 
+*              not needed for SIGMA only computation
+               DO 3052 p = 1, N
+                  TEMP1 = SVA(IWORK(p))
+*[]               CALL CSSCAL( p, ONE/TEMP1, CWORK(N+(p-1)*N+1), 1 )
+                  CALL CSSCAL( p, ONE/TEMP1, CWORK((p-1)*N+1), 1 )
+ 3052          CONTINUE
+*           .. the columns of R are scaled to have unit Euclidean lengths.
+*[]               CALL CPOCON( 'U', N, CWORK(N+1), N, ONE, TEMP1,
+*[]     $              CWORK(N+N*N+1), RWORK, IERR )
+               CALL CPOCON( 'U', N, CWORK, N, ONE, TEMP1,
+     $              CWORK(N*N+1), RWORK, IERR )               
+*              
+            END IF
+            IF ( TEMP1 .NE. ZERO ) THEN 
+               SCONDA = ONE / SQRT(TEMP1)
+            ELSE
+               SCONDA = - ONE
+            END IF
+*           SCONDA is an estimate of SQRT(||(R^* * R)^(-1)||_1).
+*           N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA
+         ELSE
+            SCONDA = - ONE
+         END IF
+      END IF
+*
+      L2PERT = L2PERT .AND. ( ABS( A(1,1)/A(NR,NR) ) .GT. SQRT(BIG1) )
+*     If there is no violent scaling, artificial perturbation is not needed.
+*
+*     Phase 3:
+*
+      IF ( .NOT. ( RSVEC .OR. LSVEC ) ) THEN
+*
+*         Singular Values only
+*
+*         .. transpose A(1:NR,1:N)
+         DO 1946 p = 1, MIN( N-1, NR )
+            CALL CCOPY( N-p, A(p,p+1), LDA, A(p+1,p), 1 )
+            CALL CLACGV( N-p+1, A(p,p), 1 )
+ 1946    CONTINUE
+         IF ( NR .EQ. N ) A(N,N) = CONJG(A(N,N))
+*
+*        The following two DO-loops introduce small relative perturbation
+*        into the strict upper triangle of the lower triangular matrix.
+*        Small entries below the main diagonal are also changed.
+*        This modification is useful if the computing environment does not
+*        provide/allow FLUSH TO ZERO underflow, for it prevents many
+*        annoying denormalized numbers in case of strongly scaled matrices.
+*        The perturbation is structured so that it does not introduce any
+*        new perturbation of the singular values, and it does not destroy
+*        the job done by the preconditioner.
+*        The licence for this perturbation is in the variable L2PERT, which
+*        should be .FALSE. if FLUSH TO ZERO underflow is active.
+*
+         IF ( .NOT. ALMORT ) THEN
+*
+            IF ( L2PERT ) THEN
+*              XSC = SQRT(SMALL)
+               XSC = EPSLN / REAL(N)
+               DO 4947 q = 1, NR
+                  CTEMP = CMPLX(XSC*ABS(A(q,q)),ZERO)
+                  DO 4949 p = 1, N
+                     IF ( ( (p.GT.q) .AND. (ABS(A(p,q)).LE.TEMP1) )
+     $                    .OR. ( p .LT. q ) )
+*     $                     A(p,q) = TEMP1 * ( A(p,q) / ABS(A(p,q)) )
+     $                     A(p,q) = CTEMP
+ 4949             CONTINUE
+ 4947          CONTINUE
+            ELSE
+               CALL CLASET( 'U', NR-1,NR-1, CZERO,CZERO, A(1,2),LDA )
+            END IF
+*
+*            .. second preconditioning using the QR factorization
+*
+            CALL CGEQRF( N,NR, A,LDA, CWORK, CWORK(N+1),LWORK-N, IERR )
+*
+*           .. and transpose upper to lower triangular
+            DO 1948 p = 1, NR - 1
+               CALL CCOPY( NR-p, A(p,p+1), LDA, A(p+1,p), 1 )
+               CALL CLACGV( NR-p+1, A(p,p), 1 )
+ 1948       CONTINUE
+*
+         END IF
+*
+*           Row-cyclic Jacobi SVD algorithm with column pivoting
+*
+*           .. again some perturbation (a "background noise") is added
+*           to drown denormals
+            IF ( L2PERT ) THEN
+*              XSC = SQRT(SMALL)
+               XSC = EPSLN / REAL(N)
+               DO 1947 q = 1, NR
+                  CTEMP = CMPLX(XSC*ABS(A(q,q)),ZERO)
+                  DO 1949 p = 1, NR
+                     IF ( ( (p.GT.q) .AND. (ABS(A(p,q)).LE.TEMP1) )
+     $                       .OR. ( p .LT. q ) )
+*     $                   A(p,q) = TEMP1 * ( A(p,q) / ABS(A(p,q)) )
+     $                   A(p,q) = CTEMP
+ 1949             CONTINUE
+ 1947          CONTINUE
+            ELSE
+               CALL CLASET( 'U', NR-1, NR-1, CZERO, CZERO, A(1,2), LDA )
+            END IF
+*
+*           .. and one-sided Jacobi rotations are started on a lower
+*           triangular matrix (plus perturbation which is ignored in
+*           the part which destroys triangular form (confusing?!))
+*
+            CALL CGESVJ( 'L', 'N', 'N', NR, NR, A, LDA, SVA,
+     $                N, V, LDV, CWORK, LWORK, RWORK, LRWORK, INFO )
+*
+            SCALEM  = RWORK(1)
+            NUMRANK = NINT(RWORK(2))
+*
+*
+      ELSE IF ( ( RSVEC .AND. ( .NOT. LSVEC ) .AND. ( .NOT. JRACC ) ) 
+     $       .OR. 
+     $   ( JRACC .AND. ( .NOT. LSVEC ) .AND. ( NR .NE. N ) ) ) THEN
+*
+*        -> Singular Values and Right Singular Vectors <-
+*
+         IF ( ALMORT ) THEN
+*
+*           .. in this case NR equals N
+            DO 1998 p = 1, NR
+               CALL CCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 )
+               CALL CLACGV( N-p+1, V(p,p), 1 )
+ 1998       CONTINUE
+            CALL CLASET( 'U', NR-1,NR-1, CZERO, CZERO, V(1,2), LDV )
+*
+            CALL CGESVJ( 'L','U','N', N, NR, V, LDV, SVA, NR, A, LDA,
+     $                  CWORK, LWORK, RWORK, LRWORK, INFO )
+            SCALEM  = RWORK(1)
+            NUMRANK = NINT(RWORK(2))
+
+         ELSE
+*
+*        .. two more QR factorizations ( one QRF is not enough, two require
+*        accumulated product of Jacobi rotations, three are perfect )
+*
+            CALL CLASET( 'L', NR-1,NR-1, CZERO, CZERO, A(2,1), LDA )
+            CALL CGELQF( NR,N, A, LDA, CWORK, CWORK(N+1), LWORK-N, IERR)
+            CALL CLACPY( 'L', NR, NR, A, LDA, V, LDV )
+            CALL CLASET( 'U', NR-1,NR-1, CZERO, CZERO, V(1,2), LDV )
+            CALL CGEQRF( NR, NR, V, LDV, CWORK(N+1), CWORK(2*N+1),
+     $                   LWORK-2*N, IERR )
+            DO 8998 p = 1, NR
+               CALL CCOPY( NR-p+1, V(p,p), LDV, V(p,p), 1 )
+               CALL CLACGV( NR-p+1, V(p,p), 1 )
+ 8998       CONTINUE
+            CALL CLASET('U', NR-1, NR-1, CZERO, CZERO, V(1,2), LDV)
+*
+            CALL CGESVJ( 'L', 'U','N', NR, NR, V,LDV, SVA, NR, U,
+     $                  LDU, CWORK(N+1), LWORK-N, RWORK, LRWORK, INFO )
+            SCALEM  = RWORK(1)
+            NUMRANK = NINT(RWORK(2))
+            IF ( NR .LT. N ) THEN
+               CALL CLASET( 'A',N-NR, NR, CZERO,CZERO, V(NR+1,1),  LDV )
+               CALL CLASET( 'A',NR, N-NR, CZERO,CZERO, V(1,NR+1),  LDV )
+               CALL CLASET( 'A',N-NR,N-NR,CZERO,CONE, V(NR+1,NR+1),LDV )
+            END IF
+*
+         CALL CUNMLQ( 'L', 'C', N, N, NR, A, LDA, CWORK,
+     $               V, LDV, CWORK(N+1), LWORK-N, IERR )
+*
+         END IF
+*         .. permute the rows of V
+*         DO 8991 p = 1, N
+*            CALL CCOPY( N, V(p,1), LDV, A(IWORK(p),1), LDA )
+* 8991    CONTINUE
+*         CALL CLACPY( 'All', N, N, A, LDA, V, LDV )
+         CALL CLAPMR( .FALSE., N, N, V, LDV, IWORK )
+*
+          IF ( TRANSP ) THEN
+            CALL CLACPY( 'A', N, N, V, LDV, U, LDU )
+          END IF
+*
+      ELSE IF ( JRACC .AND. (.NOT. LSVEC) .AND. ( NR.EQ. N ) ) THEN 
+*          
+         CALL CLASET( 'L', N-1,N-1, CZERO, CZERO, A(2,1), LDA )
+*
+         CALL CGESVJ( 'U','N','V', N, N, A, LDA, SVA, N, V, LDV,
+     $               CWORK, LWORK, RWORK, LRWORK, INFO )
+          SCALEM  = RWORK(1)
+          NUMRANK = NINT(RWORK(2))
+          CALL CLAPMR( .FALSE., N, N, V, LDV, IWORK )
+*
+      ELSE IF ( LSVEC .AND. ( .NOT. RSVEC ) ) THEN
+*
+*        .. Singular Values and Left Singular Vectors                 ..
+*
+*        .. second preconditioning step to avoid need to accumulate
+*        Jacobi rotations in the Jacobi iterations.
+         DO 1965 p = 1, NR
+            CALL CCOPY( N-p+1, A(p,p), LDA, U(p,p), 1 )
+            CALL CLACGV( N-p+1, U(p,p), 1 )
+ 1965    CONTINUE
+         CALL CLASET( 'U', NR-1, NR-1, CZERO, CZERO, U(1,2), LDU )
+*
+         CALL CGEQRF( N, NR, U, LDU, CWORK(N+1), CWORK(2*N+1),
+     $              LWORK-2*N, IERR )
+*
+         DO 1967 p = 1, NR - 1
+            CALL CCOPY( NR-p, U(p,p+1), LDU, U(p+1,p), 1 )
+            CALL CLACGV( N-p+1, U(p,p), 1 )
+ 1967    CONTINUE
+         CALL CLASET( 'U', NR-1, NR-1, CZERO, CZERO, U(1,2), LDU )
+*
+         CALL CGESVJ( 'L', 'U', 'N', NR,NR, U, LDU, SVA, NR, A,
+     $        LDA, CWORK(N+1), LWORK-N, RWORK, LRWORK, INFO )
+         SCALEM  = RWORK(1)
+         NUMRANK = NINT(RWORK(2))
+*
+         IF ( NR .LT. M ) THEN
+            CALL CLASET( 'A',  M-NR, NR,CZERO, CZERO, U(NR+1,1), LDU )
+            IF ( NR .LT. N1 ) THEN
+               CALL CLASET( 'A',NR, N1-NR, CZERO, CZERO, U(1,NR+1),LDU )
+               CALL CLASET( 'A',M-NR,N1-NR,CZERO,CONE,U(NR+1,NR+1),LDU )
+            END IF
+         END IF
+*
+         CALL CUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U,
+     $               LDU, CWORK(N+1), LWORK-N, IERR )
+*
+         IF ( ROWPIV )
+     $       CALL CLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 )
+*
+         DO 1974 p = 1, N1
+            XSC = ONE / SCNRM2( M, U(1,p), 1 )
+            CALL CSSCAL( M, XSC, U(1,p), 1 )
+ 1974    CONTINUE
+*
+         IF ( TRANSP ) THEN
+            CALL CLACPY( 'A', N, N, U, LDU, V, LDV )
+         END IF
+*
+      ELSE
+*
+*        .. Full SVD ..
+*
+         IF ( .NOT. JRACC ) THEN
+*
+         IF ( .NOT. ALMORT ) THEN
+*
+*           Second Preconditioning Step (QRF [with pivoting])
+*           Note that the composition of TRANSPOSE, QRF and TRANSPOSE is
+*           equivalent to an LQF CALL. Since in many libraries the QRF
+*           seems to be better optimized than the LQF, we do explicit
+*           transpose and use the QRF. This is subject to changes in an
+*           optimized implementation of CGEJSV.
+*
+            DO 1968 p = 1, NR
+               CALL CCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 )
+               CALL CLACGV( N-p+1, V(p,p), 1 )
+ 1968       CONTINUE
+*
+*           .. the following two loops perturb small entries to avoid
+*           denormals in the second QR factorization, where they are
+*           as good as zeros. This is done to avoid painfully slow
+*           computation with denormals. The relative size of the perturbation
+*           is a parameter that can be changed by the implementer.
+*           This perturbation device will be obsolete on machines with
+*           properly implemented arithmetic.
+*           To switch it off, set L2PERT=.FALSE. To remove it from  the
+*           code, remove the action under L2PERT=.TRUE., leave the ELSE part.
+*           The following two loops should be blocked and fused with the
+*           transposed copy above.
+*
+            IF ( L2PERT ) THEN
+               XSC = SQRT(SMALL)
+               DO 2969 q = 1, NR
+                  CTEMP = CMPLX(XSC*ABS( V(q,q) ),ZERO)
+                  DO 2968 p = 1, N
+                     IF ( ( p .GT. q ) .AND. ( ABS(V(p,q)) .LE. TEMP1 )
+     $                   .OR. ( p .LT. q ) )
+*     $                   V(p,q) = TEMP1 * ( V(p,q) / ABS(V(p,q)) )
+     $                   V(p,q) = CTEMP
+                     IF ( p .LT. q ) V(p,q) = - V(p,q)
+ 2968             CONTINUE
+ 2969          CONTINUE
+            ELSE
+               CALL CLASET( 'U', NR-1, NR-1, CZERO, CZERO, V(1,2), LDV )
+            END IF
+*
+*           Estimate the row scaled condition number of R1
+*           (If R1 is rectangular, N > NR, then the condition number
+*           of the leading NR x NR submatrix is estimated.)
+*
+            CALL CLACPY( 'L', NR, NR, V, LDV, CWORK(2*N+1), NR )
+            DO 3950 p = 1, NR
+               TEMP1 = SCNRM2(NR-p+1,CWORK(2*N+(p-1)*NR+p),1)
+               CALL CSSCAL(NR-p+1,ONE/TEMP1,CWORK(2*N+(p-1)*NR+p),1)
+ 3950       CONTINUE
+            CALL CPOCON('L',NR,CWORK(2*N+1),NR,ONE,TEMP1,
+     $                   CWORK(2*N+NR*NR+1),RWORK,IERR)
+            CONDR1 = ONE / SQRT(TEMP1)
+*           .. here need a second opinion on the condition number
+*           .. then assume worst case scenario
+*           R1 is OK for inverse <=> CONDR1 .LT. REAL(N)
+*           more conservative    <=> CONDR1 .LT. SQRT(REAL(N))
+*
+            COND_OK = SQRT(SQRT(REAL(NR)))
+*[TP]       COND_OK is a tuning parameter.
+*
+            IF ( CONDR1 .LT. COND_OK ) THEN
+*              .. the second QRF without pivoting. Note: in an optimized
+*              implementation, this QRF should be implemented as the QRF
+*              of a lower triangular matrix.
+*              R1^* = Q2 * R2
+               CALL CGEQRF( N, NR, V, LDV, CWORK(N+1), CWORK(2*N+1),
+     $              LWORK-2*N, IERR )
+*
+               IF ( L2PERT ) THEN
+                  XSC = SQRT(SMALL)/EPSLN
+                  DO 3959 p = 2, NR
+                     DO 3958 q = 1, p - 1
+                        CTEMP=CMPLX(XSC*MIN(ABS(V(p,p)),ABS(V(q,q))),
+     $                              ZERO)
+                        IF ( ABS(V(q,p)) .LE. TEMP1 )
+*     $                     V(q,p) = TEMP1 * ( V(q,p) / ABS(V(q,p)) )
+     $                     V(q,p) = CTEMP
+ 3958                CONTINUE
+ 3959             CONTINUE
+               END IF
+*
+               IF ( NR .NE. N )
+     $         CALL CLACPY( 'A', N, NR, V, LDV, CWORK(2*N+1), N )
+*              .. save ...
+*
+*           .. this transposed copy should be better than naive
+               DO 1969 p = 1, NR - 1
+                  CALL CCOPY( NR-p, V(p,p+1), LDV, V(p+1,p), 1 )
+                  CALL CLACGV(NR-p+1, V(p,p), 1 )
+ 1969          CONTINUE
+               V(NR,NR)=CONJG(V(NR,NR))
+*
+               CONDR2 = CONDR1
+*
+            ELSE
+*
+*              .. ill-conditioned case: second QRF with pivoting
+*              Note that windowed pivoting would be equally good
+*              numerically, and more run-time efficient. So, in
+*              an optimal implementation, the next call to CGEQP3
+*              should be replaced with eg. CALL CGEQPX (ACM TOMS #782)
+*              with properly (carefully) chosen parameters.
+*
+*              R1^* * P2 = Q2 * R2
+               DO 3003 p = 1, NR
+                  IWORK(N+p) = 0
+ 3003          CONTINUE
+               CALL CGEQP3( N, NR, V, LDV, IWORK(N+1), CWORK(N+1),
+     $                  CWORK(2*N+1), LWORK-2*N, RWORK, IERR )
+**               CALL CGEQRF( N, NR, V, LDV, CWORK(N+1), CWORK(2*N+1),
+**     $              LWORK-2*N, IERR )
+               IF ( L2PERT ) THEN
+                  XSC = SQRT(SMALL)
+                  DO 3969 p = 2, NR
+                     DO 3968 q = 1, p - 1
+                        CTEMP=CMPLX(XSC*MIN(ABS(V(p,p)),ABS(V(q,q))),
+     $                                ZERO)
+                        IF ( ABS(V(q,p)) .LE. TEMP1 )
+*     $                     V(q,p) = TEMP1 * ( V(q,p) / ABS(V(q,p)) )
+     $                     V(q,p) = CTEMP
+ 3968                CONTINUE
+ 3969             CONTINUE
+               END IF
+*
+               CALL CLACPY( 'A', N, NR, V, LDV, CWORK(2*N+1), N )
+*
+               IF ( L2PERT ) THEN
+                  XSC = SQRT(SMALL)
+                  DO 8970 p = 2, NR
+                     DO 8971 q = 1, p - 1
+                        CTEMP=CMPLX(XSC*MIN(ABS(V(p,p)),ABS(V(q,q))),
+     $                               ZERO)
+*                        V(p,q) = - TEMP1*( V(q,p) / ABS(V(q,p)) )
+                        V(p,q) = - CTEMP
+ 8971                CONTINUE
+ 8970             CONTINUE
+               ELSE
+                  CALL CLASET( 'L',NR-1,NR-1,CZERO,CZERO,V(2,1),LDV )
+               END IF
+*              Now, compute R2 = L3 * Q3, the LQ factorization.
+               CALL CGELQF( NR, NR, V, LDV, CWORK(2*N+N*NR+1),
+     $               CWORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, IERR )
+*              .. and estimate the condition number
+               CALL CLACPY( 'L',NR,NR,V,LDV,CWORK(2*N+N*NR+NR+1),NR )
+               DO 4950 p = 1, NR
+                  TEMP1 = SCNRM2( p, CWORK(2*N+N*NR+NR+p), NR )
+                  CALL CSSCAL( p, ONE/TEMP1, CWORK(2*N+N*NR+NR+p), NR )
+ 4950          CONTINUE
+               CALL CPOCON( 'L',NR,CWORK(2*N+N*NR+NR+1),NR,ONE,TEMP1,
+     $              CWORK(2*N+N*NR+NR+NR*NR+1),RWORK,IERR )
+               CONDR2 = ONE / SQRT(TEMP1)
+*
+*
+               IF ( CONDR2 .GE. COND_OK ) THEN
+*                 .. save the Householder vectors used for Q3
+*                 (this overwrites the copy of R2, as it will not be
+*                 needed in this branch, but it does not overwrite the
+*                 Householder vectors of Q2.).
+                  CALL CLACPY( 'U', NR, NR, V, LDV, CWORK(2*N+1), N )
+*                 .. and the rest of the information on Q3 is in
+*                 CWORK(2*N+N*NR+1:2*N+N*NR+NR)
+               END IF
+*
+            END IF
+*
+            IF ( L2PERT ) THEN
+               XSC = SQRT(SMALL)
+               DO 4968 q = 2, NR
+                  CTEMP = XSC * V(q,q)
+                  DO 4969 p = 1, q - 1
+*                     V(p,q) = - TEMP1*( V(p,q) / ABS(V(p,q)) )
+                     V(p,q) = - CTEMP
+ 4969             CONTINUE
+ 4968          CONTINUE
+            ELSE
+               CALL CLASET( 'U', NR-1,NR-1, CZERO,CZERO, V(1,2), LDV )
+            END IF
+*
+*        Second preconditioning finished; continue with Jacobi SVD
+*        The input matrix is lower triangular.
+*
+*        Recover the right singular vectors as solution of a well
+*        conditioned triangular matrix equation.
+*
+            IF ( CONDR1 .LT. COND_OK ) THEN
+*
+               CALL CGESVJ( 'L','U','N',NR,NR,V,LDV,SVA,NR,U, LDU,
+     $              CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,RWORK,
+     $              LRWORK, INFO )
+               SCALEM  = RWORK(1)
+               NUMRANK = NINT(RWORK(2))
+               DO 3970 p = 1, NR
+                  CALL CCOPY(  NR, V(1,p), 1, U(1,p), 1 )
+                  CALL CSSCAL( NR, SVA(p),    V(1,p), 1 )
+ 3970          CONTINUE
+
+*        .. pick the right matrix equation and solve it
+*
+               IF ( NR .EQ. N ) THEN
+* :))             .. best case, R1 is inverted. The solution of this matrix
+*                 equation is Q2*V2 = the product of the Jacobi rotations
+*                 used in CGESVJ, premultiplied with the orthogonal matrix
+*                 from the second QR factorization.
+                  CALL CTRSM('L','U','N','N', NR,NR,CONE, A,LDA, V,LDV)
+               ELSE
+*                 .. R1 is well conditioned, but non-square. Adjoint of R2
+*                 is inverted to get the product of the Jacobi rotations
+*                 used in CGESVJ. The Q-factor from the second QR
+*                 factorization is then built in explicitly.
+                  CALL CTRSM('L','U','C','N',NR,NR,CONE,CWORK(2*N+1),
+     $                 N,V,LDV)
+                  IF ( NR .LT. N ) THEN
+                  CALL CLASET('A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV)
+                  CALL CLASET('A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV)
+                  CALL CLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV)
+                  END IF
+                  CALL CUNMQR('L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1),
+     $                V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR)
+               END IF
+*
+            ELSE IF ( CONDR2 .LT. COND_OK ) THEN
+*
+*              The matrix R2 is inverted. The solution of the matrix equation
+*              is Q3^* * V3 = the product of the Jacobi rotations (applied to
+*              the lower triangular L3 from the LQ factorization of
+*              R2=L3*Q3), pre-multiplied with the transposed Q3.
+               CALL CGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U,
+     $              LDU, CWORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR,
+     $              RWORK, LRWORK, INFO )
+               SCALEM  = RWORK(1)
+               NUMRANK = NINT(RWORK(2))
+               DO 3870 p = 1, NR
+                  CALL CCOPY( NR, V(1,p), 1, U(1,p), 1 )
+                  CALL CSSCAL( NR, SVA(p),    U(1,p), 1 )
+ 3870          CONTINUE
+               CALL CTRSM('L','U','N','N',NR,NR,CONE,CWORK(2*N+1),N,
+     $                    U,LDU)
+*              .. apply the permutation from the second QR factorization
+               DO 873 q = 1, NR
+                  DO 872 p = 1, NR
+                     CWORK(2*N+N*NR+NR+IWORK(N+p)) = U(p,q)
+ 872              CONTINUE
+                  DO 874 p = 1, NR
+                     U(p,q) = CWORK(2*N+N*NR+NR+p)
+ 874              CONTINUE
+ 873           CONTINUE
+               IF ( NR .LT. N ) THEN
+                  CALL CLASET( 'A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV )
+                  CALL CLASET( 'A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV )
+                  CALL CLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV)
+               END IF
+               CALL CUNMQR( 'L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1),
+     $              V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
+            ELSE
+*              Last line of defense.
+* #:(          This is a rather pathological case: no scaled condition
+*              improvement after two pivoted QR factorizations. Other
+*              possibility is that the rank revealing QR factorization
+*              or the condition estimator has failed, or the COND_OK
+*              is set very close to ONE (which is unnecessary). Normally,
+*              this branch should never be executed, but in rare cases of
+*              failure of the RRQR or condition estimator, the last line of
+*              defense ensures that CGEJSV completes the task.
+*              Compute the full SVD of L3 using CGESVJ with explicit
+*              accumulation of Jacobi rotations.
+               CALL CGESVJ( 'L', 'U', 'V', NR, NR, V, LDV, SVA, NR, U,
+     $              LDU, CWORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR,
+     $                         RWORK, LRWORK, INFO )
+               SCALEM  = RWORK(1)
+               NUMRANK = NINT(RWORK(2))
+               IF ( NR .LT. N ) THEN
+                  CALL CLASET( 'A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV )
+                  CALL CLASET( 'A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV )
+                  CALL CLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV)
+               END IF
+               CALL CUNMQR( 'L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1),
+     $              V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
+*
+               CALL CUNMLQ( 'L', 'C', NR, NR, NR, CWORK(2*N+1), N,
+     $              CWORK(2*N+N*NR+1), U, LDU, CWORK(2*N+N*NR+NR+1),
+     $              LWORK-2*N-N*NR-NR, IERR )
+               DO 773 q = 1, NR
+                  DO 772 p = 1, NR
+                     CWORK(2*N+N*NR+NR+IWORK(N+p)) = U(p,q)
+ 772              CONTINUE
+                  DO 774 p = 1, NR
+                     U(p,q) = CWORK(2*N+N*NR+NR+p)
+ 774              CONTINUE
+ 773           CONTINUE
+*
+            END IF
+*
+*           Permute the rows of V using the (column) permutation from the
+*           first QRF. Also, scale the columns to make them unit in
+*           Euclidean norm. This applies to all cases.
+*
+            TEMP1 = SQRT(REAL(N)) * EPSLN
+            DO 1972 q = 1, N
+               DO 972 p = 1, N
+                  CWORK(2*N+N*NR+NR+IWORK(p)) = V(p,q)
+  972          CONTINUE
+               DO 973 p = 1, N
+                  V(p,q) = CWORK(2*N+N*NR+NR+p)
+  973          CONTINUE
+               XSC = ONE / SCNRM2( N, V(1,q), 1 )
+               IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) )
+     $           CALL CSSCAL( N, XSC, V(1,q), 1 )
+ 1972       CONTINUE
+*           At this moment, V contains the right singular vectors of A.
+*           Next, assemble the left singular vector matrix U (M x N).
+            IF ( NR .LT. M ) THEN
+               CALL CLASET('A', M-NR, NR, CZERO, CZERO, U(NR+1,1), LDU)
+               IF ( NR .LT. N1 ) THEN
+                  CALL CLASET('A',NR,N1-NR,CZERO,CZERO,U(1,NR+1),LDU)
+                  CALL CLASET('A',M-NR,N1-NR,CZERO,CONE,
+     $                        U(NR+1,NR+1),LDU)
+               END IF
+            END IF
+*
+*           The Q matrix from the first QRF is built into the left singular
+*           matrix U. This applies to all cases.
+*
+            CALL CUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U,
+     $           LDU, CWORK(N+1), LWORK-N, IERR )
+
+*           The columns of U are normalized. The cost is O(M*N) flops.
+            TEMP1 = SQRT(REAL(M)) * EPSLN
+            DO 1973 p = 1, NR
+               XSC = ONE / SCNRM2( M, U(1,p), 1 )
+               IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) )
+     $          CALL CSSCAL( M, XSC, U(1,p), 1 )
+ 1973       CONTINUE
+*
+*           If the initial QRF is computed with row pivoting, the left
+*           singular vectors must be adjusted.
+*
+            IF ( ROWPIV )
+     $          CALL CLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 )
+*
+         ELSE
+*
+*        .. the initial matrix A has almost orthogonal columns and
+*        the second QRF is not needed
+*
+            CALL CLACPY( 'U', N, N, A, LDA, CWORK(N+1), N )
+            IF ( L2PERT ) THEN
+               XSC = SQRT(SMALL)
+               DO 5970 p = 2, N
+                  CTEMP = XSC * CWORK( N + (p-1)*N + p )
+                  DO 5971 q = 1, p - 1
+*                     CWORK(N+(q-1)*N+p)=-TEMP1 * ( CWORK(N+(p-1)*N+q) /
+*     $                                        ABS(CWORK(N+(p-1)*N+q)) )
+                     CWORK(N+(q-1)*N+p)=-CTEMP
+ 5971             CONTINUE
+ 5970          CONTINUE
+            ELSE
+               CALL CLASET( 'L',N-1,N-1,CZERO,CZERO,CWORK(N+2),N )
+            END IF
+*
+            CALL CGESVJ( 'U', 'U', 'N', N, N, CWORK(N+1), N, SVA,
+     $           N, U, LDU, CWORK(N+N*N+1), LWORK-N-N*N, RWORK, LRWORK,
+     $       INFO )
+*
+            SCALEM  = RWORK(1)
+            NUMRANK = NINT(RWORK(2))
+            DO 6970 p = 1, N
+               CALL CCOPY( N, CWORK(N+(p-1)*N+1), 1, U(1,p), 1 )
+               CALL CSSCAL( N, SVA(p), CWORK(N+(p-1)*N+1), 1 )
+ 6970       CONTINUE
+*
+            CALL CTRSM( 'L', 'U', 'N', 'N', N, N,
+     $           CONE, A, LDA, CWORK(N+1), N )
+            DO 6972 p = 1, N
+               CALL CCOPY( N, CWORK(N+p), N, V(IWORK(p),1), LDV )
+ 6972       CONTINUE
+            TEMP1 = SQRT(REAL(N))*EPSLN
+            DO 6971 p = 1, N
+               XSC = ONE / SCNRM2( N, V(1,p), 1 )
+               IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) )
+     $            CALL CSSCAL( N, XSC, V(1,p), 1 )
+ 6971       CONTINUE
+*
+*           Assemble the left singular vector matrix U (M x N).
+*
+            IF ( N .LT. M ) THEN
+               CALL CLASET( 'A',  M-N, N, CZERO, CZERO, U(N+1,1), LDU )
+               IF ( N .LT. N1 ) THEN
+                  CALL CLASET('A',N,  N1-N, CZERO, CZERO,  U(1,N+1),LDU)
+                  CALL CLASET( 'A',M-N,N1-N, CZERO, CONE,U(N+1,N+1),LDU)
+               END IF
+            END IF
+            CALL CUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U,
+     $           LDU, CWORK(N+1), LWORK-N, IERR )
+            TEMP1 = SQRT(REAL(M))*EPSLN
+            DO 6973 p = 1, N1
+               XSC = ONE / SCNRM2( M, U(1,p), 1 )
+               IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) )
+     $            CALL CSSCAL( M, XSC, U(1,p), 1 )
+ 6973       CONTINUE
+*
+            IF ( ROWPIV )
+     $         CALL CLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 )
+*
+         END IF
+*
+*        end of the  >> almost orthogonal case <<  in the full SVD
+*
+         ELSE
+*
+*        This branch deploys a preconditioned Jacobi SVD with explicitly
+*        accumulated rotations. It is included as optional, mainly for
+*        experimental purposes. It does perform well, and can also be used.
+*        In this implementation, this branch will be automatically activated
+*        if the  condition number sigma_max(A) / sigma_min(A) is predicted
+*        to be greater than the overflow threshold. This is because the
+*        a posteriori computation of the singular vectors assumes robust
+*        implementation of BLAS and some LAPACK procedures, capable of working
+*        in presence of extreme values, e.g. when the singular values spread from
+*        the underflow to the overflow threshold. 
+*
+         DO 7968 p = 1, NR
+            CALL CCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 )
+            CALL CLACGV( N-p+1, V(p,p), 1 )
+ 7968    CONTINUE
+*
+         IF ( L2PERT ) THEN
+            XSC = SQRT(SMALL/EPSLN)
+            DO 5969 q = 1, NR
+               CTEMP = CMPLX(XSC*ABS( V(q,q) ),ZERO)
+               DO 5968 p = 1, N
+                  IF ( ( p .GT. q ) .AND. ( ABS(V(p,q)) .LE. TEMP1 )
+     $                .OR. ( p .LT. q ) )
+*     $                V(p,q) = TEMP1 * ( V(p,q) / ABS(V(p,q)) )
+     $                V(p,q) = CTEMP
+                  IF ( p .LT. q ) V(p,q) = - V(p,q)
+ 5968          CONTINUE
+ 5969       CONTINUE
+         ELSE
+            CALL CLASET( 'U', NR-1, NR-1, CZERO, CZERO, V(1,2), LDV )
+         END IF
+
+         CALL CGEQRF( N, NR, V, LDV, CWORK(N+1), CWORK(2*N+1),
+     $        LWORK-2*N, IERR )
+         CALL CLACPY( 'L', N, NR, V, LDV, CWORK(2*N+1), N )
+*
+         DO 7969 p = 1, NR
+            CALL CCOPY( NR-p+1, V(p,p), LDV, U(p,p), 1 )
+            CALL CLACGV( NR-p+1, U(p,p), 1 )
+ 7969    CONTINUE
+
+         IF ( L2PERT ) THEN
+            XSC = SQRT(SMALL/EPSLN)
+            DO 9970 q = 2, NR
+               DO 9971 p = 1, q - 1
+                  CTEMP = CMPLX(XSC * MIN(ABS(U(p,p)),ABS(U(q,q))),
+     $                           ZERO)
+*                  U(p,q) = - TEMP1 * ( U(q,p) / ABS(U(q,p)) )
+                  U(p,q) = - CTEMP
+ 9971          CONTINUE
+ 9970       CONTINUE
+         ELSE
+            CALL CLASET('U', NR-1, NR-1, CZERO, CZERO, U(1,2), LDU )
+         END IF
+
+         CALL CGESVJ( 'L', 'U', 'V', NR, NR, U, LDU, SVA,
+     $        N, V, LDV, CWORK(2*N+N*NR+1), LWORK-2*N-N*NR,
+     $         RWORK, LRWORK, INFO )
+         SCALEM  = RWORK(1)
+         NUMRANK = NINT(RWORK(2))
+
+         IF ( NR .LT. N ) THEN
+            CALL CLASET( 'A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV )
+            CALL CLASET( 'A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV )
+            CALL CLASET( 'A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV )
+         END IF
+
+         CALL CUNMQR( 'L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1),
+     $        V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
+*
+*           Permute the rows of V using the (column) permutation from the
+*           first QRF. Also, scale the columns to make them unit in
+*           Euclidean norm. This applies to all cases.
+*
+            TEMP1 = SQRT(REAL(N)) * EPSLN
+            DO 7972 q = 1, N
+               DO 8972 p = 1, N
+                  CWORK(2*N+N*NR+NR+IWORK(p)) = V(p,q)
+ 8972          CONTINUE
+               DO 8973 p = 1, N
+                  V(p,q) = CWORK(2*N+N*NR+NR+p)
+ 8973          CONTINUE
+               XSC = ONE / SCNRM2( N, V(1,q), 1 )
+               IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) )
+     $           CALL CSSCAL( N, XSC, V(1,q), 1 )
+ 7972       CONTINUE
+*
+*           At this moment, V contains the right singular vectors of A.
+*           Next, assemble the left singular vector matrix U (M x N).
+*
+         IF ( NR .LT. M ) THEN
+            CALL CLASET( 'A',  M-NR, NR, CZERO, CZERO, U(NR+1,1), LDU )
+            IF ( NR .LT. N1 ) THEN
+               CALL CLASET('A',NR,  N1-NR, CZERO, CZERO,  U(1,NR+1),LDU)
+               CALL CLASET('A',M-NR,N1-NR, CZERO, CONE,U(NR+1,NR+1),LDU)
+            END IF
+         END IF
+*
+         CALL CUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U,
+     $        LDU, CWORK(N+1), LWORK-N, IERR )
+*
+            IF ( ROWPIV )
+     $         CALL CLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 )
+*
+*
+         END IF
+         IF ( TRANSP ) THEN
+*           .. swap U and V because the procedure worked on A^*
+            DO 6974 p = 1, N
+               CALL CSWAP( N, U(1,p), 1, V(1,p), 1 )
+ 6974       CONTINUE
+         END IF
+*
+      END IF
+*     end of the full SVD
+*
+*     Undo scaling, if necessary (and possible)
+*
+      IF ( USCAL2 .LE. (BIG/SVA(1))*USCAL1 ) THEN
+         CALL SLASCL( 'G', 0, 0, USCAL1, USCAL2, NR, 1, SVA, N, IERR )
+         USCAL1 = ONE
+         USCAL2 = ONE
+      END IF
+*
+      IF ( NR .LT. N ) THEN
+         DO 3004 p = NR+1, N
+            SVA(p) = ZERO
+ 3004    CONTINUE
+      END IF
+*
+      RWORK(1) = USCAL2 * SCALEM
+      RWORK(2) = USCAL1
+      IF ( ERREST ) RWORK(3) = SCONDA
+      IF ( LSVEC .AND. RSVEC ) THEN
+         RWORK(4) = CONDR1
+         RWORK(5) = CONDR2
+      END IF
+      IF ( L2TRAN ) THEN
+         RWORK(6) = ENTRA
+         RWORK(7) = ENTRAT
+      END IF
+*
+      IWORK(1) = NR
+      IWORK(2) = NUMRANK
+      IWORK(3) = WARNING
+      IF ( TRANSP ) THEN
+          IWORK(4) =  1 
+      ELSE
+          IWORK(4) = -1
+      END IF 
+      
+*
+      RETURN
+*     ..
+*     .. END OF CGEJSV
+*     ..
+      END
+*
diff --git a/lapack-netlib/cgesvx.f b/lapack-netlib/cgesvx.f
new file mode 100644
index 000000000..74a37e9a0
--- /dev/null
+++ b/lapack-netlib/cgesvx.f
@@ -0,0 +1,602 @@
+*> \brief <b> CGESVX computes the solution to a system of linear equations A * X = B for GE matrices</b>
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CGESVX + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/cgesvx.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/cgesvx.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/cgesvx.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV,
+*                          EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR,
+*                          WORK, RWORK, INFO )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          EQUED, FACT, TRANS
+*       INTEGER            INFO, LDA, LDAF, LDB, LDX, N, NRHS
+*       REAL               RCOND
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IPIV( * )
+*       REAL               BERR( * ), C( * ), FERR( * ), R( * ),
+*      $                   RWORK( * )
+*       COMPLEX            A( LDA, * ), AF( LDAF, * ), B( LDB, * ),
+*      $                   WORK( * ), X( LDX, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CGESVX uses the LU factorization to compute the solution to a complex
+*> system of linear equations
+*>    A * X = B,
+*> where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
+*>
+*> Error bounds on the solution and a condition estimate are also
+*> provided.
+*> \endverbatim
+*
+*> \par Description:
+*  =================
+*>
+*> \verbatim
+*>
+*> The following steps are performed:
+*>
+*> 1. If FACT = 'E', real scaling factors are computed to equilibrate
+*>    the system:
+*>       TRANS = 'N':  diag(R)*A*diag(C)     *inv(diag(C))*X = diag(R)*B
+*>       TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
+*>       TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
+*>    Whether or not the system will be equilibrated depends on the
+*>    scaling of the matrix A, but if equilibration is used, A is
+*>    overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N')
+*>    or diag(C)*B (if TRANS = 'T' or 'C').
+*>
+*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the
+*>    matrix A (after equilibration if FACT = 'E') as
+*>       A = P * L * U,
+*>    where P is a permutation matrix, L is a unit lower triangular
+*>    matrix, and U is upper triangular.
+*>
+*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine
+*>    returns with INFO = i. Otherwise, the factored form of A is used
+*>    to estimate the condition number of the matrix A.  If the
+*>    reciprocal of the condition number is less than machine precision,
+*>    INFO = N+1 is returned as a warning, but the routine still goes on
+*>    to solve for X and compute error bounds as described below.
+*>
+*> 4. The system of equations is solved for X using the factored form
+*>    of A.
+*>
+*> 5. Iterative refinement is applied to improve the computed solution
+*>    matrix and calculate error bounds and backward error estimates
+*>    for it.
+*>
+*> 6. If equilibration was used, the matrix X is premultiplied by
+*>    diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so
+*>    that it solves the original system before equilibration.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] FACT
+*> \verbatim
+*>          FACT is CHARACTER*1
+*>          Specifies whether or not the factored form of the matrix A is
+*>          supplied on entry, and if not, whether the matrix A should be
+*>          equilibrated before it is factored.
+*>          = 'F':  On entry, AF and IPIV contain the factored form of A.
+*>                  If EQUED is not 'N', the matrix A has been
+*>                  equilibrated with scaling factors given by R and C.
+*>                  A, AF, and IPIV are not modified.
+*>          = 'N':  The matrix A will be copied to AF and factored.
+*>          = 'E':  The matrix A will be equilibrated if necessary, then
+*>                  copied to AF and factored.
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*>          TRANS is CHARACTER*1
+*>          Specifies the form of the system of equations:
+*>          = 'N':  A * X = B     (No transpose)
+*>          = 'T':  A**T * X = B  (Transpose)
+*>          = 'C':  A**H * X = B  (Conjugate transpose)
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of linear equations, i.e., the order of the
+*>          matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of columns
+*>          of the matrices B and X.  NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX array, dimension (LDA,N)
+*>          On entry, the N-by-N matrix A.  If FACT = 'F' and EQUED is
+*>          not 'N', then A must have been equilibrated by the scaling
+*>          factors in R and/or C.  A is not modified if FACT = 'F' or
+*>          'N', or if FACT = 'E' and EQUED = 'N' on exit.
+*>
+*>          On exit, if EQUED .ne. 'N', A is scaled as follows:
+*>          EQUED = 'R':  A := diag(R) * A
+*>          EQUED = 'C':  A := A * diag(C)
+*>          EQUED = 'B':  A := diag(R) * A * diag(C).
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in,out] AF
+*> \verbatim
+*>          AF is COMPLEX array, dimension (LDAF,N)
+*>          If FACT = 'F', then AF is an input argument and on entry
+*>          contains the factors L and U from the factorization
+*>          A = P*L*U as computed by CGETRF.  If EQUED .ne. 'N', then
+*>          AF is the factored form of the equilibrated matrix A.
+*>
+*>          If FACT = 'N', then AF is an output argument and on exit
+*>          returns the factors L and U from the factorization A = P*L*U
+*>          of the original matrix A.
+*>
+*>          If FACT = 'E', then AF is an output argument and on exit
+*>          returns the factors L and U from the factorization A = P*L*U
+*>          of the equilibrated matrix A (see the description of A for
+*>          the form of the equilibrated matrix).
+*> \endverbatim
+*>
+*> \param[in] LDAF
+*> \verbatim
+*>          LDAF is INTEGER
+*>          The leading dimension of the array AF.  LDAF >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in,out] IPIV
+*> \verbatim
+*>          IPIV is INTEGER array, dimension (N)
+*>          If FACT = 'F', then IPIV is an input argument and on entry
+*>          contains the pivot indices from the factorization A = P*L*U
+*>          as computed by CGETRF; row i of the matrix was interchanged
+*>          with row IPIV(i).
+*>
+*>          If FACT = 'N', then IPIV is an output argument and on exit
+*>          contains the pivot indices from the factorization A = P*L*U
+*>          of the original matrix A.
+*>
+*>          If FACT = 'E', then IPIV is an output argument and on exit
+*>          contains the pivot indices from the factorization A = P*L*U
+*>          of the equilibrated matrix A.
+*> \endverbatim
+*>
+*> \param[in,out] EQUED
+*> \verbatim
+*>          EQUED is CHARACTER*1
+*>          Specifies the form of equilibration that was done.
+*>          = 'N':  No equilibration (always true if FACT = 'N').
+*>          = 'R':  Row equilibration, i.e., A has been premultiplied by
+*>                  diag(R).
+*>          = 'C':  Column equilibration, i.e., A has been postmultiplied
+*>                  by diag(C).
+*>          = 'B':  Both row and column equilibration, i.e., A has been
+*>                  replaced by diag(R) * A * diag(C).
+*>          EQUED is an input argument if FACT = 'F'; otherwise, it is an
+*>          output argument.
+*> \endverbatim
+*>
+*> \param[in,out] R
+*> \verbatim
+*>          R is REAL array, dimension (N)
+*>          The row scale factors for A.  If EQUED = 'R' or 'B', A is
+*>          multiplied on the left by diag(R); if EQUED = 'N' or 'C', R
+*>          is not accessed.  R is an input argument if FACT = 'F';
+*>          otherwise, R is an output argument.  If FACT = 'F' and
+*>          EQUED = 'R' or 'B', each element of R must be positive.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is REAL array, dimension (N)
+*>          The column scale factors for A.  If EQUED = 'C' or 'B', A is
+*>          multiplied on the right by diag(C); if EQUED = 'N' or 'R', C
+*>          is not accessed.  C is an input argument if FACT = 'F';
+*>          otherwise, C is an output argument.  If FACT = 'F' and
+*>          EQUED = 'C' or 'B', each element of C must be positive.
+*> \endverbatim
+*>
+*> \param[in,out] B
+*> \verbatim
+*>          B is COMPLEX array, dimension (LDB,NRHS)
+*>          On entry, the N-by-NRHS right hand side matrix B.
+*>          On exit,
+*>          if EQUED = 'N', B is not modified;
+*>          if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by
+*>          diag(R)*B;
+*>          if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is
+*>          overwritten by diag(C)*B.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*>          LDB is INTEGER
+*>          The leading dimension of the array B.  LDB >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] X
+*> \verbatim
+*>          X is COMPLEX array, dimension (LDX,NRHS)
+*>          If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X
+*>          to the original system of equations.  Note that A and B are
+*>          modified on exit if EQUED .ne. 'N', and the solution to the
+*>          equilibrated system is inv(diag(C))*X if TRANS = 'N' and
+*>          EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C'
+*>          and EQUED = 'R' or 'B'.
+*> \endverbatim
+*>
+*> \param[in] LDX
+*> \verbatim
+*>          LDX is INTEGER
+*>          The leading dimension of the array X.  LDX >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] RCOND
+*> \verbatim
+*>          RCOND is REAL
+*>          The estimate of the reciprocal condition number of the matrix
+*>          A after equilibration (if done).  If RCOND is less than the
+*>          machine precision (in particular, if RCOND = 0), the matrix
+*>          is singular to working precision.  This condition is
+*>          indicated by a return code of INFO > 0.
+*> \endverbatim
+*>
+*> \param[out] FERR
+*> \verbatim
+*>          FERR is REAL array, dimension (NRHS)
+*>          The estimated forward error bound for each solution vector
+*>          X(j) (the j-th column of the solution matrix X).
+*>          If XTRUE is the true solution corresponding to X(j), FERR(j)
+*>          is an estimated upper bound for the magnitude of the largest
+*>          element in (X(j) - XTRUE) divided by the magnitude of the
+*>          largest element in X(j).  The estimate is as reliable as
+*>          the estimate for RCOND, and is almost always a slight
+*>          overestimate of the true error.
+*> \endverbatim
+*>
+*> \param[out] BERR
+*> \verbatim
+*>          BERR is REAL array, dimension (NRHS)
+*>          The componentwise relative backward error of each solution
+*>          vector X(j) (i.e., the smallest relative change in
+*>          any element of A or B that makes X(j) an exact solution).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX array, dimension (2*N)
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is REAL array, dimension (MAX(1,2*N))
+*>          On exit, RWORK(1) contains the reciprocal pivot growth
+*>          factor norm(A)/norm(U). The "max absolute element" norm is
+*>          used. If RWORK(1) is much less than 1, then the stability
+*>          of the LU factorization of the (equilibrated) matrix A
+*>          could be poor. This also means that the solution X, condition
+*>          estimator RCOND, and forward error bound FERR could be
+*>          unreliable. If factorization fails with 0<INFO<=N, then
+*>          RWORK(1) contains the reciprocal pivot growth factor for the
+*>          leading INFO columns of A.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i, and i is
+*>                <= N:  U(i,i) is exactly zero.  The factorization has
+*>                       been completed, but the factor U is exactly
+*>                       singular, so the solution and error bounds
+*>                       could not be computed. RCOND = 0 is returned.
+*>                = N+1: U is nonsingular, but RCOND is less than machine
+*>                       precision, meaning that the matrix is singular
+*>                       to working precision.  Nevertheless, the
+*>                       solution and error bounds are computed because
+*>                       there are a number of situations where the
+*>                       computed solution can be more accurate than the
+*>                       value of RCOND would suggest.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup complexGEsolve
+*
+*  =====================================================================
+      SUBROUTINE CGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV,
+     $                   EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR,
+     $                   WORK, RWORK, INFO )
+*  Expert LU driver: solution X, condition estimate and error bounds.
+*  -- LAPACK driver routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      CHARACTER          EQUED, FACT, TRANS
+      INTEGER            INFO, LDA, LDAF, LDB, LDX, N, NRHS
+      REAL               RCOND
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IPIV( * )
+      REAL               BERR( * ), C( * ), FERR( * ), R( * ),
+     $                   RWORK( * )
+      COMPLEX            A( LDA, * ), AF( LDAF, * ), B( LDB, * ),
+     $                   WORK( * ), X( LDX, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU
+      CHARACTER          NORM
+      INTEGER            I, INFEQU, J
+      REAL               AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN,
+     $                   ROWCND, RPVGRW, SMLNUM
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      REAL               CLANGE, CLANTR, SLAMCH
+      EXTERNAL           LSAME, CLANGE, CLANTR, SLAMCH
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           CGECON, CGEEQU, CGERFS, CGETRF, CGETRS, CLACPY,
+     $                   CLAQGE, XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*     Decode FACT and TRANS; EQUED is read only when FACT = 'F'.
+      INFO = 0
+      NOFACT = LSAME( FACT, 'N' )
+      EQUIL = LSAME( FACT, 'E' )
+      NOTRAN = LSAME( TRANS, 'N' )
+      IF( NOFACT .OR. EQUIL ) THEN
+         EQUED = 'N'
+         ROWEQU = .FALSE.
+         COLEQU = .FALSE.
+      ELSE
+         ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+         COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         SMLNUM = SLAMCH( 'Safe minimum' )
+         BIGNUM = ONE / SMLNUM
+      END IF
+*
+*     Test the input parameters.
+*     (INFO = -i reports the i-th argument of the calling sequence.)
+      IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) )
+     $     THEN
+         INFO = -1
+      ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT.
+     $         LSAME( TRANS, 'C' ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( NRHS.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -6
+      ELSE IF( LDAF.LT.MAX( 1, N ) ) THEN
+         INFO = -8
+      ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT.
+     $         ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN
+         INFO = -10
+      ELSE
+         IF( ROWEQU ) THEN
+            RCMIN = BIGNUM
+            RCMAX = ZERO
+            DO 10 J = 1, N
+               RCMIN = MIN( RCMIN, R( J ) )
+               RCMAX = MAX( RCMAX, R( J ) )
+   10       CONTINUE
+            IF( RCMIN.LE.ZERO ) THEN
+               INFO = -11
+            ELSE IF( N.GT.0 ) THEN
+               ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM )
+            ELSE
+               ROWCND = ONE
+            END IF
+         END IF
+         IF( COLEQU .AND. INFO.EQ.0 ) THEN
+            RCMIN = BIGNUM
+            RCMAX = ZERO
+            DO 20 J = 1, N
+               RCMIN = MIN( RCMIN, C( J ) )
+               RCMAX = MAX( RCMAX, C( J ) )
+   20       CONTINUE
+            IF( RCMIN.LE.ZERO ) THEN
+               INFO = -12
+            ELSE IF( N.GT.0 ) THEN
+               COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM )
+            ELSE
+               COLCND = ONE
+            END IF
+         END IF
+         IF( INFO.EQ.0 ) THEN
+            IF( LDB.LT.MAX( 1, N ) ) THEN
+               INFO = -14
+            ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
+               INFO = -16
+            END IF
+         END IF
+      END IF
+*     Report an invalid argument through XERBLA and return at once.
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'CGESVX', -INFO )
+         RETURN
+      END IF
+*
+      IF( EQUIL ) THEN
+*
+*        Compute row and column scalings to equilibrate the matrix A.
+*
+         CALL CGEEQU( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX, INFEQU )
+         IF( INFEQU.EQ.0 ) THEN
+*
+*           Equilibrate the matrix.
+*           (EQUED, as returned by CLAQGE, decides ROWEQU and COLEQU.)
+            CALL CLAQGE( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX,
+     $                   EQUED )
+            ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+            COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         END IF
+      END IF
+*     Scale the right hand side to match the equilibrated system:
+*     diag(R)*B for TRANS = 'N', diag(C)*B for TRANS = 'T' or 'C',
+*     each only if that scaling (ROWEQU resp. COLEQU) is in effect.
+      IF( NOTRAN ) THEN
+         IF( ROWEQU ) THEN
+            DO 40 J = 1, NRHS
+               DO 30 I = 1, N
+                  B( I, J ) = R( I )*B( I, J )
+   30          CONTINUE
+   40       CONTINUE
+         END IF
+      ELSE IF( COLEQU ) THEN
+         DO 60 J = 1, NRHS
+            DO 50 I = 1, N
+               B( I, J ) = C( I )*B( I, J )
+   50       CONTINUE
+   60    CONTINUE
+      END IF
+*
+      IF( NOFACT .OR. EQUIL ) THEN
+*
+*        Compute the LU factorization of A.
+*        (The copy AF is factored; A itself is still needed below.)
+         CALL CLACPY( 'Full', N, N, A, LDA, AF, LDAF )
+         CALL CGETRF( N, N, AF, LDAF, IPIV, INFO )
+*
+*        Return if INFO > 0, i.e. U(INFO,INFO) is exactly zero.
+*
+         IF( INFO.GT.0 ) THEN
+*
+*           Compute the reciprocal pivot growth factor of the
+*           leading rank-deficient INFO columns of A.
+*
+            RPVGRW = CLANTR( 'M', 'U', 'N', INFO, INFO, AF, LDAF,
+     $               RWORK )
+            IF( RPVGRW.EQ.ZERO ) THEN
+               RPVGRW = ONE
+            ELSE
+               RPVGRW = CLANGE( 'M', N, INFO, A, LDA, RWORK ) /
+     $                  RPVGRW
+            END IF
+            RWORK( 1 ) = RPVGRW
+            RCOND = ZERO
+            RETURN
+         END IF
+      END IF
+*
+*     Compute the norm of the matrix A and the
+*     reciprocal pivot growth factor RPVGRW.
+*     (norm type '1' for TRANS = 'N', 'I' for TRANS = 'T' or 'C')
+      IF( NOTRAN ) THEN
+         NORM = '1'
+      ELSE
+         NORM = 'I'
+      END IF
+      ANORM = CLANGE( NORM, N, N, A, LDA, RWORK )
+      RPVGRW = CLANTR( 'M', 'U', 'N', N, N, AF, LDAF, RWORK )
+      IF( RPVGRW.EQ.ZERO ) THEN
+         RPVGRW = ONE
+      ELSE
+         RPVGRW = CLANGE( 'M', N, N, A, LDA, RWORK ) / RPVGRW
+      END IF
+*
+*     Compute the reciprocal of the condition number of A.
+*
+      CALL CGECON( NORM, N, AF, LDAF, ANORM, RCOND, WORK, RWORK, INFO )
+*
+*     Compute the solution matrix X.
+*     (B is copied into X, which is then handed to CGETRS with AF.)
+      CALL CLACPY( 'Full', N, NRHS, B, LDB, X, LDX )
+      CALL CGETRS( TRANS, N, NRHS, AF, LDAF, IPIV, X, LDX, INFO )
+*
+*     Use iterative refinement to improve the computed solution and
+*     compute error bounds and backward error estimates for it.
+*
+      CALL CGERFS( TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, B, LDB, X,
+     $             LDX, FERR, BERR, WORK, RWORK, INFO )
+*
+*     Transform the solution matrix X to a solution of the original
+*     system.
+*     The forward error bounds are divided by COLCND resp. ROWCND.
+      IF( NOTRAN ) THEN
+         IF( COLEQU ) THEN
+            DO 80 J = 1, NRHS
+               DO 70 I = 1, N
+                  X( I, J ) = C( I )*X( I, J )
+   70          CONTINUE
+   80       CONTINUE
+            DO 90 J = 1, NRHS
+               FERR( J ) = FERR( J ) / COLCND
+   90       CONTINUE
+         END IF
+      ELSE IF( ROWEQU ) THEN
+         DO 110 J = 1, NRHS
+            DO 100 I = 1, N
+               X( I, J ) = R( I )*X( I, J )
+  100       CONTINUE
+  110    CONTINUE
+         DO 120 J = 1, NRHS
+            FERR( J ) = FERR( J ) / ROWCND
+  120    CONTINUE
+      END IF
+*
+*     Set INFO = N+1 if the matrix is singular to working precision.
+*
+      IF( RCOND.LT.SLAMCH( 'Epsilon' ) )
+     $   INFO = N + 1
+*     Return the reciprocal pivot growth factor in RWORK(1).
+      RWORK( 1 ) = RPVGRW
+      RETURN
+*
+*     End of CGESVX
+*
+      END
diff --git a/lapack-netlib/dgbsvx.f b/lapack-netlib/dgbsvx.f
new file mode 100644
index 000000000..0ee5eecb3
--- /dev/null
+++ b/lapack-netlib/dgbsvx.f
@@ -0,0 +1,639 @@
+*> \brief <b> DGBSVX computes the solution to a system of linear equations A * X = B for GB matrices</b>
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DGBSVX + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dgbsvx.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dgbsvx.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dgbsvx.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB,
+*                          LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX,
+*                          RCOND, FERR, BERR, WORK, IWORK, INFO )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          EQUED, FACT, TRANS
+*       INTEGER            INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS
+*       DOUBLE PRECISION   RCOND
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IPIV( * ), IWORK( * )
+*       DOUBLE PRECISION   AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ),
+*      $                   BERR( * ), C( * ), FERR( * ), R( * ),
+*      $                   WORK( * ), X( LDX, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DGBSVX uses the LU factorization to compute the solution to a real
+*> system of linear equations A * X = B, A**T * X = B, or A**H * X = B,
+*> where A is a band matrix of order N with KL subdiagonals and KU
+*> superdiagonals, and X and B are N-by-NRHS matrices.
+*>
+*> Error bounds on the solution and a condition estimate are also
+*> provided.
+*> \endverbatim
+*
+*> \par Description:
+*  =================
+*>
+*> \verbatim
+*>
+*> The following steps are performed by this subroutine:
+*>
+*> 1. If FACT = 'E', real scaling factors are computed to equilibrate
+*>    the system:
+*>       TRANS = 'N':  diag(R)*A*diag(C)     *inv(diag(C))*X = diag(R)*B
+*>       TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
+*>       TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
+*>    Whether or not the system will be equilibrated depends on the
+*>    scaling of the matrix A, but if equilibration is used, A is
+*>    overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N')
+*>    or diag(C)*B (if TRANS = 'T' or 'C').
+*>
+*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the
+*>    matrix A (after equilibration if FACT = 'E') as
+*>       A = L * U,
+*>    where L is a product of permutation and unit lower triangular
+*>    matrices with KL subdiagonals, and U is upper triangular with
+*>    KL+KU superdiagonals.
+*>
+*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine
+*>    returns with INFO = i. Otherwise, the factored form of A is used
+*>    to estimate the condition number of the matrix A.  If the
+*>    reciprocal of the condition number is less than machine precision,
+*>    INFO = N+1 is returned as a warning, but the routine still goes on
+*>    to solve for X and compute error bounds as described below.
+*>
+*> 4. The system of equations is solved for X using the factored form
+*>    of A.
+*>
+*> 5. Iterative refinement is applied to improve the computed solution
+*>    matrix and calculate error bounds and backward error estimates
+*>    for it.
+*>
+*> 6. If equilibration was used, the matrix X is premultiplied by
+*>    diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so
+*>    that it solves the original system before equilibration.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] FACT
+*> \verbatim
+*>          FACT is CHARACTER*1
+*>          Specifies whether or not the factored form of the matrix A is
+*>          supplied on entry, and if not, whether the matrix A should be
+*>          equilibrated before it is factored.
+*>          = 'F':  On entry, AFB and IPIV contain the factored form of
+*>                  A.  If EQUED is not 'N', the matrix A has been
+*>                  equilibrated with scaling factors given by R and C.
+*>                  AB, AFB, and IPIV are not modified.
+*>          = 'N':  The matrix A will be copied to AFB and factored.
+*>          = 'E':  The matrix A will be equilibrated if necessary, then
+*>                  copied to AFB and factored.
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*>          TRANS is CHARACTER*1
+*>          Specifies the form of the system of equations.
+*>          = 'N':  A * X = B     (No transpose)
+*>          = 'T':  A**T * X = B  (Transpose)
+*>          = 'C':  A**H * X = B  (Conjugate transpose = Transpose)
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of linear equations, i.e., the order of the
+*>          matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KL
+*> \verbatim
+*>          KL is INTEGER
+*>          The number of subdiagonals within the band of A.  KL >= 0.
+*> \endverbatim
+*>
+*> \param[in] KU
+*> \verbatim
+*>          KU is INTEGER
+*>          The number of superdiagonals within the band of A.  KU >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of columns
+*>          of the matrices B and X.  NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*>          AB is DOUBLE PRECISION array, dimension (LDAB,N)
+*>          On entry, the matrix A in band storage, in rows 1 to KL+KU+1.
+*>          The j-th column of A is stored in the j-th column of the
+*>          array AB as follows:
+*>          AB(KU+1+i-j,j) = A(i,j) for max(1,j-KU)<=i<=min(N,j+kl)
+*>
+*>          If FACT = 'F' and EQUED is not 'N', then A must have been
+*>          equilibrated by the scaling factors in R and/or C.  AB is not
+*>          modified if FACT = 'F' or 'N', or if FACT = 'E' and
+*>          EQUED = 'N' on exit.
+*>
+*>          On exit, if EQUED .ne. 'N', A is scaled as follows:
+*>          EQUED = 'R':  A := diag(R) * A
+*>          EQUED = 'C':  A := A * diag(C)
+*>          EQUED = 'B':  A := diag(R) * A * diag(C).
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*>          LDAB is INTEGER
+*>          The leading dimension of the array AB.  LDAB >= KL+KU+1.
+*> \endverbatim
+*>
+*> \param[in,out] AFB
+*> \verbatim
+*>          AFB is DOUBLE PRECISION array, dimension (LDAFB,N)
+*>          If FACT = 'F', then AFB is an input argument and on entry
+*>          contains details of the LU factorization of the band matrix
+*>          A, as computed by DGBTRF.  U is stored as an upper triangular
+*>          band matrix with KL+KU superdiagonals in rows 1 to KL+KU+1,
+*>          and the multipliers used during the factorization are stored
+*>          in rows KL+KU+2 to 2*KL+KU+1.  If EQUED .ne. 'N', then AFB is
+*>          the factored form of the equilibrated matrix A.
+*>
+*>          If FACT = 'N', then AFB is an output argument and on exit
+*>          returns details of the LU factorization of A.
+*>
+*>          If FACT = 'E', then AFB is an output argument and on exit
+*>          returns details of the LU factorization of the equilibrated
+*>          matrix A (see the description of AB for the form of the
+*>          equilibrated matrix).
+*> \endverbatim
+*>
+*> \param[in] LDAFB
+*> \verbatim
+*>          LDAFB is INTEGER
+*>          The leading dimension of the array AFB.  LDAFB >= 2*KL+KU+1.
+*> \endverbatim
+*>
+*> \param[in,out] IPIV
+*> \verbatim
+*>          IPIV is INTEGER array, dimension (N)
+*>          If FACT = 'F', then IPIV is an input argument and on entry
+*>          contains the pivot indices from the factorization A = L*U
+*>          as computed by DGBTRF; row i of the matrix was interchanged
+*>          with row IPIV(i).
+*>
+*>          If FACT = 'N', then IPIV is an output argument and on exit
+*>          contains the pivot indices from the factorization A = L*U
+*>          of the original matrix A.
+*>
+*>          If FACT = 'E', then IPIV is an output argument and on exit
+*>          contains the pivot indices from the factorization A = L*U
+*>          of the equilibrated matrix A.
+*> \endverbatim
+*>
+*> \param[in,out] EQUED
+*> \verbatim
+*>          EQUED is CHARACTER*1
+*>          Specifies the form of equilibration that was done.
+*>          = 'N':  No equilibration (always true if FACT = 'N').
+*>          = 'R':  Row equilibration, i.e., A has been premultiplied by
+*>                  diag(R).
+*>          = 'C':  Column equilibration, i.e., A has been postmultiplied
+*>                  by diag(C).
+*>          = 'B':  Both row and column equilibration, i.e., A has been
+*>                  replaced by diag(R) * A * diag(C).
+*>          EQUED is an input argument if FACT = 'F'; otherwise, it is an
+*>          output argument.
+*> \endverbatim
+*>
+*> \param[in,out] R
+*> \verbatim
+*>          R is DOUBLE PRECISION array, dimension (N)
+*>          The row scale factors for A.  If EQUED = 'R' or 'B', A is
+*>          multiplied on the left by diag(R); if EQUED = 'N' or 'C', R
+*>          is not accessed.  R is an input argument if FACT = 'F';
+*>          otherwise, R is an output argument.  If FACT = 'F' and
+*>          EQUED = 'R' or 'B', each element of R must be positive.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is DOUBLE PRECISION array, dimension (N)
+*>          The column scale factors for A.  If EQUED = 'C' or 'B', A is
+*>          multiplied on the right by diag(C); if EQUED = 'N' or 'R', C
+*>          is not accessed.  C is an input argument if FACT = 'F';
+*>          otherwise, C is an output argument.  If FACT = 'F' and
+*>          EQUED = 'C' or 'B', each element of C must be positive.
+*> \endverbatim
+*>
+*> \param[in,out] B
+*> \verbatim
+*>          B is DOUBLE PRECISION array, dimension (LDB,NRHS)
+*>          On entry, the right hand side matrix B.
+*>          On exit,
+*>          if EQUED = 'N', B is not modified;
+*>          if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by
+*>          diag(R)*B;
+*>          if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is
+*>          overwritten by diag(C)*B.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*>          LDB is INTEGER
+*>          The leading dimension of the array B.  LDB >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] X
+*> \verbatim
+*>          X is DOUBLE PRECISION array, dimension (LDX,NRHS)
+*>          If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X
+*>          to the original system of equations.  Note that A and B are
+*>          modified on exit if EQUED .ne. 'N', and the solution to the
+*>          equilibrated system is inv(diag(C))*X if TRANS = 'N' and
+*>          EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C'
+*>          and EQUED = 'R' or 'B'.
+*> \endverbatim
+*>
+*> \param[in] LDX
+*> \verbatim
+*>          LDX is INTEGER
+*>          The leading dimension of the array X.  LDX >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] RCOND
+*> \verbatim
+*>          RCOND is DOUBLE PRECISION
+*>          The estimate of the reciprocal condition number of the matrix
+*>          A after equilibration (if done).  If RCOND is less than the
+*>          machine precision (in particular, if RCOND = 0), the matrix
+*>          is singular to working precision.  This condition is
+*>          indicated by a return code of INFO > 0.
+*> \endverbatim
+*>
+*> \param[out] FERR
+*> \verbatim
+*>          FERR is DOUBLE PRECISION array, dimension (NRHS)
+*>          The estimated forward error bound for each solution vector
+*>          X(j) (the j-th column of the solution matrix X).
+*>          If XTRUE is the true solution corresponding to X(j), FERR(j)
+*>          is an estimated upper bound for the magnitude of the largest
+*>          element in (X(j) - XTRUE) divided by the magnitude of the
+*>          largest element in X(j).  The estimate is as reliable as
+*>          the estimate for RCOND, and is almost always a slight
+*>          overestimate of the true error.
+*> \endverbatim
+*>
+*> \param[out] BERR
+*> \verbatim
+*>          BERR is DOUBLE PRECISION array, dimension (NRHS)
+*>          The componentwise relative backward error of each solution
+*>          vector X(j) (i.e., the smallest relative change in
+*>          any element of A or B that makes X(j) an exact solution).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(1,3*N))
+*>          On exit, WORK(1) contains the reciprocal pivot growth
+*>          factor norm(A)/norm(U). The "max absolute element" norm is
+*>          used. If WORK(1) is much less than 1, then the stability
+*>          of the LU factorization of the (equilibrated) matrix A
+*>          could be poor. This also means that the solution X, condition
+*>          estimator RCOND, and forward error bound FERR could be
+*>          unreliable. If factorization fails with 0<INFO<=N, then
+*>          WORK(1) contains the reciprocal pivot growth factor for the
+*>          leading INFO columns of A.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (N)
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i, and i is
+*>                <= N:  U(i,i) is exactly zero.  The factorization
+*>                       has been completed, but the factor U is exactly
+*>                       singular, so the solution and error bounds
+*>                       could not be computed. RCOND = 0 is returned.
+*>                = N+1: U is nonsingular, but RCOND is less than machine
+*>                       precision, meaning that the matrix is singular
+*>                       to working precision.  Nevertheless, the
+*>                       solution and error bounds are computed because
+*>                       there are a number of situations where the
+*>                       computed solution can be more accurate than the
+*>                       value of RCOND would suggest.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup doubleGBsolve
+*
+*  =====================================================================
+      SUBROUTINE DGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB,
+     $                   LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX,
+     $                   RCOND, FERR, BERR, WORK, IWORK, INFO )
+*  -- LAPACK driver routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*  Solves a band system using an LU factorization of A, optionally
+*  equilibrating A first; also returns RCOND, FERR and BERR.
+*     .. Scalar Arguments ..
+      CHARACTER          EQUED, FACT, TRANS
+      INTEGER            INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS
+      DOUBLE PRECISION   RCOND
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IPIV( * ), IWORK( * )
+      DOUBLE PRECISION   AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ),
+     $                   BERR( * ), C( * ), FERR( * ), R( * ),
+     $                   WORK( * ), X( LDX, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU
+      CHARACTER          NORM
+      INTEGER            I, INFEQU, J, J1, J2
+      DOUBLE PRECISION   AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN,
+     $                   ROWCND, RPVGRW, SMLNUM
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      DOUBLE PRECISION   DLAMCH, DLANGB, DLANTB
+      EXTERNAL           LSAME, DLAMCH, DLANGB, DLANTB
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DCOPY, DGBCON, DGBEQU, DGBRFS, DGBTRF, DGBTRS,
+     $                   DLACPY, DLAQGB, XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*     Decode FACT/TRANS; EQUED is only read as input when FACT = 'F'.
+      INFO = 0
+      NOFACT = LSAME( FACT, 'N' )
+      EQUIL = LSAME( FACT, 'E' )
+      NOTRAN = LSAME( TRANS, 'N' )
+      IF( NOFACT .OR. EQUIL ) THEN
+         EQUED = 'N'
+         ROWEQU = .FALSE.
+         COLEQU = .FALSE.
+      ELSE
+         ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+         COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         SMLNUM = DLAMCH( 'Safe minimum' )
+         BIGNUM = ONE / SMLNUM
+      END IF
+*     Test the input parameters.  INFO = -i flags the i-th argument;
+*     R and C are range-checked only when they are inputs (FACT = 'F'),
+*     since ROWEQU/COLEQU are forced to .FALSE. otherwise.
+      IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) )
+     $     THEN
+         INFO = -1
+      ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT.
+     $         LSAME( TRANS, 'C' ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( KL.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( KU.LT.0 ) THEN
+         INFO = -5
+      ELSE IF( NRHS.LT.0 ) THEN
+         INFO = -6
+      ELSE IF( LDAB.LT.KL+KU+1 ) THEN
+         INFO = -8
+      ELSE IF( LDAFB.LT.2*KL+KU+1 ) THEN
+         INFO = -10
+      ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT.
+     $         ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN
+         INFO = -12
+      ELSE
+         IF( ROWEQU ) THEN
+            RCMIN = BIGNUM
+            RCMAX = ZERO
+            DO 10 J = 1, N
+               RCMIN = MIN( RCMIN, R( J ) )
+               RCMAX = MAX( RCMAX, R( J ) )
+   10       CONTINUE
+            IF( RCMIN.LE.ZERO ) THEN
+               INFO = -13
+            ELSE IF( N.GT.0 ) THEN
+               ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM )
+            ELSE
+               ROWCND = ONE
+            END IF
+         END IF
+         IF( COLEQU .AND. INFO.EQ.0 ) THEN
+            RCMIN = BIGNUM
+            RCMAX = ZERO
+            DO 20 J = 1, N
+               RCMIN = MIN( RCMIN, C( J ) )
+               RCMAX = MAX( RCMAX, C( J ) )
+   20       CONTINUE
+            IF( RCMIN.LE.ZERO ) THEN
+               INFO = -14
+            ELSE IF( N.GT.0 ) THEN
+               COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM )
+            ELSE
+               COLCND = ONE
+            END IF
+         END IF
+         IF( INFO.EQ.0 ) THEN
+            IF( LDB.LT.MAX( 1, N ) ) THEN
+               INFO = -16
+            ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
+               INFO = -18
+            END IF
+         END IF
+      END IF
+*     On any invalid argument, report it through XERBLA and return.
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'DGBSVX', -INFO )
+         RETURN
+      END IF
+*
+      IF( EQUIL ) THEN
+*
+*        Compute row and column scalings to equilibrate the matrix A.
+*
+         CALL DGBEQU( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND,
+     $                AMAX, INFEQU )
+         IF( INFEQU.EQ.0 ) THEN
+*
+*           Equilibrate the matrix.
+*           (ROWEQU/COLEQU are refreshed from EQUED after the call.)
+            CALL DLAQGB( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND,
+     $                   AMAX, EQUED )
+            ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+            COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         END IF
+      END IF
+*     Scale the right hand side.
+*     B is multiplied by diag(R) when TRANS = 'N' and by diag(C)
+*     otherwise, and only when ROWEQU (resp. COLEQU) is set.
+      IF( NOTRAN ) THEN
+         IF( ROWEQU ) THEN
+            DO 40 J = 1, NRHS
+               DO 30 I = 1, N
+                  B( I, J ) = R( I )*B( I, J )
+   30          CONTINUE
+   40       CONTINUE
+         END IF
+      ELSE IF( COLEQU ) THEN
+         DO 60 J = 1, NRHS
+            DO 50 I = 1, N
+               B( I, J ) = C( I )*B( I, J )
+   50       CONTINUE
+   60    CONTINUE
+      END IF
+*
+      IF( NOFACT .OR. EQUIL ) THEN
+*
+*        Compute the LU factorization of the band matrix A.
+*        Column J of AB is first copied into AFB, shifted down KL rows.
+         DO 70 J = 1, N
+            J1 = MAX( J-KU, 1 )
+            J2 = MIN( J+KL, N )
+            CALL DCOPY( J2-J1+1, AB( KU+1-J+J1, J ), 1,
+     $                  AFB( KL+KU+1-J+J1, J ), 1 )
+   70    CONTINUE
+*
+         CALL DGBTRF( N, N, KL, KU, AFB, LDAFB, IPIV, INFO )
+*
+*        Return early if DGBTRF reported INFO > 0.
+*
+         IF( INFO.GT.0 ) THEN
+*
+*           Compute the reciprocal pivot growth factor of the
+*           leading rank-deficient INFO columns of A.
+*
+            ANORM = ZERO
+            DO 90 J = 1, INFO
+               DO 80 I = MAX( KU+2-J, 1 ), MIN( N+KU+1-J, KL+KU+1 )
+                  ANORM = MAX( ANORM, ABS( AB( I, J ) ) )
+   80          CONTINUE
+   90       CONTINUE
+            RPVGRW = DLANTB( 'M', 'U', 'N', INFO, MIN( INFO-1, KL+KU ),
+     $                       AFB( MAX( 1, KL+KU+2-INFO ), 1 ), LDAFB,
+     $                       WORK )
+            IF( RPVGRW.EQ.ZERO ) THEN
+               RPVGRW = ONE
+            ELSE
+               RPVGRW = ANORM / RPVGRW
+            END IF
+            WORK( 1 ) = RPVGRW
+            RCOND = ZERO
+            RETURN
+         END IF
+      END IF
+*
+*     Compute the norm of the matrix A and the
+*     reciprocal pivot growth factor RPVGRW.
+*     The same NORM selector ('1' or 'I') is passed to DGBCON below.
+      IF( NOTRAN ) THEN
+         NORM = '1'
+      ELSE
+         NORM = 'I'
+      END IF
+      ANORM = DLANGB( NORM, N, KL, KU, AB, LDAB, WORK )
+      RPVGRW = DLANTB( 'M', 'U', 'N', N, KL+KU, AFB, LDAFB, WORK )
+      IF( RPVGRW.EQ.ZERO ) THEN
+         RPVGRW = ONE
+      ELSE
+         RPVGRW = DLANGB( 'M', N, KL, KU, AB, LDAB, WORK ) / RPVGRW
+      END IF
+*
+*     Compute the reciprocal of the condition number of A.
+*
+      CALL DGBCON( NORM, N, KL, KU, AFB, LDAFB, IPIV, ANORM, RCOND,
+     $             WORK, IWORK, INFO )
+*
+*     Compute the solution matrix X.
+*
+      CALL DLACPY( 'Full', N, NRHS, B, LDB, X, LDX )
+      CALL DGBTRS( TRANS, N, KL, KU, NRHS, AFB, LDAFB, IPIV, X, LDX,
+     $             INFO )
+*
+*     Use iterative refinement to improve the computed solution and
+*     compute error bounds and backward error estimates for it.
+*
+      CALL DGBRFS( TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, LDAFB, IPIV,
+     $             B, LDB, X, LDX, FERR, BERR, WORK, IWORK, INFO )
+*
+*     Transform the solution matrix X to a solution of the original
+*     system.
+*     FERR is rescaled by 1/COLCND (or 1/ROWCND) along with X.
+      IF( NOTRAN ) THEN
+         IF( COLEQU ) THEN
+            DO 110 J = 1, NRHS
+               DO 100 I = 1, N
+                  X( I, J ) = C( I )*X( I, J )
+  100          CONTINUE
+  110       CONTINUE
+            DO 120 J = 1, NRHS
+               FERR( J ) = FERR( J ) / COLCND
+  120       CONTINUE
+         END IF
+      ELSE IF( ROWEQU ) THEN
+         DO 140 J = 1, NRHS
+            DO 130 I = 1, N
+               X( I, J ) = R( I )*X( I, J )
+  130       CONTINUE
+  140    CONTINUE
+         DO 150 J = 1, NRHS
+            FERR( J ) = FERR( J ) / ROWCND
+  150    CONTINUE
+      END IF
+*
+*     Set INFO = N+1 if the matrix is singular to working precision.
+*
+      IF( RCOND.LT.DLAMCH( 'Epsilon' ) )
+     $   INFO = N + 1
+*     WORK( 1 ) returns the reciprocal pivot growth factor.
+      WORK( 1 ) = RPVGRW
+      RETURN
+*
+*     End of DGBSVX
+*
+      END
diff --git a/lapack-netlib/dgejsv.f b/lapack-netlib/dgejsv.f
new file mode 100644
index 000000000..ee769bb38
--- /dev/null
+++ b/lapack-netlib/dgejsv.f
@@ -0,0 +1,1780 @@
+*> \brief \b DGEJSV
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DGEJSV + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dgejsv.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dgejsv.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dgejsv.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP,
+*                          M, N, A, LDA, SVA, U, LDU, V, LDV,
+*                          WORK, LWORK, IWORK, INFO )
+*
+*       .. Scalar Arguments ..
+*       IMPLICIT    NONE
+*       INTEGER     INFO, LDA, LDU, LDV, LWORK, M, N
+*       ..
+*       .. Array Arguments ..
+*       DOUBLE PRECISION A( LDA, * ), SVA( N ), U( LDU, * ), V( LDV, * ),
+*      $            WORK( LWORK )
+*       INTEGER     IWORK( * )
+*       CHARACTER*1 JOBA, JOBP, JOBR, JOBT, JOBU, JOBV
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DGEJSV computes the singular value decomposition (SVD) of a real M-by-N
+*> matrix [A], where M >= N. The SVD of [A] is written as
+*>
+*>              [A] = [U] * [SIGMA] * [V]^t,
+*>
+*> where [SIGMA] is an N-by-N (M-by-N) matrix which is zero except for its N
+*> diagonal elements, [U] is an M-by-N (or M-by-M) orthonormal matrix, and
+*> [V] is an N-by-N orthogonal matrix. The diagonal elements of [SIGMA] are
+*> the singular values of [A]. The columns of [U] and [V] are the left and
+*> the right singular vectors of [A], respectively. The matrices [U] and [V]
+*> are computed and stored in the arrays U and V, respectively. The diagonal
+*> of [SIGMA] is computed and stored in the array SVA.
+*> DGEJSV can sometimes compute tiny singular values and their singular vectors much
+*> more accurately than other SVD routines, see below under Further Details.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBA
+*> \verbatim
+*>          JOBA is CHARACTER*1
+*>        Specifies the level of accuracy:
+*>       = 'C': This option works well (high relative accuracy) if A = B * D,
+*>             with well-conditioned B and arbitrary diagonal matrix D.
+*>             The accuracy cannot be spoiled by COLUMN scaling. The
+*>             accuracy of the computed output depends on the condition of
+*>             B, and the procedure aims at the best theoretical accuracy.
+*>             The relative error max_{i=1:N}|d sigma_i| / sigma_i is
+*>             bounded by f(M,N)*epsilon* cond(B), independent of D.
+*>             The input matrix is preprocessed with the QRF with column
+*>             pivoting. This initial preprocessing and preconditioning by
+*>             a rank revealing QR factorization is common for all values of
+*>             JOBA. Additional actions are specified as follows:
+*>       = 'E': Computation as with 'C' with an additional estimate of the
+*>             condition number of B. It provides a realistic error bound.
+*>       = 'F': If A = D1 * C * D2 with ill-conditioned diagonal scalings
+*>             D1, D2, and well-conditioned matrix C, this option gives
+*>             higher accuracy than the 'C' option. If the structure of the
+*>             input matrix is not known, and relative accuracy is
+*>             desirable, then this option is advisable. The input matrix A
+*>             is preprocessed with QR factorization with FULL (row and
+*>             column) pivoting.
+*>       = 'G': Computation as with 'F' with an additional estimate of the
+*>             condition number of B, where A=D*B. If A has heavily weighted
+*>             rows, then using this condition number gives too pessimistic
+*>             error bound.
+*>       = 'A': Small singular values are the noise and the matrix is treated
+*>             as numerically rank deficient. The error in the computed
+*>             singular values is bounded by f(m,n)*epsilon*||A||.
+*>             The computed SVD A = U * S * V^t restores A up to
+*>             f(m,n)*epsilon*||A||.
+*>             This gives the procedure the licence to discard (set to zero)
+*>             all singular values below N*epsilon*||A||.
+*>       = 'R': Similar as in 'A'. Rank revealing property of the initial
+*>             QR factorization is used to reveal (using triangular factor)
+*>             a gap sigma_{r+1} < epsilon * sigma_r in which case the
+*>             numerical RANK is declared to be r. The SVD is computed with
+*>             absolute error bounds, but more accurately than with 'A'.
+*> \endverbatim
+*>
+*> \param[in] JOBU
+*> \verbatim
+*>          JOBU is CHARACTER*1
+*>        Specifies whether to compute the columns of U:
+*>       = 'U': N columns of U are returned in the array U.
+*>       = 'F': full set of M left sing. vectors is returned in the array U.
+*>       = 'W': U may be used as workspace of length M*N. See the description
+*>             of U.
+*>       = 'N': U is not computed.
+*> \endverbatim
+*>
+*> \param[in] JOBV
+*> \verbatim
+*>          JOBV is CHARACTER*1
+*>        Specifies whether to compute the matrix V:
+*>       = 'V': N columns of V are returned in the array V; Jacobi rotations
+*>             are not explicitly accumulated.
+*>       = 'J': N columns of V are returned in the array V, but they are
+*>             computed as the product of Jacobi rotations. This option is
+*>             allowed only if JOBU .NE. 'N', i.e. in computing the full SVD.
+*>       = 'W': V may be used as workspace of length N*N. See the description
+*>             of V.
+*>       = 'N': V is not computed.
+*> \endverbatim
+*>
+*> \param[in] JOBR
+*> \verbatim
+*>          JOBR is CHARACTER*1
+*>        Specifies the RANGE for the singular values. Issues the licence to
+*>        set to zero small positive singular values if they are outside
+*>        specified range. If A .NE. 0 is scaled so that the largest singular
+*>        value of c*A is around DSQRT(BIG), BIG=DLAMCH('O'), then JOBR issues
+*>        the licence to kill columns of A whose norm in c*A is less than
+*>        DSQRT(SFMIN) (for JOBR = 'R'), or less than SMALL=SFMIN/EPSLN,
+*>        where SFMIN=DLAMCH('S'), EPSLN=DLAMCH('E').
+*>       = 'N': Do not kill small columns of c*A. This option assumes that
+*>             BLAS and QR factorizations and triangular solvers are
+*>             implemented to work in that range. If the condition of A
+*>             is greater than BIG, use DGESVJ.
+*>       = 'R': RESTRICTED range for sigma(c*A) is [DSQRT(SFMIN), DSQRT(BIG)]
+*>             (roughly, as described above). This option is recommended.
+*>                                            ~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*>        For computing the singular values in the FULL range [SFMIN,BIG]
+*>        use DGESVJ.
+*> \endverbatim
+*>
+*> \param[in] JOBT
+*> \verbatim
+*>          JOBT is CHARACTER*1
+*>        If the matrix is square then the procedure may determine to use
+*>        transposed A if A^t seems to be better with respect to convergence.
+*>        If the matrix is not square, JOBT is ignored. This is subject to
+*>        changes in the future.
+*>        The decision is based on two values of entropy over the adjoint
+*>        orbit of A^t * A. See the descriptions of WORK(6) and WORK(7).
+*>       = 'T': transpose if entropy test indicates possibly faster
+*>        convergence of Jacobi process if A^t is taken as input. If A is
+*>        replaced with A^t, then the row pivoting is included automatically.
+*>       = 'N': do not speculate.
+*>        This option can be used to compute only the singular values, or the
+*>        full SVD (U, SIGMA and V). For only one set of singular vectors
+*>        (U or V), the caller should provide both U and V, as one of the
+*>        matrices is used as workspace if the matrix A is transposed.
+*>        The implementer can easily remove this constraint and make the
+*>        code more complicated. See the descriptions of U and V.
+*> \endverbatim
+*>
+*> \param[in] JOBP
+*> \verbatim
+*>          JOBP is CHARACTER*1
+*>        Issues the licence to introduce structured perturbations to drown
+*>        denormalized numbers. This licence should be active if the
+*>        denormals are poorly implemented, causing slow computation,
+*>        especially in cases of fast convergence (!). For details see [1,2].
+*>        For the sake of simplicity, these perturbations are included only
+*>        when the full SVD or only the singular values are requested. The
+*>        implementer/user can easily add the perturbation for the cases of
+*>        computing one set of singular vectors.
+*>       = 'P': introduce perturbation
+*>       = 'N': do not perturb
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>         The number of rows of the input matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>         The number of columns of the input matrix A. M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is DOUBLE PRECISION array, dimension (LDA,N)
+*>          On entry, the M-by-N matrix A.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] SVA
+*> \verbatim
+*>          SVA is DOUBLE PRECISION array, dimension (N)
+*>          On exit,
+*>          - For WORK(1)/WORK(2) = ONE: The singular values of A. During the
+*>            computation SVA contains Euclidean column norms of the
+*>            iterated matrices in the array A.
+*>          - For WORK(1) .NE. WORK(2): The singular values of A are
+*>            (WORK(1)/WORK(2)) * SVA(1:N). This factored form is used if
+*>            sigma_max(A) overflows or if small singular values have been
+*>            saved from underflow by scaling the input matrix A.
+*>          - If JOBR='R' then some of the singular values may be returned
+*>            as exact zeros obtained by "set to zero" because they are
+*>            below the numerical rank threshold or are denormalized numbers.
+*> \endverbatim
+*>
+*> \param[out] U
+*> \verbatim
+*>          U is DOUBLE PRECISION array, dimension ( LDU, N ) or ( LDU, M )
+*>          If JOBU = 'U', then U contains on exit the M-by-N matrix of
+*>                         the left singular vectors.
+*>          If JOBU = 'F', then U contains on exit the M-by-M matrix of
+*>                         the left singular vectors, including an ONB
+*>                         of the orthogonal complement of the Range(A).
+*>          If JOBU = 'W'  .AND. (JOBV = 'V' .AND. JOBT = 'T' .AND. M = N),
+*>                         then U is used as workspace if the procedure
+*>                         replaces A with A^t. In that case, [V] is computed
+*>                         in U as left singular vectors of A^t and then
+*>                         copied back to the V array. This 'W' option is just
+*>                         a reminder to the caller that in this case U is
+*>                         reserved as workspace of length N*N.
+*>          If JOBU = 'N'  U is not referenced, unless JOBT='T'.
+*> \endverbatim
+*>
+*> \param[in] LDU
+*> \verbatim
+*>          LDU is INTEGER
+*>          The leading dimension of the array U,  LDU >= 1.
+*>          IF  JOBU = 'U' or 'F' or 'W',  then LDU >= M.
+*> \endverbatim
+*>
+*> \param[out] V
+*> \verbatim
+*>          V is DOUBLE PRECISION array, dimension ( LDV, N )
+*>          If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of
+*>                         the right singular vectors;
+*>          If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N),
+*>                         then V is used as workspace if the procedure
+*>                         replaces A with A^t. In that case, [U] is computed
+*>                         in V as right singular vectors of A^t and then
+*>                         copied back to the U array. This 'W' option is just
+*>                         a reminder to the caller that in this case V is
+*>                         reserved as workspace of length N*N.
+*>          If JOBV = 'N'  V is not referenced, unless JOBT='T'.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*>          LDV is INTEGER
+*>          The leading dimension of the array V,  LDV >= 1.
+*>          If JOBV = 'V' or 'J' or 'W', then LDV >= N.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(7,LWORK))
+*>          On exit, if N > 0 .AND. M > 0 (else not referenced),
+*>          WORK(1) = SCALE = WORK(2) / WORK(1) is the scaling factor such
+*>                    that SCALE*SVA(1:N) are the computed singular values
+*>                    of A. (See the description of SVA().)
+*>          WORK(2) = See the description of WORK(1).
+*>          WORK(3) = SCONDA is an estimate for the condition number of
+*>                    column equilibrated A. (If JOBA = 'E' or 'G')
+*>                    SCONDA is an estimate of DSQRT(||(R^t * R)^(-1)||_1).
+*>                    It is computed using DPOCON. It holds
+*>                    N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA
+*>                    where R is the triangular factor from the QRF of A.
+*>                    However, if R is truncated and the numerical rank is
+*>                    determined to be strictly smaller than N, SCONDA is
+*>                    returned as -1, thus indicating that the smallest
+*>                    singular values might be lost.
+*>
+*>          If full SVD is needed, the following two condition numbers are
+*>          useful for the analysis of the algorithm. They are provided for
+*>          a developer/implementer who is familiar with the details of
+*>          the method.
+*>
+*>          WORK(4) = an estimate of the scaled condition number of the
+*>                    triangular factor in the first QR factorization.
+*>          WORK(5) = an estimate of the scaled condition number of the
+*>                    triangular factor in the second QR factorization.
+*>          The following two parameters are computed if JOBT = 'T'.
+*>          They are provided for a developer/implementer who is familiar
+*>          with the details of the method.
+*>
+*>          WORK(6) = the entropy of A^t*A :: this is the Shannon entropy
+*>                    of diag(A^t*A) / Trace(A^t*A) taken as point in the
+*>                    probability simplex.
+*>          WORK(7) = the entropy of A*A^t.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          Length of WORK to confirm proper allocation of work space.
+*>          LWORK depends on the job:
+*>
+*>          If only SIGMA is needed (JOBU = 'N', JOBV = 'N') and
+*>            -> .. no scaled condition estimate required (JOBA is neither 'E' nor 'G'):
+*>               LWORK >= max(2*M+N,4*N+1,7). This is the minimal requirement.
+*>               ->> For optimal performance (blocked code) the optimal value
+*>               is LWORK >= max(2*M+N,3*N+(N+1)*NB,7). Here NB is the optimal
+*>               block size for DGEQP3 and DGEQRF.
+*>               In general, optimal LWORK is computed as
+*>               LWORK >= max(2*M+N,N+LWORK(DGEQP3),N+LWORK(DGEQRF), 7).
+*>            -> .. an estimate of the scaled condition number of A is
+*>               required (JOBA='E', 'G'). In this case, LWORK is the maximum
+*>               of the above and N*N+4*N, i.e. LWORK >= max(2*M+N,N*N+4*N,7).
+*>               ->> For optimal performance (blocked code) the optimal value
+*>               is LWORK >= max(2*M+N,3*N+(N+1)*NB, N*N+4*N, 7).
+*>               In general, the optimal length LWORK is computed as
+*>               LWORK >= max(2*M+N,N+LWORK(DGEQP3),N+LWORK(DGEQRF),
+*>                                                     N+N*N+LWORK(DPOCON),7).
+*>
+*>          If SIGMA and the right singular vectors are needed (JOBV = 'V'),
+*>            -> the minimal requirement is LWORK >= max(2*M+N,4*N+1,7).
+*>            -> For optimal performance, LWORK >= max(2*M+N,3*N+(N+1)*NB,7),
+*>               where NB is the optimal block size for DGEQP3, DGEQRF, DGELQF,
+*>               DORMLQ. In general, the optimal length LWORK is computed as
+*>               LWORK >= max(2*M+N,N+LWORK(DGEQP3), N+LWORK(DPOCON),
+*>                       N+LWORK(DGELQF), 2*N+LWORK(DGEQRF), N+LWORK(DORMLQ)).
+*>
+*>          If SIGMA and the left singular vectors are needed
+*>            -> the minimal requirement is LWORK >= max(2*M+N,4*N+1,7).
+*>            -> For optimal performance:
+*>               if JOBU = 'U' :: LWORK >= max(2*M+N,3*N+(N+1)*NB,7),
+*>               if JOBU = 'F' :: LWORK >= max(2*M+N,3*N+(N+1)*NB,N+M*NB,7),
+*>               where NB is the optimal block size for DGEQP3, DGEQRF, DORMQR.
+*>               In general, the optimal length LWORK is computed as
+*>               LWORK >= max(2*M+N,N+LWORK(DGEQP3),N+LWORK(DPOCON),
+*>                        2*N+LWORK(DGEQRF), N+LWORK(DORMQR)).
+*>               Here LWORK(DORMQR) equals N*NB (for JOBU = 'U') or
+*>               M*NB (for JOBU = 'F').
+*>
+*>          If the full SVD is needed: (JOBU = 'U' or JOBU = 'F') and
+*>            -> if JOBV = 'V'
+*>               the minimal requirement is LWORK >= max(2*M+N,6*N+2*N*N).
+*>            -> if JOBV = 'J' the minimal requirement is
+*>               LWORK >= max(2*M+N, 4*N+N*N,2*N+N*N+6).
+*>            -> For optimal performance, LWORK should be additionally
+*>               larger than N+M*NB, where NB is the optimal block size
+*>               for DORMQR.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (MAX(3,M+3*N)).
+*>          On exit,
+*>          IWORK(1) = the numerical rank determined after the initial
+*>                     QR factorization with pivoting. See the descriptions
+*>                     of JOBA and JOBR.
+*>          IWORK(2) = the number of the computed nonzero singular values
+*>          IWORK(3) = if nonzero, a warning message:
+*>                     If IWORK(3) = 1 then some of the column norms of A
+*>                     were denormalized floats. The requested high accuracy
+*>                     is not warranted by the data.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>           < 0:  if INFO = -i, then the i-th argument had an illegal value.
+*>           = 0:  successful exit;
+*>           > 0:  DGEJSV  did not converge in the maximal allowed number
+*>                 of sweeps. The computed values may be inaccurate.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup doubleGEsing
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  DGEJSV implements a preconditioned Jacobi SVD algorithm. It uses DGEQP3,
+*>  DGEQRF, and DGELQF as preprocessors and preconditioners. Optionally, an
+*>  additional row pivoting can be used as a preprocessor, which in some
+*>  cases results in much higher accuracy. An example is matrix A with the
+*>  structure A = D1 * C * D2, where D1, D2 are arbitrarily ill-conditioned
+*>  diagonal matrices and C is a well-conditioned matrix. In that case, complete
+*>  pivoting in the first QR factorizations provides accuracy dependent on the
+*>  condition number of C, and independent of D1, D2. Such higher accuracy is
+*>  not completely understood theoretically, but it works well in practice.
+*>  Further, if A can be written as A = B*D, with well-conditioned B and some
+*>  diagonal D, then the high accuracy is guaranteed, both theoretically and
+*>  in software, independent of D. For more details see [1], [2].
+*>     The computational range for the singular values can be the full range
+*>  ( UNDERFLOW,OVERFLOW ), provided that the machine arithmetic and the BLAS
+*>  & LAPACK routines called by DGEJSV are implemented to work in that range.
+*>  If that is not the case, then the restriction for safe computation with
+*>  the singular values in the range of normalized IEEE numbers is that the
+*>  spectral condition number kappa(A)=sigma_max(A)/sigma_min(A) does not
+*>  overflow. This code (DGEJSV) is best used in this restricted range,
+*>  meaning that singular values of magnitude below ||A||_2 / DLAMCH('O') are
+*>  returned as zeros. See JOBR for details on this.
+*>     Further, this implementation is somewhat slower than the one described
+*>  in [1,2] due to replacement of some non-LAPACK components, and because
+*>  the choice of some tuning parameters in the iterative part (DGESVJ) is
+*>  left to the implementer on a particular machine.
+*>     The rank revealing QR factorization (in this code: DGEQP3) should be
+*>  implemented as in [3]. We have a new version of DGEQP3 under development
+*>  that is more robust than the current one in LAPACK, with a cleaner cut in
+*>  rank deficient cases. It will be available in the SIGMA library [4].
+*>  If M is much larger than N, it is obvious that the initial QRF with
+*>  column pivoting can be preprocessed by the QRF without pivoting. That
+*>  well-known trick is not used in DGEJSV because in some cases heavy row
+*>  weighting can be treated with complete pivoting. The overhead in cases
+*>  M much larger than N is then only due to pivoting, but the benefits in
+*>  terms of accuracy have prevailed. The implementer/user can incorporate
+*>  this extra QRF step easily. The implementer can also improve data movement
+*>  (matrix transpose, matrix copy, matrix transposed copy) - this
+*>  implementation of DGEJSV uses only the simplest, naive data movement.
+*> \endverbatim
+*
+*> \par Contributors:
+*  ==================
+*>
+*>  Zlatko Drmac (Zagreb, Croatia) and Kresimir Veselic (Hagen, Germany)
+*
+*> \par References:
+*  ================
+*>
+*> \verbatim
+*>
+*> [1] Z. Drmac and K. Veselic: New fast and accurate Jacobi SVD algorithm I.
+*>     SIAM J. Matrix Anal. Appl. Vol. 29, No. 4 (2008), pp. 1322-1342.
+*>     LAPACK Working note 169.
+*> [2] Z. Drmac and K. Veselic: New fast and accurate Jacobi SVD algorithm II.
+*>     SIAM J. Matrix Anal. Appl. Vol. 29, No. 4 (2008), pp. 1343-1362.
+*>     LAPACK Working note 170.
+*> [3] Z. Drmac and Z. Bujanovic: On the failure of rank-revealing QR
+*>     factorization software - a case study.
+*>     ACM Trans. Math. Softw. Vol. 35, No 2 (2008), pp. 1-28.
+*>     LAPACK Working note 176.
+*> [4] Z. Drmac: SIGMA - mathematical software library for accurate SVD, PSVD,
+*>     QSVD, (H,K)-SVD computations.
+*>     Department of Mathematics, University of Zagreb, 2008.
+*> \endverbatim
+*
+*>  \par Bugs, examples and comments:
+*   =================================
+*>
+*>  Please report all bugs and send interesting examples and/or comments to
+*>  drmac@math.hr. Thank you.
+*>
+*  =====================================================================
+      SUBROUTINE DGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP,
+     $                   M, N, A, LDA, SVA, U, LDU, V, LDV,
+     $                   WORK, LWORK, IWORK, INFO )
+*
+*  -- LAPACK computational routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      IMPLICIT    NONE
+      INTEGER     INFO, LDA, LDU, LDV, LWORK, M, N
+*     ..
+*     .. Array Arguments ..
+      DOUBLE PRECISION A( LDA, * ), SVA( N ), U( LDU, * ), V( LDV, * ),
+     $            WORK( LWORK )
+      INTEGER     IWORK( * )
+      CHARACTER*1 JOBA, JOBP, JOBR, JOBT, JOBU, JOBV
+*     ..
+*
+*  ===========================================================================
+*
+*     .. Local Parameters ..
+      DOUBLE PRECISION   ZERO,  ONE
+      PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 )
+*     ..
+*     .. Local Scalars ..
+      DOUBLE PRECISION AAPP, AAQQ, AATMAX, AATMIN, BIG, BIG1, COND_OK,
+     $        CONDR1, CONDR2, ENTRA,  ENTRAT, EPSLN,  MAXPRJ, SCALEM,
+     $        SCONDA, SFMIN,  SMALL,  TEMP1,  USCAL1, USCAL2, XSC
+      INTEGER IERR,   N1,     NR,     NUMRANK,        p, q,   WARNING
+      LOGICAL ALMORT, DEFR,   ERREST, GOSCAL, JRACC,  KILL,   LSVEC,
+     $        L2ABER, L2KILL, L2PERT, L2RANK, L2TRAN,
+     $        NOSCAL, ROWPIV, RSVEC,  TRANSP
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC DABS, DLOG, MAX, MIN, DBLE, IDNINT, DSIGN, DSQRT
+*     ..
+*     .. External Functions ..
+      DOUBLE PRECISION  DLAMCH, DNRM2
+      INTEGER   IDAMAX
+      LOGICAL   LSAME
+      EXTERNAL  IDAMAX, LSAME, DLAMCH, DNRM2
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL  DCOPY,  DGELQF, DGEQP3, DGEQRF, DLACPY, DLASCL,
+     $          DLASET, DLASSQ, DLASWP, DORGQR, DORMLQ,
+     $          DORMQR, DPOCON, DSCAL,  DSWAP,  DTRSM,  XERBLA
+*
+      EXTERNAL  DGESVJ
+*     ..
+*
+*     Test the input arguments
+*
+      LSVEC  = LSAME( JOBU, 'U' ) .OR. LSAME( JOBU, 'F' )
+      JRACC  = LSAME( JOBV, 'J' )
+      RSVEC  = LSAME( JOBV, 'V' ) .OR. JRACC
+      ROWPIV = LSAME( JOBA, 'F' ) .OR. LSAME( JOBA, 'G' )
+      L2RANK = LSAME( JOBA, 'R' )
+      L2ABER = LSAME( JOBA, 'A' )
+      ERREST = LSAME( JOBA, 'E' ) .OR. LSAME( JOBA, 'G' )
+      L2TRAN = LSAME( JOBT, 'T' )
+      L2KILL = LSAME( JOBR, 'R' )
+      DEFR   = LSAME( JOBR, 'N' )
+      L2PERT = LSAME( JOBP, 'P' )
+*
+      IF ( .NOT.(ROWPIV .OR. L2RANK .OR. L2ABER .OR.
+     $     ERREST .OR. LSAME( JOBA, 'C' ) )) THEN
+         INFO = - 1
+      ELSE IF ( .NOT.( LSVEC  .OR. LSAME( JOBU, 'N' ) .OR.
+     $                             LSAME( JOBU, 'W' )) ) THEN
+         INFO = - 2
+      ELSE IF ( .NOT.( RSVEC .OR. LSAME( JOBV, 'N' ) .OR.
+     $   LSAME( JOBV, 'W' )) .OR. ( JRACC .AND. (.NOT.LSVEC) ) ) THEN
+         INFO = - 3
+      ELSE IF ( .NOT. ( L2KILL .OR. DEFR ) )    THEN
+         INFO = - 4
+      ELSE IF ( .NOT. ( L2TRAN .OR. LSAME( JOBT, 'N' ) ) ) THEN
+         INFO = - 5
+      ELSE IF ( .NOT. ( L2PERT .OR. LSAME( JOBP, 'N' ) ) ) THEN
+         INFO = - 6
+      ELSE IF ( M .LT. 0 ) THEN
+         INFO = - 7
+      ELSE IF ( ( N .LT. 0 ) .OR. ( N .GT. M ) ) THEN
+         INFO = - 8
+      ELSE IF ( LDA .LT. M ) THEN
+         INFO = - 10
+      ELSE IF ( LSVEC .AND. ( LDU .LT. M ) ) THEN
+         INFO = - 13
+      ELSE IF ( RSVEC .AND. ( LDV .LT. N ) ) THEN
+         INFO = - 15
+      ELSE IF ( (.NOT.(LSVEC .OR. RSVEC .OR. ERREST).AND.
+     &                           (LWORK .LT. MAX(7,4*N+1,2*M+N))) .OR.
+     & (.NOT.(LSVEC .OR. RSVEC) .AND. ERREST .AND.
+     &                         (LWORK .LT. MAX(7,4*N+N*N,2*M+N))) .OR.
+     & (LSVEC .AND. (.NOT.RSVEC) .AND. (LWORK .LT. MAX(7,2*M+N,4*N+1)))
+     & .OR.
+     & (RSVEC .AND. (.NOT.LSVEC) .AND. (LWORK .LT. MAX(7,2*M+N,4*N+1)))
+     & .OR.
+     & (LSVEC .AND. RSVEC .AND. (.NOT.JRACC) .AND.
+     &                          (LWORK.LT.MAX(2*M+N,6*N+2*N*N)))
+     & .OR. (LSVEC .AND. RSVEC .AND. JRACC .AND.
+     &                          LWORK.LT.MAX(2*M+N,4*N+N*N,2*N+N*N+6)))
+     &   THEN
+         INFO = - 17
+      ELSE
+*        #:)
+         INFO = 0
+      END IF
+*
+      IF ( INFO .NE. 0 ) THEN
+*       #:(
+         CALL XERBLA( 'DGEJSV', - INFO )
+         RETURN
+      END IF
+*
+*     Quick return for void matrix (Y3K safe)
+* #:)
+      IF ( ( M .EQ. 0 ) .OR. ( N .EQ. 0 ) ) THEN
+         IWORK(1:3) = 0
+         WORK(1:7) = 0
+         RETURN
+      ENDIF
+*
+*     Determine whether the matrix U should be M x N or M x M
+*
+      IF ( LSVEC ) THEN
+         N1 = N
+         IF ( LSAME( JOBU, 'F' ) ) N1 = M
+      END IF
+*
+*     Set numerical parameters
+*
+*!    NOTE: Make sure DLAMCH() does not fail on the target architecture.
+*
+      EPSLN = DLAMCH('Epsilon')
+      SFMIN = DLAMCH('SafeMinimum')
+      SMALL = SFMIN / EPSLN
+      BIG   = DLAMCH('O')
+*     BIG   = ONE / SFMIN
+*
+*     Initialize SVA(1:N) = diag( ||A e_i||_2 )_1^N
+*
+*(!)  If necessary, scale SVA() to protect the largest norm from
+*     overflow. It is possible that this scaling pushes the smallest
+*     column norm left from the underflow threshold (extreme case).
+*
+      SCALEM  = ONE / DSQRT(DBLE(M)*DBLE(N))
+      NOSCAL  = .TRUE.
+      GOSCAL  = .TRUE.
+      DO 1874 p = 1, N
+         AAPP = ZERO
+         AAQQ = ONE
+         CALL DLASSQ( M, A(1,p), 1, AAPP, AAQQ )
+         IF ( AAPP .GT. BIG ) THEN
+            INFO = - 9
+            CALL XERBLA( 'DGEJSV', -INFO )
+            RETURN
+         END IF
+         AAQQ = DSQRT(AAQQ)
+         IF ( ( AAPP .LT. (BIG / AAQQ) ) .AND. NOSCAL  ) THEN
+            SVA(p)  = AAPP * AAQQ
+         ELSE
+            NOSCAL  = .FALSE.
+            SVA(p)  = AAPP * ( AAQQ * SCALEM )
+            IF ( GOSCAL ) THEN
+               GOSCAL = .FALSE.
+               CALL DSCAL( p-1, SCALEM, SVA, 1 )
+            END IF
+         END IF
+ 1874 CONTINUE
+*
+      IF ( NOSCAL ) SCALEM = ONE
+*
+      AAPP = ZERO
+      AAQQ = BIG
+      DO 4781 p = 1, N
+         AAPP = MAX( AAPP, SVA(p) )
+         IF ( SVA(p) .NE. ZERO ) AAQQ = MIN( AAQQ, SVA(p) )
+ 4781 CONTINUE
+*
+*     Quick return for zero M x N matrix
+* #:)
+      IF ( AAPP .EQ. ZERO ) THEN
+         IF ( LSVEC ) CALL DLASET( 'G', M, N1, ZERO, ONE, U, LDU )
+         IF ( RSVEC ) CALL DLASET( 'G', N, N,  ZERO, ONE, V, LDV )
+         WORK(1) = ONE
+         WORK(2) = ONE
+         IF ( ERREST ) WORK(3) = ONE
+         IF ( LSVEC .AND. RSVEC ) THEN
+            WORK(4) = ONE
+            WORK(5) = ONE
+         END IF
+         IF ( L2TRAN ) THEN
+            WORK(6) = ZERO
+            WORK(7) = ZERO
+         END IF
+         IWORK(1) = 0
+         IWORK(2) = 0
+         IWORK(3) = 0
+         RETURN
+      END IF
+*
+*     Issue warning if denormalized column norms detected. Override the
+*     high relative accuracy request. Issue licence to kill columns
+*     (set them to zero) whose norm is less than sigma_max / BIG (roughly).
+* #:(
+      WARNING = 0
+      IF ( AAQQ .LE. SFMIN ) THEN
+         L2RANK = .TRUE.
+         L2KILL = .TRUE.
+         WARNING = 1
+      END IF
+*
+*     Quick return for one-column matrix
+* #:)
+      IF ( N .EQ. 1 ) THEN
+*
+         IF ( LSVEC ) THEN
+            CALL DLASCL( 'G',0,0,SVA(1),SCALEM, M,1,A(1,1),LDA,IERR )
+            CALL DLACPY( 'A', M, 1, A, LDA, U, LDU )
+*           computing all M left singular vectors of the M x 1 matrix
+            IF ( N1 .NE. N  ) THEN
+               CALL DGEQRF( M, N, U,LDU, WORK, WORK(N+1),LWORK-N,IERR )
+               CALL DORGQR( M,N1,1, U,LDU,WORK,WORK(N+1),LWORK-N,IERR )
+               CALL DCOPY( M, A(1,1), 1, U(1,1), 1 )
+            END IF
+         END IF
+         IF ( RSVEC ) THEN
+             V(1,1) = ONE
+         END IF
+         IF ( SVA(1) .LT. (BIG*SCALEM) ) THEN
+            SVA(1)  = SVA(1) / SCALEM
+            SCALEM  = ONE
+         END IF
+         WORK(1) = ONE / SCALEM
+         WORK(2) = ONE
+         IF ( SVA(1) .NE. ZERO ) THEN
+            IWORK(1) = 1
+            IF ( ( SVA(1) / SCALEM) .GE. SFMIN ) THEN
+               IWORK(2) = 1
+            ELSE
+               IWORK(2) = 0
+            END IF
+         ELSE
+            IWORK(1) = 0
+            IWORK(2) = 0
+         END IF
+         IWORK(3) = 0
+         IF ( ERREST ) WORK(3) = ONE
+         IF ( LSVEC .AND. RSVEC ) THEN
+            WORK(4) = ONE
+            WORK(5) = ONE
+         END IF
+         IF ( L2TRAN ) THEN
+            WORK(6) = ZERO
+            WORK(7) = ZERO
+         END IF
+         RETURN
+*
+      END IF
+*
+      TRANSP = .FALSE.
+      L2TRAN = L2TRAN .AND. ( M .EQ. N )
+*
+      AATMAX = -ONE
+      AATMIN =  BIG
+      IF ( ROWPIV .OR. L2TRAN ) THEN
+*
+*     Compute the row norms, needed to determine row pivoting sequence
+*     (in the case of heavily row weighted A, row pivoting is strongly
+*     advised) and to collect information needed to compare the
+*     structures of A * A^t and A^t * A (in the case L2TRAN.EQ..TRUE.).
+*
+         IF ( L2TRAN ) THEN
+            DO 1950 p = 1, M
+               XSC   = ZERO
+               TEMP1 = ONE
+               CALL DLASSQ( N, A(p,1), LDA, XSC, TEMP1 )
+*              DLASSQ gets both the ell_2 and the ell_infinity norm
+*              in one pass through the vector
+               WORK(M+N+p)  = XSC * SCALEM
+               WORK(N+p)    = XSC * (SCALEM*DSQRT(TEMP1))
+               AATMAX = MAX( AATMAX, WORK(N+p) )
+               IF (WORK(N+p) .NE. ZERO) AATMIN = MIN(AATMIN,WORK(N+p))
+ 1950       CONTINUE
+         ELSE
+            DO 1904 p = 1, M
+               WORK(M+N+p) = SCALEM*DABS( A(p,IDAMAX(N,A(p,1),LDA)) )
+               AATMAX = MAX( AATMAX, WORK(M+N+p) )
+               AATMIN = MIN( AATMIN, WORK(M+N+p) )
+ 1904       CONTINUE
+         END IF
+*
+      END IF
+*
+*     For square matrix A try to determine whether A^t  would be  better
+*     input for the preconditioned Jacobi SVD, with faster convergence.
+*     The decision is based on an O(N) function of the vector of column
+*     and row norms of A, based on the Shannon entropy. This should give
+*     the right choice in most cases when the difference actually matters.
+*     It may fail and pick the slower converging side.
+*
+      ENTRA  = ZERO
+      ENTRAT = ZERO
+      IF ( L2TRAN ) THEN
+*
+         XSC   = ZERO
+         TEMP1 = ONE
+         CALL DLASSQ( N, SVA, 1, XSC, TEMP1 )
+         TEMP1 = ONE / TEMP1
+*
+         ENTRA = ZERO
+         DO 1113 p = 1, N
+            BIG1  = ( ( SVA(p) / XSC )**2 ) * TEMP1
+            IF ( BIG1 .NE. ZERO ) ENTRA = ENTRA + BIG1 * DLOG(BIG1)
+ 1113    CONTINUE
+         ENTRA = - ENTRA / DLOG(DBLE(N))
+*
+*        Now, SVA().^2/Trace(A^t * A) is a point in the probability simplex.
+*        It is derived from the diagonal of  A^t * A.  Do the same with the
+*        diagonal of A * A^t, compute the entropy of the corresponding
+*        probability distribution. Note that A * A^t and A^t * A have the
+*        same trace.
+*
+         ENTRAT = ZERO
+         DO 1114 p = N+1, N+M
+            BIG1 = ( ( WORK(p) / XSC )**2 ) * TEMP1
+            IF ( BIG1 .NE. ZERO ) ENTRAT = ENTRAT + BIG1 * DLOG(BIG1)
+ 1114    CONTINUE
+         ENTRAT = - ENTRAT / DLOG(DBLE(M))
+*
+*        Analyze the entropies and decide A or A^t. Smaller entropy
+*        usually means better input for the algorithm.
+*
+         TRANSP = ( ENTRAT .LT. ENTRA )
+*
+*        If A^t is better than A, transpose A.
+*
+         IF ( TRANSP ) THEN
+*           In an optimal implementation, this trivial transpose
+*           should be replaced with faster transpose.
+            DO 1115 p = 1, N - 1
+               DO 1116 q = p + 1, N
+                   TEMP1 = A(q,p)
+                  A(q,p) = A(p,q)
+                  A(p,q) = TEMP1
+ 1116          CONTINUE
+ 1115       CONTINUE
+            DO 1117 p = 1, N
+               WORK(M+N+p) = SVA(p)
+               SVA(p)      = WORK(N+p)
+ 1117       CONTINUE
+            TEMP1  = AAPP
+            AAPP   = AATMAX
+            AATMAX = TEMP1
+            TEMP1  = AAQQ
+            AAQQ   = AATMIN
+            AATMIN = TEMP1
+            KILL   = LSVEC
+            LSVEC  = RSVEC
+            RSVEC  = KILL
+            IF ( LSVEC ) N1 = N
+*
+            ROWPIV = .TRUE.
+         END IF
+*
+      END IF
+*     END IF L2TRAN
+*
+*     Scale the matrix so that its maximal singular value remains less
+*     than DSQRT(BIG) -- the matrix is scaled so that its maximal column
+*     has Euclidean norm equal to DSQRT(BIG/N). The only reason to keep
+*     DSQRT(BIG) instead of BIG is the fact that DGEJSV uses LAPACK and
+*     BLAS routines that, in some implementations, are not capable of
+*     working in the full interval [SFMIN,BIG] and that they may provoke
+*     overflows in the intermediate results. If the singular values spread
+*     from SFMIN to BIG, then DGESVJ will compute them. So, in that case,
+*     one should use DGESVJ instead of DGEJSV.
+*
+      BIG1   = DSQRT( BIG )
+      TEMP1  = DSQRT( BIG / DBLE(N) )
+*
+      CALL DLASCL( 'G', 0, 0, AAPP, TEMP1, N, 1, SVA, N, IERR )
+      IF ( AAQQ .GT. (AAPP * SFMIN) ) THEN
+          AAQQ = ( AAQQ / AAPP ) * TEMP1
+      ELSE
+          AAQQ = ( AAQQ * TEMP1 ) / AAPP
+      END IF
+      TEMP1 = TEMP1 * SCALEM
+      CALL DLASCL( 'G', 0, 0, AAPP, TEMP1, M, N, A, LDA, IERR )
+*
+*     To undo scaling at the end of this procedure, multiply the
+*     computed singular values with USCAL2 / USCAL1.
+*
+      USCAL1 = TEMP1
+      USCAL2 = AAPP
+*
+      IF ( L2KILL ) THEN
+*        L2KILL enforces computation of nonzero singular values in
+*        the restricted range of condition number of the initial A,
+*        sigma_max(A) / sigma_min(A) approx. DSQRT(BIG)/DSQRT(SFMIN).
+         XSC = DSQRT( SFMIN )
+      ELSE
+         XSC = SMALL
+*
+*        Now, if the condition number of A is too big,
+*        sigma_max(A) / sigma_min(A) .GT. DSQRT(BIG/N) * EPSLN / SFMIN,
+*        as a precaution measure, the full SVD is computed using DGESVJ
+*        with accumulated Jacobi rotations. This provides numerically
+*        more robust computation, at the cost of slightly increased run
+*        time. Depending on the concrete implementation of BLAS and LAPACK
+*        (i.e. how they behave in presence of extreme ill-conditioning) the
+*        implementor may decide to remove this switch.
+         IF ( ( AAQQ.LT.DSQRT(SFMIN) ) .AND. LSVEC .AND. RSVEC ) THEN
+            JRACC = .TRUE.
+         END IF
+*
+      END IF
+      IF ( AAQQ .LT. XSC ) THEN
+         DO 700 p = 1, N
+            IF ( SVA(p) .LT. XSC ) THEN
+               CALL DLASET( 'A', M, 1, ZERO, ZERO, A(1,p), LDA )
+               SVA(p) = ZERO
+            END IF
+ 700     CONTINUE
+      END IF
+*
+*     Preconditioning using QR factorization with pivoting
+*
+      IF ( ROWPIV ) THEN
+*        Optional row permutation (Bjoerck row pivoting):
+*        A result by Cox and Higham shows that the Bjoerck's
+*        row pivoting combined with standard column pivoting
+*        has similar effect as Powell-Reid complete pivoting.
+*        The ell-infinity norms of A are made nonincreasing.
+         DO 1952 p = 1, M - 1
+            q = IDAMAX( M-p+1, WORK(M+N+p), 1 ) + p - 1
+            IWORK(2*N+p) = q
+            IF ( p .NE. q ) THEN
+               TEMP1       = WORK(M+N+p)
+               WORK(M+N+p) = WORK(M+N+q)
+               WORK(M+N+q) = TEMP1
+            END IF
+ 1952    CONTINUE
+         CALL DLASWP( N, A, LDA, 1, M-1, IWORK(2*N+1), 1 )
+      END IF
+*
+*     End of the preparation phase (scaling, optional sorting and
+*     transposing, optional flushing of small columns).
+*
+*     Preconditioning
+*
+*     If the full SVD is needed, the right singular vectors are computed
+*     from a matrix equation, and for that we need theoretical analysis
+*     of the Businger-Golub pivoting. So we use DGEQP3 as the first RR QRF.
+*     In all other cases the first RR QRF can be chosen by other criteria
+*     (eg speed by replacing global with restricted window pivoting, such
+*     as in SGEQPX from TOMS # 782). Good results will be obtained using
+*     SGEQPX with properly (!) chosen numerical parameters.
+*     Any improvement of DGEQP3 improves overall performance of DGEJSV.
+*
+*     A * P1 = Q1 * [ R1^t 0]^t:
+      DO 1963 p = 1, N
+*        .. all columns are free columns
+         IWORK(p) = 0
+ 1963 CONTINUE
+      CALL DGEQP3( M,N,A,LDA, IWORK,WORK, WORK(N+1),LWORK-N, IERR )
+*
+*     The upper triangular matrix R1 from the first QRF is inspected for
+*     rank deficiency and possibilities for deflation, or possible
+*     ill-conditioning. Depending on the user specified flag L2RANK,
+*     the procedure explores possibilities to reduce the numerical
+*     rank by inspecting the computed upper triangular factor. If
+*     L2RANK or L2ABER are up, then DGEJSV will compute the SVD of
+*     A + dA, where ||dA|| <= f(M,N)*EPSLN.
+*
+      NR = 1
+      IF ( L2ABER ) THEN
+*        Standard absolute error bound suffices. All sigma_i with
+*        sigma_i < N*EPSLN*||A|| are flushed to zero. This is an
+*        aggressive enforcement of lower numerical rank by introducing a
+*        backward error of the order of N*EPSLN*||A||.
+         TEMP1 = DSQRT(DBLE(N))*EPSLN
+         DO 3001 p = 2, N
+            IF ( DABS(A(p,p)) .GE. (TEMP1*DABS(A(1,1))) ) THEN
+               NR = NR + 1
+            ELSE
+               GO TO 3002
+            END IF
+ 3001    CONTINUE
+ 3002    CONTINUE
+      ELSE IF ( L2RANK ) THEN
+*        .. similarly as above, only slightly more gentle (less aggressive).
+*        Sudden drop on the diagonal of R1 is used as the criterion for
+*        close-to-rank-deficient.
+         TEMP1 = DSQRT(SFMIN)
+         DO 3401 p = 2, N
+            IF ( ( DABS(A(p,p)) .LT. (EPSLN*DABS(A(p-1,p-1))) ) .OR.
+     $           ( DABS(A(p,p)) .LT. SMALL ) .OR.
+     $           ( L2KILL .AND. (DABS(A(p,p)) .LT. TEMP1) ) ) GO TO 3402
+            NR = NR + 1
+ 3401    CONTINUE
+ 3402    CONTINUE
+*
+      ELSE
+*        The goal is high relative accuracy. However, if the matrix
+*        has high scaled condition number the relative accuracy is in
+*        general not feasible. Later on, a condition number estimator
+*        will be deployed to estimate the scaled condition number.
+*        Here we just remove the underflowed part of the triangular
+*        factor. This prevents the situation in which the code is
+*        working hard to get the accuracy not warranted by the data.
+         TEMP1  = DSQRT(SFMIN)
+         DO 3301 p = 2, N
+            IF ( ( DABS(A(p,p)) .LT. SMALL ) .OR.
+     $          ( L2KILL .AND. (DABS(A(p,p)) .LT. TEMP1) ) ) GO TO 3302
+            NR = NR + 1
+ 3301    CONTINUE
+ 3302    CONTINUE
+*
+      END IF
+*
+      ALMORT = .FALSE.
+      IF ( NR .EQ. N ) THEN
+         MAXPRJ = ONE
+         DO 3051 p = 2, N
+            TEMP1  = DABS(A(p,p)) / SVA(IWORK(p))
+            MAXPRJ = MIN( MAXPRJ, TEMP1 )
+ 3051    CONTINUE
+         IF ( MAXPRJ**2 .GE. ONE - DBLE(N)*EPSLN ) ALMORT = .TRUE.
+      END IF
+*
+*
+      SCONDA = - ONE
+      CONDR1 = - ONE
+      CONDR2 = - ONE
+*
+      IF ( ERREST ) THEN
+         IF ( N .EQ. NR ) THEN
+            IF ( RSVEC ) THEN
+*              .. V is available as workspace
+               CALL DLACPY( 'U', N, N, A, LDA, V, LDV )
+               DO 3053 p = 1, N
+                  TEMP1 = SVA(IWORK(p))
+                  CALL DSCAL( p, ONE/TEMP1, V(1,p), 1 )
+ 3053          CONTINUE
+               CALL DPOCON( 'U', N, V, LDV, ONE, TEMP1,
+     $              WORK(N+1), IWORK(2*N+M+1), IERR )
+            ELSE IF ( LSVEC ) THEN
+*              .. U is available as workspace
+               CALL DLACPY( 'U', N, N, A, LDA, U, LDU )
+               DO 3054 p = 1, N
+                  TEMP1 = SVA(IWORK(p))
+                  CALL DSCAL( p, ONE/TEMP1, U(1,p), 1 )
+ 3054          CONTINUE
+               CALL DPOCON( 'U', N, U, LDU, ONE, TEMP1,
+     $              WORK(N+1), IWORK(2*N+M+1), IERR )
+            ELSE
+               CALL DLACPY( 'U', N, N, A, LDA, WORK(N+1), N )
+               DO 3052 p = 1, N
+                  TEMP1 = SVA(IWORK(p))
+                  CALL DSCAL( p, ONE/TEMP1, WORK(N+(p-1)*N+1), 1 )
+ 3052          CONTINUE
+*           .. the columns of R are scaled to have unit Euclidean lengths.
+               CALL DPOCON( 'U', N, WORK(N+1), N, ONE, TEMP1,
+     $              WORK(N+N*N+1), IWORK(2*N+M+1), IERR )
+            END IF
+            SCONDA = ONE / DSQRT(TEMP1)
+*           SCONDA is an estimate of DSQRT(||(R^t * R)^(-1)||_1).
+*           N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA
+         ELSE
+            SCONDA = - ONE
+         END IF
+      END IF
+*
+      L2PERT = L2PERT .AND. ( DABS( A(1,1)/A(NR,NR) ) .GT. DSQRT(BIG1) )
+*     If there is no violent scaling, artificial perturbation is not needed.
+*
+*     Phase 3:
+*
+      IF ( .NOT. ( RSVEC .OR. LSVEC ) ) THEN
+*
+*         Singular Values only
+*
+*         .. transpose A(1:NR,1:N)
+         DO 1946 p = 1, MIN( N-1, NR )
+            CALL DCOPY( N-p, A(p,p+1), LDA, A(p+1,p), 1 )
+ 1946    CONTINUE
+*
+*        The following two DO-loops introduce small relative perturbation
+*        into the strict upper triangle of the lower triangular matrix.
+*        Small entries below the main diagonal are also changed.
+*        This modification is useful if the computing environment does not
+*        provide/allow FLUSH TO ZERO underflow, for it prevents many
+*        annoying denormalized numbers in case of strongly scaled matrices.
+*        The perturbation is structured so that it does not introduce any
+*        new perturbation of the singular values, and it does not destroy
+*        the job done by the preconditioner.
+*        The licence for this perturbation is in the variable L2PERT, which
+*        should be .FALSE. if FLUSH TO ZERO underflow is active.
+*
+         IF ( .NOT. ALMORT ) THEN
+*
+            IF ( L2PERT ) THEN
+*              XSC = DSQRT(SMALL)
+               XSC = EPSLN / DBLE(N)
+               DO 4947 q = 1, NR
+                  TEMP1 = XSC*DABS(A(q,q))
+                  DO 4949 p = 1, N
+                     IF ( ( (p.GT.q) .AND. (DABS(A(p,q)).LE.TEMP1) )
+     $                    .OR. ( p .LT. q ) )
+     $                     A(p,q) = DSIGN( TEMP1, A(p,q) )
+ 4949             CONTINUE
+ 4947          CONTINUE
+            ELSE
+               CALL DLASET( 'U', NR-1,NR-1, ZERO,ZERO, A(1,2),LDA )
+            END IF
+*
+*            .. second preconditioning using the QR factorization
+*
+            CALL DGEQRF( N,NR, A,LDA, WORK, WORK(N+1),LWORK-N, IERR )
+*
+*           .. and transpose upper to lower triangular
+            DO 1948 p = 1, NR - 1
+               CALL DCOPY( NR-p, A(p,p+1), LDA, A(p+1,p), 1 )
+ 1948       CONTINUE
+*
+         END IF
+*
+*           Row-cyclic Jacobi SVD algorithm with column pivoting
+*
+*           .. again some perturbation (a "background noise") is added
+*           to drown denormals
+            IF ( L2PERT ) THEN
+*              XSC = DSQRT(SMALL)
+               XSC = EPSLN / DBLE(N)
+               DO 1947 q = 1, NR
+                  TEMP1 = XSC*DABS(A(q,q))
+                  DO 1949 p = 1, NR
+                     IF ( ( (p.GT.q) .AND. (DABS(A(p,q)).LE.TEMP1) )
+     $                       .OR. ( p .LT. q ) )
+     $                   A(p,q) = DSIGN( TEMP1, A(p,q) )
+ 1949             CONTINUE
+ 1947          CONTINUE
+            ELSE
+               CALL DLASET( 'U', NR-1, NR-1, ZERO, ZERO, A(1,2), LDA )
+            END IF
+*
+*           .. and one-sided Jacobi rotations are started on a lower
+*           triangular matrix (plus perturbation which is ignored in
+*           the part which destroys triangular form (confusing?!))
+*
+            CALL DGESVJ( 'L', 'NoU', 'NoV', NR, NR, A, LDA, SVA,
+     $                      N, V, LDV, WORK, LWORK, INFO )
+*
+            SCALEM  = WORK(1)
+            NUMRANK = IDNINT(WORK(2))
+*
+*
+      ELSE IF ( RSVEC .AND. ( .NOT. LSVEC ) ) THEN
+*
+*        -> Singular Values and Right Singular Vectors <-
+*
+         IF ( ALMORT ) THEN
+*
+*           .. in this case NR equals N
+            DO 1998 p = 1, NR
+               CALL DCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 )
+ 1998       CONTINUE
+            CALL DLASET( 'Upper', NR-1, NR-1, ZERO, ZERO, V(1,2), LDV )
+*
+            CALL DGESVJ( 'L','U','N', N, NR, V,LDV, SVA, NR, A,LDA,
+     $                  WORK, LWORK, INFO )
+            SCALEM  = WORK(1)
+            NUMRANK = IDNINT(WORK(2))
+
+         ELSE
+*
+*        .. two more QR factorizations ( one QRF is not enough, two require
+*        accumulated product of Jacobi rotations, three are perfect )
+*
+            CALL DLASET( 'Lower', NR-1, NR-1, ZERO, ZERO, A(2,1), LDA )
+            CALL DGELQF( NR, N, A, LDA, WORK, WORK(N+1), LWORK-N, IERR)
+            CALL DLACPY( 'Lower', NR, NR, A, LDA, V, LDV )
+            CALL DLASET( 'Upper', NR-1, NR-1, ZERO, ZERO, V(1,2), LDV )
+            CALL DGEQRF( NR, NR, V, LDV, WORK(N+1), WORK(2*N+1),
+     $                   LWORK-2*N, IERR )
+            DO 8998 p = 1, NR
+               CALL DCOPY( NR-p+1, V(p,p), LDV, V(p,p), 1 )
+ 8998       CONTINUE
+            CALL DLASET( 'Upper', NR-1, NR-1, ZERO, ZERO, V(1,2), LDV )
+*
+            CALL DGESVJ( 'Lower', 'U','N', NR, NR, V,LDV, SVA, NR, U,
+     $                  LDU, WORK(N+1), LWORK, INFO )
+            SCALEM  = WORK(N+1)
+            NUMRANK = IDNINT(WORK(N+2))
+            IF ( NR .LT. N ) THEN
+               CALL DLASET( 'A',N-NR, NR, ZERO,ZERO, V(NR+1,1),   LDV )
+               CALL DLASET( 'A',NR, N-NR, ZERO,ZERO, V(1,NR+1),   LDV )
+               CALL DLASET( 'A',N-NR,N-NR,ZERO,ONE, V(NR+1,NR+1), LDV )
+            END IF
+*
+         CALL DORMLQ( 'Left', 'Transpose', N, N, NR, A, LDA, WORK,
+     $               V, LDV, WORK(N+1), LWORK-N, IERR )
+*
+         END IF
+*
+         DO 8991 p = 1, N
+            CALL DCOPY( N, V(p,1), LDV, A(IWORK(p),1), LDA )
+ 8991    CONTINUE
+         CALL DLACPY( 'All', N, N, A, LDA, V, LDV )
+*
+         IF ( TRANSP ) THEN
+            CALL DLACPY( 'All', N, N, V, LDV, U, LDU )
+         END IF
+*
+      ELSE IF ( LSVEC .AND. ( .NOT. RSVEC ) ) THEN
+*
+*        .. Singular Values and Left Singular Vectors                 ..
+*
+*        .. second preconditioning step to avoid need to accumulate
+*        Jacobi rotations in the Jacobi iterations.
+         DO 1965 p = 1, NR
+            CALL DCOPY( N-p+1, A(p,p), LDA, U(p,p), 1 )
+ 1965    CONTINUE
+         CALL DLASET( 'Upper', NR-1, NR-1, ZERO, ZERO, U(1,2), LDU )
+*
+         CALL DGEQRF( N, NR, U, LDU, WORK(N+1), WORK(2*N+1),
+     $              LWORK-2*N, IERR )
+*
+         DO 1967 p = 1, NR - 1
+            CALL DCOPY( NR-p, U(p,p+1), LDU, U(p+1,p), 1 )
+ 1967    CONTINUE
+         CALL DLASET( 'Upper', NR-1, NR-1, ZERO, ZERO, U(1,2), LDU )
+*
+         CALL DGESVJ( 'Lower', 'U', 'N', NR,NR, U, LDU, SVA, NR, A,
+     $        LDA, WORK(N+1), LWORK-N, INFO )
+         SCALEM  = WORK(N+1)
+         NUMRANK = IDNINT(WORK(N+2))
+*
+         IF ( NR .LT. M ) THEN
+            CALL DLASET( 'A',  M-NR, NR,ZERO, ZERO, U(NR+1,1), LDU )
+            IF ( NR .LT. N1 ) THEN
+               CALL DLASET( 'A',NR, N1-NR, ZERO, ZERO, U(1,NR+1), LDU )
+               CALL DLASET( 'A',M-NR,N1-NR,ZERO,ONE,U(NR+1,NR+1), LDU )
+            END IF
+         END IF
+*
+         CALL DORMQR( 'Left', 'No Tr', M, N1, N, A, LDA, WORK, U,
+     $               LDU, WORK(N+1), LWORK-N, IERR )
+*
+         IF ( ROWPIV )
+     $       CALL DLASWP( N1, U, LDU, 1, M-1, IWORK(2*N+1), -1 )
+*
+         DO 1974 p = 1, N1
+            XSC = ONE / DNRM2( M, U(1,p), 1 )
+            CALL DSCAL( M, XSC, U(1,p), 1 )
+ 1974    CONTINUE
+*
+         IF ( TRANSP ) THEN
+            CALL DLACPY( 'All', N, N, U, LDU, V, LDV )
+         END IF
+*
+      ELSE
+*
+*        .. Full SVD ..
+*
+         IF ( .NOT. JRACC ) THEN
+*
+         IF ( .NOT. ALMORT ) THEN
+*
+*           Second Preconditioning Step (QRF [with pivoting])
+*           Note that the composition of TRANSPOSE, QRF and TRANSPOSE is
+*           equivalent to an LQF CALL. Since in many libraries the QRF
+*           seems to be better optimized than the LQF, we do explicit
+*           transpose and use the QRF. This is subject to changes in an
+*           optimized implementation of DGEJSV.
+*
+            DO 1968 p = 1, NR
+               CALL DCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 )
+ 1968       CONTINUE
+*
+*           .. the following two loops perturb small entries to avoid
+*           denormals in the second QR factorization, where they are
+*           as good as zeros. This is done to avoid painfully slow
+*           computation with denormals. The relative size of the perturbation
+*           is a parameter that can be changed by the implementer.
+*           This perturbation device will be obsolete on machines with
+*           properly implemented arithmetic.
+*           To switch it off, set L2PERT=.FALSE. To remove it from  the
+*           code, remove the action under L2PERT=.TRUE., leave the ELSE part.
+*           The following two loops should be blocked and fused with the
+*           transposed copy above.
+*
+            IF ( L2PERT ) THEN
+               XSC = DSQRT(SMALL)
+               DO 2969 q = 1, NR
+                  TEMP1 = XSC*DABS( V(q,q) )
+                  DO 2968 p = 1, N
+                     IF ( ( p .GT. q ) .AND. ( DABS(V(p,q)) .LE. TEMP1 )
+     $                   .OR. ( p .LT. q ) )
+     $                   V(p,q) = DSIGN( TEMP1, V(p,q) )
+                     IF ( p .LT. q ) V(p,q) = - V(p,q)
+ 2968             CONTINUE
+ 2969          CONTINUE
+            ELSE
+               CALL DLASET( 'U', NR-1, NR-1, ZERO, ZERO, V(1,2), LDV )
+            END IF
+*
+*           Estimate the row scaled condition number of R1
+*           (If R1 is rectangular, N > NR, then the condition number
+*           of the leading NR x NR submatrix is estimated.)
+*
+            CALL DLACPY( 'L', NR, NR, V, LDV, WORK(2*N+1), NR )
+            DO 3950 p = 1, NR
+               TEMP1 = DNRM2(NR-p+1,WORK(2*N+(p-1)*NR+p),1)
+               CALL DSCAL(NR-p+1,ONE/TEMP1,WORK(2*N+(p-1)*NR+p),1)
+ 3950       CONTINUE
+            CALL DPOCON('Lower',NR,WORK(2*N+1),NR,ONE,TEMP1,
+     $                   WORK(2*N+NR*NR+1),IWORK(M+2*N+1),IERR)
+            CONDR1 = ONE / DSQRT(TEMP1)
+*           .. here need a second opinion on the condition number
+*           .. then assume worst case scenario
+*           R1 is OK for inverse <=> CONDR1 .LT. DBLE(N)
+*           more conservative    <=> CONDR1 .LT. DSQRT(DBLE(N))
+*
+            COND_OK = DSQRT(DBLE(NR))
+*[TP]       COND_OK is a tuning parameter.
+
+            IF ( CONDR1 .LT. COND_OK ) THEN
+*              .. the second QRF without pivoting. Note: in an optimized
+*              implementation, this QRF should be implemented as the QRF
+*              of a lower triangular matrix.
+*              R1^t = Q2 * R2
+               CALL DGEQRF( N, NR, V, LDV, WORK(N+1), WORK(2*N+1),
+     $              LWORK-2*N, IERR )
+*
+               IF ( L2PERT ) THEN
+                  XSC = DSQRT(SMALL)/EPSLN
+                  DO 3959 p = 2, NR
+                     DO 3958 q = 1, p - 1
+                        TEMP1 = XSC * MIN(DABS(V(p,p)),DABS(V(q,q)))
+                        IF ( DABS(V(q,p)) .LE. TEMP1 )
+     $                     V(q,p) = DSIGN( TEMP1, V(q,p) )
+ 3958                CONTINUE
+ 3959             CONTINUE
+               END IF
+*
+               IF ( NR .NE. N )
+     $         CALL DLACPY( 'A', N, NR, V, LDV, WORK(2*N+1), N )
+*              .. save ...
+*
+*           .. this transposed copy should be better than naive
+               DO 1969 p = 1, NR - 1
+                  CALL DCOPY( NR-p, V(p,p+1), LDV, V(p+1,p), 1 )
+ 1969          CONTINUE
+*
+               CONDR2 = CONDR1
+*
+            ELSE
+*
+*              .. ill-conditioned case: second QRF with pivoting
+*              Note that windowed pivoting would be equally good
+*              numerically, and more run-time efficient. So, in
+*              an optimal implementation, the next call to DGEQP3
+*              should be replaced with e.g. CALL DGEQPX (ACM TOMS #782)
+*              with properly (carefully) chosen parameters.
+*
+*              R1^t * P2 = Q2 * R2
+               DO 3003 p = 1, NR
+                  IWORK(N+p) = 0
+ 3003          CONTINUE
+               CALL DGEQP3( N, NR, V, LDV, IWORK(N+1), WORK(N+1),
+     $                  WORK(2*N+1), LWORK-2*N, IERR )
+**               CALL DGEQRF( N, NR, V, LDV, WORK(N+1), WORK(2*N+1),
+**     $              LWORK-2*N, IERR )
+               IF ( L2PERT ) THEN
+                  XSC = DSQRT(SMALL)
+                  DO 3969 p = 2, NR
+                     DO 3968 q = 1, p - 1
+                        TEMP1 = XSC * MIN(DABS(V(p,p)),DABS(V(q,q)))
+                        IF ( DABS(V(q,p)) .LE. TEMP1 )
+     $                     V(q,p) = DSIGN( TEMP1, V(q,p) )
+ 3968                CONTINUE
+ 3969             CONTINUE
+               END IF
+*
+               CALL DLACPY( 'A', N, NR, V, LDV, WORK(2*N+1), N )
+*
+               IF ( L2PERT ) THEN
+                  XSC = DSQRT(SMALL)
+                  DO 8970 p = 2, NR
+                     DO 8971 q = 1, p - 1
+                        TEMP1 = XSC * MIN(DABS(V(p,p)),DABS(V(q,q)))
+                        V(p,q) = - DSIGN( TEMP1, V(q,p) )
+ 8971                CONTINUE
+ 8970             CONTINUE
+               ELSE
+                  CALL DLASET( 'L',NR-1,NR-1,ZERO,ZERO,V(2,1),LDV )
+               END IF
+*              Now, compute R2 = L3 * Q3, the LQ factorization.
+               CALL DGELQF( NR, NR, V, LDV, WORK(2*N+N*NR+1),
+     $               WORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, IERR )
+*              .. and estimate the condition number
+               CALL DLACPY( 'L',NR,NR,V,LDV,WORK(2*N+N*NR+NR+1),NR )
+               DO 4950 p = 1, NR
+                  TEMP1 = DNRM2( p, WORK(2*N+N*NR+NR+p), NR )
+                  CALL DSCAL( p, ONE/TEMP1, WORK(2*N+N*NR+NR+p), NR )
+ 4950          CONTINUE
+               CALL DPOCON( 'L',NR,WORK(2*N+N*NR+NR+1),NR,ONE,TEMP1,
+     $              WORK(2*N+N*NR+NR+NR*NR+1),IWORK(M+2*N+1),IERR )
+               CONDR2 = ONE / DSQRT(TEMP1)
+*
+               IF ( CONDR2 .GE. COND_OK ) THEN
+*                 .. save the Householder vectors used for Q3
+*                 (this overwrites the copy of R2, as it will not be
+*                 needed in this branch, but it does not overwrite the
+*                 Householder vectors of Q2).
+                  CALL DLACPY( 'U', NR, NR, V, LDV, WORK(2*N+1), N )
+*                 .. and the rest of the information on Q3 is in
+*                 WORK(2*N+N*NR+1:2*N+N*NR+N)
+               END IF
+*
+            END IF
+*
+            IF ( L2PERT ) THEN
+               XSC = DSQRT(SMALL)
+               DO 4968 q = 2, NR
+                  TEMP1 = XSC * V(q,q)
+                  DO 4969 p = 1, q - 1
+*                    V(p,q) = - DSIGN( TEMP1, V(q,p) )
+                     V(p,q) = - DSIGN( TEMP1, V(p,q) )
+ 4969             CONTINUE
+ 4968          CONTINUE
+            ELSE
+               CALL DLASET( 'U', NR-1,NR-1, ZERO,ZERO, V(1,2), LDV )
+            END IF
+*
+*        Second preconditioning finished; continue with Jacobi SVD
+*        The input matrix is lower triangular.
+*
+*        Recover the right singular vectors as solution of a well
+*        conditioned triangular matrix equation.
+*
+            IF ( CONDR1 .LT. COND_OK ) THEN
+*
+               CALL DGESVJ( 'L','U','N',NR,NR,V,LDV,SVA,NR,U,
+     $              LDU,WORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,INFO )
+               SCALEM  = WORK(2*N+N*NR+NR+1)
+               NUMRANK = IDNINT(WORK(2*N+N*NR+NR+2))
+               DO 3970 p = 1, NR
+                  CALL DCOPY( NR, V(1,p), 1, U(1,p), 1 )
+                  CALL DSCAL( NR, SVA(p),    V(1,p), 1 )
+ 3970          CONTINUE
+
+*        .. pick the right matrix equation and solve it
+*
+               IF ( NR .EQ. N ) THEN
+* :))             .. best case, R1 is inverted. The solution of this matrix
+*                 equation is Q2*V2 = the product of the Jacobi rotations
+*                 used in DGESVJ, premultiplied with the orthogonal matrix
+*                 from the second QR factorization.
+                  CALL DTRSM( 'L','U','N','N', NR,NR,ONE, A,LDA, V,LDV )
+               ELSE
+*                 .. R1 is well conditioned, but non-square. Transpose(R2)
+*                 is inverted to get the product of the Jacobi rotations
+*                 used in DGESVJ. The Q-factor from the second QR
+*                 factorization is then built in explicitly.
+                  CALL DTRSM('L','U','T','N',NR,NR,ONE,WORK(2*N+1),
+     $                 N,V,LDV)
+                  IF ( NR .LT. N ) THEN
+                    CALL DLASET('A',N-NR,NR,ZERO,ZERO,V(NR+1,1),LDV)
+                    CALL DLASET('A',NR,N-NR,ZERO,ZERO,V(1,NR+1),LDV)
+                    CALL DLASET('A',N-NR,N-NR,ZERO,ONE,V(NR+1,NR+1),LDV)
+                  END IF
+                  CALL DORMQR('L','N',N,N,NR,WORK(2*N+1),N,WORK(N+1),
+     $                 V,LDV,WORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR)
+               END IF
+*
+            ELSE IF ( CONDR2 .LT. COND_OK ) THEN
+*
+* :)           .. the input matrix A is very likely a relative of
+*              the Kahan matrix :)
+*              The matrix R2 is inverted. The solution of the matrix equation
+*              is Q3^T*V3 = the product of the Jacobi rotations (applied to
+*              the lower triangular L3 from the LQ factorization of
+*              R2=L3*Q3), pre-multiplied with the transposed Q3.
+               CALL DGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U,
+     $              LDU, WORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, INFO )
+               SCALEM  = WORK(2*N+N*NR+NR+1)
+               NUMRANK = IDNINT(WORK(2*N+N*NR+NR+2))
+               DO 3870 p = 1, NR
+                  CALL DCOPY( NR, V(1,p), 1, U(1,p), 1 )
+                  CALL DSCAL( NR, SVA(p),    U(1,p), 1 )
+ 3870          CONTINUE
+               CALL DTRSM('L','U','N','N',NR,NR,ONE,WORK(2*N+1),N,U,LDU)
+*              .. apply the permutation from the second QR factorization
+               DO 873 q = 1, NR
+                  DO 872 p = 1, NR
+                     WORK(2*N+N*NR+NR+IWORK(N+p)) = U(p,q)
+ 872              CONTINUE
+                  DO 874 p = 1, NR
+                     U(p,q) = WORK(2*N+N*NR+NR+p)
+ 874              CONTINUE
+ 873           CONTINUE
+               IF ( NR .LT. N ) THEN
+                  CALL DLASET( 'A',N-NR,NR,ZERO,ZERO,V(NR+1,1),LDV )
+                  CALL DLASET( 'A',NR,N-NR,ZERO,ZERO,V(1,NR+1),LDV )
+                  CALL DLASET( 'A',N-NR,N-NR,ZERO,ONE,V(NR+1,NR+1),LDV )
+               END IF
+               CALL DORMQR( 'L','N',N,N,NR,WORK(2*N+1),N,WORK(N+1),
+     $              V,LDV,WORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
+            ELSE
+*              Last line of defense.
+* #:(          This is a rather pathological case: no scaled condition
+*              improvement after two pivoted QR factorizations. Other
+*              possibility is that the rank revealing QR factorization
+*              or the condition estimator has failed, or the COND_OK
+*              is set very close to ONE (which is unnecessary). Normally,
+*              this branch should never be executed, but in rare cases of
+*              failure of the RRQR or condition estimator, the last line of
+*              defense ensures that DGEJSV completes the task.
+*              Compute the full SVD of L3 using DGESVJ with explicit
+*              accumulation of Jacobi rotations.
+               CALL DGESVJ( 'L', 'U', 'V', NR, NR, V, LDV, SVA, NR, U,
+     $              LDU, WORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, INFO )
+               SCALEM  = WORK(2*N+N*NR+NR+1)
+               NUMRANK = IDNINT(WORK(2*N+N*NR+NR+2))
+               IF ( NR .LT. N ) THEN
+                  CALL DLASET( 'A',N-NR,NR,ZERO,ZERO,V(NR+1,1),LDV )
+                  CALL DLASET( 'A',NR,N-NR,ZERO,ZERO,V(1,NR+1),LDV )
+                  CALL DLASET( 'A',N-NR,N-NR,ZERO,ONE,V(NR+1,NR+1),LDV )
+               END IF
+               CALL DORMQR( 'L','N',N,N,NR,WORK(2*N+1),N,WORK(N+1),
+     $              V,LDV,WORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
+*
+               CALL DORMLQ( 'L', 'T', NR, NR, NR, WORK(2*N+1), N,
+     $              WORK(2*N+N*NR+1), U, LDU, WORK(2*N+N*NR+NR+1),
+     $              LWORK-2*N-N*NR-NR, IERR )
+               DO 773 q = 1, NR
+                  DO 772 p = 1, NR
+                     WORK(2*N+N*NR+NR+IWORK(N+p)) = U(p,q)
+ 772              CONTINUE
+                  DO 774 p = 1, NR
+                     U(p,q) = WORK(2*N+N*NR+NR+p)
+ 774              CONTINUE
+ 773           CONTINUE
+*
+            END IF
+*
+*           Permute the rows of V using the (column) permutation from the
+*           first QRF. Also, scale the columns to make them unit in
+*           Euclidean norm. This applies to all cases.
+*
+            TEMP1 = DSQRT(DBLE(N)) * EPSLN
+            DO 1972 q = 1, N
+               DO 972 p = 1, N
+                  WORK(2*N+N*NR+NR+IWORK(p)) = V(p,q)
+  972          CONTINUE
+               DO 973 p = 1, N
+                  V(p,q) = WORK(2*N+N*NR+NR+p)
+  973          CONTINUE
+               XSC = ONE / DNRM2( N, V(1,q), 1 )
+               IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) )
+     $           CALL DSCAL( N, XSC, V(1,q), 1 )
+ 1972       CONTINUE
+*           At this moment, V contains the right singular vectors of A.
+*           Next, assemble the left singular vector matrix U (M x N).
+            IF ( NR .LT. M ) THEN
+               CALL DLASET( 'A', M-NR, NR, ZERO, ZERO, U(NR+1,1), LDU )
+               IF ( NR .LT. N1 ) THEN
+                  CALL DLASET('A',NR,N1-NR,ZERO,ZERO,U(1,NR+1),LDU)
+                  CALL DLASET('A',M-NR,N1-NR,ZERO,ONE,U(NR+1,NR+1),LDU)
+               END IF
+            END IF
+*
+*           The Q matrix from the first QRF is built into the left singular
+*           matrix U. This applies to all cases.
+*
+            CALL DORMQR( 'Left', 'No_Tr', M, N1, N, A, LDA, WORK, U,
+     $           LDU, WORK(N+1), LWORK-N, IERR )
+
+*           The columns of U are normalized. The cost is O(M*N) flops.
+            TEMP1 = DSQRT(DBLE(M)) * EPSLN
+            DO 1973 p = 1, NR
+               XSC = ONE / DNRM2( M, U(1,p), 1 )
+               IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) )
+     $          CALL DSCAL( M, XSC, U(1,p), 1 )
+ 1973       CONTINUE
+*
+*           If the initial QRF is computed with row pivoting, the left
+*           singular vectors must be adjusted.
+*
+            IF ( ROWPIV )
+     $          CALL DLASWP( N1, U, LDU, 1, M-1, IWORK(2*N+1), -1 )
+*
+         ELSE
+*
+*        .. the initial matrix A has almost orthogonal columns and
+*        the second QRF is not needed
+*
+            CALL DLACPY( 'Upper', N, N, A, LDA, WORK(N+1), N )
+            IF ( L2PERT ) THEN
+               XSC = DSQRT(SMALL)
+               DO 5970 p = 2, N
+                  TEMP1 = XSC * WORK( N + (p-1)*N + p )
+                  DO 5971 q = 1, p - 1
+                     WORK(N+(q-1)*N+p)=-DSIGN(TEMP1,WORK(N+(p-1)*N+q))
+ 5971             CONTINUE
+ 5970          CONTINUE
+            ELSE
+               CALL DLASET( 'Lower',N-1,N-1,ZERO,ZERO,WORK(N+2),N )
+            END IF
+*
+            CALL DGESVJ( 'Upper', 'U', 'N', N, N, WORK(N+1), N, SVA,
+     $           N, U, LDU, WORK(N+N*N+1), LWORK-N-N*N, INFO )
+*
+            SCALEM  = WORK(N+N*N+1)
+            NUMRANK = IDNINT(WORK(N+N*N+2))
+            DO 6970 p = 1, N
+               CALL DCOPY( N, WORK(N+(p-1)*N+1), 1, U(1,p), 1 )
+               CALL DSCAL( N, SVA(p), WORK(N+(p-1)*N+1), 1 )
+ 6970       CONTINUE
+*
+            CALL DTRSM( 'Left', 'Upper', 'NoTrans', 'No UD', N, N,
+     $           ONE, A, LDA, WORK(N+1), N )
+            DO 6972 p = 1, N
+               CALL DCOPY( N, WORK(N+p), N, V(IWORK(p),1), LDV )
+ 6972       CONTINUE
+            TEMP1 = DSQRT(DBLE(N))*EPSLN
+            DO 6971 p = 1, N
+               XSC = ONE / DNRM2( N, V(1,p), 1 )
+               IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) )
+     $            CALL DSCAL( N, XSC, V(1,p), 1 )
+ 6971       CONTINUE
+*
+*           Assemble the left singular vector matrix U (M x N).
+*
+            IF ( N .LT. M ) THEN
+               CALL DLASET( 'A',  M-N, N, ZERO, ZERO, U(N+1,1), LDU )
+               IF ( N .LT. N1 ) THEN
+                  CALL DLASET( 'A',N,  N1-N, ZERO, ZERO,  U(1,N+1),LDU )
+                  CALL DLASET( 'A',M-N,N1-N, ZERO, ONE,U(N+1,N+1),LDU )
+               END IF
+            END IF
+            CALL DORMQR( 'Left', 'No Tr', M, N1, N, A, LDA, WORK, U,
+     $           LDU, WORK(N+1), LWORK-N, IERR )
+            TEMP1 = DSQRT(DBLE(M))*EPSLN
+            DO 6973 p = 1, N1
+               XSC = ONE / DNRM2( M, U(1,p), 1 )
+               IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) )
+     $            CALL DSCAL( M, XSC, U(1,p), 1 )
+ 6973       CONTINUE
+*
+            IF ( ROWPIV )
+     $         CALL DLASWP( N1, U, LDU, 1, M-1, IWORK(2*N+1), -1 )
+*
+         END IF
+*
+*        end of the  >> almost orthogonal case <<  in the full SVD
+*
+         ELSE
+*
+*        This branch deploys a preconditioned Jacobi SVD with explicitly
+*        accumulated rotations. It is included as optional, mainly for
+*        experimental purposes. It does perform well, and can also be used.
+*        In this implementation, this branch will be automatically activated
+*        if the  condition number sigma_max(A) / sigma_min(A) is predicted
+*        to be greater than the overflow threshold. This is because the
+*        a posteriori computation of the singular vectors assumes robust
+*        implementation of BLAS and some LAPACK procedures, capable of working
+*        in presence of extreme values. Since that is not always the case, ...
+*
+         DO 7968 p = 1, NR
+            CALL DCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 )
+ 7968    CONTINUE
+*
+         IF ( L2PERT ) THEN
+            XSC = DSQRT(SMALL/EPSLN)
+            DO 5969 q = 1, NR
+               TEMP1 = XSC*DABS( V(q,q) )
+               DO 5968 p = 1, N
+                  IF ( ( p .GT. q ) .AND. ( DABS(V(p,q)) .LE. TEMP1 )
+     $                .OR. ( p .LT. q ) )
+     $                V(p,q) = DSIGN( TEMP1, V(p,q) )
+                  IF ( p .LT. q ) V(p,q) = - V(p,q)
+ 5968          CONTINUE
+ 5969       CONTINUE
+         ELSE
+            CALL DLASET( 'U', NR-1, NR-1, ZERO, ZERO, V(1,2), LDV )
+         END IF
+
+         CALL DGEQRF( N, NR, V, LDV, WORK(N+1), WORK(2*N+1),
+     $        LWORK-2*N, IERR )
+         CALL DLACPY( 'L', N, NR, V, LDV, WORK(2*N+1), N )
+*
+         DO 7969 p = 1, NR
+            CALL DCOPY( NR-p+1, V(p,p), LDV, U(p,p), 1 )
+ 7969    CONTINUE
+
+         IF ( L2PERT ) THEN
+            XSC = DSQRT(SMALL/EPSLN)
+            DO 9970 q = 2, NR
+               DO 9971 p = 1, q - 1
+                  TEMP1 = XSC * MIN(DABS(U(p,p)),DABS(U(q,q)))
+                  U(p,q) = - DSIGN( TEMP1, U(q,p) )
+ 9971          CONTINUE
+ 9970       CONTINUE
+         ELSE
+            CALL DLASET('U', NR-1, NR-1, ZERO, ZERO, U(1,2), LDU )
+         END IF
+
+         CALL DGESVJ( 'G', 'U', 'V', NR, NR, U, LDU, SVA,
+     $        N, V, LDV, WORK(2*N+N*NR+1), LWORK-2*N-N*NR, INFO )
+         SCALEM  = WORK(2*N+N*NR+1)
+         NUMRANK = IDNINT(WORK(2*N+N*NR+2))
+
+         IF ( NR .LT. N ) THEN
+            CALL DLASET( 'A',N-NR,NR,ZERO,ZERO,V(NR+1,1),LDV )
+            CALL DLASET( 'A',NR,N-NR,ZERO,ZERO,V(1,NR+1),LDV )
+            CALL DLASET( 'A',N-NR,N-NR,ZERO,ONE,V(NR+1,NR+1),LDV )
+         END IF
+
+         CALL DORMQR( 'L','N',N,N,NR,WORK(2*N+1),N,WORK(N+1),
+     $        V,LDV,WORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
+*
+*           Permute the rows of V using the (column) permutation from the
+*           first QRF. Also, scale the columns to make them unit in
+*           Euclidean norm. This applies to all cases.
+*
+            TEMP1 = DSQRT(DBLE(N)) * EPSLN
+            DO 7972 q = 1, N
+               DO 8972 p = 1, N
+                  WORK(2*N+N*NR+NR+IWORK(p)) = V(p,q)
+ 8972          CONTINUE
+               DO 8973 p = 1, N
+                  V(p,q) = WORK(2*N+N*NR+NR+p)
+ 8973          CONTINUE
+               XSC = ONE / DNRM2( N, V(1,q), 1 )
+               IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) )
+     $           CALL DSCAL( N, XSC, V(1,q), 1 )
+ 7972       CONTINUE
+*
+*           At this moment, V contains the right singular vectors of A.
+*           Next, assemble the left singular vector matrix U (M x N).
+*
+         IF ( NR .LT. M ) THEN
+            CALL DLASET( 'A',  M-NR, NR, ZERO, ZERO, U(NR+1,1), LDU )
+            IF ( NR .LT. N1 ) THEN
+               CALL DLASET( 'A',NR,  N1-NR, ZERO, ZERO,  U(1,NR+1),LDU )
+               CALL DLASET( 'A',M-NR,N1-NR, ZERO, ONE,U(NR+1,NR+1),LDU )
+            END IF
+         END IF
+*
+         CALL DORMQR( 'Left', 'No Tr', M, N1, N, A, LDA, WORK, U,
+     $        LDU, WORK(N+1), LWORK-N, IERR )
+*
+            IF ( ROWPIV )
+     $         CALL DLASWP( N1, U, LDU, 1, M-1, IWORK(2*N+1), -1 )
+*
+*
+         END IF
+         IF ( TRANSP ) THEN
+*           .. swap U and V because the procedure worked on A^t
+            DO 6974 p = 1, N
+               CALL DSWAP( N, U(1,p), 1, V(1,p), 1 )
+ 6974       CONTINUE
+         END IF
+*
+      END IF
+*     end of the full SVD
+*
+*     Undo scaling, if necessary (and possible)
+*
+      IF ( USCAL2 .LE. (BIG/SVA(1))*USCAL1 ) THEN
+         CALL DLASCL( 'G', 0, 0, USCAL1, USCAL2, NR, 1, SVA, N, IERR )
+         USCAL1 = ONE
+         USCAL2 = ONE
+      END IF
+*
+      IF ( NR .LT. N ) THEN
+         DO 3004 p = NR+1, N
+            SVA(p) = ZERO
+ 3004    CONTINUE
+      END IF
+*
+      WORK(1) = USCAL2 * SCALEM
+      WORK(2) = USCAL1
+      IF ( ERREST ) WORK(3) = SCONDA
+      IF ( LSVEC .AND. RSVEC ) THEN
+         WORK(4) = CONDR1
+         WORK(5) = CONDR2
+      END IF
+      IF ( L2TRAN ) THEN
+         WORK(6) = ENTRA
+         WORK(7) = ENTRAT
+      END IF
+*
+      IWORK(1) = NR
+      IWORK(2) = NUMRANK
+      IWORK(3) = WARNING
+*
+      RETURN
+*     ..
+*     .. END OF DGEJSV
+*     ..
+      END
+*
diff --git a/lapack-netlib/dgesvx.f b/lapack-netlib/dgesvx.f
new file mode 100644
index 000000000..f787488dc
--- /dev/null
+++ b/lapack-netlib/dgesvx.f
@@ -0,0 +1,599 @@
+*> \brief <b> DGESVX computes the solution to a system of linear equations A * X = B for GE matrices</b>
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DGESVX + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dgesvx.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dgesvx.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dgesvx.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV,
+*                          EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR,
+*                          WORK, IWORK, INFO )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          EQUED, FACT, TRANS
+*       INTEGER            INFO, LDA, LDAF, LDB, LDX, N, NRHS
+*       DOUBLE PRECISION   RCOND
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IPIV( * ), IWORK( * )
+*       DOUBLE PRECISION   A( LDA, * ), AF( LDAF, * ), B( LDB, * ),
+*      $                   BERR( * ), C( * ), FERR( * ), R( * ),
+*      $                   WORK( * ), X( LDX, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DGESVX uses the LU factorization to compute the solution to a real
+*> system of linear equations
+*>    A * X = B,
+*> where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
+*>
+*> Error bounds on the solution and a condition estimate are also
+*> provided.
+*> \endverbatim
+*
+*> \par Description:
+*  =================
+*>
+*> \verbatim
+*>
+*> The following steps are performed:
+*>
+*> 1. If FACT = 'E', real scaling factors are computed to equilibrate
+*>    the system:
+*>       TRANS = 'N':  diag(R)*A*diag(C)     *inv(diag(C))*X = diag(R)*B
+*>       TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
+*>       TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
+*>    Whether or not the system will be equilibrated depends on the
+*>    scaling of the matrix A, but if equilibration is used, A is
+*>    overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N')
+*>    or diag(C)*B (if TRANS = 'T' or 'C').
+*>
+*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the
+*>    matrix A (after equilibration if FACT = 'E') as
+*>       A = P * L * U,
+*>    where P is a permutation matrix, L is a unit lower triangular
+*>    matrix, and U is upper triangular.
+*>
+*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine
+*>    returns with INFO = i. Otherwise, the factored form of A is used
+*>    to estimate the condition number of the matrix A.  If the
+*>    reciprocal of the condition number is less than machine precision,
+*>    INFO = N+1 is returned as a warning, but the routine still goes on
+*>    to solve for X and compute error bounds as described below.
+*>
+*> 4. The system of equations is solved for X using the factored form
+*>    of A.
+*>
+*> 5. Iterative refinement is applied to improve the computed solution
+*>    matrix and calculate error bounds and backward error estimates
+*>    for it.
+*>
+*> 6. If equilibration was used, the matrix X is premultiplied by
+*>    diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so
+*>    that it solves the original system before equilibration.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] FACT
+*> \verbatim
+*>          FACT is CHARACTER*1
+*>          Specifies whether or not the factored form of the matrix A is
+*>          supplied on entry, and if not, whether the matrix A should be
+*>          equilibrated before it is factored.
+*>          = 'F':  On entry, AF and IPIV contain the factored form of A.
+*>                  If EQUED is not 'N', the matrix A has been
+*>                  equilibrated with scaling factors given by R and C.
+*>                  A, AF, and IPIV are not modified.
+*>          = 'N':  The matrix A will be copied to AF and factored.
+*>          = 'E':  The matrix A will be equilibrated if necessary, then
+*>                  copied to AF and factored.
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*>          TRANS is CHARACTER*1
+*>          Specifies the form of the system of equations:
+*>          = 'N':  A * X = B     (No transpose)
+*>          = 'T':  A**T * X = B  (Transpose)
+*>          = 'C':  A**H * X = B  (Conjugate transpose = Transpose)
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of linear equations, i.e., the order of the
+*>          matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of columns
+*>          of the matrices B and X.  NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is DOUBLE PRECISION array, dimension (LDA,N)
+*>          On entry, the N-by-N matrix A.  If FACT = 'F' and EQUED is
+*>          not 'N', then A must have been equilibrated by the scaling
+*>          factors in R and/or C.  A is not modified if FACT = 'F' or
+*>          'N', or if FACT = 'E' and EQUED = 'N' on exit.
+*>
+*>          On exit, if EQUED .ne. 'N', A is scaled as follows:
+*>          EQUED = 'R':  A := diag(R) * A
+*>          EQUED = 'C':  A := A * diag(C)
+*>          EQUED = 'B':  A := diag(R) * A * diag(C).
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in,out] AF
+*> \verbatim
+*>          AF is DOUBLE PRECISION array, dimension (LDAF,N)
+*>          If FACT = 'F', then AF is an input argument and on entry
+*>          contains the factors L and U from the factorization
+*>          A = P*L*U as computed by DGETRF.  If EQUED .ne. 'N', then
+*>          AF is the factored form of the equilibrated matrix A.
+*>
+*>          If FACT = 'N', then AF is an output argument and on exit
+*>          returns the factors L and U from the factorization A = P*L*U
+*>          of the original matrix A.
+*>
+*>          If FACT = 'E', then AF is an output argument and on exit
+*>          returns the factors L and U from the factorization A = P*L*U
+*>          of the equilibrated matrix A (see the description of A for
+*>          the form of the equilibrated matrix).
+*> \endverbatim
+*>
+*> \param[in] LDAF
+*> \verbatim
+*>          LDAF is INTEGER
+*>          The leading dimension of the array AF.  LDAF >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in,out] IPIV
+*> \verbatim
+*>          IPIV is INTEGER array, dimension (N)
+*>          If FACT = 'F', then IPIV is an input argument and on entry
+*>          contains the pivot indices from the factorization A = P*L*U
+*>          as computed by DGETRF; row i of the matrix was interchanged
+*>          with row IPIV(i).
+*>
+*>          If FACT = 'N', then IPIV is an output argument and on exit
+*>          contains the pivot indices from the factorization A = P*L*U
+*>          of the original matrix A.
+*>
+*>          If FACT = 'E', then IPIV is an output argument and on exit
+*>          contains the pivot indices from the factorization A = P*L*U
+*>          of the equilibrated matrix A.
+*> \endverbatim
+*>
+*> \param[in,out] EQUED
+*> \verbatim
+*>          EQUED is CHARACTER*1
+*>          Specifies the form of equilibration that was done.
+*>          = 'N':  No equilibration (always true if FACT = 'N').
+*>          = 'R':  Row equilibration, i.e., A has been premultiplied by
+*>                  diag(R).
+*>          = 'C':  Column equilibration, i.e., A has been postmultiplied
+*>                  by diag(C).
+*>          = 'B':  Both row and column equilibration, i.e., A has been
+*>                  replaced by diag(R) * A * diag(C).
+*>          EQUED is an input argument if FACT = 'F'; otherwise, it is an
+*>          output argument.
+*> \endverbatim
+*>
+*> \param[in,out] R
+*> \verbatim
+*>          R is DOUBLE PRECISION array, dimension (N)
+*>          The row scale factors for A.  If EQUED = 'R' or 'B', A is
+*>          multiplied on the left by diag(R); if EQUED = 'N' or 'C', R
+*>          is not accessed.  R is an input argument if FACT = 'F';
+*>          otherwise, R is an output argument.  If FACT = 'F' and
+*>          EQUED = 'R' or 'B', each element of R must be positive.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is DOUBLE PRECISION array, dimension (N)
+*>          The column scale factors for A.  If EQUED = 'C' or 'B', A is
+*>          multiplied on the right by diag(C); if EQUED = 'N' or 'R', C
+*>          is not accessed.  C is an input argument if FACT = 'F';
+*>          otherwise, C is an output argument.  If FACT = 'F' and
+*>          EQUED = 'C' or 'B', each element of C must be positive.
+*> \endverbatim
+*>
+*> \param[in,out] B
+*> \verbatim
+*>          B is DOUBLE PRECISION array, dimension (LDB,NRHS)
+*>          On entry, the N-by-NRHS right hand side matrix B.
+*>          On exit,
+*>          if EQUED = 'N', B is not modified;
+*>          if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by
+*>          diag(R)*B;
+*>          if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is
+*>          overwritten by diag(C)*B.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*>          LDB is INTEGER
+*>          The leading dimension of the array B.  LDB >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] X
+*> \verbatim
+*>          X is DOUBLE PRECISION array, dimension (LDX,NRHS)
+*>          If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X
+*>          to the original system of equations.  Note that A and B are
+*>          modified on exit if EQUED .ne. 'N', and the solution to the
+*>          equilibrated system is inv(diag(C))*X if TRANS = 'N' and
+*>          EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C'
+*>          and EQUED = 'R' or 'B'.
+*> \endverbatim
+*>
+*> \param[in] LDX
+*> \verbatim
+*>          LDX is INTEGER
+*>          The leading dimension of the array X.  LDX >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] RCOND
+*> \verbatim
+*>          RCOND is DOUBLE PRECISION
+*>          The estimate of the reciprocal condition number of the matrix
+*>          A after equilibration (if done).  If RCOND is less than the
+*>          machine precision (in particular, if RCOND = 0), the matrix
+*>          is singular to working precision.  This condition is
+*>          indicated by a return code of INFO > 0.
+*> \endverbatim
+*>
+*> \param[out] FERR
+*> \verbatim
+*>          FERR is DOUBLE PRECISION array, dimension (NRHS)
+*>          The estimated forward error bound for each solution vector
+*>          X(j) (the j-th column of the solution matrix X).
+*>          If XTRUE is the true solution corresponding to X(j), FERR(j)
+*>          is an estimated upper bound for the magnitude of the largest
+*>          element in (X(j) - XTRUE) divided by the magnitude of the
+*>          largest element in X(j).  The estimate is as reliable as
+*>          the estimate for RCOND, and is almost always a slight
+*>          overestimate of the true error.
+*> \endverbatim
+*>
+*> \param[out] BERR
+*> \verbatim
+*>          BERR is DOUBLE PRECISION array, dimension (NRHS)
+*>          The componentwise relative backward error of each solution
+*>          vector X(j) (i.e., the smallest relative change in
+*>          any element of A or B that makes X(j) an exact solution).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(1,4*N))
+*>          On exit, WORK(1) contains the reciprocal pivot growth
+*>          factor norm(A)/norm(U). The "max absolute element" norm is
+*>          used. If WORK(1) is much less than 1, then the stability
+*>          of the LU factorization of the (equilibrated) matrix A
+*>          could be poor. This also means that the solution X, condition
+*>          estimator RCOND, and forward error bound FERR could be
+*>          unreliable. If factorization fails with 0<INFO<=N, then
+*>          WORK(1) contains the reciprocal pivot growth factor for the
+*>          leading INFO columns of A.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (N)
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i, and i is
+*>                <= N:  U(i,i) is exactly zero.  The factorization has
+*>                       been completed, but the factor U is exactly
+*>                       singular, so the solution and error bounds
+*>                       could not be computed. RCOND = 0 is returned.
+*>                = N+1: U is nonsingular, but RCOND is less than machine
+*>                       precision, meaning that the matrix is singular
+*>                       to working precision.  Nevertheless, the
+*>                       solution and error bounds are computed because
+*>                       there are a number of situations where the
+*>                       computed solution can be more accurate than the
+*>                       value of RCOND would suggest.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup doubleGEsolve
+*
+*  =====================================================================
+      SUBROUTINE DGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV,
+     $                   EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR,
+     $                   WORK, IWORK, INFO )
+*  Purpose: LU-based solve with condition estimate and error bounds.
+*  -- LAPACK driver routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      CHARACTER          EQUED, FACT, TRANS
+      INTEGER            INFO, LDA, LDAF, LDB, LDX, N, NRHS
+      DOUBLE PRECISION   RCOND
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IPIV( * ), IWORK( * )
+      DOUBLE PRECISION   A( LDA, * ), AF( LDAF, * ), B( LDB, * ),
+     $                   BERR( * ), C( * ), FERR( * ), R( * ),
+     $                   WORK( * ), X( LDX, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU
+      CHARACTER          NORM
+      INTEGER            I, INFEQU, J
+      DOUBLE PRECISION   AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN,
+     $                   ROWCND, RPVGRW, SMLNUM
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      DOUBLE PRECISION   DLAMCH, DLANGE, DLANTR
+      EXTERNAL           LSAME, DLAMCH, DLANGE, DLANTR
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DGECON, DGEEQU, DGERFS, DGETRF, DGETRS, DLACPY,
+     $                   DLAQGE, XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*     Decode FACT and TRANS; EQUED is an input only when FACT = 'F'.
+      INFO = 0
+      NOFACT = LSAME( FACT, 'N' )
+      EQUIL = LSAME( FACT, 'E' )
+      NOTRAN = LSAME( TRANS, 'N' )
+      IF( NOFACT .OR. EQUIL ) THEN
+         EQUED = 'N'
+         ROWEQU = .FALSE.
+         COLEQU = .FALSE.
+      ELSE
+         ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+         COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         SMLNUM = DLAMCH( 'Safe minimum' )
+         BIGNUM = ONE / SMLNUM
+      END IF
+*
+*     Test the input parameters.
+*     R and C are checked only when FACT = 'F' (ROWEQU/COLEQU set).
+      IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) )
+     $     THEN
+         INFO = -1
+      ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT.
+     $         LSAME( TRANS, 'C' ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( NRHS.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -6
+      ELSE IF( LDAF.LT.MAX( 1, N ) ) THEN
+         INFO = -8
+      ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT.
+     $         ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN
+         INFO = -10
+      ELSE
+         IF( ROWEQU ) THEN
+            RCMIN = BIGNUM
+            RCMAX = ZERO
+            DO 10 J = 1, N
+               RCMIN = MIN( RCMIN, R( J ) )
+               RCMAX = MAX( RCMAX, R( J ) )
+   10       CONTINUE
+            IF( RCMIN.LE.ZERO ) THEN
+               INFO = -11
+            ELSE IF( N.GT.0 ) THEN
+               ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM )
+            ELSE
+               ROWCND = ONE
+            END IF
+         END IF
+         IF( COLEQU .AND. INFO.EQ.0 ) THEN
+            RCMIN = BIGNUM
+            RCMAX = ZERO
+            DO 20 J = 1, N
+               RCMIN = MIN( RCMIN, C( J ) )
+               RCMAX = MAX( RCMAX, C( J ) )
+   20       CONTINUE
+            IF( RCMIN.LE.ZERO ) THEN
+               INFO = -12
+            ELSE IF( N.GT.0 ) THEN
+               COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM )
+            ELSE
+               COLCND = ONE
+            END IF
+         END IF
+         IF( INFO.EQ.0 ) THEN
+            IF( LDB.LT.MAX( 1, N ) ) THEN
+               INFO = -14
+            ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
+               INFO = -16
+            END IF
+         END IF
+      END IF
+*     Report the first illegal argument through XERBLA and return.
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'DGESVX', -INFO )
+         RETURN
+      END IF
+*
+      IF( EQUIL ) THEN
+*
+*        Compute row and column scalings to equilibrate the matrix A.
+*
+         CALL DGEEQU( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX, INFEQU )
+         IF( INFEQU.EQ.0 ) THEN
+*
+*           Equilibrate the matrix.
+*
+            CALL DLAQGE( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX,
+     $                   EQUED )
+            ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+            COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         END IF
+      END IF
+*
+*     Scale the right hand side.
+*     B := diag(R)*B (NOTRAN, ROWEQU) or diag(C)*B (else, COLEQU).
+      IF( NOTRAN ) THEN
+         IF( ROWEQU ) THEN
+            DO 40 J = 1, NRHS
+               DO 30 I = 1, N
+                  B( I, J ) = R( I )*B( I, J )
+   30          CONTINUE
+   40       CONTINUE
+         END IF
+      ELSE IF( COLEQU ) THEN
+         DO 60 J = 1, NRHS
+            DO 50 I = 1, N
+               B( I, J ) = C( I )*B( I, J )
+   50       CONTINUE
+   60    CONTINUE
+      END IF
+*
+      IF( NOFACT .OR. EQUIL ) THEN
+*
+*        Compute the LU factorization of A.
+*
+         CALL DLACPY( 'Full', N, N, A, LDA, AF, LDAF )
+         CALL DGETRF( N, N, AF, LDAF, IPIV, INFO )
+*
+*        Return if INFO is positive: DGETRF found U(INFO,INFO) = 0.
+*
+         IF( INFO.GT.0 ) THEN
+*
+*           Compute the reciprocal pivot growth factor of the
+*           leading rank-deficient INFO columns of A.
+*
+            RPVGRW = DLANTR( 'M', 'U', 'N', INFO, INFO, AF, LDAF,
+     $               WORK )
+            IF( RPVGRW.EQ.ZERO ) THEN
+               RPVGRW = ONE
+            ELSE
+               RPVGRW = DLANGE( 'M', N, INFO, A, LDA, WORK ) / RPVGRW
+            END IF
+            WORK( 1 ) = RPVGRW
+            RCOND = ZERO
+            RETURN
+         END IF
+      END IF
+*
+*     Compute the norm of the matrix A and the
+*     reciprocal pivot growth factor RPVGRW.
+*     The 1-norm is used if TRANS = 'N', the infinity-norm otherwise.
+      IF( NOTRAN ) THEN
+         NORM = '1'
+      ELSE
+         NORM = 'I'
+      END IF
+      ANORM = DLANGE( NORM, N, N, A, LDA, WORK )
+      RPVGRW = DLANTR( 'M', 'U', 'N', N, N, AF, LDAF, WORK )
+      IF( RPVGRW.EQ.ZERO ) THEN
+         RPVGRW = ONE
+      ELSE
+         RPVGRW = DLANGE( 'M', N, N, A, LDA, WORK ) / RPVGRW
+      END IF
+*
+*     Compute the reciprocal of the condition number of A.
+*
+      CALL DGECON( NORM, N, AF, LDAF, ANORM, RCOND, WORK, IWORK, INFO )
+*
+*     Compute the solution matrix X.
+*
+      CALL DLACPY( 'Full', N, NRHS, B, LDB, X, LDX )
+      CALL DGETRS( TRANS, N, NRHS, AF, LDAF, IPIV, X, LDX, INFO )
+*
+*     Use iterative refinement to improve the computed solution and
+*     compute error bounds and backward error estimates for it.
+*
+      CALL DGERFS( TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, B, LDB, X,
+     $             LDX, FERR, BERR, WORK, IWORK, INFO )
+*
+*     Transform the solution matrix X to a solution of the original
+*     system.
+*     When X is rescaled, FERR is divided by COLCND or ROWCND.
+      IF( NOTRAN ) THEN
+         IF( COLEQU ) THEN
+            DO 80 J = 1, NRHS
+               DO 70 I = 1, N
+                  X( I, J ) = C( I )*X( I, J )
+   70          CONTINUE
+   80       CONTINUE
+            DO 90 J = 1, NRHS
+               FERR( J ) = FERR( J ) / COLCND
+   90       CONTINUE
+         END IF
+      ELSE IF( ROWEQU ) THEN
+         DO 110 J = 1, NRHS
+            DO 100 I = 1, N
+               X( I, J ) = R( I )*X( I, J )
+  100       CONTINUE
+  110    CONTINUE
+         DO 120 J = 1, NRHS
+            FERR( J ) = FERR( J ) / ROWCND
+  120    CONTINUE
+      END IF
+*     Return the reciprocal pivot growth factor in WORK( 1 ).
+      WORK( 1 ) = RPVGRW
+*
+*     Set INFO = N+1 if the matrix is singular to working precision.
+*
+      IF( RCOND.LT.DLAMCH( 'Epsilon' ) )
+     $   INFO = N + 1
+      RETURN
+*
+*     End of DGESVX
+*
+      END
diff --git a/lapack-netlib/lapack_testing.py b/lapack-netlib/lapack_testing.py
index 5582744a0..fea0ff765 100755
--- a/lapack-netlib/lapack_testing.py
+++ b/lapack-netlib/lapack_testing.py
@@ -1,31 +1,29 @@
-#! /usr/bin/env python
-# -*- coding: utf-8 -*-
+#!/usr/bin/env python3
 
 
 ###############################################################################
 # lapack_testing.py
 ###############################################################################
 
-from __future__ import print_function
 from subprocess import Popen, STDOUT, PIPE
 import os, sys, math
 import getopt
 # Arguments
 try:
     opts, args = getopt.getopt(sys.argv[1:], "hd:b:srep:t:n",
-                               ["help", "dir", "bin", "short", "run", "error","prec=","test=","number"])
+                               ["help", "dir=", "bin=", "short", "run", "error","prec=","test=","number"])
 
 except getopt.error as msg:
     print(msg)
     print("for help use --help")
     sys.exit(2)
 
-short_summary=0
-with_file=1
-just_errors = 0
+short_summary = False
+with_file = True
+just_errors = False
 prec='x'
 test='all'
-only_numbers=0
+only_numbers = False
 test_dir='TESTING'
 bin_dir='bin/Release'
 
@@ -34,10 +32,9 @@ for o, a in opts:
         print(sys.argv[0]+" [-h|--help] [-d dir |--dir dir] [-s |--short] [-r |--run] [-e |--error] [-p p |--prec p] [-t test |--test test] [-n | --number]")
         print("     - h is to print this message")
         print("     - r is to use to run the LAPACK tests then analyse the output (.out files). By default, the script will not run all the LAPACK tests")
-        print("     - d [dir] is to indicate where is the LAPACK testing directory (.out files). By default, the script will use .")
-        print("     - b [bin] is to indicate where is the LAPACK binary files are located. By default, the script will use .")
+        print("     - d [dir] indicates the location of the LAPACK testing directory (.out files). By default, the script will use {:s}.".format(test_dir))
+        print("     - b [bin] indicates the location of the LAPACK binary files. By default, the script will use {:s}.".format(bin_dir))
         print(" LEVEL OF OUTPUT")
-        print("     - x is to print a detailed summary")
         print("     - e is to print only the error summary")
         print("     - s is to print a short summary")
         print("     - n is to print the numbers of failing tests (turn on summary mode)")
@@ -63,15 +60,14 @@ for o, a in opts:
         print("            Will return the numbers of failed tests in REAL precision by running the LAPACK Tests then analyzing the output")
         print("     ./lapack_testing.py -n -p s -t eig ")
         print("            Will return the numbers of failed tests in REAL precision by analyzing only the LAPACK output of EIGEN testings")
-        print("Written by Julie Langou (June 2011) ")
         sys.exit(0)
     else:
         if o in ("-s", "--short"):
-            short_summary = 1
+            short_summary = True
         if o in ("-r", "--run"):
-            with_file = 0
+            with_file = False
         if o in ("-e", "--error"):
-            just_errors = 1
+            just_errors = True
         if o in ( '-p', '--prec' ):
             prec = a
         if o in ( '-b', '--bin' ):
@@ -81,12 +77,12 @@ for o, a in opts:
         if o in ( '-t', '--test' ):
             test = a
         if o in ( '-n', '--number' ):
-            only_numbers = 1
-            short_summary = 1
+            only_numbers = True
+            short_summary = True
 
 # process options
 
-abs_bin_dir=os.path.normpath(os.path.join(os.getcwd(),bin_dir))
+abs_bin_dir=os.path.abspath(bin_dir)
 
 os.chdir(test_dir)
 
@@ -108,7 +104,7 @@ def run_summary_test( f, cmdline, short_summary):
     nb_test_illegal=0
     nb_test_info=0
 
-    if (with_file):
+    if with_file:
         if not os.path.exists(cmdline):
             error_message=cmdline+" file not found"
             r=1
@@ -140,21 +136,21 @@ def run_summary_test( f, cmdline, short_summary):
         for line in pipe.readlines():
             f.write(str(line))
             words_in_line=line.split()
-            if (line.find("run")!=-1):
+            if (line.find("run)")!=-1):
 #                  print line
                 whereisrun=words_in_line.index("run)")
                 nb_test_run+=int(words_in_line[whereisrun-2])
             if (line.find("out of")!=-1):
-                if (short_summary==0): print(line, end=' ')
+                if not short_summary: print(line, end=' ')
                 whereisout= words_in_line.index("out")
                 nb_test_fail+=int(words_in_line[whereisout-1])
             if ((line.find("illegal")!=-1) or (line.find("Illegal")!=-1)):
-                if (short_summary==0):print(line, end=' ')
+                if not short_summary: print(line, end=' ')
                 nb_test_illegal+=1
             if (line.find(" INFO")!=-1):
-                if (short_summary==0):print(line, end=' ')
+                if not short_summary: print(line, end=' ')
                 nb_test_info+=1
-            if (with_file==1):
+            if with_file:
                 pipe.close()
 
     f.flush();
@@ -169,7 +165,7 @@ try:
 except IOError:
     f = sys.stdout
 
-if (short_summary==0):
+if not short_summary:
     print(" ")
     print("---------------- Testing LAPACK Routines ----------------")
     print(" ")
@@ -203,6 +199,8 @@ elif test=='mixed':
     range_prec=[1,3]
 elif test=='rfp':
     range_test=[18]
+elif test=='dmd':
+    range_test=[20]
 elif test=='eig':
     range_test=list(range(16))
 else:
@@ -219,7 +217,7 @@ for dtype in range_prec:
     letter = dtypes[0][dtype]
     name = dtypes[1][dtype]
 
-    if (short_summary==0):
+    if not short_summary:
         print(" ")
         print("------------------------- %s ------------------------" % name)
         print(" ")
@@ -231,19 +229,19 @@ for dtype in range_prec:
     letter+"gd",letter+"sb",letter+"sg",
     letter+"bb","glm","gqr",
     "gsv","csd","lse",
-    letter+"test", letter+dtypes[0][dtype-1]+"test",letter+"test_rfp"),
+    letter+"test", letter+dtypes[0][dtype-1]+"test",letter+"test_rfp",letter+"dmd"),
     ("Nonsymmetric-Eigenvalue-Problem", "Symmetric-Eigenvalue-Problem", "Symmetric-Eigenvalue-Problem-2-stage", "Singular-Value-Decomposition",
     "Eigen-Condition","Nonsymmetric-Eigenvalue","Nonsymmetric-Generalized-Eigenvalue-Problem",
     "Nonsymmetric-Generalized-Eigenvalue-Problem-driver", "Symmetric-Eigenvalue-Problem", "Symmetric-Eigenvalue-Generalized-Problem",
     "Banded-Singular-Value-Decomposition-routines", "Generalized-Linear-Regression-Model-routines", "Generalized-QR-and-RQ-factorization-routines",
     "Generalized-Singular-Value-Decomposition-routines", "CS-Decomposition-routines", "Constrained-Linear-Least-Squares-routines",
-    "Linear-Equation-routines", "Mixed-Precision-linear-equation-routines","RFP-linear-equation-routines"),
+    "Linear-Equation-routines", "Mixed-Precision-linear-equation-routines","RFP-linear-equation-routines","Dynamic-Mode-Decomposition"),
     (letter+"nep", letter+"sep", letter+"se2", letter+"svd",
     letter+"ec",letter+"ed",letter+"gg",
     letter+"gd",letter+"sb",letter+"sg",
     letter+"bb",letter+"glm",letter+"gqr",
     letter+"gsv",letter+"csd",letter+"lse",
-    letter+"test", letter+dtypes[0][dtype-1]+"test",letter+"test_rfp"),
+    letter+"test", letter+dtypes[0][dtype-1]+"test",letter+"test_rfp",letter+"dmd"),
     )
 
 
@@ -252,7 +250,7 @@ for dtype in range_prec:
         # NEED TO SKIP SOME PRECISION (namely s and c) FOR PROTO MIXED PRECISION TESTING
         if dtest==17 and (letter=="s" or letter=="c"):
             continue
-        if (with_file==1):
+        if with_file:
             cmdbase=dtests[2][dtest]+".out"
         else:
             if dtest==16:
@@ -264,10 +262,13 @@ for dtype in range_prec:
             elif dtest==18:
                 # PROTO LIN TESTS
                 cmdbase="LIN/xlintstrf"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out"
+            elif dtest==20:
+                # DMD EIG TESTS
+                cmdbase="EIG/xdmdeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out"
             else:
                 # EIG TESTS
                 cmdbase="EIG/xeigtst"+letter+" < "+dtests[0][dtest]+".in > "+dtests[2][dtest]+".out"
-        if (not just_errors and not short_summary):
+        if not just_errors and not short_summary:
             print("Testing "+name+" "+dtests[1][dtest]+"-"+cmdbase, end=' ')
         # Run the process: either to read the file or run the LAPACK testing
         nb_test = run_summary_test(f, cmdbase, short_summary)
@@ -277,19 +278,19 @@ for dtype in range_prec:
         list_results[3][dtype]+=nb_test[3]
         got_error=nb_test[1]+nb_test[2]+nb_test[3]
 
-        if (not short_summary):
-            if (nb_test[0]>0 and just_errors==0):
+        if not short_summary:
+            if nb_test[0] > 0 and not just_errors:
                 print("passed: "+str(nb_test[0]))
-            if (nb_test[1]>0):
+            if nb_test[1] > 0:
                 print("failing to pass the threshold: "+str(nb_test[1]))
-            if (nb_test[2]>0):
+            if nb_test[2] > 0:
                 print("Illegal Error: "+str(nb_test[2]))
-            if (nb_test[3]>0):
+            if nb_test[3] > 0:
                 print("Info Error: "+str(nb_test[3]))
-            if (got_error>0 and just_errors==1):
+            if got_error > 0 and just_errors:
                 print("ERROR IS LOCATED IN "+name+" "+dtests[1][dtest]+" [ "+cmdbase+" ]")
                 print("")
-            if (just_errors==0):
+            if not just_errors:
                 print("")
 #     elif (got_error>0):
 #        print dtests[2][dtest]+".out \t"+str(nb_test[1])+"\t"+str(nb_test[2])+"\t"+str(nb_test[3])
@@ -307,7 +308,7 @@ for dtype in range_prec:
     list_results[2][4]+=list_results[2][dtype]
     list_results[3][4]+=list_results[3][dtype]
 
-if only_numbers==1:
+if only_numbers:
     print(str(list_results[1][4])+"\n"+str(list_results[2][4]+list_results[3][4]))
 else:
     print(summary)
diff --git a/lapack-netlib/sgbsvx.f b/lapack-netlib/sgbsvx.f
new file mode 100644
index 000000000..df3a721d9
--- /dev/null
+++ b/lapack-netlib/sgbsvx.f
@@ -0,0 +1,641 @@
+*> \brief <b> SGBSVX computes the solution to a system of linear equations A * X = B for GB matrices</b>
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SGBSVX + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/sgbsvx.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/sgbsvx.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/sgbsvx.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB,
+*                          LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX,
+*                          RCOND, FERR, BERR, WORK, IWORK, INFO )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          EQUED, FACT, TRANS
+*       INTEGER            INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS
+*       REAL               RCOND
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IPIV( * ), IWORK( * )
+*       REAL               AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ),
+*      $                   BERR( * ), C( * ), FERR( * ), R( * ),
+*      $                   WORK( * ), X( LDX, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SGBSVX uses the LU factorization to compute the solution to a real
+*> system of linear equations A * X = B, A**T * X = B, or A**H * X = B,
+*> where A is a band matrix of order N with KL subdiagonals and KU
+*> superdiagonals, and X and B are N-by-NRHS matrices.
+*>
+*> Error bounds on the solution and a condition estimate are also
+*> provided.
+*> \endverbatim
+*
+*> \par Description:
+*  =================
+*>
+*> \verbatim
+*>
+*> The following steps are performed by this subroutine:
+*>
+*> 1. If FACT = 'E', real scaling factors are computed to equilibrate
+*>    the system:
+*>       TRANS = 'N':  diag(R)*A*diag(C)     *inv(diag(C))*X = diag(R)*B
+*>       TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
+*>       TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
+*>    Whether or not the system will be equilibrated depends on the
+*>    scaling of the matrix A, but if equilibration is used, A is
+*>    overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N')
+*>    or diag(C)*B (if TRANS = 'T' or 'C').
+*>
+*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the
+*>    matrix A (after equilibration if FACT = 'E') as
+*>       A = L * U,
+*>    where L is a product of permutation and unit lower triangular
+*>    matrices with KL subdiagonals, and U is upper triangular with
+*>    KL+KU superdiagonals.
+*>
+*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine
+*>    returns with INFO = i. Otherwise, the factored form of A is used
+*>    to estimate the condition number of the matrix A.  If the
+*>    reciprocal of the condition number is less than machine precision,
+*>    INFO = N+1 is returned as a warning, but the routine still goes on
+*>    to solve for X and compute error bounds as described below.
+*>
+*> 4. The system of equations is solved for X using the factored form
+*>    of A.
+*>
+*> 5. Iterative refinement is applied to improve the computed solution
+*>    matrix and calculate error bounds and backward error estimates
+*>    for it.
+*>
+*> 6. If equilibration was used, the matrix X is premultiplied by
+*>    diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so
+*>    that it solves the original system before equilibration.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] FACT
+*> \verbatim
+*>          FACT is CHARACTER*1
+*>          Specifies whether or not the factored form of the matrix A is
+*>          supplied on entry, and if not, whether the matrix A should be
+*>          equilibrated before it is factored.
+*>          = 'F':  On entry, AFB and IPIV contain the factored form of
+*>                  A.  If EQUED is not 'N', the matrix A has been
+*>                  equilibrated with scaling factors given by R and C.
+*>                  AB, AFB, and IPIV are not modified.
+*>          = 'N':  The matrix A will be copied to AFB and factored.
+*>          = 'E':  The matrix A will be equilibrated if necessary, then
+*>                  copied to AFB and factored.
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*>          TRANS is CHARACTER*1
+*>          Specifies the form of the system of equations.
+*>          = 'N':  A * X = B     (No transpose)
+*>          = 'T':  A**T * X = B  (Transpose)
+*>          = 'C':  A**H * X = B  (Conjugate transpose = Transpose)
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of linear equations, i.e., the order of the
+*>          matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KL
+*> \verbatim
+*>          KL is INTEGER
+*>          The number of subdiagonals within the band of A.  KL >= 0.
+*> \endverbatim
+*>
+*> \param[in] KU
+*> \verbatim
+*>          KU is INTEGER
+*>          The number of superdiagonals within the band of A.  KU >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of columns
+*>          of the matrices B and X.  NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*>          AB is REAL array, dimension (LDAB,N)
+*>          On entry, the matrix A in band storage, in rows 1 to KL+KU+1.
+*>          The j-th column of A is stored in the j-th column of the
+*>          array AB as follows:
+*>          AB(KU+1+i-j,j) = A(i,j) for max(1,j-KU)<=i<=min(N,j+KL)
+*>
+*>          If FACT = 'F' and EQUED is not 'N', then A must have been
+*>          equilibrated by the scaling factors in R and/or C.  AB is not
+*>          modified if FACT = 'F' or 'N', or if FACT = 'E' and
+*>          EQUED = 'N' on exit.
+*>
+*>          On exit, if EQUED .ne. 'N', A is scaled as follows:
+*>          EQUED = 'R':  A := diag(R) * A
+*>          EQUED = 'C':  A := A * diag(C)
+*>          EQUED = 'B':  A := diag(R) * A * diag(C).
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*>          LDAB is INTEGER
+*>          The leading dimension of the array AB.  LDAB >= KL+KU+1.
+*> \endverbatim
+*>
+*> \param[in,out] AFB
+*> \verbatim
+*>          AFB is REAL array, dimension (LDAFB,N)
+*>          If FACT = 'F', then AFB is an input argument and on entry
+*>          contains details of the LU factorization of the band matrix
+*>          A, as computed by SGBTRF.  U is stored as an upper triangular
+*>          band matrix with KL+KU superdiagonals in rows 1 to KL+KU+1,
+*>          and the multipliers used during the factorization are stored
+*>          in rows KL+KU+2 to 2*KL+KU+1.  If EQUED .ne. 'N', then AFB is
+*>          the factored form of the equilibrated matrix A.
+*>
+*>          If FACT = 'N', then AFB is an output argument and on exit
+*>          returns details of the LU factorization of A.
+*>
+*>          If FACT = 'E', then AFB is an output argument and on exit
+*>          returns details of the LU factorization of the equilibrated
+*>          matrix A (see the description of AB for the form of the
+*>          equilibrated matrix).
+*> \endverbatim
+*>
+*> \param[in] LDAFB
+*> \verbatim
+*>          LDAFB is INTEGER
+*>          The leading dimension of the array AFB.  LDAFB >= 2*KL+KU+1.
+*> \endverbatim
+*>
+*> \param[in,out] IPIV
+*> \verbatim
+*>          IPIV is INTEGER array, dimension (N)
+*>          If FACT = 'F', then IPIV is an input argument and on entry
+*>          contains the pivot indices from the factorization A = L*U
+*>          as computed by SGBTRF; row i of the matrix was interchanged
+*>          with row IPIV(i).
+*>
+*>          If FACT = 'N', then IPIV is an output argument and on exit
+*>          contains the pivot indices from the factorization A = L*U
+*>          of the original matrix A.
+*>
+*>          If FACT = 'E', then IPIV is an output argument and on exit
+*>          contains the pivot indices from the factorization A = L*U
+*>          of the equilibrated matrix A.
+*> \endverbatim
+*>
+*> \param[in,out] EQUED
+*> \verbatim
+*>          EQUED is CHARACTER*1
+*>          Specifies the form of equilibration that was done.
+*>          = 'N':  No equilibration (always true if FACT = 'N').
+*>          = 'R':  Row equilibration, i.e., A has been premultiplied by
+*>                  diag(R).
+*>          = 'C':  Column equilibration, i.e., A has been postmultiplied
+*>                  by diag(C).
+*>          = 'B':  Both row and column equilibration, i.e., A has been
+*>                  replaced by diag(R) * A * diag(C).
+*>          EQUED is an input argument if FACT = 'F'; otherwise, it is an
+*>          output argument.
+*> \endverbatim
+*>
+*> \param[in,out] R
+*> \verbatim
+*>          R is REAL array, dimension (N)
+*>          The row scale factors for A.  If EQUED = 'R' or 'B', A is
+*>          multiplied on the left by diag(R); if EQUED = 'N' or 'C', R
+*>          is not accessed.  R is an input argument if FACT = 'F';
+*>          otherwise, R is an output argument.  If FACT = 'F' and
+*>          EQUED = 'R' or 'B', each element of R must be positive.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is REAL array, dimension (N)
+*>          The column scale factors for A.  If EQUED = 'C' or 'B', A is
+*>          multiplied on the right by diag(C); if EQUED = 'N' or 'R', C
+*>          is not accessed.  C is an input argument if FACT = 'F';
+*>          otherwise, C is an output argument.  If FACT = 'F' and
+*>          EQUED = 'C' or 'B', each element of C must be positive.
+*> \endverbatim
+*>
+*> \param[in,out] B
+*> \verbatim
+*>          B is REAL array, dimension (LDB,NRHS)
+*>          On entry, the right hand side matrix B.
+*>          On exit,
+*>          if EQUED = 'N', B is not modified;
+*>          if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by
+*>          diag(R)*B;
+*>          if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is
+*>          overwritten by diag(C)*B.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*>          LDB is INTEGER
+*>          The leading dimension of the array B.  LDB >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] X
+*> \verbatim
+*>          X is REAL array, dimension (LDX,NRHS)
+*>          If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X
+*>          to the original system of equations.  Note that A and B are
+*>          modified on exit if EQUED .ne. 'N', and the solution to the
+*>          equilibrated system is inv(diag(C))*X if TRANS = 'N' and
+*>          EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C'
+*>          and EQUED = 'R' or 'B'.
+*> \endverbatim
+*>
+*> \param[in] LDX
+*> \verbatim
+*>          LDX is INTEGER
+*>          The leading dimension of the array X.  LDX >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] RCOND
+*> \verbatim
+*>          RCOND is REAL
+*>          The estimate of the reciprocal condition number of the matrix
+*>          A after equilibration (if done).  If RCOND is less than the
+*>          machine precision (in particular, if RCOND = 0), the matrix
+*>          is singular to working precision.  This condition is
+*>          indicated by a return code of INFO > 0.
+*> \endverbatim
+*>
+*> \param[out] FERR
+*> \verbatim
+*>          FERR is REAL array, dimension (NRHS)
+*>          The estimated forward error bound for each solution vector
+*>          X(j) (the j-th column of the solution matrix X).
+*>          If XTRUE is the true solution corresponding to X(j), FERR(j)
+*>          is an estimated upper bound for the magnitude of the largest
+*>          element in (X(j) - XTRUE) divided by the magnitude of the
+*>          largest element in X(j).  The estimate is as reliable as
+*>          the estimate for RCOND, and is almost always a slight
+*>          overestimate of the true error.
+*> \endverbatim
+*>
+*> \param[out] BERR
+*> \verbatim
+*>          BERR is REAL array, dimension (NRHS)
+*>          The componentwise relative backward error of each solution
+*>          vector X(j) (i.e., the smallest relative change in
+*>          any element of A or B that makes X(j) an exact solution).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is REAL array, dimension (MAX(1,3*N))
+*>          On exit, WORK(1) contains the reciprocal pivot growth
+*>          factor norm(A)/norm(U). The "max absolute element" norm is
+*>          used. If WORK(1) is much less than 1, then the stability
+*>          of the LU factorization of the (equilibrated) matrix A
+*>          could be poor. This also means that the solution X, condition
+*>          estimator RCOND, and forward error bound FERR could be
+*>          unreliable. If factorization fails with 0<INFO<=N, then
+*>          WORK(1) contains the reciprocal pivot growth factor for the
+*>          leading INFO columns of A.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (N)
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i, and i is
+*>                <= N:  U(i,i) is exactly zero.  The factorization
+*>                       has been completed, but the factor U is exactly
+*>                       singular, so the solution and error bounds
+*>                       could not be computed. RCOND = 0 is returned.
+*>                = N+1: U is nonsingular, but RCOND is less than machine
+*>                       precision, meaning that the matrix is singular
+*>                       to working precision.  Nevertheless, the solution and
+*>                       error bounds are computed because there are a number
+*>                       of situations where the computed solution can be more
+*>                       accurate than the value of RCOND would suggest.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup realGBsolve
+*
+*  =====================================================================
+      SUBROUTINE SGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB,
+     $                   LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX,
+     $                   RCOND, FERR, BERR, WORK, IWORK, INFO )
+*  Solve a band system: check arguments, equilibrate, factor, solve, refine.
+*  -- LAPACK driver routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
+*
+*     .. Scalar Arguments ..
+      CHARACTER          EQUED, FACT, TRANS
+      INTEGER            INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS
+      REAL               RCOND
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IPIV( * ), IWORK( * )
+      REAL               AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ),
+     $                   BERR( * ), C( * ), FERR( * ), R( * ),
+     $                   WORK( * ), X( LDX, * )
+*     ..
+*
+*  =====================================================================
+*  Moved setting of INFO = N+1 so INFO does not subsequently get
+*  overwritten.  Sven, 17 Mar 05.
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU
+      CHARACTER          NORM
+      INTEGER            I, INFEQU, J, J1, J2
+      REAL               AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN,
+     $                   ROWCND, RPVGRW, SMLNUM
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      REAL               SLAMCH, SLANGB, SLANTB
+      EXTERNAL           LSAME, SLAMCH, SLANGB, SLANTB
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SCOPY, SGBCON, SGBEQU, SGBRFS, SGBTRF, SGBTRS,
+     $                   SLACPY, SLAQGB, XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*     Decode FACT and TRANS.  EQUED is read as an input only if FACT = 'F'.
+      INFO = 0
+      NOFACT = LSAME( FACT, 'N' )
+      EQUIL = LSAME( FACT, 'E' )
+      NOTRAN = LSAME( TRANS, 'N' )
+      IF( NOFACT .OR. EQUIL ) THEN
+         EQUED = 'N'
+         ROWEQU = .FALSE.
+         COLEQU = .FALSE.
+      ELSE
+         ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+         COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         SMLNUM = SLAMCH( 'Safe minimum' )
+         BIGNUM = ONE / SMLNUM
+      END IF
+*
+*     Test the input parameters.  INFO = -i flags the i-th argument, and
+*     only the first failure found is reported.
+      IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) )
+     $     THEN
+         INFO = -1
+      ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT.
+     $         LSAME( TRANS, 'C' ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( KL.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( KU.LT.0 ) THEN
+         INFO = -5
+      ELSE IF( NRHS.LT.0 ) THEN
+         INFO = -6
+      ELSE IF( LDAB.LT.KL+KU+1 ) THEN
+         INFO = -8
+      ELSE IF( LDAFB.LT.2*KL+KU+1 ) THEN
+         INFO = -10
+      ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT.
+     $         ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN
+         INFO = -12
+      ELSE
+         IF( ROWEQU ) THEN
+            RCMIN = BIGNUM
+            RCMAX = ZERO
+            DO 10 J = 1, N
+               RCMIN = MIN( RCMIN, R( J ) )
+               RCMAX = MAX( RCMAX, R( J ) )
+   10       CONTINUE
+            IF( RCMIN.LE.ZERO ) THEN
+               INFO = -13
+            ELSE IF( N.GT.0 ) THEN
+               ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM )
+            ELSE
+               ROWCND = ONE
+            END IF
+         END IF
+         IF( COLEQU .AND. INFO.EQ.0 ) THEN
+            RCMIN = BIGNUM
+            RCMAX = ZERO
+            DO 20 J = 1, N
+               RCMIN = MIN( RCMIN, C( J ) )
+               RCMAX = MAX( RCMAX, C( J ) )
+   20       CONTINUE
+            IF( RCMIN.LE.ZERO ) THEN
+               INFO = -14
+            ELSE IF( N.GT.0 ) THEN
+               COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM )
+            ELSE
+               COLCND = ONE
+            END IF
+         END IF
+         IF( INFO.EQ.0 ) THEN
+            IF( LDB.LT.MAX( 1, N ) ) THEN
+               INFO = -16
+            ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
+               INFO = -18
+            END IF
+         END IF
+      END IF
+*     Report an illegal argument through XERBLA and return.
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'SGBSVX', -INFO )
+         RETURN
+      END IF
+*
+      IF( EQUIL ) THEN
+*
+*        Compute row and column scalings to equilibrate the matrix A.
+*
+         CALL SGBEQU( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND,
+     $                AMAX, INFEQU )
+         IF( INFEQU.EQ.0 ) THEN
+*
+*           Equilibrate the matrix, then re-derive ROWEQU and COLEQU
+*           from the EQUED value passed back by SLAQGB.
+            CALL SLAQGB( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND,
+     $                   AMAX, EQUED )
+            ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+            COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         END IF
+      END IF
+*
+*     Scale the right hand side: by diag(R) when TRANS = 'N' and rows
+*     were scaled, otherwise by diag(C) when columns were scaled.
+      IF( NOTRAN ) THEN
+         IF( ROWEQU ) THEN
+            DO 40 J = 1, NRHS
+               DO 30 I = 1, N
+                  B( I, J ) = R( I )*B( I, J )
+   30          CONTINUE
+   40       CONTINUE
+         END IF
+      ELSE IF( COLEQU ) THEN
+         DO 60 J = 1, NRHS
+            DO 50 I = 1, N
+               B( I, J ) = C( I )*B( I, J )
+   50       CONTINUE
+   60    CONTINUE
+      END IF
+*
+      IF( NOFACT .OR. EQUIL ) THEN
+*
+*        Copy the band of A from AB into AFB, shifted down by KL rows,
+*        and compute the LU factorization of the band matrix A.
+         DO 70 J = 1, N
+            J1 = MAX( J-KU, 1 )
+            J2 = MIN( J+KL, N )
+            CALL SCOPY( J2-J1+1, AB( KU+1-J+J1, J ), 1,
+     $                  AFB( KL+KU+1-J+J1, J ), 1 )
+   70    CONTINUE
+*
+         CALL SGBTRF( N, N, KL, KU, AFB, LDAFB, IPIV, INFO )
+*
+*        Return early if SGBTRF reported INFO > 0 (no solve is attempted).
+*
+         IF( INFO.GT.0 ) THEN
+*
+*           Compute the reciprocal pivot growth factor of the
+*           leading rank-deficient INFO columns of A.
+*           ANORM is the largest |AB(I,J)| visited in those columns.
+            ANORM = ZERO
+            DO 90 J = 1, INFO
+               DO 80 I = MAX( KU+2-J, 1 ), MIN( N+KU+1-J, KL+KU+1 )
+                  ANORM = MAX( ANORM, ABS( AB( I, J ) ) )
+   80          CONTINUE
+   90       CONTINUE
+            RPVGRW = SLANTB( 'M', 'U', 'N', INFO, MIN( INFO-1, KL+KU ),
+     $                       AFB( MAX( 1, KL+KU+2-INFO ), 1 ), LDAFB,
+     $                       WORK )
+            IF( RPVGRW.EQ.ZERO ) THEN
+               RPVGRW = ONE
+            ELSE
+               RPVGRW = ANORM / RPVGRW
+            END IF
+            WORK( 1 ) = RPVGRW
+            RCOND = ZERO
+            RETURN
+         END IF
+      END IF
+*
+*     Compute the norm of the matrix A and the
+*     reciprocal pivot growth factor RPVGRW.
+*     The norm type ('1' or 'I') follows TRANS and is reused by SGBCON.
+      IF( NOTRAN ) THEN
+         NORM = '1'
+      ELSE
+         NORM = 'I'
+      END IF
+      ANORM = SLANGB( NORM, N, KL, KU, AB, LDAB, WORK )
+      RPVGRW = SLANTB( 'M', 'U', 'N', N, KL+KU, AFB, LDAFB, WORK )
+      IF( RPVGRW.EQ.ZERO ) THEN
+         RPVGRW = ONE
+      ELSE
+         RPVGRW = SLANGB( 'M', N, KL, KU, AB, LDAB, WORK ) / RPVGRW
+      END IF
+*
+*     Compute the reciprocal of the condition number of A.
+*
+      CALL SGBCON( NORM, N, KL, KU, AFB, LDAFB, IPIV, ANORM, RCOND,
+     $             WORK, IWORK, INFO )
+*
+*     Compute the solution matrix X.
+*     X starts as a copy of B; B itself is kept for SGBRFS below.
+      CALL SLACPY( 'Full', N, NRHS, B, LDB, X, LDX )
+      CALL SGBTRS( TRANS, N, KL, KU, NRHS, AFB, LDAFB, IPIV, X, LDX,
+     $             INFO )
+*
+*     Use iterative refinement to improve the computed solution and
+*     compute error bounds and backward error estimates for it.
+*
+      CALL SGBRFS( TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, LDAFB, IPIV,
+     $             B, LDB, X, LDX, FERR, BERR, WORK, IWORK, INFO )
+*
+*     Transform the solution matrix X to a solution of the original
+*     system.
+*     FERR is divided by the matching ratio (COLCND or ROWCND).
+      IF( NOTRAN ) THEN
+         IF( COLEQU ) THEN
+            DO 110 J = 1, NRHS
+               DO 100 I = 1, N
+                  X( I, J ) = C( I )*X( I, J )
+  100          CONTINUE
+  110       CONTINUE
+            DO 120 J = 1, NRHS
+               FERR( J ) = FERR( J ) / COLCND
+  120       CONTINUE
+         END IF
+      ELSE IF( ROWEQU ) THEN
+         DO 140 J = 1, NRHS
+            DO 130 I = 1, N
+               X( I, J ) = R( I )*X( I, J )
+  130       CONTINUE
+  140    CONTINUE
+         DO 150 J = 1, NRHS
+            FERR( J ) = FERR( J ) / ROWCND
+  150    CONTINUE
+      END IF
+*
+*     Set INFO = N+1 if the matrix is singular to working precision.
+*     Done last so that the calls above cannot overwrite it.
+      IF( RCOND.LT.SLAMCH( 'Epsilon' ) )
+     $   INFO = N + 1
+*     WORK was passed as scratch to the calls above; store RPVGRW only now.
+      WORK( 1 ) = RPVGRW
+      RETURN
+*
+*     End of SGBSVX
+*
+      END
diff --git a/lapack-netlib/sgesvx.f b/lapack-netlib/sgesvx.f
new file mode 100644
index 000000000..385e626cf
--- /dev/null
+++ b/lapack-netlib/sgesvx.f
@@ -0,0 +1,599 @@
+*> \brief <b> SGESVX computes the solution to a system of linear equations A * X = B for GE matrices</b>
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SGESVX + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/sgesvx.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/sgesvx.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/sgesvx.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV,
+*                          EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR,
+*                          WORK, IWORK, INFO )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          EQUED, FACT, TRANS
+*       INTEGER            INFO, LDA, LDAF, LDB, LDX, N, NRHS
+*       REAL               RCOND
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IPIV( * ), IWORK( * )
+*       REAL               A( LDA, * ), AF( LDAF, * ), B( LDB, * ),
+*      $                   BERR( * ), C( * ), FERR( * ), R( * ),
+*      $                   WORK( * ), X( LDX, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SGESVX uses the LU factorization to compute the solution to a real
+*> system of linear equations
+*>    A * X = B,
+*> where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
+*>
+*> Error bounds on the solution and a condition estimate are also
+*> provided.
+*> \endverbatim
+*
+*> \par Description:
+*  =================
+*>
+*> \verbatim
+*>
+*> The following steps are performed:
+*>
+*> 1. If FACT = 'E', real scaling factors are computed to equilibrate
+*>    the system:
+*>       TRANS = 'N':  diag(R)*A*diag(C)     *inv(diag(C))*X = diag(R)*B
+*>       TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
+*>       TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
+*>    Whether or not the system will be equilibrated depends on the
+*>    scaling of the matrix A, but if equilibration is used, A is
+*>    overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N')
+*>    or diag(C)*B (if TRANS = 'T' or 'C').
+*>
+*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the
+*>    matrix A (after equilibration if FACT = 'E') as
+*>       A = P * L * U,
+*>    where P is a permutation matrix, L is a unit lower triangular
+*>    matrix, and U is upper triangular.
+*>
+*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine
+*>    returns with INFO = i. Otherwise, the factored form of A is used
+*>    to estimate the condition number of the matrix A.  If the
+*>    reciprocal of the condition number is less than machine precision,
+*>    INFO = N+1 is returned as a warning, but the routine still goes on
+*>    to solve for X and compute error bounds as described below.
+*>
+*> 4. The system of equations is solved for X using the factored form
+*>    of A.
+*>
+*> 5. Iterative refinement is applied to improve the computed solution
+*>    matrix and calculate error bounds and backward error estimates
+*>    for it.
+*>
+*> 6. If equilibration was used, the matrix X is premultiplied by
+*>    diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so
+*>    that it solves the original system before equilibration.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] FACT
+*> \verbatim
+*>          FACT is CHARACTER*1
+*>          Specifies whether or not the factored form of the matrix A is
+*>          supplied on entry, and if not, whether the matrix A should be
+*>          equilibrated before it is factored.
+*>          = 'F':  On entry, AF and IPIV contain the factored form of A.
+*>                  If EQUED is not 'N', the matrix A has been
+*>                  equilibrated with scaling factors given by R and C.
+*>                  A, AF, and IPIV are not modified.
+*>          = 'N':  The matrix A will be copied to AF and factored.
+*>          = 'E':  The matrix A will be equilibrated if necessary, then
+*>                  copied to AF and factored.
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*>          TRANS is CHARACTER*1
+*>          Specifies the form of the system of equations:
+*>          = 'N':  A * X = B     (No transpose)
+*>          = 'T':  A**T * X = B  (Transpose)
+*>          = 'C':  A**H * X = B  (Conjugate transpose = Transpose)
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of linear equations, i.e., the order of the
+*>          matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of columns
+*>          of the matrices B and X.  NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is REAL array, dimension (LDA,N)
+*>          On entry, the N-by-N matrix A.  If FACT = 'F' and EQUED is
+*>          not 'N', then A must have been equilibrated by the scaling
+*>          factors in R and/or C.  A is not modified if FACT = 'F' or
+*>          'N', or if FACT = 'E' and EQUED = 'N' on exit.
+*>
+*>          On exit, if EQUED .ne. 'N', A is scaled as follows:
+*>          EQUED = 'R':  A := diag(R) * A
+*>          EQUED = 'C':  A := A * diag(C)
+*>          EQUED = 'B':  A := diag(R) * A * diag(C).
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in,out] AF
+*> \verbatim
+*>          AF is REAL array, dimension (LDAF,N)
+*>          If FACT = 'F', then AF is an input argument and on entry
+*>          contains the factors L and U from the factorization
+*>          A = P*L*U as computed by SGETRF.  If EQUED .ne. 'N', then
+*>          AF is the factored form of the equilibrated matrix A.
+*>
+*>          If FACT = 'N', then AF is an output argument and on exit
+*>          returns the factors L and U from the factorization A = P*L*U
+*>          of the original matrix A.
+*>
+*>          If FACT = 'E', then AF is an output argument and on exit
+*>          returns the factors L and U from the factorization A = P*L*U
+*>          of the equilibrated matrix A (see the description of A for
+*>          the form of the equilibrated matrix).
+*> \endverbatim
+*>
+*> \param[in] LDAF
+*> \verbatim
+*>          LDAF is INTEGER
+*>          The leading dimension of the array AF.  LDAF >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in,out] IPIV
+*> \verbatim
+*>          IPIV is INTEGER array, dimension (N)
+*>          If FACT = 'F', then IPIV is an input argument and on entry
+*>          contains the pivot indices from the factorization A = P*L*U
+*>          as computed by SGETRF; row i of the matrix was interchanged
+*>          with row IPIV(i).
+*>
+*>          If FACT = 'N', then IPIV is an output argument and on exit
+*>          contains the pivot indices from the factorization A = P*L*U
+*>          of the original matrix A.
+*>
+*>          If FACT = 'E', then IPIV is an output argument and on exit
+*>          contains the pivot indices from the factorization A = P*L*U
+*>          of the equilibrated matrix A.
+*> \endverbatim
+*>
+*> \param[in,out] EQUED
+*> \verbatim
+*>          EQUED is CHARACTER*1
+*>          Specifies the form of equilibration that was done.
+*>          = 'N':  No equilibration (always true if FACT = 'N').
+*>          = 'R':  Row equilibration, i.e., A has been premultiplied by
+*>                  diag(R).
+*>          = 'C':  Column equilibration, i.e., A has been postmultiplied
+*>                  by diag(C).
+*>          = 'B':  Both row and column equilibration, i.e., A has been
+*>                  replaced by diag(R) * A * diag(C).
+*>          EQUED is an input argument if FACT = 'F'; otherwise, it is an
+*>          output argument.
+*> \endverbatim
+*>
+*> \param[in,out] R
+*> \verbatim
+*>          R is REAL array, dimension (N)
+*>          The row scale factors for A.  If EQUED = 'R' or 'B', A is
+*>          multiplied on the left by diag(R); if EQUED = 'N' or 'C', R
+*>          is not accessed.  R is an input argument if FACT = 'F';
+*>          otherwise, R is an output argument.  If FACT = 'F' and
+*>          EQUED = 'R' or 'B', each element of R must be positive.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is REAL array, dimension (N)
+*>          The column scale factors for A.  If EQUED = 'C' or 'B', A is
+*>          multiplied on the right by diag(C); if EQUED = 'N' or 'R', C
+*>          is not accessed.  C is an input argument if FACT = 'F';
+*>          otherwise, C is an output argument.  If FACT = 'F' and
+*>          EQUED = 'C' or 'B', each element of C must be positive.
+*> \endverbatim
+*>
+*> \param[in,out] B
+*> \verbatim
+*>          B is REAL array, dimension (LDB,NRHS)
+*>          On entry, the N-by-NRHS right hand side matrix B.
+*>          On exit,
+*>          if EQUED = 'N', B is not modified;
+*>          if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by
+*>          diag(R)*B;
+*>          if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is
+*>          overwritten by diag(C)*B.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*>          LDB is INTEGER
+*>          The leading dimension of the array B.  LDB >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] X
+*> \verbatim
+*>          X is REAL array, dimension (LDX,NRHS)
+*>          If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X
+*>          to the original system of equations.  Note that A and B are
+*>          modified on exit if EQUED .ne. 'N', and the solution to the
+*>          equilibrated system is inv(diag(C))*X if TRANS = 'N' and
+*>          EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C'
+*>          and EQUED = 'R' or 'B'.
+*> \endverbatim
+*>
+*> \param[in] LDX
+*> \verbatim
+*>          LDX is INTEGER
+*>          The leading dimension of the array X.  LDX >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] RCOND
+*> \verbatim
+*>          RCOND is REAL
+*>          The estimate of the reciprocal condition number of the matrix
+*>          A after equilibration (if done).  If RCOND is less than the
+*>          machine precision (in particular, if RCOND = 0), the matrix
+*>          is singular to working precision.  This condition is
+*>          indicated by a return code of INFO > 0.
+*> \endverbatim
+*>
+*> \param[out] FERR
+*> \verbatim
+*>          FERR is REAL array, dimension (NRHS)
+*>          The estimated forward error bound for each solution vector
+*>          X(j) (the j-th column of the solution matrix X).
+*>          If XTRUE is the true solution corresponding to X(j), FERR(j)
+*>          is an estimated upper bound for the magnitude of the largest
+*>          element in (X(j) - XTRUE) divided by the magnitude of the
+*>          largest element in X(j).  The estimate is as reliable as
+*>          the estimate for RCOND, and is almost always a slight
+*>          overestimate of the true error.
+*> \endverbatim
+*>
+*> \param[out] BERR
+*> \verbatim
+*>          BERR is REAL array, dimension (NRHS)
+*>          The componentwise relative backward error of each solution
+*>          vector X(j) (i.e., the smallest relative change in
+*>          any element of A or B that makes X(j) an exact solution).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is REAL array, dimension (MAX(1,4*N))
+*>          On exit, WORK(1) contains the reciprocal pivot growth
+*>          factor norm(A)/norm(U). The "max absolute element" norm is
+*>          used. If WORK(1) is much less than 1, then the stability
+*>          of the LU factorization of the (equilibrated) matrix A
+*>          could be poor. This also means that the solution X, condition
+*>          estimator RCOND, and forward error bound FERR could be
+*>          unreliable. If factorization fails with 0<INFO<=N, then
+*>          WORK(1) contains the reciprocal pivot growth factor for the
+*>          leading INFO columns of A.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (N)
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i, and i is
+*>                <= N:  U(i,i) is exactly zero.  The factorization has
+*>                       been completed, but the factor U is exactly
+*>                       singular, so the solution and error bounds
+*>                       could not be computed. RCOND = 0 is returned.
+*>                = N+1: U is nonsingular, but RCOND is less than machine
+*>                       precision, meaning that the matrix is singular
+*>                       to working precision.  Nevertheless, the
+*>                       solution and error bounds are computed because
+*>                       there are a number of situations where the
+*>                       computed solution can be more accurate than the
+*>                       value of RCOND would suggest.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup realGEsolve
+*
+*  =====================================================================
+      SUBROUTINE SGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV,
+     $                   EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR,
+     $                   WORK, IWORK, INFO )
+*
+*  -- LAPACK driver routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      CHARACTER          EQUED, FACT, TRANS
+      INTEGER            INFO, LDA, LDAF, LDB, LDX, N, NRHS
+      REAL               RCOND
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IPIV( * ), IWORK( * )
+      REAL               A( LDA, * ), AF( LDAF, * ), B( LDB, * ),
+     $                   BERR( * ), C( * ), FERR( * ), R( * ),
+     $                   WORK( * ), X( LDX, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO, ONE
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU
+      CHARACTER          NORM
+      INTEGER            I, INFEQU, J
+      REAL               AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN,
+     $                   ROWCND, RPVGRW, SMLNUM
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      REAL               SLAMCH, SLANGE, SLANTR
+      EXTERNAL           LSAME, SLAMCH, SLANGE, SLANTR
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SGECON, SGEEQU, SGERFS, SGETRF, SGETRS, SLACPY,
+     $                   SLAQGE, XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+      INFO = 0
+      NOFACT = LSAME( FACT, 'N' )
+      EQUIL = LSAME( FACT, 'E' )
+      NOTRAN = LSAME( TRANS, 'N' )
+      IF( NOFACT .OR. EQUIL ) THEN
+         EQUED = 'N'
+         ROWEQU = .FALSE.
+         COLEQU = .FALSE.
+      ELSE
+         ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+         COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         SMLNUM = SLAMCH( 'Safe minimum' )
+         BIGNUM = ONE / SMLNUM
+      END IF
+*
+*     Test the input parameters.
+*
+      IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) )
+     $     THEN
+         INFO = -1
+      ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT.
+     $         LSAME( TRANS, 'C' ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( NRHS.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -6
+      ELSE IF( LDAF.LT.MAX( 1, N ) ) THEN
+         INFO = -8
+      ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT.
+     $         ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN
+         INFO = -10
+      ELSE
+         IF( ROWEQU ) THEN
+            RCMIN = BIGNUM
+            RCMAX = ZERO
+            DO 10 J = 1, N
+               RCMIN = MIN( RCMIN, R( J ) )
+               RCMAX = MAX( RCMAX, R( J ) )
+   10       CONTINUE
+            IF( RCMIN.LE.ZERO ) THEN
+               INFO = -11
+            ELSE IF( N.GT.0 ) THEN
+               ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM )
+            ELSE
+               ROWCND = ONE
+            END IF
+         END IF
+         IF( COLEQU .AND. INFO.EQ.0 ) THEN
+            RCMIN = BIGNUM
+            RCMAX = ZERO
+            DO 20 J = 1, N
+               RCMIN = MIN( RCMIN, C( J ) )
+               RCMAX = MAX( RCMAX, C( J ) )
+   20       CONTINUE
+            IF( RCMIN.LE.ZERO ) THEN
+               INFO = -12
+            ELSE IF( N.GT.0 ) THEN
+               COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM )
+            ELSE
+               COLCND = ONE
+            END IF
+         END IF
+         IF( INFO.EQ.0 ) THEN
+            IF( LDB.LT.MAX( 1, N ) ) THEN
+               INFO = -14
+            ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
+               INFO = -16
+            END IF
+         END IF
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'SGESVX', -INFO )
+         RETURN
+      END IF
+*
+      IF( EQUIL ) THEN
+*
+*        Compute row and column scalings to equilibrate the matrix A.
+*
+         CALL SGEEQU( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX, INFEQU )
+         IF( INFEQU.EQ.0 ) THEN
+*
+*           Equilibrate the matrix.
+*
+            CALL SLAQGE( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX,
+     $                   EQUED )
+            ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+            COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         END IF
+      END IF
+*
+*     Scale the right hand side.
+*
+      IF( NOTRAN ) THEN
+         IF( ROWEQU ) THEN
+            DO 40 J = 1, NRHS
+               DO 30 I = 1, N
+                  B( I, J ) = R( I )*B( I, J )
+   30          CONTINUE
+   40       CONTINUE
+         END IF
+      ELSE IF( COLEQU ) THEN
+         DO 60 J = 1, NRHS
+            DO 50 I = 1, N
+               B( I, J ) = C( I )*B( I, J )
+   50       CONTINUE
+   60    CONTINUE
+      END IF
+*
+      IF( NOFACT .OR. EQUIL ) THEN
+*
+*        Compute the LU factorization of A.
+*
+         CALL SLACPY( 'Full', N, N, A, LDA, AF, LDAF )
+         CALL SGETRF( N, N, AF, LDAF, IPIV, INFO )
+*
+*        Return if INFO is non-zero.
+*
+         IF( INFO.GT.0 ) THEN
+*
+*           Compute the reciprocal pivot growth factor of the
+*           leading rank-deficient INFO columns of A.
+*
+            RPVGRW = SLANTR( 'M', 'U', 'N', INFO, INFO, AF, LDAF,
+     $               WORK )
+            IF( RPVGRW.EQ.ZERO ) THEN
+               RPVGRW = ONE
+            ELSE
+               RPVGRW = SLANGE( 'M', N, INFO, A, LDA, WORK ) / RPVGRW
+            END IF
+            WORK( 1 ) = RPVGRW
+            RCOND = ZERO
+            RETURN
+         END IF
+      END IF
+*
+*     Compute the norm of the matrix A and the
+*     reciprocal pivot growth factor RPVGRW.
+*
+      IF( NOTRAN ) THEN
+         NORM = '1'
+      ELSE
+         NORM = 'I'
+      END IF
+      ANORM = SLANGE( NORM, N, N, A, LDA, WORK )
+      RPVGRW = SLANTR( 'M', 'U', 'N', N, N, AF, LDAF, WORK )
+      IF( RPVGRW.EQ.ZERO ) THEN
+         RPVGRW = ONE
+      ELSE
+         RPVGRW = SLANGE( 'M', N, N, A, LDA, WORK ) / RPVGRW
+      END IF
+*
+*     Compute the reciprocal of the condition number of A.
+*
+      CALL SGECON( NORM, N, AF, LDAF, ANORM, RCOND, WORK, IWORK, INFO )
+*
+*     Compute the solution matrix X.
+*
+      CALL SLACPY( 'Full', N, NRHS, B, LDB, X, LDX )
+      CALL SGETRS( TRANS, N, NRHS, AF, LDAF, IPIV, X, LDX, INFO )
+*
+*     Use iterative refinement to improve the computed solution and
+*     compute error bounds and backward error estimates for it.
+*
+      CALL SGERFS( TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, B, LDB, X,
+     $             LDX, FERR, BERR, WORK, IWORK, INFO )
+*
+*     Transform the solution matrix X to a solution of the original
+*     system.
+*
+      IF( NOTRAN ) THEN
+         IF( COLEQU ) THEN
+            DO 80 J = 1, NRHS
+               DO 70 I = 1, N
+                  X( I, J ) = C( I )*X( I, J )
+   70          CONTINUE
+   80       CONTINUE
+            DO 90 J = 1, NRHS
+               FERR( J ) = FERR( J ) / COLCND
+   90       CONTINUE
+         END IF
+      ELSE IF( ROWEQU ) THEN
+         DO 110 J = 1, NRHS
+            DO 100 I = 1, N
+               X( I, J ) = R( I )*X( I, J )
+  100       CONTINUE
+  110    CONTINUE
+         DO 120 J = 1, NRHS
+            FERR( J ) = FERR( J ) / ROWCND
+  120    CONTINUE
+      END IF
+*
+*     Set INFO = N+1 if the matrix is singular to working precision.
+*
+      IF( RCOND.LT.SLAMCH( 'Epsilon' ) )
+     $   INFO = N + 1
+*
+      WORK( 1 ) = RPVGRW
+      RETURN
+*
+*     End of SGESVX
+*
+      END
diff --git a/lapack-netlib/zgbsvx.f b/lapack-netlib/zgbsvx.f
new file mode 100644
index 000000000..871564a81
--- /dev/null
+++ b/lapack-netlib/zgbsvx.f
@@ -0,0 +1,644 @@
+*> \brief <b> ZGBSVX computes the solution to a system of linear equations A * X = B for GB matrices</b>
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZGBSVX + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zgbsvx.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zgbsvx.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zgbsvx.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB,
+*                          LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX,
+*                          RCOND, FERR, BERR, WORK, RWORK, INFO )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          EQUED, FACT, TRANS
+*       INTEGER            INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS
+*       DOUBLE PRECISION   RCOND
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IPIV( * )
+*       DOUBLE PRECISION   BERR( * ), C( * ), FERR( * ), R( * ),
+*      $                   RWORK( * )
+*       COMPLEX*16         AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ),
+*      $                   WORK( * ), X( LDX, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZGBSVX uses the LU factorization to compute the solution to a complex
+*> system of linear equations A * X = B, A**T * X = B, or A**H * X = B,
+*> where A is a band matrix of order N with KL subdiagonals and KU
+*> superdiagonals, and X and B are N-by-NRHS matrices.
+*>
+*> Error bounds on the solution and a condition estimate are also
+*> provided.
+*> \endverbatim
+*
+*> \par Description:
+*  =================
+*>
+*> \verbatim
+*>
+*> The following steps are performed by this subroutine:
+*>
+*> 1. If FACT = 'E', real scaling factors are computed to equilibrate
+*>    the system:
+*>       TRANS = 'N':  diag(R)*A*diag(C)     *inv(diag(C))*X = diag(R)*B
+*>       TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
+*>       TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
+*>    Whether or not the system will be equilibrated depends on the
+*>    scaling of the matrix A, but if equilibration is used, A is
+*>    overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N')
+*>    or diag(C)*B (if TRANS = 'T' or 'C').
+*>
+*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the
+*>    matrix A (after equilibration if FACT = 'E') as
+*>       A = L * U,
+*>    where L is a product of permutation and unit lower triangular
+*>    matrices with KL subdiagonals, and U is upper triangular with
+*>    KL+KU superdiagonals.
+*>
+*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine
+*>    returns with INFO = i. Otherwise, the factored form of A is used
+*>    to estimate the condition number of the matrix A.  If the
+*>    reciprocal of the condition number is less than machine precision,
+*>    INFO = N+1 is returned as a warning, but the routine still goes on
+*>    to solve for X and compute error bounds as described below.
+*>
+*> 4. The system of equations is solved for X using the factored form
+*>    of A.
+*>
+*> 5. Iterative refinement is applied to improve the computed solution
+*>    matrix and calculate error bounds and backward error estimates
+*>    for it.
+*>
+*> 6. If equilibration was used, the matrix X is premultiplied by
+*>    diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so
+*>    that it solves the original system before equilibration.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] FACT
+*> \verbatim
+*>          FACT is CHARACTER*1
+*>          Specifies whether or not the factored form of the matrix A is
+*>          supplied on entry, and if not, whether the matrix A should be
+*>          equilibrated before it is factored.
+*>          = 'F':  On entry, AFB and IPIV contain the factored form of
+*>                  A.  If EQUED is not 'N', the matrix A has been
+*>                  equilibrated with scaling factors given by R and C.
+*>                  AB, AFB, and IPIV are not modified.
+*>          = 'N':  The matrix A will be copied to AFB and factored.
+*>          = 'E':  The matrix A will be equilibrated if necessary, then
+*>                  copied to AFB and factored.
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*>          TRANS is CHARACTER*1
+*>          Specifies the form of the system of equations.
+*>          = 'N':  A * X = B     (No transpose)
+*>          = 'T':  A**T * X = B  (Transpose)
+*>          = 'C':  A**H * X = B  (Conjugate transpose)
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of linear equations, i.e., the order of the
+*>          matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] KL
+*> \verbatim
+*>          KL is INTEGER
+*>          The number of subdiagonals within the band of A.  KL >= 0.
+*> \endverbatim
+*>
+*> \param[in] KU
+*> \verbatim
+*>          KU is INTEGER
+*>          The number of superdiagonals within the band of A.  KU >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of columns
+*>          of the matrices B and X.  NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] AB
+*> \verbatim
+*>          AB is COMPLEX*16 array, dimension (LDAB,N)
+*>          On entry, the matrix A in band storage, in rows 1 to KL+KU+1.
+*>          The j-th column of A is stored in the j-th column of the
+*>          array AB as follows:
+*>          AB(KU+1+i-j,j) = A(i,j) for max(1,j-KU)<=i<=min(N,j+KL)
+*>
+*>          If FACT = 'F' and EQUED is not 'N', then A must have been
+*>          equilibrated by the scaling factors in R and/or C.  AB is not
+*>          modified if FACT = 'F' or 'N', or if FACT = 'E' and
+*>          EQUED = 'N' on exit.
+*>
+*>          On exit, if EQUED .ne. 'N', A is scaled as follows:
+*>          EQUED = 'R':  A := diag(R) * A
+*>          EQUED = 'C':  A := A * diag(C)
+*>          EQUED = 'B':  A := diag(R) * A * diag(C).
+*> \endverbatim
+*>
+*> \param[in] LDAB
+*> \verbatim
+*>          LDAB is INTEGER
+*>          The leading dimension of the array AB.  LDAB >= KL+KU+1.
+*> \endverbatim
+*>
+*> \param[in,out] AFB
+*> \verbatim
+*>          AFB is COMPLEX*16 array, dimension (LDAFB,N)
+*>          If FACT = 'F', then AFB is an input argument and on entry
+*>          contains details of the LU factorization of the band matrix
+*>          A, as computed by ZGBTRF.  U is stored as an upper triangular
+*>          band matrix with KL+KU superdiagonals in rows 1 to KL+KU+1,
+*>          and the multipliers used during the factorization are stored
+*>          in rows KL+KU+2 to 2*KL+KU+1.  If EQUED .ne. 'N', then AFB is
+*>          the factored form of the equilibrated matrix A.
+*>
+*>          If FACT = 'N', then AFB is an output argument and on exit
+*>          returns details of the LU factorization of A.
+*>
+*>          If FACT = 'E', then AFB is an output argument and on exit
+*>          returns details of the LU factorization of the equilibrated
+*>          matrix A (see the description of AB for the form of the
+*>          equilibrated matrix).
+*> \endverbatim
+*>
+*> \param[in] LDAFB
+*> \verbatim
+*>          LDAFB is INTEGER
+*>          The leading dimension of the array AFB.  LDAFB >= 2*KL+KU+1.
+*> \endverbatim
+*>
+*> \param[in,out] IPIV
+*> \verbatim
+*>          IPIV is INTEGER array, dimension (N)
+*>          If FACT = 'F', then IPIV is an input argument and on entry
+*>          contains the pivot indices from the factorization A = L*U
+*>          as computed by ZGBTRF; row i of the matrix was interchanged
+*>          with row IPIV(i).
+*>
+*>          If FACT = 'N', then IPIV is an output argument and on exit
+*>          contains the pivot indices from the factorization A = L*U
+*>          of the original matrix A.
+*>
+*>          If FACT = 'E', then IPIV is an output argument and on exit
+*>          contains the pivot indices from the factorization A = L*U
+*>          of the equilibrated matrix A.
+*> \endverbatim
+*>
+*> \param[in,out] EQUED
+*> \verbatim
+*>          EQUED is CHARACTER*1
+*>          Specifies the form of equilibration that was done.
+*>          = 'N':  No equilibration (always true if FACT = 'N').
+*>          = 'R':  Row equilibration, i.e., A has been premultiplied by
+*>                  diag(R).
+*>          = 'C':  Column equilibration, i.e., A has been postmultiplied
+*>                  by diag(C).
+*>          = 'B':  Both row and column equilibration, i.e., A has been
+*>                  replaced by diag(R) * A * diag(C).
+*>          EQUED is an input argument if FACT = 'F'; otherwise, it is an
+*>          output argument.
+*> \endverbatim
+*>
+*> \param[in,out] R
+*> \verbatim
+*>          R is DOUBLE PRECISION array, dimension (N)
+*>          The row scale factors for A.  If EQUED = 'R' or 'B', A is
+*>          multiplied on the left by diag(R); if EQUED = 'N' or 'C', R
+*>          is not accessed.  R is an input argument if FACT = 'F';
+*>          otherwise, R is an output argument.  If FACT = 'F' and
+*>          EQUED = 'R' or 'B', each element of R must be positive.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is DOUBLE PRECISION array, dimension (N)
+*>          The column scale factors for A.  If EQUED = 'C' or 'B', A is
+*>          multiplied on the right by diag(C); if EQUED = 'N' or 'R', C
+*>          is not accessed.  C is an input argument if FACT = 'F';
+*>          otherwise, C is an output argument.  If FACT = 'F' and
+*>          EQUED = 'C' or 'B', each element of C must be positive.
+*> \endverbatim
+*>
+*> \param[in,out] B
+*> \verbatim
+*>          B is COMPLEX*16 array, dimension (LDB,NRHS)
+*>          On entry, the right hand side matrix B.
+*>          On exit,
+*>          if EQUED = 'N', B is not modified;
+*>          if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by
+*>          diag(R)*B;
+*>          if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is
+*>          overwritten by diag(C)*B.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*>          LDB is INTEGER
+*>          The leading dimension of the array B.  LDB >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] X
+*> \verbatim
+*>          X is COMPLEX*16 array, dimension (LDX,NRHS)
+*>          If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X
+*>          to the original system of equations.  Note that A and B are
+*>          modified on exit if EQUED .ne. 'N', and the solution to the
+*>          equilibrated system is inv(diag(C))*X if TRANS = 'N' and
+*>          EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C'
+*>          and EQUED = 'R' or 'B'.
+*> \endverbatim
+*>
+*> \param[in] LDX
+*> \verbatim
+*>          LDX is INTEGER
+*>          The leading dimension of the array X.  LDX >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] RCOND
+*> \verbatim
+*>          RCOND is DOUBLE PRECISION
+*>          The estimate of the reciprocal condition number of the matrix
+*>          A after equilibration (if done).  If RCOND is less than the
+*>          machine precision (in particular, if RCOND = 0), the matrix
+*>          is singular to working precision.  This condition is
+*>          indicated by a return code of INFO > 0.
+*> \endverbatim
+*>
+*> \param[out] FERR
+*> \verbatim
+*>          FERR is DOUBLE PRECISION array, dimension (NRHS)
+*>          The estimated forward error bound for each solution vector
+*>          X(j) (the j-th column of the solution matrix X).
+*>          If XTRUE is the true solution corresponding to X(j), FERR(j)
+*>          is an estimated upper bound for the magnitude of the largest
+*>          element in (X(j) - XTRUE) divided by the magnitude of the
+*>          largest element in X(j).  The estimate is as reliable as
+*>          the estimate for RCOND, and is almost always a slight
+*>          overestimate of the true error.
+*> \endverbatim
+*>
+*> \param[out] BERR
+*> \verbatim
+*>          BERR is DOUBLE PRECISION array, dimension (NRHS)
+*>          The componentwise relative backward error of each solution
+*>          vector X(j) (i.e., the smallest relative change in
+*>          any element of A or B that makes X(j) an exact solution).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX*16 array, dimension (2*N)
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is DOUBLE PRECISION array, dimension (MAX(1,N))
+*>          On exit, RWORK(1) contains the reciprocal pivot growth
+*>          factor norm(A)/norm(U). The "max absolute element" norm is
+*>          used. If RWORK(1) is much less than 1, then the stability
+*>          of the LU factorization of the (equilibrated) matrix A
+*>          could be poor. This also means that the solution X, condition
+*>          estimator RCOND, and forward error bound FERR could be
+*>          unreliable. If factorization fails with 0<INFO<=N, then
+*>          RWORK(1) contains the reciprocal pivot growth factor for the
+*>          leading INFO columns of A.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i, and i is
+*>                <= N:  U(i,i) is exactly zero.  The factorization
+*>                       has been completed, but the factor U is exactly
+*>                       singular, so the solution and error bounds
+*>                       could not be computed. RCOND = 0 is returned.
+*>                = N+1: U is nonsingular, but RCOND is less than machine
+*>                       precision, meaning that the matrix is singular
+*>                       to working precision.  Nevertheless, the
+*>                       solution and error bounds are computed because
+*>                       there are a number of situations where the
+*>                       computed solution can be more accurate than the
+*>                       value of RCOND would suggest.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup complex16GBsolve
+*
+*  =====================================================================
+      SUBROUTINE ZGBSVX( FACT, TRANS, N, KL, KU, NRHS, AB, LDAB, AFB,
+     $                   LDAFB, IPIV, EQUED, R, C, B, LDB, X, LDX,
+     $                   RCOND, FERR, BERR, WORK, RWORK, INFO )
+*
+*  -- LAPACK driver routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd.--
+*
+*     .. Scalar Arguments ..
+      CHARACTER          EQUED, FACT, TRANS
+      INTEGER            INFO, KL, KU, LDAB, LDAFB, LDB, LDX, N, NRHS
+      DOUBLE PRECISION   RCOND
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IPIV( * )
+      DOUBLE PRECISION   BERR( * ), C( * ), FERR( * ), R( * ),
+     $                   RWORK( * )
+      COMPLEX*16         AB( LDAB, * ), AFB( LDAFB, * ), B( LDB, * ),
+     $                   WORK( * ), X( LDX, * )
+*     ..
+*
+*  =====================================================================
+*  Moved setting of INFO = N+1 so INFO does not subsequently get
+*  overwritten.  Sven, 17 Mar 05.
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU
+      CHARACTER          NORM
+      INTEGER            I, INFEQU, J, J1, J2
+      DOUBLE PRECISION   AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN,
+     $                   ROWCND, RPVGRW, SMLNUM
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      DOUBLE PRECISION   DLAMCH, ZLANGB, ZLANTB
+      EXTERNAL           LSAME, DLAMCH, ZLANGB, ZLANTB
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA, ZCOPY, ZGBCON, ZGBEQU, ZGBRFS, ZGBTRF,
+     $                   ZGBTRS, ZLACPY, ZLAQGB
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*     Decode FACT/TRANS; for FACT = 'N' or 'E' start from EQUED = 'N'.
+      INFO = 0
+      NOFACT = LSAME( FACT, 'N' )
+      EQUIL = LSAME( FACT, 'E' )
+      NOTRAN = LSAME( TRANS, 'N' )
+      IF( NOFACT .OR. EQUIL ) THEN
+         EQUED = 'N'
+         ROWEQU = .FALSE.
+         COLEQU = .FALSE.
+      ELSE
+         ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+         COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         SMLNUM = DLAMCH( 'Safe minimum' )
+         BIGNUM = ONE / SMLNUM
+      END IF
+*     Test the input parameters.  With FACT = 'F' and row/column
+*     scaling claimed by EQUED, R and/or C must be strictly positive;
+*     ROWCND and COLCND are then computed from them.
+      IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) )
+     $     THEN
+         INFO = -1
+      ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT.
+     $         LSAME( TRANS, 'C' ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( KL.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( KU.LT.0 ) THEN
+         INFO = -5
+      ELSE IF( NRHS.LT.0 ) THEN
+         INFO = -6
+      ELSE IF( LDAB.LT.KL+KU+1 ) THEN
+         INFO = -8
+      ELSE IF( LDAFB.LT.2*KL+KU+1 ) THEN
+         INFO = -10
+      ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT.
+     $         ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN
+         INFO = -12
+      ELSE
+         IF( ROWEQU ) THEN
+            RCMIN = BIGNUM
+            RCMAX = ZERO
+            DO 10 J = 1, N
+               RCMIN = MIN( RCMIN, R( J ) )
+               RCMAX = MAX( RCMAX, R( J ) )
+   10       CONTINUE
+            IF( RCMIN.LE.ZERO ) THEN
+               INFO = -13
+            ELSE IF( N.GT.0 ) THEN
+               ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM )
+            ELSE
+               ROWCND = ONE
+            END IF
+         END IF
+         IF( COLEQU .AND. INFO.EQ.0 ) THEN
+            RCMIN = BIGNUM
+            RCMAX = ZERO
+            DO 20 J = 1, N
+               RCMIN = MIN( RCMIN, C( J ) )
+               RCMAX = MAX( RCMAX, C( J ) )
+   20       CONTINUE
+            IF( RCMIN.LE.ZERO ) THEN
+               INFO = -14
+            ELSE IF( N.GT.0 ) THEN
+               COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM )
+            ELSE
+               COLCND = ONE
+            END IF
+         END IF
+         IF( INFO.EQ.0 ) THEN
+            IF( LDB.LT.MAX( 1, N ) ) THEN
+               INFO = -16
+            ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
+               INFO = -18
+            END IF
+         END IF
+      END IF
+*     Report an illegal argument through XERBLA and quit.
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'ZGBSVX', -INFO )
+         RETURN
+      END IF
+*
+      IF( EQUIL ) THEN
+*
+*        Compute row and column scalings to equilibrate the matrix A.
+*
+         CALL ZGBEQU( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND,
+     $                AMAX, INFEQU )
+         IF( INFEQU.EQ.0 ) THEN
+*
+*           Equilibrate the matrix, then refresh the scaling flags
+*           from EQUED.
+            CALL ZLAQGB( N, N, KL, KU, AB, LDAB, R, C, ROWCND, COLCND,
+     $                   AMAX, EQUED )
+            ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+            COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         END IF
+      END IF
+*
+*     Scale the right hand side: by diag(R) when TRANS = 'N', otherwise
+*     by diag(C), and only if that scaling is in effect.
+      IF( NOTRAN ) THEN
+         IF( ROWEQU ) THEN
+            DO 40 J = 1, NRHS
+               DO 30 I = 1, N
+                  B( I, J ) = R( I )*B( I, J )
+   30          CONTINUE
+   40       CONTINUE
+         END IF
+      ELSE IF( COLEQU ) THEN
+         DO 60 J = 1, NRHS
+            DO 50 I = 1, N
+               B( I, J ) = C( I )*B( I, J )
+   50       CONTINUE
+   60    CONTINUE
+      END IF
+*
+      IF( NOFACT .OR. EQUIL ) THEN
+*
+*        Compute the LU factorization of the band matrix A.  Each
+*        column's band is copied into AFB shifted down by KL rows.
+         DO 70 J = 1, N
+            J1 = MAX( J-KU, 1 )
+            J2 = MIN( J+KL, N )
+            CALL ZCOPY( J2-J1+1, AB( KU+1-J+J1, J ), 1,
+     $                  AFB( KL+KU+1-J+J1, J ), 1 )
+   70    CONTINUE
+*
+         CALL ZGBTRF( N, N, KL, KU, AFB, LDAFB, IPIV, INFO )
+*
+*        Return if INFO is positive (U is exactly singular).
+*
+         IF( INFO.GT.0 ) THEN
+*
+*           Compute the reciprocal pivot growth factor of the
+*           leading rank-deficient INFO columns of A.
+*           ANORM is the largest ABS( AB( I, J ) ) over those columns.
+            ANORM = ZERO
+            DO 90 J = 1, INFO
+               DO 80 I = MAX( KU+2-J, 1 ), MIN( N+KU+1-J, KL+KU+1 )
+                  ANORM = MAX( ANORM, ABS( AB( I, J ) ) )
+   80          CONTINUE
+   90       CONTINUE
+            RPVGRW = ZLANTB( 'M', 'U', 'N', INFO, MIN( INFO-1, KL+KU ),
+     $                       AFB( MAX( 1, KL+KU+2-INFO ), 1 ), LDAFB,
+     $                       RWORK )
+            IF( RPVGRW.EQ.ZERO ) THEN
+               RPVGRW = ONE
+            ELSE
+               RPVGRW = ANORM / RPVGRW
+            END IF
+            RWORK( 1 ) = RPVGRW
+            RCOND = ZERO
+            RETURN
+         END IF
+      END IF
+*
+*     Compute the norm of the matrix A and the
+*     reciprocal pivot growth factor RPVGRW.
+*     NORM is '1' for TRANS = 'N' and 'I' otherwise.
+      IF( NOTRAN ) THEN
+         NORM = '1'
+      ELSE
+         NORM = 'I'
+      END IF
+      ANORM = ZLANGB( NORM, N, KL, KU, AB, LDAB, RWORK )
+      RPVGRW = ZLANTB( 'M', 'U', 'N', N, KL+KU, AFB, LDAFB, RWORK )
+      IF( RPVGRW.EQ.ZERO ) THEN
+         RPVGRW = ONE
+      ELSE
+         RPVGRW = ZLANGB( 'M', N, KL, KU, AB, LDAB, RWORK ) / RPVGRW
+      END IF
+*
+*     Compute the reciprocal of the condition number of A.
+*
+      CALL ZGBCON( NORM, N, KL, KU, AFB, LDAFB, IPIV, ANORM, RCOND,
+     $             WORK, RWORK, INFO )
+*
+*     Compute the solution matrix X.
+*
+      CALL ZLACPY( 'Full', N, NRHS, B, LDB, X, LDX )
+      CALL ZGBTRS( TRANS, N, KL, KU, NRHS, AFB, LDAFB, IPIV, X, LDX,
+     $             INFO )
+*
+*     Use iterative refinement to improve the computed solution and
+*     compute error bounds and backward error estimates for it.
+*
+      CALL ZGBRFS( TRANS, N, KL, KU, NRHS, AB, LDAB, AFB, LDAFB, IPIV,
+     $             B, LDB, X, LDX, FERR, BERR, WORK, RWORK, INFO )
+*
+*     Transform the solution matrix X to a solution of the original
+*     system.
+*     FERR is divided by COLCND (TRANS = 'N') or ROWCND accordingly.
+      IF( NOTRAN ) THEN
+         IF( COLEQU ) THEN
+            DO 110 J = 1, NRHS
+               DO 100 I = 1, N
+                  X( I, J ) = C( I )*X( I, J )
+  100          CONTINUE
+  110       CONTINUE
+            DO 120 J = 1, NRHS
+               FERR( J ) = FERR( J ) / COLCND
+  120       CONTINUE
+         END IF
+      ELSE IF( ROWEQU ) THEN
+         DO 140 J = 1, NRHS
+            DO 130 I = 1, N
+               X( I, J ) = R( I )*X( I, J )
+  130       CONTINUE
+  140    CONTINUE
+         DO 150 J = 1, NRHS
+            FERR( J ) = FERR( J ) / ROWCND
+  150    CONTINUE
+      END IF
+*
+*     Set INFO = N+1 if the matrix is singular to working precision.
+*     Done last so that no later call can overwrite it.
+      IF( RCOND.LT.DLAMCH( 'Epsilon' ) )
+     $   INFO = N + 1
+*     Return the reciprocal pivot growth factor in RWORK( 1 ).
+      RWORK( 1 ) = RPVGRW
+      RETURN
+*
+*     End of ZGBSVX
+*
+      END
diff --git a/lapack-netlib/zgejsv.f b/lapack-netlib/zgejsv.f
new file mode 100644
index 000000000..5fe899e50
--- /dev/null
+++ b/lapack-netlib/zgejsv.f
@@ -0,0 +1,2234 @@
+*> \brief \b ZGEJSV
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZGEJSV + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zgejsv.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zgejsv.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zgejsv.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*     SUBROUTINE ZGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP,
+*                         M, N, A, LDA, SVA, U, LDU, V, LDV,
+*                         CWORK, LWORK, RWORK, LRWORK, IWORK, INFO )
+*
+*     .. Scalar Arguments ..
+*     IMPLICIT    NONE
+*     INTEGER     INFO, LDA, LDU, LDV, LWORK, LRWORK, M, N
+*     ..
+*     .. Array Arguments ..
+*     COMPLEX*16     A( LDA, * ),  U( LDU, * ), V( LDV, * ), CWORK( LWORK )
+*     DOUBLE PRECISION   SVA( N ), RWORK( LRWORK )
+*     INTEGER     IWORK( * )
+*     CHARACTER*1 JOBA, JOBP, JOBR, JOBT, JOBU, JOBV
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZGEJSV computes the singular value decomposition (SVD) of a complex M-by-N
+*> matrix [A], where M >= N. The SVD of [A] is written as
+*>
+*>              [A] = [U] * [SIGMA] * [V]^*,
+*>
+*> where [SIGMA] is an N-by-N (M-by-N) matrix which is zero except for its N
+*> diagonal elements, [U] is an M-by-N (or M-by-M) unitary matrix, and
+*> [V] is an N-by-N unitary matrix. The diagonal elements of [SIGMA] are
+*> the singular values of [A]. The columns of [U] and [V] are the left and
+*> the right singular vectors of [A], respectively. The matrices [U] and [V]
+*> are computed and stored in the arrays U and V, respectively. The diagonal
+*> of [SIGMA] is computed and stored in the array SVA.
+*> \endverbatim
+*>
+*>  Arguments:
+*>  ==========
+*>
+*> \param[in] JOBA
+*> \verbatim
+*>          JOBA is CHARACTER*1
+*>         Specifies the level of accuracy:
+*>       = 'C': This option works well (high relative accuracy) if A = B * D,
+*>              with well-conditioned B and arbitrary diagonal matrix D.
+*>              The accuracy cannot be spoiled by COLUMN scaling. The
+*>              accuracy of the computed output depends on the condition of
+*>              B, and the procedure aims at the best theoretical accuracy.
+*>              The relative error max_{i=1:N}|d sigma_i| / sigma_i is
+*>              bounded by f(M,N)*epsilon* cond(B), independent of D.
+*>              The input matrix is preprocessed with the QRF with column
+*>              pivoting. This initial preprocessing and preconditioning by
+*>              a rank revealing QR factorization is common for all values of
+*>              JOBA. Additional actions are specified as follows:
+*>       = 'E': Computation as with 'C' with an additional estimate of the
+*>              condition number of B. It provides a realistic error bound.
+*>       = 'F': If A = D1 * C * D2 with ill-conditioned diagonal scalings
+*>              D1, D2, and well-conditioned matrix C, this option gives
+*>              higher accuracy than the 'C' option. If the structure of the
+*>              input matrix is not known, and relative accuracy is
+*>              desirable, then this option is advisable. The input matrix A
+*>              is preprocessed with QR factorization with FULL (row and
+*>              column) pivoting.
+*>       = 'G': Computation as with 'F' with an additional estimate of the
+*>              condition number of B, where A=B*D. If A has heavily weighted
+*>              rows, then using this condition number gives too pessimistic
+*>              error bound.
+*>       = 'A': Small singular values are not well determined by the data 
+*>              and are considered as noisy; the matrix is treated as
+*>              numerically rank deficient. The error in the computed
+*>              singular values is bounded by f(m,n)*epsilon*||A||.
+*>              The computed SVD A = U * S * V^* restores A up to
+*>              f(m,n)*epsilon*||A||.
+*>              This gives the procedure the licence to discard (set to zero)
+*>              all singular values below N*epsilon*||A||.
+*>       = 'R': Similar to 'A'. Rank revealing property of the initial
+*>              QR factorization is used to reveal (using triangular factor)
+*>              a gap sigma_{r+1} < epsilon * sigma_r in which case the
+*>              numerical RANK is declared to be r. The SVD is computed with
+*>              absolute error bounds, but more accurately than with 'A'.
+*> \endverbatim
+*>
+*> \param[in] JOBU
+*> \verbatim
+*>          JOBU is CHARACTER*1
+*>         Specifies whether to compute the columns of U:
+*>       = 'U': N columns of U are returned in the array U.
+*>       = 'F': full set of M left sing. vectors is returned in the array U.
+*>       = 'W': U may be used as workspace of length M*N. See the description
+*>              of U.
+*>       = 'N': U is not computed.
+*> \endverbatim
+*>
+*> \param[in] JOBV
+*> \verbatim
+*>          JOBV is CHARACTER*1
+*>         Specifies whether to compute the matrix V:
+*>       = 'V': N columns of V are returned in the array V; Jacobi rotations
+*>              are not explicitly accumulated.
+*>       = 'J': N columns of V are returned in the array V, but they are
+*>              computed as the product of Jacobi rotations, if JOBT = 'N'.
+*>       = 'W': V may be used as workspace of length N*N. See the description
+*>              of V.
+*>       = 'N': V is not computed.
+*> \endverbatim
+*>
+*> \param[in] JOBR
+*> \verbatim
+*>          JOBR is CHARACTER*1
+*>         Specifies the RANGE for the singular values. Issues the licence to
+*>         set to zero small positive singular values if they are outside
+*>         specified range. If A .NE. 0 is scaled so that the largest singular
+*>         value of c*A is around SQRT(BIG), BIG=DLAMCH('O'), then JOBR issues
+*>         the licence to kill columns of A whose norm in c*A is less than
+*>         SQRT(SFMIN) (for JOBR = 'R'), or less than SMALL=SFMIN/EPSLN,
+*>         where SFMIN=DLAMCH('S'), EPSLN=DLAMCH('E').
+*>       = 'N': Do not kill small columns of c*A. This option assumes that
+*>              BLAS and QR factorizations and triangular solvers are
+*>              implemented to work in that range. If the condition of A
+*>              is greater than BIG, use ZGESVJ.
+*>       = 'R': RESTRICTED range for sigma(c*A) is [SQRT(SFMIN), SQRT(BIG)]
+*>              (roughly, as described above). This option is recommended.
+*>                                             ===========================
+*>         For computing the singular values in the FULL range [SFMIN,BIG]
+*>         use ZGESVJ.
+*> \endverbatim
+*>
+*> \param[in] JOBT
+*> \verbatim
+*>          JOBT is CHARACTER*1
+*>         If the matrix is square then the procedure may determine to use
+*>         transposed A if A^* seems to be better with respect to convergence.
+*>         If the matrix is not square, JOBT is ignored. 
+*>         The decision is based on two values of entropy over the adjoint
+*>         orbit of A^* * A. See the descriptions of RWORK(6) and RWORK(7).
+*>       = 'T': transpose if entropy test indicates possibly faster
+*>         convergence of Jacobi process if A^* is taken as input. If A is
+*>         replaced with A^*, then the row pivoting is included automatically.
+*>       = 'N': do not speculate.
+*>         The option 'T' can be used to compute only the singular values, or
+*>         the full SVD (U, SIGMA and V). For only one set of singular vectors
+*>         (U or V), the caller should provide both U and V, as one of the
+*>         matrices is used as workspace if the matrix A is transposed.
+*>         The implementer can easily remove this constraint and make the
+*>         code more complicated. See the descriptions of U and V.
+*>         In general, this option is considered experimental, and 'N' should
+*>         be preferred. This is subject to changes in the future.
+*> \endverbatim
+*>
+*> \param[in] JOBP
+*> \verbatim
+*>          JOBP is CHARACTER*1
+*>         Issues the licence to introduce structured perturbations to drown
+*>         denormalized numbers. This licence should be active if the
+*>         denormals are poorly implemented, causing slow computation,
+*>         especially in cases of fast convergence (!). For details see [1,2].
+*>         For the sake of simplicity, these perturbations are included only
+*>         when the full SVD or only the singular values are requested. The
+*>         implementer/user can easily add the perturbation for the cases of
+*>         computing one set of singular vectors.
+*>       = 'P': introduce perturbation
+*>       = 'N': do not perturb
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>         The number of rows of the input matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>         The number of columns of the input matrix A. M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX*16 array, dimension (LDA,N)
+*>          On entry, the M-by-N matrix A.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] SVA
+*> \verbatim
+*>          SVA is DOUBLE PRECISION array, dimension (N)
+*>          On exit,
+*>          - For RWORK(1)/RWORK(2) = ONE: The singular values of A. During
+*>            the computation SVA contains Euclidean column norms of the
+*>            iterated matrices in the array A.
+*>          - For RWORK(1) .NE. RWORK(2): The singular values of A are
+*>            (RWORK(1)/RWORK(2)) * SVA(1:N). This factored form is used if
+*>            sigma_max(A) overflows or if small singular values have been
+*>            saved from underflow by scaling the input matrix A.
+*>          - If JOBR='R' then some of the singular values may be returned
+*>            as exact zeros obtained by "set to zero" because they are
+*>            below the numerical rank threshold or are denormalized numbers.
+*> \endverbatim
+*>
+*> \param[out] U
+*> \verbatim
+*>          U is COMPLEX*16 array, dimension ( LDU, N )
+*>          If JOBU = 'U', then U contains on exit the M-by-N matrix of
+*>                         the left singular vectors.
+*>          If JOBU = 'F', then U contains on exit the M-by-M matrix of
+*>                         the left singular vectors, including an ONB
+*>                         of the orthogonal complement of the Range(A).
+*>          If JOBU = 'W'  .AND. (JOBV = 'V' .AND. JOBT = 'T' .AND. M = N),
+*>                         then U is used as workspace if the procedure
+*>                         replaces A with A^*. In that case, [V] is computed
+*>                         in U as left singular vectors of A^* and then
+*>                         copied back to the V array. This 'W' option is just
+*>                         a reminder to the caller that in this case U is
+*>                         reserved as workspace of length N*N.
+*>          If JOBU = 'N'  U is not referenced, unless JOBT='T'.
+*> \endverbatim
+*>
+*> \param[in] LDU
+*> \verbatim
+*>          LDU is INTEGER
+*>          The leading dimension of the array U,  LDU >= 1.
+*>          IF  JOBU = 'U' or 'F' or 'W',  then LDU >= M.
+*> \endverbatim
+*>
+*> \param[out] V
+*> \verbatim
+*>          V is COMPLEX*16 array, dimension ( LDV, N )
+*>          If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of
+*>                         the right singular vectors;
+*>          If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N),
+*>                         then V is used as workspace if the procedure
+*>                         replaces A with A^*. In that case, [U] is computed
+*>                         in V as right singular vectors of A^* and then
+*>                         copied back to the U array. This 'W' option is just
+*>                         a reminder to the caller that in this case V is
+*>                         reserved as workspace of length N*N.
+*>          If JOBV = 'N'  V is not referenced, unless JOBT='T'.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*>          LDV is INTEGER
+*>          The leading dimension of the array V,  LDV >= 1.
+*>          If JOBV = 'V' or 'J' or 'W', then LDV >= N.
+*> \endverbatim
+*>
+*> \param[out] CWORK
+*> \verbatim
+*>          CWORK is COMPLEX*16 array, dimension (MAX(2,LWORK))
+*>          If the call to ZGEJSV is a workspace query (indicated by LWORK=-1 or
+*>          LRWORK=-1), then on exit CWORK(1) contains the required length of
+*>          CWORK for the job parameters used in the call.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          Length of CWORK to confirm proper allocation of workspace.
+*>          LWORK depends on the job:
+*>
+*>          1. If only SIGMA is needed ( JOBU = 'N', JOBV = 'N' ) and
+*>            1.1 .. no scaled condition estimate required (JOBA.NE.'E'.AND.JOBA.NE.'G'):
+*>               LWORK >= 2*N+1. This is the minimal requirement.
+*>               ->> For optimal performance (blocked code) the optimal value
+*>               is LWORK >= N + (N+1)*NB. Here NB is the optimal
+*>               block size for ZGEQP3 and ZGEQRF.
+*>               In general, optimal LWORK is computed as
+*>               LWORK >= max(N+LWORK(ZGEQP3),N+LWORK(ZGEQRF), LWORK(ZGESVJ)).
+*>            1.2. .. an estimate of the scaled condition number of A is
+*>               required (JOBA='E', or 'G'). In this case, the minimal
+*>               requirement is LWORK >= N*N + 2*N.
+*>               ->> For optimal performance (blocked code) the optimal value
+*>               is LWORK >= max(N+(N+1)*NB, N*N+2*N)=N**2+2*N.
+*>               In general, the optimal length LWORK is computed as
+*>               LWORK >= max(N+LWORK(ZGEQP3),N+LWORK(ZGEQRF), LWORK(ZGESVJ),
+*>                            N*N+LWORK(ZPOCON)).
+*>          2. If SIGMA and the right singular vectors are needed (JOBV = 'V'),
+*>             (JOBU = 'N')
+*>            2.1   .. no scaled condition estimate requested (JOBA.NE.'E'.AND.JOBA.NE.'G'):
+*>            -> the minimal requirement is LWORK >= 3*N.
+*>            -> For optimal performance, 
+*>               LWORK >= max(N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB,
+*>               where NB is the optimal block size for ZGEQP3, ZGEQRF, ZGELQF,
+*>               ZUNMLQ. In general, the optimal length LWORK is computed as
+*>               LWORK >= max(N+LWORK(ZGEQP3), N+LWORK(ZGESVJ),
+*>                       N+LWORK(ZGELQF), 2*N+LWORK(ZGEQRF), N+LWORK(ZUNMLQ)).
+*>            2.2 .. an estimate of the scaled condition number of A is
+*>               required (JOBA='E', or 'G').
+*>            -> the minimal requirement is LWORK >= 3*N.      
+*>            -> For optimal performance, 
+*>               LWORK >= max(N+(N+1)*NB, 2*N,2*N+N*NB)=2*N+N*NB,
+*>               where NB is the optimal block size for ZGEQP3, ZGEQRF, ZGELQF,
+*>               ZUNMLQ. In general, the optimal length LWORK is computed as
+*>               LWORK >= max(N+LWORK(ZGEQP3), LWORK(ZPOCON), N+LWORK(ZGESVJ),
+*>                       N+LWORK(ZGELQF), 2*N+LWORK(ZGEQRF), N+LWORK(ZUNMLQ)).   
+*>          3. If SIGMA and the left singular vectors are needed
+*>            3.1  .. no scaled condition estimate requested (JOBA.NE.'E'.AND.JOBA.NE.'G'):
+*>            -> the minimal requirement is LWORK >= 3*N.
+*>            -> For optimal performance:
+*>               if JOBU = 'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB,
+*>               where NB is the optimal block size for ZGEQP3, ZGEQRF, ZUNMQR.
+*>               In general, the optimal length LWORK is computed as
+*>               LWORK >= max(N+LWORK(ZGEQP3), 2*N+LWORK(ZGEQRF), N+LWORK(ZUNMQR)). 
+*>            3.2  .. an estimate of the scaled condition number of A is
+*>               required (JOBA='E', or 'G').
+*>            -> the minimal requirement is LWORK >= 3*N.
+*>            -> For optimal performance:
+*>               if JOBU = 'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB,
+*>               where NB is the optimal block size for ZGEQP3, ZGEQRF, ZUNMQR.
+*>               In general, the optimal length LWORK is computed as
+*>               LWORK >= max(N+LWORK(ZGEQP3),N+LWORK(ZPOCON),
+*>                        2*N+LWORK(ZGEQRF), N+LWORK(ZUNMQR)).
+*>          4. If the full SVD is needed: (JOBU = 'U' or JOBU = 'F') and 
+*>            4.1. if JOBV = 'V'  
+*>               the minimal requirement is LWORK >= 5*N+2*N*N. 
+*>            4.2. if JOBV = 'J' the minimal requirement is 
+*>               LWORK >= 4*N+N*N.
+*>            In both cases, the allocated CWORK can accommodate blocked runs
+*>            of ZGEQP3, ZGEQRF, ZGELQF, ZUNMQR, ZUNMLQ.
+*>
+*>          If the call to ZGEJSV is a workspace query (indicated by LWORK=-1 or
+*>          LRWORK=-1), then on exit CWORK(1) contains the optimal and CWORK(2) contains the
+*>          minimal length of CWORK for the job parameters used in the call.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is DOUBLE PRECISION array, dimension (MAX(7,LRWORK))
+*>          On exit,
+*>          RWORK(1) = Determines the scaling factor SCALE = RWORK(1) / RWORK(2)
+*>                    such that SCALE*SVA(1:N) are the computed singular values
+*>                    of A. (See the description of SVA().)
+*>          RWORK(2) = See the description of RWORK(1).
+*>          RWORK(3) = SCONDA is an estimate for the condition number of
+*>                    column equilibrated A. (If JOBA = 'E' or 'G')
+*>                    SCONDA is an estimate of SQRT(||(R^* * R)^(-1)||_1).
+*>                    It is computed using ZPOCON. It holds
+*>                    N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA
+*>                    where R is the triangular factor from the QRF of A.
+*>                    However, if R is truncated and the numerical rank is
+*>                    determined to be strictly smaller than N, SCONDA is
+*>                    returned as -1, thus indicating that the smallest
+*>                    singular values might be lost.
+*>
+*>          If full SVD is needed, the following two condition numbers are
+*>          useful for the analysis of the algorithm. They are provided for
+*>          a developer/implementer who is familiar with the details of
+*>          the method.
+*>
+*>          RWORK(4) = an estimate of the scaled condition number of the
+*>                    triangular factor in the first QR factorization.
+*>          RWORK(5) = an estimate of the scaled condition number of the
+*>                    triangular factor in the second QR factorization.
+*>          The following two parameters are computed if JOBT = 'T'.
+*>          They are provided for a developer/implementer who is familiar
+*>          with the details of the method.
+*>          RWORK(6) = the entropy of A^* * A :: this is the Shannon entropy
+*>                    of diag(A^* * A) / Trace(A^* * A) taken as point in the
+*>                    probability simplex.
+*>          RWORK(7) = the entropy of A * A^*. (See the description of RWORK(6).)
+*>          If the call to ZGEJSV is a workspace query (indicated by LWORK=-1 or
+*>          LRWORK=-1), then on exit RWORK(1) contains the required length of
+*>          RWORK for the job parameters used in the call.
+*> \endverbatim
+*>
+*> \param[in] LRWORK
+*> \verbatim
+*>          LRWORK is INTEGER
+*>          Length of RWORK to confirm proper allocation of workspace.
+*>          LRWORK depends on the job:
+*>
+*>       1. If only the singular values are requested i.e. if
+*>          LSAME(JOBU,'N') .AND. LSAME(JOBV,'N')
+*>          then:
+*>          1.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'),
+*>               then: LRWORK = max( 7, 2 * M ).
+*>          1.2. Otherwise, LRWORK  = max( 7,  N ).
+*>       2. If singular values with the right singular vectors are requested
+*>          i.e. if
+*>          (LSAME(JOBV,'V').OR.LSAME(JOBV,'J')) .AND.
+*>          .NOT.(LSAME(JOBU,'U').OR.LSAME(JOBU,'F'))
+*>          then:
+*>          2.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'),
+*>          then LRWORK = max( 7, 2 * M ).
+*>          2.2. Otherwise, LRWORK  = max( 7,  N ).
+*>       3. If singular values with the left singular vectors are requested, i.e. if
+*>          (LSAME(JOBU,'U').OR.LSAME(JOBU,'F')) .AND.
+*>          .NOT.(LSAME(JOBV,'V').OR.LSAME(JOBV,'J'))
+*>          then:
+*>          3.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'),
+*>          then LRWORK = max( 7, 2 * M ).
+*>          3.2. Otherwise, LRWORK  = max( 7,  N ).
+*>       4. If singular values with both the left and the right singular vectors
+*>          are requested, i.e. if
+*>          (LSAME(JOBU,'U').OR.LSAME(JOBU,'F')) .AND.
+*>          (LSAME(JOBV,'V').OR.LSAME(JOBV,'J'))
+*>          then:
+*>          4.1. If LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G'),
+*>          then LRWORK = max( 7, 2 * M ).
+*>          4.2. Otherwise, LRWORK  = max( 7, N ).
+*>
+*>          If, on entry, LRWORK = -1 or LWORK=-1, a workspace query is assumed and 
+*>          the length of RWORK is returned in RWORK(1). 
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, of dimension at least 4, that further depends 
+*>          on the job:
+*>
+*>          1. If only the singular values are requested then:
+*>             If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) 
+*>             then the length of IWORK is N+M; otherwise the length of IWORK is N.
+*>          2. If the singular values and the right singular vectors are requested then:
+*>             If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) 
+*>             then the length of IWORK is N+M; otherwise the length of IWORK is N. 
+*>          3. If the singular values and the left singular vectors are requested then:
+*>             If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) 
+*>             then the length of IWORK is N+M; otherwise the length of IWORK is N. 
+*>          4. If the singular values with both the left and the right singular vectors
+*>             are requested, then:      
+*>             4.1. If LSAME(JOBV,'J') the length of IWORK is determined as follows:
+*>                  If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) 
+*>                  then the length of IWORK is N+M; otherwise the length of IWORK is N. 
+*>             4.2. If LSAME(JOBV,'V') the length of IWORK is determined as follows:
+*>                  If ( LSAME(JOBT,'T') .OR. LSAME(JOBA,'F') .OR. LSAME(JOBA,'G') ) 
+*>                  then the length of IWORK is 2*N+M; otherwise the length of IWORK is 2*N.
+*>        
+*>          On exit,
+*>          IWORK(1) = the numerical rank determined after the initial
+*>                     QR factorization with pivoting. See the descriptions
+*>                     of JOBA and JOBR.
+*>          IWORK(2) = the number of the computed nonzero singular values
+*>          IWORK(3) = if nonzero, a warning message:
+*>                     If IWORK(3) = 1 then some of the column norms of A
+*>                     were denormalized floats. The requested high accuracy
+*>                     is not warranted by the data.
+*>          IWORK(4) = 1 or -1. If IWORK(4) = 1, then the procedure used A^* to
+*>                     do the job as specified by the JOB parameters.
+*>          If the call to ZGEJSV is a workspace query (indicated by LWORK = -1 or
+*>          LRWORK = -1), then on exit IWORK(1) contains the required length of 
+*>          IWORK for the job parameters used in the call.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>           < 0:  if INFO = -i, then the i-th argument had an illegal value.
+*>           = 0:  successful exit;
+*>           > 0:  ZGEJSV  did not converge in the maximal allowed number
+*>                 of sweeps. The computed values may be inaccurate.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup complex16GEsing
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*>  ZGEJSV implements a preconditioned Jacobi SVD algorithm. It uses ZGEQP3,
+*>  ZGEQRF, and ZGELQF as preprocessors and preconditioners. Optionally, an
+*>  additional row pivoting can be used as a preprocessor, which in some
+*>  cases results in much higher accuracy. An example is matrix A with the
+*>  structure A = D1 * C * D2, where D1, D2 are arbitrarily ill-conditioned
+*>  diagonal matrices and C is a well-conditioned matrix. In that case, complete
+*>  pivoting in the first QR factorizations provides accuracy dependent on the
+*>  condition number of C, and independent of D1, D2. Such higher accuracy is
+*>  not completely understood theoretically, but it works well in practice.
+*>  Further, if A can be written as A = B*D, with well-conditioned B and some
+*>  diagonal D, then the high accuracy is guaranteed, both theoretically and
+*>  in software, independent of D. For more details see [1], [2].
+*>     The computational range for the singular values can be the full range
+*>  ( UNDERFLOW,OVERFLOW ), provided that the machine arithmetic and the BLAS
+*>  & LAPACK routines called by ZGEJSV are implemented to work in that range.
+*>  If that is not the case, then the restriction for safe computation with
+*>  the singular values in the range of normalized IEEE numbers is that the
+*>  spectral condition number kappa(A)=sigma_max(A)/sigma_min(A) does not
+*>  overflow. This code (ZGEJSV) is best used in this restricted range,
+*>  meaning that singular values of magnitude below ||A||_2 / DLAMCH('O') are
+*>  returned as zeros. See JOBR for details on this.
+*>     Further, this implementation is somewhat slower than the one described
+*>  in [1,2] due to replacement of some non-LAPACK components, and because
+*>  the choice of some tuning parameters in the iterative part (ZGESVJ) is
+*>  left to the implementer on a particular machine.
+*>     The rank revealing QR factorization (in this code: ZGEQP3) should be
+*>  implemented as in [3]. We have a new version of ZGEQP3 under development
+*>  that is more robust than the current one in LAPACK, with a cleaner cut in
+*>  rank deficient cases. It will be available in the SIGMA library [4].
+*>  If M is much larger than N, it is obvious that the initial QRF with
+*>  column pivoting can be preprocessed by the QRF without pivoting. That
+*>  well known trick is not used in ZGEJSV because in some cases heavy row
+*>  weighting can be treated with complete pivoting. The overhead in cases
+*>  M much larger than N is then only due to pivoting, but the benefits in
+*>  terms of accuracy have prevailed. The implementer/user can incorporate
+*>  this extra QRF step easily. The implementer can also improve data movement
+*>  (matrix transpose, matrix copy, matrix transposed copy) - this
+*>  implementation of ZGEJSV uses only the simplest, naive data movement.
+*> \endverbatim
+*
+*> \par Contributor:
+*  ==================
+*>
+*>  Zlatko Drmac, Department of Mathematics, Faculty of Science,
+*>  University of Zagreb (Zagreb, Croatia); drmac@math.hr
+*
+*> \par References:
+*  ================
+*>
+*> \verbatim
+*>
+*> [1] Z. Drmac and K. Veselic: New fast and accurate Jacobi SVD algorithm I.
+*>     SIAM J. Matrix Anal. Appl. Vol. 29, No. 4 (2008), pp. 1322-1342.
+*>     LAPACK Working note 169.
+*> [2] Z. Drmac and K. Veselic: New fast and accurate Jacobi SVD algorithm II.
+*>     SIAM J. Matrix Anal. Appl. Vol. 29, No. 4 (2008), pp. 1343-1362.
+*>     LAPACK Working note 170.
+*> [3] Z. Drmac and Z. Bujanovic: On the failure of rank-revealing QR
+*>     factorization software - a case study.
+*>     ACM Trans. Math. Softw. Vol. 35, No 2 (2008), pp. 1-28.
+*>     LAPACK Working note 176.
+*> [4] Z. Drmac: SIGMA - mathematical software library for accurate SVD, PSV,
+*>     QSVD, (H,K)-SVD computations.
+*>     Department of Mathematics, University of Zagreb, 2008, 2016.
+*> \endverbatim
+*
+*>  \par Bugs, examples and comments:
+*   =================================
+*>
+*>  Please report all bugs and send interesting examples and/or comments to
+*>  drmac@math.hr. Thank you.
+*>
+*  =====================================================================
+      SUBROUTINE ZGEJSV( JOBA, JOBU, JOBV, JOBR, JOBT, JOBP,
+     $                   M, N, A, LDA, SVA, U, LDU, V, LDV,
+     $                   CWORK, LWORK, RWORK, LRWORK, IWORK, INFO )
+*
+*  -- LAPACK computational routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      IMPLICIT    NONE
+      INTEGER     INFO, LDA, LDU, LDV, LWORK, LRWORK, M, N
+*     ..
+*     .. Array Arguments ..
+      COMPLEX*16       A( LDA, * ), U( LDU, * ), V( LDV, * ),
+     $                 CWORK( LWORK )
+      DOUBLE PRECISION SVA( N ), RWORK( LRWORK )
+      INTEGER          IWORK( * )
+      CHARACTER*1      JOBA, JOBP, JOBR, JOBT, JOBU, JOBV
+*     ..
+*
+*  ===========================================================================
+*
+*     .. Local Parameters ..
+      DOUBLE PRECISION ZERO, ONE
+      PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 )
+      COMPLEX*16 CZERO, CONE
+      PARAMETER ( CZERO = ( 0.0D0, 0.0D0 ), CONE = ( 1.0D0, 0.0D0 ) )
+*     ..
+*     .. Local Scalars ..
+      COMPLEX*16       CTEMP
+      DOUBLE PRECISION AAPP,    AAQQ,   AATMAX, AATMIN, BIG,    BIG1,
+     $                 COND_OK, CONDR1, CONDR2, ENTRA,  ENTRAT, EPSLN,
+     $                 MAXPRJ,  SCALEM, SCONDA, SFMIN,  SMALL,  TEMP1,
+     $                 USCAL1,  USCAL2, XSC
+      INTEGER IERR,   N1,     NR,     NUMRANK,        p, q,   WARNING
+      LOGICAL ALMORT, DEFR,   ERREST, GOSCAL,  JRACC,  KILL,   LQUERY,
+     $        LSVEC,  L2ABER, L2KILL, L2PERT,  L2RANK, L2TRAN, NOSCAL,
+     $        ROWPIV, RSVEC,  TRANSP
+*
+      INTEGER OPTWRK, MINWRK, MINRWRK, MINIWRK
+      INTEGER LWCON,  LWLQF, LWQP3, LWQRF, LWUNMLQ, LWUNMQR, LWUNMQRM,
+     $        LWSVDJ, LWSVDJV, LRWQP3, LRWCON, LRWSVDJ, IWOFF
+      INTEGER LWRK_ZGELQF, LWRK_ZGEQP3,  LWRK_ZGEQP3N, LWRK_ZGEQRF,  
+     $        LWRK_ZGESVJ, LWRK_ZGESVJV, LWRK_ZGESVJU, LWRK_ZUNMLQ, 
+     $        LWRK_ZUNMQR, LWRK_ZUNMQRM    
+*     ..
+*     .. Local Arrays
+      COMPLEX*16         CDUMMY(1)
+      DOUBLE PRECISION   RDUMMY(1)
+*
+*     .. Intrinsic Functions ..
+      INTRINSIC ABS, DCMPLX, CONJG, DLOG, MAX, MIN, DBLE, NINT, SQRT
+*     ..
+*     .. External Functions ..
+      DOUBLE PRECISION      DLAMCH, DZNRM2
+      INTEGER   IDAMAX, IZAMAX
+      LOGICAL   LSAME
+      EXTERNAL  IDAMAX, IZAMAX, LSAME, DLAMCH, DZNRM2
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL  DLASSQ, ZCOPY,  ZGELQF, ZGEQP3, ZGEQRF, ZLACPY, ZLAPMR,
+     $          ZLASCL, DLASCL, ZLASET, ZLASSQ, ZLASWP, ZUNGQR, ZUNMLQ,
+     $          ZUNMQR, ZPOCON, DSCAL,  ZDSCAL, ZSWAP,  ZTRSM,  ZLACGV,
+     $          XERBLA
+*
+      EXTERNAL  ZGESVJ
+*     ..
+*
+*     Test the input arguments
+*
+      LSVEC  = LSAME( JOBU, 'U' ) .OR. LSAME( JOBU, 'F' )
+      JRACC  = LSAME( JOBV, 'J' )
+      RSVEC  = LSAME( JOBV, 'V' ) .OR. JRACC
+      ROWPIV = LSAME( JOBA, 'F' ) .OR. LSAME( JOBA, 'G' )
+      L2RANK = LSAME( JOBA, 'R' )
+      L2ABER = LSAME( JOBA, 'A' )
+      ERREST = LSAME( JOBA, 'E' ) .OR. LSAME( JOBA, 'G' )
+      L2TRAN = LSAME( JOBT, 'T' ) .AND. ( M .EQ. N )
+      L2KILL = LSAME( JOBR, 'R' )
+      DEFR   = LSAME( JOBR, 'N' )
+      L2PERT = LSAME( JOBP, 'P' )
+*
+      LQUERY = ( LWORK .EQ. -1 ) .OR. ( LRWORK .EQ. -1 )
+*
+      IF ( .NOT.(ROWPIV .OR. L2RANK .OR. L2ABER .OR.
+     $     ERREST .OR. LSAME( JOBA, 'C' ) )) THEN
+         INFO = - 1
+      ELSE IF ( .NOT.( LSVEC .OR. LSAME( JOBU, 'N' ) .OR.
+     $   ( LSAME( JOBU, 'W' ) .AND. RSVEC .AND. L2TRAN ) ) ) THEN
+         INFO = - 2
+      ELSE IF ( .NOT.( RSVEC .OR. LSAME( JOBV, 'N' ) .OR.
+     $   ( LSAME( JOBV, 'W' ) .AND. LSVEC .AND. L2TRAN ) ) ) THEN
+         INFO = - 3
+      ELSE IF ( .NOT. ( L2KILL .OR. DEFR ) )    THEN
+         INFO = - 4
+      ELSE IF ( .NOT. ( LSAME(JOBT,'T') .OR. LSAME(JOBT,'N') ) ) THEN
+         INFO = - 5
+      ELSE IF ( .NOT. ( L2PERT .OR. LSAME( JOBP, 'N' ) ) ) THEN
+         INFO = - 6
+      ELSE IF ( M .LT. 0 ) THEN
+         INFO = - 7
+      ELSE IF ( ( N .LT. 0 ) .OR. ( N .GT. M ) ) THEN
+         INFO = - 8
+      ELSE IF ( LDA .LT. M ) THEN
+         INFO = - 10
+      ELSE IF ( LSVEC .AND. ( LDU .LT. M ) ) THEN
+         INFO = - 13
+      ELSE IF ( RSVEC .AND. ( LDV .LT. N ) ) THEN
+         INFO = - 15
+      ELSE
+*        #:)
+         INFO = 0
+      END IF
+*
+      IF ( INFO .EQ. 0 ) THEN 
+*         .. compute the minimal and the optimal workspace lengths 
+*         [[The expressions for computing the minimal and the optimal
+*         values of LCWORK, LRWORK are written with a lot of redundancy and
+*         can be simplified. However, this verbose form is useful for
+*         maintenance and modifications of the code.]]
+*
+*        .. minimal workspace length for ZGEQP3 of an M x N matrix,
+*         ZGEQRF of an N x N matrix, ZGELQF of an N x N matrix,
+*         ZUNMLQ for computing N x N matrix, ZUNMQR for computing N x N
+*         matrix, ZUNMQR for computing M x N matrix, respectively.
+          LWQP3 = N+1   
+          LWQRF = MAX( 1, N )
+          LWLQF = MAX( 1, N )
+          LWUNMLQ  = MAX( 1, N )
+          LWUNMQR  = MAX( 1, N )
+          LWUNMQRM = MAX( 1, M )
+*        .. minimal workspace length for ZPOCON of an N x N matrix
+          LWCON = 2 * N 
+*        .. minimal workspace length for ZGESVJ of an N x N matrix,
+*         without and with explicit accumulation of Jacobi rotations
+          LWSVDJ  = MAX( 2 * N, 1 )         
+          LWSVDJV = MAX( 2 * N, 1 )
+*         .. minimal REAL workspace length for ZGEQP3, ZPOCON, ZGESVJ
+          LRWQP3  = 2 * N 
+          LRWCON  = N 
+          LRWSVDJ = N 
+          IF ( LQUERY ) THEN 
+              CALL ZGEQP3( M, N, A, LDA, IWORK, CDUMMY, CDUMMY, -1, 
+     $             RDUMMY, IERR )
+              LWRK_ZGEQP3 = INT( CDUMMY(1) )
+              CALL ZGEQRF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR )
+              LWRK_ZGEQRF = INT( CDUMMY(1) )
+              CALL ZGELQF( N, N, A, LDA, CDUMMY, CDUMMY,-1, IERR )
+              LWRK_ZGELQF = INT( CDUMMY(1) )
+          END IF
+          MINWRK  = 2
+          OPTWRK  = 2
+          MINIWRK = N 
+          IF ( .NOT. (LSVEC .OR. RSVEC ) ) THEN
+*             .. minimal and optimal sizes of the complex workspace if
+*             only the singular values are requested
+              IF ( ERREST ) THEN 
+                  MINWRK = MAX( N+LWQP3, N**2+LWCON, N+LWQRF, LWSVDJ )
+              ELSE
+                  MINWRK = MAX( N+LWQP3, N+LWQRF, LWSVDJ )
+              END IF
+              IF ( LQUERY ) THEN 
+                  CALL ZGESVJ( 'L', 'N', 'N', N, N, A, LDA, SVA, N, V, 
+     $                 LDV, CDUMMY, -1, RDUMMY, -1, IERR )
+                  LWRK_ZGESVJ = INT( CDUMMY(1) )
+                  IF ( ERREST ) THEN 
+                      OPTWRK = MAX( N+LWRK_ZGEQP3, N**2+LWCON, 
+     $                              N+LWRK_ZGEQRF, LWRK_ZGESVJ )
+                  ELSE
+                      OPTWRK = MAX( N+LWRK_ZGEQP3, N+LWRK_ZGEQRF, 
+     $                              LWRK_ZGESVJ )
+                  END IF
+              END IF
+              IF ( L2TRAN .OR. ROWPIV ) THEN 
+                  IF ( ERREST ) THEN 
+                     MINRWRK = MAX( 7, 2*M,  LRWQP3, LRWCON, LRWSVDJ )
+                  ELSE
+                     MINRWRK = MAX( 7, 2*M,  LRWQP3, LRWSVDJ )
+                  END IF                 
+              ELSE
+                  IF ( ERREST ) THEN 
+                     MINRWRK = MAX( 7, LRWQP3, LRWCON, LRWSVDJ )
+                  ELSE
+                     MINRWRK = MAX( 7, LRWQP3, LRWSVDJ )
+                  END IF
+              END IF   
+              IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M 
+          ELSE IF ( RSVEC .AND. (.NOT.LSVEC) ) THEN
+*            .. minimal and optimal sizes of the complex workspace if the
+*            singular values and the right singular vectors are requested
+             IF ( ERREST ) THEN 
+                 MINWRK = MAX( N+LWQP3, LWCON, LWSVDJ, N+LWLQF,  
+     $                         2*N+LWQRF, N+LWSVDJ, N+LWUNMLQ )
+             ELSE
+                 MINWRK = MAX( N+LWQP3, LWSVDJ, N+LWLQF, 2*N+LWQRF, 
+     $                         N+LWSVDJ, N+LWUNMLQ )
+             END IF
+             IF ( LQUERY ) THEN
+                 CALL ZGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A,
+     $                LDA, CDUMMY, -1, RDUMMY, -1, IERR )
+                 LWRK_ZGESVJ = INT( CDUMMY(1) )
+                 CALL ZUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY,
+     $                V, LDV, CDUMMY, -1, IERR )
+                 LWRK_ZUNMLQ = INT( CDUMMY(1) )
+                 IF ( ERREST ) THEN 
+                 OPTWRK = MAX( N+LWRK_ZGEQP3, LWCON, LWRK_ZGESVJ, 
+     $                         N+LWRK_ZGELQF, 2*N+LWRK_ZGEQRF,
+     $                         N+LWRK_ZGESVJ,  N+LWRK_ZUNMLQ )
+                 ELSE
+                 OPTWRK = MAX( N+LWRK_ZGEQP3, LWRK_ZGESVJ,N+LWRK_ZGELQF,
+     $                         2*N+LWRK_ZGEQRF, N+LWRK_ZGESVJ, 
+     $                         N+LWRK_ZUNMLQ )
+                 END IF
+             END IF
+             IF ( L2TRAN .OR. ROWPIV ) THEN 
+                  IF ( ERREST ) THEN 
+                     MINRWRK = MAX( 7, 2*M,  LRWQP3, LRWSVDJ, LRWCON )
+                  ELSE
+                     MINRWRK = MAX( 7, 2*M,  LRWQP3, LRWSVDJ ) 
+                  END IF                  
+             ELSE
+                  IF ( ERREST ) THEN 
+                     MINRWRK = MAX( 7, LRWQP3, LRWSVDJ, LRWCON )
+                  ELSE
+                     MINRWRK = MAX( 7, LRWQP3, LRWSVDJ ) 
+                  END IF                 
+             END IF
+             IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M
+          ELSE IF ( LSVEC .AND. (.NOT.RSVEC) ) THEN  
+*            .. minimal and optimal sizes of the complex workspace if the
+*            singular values and the left singular vectors are requested
+             IF ( ERREST ) THEN
+                 MINWRK = N + MAX( LWQP3,LWCON,N+LWQRF,LWSVDJ,LWUNMQRM )
+             ELSE
+                 MINWRK = N + MAX( LWQP3, N+LWQRF, LWSVDJ, LWUNMQRM )
+             END IF
+             IF ( LQUERY ) THEN
+                 CALL ZGESVJ( 'L', 'U', 'N', N,N, U, LDU, SVA, N, A,
+     $                LDA, CDUMMY, -1, RDUMMY, -1, IERR )
+                 LWRK_ZGESVJ = INT( CDUMMY(1) )
+                 CALL ZUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U,
+     $               LDU, CDUMMY, -1, IERR )
+                 LWRK_ZUNMQRM = INT( CDUMMY(1) )
+                 IF ( ERREST ) THEN
+                 OPTWRK = N + MAX( LWRK_ZGEQP3, LWCON, N+LWRK_ZGEQRF,
+     $                             LWRK_ZGESVJ, LWRK_ZUNMQRM )
+                 ELSE
+                 OPTWRK = N + MAX( LWRK_ZGEQP3, N+LWRK_ZGEQRF,
+     $                             LWRK_ZGESVJ, LWRK_ZUNMQRM )
+                 END IF
+             END IF
+             IF ( L2TRAN .OR. ROWPIV ) THEN 
+                 IF ( ERREST ) THEN 
+                    MINRWRK = MAX( 7, 2*M,  LRWQP3, LRWSVDJ, LRWCON )
+                 ELSE
+                    MINRWRK = MAX( 7, 2*M,  LRWQP3, LRWSVDJ )
+                 END IF                 
+             ELSE
+                 IF ( ERREST ) THEN 
+                    MINRWRK = MAX( 7, LRWQP3, LRWSVDJ, LRWCON )
+                 ELSE
+                    MINRWRK = MAX( 7, LRWQP3, LRWSVDJ )
+                 END IF                
+             END IF 
+             IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M
+          ELSE
+*            .. minimal and optimal sizes of the complex workspace if the
+*            full SVD is requested
+             IF ( .NOT. JRACC ) THEN                
+                 IF ( ERREST ) THEN 
+                    MINWRK = MAX( N+LWQP3, N+LWCON,  2*N+N**2+LWCON, 
+     $                         2*N+LWQRF,         2*N+LWQP3, 
+     $                         2*N+N**2+N+LWLQF,  2*N+N**2+N+N**2+LWCON,
+     $                         2*N+N**2+N+LWSVDJ, 2*N+N**2+N+LWSVDJV, 
+     $                         2*N+N**2+N+LWUNMQR,2*N+N**2+N+LWUNMLQ, 
+     $                         N+N**2+LWSVDJ,   N+LWUNMQRM )
+                 ELSE
+                    MINWRK = MAX( N+LWQP3,        2*N+N**2+LWCON, 
+     $                         2*N+LWQRF,         2*N+LWQP3, 
+     $                         2*N+N**2+N+LWLQF,  2*N+N**2+N+N**2+LWCON,
+     $                         2*N+N**2+N+LWSVDJ, 2*N+N**2+N+LWSVDJV,
+     $                         2*N+N**2+N+LWUNMQR,2*N+N**2+N+LWUNMLQ,
+     $                         N+N**2+LWSVDJ,      N+LWUNMQRM ) 
+                 END IF 
+                 MINIWRK = MINIWRK + N 
+                 IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M
+             ELSE
+                 IF ( ERREST ) THEN 
+                    MINWRK = MAX( N+LWQP3, N+LWCON, 2*N+LWQRF, 
+     $                         2*N+N**2+LWSVDJV, 2*N+N**2+N+LWUNMQR, 
+     $                         N+LWUNMQRM )
+                 ELSE
+                    MINWRK = MAX( N+LWQP3, 2*N+LWQRF, 
+     $                         2*N+N**2+LWSVDJV, 2*N+N**2+N+LWUNMQR, 
+     $                         N+LWUNMQRM ) 
+                 END IF   
+                 IF ( ROWPIV .OR. L2TRAN ) MINIWRK = MINIWRK + M
+             END IF
+             IF ( LQUERY ) THEN
+                 CALL ZUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U,
+     $                LDU, CDUMMY, -1, IERR )
+                 LWRK_ZUNMQRM = INT( CDUMMY(1) )
+                 CALL ZUNMQR( 'L', 'N', N, N, N, A, LDA, CDUMMY, U,
+     $                LDU, CDUMMY, -1, IERR )
+                 LWRK_ZUNMQR = INT( CDUMMY(1) )
+                 IF ( .NOT. JRACC ) THEN
+                     CALL ZGEQP3( N,N, A, LDA, IWORK, CDUMMY,CDUMMY, -1,
+     $                    RDUMMY, IERR )
+                     LWRK_ZGEQP3N = INT( CDUMMY(1) )
+                     CALL ZGESVJ( 'L', 'U', 'N', N, N, U, LDU, SVA,
+     $                    N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR )
+                     LWRK_ZGESVJ = INT( CDUMMY(1) )
+                     CALL ZGESVJ( 'U', 'U', 'N', N, N, U, LDU, SVA,
+     $                    N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR )
+                     LWRK_ZGESVJU = INT( CDUMMY(1) )
+                     CALL ZGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA,
+     $                    N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR )
+                     LWRK_ZGESVJV = INT( CDUMMY(1) )
+                     CALL ZUNMLQ( 'L', 'C', N, N, N, A, LDA, CDUMMY,
+     $                    V, LDV, CDUMMY, -1, IERR )
+                     LWRK_ZUNMLQ = INT( CDUMMY(1) )
+                     IF ( ERREST ) THEN 
+                       OPTWRK = MAX( N+LWRK_ZGEQP3, N+LWCON, 
+     $                          2*N+N**2+LWCON, 2*N+LWRK_ZGEQRF, 
+     $                          2*N+LWRK_ZGEQP3N, 
+     $                          2*N+N**2+N+LWRK_ZGELQF,  
+     $                          2*N+N**2+N+N**2+LWCON,
+     $                          2*N+N**2+N+LWRK_ZGESVJ, 
+     $                          2*N+N**2+N+LWRK_ZGESVJV,               
+     $                          2*N+N**2+N+LWRK_ZUNMQR,
+     $                          2*N+N**2+N+LWRK_ZUNMLQ, 
+     $                          N+N**2+LWRK_ZGESVJU,                  
+     $                          N+LWRK_ZUNMQRM )
+                     ELSE
+                       OPTWRK = MAX( N+LWRK_ZGEQP3,  
+     $                          2*N+N**2+LWCON, 2*N+LWRK_ZGEQRF, 
+     $                          2*N+LWRK_ZGEQP3N, 
+     $                          2*N+N**2+N+LWRK_ZGELQF,  
+     $                          2*N+N**2+N+N**2+LWCON,
+     $                          2*N+N**2+N+LWRK_ZGESVJ,               
+     $                          2*N+N**2+N+LWRK_ZGESVJV, 
+     $                          2*N+N**2+N+LWRK_ZUNMQR,
+     $                          2*N+N**2+N+LWRK_ZUNMLQ, 
+     $                          N+N**2+LWRK_ZGESVJU,
+     $                          N+LWRK_ZUNMQRM )
+                     END IF                    
+                 ELSE
+                     CALL ZGESVJ( 'L', 'U', 'V', N, N, U, LDU, SVA,
+     $                    N, V, LDV, CDUMMY, -1, RDUMMY, -1, IERR )
+                     LWRK_ZGESVJV = INT( CDUMMY(1) )
+                     CALL ZUNMQR( 'L', 'N', N, N, N, CDUMMY, N, CDUMMY,
+     $                    V, LDV, CDUMMY, -1, IERR )
+                     LWRK_ZUNMQR = INT( CDUMMY(1) )
+                     CALL ZUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U,
+     $                    LDU, CDUMMY, -1, IERR )
+                     LWRK_ZUNMQRM = INT( CDUMMY(1) )
+                     IF ( ERREST ) THEN 
+                        OPTWRK = MAX( N+LWRK_ZGEQP3, N+LWCON,   
+     $                           2*N+LWRK_ZGEQRF, 2*N+N**2,  
+     $                           2*N+N**2+LWRK_ZGESVJV,  
+     $                           2*N+N**2+N+LWRK_ZUNMQR,N+LWRK_ZUNMQRM )
+                     ELSE
+                        OPTWRK = MAX( N+LWRK_ZGEQP3, 2*N+LWRK_ZGEQRF,  
+     $                           2*N+N**2, 2*N+N**2+LWRK_ZGESVJV, 
+     $                           2*N+N**2+N+LWRK_ZUNMQR, 
+     $                           N+LWRK_ZUNMQRM )   
+                     END IF                  
+                 END IF               
+             END IF 
+             IF ( L2TRAN .OR. ROWPIV ) THEN 
+                 MINRWRK = MAX( 7, 2*M,  LRWQP3, LRWSVDJ, LRWCON )
+             ELSE
+                 MINRWRK = MAX( 7, LRWQP3, LRWSVDJ, LRWCON )
+             END IF 
+          END IF
+          MINWRK = MAX( 2, MINWRK )
+          OPTWRK = MAX( MINWRK, OPTWRK )
+          IF ( LWORK  .LT. MINWRK  .AND. (.NOT.LQUERY) ) INFO = - 17
+          IF ( LRWORK .LT. MINRWRK .AND. (.NOT.LQUERY) ) INFO = - 19   
+      END IF
+*      
+      IF ( INFO .NE. 0 ) THEN
+*       #:(
+         CALL XERBLA( 'ZGEJSV', - INFO )
+         RETURN
+      ELSE IF ( LQUERY ) THEN
+          CWORK(1) = OPTWRK
+          CWORK(2) = MINWRK
+          RWORK(1) = MINRWRK
+          IWORK(1) = MAX( 4, MINIWRK )
+          RETURN   
+      END IF
+*
+*     Quick return for void matrix (Y3K safe)
+* #:)
+      IF ( ( M .EQ. 0 ) .OR. ( N .EQ. 0 ) ) THEN
+         IWORK(1:4) = 0
+         RWORK(1:7) = 0
+         RETURN
+      ENDIF
+*
+*     Determine whether the matrix U should be M x N or M x M
+*
+      IF ( LSVEC ) THEN
+         N1 = N
+         IF ( LSAME( JOBU, 'F' ) ) N1 = M
+      END IF
+*
+*     Set numerical parameters
+*
+*!    NOTE: Make sure DLAMCH() does not fail on the target architecture.
+*
+      EPSLN = DLAMCH('Epsilon')
+      SFMIN = DLAMCH('SafeMinimum')
+      SMALL = SFMIN / EPSLN
+      BIG   = DLAMCH('O')
+*     BIG   = ONE / SFMIN
+*
+*     Initialize SVA(1:N) = diag( ||A e_i||_2 )_1^N
+*
+*(!)  If necessary, scale SVA() to protect the largest norm from
+*     overflow. It is possible that this scaling pushes the smallest
+*     column norm below the underflow threshold (extreme case).
+*
+      SCALEM  = ONE / SQRT(DBLE(M)*DBLE(N))
+      NOSCAL  = .TRUE.
+      GOSCAL  = .TRUE.
+      DO 1874 p = 1, N
+         AAPP = ZERO
+         AAQQ = ONE
+         CALL ZLASSQ( M, A(1,p), 1, AAPP, AAQQ )
+         IF ( AAPP .GT. BIG ) THEN
+            INFO = - 9
+            CALL XERBLA( 'ZGEJSV', -INFO )
+            RETURN
+         END IF
+         AAQQ = SQRT(AAQQ)
+         IF ( ( AAPP .LT. (BIG / AAQQ) ) .AND. NOSCAL  ) THEN
+            SVA(p)  = AAPP * AAQQ
+         ELSE
+            NOSCAL  = .FALSE.
+            SVA(p)  = AAPP * ( AAQQ * SCALEM )
+            IF ( GOSCAL ) THEN
+               GOSCAL = .FALSE.
+               CALL DSCAL( p-1, SCALEM, SVA, 1 )
+            END IF
+         END IF
+ 1874 CONTINUE
+*
+      IF ( NOSCAL ) SCALEM = ONE
+*
+      AAPP = ZERO
+      AAQQ = BIG
+      DO 4781 p = 1, N
+         AAPP = MAX( AAPP, SVA(p) )
+         IF ( SVA(p) .NE. ZERO ) AAQQ = MIN( AAQQ, SVA(p) )
+ 4781 CONTINUE
+*
+*     Quick return for zero M x N matrix
+* #:)
+      IF ( AAPP .EQ. ZERO ) THEN
+         IF ( LSVEC ) CALL ZLASET( 'G', M, N1, CZERO, CONE, U, LDU )
+         IF ( RSVEC ) CALL ZLASET( 'G', N, N,  CZERO, CONE, V, LDV )
+         RWORK(1) = ONE
+         RWORK(2) = ONE
+         IF ( ERREST ) RWORK(3) = ONE
+         IF ( LSVEC .AND. RSVEC ) THEN
+            RWORK(4) = ONE
+            RWORK(5) = ONE
+         END IF
+         IF ( L2TRAN ) THEN
+            RWORK(6) = ZERO
+            RWORK(7) = ZERO
+         END IF
+         IWORK(1) = 0
+         IWORK(2) = 0
+         IWORK(3) = 0
+         IWORK(4) = -1
+         RETURN
+      END IF
+*
+*     Issue warning if denormalized column norms detected. Override the
+*     high relative accuracy request. Issue licence to kill nonzero columns
+*     (set them to zero) whose norm is less than sigma_max / BIG (roughly).
+* #:(
+      WARNING = 0
+      IF ( AAQQ .LE. SFMIN ) THEN
+         L2RANK = .TRUE.
+         L2KILL = .TRUE.
+         WARNING = 1
+      END IF
+*
+*     Quick return for one-column matrix
+* #:)
+      IF ( N .EQ. 1 ) THEN
+*
+         IF ( LSVEC ) THEN
+            CALL ZLASCL( 'G',0,0,SVA(1),SCALEM, M,1,A(1,1),LDA,IERR )
+            CALL ZLACPY( 'A', M, 1, A, LDA, U, LDU )
+*           computing all M left singular vectors of the M x 1 matrix
+            IF ( N1 .NE. N  ) THEN
+              CALL ZGEQRF( M, N, U,LDU, CWORK, CWORK(N+1),LWORK-N,IERR )
+              CALL ZUNGQR( M,N1,1, U,LDU,CWORK,CWORK(N+1),LWORK-N,IERR )
+              CALL ZCOPY( M, A(1,1), 1, U(1,1), 1 )
+            END IF
+         END IF
+         IF ( RSVEC ) THEN
+             V(1,1) = CONE
+         END IF
+         IF ( SVA(1) .LT. (BIG*SCALEM) ) THEN
+            SVA(1)  = SVA(1) / SCALEM
+            SCALEM  = ONE
+         END IF
+         RWORK(1) = ONE / SCALEM
+         RWORK(2) = ONE
+         IF ( SVA(1) .NE. ZERO ) THEN
+            IWORK(1) = 1
+            IF ( ( SVA(1) / SCALEM) .GE. SFMIN ) THEN
+               IWORK(2) = 1
+            ELSE
+               IWORK(2) = 0
+            END IF
+         ELSE
+            IWORK(1) = 0
+            IWORK(2) = 0
+         END IF
+         IWORK(3) = 0
+         IWORK(4) = -1
+         IF ( ERREST ) RWORK(3) = ONE
+         IF ( LSVEC .AND. RSVEC ) THEN
+            RWORK(4) = ONE
+            RWORK(5) = ONE
+         END IF
+         IF ( L2TRAN ) THEN
+            RWORK(6) = ZERO
+            RWORK(7) = ZERO
+         END IF
+         RETURN
+*
+      END IF
+*
+      TRANSP = .FALSE.
+*
+      AATMAX = -ONE
+      AATMIN =  BIG
+      IF ( ROWPIV .OR. L2TRAN ) THEN
+*
+*     Compute the row norms, needed to determine row pivoting sequence
+*     (in the case of heavily row weighted A, row pivoting is strongly
+*     advised) and to collect information needed to compare the
+*     structures of A * A^* and A^* * A (in the case L2TRAN.EQ..TRUE.).
+*
+         IF ( L2TRAN ) THEN
+            DO 1950 p = 1, M
+               XSC   = ZERO
+               TEMP1 = ONE
+               CALL ZLASSQ( N, A(p,1), LDA, XSC, TEMP1 )
+*              ZLASSQ gets both the ell_2 and the ell_infinity norm
+*              in one pass through the vector
+               RWORK(M+p)  = XSC * SCALEM
+               RWORK(p)    = XSC * (SCALEM*SQRT(TEMP1))
+               AATMAX = MAX( AATMAX, RWORK(p) )
+               IF (RWORK(p) .NE. ZERO) 
+     $            AATMIN = MIN(AATMIN,RWORK(p))
+ 1950       CONTINUE
+         ELSE
+            DO 1904 p = 1, M
+               RWORK(M+p) = SCALEM*ABS( A(p,IZAMAX(N,A(p,1),LDA)) )
+               AATMAX = MAX( AATMAX, RWORK(M+p) )
+               AATMIN = MIN( AATMIN, RWORK(M+p) )
+ 1904       CONTINUE
+         END IF
+*
+      END IF
+*
+*     For square matrix A try to determine whether A^*  would be better
+*     input for the preconditioned Jacobi SVD, with faster convergence.
+*     The decision is based on an O(N) function of the vector of column
+*     and row norms of A, based on the Shannon entropy. This should give
+*     the right choice in most cases when the difference actually matters.
+*     It may fail and pick the slower converging side.
+*
+      ENTRA  = ZERO
+      ENTRAT = ZERO
+      IF ( L2TRAN ) THEN
+*
+         XSC   = ZERO
+         TEMP1 = ONE
+         CALL DLASSQ( N, SVA, 1, XSC, TEMP1 )
+         TEMP1 = ONE / TEMP1
+*
+         ENTRA = ZERO
+         DO 1113 p = 1, N
+            BIG1  = ( ( SVA(p) / XSC )**2 ) * TEMP1
+            IF ( BIG1 .NE. ZERO ) ENTRA = ENTRA + BIG1 * DLOG(BIG1)
+ 1113    CONTINUE
+         ENTRA = - ENTRA / DLOG(DBLE(N))
+*
+*        Now, SVA().^2/Trace(A^* * A) is a point in the probability simplex.
+*        It is derived from the diagonal of  A^* * A.  Do the same with the
+*        diagonal of A * A^*, compute the entropy of the corresponding
+*        probability distribution. Note that A * A^* and A^* * A have the
+*        same trace.
+*
+         ENTRAT = ZERO
+         DO 1114 p = 1, M
+            BIG1 = ( ( RWORK(p) / XSC )**2 ) * TEMP1
+            IF ( BIG1 .NE. ZERO ) ENTRAT = ENTRAT + BIG1 * DLOG(BIG1)
+ 1114    CONTINUE
+         ENTRAT = - ENTRAT / DLOG(DBLE(M))
+*
+*        Analyze the entropies and decide A or A^*. Smaller entropy
+*        usually means better input for the algorithm.
+*
+         TRANSP = ( ENTRAT .LT. ENTRA )
+* 
+*        If A^* is better than A, take the adjoint of A. This is allowed
+*        only for square matrices, M=N.
+         IF ( TRANSP ) THEN
+*           In an optimal implementation, this trivial transpose
+*           should be replaced with faster transpose.
+            DO 1115 p = 1, N - 1
+               A(p,p) = CONJG(A(p,p))
+               DO 1116 q = p + 1, N
+                   CTEMP = CONJG(A(q,p))
+                  A(q,p) = CONJG(A(p,q))
+                  A(p,q) = CTEMP
+ 1116          CONTINUE
+ 1115       CONTINUE
+            A(N,N) = CONJG(A(N,N))
+            DO 1117 p = 1, N
+               RWORK(M+p) = SVA(p)
+               SVA(p)     = RWORK(p)
+*              previously computed row 2-norms are now column 2-norms
+*              of the transposed matrix
+ 1117       CONTINUE
+            TEMP1  = AAPP
+            AAPP   = AATMAX
+            AATMAX = TEMP1
+            TEMP1  = AAQQ
+            AAQQ   = AATMIN
+            AATMIN = TEMP1
+            KILL   = LSVEC
+            LSVEC  = RSVEC
+            RSVEC  = KILL
+            IF ( LSVEC ) N1 = N
+*
+            ROWPIV = .TRUE.
+         END IF
+*
+      END IF
+*     END IF L2TRAN
+*
+*     Scale the matrix so that its maximal singular value remains less
+*     than SQRT(BIG) -- the matrix is scaled so that its maximal column
+*     has Euclidean norm equal to SQRT(BIG/N). The only reason to keep
+*     SQRT(BIG) instead of BIG is the fact that ZGEJSV uses LAPACK and
+*     BLAS routines that, in some implementations, are not capable of
+*     working in the full interval [SFMIN,BIG] and that they may provoke
+*     overflows in the intermediate results. If the singular values spread
+*     from SFMIN to BIG, then ZGESVJ will compute them. So, in that case,
+*     one should use ZGESVJ instead of ZGEJSV.
+*     >> change in the April 2016 update: allow bigger range, i.e. the
+*     largest column is allowed up to BIG/N and ZGESVJ will do the rest.
+      BIG1   = SQRT( BIG )
+      TEMP1  = SQRT( BIG / DBLE(N) ) 
+*      TEMP1  = BIG/DBLE(N)
+*
+      CALL DLASCL( 'G', 0, 0, AAPP, TEMP1, N, 1, SVA, N, IERR )
+      IF ( AAQQ .GT. (AAPP * SFMIN) ) THEN
+          AAQQ = ( AAQQ / AAPP ) * TEMP1
+      ELSE
+          AAQQ = ( AAQQ * TEMP1 ) / AAPP
+      END IF
+      TEMP1 = TEMP1 * SCALEM
+      CALL ZLASCL( 'G', 0, 0, AAPP, TEMP1, M, N, A, LDA, IERR )
+*
+*     To undo scaling at the end of this procedure, multiply the
+*     computed singular values with USCAL2 / USCAL1.
+*
+      USCAL1 = TEMP1
+      USCAL2 = AAPP
+*
+      IF ( L2KILL ) THEN
+*        L2KILL enforces computation of nonzero singular values in
+*        the restricted range of condition number of the initial A,
+*        sigma_max(A) / sigma_min(A) approx. SQRT(BIG)/SQRT(SFMIN).
+         XSC = SQRT( SFMIN )
+      ELSE
+         XSC = SMALL
+*
+*        Now, if the condition number of A is too big,
+*        sigma_max(A) / sigma_min(A) .GT. SQRT(BIG/N) * EPSLN / SFMIN,
+*        as a precautionary measure, the full SVD is computed using ZGESVJ
+*        with accumulated Jacobi rotations. This provides numerically
+*        more robust computation, at the cost of slightly increased run
+*        time. Depending on the concrete implementation of BLAS and LAPACK
+*        (i.e. how they behave in presence of extreme ill-conditioning) the
+*        implementor may decide to remove this switch.
+         IF ( ( AAQQ.LT.SQRT(SFMIN) ) .AND. LSVEC .AND. RSVEC ) THEN
+            JRACC = .TRUE.
+         END IF
+*
+      END IF
+      IF ( AAQQ .LT. XSC ) THEN
+         DO 700 p = 1, N
+            IF ( SVA(p) .LT. XSC ) THEN
+               CALL ZLASET( 'A', M, 1, CZERO, CZERO, A(1,p), LDA )
+               SVA(p) = ZERO
+            END IF
+ 700     CONTINUE
+      END IF
+*
+*     Preconditioning using QR factorization with pivoting
+*
+      IF ( ROWPIV ) THEN
+*        Optional row permutation (Bjoerck row pivoting):
+*        A result by Cox and Higham shows that the Bjoerck's
+*        row pivoting combined with standard column pivoting
+*        has similar effect as Powell-Reid complete pivoting.
+*        The ell-infinity norms of the rows of A are made nonincreasing.
+         IF ( ( LSVEC .AND. RSVEC ) .AND. .NOT.( JRACC ) ) THEN 
+              IWOFF = 2*N
+         ELSE
+              IWOFF = N
+         END IF
+         DO 1952 p = 1, M - 1
+            q = IDAMAX( M-p+1, RWORK(M+p), 1 ) + p - 1
+            IWORK(IWOFF+p) = q
+            IF ( p .NE. q ) THEN
+               TEMP1      = RWORK(M+p)
+               RWORK(M+p) = RWORK(M+q)
+               RWORK(M+q) = TEMP1
+            END IF
+ 1952    CONTINUE
+         CALL ZLASWP( N, A, LDA, 1, M-1, IWORK(IWOFF+1), 1 )
+      END IF
+*
+*     End of the preparation phase (scaling, optional sorting and
+*     transposing, optional flushing of small columns).
+*
+*     Preconditioning
+*
+*     If the full SVD is needed, the right singular vectors are computed
+*     from a matrix equation, and for that we need theoretical analysis
+*     of the Businger-Golub pivoting. So we use ZGEQP3 as the first RR QRF.
+*     In all other cases the first RR QRF can be chosen by other criteria
+*     (eg speed by replacing global with restricted window pivoting, such
+*     as in xGEQPX from TOMS # 782). Good results will be obtained using
+*     xGEQPX with properly (!) chosen numerical parameters.
+*     Any improvement of ZGEQP3 improves overall performance of ZGEJSV.
+*
+*     A * P1 = Q1 * [ R1^* 0]^*:
+      DO 1963 p = 1, N
+*        .. all columns are free columns
+         IWORK(p) = 0
+ 1963 CONTINUE
+      CALL ZGEQP3( M, N, A, LDA, IWORK, CWORK, CWORK(N+1), LWORK-N,
+     $             RWORK, IERR )
+*
+*     The upper triangular matrix R1 from the first QRF is inspected for
+*     rank deficiency and possibilities for deflation, or possible
+*     ill-conditioning. Depending on the user specified flag L2RANK,
+*     the procedure explores possibilities to reduce the numerical
+*     rank by inspecting the computed upper triangular factor. If
+*     L2RANK or L2ABER are up, then ZGEJSV will compute the SVD of
+*     A + dA, where ||dA|| <= f(M,N)*EPSLN.
+*
+      NR = 1
+      IF ( L2ABER ) THEN
+*        Standard absolute error bound suffices. All sigma_i with
+*        sigma_i < N*EPSLN*||A|| are flushed to zero. This is an
+*        aggressive enforcement of lower numerical rank by introducing a
+*        backward error of the order of N*EPSLN*||A||.
+         TEMP1 = SQRT(DBLE(N))*EPSLN
+         DO 3001 p = 2, N
+            IF ( ABS(A(p,p)) .GE. (TEMP1*ABS(A(1,1))) ) THEN
+               NR = NR + 1
+            ELSE
+               GO TO 3002
+            END IF
+ 3001    CONTINUE
+ 3002    CONTINUE
+      ELSE IF ( L2RANK ) THEN
+*        .. similarly as above, only slightly more gentle (less aggressive).
+*        Sudden drop on the diagonal of R1 is used as the criterion for
+*        close-to-rank-deficient.
+         TEMP1 = SQRT(SFMIN)
+         DO 3401 p = 2, N
+            IF ( ( ABS(A(p,p)) .LT. (EPSLN*ABS(A(p-1,p-1))) ) .OR.
+     $           ( ABS(A(p,p)) .LT. SMALL ) .OR.
+     $           ( L2KILL .AND. (ABS(A(p,p)) .LT. TEMP1) ) ) GO TO 3402
+            NR = NR + 1
+ 3401    CONTINUE
+ 3402    CONTINUE
+*
+      ELSE
+*        The goal is high relative accuracy. However, if the matrix
+*        has high scaled condition number the relative accuracy is in
+*        general not feasible. Later on, a condition number estimator
+*        will be deployed to estimate the scaled condition number.
+*        Here we just remove the underflowed part of the triangular
+*        factor. This prevents the situation in which the code is
+*        working hard to get the accuracy not warranted by the data.
+         TEMP1  = SQRT(SFMIN)
+         DO 3301 p = 2, N
+            IF ( ( ABS(A(p,p)) .LT. SMALL ) .OR.
+     $           ( L2KILL .AND. (ABS(A(p,p)) .LT. TEMP1) ) ) GO TO 3302
+            NR = NR + 1
+ 3301    CONTINUE
+ 3302    CONTINUE
+*
+      END IF
+*
+      ALMORT = .FALSE.
+      IF ( NR .EQ. N ) THEN
+         MAXPRJ = ONE
+         DO 3051 p = 2, N
+            TEMP1  = ABS(A(p,p)) / SVA(IWORK(p))
+            MAXPRJ = MIN( MAXPRJ, TEMP1 )
+ 3051    CONTINUE
+         IF ( MAXPRJ**2 .GE. ONE - DBLE(N)*EPSLN ) ALMORT = .TRUE.
+      END IF
+*
+*
+      SCONDA = - ONE
+      CONDR1 = - ONE
+      CONDR2 = - ONE
+*
+      IF ( ERREST ) THEN
+         IF ( N .EQ. NR ) THEN
+            IF ( RSVEC ) THEN
+*              .. V is available as workspace
+               CALL ZLACPY( 'U', N, N, A, LDA, V, LDV )
+               DO 3053 p = 1, N
+                  TEMP1 = SVA(IWORK(p))
+                  CALL ZDSCAL( p, ONE/TEMP1, V(1,p), 1 )
+ 3053          CONTINUE
+               IF ( LSVEC )THEN
+                   CALL ZPOCON( 'U', N, V, LDV, ONE, TEMP1,
+     $                  CWORK(N+1), RWORK, IERR )
+               ELSE
+                   CALL ZPOCON( 'U', N, V, LDV, ONE, TEMP1,
+     $                  CWORK, RWORK, IERR )
+               END IF               
+*          
+            ELSE IF ( LSVEC ) THEN
+*              .. U is available as workspace
+               CALL ZLACPY( 'U', N, N, A, LDA, U, LDU )
+               DO 3054 p = 1, N
+                  TEMP1 = SVA(IWORK(p))
+                  CALL ZDSCAL( p, ONE/TEMP1, U(1,p), 1 )
+ 3054          CONTINUE
+               CALL ZPOCON( 'U', N, U, LDU, ONE, TEMP1,
+     $              CWORK(N+1), RWORK, IERR )
+            ELSE
+               CALL ZLACPY( 'U', N, N, A, LDA, CWORK, N )
+*[]            CALL ZLACPY( 'U', N, N, A, LDA, CWORK(N+1), N )
+*              Change: here index shifted by N to the left, CWORK(1:N) 
+*              not needed for SIGMA only computation
+               DO 3052 p = 1, N
+                  TEMP1 = SVA(IWORK(p))
+*[]               CALL ZDSCAL( p, ONE/TEMP1, CWORK(N+(p-1)*N+1), 1 )
+                  CALL ZDSCAL( p, ONE/TEMP1, CWORK((p-1)*N+1), 1 )
+ 3052          CONTINUE
+*           .. the columns of R are scaled to have unit Euclidean lengths.
+*[]               CALL ZPOCON( 'U', N, CWORK(N+1), N, ONE, TEMP1,
+*[]     $              CWORK(N+N*N+1), RWORK, IERR )
+               CALL ZPOCON( 'U', N, CWORK, N, ONE, TEMP1,
+     $              CWORK(N*N+1), RWORK, IERR )               
+*              
+            END IF
+            IF ( TEMP1 .NE. ZERO ) THEN 
+               SCONDA = ONE / SQRT(TEMP1)
+            ELSE
+               SCONDA = - ONE
+            END IF
+*           SCONDA is an estimate of SQRT(||(R^* * R)^(-1)||_1).
+*           N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA
+         ELSE
+            SCONDA = - ONE
+         END IF
+      END IF
+*
+      L2PERT = L2PERT .AND. ( ABS( A(1,1)/A(NR,NR) ) .GT. SQRT(BIG1) )
+*     If there is no violent scaling, artificial perturbation is not needed.
+*
+*     Phase 3:
+*
+      IF ( .NOT. ( RSVEC .OR. LSVEC ) ) THEN
+*
+*         Singular Values only
+*
+*         .. transpose A(1:NR,1:N)
+         DO 1946 p = 1, MIN( N-1, NR )
+            CALL ZCOPY( N-p, A(p,p+1), LDA, A(p+1,p), 1 )
+            CALL ZLACGV( N-p+1, A(p,p), 1 )
+ 1946    CONTINUE
+         IF ( NR .EQ. N ) A(N,N) = CONJG(A(N,N))
+*
+*        The following two DO-loops introduce small relative perturbation
+*        into the strict upper triangle of the lower triangular matrix.
+*        Small entries below the main diagonal are also changed.
+*        This modification is useful if the computing environment does not
+*        provide/allow FLUSH TO ZERO underflow, for it prevents many
+*        annoying denormalized numbers in case of strongly scaled matrices.
+*        The perturbation is structured so that it does not introduce any
+*        new perturbation of the singular values, and it does not destroy
+*        the job done by the preconditioner.
+*        The licence for this perturbation is in the variable L2PERT, which
+*        should be .FALSE. if FLUSH TO ZERO underflow is active.
+*
+         IF ( .NOT. ALMORT ) THEN
+*
+            IF ( L2PERT ) THEN
+*              XSC = SQRT(SMALL)
+               XSC = EPSLN / DBLE(N)
+               DO 4947 q = 1, NR
+                  CTEMP = DCMPLX(XSC*ABS(A(q,q)),ZERO)
+                  DO 4949 p = 1, N
+                     IF ( ( (p.GT.q) .AND. (ABS(A(p,q)).LE.TEMP1) )
+     $                    .OR. ( p .LT. q ) )
+*     $                     A(p,q) = TEMP1 * ( A(p,q) / ABS(A(p,q)) )
+     $                     A(p,q) = CTEMP
+ 4949             CONTINUE
+ 4947          CONTINUE
+            ELSE
+               CALL ZLASET( 'U', NR-1,NR-1, CZERO,CZERO, A(1,2),LDA )
+            END IF
+*
+*            .. second preconditioning using the QR factorization
+*
+            CALL ZGEQRF( N,NR, A,LDA, CWORK, CWORK(N+1),LWORK-N, IERR )
+*
+*           .. and transpose upper to lower triangular
+            DO 1948 p = 1, NR - 1
+               CALL ZCOPY( NR-p, A(p,p+1), LDA, A(p+1,p), 1 )
+               CALL ZLACGV( NR-p+1, A(p,p), 1 )
+ 1948       CONTINUE
+*
+      END IF
+*
+*           Row-cyclic Jacobi SVD algorithm with column pivoting
+*
+*           .. again some perturbation (a "background noise") is added
+*           to drown denormals
+            IF ( L2PERT ) THEN
+*              XSC = SQRT(SMALL)
+               XSC = EPSLN / DBLE(N)
+               DO 1947 q = 1, NR
+                  CTEMP = DCMPLX(XSC*ABS(A(q,q)),ZERO)
+                  DO 1949 p = 1, NR
+                     IF ( ( (p.GT.q) .AND. (ABS(A(p,q)).LE.TEMP1) )
+     $                       .OR. ( p .LT. q ) )
+*     $                   A(p,q) = TEMP1 * ( A(p,q) / ABS(A(p,q)) )
+     $                   A(p,q) = CTEMP
+ 1949             CONTINUE
+ 1947          CONTINUE
+            ELSE
+               CALL ZLASET( 'U', NR-1, NR-1, CZERO, CZERO, A(1,2), LDA )
+            END IF
+*
+*           .. and one-sided Jacobi rotations are started on a lower
+*           triangular matrix (plus perturbation which is ignored in
+*           the part which destroys triangular form (confusing?!))
+*
+            CALL ZGESVJ( 'L', 'N', 'N', NR, NR, A, LDA, SVA,
+     $                N, V, LDV, CWORK, LWORK, RWORK, LRWORK, INFO )
+*
+            SCALEM  = RWORK(1)
+            NUMRANK = NINT(RWORK(2))
+*
+*
+      ELSE IF ( ( RSVEC .AND. ( .NOT. LSVEC ) .AND. ( .NOT. JRACC ) )
+     $       .OR. 
+     $   ( JRACC .AND. ( .NOT. LSVEC ) .AND. ( NR .NE. N ) ) ) THEN
+*
+*        -> Singular Values and Right Singular Vectors <-
+*
+         IF ( ALMORT ) THEN
+*
+*           .. in this case NR equals N
+            DO 1998 p = 1, NR
+               CALL ZCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 )
+               CALL ZLACGV( N-p+1, V(p,p), 1 )
+ 1998       CONTINUE
+            CALL ZLASET( 'U', NR-1,NR-1, CZERO, CZERO, V(1,2), LDV )
+*
+            CALL ZGESVJ( 'L','U','N', N, NR, V, LDV, SVA, NR, A, LDA,
+     $                  CWORK, LWORK, RWORK, LRWORK, INFO )
+            SCALEM  = RWORK(1)
+            NUMRANK = NINT(RWORK(2))
+
+         ELSE
+*
+*        .. two more QR factorizations ( one QRF is not enough, two require
+*        accumulated product of Jacobi rotations, three are perfect )
+*
+            CALL ZLASET( 'L', NR-1,NR-1, CZERO, CZERO, A(2,1), LDA )
+            CALL ZGELQF( NR,N, A, LDA, CWORK, CWORK(N+1), LWORK-N, IERR)
+            CALL ZLACPY( 'L', NR, NR, A, LDA, V, LDV )
+            CALL ZLASET( 'U', NR-1,NR-1, CZERO, CZERO, V(1,2), LDV )
+            CALL ZGEQRF( NR, NR, V, LDV, CWORK(N+1), CWORK(2*N+1),
+     $                   LWORK-2*N, IERR )
+            DO 8998 p = 1, NR
+               CALL ZCOPY( NR-p+1, V(p,p), LDV, V(p,p), 1 )
+               CALL ZLACGV( NR-p+1, V(p,p), 1 )
+ 8998       CONTINUE
+            CALL ZLASET('U', NR-1, NR-1, CZERO, CZERO, V(1,2), LDV)
+*
+            CALL ZGESVJ( 'L', 'U','N', NR, NR, V,LDV, SVA, NR, U,
+     $                  LDU, CWORK(N+1), LWORK-N, RWORK, LRWORK, INFO )
+            SCALEM  = RWORK(1)
+            NUMRANK = NINT(RWORK(2))
+            IF ( NR .LT. N ) THEN
+               CALL ZLASET( 'A',N-NR, NR, CZERO,CZERO, V(NR+1,1),  LDV )
+               CALL ZLASET( 'A',NR, N-NR, CZERO,CZERO, V(1,NR+1),  LDV )
+               CALL ZLASET( 'A',N-NR,N-NR,CZERO,CONE, V(NR+1,NR+1),LDV )
+            END IF
+*
+         CALL ZUNMLQ( 'L', 'C', N, N, NR, A, LDA, CWORK,
+     $               V, LDV, CWORK(N+1), LWORK-N, IERR )
+*
+         END IF
+*         .. permute the rows of V
+*         DO 8991 p = 1, N
+*            CALL ZCOPY( N, V(p,1), LDV, A(IWORK(p),1), LDA )
+* 8991    CONTINUE
+*         CALL ZLACPY( 'All', N, N, A, LDA, V, LDV )
+         CALL ZLAPMR( .FALSE., N, N, V, LDV, IWORK )
+*
+          IF ( TRANSP ) THEN
+            CALL ZLACPY( 'A', N, N, V, LDV, U, LDU )
+          END IF
+*
+      ELSE IF ( JRACC .AND. (.NOT. LSVEC) .AND. ( NR.EQ. N ) ) THEN 
+*          
+         CALL ZLASET( 'L', N-1,N-1, CZERO, CZERO, A(2,1), LDA )
+*
+         CALL ZGESVJ( 'U','N','V', N, N, A, LDA, SVA, N, V, LDV,
+     $               CWORK, LWORK, RWORK, LRWORK, INFO )
+          SCALEM  = RWORK(1)
+          NUMRANK = NINT(RWORK(2))
+          CALL ZLAPMR( .FALSE., N, N, V, LDV, IWORK )
+*
+      ELSE IF ( LSVEC .AND. ( .NOT. RSVEC ) ) THEN
+*
+*        .. Singular Values and Left Singular Vectors                 ..
+*
+*        .. second preconditioning step to avoid need to accumulate
+*        Jacobi rotations in the Jacobi iterations.
+         DO 1965 p = 1, NR
+            CALL ZCOPY( N-p+1, A(p,p), LDA, U(p,p), 1 )
+            CALL ZLACGV( N-p+1, U(p,p), 1 )
+ 1965    CONTINUE
+         CALL ZLASET( 'U', NR-1, NR-1, CZERO, CZERO, U(1,2), LDU )
+*
+         CALL ZGEQRF( N, NR, U, LDU, CWORK(N+1), CWORK(2*N+1),
+     $              LWORK-2*N, IERR )
+*
+         DO 1967 p = 1, NR - 1
+            CALL ZCOPY( NR-p, U(p,p+1), LDU, U(p+1,p), 1 )
+            CALL ZLACGV( N-p+1, U(p,p), 1 )
+ 1967    CONTINUE
+         CALL ZLASET( 'U', NR-1, NR-1, CZERO, CZERO, U(1,2), LDU )
+*
+         CALL ZGESVJ( 'L', 'U', 'N', NR,NR, U, LDU, SVA, NR, A,
+     $        LDA, CWORK(N+1), LWORK-N, RWORK, LRWORK, INFO )
+         SCALEM  = RWORK(1)
+         NUMRANK = NINT(RWORK(2))
+*
+         IF ( NR .LT. M ) THEN
+            CALL ZLASET( 'A',  M-NR, NR,CZERO, CZERO, U(NR+1,1), LDU )
+            IF ( NR .LT. N1 ) THEN
+               CALL ZLASET( 'A',NR, N1-NR, CZERO, CZERO, U(1,NR+1),LDU )
+               CALL ZLASET( 'A',M-NR,N1-NR,CZERO,CONE,U(NR+1,NR+1),LDU )
+            END IF
+         END IF
+*
+         CALL ZUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U,
+     $               LDU, CWORK(N+1), LWORK-N, IERR )
+*
+         IF ( ROWPIV )
+     $       CALL ZLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 )
+*
+         DO 1974 p = 1, N1
+            XSC = ONE / DZNRM2( M, U(1,p), 1 )
+            CALL ZDSCAL( M, XSC, U(1,p), 1 )
+ 1974    CONTINUE
+*
+         IF ( TRANSP ) THEN
+            CALL ZLACPY( 'A', N, N, U, LDU, V, LDV )
+         END IF
+*
+      ELSE
+*
+*        .. Full SVD ..
+*
+         IF ( .NOT. JRACC ) THEN
+*
+         IF ( .NOT. ALMORT ) THEN
+*
+*           Second Preconditioning Step (QRF [with pivoting])
+*           Note that the composition of TRANSPOSE, QRF and TRANSPOSE is
+*           equivalent to an LQF CALL. Since in many libraries the QRF
+*           seems to be better optimized than the LQF, we do explicit
+*           transpose and use the QRF. This is subject to changes in an
+*           optimized implementation of ZGEJSV.
+*
+            DO 1968 p = 1, NR
+               CALL ZCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 )
+               CALL ZLACGV( N-p+1, V(p,p), 1 )
+ 1968       CONTINUE
+*
+*           .. the following two loops perturb small entries to avoid
+*           denormals in the second QR factorization, where they are
+*           as good as zeros. This is done to avoid painfully slow
+*           computation with denormals. The relative size of the perturbation
+*           is a parameter that can be changed by the implementer.
+*           This perturbation device will be obsolete on machines with
+*           properly implemented arithmetic.
+*           To switch it off, set L2PERT=.FALSE. To remove it from  the
+*           code, remove the action under L2PERT=.TRUE., leave the ELSE part.
+*           The following two loops should be blocked and fused with the
+*           transposed copy above.
+*
+            IF ( L2PERT ) THEN
+               XSC = SQRT(SMALL)
+               DO 2969 q = 1, NR
+                  CTEMP = DCMPLX(XSC*ABS( V(q,q) ),ZERO)
+                  DO 2968 p = 1, N
+                     IF ( ( p .GT. q ) .AND. ( ABS(V(p,q)) .LE. TEMP1 )
+     $                   .OR. ( p .LT. q ) )
+*     $                   V(p,q) = TEMP1 * ( V(p,q) / ABS(V(p,q)) )
+     $                   V(p,q) = CTEMP
+                     IF ( p .LT. q ) V(p,q) = - V(p,q)
+ 2968             CONTINUE
+ 2969          CONTINUE
+            ELSE
+               CALL ZLASET( 'U', NR-1, NR-1, CZERO, CZERO, V(1,2), LDV )
+            END IF
+*
+*           Estimate the row scaled condition number of R1
+*           (If R1 is rectangular, N > NR, then the condition number
+*           of the leading NR x NR submatrix is estimated.)
+*
+            CALL ZLACPY( 'L', NR, NR, V, LDV, CWORK(2*N+1), NR )
+            DO 3950 p = 1, NR
+               TEMP1 = DZNRM2(NR-p+1,CWORK(2*N+(p-1)*NR+p),1)
+               CALL ZDSCAL(NR-p+1,ONE/TEMP1,CWORK(2*N+(p-1)*NR+p),1)
+ 3950       CONTINUE
+            CALL ZPOCON('L',NR,CWORK(2*N+1),NR,ONE,TEMP1,
+     $                   CWORK(2*N+NR*NR+1),RWORK,IERR)
+            CONDR1 = ONE / SQRT(TEMP1)
+*           .. here need a second opinion on the condition number
+*           .. then assume worst case scenario
+*           R1 is OK for inverse <=> CONDR1 .LT. DBLE(N)
+*           more conservative    <=> CONDR1 .LT. SQRT(DBLE(N))
+*
+            COND_OK = SQRT(SQRT(DBLE(NR)))
+*[TP]       COND_OK is a tuning parameter.
+*
+            IF ( CONDR1 .LT. COND_OK ) THEN
+*              .. the second QRF without pivoting. Note: in an optimized
+*              implementation, this QRF should be implemented as the QRF
+*              of a lower triangular matrix.
+*              R1^* = Q2 * R2
+               CALL ZGEQRF( N, NR, V, LDV, CWORK(N+1), CWORK(2*N+1),
+     $              LWORK-2*N, IERR )
+*
+               IF ( L2PERT ) THEN
+                  XSC = SQRT(SMALL)/EPSLN
+                  DO 3959 p = 2, NR
+                     DO 3958 q = 1, p - 1
+                        CTEMP=DCMPLX(XSC*MIN(ABS(V(p,p)),ABS(V(q,q))),
+     $                              ZERO)
+                        IF ( ABS(V(q,p)) .LE. TEMP1 )
+*     $                     V(q,p) = TEMP1 * ( V(q,p) / ABS(V(q,p)) )
+     $                     V(q,p) = CTEMP
+ 3958                CONTINUE
+ 3959             CONTINUE
+               END IF
+*
+               IF ( NR .NE. N )
+     $         CALL ZLACPY( 'A', N, NR, V, LDV, CWORK(2*N+1), N )
+*              .. save ...
+*
+*           .. this transposed copy should be better than naive
+               DO 1969 p = 1, NR - 1
+                  CALL ZCOPY( NR-p, V(p,p+1), LDV, V(p+1,p), 1 )
+                  CALL ZLACGV(NR-p+1, V(p,p), 1 )
+ 1969          CONTINUE
+               V(NR,NR)=CONJG(V(NR,NR))
+*
+               CONDR2 = CONDR1
+*
+            ELSE
+*
+*              .. ill-conditioned case: second QRF with pivoting
+*              Note that windowed pivoting would be equally good
+*              numerically, and more run-time efficient. So, in
+*              an optimal implementation, the next call to ZGEQP3
+*              should be replaced with eg. CALL ZGEQPX (ACM TOMS #782)
+*              with properly (carefully) chosen parameters.
+*
+*              R1^* * P2 = Q2 * R2
+               DO 3003 p = 1, NR
+                  IWORK(N+p) = 0
+ 3003          CONTINUE
+               CALL ZGEQP3( N, NR, V, LDV, IWORK(N+1), CWORK(N+1),
+     $                  CWORK(2*N+1), LWORK-2*N, RWORK, IERR )
+**               CALL ZGEQRF( N, NR, V, LDV, CWORK(N+1), CWORK(2*N+1),
+**     $              LWORK-2*N, IERR )
+               IF ( L2PERT ) THEN
+                  XSC = SQRT(SMALL)
+                  DO 3969 p = 2, NR
+                     DO 3968 q = 1, p - 1
+                        CTEMP=DCMPLX(XSC*MIN(ABS(V(p,p)),ABS(V(q,q))),
+     $                                ZERO)
+                        IF ( ABS(V(q,p)) .LE. TEMP1 )
+*     $                     V(q,p) = TEMP1 * ( V(q,p) / ABS(V(q,p)) )
+     $                     V(q,p) = CTEMP
+ 3968                CONTINUE
+ 3969             CONTINUE
+               END IF
+*
+               CALL ZLACPY( 'A', N, NR, V, LDV, CWORK(2*N+1), N )
+*
+               IF ( L2PERT ) THEN
+                  XSC = SQRT(SMALL)
+                  DO 8970 p = 2, NR
+                     DO 8971 q = 1, p - 1
+                        CTEMP=DCMPLX(XSC*MIN(ABS(V(p,p)),ABS(V(q,q))),
+     $                               ZERO)
+*                        V(p,q) = - TEMP1*( V(q,p) / ABS(V(q,p)) )
+                        V(p,q) = - CTEMP
+ 8971                CONTINUE
+ 8970             CONTINUE
+               ELSE
+                  CALL ZLASET( 'L',NR-1,NR-1,CZERO,CZERO,V(2,1),LDV )
+               END IF
+*              Now, compute R2 = L3 * Q3, the LQ factorization.
+               CALL ZGELQF( NR, NR, V, LDV, CWORK(2*N+N*NR+1),
+     $               CWORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR, IERR )
+*              .. and estimate the condition number
+               CALL ZLACPY( 'L',NR,NR,V,LDV,CWORK(2*N+N*NR+NR+1),NR )
+               DO 4950 p = 1, NR
+                  TEMP1 = DZNRM2( p, CWORK(2*N+N*NR+NR+p), NR )
+                  CALL ZDSCAL( p, ONE/TEMP1, CWORK(2*N+N*NR+NR+p), NR )
+ 4950          CONTINUE
+               CALL ZPOCON( 'L',NR,CWORK(2*N+N*NR+NR+1),NR,ONE,TEMP1,
+     $              CWORK(2*N+N*NR+NR+NR*NR+1),RWORK,IERR )
+               CONDR2 = ONE / SQRT(TEMP1)
+*
+*
+               IF ( CONDR2 .GE. COND_OK ) THEN
+*                 .. save the Householder vectors used for Q3
+*                 (this overwrites the copy of R2, as it will not be
+*                 needed in this branch, but it does not overwrite the
+*                 Householder vectors of Q2.).
+                  CALL ZLACPY( 'U', NR, NR, V, LDV, CWORK(2*N+1), N )
+*                 .. and the rest of the information on Q3 is in
+*                 CWORK(2*N+N*NR+1:2*N+N*NR+NR)
+               END IF
+*
+            END IF
+*
+            IF ( L2PERT ) THEN
+               XSC = SQRT(SMALL)
+               DO 4968 q = 2, NR
+                  CTEMP = XSC * V(q,q)
+                  DO 4969 p = 1, q - 1
+*                     V(p,q) = - TEMP1*( V(p,q) / ABS(V(p,q)) )
+                     V(p,q) = - CTEMP
+ 4969             CONTINUE
+ 4968          CONTINUE
+            ELSE
+               CALL ZLASET( 'U', NR-1,NR-1, CZERO,CZERO, V(1,2), LDV )
+            END IF
+*
+*        Second preconditioning finished; continue with Jacobi SVD
+*        The input matrix is lower triangular.
+*
+*        Recover the right singular vectors as solution of a well
+*        conditioned triangular matrix equation.
+*
+            IF ( CONDR1 .LT. COND_OK ) THEN
+*
+               CALL ZGESVJ( 'L','U','N',NR,NR,V,LDV,SVA,NR,U, LDU,
+     $              CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,RWORK,
+     $              LRWORK, INFO )
+               SCALEM  = RWORK(1)
+               NUMRANK = NINT(RWORK(2))
+               DO 3970 p = 1, NR
+                  CALL ZCOPY(  NR, V(1,p), 1, U(1,p), 1 )
+                  CALL ZDSCAL( NR, SVA(p),    V(1,p), 1 )
+ 3970          CONTINUE
+
+*        .. pick the right matrix equation and solve it
+*
+               IF ( NR .EQ. N ) THEN
+* :))             .. best case, R1 is inverted. The solution of this matrix
+*                 equation is Q2*V2 = the product of the Jacobi rotations
+*                 used in ZGESVJ, premultiplied with the orthogonal matrix
+*                 from the second QR factorization.
+                  CALL ZTRSM('L','U','N','N', NR,NR,CONE, A,LDA, V,LDV)
+               ELSE
+*                 .. R1 is well conditioned, but non-square. Adjoint of R2
+*                 is inverted to get the product of the Jacobi rotations
+*                 used in ZGESVJ. The Q-factor from the second QR
+*                 factorization is then built in explicitly.
+                  CALL ZTRSM('L','U','C','N',NR,NR,CONE,CWORK(2*N+1),
+     $                 N,V,LDV)
+                  IF ( NR .LT. N ) THEN
+                  CALL ZLASET('A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV)
+                  CALL ZLASET('A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV)
+                  CALL ZLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV)
+                  END IF
+                  CALL ZUNMQR('L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1),
+     $                V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR)
+               END IF
+*
+            ELSE IF ( CONDR2 .LT. COND_OK ) THEN
+*
+*              The matrix R2 is inverted. The solution of the matrix equation
+*              is Q3^* * V3 = the product of the Jacobi rotations (applied to
+*              the lower triangular L3 from the LQ factorization of
+*              R2=L3*Q3), pre-multiplied with the transposed Q3.
+               CALL ZGESVJ( 'L', 'U', 'N', NR, NR, V, LDV, SVA, NR, U,
+     $              LDU, CWORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR,
+     $              RWORK, LRWORK, INFO )
+               SCALEM  = RWORK(1)
+               NUMRANK = NINT(RWORK(2))
+               DO 3870 p = 1, NR
+                  CALL ZCOPY( NR, V(1,p), 1, U(1,p), 1 )
+                  CALL ZDSCAL( NR, SVA(p),    U(1,p), 1 )
+ 3870          CONTINUE
+               CALL ZTRSM('L','U','N','N',NR,NR,CONE,CWORK(2*N+1),N,
+     $                    U,LDU)
+*              .. apply the permutation from the second QR factorization
+               DO 873 q = 1, NR
+                  DO 872 p = 1, NR
+                     CWORK(2*N+N*NR+NR+IWORK(N+p)) = U(p,q)
+ 872              CONTINUE
+                  DO 874 p = 1, NR
+                     U(p,q) = CWORK(2*N+N*NR+NR+p)
+ 874              CONTINUE
+ 873           CONTINUE
+               IF ( NR .LT. N ) THEN
+                  CALL ZLASET( 'A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV )
+                  CALL ZLASET( 'A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV )
+                  CALL ZLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV)
+               END IF
+               CALL ZUNMQR( 'L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1),
+     $              V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
+            ELSE
+*              Last line of defense.
+* #:(          This is a rather pathological case: no scaled condition
+*              improvement after two pivoted QR factorizations. Other
+*              possibility is that the rank revealing QR factorization
+*              or the condition estimator has failed, or the COND_OK
+*              is set very close to ONE (which is unnecessary). Normally,
+*              this branch should never be executed, but in rare cases of
+*              failure of the RRQR or condition estimator, the last line of
+*              defense ensures that ZGEJSV completes the task.
+*              Compute the full SVD of L3 using ZGESVJ with explicit
+*              accumulation of Jacobi rotations.
+               CALL ZGESVJ( 'L', 'U', 'V', NR, NR, V, LDV, SVA, NR, U,
+     $              LDU, CWORK(2*N+N*NR+NR+1), LWORK-2*N-N*NR-NR,
+     $                         RWORK, LRWORK, INFO )
+               SCALEM  = RWORK(1)
+               NUMRANK = NINT(RWORK(2))
+               IF ( NR .LT. N ) THEN
+                  CALL ZLASET( 'A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV )
+                  CALL ZLASET( 'A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV )
+                  CALL ZLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV)
+               END IF
+               CALL ZUNMQR( 'L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1),
+     $              V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
+*
+               CALL ZUNMLQ( 'L', 'C', NR, NR, NR, CWORK(2*N+1), N,
+     $              CWORK(2*N+N*NR+1), U, LDU, CWORK(2*N+N*NR+NR+1),
+     $              LWORK-2*N-N*NR-NR, IERR )
+               DO 773 q = 1, NR
+                  DO 772 p = 1, NR
+                     CWORK(2*N+N*NR+NR+IWORK(N+p)) = U(p,q)
+ 772              CONTINUE
+                  DO 774 p = 1, NR
+                     U(p,q) = CWORK(2*N+N*NR+NR+p)
+ 774              CONTINUE
+ 773           CONTINUE
+*
+            END IF
+*
+*           Permute the rows of V using the (column) permutation from the
+*           first QRF. Also, scale the columns to make them unit in
+*           Euclidean norm. This applies to all cases.
+*
+            TEMP1 = SQRT(DBLE(N)) * EPSLN
+            DO 1972 q = 1, N
+               DO 972 p = 1, N
+                  CWORK(2*N+N*NR+NR+IWORK(p)) = V(p,q)
+  972          CONTINUE
+               DO 973 p = 1, N
+                  V(p,q) = CWORK(2*N+N*NR+NR+p)
+  973          CONTINUE
+               XSC = ONE / DZNRM2( N, V(1,q), 1 )
+               IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) )
+     $           CALL ZDSCAL( N, XSC, V(1,q), 1 )
+ 1972       CONTINUE
+*           At this moment, V contains the right singular vectors of A.
+*           Next, assemble the left singular vector matrix U (M x N).
+            IF ( NR .LT. M ) THEN
+               CALL ZLASET('A', M-NR, NR, CZERO, CZERO, U(NR+1,1), LDU)
+               IF ( NR .LT. N1 ) THEN
+                  CALL ZLASET('A',NR,N1-NR,CZERO,CZERO,U(1,NR+1),LDU)
+                  CALL ZLASET('A',M-NR,N1-NR,CZERO,CONE,
+     $                        U(NR+1,NR+1),LDU)
+               END IF
+            END IF
+*
+*           The Q matrix from the first QRF is built into the left singular
+*           matrix U. This applies to all cases.
+*
+            CALL ZUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U,
+     $           LDU, CWORK(N+1), LWORK-N, IERR )
+
+*           The columns of U are normalized. The cost is O(M*N) flops.
+            TEMP1 = SQRT(DBLE(M)) * EPSLN
+            DO 1973 p = 1, NR
+               XSC = ONE / DZNRM2( M, U(1,p), 1 )
+               IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) )
+     $          CALL ZDSCAL( M, XSC, U(1,p), 1 )
+ 1973       CONTINUE
+*
+*           If the initial QRF is computed with row pivoting, the left
+*           singular vectors must be adjusted.
+*
+            IF ( ROWPIV )
+     $          CALL ZLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 )
+*
+         ELSE
+*
+*        .. the initial matrix A has almost orthogonal columns and
+*        the second QRF is not needed
+*
+            CALL ZLACPY( 'U', N, N, A, LDA, CWORK(N+1), N )
+            IF ( L2PERT ) THEN
+               XSC = SQRT(SMALL)
+               DO 5970 p = 2, N
+                  CTEMP = XSC * CWORK( N + (p-1)*N + p )
+                  DO 5971 q = 1, p - 1
+*                     CWORK(N+(q-1)*N+p)=-TEMP1 * ( CWORK(N+(p-1)*N+q) /
+*     $                                        ABS(CWORK(N+(p-1)*N+q)) )
+                     CWORK(N+(q-1)*N+p)=-CTEMP
+ 5971             CONTINUE
+ 5970          CONTINUE
+            ELSE
+               CALL ZLASET( 'L',N-1,N-1,CZERO,CZERO,CWORK(N+2),N )
+            END IF
+*
+            CALL ZGESVJ( 'U', 'U', 'N', N, N, CWORK(N+1), N, SVA,
+     $           N, U, LDU, CWORK(N+N*N+1), LWORK-N-N*N, RWORK, LRWORK,
+     $       INFO )
+*
+            SCALEM  = RWORK(1)
+            NUMRANK = NINT(RWORK(2))
+            DO 6970 p = 1, N
+               CALL ZCOPY( N, CWORK(N+(p-1)*N+1), 1, U(1,p), 1 )
+               CALL ZDSCAL( N, SVA(p), CWORK(N+(p-1)*N+1), 1 )
+ 6970       CONTINUE
+*
+            CALL ZTRSM( 'L', 'U', 'N', 'N', N, N,
+     $           CONE, A, LDA, CWORK(N+1), N )
+            DO 6972 p = 1, N
+               CALL ZCOPY( N, CWORK(N+p), N, V(IWORK(p),1), LDV )
+ 6972       CONTINUE
+            TEMP1 = SQRT(DBLE(N))*EPSLN
+            DO 6971 p = 1, N
+               XSC = ONE / DZNRM2( N, V(1,p), 1 )
+               IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) )
+     $            CALL ZDSCAL( N, XSC, V(1,p), 1 )
+ 6971       CONTINUE
+*
+*           Assemble the left singular vector matrix U (M x N).
+*
+            IF ( N .LT. M ) THEN
+               CALL ZLASET( 'A',  M-N, N, CZERO, CZERO, U(N+1,1), LDU )
+               IF ( N .LT. N1 ) THEN
+                  CALL ZLASET('A',N,  N1-N, CZERO, CZERO,  U(1,N+1),LDU)
+                  CALL ZLASET( 'A',M-N,N1-N, CZERO, CONE,U(N+1,N+1),LDU)
+               END IF
+            END IF
+            CALL ZUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U,
+     $           LDU, CWORK(N+1), LWORK-N, IERR )
+            TEMP1 = SQRT(DBLE(M))*EPSLN
+            DO 6973 p = 1, N1
+               XSC = ONE / DZNRM2( M, U(1,p), 1 )
+               IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) )
+     $            CALL ZDSCAL( M, XSC, U(1,p), 1 )
+ 6973       CONTINUE
+*
+            IF ( ROWPIV )
+     $         CALL ZLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 )
+*
+         END IF
+*
+*        end of the  >> almost orthogonal case <<  in the full SVD
+*
+         ELSE
+*
+*        This branch deploys a preconditioned Jacobi SVD with explicitly
+*        accumulated rotations. It is included as optional, mainly for
+*        experimental purposes. It does perform well, and can also be used.
+*        In this implementation, this branch will be automatically activated
+*        if the  condition number sigma_max(A) / sigma_min(A) is predicted
+*        to be greater than the overflow threshold. This is because the
+*        a posteriori computation of the singular vectors assumes robust
+*        implementation of BLAS and some LAPACK procedures, capable of working
+*        in presence of extreme values, e.g. when the singular values spread from
+*        the underflow to the overflow threshold. 
+*
+         DO 7968 p = 1, NR
+            CALL ZCOPY( N-p+1, A(p,p), LDA, V(p,p), 1 )
+            CALL ZLACGV( N-p+1, V(p,p), 1 )
+ 7968    CONTINUE
+*
+         IF ( L2PERT ) THEN
+            XSC = SQRT(SMALL/EPSLN)
+            DO 5969 q = 1, NR
+               CTEMP = DCMPLX(XSC*ABS( V(q,q) ),ZERO)
+               DO 5968 p = 1, N
+                  IF ( ( p .GT. q ) .AND. ( ABS(V(p,q)) .LE. TEMP1 )
+     $                .OR. ( p .LT. q ) )
+*     $                V(p,q) = TEMP1 * ( V(p,q) / ABS(V(p,q)) )
+     $                V(p,q) = CTEMP
+                  IF ( p .LT. q ) V(p,q) = - V(p,q)
+ 5968          CONTINUE
+ 5969       CONTINUE
+         ELSE
+            CALL ZLASET( 'U', NR-1, NR-1, CZERO, CZERO, V(1,2), LDV )
+         END IF
+
+         CALL ZGEQRF( N, NR, V, LDV, CWORK(N+1), CWORK(2*N+1),
+     $        LWORK-2*N, IERR )
+         CALL ZLACPY( 'L', N, NR, V, LDV, CWORK(2*N+1), N )
+*
+         DO 7969 p = 1, NR
+            CALL ZCOPY( NR-p+1, V(p,p), LDV, U(p,p), 1 )
+            CALL ZLACGV( NR-p+1, U(p,p), 1 )
+ 7969    CONTINUE
+
+         IF ( L2PERT ) THEN
+            XSC = SQRT(SMALL/EPSLN)
+            DO 9970 q = 2, NR
+               DO 9971 p = 1, q - 1
+                  CTEMP = DCMPLX(XSC * MIN(ABS(U(p,p)),ABS(U(q,q))),
+     $                            ZERO)
+*                  U(p,q) = - TEMP1 * ( U(q,p) / ABS(U(q,p)) )
+                  U(p,q) = - CTEMP
+ 9971          CONTINUE
+ 9970       CONTINUE
+         ELSE
+            CALL ZLASET('U', NR-1, NR-1, CZERO, CZERO, U(1,2), LDU )
+         END IF
+
+         CALL ZGESVJ( 'L', 'U', 'V', NR, NR, U, LDU, SVA,
+     $        N, V, LDV, CWORK(2*N+N*NR+1), LWORK-2*N-N*NR,
+     $         RWORK, LRWORK, INFO )
+         SCALEM  = RWORK(1)
+         NUMRANK = NINT(RWORK(2))
+
+         IF ( NR .LT. N ) THEN
+            CALL ZLASET( 'A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV )
+            CALL ZLASET( 'A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV )
+            CALL ZLASET( 'A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV )
+         END IF
+
+         CALL ZUNMQR( 'L','N',N,N,NR,CWORK(2*N+1),N,CWORK(N+1),
+     $        V,LDV,CWORK(2*N+N*NR+NR+1),LWORK-2*N-N*NR-NR,IERR )
+*
+*           Permute the rows of V using the (column) permutation from the
+*           first QRF. Also, scale the columns to make them unit in
+*           Euclidean norm. This applies to all cases.
+*
+            TEMP1 = SQRT(DBLE(N)) * EPSLN
+            DO 7972 q = 1, N
+               DO 8972 p = 1, N
+                  CWORK(2*N+N*NR+NR+IWORK(p)) = V(p,q)
+ 8972          CONTINUE
+               DO 8973 p = 1, N
+                  V(p,q) = CWORK(2*N+N*NR+NR+p)
+ 8973          CONTINUE
+               XSC = ONE / DZNRM2( N, V(1,q), 1 )
+               IF ( (XSC .LT. (ONE-TEMP1)) .OR. (XSC .GT. (ONE+TEMP1)) )
+     $           CALL ZDSCAL( N, XSC, V(1,q), 1 )
+ 7972       CONTINUE
+*
+*           At this moment, V contains the right singular vectors of A.
+*           Next, assemble the left singular vector matrix U (M x N).
+*
+         IF ( NR .LT. M ) THEN
+            CALL ZLASET( 'A',  M-NR, NR, CZERO, CZERO, U(NR+1,1), LDU )
+            IF ( NR .LT. N1 ) THEN
+               CALL ZLASET('A',NR,  N1-NR, CZERO, CZERO,  U(1,NR+1),LDU)
+               CALL ZLASET('A',M-NR,N1-NR, CZERO, CONE,U(NR+1,NR+1),LDU)
+            END IF
+         END IF
+*
+         CALL ZUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U,
+     $        LDU, CWORK(N+1), LWORK-N, IERR )
+*
+            IF ( ROWPIV )
+     $         CALL ZLASWP( N1, U, LDU, 1, M-1, IWORK(IWOFF+1), -1 )
+*
+*
+         END IF
+         IF ( TRANSP ) THEN
+*           .. swap U and V because the procedure worked on A^*
+            DO 6974 p = 1, N
+               CALL ZSWAP( N, U(1,p), 1, V(1,p), 1 )
+ 6974       CONTINUE
+         END IF
+*
+      END IF
+*     end of the full SVD
+*
+*     Undo scaling, if necessary (and possible)
+*
+      IF ( USCAL2 .LE. (BIG/SVA(1))*USCAL1 ) THEN
+         CALL DLASCL( 'G', 0, 0, USCAL1, USCAL2, NR, 1, SVA, N, IERR )
+         USCAL1 = ONE
+         USCAL2 = ONE
+      END IF
+*
+      IF ( NR .LT. N ) THEN
+         DO 3004 p = NR+1, N
+            SVA(p) = ZERO
+ 3004    CONTINUE
+      END IF
+*
+      RWORK(1) = USCAL2 * SCALEM
+      RWORK(2) = USCAL1
+      IF ( ERREST ) RWORK(3) = SCONDA
+      IF ( LSVEC .AND. RSVEC ) THEN
+         RWORK(4) = CONDR1
+         RWORK(5) = CONDR2
+      END IF
+      IF ( L2TRAN ) THEN
+         RWORK(6) = ENTRA
+         RWORK(7) = ENTRAT
+      END IF
+*
+      IWORK(1) = NR
+      IWORK(2) = NUMRANK
+      IWORK(3) = WARNING
+      IF ( TRANSP ) THEN
+          IWORK(4) =  1 
+      ELSE
+          IWORK(4) = -1
+      END IF 
+      
+*
+      RETURN
+*     ..
+*     .. END OF ZGEJSV
+*     ..
+      END
+*
diff --git a/lapack-netlib/zgesvx.f b/lapack-netlib/zgesvx.f
new file mode 100644
index 000000000..3b193a1b2
--- /dev/null
+++ b/lapack-netlib/zgesvx.f
@@ -0,0 +1,602 @@
+*> \brief <b> ZGESVX computes the solution to system of linear equations A * X = B for GE matrices</b>
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZGESVX + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zgesvx.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zgesvx.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zgesvx.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV,
+*                          EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR,
+*                          WORK, RWORK, INFO )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER          EQUED, FACT, TRANS
+*       INTEGER            INFO, LDA, LDAF, LDB, LDX, N, NRHS
+*       DOUBLE PRECISION   RCOND
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            IPIV( * )
+*       DOUBLE PRECISION   BERR( * ), C( * ), FERR( * ), R( * ),
+*      $                   RWORK( * )
+*       COMPLEX*16         A( LDA, * ), AF( LDAF, * ), B( LDB, * ),
+*      $                   WORK( * ), X( LDX, * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZGESVX uses the LU factorization to compute the solution to a complex
+*> system of linear equations
+*>    A * X = B,
+*> where A is an N-by-N matrix and X and B are N-by-NRHS matrices.
+*>
+*> Error bounds on the solution and a condition estimate are also
+*> provided.
+*> \endverbatim
+*
+*> \par Description:
+*  =================
+*>
+*> \verbatim
+*>
+*> The following steps are performed:
+*>
+*> 1. If FACT = 'E', real scaling factors are computed to equilibrate
+*>    the system:
+*>       TRANS = 'N':  diag(R)*A*diag(C)     *inv(diag(C))*X = diag(R)*B
+*>       TRANS = 'T': (diag(R)*A*diag(C))**T *inv(diag(R))*X = diag(C)*B
+*>       TRANS = 'C': (diag(R)*A*diag(C))**H *inv(diag(R))*X = diag(C)*B
+*>    Whether or not the system will be equilibrated depends on the
+*>    scaling of the matrix A, but if equilibration is used, A is
+*>    overwritten by diag(R)*A*diag(C) and B by diag(R)*B (if TRANS='N')
+*>    or diag(C)*B (if TRANS = 'T' or 'C').
+*>
+*> 2. If FACT = 'N' or 'E', the LU decomposition is used to factor the
+*>    matrix A (after equilibration if FACT = 'E') as
+*>       A = P * L * U,
+*>    where P is a permutation matrix, L is a unit lower triangular
+*>    matrix, and U is upper triangular.
+*>
+*> 3. If some U(i,i)=0, so that U is exactly singular, then the routine
+*>    returns with INFO = i. Otherwise, the factored form of A is used
+*>    to estimate the condition number of the matrix A.  If the
+*>    reciprocal of the condition number is less than machine precision,
+*>    INFO = N+1 is returned as a warning, but the routine still goes on
+*>    to solve for X and compute error bounds as described below.
+*>
+*> 4. The system of equations is solved for X using the factored form
+*>    of A.
+*>
+*> 5. Iterative refinement is applied to improve the computed solution
+*>    matrix and calculate error bounds and backward error estimates
+*>    for it.
+*>
+*> 6. If equilibration was used, the matrix X is premultiplied by
+*>    diag(C) (if TRANS = 'N') or diag(R) (if TRANS = 'T' or 'C') so
+*>    that it solves the original system before equilibration.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] FACT
+*> \verbatim
+*>          FACT is CHARACTER*1
+*>          Specifies whether or not the factored form of the matrix A is
+*>          supplied on entry, and if not, whether the matrix A should be
+*>          equilibrated before it is factored.
+*>          = 'F':  On entry, AF and IPIV contain the factored form of A.
+*>                  If EQUED is not 'N', the matrix A has been
+*>                  equilibrated with scaling factors given by R and C.
+*>                  A, AF, and IPIV are not modified.
+*>          = 'N':  The matrix A will be copied to AF and factored.
+*>          = 'E':  The matrix A will be equilibrated if necessary, then
+*>                  copied to AF and factored.
+*> \endverbatim
+*>
+*> \param[in] TRANS
+*> \verbatim
+*>          TRANS is CHARACTER*1
+*>          Specifies the form of the system of equations:
+*>          = 'N':  A * X = B     (No transpose)
+*>          = 'T':  A**T * X = B  (Transpose)
+*>          = 'C':  A**H * X = B  (Conjugate transpose)
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of linear equations, i.e., the order of the
+*>          matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in] NRHS
+*> \verbatim
+*>          NRHS is INTEGER
+*>          The number of right hand sides, i.e., the number of columns
+*>          of the matrices B and X.  NRHS >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX*16 array, dimension (LDA,N)
+*>          On entry, the N-by-N matrix A.  If FACT = 'F' and EQUED is
+*>          not 'N', then A must have been equilibrated by the scaling
+*>          factors in R and/or C.  A is not modified if FACT = 'F' or
+*>          'N', or if FACT = 'E' and EQUED = 'N' on exit.
+*>
+*>          On exit, if EQUED .ne. 'N', A is scaled as follows:
+*>          EQUED = 'R':  A := diag(R) * A
+*>          EQUED = 'C':  A := A * diag(C)
+*>          EQUED = 'B':  A := diag(R) * A * diag(C).
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in,out] AF
+*> \verbatim
+*>          AF is COMPLEX*16 array, dimension (LDAF,N)
+*>          If FACT = 'F', then AF is an input argument and on entry
+*>          contains the factors L and U from the factorization
+*>          A = P*L*U as computed by ZGETRF.  If EQUED .ne. 'N', then
+*>          AF is the factored form of the equilibrated matrix A.
+*>
+*>          If FACT = 'N', then AF is an output argument and on exit
+*>          returns the factors L and U from the factorization A = P*L*U
+*>          of the original matrix A.
+*>
+*>          If FACT = 'E', then AF is an output argument and on exit
+*>          returns the factors L and U from the factorization A = P*L*U
+*>          of the equilibrated matrix A (see the description of A for
+*>          the form of the equilibrated matrix).
+*> \endverbatim
+*>
+*> \param[in] LDAF
+*> \verbatim
+*>          LDAF is INTEGER
+*>          The leading dimension of the array AF.  LDAF >= max(1,N).
+*> \endverbatim
+*>
+*> \param[in,out] IPIV
+*> \verbatim
+*>          IPIV is INTEGER array, dimension (N)
+*>          If FACT = 'F', then IPIV is an input argument and on entry
+*>          contains the pivot indices from the factorization A = P*L*U
+*>          as computed by ZGETRF; row i of the matrix was interchanged
+*>          with row IPIV(i).
+*>
+*>          If FACT = 'N', then IPIV is an output argument and on exit
+*>          contains the pivot indices from the factorization A = P*L*U
+*>          of the original matrix A.
+*>
+*>          If FACT = 'E', then IPIV is an output argument and on exit
+*>          contains the pivot indices from the factorization A = P*L*U
+*>          of the equilibrated matrix A.
+*> \endverbatim
+*>
+*> \param[in,out] EQUED
+*> \verbatim
+*>          EQUED is CHARACTER*1
+*>          Specifies the form of equilibration that was done.
+*>          = 'N':  No equilibration (always true if FACT = 'N').
+*>          = 'R':  Row equilibration, i.e., A has been premultiplied by
+*>                  diag(R).
+*>          = 'C':  Column equilibration, i.e., A has been postmultiplied
+*>                  by diag(C).
+*>          = 'B':  Both row and column equilibration, i.e., A has been
+*>                  replaced by diag(R) * A * diag(C).
+*>          EQUED is an input argument if FACT = 'F'; otherwise, it is an
+*>          output argument.
+*> \endverbatim
+*>
+*> \param[in,out] R
+*> \verbatim
+*>          R is DOUBLE PRECISION array, dimension (N)
+*>          The row scale factors for A.  If EQUED = 'R' or 'B', A is
+*>          multiplied on the left by diag(R); if EQUED = 'N' or 'C', R
+*>          is not accessed.  R is an input argument if FACT = 'F';
+*>          otherwise, R is an output argument.  If FACT = 'F' and
+*>          EQUED = 'R' or 'B', each element of R must be positive.
+*> \endverbatim
+*>
+*> \param[in,out] C
+*> \verbatim
+*>          C is DOUBLE PRECISION array, dimension (N)
+*>          The column scale factors for A.  If EQUED = 'C' or 'B', A is
+*>          multiplied on the right by diag(C); if EQUED = 'N' or 'R', C
+*>          is not accessed.  C is an input argument if FACT = 'F';
+*>          otherwise, C is an output argument.  If FACT = 'F' and
+*>          EQUED = 'C' or 'B', each element of C must be positive.
+*> \endverbatim
+*>
+*> \param[in,out] B
+*> \verbatim
+*>          B is COMPLEX*16 array, dimension (LDB,NRHS)
+*>          On entry, the N-by-NRHS right hand side matrix B.
+*>          On exit,
+*>          if EQUED = 'N', B is not modified;
+*>          if TRANS = 'N' and EQUED = 'R' or 'B', B is overwritten by
+*>          diag(R)*B;
+*>          if TRANS = 'T' or 'C' and EQUED = 'C' or 'B', B is
+*>          overwritten by diag(C)*B.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*>          LDB is INTEGER
+*>          The leading dimension of the array B.  LDB >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] X
+*> \verbatim
+*>          X is COMPLEX*16 array, dimension (LDX,NRHS)
+*>          If INFO = 0 or INFO = N+1, the N-by-NRHS solution matrix X
+*>          to the original system of equations.  Note that A and B are
+*>          modified on exit if EQUED .ne. 'N', and the solution to the
+*>          equilibrated system is inv(diag(C))*X if TRANS = 'N' and
+*>          EQUED = 'C' or 'B', or inv(diag(R))*X if TRANS = 'T' or 'C'
+*>          and EQUED = 'R' or 'B'.
+*> \endverbatim
+*>
+*> \param[in] LDX
+*> \verbatim
+*>          LDX is INTEGER
+*>          The leading dimension of the array X.  LDX >= max(1,N).
+*> \endverbatim
+*>
+*> \param[out] RCOND
+*> \verbatim
+*>          RCOND is DOUBLE PRECISION
+*>          The estimate of the reciprocal condition number of the matrix
+*>          A after equilibration (if done).  If RCOND is less than the
+*>          machine precision (in particular, if RCOND = 0), the matrix
+*>          is singular to working precision.  This condition is
+*>          indicated by a return code of INFO > 0.
+*> \endverbatim
+*>
+*> \param[out] FERR
+*> \verbatim
+*>          FERR is DOUBLE PRECISION array, dimension (NRHS)
+*>          The estimated forward error bound for each solution vector
+*>          X(j) (the j-th column of the solution matrix X).
+*>          If XTRUE is the true solution corresponding to X(j), FERR(j)
+*>          is an estimated upper bound for the magnitude of the largest
+*>          element in (X(j) - XTRUE) divided by the magnitude of the
+*>          largest element in X(j).  The estimate is as reliable as
+*>          the estimate for RCOND, and is almost always a slight
+*>          overestimate of the true error.
+*> \endverbatim
+*>
+*> \param[out] BERR
+*> \verbatim
+*>          BERR is DOUBLE PRECISION array, dimension (NRHS)
+*>          The componentwise relative backward error of each solution
+*>          vector X(j) (i.e., the smallest relative change in
+*>          any element of A or B that makes X(j) an exact solution).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is COMPLEX*16 array, dimension (2*N)
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is DOUBLE PRECISION array, dimension (MAX(1,2*N))
+*>          On exit, RWORK(1) contains the reciprocal pivot growth
+*>          factor norm(A)/norm(U). The "max absolute element" norm is
+*>          used. If RWORK(1) is much less than 1, then the stability
+*>          of the LU factorization of the (equilibrated) matrix A
+*>          could be poor. This also means that the solution X, condition
+*>          estimator RCOND, and forward error bound FERR could be
+*>          unreliable. If factorization fails with 0<INFO<=N, then
+*>          RWORK(1) contains the reciprocal pivot growth factor for the
+*>          leading INFO columns of A.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  if INFO = i, and i is
+*>                <= N:  U(i,i) is exactly zero.  The factorization has
+*>                       been completed, but the factor U is exactly
+*>                       singular, so the solution and error bounds
+*>                       could not be computed. RCOND = 0 is returned.
+*>                = N+1: U is nonsingular, but RCOND is less than machine
+*>                       precision, meaning that the matrix is singular
+*>                       to working precision.  Nevertheless, the
+*>                       solution and error bounds are computed because
+*>                       there are a number of situations where the
+*>                       computed solution can be more accurate than the
+*>                       value of RCOND would suggest.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup complex16GEsolve
+*
+*  =====================================================================
+      SUBROUTINE ZGESVX( FACT, TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV,
+     $                   EQUED, R, C, B, LDB, X, LDX, RCOND, FERR, BERR,
+     $                   WORK, RWORK, INFO )
+*  Solves A * X = B by LU, with optional equilibration and error bounds.
+*  -- LAPACK driver routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      CHARACTER          EQUED, FACT, TRANS
+      INTEGER            INFO, LDA, LDAF, LDB, LDX, N, NRHS
+      DOUBLE PRECISION   RCOND
+*     ..
+*     .. Array Arguments ..
+      INTEGER            IPIV( * )
+      DOUBLE PRECISION   BERR( * ), C( * ), FERR( * ), R( * ),
+     $                   RWORK( * )
+      COMPLEX*16         A( LDA, * ), AF( LDAF, * ), B( LDB, * ),
+     $                   WORK( * ), X( LDX, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO, ONE
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            COLEQU, EQUIL, NOFACT, NOTRAN, ROWEQU
+      CHARACTER          NORM
+      INTEGER            I, INFEQU, J
+      DOUBLE PRECISION   AMAX, ANORM, BIGNUM, COLCND, RCMAX, RCMIN,
+     $                   ROWCND, RPVGRW, SMLNUM
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      DOUBLE PRECISION   DLAMCH, ZLANGE, ZLANTR
+      EXTERNAL           LSAME, DLAMCH, ZLANGE, ZLANTR
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           XERBLA, ZGECON, ZGEEQU, ZGERFS, ZGETRF, ZGETRS,
+     $                   ZLACPY, ZLAQGE
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*     Decode FACT/TRANS; EQUED is reset to 'N' if FACT = 'N' or 'E'.
+      INFO = 0
+      NOFACT = LSAME( FACT, 'N' )
+      EQUIL = LSAME( FACT, 'E' )
+      NOTRAN = LSAME( TRANS, 'N' )
+      IF( NOFACT .OR. EQUIL ) THEN
+         EQUED = 'N'
+         ROWEQU = .FALSE.
+         COLEQU = .FALSE.
+      ELSE
+         ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+         COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         SMLNUM = DLAMCH( 'Safe minimum' )
+         BIGNUM = ONE / SMLNUM
+      END IF
+*
+*     Test the input parameters.  R and C are checked for positivity
+*     only when ROWEQU / COLEQU say they are in use.
+      IF( .NOT.NOFACT .AND. .NOT.EQUIL .AND. .NOT.LSAME( FACT, 'F' ) )
+     $     THEN
+         INFO = -1
+      ELSE IF( .NOT.NOTRAN .AND. .NOT.LSAME( TRANS, 'T' ) .AND. .NOT.
+     $         LSAME( TRANS, 'C' ) ) THEN
+         INFO = -2
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -3
+      ELSE IF( NRHS.LT.0 ) THEN
+         INFO = -4
+      ELSE IF( LDA.LT.MAX( 1, N ) ) THEN
+         INFO = -6
+      ELSE IF( LDAF.LT.MAX( 1, N ) ) THEN
+         INFO = -8
+      ELSE IF( LSAME( FACT, 'F' ) .AND. .NOT.
+     $         ( ROWEQU .OR. COLEQU .OR. LSAME( EQUED, 'N' ) ) ) THEN
+         INFO = -10
+      ELSE
+         IF( ROWEQU ) THEN
+            RCMIN = BIGNUM
+            RCMAX = ZERO
+            DO 10 J = 1, N
+               RCMIN = MIN( RCMIN, R( J ) )
+               RCMAX = MAX( RCMAX, R( J ) )
+   10       CONTINUE
+            IF( RCMIN.LE.ZERO ) THEN
+               INFO = -11
+            ELSE IF( N.GT.0 ) THEN
+               ROWCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM )
+            ELSE
+               ROWCND = ONE
+            END IF
+         END IF
+         IF( COLEQU .AND. INFO.EQ.0 ) THEN
+            RCMIN = BIGNUM
+            RCMAX = ZERO
+            DO 20 J = 1, N
+               RCMIN = MIN( RCMIN, C( J ) )
+               RCMAX = MAX( RCMAX, C( J ) )
+   20       CONTINUE
+            IF( RCMIN.LE.ZERO ) THEN
+               INFO = -12
+            ELSE IF( N.GT.0 ) THEN
+               COLCND = MAX( RCMIN, SMLNUM ) / MIN( RCMAX, BIGNUM )
+            ELSE
+               COLCND = ONE
+            END IF
+         END IF
+         IF( INFO.EQ.0 ) THEN
+            IF( LDB.LT.MAX( 1, N ) ) THEN
+               INFO = -14
+            ELSE IF( LDX.LT.MAX( 1, N ) ) THEN
+               INFO = -16
+            END IF
+         END IF
+      END IF
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'ZGESVX', -INFO )
+         RETURN
+      END IF
+*
+      IF( EQUIL ) THEN
+*
+*        Compute row and column scalings to equilibrate the matrix A.
+*
+         CALL ZGEEQU( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX, INFEQU )
+         IF( INFEQU.EQ.0 ) THEN
+*
+*           Equilibrate the matrix; EQUED returns the scaling applied.
+*
+            CALL ZLAQGE( N, N, A, LDA, R, C, ROWCND, COLCND, AMAX,
+     $                   EQUED )
+            ROWEQU = LSAME( EQUED, 'R' ) .OR. LSAME( EQUED, 'B' )
+            COLEQU = LSAME( EQUED, 'C' ) .OR. LSAME( EQUED, 'B' )
+         END IF
+      END IF
+*
+*     Scale the right hand side (rows by R if NOTRAN, else by C).
+*
+      IF( NOTRAN ) THEN
+         IF( ROWEQU ) THEN
+            DO 40 J = 1, NRHS
+               DO 30 I = 1, N
+                  B( I, J ) = R( I )*B( I, J )
+   30          CONTINUE
+   40       CONTINUE
+         END IF
+      ELSE IF( COLEQU ) THEN
+         DO 60 J = 1, NRHS
+            DO 50 I = 1, N
+               B( I, J ) = C( I )*B( I, J )
+   50       CONTINUE
+   60    CONTINUE
+      END IF
+*
+      IF( NOFACT .OR. EQUIL ) THEN
+*
+*        Compute the LU factorization of A.
+*
+         CALL ZLACPY( 'Full', N, N, A, LDA, AF, LDAF )
+         CALL ZGETRF( N, N, AF, LDAF, IPIV, INFO )
+*
+*        Return if INFO is non-zero (ZGETRF found U(INFO,INFO) = 0).
+*
+         IF( INFO.GT.0 ) THEN
+*
+*           Compute the reciprocal pivot growth factor of the
+*           leading rank-deficient INFO columns of A.
+*
+            RPVGRW = ZLANTR( 'M', 'U', 'N', INFO, INFO, AF, LDAF,
+     $               RWORK )
+            IF( RPVGRW.EQ.ZERO ) THEN
+               RPVGRW = ONE
+            ELSE
+               RPVGRW = ZLANGE( 'M', N, INFO, A, LDA, RWORK ) /
+     $                  RPVGRW
+            END IF
+            RWORK( 1 ) = RPVGRW
+            RCOND = ZERO
+            RETURN
+         END IF
+      END IF
+*
+*     Compute the norm of the matrix A ('1' if NOTRAN, else 'I') and
+*     the reciprocal pivot growth factor RPVGRW (max-abs norms).
+*
+      IF( NOTRAN ) THEN
+         NORM = '1'
+      ELSE
+         NORM = 'I'
+      END IF
+      ANORM = ZLANGE( NORM, N, N, A, LDA, RWORK )
+      RPVGRW = ZLANTR( 'M', 'U', 'N', N, N, AF, LDAF, RWORK )
+      IF( RPVGRW.EQ.ZERO ) THEN
+         RPVGRW = ONE
+      ELSE
+         RPVGRW = ZLANGE( 'M', N, N, A, LDA, RWORK ) / RPVGRW
+      END IF
+*
+*     Compute the reciprocal of the condition number of A.
+*
+      CALL ZGECON( NORM, N, AF, LDAF, ANORM, RCOND, WORK, RWORK, INFO )
+*
+*     Compute the solution matrix X.
+*
+      CALL ZLACPY( 'Full', N, NRHS, B, LDB, X, LDX )
+      CALL ZGETRS( TRANS, N, NRHS, AF, LDAF, IPIV, X, LDX, INFO )
+*
+*     Use iterative refinement to improve the computed solution and
+*     compute error bounds and backward error estimates for it.
+*
+      CALL ZGERFS( TRANS, N, NRHS, A, LDA, AF, LDAF, IPIV, B, LDB, X,
+     $             LDX, FERR, BERR, WORK, RWORK, INFO )
+*
+*     Transform the solution matrix X to a solution of the original
+*     system and divide FERR by COLCND (if NOTRAN) or ROWCND.
+*
+      IF( NOTRAN ) THEN
+         IF( COLEQU ) THEN
+            DO 80 J = 1, NRHS
+               DO 70 I = 1, N
+                  X( I, J ) = C( I )*X( I, J )
+   70          CONTINUE
+   80       CONTINUE
+            DO 90 J = 1, NRHS
+               FERR( J ) = FERR( J ) / COLCND
+   90       CONTINUE
+         END IF
+      ELSE IF( ROWEQU ) THEN
+         DO 110 J = 1, NRHS
+            DO 100 I = 1, N
+               X( I, J ) = R( I )*X( I, J )
+  100       CONTINUE
+  110    CONTINUE
+         DO 120 J = 1, NRHS
+            FERR( J ) = FERR( J ) / ROWCND
+  120    CONTINUE
+      END IF
+*
+*     Set INFO = N+1 if the matrix is singular to working precision,
+*     i.e. RCOND is below DLAMCH( 'Epsilon' ).
+      IF( RCOND.LT.DLAMCH( 'Epsilon' ) )
+     $   INFO = N + 1
+*
+      RWORK( 1 ) = RPVGRW
+      RETURN
+*
+*     End of ZGESVX
+*
+      END
diff --git a/lapack/getf2/getf2_k.c b/lapack/getf2/getf2_k.c
index 80c66dd7a..5795797d3 100644
--- a/lapack/getf2/getf2_k.c
+++ b/lapack/getf2/getf2_k.c
@@ -95,7 +95,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
       GEMV_N(m - j, j, 0, dm1,  a + j, lda, b, 1, b + j, 1, sb);
 
       jp = j + IAMAX_K(m - j, b + j, 1);
-      if (jp>m) jp = m;        //avoid out of boundary
+      if (jp>m) jp = m;        //avoid out of boundary when the iamax kernel does not cope with NaN in input, see gh issue 723
       ipiv[j + offset] = jp + offset;
       jp--;
       temp1 = *(b + jp);
diff --git a/lapack/getf2/zgetf2_k.c b/lapack/getf2/zgetf2_k.c
index e3d53c96f..6a2137b3e 100644
--- a/lapack/getf2/zgetf2_k.c
+++ b/lapack/getf2/zgetf2_k.c
@@ -99,7 +99,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
       GEMV_N(m - j, j, 0, dm1,  ZERO, a + j * 2, lda, b, 1, b + j * 2, 1, sb);
 
       jp = j + IAMAX_K(m - j, b + j * 2, 1);
-      if (jp>m) jp = m;        //avoid out of boundary
+      if (jp>m) jp = m;        //avoid out of boundary when the iamax kernel does not cope with NaN in input, see gh issue 723
       ipiv[j + offset] = jp + offset;
       jp--;
 
diff --git a/lapack/laswp/csky/Makefile b/lapack/laswp/csky/Makefile
new file mode 100644
index 000000000..75411deb5
--- /dev/null
+++ b/lapack/laswp/csky/Makefile
@@ -0,0 +1,13 @@
+TOPDIR	= ../../..
+include ../../../Makefile.system
+
+ifndef LASWP
+LASWP	= ../generic/laswp_k.c
+endif
+
+ifndef ZLASWP
+ZLASWP	= ../generic/zlaswp_k.c
+endif
+
+include ../generic/Makefile
+
diff --git a/lapack/laswp/loongarch64/Makefile b/lapack/laswp/loongarch64/Makefile
index 71e5a87cb..1c85667ec 100644
--- a/lapack/laswp/loongarch64/Makefile
+++ b/lapack/laswp/loongarch64/Makefile
@@ -1,6 +1,11 @@
 TOPDIR	= ../../..
 include ../../../Makefile.system
 
+ifeq ($(DYNAMIC_ARCH), 1)
+LASWP	= ../generic/laswp_k_4.c
+ZLASWP	= ../generic/zlaswp_k_4.c
+endif
+
 ifndef LASWP
 LASWP	= ../generic/laswp_k.c
 endif
diff --git a/lapack/lauum/lauum_L_parallel.c b/lapack/lauum/lauum_L_parallel.c
index 1b32e4519..0f4eaefaa 100644
--- a/lapack/lauum/lauum_L_parallel.c
+++ b/lapack/lauum/lauum_L_parallel.c
@@ -102,7 +102,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
     newarg.c = a;
 
     syrk_thread(mode | BLAS_TRANSA_T | BLAS_TRANSB_N | BLAS_UPLO,
-		&newarg, NULL, NULL, (int (*)(void))HERK_LC, sa, sb, args -> nthreads);
+		&newarg, NULL, NULL, (int (*)(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG))HERK_LC, sa, sb, args -> nthreads);
 
     newarg.m = bk;
     newarg.n = i;
@@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
     newarg.b = a + (i          ) * COMPSIZE;
 
     gemm_thread_n(mode | BLAS_TRANSA_T,
-		  &newarg, NULL, NULL, (int (*)(void))TRMM_LCLN, sa, sb, args -> nthreads);
+		  &newarg, NULL, NULL, (int (*)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT*, FLOAT*, BLASLONG))TRMM_LCLN, sa, sb, args -> nthreads);
 
     newarg.m = bk;
     newarg.n = bk;
diff --git a/lapack/lauum/lauum_U_parallel.c b/lapack/lauum/lauum_U_parallel.c
index f5ea54c88..77bfeebc7 100644
--- a/lapack/lauum/lauum_U_parallel.c
+++ b/lapack/lauum/lauum_U_parallel.c
@@ -102,7 +102,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
     newarg.c = a;
 
     syrk_thread(mode | BLAS_TRANSA_N | BLAS_TRANSB_T,
-		&newarg, NULL, NULL, (int (*)(void))HERK_UN, sa, sb, args -> nthreads);
+		&newarg, NULL, NULL, (int (*)(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG))HERK_UN, sa, sb, args -> nthreads);
 
     newarg.m = i;
     newarg.n = bk;
@@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
     newarg.b = a + (    i * lda) * COMPSIZE;
 
     gemm_thread_m(mode | BLAS_TRANSA_T | BLAS_RSIDE,
-		  &newarg, NULL, NULL, (int (*)(void))TRMM_RCUN, sa, sb, args -> nthreads);
+		  &newarg, NULL, NULL, (int (*)(blas_arg_t*, BLASLONG*, BLASLONG*,FLOAT*, FLOAT*, BLASLONG))TRMM_RCUN, sa, sb, args -> nthreads);
 
     newarg.m = bk;
     newarg.n = bk;
diff --git a/lapack/potrf/potrf_L_parallel.c b/lapack/potrf/potrf_L_parallel.c
index 986816d1a..7d6bcd776 100644
--- a/lapack/potrf/potrf_L_parallel.c
+++ b/lapack/potrf/potrf_L_parallel.c
@@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
       newarg.b = a + (i + bk + i * lda) * COMPSIZE;
 
       gemm_thread_m(mode | BLAS_RSIDE | BLAS_TRANSA_T | BLAS_UPLO,
-		    &newarg, NULL, NULL, (int (*)(void))TRSM_RCLN, sa, sb, args -> nthreads);
+		    &newarg, NULL, NULL, (int (*)(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG))TRSM_RCLN, sa, sb, args -> nthreads);
 
       newarg.n = n - i - bk;
       newarg.k = bk;
diff --git a/lapack/potrf/potrf_U_parallel.c b/lapack/potrf/potrf_U_parallel.c
index cc6ff9912..1f1427276 100644
--- a/lapack/potrf/potrf_U_parallel.c
+++ b/lapack/potrf/potrf_U_parallel.c
@@ -110,7 +110,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
       newarg.b = a + (i + (i + bk) * lda) * COMPSIZE;
 
       gemm_thread_n(mode | BLAS_TRANSA_T,
-		    &newarg, NULL, NULL, (int (*)(void))TRSM_LCUN, sa, sb, args -> nthreads);
+		    &newarg, NULL, NULL, (int (*)(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG))TRSM_LCUN, sa, sb, args -> nthreads);
 
       newarg.n = n - i - bk;
       newarg.k = bk;
diff --git a/lapack/potrf/potrf_parallel.c b/lapack/potrf/potrf_parallel.c
index 29364cc05..a7c28f4c2 100644
--- a/lapack/potrf/potrf_parallel.c
+++ b/lapack/potrf/potrf_parallel.c
@@ -80,10 +80,6 @@ static FLOAT dm1 = -1.;
 #define DIVIDE_RATE 2
 #endif
 
-#ifndef SWITCH_RATIO
-#define SWITCH_RATIO 2
-#endif
-
 #ifndef LOWER
 #define TRANS
 #endif
diff --git a/openblas.pc.in b/openblas.pc.in
index 8ad6e8bee..6c27c462b 100644
--- a/openblas.pc.in
+++ b/openblas.pc.in
@@ -2,6 +2,6 @@ Name: openblas
 Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
 Version: ${version}
 URL: https://github.com/xianyi/OpenBLAS
-Libs: -L${libdir} -lopenblas${libsuffix}
+Libs: -L${libdir} -l${libprefix}openblas${libnamesuffix}
 Libs.private: ${extralib}
 Cflags: -I${includedir}
diff --git a/param.h b/param.h
index f1f5cbdad..8bdc03380 100644
--- a/param.h
+++ b/param.h
@@ -1,5 +1,5 @@
 /*****************************************************************************
-Copyright (c) 2011-2014, The OpenBLAS Project
+Copyright (c) 2011-2023, The OpenBLAS Project
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -2600,13 +2600,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define SGEMM_DEFAULT_UNROLL_M 16
 #define SGEMM_DEFAULT_UNROLL_N 8
-#if defined(HAVE_GAS) && (HAVE_GAS == 1) 
-#define DGEMM_DEFAULT_UNROLL_M 16
-#define DGEMM_DEFAULT_UNROLL_N 4
-#else
 #define DGEMM_DEFAULT_UNROLL_M 8
 #define DGEMM_DEFAULT_UNROLL_N 8
-#endif
 #define CGEMM_DEFAULT_UNROLL_M 8
 #define CGEMM_DEFAULT_UNROLL_N 4
 #define ZGEMM_DEFAULT_UNROLL_M 8
@@ -2748,19 +2743,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GEMM_DEFAULT_OFFSET_B 0
 #define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
 
-#ifdef HAVE_MSA
-#define SGEMM_DEFAULT_UNROLL_M  8
-#define SGEMM_DEFAULT_UNROLL_N  8
-
-#define DGEMM_DEFAULT_UNROLL_M  8
-#define DGEMM_DEFAULT_UNROLL_N  4
-
-#define CGEMM_DEFAULT_UNROLL_M  8
-#define CGEMM_DEFAULT_UNROLL_N  4
-
-#define ZGEMM_DEFAULT_UNROLL_M  4
-#define ZGEMM_DEFAULT_UNROLL_N  4
-#else
+#if defined(NO_MSA)
 #define SGEMM_DEFAULT_UNROLL_M  8
 #define SGEMM_DEFAULT_UNROLL_N  4
 
@@ -2772,6 +2755,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define ZGEMM_DEFAULT_UNROLL_M  2
 #define ZGEMM_DEFAULT_UNROLL_N  2
+#else
+#define SGEMM_DEFAULT_UNROLL_M  8
+#define SGEMM_DEFAULT_UNROLL_N  8
+
+#define DGEMM_DEFAULT_UNROLL_M  8
+#define DGEMM_DEFAULT_UNROLL_N  4
+
+#define CGEMM_DEFAULT_UNROLL_M  8
+#define CGEMM_DEFAULT_UNROLL_N  4
+
+#define ZGEMM_DEFAULT_UNROLL_M  4
+#define ZGEMM_DEFAULT_UNROLL_N  4
 #endif
 
 #define SGEMM_DEFAULT_P	64
@@ -2845,31 +2840,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GEMM_DEFAULT_OFFSET_B 0
 #define GEMM_DEFAULT_ALIGN 0x0ffffUL
 
+#if defined(NO_LASX)
+#define DGEMM_DEFAULT_UNROLL_N 8
+#define DGEMM_DEFAULT_UNROLL_M 2
 #define SGEMM_DEFAULT_UNROLL_N 8
+#define SGEMM_DEFAULT_UNROLL_M 2
+#define CGEMM_DEFAULT_UNROLL_N 4
+#define CGEMM_DEFAULT_UNROLL_M 1
+#define ZGEMM_DEFAULT_UNROLL_N 4
+#define ZGEMM_DEFAULT_UNROLL_M 1
+#else
 #define DGEMM_DEFAULT_UNROLL_N 4
-#define QGEMM_DEFAULT_UNROLL_N 2
+#define DGEMM_DEFAULT_UNROLL_M 16
+#define SGEMM_DEFAULT_UNROLL_N 8
+#define SGEMM_DEFAULT_UNROLL_M 16
 #define CGEMM_DEFAULT_UNROLL_N 4
+#define CGEMM_DEFAULT_UNROLL_M 16
 #define ZGEMM_DEFAULT_UNROLL_N 4
+#define ZGEMM_DEFAULT_UNROLL_M 8
+#endif
+
+#define QGEMM_DEFAULT_UNROLL_N 2
 #define XGEMM_DEFAULT_UNROLL_N 1
 
-#define SGEMM_DEFAULT_UNROLL_M 2
-#define DGEMM_DEFAULT_UNROLL_M 16
 #define QGEMM_DEFAULT_UNROLL_M 2
-#define CGEMM_DEFAULT_UNROLL_M 1
-#define ZGEMM_DEFAULT_UNROLL_M 1
 #define XGEMM_DEFAULT_UNROLL_M 1
 
-#define SGEMM_DEFAULT_P 512
+#define SGEMM_DEFAULT_P 256
 #define DGEMM_DEFAULT_P 32
 #define CGEMM_DEFAULT_P 128
 #define ZGEMM_DEFAULT_P 128
 
-#define SGEMM_DEFAULT_R 12288
+#define SGEMM_DEFAULT_R 1024
 #define DGEMM_DEFAULT_R 858
 #define CGEMM_DEFAULT_R 4096
 #define ZGEMM_DEFAULT_R 4096
 
-#define SGEMM_DEFAULT_Q 128
+#define SGEMM_DEFAULT_Q 256
 #define DGEMM_DEFAULT_Q 152
 #define CGEMM_DEFAULT_Q 128
 #define ZGEMM_DEFAULT_Q 128
@@ -2885,13 +2892,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SGEMM_DEFAULT_UNROLL_M  2
 #define SGEMM_DEFAULT_UNROLL_N  8
 
-#define DGEMM_DEFAULT_UNROLL_M  2
-#define DGEMM_DEFAULT_UNROLL_N  8
+#define DGEMM_DEFAULT_UNROLL_M  8
+#define DGEMM_DEFAULT_UNROLL_N  4
 
-#define CGEMM_DEFAULT_UNROLL_M  1
+#define CGEMM_DEFAULT_UNROLL_M  8
 #define CGEMM_DEFAULT_UNROLL_N  4
 
-#define ZGEMM_DEFAULT_UNROLL_M  1
+#define ZGEMM_DEFAULT_UNROLL_M  4
 #define ZGEMM_DEFAULT_UNROLL_N  4
 
 #define SGEMM_DEFAULT_P	128
@@ -2955,19 +2962,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GEMM_DEFAULT_OFFSET_B  0
 #define GEMM_DEFAULT_ALIGN (BLASLONG) 0x03fffUL
 
-#if defined(HAVE_MSA)
-#define SGEMM_DEFAULT_UNROLL_M  8
-#define SGEMM_DEFAULT_UNROLL_N  8
-
-#define DGEMM_DEFAULT_UNROLL_M  8
-#define DGEMM_DEFAULT_UNROLL_N  4
-
-#define CGEMM_DEFAULT_UNROLL_M  8
-#define CGEMM_DEFAULT_UNROLL_N  4
-
-#define ZGEMM_DEFAULT_UNROLL_M  4
-#define ZGEMM_DEFAULT_UNROLL_N  4
-#else
+#if defined(NO_MSA)
 #define SGEMM_DEFAULT_UNROLL_M  2
 #define SGEMM_DEFAULT_UNROLL_N  2
 
@@ -2979,6 +2974,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define ZGEMM_DEFAULT_UNROLL_M  2
 #define ZGEMM_DEFAULT_UNROLL_N  2
+#else
+#define SGEMM_DEFAULT_UNROLL_M  8
+#define SGEMM_DEFAULT_UNROLL_N  8
+
+#define DGEMM_DEFAULT_UNROLL_M  8
+#define DGEMM_DEFAULT_UNROLL_N  4
+
+#define CGEMM_DEFAULT_UNROLL_M  8
+#define CGEMM_DEFAULT_UNROLL_N  4
+
+#define ZGEMM_DEFAULT_UNROLL_M  4
+#define ZGEMM_DEFAULT_UNROLL_N  4
 #endif
 
 #define SGEMM_DEFAULT_P  128
@@ -3038,6 +3045,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
+#if defined(x280)
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+#define GEMM_DEFAULT_ALIGN 0x03fffUL
+
+#define SGEMM_DEFAULT_UNROLL_M  16 // 4 // 16 // 2
+#define SGEMM_DEFAULT_UNROLL_N  8// 4 // 4 // 2
+
+/* SGEMM_UNROLL_MN is calculated as max(SGEMM_UNROLL_M, SGEMM_UNROLL_N)
+ * Since we don't define SGEMM_UNROLL_M correctly we have to manually set this macro.
+ * If VLMAX size is ever more than 1024, this should be increased also. */
+#define SGEMM_DEFAULT_UNROLL_MN  32
+
+#define DGEMM_DEFAULT_UNROLL_M  16 //2 // 8
+#define DGEMM_DEFAULT_UNROLL_N  8 //2 // 4
+#define DGEMM_DEFAULT_UNROLL_MN  32
+
+#define CGEMM_DEFAULT_UNROLL_M  8
+#define CGEMM_DEFAULT_UNROLL_N  4
+#define CGEMM_DEFAULT_UNROLL_MN 32
+
+#define ZGEMM_DEFAULT_UNROLL_M  8
+#define ZGEMM_DEFAULT_UNROLL_N  4
+#define ZGEMM_DEFAULT_UNROLL_MN 16
+
+#define SGEMM_DEFAULT_P	160
+#define DGEMM_DEFAULT_P	160
+#define CGEMM_DEFAULT_P 96
+#define ZGEMM_DEFAULT_P 64
+
+#define SGEMM_DEFAULT_Q 240
+#define DGEMM_DEFAULT_Q 128
+#define CGEMM_DEFAULT_Q 120
+#define ZGEMM_DEFAULT_Q 120
+
+#define SGEMM_DEFAULT_R 12288
+#define DGEMM_DEFAULT_R 8192
+#define CGEMM_DEFAULT_R 4096
+#define ZGEMM_DEFAULT_R 4096
+
+#define SYMV_P	16
+
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+
+#endif
 #ifdef C910V
 #define GEMM_DEFAULT_OFFSET_A 0
 #define GEMM_DEFAULT_OFFSET_B 0
@@ -3077,6 +3130,84 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #endif
 
+#ifdef RISCV64_ZVL128B
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
+
+#define SGEMM_DEFAULT_UNROLL_M 8
+#define SGEMM_DEFAULT_UNROLL_N 8
+
+#define DGEMM_DEFAULT_UNROLL_M 8
+#define DGEMM_DEFAULT_UNROLL_N 4
+
+#define CGEMM_DEFAULT_UNROLL_M 8
+#define CGEMM_DEFAULT_UNROLL_N 4
+
+#define ZGEMM_DEFAULT_UNROLL_M 4
+#define ZGEMM_DEFAULT_UNROLL_N 4
+
+#define SGEMM_DEFAULT_P 128
+#define DGEMM_DEFAULT_P 128
+#define CGEMM_DEFAULT_P 96
+#define ZGEMM_DEFAULT_P 64
+
+#define SGEMM_DEFAULT_Q 240
+#define DGEMM_DEFAULT_Q 120
+#define CGEMM_DEFAULT_Q 120
+#define ZGEMM_DEFAULT_Q 120
+
+#define SGEMM_DEFAULT_R 12288
+#define DGEMM_DEFAULT_R 8192
+#define CGEMM_DEFAULT_R 4096
+#define ZGEMM_DEFAULT_R 4096
+
+#define SYMV_P 16
+
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+
+#endif
+
+#ifdef RISCV64_ZVL256B
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+#define GEMM_DEFAULT_ALIGN 0x03fffUL
+
+#define SGEMM_DEFAULT_UNROLL_M  16
+#define SGEMM_DEFAULT_UNROLL_N  8
+
+#define DGEMM_DEFAULT_UNROLL_M  8
+#define DGEMM_DEFAULT_UNROLL_N  8
+
+#define CGEMM_DEFAULT_UNROLL_M  8
+#define CGEMM_DEFAULT_UNROLL_N  8
+
+#define ZGEMM_DEFAULT_UNROLL_M  8
+#define ZGEMM_DEFAULT_UNROLL_N  4
+
+#define SGEMM_DEFAULT_P 128
+#define DGEMM_DEFAULT_P 64
+#define CGEMM_DEFAULT_P 64
+#define ZGEMM_DEFAULT_P 64
+
+#define SGEMM_DEFAULT_Q 128
+#define DGEMM_DEFAULT_Q 128
+#define CGEMM_DEFAULT_Q 128
+#define ZGEMM_DEFAULT_Q 64
+
+#define SGEMM_DEFAULT_R 16384
+#define DGEMM_DEFAULT_R 8192
+#define CGEMM_DEFAULT_R 8192
+#define ZGEMM_DEFAULT_R 4096
+
+#define SYMV_P 16
+
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+
+#endif
+
 #ifdef ARMV7
 #define SNUMOPT		2
 #define DNUMOPT		2
@@ -3338,6 +3469,12 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
 
 #elif defined(NEOVERSEN1)
 
+#if defined(XDOUBLE) || defined(DOUBLE)
+#define SWITCH_RATIO            8
+#else
+#define SWITCH_RATIO            16
+#endif
+
 #define SGEMM_DEFAULT_UNROLL_M  16
 #define SGEMM_DEFAULT_UNROLL_N  4
 
@@ -3350,13 +3487,13 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
 #define ZGEMM_DEFAULT_UNROLL_M  4
 #define ZGEMM_DEFAULT_UNROLL_N  4
 
-#define SGEMM_DEFAULT_P	128
-#define DGEMM_DEFAULT_P	160
+#define SGEMM_DEFAULT_P	240
+#define DGEMM_DEFAULT_P	240
 #define CGEMM_DEFAULT_P 128
 #define ZGEMM_DEFAULT_P 128
 
-#define SGEMM_DEFAULT_Q 352
-#define DGEMM_DEFAULT_Q 128
+#define SGEMM_DEFAULT_Q 640
+#define DGEMM_DEFAULT_Q 320
 #define CGEMM_DEFAULT_Q 224
 #define ZGEMM_DEFAULT_Q 112
 
@@ -3365,29 +3502,35 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
 #define CGEMM_DEFAULT_R 4096
 #define ZGEMM_DEFAULT_R 4096
 
-#elif defined(NEOVERSEV1)
+#elif defined(NEOVERSEV1) // 256-bit SVE
 
-#define SWITCH_RATIO  16
+#if defined(XDOUBLE) || defined(DOUBLE)
+#define SWITCH_RATIO            8
+#else
+#define SWITCH_RATIO            16
+#endif
 
 #define SGEMM_DEFAULT_UNROLL_M  16
-#define SGEMM_DEFAULT_UNROLL_N  4
+#define SGEMM_DEFAULT_UNROLL_N  8
 
-#define DGEMM_DEFAULT_UNROLL_M  8
-#define DGEMM_DEFAULT_UNROLL_N  4
+#define DGEMM_DEFAULT_UNROLL_M  4 // Actually 2VL (8) but kept separate to keep copies separate
+#define DGEMM_DEFAULT_UNROLL_N  8
 
-#define CGEMM_DEFAULT_UNROLL_M  8
+#define CGEMM_DEFAULT_UNROLL_M  2
 #define CGEMM_DEFAULT_UNROLL_N  4
+#define CGEMM_DEFAULT_UNROLL_MN 16
 
-#define ZGEMM_DEFAULT_UNROLL_M  4
+#define ZGEMM_DEFAULT_UNROLL_M  2
 #define ZGEMM_DEFAULT_UNROLL_N  4
+#define ZGEMM_DEFAULT_UNROLL_MN 16
 
-#define SGEMM_DEFAULT_P 128
-#define DGEMM_DEFAULT_P 160
+#define SGEMM_DEFAULT_P 240
+#define DGEMM_DEFAULT_P 240
 #define CGEMM_DEFAULT_P 128
 #define ZGEMM_DEFAULT_P 128
 
-#define SGEMM_DEFAULT_Q 352
-#define DGEMM_DEFAULT_Q 128
+#define SGEMM_DEFAULT_Q 640
+#define DGEMM_DEFAULT_Q 320
 #define CGEMM_DEFAULT_Q 224
 #define ZGEMM_DEFAULT_Q 112
 
@@ -3398,6 +3541,12 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
 
 #elif defined(NEOVERSEN2)
 
+#if defined(XDOUBLE) || defined(DOUBLE)
+#define SWITCH_RATIO            8
+#else
+#define SWITCH_RATIO            16
+#endif
+
 #undef SBGEMM_ALIGN_K
 #define SBGEMM_ALIGN_K 4
 
@@ -3433,7 +3582,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
 #define CGEMM_DEFAULT_R 4096
 #define ZGEMM_DEFAULT_R 4096
 
-#elif defined(ARMV8SVE) || defined(A64FX) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2)
+#elif defined(A64FX) // 512-bit SVE
 
 /* When all BLAS3 routines are implemeted with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl".
 Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions seperated. */
@@ -3474,6 +3623,43 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout
 #define CGEMM_DEFAULT_R 4096
 #define ZGEMM_DEFAULT_R 4096
 
+#elif defined(ARMV8SVE) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) // 128-bit SVE
+
+#if defined(XDOUBLE) || defined(DOUBLE)
+#define SWITCH_RATIO            8
+#else
+#define SWITCH_RATIO            16
+#endif
+
+#define SGEMM_DEFAULT_UNROLL_M  4 // Actually 1VL (8) but kept separate to keep copies separate
+#define SGEMM_DEFAULT_UNROLL_N  8
+
+#define DGEMM_DEFAULT_UNROLL_M  4
+#define DGEMM_DEFAULT_UNROLL_N  8
+
+#define CGEMM_DEFAULT_UNROLL_M  2
+#define CGEMM_DEFAULT_UNROLL_N  4
+#define CGEMM_DEFAULT_UNROLL_MN  16
+
+#define ZGEMM_DEFAULT_UNROLL_M  2
+#define ZGEMM_DEFAULT_UNROLL_N  4
+#define ZGEMM_DEFAULT_UNROLL_MN  16
+
+#define SGEMM_DEFAULT_P 128
+#define DGEMM_DEFAULT_P 160
+#define CGEMM_DEFAULT_P 128
+#define ZGEMM_DEFAULT_P 128
+
+#define SGEMM_DEFAULT_Q 352
+#define DGEMM_DEFAULT_Q 128
+#define CGEMM_DEFAULT_Q 224
+#define ZGEMM_DEFAULT_Q 112
+
+#define SGEMM_DEFAULT_R 4096
+#define DGEMM_DEFAULT_R 4096
+#define CGEMM_DEFAULT_R 4096
+#define ZGEMM_DEFAULT_R 4096
+
 #else /* Other/undetected ARMv8 cores */
 
 #define SGEMM_DEFAULT_UNROLL_M  16
@@ -3749,7 +3935,44 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout
 #define SYMV_P	16
 #endif
 
+#if defined(CSKY) || defined(CK860FV)
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
+
+#define SGEMM_DEFAULT_UNROLL_M  2
+#define SGEMM_DEFAULT_UNROLL_N  2
+
+#define DGEMM_DEFAULT_UNROLL_M  2
+#define DGEMM_DEFAULT_UNROLL_N  2
 
+#define CGEMM_DEFAULT_UNROLL_M  2
+#define CGEMM_DEFAULT_UNROLL_N  2
+
+#define ZGEMM_DEFAULT_UNROLL_M  2
+#define ZGEMM_DEFAULT_UNROLL_N  2
+
+#define SGEMM_DEFAULT_P	128
+#define DGEMM_DEFAULT_P	128
+#define CGEMM_DEFAULT_P 96
+#define ZGEMM_DEFAULT_P 64
+
+#define SGEMM_DEFAULT_Q 240
+#define DGEMM_DEFAULT_Q 120
+#define CGEMM_DEFAULT_Q 120
+#define ZGEMM_DEFAULT_Q 120
+
+#define SGEMM_DEFAULT_R 12288
+#define DGEMM_DEFAULT_R 8192
+#define CGEMM_DEFAULT_R 4096
+#define ZGEMM_DEFAULT_R 4096
+
+#define SYMV_P	16
+
+#define GEMM_DEFAULT_OFFSET_A 0
+#define GEMM_DEFAULT_OFFSET_B 0
+
+#endif
 
 #ifdef GENERIC
 
@@ -3838,6 +4061,10 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout
 
 #endif
 
+#ifndef SWITCH_RATIO
+#define SWITCH_RATIO 2
+#endif
+
 #ifndef QGEMM_DEFAULT_UNROLL_M
 #define QGEMM_DEFAULT_UNROLL_M 2
 #endif
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index d68b12d87..4ebd5348c 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -22,9 +22,22 @@ if (BUILD_COMPLEX16)
 	list (APPEND OpenBLAS_Tests zblat1 zblat2 zblat3)
 endif()
 
+if (USE_GEMM3M)
+	if (BUILD_COMPLEX)
+		list (APPEND OpenBLAS_Tests cblat3_3m)
+	endif ()
+	if (BUILD_COMPLEX16)
+		list (APPEND OpenBLAS_Tests zblat3_3m)
+	endif ()
+endif ()
+
 foreach(test_bin ${OpenBLAS_Tests})
 	add_executable(${test_bin} ${test_bin}.f)
 target_link_libraries(${test_bin} ${OpenBLAS_LIBNAME})
+if (USE_OPENMP AND (${CMAKE_Fortran_COMPILER_ID} STREQUAL GNU) AND (${CMAKE_C_COMPILER_ID} STREQUAL Clang))
+	string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
+target_link_libraries(${test_bin} omp pthread)
+endif()
 endforeach()
 
 # $1 exec, $2 input, $3 output_result
@@ -78,4 +91,10 @@ add_test(NAME "${float_type}blas2"
   COMMAND ${helper_prefix} $<TARGET_FILE:${float_type}blat2> "${PROJECT_SOURCE_DIR}/test/${float_type}blat2.dat" ${float_type_upper}BLAT2.SUMM)
 add_test(NAME "${float_type}blas3" 
   COMMAND ${helper_prefix} $<TARGET_FILE:${float_type}blat3> "${PROJECT_SOURCE_DIR}/test/${float_type}blat3.dat" ${float_type_upper}BLAT3.SUMM)
+if (USE_GEMM3M)
+if ((${float_type} STREQUAL "c") OR (${float_type} STREQUAL "z"))
+add_test(NAME "${float_type}blas3_3m" 
+  COMMAND ${helper_prefix} $<TARGET_FILE:${float_type}blat3_3m> "${PROJECT_SOURCE_DIR}/test/${float_type}blat3_3m.dat" ${float_type_upper}BLAT3_3M.SUMM)
+endif()
+endif()
 endforeach()
diff --git a/test/Makefile b/test/Makefile
index 923f1537c..6a50b6c98 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -4,6 +4,24 @@ ifeq ($(F_COMPILER),GFORTRAN)
         override FFLAGS += -fno-tree-vectorize
 endif
 
+SUPPORT_GEMM3M = 0
+
+ifeq ($(ARCH), x86)
+SUPPORT_GEMM3M = 1
+endif
+
+ifeq ($(ARCH), x86_64)
+SUPPORT_GEMM3M = 1
+endif
+
+ifeq ($(ARCH), ia64)
+SUPPORT_GEMM3M = 1
+endif
+
+ifeq ($(ARCH), MIPS)
+SUPPORT_GEMM3M = 1
+endif
+
 ifeq ($(NOFORTRAN),1)
 all ::
 else
@@ -153,11 +171,20 @@ ifeq ($(BUILD_DOUBLE),1)
 D3=dblat3
 endif
 ifeq ($(BUILD_COMPLEX),1)
+ifeq ($(SUPPORT_GEMM3M),1)
+C3=cblat3 cblat3_3m
+else
 C3=cblat3
 endif
+endif
 ifeq ($(BUILD_COMPLEX16),1)
+ifeq ($(SUPPORT_GEMM3M),1)
+Z3=zblat3 zblat3_3m
+else
 Z3=zblat3
 endif
+endif
+
 
 level3: $(B3) $(S3) $(D3) $(C3) $(Z3)
 
@@ -260,17 +287,26 @@ endif
 
 
 FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
+CLDFLAGS = $(CFLAGS) $(LDFLAGS)
 
 
 ifeq ($(USE_OPENMP), 1)
 ifeq ($(F_COMPILER), GFORTRAN)
 ifeq ($(C_COMPILER), CLANG)
-CEXTRALIB = -lomp
+CEXTRALIB += -lomp
 endif
 endif
 ifeq ($(F_COMPILER), NAG)
 CEXTRALIB = -lgomp
 endif
+ifeq ($(F_COMPILER), IBM)
+ifeq ($(C_COMPILER), GCC)
+CEXTRALIB += -lgomp
+endif
+ifeq ($(C_COMPILER), CLANG)
+CEXTRALIB += -lomp
+endif
+endif
 endif
 
 ifeq ($(BUILD_SINGLE),1)
@@ -326,7 +362,7 @@ endif
 
 ifeq ($(BUILD_BFLOAT16),1)
 test_sbgemm : compare_sgemm_sbgemm.c ../$(LIBNAME)
-	$(FC) $(FLDFLAGS) -o test_sbgemm compare_sgemm_sbgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB)
+	$(CC) $(CLDFLAGS) -o test_sbgemm compare_sgemm_sbgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB)
 endif
 
 ifeq ($(BUILD_COMPLEX),1)
diff --git a/test/compare_sgemm_sbgemm.c b/test/compare_sgemm_sbgemm.c
index 276fecae9..bc74233ab 100644
--- a/test/compare_sgemm_sbgemm.c
+++ b/test/compare_sgemm_sbgemm.c
@@ -32,7 +32,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 typedef union
 {
   unsigned short v;
+#if defined(_AIX)
+  struct __attribute__((packed))
+#else
   struct
+#endif
   {
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
     unsigned short s:1;
@@ -49,7 +53,11 @@ typedef union
 typedef union
 {
   float v;
+#if defined(_AIX)
+  struct __attribute__((packed))
+#else
   struct
+#endif
   {
 #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
     uint32_t s:1;
@@ -92,6 +100,8 @@ main (int argc, char *argv[])
       float C[m * n];
       bfloat16_bits AA[m * k], BB[k * n];
       float DD[m * n], CC[m * n];
+      bfloat16 atmp,btmp;
+      blasint one=1;
 
       for (j = 0; j < m; j++)
 	{
@@ -100,16 +110,18 @@ main (int argc, char *argv[])
 	      A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5;
 	      B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5;
 	      C[j * k + i] = 0;
-	      AA[j * k + i].v = *(uint32_t *) & A[j * k + i] >> 16;
-	      BB[j * k + i].v = *(uint32_t *) & B[j * k + i] >> 16;
+	      sbstobf16_(&one, &A[j*k+i], &one, &atmp, &one);
+	      sbstobf16_(&one, &B[j*k+i], &one, &btmp, &one);
+	      AA[j * k + i].v = atmp;
+	      BB[j * k + i].v = btmp;
 	      CC[j * k + i] = 0;
 	      DD[j * k + i] = 0;
 	    }
 	}
       SGEMM (&transA, &transB, &m, &n, &k, &alpha, A,
 	     &m, B, &k, &beta, C, &m);
-      SBGEMM (&transA, &transB, &m, &n, &k, &alpha, AA,
-	      &m, BB, &k, &beta, CC, &m);
+      SBGEMM (&transA, &transB, &m, &n, &k, &alpha, (bfloat16*) AA,
+	      &m, (bfloat16*)BB, &k, &beta, CC, &m);
       for (i = 0; i < n; i++)
 	for (j = 0; j < m; j++)
 	    if (fabs (CC[i * m + j] - C[i * m + j]) > 1.0)
diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt
index 2e32827d3..4771d8a27 100644
--- a/utest/CMakeLists.txt
+++ b/utest/CMakeLists.txt
@@ -15,9 +15,76 @@ else ()
     test_dsdot.c
     test_dnrm2.c
     test_swap.c
+    test_zscal.c
+    test_amin.c
+    test_axpby.c
   )
 endif ()
 
+
+set(DIR_EXT test_extensions)
+set(OpenBLAS_utest_ext_src
+utest_main.c
+${DIR_EXT}/common.c
+${DIR_EXT}/xerbla.c
+${DIR_EXT}/test_isamin.c 
+${DIR_EXT}/test_idamin.c 
+${DIR_EXT}/test_icamin.c 
+${DIR_EXT}/test_izamin.c 
+${DIR_EXT}/test_ssum.c 
+${DIR_EXT}/test_dsum.c 
+${DIR_EXT}/test_scsum.c 
+${DIR_EXT}/test_dzsum.c
+${DIR_EXT}/test_samin.c
+${DIR_EXT}/test_damin.c
+${DIR_EXT}/test_scamin.c
+${DIR_EXT}/test_dzamin.c
+${DIR_EXT}/test_scamax.c
+${DIR_EXT}/test_dzamax.c
+${DIR_EXT}/test_zrotg.c
+${DIR_EXT}/test_crotg.c
+${DIR_EXT}/test_drotmg.c
+${DIR_EXT}/test_srotmg.c
+${DIR_EXT}/test_zscal.c
+${DIR_EXT}/test_cscal.c
+${DIR_EXT}/test_domatcopy.c
+${DIR_EXT}/test_somatcopy.c
+${DIR_EXT}/test_zomatcopy.c
+${DIR_EXT}/test_comatcopy.c
+${DIR_EXT}/test_simatcopy.c
+${DIR_EXT}/test_dimatcopy.c
+${DIR_EXT}/test_cimatcopy.c
+${DIR_EXT}/test_zimatcopy.c
+${DIR_EXT}/test_sgeadd.c
+${DIR_EXT}/test_dgeadd.c
+${DIR_EXT}/test_cgeadd.c
+${DIR_EXT}/test_zgeadd.c
+${DIR_EXT}/test_saxpby.c
+${DIR_EXT}/test_daxpby.c
+${DIR_EXT}/test_caxpby.c
+${DIR_EXT}/test_zaxpby.c
+${DIR_EXT}/test_caxpyc.c
+${DIR_EXT}/test_zaxpyc.c
+${DIR_EXT}/test_cgemv_t.c
+${DIR_EXT}/test_zgemv_t.c
+${DIR_EXT}/test_cgemv_n.c
+${DIR_EXT}/test_zgemv_n.c
+${DIR_EXT}/test_crot.c
+${DIR_EXT}/test_zrot.c
+${DIR_EXT}/test_cgbmv.c
+${DIR_EXT}/test_zgbmv.c
+${DIR_EXT}/test_dgemmt.c
+${DIR_EXT}/test_sgemmt.c
+${DIR_EXT}/test_cgemmt.c
+${DIR_EXT}/test_zgemmt.c
+${DIR_EXT}/test_ztrmv.c
+${DIR_EXT}/test_ctrmv.c
+${DIR_EXT}/test_ztrsv.c
+${DIR_EXT}/test_ctrsv.c
+${DIR_EXT}/test_zgemm.c
+${DIR_EXT}/test_cgemm.c
+)
+
 # crashing on travis cl with an error code suggesting resource not found
 if (NOT MSVC)
 set(OpenBLAS_utest_src
@@ -46,6 +113,13 @@ set(OpenBLAS_utest_src
   ${OpenBLAS_utest_src}
   test_potrs.c
   )
+set(OpenBLAS_utest_ext_src
+  ${OpenBLAS_utest_ext_src}
+  ${DIR_EXT}/test_cspmv.c
+  ${DIR_EXT}/test_zspmv.c
+  ${DIR_EXT}/test_csbmv.c
+  ${DIR_EXT}/test_zsbmv.c
+  )
 if (NOT NO_CBLAS AND NOT NO_LAPACKE)
 set(OpenBLAS_utest_src
   ${OpenBLAS_utest_src}
@@ -57,7 +131,11 @@ endif()
 set(OpenBLAS_utest_bin openblas_utest)
 add_executable(${OpenBLAS_utest_bin} ${OpenBLAS_utest_src})
 
+set(OpenBLAS_utest_ext_bin openblas_utest_ext)
+add_executable(${OpenBLAS_utest_ext_bin} ${OpenBLAS_utest_ext_src})
+
 target_link_libraries(${OpenBLAS_utest_bin} ${OpenBLAS_LIBNAME})
+target_link_libraries(${OpenBLAS_utest_ext_bin} ${OpenBLAS_LIBNAME})
 
 if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX"  )
 target_link_libraries(${OpenBLAS_utest_bin} m)
@@ -82,3 +160,4 @@ add_custom_command(TARGET ${OpenBLAS_utest_bin}
 endif()
 
 add_test(${OpenBLAS_utest_bin} ${CMAKE_CURRENT_BINARY_DIR}/${OpenBLAS_utest_bin})
+add_test(${OpenBLAS_utest_ext_bin} ${CMAKE_CURRENT_BINARY_DIR}/${OpenBLAS_utest_ext_bin})
diff --git a/utest/Makefile b/utest/Makefile
index f99035440..0e3f2b8a7 100644
--- a/utest/Makefile
+++ b/utest/Makefile
@@ -1,21 +1,39 @@
 UTEST_CHECK = 1
 TOPDIR	= ..
+DIR_EXT=test_extensions
 
 override TARGET_ARCH=
 override TARGET_MACH=
 
 UTESTBIN=openblas_utest
+UTESTEXTBIN=openblas_utest_ext
 
 .PHONY : all
-.NOTPARALLEL : all run_test $(UTESTBIN)
+.NOTPARALLEL : all run_test $(UTESTBIN) $(UTESTEXTBIN)
 
 include $(TOPDIR)/Makefile.system
 
-OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o
+OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o test_zscal.o \
+     test_amin.o test_axpby.o
 #test_rot.o test_swap.o test_axpy.o test_dotu.o test_dsdot.o test_fork.o
+OBJS_EXT=utest_main.o $(DIR_EXT)/xerbla.o $(DIR_EXT)/common.o 
+OBJS_EXT+=$(DIR_EXT)/test_isamin.o $(DIR_EXT)/test_idamin.o $(DIR_EXT)/test_icamin.o $(DIR_EXT)/test_izamin.o 
+OBJS_EXT+=$(DIR_EXT)/test_ssum.o $(DIR_EXT)/test_dsum.o $(DIR_EXT)/test_scsum.o $(DIR_EXT)/test_dzsum.o
+OBJS_EXT+=$(DIR_EXT)/test_saxpby.o $(DIR_EXT)/test_daxpby.o $(DIR_EXT)/test_caxpby.o $(DIR_EXT)/test_zaxpby.o $(DIR_EXT)/test_zaxpyc.o $(DIR_EXT)/test_caxpyc.o
+OBJS_EXT+=$(DIR_EXT)/test_samin.o $(DIR_EXT)/test_damin.o $(DIR_EXT)/test_scamin.o $(DIR_EXT)/test_dzamin.o $(DIR_EXT)/test_scamax.o $(DIR_EXT)/test_dzamax.o
+OBJS_EXT+=$(DIR_EXT)/test_drotmg.o $(DIR_EXT)/test_srotmg.o $(DIR_EXT)/test_zrotg.o $(DIR_EXT)/test_crotg.o $(DIR_EXT)/test_crot.o $(DIR_EXT)/test_zrot.o
+OBJS_EXT+=$(DIR_EXT)/test_zscal.o $(DIR_EXT)/test_cscal.o
+OBJS_EXT+=$(DIR_EXT)/test_domatcopy.o $(DIR_EXT)/test_somatcopy.o $(DIR_EXT)/test_zomatcopy.o $(DIR_EXT)/test_comatcopy.o
+OBJS_EXT+=$(DIR_EXT)/test_simatcopy.o $(DIR_EXT)/test_dimatcopy.o $(DIR_EXT)/test_cimatcopy.o $(DIR_EXT)/test_zimatcopy.o
+OBJS_EXT+=$(DIR_EXT)/test_sgeadd.o $(DIR_EXT)/test_dgeadd.o $(DIR_EXT)/test_cgeadd.o $(DIR_EXT)/test_zgeadd.o
+OBJS_EXT+=$(DIR_EXT)/test_cgemv_t.o $(DIR_EXT)/test_zgemv_t.o $(DIR_EXT)/test_cgemv_n.o $(DIR_EXT)/test_zgemv_n.o
+OBJS_EXT+=$(DIR_EXT)/test_sgemmt.o $(DIR_EXT)/test_dgemmt.o $(DIR_EXT)/test_cgemmt.o $(DIR_EXT)/test_zgemmt.o
+OBJS_EXT+=$(DIR_EXT)/test_ztrmv.o $(DIR_EXT)/test_ctrmv.o $(DIR_EXT)/test_ztrsv.o $(DIR_EXT)/test_ctrsv.o
+OBJS_EXT+=$(DIR_EXT)/test_zgemm.o $(DIR_EXT)/test_cgemm.o $(DIR_EXT)/test_zgbmv.o $(DIR_EXT)/test_cgbmv.o
 
 ifneq ($(NO_LAPACK), 1)
 OBJS += test_potrs.o
+OBJS_EXT += $(DIR_EXT)/test_zspmv.o $(DIR_EXT)/test_cspmv.o $(DIR_EXT)/test_zsbmv.o $(DIR_EXT)/test_csbmv.o
 ifneq ($(NO_CBLAS), 1)
 ifneq ($(NO_LAPACKE), 1)
 OBJS += test_kernel_regress.o
@@ -44,15 +62,30 @@ endif
 
 all : run_test
 
+ifeq ($(OSNAME), AIX)
+ifeq ($(USE_OPENMP), 1)
+$(UTESTBIN): $(OBJS)
+	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB)
+else
+$(UTESTBIN): $(OBJS)
+	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB)
+endif
+else
 $(UTESTBIN): $(OBJS)
 	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB)
+endif
+
+$(UTESTEXTBIN): $(OBJS_EXT)
+	$(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB)
 
-run_test: $(UTESTBIN)
+run_test: $(UTESTBIN) $(UTESTEXTBIN)
 ifneq ($(CROSS), 1)
 	./$(UTESTBIN)
+	./$(UTESTEXTBIN)
 endif
 
 clean:
-	-rm -f *.o $(UTESTBIN)
+	-rm -f *.o $(UTESTBIN) $(UTESTEXTBIN)
+	-rm -f $(DIR_EXT)/*.o
 
-libs:
+libs:
\ No newline at end of file
diff --git a/utest/ctest.h b/utest/ctest.h
index 79961badf..b158b4538 100644
--- a/utest/ctest.h
+++ b/utest/ctest.h
@@ -41,7 +41,7 @@ typedef void (*RunWithDataFunc)(void*);
 struct ctest {
     const char* ssname;  // suite name
     const char* ttname;  // test name
-    void (*run)();
+    void (*run)(void);
     int skip;
 
     void* data;
@@ -159,9 +159,9 @@ struct ctest {
     void WEAK sname##_teardown(struct sname##_data* data)
 
 #define __CTEST_INTERNAL(sname, tname, _skip) \
-    void __FNAME(sname, tname)(); \
+    void __FNAME(sname, tname)(void); \
     __CTEST_STRUCT(sname, tname, _skip, NULL, NULL, NULL) \
-    void __FNAME(sname, tname)()
+    void __FNAME(sname, tname)(void)
 
 #ifdef __CTEST_APPLE
 #define SETUP_FNAME(sname) NULL
@@ -366,7 +366,7 @@ void __ctest_addTest(struct ctest *test)
 #ifndef __CTEST_MSVC
 /* Add all tests to linked list automatically.
  */
-static void __ctest_linkTests()
+static void __ctest_linkTests(void)
 {
     struct ctest ** test;
     struct ctest ** ctest_begin = (struct ctest **)__PNAME(suite, test);
@@ -401,7 +401,7 @@ static void __ctest_linkTests()
     __ctest_head_p = ctest_begin;
 }
 #else //for msvc
-static void __ctest_linkTests()
+static void __ctest_linkTests(void)
 {
     struct ctest ** ctest_start = __ctest_head_p;
     struct ctest ** test;
@@ -450,7 +450,7 @@ static void msg_start(const char* color, const char* title) {
     print_errormsg("  %s: ", title);
 }
 
-static void msg_end() {
+static void msg_end(void) {
     if (color_output) {
     	print_errormsg(ANSI_NORMAL);
     }
@@ -634,7 +634,7 @@ static int suite_test_filter(struct ctest* t) {
 
 
 #ifndef __CTEST_NO_TIME
-static uint64_t getCurrentTime() {
+static uint64_t getCurrentTime(void) {
     struct timeval now;
     gettimeofday(&now, NULL);
     uint64_t now64 = (uint64_t) now.tv_sec;
diff --git a/utest/test_amax.c b/utest/test_amax.c
index a9e5a1c85..e9775caf0 100644
--- a/utest/test_amax.c
+++ b/utest/test_amax.c
@@ -1,5 +1,5 @@
 /*****************************************************************************
-Copyright (c) 2011-2016, The OpenBLAS Project
+Copyright (c) 2011-2024, The OpenBLAS Project
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -13,9 +13,9 @@ met:
       notice, this list of conditions and the following disclaimer in
       the documentation and/or other materials provided with the
       distribution.
-   3. Neither the name of the OpenBLAS project nor the names of 
-      its contributors may be used to endorse or promote products 
-      derived from this software without specific prior written 
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
       permission.
 
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
@@ -57,4 +57,31 @@ CTEST(amax, damax){
   ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS);
 }
 #endif
+#ifdef BUILD_COMPLEX
+CTEST(amax, scamax){
+  blasint N = 9, inc = 1;
+  float te_max = 0.0, tr_max = 0.0;
+  float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8,
+	        -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6,
+		-7.7, 8.8 };
 
+  te_max = BLASFUNC(scamax)(&N, x, &inc);
+  tr_max = 20.0;
+
+  ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS);
+}
+#endif
+#ifdef BUILD_COMPLEX16
+CTEST(amax, dzamax){
+  blasint N = 9, inc = 1;
+  double te_max = 0.0, tr_max = 0.0;
+  double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8,
+	         -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6,
+		 -7.7, 8.8 };
+
+  te_max = BLASFUNC(dzamax)(&N, x, &inc);
+  tr_max = 20.0;
+
+  ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS);
+}
+#endif
diff --git a/utest/test_amin.c b/utest/test_amin.c
new file mode 100644
index 000000000..1305ab8ef
--- /dev/null
+++ b/utest/test_amin.c
@@ -0,0 +1,89 @@
+/*****************************************************************************
+Copyright (c) 2011-2024, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "openblas_utest.h"
+
+#ifdef BUILD_SINGLE
+CTEST(amin, samin){
+  blasint N = 3, inc = 1;
+  float te_min = 0.0, tr_min = 0.0;
+  float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8,
+	        -9.9 };
+
+  te_min = BLASFUNC(samin)(&N, x, &inc);
+  tr_min = 1.1;
+
+  ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS);
+}
+#endif
+#ifdef BUILD_DOUBLE
+CTEST(amin, damin){
+  blasint N = 3, inc = 1;
+  double te_min = 0.0, tr_min = 0.0;
+  double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8,
+	         -9.9 };
+
+  te_min = BLASFUNC(damin)(&N, x, &inc);
+  tr_min = 1.1;
+
+  ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), DOUBLE_EPS);
+}
+#endif
+#ifdef BUILD_COMPLEX
+CTEST(amin, scamin){
+  blasint N = 9, inc = 1;
+  float te_min = 0.0, tr_min = 0.0;
+  float x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8,
+	        -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6,
+		-7.7, 8.8 };
+
+  te_min = BLASFUNC(scamin)(&N, x, &inc);
+  tr_min = 3.3;
+
+  ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS);
+}
+#endif
+#ifdef BUILD_COMPLEX16
+CTEST(amin, dzamin){
+  blasint N = 9, inc = 1;
+  double te_min = 0.0, tr_min = 0.0;
+  double x[] = { -1.1, 2.2, -3.3, 4.4, -5.5, 6.6, -7.7, 8.8,
+	         -9.9, 10.10, -1.1, 2.2, -3.3, 4.4, -5.5, 6.6,
+		 -7.7, 8.8 };
+
+  te_min = BLASFUNC(dzamin)(&N, x, &inc);
+  tr_min = 3.3;
+
+  ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), DOUBLE_EPS);
+}
+#endif
diff --git a/utest/test_axpby.c b/utest/test_axpby.c
new file mode 100644
index 000000000..37ba8ad14
--- /dev/null
+++ b/utest/test_axpby.c
@@ -0,0 +1,320 @@
+/*****************************************************************************
+Copyright (c) 2011-2024, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "openblas_utest.h"
+
+#ifdef BUILD_SINGLE
+CTEST(axpby, saxpby_inc_0)
+{
+    /* Both strides are zero, so the reference data only changes y[0]. */
+    float alpha = 1.0, beta = 2.0;
+    blasint k, N = 9, incX = 0, incY = 0;
+    float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+    /* Expected state after the call: x untouched, y[0] == 1535. */
+    float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    float y2[] = { 1535.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+
+    BLASFUNC(saxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY);
+
+    for (k = 0; k < N; k++) {
+        ASSERT_DBL_NEAR_TOL(x2[k], x1[k], SINGLE_EPS);
+        ASSERT_DBL_NEAR_TOL(y2[k], y1[k], SINGLE_EPS);
+    }
+}
+
+CTEST(axpby, saxpby_inc_1)
+{
+    /* Unit strides: every one of the N elements of y is updated. */
+    float alpha = 0.25, beta = 0.75;
+    blasint k, N = 9, incX = 1, incY = 1;
+    float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+    /* Expected state: x untouched, y[k] = 0.25 * x[k] + 0.75 * y[k]. */
+    float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    float y2[] = { 1.75, 3.75, 5.75, 7.75, 1.75, 3.75, 5.75, 7.75, 9.75 };
+
+    BLASFUNC(saxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY);
+
+    for (k = 0; k < N; k++) {
+        ASSERT_DBL_NEAR_TOL(x2[k], x1[k], SINGLE_EPS);
+        ASSERT_DBL_NEAR_TOL(y2[k], y1[k], SINGLE_EPS);
+    }
+}
+
+CTEST(axpby, saxpby_inc_2)
+{
+    /* Stride 2 on both vectors: only even-indexed entries of y are updated. */
+    float alpha = 0.25, beta = 0.75;
+    blasint k, N = 9, incX = 2, incY = 2;
+    float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0,
+                   2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+    /* Expected state over all 2 * N stored values; x must stay untouched. */
+    float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    float y2[] = { 1.75, 4.00, 5.75, 8.00, 1.75, 4.00, 5.75, 8.00,
+                   9.75, 2.00, 3.75, 6.00, 7.75, 2.00, 3.75, 6.00,
+                   7.75, 10.00 };
+
+    BLASFUNC(saxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY);
+
+    for (k = 0; k < 2 * N; k++) {
+        ASSERT_DBL_NEAR_TOL(x2[k], x1[k], SINGLE_EPS);
+        ASSERT_DBL_NEAR_TOL(y2[k], y1[k], SINGLE_EPS);
+    }
+}
+#endif
+
+#ifdef BUILD_DOUBLE
+CTEST(axpby, daxpby_inc_0)
+{
+    /* Both strides are zero, so the reference data only changes y[0]. */
+    double alpha = 1.0, beta = 2.0;
+    blasint k, N = 9, incX = 0, incY = 0;
+    double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+    /* Expected state after the call: x untouched, y[0] == 1535. */
+    double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    double y2[] = { 1535.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+
+    BLASFUNC(daxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY);
+
+    for (k = 0; k < N; k++) {
+        ASSERT_DBL_NEAR_TOL(x2[k], x1[k], DOUBLE_EPS);
+        ASSERT_DBL_NEAR_TOL(y2[k], y1[k], DOUBLE_EPS);
+    }
+}
+
+CTEST(axpby, daxpby_inc_1)
+{
+    /* Unit strides: every one of the N elements of y is updated. */
+    double alpha = 0.25, beta = 0.75;
+    blasint k, N = 9, incX = 1, incY = 1;
+    double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+    /* Expected state: x untouched, y[k] = 0.25 * x[k] + 0.75 * y[k]. */
+    double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    double y2[] = { 1.75, 3.75, 5.75, 7.75, 1.75, 3.75, 5.75, 7.75, 9.75 };
+
+    BLASFUNC(daxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY);
+
+    for (k = 0; k < N; k++) {
+        ASSERT_DBL_NEAR_TOL(x2[k], x1[k], DOUBLE_EPS);
+        ASSERT_DBL_NEAR_TOL(y2[k], y1[k], DOUBLE_EPS);
+    }
+}
+
+CTEST(axpby, daxpby_inc_2)
+{
+    /* Stride 2 on both vectors: only even-indexed entries of y are updated. */
+    double alpha = 0.25, beta = 0.75;
+    blasint k, N = 9, incX = 2, incY = 2;
+    double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                    1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0,
+                    2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+    /* Expected state over all 2 * N stored values; x must stay untouched. */
+    double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                    1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    double y2[] = { 1.75, 4.00, 5.75, 8.00, 1.75, 4.00, 5.75, 8.00,
+                    9.75, 2.00, 3.75, 6.00, 7.75, 2.00, 3.75, 6.00,
+                    7.75, 10.00 };
+
+    BLASFUNC(daxpby)(&N, &alpha, x1, &incX, &beta, y1, &incY);
+
+    for (k = 0; k < 2 * N; k++) {
+        ASSERT_DBL_NEAR_TOL(x2[k], x1[k], DOUBLE_EPS);
+        ASSERT_DBL_NEAR_TOL(y2[k], y1[k], DOUBLE_EPS);
+    }
+}
+#endif
+
+#ifdef BUILD_COMPLEX
+CTEST(axpby, caxpby_inc_0)
+{
+    /* Zero strides: the reference data only changes the first complex y. */
+    float alpha[] = { 1.0, 2.0 }, beta[] = { 2.0, 1.0 };
+    blasint k, N = 9, incX = 0, incY = 0;
+    float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0,
+                   2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+    /* Expected state: x untouched, first complex y == 9355 - 8865i. */
+    float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    float y2[] = { 9355.0, -8865.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0,
+                   10.0, 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+
+    BLASFUNC(caxpby)(&N, alpha, x1, &incX, beta, y1, &incY);
+
+    for (k = 0; k < 2 * N; k++) {
+        ASSERT_DBL_NEAR_TOL(x2[k], x1[k], SINGLE_EPS);
+        ASSERT_DBL_NEAR_TOL(y2[k], y1[k], SINGLE_EPS);
+    }
+}
+
+CTEST(axpby, caxpby_inc_1)
+{
+    /* Unit strides: all N complex elements (2 * N floats) of y are updated. */
+    float alpha[] = { 0.25, 0.25 }, beta[] = { 0.75, 0.75 };
+    blasint k, N = 9, incX = 1, incY = 1;
+    float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0,
+                   2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+    /* Expected state: x untouched, y = alpha * x + beta * y per complex entry. */
+    float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    float y2[] = { -2.0, 5.5, -2.0, 13.5, -2.0, 5.5, -2.0, 13.5,
+                   8.0, 11.5, -2.0, 9.5, 6.0, 9.5, -2.0, 9.5, -2.0, 17.5 };
+
+    BLASFUNC(caxpby)(&N, alpha, x1, &incX, beta, y1, &incY);
+
+    for (k = 0; k < 2 * N; k++) {
+        ASSERT_DBL_NEAR_TOL(x2[k], x1[k], SINGLE_EPS);
+        ASSERT_DBL_NEAR_TOL(y2[k], y1[k], SINGLE_EPS);
+    }
+}
+
+CTEST(axpby, caxpby_inc_2)
+{
+    /* Stride 2 on both vectors: every other complex element of y is updated. */
+    blasint i, N = 9, incX = 2, incY = 2;
+    float alpha[] = { 0.25, 0.25 }, beta[] = { 0.75, 0.75 };
+    float x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    float y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0,
+                   2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0,
+                   2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0,
+                   2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+    /* Pass the scalars as pointers to their first float, not as float (*)[2]. */
+    BLASFUNC(caxpby)(&N, alpha, x1, &incX, beta, y1, &incY);
+    /* Expected state over all 4 * N stored floats; x must stay untouched. */
+    float x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                   1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    float y2[] = { -2.0, 5.5, 6.0, 8.0, -2.0, 5.5, 6.0, 8.0, 8.0,
+                   11.5, 4.0, 6.0, 6.0, 9.5, 4.0, 6.0, -2.0, 17.5,
+                   2.0, 4.0, -2.0, 13.5, 2.0, 4.0, -2.0, 13.5, 10.0,
+                   2.0, -2.0, 9.5, 8.0, 2.0, -2.0, 9.5, 8.0, 10.0 };
+
+    for(i = 0; i < 4 * N; i++){
+        ASSERT_DBL_NEAR_TOL(x2[i], x1[i], SINGLE_EPS);
+        ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS);
+    }
+}
+#endif
+
+#ifdef BUILD_COMPLEX16
+CTEST(axpby, zaxpby_inc_0)
+{
+    /* Zero strides: the reference data only changes the first complex y. */
+    double alpha[] = { 1.0, 2.0 }, beta[] = { 2.0, 1.0 };
+    blasint k, N = 9, incX = 0, incY = 0;
+    double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                    1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0,
+                    2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+    /* Expected state: x untouched, first complex y == 9355 - 8865i. */
+    double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                    1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    double y2[] = { 9355.0, -8865.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0,
+                    10.0, 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+
+    BLASFUNC(zaxpby)(&N, alpha, x1, &incX, beta, y1, &incY);
+
+    for (k = 0; k < 2 * N; k++) {
+        ASSERT_DBL_NEAR_TOL(x2[k], x1[k], DOUBLE_EPS);
+        ASSERT_DBL_NEAR_TOL(y2[k], y1[k], DOUBLE_EPS);
+    }
+}
+
+CTEST(axpby, zaxpby_inc_1)
+{
+    /* Unit strides: all N complex elements (2 * N doubles) of y are updated. */
+    double alpha[] = { 0.25, 0.25 }, beta[] = { 0.75, 0.75 };
+    blasint k, N = 9, incX = 1, incY = 1;
+    double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                    1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0,
+                    2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+    /* Expected state: x untouched, y = alpha * x + beta * y per complex entry. */
+    double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                    1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    double y2[] = { -2.0, 5.5, -2.0, 13.5, -2.0, 5.5, -2.0, 13.5,
+                    8.0, 11.5, -2.0, 9.5, 6.0, 9.5, -2.0, 9.5, -2.0, 17.5 };
+
+    BLASFUNC(zaxpby)(&N, alpha, x1, &incX, beta, y1, &incY);
+
+    for (k = 0; k < 2 * N; k++) {
+        ASSERT_DBL_NEAR_TOL(x2[k], x1[k], DOUBLE_EPS);
+        ASSERT_DBL_NEAR_TOL(y2[k], y1[k], DOUBLE_EPS);
+    }
+}
+
+CTEST(axpby, zaxpby_inc_2)
+{
+    /* Stride 2 on both vectors: every other complex element of y is updated. */
+    blasint i, N = 9, incX = 2, incY = 2;
+    double alpha[] = { 0.25, 0.25 }, beta[] = { 0.75, 0.75 };
+    double x1[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                    1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                    1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                    1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    double y1[] = { 2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0,
+                    2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0,
+                    2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0,
+                    2.0, 4.0, 6.0, 8.0, 2.0, 4.0, 6.0, 8.0, 10.0 };
+    /* Pass the scalars as pointers to their first double, not as double (*)[2]. */
+    BLASFUNC(zaxpby)(&N, alpha, x1, &incX, beta, y1, &incY);
+    /* Expected state over all 4 * N stored doubles; x must stay untouched. */
+    double x2[] = { 1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                    1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                    1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0,
+                    1.0, 3.0, 5.0, 7.0, 1.0, 3.0, 5.0, 7.0, 9.0 };
+    double y2[] = { -2.0, 5.5, 6.0, 8.0, -2.0, 5.5, 6.0, 8.0, 8.0,
+                    11.5, 4.0, 6.0, 6.0, 9.5, 4.0, 6.0, -2.0, 17.5,
+                    2.0, 4.0, -2.0, 13.5, 2.0, 4.0, -2.0, 13.5, 10.0,
+                    2.0, -2.0, 9.5, 8.0, 2.0, -2.0, 9.5, 8.0, 10.0 };
+
+    for(i = 0; i < 4 * N; i++){
+        ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS);
+        ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS);
+    }
+}
+#endif
diff --git a/utest/test_axpy.c b/utest/test_axpy.c
index 5fd7c1b04..26005e70f 100644
--- a/utest/test_axpy.c
+++ b/utest/test_axpy.c
@@ -74,6 +74,26 @@ CTEST(axpy,zaxpy_inc_0)
 		ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS);
 	}
 }
+
+CTEST(axpy,zaxpy_incx_0)
+{
+	/* incX=0: the same x[0] feeds every update while y advances by 1. */
+	blasint k, N=4, incX=0, incY=1;
+	double a[2]={0.25,0.5};
+	double x1[]={1.0,3.0,5.0,7.0,1.0,3.0,5.0,7.0};
+	double y1[]={2.0,4.0,6.0,8.0,2.0,4.0,6.0,8.0};
+	/* Reference: x untouched; each complex y grows by a*x[0] = -1.25+1.25i. */
+	double x2[]={1.0,3.0,5.0,7.0,1.0,3.0,5.0,7.0};
+	double y2[]={0.75,5.25,4.75,9.25,0.75,5.25,4.75,9.25};
+
+	//OpenBLAS
+	BLASFUNC(zaxpy)(&N,a,x1,&incX,y1,&incY);
+	for (k=0; k<2*N; k++) {
+		ASSERT_DBL_NEAR_TOL(x2[k], x1[k], DOUBLE_EPS);
+		ASSERT_DBL_NEAR_TOL(y2[k], y1[k], DOUBLE_EPS);
+	}
+}
+
 #endif
 
 #ifdef BUILD_SINGLE
@@ -116,5 +136,24 @@ CTEST(axpy,caxpy_inc_0)
 		ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS);
 	}
 }
+
+CTEST(axpy,caxpy_incx_0)
+{
+	/* incX=0: the same x[0] feeds every update while y advances by 1. */
+	blasint k, N=4, incX=0, incY=1;
+	float a[2]={0.25,0.5};
+	float x1[]={1.0,3.0,5.0,7.0,1.0,3.0,5.0,7.0};
+	float y1[]={2.0,4.0,6.0,8.0,2.0,4.0,6.0,8.0};
+	/* Reference (kept in double): each complex y grows by a*x[0] = -1.25+1.25i. */
+	double x2[]={1.0,3.0,5.0,7.0,1.0,3.0,5.0,7.0};
+	double y2[]={0.75,5.25,4.75,9.25,0.75,5.25,4.75,9.25};
+
+	//OpenBLAS
+	BLASFUNC(caxpy)(&N,a,x1,&incX,y1,&incY);
+	for (k=0; k<2*N; k++) {
+		ASSERT_DBL_NEAR_TOL(x2[k], x1[k], DOUBLE_EPS);
+		ASSERT_DBL_NEAR_TOL(y2[k], y1[k], DOUBLE_EPS);
+	}
+}
 #endif
 
diff --git a/utest/test_extensions/common.c b/utest/test_extensions/common.c
new file mode 100644
index 000000000..8a6a47795
--- /dev/null
+++ b/utest/test_extensions/common.c
@@ -0,0 +1,259 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "common.h"
+
+/**
+ * Fill alpha[0..n-1] with pseudo-random floats: rand() divided by RAND_MAX
+ */
+void srand_generate(float *alpha, blasint n)
+{
+    blasint idx = 0; /* next slot to fill */
+    while (idx < n)
+        alpha[idx++] = (float)rand() / (float)RAND_MAX;
+}
+
+void drand_generate(double *alpha, blasint n)
+{
+    blasint idx = 0; /* next slot to fill */
+    while (idx < n)  /* each slot receives rand() divided by RAND_MAX */
+        alpha[idx++] = (double)rand() / (double)RAND_MAX;
+}
+
+/**
+ * Measure how far matrix a is from matrix b.
+ * Both hold `rows` lines of `cols` values, consecutive lines being ld apart.
+ * NOTE: a is overwritten with (a - b). Returns the mean over lines of the
+ * 2-norm (cblas_snrm2) of each difference line; rows is the divisor.
+ */
+float smatrix_difference(float *a, float *b, blasint cols, blasint rows, blasint ld)
+{
+    blasint line, k;
+    float total = 0.0f;
+
+    for (line = 0; line < rows; line++)
+    {
+        /* start of the current line in each matrix */
+        float *a_line = a + line * ld;
+        float *b_line = b + line * ld;
+
+        /* a_line <- a_line - b_line, element by element */
+        for (k = 0; k < cols; k++)
+            a_line[k] -= b_line[k];
+
+        /* accumulate the 2-norm of the difference line (unit stride) */
+        total += cblas_snrm2(cols, a_line, 1);
+    }
+    return total/(float)(rows);
+}
+
+double dmatrix_difference(double *a, double *b, blasint cols, blasint rows, blasint ld)
+{
+    /* a is overwritten with (a - b), line by line (lines are ld apart); the
+     * result is the mean over lines of the 2-norm of each difference line. */
+    blasint line, k;
+    double total = 0.0;
+
+    for (line = 0; line < rows; line++)
+    {
+        /* start of the current line in each matrix */
+        double *a_line = a + line * ld;
+        double *b_line = b + line * ld;
+
+        for (k = 0; k < cols; k++)
+            a_line[k] -= b_line[k];
+
+        /* unit-stride 2-norm of the difference line */
+        total += cblas_dnrm2(cols, a_line, 1);
+    }
+
+    return total/(double)(rows);
+}
+
+/**
+ * Conjugate a single-precision complex vector in place
+ *
+ * param n number of complex elements to visit
+ * param inc_x stride between visited elements, counted in complex elements
+ * param x_ptr interleaved (re, im) buffer; only imaginary parts are negated
+ */
+void cconjugate_vector(blasint n, blasint inc_x, float *x_ptr)
+{
+    const blasint step = 2 * inc_x; /* stride in floats: two per element */
+    float *imag = x_ptr + 1;        /* imaginary part of the current element */
+    blasint left;
+    for (left = n; left > 0; left--)
+    {
+        *imag *= (-1.0f);
+        imag += step;
+    }
+}
+
+void zconjugate_vector(blasint n, blasint inc_x, double *x_ptr)
+{
+    /* In-place conjugation: negate the imaginary part of n complex elements. */
+    const blasint step = 2 * inc_x; /* stride in doubles: two per element */
+    double *imag = x_ptr + 1;       /* imaginary part of the current element */
+    blasint left;
+    for (left = n; left > 0; left--) {
+        *imag *= (-1.0);
+        imag += step;
+    }
+}
+
+/**
+ * Scaled out-of-place transpose: a_dst[c][r] = alpha * a_src[r][c]
+ *
+ * param rows number of rows of the source matrix A
+ * param cols number of columns of the source matrix A
+ * param alpha scaling factor applied to every element
+ * param a_src buffer holding the input matrix A
+ * param lda_src leading dimension (line stride) of the input matrix
+ * param a_dst buffer receiving the transposed, scaled matrix
+ * param lda_dst leading dimension (line stride) of the output matrix
+ */
+void stranspose(blasint rows, blasint cols, float alpha, float *a_src, int lda_src,
+                float *a_dst, blasint lda_dst)
+{
+    blasint r, c;
+    for (c = 0; c != cols; c++)
+    {
+        for (r = 0; r != rows; r++)
+            a_dst[c*lda_dst+r] = alpha*a_src[r*lda_src+c];
+    }
+}
+
+void dtranspose(blasint rows, blasint cols, double alpha, double *a_src, int lda_src,
+                double *a_dst, blasint lda_dst)
+{
+    /* Scaled out-of-place transpose: a_dst[c][r] = alpha * a_src[r][c]. */
+    blasint r, c;
+    for (c = 0; c != cols; c++) {
+        for (r = 0; r != rows; r++)
+            a_dst[c*lda_dst+r] = alpha*a_src[r*lda_src+c];
+    }
+}
+
+void ctranspose(blasint rows, blasint cols, float *alpha, float *a_src, int lda_src,
+                float *a_dst, blasint lda_dst, int conj)
+{
+    /* Complex scaled transpose on interleaved (re, im) floats; judging by the
+     * signs below, conj = -1 yields alpha*A^T and conj = 1 alpha*conj(A)^T. */
+    blasint r, c;
+    lda_src *= 2; lda_dst *= 2; /* leading dimensions counted in floats */
+    for (c = 0; c != cols; c++) {
+        for (r = 0; r != rows; r++) {
+            a_dst[c*lda_dst+2*r] = alpha[0] * a_src[r*lda_src+2*c] + conj * alpha[1] * a_src[r*lda_src+2*c+1];
+            a_dst[c*lda_dst+2*r+1] = (-1.0f) * conj * alpha[0] * a_src[r*lda_src+2*c+1] + alpha[1] * a_src[r*lda_src+2*c];
+        }
+    }
+}
+
+void ztranspose(blasint rows, blasint cols, double *alpha, double *a_src, int lda_src,
+                double *a_dst, blasint lda_dst, int conj)
+{
+    /* Complex scaled transpose on interleaved (re, im) doubles; judging by the
+     * signs below, conj = -1 yields alpha*A^T and conj = 1 alpha*conj(A)^T. */
+    blasint r, c;
+    lda_src *= 2; lda_dst *= 2; /* leading dimensions counted in doubles */
+    for (c = 0; c != cols; c++) {
+        for (r = 0; r != rows; r++) {
+            a_dst[c*lda_dst+2*r] = alpha[0] * a_src[r*lda_src+2*c] + conj * alpha[1] * a_src[r*lda_src+2*c+1];
+            a_dst[c*lda_dst+2*r+1] = (-1.0) * conj * alpha[0] * a_src[r*lda_src+2*c+1] + alpha[1] * a_src[r*lda_src+2*c];
+        }
+    }
+}
+
+/**
+ * Scaled out-of-place copy: a_dst[r][c] = alpha * a_src[r][c]
+ *
+ * param rows number of rows of A
+ * param cols number of columns of A
+ * param alpha scaling factor applied to every element
+ * param a_src buffer holding the input matrix A
+ * param lda_src leading dimension (line stride) of the input matrix
+ * param a_dst buffer receiving the scaled copy
+ * param lda_dst leading dimension (line stride) of the output matrix
+ * param conj not taken by this real variant (complex variants only)
+ */
+void scopy(blasint rows, blasint cols, float alpha, float *a_src, int lda_src,
+           float *a_dst, blasint lda_dst)
+{
+    blasint r, c;
+    for (r = 0; r != rows; r++)
+    {
+        for (c = 0; c != cols; c++)
+            a_dst[r*lda_dst+c] = alpha*a_src[r*lda_src+c];
+    }
+}
+
+void dcopy(blasint rows, blasint cols, double alpha, double *a_src, int lda_src,
+           double *a_dst, blasint lda_dst)
+{
+    /* Scaled out-of-place copy: a_dst[r][c] = alpha * a_src[r][c]. */
+    blasint r, c;
+    for (r = 0; r != rows; r++) {
+        for (c = 0; c != cols; c++)
+            a_dst[r*lda_dst+c] = alpha*a_src[r*lda_src+c];
+    }
+}
+
+void ccopy(blasint rows, blasint cols, float *alpha, float *a_src, int lda_src,
+           float *a_dst, blasint lda_dst, int conj)
+{
+    /* Complex scaled copy on interleaved (re, im) floats; judging by the
+     * signs below, conj = -1 yields alpha*A and conj = 1 alpha*conj(A). */
+    blasint r, c;
+    lda_src *= 2; lda_dst *= 2; /* leading dimensions counted in floats */
+    for (r = 0; r != rows; r++) {
+        for (c = 0; c != cols; c++) {
+            a_dst[r*lda_dst+2*c] = alpha[0] * a_src[r*lda_src+2*c] + conj * alpha[1] * a_src[r*lda_src+2*c+1];
+            a_dst[r*lda_dst+2*c+1] = (-1.0f) * conj *alpha[0] * a_src[r*lda_src+2*c+1] + alpha[1] * a_src[r*lda_src+2*c];
+        }
+    }
+}
+
+void zcopy(blasint rows, blasint cols, double *alpha, double *a_src, int lda_src,
+           double *a_dst, blasint lda_dst, int conj)
+{
+    /* Complex scaled copy on interleaved (re, im) doubles; judging by the
+     * signs below, conj = -1 yields alpha*A and conj = 1 alpha*conj(A). */
+    blasint r, c;
+    lda_src *= 2; lda_dst *= 2; /* leading dimensions counted in doubles */
+    for (r = 0; r != rows; r++) {
+        for (c = 0; c != cols; c++) {
+            a_dst[r*lda_dst+2*c] = alpha[0] * a_src[r*lda_src+2*c] + conj * alpha[1] * a_src[r*lda_src+2*c+1];
+            a_dst[r*lda_dst+2*c+1] = (-1.0) * conj *alpha[0] * a_src[r*lda_src+2*c+1] + alpha[1] * a_src[r*lda_src+2*c];
+        }
+    }
+}
\ No newline at end of file
diff --git a/utest/test_extensions/common.h b/utest/test_extensions/common.h
new file mode 100644
index 000000000..62b84325c
--- /dev/null
+++ b/utest/test_extensions/common.h
@@ -0,0 +1,76 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#ifndef _TEST_EXTENSION_COMMON_H_
+#define _TEST_EXTENSION_COMMON_H_
+/* Shared constants and helper declarations for the test_extensions tests. */
+#include <cblas.h>
+#include <ctype.h>
+/* Generic flag values and tolerances (presumably comparison thresholds; usage not shown here). */
+#define TRUE 1
+#define FALSE 0
+#define INVALID -1
+#define SINGLE_TOL 1e-02f
+#define DOUBLE_TOL 1e-10
+/* xerbla-related hooks; none of these three is defined in common.c. */
+extern int check_error(void);
+extern void set_xerbla(char* current_rout, int expected_info);
+extern int BLASFUNC(xerbla)(char *name, blasint *info, blasint length);
+/* Fill alpha[0..n-1] with rand() divided by RAND_MAX. */
+extern void srand_generate(float *alpha, blasint n);
+extern void drand_generate(double *alpha, blasint n);
+/* Overwrite a with (a - b); return the mean over lines of the 2-norm of each difference line. */
+extern float smatrix_difference(float *a, float *b, blasint cols, blasint rows, blasint ld);
+extern double dmatrix_difference(double *a, double *b, blasint cols, blasint rows, blasint ld);
+/* Negate, in place, the imaginary part of n complex elements spaced inc_x elements apart. */
+extern void cconjugate_vector(blasint n, blasint inc_x, float *x_ptr);
+extern void zconjugate_vector(blasint n, blasint inc_x, double *x_ptr);
+/* Scaled out-of-place transposes; the complex variants take alpha by pointer plus a conj selector. */
+extern void stranspose(blasint rows, blasint cols, float alpha, float *a_src, int lda_src, 
+                       float *a_dst, blasint lda_dst);
+extern void dtranspose(blasint rows, blasint cols, double alpha, double *a_src, int lda_src, 
+                double *a_dst, blasint lda_dst);
+extern void ctranspose(blasint rows, blasint cols, float *alpha, float *a_src, int lda_src, 
+                      float *a_dst, blasint lda_dst, int conj);
+extern void ztranspose(blasint rows, blasint cols, double *alpha, double *a_src, int lda_src, 
+                double *a_dst, blasint lda_dst, int conj);
+/* Scaled out-of-place copies; the complex variants take alpha by pointer plus a conj selector. */
+extern void scopy(blasint rows, blasint cols, float alpha, float *a_src, int lda_src, 
+           float *a_dst, blasint lda_dst);
+extern void dcopy(blasint rows, blasint cols, double alpha, double *a_src, int lda_src, 
+           double *a_dst, blasint lda_dst);
+extern void ccopy(blasint rows, blasint cols, float *alpha, float *a_src, int lda_src, 
+           float *a_dst, blasint lda_dst, int conj);
+extern void zcopy(blasint rows, blasint cols, double *alpha, double *a_src, int lda_src, 
+           double *a_dst, blasint lda_dst, int conj);                
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_caxpby.c b/utest/test_extensions/test_caxpby.c
new file mode 100644
index 000000000..221a48ac7
--- /dev/null
+++ b/utest/test_extensions/test_caxpby.c
@@ -0,0 +1,631 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+#define INCREMENT 2
+
+/**
+ * Work buffers used by the caxpby tests.
+ * Every array holds DATASIZE * INCREMENT * 2 floats.
+ */
+struct DATA_CAXPBY {
+    // x vector handed to caxpby
+    float x_test[DATASIZE * INCREMENT * 2];
+    // copy of x handed to the reference caxpy call
+    float x_verify[DATASIZE * INCREMENT * 2];
+    // y vector updated by caxpby; later overwritten with the difference
+    float y_test[DATASIZE * INCREMENT * 2];
+    // y vector updated by the reference cscal + caxpy sequence
+    float y_verify[DATASIZE * INCREMENT * 2];
+};
+
+#ifdef BUILD_COMPLEX
+static struct DATA_CAXPBY data_caxpby;
+
+/**
+ * Fortran API helper.
+ * Runs caxpby on generated data and measures how far its result is from a
+ * reference assembled with cscal (beta*y) followed by caxpy (+ alpha*x).
+ *
+ * param n - number of elements in vectors x and y
+ * param alpha - scalar alpha
+ * param incx - increment for the elements of x
+ * param beta - scalar beta
+ * param incy - increment for the elements of y
+ * return norm of difference
+ */
+static float check_caxpby(blasint n, float *alpha, blasint incx, float *beta, blasint incy)
+{
+    // cscal accepts only positive increments, so keep the magnitudes at hand
+    blasint stride_x = labs(incx);
+    blasint stride_y = labs(incy);
+    blasint pos;
+
+    // Generate the input vectors x and y
+    srand_generate(data_caxpby.x_test, n * stride_x * 2);
+    srand_generate(data_caxpby.y_test, n * stride_y * 2);
+
+    // The reference path works on its own copy of x ...
+    pos = 0;
+    while (pos < n * stride_x * 2) {
+        data_caxpby.x_verify[pos] = data_caxpby.x_test[pos];
+        pos++;
+    }
+
+    // ... and on its own copy of y
+    pos = 0;
+    while (pos < n * stride_y * 2) {
+        data_caxpby.y_verify[pos] = data_caxpby.y_test[pos];
+        pos++;
+    }
+
+    // Reference step 1: y_verify = beta * y_verify
+    BLASFUNC(cscal)(&n, beta, data_caxpby.y_verify, &stride_y);
+
+    // Reference step 2: y_verify = alpha * x_verify + y_verify
+    BLASFUNC(caxpy)(&n, alpha, data_caxpby.x_verify, &incx,
+                    data_caxpby.y_verify, &incy);
+
+    // Routine under test
+    BLASFUNC(caxpby)(&n, alpha, data_caxpby.x_test, &incx,
+                     beta, data_caxpby.y_test, &incy);
+
+    // Element-wise difference between the caxpby result and the reference
+    pos = 0;
+    while (pos < n * stride_y * 2) {
+        data_caxpby.y_test[pos] -= data_caxpby.y_verify[pos];
+        pos++;
+    }
+
+    // Norm of the differences
+    return BLASFUNC(scnrm2)(&n, data_caxpby.y_test, &stride_y);
+}
+
+/**
+ * C API helper.
+ * Runs cblas_caxpby on generated data and measures how far its result is from
+ * a reference assembled with cblas_cscal (beta*y) followed by cblas_caxpy
+ * (+ alpha*x).
+ *
+ * param n - number of elements in vectors x and y
+ * param alpha - scalar alpha
+ * param incx - increment for the elements of x
+ * param beta - scalar beta
+ * param incy - increment for the elements of y
+ * return norm of difference
+ */
+static float c_api_check_caxpby(blasint n, float *alpha, blasint incx, float *beta, blasint incy)
+{
+    // cscal accepts only positive increments, so keep the magnitudes at hand
+    blasint stride_x = labs(incx);
+    blasint stride_y = labs(incy);
+    blasint pos;
+
+    // Generate the input vectors x and y
+    srand_generate(data_caxpby.x_test, n * stride_x * 2);
+    srand_generate(data_caxpby.y_test, n * stride_y * 2);
+
+    // The reference path works on its own copy of x ...
+    pos = 0;
+    while (pos < n * stride_x * 2) {
+        data_caxpby.x_verify[pos] = data_caxpby.x_test[pos];
+        pos++;
+    }
+
+    // ... and on its own copy of y
+    pos = 0;
+    while (pos < n * stride_y * 2) {
+        data_caxpby.y_verify[pos] = data_caxpby.y_test[pos];
+        pos++;
+    }
+
+    // Reference step 1: y_verify = beta * y_verify
+    cblas_cscal(n, beta, data_caxpby.y_verify, stride_y);
+
+    // Reference step 2: y_verify = alpha * x_verify + y_verify
+    cblas_caxpy(n, alpha, data_caxpby.x_verify, incx,
+                data_caxpby.y_verify, incy);
+
+    // Routine under test
+    cblas_caxpby(n, alpha, data_caxpby.x_test, incx,
+                 beta, data_caxpby.y_test, incy);
+
+    // Element-wise difference between the caxpby result and the reference
+    pos = 0;
+    while (pos < n * stride_y * 2) {
+        data_caxpby.y_test[pos] -= data_caxpby.y_verify[pos];
+        pos++;
+    }
+
+    // Norm of the differences
+    return cblas_scnrm2(n, data_caxpby.y_test, stride_y);
+}
+
+/**
+ * Fortran interface: caxpby is checked against the reference
+ * obtained by applying cscal and then caxpy.
+ *
+ * Parameters: n = 100, incx = 1, incy = 1
+ */
+CTEST(caxpby, inc_x_1_inc_y_1_N_100)
+{
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+    blasint n = DATASIZE;
+    blasint incx = 1;
+    blasint incy = 1;
+    float diff_norm;
+
+    diff_norm = check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran interface: caxpby is checked against the reference
+ * obtained by applying cscal and then caxpy.
+ *
+ * Parameters: n = 100, incx = 2, incy = 1
+ */
+CTEST(caxpby, inc_x_2_inc_y_1_N_100)
+{
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+    blasint n = DATASIZE;
+    blasint incx = 2;
+    blasint incy = 1;
+    float diff_norm;
+
+    diff_norm = check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran interface: caxpby is checked against the reference
+ * obtained by applying cscal and then caxpy.
+ *
+ * Parameters: n = 100, incx = 1, incy = 2
+ */
+CTEST(caxpby, inc_x_1_inc_y_2_N_100)
+{
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+    blasint n = DATASIZE;
+    blasint incx = 1;
+    blasint incy = 2;
+    float diff_norm;
+
+    diff_norm = check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran interface: caxpby is checked against the reference
+ * obtained by applying cscal and then caxpy.
+ *
+ * Parameters: n = 100, incx = 2, incy = 2
+ */
+CTEST(caxpby, inc_x_2_inc_y_2_N_100)
+{
+    float alpha[] = {3.0f, 1.0f};
+    float beta[] = {4.0f, 3.0f};
+    blasint n = DATASIZE;
+    blasint incx = 2;
+    blasint incy = 2;
+    float diff_norm;
+
+    diff_norm = check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran interface: caxpby is checked against the reference
+ * obtained by applying cscal and then caxpy.
+ *
+ * Parameters: n = 100, incx = -1, incy = 2
+ */
+CTEST(caxpby, inc_x_neg_1_inc_y_2_N_100)
+{
+    float alpha[] = {5.0f, 2.2f};
+    float beta[] = {4.0f, 5.0f};
+    blasint n = DATASIZE;
+    blasint incx = -1;
+    blasint incy = 2;
+    float diff_norm;
+
+    diff_norm = check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran interface: caxpby is checked against the reference
+ * obtained by applying cscal and then caxpy.
+ *
+ * Parameters: n = 100, incx = 2, incy = -1
+ */
+CTEST(caxpby, inc_x_2_inc_y_neg_1_N_100)
+{
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {6.0f, 3.0f};
+    blasint n = DATASIZE;
+    blasint incx = 2;
+    blasint incy = -1;
+    float diff_norm;
+
+    diff_norm = check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran interface: caxpby is checked against the reference
+ * obtained by applying cscal and then caxpy.
+ *
+ * Parameters: n = 100, incx = -2, incy = -1
+ */
+CTEST(caxpby, inc_x_neg_2_inc_y_neg_1_N_100)
+{
+    float alpha[] = {7.0f, 2.0f};
+    float beta[] = {3.5f, 1.3f};
+    blasint n = DATASIZE;
+    blasint incx = -2;
+    blasint incy = -1;
+    float diff_norm;
+
+    diff_norm = check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran interface: caxpby is checked against the reference
+ * obtained by applying cscal and then caxpy.
+ *
+ * Parameters: n = 100, incx = 1, incy = 1
+ * Special case: alpha is zero
+ */
+CTEST(caxpby, inc_x_1_inc_y_1_N_100_alpha_zero)
+{
+    float alpha[] = {0.0f, 0.0f};
+    float beta[] = {1.0f, 1.0f};
+    blasint n = DATASIZE;
+    blasint incx = 1;
+    blasint incy = 1;
+    float diff_norm;
+
+    diff_norm = check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran interface: caxpby is checked against the reference
+ * obtained by applying cscal and then caxpy.
+ *
+ * Parameters: n = 100, incx = 1, incy = 1
+ * Special case: beta is zero
+ */
+CTEST(caxpby, inc_x_1_inc_y_1_N_100_beta_zero)
+{
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {0.0f, 0.0f};
+    blasint n = DATASIZE;
+    blasint incx = 1;
+    blasint incy = 1;
+    float diff_norm;
+
+    diff_norm = check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran interface: caxpby is checked against the reference
+ * obtained by applying cscal and then caxpy.
+ *
+ * Parameters: n = 100, incx = 1, incy = 1
+ * Special case: alpha is zero
+ * Special case: beta is zero
+ */
+CTEST(caxpby, inc_x_1_inc_y_1_N_100_a_beta_zero)
+{
+    float alpha[] = {0.0f, 0.0f};
+    float beta[] = {0.0f, 0.0f};
+    blasint n = DATASIZE;
+    blasint incx = 1;
+    blasint incy = 1;
+    float diff_norm;
+
+    diff_norm = check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran interface: caxpby is checked against the reference
+ * obtained by applying cscal and then caxpy.
+ *
+ * Parameters: n = 100, incx = 1, incy = 2
+ * Special case: alpha is zero
+ * Special case: beta is zero
+ */
+CTEST(caxpby, inc_x_1_inc_y_2_N_100_alpha_beta_zero)
+{
+    float alpha[] = {0.0f, 0.0f};
+    float beta[] = {0.0f, 0.0f};
+    blasint n = DATASIZE;
+    blasint incx = 1;
+    blasint incy = 2;
+    float diff_norm;
+
+    diff_norm = check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/** Fortran interface: run the caxpby comparison with empty vectors (n = 0). */
+CTEST(caxpby, check_n_zero)
+{
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+    blasint n = 0;
+    blasint incx = 1;
+    blasint incy = 1;
+    float diff_norm;
+
+    diff_norm = check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * C interface: cblas_caxpby is checked against the reference
+ * obtained by applying cblas_cscal and then cblas_caxpy.
+ *
+ * Parameters: n = 100, incx = 1, incy = 1
+ */
+CTEST(caxpby, c_api_inc_x_1_inc_y_1_N_100)
+{
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+    blasint n = DATASIZE;
+    blasint incx = 1;
+    blasint incy = 1;
+    float diff_norm;
+
+    diff_norm = c_api_check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * C interface: cblas_caxpby is checked against the reference
+ * obtained by applying cblas_cscal and then cblas_caxpy.
+ *
+ * Parameters: n = 100, incx = 2, incy = 1
+ */
+CTEST(caxpby, c_api_inc_x_2_inc_y_1_N_100)
+{
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+    blasint n = DATASIZE;
+    blasint incx = 2;
+    blasint incy = 1;
+    float diff_norm;
+
+    diff_norm = c_api_check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * C interface: cblas_caxpby is checked against the reference
+ * obtained by applying cblas_cscal and then cblas_caxpy.
+ *
+ * Parameters: n = 100, incx = 1, incy = 2
+ */
+CTEST(caxpby, c_api_inc_x_1_inc_y_2_N_100)
+{
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {2.0f, 2.1f};
+    blasint n = DATASIZE;
+    blasint incx = 1;
+    blasint incy = 2;
+    float diff_norm;
+
+    diff_norm = c_api_check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * C interface: cblas_caxpby is checked against the reference
+ * obtained by applying cblas_cscal and then cblas_caxpy.
+ *
+ * Parameters: n = 100, incx = 2, incy = 2
+ */
+CTEST(caxpby, c_api_inc_x_2_inc_y_2_N_100)
+{
+    float alpha[] = {3.0f, 2.0f};
+    float beta[] = {4.0f, 3.0f};
+    blasint n = DATASIZE;
+    blasint incx = 2;
+    blasint incy = 2;
+    float diff_norm;
+
+    diff_norm = c_api_check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * C interface: cblas_caxpby is checked against the reference
+ * obtained by applying cblas_cscal and then cblas_caxpy.
+ *
+ * Parameters: n = 100, incx = -1, incy = 2
+ */
+CTEST(caxpby, c_api_inc_x_neg_1_inc_y_2_N_100)
+{
+    float alpha[] = {5.0f, 2.0f};
+    float beta[] = {4.0f, 3.1f};
+    blasint n = DATASIZE;
+    blasint incx = -1;
+    blasint incy = 2;
+    float diff_norm;
+
+    diff_norm = c_api_check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * C interface: cblas_caxpby is checked against the reference
+ * obtained by applying cblas_cscal and then cblas_caxpy.
+ *
+ * Parameters: n = 100, incx = 2, incy = -1
+ */
+CTEST(caxpby, c_api_inc_x_2_inc_y_neg_1_N_100)
+{
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {6.0f, 2.3f};
+    blasint n = DATASIZE;
+    blasint incx = 2;
+    blasint incy = -1;
+    float diff_norm;
+
+    diff_norm = c_api_check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * C interface: cblas_caxpby is checked against the reference
+ * obtained by applying cblas_cscal and then cblas_caxpy.
+ *
+ * Parameters: n = 100, incx = -2, incy = -1
+ */
+CTEST(caxpby, c_api_inc_x_neg_2_inc_y_neg_1_N_100)
+{
+    float alpha[] = {7.0f, 1.0f};
+    float beta[] = {3.5f, 1.0f};
+    blasint n = DATASIZE;
+    blasint incx = -2;
+    blasint incy = -1;
+    float diff_norm;
+
+    diff_norm = c_api_check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * C interface: cblas_caxpby is checked against the reference
+ * obtained by applying cblas_cscal and then cblas_caxpy.
+ *
+ * Parameters: n = 100, incx = 1, incy = 1
+ * Special case: alpha is zero
+ */
+CTEST(caxpby, c_api_inc_x_1_inc_y_1_N_100_alpha_zero)
+{
+    float alpha[] = {0.0f, 0.0f};
+    float beta[] = {1.0f, 1.0f};
+    blasint n = DATASIZE;
+    blasint incx = 1;
+    blasint incy = 1;
+    float diff_norm;
+
+    diff_norm = c_api_check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * C interface: cblas_caxpby is checked against the reference
+ * obtained by applying cblas_cscal and then cblas_caxpy.
+ *
+ * Parameters: n = 100, incx = 1, incy = 1
+ * Special case: beta is zero
+ */
+CTEST(caxpby, c_api_inc_x_1_inc_y_1_N_100_beta_zero)
+{
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {0.0f, 0.0f};
+    blasint n = DATASIZE;
+    blasint incx = 1;
+    blasint incy = 1;
+    float diff_norm;
+
+    diff_norm = c_api_check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * C interface: cblas_caxpby is checked against the reference
+ * obtained by applying cblas_cscal and then cblas_caxpy.
+ *
+ * Parameters: n = 100, incx = 1, incy = 1
+ * Special case: alpha is zero
+ * Special case: beta is zero
+ */
+CTEST(caxpby, c_api_inc_x_1_inc_y_1_N_100_a_beta_zero)
+{
+    float alpha[] = {0.0f, 0.0f};
+    float beta[] = {0.0f, 0.0f};
+    blasint n = DATASIZE;
+    blasint incx = 1;
+    blasint incy = 1;
+    float diff_norm;
+
+    diff_norm = c_api_check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * C interface: cblas_caxpby is checked against the reference
+ * obtained by applying cblas_cscal and then cblas_caxpy.
+ *
+ * Parameters: n = 100, incx = 1, incy = 2
+ * Special case: alpha is zero
+ * Special case: beta is zero
+ */
+CTEST(caxpby, c_api_inc_x_1_inc_y_2_N_100_alpha_beta_zero)
+{
+    float alpha[] = {0.0f, 0.0f};
+    float beta[] = {0.0f, 0.0f};
+    blasint n = DATASIZE;
+    blasint incx = 1;
+    blasint incy = 2;
+    float diff_norm;
+
+    diff_norm = c_api_check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/** C interface: run the cblas_caxpby comparison with empty vectors (n = 0). */
+CTEST(caxpby, c_api_check_n_zero)
+{
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+    blasint n = 0;
+    blasint incx = 1;
+    blasint incy = 1;
+    float diff_norm;
+
+    diff_norm = c_api_check_caxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+#endif
diff --git a/utest/test_extensions/test_caxpyc.c b/utest/test_extensions/test_caxpyc.c
new file mode 100644
index 000000000..ed1899e57
--- /dev/null
+++ b/utest/test_extensions/test_caxpyc.c
@@ -0,0 +1,158 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+#define INCREMENT 2
+
+/**
+ * Work buffers used by the caxpyc tests.
+ * Every array holds DATASIZE * INCREMENT * 2 floats.
+ */
+struct DATA_CAXPYC {
+    // x vector handed to caxpyc
+    float x_test[DATASIZE * INCREMENT * 2];
+    // copy of x that is conjugated and handed to the reference caxpy call
+    float x_verify[DATASIZE * INCREMENT * 2];
+    // y vector updated by caxpyc
+    float y_test[DATASIZE * INCREMENT * 2];
+    // y vector updated by the reference caxpy call; later holds the difference
+    float y_verify[DATASIZE * INCREMENT * 2];
+};
+
+#ifdef BUILD_COMPLEX
+static struct DATA_CAXPYC data_caxpyc;
+
+/**
+ * Runs caxpyc on generated data and measures how far its result is from a
+ * reference obtained by conjugating a copy of x and calling caxpy on it.
+ *
+ * param n - number of elements in vectors x and y
+ * param alpha - scalar alpha
+ * param incx - increment for the elements of x
+ * param incy - increment for the elements of y
+ * return norm of difference
+ */
+static float check_caxpyc(blasint n, float *alpha, blasint incx, blasint incy)
+{
+    blasint pos;
+
+    // Generate the input vectors x and y
+    srand_generate(data_caxpyc.x_test, n * incx * 2);
+    srand_generate(data_caxpyc.y_test, n * incy * 2);
+
+    // The reference path works on its own copies of x and y
+    pos = 0;
+    while (pos < n * incx * 2) {
+        data_caxpyc.x_verify[pos] = data_caxpyc.x_test[pos];
+        pos++;
+    }
+
+    pos = 0;
+    while (pos < n * incy * 2) {
+        data_caxpyc.y_verify[pos] = data_caxpyc.y_test[pos];
+        pos++;
+    }
+
+    // Reference: conjugate the copy of x, then apply plain caxpy
+    cconjugate_vector(n, incx, data_caxpyc.x_verify);
+    BLASFUNC(caxpy)(&n, alpha, data_caxpyc.x_verify, &incx,
+                    data_caxpyc.y_verify, &incy);
+
+    // Routine under test
+    BLASFUNC(caxpyc)(&n, alpha, data_caxpyc.x_test, &incx,
+                     data_caxpyc.y_test, &incy);
+
+    // Element-wise difference between the reference and the caxpyc result
+    pos = 0;
+    while (pos < n * incy * 2) {
+        data_caxpyc.y_verify[pos] -= data_caxpyc.y_test[pos];
+        pos++;
+    }
+
+    // Norm of the differences
+    return BLASFUNC(scnrm2)(&n, data_caxpyc.y_verify, &incy);
+}
+
+/**
+ * caxpyc is checked against caxpy applied to a conjugated copy of x.
+ *
+ * Parameters: n = 100, incx = 1, incy = 1
+ */
+CTEST(caxpyc, conj_strides_one)
+{
+    float alpha[] = {5.0f, 2.2f};
+    blasint n = DATASIZE;
+    blasint incx = 1;
+    blasint incy = 1;
+    float diff_norm;
+
+    diff_norm = check_caxpyc(n, alpha, incx, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * caxpyc is checked against caxpy applied to a conjugated copy of x.
+ *
+ * Parameters: n = 100, incx = 1, incy = 2
+ */
+CTEST(caxpyc, conj_incx_one)
+{
+    float alpha[] = {5.0f, 2.2f};
+    blasint n = DATASIZE;
+    blasint incx = 1;
+    blasint incy = 2;
+    float diff_norm;
+
+    diff_norm = check_caxpyc(n, alpha, incx, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * caxpyc is checked against caxpy applied to a conjugated copy of x.
+ *
+ * Parameters: n = 100, incx = 2, incy = 1
+ */
+CTEST(caxpyc, conj_incy_one)
+{
+    float alpha[] = {5.0f, 2.2f};
+    blasint n = DATASIZE;
+    blasint incx = 2;
+    blasint incy = 1;
+    float diff_norm;
+
+    diff_norm = check_caxpyc(n, alpha, incx, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * caxpyc is checked against caxpy applied to a conjugated copy of x.
+ *
+ * Parameters: n = 100, incx = 2, incy = 2
+ */
+CTEST(caxpyc, conj_strides_two)
+{
+    float alpha[] = {5.0f, 2.2f};
+    blasint n = DATASIZE;
+    blasint incx = 2;
+    blasint incy = 2;
+    float diff_norm;
+
+    diff_norm = check_caxpyc(n, alpha, incx, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+#endif
diff --git a/utest/test_extensions/test_cgbmv.c b/utest/test_extensions/test_cgbmv.c
new file mode 100644
index 000000000..8e0640c5d
--- /dev/null
+++ b/utest/test_extensions/test_cgbmv.c
@@ -0,0 +1,279 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+#define INCREMENT 1
+
+/**
+ * Work buffers used by the cgbmv tests.
+ * Matrix buffers hold DATASIZE * DATASIZE * 2 floats, vector buffers hold
+ * DATASIZE * 2 * INCREMENT floats.
+ */
+struct DATA_CGBMV {
+    // full-storage band matrix produced by get_band_matrix; input of cgemv
+    float a_test[DATASIZE * DATASIZE * 2];
+    // band-packed form of a_test; input of cgbmv
+    float a_band_storage[DATASIZE * DATASIZE * 2];
+    // generated general matrix from which the band matrix is built
+    float matrix[DATASIZE * DATASIZE * 2];
+    // input vector shared by cgemv and cgbmv
+    float b_test[DATASIZE * 2 * INCREMENT];
+    // output vector of cgbmv
+    float c_test[DATASIZE * 2 * INCREMENT];
+    // output vector of cgemv; later holds the difference
+    float c_verify[DATASIZE * 2 * INCREMENT];
+};
+
+#ifdef BUILD_COMPLEX
+static struct DATA_CGBMV data_cgbmv;
+
+/**
+ * Pack a band matrix held in full storage into band-packed storage.
+ * Element (row, col) of the full matrix is written to row (ku + row - col)
+ * of column col in the packed buffer; both buffers keep two floats per element.
+ *
+ * param m - number of rows of A
+ * param n - number of columns of A
+ * param kl - number of sub-diagonals of the matrix A
+ * param ku - number of super-diagonals of the matrix A
+ * output param a - buffer for holding band-packed matrix
+ * param lda - specifies the leading dimension of a
+ * param matrix - buffer holding full-storage band matrix A
+ * param ldm - specifies the leading dimension of the full-storage band matrix A
+ */
+static void transform_to_band_storage(blasint m, blasint n, blasint kl,
+                                      blasint ku, float* a, blasint lda,
+                                      float* matrix, blasint ldm)
+{
+    blasint col, row;
+
+    for (col = 0; col < n; col++)
+    {
+        // Rows of this column that are copied: [first_row, end_row)
+        blasint first_row = (col - ku > 0) ? col - ku : 0;
+        blasint end_row = (col + kl + 1 < m) ? col + kl + 1 : m;
+
+        for (row = first_row; row < end_row; row++)
+        {
+            // Float offsets of the element in the packed and the full buffer
+            blasint dst = 2 * (col * lda + ku - col + row);
+            blasint src = 2 * (col * ldm + row);
+
+            a[dst] = matrix[src];
+            a[dst + 1] = matrix[src + 1];
+        }
+    }
+}
+
+/**
+ * Build a full-storage band matrix A with kl sub-diagonals and ku
+ * super-diagonals. Elements inside the band are taken consecutively from
+ * the input buffer; elements outside the band are set to zero.
+ *
+ * param m - number of rows of A
+ * param n - number of columns of A
+ * param kl - number of sub-diagonals of the matrix A
+ * param ku - number of super-diagonals of the matrix A
+ * output param band_matrix - buffer for full-storage band matrix.
+ * param matrix - buffer holding input general matrix
+ * param ldm - specifies the leading dimension of band_matrix
+ */
+static void get_band_matrix(blasint m, blasint n, blasint kl, blasint ku,
+                            float *band_matrix, float *matrix, blasint ldm)
+{
+    blasint col, row;
+    // Read cursor into the input buffer; advances only for in-band elements
+    blasint next = 0;
+
+    for (col = 0; col < n; col++)
+    {
+        for (row = 0; row < m; row++)
+        {
+            // Float offset of element (row, col) in band_matrix
+            blasint dst = 2 * (col * ldm + row);
+
+            if (row <= kl + col && col <= ku + row)
+            {
+                band_matrix[dst] = matrix[next];
+                band_matrix[dst + 1] = matrix[next + 1];
+                next += 2;
+            }
+            else
+            {
+                band_matrix[dst] = 0.0f;
+                band_matrix[dst + 1] = 0.0f;
+            }
+        }
+    }
+}
+
+/**
+ * Compare the results computed by cgbmv and cgemv on the same band matrix
+ * (gbmv is gemv for a band matrix).
+ *
+ * param trans specifies op(A), the transposition operation applied to A
+ * param m - number of rows of A
+ * param n - number of columns of A
+ * param kl - number of sub-diagonals of the matrix A
+ * param ku - number of super-diagonals of the matrix A
+ * param alpha - scaling factor for the matrix-vector product
+ * param lda - specifies the leading dimension of a
+ * param inc_b - stride of vector b
+ * param beta - scaling factor for vector c
+ * param inc_c - stride of vector c
+ * return norm of differences
+ */
+static float check_cgbmv(char trans, blasint m, blasint n, blasint kl, blasint ku,
+    float *alpha, blasint lda, blasint inc_b, float *beta, blasint inc_c)
+{
+    blasint pos;
+    // Element counts of b and c; swapped below for the listed trans codes
+    blasint b_count = n;
+    blasint c_count = m;
+
+    switch (trans) {
+    case 'T':
+    case 'C':
+    case 'D':
+    case 'U':
+        b_count = m;
+        c_count = n;
+        break;
+    default:
+        break;
+    }
+
+    // Generate the general matrix and both vectors
+    srand_generate(data_cgbmv.matrix, m * n * 2);
+    srand_generate(data_cgbmv.b_test, 2 * (1 + (b_count - 1) * inc_b));
+    srand_generate(data_cgbmv.c_test, 2 * (1 + (c_count - 1) * inc_c));
+
+    // cgemv works on its own copy of c
+    pos = 0;
+    while (pos < 2 * (1 + (c_count - 1) * inc_c)) {
+        data_cgbmv.c_verify[pos] = data_cgbmv.c_test[pos];
+        pos++;
+    }
+
+    // Build the band matrix in full storage, then pack it for cgbmv
+    get_band_matrix(m, n, kl, ku, data_cgbmv.a_test, data_cgbmv.matrix, m);
+    transform_to_band_storage(m, n, kl, ku, data_cgbmv.a_band_storage, lda,
+                              data_cgbmv.a_test, m);
+
+    // Reference result from the full-storage matrix
+    BLASFUNC(cgemv)(&trans, &m, &n, alpha, data_cgbmv.a_test, &m,
+                    data_cgbmv.b_test, &inc_b, beta, data_cgbmv.c_verify, &inc_c);
+
+    // Routine under test on the band-packed matrix
+    BLASFUNC(cgbmv)(&trans, &m, &n, &kl, &ku, alpha, data_cgbmv.a_band_storage,
+                    &lda, data_cgbmv.b_test, &inc_b, beta, data_cgbmv.c_test, &inc_c);
+
+    // Element-wise difference between the two results
+    pos = 0;
+    while (pos < 2 * (1 + (c_count - 1) * inc_c)) {
+        data_cgbmv.c_verify[pos] -= data_cgbmv.c_test[pos];
+        pos++;
+    }
+
+    return BLASFUNC(scnrm2)(&c_count, data_cgbmv.c_verify, &inc_c);
+}
+
+/**
+ * cgbmv is checked against cgemv with trans = 'D'.
+ * Setup: m = 50, n = 25, kl = 20, ku = 11, lda = 50, unit strides.
+ */
+CTEST(cgbmv, trans_D)
+{
+    float alpha[] = {7.0f, 1.0f};
+    float beta[] = {1.5f, -1.5f};
+    char trans = 'D';
+    blasint m = 50;
+    blasint n = 25;
+    blasint kl = 20;
+    blasint ku = 11;
+    blasint lda = 50;
+    blasint inc_b = 1;
+    blasint inc_c = 1;
+    float diff_norm;
+
+    diff_norm = check_cgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_TOL);
+}
+
+/**
+ * cgbmv is checked against cgemv with trans = 'O'.
+ * Setup: m = 50, n = 25, kl = 20, ku = 10, lda = 50, unit strides.
+ */
+CTEST(cgbmv, trans_O)
+{
+    float alpha[] = {7.0f, 1.0f};
+    float beta[] = {1.5f, -1.5f};
+    char trans = 'O';
+    blasint m = 50;
+    blasint n = 25;
+    blasint kl = 20;
+    blasint ku = 10;
+    blasint lda = 50;
+    blasint inc_b = 1;
+    blasint inc_c = 1;
+    float diff_norm;
+
+    diff_norm = check_cgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_TOL);
+}
+
+/**
+ * cgbmv is checked against cgemv with trans = 'S'.
+ * Setup: m = 50, n = 25, kl = 6, ku = 9, lda = 50, unit strides.
+ */
+CTEST(cgbmv, trans_S)
+{
+    float alpha[] = {7.0f, 1.0f};
+    float beta[] = {1.5f, -1.5f};
+    char trans = 'S';
+    blasint m = 50;
+    blasint n = 25;
+    blasint kl = 6;
+    blasint ku = 9;
+    blasint lda = 50;
+    blasint inc_b = 1;
+    blasint inc_c = 1;
+    float diff_norm;
+
+    diff_norm = check_cgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_TOL);
+}
+
+/**
+ * cgbmv is checked against cgemv with trans = 'U'.
+ * Setup: m = 25, n = 50, kl = 7, ku = 11, lda = kl + ku + 1, unit strides.
+ */
+CTEST(cgbmv, trans_U)
+{
+    float alpha[] = {7.0f, 1.0f};
+    float beta[] = {1.5f, -1.5f};
+    char trans = 'U';
+    blasint m = 25;
+    blasint n = 50;
+    blasint kl = 7;
+    blasint ku = 11;
+    blasint lda = kl + ku + 1;
+    blasint inc_b = 1;
+    blasint inc_c = 1;
+    float diff_norm;
+
+    diff_norm = check_cgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_TOL);
+}
+
+/**
+ * cgbmv is checked against cgemv with trans = 'C'.
+ * Setup: m = 50, n = 25, kl = 20, ku = 11, lda = 50, unit strides.
+ */
+CTEST(cgbmv, trans_C)
+{
+    float alpha[] = {7.0f, 1.0f};
+    float beta[] = {1.5f, -1.5f};
+    char trans = 'C';
+    blasint m = 50;
+    blasint n = 25;
+    blasint kl = 20;
+    blasint ku = 11;
+    blasint lda = 50;
+    blasint inc_b = 1;
+    blasint inc_c = 1;
+    float diff_norm;
+
+    diff_norm = check_cgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_TOL);
+}
+
+/**
+ * cgbmv is checked against cgemv with trans = 'R'.
+ * Setup: m = 50, n = 100, kl = 20, ku = 11, lda = 50, unit strides.
+ */
+CTEST(cgbmv, trans_R)
+{
+    float alpha[] = {7.0f, 1.0f};
+    float beta[] = {1.5f, -1.5f};
+    char trans = 'R';
+    blasint m = 50;
+    blasint n = 100;
+    blasint kl = 20;
+    blasint ku = 11;
+    blasint lda = 50;
+    blasint inc_b = 1;
+    blasint inc_c = 1;
+    float diff_norm;
+
+    diff_norm = check_cgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_TOL);
+}
+#endif
diff --git a/utest/test_extensions/test_cgeadd.c b/utest/test_extensions/test_cgeadd.c
new file mode 100644
index 000000000..9b87ad9f3
--- /dev/null
+++ b/utest/test_extensions/test_cgeadd.c
@@ -0,0 +1,880 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+/* Largest column (N) and row (M) counts used by the cgeadd tests in this file */
+#define N 100
+#define M 100
+
+/*
+ * Work buffers shared by all cgeadd tests.
+ * Each buffer holds M * N * 2 floats.
+ *
+ * a_test   - input matrix A
+ * c_test   - matrix C handed to the cgeadd routine under test
+ * c_verify - copy of C updated by the reference implementation
+ */
+struct DATA_CGEADD {
+    float a_test[M * N * 2];
+    float c_test[M * N * 2];
+    float c_verify[M * N * 2];
+};
+
+/* Everything below is compiled only when BUILD_COMPLEX is defined */
+#ifdef BUILD_COMPLEX
+/* Single static instance of the work buffers */
+static struct DATA_CGEADD data_cgeadd;
+
+/**
+ * cgeadd reference implementation built on cblas_caxpby
+ *
+ * A and C are walked one contiguous line at a time; each line is
+ * updated with a unit-stride cblas_caxpby call.
+ *
+ * param m - number of elements handed to each cblas_caxpby call
+ * param n - number of lines (cblas_caxpby calls)
+ * param alpha - scaling factor for matrix A
+ * param aptr - refer to matrix A
+ * param lda - leading dimension of A (doubled internally to step float pointers)
+ * param beta - scaling factor for matrix C
+ * param cptr - refer to matrix C
+ * param ldc - leading dimension of C (doubled internally to step float pointers)
+ */
+static void cgeadd_trusted(blasint m, blasint n, float *alpha, float *aptr,
+                           blasint lda, float *beta, float *cptr, blasint ldc)
+{
+    /* Distance, in floats, between the starts of two consecutive lines */
+    blasint a_step = lda * 2;
+    blasint c_step = ldc * 2;
+    blasint remaining;
+
+    for (remaining = n; remaining > 0; remaining--)
+    {
+        cblas_caxpby(m, alpha, aptr, 1, beta, cptr, 1);
+        aptr += a_step;
+        cptr += c_step;
+    }
+}
+
+/**
+ * Test cgeadd by comparing it against the caxpby-based reference
+ * (cgeadd_trusted). Compare with the following options:
+ *
+ * param api - specifies Fortran or C API ('F' selects the Fortran
+ * interface, any other value the CBLAS one)
+ * param order - specifies whether A and C stored in
+ * row-major order or column-major order
+ * param m - number of rows of A and C
+ * param n - number of columns of A and C
+ * param alpha - scaling factor for matrix A
+ * param lda - leading dimension of A
+ * param beta - scaling factor for matrix C
+ * param ldc - leading dimension of C
+ * return norm of differences
+ */
+static float check_cgeadd(char api, OPENBLAS_CONST enum CBLAS_ORDER order,
+                          blasint m, blasint n, float *alpha, blasint lda,
+                          float *beta, blasint ldc)
+{
+    /* Row-major storage has m contiguous lines of n elements;
+     * any other order is treated as n lines of m elements */
+    const int row_major = (order == CblasRowMajor);
+    blasint lines = row_major ? m : n;
+    blasint line_len = row_major ? n : m;
+
+    /* Number of floats occupied by C */
+    blasint c_floats = ldc * lines * 2;
+    blasint idx = 0;
+
+    // Fill matrix A, C with generated data
+    srand_generate(data_cgeadd.a_test, lda * lines * 2);
+    srand_generate(data_cgeadd.c_test, c_floats);
+
+    // Keep an identical copy of C for the reference computation
+    while (idx < c_floats)
+    {
+        data_cgeadd.c_verify[idx] = data_cgeadd.c_test[idx];
+        idx++;
+    }
+
+    // Reference result
+    cgeadd_trusted(line_len, lines, alpha, data_cgeadd.a_test, lda,
+                   beta, data_cgeadd.c_verify, ldc);
+
+    // Result of the routine under test, through the requested interface
+    if (api != 'F')
+        cblas_cgeadd(order, m, n, alpha, data_cgeadd.a_test, lda,
+                     beta, data_cgeadd.c_test, ldc);
+    else
+        BLASFUNC(cgeadd)(&m, &n, alpha, data_cgeadd.a_test, &lda,
+                         beta, data_cgeadd.c_test, &ldc);
+
+    // Find the differences between the matrix calculated by cgeadd and the reference.
+    // NOTE(review): line_len counts complex elements while the stride (ldc*2) is in
+    // floats - confirm smatrix_difference covers the whole line with this length.
+    return smatrix_difference(data_cgeadd.c_test, data_cgeadd.c_verify, line_len, lines, ldc*2);
+}
+
+/**
+ * Check if error function was called with expected function name
+ * and param info
+ *
+ * The expectation ("CGEADD " plus expected_info) is registered with
+ * set_xerbla, cgeadd is invoked with unit alpha/beta through the
+ * requested interface, and check_error reports the outcome.
+ *
+ * param api - specifies Fortran or C API ('F' selects the Fortran
+ * interface, any other value the CBLAS one)
+ * param order - specifies whether A and C stored in
+ * row-major order or column-major order
+ * param m - number of rows of A and C
+ * param n - number of columns of A and C
+ * param lda - leading dimension of A
+ * param ldc - leading dimension of C
+ * param expected_info - expected invalid parameter number in cgeadd
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char api, OPENBLAS_CONST enum CBLAS_ORDER order,
+                         blasint m, blasint n, blasint lda,
+                         blasint ldc, int expected_info)
+{
+    float beta[] = {1.0f, 1.0f};
+    float alpha[] = {1.0f, 1.0f};
+
+    /* Register what the error handler is expected to receive */
+    set_xerbla("CGEADD ", expected_info);
+
+    if (api != 'F')
+    {
+        cblas_cgeadd(order, m, n, alpha, data_cgeadd.a_test, lda,
+                     beta, data_cgeadd.c_test, ldc);
+    }
+    else
+    {
+        BLASFUNC(cgeadd)(&m, &n, alpha, data_cgeadd.a_test, &lda,
+                         beta, data_cgeadd.c_test, &ldc);
+    }
+
+    return check_error();
+}
+
+/**
+ * Fortran API specific test
+ * Test cgeadd by comparing it against the caxpby-based reference
+ * with the following options:
+ *
+ * For A number of rows is 100, number of columns is 100
+ * For C number of rows is 100, number of columns is 100
+ */
+CTEST(cgeadd, matrix_n_100_m_100)
+{
+    /* Complex scalars passed as two floats each */
+    float alpha[] = {3.0f, 2.0f};
+    float beta[] = {1.0f, 3.0f};
+
+    const blasint m = M, n = N;
+    const blasint lda = m, ldc = m;
+    const CBLAS_ORDER order = CblasColMajor;
+
+    float diff = check_cgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cgeadd by comparing it against the caxpby-based reference
+ * with the following options:
+ *
+ * For A number of rows is 100, number of columns is 100
+ * For C number of rows is 100, number of columns is 100
+ * Scalar alpha is zero (operation is C:=beta*C)
+ */
+CTEST(cgeadd, matrix_n_100_m_100_alpha_zero)
+{
+    /* Complex scalars passed as two floats each; alpha is zero */
+    float alpha[] = {0.0f, 0.0f};
+    float beta[] = {2.5f, 1.0f};
+
+    const blasint m = M, n = N;
+    const blasint lda = m, ldc = m;
+    const CBLAS_ORDER order = CblasColMajor;
+
+    float diff = check_cgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cgeadd by comparing it against the caxpby-based reference
+ * with the following options:
+ *
+ * For A number of rows is 100, number of columns is 100
+ * For C number of rows is 100, number of columns is 100
+ * Scalar beta is zero (operation is C:=alpha*A)
+ */
+CTEST(cgeadd, matrix_n_100_m_100_beta_zero)
+{
+    /* Complex scalars passed as two floats each; beta is zero */
+    float alpha[] = {3.0f, 1.5f};
+    float beta[] = {0.0f, 0.0f};
+
+    const blasint m = M, n = N;
+    const blasint lda = m, ldc = m;
+    const CBLAS_ORDER order = CblasColMajor;
+
+    float diff = check_cgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cgeadd by comparing it against the caxpby-based reference
+ * with the following options:
+ *
+ * For A number of rows is 100, number of columns is 100
+ * For C number of rows is 100, number of columns is 100
+ * Scalars alpha and beta are both zero (operation is C:=0)
+ */
+CTEST(cgeadd, matrix_n_100_m_100_alpha_beta_zero)
+{
+    /* Complex scalars passed as two floats each; both are zero */
+    float alpha[] = {0.0f, 0.0f};
+    float beta[] = {0.0f, 0.0f};
+
+    const blasint m = M, n = N;
+    const blasint lda = m, ldc = m;
+    const CBLAS_ORDER order = CblasColMajor;
+
+    float diff = check_cgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cgeadd by comparing it against the caxpby-based reference
+ * with the following options:
+ *
+ * For A number of rows is 50, number of columns is 100
+ * For C number of rows is 50, number of columns is 100
+ */
+CTEST(cgeadd, matrix_n_100_m_50)
+{
+    /* Complex scalars passed as two floats each */
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    /* Half as many rows as columns */
+    const blasint m = M / 2, n = N;
+    const blasint lda = m, ldc = m;
+    const CBLAS_ORDER order = CblasColMajor;
+
+    float diff = check_cgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param n -
+ * number of columns of A and C
+ * Must be at least zero.
+ *
+ * n is set to INVALID; parameter number 2 is expected to be reported.
+ */
+CTEST(cgeadd, xerbla_n_invalid)
+{
+    const int expected_info = 2;
+
+    const blasint m = 1, n = INVALID;
+    const blasint lda = m, ldc = m;
+    const CBLAS_ORDER order = CblasColMajor;
+
+    int ok = check_badargs('F', order, m, n, lda, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param m -
+ * number of rows of A and C
+ * Must be at least zero.
+ *
+ * m is set to INVALID; parameter number 1 is expected to be reported.
+ */
+CTEST(cgeadd, xerbla_m_invalid)
+{
+    const int expected_info = 1;
+
+    const blasint m = INVALID, n = 1;
+    const blasint lda = 1, ldc = 1;
+    const CBLAS_ORDER order = CblasColMajor;
+
+    int ok = check_badargs('F', order, m, n, lda, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param lda -
+ * specifies the leading dimension of A. Must be at least MAX(1, m).
+ *
+ * lda is set to INVALID; parameter number 5 is expected to be reported.
+ */
+CTEST(cgeadd, xerbla_lda_invalid)
+{
+    const int expected_info = 5;
+
+    const blasint m = 1, n = 1;
+    const blasint lda = INVALID, ldc = 1;
+    const CBLAS_ORDER order = CblasColMajor;
+
+    int ok = check_badargs('F', order, m, n, lda, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param ldc -
+ * specifies the leading dimension of C. Must be at least MAX(1, m).
+ *
+ * ldc is set to INVALID; parameter number 8 is expected to be reported.
+ */
+CTEST(cgeadd, xerbla_ldc_invalid)
+{
+    const int expected_info = 8;
+
+    const blasint m = 1, n = 1;
+    const blasint lda = 1, ldc = INVALID;
+    const CBLAS_ORDER order = CblasColMajor;
+
+    int ok = check_badargs('F', order, m, n, lda, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Fortran API specific test
+ * Check the case where n - number of columns of A and C - equals zero.
+ */
+CTEST(cgeadd, n_zero)
+{
+    /* Complex scalars passed as two floats each */
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    const blasint m = 1, n = 0;
+    const blasint lda = 1, ldc = 1;
+    const CBLAS_ORDER order = CblasColMajor;
+
+    float diff = check_cgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Check the case where m - number of rows of A and C - equals zero.
+ */
+CTEST(cgeadd, m_zero)
+{
+    /* Complex scalars passed as two floats each */
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    const blasint m = 0, n = 1;
+    const blasint lda = 1, ldc = 1;
+    const CBLAS_ORDER order = CblasColMajor;
+
+    float diff = check_cgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgeadd by comparing it against the caxpby-based reference
+ * with the following options:
+ *
+ * c api option order is column-major order
+ * For A number of rows is 100, number of columns is 100
+ * For C number of rows is 100, number of columns is 100
+ */
+CTEST(cgeadd, c_api_matrix_n_100_m_100)
+{
+    /* Complex scalars passed as two floats each */
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {1.0f, 3.0f};
+
+    const blasint m = M, n = N;
+    const blasint lda = m, ldc = m;
+    const CBLAS_ORDER order = CblasColMajor;
+
+    float diff = check_cgeadd('C', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgeadd by comparing it against the caxpby-based reference
+ * with the following options:
+ *
+ * c api option order is row-major order
+ * For A number of rows is 100, number of columns is 100
+ * For C number of rows is 100, number of columns is 100
+ */
+CTEST(cgeadd, c_api_matrix_n_100_m_100_row_major)
+{
+    /* Complex scalars passed as two floats each */
+    float alpha[] = {4.0f, 1.5f};
+    float beta[] = {2.0f, 1.0f};
+
+    const blasint m = M, n = N;
+    const blasint lda = m, ldc = m;
+    const CBLAS_ORDER order = CblasRowMajor;
+
+    float diff = check_cgeadd('C', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgeadd by comparing it against the caxpby-based reference
+ * with the following options:
+ *
+ * c api option order is row-major order
+ * For A number of rows is 100, number of columns is 50
+ * For C number of rows is 100, number of columns is 50
+ */
+CTEST(cgeadd, c_api_matrix_n_50_m_100_row_major)
+{
+    /* Complex scalars passed as two floats each */
+    float alpha[] = {3.0f, 2.5f};
+    float beta[] = {1.0f, 2.0f};
+
+    /* Half as many columns as rows; leading dimensions follow n here */
+    const blasint m = M, n = N / 2;
+    const blasint lda = n, ldc = n;
+    const CBLAS_ORDER order = CblasRowMajor;
+
+    float diff = check_cgeadd('C', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgeadd by comparing it against the caxpby-based reference
+ * with the following options:
+ *
+ * c api option order is column-major order
+ * For A number of rows is 100, number of columns is 100
+ * For C number of rows is 100, number of columns is 100
+ * Scalar alpha is zero (operation is C:=beta*C)
+ */
+CTEST(cgeadd, c_api_matrix_n_100_m_100_alpha_zero)
+{
+    /* Complex scalars passed as two floats each; alpha is zero */
+    float alpha[] = {0.0f, 0.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    const blasint m = M, n = N;
+    const blasint lda = m, ldc = m;
+    const CBLAS_ORDER order = CblasColMajor;
+
+    float diff = check_cgeadd('C', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgeadd by comparing it against the caxpby-based reference
+ * with the following options:
+ *
+ * c api option order is column-major order
+ * For A number of rows is 100, number of columns is 100
+ * For C number of rows is 100, number of columns is 100
+ * Scalar beta is zero (operation is C:=alpha*A)
+ */
+CTEST(cgeadd, c_api_matrix_n_100_m_100_beta_zero)
+{
+    /* Complex scalars passed as two floats each; beta is zero */
+    float alpha[] = {3.0f, 1.5f};
+    float beta[] = {0.0f, 0.0f};
+
+    const blasint m = M, n = N;
+    const blasint lda = m, ldc = m;
+    const CBLAS_ORDER order = CblasColMajor;
+
+    float diff = check_cgeadd('C', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgeadd by comparing it against the caxpby-based reference
+ * with the following options:
+ *
+ * c api option order is column-major order
+ * For A number of rows is 100, number of columns is 100
+ * For C number of rows is 100, number of columns is 100
+ * Scalars alpha and beta are both zero (operation is C:=0)
+ */
+CTEST(cgeadd, c_api_matrix_n_100_m_100_alpha_beta_zero)
+{
+    /* Complex scalars passed as two floats each; both are zero */
+    float alpha[] = {0.0f, 0.0f};
+    float beta[] = {0.0f, 0.0f};
+
+    const blasint m = M, n = N;
+    const blasint lda = m, ldc = m;
+    const CBLAS_ORDER order = CblasColMajor;
+
+    float diff = check_cgeadd('C', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cgeadd by comparing it against the caxpby-based reference
+ * with the following options:
+ *
+ * c api option order is column-major order
+ * For A number of rows is 50, number of columns is 100
+ * For C number of rows is 50, number of columns is 100
+ */
+CTEST(cgeadd, c_api_matrix_n_100_m_50)
+{
+    /* Complex scalars passed as two floats each */
+    float alpha[] = {2.0f, 3.0f};
+    float beta[] = {2.0f, 4.0f};
+
+    /* Half as many rows as columns */
+    const blasint m = M / 2, n = N;
+    const blasint lda = m, ldc = m;
+    const CBLAS_ORDER order = CblasColMajor;
+
+    float diff = check_cgeadd('C', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param order -
+ * specifies whether A and C stored in
+ * row-major order or column-major order
+ *
+ * order is set to INVALID; parameter number 0 is expected to be reported.
+ */
+CTEST(cgeadd, c_api_xerbla_invalid_order)
+{
+    const int expected_info = 0;
+
+    /* All sizes are valid; only the storage order is wrong */
+    const blasint m = 1, n = 1;
+    const blasint lda = 1, ldc = 1;
+    CBLAS_ORDER order = INVALID;
+
+    int ok = check_badargs('C', order, m, n, lda, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param n -
+ * number of columns of A and C.
+ * Must be at least zero.
+ *
+ * c api option order is column-major order
+ * n is set to INVALID; parameter number 2 is expected to be reported.
+ */
+CTEST(cgeadd, c_api_xerbla_n_invalid)
+{
+    const int expected_info = 2;
+
+    const blasint m = 1, n = INVALID;
+    const blasint lda = 1, ldc = 1;
+    const CBLAS_ORDER order = CblasColMajor;
+
+    int ok = check_badargs('C', order, m, n, lda, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param n -
+ * number of columns of A and C.
+ * Must be at least zero.
+ *
+ * c api option order is row-major order
+ * n is set to INVALID; parameter number 2 is expected to be reported.
+ */
+CTEST(cgeadd, c_api_xerbla_n_invalid_row_major)
+{
+    const int expected_info = 2;
+
+    const blasint m = 1, n = INVALID;
+    const blasint lda = 1, ldc = 1;
+    const CBLAS_ORDER order = CblasRowMajor;
+
+    int ok = check_badargs('C', order, m, n, lda, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param m -
+ * number of rows of A and C
+ * Must be at least zero.
+ *
+ * c api option order is column-major order
+ * m is set to INVALID; parameter number 1 is expected to be reported.
+ */
+CTEST(cgeadd, c_api_xerbla_m_invalid)
+{
+    const int expected_info = 1;
+
+    const blasint m = INVALID, n = 1;
+    const blasint lda = 1, ldc = 1;
+    const CBLAS_ORDER order = CblasColMajor;
+
+    int ok = check_badargs('C', order, m, n, lda, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param m -
+ * number of rows of A and C
+ * Must be at least zero.
+ *
+ * c api option order is row-major order
+ * m is set to INVALID; parameter number 1 is expected to be reported.
+ */
+CTEST(cgeadd, c_api_xerbla_m_invalid_row_major)
+{
+    const int expected_info = 1;
+
+    const blasint m = INVALID, n = 1;
+    const blasint lda = 1, ldc = 1;
+    const CBLAS_ORDER order = CblasRowMajor;
+
+    int ok = check_badargs('C', order, m, n, lda, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param lda -
+ * specifies the leading dimension of A. Must be at least MAX(1, m).
+ *
+ * c api option order is column-major order
+ * lda is set to INVALID; parameter number 5 is expected to be reported.
+ */
+CTEST(cgeadd, c_api_xerbla_lda_invalid)
+{
+    const int expected_info = 5;
+
+    const blasint m = 1, n = 1;
+    const blasint lda = INVALID, ldc = 1;
+    const CBLAS_ORDER order = CblasColMajor;
+
+    int ok = check_badargs('C', order, m, n, lda, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param lda -
+ * specifies the leading dimension of A.
+ * NOTE(review): for row-major storage the bound is presumably
+ * MAX(1, n) rather than MAX(1, m) - confirm against cblas_cgeadd.
+ *
+ * c api option order is row-major order
+ * lda is set to INVALID; parameter number 5 is expected to be reported.
+ */
+CTEST(cgeadd, c_api_xerbla_lda_invalid_row_major)
+{
+    const int expected_info = 5;
+
+    const blasint m = 1, n = 1;
+    const blasint lda = INVALID, ldc = 1;
+    const CBLAS_ORDER order = CblasRowMajor;
+
+    int ok = check_badargs('C', order, m, n, lda, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param ldc -
+ * specifies the leading dimension of C. Must be at least MAX(1, m).
+ *
+ * c api option order is column-major order
+ * ldc is set to INVALID; parameter number 8 is expected to be reported.
+ */
+CTEST(cgeadd, c_api_xerbla_ldc_invalid)
+{
+    const int expected_info = 8;
+
+    const blasint m = 1, n = 1;
+    const blasint lda = 1, ldc = INVALID;
+    const CBLAS_ORDER order = CblasColMajor;
+
+    int ok = check_badargs('C', order, m, n, lda, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param ldc -
+ * specifies the leading dimension of C.
+ * NOTE(review): for row-major storage the bound is presumably
+ * MAX(1, n) rather than MAX(1, m) - confirm against cblas_cgeadd.
+ *
+ * c api option order is row-major order
+ * ldc is set to INVALID; parameter number 8 is expected to be reported.
+ */
+CTEST(cgeadd, c_api_xerbla_ldc_invalid_row_major)
+{
+    const int expected_info = 8;
+
+    const blasint m = 1, n = 1;
+    const blasint lda = 1, ldc = INVALID;
+    const CBLAS_ORDER order = CblasRowMajor;
+
+    int ok = check_badargs('C', order, m, n, lda, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Check the case where n - number of columns of A and C - equals zero.
+ *
+ * c api option order is column-major order
+ */
+CTEST(cgeadd, c_api_n_zero)
+{
+    /* Complex scalars passed as two floats each */
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    const blasint m = 1, n = 0;
+    const blasint lda = 1, ldc = 1;
+    const CBLAS_ORDER order = CblasColMajor;
+
+    float diff = check_cgeadd('C', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Check the case where m - number of rows of A and C - equals zero.
+ *
+ * c api option order is column-major order
+ */
+CTEST(cgeadd, c_api_m_zero)
+{
+    /* Complex scalars passed as two floats each */
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    const blasint m = 0, n = 1;
+    const blasint lda = 1, ldc = 1;
+    const CBLAS_ORDER order = CblasColMajor;
+
+    float diff = check_cgeadd('C', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_cgemm.c b/utest/test_extensions/test_cgemm.c
new file mode 100644
index 000000000..cd38d710b
--- /dev/null
+++ b/utest/test_extensions/test_cgemm.c
@@ -0,0 +1,273 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+#include "common.h"
+
+/* Largest matrix dimension used by the cgemm tests in this file */
+#define DATASIZE 100
+/* NOTE(review): INCREMENT does not appear to be referenced in this file */
+#define INCREMENT 2
+
+/*
+ * Work buffers shared by all cgemm tests.
+ * Each buffer holds DATASIZE * DATASIZE * 2 floats.
+ *
+ * *_test   - operands handed to the cgemm call under test
+ * *_verify - copies handed to the reference cgemm call
+ */
+struct DATA_CGEMM {
+	float a_test[DATASIZE * DATASIZE * 2];
+    float a_verify[DATASIZE * DATASIZE * 2];
+	float b_test[DATASIZE * DATASIZE * 2];
+    float b_verify[DATASIZE * DATASIZE * 2];
+    float c_test[DATASIZE * DATASIZE * 2];
+	float c_verify[DATASIZE * DATASIZE * 2];
+};
+
+/* Everything below is compiled only when BUILD_COMPLEX is defined */
+#ifdef BUILD_COMPLEX
+/* Single static instance of the work buffers */
+static struct DATA_CGEMM data_cgemm;
+
+/**
+ * Test the conjugate-no-transpose ('R') option of cgemm.
+ *
+ * Random A, B and C are generated and copied. Every operand requested
+ * with 'R' is conjugated in place in its copy (cblas_cimatcopy with
+ * CblasConjNoTrans and alpha = 1) and handed to the reference cgemm
+ * call as 'N'. The call under test receives the untouched data and the
+ * original trans characters. Both calls use the Fortran interface.
+ *
+ * param transa specifies op(A), the transposition (conjugation) operation applied to A
+ * param transb specifies op(B), the transposition (conjugation) operation applied to B
+ * param m specifies the number of rows of the matrix op(A) and of the matrix C
+ * param n specifies the number of columns of the matrix op(B) and the number of columns of the matrix C
+ * param k specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B)
+ * param alpha - scaling factor for the matrix-matrix product
+ * param lda - leading dimension of matrix A
+ * param ldb - leading dimension of matrix B
+ * param beta - scaling factor for matrix C
+ * param ldc - leading dimension of matrix C
+ * return norm of difference
+ */
+static float check_cgemm(char transa, char transb, blasint m, blasint n, blasint k,
+                         float *alpha, blasint lda, blasint ldb, float *beta, blasint ldc)
+{
+    blasint i;
+    float alpha_conj[] = {1.0f, 0.0f};
+    char transa_verify = transa;
+    char transb_verify = transb;
+
+    /* Shapes of A and B as stored (column-major): without transposition
+     * A is m x k and B is k x n; 'T' and 'C' swap the stored shape. */
+    blasint arows = m, acols = k;
+    blasint brows = k, bcols = n;
+
+    if (transa == 'T' || transa == 'C'){
+        arows = k; acols = m;
+    }
+
+    if (transb == 'T' || transb == 'C'){
+        brows = n; bcols = k;
+    }
+
+    /* A column-major matrix occupies (columns * leading dimension) complex
+     * elements, i.e. twice as many floats */
+    srand_generate(data_cgemm.a_test, acols * lda * 2);
+    srand_generate(data_cgemm.b_test, bcols * ldb * 2);
+    srand_generate(data_cgemm.c_test, n * ldc * 2);
+
+    for (i = 0; i < acols * lda * 2; i++)
+        data_cgemm.a_verify[i] = data_cgemm.a_test[i];
+
+    for (i = 0; i < bcols * ldb * 2; i++)
+        data_cgemm.b_verify[i] = data_cgemm.b_test[i];
+
+    for (i = 0; i < n * ldc * 2; i++)
+        data_cgemm.c_verify[i] = data_cgemm.c_test[i];
+
+    /* Replace 'R' by an explicit conjugation followed by 'N'.
+     * cblas_cimatcopy takes (rows, cols) of the stored matrix; the counts
+     * were previously passed swapped, which only worked for m == n == k. */
+    if (transa == 'R'){
+        cblas_cimatcopy(CblasColMajor, CblasConjNoTrans, arows, acols, alpha_conj, data_cgemm.a_verify, lda, lda);
+        transa_verify = 'N';
+    }
+
+    if (transb == 'R'){
+        cblas_cimatcopy(CblasColMajor, CblasConjNoTrans, brows, bcols, alpha_conj, data_cgemm.b_verify, ldb, ldb);
+        transb_verify = 'N';
+    }
+
+    /* Reference result computed from the pre-conjugated copies */
+    BLASFUNC(cgemm)(&transa_verify, &transb_verify, &m, &n, &k, alpha, data_cgemm.a_verify, &lda,
+                    data_cgemm.b_verify, &ldb, beta, data_cgemm.c_verify, &ldc);
+
+    /* Result under test computed from the original data and trans flags */
+    BLASFUNC(cgemm)(&transa, &transb, &m, &n, &k, alpha, data_cgemm.a_test, &lda,
+                    data_cgemm.b_test, &ldb, beta, data_cgemm.c_test, &ldc);
+
+    return smatrix_difference(data_cgemm.c_test, data_cgemm.c_verify, m, n, ldc*2);
+}
+
+/**
+ * Test cgemm's conjugate-no-transpose option by comparing it with a
+ * cgemm call on explicitly conjugated, non-transposed matrices
+ * (see check_cgemm).
+ * Test with the following options:
+ *
+ * matrix A is conjugate and transposed ('C')
+ * matrix B is conjugate and not transposed ('R')
+ */
+CTEST(cgemm, conjtransa_conjnotransb)
+{
+    /* Complex scalars passed as two floats each */
+    float alpha[] = {-2.0f, 1.0f};
+    float beta[] = {1.0f, -1.0f};
+
+    char transa = 'C', transb = 'R';
+
+    /* Square problem: every dimension and leading dimension is DATASIZE */
+    blasint m = DATASIZE, n = DATASIZE, k = DATASIZE;
+    blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE;
+
+    float diff = check_cgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Test cgemm's conjugate-no-transpose option by comparing it with a
+ * cgemm call on explicitly conjugated, non-transposed matrices
+ * (see check_cgemm).
+ * Test with the following options:
+ *
+ * matrix A is not conjugate and not transposed ('N')
+ * matrix B is conjugate and not transposed ('R')
+ */
+CTEST(cgemm, notransa_conjnotransb)
+{
+    /* Complex scalars passed as two floats each */
+    float alpha[] = {-2.0f, 1.0f};
+    float beta[] = {1.0f, -1.0f};
+
+    char transa = 'N', transb = 'R';
+
+    /* Square problem: every dimension and leading dimension is DATASIZE */
+    blasint m = DATASIZE, n = DATASIZE, k = DATASIZE;
+    blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE;
+
+    float diff = check_cgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Test cgemm's conjugate-no-transpose option by comparing it with a
+ * cgemm call on explicitly conjugated, non-transposed matrices
+ * (see check_cgemm).
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not transposed ('R')
+ * matrix B is conjugate and transposed ('C')
+ */
+CTEST(cgemm, conjnotransa_conjtransb)
+{
+    /* Complex scalars passed as two floats each */
+    float alpha[] = {-2.0f, 1.0f};
+    float beta[] = {1.0f, -1.0f};
+
+    char transa = 'R', transb = 'C';
+
+    /* Square problem: every dimension and leading dimension is DATASIZE */
+    blasint m = DATASIZE, n = DATASIZE, k = DATASIZE;
+    blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE;
+
+    float diff = check_cgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Test cgemm's conjugate-no-transpose option by comparing it with a
+ * cgemm call on explicitly conjugated, non-transposed matrices
+ * (see check_cgemm).
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not transposed ('R')
+ * matrix B is not conjugate and not transposed ('N')
+ */
+CTEST(cgemm, conjnotransa_notransb)
+{
+    /* Complex scalars passed as two floats each */
+    float alpha[] = {-2.0f, 1.0f};
+    float beta[] = {1.0f, -1.0f};
+
+    char transa = 'R', transb = 'N';
+
+    /* Square problem: every dimension and leading dimension is DATASIZE */
+    blasint m = DATASIZE, n = DATASIZE, k = DATASIZE;
+    blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE;
+
+    float diff = check_cgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Test cgemm's conjugate-no-transpose option by comparing it with a
+ * cgemm call on explicitly conjugated, non-transposed matrices
+ * (see check_cgemm).
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not transposed ('R')
+ * matrix B is conjugate and not transposed ('R')
+ */
+CTEST(cgemm, conjnotransa_conjnotransb)
+{
+    /* Complex scalars passed as two floats each */
+    float alpha[] = {-2.0f, 1.0f};
+    float beta[] = {1.0f, -1.0f};
+
+    char transa = 'R', transb = 'R';
+
+    /* Square problem: every dimension and leading dimension is DATASIZE */
+    blasint m = DATASIZE, n = DATASIZE, k = DATASIZE;
+    blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE;
+
+    float diff = check_cgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Test cgemm's conjugate-no-transpose option by comparing it with a
+ * cgemm call on explicitly conjugated, non-transposed matrices
+ * (see check_cgemm).
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not transposed ('R')
+ * matrix B is transposed ('T')
+ */
+CTEST(cgemm, conjnotransa_transb)
+{
+    /* Complex scalars passed as two floats each */
+    float alpha[] = {-2.0f, 1.0f};
+    float beta[] = {1.0f, -1.0f};
+
+    char transa = 'R', transb = 'T';
+
+    /* Square problem: every dimension and leading dimension is DATASIZE */
+    blasint m = DATASIZE, n = DATASIZE, k = DATASIZE;
+    blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE;
+
+    float diff = check_cgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Test cgemm's conjugate-no-transpose option by comparing it with a
+ * cgemm call on explicitly conjugated, non-transposed matrices
+ * (see check_cgemm).
+ * Test with the following options:
+ *
+ * matrix A is transposed ('T')
+ * matrix B is conjugate and not transposed ('R')
+ */
+CTEST(cgemm, transa_conjnotransb)
+{
+    /* Complex scalars passed as two floats each */
+    float alpha[] = {-2.0f, 1.0f};
+    float beta[] = {1.0f, -1.0f};
+
+    char transa = 'T', transb = 'R';
+
+    /* Square problem: every dimension and leading dimension is DATASIZE */
+    blasint m = DATASIZE, n = DATASIZE, k = DATASIZE;
+    blasint lda = DATASIZE, ldb = DATASIZE, ldc = DATASIZE;
+
+    float diff = check_cgemm(transa, transb, m, n, k, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_cgemmt.c b/utest/test_extensions/test_cgemmt.c
new file mode 100644
index 000000000..ed9279933
--- /dev/null
+++ b/utest/test_extensions/test_cgemmt.c
@@ -0,0 +1,2010 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+
+struct DATA_CGEMMT {
+    float a_test[2 * DATASIZE * DATASIZE];   /* operand A, two floats per complex entry */
+    float b_test[2 * DATASIZE * DATASIZE];   /* operand B */
+    float c_test[2 * DATASIZE * DATASIZE];   /* C handed to the cgemmt call under test */
+    float c_verify[2 * DATASIZE * DATASIZE]; /* reference C, later the difference */
+    float c_gemm[2 * DATASIZE * DATASIZE];   /* full cgemm output used by the reference */
+};
+
+#ifdef BUILD_COMPLEX
+static struct DATA_CGEMMT data_cgemmt;
+
+/**
+ * Build the reference ("trusted") cgemmt result from a full cgemm.
+ *
+ * cgemm is run on data_cgemmt.a_test / b_test into data_cgemmt.c_gemm, then
+ * one triangle of c_gemm (diagonal included) is copied over c_verify.
+ * NOTE(review): the triangle is chosen from uplo only, `order` is not
+ * consulted here - confirm this is what the row-major callers expect.
+ *
+ * param api - 'F' calls the Fortran cgemm, anything else calls cblas_cgemm
+ * param order - storage order forwarded to cblas_cgemm (unused for 'F')
+ * param uplo - 'L'/CblasLower or any other value, selects the copied triangle
+ * param transa, transb - op(A) / op(B) selectors forwarded to cgemm
+ * param m - used for both M and N of cgemm; order of the square result C
+ * param k - inner dimension forwarded to cgemm
+ * param alpha, beta - complex scalars as {real, imag} pairs
+ * param lda, ldb, ldc - leading dimensions in complex elements
+ */
+static void cgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char transa,
+                           char transb, blasint m, blasint k, float *alpha, blasint lda,
+                           blasint ldb, float *beta, blasint ldc)
+{
+    const int copy_lower = (uplo == 'L' || uplo == CblasLower);
+    blasint line, pos, pos_begin, pos_end, idx, stride;
+
+    /* Reference product: the full m x m result is written to c_gemm. */
+    if (api != 'F') {
+        cblas_cgemm(order, transa, transb, m, m, k, alpha, data_cgemmt.a_test, lda,
+                    data_cgemmt.b_test, ldb, beta, data_cgemmt.c_gemm, ldc);
+    } else {
+        BLASFUNC(cgemm)(&transa, &transb, &m, &m, &k, alpha, data_cgemmt.a_test, &lda,
+                        data_cgemmt.b_test, &ldb, beta, data_cgemmt.c_gemm, &ldc);
+    }
+
+    stride = 2 * ldc; /* floats per stored line of C (2 per complex value) */
+
+    for (line = 0; line < m; line++) {
+        /* Copy float offsets [pos_begin, pos_end) of this line, in pairs. */
+        if (copy_lower) {
+            pos_begin = 2 * line;
+            pos_end = 2 * m;
+        } else {
+            pos_begin = 0;
+            pos_end = 2 * line + 2;
+        }
+        for (pos = pos_begin; pos < pos_end; pos += 2) {
+            idx = line * stride + pos;
+            data_cgemmt.c_verify[idx] = data_cgemmt.c_gemm[idx];
+            data_cgemmt.c_verify[idx + 1] = data_cgemmt.c_gemm[idx + 1];
+        }
+    }
+}
+
+/**
+ * Compare cgemmt against the cgemmt_trusted reference on random data
+ *
+ * A, B and C are filled by srand_generate; C is duplicated so that the
+ * reference and the tested routine start from identical contents.
+ *
+ * param api - 'F' tests the Fortran entry point, otherwise cblas_cgemmt
+ * param order - row or column major (the 'F' tests here pass CblasColMajor)
+ * param uplo - triangle of C selector, forwarded unchanged
+ * param transa, transb - op(A) / op(B) selectors; 'T', 'C', CblasTrans and
+ *        CblasConjTrans are treated as transposed when sizing A and B
+ * param m - order of C
+ * param k - inner dimension of the product
+ * param alpha, beta - complex scalars as {real, imag} pairs
+ * param lda, ldb, ldc - leading dimensions in complex elements
+ * return snrm2 of (reference - tested) over m * ldc * 2 floats, divided by that count
+ */
+static float check_cgemmt(char api, enum CBLAS_ORDER order, char uplo, char transa,
+                          char transb, blasint m, blasint k, float *alpha, blasint lda,
+                          blasint ldb, float *beta, blasint ldc)
+{
+    const int col_major = (order == CblasColMajor);
+    const int a_trans = (transa == 'T' || transa == 'C' ||
+                         transa == CblasTrans || transa == CblasConjTrans);
+    const int b_trans = (transb == 'T' || transb == 'C' ||
+                         transb == CblasTrans || transb == CblasConjTrans);
+    blasint a_cols, b_cols;
+    /* Number of floats spanned by C (2 per complex entry). */
+    blasint size_c = m * ldc * 2;
+    blasint inc = 1;
+    blasint pos;
+
+    /*
+     * Count of ld-strided lines stored for A and B. Transposing and
+     * switching to row major each swap the two candidates, so only
+     * whether the two flags agree matters.
+     */
+    a_cols = (col_major == a_trans) ? m : k;
+    b_cols = (col_major == b_trans) ? k : m;
+
+    /* Random inputs, generated in the order A, B, C. */
+    srand_generate(data_cgemmt.a_test, a_cols * lda * 2);
+    srand_generate(data_cgemmt.b_test, b_cols * ldb * 2);
+    srand_generate(data_cgemmt.c_test, size_c);
+
+    /* Both reference buffers start from the same C as the tested call. */
+    for (pos = 0; pos < size_c; pos++) {
+        data_cgemmt.c_verify[pos] = data_cgemmt.c_test[pos];
+        data_cgemmt.c_gemm[pos] = data_cgemmt.c_test[pos];
+    }
+
+    /* Expected result is produced in c_verify. */
+    cgemmt_trusted(api, order, uplo, transa, transb, m, k, alpha, lda, ldb, beta, ldc);
+
+    /* The routine under test works on c_test. */
+    if (api != 'F') {
+        cblas_cgemmt(order, uplo, transa, transb, m, k, alpha, data_cgemmt.a_test, lda,
+                     data_cgemmt.b_test, ldb, beta, data_cgemmt.c_test, ldc);
+    } else {
+        BLASFUNC(cgemmt)(&uplo, &transa, &transb, &m, &k, alpha, data_cgemmt.a_test,
+                         &lda, data_cgemmt.b_test, &ldb, beta, data_cgemmt.c_test, &ldc);
+    }
+
+    /* Element-wise difference, left in c_verify. */
+    for (pos = 0; pos < size_c; pos++)
+        data_cgemmt.c_verify[pos] -= data_cgemmt.c_test[pos];
+
+    /* 2-norm of the difference, scaled by the number of floats in C. */
+    return BLASFUNC(snrm2)(&size_c, data_cgemmt.c_verify, &inc) / size_c;
+}
+
+/**
+ * Call cgemmt with the given (presumably invalid) arguments and report
+ * what the xerbla test hook observed
+ *
+ * param api - 'F' calls the Fortran entry point, otherwise cblas_cgemmt
+ * param order - storage order, used by the cblas_cgemmt call only
+ * param uplo - triangle selector passed to cgemmt
+ * param transa, transb - op(A) / op(B) selectors passed to cgemmt
+ * param m - order of C
+ * param k - inner dimension of the product
+ * param lda, ldb, ldc - leading dimensions of A, B and C
+ * param expected_info - argument index handed to set_xerbla together with
+ *        the routine name "CGEMMT "
+ * return result of check_error(); presumably TRUE on a match - TODO confirm
+ */
+static int check_badargs(char api, enum CBLAS_ORDER order, char uplo, char transa,
+                         char transb, blasint m, blasint k, blasint lda, blasint ldb,
+                         blasint ldc, int expected_info)
+{
+    float beta[] = {0.0f, 0.0f};
+    float alpha[] = {1.0f, 1.0f};
+
+    /* Register the expectation before making the call under test. */
+    set_xerbla("CGEMMT ", expected_info);
+    if (api != 'F') {
+        cblas_cgemmt(order, uplo, transa, transb, m, k, alpha, data_cgemmt.a_test, lda,
+                     data_cgemmt.b_test, ldb, beta, data_cgemmt.c_test, ldc);
+    } else {
+        BLASFUNC(cgemmt)(&uplo, &transa, &transb, &m, &k, alpha, data_cgemmt.a_test,
+                         &lda, data_cgemmt.b_test, &ldb, beta, data_cgemmt.c_test, &ldc);
+    }
+    return check_error();
+}
+
+/**
+ * Fortran API test.
+ * cgemmt is checked against a cgemm-based reference (check_cgemmt),
+ * column major, with:
+ *
+ * uplo = 'U' (upper triangle of C)
+ * transa = 'N' (A not transposed)
+ * transb = 'N' (B not transposed)
+ */
+CTEST(cgemmt, upper_M_50_K_50_a_notrans_b_notrans)
+{
+    char uplo = 'U';
+    char transa = 'N', transb = 'N';
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {-1.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+    float residual;
+
+    residual = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * Fortran API test.
+ * cgemmt is checked against a cgemm-based reference (check_cgemmt),
+ * column major, with:
+ *
+ * uplo = 'U' (upper triangle of C)
+ * transa = 'T' (A transposed)
+ * transb = 'N' (B not transposed)
+ */
+CTEST(cgemmt, upper_M_50_K_25_a_trans_b_notrans)
+{
+    char uplo = 'U';
+    char transa = 'T', transb = 'N';
+    blasint M = 50, K = 25;
+    blasint lda = 25, ldb = 25, ldc = 50;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+    float residual;
+
+    residual = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * Fortran API test.
+ * cgemmt is checked against a cgemm-based reference (check_cgemmt),
+ * column major, with:
+ *
+ * uplo = 'U' (upper triangle of C)
+ * transa = 'N' (A not transposed)
+ * transb = 'T' (B transposed)
+ */
+CTEST(cgemmt, upper_M_25_K_50_a_notrans_b_trans)
+{
+    char uplo = 'U';
+    char transa = 'N', transb = 'T';
+    blasint M = 25, K = 50;
+    blasint lda = 25, ldb = 25, ldc = 25;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {0.0f, 0.0f};
+    float residual;
+
+    residual = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * Fortran API test.
+ * cgemmt is checked against a cgemm-based reference (check_cgemmt),
+ * column major, with:
+ *
+ * uplo = 'U' (upper triangle of C)
+ * transa = 'T' (A transposed)
+ * transb = 'T' (B transposed)
+ */
+CTEST(cgemmt, upper_M_50_K_50_a_trans_b_trans)
+{
+    char uplo = 'U';
+    char transa = 'T', transb = 'T';
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {-1.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+    float residual;
+
+    residual = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * Fortran API test.
+ * cgemmt is checked against a cgemm-based reference (check_cgemmt),
+ * column major, with:
+ *
+ * uplo = 'U' (upper triangle of C)
+ * transa = 'R' (A conjugated, not transposed)
+ * transb = 'R' (B conjugated, not transposed)
+ */
+CTEST(cgemmt, upper_M_25_K_50_a_conjnotrans_b_conjnotrans)
+{
+    char uplo = 'U';
+    char transa = 'R', transb = 'R';
+    blasint M = 25, K = 50;
+    blasint lda = 25, ldb = 50, ldc = 25;
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {-1.0f, 2.0f};
+    float residual;
+
+    residual = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * Fortran API test.
+ * cgemmt is checked against a cgemm-based reference (check_cgemmt),
+ * column major, with:
+ *
+ * uplo = 'U' (upper triangle of C)
+ * transa = 'C' (A conjugate transposed)
+ * transb = 'R' (B conjugated, not transposed)
+ */
+CTEST(cgemmt, upper_M_50_K_50_a_conjtrans_b_conjnotrans)
+{
+    char uplo = 'U';
+    char transa = 'C', transb = 'R';
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+    float residual;
+
+    residual = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * Fortran API test.
+ * cgemmt is checked against a cgemm-based reference (check_cgemmt),
+ * column major, with:
+ *
+ * uplo = 'U' (upper triangle of C)
+ * transa = 'R' (A conjugated, not transposed)
+ * transb = 'C' (B conjugate transposed)
+ */
+CTEST(cgemmt, upper_M_50_K_50_a_conjnotrans_b_conjtrans)
+{
+    char uplo = 'U';
+    char transa = 'R', transb = 'C';
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {-1.0f, 2.0f};
+    float residual;
+
+    residual = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * Fortran API test.
+ * cgemmt is checked against a cgemm-based reference (check_cgemmt),
+ * column major, with:
+ *
+ * uplo = 'U' (upper triangle of C)
+ * transa = 'C' (A conjugate transposed)
+ * transb = 'C' (B conjugate transposed)
+ */
+CTEST(cgemmt, upper_M_50_K_25_a_conjtrans_b_conjtrans)
+{
+    char uplo = 'U';
+    char transa = 'C', transb = 'C';
+    blasint M = 50, K = 25;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {-1.0f, 2.0f};
+    float residual;
+
+    residual = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * Fortran API test.
+ * cgemmt is checked against a cgemm-based reference (check_cgemmt),
+ * column major, with:
+ *
+ * uplo = 'U' (upper triangle of C)
+ * alpha = 0 + 0i, A and B not transposed
+ */
+CTEST(cgemmt, upper_alpha_zero)
+{
+    char uplo = 'U';
+    char transa = 'N', transb = 'N';
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {0.0f, 0.0f};
+    float beta[] = {2.0f, 1.0f};
+    float residual;
+
+    residual = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * Fortran API test.
+ * cgemmt is checked against a cgemm-based reference (check_cgemmt),
+ * column major, with:
+ *
+ * uplo = 'U' (upper triangle of C)
+ * beta = 1 + 0i, A and B not transposed
+ */
+CTEST(cgemmt, upper_beta_one)
+{
+    char uplo = 'U';
+    char transa = 'N', transb = 'N';
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 0.0f};
+    float residual;
+
+    residual = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * Fortran API test.
+ * cgemmt is checked against a cgemm-based reference (check_cgemmt),
+ * column major, with:
+ *
+ * uplo = 'L' (lower triangle of C)
+ * transa = 'N' (A not transposed)
+ * transb = 'N' (B not transposed)
+ */
+CTEST(cgemmt, lower_M_50_K_50_a_notrans_b_notrans)
+{
+    char uplo = 'L';
+    char transa = 'N', transb = 'N';
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {-1.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+    float residual;
+
+    residual = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * Fortran API test.
+ * cgemmt is checked against a cgemm-based reference (check_cgemmt),
+ * column major, with:
+ *
+ * uplo = 'L' (lower triangle of C)
+ * transa = 'T' (A transposed)
+ * transb = 'N' (B not transposed)
+ */
+CTEST(cgemmt, lower_M_50_K_25_a_trans_b_notrans)
+{
+    char uplo = 'L';
+    char transa = 'T', transb = 'N';
+    blasint M = 50, K = 25;
+    blasint lda = 25, ldb = 25, ldc = 50;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+    float residual;
+
+    residual = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * Fortran API test.
+ * cgemmt is checked against a cgemm-based reference (check_cgemmt),
+ * column major, with:
+ *
+ * uplo = 'L' (lower triangle of C)
+ * transa = 'N' (A not transposed)
+ * transb = 'T' (B transposed)
+ */
+CTEST(cgemmt, lower_M_25_K_50_a_notrans_b_trans)
+{
+    char uplo = 'L';
+    char transa = 'N', transb = 'T';
+    blasint M = 25, K = 50;
+    blasint lda = 25, ldb = 25, ldc = 25;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {0.0f, 0.0f};
+    float residual;
+
+    residual = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * Fortran API test.
+ * cgemmt is checked against a cgemm-based reference (check_cgemmt),
+ * column major, with:
+ *
+ * uplo = 'L' (lower triangle of C)
+ * transa = 'T' (A transposed)
+ * transb = 'T' (B transposed)
+ */
+CTEST(cgemmt, lower_M_50_K_50_a_trans_b_trans)
+{
+    char uplo = 'L';
+    char transa = 'T', transb = 'T';
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {-1.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+    float residual;
+
+    residual = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * Fortran API test.
+ * cgemmt is checked against a cgemm-based reference (check_cgemmt),
+ * column major, with:
+ *
+ * uplo = 'L' (lower triangle of C)
+ * transa = 'R' (A conjugated, not transposed)
+ * transb = 'R' (B conjugated, not transposed)
+ */
+CTEST(cgemmt, lower_M_25_K_50_a_conjnotrans_b_conjnotrans)
+{
+    char uplo = 'L';
+    char transa = 'R', transb = 'R';
+    blasint M = 25, K = 50;
+    blasint lda = 25, ldb = 50, ldc = 25;
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {-1.0f, 2.0f};
+    float residual;
+
+    residual = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * Fortran API test.
+ * cgemmt is checked against a cgemm-based reference (check_cgemmt),
+ * column major, with:
+ *
+ * uplo = 'L' (lower triangle of C)
+ * transa = 'C' (A conjugate transposed)
+ * transb = 'R' (B conjugated, not transposed)
+ */
+CTEST(cgemmt, lower_M_50_K_50_a_conjtrans_b_conjnotrans)
+{
+    char uplo = 'L';
+    char transa = 'C', transb = 'R';
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+    float residual;
+
+    residual = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * Fortran API test.
+ * cgemmt is checked against a cgemm-based reference (check_cgemmt),
+ * column major, with:
+ *
+ * uplo = 'L' (lower triangle of C)
+ * transa = 'R' (A conjugated, not transposed)
+ * transb = 'C' (B conjugate transposed)
+ */
+CTEST(cgemmt, lower_M_50_K_50_a_conjnotrans_b_conjtrans)
+{
+    char uplo = 'L';
+    char transa = 'R', transb = 'C';
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {-1.0f, 2.0f};
+    float residual;
+
+    residual = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * Fortran API test.
+ * cgemmt is checked against a cgemm-based reference (check_cgemmt),
+ * column major, with:
+ *
+ * uplo = 'L' (lower triangle of C)
+ * transa = 'C' (A conjugate transposed)
+ * transb = 'C' (B conjugate transposed)
+ */
+CTEST(cgemmt, lower_M_50_K_25_a_conjtrans_b_conjtrans)
+{
+    char uplo = 'L';
+    char transa = 'C', transb = 'C';
+    blasint M = 50, K = 25;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {-1.0f, 2.0f};
+    float residual;
+
+    residual = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * Fortran API test.
+ * cgemmt is checked against a cgemm-based reference (check_cgemmt),
+ * column major, with:
+ *
+ * uplo = 'L' (lower triangle of C)
+ * alpha = 0 + 0i, A and B not transposed
+ */
+CTEST(cgemmt, lower_alpha_zero)
+{
+    char uplo = 'L';
+    char transa = 'N', transb = 'N';
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {0.0f, 0.0f};
+    float beta[] = {2.0f, 1.0f};
+    float residual;
+
+    residual = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * Fortran API test.
+ * cgemmt is checked against a cgemm-based reference (check_cgemmt),
+ * column major, with:
+ *
+ * uplo = 'L' (lower triangle of C)
+ * beta = 1 + 0i, A and B not transposed
+ */
+CTEST(cgemmt, lower_beta_one)
+{
+    char uplo = 'L';
+    char transa = 'N', transb = 'N';
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 0.0f};
+    float residual;
+
+    residual = check_cgemmt('F', CblasColMajor, uplo, transa, transb,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * C API test.
+ * cblas_cgemmt is checked against a cgemm-based reference (check_cgemmt)
+ * with:
+ *
+ * order = CblasColMajor
+ * uplo = CblasUpper (upper triangle of C)
+ * transa = CblasNoTrans
+ * transb = CblasNoTrans
+ */
+CTEST(cgemmt, c_api_colmajor_upper_M_50_K_50_a_notrans_b_notrans)
+{
+    float beta[] = {2.0f, 1.0f};
+    float alpha[] = {-1.0f, 1.0f};
+    blasint lda = 50, ldb = 50, ldc = 50;
+    blasint M = 50, K = 50;
+    float residual;
+
+    residual = check_cgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * C API test.
+ * cblas_cgemmt is checked against a cgemm-based reference (check_cgemmt)
+ * with:
+ *
+ * order = CblasColMajor
+ * uplo = CblasUpper (upper triangle of C)
+ * transa = CblasTrans
+ * transb = CblasNoTrans
+ */
+CTEST(cgemmt, c_api_colmajor_upper_M_50_K_25_a_trans_b_notrans)
+{
+    float beta[] = {2.0f, 1.0f};
+    float alpha[] = {1.0f, 1.0f};
+    blasint lda = 25, ldb = 25, ldc = 50;
+    blasint M = 50, K = 25;
+    float residual;
+
+    residual = check_cgemmt('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * C API test.
+ * cblas_cgemmt is checked against a cgemm-based reference (check_cgemmt)
+ * with:
+ *
+ * order = CblasColMajor
+ * uplo = CblasUpper (upper triangle of C)
+ * transa = CblasNoTrans
+ * transb = CblasTrans
+ */
+CTEST(cgemmt, c_api_colmajor_upper_M_25_K_50_a_notrans_b_trans)
+{
+    float beta[] = {0.0f, 0.0f};
+    float alpha[] = {1.0f, 1.0f};
+    blasint lda = 25, ldb = 25, ldc = 25;
+    blasint M = 25, K = 50;
+    float residual;
+
+    residual = check_cgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasTrans,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * C API test.
+ * cblas_cgemmt is checked against a cgemm-based reference (check_cgemmt)
+ * with:
+ *
+ * order = CblasColMajor
+ * uplo = CblasUpper (upper triangle of C)
+ * transa = CblasTrans
+ * transb = CblasTrans
+ */
+CTEST(cgemmt, c_api_colmajor_upper_M_50_K_50_a_trans_b_trans)
+{
+    float beta[] = {2.0f, 1.0f};
+    float alpha[] = {-1.0f, 1.0f};
+    blasint lda = 50, ldb = 50, ldc = 50;
+    blasint M = 50, K = 50;
+    float residual;
+
+    residual = check_cgemmt('C', CblasColMajor, CblasUpper, CblasTrans, CblasTrans,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * C API test.
+ * cblas_cgemmt is checked against a cgemm-based reference (check_cgemmt)
+ * with:
+ *
+ * order = CblasColMajor
+ * uplo = CblasUpper (upper triangle of C)
+ * transa = CblasConjNoTrans
+ * transb = CblasConjNoTrans
+ */
+CTEST(cgemmt, c_api_colmajor_upper_M_25_K_50_a_conjnotrans_b_conjnotrans)
+{
+    float beta[] = {-1.0f, 2.0f};
+    float alpha[] = {2.0f, 1.0f};
+    blasint lda = 25, ldb = 50, ldc = 25;
+    blasint M = 25, K = 50;
+    float residual;
+
+    residual = check_cgemmt('C', CblasColMajor, CblasUpper, CblasConjNoTrans, CblasConjNoTrans,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * C API test.
+ * cblas_cgemmt is checked against a cgemm-based reference (check_cgemmt)
+ * with:
+ *
+ * order = CblasColMajor
+ * uplo = CblasUpper (upper triangle of C)
+ * transa = CblasConjTrans
+ * transb = CblasConjNoTrans
+ */
+CTEST(cgemmt, c_api_colmajor_upper_M_50_K_50_a_conjtrans_b_conjnotrans)
+{
+    float beta[] = {1.0f, 1.0f};
+    float alpha[] = {1.0f, 1.0f};
+    blasint lda = 50, ldb = 50, ldc = 50;
+    blasint M = 50, K = 50;
+    float residual;
+
+    residual = check_cgemmt('C', CblasColMajor, CblasUpper, CblasConjTrans, CblasConjNoTrans,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * C API test.
+ * cblas_cgemmt is checked against a cgemm-based reference (check_cgemmt)
+ * with:
+ *
+ * order = CblasColMajor
+ * uplo = CblasUpper (upper triangle of C)
+ * transa = CblasConjNoTrans
+ * transb = CblasConjTrans
+ */
+CTEST(cgemmt, c_api_colmajor_upper_M_50_K_50_a_conjnotrans_b_conjtrans)
+{
+    float beta[] = {-1.0f, 2.0f};
+    float alpha[] = {2.0f, 1.0f};
+    blasint lda = 50, ldb = 50, ldc = 50;
+    blasint M = 50, K = 50;
+    float residual;
+
+    residual = check_cgemmt('C', CblasColMajor, CblasUpper, CblasConjNoTrans, CblasConjTrans,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * C API test.
+ * cblas_cgemmt is checked against a cgemm-based reference (check_cgemmt)
+ * with:
+ *
+ * order = CblasColMajor
+ * uplo = CblasUpper (upper triangle of C)
+ * transa = CblasConjTrans
+ * transb = CblasConjTrans
+ */
+CTEST(cgemmt, c_api_colmajor_upper_M_50_K_25_a_conjtrans_b_conjtrans)
+{
+    float beta[] = {-1.0f, 2.0f};
+    float alpha[] = {2.0f, 1.0f};
+    blasint lda = 50, ldb = 50, ldc = 50;
+    blasint M = 50, K = 25;
+    float residual;
+
+    residual = check_cgemmt('C', CblasColMajor, CblasUpper, CblasConjTrans, CblasConjTrans,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * C API test.
+ * cblas_cgemmt is checked against a cgemm-based reference (check_cgemmt)
+ * with:
+ *
+ * order = CblasColMajor
+ * uplo = CblasUpper (upper triangle of C)
+ * alpha = 0 + 0i, transa = transb = CblasNoTrans
+ */
+CTEST(cgemmt, c_api_colmajor_upper_alpha_zero)
+{
+    float beta[] = {2.0f, 1.0f};
+    float alpha[] = {0.0f, 0.0f};
+    blasint lda = 50, ldb = 50, ldc = 50;
+    blasint M = 50, K = 50;
+    float residual;
+
+    residual = check_cgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * C API test.
+ * cblas_cgemmt is checked against a cgemm-based reference (check_cgemmt)
+ * with:
+ *
+ * order = CblasColMajor
+ * uplo = CblasUpper (upper triangle of C)
+ * beta = 1 + 0i, transa = transb = CblasNoTrans
+ */
+CTEST(cgemmt, c_api_colmajor_upper_beta_one)
+{
+    float beta[] = {1.0f, 0.0f};
+    float alpha[] = {1.0f, 1.0f};
+    blasint lda = 50, ldb = 50, ldc = 50;
+    blasint M = 50, K = 50;
+    float residual;
+
+    residual = check_cgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * C API test.
+ * cblas_cgemmt is checked against a cgemm-based reference (check_cgemmt)
+ * with:
+ *
+ * order = CblasColMajor
+ * uplo = CblasLower (lower triangle of C)
+ * transa = CblasNoTrans
+ * transb = CblasNoTrans
+ */
+CTEST(cgemmt, c_api_colmajor_lower_M_50_K_50_a_notrans_b_notrans)
+{
+    float beta[] = {2.0f, 1.0f};
+    float alpha[] = {-1.0f, 1.0f};
+    blasint lda = 50, ldb = 50, ldc = 50;
+    blasint M = 50, K = 50;
+    float residual;
+
+    residual = check_cgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * C API test.
+ * cblas_cgemmt is checked against a cgemm-based reference (check_cgemmt)
+ * with:
+ *
+ * order = CblasColMajor
+ * uplo = CblasLower (lower triangle of C)
+ * transa = CblasTrans
+ * transb = CblasNoTrans
+ */
+CTEST(cgemmt, c_api_colmajor_lower_M_50_K_25_a_trans_b_notrans)
+{
+    float beta[] = {2.0f, 1.0f};
+    float alpha[] = {1.0f, 1.0f};
+    blasint lda = 25, ldb = 25, ldc = 50;
+    blasint M = 50, K = 25;
+    float residual;
+
+    residual = check_cgemmt('C', CblasColMajor, CblasLower, CblasTrans, CblasNoTrans,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * C API test.
+ * cblas_cgemmt is checked against a cgemm-based reference (check_cgemmt)
+ * with:
+ *
+ * order = CblasColMajor
+ * uplo = CblasLower (lower triangle of C)
+ * transa = CblasNoTrans
+ * transb = CblasTrans
+ */
+CTEST(cgemmt, c_api_colmajor_lower_M_25_K_50_a_notrans_b_trans)
+{
+    float beta[] = {0.0f, 0.0f};
+    float alpha[] = {1.0f, 1.0f};
+    blasint lda = 25, ldb = 25, ldc = 25;
+    blasint M = 25, K = 50;
+    float residual;
+
+    residual = check_cgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasTrans,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * C API test.
+ * cblas_cgemmt is checked against a cgemm-based reference (check_cgemmt)
+ * with:
+ *
+ * order = CblasColMajor
+ * uplo = CblasLower (lower triangle of C)
+ * transa = CblasTrans
+ * transb = CblasTrans
+ */
+CTEST(cgemmt, c_api_colmajor_lower_M_50_K_50_a_trans_b_trans)
+{
+    float beta[] = {2.0f, 1.0f};
+    float alpha[] = {-1.0f, 1.0f};
+    blasint lda = 50, ldb = 50, ldc = 50;
+    blasint M = 50, K = 50;
+    float residual;
+
+    residual = check_cgemmt('C', CblasColMajor, CblasLower, CblasTrans, CblasTrans,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * C API test.
+ * cblas_cgemmt is checked against a cgemm-based reference (check_cgemmt)
+ * with:
+ *
+ * order = CblasColMajor
+ * uplo = CblasLower (lower triangle of C)
+ * transa = CblasConjNoTrans
+ * transb = CblasConjNoTrans
+ */
+CTEST(cgemmt, c_api_colmajor_lower_M_25_K_50_a_conjnotrans_b_conjnotrans)
+{
+    float beta[] = {-1.0f, 2.0f};
+    float alpha[] = {2.0f, 1.0f};
+    blasint lda = 25, ldb = 50, ldc = 25;
+    blasint M = 25, K = 50;
+    float residual;
+
+    residual = check_cgemmt('C', CblasColMajor, CblasLower, CblasConjNoTrans, CblasConjNoTrans,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * C API test.
+ * cblas_cgemmt is checked against a cgemm-based reference (check_cgemmt)
+ * with:
+ *
+ * order = CblasColMajor
+ * uplo = CblasLower (lower triangle of C)
+ * transa = CblasConjTrans
+ * transb = CblasConjNoTrans
+ */
+CTEST(cgemmt, c_api_colmajor_lower_M_50_K_50_a_conjtrans_b_conjnotrans)
+{
+    float beta[] = {-1.0f, 2.0f};
+    float alpha[] = {2.0f, 1.0f};
+    blasint lda = 50, ldb = 50, ldc = 50;
+    blasint M = 50, K = 50;
+    float residual;
+
+    residual = check_cgemmt('C', CblasColMajor, CblasLower, CblasConjTrans, CblasConjNoTrans,
+                            M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0f, residual, SINGLE_EPS);
+}
+
+/**
+ * cgemmt via the C (CBLAS) interface; check_cgemmt() returns a norm
+ * that is required to be zero within SINGLE_EPS.
+ *
+ * Layout: column major
+ * uplo:   lower triangle of C
+ * A:      conjugated, not transposed
+ * B:      conjugated and transposed
+ * Sizes:  M = 50, K = 50 (lda = 50, ldb = 50, ldc = 50)
+ */
+CTEST(cgemmt, c_api_colmajor_lower_M_50_K_50_a_conjnotrans_b_conjtrans)
+{
+    float alpha[] = {2.0f, 1.0f}, beta[] = {-1.0f, 2.0f}; /* {real, imag} */
+    blasint M = 50, K = 50, lda = 50, ldb = 50, ldc = 50;
+    float norm;
+
+    norm = check_cgemmt('C', CblasColMajor, CblasLower,
+                        CblasConjNoTrans, CblasConjTrans,
+                        M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * cgemmt via the C (CBLAS) interface; check_cgemmt() returns a norm
+ * that is required to be zero within SINGLE_EPS.
+ *
+ * Layout: column major
+ * uplo:   lower triangle of C
+ * A:      conjugated and transposed
+ * B:      conjugated and transposed
+ * Sizes:  M = 50, K = 25 (lda = 50, ldb = 50, ldc = 50)
+ */
+CTEST(cgemmt, c_api_colmajor_lower_M_50_K_25_a_conjtrans_b_conjtrans)
+{
+    float alpha[] = {2.0f, 1.0f}, beta[] = {-1.0f, 2.0f}; /* {real, imag} */
+    blasint M = 50, K = 25, lda = 50, ldb = 50, ldc = 50;
+    float norm;
+
+    norm = check_cgemmt('C', CblasColMajor, CblasLower,
+                        CblasConjTrans, CblasConjTrans,
+                        M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * cgemmt via the C (CBLAS) interface with alpha = 0 + 0i; the norm
+ * returned by check_cgemmt() is required to be zero within SINGLE_EPS.
+ *
+ * Layout: column major
+ * uplo:   lower triangle of C
+ * A, B:   not transposed
+ * Sizes:  M = 50, K = 50 (lda = ldb = ldc = 50)
+ */
+CTEST(cgemmt, c_api_colmajor_lower_alpha_zero)
+{
+    float alpha[] = {0.0f, 0.0f}, beta[] = {2.0f, 1.0f}; /* {real, imag} */
+    blasint M = 50, K = 50, lda = 50, ldb = 50, ldc = 50;
+    float norm;
+
+    norm = check_cgemmt('C', CblasColMajor, CblasLower,
+                        CblasNoTrans, CblasNoTrans,
+                        M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * cgemmt via the C (CBLAS) interface with beta = 1 + 0i; the norm
+ * returned by check_cgemmt() is required to be zero within SINGLE_EPS.
+ *
+ * Layout: column major
+ * uplo:   lower triangle of C
+ * A, B:   not transposed
+ * Sizes:  M = 50, K = 50 (lda = ldb = ldc = 50)
+ */
+CTEST(cgemmt, c_api_colmajor_lower_beta_one)
+{
+    float alpha[] = {1.0f, 1.0f}, beta[] = {1.0f, 0.0f}; /* {real, imag} */
+    blasint M = 50, K = 50, lda = 50, ldb = 50, ldc = 50;
+    float norm;
+
+    norm = check_cgemmt('C', CblasColMajor, CblasLower,
+                        CblasNoTrans, CblasNoTrans,
+                        M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * cgemmt via the C (CBLAS) interface; check_cgemmt() returns a norm
+ * that is required to be zero within SINGLE_EPS.
+ *
+ * Layout: row major
+ * uplo:   upper triangle of C
+ * A:      not transposed
+ * B:      not transposed
+ * Sizes:  M = 50, K = 50 (lda = 50, ldb = 50, ldc = 50)
+ */
+CTEST(cgemmt, c_api_rowmajor_upper_M_50_K_50_a_notrans_b_notrans)
+{
+    float alpha[] = {-1.0f, 1.0f}, beta[] = {2.0f, 1.0f}; /* {real, imag} */
+    blasint M = 50, K = 50, lda = 50, ldb = 50, ldc = 50;
+    float norm;
+
+    norm = check_cgemmt('C', CblasRowMajor, CblasUpper,
+                        CblasNoTrans, CblasNoTrans,
+                        M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * cgemmt via the C (CBLAS) interface; check_cgemmt() returns a norm
+ * that is required to be zero within SINGLE_EPS.
+ *
+ * Layout: row major
+ * uplo:   upper triangle of C
+ * A:      transposed
+ * B:      not transposed
+ * Sizes:  M = 50, K = 25 (lda = 50, ldb = 50, ldc = 50)
+ */
+CTEST(cgemmt, c_api_rowmajor_upper_M_50_K_25_a_trans_b_notrans)
+{
+    float alpha[] = {1.0f, 1.0f}, beta[] = {2.0f, 1.0f}; /* {real, imag} */
+    blasint M = 50, K = 25, lda = 50, ldb = 50, ldc = 50;
+    float norm;
+
+    norm = check_cgemmt('C', CblasRowMajor, CblasUpper,
+                        CblasTrans, CblasNoTrans,
+                        M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * cgemmt via the C (CBLAS) interface; check_cgemmt() returns a norm
+ * that is required to be zero within SINGLE_EPS.
+ *
+ * Layout: row major
+ * uplo:   upper triangle of C
+ * A:      not transposed
+ * B:      transposed
+ * Sizes:  M = 25, K = 50 (lda = 50, ldb = 50, ldc = 25)
+ */
+CTEST(cgemmt, c_api_rowmajor_upper_M_25_K_50_a_notrans_b_trans)
+{
+    float alpha[] = {1.0f, 1.0f}, beta[] = {-1.0f, -1.0f}; /* {real, imag} */
+    blasint M = 25, K = 50, lda = 50, ldb = 50, ldc = 25;
+    float norm;
+
+    norm = check_cgemmt('C', CblasRowMajor, CblasUpper,
+                        CblasNoTrans, CblasTrans,
+                        M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * cgemmt via the C (CBLAS) interface; check_cgemmt() returns a norm
+ * that is required to be zero within SINGLE_EPS.
+ *
+ * Layout: row major
+ * uplo:   upper triangle of C
+ * A:      transposed
+ * B:      transposed
+ * Sizes:  M = 50, K = 50 (lda = 50, ldb = 50, ldc = 50)
+ */
+CTEST(cgemmt, c_api_rowmajor_upper_M_50_K_50_a_trans_b_trans)
+{
+    float alpha[] = {-1.0f, 1.0f}, beta[] = {2.0f, 1.0f}; /* {real, imag} */
+    blasint M = 50, K = 50, lda = 50, ldb = 50, ldc = 50;
+    float norm;
+
+    norm = check_cgemmt('C', CblasRowMajor, CblasUpper,
+                        CblasTrans, CblasTrans,
+                        M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * cgemmt via the C (CBLAS) interface; check_cgemmt() returns a norm
+ * that is required to be zero within SINGLE_EPS.
+ *
+ * Layout: row major
+ * uplo:   upper triangle of C
+ * A:      conjugated, not transposed
+ * B:      conjugated, not transposed
+ * Sizes:  M = 25, K = 50 (lda = 50, ldb = 25, ldc = 25)
+ */
+CTEST(cgemmt, c_api_rowmajor_upper_M_25_K_50_a_conjnotrans_b_conjnotrans)
+{
+    float alpha[] = {2.0f, 1.0f}, beta[] = {-1.0f, 2.0f}; /* {real, imag} */
+    blasint M = 25, K = 50, lda = 50, ldb = 25, ldc = 25;
+    float norm;
+
+    norm = check_cgemmt('C', CblasRowMajor, CblasUpper,
+                        CblasConjNoTrans, CblasConjNoTrans,
+                        M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * cgemmt via the C (CBLAS) interface; check_cgemmt() returns a norm
+ * that is required to be zero within SINGLE_EPS.
+ *
+ * Layout: row major
+ * uplo:   upper triangle of C
+ * A:      conjugated and transposed
+ * B:      conjugated, not transposed
+ * Sizes:  M = 50, K = 50 (lda = 50, ldb = 50, ldc = 50)
+ */
+CTEST(cgemmt, c_api_rowmajor_upper_M_50_K_50_a_conjtrans_b_conjnotrans)
+{
+    float alpha[] = {2.0f, 1.0f}, beta[] = {-1.0f, 2.0f}; /* {real, imag} */
+    blasint M = 50, K = 50, lda = 50, ldb = 50, ldc = 50;
+    float norm;
+
+    norm = check_cgemmt('C', CblasRowMajor, CblasUpper,
+                        CblasConjTrans, CblasConjNoTrans,
+                        M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * cgemmt via the C (CBLAS) interface; check_cgemmt() returns a norm
+ * that is required to be zero within SINGLE_EPS.
+ *
+ * Layout: row major
+ * uplo:   upper triangle of C
+ * A:      conjugated, not transposed
+ * B:      conjugated and transposed
+ * Sizes:  M = 50, K = 50 (lda = 50, ldb = 50, ldc = 50)
+ */
+CTEST(cgemmt, c_api_rowmajor_upper_M_50_K_50_a_conjnotrans_b_conjtrans)
+{
+    float alpha[] = {2.0f, 1.0f}, beta[] = {-1.0f, 2.0f}; /* {real, imag} */
+    blasint M = 50, K = 50, lda = 50, ldb = 50, ldc = 50;
+    float norm;
+
+    norm = check_cgemmt('C', CblasRowMajor, CblasUpper,
+                        CblasConjNoTrans, CblasConjTrans,
+                        M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * cgemmt via the C (CBLAS) interface; check_cgemmt() returns a norm
+ * that is required to be zero within SINGLE_EPS.
+ *
+ * Layout: row major
+ * uplo:   upper triangle of C
+ * A:      conjugated and transposed
+ * B:      conjugated and transposed
+ * Sizes:  M = 25, K = 50 (lda = 25, ldb = 50, ldc = 25)
+ */
+CTEST(cgemmt, c_api_rowmajor_upper_M_25_K_50_a_conjtrans_b_conjtrans)
+{
+    float alpha[] = {2.0f, 1.0f}, beta[] = {-1.0f, 2.0f}; /* {real, imag} */
+    blasint M = 25, K = 50, lda = 25, ldb = 50, ldc = 25;
+    float norm;
+
+    norm = check_cgemmt('C', CblasRowMajor, CblasUpper,
+                        CblasConjTrans, CblasConjTrans,
+                        M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * cgemmt via the C (CBLAS) interface with alpha = 0 + 0i; the norm
+ * returned by check_cgemmt() is required to be zero within SINGLE_EPS.
+ *
+ * Layout: row major
+ * uplo:   upper triangle of C
+ * A, B:   not transposed
+ * Sizes:  M = 50, K = 50 (lda = ldb = ldc = 50)
+ */
+CTEST(cgemmt, c_api_rowmajor_upper_alpha_zero)
+{
+    float alpha[] = {0.0f, 0.0f}, beta[] = {2.0f, 1.0f}; /* {real, imag} */
+    blasint M = 50, K = 50, lda = 50, ldb = 50, ldc = 50;
+    float norm;
+
+    norm = check_cgemmt('C', CblasRowMajor, CblasUpper,
+                        CblasNoTrans, CblasNoTrans,
+                        M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * cgemmt via the C (CBLAS) interface with beta = 1 + 0i; the norm
+ * returned by check_cgemmt() is required to be zero within SINGLE_EPS.
+ *
+ * Layout: row major
+ * uplo:   upper triangle of C
+ * A, B:   not transposed
+ * Sizes:  M = 50, K = 50 (lda = ldb = ldc = 50)
+ */
+CTEST(cgemmt, c_api_rowmajor_upper_beta_one)
+{
+    float alpha[] = {1.0f, 1.0f}, beta[] = {1.0f, 0.0f}; /* {real, imag} */
+    blasint M = 50, K = 50, lda = 50, ldb = 50, ldc = 50;
+    float norm;
+
+    norm = check_cgemmt('C', CblasRowMajor, CblasUpper,
+                        CblasNoTrans, CblasNoTrans,
+                        M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * cgemmt via the C (CBLAS) interface; check_cgemmt() returns a norm
+ * that is required to be zero within SINGLE_EPS.
+ *
+ * Layout: row major
+ * uplo:   lower triangle of C
+ * A:      not transposed
+ * B:      not transposed
+ * Sizes:  M = 50, K = 50 (lda = 50, ldb = 50, ldc = 50)
+ */
+CTEST(cgemmt, c_api_rowmajor_lower_M_50_K_50_a_notrans_b_notrans)
+{
+    float alpha[] = {-1.0f, 1.0f}, beta[] = {2.0f, 1.0f}; /* {real, imag} */
+    blasint M = 50, K = 50, lda = 50, ldb = 50, ldc = 50;
+    float norm;
+
+    norm = check_cgemmt('C', CblasRowMajor, CblasLower,
+                        CblasNoTrans, CblasNoTrans,
+                        M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * cgemmt via the C (CBLAS) interface; check_cgemmt() returns a norm
+ * that is required to be zero within SINGLE_EPS.
+ *
+ * Layout: row major
+ * uplo:   lower triangle of C
+ * A:      transposed
+ * B:      not transposed
+ * Sizes:  M = 50, K = 25 (lda = 50, ldb = 50, ldc = 50)
+ */
+CTEST(cgemmt, c_api_rowmajor_lower_M_50_K_25_a_trans_b_notrans)
+{
+    float alpha[] = {1.0f, 1.0f}, beta[] = {2.0f, 1.0f}; /* {real, imag} */
+    blasint M = 50, K = 25, lda = 50, ldb = 50, ldc = 50;
+    float norm;
+
+    norm = check_cgemmt('C', CblasRowMajor, CblasLower,
+                        CblasTrans, CblasNoTrans,
+                        M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * cgemmt via the C (CBLAS) interface; check_cgemmt() returns a norm
+ * that is required to be zero within SINGLE_EPS.
+ *
+ * Layout: row major
+ * uplo:   lower triangle of C
+ * A:      not transposed
+ * B:      transposed
+ * Sizes:  M = 25, K = 50 (lda = 50, ldb = 50, ldc = 25)
+ */
+CTEST(cgemmt, c_api_rowmajor_lower_M_25_K_50_a_notrans_b_trans)
+{
+    float alpha[] = {1.0f, 1.0f}, beta[] = {0.0f, 0.0f}; /* {real, imag} */
+    blasint M = 25, K = 50, lda = 50, ldb = 50, ldc = 25;
+    float norm;
+
+    norm = check_cgemmt('C', CblasRowMajor, CblasLower,
+                        CblasNoTrans, CblasTrans,
+                        M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * cgemmt via the C (CBLAS) interface; check_cgemmt() returns a norm
+ * that is required to be zero within SINGLE_EPS.
+ *
+ * Layout: row major
+ * uplo:   lower triangle of C
+ * A:      transposed
+ * B:      transposed
+ * Sizes:  M = 50, K = 50 (lda = 50, ldb = 50, ldc = 50)
+ */
+CTEST(cgemmt, c_api_rowmajor_lower_M_50_K_50_a_trans_b_trans)
+{
+    float alpha[] = {-1.0f, 1.0f}, beta[] = {2.0f, 1.0f}; /* {real, imag} */
+    blasint M = 50, K = 50, lda = 50, ldb = 50, ldc = 50;
+    float norm;
+
+    norm = check_cgemmt('C', CblasRowMajor, CblasLower,
+                        CblasTrans, CblasTrans,
+                        M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * cgemmt via the C (CBLAS) interface; check_cgemmt() returns a norm
+ * that is required to be zero within SINGLE_EPS.
+ *
+ * Layout: row major
+ * uplo:   lower triangle of C
+ * A:      conjugated, not transposed
+ * B:      conjugated, not transposed
+ * Sizes:  M = 25, K = 50 (lda = 50, ldb = 25, ldc = 25)
+ */
+CTEST(cgemmt, c_api_rowmajor_lower_M_25_K_50_a_conjnotrans_b_conjnotrans)
+{
+    float alpha[] = {2.0f, 1.0f}, beta[] = {-1.0f, 2.0f}; /* {real, imag} */
+    blasint M = 25, K = 50, lda = 50, ldb = 25, ldc = 25;
+    float norm;
+
+    norm = check_cgemmt('C', CblasRowMajor, CblasLower,
+                        CblasConjNoTrans, CblasConjNoTrans,
+                        M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * cgemmt via the C (CBLAS) interface; check_cgemmt() returns a norm
+ * that is required to be zero within SINGLE_EPS.
+ *
+ * Layout: row major
+ * uplo:   lower triangle of C
+ * A:      conjugated and transposed
+ * B:      conjugated, not transposed
+ * Sizes:  M = 50, K = 50 (lda = 50, ldb = 50, ldc = 50)
+ */
+CTEST(cgemmt, c_api_rowmajor_lower_M_50_K_50_a_conjtrans_b_conjnotrans)
+{
+    float alpha[] = {2.0f, 1.0f}, beta[] = {-1.0f, 2.0f}; /* {real, imag} */
+    blasint M = 50, K = 50, lda = 50, ldb = 50, ldc = 50;
+    float norm;
+
+    norm = check_cgemmt('C', CblasRowMajor, CblasLower,
+                        CblasConjTrans, CblasConjNoTrans,
+                        M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * cgemmt via the C (CBLAS) interface; check_cgemmt() returns a norm
+ * that is required to be zero within SINGLE_EPS.
+ *
+ * Layout: row major
+ * uplo:   lower triangle of C
+ * A:      conjugated, not transposed
+ * B:      conjugated and transposed
+ * Sizes:  M = 50, K = 50 (lda = 50, ldb = 50, ldc = 50)
+ */
+CTEST(cgemmt, c_api_rowmajor_lower_M_50_K_50_a_conjnotrans_b_conjtrans)
+{
+    float alpha[] = {2.0f, 1.0f}, beta[] = {-1.0f, 2.0f}; /* {real, imag} */
+    blasint M = 50, K = 50, lda = 50, ldb = 50, ldc = 50;
+    float norm;
+
+    norm = check_cgemmt('C', CblasRowMajor, CblasLower,
+                        CblasConjNoTrans, CblasConjTrans,
+                        M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * cgemmt via the C (CBLAS) interface; check_cgemmt() returns a norm
+ * that is required to be zero within SINGLE_EPS.
+ *
+ * Layout: row major
+ * uplo:   lower triangle of C
+ * A:      conjugated and transposed
+ * B:      conjugated and transposed
+ * Sizes:  M = 50, K = 25 (lda = 50, ldb = 25, ldc = 50)
+ */
+CTEST(cgemmt, c_api_rowmajor_lower_M_50_K_25_a_conjtrans_b_conjtrans)
+{
+    float alpha[] = {2.0f, 1.0f}, beta[] = {-1.0f, 2.0f}; /* {real, imag} */
+    blasint M = 50, K = 25, lda = 50, ldb = 25, ldc = 50;
+    float norm;
+
+    norm = check_cgemmt('C', CblasRowMajor, CblasLower,
+                        CblasConjTrans, CblasConjTrans,
+                        M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * cgemmt via the C (CBLAS) interface with alpha = 0 + 0i; the norm
+ * returned by check_cgemmt() is required to be zero within SINGLE_EPS.
+ *
+ * Layout: row major
+ * uplo:   lower triangle of C
+ * A, B:   not transposed
+ * Sizes:  M = 50, K = 50 (lda = ldb = ldc = 50)
+ */
+CTEST(cgemmt, c_api_rowmajor_lower_alpha_zero)
+{
+    float alpha[] = {0.0f, 0.0f}, beta[] = {2.0f, 1.0f}; /* {real, imag} */
+    blasint M = 50, K = 50, lda = 50, ldb = 50, ldc = 50;
+    float norm;
+
+    norm = check_cgemmt('C', CblasRowMajor, CblasLower,
+                        CblasNoTrans, CblasNoTrans,
+                        M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * cgemmt via the C (CBLAS) interface with beta = 1 + 0i; the norm
+ * returned by check_cgemmt() is required to be zero within SINGLE_EPS.
+ *
+ * Layout: row major
+ * uplo:   lower triangle of C
+ * A, B:   not transposed
+ * Sizes:  M = 50, K = 50 (lda = ldb = ldc = 50)
+ */
+CTEST(cgemmt, c_api_rowmajor_lower_beta_one)
+{
+    float alpha[] = {1.0f, 1.0f}, beta[] = {1.0f, 0.0f}; /* {real, imag} */
+    blasint M = 50, K = 50, lda = 50, ldb = 50, ldc = 50;
+    float norm;
+
+    norm = check_cgemmt('C', CblasRowMajor, CblasLower,
+                        CblasNoTrans, CblasNoTrans,
+                        M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran interface, bad-argument check for uplo.
+ * uplo = 'O' is neither upper (U) nor lower (L); check_badargs() is
+ * called with expected_info = 1 and has to report success.
+ */
+CTEST(cgemmt, xerbla_uplo_invalid)
+{
+    char uplo = 'O', transa = 'N', transb = 'N'; /* uplo is the bad one */
+    blasint M = 50, K = 50, lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 1;
+    int passed;
+
+    passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                           M, K, lda, ldb, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran interface, bad-argument check for transa.
+ * transa = 'O' is neither trans (T/C) nor no-trans (N/R); check_badargs()
+ * is called with expected_info = 2 and has to report success.
+ */
+CTEST(cgemmt, xerbla_transa_invalid)
+{
+    char uplo = 'U', transa = 'O', transb = 'N'; /* transa is the bad one */
+    blasint M = 50, K = 50, lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 2;
+    int passed;
+
+    passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                           M, K, lda, ldb, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran interface, bad-argument check for transb.
+ * transb = 'O' is neither trans (T/C) nor no-trans (N/R); check_badargs()
+ * is called with expected_info = 3 and has to report success.
+ */
+CTEST(cgemmt, xerbla_transb_invalid)
+{
+    char uplo = 'U', transa = 'N', transb = 'O'; /* transb is the bad one */
+    blasint M = 50, K = 50, lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 3;
+    int passed;
+
+    passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                           M, K, lda, ldb, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran interface, bad-argument check for M.
+ * M = -1 is negative; check_badargs() is called with
+ * expected_info = 4 and has to report success.
+ */
+CTEST(cgemmt, xerbla_m_invalid)
+{
+    char uplo = 'U', transa = 'N', transb = 'N';
+    blasint M = -1, K = 50, lda = 50, ldb = 50, ldc = 50; /* bad M */
+    int expected_info = 4;
+    int passed;
+
+    passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                           M, K, lda, ldb, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran interface, bad-argument check for K.
+ * K = -1 is negative; check_badargs() is called with
+ * expected_info = 5 and has to report success.
+ */
+CTEST(cgemmt, xerbla_k_invalid)
+{
+    char uplo = 'U', transa = 'N', transb = 'N';
+    blasint M = 50, K = -1, lda = 50, ldb = 50, ldc = 50; /* bad K */
+    int expected_info = 5;
+    int passed;
+
+    passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                           M, K, lda, ldb, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran interface, bad-argument check for lda.
+ * With A transposed lda must be at least K; lda = 50 < K = 100, so
+ * check_badargs() is called with expected_info = 8.
+ */
+CTEST(cgemmt, xerbla_lda_invalid)
+{
+    char uplo = 'U', transa = 'T', transb = 'N';
+    blasint M = 50, K = 100, lda = 50, ldb = 100, ldc = 50; /* lda < K */
+    int expected_info = 8;
+    int passed;
+
+    passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                           M, K, lda, ldb, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran interface, bad-argument check for ldb.
+ * With B not transposed ldb must be at least K; ldb = 50 < K = 100, so
+ * check_badargs() is called with expected_info = 10.
+ */
+CTEST(cgemmt, xerbla_ldb_invalid)
+{
+    char uplo = 'U', transa = 'N', transb = 'N';
+    blasint M = 50, K = 100, lda = 50, ldb = 50, ldc = 50; /* ldb < K */
+    int expected_info = 10;
+    int passed;
+
+    passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                           M, K, lda, ldb, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran interface, bad-argument check for ldc.
+ * ldc must be at least M; ldc = 50 < M = 100, so check_badargs() is
+ * called with expected_info = 13.
+ */
+CTEST(cgemmt, xerbla_ldc_invalid)
+{
+    char uplo = 'U', transa = 'T', transb = 'N';
+    blasint M = 100, K = 50, lda = 50, ldb = 50, ldc = 50; /* ldc < M */
+    int expected_info = 13;
+    int passed;
+
+    passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                           M, K, lda, ldb, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C interface, bad-argument check for the storage order.
+ * 'O' is neither column major nor row major; check_badargs() is
+ * called with expected_info = 0 and has to report success.
+ */
+CTEST(cgemmt, xerbla_c_api_major_invalid)
+{
+    blasint M = 50, K = 50, lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 0;
+    int passed;
+
+    passed = check_badargs('C', 'O', CblasUpper, CblasNoTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C interface, column major: bad-argument check for uplo.
+ * 'O' is neither upper nor lower; check_badargs() is called with
+ * expected_info = 1 and has to report success.
+ */
+CTEST(cgemmt, xerbla_c_api_colmajor_uplo_invalid)
+{
+    blasint M = 50, K = 50, lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 1;
+    int passed;
+
+    passed = check_badargs('C', CblasColMajor, 'O', CblasNoTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C interface, column major: bad-argument check for transa.
+ * 'O' is neither trans nor no-trans; check_badargs() is called with
+ * expected_info = 2 and has to report success.
+ */
+CTEST(cgemmt, xerbla_c_api_colmajor_transa_invalid)
+{
+    blasint M = 50, K = 50, lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 2;
+    int passed;
+
+    passed = check_badargs('C', CblasColMajor, CblasUpper, 'O', CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C interface, column major: bad-argument check for transb.
+ * 'O' is neither trans nor no-trans; check_badargs() is called with
+ * expected_info = 3 and has to report success.
+ */
+CTEST(cgemmt, xerbla_c_api_colmajor_transb_invalid)
+{
+    blasint M = 50, K = 50, lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 3;
+    int passed;
+
+    passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, 'O',
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C interface, column major: bad-argument check for M.
+ * M = -1 is negative; check_badargs() is called with
+ * expected_info = 4 and has to report success.
+ */
+CTEST(cgemmt, xerbla_c_api_colmajor_m_invalid)
+{
+    blasint M = -1, K = 50, lda = 50, ldb = 50, ldc = 50; /* bad M */
+    int expected_info = 4;
+    int passed;
+
+    passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C interface, column major: bad-argument check for K.
+ * K = -1 is negative; check_badargs() is called with
+ * expected_info = 5 and has to report success.
+ */
+CTEST(cgemmt, xerbla_c_api_colmajor_k_invalid)
+{
+    blasint M = 50, K = -1, lda = 50, ldb = 50, ldc = 50; /* bad K */
+    int expected_info = 5;
+    int passed;
+
+    passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C interface, column major: bad-argument check for lda.
+ * With A transposed lda must be at least K; lda = 50 < K = 100, so
+ * check_badargs() is called with expected_info = 8.
+ */
+CTEST(cgemmt, xerbla_c_api_colmajor_lda_invalid)
+{
+    blasint M = 50, K = 100, lda = 50, ldb = 100, ldc = 50; /* lda < K */
+    int expected_info = 8;
+    int passed;
+
+    passed = check_badargs('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C interface, column major: bad-argument check for ldb.
+ * With B not transposed ldb must be at least K; ldb = 50 < K = 100, so
+ * check_badargs() is called with expected_info = 10.
+ */
+CTEST(cgemmt, xerbla_c_api_colmajor_ldb_invalid)
+{
+    blasint M = 50, K = 100, lda = 50, ldb = 50, ldc = 50; /* ldb < K */
+    int expected_info = 10;
+    int passed;
+
+    passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C interface, column major: bad-argument check for ldc.
+ * ldc must be at least M; ldc = 50 < M = 100, so check_badargs() is
+ * called with expected_info = 13.
+ */
+CTEST(cgemmt, xerbla_c_api_colmajor_ldc_invalid)
+{
+    blasint M = 100, K = 50, lda = 50, ldb = 50, ldc = 50; /* ldc < M */
+    int expected_info = 13;
+    int passed;
+
+    passed = check_badargs('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C interface, row major: bad-argument check for uplo.
+ * 'O' is neither upper nor lower; check_badargs() is called with
+ * expected_info = 1 and has to report success.
+ */
+CTEST(cgemmt, xerbla_c_api_rowmajor_uplo_invalid)
+{
+    blasint M = 50, K = 50, lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 1;
+    int passed;
+
+    passed = check_badargs('C', CblasRowMajor, 'O', CblasNoTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C interface, row major: bad-argument check for transa.
+ * 'O' is neither trans nor no-trans; check_badargs() is called with
+ * expected_info = 2 and has to report success.
+ */
+CTEST(cgemmt, xerbla_c_api_rowmajor_transa_invalid)
+{
+    blasint M = 50, K = 50, lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 2;
+    int passed;
+
+    passed = check_badargs('C', CblasRowMajor, CblasUpper, 'O', CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C interface, row major: bad-argument check for transb.
+ * 'O' is neither trans nor no-trans; check_badargs() is called with
+ * expected_info = 3 and has to report success.
+ */
+CTEST(cgemmt, xerbla_c_api_rowmajor_transb_invalid)
+{
+    blasint M = 50, K = 50, lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 3;
+    int passed;
+
+    passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, 'O',
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C interface, row major: bad-argument check for M.
+ * M = -1 is negative; check_badargs() is called with
+ * expected_info = 4 and has to report success.
+ */
+CTEST(cgemmt, xerbla_c_api_rowmajor_m_invalid)
+{
+    blasint M = -1, K = 50, lda = 50, ldb = 50, ldc = 50; /* bad M */
+    int expected_info = 4;
+    int passed;
+
+    passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C interface, row major: bad-argument check for K.
+ * K = -1 is negative; check_badargs() is called with
+ * expected_info = 5 and has to report success.
+ */
+CTEST(cgemmt, xerbla_c_api_rowmajor_k_invalid)
+{
+    blasint M = 50, K = -1, lda = 50, ldb = 50, ldc = 50; /* bad K */
+    int expected_info = 5;
+    int passed;
+
+    passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C interface, row major: bad-argument check for lda.
+ * A is passed as not transposed here, so lda must be at least K;
+ * lda = 50 < K = 100 and check_badargs() is given expected_info = 8.
+ */
+CTEST(cgemmt, xerbla_c_api_rowmajor_lda_invalid)
+{
+    blasint M = 50, K = 100, lda = 50, ldb = 50, ldc = 50; /* lda < K */
+    int expected_info = 8;
+    int passed;
+
+    passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C interface, row major: bad-argument check for ldb.
+ * With B transposed ldb must be at least K; ldb = 50 < K = 100, so
+ * check_badargs() is called with expected_info = 10.
+ */
+CTEST(cgemmt, xerbla_c_api_rowmajor_ldb_invalid)
+{
+    blasint M = 50, K = 100, lda = 50, ldb = 50, ldc = 50; /* ldb < K */
+    int expected_info = 10;
+    int passed;
+
+    passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasTrans, CblasTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C interface, row major: bad-argument check for ldc.
+ * ldc must be at least M; ldc = 50 < M = 100, so check_badargs() is
+ * called with expected_info = 13.
+ */
+CTEST(cgemmt, xerbla_c_api_rowmajor_ldc_invalid)
+{
+    blasint M = 100, K = 50, lda = 100, ldb = 100, ldc = 50; /* ldc < M */
+    int expected_info = 13;
+    int passed;
+
+    passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_cgemv_n.c b/utest/test_extensions/test_cgemv_n.c
new file mode 100644
index 000000000..60c9af86a
--- /dev/null
+++ b/utest/test_extensions/test_cgemv_n.c
@@ -0,0 +1,340 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+#define INCREMENT 2
+
+struct DATA_CSPMV_N { /* static buffers used by check_cgemv_n */
+    float a_test[DATASIZE * DATASIZE * 2]; /* matrix A, interleaved (re, im) pairs */
+    float b_test[DATASIZE * 2 * INCREMENT]; /* input vector b */
+    float c_test[DATASIZE * 2 * INCREMENT]; /* vector c as updated by cgemv_n_trusted */
+    float c_verify[DATASIZE * 2 * INCREMENT]; /* vector c as updated by cgemv, then the difference */
+};
+
+#ifdef BUILD_COMPLEX
+static struct DATA_CSPMV_N data_cgemv_n;
+
+/**
+ * cgemv not transposed reference code
+ *
+ * param trans specifies whether matrix A is conj or/and xconj ('N', 'O', 'R' or 'S')
+ * param m - number of rows of A
+ * param n - number of columns of A
+ * param alpha - scaling factor for the matrix-vector product
+ * param a - buffer holding input matrix A
+ * param lda - leading dimension of matrix A
+ * param b - Buffer holding input vector b
+ * param inc_b - stride of vector b
+ * param beta - scaling factor for vector c
+ * param c - buffer holding input/output vector c
+ * param inc_c - stride of vector c
+ */
+static void cgemv_n_trusted(char trans, blasint m, blasint n, float *alpha, float *a,
+                          blasint lda, float *b, blasint inc_b, float *beta, float *c,
+                          blasint inc_c)
+{
+	blasint i, j;
+    blasint i2 = 0;
+	blasint ib = 0, ic = 0;
+
+    float temp_r, temp_i;
+
+	float *a_ptr = a;
+    blasint lda2 = 2*lda; // step between columns, in floats (re/im interleaved)
+
+	blasint inc_b2 = 2 * inc_b; // strides expressed in floats
+    blasint inc_c2 = 2 * inc_c;
+
+    BLASFUNC(cscal)(&m, beta, c, &inc_c); // c := beta * c
+
+	for (j = 0; j < n; j++)
+	{
+
+        if (trans == 'N' || trans == 'R') {
+            temp_r = alpha[0] * b[ib] - alpha[1] * b[ib+1]; // temp = alpha * b[j]
+            temp_i = alpha[0] * b[ib+1] + alpha[1] * b[ib];
+        } else {
+            temp_r = alpha[0] * b[ib] + alpha[1] * b[ib+1]; // temp = conj(alpha) * b[j]
+            temp_i = alpha[0] * b[ib+1] - alpha[1] * b[ib];
+        }
+
+		ic = 0;
+		i2 = 0;
+
+		for (i = 0; i < m; i++)
+		{
+                if (trans == 'N') { // c[i] += temp * a[i][j]
+                    c[ic] += temp_r * a_ptr[i2] - temp_i * a_ptr[i2+1];
+                    c[ic+1] += temp_r * a_ptr[i2+1] + temp_i * a_ptr[i2];
+                } 
+                if (trans == 'O') { // c[i] += conj(temp) * a[i][j]
+                    c[ic] += temp_r * a_ptr[i2] + temp_i * a_ptr[i2+1];
+                    c[ic+1] += temp_r * a_ptr[i2+1] - temp_i * a_ptr[i2];
+                }
+                if (trans == 'R') {
+                    c[ic] += temp_r * a_ptr[i2] + temp_i * a_ptr[i2+1];
+                    c[ic+1] -= temp_r * a_ptr[i2+1] - temp_i * a_ptr[i2];
+                }
+                if (trans == 'S') {
+                    c[ic] += temp_r * a_ptr[i2] - temp_i * a_ptr[i2+1];
+                    c[ic+1] -= temp_r * a_ptr[i2+1] + temp_i * a_ptr[i2];
+                }
+			i2 += 2;
+			ic += inc_c2;
+		}
+		a_ptr += lda2;
+		ib += inc_b2;
+	}
+
+}
+
+/**
+ * Compare results computed by cgemv and cgemv_n_trusted
+ *
+ * param trans specifies whether matrix A is conj or/and xconj
+ * param m - number of rows of A (also the length of vector c)
+ * param n - number of columns of A (also the length of vector b)
+ * param alpha - scaling factor for the matrix-vector product
+ * param lda - leading dimension of matrix A
+ * param inc_b - stride of vector b
+ * param beta - scaling factor for vector c
+ * param inc_c - stride of vector c
+ * return norm of differences between the two m-element results
+ */
+static float check_cgemv_n(char trans, blasint m, blasint n, float *alpha, blasint lda,
+                            blasint inc_b, float *beta, blasint inc_c)
+{
+    blasint i;
+
+    srand_generate(data_cgemv_n.a_test, n * lda);
+    srand_generate(data_cgemv_n.b_test, 2 * n * inc_b);
+    srand_generate(data_cgemv_n.c_test, 2 * m * inc_c);
+
+    for (i = 0; i < m * 2 * inc_c; i++)
+        data_cgemv_n.c_verify[i] = data_cgemv_n.c_test[i];
+
+    cgemv_n_trusted(trans, m, n, alpha, data_cgemv_n.a_test, lda, data_cgemv_n.b_test,
+                  inc_b, beta, data_cgemv_n.c_test, inc_c);
+    BLASFUNC(cgemv)(&trans, &m, &n, alpha, data_cgemv_n.a_test, &lda, data_cgemv_n.b_test,
+                    &inc_b, beta, data_cgemv_n.c_verify, &inc_c);
+
+    for (i = 0; i < m * 2 * inc_c; i++)
+        data_cgemv_n.c_verify[i] -= data_cgemv_n.c_test[i];
+    // Vector c has m elements, so the norm must cover m entries (not n)
+    return BLASFUNC(scnrm2)(&m, data_cgemv_n.c_verify, &inc_c);
+}
+
+/**
+ * Compare cgemv with cgemv_n_trusted for trans = 'O' (A is xconj)
+ *
+ * A: 100 rows, 100 columns, lda = 100
+ * b: stride 1
+ * c: stride 1
+ * alpha = (2, -1), beta = (1.4, 5)
+ * The norm of the difference must be within SINGLE_TOL of zero
+ */
+CTEST(cgemv, trans_o_square_matrix)
+{
+    char trans = 'O';
+    blasint m = 100, n = 100, lda = 100;
+    blasint inc_b = 1, inc_c = 1;
+    float alpha[] = {2.0f, -1.0f};
+    float beta[] = {1.4f, 5.0f};
+    float norm;
+
+    norm = check_cgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL);
+}
+
+/**
+ * Compare cgemv with cgemv_n_trusted for trans = 'O' (A is xconj)
+ *
+ * A: 50 rows, 100 columns, lda = 50
+ * b: stride 1
+ * c: stride 1
+ * alpha = (2, -1)
+ * beta = (1.4, 5)
+ * The norm of the difference must be within SINGLE_TOL of zero
+ */
+CTEST(cgemv, trans_o_rectangular_matrix_rows_less_then_cols)
+{
+    char trans = 'O';
+    blasint m = 50, n = 100, lda = 50;
+    blasint inc_b = 1, inc_c = 1;
+    float alpha[] = {2.0f, -1.0f};
+    float beta[] = {1.4f, 5.0f};
+    float norm;
+
+    norm = check_cgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL);
+}
+
+/**
+ * Compare cgemv with cgemv_n_trusted for trans = 'O' (A is xconj)
+ *
+ * A: 100 rows, 50 columns, lda = 100
+ * b: stride 1
+ * c: stride 1
+ * alpha = (2, -1)
+ * beta = (1.4, 5)
+ * The norm of the difference must be within SINGLE_TOL of zero
+ */
+CTEST(cgemv, trans_o_rectangular_matrix_cols_less_then_rows)
+{
+    char trans = 'O';
+    blasint m = 100, n = 50, lda = 100;
+    blasint inc_b = 1, inc_c = 1;
+    float alpha[] = {2.0f, -1.0f};
+    float beta[] = {1.4f, 5.0f};
+    float norm;
+
+    norm = check_cgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL);
+}
+
+/**
+ * Compare cgemv with cgemv_n_trusted for trans = 'O' (A is xconj)
+ *
+ * A: 100 rows, 100 columns, lda = 100
+ * b: stride 2
+ * c: stride 2
+ * alpha = (2, -1), beta = (1.4, 5)
+ * The norm of the difference must be within SINGLE_TOL of zero
+ */
+CTEST(cgemv, trans_o_double_strides)
+{
+    char trans = 'O';
+    blasint m = 100, n = 100, lda = 100;
+    blasint inc_b = 2, inc_c = 2;
+    float alpha[] = {2.0f, -1.0f};
+    float beta[] = {1.4f, 5.0f};
+    float norm;
+
+    norm = check_cgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL);
+}
+
+/**
+ * Compare cgemv with cgemv_n_trusted for trans = 'S' (A is xconj and conj)
+ *
+ * A: 100 rows, 100 columns, lda = 100
+ * b: stride 1
+ * c: stride 1
+ * alpha = (1, 1), beta = (1.4, 5)
+ * The norm of the difference must be within SINGLE_TOL of zero
+ */
+CTEST(cgemv, trans_s_square_matrix)
+{
+    char trans = 'S';
+    blasint m = 100, n = 100, lda = 100;
+    blasint inc_b = 1, inc_c = 1;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.4f, 5.0f};
+    float norm;
+
+    norm = check_cgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL);
+}
+
+/**
+ * Compare cgemv with cgemv_n_trusted for trans = 'S' (A is xconj and conj)
+ *
+ * A: 50 rows, 100 columns, lda = 50
+ * b: stride 1
+ * c: stride 1
+ * alpha = (2, -1)
+ * beta = (1.4, 5)
+ * The norm of the difference must be within SINGLE_TOL of zero
+ */
+CTEST(cgemv, trans_s_rectangular_matrix_rows_less_then_cols)
+{
+    char trans = 'S';
+    blasint m = 50, n = 100, lda = 50;
+    blasint inc_b = 1, inc_c = 1;
+    float alpha[] = {2.0f, -1.0f};
+    float beta[] = {1.4f, 5.0f};
+    float norm;
+
+    norm = check_cgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL);
+}
+
+/**
+ * Compare cgemv with cgemv_n_trusted for trans = 'S' (A is xconj and conj)
+ *
+ * A: 100 rows, 50 columns, lda = 100
+ * b: stride 1
+ * c: stride 1
+ * alpha = (2, -1)
+ * beta = (1.4, 0)
+ * The norm of the difference must be within SINGLE_TOL of zero
+ */
+CTEST(cgemv, trans_s_rectangular_matrix_cols_less_then_rows)
+{
+    char trans = 'S';
+    blasint m = 100, n = 50, lda = 100;
+    blasint inc_b = 1, inc_c = 1;
+    float alpha[] = {2.0f, -1.0f};
+    float beta[] = {1.4f, 0.0f};
+    float norm;
+
+    norm = check_cgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL);
+}
+
+/**
+ * Compare cgemv with cgemv_n_trusted for trans = 'S' (A is xconj and conj)
+ *
+ * A: 100 rows, 100 columns, lda = 100
+ * b: stride 2
+ * c: stride 2
+ * alpha = (2, -1), beta = (1, 5)
+ * The norm of the difference must be within SINGLE_TOL of zero
+ */
+CTEST(cgemv, trans_s_double_strides)
+{
+    char trans = 'S';
+    blasint m = 100, n = 100, lda = 100;
+    blasint inc_b = 2, inc_c = 2;
+    float alpha[] = {2.0f, -1.0f};
+    float beta[] = {1.0f, 5.0f};
+    float norm;
+
+    norm = check_cgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_TOL);
+}
+
+#endif
diff --git a/utest/test_extensions/test_cgemv_t.c b/utest/test_extensions/test_cgemv_t.c
new file mode 100644
index 000000000..cb4e5ad9e
--- /dev/null
+++ b/utest/test_extensions/test_cgemv_t.c
@@ -0,0 +1,1132 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define N 100
+#define M 100
+#define INCREMENT 2
+
+struct DATA_CGEMV_T { /* static buffers used by check_cgemv and matrix_vector_product */
+    float a_test[N * M * 2]; /* matrix A handed to cgemv */
+    float a_verify[N * M * 2]; /* op(A) written by comatcopy, read by matrix_vector_product */
+    float y_test[M * INCREMENT * 2]; /* vector y updated by cgemv, later the difference */
+    float y_verify[M * INCREMENT * 2]; /* vector y updated by the reference caxpby */
+    float x_test[M * INCREMENT * 2]; /* input vector x */
+    float x_verify[M * INCREMENT * 2]; /* product written by matrix_vector_product */
+};
+
+// SINGLE_EPS_ZGEMV = MAX_VAL * NUMBER OF OPERATIONS * FLT_EPSILON
+// SINGLE_EPS_ZGEMV = 5.0 * O(100 * 100) * 1.19e-07 ~= 5e-03
+#define SINGLE_EPS_ZGEMV 5e-03
+
+#ifdef BUILD_COMPLEX
+static struct DATA_CGEMV_T data_cgemv_t;
+
+/**
+ * Reference product built from one cblas_cdotu call per output element:
+ * reads data_cgemv_t.a_verify and data_cgemv_t.x_test, writes data_cgemv_t.x_verify
+ * param n specifies the number of dot products (output elements) to compute
+ * param m is not used by this helper
+ * param lda specifies the dot product length and the step between rows of a_verify
+ * param inc_x specifies the stride of x_test and of the results in x_verify
+ */
+static void matrix_vector_product(blasint n, blasint m, blasint lda, blasint inc_x)
+{
+    openblas_complex_float dot;
+    float *row = data_cgemv_t.a_verify;
+    float *out = data_cgemv_t.x_verify;
+    blasint pos = 0;
+
+    // pos advances by inc_x per result, so the loop body runs n times for inc_x > 0
+    while (pos < n * inc_x)
+    {
+        dot = cblas_cdotu(lda, row, 1, data_cgemv_t.x_test, inc_x);
+        out[0] = CREAL(dot);
+        out[1] = CIMAG(dot);
+        row += lda * 2;
+        out += 2 * inc_x;
+        pos += inc_x;
+    }
+}
+
+/**
+ * Test cgemv by comparing it against comatcopy, caxpby and
+ * reference func matrix_vector_product
+ *
+ * comatcopy perform operation: op(A)
+ * matrix_vector_product perform operation: A*x
+ * caxpby perform operation: alpha*x + beta*y
+ *
+ * param api specifies tested api (C or Fortran)
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation
+ * applied to the matrix A
+ * param m specifies number of rows of A
+ * param n specifies number of columns of A
+ * param alpha specifies scalar alpha
+ * param lda specifies leading dimension of the matrix A
+ * param inc_x specifies increment for vector x
+ * param beta specifies scalar beta
+ * param inc_y specifies increment for vector y
+ * return norm of difference between cgemv and result of reference funcs
+ */
+static float check_cgemv(char api, char order, char trans, blasint m, blasint n, float *alpha,
+                            blasint lda, blasint inc_x, float *beta, blasint inc_y)
+{
+    blasint i;
+
+    enum CBLAS_ORDER corder;
+    enum CBLAS_TRANSPOSE ctrans;
+
+    // Transpose parameters for comatcopy
+    // cgemv_t perform operation on transposed matrix, no need to transpose a_verify
+    char trans_copy;
+    enum CBLAS_TRANSPOSE ctrans_copy; // holds CBLAS_TRANSPOSE values, so use the enum type
+
+    // Param alpha for comatcopy, scale on alpha perform caxpby
+    float alpha_one[] = {1.0f, 0.0f};
+
+    // memset takes an int fill byte; all-zero bytes give 0.0f floats
+    memset(data_cgemv_t.x_verify, 0, m * inc_x * 2 * sizeof(float));
+    // Fill matrix A, vectors x, y
+    srand_generate(data_cgemv_t.a_test, lda * n * 2);
+    srand_generate(data_cgemv_t.x_test, m * inc_x * 2);
+    srand_generate(data_cgemv_t.y_test, m * inc_y * 2);
+
+    // Copy vector y for reference funcs
+    for (i = 0; i < m * inc_y * 2; i++) {
+        data_cgemv_t.y_verify[i] = data_cgemv_t.y_test[i];
+    }
+
+    if (api == 'F') {
+        if (trans == 'T') trans_copy = 'N';
+        if (trans == 'C') trans_copy = 'R';
+        if (trans == 'U') trans_copy = 'R';
+        if (trans == 'D') trans_copy = 'N';
+
+        // Perform operation: op(A)
+        BLASFUNC(comatcopy)(&order, &trans_copy, &m, &n, alpha_one, data_cgemv_t.a_test, &lda, data_cgemv_t.a_verify, &lda);
+
+        // Find A*x
+        matrix_vector_product(n, m, lda, inc_x);
+
+        // Find conj(x)
+        if (trans == 'U' || trans == 'D') {
+            cconjugate_vector(m, inc_x, data_cgemv_t.x_verify);
+        }
+
+        // Find alpha*x+beta*y
+        BLASFUNC(caxpby)(&n, alpha, data_cgemv_t.x_verify, &inc_x, beta, data_cgemv_t.y_verify, &inc_y);
+
+        BLASFUNC(cgemv)(&trans, &m, &n, alpha, data_cgemv_t.a_test,
+                        &lda, data_cgemv_t.x_test, &inc_x, beta, data_cgemv_t.y_test, &inc_y);
+    }
+    else {
+        if (order == 'C') corder = CblasColMajor;
+        if (order == 'R') corder = CblasRowMajor;
+        if (trans == 'T') {ctrans = CblasTrans; ctrans_copy = (corder == CblasRowMajor) ? CblasTrans : CblasNoTrans;}
+        if (trans == 'N') {ctrans = CblasNoTrans; ctrans_copy = (corder == CblasRowMajor) ? CblasNoTrans : CblasTrans;}
+        if (trans == 'C') {ctrans = CblasConjTrans; ctrans_copy = (corder == CblasRowMajor) ? CblasConjTrans : CblasConjNoTrans;}
+        if (trans == 'R') {ctrans = CblasConjNoTrans; ctrans_copy = (corder == CblasRowMajor) ? CblasConjNoTrans : CblasConjTrans;}
+
+        // Perform operation: op(A)
+        cblas_comatcopy(corder, ctrans_copy, m, n, alpha_one, data_cgemv_t.a_test, lda, data_cgemv_t.a_verify, lda);
+
+        // Find A*x
+        matrix_vector_product(n, m, lda, inc_x);
+
+        // Find alpha*x+beta*y
+        cblas_caxpby(n, alpha, data_cgemv_t.x_verify, inc_x, beta, data_cgemv_t.y_verify, inc_y);
+
+        cblas_cgemv(corder, ctrans, m, n, alpha, data_cgemv_t.a_test,
+                        lda, data_cgemv_t.x_test, inc_x, beta, data_cgemv_t.y_test, inc_y);
+    }
+
+    // Find the differences between output vector calculated by cgemv and reference funcs
+    for (i = 0; i < m * inc_y * 2; i++)
+        data_cgemv_t.y_test[i] -= data_cgemv_t.y_verify[i];
+
+    // Find the norm of differences
+    return cblas_scnrm2(m, data_cgemv_t.y_test, inc_y);
+}
+
+/**
+ * Fortran API helper
+ * Arms the xerbla check for "CGEMV " with the expected parameter number,
+ * calls cgemv on two-float dummy buffers and returns the result of check_error
+ *
+ * param order specifies row or column major order (not used by this helper)
+ * param trans specifies op(A), the transposition operation
+ * applied to the matrix A
+ * param m specifies number of rows of A
+ * param n specifies number of columns of A
+ * param lda specifies leading dimension of the matrix A
+ * param inc_x specifies increment for vector x
+ * param inc_y specifies increment for vector y
+ * param expected_info - expected invalid parameter number
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char order, char trans, blasint m, blasint n,
+                        blasint lda, blasint inc_x, blasint inc_y, int expected_info)
+{
+    float a[] = {1.0f, 1.0f};
+    float x[] = {1.0f, 1.0f};
+    float y[] = {1.0f, 1.0f};
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+    int passed;
+
+    set_xerbla("CGEMV ", expected_info);
+    BLASFUNC(cgemv)(&trans, &m, &n, alpha, a, &lda, x, &inc_x, beta, y, &inc_y);
+    passed = check_error();
+    return passed;
+}
+
+/**
+ * C API specific function
+ * Arms the xerbla check for "CGEMV " with the expected parameter number,
+ * calls cblas_cgemv on two-float dummy buffers and returns the result of check_error
+ *
+ * param corder specifies row or column major order
+ * param ctrans specifies op(A), the transposition operation
+ * applied to the matrix A
+ * param m specifies number of rows of A
+ * param n specifies number of columns of A
+ * param lda specifies leading dimension of the matrix A
+ * param inc_x specifies increment for vector x
+ * param inc_y specifies increment for vector y
+ * param expected_info - expected invalid parameter number
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int c_api_check_badargs(CBLAS_ORDER corder, CBLAS_TRANSPOSE ctrans, blasint m, blasint n,
+                        blasint lda, blasint inc_x, blasint inc_y, int expected_info)
+{
+    float a[] = {1.0f, 1.0f};
+    float x[] = {1.0f, 1.0f};
+    float y[] = {1.0f, 1.0f};
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+    int passed;
+
+    set_xerbla("CGEMV ", expected_info);
+    cblas_cgemv(corder, ctrans, m, n, alpha, a, lda, x, inc_x, beta, y, inc_y);
+    passed = check_error();
+    return passed;
+}
+
+/**
+ * Fortran API specific test
+ * Test cgemv by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * inc x = 1, inc y = 1
+ * alpha_r = 2.0f, alpha_i = 1.0f
+ * beta_r = 1.0f, beta_i = 2.0f
+ */
+CTEST(cgemv, colmajor_trans_col_100_row_100_inc_x_1_y_1)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'C';
+    char trans = 'T';
+
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {1.0f, 2.0f};
+
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    float norm = check_cgemv('F', order, trans, m, n, alpha, lda,
+                             inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * Fortran API: cgemv checked against the reference built in check_cgemv
+ *
+ * order  : 'C' (column major)
+ * trans  : 'T' (transposition)
+ * matrix : 100 x 100, lda = 100
+ * inc_x  : 2
+ * inc_y  : 1
+ * alpha  : (1.0f, 1.0f)
+ * beta   : (2.0f, 2.0f)
+ * Passes when the difference norm is within SINGLE_EPS_ZGEMV of zero
+ */
+CTEST(cgemv, colmajor_trans_col_100_row_100_inc_x_2_y_1)
+{
+    char api = 'F';
+    char order = 'C';
+    char trans = 'T';
+
+    blasint m = 100, n = 100, lda = 100;
+    blasint inc_x = 2, inc_y = 1;
+
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {2.0f, 2.0f};
+    float norm;
+
+    norm = check_cgemv(api, order, trans, m, n,
+                       alpha, lda, inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * Fortran API: cgemv checked against the reference built in check_cgemv
+ *
+ * order  : 'C' (column major)
+ * trans  : 'C' (transposition and conjugate A)
+ * matrix : 100 x 100, lda = 100
+ * inc_x  : 1
+ * inc_y  : 1
+ * alpha  : (2.0f, 1.0f)
+ * beta   : (2.0f, 1.0f)
+ * Passes when the difference norm is within SINGLE_EPS_ZGEMV of zero
+ */
+CTEST(cgemv, colmajor_conjtrans_col_100_row_100_inc_x_1_y_1)
+{
+    char api = 'F';
+    char order = 'C';
+    char trans = 'C';
+
+    blasint m = 100, n = 100, lda = 100;
+    blasint inc_x = 1, inc_y = 1;
+
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+    float norm;
+
+    norm = check_cgemv(api, order, trans, m, n,
+                       alpha, lda, inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * Fortran API specific test
+ * Test cgemv by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and conjugate A
+ * Square matrix
+ * inc x = 1, inc y = 2
+ * alpha_r = 1.0f, alpha_i = 1.0f
+ * beta_r = 1.0f, beta_i = 1.0f
+ */
+CTEST(cgemv, colmajor_conjtrans_col_100_row_100_inc_x_1_y_2)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'C';
+    char trans = 'C';
+
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    blasint inc_x = 1;
+    blasint inc_y = 2;
+
+    float norm = check_cgemv('F', order, trans, m, n, alpha, lda,
+                             inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * Fortran API specific test
+ * Test cgemv by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and x conjugate
+ * Square matrix
+ * inc x = 1, inc y = 1
+ * alpha_r = 1.0f, alpha_i = 1.0f
+ * beta_r = 1.0f, beta_i = 1.0f
+ */
+CTEST(cgemv, colmajor_trans_x_conj_col_100_row_100_inc_x_1_y_1)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'C';
+    char trans = 'U';
+
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    float norm = check_cgemv('F', order, trans, m, n, alpha, lda,
+                             inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * Fortran API: cgemv checked against the reference built in check_cgemv
+ *
+ * order  : 'C' (column major)
+ * trans  : 'U' (transposition and x conjugate)
+ * matrix : 100 x 100, lda = 100
+ * inc_x  : 2
+ * inc_y  : 2
+ * alpha  : (1.0f, 2.0f)
+ * beta   : (1.0f, 1.0f)
+ * Passes when the difference norm is within SINGLE_EPS_ZGEMV of zero
+ */
+CTEST(cgemv, colmajor_trans_x_conj_col_100_row_100_inc_x_2_y_2)
+{
+    char api = 'F';
+    char order = 'C';
+    char trans = 'U';
+
+    blasint m = 100, n = 100, lda = 100;
+    blasint inc_x = 2, inc_y = 2;
+
+    float alpha[] = {1.0f, 2.0f};
+    float beta[] = {1.0f, 1.0f};
+    float norm;
+
+    norm = check_cgemv(api, order, trans, m, n,
+                       alpha, lda, inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+
+/**
+ * Fortran API specific test
+ * Test cgemv by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition, conjugate A, conjugate x
+ * Square matrix
+ * inc x = 1, inc y = 2
+ * alpha_r = 2.0f, alpha_i = 1.0f
+ * beta_r = 1.0f, beta_i = 2.0f
+ */
+CTEST(cgemv, colmajor_conjtrans_x_conj_col_100_row_100_inc_x_1_y_2)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'C';
+    char trans = 'D';
+
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {1.0f, 2.0f};
+
+    blasint inc_x = 1;
+    blasint inc_y = 2;
+
+    float norm = check_cgemv('F', order, trans, m, n, alpha, lda,
+                             inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * C API specific test
+ * Test cgemv by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * inc x = 1, inc y = 1
+ * alpha_r = 2.0f, alpha_i = 1.0f
+ * beta_r = 1.0f, beta_i = 2.0f
+ */
+CTEST(cgemv, c_api_colmajor_trans_col_100_row_100_inc_x_1_y_1)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'C';
+    char trans = 'T';
+
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {1.0f, 2.0f};
+
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    float norm = check_cgemv('C', order, trans, m, n, alpha, lda,
+                             inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * C API specific test
+ * Test cgemv by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and conjugate A
+ * Square matrix
+ * inc x = 1, inc y = 1
+ * alpha_r = 1.0f, alpha_i = 1.0f
+ * beta_r = 1.0f, beta_i = 2.0f
+ */
+CTEST(cgemv, c_api_colmajor_conjtrans_col_100_row_100_inc_x_1_y_1)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'C';
+    char trans = 'C';
+
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 2.0f};
+
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    float norm = check_cgemv('C', order, trans, m, n, alpha, lda,
+                             inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * C API: cblas_cgemv checked against the reference built in check_cgemv
+ *
+ * order  : 'C' (column major)
+ * trans  : 'C' (transposition and conjugate A)
+ * matrix : 100 x 100, lda = 100
+ * inc_x  : 1
+ * inc_y  : 2
+ * alpha  : (1.0f, 1.0f)
+ * beta   : (1.0f, 2.0f)
+ * Passes when the difference norm is within SINGLE_EPS_ZGEMV of zero
+ */
+CTEST(cgemv, c_api_colmajor_conjtrans_col_100_row_100_inc_x_1_y_2)
+{
+    char api = 'C';
+    char order = 'C';
+    char trans = 'C';
+
+    blasint m = 100, n = 100, lda = 100;
+    blasint inc_x = 1, inc_y = 2;
+
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 2.0f};
+    float norm;
+
+    norm = check_cgemv(api, order, trans, m, n,
+                       alpha, lda, inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * C API: cblas_cgemv checked against the reference built in check_cgemv
+ *
+ * order  : 'R' (row major)
+ * trans  : 'N' (no transposition)
+ * matrix : 100 x 100, lda = 100
+ * inc_x  : 1, inc_y : 1
+ * alpha  : (2.0f, 1.0f)
+ * beta   : (1.0f, 1.0f)
+ * Passes when the difference norm is within SINGLE_EPS_ZGEMV of zero
+ */
+CTEST(cgemv, c_api_rowmajor_notrans_col_100_row_100_inc_x_1_y_1)
+{
+    char api = 'C';
+    char order = 'R';
+    char trans = 'N';
+
+    blasint m = 100, n = 100, lda = 100;
+    blasint inc_x = 1, inc_y = 1;
+
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+    float norm;
+
+    norm = check_cgemv(api, order, trans, m, n,
+                       alpha, lda, inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * C API specific test
+ * Test cgemv by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * No trans
+ * Square matrix
+ * inc x = 2, inc y = 2
+ * alpha_r = 1.0f, alpha_i = 1.0f
+ * beta_r = 3.0f, beta_i = 1.0f
+ */
+CTEST(cgemv, c_api_rowmajor_notrans_col_100_row_100_inc_x_2_y_2)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'R';
+    char trans = 'N';
+
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {3.0f, 1.0f};
+
+    blasint inc_x = 2;
+    blasint inc_y = 2;
+
+    float norm = check_cgemv('C', order, trans, m, n, alpha, lda,
+                             inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * C API specific test
+ * Test cgemv by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Conjugate
+ * Square matrix
+ * inc x = 1, inc y = 1
+ * alpha_r = 1.0f, alpha_i = 3.0f
+ * beta_r = 1.0f, beta_i = 2.5f
+ */
+CTEST(cgemv, c_api_rowmajor_conj_col_100_row_100_inc_x_1_y_1)
+{
+    blasint m = 100, n = 100;
+    blasint lda = 100;
+    char order = 'R';
+    char trans = 'R';
+
+    float alpha[] = {1.0f, 3.0f};
+    float beta[] = {1.0f, 2.5f};
+
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    float norm = check_cgemv('C', order, trans, m, n, alpha, lda,
+                             inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * C API: cblas_cgemv checked against the reference built in check_cgemv
+ *
+ * order  : 'R' (row major)
+ * trans  : 'R' (conjugate)
+ * matrix : 100 x 100, lda = 100
+ * inc_x  : 2
+ * inc_y  : 1
+ * alpha  : (1.0f, 1.0f)
+ * beta   : (1.0f, 1.5f)
+ * Passes when the difference norm is within SINGLE_EPS_ZGEMV of zero
+ */
+CTEST(cgemv, c_api_rowmajor_conj_col_100_row_100_inc_x_2_y_1)
+{
+    char api = 'C';
+    char order = 'R';
+    char trans = 'R';
+
+    blasint m = 100, n = 100, lda = 100;
+    blasint inc_x = 2, inc_y = 1;
+
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.5f};
+    float norm;
+
+    norm = check_cgemv(api, order, trans, m, n,
+                       alpha, lda, inc_x, beta, inc_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * Fortran API specific test
+ * A zero inc_y must make cgemv report parameter 11 through xerbla
+ * (verified with check_badargs)
+ *
+ * Column major
+ */
+CTEST(cgemv, xerbla_invalid_inc_y)
+{
+    /* 1x1 problem in which only inc_y is set to the rejected value */
+    blasint m = 1, n = 1, lda = 1;
+    blasint inc_x = 1, inc_y = 0;
+    char trans = 'T';
+    char order = 'C';
+
+    /* parameter number the error handler is expected to receive */
+    int expected_info = 11;
+    int passed;
+
+    passed = check_badargs(order, trans, m, n, lda, inc_x, inc_y, expected_info);
+
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * A zero inc_y must make cblas_cgemv report parameter 11 through xerbla
+ * (verified with c_api_check_badargs)
+ *
+ * Column major
+ */
+CTEST(cgemv, c_api_xerbla_invalid_inc_y_col_major)
+{
+    /* 1x1 problem in which only inc_y is set to the rejected value */
+    blasint m = 1, n = 1, lda = 1;
+    blasint inc_x = 1, inc_y = 0;
+    enum CBLAS_TRANSPOSE ctrans = CblasTrans;
+    enum CBLAS_ORDER corder = CblasColMajor;
+
+    /* parameter number the error handler is expected to receive */
+    int expected_info = 11;
+    int passed;
+
+    passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
+
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * A zero inc_y must make cblas_cgemv report parameter 11 through xerbla
+ * (verified with c_api_check_badargs)
+ *
+ * Row major
+ */
+CTEST(cgemv, c_api_xerbla_invalid_inc_y_row_major)
+{
+    /* 1x1 problem in which only inc_y is set to the rejected value */
+    blasint m = 1, n = 1, lda = 1;
+    blasint inc_x = 1, inc_y = 0;
+    enum CBLAS_TRANSPOSE ctrans = CblasNoTrans;
+    enum CBLAS_ORDER corder = CblasRowMajor;
+
+    /* parameter number the error handler is expected to receive */
+    int expected_info = 11;
+    int passed;
+
+    passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
+
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param inc_x.
+ * Must be non-zero
+ * 
+ * Column major
+ */
+CTEST(cgemv, xerbla_invalid_inc_x)
+{
+    /* The error handler is expected to report argument 8 for inc_x = 0 */
+    const int info = 8;
+
+    const blasint rows = 1, cols = 1, ld_a = 1;
+    const blasint step_x = 0, step_y = 1;
+
+    ASSERT_EQUAL(TRUE, check_badargs('C', 'T', rows, cols, ld_a, step_x, step_y, info));
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param inc_x.
+ * Must be non-zero
+ * 
+ * Column major
+ */
+CTEST(cgemv, c_api_xerbla_invalid_inc_x_col_major)
+{
+    /* The error handler is expected to report argument 8 for inc_x = 0 */
+    const int info = 8;
+
+    const blasint rows = 1, cols = 1, ld_a = 1;
+    const blasint step_x = 0, step_y = 1;
+
+    int ok = c_api_check_badargs(CblasColMajor, CblasTrans, rows, cols, ld_a,
+                                 step_x, step_y, info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param inc_x.
+ * Must be non-zero
+ *
+ * Row major
+ */
+CTEST(cgemv, c_api_xerbla_invalid_inc_x_row_major)
+{
+    /* Row-major layout without transposition, as in the other row-major bad-argument tests */
+    enum CBLAS_ORDER corder = CblasRowMajor;
+    enum CBLAS_TRANSPOSE ctrans = CblasNoTrans;
+
+    blasint m = 1, n = 1;
+    blasint lda = 1;
+
+    /* inc_x = 0 is the invalid argument under test */
+    blasint inc_x = 0;
+    blasint inc_y = 1;
+
+    /* Position of inc_x in the argument list reported by the error handler */
+    int expected_info = 8;
+
+    int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param n.
+ * Must be non-negative.
+ * 
+ * Column major
+ */
+CTEST(cgemv, xerbla_invalid_n)
+{
+    /* The error handler is expected to report argument 3 for the invalid n */
+    const int info = 3;
+
+    const blasint rows = 1, cols = INVALID, ld_a = 1;
+    const blasint step_x = 1, step_y = 1;
+
+    ASSERT_EQUAL(TRUE, check_badargs('C', 'T', rows, cols, ld_a, step_x, step_y, info));
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param n.
+ * Must be non-negative.
+ * 
+ * Column major
+ */
+CTEST(cgemv, c_api_xerbla_invalid_n_col_major)
+{
+    /* The error handler is expected to report argument 3 for the invalid n */
+    const int info = 3;
+
+    const blasint rows = 1, cols = INVALID, ld_a = 1;
+    const blasint step_x = 1, step_y = 1;
+
+    int ok = c_api_check_badargs(CblasColMajor, CblasTrans, rows, cols, ld_a,
+                                 step_x, step_y, info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param n.
+ * Must be non-negative.
+ * 
+ * Row major
+ */
+CTEST(cgemv, c_api_xerbla_invalid_n_row_major)
+{
+    /* Argument 3 is expected; note that it is the first dimension that is set to INVALID here */
+    const int info = 3;
+
+    const blasint rows = INVALID, cols = 1, ld_a = 1;
+    const blasint step_x = 1, step_y = 1;
+
+    int ok = c_api_check_badargs(CblasRowMajor, CblasNoTrans, rows, cols, ld_a,
+                                 step_x, step_y, info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param m.
+ * Must be non-negative.
+ * 
+ * Column major
+ */
+CTEST(cgemv, xerbla_invalid_m)
+{
+    /* The error handler is expected to report argument 2 for the invalid m */
+    const int info = 2;
+
+    const blasint rows = INVALID, cols = 1, ld_a = 1;
+    const blasint step_x = 1, step_y = 1;
+
+    ASSERT_EQUAL(TRUE, check_badargs('C', 'T', rows, cols, ld_a, step_x, step_y, info));
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param m.
+ * Must be non-negative.
+ * 
+ * Column major
+ */
+CTEST(cgemv, c_api_xerbla_invalid_m_col_major)
+{
+    /* The error handler is expected to report argument 2 for the invalid m */
+    const int info = 2;
+
+    const blasint rows = INVALID, cols = 1, ld_a = 1;
+    const blasint step_x = 1, step_y = 1;
+
+    int ok = c_api_check_badargs(CblasColMajor, CblasTrans, rows, cols, ld_a,
+                                 step_x, step_y, info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param m.
+ * Must be non-negative.
+ * 
+ * Row major
+ */
+CTEST(cgemv, c_api_xerbla_invalid_m_row_major)
+{
+    /* Argument 2 is expected; note that it is the second dimension that is set to INVALID here */
+    const int info = 2;
+
+    const blasint rows = 1, cols = INVALID, ld_a = 1;
+    const blasint step_x = 1, step_y = 1;
+
+    int ok = c_api_check_badargs(CblasRowMajor, CblasNoTrans, rows, cols, ld_a,
+                                 step_x, step_y, info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param lda.
+ * lda must be at least max(1, m).
+ * 
+ * Column major
+ */
+CTEST(cgemv, xerbla_invalid_lda)
+{
+    /* The error handler is expected to report argument 6 for the invalid lda */
+    const int info = 6;
+
+    const blasint rows = 1, cols = 1, ld_a = INVALID;
+    const blasint step_x = 1, step_y = 1;
+
+    ASSERT_EQUAL(TRUE, check_badargs('C', 'T', rows, cols, ld_a, step_x, step_y, info));
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param lda.
+ * If matrices are stored using col major layout, 
+ * lda must be at least m.
+ * 
+ * Column major
+ */
+CTEST(cgemv, c_api_xerbla_invalid_lda_col_major)
+{
+    /* The error handler is expected to report argument 6 for the invalid lda */
+    const int info = 6;
+
+    const blasint rows = 1, cols = 1, ld_a = INVALID;
+    const blasint step_x = 1, step_y = 1;
+
+    int ok = c_api_check_badargs(CblasColMajor, CblasTrans, rows, cols, ld_a,
+                                 step_x, step_y, info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param lda.
+ * If matrices are stored using row major layout,
+ * lda must be at least n.
+ *
+ * Row major
+ */
+CTEST(cgemv, c_api_xerbla_invalid_lda_row_major)
+{
+    /* The error handler is expected to report argument 6 for the invalid lda */
+    const int info = 6;
+
+    const blasint rows = 1, cols = 1, ld_a = INVALID;
+    const blasint step_x = 1, step_y = 1;
+
+    int ok = c_api_check_badargs(CblasRowMajor, CblasNoTrans, rows, cols, ld_a,
+                                 step_x, step_y, info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param trans.
+ * 
+ * Column major
+ */
+CTEST(cgemv, xerbla_invalid_trans)
+{
+    /* 'Z' is the invalid trans value; argument 1 is expected to be reported */
+    const char bad_trans = 'Z';
+    const int info = 1;
+
+    const blasint rows = 1, cols = 1, ld_a = 1;
+    const blasint step_x = 1, step_y = 1;
+
+    ASSERT_EQUAL(TRUE, check_badargs('C', bad_trans, rows, cols, ld_a, step_x, step_y, info));
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param trans.
+ * 
+ * Column major
+ */
+CTEST(cgemv, c_api_xerbla_invalid_trans_col_major)
+{
+    /* INVALID is used as the transpose value; argument 1 is expected to be reported */
+    const enum CBLAS_TRANSPOSE bad_trans = INVALID;
+    const int info = 1;
+
+    const blasint rows = 1, cols = 1, ld_a = 1;
+    const blasint step_x = 1, step_y = 1;
+
+    int ok = c_api_check_badargs(CblasColMajor, bad_trans, rows, cols, ld_a,
+                                 step_x, step_y, info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param trans.
+ * 
+ * Row major
+ */
+CTEST(cgemv, c_api_xerbla_invalid_trans_row_major)
+{
+    /* INVALID is used as the transpose value; argument 1 is expected to be reported */
+    const enum CBLAS_TRANSPOSE bad_trans = INVALID;
+    const int info = 1;
+
+    const blasint rows = 1, cols = 1, ld_a = 1;
+    const blasint step_x = 1, step_y = 1;
+
+    int ok = c_api_check_badargs(CblasRowMajor, bad_trans, rows, cols, ld_a,
+                                 step_x, step_y, info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param order. 
+ */
+CTEST(cgemv, c_api_xerbla_invalid_order_col_major)
+{
+    /* INVALID is used as the order value; info 0 is expected to be reported */
+    const enum CBLAS_ORDER bad_order = INVALID;
+    const int info = 0;
+
+    const blasint rows = 1, cols = 1, ld_a = 1;
+    const blasint step_x = 1, step_y = 1;
+
+    int ok = c_api_check_badargs(bad_order, CblasTrans, rows, cols, ld_a,
+                                 step_x, step_y, info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+#endif
diff --git a/utest/test_extensions/test_cimatcopy.c b/utest/test_extensions/test_cimatcopy.c
new file mode 100644
index 000000000..a4b1e30ac
--- /dev/null
+++ b/utest/test_extensions/test_cimatcopy.c
@@ -0,0 +1,818 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+
+/* Buffers used by the cimatcopy tests; each holds DATASIZE * DATASIZE * 2 floats */
+struct DATA_CIMATCOPY {
+    /* Filled with generated data, passed to cimatcopy, then compared with a_verify */
+    float a_test[DATASIZE * DATASIZE * 2];
+    /* Receives the output of the reference ccopy/ctranspose helpers */
+    float a_verify[DATASIZE * DATASIZE * 2];
+};
+
+#ifdef BUILD_COMPLEX
+/* File-scope instance used by check_cimatcopy and check_badargs */
+static struct DATA_CIMATCOPY data_cimatcopy;
+
+/**
+ * Compare results computed by cimatcopy and reference func
+ *
+ * param api specifies tested api: 'F' calls the Fortran interface,
+ * any other value calls the C interface
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation
+ * applied to the matrix A
+ * param rows specifies number of rows of A
+ * param cols specifies number of columns of A
+ * param alpha specifies scaling factor for matrix A
+ * param lda_src - leading dimension of the matrix A
+ * param lda_dst - leading dimension of output matrix A
+ * return norm of difference between openblas and reference func
+ */
+static float check_cimatcopy(char api, char order, char trans, blasint rows, blasint cols, float *alpha,
+                             blasint lda_src, blasint lda_dst)
+{
+    blasint m, n;
+    blasint rows_out, cols_out;
+    enum CBLAS_ORDER corder;
+    enum CBLAS_TRANSPOSE ctrans;
+    int conj = -1;
+
+    // The reference helpers receive rows/cols swapped for column major input
+    if (order == 'C') {
+        n = rows; m = cols;
+    }
+    else {
+        m = rows; n = cols;
+    }
+
+    // Shape of the region compared at the end; the column count is doubled
+    // because each complex element is stored as two floats.
+    // conj is forwarded to the reference helpers: 1 for 'C' and 'R', -1 otherwise
+    if (trans == 'T' || trans == 'C') {
+        rows_out = n; cols_out = m*2;
+        if (trans == 'C')
+            conj = 1;
+    }
+    else {
+        rows_out = m; cols_out = n*2;
+        if (trans == 'R')
+            conj = 1;
+    }
+
+    srand_generate(data_cimatcopy.a_test, lda_src*m*2);
+
+    // Build the expected result in a_verify before a_test is handed to cimatcopy
+    if (trans == 'T' || trans == 'C') {
+        ctranspose(m, n, alpha, data_cimatcopy.a_test, lda_src, data_cimatcopy.a_verify, lda_dst, conj);
+    }
+    else {
+        ccopy(m, n, alpha, data_cimatcopy.a_test, lda_src, data_cimatcopy.a_verify, lda_dst, conj);
+    }
+
+    if (api == 'F') {
+        BLASFUNC(cimatcopy)(&order, &trans, &rows, &cols, alpha, data_cimatcopy.a_test,
+                            &lda_src, &lda_dst);
+    }
+    else {
+        // Always assign corder/ctrans so no indeterminate value reaches the C API.
+        // The fallbacks mirror the reference path above: any order other than 'C'
+        // is row major, any trans other than 'T'/'C'/'R' is a plain copy.
+        corder = (order == 'C') ? CblasColMajor : CblasRowMajor;
+        switch (trans) {
+            case 'T': ctrans = CblasTrans;       break;
+            case 'C': ctrans = CblasConjTrans;   break;
+            case 'R': ctrans = CblasConjNoTrans; break;
+            default:  ctrans = CblasNoTrans;     break;
+        }
+        cblas_cimatcopy(corder, ctrans, rows, cols, alpha, data_cimatcopy.a_test,
+                    lda_src, lda_dst);
+    }
+
+    // Find the differences between output matrix computed by cimatcopy and reference func
+    return smatrix_difference(data_cimatcopy.a_test, data_cimatcopy.a_verify, cols_out, rows_out, 2*lda_dst);
+}
+
+/**
+ * Check if error function was called with expected function name
+ * and param info
+ *
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation 
+ * applied to the matrix A
+ * param rows specifies number of rows of A
+ * param cols specifies number of columns of A
+ * param lda_src - leading dimension of the matrix A
+ * param lda_dst - leading dimension of output matrix A
+ * param expected_info - expected invalid parameter number
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char order, char trans, blasint rows, blasint cols,
+                          blasint lda_src, blasint lda_dst, int expected_info)
+{
+    /* alpha = 1 + 1i */
+    float scale[2] = {1.0f, 1.0f};
+    float *matrix = data_cimatcopy.a_test;
+
+    /* Register the routine name and info value that check_error() is compared against */
+    set_xerbla("CIMATCOPY", expected_info);
+
+    BLASFUNC(cimatcopy)(&order, &trans, &rows, &cols, scale, matrix,
+                        &lda_src, &lda_dst);
+
+    return check_error();
+}
+
+/**
+ * Fortran API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(cimatcopy, colmajor_trans_col_100_row_100)
+{
+    /* alpha = 1 + 2i */
+    float scale[2] = {1.0f, 2.0f};
+    const blasint rows = 100, cols = 100;
+    const blasint ld_in = 100, ld_out = 100;
+
+    /* Fortran api ('F'), column major ('C'), transposition ('T') */
+    float diff = check_cimatcopy('F', 'C', 'T', rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Square matrix
+ * alpha_r = -3.0, alpha_i = 1.0
+ */
+CTEST(cimatcopy, colmajor_notrans_col_100_row_100)
+{
+    /* alpha = -3 + 1i */
+    float scale[2] = {-3.0f, 1.0f};
+    const blasint rows = 100, cols = 100;
+    const blasint ld_in = 100, ld_out = 100;
+
+    /* Fortran api ('F'), column major ('C'), copy only ('N') */
+    float diff = check_cimatcopy('F', 'C', 'N', rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(cimatcopy, colmajor_conj_col_100_row_100)
+{
+    /* alpha = 1 + 2i */
+    float scale[2] = {1.0f, 2.0f};
+    const blasint rows = 100, cols = 100;
+    const blasint ld_in = 100, ld_out = 100;
+
+    /* Fortran api ('F'), column major ('C'), copy and conjugate ('R') */
+    float diff = check_cimatcopy('F', 'C', 'R', rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and conjugate
+ * alpha_r = 2.0, alpha_i = 1.0
+ */
+CTEST(cimatcopy, colmajor_conjtrans_col_100_row_100)
+{
+    /* alpha = 2 + 1i */
+    float scale[2] = {2.0f, 1.0f};
+    const blasint rows = 100, cols = 100;
+    const blasint ld_in = 100, ld_out = 100;
+
+    /* Fortran api ('F'), column major ('C'), transposition and conjugate ('C') */
+    float diff = check_cimatcopy('F', 'C', 'C', rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Rectangular matrix
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(cimatcopy, colmajor_trans_col_50_row_100)
+{
+    /* alpha = 1 + 2i */
+    float scale[2] = {1.0f, 2.0f};
+    /* 100 rows by 50 columns; leading dimension goes from 100 to 50 */
+    const blasint rows = 100, cols = 50;
+    const blasint ld_in = 100, ld_out = 50;
+
+    /* Fortran api ('F'), column major ('C'), transposition ('T') */
+    float diff = check_cimatcopy('F', 'C', 'T', rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Rectangular matrix
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(cimatcopy, colmajor_notrans_col_100_row_50)
+{
+    /* alpha = 1 + 2i */
+    float scale[2] = {1.0f, 2.0f};
+    /* 50 rows by 100 columns; leading dimension stays 50 */
+    const blasint rows = 50, cols = 100;
+    const blasint ld_in = 50, ld_out = 50;
+
+    /* Fortran api ('F'), column major ('C'), copy only ('N') */
+    float diff = check_cimatcopy('F', 'C', 'N', rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific tests
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and conjugate
+ * Rectangular matrix
+ * alpha_r = 1.0, alpha_i = 1.0
+ */
+CTEST(cimatcopy, colmajor_conjtrans_col_50_row_100)
+{
+    /* alpha = 1 + 1i */
+    float scale[2] = {1.0f, 1.0f};
+    /* 100 rows by 50 columns; leading dimension goes from 100 to 50 */
+    const blasint rows = 100, cols = 50;
+    const blasint ld_in = 100, ld_out = 50;
+
+    /* Fortran api ('F'), column major ('C'), transposition and conjugate ('C') */
+    float diff = check_cimatcopy('F', 'C', 'C', rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy and conjugate
+ * Rectangular matrix
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(cimatcopy, colmajor_conj_col_100_row_50)
+{
+    /* alpha = 1 + 2i */
+    float scale[2] = {1.0f, 2.0f};
+    /* 50 rows by 100 columns; leading dimension stays 50 */
+    const blasint rows = 50, cols = 100;
+    const blasint ld_in = 50, ld_out = 50;
+
+    /* Fortran api ('F'), column major ('C'), copy and conjugate ('R') */
+    float diff = check_cimatcopy('F', 'C', 'R', rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Square matrix
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(cimatcopy, rowmajor_trans_col_100_row_100)
+{
+    /* alpha = 1 + 2i */
+    float scale[2] = {1.0f, 2.0f};
+    const blasint rows = 100, cols = 100;
+    const blasint ld_in = 100, ld_out = 100;
+
+    /* Fortran api ('F'), row major ('R'), transposition ('T') */
+    float diff = check_cimatcopy('F', 'R', 'T', rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Square matrix
+ * alpha_r = 2.0, alpha_i = 3.0
+ */
+CTEST(cimatcopy, rowmajor_notrans_col_100_row_100)
+{
+    /* alpha = 2 + 3i */
+    float scale[2] = {2.0f, 3.0f};
+    const blasint rows = 100, cols = 100;
+    const blasint ld_in = 100, ld_out = 100;
+
+    /* Fortran api ('F'), row major ('R'), copy only ('N') */
+    float diff = check_cimatcopy('F', 'R', 'N', rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific tests
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(cimatcopy, rowmajor_conj_col_100_row_100)
+{
+    /* alpha = 1 + 2i */
+    float scale[2] = {1.0f, 2.0f};
+    const blasint rows = 100, cols = 100;
+    const blasint ld_in = 100, ld_out = 100;
+
+    /* Fortran api ('F'), row major ('R'), copy and conjugate ('R') */
+    float diff = check_cimatcopy('F', 'R', 'R', rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific tests
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition and conjugate
+ * alpha_r = 2.0, alpha_i = 1.0
+ */
+CTEST(cimatcopy, rowmajor_conjtrans_col_100_row_100)
+{
+    /* alpha = 2 + 1i */
+    float scale[2] = {2.0f, 1.0f};
+    const blasint rows = 100, cols = 100;
+    const blasint ld_in = 100, ld_out = 100;
+
+    /* Fortran api ('F'), row major ('R'), transposition and conjugate ('C') */
+    float diff = check_cimatcopy('F', 'R', 'C', rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Rectangular matrix
+ * alpha_r = 2.0, alpha_i = 1.0
+ */
+CTEST(cimatcopy, rowmajor_notrans_col_50_row_100)
+{
+    /* alpha = 2 + 1i */
+    float scale[2] = {2.0f, 1.0f};
+    /* 100 rows by 50 columns; leading dimension stays 50 */
+    const blasint rows = 100, cols = 50;
+    const blasint ld_in = 50, ld_out = 50;
+
+    /* Fortran api ('F'), row major ('R'), copy only ('N') */
+    float diff = check_cimatcopy('F', 'R', 'N', rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Rectangular matrix
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(cimatcopy, rowmajor_trans_col_50_row_100)
+{
+    /* alpha = 1 + 2i */
+    float scale[2] = {1.0f, 2.0f};
+    /* 100 rows by 50 columns; leading dimension goes from 50 to 100 */
+    const blasint rows = 100, cols = 50;
+    const blasint ld_in = 50, ld_out = 100;
+
+    /* Fortran api ('F'), row major ('R'), transposition ('T') */
+    float diff = check_cimatcopy('F', 'R', 'T', rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy and conjugate
+ * alpha_r = 1.5, alpha_i = -1.0
+ */
+CTEST(cimatcopy, rowmajor_conj_col_50_row_100)
+{
+    /* alpha = 1.5 - 1i */
+    float scale[2] = {1.5f, -1.0f};
+    /* 100 rows by 50 columns; leading dimension stays 50 */
+    const blasint rows = 100, cols = 50;
+    const blasint ld_in = 50, ld_out = 50;
+
+    /* Fortran api ('F'), row major ('R'), copy and conjugate ('R') */
+    float diff = check_cimatcopy('F', 'R', 'R', rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(cimatcopy, rowmajor_conjtrans_col_50_row_100)
+{
+    /* alpha = 1 + 2i */
+    float scale[2] = {1.0f, 2.0f};
+    /* 100 rows by 50 columns; leading dimension goes from 50 to 100 */
+    const blasint rows = 100, cols = 50;
+    const blasint ld_in = 50, ld_out = 100;
+
+    /* Fortran api ('F'), row major ('R'), transposition and conjugate ('C') */
+    float diff = check_cimatcopy('F', 'R', 'C', rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * alpha_r = 3.0, alpha_i = 2.0
+ */
+CTEST(cimatcopy, c_api_colmajor_trans_col_100_row_100)
+{
+    /* alpha = 3 + 2i */
+    float scale[2] = {3.0f, 2.0f};
+    const blasint rows = 100, cols = 100;
+    const blasint ld_in = 100, ld_out = 100;
+
+    /* C api ('C'), column major ('C'), transposition ('T') */
+    float diff = check_cimatcopy('C', 'C', 'T', rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Square matrix
+ * alpha_r = 3.0, alpha_i = 1.5
+ */
+CTEST(cimatcopy, c_api_colmajor_notrans_col_100_row_100)
+{
+    /* alpha = 3 + 1.5i */
+    float scale[2] = {3.0f, 1.5f};
+    const blasint rows = 100, cols = 100;
+    const blasint ld_in = 100, ld_out = 100;
+
+    /* C api ('C'), column major ('C'), copy only ('N') */
+    float diff = check_cimatcopy('C', 'C', 'N', rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Square matrix
+ * alpha_r = 3.0, alpha_i = 1.0
+ */
+CTEST(cimatcopy, c_api_rowmajor_trans_col_100_row_100)
+{
+    /* alpha = 3 + 1i */
+    float scale[2] = {3.0f, 1.0f};
+    const blasint rows = 100, cols = 100;
+    const blasint ld_in = 100, ld_out = 100;
+
+    /* C api ('C'), row major ('R'), transposition ('T') */
+    float diff = check_cimatcopy('C', 'R', 'T', rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(cimatcopy, c_api_colmajor_conj_col_100_row_100)
+{
+    /* alpha = 1 + 2i */
+    float scale[2] = {1.0f, 2.0f};
+    const blasint rows = 100, cols = 100;
+    const blasint ld_in = 100, ld_out = 100;
+
+    /* C api ('C'), column major ('C'), copy and conjugate ('R') */
+    float diff = check_cimatcopy('C', 'C', 'R', rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and conjugate
+ * alpha_r = 2.0, alpha_i = 1.0
+ */
+CTEST(cimatcopy, c_api_colmajor_conjtrans_col_100_row_100)
+{
+    /* alpha = 2 + 1i */
+    float scale[2] = {2.0f, 1.0f};
+    const blasint rows = 100, cols = 100;
+    const blasint ld_in = 100, ld_out = 100;
+
+    /* C api ('C'), column major ('C'), transposition and conjugate ('C') */
+    float diff = check_cimatcopy('C', 'C', 'C', rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Square matrix
+ * alpha_r = 1.0, alpha_i = 1.0
+ */
+CTEST(cimatcopy, c_api_rowmajor_notrans_col_100_row_100)
+{
+    /* alpha = 1 + 1i */
+    float scale[2] = {1.0f, 1.0f};
+    const blasint rows = 100, cols = 100;
+    const blasint ld_in = 100, ld_out = 100;
+
+    /* C api ('C'), row major ('R'), copy only ('N') */
+    float diff = check_cimatcopy('C', 'R', 'N', rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy and conjugate
+ * alpha_r = 1.5, alpha_i = -1.0
+ */
+CTEST(cimatcopy, c_api_rowmajor_conj_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'R';
+    float alpha[] = {1.5f, -1.0f};
+
+    /* 'C' selects the C interface, as the c_api_ test name requires */
+    float norm = check_cimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(cimatcopy, c_api_rowmajor_conjtrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'R';
+    char trans = 'C';
+    float alpha[] = {1.0f, 2.0f};
+
+    /* 'C' selects the C interface, as the c_api_ test name requires */
+    float norm = check_cimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Test error function for an invalid param order.
+ * Must be column (C) or row major (R).
+ */
+CTEST(cimatcopy, xerbla_invalid_order)
+{
+    /* 'O' is neither 'C' nor 'R'; argument 1 is expected to be reported */
+    const char bad_order = 'O';
+    const int info = 1;
+
+    const blasint rows = 100, cols = 100;
+    const blasint ld_in = 100, ld_out = 100;
+
+    ASSERT_EQUAL(TRUE, check_badargs(bad_order, 'T', rows, cols, ld_in, ld_out, info));
+}
+
+/**
+ * Test error function for an invalid param trans.
+ * Must be trans (T/C) or no-trans (N/R).
+ */
+CTEST(cimatcopy, xerbla_invalid_trans)
+{
+    /* 'O' is not one of T/C/N/R; argument 2 is expected to be reported */
+    const char bad_trans = 'O';
+    const int info = 2;
+
+    const blasint rows = 100, cols = 100;
+    const blasint ld_in = 100, ld_out = 100;
+
+    ASSERT_EQUAL(TRUE, check_badargs('C', bad_trans, rows, cols, ld_in, ld_out, info));
+}
+
+/**
+ * Test error function for an invalid param lda_src.
+ * If matrices are stored using row major layout, 
+ * lda_src must be at least n.
+ */
+CTEST(cimatcopy, xerbla_rowmajor_invalid_lda)
+{
+    /* Row major with lda_src (50) below n (100); argument 7 is expected to be reported */
+    const int info = 7;
+
+    const blasint rows = 50, cols = 100;
+    const blasint ld_in = 50, ld_out = 100;
+
+    ASSERT_EQUAL(TRUE, check_badargs('R', 'T', rows, cols, ld_in, ld_out, info));
+}
+
+/**
+ * Test error function for an invalid param lda_src.
+ * If matrices are stored using column major layout,
+ * lda_src must be at least m.
+ */
+CTEST(cimatcopy, xerbla_colmajor_invalid_lda)
+{
+    /* Column major with lda_src (50) below m (100); argument 7 is expected to be reported */
+    const int info = 7;
+
+    const blasint rows = 100, cols = 50;
+    const blasint ld_in = 50, ld_out = 100;
+
+    ASSERT_EQUAL(TRUE, check_badargs('C', 'T', rows, cols, ld_in, ld_out, info));
+}
+
+/**
+ * Test error function for an invalid param lda_dst.
+ * If matrices are stored using row major layout and 
+ * there is no transposition, lda_dst must be at least n.
+ */
+CTEST(cimatcopy, xerbla_rowmajor_notrans_invalid_ldb)
+{
+    /* Row major, no transposition, lda_dst (50) below n (100); argument 9 is expected */
+    const int info = 9;
+
+    const blasint rows = 50, cols = 100;
+    const blasint ld_in = 100, ld_out = 50;
+
+    ASSERT_EQUAL(TRUE, check_badargs('R', 'N', rows, cols, ld_in, ld_out, info));
+}
+
+/**
+ * Test error function for an invalid param lda_dst.
+ * If matrices are stored using row major layout and 
+ * there is transposition, lda_dst must be at least m.
+ */
+CTEST(cimatcopy, xerbla_rowmajor_trans_invalid_ldb)
+{
+    /* Row major, transposition, lda_dst (50) below m (100); argument 9 is expected */
+    const int info = 9;
+
+    const blasint rows = 100, cols = 50;
+    const blasint ld_in = 100, ld_out = 50;
+
+    ASSERT_EQUAL(TRUE, check_badargs('R', 'T', rows, cols, ld_in, ld_out, info));
+}
+
+/**
+ * Test error function for an invalid param lda_dst.
+ * If matrices are stored using column major layout and 
+ * there is no transposition, lda_dst must be at least m.
+ */
+CTEST(cimatcopy, xerbla_colmajor_notrans_invalid_ldb)
+{
+    /* Column major, no transposition, lda_dst (50) below m (100); argument 9 is expected */
+    const int info = 9;
+
+    const blasint rows = 100, cols = 50;
+    const blasint ld_in = 100, ld_out = 50;
+
+    ASSERT_EQUAL(TRUE, check_badargs('C', 'N', rows, cols, ld_in, ld_out, info));
+}
+
+/**
+ * Test error function for an invalid param lda_dst.
+ * If matrices are stored using column major layout and 
+ * there is transposition, lda_dst must be at least n.
+ */
+CTEST(cimatcopy, xerbla_colmajor_trans_invalid_ldb)
+{
+    /* Column major, transposition, lda_dst (50) below n (100); argument 9 is expected */
+    const int info = 9;
+
+    const blasint rows = 50, cols = 100;
+    const blasint ld_in = 100, ld_out = 50;
+
+    ASSERT_EQUAL(TRUE, check_badargs('C', 'T', rows, cols, ld_in, ld_out, info));
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_comatcopy.c b/utest/test_extensions/test_comatcopy.c
new file mode 100644
index 000000000..71663406a
--- /dev/null
+++ b/utest/test_extensions/test_comatcopy.c
@@ -0,0 +1,696 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+
+struct DATA_COMATCOPY {
+    float a_test[DATASIZE * DATASIZE * 2];   /* source matrix A, filled with random data */
+    float b_test[DATASIZE * DATASIZE * 2];   /* matrix B written by comatcopy under test */
+    float b_verify[DATASIZE * DATASIZE * 2]; /* matrix B written by the reference ctranspose/ccopy */
+};
+
+#ifdef BUILD_COMPLEX
+static struct DATA_COMATCOPY data_comatcopy;
+
+/**
+ * Compare results computed by comatcopy and reference func
+ *
+ * param api specifies tested api (C or Fortran)
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation
+ * applied to the matrix A
+ * param rows - number of rows of A
+ * param cols - number of columns of A
+ * param alpha - scaling factor for matrix B
+ * param lda - leading dimension of the matrix A
+ * param ldb - leading dimension of the matrix B
+ * return norm of difference between openblas and reference func
+ */
+static float check_comatcopy(char api, char order, char trans, blasint rows, blasint cols, float* alpha, 
+                             blasint lda, blasint ldb)
+{
+    blasint b_rows, b_cols;
+    blasint m, n;
+    int conj = -1;
+
+    /* Defaults match the fall-through branches used for the reference result below,
+     * so the C API arguments are never read uninitialized for an unexpected code. */
+    enum CBLAS_ORDER corder = CblasRowMajor;
+    enum CBLAS_TRANSPOSE ctrans = CblasNoTrans;
+
+    /* Column-major input is handed to the reference helpers with rows/cols swapped. */
+    if (order == 'C') {
+        m = cols; n = rows;
+    } else {
+        m = rows; n = cols;
+    }
+
+    /* Extent of B in floats (two per complex element); 'C' and 'R' request conjugation. */
+    if (trans == 'T' || trans == 'C') {
+        b_rows = n; b_cols = m*2;
+        if (trans == 'C')
+            conj = 1;
+    } else {
+        b_rows = m; b_cols = n*2;
+        if (trans == 'R')
+            conj = 1;
+    }
+
+    srand_generate(data_comatcopy.a_test, lda*m*2);
+
+    if (trans == 'T' || trans == 'C') {
+        ctranspose(m, n, alpha, data_comatcopy.a_test, lda, data_comatcopy.b_verify, ldb, conj);
+    } else {
+        ccopy(m, n, alpha, data_comatcopy.a_test, lda, data_comatcopy.b_verify, ldb, conj);
+    }
+
+    /* b_verify now holds the reference; b_test receives the routine under test. */
+    if (api == 'F') {
+        BLASFUNC(comatcopy)(&order, &trans, &rows, &cols, alpha, data_comatcopy.a_test, 
+                            &lda, data_comatcopy.b_test, &ldb);
+    } else {
+        if (order == 'C') corder = CblasColMajor;
+        if (trans == 'T') ctrans = CblasTrans;
+        else if (trans == 'C') ctrans = CblasConjTrans;
+        else if (trans == 'R') ctrans = CblasConjNoTrans;
+        cblas_comatcopy(corder, ctrans, rows, cols, alpha, data_comatcopy.a_test, 
+                    lda, data_comatcopy.b_test, ldb);
+    }
+
+    return smatrix_difference(data_comatcopy.b_test, data_comatcopy.b_verify, b_cols, b_rows, ldb*2);
+}
+
+/**
+ * Check if error function was called with expected function name
+ * and param info
+ *
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation
+ * applied to the matrix A
+ * param rows - number of rows of A
+ * param cols - number of columns of A
+ * param lda - leading dimension of the matrix A
+ * param ldb - leading dimension of the matrix B
+ * param expected_info - expected invalid parameter number
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char order, char trans, blasint rows, blasint cols,
+                          blasint lda, blasint ldb, int expected_info)
+{
+    float unit_alpha[2] = {1.0f, 1.0f};
+    int as_expected;
+    /* Register the routine name and parameter number the error handler should see. */
+    set_xerbla("COMATCOPY", expected_info);
+    BLASFUNC(comatcopy)(&order, &trans, &rows, &cols, unit_alpha, data_comatcopy.a_test,
+                        &lda, data_comatcopy.b_test, &ldb);
+    as_expected = check_error();
+    return as_expected;
+}
+
+/**
+ * Fortran API specific test
+ * Test comatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(comatcopy, colmajor_notrans_col_50_row_100)
+{
+    /* Fortran interface, column major: plain copy of a 100 x 50 matrix. */
+    float scale[] = {1.0f, 2.0f};
+    blasint a_rows = 100, a_cols = 50;
+    blasint ld_a = 100;
+    blasint ld_b = 100;
+
+    float diff = check_comatcopy('F', 'C', 'N', a_rows, a_cols, scale, ld_a, ld_b);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test comatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * alpha_r = -1.0, alpha_i = 2.0
+ */
+CTEST(comatcopy, colmajor_trans_col_50_row_100)
+{
+    /* Fortran interface, column major: transpose of a 100 x 50 matrix. */
+    float scale[] = {-1.0f, 2.0f};
+    blasint a_rows = 100, a_cols = 50;
+    blasint ld_a = 100;
+    blasint ld_b = 50;
+
+    float diff = check_comatcopy('F', 'C', 'T', a_rows, a_cols, scale, ld_a, ld_b);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test comatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(comatcopy, colmajor_conj_col_100_row_100)
+{
+    /* Fortran interface, column major: conjugated copy of a 100 x 100 matrix. */
+    float scale[] = {1.0f, 2.0f};
+    blasint a_rows = 100, a_cols = 100;
+    blasint ld_a = 100;
+    blasint ld_b = 100;
+
+    float diff = check_comatcopy('F', 'C', 'R', a_rows, a_cols, scale, ld_a, ld_b);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test comatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and conjugate
+ * alpha_r = 2.0, alpha_i = 1.0
+ */
+CTEST(comatcopy, colmajor_conjtrnas_col_100_row_100)
+{
+    /* Fortran interface, column major: conjugate transpose of a 100 x 100 matrix. */
+    float scale[] = {2.0f, 1.0f};
+    blasint a_rows = 100, a_cols = 100;
+    blasint ld_a = 100;
+    blasint ld_b = 100;
+
+    float diff = check_comatcopy('F', 'C', 'C', a_rows, a_cols, scale, ld_a, ld_b);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test comatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * alpha_r = 1.5, alpha_i = -1.0
+ */
+CTEST(comatcopy, rowmajor_notrans_col_50_row_100)
+{
+    /* Fortran interface, row major: plain copy of a 100 x 50 matrix. */
+    float scale[] = {1.5f, -1.0f};
+    blasint a_rows = 100, a_cols = 50;
+    blasint ld_a = 50;
+    blasint ld_b = 50;
+
+    float diff = check_comatcopy('F', 'R', 'N', a_rows, a_cols, scale, ld_a, ld_b);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test comatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * alpha_r = 1.5, alpha_i = -1.0
+ */
+CTEST(comatcopy, rowmajor_trans_col_100_row_50)
+{
+    /* Fortran interface, row major: transpose of a 50 x 100 matrix. */
+    float scale[] = {1.5f, -1.0f};
+    blasint a_rows = 50, a_cols = 100;
+    blasint ld_a = 100;
+    blasint ld_b = 50;
+
+    float diff = check_comatcopy('F', 'R', 'T', a_rows, a_cols, scale, ld_a, ld_b);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test comatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy and conjugate
+ * alpha_r = 1.5, alpha_i = -1.0
+ */
+CTEST(comatcopy, rowmajor_conj_col_100_row_100)
+{
+    /* Fortran interface, row major: conjugated copy of a 100 x 100 matrix. */
+    float scale[] = {1.5f, -1.0f};
+    blasint a_rows = 100, a_cols = 100;
+    blasint ld_a = 100;
+    blasint ld_b = 100;
+
+    float diff = check_comatcopy('F', 'R', 'R', a_rows, a_cols, scale, ld_a, ld_b);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test comatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(comatcopy, rowmajor_conjtrans_col_100_row_100)
+{
+    /* Fortran interface, row major: conjugate transpose of a 100 x 100 matrix. */
+    float scale[] = {1.0f, 2.0f};
+    blasint a_rows = 100, a_cols = 100;
+    blasint ld_a = 100;
+    blasint ld_b = 100;
+
+    float diff = check_comatcopy('F', 'R', 'C', a_rows, a_cols, scale, ld_a, ld_b);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test comatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(comatcopy, c_api_colmajor_notrans_col_50_row_100)
+{
+    /* C interface, column major: plain copy of a 100 x 50 matrix. */
+    float scale[] = {1.0f, 2.0f};
+    blasint a_rows = 100, a_cols = 50;
+    blasint ld_a = 100;
+    blasint ld_b = 100;
+
+    float diff = check_comatcopy('C', 'C', 'N', a_rows, a_cols, scale, ld_a, ld_b);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test comatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * alpha_r = -1.0, alpha_i = 2.0
+ */
+CTEST(comatcopy, c_api_colmajor_trans_col_50_row_100)
+{
+    /* C interface, column major: transpose of a 100 x 50 matrix. */
+    float scale[] = {-1.0f, 2.0f};
+    blasint a_rows = 100, a_cols = 50;
+    blasint ld_a = 100;
+    blasint ld_b = 50;
+
+    float diff = check_comatcopy('C', 'C', 'T', a_rows, a_cols, scale, ld_a, ld_b);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test comatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(comatcopy, c_api_colmajor_conj_col_100_row_100)
+{
+    /* C interface, column major: conjugated copy of a 100 x 100 matrix. */
+    float scale[] = {1.0f, 2.0f};
+    blasint a_rows = 100, a_cols = 100;
+    blasint ld_a = 100;
+    blasint ld_b = 100;
+
+    float diff = check_comatcopy('C', 'C', 'R', a_rows, a_cols, scale, ld_a, ld_b);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test comatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and conjugate
+ * alpha_r = 2.0, alpha_i = 1.0
+ */
+CTEST(comatcopy, c_api_colmajor_conjtrnas_col_100_row_100)
+{
+    /* C interface, column major: conjugate transpose of a 100 x 100 matrix. */
+    float scale[] = {2.0f, 1.0f};
+    blasint a_rows = 100, a_cols = 100;
+    blasint ld_a = 100;
+    blasint ld_b = 100;
+
+    float diff = check_comatcopy('C', 'C', 'C', a_rows, a_cols, scale, ld_a, ld_b);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test comatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * alpha_r = 1.5, alpha_i = -1.0
+ */
+CTEST(comatcopy, c_api_rowmajor_notrans_col_50_row_100)
+{
+    /* C interface, row major: plain copy of a 100 x 50 matrix. */
+    float scale[] = {1.5f, -1.0f};
+    blasint a_rows = 100, a_cols = 50;
+    blasint ld_a = 50;
+    blasint ld_b = 50;
+
+    float diff = check_comatcopy('C', 'R', 'N', a_rows, a_cols, scale, ld_a, ld_b);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test comatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * alpha_r = 1.5, alpha_i = -1.0
+ */
+CTEST(comatcopy, c_api_rowmajor_trans_col_100_row_50)
+{
+    /* C interface, row major: transpose of a 50 x 100 matrix. */
+    float scale[] = {1.5f, -1.0f};
+    blasint a_rows = 50, a_cols = 100;
+    blasint ld_a = 100;
+    blasint ld_b = 50;
+
+    float diff = check_comatcopy('C', 'R', 'T', a_rows, a_cols, scale, ld_a, ld_b);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test comatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy and conjugate
+ * alpha_r = 1.5, alpha_i = -1.0
+ */
+CTEST(comatcopy, c_api_rowmajor_conj_col_100_row_100)
+{
+    /* C interface, row major: conjugated copy of a 100 x 100 matrix. */
+    float scale[] = {1.5f, -1.0f};
+    blasint a_rows = 100, a_cols = 100;
+    blasint ld_a = 100;
+    blasint ld_b = 100;
+
+    float diff = check_comatcopy('C', 'R', 'R', a_rows, a_cols, scale, ld_a, ld_b);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test comatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(comatcopy, c_api_rowmajor_conjtrans_col_100_row_100)
+{
+    /* C interface, row major: conjugate transpose of a 100 x 100 matrix. */
+    float scale[] = {1.0f, 2.0f};
+    blasint a_rows = 100, a_cols = 100;
+    blasint ld_a = 100;
+    blasint ld_b = 100;
+
+    float diff = check_comatcopy('C', 'R', 'C', a_rows, a_cols, scale, ld_a, ld_b);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Test error function for an invalid param order.
+ * Must be column (C) or row major (R).
+ */
+CTEST(comatcopy, xerbla_invalid_order)
+{
+    /* 'O' is neither 'C' nor 'R', so parameter 1 (order) must be reported. */
+    int bad_param = 1;
+    blasint a_rows = 100, a_cols = 100;
+    blasint ld_a = 100, ld_b = 100;
+
+    int ok = check_badargs('O', 'T', a_rows, a_cols, ld_a, ld_b, bad_param);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param trans.
+ * Must be trans (T/C) or no-trans (N/R).
+ */
+CTEST(comatcopy, xerbla_invalid_trans)
+{
+    /* 'O' is not one of T/C/N/R, so parameter 2 (trans) must be reported. */
+    int bad_param = 2;
+    blasint a_rows = 100, a_cols = 100;
+    blasint ld_a = 100, ld_b = 100;
+
+    int ok = check_badargs('C', 'O', a_rows, a_cols, ld_a, ld_b, bad_param);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param lda.
+ * If matrices are stored using row major layout, 
+ * lda must be at least n.
+ */
+CTEST(comatcopy, xerbla_rowmajor_invalid_lda)
+{
+    /* Row major with lda (50) below n (100): parameter 7 must be reported. */
+    int bad_param = 7;
+    blasint a_rows = 50, a_cols = 100;
+    blasint ld_a = 50, ld_b = 100;
+
+    int ok = check_badargs('R', 'T', a_rows, a_cols, ld_a, ld_b, bad_param);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param lda.
+ * If matrices are stored using column major layout,
+ * lda must be at least m.
+ */
+CTEST(comatcopy, xerbla_colmajor_invalid_lda)
+{
+    /* Column major with lda (50) below m (100): parameter 7 must be reported. */
+    int bad_param = 7;
+    blasint a_rows = 100, a_cols = 50;
+    blasint ld_a = 50, ld_b = 100;
+
+    int ok = check_badargs('C', 'T', a_rows, a_cols, ld_a, ld_b, bad_param);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param ldb.
+ * If matrices are stored using row major layout and 
+ * there is no transposition, ldb must be at least n.
+ */
+CTEST(comatcopy, xerbla_rowmajor_notrans_invalid_ldb)
+{
+    /* Row major, no transposition, ldb (50) below n (100): parameter 9. */
+    int bad_param = 9;
+    blasint a_rows = 50, a_cols = 100;
+    blasint ld_a = 100, ld_b = 50;
+
+    int ok = check_badargs('R', 'N', a_rows, a_cols, ld_a, ld_b, bad_param);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param ldb.
+ * If matrices are stored using row major layout and 
+ * there is transposition, ldb must be at least m.
+ */
+CTEST(comatcopy, xerbla_rowmajor_trans_invalid_ldb)
+{
+    /* Row major, transposition, ldb (50) below m (100): parameter 9. */
+    int bad_param = 9;
+    blasint a_rows = 100, a_cols = 50;
+    blasint ld_a = 100, ld_b = 50;
+
+    int ok = check_badargs('R', 'T', a_rows, a_cols, ld_a, ld_b, bad_param);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param ldb.
+ * If matrices are stored using row major layout and 
+ * there is no transposition, ldb must be at least n.
+ */
+CTEST(comatcopy, xerbla_rowmajor_conj_invalid_ldb)
+{
+    /* Row major, conjugation only, ldb (50) below n (100): parameter 9. */
+    int bad_param = 9;
+    blasint a_rows = 50, a_cols = 100;
+    blasint ld_a = 100, ld_b = 50;
+
+    int ok = check_badargs('R', 'R', a_rows, a_cols, ld_a, ld_b, bad_param);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param ldb.
+ * If matrices are stored using row major layout and 
+ * there is transposition, ldb must be at least m.
+ */
+CTEST(comatcopy, xerbla_rowmajor_transconj_invalid_ldb)
+{
+    /* Row major, conjugate transposition, ldb (50) below m (100): parameter 9. */
+    int bad_param = 9;
+    blasint a_rows = 100, a_cols = 50;
+    blasint ld_a = 100, ld_b = 50;
+
+    int ok = check_badargs('R', 'C', a_rows, a_cols, ld_a, ld_b, bad_param);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param ldb.
+ * If matrices are stored using column major layout and 
+ * there is no transposition, ldb must be at least m.
+ */
+CTEST(comatcopy, xerbla_colmajor_notrans_invalid_ldb)
+{
+    /* Column major, no transposition, ldb (50) below m (100): parameter 9. */
+    int bad_param = 9;
+    blasint a_rows = 100, a_cols = 50;
+    blasint ld_a = 100, ld_b = 50;
+
+    int ok = check_badargs('C', 'N', a_rows, a_cols, ld_a, ld_b, bad_param);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param ldb.
+ * If matrices are stored using column major layout and 
+ * there is transposition, ldb must be at least n.
+ */
+CTEST(comatcopy, xerbla_colmajor_trans_invalid_ldb)
+{
+    /* Column major, transposition, ldb (50) below n (100): parameter 9. */
+    int bad_param = 9;
+    blasint a_rows = 50, a_cols = 100;
+    blasint ld_a = 100, ld_b = 50;
+
+    int ok = check_badargs('C', 'T', a_rows, a_cols, ld_a, ld_b, bad_param);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param ldb.
+ * If matrices are stored using column major layout and 
+ * there is no transposition, ldb must be at least m.
+ */
+CTEST(comatcopy, xerbla_colmajor_conj_invalid_ldb)
+{
+    /* Column major, conjugation only, ldb (50) below m (100): parameter 9. */
+    int bad_param = 9;
+    blasint a_rows = 100, a_cols = 50;
+    blasint ld_a = 100, ld_b = 50;
+
+    int ok = check_badargs('C', 'R', a_rows, a_cols, ld_a, ld_b, bad_param);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param ldb.
+ * If matrices are stored using column major layout and 
+ * there is transposition, ldb must be at least n.
+ */
+CTEST(comatcopy, xerbla_colmajor_transconj_invalid_ldb)
+{
+    /* Column major, conjugate transposition, ldb (50) below n (100): parameter 9. */
+    int bad_param = 9;
+    blasint a_rows = 50, a_cols = 100;
+    blasint ld_a = 100, ld_b = 50;
+
+    int ok = check_badargs('C', 'C', a_rows, a_cols, ld_a, ld_b, bad_param);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+#endif
diff --git a/utest/test_extensions/test_crot.c b/utest/test_extensions/test_crot.c
new file mode 100644
index 000000000..1c55216d9
--- /dev/null
+++ b/utest/test_extensions/test_crot.c
@@ -0,0 +1,792 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+
+#define DATASIZE 100
+#define INCREMENT 2
+
+struct DATA_CROT {
+    float x_test[DATASIZE * INCREMENT * 2];   /* vector x passed to csrot; later holds its difference from x_verify */
+    float y_test[DATASIZE * INCREMENT * 2];   /* vector y passed to csrot; later holds its difference from y_verify */
+    float x_verify[DATASIZE * INCREMENT * 2]; /* reference x computed with caxpby */
+    float y_verify[DATASIZE * INCREMENT * 2]; /* reference y computed with caxpby */
+};
+
+#ifdef BUILD_COMPLEX
+static struct DATA_CROT data_crot;
+
+/**
+ * Fortran API specific function
+ * Compare results computed by csrot and caxpby 
+ * 
+ * param n specifies size of vector x
+ * param inc_x specifies increment of vector x
+ * param inc_y specifies increment of vector y
+ * param c specifies cosine
+ * param s specifies sine
+ * return norm of differences 
+ */
+static float check_csrot(blasint n, blasint inc_x, blasint inc_y, float *c, float *s)
+{
+    /* -conjg(s), the scalar applied to x when forming the reference y */
+    float minus_conj_s[] = {-s[0], s[1]};
+    blasint stride_x = labs(inc_x);
+    blasint stride_y = labs(inc_y);
+    /* number of floats spanned by each strided complex vector */
+    blasint x_len = n * stride_x * 2;
+    blasint y_len = n * stride_y * 2;
+    float total = 0;
+    blasint k;
+
+    /* Random input data for x and y */
+    srand_generate(data_crot.x_test, x_len);
+    srand_generate(data_crot.y_test, y_len);
+
+    if (inc_x == 0 && inc_y == 0) {
+        srand_generate(data_crot.x_test, n * 2);
+        srand_generate(data_crot.y_test, n * 2);
+    }
+
+    /* Duplicate x and y; the copies are overwritten by the caxpby reference */
+    for (k = 0; k < x_len; k++)
+        data_crot.x_verify[k] = data_crot.x_test[k];
+    for (k = 0; k < y_len; k++)
+        data_crot.y_verify[k] = data_crot.y_test[k];
+
+    /* Reference: x_verify = c*x + s*y */
+    BLASFUNC(caxpby)(&n, s, data_crot.y_test, &inc_y, c, data_crot.x_verify, &inc_x);
+
+    /* Reference: y_verify = -conjg(s)*x + c*y */
+    BLASFUNC(caxpby)(&n, minus_conj_s, data_crot.x_test, &inc_x, c, data_crot.y_verify, &inc_y);
+
+    /* Routine under test, applied to x_test and y_test */
+    BLASFUNC(csrot)(&n, data_crot.x_test, &inc_x, data_crot.y_test, &inc_y, c, s);
+
+    /* Turn x_test and y_test into element-wise differences from the reference */
+    for (k = 0; k < x_len; k++)
+        data_crot.x_test[k] -= data_crot.x_verify[k];
+    for (k = 0; k < y_len; k++)
+        data_crot.y_test[k] -= data_crot.y_verify[k];
+
+    /* Average of the two difference norms */
+    total += BLASFUNC(scnrm2)(&n, data_crot.x_test, &stride_x);
+    total += BLASFUNC(scnrm2)(&n, data_crot.y_test, &stride_y);
+    return (total / 2);
+}
+
+/**
+ * C API specific function
+ * Compare results computed by csrot and caxpby 
+ * 
+ * param n specifies size of vector x
+ * param inc_x specifies increment of vector x
+ * param inc_y specifies increment of vector y
+ * param c specifies cosine
+ * param s specifies sine
+ * return norm of differences 
+ */
+static float c_api_check_csrot(blasint n, blasint inc_x, blasint inc_y, float *c, float *s)
+{
+    /* -conjg(s), the scalar applied to x when forming the reference y */
+    float minus_conj_s[] = {-s[0], s[1]};
+    blasint stride_x = labs(inc_x);
+    blasint stride_y = labs(inc_y);
+    /* number of floats spanned by each strided complex vector */
+    blasint x_len = n * stride_x * 2;
+    blasint y_len = n * stride_y * 2;
+    float total = 0;
+    blasint k;
+
+    /* Random input data for x and y */
+    srand_generate(data_crot.x_test, x_len);
+    srand_generate(data_crot.y_test, y_len);
+
+    if (inc_x == 0 && inc_y == 0) {
+        srand_generate(data_crot.x_test, n * 2);
+        srand_generate(data_crot.y_test, n * 2);
+    }
+
+    /* Duplicate x and y; the copies are overwritten by the caxpby reference */
+    for (k = 0; k < x_len; k++)
+        data_crot.x_verify[k] = data_crot.x_test[k];
+    for (k = 0; k < y_len; k++)
+        data_crot.y_verify[k] = data_crot.y_test[k];
+
+    /* Reference: x_verify = c*x + s*y */
+    cblas_caxpby(n, s, data_crot.y_test, inc_y, c, data_crot.x_verify, inc_x);
+
+    /* Reference: y_verify = -conjg(s)*x + c*y */
+    cblas_caxpby(n, minus_conj_s, data_crot.x_test, inc_x, c, data_crot.y_verify, inc_y);
+
+    /* Routine under test; the C interface takes c and s as real values */
+    cblas_csrot(n, data_crot.x_test, inc_x, data_crot.y_test, inc_y, c[0], s[0]);
+
+    /* Turn x_test and y_test into element-wise differences from the reference */
+    for (k = 0; k < x_len; k++)
+        data_crot.x_test[k] -= data_crot.x_verify[k];
+    for (k = 0; k < y_len; k++)
+        data_crot.y_test[k] -= data_crot.y_verify[k];
+
+    /* Average of the two difference norms */
+    total += cblas_scnrm2(n, data_crot.x_test, stride_x);
+    total += cblas_scnrm2(n, data_crot.y_test, stride_y);
+    return (total / 2);
+}
+
+/**
+ * Fortran API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 0
+ * Stride of vector y is 0
+ * c = 1.0f
+ * s = 2.0f
+ */
+CTEST(crot, inc_x_0_inc_y_0)
+{
+    /* c and s carry a zero imaginary part because the reference caxpby takes complex scalars */
+    float cosine[] = {1.0f, 0.0f};
+    float sine[] = {2.0f, 0.0f};
+
+    const blasint len = 100;
+    const blasint stride_x = 0;
+    const blasint stride_y = 0;
+
+    float err = check_csrot(len, stride_x, stride_y, cosine, sine);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, err, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * c = 1.0f
+ * s = 1.0f
+ */
+CTEST(crot, inc_x_1_inc_y_1)
+{
+    /* c and s carry a zero imaginary part because the reference caxpby takes complex scalars */
+    float cosine[] = {1.0f, 0.0f};
+    float sine[] = {1.0f, 0.0f};
+
+    const blasint len = 100;
+    const blasint stride_x = 1;
+    const blasint stride_y = 1;
+
+    float err = check_csrot(len, stride_x, stride_y, cosine, sine);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, err, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -1
+ * Stride of vector y is -1
+ * c = 1.0f
+ * s = 1.0f
+ */
+CTEST(crot, inc_x_neg_1_inc_y_neg_1)
+{
+    /* c and s carry a zero imaginary part because the reference caxpby takes complex scalars */
+    float cosine[] = {1.0f, 0.0f};
+    float sine[] = {1.0f, 0.0f};
+
+    const blasint len = 100;
+    const blasint stride_x = -1;
+    const blasint stride_y = -1;
+
+    float err = check_csrot(len, stride_x, stride_y, cosine, sine);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, err, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 1
+ * c = 3.0f
+ * s = 2.0f
+ */
+CTEST(crot, inc_x_2_inc_y_1)
+{
+    /* c and s carry a zero imaginary part because the reference caxpby takes complex scalars */
+    float cosine[] = {3.0f, 0.0f};
+    float sine[] = {2.0f, 0.0f};
+
+    const blasint len = 100;
+    const blasint stride_x = 2;
+    const blasint stride_y = 1;
+
+    float err = check_csrot(len, stride_x, stride_y, cosine, sine);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, err, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -2
+ * Stride of vector y is 1
+ * c = 1.0f
+ * s = 1.0f
+ */
+CTEST(crot, inc_x_neg_2_inc_y_1)
+{
+    /* c and s carry a zero imaginary part because the reference caxpby takes complex scalars */
+    float cosine[] = {1.0f, 0.0f};
+    float sine[] = {1.0f, 0.0f};
+
+    const blasint len = 100;
+    const blasint stride_x = -2;
+    const blasint stride_y = 1;
+
+    float err = check_csrot(len, stride_x, stride_y, cosine, sine);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, err, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ * c = 1.0f
+ * s = 1.0f
+ */
+CTEST(crot, inc_x_1_inc_y_2)
+{
+    /* c and s carry a zero imaginary part because the reference caxpby takes complex scalars */
+    float cosine[] = {1.0f, 0.0f};
+    float sine[] = {1.0f, 0.0f};
+
+    const blasint len = 100;
+    const blasint stride_x = 1;
+    const blasint stride_y = 2;
+
+    float err = check_csrot(len, stride_x, stride_y, cosine, sine);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, err, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is -2
+ * c = 2.0f
+ * s = 1.0f
+ */
+CTEST(crot, inc_x_1_inc_y_neg_2)
+{
+    /* c and s carry a zero imaginary part because the reference caxpby takes complex scalars */
+    float cosine[] = {2.0f, 0.0f};
+    float sine[] = {1.0f, 0.0f};
+
+    const blasint len = 100;
+    const blasint stride_x = 1;
+    const blasint stride_y = -2;
+
+    float err = check_csrot(len, stride_x, stride_y, cosine, sine);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, err, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ * c = 1.0f
+ * s = 2.0f
+ */
+CTEST(crot, inc_x_2_inc_y_2)
+{
+    /* c and s carry a zero imaginary part because the reference caxpby takes complex scalars */
+    float cosine[] = {1.0f, 0.0f};
+    float sine[] = {2.0f, 0.0f};
+
+    const blasint len = 100;
+    const blasint stride_x = 2;
+    const blasint stride_y = 2;
+
+    float err = check_csrot(len, stride_x, stride_y, cosine, sine);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, err, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ * c = 1.0f
+ * s = 1.0f
+ */
+CTEST(crot, inc_x_neg_2_inc_y_neg_2)
+{
+    /* c and s carry a zero imaginary part because the reference caxpby takes complex scalars */
+    float cosine[] = {1.0f, 0.0f};
+    float sine[] = {1.0f, 0.0f};
+
+    const blasint len = 100;
+    const blasint stride_x = -2;
+    const blasint stride_y = -2;
+
+    float err = check_csrot(len, stride_x, stride_y, cosine, sine);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, err, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -2
+ * Stride of vector y is -2
+ * c = 0.0f
+ * s = 1.0f
+ */
+CTEST(crot, inc_x_2_inc_y_2_c_zero)
+{
+    /* c and s carry a zero imaginary part because the reference caxpby takes complex scalars */
+    float cosine[] = {0.0f, 0.0f};
+    float sine[] = {1.0f, 0.0f};
+
+    const blasint len = 100;
+    const blasint stride_x = 2;
+    const blasint stride_y = 2;
+
+    float err = check_csrot(len, stride_x, stride_y, cosine, sine);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, err, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ * c = 1.0f
+ * s = 0.0f
+ */
+CTEST(crot, inc_x_2_inc_y_2_s_zero)
+{
+    /* c and s carry a zero imaginary part because the reference caxpby takes complex scalars */
+    float cosine[] = {1.0f, 0.0f};
+    float sine[] = {0.0f, 0.0f};
+
+    const blasint len = 100;
+    const blasint stride_x = 2;
+    const blasint stride_y = 2;
+
+    float err = check_csrot(len, stride_x, stride_y, cosine, sine);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, err, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 0
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * c = 1.0f
+ * s = 1.0f
+ */
+CTEST(crot, check_n_zero)
+{
+    blasint n = 0;
+    
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    // Imaginary  part for caxpby
+    float c[] = {1.0f, 0.0f};
+    float s[] = {1.0f, 0.0f};
+
+    float norm = check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test 
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 0
+ * Stride of vector y is 0
+ * c = 3.0f
+ * s = 2.0f
+ */
+CTEST(crot, c_api_inc_x_0_inc_y_0)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 0;
+    blasint inc_y = 0;
+
+    // Imaginary  part for caxpby
+    float c[] = {3.0f, 0.0f};
+    float s[] = {2.0f, 0.0f};
+
+    float norm = c_api_check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * c = 1.0f
+ * s = 1.0f
+ */
+CTEST(crot, c_api_inc_x_1_inc_y_1)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    // Imaginary  part for caxpby
+    float c[] = {1.0f, 0.0f};
+    float s[] = {1.0f, 0.0f};
+
+    float norm = c_api_check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -1
+ * Stride of vector y is -1
+ * c = 1.0f
+ * s = 1.0f
+ */
+CTEST(crot, c_api_inc_x_neg_1_inc_y_neg_1)
+{
+    blasint n = 100;
+    
+    blasint inc_x = -1;
+    blasint inc_y = -1;
+
+    // Imaginary  part for caxpby
+    float c[] = {1.0f, 0.0f};
+    float s[] = {1.0f, 0.0f};
+
+    float norm = c_api_check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 1
+ * c = 3.0f
+ * s = 2.0f
+ */
+CTEST(crot, c_api_inc_x_2_inc_y_1)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 2;
+    blasint inc_y = 1;
+
+    // Imaginary  part for caxpby
+    float c[] = {3.0f, 0.0f};
+    float s[] = {2.0f, 0.0f};
+
+    float norm = c_api_check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -2
+ * Stride of vector y is 1
+ * c = 1.0f
+ * s = 1.0f
+ */
+CTEST(crot, c_api_inc_x_neg_2_inc_y_1)
+{
+    blasint n = 100;
+    
+    blasint inc_x = -2;
+    blasint inc_y = 1;
+
+    // Imaginary  part for caxpby
+    float c[] = {1.0f, 0.0f};
+    float s[] = {1.0f, 0.0f};
+
+    float norm = c_api_check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ * c = 1.0f
+ * s = 1.0f
+ */
+CTEST(crot, c_api_inc_x_1_inc_y_2)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 1;
+    blasint inc_y = 2;
+
+    // Imaginary  part for caxpby
+    float c[] = {1.0f, 0.0f};
+    float s[] = {1.0f, 0.0f};
+
+    float norm = c_api_check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is -2
+ * c = 2.0f
+ * s = 1.0f
+ */
+CTEST(crot, c_api_inc_x_1_inc_y_neg_2)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 1;
+    blasint inc_y = -2;
+
+    // Imaginary  part for caxpby
+    float c[] = {2.0f, 0.0f};
+    float s[] = {1.0f, 0.0f};
+
+    float norm = c_api_check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ * c = 1.0f
+ * s = 2.0f
+ */
+CTEST(crot, c_api_inc_x_2_inc_y_2)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 2;
+    blasint inc_y = 2;
+
+    // Imaginary  part for caxpby
+    float c[] = {1.0f, 0.0f};
+    float s[] = {2.0f, 0.0f};
+
+    float norm = c_api_check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -2
+ * Stride of vector y is -2
+ * c = 1.0f
+ * s = 1.0f
+ */
+CTEST(crot, c_api_inc_x_neg_2_inc_y_neg_2)
+{
+    blasint n = 100;
+    
+    blasint inc_x = -2;
+    blasint inc_y = -2;
+
+    // Imaginary  part for caxpby
+    float c[] = {1.0f, 0.0f};
+    float s[] = {1.0f, 0.0f};
+
+    float norm = c_api_check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ * c = 0.0f
+ * s = 1.0f
+ */
+CTEST(crot, c_api_inc_x_2_inc_y_2_c_zero)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 2;
+    blasint inc_y = 2;
+
+    // Imaginary  part for caxpby
+    float c[] = {0.0f, 0.0f};
+    float s[] = {1.0f, 0.0f};
+
+    float norm = c_api_check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ * c = 1.0f
+ * s = 0.0f
+ */
+CTEST(crot, c_api_inc_x_2_inc_y_2_s_zero)
+{
+    blasint n = 100;
+    
+    blasint inc_x = 2;
+    blasint inc_y = 2;
+
+    // Imaginary  part for caxpby
+    float c[] = {1.0f, 0.0f};
+    float s[] = {0.0f, 0.0f};
+
+    float norm = c_api_check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crot by comparing it with caxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 0
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * c = 1.0f
+ * s = 1.0f
+ */
+CTEST(crot, c_api_check_n_zero)
+{
+    blasint n = 0;
+    
+    blasint inc_x = 1;
+    blasint inc_y = 1;
+
+    // Imaginary  part for caxpby
+    float c[] = {1.0f, 0.0f};
+    float s[] = {1.0f, 0.0f};
+
+    float norm = c_api_check_csrot(n, inc_x, inc_y, c, s);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_crotg.c b/utest/test_extensions/test_crotg.c
new file mode 100644
index 000000000..84875ccf7
--- /dev/null
+++ b/utest/test_extensions/test_crotg.c
@@ -0,0 +1,290 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#ifdef BUILD_COMPLEX
+
+/**
+ * Fortran API specific test
+ * Test crotg by comparing it against pre-calculated values
+ */
+CTEST(crotg, zero_a)
+{
+    float sa[2] = {0.0f, 0.0f};
+    float sb[2] = {1.0f, 1.0f};
+    float ss[2];
+    float sc;
+    BLASFUNC(crotg)(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.0f, sc, SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.70711f, ss[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-0.70711f, ss[1], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.41421f, sa[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0f, sa[1], SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test crotg by comparing it against pre-calculated values
+ */
+CTEST(crotg, zero_b)
+{
+    float sa[2] = {1.0f, 1.0f};
+    float sb[2] = {0.0f, 0.0f};
+    float ss[2];
+    float sc;
+    BLASFUNC(crotg)(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(1.0f, sc, SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0f, ss[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0f, ss[1], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.0f, sa[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.0f, sa[1], SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test crotg by comparing it against pre-calculated values
+ */
+CTEST(crotg, zero_real)
+{
+    float sa[2] = {0.0f, 1.0f};
+    float sb[2] = {0.0f, 1.0f};
+    float ss[2];
+    float sc;
+    BLASFUNC(crotg)(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.70711f, sc, SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.70711f, ss[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0f, ss[1], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0f, sa[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.41421f, sa[1], SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test crotg by comparing it against pre-calculated values
+ */
+CTEST(crotg, positive_real_positive_img)
+{
+    float sa[2] = {3.0f, 4.0f};
+    float sb[2] = {4.0f, 6.0f};
+    float ss[2];
+    float sc;
+    BLASFUNC(crotg)(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.5698f, sc, SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.82052f, ss[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-0.04558f, ss[1], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(5.26498f, sa[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(7.01997f, sa[1], SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test crotg by comparing it against pre-calculated values
+ */
+CTEST(crotg, negative_real_positive_img)
+{
+    float sa[2] = {-3.0f, 4.0f};
+    float sb[2] = {-4.0f, 6.0f};
+    float ss[2];
+    float sc;
+    BLASFUNC(crotg)(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.5698f, sc, SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.82052f, ss[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.04558f, ss[1], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-5.26498f, sa[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(7.01997f, sa[1], SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test crotg by comparing it against pre-calculated values
+ */
+CTEST(crotg, positive_real_negative_img)
+{
+    float sa[2] = {3.0f, -4.0f};
+    float sb[2] = {4.0f, -6.0f};
+    float ss[2];
+    float sc;
+    BLASFUNC(crotg)(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.5698f, sc, SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.82052f, ss[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.04558f, ss[1], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(5.26498f, sa[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-7.01997f, sa[1], SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test crotg by comparing it against pre-calculated values
+ */
+CTEST(crotg, negative_real_negative_img)
+{
+    float sa[2] = {-3.0f, -4.0f};
+    float sb[2] = {-4.0f, -6.0f};
+    float ss[2];
+    float sc;
+    BLASFUNC(crotg)(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.5698f, sc, SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.82052f, ss[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-0.04558f, ss[1], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-5.26498f, sa[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-7.01997f, sa[1], SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crotg by comparing it against pre-calculated values
+ */
+CTEST(crotg, c_api_zero_a)
+{
+    float sa[2] = {0.0f, 0.0f};
+    float sb[2] = {1.0f, 1.0f};
+    float ss[2];
+    float sc;
+    cblas_crotg(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.0f, sc, SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.70711f, ss[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-0.70711f, ss[1], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.41421f, sa[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0f, sa[1], SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crotg by comparing it against pre-calculated values
+ */
+CTEST(crotg, c_api_zero_b)
+{
+    float sa[2] = {1.0f, 1.0f};
+    float sb[2] = {0.0f, 0.0f};
+    float ss[2];
+    float sc;
+    cblas_crotg(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(1.0f, sc, SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0f, ss[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0f, ss[1], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.0f, sa[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.0f, sa[1], SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crotg by comparing it against pre-calculated values
+ */
+CTEST(crotg, c_api_zero_real)
+{
+    float sa[2] = {0.0f, 1.0f};
+    float sb[2] = {0.0f, 1.0f};
+    float ss[2];
+    float sc;
+    cblas_crotg(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.70711f, sc, SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.70711f, ss[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0f, ss[1], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0f, sa[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.41421f, sa[1], SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crotg by comparing it against pre-calculated values
+ */
+CTEST(crotg, c_api_positive_real_positive_img)
+{
+    float sa[2] = {3.0f, 4.0f};
+    float sb[2] = {4.0f, 6.0f};
+    float ss[2];
+    float sc;
+    cblas_crotg(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.5698f, sc, SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.82052f, ss[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-0.04558f, ss[1], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(5.26498f, sa[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(7.01997f, sa[1], SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crotg by comparing it against pre-calculated values
+ */
+CTEST(crotg, c_api_negative_real_positive_img)
+{
+    float sa[2] = {-3.0f, 4.0f};
+    float sb[2] = {-4.0f, 6.0f};
+    float ss[2];
+    float sc;
+    cblas_crotg(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.5698f, sc, SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.82052f, ss[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.04558f, ss[1], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-5.26498f, sa[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(7.01997f, sa[1], SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crotg by comparing it against pre-calculated values
+ */
+CTEST(crotg, c_api_positive_real_negative_img)
+{
+    float sa[2] = {3.0f, -4.0f};
+    float sb[2] = {4.0f, -6.0f};
+    float ss[2];
+    float sc;
+    cblas_crotg(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.5698f, sc, SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.82052f, ss[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.04558f, ss[1], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(5.26498f, sa[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-7.01997f, sa[1], SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test crotg by comparing it against pre-calculated values
+ */
+CTEST(crotg, c_api_negative_real_negative_img)
+{
+    float sa[2] = {-3.0f, -4.0f};
+    float sb[2] = {-4.0f, -6.0f};
+    float ss[2];
+    float sc;
+    cblas_crotg(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.5698f, sc, SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.82052f, ss[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-0.04558f, ss[1], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-5.26498f, sa[0], SINGLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-7.01997f, sa[1], SINGLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_csbmv.c b/utest/test_extensions/test_csbmv.c
new file mode 100644
index 000000000..41c24a2b7
--- /dev/null
+++ b/utest/test_extensions/test_csbmv.c
@@ -0,0 +1,606 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+#define INCREMENT 2
+
+struct DATA_CSBMV {
+    float sp_matrix[DATASIZE * (DATASIZE + 1)];  // source values consumed pairwise by get_symmetric_band_matr
+    float sb_matrix[DATASIZE * DATASIZE * 2];    // full-storage band matrix, 2 floats per complex entry; passed to cgemv
+    float b_test[DATASIZE * 2 * INCREMENT];      // input vector shared by the cgemv and csbmv calls
+    float c_test[DATASIZE * 2 * INCREMENT];      // in/out vector for csbmv; later holds the difference to c_verify
+    float c_verify[DATASIZE * 2 * INCREMENT];    // copy of c_test updated by cgemv as the reference result
+};
+
+// SINGLE_EPS_ZGEMV = MAX_VAL * NUMBER OF OPERATIONS * FLT_EPSILON
+// SINGLE_EPS_ZGEMV = 5.0 * O(100 * 100) * 1.19e-07 ~ 5e-03
+#define SINGLE_EPS_ZGEMV 5e-03
+
+#ifdef BUILD_COMPLEX
+static struct DATA_CSBMV data_csbmv;
+
+/** 
+ * Transform full-storage symmetric band matrix A to upper (U) or lower (L)
+ * band-packed storage mode.
+ * 
+ * param uplo specifies whether matrix a is upper or lower band-packed.
+ * param n - number of rows and columns of A
+ * param k - number of super-diagonals of A
+ * output param a - buffer for holding symmetric band-packed matrix
+ * param lda - specifies the leading dimension of a
+ * param sb_matrix - buffer holding full-storage symmetric band matrix A 
+ * param ldm - specifies the leading dimension of A
+ */
+static void transform_to_band_storage(char uplo, blasint n, blasint k, float* a, blasint lda,
+                                     float* sb_matrix, blasint ldm) 
+{
+    blasint i, j, m;
+    if (uplo == 'L') {  // lower band storage: the diagonal of column j lands in band row 0
+        for (j = 0; j < n; j++)
+        {
+            m = -j;  // row shift: full-storage row r of column j maps to band row r - j
+            for (i = 2 * j; i < MIN(2 * n, 2 * (j + k + 1)); i += 2)  // float offsets of rows j..min(n-1, j+k)
+            {
+                a[(2*m + i) + j * lda * 2] = sb_matrix[i + j * ldm * 2];  // first float of the pair (real part)
+                a[(2*m + (i + 1)) + j * lda * 2] = sb_matrix[(i + 1) + j * ldm * 2];  // second float (imaginary part)
+            }
+        }
+    }
+    else {  // any other uplo value: upper band storage, the diagonal of column j lands in band row k
+        for (j = 0; j < n; j++)
+        {   
+            m = k - j;  // row shift: full-storage row r of column j maps to band row r + k - j
+            for (i = MAX(0, 2*(j - k)); i <= j*2; i += 2)  // float offsets of rows max(0, j-k)..j
+            {
+                a[(2*m + i) + j * lda * 2] = sb_matrix[i + j * ldm * 2];  // first float of the pair (real part)
+                a[(2*m + (i + 1)) + j * lda * 2] = sb_matrix[(i + 1) + j * ldm * 2];  // second float (imaginary part)
+            }
+        }
+    }
+}
+
+/** 
+ * Generate full-storage symmetric band matrix A with k - super-diagonals
+ * from input symmetric packed matrix in lower packed mode (L)
+ * 
+ * output param sb_matrix - buffer for holding full-storage symmetric band matrix.
+ * param sp_matrix - buffer holding input symmetric packed matrix
+ * param n - number of rows and columns of A
+ * param k - number of super-diagonals of A
+ */
+static void get_symmetric_band_matr(float *sb_matrix, float *sp_matrix, blasint n, blasint k)
+{
+    blasint m;  // read cursor into sp_matrix
+    blasint i, j;  // i: outer index (stride n * 2 floats); j: float offset inside it, stepping by 2
+    m = 0;
+    for (i = 0; i < n; i++)
+    {
+        for (j = 0; j < n * 2; j += 2)
+        {
+            // Zero every element lying more than k diagonals away from the main diagonal
+            if (fabs((i+1) - ceil((j+1)/2.0f)) > k) 
+            {
+                sb_matrix[i * n * 2 + j] = 0.0f;
+                sb_matrix[i * n * 2 + j + 1] = 0.0f;
+                continue;
+            }
+
+            if (j / 2 < i)  // mirror the transposed element, already written in an earlier i iteration
+            {
+                sb_matrix[i * n * 2 + j] = 
+                        sb_matrix[j * n + i * 2];
+                sb_matrix[i * n * 2 + j + 1] = 
+                        sb_matrix[j * n + i * 2 + 1];
+            }
+            else  // on or past the diagonal: consume the next two floats from sp_matrix
+            {
+                sb_matrix[i * n * 2 + j] = sp_matrix[m++];
+                sb_matrix[i * n * 2 + j + 1] = sp_matrix[m++];
+            }
+        }
+    }
+}
+
+/** 
+ * Check if error function was called with expected function name
+ * and param info
+ * 
+ * param uplo specifies whether matrix a is upper or lower band-packed.
+ * param n - number of rows and columns of A
+ * param k - number of super-diagonals of A
+ * param lda - specifies the leading dimension of a
+ * param inc_b - stride of vector b_test
+ * param inc_c - stride of vector c_test
+ * param expected_info - expected invalid parameter number in csbmv
+ * return TRUE if everything is ok, otherwise FALSE 
+ */
+static int check_badargs(char uplo, blasint n, blasint k, blasint lda, blasint inc_b,
+                          blasint inc_c, int expected_info)
+{
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {0.0f, 0.0f};
+
+    float a[2];  // two floats only: the call below is expected to be rejected
+    srand_generate(a, 2);  // presumably fills a with random values - TODO confirm against the helper
+
+    set_xerbla("CSBMV ", expected_info);  // presumably records the routine name and info value to expect
+
+    BLASFUNC(csbmv)(&uplo, &n, &k, alpha, a, &lda, data_csbmv.b_test, 
+                    &inc_b, beta, data_csbmv.c_test, &inc_c);
+
+    return check_error();  // result of the expectation set above is reported to the caller
+}
+
+/**
+ * Compare results computed by csbmv and cgemv 
+ * since csbmv is cgemv for symmetric band matrix
+ * 
+ * param uplo specifies whether matrix A is upper or lower triangular
+ * param n - number of rows and columns of A
+ * param k - number of super-diagonals of A
+ * param alpha - scaling factor for the matrix-vector product
+ * param lda - specifies the leading dimension of a
+ * param inc_b - stride of vector b_test
+ * param beta - scaling factor for vector c_test
+ * param inc_c - stride of vector c_test
+ * param ldm - specifies the leading dimension of the full-storage matrix A
+ * return norm of differences 
+ */
+static float check_csbmv(char uplo, blasint n, blasint k, float *alpha, blasint lda, 
+    blasint inc_b, float *beta, blasint inc_c, blasint ldm)
+{
+    blasint i;
+
+    // Trans param for cgemv ('N' and 'T' are equivalent here since the input matrix is symmetric)
+    char trans = 'N';
+
+    // Band-storage buffer for csbmv: lda * n complex entries, 2 floats each
+    float *a = (float*) malloc(lda * n * 2 * sizeof(float));  // NOTE(review): result is not NULL-checked
+
+    // Fill symmetric packed matrix sp_matrix, vector b_test, vector c_test 
+    srand_generate(data_csbmv.sp_matrix, n * (n + 1));
+    srand_generate(data_csbmv.b_test, n * inc_b * 2);
+    srand_generate(data_csbmv.c_test, n * inc_c * 2);
+
+    // Copy vector c_test so cgemv starts from the same values as csbmv
+    for (i = 0; i < n * inc_c * 2; i++)
+        data_csbmv.c_verify[i] = data_csbmv.c_test[i];
+
+    // Generate full-storage symmetric band matrix
+    // with k super-diagonals from symmetric packed matrix
+    get_symmetric_band_matr(data_csbmv.sb_matrix, data_csbmv.sp_matrix, n, k);
+
+    // Transform symmetric band matrix from conventional
+    // full matrix storage to band storage for csbmv
+    transform_to_band_storage(uplo, n, k, a, lda, data_csbmv.sb_matrix, ldm);
+
+    BLASFUNC(cgemv)(&trans, &n, &n, alpha, data_csbmv.sb_matrix, &ldm, data_csbmv.b_test,
+                    &inc_b, beta, data_csbmv.c_verify, &inc_c);
+
+    BLASFUNC(csbmv)(&uplo, &n, &k, alpha, a, &lda,
+                    data_csbmv.b_test, &inc_b, beta, data_csbmv.c_test, &inc_c);
+
+    // Find the differences between the output vectors calculated by csbmv and cgemv
+    for (i = 0; i < n * inc_c * 2; i++)
+        data_csbmv.c_test[i] -= data_csbmv.c_verify[i];
+    free(a);
+    // Find the norm of differences
+    return BLASFUNC(scnrm2)(&n, data_csbmv.c_test, &inc_c);
+}
+
+/**
+ * Test csbmv by comparing it against cgemv
+ * with the following options:
+ * 
+ * a is upper-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 1
+ * Stride of vector c_test is 1
+ * Number of super-diagonals k is 0 
+ */
+CTEST(csbmv, upper_k_0_inc_b_1_inc_c_1_n_100)
+{
+    blasint n = DATASIZE, inc_b = 1, inc_c = 1;
+    blasint k = 0;
+    blasint lda = k + 1;
+    blasint ldm = n;
+    char uplo = 'U';
+
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * Test csbmv by comparing it against cgemv
+ * with the following options:
+ * 
+ * a is upper-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 1
+ * Stride of vector c_test is 1
+ * Number of super-diagonals k is 1
+ */
+CTEST(csbmv, upper_k_1_inc_b_1_inc_c_1_n_100)
+{
+    blasint n = DATASIZE, inc_b = 1, inc_c = 1;
+    blasint k = 1;
+    blasint lda = k + 1;
+    blasint ldm = n;
+    char uplo = 'U';
+
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * Test csbmv by comparing it against cgemv
+ * with the following options:
+ * 
+ * a is upper-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 1
+ * Stride of vector c_test is 1
+ * Number of super-diagonals k is 2
+ */
+CTEST(csbmv, upper_k_2_inc_b_1_inc_c_1_n_100)
+{
+    blasint n = DATASIZE, inc_b = 1, inc_c = 1;
+    blasint k = 2;
+    blasint lda = k + 1;
+    blasint ldm = n;
+    char uplo = 'U';
+
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * Test csbmv by comparing it against cgemv
+ * with the following options:
+ * 
+ * a is upper-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 2
+ * Stride of vector c_test is 1
+ * Number of super-diagonals k is 2
+ */
+CTEST(csbmv, upper_k_2_inc_b_2_inc_c_1_n_100)
+{
+    blasint n = DATASIZE, inc_b = 2, inc_c = 1;
+    blasint k = 2;
+    blasint lda = k + 1;
+    blasint ldm = n;
+    char uplo = 'U';
+
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+
+    float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * Test csbmv by comparing it against cgemv
+ * with the following options:
+ * 
+ * a is upper-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 2
+ * Stride of vector c_test is 2
+ * Number of super-diagonals k is 2
+ */
+CTEST(csbmv, upper_k_2_inc_b_2_inc_c_2_n_100)
+{
+    blasint n = DATASIZE, inc_b = 2, inc_c = 2;
+    blasint k = 2;
+    blasint lda = k + 1;
+    blasint ldm = n;
+    char uplo = 'U';
+
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+
+    float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * Test csbmv by comparing it against cgemv
+ * with the following options:
+ * 
+ * a is lower-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 1
+ * Stride of vector c_test is 1
+ * Number of super-diagonals k is 0
+ */
+CTEST(csbmv, lower_k_0_inc_b_1_inc_c_1_n_100)
+{
+    blasint n = DATASIZE, inc_b = 1, inc_c = 1;
+    blasint k = 0;
+    blasint lda = k + 1;
+    blasint ldm = n;
+    char uplo = 'L';
+
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * Test csbmv by comparing it against cgemv
+ * with the following options:
+ * 
+ * a is lower-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 1
+ * Stride of vector c_test is 1
+ * Number of super-diagonals k is 1
+ */
+CTEST(csbmv, lower_k_1_inc_b_1_inc_c_1_n_100)
+{
+    blasint n = DATASIZE, inc_b = 1, inc_c = 1;
+    blasint k = 1;
+    blasint lda = k + 1;
+    blasint ldm = n;
+    char uplo = 'L';
+
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * Test csbmv by comparing it against cgemv
+ * with the following options:
+ * 
+ * a is lower-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 1
+ * Stride of vector c_test is 1
+ * Number of super-diagonals k is 2
+ */
+CTEST(csbmv, lower_k_2_inc_b_1_inc_c_1_n_100)
+{
+    blasint n = DATASIZE, inc_b = 1, inc_c = 1;
+    blasint k = 2;
+    blasint lda = k + 1;
+    blasint ldm = n;
+    char uplo = 'L';
+
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+
+    float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * Test csbmv by comparing it against cgemv
+ * with the following options:
+ * 
+ * a is lower-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 2
+ * Stride of vector c_test is 1
+ * Number of super-diagonals k is 2
+ */
+CTEST(csbmv, lower_k_2_inc_b_2_inc_c_1_n_100)
+{
+    blasint n = DATASIZE, inc_b = 2, inc_c = 1;
+    blasint k = 2;
+    blasint lda = k + 1;
+    blasint ldm = n;
+    char uplo = 'L';
+
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+
+    float norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * Test csbmv by comparing it against cgemv
+ * with the following options:
+ * 
+ * a is lower-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 2
+ * Stride of vector c_test is 2
+ * Number of super-diagonals k is 2
+ */
+CTEST(csbmv, lower_k_2_inc_b_2_inc_c_2_n_100)
+{
+    const char uplo = 'L';
+    const blasint n = DATASIZE, k = 2;
+    const blasint inc_b = 2, inc_c = 2;
+    const blasint lda = k + 1, ldm = n;
+    float alpha[] = {2.0f, 1.0f};
+    float beta[] = {2.0f, 1.0f};
+    float diff_norm;
+
+    /* The norm returned by check_csbmv must be zero within tolerance */
+    diff_norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS_ZGEMV);
+}
+
+/** 
+ * Check that the norm returned by check_csbmv is not NaN
+ */
+CTEST(csbmv, check_for_NaN)
+{
+    const char uplo = 'U';
+    const blasint n = DATASIZE, k = 0;
+    const blasint inc_b = 1, inc_c = 1;
+    const blasint lda = k + 1, ldm = n;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {1.0f, 1.0f};
+    float diff_norm;
+
+    diff_norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+
+    /* Only NaN compares unequal to itself, so this fails exactly when the norm is NaN */
+    ASSERT_TRUE(diff_norm == diff_norm);
+}
+
+/**
+ * Test error function for an invalid param uplo.
+ * Uplo specifies whether a is in upper (U) or lower (L) band-packed storage mode.
+ */
+CTEST(csbmv, xerbla_uplo_invalid)
+{
+    const char uplo = 'O'; /* the invalid argument under test */
+    const blasint n = 1, k = 0, lda = k + 1;
+    const blasint inc_b = 1, inc_c = 1;
+    const int expected_info = 1;
+    int passed;
+
+    passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/** 
+ * Test error function for an invalid param N -
+ * number of rows and columns of A. Must be at least zero.
+ */
+CTEST(csbmv, xerbla_n_invalid)
+{
+    const char uplo = 'U';
+    const blasint n = INVALID, k = 0, lda = k + 1; /* n is the invalid argument */
+    const blasint inc_b = 1, inc_c = 1;
+    const int expected_info = 2;
+    int passed;
+
+    passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Check if n - number of rows and columns of A equal zero.
+ */
+CTEST(csbmv, check_n_zero)
+{
+    const char uplo = 'U';
+    const blasint n = 0, k = 0;
+    const blasint inc_b = 1, inc_c = 1;
+    const blasint lda = k + 1, ldm = 1;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {0.0f, 0.0f};
+    float diff_norm;
+
+    /* With n == 0 the returned norm is still expected to be zero */
+    diff_norm = check_csbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS_ZGEMV);
+}
+
+/**
+ * Test error function for an invalid param inc_b -
+ * stride of vector b_test. Can't be zero. 
+ */
+CTEST(csbmv, xerbla_inc_b_zero)
+{
+    const char uplo = 'U';
+    const blasint n = 1, k = 0, lda = k + 1;
+    const blasint inc_b = 0, inc_c = 1; /* inc_b is the invalid argument */
+    const int expected_info = 8;
+    int passed;
+
+    passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param inc_c -
+ * stride of vector c_test. Can't be zero. 
+ */
+CTEST(csbmv, xerbla_inc_c_zero)
+{
+    const char uplo = 'U';
+    const blasint n = 1, k = 0, lda = k + 1;
+    const blasint inc_b = 1, inc_c = 0; /* inc_c is the invalid argument */
+    const int expected_info = 11;
+    int passed;
+
+    passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param k -
+ * number of super-diagonals of A. Must be at least zero.
+ */
+CTEST(csbmv, xerbla_k_invalid)
+{
+    const char uplo = 'U';
+    const blasint n = 1, k = INVALID, lda = 1; /* k is the invalid argument */
+    const blasint inc_b = 1, inc_c = 1;
+    const int expected_info = 3;
+    int passed;
+
+    passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param lda -
+ * specifies the leading dimension of a. Must be at least (k+1).
+ */
+CTEST(csbmv, xerbla_lda_invalid)
+{
+    const char uplo = 'U';
+    const blasint n = 1, k = 0, lda = INVALID; /* lda is the invalid argument */
+    const blasint inc_b = 1, inc_c = 1;
+    const int expected_info = 6;
+    int passed;
+
+    passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+#endif
diff --git a/utest/test_extensions/test_cscal.c b/utest/test_extensions/test_cscal.c
new file mode 100644
index 000000000..009c600ad
--- /dev/null
+++ b/utest/test_extensions/test_cscal.c
@@ -0,0 +1,164 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+#include "common.h"
+
+#define DATASIZE 100
+#define INCREMENT 2
+
+struct DATA_CSCAL { // buffers used by the cscal tests, sized for DATASIZE complex elements at stride INCREMENT
+    float x_test[DATASIZE * 2 * INCREMENT]; // vector x handed to the cscal routine under test
+    float x_verify[DATASIZE * 2 * INCREMENT]; // copy of x scaled by cscal_trusted, later holds the difference
+};
+
+#ifdef BUILD_COMPLEX
+static struct DATA_CSCAL data_cscal;
+
+/**
+ * cscal reference code
+ *
+ * param n - number of elements of vector x
+ * param alpha - scaling factor for the vector product
+ * param x - buffer holding input vector x
+ * param inc - stride of vector x
+ */
+static void cscal_trusted(blasint n, float *alpha, float* x, blasint inc){
+    const blasint step = 2 * inc; /* floats between consecutive complex elements */
+    blasint count, re_idx = 0;
+    for (count = 0; count < n; count++, re_idx += step)
+    {
+        const blasint im_idx = re_idx + 1;
+        /* Complex product x * alpha, computed from the old x values */
+        const float re = alpha[0] * x[re_idx] - alpha[1] * x[im_idx];
+        x[im_idx] = alpha[0] * x[im_idx] + alpha[1] * x[re_idx];
+        x[re_idx] = re;
+    }
+}
+
+/**
+ * Compare results computed by cscal and cscal_trusted
+ *
+ * param api specifies tested api (C or Fortran)
+ * param n - number of elements of vector x
+ * param alpha - scaling factor for the vector product
+ * param inc - stride of vector x
+ * return norm of differences
+ */
+static float check_cscal(char api, blasint n, float *alpha, blasint inc)
+{
+    const blasint len = n * 2 * inc; // number of floats spanned by vector x
+    float *x_out = data_cscal.x_test;
+    float *x_ref = data_cscal.x_verify;
+    blasint pos;
+
+    // Generate the input vector x and duplicate it for the reference code
+    srand_generate(x_out, len);
+    for (pos = 0; pos < len; pos++)
+        x_ref[pos] = x_out[pos];
+
+    cscal_trusted(n, alpha, x_ref, inc);
+
+    // Run the tested routine through the requested interface ('F' = Fortran, otherwise C)
+    if (api != 'F')
+        cblas_cscal(n, alpha, x_out, inc);
+    else
+        BLASFUNC(cscal)(&n, alpha, x_out, &inc);
+
+    // Leave (reference - tested) in x_ref and return its norm
+    for (pos = 0; pos < len; pos++)
+        x_ref[pos] -= x_out[pos];
+    return BLASFUNC(scnrm2)(&n, x_ref, &inc);
+}
+
+/**
+ * Fortran API specific test
+ * Test cscal by comparing it against reference
+ */
+CTEST(cscal, alpha_r_zero_alpha_i_not_zero)
+{
+    const blasint N = DATASIZE, inc = 1;
+    float alpha[2] = {0.0f, 1.0f}; /* alpha = 0 + 1i */
+    float diff_norm;
+
+    /* 'F' makes check_cscal call the Fortran interface */
+    diff_norm = check_cscal('F', N, alpha, inc);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test cscal by comparing it against reference
+ */
+CTEST(cscal, alpha_r_zero_alpha_i_zero_inc_2)
+{
+    const blasint N = DATASIZE, inc = 2;
+    float alpha[2] = {0.0f, 0.0f}; /* alpha = 0 + 0i */
+    float diff_norm;
+
+    /* 'F' makes check_cscal call the Fortran interface */
+    diff_norm = check_cscal('F', N, alpha, inc);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cscal by comparing it against reference
+ */
+CTEST(cscal, c_api_alpha_r_zero_alpha_i_not_zero)
+{
+    const blasint N = DATASIZE, inc = 1;
+    float alpha[2] = {0.0f, 1.0f}; /* alpha = 0 + 1i */
+    float diff_norm;
+
+    /* Any api value other than 'F' makes check_cscal call cblas_cscal */
+    diff_norm = check_cscal('C', N, alpha, inc);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cscal by comparing it against reference
+ */
+CTEST(cscal, c_api_alpha_r_zero_alpha_i_zero_inc_2)
+{
+    const blasint N = DATASIZE, inc = 2;
+    float alpha[2] = {0.0f, 0.0f}; /* alpha = 0 + 0i */
+    float diff_norm;
+
+    /* Any api value other than 'F' makes check_cscal call cblas_cscal */
+    diff_norm = check_cscal('C', N, alpha, inc);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_cspmv.c b/utest/test_extensions/test_cspmv.c
new file mode 100644
index 000000000..b64c90e3a
--- /dev/null
+++ b/utest/test_extensions/test_cspmv.c
@@ -0,0 +1,428 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+#define INCREMENT 2
+
+struct DATA_CSPMV { // buffers used by the cspmv tests
+    float a_verify[DATASIZE * DATASIZE * 2]; // full n x n complex matrix unpacked from a_test for cgemv
+    float a_test[DATASIZE * (DATASIZE + 1)]; // packed symmetric matrix: n * (n + 1) floats
+    float b_test[DATASIZE * 2 * INCREMENT]; // input vector b
+    float c_test[DATASIZE * 2 * INCREMENT]; // vector c updated by cspmv, later holds the difference
+    float c_verify[DATASIZE * 2 * INCREMENT]; // copy of vector c updated by the cgemv reference
+};
+
+#ifdef BUILD_COMPLEX
+static struct DATA_CSPMV data_cspmv;
+
+/**
+ * Compute spmv via gemv since spmv is gemv for symmetric packed matrix.
+ * The packed matrix is first expanded to a full matrix in data_cspmv.a_verify.
+ *
+ * param uplo specifies whether matrix A is upper or lower triangular
+ * param n - number of rows and columns of A
+ * param alpha - scaling factor for the matrix-vector product
+ * param a - buffer holding input packed matrix A
+ * param b - buffer holding input vector b
+ * param inc_b - stride of vector b
+ * param beta - scaling factor for vector c
+ * param c - buffer holding input/output vector c; receives the gemv result
+ * param inc_c - stride of vector c
+ */
+static void cspmv_trusted(char uplo, blasint n, float *alpha, float *a,
+                          float *b, blasint inc_b, float *beta, float *c,
+                          blasint inc_c)
+{
+    blasint k;
+    blasint i, j;
+
+    // param for gemv (can use any, since the input matrix is symmetric)
+    char trans = 'N';
+
+    // Unpack the input symmetric packed matrix
+    if (uplo == 'L')
+    {
+        k = 0; // read the packed input forwards from its first float
+        for (i = 0; i < n; i++)
+        {
+            for (j = 0; j < n * 2; j += 2)
+            {
+                if (j / 2 < i)
+                {
+                    // mirror the element already stored at the swapped index pair
+                    data_cspmv.a_verify[i * n * 2 + j] =
+                        data_cspmv.a_verify[j * n + i * 2];
+                    data_cspmv.a_verify[i * n * 2 + j + 1] =
+                        data_cspmv.a_verify[j * n + i * 2 + 1];
+                }
+                else
+                {
+                    data_cspmv.a_verify[i * n * 2 + j] = a[k++];
+                    data_cspmv.a_verify[i * n * 2 + j + 1] = a[k++];
+                }
+            }
+        }
+    }
+    else
+    {
+        k = n * (n + 1) - 1; // read the packed input backwards from its last float
+        for (j = 2 * n - 1; j >= 0; j -= 2)
+        {
+            for (i = n - 1; i >= 0; i--)
+            {
+                if (j / 2 < i)
+                {
+                    // mirror the element already stored at the swapped index pair
+                    data_cspmv.a_verify[i * n * 2 + j] =
+                        data_cspmv.a_verify[(j - 1) * n + i * 2 + 1];
+                    data_cspmv.a_verify[i * n * 2 + j - 1] =
+                        data_cspmv.a_verify[(j - 1) * n + i * 2];
+                }
+                else
+                {
+                    data_cspmv.a_verify[i * n * 2 + j] = a[k--];
+                    data_cspmv.a_verify[i * n * 2 + j - 1] = a[k--];
+                }
+            }
+        }
+    }
+
+    // Run gemv with the unpacked matrix, updating the caller-supplied vector c
+    BLASFUNC(cgemv)(&trans, &n, &n, alpha, data_cspmv.a_verify, &n, b,
+                    &inc_b, beta, c, &inc_c);
+}
+
+/**
+ * Compare results computed by cspmv and cspmv_trusted
+ *
+ * param uplo specifies whether matrix A is upper or lower triangular
+ * param n - number of rows and columns of A
+ * param alpha - scaling factor for the matrix-vector product
+ * param inc_b - stride of vector b
+ * param beta - scaling factor for vector c
+ * param inc_c - stride of vector c
+ * return norm of differences
+ */
+static float check_cspmv(char uplo, blasint n, float *alpha, blasint inc_b,
+                         float *beta, blasint inc_c)
+{
+    const blasint c_len = n * 2 * inc_c; // floats spanned by vector c
+    float *c_out = data_cspmv.c_test;
+    float *c_ref = data_cspmv.c_verify;
+    blasint pos;
+
+    // Random packed symmetric matrix a and random vectors b and c
+    srand_generate(data_cspmv.a_test, n * (n + 1));
+    srand_generate(data_cspmv.b_test, 2 * n * inc_b);
+    srand_generate(c_out, c_len);
+
+    // The reference computation starts from the same vector c
+    for (pos = 0; pos < c_len; pos++)
+        c_ref[pos] = c_out[pos];
+
+    cspmv_trusted(uplo, n, alpha, data_cspmv.a_test, data_cspmv.b_test,
+                  inc_b, beta, c_ref, inc_c);
+    BLASFUNC(cspmv)(&uplo, &n, alpha, data_cspmv.a_test, data_cspmv.b_test,
+                    &inc_b, beta, c_out, &inc_c);
+
+    // Leave (tested - reference) in c_out and return its norm
+    for (pos = 0; pos < c_len; pos++)
+        c_out[pos] -= c_ref[pos];
+    return BLASFUNC(scnrm2)(&n, c_out, &inc_c);
+}
+
+/**
+ * Check if error function was called with expected function name
+ * and param info
+ *
+ * param uplo specifies whether matrix A is upper or lower triangular
+ * param n - number of rows and columns of A
+ * param inc_b - stride of vector b
+ * param inc_c - stride of vector c
+ * param expected_info - expected invalid parameter number in cspmv
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char uplo, blasint n, blasint inc_b,
+                         blasint inc_c, int expected_info)
+{
+    float scale_ab[2] = {1.0, 1.0};
+    float scale_c[2] = {0.0, 0.0};
+    float *packed = data_cspmv.a_test;
+
+    // Set the expected routine name and info, call cspmv, then report the check result
+    set_xerbla("CSPMV ", expected_info);
+    BLASFUNC(cspmv)(&uplo, &n, scale_ab, packed, data_cspmv.b_test, &inc_b,
+                    scale_c, data_cspmv.c_test, &inc_c);
+    return check_error();
+}
+
+/**
+ * Test cspmv by comparing it against cgemv
+ * with the following options:
+ *
+ * A is upper triangular
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 1
+ * Stride of vector c is 1
+ */
+CTEST(cspmv, upper_inc_b_1_inc_c_1_N_100)
+{
+    const char uplo = 'U';
+    const blasint N = DATASIZE, inc_b = 1, inc_c = 1;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {0.0f, 0.0f};
+    float diff_norm;
+
+    diff_norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_TOL);
+}
+
+/**
+ * Test cspmv by comparing it against cgemv
+ * with the following options:
+ *
+ * A is upper triangular
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 1
+ * Stride of vector c is 2
+ */
+CTEST(cspmv, upper_inc_b_1_inc_c_2_N_100)
+{
+    const char uplo = 'U';
+    const blasint N = DATASIZE, inc_b = 1, inc_c = 2;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {0.0f, 0.0f};
+    float diff_norm;
+
+    diff_norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_TOL);
+}
+
+/**
+ * Test cspmv by comparing it against cgemv
+ * with the following options:
+ *
+ * A is upper triangular
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 2
+ * Stride of vector c is 1
+ */
+CTEST(cspmv, upper_inc_b_2_inc_c_1_N_100)
+{
+    const char uplo = 'U';
+    const blasint N = DATASIZE, inc_b = 2, inc_c = 1;
+    float alpha[] = {1.0f, 0.0f};
+    float beta[] = {1.0f, 0.0f};
+    float diff_norm;
+
+    diff_norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_TOL);
+}
+
+/**
+ * Test cspmv by comparing it against cgemv
+ * with the following options:
+ *
+ * A is upper triangular
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 2
+ * Stride of vector c is 2
+ */
+CTEST(cspmv, upper_inc_b_2_inc_c_2_N_100)
+{
+    const char uplo = 'U';
+    const blasint N = DATASIZE, inc_b = 2, inc_c = 2;
+    float alpha[] = {2.5, -2.1};
+    float beta[] = {0.0f, 1.0f};
+    float diff_norm;
+
+    diff_norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_TOL);
+}
+
+/**
+ * Test cspmv by comparing it against cgemv
+ * with the following options:
+ *
+ * A is lower triangular
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 1
+ * Stride of vector c is 1
+ */
+CTEST(cspmv, lower_inc_b_1_inc_c_1_N_100)
+{
+    const char uplo = 'L';
+    const blasint N = DATASIZE, inc_b = 1, inc_c = 1;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {0.0f, 0.0f};
+    float diff_norm;
+
+    diff_norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_TOL);
+}
+
+/**
+ * Test cspmv by comparing it against cgemv
+ * with the following options:
+ *
+ * A is lower triangular
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 1
+ * Stride of vector c is 2
+ */
+CTEST(cspmv, lower_inc_b_1_inc_c_2_N_100)
+{
+    const char uplo = 'L';
+    const blasint N = DATASIZE, inc_b = 1, inc_c = 2;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {0.0f, 0.0f};
+    float diff_norm;
+
+    diff_norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_TOL);
+}
+
+/**
+ * Test cspmv by comparing it against cgemv
+ * with the following options:
+ *
+ * A is lower triangular
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 2
+ * Stride of vector c is 1
+ */
+CTEST(cspmv, lower_inc_b_2_inc_c_1_N_100)
+{
+    const char uplo = 'L';
+    const blasint N = DATASIZE, inc_b = 2, inc_c = 1;
+    float alpha[] = {1.0f, 0.0f};
+    float beta[] = {1.0f, 0.0f};
+    float diff_norm;
+
+    diff_norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_TOL);
+}
+
+/**
+ * Test cspmv by comparing it against cgemv
+ * with the following options:
+ *
+ * A is lower triangular
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 2
+ * Stride of vector c is 2
+ */
+CTEST(cspmv, lower_inc_b_2_inc_c_2_N_100)
+{
+    const char uplo = 'L';
+    const blasint N = DATASIZE, inc_b = 2, inc_c = 2;
+    float alpha[] = {2.5, -2.1};
+    float beta[] = {0.0f, 1.0f};
+    float diff_norm;
+
+    diff_norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_TOL);
+}
+
+/**
+ * Check that the norm returned by check_cspmv is not NaN
+ */
+CTEST(cspmv, check_for_NaN)
+{
+    const char uplo = 'U';
+    const blasint N = DATASIZE, inc_b = 1, inc_c = 1;
+    float alpha[] = {1.0f, 1.0f};
+    float beta[] = {0.0f, 0.0f};
+    float diff_norm = check_cspmv(uplo, N, alpha, inc_b, beta, inc_c);
+
+    /* Only NaN compares unequal to itself, so this fails exactly when the norm is NaN */
+    ASSERT_TRUE(diff_norm == diff_norm);
+}
+
+/**
+ * Test error function for an invalid param uplo.
+ * uplo specifies whether A is upper or lower triangular.
+ */
+CTEST(cspmv, xerbla_uplo_invalid)
+{
+    const char uplo = 'O'; /* the invalid argument under test */
+    const blasint N = DATASIZE, inc_b = 1, inc_c = 1;
+    int passed;
+
+    passed = check_badargs(uplo, N, inc_b, inc_c, 1 /* expected info */);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param N -
+ * number of rows and columns of A. Must be at least zero.
+ */
+CTEST(cspmv, xerbla_N_invalid)
+{
+    const char uplo = 'U';
+    const blasint N = INVALID, inc_b = 1, inc_c = 1; /* N is the invalid argument */
+    int passed;
+
+    passed = check_badargs(uplo, N, inc_b, inc_c, 2 /* expected info */);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param inc_b -
+ * stride of vector b. Can't be zero.
+ */
+CTEST(cspmv, xerbla_inc_b_zero)
+{
+    const char uplo = 'U';
+    const blasint N = DATASIZE, inc_b = 0, inc_c = 1; /* inc_b is the invalid argument */
+    int passed;
+
+    passed = check_badargs(uplo, N, inc_b, inc_c, 6 /* expected info */);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param inc_c -
+ * stride of vector c. Can't be zero.
+ */
+CTEST(cspmv, xerbla_inc_c_zero)
+{
+    const char uplo = 'U';
+    const blasint N = DATASIZE, inc_b = 1, inc_c = 0; /* inc_c is the invalid argument */
+    int passed;
+
+    passed = check_badargs(uplo, N, inc_b, inc_c, 9 /* expected info */);
+    ASSERT_EQUAL(TRUE, passed);
+}
+#endif
diff --git a/utest/test_extensions/test_ctrmv.c b/utest/test_extensions/test_ctrmv.c
new file mode 100644
index 000000000..2a3f27416
--- /dev/null
+++ b/utest/test_extensions/test_ctrmv.c
@@ -0,0 +1,266 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+#include "common.h"
+
+#define DATASIZE 300
+#define INCREMENT 2
+
+struct DATA_CTRMV { // buffers used by the ctrmv tests
+	float a_test[DATASIZE * DATASIZE * 2]; // matrix A passed to ctrmv with the tested trans value
+	float a_verify[DATASIZE * DATASIZE * 2]; // copy of A, conjugated in place when trans is 'R'
+	float x_test[DATASIZE * INCREMENT * 2]; // vector x updated by the tested ctrmv call
+	float x_verify[DATASIZE * INCREMENT * 2]; // copy of x updated by the reference call, later holds the difference
+};
+
+#ifdef BUILD_COMPLEX
+static struct DATA_CTRMV data_ctrmv;
+
+/**
+ * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A
+ * and comparing it with the non-conjugate ctrmv.
+ *
+ * param uplo specifies whether A is upper or lower triangular
+ * param trans specifies op(A), the transposition (conjugation) operation applied to A
+ * param diag specifies whether the matrix A is unit triangular or not.
+ * param n - numbers of rows and columns of A
+ * param lda - leading dimension of matrix A
+ * param incx - increment for the elements of x
+ * return norm of difference
+ */
+static float check_ctrmv(char uplo, char trans, char diag, blasint n, blasint lda, blasint incx)
+{
+	const blasint a_len = n * lda * 2;
+	const blasint x_len = n * incx * 2;
+	float one[] = {1.0f, 0.0f};
+	char trans_ref = trans;
+	blasint pos;
+
+	srand_generate(data_ctrmv.a_test, a_len);
+	srand_generate(data_ctrmv.x_test, x_len);
+
+	for (pos = 0; pos < a_len; pos++)
+		data_ctrmv.a_verify[pos] = data_ctrmv.a_test[pos];
+	for (pos = 0; pos < x_len; pos++)
+		data_ctrmv.x_verify[pos] = data_ctrmv.x_test[pos];
+
+	/* For trans 'R' the reference conjugates its copy of A in place and then runs with trans 'N' */
+	if (trans == 'R'){
+		cblas_cimatcopy(CblasColMajor, CblasConjNoTrans, n, n, one, data_ctrmv.a_verify, lda, lda);
+		trans_ref = 'N';
+	}
+
+	BLASFUNC(ctrmv)(&uplo, &trans_ref, &diag, &n, data_ctrmv.a_verify, &lda,
+					data_ctrmv.x_verify, &incx);
+	BLASFUNC(ctrmv)(&uplo, &trans, &diag, &n, data_ctrmv.a_test, &lda,
+					data_ctrmv.x_test, &incx);
+
+	for (pos = 0; pos < x_len; pos++)
+		data_ctrmv.x_verify[pos] -= data_ctrmv.x_test[pos];
+	return BLASFUNC(scnrm2)(&n, data_ctrmv.x_verify, &incx);
+}
+
+/**
+ * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ctrmv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is upper triangular
+ * matrix A is not unit triangular
+ */
+CTEST(ctrmv, conj_notrans_upper_not_unit_triangular)
+{
+	const char uplo = 'U', diag = 'N', trans = 'R';
+	const blasint n = DATASIZE;
+	const blasint lda = DATASIZE, incx = 1;
+	float diff_norm;
+
+	/* trans 'R' selects the conjugating branch of check_ctrmv */
+	diff_norm = check_ctrmv(uplo, trans, diag, n, lda, incx);
+	ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ctrmv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is upper triangular
+ * matrix A is unit triangular
+ */
+CTEST(ctrmv, conj_notrans_upper_unit_triangular)
+{
+	const char uplo = 'U', diag = 'U', trans = 'R';
+	const blasint n = DATASIZE;
+	const blasint lda = DATASIZE, incx = 1;
+	float diff_norm;
+
+	/* trans 'R' selects the conjugating branch of check_ctrmv */
+	diff_norm = check_ctrmv(uplo, trans, diag, n, lda, incx);
+	ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ctrmv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is lower triangular
+ * matrix A is not unit triangular
+ */
+CTEST(ctrmv, conj_notrans_lower_not_triangular)
+{
+	const char uplo = 'L', diag = 'N', trans = 'R';
+	const blasint n = DATASIZE;
+	const blasint lda = DATASIZE, incx = 1;
+	float diff_norm;
+
+	/* trans 'R' selects the conjugating branch of check_ctrmv */
+	diff_norm = check_ctrmv(uplo, trans, diag, n, lda, incx);
+	ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ctrmv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is lower triangular
+ * matrix A is unit triangular
+ */
+CTEST(ctrmv, conj_notrans_lower_unit_triangular)
+{
+	const char uplo = 'L', diag = 'U', trans = 'R';
+	const blasint n = DATASIZE;
+	const blasint lda = DATASIZE, incx = 1;
+	float diff_norm;
+
+	/* trans 'R' selects the conjugating branch of check_ctrmv */
+	diff_norm = check_ctrmv(uplo, trans, diag, n, lda, incx);
+	ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ctrmv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is upper triangular
+ * matrix A is not unit triangular
+ * vector x stride is 2
+ */
+CTEST(ctrmv, conj_notrans_upper_not_unit_triangular_incx_2)
+{
+	const char uplo = 'U', diag = 'N', trans = 'R';
+	const blasint n = DATASIZE;
+	const blasint lda = DATASIZE, incx = 2;
+	float diff_norm;
+
+	/* trans 'R' selects the conjugating branch of check_ctrmv */
+	diff_norm = check_ctrmv(uplo, trans, diag, n, lda, incx);
+	ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ctrmv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is upper triangular
+ * matrix A is unit triangular
+ * vector x stride is 2
+ */
+CTEST(ctrmv, conj_notrans_upper_unit_triangular_incx_2)
+{
+	const char uplo = 'U', diag = 'U', trans = 'R';
+	const blasint n = DATASIZE;
+	const blasint lda = DATASIZE, incx = 2;
+	float diff_norm;
+
+	/* trans 'R' selects the conjugating branch of check_ctrmv */
+	diff_norm = check_ctrmv(uplo, trans, diag, n, lda, incx);
+	ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ctrmv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is lower triangular
+ * matrix A is not unit triangular
+ * vector x stride is 2
+ */
+CTEST(ctrmv, conj_notrans_lower_not_triangular_incx_2)
+{
+	const char uplo = 'L', diag = 'N', trans = 'R';
+	const blasint n = DATASIZE;
+	const blasint lda = DATASIZE, incx = 2;
+	float diff_norm;
+
+	/* trans 'R' selects the conjugating branch of check_ctrmv */
+	diff_norm = check_ctrmv(uplo, trans, diag, n, lda, incx);
+	ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * Test ctrmv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ctrmv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is lower triangular
+ * matrix A is unit triangular
+ * vector x stride is 2
+ */
+CTEST(ctrmv, conj_notrans_lower_unit_triangular_incx_2)
+{
+	const char uplo = 'L', diag = 'U', trans = 'R';
+	const blasint n = DATASIZE;
+	const blasint lda = DATASIZE, incx = 2;
+	float diff_norm;
+
+	/* trans 'R' selects the conjugating branch of check_ctrmv */
+	diff_norm = check_ctrmv(uplo, trans, diag, n, lda, incx);
+	ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_ctrsv.c b/utest/test_extensions/test_ctrsv.c
new file mode 100644
index 000000000..0e639bb2a
--- /dev/null
+++ b/utest/test_extensions/test_ctrsv.c
@@ -0,0 +1,267 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+#include "common.h"
+
+#define DATASIZE 300
+#define INCREMENT 2
+
+struct DATA_CTRSV {	/* static work buffers shared by every ctrsv test in this file */
+	float a_test[DATASIZE * DATASIZE * 2];	/* matrix A given to the ctrsv call under test */
+	float a_verify[DATASIZE * DATASIZE * 2];	/* copy of A (conjugated for 'R') for the reference call */
+	float x_test[DATASIZE * INCREMENT * 2];	/* vector x for the call under test; overwritten by it */
+	float x_verify[DATASIZE * INCREMENT * 2];	/* vector x for the reference call; later holds the difference */
+};
+
+#ifdef BUILD_COMPLEX
+static struct DATA_CTRSV data_ctrsv;
+
+/**
+ * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A
+ * and comparing it with the non-conjugate ctrsv.
+ *
+ * param uplo specifies whether A is upper or lower triangular
+ * param trans specifies op(A), the transposition (conjugation) operation applied to A
+ * param diag specifies whether the matrix A is unit triangular or not.
+ * param n - number of rows and columns of A
+ * param lda - leading dimension of matrix A
+ * param incx - increment for the elements of x
+ * return norm of difference
+ */
+static float check_ctrsv(char uplo, char trans, char diag, blasint n, blasint lda, blasint incx)
+{
+	blasint k;
+	const blasint a_len = n * lda * 2; /* number of floats initialised in A */
+	const blasint x_len = n * incx * 2; /* number of floats initialised in x */
+	float one[] = {1.0f, 0.0f}; /* alpha passed to cimatcopy: 1 + 0i */
+	char trans_ref = (trans == 'R') ? 'N' : trans;
+
+	/* Random inputs, duplicated so both solves start from the same data */
+	srand_generate(data_ctrsv.a_test, a_len);
+	srand_generate(data_ctrsv.x_test, x_len);
+	for (k = 0; k < a_len; k++)
+		data_ctrsv.a_verify[k] = data_ctrsv.a_test[k];
+	for (k = 0; k < x_len; k++)
+		data_ctrsv.x_verify[k] = data_ctrsv.x_test[k];
+
+	/* For 'R' the reference conjugates A in place and then solves with 'N' */
+	if (trans == 'R')
+		cblas_cimatcopy(CblasColMajor, CblasConjNoTrans, n, n,
+						one, data_ctrsv.a_verify, lda, lda);
+
+	/* Reference solve on the (possibly conjugated) copy */
+	BLASFUNC(ctrsv)(&uplo, &trans_ref, &diag, &n, data_ctrsv.a_verify,
+					&lda, data_ctrsv.x_verify, &incx);
+	/* Solve under test, using the caller's trans unchanged */
+	BLASFUNC(ctrsv)(&uplo, &trans, &diag, &n, data_ctrsv.a_test, &lda,
+					data_ctrsv.x_test, &incx);
+
+	for (k = 0; k < x_len; k++)
+		data_ctrsv.x_verify[k] -= data_ctrsv.x_test[k];
+	return BLASFUNC(scnrm2)(&n, data_ctrsv.x_verify, &incx);
+}
+
+/**
+ * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ctrsv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is upper triangular
+ * matrix A is not unit triangular
+ */
+CTEST(ctrsv, conj_notrans_upper_not_unit_triangular)
+{
+	/* A is upper triangular, non-unit diagonal; 'R' = conjugate, no transpose */
+	const char uplo = 'U', diag = 'N', trans = 'R';
+	/* DATASIZE x DATASIZE problem, x accessed with stride 1 */
+	const blasint n = DATASIZE, lda = DATASIZE, incx = 1;
+
+	const float diff_norm = check_ctrsv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ctrsv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is upper triangular
+ * matrix A is unit triangular
+ */
+CTEST(ctrsv, conj_notrans_upper_unit_triangular)
+{
+	/* A is upper triangular with unit diagonal; 'R' = conjugate, no transpose */
+	const char uplo = 'U', diag = 'U', trans = 'R';
+	/* DATASIZE x DATASIZE problem, x accessed with stride 1 */
+	const blasint n = DATASIZE, lda = DATASIZE, incx = 1;
+
+	const float diff_norm = check_ctrsv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ctrsv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is lower triangular
+ * matrix A is not unit triangular
+ */
+CTEST(ctrsv, conj_notrans_lower_not_triangular)
+{
+	/* A is lower triangular, non-unit diagonal; 'R' = conjugate, no transpose */
+	const char uplo = 'L', diag = 'N', trans = 'R';
+	/* DATASIZE x DATASIZE problem, x accessed with stride 1 */
+	const blasint n = DATASIZE, lda = DATASIZE, incx = 1;
+
+	const float diff_norm = check_ctrsv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ctrsv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is lower triangular
+ * matrix A is unit triangular
+ */
+CTEST(ctrsv, conj_notrans_lower_unit_triangular)
+{
+	/* A is lower triangular with unit diagonal; 'R' = conjugate, no transpose */
+	const char uplo = 'L', diag = 'U', trans = 'R';
+	/* DATASIZE x DATASIZE problem, x accessed with stride 1 */
+	const blasint n = DATASIZE, lda = DATASIZE, incx = 1;
+
+	const float diff_norm = check_ctrsv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ctrsv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is upper triangular
+ * matrix A is not unit triangular
+ * vector x stride is 2
+ */
+CTEST(ctrsv, conj_notrans_upper_not_unit_triangular_incx_2)
+{
+	/* A is upper triangular, non-unit diagonal; 'R' = conjugate, no transpose */
+	const char uplo = 'U', diag = 'N', trans = 'R';
+	/* DATASIZE x DATASIZE problem, x accessed with stride 2 */
+	const blasint n = DATASIZE, lda = DATASIZE, incx = 2;
+
+	const float diff_norm = check_ctrsv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ctrsv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is upper triangular
+ * matrix A is unit triangular
+ * vector x stride is 2
+ */
+CTEST(ctrsv, conj_notrans_upper_unit_triangular_incx_2)
+{
+	/* A is upper triangular with unit diagonal; 'R' = conjugate, no transpose */
+	const char uplo = 'U', diag = 'U', trans = 'R';
+	/* DATASIZE x DATASIZE problem, x accessed with stride 2 */
+	const blasint n = DATASIZE, lda = DATASIZE, incx = 2;
+
+	const float diff_norm = check_ctrsv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ctrsv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is lower triangular
+ * matrix A is not unit triangular
+ * vector x stride is 2
+ */
+CTEST(ctrsv, conj_notrans_lower_not_triangular_incx_2)
+{
+	/* A is lower triangular, non-unit diagonal; 'R' = conjugate, no transpose */
+	const char uplo = 'L', diag = 'N', trans = 'R';
+	/* DATASIZE x DATASIZE problem, x accessed with stride 2 */
+	const blasint n = DATASIZE, lda = DATASIZE, incx = 2;
+
+	const float diff_norm = check_ctrsv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Test ctrsv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ctrsv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is lower triangular
+ * matrix A is unit triangular
+ * vector x stride is 2
+ */
+CTEST(ctrsv, conj_notrans_lower_unit_triangular_incx_2)
+{
+	/* A is lower triangular with unit diagonal; 'R' = conjugate, no transpose */
+	const char uplo = 'L', diag = 'U', trans = 'R';
+	/* DATASIZE x DATASIZE problem, x accessed with stride 2 */
+	const blasint n = DATASIZE, lda = DATASIZE, incx = 2;
+
+	const float diff_norm = check_ctrsv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, DOUBLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_damin.c b/utest/test_extensions/test_damin.c
new file mode 100644
index 000000000..736921fa3
--- /dev/null
+++ b/utest/test_extensions/test_damin.c
@@ -0,0 +1,354 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#define ELEMENTS 70
+#define INCREMENT 2
+
+#ifdef BUILD_DOUBLE
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, bad_args_N_0){
+   double x[ELEMENTS];
+   blasint N = 0, inc = 1;
+   blasint idx;
+   /* Every entry is non-zero, so a 0.0 result can only come from N = 0 */
+   for (idx = 0; idx < ELEMENTS * inc; ++idx)
+      x[idx] = 1000 - idx;
+   double result = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, result, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, step_zero){
+   double x[ELEMENTS];
+   blasint N = ELEMENTS, inc = 0, idx;
+   for (idx = 0; idx < N; ++idx)
+      x[idx] = idx - 1000;
+   /* Zero planted away from x[0]; the test expects |x[0]| when inc = 0 */
+   x[8] = 0.0;
+   double result = BLASFUNC(damin)(&N, x, &inc);
+   const double expected = fabs(x[0]);
+   ASSERT_DBL_NEAR_TOL(expected, result, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, positive_step_1_N_1){
+   /* Single positive element: the expected result is that element */
+   double x[] = {1.1};
+   blasint inc = 1, N = 1;
+   double result = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.1, result, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, negative_step_1_N_1){
+   /* Single negative element: the expected result is its magnitude */
+   double x[] = {-1.1};
+   blasint inc = 1, N = 1;
+   double result = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.1, result, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, positive_step_2_N_1){
+   /* Stride 2, N = 1: the trailing 0.0 is padding and must not be counted */
+   double x[] = {1.1, 0.0};
+   blasint inc = 2, N = 1;
+   double result = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.1, result, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, negative_step_2_N_1){
+   /* Stride 2, N = 1, negative value: expect its magnitude, not the padding */
+   double x[] = {-1.1, 0.0};
+   blasint inc = 2, N = 1;
+   double result = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.1, result, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, positive_step_1_N_2){
+   /* Two positive elements: the smaller one, 1.0, is expected */
+   double x[] = {1.1, 1.0};
+   blasint inc = 1, N = 2;
+   double result = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0, result, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, negative_step_1_N_2){
+   /* Mixed signs: magnitudes are 1.1 and 1.0, so 1.0 is expected */
+   double x[] = {-1.1, 1.0};
+   blasint inc = 1, N = 2;
+   double result = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0, result, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, positive_step_2_N_2){
+   /* Stride 2: the 0.0 entries between elements must not be counted */
+   double x[] = {1.1, 0.0, 1.0, 0.0};
+   blasint inc = 2, N = 2;
+   double result = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0, result, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, negative_step_2_N_2){
+   /* Stride 2, mixed signs: the 0.0 padding must not be counted */
+   double x[] = {-1.1, 0.0, 1.0, 0.0};
+   blasint inc = 2, N = 2;
+   double result = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0, result, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, positive_step_1_N_3){
+   /* Three positive elements with the smallest, 1.0, in the middle */
+   double x[] = {1.1, 1.0, 2.2};
+   blasint inc = 1, N = 3;
+   double result = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0, result, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, negative_step_1_N_3){
+   /* Mixed signs: magnitudes are 1.1, 1.0 and 2.2, so 1.0 is expected */
+   double x[] = {-1.1, 1.0, -2.2};
+   blasint inc = 1, N = 3;
+   double result = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0, result, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, positive_step_2_N_3){
+   /* Stride 2: elements 1.1, 1.0, 2.2 interleaved with 0.0 padding */
+   double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0};
+   blasint inc = 2, N = 3;
+   double result = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0, result, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, negative_step_2_N_3){
+   /* Stride 2, mixed signs: elements -1.1, 1.0, -2.2 with 0.0 padding */
+   double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0};
+   blasint inc = 2, N = 3;
+   double result = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0, result, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, positive_step_1_N_4){
+   /* Four positive elements; the smallest is 1.0 */
+   double x[] = {1.1, 1.0, 2.2, 3.3};
+   blasint inc = 1, N = 4;
+   double result = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0, result, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, negative_step_1_N_4){
+   /* Mixed signs; the smallest magnitude is 1.0 */
+   double x[] = {-1.1, 1.0, -2.2, -3.3};
+   blasint inc = 1, N = 4;
+   double result = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0, result, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, positive_step_2_N_4){
+   /* Stride 2: four positive elements interleaved with 0.0 padding */
+   double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0, 3.3, 0.0};
+   blasint inc = 2, N = 4;
+   double result = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0, result, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, negative_step_2_N_4){
+   /* Stride 2, mixed signs: four elements interleaved with 0.0 padding */
+   double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0, -3.3, 0.0};
+   blasint inc = 2, N = 4;
+   double result = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0, result, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, positive_step_1_N_5){
+   /* The last element is 0.0, the smallest magnitude in the vector */
+   double x[] = {1.1, 1.0, 2.2, 3.3, 0.0};
+   blasint inc = 1, N = 5;
+   double result = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, result, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, negative_step_1_N_5){
+   /* Mixed signs; the last element is 0.0, the smallest magnitude */
+   double x[] = {-1.1, 1.0, -2.2, -3.3, 0.0};
+   blasint inc = 1, N = 5;
+   double result = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, result, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, positive_step_2_N_5){
+   /* Stride 2: the fifth strided element (index 8) is 0.0 */
+   double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0, 3.3, 0.0, 0.0, 0.0};
+   blasint inc = 2, N = 5;
+   double result = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, result, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, negative_step_2_N_5){
+   /* Stride 2, mixed signs: the fifth strided element (index 8) is 0.0 */
+   double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0, -3.3, 0.0, 0.0, 0.0};
+   blasint inc = 2, N = 5;
+   double result = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, result, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, positive_step_1_N_70){
+   double x[ELEMENTS];
+   blasint N = ELEMENTS, inc = 1;
+   blasint idx;
+   /* Background values are all >= 1000 */
+   for (idx = 0; idx < N * inc; ++idx)
+      x[idx] = idx + 1000;
+   /* Plant the expected minimum at logical element 8 */
+   x[8 * inc] = 0.0;
+   double result = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, result, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, negative_step_1_N_70){
+   double x[ELEMENTS];
+   blasint N = ELEMENTS, inc = 1;
+   blasint idx;
+   /* Background values are all <= -1000 */
+   for (idx = 0; idx < N * inc; ++idx)
+      x[idx] = - idx - 1000;
+   /* Plant the smallest magnitude at logical element 8 */
+   x[8 * inc] = -1.0;
+   double result = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0, result, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, positive_step_2_N_70){
+   double x[ELEMENTS * INCREMENT];
+   blasint N = ELEMENTS, inc = INCREMENT;
+   blasint idx;
+   /* Background values are all >= 1000, including the skipped slots */
+   for (idx = 0; idx < N * inc; ++idx)
+      x[idx] = idx + 1000;
+   /* Plant the expected minimum at logical element 8 */
+   x[8 * inc] = 1.0;
+   double result = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0, result, DOUBLE_EPS);
+}
+
+/**
+ * Test damin by comparing it against pre-calculated values
+ */
+CTEST(damin, negative_step_2_N_70){
+   double x[ELEMENTS * INCREMENT];
+   blasint N = ELEMENTS, inc = INCREMENT;
+   blasint idx;
+   /* Background values are all <= -1000, including the skipped slots */
+   for (idx = 0; idx < N * inc; ++idx)
+      x[idx] = - idx - 1000;
+   /* Plant the smallest magnitude at logical element 8 */
+   x[8 * inc] = -1.0;
+   double result = BLASFUNC(damin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0, result, DOUBLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_daxpby.c b/utest/test_extensions/test_daxpby.c
new file mode 100644
index 000000000..6e77c7c7c
--- /dev/null
+++ b/utest/test_extensions/test_daxpby.c
@@ -0,0 +1,799 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+#define INCREMENT 2
+
+struct DATA_DAXPBY{ // static work buffers shared by every daxpby test in this file
+    double x_test[DATASIZE * INCREMENT]; // vector x passed to daxpby
+    double x_verify[DATASIZE * INCREMENT]; // copy of x passed to the reference daxpy
+    double y_test[DATASIZE * INCREMENT]; // vector y updated by daxpby; later holds the difference
+    double y_verify[DATASIZE * INCREMENT]; // copy of y updated by the reference dscal + daxpy
+};
+
+#ifdef BUILD_DOUBLE
+static struct DATA_DAXPBY data_daxpby;
+
+/**
+ * Fortran API specific function
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Compare with the following options:
+ * 
+ * param n - number of elements in vectors x and y
+ * param alpha - scalar alpha
+ * param incx - increment for the elements of x
+ * param beta - scalar beta
+ * param incy - increment for the elements of y
+ * return norm of difference
+ */
+static double check_daxpby(blasint n, double alpha, blasint incx, double beta, blasint incy)
+{
+    blasint idx;
+    // Strides with the sign dropped: used for dscal, dnrm2 and the plain loops
+    blasint stride_x = labs(incx);
+    blasint stride_y = labs(incy);
+    blasint len_x = n * stride_x;
+    blasint len_y = n * stride_y;
+    double *x_in = data_daxpby.x_test, *x_ref = data_daxpby.x_verify;
+    double *y_out = data_daxpby.y_test, *y_ref = data_daxpby.y_verify;
+
+    // Random input data for both vectors
+    drand_generate(x_in, len_x);
+    drand_generate(y_out, len_y);
+
+    // Duplicate the inputs for the reference computation
+    for (idx = 0; idx < len_x; idx++)
+        x_ref[idx] = x_in[idx];
+    for (idx = 0; idx < len_y; idx++)
+        y_ref[idx] = y_out[idx];
+
+    // Reference result: scale y by beta (dscal), then add alpha*x (daxpy)
+    BLASFUNC(dscal)(&n, &beta, y_ref, &stride_y);
+    BLASFUNC(daxpy)(&n, &alpha, x_ref, &incx, y_ref, &incy);
+    // Routine under test
+    BLASFUNC(daxpby)(&n, &alpha, x_in, &incx, &beta, y_out, &incy);
+
+    // Element-wise difference between the daxpby output and the reference
+    idx = 0;
+    while (idx < len_y) {
+        y_out[idx] -= y_ref[idx];
+        idx++;
+    }
+
+    // Norm of the strided difference vector
+    return BLASFUNC(dnrm2)(&n, y_out, &stride_y);
+}
+
+/**
+ * C API specific function
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Compare with the following options:
+ * 
+ * param n - number of elements in vectors x and y
+ * param alpha - scalar alpha
+ * param incx - increment for the elements of x
+ * param beta - scalar beta
+ * param incy - increment for the elements of y
+ * return norm of difference
+ */
+static double c_api_check_daxpby(blasint n, double alpha, blasint incx, double beta, blasint incy)
+{
+    blasint i;
+    // cblas_dscal, cblas_dnrm2 and the plain loops below use positive strides
+    blasint incx_abs = labs(incx);
+    blasint incy_abs = labs(incy);
+
+    // Fill vectors x, y so the result does not depend on what earlier tests left behind
+    drand_generate(data_daxpby.x_test, n * incx_abs);
+    drand_generate(data_daxpby.y_test, n * incy_abs);
+
+    // Keep reference copies of x and y for the dscal + daxpy path
+    for (i = 0; i < n * incx_abs; i++)
+        data_daxpby.x_verify[i] = data_daxpby.x_test[i];
+    for (i = 0; i < n * incy_abs; i++)
+        data_daxpby.y_verify[i] = data_daxpby.y_test[i];
+
+    // Reference: y_verify = alpha*x + beta*y via dscal followed by daxpy
+    cblas_dscal(n, beta, data_daxpby.y_verify, incy_abs);
+    cblas_daxpy(n, alpha, data_daxpby.x_verify, incx,
+                        data_daxpby.y_verify, incy);
+
+    // Routine under test
+    cblas_daxpby(n, alpha, data_daxpby.x_test, incx,
+                        beta, data_daxpby.y_test, incy);
+
+    // Difference between the daxpby result and the reference
+    for (i = 0; i < n * incy_abs; i++)
+        data_daxpby.y_test[i] -= data_daxpby.y_verify[i];
+
+    // Norm of the strided difference vector
+    return cblas_dnrm2(n, data_daxpby.y_test, incy_abs);
+}
+
+/**
+ * Fortran API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ */
+CTEST(daxpby, inc_x_1_inc_y_1_N_100)
+{
+    // Fortran API: unit strides for x and y, alpha = 1, beta = 1
+    const double alpha = 1.0, beta = 1.0;
+    const blasint n = DATASIZE;
+    const blasint incx = 1, incy = 1;
+
+    const double diff_norm = check_daxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 1
+ */
+CTEST(daxpby, inc_x_2_inc_y_1_N_100)
+{
+    // Fortran API: x stride 2, y stride 1, alpha = 2, beta = 1
+    const double alpha = 2.0, beta = 1.0;
+    const blasint n = DATASIZE;
+    const blasint incx = 2, incy = 1;
+
+    const double diff_norm = check_daxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ */
+CTEST(daxpby, inc_x_1_inc_y_2_N_100)
+{
+    // Fortran API: x stride 1, y stride 2, alpha = 1, beta = 2
+    const double alpha = 1.0, beta = 2.0;
+    const blasint n = DATASIZE;
+    const blasint incx = 1, incy = 2;
+
+    const double diff_norm = check_daxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ */
+CTEST(daxpby, inc_x_2_inc_y_2_N_100)
+{
+    // Fortran API: stride 2 for both x and y, alpha = 3, beta = 4
+    const double alpha = 3.0, beta = 4.0;
+    const blasint n = DATASIZE;
+    const blasint incx = 2, incy = 2;
+
+    const double diff_norm = check_daxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -1
+ * Stride of vector y is 2
+ */
+CTEST(daxpby, inc_x_neg_1_inc_y_2_N_100)
+{
+    // Fortran API: negative x stride (-1), y stride 2, alpha = 5, beta = 4
+    const double alpha = 5.0, beta = 4.0;
+    const blasint n = DATASIZE;
+    const blasint incx = -1, incy = 2;
+
+    const double diff_norm = check_daxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is -1
+ */
+CTEST(daxpby, inc_x_2_inc_y_neg_1_N_100)
+{
+    // Fortran API: x stride 2, negative y stride (-1), alpha = 1, beta = 6
+    const double alpha = 1.0, beta = 6.0;
+    const blasint n = DATASIZE;
+    const blasint incx = 2, incy = -1;
+
+    const double diff_norm = check_daxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -2
+ * Stride of vector y is -1
+ */
+CTEST(daxpby, inc_x_neg_2_inc_y_neg_1_N_100)
+{
+    // Fortran API: both strides negative (-2 and -1), alpha = 7, beta = 3.5
+    const double alpha = 7.0, beta = 3.5;
+    const blasint n = DATASIZE;
+    const blasint incx = -2, incy = -1;
+
+    const double diff_norm = check_daxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar alpha is zero
+ */
+CTEST(daxpby, inc_x_1_inc_y_1_N_100_alpha_zero)
+{
+    // Fortran API: unit strides, alpha = 0 so only the beta*y term remains
+    const double alpha = 0.0, beta = 1.0;
+    const blasint n = DATASIZE;
+    const blasint incx = 1, incy = 1;
+
+    const double diff_norm = check_daxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ * Scalar alpha is zero
+*/
+CTEST(daxpby, inc_x_1_inc_y_2_N_100_alpha_zero)
+{
+    // Fortran API: x stride 1, y stride 2, alpha = 0 so only beta*y remains
+    const double alpha = 0.0, beta = 1.0;
+    const blasint n = DATASIZE;
+    const blasint incx = 1, incy = 2;
+
+    const double diff_norm = check_daxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar beta is zero
+ */
+CTEST(daxpby, inc_x_1_inc_y_1_N_100_beta_zero)
+{
+    // Fortran API: unit strides, beta = 0 so only the alpha*x term remains
+    const double alpha = 1.0, beta = 0.0;
+    const blasint n = DATASIZE;
+    const blasint incx = 1, incy = 1;
+
+    const double diff_norm = check_daxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 1
+ * Scalar beta is zero
+*/
+CTEST(daxpby, inc_x_2_inc_y_1_N_100_beta_zero)
+{
+    // Fortran API: x stride 2, y stride 1, beta = 0 so only alpha*x remains
+    const double alpha = 1.0, beta = 0.0;
+    const blasint n = DATASIZE;
+    const blasint incx = 2, incy = 1;
+
+    const double diff_norm = check_daxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ * Scalar beta is zero
+*/
+CTEST(daxpby, inc_x_1_inc_y_2_N_100_beta_zero)
+{
+    // Fortran API: x stride 1, y stride 2, beta = 0 so only alpha*x remains
+    const double alpha = 1.0, beta = 0.0;
+    const blasint n = DATASIZE;
+    const blasint incx = 1, incy = 2;
+
+    const double diff_norm = check_daxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ * Scalar beta is zero
+*/
+CTEST(daxpby, inc_x_2_inc_y_2_N_100_beta_zero)
+{
+    // Fortran API: stride 2 for x and y, beta = 0 so only alpha*x remains
+    const double alpha = 1.0, beta = 0.0;
+    const blasint n = DATASIZE;
+    const blasint incx = 2, incy = 2;
+
+    const double diff_norm = check_daxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar alpha is zero
+ * Scalar beta is zero
+ */
+CTEST(daxpby, inc_x_1_inc_y_1_N_100_alpha_beta_zero)
+{
+    // Fortran API: unit strides with both scalars set to zero
+    const double alpha = 0.0, beta = 0.0;
+    const blasint n = DATASIZE;
+    const blasint incx = 1, incy = 1;
+
+    const double diff_norm = check_daxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ * Scalar alpha is zero
+ * Scalar beta is zero
+*/
+CTEST(daxpby, inc_x_1_inc_y_2_N_100_alpha_beta_zero)
+{
+    // Fortran API: x stride 1, y stride 2, both scalars set to zero
+    const double alpha = 0.0, beta = 0.0;
+    const blasint n = DATASIZE;
+    const blasint incx = 1, incy = 2;
+
+    const double diff_norm = check_daxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Check if n - size of vectors x, y is zero
+ */
+CTEST(daxpby, check_n_zero)
+{
+    // Fortran API: empty vectors (n = 0); the difference norm must still be 0
+    const double alpha = 1.0, beta = 1.0;
+    const blasint n = 0;
+    const blasint incx = 1, incy = 1;
+
+    const double diff_norm = check_daxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ *
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1, stride of vector y is 1
+ * Scalar alpha is 1.0, scalar beta is 1.0
+ */
+CTEST(daxpby, c_api_inc_x_1_inc_y_1_N_100)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    double alpha = 1.0;
+    double beta = 1.0;
+
+    double norm = c_api_check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ *
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2, stride of vector y is 1
+ * Scalar alpha is 2.0, scalar beta is 1.0
+ */
+CTEST(daxpby, c_api_inc_x_2_inc_y_1_N_100)
+{
+    blasint n = DATASIZE, incx = 2, incy = 1;
+    double alpha = 2.0;
+    double beta = 1.0;
+
+    double norm = c_api_check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ *
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1, stride of vector y is 2
+ * Scalar alpha is 1.0, scalar beta is 2.0
+ */
+CTEST(daxpby, c_api_inc_x_1_inc_y_2_N_100)
+{
+    blasint n = DATASIZE, incx = 1, incy = 2;
+    double alpha = 1.0;
+    double beta = 2.0;
+
+    double norm = c_api_check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ *
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2, stride of vector y is 2
+ * Scalar alpha is 3.0, scalar beta is 4.0
+ */
+CTEST(daxpby, c_api_inc_x_2_inc_y_2_N_100)
+{
+    blasint n = DATASIZE, incx = 2, incy = 2;
+    double alpha = 3.0;
+    double beta = 4.0;
+
+    double norm = c_api_check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ *
+ * Size of vectors x, y is 100
+ * Stride of vector x is -1, stride of vector y is 2
+ * Scalar alpha is 5.0, scalar beta is 4.0
+ */
+CTEST(daxpby, c_api_inc_x_neg_1_inc_y_2_N_100)
+{
+    blasint n = DATASIZE, incx = -1, incy = 2;
+    double alpha = 5.0;
+    double beta = 4.0;
+
+    double norm = c_api_check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ *
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2, stride of vector y is -1
+ * Scalar alpha is 1.0, scalar beta is 6.0
+ */
+CTEST(daxpby, c_api_inc_x_2_inc_y_neg_1_N_100)
+{
+    blasint n = DATASIZE, incx = 2, incy = -1;
+    double alpha = 1.0;
+    double beta = 6.0;
+
+    double norm = c_api_check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ *
+ * Size of vectors x, y is 100
+ * Stride of vector x is -2, stride of vector y is -1
+ * Scalar alpha is 7.0, scalar beta is 3.5
+ */
+CTEST(daxpby, c_api_inc_x_neg_2_inc_y_neg_1_N_100)
+{
+    blasint n = DATASIZE, incx = -2, incy = -1;
+    double alpha = 7.0;
+    double beta = 3.5;
+
+    double norm = c_api_check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ *
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1, stride of vector y is 1
+ * Scalar alpha is zero
+ * Scalar beta is 1.0
+ */
+CTEST(daxpby, c_api_inc_x_1_inc_y_1_N_100_alpha_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    double alpha = 0.0;
+    double beta = 1.0;
+
+    double norm = c_api_check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ *
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1, stride of vector y is 2
+ * Scalar alpha is zero
+ * Scalar beta is 1.0
+ */
+CTEST(daxpby, c_api_inc_x_1_inc_y_2_N_100_alpha_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 2;
+    double alpha = 0.0;
+    double beta = 1.0;
+
+    double norm = c_api_check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ *
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ *
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1, stride of vector y is 1
+ * Scalar alpha is 1.0
+ * Scalar beta is zero
+ */
+CTEST(daxpby, c_api_inc_x_1_inc_y_1_N_100_beta_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    double alpha = 1.0;
+    double beta = 0.0;
+
+    double norm = c_api_check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ *
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2, stride of vector y is 1
+ * Scalar alpha is 1.0
+ * Scalar beta is zero
+ */
+CTEST(daxpby, c_api_inc_x_2_inc_y_1_N_100_beta_zero)
+{
+    blasint n = DATASIZE, incx = 2, incy = 1;
+    double alpha = 1.0;
+    double beta = 0.0;
+
+    double norm = c_api_check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ *
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ *
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1, stride of vector y is 2
+ * Scalar alpha is 1.0
+ * Scalar beta is zero
+ */
+CTEST(daxpby, c_api_inc_x_1_inc_y_2_N_100_beta_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 2;
+    double alpha = 1.0;
+    double beta = 0.0;
+
+    double norm = c_api_check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ *
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ *
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2, stride of vector y is 2
+ * Scalar alpha is 1.0
+ * Scalar beta is zero
+ */
+CTEST(daxpby, c_api_inc_x_2_inc_y_2_N_100_beta_zero)
+{
+    blasint n = DATASIZE, incx = 2, incy = 2;
+    double alpha = 1.0;
+    double beta = 0.0;
+
+    double norm = c_api_check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ *
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ *
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1, stride of vector y is 1
+ * Scalar alpha is zero
+ * Scalar beta is zero
+ * Passes when c_api_check_daxpby returns zero within DOUBLE_EPS
+ */
+CTEST(daxpby, c_api_inc_x_1_inc_y_1_N_100_alpha_beta_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    double alpha = 0.0;
+    double beta = 0.0;
+
+    double norm = c_api_check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test daxpby by comparing it with dscal and daxpy.
+ * Test with the following options:
+ *
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1, stride of vector y is 2
+ * Scalar alpha is zero
+ * Scalar beta is zero
+ * Passes when c_api_check_daxpby returns zero within DOUBLE_EPS
+ */
+CTEST(daxpby, c_api_inc_x_1_inc_y_2_N_100_alpha_beta_zero)
+{
+    blasint n = DATASIZE, incx = 1, incy = 2;
+    double alpha = 0.0;
+    double beta = 0.0;
+
+    double norm = c_api_check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Check the case where n, the size of vectors x and y, is zero
+ */
+CTEST(daxpby, c_api_check_n_zero)
+{
+    blasint n = 0, incx = 1, incy = 1;
+    double alpha = 1.0;
+    double beta = 1.0;
+
+    double norm = c_api_check_daxpby(n, alpha, incx, beta, incy);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_dgeadd.c b/utest/test_extensions/test_dgeadd.c
new file mode 100644
index 000000000..8f93a842e
--- /dev/null
+++ b/utest/test_extensions/test_dgeadd.c
@@ -0,0 +1,878 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define N 100
+#define M 100
+
+struct DATA_DGEADD{              // buffers shared by every dgeadd test in this file
+    double a_test[M * N];        // matrix A handed to dgeadd and to the reference
+    double c_test[M * N];        // matrix C handed to the dgeadd under test
+    double c_verify[M * N];      // copy of C handed to the reference (dgeadd_trusted)
+};
+
+#ifdef BUILD_DOUBLE
+static struct DATA_DGEADD data_dgeadd;
+
+/**
+ * dgeadd reference implementation: one cblas_daxpby call per column
+ *
+ * param m - number of rows of A and C (elements handled by each daxpby call)
+ * param n - number of columns of A and C (number of daxpby calls)
+ * param alpha - scaling factor for matrix A
+ * param aptr - pointer to the first element of matrix A
+ * param lda - leading dimension of A (offset between consecutive columns)
+ * param beta - scaling factor for matrix C
+ * param cptr - pointer to the first element of matrix C
+ * param ldc - leading dimension of C (offset between consecutive columns)
+ */
+static void dgeadd_trusted(blasint m, blasint n, double alpha, double *aptr,
+                           blasint lda, double beta, double *cptr, blasint ldc)
+{
+    blasint i;
+
+    for (i = 0; i < n; i++)
+    {
+        cblas_daxpby(m, alpha, aptr, 1, beta, cptr, 1); // unit stride inside a column
+        aptr += lda; // step to the next column of A
+        cptr += ldc; // step to the next column of C
+    }
+}
+
+/**
+ * Test dgeadd by comparing it against the dgeadd_trusted reference.
+ * Both operate on the same generated A and on identical copies of a generated C.
+ *
+ * param api - 'F' selects the Fortran API, any other value the C API
+ * param order - specifies whether A and C are stored in row-major or
+ * column-major order (only forwarded to the C API)
+ * param m - number of rows of A and C
+ * param n - number of columns of A and C
+ * param alpha - scaling factor for matrix A
+ * param lda - leading dimension of A
+ * param beta - scaling factor for matrix C
+ * param ldc - leading dimension of C
+ * return value of dmatrix_difference between the tested and the reference C
+ */
+static double check_dgeadd(char api, OPENBLAS_CONST enum CBLAS_ORDER order,
+                          blasint m, blasint n, double alpha, blasint lda,
+                          double beta, blasint ldc)
+{
+    blasint i;
+    blasint cols = m, rows = n; // column-major: n contiguous lines of m elements each
+
+    if (order == CblasRowMajor)
+    {
+        rows = m; // row-major: m contiguous lines ...
+        cols = n; // ... of n elements each
+    }
+
+    // Fill matrix A, C (rows lines, each spaced by the leading dimension)
+    drand_generate(data_dgeadd.a_test, lda * rows);
+    drand_generate(data_dgeadd.c_test, ldc * rows);
+
+    // Copy matrix C so the reference starts from the same input as dgeadd
+    for (i = 0; i < ldc * rows; i++)
+        data_dgeadd.c_verify[i] = data_dgeadd.c_test[i];
+
+    dgeadd_trusted(cols, rows, alpha, data_dgeadd.a_test, lda,
+                   beta, data_dgeadd.c_verify, ldc);
+
+    if (api == 'F')
+        BLASFUNC(dgeadd)(&m, &n, &alpha, data_dgeadd.a_test, &lda,
+         &beta, data_dgeadd.c_test, &ldc);
+    else
+        cblas_dgeadd(order, m, n, alpha, data_dgeadd.a_test, lda,
+                     beta, data_dgeadd.c_test, ldc);
+
+    // Find the differences between output matrix calculated by dgeadd and by the reference
+    return dmatrix_difference(data_dgeadd.c_test, data_dgeadd.c_verify, cols, rows, ldc);
+}
+
+/**
+ * Check if error function was called with expected function name
+ * and param info. alpha and beta are fixed to 1.0; the shared test
+ * buffers are passed as A and C without being initialised here.
+ * param api - 'F' selects the Fortran API, any other value the C API
+ * param order - specifies whether A and C are stored in row-major or
+ * column-major order (only forwarded to the C API)
+ * param m - number of rows of A and C
+ * param n - number of columns of A and C
+ * param lda - leading dimension of A
+ * param ldc - leading dimension of C
+ * param expected_info - expected invalid parameter number in dgeadd
+ * return result of check_error(): TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char api, OPENBLAS_CONST enum CBLAS_ORDER order,
+                         blasint m, blasint n, blasint lda,
+                         blasint ldc, int expected_info)
+{
+    double alpha = 1.0;
+    double beta = 1.0;
+
+    set_xerbla("DGEADD ", expected_info); // expected routine name and info, set before the call
+
+    if (api == 'F')
+        BLASFUNC(dgeadd)(&m, &n, &alpha, data_dgeadd.a_test, &lda,
+                         &beta, data_dgeadd.c_test, &ldc);
+    else 
+        cblas_dgeadd(order, m, n, alpha, data_dgeadd.a_test, lda,
+                 beta, data_dgeadd.c_test, ldc);
+
+    return check_error();
+}
+
+/**
+ * Fortran API specific test
+ * Test dgeadd by comparing it against reference
+ * with the following options:
+ *
+ * A and C both have 100 rows and 100 columns (lda = ldc = m)
+ * Scalar alpha is 3.0, scalar beta is 3.0
+ */
+CTEST(dgeadd, matrix_n_100_m_100)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    double alpha = 3.0;
+    double beta = 3.0;
+
+    double norm = check_dgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dgeadd by comparing it against reference
+ * with the following options:
+ *
+ * A and C both have 100 rows and 100 columns (lda = ldc = m)
+ * Scalar alpha is zero (operation is C:=beta*C)
+ * Scalar beta is 2.5
+ */
+CTEST(dgeadd, matrix_n_100_m_100_alpha_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    double alpha = 0.0;
+    double beta = 2.5;
+
+    double norm = check_dgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dgeadd by comparing it against reference
+ * with the following options:
+ *
+ * A and C both have 100 rows and 100 columns (lda = ldc = m)
+ * Scalar alpha is 3.0
+ * Scalar beta is zero (operation is C:=alpha*A)
+ */
+CTEST(dgeadd, matrix_n_100_m_100_beta_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    double alpha = 3.0;
+    double beta = 0.0;
+
+    double norm = check_dgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dgeadd by comparing it against reference
+ * with the following options:
+ *
+ * A and C both have 100 rows and 100 columns
+ * Leading dimensions lda and ldc are both m
+ * Scalars alpha and beta are both zero (operation is C:=0)
+ */
+CTEST(dgeadd, matrix_n_100_m_100_alpha_beta_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    double alpha = 0.0;
+    double beta = 0.0;
+
+    double norm = check_dgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dgeadd by comparing it against reference
+ * with the following options:
+ *
+ * A and C both have 50 rows and 100 columns (lda = ldc = m)
+ * Scalar alpha is 1.0, scalar beta is 1.0
+ */
+CTEST(dgeadd, matrix_n_100_m_50)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M / 2;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    double alpha = 1.0;
+    double beta = 1.0;
+
+    double norm = check_dgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param n -
+ * number of columns of A and C, which must be at least zero.
+ * Expected info is 2.
+ */
+CTEST(dgeadd, xerbla_n_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = INVALID;
+    blasint m = 1;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    int expected_info = 2;
+
+    int passed = check_badargs('F', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param m -
+ * number of rows of A and C, which must be at least zero.
+ * Expected info is 1.
+ */
+CTEST(dgeadd, xerbla_m_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = INVALID;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    int expected_info = 1;
+
+    int passed = check_badargs('F', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param lda - the leading dimension
+ * of A, which must be at least MAX(1, m). Expected info is 5.
+ */
+CTEST(dgeadd, xerbla_lda_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = 1;
+
+    blasint lda = INVALID;
+    blasint ldc = 1;
+
+    int expected_info = 5;
+
+    int passed = check_badargs('F', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param ldc - the leading dimension
+ * of C, which must be at least MAX(1, m). Expected info is 8.
+ */
+CTEST(dgeadd, xerbla_ldc_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = INVALID;
+
+    int expected_info = 8;
+
+    int passed = check_badargs('F', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Check the case where n, the number of columns of A and C, is zero.
+ */
+CTEST(dgeadd, n_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 0;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    double alpha = 1.0;
+    double beta = 1.0;
+
+    double norm = check_dgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Check the case where m, the number of rows of A and C, is zero.
+ */
+CTEST(dgeadd, m_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = 0;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    double alpha = 1.0;
+    double beta = 1.0;
+
+    double norm = check_dgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgeadd by comparing it against reference
+ * with the following options:
+ *
+ * c api option order is column-major order
+ *
+ * A and C both have 100 rows and 100 columns (lda = ldc = m)
+ * Scalar alpha is 2.0, scalar beta is 3.0
+ */
+CTEST(dgeadd, c_api_matrix_n_100_m_100)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    double alpha = 2.0;
+    double beta = 3.0;
+
+    double norm = check_dgeadd('C', order, m, n, alpha,
+                                    lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgeadd by comparing it against reference
+ * with the following options:
+ *
+ * c api option order is row-major order
+ * A and C both have 100 rows and 100 columns (lda = ldc = m)
+ * Scalar alpha is 4.0, scalar beta is 2.0
+ */
+CTEST(dgeadd, c_api_matrix_n_100_m_100_row_major)
+{
+    CBLAS_ORDER order = CblasRowMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    double alpha = 4.0;
+    double beta = 2.0;
+
+    double norm = check_dgeadd('C', order, m, n, alpha,
+                                    lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgeadd by comparing it against reference
+ * with the following options:
+ *
+ * c api option order is row-major order
+ * A and C both have 100 rows and 50 columns (lda = ldc = n)
+ * Scalar alpha is 3.0, scalar beta is 1.0
+ */
+CTEST(dgeadd, c_api_matrix_n_50_m_100_row_major)
+{
+    CBLAS_ORDER order = CblasRowMajor;
+
+    blasint n = N / 2;
+    blasint m = M;
+
+    blasint lda = n;
+    blasint ldc = n;
+
+    double alpha = 3.0;
+    double beta = 1.0;
+
+    double norm = check_dgeadd('C', order, m, n, alpha,
+                                    lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgeadd by comparing it against reference
+ * with the following options:
+ *
+ * c api option order is column-major order
+ * A and C both have 100 rows and 100 columns (lda = ldc = m)
+ * Scalar alpha is zero (operation is C:=beta*C)
+ * Scalar beta is 1.0
+ */
+CTEST(dgeadd, c_api_matrix_n_100_m_100_alpha_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    double alpha = 0.0;
+    double beta = 1.0;
+
+    double norm = check_dgeadd('C', order, m, n, alpha,
+                                    lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgeadd by comparing it against reference
+ * with the following options:
+ *
+ * c api option order is column-major order
+ * A and C both have 100 rows and 100 columns (lda = ldc = m)
+ * Scalar alpha is 3.0
+ * Scalar beta is zero (operation is C:=alpha*A)
+ */
+CTEST(dgeadd, c_api_matrix_n_100_m_100_beta_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    double alpha = 3.0;
+    double beta = 0.0;
+
+    double norm = check_dgeadd('C', order, m, n, alpha,
+                                    lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgeadd by comparing it against reference
+ * with the following options:
+ *
+ * c api option order is column-major order
+ * A and C both have 100 rows and 100 columns
+ * Leading dimensions lda and ldc are both m
+ * Scalars alpha and beta are both zero (operation is C:=0)
+ */
+CTEST(dgeadd, c_api_matrix_n_100_m_100_alpha_beta_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    double alpha = 0.0;
+    double beta = 0.0;
+
+    double norm = check_dgeadd('C', order, m, n, alpha,
+                                    lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgeadd by comparing it against reference
+ * with the following options:
+ *
+ * c api option order is column-major order; alpha is 3.0, beta is 4.0
+ * A and C both have 50 rows and 100 columns (lda = ldc = m)
+ */
+CTEST(dgeadd, c_api_matrix_n_100_m_50)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = N;
+    blasint m = M / 2;
+
+    blasint lda = m;
+    blasint ldc = m;
+
+    double alpha = 3.0;
+    double beta = 4.0;
+
+    double norm = check_dgeadd('C', order, m, n, alpha,
+                                    lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param order -
+ * specifies whether A and C are stored in row-major order
+ * or column-major order. Expected info is 0.
+ */
+CTEST(dgeadd, c_api_xerbla_invalid_order)
+{
+    CBLAS_ORDER order = INVALID;
+
+    blasint n = 1;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    int expected_info = 0;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param n -
+ * number of columns of A and C, which must be at least zero.
+ * Expected info is 2.
+ *
+ * c api option order is column-major order
+ */
+CTEST(dgeadd, c_api_xerbla_n_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = INVALID;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    int expected_info = 2;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param n -
+ * number of columns of A and C, which must be at least zero.
+ * Expected info is 2.
+ *
+ * c api option order is row-major order
+ */
+CTEST(dgeadd, c_api_xerbla_n_invalid_row_major)
+{
+    CBLAS_ORDER order = CblasRowMajor;
+
+    blasint n = INVALID;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    int expected_info = 2;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param m -
+ * number of rows of A and C, which must be at least zero.
+ * Expected info is 1.
+ *
+ * c api option order is column-major order
+ */
+CTEST(dgeadd, c_api_xerbla_m_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = INVALID;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    int expected_info = 1;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param m -
+ * number of rows of A and C, which must be at least zero.
+ * Expected info is 1.
+ *
+ * c api option order is row-major order
+ */
+CTEST(dgeadd, c_api_xerbla_m_invalid_row_major)
+{
+    CBLAS_ORDER order = CblasRowMajor;
+
+    blasint n = 1;
+    blasint m = INVALID;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    int expected_info = 1;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param lda - the leading dimension
+ * of A, which must be at least MAX(1, m). Expected info is 5.
+ *
+ * c api option order is column-major order
+ */
+CTEST(dgeadd, c_api_xerbla_lda_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = 1;
+
+    blasint lda = INVALID;
+    blasint ldc = 1;
+
+    int expected_info = 5;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param lda - the leading dimension
+ * of A, which must be at least MAX(1, m). Expected info is 5.
+ * NOTE(review): row-major data in this file uses lda = n - confirm the bound.
+ * c api option order is row-major order
+ */
+CTEST(dgeadd, c_api_xerbla_lda_invalid_row_major)
+{
+    CBLAS_ORDER order = CblasRowMajor;
+
+    blasint n = 1;
+    blasint m = 1;
+
+    blasint lda = INVALID;
+    blasint ldc = 1;
+
+    int expected_info = 5;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param ldc - the leading dimension
+ * of C, which must be at least MAX(1, m). Expected info is 8.
+ *
+ * c api option order is column-major order
+ */
+CTEST(dgeadd, c_api_xerbla_ldc_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = INVALID;
+
+    int expected_info = 8;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param ldc - the leading dimension
+ * of C, which must be at least MAX(1, m). Expected info is 8.
+ * NOTE(review): row-major data in this file uses ldc = n - confirm the bound.
+ * c api option order is row-major order
+ */
+CTEST(dgeadd, c_api_xerbla_ldc_invalid_row_major)
+{
+    CBLAS_ORDER order = CblasRowMajor;
+
+    blasint n = 1;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = INVALID;
+
+    int expected_info = 8;
+
+    int passed = check_badargs('C', order, m, n, lda, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test
+ * Check the case where n, the number of columns of A and C, is zero.
+ *
+ * c api option order is column-major order
+ */
+CTEST(dgeadd, c_api_n_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 0;
+    blasint m = 1;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    double alpha = 1.0;
+    double beta = 1.0;
+
+    double norm = check_dgeadd('C', order, m, n, alpha,
+                                    lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Check the case where m, the number of rows of A and C, is zero.
+ *
+ * c api option order is column-major order
+ */
+CTEST(dgeadd, c_api_m_zero)
+{
+    CBLAS_ORDER order = CblasColMajor;
+
+    blasint n = 1;
+    blasint m = 0;
+
+    blasint lda = 1;
+    blasint ldc = 1;
+
+    double alpha = 1.0;
+    double beta = 1.0;
+
+    double norm = check_dgeadd('C', order, m, n, alpha,
+                                    lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_dgemmt.c b/utest/test_extensions/test_dgemmt.c
new file mode 100644
index 000000000..22dcaf2aa
--- /dev/null
+++ b/utest/test_extensions/test_dgemmt.c
@@ -0,0 +1,1442 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+
+/* Work buffers used by the dgemmt tests; each holds DATASIZE * DATASIZE doubles */
+struct DATA_DGEMMT {
+    /* Input matrix A passed to both dgemm and dgemmt */
+    double a_test[DATASIZE * DATASIZE];
+    /* Input matrix B passed to both dgemm and dgemmt */
+    double b_test[DATASIZE * DATASIZE];
+    /* Matrix C handed to the dgemmt call under test */
+    double c_test[DATASIZE * DATASIZE];
+    /* Expected C: original C with one triangle taken from c_gemm; later holds the difference */
+    double c_verify[DATASIZE * DATASIZE];
+    /* Matrix C handed to the reference dgemm call (full result) */
+    double c_gemm[DATASIZE * DATASIZE];
+};
+
+#ifdef BUILD_DOUBLE
+static struct DATA_DGEMMT data_dgemmt;
+
+/**
+ * Compute gemmt via gemm since gemmt is gemm but updates only 
+ * the upper or lower triangular part of the result matrix
+ *
+ * param api specifies tested api (C or Fortran)
+ * param order specifies row or column major order (for Fortran API column major always)
+ * param uplo specifies whether C’s data is stored in its upper or lower triangle
+ * param transa specifies op(A), the transposition operation applied to A
+ * param transb specifies op(B), the transposition operation applied to B
+ * param m - number of rows of op(A), columns of op(B), and columns and rows of C
+ * param k - number of columns of op(A) and rows of op(B)
+ * param alpha - scaling factor for the matrix-matrix product
+ * param lda - leading dimension of A
+ * param ldb - leading dimension of B
+ * param beta - scaling factor for matrix C
+ * param ldc - leading dimension of C
+ */
+static void dgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char transa,
+                           char transb, blasint m, blasint k, double alpha, blasint lda,
+                           blasint ldb, double beta, blasint ldc)
+{
+    blasint i, j;
+    blasint first, last;
+    /* Any uplo value that is not "lower" is treated as "upper" */
+    int lower = (uplo == 'L' || uplo == CblasLower);
+    /* The Fortran API is always column-major, whatever order was passed in */
+    int row_major = (api != 'F' && order == CblasRowMajor);
+    int from_diagonal;
+
+    /* Reference: the full m x m gemm result goes into c_gemm */
+    if(api == 'F')
+        BLASFUNC(dgemm)(&transa, &transb, &m, &m, &k, &alpha, data_dgemmt.a_test, &lda,
+                        data_dgemmt.b_test, &ldb, &beta, data_dgemmt.c_gemm, &ldc);
+    else
+        cblas_dgemm(order, transa, transb, m, m, k, alpha, data_dgemmt.a_test, lda,
+                data_dgemmt.b_test, ldb, beta, data_dgemmt.c_gemm, ldc);
+
+    /*
+     * Element i * ldc + j is (row j, column i) in column-major storage but
+     * (row i, column j) in row-major storage. The run j = i .. m-1 is thus
+     * the lower triangle for column-major and the upper triangle for
+     * row-major, so the triangle selection has to follow the storage order.
+     */
+    from_diagonal = row_major ? !lower : lower;
+
+    /* Copy only the selected triangle; the rest of c_verify keeps the original C */
+    for (i = 0; i < m; i++) {
+        first = from_diagonal ? i : 0;
+        last = from_diagonal ? m - 1 : i;
+        for (j = first; j <= last; j++)
+            data_dgemmt.c_verify[i * ldc + j] =
+                data_dgemmt.c_gemm[i * ldc + j];
+    }
+}
+
+/**
+ * Compare results computed by dgemmt and dgemmt_trusted
+ *
+ * param api specifies tested api (C or Fortran)
+ * param order specifies row or column major order (for Fortran API column major always)
+ * param uplo specifies whether C’s data is stored in its upper or lower triangle
+ * param transa specifies op(A), the transposition operation applied to A
+ * param transb specifies op(B), the transposition operation applied to B
+ * param m - number of rows of op(A), columns of op(B), and columns and rows of C
+ * param k - number of columns of op(A) and rows of op(B)
+ * param alpha - scaling factor for the matrix-matrix product
+ * param lda - leading dimension of A
+ * param ldb - leading dimension of B
+ * param beta - scaling factor for matrix C
+ * param ldc - leading dimension of C
+ * return norm of differences
+ */
+static double check_dgemmt(char api, enum CBLAS_ORDER order, char uplo, char transa,
+                          char transb, blasint m, blasint k, double alpha, blasint lda,
+                          blasint ldb, double beta, blasint ldc)
+{
+    blasint idx;
+    blasint a_slices, b_slices;
+    blasint stride = 1;
+    blasint c_len = m * ldc;
+
+    /* Both the Fortran characters and the CBLAS enum values are recognised */
+    int a_transposed = (transa == 'T' || transa == 'C' ||
+                        transa == CblasTrans || transa == CblasConjTrans);
+    int b_transposed = (transb == 'T' || transb == 'C' ||
+                        transb == CblasTrans || transb == CblasConjTrans);
+
+    /* Number of lda-/ldb-sized slices of A and B that get random data */
+    if (order == CblasColMajor) {
+        a_slices = a_transposed ? m : k;
+        b_slices = b_transposed ? k : m;
+    } else {
+        a_slices = a_transposed ? k : m;
+        b_slices = b_transposed ? m : k;
+    }
+
+    drand_generate(data_dgemmt.a_test, a_slices * lda);
+    drand_generate(data_dgemmt.b_test, b_slices * ldb);
+    drand_generate(data_dgemmt.c_test, c_len);
+
+    /* The reference buffers start from the same C as the routine under test */
+    for (idx = 0; idx < c_len; idx++) {
+        data_dgemmt.c_verify[idx] = data_dgemmt.c_test[idx];
+        data_dgemmt.c_gemm[idx] = data_dgemmt.c_test[idx];
+    }
+
+    /* Expected result ends up in c_verify */
+    dgemmt_trusted(api, order, uplo, transa, transb, m, k, alpha, lda, ldb, beta, ldc);
+
+    /* Result under test ends up in c_test */
+    if (api != 'F')
+        cblas_dgemmt(order, uplo, transa, transb, m, k, alpha, data_dgemmt.a_test, lda,
+                    data_dgemmt.b_test, ldb, beta, data_dgemmt.c_test, ldc);
+    else
+        BLASFUNC(dgemmt)(&uplo, &transa, &transb, &m, &k, &alpha, data_dgemmt.a_test,
+                         &lda, data_dgemmt.b_test, &ldb, &beta, data_dgemmt.c_test, &ldc);
+
+    /* Turn c_verify into the element-wise difference */
+    for (idx = 0; idx < c_len; idx++)
+        data_dgemmt.c_verify[idx] -= data_dgemmt.c_test[idx];
+
+    /* 2-norm of the difference divided by the number of compared elements */
+    return BLASFUNC(dnrm2)(&c_len, data_dgemmt.c_verify, &stride) / c_len;
+}
+
+/**
+ * Check if error function was called with expected function name
+ * and param info
+ *
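+ * param api specifies tested api (C or Fortran)
+ * param order specifies row or column major order (for Fortran API column major always)
+ * param uplo specifies whether C’s data is stored in its upper or lower triangle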
+ * param transa specifies op(A), the transposition operation applied to A
+ * param transb specifies op(B), the transposition operation applied to B
+ * param m - number of rows of op(A), columns of op(B), and columns and rows of C
+ * param k - number of columns of op(A) and rows of op(B)
+ * param lda - leading dimension of A
+ * param ldb - leading dimension of B
+ * param ldc - leading dimension of C
+ * param expected_info - expected invalid parameter number in dgemmt
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char api, enum CBLAS_ORDER order, char uplo, char transa,
+                         char transb, blasint m, blasint k, blasint lda, blasint ldb,
+                         blasint ldc, int expected_info)
+{
+    /* Fixed scaling factors for the call under test */
+    double scale_ab = 1.0;
+    double scale_c = 0.0;
+
+    /* Tell the test harness which routine name and info value to expect */
+    set_xerbla("DGEMMT ", expected_info);
+
+    if (api != 'F') {
+        cblas_dgemmt(order, uplo, transa, transb, m, k, scale_ab, data_dgemmt.a_test, lda,
+                    data_dgemmt.b_test, ldb, scale_c, data_dgemmt.c_test, ldc);
+    } else {
+        BLASFUNC(dgemmt)(&uplo, &transa, &transb, &m, &k, &scale_ab, data_dgemmt.a_test,
+                         &lda, data_dgemmt.b_test, &ldb, &scale_c, data_dgemmt.c_test, &ldc);
+    }
+
+    return check_error();
+}
+
+/**
+ * Fortran API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(dgemmt, upper_M_50_K_50_a_notrans_b_notrans)
+{
+    /* Fortran interface: uplo = 'U', transa = 'N', transb = 'N' */
+    const blasint dim = 50, depth = 50;
+    const blasint a_ld = 50, b_ld = 50, c_ld = 50;
+    const double scale_ab = 1.5, scale_c = 2.0;
+
+    const double diff = check_dgemmt('F', CblasColMajor, 'U', 'N', 'N',
+                                     dim, depth, scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(dgemmt, upper_M_100_K_50_a_trans_b_notrans)
+{
+    /* Fortran interface: uplo = 'U', transa = 'T', transb = 'N' */
+    const blasint dim = 100, depth = 50;
+    const blasint a_ld = 50, b_ld = 50, c_ld = 100;
+    const double scale_ab = 1.0, scale_c = 2.0;
+
+    const double diff = check_dgemmt('F', CblasColMajor, 'U', 'T', 'N',
+                                     dim, depth, scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(dgemmt, upper_M_50_K_100_a_notrans_b_trans)
+{
+    /* Fortran interface: uplo = 'U', transa = 'N', transb = 'T' */
+    const blasint dim = 50, depth = 100;
+    const blasint a_ld = 50, b_ld = 50, c_ld = 50;
+    const double scale_ab = 1.0, scale_c = 0.0;
+
+    const double diff = check_dgemmt('F', CblasColMajor, 'U', 'N', 'T',
+                                     dim, depth, scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(dgemmt, upper_M_50_K_50_a_trans_b_trans)
+{
+    /* Fortran interface: uplo = 'U', transa = 'T', transb = 'T' */
+    const blasint dim = 50, depth = 50;
+    const blasint a_ld = 50, b_ld = 50, c_ld = 50;
+    const double scale_ab = 1.5, scale_c = 2.0;
+
+    const double diff = check_dgemmt('F', CblasColMajor, 'U', 'T', 'T',
+                                     dim, depth, scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * alpha = 0.0
+ */
+CTEST(dgemmt, upper_alpha_zero)
+{
+    /* Fortran interface: uplo = 'U', no transposition, alpha set to zero */
+    const blasint dim = 50, depth = 50;
+    const blasint a_ld = 50, b_ld = 50, c_ld = 50;
+    const double scale_ab = 0.0, scale_c = 2.0;
+
+    const double diff = check_dgemmt('F', CblasColMajor, 'U', 'N', 'N',
+                                     dim, depth, scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * beta = 1.0
+ */
+CTEST(dgemmt, upper_beta_one)
+{
+    /* Fortran interface: uplo = 'U', no transposition, beta set to one */
+    const blasint dim = 50, depth = 50;
+    const blasint a_ld = 50, b_ld = 50, c_ld = 50;
+    const double scale_ab = 2.0, scale_c = 1.0;
+
+    const double diff = check_dgemmt('F', CblasColMajor, 'U', 'N', 'N',
+                                     dim, depth, scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(dgemmt, lower_M_50_K_50_a_notrans_b_notrans)
+{
+    /* Fortran interface: uplo = 'L', transa = 'N', transb = 'N' */
+    const blasint dim = 50, depth = 50;
+    const blasint a_ld = 50, b_ld = 50, c_ld = 50;
+    const double scale_ab = 1.5, scale_c = 2.0;
+
+    const double diff = check_dgemmt('F', CblasColMajor, 'L', 'N', 'N',
+                                     dim, depth, scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(dgemmt, lower_M_100_K_50_a_trans_b_notrans)
+{
+    /* Fortran interface: uplo = 'L', transa = 'T', transb = 'N' */
+    const blasint dim = 100, depth = 50;
+    const blasint a_ld = 50, b_ld = 50, c_ld = 100;
+    const double scale_ab = 1.0, scale_c = 2.0;
+
+    const double diff = check_dgemmt('F', CblasColMajor, 'L', 'T', 'N',
+                                     dim, depth, scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(dgemmt, lower_M_50_K_100_a_notrans_b_trans)
+{
+    /* Fortran interface: uplo = 'L', transa = 'N', transb = 'T' */
+    const blasint dim = 50, depth = 100;
+    const blasint a_ld = 50, b_ld = 50, c_ld = 50;
+    const double scale_ab = 1.0, scale_c = 0.0;
+
+    const double diff = check_dgemmt('F', CblasColMajor, 'L', 'N', 'T',
+                                     dim, depth, scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(dgemmt, lower_M_50_K_50_a_trans_b_trans)
+{
+    /* Fortran interface: uplo = 'L', transa = 'T', transb = 'T' */
+    const blasint dim = 50, depth = 50;
+    const blasint a_ld = 50, b_ld = 50, c_ld = 50;
+    const double scale_ab = 1.5, scale_c = 2.0;
+
+    const double diff = check_dgemmt('F', CblasColMajor, 'L', 'T', 'T',
+                                     dim, depth, scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * alpha = 0.0
+ */
+CTEST(dgemmt, lower_alpha_zero)
+{
+    /* Fortran interface: uplo = 'L', no transposition, alpha set to zero */
+    const blasint dim = 50, depth = 50;
+    const blasint a_ld = 50, b_ld = 50, c_ld = 50;
+    const double scale_ab = 0.0, scale_c = 2.0;
+
+    const double diff = check_dgemmt('F', CblasColMajor, 'L', 'N', 'N',
+                                     dim, depth, scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * beta = 1.0
+ */
+CTEST(dgemmt, lower_beta_one)
+{
+    /* Fortran interface: uplo = 'L', no transposition, beta set to one */
+    const blasint dim = 50, depth = 50;
+    const blasint a_ld = 50, b_ld = 50, c_ld = 50;
+    const double scale_ab = 2.0, scale_c = 1.0;
+
+    const double diff = check_dgemmt('F', CblasColMajor, 'L', 'N', 'N',
+                                     dim, depth, scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(dgemmt, c_api_colmajor_upper_M_50_K_50_a_notrans_b_notrans)
+{
+    /* C interface, column-major: CblasUpper, A and B not transposed */
+    const blasint dim = 50, depth = 50;
+    const blasint a_ld = 50, b_ld = 50, c_ld = 50;
+    const double scale_ab = 1.5, scale_c = 2.0;
+
+    const double diff = check_dgemmt('C', CblasColMajor, CblasUpper,
+                                     CblasNoTrans, CblasNoTrans, dim, depth,
+                                     scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(dgemmt, c_api_colmajor_upper_M_100_K_50_a_trans_b_notrans)
+{
+    /* C interface, column-major: CblasUpper, A transposed, B not transposed */
+    const blasint dim = 100, depth = 50;
+    const blasint a_ld = 50, b_ld = 50, c_ld = 100;
+    const double scale_ab = 1.0, scale_c = 2.0;
+
+    const double diff = check_dgemmt('C', CblasColMajor, CblasUpper,
+                                     CblasTrans, CblasNoTrans, dim, depth,
+                                     scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(dgemmt, c_api_colmajor_upper_M_50_K_100_a_notrans_b_trans)
+{
+    /* C interface, column-major: CblasUpper, A not transposed, B transposed */
+    const blasint dim = 50, depth = 100;
+    const blasint a_ld = 50, b_ld = 50, c_ld = 50;
+    const double scale_ab = 1.0, scale_c = 0.0;
+
+    const double diff = check_dgemmt('C', CblasColMajor, CblasUpper,
+                                     CblasNoTrans, CblasTrans, dim, depth,
+                                     scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(dgemmt, c_api_colmajor_upper_M_50_K_50_a_trans_b_trans)
+{
+    /* C interface, column-major: CblasUpper, A and B transposed */
+    const blasint dim = 50, depth = 50;
+    const blasint a_ld = 50, b_ld = 50, c_ld = 50;
+    const double scale_ab = 1.5, scale_c = 2.0;
+
+    const double diff = check_dgemmt('C', CblasColMajor, CblasUpper,
+                                     CblasTrans, CblasTrans, dim, depth,
+                                     scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * alpha = 0.0
+ */
+CTEST(dgemmt, c_api_colmajor_upper_alpha_zero)
+{
+    /* C interface, column-major: CblasUpper, no transposition, alpha set to zero */
+    const blasint dim = 50, depth = 50;
+    const blasint a_ld = 50, b_ld = 50, c_ld = 50;
+    const double scale_ab = 0.0, scale_c = 2.0;
+
+    const double diff = check_dgemmt('C', CblasColMajor, CblasUpper,
+                                     CblasNoTrans, CblasNoTrans, dim, depth,
+                                     scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * beta = 1.0
+ */
+CTEST(dgemmt, c_api_colmajor_upper_beta_one)
+{
+    /* C interface, column-major: CblasUpper, no transposition, beta set to one */
+    const blasint dim = 50, depth = 50;
+    const blasint a_ld = 50, b_ld = 50, c_ld = 50;
+    const double scale_ab = 2.0, scale_c = 1.0;
+
+    const double diff = check_dgemmt('C', CblasColMajor, CblasUpper,
+                                     CblasNoTrans, CblasNoTrans, dim, depth,
+                                     scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(dgemmt, c_api_colmajor_lower_M_50_K_50_a_notrans_b_notrans)
+{
+    /* C interface, column-major: CblasLower, A and B not transposed */
+    const blasint dim = 50, depth = 50;
+    const blasint a_ld = 50, b_ld = 50, c_ld = 50;
+    const double scale_ab = 1.5, scale_c = 2.0;
+
+    const double diff = check_dgemmt('C', CblasColMajor, CblasLower,
+                                     CblasNoTrans, CblasNoTrans, dim, depth,
+                                     scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(dgemmt, c_api_colmajor_lower_M_100_K_50_a_trans_b_notrans)
+{
+    /* C interface, column-major: CblasLower, A transposed, B not transposed */
+    const blasint dim = 100, depth = 50;
+    const blasint a_ld = 50, b_ld = 50, c_ld = 100;
+    const double scale_ab = 1.0, scale_c = 2.0;
+
+    const double diff = check_dgemmt('C', CblasColMajor, CblasLower,
+                                     CblasTrans, CblasNoTrans, dim, depth,
+                                     scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(dgemmt, c_api_colmajor_lower_M_50_K_100_a_notrans_b_trans)
+{
+    /* C interface, column-major: CblasLower, A not transposed, B transposed */
+    const blasint dim = 50, depth = 100;
+    const blasint a_ld = 50, b_ld = 50, c_ld = 50;
+    const double scale_ab = 1.0, scale_c = 0.0;
+
+    const double diff = check_dgemmt('C', CblasColMajor, CblasLower,
+                                     CblasNoTrans, CblasTrans, dim, depth,
+                                     scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(dgemmt, c_api_colmajor_lower_M_50_K_50_a_trans_b_trans)
+{
+    /* C interface, column-major: CblasLower, A and B transposed */
+    const blasint dim = 50, depth = 50;
+    const blasint a_ld = 50, b_ld = 50, c_ld = 50;
+    const double scale_ab = 1.5, scale_c = 2.0;
+
+    const double diff = check_dgemmt('C', CblasColMajor, CblasLower,
+                                     CblasTrans, CblasTrans, dim, depth,
+                                     scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * alpha = 0.0
+ */
+CTEST(dgemmt, c_api_colmajor_lower_alpha_zero)
+{
+    /* C interface, column-major: CblasLower, no transposition, alpha set to zero */
+    const blasint dim = 50, depth = 50;
+    const blasint a_ld = 50, b_ld = 50, c_ld = 50;
+    const double scale_ab = 0.0, scale_c = 2.0;
+
+    const double diff = check_dgemmt('C', CblasColMajor, CblasLower,
+                                     CblasNoTrans, CblasNoTrans, dim, depth,
+                                     scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * beta = 1.0
+ */
+CTEST(dgemmt, c_api_colmajor_lower_beta_one)
+{
+    /* C interface, column-major: CblasLower, no transposition, beta set to one */
+    const blasint dim = 50, depth = 50;
+    const blasint a_ld = 50, b_ld = 50, c_ld = 50;
+    const double scale_ab = 2.0, scale_c = 1.0;
+
+    const double diff = check_dgemmt('C', CblasColMajor, CblasLower,
+                                     CblasNoTrans, CblasNoTrans, dim, depth,
+                                     scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(dgemmt, c_api_rowmajor_upper_M_50_K_50_a_notrans_b_notrans)
+{
+    /* C interface, row-major: CblasUpper, A and B not transposed */
+    const blasint dim = 50, depth = 50;
+    const blasint a_ld = 50, b_ld = 50, c_ld = 50;
+    const double scale_ab = 1.5, scale_c = 2.0;
+
+    const double diff = check_dgemmt('C', CblasRowMajor, CblasUpper,
+                                     CblasNoTrans, CblasNoTrans, dim, depth,
+                                     scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(dgemmt, c_api_rowmajor_upper_M_100_K_50_a_trans_b_notrans)
+{
+    /* C interface, row-major: CblasUpper, A transposed, B not transposed */
+    const blasint dim = 100, depth = 50;
+    const blasint a_ld = 100, b_ld = 100, c_ld = 100;
+    const double scale_ab = 1.0, scale_c = 2.0;
+
+    const double diff = check_dgemmt('C', CblasRowMajor, CblasUpper,
+                                     CblasTrans, CblasNoTrans, dim, depth,
+                                     scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(dgemmt, c_api_rowmajor_upper_M_50_K_100_a_notrans_b_trans)
+{
+    /* C interface, row-major: CblasUpper, A not transposed, B transposed */
+    const blasint dim = 50, depth = 100;
+    const blasint a_ld = 100, b_ld = 100, c_ld = 50;
+    const double scale_ab = 1.0, scale_c = 0.0;
+
+    const double diff = check_dgemmt('C', CblasRowMajor, CblasUpper,
+                                     CblasNoTrans, CblasTrans, dim, depth,
+                                     scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(dgemmt, c_api_rowmajor_upper_M_50_K_50_a_trans_b_trans)
+{
+    /* C interface, row-major: CblasUpper, A and B transposed */
+    const blasint dim = 50, depth = 50;
+    const blasint a_ld = 50, b_ld = 50, c_ld = 50;
+    const double scale_ab = 1.5, scale_c = 2.0;
+
+    const double diff = check_dgemmt('C', CblasRowMajor, CblasUpper,
+                                     CblasTrans, CblasTrans, dim, depth,
+                                     scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * alpha = 0.0
+ */
+CTEST(dgemmt, c_api_rowmajor_upper_alpha_zero)
+{
+    /* C interface, row-major: CblasUpper, no transposition, alpha set to zero */
+    const blasint dim = 50, depth = 50;
+    const blasint a_ld = 50, b_ld = 50, c_ld = 50;
+    const double scale_ab = 0.0, scale_c = 2.0;
+
+    const double diff = check_dgemmt('C', CblasRowMajor, CblasUpper,
+                                     CblasNoTrans, CblasNoTrans, dim, depth,
+                                     scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * beta = 1.0
+ */
+CTEST(dgemmt, c_api_rowmajor_upper_beta_one)
+{
+    /* C interface, row-major: CblasUpper, no transposition, beta set to one */
+    const blasint dim = 50, depth = 50;
+    const blasint a_ld = 50, b_ld = 50, c_ld = 50;
+    const double scale_ab = 2.0, scale_c = 1.0;
+
+    const double diff = check_dgemmt('C', CblasRowMajor, CblasUpper,
+                                     CblasNoTrans, CblasNoTrans, dim, depth,
+                                     scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(dgemmt, c_api_rowmajor_lower_M_50_K_50_a_notrans_b_notrans)
+{
+    /* C interface, row-major: CblasLower, A and B not transposed */
+    const blasint dim = 50, depth = 50;
+    const blasint a_ld = 50, b_ld = 50, c_ld = 50;
+    const double scale_ab = 1.5, scale_c = 2.0;
+
+    const double diff = check_dgemmt('C', CblasRowMajor, CblasLower,
+                                     CblasNoTrans, CblasNoTrans, dim, depth,
+                                     scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(dgemmt, c_api_rowmajor_lower_M_100_K_50_a_trans_b_notrans)
+{
+    /* C interface, row-major: CblasLower, A transposed, B not transposed */
+    const blasint dim = 100, depth = 50;
+    const blasint a_ld = 100, b_ld = 100, c_ld = 100;
+    const double scale_ab = 1.0, scale_c = 2.0;
+
+    const double diff = check_dgemmt('C', CblasRowMajor, CblasLower,
+                                     CblasTrans, CblasNoTrans, dim, depth,
+                                     scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(dgemmt, c_api_rowmajor_lower_M_50_K_100_a_notrans_b_trans)
+{
+    /* C interface, row-major: CblasLower, A not transposed, B transposed */
+    const blasint dim = 50, depth = 100;
+    const blasint a_ld = 100, b_ld = 100, c_ld = 50;
+    const double scale_ab = 1.0, scale_c = 0.0;
+
+    const double diff = check_dgemmt('C', CblasRowMajor, CblasLower,
+                                     CblasNoTrans, CblasTrans, dim, depth,
+                                     scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(dgemmt, c_api_rowmajor_lower_M_50_K_50_a_trans_b_trans)
+{
+    /* C interface, row-major: CblasLower, A and B transposed */
+    const blasint dim = 50, depth = 50;
+    const blasint a_ld = 50, b_ld = 50, c_ld = 50;
+    const double scale_ab = 1.5, scale_c = 2.0;
+
+    const double diff = check_dgemmt('C', CblasRowMajor, CblasLower,
+                                     CblasTrans, CblasTrans, dim, depth,
+                                     scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * alpha = 0.0
+ */
+CTEST(dgemmt, c_api_rowmajor_lower_alpha_zero)
+{
+    /* C interface, row-major: CblasLower, no transposition, alpha set to zero */
+    const blasint dim = 50, depth = 50;
+    const blasint a_ld = 50, b_ld = 50, c_ld = 50;
+    const double scale_ab = 0.0, scale_c = 2.0;
+
+    const double diff = check_dgemmt('C', CblasRowMajor, CblasLower,
+                                     CblasNoTrans, CblasNoTrans, dim, depth,
+                                     scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dgemmt by comparing it against dgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * beta = 1.0
+ */
+CTEST(dgemmt, c_api_rowmajor_lower_beta_one)
+{
+    /* C interface, row-major: CblasLower, no transposition, beta set to one */
+    const blasint dim = 50, depth = 50;
+    const blasint a_ld = 50, b_ld = 50, c_ld = 50;
+    const double scale_ab = 2.0, scale_c = 1.0;
+
+    const double diff = check_dgemmt('C', CblasRowMajor, CblasLower,
+                                     CblasNoTrans, CblasNoTrans, dim, depth,
+                                     scale_ab, a_ld, b_ld, scale_c, c_ld);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param uplo.
+ * Must be upper (U) or lower (L).
+ */
+CTEST(dgemmt, xerbla_uplo_invalid)
+{
+    /* Fortran interface with uplo = 'O'; expected info value is 1 */
+    const blasint dim = 50, depth = 50;
+    const blasint a_ld = 50, b_ld = 50, c_ld = 50;
+    const int bad_arg = 1;
+
+    const int ok = check_badargs('F', CblasColMajor, 'O', 'N', 'N',
+                                 dim, depth, a_ld, b_ld, c_ld, bad_arg);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param transa.
+ * Must be trans (T/C) or no-trans (N/R).
+ */
+CTEST(dgemmt, xerbla_transa_invalid)
+{
+    /* Fortran interface with transa = 'O'; expected info value is 2 */
+    const blasint dim = 50, depth = 50;
+    const blasint a_ld = 50, b_ld = 50, c_ld = 50;
+    const int bad_arg = 2;
+
+    const int ok = check_badargs('F', CblasColMajor, 'U', 'O', 'N',
+                                 dim, depth, a_ld, b_ld, c_ld, bad_arg);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param transb.
+ * Must be trans (T/C) or no-trans (N/R).
+ */
+CTEST(dgemmt, xerbla_transb_invalid)
+{
+    /* Fortran interface with transb = 'O'; expected info value is 3 */
+    const blasint dim = 50, depth = 50;
+    const blasint a_ld = 50, b_ld = 50, c_ld = 50;
+    const int bad_arg = 3;
+
+    const int ok = check_badargs('F', CblasColMajor, 'U', 'N', 'O',
+                                 dim, depth, a_ld, b_ld, c_ld, bad_arg);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param M.
+ * Must be non-negative.
+ */
+CTEST(dgemmt, xerbla_m_invalid)
+{
+    /* M = -1 is the deliberately invalid argument. */
+    char uplo = 'U', transa = 'N', transb = 'N';
+    blasint lda = 50, ldb = 50, ldc = 50;
+    blasint M = -1, K = 50;
+    int expected_info = 4;
+    int ok = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                           M, K, lda, ldb, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param K.
+ * Must be non-negative.
+ */
+CTEST(dgemmt, xerbla_k_invalid)
+{
+    /* K = -1 is the deliberately invalid argument. */
+    char uplo = 'U', transa = 'N', transb = 'N';
+    blasint lda = 50, ldb = 50, ldc = 50;
+    blasint M = 50, K = -1;
+    int expected_info = 5;
+    int ok = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                           M, K, lda, ldb, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param lda.
+ * Must be at least K if matrix A is transposed.
+ */
+CTEST(dgemmt, xerbla_lda_invalid)
+{
+    /* lda = 50 is smaller than K = 100 while A is transposed. */
+    char uplo = 'U', transa = 'T', transb = 'N';
+    blasint lda = 50, ldb = 100, ldc = 50;
+    blasint M = 50, K = 100;
+    int expected_info = 8;
+    int ok = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                           M, K, lda, ldb, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param ldb.
+ * Must be at least K if matrix B is not transposed.
+ */
+CTEST(dgemmt, xerbla_ldb_invalid)
+{
+    /* ldb = 50 is smaller than K = 100 while B is not transposed. */
+    char uplo = 'U', transa = 'N', transb = 'N';
+    blasint lda = 50, ldb = 50, ldc = 50;
+    blasint M = 50, K = 100;
+    int expected_info = 10;
+    int ok = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                           M, K, lda, ldb, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param ldc.
+ * Must be at least M.
+ */
+CTEST(dgemmt, xerbla_ldc_invalid)
+{
+    /* ldc = 50 is smaller than M = 100. */
+    char uplo = 'U', transa = 'T', transb = 'N';
+    blasint lda = 50, ldb = 50, ldc = 50;
+    blasint M = 100, K = 50;
+    int expected_info = 13;
+    int ok = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                           M, K, lda, ldb, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test.
+ * Test error function for an invalid param order.
+ * Must be column or row major.
+ */
+CTEST(dgemmt, xerbla_c_api_major_invalid)
+{
+    /* 'O' is passed in the order slot as the deliberately invalid value. */
+    int expected_info = 0;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    blasint M = 50, K = 50;
+    int ok = check_badargs('C', 'O', CblasUpper, CblasNoTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param uplo.
+ * Must be upper or lower.
+ */
+CTEST(dgemmt, xerbla_c_api_colmajor_uplo_invalid)
+{
+    /* 'O' is passed in the uplo slot as the deliberately invalid value. */
+    int expected_info = 1;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    blasint M = 50, K = 50;
+    int ok = check_badargs('C', CblasColMajor, 'O', CblasNoTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param transa.
+ * Must be trans or no-trans.
+ */
+CTEST(dgemmt, xerbla_c_api_colmajor_transa_invalid)
+{
+    /* 'O' is passed in the transa slot as the deliberately invalid value. */
+    int expected_info = 2;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    blasint M = 50, K = 50;
+    int ok = check_badargs('C', CblasColMajor, CblasUpper, 'O', CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param transb.
+ * Must be trans or no-trans.
+ */
+CTEST(dgemmt, xerbla_c_api_colmajor_transb_invalid)
+{
+    /* 'O' is passed in the transb slot as the deliberately invalid value. */
+    int expected_info = 3;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    blasint M = 50, K = 50;
+    int ok = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, 'O',
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param M.
+ * Must be non-negative.
+ */
+CTEST(dgemmt, xerbla_c_api_colmajor_m_invalid)
+{
+    /* M = -1 is the deliberately invalid argument. */
+    int expected_info = 4;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    blasint M = -1, K = 50;
+    int ok = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param K.
+ * Must be non-negative.
+ */
+CTEST(dgemmt, xerbla_c_api_colmajor_k_invalid)
+{
+    /* K = -1 is the deliberately invalid argument. */
+    int expected_info = 5;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    blasint M = 50, K = -1;
+    int ok = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param lda.
+ * Must be at least K if matrix A is transposed.
+ */
+CTEST(dgemmt, xerbla_c_api_colmajor_lda_invalid)
+{
+    /* lda = 50 is smaller than K = 100 while A is passed as CblasTrans. */
+    int expected_info = 8;
+    blasint lda = 50, ldb = 100, ldc = 50;
+    blasint M = 50, K = 100;
+    int ok = check_badargs('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param ldb.
+ * Must be at least K if matrix B is not transposed.
+ */
+CTEST(dgemmt, xerbla_c_api_colmajor_ldb_invalid)
+{
+    /* ldb = 50 is smaller than K = 100 while B is passed as CblasNoTrans. */
+    int expected_info = 10;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    blasint M = 50, K = 100;
+    int ok = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param ldc.
+ * Must be at least M.
+ */
+CTEST(dgemmt, xerbla_c_api_colmajor_ldc_invalid)
+{
+    /* ldc = 50 is smaller than M = 100. */
+    int expected_info = 13;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    blasint M = 100, K = 50;
+    int ok = check_badargs('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param uplo.
+ * Must be upper or lower.
+ */
+CTEST(dgemmt, xerbla_c_api_rowmajor_uplo_invalid)
+{
+    /* 'O' is passed in the uplo slot as the deliberately invalid value. */
+    int expected_info = 1;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    blasint M = 50, K = 50;
+    int ok = check_badargs('C', CblasRowMajor, 'O', CblasNoTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param transa.
+ * Must be trans or no-trans.
+ */
+CTEST(dgemmt, xerbla_c_api_rowmajor_transa_invalid)
+{
+    /* 'O' is passed in the transa slot as the deliberately invalid value. */
+    int expected_info = 2;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    blasint M = 50, K = 50;
+    int ok = check_badargs('C', CblasRowMajor, CblasUpper, 'O', CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param transb.
+ * Must be trans or no-trans.
+ */
+CTEST(dgemmt, xerbla_c_api_rowmajor_transb_invalid)
+{
+    /* 'O' is passed in the transb slot as the deliberately invalid value. */
+    int expected_info = 3;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    blasint M = 50, K = 50;
+    int ok = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, 'O',
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param M.
+ * Must be non-negative.
+ */
+CTEST(dgemmt, xerbla_c_api_rowmajor_m_invalid)
+{
+    /* M = -1 is the deliberately invalid argument. */
+    int expected_info = 4;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    blasint M = -1, K = 50;
+    int ok = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param K.
+ * Must be non-negative.
+ */
+CTEST(dgemmt, xerbla_c_api_rowmajor_k_invalid)
+{
+    /* K = -1 is the deliberately invalid argument. */
+    int expected_info = 5;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    blasint M = 50, K = -1;
+    int ok = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param lda.
+ * Must be at least K if matrix A is not transposed.
+ */
+CTEST(dgemmt, xerbla_c_api_rowmajor_lda_invalid)
+{
+    /* lda = 50 is smaller than K = 100; A and B are both passed as CblasNoTrans. */
+    int expected_info = 8;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    blasint M = 50, K = 100;
+    int ok = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param ldb.
+ * Must be at least K if matrix B is transposed.
+ */
+CTEST(dgemmt, xerbla_c_api_rowmajor_ldb_invalid)
+{
+    /* ldb = 50 is smaller than K = 100; A and B are both passed as CblasTrans. */
+    int expected_info = 10;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    blasint M = 50, K = 100;
+    int ok = check_badargs('C', CblasRowMajor, CblasUpper, CblasTrans, CblasTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param ldc.
+ * Must be at least M.
+ */
+CTEST(dgemmt, xerbla_c_api_rowmajor_ldc_invalid)
+{
+    /* ldc = 50 is smaller than M = 100. */
+    int expected_info = 13;
+    blasint lda = 100, ldb = 100, ldc = 50;
+    blasint M = 100, K = 50;
+    int ok = check_badargs('C', CblasRowMajor, CblasUpper, CblasTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_dimatcopy.c b/utest/test_extensions/test_dimatcopy.c
new file mode 100644
index 000000000..811c356b3
--- /dev/null
+++ b/utest/test_extensions/test_dimatcopy.c
@@ -0,0 +1,915 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+
+struct DATA_DIMATCOPY { /* static buffers shared by the dimatcopy tests */
+    double a_test[DATASIZE* DATASIZE]; /* matrix handed to dimatcopy; also receives its output */
+    double a_verify[DATASIZE* DATASIZE]; /* expected result written by the reference helpers */
+};
+
+#ifdef BUILD_DOUBLE
+static struct DATA_DIMATCOPY data_dimatcopy;
+
+/**
+ * Compare results computed by dimatcopy and reference func
+ *
+ * param api specifies tested api ('F' for Fortran, anything else for C)
+ * param order specifies row ('R') or column ('C') major order
+ * param trans specifies op(A), the transposition operation
+ * applied to the matrix A
+ * param rows specifies number of rows of A
+ * param cols specifies number of columns of A
+ * param alpha specifies scaling factor for matrix A
+ * param lda_src - leading dimension of the matrix A
+ * param lda_dst - leading dimension of output matrix A
+ * return norm of difference between openblas and reference func
+ */
+static double check_dimatcopy(char api, char order, char trans, blasint rows, blasint cols, double alpha,
+                             blasint lda_src, blasint lda_dst)
+{
+    /* Only 'T' and 'C' transpose; every other value is handled as a plain copy. */
+    int transposed = (trans == 'T' || trans == 'C');
+    blasint m, n;
+    blasint rows_out, cols_out;
+    /* Defaults match the fall-through branches below (row major, no transposition),
+     * so cblas_dimatcopy never receives an indeterminate enum value. */
+    enum CBLAS_ORDER corder = CblasRowMajor;
+    enum CBLAS_TRANSPOSE ctrans = CblasNoTrans;
+
+    if (order == 'C') {
+        n = rows; m = cols;
+    }
+    else {
+        m = rows; n = cols;
+    }
+    rows_out = transposed ? n : m;
+    cols_out = transposed ? m : n;
+
+    drand_generate(data_dimatcopy.a_test, lda_src*m);
+
+    /* Build the expected result in a_verify before a_test is overwritten in place. */
+    if (transposed) {
+        dtranspose(m, n, alpha, data_dimatcopy.a_test, lda_src, data_dimatcopy.a_verify, lda_dst);
+    }
+    else {
+        dcopy(m, n, alpha, data_dimatcopy.a_test, lda_src, data_dimatcopy.a_verify, lda_dst);
+    }
+
+    if (api == 'F') {
+        BLASFUNC(dimatcopy)(&order, &trans, &rows, &cols, &alpha, data_dimatcopy.a_test,
+                            &lda_src, &lda_dst);
+    }
+    else {
+        if (order == 'C') corder = CblasColMajor;
+        if (order == 'R') corder = CblasRowMajor;
+        if (trans == 'T') ctrans = CblasTrans;
+        if (trans == 'N') ctrans = CblasNoTrans;
+        if (trans == 'C') ctrans = CblasConjTrans;
+        if (trans == 'R') ctrans = CblasConjNoTrans;
+        cblas_dimatcopy(corder, ctrans, rows, cols, alpha, data_dimatcopy.a_test,
+                    lda_src, lda_dst);
+    }
+
+    // Find the differences between output matrix computed by dimatcopy and reference func
+    return dmatrix_difference(data_dimatcopy.a_test, data_dimatcopy.a_verify, cols_out, rows_out, lda_dst);
+}
+
+/**
+ * Check if error function was called with expected function name
+ * and param info
+ *
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation 
+ * applied to the matrix A
+ * param rows specifies number of rows of A
+ * param cols specifies number of columns of A
+ * param lda_src - leading dimension of the matrix A
+ * param lda_dst - leading dimension of output matrix A
+ * param expected_info - expected invalid parameter number
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char order, char trans, blasint rows, blasint cols,
+                         blasint lda_src, blasint lda_dst, int expected_info)
+{
+    double unit_alpha = 1.0;
+    double *matrix = data_dimatcopy.a_test;
+
+    /* Register the routine name and info value the error handler is expected to see. */
+    set_xerbla("DIMATCOPY", expected_info);
+    BLASFUNC(dimatcopy)(&order, &trans, &rows, &cols, &unit_alpha, matrix,
+                        &lda_src, &lda_dst);
+    return check_error();
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(dimatcopy, colmajor_trans_col_100_row_100_alpha_one)
+{
+    /* 100 x 100 column-major matrix, transposed, alpha = 1.0 */
+    char order = 'C', trans = 'T';
+    double alpha = 1.0;
+    blasint rows = 100, cols = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    double diff_norm;
+
+    diff_norm = check_dimatcopy('F', order, trans, rows, cols, alpha, lda_src, lda_dst);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(dimatcopy, colmajor_notrans_col_100_row_100_alpha_one)
+{
+    /* 100 x 100 column-major matrix, copy only, alpha = 1.0 */
+    char order = 'C', trans = 'N';
+    double alpha = 1.0;
+    blasint rows = 100, cols = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    double diff_norm;
+
+    diff_norm = check_dimatcopy('F', order, trans, rows, cols, alpha, lda_src, lda_dst);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * alpha = 0.0
+ */
+CTEST(dimatcopy, colmajor_trans_col_100_row_100_alpha_zero)
+{
+    /* 100 x 100 column-major matrix, transposed, alpha = 0.0 */
+    char order = 'C', trans = 'T';
+    double alpha = 0.0;
+    blasint rows = 100, cols = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    double diff_norm;
+
+    diff_norm = check_dimatcopy('F', order, trans, rows, cols, alpha, lda_src, lda_dst);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Square matrix
+ * alpha = 0.0
+ */
+CTEST(dimatcopy, colmajor_notrans_col_100_row_100_alpha_zero)
+{
+    /* 100 x 100 column-major matrix, copy only, alpha = 0.0 */
+    char order = 'C', trans = 'N';
+    double alpha = 0.0;
+    blasint rows = 100, cols = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    double diff_norm;
+
+    diff_norm = check_dimatcopy('F', order, trans, rows, cols, alpha, lda_src, lda_dst);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * alpha = 2.0
+ */
+CTEST(dimatcopy, colmajor_trans_col_100_row_100)
+{
+    /* 100 x 100 column-major matrix, transposed, alpha = 2.0 */
+    char order = 'C', trans = 'T';
+    double alpha = 2.0;
+    blasint rows = 100, cols = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    double diff_norm;
+
+    diff_norm = check_dimatcopy('F', order, trans, rows, cols, alpha, lda_src, lda_dst);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Square matrix
+ * alpha = 2.0
+ */
+CTEST(dimatcopy, colmajor_notrans_col_100_row_100)
+{
+    /* 100 x 100 column-major matrix, copy only, alpha = 2.0 */
+    char order = 'C', trans = 'N';
+    double alpha = 2.0;
+    blasint rows = 100, cols = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    double diff_norm;
+
+    diff_norm = check_dimatcopy('F', order, trans, rows, cols, alpha, lda_src, lda_dst);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 1.0
+ */
+CTEST(dimatcopy, colmajor_trans_col_50_row_100_alpha_one)
+{
+    /* 100 x 50 column-major matrix, transposed, alpha = 1.0 */
+    char order = 'C', trans = 'T';
+    double alpha = 1.0;
+    blasint rows = 100, cols = 50;
+    blasint lda_src = 100, lda_dst = 50;
+    double diff_norm;
+
+    diff_norm = check_dimatcopy('F', order, trans, rows, cols, alpha, lda_src, lda_dst);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 1.0
+ */
+CTEST(dimatcopy, colmajor_notrans_col_50_row_100_alpha_one)
+{
+    /* 100 x 50 column-major matrix, copy only, alpha = 1.0 */
+    char order = 'C', trans = 'N';
+    double alpha = 1.0;
+    blasint rows = 100, cols = 50;
+    blasint lda_src = 100, lda_dst = 100;
+    double diff_norm;
+
+    diff_norm = check_dimatcopy('F', order, trans, rows, cols, alpha, lda_src, lda_dst);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 0.0
+ */
+CTEST(dimatcopy, colmajor_trans_col_50_row_100_alpha_zero)
+{
+    /* 100 x 50 column-major matrix, transposed, alpha = 0.0 */
+    char order = 'C', trans = 'T';
+    double alpha = 0.0;
+    blasint rows = 100, cols = 50;
+    blasint lda_src = 100, lda_dst = 50;
+    double diff_norm;
+
+    diff_norm = check_dimatcopy('F', order, trans, rows, cols, alpha, lda_src, lda_dst);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 0.0
+ */
+CTEST(dimatcopy, colmajor_notrans_col_50_row_100_alpha_zero)
+{
+    /* 100 x 50 column-major matrix, copy only, alpha = 0.0 */
+    char order = 'C', trans = 'N';
+    double alpha = 0.0;
+    blasint rows = 100, cols = 50;
+    blasint lda_src = 100, lda_dst = 100;
+    double diff_norm;
+
+    diff_norm = check_dimatcopy('F', order, trans, rows, cols, alpha, lda_src, lda_dst);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 2.0
+ */
+CTEST(dimatcopy, colmajor_trans_col_50_row_100)
+{
+    /* 100 x 50 column-major matrix, transposed, alpha = 2.0 */
+    char order = 'C', trans = 'T';
+    double alpha = 2.0;
+    blasint rows = 100, cols = 50;
+    blasint lda_src = 100, lda_dst = 50;
+    double diff_norm;
+
+    diff_norm = check_dimatcopy('F', order, trans, rows, cols, alpha, lda_src, lda_dst);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 2.0
+ */
+CTEST(dimatcopy, colmajor_notrans_col_50_row_100)
+{
+    /* 100 x 50 column-major matrix, copy only, alpha = 2.0 */
+    char order = 'C', trans = 'N';
+    double alpha = 2.0;
+    blasint rows = 100, cols = 50;
+    blasint lda_src = 100, lda_dst = 100;
+    double diff_norm;
+
+    diff_norm = check_dimatcopy('F', order, trans, rows, cols, alpha, lda_src, lda_dst);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(dimatcopy, rowmajor_trans_col_100_row_100_alpha_one)
+{
+    /* 100 x 100 row-major matrix, transposed, alpha = 1.0 */
+    char order = 'R', trans = 'T';
+    double alpha = 1.0;
+    blasint rows = 100, cols = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    double diff_norm;
+
+    diff_norm = check_dimatcopy('F', order, trans, rows, cols, alpha, lda_src, lda_dst);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(dimatcopy, rowmajor_notrans_col_100_row_100_alpha_one)
+{
+    /* 100 x 100 row-major matrix, copy only, alpha = 1.0 */
+    char order = 'R', trans = 'N';
+    double alpha = 1.0;
+    blasint rows = 100, cols = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    double diff_norm;
+
+    diff_norm = check_dimatcopy('F', order, trans, rows, cols, alpha, lda_src, lda_dst);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Square matrix
+ * alpha = 0.0
+ */
+CTEST(dimatcopy, rowmajor_trans_col_100_row_100_alpha_zero)
+{
+    /* 100 x 100 row-major matrix, transposed, alpha = 0.0 */
+    char order = 'R', trans = 'T';
+    double alpha = 0.0;
+    blasint rows = 100, cols = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    double diff_norm;
+
+    diff_norm = check_dimatcopy('F', order, trans, rows, cols, alpha, lda_src, lda_dst);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Square matrix
+ * alpha = 0.0
+ */
+CTEST(dimatcopy, rowmajor_notrans_col_100_row_100_alpha_zero)
+{
+    /* 100 x 100 row-major matrix, copy only, alpha = 0.0 */
+    char order = 'R', trans = 'N';
+    double alpha = 0.0;
+    blasint rows = 100, cols = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    double diff_norm;
+
+    diff_norm = check_dimatcopy('F', order, trans, rows, cols, alpha, lda_src, lda_dst);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Square matrix
+ * alpha = 2.0
+ */
+CTEST(dimatcopy, rowmajor_trans_col_100_row_100)
+{
+    /* 100 x 100 row-major matrix, transposed, alpha = 2.0 */
+    char order = 'R', trans = 'T';
+    double alpha = 2.0;
+    blasint rows = 100, cols = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    double diff_norm;
+
+    diff_norm = check_dimatcopy('F', order, trans, rows, cols, alpha, lda_src, lda_dst);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Square matrix
+ * alpha = 2.0
+ */
+CTEST(dimatcopy, rowmajor_notrans_col_100_row_100)
+{
+    /* 100 x 100 row-major matrix, copy only, alpha = 2.0 */
+    char order = 'R', trans = 'N';
+    double alpha = 2.0;
+    blasint rows = 100, cols = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    double diff_norm;
+
+    diff_norm = check_dimatcopy('F', order, trans, rows, cols, alpha, lda_src, lda_dst);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 1.0
+ */
+CTEST(dimatcopy, rowmajor_trans_col_100_row_50_alpha_one)
+{
+    /* 50 x 100 row-major matrix, alpha = 1.0; 'C' is the same as trans for a real matrix */
+    char order = 'R', trans = 'C';
+    double alpha = 1.0;
+    blasint rows = 50, cols = 100;
+    blasint lda_src = 100, lda_dst = 50;
+    double diff_norm;
+
+    diff_norm = check_dimatcopy('F', order, trans, rows, cols, alpha, lda_src, lda_dst);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 1.0
+ */
+CTEST(dimatcopy, rowmajor_notrans_col_100_row_50_alpha_one)
+{
+    /* 50 x 100 row-major matrix, copy only, alpha = 1.0 */
+    char order = 'R', trans = 'N';
+    double alpha = 1.0;
+    blasint rows = 50, cols = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    double diff_norm;
+
+    diff_norm = check_dimatcopy('F', order, trans, rows, cols, alpha, lda_src, lda_dst);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 0.0
+ */
+CTEST(dimatcopy, rowmajor_trans_col_100_row_50_alpha_zero)
+{
+    /* 50 x 100 row-major matrix, alpha = 0.0; 'C' is the same as trans for a real matrix */
+    char order = 'R', trans = 'C';
+    double alpha = 0.0;
+    blasint rows = 50, cols = 100;
+    blasint lda_src = 100, lda_dst = 50;
+    double diff_norm;
+
+    diff_norm = check_dimatcopy('F', order, trans, rows, cols, alpha, lda_src, lda_dst);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 0.0
+ */
+CTEST(dimatcopy, rowmajor_notrans_col_100_row_50_alpha_zero)
+{
+    /* 50 x 100 row-major matrix, copy only, alpha = 0.0 */
+    char order = 'R', trans = 'N';
+    double alpha = 0.0;
+    blasint rows = 50, cols = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    double diff_norm;
+
+    diff_norm = check_dimatcopy('F', order, trans, rows, cols, alpha, lda_src, lda_dst);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 2.0
+ */
+CTEST(dimatcopy, rowmajor_trans_col_100_row_50)
+{
+    /* 50 x 100 row-major matrix, alpha = 2.0; 'C' is the same as trans for a real matrix */
+    char order = 'R', trans = 'C';
+    double alpha = 2.0;
+    blasint rows = 50, cols = 100;
+    blasint lda_src = 100, lda_dst = 50;
+    double diff_norm;
+
+    diff_norm = check_dimatcopy('F', order, trans, rows, cols, alpha, lda_src, lda_dst);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 2.0
+ */
+CTEST(dimatcopy, rowmajor_notrans_col_100_row_50)
+{
+    /* 50 x 100 row-major matrix, copy only, alpha = 2.0 */
+    char order = 'R', trans = 'N';
+    double alpha = 2.0;
+    blasint rows = 50, cols = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    double diff_norm;
+
+    diff_norm = check_dimatcopy('F', order, trans, rows, cols, alpha, lda_src, lda_dst);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * alpha = 2.0
+ */
+CTEST(dimatcopy, c_api_colmajor_trans_col_100_row_100)
+{
+    /* C API: 100 x 100 column-major matrix, transposed, alpha = 2.0 */
+    char order = 'C', trans = 'T';
+    double alpha = 2.0;
+    blasint rows = 100, cols = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    double diff_norm;
+
+    diff_norm = check_dimatcopy('C', order, trans, rows, cols, alpha, lda_src, lda_dst);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Square matrix
+ * alpha = 2.0
+ */
+CTEST(dimatcopy, c_api_colmajor_notrans_col_100_row_100)
+{
+    /* C API: 100 x 100 column-major matrix, copy only, alpha = 2.0 */
+    char order = 'C', trans = 'N';
+    double alpha = 2.0;
+    blasint rows = 100, cols = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    double diff_norm;
+
+    diff_norm = check_dimatcopy('C', order, trans, rows, cols, alpha, lda_src, lda_dst);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Square matrix
+ * alpha = 2.0
+ */
+CTEST(dimatcopy, c_api_rowmajor_trans_col_100_row_100)
+{
+    /* C API: 100 x 100 row-major matrix, transposed, alpha = 2.0 */
+    char order = 'R', trans = 'T';
+    double alpha = 2.0;
+    blasint rows = 100, cols = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    double diff_norm;
+
+    diff_norm = check_dimatcopy('C', order, trans, rows, cols, alpha, lda_src, lda_dst);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test dimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Square matrix
+ * alpha = 2.0
+ */
+CTEST(dimatcopy, c_api_rowmajor_notrans_col_100_row_100)
+{
+    /* C API: 100 x 100 row-major matrix, copy only, alpha = 2.0 */
+    char order = 'R', trans = 'N';
+    double alpha = 2.0;
+    blasint rows = 100, cols = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    double diff_norm;
+
+    diff_norm = check_dimatcopy('C', order, trans, rows, cols, alpha, lda_src, lda_dst);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Test error function for an invalid param order.
+ * Must be column (C) or row major (R).
+ */
+CTEST(dimatcopy, xerbla_invalid_order)
+{
+    /* order = 'O' is the deliberately invalid argument. */
+    char order = 'O', trans = 'T';
+    blasint rows = 100, cols = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    int expected_info = 1;
+
+    int ok = check_badargs(order, trans, rows, cols, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param trans.
+ * Must be trans (T/C) or no-trans (N/R).
+ */
+CTEST(dimatcopy, xerbla_invalid_trans)
+{
+    /* trans = 'O' is the deliberately invalid argument. */
+    char order = 'C', trans = 'O';
+    blasint rows = 100, cols = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    int expected_info = 2;
+
+    int ok = check_badargs(order, trans, rows, cols, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param lda_src.
+ * If matrices are stored using row major layout, 
+ * lda_src must be at least n.
+ */
+CTEST(dimatcopy, xerbla_rowmajor_invalid_lda)
+{
+    /* Row major: lda_src = 50 is smaller than cols = 100. */
+    char order = 'R', trans = 'T';
+    blasint rows = 50, cols = 100;
+    blasint lda_src = 50, lda_dst = 100;
+    int expected_info = 7;
+
+    int ok = check_badargs(order, trans, rows, cols, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param lda_src.
+ * If matrices are stored using column major layout,
+ * lda_src must be at least m.
+ */
+CTEST(dimatcopy, xerbla_colmajor_invalid_lda)
+{
+    /* Column major: lda_src = 50 is smaller than rows = 100. */
+    char order = 'C', trans = 'T';
+    blasint rows = 100, cols = 50;
+    blasint lda_src = 50, lda_dst = 100;
+    int expected_info = 7;
+
+    int ok = check_badargs(order, trans, rows, cols, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param lda_dst.
+ * If matrices are stored using row major layout and 
+ * there is no transposition, lda_dst must be at least n.
+ */
+CTEST(dimatcopy, xerbla_rowmajor_notrans_invalid_ldb)
+{
+    /* Row major, no transposition: lda_dst = 50 is smaller than cols = 100. */
+    char order = 'R', trans = 'N';
+    blasint rows = 50, cols = 100;
+    blasint lda_src = 100, lda_dst = 50;
+    int expected_info = 8;
+
+    int ok = check_badargs(order, trans, rows, cols, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param lda_dst.
+ * If matrices are stored using row major layout and 
+ * there is transposition, lda_dst must be at least m.
+ */
+CTEST(dimatcopy, xerbla_rowmajor_trans_invalid_ldb)
+{
+    /* Row major, transposition: lda_dst = 50 is smaller than rows = 100. */
+    char order = 'R', trans = 'T';
+    blasint rows = 100, cols = 50;
+    blasint lda_src = 100, lda_dst = 50;
+    int expected_info = 8;
+
+    int ok = check_badargs(order, trans, rows, cols, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param lda_dst.
+ * If matrices are stored using column major layout and 
+ * there is no transposition, lda_dst must be at least m.
+ */
+CTEST(dimatcopy, xerbla_colmajor_notrans_invalid_ldb)
+{
+    /* Column major, no transposition: lda_dst = 50 is smaller than rows = 100. */
+    char order = 'C', trans = 'N';
+    blasint rows = 100, cols = 50;
+    blasint lda_src = 100, lda_dst = 50;
+    int expected_info = 8;
+
+    int ok = check_badargs(order, trans, rows, cols, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param lda_dst.
+ * If matrices are stored using column major layout and 
+ * there is transposition, lda_dst must be at least n.
+ */
+CTEST(dimatcopy, xerbla_colmajor_trans_invalid_ldb)
+{
+    /* Column major, transposition: lda_dst = 50 is smaller than cols = 100. */
+    char order = 'C', trans = 'T';
+    blasint rows = 50, cols = 100;
+    blasint lda_src = 100, lda_dst = 50;
+    int expected_info = 8;
+
+    int ok = check_badargs(order, trans, rows, cols, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_domatcopy.c b/utest/test_extensions/test_domatcopy.c
new file mode 100644
index 000000000..e60b9c83d
--- /dev/null
+++ b/utest/test_extensions/test_domatcopy.c
@@ -0,0 +1,640 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+
+struct DATA_DOMATCOPY {                        /* work buffers for the domatcopy tests */
+    double a_test[DATASIZE * DATASIZE];        /* source matrix A passed to domatcopy */
+    double b_test[DATASIZE * DATASIZE];        /* destination matrix B passed to domatcopy */
+    double b_verify[DATASIZE * DATASIZE];      /* B as filled in by the reference helpers */
+};
+
+#ifdef BUILD_DOUBLE
+static struct DATA_DOMATCOPY data_domatcopy;   /* single instance used by the helper functions below */
+
+/**
+ * Compare results computed by domatcopy and reference func
+ *
+ * param api specifies tested api (C or Fortran)
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation
+ * applied to the matrix A
+ * param rows - number of rows of A
+ * param cols - number of columns of A
+ * param alpha - scaling factor for matrix B
+ * param lda - leading dimension of the matrix A
+ * param ldb - leading dimension of the matrix B
+ * return norm of difference between openblas and reference func
+ */
+static double check_domatcopy(char api, char order, char trans, blasint rows, blasint cols, double alpha,
+                             blasint lda, blasint ldb)
+{
+    /* Defaults keep the C API call well defined when order/trans is not a mapped letter */
+    enum CBLAS_ORDER corder = CblasRowMajor;
+    enum CBLAS_TRANSPOSE ctrans = CblasNoTrans;
+    const int transposed = (trans == 'T' || trans == 'C');
+    blasint b_rows, b_cols;
+    blasint m, n;
+
+    /* Column major input: swap the dimensions handed to the reference helpers */
+    if (order == 'C') {
+        m = cols; n = rows;
+    }
+    else {
+        m = rows; n = cols;
+    }
+
+    b_rows = transposed ? n : m;
+    b_cols = transposed ? m : n;
+
+    drand_generate(data_domatcopy.a_test, lda*m);
+
+    /* Reference result goes to b_verify; the tested routine writes b_test below */
+    if (transposed) {
+        dtranspose(m, n, alpha, data_domatcopy.a_test, lda, data_domatcopy.b_verify, ldb);
+    }
+    else {
+        dcopy(m, n, alpha, data_domatcopy.a_test, lda, data_domatcopy.b_verify, ldb);
+    }
+
+    if (api == 'F') {
+        BLASFUNC(domatcopy)(&order, &trans, &rows, &cols, &alpha, data_domatcopy.a_test,
+                            &lda, data_domatcopy.b_test, &ldb);
+    }
+    else {
+        if (order == 'C') corder = CblasColMajor;
+        if (order == 'R') corder = CblasRowMajor;
+        if (trans == 'T') ctrans = CblasTrans;
+        if (trans == 'N') ctrans = CblasNoTrans;
+        if (trans == 'C') ctrans = CblasConjTrans;
+        if (trans == 'R') ctrans = CblasConjNoTrans;
+        cblas_domatcopy(corder, ctrans, rows, cols, alpha, data_domatcopy.a_test,
+                    lda, data_domatcopy.b_test, ldb);
+    }
+
+    return dmatrix_difference(data_domatcopy.b_test, data_domatcopy.b_verify, b_cols, b_rows, ldb);
+}
+
+/**
+ * Call domatcopy through the Fortran interface with the given
+ * arguments and report whether the error handler was triggered
+ * with the expected routine name and info value
+ *
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation
+ * applied to the matrix A
+ * param rows - number of rows of A
+ * param cols - number of columns of A
+ * param lda - leading dimension of the matrix A
+ * param ldb - leading dimension of the matrix B
+ * param expected_info - expected invalid parameter number
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char order, char trans, blasint rows, blasint cols,
+                          blasint lda, blasint ldb, int expected_info)
+{
+    double unit_alpha = 1.0;
+    double *src = data_domatcopy.a_test;
+    double *dst = data_domatcopy.b_test;
+
+    set_xerbla("DOMATCOPY", expected_info);
+    BLASFUNC(domatcopy)(&order, &trans, &rows, &cols, &unit_alpha, src, &lda, dst, &ldb);
+
+    return check_error();
+}
+
+/**
+ * Fortran API specific test
+ * Checks domatcopy against the reference implementation
+ * for the following configuration:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(domatcopy, colmajor_trans_col_100_row_100)
+{
+    const char api = 'F';
+    const char order = 'C', trans = 'T';
+    const blasint rows = 100, cols = 100;
+    const blasint lda = 100, ldb = 100;
+    const double alpha = 1.0;
+    double diff;
+
+    diff = check_domatcopy(api, order, trans, rows, cols, alpha, lda, ldb);
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Checks domatcopy against the reference implementation
+ * for the following configuration:
+ *
+ * Column Major
+ * Copy only
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(domatcopy, colmajor_notrans_col_100_row_100)
+{
+    const char api = 'F';
+    const char order = 'C', trans = 'N';
+    const blasint rows = 100, cols = 100;
+    const blasint lda = 100, ldb = 100;
+    const double alpha = 1.0;
+    double diff;
+
+    diff = check_domatcopy(api, order, trans, rows, cols, alpha, lda, ldb);
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Checks domatcopy against the reference implementation
+ * for the following configuration:
+ *
+ * Column Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 2.0
+ */
+CTEST(domatcopy, colmajor_trans_col_50_row_100)
+{
+    const char api = 'F';
+    const char order = 'C', trans = 'T';
+    const blasint rows = 100, cols = 50;
+    const blasint lda = 100, ldb = 50;
+    const double alpha = 2.0;
+    double diff;
+
+    diff = check_domatcopy(api, order, trans, rows, cols, alpha, lda, ldb);
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Checks domatcopy against the reference implementation
+ * for the following configuration:
+ *
+ * Column Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 2.0
+ */
+CTEST(domatcopy, colmajor_notrans_col_50_row_100)
+{
+    const char api = 'F';
+    const char order = 'C', trans = 'N';
+    const blasint rows = 100, cols = 50;
+    const blasint lda = 100, ldb = 100;
+    const double alpha = 2.0;
+    double diff;
+
+    diff = check_domatcopy(api, order, trans, rows, cols, alpha, lda, ldb);
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Checks domatcopy against the reference implementation
+ * for the following configuration:
+ *
+ * Column Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 0.0
+ */
+CTEST(domatcopy, colmajor_trans_col_100_row_50)
+{
+    const char api = 'F';
+    const char order = 'C', trans = 'T';
+    const blasint rows = 50, cols = 100;
+    const blasint lda = 50, ldb = 100;
+    const double alpha = 0.0;
+    double diff;
+
+    diff = check_domatcopy(api, order, trans, rows, cols, alpha, lda, ldb);
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Checks domatcopy against the reference implementation
+ * for the following configuration:
+ *
+ * Column Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 0.0
+ */
+CTEST(domatcopy, colmajor_notrans_col_100_row_50)
+{
+    const char api = 'F';
+    const char order = 'C', trans = 'N';
+    const blasint rows = 50, cols = 100;
+    const blasint lda = 50, ldb = 50;
+    const double alpha = 0.0;
+    double diff;
+
+    diff = check_domatcopy(api, order, trans, rows, cols, alpha, lda, ldb);
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Checks domatcopy against the reference implementation
+ * for the following configuration:
+ *
+ * Row Major
+ * Transposition
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(domatcopy, rowmajor_trans_col_100_row_100)
+{
+    const char api = 'F';
+    const char order = 'R', trans = 'T';
+    const blasint rows = 100, cols = 100;
+    const blasint lda = 100, ldb = 100;
+    const double alpha = 1.0;
+    double diff;
+
+    diff = check_domatcopy(api, order, trans, rows, cols, alpha, lda, ldb);
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Checks domatcopy against the reference implementation
+ * for the following configuration:
+ *
+ * Row Major
+ * Copy only
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(domatcopy, rowmajor_notrans_col_100_row_100)
+{
+    const char api = 'F';
+    const char order = 'R', trans = 'N';
+    const blasint rows = 100, cols = 100;
+    const blasint lda = 100, ldb = 100;
+    const double alpha = 1.0;
+    double diff;
+
+    diff = check_domatcopy(api, order, trans, rows, cols, alpha, lda, ldb);
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Checks domatcopy against the reference implementation
+ * for the following configuration:
+ *
+ * Row Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 2.0
+ */
+CTEST(domatcopy, rowmajor_conjtrans_col_100_row_50)
+{
+    const char api = 'F';
+    const char order = 'R', trans = 'C'; // 'C' is the same as trans for real matrix
+    const blasint rows = 50, cols = 100;
+    const blasint lda = 100, ldb = 50;
+    const double alpha = 2.0;
+    double diff;
+
+    diff = check_domatcopy(api, order, trans, rows, cols, alpha, lda, ldb);
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Checks domatcopy against the reference implementation
+ * for the following configuration:
+ *
+ * Row Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 2.0
+ */
+CTEST(domatcopy, rowmajor_notrans_col_50_row_100)
+{
+    const char api = 'F';
+    const char order = 'R', trans = 'N';
+    const blasint rows = 100, cols = 50;
+    const blasint lda = 50, ldb = 50;
+    const double alpha = 2.0;
+    double diff;
+
+    diff = check_domatcopy(api, order, trans, rows, cols, alpha, lda, ldb);
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Checks domatcopy against the reference implementation
+ * for the following configuration:
+ *
+ * Row Major
+ * Transposition
+ * Matrix dimensions leave residues from 4 and 2 (specialize
+ * for rt case)
+ * alpha = 1.5
+ */
+CTEST(domatcopy, rowmajor_trans_col_27_row_27)
+{
+    const char api = 'F';
+    const char order = 'R', trans = 'T';
+    const blasint rows = 27, cols = 27;
+    const blasint lda = 27, ldb = 27;
+    const double alpha = 1.5;
+    double diff;
+
+    diff = check_domatcopy(api, order, trans, rows, cols, alpha, lda, ldb);
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Checks domatcopy against the reference implementation
+ * for the following configuration:
+ *
+ * Row Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 0.0
+ */
+CTEST(domatcopy, rowmajor_notrans_col_100_row_50)
+{
+    const char api = 'F';
+    const char order = 'R', trans = 'N';
+    const blasint rows = 50, cols = 100;
+    const blasint lda = 100, ldb = 100;
+    const double alpha = 0.0;
+    double diff;
+
+    diff = check_domatcopy(api, order, trans, rows, cols, alpha, lda, ldb);
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Checks domatcopy against the reference implementation
+ * for the following configuration:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(domatcopy, c_api_colmajor_trans_col_100_row_100)
+{
+    const char api = 'C';
+    const char order = 'C', trans = 'T';
+    const blasint rows = 100, cols = 100;
+    const blasint lda = 100, ldb = 100;
+    const double alpha = 1.0;
+    double diff;
+
+    diff = check_domatcopy(api, order, trans, rows, cols, alpha, lda, ldb);
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Checks domatcopy against the reference implementation
+ * for the following configuration:
+ *
+ * Column Major
+ * Copy only
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(domatcopy, c_api_colmajor_notrans_col_100_row_100)
+{
+    const char api = 'C';
+    const char order = 'C', trans = 'N';
+    const blasint rows = 100, cols = 100;
+    const blasint lda = 100, ldb = 100;
+    const double alpha = 1.0;
+    double diff;
+
+    diff = check_domatcopy(api, order, trans, rows, cols, alpha, lda, ldb);
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Checks domatcopy against the reference implementation
+ * for the following configuration:
+ *
+ * Row Major
+ * Transposition
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(domatcopy, c_api_rowmajor_trans_col_100_row_100)
+{
+    const char api = 'C';
+    const char order = 'R', trans = 'T';
+    const blasint rows = 100, cols = 100;
+    const blasint lda = 100, ldb = 100;
+    const double alpha = 1.0;
+    double diff;
+
+    diff = check_domatcopy(api, order, trans, rows, cols, alpha, lda, ldb);
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Checks domatcopy against the reference implementation
+ * for the following configuration:
+ *
+ * Row Major
+ * Copy only
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(domatcopy, c_api_rowmajor_notrans_col_100_row_100)
+{
+    const char api = 'C';
+    const char order = 'R', trans = 'N';
+    const blasint rows = 100, cols = 100;
+    const blasint lda = 100, ldb = 100;
+    const double alpha = 1.0;
+    double diff;
+
+    diff = check_domatcopy(api, order, trans, rows, cols, alpha, lda, ldb);
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Error-handler test for an invalid order argument.
+ * Order must be column (C) or row major (R).
+ */
+CTEST(domatcopy, xerbla_invalid_order)
+{
+    const char order = 'O', trans = 'T';
+    const blasint rows = 100, cols = 100;
+    const blasint lda = 100, ldb = 100;
+    const int info = 1;
+    int ok;
+
+    ok = check_badargs(order, trans, rows, cols, lda, ldb, info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Error-handler test for an invalid trans argument.
+ * Trans must be trans (T/C) or no-trans (N/R).
+ */
+CTEST(domatcopy, xerbla_invalid_trans)
+{
+    const char order = 'C', trans = 'O';
+    const blasint rows = 100, cols = 100;
+    const blasint lda = 100, ldb = 100;
+    const int info = 2;
+    int ok;
+
+    ok = check_badargs(order, trans, rows, cols, lda, ldb, info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Error-handler test for an invalid lda.
+ * With row major layout, lda must be at least
+ * the number of columns (n).
+ */
+CTEST(domatcopy, xerbla_rowmajor_invalid_lda)
+{
+    const char order = 'R', trans = 'T';
+    const blasint rows = 50, cols = 100;
+    const blasint lda = 50, ldb = 100;
+    const int info = 7;
+    int ok;
+
+    ok = check_badargs(order, trans, rows, cols, lda, ldb, info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Error-handler test for an invalid lda.
+ * With column major layout, lda must be at least
+ * the number of rows (m).
+ */
+CTEST(domatcopy, xerbla_colmajor_invalid_lda)
+{
+    const char order = 'C', trans = 'T';
+    const blasint rows = 100, cols = 50;
+    const blasint lda = 50, ldb = 100;
+    const int info = 7;
+    int ok;
+
+    ok = check_badargs(order, trans, rows, cols, lda, ldb, info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Error-handler test for an invalid ldb.
+ * With row major layout and no transposition,
+ * ldb must be at least the number of columns (n).
+ */
+CTEST(domatcopy, xerbla_rowmajor_notrans_invalid_ldb)
+{
+    const char order = 'R', trans = 'N';
+    const blasint rows = 50, cols = 100;
+    const blasint lda = 100, ldb = 50;
+    const int info = 9;
+    int ok;
+
+    ok = check_badargs(order, trans, rows, cols, lda, ldb, info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Error-handler test for an invalid ldb.
+ * With row major layout and transposition,
+ * ldb must be at least the number of rows (m).
+ */
+CTEST(domatcopy, xerbla_rowmajor_trans_invalid_ldb)
+{
+    const char order = 'R', trans = 'T';
+    const blasint rows = 100, cols = 50;
+    const blasint lda = 100, ldb = 50;
+    const int info = 9;
+    int ok;
+
+    ok = check_badargs(order, trans, rows, cols, lda, ldb, info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Error-handler test for an invalid ldb.
+ * With column major layout and no transposition,
+ * ldb must be at least the number of rows (m).
+ */
+CTEST(domatcopy, xerbla_colmajor_notrans_invalid_ldb)
+{
+    const char order = 'C', trans = 'N';
+    const blasint rows = 100, cols = 50;
+    const blasint lda = 100, ldb = 50;
+    const int info = 9;
+    int ok;
+
+    ok = check_badargs(order, trans, rows, cols, lda, ldb, info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Error-handler test for an invalid ldb.
+ * With column major layout and transposition,
+ * ldb must be at least the number of columns (n).
+ */
+CTEST(domatcopy, xerbla_colmajor_trans_invalid_ldb)
+{
+    const char order = 'C', trans = 'T';
+    const blasint rows = 50, cols = 100;
+    const blasint lda = 100, ldb = 50;
+    const int info = 9;
+    int ok;
+
+    ok = check_badargs(order, trans, rows, cols, lda, ldb, info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+#endif
diff --git a/utest/test_extensions/test_drotmg.c b/utest/test_extensions/test_drotmg.c
new file mode 100644
index 000000000..3073c8e3e
--- /dev/null
+++ b/utest/test_extensions/test_drotmg.c
@@ -0,0 +1,414 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#ifdef BUILD_DOUBLE
+
+/**
+ * Fortran API specific test
+ * Test drotmg by comparing it against pre-calculated values
+ * Inputs: d1 = 2, d2 = 2, x1 = 8, y1 = 0
+ */
+CTEST(drotmg, y1_zero)
+{
+	/* te_* : arguments handed to the routine under test */
+	double te_d1 = 2.0;
+	double te_d2 = 2.0;
+	double te_x1 = 8.0;
+	double te_y1 = 0.0;
+	double te_param[5];
+
+	/* tr_* : reference values as calculated by netlib blas */
+	double tr_d1 = 2.0;
+	double tr_d2 = 2.0;
+	double tr_x1 = 8.0;
+	double tr_y1 = 0.0;
+	double tr_param[5];
+	int idx;
+
+	/* the tested parameter vector starts out all zero */
+	for (idx = 0; idx < 5; ++idx) {
+		te_param[idx] = 0.0;
+	}
+
+	tr_param[0] = -2.0;
+	tr_param[1] = 0.0;
+	tr_param[2] = 0.0;
+	tr_param[3] = 0.0;
+	tr_param[4] = 0.0;
+
+	/* OpenBLAS */
+	BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param);
+
+	ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS);
+
+	/* every entry of the parameter vector is compared as well */
+	for (idx = 0; idx < 5; ++idx) {
+		ASSERT_DBL_NEAR_TOL(tr_param[idx], te_param[idx], DOUBLE_EPS);
+	}
+}
+
+/**
+ * Fortran API specific test
+ * Test drotmg by comparing it against pre-calculated values
+ * Inputs: d1 = -1, d2 = 2, x1 = 8, y1 = 8
+ */
+CTEST(drotmg, d1_negative)
+{
+	/* te_* : arguments handed to the routine under test */
+	double te_d1 = -1.0;
+	double te_d2 = 2.0;
+	double te_x1 = 8.0;
+	double te_y1 = 8.0;
+	double te_param[5];
+
+	/* tr_* : reference values as calculated by netlib blas */
+	double tr_d1 = 0.0;
+	double tr_d2 = 0.0;
+	double tr_x1 = 0.0;
+	double tr_y1 = 8.0;
+	double tr_param[5];
+	int idx;
+
+	/* the tested parameter vector starts out all zero */
+	for (idx = 0; idx < 5; ++idx) {
+		te_param[idx] = 0.0;
+	}
+
+	tr_param[0] = -1.0;
+	tr_param[1] = 0.0;
+	tr_param[2] = 0.0;
+	tr_param[3] = 0.0;
+	tr_param[4] = 0.0;
+
+	/* OpenBLAS */
+	BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param);
+
+	ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS);
+
+	/* every entry of the parameter vector is compared as well */
+	for (idx = 0; idx < 5; ++idx) {
+		ASSERT_DBL_NEAR_TOL(tr_param[idx], te_param[idx], DOUBLE_EPS);
+	}
+}
+
+/**
+ * Fortran API specific test
+ * Test drotmg by comparing it against pre-calculated values
+ * Inputs: d1 = 2, d2 = 2, x1 = 0, y1 = 8
+ */
+CTEST(drotmg, d1_positive_d2_positive_x1_zero)
+{
+	/* te_* : arguments handed to the routine under test */
+	double te_d1 = 2.0;
+	double te_d2 = 2.0;
+	double te_x1 = 0.0;
+	double te_y1 = 8.0;
+	double te_param[5];
+
+	/* tr_* : reference values as calculated by netlib blas */
+	double tr_d1 = 2.0;
+	double tr_d2 = 2.0;
+	double tr_x1 = 8.0;
+	double tr_y1 = 8.0;
+	double tr_param[5];
+	int idx;
+
+	/* the tested parameter vector starts out all zero */
+	for (idx = 0; idx < 5; ++idx) {
+		te_param[idx] = 0.0;
+	}
+
+	tr_param[0] = 1.0;
+	tr_param[1] = 0.0;
+	tr_param[2] = 0.0;
+	tr_param[3] = 0.0;
+	tr_param[4] = 0.0;
+
+	/* OpenBLAS */
+	BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param);
+
+	ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS);
+
+	/* every entry of the parameter vector is compared as well */
+	for (idx = 0; idx < 5; ++idx) {
+		ASSERT_DBL_NEAR_TOL(tr_param[idx], te_param[idx], DOUBLE_EPS);
+	}
+}
+
+/**
+ * Fortran API specific test
+ * Test drotmg by comparing it against pre-calculated values
+ * Inputs: d1 = 1, d2 = -2, x1 = 8, y1 = 8
+ */
+CTEST(drotmg, scaled_y_greater_than_scaled_x)
+{
+	/* te_* : arguments handed to the routine under test */
+	double te_d1 = 1.0;
+	double te_d2 = -2.0;
+	double te_x1 = 8.0;
+	double te_y1 = 8.0;
+	double te_param[5];
+
+	/* tr_* : reference values as calculated by netlib blas */
+	double tr_d1 = 0.0;
+	double tr_d2 = 0.0;
+	double tr_x1 = 0.0;
+	double tr_y1 = 8.0;
+	double tr_param[5];
+	int idx;
+
+	/* the tested parameter vector starts out all zero */
+	for (idx = 0; idx < 5; ++idx) {
+		te_param[idx] = 0.0;
+	}
+
+	tr_param[0] = -1.0;
+	tr_param[1] = 0.0;
+	tr_param[2] = 0.0;
+	tr_param[3] = 0.0;
+	tr_param[4] = 0.0;
+
+	/* OpenBLAS */
+	BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param);
+
+	ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS);
+
+	/* every entry of the parameter vector is compared as well */
+	for (idx = 0; idx < 5; ++idx) {
+		ASSERT_DBL_NEAR_TOL(tr_param[idx], te_param[idx], DOUBLE_EPS);
+	}
+}
+
+/**
+ * C API specific test
+ * Test drotmg by comparing it against pre-calculated values
+ * Inputs: d1 = 2, d2 = 2, x1 = 8, y1 = 0 (y1 is passed by value)
+ */
+CTEST(drotmg, c_api_y1_zero)
+{
+	/* te_* : arguments handed to the routine under test */
+	double te_d1 = 2.0;
+	double te_d2 = 2.0;
+	double te_x1 = 8.0;
+	double te_y1 = 0.0;
+	double te_param[5];
+
+	/* tr_* : reference values as calculated by netlib blas */
+	double tr_d1 = 2.0;
+	double tr_d2 = 2.0;
+	double tr_x1 = 8.0;
+	double tr_y1 = 0.0;
+	double tr_param[5];
+	int idx;
+
+	/* the tested parameter vector starts out all zero */
+	for (idx = 0; idx < 5; ++idx) {
+		te_param[idx] = 0.0;
+	}
+
+	tr_param[0] = -2.0;
+	tr_param[1] = 0.0;
+	tr_param[2] = 0.0;
+	tr_param[3] = 0.0;
+	tr_param[4] = 0.0;
+
+	/* OpenBLAS */
+	cblas_drotmg(&te_d1, &te_d2, &te_x1, te_y1, te_param);
+
+	ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS);
+
+	/* every entry of the parameter vector is compared as well */
+	for (idx = 0; idx < 5; ++idx) {
+		ASSERT_DBL_NEAR_TOL(tr_param[idx], te_param[idx], DOUBLE_EPS);
+	}
+}
+
+/**
+ * C API specific test
+ * Test drotmg by comparing it against pre-calculated values
+ * Inputs: d1 = -1, d2 = 2, x1 = 8, y1 = 8 (y1 is passed by value)
+ */
+CTEST(drotmg, c_api_d1_negative)
+{
+	/* te_* : arguments handed to the routine under test */
+	double te_d1 = -1.0;
+	double te_d2 = 2.0;
+	double te_x1 = 8.0;
+	double te_y1 = 8.0;
+	double te_param[5];
+
+	/* tr_* : reference values as calculated by netlib blas */
+	double tr_d1 = 0.0;
+	double tr_d2 = 0.0;
+	double tr_x1 = 0.0;
+	double tr_y1 = 8.0;
+	double tr_param[5];
+	int idx;
+
+	/* the tested parameter vector starts out all zero */
+	for (idx = 0; idx < 5; ++idx) {
+		te_param[idx] = 0.0;
+	}
+
+	tr_param[0] = -1.0;
+	tr_param[1] = 0.0;
+	tr_param[2] = 0.0;
+	tr_param[3] = 0.0;
+	tr_param[4] = 0.0;
+
+	/* OpenBLAS */
+	cblas_drotmg(&te_d1, &te_d2, &te_x1, te_y1, te_param);
+
+	ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS);
+
+	/* every entry of the parameter vector is compared as well */
+	for (idx = 0; idx < 5; ++idx) {
+		ASSERT_DBL_NEAR_TOL(tr_param[idx], te_param[idx], DOUBLE_EPS);
+	}
+}
+
+/**
+ * C API specific test
+ * Test drotmg by comparing it against pre-calculated values
+ * Inputs: d1 = 2, d2 = 2, x1 = 0, y1 = 8 (y1 is passed by value)
+ */
+CTEST(drotmg, c_api_d1_positive_d2_positive_x1_zero)
+{
+	/* te_* : arguments handed to the routine under test */
+	double te_d1 = 2.0;
+	double te_d2 = 2.0;
+	double te_x1 = 0.0;
+	double te_y1 = 8.0;
+	double te_param[5];
+
+	/* tr_* : reference values as calculated by netlib blas */
+	double tr_d1 = 2.0;
+	double tr_d2 = 2.0;
+	double tr_x1 = 8.0;
+	double tr_y1 = 8.0;
+	double tr_param[5];
+	int idx;
+
+	/* the tested parameter vector starts out all zero */
+	for (idx = 0; idx < 5; ++idx) {
+		te_param[idx] = 0.0;
+	}
+
+	tr_param[0] = 1.0;
+	tr_param[1] = 0.0;
+	tr_param[2] = 0.0;
+	tr_param[3] = 0.0;
+	tr_param[4] = 0.0;
+
+	/* OpenBLAS */
+	cblas_drotmg(&te_d1, &te_d2, &te_x1, te_y1, te_param);
+
+	ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS);
+
+	/* every entry of the parameter vector is compared as well */
+	for (idx = 0; idx < 5; ++idx) {
+		ASSERT_DBL_NEAR_TOL(tr_param[idx], te_param[idx], DOUBLE_EPS);
+	}
+}
+
+/**
+ * C API specific test
+ * Test drotmg by comparing it against pre-calculated values
+ * Inputs: d1 = 1, d2 = -2, x1 = 8, y1 = 8 (y1 is passed by value)
+ */
+CTEST(drotmg, c_api_scaled_y_greater_than_scaled_x)
+{
+	/* te_* : arguments handed to the routine under test */
+	double te_d1 = 1.0;
+	double te_d2 = -2.0;
+	double te_x1 = 8.0;
+	double te_y1 = 8.0;
+	double te_param[5];
+
+	/* tr_* : reference values as calculated by netlib blas */
+	double tr_d1 = 0.0;
+	double tr_d2 = 0.0;
+	double tr_x1 = 0.0;
+	double tr_y1 = 8.0;
+	double tr_param[5];
+	int idx;
+
+	/* the tested parameter vector starts out all zero */
+	for (idx = 0; idx < 5; ++idx) {
+		te_param[idx] = 0.0;
+	}
+
+	tr_param[0] = -1.0;
+	tr_param[1] = 0.0;
+	tr_param[2] = 0.0;
+	tr_param[3] = 0.0;
+	tr_param[4] = 0.0;
+
+	/* OpenBLAS */
+	cblas_drotmg(&te_d1, &te_d2, &te_x1, te_y1, te_param);
+
+	ASSERT_DBL_NEAR_TOL(tr_d1, te_d1, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_d2, te_d2, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_x1, te_x1, DOUBLE_EPS);
+	ASSERT_DBL_NEAR_TOL(tr_y1, te_y1, DOUBLE_EPS);
+
+	/* every entry of the parameter vector is compared as well */
+	for (idx = 0; idx < 5; ++idx) {
+		ASSERT_DBL_NEAR_TOL(tr_param[idx], te_param[idx], DOUBLE_EPS);
+	}
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_dsum.c b/utest/test_extensions/test_dsum.c
new file mode 100644
index 000000000..e987c5a42
--- /dev/null
+++ b/utest/test_extensions/test_dsum.c
@@ -0,0 +1,403 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#define ELEMENTS 50
+#define INCREMENT 2
+
+#ifdef BUILD_DOUBLE
+
+/**
+ * Fortran API specific test
+ * dsum called with N = 0 must return 0.0 whatever the array holds
+ */
+CTEST(dsum, bad_args_N_0){
+   double vec[ELEMENTS];
+   blasint n = 0, stride = 1;
+   blasint k;
+   for (k = 0; k < ELEMENTS * stride; ++k) {
+      vec[k] = (double)(1000 - k);
+   }
+   const double total = BLASFUNC(dsum)(&n, vec, &stride);
+   ASSERT_DBL_NEAR_TOL(0.0, total, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * dsum called with inc = 0 must return 0.0; x is fully initialised first
+ */
+CTEST(dsum, step_zero){
+   blasint i;
+   blasint N = ELEMENTS, inc = 0;
+   double x[ELEMENTS];
+   for (i = 0; i < N; i ++) { /* not N * inc: that bound is 0 when inc = 0 */
+      x[i] = i + 1000;
+   }
+   x[8] = 0.0;
+   double sum = BLASFUNC(dsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * dsum with N = 1, inc = 1 over {1.1}; expects 1.1
+ */
+CTEST(dsum, step_1_N_1){
+   double vec[] = {1.1};
+   blasint n = 1, stride = 1;
+
+   const double total = BLASFUNC(dsum)(&n, vec, &stride);
+   ASSERT_DBL_NEAR_TOL(1.1, total, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * dsum with N = 1, inc = 2 over a 2-entry array; expects 1.1
+ */
+CTEST(dsum, step_2_N_1){
+   double vec[] = {1.1, 0.0};
+   blasint n = 1, stride = 2;
+
+   const double total = BLASFUNC(dsum)(&n, vec, &stride);
+   ASSERT_DBL_NEAR_TOL(1.1, total, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * dsum with N = 2, inc = 1 over {1.1, -1.0}; expects 0.1
+ */
+CTEST(dsum, step_1_N_2){
+   double vec[] = {1.1, -1.0};
+   blasint n = 2, stride = 1;
+
+   const double total = BLASFUNC(dsum)(&n, vec, &stride);
+   ASSERT_DBL_NEAR_TOL(0.1, total, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * dsum with N = 2, inc = 2 over a 4-entry array; expects 1.1 + 1.0 = 2.1
+ */
+CTEST(dsum, step_2_N_2){
+   double vec[] = {1.1, -1.5, 1.0, 1.0};
+   blasint n = 2, stride = 2;
+
+   const double total = BLASFUNC(dsum)(&n, vec, &stride);
+   ASSERT_DBL_NEAR_TOL(2.1, total, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * dsum with N = 3, inc = 1 over {1.1, 1.0, 2.2}; expects 4.3
+ */
+CTEST(dsum, step_1_N_3){
+   double vec[] = {1.1, 1.0, 2.2};
+   blasint n = 3, stride = 1;
+
+   const double total = BLASFUNC(dsum)(&n, vec, &stride);
+   ASSERT_DBL_NEAR_TOL(4.3, total, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * dsum with N = 3, inc = 2 over a 6-entry array; expects 1.1 - 1.0 + 2.2 = 2.3
+ */
+CTEST(dsum, step_2_N_3){
+   double vec[] = {1.1, 0.0, -1.0, -3.0, 2.2, 3.0};
+   blasint n = 3, stride = 2;
+
+   const double total = BLASFUNC(dsum)(&n, vec, &stride);
+   ASSERT_DBL_NEAR_TOL(2.3, total, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * dsum with N = 4, inc = 1 over {1.1, 1.0, -2.2, 3.3}; expects 3.2
+ */
+CTEST(dsum, step_1_N_4){
+   double vec[] = {1.1, 1.0, -2.2, 3.3};
+   blasint n = 4, stride = 1;
+
+   const double total = BLASFUNC(dsum)(&n, vec, &stride);
+   ASSERT_DBL_NEAR_TOL(3.2, total, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * dsum with N = 4, inc = 2 over an 8-entry array; expects 1.1 + 1.0 + 2.2 - 3.3 = 1.0
+ */
+CTEST(dsum, step_2_N_4){
+   double vec[] = {1.1, 0.0, 1.0, 2.0, 2.2, 2.7, -3.3, -5.9};
+   blasint n = 4, stride = 2;
+
+   const double total = BLASFUNC(dsum)(&n, vec, &stride);
+   ASSERT_DBL_NEAR_TOL(1.0, total, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * dsum with N = 5, inc = 1 over {0.0, 1.0, 2.2, 3.3, 0.0}; expects 6.5
+ */
+CTEST(dsum, step_1_N_5){
+   double vec[] = {0.0, 1.0, 2.2, 3.3, 0.0};
+   blasint n = 5, stride = 1;
+
+   const double total = BLASFUNC(dsum)(&n, vec, &stride);
+   ASSERT_DBL_NEAR_TOL(6.5, total, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * dsum with N = 5, inc = 2 over a 10-entry array; expects 1.0 + 2.2 + 3.3 = 6.5
+ */
+CTEST(dsum, step_2_N_5){
+   double vec[] = {0.0, 3.0, 1.0, -2.2, 2.2, -1.7, 3.3, 14.5, 0.0, -9.0};
+   blasint n = 5, stride = 2;
+
+   const double total = BLASFUNC(dsum)(&n, vec, &stride);
+   ASSERT_DBL_NEAR_TOL(6.5, total, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * dsum with N = 50, inc = 1 over alternating +1.0 / -1.0 entries; expects 0.0
+ */
+CTEST(dsum, step_1_N_50){
+   double vec[ELEMENTS];
+   blasint n = ELEMENTS, stride = 1;
+   blasint k;
+   for (k = 0; k < n * stride; ++k) {
+      vec[k] = (k % 2 == 0) ? 1.0 : -1.0;
+   }
+   const double total = BLASFUNC(dsum)(&n, vec, &stride);
+   ASSERT_DBL_NEAR_TOL(0.0, total, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * dsum with N = 50, inc = 2 over alternating +1.0 / -1.0 entries; expects 50.0
+ */
+CTEST(dsum, step_2_N_50){
+   double vec[ELEMENTS * INCREMENT];
+   blasint n = ELEMENTS, stride = INCREMENT;
+   blasint k;
+   for (k = 0; k < n * stride; ++k) {
+      vec[k] = (k % 2 == 0) ? 1.0 : -1.0;
+   }
+   const double total = BLASFUNC(dsum)(&n, vec, &stride);
+   ASSERT_DBL_NEAR_TOL(50.0, total, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * cblas_dsum called with N = 0 must return 0.0 whatever the array holds
+ */
+CTEST(dsum, c_api_bad_args_N_0){
+   double vec[ELEMENTS];
+   blasint n = 0, stride = 1;
+   blasint k;
+   for (k = 0; k < ELEMENTS * stride; ++k) {
+      vec[k] = (double)(1000 - k);
+   }
+   const double total = cblas_dsum(n, vec, stride);
+   ASSERT_DBL_NEAR_TOL(0.0, total, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * cblas_dsum called with inc = 0 must return 0.0; x is fully initialised first
+ */
+CTEST(dsum, c_api_step_zero){
+   blasint i;
+   blasint N = ELEMENTS, inc = 0;
+   double x[ELEMENTS];
+   for (i = 0; i < N; i ++) { /* not N * inc: that bound is 0 when inc = 0 */
+      x[i] = i + 1000;
+   }
+   x[8] = 0.0;
+   double sum = cblas_dsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * cblas_dsum with N = 1, inc = 1 over {1.1}; expects 1.1
+ */
+CTEST(dsum, c_api_step_1_N_1){
+   double vec[] = {1.1};
+   blasint n = 1, stride = 1;
+
+   const double total = cblas_dsum(n, vec, stride);
+   ASSERT_DBL_NEAR_TOL(1.1, total, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * cblas_dsum with N = 1, inc = 2 over a 2-entry array; expects 1.1
+ */
+CTEST(dsum, c_api_step_2_N_1){
+   double vec[] = {1.1, 0.0};
+   blasint n = 1, stride = 2;
+
+   const double total = cblas_dsum(n, vec, stride);
+   ASSERT_DBL_NEAR_TOL(1.1, total, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * cblas_dsum with N = 2, inc = 1 over {1.1, -1.0}; expects 0.1
+ */
+CTEST(dsum, c_api_step_1_N_2){
+   double vec[] = {1.1, -1.0};
+   blasint n = 2, stride = 1;
+
+   const double total = cblas_dsum(n, vec, stride);
+   ASSERT_DBL_NEAR_TOL(0.1, total, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * cblas_dsum with N = 2, inc = 2 over a 4-entry array; expects 1.1 + 1.0 = 2.1
+ */
+CTEST(dsum, c_api_step_2_N_2){
+   double vec[] = {1.1, -1.5, 1.0, 1.0};
+   blasint n = 2, stride = 2;
+
+   const double total = cblas_dsum(n, vec, stride);
+   ASSERT_DBL_NEAR_TOL(2.1, total, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * cblas_dsum with N = 3, inc = 1 over {1.1, 1.0, 2.2}; expects 4.3
+ */
+CTEST(dsum, c_api_step_1_N_3){
+   double vec[] = {1.1, 1.0, 2.2};
+   blasint n = 3, stride = 1;
+
+   const double total = cblas_dsum(n, vec, stride);
+   ASSERT_DBL_NEAR_TOL(4.3, total, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * cblas_dsum with N = 3, inc = 2 over a 6-entry array; expects 1.1 - 1.0 + 2.2 = 2.3
+ */
+CTEST(dsum, c_api_step_2_N_3){
+   double vec[] = {1.1, 0.0, -1.0, -3.0, 2.2, 3.0};
+   blasint n = 3, stride = 2;
+
+   const double total = cblas_dsum(n, vec, stride);
+   ASSERT_DBL_NEAR_TOL(2.3, total, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * cblas_dsum with N = 4, inc = 1 over {1.1, 1.0, -2.2, 3.3}; expects 3.2
+ */
+CTEST(dsum, c_api_step_1_N_4){
+   double vec[] = {1.1, 1.0, -2.2, 3.3};
+   blasint n = 4, stride = 1;
+
+   const double total = cblas_dsum(n, vec, stride);
+   ASSERT_DBL_NEAR_TOL(3.2, total, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * cblas_dsum with N = 4, inc = 2 over an 8-entry array; expects 1.1 + 1.0 + 2.2 - 3.3 = 1.0
+ */
+CTEST(dsum, c_api_step_2_N_4){
+   double vec[] = {1.1, 0.0, 1.0, 2.0, 2.2, 2.7, -3.3, -5.9};
+   blasint n = 4, stride = 2;
+
+   const double total = cblas_dsum(n, vec, stride);
+   ASSERT_DBL_NEAR_TOL(1.0, total, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * cblas_dsum with N = 5, inc = 1 over {0.0, 1.0, 2.2, 3.3, 0.0}; expects 6.5
+ */
+CTEST(dsum, c_api_step_1_N_5){
+   double vec[] = {0.0, 1.0, 2.2, 3.3, 0.0};
+   blasint n = 5, stride = 1;
+
+   const double total = cblas_dsum(n, vec, stride);
+   ASSERT_DBL_NEAR_TOL(6.5, total, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * cblas_dsum with N = 5, inc = 2 over a 10-entry array; expects 1.0 + 2.2 + 3.3 = 6.5
+ */
+CTEST(dsum, c_api_step_2_N_5){
+   double vec[] = {0.0, 3.0, 1.0, -2.2, 2.2, -1.7, 3.3, 14.5, 0.0, -9.0};
+   blasint n = 5, stride = 2;
+
+   const double total = cblas_dsum(n, vec, stride);
+   ASSERT_DBL_NEAR_TOL(6.5, total, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * cblas_dsum with N = 50, inc = 1 over alternating +1.0 / -1.0 entries; expects 0.0
+ */
+CTEST(dsum, c_api_step_1_N_50){
+   double vec[ELEMENTS];
+   blasint n = ELEMENTS, stride = 1;
+   blasint k;
+   for (k = 0; k < n * stride; ++k) {
+      vec[k] = (k % 2 == 0) ? 1.0 : -1.0;
+   }
+   const double total = cblas_dsum(n, vec, stride);
+   ASSERT_DBL_NEAR_TOL(0.0, total, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * cblas_dsum with N = 50, inc = 2 over alternating +1.0 / -1.0 entries; expects 50.0
+ */
+CTEST(dsum, c_api_step_2_N_50){
+   double vec[ELEMENTS * INCREMENT];
+   blasint n = ELEMENTS, stride = INCREMENT;
+   blasint k;
+   for (k = 0; k < n * stride; ++k) {
+      vec[k] = (k % 2 == 0) ? 1.0 : -1.0;
+   }
+   const double total = cblas_dsum(n, vec, stride);
+   ASSERT_DBL_NEAR_TOL(50.0, total, DOUBLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_dzamax.c b/utest/test_extensions/test_dzamax.c
new file mode 100644
index 000000000..7bc0200c9
--- /dev/null
+++ b/utest/test_extensions/test_dzamax.c
@@ -0,0 +1,293 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#define ELEMENTS 70
+#define INCREMENT 2
+
+#ifdef BUILD_COMPLEX16
+
+/**
+ * dzamax called with N = 0 must return 0.0 whatever the array holds
+ */
+CTEST(dzamax, bad_args_N_0){
+   double z[ELEMENTS * 2];
+   blasint n = 0, stride = 1;
+   blasint k;
+   for (k = 0; k < 2 * ELEMENTS * stride; ++k) {
+      z[k] = (double)(1000 - k);
+   }
+   const double peak = BLASFUNC(dzamax)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(0.0, peak, DOUBLE_EPS);
+}
+
+/**
+ * dzamax called with inc = 0; expects |x[0]| + |x[1]| of the filled array
+ */
+CTEST(dzamax, step_zero){
+   double z[ELEMENTS * 2];
+   blasint n = ELEMENTS * 2, stride = 0;
+   blasint k;
+   for (k = 0; k < n; ++k) {
+      z[k] = (double)(k - 1000);
+   }
+   const double peak = BLASFUNC(dzamax)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(fabs(z[0]) + fabs(z[1]), peak, DOUBLE_EPS);
+}
+
+/**
+ * dzamax with N = 1, inc = 1 over the pair (1.0, 2.0); expects 3.0
+ */
+CTEST(dzamax, positive_step_1_N_1){
+   double z[] = {1.0, 2.0};
+   blasint n = 1, stride = 1;
+   const double peak = BLASFUNC(dzamax)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(3.0, peak, DOUBLE_EPS);
+}
+
+/**
+ * dzamax with N = 1, inc = 1 over the pair (-1.0, -2.0); expects 3.0
+ */
+CTEST(dzamax, negative_step_1_N_1){
+   double z[] = {-1.0, -2.0};
+   blasint n = 1, stride = 1;
+   const double peak = BLASFUNC(dzamax)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(3.0, peak, DOUBLE_EPS);
+}
+
+/**
+ * dzamax with N = 1, inc = 2 over a 4-double array of positive values; expects 3.0
+ */
+CTEST(dzamax, positive_step_2_N_1){
+   double z[] = {1.0, 2.0, 0.0, 0.0};
+   blasint n = 1, stride = 2;
+   const double peak = BLASFUNC(dzamax)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(3.0, peak, DOUBLE_EPS);
+}
+
+/**
+ * dzamax with N = 1, inc = 2 over a 4-double array of negative values; expects 3.0
+ */
+CTEST(dzamax, negative_step_2_N_1){
+   double z[] = {-1.0, -2.0, 0.0, 0.0};
+   blasint n = 1, stride = 2;
+   const double peak = BLASFUNC(dzamax)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(3.0, peak, DOUBLE_EPS);
+}
+
+/**
+ * dzamax with N = 2, inc = 1 over the pairs (1, 2), (0, 0); expects 3.0
+ */
+CTEST(dzamax, positive_step_1_N_2){
+   double z[] = {1.0, 2.0, 0.0, 0.0};
+   blasint n = 2, stride = 1;
+   const double peak = BLASFUNC(dzamax)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(3.0, peak, DOUBLE_EPS);
+}
+
+/**
+ * dzamax with N = 2, inc = 1 over the pairs (-1, -2), (0, 0); expects 3.0
+ */
+CTEST(dzamax, negative_step_1_N_2){
+   double z[] = {-1.0, -2.0, 0.0, 0.0};
+   blasint n = 2, stride = 1;
+   const double peak = BLASFUNC(dzamax)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(3.0, peak, DOUBLE_EPS);
+}
+
+/**
+ * dzamax with N = 2, inc = 2 over an 8-double array of positive values; expects 3.0
+ */
+CTEST(dzamax, positive_step_2_N_2){
+   double z[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0};
+   blasint n = 2, stride = 2;
+   const double peak = BLASFUNC(dzamax)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(3.0, peak, DOUBLE_EPS);
+}
+
+/**
+ * dzamax with N = 2, inc = 2 over an 8-double array of negative values; expects 3.0
+ */
+CTEST(dzamax, negative_step_2_N_2){
+   double z[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0};
+   blasint n = 2, stride = 2;
+   const double peak = BLASFUNC(dzamax)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(3.0, peak, DOUBLE_EPS);
+}
+
+/**
+ * dzamax with N = 3, inc = 1 over the pairs (1, 2), (0, 0), (2, 1); expects 3.0
+ */
+CTEST(dzamax, positive_step_1_N_3){
+   double z[] = {1.0, 2.0, 0.0, 0.0, 2.0, 1.0};
+   blasint n = 3, stride = 1;
+   const double peak = BLASFUNC(dzamax)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(3.0, peak, DOUBLE_EPS);
+}
+
+/**
+ * dzamax with N = 3, inc = 1 over the pairs (-1, -2), (0, 0), (-3, -1); expects 4.0
+ */
+CTEST(dzamax, negative_step_1_N_3){
+   double z[] = {-1.0, -2.0, 0.0, 0.0, -3.0, -1.0};
+   blasint n = 3, stride = 1;
+   const double peak = BLASFUNC(dzamax)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(4.0, peak, DOUBLE_EPS);
+}
+
+/**
+ * dzamax with N = 3, inc = 2 over a 12-double array of positive values; expects 4.0
+ */
+CTEST(dzamax, positive_step_2_N_3){
+   double z[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 0.0, 0.0};
+   blasint n = 3, stride = 2;
+   const double peak = BLASFUNC(dzamax)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(4.0, peak, DOUBLE_EPS);
+}
+
+/**
+ * dzamax with N = 3, inc = 2 over a 12-double array of negative values; expects 4.0
+ */
+CTEST(dzamax, negative_step_2_N_3){
+   double z[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -3.0, -1.0, 0.0, 0.0};
+   blasint n = 3, stride = 2;
+   const double peak = BLASFUNC(dzamax)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(4.0, peak, DOUBLE_EPS);
+}
+
+/**
+ * dzamax with N = 4, inc = 1 over the pairs (1, 2), (0, 0), (2, 1), (-2, -2); expects 4.0
+ */
+CTEST(dzamax, positive_step_1_N_4){
+   double z[] = {1.0, 2.0, 0.0, 0.0, 2.0, 1.0, -2.0, -2.0};
+   blasint n = 4, stride = 1;
+   const double peak = BLASFUNC(dzamax)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(4.0, peak, DOUBLE_EPS);
+}
+
+/**
+ * dzamax with N = 4, inc = 1 over the pairs (-1, -2), (0, 0), (-2, -1), (-2, -2); expects 4.0
+ */
+CTEST(dzamax, negative_step_1_N_4){
+   double z[] = {-1.0, -2.0, 0.0, 0.0, -2.0, -1.0, -2.0, -2.0};
+   blasint n = 4, stride = 1;
+   const double peak = BLASFUNC(dzamax)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(4.0, peak, DOUBLE_EPS);
+}
+
+/**
+ * dzamax with N = 4, inc = 2 over a 16-double array (mostly positive); expects 4.0
+ */
+CTEST(dzamax, positive_step_2_N_4){
+   double z[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0, -2.0, -2.0, 0.0, 0.0};
+   blasint n = 4, stride = 2;
+   const double peak = BLASFUNC(dzamax)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(4.0, peak, DOUBLE_EPS);
+}
+
+/**
+ * dzamax with N = 4, inc = 2 over a 16-double array of negative values; expects 4.0
+ */
+CTEST(dzamax, negative_step_2_N_4){
+   double z[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -2.0, -1.0, 0.0, 0.0, -2.0, -2.0, 0.0, 0.0};
+   blasint n = 4, stride = 2;
+   const double peak = BLASFUNC(dzamax)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(4.0, peak, DOUBLE_EPS);
+}
+
+/**
+ * dzamax with N = 70, inc = 1: ramp 0, 1, 2, ... with (1000, 1000) planted; expects 2000.0
+ */
+CTEST(dzamax, positive_step_1_N_70){
+   double z[ELEMENTS * 2];
+   blasint n = ELEMENTS, stride = 1;
+   blasint k;
+   for (k = 0; k < 2 * n * stride; ++k) {
+      z[k] = (double)k;
+   }
+   z[2 * 7 * stride] = 1000.0;
+   z[2 * 7 * stride + 1] = 1000.0;
+   const double peak = BLASFUNC(dzamax)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(2000.0, peak, DOUBLE_EPS);
+}
+
+/**
+ * dzamax with N = 70, inc = 1: ramp 0, -1, -2, ... with (1000, 1000) planted; expects 2000.0
+ */
+CTEST(dzamax, negative_step_1_N_70){
+   double z[ELEMENTS * 2];
+   blasint n = ELEMENTS, stride = 1;
+   blasint k;
+   for (k = 0; k < 2 * n * stride; ++k) {
+      z[k] = (double)(-k);
+   }
+   z[2 * 7 * stride] = 1000.0;
+   z[2 * 7 * stride + 1] = 1000.0;
+   const double peak = BLASFUNC(dzamax)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(2000.0, peak, DOUBLE_EPS);
+}
+
+/**
+ * dzamax with N = 70, inc = 2: ramp 0, 1, 2, ... with (1000, 1000) planted; expects 2000.0
+ */
+CTEST(dzamax, positive_step_2_N_70){
+   double z[ELEMENTS * INCREMENT * 2];
+   blasint n = ELEMENTS, stride = INCREMENT;
+   blasint k;
+   for (k = 0; k < 2 * n * stride; ++k) {
+      z[k] = (double)k;
+   }
+   z[2 * 7 * stride] = 1000.0;
+   z[2 * 7 * stride + 1] = 1000.0;
+   const double peak = BLASFUNC(dzamax)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(2000.0, peak, DOUBLE_EPS);
+}
+
+/**
+ * dzamax with N = 70, inc = 2: ramp 0, -1, -2, ... with (1000, 1000) planted; expects 2000.0
+ */
+CTEST(dzamax, negative_step_2_N_70){
+   double z[ELEMENTS * INCREMENT * 2];
+   blasint n = ELEMENTS, stride = INCREMENT;
+   blasint k;
+   for (k = 0; k < 2 * n * stride; ++k) {
+      z[k] = (double)(-k);
+   }
+   z[2 * 7 * stride] = 1000.0;
+   z[2 * 7 * stride + 1] = 1000.0;
+   const double peak = BLASFUNC(dzamax)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(2000.0, peak, DOUBLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_dzamin.c b/utest/test_extensions/test_dzamin.c
new file mode 100644
index 000000000..549881fdc
--- /dev/null
+++ b/utest/test_extensions/test_dzamin.c
@@ -0,0 +1,310 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#define ELEMENTS 70
+#define INCREMENT 2
+
+#ifdef BUILD_COMPLEX16
+
+/**
+ * dzamin called with N = 0 must return 0.0 whatever the array holds
+ */
+CTEST(dzamin, bad_args_N_0){
+   double z[ELEMENTS * 2];
+   blasint n = 0, stride = 1;
+   blasint k;
+   for (k = 0; k < 2 * ELEMENTS * stride; ++k) {
+      z[k] = (double)(1000 - k);
+   }
+   const double low = BLASFUNC(dzamin)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(0.0, low, DOUBLE_EPS);
+}
+
+/**
+ * dzamin called with inc = 0; expects |x[0]| + |x[1]| of the filled array
+ */
+CTEST(dzamin, step_zero){
+   double z[ELEMENTS * 2];
+   blasint n = ELEMENTS * 2, stride = 0;
+   blasint k;
+   for (k = 0; k < n; ++k) {
+      z[k] = (double)(k - 1000);
+   }
+   z[8] = 0.0;
+   const double low = BLASFUNC(dzamin)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(fabs(z[0]) + fabs(z[1]), low, DOUBLE_EPS);
+}
+
+/**
+ * dzamin with N = 1, inc = 1 over the pair (1.0, 2.0); expects 3.0
+ */
+CTEST(dzamin, positive_step_1_N_1){
+   double z[] = {1.0, 2.0};
+   blasint n = 1, stride = 1;
+
+   const double low = BLASFUNC(dzamin)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(3.0, low, DOUBLE_EPS);
+}
+
+/**
+ * dzamin with N = 1, inc = 1 over the pair (-1.0, -2.0); expects 3.0
+ */
+CTEST(dzamin, negative_step_1_N_1){
+   double z[] = {-1.0, -2.0};
+   blasint n = 1, stride = 1;
+
+   const double low = BLASFUNC(dzamin)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(3.0, low, DOUBLE_EPS);
+}
+
+/**
+ * dzamin with N = 1, inc = 2 over a 4-double array of positive values; expects 3.0
+ */
+CTEST(dzamin, positive_step_2_N_1){
+   double z[] = {1.0, 2.0, 0.0, 0.0};
+   blasint n = 1, stride = 2;
+
+   const double low = BLASFUNC(dzamin)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(3.0, low, DOUBLE_EPS);
+}
+
+/**
+ * dzamin with N = 1, inc = 2 over a 4-double array of negative values; expects 3.0
+ */
+CTEST(dzamin, negative_step_2_N_1){
+   double z[] = {-1.0, -2.0, 0.0, 0.0};
+   blasint n = 1, stride = 2;
+
+   const double low = BLASFUNC(dzamin)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(3.0, low, DOUBLE_EPS);
+}
+
+/**
+ * dzamin with N = 2, inc = 1 over the pairs (1, 2), (0, 0); expects 0.0
+ */
+CTEST(dzamin, positive_step_1_N_2){
+   double z[] = {1.0, 2.0, 0.0, 0.0};
+   blasint n = 2, stride = 1;
+
+   const double low = BLASFUNC(dzamin)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(0.0, low, DOUBLE_EPS);
+}
+
+/**
+ * dzamin with N = 2, inc = 1 over the pairs (-1, -2), (0, 0); expects 0.0
+ */
+CTEST(dzamin, negative_step_1_N_2){
+   double z[] = {-1.0, -2.0, 0.0, 0.0};
+   blasint n = 2, stride = 1;
+
+   const double low = BLASFUNC(dzamin)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(0.0, low, DOUBLE_EPS);
+}
+
+/**
+ * dzamin with N = 2, inc = 2 over an 8-double array of positive values; expects 2.0
+ */
+CTEST(dzamin, positive_step_2_N_2){
+   double z[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0};
+   blasint n = 2, stride = 2;
+
+   const double low = BLASFUNC(dzamin)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(2.0, low, DOUBLE_EPS);
+}
+
+/**
+ * dzamin with N = 2, inc = 2 over an 8-double array of negative values; expects 2.0
+ */
+CTEST(dzamin, negative_step_2_N_2){
+   double z[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0};
+   blasint n = 2, stride = 2;
+
+   const double low = BLASFUNC(dzamin)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(2.0, low, DOUBLE_EPS);
+}
+
+/**
+ * dzamin with N = 3, inc = 1 over the pairs (1, 2), (0, 0), (2, 1); expects 0.0
+ */
+CTEST(dzamin, positive_step_1_N_3){
+   double z[] = {1.0, 2.0, 0.0, 0.0, 2.0, 1.0};
+   blasint n = 3, stride = 1;
+
+   const double low = BLASFUNC(dzamin)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(0.0, low, DOUBLE_EPS);
+}
+
+/**
+ * dzamin with N = 3, inc = 1 over the pairs (-1, -2), (0, 0), (-2, -1); expects 0.0
+ */
+CTEST(dzamin, negative_step_1_N_3){
+   double z[] = {-1.0, -2.0, 0.0, 0.0, -2.0, -1.0};
+   blasint n = 3, stride = 1;
+
+   const double low = BLASFUNC(dzamin)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(0.0, low, DOUBLE_EPS);
+}
+
+/**
+ * dzamin with N = 3, inc = 2 over a 12-double array of positive values; expects 2.0
+ */
+CTEST(dzamin, positive_step_2_N_3){
+   double z[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0};
+   blasint n = 3, stride = 2;
+
+   const double low = BLASFUNC(dzamin)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(2.0, low, DOUBLE_EPS);
+}
+
+/**
+ * dzamin with N = 3, inc = 2 over a 12-double array of negative values; expects 2.0
+ */
+CTEST(dzamin, negative_step_2_N_3){
+   double z[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -2.0, -1.0, 0.0, 0.0};
+   blasint n = 3, stride = 2;
+
+   const double low = BLASFUNC(dzamin)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(2.0, low, DOUBLE_EPS);
+}
+
+/**
+ * dzamin with N = 4, inc = 1 over the pairs (1, 2), (0, 0), (2, 1), (-2, -2); expects 0.0
+ */
+CTEST(dzamin, positive_step_1_N_4){
+   double z[] = {1.0, 2.0, 0.0, 0.0, 2.0, 1.0, -2.0, -2.0};
+   blasint n = 4, stride = 1;
+
+   const double low = BLASFUNC(dzamin)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(0.0, low, DOUBLE_EPS);
+}
+
+/**
+ * dzamin with N = 4, inc = 1 over the pairs (-1, -2), (0, 0), (-2, -1), (-2, -2); expects 0.0
+ */
+CTEST(dzamin, negative_step_1_N_4){
+   double z[] = {-1.0, -2.0, 0.0, 0.0, -2.0, -1.0, -2.0, -2.0};
+   blasint n = 4, stride = 1;
+
+   const double low = BLASFUNC(dzamin)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(0.0, low, DOUBLE_EPS);
+}
+
+/**
+ * dzamin with N = 4, inc = 2 over a 16-double array (mostly positive); expects 2.0
+ */
+CTEST(dzamin, positive_step_2_N_4){
+   double z[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0, -2.0, -2.0, 0.0, 0.0};
+   blasint n = 4, stride = 2;
+
+   const double low = BLASFUNC(dzamin)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(2.0, low, DOUBLE_EPS);
+}
+
+/**
+ * dzamin with N = 4, inc = 2 over a 16-double array of negative values; expects 2.0
+ */
+CTEST(dzamin, negative_step_2_N_4){
+   double z[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -2.0, -1.0, 0.0, 0.0, -2.0, -2.0, 0.0, 0.0};
+   blasint n = 4, stride = 2;
+
+   const double low = BLASFUNC(dzamin)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(2.0, low, DOUBLE_EPS);
+}
+
+/**
+ * dzamin with N = 70, inc = 1: ramp 1000, 1001, ... with (0, 0) planted; expects 0.0
+ */
+CTEST(dzamin, positive_step_1_N_70){
+   double z[ELEMENTS * 2];
+   blasint n = ELEMENTS, stride = 1;
+   blasint k;
+   for (k = 0; k < 2 * n * stride; ++k) {
+      z[k] = (double)(k + 1000);
+   }
+   z[2 * 7 * stride] = 0.0;
+   z[2 * 7 * stride + 1] = 0.0;
+   const double low = BLASFUNC(dzamin)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(0.0, low, DOUBLE_EPS);
+}
+
+/**
+ * dzamin with N = 70, inc = 1: ramp -1000, -1001, ... with (0, 0) planted; expects 0.0
+ */
+CTEST(dzamin, negative_step_1_N_70){
+   double z[ELEMENTS * 2];
+   blasint n = ELEMENTS, stride = 1;
+   blasint k;
+   for (k = 0; k < 2 * n * stride; ++k) {
+      z[k] = (double)(-k - 1000);
+   }
+   z[2 * 7 * stride] = 0.0;
+   z[2 * 7 * stride + 1] = 0.0;
+   const double low = BLASFUNC(dzamin)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(0.0, low, DOUBLE_EPS);
+}
+
+/**
+ * dzamin with N = 70, inc = 2: ramp 1000, 1001, ... with (0, 0) planted; expects 0.0
+ */
+CTEST(dzamin, positive_step_2_N_70){
+   double z[ELEMENTS * INCREMENT * 2];
+   blasint n = ELEMENTS, stride = INCREMENT;
+   blasint k;
+   for (k = 0; k < 2 * n * stride; ++k) {
+      z[k] = (double)(k + 1000);
+   }
+   z[2 * 7 * stride] = 0.0;
+   z[2 * 7 * stride + 1] = 0.0;
+   const double low = BLASFUNC(dzamin)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(0.0, low, DOUBLE_EPS);
+}
+
+/**
+ * dzamin with N = 70, inc = 2: ramp -1000, -1001, ... with (0, 0) planted; expects 0.0
+ */
+CTEST(dzamin, negative_step_2_N_70){
+   double z[ELEMENTS * INCREMENT * 2];
+   blasint n = ELEMENTS, stride = INCREMENT;
+   blasint k;
+   for (k = 0; k < 2 * n * stride; ++k) {
+      z[k] = (double)(-k - 1000);
+   }
+   z[2 * 7 * stride] = 0.0;
+   z[2 * 7 * stride + 1] = 0.0;
+   const double low = BLASFUNC(dzamin)(&n, z, &stride);
+   ASSERT_DBL_NEAR_TOL(0.0, low, DOUBLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_dzsum.c b/utest/test_extensions/test_dzsum.c
new file mode 100644
index 000000000..5139f59cb
--- /dev/null
+++ b/utest/test_extensions/test_dzsum.c
@@ -0,0 +1,403 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#define ELEMENTS 50
+#define INCREMENT 2
+
+#ifdef BUILD_COMPLEX16
+
+/**
+ * Fortran API specific test
+ * dzsum called with N = 0 on a fully populated vector:
+ * the expected sum is 0.0
+ */
+CTEST(dzsum, bad_args_N_0){
+   blasint N = 0;
+   blasint inc = 1;
+   blasint k;
+   double x[ELEMENTS * 2];
+
+   for (k = 0; k < 2 * ELEMENTS * inc; k++) {
+      x[k] = 1000 - k;
+   }
+
+   double result = BLASFUNC(dzsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, result, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * dzsum called with a zero stride (inc = 0): the expected sum is 0.0
+ */
+CTEST(dzsum, step_zero){
+   blasint i;
+   blasint N = ELEMENTS, inc = 0;
+   /* N complex values need 2 * ELEMENTS doubles */
+   double x[ELEMENTS * 2];
+   /* inc is 0 here, so the fill bound must not be multiplied by inc:
+    * otherwise the loop never runs and x is handed to dzsum uninitialised */
+   for (i = 0; i < N * 2; i ++) {
+      x[i] = i + 1000;
+   }
+   x[8] = 0.0;
+   double sum = BLASFUNC(dzsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * dzsum, N = 1, inc = 1: the components of the single element
+ * (1.1, -1.0) add up to 0.1
+ */
+CTEST(dzsum, step_1_N_1){
+   double x[] = {1.1, -1.0};
+   blasint inc = 1;
+   blasint N = 1;
+
+   double result = BLASFUNC(dzsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.1, result, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * dzsum, N = 1, inc = 2: only the first element (1.1, 0.0) contributes,
+ * so the expected sum is 1.1
+ */
+CTEST(dzsum, step_2_N_1){
+   double x[] = {1.1, 0.0, 2.3, -1.0};
+   blasint inc = 2;
+   blasint N = 1;
+
+   double result = BLASFUNC(dzsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.1, result, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * dzsum, N = 2, inc = 1: (1.1, -1.0) and (2.3, -1.0) add up to 1.4
+ */
+CTEST(dzsum, step_1_N_2){
+   double x[] = {1.1, -1.0, 2.3, -1.0};
+   blasint inc = 1;
+   blasint N = 2;
+
+   double result = BLASFUNC(dzsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.4, result, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * dzsum, N = 2, inc = 2: the visited elements (1.1, -1.5) and (1.0, 1.0)
+ * add up to 1.6
+ */
+CTEST(dzsum, step_2_N_2){
+   double x[] = {1.1, -1.5, 1.1, -1.0, 1.0, 1.0, 1.1, -1.0};
+   blasint inc = 2;
+   blasint N = 2;
+
+   double result = BLASFUNC(dzsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.6, result, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * dzsum, N = 3, inc = 1: all six components add up to 4.4
+ */
+CTEST(dzsum, step_1_N_3){
+   double x[] = {1.1, 1.0, 2.2, 1.1, -1.0, 0.0};
+   blasint inc = 1;
+   blasint N = 3;
+
+   double result = BLASFUNC(dzsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(4.4, result, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * dzsum, N = 3, inc = 2: the visited elements (1.1, 0.0), (-1.0, -3.0)
+ * and (2.2, 3.0) add up to 2.3
+ */
+CTEST(dzsum, step_2_N_3){
+   double x[] = {1.1, 0.0, -1.0, 0.0, -1.0, -3.0, -1.0, 0.0, 2.2, 3.0, -1.0, 0.0};
+   blasint inc = 2;
+   blasint N = 3;
+
+   double result = BLASFUNC(dzsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2.3, result, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * dzsum, N = 4, inc = 1: all eight components add up to 6.4
+ */
+CTEST(dzsum, step_1_N_4){
+   double x[] = {1.1, 1.0, -2.2, 3.3, 1.1, 1.0, -2.2, 3.3};
+   blasint inc = 1;
+   blasint N = 4;
+
+   double result = BLASFUNC(dzsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(6.4, result, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * dzsum, N = 4, inc = 2: the visited elements (1.1, 0.0), (1.0, 2.0),
+ * (2.2, 2.7) and (-3.3, -5.9) add up to -0.2
+ */
+CTEST(dzsum, step_2_N_4){
+   double x[] = {1.1, 0.0, 1.1, 1.0, 1.0, 2.0, 1.1, 1.0, 2.2, 2.7, 1.1, 1.0, -3.3, -5.9};
+   blasint inc = 2;
+   blasint N = 4;
+
+   double result = BLASFUNC(dzsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(-0.2, result, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * dzsum, N = 5, inc = 1: all ten components add up to 13.0
+ */
+CTEST(dzsum, step_1_N_5){
+   double x[] = {0.0, 1.0, 2.2, 3.3, 0.0, 0.0, 1.0, 2.2, 3.3, 0.0};
+   blasint inc = 1;
+   blasint N = 5;
+
+   double result = BLASFUNC(dzsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(13.0, result, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * dzsum, N = 5, inc = 2: the visited elements (0.0, 3.0), (1.0, -2.2),
+ * (2.2, -1.7), (3.3, 14.5) and (0.0, -9.0) add up to 11.1
+ */
+CTEST(dzsum, step_2_N_5){
+   double x[] = {0.0, 3.0, 1.0, 2.2, 1.0, -2.2, 1.0, 2.2, 2.2, -1.7, 1.0, 2.2, 3.3, 14.5, 1.0, 2.2, 0.0, -9.0, 1.0, 2.2};
+   blasint inc = 2;
+   blasint N = 5;
+
+   double result = BLASFUNC(dzsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(11.1, result, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * dzsum, N = ELEMENTS, inc = 1: every element is (1.0, -1.0),
+ * so the expected sum is 0.0
+ */
+CTEST(dzsum, step_1_N_50){
+   blasint N = ELEMENTS;
+   blasint inc = 1;
+   blasint k;
+   double x[ELEMENTS * 2];
+
+   /* even slots (real parts) get 1.0, odd slots (imaginary parts) get -1.0 */
+   for (k = 0; k < 2 * N * inc; k++) {
+      x[k] = (k % 2 == 0) ? 1.0 : -1.0;
+   }
+
+   double result = BLASFUNC(dzsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, result, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * dzsum, N = ELEMENTS, inc = INCREMENT: every element is (1.0, -1.0),
+ * so the expected sum is 0.0
+ */
+CTEST(dzsum, step_2_N_50){
+   blasint N = ELEMENTS;
+   blasint inc = INCREMENT;
+   blasint k;
+   double x[ELEMENTS * INCREMENT * 2];
+
+   /* even slots (real parts) get 1.0, odd slots (imaginary parts) get -1.0 */
+   for (k = 0; k < 2 * N * inc; k++) {
+      x[k] = (k % 2 == 0) ? 1.0 : -1.0;
+   }
+
+   double result = BLASFUNC(dzsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0, result, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * cblas_dzsum called with N = 0 on a fully populated vector:
+ * the expected sum is 0.0
+ */
+CTEST(dzsum, c_api_bad_args_N_0){
+   blasint N = 0;
+   blasint inc = 1;
+   blasint k;
+   double x[ELEMENTS * 2];
+
+   for (k = 0; k < 2 * ELEMENTS * inc; k++) {
+      x[k] = 1000 - k;
+   }
+
+   double result = cblas_dzsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(0.0, result, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * cblas_dzsum called with a zero stride (inc = 0): the expected sum is 0.0
+ */
+CTEST(dzsum, c_api_step_zero){
+   blasint i;
+   blasint N = ELEMENTS, inc = 0;
+   /* N complex values need 2 * ELEMENTS doubles */
+   double x[ELEMENTS * 2];
+   /* inc is 0 here, so the fill bound must not be multiplied by inc:
+    * otherwise the loop never runs and x is handed to cblas_dzsum
+    * uninitialised */
+   for (i = 0; i < N * 2; i ++) {
+      x[i] = i + 1000;
+   }
+   x[8] = 0.0;
+   double sum = cblas_dzsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(0.0, sum, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * cblas_dzsum, N = 1, inc = 1: the components of the single element
+ * (1.1, -1.0) add up to 0.1
+ */
+CTEST(dzsum, c_api_step_1_N_1){
+   double x[] = {1.1, -1.0};
+   blasint inc = 1;
+   blasint N = 1;
+
+   double result = cblas_dzsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(0.1, result, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * cblas_dzsum, N = 1, inc = 2: only the first element (1.1, 0.0)
+ * contributes, so the expected sum is 1.1
+ */
+CTEST(dzsum, c_api_step_2_N_1){
+   double x[] = {1.1, 0.0, 2.3, -1.0};
+   blasint inc = 2;
+   blasint N = 1;
+
+   double result = cblas_dzsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(1.1, result, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * cblas_dzsum, N = 2, inc = 1: (1.1, -1.0) and (2.3, -1.0) add up to 1.4
+ */
+CTEST(dzsum, c_api_step_1_N_2){
+   double x[] = {1.1, -1.0, 2.3, -1.0};
+   blasint inc = 1;
+   blasint N = 2;
+
+   double result = cblas_dzsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(1.4, result, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * cblas_dzsum, N = 2, inc = 2: the visited elements (1.1, -1.5) and
+ * (1.0, 1.0) add up to 1.6
+ */
+CTEST(dzsum, c_api_step_2_N_2){
+   double x[] = {1.1, -1.5, 1.1, -1.0, 1.0, 1.0, 1.1, -1.0};
+   blasint inc = 2;
+   blasint N = 2;
+
+   double result = cblas_dzsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(1.6, result, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * cblas_dzsum, N = 3, inc = 1: all six components add up to 4.4
+ */
+CTEST(dzsum, c_api_step_1_N_3){
+   double x[] = {1.1, 1.0, 2.2, 1.1, -1.0, 0.0};
+   blasint inc = 1;
+   blasint N = 3;
+
+   double result = cblas_dzsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(4.4, result, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * cblas_dzsum, N = 3, inc = 2: the visited elements (1.1, 0.0),
+ * (-1.0, -3.0) and (2.2, 3.0) add up to 2.3
+ */
+CTEST(dzsum, c_api_step_2_N_3){
+   double x[] = {1.1, 0.0, -1.0, 0.0, -1.0, -3.0, -1.0, 0.0, 2.2, 3.0, -1.0, 0.0};
+   blasint inc = 2;
+   blasint N = 3;
+
+   double result = cblas_dzsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(2.3, result, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * cblas_dzsum, N = 4, inc = 1: all eight components add up to 6.4
+ */
+CTEST(dzsum, c_api_step_1_N_4){
+   double x[] = {1.1, 1.0, -2.2, 3.3, 1.1, 1.0, -2.2, 3.3};
+   blasint inc = 1;
+   blasint N = 4;
+
+   double result = cblas_dzsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(6.4, result, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * cblas_dzsum, N = 4, inc = 2: the visited elements (1.1, 0.0),
+ * (1.0, 2.0), (2.2, 2.7) and (-3.3, -5.9) add up to -0.2
+ */
+CTEST(dzsum, c_api_step_2_N_4){
+   double x[] = {1.1, 0.0, 1.1, 1.0, 1.0, 2.0, 1.1, 1.0, 2.2, 2.7, 1.1, 1.0, -3.3, -5.9};
+   blasint inc = 2;
+   blasint N = 4;
+
+   double result = cblas_dzsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(-0.2, result, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * cblas_dzsum, N = 5, inc = 1: all ten components add up to 13.0
+ */
+CTEST(dzsum, c_api_step_1_N_5){
+   double x[] = {0.0, 1.0, 2.2, 3.3, 0.0, 0.0, 1.0, 2.2, 3.3, 0.0};
+   blasint inc = 1;
+   blasint N = 5;
+
+   double result = cblas_dzsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(13.0, result, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * cblas_dzsum, N = 5, inc = 2: the visited elements (0.0, 3.0),
+ * (1.0, -2.2), (2.2, -1.7), (3.3, 14.5) and (0.0, -9.0) add up to 11.1
+ */
+CTEST(dzsum, c_api_step_2_N_5){
+   double x[] = {0.0, 3.0, 1.0, 2.2, 1.0, -2.2, 1.0, 2.2, 2.2, -1.7, 1.0, 2.2, 3.3, 14.5, 1.0, 2.2, 0.0, -9.0, 1.0, 2.2};
+   blasint inc = 2;
+   blasint N = 5;
+
+   double result = cblas_dzsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(11.1, result, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * cblas_dzsum, N = ELEMENTS, inc = 1: every element is (1.0, -1.0),
+ * so the expected sum is 0.0
+ */
+CTEST(dzsum, c_api_step_1_N_50){
+   blasint N = ELEMENTS;
+   blasint inc = 1;
+   blasint k;
+   double x[ELEMENTS * 2];
+
+   /* even slots (real parts) get 1.0, odd slots (imaginary parts) get -1.0 */
+   for (k = 0; k < 2 * N * inc; k++) {
+      x[k] = (k % 2 == 0) ? 1.0 : -1.0;
+   }
+
+   double result = cblas_dzsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(0.0, result, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * cblas_dzsum, N = ELEMENTS, inc = INCREMENT: every element is
+ * (1.0, -1.0), so the expected sum is 0.0
+ */
+CTEST(dzsum, c_api_step_2_N_50){
+   blasint N = ELEMENTS;
+   blasint inc = INCREMENT;
+   blasint k;
+   double x[ELEMENTS * INCREMENT * 2];
+
+   /* even slots (real parts) get 1.0, odd slots (imaginary parts) get -1.0 */
+   for (k = 0; k < 2 * N * inc; k++) {
+      x[k] = (k % 2 == 0) ? 1.0 : -1.0;
+   }
+
+   double result = cblas_dzsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(0.0, result, DOUBLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_icamin.c b/utest/test_extensions/test_icamin.c
new file mode 100644
index 000000000..cca464eac
--- /dev/null
+++ b/utest/test_extensions/test_icamin.c
@@ -0,0 +1,625 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#define ELEMENTS 50
+#define INCREMENT 2
+
+#ifdef BUILD_COMPLEX
+
+/**
+ * Fortran API specific test
+ * icamin called with N = 0 on a fully populated vector:
+ * the expected index is 0
+ */
+CTEST(icamin, bad_args_N_0){
+   blasint N = 0;
+   blasint inc = 1;
+   blasint k;
+   float x[ELEMENTS * 2];
+
+   for (k = 0; k < 2 * ELEMENTS * inc; k++) {
+      x[k] = 1000 - k;
+   }
+
+   blasint pos = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(0, pos);
+}
+
+/**
+ * Fortran API specific test
+ * icamin called with a zero stride (inc = 0) on a fully populated vector:
+ * the expected index is 0
+ */
+CTEST(icamin, step_zero){
+   blasint N = ELEMENTS;
+   blasint inc = 0;
+   blasint k;
+   float x[ELEMENTS * 2];
+
+   /* the fill bound deliberately ignores inc, which is 0 here */
+   for (k = 0; k < 2 * N; k++) {
+      x[k] = k - 1000;
+   }
+
+   blasint pos = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(0, pos);
+}
+
+/**
+ * Fortran API specific test
+ * icamin, N = 1, inc = 1, positive components:
+ * the single element is reported at 1-based index 1
+ */
+CTEST(icamin, positive_step_1_N_1){
+   float x[] = {1.0f, 2.0f};
+   blasint inc = 1;
+   blasint N = 1;
+
+   blasint pos = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * Fortran API specific test
+ * icamin, N = 1, inc = 1, negative components:
+ * the single element is reported at 1-based index 1
+ */
+CTEST(icamin, negative_step_1_N_1){
+   float x[] = {-1.0f, -2.0f};
+   blasint inc = 1;
+   blasint N = 1;
+
+   blasint pos = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * Fortran API specific test
+ * icamin, N = 1, inc = 2, positive components:
+ * the expected 1-based index is 1
+ */
+CTEST(icamin, positive_step_2_N_1){
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f};
+   blasint inc = 2;
+   blasint N = 1;
+
+   blasint pos = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * Fortran API specific test
+ * icamin, N = 1, inc = 2, negative components:
+ * the expected 1-based index is 1
+ */
+CTEST(icamin, negative_step_2_N_1){
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f};
+   blasint inc = 2;
+   blasint N = 1;
+
+   blasint pos = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * Fortran API specific test
+ * icamin, N = 2, inc = 1, positive components:
+ * the all-zero second element is reported at 1-based index 2
+ */
+CTEST(icamin, positive_step_1_N_2){
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f};
+   blasint inc = 1;
+   blasint N = 2;
+
+   blasint pos = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, pos);
+}
+
+/**
+ * Fortran API specific test
+ * icamin, N = 2, inc = 1, negative components:
+ * the all-zero second element is reported at 1-based index 2
+ */
+CTEST(icamin, negative_step_1_N_2){
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f};
+   blasint inc = 1;
+   blasint N = 2;
+
+   blasint pos = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, pos);
+}
+
+/**
+ * Fortran API specific test
+ * icamin, N = 2, inc = 2, positive components:
+ * the expected 1-based index is 2
+ */
+CTEST(icamin, positive_step_2_N_2){
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f};
+   blasint inc = 2;
+   blasint N = 2;
+
+   blasint pos = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, pos);
+}
+
+/**
+ * Fortran API specific test
+ * icamin, N = 2, inc = 2, negative components:
+ * the expected 1-based index is 2
+ */
+CTEST(icamin, negative_step_2_N_2){
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f};
+   blasint inc = 2;
+   blasint N = 2;
+
+   blasint pos = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, pos);
+}
+
+/**
+ * Fortran API specific test
+ * icamin, N = 3, inc = 1, positive components:
+ * the all-zero second element is reported at 1-based index 2
+ */
+CTEST(icamin, positive_step_1_N_3){
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 2.0f, 1.0f};
+   blasint inc = 1;
+   blasint N = 3;
+
+   blasint pos = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, pos);
+}
+
+/**
+ * Fortran API specific test
+ * icamin, N = 3, inc = 1, negative components:
+ * the all-zero second element is reported at 1-based index 2
+ */
+CTEST(icamin, negative_step_1_N_3){
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -2.0f, -1.0f};
+   blasint inc = 1;
+   blasint N = 3;
+
+   blasint pos = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, pos);
+}
+
+/**
+ * Fortran API specific test
+ * icamin, N = 3, inc = 2, positive components:
+ * the expected 1-based index is 2
+ */
+CTEST(icamin, positive_step_2_N_3){
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 2.0f, 1.0f, 0.0f, 0.0f};
+   blasint inc = 2;
+   blasint N = 3;
+
+   blasint pos = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, pos);
+}
+
+/**
+ * Fortran API specific test
+ * icamin, N = 3, inc = 2, negative components:
+ * the expected 1-based index is 2
+ */
+CTEST(icamin, negative_step_2_N_3){
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f, -2.0f, -1.0f, 0.0f, 0.0f};
+   blasint inc = 2;
+   blasint N = 3;
+
+   blasint pos = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, pos);
+}
+
+/**
+ * Fortran API specific test
+ * icamin, N = 4, inc = 1:
+ * the all-zero second element is reported at 1-based index 2
+ */
+CTEST(icamin, positive_step_1_N_4){
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 2.0f, 1.0f, -2.0f, -2.0f};
+   blasint inc = 1;
+   blasint N = 4;
+
+   blasint pos = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, pos);
+}
+
+/**
+ * Fortran API specific test
+ * icamin, N = 4, inc = 1, negative components:
+ * the all-zero second element is reported at 1-based index 2
+ */
+CTEST(icamin, negative_step_1_N_4){
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -2.0f, -1.0f, -2.0f, -2.0f};
+   blasint inc = 1;
+   blasint N = 4;
+
+   blasint pos = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, pos);
+}
+
+/**
+ * Fortran API specific test
+ * icamin, N = 4, inc = 2:
+ * the expected 1-based index is 2
+ */
+CTEST(icamin, positive_step_2_N_4){
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 2.0f, 1.0f, 0.0f, 0.0f, -2.0f, -2.0f, 0.0f, 0.0f};
+   blasint inc = 2;
+   blasint N = 4;
+
+   blasint pos = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, pos);
+}
+
+/**
+ * Fortran API specific test
+ * icamin, N = 4, inc = 2, negative components:
+ * the expected 1-based index is 2
+ */
+CTEST(icamin, negative_step_2_N_4){
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f, -2.0f, -1.0f, 0.0f, 0.0f, -2.0f, -2.0f, 0.0f, 0.0f};
+   blasint inc = 2;
+   blasint N = 4;
+
+   blasint pos = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, pos);
+}
+
+/**
+ * Fortran API specific test
+ * icamin, N = ELEMENTS, inc = 1, positive components with the complex
+ * value at offset 7 * inc * 2 zeroed: the expected 1-based index is 8
+ */
+CTEST(icamin, positive_step_1_N_50){
+   blasint N = ELEMENTS;
+   blasint inc = 1;
+   blasint k;
+   float x[ELEMENTS * 2];
+
+   for (k = 0; k < 2 * N * inc; k++) {
+      x[k] = 1000 + k;
+   }
+
+   /* plant the zero element (real part, then imaginary part) */
+   blasint zero_at = 2 * 7 * inc;
+   x[zero_at] = 0.0f;
+   x[zero_at + 1] = 0.0f;
+
+   blasint pos = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(8, pos);
+}
+
+/**
+ * Fortran API specific test
+ * icamin, N = ELEMENTS, inc = 1, negative components with the complex
+ * value at offset 7 * inc * 2 zeroed: the expected 1-based index is 8
+ */
+CTEST(icamin, negative_step_1_N_50){
+   blasint N = ELEMENTS;
+   blasint inc = 1;
+   blasint k;
+   float x[ELEMENTS * 2];
+
+   for (k = 0; k < 2 * N * inc; k++) {
+      x[k] = -(k + 1000);
+   }
+
+   /* plant the zero element (real part, then imaginary part) */
+   blasint zero_at = 2 * 7 * inc;
+   x[zero_at] = 0.0f;
+   x[zero_at + 1] = 0.0f;
+
+   blasint pos = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(8, pos);
+}
+
+/**
+ * Fortran API specific test
+ * icamin, N = ELEMENTS, inc = INCREMENT, positive components with the
+ * complex value at offset 7 * inc * 2 zeroed: the expected 1-based
+ * index is 8
+ */
+CTEST(icamin, positive_step_2_N_50){
+   blasint N = ELEMENTS;
+   blasint inc = INCREMENT;
+   blasint k;
+   float x[ELEMENTS * INCREMENT * 2];
+
+   for (k = 0; k < 2 * N * inc; k++) {
+      x[k] = 1000 + k;
+   }
+
+   /* plant the zero element (real part, then imaginary part) */
+   blasint zero_at = 2 * 7 * inc;
+   x[zero_at] = 0.0f;
+   x[zero_at + 1] = 0.0f;
+
+   blasint pos = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(8, pos);
+}
+
+/**
+ * Fortran API specific test
+ * icamin, N = ELEMENTS, inc = INCREMENT, negative components with the
+ * complex value at offset 7 * inc * 2 zeroed: the expected 1-based
+ * index is 8
+ */
+CTEST(icamin, negative_step_2_N_50){
+   blasint N = ELEMENTS;
+   blasint inc = INCREMENT;
+   blasint k;
+   float x[ELEMENTS * INCREMENT * 2];
+
+   for (k = 0; k < 2 * N * inc; k++) {
+      x[k] = -(k + 1000);
+   }
+
+   /* plant the zero element (real part, then imaginary part) */
+   blasint zero_at = 2 * 7 * inc;
+   x[zero_at] = 0.0f;
+   x[zero_at + 1] = 0.0f;
+
+   blasint pos = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(8, pos);
+}
+
+/**
+ * Fortran API specific test
+ * icamin, N = ELEMENTS, inc = INCREMENT, with the zero element planted at
+ * the last visited position: the expected 1-based index is N
+ */
+CTEST(icamin, min_idx_in_vec_tail){
+   blasint N = ELEMENTS;
+   blasint inc = INCREMENT;
+   blasint k;
+   float x[ELEMENTS * INCREMENT * 2];
+
+   for (k = 0; k < 2 * N * inc; k++) {
+      x[k] = 1000 + k;
+   }
+
+   /* zero the real and imaginary part of the last strided element */
+   blasint tail = 2 * (N - 1) * inc;
+   x[tail] = 0.0f;
+   x[tail + 1] = 0.0f;
+
+   blasint pos = BLASFUNC(icamin)(&N, x, &inc);
+   ASSERT_EQUAL(N, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_icamin called with N = 0 on a fully populated vector:
+ * the expected index is 0
+ */
+CTEST(icamin, c_api_bad_args_N_0){
+    blasint N = 0;
+    blasint inc = 1;
+    blasint k;
+    float x[ELEMENTS * 2];
+
+    for (k = 0; k < 2 * ELEMENTS * inc; k++) {
+        x[k] = 1000 - k;
+    }
+
+    blasint pos = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(0, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_icamin called with a zero stride (inc = 0) on a fully populated
+ * vector: the expected index is 0
+ */
+CTEST(icamin, c_api_step_zero){
+    blasint N = ELEMENTS;
+    blasint inc = 0;
+    blasint k;
+    float x[ELEMENTS * 2];
+
+    /* the fill bound deliberately ignores inc, which is 0 here */
+    for (k = 0; k < 2 * N; k++) {
+        x[k] = k - 1000;
+    }
+
+    blasint pos = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(0, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_icamin, N = 1, inc = 1, positive components:
+ * the single element is reported at 0-based index 0
+ */
+CTEST(icamin, c_api_positive_step_1_N_1){
+    float x[] = {1.0f, 2.0f};
+    blasint inc = 1;
+    blasint N = 1;
+
+    blasint pos = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(0, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_icamin, N = 1, inc = 1, negative components:
+ * the single element is reported at 0-based index 0
+ */
+CTEST(icamin, c_api_negative_step_1_N_1){
+    float x[] = {-1.0f, -2.0f};
+    blasint inc = 1;
+    blasint N = 1;
+
+    blasint pos = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(0, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_icamin, N = 1, inc = 2, positive components:
+ * the expected 0-based index is 0
+ */
+CTEST(icamin, c_api_positive_step_2_N_1){
+    float x[] = {1.0f, 2.0f, 0.0f, 0.0f};
+    blasint inc = 2;
+    blasint N = 1;
+
+    blasint pos = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(0, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_icamin, N = 1, inc = 2, negative components:
+ * the expected 0-based index is 0
+ */
+CTEST(icamin, c_api_negative_step_2_N_1){
+    float x[] = {-1.0f, -2.0f, 0.0f, 0.0f};
+    blasint inc = 2;
+    blasint N = 1;
+
+    blasint pos = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(0, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_icamin, N = 2, inc = 1, positive components:
+ * the all-zero second element is reported at 0-based index 1
+ */
+CTEST(icamin, c_api_positive_step_1_N_2){
+    float x[] = {1.0f, 2.0f, 0.0f, 0.0f};
+    blasint inc = 1;
+    blasint N = 2;
+
+    blasint pos = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_icamin, N = 2, inc = 1, negative components:
+ * the all-zero second element is reported at 0-based index 1
+ */
+CTEST(icamin, c_api_negative_step_1_N_2){
+    float x[] = {-1.0f, -2.0f, 0.0f, 0.0f};
+    blasint inc = 1;
+    blasint N = 2;
+
+    blasint pos = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_icamin, N = 2, inc = 2, positive components:
+ * the expected 0-based index is 1
+ */
+CTEST(icamin, c_api_positive_step_2_N_2){
+    float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f};
+    blasint inc = 2;
+    blasint N = 2;
+
+    blasint pos = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_icamin, N = 2, inc = 2, negative components:
+ * the expected 0-based index is 1
+ */
+CTEST(icamin, c_api_negative_step_2_N_2){
+    float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f};
+    blasint inc = 2;
+    blasint N = 2;
+
+    blasint pos = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_icamin, N = 3, inc = 1, positive components:
+ * the all-zero second element is reported at 0-based index 1
+ */
+CTEST(icamin, c_api_positive_step_1_N_3){
+    float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 2.0f, 1.0f};
+    blasint inc = 1;
+    blasint N = 3;
+
+    blasint pos = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_icamin, N = 3, inc = 1, negative components:
+ * the all-zero second element is reported at 0-based index 1
+ */
+CTEST(icamin, c_api_negative_step_1_N_3){
+    float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -2.0f, -1.0f};
+    blasint inc = 1;
+    blasint N = 3;
+
+    blasint pos = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_icamin, N = 3, inc = 2, positive components:
+ * the expected 0-based index is 1
+ */
+CTEST(icamin, c_api_positive_step_2_N_3){
+    float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 2.0f, 1.0f, 0.0f, 0.0f};
+    blasint inc = 2;
+    blasint N = 3;
+
+    blasint pos = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_icamin, N = 3, inc = 2, negative components:
+ * the expected 0-based index is 1
+ */
+CTEST(icamin, c_api_negative_step_2_N_3){
+    float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f, -2.0f, -1.0f, 0.0f, 0.0f};
+    blasint inc = 2;
+    blasint N = 3;
+
+    blasint pos = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_icamin, N = 4, inc = 1:
+ * the all-zero second element is reported at 0-based index 1
+ */
+CTEST(icamin, c_api_positive_step_1_N_4){
+    float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 2.0f, 1.0f, -2.0f, -2.0f};
+    blasint inc = 1;
+    blasint N = 4;
+
+    blasint pos = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_icamin, N = 4, inc = 1, negative components:
+ * the all-zero second element is reported at 0-based index 1
+ */
+CTEST(icamin, c_api_negative_step_1_N_4){
+    float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -2.0f, -1.0f, -2.0f, -2.0f};
+    blasint inc = 1;
+    blasint N = 4;
+
+    blasint pos = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_icamin, N = 4, inc = 2:
+ * the expected 0-based index is 1
+ */
+CTEST(icamin, c_api_positive_step_2_N_4){
+    float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 2.0f, 1.0f, 0.0f, 0.0f, -2.0f, -2.0f, 0.0f, 0.0f};
+    blasint inc = 2;
+    blasint N = 4;
+
+    blasint pos = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_icamin, N = 4, inc = 2, negative components:
+ * the expected 0-based index is 1
+ */
+CTEST(icamin, c_api_negative_step_2_N_4){
+    float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f, -2.0f, -1.0f, 0.0f, 0.0f, -2.0f, -2.0f, 0.0f, 0.0f};
+    blasint inc = 2;
+    blasint N = 4;
+
+    blasint pos = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_icamin, N = ELEMENTS, inc = 1, positive components with the
+ * complex value at offset 7 * inc * 2 zeroed: the expected 0-based
+ * index is 7
+ */
+CTEST(icamin, c_api_positive_step_1_N_50){
+    blasint N = ELEMENTS;
+    blasint inc = 1;
+    blasint k;
+    float x[ELEMENTS * 2];
+
+    for (k = 0; k < 2 * N * inc; k++) {
+        x[k] = 1000 + k;
+    }
+
+    /* plant the zero element (real part, then imaginary part) */
+    blasint zero_at = 2 * 7 * inc;
+    x[zero_at] = 0.0f;
+    x[zero_at + 1] = 0.0f;
+
+    blasint pos = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(7, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_icamin, N = ELEMENTS, inc = 1, negative components with the
+ * complex value at offset 7 * inc * 2 zeroed: the expected 0-based
+ * index is 7
+ */
+CTEST(icamin, c_api_negative_step_1_N_50){
+    blasint N = ELEMENTS;
+    blasint inc = 1;
+    blasint k;
+    float x[ELEMENTS * 2];
+
+    for (k = 0; k < 2 * N * inc; k++) {
+        x[k] = -(k + 1000);
+    }
+
+    /* plant the zero element (real part, then imaginary part) */
+    blasint zero_at = 2 * 7 * inc;
+    x[zero_at] = 0.0f;
+    x[zero_at + 1] = 0.0f;
+
+    blasint pos = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(7, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_icamin, N = ELEMENTS, inc = INCREMENT, positive components with
+ * the complex value at offset 7 * inc * 2 zeroed: the expected 0-based
+ * index is 7
+ */
+CTEST(icamin, c_api_positive_step_2_N_50){
+    blasint N = ELEMENTS;
+    blasint inc = INCREMENT;
+    blasint k;
+    float x[ELEMENTS * INCREMENT * 2];
+
+    for (k = 0; k < 2 * N * inc; k++) {
+        x[k] = 1000 + k;
+    }
+
+    /* plant the zero element (real part, then imaginary part) */
+    blasint zero_at = 2 * 7 * inc;
+    x[zero_at] = 0.0f;
+    x[zero_at + 1] = 0.0f;
+
+    blasint pos = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(7, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_icamin, N = ELEMENTS, inc = INCREMENT, negative components with
+ * the complex value at offset 7 * inc * 2 zeroed: the expected 0-based
+ * index is 7
+ */
+CTEST(icamin, c_api_negative_step_2_N_50){
+    blasint N = ELEMENTS;
+    blasint inc = INCREMENT;
+    blasint k;
+    float x[ELEMENTS * INCREMENT * 2];
+
+    for (k = 0; k < 2 * N * inc; k++) {
+        x[k] = -(k + 1000);
+    }
+
+    /* plant the zero element (real part, then imaginary part) */
+    blasint zero_at = 2 * 7 * inc;
+    x[zero_at] = 0.0f;
+    x[zero_at + 1] = 0.0f;
+
+    blasint pos = cblas_icamin(N, x, inc);
+    ASSERT_EQUAL(7, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_icamin, N = ELEMENTS, inc = INCREMENT, with the zero element
+ * planted at the last visited position: the expected 0-based index
+ * is N - 1
+ */
+CTEST(icamin, c_api_min_idx_in_vec_tail){
+   blasint N = ELEMENTS;
+   blasint inc = INCREMENT;
+   blasint k;
+   float x[ELEMENTS * INCREMENT * 2];
+
+   for (k = 0; k < 2 * N * inc; k++) {
+      x[k] = 1000 + k;
+   }
+
+   /* zero the real and imaginary part of the last strided element */
+   blasint tail = 2 * (N - 1) * inc;
+   x[tail] = 0.0f;
+   x[tail + 1] = 0.0f;
+
+   blasint pos = cblas_icamin(N, x, inc);
+   ASSERT_EQUAL(N - 1, pos);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_idamin.c b/utest/test_extensions/test_idamin.c
new file mode 100644
index 000000000..bebe76dba
--- /dev/null
+++ b/utest/test_extensions/test_idamin.c
@@ -0,0 +1,789 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#define ELEMENTS 50
+#define INCREMENT 2
+
+#ifdef BUILD_DOUBLE
+
+/**
+ * Fortran API test: idamin called with N = 0.
+ * The vector is fully initialised, yet the expected result is 0.
+ */
+CTEST(idamin, bad_args_N_0){
+   blasint i;
+   blasint N = 0, inc = 1;
+   double x[ELEMENTS];
+   for (i = 0; i < ELEMENTS * inc; i ++) {
+      x[i] = 1000 - i;
+   }
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(0, index);
+}
+
+/**
+ * Fortran API test: idamin called with a zero increment (inc = 0).
+ * The expected result is 0 even though x[8] holds the smallest value.
+ */
+CTEST(idamin, step_zero){
+   blasint i;
+   blasint N = ELEMENTS, inc = 0;
+   double x[ELEMENTS];
+   for (i = 0; i < N; i ++) {
+      x[i] = i + 1000;
+   }
+   x[8] = 0.0;
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(0, index);
+}
+
+/**
+ * Fortran API test: idamin on a one-element vector with unit stride.
+ * The only element must be reported, using the 1-based index 1.
+ */
+CTEST(idamin, positive_step_1_N_1){
+   double x[1] = {1.1};
+   blasint inc = 1;
+   blasint N = 1;
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, negative_step_1_N_1){
+   blasint N = 1, inc = 1;
+   double x[] = {-1.1};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, positive_step_2_N_1){
+   blasint N = 1, inc = 2;
+   double x[] = {1.1, 0.0};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, negative_step_2_N_1){
+   blasint N = 1, inc = 2;
+   double x[] = {-1.1, 0.0};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, positive_step_1_N_2){
+   blasint N = 2, inc = 1;
+   double x[] = {1.1, 1.0};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API test: idamin on {-1.1, 1.0} with unit stride. Expecting
+ * index 2 (1-based) means magnitudes, not signed values, are compared.
+ */
+CTEST(idamin, negative_step_1_N_2){
+   blasint N = 2, inc = 1;
+   double x[] = {-1.1, 1.0};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, positive_step_2_N_2){
+   blasint N = 2, inc = 2;
+   double x[] = {1.1, 0.0, 1.0, 0.0};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, negative_step_2_N_2){
+   blasint N = 2, inc = 2;
+   double x[] = {-1.1, 0.0, 1.0, 0.0};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, positive_step_1_N_3){
+   blasint N = 3, inc = 1;
+   double x[] = {1.1, 1.0, 2.2};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, negative_step_1_N_3){
+   blasint N = 3, inc = 1;
+   double x[] = {-1.1, 1.0, -2.2};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, positive_step_2_N_3){
+   blasint N = 3, inc = 2;
+   double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, negative_step_2_N_3){
+   blasint N = 3, inc = 2;
+   double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, positive_step_1_N_4){
+   blasint N = 4, inc = 1;
+   double x[] = {1.1, 1.0, 2.2, 3.3};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, negative_step_1_N_4){
+   blasint N = 4, inc = 1;
+   double x[] = {-1.1, 1.0, -2.2, -3.3};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, positive_step_2_N_4){
+   blasint N = 4, inc = 2;
+   double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0, 3.3, 0.0};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, negative_step_2_N_4){
+   blasint N = 4, inc = 2;
+   double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0, -3.3, 0.0};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API test: idamin on five contiguous values whose minimum
+ * (0.0) is the last entry; the 1-based result must be 5.
+ */
+CTEST(idamin, positive_step_1_N_5){
+   double x[5] = {1.1, 1.0, 2.2, 3.3, 0.0};
+   blasint inc = 1;
+   blasint N = 5;
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(5, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, negative_step_1_N_5){
+   blasint N = 5, inc = 1;
+   double x[] = {-1.1, 1.0, -2.2, -3.3, 0.0};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(5, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, positive_step_2_N_5){
+   blasint N = 5, inc = 2;
+   double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0, 3.3, 0.0, 0.0, 0.0};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(5, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, negative_step_2_N_5){
+   blasint N = 5, inc = 2;
+   double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0, -3.3, 0.0, 0.0, 0.0};
+
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(5, index);
+}
+
+/**
+ * Fortran API test: idamin on 50 contiguous increasing values with a
+ * single 0.0 planted at offset 8; the 1-based result must be 9.
+ */
+CTEST(idamin, positive_step_1_N_50){
+   blasint N = ELEMENTS, inc = 1;
+   blasint k = 0;
+   double x[ELEMENTS];
+   while (k < N * inc) {
+      x[k] = 1000 + k;
+      k ++;
+   }
+   x[8 * inc] = 0.0;
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(9, index);
+}
+
+/**
+ * Fortran API test: idamin on 50 contiguous negative values where
+ * x[8] = -1.0 has the smallest magnitude; the 1-based result is 9.
+ */
+CTEST(idamin, negative_step_1_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   double x[ELEMENTS];
+   for (i = 0; i < N  * inc; i ++) {
+      x[i] = - i - 1000;
+   }
+
+   x[8 * inc] = -1.0;
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(9, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, positive_step_2_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   double x[ELEMENTS * INCREMENT];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = i + 1000;
+   }
+
+   x[8 * inc] = 0.0;
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(9, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, negative_step_2_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   double x[ELEMENTS * INCREMENT];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = - i - 1000;
+   }
+
+   x[8 * inc] = -1.0;
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(9, index);
+}
+
+/**
+ * Fortran API test: idamin with inc = INCREMENT when the minimum is the
+ * last strided element; the expected 1-based index is N.
+ */
+CTEST(idamin, min_idx_in_vec_tail){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   double x[ELEMENTS * INCREMENT];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = i + 1000;
+   }
+
+   x[(N - 1) * inc] = 0.0;
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   ASSERT_EQUAL(N, index);
+}
+
+/**
+ * Fortran API test: idamin on a heap-allocated, unit-stride vector whose
+ * minimum is the last element; the expected 1-based index is N.
+ */
+CTEST(idamin, min_idx_in_vec_tail_inc_1){
+   blasint i, N = ELEMENTS, inc = 1;
+   double *x = (double*)malloc(ELEMENTS * inc * sizeof(double));
+   ASSERT_NOT_NULL(x);   /* fail cleanly rather than write through NULL */
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = i + 1000;
+   }
+
+   x[(N - 1) * inc] = 0.0;   /* double literal, as in the other idamin tests */
+   blasint index = BLASFUNC(idamin)(&N, x, &inc);
+   free(x);                  /* freed before asserting on the saved result */
+   ASSERT_EQUAL(N, index);
+}
+
+/**
+ * C API test: cblas_idamin called with N = 0.
+ * The vector is fully initialised, yet the expected result is 0.
+ */
+CTEST(idamin, c_api_bad_args_N_0){
+   blasint i;
+   blasint N = 0, inc = 1;
+   double x[ELEMENTS];
+   for (i = 0; i < ELEMENTS * inc; i ++) {
+      x[i] = 1000 - i;
+   }
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(0, index);
+}
+
+/**
+ * C API test: cblas_idamin called with a zero increment (inc = 0).
+ * The expected result is 0 even though x[8] holds the smallest value.
+ */
+CTEST(idamin, c_api_step_zero){
+   blasint i;
+   blasint N = ELEMENTS, inc = 0;
+   double x[ELEMENTS];
+   for (i = 0; i < N; i ++) {
+      x[i] = i + 1000;
+   }
+   x[8] = 0.0;
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(0, index);
+}
+
+/**
+ * C API test: cblas_idamin on a one-element vector with unit stride.
+ * The only element must be reported, using the 0-based index 0.
+ */
+CTEST(idamin, c_api_positive_step_1_N_1){
+   double x[1] = {1.1};
+   blasint inc = 1;
+   blasint N = 1;
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(0, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_negative_step_1_N_1){
+   blasint N = 1, inc = 1;
+   double x[] = {-1.1};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(0, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_positive_step_2_N_1){
+   blasint N = 1, inc = 2;
+   double x[] = {1.1, 0.0};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(0, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_negative_step_2_N_1){
+   blasint N = 1, inc = 2;
+   double x[] = {-1.1, 0.0};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(0, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_positive_step_1_N_2){
+   blasint N = 2, inc = 1;
+   double x[] = {1.1, 1.0};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_negative_step_1_N_2){
+   blasint N = 2, inc = 1;
+   double x[] = {-1.1, 1.0};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_positive_step_2_N_2){
+   blasint N = 2, inc = 2;
+   double x[] = {1.1, 0.0, 1.0, 0.0};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_negative_step_2_N_2){
+   blasint N = 2, inc = 2;
+   double x[] = {-1.1, 0.0, 1.0, 0.0};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_positive_step_1_N_3){
+   blasint N = 3, inc = 1;
+   double x[] = {1.1, 1.0, 2.2};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_negative_step_1_N_3){
+   blasint N = 3, inc = 1;
+   double x[] = {-1.1, 1.0, -2.2};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_positive_step_2_N_3){
+   blasint N = 3, inc = 2;
+   double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_negative_step_2_N_3){
+   blasint N = 3, inc = 2;
+   double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_positive_step_1_N_4){
+   blasint N = 4, inc = 1;
+   double x[] = {1.1, 1.0, 2.2, 3.3};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_negative_step_1_N_4){
+   blasint N = 4, inc = 1;
+   double x[] = {-1.1, 1.0, -2.2, -3.3};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_positive_step_2_N_4){
+   blasint N = 4, inc = 2;
+   double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0, 3.3, 0.0};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_negative_step_2_N_4){
+   blasint N = 4, inc = 2;
+   double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0, -3.3, 0.0};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_positive_step_1_N_5){
+   blasint N = 5, inc = 1;
+   double x[] = {1.1, 1.0, 2.2, 3.3, 0.0};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(4, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_negative_step_1_N_5){
+   blasint N = 5, inc = 1;
+   double x[] = {-1.1, 1.0, -2.2, -3.3, 0.0};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(4, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_positive_step_2_N_5){
+   blasint N = 5, inc = 2;
+   double x[] = {1.1, 0.0, 1.0, 0.0, 2.2, 0.0, 3.3, 0.0, 0.0, 0.0};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(4, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_negative_step_2_N_5){
+   blasint N = 5, inc = 2;
+   double x[] = {-1.1, 0.0, 1.0, 0.0, -2.2, 0.0, -3.3, 0.0, 0.0, 0.0};
+
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(4, index);
+}
+
+/**
+ * C API test: cblas_idamin on 50 contiguous increasing values with a
+ * single 0.0 planted at offset 8; the 0-based result must be 8.
+ */
+CTEST(idamin, c_api_positive_step_1_N_50){
+   blasint N = ELEMENTS, inc = 1;
+   blasint k = 0;
+   double x[ELEMENTS];
+   while (k < N * inc) {
+      x[k] = 1000 + k;
+      k ++;
+   }
+   x[8 * inc] = 0.0;
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(8, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_negative_step_1_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   double x[ELEMENTS];
+   for (i = 0; i < N  * inc; i ++) {
+      x[i] = - i - 1000;
+   }
+
+   x[8 * inc] = -1.0;
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(8, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_positive_step_2_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   double x[ELEMENTS * INCREMENT];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = i + 1000;
+   }
+
+   x[8 * inc] = 0.0;
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(8, index);
+}
+
+/**
+ * C API specific test
+ * Test idamin by comparing it against pre-calculated values
+ */
+CTEST(idamin, c_api_negative_step_2_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   double x[ELEMENTS * INCREMENT];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = - i - 1000;
+   }
+
+   x[8 * inc] = -1.0;
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(8, index);
+}
+
+/**
+ * C API test: cblas_idamin with inc = INCREMENT when the minimum is the
+ * last strided element; the expected 0-based index is N - 1.
+ */
+CTEST(idamin, c_api_min_idx_in_vec_tail){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   double x[ELEMENTS * INCREMENT];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = i + 1000;
+   }
+
+   x[(N - 1) * inc] = 0.0;
+   blasint index = cblas_idamin(N, x, inc);
+   ASSERT_EQUAL(N - 1, index);
+}
+
+/**
+ * C API test: cblas_idamin on a heap-allocated, unit-stride vector whose
+ * minimum is the last element; the expected 0-based index is N - 1.
+ */
+CTEST(idamin, c_api_min_idx_in_vec_tail_inc_1){
+   blasint i, N = ELEMENTS, inc = 1;
+   double *x = (double*) malloc(ELEMENTS * inc * sizeof(double));
+   ASSERT_NOT_NULL(x);   /* fail cleanly rather than write through NULL */
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = i + 1000;
+   }
+
+   x[(N - 1) * inc] = 0.0;
+   blasint index = cblas_idamin(N, x, inc);
+   free(x);              /* freed before asserting on the saved result */
+   ASSERT_EQUAL(N - 1, index);
+}
+#endif
diff --git a/utest/test_extensions/test_isamin.c b/utest/test_extensions/test_isamin.c
new file mode 100644
index 000000000..d93813e6f
--- /dev/null
+++ b/utest/test_extensions/test_isamin.c
@@ -0,0 +1,789 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#define ELEMENTS 50
+#define INCREMENT 2
+
+#ifdef BUILD_SINGLE
+
+/**
+ * Fortran API test: isamin called with N = 0.
+ * The vector is fully initialised, yet the expected result is 0.
+ */
+CTEST(isamin, bad_args_N_0){
+   blasint i;
+   blasint N = 0, inc = 1;
+   float x[ELEMENTS];
+   for (i = 0; i < ELEMENTS * inc; i ++) {
+      x[i] = 1000 - i;
+   }
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(0, index);
+}
+
+/**
+ * Fortran API test: isamin called with a zero increment (inc = 0).
+ * The expected result is 0 even though x[8] holds the smallest value.
+ */
+CTEST(isamin, step_zero){
+   blasint i;
+   blasint N = ELEMENTS, inc = 0;
+   float x[ELEMENTS];
+   for (i = 0; i < N; i ++) {
+      x[i] = i + 1000;
+   }
+   x[8] = 0.0f;
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(0, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, positive_step_1_N_1){
+   blasint N = 1, inc = 1;
+   float x[] = {1.1f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, negative_step_1_N_1){
+   blasint N = 1, inc = 1;
+   float x[] = {-1.1f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, positive_step_2_N_1){
+   blasint N = 1, inc = 2;
+   float x[] = {1.1f, 0.0f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, negative_step_2_N_1){
+   blasint N = 1, inc = 2;
+   float x[] = {-1.1f, 0.0f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(1, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, positive_step_1_N_2){
+   blasint N = 2, inc = 1;
+   float x[] = {1.1f, 1.0f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, negative_step_1_N_2){
+   blasint N = 2, inc = 1;
+   float x[] = {-1.1f, 1.0f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, positive_step_2_N_2){
+   blasint N = 2, inc = 2;
+   float x[] = {1.1f, 0.0f, 1.0f, 0.0f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, negative_step_2_N_2){
+   blasint N = 2, inc = 2;
+   float x[] = {-1.1f, 0.0f, 1.0f, 0.0f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, positive_step_1_N_3){
+   blasint N = 3, inc = 1;
+   float x[] = {1.1f, 1.0f, 2.2f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, negative_step_1_N_3){
+   blasint N = 3, inc = 1;
+   float x[] = {-1.1f, 1.0f, -2.2f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, positive_step_2_N_3){
+   blasint N = 3, inc = 2;
+   float x[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, negative_step_2_N_3){
+   blasint N = 3, inc = 2;
+   float x[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, positive_step_1_N_4){
+   blasint N = 4, inc = 1;
+   float x[] = {1.1f, 1.0f, 2.2f, 3.3f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, negative_step_1_N_4){
+   blasint N = 4, inc = 1;
+   float x[] = {-1.1f, 1.0f, -2.2f, -3.3f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, positive_step_2_N_4){
+   blasint N = 4, inc = 2;
+   float x[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f, 3.3f, 0.0f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, negative_step_2_N_4){
+   blasint N = 4, inc = 2;
+   float x[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f, -3.3f, 0.0f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(2, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, positive_step_1_N_5){
+   blasint N = 5, inc = 1;
+   float x[] = {1.1f, 1.0f, 2.2f, 3.3f, 0.0f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(5, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, negative_step_1_N_5){
+   blasint N = 5, inc = 1;
+   float x[] = {-1.1f, 1.0f, -2.2f, -3.3f, 0.0f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(5, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, positive_step_2_N_5){
+   blasint N = 5, inc = 2;
+   float x[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f, 3.3f, 0.0f, 0.0f, 0.0f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(5, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, negative_step_2_N_5){
+   blasint N = 5, inc = 2;
+   float x[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f, -3.3f, 0.0f, 0.0f, 0.0f};
+
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(5, index);
+}
+
+/**
+ * Fortran API test: isamin on 50 contiguous increasing values with a
+ * single 0.0f planted at offset 8; the 1-based result must be 9.
+ */
+CTEST(isamin, positive_step_1_N_50){
+   blasint N = ELEMENTS, inc = 1;
+   blasint k = 0;
+   float x[ELEMENTS];
+   while (k < N * inc) {
+      x[k] = 1000 + k;
+      k ++;
+   }
+   x[8 * inc] = 0.0f;
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(9, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, negative_step_1_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = 1;
+   float x[ELEMENTS];
+   for (i = 0; i < N  * inc; i ++) {
+      x[i] = - i - 1000;
+   }
+
+   x[8 * inc] = -1.0f;
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(9, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, positive_step_2_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   float x[ELEMENTS * INCREMENT];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = i + 1000;
+   }
+
+   x[8 * inc] = 0.0f;
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(9, index);
+}
+
+/**
+ * Fortran API specific test
+ * Test isamin by comparing it against pre-calculated values
+ */
+CTEST(isamin, negative_step_2_N_50){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   float x[ELEMENTS * INCREMENT];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = - i - 1000;
+   }
+
+   x[8 * inc] = -1.0f;
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(9, index);
+}
+
+/**
+ * Fortran API test: isamin with inc = INCREMENT when the minimum is the
+ * last strided element; the expected 1-based index is N.
+ */
+CTEST(isamin, min_idx_in_vec_tail){
+   blasint i;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   float x[ELEMENTS * INCREMENT];
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = i + 1000;
+   }
+
+   x[(N - 1) * inc] = 0.0f;
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   ASSERT_EQUAL(N, index);
+}
+
+/**
+ * Fortran API test: isamin on a heap-allocated, unit-stride vector whose
+ * minimum is the last element; the expected 1-based index is N.
+ */
+CTEST(isamin, min_idx_in_vec_tail_inc_1){
+   blasint i, N = ELEMENTS, inc = 1;
+   float *x = (float*) malloc(ELEMENTS * inc * sizeof(float));
+   ASSERT_NOT_NULL(x);   /* fail cleanly rather than write through NULL */
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = i + 1000;
+   }
+
+   x[(N - 1) * inc] = 0.0f;
+   blasint index = BLASFUNC(isamin)(&N, x, &inc);
+   free(x);              /* freed before asserting on the saved result */
+   ASSERT_EQUAL(N, index);
+}
+
+/**
+ * C API test: cblas_isamin called with N = 0.
+ * The vector is fully initialised, yet the expected result is 0.
+ */
+CTEST(isamin, c_api_bad_args_N_0){
+   blasint i;
+   blasint N = 0, inc = 1;
+   float x[ELEMENTS];
+   for (i = 0; i < ELEMENTS * inc; i ++) {
+      x[i] = 1000 - i;
+   }
+   blasint index = cblas_isamin(N, x, inc);
+   ASSERT_EQUAL(0, index);
+}
+
+/**
+ * C API test: cblas_isamin called with a zero increment (inc = 0).
+ * The expected result is 0 even though x[8] holds the smallest value.
+ */
+CTEST(isamin, c_api_step_zero){
+   blasint i;
+   blasint N = ELEMENTS, inc = 0;
+   float x[ELEMENTS];
+   for (i = 0; i < N; i ++) {
+      x[i] = i + 1000;
+   }
+   x[8] = 0.0f;
+   blasint index = cblas_isamin(N, x, inc);
+   ASSERT_EQUAL(0, index);
+}
+
+/**
+ * C API specific test
+ * cblas_isamin, N = 1, inc = 1, single positive value: expects index 0
+ */
+CTEST(isamin, c_api_positive_step_1_N_1){
+   float v[] = {1.1f};
+   blasint n = 1, incx = 1;
+   blasint pos = cblas_isamin(n, v, incx);
+
+   ASSERT_EQUAL(0, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_isamin, N = 1, inc = 1, single negative value: expects index 0
+ */
+CTEST(isamin, c_api_negative_step_1_N_1){
+   float v[] = {-1.1f};
+   blasint n = 1, incx = 1;
+   blasint pos = cblas_isamin(n, v, incx);
+
+   ASSERT_EQUAL(0, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_isamin, N = 1, inc = 2, single positive value: expects index 0
+ */
+CTEST(isamin, c_api_positive_step_2_N_1){
+   float v[] = {1.1f, 0.0f};
+   blasint n = 1, incx = 2;
+   blasint pos = cblas_isamin(n, v, incx);
+
+   ASSERT_EQUAL(0, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_isamin, N = 1, inc = 2, single negative value: expects index 0
+ */
+CTEST(isamin, c_api_negative_step_2_N_1){
+   float v[] = {-1.1f, 0.0f};
+   blasint n = 1, incx = 2;
+   blasint pos = cblas_isamin(n, v, incx);
+
+   ASSERT_EQUAL(0, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_isamin, N = 2, inc = 1, positive values: expects index 1
+ */
+CTEST(isamin, c_api_positive_step_1_N_2){
+   float v[] = {1.1f, 1.0f};
+   blasint n = 2, incx = 1;
+   blasint pos = cblas_isamin(n, v, incx);
+
+   ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_isamin, N = 2, inc = 1, mixed-sign values: expects index 1
+ */
+CTEST(isamin, c_api_negative_step_1_N_2){
+   float v[] = {-1.1f, 1.0f};
+   blasint n = 2, incx = 1;
+   blasint pos = cblas_isamin(n, v, incx);
+
+   ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_isamin, N = 2, inc = 2, positive strided values: expects index 1
+ */
+CTEST(isamin, c_api_positive_step_2_N_2){
+   float v[] = {1.1f, 0.0f, 1.0f, 0.0f};
+   blasint n = 2, incx = 2;
+   blasint pos = cblas_isamin(n, v, incx);
+
+   ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_isamin, N = 2, inc = 2, mixed-sign strided values: expects index 1
+ */
+CTEST(isamin, c_api_negative_step_2_N_2){
+   float v[] = {-1.1f, 0.0f, 1.0f, 0.0f};
+   blasint n = 2, incx = 2;
+   blasint pos = cblas_isamin(n, v, incx);
+
+   ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_isamin, N = 3, inc = 1, positive values: expects index 1
+ */
+CTEST(isamin, c_api_positive_step_1_N_3){
+   float v[] = {1.1f, 1.0f, 2.2f};
+   blasint n = 3, incx = 1;
+   blasint pos = cblas_isamin(n, v, incx);
+
+   ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_isamin, N = 3, inc = 1, mixed-sign values: expects index 1
+ */
+CTEST(isamin, c_api_negative_step_1_N_3){
+   float v[] = {-1.1f, 1.0f, -2.2f};
+   blasint n = 3, incx = 1;
+   blasint pos = cblas_isamin(n, v, incx);
+
+   ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_isamin, N = 3, inc = 2, positive strided values: expects index 1
+ */
+CTEST(isamin, c_api_positive_step_2_N_3){
+   float v[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f};
+   blasint n = 3, incx = 2;
+   blasint pos = cblas_isamin(n, v, incx);
+
+   ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_isamin, N = 3, inc = 2, mixed-sign strided values: expects index 1
+ */
+CTEST(isamin, c_api_negative_step_2_N_3){
+   float v[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f};
+   blasint n = 3, incx = 2;
+   blasint pos = cblas_isamin(n, v, incx);
+
+   ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_isamin, N = 4, inc = 1, positive values: expects index 1
+ */
+CTEST(isamin, c_api_positive_step_1_N_4){
+   float v[] = {1.1f, 1.0f, 2.2f, 3.3f};
+   blasint n = 4, incx = 1;
+   blasint pos = cblas_isamin(n, v, incx);
+
+   ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_isamin, N = 4, inc = 1, mixed-sign values: expects index 1
+ */
+CTEST(isamin, c_api_negative_step_1_N_4){
+   float v[] = {-1.1f, 1.0f, -2.2f, -3.3f};
+   blasint n = 4, incx = 1;
+   blasint pos = cblas_isamin(n, v, incx);
+
+   ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_isamin, N = 4, inc = 2, positive strided values: expects index 1
+ */
+CTEST(isamin, c_api_positive_step_2_N_4){
+   float v[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f, 3.3f, 0.0f};
+   blasint n = 4, incx = 2;
+   blasint pos = cblas_isamin(n, v, incx);
+
+   ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_isamin, N = 4, inc = 2, mixed-sign strided values: expects index 1
+ */
+CTEST(isamin, c_api_negative_step_2_N_4){
+   float v[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f, -3.3f, 0.0f};
+   blasint n = 4, incx = 2;
+   blasint pos = cblas_isamin(n, v, incx);
+
+   ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_isamin, N = 5, inc = 1, zero in the last element: expects index 4
+ */
+CTEST(isamin, c_api_positive_step_1_N_5){
+   float v[] = {1.1f, 1.0f, 2.2f, 3.3f, 0.0f};
+   blasint n = 5, incx = 1;
+   blasint pos = cblas_isamin(n, v, incx);
+
+   ASSERT_EQUAL(4, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_isamin, N = 5, inc = 1, mixed signs, zero last: expects index 4
+ */
+CTEST(isamin, c_api_negative_step_1_N_5){
+   float v[] = {-1.1f, 1.0f, -2.2f, -3.3f, 0.0f};
+   blasint n = 5, incx = 1;
+   blasint pos = cblas_isamin(n, v, incx);
+
+   ASSERT_EQUAL(4, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_isamin, N = 5, inc = 2, zero in the last strided slot: expects 4
+ */
+CTEST(isamin, c_api_positive_step_2_N_5){
+   float v[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f, 3.3f, 0.0f, 0.0f, 0.0f};
+   blasint n = 5, incx = 2;
+   blasint pos = cblas_isamin(n, v, incx);
+
+   ASSERT_EQUAL(4, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_isamin, N = 5, inc = 2, mixed signs, zero in last slot: expects 4
+ */
+CTEST(isamin, c_api_negative_step_2_N_5){
+   float v[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f, -3.3f, 0.0f, 0.0f, 0.0f};
+   blasint n = 5, incx = 2;
+   blasint pos = cblas_isamin(n, v, incx);
+
+   ASSERT_EQUAL(4, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_isamin, positive ramp, inc = 1, zero at element 8: expects index 8
+ */
+CTEST(isamin, c_api_positive_step_1_N_50){
+    float v[ELEMENTS];
+    blasint n = ELEMENTS, incx = 1;
+    blasint k, pos;
+    /* Fill with increasing positive values, then plant the minimum. */
+    for (k = 0; k < n * incx; k++)
+        v[k] = k + 1000;
+    v[8 * incx] = 0.0f;
+
+    pos = cblas_isamin(n, v, incx);
+    ASSERT_EQUAL(8, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_isamin, negative ramp, inc = 1, -1.0f at element 8: expects index 8
+ */
+CTEST(isamin, c_api_negative_step_1_N_50){
+    float v[ELEMENTS];
+    blasint n = ELEMENTS, incx = 1;
+    blasint k, pos;
+    /* Fill with decreasing negative values, then plant the minimum. */
+    for (k = 0; k < n * incx; k++)
+        v[k] = - k - 1000;
+    v[8 * incx] = -1.0f;
+
+    pos = cblas_isamin(n, v, incx);
+    ASSERT_EQUAL(8, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_isamin, positive ramp, inc = INCREMENT, zero at strided element 8
+ */
+CTEST(isamin, c_api_positive_step_2_N_50){
+   float v[ELEMENTS * INCREMENT];
+   blasint n = ELEMENTS, incx = INCREMENT;
+   blasint k, pos;
+   /* Fill with increasing positive values, then plant the minimum. */
+   for (k = 0; k < n * incx; k++)
+      v[k] = k + 1000;
+   v[8 * incx] = 0.0f;
+
+   pos = cblas_isamin(n, v, incx);
+   ASSERT_EQUAL(8, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_isamin, negative ramp, inc = INCREMENT, -1.0f at strided element 8
+ */
+CTEST(isamin, c_api_negative_step_2_N_50){
+   float v[ELEMENTS * INCREMENT];
+   blasint n = ELEMENTS, incx = INCREMENT;
+   blasint k, pos;
+   /* Fill with decreasing negative values, then plant the minimum. */
+   for (k = 0; k < n * incx; k++)
+      v[k] = - k - 1000;
+   v[8 * incx] = -1.0f;
+
+   pos = cblas_isamin(n, v, incx);
+   ASSERT_EQUAL(8, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_isamin, inc = INCREMENT, minimum in the last strided element
+ */
+CTEST(isamin, c_api_min_idx_in_vec_tail){
+   float v[ELEMENTS * INCREMENT];
+   blasint n = ELEMENTS, incx = INCREMENT;
+   blasint k, pos;
+   /* Increasing positive ramp, then zero out the final strided element. */
+   for (k = 0; k < n * incx; k++)
+      v[k] = k + 1000;
+   v[(n - 1) * incx] = 0.0f;
+
+   pos = cblas_isamin(n, v, incx);
+   ASSERT_EQUAL(n - 1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_isamin on a heap buffer, inc = 1, minimum last: expects N - 1
+ */
+CTEST(isamin, c_api_min_idx_in_vec_tail_inc_1){
+   blasint i, index;
+   blasint N = ELEMENTS, inc = 1;
+   float *x = (float*)malloc(ELEMENTS * inc * sizeof(float));
+   ASSERT_NOT_NULL(x);   /* fail the test instead of dereferencing NULL */
+   for (i = 0; i < N * inc; i ++) {
+      x[i] = i + 1000;
+   }
+   x[(N - 1) * inc] = 0.0f;
+   index = cblas_isamin(N, x, inc);
+   free(x);   /* release the buffer before checking the result */
+   ASSERT_EQUAL(N - 1, index);
+}
+#endif
diff --git a/utest/test_extensions/test_izamin.c b/utest/test_extensions/test_izamin.c
new file mode 100644
index 000000000..a0bdae8e2
--- /dev/null
+++ b/utest/test_extensions/test_izamin.c
@@ -0,0 +1,625 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#define ELEMENTS 50
+#define INCREMENT 2
+
+#ifdef BUILD_COMPLEX16
+
+/**
+ * Fortran API specific test
+ * izamin called with N = 0 on a populated buffer: expects 0
+ */
+CTEST(izamin, bad_args_N_0){
+   double z[ELEMENTS * 2];
+   blasint n = 0, incx = 1;
+   blasint k, pos;
+   /* The buffer is filled even though the call passes n == 0. */
+   for (k = 0; k < ELEMENTS * incx * 2; k++)
+      z[k] = 1000 - k;
+   pos = BLASFUNC(izamin)(&n, z, &incx);
+   ASSERT_EQUAL(0, pos);
+}
+
+/**
+ * Fortran API specific test
+ * izamin called with inc = 0: expects 0
+ */
+CTEST(izamin, step_zero){
+   double z[ELEMENTS * 2];
+   blasint n = ELEMENTS, incx = 0;
+   blasint k, pos;
+   /* The buffer holds a ramp; the test expects 0 for a zero stride. */
+   for (k = 0; k < n * 2; k++)
+      z[k] = k - 1000;
+   pos = BLASFUNC(izamin)(&n, z, &incx);
+   ASSERT_EQUAL(0, pos);
+}
+
+/**
+ * Fortran API specific test
+ * izamin, N = 1, inc = 1, one element with positive parts: expects 1
+ */
+CTEST(izamin, positive_step_1_N_1){
+   double z[] = {1.0, 2.0};
+   blasint n = 1, incx = 1;
+   blasint pos = BLASFUNC(izamin)(&n, z, &incx);
+   ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * Fortran API specific test
+ * izamin, N = 1, inc = 1, one element with negative parts: expects 1
+ */
+CTEST(izamin, negative_step_1_N_1){
+   double z[] = {-1.0, -2.0};
+   blasint n = 1, incx = 1;
+   blasint pos = BLASFUNC(izamin)(&n, z, &incx);
+   ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * Fortran API specific test
+ * izamin, N = 1, inc = 2, one element with positive parts: expects 1
+ */
+CTEST(izamin, positive_step_2_N_1){
+   double z[] = {1.0, 2.0, 0.0, 0.0};
+   blasint n = 1, incx = 2;
+   blasint pos = BLASFUNC(izamin)(&n, z, &incx);
+   ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * Fortran API specific test
+ * izamin, N = 1, inc = 2, one element with negative parts: expects 1
+ */
+CTEST(izamin, negative_step_2_N_1){
+   double z[] = {-1.0, -2.0, 0.0, 0.0};
+   blasint n = 1, incx = 2;
+   blasint pos = BLASFUNC(izamin)(&n, z, &incx);
+   ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * Fortran API specific test
+ * izamin, N = 2, inc = 1, second element is all zero: expects 2
+ */
+CTEST(izamin, positive_step_1_N_2){
+   double z[] = {1.0, 2.0, 0.0, 0.0};
+   blasint n = 2, incx = 1;
+   blasint pos = BLASFUNC(izamin)(&n, z, &incx);
+   ASSERT_EQUAL(2, pos);
+}
+
+/**
+ * Fortran API specific test
+ * izamin, N = 2, inc = 1, negative first element, zero second: expects 2
+ */
+CTEST(izamin, negative_step_1_N_2){
+   double z[] = {-1.0, -2.0, 0.0, 0.0};
+   blasint n = 2, incx = 1;
+   blasint pos = BLASFUNC(izamin)(&n, z, &incx);
+   ASSERT_EQUAL(2, pos);
+}
+
+/**
+ * Fortran API specific test
+ * izamin, N = 2, inc = 2, strided elements (1,2) and (1,1): expects 2
+ */
+CTEST(izamin, positive_step_2_N_2){
+   double z[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0};
+   blasint n = 2, incx = 2;
+   blasint pos = BLASFUNC(izamin)(&n, z, &incx);
+   ASSERT_EQUAL(2, pos);
+}
+
+/**
+ * Fortran API specific test
+ * izamin, N = 2, inc = 2, strided elements (-1,-2) and (-1,-1): expects 2
+ */
+CTEST(izamin, negative_step_2_N_2){
+   double z[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0};
+   blasint n = 2, incx = 2;
+   blasint pos = BLASFUNC(izamin)(&n, z, &incx);
+   ASSERT_EQUAL(2, pos);
+}
+
+/**
+ * Fortran API specific test
+ * izamin, N = 3, inc = 1, second element is all zero: expects 2
+ */
+CTEST(izamin, positive_step_1_N_3){
+   double z[] = {1.0, 2.0, 0.0, 0.0, 2.0, 1.0};
+   blasint n = 3, incx = 1;
+   blasint pos = BLASFUNC(izamin)(&n, z, &incx);
+   ASSERT_EQUAL(2, pos);
+}
+
+/**
+ * Fortran API specific test
+ * izamin, N = 3, inc = 1, negative data, zero second element: expects 2
+ */
+CTEST(izamin, negative_step_1_N_3){
+   double z[] = {-1.0, -2.0, 0.0, 0.0, -2.0, -1.0};
+   blasint n = 3, incx = 1;
+   blasint pos = BLASFUNC(izamin)(&n, z, &incx);
+   ASSERT_EQUAL(2, pos);
+}
+
+/**
+ * Fortran API specific test
+ * izamin, N = 3, inc = 2, strided elements (1,2) (1,1) (2,1): expects 2
+ */
+CTEST(izamin, positive_step_2_N_3){
+   double z[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0};
+   blasint n = 3, incx = 2;
+   blasint pos = BLASFUNC(izamin)(&n, z, &incx);
+   ASSERT_EQUAL(2, pos);
+}
+
+/**
+ * Fortran API specific test
+ * izamin, N = 3, inc = 2, strided (-1,-2) (-1,-1) (-2,-1): expects 2
+ */
+CTEST(izamin, negative_step_2_N_3){
+   double z[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -2.0, -1.0, 0.0, 0.0};
+   blasint n = 3, incx = 2;
+   blasint pos = BLASFUNC(izamin)(&n, z, &incx);
+   ASSERT_EQUAL(2, pos);
+}
+
+/**
+ * Fortran API specific test
+ * izamin, N = 4, inc = 1, second element is all zero: expects 2
+ */
+CTEST(izamin, positive_step_1_N_4){
+   double z[] = {1.0, 2.0, 0.0, 0.0, 2.0, 1.0, -2.0, -2.0};
+   blasint n = 4, incx = 1;
+   blasint pos = BLASFUNC(izamin)(&n, z, &incx);
+   ASSERT_EQUAL(2, pos);
+}
+
+/**
+ * Fortran API specific test
+ * izamin, N = 4, inc = 1, negative data, zero second element: expects 2
+ */
+CTEST(izamin, negative_step_1_N_4){
+   double z[] = {-1.0, -2.0, 0.0, 0.0, -2.0, -1.0, -2.0, -2.0};
+   blasint n = 4, incx = 1;
+   blasint pos = BLASFUNC(izamin)(&n, z, &incx);
+   ASSERT_EQUAL(2, pos);
+}
+
+/**
+ * Fortran API specific test
+ * izamin, N = 4, inc = 2, strided (1,2) (1,1) (2,1) (-2,-2): expects 2
+ */
+CTEST(izamin, positive_step_2_N_4){
+   double z[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0, -2.0, -2.0, 0.0, 0.0};
+   blasint n = 4, incx = 2;
+   blasint pos = BLASFUNC(izamin)(&n, z, &incx);
+   ASSERT_EQUAL(2, pos);
+}
+
+/**
+ * Fortran API specific test
+ * izamin, N = 4, inc = 2, strided (-1,-2) (-1,-1) (-2,-1) (-2,-2): expects 2
+ */
+CTEST(izamin, negative_step_2_N_4){
+   double z[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -2.0, -1.0, 0.0, 0.0, -2.0, -2.0, 0.0, 0.0};
+   blasint n = 4, incx = 2;
+   blasint pos = BLASFUNC(izamin)(&n, z, &incx);
+   ASSERT_EQUAL(2, pos);
+}
+
+/**
+ * Fortran API specific test
+ * izamin, positive ramp, inc = 1, strided element 7 zeroed: expects 8
+ */
+CTEST(izamin, positive_step_1_N_50){
+   double z[ELEMENTS * 2];
+   blasint n = ELEMENTS, incx = 1;
+   blasint k, hole, pos;
+   for (k = 0; k < n * incx * 2; k++)
+      z[k] = k + 1000;
+   hole = 7 * incx * 2;   /* first double of strided element 7 */
+   z[hole] = 0.0;
+   z[hole + 1] = 0.0;
+   pos = BLASFUNC(izamin)(&n, z, &incx);
+   ASSERT_EQUAL(8, pos);
+}
+
+/**
+ * Fortran API specific test
+ * izamin, negative ramp, inc = 1, strided element 7 zeroed: expects 8
+ */
+CTEST(izamin, negative_step_1_N_50){
+   double z[ELEMENTS * 2];
+   blasint n = ELEMENTS, incx = 1;
+   blasint k, hole, pos;
+   for (k = 0; k < n * incx * 2; k++)
+      z[k] = - k - 1000;
+   hole = 7 * incx * 2;   /* first double of strided element 7 */
+   z[hole] = 0.0;
+   z[hole + 1] = 0.0;
+   pos = BLASFUNC(izamin)(&n, z, &incx);
+   ASSERT_EQUAL(8, pos);
+}
+
+/**
+ * Fortran API specific test
+ * izamin, positive ramp, inc = INCREMENT, strided element 7 zeroed: expects 8
+ */
+CTEST(izamin, positive_step_2_N_50){
+   double z[ELEMENTS * INCREMENT * 2];
+   blasint n = ELEMENTS, incx = INCREMENT;
+   blasint k, hole, pos;
+   for (k = 0; k < n * incx * 2; k++)
+      z[k] = k + 1000;
+   hole = 7 * incx * 2;   /* first double of strided element 7 */
+   z[hole] = 0.0;
+   z[hole + 1] = 0.0;
+   pos = BLASFUNC(izamin)(&n, z, &incx);
+   ASSERT_EQUAL(8, pos);
+}
+
+/**
+ * Fortran API specific test
+ * izamin, negative ramp, inc = INCREMENT, strided element 7 zeroed: expects 8
+ */
+CTEST(izamin, negative_step_2_N_50){
+   double z[ELEMENTS * INCREMENT * 2];
+   blasint n = ELEMENTS, incx = INCREMENT;
+   blasint k, hole, pos;
+   for (k = 0; k < n * incx * 2; k++)
+      z[k] = - k - 1000;
+   hole = 7 * incx * 2;   /* first double of strided element 7 */
+   z[hole] = 0.0;
+   z[hole + 1] = 0.0;
+   pos = BLASFUNC(izamin)(&n, z, &incx);
+   ASSERT_EQUAL(8, pos);
+}
+
+/**
+ * Fortran API specific test
+ * izamin, inc = INCREMENT, last strided element zeroed: expects N
+ */
+CTEST(izamin, min_idx_in_vec_tail){
+   double z[ELEMENTS * INCREMENT * 2];
+   blasint n = ELEMENTS, incx = INCREMENT;
+   blasint k, hole, pos;
+   for (k = 0; k < n * incx * 2; k++)
+      z[k] = k + 1000;
+
+   hole = (n - 1) * incx * 2;   /* first double of the last strided element */
+   z[hole] = 0.0;
+   z[hole + 1] = 0.0;
+   pos = BLASFUNC(izamin)(&n, z, &incx);
+   ASSERT_EQUAL(n, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_izamin called with N = 0 on a populated buffer: expects 0
+ */
+CTEST(izamin, c_api_bad_args_N_0){
+    double z[ELEMENTS * 2];
+    blasint n = 0, incx = 1;
+    blasint k, pos;
+    /* The buffer is filled even though the call passes n == 0. */
+    for (k = 0; k < ELEMENTS * incx * 2; k++)
+        z[k] = 1000 - k;
+    pos = cblas_izamin(n, z, incx);
+    ASSERT_EQUAL(0, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_izamin called with inc = 0: expects 0
+ */
+CTEST(izamin, c_api_step_zero){
+    double z[ELEMENTS * 2];
+    blasint n = ELEMENTS, incx = 0;
+    blasint k, pos;
+    /* The buffer holds a ramp; the test expects 0 for a zero stride. */
+    for (k = 0; k < n * 2; k++)
+        z[k] = k - 1000;
+    pos = cblas_izamin(n, z, incx);
+    ASSERT_EQUAL(0, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_izamin, N = 1, inc = 1, one element with positive parts: expects 0
+ */
+CTEST(izamin, c_api_positive_step_1_N_1){
+    double z[] = {1.0, 2.0};
+    blasint n = 1, incx = 1;
+    blasint pos = cblas_izamin(n, z, incx);
+    ASSERT_EQUAL(0, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_izamin, N = 1, inc = 1, one element with negative parts: expects 0
+ */
+CTEST(izamin, c_api_negative_step_1_N_1){
+    double z[] = {-1.0, -2.0};
+    blasint n = 1, incx = 1;
+    blasint pos = cblas_izamin(n, z, incx);
+    ASSERT_EQUAL(0, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_izamin, N = 1, inc = 2, one element with positive parts: expects 0
+ */
+CTEST(izamin, c_api_positive_step_2_N_1){
+    double z[] = {1.0, 2.0, 0.0, 0.0};
+    blasint n = 1, incx = 2;
+    blasint pos = cblas_izamin(n, z, incx);
+    ASSERT_EQUAL(0, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_izamin, N = 1, inc = 2, one element with negative parts: expects 0
+ */
+CTEST(izamin, c_api_negative_step_2_N_1){
+    double z[] = {-1.0, -2.0, 0.0, 0.0};
+    blasint n = 1, incx = 2;
+    blasint pos = cblas_izamin(n, z, incx);
+    ASSERT_EQUAL(0, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_izamin, N = 2, inc = 1, second element is all zero: expects 1
+ */
+CTEST(izamin, c_api_positive_step_1_N_2){
+    double z[] = {1.0, 2.0, 0.0, 0.0};
+    blasint n = 2, incx = 1;
+    blasint pos = cblas_izamin(n, z, incx);
+    ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_izamin, N = 2, inc = 1, negative first, zero second: expects 1
+ */
+CTEST(izamin, c_api_negative_step_1_N_2){
+    double z[] = {-1.0, -2.0, 0.0, 0.0};
+    blasint n = 2, incx = 1;
+    blasint pos = cblas_izamin(n, z, incx);
+    ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_izamin, N = 2, inc = 2, strided elements (1,2) and (1,1): expects 1
+ */
+CTEST(izamin, c_api_positive_step_2_N_2){
+    double z[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0};
+    blasint n = 2, incx = 2;
+    blasint pos = cblas_izamin(n, z, incx);
+    ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_izamin, N = 2, inc = 2, strided (-1,-2) and (-1,-1): expects 1
+ */
+CTEST(izamin, c_api_negative_step_2_N_2){
+    double z[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0};
+    blasint n = 2, incx = 2;
+    blasint pos = cblas_izamin(n, z, incx);
+    ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_izamin, N = 3, inc = 1, second element is all zero: expects 1
+ */
+CTEST(izamin, c_api_positive_step_1_N_3){
+    double z[] = {1.0, 2.0, 0.0, 0.0, 2.0, 1.0};
+    blasint n = 3, incx = 1;
+    blasint pos = cblas_izamin(n, z, incx);
+    ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_izamin, N = 3, inc = 1, negative data, zero second: expects 1
+ */
+CTEST(izamin, c_api_negative_step_1_N_3){
+    double z[] = {-1.0, -2.0, 0.0, 0.0, -2.0, -1.0};
+    blasint n = 3, incx = 1;
+    blasint pos = cblas_izamin(n, z, incx);
+    ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_izamin, N = 3, inc = 2, strided (1,2) (1,1) (2,1): expects 1
+ */
+CTEST(izamin, c_api_positive_step_2_N_3){
+    double z[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0};
+    blasint n = 3, incx = 2;
+    blasint pos = cblas_izamin(n, z, incx);
+    ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_izamin, N = 3, inc = 2, strided (-1,-2) (-1,-1) (-2,-1): expects 1
+ */
+CTEST(izamin, c_api_negative_step_2_N_3){
+    double z[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -2.0, -1.0, 0.0, 0.0};
+    blasint n = 3, incx = 2;
+    blasint pos = cblas_izamin(n, z, incx);
+    ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_izamin, N = 4, inc = 1, second element is all zero: expects 1
+ */
+CTEST(izamin, c_api_positive_step_1_N_4){
+    double z[] = {1.0, 2.0, 0.0, 0.0, 2.0, 1.0, -2.0, -2.0};
+    blasint n = 4, incx = 1;
+    blasint pos = cblas_izamin(n, z, incx);
+    ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_izamin, N = 4, inc = 1, negative data, zero second: expects 1
+ */
+CTEST(izamin, c_api_negative_step_1_N_4){
+    double z[] = {-1.0, -2.0, 0.0, 0.0, -2.0, -1.0, -2.0, -2.0};
+    blasint n = 4, incx = 1;
+    blasint pos = cblas_izamin(n, z, incx);
+    ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_izamin, N = 4, inc = 2, strided (1,2) (1,1) (2,1) (-2,-2): expects 1
+ */
+CTEST(izamin, c_api_positive_step_2_N_4){
+    double z[] = {1.0, 2.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 2.0, 1.0, 0.0, 0.0, -2.0, -2.0, 0.0, 0.0};
+    blasint n = 4, incx = 2;
+    blasint pos = cblas_izamin(n, z, incx);
+    ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_izamin, N = 4, inc = 2, strided (-1,-2) (-1,-1) (-2,-1) (-2,-2)
+ */
+CTEST(izamin, c_api_negative_step_2_N_4){
+    double z[] = {-1.0, -2.0, 0.0, 0.0, -1.0, -1.0, 0.0, 0.0, -2.0, -1.0, 0.0, 0.0, -2.0, -2.0, 0.0, 0.0};
+    blasint n = 4, incx = 2;
+    blasint pos = cblas_izamin(n, z, incx);
+    ASSERT_EQUAL(1, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_izamin, positive ramp, inc = 1, strided element 7 zeroed: expects 7
+ */
+CTEST(izamin, c_api_positive_step_1_N_50){
+    double z[ELEMENTS * 2];
+    blasint n = ELEMENTS, incx = 1;
+    blasint k, hole, pos;
+    for (k = 0; k < n * incx * 2; k++)
+        z[k] = k + 1000;
+    hole = 7 * incx * 2;   /* first double of strided element 7 */
+    z[hole] = 0.0;
+    z[hole + 1] = 0.0;
+    pos = cblas_izamin(n, z, incx);
+    ASSERT_EQUAL(7, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_izamin, negative ramp, inc = 1, strided element 7 zeroed: expects 7
+ */
+CTEST(izamin, c_api_negative_step_1_N_50){
+    double z[ELEMENTS * 2];
+    blasint n = ELEMENTS, incx = 1;
+    blasint k, hole, pos;
+    for (k = 0; k < n * incx * 2; k++)
+        z[k] = - k - 1000;
+    hole = 7 * incx * 2;   /* first double of strided element 7 */
+    z[hole] = 0.0;
+    z[hole + 1] = 0.0;
+    pos = cblas_izamin(n, z, incx);
+    ASSERT_EQUAL(7, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_izamin, positive ramp, inc = INCREMENT, strided element 7 zeroed
+ */
+CTEST(izamin, c_api_positive_step_2_N_50){
+    double z[ELEMENTS * INCREMENT * 2];
+    blasint n = ELEMENTS, incx = INCREMENT;
+    blasint k, hole, pos;
+    for (k = 0; k < n * incx * 2; k++)
+        z[k] = k + 1000;
+    hole = 7 * incx * 2;   /* first double of strided element 7 */
+    z[hole] = 0.0;
+    z[hole + 1] = 0.0;
+    pos = cblas_izamin(n, z, incx);
+    ASSERT_EQUAL(7, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_izamin, negative ramp, inc = INCREMENT, strided element 7 zeroed
+ */
+CTEST(izamin, c_api_negative_step_2_N_50){
+    double z[ELEMENTS * INCREMENT * 2];
+    blasint n = ELEMENTS, incx = INCREMENT;
+    blasint k, hole, pos;
+    for (k = 0; k < n * incx * 2; k++)
+        z[k] = - k - 1000;
+    hole = 7 * incx * 2;   /* first double of strided element 7 */
+    z[hole] = 0.0;
+    z[hole + 1] = 0.0;
+    pos = cblas_izamin(n, z, incx);
+    ASSERT_EQUAL(7, pos);
+}
+
+/**
+ * C API specific test
+ * cblas_izamin, inc = INCREMENT, last strided element zeroed: expects N - 1
+ */
+CTEST(izamin, c_api_min_idx_in_vec_tail){
+   double z[ELEMENTS * INCREMENT * 2];
+   blasint n = ELEMENTS, incx = INCREMENT;
+   blasint k, hole, pos;
+   for (k = 0; k < n * incx * 2; k++)
+      z[k] = k + 1000;
+
+   hole = (n - 1) * incx * 2;   /* first double of the last strided element */
+   z[hole] = 0.0;
+   z[hole + 1] = 0.0;
+   pos = cblas_izamin(n, z, incx);
+   ASSERT_EQUAL(n - 1, pos);
+}
+#endif
diff --git a/utest/test_extensions/test_samin.c b/utest/test_extensions/test_samin.c
new file mode 100644
index 000000000..fd34d462a
--- /dev/null
+++ b/utest/test_extensions/test_samin.c
@@ -0,0 +1,354 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#define ELEMENTS 70
+#define INCREMENT 2
+
+#ifdef BUILD_SINGLE
+
+/**
+ * samin called with N = 0 on a populated buffer: expects 0.0f
+ */
+CTEST(samin, bad_args_N_0){
+   float v[ELEMENTS], result;
+   blasint n = 0, incx = 1;
+   blasint k;
+   /* The buffer is filled even though the call passes n == 0. */
+   for (k = 0; k < ELEMENTS * incx; k++)
+      v[k] = 1000 - k;
+   result = BLASFUNC(samin)(&n, v, &incx);
+   ASSERT_DBL_NEAR_TOL(0.0f, result, SINGLE_EPS);
+}
+
+/**
+ * samin called with inc = 0: expects the magnitude of the first element
+ */
+CTEST(samin, step_zero){
+   float v[ELEMENTS], result;
+   blasint n = ELEMENTS, incx = 0;
+   blasint k;
+   for (k = 0; k < n; k++)
+      v[k] = k - 1000;
+   v[8] = 0.0;
+   /* Element 8 is zero, yet the expected value is |v[0]|. */
+   result = BLASFUNC(samin)(&n, v, &incx);
+   ASSERT_DBL_NEAR_TOL(fabsf(v[0]), result, SINGLE_EPS);
+}
+
+/**
+ * samin, N = 1, inc = 1, single positive value: expects 1.1f
+ */
+CTEST(samin, positive_step_1_N_1){
+   float v[] = {1.1f};
+   blasint n = 1, incx = 1;
+   float result = BLASFUNC(samin)(&n, v, &incx);
+
+   ASSERT_DBL_NEAR_TOL(1.1f, result, SINGLE_EPS);
+}
+
+/**
+ * samin, N = 1, inc = 1, single negative value: expects 1.1f
+ */
+CTEST(samin, negative_step_1_N_1){
+   float v[] = {-1.1f};
+   blasint n = 1, incx = 1;
+   float result = BLASFUNC(samin)(&n, v, &incx);
+
+   ASSERT_DBL_NEAR_TOL(1.1f, result, SINGLE_EPS);
+}
+
+/**
+ * samin, N = 1, inc = 2, single positive value: expects 1.1f
+ */
+CTEST(samin, positive_step_2_N_1){
+   float v[] = {1.1f, 0.0f};
+   blasint n = 1, incx = 2;
+   float result = BLASFUNC(samin)(&n, v, &incx);
+
+   ASSERT_DBL_NEAR_TOL(1.1f, result, SINGLE_EPS);
+}
+
+/**
+ * samin, N = 1, inc = 2, single negative value: expects 1.1f
+ */
+CTEST(samin, negative_step_2_N_1){
+   float v[] = {-1.1f, 0.0f};
+   blasint n = 1, incx = 2;
+   float result = BLASFUNC(samin)(&n, v, &incx);
+
+   ASSERT_DBL_NEAR_TOL(1.1f, result, SINGLE_EPS);
+}
+
+/**
+ * samin, N = 2, inc = 1, positive values: expects 1.0f
+ */
+CTEST(samin, positive_step_1_N_2){
+   float v[] = {1.1f, 1.0f};
+   blasint n = 2, incx = 1;
+   float result = BLASFUNC(samin)(&n, v, &incx);
+
+   ASSERT_DBL_NEAR_TOL(1.0f, result, SINGLE_EPS);
+}
+
+/**
+ * samin, N = 2, inc = 1, mixed-sign values: expects 1.0f
+ */
+CTEST(samin, negative_step_1_N_2){
+   float v[] = {-1.1f, 1.0f};
+   blasint n = 2, incx = 1;
+   float result = BLASFUNC(samin)(&n, v, &incx);
+
+   ASSERT_DBL_NEAR_TOL(1.0f, result, SINGLE_EPS);
+}
+
+/**
+ * samin, N = 2, inc = 2, positive strided values: expects 1.0f
+ */
+CTEST(samin, positive_step_2_N_2){
+   float v[] = {1.1f, 0.0f, 1.0f, 0.0f};
+   blasint n = 2, incx = 2;
+   float result = BLASFUNC(samin)(&n, v, &incx);
+
+   ASSERT_DBL_NEAR_TOL(1.0f, result, SINGLE_EPS);
+}
+
+/**
+ * samin, N = 2, inc = 2, mixed-sign strided values: expects 1.0f
+ */
+CTEST(samin, negative_step_2_N_2){
+   float v[] = {-1.1f, 0.0f, 1.0f, 0.0f};
+   blasint n = 2, incx = 2;
+   float result = BLASFUNC(samin)(&n, v, &incx);
+
+   ASSERT_DBL_NEAR_TOL(1.0f, result, SINGLE_EPS);
+}
+
+/**
+ * samin, N = 3, inc = 1, positive values: expects 1.0f
+ */
+CTEST(samin, positive_step_1_N_3){
+   float v[] = {1.1f, 1.0f, 2.2f};
+   blasint n = 3, incx = 1;
+   float result = BLASFUNC(samin)(&n, v, &incx);
+
+   ASSERT_DBL_NEAR_TOL(1.0f, result, SINGLE_EPS);
+}
+
+/**
+ * samin, N = 3, inc = 1, mixed-sign values: expects 1.0f
+ */
+CTEST(samin, negative_step_1_N_3){
+   float v[] = {-1.1f, 1.0f, -2.2f};
+   blasint n = 3, incx = 1;
+   float result = BLASFUNC(samin)(&n, v, &incx);
+
+   ASSERT_DBL_NEAR_TOL(1.0f, result, SINGLE_EPS);
+}
+
+/**
+ * samin, N = 3, inc = 2, positive strided values: expects 1.0f
+ */
+CTEST(samin, positive_step_2_N_3){
+   float v[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f};
+   blasint n = 3, incx = 2;
+   float result = BLASFUNC(samin)(&n, v, &incx);
+
+   ASSERT_DBL_NEAR_TOL(1.0f, result, SINGLE_EPS);
+}
+
+/**
+ * samin, N = 3, inc = 2, mixed-sign strided values: expects 1.0f
+ */
+CTEST(samin, negative_step_2_N_3){
+   float v[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f};
+   blasint n = 3, incx = 2;
+   float result = BLASFUNC(samin)(&n, v, &incx);
+
+   ASSERT_DBL_NEAR_TOL(1.0f, result, SINGLE_EPS);
+}
+
+/**
+ * samin, N = 4, unit stride, positive data: minimum magnitude is 1.0
+ */
+CTEST(samin, positive_step_1_N_4){
+   float x[] = {1.1f, 1.0f, 2.2f, 3.3f};
+   blasint N = 4, inc = 1;
+
+   float min_abs = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0f, min_abs, SINGLE_EPS);
+}
+
+/**
+ * samin, N = 4, unit stride, mixed-sign data: minimum magnitude is 1.0
+ */
+CTEST(samin, negative_step_1_N_4){
+   float x[] = {-1.1f, 1.0f, -2.2f, -3.3f};
+   blasint N = 4, inc = 1;
+
+   float min_abs = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0f, min_abs, SINGLE_EPS);
+}
+
+/**
+ * samin, N = 4, stride 2, positive data: zero fillers at odd indices are skipped, minimum is 1.0
+ */
+CTEST(samin, positive_step_2_N_4){
+   float x[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f, 3.3f, 0.0f};
+   blasint N = 4, inc = 2;
+
+   float min_abs = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0f, min_abs, SINGLE_EPS);
+}
+
+/**
+ * samin, N = 4, stride 2, mixed-sign data: zero fillers are skipped, minimum is 1.0
+ */
+CTEST(samin, negative_step_2_N_4){
+   float x[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f, -3.3f, 0.0f};
+   blasint N = 4, inc = 2;
+
+   float min_abs = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0f, min_abs, SINGLE_EPS);
+}
+
+/**
+ * samin, N = 5, unit stride, positive data ending in 0.0: minimum magnitude is 0.0
+ */
+CTEST(samin, positive_step_1_N_5){
+   float x[] = {1.1f, 1.0f, 2.2f, 3.3f, 0.0f};
+   blasint N = 5, inc = 1;
+
+   float min_abs = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, min_abs, SINGLE_EPS);
+}
+
+/**
+ * samin, N = 5, unit stride, mixed-sign data ending in 0.0: minimum magnitude is 0.0
+ */
+CTEST(samin, negative_step_1_N_5){
+   float x[] = {-1.1f, 1.0f, -2.2f, -3.3f, 0.0f};
+   blasint N = 5, inc = 1;
+
+   float min_abs = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, min_abs, SINGLE_EPS);
+}
+
+/**
+ * samin, N = 5, stride 2, positive data whose fifth strided element is 0.0: minimum is 0.0
+ */
+CTEST(samin, positive_step_2_N_5){
+   float x[] = {1.1f, 0.0f, 1.0f, 0.0f, 2.2f, 0.0f, 3.3f, 0.0f, 0.0f, 0.0f};
+   blasint N = 5, inc = 2;
+
+   float min_abs = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, min_abs, SINGLE_EPS);
+}
+
+/**
+ * samin, N = 5, stride 2, mixed-sign data whose fifth strided element is 0.0: minimum is 0.0
+ */
+CTEST(samin, negative_step_2_N_5){
+   float x[] = {-1.1f, 0.0f, 1.0f, 0.0f, -2.2f, 0.0f, -3.3f, 0.0f, 0.0f, 0.0f};
+   blasint N = 5, inc = 2;
+
+   float min_abs = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, min_abs, SINGLE_EPS);
+}
+
+/**
+ * samin, N = ELEMENTS, unit stride: ramp x[i] = i + 1000 with a 0.0 planted
+ * at index 8, which must come back as the minimum magnitude
+ */
+CTEST(samin, positive_step_1_N_70){
+   blasint k;
+   blasint N = ELEMENTS, inc = 1;
+   float x[ELEMENTS];
+   for (k = 0; k < N * inc; k ++) {
+      x[k] = k + 1000;
+   }
+   x[8 * inc] = 0.0f;
+   float min_abs = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, min_abs, SINGLE_EPS);
+}
+
+/**
+ * samin, N = ELEMENTS, unit stride: ramp x[i] = -i - 1000 with a -1.0 planted
+ * at index 8, so the minimum magnitude must be 1.0
+ */
+CTEST(samin, negative_step_1_N_70){
+   blasint k;
+   blasint N = ELEMENTS, inc = 1;
+   float x[ELEMENTS];
+   for (k = 0; k < N * inc; k ++) {
+      x[k] = - k - 1000;
+   }
+   x[8 * inc] = -1.0f;
+   float min_abs = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0f, min_abs, SINGLE_EPS);
+}
+
+/**
+ * samin, N = ELEMENTS, stride INCREMENT: ramp x[i] = i + 1000 with a 1.0
+ * planted at strided position 8, so the minimum magnitude must be 1.0
+ */
+CTEST(samin, positive_step_2_N_70){
+   blasint k;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   float x[ELEMENTS * INCREMENT];
+   for (k = 0; k < N * inc; k ++) {
+      x[k] = k + 1000;
+   }
+   x[8 * inc] = 1.0f;
+   float min_abs = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0f, min_abs, SINGLE_EPS);
+}
+
+/**
+ * samin, N = ELEMENTS, stride INCREMENT: ramp x[i] = -i - 1000 with a -1.0
+ * planted at strided position 8, so the minimum magnitude must be 1.0
+ */
+CTEST(samin, negative_step_2_N_70){
+   blasint k;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   float x[ELEMENTS * INCREMENT];
+   for (k = 0; k < N * inc; k ++) {
+      x[k] = - k - 1000;
+   }
+   x[8 * inc] = -1.0f;
+   float min_abs = BLASFUNC(samin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.0f, min_abs, SINGLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_saxpby.c b/utest/test_extensions/test_saxpby.c
new file mode 100644
index 000000000..b4bd5cf0b
--- /dev/null
+++ b/utest/test_extensions/test_saxpby.c
@@ -0,0 +1,794 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+#define INCREMENT 2
+
+struct DATA_SAXPBY {
+    float x_test[DATASIZE * INCREMENT];   // x handed to saxpby (routine under test)
+    float x_verify[DATASIZE * INCREMENT]; // copy of x handed to the saxpy reference
+    float y_test[DATASIZE * INCREMENT];   // y updated by saxpby, then overwritten with the difference
+    float y_verify[DATASIZE * INCREMENT]; // copy of y turned into the sscal + saxpy reference result
+};
+#ifdef BUILD_SINGLE
+static struct DATA_SAXPBY data_saxpby;
+
+/**
+ * Fortran API specific function
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Compare with the following options:
+ * 
+ * param n - number of elements in vectors x and y
+ * param alpha - scalar alpha
+ * param incx - increment for the elements of x
+ * param beta - scalar beta
+ * param incy - increment for the elements of y
+ * return norm of difference
+ */
+static float check_saxpby(blasint n, float alpha, blasint incx, float beta, blasint incy)
+{
+    // sscal accepts only positive increments; snrm2 is given the same ones
+    blasint stride_x = labs(incx);
+    blasint stride_y = labs(incy);
+    // Array slots spanned by n elements at each stride
+    blasint len_x = n * stride_x;
+    blasint len_y = n * stride_y;
+    blasint k;
+
+    // Generate the inputs handed to saxpby
+    srand_generate(data_saxpby.x_test, len_x);
+    srand_generate(data_saxpby.y_test, len_y);
+
+    // Give the reference path its own copies of x and y
+    for (k = 0; k < len_x; k++)
+        data_saxpby.x_verify[k] = data_saxpby.x_test[k];
+
+    for (k = 0; k < len_y; k++)
+        data_saxpby.y_verify[k] = data_saxpby.y_test[k];
+
+    // Reference: scale y by beta, then add alpha*x
+    BLASFUNC(sscal)(&n, &beta, data_saxpby.y_verify, &stride_y);
+    BLASFUNC(saxpy)(&n, &alpha, data_saxpby.x_verify, &incx,
+                        data_saxpby.y_verify, &incy);
+
+    // Routine under test
+    BLASFUNC(saxpby)(&n, &alpha, data_saxpby.x_test, &incx,
+                        &beta, data_saxpby.y_test, &incy);
+
+    // Difference between the saxpby output and the reference
+    for (k = 0; k < len_y; k++)
+        data_saxpby.y_test[k] -= data_saxpby.y_verify[k];
+
+    // Norm of the differences
+    return BLASFUNC(snrm2)(&n, data_saxpby.y_test, &stride_y);
+}
+
+/**
+ * C API specific function
+ * Test cblas_saxpby against a reference built from cblas_sscal + cblas_saxpy.
+ * Both paths start from the same x and y, freshly filled by srand_generate.
+ *
+ * param n - number of elements in vectors x and y
+ * param alpha - scalar alpha
+ * param incx - increment for the elements of x
+ * param beta - scalar beta
+ * param incy - increment for the elements of y
+ * return norm of difference between the saxpby result and the reference
+ */
+static float c_api_check_saxpby(blasint n, float alpha, blasint incx, float beta, blasint incy)
+{
+    blasint i;
+
+    // sscal accepts only positive increments
+    blasint incx_abs = labs(incx);
+    blasint incy_abs = labs(incy);
+
+    // Fill vectors x, y so the check never runs on data left by earlier tests
+    srand_generate(data_saxpby.x_test, n * incx_abs);
+    srand_generate(data_saxpby.y_test, n * incy_abs);
+
+    // Copy vector x for saxpy
+    for (i = 0; i < n * incx_abs; i++)
+        data_saxpby.x_verify[i] = data_saxpby.x_test[i];
+    // Copy vector y for sscal
+    for (i = 0; i < n * incy_abs; i++)
+        data_saxpby.y_verify[i] = data_saxpby.y_test[i];
+
+    // Reference: find beta*y, then add alpha*x to it
+    cblas_sscal(n, beta, data_saxpby.y_verify, incy_abs);
+    cblas_saxpy(n, alpha, data_saxpby.x_verify, incx, data_saxpby.y_verify, incy);
+
+    // Routine under test
+    cblas_saxpby(n, alpha, data_saxpby.x_test, incx, beta, data_saxpby.y_test, incy);
+
+    // Find the differences between output vector calculated by saxpby and saxpy
+    for (i = 0; i < n * incy_abs; i++)
+        data_saxpby.y_test[i] -= data_saxpby.y_verify[i];
+
+    // Find the norm of differences
+    return cblas_snrm2(n, data_saxpby.y_test, incy_abs);
+}
+
+/**
+ * Fortran API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ */
+CTEST(saxpby, inc_x_1_inc_y_1_N_100)
+{
+    float alpha = 1.0f, beta = 1.0f;
+    blasint incx = 1, incy = 1;
+    blasint n = DATASIZE;
+
+    // Norm of (saxpby result - sscal/saxpy reference) must vanish
+    float diff_norm = check_saxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 1
+ */
+CTEST(saxpby, inc_x_2_inc_y_1_N_100)
+{
+    float alpha = 2.0f, beta = 1.0f;
+    blasint incx = 2, incy = 1;
+    blasint n = DATASIZE;
+
+    // Norm of (saxpby result - sscal/saxpy reference) must vanish
+    float diff_norm = check_saxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ */
+CTEST(saxpby, inc_x_1_inc_y_2_N_100)
+{
+    float alpha = 1.0f, beta = 2.0f;
+    blasint incx = 1, incy = 2;
+    blasint n = DATASIZE;
+
+    // Norm of (saxpby result - sscal/saxpy reference) must vanish
+    float diff_norm = check_saxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ */
+CTEST(saxpby, inc_x_2_inc_y_2_N_100)
+{
+    float alpha = 3.0f, beta = 4.0f;
+    blasint incx = 2, incy = 2;
+    blasint n = DATASIZE;
+
+    // Norm of (saxpby result - sscal/saxpy reference) must vanish
+    float diff_norm = check_saxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -1
+ * Stride of vector y is 2
+ */
+CTEST(saxpby, inc_x_neg_1_inc_y_2_N_100)
+{
+    float alpha = 5.0f, beta = 4.0f;
+    blasint incx = -1, incy = 2;
+    blasint n = DATASIZE;
+
+    // Norm of (saxpby result - sscal/saxpy reference) must vanish
+    float diff_norm = check_saxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is -1
+ */
+CTEST(saxpby, inc_x_2_inc_y_neg_1_N_100)
+{
+    float alpha = 1.0f, beta = 6.0f;
+    blasint incx = 2, incy = -1;
+    blasint n = DATASIZE;
+
+    // Norm of (saxpby result - sscal/saxpy reference) must vanish
+    float diff_norm = check_saxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -2
+ * Stride of vector y is -1
+ */
+CTEST(saxpby, inc_x_neg_2_inc_y_neg_1_N_100)
+{
+    float alpha = 7.0f, beta = 3.5f;
+    blasint incx = -2, incy = -1;
+    blasint n = DATASIZE;
+
+    // Norm of (saxpby result - sscal/saxpy reference) must vanish
+    float diff_norm = check_saxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar alpha is zero
+ */
+CTEST(saxpby, inc_x_1_inc_y_1_N_100_alpha_zero)
+{
+    float alpha = 0.0f, beta = 1.0f;
+    blasint incx = 1, incy = 1;
+    blasint n = DATASIZE;
+
+    // Norm of (saxpby result - sscal/saxpy reference) must vanish
+    float diff_norm = check_saxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ * Scalar alpha is zero
+*/
+CTEST(saxpby, inc_x_1_inc_y_2_N_100_alpha_zero)
+{
+    float alpha = 0.0f, beta = 1.0f;
+    blasint incx = 1, incy = 2;
+    blasint n = DATASIZE;
+
+    // Norm of (saxpby result - sscal/saxpy reference) must vanish
+    float diff_norm = check_saxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar beta is zero
+ */
+CTEST(saxpby, inc_x_1_inc_y_1_N_100_beta_zero)
+{
+    float alpha = 1.0f, beta = 0.0f;
+    blasint incx = 1, incy = 1;
+    blasint n = DATASIZE;
+
+    // Norm of (saxpby result - sscal/saxpy reference) must vanish
+    float diff_norm = check_saxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 1
+ * Scalar beta is zero
+*/
+CTEST(saxpby, inc_x_2_inc_y_1_N_100_beta_zero)
+{
+    float alpha = 1.0f, beta = 0.0f;
+    blasint incx = 2, incy = 1;
+    blasint n = DATASIZE;
+
+    // Norm of (saxpby result - sscal/saxpy reference) must vanish
+    float diff_norm = check_saxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ * Scalar beta is zero
+*/
+CTEST(saxpby, inc_x_1_inc_y_2_N_100_beta_zero)
+{
+    float alpha = 1.0f, beta = 0.0f;
+    blasint incx = 1, incy = 2;
+    blasint n = DATASIZE;
+
+    // Norm of (saxpby result - sscal/saxpy reference) must vanish
+    float diff_norm = check_saxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ * Scalar beta is zero
+*/
+CTEST(saxpby, inc_x_2_inc_y_2_N_100_beta_zero)
+{
+    float alpha = 1.0f, beta = 0.0f;
+    blasint incx = 2, incy = 2;
+    blasint n = DATASIZE;
+
+    // Norm of (saxpby result - sscal/saxpy reference) must vanish
+    float diff_norm = check_saxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar alpha is zero
+ * Scalar beta is zero
+ */
+CTEST(saxpby, inc_x_1_inc_y_1_N_100_alpha_beta_zero)
+{
+    float alpha = 0.0f, beta = 0.0f;
+    blasint incx = 1, incy = 1;
+    blasint n = DATASIZE;
+
+    // Norm of (saxpby result - sscal/saxpy reference) must vanish
+    float diff_norm = check_saxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ * Scalar alpha is zero
+ * Scalar beta is zero
+*/
+CTEST(saxpby, inc_x_1_inc_y_2_N_100_alpha_beta_zero)
+{
+    float alpha = 0.0f, beta = 0.0f;
+    blasint incx = 1, incy = 2;
+    blasint n = DATASIZE;
+
+    // Norm of (saxpby result - sscal/saxpy reference) must vanish
+    float diff_norm = check_saxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Check if n - size of vectors x, y is zero
+ */
+CTEST(saxpby, check_n_zero)
+{
+    float alpha = 1.0f, beta = 1.0f;
+    blasint incx = 1, incy = 1;
+    blasint n = 0;
+
+    // With n = 0 the difference norm is expected to be zero
+    float diff_norm = check_saxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ */
+CTEST(saxpby, c_api_inc_x_1_inc_y_1_N_100)
+{
+    float alpha = 1.0f, beta = 1.0f;
+    blasint incx = 1, incy = 1;
+    blasint n = DATASIZE;
+
+    // Norm of (cblas_saxpby result - cblas_sscal/cblas_saxpy reference) must vanish
+    float diff_norm = c_api_check_saxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 1
+ */
+CTEST(saxpby, c_api_inc_x_2_inc_y_1_N_100)
+{
+    float alpha = 2.0f, beta = 1.0f;
+    blasint incx = 2, incy = 1;
+    blasint n = DATASIZE;
+
+    // Norm of (cblas_saxpby result - cblas_sscal/cblas_saxpy reference) must vanish
+    float diff_norm = c_api_check_saxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ */
+CTEST(saxpby, c_api_inc_x_1_inc_y_2_N_100)
+{
+    float alpha = 1.0f, beta = 2.0f;
+    blasint incx = 1, incy = 2;
+    blasint n = DATASIZE;
+
+    // Norm of (cblas_saxpby result - cblas_sscal/cblas_saxpy reference) must vanish
+    float diff_norm = c_api_check_saxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ */
+CTEST(saxpby, c_api_inc_x_2_inc_y_2_N_100)
+{
+    float alpha = 3.0f, beta = 4.0f;
+    blasint incx = 2, incy = 2;
+    blasint n = DATASIZE;
+
+    // Norm of (cblas_saxpby result - cblas_sscal/cblas_saxpy reference) must vanish
+    float diff_norm = c_api_check_saxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -1
+ * Stride of vector y is 2
+ */
+CTEST(saxpby, c_api_inc_x_neg_1_inc_y_2_N_100)
+{
+    float alpha = 5.0f, beta = 4.0f;
+    blasint incx = -1, incy = 2;
+    blasint n = DATASIZE;
+
+    // Norm of (cblas_saxpby result - cblas_sscal/cblas_saxpy reference) must vanish
+    float diff_norm = c_api_check_saxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is -1
+ */
+CTEST(saxpby, c_api_inc_x_2_inc_y_neg_1_N_100)
+{
+    float alpha = 1.0f, beta = 6.0f;
+    blasint incx = 2, incy = -1;
+    blasint n = DATASIZE;
+
+    // Norm of (cblas_saxpby result - cblas_sscal/cblas_saxpy reference) must vanish
+    float diff_norm = c_api_check_saxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -2
+ * Stride of vector y is -1
+ */
+CTEST(saxpby, c_api_inc_x_neg_2_inc_y_neg_1_N_100)
+{
+    float alpha = 7.0f, beta = 3.5f;
+    blasint incx = -2, incy = -1;
+    blasint n = DATASIZE;
+
+    // Norm of (cblas_saxpby result - cblas_sscal/cblas_saxpy reference) must vanish
+    float diff_norm = c_api_check_saxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar alpha is zero
+ */
+CTEST(saxpby, c_api_inc_x_1_inc_y_1_N_100_alpha_zero)
+{
+    float alpha = 0.0f, beta = 1.0f;
+    blasint incx = 1, incy = 1;
+    blasint n = DATASIZE;
+
+    // Norm of (cblas_saxpby result - cblas_sscal/cblas_saxpy reference) must vanish
+    float diff_norm = c_api_check_saxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ * Scalar alpha is zero
+*/
+CTEST(saxpby, c_api_inc_x_1_inc_y_2_N_100_alpha_zero)
+{
+    float alpha = 0.0f, beta = 1.0f;
+    blasint incx = 1, incy = 2;
+    blasint n = DATASIZE;
+
+    // Norm of (cblas_saxpby result - cblas_sscal/cblas_saxpy reference) must vanish
+    float diff_norm = c_api_check_saxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar beta is zero
+ */
+CTEST(saxpby, c_api_inc_x_1_inc_y_1_N_100_beta_zero)
+{
+    float alpha = 1.0f, beta = 0.0f;
+    blasint incx = 1, incy = 1;
+    blasint n = DATASIZE;
+
+    // Norm of (cblas_saxpby result - cblas_sscal/cblas_saxpy reference) must vanish
+    float diff_norm = c_api_check_saxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 1
+ * Scalar beta is zero
+*/
+CTEST(saxpby, c_api_inc_x_2_inc_y_1_N_100_beta_zero)
+{
+    float alpha = 1.0f, beta = 0.0f;
+    blasint incx = 2, incy = 1;
+    blasint n = DATASIZE;
+
+    // Norm of (cblas_saxpby result - cblas_sscal/cblas_saxpy reference) must vanish
+    float diff_norm = c_api_check_saxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ * Scalar beta is zero
+*/
+CTEST(saxpby, c_api_inc_x_1_inc_y_2_N_100_beta_zero)
+{
+    float alpha = 1.0f, beta = 0.0f;
+    blasint incx = 1, incy = 2;
+    blasint n = DATASIZE;
+
+    // Norm of (cblas_saxpby result - cblas_sscal/cblas_saxpy reference) must vanish
+    float diff_norm = c_api_check_saxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ * Scalar beta is zero
+*/
+CTEST(saxpby, c_api_inc_x_2_inc_y_2_N_100_beta_zero)
+{
+    float alpha = 1.0f, beta = 0.0f;
+    blasint incx = 2, incy = 2;
+    blasint n = DATASIZE;
+
+    // Norm of (cblas_saxpby result - cblas_sscal/cblas_saxpy reference) must vanish
+    float diff_norm = c_api_check_saxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar alpha is zero
+ * Scalar beta is zero
+ */
+CTEST(saxpby, c_api_inc_x_1_inc_y_1_N_100_alpha_beta_zero)
+{
+    float alpha = 0.0f, beta = 0.0f;
+    blasint incx = 1, incy = 1;
+    blasint n = DATASIZE;
+
+    // Norm of (cblas_saxpby result - cblas_sscal/cblas_saxpy reference) must vanish
+    float diff_norm = c_api_check_saxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test saxpby by comparing it with sscal and saxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ * Scalar alpha is zero
+ * Scalar beta is zero
+*/
+CTEST(saxpby, c_api_inc_x_1_inc_y_2_N_100_alpha_beta_zero)
+{
+    float alpha = 0.0f, beta = 0.0f;
+    blasint incx = 1, incy = 2;
+    blasint n = DATASIZE;
+
+    // Norm of (cblas_saxpby result - cblas_sscal/cblas_saxpy reference) must vanish
+    float diff_norm = c_api_check_saxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Check if n - size of vectors x, y is zero
+ */
+CTEST(saxpby, c_api_check_n_zero)
+{
+    float alpha = 1.0f, beta = 1.0f;
+    blasint incx = 1, incy = 1;
+    blasint n = 0;
+
+    // With n = 0 the difference norm is expected to be zero
+    float diff_norm = c_api_check_saxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff_norm, SINGLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_scamax.c b/utest/test_extensions/test_scamax.c
new file mode 100644
index 000000000..8c214ddff
--- /dev/null
+++ b/utest/test_extensions/test_scamax.c
@@ -0,0 +1,293 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#define ELEMENTS 70
+#define INCREMENT 2
+
+#ifdef BUILD_COMPLEX
+
+/**
+ * scamax with N = 0 on a populated buffer: expects 0.0
+ */
+CTEST(scamax, bad_args_N_0){
+   blasint k;
+   float x[ELEMENTS * 2];
+   blasint N = 0, inc = 1;
+   for (k = 0; k < ELEMENTS * inc * 2; k ++) {
+      x[k] = 1000 - k;
+   }
+   float max_abs = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, max_abs, SINGLE_EPS);
+}
+
+/**
+ * scamax with inc = 0: expects the magnitude sum of the first pair, |x[0]| + |x[1]|
+ */
+CTEST(scamax, step_zero){
+   blasint k;
+   float x[ELEMENTS * 2];
+   blasint N = ELEMENTS * 2, inc = 0;
+   for (k = 0; k < N; k ++) {
+      x[k] = k - 1000;
+   }
+   float max_abs = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL((fabsf(x[0]) + fabsf(x[1])), max_abs, SINGLE_EPS);
+}
+
+/**
+ * scamax, N = 1, unit stride, element (1, 2): expects |re| + |im| = 3
+ */
+CTEST(scamax, positive_step_1_N_1){
+   float x[] = {1.0f, 2.0f};
+   blasint N = 1, inc = 1;
+   float max_abs = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0f, max_abs, SINGLE_EPS);
+}
+
+/**
+ * scamax, N = 1, unit stride, negative element (-1, -2): expects |re| + |im| = 3
+ */
+CTEST(scamax, negative_step_1_N_1){
+   float x[] = {-1.0f, -2.0f};
+   blasint N = 1, inc = 1;
+   float max_abs = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0f, max_abs, SINGLE_EPS);
+}
+
+/**
+ * scamax, N = 1, stride 2, element (1, 2) followed by a zero filler pair: expects 3
+ */
+CTEST(scamax, positive_step_2_N_1){
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f};
+   blasint N = 1, inc = 2;
+   float max_abs = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0f, max_abs, SINGLE_EPS);
+}
+
+/**
+ * scamax, N = 1, stride 2, negative element (-1, -2) followed by a zero filler pair: expects 3
+ */
+CTEST(scamax, negative_step_2_N_1){
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f};
+   blasint N = 1, inc = 2;
+   float max_abs = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0f, max_abs, SINGLE_EPS);
+}
+
+/**
+ * scamax, N = 2, unit stride, elements (1, 2) and (0, 0): expects 3
+ */
+CTEST(scamax, positive_step_1_N_2){
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f};
+   blasint N = 2, inc = 1;
+   float max_abs = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0f, max_abs, SINGLE_EPS);
+}
+
+/**
+ * scamax, N = 2, unit stride, elements (-1, -2) and (0, 0): expects 3
+ */
+CTEST(scamax, negative_step_1_N_2){
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f};
+   blasint N = 2, inc = 1;
+   float max_abs = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0f, max_abs, SINGLE_EPS);
+}
+
+/**
+ * scamax, N = 2, stride 2, strided elements (1, 2) and (1, 1): expects 3
+ */
+CTEST(scamax, positive_step_2_N_2){
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f};
+   blasint N = 2, inc = 2;
+   float max_abs = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0f, max_abs, SINGLE_EPS);
+}
+
+/**
+ * scamax, N = 2, stride 2, strided elements (-1, -2) and (-1, -1): expects 3
+ */
+CTEST(scamax, negative_step_2_N_2){
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f};
+   blasint N = 2, inc = 2;
+   float max_abs = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0f, max_abs, SINGLE_EPS);
+}
+
+/**
+ * scamax, N = 3, unit stride, elements (1, 2), (0, 0), (2, 1): expects 3
+ */
+CTEST(scamax, positive_step_1_N_3){
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 2.0f, 1.0f};
+   blasint N = 3, inc = 1;
+   float max_abs = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0f, max_abs, SINGLE_EPS);
+}
+
+/**
+ * scamax, N = 3, unit stride, elements (-1, -2), (0, 0), (-3, -1): expects 4
+ */
+CTEST(scamax, negative_step_1_N_3){
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -3.0f, -1.0f};
+   blasint N = 3, inc = 1;
+   float max_abs = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(4.0f, max_abs, SINGLE_EPS);
+}
+
+/**
+ * scamax, N = 3, stride 2, strided elements (1, 2), (1, 1), (3, 1): expects 4
+ */
+CTEST(scamax, positive_step_2_N_3){
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 3.0f, 1.0f, 0.0f, 0.0f};
+   blasint N = 3, inc = 2;
+   float max_abs = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(4.0f, max_abs, SINGLE_EPS);
+}
+
+/**
+ * scamax, N = 3, stride 2, strided elements (-1, -2), (-1, -1), (-3, -1): expects 4
+ */
+CTEST(scamax, negative_step_2_N_3){
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f, -3.0f, -1.0f, 0.0f, 0.0f};
+   blasint N = 3, inc = 2;
+   float max_abs = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(4.0f, max_abs, SINGLE_EPS);
+}
+
+/**
+ * scamax, N = 4, unit stride, elements (1, 2), (0, 0), (2, 1), (-2, -2): expects 4
+ */
+CTEST(scamax, positive_step_1_N_4){
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 2.0f, 1.0f, -2.0f, -2.0f};
+   blasint N = 4, inc = 1;
+   float max_abs = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(4.0f, max_abs, SINGLE_EPS);
+}
+
+/**
+ * scamax, N = 4, unit stride, elements (-1, -2), (0, 0), (-2, -1), (-2, -2): expects 4
+ */
+CTEST(scamax, negative_step_1_N_4){
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -2.0f, -1.0f, -2.0f, -2.0f};
+   blasint N = 4, inc = 1;
+   float max_abs = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(4.0f, max_abs, SINGLE_EPS);
+}
+
+/**
+ * scamax, N = 4, stride 2, strided elements (1, 2), (1, 1), (2, 1), (-2, -2): expects 4
+ */
+CTEST(scamax, positive_step_2_N_4){
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 2.0f, 1.0f, 0.0f, 0.0f, -2.0f, -2.0f, 0.0f, 0.0f};
+   blasint N = 4, inc = 2;
+   float max_abs = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(4.0f, max_abs, SINGLE_EPS);
+}
+
+/**
+ * scamax, N = 4, stride 2, strided elements (-1, -2), (-1, -1), (-2, -1), (-2, -2): expects 4
+ */
+CTEST(scamax, negative_step_2_N_4){
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f, -2.0f, -1.0f, 0.0f, 0.0f, -2.0f, -2.0f, 0.0f, 0.0f};
+   blasint N = 4, inc = 2;
+   float max_abs = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(4.0f, max_abs, SINGLE_EPS);
+}
+
+/**
+ * scamax, N = ELEMENTS, unit stride: ramp x[i] = i with (1000, 1000) planted at element 7; expects 2000
+ */
+CTEST(scamax, positive_step_1_N_70){
+   blasint k;
+   blasint N = ELEMENTS, inc = 1;
+   float x[ELEMENTS * 2];
+   for (k = 0; k < N * inc * 2; k ++) {
+      x[k] = k;
+   }
+   x[7 * inc * 2] = 1000.0f;
+   x[7 * inc * 2 + 1] = 1000.0f;
+   float max_abs = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2000.0f, max_abs, SINGLE_EPS);
+}
+
+/**
+ * scamax, N = ELEMENTS, unit stride: ramp x[i] = -i with (1000, 1000) planted at element 7; expects 2000
+ */
+CTEST(scamax, negative_step_1_N_70){
+   blasint k;
+   blasint N = ELEMENTS, inc = 1;
+   float x[ELEMENTS * 2];
+   for (k = 0; k < N * inc * 2; k ++) {
+      x[k] = -k;
+   }
+   x[7 * inc * 2] = 1000.0f;
+   x[7 * inc * 2 + 1] = 1000.0f;
+   float max_abs = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2000.0f, max_abs, SINGLE_EPS);
+}
+
+/**
+ * scamax, N = ELEMENTS, stride INCREMENT: ramp x[i] = i with (1000, 1000) at strided position 7; expects 2000
+ */
+CTEST(scamax, positive_step_2_N_70){
+   blasint k;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   float x[ELEMENTS * INCREMENT * 2];
+   for (k = 0; k < N * inc * 2; k ++) {
+      x[k] = k;
+   }
+   x[7 * inc * 2] = 1000.0f;
+   x[7 * inc * 2 + 1] = 1000.0f;
+   float max_abs = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2000.0f, max_abs, SINGLE_EPS);
+}
+
+/**
+ * scamax, N = ELEMENTS, stride INCREMENT: ramp x[i] = -i with (1000, 1000) at strided position 7; expects 2000
+ */
+CTEST(scamax, negative_step_2_N_70){
+   blasint k;
+   blasint N = ELEMENTS, inc = INCREMENT;
+   float x[ELEMENTS * INCREMENT * 2];
+   for (k = 0; k < N * inc * 2; k ++) {
+      x[k] = -k;
+   }
+   x[7 * inc * 2] = 1000.0f;
+   x[7 * inc * 2 + 1] = 1000.0f;
+   float max_abs = BLASFUNC(scamax)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2000.0f, max_abs, SINGLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_scamin.c b/utest/test_extensions/test_scamin.c
new file mode 100644
index 000000000..507548f2a
--- /dev/null
+++ b/utest/test_extensions/test_scamin.c
@@ -0,0 +1,310 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#define ELEMENTS 70
+#define INCREMENT 2
+
+#ifdef BUILD_COMPLEX
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, bad_args_N_0){
+   blasint N = 0, inc = 1;
+   float x[ELEMENTS * 2];
+   blasint k, total = ELEMENTS * inc * 2;
+   /* The buffer is fully populated; only the zero length is unusual */
+   for (k = 0; k < total; k++)
+      x[k] = 1000 - k;
+   float result = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, result, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, step_zero){
+   blasint N = ELEMENTS * 2, inc = 0;
+   float x[ELEMENTS * 2];
+   blasint k;
+   for (k = 0; k < N; k++)
+      x[k] = k - 1000;
+   x[8] = 0.0f;
+   /* The expected value is built from the first complex element only */
+   float result = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL((fabsf(x[0]) + fabsf(x[1])), result, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, positive_step_1_N_1){
+   float x[] = {1.0f, 2.0f};
+   blasint N = 1, inc = 1;
+   /* Single element: expects |1| + |2| = 3 */
+   float result = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0f, result, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, negative_step_1_N_1){
+   float x[] = {-1.0f, -2.0f};
+   blasint N = 1, inc = 1;
+   /* Single element: expects |-1| + |-2| = 3 */
+   float result = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0f, result, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, positive_step_2_N_1){
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f};
+   blasint N = 1, inc = 2;
+   /* N == 1: the trailing (0, 0) pair is padding; expects |1| + |2| = 3 */
+   float result = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0f, result, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, negative_step_2_N_1){
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f};
+   blasint N = 1, inc = 2;
+   /* N == 1: the trailing (0, 0) pair is padding; expects |-1| + |-2| = 3 */
+   float result = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(3.0f, result, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, positive_step_1_N_2){
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f};
+   blasint N = 2, inc = 1;
+   /* Second element is (0, 0), so the expected minimum is 0 */
+   float result = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, result, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, negative_step_1_N_2){
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f};
+   blasint N = 2, inc = 1;
+   /* Second element is (0, 0), so the expected minimum is 0 */
+   float result = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, result, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, positive_step_2_N_2){
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f};
+   blasint N = 2, inc = 2;
+   /* Strided elements are (1, 2) and (1, 1); expects |1| + |1| = 2 */
+   float result = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2.0f, result, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, negative_step_2_N_2){
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f};
+   blasint N = 2, inc = 2;
+   /* Strided elements are (-1, -2) and (-1, -1); expects |-1| + |-1| = 2 */
+   float result = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2.0f, result, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, positive_step_1_N_3){
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 2.0f, 1.0f};
+   blasint N = 3, inc = 1;
+   /* Middle element is (0, 0), so the expected minimum is 0 */
+   float result = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, result, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, negative_step_1_N_3){
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -2.0f, -1.0f};
+   blasint N = 3, inc = 1;
+   /* Middle element is (0, 0), so the expected minimum is 0 */
+   float result = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, result, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, positive_step_2_N_3){
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 2.0f, 1.0f, 0.0f, 0.0f};
+   blasint N = 3, inc = 2;
+   /* Strided elements are (1, 2), (1, 1), (2, 1); expects |1| + |1| = 2 */
+   float result = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2.0f, result, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, negative_step_2_N_3){
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f, -2.0f, -1.0f, 0.0f, 0.0f};
+   blasint N = 3, inc = 2;
+   /* Strided elements are (-1, -2), (-1, -1), (-2, -1); expects |-1| + |-1| = 2 */
+   float result = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2.0f, result, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, positive_step_1_N_4){
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 2.0f, 1.0f, -2.0f, -2.0f};
+   blasint N = 4, inc = 1;
+   /* Second element is (0, 0), so the expected minimum is 0 */
+   float result = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, result, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, negative_step_1_N_4){
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -2.0f, -1.0f, -2.0f, -2.0f};
+   blasint N = 4, inc = 1;
+   /* Second element is (0, 0), so the expected minimum is 0 */
+   float result = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, result, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, positive_step_2_N_4){
+   float x[] = {1.0f, 2.0f, 0.0f, 0.0f, 1.0f, 1.0f, 0.0f, 0.0f, 2.0f, 1.0f, 0.0f, 0.0f, -2.0f, -2.0f, 0.0f, 0.0f};
+   blasint N = 4, inc = 2;
+   /* Strided elements are (1, 2), (1, 1), (2, 1), (-2, -2); expects |1| + |1| = 2 */
+   float result = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2.0f, result, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, negative_step_2_N_4){
+   float x[] = {-1.0f, -2.0f, 0.0f, 0.0f, -1.0f, -1.0f, 0.0f, 0.0f, -2.0f, -1.0f, 0.0f, 0.0f, -2.0f, -2.0f, 0.0f, 0.0f};
+   blasint N = 4, inc = 2;
+   /* Strided elements are (-1, -2), (-1, -1), (-2, -1), (-2, -2); expects |-1| + |-1| = 2 */
+   float result = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2.0f, result, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, positive_step_1_N_70){
+   blasint N = ELEMENTS, inc = 1;
+   float x[ELEMENTS * 2];
+   blasint k, total = N * inc * 2;
+   for (k = 0; k < total; k++)
+      x[k] = k + 1000;
+   /* Plant an all-zero element at element index 7 */
+   x[7 * inc * 2] = 0.0f;
+   x[7 * inc * 2 + 1] = 0.0f;
+   float result = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, result, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, negative_step_1_N_70){
+   blasint N = ELEMENTS, inc = 1;
+   float x[ELEMENTS * 2];
+   blasint k, total = N * inc * 2;
+   for (k = 0; k < total; k++)
+      x[k] = -k - 1000;
+   /* Plant an all-zero element at element index 7 */
+   x[7 * inc * 2] = 0.0f;
+   x[7 * inc * 2 + 1] = 0.0f;
+   float result = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, result, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, positive_step_2_N_70){
+   blasint N = ELEMENTS, inc = INCREMENT;
+   float x[ELEMENTS * INCREMENT * 2];
+   blasint k, total = N * inc * 2;
+   for (k = 0; k < total; k++)
+      x[k] = k + 1000;
+   /* Plant an all-zero element at strided element index 7 */
+   x[7 * inc * 2] = 0.0f;
+   x[7 * inc * 2 + 1] = 0.0f;
+   float result = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, result, SINGLE_EPS);
+}
+
+/**
+ * Test scamin by comparing it against pre-calculated values
+ */
+CTEST(scamin, negative_step_2_N_70){
+   blasint N = ELEMENTS, inc = INCREMENT;
+   float x[ELEMENTS * INCREMENT * 2];
+   blasint k, total = N * inc * 2;
+   for (k = 0; k < total; k++)
+      x[k] = -k - 1000;
+   /* Plant an all-zero element at strided element index 7 */
+   x[7 * inc * 2] = 0.0f;
+   x[7 * inc * 2 + 1] = 0.0f;
+   float result = BLASFUNC(scamin)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, result, SINGLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_scsum.c b/utest/test_extensions/test_scsum.c
new file mode 100644
index 000000000..492e1a4ca
--- /dev/null
+++ b/utest/test_extensions/test_scsum.c
@@ -0,0 +1,403 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#define ELEMENTS 50
+#define INCREMENT 2
+
+#ifdef BUILD_COMPLEX
+
+/**
+ * Fortran API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, bad_args_N_0){
+   blasint N = 0, inc = 1;
+   float x[ELEMENTS * 2];
+   blasint k, total = ELEMENTS * inc * 2;
+   /* The buffer is fully populated; only the zero length is unusual */
+   for (k = 0; k < total; k++)
+      x[k] = 1000 - k;
+   float result = BLASFUNC(scsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, result, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, step_zero){
+   blasint i;
+   blasint N = ELEMENTS, inc = 0;
+   float x[ELEMENTS * 2];
+   /* Fill all N complex values; a bound scaled by inc == 0 would skip the loop */
+   for (i = 0; i < N * 2; i ++)
+      x[i] = i + 1000;
+   x[8] = 0.0f;
+   float sum = BLASFUNC(scsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, step_1_N_1){
+   float x[] = {1.1f, -1.0f};
+   blasint N = 1, inc = 1;
+   /* Expects 1.1 - 1.0 = 0.1 */
+   float result = BLASFUNC(scsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.1f, result, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, step_2_N_1){
+   float x[] = {1.1f, 0.0f, 2.3f, -1.0f};
+   blasint N = 1, inc = 2;
+   /* N == 1: only the first pair counts; expects 1.1 + 0.0 = 1.1 */
+   float result = BLASFUNC(scsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.1f, result, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, step_1_N_2){
+   float x[] = {1.1f, -1.0f, 2.3f, -1.0f};
+   blasint N = 2, inc = 1;
+   /* Expects 1.1 - 1.0 + 2.3 - 1.0 = 1.4 */
+   float result = BLASFUNC(scsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.4f, result, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, step_2_N_2){
+   float x[] = {1.1f, -1.5f, 1.1f, -1.0f, 1.0f, 1.0f, 1.1f, -1.0f};
+   blasint N = 2, inc = 2;
+   /* Pairs starting at x[0] and x[4]: expects 1.1 - 1.5 + 1.0 + 1.0 = 1.6 */
+   float result = BLASFUNC(scsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(1.6f, result, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, step_1_N_3){
+   float x[] = {1.1f, 1.0f, 2.2f, 1.1f, -1.0f, 0.0f};
+   blasint N = 3, inc = 1;
+   /* Expects 1.1 + 1.0 + 2.2 + 1.1 - 1.0 + 0.0 = 4.4 */
+   float result = BLASFUNC(scsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(4.4f, result, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, step_2_N_3){
+   float x[] = {1.1f, 0.0f, -1.0f, 0.0f, -1.0f, -3.0f, -1.0f, 0.0f, 2.2f, 3.0f, -1.0f, 0.0f};
+   blasint N = 3, inc = 2;
+   /* Pairs starting at x[0], x[4], x[8]: expects 1.1 - 4.0 + 5.2 = 2.3 */
+   float result = BLASFUNC(scsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(2.3f, result, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, step_1_N_4){
+   float x[] = {1.1f, 1.0f, -2.2f, 3.3f, 1.1f, 1.0f, -2.2f, 3.3f};
+   blasint N = 4, inc = 1;
+   /* Two copies of 1.1 + 1.0 - 2.2 + 3.3 = 3.2; expects 6.4 */
+   float result = BLASFUNC(scsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(6.4f, result, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, step_2_N_4){
+   float x[] = {1.1f, 0.0f, 1.1f, 1.0f, 1.0f, 2.0f, 1.1f, 1.0f, 2.2f, 2.7f, 1.1f, 1.0f, -3.3f, -5.9f};
+   blasint N = 4, inc = 2;
+   /* Pairs starting at x[0], x[4], x[8], x[12]: expects 1.1 + 3.0 + 4.9 - 9.2 = -0.2 */
+   float result = BLASFUNC(scsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(-0.2f, result, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, step_1_N_5){
+   float x[] = {0.0f, 1.0f, 2.2f, 3.3f, 0.0f, 0.0f, 1.0f, 2.2f, 3.3f, 0.0f};
+   blasint N = 5, inc = 1;
+   /* Two copies of 0.0 + 1.0 + 2.2 + 3.3 + 0.0 = 6.5; expects 13.0 */
+   float result = BLASFUNC(scsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(13.0f, result, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, step_2_N_5){
+   float x[] = {0.0f, 3.0f, 1.0f, 2.2f, 1.0f, -2.2f, 1.0f, 2.2f, 2.2f, -1.7f, 1.0f, 2.2f, 3.3f, 14.5f, 1.0f, 2.2f, 0.0f, -9.0f, 1.0f, 2.2f};
+   blasint N = 5, inc = 2;
+   /* Pairs starting at x[0], x[4], ..., x[16]: expects 3.0 - 1.2 + 0.5 + 17.8 - 9.0 = 11.1 */
+   float result = BLASFUNC(scsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(11.1f, result, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, step_1_N_50){
+   blasint N = ELEMENTS, inc = 1;
+   float x[ELEMENTS * 2];
+   blasint k, total = N * inc * 2;
+   /* Even slots hold 1, odd slots -1, so every (re, im) pair cancels */
+   for (k = 0; k < total; k++)
+      x[k] = (k % 2 == 0) ? 1.0f : -1.0f;
+   float result = BLASFUNC(scsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, result, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, step_2_N_50){
+   blasint N = ELEMENTS, inc = INCREMENT;
+   float x[ELEMENTS * INCREMENT * 2];
+   blasint k, total = N * inc * 2;
+   /* Even slots hold 1, odd slots -1, so every (re, im) pair cancels */
+   for (k = 0; k < total; k++)
+      x[k] = (k % 2 == 0) ? 1.0f : -1.0f;
+   float result = BLASFUNC(scsum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, result, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, c_api_bad_args_N_0){
+   blasint N = 0, inc = 1;
+   float x[ELEMENTS * 2];
+   blasint k, total = ELEMENTS * inc * 2;
+   /* The buffer is fully populated; only the zero length is unusual */
+   for (k = 0; k < total; k++)
+      x[k] = 1000 - k;
+   float result = cblas_scsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, result, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, c_api_step_zero){
+   blasint i;
+   blasint N = ELEMENTS, inc = 0;
+   float x[ELEMENTS * 2];
+   /* Fill all N complex values; a bound scaled by inc == 0 would skip the loop */
+   for (i = 0; i < N * 2; i ++)
+      x[i] = i + 1000;
+   x[8] = 0.0f;
+   float sum = cblas_scsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, c_api_step_1_N_1){
+   float x[] = {1.1f, -1.0f};
+   blasint N = 1, inc = 1;
+   /* Expects 1.1 - 1.0 = 0.1 */
+   float result = cblas_scsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(0.1f, result, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, c_api_step_2_N_1){
+   float x[] = {1.1f, 0.0f, 2.3f, -1.0f};
+   blasint N = 1, inc = 2;
+   /* N == 1: only the first pair counts; expects 1.1 + 0.0 = 1.1 */
+   float result = cblas_scsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(1.1f, result, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, c_api_step_1_N_2){
+   float x[] = {1.1f, -1.0f, 2.3f, -1.0f};
+   blasint N = 2, inc = 1;
+   /* Expects 1.1 - 1.0 + 2.3 - 1.0 = 1.4 */
+   float result = cblas_scsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(1.4f, result, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, c_api_step_2_N_2){
+   float x[] = {1.1f, -1.5f, 1.1f, -1.0f, 1.0f, 1.0f, 1.1f, -1.0f};
+   blasint N = 2, inc = 2;
+   /* Pairs starting at x[0] and x[4]: expects 1.1 - 1.5 + 1.0 + 1.0 = 1.6 */
+   float result = cblas_scsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(1.6f, result, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, c_api_step_1_N_3){
+   float x[] = {1.1f, 1.0f, 2.2f, 1.1f, -1.0f, 0.0f};
+   blasint N = 3, inc = 1;
+   /* Expects 1.1 + 1.0 + 2.2 + 1.1 - 1.0 + 0.0 = 4.4 */
+   float result = cblas_scsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(4.4f, result, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, c_api_step_2_N_3){
+   float x[] = {1.1f, 0.0f, -1.0f, 0.0f, -1.0f, -3.0f, -1.0f, 0.0f, 2.2f, 3.0f, -1.0f, 0.0f};
+   blasint N = 3, inc = 2;
+   /* Pairs starting at x[0], x[4], x[8]: expects 1.1 - 4.0 + 5.2 = 2.3 */
+   float result = cblas_scsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(2.3f, result, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, c_api_step_1_N_4){
+   float x[] = {1.1f, 1.0f, -2.2f, 3.3f, 1.1f, 1.0f, -2.2f, 3.3f};
+   blasint N = 4, inc = 1;
+   /* Two copies of 1.1 + 1.0 - 2.2 + 3.3 = 3.2; expects 6.4 */
+   float result = cblas_scsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(6.4f, result, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, c_api_step_2_N_4){
+   float x[] = {1.1f, 0.0f, 1.1f, 1.0f, 1.0f, 2.0f, 1.1f, 1.0f, 2.2f, 2.7f, 1.1f, 1.0f, -3.3f, -5.9f};
+   blasint N = 4, inc = 2;
+   /* Pairs starting at x[0], x[4], x[8], x[12]: expects 1.1 + 3.0 + 4.9 - 9.2 = -0.2 */
+   float result = cblas_scsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(-0.2f, result, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, c_api_step_1_N_5){
+   float x[] = {0.0f, 1.0f, 2.2f, 3.3f, 0.0f, 0.0f, 1.0f, 2.2f, 3.3f, 0.0f};
+   blasint N = 5, inc = 1;
+   /* Two copies of 0.0 + 1.0 + 2.2 + 3.3 + 0.0 = 6.5; expects 13.0 */
+   float result = cblas_scsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(13.0f, result, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, c_api_step_2_N_5){
+   float x[] = {0.0f, 3.0f, 1.0f, 2.2f, 1.0f, -2.2f, 1.0f, 2.2f, 2.2f, -1.7f, 1.0f, 2.2f, 3.3f, 14.5f, 1.0f, 2.2f, 0.0f, -9.0f, 1.0f, 2.2f};
+   blasint N = 5, inc = 2;
+   /* Pairs starting at x[0], x[4], ..., x[16]: expects 3.0 - 1.2 + 0.5 + 17.8 - 9.0 = 11.1 */
+   float result = cblas_scsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(11.1f, result, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, c_api_step_1_N_50){
+   blasint N = ELEMENTS, inc = 1;
+   float x[ELEMENTS * 2];
+   blasint k, total = N * inc * 2;
+   /* Even slots hold 1, odd slots -1, so every (re, im) pair cancels */
+   for (k = 0; k < total; k++)
+      x[k] = (k % 2 == 0) ? 1.0f : -1.0f;
+   float result = cblas_scsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, result, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test scsum by comparing it against pre-calculated values
+ */
+CTEST(scsum, c_api_step_2_N_50){
+   blasint N = ELEMENTS, inc = INCREMENT;
+   float x[ELEMENTS * INCREMENT * 2];
+   blasint k, total = N * inc * 2;
+   /* Even slots hold 1, odd slots -1, so every (re, im) pair cancels */
+   for (k = 0; k < total; k++)
+      x[k] = (k % 2 == 0) ? 1.0f : -1.0f;
+   float result = cblas_scsum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, result, SINGLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_sgeadd.c b/utest/test_extensions/test_sgeadd.c
new file mode 100644
index 000000000..171132b9d
--- /dev/null
+++ b/utest/test_extensions/test_sgeadd.c
@@ -0,0 +1,880 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define N 100
+#define M 100
+
+struct DATA_SGEADD
+{
+    float a_test[M * N];   /* input matrix A */
+    float c_test[M * N];   /* matrix C handed to the sgeadd under test */
+    float c_verify[M * N]; /* copy of C updated by the reference implementation */
+};
+
+#ifdef BUILD_SINGLE
+static struct DATA_SGEADD data_sgeadd;
+
+/**
+ * sgeadd reference implementation
+ *
+ * param m - number of rows of A and C
+ * param n - number of columns of A and C
+ * param alpha - scaling factor for matrix A
+ * param aptr - refer to matrix A
+ * param lda - leading dimension of A
+ * param beta - scaling factor for matrix C
+ * param cptr - refer to matrix C
+ * param ldc - leading dimension of C
+ */
+static void sgeadd_trusted(blasint m, blasint n, float alpha, float *aptr,
+                           blasint lda, float beta, float *cptr, blasint ldc)
+{
+    blasint vec;
+    /* One saxpby per stored vector: c_vec := alpha * a_vec + beta * c_vec */
+    for (vec = 0; vec < n; vec++)
+    {
+        float *a_vec = aptr + vec * lda;
+        float *c_vec = cptr + vec * ldc;
+        cblas_saxpby(m, alpha, a_vec, 1, beta, c_vec, 1);
+    }
+}
+
+/**
+ * Test sgeadd by comparing it against reference
+ * Compare with the following options:
+ *
+ * param api - specifies Fortran or C API
+ * param order - specifies whether A and C stored in
+ * row-major order or column-major order
+ * param m - number of rows of A and C
+ * param n - number of columns of A and C
+ * param alpha - scaling factor for matrix A
+ * param lda - leading dimension of A
+ * param beta - scaling factor for matrix C
+ * param ldc - leading dimension of C
+ * return norm of differences
+ */
+static float check_sgeadd(char api, OPENBLAS_CONST enum CBLAS_ORDER order,
+                          blasint m, blasint n, float alpha, blasint lda,
+                          float beta, blasint ldc)
+{
+    /* vec_len: length of one contiguous vector; vec_cnt: how many of them.
+     * Column-major stores n columns of m entries, row-major m rows of n. */
+    blasint vec_len = (order == CblasRowMajor) ? n : m;
+    blasint vec_cnt = (order == CblasRowMajor) ? m : n;
+    blasint a_size = lda * vec_cnt;
+    blasint c_size = ldc * vec_cnt;
+    blasint k;
+
+    /* Random input for A and C */
+    srand_generate(data_sgeadd.a_test, a_size);
+    srand_generate(data_sgeadd.c_test, c_size);
+
+    /* The reference works on its own copy of C */
+    for (k = 0; k < c_size; k++)
+        data_sgeadd.c_verify[k] = data_sgeadd.c_test[k];
+
+    sgeadd_trusted(vec_len, vec_cnt, alpha, data_sgeadd.a_test, lda,
+                   beta, data_sgeadd.c_verify, ldc);
+
+    /* Run the routine under test through the requested interface */
+    if (api != 'F')
+        cblas_sgeadd(order, m, n, alpha, data_sgeadd.a_test, lda,
+                     beta, data_sgeadd.c_test, ldc);
+    else
+        BLASFUNC(sgeadd)
+        (&m, &n, &alpha, data_sgeadd.a_test, &lda,
+         &beta, data_sgeadd.c_test, &ldc);
+
+    /* Difference between the sgeadd output and the reference result */
+    return smatrix_difference(data_sgeadd.c_test, data_sgeadd.c_verify, vec_len, vec_cnt, ldc);
+}
+
+/**
+ * Check if error function was called with expected function name
+ * and param info
+ *
+ * param api - specifies Fortran or C API
+ * param order - specifies whether A and C stored in
+ * row-major order or column-major order
+ * param m - number of rows of A and C
+ * param n - number of columns of A and C
+ * param lda - leading dimension of A
+ * param ldc - leading dimension of C
+ * param expected_info - expected invalid parameter number in sgeadd
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char api, OPENBLAS_CONST enum CBLAS_ORDER order,
+                         blasint m, blasint n, blasint lda,
+                         blasint ldc, int expected_info)
+{
+    /* Fixed scalars; the callers vary only the dimension arguments */
+    float alpha = 1.0f, beta = 1.0f;
+
+    /* Register the routine name and info value the error handler should see */
+    set_xerbla("SGEADD ", expected_info);
+    if (api != 'F')
+        cblas_sgeadd(order, m, n, alpha, data_sgeadd.a_test, lda,
+                     beta, data_sgeadd.c_test, ldc);
+    else
+        BLASFUNC(sgeadd)
+        (&m, &n, &alpha, data_sgeadd.a_test, &lda,
+         &beta, data_sgeadd.c_test, &ldc);
+
+    return check_error();
+}
+
+/**
+ * Fortran API specific test
+ * Test sgeadd by comparing it against reference
+ * with the following options:
+ *
+ * For A number of rows is 100, number of columns is 100
+ * For C number of rows is 100, number of columns is 100
+ */
+CTEST(sgeadd, matrix_n_100_m_100)
+{
+    /* Full-size problem with tightly packed columns (lda == ldc == m) */
+    blasint m = M, n = N;
+    blasint lda = m, ldc = m;
+
+    /* Both scalars non-zero */
+    float alpha = 3.0f, beta = 3.0f;
+
+    /* Column-major layout for the reference computation */
+    CBLAS_ORDER order = CblasColMajor;
+
+    float diff = check_sgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    /* A zero difference means sgeadd agrees with the reference */
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test sgeadd by comparing it against reference
+ * with the following options:
+ *
+ * For A number of rows is 100, number of columns is 100
+ * For C number of rows is 100, number of columns is 100
+ * Scalar alpha is zero (operation is C:=beta*C)
+ */
+CTEST(sgeadd, matrix_n_100_m_100_alpha_zero)
+{
+    /* Full-size problem with tightly packed columns (lda == ldc == m) */
+    blasint m = M, n = N;
+    blasint lda = m, ldc = m;
+
+    /* alpha == 0 removes the A term: only beta * C remains */
+    float alpha = 0.0f, beta = 2.5f;
+
+    /* Column-major layout for the reference computation */
+    CBLAS_ORDER order = CblasColMajor;
+
+    float diff = check_sgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    /* A zero difference means sgeadd agrees with the reference */
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test sgeadd by comparing it against reference
+ * with the following options:
+ *
+ * For A number of rows is 100, number of columns is 100
+ * For C number of rows is 100, number of columns is 100
+ * Scalar beta is zero (operation is C:=alpha*A)
+ */
+CTEST(sgeadd, matrix_n_100_m_100_beta_zero)
+{
+    /* Full-size problem with tightly packed columns (lda == ldc == m) */
+    blasint m = M, n = N;
+    blasint lda = m, ldc = m;
+
+    /* beta == 0 removes the C term: only alpha * A remains */
+    float alpha = 3.0f, beta = 0.0f;
+
+    /* Column-major layout for the reference computation */
+    CBLAS_ORDER order = CblasColMajor;
+
+    float diff = check_sgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    /* A zero difference means sgeadd agrees with the reference */
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test sgeadd by comparing it against reference
+ * with the following options:
+ *
+ * For A number of rows is 100, number of columns is 100
+ * For C number of rows is 100, number of columns is 100
+ * Scalars alpha and beta are zero (operation is C := 0)
+ */
+CTEST(sgeadd, matrix_n_100_m_100_alpha_beta_zero)
+{
+    /* Full-size problem with tightly packed columns (lda == ldc == m) */
+    blasint m = M, n = N;
+    blasint lda = m, ldc = m;
+
+    /* Both scalars zero: neither A nor C contributes */
+    float alpha = 0.0f, beta = 0.0f;
+
+    /* Column-major layout for the reference computation */
+    CBLAS_ORDER order = CblasColMajor;
+
+    float diff = check_sgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    /* A zero difference means sgeadd agrees with the reference */
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test sgeadd by comparing it against reference
+ * with the following options:
+ *
+ * For A number of rows is 50, number of columns is 100
+ * For C number of rows is 50, number of columns is 100
+ */
+CTEST(sgeadd, matrix_n_100_m_50)
+{
+    /* Rectangular problem: half as many rows as columns */
+    blasint m = M / 2, n = N;
+    blasint lda = m, ldc = m;
+
+    /* Unit scalars */
+    float alpha = 1.0f, beta = 1.0f;
+
+    /* Column-major layout for the reference computation */
+    CBLAS_ORDER order = CblasColMajor;
+
+    float diff = check_sgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    /* A zero difference means sgeadd agrees with the reference */
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param n -
+ * number of columns of A and C
+ * Must be at least zero.
+ */
+CTEST(sgeadd, xerbla_n_invalid)
+{
+    /* Everything is valid except n */
+    blasint m = 1, n = INVALID;
+    blasint lda = m, ldc = m;
+
+    CBLAS_ORDER order = CblasColMajor;
+
+    /* n is the second argument of the Fortran sgeadd call */
+    int expected_info = 2;
+
+    int ok = check_badargs('F', order, m, n, lda, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param m -
+ * number of rows of A and C
+ * Must be at least zero.
+ */
+CTEST(sgeadd, xerbla_m_invalid)
+{
+    /* Everything is valid except m */
+    blasint m = INVALID, n = 1;
+    blasint lda = 1, ldc = 1;
+
+    CBLAS_ORDER order = CblasColMajor;
+
+    /* m is the first argument of the Fortran sgeadd call */
+    int expected_info = 1;
+
+    int ok = check_badargs('F', order, m, n, lda, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param lda -
+ * specifies the leading dimension of A. Must be at least MAX(1, m).
+ */
+CTEST(sgeadd, xerbla_lda_invalid)
+{
+    /* Everything is valid except lda */
+    blasint m = 1, n = 1;
+    blasint lda = INVALID, ldc = 1;
+
+    CBLAS_ORDER order = CblasColMajor;
+
+    /* lda is the fifth argument of the Fortran sgeadd call */
+    int expected_info = 5;
+
+    int ok = check_badargs('F', order, m, n, lda, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param ldc -
+ * specifies the leading dimension of C. Must be at least MAX(1, m).
+ */
+CTEST(sgeadd, xerbla_ldc_invalid)
+{
+    /* Everything is valid except ldc */
+    blasint m = 1, n = 1;
+    blasint lda = 1, ldc = INVALID;
+
+    CBLAS_ORDER order = CblasColMajor;
+
+    /* ldc is the eighth argument of the Fortran sgeadd call */
+    int expected_info = 8;
+
+    int ok = check_badargs('F', order, m, n, lda, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Fortran API specific test
+ * Check if n - number of columns of A, C equal zero.
+ */
+CTEST(sgeadd, n_zero)
+{
+    /* No columns; m and the leading dimensions stay valid */
+    blasint m = 1, n = 0;
+    blasint lda = 1, ldc = 1;
+
+    /* Unit scalars */
+    float alpha = 1.0f, beta = 1.0f;
+
+    /* Column-major layout for the reference computation */
+    CBLAS_ORDER order = CblasColMajor;
+
+    float diff = check_sgeadd('F', order, m, n, alpha, lda, beta, ldc);
+
+    /* A zero difference means sgeadd agrees with the reference */
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Check if m - number of rows of A and C equal zero.
+ */
+CTEST(sgeadd, m_zero)
+{
+    const CBLAS_ORDER layout = CblasColMajor;
+
+    /* Zero rows and a single column. */
+    const blasint rows = 0, cols = 1;
+    const blasint lead_a = 1, lead_c = 1;
+
+    /* alpha and beta are both one. */
+    const float scale_a = 1.0f;
+    const float scale_c = 1.0f;
+
+    /* The value returned by check_sgeadd must be zero within SINGLE_EPS. */
+    const float diff = check_sgeadd('F', layout, rows, cols,
+                                    scale_a, lead_a, scale_c, lead_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgeadd by comparing it against reference
+ * with the following options:
+ *
+ * c api option order is column-major order
+ * For A number of rows is 100, number of columns is 100
+ * For C number of rows is 100, number of columns is 100
+ */
+CTEST(sgeadd, c_api_matrix_n_100_m_100)
+{
+    const CBLAS_ORDER layout = CblasColMajor;
+
+    /* M rows by N columns. */
+    const blasint rows = M, cols = N;
+
+    /* Both leading dimensions are set to the row count. */
+    const blasint lead_a = rows, lead_c = rows;
+
+    /* alpha is applied to A, beta to C. */
+    const float scale_a = 2.0f, scale_c = 3.0f;
+
+    const float diff = check_sgeadd('C', layout, rows, cols, scale_a,
+                                    lead_a, scale_c, lead_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgeadd by comparing it against reference
+ * with the following options:
+ *
+ * c api option order is row-major order
+ * For A number of rows is 100, number of columns is 100
+ * For C number of rows is 100, number of columns is 100
+ */
+CTEST(sgeadd, c_api_matrix_n_100_m_100_row_major)
+{
+    const CBLAS_ORDER layout = CblasRowMajor;
+
+    /* M rows by N columns. */
+    const blasint rows = M, cols = N;
+
+    /* Both leading dimensions are set to the row count. */
+    const blasint lead_a = rows, lead_c = rows;
+
+    /* alpha is applied to A, beta to C. */
+    const float scale_a = 4.0f, scale_c = 2.0f;
+
+    const float diff = check_sgeadd('C', layout, rows, cols, scale_a,
+                                    lead_a, scale_c, lead_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgeadd by comparing it against reference
+ * with the following options:
+ *
+ * c api option order is row-major order
+ * For A number of rows is 100, number of columns is 50
+ * For C number of rows is 100, number of columns is 50
+ */
+CTEST(sgeadd, c_api_matrix_n_50_m_100_row_major)
+{
+    const CBLAS_ORDER layout = CblasRowMajor;
+
+    /* M rows by N / 2 columns. */
+    const blasint rows = M, cols = N / 2;
+
+    /* Row-major case: both leading dimensions equal the column count. */
+    const blasint lead_a = cols, lead_c = cols;
+
+    /* alpha is applied to A, beta to C. */
+    const float scale_a = 3.0f, scale_c = 1.0f;
+
+    const float diff = check_sgeadd('C', layout, rows, cols, scale_a,
+                                    lead_a, scale_c, lead_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgeadd by comparing it against reference
+ * with the following options:
+ *
+ * c api option order is column-major order
+ * For A number of rows is 100, number of columns is 100
+ * For C number of rows is 100, number of columns is 100
+ * Scalar alpha is zero (operation is C:=beta*C)
+ */
+CTEST(sgeadd, c_api_matrix_n_100_m_100_alpha_zero)
+{
+    const CBLAS_ORDER layout = CblasColMajor;
+
+    /* M rows by N columns. */
+    const blasint rows = M, cols = N;
+
+    /* Both leading dimensions are set to the row count. */
+    const blasint lead_a = rows, lead_c = rows;
+
+    /* alpha is zero, so the operation reduces to C := beta * C. */
+    const float scale_a = 0.0f, scale_c = 1.0f;
+
+    const float diff = check_sgeadd('C', layout, rows, cols, scale_a,
+                                    lead_a, scale_c, lead_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgeadd by comparing it against reference
+ * with the following options:
+ *
+ * c api option order is column-major order
+ * For A number of rows is 100, number of columns is 100
+ * For C number of rows is 100, number of columns is 100
+ * Scalar beta is zero (operation is C:=alpha*A)
+ */
+CTEST(sgeadd, c_api_matrix_n_100_m_100_beta_zero)
+{
+    const CBLAS_ORDER layout = CblasColMajor;
+
+    /* M rows by N columns. */
+    const blasint rows = M, cols = N;
+
+    /* Both leading dimensions are set to the row count. */
+    const blasint lead_a = rows, lead_c = rows;
+
+    /* beta is zero, so the operation reduces to C := alpha * A. */
+    const float scale_a = 3.0f, scale_c = 0.0f;
+
+    const float diff = check_sgeadd('C', layout, rows, cols, scale_a,
+                                    lead_a, scale_c, lead_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgeadd by comparing it against reference
+ * with the following options:
+ *
+ * c api option order is column-major order
+ * For A number of rows is 100, number of columns is 100
+ * For C number of rows is 100, number of columns is 100
+ * Scalars alpha, beta is zero (operation is C:= 0)
+ */
+CTEST(sgeadd, c_api_matrix_n_100_m_100_alpha_beta_zero)
+{
+    const CBLAS_ORDER layout = CblasColMajor;
+
+    /* M rows by N columns. */
+    const blasint rows = M, cols = N;
+
+    /* Both leading dimensions are set to the row count. */
+    const blasint lead_a = rows, lead_c = rows;
+
+    /* alpha and beta are both zero, so the operation reduces to C := 0. */
+    const float scale_a = 0.0f, scale_c = 0.0f;
+
+    const float diff = check_sgeadd('C', layout, rows, cols, scale_a,
+                                    lead_a, scale_c, lead_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgeadd by comparing it against reference
+ * with the following options:
+ *
+ * For A number of rows is 50, number of columns is 100
+ * For C number of rows is 50, number of columns is 100
+ */
+CTEST(sgeadd, c_api_matrix_n_100_m_50)
+{
+    const CBLAS_ORDER layout = CblasColMajor;
+
+    /* M / 2 rows by N columns. */
+    const blasint rows = M / 2, cols = N;
+
+    /* Both leading dimensions are set to the row count. */
+    const blasint lead_a = rows, lead_c = rows;
+
+    /* alpha is applied to A, beta to C. */
+    const float scale_a = 3.0f, scale_c = 4.0f;
+
+    const float diff = check_sgeadd('C', layout, rows, cols, scale_a,
+                                    lead_a, scale_c, lead_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param order -
+ * specifies whether A and C stored in
+ * row-major order or column-major order
+ */
+CTEST(sgeadd, c_api_xerbla_invalid_order)
+{
+    const CBLAS_ORDER layout = INVALID;
+
+    /* A valid 1 x 1 problem size; only the storage order is invalid. */
+    const blasint rows = 1, cols = 1;
+
+    /* Leading dimensions that are acceptable on their own. */
+    const blasint lead_a = 1, lead_c = 1;
+
+    /* For a bad order the expected info value is 0. */
+    const int info_wanted = 0;
+    const int ok = check_badargs('C', layout, rows, cols, lead_a, lead_c, info_wanted);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param n -
+ * number of columns of A and C.
+ * Must be at least zero.
+ *
+ * c api option order is column-major order
+ */
+CTEST(sgeadd, c_api_xerbla_n_invalid)
+{
+    const CBLAS_ORDER layout = CblasColMajor;
+
+    /* Only the column count is invalid; the row count is fine. */
+    const blasint rows = 1, cols = INVALID;
+
+    /* Leading dimensions that are acceptable on their own. */
+    const blasint lead_a = 1, lead_c = 1;
+
+    /* The error handler should be told about argument 2 (n). */
+    const int info_wanted = 2;
+    const int ok = check_badargs('C', layout, rows, cols, lead_a, lead_c, info_wanted);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param n -
+ * number of columns of A and C.
+ * Must be at least zero.
+ *
+ * c api option order is row-major order
+ */
+CTEST(sgeadd, c_api_xerbla_n_invalid_row_major)
+{
+    const CBLAS_ORDER layout = CblasRowMajor;
+
+    /* Only the column count is invalid; the row count is fine. */
+    const blasint rows = 1, cols = INVALID;
+
+    /* Leading dimensions that are acceptable on their own. */
+    const blasint lead_a = 1, lead_c = 1;
+
+    /* The error handler should be told about argument 2 (n). */
+    const int info_wanted = 2;
+    const int ok = check_badargs('C', layout, rows, cols, lead_a, lead_c, info_wanted);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param m -
+ * number of rows of A and C
+ * Must be at least zero.
+ *
+ * c api option order is column-major order
+ */
+CTEST(sgeadd, c_api_xerbla_m_invalid)
+{
+    const CBLAS_ORDER layout = CblasColMajor;
+
+    /* Only the row count is invalid; the column count is fine. */
+    const blasint rows = INVALID, cols = 1;
+
+    /* Leading dimensions that are acceptable on their own. */
+    const blasint lead_a = 1, lead_c = 1;
+
+    /* The error handler should be told about argument 1 (m). */
+    const int info_wanted = 1;
+    const int ok = check_badargs('C', layout, rows, cols, lead_a, lead_c, info_wanted);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param m -
+ * number of rows of A and C
+ * Must be at least zero.
+ *
+ * c api option order is row-major order
+ */
+CTEST(sgeadd, c_api_xerbla_m_invalid_row_major)
+{
+    const CBLAS_ORDER layout = CblasRowMajor;
+
+    /* Only the row count is invalid; the column count is fine. */
+    const blasint rows = INVALID, cols = 1;
+
+    /* Leading dimensions that are acceptable on their own. */
+    const blasint lead_a = 1, lead_c = 1;
+
+    /* The error handler should be told about argument 1 (m). */
+    const int info_wanted = 1;
+    const int ok = check_badargs('C', layout, rows, cols, lead_a, lead_c, info_wanted);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param lda -
+ * specifies the leading dimension of A. Must be at least MAX(1, m).
+ *
+ * c api option order is column-major order
+ */
+CTEST(sgeadd, c_api_xerbla_lda_invalid)
+{
+    const CBLAS_ORDER layout = CblasColMajor;
+
+    /* A valid 1 x 1 problem size. */
+    const blasint rows = 1, cols = 1;
+
+    /* Only the leading dimension of A is invalid. */
+    const blasint lead_a = INVALID, lead_c = 1;
+
+    /* The error handler should be told about argument 5 (lda). */
+    const int info_wanted = 5;
+    const int ok = check_badargs('C', layout, rows, cols, lead_a, lead_c, info_wanted);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param lda -
+ * specifies the leading dimension of A. Must be at least MAX(1, m).
+ *
+ * c api option order is row-major order
+ */
+CTEST(sgeadd, c_api_xerbla_lda_invalid_row_major)
+{
+    const CBLAS_ORDER layout = CblasRowMajor;
+
+    /* A valid 1 x 1 problem size. */
+    const blasint rows = 1, cols = 1;
+
+    /* Only the leading dimension of A is invalid. */
+    const blasint lead_a = INVALID, lead_c = 1;
+
+    /* The error handler should be told about argument 5 (lda). */
+    const int info_wanted = 5;
+    const int ok = check_badargs('C', layout, rows, cols, lead_a, lead_c, info_wanted);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param ldc -
+ * specifies the leading dimension of C. Must be at least MAX(1, m).
+ *
+ * c api option order is column-major order
+ */
+CTEST(sgeadd, c_api_xerbla_ldc_invalid)
+{
+    const CBLAS_ORDER layout = CblasColMajor;
+
+    /* A valid 1 x 1 problem size. */
+    const blasint rows = 1, cols = 1;
+
+    /* Only the leading dimension of C is invalid. */
+    const blasint lead_a = 1, lead_c = INVALID;
+
+    /* The error handler should be told about argument 8 (ldc). */
+    const int info_wanted = 8;
+    const int ok = check_badargs('C', layout, rows, cols, lead_a, lead_c, info_wanted);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param ldc -
+ * specifies the leading dimension of C. Must be at least MAX(1, m).
+ *
+ * c api option order is row-major order
+ */
+CTEST(sgeadd, c_api_xerbla_ldc_invalid_row_major)
+{
+    const CBLAS_ORDER layout = CblasRowMajor;
+
+    /* A valid 1 x 1 problem size. */
+    const blasint rows = 1, cols = 1;
+
+    /* Only the leading dimension of C is invalid. */
+    const blasint lead_a = 1, lead_c = INVALID;
+
+    /* The error handler should be told about argument 8 (ldc). */
+    const int info_wanted = 8;
+    const int ok = check_badargs('C', layout, rows, cols, lead_a, lead_c, info_wanted);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Check if n - number of columns of A, C equal zero.
+ *
+ * c api option order is column-major order
+ */
+CTEST(sgeadd, c_api_n_zero)
+{
+    const CBLAS_ORDER layout = CblasColMajor;
+
+    /* A single row and zero columns. */
+    const blasint rows = 1, cols = 0;
+
+    /* Leading dimensions that are acceptable on their own. */
+    const blasint lead_a = 1, lead_c = 1;
+
+    /* alpha and beta are both one. */
+    const float scale_a = 1.0f, scale_c = 1.0f;
+
+    const float diff = check_sgeadd('C', layout, rows, cols, scale_a,
+                                    lead_a, scale_c, lead_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Check if m - number of rows of A and C equal zero.
+ *
+ * c api option order is column-major order
+ */
+CTEST(sgeadd, c_api_m_zero)
+{
+    const CBLAS_ORDER layout = CblasColMajor;
+
+    /* Zero rows and a single column. */
+    const blasint rows = 0, cols = 1;
+
+    /* Leading dimensions that are acceptable on their own. */
+    const blasint lead_a = 1, lead_c = 1;
+
+    /* alpha and beta are both one. */
+    const float scale_a = 1.0f, scale_c = 1.0f;
+
+    const float diff = check_sgeadd('C', layout, rows, cols, scale_a,
+                                    lead_a, scale_c, lead_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_sgemmt.c b/utest/test_extensions/test_sgemmt.c
new file mode 100644
index 000000000..5b51e3579
--- /dev/null
+++ b/utest/test_extensions/test_sgemmt.c
@@ -0,0 +1,1442 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+
+struct DATA_SGEMMT { /* Buffers shared by the sgemmt tests, DATASIZE * DATASIZE floats each */
+    float a_test[DATASIZE * DATASIZE];   /* matrix A handed to both sgemm and sgemmt */
+    float b_test[DATASIZE * DATASIZE];   /* matrix B handed to both sgemm and sgemmt */
+    float c_test[DATASIZE * DATASIZE];   /* matrix C updated by sgemmt, the routine under test */
+    float c_verify[DATASIZE * DATASIZE]; /* expected C; later overwritten with expected - actual */
+    float c_gemm[DATASIZE * DATASIZE];   /* matrix C updated by the reference sgemm call */
+};
+
+#ifdef BUILD_SINGLE
+static struct DATA_SGEMMT data_sgemmt;
+
+/**
+ * Compute gemmt via gemm since gemmt is gemm but updates only 
+ * the upper or lower triangular part of the result matrix
+ *
+ * param api specifies tested api (C or Fortran)
+ * param order specifies row or column major order (for Fortran API column major always)
+ * param uplo specifies whether C’s data is stored in its upper or lower triangle
+ * param transa specifies op(A), the transposition operation applied to A
+ * param transb specifies op(B), the transposition operation applied to B
+ * param m - number of rows of op(A), columns of op(B), and columns and rows of C
+ * param k - number of columns of op(A) and rows of op(B)
+ * param alpha - scaling factor for the matrix-matrix product
+ * param lda - leading dimension of A
+ * param ldb - leading dimension of B
+ * param beta - scaling factor for matrix C
+ * param ldc - leading dimension of C
+ */
+static void sgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char transa,
+                           char transb, blasint m, blasint k, float alpha, blasint lda,
+                           blasint ldb, float beta, blasint ldc)
+{
+    blasint outer, inner, first, last;
+    const int lower = (uplo == 'L' || uplo == CblasLower);
+    float *a = data_sgemmt.a_test, *b = data_sgemmt.b_test;
+    float *full = data_sgemmt.c_gemm, *expected = data_sgemmt.c_verify;
+
+    /* Full m x m product through the selected API, written into c_gemm. */
+    if (api == 'F')
+        BLASFUNC(sgemm)(&transa, &transb, &m, &m, &k, &alpha, a, &lda,
+                        b, &ldb, &beta, full, &ldc);
+    else
+        cblas_sgemm(order, transa, transb, m, m, k, alpha, a, lda, b, ldb, beta, full, ldc);
+
+    /* Copy one triangle (diagonal included) of the product into c_verify.
+     * NOTE(review): element [outer * ldc + inner] is (row inner, column outer)
+     * only for column-major storage -- confirm the intent for CblasRowMajor. */
+    for (outer = 0; outer < m; outer++) {
+        first = lower ? outer : 0;
+        last = lower ? m - 1 : outer;
+        for (inner = first; inner <= last; inner++)
+            expected[outer * ldc + inner] = full[outer * ldc + inner];
+    }
+}
+
+/**
+ * Compare results computed by sgemmt and sgemmt_trusted
+ *
+ * param api specifies tested api (C or Fortran)
+ * param order specifies row or column major order (for Fortran API column major always)
+ * param uplo specifies whether C’s data is stored in its upper or lower triangle
+ * param transa specifies op(A), the transposition operation applied to A
+ * param transb specifies op(B), the transposition operation applied to B
+ * param m - number of rows of op(A), columns of op(B), and columns and rows of C
+ * param k - number of columns of op(A) and rows of op(B)
+ * param alpha - scaling factor for the matrix-matrix product
+ * param lda - leading dimension of A
+ * param ldb - leading dimension of B
+ * param beta - scaling factor for matrix C
+ * param ldc - leading dimension of C
+ * return norm of differences
+ */
+static float check_sgemmt(char api, enum CBLAS_ORDER order, char uplo, char transa,
+                          char transb, blasint m, blasint k, float alpha, blasint lda,
+                          blasint ldb, float beta, blasint ldc)
+{
+    blasint idx, a_span, b_span;
+    blasint unit_stride = 1;
+    blasint c_len = m * ldc;
+    float *a = data_sgemmt.a_test, *b = data_sgemmt.b_test, *c = data_sgemmt.c_test;
+
+    /* Accept both the Fortran characters and the CBLAS enum values. */
+    const int a_flipped = (transa == 'T' || transa == 'C' ||
+                           transa == CblasTrans || transa == CblasConjTrans);
+    const int b_flipped = (transb == 'T' || transb == 'C' ||
+                           transb == CblasTrans || transb == CblasConjTrans);
+
+    /*
+     * a_span * lda elements of A and b_span * ldb elements of B get filled;
+     * which of m and k is used depends on storage order and transposition.
+     */
+    if (order == CblasColMajor) {
+        a_span = a_flipped ? m : k;
+        b_span = b_flipped ? k : m;
+    } else {
+        a_span = a_flipped ? k : m;
+        b_span = b_flipped ? m : k;
+    }
+
+    /* Fill the inputs through srand_generate, in the order A, B, C. */
+    srand_generate(a, a_span * lda);
+    srand_generate(b, b_span * ldb);
+    srand_generate(c, c_len);
+
+    /* Seed the sgemm workspace and the expected result with the initial C. */
+    for (idx = 0; idx < c_len; idx++) {
+        data_sgemmt.c_verify[idx] = c[idx];
+        data_sgemmt.c_gemm[idx] = c[idx];
+    }
+
+    sgemmt_trusted(api, order, uplo, transa, transb, m, k, alpha, lda, ldb, beta, ldc);
+
+    /* Routine under test, writing its result into c_test. */
+    if (api == 'F')
+        BLASFUNC(sgemmt)(&uplo, &transa, &transb, &m, &k, &alpha, a, &lda,
+                         b, &ldb, &beta, c, &ldc);
+    else
+        cblas_sgemmt(order, uplo, transa, transb, m, k, alpha, a, lda, b, ldb, beta, c, ldc);
+
+    /* Expected minus actual, then the snrm2 norm divided by the element count.
+     * NOTE(review): c_len == 0 would make the division 0/0 -- confirm unused. */
+    for (idx = 0; idx < c_len; idx++)
+        data_sgemmt.c_verify[idx] -= c[idx];
+    return BLASFUNC(snrm2)(&c_len, data_sgemmt.c_verify, &unit_stride) / c_len;
+}
+
+/**
+ * Check if error function was called with expected function name and param info
+ *
+ * param api, order - tested api (C or Fortran) and row or column major order
+ * param uplo specifies whether C’s data is stored in its upper or lower triangle
+ * param transa specifies op(A), the transposition operation applied to A
+ * param transb specifies op(B), the transposition operation applied to B
+ * param m - number of rows of op(A), columns of op(B), and columns and rows of C
+ * param k - number of columns of op(A) and rows of op(B)
+ * param lda - leading dimension of A
+ * param ldb - leading dimension of B
+ * param ldc - leading dimension of C
+ * param expected_info - expected invalid parameter number in sgemmt
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char api, enum CBLAS_ORDER order, char uplo, char transa,
+                         char transb, blasint m, blasint k, blasint lda, blasint ldb,
+                         blasint ldc, int expected_info)
+{
+    float one = 1.0f, zero = 0.0f;
+    float *a = data_sgemmt.a_test, *b = data_sgemmt.b_test, *c = data_sgemmt.c_test;
+
+    /* Announce the routine name and info value the error check should expect. */
+    set_xerbla("SGEMMT ", expected_info);
+
+    if (api != 'F')
+        cblas_sgemmt(order, uplo, transa, transb, m, k, one, a, lda, b, ldb, zero, c, ldc);
+    else
+        BLASFUNC(sgemmt)(&uplo, &transa, &transb, &m, &k, &one, a, &lda,
+                         b, &ldb, &zero, c, &ldc);
+
+    return check_error();
+}
+
+/**
+ * Fortran API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(sgemmt, upper_M_50_K_50_a_notrans_b_notrans)
+{
+    /* Upper triangle of C; neither A nor B is transposed. */
+    const char tri = 'U', op_a = 'N', op_b = 'N';
+    /* dim is m (C is dim x dim); depth is k (columns of op(A), rows of op(B)). */
+    const blasint dim = 50, depth = 50;
+    const blasint lead_a = 50, lead_b = 50, lead_c = 50;
+    const float scale_ab = 1.5f, scale_c = 2.0f;
+
+    const float diff = check_sgemmt('F', CblasColMajor, tri, op_a, op_b, dim, depth,
+                                    scale_ab, lead_a, lead_b, scale_c, lead_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(sgemmt, upper_M_100_K_50_a_trans_b_notrans)
+{
+    /* Upper triangle of C; A is transposed, B is not. */
+    const char tri = 'U', op_a = 'T', op_b = 'N';
+    /* dim is m (C is dim x dim); depth is k (columns of op(A), rows of op(B)). */
+    const blasint dim = 100, depth = 50;
+    const blasint lead_a = 50, lead_b = 50, lead_c = 100;
+    const float scale_ab = 1.0f, scale_c = 2.0f;
+
+    const float diff = check_sgemmt('F', CblasColMajor, tri, op_a, op_b, dim, depth,
+                                    scale_ab, lead_a, lead_b, scale_c, lead_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(sgemmt, upper_M_50_K_100_a_notrans_b_trans)
+{
+    /* Upper triangle of C; B is transposed, A is not. */
+    const char tri = 'U', op_a = 'N', op_b = 'T';
+    /* dim is m (C is dim x dim); depth is k (columns of op(A), rows of op(B)). */
+    const blasint dim = 50, depth = 100;
+    const blasint lead_a = 50, lead_b = 50, lead_c = 50;
+    const float scale_ab = 1.0f, scale_c = 0.0f;
+
+    const float diff = check_sgemmt('F', CblasColMajor, tri, op_a, op_b, dim, depth,
+                                    scale_ab, lead_a, lead_b, scale_c, lead_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(sgemmt, upper_M_50_K_50_a_trans_b_trans)
+{
+    /* Upper triangle of C; both A and B are transposed. */
+    const char tri = 'U', op_a = 'T', op_b = 'T';
+    /* dim is m (C is dim x dim); depth is k (columns of op(A), rows of op(B)). */
+    const blasint dim = 50, depth = 50;
+    const blasint lead_a = 50, lead_b = 50, lead_c = 50;
+    const float scale_ab = 1.5f, scale_c = 2.0f;
+
+    const float diff = check_sgemmt('F', CblasColMajor, tri, op_a, op_b, dim, depth,
+                                    scale_ab, lead_a, lead_b, scale_c, lead_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * alpha = 0.0
+ */
+CTEST(sgemmt, upper_alpha_zero)
+{
+    /* Upper triangle of C; neither A nor B is transposed. */
+    const char tri = 'U', op_a = 'N', op_b = 'N';
+    /* dim is m (C is dim x dim); depth is k (columns of op(A), rows of op(B)). */
+    const blasint dim = 50, depth = 50;
+    const blasint lead_a = 50, lead_b = 50, lead_c = 50;
+    const float scale_ab = 0.0f, scale_c = 2.0f; /* alpha is zero */
+
+    const float diff = check_sgemmt('F', CblasColMajor, tri, op_a, op_b, dim, depth,
+                                    scale_ab, lead_a, lead_b, scale_c, lead_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its upper triangle part
+ * beta = 1.0
+ */
+CTEST(sgemmt, upper_beta_one)
+{
+    /* Upper triangle of C; neither A nor B is transposed. */
+    const char tri = 'U', op_a = 'N', op_b = 'N';
+    /* dim is m (C is dim x dim); depth is k (columns of op(A), rows of op(B)). */
+    const blasint dim = 50, depth = 50;
+    const blasint lead_a = 50, lead_b = 50, lead_c = 50;
+    const float scale_ab = 2.0f, scale_c = 1.0f; /* beta is one */
+
+    const float diff = check_sgemmt('F', CblasColMajor, tri, op_a, op_b, dim, depth,
+                                    scale_ab, lead_a, lead_b, scale_c, lead_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(sgemmt, lower_M_50_K_50_a_notrans_b_notrans)
+{
+    /* Lower triangle of C; neither A nor B is transposed. */
+    const char tri = 'L', op_a = 'N', op_b = 'N';
+    /* dim is m (C is dim x dim); depth is k (columns of op(A), rows of op(B)). */
+    const blasint dim = 50, depth = 50;
+    const blasint lead_a = 50, lead_b = 50, lead_c = 50;
+    const float scale_ab = 1.5f, scale_c = 2.0f;
+
+    const float diff = check_sgemmt('F', CblasColMajor, tri, op_a, op_b, dim, depth,
+                                    scale_ab, lead_a, lead_b, scale_c, lead_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(sgemmt, lower_M_100_K_50_a_trans_b_notrans)
+{
+    /* Lower triangle of C; A is transposed, B is not. */
+    const char tri = 'L', op_a = 'T', op_b = 'N';
+    /* dim is m (C is dim x dim); depth is k (columns of op(A), rows of op(B)). */
+    const blasint dim = 100, depth = 50;
+    const blasint lead_a = 50, lead_b = 50, lead_c = 100;
+    const float scale_ab = 1.0f, scale_c = 2.0f;
+
+    const float diff = check_sgemmt('F', CblasColMajor, tri, op_a, op_b, dim, depth,
+                                    scale_ab, lead_a, lead_b, scale_c, lead_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(sgemmt, lower_M_50_K_100_a_notrans_b_trans)
+{
+    /* Lower triangle of C; B is transposed, A is not. */
+    const char tri = 'L', op_a = 'N', op_b = 'T';
+    /* dim is m (C is dim x dim); depth is k (columns of op(A), rows of op(B)). */
+    const blasint dim = 50, depth = 100;
+    const blasint lead_a = 50, lead_b = 50, lead_c = 50;
+    const float scale_ab = 1.0f, scale_c = 0.0f;
+
+    const float diff = check_sgemmt('F', CblasColMajor, tri, op_a, op_b, dim, depth,
+                                    scale_ab, lead_a, lead_b, scale_c, lead_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(sgemmt, lower_M_50_K_50_a_trans_b_trans)
+{
+    /* Lower triangle of C; both A and B are transposed. */
+    const char tri = 'L', op_a = 'T', op_b = 'T';
+    /* dim is m (C is dim x dim); depth is k (columns of op(A), rows of op(B)). */
+    const blasint dim = 50, depth = 50;
+    const blasint lead_a = 50, lead_b = 50, lead_c = 50;
+    const float scale_ab = 1.5f, scale_c = 2.0f;
+
+    const float diff = check_sgemmt('F', CblasColMajor, tri, op_a, op_b, dim, depth,
+                                    scale_ab, lead_a, lead_b, scale_c, lead_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * alpha = 0.0
+ */
+CTEST(sgemmt, lower_alpha_zero)
+{
+    /* Lower triangle of C; neither A nor B is transposed. */
+    const char tri = 'L', op_a = 'N', op_b = 'N';
+    /* dim is m (C is dim x dim); depth is k (columns of op(A), rows of op(B)). */
+    const blasint dim = 50, depth = 50;
+    const blasint lead_a = 50, lead_b = 50, lead_c = 50;
+    const float scale_ab = 0.0f, scale_c = 2.0f; /* alpha is zero */
+
+    const float diff = check_sgemmt('F', CblasColMajor, tri, op_a, op_b, dim, depth,
+                                    scale_ab, lead_a, lead_b, scale_c, lead_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * C’s data is stored in its lower triangle part
+ * beta = 1.0
+ */
+CTEST(sgemmt, lower_beta_one)
+{
+    /* Lower triangle of C; neither A nor B is transposed. */
+    const char tri = 'L', op_a = 'N', op_b = 'N';
+    /* dim is m (C is dim x dim); depth is k (columns of op(A), rows of op(B)). */
+    const blasint dim = 50, depth = 50;
+    const blasint lead_a = 50, lead_b = 50, lead_c = 50;
+    const float scale_ab = 2.0f, scale_c = 1.0f; /* beta is one */
+
+    const float diff = check_sgemmt('F', CblasColMajor, tri, op_a, op_b, dim, depth,
+                                    scale_ab, lead_a, lead_b, scale_c, lead_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(sgemmt, c_api_colmajor_upper_M_50_K_50_a_notrans_b_notrans)
+{
+    /* dim is m (C is dim x dim); depth is k (columns of op(A), rows of op(B)). */
+    const blasint dim = 50, depth = 50;
+    const blasint lead_a = 50, lead_b = 50, lead_c = 50;
+    const float scale_ab = 1.5f, scale_c = 2.0f;
+
+    const float diff = check_sgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans,
+                                    CblasNoTrans, dim, depth, scale_ab,
+                                    lead_a, lead_b, scale_c, lead_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(sgemmt, c_api_colmajor_upper_M_100_K_50_a_trans_b_notrans)
+{
+    /* dim is m (C is dim x dim); depth is k (columns of op(A), rows of op(B)). */
+    const blasint dim = 100, depth = 50;
+    const blasint lead_a = 50, lead_b = 50, lead_c = 100;
+    const float scale_ab = 1.0f, scale_c = 2.0f;
+
+    const float diff = check_sgemmt('C', CblasColMajor, CblasUpper, CblasTrans,
+                                    CblasNoTrans, dim, depth, scale_ab,
+                                    lead_a, lead_b, scale_c, lead_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(sgemmt, c_api_colmajor_upper_M_50_K_100_a_notrans_b_trans)
+{
+    /* dim is m (C is dim x dim); depth is k (columns of op(A), rows of op(B)). */
+    const blasint dim = 50, depth = 100;
+    const blasint lead_a = 50, lead_b = 50, lead_c = 50;
+    const float scale_ab = 1.0f, scale_c = 0.0f;
+
+    const float diff = check_sgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans,
+                                    CblasTrans, dim, depth, scale_ab,
+                                    lead_a, lead_b, scale_c, lead_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(sgemmt, c_api_colmajor_upper_M_50_K_50_a_trans_b_trans)
+{
+    /* dim is m (C is dim x dim); depth is k (columns of op(A), rows of op(B)). */
+    const blasint dim = 50, depth = 50;
+    const blasint lead_a = 50, lead_b = 50, lead_c = 50;
+    const float scale_ab = 1.5f, scale_c = 2.0f;
+
+    const float diff = check_sgemmt('C', CblasColMajor, CblasUpper, CblasTrans,
+                                    CblasTrans, dim, depth, scale_ab,
+                                    lead_a, lead_b, scale_c, lead_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * alpha = 0.0
+ */
+CTEST(sgemmt, c_api_colmajor_upper_alpha_zero)
+{
+    /* dim is m (C is dim x dim); depth is k (columns of op(A), rows of op(B)). */
+    const blasint dim = 50, depth = 50;
+    const blasint lead_a = 50, lead_b = 50, lead_c = 50;
+    const float scale_ab = 0.0f, scale_c = 2.0f; /* alpha is zero */
+
+    const float diff = check_sgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans,
+                                    CblasNoTrans, dim, depth, scale_ab,
+                                    lead_a, lead_b, scale_c, lead_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its upper triangle part
+ * beta = 1.0
+ */
+CTEST(sgemmt, c_api_colmajor_upper_beta_one)
+{
+    /* dim is m (C is dim x dim); depth is k (columns of op(A), rows of op(B)). */
+    const blasint dim = 50, depth = 50;
+    const blasint lead_a = 50, lead_b = 50, lead_c = 50;
+    const float scale_ab = 2.0f, scale_c = 1.0f; /* beta is one */
+
+    const float diff = check_sgemmt('C', CblasColMajor, CblasUpper, CblasNoTrans,
+                                    CblasNoTrans, dim, depth, scale_ab,
+                                    lead_a, lead_b, scale_c, lead_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(sgemmt, c_api_colmajor_lower_M_50_K_50_a_notrans_b_notrans)
+{
+    /* dim is m (C is dim x dim); depth is k (columns of op(A), rows of op(B)). */
+    const blasint dim = 50, depth = 50;
+    const blasint lead_a = 50, lead_b = 50, lead_c = 50;
+    const float scale_ab = 1.5f, scale_c = 2.0f;
+
+    const float diff = check_sgemmt('C', CblasColMajor, CblasLower, CblasNoTrans,
+                                    CblasNoTrans, dim, depth, scale_ab,
+                                    lead_a, lead_b, scale_c, lead_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(sgemmt, c_api_colmajor_lower_M_100_K_50_a_trans_b_notrans)
+{
+    /* dim is m (C is dim x dim); depth is k (columns of op(A), rows of op(B)). */
+    const blasint dim = 100, depth = 50;
+    const blasint lead_a = 50, lead_b = 50, lead_c = 100;
+    const float scale_ab = 1.0f, scale_c = 2.0f;
+
+    const float diff = check_sgemmt('C', CblasColMajor, CblasLower, CblasTrans,
+                                    CblasNoTrans, dim, depth, scale_ab,
+                                    lead_a, lead_b, scale_c, lead_c);
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(sgemmt, c_api_colmajor_lower_M_50_K_100_a_notrans_b_trans)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha = 1.0f;
+    float beta = 0.0f;
+
+    float norm = check_sgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(sgemmt, c_api_colmajor_lower_M_50_K_50_a_trans_b_trans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha = 1.5f;
+    float beta = 2.0f;
+
+    float norm = check_sgemmt('C', CblasColMajor, CblasLower, CblasTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * alpha = 0.0
+ */
+CTEST(sgemmt, c_api_colmajor_lower_alpha_zero)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha = 0.0f;
+    float beta = 2.0f;
+
+    float norm = check_sgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Column Major
+ * C’s data is stored in its lower triangle part
+ * beta = 1.0
+ */
+CTEST(sgemmt, c_api_colmajor_lower_beta_one)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha = 2.0f;
+    float beta = 1.0f;
+
+    float norm = check_sgemmt('C', CblasColMajor, CblasLower, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(sgemmt, c_api_rowmajor_upper_M_50_K_50_a_notrans_b_notrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha = 1.5f;
+    float beta = 2.0f;
+
+    float norm = check_sgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(sgemmt, c_api_rowmajor_upper_M_100_K_50_a_trans_b_notrans)
+{
+    blasint M = 100, K = 50;
+    blasint lda = 100, ldb = 100, ldc = 100;
+    float alpha = 1.0f;
+    float beta = 2.0f;
+
+    float norm = check_sgemmt('C', CblasRowMajor, CblasUpper, CblasTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(sgemmt, c_api_rowmajor_upper_M_50_K_100_a_notrans_b_trans)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 100, ldb = 100, ldc = 50;
+    float alpha = 1.0f;
+    float beta = 0.0f;
+
+    float norm = check_sgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(sgemmt, c_api_rowmajor_upper_M_50_K_50_a_trans_b_trans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha = 1.5f;
+    float beta = 2.0f;
+
+    float norm = check_sgemmt('C', CblasRowMajor, CblasUpper, CblasTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * alpha = 0.0
+ */
+CTEST(sgemmt, c_api_rowmajor_upper_alpha_zero)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha = 0.0f;
+    float beta = 2.0f;
+
+    float norm = check_sgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its upper triangle part
+ * beta = 1.0
+ */
+CTEST(sgemmt, c_api_rowmajor_upper_beta_one)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha = 2.0f;
+    float beta = 1.0f;
+
+    float norm = check_sgemmt('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(sgemmt, c_api_rowmajor_lower_M_50_K_50_a_notrans_b_notrans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha = 1.5f;
+    float beta = 2.0f;
+
+    float norm = check_sgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(sgemmt, c_api_rowmajor_lower_M_100_K_50_a_trans_b_notrans)
+{
+    blasint M = 100, K = 50;
+    blasint lda = 100, ldb = 100, ldc = 100;
+    float alpha = 1.0f;
+    float beta = 2.0f;
+
+    float norm = check_sgemmt('C', CblasRowMajor, CblasLower, CblasTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(sgemmt, c_api_rowmajor_lower_M_50_K_100_a_notrans_b_trans)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 100, ldb = 100, ldc = 50;
+    float alpha = 1.0f;
+    float beta = 0.0f;
+
+    float norm = check_sgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(sgemmt, c_api_rowmajor_lower_M_50_K_50_a_trans_b_trans)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha = 1.5f;
+    float beta = 2.0f;
+
+    float norm = check_sgemmt('C', CblasRowMajor, CblasLower, CblasTrans, CblasTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * alpha = 0.0
+ */
+CTEST(sgemmt, c_api_rowmajor_lower_alpha_zero)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha = 0.0f;
+    float beta = 2.0f;
+
+    float norm = check_sgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test sgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * beta = 1.0
+ */
+CTEST(sgemmt, c_api_rowmajor_lower_beta_one)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    float alpha = 2.0f;
+    float beta = 1.0f;
+
+    float norm = check_sgemmt('C', CblasRowMajor, CblasLower, CblasNoTrans, CblasNoTrans,
+                              M, K, alpha, lda, ldb, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param uplo.
+ * Must be upper (U) or lower (L).
+ */
+CTEST(sgemmt, xerbla_uplo_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'O';
+    int expected_info = 1;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param transa.
+ * Must be trans (T/C) or no-trans (N/R).
+ */
+CTEST(sgemmt, xerbla_transa_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'O', transb = 'N';
+    char uplo = 'U';
+    int expected_info = 2;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param transb.
+ * Must be trans (T/C) or no-trans (N/R).
+ */
+CTEST(sgemmt, xerbla_transb_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'O';
+    char uplo = 'U';
+    int expected_info = 3;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param M.
+ * Must be positive.
+ */
+CTEST(sgemmt, xerbla_m_invalid)
+{
+    blasint M = -1, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'U';
+    int expected_info = 4;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param K.
+ * Must be positive.
+ */
+CTEST(sgemmt, xerbla_k_invalid)
+{
+    blasint M = 50, K = -1;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'U';
+    int expected_info = 5;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param lda.
+ * Must be at least K if matrix A is transposed.
+ */
+CTEST(sgemmt, xerbla_lda_invalid)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 100, ldc = 50;
+    char transa = 'T', transb = 'N';
+    char uplo = 'U';
+    int expected_info = 8;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param ldb.
+ * Must be at least K if matrix B is not transposed.
+ */
+CTEST(sgemmt, xerbla_ldb_invalid)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'N', transb = 'N';
+    char uplo = 'U';
+    int expected_info = 10;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param ldc.
+ * Must be at least M.
+ */
+CTEST(sgemmt, xerbla_ldc_invalid)
+{
+    blasint M = 100, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    char transa = 'T', transb = 'N';
+    char uplo = 'U';
+    int expected_info = 13;
+
+    int passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test.
+ * Test error function for an invalid param order.
+ * Must be column or row major.
+ */
+CTEST(sgemmt, xerbla_c_api_major_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 0;
+
+    int passed = check_badargs('C', 'O', CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param uplo.
+ * Must be upper or lower.
+ */
+CTEST(sgemmt, xerbla_c_api_colmajor_uplo_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 1;
+
+    int passed = check_badargs('C', CblasColMajor, 'O', CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param transa.
+ * Must be trans or no-trans.
+ */
+CTEST(sgemmt, xerbla_c_api_colmajor_transa_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 2;
+
+    int passed = check_badargs('C', CblasColMajor, CblasUpper, 'O', CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param transb.
+ * Must be trans or no-trans.
+ */
+CTEST(sgemmt, xerbla_c_api_colmajor_transb_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 3;
+
+    int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, 'O',
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param M.
+ * Must be positive.
+ */
+CTEST(sgemmt, xerbla_c_api_colmajor_m_invalid)
+{
+    blasint M = -1, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 4;
+
+    int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param K.
+ * Must be positive.
+ */
+CTEST(sgemmt, xerbla_c_api_colmajor_k_invalid)
+{
+    blasint M = 50, K = -1;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 5;
+
+    int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param lda.
+ * Must be at least K if matrix A is transposed.
+ */
+CTEST(sgemmt, xerbla_c_api_colmajor_lda_invalid)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 100, ldc = 50;
+    int expected_info = 8;
+
+    int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param ldb.
+ * Must be at least K if matrix B is not transposed.
+ */
+CTEST(sgemmt, xerbla_c_api_colmajor_ldb_invalid)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 10;
+
+    int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param ldc.
+ * Must be at least M.
+ */
+CTEST(sgemmt, xerbla_c_api_colmajor_ldc_invalid)
+{
+    blasint M = 100, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 13;
+
+    int passed = check_badargs('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param uplo.
+ * Must be upper or lower.
+ */
+CTEST(sgemmt, xerbla_c_api_rowmajor_uplo_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 1;
+
+    int passed = check_badargs('C', CblasRowMajor, 'O', CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param transa.
+ * Must be trans or no-trans.
+ */
+CTEST(sgemmt, xerbla_c_api_rowmajor_transa_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 2;
+
+    int passed = check_badargs('C', CblasRowMajor, CblasUpper, 'O', CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param transb.
+ * Must be trans or no-trans.
+ */
+CTEST(sgemmt, xerbla_c_api_rowmajor_transb_invalid)
+{
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 3;
+
+    int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, 'O',
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param M.
+ * Must be positive.
+ */
+CTEST(sgemmt, xerbla_c_api_rowmajor_m_invalid)
+{
+    blasint M = -1, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 4;
+
+    int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param K.
+ * Must be positive.
+ */
+CTEST(sgemmt, xerbla_c_api_rowmajor_k_invalid)
+{
+    blasint M = 50, K = -1;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 5;
+
+    int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param lda.
+ * Must be at least K if matrix A is not transposed (row major).
+ */
+CTEST(sgemmt, xerbla_c_api_rowmajor_lda_invalid)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 8;
+
+    int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param ldb.
+ * Must be at least K if matrix B is transposed.
+ */
+CTEST(sgemmt, xerbla_c_api_rowmajor_ldb_invalid)
+{
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 10;
+
+    int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasTrans, CblasTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param ldc.
+ * Must be at least M.
+ */
+CTEST(sgemmt, xerbla_c_api_rowmajor_ldc_invalid)
+{
+    blasint M = 100, K = 50;
+    blasint lda = 100, ldb = 100, ldc = 50;
+    int expected_info = 13;
+
+    int passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasTrans, CblasNoTrans,
+                            M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_simatcopy.c b/utest/test_extensions/test_simatcopy.c
new file mode 100644
index 000000000..ba388596d
--- /dev/null
+++ b/utest/test_extensions/test_simatcopy.c
@@ -0,0 +1,915 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+
+struct DATA_SIMATCOPY {
+    float a_test[DATASIZE* DATASIZE];
+    float a_verify[DATASIZE* DATASIZE];
+};
+
+#ifdef BUILD_SINGLE
+static struct DATA_SIMATCOPY data_simatcopy;
+
+/**
+ * Compare results computed by simatcopy and reference func
+ *
+ * param api specifies tested api ('F' for Fortran, anything else for CBLAS)
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation
+ * applied to the matrix A
+ * param rows specifies number of rows of A
+ * param cols specifies number of columns of A
+ * param alpha specifies scaling factor for matrix A
+ * param lda_src - leading dimension of the matrix A
+ * param lda_dst - leading dimension of output matrix A
+ * return norm of difference between openblas and reference func
+ */
+static float check_simatcopy(char api, char order, char trans, blasint rows, blasint cols, float alpha, 
+                             blasint lda_src, blasint lda_dst)
+{
+    blasint m, n;
+    blasint rows_out, cols_out;
+    enum CBLAS_ORDER corder = CblasRowMajor;     // same fallback as the non-'C' branch below
+    enum CBLAS_TRANSPOSE ctrans = CblasNoTrans;  // same fallback as the copy-only reference path
+
+    if (order == 'C') {
+        n = rows; m = cols;
+    }
+    else {
+        m = rows; n = cols;
+    }
+
+    if(trans == 'T' || trans == 'C') {
+        rows_out = n; cols_out = m;
+    }
+    else {
+        rows_out = m; cols_out = n;
+    }
+
+    srand_generate(data_simatcopy.a_test, lda_src*m);
+    // Build the expected result first: simatcopy below overwrites a_test in place
+    if (trans == 'T' || trans == 'C') {
+        stranspose(m, n, alpha, data_simatcopy.a_test, lda_src, data_simatcopy.a_verify, lda_dst);
+    } 
+    else {
+        scopy(m, n, alpha, data_simatcopy.a_test, lda_src, data_simatcopy.a_verify, lda_dst);
+    }
+
+    if (api == 'F') {
+        BLASFUNC(simatcopy)(&order, &trans, &rows, &cols, &alpha, data_simatcopy.a_test, 
+                            &lda_src, &lda_dst);
+    }
+    else {
+        if (order == 'C') corder = CblasColMajor;
+        if (order == 'R') corder = CblasRowMajor;
+        if (trans == 'T') ctrans = CblasTrans;
+        if (trans == 'N') ctrans = CblasNoTrans;
+        if (trans == 'C') ctrans = CblasConjTrans;
+        if (trans == 'R') ctrans = CblasConjNoTrans;
+        cblas_simatcopy(corder, ctrans, rows, cols, alpha, data_simatcopy.a_test, 
+                    lda_src, lda_dst);
+    }
+
+    // Find the differences between output matrix computed by simatcopy and reference func
+    return smatrix_difference(data_simatcopy.a_test, data_simatcopy.a_verify, cols_out, rows_out, lda_dst);
+}
+
+/**
+ * Check if error function was called with expected function name
+ * and param info. Only the Fortran interface is called, with alpha = 1.0f.
+ *
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation
+ * applied to the matrix A
+ * param rows specifies number of rows of A
+ * param cols specifies number of columns of A
+ * param lda_src - leading dimension of the matrix A
+ * param lda_dst - leading dimension of output matrix A
+ * param expected_info - expected invalid parameter number
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char order, char trans, blasint rows, blasint cols,
+                          blasint lda_src, blasint lda_dst, int expected_info)
+{
+    float alpha = 1.0f;
+
+    set_xerbla("SIMATCOPY", expected_info);
+
+    BLASFUNC(simatcopy)(&order, &trans, &rows, &cols, &alpha, data_simatcopy.a_test, 
+                        &lda_src, &lda_dst);
+
+    return check_error();
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * alpha = 1.0f
+ */
+CTEST(simatcopy, colmajor_trans_col_100_row_100_alpha_one)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'T';
+    float alpha = 1.0f;
+
+    float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Square matrix
+ * alpha = 1.0f
+ */
+CTEST(simatcopy, colmajor_notrans_col_100_row_100_alpha_one)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'N';
+    float alpha = 1.0f;
+
+    float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * alpha = 0.0f
+ */
+CTEST(simatcopy, colmajor_trans_col_100_row_100_alpha_zero)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'T';
+    float alpha = 0.0f;
+
+    float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Square matrix
+ * alpha = 0.0f
+ */
+CTEST(simatcopy, colmajor_notrans_col_100_row_100_alpha_zero)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'N';
+    float alpha = 0.0f;
+
+    float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * alpha = 2.0f
+ */
+CTEST(simatcopy, colmajor_trans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'T';
+    float alpha = 2.0f;
+
+    float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Square matrix
+ * alpha = 2.0f
+ */
+CTEST(simatcopy, colmajor_notrans_col_100_row_100)
+{
+    blasint m = 100, n = 100;
+    blasint lda_src = 100, lda_dst = 100;
+    char order = 'C';
+    char trans = 'N';
+    float alpha = 2.0f;
+
+    float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 1.0f
+ */
+CTEST(simatcopy, colmajor_trans_col_50_row_100_alpha_one)
+{
+    blasint m = 100, n = 50;
+    blasint lda_src = 100, lda_dst = 50;
+    char order = 'C';
+    char trans = 'T';
+    float alpha = 1.0f;
+
+    float norm = check_simatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, norm, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 1.0f
+ */
+CTEST(simatcopy, colmajor_notrans_col_50_row_100_alpha_one)
+{
+    /* Fortran API: column major, copy only, 100 rows x 50 cols, alpha = 1 */
+    const char layout = 'C', op = 'N';
+    const blasint rows = 100, cols = 50;
+    const blasint ld_in = 100, ld_out = 100;
+    const float scale = 1.0f;
+
+    const float diff = check_simatcopy('F', layout, op, rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 0.0f
+ */
+CTEST(simatcopy, colmajor_trans_col_50_row_100_alpha_zero)
+{
+    /* Fortran API: column major, transposition, 100 rows x 50 cols, alpha = 0 */
+    const char layout = 'C', op = 'T';
+    const blasint rows = 100, cols = 50;
+    const blasint ld_in = 100, ld_out = 50;
+    const float scale = 0.0f;
+
+    const float diff = check_simatcopy('F', layout, op, rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 0.0f
+ */
+CTEST(simatcopy, colmajor_notrans_col_50_row_100_alpha_zero)
+{
+    /* Fortran API: column major, copy only, 100 rows x 50 cols, alpha = 0 */
+    const char layout = 'C', op = 'N';
+    const blasint rows = 100, cols = 50;
+    const blasint ld_in = 100, ld_out = 100;
+    const float scale = 0.0f;
+
+    const float diff = check_simatcopy('F', layout, op, rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 2.0f
+ */
+CTEST(simatcopy, colmajor_trans_col_50_row_100)
+{
+    /* Fortran API: column major, transposition, 100 rows x 50 cols, alpha = 2 */
+    const char layout = 'C', op = 'T';
+    const blasint rows = 100, cols = 50;
+    const blasint ld_in = 100, ld_out = 50;
+    const float scale = 2.0f;
+
+    const float diff = check_simatcopy('F', layout, op, rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 2.0f
+ */
+CTEST(simatcopy, colmajor_notrans_col_50_row_100)
+{
+    /* Fortran API: column major, copy only, 100 rows x 50 cols, alpha = 2 */
+    const char layout = 'C', op = 'N';
+    const blasint rows = 100, cols = 50;
+    const blasint ld_in = 100, ld_out = 100;
+    const float scale = 2.0f;
+
+    const float diff = check_simatcopy('F', layout, op, rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Square matrix
+ * alpha = 1.0f
+ */
+CTEST(simatcopy, rowmajor_trans_col_100_row_100_alpha_one)
+{
+    /* Fortran API: row major, transposition, 100 x 100, alpha = 1 */
+    const char layout = 'R', op = 'T';
+    const blasint rows = 100, cols = 100;
+    const blasint ld_in = 100, ld_out = 100;
+    const float scale = 1.0f;
+
+    const float diff = check_simatcopy('F', layout, op, rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Square matrix
+ * alpha = 1.0f
+ */
+CTEST(simatcopy, rowmajor_notrans_col_100_row_100_alpha_one)
+{
+    /* Fortran API: row major, copy only, 100 x 100, alpha = 1 */
+    const char layout = 'R', op = 'N';
+    const blasint rows = 100, cols = 100;
+    const blasint ld_in = 100, ld_out = 100;
+    const float scale = 1.0f;
+
+    const float diff = check_simatcopy('F', layout, op, rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Square matrix
+ * alpha = 0.0f
+ */
+CTEST(simatcopy, rowmajor_trans_col_100_row_100_alpha_zero)
+{
+    /* Fortran API: row major, transposition, 100 x 100, alpha = 0 */
+    const char layout = 'R', op = 'T';
+    const blasint rows = 100, cols = 100;
+    const blasint ld_in = 100, ld_out = 100;
+    const float scale = 0.0f;
+
+    const float diff = check_simatcopy('F', layout, op, rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Square matrix
+ * alpha = 0.0f
+ */
+CTEST(simatcopy, rowmajor_notrans_col_100_row_100_alpha_zero)
+{
+    /* Fortran API: row major, copy only, 100 x 100, alpha = 0 */
+    const char layout = 'R', op = 'N';
+    const blasint rows = 100, cols = 100;
+    const blasint ld_in = 100, ld_out = 100;
+    const float scale = 0.0f;
+
+    const float diff = check_simatcopy('F', layout, op, rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Square matrix
+ * alpha = 2.0f
+ */
+CTEST(simatcopy, rowmajor_trans_col_100_row_100)
+{
+    /* Fortran API: row major, transposition, 100 x 100, alpha = 2 */
+    const char layout = 'R', op = 'T';
+    const blasint rows = 100, cols = 100;
+    const blasint ld_in = 100, ld_out = 100;
+    const float scale = 2.0f;
+
+    const float diff = check_simatcopy('F', layout, op, rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Square matrix
+ * alpha = 2.0f
+ */
+CTEST(simatcopy, rowmajor_notrans_col_100_row_100)
+{
+    /* Fortran API: row major, copy only, 100 x 100, alpha = 2 */
+    const char layout = 'R', op = 'N';
+    const blasint rows = 100, cols = 100;
+    const blasint ld_in = 100, ld_out = 100;
+    const float scale = 2.0f;
+
+    const float diff = check_simatcopy('F', layout, op, rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 1.0f
+ */
+CTEST(simatcopy, rowmajor_trans_col_100_row_50_alpha_one)
+{
+    /* Fortran API: row major, 50 rows x 100 cols, alpha = 1; 'C' is the same as trans for a real matrix */
+    const char layout = 'R', op = 'C';
+    const blasint rows = 50, cols = 100;
+    const blasint ld_in = 100, ld_out = 50;
+    const float scale = 1.0f;
+
+    const float diff = check_simatcopy('F', layout, op, rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 1.0f
+ */
+CTEST(simatcopy, rowmajor_notrans_col_100_row_50_alpha_one)
+{
+    /* Fortran API: row major, copy only, 50 rows x 100 cols, alpha = 1 */
+    const char layout = 'R', op = 'N';
+    const blasint rows = 50, cols = 100;
+    const blasint ld_in = 100, ld_out = 100;
+    const float scale = 1.0f;
+
+    const float diff = check_simatcopy('F', layout, op, rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 0.0f
+ */
+CTEST(simatcopy, rowmajor_trans_col_100_row_50_alpha_zero)
+{
+    /* Fortran API: row major, 50 rows x 100 cols, alpha = 0; 'C' is the same as trans for a real matrix */
+    const char layout = 'R', op = 'C';
+    const blasint rows = 50, cols = 100;
+    const blasint ld_in = 100, ld_out = 50;
+    const float scale = 0.0f;
+
+    const float diff = check_simatcopy('F', layout, op, rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 0.0f
+ */
+CTEST(simatcopy, rowmajor_notrans_col_100_row_50_alpha_zero)
+{
+    /* Fortran API: row major, copy only, 50 rows x 100 cols, alpha = 0 */
+    const char layout = 'R', op = 'N';
+    const blasint rows = 50, cols = 100;
+    const blasint ld_in = 100, ld_out = 100;
+    const float scale = 0.0f;
+
+    const float diff = check_simatcopy('F', layout, op, rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 2.0f
+ */
+CTEST(simatcopy, rowmajor_trans_col_100_row_50)
+{
+    /* Fortran API: row major, 50 rows x 100 cols, alpha = 2; 'C' is the same as trans for a real matrix */
+    const char layout = 'R', op = 'C';
+    const blasint rows = 50, cols = 100;
+    const blasint ld_in = 100, ld_out = 50;
+    const float scale = 2.0f;
+
+    const float diff = check_simatcopy('F', layout, op, rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 2.0f
+ */
+CTEST(simatcopy, rowmajor_notrans_col_100_row_50)
+{
+    /* Fortran API: row major, copy only, 50 rows x 100 cols, alpha = 2 */
+    const char layout = 'R', op = 'N';
+    const blasint rows = 50, cols = 100;
+    const blasint ld_in = 100, ld_out = 100;
+    const float scale = 2.0f;
+
+    const float diff = check_simatcopy('F', layout, op, rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * alpha = 2.0f
+ */
+CTEST(simatcopy, c_api_colmajor_trans_col_100_row_100)
+{
+    /* C API: column major, transposition, 100 x 100, alpha = 2 */
+    const char layout = 'C', op = 'T';
+    const blasint rows = 100, cols = 100;
+    const blasint ld_in = 100, ld_out = 100;
+    const float scale = 2.0f;
+
+    const float diff = check_simatcopy('C', layout, op, rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Square matrix
+ * alpha = 2.0f
+ */
+CTEST(simatcopy, c_api_colmajor_notrans_col_100_row_100)
+{
+    /* C API: column major, copy only, 100 x 100, alpha = 2 */
+    const char layout = 'C', op = 'N';
+    const blasint rows = 100, cols = 100;
+    const blasint ld_in = 100, ld_out = 100;
+    const float scale = 2.0f;
+
+    const float diff = check_simatcopy('C', layout, op, rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Square matrix
+ * alpha = 2.0f
+ */
+CTEST(simatcopy, c_api_rowmajor_trans_col_100_row_100)
+{
+    /* C API: row major, transposition, 100 x 100, alpha = 2 */
+    const char layout = 'R', op = 'T';
+    const blasint rows = 100, cols = 100;
+    const blasint ld_in = 100, ld_out = 100;
+    const float scale = 2.0f;
+
+    const float diff = check_simatcopy('C', layout, op, rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test simatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Square matrix
+ * alpha = 2.0f
+ */
+CTEST(simatcopy, c_api_rowmajor_notrans_col_100_row_100)
+{
+    /* C API: row major, copy only, 100 x 100, alpha = 2 */
+    const char layout = 'R', op = 'N';
+    const blasint rows = 100, cols = 100;
+    const blasint ld_in = 100, ld_out = 100;
+    const float scale = 2.0f;
+
+    const float diff = check_simatcopy('C', layout, op, rows, cols, scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Test error function for an invalid param order.
+ * Must be column (C) or row major (R).
+ */
+CTEST(simatcopy, xerbla_invalid_order)
+{
+    /* 'O' is neither 'C' nor 'R', so parameter 1 (order) is the expected culprit */
+    const char layout = 'O', op = 'T';
+    const blasint rows = 100, cols = 100;
+    const blasint ld_in = 100, ld_out = 100;
+    const int bad_param = 1;
+
+    const int ok = check_badargs(layout, op, rows, cols, ld_in, ld_out, bad_param);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param trans.
+ * Must be trans (T/C) or no-trans (N/R).
+ */
+CTEST(simatcopy, xerbla_invalid_trans)
+{
+    /* 'O' is not a valid trans value, so parameter 2 (trans) is the expected culprit */
+    const char layout = 'C', op = 'O';
+    const blasint rows = 100, cols = 100;
+    const blasint ld_in = 100, ld_out = 100;
+    const int bad_param = 2;
+
+    const int ok = check_badargs(layout, op, rows, cols, ld_in, ld_out, bad_param);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param lda_src.
+ * If matrices are stored using row major layout, 
+ * lda_src must be at least n.
+ */
+CTEST(simatcopy, xerbla_rowmajor_invalid_lda)
+{
+    /* Row major: lda_src (50) is smaller than the column count (100) -> parameter 7 */
+    const char layout = 'R', op = 'T';
+    const blasint rows = 50, cols = 100;
+    const blasint ld_in = 50, ld_out = 100;
+    const int bad_param = 7;
+
+    const int ok = check_badargs(layout, op, rows, cols, ld_in, ld_out, bad_param);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param lda_src.
+ * If matrices are stored using column major layout,
+ * lda_src must be at least m.
+ */
+CTEST(simatcopy, xerbla_colmajor_invalid_lda)
+{
+    /* Column major: lda_src (50) is smaller than the row count (100) -> parameter 7 */
+    const char layout = 'C', op = 'T';
+    const blasint rows = 100, cols = 50;
+    const blasint ld_in = 50, ld_out = 100;
+    const int bad_param = 7;
+
+    const int ok = check_badargs(layout, op, rows, cols, ld_in, ld_out, bad_param);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param lda_dst.
+ * If matrices are stored using row major layout and 
+ * there is no transposition, lda_dst must be at least n.
+ */
+CTEST(simatcopy, xerbla_rowmajor_notrans_invalid_ldb)
+{
+    /* Row major, no transposition: lda_dst (50) is smaller than the column count (100) -> parameter 8 */
+    const char layout = 'R', op = 'N';
+    const blasint rows = 50, cols = 100;
+    const blasint ld_in = 100, ld_out = 50;
+    const int bad_param = 8;
+
+    const int ok = check_badargs(layout, op, rows, cols, ld_in, ld_out, bad_param);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param lda_dst.
+ * If matrices are stored using row major layout and 
+ * there is transposition, lda_dst must be at least m.
+ */
+CTEST(simatcopy, xerbla_rowmajor_trans_invalid_ldb)
+{
+    /* Row major, transposition: lda_dst (50) is smaller than the row count (100) -> parameter 8 */
+    const char layout = 'R', op = 'T';
+    const blasint rows = 100, cols = 50;
+    const blasint ld_in = 100, ld_out = 50;
+    const int bad_param = 8;
+
+    const int ok = check_badargs(layout, op, rows, cols, ld_in, ld_out, bad_param);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param lda_dst.
+ * If matrices are stored using column major layout and 
+ * there is no transposition, lda_dst must be at least m.
+ */
+CTEST(simatcopy, xerbla_colmajor_notrans_invalid_ldb)
+{
+    /* Column major, no transposition: lda_dst (50) is smaller than the row count (100) -> parameter 8 */
+    const char layout = 'C', op = 'N';
+    const blasint rows = 100, cols = 50;
+    const blasint ld_in = 100, ld_out = 50;
+    const int bad_param = 8;
+
+    const int ok = check_badargs(layout, op, rows, cols, ld_in, ld_out, bad_param);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param lda_dst.
+ * If matrices are stored using column major layout and 
+ * there is transposition, lda_dst must be at least n.
+ */
+CTEST(simatcopy, xerbla_colmajor_trans_invalid_ldb)
+{
+    /* Column major, transposition: lda_dst (50) is smaller than the column count (100) -> parameter 8 */
+    const char layout = 'C', op = 'T';
+    const blasint rows = 50, cols = 100;
+    const blasint ld_in = 100, ld_out = 50;
+    const int bad_param = 8;
+
+    const int ok = check_badargs(layout, op, rows, cols, ld_in, ld_out, bad_param);
+    ASSERT_EQUAL(TRUE, ok);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_somatcopy.c b/utest/test_extensions/test_somatcopy.c
new file mode 100644
index 000000000..b53c7cae5
--- /dev/null
+++ b/utest/test_extensions/test_somatcopy.c
@@ -0,0 +1,640 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+
+struct DATA_SOMATCOPY { /* buffers used by the somatcopy tests, each DATASIZE x DATASIZE floats */
+    float a_test[DATASIZE * DATASIZE];   /* source matrix A handed to somatcopy and to the reference helpers */
+    float b_test[DATASIZE * DATASIZE];   /* destination B written by somatcopy */
+    float b_verify[DATASIZE * DATASIZE]; /* destination B written by the reference helpers */
+};
+
+#ifdef BUILD_SINGLE
+static struct DATA_SOMATCOPY data_somatcopy;
+
+/**
+ * Compare results computed by somatcopy and reference func
+ *
+ * param api specifies tested api (C or Fortran)
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation
+ * applied to the matrix A
+ * param rows - number of rows of A
+ * param cols - number of columns of A
+ * param alpha - scaling factor for matrix B
+ * param lda - leading dimension of the matrix A
+ * param ldb - leading dimension of the matrix B
+ * return norm of difference between openblas and reference func
+ */
+static float check_somatcopy(char api, char order, char trans, blasint rows, blasint cols, float alpha,
+                             blasint lda, blasint ldb)
+{
+    blasint b_rows, b_cols;
+    blasint m, n;
+    /* Defaults mirror the fallback branches below, so no enum is ever read uninitialized. */
+    enum CBLAS_ORDER corder = CblasRowMajor;
+    enum CBLAS_TRANSPOSE ctrans = CblasNoTrans;
+    /* 'T' and 'C' both select the transposing reference helper. */
+    int transposed = (trans == 'T' || trans == 'C');
+
+    /* Dimensions handed to the reference helpers: swapped for column-major order. */
+    if (order == 'C') {
+        m = cols; n = rows;
+    } else {
+        m = rows; n = cols;
+    }
+
+    /* Extent of B that is compared at the end: swapped when transposing. */
+    if (transposed) {
+        b_rows = n; b_cols = m;
+    } else {
+        b_rows = m; b_cols = n;
+    }
+
+    srand_generate(data_somatcopy.a_test, lda*m);
+
+    /* Expected result, computed by the reference helper into b_verify. */
+    if (transposed)
+        stranspose(m, n, alpha, data_somatcopy.a_test, lda, data_somatcopy.b_verify, ldb);
+    else
+        scopy(m, n, alpha, data_somatcopy.a_test, lda, data_somatcopy.b_verify, ldb);
+
+    if (api == 'F') {
+        BLASFUNC(somatcopy)(&order, &trans, &rows, &cols, &alpha, data_somatcopy.a_test,
+                            &lda, data_somatcopy.b_test, &ldb);
+    } else {
+        /* Translate the character options into CBLAS enumerators; 'R' and 'N' keep the defaults. */
+        if (order == 'C') corder = CblasColMajor;
+        if (trans == 'T') ctrans = CblasTrans;
+        if (trans == 'C') ctrans = CblasConjTrans;
+        if (trans == 'R') ctrans = CblasConjNoTrans;
+        cblas_somatcopy(corder, ctrans, rows, cols, alpha, data_somatcopy.a_test,
+                        lda, data_somatcopy.b_test, ldb);
+    }
+
+    return smatrix_difference(data_somatcopy.b_test, data_somatcopy.b_verify, b_cols, b_rows, ldb);
+}
+
+/**
+ * Check if error function was called with expected function name
+ * and param info
+ *
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation
+ * applied to the matrix A
+ * param rows - number of rows of A
+ * param cols - number of columns of A
+ * param lda - leading dimension of the matrix A
+ * param ldb - leading dimension of the matrix B
+ * param expected_info - expected invalid parameter number
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char order, char trans, blasint rows, blasint cols,
+                          blasint lda, blasint ldb, int expected_info)
+{
+    float scale = 1.0;
+    /* Register the routine name and parameter number the error function should receive. */
+    set_xerbla("SOMATCOPY", expected_info);
+    /* Call the Fortran interface with the (deliberately invalid) arguments under test. */
+    BLASFUNC(somatcopy)(&order, &trans, &rows, &cols, &scale,
+                        data_somatcopy.a_test, &lda, data_somatcopy.b_test, &ldb);
+    /* Report whether the error function was called as expected. */
+    return check_error();
+}
+
+/**
+ * Fortran API specific test
+ * Test somatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(somatcopy, colmajor_trans_col_100_row_100)
+{
+    /* Fortran API: column major, transposition, 100 x 100, alpha = 1 */
+    const char layout = 'C', op = 'T';
+    const blasint rows = 100, cols = 100;
+    const blasint ld_a = 100, ld_b = 100;
+    const float scale = 1.0f;
+
+    const float diff = check_somatcopy('F', layout, op, rows, cols, scale, ld_a, ld_b);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test somatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(somatcopy, colmajor_notrans_col_100_row_100)
+{
+    /* Fortran API: column major, copy only, 100 x 100, alpha = 1 */
+    const char layout = 'C', op = 'N';
+    const blasint rows = 100, cols = 100;
+    const blasint ld_a = 100, ld_b = 100;
+    const float scale = 1.0f;
+
+    const float diff = check_somatcopy('F', layout, op, rows, cols, scale, ld_a, ld_b);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test somatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 2.0
+ */
+CTEST(somatcopy, colmajor_trans_col_50_row_100)
+{
+    /* Fortran API: column major, transposition, 100 rows x 50 cols, alpha = 2 */
+    const char layout = 'C', op = 'T';
+    const blasint rows = 100, cols = 50;
+    const blasint ld_a = 100, ld_b = 50;
+    const float scale = 2.0f;
+
+    const float diff = check_somatcopy('F', layout, op, rows, cols, scale, ld_a, ld_b);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test somatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 2.0
+ */
+CTEST(somatcopy, colmajor_notrans_col_50_row_100)
+{
+    /* Fortran API: column major, copy only, 100 rows x 50 cols, alpha = 2 */
+    const char layout = 'C', op = 'N';
+    const blasint rows = 100, cols = 50;
+    const blasint ld_a = 100, ld_b = 100;
+    const float scale = 2.0f;
+
+    const float diff = check_somatcopy('F', layout, op, rows, cols, scale, ld_a, ld_b);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test somatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 0.0
+ */
+CTEST(somatcopy, colmajor_trans_col_100_row_50)
+{
+    /* Fortran API: column major, transposition, 50 rows x 100 cols, alpha = 0 */
+    const char layout = 'C', op = 'T';
+    const blasint rows = 50, cols = 100;
+    const blasint ld_a = 50, ld_b = 100;
+    const float scale = 0.0f;
+
+    const float diff = check_somatcopy('F', layout, op, rows, cols, scale, ld_a, ld_b);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test somatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 0.0
+ */
+CTEST(somatcopy, colmajor_notrans_col_100_row_50)
+{
+    /* Fortran API: column major, copy only, 50 rows x 100 cols, alpha = 0 */
+    const char layout = 'C', op = 'N';
+    const blasint rows = 50, cols = 100;
+    const blasint ld_a = 50, ld_b = 50;
+    const float scale = 0.0f;
+
+    const float diff = check_somatcopy('F', layout, op, rows, cols, scale, ld_a, ld_b);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test somatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(somatcopy, rowmajor_trans_col_100_row_100)
+{
+    /* Fortran API: row major, transposition, 100 x 100, alpha = 1 */
+    const char layout = 'R', op = 'T';
+    const blasint rows = 100, cols = 100;
+    const blasint ld_a = 100, ld_b = 100;
+    const float scale = 1.0f;
+
+    const float diff = check_somatcopy('F', layout, op, rows, cols, scale, ld_a, ld_b);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test somatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(somatcopy, rowmajor_notrans_col_100_row_100)
+{
+    /* Fortran API: row major, copy only, 100 x 100, alpha = 1 */
+    const char layout = 'R', op = 'N';
+    const blasint rows = 100, cols = 100;
+    const blasint ld_a = 100, ld_b = 100;
+    const float scale = 1.0f;
+
+    const float diff = check_somatcopy('F', layout, op, rows, cols, scale, ld_a, ld_b);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test somatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Rectangular matrix
+ * alpha = 2.0
+ */
+CTEST(somatcopy, rowmajor_conjtrans_col_100_row_50)
+{
+    /* Fortran API: row major, 50 rows x 100 cols, alpha = 2; 'C' is the same as trans for a real matrix */
+    const char layout = 'R', op = 'C';
+    const blasint rows = 50, cols = 100;
+    const blasint ld_a = 100, ld_b = 50;
+    const float scale = 2.0f;
+
+    const float diff = check_somatcopy('F', layout, op, rows, cols, scale, ld_a, ld_b);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test somatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 2.0
+ */
+CTEST(somatcopy, rowmajor_notrans_col_50_row_100)
+{
+    /* Fortran API: row major, copy only, 100 rows x 50 cols, alpha = 2 */
+    const char layout = 'R', op = 'N';
+    const blasint rows = 100, cols = 50;
+    const blasint ld_a = 50, ld_b = 50;
+    const float scale = 2.0f;
+
+    const float diff = check_somatcopy('F', layout, op, rows, cols, scale, ld_a, ld_b);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test somatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Matrix dimensions leave remainders when divided by 4 and by 2
+ * (special handling in the row-major transposition case)
+ * alpha = 1.5
+ */
+CTEST(somatcopy, rowmajor_trans_col_27_row_27)
+{
+    /* Fortran API: row major, transposition, 27 x 27, alpha = 1.5 */
+    const char layout = 'R', op = 'T';
+    const blasint rows = 27, cols = 27;
+    const blasint ld_a = 27, ld_b = 27;
+    const float scale = 1.5f;
+
+    const float diff = check_somatcopy('F', layout, op, rows, cols, scale, ld_a, ld_b);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test somatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Rectangular matrix
+ * alpha = 0.0
+ */
+CTEST(somatcopy, rowmajor_notrans_col_100_row_50)
+{
+    /* Fortran API: row major, copy only, 50 rows x 100 cols, alpha = 0 */
+    const char layout = 'R', op = 'N';
+    const blasint rows = 50, cols = 100;
+    const blasint ld_a = 100, ld_b = 100;
+    const float scale = 0.0f;
+
+    const float diff = check_somatcopy('F', layout, op, rows, cols, scale, ld_a, ld_b);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test somatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(somatcopy, c_api_colmajor_trans_col_100_row_100)
+{
+    /* C API: column major, transposition, 100 x 100, alpha = 1 */
+    const char layout = 'C', op = 'T';
+    const blasint rows = 100, cols = 100;
+    const blasint ld_a = 100, ld_b = 100;
+    const float scale = 1.0f;
+
+    const float diff = check_somatcopy('C', layout, op, rows, cols, scale, ld_a, ld_b);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test somatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(somatcopy, c_api_colmajor_notrans_col_100_row_100)
+{
+    /* C API: column major, copy only, 100 x 100, alpha = 1 */
+    const char layout = 'C', op = 'N';
+    const blasint rows = 100, cols = 100;
+    const blasint ld_a = 100, ld_b = 100;
+    const float scale = 1.0f;
+
+    const float diff = check_somatcopy('C', layout, op, rows, cols, scale, ld_a, ld_b);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test somatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(somatcopy, c_api_rowmajor_trans_col_100_row_100)
+{
+    /* C API: row major, transposition, 100 x 100, alpha = 1 */
+    const char layout = 'R', op = 'T';
+    const blasint rows = 100, cols = 100;
+    const blasint ld_a = 100, ld_b = 100;
+    const float scale = 1.0f;
+
+    const float diff = check_somatcopy('C', layout, op, rows, cols, scale, ld_a, ld_b);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test somatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Square matrix
+ * alpha = 1.0
+ */
+CTEST(somatcopy, c_api_rowmajor_notrans_col_100_row_100)
+{
+    /* C API: row major, copy only, 100 x 100, alpha = 1 */
+    const char layout = 'R', op = 'N';
+    const blasint rows = 100, cols = 100;
+    const blasint ld_a = 100, ld_b = 100;
+    const float scale = 1.0f;
+
+    const float diff = check_somatcopy('C', layout, op, rows, cols, scale, ld_a, ld_b);
+
+    ASSERT_DBL_NEAR_TOL(0.0f, diff, SINGLE_EPS);
+}
+
+/**
+ * Test error function for an invalid param order.
+ * Must be column (C) or row major (R).
+ */
+CTEST(somatcopy, xerbla_invalid_order)
+{
+    /* 'O' is neither 'C' nor 'R', so parameter 1 (order) is the expected culprit */
+    const char layout = 'O', op = 'T';
+    const blasint rows = 100, cols = 100;
+    const blasint ld_a = 100, ld_b = 100;
+    const int bad_param = 1;
+
+    const int ok = check_badargs(layout, op, rows, cols, ld_a, ld_b, bad_param);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param trans.
+ * Must be trans (T/C) or no-trans (N/R).
+ */
+CTEST(somatcopy, xerbla_invalid_trans)
+{
+    /* 'O' is not a valid trans value, so parameter 2 (trans) is the expected culprit */
+    const char layout = 'C', op = 'O';
+    const blasint rows = 100, cols = 100;
+    const blasint ld_a = 100, ld_b = 100;
+    const int bad_param = 2;
+
+    const int ok = check_badargs(layout, op, rows, cols, ld_a, ld_b, bad_param);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param lda.
+ * If matrices are stored using row major layout,
+ * lda must be at least n.
+ */
+CTEST(somatcopy, xerbla_rowmajor_invalid_lda)
+{
+    /* Row major: lda (50) is smaller than the column count (100) -> parameter 7 */
+    const char layout = 'R', op = 'T';
+    const blasint rows = 50, cols = 100;
+    const blasint ld_a = 50, ld_b = 100;
+    const int bad_param = 7;
+
+    const int ok = check_badargs(layout, op, rows, cols, ld_a, ld_b, bad_param);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param lda.
+ * If matrices are stored using column major layout,
+ * lda must be at least m.
+ */
+CTEST(somatcopy, xerbla_colmajor_invalid_lda)
+{
+    /* Column major: lda (50) is smaller than the row count (100) -> parameter 7 */
+    const char layout = 'C', op = 'T';
+    const blasint rows = 100, cols = 50;
+    const blasint ld_a = 50, ld_b = 100;
+    const int bad_param = 7;
+
+    const int ok = check_badargs(layout, op, rows, cols, ld_a, ld_b, bad_param);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param ldb.
+ * If matrices are stored using row major layout and
+ * there is no transposition, ldb must be at least n.
+ */
+CTEST(somatcopy, xerbla_rowmajor_notrans_invalid_ldb)
+{
+    /* Row major, no transposition: ldb (50) is smaller than the column count (100) -> parameter 9 */
+    const char layout = 'R', op = 'N';
+    const blasint rows = 50, cols = 100;
+    const blasint ld_a = 100, ld_b = 50;
+    const int bad_param = 9;
+
+    const int ok = check_badargs(layout, op, rows, cols, ld_a, ld_b, bad_param);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param ldb.
+ * If matrices are stored using row major layout and
+ * there is transposition, ldb must be at least m.
+ */
+CTEST(somatcopy, xerbla_rowmajor_trans_invalid_ldb)
+{
+    /* Row major, transposition: ldb (50) is smaller than the row count (100) -> parameter 9 */
+    const char layout = 'R', op = 'T';
+    const blasint rows = 100, cols = 50;
+    const blasint ld_a = 100, ld_b = 50;
+    const int bad_param = 9;
+
+    const int ok = check_badargs(layout, op, rows, cols, ld_a, ld_b, bad_param);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param ldb.
+ * If matrices are stored using column major layout and
+ * there is no transposition, ldb must be at least m.
+ */
+CTEST(somatcopy, xerbla_colmajor_notrans_invalid_ldb)
+{
+    /* Column major, no transposition: ldb (50) is smaller than the row count (100) -> parameter 9 */
+    const char layout = 'C', op = 'N';
+    const blasint rows = 100, cols = 50;
+    const blasint ld_a = 100, ld_b = 50;
+    const int bad_param = 9;
+
+    const int ok = check_badargs(layout, op, rows, cols, ld_a, ld_b, bad_param);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param ldb.
+ * If matrices are stored using column major layout and
+ * there is transposition, ldb must be at least n.
+ */
+CTEST(somatcopy, xerbla_colmajor_trans_invalid_ldb)
+{
+    /* Column major, transposition: ldb (50) is smaller than the column count (100) -> parameter 9 */
+    const char layout = 'C', op = 'T';
+    const blasint rows = 50, cols = 100;
+    const blasint ld_a = 100, ld_b = 50;
+    const int bad_param = 9;
+
+    const int ok = check_badargs(layout, op, rows, cols, ld_a, ld_b, bad_param);
+    ASSERT_EQUAL(TRUE, ok);
+}
+#endif
diff --git a/utest/test_extensions/test_srotmg.c b/utest/test_extensions/test_srotmg.c
new file mode 100644
index 000000000..3c97e3b4d
--- /dev/null
+++ b/utest/test_extensions/test_srotmg.c
@@ -0,0 +1,414 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#ifdef BUILD_SINGLE
+
+/**
+ * Fortran API specific test
+ * Test srotmg with y1 = 0 against pre-calculated values
+ */
+CTEST(srotmg, y1_zero)
+{
+	int idx;
+
+	/* Arguments handed to OpenBLAS */
+	float d1_blas = 2.0f;
+	float d2_blas = 2.0f;
+	float x1_blas = 8.0f;
+	float y1_blas = 0.0f;
+	float param_blas[5] = {
+		0.0f,
+		0.0f,
+		0.0f,
+		0.0f,
+		0.0f
+	};
+
+	/* Reference values as calculated by netlib blas */
+	const float d1_ref = 2.0f;
+	const float d2_ref = 2.0f;
+	const float x1_ref = 8.0f;
+	const float y1_ref = 0.0f;
+	const float param_ref[5] = {
+		-2.0f,
+		0.0f,
+		0.0f,
+		0.0f,
+		0.0f
+	};
+
+	/* OpenBLAS */
+	BLASFUNC(srotmg)(&d1_blas, &d2_blas, &x1_blas, &y1_blas, param_blas);
+
+	ASSERT_DBL_NEAR_TOL(d1_ref, d1_blas, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(d2_ref, d2_blas, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(x1_ref, x1_blas, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(y1_ref, y1_blas, SINGLE_EPS);
+
+	for (idx = 0; idx < 5; ++idx) {
+		ASSERT_DBL_NEAR_TOL(param_ref[idx], param_blas[idx], SINGLE_EPS);
+	}
+}
+
+/**
+ * Fortran API specific test
+ * Test srotmg with a negative d1 against pre-calculated values
+ */
+CTEST(srotmg, d1_negative)
+{
+	int idx;
+
+	/* Arguments handed to OpenBLAS */
+	float d1_blas = -1.0f;
+	float d2_blas = 2.0f;
+	float x1_blas = 8.0f;
+	float y1_blas = 8.0f;
+	float param_blas[5] = {
+		0.0f,
+		0.0f,
+		0.0f,
+		0.0f,
+		0.0f
+	};
+
+	/* Reference values as calculated by netlib blas */
+	const float d1_ref = 0.0f;
+	const float d2_ref = 0.0f;
+	const float x1_ref = 0.0f;
+	const float y1_ref = 8.0f;
+	const float param_ref[5] = {
+		-1.0f,
+		0.0f,
+		0.0f,
+		0.0f,
+		0.0f
+	};
+
+	/* OpenBLAS */
+	BLASFUNC(srotmg)(&d1_blas, &d2_blas, &x1_blas, &y1_blas, param_blas);
+
+	ASSERT_DBL_NEAR_TOL(d1_ref, d1_blas, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(d2_ref, d2_blas, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(x1_ref, x1_blas, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(y1_ref, y1_blas, SINGLE_EPS);
+
+	for (idx = 0; idx < 5; ++idx) {
+		ASSERT_DBL_NEAR_TOL(param_ref[idx], param_blas[idx], SINGLE_EPS);
+	}
+}
+
+/**
+ * Fortran API specific test
+ * Test srotmg with positive d1, d2 and x1 = 0 against pre-calculated values
+ */
+CTEST(srotmg, d1_positive_d2_positive_x1_zero)
+{
+	int idx;
+
+	/* Arguments handed to OpenBLAS */
+	float d1_blas = 2.0f;
+	float d2_blas = 2.0f;
+	float x1_blas = 0.0f;
+	float y1_blas = 8.0f;
+	float param_blas[5] = {
+		0.0f,
+		0.0f,
+		0.0f,
+		0.0f,
+		0.0f
+	};
+
+	/* Reference values as calculated by netlib blas */
+	const float d1_ref = 2.0f;
+	const float d2_ref = 2.0f;
+	const float x1_ref = 8.0f;
+	const float y1_ref = 8.0f;
+	const float param_ref[5] = {
+		1.0f,
+		0.0f,
+		0.0f,
+		0.0f,
+		0.0f
+	};
+
+	/* OpenBLAS */
+	BLASFUNC(srotmg)(&d1_blas, &d2_blas, &x1_blas, &y1_blas, param_blas);
+
+	ASSERT_DBL_NEAR_TOL(d1_ref, d1_blas, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(d2_ref, d2_blas, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(x1_ref, x1_blas, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(y1_ref, y1_blas, SINGLE_EPS);
+
+	for (idx = 0; idx < 5; ++idx) {
+		ASSERT_DBL_NEAR_TOL(param_ref[idx], param_blas[idx], SINGLE_EPS);
+	}
+}
+
+/**
+ * Fortran API specific test
+ * Test srotmg with d1 = 1, d2 = -2, x1 = y1 = 8 against pre-calculated values
+ */
+CTEST(srotmg, scaled_y_greater_than_scaled_x)
+{
+	int idx;
+
+	/* Arguments handed to OpenBLAS */
+	float d1_blas = 1.0f;
+	float d2_blas = -2.0f;
+	float x1_blas = 8.0f;
+	float y1_blas = 8.0f;
+	float param_blas[5] = {
+		0.0f,
+		0.0f,
+		0.0f,
+		0.0f,
+		0.0f
+	};
+
+	/* Reference values as calculated by netlib blas */
+	const float d1_ref = 0.0f;
+	const float d2_ref = 0.0f;
+	const float x1_ref = 0.0f;
+	const float y1_ref = 8.0f;
+	const float param_ref[5] = {
+		-1.0f,
+		0.0f,
+		0.0f,
+		0.0f,
+		0.0f
+	};
+
+	/* OpenBLAS */
+	BLASFUNC(srotmg)(&d1_blas, &d2_blas, &x1_blas, &y1_blas, param_blas);
+
+	ASSERT_DBL_NEAR_TOL(d1_ref, d1_blas, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(d2_ref, d2_blas, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(x1_ref, x1_blas, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(y1_ref, y1_blas, SINGLE_EPS);
+
+	for (idx = 0; idx < 5; ++idx) {
+		ASSERT_DBL_NEAR_TOL(param_ref[idx], param_blas[idx], SINGLE_EPS);
+	}
+}
+
+/**
+ * C API specific test
+ * Test cblas_srotmg with y1 = 0 against pre-calculated values
+ */
+CTEST(srotmg, c_api_y1_zero)
+{
+	int idx;
+
+	/* Arguments handed to OpenBLAS (y1 is passed by value) */
+	float d1_blas = 2.0f;
+	float d2_blas = 2.0f;
+	float x1_blas = 8.0f;
+	float y1_blas = 0.0f;
+	float param_blas[5] = {
+		0.0f,
+		0.0f,
+		0.0f,
+		0.0f,
+		0.0f
+	};
+
+	/* Reference values as calculated by netlib blas */
+	const float d1_ref = 2.0f;
+	const float d2_ref = 2.0f;
+	const float x1_ref = 8.0f;
+	const float y1_ref = 0.0f;
+	const float param_ref[5] = {
+		-2.0f,
+		0.0f,
+		0.0f,
+		0.0f,
+		0.0f
+	};
+
+	/* OpenBLAS */
+	cblas_srotmg(&d1_blas, &d2_blas, &x1_blas, y1_blas, param_blas);
+
+	ASSERT_DBL_NEAR_TOL(d1_ref, d1_blas, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(d2_ref, d2_blas, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(x1_ref, x1_blas, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(y1_ref, y1_blas, SINGLE_EPS);
+
+	for (idx = 0; idx < 5; ++idx) {
+		ASSERT_DBL_NEAR_TOL(param_ref[idx], param_blas[idx], SINGLE_EPS);
+	}
+}
+
+/**
+ * C API specific test
+ * Test cblas_srotmg with a negative d1 against pre-calculated values
+ */
+CTEST(srotmg, c_api_d1_negative)
+{
+	int idx;
+
+	/* Arguments handed to OpenBLAS (y1 is passed by value) */
+	float d1_blas = -1.0f;
+	float d2_blas = 2.0f;
+	float x1_blas = 8.0f;
+	float y1_blas = 8.0f;
+	float param_blas[5] = {
+		0.0f,
+		0.0f,
+		0.0f,
+		0.0f,
+		0.0f
+	};
+
+	/* Reference values as calculated by netlib blas */
+	const float d1_ref = 0.0f;
+	const float d2_ref = 0.0f;
+	const float x1_ref = 0.0f;
+	const float y1_ref = 8.0f;
+	const float param_ref[5] = {
+		-1.0f,
+		0.0f,
+		0.0f,
+		0.0f,
+		0.0f
+	};
+
+	/* OpenBLAS */
+	cblas_srotmg(&d1_blas, &d2_blas, &x1_blas, y1_blas, param_blas);
+
+	ASSERT_DBL_NEAR_TOL(d1_ref, d1_blas, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(d2_ref, d2_blas, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(x1_ref, x1_blas, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(y1_ref, y1_blas, SINGLE_EPS);
+
+	for (idx = 0; idx < 5; ++idx) {
+		ASSERT_DBL_NEAR_TOL(param_ref[idx], param_blas[idx], SINGLE_EPS);
+	}
+}
+
+/**
+ * C API specific test
+ * Test cblas_srotmg with positive d1, d2 and x1 = 0 against pre-calculated values
+ */
+CTEST(srotmg, c_api_d1_positive_d2_positive_x1_zero)
+{
+	int idx;
+
+	/* Arguments handed to OpenBLAS (y1 is passed by value) */
+	float d1_blas = 2.0f;
+	float d2_blas = 2.0f;
+	float x1_blas = 0.0f;
+	float y1_blas = 8.0f;
+	float param_blas[5] = {
+		0.0f,
+		0.0f,
+		0.0f,
+		0.0f,
+		0.0f
+	};
+
+	/* Reference values as calculated by netlib blas */
+	const float d1_ref = 2.0f;
+	const float d2_ref = 2.0f;
+	const float x1_ref = 8.0f;
+	const float y1_ref = 8.0f;
+	const float param_ref[5] = {
+		1.0f,
+		0.0f,
+		0.0f,
+		0.0f,
+		0.0f
+	};
+
+	/* OpenBLAS */
+	cblas_srotmg(&d1_blas, &d2_blas, &x1_blas, y1_blas, param_blas);
+
+	ASSERT_DBL_NEAR_TOL(d1_ref, d1_blas, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(d2_ref, d2_blas, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(x1_ref, x1_blas, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(y1_ref, y1_blas, SINGLE_EPS);
+
+	for (idx = 0; idx < 5; ++idx) {
+		ASSERT_DBL_NEAR_TOL(param_ref[idx], param_blas[idx], SINGLE_EPS);
+	}
+}
+
+/**
+ * C API specific test
+ * Test cblas_srotmg with d1 = 1, d2 = -2, x1 = y1 = 8 against pre-calculated values
+ */
+CTEST(srotmg, c_api_scaled_y_greater_than_scaled_x)
+{
+	int idx;
+
+	/* Arguments handed to OpenBLAS (y1 is passed by value) */
+	float d1_blas = 1.0f;
+	float d2_blas = -2.0f;
+	float x1_blas = 8.0f;
+	float y1_blas = 8.0f;
+	float param_blas[5] = {
+		0.0f,
+		0.0f,
+		0.0f,
+		0.0f,
+		0.0f
+	};
+
+	/* Reference values as calculated by netlib blas */
+	const float d1_ref = 0.0f;
+	const float d2_ref = 0.0f;
+	const float x1_ref = 0.0f;
+	const float y1_ref = 8.0f;
+	const float param_ref[5] = {
+		-1.0f,
+		0.0f,
+		0.0f,
+		0.0f,
+		0.0f
+	};
+
+	/* OpenBLAS */
+	cblas_srotmg(&d1_blas, &d2_blas, &x1_blas, y1_blas, param_blas);
+
+	ASSERT_DBL_NEAR_TOL(d1_ref, d1_blas, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(d2_ref, d2_blas, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(x1_ref, x1_blas, SINGLE_EPS);
+	ASSERT_DBL_NEAR_TOL(y1_ref, y1_blas, SINGLE_EPS);
+
+	for (idx = 0; idx < 5; ++idx) {
+		ASSERT_DBL_NEAR_TOL(param_ref[idx], param_blas[idx], SINGLE_EPS);
+	}
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_ssum.c b/utest/test_extensions/test_ssum.c
new file mode 100644
index 000000000..971a0d2e0
--- /dev/null
+++ b/utest/test_extensions/test_ssum.c
@@ -0,0 +1,403 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#define ELEMENTS 50
+#define INCREMENT 2
+
+#ifdef BUILD_SINGLE
+
+/**
+ * Fortran API specific test
+ * Test ssum with N = 0: the expected sum is 0
+ */
+CTEST(ssum, bad_args_N_0){
+   float data[ELEMENTS];
+   blasint count = 0, stride = 1;
+   blasint k;
+   for (k = 0; k < ELEMENTS * stride; ++k) {
+      data[k] = 1000 - k;
+   }
+   float total = BLASFUNC(ssum)(&count, data, &stride);
+   ASSERT_DBL_NEAR_TOL(0.0f, total, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test ssum with inc = 0 on a fully initialized vector: the expected sum is 0
+ */
+CTEST(ssum, step_zero){
+   blasint i;
+   blasint N = ELEMENTS, inc = 0;
+   float x[ELEMENTS];
+   for (i = 0; i < ELEMENTS; i ++) { // N * inc is 0 here and would skip the fill
+      x[i] = i + 1000;
+   }
+   x[8] = 0.0f;
+   float sum = BLASFUNC(ssum)(&N, x, &inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test ssum with inc = 1, N = 1 against a pre-calculated value
+ */
+CTEST(ssum, step_1_N_1){
+   float data[] = {1.1f};
+   blasint count = 1, stride = 1;
+
+   float total = BLASFUNC(ssum)(&count, data, &stride);
+   ASSERT_DBL_NEAR_TOL(1.1f, total, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test ssum with inc = 2, N = 1 against a pre-calculated value
+ */
+CTEST(ssum, step_2_N_1){
+   float data[] = {1.1f, 0.0f};
+   blasint count = 1, stride = 2;
+
+   float total = BLASFUNC(ssum)(&count, data, &stride);
+   ASSERT_DBL_NEAR_TOL(1.1f, total, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test ssum with inc = 1, N = 2 against a pre-calculated value
+ */
+CTEST(ssum, step_1_N_2){
+   float data[] = {1.1f, -1.0f};
+   blasint count = 2, stride = 1;
+
+   float total = BLASFUNC(ssum)(&count, data, &stride);
+   ASSERT_DBL_NEAR_TOL(0.1f, total, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test ssum with inc = 2, N = 2 against a pre-calculated value
+ */
+CTEST(ssum, step_2_N_2){
+   float data[] = {1.1f, -1.5f, 1.0f, 1.0f};
+   blasint count = 2, stride = 2;
+
+   float total = BLASFUNC(ssum)(&count, data, &stride);
+   ASSERT_DBL_NEAR_TOL(2.1f, total, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test ssum with inc = 1, N = 3 against a pre-calculated value
+ */
+CTEST(ssum, step_1_N_3){
+   float data[] = {1.1f, 1.0f, 2.2f};
+   blasint count = 3, stride = 1;
+
+   float total = BLASFUNC(ssum)(&count, data, &stride);
+   ASSERT_DBL_NEAR_TOL(4.3f, total, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test ssum with inc = 2, N = 3 against a pre-calculated value
+ */
+CTEST(ssum, step_2_N_3){
+   float data[] = {1.1f, 0.0f, -1.0f, -3.0f, 2.2f, 3.0f};
+   blasint count = 3, stride = 2;
+
+   float total = BLASFUNC(ssum)(&count, data, &stride);
+   ASSERT_DBL_NEAR_TOL(2.3f, total, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test ssum with inc = 1, N = 4 against a pre-calculated value
+ */
+CTEST(ssum, step_1_N_4){
+   float data[] = {1.1f, 1.0f, -2.2f, 3.3f};
+   blasint count = 4, stride = 1;
+
+   float total = BLASFUNC(ssum)(&count, data, &stride);
+   ASSERT_DBL_NEAR_TOL(3.2f, total, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test ssum with inc = 2, N = 4 against a pre-calculated value
+ */
+CTEST(ssum, step_2_N_4){
+   float data[] = {1.1f, 0.0f, 1.0f, 2.0f, 2.2f, 2.7f, -3.3f, -5.9f};
+   blasint count = 4, stride = 2;
+
+   float total = BLASFUNC(ssum)(&count, data, &stride);
+   ASSERT_DBL_NEAR_TOL(1.0f, total, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test ssum with inc = 1, N = 5 against a pre-calculated value
+ */
+CTEST(ssum, step_1_N_5){
+   float data[] = {0.0f, 1.0f, 2.2f, 3.3f, 0.0f};
+   blasint count = 5, stride = 1;
+
+   float total = BLASFUNC(ssum)(&count, data, &stride);
+   ASSERT_DBL_NEAR_TOL(6.5f, total, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test ssum with inc = 2, N = 5 against a pre-calculated value
+ */
+CTEST(ssum, step_2_N_5){
+   float data[] = {0.0f, 3.0f, 1.0f, -2.2f, 2.2f, -1.7f, 3.3f, 14.5f, 0.0f, -9.0f};
+   blasint count = 5, stride = 2;
+
+   float total = BLASFUNC(ssum)(&count, data, &stride);
+   ASSERT_DBL_NEAR_TOL(6.5f, total, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test ssum with inc = 1, N = 50 on alternating +1/-1 values (expected sum 0)
+ */
+CTEST(ssum, step_1_N_50){
+   float data[ELEMENTS];
+   blasint count = ELEMENTS, stride = 1;
+   blasint k;
+   for (k = 0; k < count * stride; ++k) {
+      data[k] = (k % 2 == 0) ? 1.0f : -1.0f;
+   }
+   float total = BLASFUNC(ssum)(&count, data, &stride);
+   ASSERT_DBL_NEAR_TOL(0.0f, total, SINGLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test ssum with inc = 2, N = 50 on alternating +1/-1 values (expected sum 50)
+ */
+CTEST(ssum, step_2_N_50){
+   float data[ELEMENTS * INCREMENT];
+   blasint count = ELEMENTS, stride = INCREMENT;
+   blasint k;
+   for (k = 0; k < count * stride; ++k) {
+      data[k] = (k % 2 == 0) ? 1.0f : -1.0f;
+   }
+   float total = BLASFUNC(ssum)(&count, data, &stride);
+   ASSERT_DBL_NEAR_TOL(50.0f, total, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cblas_ssum with N = 0: the expected sum is 0
+ */
+CTEST(ssum, c_api_bad_args_N_0){
+   float data[ELEMENTS];
+   blasint count = 0, stride = 1;
+   blasint k;
+   for (k = 0; k < ELEMENTS * stride; ++k) {
+      data[k] = 1000 - k;
+   }
+   float total = cblas_ssum(count, data, stride);
+   ASSERT_DBL_NEAR_TOL(0.0f, total, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cblas_ssum with inc = 0 on a fully initialized vector: the expected sum is 0
+ */
+CTEST(ssum, c_api_step_zero){
+   blasint i;
+   blasint N = ELEMENTS, inc = 0;
+   float x[ELEMENTS];
+   for (i = 0; i < ELEMENTS; i ++) { // N * inc is 0 here and would skip the fill
+      x[i] = i + 1000;
+   }
+   x[8] = 0.0f;
+   float sum = cblas_ssum(N, x, inc);
+   ASSERT_DBL_NEAR_TOL(0.0f, sum, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cblas_ssum with inc = 1, N = 1 against a pre-calculated value
+ */
+CTEST(ssum, c_api_step_1_N_1){
+   float data[] = {1.1f};
+   blasint count = 1, stride = 1;
+
+   float total = cblas_ssum(count, data, stride);
+   ASSERT_DBL_NEAR_TOL(1.1f, total, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cblas_ssum with inc = 2, N = 1 against a pre-calculated value
+ */
+CTEST(ssum, c_api_step_2_N_1){
+   float data[] = {1.1f, 0.0f};
+   blasint count = 1, stride = 2;
+
+   float total = cblas_ssum(count, data, stride);
+   ASSERT_DBL_NEAR_TOL(1.1f, total, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cblas_ssum with inc = 1, N = 2 against a pre-calculated value
+ */
+CTEST(ssum, c_api_step_1_N_2){
+   float data[] = {1.1f, -1.0f};
+   blasint count = 2, stride = 1;
+
+   float total = cblas_ssum(count, data, stride);
+   ASSERT_DBL_NEAR_TOL(0.1f, total, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cblas_ssum with inc = 2, N = 2 against a pre-calculated value
+ */
+CTEST(ssum, c_api_step_2_N_2){
+   float data[] = {1.1f, -1.5f, 1.0f, 1.0f};
+   blasint count = 2, stride = 2;
+
+   float total = cblas_ssum(count, data, stride);
+   ASSERT_DBL_NEAR_TOL(2.1f, total, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cblas_ssum with inc = 1, N = 3 against a pre-calculated value
+ */
+CTEST(ssum, c_api_step_1_N_3){
+   float data[] = {1.1f, 1.0f, 2.2f};
+   blasint count = 3, stride = 1;
+
+   float total = cblas_ssum(count, data, stride);
+   ASSERT_DBL_NEAR_TOL(4.3f, total, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cblas_ssum with inc = 2, N = 3 against a pre-calculated value
+ */
+CTEST(ssum, c_api_step_2_N_3){
+   float data[] = {1.1f, 0.0f, -1.0f, -3.0f, 2.2f, 3.0f};
+   blasint count = 3, stride = 2;
+
+   float total = cblas_ssum(count, data, stride);
+   ASSERT_DBL_NEAR_TOL(2.3f, total, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cblas_ssum with inc = 1, N = 4 against a pre-calculated value
+ */
+CTEST(ssum, c_api_step_1_N_4){
+   float data[] = {1.1f, 1.0f, -2.2f, 3.3f};
+   blasint count = 4, stride = 1;
+
+   float total = cblas_ssum(count, data, stride);
+   ASSERT_DBL_NEAR_TOL(3.2f, total, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cblas_ssum with inc = 2, N = 4 against a pre-calculated value
+ */
+CTEST(ssum, c_api_step_2_N_4){
+   float data[] = {1.1f, 0.0f, 1.0f, 2.0f, 2.2f, 2.7f, -3.3f, -5.9f};
+   blasint count = 4, stride = 2;
+
+   float total = cblas_ssum(count, data, stride);
+   ASSERT_DBL_NEAR_TOL(1.0f, total, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cblas_ssum with inc = 1, N = 5 against a pre-calculated value
+ */
+CTEST(ssum, c_api_step_1_N_5){
+   float data[] = {0.0f, 1.0f, 2.2f, 3.3f, 0.0f};
+   blasint count = 5, stride = 1;
+
+   float total = cblas_ssum(count, data, stride);
+   ASSERT_DBL_NEAR_TOL(6.5f, total, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cblas_ssum with inc = 2, N = 5 against a pre-calculated value
+ */
+CTEST(ssum, c_api_step_2_N_5){
+   float data[] = {0.0f, 3.0f, 1.0f, -2.2f, 2.2f, -1.7f, 3.3f, 14.5f, 0.0f, -9.0f};
+   blasint count = 5, stride = 2;
+
+   float total = cblas_ssum(count, data, stride);
+   ASSERT_DBL_NEAR_TOL(6.5f, total, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cblas_ssum with inc = 1, N = 50 on alternating +1/-1 values (expected sum 0)
+ */
+CTEST(ssum, c_api_step_1_N_50){
+   float data[ELEMENTS];
+   blasint count = ELEMENTS, stride = 1;
+   blasint k;
+   for (k = 0; k < count * stride; ++k) {
+      data[k] = (k % 2 == 0) ? 1.0f : -1.0f;
+   }
+   float total = cblas_ssum(count, data, stride);
+   ASSERT_DBL_NEAR_TOL(0.0f, total, SINGLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test cblas_ssum with inc = 2, N = 50 on alternating +1/-1 values (expected sum 50)
+ */
+CTEST(ssum, c_api_step_2_N_50){
+   float data[ELEMENTS * INCREMENT];
+   blasint count = ELEMENTS, stride = INCREMENT;
+   blasint k;
+   for (k = 0; k < count * stride; ++k) {
+      data[k] = (k % 2 == 0) ? 1.0f : -1.0f;
+   }
+   float total = cblas_ssum(count, data, stride);
+   ASSERT_DBL_NEAR_TOL(50.0f, total, SINGLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_zaxpby.c b/utest/test_extensions/test_zaxpby.c
new file mode 100644
index 000000000..6148f44c5
--- /dev/null
+++ b/utest/test_extensions/test_zaxpby.c
@@ -0,0 +1,630 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+#define INCREMENT 2
+
+struct DATA_ZAXPBY { /* Work buffers for the zaxpby checks; the factor 2 is presumably real/imaginary parts. */
+    double x_test[DATASIZE * INCREMENT * 2];   /* x passed to zaxpby */
+    double x_verify[DATASIZE * INCREMENT * 2]; /* copy of x passed to zaxpy */
+    double y_test[DATASIZE * INCREMENT * 2];   /* y updated by zaxpby, then overwritten with the difference */
+    double y_verify[DATASIZE * INCREMENT * 2]; /* copy of y processed by zscal and then zaxpy */
+};
+#ifdef BUILD_COMPLEX16
+static struct DATA_ZAXPBY data_zaxpby;
+
+/**
+ * Fortran API specific function
+ * Test zaxpby by comparing it with zscal followed by zaxpy,
+ * both run on the same vectors filled by drand_generate.
+ *
+ * param n - number of elements in vectors x and y
+ * param alpha - scalar alpha, passed as an array of 2 doubles
+ * param incx - increment for the elements of x (may be negative)
+ * param beta - scalar beta, passed as an array of 2 doubles
+ * param incy - increment for the elements of y (may be negative)
+ * return norm of the difference between the zaxpby and zscal+zaxpy results
+ */
+static double check_zaxpby(blasint n, double *alpha, blasint incx, double *beta, blasint incy)
+{
+    blasint i;
+
+    // zscal accepts only positive increments; labs() takes long, blasint may differ
+    blasint incx_abs = (incx < 0) ? -incx : incx;
+    blasint incy_abs = (incy < 0) ? -incy : incy;
+
+    // Fill vectors x, y
+    drand_generate(data_zaxpby.x_test, n * incx_abs * 2);
+    drand_generate(data_zaxpby.y_test, n * incy_abs * 2);
+
+    // Copy vector x for zaxpy
+    for (i = 0; i < n * incx_abs * 2; i++)
+        data_zaxpby.x_verify[i] = data_zaxpby.x_test[i];
+
+    // Copy vector y for zscal
+    for (i = 0; i < n * incy_abs * 2; i++)
+        data_zaxpby.y_verify[i] = data_zaxpby.y_test[i];
+
+    // Find beta*y
+    BLASFUNC(zscal)(&n, beta, data_zaxpby.y_verify, &incy_abs);
+
+    // Find sum of alpha*x and beta*y
+    BLASFUNC(zaxpy)(&n, alpha, data_zaxpby.x_verify, &incx,
+                        data_zaxpby.y_verify, &incy);
+
+    BLASFUNC(zaxpby)(&n, alpha, data_zaxpby.x_test, &incx,
+                        beta, data_zaxpby.y_test, &incy);
+
+    // Find the differences between output vector calculated by zaxpby and zaxpy
+    for (i = 0; i < n * incy_abs * 2; i++)
+        data_zaxpby.y_test[i] -= data_zaxpby.y_verify[i];
+
+    // Find the norm of differences
+    return BLASFUNC(dznrm2)(&n, data_zaxpby.y_test, &incy_abs);
+}
+
+/**
+ * C API specific function
+ * Test cblas_zaxpby by comparing it with cblas_zscal followed by cblas_zaxpy,
+ * both run on the same vectors filled by drand_generate.
+ *
+ * param n - number of elements in vectors x and y
+ * param alpha - scalar alpha, passed as an array of 2 doubles
+ * param incx - increment for the elements of x (may be negative)
+ * param beta - scalar beta, passed as an array of 2 doubles
+ * param incy - increment for the elements of y (may be negative)
+ * return norm of the difference between the zaxpby and zscal+zaxpy results
+ */
+static double c_api_check_zaxpby(blasint n, double *alpha, blasint incx, double *beta, blasint incy)
+{
+    blasint i;
+
+    // zscal accepts only positive increments; labs() takes long, blasint may differ
+    blasint incx_abs = (incx < 0) ? -incx : incx;
+    blasint incy_abs = (incy < 0) ? -incy : incy;
+
+    // Fill vectors x, y
+    drand_generate(data_zaxpby.x_test, n * incx_abs * 2);
+    drand_generate(data_zaxpby.y_test, n * incy_abs * 2);
+
+    // Copy vector x for zaxpy
+    for (i = 0; i < n * incx_abs * 2; i++)
+        data_zaxpby.x_verify[i] = data_zaxpby.x_test[i];
+
+    // Copy vector y for zscal
+    for (i = 0; i < n * incy_abs * 2; i++)
+        data_zaxpby.y_verify[i] = data_zaxpby.y_test[i];
+
+    // Find beta*y
+    cblas_zscal(n, beta, data_zaxpby.y_verify, incy_abs);
+
+    // Find sum of alpha*x and beta*y
+    cblas_zaxpy(n, alpha, data_zaxpby.x_verify, incx,
+                        data_zaxpby.y_verify, incy);
+
+    cblas_zaxpby(n, alpha, data_zaxpby.x_test, incx,
+                        beta, data_zaxpby.y_test, incy);
+
+    // Find the differences between output vector calculated by zaxpby and zaxpy
+    for (i = 0; i < n * incy_abs * 2; i++)
+        data_zaxpby.y_test[i] -= data_zaxpby.y_verify[i];
+
+    // Find the norm of differences
+    return cblas_dznrm2(n, data_zaxpby.y_test, incy_abs);
+}
+
+/**
+ * Fortran API specific test
+ * Check zaxpby against the combination of zscal and zaxpy.
+ * Configuration under test:
+ *
+ * n = 100 elements in x and y
+ * incx = 1, incy = 1
+ * alpha = {1.0, 1.0}, beta = {1.0, 1.0}
+ */
+CTEST(zaxpby, inc_x_1_inc_y_1_N_100)
+{
+    double scale_x[2] = {1.0, 1.0};
+    double scale_y[2] = {1.0, 1.0};
+    const blasint len = DATASIZE, step_x = 1, step_y = 1;
+
+    double diff_norm = check_zaxpby(len, scale_x, step_x, scale_y, step_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Check zaxpby against the combination of zscal and zaxpy.
+ * Configuration under test:
+ *
+ * n = 100 elements in x and y
+ * incx = 2, incy = 1
+ * alpha = {2.0, 1.0}, beta = {1.0, 1.0}
+ */
+CTEST(zaxpby, inc_x_2_inc_y_1_N_100)
+{
+    double scale_x[2] = {2.0, 1.0};
+    double scale_y[2] = {1.0, 1.0};
+    const blasint len = DATASIZE, step_x = 2, step_y = 1;
+
+    double diff_norm = check_zaxpby(len, scale_x, step_x, scale_y, step_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Check zaxpby against the combination of zscal and zaxpy.
+ * Configuration under test:
+ *
+ * n = 100 elements in x and y
+ * incx = 1, incy = 2
+ * alpha = {1.0, 1.0}, beta = {2.0, 1.0}
+ */
+CTEST(zaxpby, inc_x_1_inc_y_2_N_100)
+{
+    double scale_x[2] = {1.0, 1.0};
+    double scale_y[2] = {2.0, 1.0};
+    const blasint len = DATASIZE, step_x = 1, step_y = 2;
+
+    double diff_norm = check_zaxpby(len, scale_x, step_x, scale_y, step_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Check zaxpby against the combination of zscal and zaxpy.
+ * Configuration under test:
+ *
+ * n = 100 elements in x and y
+ * incx = 2, incy = 2
+ * alpha = {3.0, 1.0}, beta = {4.0, 3.0}
+ */
+CTEST(zaxpby, inc_x_2_inc_y_2_N_100)
+{
+    double scale_x[2] = {3.0, 1.0};
+    double scale_y[2] = {4.0, 3.0};
+    const blasint len = DATASIZE, step_x = 2, step_y = 2;
+
+    double diff_norm = check_zaxpby(len, scale_x, step_x, scale_y, step_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Check zaxpby against the combination of zscal and zaxpy.
+ * Configuration under test:
+ *
+ * n = 100 elements in x and y
+ * incx = -1, incy = 2
+ * alpha = {5.0, 2.2}, beta = {4.0, 5.0}
+ */
+CTEST(zaxpby, inc_x_neg_1_inc_y_2_N_100)
+{
+    double scale_x[2] = {5.0, 2.2};
+    double scale_y[2] = {4.0, 5.0};
+    const blasint len = DATASIZE, step_x = -1, step_y = 2;
+
+    double diff_norm = check_zaxpby(len, scale_x, step_x, scale_y, step_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Check zaxpby against the combination of zscal and zaxpy.
+ * Configuration under test:
+ *
+ * n = 100 elements in x and y
+ * incx = 2, incy = -1
+ * alpha = {1.0, 1.0}, beta = {6.0, 3.0}
+ */
+CTEST(zaxpby, inc_x_2_inc_y_neg_1_N_100)
+{
+    double scale_x[2] = {1.0, 1.0};
+    double scale_y[2] = {6.0, 3.0};
+    const blasint len = DATASIZE, step_x = 2, step_y = -1;
+
+    double diff_norm = check_zaxpby(len, scale_x, step_x, scale_y, step_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Check zaxpby against the combination of zscal and zaxpy.
+ * Configuration under test:
+ *
+ * n = 100 elements in x and y
+ * incx = -2, incy = -1
+ * alpha = {7.0, 2.0}, beta = {3.5, 1.3}
+ */
+CTEST(zaxpby, inc_x_neg_2_inc_y_neg_1_N_100)
+{
+    double scale_x[2] = {7.0, 2.0};
+    double scale_y[2] = {3.5, 1.3};
+    const blasint len = DATASIZE, step_x = -2, step_y = -1;
+
+    double diff_norm = check_zaxpby(len, scale_x, step_x, scale_y, step_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Check zaxpby against the combination of zscal and zaxpy.
+ * Configuration under test:
+ *
+ * n = 100 elements in x and y
+ * incx = 1, incy = 1
+ * alpha = {0.0, 0.0} (zero)
+ * beta = {1.0, 1.0}
+ */
+CTEST(zaxpby, inc_x_1_inc_y_1_N_100_alpha_zero)
+{
+    double scale_x[2] = {0.0, 0.0};
+    double scale_y[2] = {1.0, 1.0};
+    const blasint len = DATASIZE, step_x = 1, step_y = 1;
+
+    double diff_norm = check_zaxpby(len, scale_x, step_x, scale_y, step_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Check zaxpby against the combination of zscal and zaxpy.
+ * Configuration under test:
+ *
+ * n = 100 elements in x and y
+ * incx = 1, incy = 1
+ * alpha = {1.0, 1.0}
+ * beta = {0.0, 0.0} (zero)
+ */
+CTEST(zaxpby, inc_x_1_inc_y_1_N_100_beta_zero)
+{
+    double scale_x[2] = {1.0, 1.0};
+    double scale_y[2] = {0.0, 0.0};
+    const blasint len = DATASIZE, step_x = 1, step_y = 1;
+
+    double diff_norm = check_zaxpby(len, scale_x, step_x, scale_y, step_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Check zaxpby against the combination of zscal and zaxpy.
+ * Configuration under test:
+ *
+ * n = 100 elements in x and y
+ * incx = 1
+ * incy = 1
+ * alpha = {0.0, 0.0} (zero)
+ * beta = {0.0, 0.0} (zero)
+ */
+CTEST(zaxpby, inc_x_1_inc_y_1_N_100_alpha_beta_zero)
+{
+    double scale_x[2] = {0.0, 0.0};
+    double scale_y[2] = {0.0, 0.0};
+    const blasint len = DATASIZE, step_x = 1, step_y = 1;
+
+    double diff_norm = check_zaxpby(len, scale_x, step_x, scale_y, step_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ * Scalar alpha is zero
+ * Scalar beta is zero
+*/
+CTEST(zaxpby, inc_x_1_inc_y_2_N_100_alpha_beta_zero)
+{
+    double alpha[] = {0.0, 0.0};
+    double beta[] = {0.0, 0.0};
+    blasint n = DATASIZE, incx = 1, incy = 2;
+    double residual;
+    /* The norm reported by the helper must be zero within DOUBLE_EPS */
+    residual = check_zaxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Check if n - size of vectors x, y is zero
+ */
+CTEST(zaxpby, check_n_zero)
+{
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+    blasint n = 0, incx = 1, incy = 1;
+    double residual;
+    /* With n = 0 the reported norm must still be zero within DOUBLE_EPS */
+    residual = check_zaxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ */
+CTEST(zaxpby, c_api_inc_x_1_inc_y_1_N_100)
+{
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    double residual;
+    /* The norm reported by the C API helper must be zero within DOUBLE_EPS */
+    residual = c_api_check_zaxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 1
+ */
+CTEST(zaxpby, c_api_inc_x_2_inc_y_1_N_100)
+{
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {1.0, 1.0};
+    blasint n = DATASIZE, incx = 2, incy = 1;
+    double residual;
+    /* The norm reported by the C API helper must be zero within DOUBLE_EPS */
+    residual = c_api_check_zaxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ */
+CTEST(zaxpby, c_api_inc_x_1_inc_y_2_N_100)
+{
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {2.0, 2.1};
+    blasint n = DATASIZE, incx = 1, incy = 2;
+    double residual;
+    /* The norm reported by the C API helper must be zero within DOUBLE_EPS */
+    residual = c_api_check_zaxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ */
+CTEST(zaxpby, c_api_inc_x_2_inc_y_2_N_100)
+{
+    double alpha[] = {3.0, 2.0};
+    double beta[] = {4.0, 3.0};
+    blasint n = DATASIZE, incx = 2, incy = 2;
+    double residual;
+    /* The norm reported by the C API helper must be zero within DOUBLE_EPS */
+    residual = c_api_check_zaxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -1
+ * Stride of vector y is 2
+ */
+CTEST(zaxpby, c_api_inc_x_neg_1_inc_y_2_N_100)
+{
+    double alpha[] = {5.0, 2.0};
+    double beta[] = {4.0, 3.1};
+    blasint n = DATASIZE, incx = -1, incy = 2;
+    double residual;
+    /* The norm reported by the C API helper must be zero within DOUBLE_EPS */
+    residual = c_api_check_zaxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is -1
+ */
+CTEST(zaxpby, c_api_inc_x_2_inc_y_neg_1_N_100)
+{
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {6.0, 2.3};
+    blasint n = DATASIZE, incx = 2, incy = -1;
+    double residual;
+    /* The norm reported by the C API helper must be zero within DOUBLE_EPS */
+    residual = c_api_check_zaxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -2
+ * Stride of vector y is -1
+ */
+CTEST(zaxpby, c_api_inc_x_neg_2_inc_y_neg_1_N_100)
+{
+    double alpha[] = {7.0, 1.0};
+    double beta[] = {3.5, 1.0};
+    blasint n = DATASIZE, incx = -2, incy = -1;
+    double residual;
+    /* The norm reported by the C API helper must be zero within DOUBLE_EPS */
+    residual = c_api_check_zaxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar alpha is zero
+ */
+CTEST(zaxpby, c_api_inc_x_1_inc_y_1_N_100_alpha_zero)
+{
+    double alpha[] = {0.0, 0.0};
+    double beta[] = {1.0, 1.0};
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    double residual;
+    /* The norm reported by the C API helper must be zero within DOUBLE_EPS */
+    residual = c_api_check_zaxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar beta is zero
+ */
+CTEST(zaxpby, c_api_inc_x_1_inc_y_1_N_100_beta_zero)
+{
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {0.0, 0.0};
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    double residual;
+    /* The norm reported by the C API helper must be zero within DOUBLE_EPS */
+    residual = c_api_check_zaxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * Scalar alpha is zero
+ * Scalar beta is zero
+ */
+CTEST(zaxpby, c_api_inc_x_1_inc_y_1_N_100_alpha_beta_zero)
+{
+    double alpha[] = {0.0, 0.0};
+    double beta[] = {0.0, 0.0};
+    blasint n = DATASIZE, incx = 1, incy = 1;
+    double residual;
+    /* The norm reported by the C API helper must be zero within DOUBLE_EPS */
+    residual = c_api_check_zaxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zaxpby by comparing it with zscal and zaxpy.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ * Scalar alpha is zero
+ * Scalar beta is zero
+*/
+CTEST(zaxpby, c_api_inc_x_1_inc_y_2_N_100_alpha_beta_zero)
+{
+    double alpha[] = {0.0, 0.0};
+    double beta[] = {0.0, 0.0};
+    blasint n = DATASIZE, incx = 1, incy = 2;
+    double residual;
+    /* The norm reported by the C API helper must be zero within DOUBLE_EPS */
+    residual = c_api_check_zaxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Check if n - size of vectors x, y is zero
+ */
+CTEST(zaxpby, c_api_check_n_zero)
+{
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+    blasint n = 0, incx = 1, incy = 1;
+    double residual;
+    /* With n = 0 the reported norm must still be zero within DOUBLE_EPS */
+    residual = c_api_check_zaxpby(n, alpha, incx, beta, incy);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+#endif
diff --git a/utest/test_extensions/test_zaxpyc.c b/utest/test_extensions/test_zaxpyc.c
new file mode 100644
index 000000000..7c11cd920
--- /dev/null
+++ b/utest/test_extensions/test_zaxpyc.c
@@ -0,0 +1,159 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+#define INCREMENT 2
+
+struct DATA_ZAXPYC {	/* Work buffers used by check_zaxpyc */
+	double x_test[DATASIZE * INCREMENT * 2];	/* vector x passed to zaxpyc */
+	double x_verify[DATASIZE * INCREMENT * 2];	/* copy of x, conjugated and passed to zaxpy */
+	double y_test[DATASIZE * INCREMENT * 2];	/* vector y updated by zaxpyc */
+	double y_verify[DATASIZE * INCREMENT * 2];	/* copy of y updated by zaxpy, then turned into the difference */
+};
+#ifdef BUILD_COMPLEX16
+static struct DATA_ZAXPYC data_zaxpyc;
+
+/**
+ * Test zaxpyc by conjugating vector x and comparing with zaxpy.
+ * Compare with the following options:
+ *
+ * param n - number of elements in vectors x and y
+ * param alpha - scalar alpha
+ * param incx - increment for the elements of x
+ * param incy - increment for the elements of y
+ * return norm of difference
+ */
+static double check_zaxpyc(blasint n, double *alpha, blasint incx, blasint incy)
+{
+	const blasint len_x = n * incx * 2;	/* number of doubles used for x */
+	const blasint len_y = n * incy * 2;	/* number of doubles used for y */
+	blasint k;
+
+	drand_generate(data_zaxpyc.x_test, len_x);
+	drand_generate(data_zaxpyc.y_test, len_y);
+	/* Duplicate the random inputs so both routines start from equal data */
+	for (k = 0; k < len_x; k++)
+		data_zaxpyc.x_verify[k] = data_zaxpyc.x_test[k];
+	for (k = 0; k < len_y; k++)
+		data_zaxpyc.y_verify[k] = data_zaxpyc.y_test[k];
+
+	/* Reference path: conjugate x explicitly, then apply plain zaxpy */
+	zconjugate_vector(n, incx, data_zaxpyc.x_verify);
+	BLASFUNC(zaxpy)(&n, alpha, data_zaxpyc.x_verify, &incx,
+			data_zaxpyc.y_verify, &incy);
+
+	/* Routine under test works on the untouched copy of x */
+	BLASFUNC(zaxpyc)(&n, alpha, data_zaxpyc.x_test, &incx,
+			 data_zaxpyc.y_test, &incy);
+
+	/* y_verify becomes the element-wise difference of the two results */
+	for (k = 0; k < len_y; k++)
+		data_zaxpyc.y_verify[k] -= data_zaxpyc.y_test[k];
+	return BLASFUNC(dznrm2)(&n, data_zaxpyc.y_verify, &incy);
+}
+
+/**
+ * Test zaxpyc by conjugating vector x and comparing with zaxpy.
+ * Test with the following options:
+ *
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ */
+CTEST(zaxpyc, conj_strides_one)
+{
+	double alpha[] = {5.0, 2.2};
+	blasint n = DATASIZE, incx = 1, incy = 1;
+	double residual;
+	/* zaxpyc must match the conjugate-then-zaxpy reference */
+	residual = check_zaxpyc(n, alpha, incx, incy);
+	ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * Test zaxpyc by conjugating vector x and comparing with zaxpy.
+ * Test with the following options:
+ *
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ */
+CTEST(zaxpyc, conj_incx_one)
+{
+	double alpha[] = {5.0, 2.2};
+	blasint n = DATASIZE, incx = 1, incy = 2;
+	double residual;
+	/* zaxpyc must match the conjugate-then-zaxpy reference */
+	residual = check_zaxpyc(n, alpha, incx, incy);
+	ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * Test zaxpyc by conjugating vector x and comparing with zaxpy.
+ * Test with the following options:
+ *
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 1
+ */
+CTEST(zaxpyc, conj_incy_one)
+{
+	double alpha[] = {5.0, 2.2};
+	blasint n = DATASIZE, incx = 2, incy = 1;
+	double residual;
+	/* zaxpyc must match the conjugate-then-zaxpy reference */
+	residual = check_zaxpyc(n, alpha, incx, incy);
+	ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * Test zaxpyc by conjugating vector x and comparing with zaxpy.
+ * Test with the following options:
+ *
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ */
+CTEST(zaxpyc, conj_strides_two)
+{
+	double alpha[] = {5.0, 2.2};
+	blasint n = DATASIZE, incx = 2, incy = 2;
+	double residual;
+	/* zaxpyc must match the conjugate-then-zaxpy reference */
+	residual = check_zaxpyc(n, alpha, incx, incy);
+	ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+#endif
diff --git a/utest/test_extensions/test_zgbmv.c b/utest/test_extensions/test_zgbmv.c
new file mode 100644
index 000000000..55473361c
--- /dev/null
+++ b/utest/test_extensions/test_zgbmv.c
@@ -0,0 +1,280 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+#define INCREMENT 1
+
+struct DATA_ZGBMV {    /* Work buffers used by check_zgbmv */
+    double a_test[DATASIZE * DATASIZE * 2];         /* full-storage band matrix, passed to zgemv */
+    double a_band_storage[DATASIZE * DATASIZE * 2]; /* band-packed copy of a_test, passed to zgbmv */
+    double matrix[DATASIZE * DATASIZE * 2];         /* random values used to fill the band of a_test */
+    double b_test[DATASIZE * 2 * INCREMENT];        /* input vector b shared by both routines */
+    double c_test[DATASIZE * 2 * INCREMENT];        /* vector c updated by zgbmv */
+    double c_verify[DATASIZE * 2 * INCREMENT];      /* copy of c updated by zgemv, then turned into the difference */
+};
+
+#ifdef BUILD_COMPLEX16
+
+static struct DATA_ZGBMV data_zgbmv;
+
+/** 
+ * Transform full-storage band matrix A to band-packed storage mode.
+ * 
+ * param m - number of rows of A
+ * param n - number of columns of A
+ * param kl - number of sub-diagonals of the matrix A
+ * param ku - number of super-diagonals of the matrix A
+ * output param a - buffer for holding band-packed matrix
+ * param lda - specifies the leading dimension of a
+ * param matrix - buffer holding full-storage band matrix A 
+ * param ldm - specifies the leading dimension of the full-storage band matrix A
+ */
+static void transform_to_band_storage(blasint m, blasint n, blasint kl,
+                                      blasint ku, double* a, blasint lda,
+                                      double* matrix, blasint ldm)
+{
+    blasint row, col;   /* row counts doubles (two per element), col counts columns */
+    for (col = 0; col < n; col++) {
+        blasint shift = 2 * (ku - col);         /* puts the main diagonal on packed row ku */
+        double *dst = a + col * lda * 2;        /* start of this column in band storage */
+        double *src = matrix + col * ldm * 2;   /* start of this column in full storage */
+        for (row = MAX(0, 2*(col - ku)); row < MIN(m, col + kl + 1) * 2; row+=2) {
+            dst[shift + row] = src[row];
+            dst[shift + row + 1] = src[row + 1];
+        }
+    }
+}
+
+/** 
+ * Generate full-storage band matrix A with kl sub-diagonals and ku super-diagonals
+ * 
+ * param m - number of rows of A
+ * param n - number of columns of A
+ * param kl - number of sub-diagonals of the matrix A
+ * param ku - number of super-diagonals of the matrix A
+ * output param band_matrix - buffer for full-storage band matrix.
+ * param matrix - buffer holding input general matrix
+ * param ldm - specifies the leading dimension of the input general matrix
+ */
+static void get_band_matrix(blasint m, blasint n, blasint kl, blasint ku,
+                            double *band_matrix, double *matrix, blasint ldm)
+{
+    blasint col, row;
+    blasint next = 0;   /* read cursor: input is consumed only for in-band elements */
+
+    for (col = 0; col < n; col++) {
+        double *out = band_matrix + col * ldm * 2;
+        for (row = 0; row < m; row++) {
+            /* Inside the band when col - ku <= row <= col + kl */
+            if (row <= kl + col && col <= ku + row) {
+                out[2 * row] = matrix[next++];
+                out[2 * row + 1] = matrix[next++];
+            } else {
+                /* Outside the band both doubles of the element are zeroed */
+                out[2 * row] = 0.0;
+                out[2 * row + 1] = 0.0;
+            }
+        }
+    }
+}
+
+/**
+ * Compare results computed by zgbmv and zgemv
+ * since gbmv is gemv for band matrix
+ * 
+ * param trans specifies op(A), the transposition operation applied to A
+ * param m - number of rows of A
+ * param n - number of columns of A
+ * param kl - number of sub-diagonals of the matrix A
+ * param ku - number of super-diagonals of the matrix A
+ * param alpha - scaling factor for the matrix-vector product
+ * param lda - specifies the leading dimension of a
+ * param inc_b - stride of vector b
+ * param beta - scaling factor for vector c
+ * param inc_c - stride of vector c
+ * return norm of differences 
+ */
+static double check_zgbmv(char trans, blasint m, blasint n, blasint kl, blasint ku,
+    double *alpha, blasint lda, blasint inc_b, double *beta, blasint inc_c)
+{
+    /* Nonzero for the trans codes where b has m elements and c has n */
+    int swapped = (trans == 'T' || trans == 'C' || trans == 'D' || trans == 'U');
+    blasint lenb = swapped ? m : n;
+    blasint lenc = swapped ? n : m;
+    /* Number of doubles covered by the strided vectors b and c */
+    blasint span_b = 2 * (1 + (lenb - 1) * inc_b);
+    blasint span_c = 2 * (1 + (lenc - 1) * inc_c);
+    blasint idx;
+
+    drand_generate(data_zgbmv.matrix, m * n * 2);
+    drand_generate(data_zgbmv.b_test, span_b);
+    drand_generate(data_zgbmv.c_test, span_c);
+
+    /* c_verify starts as a copy of c_test */
+    for (idx = 0; idx < span_c; idx++)
+        data_zgbmv.c_verify[idx] = data_zgbmv.c_test[idx];
+
+    /* Build the full-storage band matrix, then pack its band for zgbmv */
+    get_band_matrix(m, n, kl, ku, data_zgbmv.a_test, data_zgbmv.matrix, m);
+    transform_to_band_storage(m, n, kl, ku, data_zgbmv.a_band_storage, lda, data_zgbmv.a_test, m);
+
+    /* Reference: zgemv on the full-storage matrix */
+    BLASFUNC(zgemv)(&trans, &m, &n, alpha, data_zgbmv.a_test, &m, data_zgbmv.b_test,
+                    &inc_b, beta, data_zgbmv.c_verify, &inc_c);
+
+    /* Routine under test: zgbmv on the band-packed matrix */
+    BLASFUNC(zgbmv)(&trans, &m, &n, &kl, &ku, alpha, data_zgbmv.a_band_storage, &lda, data_zgbmv.b_test,
+                    &inc_b, beta, data_zgbmv.c_test, &inc_c);
+
+    for (idx = 0; idx < span_c; idx++)
+        data_zgbmv.c_verify[idx] -= data_zgbmv.c_test[idx];
+    return BLASFUNC(dznrm2)(&lenc, data_zgbmv.c_verify, &inc_c);
+}
+
+/**
+ * Test zgbmv by comparing it against zgemv
+ * with param trans is D
+ */
+CTEST(zgbmv, trans_D)
+{
+    double alpha[] = {7.0, 1.0};
+    double beta[] = {1.5, -1.5};
+    char trans = 'D';
+    blasint m = 50, n = 25;
+    blasint kl = 20, ku = 11;
+    blasint lda = 50;
+    blasint inc_b = 1, inc_c = 1;
+    double residual;
+
+    residual = check_zgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_TOL);
+}
+
+/**
+ * Test zgbmv by comparing it against zgemv
+ * with param trans is O
+ */
+CTEST(zgbmv, trans_O)
+{
+    double alpha[] = {7.0, 1.0};
+    double beta[] = {1.5, -1.5};
+    char trans = 'O';
+    blasint m = 50, n = 25;
+    blasint kl = 20, ku = 10;
+    blasint lda = 50;
+    blasint inc_b = 1, inc_c = 1;
+    double residual;
+
+    residual = check_zgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_TOL);
+}
+
+/**
+ * Test zgbmv by comparing it against zgemv
+ * with param trans is S
+ */
+CTEST(zgbmv, trans_S)
+{
+    double alpha[] = {7.0, 1.0};
+    double beta[] = {1.5, -1.5};
+    char trans = 'S';
+    blasint m = 50, n = 25;
+    blasint kl = 6, ku = 9;
+    blasint lda = 50;
+    blasint inc_b = 1, inc_c = 1;
+    double residual;
+
+    residual = check_zgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_TOL);
+}
+
+/**
+ * Test zgbmv by comparing it against zgemv
+ * with param trans is U
+ */
+CTEST(zgbmv, trans_U)
+{
+    double alpha[] = {7.0, 1.0};
+    double beta[] = {1.5, -1.5};
+    char trans = 'U';
+    blasint m = 25, n = 50;
+    blasint kl = 7, ku = 11;
+    blasint lda = kl + ku + 1;  /* exactly one packed row per stored diagonal */
+    blasint inc_b = 1, inc_c = 1;
+    double residual;
+
+    residual = check_zgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_TOL);
+}
+
+/**
+ * Test zgbmv by comparing it against zgemv
+ * with param trans is C
+ */
+CTEST(zgbmv, trans_C)
+{
+    double alpha[] = {7.0, 1.0};
+    double beta[] = {1.5, -1.5};
+    char trans = 'C';
+    blasint m = 50, n = 25;
+    blasint kl = 20, ku = 11;
+    blasint lda = 50;
+    blasint inc_b = 1, inc_c = 1;
+    double residual;
+
+    residual = check_zgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_TOL);
+}
+
+/**
+ * Test zgbmv by comparing it against zgemv
+ * with param trans is R
+ */
+CTEST(zgbmv, trans_R)
+{
+    double alpha[] = {7.0, 1.0};
+    double beta[] = {1.5, -1.5};
+    char trans = 'R';
+    blasint m = 50, n = 100;
+    blasint kl = 20, ku = 11;
+    blasint lda = 50;
+    blasint inc_b = 1, inc_c = 1;
+    double residual;
+
+    residual = check_zgbmv(trans, m, n, kl, ku, alpha, lda, inc_b, beta, inc_c);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_TOL);
+}
+#endif
diff --git a/utest/test_extensions/test_zgeadd.c b/utest/test_extensions/test_zgeadd.c
new file mode 100644
index 000000000..7496ccf88
--- /dev/null
+++ b/utest/test_extensions/test_zgeadd.c
@@ -0,0 +1,880 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define N 100
+#define M 100
+
+struct DATA_ZGEADD {   /* Work buffers used by check_zgeadd and check_badargs */
+    double a_test[M * N * 2];   /* matrix A, input to both implementations */
+    double c_test[M * N * 2];   /* matrix C, updated by zgeadd */
+    double c_verify[M * N * 2]; /* copy of C, updated by the reference zgeadd_trusted */
+};
+
+#ifdef BUILD_COMPLEX16
+static struct DATA_ZGEADD data_zgeadd;
+
+/**
+ * zgeadd reference implementation
+ *
+ * param m - number of rows of A and C
+ * param n - number of columns of A and C
+ * param alpha - scaling factor for matrix A
+ * param aptr - refer to matrix A
+ * param lda - leading dimension of A
+ * param beta - scaling factor for matrix C
+ * param cptr - refer to matrix C
+ * param ldc - leading dimension of C
+ */
+static void zgeadd_trusted(blasint m, blasint n, double *alpha, double *aptr,
+                           blasint lda, double *beta, double *cptr, blasint ldc)
+{
+    /* Distance between consecutive vectors, counted in doubles */
+    const blasint a_step = lda * 2;
+    const blasint c_step = ldc * 2;
+    blasint vec;
+
+    /* One unit-stride zaxpby of length m for each of the n vectors */
+    for (vec = 0; vec < n; vec++) {
+        double *a_vec = aptr + vec * a_step;
+        double *c_vec = cptr + vec * c_step;
+        cblas_zaxpby(m, alpha, a_vec, 1, beta, c_vec, 1);
+    }
+}
+
+/**
+ * Test zgeadd by comparing it against reference
+ * Compare with the following options:
+ *
+ * param api - specifies Fortran or C API
+ * param order - specifies whether A and C stored in
+ * row-major order or column-major order
+ * param m - number of rows of A and C
+ * param n - number of columns of A and C
+ * param alpha - scaling factor for matrix A
+ * param lda - leading dimension of A
+ * param beta - scaling factor for matrix C
+ * param ldc - leading dimension of C
+ * return norm of differences
+ */
+static double check_zgeadd(char api, OPENBLAS_CONST enum CBLAS_ORDER order,
+                           blasint m, blasint n, double *alpha, blasint lda,
+                           double *beta, blasint ldc)
+{
+    /* vec_len: elements per contiguous vector, vec_cnt: number of vectors */
+    const blasint vec_len = (order == CblasRowMajor) ? n : m;
+    const blasint vec_cnt = (order == CblasRowMajor) ? m : n;
+    const blasint a_doubles = lda * vec_cnt * 2;
+    const blasint c_doubles = ldc * vec_cnt * 2;
+    blasint k;
+
+    /* Random A and C; C is duplicated so both implementations start equal */
+    drand_generate(data_zgeadd.a_test, a_doubles);
+    drand_generate(data_zgeadd.c_test, c_doubles);
+    for (k = 0; k < c_doubles; k++)
+        data_zgeadd.c_verify[k] = data_zgeadd.c_test[k];
+
+    /* Reference result goes to c_verify */
+    zgeadd_trusted(vec_len, vec_cnt, alpha, data_zgeadd.a_test, lda,
+                   beta, data_zgeadd.c_verify, ldc);
+
+    /* Result of the routine under test goes to c_test */
+    if (api != 'F')
+        cblas_zgeadd(order, m, n, alpha, data_zgeadd.a_test, lda,
+                     beta, data_zgeadd.c_test, ldc);
+    else
+        BLASFUNC(zgeadd)(&m, &n, alpha, data_zgeadd.a_test, &lda,
+                         beta, data_zgeadd.c_test, &ldc);
+
+    /* Difference between the zgeadd output and the reference output.
+     * NOTE(review): vec_len counts complex elements while the stride is in
+     * doubles - confirm dmatrix_difference covers both doubles per element */
+    return dmatrix_difference(data_zgeadd.c_test, data_zgeadd.c_verify, vec_len, vec_cnt, ldc * 2);
+}
+
+/**
+ * Check if error function was called with expected function name
+ * and param info
+ *
+ * param api - specifies Fortran or C API
+ * param order - specifies whether A and C stored in
+ * row-major order or column-major order
+ * param m - number of rows of A and C
+ * param n - number of columns of A and C
+ * param lda - leading dimension of A
+ * param ldc - leading dimension of C
+ * param expected_info - expected invalid parameter number in zgeadd
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char api, OPENBLAS_CONST enum CBLAS_ORDER order,
+                         blasint m, blasint n, blasint lda,
+                         blasint ldc, int expected_info)
+{
+    /* Fixed scalars; the callers vary only the dimension arguments */
+    double beta[] = {1.0, 1.0};
+    double alpha[] = {1.0, 1.0};
+
+    /* Register the routine name and info value the error handler should see */
+    set_xerbla("ZGEADD ", expected_info);
+    if (api != 'F')
+        cblas_zgeadd(order, m, n, alpha, data_zgeadd.a_test, lda,
+                     beta, data_zgeadd.c_test, ldc);
+    else
+        BLASFUNC(zgeadd)(&m, &n, alpha, data_zgeadd.a_test, &lda,
+                         beta, data_zgeadd.c_test, &ldc);
+    return check_error();
+}
+
+/**
+ * Fortran API specific test
+ * Test zgeadd by comparing it against reference
+ * with the following options:
+ *
+ * For A number of rows is 100, number of columns is 100
+ * For C number of rows is 100, number of columns is 100
+ */
+CTEST(zgeadd, matrix_n_100_m_100)
+{
+    /* Scaling factors for A and C respectively */
+    double alpha[] = {3.0, 2.0};
+    double beta[] = {1.0, 3.0};
+
+    /* Problem size; both leading dimensions equal the row count */
+    blasint m = M, n = N;
+    blasint lda = m, ldc = m;
+
+    /* The Fortran entry point takes no order argument; check_zgeadd only
+     * uses it to decide how the buffers are laid out */
+    CBLAS_ORDER order = CblasColMajor;
+    double residual;
+    residual = check_zgeadd('F', order, m, n, alpha, lda, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zgeadd by comparing it against reference
+ * with the following options:
+ *
+ * For A number of rows is 100, number of columns is 100
+ * For C number of rows is 100, number of columns is 100
+ * Scalar alpha is zero (operation is C:=beta*C)
+ */
+CTEST(zgeadd, matrix_n_100_m_100_alpha_zero)
+{
+    /* Scaling factors for A and C respectively; alpha is zero here */
+    double alpha[] = {0.0, 0.0};
+    double beta[] = {1.0, 1.0};
+
+    /* Problem size; both leading dimensions equal the row count */
+    blasint m = M, n = N;
+    blasint lda = m, ldc = m;
+
+    /* The Fortran entry point takes no order argument; check_zgeadd only
+     * uses it to decide how the buffers are laid out */
+    CBLAS_ORDER order = CblasColMajor;
+    double residual;
+    residual = check_zgeadd('F', order, m, n, alpha, lda, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zgeadd by comparing it against reference
+ * with the following options:
+ *
+ * For A number of rows is 100, number of columns is 100
+ * For C number of rows is 100, number of columns is 100
+ * Scalar beta is zero (operation is C:=alpha*A)
+ */
+CTEST(zgeadd, matrix_n_100_m_100_beta_zero)
+{
+    /* Scaling factors for A and C respectively; beta is zero here */
+    double alpha[] = {3.0, 1.5};
+    double beta[] = {0.0, 0.0};
+
+    /* Problem size; both leading dimensions equal the row count */
+    blasint m = M, n = N;
+    blasint lda = m, ldc = m;
+
+    /* The Fortran entry point takes no order argument; check_zgeadd only
+     * uses it to decide how the buffers are laid out */
+    CBLAS_ORDER order = CblasColMajor;
+    double residual;
+    residual = check_zgeadd('F', order, m, n, alpha, lda, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zgeadd by comparing it against reference
+ * with the following options:
+ *
+ * For A number of rows is 100, number of columns is 100
+ * For C number of rows is 100, number of columns is 100
+ * Scalars alpha and beta are zero (operation is C:= 0)
+ */
+CTEST(zgeadd, matrix_n_100_m_100_alpha_beta_zero)
+{
+    /* Scaling factors for A and C respectively; both are zero here */
+    double alpha[] = {0.0, 0.0};
+    double beta[] = {0.0, 0.0};
+
+    /* Problem size; both leading dimensions equal the row count */
+    blasint m = M, n = N;
+    blasint lda = m, ldc = m;
+
+    /* The Fortran entry point takes no order argument; check_zgeadd only
+     * uses it to decide how the buffers are laid out */
+    CBLAS_ORDER order = CblasColMajor;
+    double residual;
+    residual = check_zgeadd('F', order, m, n, alpha, lda, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zgeadd by comparing it against reference
+ * with the following options:
+ *
+ * For A number of rows is 50, number of columns is 100
+ * For C number of rows is 50, number of columns is 100
+ */
+CTEST(zgeadd, matrix_n_100_m_50)
+{
+    /* Scaling factors for A and C respectively */
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    /* Half as many rows as columns; leading dimensions equal the row count */
+    blasint m = M / 2, n = N;
+    blasint lda = m, ldc = m;
+
+    /* The Fortran entry point takes no order argument; check_zgeadd only
+     * uses it to decide how the buffers are laid out */
+    CBLAS_ORDER order = CblasColMajor;
+    double residual;
+    residual = check_zgeadd('F', order, m, n, alpha, lda, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param n -
+ * number of columns of A and C
+ * Must be at least zero.
+ */
+CTEST(zgeadd, xerbla_n_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+    blasint m = 1, n = INVALID;
+    blasint lda = m, ldc = m;
+
+    /* n is the 2nd argument of the Fortran zgeadd call */
+    int expected_info = 2;
+    int ok;
+
+    ok = check_badargs('F', order, m, n, lda, ldc, expected_info);
+
+    /* TRUE means the error handler saw the expected name and info */
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param m -
+ * number of rows of A and C
+ * Must be at least zero.
+ */
+CTEST(zgeadd, xerbla_m_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+    blasint m = INVALID, n = 1;
+    blasint lda = 1, ldc = 1;
+
+    /* m is the 1st argument of the Fortran zgeadd call */
+    int expected_info = 1;
+    int ok;
+
+    ok = check_badargs('F', order, m, n, lda, ldc, expected_info);
+
+    /* TRUE means the error handler saw the expected name and info */
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param lda -
+ * specifies the leading dimension of A. Must be at least MAX(1, m).
+ */
+CTEST(zgeadd, xerbla_lda_invalid)
+{
+    CBLAS_ORDER order = CblasColMajor;
+    blasint m = 1, n = 1;
+    blasint lda = INVALID, ldc = 1;
+
+    /* lda is the 5th argument of the Fortran zgeadd call */
+    int expected_info = 5;
+    int ok;
+
+    ok = check_badargs('F', order, m, n, lda, ldc, expected_info);
+
+    /* TRUE means the error handler saw the expected name and info */
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Fortran API specific test
+ * xerbla check for an invalid ldc, the leading dimension of C,
+ * which must be at least MAX(1, m). Reported as argument number 8.
+ */
+CTEST(zgeadd, xerbla_ldc_invalid)
+{
+    const int expected_info = 8;
+
+    /* Only ldc carries the INVALID value */
+    const blasint rows = 1;
+    const blasint cols = 1;
+    const blasint lda = 1;
+    const blasint ldc = INVALID;
+
+    int ok = check_badargs('F', CblasColMajor, rows, cols,
+                           lda, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Fortran API specific test
+ * Edge case: n, the number of columns of A and C, is zero.
+ */
+CTEST(zgeadd, n_zero)
+{
+    const blasint rows = 1;
+    const blasint cols = 0;
+
+    /* Leading dimensions are both set to 1 */
+    const blasint lda = 1;
+    const blasint ldc = 1;
+
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    double diff = check_zgeadd('F', CblasColMajor, rows, cols,
+                               alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Edge case: m, the number of rows of A and C, is zero.
+ */
+CTEST(zgeadd, m_zero)
+{
+    const blasint rows = 0;
+    const blasint cols = 1;
+
+    /* Leading dimensions are both set to 1 */
+    const blasint lda = 1;
+    const blasint ldc = 1;
+
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    double diff = check_zgeadd('F', CblasColMajor, rows, cols,
+                               alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Verify zgeadd against the reference implementation
+ * with the following options:
+ *
+ * column-major order
+ * A has 100 rows and 100 columns
+ * C has 100 rows and 100 columns
+ */
+CTEST(zgeadd, c_api_matrix_n_100_m_100)
+{
+    CBLAS_ORDER layout = CblasColMajor;
+    const blasint rows = M;
+    const blasint cols = N;
+
+    /* Column-major storage: the leading dimensions match the row count */
+    const blasint lda = rows;
+    const blasint ldc = rows;
+
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {1.0, 3.0};
+
+    double diff = check_zgeadd('C', layout, rows, cols,
+                               alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Verify zgeadd against the reference implementation
+ * with the following options:
+ *
+ * row-major order
+ * A has 100 rows and 100 columns
+ * C has 100 rows and 100 columns
+ */
+CTEST(zgeadd, c_api_matrix_n_100_m_100_row_major)
+{
+    CBLAS_ORDER layout = CblasRowMajor;
+    const blasint rows = M;
+    const blasint cols = N;
+
+    /* Both leading dimensions are taken from m */
+    const blasint lda = rows;
+    const blasint ldc = rows;
+
+    double alpha[] = {4.0, 1.5};
+    double beta[] = {2.0, 1.0};
+
+    double diff = check_zgeadd('C', layout, rows, cols,
+                               alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Verify zgeadd against the reference implementation
+ * with the following options:
+ *
+ * row-major order
+ * A has 100 rows and 50 columns
+ * C has 100 rows and 50 columns
+ */
+CTEST(zgeadd, c_api_matrix_n_50_m_100_row_major)
+{
+    CBLAS_ORDER layout = CblasRowMajor;
+    const blasint rows = M;
+    const blasint cols = N / 2;
+
+    /* Row-major storage: the leading dimensions match the column count */
+    const blasint lda = cols;
+    const blasint ldc = cols;
+
+    double alpha[] = {3.0, 2.5};
+    double beta[] = {1.0, 2.0};
+
+    double diff = check_zgeadd('C', layout, rows, cols,
+                               alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Verify zgeadd against the reference implementation
+ * with the following options:
+ *
+ * column-major order
+ * A has 100 rows and 100 columns
+ * C has 100 rows and 100 columns
+ * alpha is zero, so the operation reduces to C := beta*C
+ */
+CTEST(zgeadd, c_api_matrix_n_100_m_100_alpha_zero)
+{
+    CBLAS_ORDER layout = CblasColMajor;
+    const blasint rows = M;
+    const blasint cols = N;
+
+    /* Column-major storage: the leading dimensions match the row count */
+    const blasint lda = rows;
+    const blasint ldc = rows;
+
+    double alpha[] = {0.0, 0.0};
+    double beta[] = {1.0, 1.0};
+
+    double diff = check_zgeadd('C', layout, rows, cols,
+                               alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Verify zgeadd against the reference implementation
+ * with the following options:
+ *
+ * column-major order
+ * A has 100 rows and 100 columns
+ * C has 100 rows and 100 columns
+ * beta is zero, so the operation reduces to C := alpha*A
+ */
+CTEST(zgeadd, c_api_matrix_n_100_m_100_beta_zero)
+{
+    CBLAS_ORDER layout = CblasColMajor;
+    const blasint rows = M;
+    const blasint cols = N;
+
+    /* Column-major storage: the leading dimensions match the row count */
+    const blasint lda = rows;
+    const blasint ldc = rows;
+
+    double alpha[] = {3.0, 1.5};
+    double beta[] = {0.0, 0.0};
+
+    double diff = check_zgeadd('C', layout, rows, cols,
+                               alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Verify zgeadd against the reference implementation
+ * with the following options:
+ *
+ * column-major order
+ * A has 100 rows and 100 columns
+ * C has 100 rows and 100 columns
+ * alpha and beta are both zero, so the operation reduces to C := 0
+ */
+CTEST(zgeadd, c_api_matrix_n_100_m_100_alpha_beta_zero)
+{
+    CBLAS_ORDER layout = CblasColMajor;
+    const blasint rows = M;
+    const blasint cols = N;
+
+    /* Column-major storage: the leading dimensions match the row count */
+    const blasint lda = rows;
+    const blasint ldc = rows;
+
+    double alpha[] = {0.0, 0.0};
+    double beta[] = {0.0, 0.0};
+
+    double diff = check_zgeadd('C', layout, rows, cols,
+                               alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Verify zgeadd against the reference implementation
+ * for a non-square problem:
+ *
+ * A has 50 rows and 100 columns
+ * C has 50 rows and 100 columns
+ */
+CTEST(zgeadd, c_api_matrix_n_100_m_50)
+{
+    CBLAS_ORDER layout = CblasColMajor;
+    const blasint rows = M / 2;
+    const blasint cols = N;
+
+    /* Column-major storage: the leading dimensions match the row count */
+    const blasint lda = rows;
+    const blasint ldc = rows;
+
+    double alpha[] = {2.0, 3.0};
+    double beta[] = {2.0, 4.0};
+
+    double diff = check_zgeadd('C', layout, rows, cols,
+                               alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * xerbla check for an invalid order argument, which selects
+ * row-major or column-major storage of A and C.
+ * The rejected argument is reported as number 0.
+ */
+CTEST(zgeadd, c_api_xerbla_invalid_order)
+{
+    CBLAS_ORDER layout = INVALID;
+    const int expected_info = 0;
+
+    /* Only the order argument carries the INVALID value */
+    const blasint rows = 1;
+    const blasint cols = 1;
+    const blasint lda = 1;
+    const blasint ldc = 1;
+
+    int ok = check_badargs('C', layout, rows, cols, lda, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * xerbla check for an invalid n, the number of
+ * columns of A and C, which must be at least zero.
+ * The rejected argument is reported as number 2.
+ *
+ * column-major order
+ */
+CTEST(zgeadd, c_api_xerbla_n_invalid)
+{
+    CBLAS_ORDER layout = CblasColMajor;
+    const int expected_info = 2;
+
+    /* Only n carries the INVALID value */
+    const blasint rows = 1;
+    const blasint cols = INVALID;
+    const blasint lda = 1;
+    const blasint ldc = 1;
+
+    int ok = check_badargs('C', layout, rows, cols, lda, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * xerbla check for an invalid n, the number of
+ * columns of A and C, which must be at least zero.
+ * The rejected argument is expected to be reported as number 2.
+ *
+ * row-major order
+ */
+CTEST(zgeadd, c_api_xerbla_n_invalid_row_major)
+{
+    CBLAS_ORDER layout = CblasRowMajor;
+    const int expected_info = 2;
+
+    /* Only n carries the INVALID value */
+    const blasint rows = 1;
+    const blasint cols = INVALID;
+    const blasint lda = 1;
+    const blasint ldc = 1;
+
+    int ok = check_badargs('C', layout, rows, cols, lda, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * xerbla check for an invalid m, the number of
+ * rows of A and C, which must be at least zero.
+ * The rejected argument is reported as number 1.
+ *
+ * column-major order
+ */
+CTEST(zgeadd, c_api_xerbla_m_invalid)
+{
+    CBLAS_ORDER layout = CblasColMajor;
+    const int expected_info = 1;
+
+    /* Only m carries the INVALID value */
+    const blasint rows = INVALID;
+    const blasint cols = 1;
+    const blasint lda = 1;
+    const blasint ldc = 1;
+
+    int ok = check_badargs('C', layout, rows, cols, lda, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * xerbla check for an invalid m, the number of
+ * rows of A and C, which must be at least zero.
+ * The rejected argument is expected to be reported as number 1.
+ *
+ * row-major order
+ */
+CTEST(zgeadd, c_api_xerbla_m_invalid_row_major)
+{
+    CBLAS_ORDER layout = CblasRowMajor;
+    const int expected_info = 1;
+
+    /* Only m carries the INVALID value */
+    const blasint rows = INVALID;
+    const blasint cols = 1;
+    const blasint lda = 1;
+    const blasint ldc = 1;
+
+    int ok = check_badargs('C', layout, rows, cols, lda, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * xerbla check for an invalid lda, the leading dimension of A,
+ * which must be at least MAX(1, m). Reported as argument number 5.
+ *
+ * column-major order
+ */
+CTEST(zgeadd, c_api_xerbla_lda_invalid)
+{
+    CBLAS_ORDER layout = CblasColMajor;
+    const int expected_info = 5;
+
+    /* Only lda carries the INVALID value */
+    const blasint rows = 1;
+    const blasint cols = 1;
+    const blasint lda = INVALID;
+    const blasint ldc = 1;
+
+    int ok = check_badargs('C', layout, rows, cols, lda, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * xerbla check for an invalid lda, the leading dimension of A.
+ * The rejected argument is expected to be reported as number 5.
+ *
+ * row-major order
+ */
+CTEST(zgeadd, c_api_xerbla_lda_invalid_row_major)
+{
+    CBLAS_ORDER layout = CblasRowMajor;
+    const int expected_info = 5;
+
+    /* Only lda carries the INVALID value */
+    const blasint rows = 1;
+    const blasint cols = 1;
+    const blasint lda = INVALID;
+    const blasint ldc = 1;
+
+    int ok = check_badargs('C', layout, rows, cols, lda, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * xerbla check for an invalid ldc, the leading dimension of C,
+ * which must be at least MAX(1, m). Reported as argument number 8.
+ *
+ * column-major order
+ */
+CTEST(zgeadd, c_api_xerbla_ldc_invalid)
+{
+    CBLAS_ORDER layout = CblasColMajor;
+    const int expected_info = 8;
+
+    /* Only ldc carries the INVALID value */
+    const blasint rows = 1;
+    const blasint cols = 1;
+    const blasint lda = 1;
+    const blasint ldc = INVALID;
+
+    int ok = check_badargs('C', layout, rows, cols, lda, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * xerbla check for an invalid ldc, the leading dimension of C.
+ * The rejected argument is expected to be reported as number 8.
+ *
+ * row-major order
+ */
+CTEST(zgeadd, c_api_xerbla_ldc_invalid_row_major)
+{
+    CBLAS_ORDER layout = CblasRowMajor;
+    const int expected_info = 8;
+
+    /* Only ldc carries the INVALID value */
+    const blasint rows = 1;
+    const blasint cols = 1;
+    const blasint lda = 1;
+    const blasint ldc = INVALID;
+
+    int ok = check_badargs('C', layout, rows, cols, lda, ldc, expected_info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Edge case: n, the number of columns of A and C, is zero.
+ *
+ * column-major order
+ */
+CTEST(zgeadd, c_api_n_zero)
+{
+    CBLAS_ORDER layout = CblasColMajor;
+    const blasint rows = 1;
+    const blasint cols = 0;
+
+    /* Leading dimensions are both set to 1 */
+    const blasint lda = 1;
+    const blasint ldc = 1;
+
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    double diff = check_zgeadd('C', layout, rows, cols,
+                               alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Edge case: m, the number of rows of A and C, is zero.
+ *
+ * column-major order
+ */
+CTEST(zgeadd, c_api_m_zero)
+{
+    CBLAS_ORDER layout = CblasColMajor;
+    const blasint rows = 0;
+    const blasint cols = 1;
+
+    /* Leading dimensions are both set to 1 */
+    const blasint lda = 1;
+    const blasint ldc = 1;
+
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    double diff = check_zgeadd('C', layout, rows, cols,
+                               alpha, lda, beta, ldc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_zgemm.c b/utest/test_extensions/test_zgemm.c
new file mode 100644
index 000000000..4160a5086
--- /dev/null
+++ b/utest/test_extensions/test_zgemm.c
@@ -0,0 +1,273 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+#include "common.h"
+
+#define DATASIZE 100
+#define INCREMENT 2
+
+struct DATA_ZGEMM { /* Work buffers for check_zgemm; each holds DATASIZE * DATASIZE * 2 doubles */
+	double a_test[DATASIZE * DATASIZE * 2]; /* A passed to the zgemm call under test */
+    double a_verify[DATASIZE * DATASIZE * 2]; /* copy of A, conjugated in place when transa is 'R' */
+	double b_test[DATASIZE * DATASIZE * 2]; /* B passed to the zgemm call under test */
+    double b_verify[DATASIZE * DATASIZE * 2]; /* copy of B, conjugated in place when transb is 'R' */
+    double c_test[DATASIZE * DATASIZE * 2]; /* C updated by the zgemm call under test */
+	double c_verify[DATASIZE * DATASIZE * 2]; /* C updated by the reference zgemm call */
+};
+
+#ifdef BUILD_COMPLEX16
+static struct DATA_ZGEMM data_zgemm;
+
+/**
+ * Check zgemm's conjugate-no-transpose ('R') option: conjugate the affected
+ * operand explicitly, run zgemm with 'N' in its place and compare with 'R'.
+ *
+ * param transa specifies op(A), the transposition (conjugation) operation applied to A
+ * param transb specifies op(B), the transposition (conjugation) operation applied to B
+ * param m specifies the number of rows of the matrix op(A) and of the matrix C
+ * param n specifies the number of columns of the matrix op(B) and the number of columns of the matrix C
+ * param k specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B)
+ * param alpha - scaling factor for the matrix-matrix product
+ * param lda - leading dimension of matrix A
+ * param ldb - leading dimension of matrix B
+ * param beta - scaling factor for matrix C
+ * param ldc - leading dimension of matrix C
+ * return norm of difference
+ */
+static double check_zgemm(char transa, char transb, blasint m, blasint n, blasint k,
+                         double *alpha, blasint lda, blasint ldb, double *beta, blasint ldc)
+{
+    blasint i;
+    double alpha_conj[] = {1.0, 0.0};
+    char transa_verify = transa;
+    char transb_verify = transb;
+
+    /* Column-major shape of A and B as stored, for the non-transposed case */
+    blasint arows = m, acols = k;
+    blasint brows = k, bcols = n;
+
+    if (transa == 'T' || transa == 'C'){
+        arows = k; acols = m;
+    }
+
+    if (transb == 'T' || transb == 'C'){
+        brows = n; bcols = k;
+    }
+
+    /* A stored matrix spans (columns * leading dimension) complex values */
+    drand_generate(data_zgemm.a_test, acols * lda * 2);
+    drand_generate(data_zgemm.b_test, bcols * ldb * 2);
+    drand_generate(data_zgemm.c_test, n * ldc * 2);
+
+    for (i = 0; i < acols * lda * 2; i++)
+        data_zgemm.a_verify[i] = data_zgemm.a_test[i];
+    for (i = 0; i < bcols * ldb * 2; i++)
+        data_zgemm.b_verify[i] = data_zgemm.b_test[i];
+    for (i = 0; i < n * ldc * 2; i++)
+        data_zgemm.c_verify[i] = data_zgemm.c_test[i];
+
+    /* Replace 'R' by an explicit in-place conjugation (unit scaling) followed by 'N' */
+    if (transa == 'R'){
+        cblas_zimatcopy(CblasColMajor, CblasConjNoTrans, arows, acols, alpha_conj, data_zgemm.a_verify, lda, lda);
+        transa_verify = 'N';
+    }
+    if (transb == 'R'){
+        cblas_zimatcopy(CblasColMajor, CblasConjNoTrans, brows, bcols, alpha_conj, data_zgemm.b_verify, ldb, ldb);
+        transb_verify = 'N';
+    }
+
+    BLASFUNC(zgemm)(&transa_verify, &transb_verify, &m, &n, &k, alpha, data_zgemm.a_verify, &lda,
+                    data_zgemm.b_verify, &ldb, beta, data_zgemm.c_verify, &ldc);
+
+    BLASFUNC(zgemm)(&transa, &transb, &m, &n, &k, alpha, data_zgemm.a_test, &lda,
+                    data_zgemm.b_test, &ldb, beta, data_zgemm.c_test, &ldc);
+
+    return dmatrix_difference(data_zgemm.c_test, data_zgemm.c_verify, m, n, ldc*2);
+}
+
+/**
+ * Check zgemm's conjugate-without-transpose option against zgemm applied
+ * to explicitly conjugated operands.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and transposed
+ * matrix B is conjugate and not transposed
+ */
+CTEST(zgemm, conjtransa_conjnotransb)
+{
+    /* Every extent and leading dimension is DATASIZE */
+    const blasint dim = DATASIZE;
+    double alpha[] = {-2.0, 1.0};
+    double beta[] = {1.0, -1.0};
+
+    /* transa = 'C', transb = 'R' */
+    double diff = check_zgemm('C', 'R', dim, dim, dim,
+                              alpha, dim, dim, beta, dim);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Check zgemm's conjugate-without-transpose option against zgemm applied
+ * to explicitly conjugated operands.
+ * Test with the following options:
+ *
+ * matrix A is not conjugate and not transposed
+ * matrix B is conjugate and not transposed
+ */
+CTEST(zgemm, notransa_conjnotransb)
+{
+    /* Every extent and leading dimension is DATASIZE */
+    const blasint dim = DATASIZE;
+    double alpha[] = {-2.0, 1.0};
+    double beta[] = {1.0, -1.0};
+
+    /* transa = 'N', transb = 'R' */
+    double diff = check_zgemm('N', 'R', dim, dim, dim,
+                              alpha, dim, dim, beta, dim);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Check zgemm's conjugate-without-transpose option against zgemm applied
+ * to explicitly conjugated operands.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not transposed
+ * matrix B is conjugate and transposed
+ */
+CTEST(zgemm, conjnotransa_conjtransb)
+{
+    /* Every extent and leading dimension is DATASIZE */
+    const blasint dim = DATASIZE;
+    double alpha[] = {-2.0, 1.0};
+    double beta[] = {1.0, -1.0};
+
+    /* transa = 'R', transb = 'C' */
+    double diff = check_zgemm('R', 'C', dim, dim, dim,
+                              alpha, dim, dim, beta, dim);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Check zgemm's conjugate-without-transpose option against zgemm applied
+ * to explicitly conjugated operands.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not transposed
+ * matrix B is not conjugate and not transposed
+ */
+CTEST(zgemm, conjnotransa_notransb)
+{
+    /* Every extent and leading dimension is DATASIZE */
+    const blasint dim = DATASIZE;
+    double alpha[] = {-2.0, 1.0};
+    double beta[] = {1.0, -1.0};
+
+    /* transa = 'R', transb = 'N' */
+    double diff = check_zgemm('R', 'N', dim, dim, dim,
+                              alpha, dim, dim, beta, dim);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Check zgemm's conjugate-without-transpose option against zgemm applied
+ * to explicitly conjugated operands.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not transposed
+ * matrix B is conjugate and not transposed
+ */
+CTEST(zgemm, conjnotransa_conjnotransb)
+{
+    /* Every extent and leading dimension is DATASIZE */
+    const blasint dim = DATASIZE;
+    double alpha[] = {-2.0, 1.0};
+    double beta[] = {1.0, -1.0};
+
+    /* transa = 'R', transb = 'R' */
+    double diff = check_zgemm('R', 'R', dim, dim, dim,
+                              alpha, dim, dim, beta, dim);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Check zgemm's conjugate-without-transpose option against zgemm applied
+ * to explicitly conjugated operands.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not transposed
+ * matrix B is transposed
+ */
+CTEST(zgemm, conjnotransa_transb)
+{
+    /* Every extent and leading dimension is DATASIZE */
+    const blasint dim = DATASIZE;
+    double alpha[] = {-2.0, 1.0};
+    double beta[] = {1.0, -1.0};
+
+    /* transa = 'R', transb = 'T' */
+    double diff = check_zgemm('R', 'T', dim, dim, dim,
+                              alpha, dim, dim, beta, dim);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Check zgemm's conjugate-without-transpose option against zgemm applied
+ * to explicitly conjugated operands.
+ * Test with the following options:
+ *
+ * matrix A is transposed
+ * matrix B is conjugate and not transposed
+ */
+CTEST(zgemm, transa_conjnotransb)
+{
+    /* Every extent and leading dimension is DATASIZE */
+    const blasint dim = DATASIZE;
+    double alpha[] = {-2.0, 1.0};
+    double beta[] = {1.0, -1.0};
+
+    /* transa = 'T', transb = 'R' */
+    double diff = check_zgemm('T', 'R', dim, dim, dim,
+                              alpha, dim, dim, beta, dim);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_zgemmt.c b/utest/test_extensions/test_zgemmt.c
new file mode 100644
index 000000000..c55381008
--- /dev/null
+++ b/utest/test_extensions/test_zgemmt.c
@@ -0,0 +1,2010 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+
+struct DATA_ZGEMMT { /* Work buffers for the zgemmt tests; each holds DATASIZE * DATASIZE * 2 doubles */
+    double a_test[DATASIZE * DATASIZE * 2]; /* A, shared by the zgemm and zgemmt calls */
+    double b_test[DATASIZE * DATASIZE * 2]; /* B, shared by the zgemm and zgemmt calls */
+    double c_test[DATASIZE * DATASIZE * 2]; /* C passed to zgemmt (routine under test) */
+    double c_verify[DATASIZE * DATASIZE * 2]; /* expected C: initial C with one triangle taken from c_gemm */
+    double c_gemm[DATASIZE * DATASIZE * 2]; /* C passed to zgemm by zgemmt_trusted */
+};
+
+#ifdef BUILD_COMPLEX16
+static struct DATA_ZGEMMT data_zgemmt;
+
+/**
+ * Build the expected gemmt result from gemm: gemmt performs the gemm
+ * update but writes only the upper or lower triangle of the result matrix
+ *
+ * param api specifies tested api (C or Fortran)
+ * param order specifies row or column major order (for Fortran API column major always)
+ * param uplo specifies whether C's data is stored in its upper or lower triangle
+ * param transa specifies op(A), the transposition operation applied to A
+ * param transb specifies op(B), the transposition operation applied to B
+ * param m - number of rows of op(A), columns of op(B), and columns and rows of C
+ * param k - number of columns of op(A) and rows of op(B)
+ * param alpha - scaling factor for the matrix-matrix product
+ * param lda - leading dimension of A
+ * param ldb - leading dimension of B
+ * param beta - scaling factor for matrix C
+ * param ldc - leading dimension of C
+ */
+static void zgemmt_trusted(char api, enum CBLAS_ORDER order, char uplo, char transa,
+                           char transb, blasint m, blasint k, double *alpha, blasint lda,
+                           blasint ldb, double *beta, blasint ldc)
+{
+    blasint outer, inner, first, last, pos;
+
+    /* c_gemm and c_verify are indexed in doubles, two per matrix entry */
+    const blasint stride = ldc * 2;
+
+    /* Full gemm result goes to c_gemm */
+    if (api != 'F')
+        cblas_zgemm(order, transa, transb, m, m, k, alpha, data_zgemmt.a_test, lda,
+                    data_zgemmt.b_test, ldb, beta, data_zgemmt.c_gemm, ldc);
+    else
+        BLASFUNC(zgemm)(&transa, &transb, &m, &m, &k, alpha, data_zgemmt.a_test, &lda,
+                        data_zgemmt.b_test, &ldb, beta, data_zgemmt.c_gemm, &ldc);
+
+    /* Copy only the selected index range of every slice from c_gemm into c_verify */
+    for (outer = 0; outer < m; outer++) {
+        if (uplo == 'L' || uplo == CblasLower) {
+            first = outer;
+            last = m - 1;
+        } else {
+            first = 0;
+            last = outer;
+        }
+
+        for (inner = first; inner <= last; inner++) {
+            pos = outer * stride + inner * 2;
+            data_zgemmt.c_verify[pos] = data_zgemmt.c_gemm[pos];
+            data_zgemmt.c_verify[pos + 1] = data_zgemmt.c_gemm[pos + 1];
+        }
+    }
+}
+
+/**
+ * Compare results computed by zgemmt and zgemmt_trusted
+ *
+ * param api specifies tested api (C or Fortran)
+ * param order specifies row or column major order (for Fortran API column major always)
+ * param uplo specifies whether C's data is stored in its upper or lower triangle
+ * param transa specifies op(A), the transposition operation applied to A
+ * param transb specifies op(B), the transposition operation applied to B
+ * param m - number of rows of op(A), columns of op(B), and columns and rows of C
+ * param k - number of columns of op(A) and rows of op(B)
+ * param alpha - scaling factor for the matrix-matrix product
+ * param lda - leading dimension of A
+ * param ldb - leading dimension of B
+ * param beta - scaling factor for matrix C
+ * param ldc - leading dimension of C
+ * return dnrm2 of the difference divided by the number of compared doubles
+ */
+static double check_zgemmt(char api, enum CBLAS_ORDER order, char uplo, char transa,
+                          char transb, blasint m, blasint k, double *alpha, blasint lda,
+                          blasint ldb, double *beta, blasint ldc)
+{
+    blasint idx;
+    blasint a_cols;
+    blasint b_cols;
+    blasint unit_inc = 1;
+    blasint size_c = m * ldc * 2;
+
+    /* The trans arguments are compared against both character and CBLAS enum values */
+    int a_trans = (transa == 'T' || transa == 'C' ||
+                   transa == CblasTrans || transa == CblasConjTrans);
+    int b_trans = (transb == 'T' || transb == 'C' ||
+                   transb == CblasTrans || transb == CblasConjTrans);
+
+    /* Number of lda-/ldb-sized slices to fill with random data */
+    if (order == CblasColMajor) {
+        a_cols = a_trans ? m : k;
+        b_cols = b_trans ? k : m;
+    } else {
+        a_cols = a_trans ? k : m;
+        b_cols = b_trans ? m : k;
+    }
+
+    /* Random operands and initial C */
+    drand_generate(data_zgemmt.a_test, a_cols * lda * 2);
+    drand_generate(data_zgemmt.b_test, b_cols * ldb  * 2);
+    drand_generate(data_zgemmt.c_test, size_c);
+
+    /* Reference and expected buffers start from the same C as the tested call */
+    for (idx = 0; idx < size_c; idx++) {
+        data_zgemmt.c_verify[idx] = data_zgemmt.c_test[idx];
+        data_zgemmt.c_gemm[idx] = data_zgemmt.c_test[idx];
+    }
+
+    /* Expected result, assembled from zgemm */
+    zgemmt_trusted(api, order, uplo, transa, transb, m, k, alpha, lda, ldb, beta, ldc);
+
+    /* Routine under test */
+    if (api != 'F')
+        cblas_zgemmt(order, uplo, transa, transb, m, k, alpha, data_zgemmt.a_test, lda,
+                     data_zgemmt.b_test, ldb, beta, data_zgemmt.c_test, ldc);
+    else
+        BLASFUNC(zgemmt)(&uplo, &transa, &transb, &m, &k, alpha, data_zgemmt.a_test,
+                         &lda, data_zgemmt.b_test, &ldb, beta, data_zgemmt.c_test, &ldc);
+
+    /* c_verify becomes the element-wise difference expected - computed */
+    for (idx = 0; idx < size_c; idx++)
+        data_zgemmt.c_verify[idx] -= data_zgemmt.c_test[idx];
+
+    return BLASFUNC(dnrm2)(&size_c, data_zgemmt.c_verify, &unit_inc) / size_c;
+}
+
+/**
+ * Check if error function was called with expected function name and param info
+ * param api specifies tested api (C or Fortran)
+ * param order specifies row or column major order (for Fortran API column major always)
+ * param uplo specifies whether C's data is stored in its upper or lower triangle
+ * param transa specifies op(A), the transposition operation applied to A
+ * param transb specifies op(B), the transposition operation applied to B
+ * param m - number of rows of op(A), columns of op(B), and columns and rows of C
+ * param k - number of columns of op(A) and rows of op(B)
+ * param lda - leading dimension of A
+ * param ldb - leading dimension of B
+ * param ldc - leading dimension of C
+ * param expected_info - expected invalid parameter number in zgemmt
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char api, enum CBLAS_ORDER order, char uplo, char transa,
+                         char transb, blasint m, blasint k, blasint lda, blasint ldb,
+                         blasint ldc, int expected_info)
+{
+    /* Fixed scalars; this helper only returns what check_error() reports */
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {0.0, 0.0};
+
+    set_xerbla("ZGEMMT ", expected_info);
+    if (api != 'F')
+        cblas_zgemmt(order, uplo, transa, transb, m, k, alpha, data_zgemmt.a_test, lda,
+                     data_zgemmt.b_test, ldb, beta, data_zgemmt.c_test, ldc);
+    else
+        BLASFUNC(zgemmt)(&uplo, &transa, &transb, &m, &k, alpha, data_zgemmt.a_test,
+                         &lda, data_zgemmt.b_test, &ldb, beta, data_zgemmt.c_test, &ldc);
+
+    return check_error();
+}
+
+/**
+ * Fortran API specific test
+ * Test zgemmt by comparing it against zgemm
+ * with the following options:
+ *
+ * C's data is stored in its upper triangle part
+ * A not transposed
+ * B not transposed
+ */
+CTEST(zgemmt, upper_M_50_K_50_a_notrans_b_notrans)
+{
+    const blasint m = 50;
+    const blasint k = 50;
+    const blasint lda = 50;
+    const blasint ldb = 50;
+    const blasint ldc = 50;
+    double alpha[] = {1.5, 0.5};
+    double beta[] = {2.0, 1.0};
+
+    double diff = check_zgemmt('F', CblasColMajor, 'U', 'N', 'N',
+                               m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zgemmt by comparing it against zgemm
+ * with the following options:
+ *
+ * C's data is stored in its upper triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(zgemmt, upper_M_50_K_25_a_trans_b_notrans)
+{
+    const blasint m = 50;
+    const blasint k = 25;
+    const blasint lda = 25;
+    const blasint ldb = 25;
+    const blasint ldc = 50;
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {2.0, 1.0};
+
+    double diff = check_zgemmt('F', CblasColMajor, 'U', 'T', 'N',
+                               m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran interface (api = 'F')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Upper triangle of C is stored (uplo = 'U')
+ * A not transposed (transa = 'N')
+ * B transposed (transb = 'T')
+ */
+CTEST(zgemmt, upper_M_25_K_50_a_notrans_b_trans)
+{
+    char uplo = 'U';
+    char transa = 'N', transb = 'T';
+    blasint m = 25, k = 50;
+    blasint lda = 25, ldb = 25, ldc = 25;
+    double alpha[] = {1.0, 1.0}, beta[] = {0.0, 0.0};
+    double residual;
+
+    residual = check_zgemmt('F', CblasColMajor,
+                            uplo, transa, transb,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * Fortran interface (api = 'F')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Upper triangle of C is stored (uplo = 'U')
+ * A transposed (transa = 'T')
+ * B transposed (transb = 'T')
+ */
+CTEST(zgemmt, upper_M_50_K_50_a_trans_b_trans)
+{
+    char uplo = 'U';
+    char transa = 'T', transb = 'T';
+    blasint m = 50, k = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {1.5, 0.5}, beta[] = {2.0, 1.0};
+    double residual;
+
+    residual = check_zgemmt('F', CblasColMajor,
+                            uplo, transa, transb,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * Fortran interface (api = 'F')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Upper triangle of C is stored (uplo = 'U')
+ * A conjugate not transposed (transa = 'R')
+ * B conjugate not transposed (transb = 'R')
+ */
+CTEST(zgemmt, upper_M_25_K_50_a_conjnotrans_b_conjnotrans)
+{
+    char uplo = 'U';
+    char transa = 'R', transb = 'R';
+    blasint m = 25, k = 50;
+    blasint lda = 25, ldb = 50, ldc = 25;
+    double alpha[] = {2.0, 1.0}, beta[] = {1.5, 2.0};
+    double residual;
+
+    residual = check_zgemmt('F', CblasColMajor,
+                            uplo, transa, transb,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * Fortran interface (api = 'F')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Upper triangle of C is stored (uplo = 'U')
+ * A conjugate transposed (transa = 'C')
+ * B conjugate not transposed (transb = 'R')
+ */
+CTEST(zgemmt, upper_M_50_K_50_a_conjtrans_b_conjnotrans)
+{
+    char uplo = 'U';
+    char transa = 'C', transb = 'R';
+    blasint m = 50, k = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {1.0, 1.0}, beta[] = {1.0, 1.0};
+    double residual;
+
+    residual = check_zgemmt('F', CblasColMajor,
+                            uplo, transa, transb,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * Fortran interface (api = 'F')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Upper triangle of C is stored (uplo = 'U')
+ * A conjugate not transposed (transa = 'R')
+ * B conjugate transposed (transb = 'C')
+ */
+CTEST(zgemmt, upper_M_50_K_50_a_conjnotrans_b_conjtrans)
+{
+    char uplo = 'U';
+    char transa = 'R', transb = 'C';
+    blasint m = 50, k = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {2.0, 1.0}, beta[] = {1.5, 2.0};
+    double residual;
+
+    residual = check_zgemmt('F', CblasColMajor,
+                            uplo, transa, transb,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * Fortran interface (api = 'F')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Upper triangle of C is stored (uplo = 'U')
+ * A conjugate transposed (transa = 'C')
+ * B conjugate transposed (transb = 'C')
+ */
+CTEST(zgemmt, upper_M_50_K_25_a_conjtrans_b_conjtrans)
+{
+    char uplo = 'U';
+    char transa = 'C', transb = 'C';
+    blasint m = 50, k = 25;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {2.0, 1.0}, beta[] = {1.5, 2.0};
+    double residual;
+
+    residual = check_zgemmt('F', CblasColMajor,
+                            uplo, transa, transb,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * Fortran interface (api = 'F')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Upper triangle of C is stored (uplo = 'U')
+ * alpha is zero: alpha_r = 0.0, alpha_i = 0.0
+ */
+CTEST(zgemmt, upper_alpha_zero)
+{
+    char uplo = 'U';
+    char transa = 'N', transb = 'N';
+    blasint m = 50, k = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {0.0, 0.0}, beta[] = {2.0, 1.0};
+    double residual;
+
+    residual = check_zgemmt('F', CblasColMajor,
+                            uplo, transa, transb,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * Fortran interface (api = 'F')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Upper triangle of C is stored (uplo = 'U')
+ * beta is one: beta_r = 1.0, beta_i = 0.0
+ */
+CTEST(zgemmt, upper_beta_one)
+{
+    char uplo = 'U';
+    char transa = 'N', transb = 'N';
+    blasint m = 50, k = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {1.0, 1.0}, beta[] = {1.0, 0.0};
+    double residual;
+
+    residual = check_zgemmt('F', CblasColMajor,
+                            uplo, transa, transb,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * Fortran interface (api = 'F')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Lower triangle of C is stored (uplo = 'L')
+ * A not transposed (transa = 'N')
+ * B not transposed (transb = 'N')
+ */
+CTEST(zgemmt, lower_M_50_K_50_a_notrans_b_notrans)
+{
+    char uplo = 'L';
+    char transa = 'N', transb = 'N';
+    blasint m = 50, k = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {1.5, 0.5}, beta[] = {2.0, 1.0};
+    double residual;
+
+    residual = check_zgemmt('F', CblasColMajor,
+                            uplo, transa, transb,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * Fortran interface (api = 'F')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Lower triangle of C is stored (uplo = 'L')
+ * A transposed (transa = 'T')
+ * B not transposed (transb = 'N')
+ */
+CTEST(zgemmt, lower_M_50_K_25_a_trans_b_notrans)
+{
+    char uplo = 'L';
+    char transa = 'T', transb = 'N';
+    blasint m = 50, k = 25;
+    blasint lda = 25, ldb = 25, ldc = 50;
+    double alpha[] = {1.0, 1.0}, beta[] = {2.0, 1.0};
+    double residual;
+
+    residual = check_zgemmt('F', CblasColMajor,
+                            uplo, transa, transb,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * Fortran interface (api = 'F')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Lower triangle of C is stored (uplo = 'L')
+ * A not transposed (transa = 'N')
+ * B transposed (transb = 'T')
+ */
+CTEST(zgemmt, lower_M_25_K_50_a_notrans_b_trans)
+{
+    char uplo = 'L';
+    char transa = 'N', transb = 'T';
+    blasint m = 25, k = 50;
+    blasint lda = 25, ldb = 25, ldc = 25;
+    double alpha[] = {1.0, 1.0}, beta[] = {0.0, 0.0};
+    double residual;
+
+    residual = check_zgemmt('F', CblasColMajor,
+                            uplo, transa, transb,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * Fortran interface (api = 'F')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Lower triangle of C is stored (uplo = 'L')
+ * A transposed (transa = 'T')
+ * B transposed (transb = 'T')
+ */
+CTEST(zgemmt, lower_M_50_K_50_a_trans_b_trans)
+{
+    char uplo = 'L';
+    char transa = 'T', transb = 'T';
+    blasint m = 50, k = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {1.5, 0.5}, beta[] = {2.0, 1.0};
+    double residual;
+
+    residual = check_zgemmt('F', CblasColMajor,
+                            uplo, transa, transb,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * Fortran interface (api = 'F')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Lower triangle of C is stored (uplo = 'L')
+ * A conjugate not transposed (transa = 'R')
+ * B conjugate not transposed (transb = 'R')
+ */
+CTEST(zgemmt, lower_M_25_K_50_a_conjnotrans_b_conjnotrans)
+{
+    char uplo = 'L';
+    char transa = 'R', transb = 'R';
+    blasint m = 25, k = 50;
+    blasint lda = 25, ldb = 50, ldc = 25;
+    double alpha[] = {2.0, 1.0}, beta[] = {1.5, 2.0};
+    double residual;
+
+    residual = check_zgemmt('F', CblasColMajor,
+                            uplo, transa, transb,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * Fortran interface (api = 'F')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Lower triangle of C is stored (uplo = 'L')
+ * A conjugate transposed (transa = 'C')
+ * B conjugate not transposed (transb = 'R')
+ */
+CTEST(zgemmt, lower_M_50_K_50_a_conjtrans_b_conjnotrans)
+{
+    char uplo = 'L';
+    char transa = 'C', transb = 'R';
+    blasint m = 50, k = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {1.0, 1.0}, beta[] = {1.0, 1.0};
+    double residual;
+
+    residual = check_zgemmt('F', CblasColMajor,
+                            uplo, transa, transb,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * Fortran interface (api = 'F')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Lower triangle of C is stored (uplo = 'L')
+ * A conjugate not transposed (transa = 'R')
+ * B conjugate transposed (transb = 'C')
+ */
+CTEST(zgemmt, lower_M_50_K_50_a_conjnotrans_b_conjtrans)
+{
+    char uplo = 'L';
+    char transa = 'R', transb = 'C';
+    blasint m = 50, k = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {2.0, 1.0}, beta[] = {1.5, 2.0};
+    double residual;
+
+    residual = check_zgemmt('F', CblasColMajor,
+                            uplo, transa, transb,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * Fortran interface (api = 'F')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Lower triangle of C is stored (uplo = 'L')
+ * A conjugate transposed (transa = 'C')
+ * B conjugate transposed (transb = 'C')
+ */
+CTEST(zgemmt, lower_M_50_K_25_a_conjtrans_b_conjtrans)
+{
+    char uplo = 'L';
+    char transa = 'C', transb = 'C';
+    blasint m = 50, k = 25;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {2.0, 1.0}, beta[] = {1.5, 2.0};
+    double residual;
+
+    residual = check_zgemmt('F', CblasColMajor,
+                            uplo, transa, transb,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * Fortran interface (api = 'F')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Lower triangle of C is stored (uplo = 'L')
+ * alpha is zero: alpha_r = 0.0, alpha_i = 0.0
+ */
+CTEST(zgemmt, lower_alpha_zero)
+{
+    char uplo = 'L';
+    char transa = 'N', transb = 'N';
+    blasint m = 50, k = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {0.0, 0.0}, beta[] = {2.0, 1.0};
+    double residual;
+
+    residual = check_zgemmt('F', CblasColMajor,
+                            uplo, transa, transb,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * Fortran interface (api = 'F')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Lower triangle of C is stored (uplo = 'L')
+ * beta is one: beta_r = 1.0, beta_i = 0.0
+ */
+CTEST(zgemmt, lower_beta_one)
+{
+    char uplo = 'L';
+    char transa = 'N', transb = 'N';
+    blasint m = 50, k = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {1.0, 1.0}, beta[] = {1.0, 0.0};
+    double residual;
+
+    residual = check_zgemmt('F', CblasColMajor,
+                            uplo, transa, transb,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C interface (api = 'C')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Column major (CblasColMajor)
+ * Upper triangle of C is stored (CblasUpper)
+ * A not transposed (CblasNoTrans)
+ * B not transposed (CblasNoTrans)
+ */
+CTEST(zgemmt, c_api_colmajor_upper_M_50_K_50_a_notrans_b_notrans)
+{
+    blasint m = 50, k = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {1.5, 0.5}, beta[] = {2.0, 1.0};
+    double residual;
+
+    residual = check_zgemmt('C', CblasColMajor, CblasUpper,
+                            CblasNoTrans, CblasNoTrans,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C interface (api = 'C')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Column major (CblasColMajor)
+ * Upper triangle of C is stored (CblasUpper)
+ * A transposed (CblasTrans)
+ * B not transposed (CblasNoTrans)
+ */
+CTEST(zgemmt, c_api_colmajor_upper_M_50_K_25_a_trans_b_notrans)
+{
+    blasint m = 50, k = 25;
+    blasint lda = 25, ldb = 25, ldc = 50;
+    double alpha[] = {1.0, 1.0}, beta[] = {2.0, 1.0};
+    double residual;
+
+    residual = check_zgemmt('C', CblasColMajor, CblasUpper,
+                            CblasTrans, CblasNoTrans,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C interface (api = 'C')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Column major (CblasColMajor)
+ * Upper triangle of C is stored (CblasUpper)
+ * A not transposed (CblasNoTrans)
+ * B transposed (CblasTrans)
+ */
+CTEST(zgemmt, c_api_colmajor_upper_M_25_K_50_a_notrans_b_trans)
+{
+    blasint m = 25, k = 50;
+    blasint lda = 25, ldb = 25, ldc = 25;
+    double alpha[] = {1.0, 1.0}, beta[] = {0.0, 0.0};
+    double residual;
+
+    residual = check_zgemmt('C', CblasColMajor, CblasUpper,
+                            CblasNoTrans, CblasTrans,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C interface (api = 'C')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Column major (CblasColMajor)
+ * Upper triangle of C is stored (CblasUpper)
+ * A transposed (CblasTrans)
+ * B transposed (CblasTrans)
+ */
+CTEST(zgemmt, c_api_colmajor_upper_M_50_K_50_a_trans_b_trans)
+{
+    blasint m = 50, k = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {1.5, 0.5}, beta[] = {2.0, 1.0};
+    double residual;
+
+    residual = check_zgemmt('C', CblasColMajor, CblasUpper,
+                            CblasTrans, CblasTrans,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C interface (api = 'C')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Column major (CblasColMajor)
+ * Upper triangle of C is stored (CblasUpper)
+ * A conjugate not transposed (CblasConjNoTrans)
+ * B conjugate not transposed (CblasConjNoTrans)
+ */
+CTEST(zgemmt, c_api_colmajor_upper_M_25_K_50_a_conjnotrans_b_conjnotrans)
+{
+    blasint m = 25, k = 50;
+    blasint lda = 25, ldb = 50, ldc = 25;
+    double alpha[] = {2.0, 1.0}, beta[] = {1.5, 2.0};
+    double residual;
+
+    residual = check_zgemmt('C', CblasColMajor, CblasUpper,
+                            CblasConjNoTrans, CblasConjNoTrans,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C interface (api = 'C')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Column major (CblasColMajor)
+ * Upper triangle of C is stored (CblasUpper)
+ * A conjugate transposed (CblasConjTrans)
+ * B conjugate not transposed (CblasConjNoTrans)
+ */
+CTEST(zgemmt, c_api_colmajor_upper_M_50_K_50_a_conjtrans_b_conjnotrans)
+{
+    blasint m = 50, k = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {1.0, 1.0}, beta[] = {1.0, 1.0};
+    double residual;
+
+    residual = check_zgemmt('C', CblasColMajor, CblasUpper,
+                            CblasConjTrans, CblasConjNoTrans,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C interface (api = 'C')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Column major (CblasColMajor)
+ * Upper triangle of C is stored (CblasUpper)
+ * A conjugate not transposed (CblasConjNoTrans)
+ * B conjugate transposed (CblasConjTrans)
+ */
+CTEST(zgemmt, c_api_colmajor_upper_M_50_K_50_a_conjnotrans_b_conjtrans)
+{
+    blasint m = 50, k = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {2.0, 1.0}, beta[] = {1.5, 2.0};
+    double residual;
+
+    residual = check_zgemmt('C', CblasColMajor, CblasUpper,
+                            CblasConjNoTrans, CblasConjTrans,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C interface (api = 'C')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Column major (CblasColMajor)
+ * Upper triangle of C is stored (CblasUpper)
+ * A conjugate transposed (CblasConjTrans)
+ * B conjugate transposed (CblasConjTrans)
+ */
+CTEST(zgemmt, c_api_colmajor_upper_M_50_K_25_a_conjtrans_b_conjtrans)
+{
+    blasint m = 50, k = 25;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {2.0, 1.0}, beta[] = {1.5, 2.0};
+    double residual;
+
+    residual = check_zgemmt('C', CblasColMajor, CblasUpper,
+                            CblasConjTrans, CblasConjTrans,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C interface (api = 'C')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Column major (CblasColMajor)
+ * Upper triangle of C is stored (CblasUpper)
+ * alpha is zero: alpha_r = 0.0, alpha_i = 0.0
+ */
+CTEST(zgemmt, c_api_colmajor_upper_alpha_zero)
+{
+    blasint m = 50, k = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {0.0, 0.0}, beta[] = {2.0, 1.0};
+    double residual;
+
+    residual = check_zgemmt('C', CblasColMajor, CblasUpper,
+                            CblasNoTrans, CblasNoTrans,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C interface (api = 'C')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Column major (CblasColMajor)
+ * Upper triangle of C is stored (CblasUpper)
+ * beta is one: beta_r = 1.0, beta_i = 0.0
+ */
+CTEST(zgemmt, c_api_colmajor_upper_beta_one)
+{
+    blasint m = 50, k = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {1.0, 1.0}, beta[] = {1.0, 0.0};
+    double residual;
+
+    residual = check_zgemmt('C', CblasColMajor, CblasUpper,
+                            CblasNoTrans, CblasNoTrans,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C interface (api = 'C')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Column major (CblasColMajor)
+ * Lower triangle of C is stored (CblasLower)
+ * A not transposed (CblasNoTrans)
+ * B not transposed (CblasNoTrans)
+ */
+CTEST(zgemmt, c_api_colmajor_lower_M_50_K_50_a_notrans_b_notrans)
+{
+    blasint m = 50, k = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {1.5, 0.5}, beta[] = {2.0, 1.0};
+    double residual;
+
+    residual = check_zgemmt('C', CblasColMajor, CblasLower,
+                            CblasNoTrans, CblasNoTrans,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C interface (api = 'C')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Column major (CblasColMajor)
+ * Lower triangle of C is stored (CblasLower)
+ * A transposed (CblasTrans)
+ * B not transposed (CblasNoTrans)
+ */
+CTEST(zgemmt, c_api_colmajor_lower_M_50_K_25_a_trans_b_notrans)
+{
+    blasint m = 50, k = 25;
+    blasint lda = 25, ldb = 25, ldc = 50;
+    double alpha[] = {1.0, 1.0}, beta[] = {2.0, 1.0};
+    double residual;
+
+    residual = check_zgemmt('C', CblasColMajor, CblasLower,
+                            CblasTrans, CblasNoTrans,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C interface (api = 'C')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Column major (CblasColMajor)
+ * Lower triangle of C is stored (CblasLower)
+ * A not transposed (CblasNoTrans)
+ * B transposed (CblasTrans)
+ */
+CTEST(zgemmt, c_api_colmajor_lower_M_25_K_50_a_notrans_b_trans)
+{
+    blasint m = 25, k = 50;
+    blasint lda = 25, ldb = 25, ldc = 25;
+    double alpha[] = {1.0, 1.0}, beta[] = {0.0, 0.0};
+    double residual;
+
+    residual = check_zgemmt('C', CblasColMajor, CblasLower,
+                            CblasNoTrans, CblasTrans,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C interface (api = 'C')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Column major (CblasColMajor)
+ * Lower triangle of C is stored (CblasLower)
+ * A transposed (CblasTrans)
+ * B transposed (CblasTrans)
+ */
+CTEST(zgemmt, c_api_colmajor_lower_M_50_K_50_a_trans_b_trans)
+{
+    blasint m = 50, k = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {1.5, 0.5}, beta[] = {2.0, 1.0};
+    double residual;
+
+    residual = check_zgemmt('C', CblasColMajor, CblasLower,
+                            CblasTrans, CblasTrans,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C interface (api = 'C')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Column major (CblasColMajor)
+ * Lower triangle of C is stored (CblasLower)
+ * A conjugate not transposed (CblasConjNoTrans)
+ * B conjugate not transposed (CblasConjNoTrans)
+ */
+CTEST(zgemmt, c_api_colmajor_lower_M_25_K_50_a_conjnotrans_b_conjnotrans)
+{
+    blasint m = 25, k = 50;
+    blasint lda = 25, ldb = 50, ldc = 25;
+    double alpha[] = {2.0, 1.0}, beta[] = {1.5, 2.0};
+    double residual;
+
+    residual = check_zgemmt('C', CblasColMajor, CblasLower,
+                            CblasConjNoTrans, CblasConjNoTrans,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C interface (api = 'C')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Column major (CblasColMajor)
+ * Lower triangle of C is stored (CblasLower)
+ * A conjugate transposed (CblasConjTrans)
+ * B conjugate not transposed (CblasConjNoTrans)
+ */
+CTEST(zgemmt, c_api_colmajor_lower_M_50_K_50_a_conjtrans_b_conjnotrans)
+{
+    blasint m = 50, k = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {2.0, 1.0}, beta[] = {1.5, 2.0};
+    double residual;
+
+    residual = check_zgemmt('C', CblasColMajor, CblasLower,
+                            CblasConjTrans, CblasConjNoTrans,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C interface (api = 'C')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Column major (CblasColMajor)
+ * Lower triangle of C is stored (CblasLower)
+ * A conjugate not transposed (CblasConjNoTrans)
+ * B conjugate transposed (CblasConjTrans)
+ */
+CTEST(zgemmt, c_api_colmajor_lower_M_50_K_50_a_conjnotrans_b_conjtrans)
+{
+    blasint m = 50, k = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {2.0, 1.0}, beta[] = {1.5, 2.0};
+    double residual;
+
+    residual = check_zgemmt('C', CblasColMajor, CblasLower,
+                            CblasConjNoTrans, CblasConjTrans,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C interface (api = 'C')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Column major (CblasColMajor)
+ * Lower triangle of C is stored (CblasLower)
+ * A conjugate transposed (CblasConjTrans)
+ * B conjugate transposed (CblasConjTrans)
+ */
+CTEST(zgemmt, c_api_colmajor_lower_M_50_K_25_a_conjtrans_b_conjtrans)
+{
+    blasint m = 50, k = 25;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {2.0, 1.0}, beta[] = {1.5, 2.0};
+    double residual;
+
+    residual = check_zgemmt('C', CblasColMajor, CblasLower,
+                            CblasConjTrans, CblasConjTrans,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C interface (api = 'C')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Column major (CblasColMajor)
+ * Lower triangle of C is stored (CblasLower)
+ * alpha is zero: alpha_r = 0.0, alpha_i = 0.0
+ */
+CTEST(zgemmt, c_api_colmajor_lower_alpha_zero)
+{
+    blasint m = 50, k = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {0.0, 0.0}, beta[] = {2.0, 1.0};
+    double residual;
+
+    residual = check_zgemmt('C', CblasColMajor, CblasLower,
+                            CblasNoTrans, CblasNoTrans,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C interface (api = 'C')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Column major (CblasColMajor)
+ * Lower triangle of C is stored (CblasLower)
+ * beta is one: beta_r = 1.0, beta_i = 0.0
+ */
+CTEST(zgemmt, c_api_colmajor_lower_beta_one)
+{
+    blasint m = 50, k = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {1.0, 1.0}, beta[] = {1.0, 0.0};
+    double residual;
+
+    residual = check_zgemmt('C', CblasColMajor, CblasLower,
+                            CblasNoTrans, CblasNoTrans,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C interface (api = 'C')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Row major (CblasRowMajor)
+ * Upper triangle of C is stored (CblasUpper)
+ * A not transposed (CblasNoTrans)
+ * B not transposed (CblasNoTrans)
+ */
+CTEST(zgemmt, c_api_rowmajor_upper_M_50_K_50_a_notrans_b_notrans)
+{
+    blasint m = 50, k = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {1.5, 0.5}, beta[] = {2.0, 1.0};
+    double residual;
+
+    residual = check_zgemmt('C', CblasRowMajor, CblasUpper,
+                            CblasNoTrans, CblasNoTrans,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C interface (api = 'C')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Row major (CblasRowMajor)
+ * Upper triangle of C is stored (CblasUpper)
+ * A transposed (CblasTrans)
+ * B not transposed (CblasNoTrans)
+ */
+CTEST(zgemmt, c_api_rowmajor_upper_M_50_K_25_a_trans_b_notrans)
+{
+    blasint m = 50, k = 25;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {1.0, 1.0}, beta[] = {2.0, 1.0};
+    double residual;
+
+    residual = check_zgemmt('C', CblasRowMajor, CblasUpper,
+                            CblasTrans, CblasNoTrans,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C interface (api = 'C')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Row major (CblasRowMajor)
+ * Upper triangle of C is stored (CblasUpper)
+ * A not transposed (CblasNoTrans)
+ * B transposed (CblasTrans)
+ */
+CTEST(zgemmt, c_api_rowmajor_upper_M_25_K_50_a_notrans_b_trans)
+{
+    blasint m = 25, k = 50;
+    blasint lda = 50, ldb = 50, ldc = 25;
+    double alpha[] = {1.0, 1.0}, beta[] = {-1.0, -1.0};
+    double residual;
+
+    residual = check_zgemmt('C', CblasRowMajor, CblasUpper,
+                            CblasNoTrans, CblasTrans,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C interface (api = 'C')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Row major (CblasRowMajor)
+ * Upper triangle of C is stored (CblasUpper)
+ * A transposed (CblasTrans)
+ * B transposed (CblasTrans)
+ */
+CTEST(zgemmt, c_api_rowmajor_upper_M_50_K_50_a_trans_b_trans)
+{
+    blasint m = 50, k = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {1.5, 0.5}, beta[] = {2.0, 1.0};
+    double residual;
+
+    residual = check_zgemmt('C', CblasRowMajor, CblasUpper,
+                            CblasTrans, CblasTrans,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C interface (api = 'C')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Row major (CblasRowMajor)
+ * Upper triangle of C is stored (CblasUpper)
+ * A conjugate not transposed (CblasConjNoTrans)
+ * B conjugate not transposed (CblasConjNoTrans)
+ */
+CTEST(zgemmt, c_api_rowmajor_upper_M_25_K_50_a_conjnotrans_b_conjnotrans)
+{
+    blasint m = 25, k = 50;
+    blasint lda = 50, ldb = 25, ldc = 25;
+    double alpha[] = {2.0, 1.0}, beta[] = {1.5, 2.0};
+    double residual;
+
+    residual = check_zgemmt('C', CblasRowMajor, CblasUpper,
+                            CblasConjNoTrans, CblasConjNoTrans,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C interface (api = 'C')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Row major (CblasRowMajor)
+ * Upper triangle of C is stored (CblasUpper)
+ * A conjugate transposed (CblasConjTrans)
+ * B conjugate not transposed (CblasConjNoTrans)
+ */
+CTEST(zgemmt, c_api_rowmajor_upper_M_50_K_50_a_conjtrans_b_conjnotrans)
+{
+    blasint m = 50, k = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {2.0, 1.0}, beta[] = {1.5, 2.0};
+    double residual;
+
+    residual = check_zgemmt('C', CblasRowMajor, CblasUpper,
+                            CblasConjTrans, CblasConjNoTrans,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C interface (api = 'C')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Row major (CblasRowMajor)
+ * Upper triangle of C is stored (CblasUpper)
+ * A conjugate not transposed (CblasConjNoTrans)
+ * B conjugate transposed (CblasConjTrans)
+ */
+CTEST(zgemmt, c_api_rowmajor_upper_M_50_K_50_a_conjnotrans_b_conjtrans)
+{
+    blasint m = 50, k = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {2.0, 1.0}, beta[] = {1.5, 2.0};
+    double residual;
+
+    residual = check_zgemmt('C', CblasRowMajor, CblasUpper,
+                            CblasConjNoTrans, CblasConjTrans,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C interface (api = 'C')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Row major (CblasRowMajor)
+ * Upper triangle of C is stored (CblasUpper)
+ * A conjugate transposed (CblasConjTrans)
+ * B conjugate transposed (CblasConjTrans)
+ */
+CTEST(zgemmt, c_api_rowmajor_upper_M_25_K_50_a_conjtrans_b_conjtrans)
+{
+    blasint m = 25, k = 50;
+    blasint lda = 25, ldb = 50, ldc = 25;
+    double alpha[] = {2.0, 1.0}, beta[] = {1.5, 2.0};
+    double residual;
+
+    residual = check_zgemmt('C', CblasRowMajor, CblasUpper,
+                            CblasConjTrans, CblasConjTrans,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C interface (api = 'C')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Row major (CblasRowMajor)
+ * Upper triangle of C is stored (CblasUpper)
+ * alpha is zero: alpha_r = 0.0, alpha_i = 0.0
+ */
+CTEST(zgemmt, c_api_rowmajor_upper_alpha_zero)
+{
+    blasint m = 50, k = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {0.0, 0.0}, beta[] = {2.0, 1.0};
+    double residual;
+
+    residual = check_zgemmt('C', CblasRowMajor, CblasUpper,
+                            CblasNoTrans, CblasNoTrans,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C interface (api = 'C')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Row major (CblasRowMajor)
+ * Upper triangle of C is stored (CblasUpper)
+ * beta is one: beta_r = 1.0, beta_i = 0.0
+ */
+CTEST(zgemmt, c_api_rowmajor_upper_beta_one)
+{
+    blasint m = 50, k = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {1.0, 1.0}, beta[] = {1.0, 0.0};
+    double residual;
+
+    residual = check_zgemmt('C', CblasRowMajor, CblasUpper,
+                            CblasNoTrans, CblasNoTrans,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C interface (api = 'C')
+ * Compare zgemmt with its reference result through check_zgemmt;
+ * the returned norm must be zero within DOUBLE_EPS.
+ *
+ * Row major (CblasRowMajor)
+ * Lower triangle of C is stored (CblasLower)
+ * A not transposed (CblasNoTrans)
+ * B not transposed (CblasNoTrans)
+ */
+CTEST(zgemmt, c_api_rowmajor_lower_M_50_K_50_a_notrans_b_notrans)
+{
+    blasint m = 50, k = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    double alpha[] = {1.5, 0.5}, beta[] = {2.0, 1.0};
+    double residual;
+
+    residual = check_zgemmt('C', CblasRowMajor, CblasLower,
+                            CblasNoTrans, CblasNoTrans,
+                            m, k, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, residual, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B not transposed
+ */
+CTEST(zgemmt, c_api_rowmajor_lower_M_50_K_25_a_trans_b_notrans)
+{
+    /* alpha = 1.0 + 1.0i, beta = 2.0 + 1.0i, each stored as {re, im}. */
+    double alpha[] = {1.0, 1.0}, beta[] = {2.0, 1.0};
+    blasint M = 50, K = 25;
+    blasint lda = 50, ldb = 50, ldc = 50;
+
+    double diff_norm = check_zgemmt('C', CblasRowMajor, CblasLower,
+                                    CblasTrans, CblasNoTrans,
+                                    M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A not transposed
+ * B transposed
+ */
+CTEST(zgemmt, c_api_rowmajor_lower_M_25_K_50_a_notrans_b_trans)
+{
+    /* alpha = 1.0 + 1.0i, beta = 0.0 + 0.0i, each stored as {re, im}. */
+    double alpha[] = {1.0, 1.0}, beta[] = {0.0, 0.0};
+    blasint M = 25, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 25;
+
+    double diff_norm = check_zgemmt('C', CblasRowMajor, CblasLower,
+                                    CblasNoTrans, CblasTrans,
+                                    M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A transposed
+ * B transposed
+ */
+CTEST(zgemmt, c_api_rowmajor_lower_M_50_K_50_a_trans_b_trans)
+{
+    /* alpha = 1.5 + 0.5i, beta = 2.0 + 1.0i, each stored as {re, im}. */
+    double alpha[] = {1.5, 0.5}, beta[] = {2.0, 1.0};
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+
+    double diff_norm = check_zgemmt('C', CblasRowMajor, CblasLower,
+                                    CblasTrans, CblasTrans,
+                                    M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A conjugate not transposed
+ * B conjugate not transposed
+ */
+CTEST(zgemmt, c_api_rowmajor_lower_M_25_K_50_a_conjnotrans_b_conjnotrans)
+{
+    /* alpha = 2.0 + 1.0i, beta = 1.5 + 2.0i, each stored as {re, im}. */
+    double alpha[] = {2.0, 1.0}, beta[] = {1.5, 2.0};
+    blasint M = 25, K = 50;
+    blasint lda = 50, ldb = 25, ldc = 25;
+
+    double diff_norm = check_zgemmt('C', CblasRowMajor, CblasLower,
+                                    CblasConjNoTrans, CblasConjNoTrans,
+                                    M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A conjugate transposed
+ * B conjugate not transposed
+ */
+CTEST(zgemmt, c_api_rowmajor_lower_M_50_K_50_a_conjtrans_b_conjnotrans)
+{
+    /* alpha = 2.0 + 1.0i, beta = 1.5 + 2.0i, each stored as {re, im}. */
+    double alpha[] = {2.0, 1.0}, beta[] = {1.5, 2.0};
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+
+    double diff_norm = check_zgemmt('C', CblasRowMajor, CblasLower,
+                                    CblasConjTrans, CblasConjNoTrans,
+                                    M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A conjugate not transposed
+ * B conjugate transposed
+ */
+CTEST(zgemmt, c_api_rowmajor_lower_M_50_K_50_a_conjnotrans_b_conjtrans)
+{
+    /* alpha = 2.0 + 1.0i, beta = 1.5 + 2.0i, each stored as {re, im}. */
+    double alpha[] = {2.0, 1.0}, beta[] = {1.5, 2.0};
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+
+    double diff_norm = check_zgemmt('C', CblasRowMajor, CblasLower,
+                                    CblasConjNoTrans, CblasConjTrans,
+                                    M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * A conjugate transposed
+ * B conjugate transposed
+ */
+CTEST(zgemmt, c_api_rowmajor_lower_M_50_K_25_a_conjtrans_b_conjtrans)
+{
+    /* alpha = 2.0 + 1.0i, beta = 1.5 + 2.0i, each stored as {re, im}. */
+    double alpha[] = {2.0, 1.0}, beta[] = {1.5, 2.0};
+    blasint M = 50, K = 25;
+    blasint lda = 50, ldb = 25, ldc = 50;
+
+    double diff_norm = check_zgemmt('C', CblasRowMajor, CblasLower,
+                                    CblasConjTrans, CblasConjTrans,
+                                    M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * alpha_r = 0.0, alpha_i = 0.0
+ */
+CTEST(zgemmt, c_api_rowmajor_lower_alpha_zero)
+{
+    /* alpha = 0.0 + 0.0i, beta = 2.0 + 1.0i, each stored as {re, im}. */
+    double alpha[] = {0.0, 0.0}, beta[] = {2.0, 1.0};
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+
+    double diff_norm = check_zgemmt('C', CblasRowMajor, CblasLower,
+                                    CblasNoTrans, CblasNoTrans,
+                                    M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zgemmt by comparing it against sgemm
+ * with the following options:
+ *
+ * Row Major
+ * C’s data is stored in its lower triangle part
+ * beta_r = 1.0, beta_i = 0.0
+ */
+CTEST(zgemmt, c_api_rowmajor_lower_beta_one)
+{
+    /* alpha = 1.0 + 1.0i, beta = 1.0 + 0.0i, each stored as {re, im}. */
+    double alpha[] = {1.0, 1.0}, beta[] = {1.0, 0.0};
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+
+    double diff_norm = check_zgemmt('C', CblasRowMajor, CblasLower,
+                                    CblasNoTrans, CblasNoTrans,
+                                    M, K, alpha, lda, ldb, beta, ldc);
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param uplo.
+ * Must be upper (U) or lower (L).
+ */
+CTEST(zgemmt, xerbla_uplo_invalid)
+{
+    /* uplo = 'O' is neither 'U' nor 'L'; the expected info code is 1. */
+    char uplo = 'O', transa = 'N', transb = 'N';
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 1, passed;
+
+    passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param transa.
+ * Must be trans (T/C) or no-trans (N/R).
+ */
+CTEST(zgemmt, xerbla_transa_invalid)
+{
+    /* transa = 'O' is the value under test; the expected info code is 2. */
+    char uplo = 'U', transa = 'O', transb = 'N';
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 2, passed;
+
+    passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param transb.
+ * Must be trans (T/C) or no-trans (N/R).
+ */
+CTEST(zgemmt, xerbla_transb_invalid)
+{
+    /* transb = 'O' is the value under test; the expected info code is 3. */
+    char uplo = 'U', transa = 'N', transb = 'O';
+    blasint M = 50, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 3, passed;
+
+    passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param M.
+ * Must be positive.
+ */
+CTEST(zgemmt, xerbla_m_invalid)
+{
+    /* M = -1 is the value under test; the expected info code is 4. */
+    char uplo = 'U', transa = 'N', transb = 'N';
+    blasint M = -1, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 4, passed;
+
+    passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param K.
+ * Must be positive.
+ */
+CTEST(zgemmt, xerbla_k_invalid)
+{
+    /* K = -1 is the value under test; the expected info code is 5. */
+    char uplo = 'U', transa = 'N', transb = 'N';
+    blasint M = 50, K = -1;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 5, passed;
+
+    passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param lda.
+ * Must be at least K if matrix A is transposed.
+ */
+CTEST(zgemmt, xerbla_lda_invalid)
+{
+    /* transa = 'T' with lda (50) < K (100); the expected info code is 8. */
+    char uplo = 'U', transa = 'T', transb = 'N';
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 100, ldc = 50;
+    int expected_info = 8, passed;
+
+    passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param ldb.
+ * Must be at least K if matrix B is not transposed.
+ */
+CTEST(zgemmt, xerbla_ldb_invalid)
+{
+    /* transb = 'N' with ldb (50) < K (100); the expected info code is 10. */
+    char uplo = 'U', transa = 'N', transb = 'N';
+    blasint M = 50, K = 100;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 10, passed;
+
+    passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Test error function for an invalid param ldc.
+ * Must be at least M.
+ */
+CTEST(zgemmt, xerbla_ldc_invalid)
+{
+    /* ldc (50) < M (100) is the value under test; the expected info code is 13. */
+    char uplo = 'U', transa = 'T', transb = 'N';
+    blasint M = 100, K = 50;
+    blasint lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 13, passed;
+
+    passed = check_badargs('F', CblasColMajor, uplo, transa, transb,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test.
+ * Test error function for an invalid param order.
+ * Must be column or row major.
+ */
+CTEST(zgemmt, xerbla_c_api_major_invalid)
+{
+    /* 'O' is passed as the order instead of CblasRowMajor/CblasColMajor. */
+    blasint M = 50, K = 50, lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 0, passed;
+
+    passed = check_badargs('C', 'O', CblasUpper, CblasNoTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param uplo.
+ * Must be upper or lower.
+ */
+CTEST(zgemmt, xerbla_c_api_colmajor_uplo_invalid)
+{
+    /* 'O' is passed as uplo instead of CblasUpper/CblasLower. */
+    blasint M = 50, K = 50, lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 1, passed;
+
+    passed = check_badargs('C', CblasColMajor, 'O', CblasNoTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param transa.
+ * Must be trans or no-trans.
+ */
+CTEST(zgemmt, xerbla_c_api_colmajor_transa_invalid)
+{
+    /* 'O' is passed as transa instead of a CBLAS transpose value. */
+    blasint M = 50, K = 50, lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 2, passed;
+
+    passed = check_badargs('C', CblasColMajor, CblasUpper, 'O', CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param transb.
+ * Must be trans or no-trans.
+ */
+CTEST(zgemmt, xerbla_c_api_colmajor_transb_invalid)
+{
+    /* 'O' is passed as transb instead of a CBLAS transpose value. */
+    blasint M = 50, K = 50, lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 3, passed;
+
+    passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, 'O',
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param M.
+ * Must be positive.
+ */
+CTEST(zgemmt, xerbla_c_api_colmajor_m_invalid)
+{
+    /* M = -1 is the value under test; the expected info code is 4. */
+    blasint M = -1, K = 50, lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 4, passed;
+
+    passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param K.
+ * Must be positive.
+ */
+CTEST(zgemmt, xerbla_c_api_colmajor_k_invalid)
+{
+    /* K = -1 is the value under test; the expected info code is 5. */
+    blasint M = 50, K = -1, lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 5, passed;
+
+    passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param lda.
+ * Must be at least K if matrix A is transposed.
+ */
+CTEST(zgemmt, xerbla_c_api_colmajor_lda_invalid)
+{
+    /* A is passed as CblasTrans with lda (50) < K (100). */
+    blasint M = 50, K = 100, lda = 50, ldb = 100, ldc = 50;
+    int expected_info = 8, passed;
+
+    passed = check_badargs('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param ldb.
+ * Must be at least K if matrix B is not transposed.
+ */
+CTEST(zgemmt, xerbla_c_api_colmajor_ldb_invalid)
+{
+    /* B is passed as CblasNoTrans with ldb (50) < K (100). */
+    blasint M = 50, K = 100, lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 10, passed;
+
+    passed = check_badargs('C', CblasColMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Column Major
+ * Test error function for an invalid param ldc.
+ * Must be at least M.
+ */
+CTEST(zgemmt, xerbla_c_api_colmajor_ldc_invalid)
+{
+    /* ldc (50) < M (100) is the value under test; the expected info code is 13. */
+    blasint M = 100, K = 50, lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 13, passed;
+
+    passed = check_badargs('C', CblasColMajor, CblasUpper, CblasTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param uplo.
+ * Must be upper or lower.
+ */
+CTEST(zgemmt, xerbla_c_api_rowmajor_uplo_invalid)
+{
+    /* 'O' is passed as uplo instead of CblasUpper/CblasLower. */
+    blasint M = 50, K = 50, lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 1, passed;
+
+    passed = check_badargs('C', CblasRowMajor, 'O', CblasNoTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param transa.
+ * Must be trans or no-trans.
+ */
+CTEST(zgemmt, xerbla_c_api_rowmajor_transa_invalid)
+{
+    /* 'O' is passed as transa instead of a CBLAS transpose value. */
+    blasint M = 50, K = 50, lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 2, passed;
+
+    passed = check_badargs('C', CblasRowMajor, CblasUpper, 'O', CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param transb.
+ * Must be trans or no-trans.
+ */
+CTEST(zgemmt, xerbla_c_api_rowmajor_transb_invalid)
+{
+    /* 'O' is passed as transb instead of a CBLAS transpose value. */
+    blasint M = 50, K = 50, lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 3, passed;
+
+    passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, 'O',
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param M.
+ * Must be positive.
+ */
+CTEST(zgemmt, xerbla_c_api_rowmajor_m_invalid)
+{
+    /* M = -1 is the value under test; the expected info code is 4. */
+    blasint M = -1, K = 50, lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 4, passed;
+
+    passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param K.
+ * Must be positive.
+ */
+CTEST(zgemmt, xerbla_c_api_rowmajor_k_invalid)
+{
+    /* K = -1 is the value under test; the expected info code is 5. */
+    blasint M = 50, K = -1, lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 5, passed;
+
+    passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param lda.
+ * Must be at least K if matrix A is not transposed.
+ */
+CTEST(zgemmt, xerbla_c_api_rowmajor_lda_invalid)
+{
+    /* A is passed as CblasNoTrans with lda (50) < K (100). */
+    blasint M = 50, K = 100, lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 8, passed;
+
+    passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasNoTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param ldb.
+ * Must be at least K if matrix B is transposed.
+ */
+CTEST(zgemmt, xerbla_c_api_rowmajor_ldb_invalid)
+{
+    /* B is passed as CblasTrans with ldb (50) < K (100). */
+    blasint M = 50, K = 100, lda = 50, ldb = 50, ldc = 50;
+    int expected_info = 10, passed;
+
+    passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasTrans, CblasTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * C API specific test. Row Major
+ * Test error function for an invalid param ldc.
+ * Must be at least M.
+ */
+CTEST(zgemmt, xerbla_c_api_rowmajor_ldc_invalid)
+{
+    /* ldc (50) < M (100) is the value under test; the expected info code is 13. */
+    blasint M = 100, K = 50, lda = 100, ldb = 100, ldc = 50;
+    int expected_info = 13, passed;
+
+    passed = check_badargs('C', CblasRowMajor, CblasUpper, CblasTrans, CblasNoTrans,
+                           M, K, lda, ldb, ldc, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_zgemv_n.c b/utest/test_extensions/test_zgemv_n.c
new file mode 100644
index 000000000..903b855e1
--- /dev/null
+++ b/utest/test_extensions/test_zgemv_n.c
@@ -0,0 +1,341 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100    /* largest matrix dimension used by the tests below */
+#define INCREMENT 2     /* largest vector stride used by the tests below */
+
+/* Buffers shared by the zgemv tests; each complex element takes two doubles. */
+struct DATA_ZGEMV_N {
+    double a_test[DATASIZE * DATASIZE * 2];     /* input matrix A */
+    double b_test[DATASIZE * 2 * INCREMENT];    /* input vector b */
+    double c_test[DATASIZE * 2 * INCREMENT];    /* c as updated by the reference code */
+    double c_verify[DATASIZE * 2 * INCREMENT];  /* c as updated by zgemv, then the difference */
+};
+
+#ifdef BUILD_COMPLEX16
+static struct DATA_ZGEMV_N data_zgemv_n;
+
+/**
+ * Reference implementation of the non-transposed zgemv variants:
+ * c := beta * c + alpha * op(A) * op(b), where trans selects
+ * 'N': A, b   'O': A, conj(b)   'R': conj(A), b   'S': conj(A), conj(b)
+ *
+ * param trans - 'N', 'O', 'R' or 'S' as described above
+ * param m - number of rows of A
+ * param n - number of columns of A
+ * param alpha - scaling factor for the matrix-vector product
+ * param a - buffer holding input matrix A
+ * param lda - leading dimension of matrix A
+ * param b - buffer holding input vector b
+ * param inc_b - stride of vector b
+ * param beta - scaling factor for vector c
+ * param c - buffer holding input/output vector c
+ * param inc_c - stride of vector c
+ */
+static void zgemv_n_trusted(char trans, blasint m, blasint n, double *alpha, double *a,
+                          blasint lda, double *b, blasint inc_b, double *beta, double *c,
+                          blasint inc_c)
+{
+    /* Complex values are (re, im) pairs of doubles, so every step is doubled. */
+    const blasint col_step = 2 * lda;
+    const blasint b_step = 2 * inc_b;
+    const blasint c_step = 2 * inc_c;
+    const int b_unmodified = (trans == 'N' || trans == 'R');
+    blasint row, col;
+
+    /* Start from c := beta * c; the products are accumulated on top of it. */
+    BLASFUNC(zscal)(&m, beta, c, &inc_c);
+
+    for (col = 0; col < n; col++) {
+        const double *column = a + col * col_step;
+        const double *b_elem = b + col * b_step;
+        double t_re, t_im;
+
+        /* t = alpha * b[col] for 'N'/'R', conj(alpha) * b[col] otherwise. */
+        if (b_unmodified) {
+            t_re = alpha[0] * b_elem[0] - alpha[1] * b_elem[1];
+            t_im = alpha[0] * b_elem[1] + alpha[1] * b_elem[0];
+        } else {
+            t_re = alpha[0] * b_elem[0] + alpha[1] * b_elem[1];
+            t_im = alpha[0] * b_elem[1] - alpha[1] * b_elem[0];
+        }
+
+        /* Accumulate column col of A, combined with t, into c. */
+        for (row = 0; row < m; row++) {
+            const double *a_elem = column + 2 * row;
+            double *c_elem = c + row * c_step;
+
+            switch (trans) {
+            case 'N': /* c += t * a */
+                c_elem[0] += t_re * a_elem[0] - t_im * a_elem[1];
+                c_elem[1] += t_re * a_elem[1] + t_im * a_elem[0];
+                break;
+            case 'O': /* c += conj(t) * a */
+                c_elem[0] += t_re * a_elem[0] + t_im * a_elem[1];
+                c_elem[1] += t_re * a_elem[1] - t_im * a_elem[0];
+                break;
+            case 'R': /* c += t * conj(a) */
+                c_elem[0] += t_re * a_elem[0] + t_im * a_elem[1];
+                c_elem[1] -= t_re * a_elem[1] - t_im * a_elem[0];
+                break;
+            case 'S': /* c += conj(t * a) */
+                c_elem[0] += t_re * a_elem[0] - t_im * a_elem[1];
+                c_elem[1] -= t_re * a_elem[1] + t_im * a_elem[0];
+                break;
+            default: /* any other trans value adds nothing to c */
+                break;
+            }
+        }
+    }
+}
+
+/**
+ * Compare results computed by zgemv and zgemv_n_trusted
+ *
+ * param trans specifies whether matrix A is conj and/or xconj
+ * param m - number of rows of A
+ * param n - number of columns of A
+ * param alpha - scaling factor for the matrix-vector product
+ * param lda - leading dimension of matrix A
+ * param inc_b - stride of vector b
+ * param beta - scaling factor for vector c
+ * param inc_c - stride of vector c
+ * return norm of the difference between both results
+ */
+static double check_zgemv_n(char trans, blasint m, blasint n, double *alpha, blasint lda,
+                            blasint inc_b, double *beta, blasint inc_c)
+{
+    blasint i;
+
+    /* Complex data: two doubles per element, so A needs 2 * n * lda values. */
+    drand_generate(data_zgemv_n.a_test, 2 * n * lda);
+    drand_generate(data_zgemv_n.b_test, 2 * n * inc_b);
+    drand_generate(data_zgemv_n.c_test, 2 * m * inc_c);
+    for (i = 0; i < m * 2 * inc_c; i++)
+        data_zgemv_n.c_verify[i] = data_zgemv_n.c_test[i];
+
+    zgemv_n_trusted(trans, m, n, alpha, data_zgemv_n.a_test, lda, data_zgemv_n.b_test,
+                  inc_b, beta, data_zgemv_n.c_test, inc_c);
+    BLASFUNC(zgemv)(&trans, &m, &n, alpha, data_zgemv_n.a_test, &lda, data_zgemv_n.b_test,
+                    &inc_b, beta, data_zgemv_n.c_verify, &inc_c);
+    for (i = 0; i < m * 2 * inc_c; i++)
+        data_zgemv_n.c_verify[i] -= data_zgemv_n.c_test[i];
+
+    /* The result vector c has m entries (one per row of A), not n. */
+    return BLASFUNC(dznrm2)(&m, data_zgemv_n.c_verify, &inc_c);
+}
+
+/**
+ * Test zgemv by comparing it against reference
+ * with the following options:
+ *
+ * A is xconj
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 1
+ * Stride of vector c is 1
+ */
+CTEST(zgemv, trans_o_square_matrix)
+{
+    /* 100 x 100 matrix, unit strides, trans = 'O'. */
+    char trans = 'O';
+    blasint m = 100, n = 100, lda = 100;
+    blasint inc_b = 1, inc_c = 1;
+    double alpha[] = {2.0, -1.0}, beta[] = {1.4, 5.0};
+
+    double diff_norm = check_zgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_TOL);
+}
+
+/**
+ * Test zgemv by comparing it against reference
+ * with the following options:
+ *
+ * A is xconj
+ * Number of rows of A is 50
+ * Number of columns of A is 100
+ * Stride of vector b is 1
+ * Stride of vector c is 1
+ */
+CTEST(zgemv, trans_o_rectangular_matrix_rows_less_then_cols)
+{
+    /* 50 x 100 matrix, unit strides, trans = 'O'. */
+    char trans = 'O';
+    blasint m = 50, n = 100, lda = 50;
+    blasint inc_b = 1, inc_c = 1;
+    double alpha[] = {2.0, -1.0}, beta[] = {1.4, 5.0};
+
+    double diff_norm = check_zgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_TOL);
+}
+
+/**
+ * Test zgemv by comparing it against reference
+ * with the following options:
+ *
+ * A is xconj
+ * Number of rows of A is 100
+ * Number of columns of A is 50
+ * Stride of vector b is 1
+ * Stride of vector c is 1
+ */
+CTEST(zgemv, trans_o_rectangular_matrix_cols_less_then_rows)
+{
+    /* 100 x 50 matrix, unit strides, trans = 'O'. */
+    char trans = 'O';
+    blasint m = 100, n = 50, lda = 100;
+    blasint inc_b = 1, inc_c = 1;
+    double alpha[] = {2.0, -1.0}, beta[] = {1.4, 5.0};
+
+    double diff_norm = check_zgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_TOL);
+}
+
+/**
+ * Test zgemv by comparing it against reference
+ * with the following options:
+ *
+ * A is xconj
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 2
+ * Stride of vector c is 2
+ */
+CTEST(zgemv, trans_o_double_strides)
+{
+    /* 100 x 100 matrix, both vector strides equal to 2, trans = 'O'. */
+    char trans = 'O';
+    blasint m = 100, n = 100, lda = 100;
+    blasint inc_b = 2, inc_c = 2;
+    double alpha[] = {2.0, -1.0}, beta[] = {1.4, 5.0};
+
+    double diff_norm = check_zgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_TOL);
+}
+
+/**
+ * Test zgemv by comparing it against reference
+ * with the following options:
+ *
+ * A is xconj and conj
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 1
+ * Stride of vector c is 1
+ */
+CTEST(zgemv, trans_s_square_matrix)
+{
+    /* 100 x 100 matrix, unit strides, trans = 'S'. */
+    char trans = 'S';
+    blasint m = 100, n = 100, lda = 100;
+    blasint inc_b = 1, inc_c = 1;
+    double alpha[] = {1.0, 1.0}, beta[] = {1.4, 5.0};
+
+    double diff_norm = check_zgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_TOL);
+}
+
+/**
+ * Test zgemv by comparing it against reference
+ * with the following options:
+ *
+ * A is xconj and conj
+ * Number of rows of A is 50
+ * Number of columns of A is 100
+ * Stride of vector b is 1
+ * Stride of vector c is 1
+ */
+CTEST(zgemv, trans_s_rectangular_matrix_rows_less_then_cols)
+{
+    /* 50 x 100 matrix, unit strides, trans = 'S'. */
+    char trans = 'S';
+    blasint m = 50, n = 100, lda = 50;
+    blasint inc_b = 1, inc_c = 1;
+    double alpha[] = {2.0, -1.0}, beta[] = {1.4, 5.0};
+
+    double diff_norm = check_zgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_TOL);
+}
+
+/**
+ * Test zgemv by comparing it against reference
+ * with the following options:
+ *
+ * A is xconj and conj
+ * Number of rows of A is 100
+ * Number of columns of A is 50
+ * Stride of vector b is 1
+ * Stride of vector c is 1
+ */
+CTEST(zgemv, trans_s_rectangular_matrix_cols_less_then_rows)
+{
+    /* 100 x 50 matrix, unit strides, trans = 'S'. */
+    char trans = 'S';
+    blasint m = 100, n = 50, lda = 100;
+    blasint inc_b = 1, inc_c = 1;
+    double alpha[] = {2.0, -1.0}, beta[] = {1.4, 0.0};
+
+    double diff_norm = check_zgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_TOL);
+}
+
+/**
+ * Test zgemv by comparing it against reference
+ * with the following options:
+ *
+ * A is xconj and conj
+ * Number of rows and columns of A is 100
+ * Stride of vector b is 2
+ * Stride of vector c is 2
+ */
+CTEST(zgemv, trans_s_double_strides)
+{
+    /* 100 x 100 matrix, both vector strides equal to 2, trans = 'S'. */
+    char trans = 'S';
+    blasint m = 100, n = 100, lda = 100;
+    blasint inc_b = 2, inc_c = 2;
+    double alpha[] = {2.0, -1.0}, beta[] = {1.0, 5.0};
+
+    double diff_norm = check_zgemv_n(trans, m, n, alpha, lda, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_TOL);
+}
+
+#endif
diff --git a/utest/test_extensions/test_zgemv_t.c b/utest/test_extensions/test_zgemv_t.c
new file mode 100644
index 000000000..b2d0b2713
--- /dev/null
+++ b/utest/test_extensions/test_zgemv_t.c
@@ -0,0 +1,1136 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+// Upper bounds used to size the static test buffers below:
+// N x M complex elements for the matrix, INCREMENT as the largest vector stride
+#define N 100
+#define M 100
+#define INCREMENT 2
+
+// Scratch buffers shared by all zgemv_t tests.
+// Every complex element occupies two consecutive doubles (real, imaginary),
+// hence the trailing "* 2" in each size.
+struct DATA_ZGEMV_T {
+    // Matrix A passed to zgemv (filled with random data)
+    double a_test[N * M * 2];
+    // op(A) produced by zomatcopy, input of the reference product
+    double a_verify[N * M * 2];
+    // Vector y passed to zgemv; later overwritten with (zgemv - reference)
+    double y_test[M * INCREMENT * 2];
+    // Copy of y updated by the reference zaxpby
+    double y_verify[M * INCREMENT * 2];
+    // Vector x passed to zgemv (filled with random data)
+    double x_test[M * INCREMENT * 2];
+    // Reference product written by matrix_vector_product
+    double x_verify[M * INCREMENT * 2];
+};
+
+// Tolerance for the norm of the difference between zgemv and the reference:
+// DOUBLE_EPS_ZGEMV = MAX_VAL * NUMBER OF OPERATIONS * DBL_EPSILON
+// DOUBLE_EPS_ZGEMV = 5.0 * O(100 * 100) * 2.2e-16 = 1e-11
+#define DOUBLE_EPS_ZGEMV 1e-11
+
+// Everything below is compiled only when BUILD_COMPLEX16 is defined
+#ifdef BUILD_COMPLEX16
+// Single shared instance of the scratch buffers
+static struct DATA_ZGEMV_T data_zgemv_t;
+
+/**
+ * Reference matrix-vector product used to verify zgemv.
+ *
+ * For k = 0 .. n-1 the unconjugated dot product (cblas_zdotu) of the k-th
+ * run of lda complex elements of data_zgemv_t.a_verify with
+ * data_zgemv_t.x_test is stored in data_zgemv_t.x_verify at stride inc_x.
+ *
+ * param n specifies number of dot products to compute (columns of A)
+ * param m specifies number of rows of A and size of vector x
+ * param lda specifies leading dimension of A; also used as the dot length
+ * param inc_x specifies increment of vector x and of the output vector
+ *
+ * NOTE(review): m is never read; the dot length is lda, which equals m
+ * for every caller visible in this file - confirm before using lda > m.
+ */
+static void matrix_vector_product(blasint n, blasint m, blasint lda, blasint inc_x)
+{
+    blasint k;       // index of the current run of lda elements in a_verify
+    blasint offset;  // complex-element offset of the current output entry
+
+    openblas_complex_double dot;
+
+    for (k = 0, offset = 0; offset < n * inc_x; k++, offset += inc_x)
+    {
+        dot = cblas_zdotu(lda, data_zgemv_t.a_verify + 2 * lda * k, 1,
+                          data_zgemv_t.x_test, inc_x);
+
+        // Store real and imaginary parts side by side
+        data_zgemv_t.x_verify[2 * offset] = CREAL(dot);
+        data_zgemv_t.x_verify[2 * offset + 1] = CIMAG(dot);
+    }
+}
+
+/**
+ * Test zgemv by comparing it against zomatcopy, zaxpby and
+ * reference func matrix_vector_product
+ *
+ * zomatcopy performs operation: op(A), stored in a_verify
+ * matrix_vector_product performs operation: A*x, stored in x_verify
+ * zaxpby performs operation: alpha*x + beta*y, stored in y_verify
+ *
+ * Fortran API (api == 'F') handles trans 'T', 'C', 'U' and 'D'; for 'U' and
+ * 'D' the reference product is conjugated before zaxpby.
+ * C API (any other api value) handles order 'C'/'R' and
+ * trans 'T', 'N', 'C', 'R'.
+ * Other order/trans codes are not validated: the selectors then keep their
+ * column-major / transpose defaults instead of holding indeterminate values.
+ *
+ * param api specifies tested api (C or Fortran)
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation
+ * applied to the matrix A
+ * param m specifies number of rows of A
+ * param n specifies number of columns of A
+ * param alpha specifies scalar alpha
+ * param lda specifies leading dimension of the matrix A
+ * param inc_x specifies increment for vector x
+ * param beta specifies scalar beta
+ * param inc_y specifies increment for vector y
+ * return norm of difference between zgemv and result of reference funcs
+ */
+static double check_zgemv(char api, char order, char trans, blasint m, blasint n, double *alpha,
+                          blasint lda, blasint inc_x, double *beta, blasint inc_y)
+{
+    blasint i;
+
+    // Selectors get defined defaults so that an unhandled order/trans code
+    // can never cause a read of an uninitialized variable
+    enum CBLAS_ORDER corder = CblasColMajor;
+    enum CBLAS_TRANSPOSE ctrans = CblasTrans;
+
+    // Transpose parameters for zomatcopy
+    // zgemv_t perform operation on transposed matrix, no need to transpose a_verify
+    char trans_copy = 'N';
+    enum CBLAS_TRANSPOSE ctrans_copy = CblasNoTrans;
+
+    // Param alpha for zomatcopy, scale on alpha perform zaxpby
+    double alpha_one[] = {1.0, 0.0};
+
+    // memset takes an int fill byte; all-zero bytes give 0.0 doubles
+    memset(data_zgemv_t.x_verify, 0, m * inc_x * 2 * sizeof(double));
+
+    // Fill matrix A, vectors x, y
+    drand_generate(data_zgemv_t.a_test, lda * n * 2);
+    drand_generate(data_zgemv_t.x_test, m * inc_x * 2);
+    drand_generate(data_zgemv_t.y_test, m * inc_y * 2);
+
+    // Copy vector y for reference funcs
+    for (i = 0; i < m * inc_y * 2; i++)
+    {
+        data_zgemv_t.y_verify[i] = data_zgemv_t.y_test[i];
+    }
+
+    if (api == 'F') {
+        // 'C' and 'U' use a conjugated copy of A, 'T' and 'D' a plain copy
+        if (trans == 'T' || trans == 'D') trans_copy = 'N';
+        else if (trans == 'C' || trans == 'U') trans_copy = 'R';
+
+        // Perform operation: op(A)
+        BLASFUNC(zomatcopy)(&order, &trans_copy, &m, &n, alpha_one,
+                            data_zgemv_t.a_test, &lda, data_zgemv_t.a_verify, &lda);
+
+        // Find A*x
+        matrix_vector_product(n, m, lda, inc_x);
+
+        // Find conj(x)
+        if (trans == 'U' || trans == 'D')
+        {
+            zconjugate_vector(m, inc_x, data_zgemv_t.x_verify);
+        }
+
+        // Find alpha*x+beta*y
+        BLASFUNC(zaxpby)(&n, alpha, data_zgemv_t.x_verify, &inc_x, beta,
+                         data_zgemv_t.y_verify, &inc_y);
+
+        BLASFUNC(zgemv)(&trans, &m, &n, alpha, data_zgemv_t.a_test, &lda,
+                        data_zgemv_t.x_test, &inc_x, beta, data_zgemv_t.y_test, &inc_y);
+    }
+    else {
+        if (order == 'C') corder = CblasColMajor;
+        else if (order == 'R') corder = CblasRowMajor;
+
+        // The copy keeps the requested op in row major and uses the
+        // opposite transposition in column major
+        if (trans == 'T') {
+            ctrans = CblasTrans;
+            ctrans_copy = (corder == CblasRowMajor) ? CblasTrans : CblasNoTrans;
+        }
+        else if (trans == 'N') {
+            ctrans = CblasNoTrans;
+            ctrans_copy = (corder == CblasRowMajor) ? CblasNoTrans : CblasTrans;
+        }
+        else if (trans == 'C') {
+            ctrans = CblasConjTrans;
+            ctrans_copy = (corder == CblasRowMajor) ? CblasConjTrans : CblasConjNoTrans;
+        }
+        else if (trans == 'R') {
+            ctrans = CblasConjNoTrans;
+            ctrans_copy = (corder == CblasRowMajor) ? CblasConjNoTrans : CblasConjTrans;
+        }
+
+        // Perform operation: op(A)
+        cblas_zomatcopy(corder, ctrans_copy, m, n, alpha_one, data_zgemv_t.a_test, lda, data_zgemv_t.a_verify, lda);
+
+        // Find A*x
+        matrix_vector_product(n, m, lda, inc_x);
+
+        // Find alpha*x+beta*y
+        cblas_zaxpby(n, alpha, data_zgemv_t.x_verify, inc_x, beta, data_zgemv_t.y_verify, inc_y);
+
+        cblas_zgemv(corder, ctrans, m, n, alpha, data_zgemv_t.a_test,
+                    lda, data_zgemv_t.x_test, inc_x, beta, data_zgemv_t.y_test, inc_y);
+    }
+
+    // Find the differences between output vector calculated by zgemv and reference funcs
+    for (i = 0; i < m * inc_y * 2; i++)
+        data_zgemv_t.y_test[i] -= data_zgemv_t.y_verify[i];
+
+    // Find the norm of differences
+    return cblas_dznrm2(m, data_zgemv_t.y_test, inc_y);
+}
+
+/**
+ * Fortran API helper: call zgemv with the given (invalid) arguments and
+ * report whether the error handler was invoked with routine name "ZGEMV "
+ * and the expected parameter number.
+ *
+ * param order specifies row or column major order (not used by this helper)
+ * param trans specifies op(A), the transposition operation
+ * applied to the matrix A
+ * param m specifies number of rows of A
+ * param n specifies number of columns of A
+ * param lda specifies leading dimension of the matrix A
+ * param inc_x specifies increment for vector x
+ * param inc_y specifies increment for vector y
+ * param expected_info - expected invalid parameter number
+ * return result of check_error(): TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char order, char trans, blasint m, blasint n,
+                         blasint lda, blasint inc_x, blasint inc_y, int expected_info)
+{
+    // One complex element per operand
+    double scale_ax[] = {1.0, 1.0}, scale_y[] = {1.0, 1.0};
+    double mat[] = {1.0, 1.0};
+    double vec_x[] = {1.0, 1.0}, vec_y[] = {1.0, 1.0};
+
+    // Register the expectation before triggering the error
+    set_xerbla("ZGEMV ", expected_info);
+
+    BLASFUNC(zgemv)(&trans, &m, &n, scale_ax, mat, &lda, vec_x,
+                    &inc_x, scale_y, vec_y, &inc_y);
+
+    return check_error();
+}
+
+/**
+ * C API helper: call cblas_zgemv with the given (invalid) arguments and
+ * report whether the error handler was invoked with routine name "ZGEMV "
+ * and the expected parameter number.
+ *
+ * param corder specifies row or column major order
+ * param ctrans specifies op(A), the transposition operation
+ * applied to the matrix A
+ * param m specifies number of rows of A
+ * param n specifies number of columns of A
+ * param lda specifies leading dimension of the matrix A
+ * param inc_x specifies increment for vector x
+ * param inc_y specifies increment for vector y
+ * param expected_info - expected invalid parameter number
+ * return result of check_error(): TRUE if everything is ok, otherwise FALSE
+ */
+static int c_api_check_badargs(CBLAS_ORDER corder, CBLAS_TRANSPOSE ctrans, blasint m, blasint n,
+                               blasint lda, blasint inc_x, blasint inc_y, int expected_info)
+{
+    // One complex element per operand
+    double scale_ax[] = {1.0, 1.0}, scale_y[] = {1.0, 1.0};
+    double mat[] = {1.0, 1.0};
+    double vec_x[] = {1.0, 1.0}, vec_y[] = {1.0, 1.0};
+
+    // Register the expectation before triggering the error
+    set_xerbla("ZGEMV ", expected_info);
+
+    cblas_zgemv(corder, ctrans, m, n, scale_ax, mat, lda,
+                vec_x, inc_x, scale_y, vec_y, inc_y);
+
+    return check_error();
+}
+
+/**
+ * Fortran API specific test
+ * Compare zgemv with the reference for:
+ *
+ * Column Major
+ * Transposition (trans = 'T')
+ * Square matrix 100 x 100, lda = 100
+ * inc x = 1, inc y = 1
+ * alpha_r = 2.0, alpha_i = 1.0
+ * beta_r = 1.0, beta_i = 2.0
+ */
+CTEST(zgemv, colmajor_trans_col_100_row_100_inc_x_1_y_1)
+{
+    const blasint dim = 100;          // rows == cols == lda
+    const blasint stride_x = 1, stride_y = 1;
+    double scale_ax[] = {2.0, 1.0};   // alpha: {re, im}
+    double scale_y[] = {1.0, 2.0};    // beta:  {re, im}
+
+    double diff_norm = check_zgemv('F', 'C', 'T', dim, dim, scale_ax, dim,
+                                   stride_x, scale_y, stride_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * Fortran API specific test
+ * Compare zgemv with the reference for:
+ *
+ * Column Major
+ * Transposition (trans = 'T')
+ * Square matrix 100 x 100, lda = 100
+ * inc x = 2, inc y = 1
+ * alpha_r = 1.0, alpha_i = 1.0
+ * beta_r = 2.0, beta_i = 2.0
+ */
+CTEST(zgemv, colmajor_trans_col_100_row_100_inc_x_2_y_1)
+{
+    const blasint dim = 100;          // rows == cols == lda
+    const blasint stride_x = 2, stride_y = 1;
+    double scale_ax[] = {1.0, 1.0};   // alpha: {re, im}
+    double scale_y[] = {2.0, 2.0};    // beta:  {re, im}
+
+    double diff_norm = check_zgemv('F', 'C', 'T', dim, dim, scale_ax, dim,
+                                   stride_x, scale_y, stride_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * Fortran API specific test
+ * Compare zgemv with the reference for:
+ *
+ * Column Major
+ * Transposition and conjugate A (trans = 'C')
+ * Square matrix 100 x 100, lda = 100
+ * inc x = 1, inc y = 1
+ * alpha_r = 2.0, alpha_i = 1.0
+ * beta_r = 2.0, beta_i = 1.0
+ */
+CTEST(zgemv, colmajor_conjtrans_col_100_row_100_inc_x_1_y_1)
+{
+    const blasint dim = 100;          // rows == cols == lda
+    const blasint stride_x = 1, stride_y = 1;
+    double scale_ax[] = {2.0, 1.0};   // alpha: {re, im}
+    double scale_y[] = {2.0, 1.0};    // beta:  {re, im}
+
+    double diff_norm = check_zgemv('F', 'C', 'C', dim, dim, scale_ax, dim,
+                                   stride_x, scale_y, stride_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * Fortran API specific test
+ * Compare zgemv with the reference for:
+ *
+ * Column Major
+ * Transposition and conjugate A (trans = 'C')
+ * Square matrix 100 x 100, lda = 100
+ * inc x = 1, inc y = 2
+ * alpha_r = 1.0, alpha_i = 1.0
+ * beta_r = 1.0, beta_i = 1.0
+ */
+CTEST(zgemv, colmajor_conjtrans_col_100_row_100_inc_x_1_y_2)
+{
+    const blasint dim = 100;          // rows == cols == lda
+    const blasint stride_x = 1, stride_y = 2;
+    double scale_ax[] = {1.0, 1.0};   // alpha: {re, im}
+    double scale_y[] = {1.0, 1.0};    // beta:  {re, im}
+
+    double diff_norm = check_zgemv('F', 'C', 'C', dim, dim, scale_ax, dim,
+                                   stride_x, scale_y, stride_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * Fortran API specific test
+ * Compare zgemv with the reference for:
+ *
+ * Column Major
+ * Transposition and x conjugate (trans = 'U')
+ * Square matrix 100 x 100, lda = 100
+ * inc x = 1, inc y = 1
+ * alpha_r = 1.0, alpha_i = 1.0
+ * beta_r = 1.0, beta_i = 1.0
+ */
+CTEST(zgemv, colmajor_trans_x_conj_col_100_row_100_inc_x_1_y_1)
+{
+    const blasint dim = 100;          // rows == cols == lda
+    const blasint stride_x = 1, stride_y = 1;
+    double scale_ax[] = {1.0, 1.0};   // alpha: {re, im}
+    double scale_y[] = {1.0, 1.0};    // beta:  {re, im}
+
+    double diff_norm = check_zgemv('F', 'C', 'U', dim, dim, scale_ax, dim,
+                                   stride_x, scale_y, stride_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * Fortran API specific test
+ * Compare zgemv with the reference for:
+ *
+ * Column Major
+ * Transposition and x conjugate (trans = 'U')
+ * Square matrix 100 x 100, lda = 100
+ * inc x = 2, inc y = 2
+ * alpha_r = 1.0, alpha_i = 2.0
+ * beta_r = 1.0, beta_i = 1.0
+ */
+CTEST(zgemv, colmajor_trans_x_conj_col_100_row_100_inc_x_2_y_2)
+{
+    const blasint dim = 100;          // rows == cols == lda
+    const blasint stride_x = 2, stride_y = 2;
+    double scale_ax[] = {1.0, 2.0};   // alpha: {re, im}
+    double scale_y[] = {1.0, 1.0};    // beta:  {re, im}
+
+    double diff_norm = check_zgemv('F', 'C', 'U', dim, dim, scale_ax, dim,
+                                   stride_x, scale_y, stride_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * Fortran API specific test
+ * Compare zgemv with the reference for:
+ *
+ * Column Major
+ * Transposition, conjugate A, conjugate x (trans = 'D')
+ * Square matrix 100 x 100, lda = 100
+ * inc x = 1, inc y = 2
+ * alpha_r = 2.0, alpha_i = 1.0
+ * beta_r = 1.0, beta_i = 2.0
+ */
+CTEST(zgemv, colmajor_conjtrans_x_conj_col_100_row_100_inc_x_1_y_2)
+{
+    const blasint dim = 100;          // rows == cols == lda
+    const blasint stride_x = 1, stride_y = 2;
+    double scale_ax[] = {2.0, 1.0};   // alpha: {re, im}
+    double scale_y[] = {1.0, 2.0};    // beta:  {re, im}
+
+    double diff_norm = check_zgemv('F', 'C', 'D', dim, dim, scale_ax, dim,
+                                   stride_x, scale_y, stride_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * C API specific test
+ * Compare zgemv with the reference for:
+ *
+ * Column Major
+ * Transposition (trans = 'T')
+ * Square matrix 100 x 100, lda = 100
+ * inc x = 1, inc y = 1
+ * alpha_r = 2.0, alpha_i = 1.0
+ * beta_r = 1.0, beta_i = 2.0
+ */
+CTEST(zgemv, c_api_colmajor_trans_col_100_row_100_inc_x_1_y_1)
+{
+    const blasint dim = 100;          // rows == cols == lda
+    const blasint stride_x = 1, stride_y = 1;
+    double scale_ax[] = {2.0, 1.0};   // alpha: {re, im}
+    double scale_y[] = {1.0, 2.0};    // beta:  {re, im}
+
+    // First argument 'C' selects the C API path of check_zgemv
+    double diff_norm = check_zgemv('C', 'C', 'T', dim, dim, scale_ax, dim,
+                                   stride_x, scale_y, stride_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * C API specific test
+ * Compare zgemv with the reference for:
+ *
+ * Column Major
+ * Transposition and conjugate A (trans = 'C')
+ * Square matrix 100 x 100, lda = 100
+ * inc x = 1, inc y = 1
+ * alpha_r = 1.0, alpha_i = 1.0
+ * beta_r = 1.0, beta_i = 2.0
+ */
+CTEST(zgemv, c_api_colmajor_conjtrans_col_100_row_100_inc_x_1_y_1)
+{
+    const blasint dim = 100;          // rows == cols == lda
+    const blasint stride_x = 1, stride_y = 1;
+    double scale_ax[] = {1.0, 1.0};   // alpha: {re, im}
+    double scale_y[] = {1.0, 2.0};    // beta:  {re, im}
+
+    // First argument 'C' selects the C API path of check_zgemv
+    double diff_norm = check_zgemv('C', 'C', 'C', dim, dim, scale_ax, dim,
+                                   stride_x, scale_y, stride_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * C API specific test
+ * Compare zgemv with the reference for:
+ *
+ * Column Major
+ * Transposition and conjugate A (trans = 'C')
+ * Square matrix 100 x 100, lda = 100
+ * inc x = 1, inc y = 2
+ * alpha_r = 1.0, alpha_i = 1.0
+ * beta_r = 1.0, beta_i = 2.0
+ */
+CTEST(zgemv, c_api_colmajor_conjtrans_col_100_row_100_inc_x_1_y_2)
+{
+    const blasint dim = 100;          // rows == cols == lda
+    const blasint stride_x = 1, stride_y = 2;
+    double scale_ax[] = {1.0, 1.0};   // alpha: {re, im}
+    double scale_y[] = {1.0, 2.0};    // beta:  {re, im}
+
+    // First argument 'C' selects the C API path of check_zgemv
+    double diff_norm = check_zgemv('C', 'C', 'C', dim, dim, scale_ax, dim,
+                                   stride_x, scale_y, stride_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * C API specific test
+ * Compare zgemv with the reference for:
+ *
+ * Row Major
+ * No trans (trans = 'N')
+ * Square matrix 100 x 100, lda = 100
+ * inc x = 1, inc y = 1
+ * alpha_r = 2.0, alpha_i = 1.0
+ * beta_r = 1.0, beta_i = 1.0
+ */
+CTEST(zgemv, c_api_rowmajor_notrans_col_100_row_100_inc_x_1_y_1)
+{
+    const blasint dim = 100;          // rows == cols == lda
+    const blasint stride_x = 1, stride_y = 1;
+    double scale_ax[] = {2.0, 1.0};   // alpha: {re, im}
+    double scale_y[] = {1.0, 1.0};    // beta:  {re, im}
+
+    // First argument 'C' selects the C API path of check_zgemv
+    double diff_norm = check_zgemv('C', 'R', 'N', dim, dim, scale_ax, dim,
+                                   stride_x, scale_y, stride_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * C API specific test
+ * Compare zgemv with the reference for:
+ *
+ * Row Major
+ * No trans (trans = 'N')
+ * Square matrix 100 x 100, lda = 100
+ * inc x = 2, inc y = 2
+ * alpha_r = 1.0, alpha_i = 1.0
+ * beta_r = 3.0, beta_i = 1.0
+ */
+CTEST(zgemv, c_api_rowmajor_notrans_col_100_row_100_inc_x_2_y_2)
+{
+    const blasint dim = 100;          // rows == cols == lda
+    const blasint stride_x = 2, stride_y = 2;
+    double scale_ax[] = {1.0, 1.0};   // alpha: {re, im}
+    double scale_y[] = {3.0, 1.0};    // beta:  {re, im}
+
+    // First argument 'C' selects the C API path of check_zgemv
+    double diff_norm = check_zgemv('C', 'R', 'N', dim, dim, scale_ax, dim,
+                                   stride_x, scale_y, stride_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * C API specific test
+ * Compare zgemv with the reference for:
+ *
+ * Row Major
+ * Conjugate (trans = 'R')
+ * Square matrix 100 x 100, lda = 100
+ * inc x = 1, inc y = 1
+ * alpha_r = 1.0, alpha_i = 3.0
+ * beta_r = 1.0, beta_i = 2.5
+ */
+CTEST(zgemv, c_api_rowmajor_conj_col_100_row_100_inc_x_1_y_1)
+{
+    const blasint dim = 100;          // rows == cols == lda
+    const blasint stride_x = 1, stride_y = 1;
+    double scale_ax[] = {1.0, 3.0};   // alpha: {re, im}
+    double scale_y[] = {1.0, 2.5};    // beta:  {re, im}
+
+    // First argument 'C' selects the C API path of check_zgemv
+    double diff_norm = check_zgemv('C', 'R', 'R', dim, dim, scale_ax, dim,
+                                   stride_x, scale_y, stride_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * C API specific test
+ * Compare zgemv with the reference for:
+ *
+ * Row Major
+ * Conjugate (trans = 'R')
+ * Square matrix 100 x 100, lda = 100
+ * inc x = 2, inc y = 1
+ * alpha_r = 1.0, alpha_i = 1.0
+ * beta_r = 1.0, beta_i = 1.5
+ */
+CTEST(zgemv, c_api_rowmajor_conj_col_100_row_100_inc_x_2_y_1)
+{
+    const blasint dim = 100;          // rows == cols == lda
+    const blasint stride_x = 2, stride_y = 1;
+    double scale_ax[] = {1.0, 1.0};   // alpha: {re, im}
+    double scale_y[] = {1.0, 1.5};    // beta:  {re, im}
+
+    // First argument 'C' selects the C API path of check_zgemv
+    double diff_norm = check_zgemv('C', 'R', 'R', dim, dim, scale_ax, dim,
+                                   stride_x, scale_y, stride_y);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * Fortran API specific test
+ * Error handler check for an invalid inc_y (inc_y = 0).
+ * Expected parameter number: 11
+ *
+ * Column major, trans = 'T'
+ */
+CTEST(zgemv, xerbla_invalid_inc_y)
+{
+    const blasint rows = 1, cols = 1, ld_a = 1;
+    const blasint stride_x = 1;
+    const blasint stride_y = 0;   // the invalid argument
+    const int info = 11;
+
+    int ok = check_badargs('C', 'T', rows, cols, ld_a, stride_x, stride_y, info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Error handler check for an invalid inc_y (inc_y = 0).
+ * Expected parameter number: 11
+ *
+ * Column major, CblasTrans
+ */
+CTEST(zgemv, c_api_xerbla_invalid_inc_y_col_major)
+{
+    const enum CBLAS_ORDER layout = CblasColMajor;
+    const enum CBLAS_TRANSPOSE op = CblasTrans;
+    const blasint rows = 1, cols = 1, ld_a = 1;
+    const blasint stride_x = 1;
+    const blasint stride_y = 0;   // the invalid argument
+    const int info = 11;
+
+    int ok = c_api_check_badargs(layout, op, rows, cols, ld_a, stride_x, stride_y, info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Error handler check for an invalid inc_y (inc_y = 0).
+ * Expected parameter number: 11
+ *
+ * Row major, CblasNoTrans
+ */
+CTEST(zgemv, c_api_xerbla_invalid_inc_y_row_major)
+{
+    const enum CBLAS_ORDER layout = CblasRowMajor;
+    const enum CBLAS_TRANSPOSE op = CblasNoTrans;
+    const blasint rows = 1, cols = 1, ld_a = 1;
+    const blasint stride_x = 1;
+    const blasint stride_y = 0;   // the invalid argument
+    const int info = 11;
+
+    int ok = c_api_check_badargs(layout, op, rows, cols, ld_a, stride_x, stride_y, info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Fortran API specific test
+ * Error handler check for an invalid inc_x (inc_x = 0).
+ * Expected parameter number: 8
+ *
+ * Column major, trans = 'T'
+ */
+CTEST(zgemv, xerbla_invalid_inc_x)
+{
+    const blasint rows = 1, cols = 1, ld_a = 1;
+    const blasint stride_x = 0;   // the invalid argument
+    const blasint stride_y = 1;
+    const int info = 8;
+
+    int ok = check_badargs('C', 'T', rows, cols, ld_a, stride_x, stride_y, info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Error handler check for an invalid inc_x (inc_x = 0).
+ * Expected parameter number: 8
+ *
+ * Column major, CblasTrans
+ */
+CTEST(zgemv, c_api_xerbla_invalid_inc_x_col_major)
+{
+    const enum CBLAS_ORDER layout = CblasColMajor;
+    const enum CBLAS_TRANSPOSE op = CblasTrans;
+    const blasint rows = 1, cols = 1, ld_a = 1;
+    const blasint stride_x = 0;   // the invalid argument
+    const blasint stride_y = 1;
+    const int info = 8;
+
+    int ok = c_api_check_badargs(layout, op, rows, cols, ld_a, stride_x, stride_y, info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Test error function for an invalid param inc_x (inc_x = 0).
+ * Expected parameter number: 8
+ *
+ * Row major, CblasNoTrans
+ */
+CTEST(zgemv, c_api_xerbla_invalid_inc_x_row_major)
+{
+    // Row-major layout, as the test name states; the same order/trans pair
+    // is used by the other *_row_major xerbla tests in this file
+    enum CBLAS_ORDER corder = CblasRowMajor;
+    enum CBLAS_TRANSPOSE ctrans = CblasNoTrans;
+
+    blasint m = 1, n = 1;
+    blasint lda = 1;
+
+    // inc_x = 0 is the invalid argument under test
+    blasint inc_x = 0;
+    blasint inc_y = 1;
+
+    int expected_info = 8;
+
+    int passed = c_api_check_badargs(corder, ctrans, m, n, lda, inc_x, inc_y, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Fortran API specific test
+ * Error handler check for an invalid param n (n = INVALID).
+ * Expected parameter number: 3
+ *
+ * Column major, trans = 'T'
+ */
+CTEST(zgemv, xerbla_invalid_n)
+{
+    const blasint rows = 1;
+    const blasint cols = INVALID;   // the invalid argument
+    const blasint ld_a = 1;
+    const blasint stride_x = 1, stride_y = 1;
+    const int info = 3;
+
+    int ok = check_badargs('C', 'T', rows, cols, ld_a, stride_x, stride_y, info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Error handler check for an invalid param n (n = INVALID).
+ * Expected parameter number: 3
+ *
+ * Column major, CblasTrans
+ */
+CTEST(zgemv, c_api_xerbla_invalid_n_col_major)
+{
+    const enum CBLAS_ORDER layout = CblasColMajor;
+    const enum CBLAS_TRANSPOSE op = CblasTrans;
+    const blasint rows = 1;
+    const blasint cols = INVALID;   // the invalid argument
+    const blasint ld_a = 1;
+    const blasint stride_x = 1, stride_y = 1;
+    const int info = 3;
+
+    int ok = c_api_check_badargs(layout, op, rows, cols, ld_a, stride_x, stride_y, info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Error handler check that expects parameter number 3 (param n).
+ *
+ * Row major, CblasNoTrans
+ *
+ * The invalid value is passed as m here.
+ * NOTE(review): presumably the row-major path swaps m and n before
+ * validating - confirm against the cblas_zgemv implementation.
+ */
+CTEST(zgemv, c_api_xerbla_invalid_n_row_major)
+{
+    const enum CBLAS_ORDER layout = CblasRowMajor;
+    const enum CBLAS_TRANSPOSE op = CblasNoTrans;
+    const blasint rows = INVALID;   // the invalid argument
+    const blasint cols = 1;
+    const blasint ld_a = 1;
+    const blasint stride_x = 1, stride_y = 1;
+    const int info = 3;
+
+    int ok = c_api_check_badargs(layout, op, rows, cols, ld_a, stride_x, stride_y, info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Fortran API specific test
+ * Error handler check for an invalid param m (m = INVALID).
+ * Expected parameter number: 2
+ *
+ * Column major, trans = 'T'
+ */
+CTEST(zgemv, xerbla_invalid_m)
+{
+    const blasint rows = INVALID;   // the invalid argument
+    const blasint cols = 1;
+    const blasint ld_a = 1;
+    const blasint stride_x = 1, stride_y = 1;
+    const int info = 2;
+
+    int ok = check_badargs('C', 'T', rows, cols, ld_a, stride_x, stride_y, info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Error handler check for an invalid param m (m = INVALID).
+ * Expected parameter number: 2
+ *
+ * Column major, CblasTrans
+ */
+CTEST(zgemv, c_api_xerbla_invalid_m_col_major)
+{
+    const enum CBLAS_ORDER layout = CblasColMajor;
+    const enum CBLAS_TRANSPOSE op = CblasTrans;
+    const blasint rows = INVALID;   // the invalid argument
+    const blasint cols = 1;
+    const blasint ld_a = 1;
+    const blasint stride_x = 1, stride_y = 1;
+    const int info = 2;
+
+    int ok = c_api_check_badargs(layout, op, rows, cols, ld_a, stride_x, stride_y, info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Error handler check that expects parameter number 2 (param m).
+ *
+ * Row major, CblasNoTrans
+ *
+ * The invalid value is passed as n here.
+ * NOTE(review): presumably the row-major path swaps m and n before
+ * validating - confirm against the cblas_zgemv implementation.
+ */
+CTEST(zgemv, c_api_xerbla_invalid_m_row_major)
+{
+    const enum CBLAS_ORDER layout = CblasRowMajor;
+    const enum CBLAS_TRANSPOSE op = CblasNoTrans;
+    const blasint rows = 1;
+    const blasint cols = INVALID;   // the invalid argument
+    const blasint ld_a = 1;
+    const blasint stride_x = 1, stride_y = 1;
+    const int info = 2;
+
+    int ok = c_api_check_badargs(layout, op, rows, cols, ld_a, stride_x, stride_y, info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Fortran API specific test
+ * Error handler check for an invalid param lda (lda = INVALID).
+ * Expected parameter number: 6
+ *
+ * Column major, trans = 'T'
+ */
+CTEST(zgemv, xerbla_invalid_lda)
+{
+    const blasint rows = 1, cols = 1;
+    const blasint ld_a = INVALID;   // the invalid argument
+    const blasint stride_x = 1, stride_y = 1;
+    const int info = 6;
+
+    int ok = check_badargs('C', 'T', rows, cols, ld_a, stride_x, stride_y, info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Error handler check for an invalid param lda (lda = INVALID).
+ * Expected parameter number: 6
+ *
+ * Column major, CblasTrans
+ */
+CTEST(zgemv, c_api_xerbla_invalid_lda_col_major)
+{
+    const enum CBLAS_ORDER layout = CblasColMajor;
+    const enum CBLAS_TRANSPOSE op = CblasTrans;
+    const blasint rows = 1, cols = 1;
+    const blasint ld_a = INVALID;   // the invalid argument
+    const blasint stride_x = 1, stride_y = 1;
+    const int info = 6;
+
+    int ok = c_api_check_badargs(layout, op, rows, cols, ld_a, stride_x, stride_y, info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Error handler check for an invalid param lda (lda = INVALID).
+ * Expected parameter number: 6
+ *
+ * Row major, CblasNoTrans
+ */
+CTEST(zgemv, c_api_xerbla_invalid_lda_row_major)
+{
+    const enum CBLAS_ORDER layout = CblasRowMajor;
+    const enum CBLAS_TRANSPOSE op = CblasNoTrans;
+    const blasint rows = 1, cols = 1;
+    const blasint ld_a = INVALID;   // the invalid argument
+    const blasint stride_x = 1, stride_y = 1;
+    const int info = 6;
+
+    int ok = c_api_check_badargs(layout, op, rows, cols, ld_a, stride_x, stride_y, info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Fortran API specific test
+ * Error handler check for an invalid param trans (trans = 'Z').
+ * Expected parameter number: 1
+ *
+ * Column major
+ */
+CTEST(zgemv, xerbla_invalid_trans)
+{
+    const char bad_trans = 'Z';   // the invalid argument
+    const blasint rows = 1, cols = 1, ld_a = 1;
+    const blasint stride_x = 1, stride_y = 1;
+    const int info = 1;
+
+    int ok = check_badargs('C', bad_trans, rows, cols, ld_a, stride_x, stride_y, info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Error handler check for an invalid param trans (trans = INVALID).
+ * Expected parameter number: 1
+ *
+ * Column major
+ */
+CTEST(zgemv, c_api_xerbla_invalid_trans_col_major)
+{
+    const enum CBLAS_ORDER layout = CblasColMajor;
+    const enum CBLAS_TRANSPOSE op = INVALID;   // the invalid argument
+    const blasint rows = 1, cols = 1, ld_a = 1;
+    const blasint stride_x = 1, stride_y = 1;
+    const int info = 1;
+
+    int ok = c_api_check_badargs(layout, op, rows, cols, ld_a, stride_x, stride_y, info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Error handler check for an invalid param trans (trans = INVALID).
+ * Expected parameter number: 1
+ *
+ * Row major
+ */
+CTEST(zgemv, c_api_xerbla_invalid_trans_row_major)
+{
+    const enum CBLAS_ORDER layout = CblasRowMajor;
+    const enum CBLAS_TRANSPOSE op = INVALID;   // the invalid argument
+    const blasint rows = 1, cols = 1, ld_a = 1;
+    const blasint stride_x = 1, stride_y = 1;
+    const int info = 1;
+
+    int ok = c_api_check_badargs(layout, op, rows, cols, ld_a, stride_x, stride_y, info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * C API specific test
+ * Error handler check for an invalid param order (order = INVALID).
+ * Expected parameter number: 0
+ *
+ * CblasTrans
+ */
+CTEST(zgemv, c_api_xerbla_invalid_order_col_major)
+{
+    const enum CBLAS_ORDER layout = INVALID;   // the invalid argument
+    const enum CBLAS_TRANSPOSE op = CblasTrans;
+    const blasint rows = 1, cols = 1, ld_a = 1;
+    const blasint stride_x = 1, stride_y = 1;
+    const int info = 0;
+
+    int ok = c_api_check_badargs(layout, op, rows, cols, ld_a, stride_x, stride_y, info);
+
+    ASSERT_EQUAL(TRUE, ok);
+}
+#endif
diff --git a/utest/test_extensions/test_zimatcopy.c b/utest/test_extensions/test_zimatcopy.c
new file mode 100644
index 000000000..8376bc493
--- /dev/null
+++ b/utest/test_extensions/test_zimatcopy.c
@@ -0,0 +1,818 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+
+struct DATA_ZIMATCOPY {
+    double a_test[DATASIZE * DATASIZE * 2];   // matrix buffer handed to zimatcopy (2 doubles per element)
+    double a_verify[DATASIZE * DATASIZE * 2]; // buffer passed to the reference ztranspose/zcopy helpers
+};
+
+#ifdef BUILD_COMPLEX16
+static struct DATA_ZIMATCOPY data_zimatcopy;
+
+/**
+ * Compare results computed by zimatcopy and reference func
+ *
+ * param api specifies tested api (C or Fortran)
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation 
+ * applied to the matrix A
+ * param rows specifies number of rows of A
+ * param cols specifies number of columns of A
+ * param alpha specifies scaling factor for matrix A
+ * param lda_src - leading dimension of the matrix A
+ * param lda_dst - leading dimension of output matrix A
+ * return norm of difference between openblas and reference func
+ */
+static double check_zimatcopy(char api, char order, char trans, blasint rows, blasint cols, double *alpha,
+                             blasint lda_src, blasint lda_dst)
+{
+    blasint m, n;                 // dimensions in the order the reference helpers receive them
+    blasint rows_out, cols_out;   // extent compared at the end; cols_out is doubled (re/im pairs)
+    enum CBLAS_ORDER corder;
+    enum CBLAS_TRANSPOSE ctrans;
+    int conj = -1;                // switched to 1 for the conjugating ops 'C' and 'R'
+
+    if (order == 'C') {           // column major: rows/cols swap roles for the helpers
+        n = rows; m = cols;
+    }
+    else {
+        m = rows; n = cols;
+    }
+
+    if(trans == 'T' || trans == 'C') {
+        rows_out = n; cols_out = m*2;
+        if (trans == 'C')
+            conj = 1;
+    }
+    else {
+        rows_out = m; cols_out = n*2;
+        if (trans == 'R')
+            conj = 1;
+    }
+
+    drand_generate(data_zimatcopy.a_test, lda_src*m*2);
+
+    if (trans == 'T' || trans == 'C') {   // reference result is produced into a_verify
+        ztranspose(m, n, alpha, data_zimatcopy.a_test, lda_src, data_zimatcopy.a_verify, lda_dst, conj);
+    }
+    else {
+        zcopy(m, n, alpha, data_zimatcopy.a_test, lda_src, data_zimatcopy.a_verify, lda_dst, conj);
+    }
+
+    if (api == 'F') {
+        BLASFUNC(zimatcopy)(&order, &trans, &rows, &cols, alpha, data_zimatcopy.a_test,
+                            &lda_src, &lda_dst);
+    }
+    else {   // C API: exhaustive else-chains so corder/ctrans are never read uninitialized
+        if (order == 'C') corder = CblasColMajor;
+        else corder = CblasRowMajor;                     // same fallback as the m/n setup above
+        if (trans == 'T') ctrans = CblasTrans;
+        else if (trans == 'C') ctrans = CblasConjTrans;
+        else if (trans == 'R') ctrans = CblasConjNoTrans;
+        else ctrans = CblasNoTrans;                      // same fallback as the copy branch above
+        cblas_zimatcopy(corder, ctrans, rows, cols, alpha, data_zimatcopy.a_test,
+                    lda_src, lda_dst);
+    }
+
+    // Find the differences between output matrix computed by zimatcopy and reference func
+    return dmatrix_difference(data_zimatcopy.a_test, data_zimatcopy.a_verify, cols_out, rows_out, lda_dst*2);
+}
+
+/**
+ * Check if error function was called with expected function name
+ * and param info
+ *
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation 
+ * applied to the matrix A
+ * param rows specifies number of rows of A
+ * param cols specifies number of columns of A
+ * param lda_src - leading dimension of the matrix A
+ * param lda_dst - leading dimension of output matrix A
+ * param expected_info - expected invalid parameter number
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char order, char trans, blasint rows, blasint cols,
+                          blasint lda_src, blasint lda_dst, int expected_info)
+{
+    double unit_alpha[2] = {1.0, 1.0};  /* fixed scaling factor (1.0, 1.0) */
+    double *a = data_zimatcopy.a_test;
+
+    /* Record the routine name and info value expected from the error handler, then call. */
+    set_xerbla("ZIMATCOPY", expected_info);
+    BLASFUNC(zimatcopy)(&order, &trans, &rows, &cols, unit_alpha, a, &lda_src, &lda_dst);
+
+    return check_error();
+}
+
+/**
+ * Fortran API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(zimatcopy, colmajor_trans_col_100_row_100)
+{
+    const char order = 'C', trans = 'T';
+    const blasint m = 100, n = 100;
+    const blasint lda_src = 100, lda_dst = 100;
+    double alpha[] = {1.0, 2.0};  /* {alpha_r, alpha_i} */
+    double diff_norm;
+
+    diff_norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Square matrix
+ * alpha_r = -3.0, alpha_i = 1.0
+ */
+CTEST(zimatcopy, colmajor_notrans_col_100_row_100)
+{
+    const char order = 'C', trans = 'N';
+    const blasint m = 100, n = 100;
+    const blasint lda_src = 100, lda_dst = 100;
+    double alpha[] = {-3.0, 1.0};  /* {alpha_r, alpha_i} */
+    double diff_norm;
+
+    diff_norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(zimatcopy, colmajor_conj_col_100_row_100)
+{
+    const char order = 'C', trans = 'R';
+    const blasint m = 100, n = 100;
+    const blasint lda_src = 100, lda_dst = 100;
+    double alpha[] = {1.0, 2.0};  /* {alpha_r, alpha_i} */
+    double diff_norm;
+
+    diff_norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and conjugate
+ * alpha_r = 2.0, alpha_i = 1.0
+ */
+CTEST(zimatcopy, colmajor_conjtrans_col_100_row_100)
+{
+    const char order = 'C', trans = 'C';
+    const blasint m = 100, n = 100;
+    const blasint lda_src = 100, lda_dst = 100;
+    double alpha[] = {2.0, 1.0};  /* {alpha_r, alpha_i} */
+    double diff_norm;
+
+    diff_norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Rectangular matrix
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(zimatcopy, colmajor_trans_col_50_row_100)
+{
+    const char order = 'C', trans = 'T';
+    const blasint m = 100, n = 50;
+    const blasint lda_src = 100, lda_dst = 50;
+    double alpha[] = {1.0, 2.0};  /* {alpha_r, alpha_i} */
+    double diff_norm;
+
+    diff_norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Rectangular matrix
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(zimatcopy, colmajor_notrans_col_100_row_50)
+{
+    const char order = 'C', trans = 'N';
+    const blasint m = 50, n = 100;
+    const blasint lda_src = 50, lda_dst = 50;
+    double alpha[] = {1.0, 2.0};  /* {alpha_r, alpha_i} */
+    double diff_norm;
+
+    diff_norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and conjugate
+ * Rectangular matrix
+ * alpha_r = 1.0, alpha_i = 1.0
+ */
+CTEST(zimatcopy, colmajor_conjtrans_col_50_row_100)
+{
+    const char order = 'C', trans = 'C';
+    const blasint m = 100, n = 50;
+    const blasint lda_src = 100, lda_dst = 50;
+    double alpha[] = {1.0, 1.0};  /* {alpha_r, alpha_i} */
+    double diff_norm;
+
+    diff_norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy and conjugate
+ * Rectangular matrix
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(zimatcopy, colmajor_conj_col_100_row_50)
+{
+    const char order = 'C', trans = 'R';
+    const blasint m = 50, n = 100;
+    const blasint lda_src = 50, lda_dst = 50;
+    double alpha[] = {1.0, 2.0};  /* {alpha_r, alpha_i} */
+    double diff_norm;
+
+    diff_norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Square matrix
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(zimatcopy, rowmajor_trans_col_100_row_100)
+{
+    const char order = 'R', trans = 'T';
+    const blasint m = 100, n = 100;
+    const blasint lda_src = 100, lda_dst = 100;
+    double alpha[] = {1.0, 2.0};  /* {alpha_r, alpha_i} */
+    double diff_norm;
+
+    diff_norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Square matrix
+ * alpha_r = 2.0, alpha_i = 3.0
+ */
+CTEST(zimatcopy, rowmajor_notrans_col_100_row_100)
+{
+    const char order = 'R', trans = 'N';
+    const blasint m = 100, n = 100;
+    const blasint lda_src = 100, lda_dst = 100;
+    double alpha[] = {2.0, 3.0};  /* {alpha_r, alpha_i} */
+    double diff_norm;
+
+    diff_norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(zimatcopy, rowmajor_conj_col_100_row_100)
+{
+    const char order = 'R', trans = 'R';
+    const blasint m = 100, n = 100;
+    const blasint lda_src = 100, lda_dst = 100;
+    double alpha[] = {1.0, 2.0};  /* {alpha_r, alpha_i} */
+    double diff_norm;
+
+    diff_norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition and conjugate
+ * alpha_r = 2.0, alpha_i = 1.0
+ */
+CTEST(zimatcopy, rowmajor_conjtrans_col_100_row_100)
+{
+    const char order = 'R', trans = 'C';
+    const blasint m = 100, n = 100;
+    const blasint lda_src = 100, lda_dst = 100;
+    double alpha[] = {2.0, 1.0};  /* {alpha_r, alpha_i} */
+    double diff_norm;
+
+    diff_norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Rectangular matrix
+ * alpha_r = 2.0, alpha_i = 1.0
+ */
+CTEST(zimatcopy, rowmajor_notrans_col_50_row_100)
+{
+    const char order = 'R', trans = 'N';
+    const blasint m = 100, n = 50;
+    const blasint lda_src = 50, lda_dst = 50;
+    double alpha[] = {2.0, 1.0};  /* {alpha_r, alpha_i} */
+    double diff_norm;
+
+    diff_norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Rectangular matrix
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(zimatcopy, rowmajor_trans_col_50_row_100)
+{
+    const char order = 'R', trans = 'T';
+    const blasint m = 100, n = 50;
+    const blasint lda_src = 50, lda_dst = 100;
+    double alpha[] = {1.0, 2.0};  /* {alpha_r, alpha_i} */
+    double diff_norm;
+
+    diff_norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy and conjugate
+ * alpha_r = 1.5, alpha_i = -1.0
+ */
+CTEST(zimatcopy, rowmajor_conj_col_50_row_100)
+{
+    const char order = 'R', trans = 'R';
+    const blasint m = 100, n = 50;
+    const blasint lda_src = 50, lda_dst = 50;
+    double alpha[] = {1.5, -1.0};  /* {alpha_r, alpha_i} */
+    double diff_norm;
+
+    diff_norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(zimatcopy, rowmajor_conjtrans_col_50_row_100)
+{
+    const char order = 'R', trans = 'C';
+    const blasint m = 100, n = 50;
+    const blasint lda_src = 50, lda_dst = 100;
+    double alpha[] = {1.0, 2.0};  /* {alpha_r, alpha_i} */
+    double diff_norm;
+
+    diff_norm = check_zimatcopy('F', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * Square matrix
+ * alpha_r = 3.0, alpha_i = 2.0
+ */
+CTEST(zimatcopy, c_api_colmajor_trans_col_100_row_100)
+{
+    const char order = 'C', trans = 'T';
+    const blasint m = 100, n = 100;
+    const blasint lda_src = 100, lda_dst = 100;
+    double alpha[] = {3.0, 2.0};  /* {alpha_r, alpha_i} */
+    double diff_norm;
+
+    diff_norm = check_zimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * Square matrix
+ * alpha_r = 3.0, alpha_i = 1.5
+ */
+CTEST(zimatcopy, c_api_colmajor_notrans_col_100_row_100)
+{
+    const char order = 'C', trans = 'N';
+    const blasint m = 100, n = 100;
+    const blasint lda_src = 100, lda_dst = 100;
+    double alpha[] = {3.0, 1.5};  /* {alpha_r, alpha_i} */
+    double diff_norm;
+
+    diff_norm = check_zimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * Square matrix
+ * alpha_r = 3.0, alpha_i = 1.0
+ */
+CTEST(zimatcopy, c_api_rowmajor_trans_col_100_row_100)
+{
+    const char order = 'R', trans = 'T';
+    const blasint m = 100, n = 100;
+    const blasint lda_src = 100, lda_dst = 100;
+    double alpha[] = {3.0, 1.0};  /* {alpha_r, alpha_i} */
+    double diff_norm;
+
+    diff_norm = check_zimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(zimatcopy, c_api_colmajor_conj_col_100_row_100)
+{
+    const char order = 'C', trans = 'R';
+    const blasint m = 100, n = 100;
+    const blasint lda_src = 100, lda_dst = 100;
+    double alpha[] = {1.0, 2.0};  /* {alpha_r, alpha_i} */
+    double diff_norm;
+
+    diff_norm = check_zimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and conjugate
+ * alpha_r = 2.0, alpha_i = 1.0
+ */
+CTEST(zimatcopy, c_api_colmajor_conjtrans_col_100_row_100)
+{
+    const char order = 'C', trans = 'C';
+    const blasint m = 100, n = 100;
+    const blasint lda_src = 100, lda_dst = 100;
+    double alpha[] = {2.0, 1.0};  /* {alpha_r, alpha_i} */
+    double diff_norm;
+
+    diff_norm = check_zimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * Square matrix
+ * alpha_r = 1.0, alpha_i = 1.0
+ */
+CTEST(zimatcopy, c_api_rowmajor_notrans_col_100_row_100)
+{
+    const char order = 'R', trans = 'N';
+    const blasint m = 100, n = 100;
+    const blasint lda_src = 100, lda_dst = 100;
+    double alpha[] = {1.0, 1.0};  /* {alpha_r, alpha_i} */
+    double diff_norm;
+
+    diff_norm = check_zimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy and conjugate
+ * alpha_r = 1.5, alpha_i = -1.0
+ */
+CTEST(zimatcopy, c_api_rowmajor_conj_col_100_row_100)
+{
+    const char order = 'R', trans = 'R';
+    const blasint m = 100, n = 100;
+    const blasint lda_src = 100, lda_dst = 100;
+    double alpha[] = {1.5, -1.0};  /* {alpha_r, alpha_i} */
+    double diff_norm;
+
+    diff_norm = check_zimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zimatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(zimatcopy, c_api_rowmajor_conjtrans_col_100_row_100)
+{
+    const char order = 'R', trans = 'C';
+    const blasint m = 100, n = 100;
+    const blasint lda_src = 100, lda_dst = 100;
+    double alpha[] = {1.0, 2.0};  /* {alpha_r, alpha_i} */
+    double diff_norm;
+
+    diff_norm = check_zimatcopy('C', order, trans, m, n, alpha, lda_src, lda_dst);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Test error function for an invalid param order.
+ * Must be column (C) or row major (R).
+ */
+CTEST(zimatcopy, xerbla_invalid_order)
+{
+    const char order = 'O', trans = 'T';
+    const blasint m = 100, n = 100;
+    const blasint lda_src = 100, lda_dst = 100;
+    const int expected_info = 1;  /* number of the rejected parameter */
+    int ok;
+
+    ok = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param trans.
+ * Must be trans (T/C) or no-trans (N/R).
+ */
+CTEST(zimatcopy, xerbla_invalid_trans)
+{
+    const char order = 'C', trans = 'O';
+    const blasint m = 100, n = 100;
+    const blasint lda_src = 100, lda_dst = 100;
+    const int expected_info = 2;  /* number of the rejected parameter */
+    int ok;
+
+    ok = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param lda_src.
+ * If matrices are stored using row major layout, 
+ * lda_src must be at least n.
+ */
+CTEST(zimatcopy, xerbla_rowmajor_invalid_lda)
+{
+    const char order = 'R', trans = 'T';
+    const blasint m = 50, n = 100;
+    const blasint lda_src = 50, lda_dst = 100;  /* lda_src < n */
+    const int expected_info = 7;  /* number of the rejected parameter */
+    int ok;
+
+    ok = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param lda_src.
+ * If matrices are stored using column major layout,
+ * lda_src must be at least m.
+ */
+CTEST(zimatcopy, xerbla_colmajor_invalid_lda)
+{
+    const char order = 'C', trans = 'T';
+    const blasint m = 100, n = 50;
+    const blasint lda_src = 50, lda_dst = 100;  /* lda_src < m */
+    const int expected_info = 7;  /* number of the rejected parameter */
+    int ok;
+
+    ok = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param lda_dst.
+ * If matrices are stored using row major layout and 
+ * there is no transposition, lda_dst must be at least n.
+ */
+CTEST(zimatcopy, xerbla_rowmajor_notrans_invalid_ldb)
+{
+    const char order = 'R', trans = 'N';
+    const blasint m = 50, n = 100;
+    const blasint lda_src = 100, lda_dst = 50;  /* lda_dst < n */
+    const int expected_info = 9;  /* number of the rejected parameter */
+    int ok;
+
+    ok = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param lda_dst.
+ * If matrices are stored using row major layout and 
+ * there is transposition, lda_dst must be at least m.
+ */
+CTEST(zimatcopy, xerbla_rowmajor_trans_invalid_ldb)
+{
+    const char order = 'R', trans = 'T';
+    const blasint m = 100, n = 50;
+    const blasint lda_src = 100, lda_dst = 50;  /* lda_dst < m */
+    const int expected_info = 9;  /* number of the rejected parameter */
+    int ok;
+
+    ok = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param lda_dst.
+ * If matrices are stored using column major layout and 
+ * there is no transposition, lda_dst must be at least m.
+ */
+CTEST(zimatcopy, xerbla_colmajor_notrans_invalid_ldb)
+{
+    const char order = 'C', trans = 'N';
+    const blasint m = 100, n = 50;
+    const blasint lda_src = 100, lda_dst = 50;  /* lda_dst < m */
+    const int expected_info = 9;  /* number of the rejected parameter */
+    int ok;
+
+    ok = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+ * Test error function for an invalid param lda_dst.
+ * If matrices are stored using column major layout and 
+ * there is transposition, lda_dst must be at least n.
+ */
+CTEST(zimatcopy, xerbla_colmajor_trans_invalid_ldb)
+{
+    const char order = 'C', trans = 'T';
+    const blasint m = 50, n = 100;
+    const blasint lda_src = 100, lda_dst = 50;  /* lda_dst < n */
+    const int expected_info = 9;  /* number of the rejected parameter */
+    int ok;
+
+    ok = check_badargs(order, trans, m, n, lda_src, lda_dst, expected_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_zomatcopy.c b/utest/test_extensions/test_zomatcopy.c
new file mode 100644
index 000000000..495831c56
--- /dev/null
+++ b/utest/test_extensions/test_zomatcopy.c
@@ -0,0 +1,713 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+
+struct DATA_ZOMATCOPY {
+    double a_test[DATASIZE * DATASIZE * 2];   // source matrix A passed to zomatcopy (2 doubles per element)
+    double b_test[DATASIZE * DATASIZE * 2];   // destination matrix B passed to zomatcopy
+    double b_verify[DATASIZE * DATASIZE * 2]; // buffer passed to the reference ztranspose/zcopy helpers
+};
+
+#ifdef BUILD_COMPLEX16
+static struct DATA_ZOMATCOPY data_zomatcopy;
+
+/**
+ * Compare results computed by zomatcopy and reference func
+ *
+ * param api specifies tested api (C or Fortran)
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation
+ * applied to the matrix A
+ * param rows - number of rows of A
+ * param cols - number of columns of A
+ * param alpha - scaling factor for matrix B
+ * param lda - leading dimension of the matrix A
+ * param ldb - leading dimension of the matrix B
+ * return norm of difference between openblas and reference func
+ */
+static double check_zomatcopy(char api, char order, char trans, blasint rows, blasint cols, double* alpha,
+                             blasint lda, blasint ldb)
+{
+    blasint b_rows, b_cols;       // extent of B compared at the end; b_cols is doubled (re/im pairs)
+    blasint m, n;                 // dimensions in the order the reference helpers receive them
+    enum CBLAS_ORDER corder;
+    enum CBLAS_TRANSPOSE ctrans;
+    int conj = -1;                // switched to 1 for the conjugating ops 'C' and 'R'
+
+    if (order == 'C') {           // column major: rows/cols swap roles for the helpers
+        m = cols; n = rows;
+    }
+    else {
+        m = rows; n = cols;
+    }
+
+    if(trans == 'T' || trans == 'C') {
+        b_rows = n; b_cols = m*2;
+        if (trans == 'C')
+            conj = 1;
+    }
+    else {
+        b_rows = m; b_cols = n*2;
+        if (trans == 'R')
+            conj = 1;
+    }
+
+    drand_generate(data_zomatcopy.a_test, lda*m*2);
+
+    if (trans == 'T' || trans == 'C') {   // reference result is produced into b_verify
+        ztranspose(m, n, alpha, data_zomatcopy.a_test, lda, data_zomatcopy.b_verify, ldb, conj);
+    }
+    else {
+        zcopy(m, n, alpha, data_zomatcopy.a_test, lda, data_zomatcopy.b_verify, ldb, conj);
+    }
+
+    if (api == 'F') {
+        BLASFUNC(zomatcopy)(&order, &trans, &rows, &cols, alpha, data_zomatcopy.a_test,
+                            &lda, data_zomatcopy.b_test, &ldb);
+    }
+    else {   // C API: exhaustive else-chains so corder/ctrans are never read uninitialized
+        if (order == 'C') corder = CblasColMajor;
+        else corder = CblasRowMajor;                     // same fallback as the m/n setup above
+        if (trans == 'T') ctrans = CblasTrans;
+        else if (trans == 'C') ctrans = CblasConjTrans;
+        else if (trans == 'R') ctrans = CblasConjNoTrans;
+        else ctrans = CblasNoTrans;                      // same fallback as the copy branch above
+        cblas_zomatcopy(corder, ctrans, rows, cols, alpha, data_zomatcopy.a_test,
+                    lda, data_zomatcopy.b_test, ldb);
+    }
+
+    return dmatrix_difference(data_zomatcopy.b_test, data_zomatcopy.b_verify, b_cols, b_rows, ldb*2);
+}
+
+/**
+ * Check if error function was called with expected function name
+ * and param info
+ *
+ * param order specifies row or column major order
+ * param trans specifies op(A), the transposition operation
+ * applied to the matrix A
+ * param rows - number of rows of A
+ * param cols - number of columns of A
+ * param lda - leading dimension of the matrix A
+ * param ldb - leading dimension of the matrix B
+ * param expected_info - expected invalid parameter number
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char order, char trans, blasint rows, blasint cols,
+                          blasint lda, blasint ldb, int expected_info)
+{
+    double unit_alpha[2] = {1.0, 1.0};  /* fixed scaling factor (1.0, 1.0) */
+    double *a = data_zomatcopy.a_test, *b = data_zomatcopy.b_test;
+
+    /* Record the routine name and info value expected from the error handler, then call. */
+    set_xerbla("ZOMATCOPY", expected_info);
+    BLASFUNC(zomatcopy)(&order, &trans, &rows, &cols, unit_alpha, a, &lda, b, &ldb);
+
+    return check_error();
+}
+
+/**
+ * Fortran API specific test
+ * Test zomatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy only
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(zomatcopy, colmajor_notrans_col_50_row_100)
+{
+    const char order = 'C';
+    const char trans = 'N';
+    const blasint m = 100;
+    const blasint n = 50;
+    const blasint lda = 100, ldb = 100;
+    double alpha[] = {1.0, 2.0};  /* {alpha_r, alpha_i} */
+
+    const double diff_norm = check_zomatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zomatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition
+ * alpha_r = -1.0, alpha_i = 2.0
+ */
+CTEST(zomatcopy, colmajor_trans_col_50_row_100)
+{
+    const char order = 'C';
+    const char trans = 'T';
+    const blasint m = 100;
+    const blasint n = 50;
+    const blasint lda = 100, ldb = 50;
+    double alpha[] = {-1.0, 2.0};  /* {alpha_r, alpha_i} */
+
+    const double diff_norm = check_zomatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zomatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Copy and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(zomatcopy, colmajor_conj_col_100_row_100)
+{
+    const char order = 'C';
+    const char trans = 'R';
+    const blasint m = 100;
+    const blasint n = 100;
+    const blasint lda = 100, ldb = 100;
+    double alpha[] = {1.0, 2.0};  /* {alpha_r, alpha_i} */
+
+    const double diff_norm = check_zomatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zomatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Column Major
+ * Transposition and conjugate
+ * alpha_r = 2.0, alpha_i = 1.0
+ */
+CTEST(zomatcopy, colmajor_conjtrnas_col_100_row_100)
+{
+    const char order = 'C';
+    const char trans = 'C';
+    const blasint m = 100;
+    const blasint n = 100;
+    const blasint lda = 100, ldb = 100;
+    double alpha[] = {2.0, 1.0};  /* {alpha_r, alpha_i} */
+
+    const double diff_norm = check_zomatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zomatcopy by comparing it against the reference
+ * implementation
+ * with the following options:
+ *
+ * Row Major
+ * Copy only
+ * alpha_r = 1.5, alpha_i = -1.0
+ */
+CTEST(zomatcopy, rowmajor_notrans_col_50_row_100)
+{
+    const char order = 'R';
+    const char trans = 'N';
+    const blasint m = 100;
+    const blasint n = 50;
+    const blasint lda = 50, ldb = 50;
+    double alpha[] = {1.5, -1.0};  /* {alpha_r, alpha_i} */
+
+    const double diff_norm = check_zomatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zomatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Transposition
+ * alpha_r = 1.5, alpha_i = -1.0
+ */
+CTEST(zomatcopy, rowmajor_trans_col_100_row_50)
+{
+    const char order = 'R';
+    const char trans = 'T';
+    const blasint m = 50;
+    const blasint n = 100;
+    const blasint lda = 100, ldb = 50;
+    double alpha[] = {1.5, -1.0};  /* {alpha_r, alpha_i} */
+
+    const double diff_norm = check_zomatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zomatcopy by comparing it against reference
+ * with the following options:
+ *
+ * Row Major
+ * Copy and conjugate
+ * alpha_r = 1.5, alpha_i = -1.0
+ */
+CTEST(zomatcopy, rowmajor_conj_col_100_row_100)
+{
+    const char order = 'R';
+    const char trans = 'R';
+    const blasint m = 100;
+    const blasint n = 100;
+    const blasint lda = 100, ldb = 100;
+    double alpha[] = {1.5, -1.0};  /* {alpha_r, alpha_i} */
+
+    const double diff_norm = check_zomatcopy('F', order, trans, m, n, alpha, lda, ldb);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Verify zomatcopy against a reference result
+ * for this configuration:
+ *
+ * Layout: row major
+ * Operation: transposition and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(zomatcopy, rowmajor_conjtrans_col_100_row_100)
+{
+    char layout = 'R', op = 'C';
+    blasint rows = 100, cols = 100;
+    blasint ld_in = 100, ld_out = 100;
+    double scale[] = {1.0, 2.0};
+
+    /* 'F' requests the Fortran API path, per the test description */
+    double diff = check_zomatcopy('F', layout, op, rows, cols,
+                                  scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Verify zomatcopy against a reference result
+ * for this configuration:
+ *
+ * Layout: column major
+ * Operation: copy only
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(zomatcopy, c_api_colmajor_notrans_col_50_row_100)
+{
+    char layout = 'C', op = 'N';
+    blasint rows = 100, cols = 50;
+    blasint ld_in = 100, ld_out = 100;
+    double scale[] = {1.0, 2.0};
+
+    /* 'C' requests the C API path, per the test description */
+    double diff = check_zomatcopy('C', layout, op, rows, cols,
+                                  scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Verify zomatcopy against a reference result
+ * for this configuration:
+ *
+ * Layout: column major
+ * Operation: transposition
+ * alpha_r = -1.0, alpha_i = 2.0
+ */
+CTEST(zomatcopy, c_api_colmajor_trans_col_50_row_100)
+{
+    char layout = 'C', op = 'T';
+    blasint rows = 100, cols = 50;
+    blasint ld_in = 100, ld_out = 50;
+    double scale[] = {-1.0, 2.0};
+
+    /* 'C' requests the C API path, per the test description */
+    double diff = check_zomatcopy('C', layout, op, rows, cols,
+                                  scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Verify zomatcopy against a reference result
+ * for this configuration:
+ *
+ * Layout: column major
+ * Operation: copy and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(zomatcopy, c_api_colmajor_conj_col_100_row_100)
+{
+    char layout = 'C', op = 'R';
+    blasint rows = 100, cols = 100;
+    blasint ld_in = 100, ld_out = 100;
+    double scale[] = {1.0, 2.0};
+
+    /* 'C' requests the C API path, per the test description */
+    double diff = check_zomatcopy('C', layout, op, rows, cols,
+                                  scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Verify zomatcopy against a reference result
+ * for this configuration:
+ *
+ * Layout: column major
+ * Operation: transposition and conjugate
+ * alpha_r = 2.0, alpha_i = 1.0
+ */
+CTEST(zomatcopy, c_api_colmajor_conjtrnas_col_100_row_100)
+{
+    char layout = 'C', op = 'C';
+    blasint rows = 100, cols = 100;
+    blasint ld_in = 100, ld_out = 100;
+    double scale[] = {2.0, 1.0};
+
+    /* 'C' requests the C API path, per the test description */
+    double diff = check_zomatcopy('C', layout, op, rows, cols,
+                                  scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Verify zomatcopy against a reference result
+ * for this configuration:
+ *
+ * Layout: row major
+ * Operation: copy only
+ * alpha_r = 1.5, alpha_i = -1.0
+ */
+CTEST(zomatcopy, c_api_rowmajor_notrans_col_50_row_100)
+{
+    char layout = 'R', op = 'N';
+    blasint rows = 100, cols = 50;
+    blasint ld_in = 50, ld_out = 50;
+    double scale[] = {1.5, -1.0};
+
+    /* 'C' requests the C API path, per the test description */
+    double diff = check_zomatcopy('C', layout, op, rows, cols,
+                                  scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Verify zomatcopy against a reference result
+ * for this configuration:
+ *
+ * Layout: row major
+ * Operation: transposition
+ * alpha_r = 1.5, alpha_i = -1.0
+ */
+CTEST(zomatcopy, c_api_rowmajor_trans_col_100_row_50)
+{
+    char layout = 'R', op = 'T';
+    blasint rows = 50, cols = 100;
+    blasint ld_in = 100, ld_out = 50;
+    double scale[] = {1.5, -1.0};
+
+    /* 'C' requests the C API path, per the test description */
+    double diff = check_zomatcopy('C', layout, op, rows, cols,
+                                  scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Verify zomatcopy against a reference result
+ * for this configuration:
+ *
+ * Layout: row major
+ * Operation: copy and conjugate
+ * alpha_r = 1.5, alpha_i = -1.0
+ */
+CTEST(zomatcopy, c_api_rowmajor_conj_col_100_row_100)
+{
+    char layout = 'R', op = 'R';
+    blasint rows = 100, cols = 100;
+    blasint ld_in = 100, ld_out = 100;
+    double scale[] = {1.5, -1.0};
+
+    /* 'C' requests the C API path, per the test description */
+    double diff = check_zomatcopy('C', layout, op, rows, cols,
+                                  scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Verify zomatcopy against a reference result
+ * for this configuration:
+ *
+ * Layout: row major
+ * Operation: transposition and conjugate
+ * alpha_r = 1.0, alpha_i = 2.0
+ */
+CTEST(zomatcopy, c_api_rowmajor_conjtrans_col_100_row_100)
+{
+    char layout = 'R', op = 'C';
+    blasint rows = 100, cols = 100;
+    blasint ld_in = 100, ld_out = 100;
+    double scale[] = {1.0, 2.0};
+
+    /* 'C' requests the C API path, per the test description */
+    double diff = check_zomatcopy('C', layout, op, rows, cols,
+                                  scale, ld_in, ld_out);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+* xerbla check for a bad order argument (expected info 1).
+* 'O' is neither column major (C) nor row major (R).
+*/
+CTEST(zomatcopy, xerbla_invalid_order)
+{
+    const int bad_arg_info = 1;
+    char layout = 'O', op = 'T';
+    blasint rows = 100, cols = 100;
+    blasint ld_in = 100, ld_out = 100;
+    int ok;
+
+    ok = check_badargs(layout, op, rows, cols, ld_in, ld_out, bad_arg_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+* xerbla check for a bad trans argument (expected info 2).
+* 'O' is neither trans (T/C) nor no-trans (N/R).
+*/
+CTEST(zomatcopy, xerbla_invalid_trans)
+{
+    const int bad_arg_info = 2;
+    char layout = 'C', op = 'O';
+    blasint rows = 100, cols = 100;
+    blasint ld_in = 100, ld_out = 100;
+    int ok;
+
+    ok = check_badargs(layout, op, rows, cols, ld_in, ld_out, bad_arg_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+* xerbla check for a bad lda (expected info 7).
+* With row major storage lda must be at least n;
+* here lda = 50 while n = 100.
+*/
+CTEST(zomatcopy, xerbla_rowmajor_invalid_lda)
+{
+    const int bad_arg_info = 7;
+    char layout = 'R', op = 'T';
+    blasint rows = 50, cols = 100;
+    blasint ld_in = 50, ld_out = 100;
+    int ok;
+
+    ok = check_badargs(layout, op, rows, cols, ld_in, ld_out, bad_arg_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+* xerbla check for a bad lda (expected info 7).
+* With column major storage lda must be at least m;
+* here lda = 50 while m = 100.
+*/
+CTEST(zomatcopy, xerbla_colmajor_invalid_lda)
+{
+    const int bad_arg_info = 7;
+    char layout = 'C', op = 'T';
+    blasint rows = 100, cols = 50;
+    blasint ld_in = 50, ld_out = 100;
+    int ok;
+
+    ok = check_badargs(layout, op, rows, cols, ld_in, ld_out, bad_arg_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+* xerbla check for a bad ldb (expected info 9).
+* Row major without transposition needs ldb of at
+* least n; here ldb = 50 while n = 100.
+*/
+CTEST(zomatcopy, xerbla_rowmajor_notrans_invalid_ldb)
+{
+    const int bad_arg_info = 9;
+    char layout = 'R', op = 'N';
+    blasint rows = 50, cols = 100;
+    blasint ld_in = 100, ld_out = 50;
+    int ok;
+
+    ok = check_badargs(layout, op, rows, cols, ld_in, ld_out, bad_arg_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+* xerbla check for a bad ldb (expected info 9).
+* Row major with transposition needs ldb of at
+* least m; here ldb = 50 while m = 100.
+*/
+CTEST(zomatcopy, xerbla_rowmajor_trans_invalid_ldb)
+{
+    const int bad_arg_info = 9;
+    char layout = 'R', op = 'T';
+    blasint rows = 100, cols = 50;
+    blasint ld_in = 100, ld_out = 50;
+    int ok;
+
+    ok = check_badargs(layout, op, rows, cols, ld_in, ld_out, bad_arg_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+* xerbla check for a bad ldb (expected info 9).
+* Row major conjugate copy (no transposition) needs
+* ldb of at least n; here ldb = 50 while n = 100.
+*/
+CTEST(zomatcopy, xerbla_rowmajor_conj_invalid_ldb)
+{
+    const int bad_arg_info = 9;
+    char layout = 'R', op = 'R';
+    blasint rows = 50, cols = 100;
+    blasint ld_in = 100, ld_out = 50;
+    int ok;
+
+    ok = check_badargs(layout, op, rows, cols, ld_in, ld_out, bad_arg_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+* xerbla check for a bad ldb (expected info 9).
+* Row major conjugate transposition needs ldb of at
+* least m; here ldb = 50 while m = 100.
+*/
+CTEST(zomatcopy, xerbla_rowmajor_transconj_invalid_ldb)
+{
+    const int bad_arg_info = 9;
+    char layout = 'R', op = 'C';
+    blasint rows = 100, cols = 50;
+    blasint ld_in = 100, ld_out = 50;
+    int ok;
+
+    ok = check_badargs(layout, op, rows, cols, ld_in, ld_out, bad_arg_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+* xerbla check for a bad ldb (expected info 9).
+* Column major without transposition needs ldb of at
+* least m; here ldb = 50 while m = 100.
+*/
+CTEST(zomatcopy, xerbla_colmajor_notrans_invalid_ldb)
+{
+    const int bad_arg_info = 9;
+    char layout = 'C', op = 'N';
+    blasint rows = 100, cols = 50;
+    blasint ld_in = 100, ld_out = 50;
+    int ok;
+
+    ok = check_badargs(layout, op, rows, cols, ld_in, ld_out, bad_arg_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+* xerbla check for a bad ldb (expected info 9).
+* Column major with transposition needs ldb of at
+* least n; here ldb = 50 while n = 100.
+*/
+CTEST(zomatcopy, xerbla_colmajor_trans_invalid_ldb)
+{
+    const int bad_arg_info = 9;
+    char layout = 'C', op = 'T';
+    blasint rows = 50, cols = 100;
+    blasint ld_in = 100, ld_out = 50;
+    int ok;
+
+    ok = check_badargs(layout, op, rows, cols, ld_in, ld_out, bad_arg_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+* xerbla check for a bad ldb (expected info 9).
+* Column major conjugate copy (no transposition) needs
+* ldb of at least m; here ldb = 50 while m = 100.
+*/
+CTEST(zomatcopy, xerbla_colmajor_conj_invalid_ldb)
+{
+    const int bad_arg_info = 9;
+    char layout = 'C', op = 'R';
+    blasint rows = 100, cols = 50;
+    blasint ld_in = 100, ld_out = 50;
+    int ok;
+
+    ok = check_badargs(layout, op, rows, cols, ld_in, ld_out, bad_arg_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+
+/**
+* xerbla check for a bad ldb (expected info 9).
+* Column major conjugate transposition needs ldb of at
+* least n; here ldb = 50 while n = 100.
+*/
+CTEST(zomatcopy, xerbla_colmajor_transconj_invalid_ldb)
+{
+    const int bad_arg_info = 9;
+    char layout = 'C', op = 'C';
+    blasint rows = 50, cols = 100;
+    blasint ld_in = 100, ld_out = 50;
+    int ok;
+
+    ok = check_badargs(layout, op, rows, cols, ld_in, ld_out, bad_arg_info);
+    ASSERT_EQUAL(TRUE, ok);
+}
+#endif
diff --git a/utest/test_extensions/test_zrot.c b/utest/test_extensions/test_zrot.c
new file mode 100644
index 000000000..5471e051a
--- /dev/null
+++ b/utest/test_extensions/test_zrot.c
@@ -0,0 +1,790 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100
+#define INCREMENT 2
+
+struct DATA_ZROT { // work buffers; each holds DATASIZE * INCREMENT * 2 doubles
+    double x_test[DATASIZE * INCREMENT * 2]; // x handed to zdrot, later overwritten with the x differences
+    double y_test[DATASIZE * INCREMENT * 2]; // y handed to zdrot, later overwritten with the y differences
+    double x_verify[DATASIZE * INCREMENT * 2]; // reference x computed with zaxpby
+    double y_verify[DATASIZE * INCREMENT * 2]; // reference y computed with zaxpby
+};
+
+#ifdef BUILD_COMPLEX16
+static struct DATA_ZROT data_zrot;
+
+/**
+ * Compare vectors rotated by zdrot with a reference built from zaxpby
+ *
+ * param n specifies number of complex elements in vectors x and y
+ * param inc_x specifies increment of vector x
+ * param inc_y specifies increment of vector y
+ * param c specifies cosine as a {re, im} pair (also used as zaxpby beta)
+ * param s specifies sine as a {re, im} pair (also used as zaxpby alpha)
+ * return mean of the two dznrm2 norms of the x and y differences
+ */
+static double check_zdrot(blasint n, blasint inc_x, blasint inc_y, double *c, double *s)
+{
+    blasint i;
+    double norm = 0;
+    double s_neg[] = {-s[0], s[1]}; // -conjg(s)
+
+    blasint inc_x_abs = labs(inc_x);
+    blasint inc_y_abs = labs(inc_y);
+
+    // Fill vectors x, y (2 doubles per complex element)
+    drand_generate(data_zrot.x_test, n * inc_x_abs * 2);
+    drand_generate(data_zrot.y_test, n * inc_y_abs * 2);
+
+    if (inc_x == 0 && inc_y == 0) { // sizes above were 0, so fill n elements instead
+        drand_generate(data_zrot.x_test, n * 2);
+        drand_generate(data_zrot.y_test, n * 2);
+    }
+
+    // Copy vector x for zaxpby (copies nothing when inc_x is 0)
+    for (i = 0; i < n * inc_x_abs * 2; i++)
+        data_zrot.x_verify[i] = data_zrot.x_test[i];
+
+    // Copy vector y for zaxpby (copies nothing when inc_y is 0)
+    for (i = 0; i < n * inc_y_abs * 2; i++)
+        data_zrot.y_verify[i] = data_zrot.y_test[i];
+    
+    // Reference x: x_verify = c*x + s*y
+    BLASFUNC(zaxpby)(&n, s, data_zrot.y_test, &inc_y, c, data_zrot.x_verify, &inc_x);
+
+    // Reference y: y_verify = -conjg(s)*x + c*y (x_test is still unrotated here)
+    BLASFUNC(zaxpby)(&n, s_neg, data_zrot.x_test, &inc_x, c, data_zrot.y_verify, &inc_y);
+
+    BLASFUNC(zdrot)(&n, data_zrot.x_test, &inc_x, data_zrot.y_test, &inc_y, c, s); // routine under test
+
+    // Find the differences between vector x calculated by zaxpby and zdrot
+    for (i = 0; i < n * 2 * inc_x_abs; i++)
+        data_zrot.x_test[i] -= data_zrot.x_verify[i];
+
+    // Find the differences between vector y calculated by zaxpby and zdrot
+    for (i = 0; i < n * 2 * inc_y_abs; i++)
+        data_zrot.y_test[i] -= data_zrot.y_verify[i];
+
+    // Sum the norms of both difference vectors; their mean is returned
+    norm += BLASFUNC(dznrm2)(&n, data_zrot.x_test, &inc_x_abs);
+    norm += BLASFUNC(dznrm2)(&n, data_zrot.y_test, &inc_y_abs);
+    return (norm / 2);
+}
+
+/**
+ * C API specific function
+ * Compare vectors rotated by cblas_zdrot with a reference from cblas_zaxpby
+ *
+ * param n specifies number of complex elements in vectors x and y
+ * param inc_x specifies increment of vector x
+ * param inc_y specifies increment of vector y
+ * param c specifies cosine as a {re, im} pair; cblas_zdrot receives c[0]
+ * param s specifies sine as a {re, im} pair; cblas_zdrot receives s[0]
+ * return mean of the two cblas_dznrm2 norms of the x and y differences
+ */
+static double c_api_check_zdrot(blasint n, blasint inc_x, blasint inc_y, double *c, double *s)
+{
+    blasint i;
+    double norm = 0;
+    double s_neg[] = {-s[0], s[1]}; // -conjg(s)
+
+    blasint inc_x_abs = labs(inc_x);
+    blasint inc_y_abs = labs(inc_y);
+
+    // Fill vectors x, y (2 doubles per complex element)
+    drand_generate(data_zrot.x_test, n * inc_x_abs * 2);
+    drand_generate(data_zrot.y_test, n * inc_y_abs * 2);
+
+    if (inc_x == 0 && inc_y == 0) { // sizes above were 0, so fill n elements instead
+        drand_generate(data_zrot.x_test, n * 2);
+        drand_generate(data_zrot.y_test, n * 2);
+    }
+
+    // Copy vector x for zaxpby (copies nothing when inc_x is 0)
+    for (i = 0; i < n * inc_x_abs * 2; i++)
+        data_zrot.x_verify[i] = data_zrot.x_test[i];
+
+    // Copy vector y for zaxpby (copies nothing when inc_y is 0)
+    for (i = 0; i < n * inc_y_abs * 2; i++)
+        data_zrot.y_verify[i] = data_zrot.y_test[i];
+    
+    // Reference x: x_verify = c*x + s*y
+    cblas_zaxpby(n, s, data_zrot.y_test, inc_y, c, data_zrot.x_verify, inc_x);
+
+    // Reference y: y_verify = -conjg(s)*x + c*y (x_test is still unrotated here)
+    cblas_zaxpby(n, s_neg, data_zrot.x_test, inc_x, c, data_zrot.y_verify, inc_y);
+
+    cblas_zdrot(n, data_zrot.x_test, inc_x, data_zrot.y_test, inc_y, c[0], s[0]); // routine under test
+
+    // Find the differences between vector x calculated by zaxpby and zdrot
+    for (i = 0; i < n * 2 * inc_x_abs; i++)
+        data_zrot.x_test[i] -= data_zrot.x_verify[i];
+
+    // Find the differences between vector y calculated by zaxpby and zdrot
+    for (i = 0; i < n * 2 * inc_y_abs; i++)
+        data_zrot.y_test[i] -= data_zrot.y_verify[i];
+
+    // Sum the norms of both difference vectors; their mean is returned
+    norm += cblas_dznrm2(n, data_zrot.x_test, inc_x_abs);
+    norm += cblas_dznrm2(n, data_zrot.y_test, inc_y_abs);
+    return (norm / 2);
+}
+
+/**
+ * Fortran API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 0
+ * Stride of vector y is 0
+ * c = 1.0
+ * s = 2.0
+ */
+CTEST(zrot, inc_x_0_inc_y_0)
+{
+    // {re, im} pairs: the zero imaginary parts are there for zaxpby
+    double cosine[] = {1.0, 0.0};
+    double sine[] = {2.0, 0.0};
+    blasint len = 100;
+    blasint stride_x = 0;
+    blasint stride_y = 0;
+    double diff;
+
+    diff = check_zdrot(len, stride_x, stride_y, cosine, sine);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * c = 1.0
+ * s = 1.0
+ */
+CTEST(zrot, inc_x_1_inc_y_1)
+{
+    // {re, im} pairs: the zero imaginary parts are there for zaxpby
+    double cosine[] = {1.0, 0.0};
+    double sine[] = {1.0, 0.0};
+    blasint len = 100;
+    blasint stride_x = 1;
+    blasint stride_y = 1;
+    double diff;
+
+    diff = check_zdrot(len, stride_x, stride_y, cosine, sine);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -1
+ * Stride of vector y is -1
+ * c = 1.0
+ * s = 1.0
+ */
+CTEST(zrot, inc_x_neg_1_inc_y_neg_1)
+{
+    // {re, im} pairs: the zero imaginary parts are there for zaxpby
+    double cosine[] = {1.0, 0.0};
+    double sine[] = {1.0, 0.0};
+    blasint len = 100;
+    blasint stride_x = -1;
+    blasint stride_y = -1;
+    double diff;
+
+    diff = check_zdrot(len, stride_x, stride_y, cosine, sine);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 1
+ * c = 3.0
+ * s = 2.0
+ */
+CTEST(zrot, inc_x_2_inc_y_1)
+{
+    // {re, im} pairs: the zero imaginary parts are there for zaxpby
+    double cosine[] = {3.0, 0.0};
+    double sine[] = {2.0, 0.0};
+    blasint len = 100;
+    blasint stride_x = 2;
+    blasint stride_y = 1;
+    double diff;
+
+    diff = check_zdrot(len, stride_x, stride_y, cosine, sine);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -2
+ * Stride of vector y is 1
+ * c = 1.0
+ * s = 1.0
+ */
+CTEST(zrot, inc_x_neg_2_inc_y_1)
+{
+    // {re, im} pairs: the zero imaginary parts are there for zaxpby
+    double cosine[] = {1.0, 0.0};
+    double sine[] = {1.0, 0.0};
+    blasint len = 100;
+    blasint stride_x = -2;
+    blasint stride_y = 1;
+    double diff;
+
+    diff = check_zdrot(len, stride_x, stride_y, cosine, sine);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ * c = 1.0
+ * s = 1.0
+ */
+CTEST(zrot, inc_x_1_inc_y_2)
+{
+    // {re, im} pairs: the zero imaginary parts are there for zaxpby
+    double cosine[] = {1.0, 0.0};
+    double sine[] = {1.0, 0.0};
+    blasint len = 100;
+    blasint stride_x = 1;
+    blasint stride_y = 2;
+    double diff;
+
+    diff = check_zdrot(len, stride_x, stride_y, cosine, sine);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is -2
+ * c = 2.0
+ * s = 1.0
+ */
+CTEST(zrot, inc_x_1_inc_y_neg_2)
+{
+    // {re, im} pairs: the zero imaginary parts are there for zaxpby
+    double cosine[] = {2.0, 0.0};
+    double sine[] = {1.0, 0.0};
+    blasint len = 100;
+    blasint stride_x = 1;
+    blasint stride_y = -2;
+    double diff;
+
+    diff = check_zdrot(len, stride_x, stride_y, cosine, sine);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ * c = 1.0
+ * s = 2.0
+ */
+CTEST(zrot, inc_x_2_inc_y_2)
+{
+    // {re, im} pairs: the zero imaginary parts are there for zaxpby
+    double cosine[] = {1.0, 0.0};
+    double sine[] = {2.0, 0.0};
+    blasint len = 100;
+    blasint stride_x = 2;
+    blasint stride_y = 2;
+    double diff;
+
+    diff = check_zdrot(len, stride_x, stride_y, cosine, sine);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zdrot by comparing it with zaxpby.
+ * Test with the following options:
+ *
+ * Size of vectors x, y is 100
+ * Stride of vector x is -2
+ * Stride of vector y is -2
+ * c = 1.0
+ * s = 1.0
+ */
+CTEST(zrot, inc_x_neg_2_inc_y_neg_2)
+{
+    // {re, im} pairs: the zero imaginary parts are there for zaxpby
+    double cosine[] = {1.0, 0.0};
+    double sine[] = {1.0, 0.0};
+    blasint len = 100;
+    blasint stride_x = -2;
+    blasint stride_y = -2;
+    double diff;
+
+    diff = check_zdrot(len, stride_x, stride_y, cosine, sine);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ * c = 0.0
+ * s = 1.0
+ */
+CTEST(zrot, inc_x_2_inc_y_2_c_zero)
+{
+    // {re, im} pairs: the zero imaginary parts are there for zaxpby
+    double cosine[] = {0.0, 0.0};
+    double sine[] = {1.0, 0.0};
+    blasint len = 100;
+    blasint stride_x = 2;
+    blasint stride_y = 2;
+    double diff;
+
+    diff = check_zdrot(len, stride_x, stride_y, cosine, sine);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ * c = 1.0
+ * s = 0.0
+ */
+CTEST(zrot, inc_x_2_inc_y_2_s_zero)
+{
+    // {re, im} pairs: the zero imaginary parts are there for zaxpby
+    double cosine[] = {1.0, 0.0};
+    double sine[] = {0.0, 0.0};
+    blasint len = 100;
+    blasint stride_x = 2;
+    blasint stride_y = 2;
+    double diff;
+
+    diff = check_zdrot(len, stride_x, stride_y, cosine, sine);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 0
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * c = 1.0
+ * s = 1.0
+ */
+CTEST(zrot, check_n_zero)
+{
+    // {re, im} pairs: the zero imaginary parts are there for zaxpby
+    double cosine[] = {1.0, 0.0};
+    double sine[] = {1.0, 0.0};
+    blasint len = 0;
+    blasint stride_x = 1;
+    blasint stride_y = 1;
+    double diff;
+
+    diff = check_zdrot(len, stride_x, stride_y, cosine, sine);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zdrot by comparing it with zaxpby.
+ * Test with the following options:
+ *
+ * Size of vectors x, y is 100
+ * Stride of vector x is 0
+ * Stride of vector y is 0
+ * c = 3.0
+ * s = 2.0
+ */
+CTEST(zrot, c_api_inc_x_0_inc_y_0)
+{
+    // {re, im} pairs: the zero imaginary parts are there for zaxpby
+    double cosine[] = {3.0, 0.0};
+    double sine[] = {2.0, 0.0};
+    blasint len = 100;
+    blasint stride_x = 0;
+    blasint stride_y = 0;
+    double diff;
+
+    diff = c_api_check_zdrot(len, stride_x, stride_y, cosine, sine);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * c = 1.0
+ * s = 1.0
+ */
+CTEST(zrot, c_api_inc_x_1_inc_y_1)
+{
+    // {re, im} pairs: the zero imaginary parts are there for zaxpby
+    double cosine[] = {1.0, 0.0};
+    double sine[] = {1.0, 0.0};
+    blasint len = 100;
+    blasint stride_x = 1;
+    blasint stride_y = 1;
+    double diff;
+
+    diff = c_api_check_zdrot(len, stride_x, stride_y, cosine, sine);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -1
+ * Stride of vector y is -1
+ * c = 1.0
+ * s = 1.0
+ */
+CTEST(zrot, c_api_inc_x_neg_1_inc_y_neg_1)
+{
+    // {re, im} pairs: the zero imaginary parts are there for zaxpby
+    double cosine[] = {1.0, 0.0};
+    double sine[] = {1.0, 0.0};
+    blasint len = 100;
+    blasint stride_x = -1;
+    blasint stride_y = -1;
+    double diff;
+
+    diff = c_api_check_zdrot(len, stride_x, stride_y, cosine, sine);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 1
+ * c = 3.0
+ * s = 2.0
+ */
+CTEST(zrot, c_api_inc_x_2_inc_y_1)
+{
+    // {re, im} pairs: the zero imaginary parts are there for zaxpby
+    double cosine[] = {3.0, 0.0};
+    double sine[] = {2.0, 0.0};
+    blasint len = 100;
+    blasint stride_x = 2;
+    blasint stride_y = 1;
+    double diff;
+
+    diff = c_api_check_zdrot(len, stride_x, stride_y, cosine, sine);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is -2
+ * Stride of vector y is 1
+ * c = 1.0
+ * s = 1.0
+ */
+CTEST(zrot, c_api_inc_x_neg_2_inc_y_1)
+{
+    // {re, im} pairs: the zero imaginary parts are there for zaxpby
+    double cosine[] = {1.0, 0.0};
+    double sine[] = {1.0, 0.0};
+    blasint len = 100;
+    blasint stride_x = -2;
+    blasint stride_y = 1;
+    double diff;
+
+    diff = c_api_check_zdrot(len, stride_x, stride_y, cosine, sine);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is 2
+ * c = 1.0
+ * s = 1.0
+ */
+CTEST(zrot, c_api_inc_x_1_inc_y_2)
+{
+    // {re, im} pairs: the zero imaginary parts are there for zaxpby
+    double cosine[] = {1.0, 0.0};
+    double sine[] = {1.0, 0.0};
+    blasint len = 100;
+    blasint stride_x = 1;
+    blasint stride_y = 2;
+    double diff;
+
+    diff = c_api_check_zdrot(len, stride_x, stride_y, cosine, sine);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 1
+ * Stride of vector y is -2
+ * c = 2.0
+ * s = 1.0
+ */
+CTEST(zrot, c_api_inc_x_1_inc_y_neg_2)
+{
+    // {re, im} pairs: the zero imaginary parts are there for zaxpby
+    double cosine[] = {2.0, 0.0};
+    double sine[] = {1.0, 0.0};
+    blasint len = 100;
+    blasint stride_x = 1;
+    blasint stride_y = -2;
+    double diff;
+
+    diff = c_api_check_zdrot(len, stride_x, stride_y, cosine, sine);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ * c = 1.0
+ * s = 2.0
+ */
+CTEST(zrot, c_api_inc_x_2_inc_y_2)
+{
+    // {re, im} pairs: the zero imaginary parts are there for zaxpby
+    double cosine[] = {1.0, 0.0};
+    double sine[] = {2.0, 0.0};
+    blasint len = 100;
+    blasint stride_x = 2;
+    blasint stride_y = 2;
+    double diff;
+
+    diff = c_api_check_zdrot(len, stride_x, stride_y, cosine, sine);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zdrot by comparing it with zaxpby.
+ * Test with the following options:
+ *
+ * Size of vectors x, y is 100
+ * Stride of vector x is -2
+ * Stride of vector y is -2
+ * c = 1.0
+ * s = 1.0
+ */
+CTEST(zrot, c_api_inc_x_neg_2_inc_y_neg_2)
+{
+    // {re, im} pairs: the zero imaginary parts are there for zaxpby
+    double cosine[] = {1.0, 0.0};
+    double sine[] = {1.0, 0.0};
+    blasint len = 100;
+    blasint stride_x = -2;
+    blasint stride_y = -2;
+    double diff;
+
+    diff = c_api_check_zdrot(len, stride_x, stride_y, cosine, sine);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ * c = 0.0
+ * s = 1.0
+ */
+CTEST(zrot, c_api_inc_x_2_inc_y_2_c_zero)
+{
+    // {re, im} pairs: the zero imaginary parts are there for zaxpby
+    double cosine[] = {0.0, 0.0};
+    double sine[] = {1.0, 0.0};
+    blasint len = 100;
+    blasint stride_x = 2;
+    blasint stride_y = 2;
+    double diff;
+
+    diff = c_api_check_zdrot(len, stride_x, stride_y, cosine, sine);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 100
+ * Stride of vector x is 2
+ * Stride of vector y is 2
+ * c = 1.0
+ * s = 0.0
+ */
+CTEST(zrot, c_api_inc_x_2_inc_y_2_s_zero)
+{
+    // {re, im} pairs: the zero imaginary parts are there for zaxpby
+    double cosine[] = {1.0, 0.0};
+    double sine[] = {0.0, 0.0};
+    blasint len = 100;
+    blasint stride_x = 2;
+    blasint stride_y = 2;
+    double diff;
+
+    diff = c_api_check_zdrot(len, stride_x, stride_y, cosine, sine);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+
+/**
+ * C API specific test
+ * Test zrot by comparing it with zaxpby.
+ * Test with the following options:
+ * 
+ * Size of vectors x, y is 0
+ * Stride of vector x is 1
+ * Stride of vector y is 1
+ * c = 1.0
+ * s = 1.0
+ */
+CTEST(zrot, c_api_check_n_zero)
+{
+    // {re, im} pairs: the zero imaginary parts are there for zaxpby
+    double cosine[] = {1.0, 0.0};
+    double sine[] = {1.0, 0.0};
+    blasint len = 0;
+    blasint stride_x = 1;
+    blasint stride_y = 1;
+    double diff;
+
+    diff = c_api_check_zdrot(len, stride_x, stride_y, cosine, sine);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff, DOUBLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_zrotg.c b/utest/test_extensions/test_zrotg.c
new file mode 100644
index 000000000..1de95447d
--- /dev/null
+++ b/utest/test_extensions/test_zrotg.c
@@ -0,0 +1,290 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+
+#ifdef BUILD_COMPLEX16
+
+/**
+ * Fortran API specific test: zrotg with a = 0 + 0i, b = 1 + 1i.
+ * Expects c = 0, s ~= 0.7071 - 0.7071i, a overwritten with ~= 1.4142 + 0i.
+ */
+CTEST(zrotg, zero_a)
+{
+    double sa[2] = {0.0, 0.0};
+    double sb[2] = {1.0, 1.0};
+    double ss[2];
+    double sc;
+    BLASFUNC(zrotg)(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.0, sc, DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.70710678118655, ss[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-0.70710678118655, ss[1], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.41421356237310, sa[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0, sa[1], DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test: zrotg with a = 1 + 1i, b = 0 + 0i.
+ * Expects c = 1, s = 0 + 0i, and a still equal to 1 + 1i after the call.
+ */
+CTEST(zrotg, zero_b)
+{
+    double sa[2] = {1.0, 1.0};
+    double sb[2] = {0.0, 0.0};
+    double ss[2];
+    double sc;
+    BLASFUNC(zrotg)(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(1.0, sc, DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0, ss[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0, ss[1], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.0, sa[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.0, sa[1], DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test: zrotg with purely imaginary a = 0 + 1i, b = 0 + 1i.
+ * Expects c ~= 0.7071, s ~= 0.7071 + 0i, a overwritten with ~= 0 + 1.4142i.
+ */
+CTEST(zrotg, zero_real)
+{
+    double sa[2] = {0.0, 1.0};
+    double sb[2] = {0.0, 1.0};
+    double ss[2];
+    double sc;
+    BLASFUNC(zrotg)(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.70710678118654, sc, DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.70710678118654, ss[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0, ss[1], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0, sa[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.41421356237309, sa[1], DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test: zrotg with a = 3 + 4i, b = 4 + 6i.
+ * Expects c ~= 0.5698, s ~= 0.8205 - 0.0456i, a overwritten with ~= 5.2650 + 7.0200i.
+ */
+CTEST(zrotg, positive_real_positive_img)
+{
+    double sa[2] = {3.0, 4.0};
+    double sb[2] = {4.0, 6.0};
+    double ss[2];
+    double sc;
+    BLASFUNC(zrotg)(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.56980288229818, sc, DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.82051615050939, ss[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-0.04558423058385, ss[1], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(5.26497863243527, sa[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(7.01997150991369, sa[1], DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test: zrotg with a = -3 + 4i, b = -4 + 6i.
+ * Expects c ~= 0.5698, s ~= 0.8205 + 0.0456i, a overwritten with ~= -5.2650 + 7.0200i.
+ */
+CTEST(zrotg, negative_real_positive_img)
+{
+    double sa[2] = {-3.0, 4.0};
+    double sb[2] = {-4.0, 6.0};
+    double ss[2];
+    double sc;
+    BLASFUNC(zrotg)(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.56980288229818, sc, DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.82051615050939, ss[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.04558423058385, ss[1], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-5.26497863243527, sa[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(7.01997150991369, sa[1], DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test: zrotg with a = 3 - 4i, b = 4 - 6i.
+ * Expects c ~= 0.5698, s ~= 0.8205 + 0.0456i, a overwritten with ~= 5.2650 - 7.0200i.
+ */
+CTEST(zrotg, positive_real_negative_img)
+{
+    double sa[2] = {3.0, -4.0};
+    double sb[2] = {4.0, -6.0};
+    double ss[2];
+    double sc;
+    BLASFUNC(zrotg)(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.56980288229818, sc, DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.82051615050939, ss[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.04558423058385, ss[1], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(5.26497863243527, sa[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-7.01997150991369, sa[1], DOUBLE_EPS);
+}
+
+/**
+ * Fortran API specific test: zrotg with a = -3 - 4i, b = -4 - 6i.
+ * Expects c ~= 0.5698, s ~= 0.8205 - 0.0456i, a overwritten with ~= -5.2650 - 7.0200i.
+ */
+CTEST(zrotg, negative_real_negative_img)
+{
+    double sa[2] = {-3.0, -4.0};
+    double sb[2] = {-4.0, -6.0};
+    double ss[2];
+    double sc;
+    BLASFUNC(zrotg)(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.56980288229818, sc, DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.82051615050939, ss[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-0.04558423058385, ss[1], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-5.26497863243527, sa[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-7.01997150991369, sa[1], DOUBLE_EPS);
+}
+
+/**
+ * C API specific test: cblas_zrotg with a = 0 + 0i, b = 1 + 1i.
+ * Expects c = 0, s ~= 0.7071 - 0.7071i, a overwritten with ~= 1.4142 + 0i.
+ */
+CTEST(zrotg, c_api_zero_a)
+{
+    double sa[2] = {0.0, 0.0};
+    double sb[2] = {1.0, 1.0};
+    double ss[2];
+    double sc;
+    cblas_zrotg(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.0, sc, DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.70710678118655, ss[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-0.70710678118655, ss[1], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.41421356237310, sa[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0, sa[1], DOUBLE_EPS);
+}
+
+/**
+ * C API specific test: cblas_zrotg with a = 1 + 1i, b = 0 + 0i.
+ * Expects c = 1, s = 0 + 0i, and a still equal to 1 + 1i after the call.
+ */
+CTEST(zrotg, c_api_zero_b)
+{
+    double sa[2] = {1.0, 1.0};
+    double sb[2] = {0.0, 0.0};
+    double ss[2];
+    double sc;
+    cblas_zrotg(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(1.0, sc, DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0, ss[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0, ss[1], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.0, sa[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.0, sa[1], DOUBLE_EPS);
+}
+
+/**
+ * C API specific test: cblas_zrotg with purely imaginary a = 0 + 1i, b = 0 + 1i.
+ * Expects c ~= 0.7071, s ~= 0.7071 + 0i, a overwritten with ~= 0 + 1.4142i.
+ */
+CTEST(zrotg, c_api_zero_real)
+{
+    double sa[2] = {0.0, 1.0};
+    double sb[2] = {0.0, 1.0};
+    double ss[2];
+    double sc;
+    cblas_zrotg(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.70710678118654, sc, DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.70710678118654, ss[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0, ss[1], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.0, sa[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(1.41421356237309, sa[1], DOUBLE_EPS);
+}
+
+/**
+ * C API specific test: cblas_zrotg with a = 3 + 4i, b = 4 + 6i.
+ * Expects c ~= 0.5698, s ~= 0.8205 - 0.0456i, a overwritten with ~= 5.2650 + 7.0200i.
+ */
+CTEST(zrotg, c_api_positive_real_positive_img)
+{
+    double sa[2] = {3.0, 4.0};
+    double sb[2] = {4.0, 6.0};
+    double ss[2];
+    double sc;
+    cblas_zrotg(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.56980288229818, sc, DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.82051615050939, ss[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-0.04558423058385, ss[1], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(5.26497863243527, sa[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(7.01997150991369, sa[1], DOUBLE_EPS);
+}
+
+/**
+ * C API specific test: cblas_zrotg with a = -3 + 4i, b = -4 + 6i.
+ * Expects c ~= 0.5698, s ~= 0.8205 + 0.0456i, a overwritten with ~= -5.2650 + 7.0200i.
+ */
+CTEST(zrotg, c_api_negative_real_positive_img)
+{
+    double sa[2] = {-3.0, 4.0};
+    double sb[2] = {-4.0, 6.0};
+    double ss[2];
+    double sc;
+    cblas_zrotg(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.56980288229818, sc, DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.82051615050939, ss[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.04558423058385, ss[1], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-5.26497863243527, sa[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(7.01997150991369, sa[1], DOUBLE_EPS);
+}
+
+/**
+ * C API specific test: cblas_zrotg with a = 3 - 4i, b = 4 - 6i.
+ * Expects c ~= 0.5698, s ~= 0.8205 + 0.0456i, a overwritten with ~= 5.2650 - 7.0200i.
+ */
+CTEST(zrotg, c_api_positive_real_negative_img)
+{
+    double sa[2] = {3.0, -4.0};
+    double sb[2] = {4.0, -6.0};
+    double ss[2];
+    double sc;
+    cblas_zrotg(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.56980288229818, sc, DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.82051615050939, ss[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.04558423058385, ss[1], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(5.26497863243527, sa[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-7.01997150991369, sa[1], DOUBLE_EPS);
+}
+
+/**
+ * C API specific test: cblas_zrotg with a = -3 - 4i, b = -4 - 6i.
+ * Expects c ~= 0.5698, s ~= 0.8205 - 0.0456i, a overwritten with ~= -5.2650 - 7.0200i.
+ */
+CTEST(zrotg, c_api_negative_real_negative_img)
+{
+    double sa[2] = {-3.0, -4.0};
+    double sb[2] = {-4.0, -6.0};
+    double ss[2];
+    double sc;
+    cblas_zrotg(sa, sb, &sc, ss);
+    ASSERT_DBL_NEAR_TOL(0.56980288229818, sc, DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(0.82051615050939, ss[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-0.04558423058385, ss[1], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-5.26497863243527, sa[0], DOUBLE_EPS);
+    ASSERT_DBL_NEAR_TOL(-7.01997150991369, sa[1], DOUBLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_zsbmv.c b/utest/test_extensions/test_zsbmv.c
new file mode 100644
index 000000000..0e79dc0d8
--- /dev/null
+++ b/utest/test_extensions/test_zsbmv.c
@@ -0,0 +1,607 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100    // largest matrix order n used by the zsbmv tests
+#define INCREMENT 2     // largest vector stride used by the zsbmv tests
+
+struct DATA_ZSBMV {
+    double sp_matrix[DATASIZE * (DATASIZE + 1)];   // random source values for the symmetric matrix
+    double sb_matrix[DATASIZE * DATASIZE * 2];     // full-storage band matrix (re/im pairs) passed to zgemv
+    double b_test[DATASIZE * 2 * INCREMENT];       // input vector shared by zsbmv and zgemv
+    double c_test[DATASIZE * 2 * INCREMENT];       // vector updated by zsbmv, later holds the difference
+    double c_verify[DATASIZE * 2 * INCREMENT];     // copy of c_test updated by zgemv (reference result)
+};
+
+// Tolerance for the zsbmv-vs-zgemv norm: MAX_VAL * NUMBER OF OPERATIONS * DBL_EPSILON
+// DOUBLE_EPS_ZGEMV = 5.0 * O(100 * 100) * 2.2e-16 ~= 1e-11
+#define DOUBLE_EPS_ZGEMV 1e-11
+
+#ifdef BUILD_COMPLEX16
+static struct DATA_ZSBMV data_zsbmv;  // scratch buffers shared by check_badargs and check_zsbmv
+
+/** 
+ * Copy the band of a full-storage symmetric band matrix A into upper (U) or
+ * lower (L) band-packed storage. Entries are copied as pairs of doubles (re, im).
+ * 
+ * param uplo - 'L' selects the lower band-packed layout; any other value selects upper
+ * param n - number of rows and columns of A
+ * param k - number of super-diagonals of A
+ * output param a - buffer for holding symmetric band-packed matrix
+ * param lda - leading dimension of a, counted in complex elements
+ * param sb_matrix - buffer holding full-storage symmetric band matrix A 
+ * param ldm - leading dimension of sb_matrix, counted in complex elements
+ */
+static void transform_to_band_storage(char uplo, blasint n, blasint k, double* a, blasint lda,
+                                     double* sb_matrix, blasint ldm) 
+{
+    blasint i, j, m;
+    if (uplo == 'L') {
+        for (j = 0; j < n; j++)
+        {
+            m = -j;  // row shift: the diagonal entry of column j lands in band row 0
+            for (i = 2 * j; i < MIN(2 * n, 2 * (j + k + 1)); i += 2)  // rows j .. min(n - 1, j + k)
+            {
+                a[(2*m + i) + j * lda * 2] = sb_matrix[i + j * ldm * 2];  // first double of the pair
+                a[(2*m + (i + 1)) + j * lda * 2] = sb_matrix[(i + 1) + j * ldm * 2];  // second double
+            }
+        }
+    }
+    else {
+        for (j = 0; j < n; j++)
+        {   
+            m = k - j;  // row shift: the diagonal entry of column j lands in band row k
+            for (i = MAX(0, 2*(j - k)); i <= j*2; i += 2)  // rows max(0, j - k) .. j
+            {
+                a[(2*m + i) + j * lda * 2] = sb_matrix[i + j * ldm * 2];  // first double of the pair
+                a[(2*m + (i + 1)) + j * lda * 2] = sb_matrix[(i + 1) + j * ldm * 2];  // second double
+            }
+        }
+    }
+}
+
+/** 
+ * Build an n x n full-storage symmetric band matrix A with k super-diagonals.
+ * In-band entries are read in order from sp_matrix or mirrored; all others are zero.
+ * 
+ * output param sb_matrix - full-storage result (pairs of doubles), indexed with leading dimension n
+ * param sp_matrix - source values; two doubles are consumed per in-band entry with j/2 >= i
+ * param n - number of rows and columns of A
+ * param k - number of super-diagonals of A
+ */
+static void get_symmetric_band_matr(double *sb_matrix, double *sp_matrix, blasint n, blasint k)
+{
+    blasint m;
+    blasint i, j;
+    m = 0;  // read position in sp_matrix
+    for (i = 0; i < n; i++)
+    {
+        for (j = 0; j < n * 2; j += 2)  // j is a double offset; j / 2 is the element index
+        {
+            // Make matrix band with k super-diagonals
+            if (fabs((i+1) - ceil((j+1)/2.0)) > k) 
+            {
+                sb_matrix[i * n * 2 + j] = 0.0;
+                sb_matrix[i * n * 2 + j + 1] = 0.0;
+                continue;
+            }
+
+            if (j / 2 < i)  // transposed entry was filled in an earlier pass of i: mirror it
+            {
+                sb_matrix[i * n * 2 + j] = 
+                        sb_matrix[j * n + i * 2];
+                sb_matrix[i * n * 2 + j + 1] = 
+                        sb_matrix[j * n + i * 2 + 1];
+            }
+            else  // take the next pair of doubles from sp_matrix
+            {
+                sb_matrix[i * n * 2 + j] = sp_matrix[m++];
+                sb_matrix[i * n * 2 + j + 1] = sp_matrix[m++];
+            }
+        }
+    }
+}
+
+/** 
+ * Call zsbmv with the given arguments and check if the error function
+ * was called with the expected function name ("ZSBMV ") and param info
+ * 
+ * param uplo specifies whether matrix a is upper or lower band-packed.
+ * param n - number of rows and columns of A
+ * param k - number of super-diagonals of A
+ * param lda - specifies the leading dimension of a
+ * param inc_b - stride of vector b_test
+ * param inc_c - stride of vector c_test
+ * param expected_info - expected invalid parameter number in zsbmv
+ * return result of check_error(): TRUE if everything is ok, otherwise FALSE 
+ */
+static int check_badargs(char uplo, blasint n, blasint k, blasint lda, blasint inc_b,
+                          blasint inc_c, int expected_info)
+{
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {0.0, 0.0};
+
+    double a[2];  // two doubles of matrix data, filled with random values
+    drand_generate(a, 2);
+
+    set_xerbla("ZSBMV ", expected_info);  // routine name and info the error check should see
+
+    BLASFUNC(zsbmv)(&uplo, &n, &k, alpha, a, &lda, data_zsbmv.b_test, 
+                    &inc_b, beta, data_zsbmv.c_test, &inc_c);
+
+    return check_error();
+}
+
+/**
+ * Compare results computed by zsbmv and zgemv 
+ * since zsbmv is zgemv for symmetric band matrix
+ * 
+ * param uplo specifies whether matrix A is upper or lower triangular
+ * param n - number of rows and columns of A
+ * param k - number of super-diagonals of A
+ * param alpha - scaling factor for the matrix-vector product
+ * param lda - specifies the leading dimension of a
+ * param inc_b - stride of vector b_test
+ * param beta - scaling factor for vector c_test
+ * param inc_c - stride of vector c_test
+ * param ldm - specifies the leading dimension of the full-storage matrix sb_matrix
+ * return norm of differences 
+ */
+static double check_zsbmv(char uplo, blasint n, blasint k, double *alpha, blasint lda, 
+    blasint inc_b, double *beta, blasint inc_c, blasint ldm)
+{
+    blasint i;
+
+    // Trans param for gemv (can use any, since the input matrix is symmetric)
+    char trans = 'N';
+
+    // Symmetric band packed matrix for sbmv (NOTE(review): malloc result is not checked)
+    double *a = (double*) malloc(lda * n * 2 * sizeof(double));
+
+    // Fill symmetric packed matrix sp_matrix, vector b_test, vector c_test 
+    drand_generate(data_zsbmv.sp_matrix, n * (n + 1));
+    drand_generate(data_zsbmv.b_test, n * inc_b * 2);
+    drand_generate(data_zsbmv.c_test, n * inc_c * 2);
+
+    // Copy vector c_test for zgemv
+    for (i = 0; i < n * inc_c * 2; i++)
+        data_zsbmv.c_verify[i] = data_zsbmv.c_test[i];
+
+    // Generate full-storage symmetric band matrix
+    // with k super-diagonals from symmetric packed matrix
+    get_symmetric_band_matr(data_zsbmv.sb_matrix, data_zsbmv.sp_matrix, n, k);
+
+    // Transform symmetric band matrix from conventional
+    // full matrix storage to band storage for zsbmv
+    transform_to_band_storage(uplo, n, k, a, lda, data_zsbmv.sb_matrix, ldm);
+
+    BLASFUNC(zgemv)(&trans, &n, &n, alpha, data_zsbmv.sb_matrix, &ldm, data_zsbmv.b_test,
+                    &inc_b, beta, data_zsbmv.c_verify, &inc_c);
+
+    BLASFUNC(zsbmv)(&uplo, &n, &k, alpha, a, &lda,
+                    data_zsbmv.b_test, &inc_b, beta, data_zsbmv.c_test, &inc_c);
+
+    free(a);
+    // Find the differences between output vector calculated by zsbmv and zgemv
+    for (i = 0; i < n * inc_c * 2; i++)
+        data_zsbmv.c_test[i] -= data_zsbmv.c_verify[i];
+
+    // Find the norm of differences
+    return BLASFUNC(dznrm2)(&n, data_zsbmv.c_test, &inc_c);
+}
+
+/**
+ * Test zsbmv by comparing it against zgemv
+ * with alpha = beta = 1 + 1i and the following options:
+ * 
+ * a is upper-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 1
+ * Stride of vector c_test is 1
+ * Number of super-diagonals k is 0
+ */
+CTEST(zsbmv, upper_k_0_inc_b_1_inc_c_1_n_100)
+{
+    blasint n = DATASIZE, inc_b = 1, inc_c = 1;
+    blasint k = 0;
+    blasint lda = k + 1;  // one band row per diagonal stored
+    blasint ldm = n;
+    char uplo = 'U';
+
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * Test zsbmv by comparing it against zgemv
+ * with alpha = beta = 1 + 1i and the following options:
+ * 
+ * a is upper-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 1
+ * Stride of vector c_test is 1
+ * Number of super-diagonals k is 1
+ */
+CTEST(zsbmv, upper_k_1_inc_b_1_inc_c_1_n_100)
+{
+    blasint n = DATASIZE, inc_b = 1, inc_c = 1;
+    blasint k = 1;
+    blasint lda = k + 1;  // one band row per diagonal stored
+    blasint ldm = n;
+    char uplo = 'U';
+
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * Test zsbmv by comparing it against zgemv
+ * with alpha = beta = 1 + 1i and the following options:
+ * 
+ * a is upper-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 1
+ * Stride of vector c_test is 1
+ * Number of super-diagonals k is 2
+ */
+CTEST(zsbmv, upper_k_2_inc_b_1_inc_c_1_n_100)
+{
+    blasint n = DATASIZE, inc_b = 1, inc_c = 1;
+    blasint k = 2;
+    blasint lda = k + 1;  // one band row per diagonal stored
+    blasint ldm = n;
+    char uplo = 'U';
+
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * Test zsbmv by comparing it against zgemv
+ * with alpha = beta = 2 + 1i and the following options:
+ * 
+ * a is upper-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 2
+ * Stride of vector c_test is 1
+ * Number of super-diagonals k is 2
+ */
+CTEST(zsbmv, upper_k_2_inc_b_2_inc_c_1_n_100)
+{
+    blasint n = DATASIZE, inc_b = 2, inc_c = 1;
+    blasint k = 2;
+    blasint lda = k + 1;  // one band row per diagonal stored
+    blasint ldm = n;
+    char uplo = 'U';
+
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {2.0, 1.0};
+
+    double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * Test zsbmv by comparing it against zgemv
+ * with alpha = beta = 2 + 1i and the following options:
+ * 
+ * a is upper-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 2
+ * Stride of vector c_test is 2
+ * Number of super-diagonals k is 2
+ */
+CTEST(zsbmv, upper_k_2_inc_b_2_inc_c_2_n_100)
+{
+    blasint n = DATASIZE, inc_b = 2, inc_c = 2;
+    blasint k = 2;
+    blasint lda = k + 1;  // one band row per diagonal stored
+    blasint ldm = n;
+    char uplo = 'U';
+
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {2.0, 1.0};
+
+    double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * Test zsbmv by comparing it against zgemv
+ * with alpha = beta = 1 + 1i and the following options:
+ * 
+ * a is lower-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 1
+ * Stride of vector c_test is 1
+ * Number of super-diagonals k is 0
+ */
+CTEST(zsbmv, lower_k_0_inc_b_1_inc_c_1_n_100)
+{
+    blasint n = DATASIZE, inc_b = 1, inc_c = 1;
+    blasint k = 0;
+    blasint lda = k + 1;  // one band row per diagonal stored
+    blasint ldm = n;
+    char uplo = 'L';
+
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * Test zsbmv by comparing it against zgemv
+ * with alpha = beta = 1 + 1i and the following options:
+ * 
+ * a is lower-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 1
+ * Stride of vector c_test is 1
+ * Number of super-diagonals k is 1
+ */
+CTEST(zsbmv, lower_k_1_inc_b_1_inc_c_1_n_100)
+{
+    blasint n = DATASIZE, inc_b = 1, inc_c = 1;
+    blasint k = 1;
+    blasint lda = k + 1;  // one band row per diagonal stored
+    blasint ldm = n;
+    char uplo = 'L';
+
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * Test zsbmv by comparing it against zgemv
+ * with alpha = beta = 1 + 1i and the following options:
+ * 
+ * a is lower-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 1
+ * Stride of vector c_test is 1
+ * Number of super-diagonals k is 2
+ */
+CTEST(zsbmv, lower_k_2_inc_b_1_inc_c_1_n_100)
+{
+    blasint n = DATASIZE, inc_b = 1, inc_c = 1;
+    blasint k = 2;
+    blasint lda = k + 1;  // one band row per diagonal stored
+    blasint ldm = n;
+    char uplo = 'L';
+
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * Test zsbmv by comparing it against zgemv
+ * with alpha = beta = 2 + 1i and the following options:
+ * 
+ * a is lower-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 2
+ * Stride of vector c_test is 1
+ * Number of super-diagonals k is 2
+ */
+CTEST(zsbmv, lower_k_2_inc_b_2_inc_c_1_n_100)
+{
+    blasint n = DATASIZE, inc_b = 2, inc_c = 1;
+    blasint k = 2;
+    blasint lda = k + 1;  // one band row per diagonal stored
+    blasint ldm = n;
+    char uplo = 'L';
+
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {2.0, 1.0};
+
+    double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * Test zsbmv by comparing it against zgemv
+ * with alpha = beta = 2 + 1i and the following options:
+ * 
+ * a is lower-band-packed symmetric matrix
+ * Number of rows and columns of A is 100
+ * Stride of vector b_test is 2
+ * Stride of vector c_test is 2
+ * Number of super-diagonals k is 2
+ */
+CTEST(zsbmv, lower_k_2_inc_b_2_inc_c_2_n_100)
+{
+    blasint n = DATASIZE, inc_b = 2, inc_c = 2;
+    blasint k = 2;
+    blasint lda = k + 1;  // one band row per diagonal stored
+    blasint ldm = n;
+    char uplo = 'L';
+
+    double alpha[] = {2.0, 1.0};
+    double beta[] = {2.0, 1.0};
+
+    double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/** 
+ * Check that the zsbmv-vs-zgemv difference norm is not NaN (upper storage, k = 0, n = 100)
+ */
+CTEST(zsbmv, check_for_NaN)
+{
+    blasint n = DATASIZE, inc_b = 1, inc_c = 1;
+    blasint k = 0;
+    blasint lda = k + 1;
+    blasint ldm = n;
+    char uplo = 'U';
+
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {1.0, 1.0};
+
+    double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    
+    ASSERT_TRUE(norm == norm); /* fails only for NaN, since NaN == NaN is false */
+}
+
+/**
+ * Test error function for an invalid param uplo ('O'); expected info is 1.
+ * Uplo specifies whether a is in upper (U) or lower (L) band-packed storage mode.
+ */
+CTEST(zsbmv, xerbla_uplo_invalid)
+{
+    blasint n = 1, inc_b = 1, inc_c = 1;
+    char uplo = 'O';
+    blasint k = 0;
+    blasint lda = k + 1;
+    int expected_info = 1;
+
+    int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/** 
+ * Test error function for an invalid param N (set to INVALID); expected info is 2.
+ * N is the number of rows and columns of A. Must be at least zero.
+ */
+CTEST(zsbmv, xerbla_n_invalid)
+{
+    blasint n = INVALID, inc_b = 1, inc_c = 1;
+    char uplo = 'U';
+    blasint k = 0;
+    blasint lda = k + 1;
+    int expected_info = 2;
+
+    int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Check that n = 0 (no rows or columns in A) gives a zero zsbmv-vs-zgemv difference norm.
+ */
+CTEST(zsbmv, check_n_zero)
+{
+    blasint n = 0, inc_b = 1, inc_c = 1;
+    blasint k = 0;
+    blasint lda = k + 1;
+    blasint ldm = 1;  // n is 0 here, so ldm is set to 1 instead of n
+    char uplo = 'U';
+
+    double alpha[] = {1.0, 1.0};
+    double beta[] = {0.0, 0.0};
+
+    double norm = check_zsbmv(uplo, n, k, alpha, lda, inc_b, beta, inc_c, ldm);
+    ASSERT_DBL_NEAR_TOL(0.0, norm, DOUBLE_EPS_ZGEMV);
+}
+
+/**
+ * Test error function for an invalid param inc_b (zero); expected info is 8.
+ * inc_b is the stride of vector b_test. Can't be zero. 
+ */
+CTEST(zsbmv, xerbla_inc_b_zero)
+{
+    blasint n = 1, inc_b = 0, inc_c = 1;
+    char uplo = 'U';
+    blasint k = 0;
+    blasint lda = k + 1;
+    int expected_info = 8;
+
+    int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param inc_c (zero); expected info is 11.
+ * inc_c is the stride of vector c_test. Can't be zero. 
+ */
+CTEST(zsbmv, xerbla_inc_c_zero)
+{
+    blasint n = 1, inc_b = 1, inc_c = 0;
+    char uplo = 'U';
+    blasint k = 0;
+    blasint lda = k + 1;
+    int expected_info = 11;
+
+    int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param k (set to INVALID); expected info is 3.
+ * k is the number of super-diagonals of A. Must be at least zero.
+ */
+CTEST(zsbmv, xerbla_k_invalid)
+{
+    blasint n = 1, inc_b = 1, inc_c = 1;
+    char uplo = 'U';
+    blasint k = INVALID;
+    blasint lda = 1;
+    int expected_info = 3;
+
+    int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+
+/**
+ * Test error function for an invalid param lda (set to INVALID); expected info is 6.
+ * lda specifies the leading dimension of a. Must be at least (k+1).
+ */
+CTEST(zsbmv, xerbla_lda_invalid)
+{
+    blasint n = 1, inc_b = 1, inc_c = 1;
+    char uplo = 'U';
+    blasint k = 0;
+    blasint lda = INVALID;
+    int expected_info = 6;
+
+    int passed = check_badargs(uplo, n, k, lda, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, passed);
+}
+#endif
diff --git a/utest/test_extensions/test_zscal.c b/utest/test_extensions/test_zscal.c
new file mode 100644
index 000000000..132f4ee5b
--- /dev/null
+++ b/utest/test_extensions/test_zscal.c
@@ -0,0 +1,165 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+#include "common.h"
+
+#define DATASIZE 100  /* element count used by every test in this file */
+#define INCREMENT 2   /* largest stride the buffers are sized for */
+/* Static work buffers; each complex element occupies two doubles */
+struct DATA_ZSCAL {
+    double x_test[2 * INCREMENT * DATASIZE];    /* vector handed to zscal */
+    double x_verify[2 * INCREMENT * DATASIZE];  /* vector handed to zscal_trusted */
+};
+
+#ifdef BUILD_COMPLEX16
+static struct DATA_ZSCAL data_zscal;
+
+
+/**
+ * Plain-C reference for zscal: x[i] = alpha * x[i] for n complex elements.
+ *
+ * param n - number of elements of vector x
+ * param alpha - complex scaling factor, stored as {real, imag}
+ * param x - vector scaled in place, interleaved real/imag doubles
+ * param inc - stride of vector x, counted in complex elements
+ */
+static void zscal_trusted(blasint n, double *alpha, double* x, blasint inc)
+{
+    const blasint step = 2 * inc;
+    blasint count, pos;
+    for (count = 0, pos = 0; count < n; count++, pos += step)
+    {
+        const double re = x[pos];
+        const double im = x[pos + 1];
+        x[pos] = alpha[0] * re - alpha[1] * im;
+        x[pos + 1] = alpha[0] * im + alpha[1] * re;
+    }
+}
+
+/**
+ * Compare the results computed by zscal and zscal_trusted
+ *
+ * param api - 'F' selects the Fortran interface, anything else the CBLAS one
+ * param n - number of elements of vector x
+ * param alpha - scaling factor for the vector product
+ * param inc - stride of vector x
+ * return norm of the element-wise differences
+ */
+static double check_zscal(char api, blasint n, double *alpha, blasint inc)
+{
+    double *expected = data_zscal.x_verify;
+    double *actual = data_zscal.x_test;
+    blasint pos;
+
+    /* Random input, duplicated so both implementations see the same data */
+    drand_generate(actual, n * inc * 2);
+    for (pos = 0; pos < n * 2 * inc; pos++)
+        expected[pos] = actual[pos];
+
+    /* Reference result first, then the routine under test */
+    zscal_trusted(n, alpha, expected, inc);
+
+    if (api != 'F')
+        cblas_zscal(n, alpha, actual, inc);
+    else
+        BLASFUNC(zscal)(&n, alpha, actual, &inc);
+
+    /* Element-wise difference, kept in the reference buffer */
+    for (pos = 0; pos < n * 2 * inc; pos++)
+        expected[pos] -= actual[pos];
+
+    return BLASFUNC(dznrm2)(&n, expected, &inc);
+}
+
+/**
+ * Fortran interface: scale DATASIZE elements with stride 1 by
+ * alpha = 0 + 1i and compare the result with zscal_trusted.
+ */
+CTEST(zscal, alpha_r_zero_alpha_i_not_zero)
+{
+    double alpha[] = {0.0, 1.0};
+    blasint inc = 1, N = DATASIZE;
+    double diff_norm;
+
+    diff_norm = check_zscal('F', N, alpha, inc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Fortran interface: scale DATASIZE elements with stride 2 by
+ * alpha = 0 + 0i and compare the result with zscal_trusted.
+ */
+CTEST(zscal, alpha_r_zero_alpha_i_zero_inc_2)
+{
+    double alpha[] = {0.0, 0.0};
+    blasint inc = 2, N = DATASIZE;
+    double diff_norm;
+
+    diff_norm = check_zscal('F', N, alpha, inc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * C interface: scale DATASIZE elements with stride 1 by
+ * alpha = 0 + 1i and compare the result with zscal_trusted.
+ */
+CTEST(zscal, c_api_alpha_r_zero_alpha_i_not_zero)
+{
+    double alpha[] = {0.0, 1.0};
+    blasint inc = 1, N = DATASIZE;
+    double diff_norm;
+
+    diff_norm = check_zscal('C', N, alpha, inc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * C interface: scale DATASIZE elements with stride 2 by
+ * alpha = 0 + 0i and compare the result with zscal_trusted.
+ */
+CTEST(zscal, c_api_alpha_r_zero_alpha_i_zero_inc_2)
+{
+    double alpha[] = {0.0, 0.0};
+    blasint inc = 2, N = DATASIZE;
+    double diff_norm;
+
+    diff_norm = check_zscal('C', N, alpha, inc);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_zspmv.c b/utest/test_extensions/test_zspmv.c
new file mode 100644
index 000000000..510ac0579
--- /dev/null
+++ b/utest/test_extensions/test_zspmv.c
@@ -0,0 +1,427 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include "common.h"
+
+#define DATASIZE 100  /* matrix order used by the functional tests */
+#define INCREMENT 2   /* largest stride the vector buffers are sized for */
+/* Static work buffers; each complex entry occupies two doubles */
+struct DATA_ZSPMV {
+    double a_verify[2 * DATASIZE * DATASIZE];    /* full matrix built for zgemv */
+    double a_test[(DATASIZE + 1) * DATASIZE];    /* packed matrix given to zspmv */
+    double b_test[2 * INCREMENT * DATASIZE];     /* input vector b */
+    double c_test[2 * INCREMENT * DATASIZE];     /* vector c updated by zspmv */
+    double c_verify[2 * INCREMENT * DATASIZE];   /* vector c updated by the reference */
+};
+
+#ifdef BUILD_COMPLEX16
+static struct DATA_ZSPMV data_zspmv;
+
+/**
+ * Reference zspmv: expand the packed symmetric matrix into the full n x n
+ * buffer data_zspmv.a_verify, then let zgemv do the matrix-vector product.
+ *
+ * param uplo - 'L' takes the first branch below; any other value takes the second
+ * param n - number of rows and columns of A
+ * param alpha - scaling factor for the matrix-vector product
+ * param a - packed input matrix A, read as interleaved (real, imag) doubles
+ * param b - buffer holding input vector b
+ * param inc_b - stride of vector b
+ * param beta - scaling factor for vector c
+ * param c - input/output vector c; zgemv stores its result here
+ * param inc_c - stride of vector c
+ */
+static void zspmv_trusted(char uplo, blasint n, double *alpha, double *a,
+                          double *b, blasint inc_b, double *beta, double *c,
+                          blasint inc_c)
+{
+    blasint k;
+    blasint i, j;
+
+    // param for gemv (can use any, since the input matrix is symmetric)
+    char trans = 'N';
+    
+    // Unpack the packed matrix; mirrored entries are copied from cells filled earlier
+    if (uplo == 'L')
+    {
+        k = 0;  // a is consumed front to back
+        for (i = 0; i < n; i++)
+        {
+            for (j = 0; j < n * 2; j += 2)  // j is the even (first) slot of a pair
+            {
+                if (j / 2 < i)  // cell was already written in outer iteration j / 2
+                {
+                    data_zspmv.a_verify[i * n * 2 + j] = 
+                            data_zspmv.a_verify[j * n + i * 2];
+                    data_zspmv.a_verify[i * n * 2 + j + 1] = 
+                            data_zspmv.a_verify[j * n + i * 2 + 1];
+                }
+                else
+                {
+                    data_zspmv.a_verify[i * n * 2 + j] = a[k++];
+                    data_zspmv.a_verify[i * n * 2 + j + 1] = a[k++];
+                }
+            }
+        }
+    }
+    else
+    {
+        k = n * (n + 1) - 1;  // a is consumed back to front
+        for (j = 2 * n - 1; j >= 0; j -= 2)  // j is the odd (second) slot of a pair
+        {
+            for (i = n - 1; i >= 0; i--)
+            {
+                if (j / 2 < i)  // cell was already written when the outer index was 2 * i + 1
+                {
+                    data_zspmv.a_verify[i * n * 2 + j] = 
+                            data_zspmv.a_verify[(j - 1) * n + i * 2 + 1];
+                    data_zspmv.a_verify[i * n * 2 + j - 1] = 
+                            data_zspmv.a_verify[(j - 1) * n + i * 2];
+                }
+                else
+                {
+                    data_zspmv.a_verify[i * n * 2 + j] = a[k--];      // odd slot first
+                    data_zspmv.a_verify[i * n * 2 + j - 1] = a[k--];  // then the even slot
+                }
+            }
+        }
+    }
+
+    // Run gemv with the unpacked matrix (leading dimension n); the result goes to c
+    BLASFUNC(zgemv)(&trans, &n, &n, alpha, data_zspmv.a_verify, &n, b, 
+                    &inc_b, beta, c, &inc_c);
+}
+
+/**
+ * Compare the results computed by zspmv and zspmv_trusted
+ *
+ * param uplo specifies whether matrix A is upper or lower triangular
+ * param n - number of rows and columns of A
+ * param alpha - scaling factor for the matrix-vector product
+ * param inc_b - stride of vector b
+ * param beta - scaling factor for vector c
+ * param inc_c - stride of vector c
+ * return norm of the element-wise differences
+ */
+static double check_zspmv(char uplo, blasint n, double *alpha, blasint inc_b,
+                          double *beta, blasint inc_c)
+{
+    double *packed = data_zspmv.a_test;
+    double *vec_b = data_zspmv.b_test;
+    double *actual = data_zspmv.c_test;
+    double *expected = data_zspmv.c_verify;
+    blasint pos;
+
+    /* Random symmetric packed matrix a and random vectors b and c */
+    drand_generate(packed, n * (n + 1));
+    drand_generate(vec_b, 2 * n * inc_b);
+    drand_generate(actual, 2 * n * inc_c);
+
+    /* Both implementations must start from the same vector c */
+    for (pos = 0; pos < n * 2 * inc_c; pos++)
+        expected[pos] = actual[pos];
+    /* Reference first, then the routine under test */
+    zspmv_trusted(uplo, n, alpha, packed, vec_b, inc_b, beta, expected, inc_c);
+    BLASFUNC(zspmv)(&uplo, &n, alpha, packed, vec_b, &inc_b, beta, actual, &inc_c);
+
+    /* Differences between the vector calculated by zspmv and by zspmv_trusted */
+    for (pos = 0; pos < n * 2 * inc_c; pos++)
+        actual[pos] -= expected[pos];
+    return BLASFUNC(dznrm2)(&n, actual, &inc_c);
+}
+
+/**
+ * Arm the test xerbla hook for "ZSPMV " with the expected info value,
+ * call zspmv with the given arguments and return what check_error reports.
+ *
+ * param uplo specifies whether matrix A is upper or lower triangular
+ * param n - number of rows and columns of A
+ * param inc_b - stride of vector b
+ * param inc_c - stride of vector c
+ * param expected_info - expected invalid parameter number in zspmv
+ * return TRUE if everything is ok, otherwise FALSE
+ */
+static int check_badargs(char uplo, blasint n, blasint inc_b,
+                          blasint inc_c, int expected_info)
+{
+    double one_plus_i[2] = {1.0, 1.0};  /* alpha */
+    double zero[2] = {0.0, 0.0};        /* beta */
+    double *packed = data_zspmv.a_test;
+    double *vec_b = data_zspmv.b_test;
+    double *vec_c = data_zspmv.c_test;
+
+    set_xerbla("ZSPMV ", expected_info);
+    BLASFUNC(zspmv)(&uplo, &n, one_plus_i, packed, vec_b, &inc_b, zero, vec_c, &inc_c);
+    return check_error();
+}
+
+/**
+ * zspmv versus the zgemv-based reference.
+ *
+ * uplo  = 'U' (A is upper triangular)
+ * N     = DATASIZE (100)
+ * inc_b = 1, inc_c = 1
+ * alpha = 1 + 1i, beta = 0
+ * Expects the difference norm to be near zero (tolerance DOUBLE_TOL).
+ */
+CTEST(zspmv, upper_inc_b_1_inc_c_1_N_100)
+{
+    double alpha[2] = {1.0, 1.0};
+    double beta[2] = {0.0, 0.0};
+    blasint N = DATASIZE;
+    blasint inc_b = 1, inc_c = 1;
+
+    double diff_norm = check_zspmv('U', N, alpha, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_TOL);
+}
+
+/**
+ * zspmv versus the zgemv-based reference.
+ *
+ * uplo  = 'U' (A is upper triangular)
+ * N     = DATASIZE (100)
+ * inc_b = 1, inc_c = 2
+ * alpha = 1 + 1i, beta = 0
+ * Expects the difference norm to be near zero (tolerance DOUBLE_TOL).
+ */
+CTEST(zspmv, upper_inc_b_1_inc_c_2_N_100)
+{
+    double alpha[2] = {1.0, 1.0};
+    double beta[2] = {0.0, 0.0};
+    blasint N = DATASIZE;
+    blasint inc_b = 1, inc_c = 2;
+
+    double diff_norm = check_zspmv('U', N, alpha, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_TOL);
+}
+
+/**
+ * zspmv versus the zgemv-based reference.
+ *
+ * uplo  = 'U' (A is upper triangular)
+ * N     = DATASIZE (100)
+ * inc_b = 2, inc_c = 1
+ * alpha = 1, beta = 1
+ * Expects the difference norm to be near zero (tolerance DOUBLE_TOL).
+ */
+CTEST(zspmv, upper_inc_b_2_inc_c_1_N_100)
+{
+    double alpha[2] = {1.0, 0.0};
+    double beta[2] = {1.0, 0.0};
+    blasint N = DATASIZE;
+    blasint inc_b = 2, inc_c = 1;
+
+    double diff_norm = check_zspmv('U', N, alpha, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_TOL);
+}
+
+/**
+ * zspmv versus the zgemv-based reference.
+ *
+ * uplo  = 'U' (A is upper triangular)
+ * N     = DATASIZE (100)
+ * inc_b = 2, inc_c = 2
+ * alpha = 2.5 - 2.1i, beta = 0 + 1i
+ * Expects the difference norm to be near zero (tolerance DOUBLE_TOL).
+ */
+CTEST(zspmv, upper_inc_b_2_inc_c_2_N_100)
+{
+    double alpha[2] = {2.5, -2.1};
+    double beta[2] = {0.0, 1.0};
+    blasint N = DATASIZE;
+    blasint inc_b = 2, inc_c = 2;
+
+    double diff_norm = check_zspmv('U', N, alpha, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_TOL);
+}
+
+/**
+ * zspmv versus the zgemv-based reference.
+ *
+ * uplo  = 'L' (A is lower triangular)
+ * N     = DATASIZE (100)
+ * inc_b = 1, inc_c = 1
+ * alpha = 1 + 1i, beta = 0
+ * Expects the difference norm to be near zero (tolerance DOUBLE_TOL).
+ */
+CTEST(zspmv, lower_inc_b_1_inc_c_1_N_100)
+{
+    double alpha[2] = {1.0, 1.0};
+    double beta[2] = {0.0, 0.0};
+    blasint N = DATASIZE;
+    blasint inc_b = 1, inc_c = 1;
+
+    double diff_norm = check_zspmv('L', N, alpha, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_TOL);
+}
+
+/**
+ * zspmv versus the zgemv-based reference.
+ *
+ * uplo  = 'L' (A is lower triangular)
+ * N     = DATASIZE (100)
+ * inc_b = 1, inc_c = 2
+ * alpha = 1 + 1i, beta = 0
+ * Expects the difference norm to be near zero (tolerance DOUBLE_TOL).
+ */
+CTEST(zspmv, lower_inc_b_1_inc_c_2_N_100)
+{
+    double alpha[2] = {1.0, 1.0};
+    double beta[2] = {0.0, 0.0};
+    blasint N = DATASIZE;
+    blasint inc_b = 1, inc_c = 2;
+
+    double diff_norm = check_zspmv('L', N, alpha, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_TOL);
+}
+
+/**
+ * zspmv versus the zgemv-based reference.
+ *
+ * uplo  = 'L' (A is lower triangular)
+ * N     = DATASIZE (100)
+ * inc_b = 2, inc_c = 1
+ * alpha = 1, beta = 1
+ * Expects the difference norm to be near zero (tolerance DOUBLE_TOL).
+ */
+CTEST(zspmv, lower_inc_b_2_inc_c_1_N_100)
+{
+    double alpha[2] = {1.0, 0.0};
+    double beta[2] = {1.0, 0.0};
+    blasint N = DATASIZE;
+    blasint inc_b = 2, inc_c = 1;
+
+    double diff_norm = check_zspmv('L', N, alpha, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_TOL);
+}
+
+/**
+ * zspmv versus the zgemv-based reference.
+ *
+ * uplo  = 'L' (A is lower triangular)
+ * N     = DATASIZE (100)
+ * inc_b = 2, inc_c = 2
+ * alpha = 2.5 - 2.1i, beta = 0 + 1i
+ * Expects the difference norm to be near zero (tolerance DOUBLE_TOL).
+ */
+CTEST(zspmv, lower_inc_b_2_inc_c_2_N_100)
+{
+    double alpha[2] = {2.5, -2.1};
+    double beta[2] = {0.0, 1.0};
+    blasint N = DATASIZE;
+    blasint inc_b = 2, inc_c = 2;
+
+    double diff_norm = check_zspmv('L', N, alpha, inc_b, beta, inc_c);
+
+    ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_TOL);
+}
+
+/**
+ * Check that the difference norm returned by check_zspmv is not NaN
+ */
+CTEST(zspmv, check_for_NaN)
+{
+    double alpha[2] = {1.0, 1.0};
+    double beta[2] = {0.0, 0.0};
+    blasint N = DATASIZE;
+    blasint inc_b = 1, inc_c = 1;
+
+    double diff_norm = check_zspmv('U', N, alpha, inc_b, beta, inc_c);
+
+    ASSERT_TRUE(diff_norm == diff_norm); /* NaN == NaN is false */
+}
+
+/**
+ * An unrecognised uplo ('O') is an invalid argument; check_badargs is
+ * expected to see it reported as parameter number 1.
+ */
+CTEST(zspmv, xerbla_uplo_invalid)
+{
+    const char uplo = 'O';
+    const blasint N = DATASIZE, inc_b = 1, inc_c = 1;
+    const int expected_info = 1;
+
+    int reported = check_badargs(uplo, N, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, reported);
+}
+
+/**
+ * A bad matrix order (N = INVALID) is an invalid argument; check_badargs
+ * is expected to see it reported as parameter number 2.
+ */
+CTEST(zspmv, xerbla_N_invalid)
+{
+    const char uplo = 'U';
+    const blasint N = INVALID, inc_b = 1, inc_c = 1;
+    const int expected_info = 2;
+
+    int reported = check_badargs(uplo, N, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, reported);
+}
+
+/**
+ * A zero stride for vector b (inc_b = 0) is an invalid argument;
+ * check_badargs is expected to see it reported as parameter number 6.
+ */
+CTEST(zspmv, xerbla_inc_b_zero)
+{
+    const char uplo = 'U';
+    const blasint N = DATASIZE, inc_b = 0, inc_c = 1;
+    const int expected_info = 6;
+
+    int reported = check_badargs(uplo, N, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, reported);
+}
+
+/**
+ * A zero stride for vector c (inc_c = 0) is an invalid argument;
+ * check_badargs is expected to see it reported as parameter number 9.
+ */
+CTEST(zspmv, xerbla_inc_c_zero)
+{
+    const char uplo = 'U';
+    const blasint N = DATASIZE, inc_b = 1, inc_c = 0;
+    const int expected_info = 9;
+
+    int reported = check_badargs(uplo, N, inc_b, inc_c, expected_info);
+    ASSERT_EQUAL(TRUE, reported);
+}
+#endif
diff --git a/utest/test_extensions/test_ztrmv.c b/utest/test_extensions/test_ztrmv.c
new file mode 100644
index 000000000..5668ec296
--- /dev/null
+++ b/utest/test_extensions/test_ztrmv.c
@@ -0,0 +1,266 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+#include "common.h"
+
+#define DATASIZE 300  /* matrix order n (and lda) used by every test here */
+#define INCREMENT 2   /* largest incx the vector buffers are sized for */
+/* Static work buffers; each complex entry occupies two doubles */
+struct DATA_ZTRMV {
+	double a_test[2 * DATASIZE * DATASIZE];     /* matrix for the call under test */
+	double a_verify[2 * DATASIZE * DATASIZE];   /* matrix for the comparison call */
+	double x_test[2 * DATASIZE * INCREMENT];    /* vector for the call under test */
+	double x_verify[2 * DATASIZE * INCREMENT];  /* vector for the comparison call */
+};
+
+#ifdef BUILD_COMPLEX16
+static struct DATA_ZTRMV data_ztrmv;
+
+/**
+ * Run ztrmv twice on identical random data and return the norm of the
+ * difference between the two result vectors. When trans is 'R' the
+ * comparison call uses an explicitly conjugated copy of A with trans 'N'.
+ * param uplo specifies whether A is upper or lower triangular
+ * param trans specifies op(A), the transposition (conjugation) operation applied to A
+ * param diag specifies whether the matrix A is unit triangular or not
+ * param n - number of rows and columns of A
+ * param lda - leading dimension of matrix A
+ * param incx - increment for the elements of x
+ * return norm of difference
+ */
+static double check_ztrmv(char uplo, char trans, char diag, blasint n, blasint lda, blasint incx)
+{
+	double unit_scale[2] = {1.0, 0.0};
+	double *a_ref = data_ztrmv.a_verify;
+	double *x_ref = data_ztrmv.x_verify;
+	char trans_ref = trans;
+	blasint pos;
+
+	/* Same random A and x for both ztrmv calls */
+	drand_generate(data_ztrmv.a_test, n * lda * 2);
+	drand_generate(data_ztrmv.x_test, n * incx * 2);
+	for (pos = 0; pos < n * lda * 2; pos++)
+		a_ref[pos] = data_ztrmv.a_test[pos];
+	for (pos = 0; pos < n * incx * 2; pos++)
+		x_ref[pos] = data_ztrmv.x_test[pos];
+
+	/* For 'R', conjugate the comparison copy in place and multiply with 'N' */
+	if (trans == 'R')
+	{
+		cblas_zimatcopy(CblasColMajor, CblasConjNoTrans, n, n, unit_scale, a_ref, lda, lda);
+		trans_ref = 'N';
+	}
+
+	BLASFUNC(ztrmv)(&uplo, &trans_ref, &diag, &n, a_ref, &lda, x_ref, &incx);
+	BLASFUNC(ztrmv)(&uplo, &trans, &diag, &n, data_ztrmv.a_test, &lda, data_ztrmv.x_test, &incx);
+
+	/* The difference is left in the comparison vector */
+	for (pos = 0; pos < n * incx * 2; pos++)
+		x_ref[pos] -= data_ztrmv.x_test[pos];
+	return BLASFUNC(dznrm2)(&n, x_ref, &incx);
+}
+
+/**
+ * ztrmv with trans = 'R' versus ztrmv with trans = 'N' applied to an
+ * explicitly conjugated copy of A.
+ *
+ * trans = 'R' (conjugate, not transposed)
+ * uplo  = 'U' (upper triangular)
+ * diag  = 'N' (not unit triangular)
+ * n = lda = DATASIZE, incx = 1
+ */
+CTEST(ztrmv, conj_notrans_upper_not_unit_triangular)
+{
+	char trans = 'R';
+	char uplo = 'U';
+	char diag = 'N';
+	blasint n = DATASIZE, lda = DATASIZE, incx = 1;
+
+	double diff_norm = check_ztrmv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * ztrmv with trans = 'R' versus ztrmv with trans = 'N' applied to an
+ * explicitly conjugated copy of A.
+ *
+ * trans = 'R' (conjugate, not transposed)
+ * uplo  = 'U' (upper triangular)
+ * diag  = 'U' (unit triangular)
+ * n = lda = DATASIZE, incx = 1
+ */
+CTEST(ztrmv, conj_notrans_upper_unit_triangular)
+{
+	char trans = 'R';
+	char uplo = 'U';
+	char diag = 'U';
+	blasint n = DATASIZE, lda = DATASIZE, incx = 1;
+
+	double diff_norm = check_ztrmv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * ztrmv with trans = 'R' versus ztrmv with trans = 'N' applied to an
+ * explicitly conjugated copy of A.
+ *
+ * trans = 'R' (conjugate, not transposed)
+ * uplo  = 'L' (lower triangular)
+ * diag  = 'N' (not unit triangular)
+ * n = lda = DATASIZE, incx = 1
+ */
+CTEST(ztrmv, conj_notrans_lower_not_triangular)
+{
+	char trans = 'R';
+	char uplo = 'L';
+	char diag = 'N';
+	blasint n = DATASIZE, lda = DATASIZE, incx = 1;
+
+	double diff_norm = check_ztrmv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * ztrmv with trans = 'R' versus ztrmv with trans = 'N' applied to an
+ * explicitly conjugated copy of A.
+ *
+ * trans = 'R' (conjugate, not transposed)
+ * uplo  = 'L' (lower triangular)
+ * diag  = 'U' (unit triangular)
+ * n = lda = DATASIZE, incx = 1
+ */
+CTEST(ztrmv, conj_notrans_lower_unit_triangular)
+{
+	char trans = 'R';
+	char uplo = 'L';
+	char diag = 'U';
+	blasint n = DATASIZE, lda = DATASIZE, incx = 1;
+
+	double diff_norm = check_ztrmv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * ztrmv with trans = 'R' versus ztrmv with trans = 'N' applied to an
+ * explicitly conjugated copy of A.
+ *
+ * trans = 'R' (conjugate, not transposed)
+ * uplo  = 'U' (upper triangular)
+ * diag  = 'N' (not unit triangular)
+ * n = lda = DATASIZE
+ * incx  = 2
+ */
+CTEST(ztrmv, conj_notrans_upper_not_unit_triangular_incx_2)
+{
+	char trans = 'R';
+	char uplo = 'U';
+	char diag = 'N';
+	blasint n = DATASIZE, lda = DATASIZE, incx = 2;
+
+	double diff_norm = check_ztrmv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * ztrmv with trans = 'R' versus ztrmv with trans = 'N' applied to an
+ * explicitly conjugated copy of A.
+ *
+ * trans = 'R' (conjugate, not transposed)
+ * uplo  = 'U' (upper triangular)
+ * diag  = 'U' (unit triangular)
+ * n = lda = DATASIZE
+ * incx  = 2
+ */
+CTEST(ztrmv, conj_notrans_upper_unit_triangular_incx_2)
+{
+	char trans = 'R';
+	char uplo = 'U';
+	char diag = 'U';
+	blasint n = DATASIZE, lda = DATASIZE, incx = 2;
+
+	double diff_norm = check_ztrmv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * ztrmv with trans = 'R' versus ztrmv with trans = 'N' applied to an
+ * explicitly conjugated copy of A.
+ *
+ * trans = 'R' (conjugate, not transposed)
+ * uplo  = 'L' (lower triangular)
+ * diag  = 'N' (not unit triangular)
+ * n = lda = DATASIZE
+ * incx  = 2
+ */
+CTEST(ztrmv, conj_notrans_lower_not_triangular_incx_2)
+{
+	char trans = 'R';
+	char uplo = 'L';
+	char diag = 'N';
+	blasint n = DATASIZE, lda = DATASIZE, incx = 2;
+
+	double diff_norm = check_ztrmv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * ztrmv with trans = 'R' versus ztrmv with trans = 'N' applied to an
+ * explicitly conjugated copy of A.
+ *
+ * trans = 'R' (conjugate, not transposed)
+ * uplo  = 'L' (lower triangular)
+ * diag  = 'U' (unit triangular)
+ * n = lda = DATASIZE
+ * incx  = 2
+ */
+CTEST(ztrmv, conj_notrans_lower_unit_triangular_incx_2)
+{
+	char trans = 'R';
+	char uplo = 'L';
+	char diag = 'U';
+	blasint n = DATASIZE, lda = DATASIZE, incx = 2;
+
+	double diff_norm = check_ztrmv(uplo, trans, diag, n, lda, incx);
+
+	ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/test_ztrsv.c b/utest/test_extensions/test_ztrsv.c
new file mode 100644
index 000000000..4b7ec6aaf
--- /dev/null
+++ b/utest/test_extensions/test_ztrsv.c
@@ -0,0 +1,267 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "utest/openblas_utest.h"
+#include <cblas.h>
+#include "common.h"
+
+#define DATASIZE 300
+#define INCREMENT 2
+
+struct DATA_ZTRSV {
+	double a_test[2 * DATASIZE * DATASIZE];    /* matrix A given to the ztrsv call under test */
+	double a_verify[2 * DATASIZE * DATASIZE];  /* copy of A used by the reference ztrsv call */
+	double x_test[2 * DATASIZE * INCREMENT];   /* vector x for the call under test */
+	double x_verify[2 * DATASIZE * INCREMENT]; /* vector x for the reference call */
+};
+
+#ifdef BUILD_COMPLEX16
+static struct DATA_ZTRSV data_ztrsv;
+
+/**
+ * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A
+ * and comparing it with the non-conjugate ztrsv.
+ *
+ * param uplo specifies whether A is upper or lower triangular
+ * param trans specifies op(A), the transposition (conjugation) operation applied to A
+ * param diag specifies whether the matrix A is unit triangular or not.
+ * param n - number of rows and columns of A
+ * param lda - leading dimension of matrix A
+ * param incx - increment for the elements of x
+ * return norm of difference
+ */
+static double check_ztrsv(char uplo, char trans, char diag, blasint n, blasint lda, blasint incx)
+{
+	/* Lengths counted in doubles: every complex entry occupies two. */
+	const blasint a_len = n * lda * 2;
+	const blasint x_len = n * incx * 2;
+	double unit_alpha[] = {1.0, 0.0};
+	char ref_trans = trans;
+	blasint k;
+	/* Random inputs, duplicated so both solves start from the same data. */
+	drand_generate(data_ztrsv.a_test, a_len);
+	drand_generate(data_ztrsv.x_test, x_len);
+	for (k = 0; k < a_len; k++)
+		data_ztrsv.a_verify[k] = data_ztrsv.a_test[k];
+	for (k = 0; k < x_len; k++)
+		data_ztrsv.x_verify[k] = data_ztrsv.x_test[k];
+
+	/* Reference path for 'R': conjugate the copy of A, then solve with 'N'. */
+	if (trans == 'R') {
+		ref_trans = 'N';
+		cblas_zimatcopy(CblasColMajor, CblasConjNoTrans, n, n, unit_alpha,
+						data_ztrsv.a_verify, lda, lda);
+	}
+	BLASFUNC(ztrsv)(&uplo, &ref_trans, &diag, &n, data_ztrsv.a_verify, &lda,
+					data_ztrsv.x_verify, &incx);
+
+	/* Path under test: untouched A with the requested trans. */
+	BLASFUNC(ztrsv)(&uplo, &trans, &diag, &n, data_ztrsv.a_test, &lda,
+					data_ztrsv.x_test, &incx);
+	for (k = 0; k < x_len; k++)
+		data_ztrsv.x_verify[k] -= data_ztrsv.x_test[k];
+	return BLASFUNC(dznrm2)(&n, data_ztrsv.x_verify, &incx);
+}
+
+/**
+ * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ztrsv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is upper triangular
+ * matrix A is not unit triangular
+ */
+CTEST(ztrsv, conj_notrans_upper_not_unit_triangular)
+{
+	/* Upper triangular, non-unit diagonal, trans = 'R', unit x stride. */
+	const char uplo = 'U', trans = 'R', diag = 'N';
+	const blasint n = DATASIZE, lda = DATASIZE, incx = 1;
+
+	const double diff_norm = check_ztrsv(uplo, trans, diag, n, lda, incx);
+
+	/* Both solves must agree: the difference norm is zero within DOUBLE_EPS. */
+	ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ztrsv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is upper triangular
+ * matrix A is unit triangular
+ */
+CTEST(ztrsv, conj_notrans_upper_unit_triangular)
+{
+	/* Upper triangular, unit diagonal, trans = 'R', unit x stride. */
+	const char uplo = 'U', trans = 'R', diag = 'U';
+	const blasint n = DATASIZE, lda = DATASIZE, incx = 1;
+
+	const double diff_norm = check_ztrsv(uplo, trans, diag, n, lda, incx);
+
+	/* Both solves must agree: the difference norm is zero within DOUBLE_EPS. */
+	ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ztrsv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is lower triangular
+ * matrix A is not unit triangular
+ */
+CTEST(ztrsv, conj_notrans_lower_not_triangular)
+{
+	/* Lower triangular, non-unit diagonal, trans = 'R', unit x stride. */
+	const char uplo = 'L', trans = 'R', diag = 'N';
+	const blasint n = DATASIZE, lda = DATASIZE, incx = 1;
+
+	const double diff_norm = check_ztrsv(uplo, trans, diag, n, lda, incx);
+
+	/* Both solves must agree: the difference norm is zero within DOUBLE_EPS. */
+	ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ztrsv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is lower triangular
+ * matrix A is unit triangular
+ */
+CTEST(ztrsv, conj_notrans_lower_unit_triangular)
+{
+	/* Lower triangular, unit diagonal, trans = 'R', unit x stride. */
+	const char uplo = 'L', trans = 'R', diag = 'U';
+	const blasint n = DATASIZE, lda = DATASIZE, incx = 1;
+
+	const double diff_norm = check_ztrsv(uplo, trans, diag, n, lda, incx);
+
+	/* Both solves must agree: the difference norm is zero within DOUBLE_EPS. */
+	ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ztrsv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is upper triangular
+ * matrix A is not unit triangular
+ * vector x stride is 2
+ */
+CTEST(ztrsv, conj_notrans_upper_not_unit_triangular_incx_2)
+{
+	/* Upper triangular, non-unit diagonal, trans = 'R', x stride 2. */
+	const char uplo = 'U', trans = 'R', diag = 'N';
+	const blasint n = DATASIZE, lda = DATASIZE, incx = 2;
+
+	const double diff_norm = check_ztrsv(uplo, trans, diag, n, lda, incx);
+
+	/* Both solves must agree: the difference norm is zero within DOUBLE_EPS. */
+	ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ztrsv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is upper triangular
+ * matrix A is unit triangular
+ * vector x stride is 2
+ */
+CTEST(ztrsv, conj_notrans_upper_unit_triangular_incx_2)
+{
+	/* Upper triangular, unit diagonal, trans = 'R', x stride 2. */
+	const char uplo = 'U', trans = 'R', diag = 'U';
+	const blasint n = DATASIZE, lda = DATASIZE, incx = 2;
+
+	const double diff_norm = check_ztrsv(uplo, trans, diag, n, lda, incx);
+
+	/* Both solves must agree: the difference norm is zero within DOUBLE_EPS. */
+	ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ztrsv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is lower triangular
+ * matrix A is not unit triangular
+ * vector x stride is 2
+ */
+CTEST(ztrsv, conj_notrans_lower_not_triangular_incx_2)
+{
+	/* Lower triangular, non-unit diagonal, trans = 'R', x stride 2. */
+	const char uplo = 'L', trans = 'R', diag = 'N';
+	const blasint n = DATASIZE, lda = DATASIZE, incx = 2;
+
+	const double diff_norm = check_ztrsv(uplo, trans, diag, n, lda, incx);
+
+	/* Both solves must agree: the difference norm is zero within DOUBLE_EPS. */
+	ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+
+/**
+ * Test ztrsv with the conjugate and not-transposed matrix A by conjugating matrix A 
+ * and comparing it with the non-conjugate ztrsv.
+ * Test with the following options:
+ *
+ * matrix A is conjugate and not-trans
+ * matrix A is lower triangular
+ * matrix A is unit triangular
+ * vector x stride is 2
+ */
+CTEST(ztrsv, conj_notrans_lower_unit_triangular_incx_2)
+{
+	/* Lower triangular, unit diagonal, trans = 'R', x stride 2. */
+	const char uplo = 'L', trans = 'R', diag = 'U';
+	const blasint n = DATASIZE, lda = DATASIZE, incx = 2;
+
+	const double diff_norm = check_ztrsv(uplo, trans, diag, n, lda, incx);
+
+	/* Both solves must agree: the difference norm is zero within DOUBLE_EPS. */
+	ASSERT_DBL_NEAR_TOL(0.0, diff_norm, DOUBLE_EPS);
+}
+#endif
\ No newline at end of file
diff --git a/utest/test_extensions/xerbla.c b/utest/test_extensions/xerbla.c
new file mode 100644
index 000000000..9487b20a6
--- /dev/null
+++ b/utest/test_extensions/xerbla.c
@@ -0,0 +1,88 @@
+/*****************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of
+      its contributors may be used to endorse or promote products
+      derived from this software without specific prior written
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************************/
+
+#include "common.h"
+
+static char *rout;              /* routine name expected in the next report; NULL disables the name check */
+static int ok, _info, lerr;     /* running verdict, expected info value, "expected report not seen yet" flag */
+static int link_xerbla = TRUE;  /* TRUE until the first handler call, which only clears it */
+
+static void F77_xerbla(char *srname, void *vinfo)
+{
+   /* vinfo is dereferenced as an int up front, even on the link-only call. */
+   const int reported = *(int *)vinfo;
+   /* First invocation ever: drop the flag and check nothing. */
+   if (link_xerbla) {
+      link_xerbla = 0;
+      return;
+   }
+   /* A report coming from a routine other than the expected one fails the check. */
+   if (rout != NULL && strcmp(rout, srname) != 0) {
+      printf("***** XERBLA WAS CALLED WITH SRNAME = <%s> INSTEAD OF <%s> *******\n", srname, rout);
+      ok = FALSE;
+   }
+   /* lerr becomes FALSE only when the reported value equals the expected one. */
+   lerr = (reported != _info) ? TRUE : FALSE;
+   if (lerr == TRUE) {
+      printf("***** XERBLA WAS CALLED WITH INFO = %d INSTEAD OF %d in %s *******\n",reported, _info, srname);
+      ok = FALSE;
+   }
+}
+
+/* Redefinition of the BLAS error function: forwards each report to F77_xerbla. */
+/* info is narrowed by value because F77_xerbla reads an int and blasint may be wider. */
+int BLASFUNC(xerbla)(char *name, blasint *info, blasint length)
+{
+  int info_value = (int)*info;
+  F77_xerbla(name, &info_value);
+  return 0;
+}
+
+int check_error(void)
+{
+   const int undetected = (lerr == TRUE);  /* no report with the expected info value arrived */
+   lerr = TRUE;                            /* re-arm the flag for the next probe */
+   if (!undetected) return ok;
+   printf("***** ILLEGAL VALUE OF PARAMETER NUMBER %d NOT DETECTED BY %s *****\n", _info, rout);
+   return ok = FALSE;
+}
+
+void set_xerbla(char* current_rout, int expected_info)
+{
+   /* One-time dummy call so the handler gets linked; that call only clears link_xerbla. */
+   if (link_xerbla)
+      F77_xerbla(rout, &_info);
+   rout = current_rout;
+   _info = expected_info;
+   lerr = ok = TRUE;   /* start optimistic; lerr stays TRUE until a matching report is seen */
+}
\ No newline at end of file
diff --git a/utest/test_zscal.c b/utest/test_zscal.c
new file mode 100644
index 000000000..ffc851e8b
--- /dev/null
+++ b/utest/test_zscal.c
@@ -0,0 +1,104 @@
+#include "openblas_utest.h"
+#include <cblas.h>
+#ifdef BUILD_COMPLEX16
+
+#ifndef NAN
+#define NAN (0.0/0.0)        /* fallback; parenthesised so it expands safely inside larger expressions */
+#endif
+#ifndef INFINITY
+#define INFINITY (1.0/0.0)   /* fallback; parenthesised for the same reason */
+#endif
+
+CTEST(zscal, i_nan)
+{
+    double unit_i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };  /* alpha is read from the first pair: 0+1i */
+    double nan_vec[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0};  /* x: NaN+0i entries */
+    cblas_zscal(9, unit_i, nan_vec, 1);
+    ASSERT_TRUE(isnan(nan_vec[0]));   /* element 0, real part */
+    ASSERT_TRUE(isnan(nan_vec[1]));   /* element 0, imaginary part */
+    ASSERT_TRUE(isnan(nan_vec[16]));  /* element 8, real part */
+    ASSERT_TRUE(isnan(nan_vec[17]));  /* element 8, imaginary part */
+}
+
+CTEST(zscal, i_nan_inc_2)
+{
+    double unit_i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };  /* alpha is read from the first pair: 0+1i */
+    double nan_vec[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0,
+                        NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0};  /* x: NaN+0i entries */
+    cblas_zscal(9, unit_i, nan_vec, 2);
+    ASSERT_TRUE(isnan(nan_vec[0]));   /* element 0, real part */
+    ASSERT_TRUE(isnan(nan_vec[1]));   /* element 0, imaginary part */
+    ASSERT_TRUE(isnan(nan_vec[16]));  /* element 8, real part */
+    ASSERT_TRUE(isnan(nan_vec[17]));  /* element 8, imaginary part */
+}
+
+CTEST(zscal, nan_i)
+{
+    double i_vec[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };  /* x: 0+1i entries */
+    double nan_alpha[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0};  /* alpha is read from the first pair: NaN+0i */
+    cblas_zscal(9, nan_alpha, i_vec, 1);
+    ASSERT_TRUE(isnan(i_vec[0]));   /* element 0, real part */
+    ASSERT_TRUE(isnan(i_vec[1]));   /* element 0, imaginary part */
+    ASSERT_TRUE(isnan(i_vec[16]));  /* element 8, real part */
+    ASSERT_TRUE(isnan(i_vec[17]));  /* element 8, imaginary part */
+}
+
+CTEST(zscal, nan_i_inc_2)
+{
+    double i_vec[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1,
+                      0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };  /* x: 0+1i entries */
+    double nan_alpha[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0};  /* alpha is read from the first pair: NaN+0i */
+    cblas_zscal(9, nan_alpha, i_vec, 2);
+    ASSERT_TRUE(isnan(i_vec[0]));   /* element 0, real part */
+    ASSERT_TRUE(isnan(i_vec[1]));   /* element 0, imaginary part */
+    ASSERT_TRUE(isnan(i_vec[16]));  /* element 8, real part */
+    ASSERT_TRUE(isnan(i_vec[17]));  /* element 8, imaginary part */
+}
+
+CTEST(zscal, i_inf)
+{
+    double unit_i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };  /* alpha is read from the first pair: 0+1i */
+    double inf_vec[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0};  /* x: Inf+0i entries */
+    cblas_zscal(9, unit_i, inf_vec, 1);
+    ASSERT_TRUE(isnan(inf_vec[0]));   /* element 0, real part */
+    ASSERT_TRUE(isinf(inf_vec[1]));   /* element 0, imaginary part */
+    ASSERT_TRUE(isnan(inf_vec[16]));  /* element 8, real part */
+    ASSERT_TRUE(isinf(inf_vec[17]));  /* element 8, imaginary part */
+}
+
+CTEST(zscal, i_inf_inc_2)
+{
+    double unit_i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };  /* alpha is read from the first pair: 0+1i */
+    double inf_vec[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0,
+                        INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0};  /* x: Inf+0i entries */
+    cblas_zscal(9, unit_i, inf_vec, 2);
+    ASSERT_TRUE(isnan(inf_vec[0]));   /* element 0, real part */
+    ASSERT_TRUE(isinf(inf_vec[1]));   /* element 0, imaginary part */
+    ASSERT_TRUE(isnan(inf_vec[16]));  /* element 8, real part */
+    ASSERT_TRUE(isinf(inf_vec[17]));  /* element 8, imaginary part */
+}
+
+CTEST(zscal, inf_i)
+{
+    double i_vec[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };  /* x: 0+1i entries */
+    double inf_alpha[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0};  /* alpha is read from the first pair: Inf+0i */
+    cblas_zscal(9, inf_alpha, i_vec, 1);
+    ASSERT_TRUE(isnan(i_vec[0]));   /* element 0, real part */
+    ASSERT_TRUE(isinf(i_vec[1]));   /* element 0, imaginary part */
+    ASSERT_TRUE(isnan(i_vec[16]));  /* element 8, real part */
+    ASSERT_TRUE(isinf(i_vec[17]));  /* element 8, imaginary part */
+}
+
+CTEST(zscal, inf_i_inc_2)
+{
+    double i_vec[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1,
+                      0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };  /* x: 0+1i entries */
+    double inf_alpha[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0};  /* alpha is read from the first pair: Inf+0i */
+    cblas_zscal(9, inf_alpha, i_vec, 2);
+    ASSERT_TRUE(isnan(i_vec[0]));   /* element 0, real part */
+    ASSERT_TRUE(isinf(i_vec[1]));   /* element 0, imaginary part */
+    ASSERT_TRUE(isnan(i_vec[16]));  /* element 8, real part */
+    ASSERT_TRUE(isinf(i_vec[17]));  /* element 8, imaginary part */
+}
+
+#endif
diff --git a/utest/utest_main2.c b/utest/utest_main2.c
index 4382bf159..46a3b067d 100644
--- a/utest/utest_main2.c
+++ b/utest/utest_main2.c
@@ -38,6 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CTEST_SEGFAULT
 #define CTEST_ADD_TESTS_MANUALLY
 
+#include "cblas.h"
 #include "openblas_utest.h"
 
 CTEST(amax, samax){
@@ -617,6 +618,51 @@ CTEST(max, smax_zero){
   ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS);
 }
 
+
+CTEST(zscal, i_nan)
+{
+    double unit_i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };  /* alpha is read from the first pair: 0+1i */
+    double nan_vec[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0};  /* x: NaN+0i entries */
+    cblas_zscal(9, unit_i, nan_vec, 1);
+    ASSERT_TRUE(isnan(nan_vec[0]));   /* element 0, real part */
+    ASSERT_TRUE(isnan(nan_vec[1]));   /* element 0, imaginary part */
+    ASSERT_TRUE(isnan(nan_vec[16]));  /* element 8, real part */
+    ASSERT_TRUE(isnan(nan_vec[17]));  /* element 8, imaginary part */
+}
+
+CTEST(zscal, nan_i)
+{
+    double i_vec[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };  /* x: 0+1i entries */
+    double nan_alpha[] = {NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0, NAN,0};  /* alpha is read from the first pair: NaN+0i */
+    cblas_zscal(9, nan_alpha, i_vec, 1);
+    ASSERT_TRUE(isnan(i_vec[0]));   /* element 0, real part */
+    ASSERT_TRUE(isnan(i_vec[1]));   /* element 0, imaginary part */
+    ASSERT_TRUE(isnan(i_vec[16]));  /* element 8, real part */
+    ASSERT_TRUE(isnan(i_vec[17]));  /* element 8, imaginary part */
+}
+	    
+CTEST(zscal, i_inf)
+{
+    double unit_i[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };  /* alpha is read from the first pair: 0+1i */
+    double inf_vec[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0};  /* x: Inf+0i entries */
+    cblas_zscal(9, unit_i, inf_vec, 1);
+    ASSERT_TRUE(isnan(inf_vec[0]));   /* element 0, real part */
+    ASSERT_TRUE(isinf(inf_vec[1]));   /* element 0, imaginary part */
+    ASSERT_TRUE(isnan(inf_vec[16]));  /* element 8, real part */
+    ASSERT_TRUE(isinf(inf_vec[17]));  /* element 8, imaginary part */
+}
+
+CTEST(zscal, inf_i)
+{
+    double i_vec[] = {0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1, 0,1 };  /* x: 0+1i entries */
+    double inf_alpha[] = {INFINITY, 0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0, INFINITY,0};  /* alpha is read from the first pair: Inf+0i */
+    cblas_zscal(9, inf_alpha, i_vec, 1);
+    ASSERT_TRUE(isnan(i_vec[0]));   /* element 0, real part */
+    ASSERT_TRUE(isinf(i_vec[1]));   /* element 0, imaginary part */
+    ASSERT_TRUE(isnan(i_vec[16]));  /* element 8, real part */
+    ASSERT_TRUE(isinf(i_vec[17]));  /* element 8, imaginary part */
+}
+
 int main(int argc, const char ** argv){
 
   CTEST_ADD (amax, samax);
@@ -648,7 +694,10 @@ int main(int argc, const char ** argv){
   CTEST_ADD (swap,zswap_inc_0);
   CTEST_ADD (swap,sswap_inc_0);
   CTEST_ADD (swap,cswap_inc_0);
-
+  CTEST_ADD (zscal, i_nan);
+  CTEST_ADD (zscal, nan_i);
+  CTEST_ADD (zscal, i_inf);
+  CTEST_ADD (zscal, inf_i);
   int num_fail=0;
 
   num_fail=ctest_main(argc, argv);